agent-vision-mcp 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +117 -0
- package/dist/browser/cdp/browser-cdp-discovery-service.d.ts +10 -0
- package/dist/browser/cdp/browser-cdp-discovery-service.js +28 -0
- package/dist/browser/cdp/browser-live-tab-service.d.ts +16 -0
- package/dist/browser/cdp/browser-live-tab-service.js +42 -0
- package/dist/browser/cdp/browser-see-service.d.ts +33 -0
- package/dist/browser/cdp/browser-see-service.js +76 -0
- package/dist/browser/cdp/browser-tab-context-service.d.ts +23 -0
- package/dist/browser/cdp/browser-tab-context-service.js +90 -0
- package/dist/browser/cdp/browser-tab-resolution-service.d.ts +9 -0
- package/dist/browser/cdp/browser-tab-resolution-service.js +65 -0
- package/dist/browser/cdp/browser-tab-screenshot-service.d.ts +20 -0
- package/dist/browser/cdp/browser-tab-screenshot-service.js +59 -0
- package/dist/browser/cdp/cdp-websocket-session.d.ts +9 -0
- package/dist/browser/cdp/cdp-websocket-session.js +99 -0
- package/dist/browser/cdp/chrome-cdp-client.d.ts +12 -0
- package/dist/browser/cdp/chrome-cdp-client.js +141 -0
- package/dist/browser/cdp/live-browser-tab-registry.d.ts +12 -0
- package/dist/browser/cdp/live-browser-tab-registry.js +96 -0
- package/dist/browser/cdp/png-metadata.d.ts +5 -0
- package/dist/browser/cdp/png-metadata.js +16 -0
- package/dist/browser/cdp/tab-model.d.ts +33 -0
- package/dist/browser/cdp/tab-model.js +15 -0
- package/dist/browser/cdp/tab-resolution.d.ts +27 -0
- package/dist/browser/cdp/tab-resolution.js +48 -0
- package/dist/browser/cdp/types.d.ts +71 -0
- package/dist/browser/cdp/types.js +1 -0
- package/dist/capture/capture-pipeline.d.ts +5 -0
- package/dist/capture/capture-pipeline.js +1 -0
- package/dist/capture/create-screen-capture-provider.d.ts +3 -0
- package/dist/capture/create-screen-capture-provider.js +8 -0
- package/dist/capture/in-memory-capture-pipeline.d.ts +13 -0
- package/dist/capture/in-memory-capture-pipeline.js +52 -0
- package/dist/capture/in-memory-image-compositor.d.ts +5 -0
- package/dist/capture/in-memory-image-compositor.js +34 -0
- package/dist/capture/linux-portal-screenshot-provider.d.ts +8 -0
- package/dist/capture/linux-portal-screenshot-provider.js +181 -0
- package/dist/capture/mock-screen-capture-provider.d.ts +5 -0
- package/dist/capture/mock-screen-capture-provider.js +22 -0
- package/dist/capture/png-metadata.d.ts +5 -0
- package/dist/capture/png-metadata.js +18 -0
- package/dist/capture/screen-capture-provider.d.ts +4 -0
- package/dist/capture/screen-capture-provider.js +1 -0
- package/dist/capture/types.d.ts +38 -0
- package/dist/capture/types.js +1 -0
- package/dist/cdp-demo.d.ts +1 -0
- package/dist/cdp-demo.js +41 -0
- package/dist/demo.d.ts +1 -0
- package/dist/demo.js +54 -0
- package/dist/desktop/capture-now.d.ts +1 -0
- package/dist/desktop/capture-now.js +48 -0
- package/dist/desktop/controller.d.ts +25 -0
- package/dist/desktop/controller.js +77 -0
- package/dist/desktop/main.d.ts +1 -0
- package/dist/desktop/main.js +80 -0
- package/dist/desktop/preload.d.ts +1 -0
- package/dist/desktop/preload.js +26 -0
- package/dist/desktop/types.d.ts +31 -0
- package/dist/desktop/types.js +1 -0
- package/dist/errors/app-error.d.ts +7 -0
- package/dist/errors/app-error.js +11 -0
- package/dist/flow/types.d.ts +48 -0
- package/dist/flow/types.js +1 -0
- package/dist/flow/visual-capture-flow.d.ts +13 -0
- package/dist/flow/visual-capture-flow.js +196 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +3 -0
- package/dist/logging/logger.d.ts +15 -0
- package/dist/logging/logger.js +28 -0
- package/dist/mcp/stdio-server.d.ts +19 -0
- package/dist/mcp/stdio-server.js +272 -0
- package/dist/mcp/tool-registry.d.ts +21 -0
- package/dist/mcp/tool-registry.js +33 -0
- package/dist/mcp-stdio.d.ts +2 -0
- package/dist/mcp-stdio.js +8 -0
- package/dist/overlay/local-overlay-agent.d.ts +46 -0
- package/dist/overlay/local-overlay-agent.js +551 -0
- package/dist/overlay/overlay-bundle-factory.d.ts +4 -0
- package/dist/overlay/overlay-bundle-factory.js +24 -0
- package/dist/overlay/types.d.ts +83 -0
- package/dist/overlay/types.js +1 -0
- package/dist/server.d.ts +19 -0
- package/dist/server.js +158 -0
- package/dist/session/capture-session-service.d.ts +21 -0
- package/dist/session/capture-session-service.js +50 -0
- package/dist/session/session-manager.d.ts +29 -0
- package/dist/session/session-manager.js +217 -0
- package/dist/session/session-store.d.ts +8 -0
- package/dist/session/session-store.js +15 -0
- package/dist/session/session-waiter.d.ts +14 -0
- package/dist/session/session-waiter.js +102 -0
- package/dist/types/annotation.d.ts +32 -0
- package/dist/types/annotation.js +1 -0
- package/dist/types/capture.d.ts +33 -0
- package/dist/types/capture.js +1 -0
- package/dist/types/session.d.ts +36 -0
- package/dist/types/session.js +1 -0
- package/package.json +38 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Kedar
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
<h1>Agent Vision</h1>
|
|
3
|
+
<p align="center">
|
|
4
|
+
<img src="./docs/banner.png" width="800" />
|
|
5
|
+
</p>
|
|
6
|
+
<p>
|
|
7
|
+
<img alt="TypeScript" src="https://img.shields.io/badge/TypeScript-ES2022-3178C6?style=flat-square" />
|
|
8
|
+
<img alt="Chrome CDP" src="https://img.shields.io/badge/Chrome-CDP-4285F4?style=flat-square" />
|
|
9
|
+
<img alt="MCP" src="https://img.shields.io/badge/MCP-stdio-7C3AED?style=flat-square" />
|
|
10
|
+
<img alt="License" src="https://img.shields.io/badge/License-MIT-22C55E?style=flat-square" />
|
|
11
|
+
</p>
|
|
12
|
+
</div>
|
|
13
|
+
|
|
14
|
+
Agent Vision is a browser-first MCP server that gives agents direct visual access to live Chrome or Brave tabs through the Chrome DevTools Protocol. Instead of asking users to manually capture screenshots and upload them into chat, Agent Vision lets a coding agent resolve a tab by title or URL, capture a real screenshot on demand, and pull structured browser context like visible text, page metadata, and viewport details in the same flow.
|
|
15
|
+
|
|
16
|
+
## Problem
|
|
17
|
+
|
|
18
|
+
LLM workflows break down when visual browser context is trapped behind manual steps. A user sees an issue in a tab, but the agent cannot see the same thing unless the user pauses, takes a screenshot, uploads it, and adds extra explanation. That friction slows debugging, weakens iteration speed, and loses important structured context that browsers already expose natively. Agent Vision solves this by making browser state directly available as MCP tools.
|
|
19
|
+
|
|
20
|
+
## Workflow
|
|
21
|
+
|
|
22
|
+
```mermaid
|
|
23
|
+
flowchart TD
|
|
24
|
+
A[User asks agent to inspect a browser tab] --> B[Codex calls Agent Vision MCP tool]
|
|
25
|
+
B --> C[Agent Vision queries Chrome DevTools Protocol]
|
|
26
|
+
C --> D[Discover live tabs]
|
|
27
|
+
D --> E[Resolve best matching tab by title or URL]
|
|
28
|
+
E --> F[Capture screenshot through CDP]
|
|
29
|
+
E --> G[Evaluate page for visible text and metadata]
|
|
30
|
+
F --> H[Return MCP image content]
|
|
31
|
+
G --> I[Return MCP structured text content]
|
|
32
|
+
H --> J[Agent receives visual + structured browser context]
|
|
33
|
+
I --> J
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Tools
|
|
37
|
+
|
|
38
|
+
| Tool | What it does |
|
|
39
|
+
| --- | --- |
|
|
40
|
+
| `getBrowserCdpStatus` | Checks whether Agent Vision can reach the configured Chrome DevTools Protocol endpoint. |
|
|
41
|
+
| `discoverBrowserTabsViaCdp` | Returns the raw live tab list exposed by Chrome DevTools Protocol. |
|
|
42
|
+
| `refreshLiveBrowserTabs` | Refreshes the normalized in-memory live tab model from CDP. |
|
|
43
|
+
| `listLiveBrowserTabs` | Returns the cached normalized live tab model without re-querying the browser. |
|
|
44
|
+
| `pruneStaleLiveBrowserTabs` | Removes cached tabs that have become stale because they have not been refreshed recently. |
|
|
45
|
+
| `resolveLiveBrowserTab` | Resolves the active or best matching tab for a `/see`-style query using title and URL heuristics. |
|
|
46
|
+
| `captureResolvedBrowserTabScreenshot` | Captures a real PNG screenshot from the resolved browser tab via CDP. |
|
|
47
|
+
| `getResolvedBrowserTabContext` | Extracts structured page context such as visible text, page title, page URL, language, content type, and viewport metadata. |
|
|
48
|
+
| `seeBrowserTabViaCdp` | Runs the full high-level browser inspection flow: resolve tab, capture screenshot, and return structured browser context. |
|
|
49
|
+
|
|
50
|
+
## Setup And Installation
|
|
51
|
+
|
|
52
|
+
### Prerequisites
|
|
53
|
+
|
|
54
|
+
- Node.js 22+
|
|
55
|
+
- Brave, Chrome, or another Chromium-based browser
|
|
56
|
+
- Codex configured locally with MCP support
|
|
57
|
+
|
|
58
|
+
### Install dependencies
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
npm install
|
|
62
|
+
npm run build
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Start a CDP-enabled browser session
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
brave-browser --remote-debugging-port=9222 --user-data-dir=/tmp/agent-vision-cdp
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Register Agent Vision in Codex
|
|
72
|
+
|
|
73
|
+
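Add this to `~/.codex/config.toml`, replacing the example paths below with the absolute path to your own `node` binary and to the `dist/mcp-stdio.js` file in your checkout: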
|
|
74
|
+
|
|
75
|
+
```toml
|
|
76
|
+
[mcp_servers.agent-vision]
|
|
77
|
+
command = "/home/kedar/.nvm/versions/node/v22.22.0/bin/node"
|
|
78
|
+
args = ["/home/kedar/Desktop/Projects/llm_vision/dist/mcp-stdio.js"]
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Then restart Codex.
|
|
82
|
+
|
|
83
|
+
### Verify the MCP server is available
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
codex mcp list
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Example Usage
|
|
90
|
+
|
|
91
|
+
### Manual local verification
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
npm run demo-cdp
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Example Codex prompts
|
|
98
|
+
|
|
99
|
+
```text
|
|
100
|
+
Use the agent-vision MCP server to list my live browser tabs.
|
|
101
|
+
Use the agent-vision MCP server to inspect the active browser tab.
|
|
102
|
+
Use the agent-vision MCP server to inspect the tab matching "docs".
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### Future npm-distributed registration shape
|
|
106
|
+
|
|
107
|
+
After publishing, the Codex config can switch to:
|
|
108
|
+
|
|
109
|
+
```toml
|
|
110
|
+
[mcp_servers.agent-vision]
|
|
111
|
+
command = "npx"
|
|
112
|
+
args = ["-y", "agent-vision-mcp"]
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## License
|
|
116
|
+
|
|
117
|
+
MIT. See [LICENSE](./LICENSE).
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { Logger } from "../../logging/logger.js";
|
|
2
|
+
import { ChromeCdpClient } from "./chrome-cdp-client.js";
|
|
3
|
+
import type { CdpConnectionStatus, CdpTabDiscoveryResult } from "./types.js";
|
|
4
|
+
/**
 * Logging layer over a {@link ChromeCdpClient}.
 *
 * Both methods delegate to the client and hand its result back to the caller;
 * the service itself only records what happened through the logger.
 */
export declare class BrowserCdpDiscoveryService {
    private readonly client;
    private readonly logger;
    /**
     * @param client CDP client that performs the actual endpoint requests.
     * @param logger Destination for the informational log entries.
     */
    constructor(client: ChromeCdpClient, logger: Logger);
    /**
     * Reports whether the CDP endpoint can be reached.
     *
     * @param endpointOverride Optional endpoint, forwarded to the client unchanged.
     * @returns The connection status produced by the client.
     */
    getConnectionStatus(endpointOverride?: string): Promise<CdpConnectionStatus>;
    /**
     * Retrieves the tab list exposed by the CDP endpoint.
     *
     * @param endpointOverride Optional endpoint, forwarded to the client unchanged.
     * @returns The discovery result produced by the client.
     */
    discoverTabs(endpointOverride?: string): Promise<CdpTabDiscoveryResult>;
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
export class BrowserCdpDiscoveryService {
|
|
2
|
+
client;
|
|
3
|
+
logger;
|
|
4
|
+
constructor(client, logger) {
|
|
5
|
+
this.client = client;
|
|
6
|
+
this.logger = logger;
|
|
7
|
+
}
|
|
8
|
+
async getConnectionStatus(endpointOverride) {
|
|
9
|
+
const status = await this.client.getConnectionStatus(endpointOverride);
|
|
10
|
+
if (status.connected) {
|
|
11
|
+
this.logger.info("CDP connection ready", {
|
|
12
|
+
endpoint: status.endpoint,
|
|
13
|
+
browser: status.browser,
|
|
14
|
+
protocolVersion: status.protocolVersion
|
|
15
|
+
});
|
|
16
|
+
}
|
|
17
|
+
return status;
|
|
18
|
+
}
|
|
19
|
+
async discoverTabs(endpointOverride) {
|
|
20
|
+
const result = await this.client.discoverTabs(endpointOverride);
|
|
21
|
+
this.logger.info("CDP tab discovery completed", {
|
|
22
|
+
endpoint: result.endpoint,
|
|
23
|
+
tabCount: result.tabs.length,
|
|
24
|
+
browser: result.browser
|
|
25
|
+
});
|
|
26
|
+
return result;
|
|
27
|
+
}
|
|
28
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import type { Logger } from "../../logging/logger.js";
|
|
2
|
+
import { BrowserCdpDiscoveryService } from "./browser-cdp-discovery-service.js";
|
|
3
|
+
import { LiveBrowserTabRegistry } from "./live-browser-tab-registry.js";
|
|
4
|
+
import { type LiveBrowserTabDiscoveryResult, type LiveBrowserTab } from "./tab-model.js";
|
|
5
|
+
/**
 * Keeps the cached live-tab model in step with CDP discovery.
 *
 * Combines a {@link BrowserCdpDiscoveryService} (browser queries) with a
 * {@link LiveBrowserTabRegistry} (the cache of normalized tabs).
 */
export declare class BrowserLiveTabService {
    private readonly discovery;
    private readonly registry;
    private readonly logger;
    /**
     * @param discovery Service used to query the browser for its tabs.
     * @param registry Cache that stores the normalized tabs.
     * @param logger Destination for informational log entries.
     */
    constructor(discovery: BrowserCdpDiscoveryService, registry: LiveBrowserTabRegistry, logger: Logger);
    /**
     * Queries the browser, upserts the discovered tabs into the registry and
     * returns the normalized result.
     *
     * @param endpointOverride Optional endpoint, forwarded to the discovery service.
     */
    refresh(endpointOverride?: string): Promise<LiveBrowserTabDiscoveryResult>;
    /**
     * Removes stale tabs from the registry.
     *
     * @param maxAgeMs Optional age threshold, forwarded to the registry.
     * @returns The tabs that were removed together with the cached list afterwards.
     */
    pruneStale(maxAgeMs?: number): {
        removed: LiveBrowserTab[];
        remaining: LiveBrowserTabDiscoveryResult;
    };
    /** Returns the cached tabs without querying the browser. */
    list(): LiveBrowserTabDiscoveryResult;
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import { toDiscoveryResult } from "./tab-model.js";
|
|
2
|
+
/**
 * Keeps the cached live-tab model in step with CDP discovery.
 *
 * `refresh` is the only method that talks to the browser; `list` and
 * `pruneStale` operate purely on the registry.
 */
export class BrowserLiveTabService {
    discovery;
    registry;
    logger;
    /**
     * @param discovery Object exposing `discoverTabs(endpointOverride)`.
     * @param registry Object exposing `upsertDiscovery`, `pruneStale` and `list`.
     * @param logger Structured logger; only `info` is used by this class.
     */
    constructor(discovery, registry, logger) {
        this.discovery = discovery;
        this.registry = registry;
        this.logger = logger;
    }
    /**
     * Runs a discovery, stores the discovered tabs in the registry and returns
     * the discovery result combined with the tabs the registry handed back.
     *
     * @param endpointOverride Forwarded to the discovery service as-is.
     */
    async refresh(endpointOverride) {
        const discoveryResult = await this.discovery.discoverTabs(endpointOverride);
        const { endpoint, discoveredAt, tabs: discoveredTabs } = discoveryResult;
        // The registry returns its own view of the tabs; that view is what gets reported.
        const tabs = this.registry.upsertDiscovery({ endpoint, discoveredAt, tabs: discoveredTabs });
        this.logger.info("Refreshed live browser tab model", {
            endpoint,
            tabCount: tabs.length,
            browser: discoveryResult.browser
        });
        return toDiscoveryResult(discoveryResult, tabs);
    }
    /**
     * Drops stale tabs from the registry.
     *
     * @param maxAgeMs Forwarded to the registry as-is.
     * @returns The removed tabs plus the cached list as it stands after pruning.
     */
    pruneStale(maxAgeMs) {
        const removed = this.registry.pruneStale(maxAgeMs);
        const remaining = this.list();
        return { removed, remaining };
    }
    /**
     * Returns the cached tabs without querying the browser.
     *
     * `endpoint` and `discoveredAt` are taken from the first cached tab; with an
     * empty cache they fall back to "unknown" and the current time.
     */
    list() {
        const tabs = this.registry.list();
        const first = tabs[0];
        const discoveredAt = first?.lastSeenAt ?? new Date().toISOString();
        const endpoint = first?.lastDiscoveryEndpoint ?? "unknown";
        return { endpoint, discoveredAt, tabs };
    }
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import type { Logger } from "../../logging/logger.js";
|
|
2
|
+
import { BrowserLiveTabService } from "./browser-live-tab-service.js";
|
|
3
|
+
import { BrowserTabContextService } from "./browser-tab-context-service.js";
|
|
4
|
+
import { BrowserTabScreenshotService } from "./browser-tab-screenshot-service.js";
|
|
5
|
+
import type { BrowserTabScreenshot, BrowserTabStructuredContext } from "./types.js";
|
|
6
|
+
import type { LiveBrowserTabCandidate } from "./tab-resolution.js";
|
|
7
|
+
/**
 * Outcome of the high-level "see" flow.
 *
 * Either a tab was captured (`status: "completed"`, carrying the screenshot and
 * the structured page context), or no single tab could be chosen, in which
 * case the candidate list is returned instead. `recoveredAfterRefresh` is set
 * when the result comes from the attempt made after refreshing the tab model.
 */
export type BrowserSeeResult =
    | {
        status: "completed";
        query?: string;
        stage: "completed";
        matchedCandidate: LiveBrowserTabCandidate;
        candidateCount: number;
        screenshot: BrowserTabScreenshot;
        context: BrowserTabStructuredContext;
        message: string;
        recoveredAfterRefresh?: boolean;
    }
    | {
        status: "ambiguous" | "not_found";
        query?: string;
        stage: "needs_disambiguation" | "not_found";
        candidates: LiveBrowserTabCandidate[];
        message: string;
        recoveredAfterRefresh?: boolean;
    };
|
|
25
|
+
/**
 * High-level browser inspection flow: resolve a tab, capture its screenshot
 * and collect its structured page context in one call.
 */
export declare class BrowserSeeService {
    private readonly liveTabs;
    private readonly screenshots;
    private readonly contexts;
    private readonly logger;
    /**
     * @param liveTabs Used to refresh the cached tab model before a retry.
     * @param screenshots Resolves the tab and captures its screenshot.
     * @param contexts Collects structured page context for the resolved tab.
     * @param logger Destination for informational and warning log entries.
     */
    constructor(liveTabs: BrowserLiveTabService, screenshots: BrowserTabScreenshotService, contexts: BrowserTabContextService, logger: Logger);
    /**
     * Runs the flow, retrying after a tab-model refresh when the first attempt
     * finds no tab or throws.
     *
     * @param query Optional title/URL query used to pick the tab.
     */
    see(query?: string): Promise<BrowserSeeResult>;
    private seeOnce;
}
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
export class BrowserSeeService {
|
|
2
|
+
liveTabs;
|
|
3
|
+
screenshots;
|
|
4
|
+
contexts;
|
|
5
|
+
logger;
|
|
6
|
+
constructor(liveTabs, screenshots, contexts, logger) {
|
|
7
|
+
this.liveTabs = liveTabs;
|
|
8
|
+
this.screenshots = screenshots;
|
|
9
|
+
this.contexts = contexts;
|
|
10
|
+
this.logger = logger;
|
|
11
|
+
}
|
|
12
|
+
async see(query) {
|
|
13
|
+
try {
|
|
14
|
+
const firstAttempt = await this.seeOnce(query, false);
|
|
15
|
+
if (firstAttempt.status !== "not_found") {
|
|
16
|
+
return firstAttempt;
|
|
17
|
+
}
|
|
18
|
+
this.logger.warn("Browser-first /see returned not_found, retrying after refresh", {
|
|
19
|
+
query,
|
|
20
|
+
candidateCount: firstAttempt.candidates.length
|
|
21
|
+
});
|
|
22
|
+
await this.liveTabs.refresh();
|
|
23
|
+
return await this.seeOnce(query, true);
|
|
24
|
+
}
|
|
25
|
+
catch (error) {
|
|
26
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
27
|
+
this.logger.warn("Browser-first /see failed before completion, retrying after refresh", {
|
|
28
|
+
query,
|
|
29
|
+
errorMessage: message
|
|
30
|
+
});
|
|
31
|
+
await this.liveTabs.refresh();
|
|
32
|
+
return await this.seeOnce(query, true);
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
async seeOnce(query, recoveredAfterRefresh) {
|
|
36
|
+
const screenshotResult = await this.screenshots.captureResolved(query);
|
|
37
|
+
if (screenshotResult.status !== "completed") {
|
|
38
|
+
this.logger.info("Browser-first /see did not complete immediately", {
|
|
39
|
+
query,
|
|
40
|
+
status: screenshotResult.status,
|
|
41
|
+
candidateCount: screenshotResult.candidates.length,
|
|
42
|
+
recoveredAfterRefresh
|
|
43
|
+
});
|
|
44
|
+
return {
|
|
45
|
+
status: screenshotResult.status,
|
|
46
|
+
query: screenshotResult.query,
|
|
47
|
+
stage: screenshotResult.status === "ambiguous" ? "needs_disambiguation" : "not_found",
|
|
48
|
+
candidates: screenshotResult.candidates,
|
|
49
|
+
message: screenshotResult.message,
|
|
50
|
+
...(recoveredAfterRefresh ? { recoveredAfterRefresh: true } : {})
|
|
51
|
+
};
|
|
52
|
+
}
|
|
53
|
+
const context = await this.contexts.getForResolvedTab(screenshotResult.resolution, query);
|
|
54
|
+
const mapped = {
|
|
55
|
+
status: "completed",
|
|
56
|
+
query: screenshotResult.query,
|
|
57
|
+
stage: "completed",
|
|
58
|
+
matchedCandidate: screenshotResult.resolution.matchedCandidate,
|
|
59
|
+
candidateCount: screenshotResult.resolution.candidateCount,
|
|
60
|
+
screenshot: screenshotResult.screenshot,
|
|
61
|
+
context,
|
|
62
|
+
message: `Captured browser tab \"${screenshotResult.screenshot.title}\" through CDP with structured page context.`,
|
|
63
|
+
...(recoveredAfterRefresh ? { recoveredAfterRefresh: true } : {})
|
|
64
|
+
};
|
|
65
|
+
this.logger.info("Completed browser-first /see flow", {
|
|
66
|
+
query,
|
|
67
|
+
title: mapped.screenshot.title,
|
|
68
|
+
targetId: mapped.screenshot.targetId,
|
|
69
|
+
width: mapped.screenshot.width,
|
|
70
|
+
height: mapped.screenshot.height,
|
|
71
|
+
visibleTextLength: mapped.context.visibleTextLength,
|
|
72
|
+
recoveredAfterRefresh
|
|
73
|
+
});
|
|
74
|
+
return mapped;
|
|
75
|
+
}
|
|
76
|
+
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import type { Logger } from "../../logging/logger.js";
|
|
2
|
+
import { BrowserTabResolutionService } from "./browser-tab-resolution-service.js";
|
|
3
|
+
import type { LiveBrowserTabResolution } from "./tab-resolution.js";
|
|
4
|
+
import type { BrowserTabStructuredContext } from "./types.js";
|
|
5
|
+
/**
 * Result of resolving a tab and collecting its structured context.
 *
 * When a tab was resolved the result carries that resolution and the collected
 * context; otherwise the unresolved (`ambiguous` / `not_found`) resolution is
 * passed through as-is.
 */
export type GetResolvedBrowserTabContextResult =
    | {
        status: "completed";
        query?: string;
        resolution: Extract<LiveBrowserTabResolution, { status: "resolved" }>;
        context: BrowserTabStructuredContext;
    }
    | Extract<LiveBrowserTabResolution, { status: "ambiguous" | "not_found" }>;
|
|
15
|
+
/**
 * Collects structured page context (visible text, page metadata, viewport)
 * from a browser tab over CDP.
 */
export declare class BrowserTabContextService {
    private readonly resolver;
    private readonly logger;
    /**
     * @param resolver Picks the tab that matches a query.
     * @param logger Destination for informational log entries.
     */
    constructor(resolver: BrowserTabResolutionService, logger: Logger);
    /**
     * Collects context for a tab that has already been resolved.
     *
     * @param resolution A successful resolution identifying the tab.
     * @param query The original query; used for logging only.
     */
    getForResolvedTab(
        resolution: Extract<LiveBrowserTabResolution, { status: "resolved" }>,
        query?: string
    ): Promise<BrowserTabStructuredContext>;
    /**
     * Resolves a tab for `query` and, when exactly one tab is chosen, collects
     * its context; otherwise returns the unresolved resolution.
     */
    getResolvedContext(query?: string): Promise<GetResolvedBrowserTabContextResult>;
}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import { withCdpTabSession } from "./cdp-websocket-session.js";
|
|
2
|
+
/** Current time as an ISO-8601 string. */
const nowIso = () => new Date().toISOString();
/**
 * JavaScript evaluated inside the page through `Runtime.evaluate`.
 * It returns a plain object with the page title, URL, language, content type,
 * the body's `innerText` and the viewport size / device pixel ratio.
 */
const CONTEXT_EVALUATION_EXPRESSION = `(() => {
const bodyText = document.body?.innerText ?? "";
return {
pageTitle: document.title ?? "",
pageUrl: window.location.href ?? "",
documentLanguage: document.documentElement?.lang ?? "",
contentType: document.contentType ?? "",
visibleText: bodyText,
viewport: {
width: window.innerWidth,
height: window.innerHeight,
devicePixelRatio: window.devicePixelRatio
}
};
})()`;
/** Returns `value` when it is a string with non-whitespace content, else `undefined`. */
const asOptionalString = (value) => {
    if (typeof value !== "string") {
        return undefined;
    }
    return value.trim() === "" ? undefined : value;
};
/** Returns `value` when it is a string, else the empty string. */
const asVisibleText = (value) => {
    if (typeof value === "string") {
        return value;
    }
    return "";
};
/**
 * Normalizes an untrusted viewport payload.
 *
 * @returns `undefined` for anything that is not a non-null object; otherwise an
 *          object whose three fields are kept only when they are numbers.
 */
const asViewport = (value) => {
    const isObject = Boolean(value) && typeof value === "object";
    if (!isObject) {
        return undefined;
    }
    const numberOrUndefined = (field) => (typeof field === "number" ? field : undefined);
    return {
        width: numberOrUndefined(value.width),
        height: numberOrUndefined(value.height),
        devicePixelRatio: numberOrUndefined(value.devicePixelRatio)
    };
};
|
|
31
|
+
/**
 * Opens a CDP session on the resolved tab, evaluates
 * CONTEXT_EVALUATION_EXPRESSION in the page and maps the returned payload into
 * the structured context object.
 *
 * Tab identity fields (`targetId`, `title`, `url`, `browserName`) come from the
 * resolution; the page fields come from the evaluated payload and are
 * sanitized, so a missing or malformed payload yields empty / undefined values
 * rather than an error.
 *
 * @param resolution A resolution whose `tab` identifies the target.
 */
const collectTabContext = async (resolution) => {
    return await withCdpTabSession(resolution.tab, async (session) => {
        await session.sendCommand("Page.enable");
        await session.sendCommand("Runtime.enable");
        const evaluation = await session.sendCommand("Runtime.evaluate", {
            expression: CONTEXT_EVALUATION_EXPRESSION,
            returnByValue: true
        });
        // With returnByValue the evaluated object is expected under result.value.
        const remoteObject = evaluation.result;
        let value;
        if (remoteObject && typeof remoteObject === "object" && "value" in remoteObject) {
            value = remoteObject.value;
        }
        const payload = value && typeof value === "object" ? value : {};
        const visibleText = asVisibleText(payload.visibleText);
        const { tab } = resolution;
        return {
            targetId: tab.targetId,
            title: tab.title,
            url: tab.url,
            browserName: tab.browserName,
            pageTitle: asOptionalString(payload.pageTitle),
            pageUrl: asOptionalString(payload.pageUrl),
            documentLanguage: asOptionalString(payload.documentLanguage),
            contentType: asOptionalString(payload.contentType),
            visibleText,
            visibleTextLength: visibleText.length,
            viewport: asViewport(payload.viewport),
            collectedAt: nowIso(),
            backend: "cdp-runtime-evaluate"
        };
    });
};
|
|
60
|
+
/**
 * Collects structured page context from a browser tab over CDP, either for an
 * already-resolved tab or by resolving one from a query first.
 */
export class BrowserTabContextService {
    resolver;
    logger;
    /**
     * @param resolver Object exposing `resolve(query)`.
     * @param logger Structured logger; only `info` is used by this class.
     */
    constructor(resolver, logger) {
        this.resolver = resolver;
        this.logger = logger;
    }
    /**
     * Collects context for a tab that has already been resolved and logs the
     * amount of visible text that was found.
     *
     * @param resolution A successful resolution identifying the tab.
     * @param query The original query; used for logging only.
     */
    async getForResolvedTab(resolution, query) {
        const context = await collectTabContext(resolution);
        const { targetId, title } = resolution.tab;
        this.logger.info("Collected structured browser tab context via CDP", {
            query,
            targetId,
            title,
            visibleTextLength: context.visibleTextLength
        });
        return context;
    }
    /**
     * Resolves a tab for `query`; when one tab is chosen its context is
     * collected, otherwise the unresolved resolution is returned unchanged.
     *
     * @param query Optional title/URL query used to pick the tab.
     */
    async getResolvedContext(query) {
        const resolution = this.resolver.resolve(query);
        if (resolution.status === "resolved") {
            const context = await this.getForResolvedTab(resolution, query);
            return { status: "completed", query, resolution, context };
        }
        return resolution;
    }
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import type { Logger } from "../../logging/logger.js";
|
|
2
|
+
import { BrowserLiveTabService } from "./browser-live-tab-service.js";
|
|
3
|
+
import { type LiveBrowserTabResolution } from "./tab-resolution.js";
|
|
4
|
+
/**
 * Chooses the cached live browser tab that best matches a query.
 */
export declare class BrowserTabResolutionService {
    private readonly liveTabs;
    private readonly logger;
    /**
     * @param liveTabs Source of the cached tab list.
     * @param logger Destination for informational log entries.
     */
    constructor(liveTabs: BrowserLiveTabService, logger: Logger);
    /**
     * Resolves `query` against the cached tabs. The call is synchronous and does
     * not query the browser.
     *
     * @param query Optional title/URL query.
     * @returns A `resolved`, `ambiguous` or `not_found` resolution.
     */
    resolve(query?: string): LiveBrowserTabResolution;
}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import { scoreLiveTab, sortCandidates } from "./tab-resolution.js";
|
|
2
|
+
/**
 * Chooses the cached live browser tab that best matches a query, using the
 * scoring and ordering helpers from the tab-resolution module.
 */
export class BrowserTabResolutionService {
    liveTabs;
    logger;
    /**
     * @param liveTabs Object exposing `list()`, returning `{ tabs }`.
     * @param logger Structured logger; only `info` is used by this class.
     */
    constructor(liveTabs, logger) {
        this.liveTabs = liveTabs;
        this.logger = logger;
    }
    /**
     * Resolves `query` against the cached tabs without querying the browser.
     *
     * Outcomes:
     * - `not_found` when the cache is empty, when no tab scores against the
     *   query (the best-ranked tabs are offered as suggestions), or when the
     *   winning candidate has no matching tab;
     * - `ambiguous` when the two best candidates share the same score;
     * - `resolved` otherwise.
     *
     * Candidate lists returned to the caller are capped at five entries.
     *
     * @param query Optional title/URL query.
     */
    resolve(query) {
        const { tabs } = this.liveTabs.list();
        if (tabs.length === 0) {
            return {
                status: "not_found",
                query,
                candidates: [],
                message: "No live browser tabs are available. Refresh CDP discovery first."
            };
        }
        // scoreLiveTab may yield undefined (no match); such entries are dropped before ranking.
        const isScored = (candidate) => candidate !== undefined;
        const candidates = tabs
            .map((tab) => scoreLiveTab(tab, query))
            .filter(isScored)
            .sort(sortCandidates);
        if (candidates.length === 0) {
            // Suggestions: rank every tab without the query and keep the top five,
            // matching how the other branches slice after sorting.
            const suggestions = tabs
                .map((tab) => scoreLiveTab(tab))
                .filter(isScored)
                .sort(sortCandidates)
                .slice(0, 5);
            return {
                status: "not_found",
                query,
                candidates: suggestions,
                message: "No live browser tab matched the query."
            };
        }
        const [best, runnerUp] = candidates;
        if (runnerUp !== undefined && best.matchScore === runnerUp.matchScore) {
            return {
                status: "ambiguous",
                query,
                candidates: candidates.slice(0, 5),
                message: "Multiple live browser tabs matched the query. Provide a more specific title or URL."
            };
        }
        const matched = tabs.find((tab) => tab.targetId === best.targetId);
        if (!matched) {
            return {
                status: "not_found",
                query,
                candidates: candidates.slice(0, 5),
                message: "The best matching browser tab disappeared before it could be resolved."
            };
        }
        this.logger.info("Resolved live browser tab", {
            query,
            targetId: matched.targetId,
            title: matched.title,
            url: matched.url,
            matchReason: best.matchReason,
            matchScore: best.matchScore
        });
        return {
            status: "resolved",
            query,
            tab: matched,
            matchedCandidate: best,
            candidateCount: candidates.length
        };
    }
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { Logger } from "../../logging/logger.js";
|
|
2
|
+
import { BrowserTabResolutionService } from "./browser-tab-resolution-service.js";
|
|
3
|
+
import type { LiveBrowserTabResolution } from "./tab-resolution.js";
|
|
4
|
+
import type { BrowserTabScreenshot } from "./types.js";
|
|
5
|
+
/**
 * Result of resolving a tab and capturing its screenshot.
 *
 * When a tab was resolved the result carries that resolution and the captured
 * screenshot; otherwise the unresolved (`ambiguous` / `not_found`) resolution
 * is passed through as-is.
 */
export type CaptureResolvedLiveBrowserTabScreenshotResult =
    | {
        status: "completed";
        query?: string;
        resolution: Extract<LiveBrowserTabResolution, { status: "resolved" }>;
        screenshot: BrowserTabScreenshot;
    }
    | Extract<LiveBrowserTabResolution, { status: "ambiguous" | "not_found" }>;
|
|
15
|
+
/**
 * Resolves a browser tab from a query and captures a PNG screenshot of it
 * over CDP.
 */
export declare class BrowserTabScreenshotService {
    private readonly resolver;
    private readonly logger;
    /**
     * @param resolver Picks the tab that matches a query.
     * @param logger Destination for informational log entries.
     */
    constructor(resolver: BrowserTabResolutionService, logger: Logger);
    /**
     * Resolves a tab for `query` and, when exactly one tab is chosen, captures
     * its screenshot; otherwise returns the unresolved resolution.
     *
     * @param query Optional title/URL query used to pick the tab.
     */
    captureResolved(query?: string): Promise<CaptureResolvedLiveBrowserTabScreenshotResult>;
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import { withCdpTabSession } from "./cdp-websocket-session.js";
|
|
2
|
+
import { readPngDimensions } from "./png-metadata.js";
|
|
3
|
+
/** Current time as an ISO-8601 string. */
const nowIso = () => new Date().toISOString();
/**
 * Opens a CDP session on the resolved tab, requests a PNG through
 * `Page.captureScreenshot` and returns it together with tab identity,
 * pixel dimensions (read from the PNG itself) and byte size.
 *
 * @param resolution A resolution whose `tab` identifies the target.
 * @throws Error when the CDP response carries no non-blank `data` string.
 */
const captureTabScreenshot = async (resolution) => {
    return await withCdpTabSession(resolution.tab, async (session) => {
        await session.sendCommand("Page.enable");
        const captureResult = await session.sendCommand("Page.captureScreenshot", {
            format: "png",
            fromSurface: true,
            captureBeyondViewport: true
        });
        const data = captureResult.data;
        const hasPngPayload = typeof data === "string" && data.trim() !== "";
        if (!hasPngPayload) {
            throw new Error("CDP screenshot response did not include PNG data");
        }
        // Decode once: the buffer supplies both the dimensions and the byte length.
        const buffer = Buffer.from(data, "base64");
        const { width, height } = readPngDimensions(buffer);
        const { tab } = resolution;
        return {
            targetId: tab.targetId,
            title: tab.title,
            url: tab.url,
            browserName: tab.browserName,
            mimeType: "image/png",
            bytesBase64: data,
            width,
            height,
            byteLength: buffer.byteLength,
            capturedAt: nowIso(),
            backend: "cdp-page-capture"
        };
    });
};
|
|
31
|
+
/**
 * Resolves a browser tab from a query and captures a PNG screenshot of it
 * over CDP.
 */
export class BrowserTabScreenshotService {
    resolver;
    logger;
    /**
     * @param resolver Object exposing `resolve(query)`.
     * @param logger Structured logger; only `info` is used by this class.
     */
    constructor(resolver, logger) {
        this.resolver = resolver;
        this.logger = logger;
    }
    /**
     * Resolves a tab for `query`; when one tab is chosen its screenshot is
     * captured and logged, otherwise the unresolved resolution is returned
     * unchanged.
     *
     * @param query Optional title/URL query used to pick the tab.
     */
    async captureResolved(query) {
        const resolution = this.resolver.resolve(query);
        if (resolution.status === "resolved") {
            const screenshot = await captureTabScreenshot(resolution);
            const { targetId, title } = resolution.tab;
            const { width, height, byteLength } = screenshot;
            this.logger.info("Captured browser tab screenshot via CDP", {
                query,
                targetId,
                title,
                width,
                height,
                byteLength
            });
            return { status: "completed", query, resolution, screenshot };
        }
        return resolution;
    }
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import WebSocket from "ws";
|
|
2
|
+
import type { LiveBrowserTab } from "./tab-model.js";
|
|
3
|
+
/**
 * A CDP session bound to a single WebSocket.
 *
 * NOTE(review): only the declaration is visible here; connection handling and
 * timeout behavior live in the accompanying implementation file.
 */
export declare class CdpWebSocketSession {
    private readonly socket;
    private nextCommandId;
    /** @param socket The WebSocket the session sends its commands over. */
    constructor(socket: WebSocket);
    /**
     * Sends one CDP command and resolves with its result object.
     *
     * @param method CDP method name, e.g. "Page.captureScreenshot".
     * @param params Optional command parameters.
     * @param timeoutMs Optional timeout; presumably in milliseconds, with the
     *        default defined by the implementation - confirm there.
     */
    sendCommand(
        method: string,
        params?: Record<string, unknown>,
        timeoutMs?: number
    ): Promise<Record<string, unknown>>;
}
|
|
9
|
+
/**
 * Runs `run` with a {@link CdpWebSocketSession} for the given tab and resolves
 * with whatever `run` resolves to.
 *
 * NOTE(review): how the session is opened and closed is defined by the
 * implementation file, which is not visible from this declaration.
 */
export declare const withCdpTabSession: <T>(
    tab: LiveBrowserTab,
    run: (session: CdpWebSocketSession) => Promise<T>
) => Promise<T>;
|