@poncho-ai/browser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +14 -0
- package/LICENSE +21 -0
- package/dist/index.d.ts +110 -0
- package/dist/index.js +687 -0
- package/package.json +44 -0
- package/src/index.ts +12 -0
- package/src/session.ts +595 -0
- package/src/tools.ts +167 -0
- package/src/types.ts +60 -0
- package/tsconfig.json +8 -0
package/src/tools.ts
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
import type { ToolDefinition, FileContentPart } from "@poncho-ai/sdk";
|
|
2
|
+
import type { BrowserSession } from "./session.js";
|
|
3
|
+
|
|
4
|
+
/** Raw, unvalidated arguments passed to a tool handler; each handler coerces or checks the fields it reads. */
type BrowserToolInput = Record<string, unknown>;
|
|
5
|
+
|
|
6
|
+
/**
 * Coerces a raw tool-input value to a string.
 *
 * Strings pass through; numbers, bigints and booleans are stringified. Anything
 * else (undefined, null, objects, arrays, functions) yields "" so callers can
 * reject it as missing instead of forwarding "[object Object]".
 */
function readStringInput(value: unknown): string {
  switch (typeof value) {
    case "string":
      return value;
    case "number":
    case "bigint":
    case "boolean":
      return String(value);
    default:
      return "";
  }
}

/**
 * Validates the scroll direction supplied by the caller.
 *
 * A missing direction keeps the historical default of "down".
 *
 * @throws Error when a value other than "up" or "down" is supplied.
 */
function readScrollDirection(value: unknown): "up" | "down" {
  if (value === undefined || value === null) return "down";
  if (value === "up" || value === "down") return value;
  throw new Error('direction must be "up" or "down"');
}

/**
 * Builds the browser tool definitions (open, snapshot, click, type,
 * screenshot, scroll, close).
 *
 * @param getSession - Called on every tool invocation to obtain the session to act on.
 * @param getConversationId - Called on every tool invocation; its result is passed
 *   to the session as the first argument of each operation.
 * @returns The seven tool definitions, in the order listed above.
 */
export function createBrowserTools(
  getSession: () => BrowserSession,
  getConversationId: () => string,
): ToolDefinition[] {
  return [
    {
      name: "browser_open",
      description:
        "Open a URL in a headless browser. Returns the page title. Use this to navigate to websites and web applications.",
      inputSchema: {
        type: "object",
        properties: {
          url: {
            type: "string",
            description: "The URL to navigate to (must include protocol, e.g. https://)",
          },
        },
        required: ["url"],
      },
      handler: async (input: BrowserToolInput) => {
        const session = getSession();
        const cid = getConversationId();
        // Trim so a whitespace-only value is rejected as missing.
        const url = readStringInput(input.url).trim();
        if (!url) throw new Error("url is required");
        const result = await session.open(cid, url);
        // Deliberately not awaited: a screencast failure is logged and must not
        // fail (or delay) the open call itself.
        void session.startScreencast(cid).catch((err: unknown) => {
          console.error(
            "[poncho][browser] startScreencast failed:",
            err instanceof Error ? err.message : err,
          );
        });
        return { url, title: result.title ?? "(no title)" };
      },
    },
    {
      name: "browser_snapshot",
      description:
        "Get the current page as a compact accessibility tree with element refs (@e1, @e2, ...). " +
        "Use refs to interact with elements via browser_click and browser_type. " +
        "Re-snapshot after each interaction since refs change when the page updates.",
      inputSchema: {
        type: "object",
        properties: {},
      },
      handler: async () => {
        const session = getSession();
        const snapshot = await session.snapshot(getConversationId());
        return { snapshot };
      },
    },
    {
      name: "browser_click",
      description:
        "Click an element identified by its ref from the last snapshot (e.g. @e2). " +
        "Always take a snapshot first to get current refs.",
      inputSchema: {
        type: "object",
        properties: {
          ref: {
            type: "string",
            description: 'Element ref from the snapshot (e.g. "@e2")',
          },
        },
        required: ["ref"],
      },
      handler: async (input: BrowserToolInput) => {
        const session = getSession();
        const ref = readStringInput(input.ref).trim();
        if (!ref) throw new Error("ref is required");
        await session.click(getConversationId(), ref);
        return { clicked: ref };
      },
    },
    {
      name: "browser_type",
      description:
        "Type text into a form field identified by its ref from the last snapshot. " +
        "This clears the field first, then types the new value.",
      inputSchema: {
        type: "object",
        properties: {
          ref: {
            type: "string",
            description: 'Element ref from the snapshot (e.g. "@e3")',
          },
          text: {
            type: "string",
            description: "Text to type into the field",
          },
        },
        required: ["ref", "text"],
      },
      handler: async (input: BrowserToolInput) => {
        const session = getSession();
        const ref = readStringInput(input.ref).trim();
        // Text is intentionally not trimmed: surrounding whitespace may be meaningful,
        // and an empty string is allowed through as before.
        const text = readStringInput(input.text);
        if (!ref) throw new Error("ref is required");
        await session.type(getConversationId(), ref, text);
        return { typed: text, into: ref };
      },
    },
    {
      name: "browser_screenshot",
      description:
        "Take a screenshot of the current page. Returns the image so you can see exactly what the page looks like. " +
        "Use this when you need to see visual layout, verify actions, or read content that isn't in the accessibility tree.",
      inputSchema: {
        type: "object",
        properties: {},
      },
      handler: async () => {
        const session = getSession();
        const base64 = await session.screenshot(getConversationId());
        // NOTE(review): labels the payload as JPEG; assumes session.screenshot
        // returns base64 JPEG data — confirm against the session implementation.
        const filePart: FileContentPart = {
          type: "file",
          data: base64,
          mediaType: "image/jpeg",
          filename: "screenshot.jpg",
        };
        return { screenshot: filePart };
      },
    },
    {
      name: "browser_scroll",
      description:
        "Scroll the page up or down. Use this to see content that's below or above the current viewport.",
      inputSchema: {
        type: "object",
        properties: {
          direction: {
            type: "string",
            enum: ["up", "down"],
            description: "Scroll direction",
          },
          amount: {
            type: "number",
            description: "Pixels to scroll (default: one viewport height)",
          },
        },
        required: ["direction"],
      },
      handler: async (input: BrowserToolInput) => {
        const session = getSession();
        // Validate instead of asserting: an unknown direction is reported to the
        // caller rather than being forwarded to the session under a false type.
        const direction = readScrollDirection(input.direction);
        // Non-numeric or non-finite amounts fall back to the session default.
        const amount =
          typeof input.amount === "number" && Number.isFinite(input.amount)
            ? input.amount
            : undefined;
        await session.scroll(getConversationId(), direction, amount);
        return { scrolled: direction, amount: amount ?? "viewport" };
      },
    },
    {
      name: "browser_close",
      description:
        "Close the browser tab for this conversation. Call this when you're done with browser tasks to free resources.",
      inputSchema: {
        type: "object",
        properties: {},
      },
      handler: async () => {
        const session = getSession();
        await session.closeTab(getConversationId());
        return { closed: true };
      },
    },
  ] as ToolDefinition[];
}
|
package/src/types.ts
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
 * One captured browser frame together with its dimensions and capture time.
 *
 * NOTE(review): presumably emitted by the screencast stream — confirm against
 * the session implementation.
 */
export interface BrowserFrame {
  /** Frame payload as a string (likely base64-encoded image bytes — TODO confirm). */
  data: string;
  /** Frame width (assumed to be pixels — TODO confirm). */
  width: number;
  /** Frame height (assumed to be pixels — TODO confirm). */
  height: number;
  /** Capture time as a number (unit and epoch are not visible here — TODO confirm). */
  timestamp: number;
}
|
|
7
|
+
|
|
8
|
+
/** Status summary for a browser: whether it is active, an optional URL, and whether interaction is allowed. */
export interface BrowserStatus {
  /** Whether the browser is currently active. */
  active: boolean;
  /** URL associated with the status, when one is available (presumably the current page — verify). */
  url?: string;
  /** Whether interaction is permitted (who enforces this is not visible here — TODO confirm). */
  interactionAllowed: boolean;
}
|
|
13
|
+
|
|
14
|
+
/** Optional viewport dimensions; either value may be omitted. */
export interface ViewportOptions {
  /** Viewport width (assumed to be pixels — TODO confirm). */
  width?: number;
  /** Viewport height (assumed to be pixels — TODO confirm). */
  height?: number;
}
|
|
18
|
+
|
|
19
|
+
/**
 * Optional screencast settings; every field may be omitted.
 *
 * NOTE(review): the field names mirror the parameters of the Chrome DevTools
 * Protocol `Page.startScreencast` command — confirm that is how they are used.
 */
export interface ScreencastOptions {
  /** Image format for frames: JPEG or PNG. */
  format?: "jpeg" | "png";
  /** Quality setting (valid range is not visible here — TODO confirm). */
  quality?: number;
  /** Upper bound on frame width. */
  maxWidth?: number;
  /** Upper bound on frame height. */
  maxHeight?: number;
  /** Frame sampling interval (presumably "deliver every n-th frame" — verify). */
  everyNthFrame?: number;
}
|
|
26
|
+
|
|
27
|
+
/**
 * A mouse event described by its kind and pointer position.
 *
 * NOTE(review): the `type` values match those of the Chrome DevTools Protocol
 * `Input.dispatchMouseEvent` command — confirm that is the intended target.
 */
export interface MouseInputEvent {
  /** Kind of mouse event. */
  type: "mousePressed" | "mouseReleased" | "mouseMoved" | "mouseWheel";
  /** Pointer X coordinate (coordinate space is not visible here — TODO confirm). */
  x: number;
  /** Pointer Y coordinate (coordinate space is not visible here — TODO confirm). */
  y: number;
  /** Mouse button involved, if any. */
  button?: "left" | "right" | "middle" | "none";
  /** Click count, when relevant. */
  clickCount?: number;
  /** Horizontal delta (presumably for wheel events — verify). */
  deltaX?: number;
  /** Vertical delta (presumably for wheel events — verify). */
  deltaY?: number;
}
|
|
36
|
+
|
|
37
|
+
/**
 * A keyboard event described by its kind and key.
 *
 * NOTE(review): the `type` values match those of the Chrome DevTools Protocol
 * `Input.dispatchKeyEvent` command — confirm that is the intended target.
 */
export interface KeyboardInputEvent {
  /** Kind of keyboard event. */
  type: "keyDown" | "keyUp" | "char";
  /** Key identifier (always required). */
  key: string;
  /** Optional code string (presumably the physical key code, as in DOM `KeyboardEvent.code` — verify). */
  code?: string;
  /** Optional text associated with the event. */
  text?: string;
  /** Optional numeric key code. */
  keyCode?: number;
}
|
|
44
|
+
|
|
45
|
+
/** A scroll event: required horizontal and vertical deltas plus an optional position. */
export interface ScrollInputEvent {
  /** Horizontal scroll delta. */
  deltaX: number;
  /** Vertical scroll delta. */
  deltaY: number;
  /** Optional X coordinate (presumably where the scroll is applied — verify). */
  x?: number;
  /** Optional Y coordinate (presumably where the scroll is applied — verify). */
  y?: number;
}
|
|
51
|
+
|
|
52
|
+
/** Optional browser configuration; every field may be omitted. */
export interface BrowserConfig {
  /** Viewport dimensions. */
  viewport?: ViewportOptions;
  /** Quality setting (presumably for screencast frames — verify; valid range not visible here). */
  quality?: number;
  /** Frame sampling interval (presumably "deliver every n-th frame" — verify). */
  everyNthFrame?: number;
  /** Profile directory path (presumably the browser profile location — verify). */
  profileDir?: string;
  /** Name for the session. */
  sessionName?: string;
  /** Path to an executable (presumably the browser binary — verify). */
  executablePath?: string;
  /** Whether to run headless (the default when omitted is not visible here — TODO confirm). */
  headless?: boolean;
}
|