code-puppy 0.0.356__py3-none-any.whl → 0.0.357__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_puppy/agents/agent_qa_kitten.py +10 -5
- code_puppy/agents/agent_terminal_qa.py +323 -0
- code_puppy/api/app.py +79 -2
- code_puppy/api/routers/commands.py +21 -2
- code_puppy/api/routers/sessions.py +49 -8
- code_puppy/config.py +5 -2
- code_puppy/tools/__init__.py +37 -0
- code_puppy/tools/agent_tools.py +26 -1
- code_puppy/tools/browser/__init__.py +41 -0
- code_puppy/tools/browser/browser_control.py +6 -6
- code_puppy/tools/browser/browser_interactions.py +21 -20
- code_puppy/tools/browser/browser_locators.py +9 -9
- code_puppy/tools/browser/browser_navigation.py +7 -7
- code_puppy/tools/browser/browser_screenshot.py +60 -135
- code_puppy/tools/browser/browser_screenshot_vqa.py +195 -0
- code_puppy/tools/browser/browser_scripts.py +15 -13
- code_puppy/tools/browser/camoufox_manager.py +226 -64
- code_puppy/tools/browser/chromium_terminal_manager.py +259 -0
- code_puppy/tools/browser/terminal_command_tools.py +521 -0
- code_puppy/tools/browser/terminal_screenshot_tools.py +520 -0
- code_puppy/tools/browser/terminal_tools.py +525 -0
- code_puppy/tools/browser/vqa_agent.py +138 -34
- code_puppy/tools/command_runner.py +0 -1
- {code_puppy-0.0.356.dist-info → code_puppy-0.0.357.dist-info}/METADATA +1 -1
- {code_puppy-0.0.356.dist-info → code_puppy-0.0.357.dist-info}/RECORD +30 -24
- {code_puppy-0.0.356.data → code_puppy-0.0.357.data}/data/code_puppy/models.json +0 -0
- {code_puppy-0.0.356.data → code_puppy-0.0.357.data}/data/code_puppy/models_dev_api.json +0 -0
- {code_puppy-0.0.356.dist-info → code_puppy-0.0.357.dist-info}/WHEEL +0 -0
- {code_puppy-0.0.356.dist-info → code_puppy-0.0.357.dist-info}/entry_points.txt +0 -0
- {code_puppy-0.0.356.dist-info → code_puppy-0.0.357.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,19 +1,21 @@
|
|
|
1
|
-
"""Screenshot
|
|
1
|
+
"""Screenshot tool for browser automation.
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Captures screenshots and returns them as base64 data that multimodal
|
|
4
|
+
models can directly see and analyze - no separate VQA agent needed.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import base64
|
|
4
8
|
from datetime import datetime
|
|
5
9
|
from pathlib import Path
|
|
6
10
|
from tempfile import gettempdir, mkdtemp
|
|
7
11
|
from typing import Any, Dict, Optional
|
|
8
12
|
|
|
9
|
-
from pydantic import BaseModel
|
|
10
13
|
from pydantic_ai import RunContext
|
|
11
14
|
|
|
12
15
|
from code_puppy.messaging import emit_error, emit_info, emit_success
|
|
13
16
|
from code_puppy.tools.common import generate_group_id
|
|
14
17
|
|
|
15
|
-
from .camoufox_manager import
|
|
16
|
-
from .vqa_agent import run_vqa_analysis
|
|
18
|
+
from .camoufox_manager import get_session_browser_manager
|
|
17
19
|
|
|
18
20
|
_TEMP_SCREENSHOT_ROOT = Path(
|
|
19
21
|
mkdtemp(prefix="code_puppy_screenshots_", dir=gettempdir())
|
|
@@ -21,21 +23,11 @@ _TEMP_SCREENSHOT_ROOT = Path(
|
|
|
21
23
|
|
|
22
24
|
|
|
23
25
|
def _build_screenshot_path(timestamp: str) -> Path:
|
|
24
|
-
"""Return the target path for a screenshot
|
|
26
|
+
"""Return the target path for a screenshot."""
|
|
25
27
|
filename = f"screenshot_{timestamp}.png"
|
|
26
28
|
return _TEMP_SCREENSHOT_ROOT / filename
|
|
27
29
|
|
|
28
30
|
|
|
29
|
-
class ScreenshotResult(BaseModel):
|
|
30
|
-
"""Result from screenshot operation."""
|
|
31
|
-
|
|
32
|
-
success: bool
|
|
33
|
-
screenshot_path: Optional[str] = None
|
|
34
|
-
screenshot_data: Optional[bytes] = None
|
|
35
|
-
timestamp: Optional[str] = None
|
|
36
|
-
error: Optional[str] = None
|
|
37
|
-
|
|
38
|
-
|
|
39
31
|
async def _capture_screenshot(
|
|
40
32
|
page,
|
|
41
33
|
full_page: bool = False,
|
|
@@ -45,41 +37,38 @@ async def _capture_screenshot(
|
|
|
45
37
|
) -> Dict[str, Any]:
|
|
46
38
|
"""Internal screenshot capture function."""
|
|
47
39
|
try:
|
|
48
|
-
timestamp = datetime.now().strftime("%Y%m%d_%H%M%
|
|
40
|
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
|
|
49
41
|
|
|
50
42
|
# Take screenshot
|
|
51
43
|
if element_selector:
|
|
52
|
-
# Screenshot specific element
|
|
53
44
|
element = await page.locator(element_selector).first
|
|
54
45
|
if not await element.is_visible():
|
|
55
46
|
return {
|
|
56
47
|
"success": False,
|
|
57
48
|
"error": f"Element '{element_selector}' is not visible",
|
|
58
49
|
}
|
|
59
|
-
|
|
50
|
+
screenshot_bytes = await element.screenshot()
|
|
60
51
|
else:
|
|
61
|
-
|
|
62
|
-
screenshot_data = await page.screenshot(full_page=full_page)
|
|
52
|
+
screenshot_bytes = await page.screenshot(full_page=full_page)
|
|
63
53
|
|
|
64
|
-
result = {
|
|
54
|
+
result: Dict[str, Any] = {
|
|
65
55
|
"success": True,
|
|
66
|
-
"
|
|
56
|
+
"screenshot_bytes": screenshot_bytes,
|
|
57
|
+
"base64_data": base64.b64encode(screenshot_bytes).decode("utf-8"),
|
|
67
58
|
"timestamp": timestamp,
|
|
68
59
|
}
|
|
69
60
|
|
|
70
61
|
if save_screenshot:
|
|
71
62
|
screenshot_path = _build_screenshot_path(timestamp)
|
|
72
63
|
screenshot_path.parent.mkdir(parents=True, exist_ok=True)
|
|
73
|
-
|
|
74
64
|
with open(screenshot_path, "wb") as f:
|
|
75
|
-
f.write(
|
|
76
|
-
|
|
65
|
+
f.write(screenshot_bytes)
|
|
77
66
|
result["screenshot_path"] = str(screenshot_path)
|
|
78
|
-
|
|
67
|
+
|
|
79
68
|
if group_id:
|
|
80
|
-
emit_success(
|
|
81
|
-
|
|
82
|
-
|
|
69
|
+
emit_success(
|
|
70
|
+
f"Screenshot saved: {screenshot_path}", message_group=group_id
|
|
71
|
+
)
|
|
83
72
|
|
|
84
73
|
return result
|
|
85
74
|
|
|
@@ -87,46 +76,43 @@ async def _capture_screenshot(
|
|
|
87
76
|
return {"success": False, "error": str(e)}
|
|
88
77
|
|
|
89
78
|
|
|
90
|
-
async def
|
|
91
|
-
question: str,
|
|
79
|
+
async def take_screenshot(
|
|
92
80
|
full_page: bool = False,
|
|
93
81
|
element_selector: Optional[str] = None,
|
|
94
82
|
save_screenshot: bool = True,
|
|
95
83
|
) -> Dict[str, Any]:
|
|
96
|
-
"""
|
|
97
|
-
|
|
84
|
+
"""Take a screenshot of the browser page.
|
|
85
|
+
|
|
86
|
+
Returns the screenshot as base64-encoded PNG data that multimodal
|
|
87
|
+
models can directly see and analyze.
|
|
98
88
|
|
|
99
89
|
Args:
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
save_screenshot: Whether to save the screenshot to disk
|
|
90
|
+
full_page: Whether to capture full page or just viewport.
|
|
91
|
+
element_selector: Optional selector to screenshot specific element.
|
|
92
|
+
save_screenshot: Whether to save the screenshot to disk.
|
|
104
93
|
|
|
105
94
|
Returns:
|
|
106
|
-
Dict containing
|
|
95
|
+
Dict containing:
|
|
96
|
+
- success (bool): True if screenshot was captured.
|
|
97
|
+
- base64_image (str): Base64-encoded PNG image data.
|
|
98
|
+
- media_type (str): Always "image/png".
|
|
99
|
+
- screenshot_path (str): Path to saved file (if saved).
|
|
100
|
+
- error (str): Error message if unsuccessful.
|
|
107
101
|
"""
|
|
108
102
|
target = element_selector or ("full_page" if full_page else "viewport")
|
|
109
|
-
group_id = generate_group_id(
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
emit_info(
|
|
113
|
-
f"BROWSER SCREENSHOT ANALYZE 📷 question='{question[:100]}{'...' if len(question) > 100 else ''}' target={target}",
|
|
114
|
-
message_group=group_id,
|
|
115
|
-
)
|
|
103
|
+
group_id = generate_group_id("browser_screenshot", target)
|
|
104
|
+
emit_info(f"BROWSER SCREENSHOT 📷 target={target}", message_group=group_id)
|
|
105
|
+
|
|
116
106
|
try:
|
|
117
|
-
|
|
118
|
-
browser_manager = get_camoufox_manager()
|
|
107
|
+
browser_manager = get_session_browser_manager()
|
|
119
108
|
page = await browser_manager.get_current_page()
|
|
120
109
|
|
|
121
110
|
if not page:
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
"question": question,
|
|
126
|
-
}
|
|
111
|
+
error_msg = "No active browser page. Navigate to a webpage first."
|
|
112
|
+
emit_error(error_msg, message_group=group_id)
|
|
113
|
+
return {"success": False, "error": error_msg}
|
|
127
114
|
|
|
128
|
-
|
|
129
|
-
screenshot_result = await _capture_screenshot(
|
|
115
|
+
result = await _capture_screenshot(
|
|
130
116
|
page,
|
|
131
117
|
full_page=full_page,
|
|
132
118
|
element_selector=element_selector,
|
|
@@ -134,108 +120,47 @@ async def take_screenshot_and_analyze(
|
|
|
134
120
|
group_id=group_id,
|
|
135
121
|
)
|
|
136
122
|
|
|
137
|
-
if not
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
f"Screenshot capture failed: {error_message}",
|
|
141
|
-
message_group=group_id,
|
|
142
|
-
)
|
|
143
|
-
return {
|
|
144
|
-
"success": False,
|
|
145
|
-
"error": error_message,
|
|
146
|
-
"question": question,
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
screenshot_bytes = screenshot_result.get("screenshot_data")
|
|
150
|
-
if not screenshot_bytes:
|
|
151
|
-
emit_error(
|
|
152
|
-
"Screenshot captured but pixel data missing; cannot run visual analysis.",
|
|
153
|
-
message_group=group_id,
|
|
154
|
-
)
|
|
155
|
-
return {
|
|
156
|
-
"success": False,
|
|
157
|
-
"error": "Screenshot captured but no image bytes available for analysis.",
|
|
158
|
-
"question": question,
|
|
159
|
-
}
|
|
160
|
-
|
|
161
|
-
try:
|
|
162
|
-
vqa_result = await asyncio.to_thread(
|
|
163
|
-
run_vqa_analysis,
|
|
164
|
-
question,
|
|
165
|
-
screenshot_bytes,
|
|
166
|
-
)
|
|
167
|
-
except Exception as exc:
|
|
168
|
-
emit_error(
|
|
169
|
-
f"Visual question answering failed: {exc}",
|
|
170
|
-
message_group=group_id,
|
|
171
|
-
)
|
|
172
|
-
return {
|
|
173
|
-
"success": False,
|
|
174
|
-
"error": f"Visual analysis failed: {exc}",
|
|
175
|
-
"question": question,
|
|
176
|
-
"screenshot_info": {
|
|
177
|
-
"path": screenshot_result.get("screenshot_path"),
|
|
178
|
-
"timestamp": screenshot_result.get("timestamp"),
|
|
179
|
-
"full_page": full_page,
|
|
180
|
-
"element_selector": element_selector,
|
|
181
|
-
},
|
|
182
|
-
}
|
|
183
|
-
|
|
184
|
-
emit_success(
|
|
185
|
-
f"Visual analysis answer: {vqa_result.answer}",
|
|
186
|
-
message_group=group_id,
|
|
187
|
-
)
|
|
188
|
-
emit_info(
|
|
189
|
-
f"Observations: {vqa_result.observations}",
|
|
190
|
-
message_group=group_id,
|
|
191
|
-
)
|
|
123
|
+
if not result["success"]:
|
|
124
|
+
emit_error(result.get("error", "Screenshot failed"), message_group=group_id)
|
|
125
|
+
return result
|
|
192
126
|
|
|
193
127
|
return {
|
|
194
128
|
"success": True,
|
|
195
|
-
"
|
|
196
|
-
"
|
|
197
|
-
"
|
|
198
|
-
"
|
|
199
|
-
"screenshot_info": {
|
|
200
|
-
"path": screenshot_result.get("screenshot_path"),
|
|
201
|
-
"size": len(screenshot_bytes),
|
|
202
|
-
"timestamp": screenshot_result.get("timestamp"),
|
|
203
|
-
"full_page": full_page,
|
|
204
|
-
"element_selector": element_selector,
|
|
205
|
-
},
|
|
129
|
+
"base64_image": result["base64_data"],
|
|
130
|
+
"media_type": "image/png",
|
|
131
|
+
"screenshot_path": result.get("screenshot_path"),
|
|
132
|
+
"message": "Screenshot captured. The base64_image contains the browser view.",
|
|
206
133
|
}
|
|
207
134
|
|
|
208
135
|
except Exception as e:
|
|
209
|
-
|
|
210
|
-
|
|
136
|
+
error_msg = f"Screenshot failed: {str(e)}"
|
|
137
|
+
emit_error(error_msg, message_group=group_id)
|
|
138
|
+
return {"success": False, "error": error_msg}
|
|
211
139
|
|
|
212
140
|
|
|
213
141
|
def register_take_screenshot_and_analyze(agent):
|
|
214
|
-
"""Register the screenshot
|
|
142
|
+
"""Register the screenshot tool."""
|
|
215
143
|
|
|
216
144
|
@agent.tool
|
|
217
145
|
async def browser_screenshot_analyze(
|
|
218
146
|
context: RunContext,
|
|
219
|
-
question: str,
|
|
220
147
|
full_page: bool = False,
|
|
221
148
|
element_selector: Optional[str] = None,
|
|
222
|
-
save_screenshot: bool = True,
|
|
223
149
|
) -> Dict[str, Any]:
|
|
224
150
|
"""
|
|
225
|
-
Take a screenshot
|
|
151
|
+
Take a screenshot of the browser page.
|
|
152
|
+
|
|
153
|
+
Returns the screenshot as base64 image data that you can see directly.
|
|
154
|
+
Use this to see what's displayed in the browser.
|
|
226
155
|
|
|
227
156
|
Args:
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
element_selector: Optional CSS/XPath selector to screenshot specific element
|
|
231
|
-
save_screenshot: Whether to save the screenshot to disk
|
|
157
|
+
full_page: Capture full page (True) or just viewport (False).
|
|
158
|
+
element_selector: Optional CSS selector to screenshot specific element.
|
|
232
159
|
|
|
233
160
|
Returns:
|
|
234
|
-
Dict with
|
|
161
|
+
Dict with base64_image (PNG data you can see), screenshot_path, etc.
|
|
235
162
|
"""
|
|
236
|
-
return await
|
|
237
|
-
question=question,
|
|
163
|
+
return await take_screenshot(
|
|
238
164
|
full_page=full_page,
|
|
239
165
|
element_selector=element_selector,
|
|
240
|
-
save_screenshot=save_screenshot,
|
|
241
166
|
)
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
"""VQA-based Screenshot tool for browser automation (qa-kitten).
|
|
2
|
+
|
|
3
|
+
This module provides screenshot analysis using a dedicated VQA agent.
|
|
4
|
+
Unlike browser_screenshot.py which returns raw base64 bytes for multimodal
|
|
5
|
+
models to see directly, this version offloads the visual analysis to a
|
|
6
|
+
separate VQA agent, helping manage context in the calling agent.
|
|
7
|
+
|
|
8
|
+
Use this for qa-kitten where context management is important.
|
|
9
|
+
Use browser_screenshot.py for terminal-qa where direct image viewing is needed.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from typing import Any, Dict, Optional
|
|
13
|
+
|
|
14
|
+
from pydantic_ai import RunContext
|
|
15
|
+
from rich.console import Console
|
|
16
|
+
|
|
17
|
+
from code_puppy.messaging import emit_error, emit_info, emit_success
|
|
18
|
+
from code_puppy.tools.common import generate_group_id
|
|
19
|
+
|
|
20
|
+
from .browser_screenshot import _capture_screenshot
|
|
21
|
+
from .camoufox_manager import get_session_browser_manager
|
|
22
|
+
from .vqa_agent import run_vqa_analysis_stream
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
async def take_screenshot_and_analyze(
|
|
26
|
+
question: str,
|
|
27
|
+
full_page: bool = False,
|
|
28
|
+
element_selector: Optional[str] = None,
|
|
29
|
+
save_screenshot: bool = True,
|
|
30
|
+
) -> Dict[str, Any]:
|
|
31
|
+
"""Take a screenshot and analyze it using the VQA agent.
|
|
32
|
+
|
|
33
|
+
This function captures a screenshot and passes it to a dedicated
|
|
34
|
+
VQA (Visual Question Answering) agent for analysis. The VQA agent
|
|
35
|
+
runs separately, keeping the image analysis out of the calling
|
|
36
|
+
agent's context window.
|
|
37
|
+
|
|
38
|
+
Args:
|
|
39
|
+
question: The question to ask about the screenshot.
|
|
40
|
+
Examples:
|
|
41
|
+
- "What buttons are visible on this page?"
|
|
42
|
+
- "Is there an error message displayed?"
|
|
43
|
+
- "What is the main heading text?"
|
|
44
|
+
- "Describe the layout of this form."
|
|
45
|
+
full_page: Whether to capture full page or just viewport.
|
|
46
|
+
Defaults to False (viewport only).
|
|
47
|
+
element_selector: Optional CSS selector to screenshot a specific
|
|
48
|
+
element instead of the whole page.
|
|
49
|
+
save_screenshot: Whether to save the screenshot to disk.
|
|
50
|
+
|
|
51
|
+
Returns:
|
|
52
|
+
Dict containing:
|
|
53
|
+
- success (bool): True if analysis succeeded.
|
|
54
|
+
- answer (str): The VQA agent's streamed answer to your question.
|
|
55
|
+
- screenshot_info (dict): Path, timestamp, and other metadata.
|
|
56
|
+
- error (str): Error message if unsuccessful.
|
|
57
|
+
"""
|
|
58
|
+
target = element_selector or ("full_page" if full_page else "viewport")
|
|
59
|
+
group_id = generate_group_id(
|
|
60
|
+
"browser_screenshot_analyze", f"{question[:50]}_{target}"
|
|
61
|
+
)
|
|
62
|
+
emit_info(
|
|
63
|
+
f"BROWSER SCREENSHOT ANALYZE 📷 question='{question[:100]}{'...' if len(question) > 100 else ''}' target={target}",
|
|
64
|
+
message_group=group_id,
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
try:
|
|
68
|
+
# Get the browser page
|
|
69
|
+
browser_manager = get_session_browser_manager()
|
|
70
|
+
page = await browser_manager.get_current_page()
|
|
71
|
+
|
|
72
|
+
if not page:
|
|
73
|
+
error_msg = "No active browser page. Navigate to a webpage first."
|
|
74
|
+
emit_error(error_msg, message_group=group_id)
|
|
75
|
+
return {"success": False, "error": error_msg, "question": question}
|
|
76
|
+
|
|
77
|
+
# Capture the screenshot
|
|
78
|
+
screenshot_result = await _capture_screenshot(
|
|
79
|
+
page,
|
|
80
|
+
full_page=full_page,
|
|
81
|
+
element_selector=element_selector,
|
|
82
|
+
save_screenshot=save_screenshot,
|
|
83
|
+
group_id=group_id,
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
if not screenshot_result["success"]:
|
|
87
|
+
error_msg = screenshot_result.get("error", "Screenshot failed")
|
|
88
|
+
emit_error(
|
|
89
|
+
f"Screenshot capture failed: {error_msg}", message_group=group_id
|
|
90
|
+
)
|
|
91
|
+
return {"success": False, "error": error_msg, "question": question}
|
|
92
|
+
|
|
93
|
+
screenshot_bytes = screenshot_result.get("screenshot_bytes")
|
|
94
|
+
if not screenshot_bytes:
|
|
95
|
+
emit_error(
|
|
96
|
+
"Screenshot captured but pixel data missing; cannot run visual analysis.",
|
|
97
|
+
message_group=group_id,
|
|
98
|
+
)
|
|
99
|
+
return {
|
|
100
|
+
"success": False,
|
|
101
|
+
"error": "Screenshot captured but no image bytes available for analysis.",
|
|
102
|
+
"question": question,
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
# Run VQA analysis with streaming output
|
|
106
|
+
try:
|
|
107
|
+
console = Console()
|
|
108
|
+
console.print() # Newline before streaming starts
|
|
109
|
+
console.print("[bold cyan]🔍 VQA Analysis:[/bold cyan]")
|
|
110
|
+
|
|
111
|
+
vqa_answer = await run_vqa_analysis_stream(
|
|
112
|
+
question,
|
|
113
|
+
screenshot_bytes,
|
|
114
|
+
)
|
|
115
|
+
except Exception as exc:
|
|
116
|
+
emit_error(
|
|
117
|
+
f"Visual question answering failed: {exc}",
|
|
118
|
+
message_group=group_id,
|
|
119
|
+
)
|
|
120
|
+
return {
|
|
121
|
+
"success": False,
|
|
122
|
+
"error": f"Visual analysis failed: {exc}",
|
|
123
|
+
"question": question,
|
|
124
|
+
"screenshot_info": {
|
|
125
|
+
"path": screenshot_result.get("screenshot_path"),
|
|
126
|
+
"timestamp": screenshot_result.get("timestamp"),
|
|
127
|
+
"full_page": full_page,
|
|
128
|
+
"element_selector": element_selector,
|
|
129
|
+
},
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
emit_success(
|
|
133
|
+
"Visual analysis complete",
|
|
134
|
+
message_group=group_id,
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
return {
|
|
138
|
+
"success": True,
|
|
139
|
+
"question": question,
|
|
140
|
+
"answer": vqa_answer,
|
|
141
|
+
"screenshot_info": {
|
|
142
|
+
"path": screenshot_result.get("screenshot_path"),
|
|
143
|
+
"size": len(screenshot_bytes),
|
|
144
|
+
"timestamp": screenshot_result.get("timestamp"),
|
|
145
|
+
"full_page": full_page,
|
|
146
|
+
"element_selector": element_selector,
|
|
147
|
+
},
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
except Exception as e:
|
|
151
|
+
error_msg = f"Screenshot analysis failed: {str(e)}"
|
|
152
|
+
emit_error(error_msg, message_group=group_id)
|
|
153
|
+
return {"success": False, "error": error_msg, "question": question}
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def register_take_screenshot_and_analyze_vqa(agent):
|
|
157
|
+
"""Register the VQA-based screenshot tool.
|
|
158
|
+
|
|
159
|
+
This tool takes a screenshot and analyzes it using a separate VQA agent.
|
|
160
|
+
Use this for agents where context management is important (like qa-kitten).
|
|
161
|
+
"""
|
|
162
|
+
|
|
163
|
+
@agent.tool
|
|
164
|
+
async def browser_screenshot_vqa(
|
|
165
|
+
context: RunContext,
|
|
166
|
+
question: str,
|
|
167
|
+
full_page: bool = False,
|
|
168
|
+
element_selector: Optional[str] = None,
|
|
169
|
+
) -> Dict[str, Any]:
|
|
170
|
+
"""
|
|
171
|
+
Take a screenshot and analyze it with VQA.
|
|
172
|
+
|
|
173
|
+
Captures a screenshot of the browser and uses a visual AI to
|
|
174
|
+
answer your question about what's visible on the page.
|
|
175
|
+
|
|
176
|
+
Args:
|
|
177
|
+
question: What you want to know about the screenshot.
|
|
178
|
+
Examples:
|
|
179
|
+
- "What buttons are visible?"
|
|
180
|
+
- "Is there an error message?"
|
|
181
|
+
- "What is the page title?"
|
|
182
|
+
- "Is the form filled out correctly?"
|
|
183
|
+
full_page: Capture full page (True) or just viewport (False).
|
|
184
|
+
element_selector: Optional CSS selector to screenshot specific element.
|
|
185
|
+
|
|
186
|
+
Returns:
|
|
187
|
+
Dict with:
|
|
188
|
+
- answer: The streamed answer to your question
|
|
189
|
+
- screenshot_info: Where the screenshot was saved, etc.
|
|
190
|
+
"""
|
|
191
|
+
return await take_screenshot_and_analyze(
|
|
192
|
+
question=question,
|
|
193
|
+
full_page=full_page,
|
|
194
|
+
element_selector=element_selector,
|
|
195
|
+
)
|
|
@@ -7,7 +7,7 @@ from pydantic_ai import RunContext
|
|
|
7
7
|
from code_puppy.messaging import emit_error, emit_info, emit_success
|
|
8
8
|
from code_puppy.tools.common import generate_group_id
|
|
9
9
|
|
|
10
|
-
from .camoufox_manager import
|
|
10
|
+
from .camoufox_manager import get_session_browser_manager
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
async def execute_javascript(
|
|
@@ -21,14 +21,16 @@ async def execute_javascript(
|
|
|
21
21
|
message_group=group_id,
|
|
22
22
|
)
|
|
23
23
|
try:
|
|
24
|
-
browser_manager =
|
|
24
|
+
browser_manager = get_session_browser_manager()
|
|
25
25
|
page = await browser_manager.get_current_page()
|
|
26
26
|
|
|
27
27
|
if not page:
|
|
28
28
|
return {"success": False, "error": "No active browser page available"}
|
|
29
29
|
|
|
30
30
|
# Execute JavaScript
|
|
31
|
-
|
|
31
|
+
# Note: page.evaluate() does NOT accept a timeout parameter
|
|
32
|
+
# The timeout arg to this function is kept for API compatibility but unused
|
|
33
|
+
result = await page.evaluate(script)
|
|
32
34
|
|
|
33
35
|
emit_success("JavaScript executed successfully", message_group=group_id)
|
|
34
36
|
|
|
@@ -52,7 +54,7 @@ async def scroll_page(
|
|
|
52
54
|
message_group=group_id,
|
|
53
55
|
)
|
|
54
56
|
try:
|
|
55
|
-
browser_manager =
|
|
57
|
+
browser_manager = get_session_browser_manager()
|
|
56
58
|
page = await browser_manager.get_current_page()
|
|
57
59
|
|
|
58
60
|
if not page:
|
|
@@ -60,7 +62,7 @@ async def scroll_page(
|
|
|
60
62
|
|
|
61
63
|
if element_selector:
|
|
62
64
|
# Scroll specific element
|
|
63
|
-
element = page.locator(element_selector)
|
|
65
|
+
element = page.locator(element_selector).first
|
|
64
66
|
await element.scroll_into_view_if_needed()
|
|
65
67
|
|
|
66
68
|
# Get element's current scroll position and dimensions
|
|
@@ -146,13 +148,13 @@ async def scroll_to_element(
|
|
|
146
148
|
message_group=group_id,
|
|
147
149
|
)
|
|
148
150
|
try:
|
|
149
|
-
browser_manager =
|
|
151
|
+
browser_manager = get_session_browser_manager()
|
|
150
152
|
page = await browser_manager.get_current_page()
|
|
151
153
|
|
|
152
154
|
if not page:
|
|
153
155
|
return {"success": False, "error": "No active browser page available"}
|
|
154
156
|
|
|
155
|
-
element = page.locator(selector)
|
|
157
|
+
element = page.locator(selector).first
|
|
156
158
|
await element.wait_for(state="attached", timeout=timeout)
|
|
157
159
|
await element.scroll_into_view_if_needed()
|
|
158
160
|
|
|
@@ -178,7 +180,7 @@ async def set_viewport_size(
|
|
|
178
180
|
message_group=group_id,
|
|
179
181
|
)
|
|
180
182
|
try:
|
|
181
|
-
browser_manager =
|
|
183
|
+
browser_manager = get_session_browser_manager()
|
|
182
184
|
page = await browser_manager.get_current_page()
|
|
183
185
|
|
|
184
186
|
if not page:
|
|
@@ -209,13 +211,13 @@ async def wait_for_element(
|
|
|
209
211
|
message_group=group_id,
|
|
210
212
|
)
|
|
211
213
|
try:
|
|
212
|
-
browser_manager =
|
|
214
|
+
browser_manager = get_session_browser_manager()
|
|
213
215
|
page = await browser_manager.get_current_page()
|
|
214
216
|
|
|
215
217
|
if not page:
|
|
216
218
|
return {"success": False, "error": "No active browser page available"}
|
|
217
219
|
|
|
218
|
-
element = page.locator(selector)
|
|
220
|
+
element = page.locator(selector).first
|
|
219
221
|
await element.wait_for(state=state, timeout=timeout)
|
|
220
222
|
|
|
221
223
|
emit_success(f"Element {selector} is now {state}", message_group=group_id)
|
|
@@ -240,13 +242,13 @@ async def highlight_element(
|
|
|
240
242
|
message_group=group_id,
|
|
241
243
|
)
|
|
242
244
|
try:
|
|
243
|
-
browser_manager =
|
|
245
|
+
browser_manager = get_session_browser_manager()
|
|
244
246
|
page = await browser_manager.get_current_page()
|
|
245
247
|
|
|
246
248
|
if not page:
|
|
247
249
|
return {"success": False, "error": "No active browser page available"}
|
|
248
250
|
|
|
249
|
-
element = page.locator(selector)
|
|
251
|
+
element = page.locator(selector).first
|
|
250
252
|
await element.wait_for(state="visible", timeout=timeout)
|
|
251
253
|
|
|
252
254
|
# Add highlight style
|
|
@@ -277,7 +279,7 @@ async def clear_highlights() -> Dict[str, Any]:
|
|
|
277
279
|
message_group=group_id,
|
|
278
280
|
)
|
|
279
281
|
try:
|
|
280
|
-
browser_manager =
|
|
282
|
+
browser_manager = get_session_browser_manager()
|
|
281
283
|
page = await browser_manager.get_current_page()
|
|
282
284
|
|
|
283
285
|
if not page:
|