code-puppy 0.0.358__py3-none-any.whl → 0.0.360__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_puppy/agents/agent_qa_kitten.py +13 -13
- code_puppy/agents/event_stream_handler.py +27 -7
- code_puppy/config.py +1 -63
- code_puppy/tools/__init__.py +1 -5
- code_puppy/tools/browser/__init__.py +0 -4
- code_puppy/tools/browser/browser_screenshot.py +41 -28
- code_puppy/tools/browser/terminal_screenshot_tools.py +113 -77
- {code_puppy-0.0.358.dist-info → code_puppy-0.0.360.dist-info}/METADATA +1 -1
- {code_puppy-0.0.358.dist-info → code_puppy-0.0.360.dist-info}/RECORD +14 -16
- code_puppy/tools/browser/browser_screenshot_vqa.py +0 -195
- code_puppy/tools/browser/vqa_agent.py +0 -194
- {code_puppy-0.0.358.data → code_puppy-0.0.360.data}/data/code_puppy/models.json +0 -0
- {code_puppy-0.0.358.data → code_puppy-0.0.360.data}/data/code_puppy/models_dev_api.json +0 -0
- {code_puppy-0.0.358.dist-info → code_puppy-0.0.360.dist-info}/WHEEL +0 -0
- {code_puppy-0.0.358.dist-info → code_puppy-0.0.360.dist-info}/entry_points.txt +0 -0
- {code_puppy-0.0.358.dist-info → code_puppy-0.0.360.dist-info}/licenses/LICENSE +0 -0
|
@@ -16,7 +16,7 @@ class QualityAssuranceKittenAgent(BaseAgent):
|
|
|
16
16
|
|
|
17
17
|
@property
|
|
18
18
|
def description(self) -> str:
|
|
19
|
-
return "Advanced web browser automation and quality assurance testing using Playwright with
|
|
19
|
+
return "Advanced web browser automation and quality assurance testing using Playwright with visual analysis capabilities"
|
|
20
20
|
|
|
21
21
|
def get_available_tools(self) -> list[str]:
|
|
22
22
|
"""Get the list of tools available to Web Browser Puppy."""
|
|
@@ -63,8 +63,9 @@ class QualityAssuranceKittenAgent(BaseAgent):
|
|
|
63
63
|
"browser_wait_for_element",
|
|
64
64
|
"browser_highlight_element",
|
|
65
65
|
"browser_clear_highlights",
|
|
66
|
-
# Screenshots
|
|
67
|
-
"
|
|
66
|
+
# Screenshots (returns BinaryContent for direct visual analysis)
|
|
67
|
+
"browser_screenshot_analyze",
|
|
68
|
+
"load_image_for_analysis",
|
|
68
69
|
# Workflow management
|
|
69
70
|
"browser_save_workflow",
|
|
70
71
|
"browser_list_workflows",
|
|
@@ -78,7 +79,7 @@ You are Quality Assurance Kitten 🐱, an advanced autonomous browser automation
|
|
|
78
79
|
|
|
79
80
|
You specialize in:
|
|
80
81
|
🎯 **Quality Assurance Testing** - automated testing of web applications and user workflows
|
|
81
|
-
👁️ **Visual verification** - taking screenshots
|
|
82
|
+
👁️ **Visual verification** - taking screenshots you can directly see and analyze for bugs
|
|
82
83
|
🔍 **Element discovery** - finding elements using semantic locators and accessibility best practices
|
|
83
84
|
📝 **Data extraction** - scraping content and gathering information from web pages
|
|
84
85
|
🧪 **Web automation** - filling forms, clicking buttons, navigating sites with precision
|
|
@@ -117,12 +118,10 @@ For any browser task, follow this approach:
|
|
|
117
118
|
|
|
118
119
|
### Visual Verification Workflow
|
|
119
120
|
- **Before critical actions**: Use browser_highlight_element to visually confirm
|
|
120
|
-
- **After interactions**: Use
|
|
121
|
-
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
- "Is the form filled out correctly?"
|
|
125
|
-
- "What is the main heading text?"
|
|
121
|
+
- **After interactions**: Use browser_screenshot_analyze to verify results
|
|
122
|
+
- The screenshot is returned directly as an image you can see and analyze
|
|
123
|
+
- No need to ask questions - just analyze what you see in the returned image
|
|
124
|
+
- Use load_image_for_analysis to load mockups or reference images for comparison
|
|
126
125
|
|
|
127
126
|
### Form Input Best Practices
|
|
128
127
|
- **ALWAYS check current values** with browser_get_value before typing
|
|
@@ -135,14 +134,15 @@ For any browser task, follow this approach:
|
|
|
135
134
|
**When Element Discovery Fails:**
|
|
136
135
|
1. Try different semantic locators first
|
|
137
136
|
2. Use browser_find_buttons or browser_find_links to see available elements
|
|
138
|
-
3. Take a screenshot with browser_screenshot_analyze to understand the page layout
|
|
137
|
+
3. Take a screenshot with browser_screenshot_analyze to see and understand the page layout
|
|
139
138
|
4. Only use XPath as absolute last resort
|
|
140
139
|
|
|
141
140
|
**When Page Interactions Fail:**
|
|
142
141
|
1. Check if element is visible with browser_wait_for_element
|
|
143
142
|
2. Scroll element into view with browser_scroll_to_element
|
|
144
143
|
3. Use browser_highlight_element to confirm element location
|
|
145
|
-
4.
|
|
144
|
+
4. Take a screenshot with browser_screenshot_analyze to see the actual page state
|
|
145
|
+
5. Try browser_execute_js for complex interactions
|
|
146
146
|
|
|
147
147
|
### JavaScript Execution
|
|
148
148
|
- Use browser_execute_js for:
|
|
@@ -187,7 +187,7 @@ For any browser task, follow this approach:
|
|
|
187
187
|
## Specialized Capabilities
|
|
188
188
|
|
|
189
189
|
🌐 **WCAG 2.2 Level AA Compliance**: Always prioritize accessibility in element discovery
|
|
190
|
-
📸 **Visual
|
|
190
|
+
📸 **Direct Visual Analysis**: Use browser_screenshot_analyze to see and analyze page content directly
|
|
191
191
|
🚀 **Semantic Web Navigation**: Prefer role-based and label-based element discovery
|
|
192
192
|
⚡ **Playwright Power**: Full access to modern browser automation capabilities
|
|
193
193
|
📋 **Workflow Management**: Save, load, and reuse automation patterns for consistency
|
|
@@ -119,6 +119,7 @@ async def event_stream_handler(
|
|
|
119
119
|
tool_parts: set[int] = set() # Track which parts are tool calls
|
|
120
120
|
banner_printed: set[int] = set() # Track if banner was already printed
|
|
121
121
|
token_count: dict[int, int] = {} # Track token count per text/tool part
|
|
122
|
+
tool_names: dict[int, str] = {} # Track tool name per tool part index
|
|
122
123
|
did_stream_anything = False # Track if we streamed any content
|
|
123
124
|
|
|
124
125
|
# Termflow streaming state for text parts
|
|
@@ -203,6 +204,8 @@ async def event_stream_handler(
|
|
|
203
204
|
streaming_parts.add(event.index)
|
|
204
205
|
tool_parts.add(event.index)
|
|
205
206
|
token_count[event.index] = 0 # Initialize token counter
|
|
207
|
+
# Capture tool name from the start event
|
|
208
|
+
tool_names[event.index] = part.tool_name or ""
|
|
206
209
|
# Track tool name for display
|
|
207
210
|
banner_printed.add(
|
|
208
211
|
event.index
|
|
@@ -253,20 +256,36 @@ async def event_stream_handler(
|
|
|
253
256
|
escaped = escape(delta.content_delta)
|
|
254
257
|
console.print(f"[dim]{escaped}[/dim]", end="")
|
|
255
258
|
elif isinstance(delta, ToolCallPartDelta):
|
|
256
|
-
# For tool calls,
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
259
|
+
# For tool calls, estimate tokens from args_delta content
|
|
260
|
+
# args_delta contains the streaming JSON arguments
|
|
261
|
+
args_delta = getattr(delta, "args_delta", "") or ""
|
|
262
|
+
if args_delta:
|
|
263
|
+
# Rough estimate: 4 chars ≈ 1 token (same heuristic as subagent_stream_handler)
|
|
264
|
+
estimated_tokens = max(1, len(args_delta) // 4)
|
|
265
|
+
token_count[event.index] += estimated_tokens
|
|
266
|
+
else:
|
|
267
|
+
# Even empty deltas count as activity
|
|
268
|
+
token_count[event.index] += 1
|
|
269
|
+
|
|
270
|
+
# Update tool name if delta provides more of it
|
|
271
|
+
tool_name_delta = getattr(delta, "tool_name_delta", "") or ""
|
|
272
|
+
if tool_name_delta:
|
|
273
|
+
tool_names[event.index] = (
|
|
274
|
+
tool_names.get(event.index, "") + tool_name_delta
|
|
275
|
+
)
|
|
276
|
+
|
|
277
|
+
# Use stored tool name for display
|
|
278
|
+
tool_name = tool_names.get(event.index, "")
|
|
260
279
|
count = token_count[event.index]
|
|
261
280
|
# Display with tool wrench icon and tool name
|
|
262
281
|
if tool_name:
|
|
263
282
|
console.print(
|
|
264
|
-
f" \U0001f527 Calling {tool_name}... {count}
|
|
283
|
+
f" \U0001f527 Calling {tool_name}... {count} token(s) ",
|
|
265
284
|
end="\r",
|
|
266
285
|
)
|
|
267
286
|
else:
|
|
268
287
|
console.print(
|
|
269
|
-
f" \U0001f527 Calling tool... {count}
|
|
288
|
+
f" \U0001f527 Calling tool... {count} token(s) ",
|
|
270
289
|
end="\r",
|
|
271
290
|
)
|
|
272
291
|
|
|
@@ -311,8 +330,9 @@ async def event_stream_handler(
|
|
|
311
330
|
elif event.index in banner_printed:
|
|
312
331
|
console.print() # Final newline after streaming
|
|
313
332
|
|
|
314
|
-
# Clean up token count
|
|
333
|
+
# Clean up token count and tool names
|
|
315
334
|
token_count.pop(event.index, None)
|
|
335
|
+
tool_names.pop(event.index, None)
|
|
316
336
|
# Clean up all tracking sets
|
|
317
337
|
streaming_parts.discard(event.index)
|
|
318
338
|
thinking_parts.discard(event.index)
|
code_puppy/config.py
CHANGED
|
@@ -98,7 +98,6 @@ _CURRENT_AUTOSAVE_ID: Optional[str] = None
|
|
|
98
98
|
_model_validation_cache = {}
|
|
99
99
|
_default_model_cache = None
|
|
100
100
|
_default_vision_model_cache = None
|
|
101
|
-
_default_vqa_model_cache = None
|
|
102
101
|
|
|
103
102
|
|
|
104
103
|
def ensure_config_exists():
|
|
@@ -358,47 +357,6 @@ def _default_vision_model_from_models_json() -> str:
|
|
|
358
357
|
return "gpt-4.1"
|
|
359
358
|
|
|
360
359
|
|
|
361
|
-
def _default_vqa_model_from_models_json() -> str:
|
|
362
|
-
"""Select a default VQA-capable model, preferring vision-ready options."""
|
|
363
|
-
global _default_vqa_model_cache
|
|
364
|
-
|
|
365
|
-
if _default_vqa_model_cache is not None:
|
|
366
|
-
return _default_vqa_model_cache
|
|
367
|
-
|
|
368
|
-
try:
|
|
369
|
-
from code_puppy.model_factory import ModelFactory
|
|
370
|
-
|
|
371
|
-
models_config = ModelFactory.load_config()
|
|
372
|
-
if models_config:
|
|
373
|
-
# Allow explicit VQA hints if present
|
|
374
|
-
for name, config in models_config.items():
|
|
375
|
-
if config.get("supports_vqa"):
|
|
376
|
-
_default_vqa_model_cache = name
|
|
377
|
-
return name
|
|
378
|
-
|
|
379
|
-
# Reuse multimodal heuristics before falling back to generic default
|
|
380
|
-
preferred_candidates = (
|
|
381
|
-
"gpt-4.1",
|
|
382
|
-
"gpt-4.1-mini",
|
|
383
|
-
"claude-4-0-sonnet",
|
|
384
|
-
"gemini-2.5-flash-preview-05-20",
|
|
385
|
-
"gpt-4.1-nano",
|
|
386
|
-
)
|
|
387
|
-
for candidate in preferred_candidates:
|
|
388
|
-
if candidate in models_config:
|
|
389
|
-
_default_vqa_model_cache = candidate
|
|
390
|
-
return candidate
|
|
391
|
-
|
|
392
|
-
_default_vqa_model_cache = _default_model_from_models_json()
|
|
393
|
-
return _default_vqa_model_cache
|
|
394
|
-
|
|
395
|
-
_default_vqa_model_cache = "gpt-4.1"
|
|
396
|
-
return "gpt-4.1"
|
|
397
|
-
except Exception:
|
|
398
|
-
_default_vqa_model_cache = "gpt-4.1"
|
|
399
|
-
return "gpt-4.1"
|
|
400
|
-
|
|
401
|
-
|
|
402
360
|
def _validate_model_exists(model_name: str) -> bool:
|
|
403
361
|
"""Check if a model exists in models.json with caching to avoid redundant calls."""
|
|
404
362
|
global _model_validation_cache
|
|
@@ -424,15 +382,10 @@ def _validate_model_exists(model_name: str) -> bool:
|
|
|
424
382
|
|
|
425
383
|
def clear_model_cache():
|
|
426
384
|
"""Clear the model validation cache. Call this when models.json changes."""
|
|
427
|
-
global
|
|
428
|
-
_model_validation_cache, \
|
|
429
|
-
_default_model_cache, \
|
|
430
|
-
_default_vision_model_cache, \
|
|
431
|
-
_default_vqa_model_cache
|
|
385
|
+
global _model_validation_cache, _default_model_cache, _default_vision_model_cache
|
|
432
386
|
_model_validation_cache.clear()
|
|
433
387
|
_default_model_cache = None
|
|
434
388
|
_default_vision_model_cache = None
|
|
435
|
-
_default_vqa_model_cache = None
|
|
436
389
|
|
|
437
390
|
|
|
438
391
|
def model_supports_setting(model_name: str, setting: str) -> bool:
|
|
@@ -503,21 +456,6 @@ def set_model_name(model: str):
|
|
|
503
456
|
clear_model_cache()
|
|
504
457
|
|
|
505
458
|
|
|
506
|
-
def get_vqa_model_name() -> str:
|
|
507
|
-
"""Return the configured VQA model, falling back to the global model."""
|
|
508
|
-
stored_model = get_value("vqa_model_name")
|
|
509
|
-
if stored_model and _validate_model_exists(stored_model):
|
|
510
|
-
return stored_model
|
|
511
|
-
# Fall back to the global model if no specific VQA model is set
|
|
512
|
-
return get_global_model_name()
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
def set_vqa_model_name(model: str):
|
|
516
|
-
"""Persist the configured VQA model name and refresh caches."""
|
|
517
|
-
set_config_value("vqa_model_name", model or "")
|
|
518
|
-
clear_model_cache()
|
|
519
|
-
|
|
520
|
-
|
|
521
459
|
def get_puppy_token():
|
|
522
460
|
"""Returns the puppy_token from config, or None if not set."""
|
|
523
461
|
return get_value("puppy_token")
|
code_puppy/tools/__init__.py
CHANGED
|
@@ -41,9 +41,6 @@ from code_puppy.tools.browser.browser_navigation import (
|
|
|
41
41
|
from code_puppy.tools.browser.browser_screenshot import (
|
|
42
42
|
register_take_screenshot_and_analyze,
|
|
43
43
|
)
|
|
44
|
-
from code_puppy.tools.browser.browser_screenshot_vqa import (
|
|
45
|
-
register_take_screenshot_and_analyze_vqa,
|
|
46
|
-
)
|
|
47
44
|
from code_puppy.tools.browser.browser_scripts import (
|
|
48
45
|
register_browser_clear_highlights,
|
|
49
46
|
register_browser_highlight_element,
|
|
@@ -146,9 +143,8 @@ TOOL_REGISTRY = {
|
|
|
146
143
|
"browser_wait_for_element": register_wait_for_element,
|
|
147
144
|
"browser_highlight_element": register_browser_highlight_element,
|
|
148
145
|
"browser_clear_highlights": register_browser_clear_highlights,
|
|
149
|
-
# Browser Screenshots
|
|
146
|
+
# Browser Screenshots
|
|
150
147
|
"browser_screenshot_analyze": register_take_screenshot_and_analyze,
|
|
151
|
-
"browser_screenshot_vqa": register_take_screenshot_and_analyze_vqa,
|
|
152
148
|
# Browser Workflows
|
|
153
149
|
"browser_save_workflow": register_save_workflow,
|
|
154
150
|
"browser_list_workflows": register_list_workflows,
|
|
@@ -11,7 +11,6 @@ from .camoufox_manager import (
|
|
|
11
11
|
get_session_browser_manager,
|
|
12
12
|
set_browser_session,
|
|
13
13
|
)
|
|
14
|
-
from .vqa_agent import VisualAnalysisResult, run_vqa_analysis, run_vqa_analysis_stream
|
|
15
14
|
|
|
16
15
|
|
|
17
16
|
def format_terminal_banner(text: str) -> str:
|
|
@@ -35,7 +34,4 @@ __all__ = [
|
|
|
35
34
|
"get_browser_session",
|
|
36
35
|
"get_session_browser_manager",
|
|
37
36
|
"set_browser_session",
|
|
38
|
-
"VisualAnalysisResult",
|
|
39
|
-
"run_vqa_analysis",
|
|
40
|
-
"run_vqa_analysis_stream",
|
|
41
37
|
]
|
|
@@ -1,16 +1,16 @@
|
|
|
1
1
|
"""Screenshot tool for browser automation.
|
|
2
2
|
|
|
3
|
-
Captures screenshots and returns them
|
|
4
|
-
models can directly see and analyze - no separate VQA agent needed.
|
|
3
|
+
Captures screenshots and returns them via ToolReturn with BinaryContent
|
|
4
|
+
so multimodal models can directly see and analyze - no separate VQA agent needed.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
import
|
|
7
|
+
import time
|
|
8
8
|
from datetime import datetime
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
from tempfile import gettempdir, mkdtemp
|
|
11
|
-
from typing import Any, Dict, Optional
|
|
11
|
+
from typing import Any, Dict, Optional, Union
|
|
12
12
|
|
|
13
|
-
from pydantic_ai import RunContext
|
|
13
|
+
from pydantic_ai import BinaryContent, RunContext, ToolReturn
|
|
14
14
|
|
|
15
15
|
from code_puppy.messaging import emit_error, emit_info, emit_success
|
|
16
16
|
from code_puppy.tools.common import generate_group_id
|
|
@@ -54,7 +54,6 @@ async def _capture_screenshot(
|
|
|
54
54
|
result: Dict[str, Any] = {
|
|
55
55
|
"success": True,
|
|
56
56
|
"screenshot_bytes": screenshot_bytes,
|
|
57
|
-
"base64_data": base64.b64encode(screenshot_bytes).decode("utf-8"),
|
|
58
57
|
"timestamp": timestamp,
|
|
59
58
|
}
|
|
60
59
|
|
|
@@ -80,11 +79,11 @@ async def take_screenshot(
|
|
|
80
79
|
full_page: bool = False,
|
|
81
80
|
element_selector: Optional[str] = None,
|
|
82
81
|
save_screenshot: bool = True,
|
|
83
|
-
) -> Dict[str, Any]:
|
|
82
|
+
) -> Union[ToolReturn, Dict[str, Any]]:
|
|
84
83
|
"""Take a screenshot of the browser page.
|
|
85
84
|
|
|
86
|
-
Returns
|
|
87
|
-
|
|
85
|
+
Returns a ToolReturn with BinaryContent so multimodal models can
|
|
86
|
+
directly see and analyze the screenshot.
|
|
88
87
|
|
|
89
88
|
Args:
|
|
90
89
|
full_page: Whether to capture full page or just viewport.
|
|
@@ -92,12 +91,11 @@ async def take_screenshot(
|
|
|
92
91
|
save_screenshot: Whether to save the screenshot to disk.
|
|
93
92
|
|
|
94
93
|
Returns:
|
|
95
|
-
|
|
96
|
-
-
|
|
97
|
-
-
|
|
98
|
-
-
|
|
99
|
-
|
|
100
|
-
- error (str): Error message if unsuccessful.
|
|
94
|
+
ToolReturn containing:
|
|
95
|
+
- return_value: Success message with screenshot path
|
|
96
|
+
- content: List with description and BinaryContent image
|
|
97
|
+
- metadata: Screenshot details (path, target, timestamp)
|
|
98
|
+
Or Dict with error info if failed.
|
|
101
99
|
"""
|
|
102
100
|
target = element_selector or ("full_page" if full_page else "viewport")
|
|
103
101
|
group_id = generate_group_id("browser_screenshot", target)
|
|
@@ -122,15 +120,30 @@ async def take_screenshot(
|
|
|
122
120
|
|
|
123
121
|
if not result["success"]:
|
|
124
122
|
emit_error(result.get("error", "Screenshot failed"), message_group=group_id)
|
|
125
|
-
return result
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
"
|
|
132
|
-
|
|
133
|
-
|
|
123
|
+
return {"success": False, "error": result.get("error")}
|
|
124
|
+
|
|
125
|
+
screenshot_path = result.get("screenshot_path", "(not saved)")
|
|
126
|
+
|
|
127
|
+
# Return as ToolReturn with BinaryContent so the model can SEE the image!
|
|
128
|
+
return ToolReturn(
|
|
129
|
+
return_value=f"Screenshot captured successfully. Saved to: {screenshot_path}",
|
|
130
|
+
content=[
|
|
131
|
+
f"Here's the browser screenshot ({target}):",
|
|
132
|
+
BinaryContent(
|
|
133
|
+
data=result["screenshot_bytes"],
|
|
134
|
+
media_type="image/png",
|
|
135
|
+
),
|
|
136
|
+
"Please analyze what you see and describe any relevant details.",
|
|
137
|
+
],
|
|
138
|
+
metadata={
|
|
139
|
+
"success": True,
|
|
140
|
+
"screenshot_path": screenshot_path,
|
|
141
|
+
"target": target,
|
|
142
|
+
"full_page": full_page,
|
|
143
|
+
"element_selector": element_selector,
|
|
144
|
+
"timestamp": time.time(),
|
|
145
|
+
},
|
|
146
|
+
)
|
|
134
147
|
|
|
135
148
|
except Exception as e:
|
|
136
149
|
error_msg = f"Screenshot failed: {str(e)}"
|
|
@@ -146,19 +159,19 @@ def register_take_screenshot_and_analyze(agent):
|
|
|
146
159
|
context: RunContext,
|
|
147
160
|
full_page: bool = False,
|
|
148
161
|
element_selector: Optional[str] = None,
|
|
149
|
-
) -> Dict[str, Any]:
|
|
162
|
+
) -> Union[ToolReturn, Dict[str, Any]]:
|
|
150
163
|
"""
|
|
151
164
|
Take a screenshot of the browser page.
|
|
152
165
|
|
|
153
|
-
Returns the screenshot
|
|
154
|
-
Use this to see what's displayed in the browser.
|
|
166
|
+
Returns the screenshot via ToolReturn with BinaryContent that you can
|
|
167
|
+
see directly. Use this to see what's displayed in the browser.
|
|
155
168
|
|
|
156
169
|
Args:
|
|
157
170
|
full_page: Capture full page (True) or just viewport (False).
|
|
158
171
|
element_selector: Optional CSS selector to screenshot specific element.
|
|
159
172
|
|
|
160
173
|
Returns:
|
|
161
|
-
|
|
174
|
+
ToolReturn with the screenshot image you can analyze, or error dict.
|
|
162
175
|
"""
|
|
163
176
|
return await take_screenshot(
|
|
164
177
|
full_page=full_page,
|
|
@@ -5,22 +5,22 @@ This module provides tools for:
|
|
|
5
5
|
- Reading terminal output by scraping xterm.js DOM
|
|
6
6
|
- Loading images from the filesystem
|
|
7
7
|
|
|
8
|
-
Screenshots are returned
|
|
9
|
-
can directly see and analyze
|
|
8
|
+
Screenshots and images are returned via ToolReturn with BinaryContent
|
|
9
|
+
so multimodal models can directly see and analyze them.
|
|
10
10
|
|
|
11
|
-
|
|
11
|
+
Images are automatically resized to reduce token usage.
|
|
12
12
|
"""
|
|
13
13
|
|
|
14
|
-
import base64
|
|
15
14
|
import io
|
|
16
15
|
import logging
|
|
16
|
+
import time
|
|
17
17
|
from datetime import datetime
|
|
18
18
|
from pathlib import Path
|
|
19
19
|
from tempfile import gettempdir, mkdtemp
|
|
20
|
-
from typing import Any, Dict
|
|
20
|
+
from typing import Any, Dict, Union
|
|
21
21
|
|
|
22
22
|
from PIL import Image
|
|
23
|
-
from pydantic_ai import RunContext
|
|
23
|
+
from pydantic_ai import BinaryContent, RunContext, ToolReturn
|
|
24
24
|
from rich.text import Text
|
|
25
25
|
|
|
26
26
|
from code_puppy.messaging import emit_error, emit_info, emit_success
|
|
@@ -178,7 +178,6 @@ async def _capture_terminal_screenshot(
|
|
|
178
178
|
result: Dict[str, Any] = {
|
|
179
179
|
"success": True,
|
|
180
180
|
"screenshot_bytes": screenshot_bytes,
|
|
181
|
-
"base64_data": base64.b64encode(screenshot_bytes).decode("utf-8"),
|
|
182
181
|
}
|
|
183
182
|
|
|
184
183
|
# Save to disk if requested (save the resized version)
|
|
@@ -205,11 +204,11 @@ async def _capture_terminal_screenshot(
|
|
|
205
204
|
async def terminal_screenshot(
|
|
206
205
|
full_page: bool = False,
|
|
207
206
|
save_to_disk: bool = True,
|
|
208
|
-
) -> Dict[str, Any]:
|
|
207
|
+
) -> Union[ToolReturn, Dict[str, Any]]:
|
|
209
208
|
"""Take a screenshot of the terminal browser.
|
|
210
209
|
|
|
211
|
-
Captures a screenshot and returns it
|
|
212
|
-
|
|
210
|
+
Captures a screenshot and returns it via ToolReturn with BinaryContent
|
|
211
|
+
so multimodal models can directly see and analyze the image.
|
|
213
212
|
|
|
214
213
|
Args:
|
|
215
214
|
full_page: Whether to capture the full page or just viewport.
|
|
@@ -218,18 +217,11 @@ async def terminal_screenshot(
|
|
|
218
217
|
Defaults to True.
|
|
219
218
|
|
|
220
219
|
Returns:
|
|
221
|
-
|
|
222
|
-
-
|
|
223
|
-
-
|
|
224
|
-
-
|
|
225
|
-
|
|
226
|
-
- error (str): Error message if unsuccessful.
|
|
227
|
-
|
|
228
|
-
Example:
|
|
229
|
-
>>> result = await terminal_screenshot()
|
|
230
|
-
>>> if result["success"]:
|
|
231
|
-
... # The base64_image can be shown to multimodal models
|
|
232
|
-
... print(f"Screenshot saved to: {result['screenshot_path']}")
|
|
220
|
+
ToolReturn containing:
|
|
221
|
+
- return_value: Success message with screenshot path
|
|
222
|
+
- content: List with description and BinaryContent image
|
|
223
|
+
- metadata: Screenshot details (path, target, timestamp)
|
|
224
|
+
Or Dict with error info if failed.
|
|
233
225
|
"""
|
|
234
226
|
target = "full_page" if full_page else "viewport"
|
|
235
227
|
group_id = generate_group_id("terminal_screenshot", target)
|
|
@@ -249,14 +241,27 @@ async def terminal_screenshot(
|
|
|
249
241
|
emit_error(result.get("error", "Screenshot failed"), message_group=group_id)
|
|
250
242
|
return result
|
|
251
243
|
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
"
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
244
|
+
screenshot_path = result.get("screenshot_path", "(not saved)")
|
|
245
|
+
|
|
246
|
+
# Return as ToolReturn with BinaryContent so the model can SEE the image!
|
|
247
|
+
return ToolReturn(
|
|
248
|
+
return_value=f"Terminal screenshot captured. Saved to: {screenshot_path}",
|
|
249
|
+
content=[
|
|
250
|
+
f"Here's the terminal screenshot ({target}):",
|
|
251
|
+
BinaryContent(
|
|
252
|
+
data=result["screenshot_bytes"],
|
|
253
|
+
media_type="image/png",
|
|
254
|
+
),
|
|
255
|
+
"Please analyze what you see in the terminal.",
|
|
256
|
+
],
|
|
257
|
+
metadata={
|
|
258
|
+
"success": True,
|
|
259
|
+
"screenshot_path": screenshot_path,
|
|
260
|
+
"target": target,
|
|
261
|
+
"full_page": full_page,
|
|
262
|
+
"timestamp": time.time(),
|
|
263
|
+
},
|
|
264
|
+
)
|
|
260
265
|
|
|
261
266
|
|
|
262
267
|
async def terminal_read_output(lines: int = 50) -> Dict[str, Any]:
|
|
@@ -328,23 +333,22 @@ async def terminal_read_output(lines: int = 50) -> Dict[str, Any]:
|
|
|
328
333
|
async def load_image(
|
|
329
334
|
image_path: str,
|
|
330
335
|
max_height: int = DEFAULT_MAX_HEIGHT,
|
|
331
|
-
) -> Dict[str, Any]:
|
|
332
|
-
"""Load an image from the filesystem
|
|
336
|
+
) -> Union[ToolReturn, Dict[str, Any]]:
|
|
337
|
+
"""Load an image from the filesystem for visual analysis.
|
|
333
338
|
|
|
334
339
|
Loads any image file, resizes it to reduce token usage, and returns
|
|
335
|
-
it
|
|
340
|
+
it via ToolReturn with BinaryContent so multimodal models can see it.
|
|
336
341
|
|
|
337
342
|
Args:
|
|
338
343
|
image_path: Path to the image file.
|
|
339
344
|
max_height: Maximum height for resizing (default 768px).
|
|
340
345
|
|
|
341
346
|
Returns:
|
|
342
|
-
|
|
343
|
-
-
|
|
344
|
-
-
|
|
345
|
-
-
|
|
346
|
-
|
|
347
|
-
- error (str): Error message if unsuccessful.
|
|
347
|
+
ToolReturn containing:
|
|
348
|
+
- return_value: Success message with path info
|
|
349
|
+
- content: List with description and BinaryContent image
|
|
350
|
+
- metadata: Image details (path, resized height)
|
|
351
|
+
Or Dict with error info if failed.
|
|
348
352
|
"""
|
|
349
353
|
group_id = generate_group_id("load_image", image_path)
|
|
350
354
|
emit_info(f"LOAD IMAGE 🖼️ {image_path}", message_group=group_id)
|
|
@@ -368,18 +372,26 @@ async def load_image(
|
|
|
368
372
|
# Resize to reduce token usage
|
|
369
373
|
image_bytes = _resize_image(original_bytes, max_height=max_height)
|
|
370
374
|
|
|
371
|
-
# Always return as PNG after resizing (consistent format)
|
|
372
|
-
base64_data = base64.b64encode(image_bytes).decode("utf-8")
|
|
373
|
-
|
|
374
375
|
emit_success(f"Loaded image: {image_path}", message_group=group_id)
|
|
375
376
|
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
"
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
377
|
+
# Return as ToolReturn with BinaryContent so the model can SEE the image!
|
|
378
|
+
return ToolReturn(
|
|
379
|
+
return_value=f"Image loaded from: {image_path}",
|
|
380
|
+
content=[
|
|
381
|
+
f"Here's the image from {image_file.name}:",
|
|
382
|
+
BinaryContent(
|
|
383
|
+
data=image_bytes,
|
|
384
|
+
media_type="image/png", # Always PNG after resize
|
|
385
|
+
),
|
|
386
|
+
"Please analyze what you see in this image.",
|
|
387
|
+
],
|
|
388
|
+
metadata={
|
|
389
|
+
"success": True,
|
|
390
|
+
"image_path": image_path,
|
|
391
|
+
"max_height": max_height,
|
|
392
|
+
"timestamp": time.time(),
|
|
393
|
+
},
|
|
394
|
+
)
|
|
383
395
|
|
|
384
396
|
except Exception as e:
|
|
385
397
|
error_msg = f"Failed to load image: {str(e)}"
|
|
@@ -400,18 +412,18 @@ def register_terminal_screenshot(agent):
|
|
|
400
412
|
async def terminal_screenshot_analyze(
|
|
401
413
|
context: RunContext,
|
|
402
414
|
full_page: bool = False,
|
|
403
|
-
) -> Dict[str, Any]:
|
|
415
|
+
) -> Union[ToolReturn, Dict[str, Any]]:
|
|
404
416
|
"""
|
|
405
417
|
Take a screenshot of the terminal browser.
|
|
406
418
|
|
|
407
|
-
Returns the screenshot
|
|
408
|
-
Use this to see what's displayed in the terminal.
|
|
419
|
+
Returns the screenshot via ToolReturn with BinaryContent that you can
|
|
420
|
+
see directly. Use this to see what's displayed in the terminal.
|
|
409
421
|
|
|
410
422
|
Args:
|
|
411
423
|
full_page: Capture full page (True) or just viewport (False).
|
|
412
424
|
|
|
413
425
|
Returns:
|
|
414
|
-
|
|
426
|
+
ToolReturn with the terminal screenshot you can analyze, or error dict.
|
|
415
427
|
"""
|
|
416
428
|
# Session is set by invoke_agent via contextvar
|
|
417
429
|
return await terminal_screenshot(full_page=full_page)
|
|
@@ -449,17 +461,18 @@ def register_load_image(agent):
|
|
|
449
461
|
async def load_image_for_analysis(
|
|
450
462
|
context: RunContext,
|
|
451
463
|
image_path: str,
|
|
452
|
-
) -> Dict[str, Any]:
|
|
464
|
+
) -> Union[ToolReturn, Dict[str, Any]]:
|
|
453
465
|
"""
|
|
454
466
|
Load an image file so you can see and analyze it.
|
|
455
467
|
|
|
456
|
-
Returns the image
|
|
468
|
+
Returns the image via ToolReturn with BinaryContent that you can
|
|
469
|
+
see directly.
|
|
457
470
|
|
|
458
471
|
Args:
|
|
459
472
|
image_path: Path to the image file.
|
|
460
473
|
|
|
461
474
|
Returns:
|
|
462
|
-
|
|
475
|
+
ToolReturn with the image you can analyze, or error dict.
|
|
463
476
|
"""
|
|
464
477
|
# Session is set by invoke_agent via contextvar
|
|
465
478
|
return await load_image(image_path=image_path)
|
|
@@ -472,18 +485,18 @@ def register_terminal_compare_mockup(agent):
|
|
|
472
485
|
async def terminal_compare_mockup(
|
|
473
486
|
context: RunContext,
|
|
474
487
|
mockup_path: str,
|
|
475
|
-
) -> Dict[str, Any]:
|
|
488
|
+
) -> Union[ToolReturn, Dict[str, Any]]:
|
|
476
489
|
"""
|
|
477
490
|
Compare the terminal to a mockup image.
|
|
478
491
|
|
|
479
492
|
Takes a screenshot of the terminal and loads the mockup image.
|
|
480
|
-
Returns both
|
|
493
|
+
Returns both via ToolReturn with BinaryContent so you can compare them.
|
|
481
494
|
|
|
482
495
|
Args:
|
|
483
496
|
mockup_path: Path to the mockup/expected image.
|
|
484
497
|
|
|
485
498
|
Returns:
|
|
486
|
-
|
|
499
|
+
ToolReturn with both images (terminal and mockup) you can compare.
|
|
487
500
|
"""
|
|
488
501
|
# Session is set by invoke_agent via contextvar
|
|
489
502
|
group_id = generate_group_id("terminal_compare_mockup", mockup_path)
|
|
@@ -493,28 +506,51 @@ def register_terminal_compare_mockup(agent):
|
|
|
493
506
|
message_group=group_id,
|
|
494
507
|
)
|
|
495
508
|
|
|
496
|
-
#
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
509
|
+
# Capture terminal screenshot (get raw result for bytes)
|
|
510
|
+
terminal_capture = await _capture_terminal_screenshot(
|
|
511
|
+
full_page=False,
|
|
512
|
+
save_to_disk=True,
|
|
513
|
+
group_id=group_id,
|
|
514
|
+
)
|
|
515
|
+
if not terminal_capture["success"]:
|
|
516
|
+
return terminal_capture
|
|
500
517
|
|
|
501
|
-
#
|
|
502
|
-
|
|
503
|
-
if not
|
|
504
|
-
|
|
518
|
+
# Load the mockup image
|
|
519
|
+
mockup_file = Path(mockup_path)
|
|
520
|
+
if not mockup_file.exists():
|
|
521
|
+
error_msg = f"Mockup file not found: {mockup_path}"
|
|
522
|
+
emit_error(error_msg, message_group=group_id)
|
|
523
|
+
return {"success": False, "error": error_msg}
|
|
524
|
+
|
|
525
|
+
mockup_bytes = _resize_image(mockup_file.read_bytes())
|
|
505
526
|
|
|
506
527
|
emit_success(
|
|
507
528
|
"Both images loaded. Compare them visually.",
|
|
508
529
|
message_group=group_id,
|
|
509
530
|
)
|
|
510
531
|
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
"
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
532
|
+
terminal_path = terminal_capture.get("screenshot_path", "(not saved)")
|
|
533
|
+
|
|
534
|
+
# Return as ToolReturn with BOTH images as BinaryContent!
|
|
535
|
+
return ToolReturn(
|
|
536
|
+
return_value=f"Comparison ready: terminal vs mockup ({mockup_path})",
|
|
537
|
+
content=[
|
|
538
|
+
"Here's the CURRENT terminal screenshot:",
|
|
539
|
+
BinaryContent(
|
|
540
|
+
data=terminal_capture["screenshot_bytes"],
|
|
541
|
+
media_type="image/png",
|
|
542
|
+
),
|
|
543
|
+
f"And here's the EXPECTED mockup ({mockup_file.name}):",
|
|
544
|
+
BinaryContent(
|
|
545
|
+
data=mockup_bytes,
|
|
546
|
+
media_type="image/png",
|
|
547
|
+
),
|
|
548
|
+
"Please compare these images and describe any differences.",
|
|
549
|
+
],
|
|
550
|
+
metadata={
|
|
551
|
+
"success": True,
|
|
552
|
+
"terminal_path": terminal_path,
|
|
553
|
+
"mockup_path": mockup_path,
|
|
554
|
+
"timestamp": time.time(),
|
|
555
|
+
},
|
|
556
|
+
)
|
|
@@ -4,7 +4,7 @@ code_puppy/callbacks.py,sha256=Pp0VyeXJBEtk-N_RSWr5pbveelovsdLUiJ4f11dzwGw,10775
|
|
|
4
4
|
code_puppy/chatgpt_codex_client.py,sha256=Om0ANB_kpHubhCwNzF9ENf8RvKBqs0IYzBLl_SNw0Vk,9833
|
|
5
5
|
code_puppy/claude_cache_client.py,sha256=Gl6um5ZaKpcnxOvoFSM8Lwm_Vu4-VyWz8Nli8DnRLa4,22508
|
|
6
6
|
code_puppy/cli_runner.py,sha256=w5CLKgQYYaT7My3Cga2StXYol-u6DBxNzzUuhhsfhsA,34952
|
|
7
|
-
code_puppy/config.py,sha256=
|
|
7
|
+
code_puppy/config.py,sha256=z4c-rKwQOEsg13HHd1KskIQG4Ygdr9krQsCAiZU-Wa0,52441
|
|
8
8
|
code_puppy/error_logging.py,sha256=a80OILCUtJhexI6a9GM-r5LqIdjvSRzggfgPp2jv1X0,3297
|
|
9
9
|
code_puppy/gemini_code_assist.py,sha256=KGS7sO5OLc83nDF3xxS-QiU6vxW9vcm6hmzilu79Ef8,13867
|
|
10
10
|
code_puppy/http_utils.py,sha256=H3N5Qz2B1CcsGUYOycGWAqoNMr2P1NCVluKX3aRwRqI,10358
|
|
@@ -38,12 +38,12 @@ code_puppy/agents/agent_planning.py,sha256=6q3s5qCko2FcUfaLzImOFNDi0H61WBc2PNtsO
|
|
|
38
38
|
code_puppy/agents/agent_python_programmer.py,sha256=R-7XoGIFJ58EY9LE9mWGcQQ8gSsMzi-1HD6wigJQPL8,6846
|
|
39
39
|
code_puppy/agents/agent_python_reviewer.py,sha256=J8lqzoKJlohs8NWMbgUpHXNt1bXHNIkuGjzLd9Af8qE,5854
|
|
40
40
|
code_puppy/agents/agent_qa_expert.py,sha256=5Ikb4U3SZQknUEfwlHZiyZXKqnffnOTQagr_wrkUkPk,10125
|
|
41
|
-
code_puppy/agents/agent_qa_kitten.py,sha256=
|
|
41
|
+
code_puppy/agents/agent_qa_kitten.py,sha256=qvry-1u_CiXi8eRueHTax4OtqsS_mQrtXHsbTXWzGYs,9517
|
|
42
42
|
code_puppy/agents/agent_security_auditor.py,sha256=SpiYNA0XAsIwBj7S2_EQPRslRUmF_-b89pIJyW7DYtY,12022
|
|
43
43
|
code_puppy/agents/agent_terminal_qa.py,sha256=U-iyP7OBWdAmchW_oUU8k6asH2aignTMmgqqYDyf-ms,10343
|
|
44
44
|
code_puppy/agents/agent_typescript_reviewer.py,sha256=vsnpp98xg6cIoFAEJrRTUM_i4wLEWGm5nJxs6fhHobM,10275
|
|
45
45
|
code_puppy/agents/base_agent.py,sha256=oKlX9CEIWSvdXyQDVi9F1jauA6rjKleY_n6044Ux5DY,73840
|
|
46
|
-
code_puppy/agents/event_stream_handler.py,sha256=
|
|
46
|
+
code_puppy/agents/event_stream_handler.py,sha256=JttLZJpNADE5HXiXY-GZ6tpwaBeFRODcy34KiquPOvU,14952
|
|
47
47
|
code_puppy/agents/json_agent.py,sha256=lhopDJDoiSGHvD8A6t50hi9ZBoNRKgUywfxd0Po_Dzc,4886
|
|
48
48
|
code_puppy/agents/prompt_reviewer.py,sha256=JJrJ0m5q0Puxl8vFsyhAbY9ftU9n6c6UxEVdNct1E-Q,5558
|
|
49
49
|
code_puppy/agents/subagent_stream_handler.py,sha256=5imUFYOJCv7blfv4fTHm6OQ7JpqlyFv_luBYGSj16MA,10329
|
|
@@ -184,7 +184,7 @@ code_puppy/plugins/shell_safety/command_cache.py,sha256=adYtSPNVOZfW_6dQdtEihO6E
|
|
|
184
184
|
code_puppy/plugins/shell_safety/register_callbacks.py,sha256=W3v664RR48Fdbbbltf_NnX22_Ahw2AvAOtvXvWc7KxQ,7322
|
|
185
185
|
code_puppy/prompts/antigravity_system_prompt.md,sha256=ZaTfRyY57ttROyZMmOBtqZQu1to7sdTNTv8_0fTgPNw,6807
|
|
186
186
|
code_puppy/prompts/codex_system_prompt.md,sha256=hEFTCziroLqZmqNle5kG34A8kvTteOWezCiVrAEKhE0,24400
|
|
187
|
-
code_puppy/tools/__init__.py,sha256=
|
|
187
|
+
code_puppy/tools/__init__.py,sha256=9bzVIjX9CAr2YTZkhD7IWFYt4KpnFRx6ge_Tqazugbs,7425
|
|
188
188
|
code_puppy/tools/agent_tools.py,sha256=XvBQ_IPa4NHLmIA2mdyPwy9GPlYGQwhtdn-w_3i239g,25517
|
|
189
189
|
code_puppy/tools/command_runner.py,sha256=Sresr_ykou_c2V1sKoNxqrqCQovKF5yDiQJ8r3E9lak,50995
|
|
190
190
|
code_puppy/tools/common.py,sha256=lVtF94cn6jtC5YKfitV7L3rk37Ts2gMoHLQrqDFD2E4,46411
|
|
@@ -193,25 +193,23 @@ code_puppy/tools/file_modifications.py,sha256=vz9n7R0AGDSdLUArZr_55yJLkyI30M8zre
|
|
|
193
193
|
code_puppy/tools/file_operations.py,sha256=CqhpuBnOFOcQCIYXOujskxq2VMLWYJhibYrH0YcPSfA,35692
|
|
194
194
|
code_puppy/tools/subagent_context.py,sha256=zsiKV3B3DxZ_Y5IHHhtE-SMFDg_jMrY7Hi6r5LH--IU,4781
|
|
195
195
|
code_puppy/tools/tools_content.py,sha256=bsBqW-ppd1XNAS_g50B3UHDQBWEALC1UneH6-afz1zo,2365
|
|
196
|
-
code_puppy/tools/browser/__init__.py,sha256=
|
|
196
|
+
code_puppy/tools/browser/__init__.py,sha256=SPiEQwsDj5KoxDwX_viNUKFsn4tczxY-Jq2C64EzSNI,927
|
|
197
197
|
code_puppy/tools/browser/browser_control.py,sha256=YntpjfWTIv0TDlAO5BqTV_hDbUBw-8wmMn29K3TDQo0,8430
|
|
198
198
|
code_puppy/tools/browser/browser_interactions.py,sha256=ZyJmA2-ZtIATF76uGMt08cfVaYiqg7W2-cHfAzNI0F8,16775
|
|
199
199
|
code_puppy/tools/browser/browser_locators.py,sha256=sxXNm-K087poeSp7Um5Gc1sZxb7HlSZOu0F0r2b0ty8,19177
|
|
200
200
|
code_puppy/tools/browser/browser_navigation.py,sha256=RJdG14UXtA6wz4PNLw2Tqeu4oUDQilOyNbyTjgIFCrY,7416
|
|
201
|
-
code_puppy/tools/browser/browser_screenshot.py,sha256=
|
|
202
|
-
code_puppy/tools/browser/browser_screenshot_vqa.py,sha256=DBdQuV7eIaJX2Qy_liwitfakIzrcVziB-zAGIngM5GE,7349
|
|
201
|
+
code_puppy/tools/browser/browser_screenshot.py,sha256=AJe9JbZv8vC93AFWzsAUlrg1YNshv4SWNde-O-_mfQU,6282
|
|
203
202
|
code_puppy/tools/browser/browser_scripts.py,sha256=CYWdQMtjKTNvJNSCkB2vGo-MOzmT_gw2oFMGtkfuzuA,14779
|
|
204
203
|
code_puppy/tools/browser/browser_workflows.py,sha256=nitW42vCf0ieTX1gLabozTugNQ8phtoFzZbiAhw1V90,6491
|
|
205
204
|
code_puppy/tools/browser/camoufox_manager.py,sha256=WIr98SrGeC5jd6jX5tjhFR6A3janqV4tq9Mbznnlh44,13920
|
|
206
205
|
code_puppy/tools/browser/chromium_terminal_manager.py,sha256=w1thQ_ACb6oV45L93TSqPQD0o0cTh3FqT5I9zcOOWlM,8226
|
|
207
206
|
code_puppy/tools/browser/terminal_command_tools.py,sha256=9byOZku-dwvTtCl532xt7Lumed_jTn0sLvUe_X75XCQ,19068
|
|
208
|
-
code_puppy/tools/browser/terminal_screenshot_tools.py,sha256=
|
|
207
|
+
code_puppy/tools/browser/terminal_screenshot_tools.py,sha256=J_21YO_495NvYgNFu9KQP6VYg2K_f8CtSdZuF94Yhnw,18448
|
|
209
208
|
code_puppy/tools/browser/terminal_tools.py,sha256=F5LjVH3udSCFHmqC3O1UJLoLozZFZsEdX42jOmkqkW0,17853
|
|
210
|
-
code_puppy/
|
|
211
|
-
code_puppy-0.0.
|
|
212
|
-
code_puppy-0.0.
|
|
213
|
-
code_puppy-0.0.
|
|
214
|
-
code_puppy-0.0.
|
|
215
|
-
code_puppy-0.0.
|
|
216
|
-
code_puppy-0.0.
|
|
217
|
-
code_puppy-0.0.358.dist-info/RECORD,,
|
|
209
|
+
code_puppy-0.0.360.data/data/code_puppy/models.json,sha256=FMQdE_yvP_8y0xxt3K918UkFL9cZMYAqW1SfXcQkU_k,3105
|
|
210
|
+
code_puppy-0.0.360.data/data/code_puppy/models_dev_api.json,sha256=wHjkj-IM_fx1oHki6-GqtOoCrRMR0ScK0f-Iz0UEcy8,548187
|
|
211
|
+
code_puppy-0.0.360.dist-info/METADATA,sha256=FZ7fXsTCXSepDHZqKoSNnjd_MyyMDx-Ntb2nnz3Jizg,27614
|
|
212
|
+
code_puppy-0.0.360.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
213
|
+
code_puppy-0.0.360.dist-info/entry_points.txt,sha256=Tp4eQC99WY3HOKd3sdvb22vZODRq0XkZVNpXOag_KdI,91
|
|
214
|
+
code_puppy-0.0.360.dist-info/licenses/LICENSE,sha256=31u8x0SPgdOq3izJX41kgFazWsM43zPEF9eskzqbJMY,1075
|
|
215
|
+
code_puppy-0.0.360.dist-info/RECORD,,
|
|
@@ -1,195 +0,0 @@
|
|
|
1
|
-
"""VQA-based Screenshot tool for browser automation (qa-kitten).
|
|
2
|
-
|
|
3
|
-
This module provides screenshot analysis using a dedicated VQA agent.
|
|
4
|
-
Unlike browser_screenshot.py which returns raw base64 bytes for multimodal
|
|
5
|
-
models to see directly, this version offloads the visual analysis to a
|
|
6
|
-
separate VQA agent, helping manage context in the calling agent.
|
|
7
|
-
|
|
8
|
-
Use this for qa-kitten where context management is important.
|
|
9
|
-
Use browser_screenshot.py for terminal-qa where direct image viewing is needed.
|
|
10
|
-
"""
|
|
11
|
-
|
|
12
|
-
from typing import Any, Dict, Optional
|
|
13
|
-
|
|
14
|
-
from pydantic_ai import RunContext
|
|
15
|
-
from rich.console import Console
|
|
16
|
-
|
|
17
|
-
from code_puppy.messaging import emit_error, emit_info, emit_success
|
|
18
|
-
from code_puppy.tools.common import generate_group_id
|
|
19
|
-
|
|
20
|
-
from .browser_screenshot import _capture_screenshot
|
|
21
|
-
from .camoufox_manager import get_session_browser_manager
|
|
22
|
-
from .vqa_agent import run_vqa_analysis_stream
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
async def take_screenshot_and_analyze(
|
|
26
|
-
question: str,
|
|
27
|
-
full_page: bool = False,
|
|
28
|
-
element_selector: Optional[str] = None,
|
|
29
|
-
save_screenshot: bool = True,
|
|
30
|
-
) -> Dict[str, Any]:
|
|
31
|
-
"""Take a screenshot and analyze it using the VQA agent.
|
|
32
|
-
|
|
33
|
-
This function captures a screenshot and passes it to a dedicated
|
|
34
|
-
VQA (Visual Question Answering) agent for analysis. The VQA agent
|
|
35
|
-
runs separately, keeping the image analysis out of the calling
|
|
36
|
-
agent's context window.
|
|
37
|
-
|
|
38
|
-
Args:
|
|
39
|
-
question: The question to ask about the screenshot.
|
|
40
|
-
Examples:
|
|
41
|
-
- "What buttons are visible on this page?"
|
|
42
|
-
- "Is there an error message displayed?"
|
|
43
|
-
- "What is the main heading text?"
|
|
44
|
-
- "Describe the layout of this form."
|
|
45
|
-
full_page: Whether to capture full page or just viewport.
|
|
46
|
-
Defaults to False (viewport only).
|
|
47
|
-
element_selector: Optional CSS selector to screenshot a specific
|
|
48
|
-
element instead of the whole page.
|
|
49
|
-
save_screenshot: Whether to save the screenshot to disk.
|
|
50
|
-
|
|
51
|
-
Returns:
|
|
52
|
-
Dict containing:
|
|
53
|
-
- success (bool): True if analysis succeeded.
|
|
54
|
-
- answer (str): The VQA agent's streamed answer to your question.
|
|
55
|
-
- screenshot_info (dict): Path, timestamp, and other metadata.
|
|
56
|
-
- error (str): Error message if unsuccessful.
|
|
57
|
-
"""
|
|
58
|
-
target = element_selector or ("full_page" if full_page else "viewport")
|
|
59
|
-
group_id = generate_group_id(
|
|
60
|
-
"browser_screenshot_analyze", f"{question[:50]}_{target}"
|
|
61
|
-
)
|
|
62
|
-
emit_info(
|
|
63
|
-
f"BROWSER SCREENSHOT ANALYZE 📷 question='{question[:100]}{'...' if len(question) > 100 else ''}' target={target}",
|
|
64
|
-
message_group=group_id,
|
|
65
|
-
)
|
|
66
|
-
|
|
67
|
-
try:
|
|
68
|
-
# Get the browser page
|
|
69
|
-
browser_manager = get_session_browser_manager()
|
|
70
|
-
page = await browser_manager.get_current_page()
|
|
71
|
-
|
|
72
|
-
if not page:
|
|
73
|
-
error_msg = "No active browser page. Navigate to a webpage first."
|
|
74
|
-
emit_error(error_msg, message_group=group_id)
|
|
75
|
-
return {"success": False, "error": error_msg, "question": question}
|
|
76
|
-
|
|
77
|
-
# Capture the screenshot
|
|
78
|
-
screenshot_result = await _capture_screenshot(
|
|
79
|
-
page,
|
|
80
|
-
full_page=full_page,
|
|
81
|
-
element_selector=element_selector,
|
|
82
|
-
save_screenshot=save_screenshot,
|
|
83
|
-
group_id=group_id,
|
|
84
|
-
)
|
|
85
|
-
|
|
86
|
-
if not screenshot_result["success"]:
|
|
87
|
-
error_msg = screenshot_result.get("error", "Screenshot failed")
|
|
88
|
-
emit_error(
|
|
89
|
-
f"Screenshot capture failed: {error_msg}", message_group=group_id
|
|
90
|
-
)
|
|
91
|
-
return {"success": False, "error": error_msg, "question": question}
|
|
92
|
-
|
|
93
|
-
screenshot_bytes = screenshot_result.get("screenshot_bytes")
|
|
94
|
-
if not screenshot_bytes:
|
|
95
|
-
emit_error(
|
|
96
|
-
"Screenshot captured but pixel data missing; cannot run visual analysis.",
|
|
97
|
-
message_group=group_id,
|
|
98
|
-
)
|
|
99
|
-
return {
|
|
100
|
-
"success": False,
|
|
101
|
-
"error": "Screenshot captured but no image bytes available for analysis.",
|
|
102
|
-
"question": question,
|
|
103
|
-
}
|
|
104
|
-
|
|
105
|
-
# Run VQA analysis with streaming output
|
|
106
|
-
try:
|
|
107
|
-
console = Console()
|
|
108
|
-
console.print() # Newline before streaming starts
|
|
109
|
-
console.print("[bold cyan]🔍 VQA Analysis:[/bold cyan]")
|
|
110
|
-
|
|
111
|
-
vqa_answer = await run_vqa_analysis_stream(
|
|
112
|
-
question,
|
|
113
|
-
screenshot_bytes,
|
|
114
|
-
)
|
|
115
|
-
except Exception as exc:
|
|
116
|
-
emit_error(
|
|
117
|
-
f"Visual question answering failed: {exc}",
|
|
118
|
-
message_group=group_id,
|
|
119
|
-
)
|
|
120
|
-
return {
|
|
121
|
-
"success": False,
|
|
122
|
-
"error": f"Visual analysis failed: {exc}",
|
|
123
|
-
"question": question,
|
|
124
|
-
"screenshot_info": {
|
|
125
|
-
"path": screenshot_result.get("screenshot_path"),
|
|
126
|
-
"timestamp": screenshot_result.get("timestamp"),
|
|
127
|
-
"full_page": full_page,
|
|
128
|
-
"element_selector": element_selector,
|
|
129
|
-
},
|
|
130
|
-
}
|
|
131
|
-
|
|
132
|
-
emit_success(
|
|
133
|
-
"Visual analysis complete",
|
|
134
|
-
message_group=group_id,
|
|
135
|
-
)
|
|
136
|
-
|
|
137
|
-
return {
|
|
138
|
-
"success": True,
|
|
139
|
-
"question": question,
|
|
140
|
-
"answer": vqa_answer,
|
|
141
|
-
"screenshot_info": {
|
|
142
|
-
"path": screenshot_result.get("screenshot_path"),
|
|
143
|
-
"size": len(screenshot_bytes),
|
|
144
|
-
"timestamp": screenshot_result.get("timestamp"),
|
|
145
|
-
"full_page": full_page,
|
|
146
|
-
"element_selector": element_selector,
|
|
147
|
-
},
|
|
148
|
-
}
|
|
149
|
-
|
|
150
|
-
except Exception as e:
|
|
151
|
-
error_msg = f"Screenshot analysis failed: {str(e)}"
|
|
152
|
-
emit_error(error_msg, message_group=group_id)
|
|
153
|
-
return {"success": False, "error": error_msg, "question": question}
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
def register_take_screenshot_and_analyze_vqa(agent):
|
|
157
|
-
"""Register the VQA-based screenshot tool.
|
|
158
|
-
|
|
159
|
-
This tool takes a screenshot and analyzes it using a separate VQA agent.
|
|
160
|
-
Use this for agents where context management is important (like qa-kitten).
|
|
161
|
-
"""
|
|
162
|
-
|
|
163
|
-
@agent.tool
|
|
164
|
-
async def browser_screenshot_vqa(
|
|
165
|
-
context: RunContext,
|
|
166
|
-
question: str,
|
|
167
|
-
full_page: bool = False,
|
|
168
|
-
element_selector: Optional[str] = None,
|
|
169
|
-
) -> Dict[str, Any]:
|
|
170
|
-
"""
|
|
171
|
-
Take a screenshot and analyze it with VQA.
|
|
172
|
-
|
|
173
|
-
Captures a screenshot of the browser and uses a visual AI to
|
|
174
|
-
answer your question about what's visible on the page.
|
|
175
|
-
|
|
176
|
-
Args:
|
|
177
|
-
question: What you want to know about the screenshot.
|
|
178
|
-
Examples:
|
|
179
|
-
- "What buttons are visible?"
|
|
180
|
-
- "Is there an error message?"
|
|
181
|
-
- "What is the page title?"
|
|
182
|
-
- "Is the form filled out correctly?"
|
|
183
|
-
full_page: Capture full page (True) or just viewport (False).
|
|
184
|
-
element_selector: Optional CSS selector to screenshot specific element.
|
|
185
|
-
|
|
186
|
-
Returns:
|
|
187
|
-
Dict with:
|
|
188
|
-
- answer: The streamed answer to your question
|
|
189
|
-
- screenshot_info: Where the screenshot was saved, etc.
|
|
190
|
-
"""
|
|
191
|
-
return await take_screenshot_and_analyze(
|
|
192
|
-
question=question,
|
|
193
|
-
full_page=full_page,
|
|
194
|
-
element_selector=element_selector,
|
|
195
|
-
)
|
|
@@ -1,194 +0,0 @@
|
|
|
1
|
-
"""Utilities for running visual question-answering via pydantic-ai."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
from collections.abc import AsyncIterable
|
|
6
|
-
from typing import Any
|
|
7
|
-
|
|
8
|
-
from pydantic import BaseModel, Field
|
|
9
|
-
from pydantic_ai import Agent, BinaryContent, PartDeltaEvent, PartStartEvent, RunContext
|
|
10
|
-
from pydantic_ai.messages import TextPart, TextPartDelta
|
|
11
|
-
|
|
12
|
-
from code_puppy.config import get_use_dbos, get_vqa_model_name
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class VisualAnalysisResult(BaseModel):
|
|
16
|
-
"""Structured response from the VQA agent."""
|
|
17
|
-
|
|
18
|
-
answer: str
|
|
19
|
-
confidence: float = Field(ge=0.0, le=1.0)
|
|
20
|
-
observations: str
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
DEFAULT_VQA_INSTRUCTIONS = (
|
|
24
|
-
"You are a visual analysis specialist. Answer the user's question about the provided image. "
|
|
25
|
-
"Always respond using the structured schema: answer, confidence (0-1 float), observations. "
|
|
26
|
-
"Confidence reflects how certain you are about the answer. Observations should include useful, concise context."
|
|
27
|
-
)
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
async def run_vqa_analysis(
|
|
31
|
-
question: str,
|
|
32
|
-
image_bytes: bytes,
|
|
33
|
-
media_type: str = "image/png",
|
|
34
|
-
) -> str:
|
|
35
|
-
"""Execute the VQA agent asynchronously against screenshot bytes.
|
|
36
|
-
|
|
37
|
-
Follows the same pattern as agent_tools.py for prompt preparation
|
|
38
|
-
and model configuration.
|
|
39
|
-
|
|
40
|
-
Args:
|
|
41
|
-
question: The question to ask about the image.
|
|
42
|
-
image_bytes: The raw image bytes.
|
|
43
|
-
media_type: The MIME type of the image (default: "image/png").
|
|
44
|
-
system_prompt: Optional custom system prompt. If None, uses default VQA instructions.
|
|
45
|
-
|
|
46
|
-
Returns:
|
|
47
|
-
str: The answer from the VQA analysis.
|
|
48
|
-
"""
|
|
49
|
-
from code_puppy import callbacks
|
|
50
|
-
from code_puppy.model_factory import ModelFactory
|
|
51
|
-
from code_puppy.model_utils import prepare_prompt_for_model
|
|
52
|
-
|
|
53
|
-
# Get model configuration
|
|
54
|
-
model_name = get_vqa_model_name()
|
|
55
|
-
models_config = ModelFactory.load_config()
|
|
56
|
-
model = ModelFactory.get_model(model_name, models_config)
|
|
57
|
-
|
|
58
|
-
# Build instructions: custom system_prompt or default VQA instructions
|
|
59
|
-
instructions = DEFAULT_VQA_INSTRUCTIONS
|
|
60
|
-
|
|
61
|
-
# Apply prompt additions (like file permission handling) - same as agent_tools.py
|
|
62
|
-
prompt_additions = callbacks.on_load_prompt()
|
|
63
|
-
if prompt_additions:
|
|
64
|
-
instructions += "\n" + "\n".join(prompt_additions)
|
|
65
|
-
|
|
66
|
-
# Handle claude-code models: swap instructions, prepend system prompt to user question
|
|
67
|
-
# Following the exact pattern from agent_tools.py
|
|
68
|
-
prepared = prepare_prompt_for_model(
|
|
69
|
-
model_name, instructions, question, prepend_system_to_user=True
|
|
70
|
-
)
|
|
71
|
-
instructions = prepared.instructions
|
|
72
|
-
question = prepared.user_prompt
|
|
73
|
-
|
|
74
|
-
# Create the VQA agent with string output
|
|
75
|
-
vqa_agent = Agent(
|
|
76
|
-
model=model,
|
|
77
|
-
instructions=instructions,
|
|
78
|
-
)
|
|
79
|
-
|
|
80
|
-
# Wrap with DBOS if enabled
|
|
81
|
-
if get_use_dbos():
|
|
82
|
-
from pydantic_ai.durable_exec.dbos import DBOSAgent
|
|
83
|
-
|
|
84
|
-
vqa_agent = DBOSAgent(vqa_agent, name="vqa-agent")
|
|
85
|
-
|
|
86
|
-
# Run the agent with the image
|
|
87
|
-
result = await vqa_agent.run(
|
|
88
|
-
[
|
|
89
|
-
question,
|
|
90
|
-
BinaryContent(data=image_bytes, media_type=media_type),
|
|
91
|
-
]
|
|
92
|
-
)
|
|
93
|
-
return result.output
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
def _create_vqa_stream_handler(
|
|
97
|
-
accumulator: list[str],
|
|
98
|
-
):
|
|
99
|
-
"""Create an event stream handler that accumulates text.
|
|
100
|
-
|
|
101
|
-
Args:
|
|
102
|
-
accumulator: List to accumulate text chunks into (pass empty list).
|
|
103
|
-
|
|
104
|
-
Returns:
|
|
105
|
-
Async event stream handler function.
|
|
106
|
-
"""
|
|
107
|
-
|
|
108
|
-
async def vqa_event_stream_handler(
|
|
109
|
-
ctx: RunContext,
|
|
110
|
-
events: AsyncIterable[Any],
|
|
111
|
-
) -> None:
|
|
112
|
-
"""Handle streaming events - print text as it arrives."""
|
|
113
|
-
async for event in events:
|
|
114
|
-
# Handle text part start - might have initial content
|
|
115
|
-
if isinstance(event, PartStartEvent):
|
|
116
|
-
if isinstance(event.part, TextPart) and event.part.content:
|
|
117
|
-
accumulator.append(event.part.content)
|
|
118
|
-
|
|
119
|
-
# Handle text deltas - the streaming bits
|
|
120
|
-
elif isinstance(event, PartDeltaEvent):
|
|
121
|
-
if isinstance(event.delta, TextPartDelta) and event.delta.content_delta:
|
|
122
|
-
accumulator.append(event.delta.content_delta)
|
|
123
|
-
|
|
124
|
-
return vqa_event_stream_handler
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
async def run_vqa_analysis_stream(
|
|
128
|
-
question: str,
|
|
129
|
-
image_bytes: bytes,
|
|
130
|
-
media_type: str = "image/png",
|
|
131
|
-
) -> str:
|
|
132
|
-
"""Execute the VQA agent with streaming output.
|
|
133
|
-
|
|
134
|
-
Streams text to console as it arrives and accumulates the full response.
|
|
135
|
-
|
|
136
|
-
Args:
|
|
137
|
-
question: The question to ask about the image.
|
|
138
|
-
image_bytes: The raw image bytes.
|
|
139
|
-
media_type: The MIME type of the image (default: "image/png").
|
|
140
|
-
|
|
141
|
-
Returns:
|
|
142
|
-
str: The accumulated answer from the VQA analysis.
|
|
143
|
-
"""
|
|
144
|
-
from code_puppy import callbacks
|
|
145
|
-
from code_puppy.model_factory import ModelFactory
|
|
146
|
-
from code_puppy.model_utils import prepare_prompt_for_model
|
|
147
|
-
|
|
148
|
-
# Get model configuration
|
|
149
|
-
model_name = get_vqa_model_name()
|
|
150
|
-
models_config = ModelFactory.load_config()
|
|
151
|
-
model = ModelFactory.get_model(model_name, models_config)
|
|
152
|
-
|
|
153
|
-
# Build instructions
|
|
154
|
-
instructions = DEFAULT_VQA_INSTRUCTIONS
|
|
155
|
-
|
|
156
|
-
# Apply prompt additions (like file permission handling)
|
|
157
|
-
prompt_additions = callbacks.on_load_prompt()
|
|
158
|
-
if prompt_additions:
|
|
159
|
-
instructions += "\n" + "\n".join(prompt_additions)
|
|
160
|
-
|
|
161
|
-
# Handle claude-code models: swap instructions, prepend system prompt to user question
|
|
162
|
-
prepared = prepare_prompt_for_model(
|
|
163
|
-
model_name, instructions, question, prepend_system_to_user=True
|
|
164
|
-
)
|
|
165
|
-
instructions = prepared.instructions
|
|
166
|
-
question = prepared.user_prompt
|
|
167
|
-
|
|
168
|
-
# Create the VQA agent
|
|
169
|
-
vqa_agent = Agent(
|
|
170
|
-
model=model,
|
|
171
|
-
instructions=instructions,
|
|
172
|
-
)
|
|
173
|
-
|
|
174
|
-
# Wrap with DBOS if enabled
|
|
175
|
-
if get_use_dbos():
|
|
176
|
-
from pydantic_ai.durable_exec.dbos import DBOSAgent
|
|
177
|
-
|
|
178
|
-
vqa_agent = DBOSAgent(vqa_agent, name="vqa-agent-stream")
|
|
179
|
-
|
|
180
|
-
# Accumulator for streamed text (use list to allow mutation in handler)
|
|
181
|
-
accumulated_chunks: list[str] = []
|
|
182
|
-
|
|
183
|
-
# Create the stream handler
|
|
184
|
-
stream_handler = _create_vqa_stream_handler(accumulated_chunks)
|
|
185
|
-
|
|
186
|
-
# Run the agent with event_stream_handler
|
|
187
|
-
result = await vqa_agent.run(
|
|
188
|
-
[
|
|
189
|
-
question,
|
|
190
|
-
BinaryContent(data=image_bytes, media_type=media_type),
|
|
191
|
-
],
|
|
192
|
-
event_stream_handler=stream_handler,
|
|
193
|
-
)
|
|
194
|
-
return result.output
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|