code-puppy 0.0.358__py3-none-any.whl → 0.0.360__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,7 +16,7 @@ class QualityAssuranceKittenAgent(BaseAgent):
16
16
 
17
17
  @property
18
18
  def description(self) -> str:
19
- return "Advanced web browser automation and quality assurance testing using Playwright with VQA capabilities"
19
+ return "Advanced web browser automation and quality assurance testing using Playwright with visual analysis capabilities"
20
20
 
21
21
  def get_available_tools(self) -> list[str]:
22
22
  """Get the list of tools available to Web Browser Puppy."""
@@ -63,8 +63,9 @@ class QualityAssuranceKittenAgent(BaseAgent):
63
63
  "browser_wait_for_element",
64
64
  "browser_highlight_element",
65
65
  "browser_clear_highlights",
66
- # Screenshots and VQA (uses dedicated VQA agent for context management)
67
- "browser_screenshot_vqa",
66
+ # Screenshots (returns BinaryContent for direct visual analysis)
67
+ "browser_screenshot_analyze",
68
+ "load_image_for_analysis",
68
69
  # Workflow management
69
70
  "browser_save_workflow",
70
71
  "browser_list_workflows",
@@ -78,7 +79,7 @@ You are Quality Assurance Kitten 🐱, an advanced autonomous browser automation
78
79
 
79
80
  You specialize in:
80
81
  🎯 **Quality Assurance Testing** - automated testing of web applications and user workflows
81
- 👁️ **Visual verification** - taking screenshots and analyzing page content for bugs
82
+ 👁️ **Visual verification** - taking screenshots you can directly see and analyze for bugs
82
83
  🔍 **Element discovery** - finding elements using semantic locators and accessibility best practices
83
84
  📝 **Data extraction** - scraping content and gathering information from web pages
84
85
  🧪 **Web automation** - filling forms, clicking buttons, navigating sites with precision
@@ -117,12 +118,10 @@ For any browser task, follow this approach:
117
118
 
118
119
  ### Visual Verification Workflow
119
120
  - **Before critical actions**: Use browser_highlight_element to visually confirm
120
- - **After interactions**: Use browser_screenshot_vqa to verify results
121
- - **Ask specific questions**: The VQA tool requires a question like:
122
- - "Is the login button visible?"
123
- - "What error message is displayed?"
124
- - "Is the form filled out correctly?"
125
- - "What is the main heading text?"
121
+ - **After interactions**: Use browser_screenshot_analyze to verify results
122
+ - The screenshot is returned directly as an image you can see and analyze
123
+ - No need to ask questions - just analyze what you see in the returned image
124
+ - Use load_image_for_analysis to load mockups or reference images for comparison
126
125
 
127
126
  ### Form Input Best Practices
128
127
  - **ALWAYS check current values** with browser_get_value before typing
@@ -135,14 +134,15 @@ For any browser task, follow this approach:
135
134
  **When Element Discovery Fails:**
136
135
  1. Try different semantic locators first
137
136
  2. Use browser_find_buttons or browser_find_links to see available elements
138
- 3. Take a screenshot with browser_screenshot_analyze to understand the page layout
137
+ 3. Take a screenshot with browser_screenshot_analyze to see and understand the page layout
139
138
  4. Only use XPath as absolute last resort
140
139
 
141
140
  **When Page Interactions Fail:**
142
141
  1. Check if element is visible with browser_wait_for_element
143
142
  2. Scroll element into view with browser_scroll_to_element
144
143
  3. Use browser_highlight_element to confirm element location
145
- 4. Try browser_execute_js for complex interactions
144
+ 4. Take a screenshot with browser_screenshot_analyze to see the actual page state
145
+ 5. Try browser_execute_js for complex interactions
146
146
 
147
147
  ### JavaScript Execution
148
148
  - Use browser_execute_js for:
@@ -187,7 +187,7 @@ For any browser task, follow this approach:
187
187
  ## Specialized Capabilities
188
188
 
189
189
  🌐 **WCAG 2.2 Level AA Compliance**: Always prioritize accessibility in element discovery
190
- 📸 **Visual Question Answering**: Use browser_screenshot_vqa for intelligent page analysis (uses dedicated VQA agent)
190
+ 📸 **Direct Visual Analysis**: Use browser_screenshot_analyze to see and analyze page content directly
191
191
  🚀 **Semantic Web Navigation**: Prefer role-based and label-based element discovery
192
192
  ⚡ **Playwright Power**: Full access to modern browser automation capabilities
193
193
  📋 **Workflow Management**: Save, load, and reuse automation patterns for consistency
@@ -119,6 +119,7 @@ async def event_stream_handler(
119
119
  tool_parts: set[int] = set() # Track which parts are tool calls
120
120
  banner_printed: set[int] = set() # Track if banner was already printed
121
121
  token_count: dict[int, int] = {} # Track token count per text/tool part
122
+ tool_names: dict[int, str] = {} # Track tool name per tool part index
122
123
  did_stream_anything = False # Track if we streamed any content
123
124
 
124
125
  # Termflow streaming state for text parts
@@ -203,6 +204,8 @@ async def event_stream_handler(
203
204
  streaming_parts.add(event.index)
204
205
  tool_parts.add(event.index)
205
206
  token_count[event.index] = 0 # Initialize token counter
207
+ # Capture tool name from the start event
208
+ tool_names[event.index] = part.tool_name or ""
206
209
  # Track tool name for display
207
210
  banner_printed.add(
208
211
  event.index
@@ -253,20 +256,36 @@ async def event_stream_handler(
253
256
  escaped = escape(delta.content_delta)
254
257
  console.print(f"[dim]{escaped}[/dim]", end="")
255
258
  elif isinstance(delta, ToolCallPartDelta):
256
- # For tool calls, count chunks received
257
- token_count[event.index] += 1
258
- # Get tool name if available
259
- tool_name = getattr(delta, "tool_name_delta", "")
259
+ # For tool calls, estimate tokens from args_delta content
260
+ # args_delta contains the streaming JSON arguments
261
+ args_delta = getattr(delta, "args_delta", "") or ""
262
+ if args_delta:
263
+ # Rough estimate: 4 chars ≈ 1 token (same heuristic as subagent_stream_handler)
264
+ estimated_tokens = max(1, len(args_delta) // 4)
265
+ token_count[event.index] += estimated_tokens
266
+ else:
267
+ # Even empty deltas count as activity
268
+ token_count[event.index] += 1
269
+
270
+ # Update tool name if delta provides more of it
271
+ tool_name_delta = getattr(delta, "tool_name_delta", "") or ""
272
+ if tool_name_delta:
273
+ tool_names[event.index] = (
274
+ tool_names.get(event.index, "") + tool_name_delta
275
+ )
276
+
277
+ # Use stored tool name for display
278
+ tool_name = tool_names.get(event.index, "")
260
279
  count = token_count[event.index]
261
280
  # Display with tool wrench icon and tool name
262
281
  if tool_name:
263
282
  console.print(
264
- f" \U0001f527 Calling {tool_name}... {count} chunks ",
283
+ f" \U0001f527 Calling {tool_name}... {count} token(s) ",
265
284
  end="\r",
266
285
  )
267
286
  else:
268
287
  console.print(
269
- f" \U0001f527 Calling tool... {count} chunks ",
288
+ f" \U0001f527 Calling tool... {count} token(s) ",
270
289
  end="\r",
271
290
  )
272
291
 
@@ -311,8 +330,9 @@ async def event_stream_handler(
311
330
  elif event.index in banner_printed:
312
331
  console.print() # Final newline after streaming
313
332
 
314
- # Clean up token count
333
+ # Clean up token count and tool names
315
334
  token_count.pop(event.index, None)
335
+ tool_names.pop(event.index, None)
316
336
  # Clean up all tracking sets
317
337
  streaming_parts.discard(event.index)
318
338
  thinking_parts.discard(event.index)
code_puppy/config.py CHANGED
@@ -98,7 +98,6 @@ _CURRENT_AUTOSAVE_ID: Optional[str] = None
98
98
  _model_validation_cache = {}
99
99
  _default_model_cache = None
100
100
  _default_vision_model_cache = None
101
- _default_vqa_model_cache = None
102
101
 
103
102
 
104
103
  def ensure_config_exists():
@@ -358,47 +357,6 @@ def _default_vision_model_from_models_json() -> str:
358
357
  return "gpt-4.1"
359
358
 
360
359
 
361
- def _default_vqa_model_from_models_json() -> str:
362
- """Select a default VQA-capable model, preferring vision-ready options."""
363
- global _default_vqa_model_cache
364
-
365
- if _default_vqa_model_cache is not None:
366
- return _default_vqa_model_cache
367
-
368
- try:
369
- from code_puppy.model_factory import ModelFactory
370
-
371
- models_config = ModelFactory.load_config()
372
- if models_config:
373
- # Allow explicit VQA hints if present
374
- for name, config in models_config.items():
375
- if config.get("supports_vqa"):
376
- _default_vqa_model_cache = name
377
- return name
378
-
379
- # Reuse multimodal heuristics before falling back to generic default
380
- preferred_candidates = (
381
- "gpt-4.1",
382
- "gpt-4.1-mini",
383
- "claude-4-0-sonnet",
384
- "gemini-2.5-flash-preview-05-20",
385
- "gpt-4.1-nano",
386
- )
387
- for candidate in preferred_candidates:
388
- if candidate in models_config:
389
- _default_vqa_model_cache = candidate
390
- return candidate
391
-
392
- _default_vqa_model_cache = _default_model_from_models_json()
393
- return _default_vqa_model_cache
394
-
395
- _default_vqa_model_cache = "gpt-4.1"
396
- return "gpt-4.1"
397
- except Exception:
398
- _default_vqa_model_cache = "gpt-4.1"
399
- return "gpt-4.1"
400
-
401
-
402
360
  def _validate_model_exists(model_name: str) -> bool:
403
361
  """Check if a model exists in models.json with caching to avoid redundant calls."""
404
362
  global _model_validation_cache
@@ -424,15 +382,10 @@ def _validate_model_exists(model_name: str) -> bool:
424
382
 
425
383
  def clear_model_cache():
426
384
  """Clear the model validation cache. Call this when models.json changes."""
427
- global \
428
- _model_validation_cache, \
429
- _default_model_cache, \
430
- _default_vision_model_cache, \
431
- _default_vqa_model_cache
385
+ global _model_validation_cache, _default_model_cache, _default_vision_model_cache
432
386
  _model_validation_cache.clear()
433
387
  _default_model_cache = None
434
388
  _default_vision_model_cache = None
435
- _default_vqa_model_cache = None
436
389
 
437
390
 
438
391
  def model_supports_setting(model_name: str, setting: str) -> bool:
@@ -503,21 +456,6 @@ def set_model_name(model: str):
503
456
  clear_model_cache()
504
457
 
505
458
 
506
- def get_vqa_model_name() -> str:
507
- """Return the configured VQA model, falling back to the global model."""
508
- stored_model = get_value("vqa_model_name")
509
- if stored_model and _validate_model_exists(stored_model):
510
- return stored_model
511
- # Fall back to the global model if no specific VQA model is set
512
- return get_global_model_name()
513
-
514
-
515
- def set_vqa_model_name(model: str):
516
- """Persist the configured VQA model name and refresh caches."""
517
- set_config_value("vqa_model_name", model or "")
518
- clear_model_cache()
519
-
520
-
521
459
  def get_puppy_token():
522
460
  """Returns the puppy_token from config, or None if not set."""
523
461
  return get_value("puppy_token")
@@ -41,9 +41,6 @@ from code_puppy.tools.browser.browser_navigation import (
41
41
  from code_puppy.tools.browser.browser_screenshot import (
42
42
  register_take_screenshot_and_analyze,
43
43
  )
44
- from code_puppy.tools.browser.browser_screenshot_vqa import (
45
- register_take_screenshot_and_analyze_vqa,
46
- )
47
44
  from code_puppy.tools.browser.browser_scripts import (
48
45
  register_browser_clear_highlights,
49
46
  register_browser_highlight_element,
@@ -146,9 +143,8 @@ TOOL_REGISTRY = {
146
143
  "browser_wait_for_element": register_wait_for_element,
147
144
  "browser_highlight_element": register_browser_highlight_element,
148
145
  "browser_clear_highlights": register_browser_clear_highlights,
149
- # Browser Screenshots and VQA
146
+ # Browser Screenshots
150
147
  "browser_screenshot_analyze": register_take_screenshot_and_analyze,
151
- "browser_screenshot_vqa": register_take_screenshot_and_analyze_vqa,
152
148
  # Browser Workflows
153
149
  "browser_save_workflow": register_save_workflow,
154
150
  "browser_list_workflows": register_list_workflows,
@@ -11,7 +11,6 @@ from .camoufox_manager import (
11
11
  get_session_browser_manager,
12
12
  set_browser_session,
13
13
  )
14
- from .vqa_agent import VisualAnalysisResult, run_vqa_analysis, run_vqa_analysis_stream
15
14
 
16
15
 
17
16
  def format_terminal_banner(text: str) -> str:
@@ -35,7 +34,4 @@ __all__ = [
35
34
  "get_browser_session",
36
35
  "get_session_browser_manager",
37
36
  "set_browser_session",
38
- "VisualAnalysisResult",
39
- "run_vqa_analysis",
40
- "run_vqa_analysis_stream",
41
37
  ]
@@ -1,16 +1,16 @@
1
1
  """Screenshot tool for browser automation.
2
2
 
3
- Captures screenshots and returns them as base64 data that multimodal
4
- models can directly see and analyze - no separate VQA agent needed.
3
+ Captures screenshots and returns them via ToolReturn with BinaryContent
4
+ so multimodal models can directly see and analyze - no separate VQA agent needed.
5
5
  """
6
6
 
7
- import base64
7
+ import time
8
8
  from datetime import datetime
9
9
  from pathlib import Path
10
10
  from tempfile import gettempdir, mkdtemp
11
- from typing import Any, Dict, Optional
11
+ from typing import Any, Dict, Optional, Union
12
12
 
13
- from pydantic_ai import RunContext
13
+ from pydantic_ai import BinaryContent, RunContext, ToolReturn
14
14
 
15
15
  from code_puppy.messaging import emit_error, emit_info, emit_success
16
16
  from code_puppy.tools.common import generate_group_id
@@ -54,7 +54,6 @@ async def _capture_screenshot(
54
54
  result: Dict[str, Any] = {
55
55
  "success": True,
56
56
  "screenshot_bytes": screenshot_bytes,
57
- "base64_data": base64.b64encode(screenshot_bytes).decode("utf-8"),
58
57
  "timestamp": timestamp,
59
58
  }
60
59
 
@@ -80,11 +79,11 @@ async def take_screenshot(
80
79
  full_page: bool = False,
81
80
  element_selector: Optional[str] = None,
82
81
  save_screenshot: bool = True,
83
- ) -> Dict[str, Any]:
82
+ ) -> Union[ToolReturn, Dict[str, Any]]:
84
83
  """Take a screenshot of the browser page.
85
84
 
86
- Returns the screenshot as base64-encoded PNG data that multimodal
87
- models can directly see and analyze.
85
+ Returns a ToolReturn with BinaryContent so multimodal models can
86
+ directly see and analyze the screenshot.
88
87
 
89
88
  Args:
90
89
  full_page: Whether to capture full page or just viewport.
@@ -92,12 +91,11 @@ async def take_screenshot(
92
91
  save_screenshot: Whether to save the screenshot to disk.
93
92
 
94
93
  Returns:
95
- Dict containing:
96
- - success (bool): True if screenshot was captured.
97
- - base64_image (str): Base64-encoded PNG image data.
98
- - media_type (str): Always "image/png".
99
- - screenshot_path (str): Path to saved file (if saved).
100
- - error (str): Error message if unsuccessful.
94
+ ToolReturn containing:
95
+ - return_value: Success message with screenshot path
96
+ - content: List with description and BinaryContent image
97
+ - metadata: Screenshot details (path, target, timestamp)
98
+ Or Dict with error info if failed.
101
99
  """
102
100
  target = element_selector or ("full_page" if full_page else "viewport")
103
101
  group_id = generate_group_id("browser_screenshot", target)
@@ -122,15 +120,30 @@ async def take_screenshot(
122
120
 
123
121
  if not result["success"]:
124
122
  emit_error(result.get("error", "Screenshot failed"), message_group=group_id)
125
- return result
126
-
127
- return {
128
- "success": True,
129
- "base64_image": result["base64_data"],
130
- "media_type": "image/png",
131
- "screenshot_path": result.get("screenshot_path"),
132
- "message": "Screenshot captured. The base64_image contains the browser view.",
133
- }
123
+ return {"success": False, "error": result.get("error")}
124
+
125
+ screenshot_path = result.get("screenshot_path", "(not saved)")
126
+
127
+ # Return as ToolReturn with BinaryContent so the model can SEE the image!
128
+ return ToolReturn(
129
+ return_value=f"Screenshot captured successfully. Saved to: {screenshot_path}",
130
+ content=[
131
+ f"Here's the browser screenshot ({target}):",
132
+ BinaryContent(
133
+ data=result["screenshot_bytes"],
134
+ media_type="image/png",
135
+ ),
136
+ "Please analyze what you see and describe any relevant details.",
137
+ ],
138
+ metadata={
139
+ "success": True,
140
+ "screenshot_path": screenshot_path,
141
+ "target": target,
142
+ "full_page": full_page,
143
+ "element_selector": element_selector,
144
+ "timestamp": time.time(),
145
+ },
146
+ )
134
147
 
135
148
  except Exception as e:
136
149
  error_msg = f"Screenshot failed: {str(e)}"
@@ -146,19 +159,19 @@ def register_take_screenshot_and_analyze(agent):
146
159
  context: RunContext,
147
160
  full_page: bool = False,
148
161
  element_selector: Optional[str] = None,
149
- ) -> Dict[str, Any]:
162
+ ) -> Union[ToolReturn, Dict[str, Any]]:
150
163
  """
151
164
  Take a screenshot of the browser page.
152
165
 
153
- Returns the screenshot as base64 image data that you can see directly.
154
- Use this to see what's displayed in the browser.
166
+ Returns the screenshot via ToolReturn with BinaryContent that you can
167
+ see directly. Use this to see what's displayed in the browser.
155
168
 
156
169
  Args:
157
170
  full_page: Capture full page (True) or just viewport (False).
158
171
  element_selector: Optional CSS selector to screenshot specific element.
159
172
 
160
173
  Returns:
161
- Dict with base64_image (PNG data you can see), screenshot_path, etc.
174
+ ToolReturn with the screenshot image you can analyze, or error dict.
162
175
  """
163
176
  return await take_screenshot(
164
177
  full_page=full_page,
@@ -5,22 +5,22 @@ This module provides tools for:
5
5
  - Reading terminal output by scraping xterm.js DOM
6
6
  - Loading images from the filesystem
7
7
 
8
- Screenshots are returned as base64-encoded data that multimodal models
9
- can directly see and analyze - no separate VQA agent needed.
8
+ Screenshots and images are returned via ToolReturn with BinaryContent
9
+ so multimodal models can directly see and analyze them.
10
10
 
11
- Screenshots are automatically resized to reduce token usage.
11
+ Images are automatically resized to reduce token usage.
12
12
  """
13
13
 
14
- import base64
15
14
  import io
16
15
  import logging
16
+ import time
17
17
  from datetime import datetime
18
18
  from pathlib import Path
19
19
  from tempfile import gettempdir, mkdtemp
20
- from typing import Any, Dict
20
+ from typing import Any, Dict, Union
21
21
 
22
22
  from PIL import Image
23
- from pydantic_ai import RunContext
23
+ from pydantic_ai import BinaryContent, RunContext, ToolReturn
24
24
  from rich.text import Text
25
25
 
26
26
  from code_puppy.messaging import emit_error, emit_info, emit_success
@@ -178,7 +178,6 @@ async def _capture_terminal_screenshot(
178
178
  result: Dict[str, Any] = {
179
179
  "success": True,
180
180
  "screenshot_bytes": screenshot_bytes,
181
- "base64_data": base64.b64encode(screenshot_bytes).decode("utf-8"),
182
181
  }
183
182
 
184
183
  # Save to disk if requested (save the resized version)
@@ -205,11 +204,11 @@ async def _capture_terminal_screenshot(
205
204
  async def terminal_screenshot(
206
205
  full_page: bool = False,
207
206
  save_to_disk: bool = True,
208
- ) -> Dict[str, Any]:
207
+ ) -> Union[ToolReturn, Dict[str, Any]]:
209
208
  """Take a screenshot of the terminal browser.
210
209
 
211
- Captures a screenshot and returns it as base64-encoded PNG data.
212
- Multimodal models can directly see and analyze this image.
210
+ Captures a screenshot and returns it via ToolReturn with BinaryContent
211
+ so multimodal models can directly see and analyze the image.
213
212
 
214
213
  Args:
215
214
  full_page: Whether to capture the full page or just viewport.
@@ -218,18 +217,11 @@ async def terminal_screenshot(
218
217
  Defaults to True.
219
218
 
220
219
  Returns:
221
- A dictionary containing:
222
- - success (bool): True if screenshot was captured.
223
- - base64_image (str): Base64-encoded PNG image data.
224
- - media_type (str): Always "image/png".
225
- - screenshot_path (str): Path to saved file (if save_to_disk=True).
226
- - error (str): Error message if unsuccessful.
227
-
228
- Example:
229
- >>> result = await terminal_screenshot()
230
- >>> if result["success"]:
231
- ... # The base64_image can be shown to multimodal models
232
- ... print(f"Screenshot saved to: {result['screenshot_path']}")
220
+ ToolReturn containing:
221
+ - return_value: Success message with screenshot path
222
+ - content: List with description and BinaryContent image
223
+ - metadata: Screenshot details (path, target, timestamp)
224
+ Or Dict with error info if failed.
233
225
  """
234
226
  target = "full_page" if full_page else "viewport"
235
227
  group_id = generate_group_id("terminal_screenshot", target)
@@ -249,14 +241,27 @@ async def terminal_screenshot(
249
241
  emit_error(result.get("error", "Screenshot failed"), message_group=group_id)
250
242
  return result
251
243
 
252
- # Return clean result with base64 image for model consumption
253
- return {
254
- "success": True,
255
- "base64_image": result["base64_data"],
256
- "media_type": "image/png",
257
- "screenshot_path": result.get("screenshot_path"),
258
- "message": "Screenshot captured. The base64_image contains the terminal view.",
259
- }
244
+ screenshot_path = result.get("screenshot_path", "(not saved)")
245
+
246
+ # Return as ToolReturn with BinaryContent so the model can SEE the image!
247
+ return ToolReturn(
248
+ return_value=f"Terminal screenshot captured. Saved to: {screenshot_path}",
249
+ content=[
250
+ f"Here's the terminal screenshot ({target}):",
251
+ BinaryContent(
252
+ data=result["screenshot_bytes"],
253
+ media_type="image/png",
254
+ ),
255
+ "Please analyze what you see in the terminal.",
256
+ ],
257
+ metadata={
258
+ "success": True,
259
+ "screenshot_path": screenshot_path,
260
+ "target": target,
261
+ "full_page": full_page,
262
+ "timestamp": time.time(),
263
+ },
264
+ )
260
265
 
261
266
 
262
267
  async def terminal_read_output(lines: int = 50) -> Dict[str, Any]:
@@ -328,23 +333,22 @@ async def terminal_read_output(lines: int = 50) -> Dict[str, Any]:
328
333
  async def load_image(
329
334
  image_path: str,
330
335
  max_height: int = DEFAULT_MAX_HEIGHT,
331
- ) -> Dict[str, Any]:
332
- """Load an image from the filesystem as base64 data.
336
+ ) -> Union[ToolReturn, Dict[str, Any]]:
337
+ """Load an image from the filesystem for visual analysis.
333
338
 
334
339
  Loads any image file, resizes it to reduce token usage, and returns
335
- it as base64-encoded data that multimodal models can directly see.
340
+ it via ToolReturn with BinaryContent so multimodal models can see it.
336
341
 
337
342
  Args:
338
343
  image_path: Path to the image file.
339
344
  max_height: Maximum height for resizing (default 768px).
340
345
 
341
346
  Returns:
342
- A dictionary containing:
343
- - success (bool): True if image was loaded.
344
- - base64_image (str): Base64-encoded image data (resized).
345
- - media_type (str): The image MIME type (e.g., "image/png").
346
- - image_path (str): The original path.
347
- - error (str): Error message if unsuccessful.
347
+ ToolReturn containing:
348
+ - return_value: Success message with path info
349
+ - content: List with description and BinaryContent image
350
+ - metadata: Image details (path, resized height)
351
+ Or Dict with error info if failed.
348
352
  """
349
353
  group_id = generate_group_id("load_image", image_path)
350
354
  emit_info(f"LOAD IMAGE 🖼️ {image_path}", message_group=group_id)
@@ -368,18 +372,26 @@ async def load_image(
368
372
  # Resize to reduce token usage
369
373
  image_bytes = _resize_image(original_bytes, max_height=max_height)
370
374
 
371
- # Always return as PNG after resizing (consistent format)
372
- base64_data = base64.b64encode(image_bytes).decode("utf-8")
373
-
374
375
  emit_success(f"Loaded image: {image_path}", message_group=group_id)
375
376
 
376
- return {
377
- "success": True,
378
- "base64_image": base64_data,
379
- "media_type": "image/png", # Always PNG after resize
380
- "image_path": image_path,
381
- "message": f"Image loaded (resized to max {max_height}px height for token efficiency).",
382
- }
377
+ # Return as ToolReturn with BinaryContent so the model can SEE the image!
378
+ return ToolReturn(
379
+ return_value=f"Image loaded from: {image_path}",
380
+ content=[
381
+ f"Here's the image from {image_file.name}:",
382
+ BinaryContent(
383
+ data=image_bytes,
384
+ media_type="image/png", # Always PNG after resize
385
+ ),
386
+ "Please analyze what you see in this image.",
387
+ ],
388
+ metadata={
389
+ "success": True,
390
+ "image_path": image_path,
391
+ "max_height": max_height,
392
+ "timestamp": time.time(),
393
+ },
394
+ )
383
395
 
384
396
  except Exception as e:
385
397
  error_msg = f"Failed to load image: {str(e)}"
@@ -400,18 +412,18 @@ def register_terminal_screenshot(agent):
400
412
  async def terminal_screenshot_analyze(
401
413
  context: RunContext,
402
414
  full_page: bool = False,
403
- ) -> Dict[str, Any]:
415
+ ) -> Union[ToolReturn, Dict[str, Any]]:
404
416
  """
405
417
  Take a screenshot of the terminal browser.
406
418
 
407
- Returns the screenshot as base64 image data that you can see directly.
408
- Use this to see what's displayed in the terminal.
419
+ Returns the screenshot via ToolReturn with BinaryContent that you can
420
+ see directly. Use this to see what's displayed in the terminal.
409
421
 
410
422
  Args:
411
423
  full_page: Capture full page (True) or just viewport (False).
412
424
 
413
425
  Returns:
414
- Dict with base64_image (PNG data you can see), screenshot_path, etc.
426
+ ToolReturn with the terminal screenshot you can analyze, or error dict.
415
427
  """
416
428
  # Session is set by invoke_agent via contextvar
417
429
  return await terminal_screenshot(full_page=full_page)
@@ -449,17 +461,18 @@ def register_load_image(agent):
449
461
  async def load_image_for_analysis(
450
462
  context: RunContext,
451
463
  image_path: str,
452
- ) -> Dict[str, Any]:
464
+ ) -> Union[ToolReturn, Dict[str, Any]]:
453
465
  """
454
466
  Load an image file so you can see and analyze it.
455
467
 
456
- Returns the image as base64 data that you can see directly.
468
+ Returns the image via ToolReturn with BinaryContent that you can
469
+ see directly.
457
470
 
458
471
  Args:
459
472
  image_path: Path to the image file.
460
473
 
461
474
  Returns:
462
- Dict with base64_image (you can see this), media_type, etc.
475
+ ToolReturn with the image you can analyze, or error dict.
463
476
  """
464
477
  # Session is set by invoke_agent via contextvar
465
478
  return await load_image(image_path=image_path)
@@ -472,18 +485,18 @@ def register_terminal_compare_mockup(agent):
472
485
  async def terminal_compare_mockup(
473
486
  context: RunContext,
474
487
  mockup_path: str,
475
- ) -> Dict[str, Any]:
488
+ ) -> Union[ToolReturn, Dict[str, Any]]:
476
489
  """
477
490
  Compare the terminal to a mockup image.
478
491
 
479
492
  Takes a screenshot of the terminal and loads the mockup image.
480
- Returns both as base64 so you can visually compare them.
493
+ Returns both via ToolReturn with BinaryContent so you can compare them.
481
494
 
482
495
  Args:
483
496
  mockup_path: Path to the mockup/expected image.
484
497
 
485
498
  Returns:
486
- Dict with terminal_image, mockup_image (both base64), paths, etc.
499
+ ToolReturn with both images (terminal and mockup) you can compare.
487
500
  """
488
501
  # Session is set by invoke_agent via contextvar
489
502
  group_id = generate_group_id("terminal_compare_mockup", mockup_path)
@@ -493,28 +506,51 @@ def register_terminal_compare_mockup(agent):
493
506
  message_group=group_id,
494
507
  )
495
508
 
496
- # Load the mockup
497
- mockup_result = await load_image(mockup_path)
498
- if not mockup_result["success"]:
499
- return mockup_result
509
+ # Capture terminal screenshot (get raw result for bytes)
510
+ terminal_capture = await _capture_terminal_screenshot(
511
+ full_page=False,
512
+ save_to_disk=True,
513
+ group_id=group_id,
514
+ )
515
+ if not terminal_capture["success"]:
516
+ return terminal_capture
500
517
 
501
- # Take terminal screenshot
502
- terminal_result = await terminal_screenshot(full_page=False)
503
- if not terminal_result["success"]:
504
- return terminal_result
518
+ # Load the mockup image
519
+ mockup_file = Path(mockup_path)
520
+ if not mockup_file.exists():
521
+ error_msg = f"Mockup file not found: {mockup_path}"
522
+ emit_error(error_msg, message_group=group_id)
523
+ return {"success": False, "error": error_msg}
524
+
525
+ mockup_bytes = _resize_image(mockup_file.read_bytes())
505
526
 
506
527
  emit_success(
507
528
  "Both images loaded. Compare them visually.",
508
529
  message_group=group_id,
509
530
  )
510
531
 
511
- return {
512
- "success": True,
513
- "terminal_image": terminal_result["base64_image"],
514
- "mockup_image": mockup_result["base64_image"],
515
- "media_type": "image/png",
516
- "terminal_path": terminal_result.get("screenshot_path"),
517
- "mockup_path": mockup_path,
518
- "message": "Both images loaded. terminal_image shows the current terminal, "
519
- "mockup_image shows the expected design. Compare them visually.",
520
- }
532
+ terminal_path = terminal_capture.get("screenshot_path", "(not saved)")
533
+
534
+ # Return as ToolReturn with BOTH images as BinaryContent!
535
+ return ToolReturn(
536
+ return_value=f"Comparison ready: terminal vs mockup ({mockup_path})",
537
+ content=[
538
+ "Here's the CURRENT terminal screenshot:",
539
+ BinaryContent(
540
+ data=terminal_capture["screenshot_bytes"],
541
+ media_type="image/png",
542
+ ),
543
+ f"And here's the EXPECTED mockup ({mockup_file.name}):",
544
+ BinaryContent(
545
+ data=mockup_bytes,
546
+ media_type="image/png",
547
+ ),
548
+ "Please compare these images and describe any differences.",
549
+ ],
550
+ metadata={
551
+ "success": True,
552
+ "terminal_path": terminal_path,
553
+ "mockup_path": mockup_path,
554
+ "timestamp": time.time(),
555
+ },
556
+ )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: code-puppy
3
- Version: 0.0.358
3
+ Version: 0.0.360
4
4
  Summary: Code generation agent
5
5
  Project-URL: repository, https://github.com/mpfaffenberger/code_puppy
6
6
  Project-URL: HomePage, https://github.com/mpfaffenberger/code_puppy
@@ -4,7 +4,7 @@ code_puppy/callbacks.py,sha256=Pp0VyeXJBEtk-N_RSWr5pbveelovsdLUiJ4f11dzwGw,10775
4
4
  code_puppy/chatgpt_codex_client.py,sha256=Om0ANB_kpHubhCwNzF9ENf8RvKBqs0IYzBLl_SNw0Vk,9833
5
5
  code_puppy/claude_cache_client.py,sha256=Gl6um5ZaKpcnxOvoFSM8Lwm_Vu4-VyWz8Nli8DnRLa4,22508
6
6
  code_puppy/cli_runner.py,sha256=w5CLKgQYYaT7My3Cga2StXYol-u6DBxNzzUuhhsfhsA,34952
7
- code_puppy/config.py,sha256=gwOK-WDuYBzJKwyGCFALJiW0pstiA39pRDm1O1zFek4,54528
7
+ code_puppy/config.py,sha256=z4c-rKwQOEsg13HHd1KskIQG4Ygdr9krQsCAiZU-Wa0,52441
8
8
  code_puppy/error_logging.py,sha256=a80OILCUtJhexI6a9GM-r5LqIdjvSRzggfgPp2jv1X0,3297
9
9
  code_puppy/gemini_code_assist.py,sha256=KGS7sO5OLc83nDF3xxS-QiU6vxW9vcm6hmzilu79Ef8,13867
10
10
  code_puppy/http_utils.py,sha256=H3N5Qz2B1CcsGUYOycGWAqoNMr2P1NCVluKX3aRwRqI,10358
@@ -38,12 +38,12 @@ code_puppy/agents/agent_planning.py,sha256=6q3s5qCko2FcUfaLzImOFNDi0H61WBc2PNtsO
38
38
  code_puppy/agents/agent_python_programmer.py,sha256=R-7XoGIFJ58EY9LE9mWGcQQ8gSsMzi-1HD6wigJQPL8,6846
39
39
  code_puppy/agents/agent_python_reviewer.py,sha256=J8lqzoKJlohs8NWMbgUpHXNt1bXHNIkuGjzLd9Af8qE,5854
40
40
  code_puppy/agents/agent_qa_expert.py,sha256=5Ikb4U3SZQknUEfwlHZiyZXKqnffnOTQagr_wrkUkPk,10125
41
- code_puppy/agents/agent_qa_kitten.py,sha256=bjQdAPL_VMjSDn012mHQgnduuQkGG0JeXuC3T1KrU6g,9372
41
+ code_puppy/agents/agent_qa_kitten.py,sha256=qvry-1u_CiXi8eRueHTax4OtqsS_mQrtXHsbTXWzGYs,9517
42
42
  code_puppy/agents/agent_security_auditor.py,sha256=SpiYNA0XAsIwBj7S2_EQPRslRUmF_-b89pIJyW7DYtY,12022
43
43
  code_puppy/agents/agent_terminal_qa.py,sha256=U-iyP7OBWdAmchW_oUU8k6asH2aignTMmgqqYDyf-ms,10343
44
44
  code_puppy/agents/agent_typescript_reviewer.py,sha256=vsnpp98xg6cIoFAEJrRTUM_i4wLEWGm5nJxs6fhHobM,10275
45
45
  code_puppy/agents/base_agent.py,sha256=oKlX9CEIWSvdXyQDVi9F1jauA6rjKleY_n6044Ux5DY,73840
46
- code_puppy/agents/event_stream_handler.py,sha256=HM62_THZpMVnqKIB6Vbo6IwmJt6Kjoc3YbyRy2FclA4,13805
46
+ code_puppy/agents/event_stream_handler.py,sha256=JttLZJpNADE5HXiXY-GZ6tpwaBeFRODcy34KiquPOvU,14952
47
47
  code_puppy/agents/json_agent.py,sha256=lhopDJDoiSGHvD8A6t50hi9ZBoNRKgUywfxd0Po_Dzc,4886
48
48
  code_puppy/agents/prompt_reviewer.py,sha256=JJrJ0m5q0Puxl8vFsyhAbY9ftU9n6c6UxEVdNct1E-Q,5558
49
49
  code_puppy/agents/subagent_stream_handler.py,sha256=5imUFYOJCv7blfv4fTHm6OQ7JpqlyFv_luBYGSj16MA,10329
@@ -184,7 +184,7 @@ code_puppy/plugins/shell_safety/command_cache.py,sha256=adYtSPNVOZfW_6dQdtEihO6E
184
184
  code_puppy/plugins/shell_safety/register_callbacks.py,sha256=W3v664RR48Fdbbbltf_NnX22_Ahw2AvAOtvXvWc7KxQ,7322
185
185
  code_puppy/prompts/antigravity_system_prompt.md,sha256=ZaTfRyY57ttROyZMmOBtqZQu1to7sdTNTv8_0fTgPNw,6807
186
186
  code_puppy/prompts/codex_system_prompt.md,sha256=hEFTCziroLqZmqNle5kG34A8kvTteOWezCiVrAEKhE0,24400
187
- code_puppy/tools/__init__.py,sha256=WC1DO3OeTVSibpvIIoyfdxbeeC0oigiBSUqpmdw8G4o,7615
187
+ code_puppy/tools/__init__.py,sha256=9bzVIjX9CAr2YTZkhD7IWFYt4KpnFRx6ge_Tqazugbs,7425
188
188
  code_puppy/tools/agent_tools.py,sha256=XvBQ_IPa4NHLmIA2mdyPwy9GPlYGQwhtdn-w_3i239g,25517
189
189
  code_puppy/tools/command_runner.py,sha256=Sresr_ykou_c2V1sKoNxqrqCQovKF5yDiQJ8r3E9lak,50995
190
190
  code_puppy/tools/common.py,sha256=lVtF94cn6jtC5YKfitV7L3rk37Ts2gMoHLQrqDFD2E4,46411
@@ -193,25 +193,23 @@ code_puppy/tools/file_modifications.py,sha256=vz9n7R0AGDSdLUArZr_55yJLkyI30M8zre
193
193
  code_puppy/tools/file_operations.py,sha256=CqhpuBnOFOcQCIYXOujskxq2VMLWYJhibYrH0YcPSfA,35692
194
194
  code_puppy/tools/subagent_context.py,sha256=zsiKV3B3DxZ_Y5IHHhtE-SMFDg_jMrY7Hi6r5LH--IU,4781
195
195
  code_puppy/tools/tools_content.py,sha256=bsBqW-ppd1XNAS_g50B3UHDQBWEALC1UneH6-afz1zo,2365
196
- code_puppy/tools/browser/__init__.py,sha256=HqP5_AKL9IuaXeGLhL_Y799DBU28QZBd2x5ISKJlprc,1097
196
+ code_puppy/tools/browser/__init__.py,sha256=SPiEQwsDj5KoxDwX_viNUKFsn4tczxY-Jq2C64EzSNI,927
197
197
  code_puppy/tools/browser/browser_control.py,sha256=YntpjfWTIv0TDlAO5BqTV_hDbUBw-8wmMn29K3TDQo0,8430
198
198
  code_puppy/tools/browser/browser_interactions.py,sha256=ZyJmA2-ZtIATF76uGMt08cfVaYiqg7W2-cHfAzNI0F8,16775
199
199
  code_puppy/tools/browser/browser_locators.py,sha256=sxXNm-K087poeSp7Um5Gc1sZxb7HlSZOu0F0r2b0ty8,19177
200
200
  code_puppy/tools/browser/browser_navigation.py,sha256=RJdG14UXtA6wz4PNLw2Tqeu4oUDQilOyNbyTjgIFCrY,7416
201
- code_puppy/tools/browser/browser_screenshot.py,sha256=POlHDG7WJbjF3uBPUD7X2elAs-CKP9Dq7UQ7UZyvZGQ,5666
202
- code_puppy/tools/browser/browser_screenshot_vqa.py,sha256=DBdQuV7eIaJX2Qy_liwitfakIzrcVziB-zAGIngM5GE,7349
201
+ code_puppy/tools/browser/browser_screenshot.py,sha256=AJe9JbZv8vC93AFWzsAUlrg1YNshv4SWNde-O-_mfQU,6282
203
202
  code_puppy/tools/browser/browser_scripts.py,sha256=CYWdQMtjKTNvJNSCkB2vGo-MOzmT_gw2oFMGtkfuzuA,14779
204
203
  code_puppy/tools/browser/browser_workflows.py,sha256=nitW42vCf0ieTX1gLabozTugNQ8phtoFzZbiAhw1V90,6491
205
204
  code_puppy/tools/browser/camoufox_manager.py,sha256=WIr98SrGeC5jd6jX5tjhFR6A3janqV4tq9Mbznnlh44,13920
206
205
  code_puppy/tools/browser/chromium_terminal_manager.py,sha256=w1thQ_ACb6oV45L93TSqPQD0o0cTh3FqT5I9zcOOWlM,8226
207
206
  code_puppy/tools/browser/terminal_command_tools.py,sha256=9byOZku-dwvTtCl532xt7Lumed_jTn0sLvUe_X75XCQ,19068
208
- code_puppy/tools/browser/terminal_screenshot_tools.py,sha256=DAqzlqOoTfQZCKKeXccElrrY7s2CwSblvBV7A6A2GYw,17224
207
+ code_puppy/tools/browser/terminal_screenshot_tools.py,sha256=J_21YO_495NvYgNFu9KQP6VYg2K_f8CtSdZuF94Yhnw,18448
209
208
  code_puppy/tools/browser/terminal_tools.py,sha256=F5LjVH3udSCFHmqC3O1UJLoLozZFZsEdX42jOmkqkW0,17853
210
- code_puppy/tools/browser/vqa_agent.py,sha256=0IbS1X3l8ADZI9pGcJbKFoN0-ZuTJa8QvHZ_hGKBKRM,6339
211
- code_puppy-0.0.358.data/data/code_puppy/models.json,sha256=FMQdE_yvP_8y0xxt3K918UkFL9cZMYAqW1SfXcQkU_k,3105
212
- code_puppy-0.0.358.data/data/code_puppy/models_dev_api.json,sha256=wHjkj-IM_fx1oHki6-GqtOoCrRMR0ScK0f-Iz0UEcy8,548187
213
- code_puppy-0.0.358.dist-info/METADATA,sha256=ILk2_wUf4vZlImo2UX7dUsmXsNTPhvncu0AdLbO41GM,27614
214
- code_puppy-0.0.358.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
215
- code_puppy-0.0.358.dist-info/entry_points.txt,sha256=Tp4eQC99WY3HOKd3sdvb22vZODRq0XkZVNpXOag_KdI,91
216
- code_puppy-0.0.358.dist-info/licenses/LICENSE,sha256=31u8x0SPgdOq3izJX41kgFazWsM43zPEF9eskzqbJMY,1075
217
- code_puppy-0.0.358.dist-info/RECORD,,
209
+ code_puppy-0.0.360.data/data/code_puppy/models.json,sha256=FMQdE_yvP_8y0xxt3K918UkFL9cZMYAqW1SfXcQkU_k,3105
210
+ code_puppy-0.0.360.data/data/code_puppy/models_dev_api.json,sha256=wHjkj-IM_fx1oHki6-GqtOoCrRMR0ScK0f-Iz0UEcy8,548187
211
+ code_puppy-0.0.360.dist-info/METADATA,sha256=FZ7fXsTCXSepDHZqKoSNnjd_MyyMDx-Ntb2nnz3Jizg,27614
212
+ code_puppy-0.0.360.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
213
+ code_puppy-0.0.360.dist-info/entry_points.txt,sha256=Tp4eQC99WY3HOKd3sdvb22vZODRq0XkZVNpXOag_KdI,91
214
+ code_puppy-0.0.360.dist-info/licenses/LICENSE,sha256=31u8x0SPgdOq3izJX41kgFazWsM43zPEF9eskzqbJMY,1075
215
+ code_puppy-0.0.360.dist-info/RECORD,,
@@ -1,195 +0,0 @@
1
- """VQA-based Screenshot tool for browser automation (qa-kitten).
2
-
3
- This module provides screenshot analysis using a dedicated VQA agent.
4
- Unlike browser_screenshot.py which returns raw base64 bytes for multimodal
5
- models to see directly, this version offloads the visual analysis to a
6
- separate VQA agent, helping manage context in the calling agent.
7
-
8
- Use this for qa-kitten where context management is important.
9
- Use browser_screenshot.py for terminal-qa where direct image viewing is needed.
10
- """
11
-
12
- from typing import Any, Dict, Optional
13
-
14
- from pydantic_ai import RunContext
15
- from rich.console import Console
16
-
17
- from code_puppy.messaging import emit_error, emit_info, emit_success
18
- from code_puppy.tools.common import generate_group_id
19
-
20
- from .browser_screenshot import _capture_screenshot
21
- from .camoufox_manager import get_session_browser_manager
22
- from .vqa_agent import run_vqa_analysis_stream
23
-
24
-
25
async def take_screenshot_and_analyze(
    question: str,
    full_page: bool = False,
    element_selector: Optional[str] = None,
    save_screenshot: bool = True,
) -> Dict[str, Any]:
    """Capture a browser screenshot and ask the VQA agent a question about it.

    The screenshot is taken from the current page of the session browser
    manager via ``_capture_screenshot`` and its bytes are handed to
    ``run_vqa_analysis_stream`` together with ``question``.

    Args:
        question: The question to ask about the screenshot, e.g.
            "Is there an error message displayed?".
        full_page: Capture the full page instead of only the viewport.
        element_selector: Optional CSS selector; when truthy it is forwarded
            to the capture helper and used as the reported target.
        save_screenshot: Forwarded to the capture helper to control whether
            the screenshot is written to disk.

    Returns:
        A dict that always contains ``success`` and ``question``. On success
        it also contains ``answer`` and ``screenshot_info`` (path, size,
        timestamp, full_page, element_selector). On failure it contains
        ``error``; when the failure happened during the visual analysis step
        it additionally carries ``screenshot_info`` (without ``size``).
    """
    # Human-readable description of what is being captured.
    if element_selector:
        target = element_selector
    elif full_page:
        target = "full_page"
    else:
        target = "viewport"

    group_id = generate_group_id(
        "browser_screenshot_analyze", f"{question[:50]}_{target}"
    )
    ellipsis = "..." if len(question) > 100 else ""
    emit_info(
        f"BROWSER SCREENSHOT ANALYZE 📷 question='{question[:100]}{ellipsis}' target={target}",
        message_group=group_id,
    )

    def _failure(message: str, **extra: Any) -> Dict[str, Any]:
        # Every failure payload shares the same three leading keys.
        return {"success": False, "error": message, "question": question, **extra}

    try:
        page = await get_session_browser_manager().get_current_page()
        if not page:
            no_page_msg = "No active browser page. Navigate to a webpage first."
            emit_error(no_page_msg, message_group=group_id)
            return _failure(no_page_msg)

        capture = await _capture_screenshot(
            page,
            full_page=full_page,
            element_selector=element_selector,
            save_screenshot=save_screenshot,
            group_id=group_id,
        )

        if not capture["success"]:
            reason = capture.get("error", "Screenshot failed")
            emit_error(
                f"Screenshot capture failed: {reason}", message_group=group_id
            )
            return _failure(reason)

        image_bytes = capture.get("screenshot_bytes")
        if not image_bytes:
            emit_error(
                "Screenshot captured but pixel data missing; cannot run visual analysis.",
                message_group=group_id,
            )
            return _failure(
                "Screenshot captured but no image bytes available for analysis."
            )

        # The analysis step gets its own handler so that a failure here can
        # still report where the (already captured) screenshot lives.
        try:
            console = Console()
            console.print()  # Blank line ahead of the analysis header
            console.print("[bold cyan]🔍 VQA Analysis:[/bold cyan]")

            answer = await run_vqa_analysis_stream(question, image_bytes)
        except Exception as exc:
            emit_error(
                f"Visual question answering failed: {exc}",
                message_group=group_id,
            )
            return _failure(
                f"Visual analysis failed: {exc}",
                screenshot_info={
                    "path": capture.get("screenshot_path"),
                    "timestamp": capture.get("timestamp"),
                    "full_page": full_page,
                    "element_selector": element_selector,
                },
            )

        emit_success("Visual analysis complete", message_group=group_id)

        return {
            "success": True,
            "question": question,
            "answer": answer,
            "screenshot_info": {
                "path": capture.get("screenshot_path"),
                "size": len(image_bytes),
                "timestamp": capture.get("timestamp"),
                "full_page": full_page,
                "element_selector": element_selector,
            },
        }

    except Exception as e:
        # Catch-all boundary: the tool reports failures as data, not exceptions.
        failure_msg = f"Screenshot analysis failed: {e!s}"
        emit_error(failure_msg, message_group=group_id)
        return _failure(failure_msg)
154
-
155
-
156
def register_take_screenshot_and_analyze_vqa(agent):
    """Attach the ``browser_screenshot_vqa`` tool to ``agent``.

    The registered tool is a thin async wrapper that forwards its arguments
    to ``take_screenshot_and_analyze`` and returns that function's result
    dict unchanged.

    Args:
        agent: Object exposing a ``tool`` decorator used for registration.
    """

    # NOTE(review): the tool docstring below is presumably surfaced to the
    # model as the tool description, so its wording is kept unchanged.
    @agent.tool
    async def browser_screenshot_vqa(
        context: RunContext,
        question: str,
        full_page: bool = False,
        element_selector: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Take a screenshot and analyze it with VQA.

        Captures a screenshot of the browser and uses a visual AI to
        answer your question about what's visible on the page.

        Args:
            question: What you want to know about the screenshot.
                Examples:
                - "What buttons are visible?"
                - "Is there an error message?"
                - "What is the page title?"
                - "Is the form filled out correctly?"
            full_page: Capture full page (True) or just viewport (False).
            element_selector: Optional CSS selector to screenshot specific element.

        Returns:
            Dict with:
                - answer: The streamed answer to your question
                - screenshot_info: Where the screenshot was saved, etc.
        """
        # ``context`` is required by the tool signature but not used here.
        analysis = await take_screenshot_and_analyze(
            question,
            full_page=full_page,
            element_selector=element_selector,
        )
        return analysis
@@ -1,194 +0,0 @@
1
- """Utilities for running visual question-answering via pydantic-ai."""
2
-
3
- from __future__ import annotations
4
-
5
- from collections.abc import AsyncIterable
6
- from typing import Any
7
-
8
- from pydantic import BaseModel, Field
9
- from pydantic_ai import Agent, BinaryContent, PartDeltaEvent, PartStartEvent, RunContext
10
- from pydantic_ai.messages import TextPart, TextPartDelta
11
-
12
- from code_puppy.config import get_use_dbos, get_vqa_model_name
13
-
14
-
15
# NOTE(review): the agents constructed in this module are created without an
# output type, so this schema does not appear to be wired into them here.
class VisualAnalysisResult(BaseModel):
    """Structured response from the VQA agent."""

    # The answer text for the question that was asked.
    answer: str
    # Certainty score; validation restricts it to the closed range [0.0, 1.0].
    confidence: float = Field(ge=0.0, le=1.0)
    # Supporting context accompanying the answer.
    observations: str
21
-
22
-
23
# Base instructions given to the VQA agent; callers may append prompt
# additions to this text before the agent is built.
DEFAULT_VQA_INSTRUCTIONS = " ".join(
    (
        "You are a visual analysis specialist. Answer the user's question about the provided image.",
        "Always respond using the structured schema: answer, confidence (0-1 float), observations.",
        "Confidence reflects how certain you are about the answer. Observations should include useful, concise context.",
    )
)
28
-
29
-
30
def _build_vqa_agent(question: str, dbos_name: str) -> tuple[Any, str]:
    """Build the VQA agent and the model-specific user prompt.

    Shared setup for ``run_vqa_analysis`` and ``run_vqa_analysis_stream``.

    Args:
        question: The caller's question about the image.
        dbos_name: Name given to the ``DBOSAgent`` wrapper when DBOS is enabled.

    Returns:
        A ``(agent, user_prompt)`` pair, where ``user_prompt`` is the question
        as returned by ``prepare_prompt_for_model``.
    """
    # Imported at call time, as before — presumably to avoid import cycles;
    # confirm before hoisting to module level.
    from code_puppy import callbacks
    from code_puppy.model_factory import ModelFactory
    from code_puppy.model_utils import prepare_prompt_for_model

    # Resolve the configured VQA model.
    model_name = get_vqa_model_name()
    models_config = ModelFactory.load_config()
    model = ModelFactory.get_model(model_name, models_config)

    # Start from the default instructions and append any prompt additions
    # contributed by registered callbacks.
    instructions = DEFAULT_VQA_INSTRUCTIONS
    prompt_additions = callbacks.on_load_prompt()
    if prompt_additions:
        instructions += "\n" + "\n".join(prompt_additions)

    # Let the model-specific preparation adjust both the instructions and the
    # user prompt (it may prepend the system prompt to the question).
    prepared = prepare_prompt_for_model(
        model_name, instructions, question, prepend_system_to_user=True
    )

    vqa_agent = Agent(
        model=model,
        instructions=prepared.instructions,
    )

    # Wrap with DBOS if enabled.
    if get_use_dbos():
        from pydantic_ai.durable_exec.dbos import DBOSAgent

        vqa_agent = DBOSAgent(vqa_agent, name=dbos_name)

    return vqa_agent, prepared.user_prompt


async def run_vqa_analysis(
    question: str,
    image_bytes: bytes,
    media_type: str = "image/png",
) -> str:
    """Run the VQA agent once against an image and return its output.

    Args:
        question: The question to ask about the image.
        image_bytes: The raw image bytes.
        media_type: The MIME type of the image (default: "image/png").

    Returns:
        str: The agent's output for the question.
    """
    vqa_agent, user_prompt = _build_vqa_agent(question, "vqa-agent")

    # The prompt is a two-part message: the text question plus the image.
    result = await vqa_agent.run(
        [
            user_prompt,
            BinaryContent(data=image_bytes, media_type=media_type),
        ]
    )
    return result.output


def _create_vqa_stream_handler(
    accumulator: list[str],
):
    """Create an event stream handler that collects streamed text.

    Args:
        accumulator: List that receives each text chunk in arrival order
            (pass an empty list).

    Returns:
        Async event stream handler function.
    """

    async def vqa_event_stream_handler(
        ctx: RunContext,
        events: AsyncIterable[Any],
    ) -> None:
        """Append text from part-start and part-delta events to the accumulator."""
        async for event in events:
            # A text part may already carry initial content when it starts.
            if isinstance(event, PartStartEvent):
                if isinstance(event.part, TextPart) and event.part.content:
                    accumulator.append(event.part.content)

            # Subsequent text arrives as deltas.
            elif isinstance(event, PartDeltaEvent):
                if isinstance(event.delta, TextPartDelta) and event.delta.content_delta:
                    accumulator.append(event.delta.content_delta)

    return vqa_event_stream_handler


async def run_vqa_analysis_stream(
    question: str,
    image_bytes: bytes,
    media_type: str = "image/png",
) -> str:
    """Run the VQA agent with an event stream handler attached.

    Text chunks emitted during the run are collected by the handler into a
    local list; the value returned to the caller is the run's final output.

    Args:
        question: The question to ask about the image.
        image_bytes: The raw image bytes.
        media_type: The MIME type of the image (default: "image/png").

    Returns:
        str: The agent's output for the question.
    """
    vqa_agent, user_prompt = _build_vqa_agent(question, "vqa-agent-stream")

    # NOTE(review): the collected chunks are not read after the run; they
    # exist only because the handler needs somewhere to write.
    accumulated_chunks: list[str] = []
    stream_handler = _create_vqa_stream_handler(accumulated_chunks)

    result = await vqa_agent.run(
        [
            user_prompt,
            BinaryContent(data=image_bytes, media_type=media_type),
        ],
        event_stream_handler=stream_handler,
    )
    return result.output