code-puppy 0.0.356__py3-none-any.whl → 0.0.357__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. code_puppy/agents/agent_qa_kitten.py +10 -5
  2. code_puppy/agents/agent_terminal_qa.py +323 -0
  3. code_puppy/api/app.py +79 -2
  4. code_puppy/api/routers/commands.py +21 -2
  5. code_puppy/api/routers/sessions.py +49 -8
  6. code_puppy/config.py +5 -2
  7. code_puppy/tools/__init__.py +37 -0
  8. code_puppy/tools/agent_tools.py +26 -1
  9. code_puppy/tools/browser/__init__.py +41 -0
  10. code_puppy/tools/browser/browser_control.py +6 -6
  11. code_puppy/tools/browser/browser_interactions.py +21 -20
  12. code_puppy/tools/browser/browser_locators.py +9 -9
  13. code_puppy/tools/browser/browser_navigation.py +7 -7
  14. code_puppy/tools/browser/browser_screenshot.py +60 -135
  15. code_puppy/tools/browser/browser_screenshot_vqa.py +195 -0
  16. code_puppy/tools/browser/browser_scripts.py +15 -13
  17. code_puppy/tools/browser/camoufox_manager.py +226 -64
  18. code_puppy/tools/browser/chromium_terminal_manager.py +259 -0
  19. code_puppy/tools/browser/terminal_command_tools.py +521 -0
  20. code_puppy/tools/browser/terminal_screenshot_tools.py +520 -0
  21. code_puppy/tools/browser/terminal_tools.py +525 -0
  22. code_puppy/tools/browser/vqa_agent.py +138 -34
  23. code_puppy/tools/command_runner.py +0 -1
  24. {code_puppy-0.0.356.dist-info → code_puppy-0.0.357.dist-info}/METADATA +1 -1
  25. {code_puppy-0.0.356.dist-info → code_puppy-0.0.357.dist-info}/RECORD +30 -24
  26. {code_puppy-0.0.356.data → code_puppy-0.0.357.data}/data/code_puppy/models.json +0 -0
  27. {code_puppy-0.0.356.data → code_puppy-0.0.357.data}/data/code_puppy/models_dev_api.json +0 -0
  28. {code_puppy-0.0.356.dist-info → code_puppy-0.0.357.dist-info}/WHEEL +0 -0
  29. {code_puppy-0.0.356.dist-info → code_puppy-0.0.357.dist-info}/entry_points.txt +0 -0
  30. {code_puppy-0.0.356.dist-info → code_puppy-0.0.357.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,520 @@
1
+ """Terminal Screenshot Tools.
2
+
3
+ This module provides tools for:
4
+ - Taking screenshots of the terminal browser
5
+ - Reading terminal output by scraping xterm.js DOM
6
+ - Loading images from the filesystem
7
+
8
+ Screenshots are returned as base64-encoded data that multimodal models
9
+ can directly see and analyze - no separate VQA agent needed.
10
+
11
+ Screenshots are automatically resized to reduce token usage.
12
+ """
13
+
14
+ import base64
15
+ import io
16
+ import logging
17
+ from datetime import datetime
18
+ from pathlib import Path
19
+ from tempfile import gettempdir, mkdtemp
20
+ from typing import Any, Dict
21
+
22
+ from PIL import Image
23
+ from pydantic_ai import RunContext
24
+ from rich.text import Text
25
+
26
+ from code_puppy.messaging import emit_error, emit_info, emit_success
27
+ from code_puppy.tools.browser import format_terminal_banner
28
+ from code_puppy.tools.common import generate_group_id
29
+
30
+ from .terminal_tools import get_session_manager
31
+
32
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Default max height (in pixels) for screenshots; taller images are scaled
# down to this height, which reduces token usage significantly.
DEFAULT_MAX_HEIGHT = 768

# Temporary directory for screenshots.
# NOTE: mkdtemp() runs when this module is imported, so a fresh directory is
# created on every import; no code visible in this module removes it again.
_TEMP_SCREENSHOT_ROOT = Path(
    mkdtemp(prefix="code_puppy_terminal_screenshots_", dir=gettempdir())
)
41
+
42
# JavaScript to extract text content from xterm.js terminal.
#
# The arrow function is passed to page.evaluate() and returns a plain object:
#   - It tries the selectors below in order and uses the first element found.
#   - If none matches it falls back to the innerText of the `.xterm` element
#     (method 'innerText'); if that is missing too it returns
#     {success: false, error: ...}.
#   - Otherwise it walks every <div> under the container, concatenating the
#     text of its <span> children (or the div's own textContent when it has no
#     spans), and reports method 'row_extraction'.
# In both paths whitespace-only lines are dropped, so blank terminal rows never
# appear in `lines`.
# The '\\n' below is deliberate: this is a regular (non-raw) Python string, so
# the doubled backslash reaches the browser as the JavaScript escape '\n'.
XTERM_TEXT_EXTRACTION_JS = """
() => {
    const selectors = [
        '.xterm-rows',
        '.xterm .xterm-rows',
        '[class*="xterm-rows"]',
        '.xterm-screen',
    ];

    let container = null;
    for (const selector of selectors) {
        container = document.querySelector(selector);
        if (container) break;
    }

    if (!container) {
        const xtermElement = document.querySelector('.xterm');
        if (xtermElement) {
            return {
                success: true,
                lines: xtermElement.innerText.split('\\n').filter(line => line.trim()),
                method: 'innerText'
            };
        }
        return { success: false, error: 'Could not find xterm.js terminal container' };
    }

    const rows = container.querySelectorAll('div');
    const lines = [];

    rows.forEach(row => {
        let text = '';
        const spans = row.querySelectorAll('span');
        if (spans.length > 0) {
            spans.forEach(span => {
                text += span.textContent || '';
            });
        } else {
            text = row.textContent || '';
        }
        if (text.trim()) {
            lines.push(text);
        }
    });

    return {
        success: true,
        lines: lines,
        method: 'row_extraction'
    };
}
"""
95
+
96
+
97
def _build_screenshot_path(prefix: str = "terminal_screenshot") -> Path:
    """Return a timestamped ``.png`` path inside the screenshot temp directory.

    Args:
        prefix: Leading part of the file name.

    Returns:
        ``<temp root>/<prefix>_<YYYYmmdd_HHMMSS_microseconds>.png``. The path
        is only computed; nothing is created on disk.
    """
    # Microsecond resolution keeps names from colliding between captures.
    stamp = f"{datetime.now():%Y%m%d_%H%M%S_%f}"
    file_name = prefix + "_" + stamp + ".png"
    return _TEMP_SCREENSHOT_ROOT.joinpath(file_name)
101
+
102
+
103
def _resize_image(image_bytes: bytes, max_height: int = DEFAULT_MAX_HEIGHT) -> bytes:
    """Downscale an image so that it is at most ``max_height`` pixels tall.

    The aspect ratio is preserved. Smaller images dramatically reduce token
    usage when the result is handed to a multimodal model.

    Args:
        image_bytes: Encoded image bytes in any format Pillow can open.
        max_height: Maximum height in pixels (default ``DEFAULT_MAX_HEIGHT``,
            i.e. 768).

    Returns:
        PNG-encoded bytes of the downscaled image when a resize happened.
        ``image_bytes`` is returned unchanged - still in its original format -
        when the image already fits, or when anything goes wrong (best effort;
        the failure is logged as a warning and never raised).
    """
    try:
        # The context manager releases Pillow's handle on every exit path.
        with Image.open(io.BytesIO(image_bytes)) as img:
            # Only resize if image is taller than max_height
            if img.height <= max_height:
                return image_bytes

            # Scale the width by the same factor as the height. Clamp to 1px:
            # for a very tall, narrow image int() truncates to 0, the resize
            # then fails and the oversized original would be returned instead.
            ratio = max_height / img.height
            new_width = max(1, int(img.width * ratio))
            new_height = max_height

            # Resize with high quality resampling
            resized = img.resize((new_width, new_height), Image.Resampling.LANCZOS)

            # Encode the result as PNG in memory
            output = io.BytesIO()
            resized.save(output, format="PNG", optimize=True)

            logger.debug(
                "Resized image from %sx%s to %sx%s",
                img.width,
                img.height,
                new_width,
                new_height,
            )
            return output.getvalue()

    except Exception as e:
        # Deliberate best effort: an unreadable image is passed through as-is.
        logger.warning("Failed to resize image: %s, using original", e)
        return image_bytes
143
+
144
+
145
async def _capture_terminal_screenshot(
    full_page: bool = False,
    save_to_disk: bool = True,
    group_id: str | None = None,
    max_height: int = DEFAULT_MAX_HEIGHT,
) -> Dict[str, Any]:
    """Grab a PNG of the current terminal page, downscaled for model input.

    Args:
        full_page: Capture the whole page when True, only the viewport otherwise.
        save_to_disk: Also write the (resized) PNG into the screenshot temp
            directory.
        group_id: Message group for the "saved" notification; when it is falsy
            no notification is emitted.
        max_height: Height limit in pixels handed on to ``_resize_image``.

    Returns:
        On success a dict with ``success=True``, ``screenshot_bytes``,
        ``base64_data`` and - when saved - ``screenshot_path``. On failure a
        dict with ``success=False`` and ``error``. Any ``Exception`` is logged
        and reported through that dict instead of propagating.
    """
    try:
        page = await get_session_manager().get_current_page()
        if not page:
            return {
                "success": False,
                "error": "No active terminal page. Open terminal first.",
            }

        raw_png = await page.screenshot(full_page=full_page, type="png")
        # Shrink before encoding so the base64 payload stays small.
        png = _resize_image(raw_png, max_height=max_height)

        outcome: Dict[str, Any] = {
            "success": True,
            "screenshot_bytes": png,
            "base64_data": base64.b64encode(png).decode("utf-8"),
        }
        if not save_to_disk:
            return outcome

        # Persist the resized image rather than the raw capture.
        destination = _build_screenshot_path()
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_bytes(png)
        outcome["screenshot_path"] = str(destination)

        if group_id:
            emit_success(
                f"Terminal screenshot saved: {destination}",
                message_group=group_id,
            )
        return outcome

    except Exception as exc:
        logger.exception("Error capturing terminal screenshot")
        return {"success": False, "error": str(exc)}
203
+
204
+
205
async def terminal_screenshot(
    full_page: bool = False,
    save_to_disk: bool = True,
) -> Dict[str, Any]:
    """Capture the terminal browser and return the image as base64 PNG data.

    Announces the capture, delegates to ``_capture_terminal_screenshot`` and
    repackages the outcome into a compact dict that a multimodal model can
    consume directly.

    Args:
        full_page: Capture the full page instead of only the viewport.
            Defaults to False (viewport only - what is visible on screen).
        save_to_disk: Also save the screenshot to disk. Defaults to True.

    Returns:
        On success a dictionary with:
            - success (bool): True.
            - base64_image (str): Base64-encoded PNG image data.
            - media_type (str): Always "image/png".
            - screenshot_path (str | None): Saved file path, or None when
              ``save_to_disk`` is False.
            - message (str): Short human-readable note.
        On failure the capture dict is returned as-is: ``success`` (False)
        and ``error`` (str).

    Example:
        >>> result = await terminal_screenshot()
        >>> if result["success"]:
        ...     print(f"Screenshot saved to: {result['screenshot_path']}")
    """
    target = "viewport"
    if full_page:
        target = "full_page"

    group_id = generate_group_id("terminal_screenshot", target)
    banner = format_terminal_banner("TERMINAL SCREENSHOT 📷")
    headline = Text.from_markup(f"{banner} [bold cyan]{target}[/bold cyan]")
    emit_info(headline, message_group=group_id)

    capture = await _capture_terminal_screenshot(
        full_page=full_page,
        save_to_disk=save_to_disk,
        group_id=group_id,
    )

    if capture["success"]:
        # Hand back only what the model needs; the raw bytes are dropped.
        return {
            "success": True,
            "base64_image": capture["base64_data"],
            "media_type": "image/png",
            "screenshot_path": capture.get("screenshot_path"),
            "message": "Screenshot captured. The base64_image contains the terminal view.",
        }

    emit_error(capture.get("error", "Screenshot failed"), message_group=group_id)
    return capture
260
+
261
+
262
async def terminal_read_output(lines: int = 50) -> Dict[str, Any]:
    """Read text output from the terminal by scraping the xterm.js DOM.

    Runs ``XTERM_TEXT_EXTRACTION_JS`` in the current terminal page and returns
    the tail of the extracted lines. This is useful when you need the actual
    text rather than an image. The extraction script drops whitespace-only
    rows, so blank lines do not count towards ``lines``.

    Args:
        lines: Number of lines to return from the end. Defaults to 50.
            Zero or a negative value yields an empty result.

    Returns:
        A dictionary containing:
            - success (bool): True if text was extracted.
            - output (str): The selected lines joined with newlines.
            - line_count (int): Number of lines returned.
            - error (str): Error message if unsuccessful.
    """
    group_id = generate_group_id("terminal_read_output", f"lines_{lines}")
    banner = format_terminal_banner("TERMINAL READ OUTPUT 📖")
    emit_info(
        Text.from_markup(f"{banner} [dim]last {lines} lines[/dim]"),
        message_group=group_id,
    )

    try:
        manager = get_session_manager()
        page = await manager.get_current_page()

        if not page:
            error_msg = "No active terminal page. Open terminal first."
            emit_error(error_msg, message_group=group_id)
            return {"success": False, "error": error_msg}

        # Execute JavaScript to extract text
        result = await page.evaluate(XTERM_TEXT_EXTRACTION_JS)

        if not result.get("success"):
            error_msg = result.get("error", "Failed to extract terminal text")
            emit_error(error_msg, message_group=group_id)
            return {"success": False, "error": error_msg}

        extracted_lines = result.get("lines", [])

        # Keep only the last `lines` entries. A bare `[-lines:]` is wrong for
        # lines <= 0: `[-0:]` is the whole list, and a negative count would
        # drop lines from the front instead of taking a tail.
        if lines <= 0:
            extracted_lines = []
        elif len(extracted_lines) > lines:
            extracted_lines = extracted_lines[-lines:]

        output_text = "\n".join(extracted_lines)

        emit_success(
            f"Extracted {len(extracted_lines)} lines from terminal",
            message_group=group_id,
        )

        return {
            "success": True,
            "output": output_text,
            "line_count": len(extracted_lines),
        }

    except Exception as e:
        error_msg = f"Failed to read terminal output: {str(e)}"
        emit_error(error_msg, message_group=group_id)
        logger.exception("Error reading terminal output")
        return {"success": False, "error": error_msg}
326
+
327
+
328
async def load_image(
    image_path: str,
    max_height: int = DEFAULT_MAX_HEIGHT,
) -> Dict[str, Any]:
    """Load an image from the filesystem as base64 data.

    Loads any image file, resizes it to reduce token usage, and returns
    it as base64-encoded data that multimodal models can directly see.

    Args:
        image_path: Path to the image file.
        max_height: Maximum height for resizing (default 768px).

    Returns:
        A dictionary containing:
            - success (bool): True if image was loaded.
            - base64_image (str): Base64-encoded image data (resized when the
              image was taller than ``max_height``).
            - media_type (str): MIME type of the returned bytes. "image/png"
              after a resize; otherwise the detected type of the original
              file, falling back to "image/png" when it cannot be detected.
            - image_path (str): The original path.
            - message (str): Short human-readable note (success only).
            - error (str): Error message if unsuccessful.
    """
    group_id = generate_group_id("load_image", image_path)
    emit_info(f"LOAD IMAGE 🖼️ {image_path}", message_group=group_id)

    try:
        image_file = Path(image_path)

        if not image_file.exists():
            error_msg = f"Image file not found: {image_path}"
            emit_error(error_msg, message_group=group_id)
            return {"success": False, "error": error_msg, "image_path": image_path}

        if not image_file.is_file():
            error_msg = f"Path is not a file: {image_path}"
            emit_error(error_msg, message_group=group_id)
            return {"success": False, "error": error_msg, "image_path": image_path}

        # Read image bytes
        original_bytes = image_file.read_bytes()

        # Resize to reduce token usage
        image_bytes = _resize_image(original_bytes, max_height=max_height)

        # _resize_image re-encodes to PNG only when it actually shrinks the
        # image; otherwise the original bytes (JPEG, GIF, ...) come back
        # untouched. Report the type of what is really being returned so the
        # payload is not mislabelled.
        media_type = "image/png"
        try:
            with Image.open(io.BytesIO(image_bytes)) as probe:
                media_type = Image.MIME.get(probe.format or "", media_type)
        except Exception:
            # Best effort: keep the historical "image/png" label.
            logger.debug("Could not detect image format for %s", image_path)

        base64_data = base64.b64encode(image_bytes).decode("utf-8")

        emit_success(f"Loaded image: {image_path}", message_group=group_id)

        return {
            "success": True,
            "base64_image": base64_data,
            "media_type": media_type,
            "image_path": image_path,
            "message": f"Image loaded (resized to max {max_height}px height for token efficiency).",
        }

    except Exception as e:
        error_msg = f"Failed to load image: {str(e)}"
        emit_error(error_msg, message_group=group_id)
        logger.exception("Error loading image")
        return {"success": False, "error": error_msg, "image_path": image_path}
389
+
390
+
391
+ # =============================================================================
392
+ # Tool Registration Functions
393
+ # =============================================================================
394
+
395
+
396
def register_terminal_screenshot(agent):
    """Register the terminal screenshot tool.

    Decorates a thin async wrapper around ``terminal_screenshot`` with
    ``agent.tool``. Nothing is returned.

    Args:
        agent: Object providing the ``tool`` decorator.
    """

    # NOTE(review): the wrapper's docstring below is presumably what the agent
    # framework shows to the model as the tool description - confirm before
    # rewording it.
    @agent.tool
    async def terminal_screenshot_analyze(
        context: RunContext,
        full_page: bool = False,
    ) -> Dict[str, Any]:
        """
        Take a screenshot of the terminal browser.

        Returns the screenshot as base64 image data that you can see directly.
        Use this to see what's displayed in the terminal.

        Args:
            full_page: Capture full page (True) or just viewport (False).

        Returns:
            Dict with base64_image (PNG data you can see), screenshot_path, etc.
        """
        # Session is set by invoke_agent via contextvar
        # save_to_disk is left at its default (True), so each call also writes
        # the screenshot to the temp directory.
        return await terminal_screenshot(full_page=full_page)
418
+
419
+
420
def register_terminal_read_output(agent):
    """Register the terminal text reading tool.

    Decorates a thin async wrapper around the module-level
    ``terminal_read_output`` with ``agent.tool``. Nothing is returned.

    Args:
        agent: Object providing the ``tool`` decorator.
    """

    # NOTE(review): the wrapper's docstring below is presumably what the agent
    # framework shows to the model as the tool description - confirm before
    # rewording it.
    @agent.tool
    async def terminal_read_output(
        context: RunContext,
        lines: int = 50,
    ) -> Dict[str, Any]:
        """
        Read text from the terminal (scrapes xterm.js DOM).

        Use this when you need the actual text content, not just an image.

        Args:
            lines: Number of lines to read from end (default: 50).

        Returns:
            Dict with output (text content), line_count, success.
        """
        # Session is set by invoke_agent via contextvar
        # This wrapper has the same name as the module-level coroutine, so the
        # bare name would resolve to the wrapper itself here. Going through the
        # module object reaches the real implementation.
        # NOTE(review): assumes this file is the `terminal_screenshot_tools`
        # module of the current package - confirm.
        from . import terminal_screenshot_tools

        return await terminal_screenshot_tools.terminal_read_output(lines=lines)
443
+
444
+
445
def register_load_image(agent):
    """Register the image loading tool.

    Decorates a thin async wrapper around ``load_image`` with ``agent.tool``.
    Nothing is returned.

    Args:
        agent: Object providing the ``tool`` decorator.
    """

    # NOTE(review): the wrapper's docstring below is presumably what the agent
    # framework shows to the model as the tool description - confirm before
    # rewording it.
    @agent.tool
    async def load_image_for_analysis(
        context: RunContext,
        image_path: str,
    ) -> Dict[str, Any]:
        """
        Load an image file so you can see and analyze it.

        Returns the image as base64 data that you can see directly.

        Args:
            image_path: Path to the image file.

        Returns:
            Dict with base64_image (you can see this), media_type, etc.
        """
        # Session is set by invoke_agent via contextvar
        # max_height is not exposed to the tool; load_image's default applies.
        return await load_image(image_path=image_path)
466
+
467
+
468
def register_terminal_compare_mockup(agent):
    """Register the mockup comparison tool.

    Decorates an async tool with ``agent.tool`` that loads a mockup image,
    takes a viewport screenshot of the terminal and returns both as base64 so
    they can be compared visually. Nothing is returned by this function.

    Args:
        agent: Object providing the ``tool`` decorator.
    """
    # Function-scope import: only this tool needs the markup escaper.
    from rich.markup import escape

    # NOTE(review): the tool's docstring below is presumably what the agent
    # framework shows to the model as the tool description, so it is kept
    # verbatim - confirm before rewording it.
    @agent.tool
    async def terminal_compare_mockup(
        context: RunContext,
        mockup_path: str,
    ) -> Dict[str, Any]:
        """
        Compare the terminal to a mockup image.

        Takes a screenshot of the terminal and loads the mockup image.
        Returns both as base64 so you can visually compare them.

        Args:
            mockup_path: Path to the mockup/expected image.

        Returns:
            Dict with terminal_image, mockup_image (both base64), paths, etc.
        """
        # Session is set by invoke_agent via contextvar
        group_id = generate_group_id("terminal_compare_mockup", mockup_path)
        banner = format_terminal_banner("TERMINAL COMPARE MOCKUP 🖼️")
        # The path is caller-supplied text, not markup: escape it so square
        # brackets in a file name are shown literally instead of being parsed
        # as (possibly invalid) Rich tags.
        emit_info(
            Text.from_markup(
                f"{banner} [bold cyan]{escape(mockup_path)}[/bold cyan]"
            ),
            message_group=group_id,
        )

        # Load the mockup first; its failure dict is returned unchanged.
        mockup_result = await load_image(mockup_path)
        if not mockup_result["success"]:
            return mockup_result

        # Take terminal screenshot (viewport only)
        terminal_result = await terminal_screenshot(full_page=False)
        if not terminal_result["success"]:
            return terminal_result

        emit_success(
            "Both images loaded. Compare them visually.",
            message_group=group_id,
        )

        return {
            "success": True,
            "terminal_image": terminal_result["base64_image"],
            "mockup_image": mockup_result["base64_image"],
            "media_type": "image/png",
            # Extra key: the type load_image reported for the mockup itself.
            "mockup_media_type": mockup_result.get("media_type", "image/png"),
            "terminal_path": terminal_result.get("screenshot_path"),
            "mockup_path": mockup_path,
            "message": "Both images loaded. terminal_image shows the current terminal, "
            "mockup_image shows the expected design. Compare them visually.",
        }