sentienceapi 0.90.16__py3-none-any.whl → 0.98.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release: this version of sentienceapi might be problematic.

Files changed (90)
  1. sentience/__init__.py +120 -6
  2. sentience/_extension_loader.py +156 -1
  3. sentience/action_executor.py +217 -0
  4. sentience/actions.py +758 -30
  5. sentience/agent.py +806 -293
  6. sentience/agent_config.py +3 -0
  7. sentience/agent_runtime.py +840 -0
  8. sentience/asserts/__init__.py +70 -0
  9. sentience/asserts/expect.py +621 -0
  10. sentience/asserts/query.py +383 -0
  11. sentience/async_api.py +89 -1141
  12. sentience/backends/__init__.py +137 -0
  13. sentience/backends/actions.py +372 -0
  14. sentience/backends/browser_use_adapter.py +241 -0
  15. sentience/backends/cdp_backend.py +393 -0
  16. sentience/backends/exceptions.py +211 -0
  17. sentience/backends/playwright_backend.py +194 -0
  18. sentience/backends/protocol.py +216 -0
  19. sentience/backends/sentience_context.py +469 -0
  20. sentience/backends/snapshot.py +483 -0
  21. sentience/base_agent.py +95 -0
  22. sentience/browser.py +678 -39
  23. sentience/browser_evaluator.py +299 -0
  24. sentience/canonicalization.py +207 -0
  25. sentience/cloud_tracing.py +507 -42
  26. sentience/constants.py +6 -0
  27. sentience/conversational_agent.py +77 -43
  28. sentience/cursor_policy.py +142 -0
  29. sentience/element_filter.py +136 -0
  30. sentience/expect.py +98 -2
  31. sentience/extension/background.js +56 -185
  32. sentience/extension/content.js +150 -287
  33. sentience/extension/injected_api.js +1088 -1368
  34. sentience/extension/manifest.json +1 -1
  35. sentience/extension/pkg/sentience_core.d.ts +22 -22
  36. sentience/extension/pkg/sentience_core.js +275 -433
  37. sentience/extension/pkg/sentience_core_bg.wasm +0 -0
  38. sentience/extension/release.json +47 -47
  39. sentience/failure_artifacts.py +241 -0
  40. sentience/formatting.py +9 -53
  41. sentience/inspector.py +183 -1
  42. sentience/integrations/__init__.py +6 -0
  43. sentience/integrations/langchain/__init__.py +12 -0
  44. sentience/integrations/langchain/context.py +18 -0
  45. sentience/integrations/langchain/core.py +326 -0
  46. sentience/integrations/langchain/tools.py +180 -0
  47. sentience/integrations/models.py +46 -0
  48. sentience/integrations/pydanticai/__init__.py +15 -0
  49. sentience/integrations/pydanticai/deps.py +20 -0
  50. sentience/integrations/pydanticai/toolset.py +468 -0
  51. sentience/llm_interaction_handler.py +191 -0
  52. sentience/llm_provider.py +765 -66
  53. sentience/llm_provider_utils.py +120 -0
  54. sentience/llm_response_builder.py +153 -0
  55. sentience/models.py +595 -3
  56. sentience/ordinal.py +280 -0
  57. sentience/overlay.py +109 -2
  58. sentience/protocols.py +228 -0
  59. sentience/query.py +67 -5
  60. sentience/read.py +95 -3
  61. sentience/recorder.py +223 -3
  62. sentience/schemas/trace_v1.json +128 -9
  63. sentience/screenshot.py +48 -2
  64. sentience/sentience_methods.py +86 -0
  65. sentience/snapshot.py +599 -55
  66. sentience/snapshot_diff.py +126 -0
  67. sentience/text_search.py +120 -5
  68. sentience/trace_event_builder.py +148 -0
  69. sentience/trace_file_manager.py +197 -0
  70. sentience/trace_indexing/index_schema.py +95 -7
  71. sentience/trace_indexing/indexer.py +105 -48
  72. sentience/tracer_factory.py +120 -9
  73. sentience/tracing.py +172 -8
  74. sentience/utils/__init__.py +40 -0
  75. sentience/utils/browser.py +46 -0
  76. sentience/{utils.py → utils/element.py} +3 -42
  77. sentience/utils/formatting.py +59 -0
  78. sentience/verification.py +618 -0
  79. sentience/visual_agent.py +2058 -0
  80. sentience/wait.py +68 -2
  81. {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/METADATA +199 -40
  82. sentienceapi-0.98.0.dist-info/RECORD +92 -0
  83. sentience/extension/test-content.js +0 -4
  84. sentienceapi-0.90.16.dist-info/RECORD +0 -50
  85. {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/WHEEL +0 -0
  86. {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/entry_points.txt +0 -0
  87. {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE +0 -0
  88. {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE-APACHE +0 -0
  89. {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE-MIT +0 -0
  90. {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/top_level.txt +0 -0
sentience/visual_agent.py
@@ -0,0 +1,2058 @@
1
+ """
2
+ Visual Agent - Uses labeled screenshots with vision-capable LLMs
3
+
4
+ This agent extends SentienceAgentAsync to use visual prompts:
5
+ 1. Takes snapshot with screenshot enabled
6
+ 2. Draws bounding boxes and labels element IDs on the screenshot
7
+ 3. Uses anti-collision algorithm to position labels (4 sides + 4 corners)
8
+ 4. Sends labeled screenshot to vision-capable LLM
9
+ 5. Extracts element ID from LLM response
10
+ 6. Clicks the element using click_async
11
+
12
+ Dependencies:
13
+ - Pillow (PIL): Required for image processing and drawing bounding boxes
14
+ Install with: pip install Pillow
15
+ """
16
+
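A minimal usage sketch (not part of the package diff): construction of the browser and the vision-capable LLM provider is assumed here and depends on the rest of the SDK; only the SentienceVisualAgentAsync constructor and act() signature come from this file.

    import asyncio

    async def run(browser, llm):
        # browser: AsyncSentienceBrowser, llm: vision-capable LLMProvider (e.g. GPT-4o)
        agent = SentienceVisualAgentAsync(browser, llm, default_snapshot_limit=50, verbose=True)
        result = await agent.act("Click the search button")
        print(result.success, result.element_id, result.outcome)

    # asyncio.run(run(browser, llm))  # supply instances from your own setup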
17
+ import base64
18
+ import hashlib
19
+ import io
20
+ import re
21
+ import time
22
+ import uuid
23
+ from pathlib import Path
24
+ from typing import TYPE_CHECKING, Any, Optional
25
+
26
+ from .actions import click, click_async
27
+ from .agent import SentienceAgent, SentienceAgentAsync, _safe_tracer_call
28
+ from .async_api import AsyncSentienceBrowser
29
+ from .browser import SentienceBrowser
30
+ from .llm_provider import LLMProvider, LLMResponse
31
+ from .models import AgentActionResult, Element, Snapshot, SnapshotOptions
32
+ from .snapshot import snapshot
33
+ from .snapshot_diff import SnapshotDiff
34
+ from .trace_event_builder import TraceEventBuilder
35
+
36
+ # Only import PIL types for type checking, not at runtime
37
+ if TYPE_CHECKING:
38
+ from PIL import Image, ImageDraw, ImageFont
39
+ else:
40
+ # Create a dummy type for runtime when PIL is not available
41
+ Image = None
42
+ ImageDraw = None
43
+ ImageFont = None
44
+
45
+ try:
46
+ from PIL import Image as PILImage
47
+ from PIL import ImageDraw as PILImageDraw
48
+ from PIL import ImageFont as PILImageFont
49
+
50
+ PIL_AVAILABLE = True
51
+ except ImportError:
52
+ PIL_AVAILABLE = False
53
+ # Define dummy values so type hints don't fail
54
+ PILImage = None # type: ignore
55
+ PILImageDraw = None # type: ignore
56
+ PILImageFont = None # type: ignore
57
+ # Don't print warning here - it will be printed when the class is instantiated
58
+
59
+
60
+ class SentienceVisualAgentAsync(SentienceAgentAsync):
61
+ """
62
+ Async visual agent that uses labeled screenshots with vision-capable LLMs.
63
+
64
+ Extends SentienceAgentAsync to override act() method with visual prompting.
65
+
66
+ Requirements:
67
+ - Pillow (PIL): Required for image processing and drawing bounding boxes
68
+ Install with: pip install Pillow
69
+ - Vision-capable LLM: Requires an LLM provider that supports vision (e.g., GPT-4o, Claude 3)
70
+ """
71
+
72
+ def __init__(
73
+ self,
74
+ browser: AsyncSentienceBrowser,
75
+ llm: LLMProvider,
76
+ default_snapshot_limit: int = 50,
77
+ verbose: bool = True,
78
+ tracer: Any | None = None,
79
+ config: Any | None = None,
80
+ ):
81
+ """
82
+ Initialize Visual Agent
83
+
84
+ Args:
85
+ browser: AsyncSentienceBrowser instance
86
+ llm: LLM provider (must support vision, e.g., GPT-4o, Claude 3)
87
+ default_snapshot_limit: Default maximum elements to include
88
+ verbose: Print execution logs
89
+ tracer: Optional Tracer instance
90
+ config: Optional AgentConfig
91
+ """
92
+ super().__init__(browser, llm, default_snapshot_limit, verbose, tracer, config)
93
+
94
+ if not PIL_AVAILABLE:
95
+ raise ImportError(
96
+ "PIL/Pillow is required for SentienceVisualAgentAsync. Install with: pip install Pillow"
97
+ )
98
+
99
+ # Track previous snapshot for diff computation
100
+ self._previous_snapshot: Snapshot | None = None
101
+
102
+ def _decode_screenshot(self, screenshot_data_url: str) -> "PILImage.Image":
103
+ """
104
+ Decode base64 screenshot data URL to PIL Image
105
+
106
+ Args:
107
+ screenshot_data_url: Base64-encoded data URL (e.g., "data:image/png;base64,...")
108
+
109
+ Returns:
110
+ PIL Image object
111
+ """
112
+ # Extract base64 data from data URL
113
+ if screenshot_data_url.startswith("data:image/"):
114
+ # Format: "data:image/png;base64,<base64_data>"
115
+ base64_data = screenshot_data_url.split(",", 1)[1]
116
+ else:
117
+ # Assume it's already base64
118
+ base64_data = screenshot_data_url
119
+
120
+ # Decode base64 to bytes
121
+ image_bytes = base64.b64decode(base64_data)
122
+
123
+ # Create PIL Image from bytes
124
+ return PILImage.open(io.BytesIO(image_bytes))
125
+
126
+ def _find_label_position(
127
+ self,
128
+ bbox: dict[str, float],
129
+ existing_labels: list[dict[str, Any]],
130
+ image_width: int,
131
+ image_height: int,
132
+ label_width: int,
133
+ label_height: int,
134
+ ) -> tuple[float, float]:
135
+ """
136
+ Find best position for label using anti-collision algorithm.
137
+
138
+ Tries 8 positions: 4 sides (top, bottom, left, right) + 4 corners (top-left, top-right, bottom-left, bottom-right)
139
+
140
+ Args:
141
+ bbox: Element bounding box {x, y, width, height}
142
+ existing_labels: List of existing label positions {x, y, width, height}
143
+ image_width: Screenshot width
144
+ image_height: Screenshot height
145
+ label_width: Label text width
146
+ label_height: Label text height
147
+
148
+ Returns:
149
+ (x, y) position for label
150
+ """
151
+ x, y, width, height = bbox["x"], bbox["y"], bbox["width"], bbox["height"]
152
+ center_x = x + width / 2
153
+ center_y = y + height / 2
154
+
155
+ # Anti-collision algorithm
156
+ # Define 8 candidate positions (4 sides + 4 corners)
157
+ # Increased distance from element to avoid confusion (15px instead of 5px)
158
+ label_offset = 15 # Increased from 5 to make labels more clearly separate
159
+ candidates = [
160
+ # 4 sides
161
+ (center_x - label_width / 2, y - label_height - label_offset, "top"), # Above element
162
+ (center_x - label_width / 2, y + height + label_offset, "bottom"), # Below element
163
+ (
164
+ x - label_width - label_offset,
165
+ center_y - label_height / 2,
166
+ "left",
167
+ ), # Left of element
168
+ (x + width + label_offset, center_y - label_height / 2, "right"), # Right of element
169
+ # 4 corners
170
+ (
171
+ x - label_width - label_offset,
172
+ y - label_height - label_offset,
173
+ "top-left",
174
+ ), # Top-left corner
175
+ (
176
+ x + width + label_offset,
177
+ y - label_height - label_offset,
178
+ "top-right",
179
+ ), # Top-right corner
180
+ (
181
+ x - label_width - label_offset,
182
+ y + height + label_offset,
183
+ "bottom-left",
184
+ ), # Bottom-left corner
185
+ (
186
+ x + width + label_offset,
187
+ y + height + label_offset,
188
+ "bottom-right",
189
+ ), # Bottom-right corner
190
+ ]
191
+
192
+ # Check each candidate position for collisions
193
+ for candidate_x, candidate_y, _ in candidates:
194
+ # Check bounds
195
+ if candidate_x < 0 or candidate_y < 0:
196
+ continue
197
+ if candidate_x + label_width > image_width or candidate_y + label_height > image_height:
198
+ continue
199
+
200
+ # Check collision with existing labels
201
+ collision = False
202
+ for existing in existing_labels:
203
+ ex, ey, ew, eh = existing["x"], existing["y"], existing["width"], existing["height"]
204
+ # Check if rectangles overlap
205
+ if not (
206
+ candidate_x + label_width < ex
207
+ or candidate_x > ex + ew
208
+ or candidate_y + label_height < ey
209
+ or candidate_y > ey + eh
210
+ ):
211
+ collision = True
212
+ break
213
+
214
+ if not collision:
215
+ return (candidate_x, candidate_y)
216
+
217
+ # If all positions collide, use top position (may overlap but better than nothing)
218
+ return (center_x - label_width / 2, y - label_height - 15)
219
+
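Illustrative call of the helper above (assumes an agent instance; values chosen so the element sits well inside the image). With no existing labels, the first candidate position ("top") is returned:

    x, y = agent._find_label_position(
        {"x": 100, "y": 100, "width": 50, "height": 20},
        existing_labels=[],
        image_width=1280,
        image_height=720,
        label_width=24,
        label_height=18,
    )
    # x == 113.0 (label centered horizontally), y == 67.0 (label_height + 15px offset above)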
220
+ def _draw_labeled_screenshot(
221
+ self,
222
+ snapshot: Snapshot,
223
+ elements: list[Element],
224
+ ) -> "PILImage.Image":
225
+ """
226
+ Draw bounding boxes and labels on screenshot.
227
+
228
+ Args:
229
+ snapshot: Snapshot with screenshot data
230
+ elements: List of elements to draw
231
+
232
+ Returns:
233
+ PIL Image with bounding boxes and labels
234
+ """
235
+ if not snapshot.screenshot:
236
+ raise ValueError("Screenshot not available in snapshot")
237
+
238
+ # Decode screenshot
239
+ img = self._decode_screenshot(snapshot.screenshot)
240
+ draw = PILImageDraw.Draw(img)
241
+
242
+ # Try to load a font, fallback to default if not available
243
+ try:
244
+ # Try to use a system font
245
+ font = PILImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 16)
246
+ except OSError:
247
+ try:
248
+ font = PILImageFont.truetype("arial.ttf", 16)
249
+ except OSError:
250
+ # Use default font if system fonts not available
251
+ font = PILImageFont.load_default()
252
+
253
+ image_width, image_height = img.size
254
+ existing_labels: list[dict[str, Any]] = []
255
+
256
+ # Neon green color: #39FF14 (bright, vibrant green)
257
+ neon_green = "#39FF14"
258
+
259
+ # Draw bounding boxes and labels for each element
260
+ for element in elements:
261
+ bbox = element.bbox
262
+ x, y, width, height = bbox.x, bbox.y, bbox.width, bbox.height
263
+
264
+ # Draw bounding box rectangle (neon green with 2px width)
265
+ draw.rectangle(
266
+ [(x, y), (x + width, y + height)],
267
+ outline=neon_green,
268
+ width=2,
269
+ )
270
+
271
+ # Prepare label text (just the number - keep it simple and compact)
272
+ label_text = str(element.id)
273
+
274
+ # Measure label text size
275
+ bbox_text = draw.textbbox((0, 0), label_text, font=font)
276
+ label_width = bbox_text[2] - bbox_text[0]
277
+ label_height = bbox_text[3] - bbox_text[1]
278
+
279
+ # Find best position for label (anti-collision)
280
+ label_x, label_y = self._find_label_position(
281
+ {"x": x, "y": y, "width": width, "height": height},
282
+ existing_labels,
283
+ image_width,
284
+ image_height,
285
+ label_width + 8, # Add padding
286
+ label_height + 4, # Add padding
287
+ )
288
+
289
+ # Calculate connection points for a clearer visual link
290
+ # Connect from the nearest corner/edge of element to the label
291
+ element_center_x = x + width / 2
292
+ element_center_y = y + height / 2
293
+ label_center_x = label_x + label_width / 2
294
+ label_center_y = label_y + label_height / 2
295
+
296
+ # Determine which edge of the element is closest to the label
297
+ # and draw line from that edge point to the label
298
+ dist_top = abs(label_center_y - y)
299
+ dist_bottom = abs(label_center_y - (y + height))
300
+ dist_left = abs(label_center_x - x)
301
+ dist_right = abs(label_center_x - (x + width))
302
+
303
+ min_dist = min(dist_top, dist_bottom, dist_left, dist_right)
304
+
305
+ if min_dist == dist_top:
306
+ # Label is above - connect from top edge
307
+ line_start = (element_center_x, y)
308
+ elif min_dist == dist_bottom:
309
+ # Label is below - connect from bottom edge
310
+ line_start = (element_center_x, y + height)
311
+ elif min_dist == dist_left:
312
+ # Label is left - connect from left edge
313
+ line_start = (x, element_center_y)
314
+ else:
315
+ # Label is right - connect from right edge
316
+ line_start = (x + width, element_center_y)
317
+
318
+ # Draw connecting line from element edge to label (makes it clear the label belongs to the element)
319
+ draw.line(
320
+ [line_start, (label_center_x, label_center_y)],
321
+ fill=neon_green,
322
+ width=2, # Slightly thicker for better visibility
323
+ )
324
+
325
+ # Draw label background (white with neon green border)
326
+ label_bg_x1 = label_x - 4
327
+ label_bg_y1 = label_y - 2
328
+ label_bg_x2 = label_x + label_width + 4
329
+ label_bg_y2 = label_y + label_height + 2
330
+
331
+ # Draw white background with neon green border (makes label stand out as separate)
332
+ draw.rectangle(
333
+ [(label_bg_x1, label_bg_y1), (label_bg_x2, label_bg_y2)],
334
+ fill="white",
335
+ outline=neon_green,
336
+ width=2, # Thicker border to make it more distinct
337
+ )
338
+
339
+ # Draw label text (black for high contrast)
340
+ draw.text(
341
+ (label_x, label_y),
342
+ label_text,
343
+ fill="black",
344
+ font=font,
345
+ )
346
+
347
+ # Record label position for collision detection
348
+ existing_labels.append(
349
+ {
350
+ "x": label_bg_x1,
351
+ "y": label_bg_y1,
352
+ "width": label_bg_x2 - label_bg_x1,
353
+ "height": label_bg_y2 - label_bg_y1,
354
+ }
355
+ )
356
+
357
+ return img
358
+
359
+ def _encode_image_to_base64(
360
+ self, image: "PILImage.Image", format: str = "PNG", max_size_mb: float = 20.0
361
+ ) -> str:
362
+ """
363
+ Encode PIL Image to base64 data URL with size optimization.
364
+
365
+ Vision LLM APIs typically have size limits (e.g., 20MB for OpenAI).
366
+ This function automatically compresses images if they're too large.
367
+
368
+ Args:
369
+ image: PIL Image object
370
+ format: Image format (PNG or JPEG)
371
+ max_size_mb: Maximum size in MB before compression (default: 20MB)
372
+
373
+ Returns:
374
+ Base64-encoded data URL
375
+ """
376
+ # Convert format for PIL
377
+ pil_format = format.upper()
378
+
379
+ # Try JPEG first for better compression (unless PNG is specifically requested)
380
+ if format.upper() != "PNG":
381
+ pil_format = "JPEG"
382
+ # Convert RGBA to RGB for JPEG
383
+ if image.mode in ("RGBA", "LA", "P"):
384
+ # Create white background
385
+ rgb_image = PILImage.new("RGB", image.size, (255, 255, 255))
386
+ if image.mode == "P":
387
+ image = image.convert("RGBA")
388
+ rgb_image.paste(image, mask=image.split()[-1] if image.mode == "RGBA" else None)
389
+ image = rgb_image
390
+
391
+ buffer = io.BytesIO()
392
+ quality = 95 # Start with high quality
393
+
394
+ # Try to fit within size limit
395
+ for attempt in range(3):
396
+ buffer.seek(0)
397
+ buffer.truncate(0)
398
+
399
+ if pil_format == "JPEG":
400
+ image.save(buffer, format=pil_format, quality=quality, optimize=True)
401
+ else:
402
+ image.save(buffer, format=pil_format, optimize=True)
403
+
404
+ size_mb = len(buffer.getvalue()) / (1024 * 1024)
405
+
406
+ if size_mb <= max_size_mb:
407
+ break
408
+
409
+ # Reduce quality for next attempt
410
+ quality = max(70, quality - 15)
411
+ if self.verbose and attempt == 0:
412
+ print(f" ⚠️ Image size {size_mb:.2f}MB exceeds limit, compressing...")
413
+
414
+ image_bytes = buffer.getvalue()
415
+ base64_data = base64.b64encode(image_bytes).decode("utf-8")
416
+
417
+ final_size_mb = len(image_bytes) / (1024 * 1024)
418
+ if self.verbose:
419
+ print(f" 📸 Image encoded: {final_size_mb:.2f}MB ({len(base64_data)} chars base64)")
420
+
421
+ mime_type = "image/png" if pil_format == "PNG" else "image/jpeg"
422
+ return f"data:{mime_type};base64,{base64_data}"
423
+
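A quick check of the encoder above (assumes an agent instance and a Pillow image named img): requesting JPEG yields a data URL with the matching MIME type, compressed to stay under the stated size limit.

    data_url = agent._encode_image_to_base64(img, format="JPEG", max_size_mb=20.0)
    assert data_url.startswith("data:image/jpeg;base64,")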
424
+ async def _query_llm_with_vision(
425
+ self,
426
+ image_data_url: str,
427
+ goal: str,
428
+ ) -> LLMResponse:
429
+ """
430
+ Query LLM with vision (labeled screenshot).
431
+
432
+ Args:
433
+ image_data_url: Base64-encoded image data URL
434
+ goal: User's goal/task
435
+
436
+ Returns:
437
+ LLMResponse with element ID
438
+ """
439
+ system_prompt = """You are a web automation assistant. You will see a screenshot of a web page with labeled element IDs.
440
+ Each clickable element has:
441
+ - A bright neon green (#39FF14) bounding box around the element
442
+ - A white label box with a number (the element ID) connected by a green line
443
+ - The label is clearly separate from the element (not part of the UI)
444
+
445
+ CRITICAL INSTRUCTIONS:
446
+ 1. Look at the screenshot carefully
447
+ 2. Find the element that matches the user's goal (ignore the white label boxes - they are annotations, not UI elements)
448
+ 3. Follow the green line from that element to find its label box with the ID number
449
+ 4. Respond with ONLY that integer ID number (e.g., "42" or "1567")
450
+ 5. Do NOT include any explanation, reasoning, or other text
451
+ 6. Do NOT say "element 1" or "the first element" - just return the number
452
+ 7. Do NOT confuse the white label box with an interactive element - labels are annotations connected by green lines
453
+
454
+ Example responses:
455
+ - Correct: "42"
456
+ - Correct: "1567"
457
+ - Wrong: "I see element 42"
458
+ - Wrong: "The element ID is 42"
459
+ - Wrong: "42 (the search box)" """
460
+
461
+ user_prompt = f"""Goal: {goal}
462
+
463
+ Look at the screenshot. Each element has a neon green bounding box with a white label showing its ID number.
464
+ Find the element that should be clicked to accomplish this goal.
465
+ Return ONLY the integer ID number from the label, nothing else."""
466
+
467
+ # Check if LLM provider supports vision (OpenAI GPT-4o, Claude, etc.)
468
+ # Vision-capable providers use similar message format with image_url
469
+ if hasattr(self.llm, "client") and hasattr(self.llm.client, "chat"):
470
+ # Vision-capable provider - use vision API
471
+ try:
472
+ from openai import OpenAI
473
+
474
+ # Check if it's OpenAI
475
+ if isinstance(self.llm.client, OpenAI):
476
+ messages = [
477
+ {
478
+ "role": "system",
479
+ "content": system_prompt,
480
+ },
481
+ {
482
+ "role": "user",
483
+ "content": [
484
+ {"type": "text", "text": user_prompt},
485
+ {
486
+ "type": "image_url",
487
+ "image_url": {"url": image_data_url},
488
+ },
489
+ ],
490
+ },
491
+ ]
492
+
493
+ response = self.llm.client.chat.completions.create(
494
+ model=self.llm._model_name,
495
+ messages=messages,
496
+ temperature=0.0,
497
+ # Removed max_tokens to use API default (usually higher limit)
498
+ )
499
+
500
+ content = response.choices[0].message.content or ""
501
+ usage = response.usage
502
+
503
+ from .llm_response_builder import LLMResponseBuilder
504
+
505
+ return LLMResponseBuilder.from_openai_format(
506
+ content=content,
507
+ prompt_tokens=usage.prompt_tokens if usage else None,
508
+ completion_tokens=usage.completion_tokens if usage else None,
509
+ total_tokens=usage.total_tokens if usage else None,
510
+ model_name=response.model,
511
+ finish_reason=response.choices[0].finish_reason,
512
+ )
513
+
514
+ # Check if provider supports vision API (uses OpenAI-compatible format)
515
+ elif hasattr(self.llm, "client") and hasattr(self.llm.client, "chat"):
516
+ # Vision API uses similar format to OpenAI
517
+ if self.verbose:
518
+ print(f" 🔍 Using vision API with model: {self.llm._model_name}")
519
+ print(f" 📐 Image data URL length: {len(image_data_url)} chars")
520
+
521
+ messages = [
522
+ {
523
+ "role": "system",
524
+ "content": system_prompt,
525
+ },
526
+ {
527
+ "role": "user",
528
+ "content": [
529
+ {"type": "text", "text": user_prompt},
530
+ {
531
+ "type": "image_url",
532
+ "image_url": {"url": image_data_url},
533
+ },
534
+ ],
535
+ },
536
+ ]
537
+
538
+ try:
539
+ if self.verbose:
540
+ print(f" 📤 Sending request to vision API...")
541
+ print(f" 📋 Messages structure: {len(messages)} messages")
542
+ print(f" 🖼️ Image URL prefix: {image_data_url[:50]}...")
543
+
544
+ # Removed max_tokens to use API default (usually higher limit)
545
+ # This allows the model to generate complete responses without truncation
546
+ response = self.llm.client.chat.completions.create(
547
+ model=self.llm._model_name,
548
+ messages=messages,
549
+ temperature=0.0,
550
+ # No max_tokens - use API default
551
+ )
552
+
553
+ # Debug: Check response structure
554
+ if self.verbose:
555
+ print(f" 📥 Response received")
556
+ print(f" 📦 Response type: {type(response)}")
557
+ print(
558
+ f" 📦 Choices count: {len(response.choices) if hasattr(response, 'choices') else 0}"
559
+ )
560
+
561
+ if not hasattr(response, "choices") or len(response.choices) == 0:
562
+ raise ValueError("Vision API returned no choices in response")
563
+
564
+ choice = response.choices[0]
565
+ content = (
566
+ choice.message.content if hasattr(choice.message, "content") else None
567
+ )
568
+ finish_reason = (
569
+ choice.finish_reason if hasattr(choice, "finish_reason") else None
570
+ )
571
+
572
+ if self.verbose:
573
+ print(f" 📝 Content: {repr(content)}")
574
+ print(f" 🏁 Finish reason: {finish_reason}")
575
+ if finish_reason:
576
+ print(f" ⚠️ Finish reason indicates: {finish_reason}")
577
+ if finish_reason == "length":
578
+ print(
579
+ f" - Response was truncated (hit API default max_tokens limit)"
580
+ )
581
+ print(
582
+ f" - This might indicate the model needs more tokens or doesn't support vision properly"
583
+ )
584
+ # Even if truncated, there might be partial content
585
+ if content:
586
+ print(
587
+ f" - ⚠️ Partial content received: {repr(content)}"
588
+ )
589
+ elif finish_reason == "content_filter":
590
+ print(f" - Content was filtered by safety filters")
591
+ elif finish_reason == "stop":
592
+ print(f" - Normal completion")
593
+
594
+ # If finish_reason is "length", we might still have partial content
595
+ # Try to use it if available (even if truncated, it might contain the element ID)
596
+ if finish_reason == "length" and content and content.strip():
597
+ if self.verbose:
598
+ print(f" ⚠️ Using truncated response: {repr(content)}")
599
+ # Continue processing with partial content
600
+
601
+ if content is None or content == "":
602
+ error_msg = f"Vision API returned empty content (finish_reason: {finish_reason})"
603
+ if self.verbose:
604
+ print(f" ❌ {error_msg}")
605
+ print(f" 💡 Possible causes:")
606
+ print(
607
+ f" - Model {self.llm._model_name} may not support vision"
608
+ )
609
+ print(f" - Image format might not be supported")
610
+ print(f" - API default max_tokens might be too restrictive")
611
+ print(f" - API response structure might be different")
612
+ if finish_reason == "length":
613
+ print(
614
+ f" - ⚠️ Response was truncated - content might have been cut off"
615
+ )
616
+ print(
617
+ f" - Try increasing max_tokens or check response.choices[0].message for partial content"
618
+ )
619
+ raise ValueError(error_msg)
620
+
621
+ usage = response.usage if hasattr(response, "usage") else None
622
+
623
+ if self.verbose:
624
+ print(f" ✅ Vision API response received")
625
+ print(
626
+ f" 📊 Tokens: {usage.total_tokens if usage else 'N/A'} (prompt: {usage.prompt_tokens if usage else 'N/A'}, completion: {usage.completion_tokens if usage else 'N/A'})"
627
+ )
628
+
629
+ from .llm_response_builder import LLMResponseBuilder
630
+
631
+ return LLMResponseBuilder.from_openai_format(
632
+ content=content,
633
+ prompt_tokens=usage.prompt_tokens if usage else None,
634
+ completion_tokens=usage.completion_tokens if usage else None,
635
+ total_tokens=usage.total_tokens if usage else None,
636
+ model_name=(
637
+ response.model
638
+ if hasattr(response, "model")
639
+ else self.llm._model_name
640
+ ),
641
+ finish_reason=finish_reason,
642
+ )
643
+ except Exception as vision_error:
644
+ if self.verbose:
645
+ print(f" ❌ Vision API error: {vision_error}")
646
+ print(f" 💡 This might indicate:")
647
+ print(f" - Model {self.llm._model_name} doesn't support vision")
648
+ print(f" - Image format/size issue")
649
+ print(f" - API key or permissions issue")
650
+ print(f" 🔄 Attempting fallback to regular generate method...")
651
+
652
+ # Fallback: Try using the regular generate method
653
+ # Some models might need images passed differently
654
+ try:
655
+ # Try embedding image in the prompt as base64
656
+ fallback_prompt = f"{user_prompt}\n\n[Image: {image_data_url[:200]}...]"
657
+ fallback_response = self.llm.generate(
658
+ system_prompt,
659
+ fallback_prompt,
660
+ temperature=0.0,
661
+ # No max_tokens - use API default
662
+ )
663
+ if self.verbose:
664
+ print(f" ⚠️ Using fallback method (may not support vision)")
665
+ return fallback_response
666
+ except Exception as fallback_error:
667
+ if self.verbose:
668
+ print(f" ❌ Fallback also failed: {fallback_error}")
669
+ raise vision_error # Raise original error
670
+ except ImportError:
671
+ # openai or other vision SDK not available
672
+ pass
673
+ except Exception as e:
674
+ if self.verbose:
675
+ print(f"⚠️ Vision API error: {e}, falling back to text-only")
676
+
677
+ # Fallback: Try to pass image via kwargs or use text-only
678
+ # Some providers might accept image in kwargs
679
+ try:
680
+ return self.llm.generate(
681
+ system_prompt,
682
+ f"{user_prompt}\n\n[Image data: {image_data_url[:100]}...]",
683
+ temperature=0.0,
684
+ # No max_tokens - use API default
685
+ )
686
+ except Exception as e:
687
+ raise RuntimeError(
688
+ f"LLM provider {type(self.llm).__name__} may not support vision. "
689
+ f"Error: {e}. Use a vision-capable model like GPT-4o or Claude 3."
690
+ ) from e
691
+
692
+ def _extract_element_id(self, llm_response: str) -> int | None:
693
+ """
694
+ Extract element ID integer from LLM response.
695
+
696
+ Args:
697
+ llm_response: LLM response text
698
+
699
+ Returns:
700
+ Element ID as integer, or None if not found
701
+ """
702
+ if self.verbose:
703
+ print(f"🔍 Raw LLM response: {repr(llm_response)}")
704
+
705
+ # Clean the response - remove leading/trailing whitespace (handles '\n177', '177\n', etc.)
706
+ cleaned = llm_response.strip()
707
+
708
+ if self.verbose:
709
+ print(f" 🧹 After strip: {repr(cleaned)}")
710
+
711
+ # Remove common prefixes that LLMs might add
712
+ prefixes_to_remove = [
713
+ "element",
714
+ "id",
715
+ "the element",
716
+ "element id",
717
+ "the id",
718
+ "click",
719
+ "click on",
720
+ "select",
721
+ "choose",
722
+ ]
723
+ for prefix in prefixes_to_remove:
724
+ if cleaned.lower().startswith(prefix):
725
+ cleaned = cleaned[len(prefix) :].strip()
726
+ # Remove any remaining punctuation
727
+ cleaned = cleaned.lstrip(":.,;!?()[]{}")
728
+ cleaned = cleaned.strip()
729
+ if self.verbose:
730
+ print(f" 🧹 After removing prefix '{prefix}': {repr(cleaned)}")
731
+
732
+ # Try to find all integers in the cleaned response
733
+ numbers = re.findall(r"\d+", cleaned)
734
+
735
+ if self.verbose:
736
+ print(f" 🔢 Numbers found: {numbers}")
737
+
738
+ if numbers:
739
+ # If multiple numbers found, prefer the largest one (likely the actual element ID)
740
+ # Element IDs are typically larger numbers, not small ones like "1"
741
+ try:
742
+ # Convert all to int
743
+ int_numbers = [int(n) for n in numbers]
744
+ if self.verbose:
745
+ print(f" 🔢 As integers: {int_numbers}")
746
+
747
+ # Prefer larger numbers (element IDs are usually > 10)
748
+ # But if only small numbers exist, use the first one
749
+ large_numbers = [n for n in int_numbers if n > 10]
750
+ if large_numbers:
751
+ element_id = max(large_numbers) # Take the largest
752
+ if self.verbose:
753
+ print(f" ✅ Selected largest number > 10: {element_id}")
754
+ else:
755
+ element_id = int_numbers[0] # Fallback to first if all are small
756
+ if self.verbose:
757
+ print(f" ⚠️ All numbers ≤ 10, using first: {element_id}")
758
+
759
+ if self.verbose:
760
+ print(f"✅ Extracted element ID: {element_id} (from {numbers})")
761
+ return element_id
762
+ except ValueError:
763
+ if self.verbose:
764
+ print(f" ❌ Failed to convert numbers to integers")
765
+ pass
766
+
767
+ if self.verbose:
768
+ print(f"⚠️ Could not extract element ID from response: {llm_response}")
769
+ return None
770
+
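Expected behaviour of the extraction heuristic above, following directly from its prefix-stripping and "prefer numbers > 10" rules (illustrative, not a test shipped with the package):

    agent._extract_element_id("42")                    # -> 42
    agent._extract_element_id("The element ID is 42")  # -> 42 (prefixes stripped)
    agent._extract_element_id("no digits here")        # -> None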
771
+ def _compute_hash(self, text: str) -> str:
772
+ """Compute SHA256 hash of text."""
773
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()
774
+
775
+ async def act(
776
+ self,
777
+ goal: str,
778
+ max_retries: int = 2,
779
+ snapshot_options: SnapshotOptions | None = None,
780
+ ) -> AgentActionResult:
781
+ """
782
+ Override act() method to use visual prompting with full tracing support.
783
+
784
+ Args:
785
+ goal: User's goal/task
786
+ max_retries: Maximum retry attempts
787
+ snapshot_options: Optional snapshot options (screenshot will be enabled)
788
+
789
+ Returns:
790
+ AgentActionResult
791
+ """
792
+ if self.verbose:
793
+ print(f"\n{'=' * 70}")
794
+ print(f"🤖 Visual Agent Goal: {goal}")
795
+ print(f"{'=' * 70}")
796
+
797
+ # Generate step ID for tracing
798
+ self._step_count += 1
799
+ step_id = f"step-{self._step_count}"
800
+
801
+ # Emit step_start trace event if tracer is enabled
802
+ if self.tracer:
803
+ pre_url = self.browser.page.url if self.browser.page else None
804
+ _safe_tracer_call(
805
+ self.tracer,
806
+ "emit_step_start",
807
+ self.verbose,
808
+ step_id=step_id,
809
+ step_index=self._step_count,
810
+ goal=goal,
811
+ attempt=0,
812
+ pre_url=pre_url,
813
+ )
814
+
815
+ start_time = time.time()
816
+
817
+ try:
818
+ # Ensure screenshot is enabled
819
+ if snapshot_options is None:
820
+ snapshot_options = SnapshotOptions()
821
+
822
+ # Enable screenshot if not already enabled
823
+ if snapshot_options.screenshot is False or snapshot_options.screenshot is None:
824
+ from .models import ScreenshotConfig
825
+
826
+ snapshot_options.screenshot = ScreenshotConfig(format="png")
827
+
828
+ # Set goal if not already provided
829
+ if snapshot_options.goal is None:
830
+ snapshot_options.goal = goal
831
+
832
+ # Set limit if not provided
833
+ if snapshot_options.limit is None:
834
+ snapshot_options.limit = self.default_snapshot_limit
835
+
836
+ if self.verbose:
837
+ print(f"🎯 Goal: {goal}")
838
+ print("📸 Taking snapshot with screenshot...")
839
+
840
+ # 1. Take snapshot with screenshot
841
+ from .snapshot import snapshot_async
842
+
843
+ snap = await snapshot_async(self.browser, snapshot_options)
844
+
845
+ if snap.status != "success":
846
+ raise RuntimeError(f"Snapshot failed: {snap.error}")
847
+
848
+ if not snap.screenshot:
849
+ raise RuntimeError("Screenshot not available in snapshot")
850
+
851
+ # Compute diff_status by comparing with previous snapshot
852
+ elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)
853
+
854
+ # Create snapshot with diff_status populated
855
+ snap_with_diff = Snapshot(
856
+ status=snap.status,
857
+ timestamp=snap.timestamp,
858
+ url=snap.url,
859
+ viewport=snap.viewport,
860
+ elements=elements_with_diff,
861
+ screenshot=snap.screenshot,
862
+ screenshot_format=snap.screenshot_format,
863
+ error=snap.error,
864
+ )
865
+
866
+ # Update previous snapshot for next comparison
867
+ self._previous_snapshot = snap
868
+
869
+ # Emit snapshot trace event if tracer is enabled
870
+ if self.tracer:
871
+ # Build snapshot event data (use snap_with_diff to include diff_status)
872
+ snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
873
+
874
+ # Always include screenshot in trace event for studio viewer compatibility
875
+ if snap.screenshot:
876
+ # Extract base64 string from data URL if needed
877
+ if snap.screenshot.startswith("data:image"):
878
+ # Format: "data:image/jpeg;base64,{base64_string}"
879
+ screenshot_base64 = (
880
+ snap.screenshot.split(",", 1)[1]
881
+ if "," in snap.screenshot
882
+ else snap.screenshot
883
+ )
884
+ else:
885
+ screenshot_base64 = snap.screenshot
886
+
887
+ snapshot_data["screenshot_base64"] = screenshot_base64
888
+ if snap.screenshot_format:
889
+ snapshot_data["screenshot_format"] = snap.screenshot_format
890
+
891
+ _safe_tracer_call(
892
+ self.tracer,
893
+ "emit",
894
+ self.verbose,
895
+ "snapshot",
896
+ snapshot_data,
897
+ step_id=step_id,
898
+ )
899
+
900
+ if self.verbose:
901
+ print(f"✅ Snapshot taken: {len(snap.elements)} elements")
902
+
903
+ # 2. Draw labeled screenshot
904
+ if self.verbose:
905
+ print("🎨 Drawing bounding boxes and labels...")
906
+ print(f" Elements to label: {len(snap.elements)}")
907
+ if len(snap.elements) > 0:
908
+ element_ids = [el.id for el in snap.elements[:10]] # Show first 10
909
+ print(f" Sample element IDs: {element_ids}")
910
+
911
+ labeled_image = self._draw_labeled_screenshot(snap, snap.elements)
912
+
913
+ # Save labeled image to disk for debugging
914
+ # Save to playground/images if running from playground, otherwise use current directory
915
+ try:
916
+ # Try to detect if we're in a playground context
917
+ import sys
918
+
919
+ cwd = Path.cwd()
920
+ playground_path = None
921
+
922
+ # Check if current working directory contains playground
923
+ if (cwd / "playground").exists():
924
+ playground_path = cwd / "playground" / "images"
925
+ else:
926
+ # Check sys.path for playground
927
+ for path_str in sys.path:
928
+ path_obj = Path(path_str)
929
+ if "playground" in str(path_obj) and path_obj.exists():
930
+ # Find the playground directory
931
+ if path_obj.name == "playground":
932
+ playground_path = path_obj / "images"
933
+ break
934
+ elif (path_obj / "playground").exists():
935
+ playground_path = path_obj / "playground" / "images"
936
+ break
937
+
938
+ if playground_path is None:
939
+ # Fallback: use current working directory
940
+ playground_path = cwd / "playground" / "images"
941
+
942
+ images_dir = playground_path
943
+ images_dir.mkdir(parents=True, exist_ok=True)
944
+ image_uuid = str(uuid.uuid4())
945
+ image_filename = f"labeled_screenshot_{image_uuid}.png"
946
+ image_path = images_dir / image_filename
947
+ labeled_image.save(image_path, format="PNG")
948
+ if self.verbose:
949
+ print(f" 💾 Saved labeled screenshot: {image_path.absolute()}")
950
+ except Exception as save_error:
951
+ # Don't fail if image save fails - it's just for debugging
952
+ if self.verbose:
953
+ print(f" ⚠️ Could not save labeled screenshot: {save_error}")
954
+
955
+ # Use JPEG for better compression (smaller file size for vision APIs)
956
+ labeled_image_data_url = self._encode_image_to_base64(
957
+ labeled_image, format="JPEG", max_size_mb=20.0
958
+ )
959
+
960
+ # 3. Query LLM with vision
961
+ if self.verbose:
962
+ print("🧠 Querying LLM with labeled screenshot...")
963
+
964
+ llm_response = await self._query_llm_with_vision(labeled_image_data_url, goal)
965
+
966
+ # Emit LLM query trace event if tracer is enabled
967
+ if self.tracer:
968
+ _safe_tracer_call(
969
+ self.tracer,
970
+ "emit",
971
+ self.verbose,
972
+ "llm_query",
973
+ {
974
+ "prompt_tokens": llm_response.prompt_tokens,
975
+ "completion_tokens": llm_response.completion_tokens,
976
+ "model": llm_response.model_name,
977
+ "response": llm_response.content[:200], # Truncate for brevity
978
+ },
979
+ step_id=step_id,
980
+ )
981
+
982
+ if self.verbose:
983
+ print(f"💭 LLM Response: {llm_response.content}")
984
+
985
+ # Track token usage
986
+ self._track_tokens(goal, llm_response)
987
+
988
+ # 4. Extract element ID
989
+ element_id = self._extract_element_id(llm_response.content)
990
+
991
+ if element_id is None:
992
+ raise ValueError(
993
+ f"Could not extract element ID from LLM response: {llm_response.content}"
994
+ )
995
+
996
+ if self.verbose:
997
+ print(f"🎯 Extracted Element ID: {element_id}")
998
+
999
+ # 5. Click the element
1000
+ if self.verbose:
1001
+ print(f"🖱️ Clicking element {element_id}...")
1002
+
1003
+ click_result = await click_async(self.browser, element_id)
1004
+
1005
+ duration_ms = int((time.time() - start_time) * 1000)
1006
+
1007
+ # Create AgentActionResult from click result
1008
+ result = AgentActionResult(
1009
+ success=click_result.success,
1010
+ action="click",
1011
+ goal=goal,
1012
+ duration_ms=duration_ms,
1013
+ attempt=0,
1014
+ element_id=element_id,
1015
+ outcome=click_result.outcome,
1016
+ url_changed=click_result.url_changed,
1017
+ error=click_result.error,
1018
+ )
1019
+
1020
+ # Emit action execution trace event if tracer is enabled
1021
+ if self.tracer:
1022
+ post_url = self.browser.page.url if self.browser.page else None
1023
+
1024
+ # Include element data for live overlay visualization
1025
+ elements_data = [
1026
+ {
1027
+ "id": el.id,
1028
+ "bbox": {
1029
+ "x": el.bbox.x,
1030
+ "y": el.bbox.y,
1031
+ "width": el.bbox.width,
1032
+ "height": el.bbox.height,
1033
+ },
1034
+ "role": el.role,
1035
+ "text": el.text[:50] if el.text else "",
1036
+ }
1037
+ for el in snap.elements[:50]
1038
+ ]
1039
+
1040
+ _safe_tracer_call(
1041
+ self.tracer,
1042
+ "emit",
1043
+ self.verbose,
1044
+ "action",
1045
+ {
1046
+ "action": result.action,
1047
+ "element_id": result.element_id,
1048
+ "success": result.success,
1049
+ "outcome": result.outcome,
1050
+ "duration_ms": duration_ms,
1051
+ "post_url": post_url,
1052
+ "elements": elements_data, # Add element data for overlay
1053
+ "target_element_id": result.element_id, # Highlight target in red
1054
+ },
1055
+ step_id=step_id,
1056
+ )
1057
+
1058
+ # Record history
1059
+ self.history.append(
1060
+ {
1061
+ "goal": goal,
1062
+ "action": f"CLICK({element_id})",
1063
+ "result": result.model_dump(), # Store as dict
1064
+ "success": result.success,
1065
+ "attempt": 0,
1066
+ "duration_ms": duration_ms,
1067
+ }
1068
+ )
1069
+
1070
+ if self.verbose:
1071
+ status = "✅" if result.success else "❌"
1072
+ print(f"{status} Completed in {duration_ms}ms")
1073
+
1074
+ # Emit step completion trace event if tracer is enabled
1075
+ if self.tracer:
1076
+ # Get pre_url from step_start (stored in tracer or use current)
1077
+ pre_url = snap.url
1078
+ post_url = self.browser.page.url if self.browser.page else None
1079
+
1080
+ # Compute snapshot digest (simplified - use URL + timestamp)
1081
+ snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"
1082
+
1083
+ # Build LLM data
1084
+ llm_response_text = llm_response.content
1085
+
1086
+ # Build execution data
1087
+ exec_data = {
1088
+ "success": result.success,
1089
+ "outcome": result.outcome,
1090
+ "action": result.action,
1091
+ "element_id": result.element_id,
1092
+ "url_changed": result.url_changed,
1093
+ "duration_ms": duration_ms,
1094
+ }
1095
+
1096
+ # Build verification data (simplified - always pass for now)
1097
+ verify_data = {
1098
+ "passed": result.success,
1099
+ "signals": {
1100
+ "url_changed": result.url_changed or False,
1101
+ },
1102
+ }
1103
+
1104
+ # Build complete step_end event
1105
+ step_end_data = TraceEventBuilder.build_step_end_event(
1106
+ step_id=step_id,
1107
+ step_index=self._step_count,
1108
+ goal=goal,
1109
+ attempt=0,
1110
+ pre_url=pre_url,
1111
+ post_url=post_url or pre_url,
1112
+ snapshot_digest=snapshot_digest,
1113
+ llm_data={
1114
+ "response_text": llm_response_text,
1115
+ "response_hash": f"sha256:{self._compute_hash(llm_response_text)}",
1116
+ },
1117
+ exec_data=exec_data,
1118
+ verify_data=verify_data,
1119
+ )
1120
+
1121
+ _safe_tracer_call(
1122
+ self.tracer,
1123
+ "emit",
1124
+ self.verbose,
1125
+ "step_end",
1126
+ step_end_data,
1127
+ step_id=step_id,
1128
+ )
1129
+
1130
+ return result
1131
+
1132
+ except Exception as e:
1133
+ # Emit error trace event if tracer is enabled
1134
+ if self.tracer:
1135
+ _safe_tracer_call(
1136
+ self.tracer,
1137
+ "emit_error",
1138
+ self.verbose,
1139
+ step_id=step_id,
1140
+ error=str(e),
1141
+ attempt=0,
1142
+ )
1143
+
1144
+ if self.verbose:
1145
+ print(f"❌ Error: {e}")
1146
+
1147
+ # Re-raise the exception
1148
+ raise
1149
+
1150
+
1151
+ class SentienceVisualAgent(SentienceAgent):
1152
+ """
1153
+ Sync visual agent that uses labeled screenshots with vision-capable LLMs.
1154
+
1155
+ Extends SentienceAgent to override act() method with visual prompting.
1156
+
1157
+ Requirements:
1158
+ - Pillow (PIL): Required for image processing and drawing bounding boxes
1159
+ Install with: pip install Pillow
1160
+ - Vision-capable LLM: Requires an LLM provider that supports vision (e.g., GPT-4o, Claude 3)
1161
+ """
1162
+
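A parallel sketch for the synchronous class (browser and LLM construction are not shown in this diff and are assumed; the sync act() is expected to mirror the async signature shown earlier in this file):

    browser = ...  # a SentienceBrowser instance, construction not shown here
    llm = ...      # a vision-capable LLMProvider
    agent = SentienceVisualAgent(browser, llm, verbose=True)
    result = agent.act("Click the 'Sign in' button")
    print(result.success, result.element_id)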
1163
+ def __init__(
1164
+ self,
1165
+ browser: SentienceBrowser,
1166
+ llm: LLMProvider,
1167
+ default_snapshot_limit: int = 50,
1168
+ verbose: bool = True,
1169
+ tracer: Any | None = None,
1170
+ config: Any | None = None,
1171
+ ):
1172
+ """
1173
+ Initialize Visual Agent
1174
+
1175
+ Args:
1176
+ browser: SentienceBrowser instance
1177
+ llm: LLM provider (must support vision, e.g., GPT-4o, Claude 3)
1178
+ default_snapshot_limit: Default maximum elements to include
1179
+ verbose: Print execution logs
1180
+ tracer: Optional Tracer instance
1181
+ config: Optional AgentConfig
1182
+ """
1183
+ super().__init__(browser, llm, default_snapshot_limit, verbose, tracer, config)
1184
+
1185
+ if not PIL_AVAILABLE:
1186
+ raise ImportError(
1187
+ "PIL/Pillow is required for SentienceVisualAgent. Install with: pip install Pillow"
1188
+ )
1189
+
1190
+ # Track previous snapshot for diff computation
1191
+ self._previous_snapshot: Snapshot | None = None
1192
+
1193
+ def _decode_screenshot(self, screenshot_data_url: str) -> "PILImage.Image":
1194
+ """
1195
+ Decode base64 screenshot data URL to PIL Image
1196
+
1197
+ Args:
1198
+ screenshot_data_url: Base64-encoded data URL (e.g., "data:image/png;base64,...")
1199
+
1200
+ Returns:
1201
+ PIL Image object
1202
+ """
1203
+ # Extract base64 data from data URL
1204
+ if screenshot_data_url.startswith("data:image/"):
1205
+ # Format: "data:image/png;base64,<base64_data>"
1206
+ base64_data = screenshot_data_url.split(",", 1)[1]
1207
+ else:
1208
+ # Assume it's already base64
1209
+ base64_data = screenshot_data_url
1210
+
1211
+ # Decode base64 to bytes
1212
+ image_bytes = base64.b64decode(base64_data)
1213
+
1214
+ # Load image from bytes
1215
+ return PILImage.open(io.BytesIO(image_bytes))
1216
+
1217
+ def _find_label_position(
1218
+ self,
1219
+ element_bbox: dict[str, float],
1220
+ existing_labels: list[dict[str, float]],
1221
+ image_width: int,
1222
+ image_height: int,
1223
+ label_width: int,
1224
+ label_height: int,
1225
+ ) -> tuple[int, int]:
1226
+ """
1227
+ Find best position for label using anti-collision algorithm.
1228
+
1229
+ Tries 8 positions: 4 sides (top, bottom, left, right) + 4 corners.
1230
+ Returns the first position that doesn't collide with existing labels.
1231
+
1232
+ Args:
1233
+ element_bbox: Element bounding box {x, y, width, height}
1234
+ existing_labels: List of existing label bounding boxes
1235
+ image_width: Image width in pixels
1236
+ image_height: Image height in pixels
1237
+ label_width: Label width in pixels
1238
+ label_height: Label height in pixels
1239
+
1240
+ Returns:
1241
+ (x, y) position for label
1242
+ """
1243
+ x, y = element_bbox["x"], element_bbox["y"]
1244
+ width, height = element_bbox["width"], element_bbox["height"]
1245
+
1246
+ # Offset from element edge
1247
+ label_offset = 15 # Increased from 5px for better separation
1248
+
1249
+ # Try 8 positions: top, bottom, left, right, top-left, top-right, bottom-left, bottom-right
1250
+ positions = [
1251
+ (int(x + width / 2 - label_width / 2), int(y - label_height - label_offset)), # Top
1252
+ (int(x + width / 2 - label_width / 2), int(y + height + label_offset)), # Bottom
1253
+ (int(x - label_width - label_offset), int(y + height / 2 - label_height / 2)), # Left
1254
+ (int(x + width + label_offset), int(y + height / 2 - label_height / 2)), # Right
1255
+ (int(x - label_width - label_offset), int(y - label_height - label_offset)), # Top-left
1256
+ (int(x + width + label_offset), int(y - label_height - label_offset)), # Top-right
1257
+ (int(x - label_width - label_offset), int(y + height + label_offset)), # Bottom-left
1258
+ (int(x + width + label_offset), int(y + height + label_offset)), # Bottom-right
1259
+ ]
1260
+
1261
+ # Check each position for collisions
1262
+ for pos_x, pos_y in positions:
1263
+ # Check bounds
1264
+ if (
1265
+ pos_x < 0
1266
+ or pos_y < 0
1267
+ or pos_x + label_width > image_width
1268
+ or pos_y + label_height > image_height
1269
+ ):
1270
+ continue
1271
+
1272
+ # Check collision with existing labels
1273
+ label_bbox = {
1274
+ "x": pos_x,
1275
+ "y": pos_y,
1276
+ "width": label_width,
1277
+ "height": label_height,
1278
+ }
1279
+
1280
+ collision = False
1281
+ for existing in existing_labels:
1282
+ # Simple AABB collision detection
1283
+ if not (
1284
+ label_bbox["x"] + label_bbox["width"] < existing["x"]
1285
+ or label_bbox["x"] > existing["x"] + existing["width"]
1286
+ or label_bbox["y"] + label_bbox["height"] < existing["y"]
1287
+ or label_bbox["y"] > existing["y"] + existing["height"]
1288
+ ):
1289
+ collision = True
1290
+ break
1291
+
1292
+ if not collision:
1293
+ return (pos_x, pos_y)
1294
+
1295
+ # If all positions collide, use top position with increased offset
1296
+ return (int(x + width / 2 - label_width / 2), int(y - label_height - label_offset * 2))
1297
+
1298
+ def _draw_labeled_screenshot(
1299
+ self,
1300
+ snapshot: Snapshot,
1301
+ elements: list[Element],
1302
+ ) -> "PILImage.Image":
1303
+ """
1304
+ Draw labeled screenshot with bounding boxes and element IDs.
1305
+
1306
+ Args:
1307
+ snapshot: Snapshot with screenshot data
1308
+ elements: List of elements to label
1309
+
1310
+ Returns:
1311
+ PIL Image with labels drawn
1312
+ """
1313
+ # Decode screenshot
1314
+ img = self._decode_screenshot(snapshot.screenshot)
1315
+ draw = PILImageDraw.Draw(img)
1316
+
1317
+ # Load font (fallback to default if not available)
1318
+ try:
1319
+ font = PILImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 16)
1320
+ except OSError:
1321
+ try:
1322
+ font = PILImageFont.truetype(
1323
+ "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 16
1324
+ )
1325
+ except OSError:
1326
+ font = PILImageFont.load_default()
1327
+
1328
+ image_width, image_height = img.size
1329
+ existing_labels: list[dict[str, float]] = []
1330
+
1331
+ # Neon green color: #39FF14 (bright, vibrant green)
1332
+ neon_green = "#39FF14"
1333
+
1334
+ for element in elements:
1335
+ bbox = element.bbox
1336
+ x, y, width, height = bbox.x, bbox.y, bbox.width, bbox.height
1337
+
1338
+ # Draw bounding box rectangle (neon green with 2px width)
1339
+ draw.rectangle(
1340
+ [(x, y), (x + width, y + height)],
1341
+ outline=neon_green,
1342
+ width=2,
1343
+ )
1344
+
1345
+ # Prepare label text (just the number - keep it simple and compact)
1346
+ label_text = str(element.id)
1347
+
1348
+ # Measure label text size
1349
+ bbox_text = draw.textbbox((0, 0), label_text, font=font)
1350
+ label_width = bbox_text[2] - bbox_text[0]
1351
+ label_height = bbox_text[3] - bbox_text[1]
1352
+
1353
+ # Find best position for label (anti-collision)
1354
+ label_x, label_y = self._find_label_position(
1355
+ {"x": x, "y": y, "width": width, "height": height},
1356
+ existing_labels,
1357
+ image_width,
1358
+ image_height,
1359
+ label_width + 8, # Add padding
1360
+ label_height + 4, # Add padding
1361
+ )
1362
+
1363
+ # Calculate connection points for a clearer visual link
1364
+ element_center_x = x + width / 2
1365
+ element_center_y = y + height / 2
1366
+ label_center_x = label_x + label_width / 2
1367
+ label_center_y = label_y + label_height / 2
1368
+
1369
+ # Determine which edge of the element is closest to the label
1370
+ dist_top = abs(label_center_y - y)
1371
+ dist_bottom = abs(label_center_y - (y + height))
1372
+ dist_left = abs(label_center_x - x)
1373
+ dist_right = abs(label_center_x - (x + width))
1374
+
1375
+ min_dist = min(dist_top, dist_bottom, dist_left, dist_right)
1376
+
1377
+ if min_dist == dist_top:
1378
+ line_start = (element_center_x, y)
1379
+ elif min_dist == dist_bottom:
1380
+ line_start = (element_center_x, y + height)
1381
+ elif min_dist == dist_left:
1382
+ line_start = (x, element_center_y)
1383
+ else:
1384
+ line_start = (x + width, element_center_y)
1385
+
1386
+ # Draw connecting line from element edge to label
1387
+ draw.line(
1388
+ [line_start, (label_center_x, label_center_y)],
1389
+ fill=neon_green,
1390
+ width=2,
1391
+ )
1392
+
1393
+ # Draw label background (white with neon green border)
1394
+ label_bg_x1 = label_x - 4
1395
+ label_bg_y1 = label_y - 2
1396
+ label_bg_x2 = label_x + label_width + 4
1397
+ label_bg_y2 = label_y + label_height + 2
1398
+
1399
+ draw.rectangle(
1400
+ [(label_bg_x1, label_bg_y1), (label_bg_x2, label_bg_y2)],
1401
+ fill="white",
1402
+ outline=neon_green,
1403
+ width=2,
1404
+ )
1405
+
1406
+ # Draw label text
1407
+ draw.text(
1408
+ (label_x, label_y),
1409
+ label_text,
1410
+ fill="black",
1411
+ font=font,
1412
+ )
1413
+
1414
+ # Record label position for collision detection
1415
+ existing_labels.append(
1416
+ {
1417
+ "x": label_bg_x1,
1418
+ "y": label_bg_y1,
1419
+ "width": label_bg_x2 - label_bg_x1,
1420
+ "height": label_bg_y2 - label_bg_y1,
1421
+ }
1422
+ )
1423
+
1424
+ return img
1425
+
1426
+ def _encode_image_to_base64(
1427
+ self,
1428
+ image: "PILImage.Image",
1429
+ format: str = "PNG",
1430
+ max_size_mb: float = 20.0,
1431
+ ) -> str:
1432
+ """
1433
+ Encode PIL Image to base64 data URL with size optimization.
1434
+
1435
+ Args:
1436
+ image: PIL Image object
1437
+ format: Output format ("PNG" or "JPEG")
1438
+ max_size_mb: Maximum size in MB (will compress if exceeded)
1439
+
1440
+ Returns:
1441
+ Base64-encoded data URL
1442
+ """
1443
+ buffer = io.BytesIO()
1444
+ pil_format = format.upper()
1445
+ quality = 95 # Start with high quality
1446
+
1447
+ # Convert RGBA to RGB for JPEG
1448
+ if pil_format == "JPEG" and image.mode == "RGBA":
1449
+ # Create white background
1450
+ rgb_image = PILImage.new("RGB", image.size, (255, 255, 255))  # PILImage is the runtime import; Image is None outside type checking
1451
+ rgb_image.paste(image, mask=image.split()[3]) # Use alpha channel as mask
1452
+ image = rgb_image
1453
+
1454
+ # Try to fit within size limit
1455
+ for attempt in range(3):
1456
+ buffer.seek(0)
1457
+ buffer.truncate(0)
1458
+
1459
+ if pil_format == "JPEG":
1460
+ image.save(buffer, format=pil_format, quality=quality, optimize=True)
1461
+ else:
1462
+ image.save(buffer, format=pil_format, optimize=True)
1463
+
1464
+ size_mb = len(buffer.getvalue()) / (1024 * 1024)
1465
+
1466
+ if size_mb <= max_size_mb:
1467
+ break
1468
+
1469
+ # Reduce quality for next attempt
1470
+ quality = max(70, quality - 15)
1471
+ if self.verbose and attempt == 0:
1472
+ print(f" ⚠️ Image size {size_mb:.2f}MB exceeds limit, compressing...")
1473
+
1474
+ image_bytes = buffer.getvalue()
1475
+ base64_data = base64.b64encode(image_bytes).decode("utf-8")
1476
+
1477
+ final_size_mb = len(image_bytes) / (1024 * 1024)
1478
+ if self.verbose:
1479
+ print(f" 📸 Image encoded: {final_size_mb:.2f}MB ({len(base64_data)} chars base64)")
1480
+
1481
+ mime_type = "image/png" if pil_format == "PNG" else "image/jpeg"
1482
+ return f"data:{mime_type};base64,{base64_data}"
1483
+
1484
+ def _query_llm_with_vision(
1485
+ self,
1486
+ image_data_url: str,
1487
+ goal: str,
1488
+ ) -> LLMResponse:
1489
+ """
1490
+ Query LLM with vision (labeled screenshot) - sync version.
1491
+
1492
+ Args:
1493
+ image_data_url: Base64-encoded image data URL
1494
+ goal: User's goal/task
1495
+
1496
+ Returns:
1497
+ LLMResponse with element ID
1498
+ """
1499
+ # Use the same prompt as async version
1500
+ system_prompt = """You are a web automation assistant. You will see a screenshot of a web page with labeled element IDs.
1501
+ Each clickable element has:
1502
+ - A bright neon green (#39FF14) bounding box around the element
1503
+ - A white label box with a number (the element ID) connected by a green line
1504
+ - The label is clearly separate from the element (not part of the UI)
1505
+
1506
+ CRITICAL INSTRUCTIONS:
1507
+ 1. Look at the screenshot carefully
1508
+ 2. Find the element that matches the user's goal (ignore the white label boxes - they are annotations, not UI elements)
1509
+ 3. Follow the green line from that element to find its label box with the ID number
1510
+ 4. Respond with ONLY that integer ID number (e.g., "42" or "1567")
1511
+ 5. Do NOT include any explanation, reasoning, or other text
1512
+ 6. Do NOT say "element 1" or "the first element" - just return the number
1513
+ 7. Do NOT confuse the white label box with an interactive element - labels are annotations connected by green lines
1514
+
1515
+ Example responses:
1516
+ - Correct: "42"
1517
+ - Correct: "1567"
1518
+ - Wrong: "I see element 42"
1519
+ - Wrong: "The element ID is 42"
1520
+ - Wrong: "42 (the search box)" """
1521
+
1522
+ user_prompt = f"""Goal: {goal}
1523
+
1524
+ Look at the screenshot. Each element has a neon green bounding box with a white label showing its ID number.
1525
+ Find the element that should be clicked to accomplish this goal.
1526
+ Return ONLY the integer ID number from the label, nothing else."""
1527
+
1528
+ # Check if LLM provider supports vision (OpenAI GPT-4o, Claude, etc.)
+ if hasattr(self.llm, "client") and hasattr(self.llm.client, "chat"):
+ # Vision-capable provider - use vision API
+ try:
+ from openai import OpenAI
+
+ # Check if it's OpenAI
+ if isinstance(self.llm.client, OpenAI):
+ messages = [
+ {
+ "role": "system",
+ "content": system_prompt,
+ },
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": user_prompt},
+ {
+ "type": "image_url",
+ "image_url": {"url": image_data_url},
+ },
+ ],
+ },
+ ]
+
+ response = self.llm.client.chat.completions.create(
+ model=self.llm._model_name,
+ messages=messages,
+ temperature=0.0,
+ )
+
+ content = response.choices[0].message.content or ""
+ usage = response.usage
+
+ from .llm_response_builder import LLMResponseBuilder
+
+ return LLMResponseBuilder.from_openai_format(
+ content=content,
+ prompt_tokens=usage.prompt_tokens if usage else None,
+ completion_tokens=usage.completion_tokens if usage else None,
+ total_tokens=usage.total_tokens if usage else None,
+ model_name=response.model,
+ finish_reason=response.choices[0].finish_reason,
+ )
+
+ # Check if provider supports vision API (uses OpenAI-compatible format)
+ elif hasattr(self.llm, "client") and hasattr(self.llm.client, "chat"):
+ if self.verbose:
+ print(f" 🔍 Using vision API with model: {self.llm._model_name}")
+ print(f" 📐 Image data URL length: {len(image_data_url)} chars")
+
+ messages = [
+ {
+ "role": "system",
+ "content": system_prompt,
+ },
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": user_prompt},
+ {
+ "type": "image_url",
+ "image_url": {"url": image_data_url},
+ },
+ ],
+ },
+ ]
+
+ try:
+ if self.verbose:
+ print(f" 📤 Sending request to vision API...")
+
+ response = self.llm.client.chat.completions.create(
+ model=self.llm._model_name,
+ messages=messages,
+ temperature=0.0,
+ )
+
+ if not hasattr(response, "choices") or len(response.choices) == 0:
+ raise ValueError("Vision API returned no choices in response")
+
+ choice = response.choices[0]
+ content = (
+ choice.message.content if hasattr(choice.message, "content") else None
+ )
+ finish_reason = (
+ choice.finish_reason if hasattr(choice, "finish_reason") else None
+ )
+
+ if content is None or content == "":
+ error_msg = f"Vision API returned empty content (finish_reason: {finish_reason})"
+ if self.verbose:
+ print(f" ❌ {error_msg}")
+ raise ValueError(error_msg)
+
+ usage = response.usage if hasattr(response, "usage") else None
+
+ from .llm_response_builder import LLMResponseBuilder
+
+ return LLMResponseBuilder.from_openai_format(
+ content=content,
+ prompt_tokens=usage.prompt_tokens if usage else None,
+ completion_tokens=usage.completion_tokens if usage else None,
+ total_tokens=usage.total_tokens if usage else None,
+ model_name=(
+ response.model
+ if hasattr(response, "model")
+ else self.llm._model_name
+ ),
+ finish_reason=finish_reason,
+ )
+ except Exception as vision_error:
+ if self.verbose:
+ print(f" ❌ Vision API error: {vision_error}")
+ print(f" 🔄 Attempting fallback to regular generate method...")
+
+ # Fallback: Try using the regular generate method
+ try:
+ fallback_prompt = f"{user_prompt}\n\n[Image: {image_data_url[:200]}...]"
+ fallback_response = self.llm.generate(
+ system_prompt,
+ fallback_prompt,
+ temperature=0.0,
+ )
+ if self.verbose:
+ print(f" ⚠️ Using fallback method (may not support vision)")
+ return fallback_response
+ except Exception as fallback_error:
+ if self.verbose:
+ print(f" ❌ Fallback also failed: {fallback_error}")
+ raise vision_error # Raise original error
+ except ImportError:
+ # openai or other vision SDK not available
+ pass
+ except Exception as e:
+ if self.verbose:
+ print(f"⚠️ Vision API error: {e}, falling back to text-only")
+
+ # Fallback: Try to pass image via kwargs or use text-only
+ try:
+ return self.llm.generate(
+ system_prompt,
+ f"{user_prompt}\n\n[Image data: {image_data_url[:100]}...]",
+ temperature=0.0,
+ )
+ except Exception as e:
+ raise RuntimeError(
+ f"LLM provider {type(self.llm).__name__} may not support vision. "
+ f"Error: {e}. Use a vision-capable model like GPT-4o or Claude 3."
+ ) from e
+
+ def _extract_element_id(self, llm_response: str) -> int | None:
+ """Extract element ID integer from LLM response (shared with async version)."""
+ return SentienceVisualAgentAsync._extract_element_id(self, llm_response)
+
+ def _compute_hash(self, text: str) -> str:
+ """Compute SHA256 hash of text."""
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()
+
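`_extract_element_id` is delegated to the async class, whose implementation is outside this hunk. As a rough illustration of what such a parser has to tolerate (models occasionally wrap the ID in prose despite the prompt), here is a hypothetical regex-based sketch, not the package's actual code:

import re

def extract_element_id(llm_response: str) -> int | None:
    """Hypothetical sketch: pull the first integer ID out of an LLM reply."""
    text = llm_response.strip()
    # Ideal case: the model replied with a bare integer such as "42"
    if text.isdigit():
        return int(text)
    # Fallback: tolerate replies like "The element ID is 42"
    match = re.search(r"\d+", text)
    return int(match.group(0)) if match else None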
+ def act(
+ self,
+ goal: str,
+ max_retries: int = 2,
+ snapshot_options: SnapshotOptions | None = None,
+ ) -> AgentActionResult:
+ """
+ Override act() method to use visual prompting with full tracing support.
+
+ Args:
+ goal: User's goal/task
+ max_retries: Maximum retry attempts
+ snapshot_options: Optional snapshot options (screenshot will be enabled)
+
+ Returns:
+ AgentActionResult
+ """
+ if self.verbose:
+ print(f"\n{'=' * 70}")
+ print(f"🤖 Visual Agent Goal: {goal}")
+ print(f"{'=' * 70}")
+
+ # Generate step ID for tracing
+ self._step_count += 1
+ step_id = f"step-{self._step_count}"
+
+ # Emit step_start trace event if tracer is enabled
+ if self.tracer:
+ pre_url = self.browser.page.url if self.browser.page else None
+ _safe_tracer_call(
+ self.tracer,
+ "emit_step_start",
+ self.verbose,
+ step_id=step_id,
+ step_index=self._step_count,
+ goal=goal,
+ attempt=0,
+ pre_url=pre_url,
+ )
+
+ start_time = time.time()
+
+ try:
+ # Ensure screenshot is enabled
+ if snapshot_options is None:
+ snapshot_options = SnapshotOptions()
+
+ # Enable screenshot if not already enabled
+ if snapshot_options.screenshot is False or snapshot_options.screenshot is None:
+ from .models import ScreenshotConfig
+
+ snapshot_options.screenshot = ScreenshotConfig(format="png")
+
+ # Set goal if not already provided
+ if snapshot_options.goal is None:
+ snapshot_options.goal = goal
+
+ # Set limit if not provided
+ if snapshot_options.limit is None:
+ snapshot_options.limit = self.default_snapshot_limit
+
+ if self.verbose:
+ print(f"🎯 Goal: {goal}")
+ print("📸 Taking snapshot with screenshot...")
+
+ # 1. Take snapshot with screenshot (sync version)
+ snap = snapshot(self.browser, snapshot_options)
+
+ if snap.status != "success":
+ raise RuntimeError(f"Snapshot failed: {snap.error}")
+
+ if not snap.screenshot:
+ raise RuntimeError("Screenshot not available in snapshot")
+
+ # Compute diff_status by comparing with previous snapshot
+ elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)
+
+ # Create snapshot with diff_status populated
+ snap_with_diff = Snapshot(
+ status=snap.status,
+ timestamp=snap.timestamp,
+ url=snap.url,
+ viewport=snap.viewport,
+ elements=elements_with_diff,
+ screenshot=snap.screenshot,
+ screenshot_format=snap.screenshot_format,
+ error=snap.error,
+ )
+
+ # Update previous snapshot for next comparison
+ self._previous_snapshot = snap
+
+ # Emit snapshot trace event if tracer is enabled
+ if self.tracer:
+ # Build snapshot event data (use snap_with_diff to include diff_status)
+ snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
+
+ # Always include screenshot in trace event for studio viewer compatibility
+ if snap.screenshot:
+ # Extract base64 string from data URL if needed
+ if snap.screenshot.startswith("data:image"):
+ # Format: "data:image/jpeg;base64,{base64_string}"
+ screenshot_base64 = (
+ snap.screenshot.split(",", 1)[1]
+ if "," in snap.screenshot
+ else snap.screenshot
+ )
+ else:
+ screenshot_base64 = snap.screenshot
+
+ snapshot_data["screenshot_base64"] = screenshot_base64
+ if snap.screenshot_format:
+ snapshot_data["screenshot_format"] = snap.screenshot_format
+
+ _safe_tracer_call(
+ self.tracer,
+ "emit",
+ self.verbose,
+ "snapshot",
+ snapshot_data,
+ step_id=step_id,
+ )
+
+ if self.verbose:
+ print(f"✅ Snapshot taken: {len(snap.elements)} elements")
+
+ # 2. Draw labeled screenshot
+ if self.verbose:
+ print("🎨 Drawing bounding boxes and labels...")
+ print(f" Elements to label: {len(snap.elements)}")
+ if len(snap.elements) > 0:
+ element_ids = [el.id for el in snap.elements[:10]] # Show first 10
+ print(f" Sample element IDs: {element_ids}")
+
+ labeled_image = self._draw_labeled_screenshot(snap, snap.elements)
+
+ # Save labeled image to disk for debugging
+ # Save to playground/images if running from playground, otherwise use current directory
+ try:
+ # Try to detect if we're in a playground context
+ import sys
+
+ cwd = Path.cwd()
+ playground_path = None
+
+ # Check if current working directory contains playground
+ if (cwd / "playground").exists():
+ playground_path = cwd / "playground" / "images"
+ else:
+ # Check sys.path for playground
+ for path_str in sys.path:
+ path_obj = Path(path_str)
+ if "playground" in str(path_obj) and path_obj.exists():
+ # Find the playground directory
+ if path_obj.name == "playground":
+ playground_path = path_obj / "images"
+ break
+ elif (path_obj / "playground").exists():
+ playground_path = path_obj / "playground" / "images"
+ break
+
+ if playground_path is None:
+ # Fallback: use current working directory
+ playground_path = cwd / "playground" / "images"
+
+ images_dir = playground_path
+ images_dir.mkdir(parents=True, exist_ok=True)
+ image_uuid = str(uuid.uuid4())
+ image_filename = f"labeled_screenshot_{image_uuid}.png"
+ image_path = images_dir / image_filename
+ labeled_image.save(image_path, format="PNG")
+ if self.verbose:
+ print(f" 💾 Saved labeled screenshot: {image_path.absolute()}")
+ except Exception as save_error:
+ # Don't fail if image save fails - it's just for debugging
+ if self.verbose:
+ print(f" ⚠️ Could not save labeled screenshot: {save_error}")
+
+ # Use JPEG for better compression (smaller file size for vision APIs)
+ labeled_image_data_url = self._encode_image_to_base64(
+ labeled_image, format="JPEG", max_size_mb=20.0
+ )
+
+ # 3. Query LLM with vision (sync version)
+ if self.verbose:
+ print("🧠 Querying LLM with labeled screenshot...")
+
+ llm_response = self._query_llm_with_vision(labeled_image_data_url, goal)
+
+ # Emit LLM query trace event if tracer is enabled
+ if self.tracer:
+ _safe_tracer_call(
+ self.tracer,
+ "emit",
+ self.verbose,
+ "llm_query",
+ {
+ "prompt_tokens": llm_response.prompt_tokens,
+ "completion_tokens": llm_response.completion_tokens,
+ "model": llm_response.model_name,
+ "response": llm_response.content[:200], # Truncate for brevity
+ },
+ step_id=step_id,
+ )
+
+ if self.verbose:
+ print(f"💭 LLM Response: {llm_response.content}")
+
+ # Track token usage
+ self._track_tokens(goal, llm_response)
+
+ # 4. Extract element ID
+ element_id = self._extract_element_id(llm_response.content)
+
+ if element_id is None:
+ raise ValueError(
+ f"Could not extract element ID from LLM response: {llm_response.content}"
+ )
+
+ if self.verbose:
+ print(f"🎯 Extracted Element ID: {element_id}")
+
+ # 5. Click the element (sync version)
+ if self.verbose:
+ print(f"🖱️ Clicking element {element_id}...")
+
+ click_result = click(self.browser, element_id)
+
+ duration_ms = int((time.time() - start_time) * 1000)
+
+ # Create AgentActionResult from click result
+ result = AgentActionResult(
+ success=click_result.success,
+ action="click",
+ goal=goal,
+ duration_ms=duration_ms,
+ attempt=0,
+ element_id=element_id,
+ outcome=click_result.outcome,
+ url_changed=click_result.url_changed,
+ error=click_result.error,
+ )
+
+ # Emit action execution trace event if tracer is enabled
+ if self.tracer:
+ post_url = self.browser.page.url if self.browser.page else None
+
+ # Include element data for live overlay visualization
+ elements_data = [
+ {
+ "id": el.id,
+ "bbox": {
+ "x": el.bbox.x,
+ "y": el.bbox.y,
+ "width": el.bbox.width,
+ "height": el.bbox.height,
+ },
+ "role": el.role,
+ "text": el.text[:50] if el.text else "",
+ }
+ for el in snap.elements[:50]
+ ]
+
+ _safe_tracer_call(
+ self.tracer,
+ "emit",
+ self.verbose,
+ "action",
+ {
+ "action": result.action,
+ "element_id": result.element_id,
+ "success": result.success,
+ "outcome": result.outcome,
+ "duration_ms": duration_ms,
+ "post_url": post_url,
+ "elements": elements_data, # Add element data for overlay
+ "target_element_id": result.element_id, # Highlight target in red
+ },
+ step_id=step_id,
+ )
+
+ # Record history
+ self.history.append(
+ {
+ "goal": goal,
+ "action": f"CLICK({element_id})",
+ "result": result.model_dump(), # Store as dict
+ "success": result.success,
+ "attempt": 0,
+ "duration_ms": duration_ms,
+ }
+ )
+
+ if self.verbose:
+ status = "✅" if result.success else "❌"
+ print(f"{status} Completed in {duration_ms}ms")
+
+ # Emit step completion trace event if tracer is enabled
+ if self.tracer:
+ # Get pre_url from step_start (stored in tracer or use current)
+ pre_url = snap.url
+ post_url = self.browser.page.url if self.browser.page else None
+
+ # Compute snapshot digest (simplified - use URL + timestamp)
+ snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"
+
+ # Build LLM data
+ llm_response_text = llm_response.content
+
+ # Build execution data
+ exec_data = {
+ "success": result.success,
+ "outcome": result.outcome,
+ "action": result.action,
+ "element_id": result.element_id,
+ "url_changed": result.url_changed,
+ "duration_ms": duration_ms,
+ }
+
+ # Build verification data (simplified - always pass for now)
+ verify_data = {
+ "passed": result.success,
+ "signals": {
+ "url_changed": result.url_changed or False,
+ },
+ }
+
+ # Build complete step_end event
+ step_end_data = TraceEventBuilder.build_step_end_event(
+ step_id=step_id,
+ step_index=self._step_count,
+ goal=goal,
+ attempt=0,
+ pre_url=pre_url,
+ post_url=post_url or pre_url,
+ snapshot_digest=snapshot_digest,
+ llm_data={
+ "response_text": llm_response_text,
+ "response_hash": f"sha256:{self._compute_hash(llm_response_text)}",
+ },
+ exec_data=exec_data,
+ verify_data=verify_data,
+ )
+
+ _safe_tracer_call(
+ self.tracer,
+ "emit",
+ self.verbose,
+ "step_end",
+ step_end_data,
+ step_id=step_id,
+ )
+
+ return result
+
+ except Exception as e:
+ # Emit error trace event if tracer is enabled
+ if self.tracer:
+ _safe_tracer_call(
+ self.tracer,
+ "emit_error",
+ self.verbose,
+ step_id=step_id,
+ error=str(e),
+ attempt=0,
+ )
+
+ if self.verbose:
+ print(f"❌ Error: {e}")
+
+ # Re-raise the exception
+ raise
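End to end, act() takes a screenshot-enabled snapshot, draws the labeled overlay, asks a vision model for an element ID, clicks it, and emits step_start/snapshot/llm_query/action/step_end trace events along the way. A hypothetical usage sketch, relying only on the result fields exercised in the code above (agent construction is assumed to happen elsewhere and is not part of this diff):

# Hypothetical usage sketch: "agent" is an already-constructed sync visual agent
# (the counterpart of SentienceVisualAgentAsync referenced earlier in this diff).
result = agent.act("Click the 'Sign in' button")

if result.success:
    print(f"Clicked element {result.element_id} in {result.duration_ms} ms")
else:
    print(f"Click failed: {result.error}")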