sentienceapi 0.90.12__py3-none-any.whl → 0.92.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sentienceapi might be problematic.

Files changed (63)
  1. sentience/__init__.py +14 -5
  2. sentience/_extension_loader.py +40 -0
  3. sentience/action_executor.py +215 -0
  4. sentience/actions.py +408 -25
  5. sentience/agent.py +804 -310
  6. sentience/agent_config.py +3 -0
  7. sentience/async_api.py +101 -0
  8. sentience/base_agent.py +95 -0
  9. sentience/browser.py +594 -25
  10. sentience/browser_evaluator.py +299 -0
  11. sentience/cloud_tracing.py +458 -36
  12. sentience/conversational_agent.py +79 -45
  13. sentience/element_filter.py +136 -0
  14. sentience/expect.py +98 -2
  15. sentience/extension/background.js +56 -185
  16. sentience/extension/content.js +117 -289
  17. sentience/extension/injected_api.js +799 -1374
  18. sentience/extension/manifest.json +1 -1
  19. sentience/extension/pkg/sentience_core.js +190 -396
  20. sentience/extension/pkg/sentience_core_bg.wasm +0 -0
  21. sentience/extension/release.json +47 -47
  22. sentience/formatting.py +9 -53
  23. sentience/inspector.py +183 -1
  24. sentience/llm_interaction_handler.py +191 -0
  25. sentience/llm_provider.py +256 -28
  26. sentience/llm_provider_utils.py +120 -0
  27. sentience/llm_response_builder.py +153 -0
  28. sentience/models.py +66 -1
  29. sentience/overlay.py +109 -2
  30. sentience/protocols.py +228 -0
  31. sentience/query.py +1 -1
  32. sentience/read.py +95 -3
  33. sentience/recorder.py +223 -3
  34. sentience/schemas/trace_v1.json +102 -9
  35. sentience/screenshot.py +48 -2
  36. sentience/sentience_methods.py +86 -0
  37. sentience/snapshot.py +309 -64
  38. sentience/snapshot_diff.py +141 -0
  39. sentience/text_search.py +119 -5
  40. sentience/trace_event_builder.py +129 -0
  41. sentience/trace_file_manager.py +197 -0
  42. sentience/trace_indexing/index_schema.py +95 -7
  43. sentience/trace_indexing/indexer.py +117 -14
  44. sentience/tracer_factory.py +119 -6
  45. sentience/tracing.py +172 -8
  46. sentience/utils/__init__.py +40 -0
  47. sentience/utils/browser.py +46 -0
  48. sentience/utils/element.py +257 -0
  49. sentience/utils/formatting.py +59 -0
  50. sentience/utils.py +1 -1
  51. sentience/visual_agent.py +2056 -0
  52. sentience/wait.py +70 -4
  53. {sentienceapi-0.90.12.dist-info → sentienceapi-0.92.2.dist-info}/METADATA +61 -22
  54. sentienceapi-0.92.2.dist-info/RECORD +65 -0
  55. sentienceapi-0.92.2.dist-info/licenses/LICENSE +24 -0
  56. sentienceapi-0.92.2.dist-info/licenses/LICENSE-APACHE +201 -0
  57. sentienceapi-0.92.2.dist-info/licenses/LICENSE-MIT +21 -0
  58. sentience/extension/test-content.js +0 -4
  59. sentienceapi-0.90.12.dist-info/RECORD +0 -46
  60. sentienceapi-0.90.12.dist-info/licenses/LICENSE.md +0 -43
  61. {sentienceapi-0.90.12.dist-info → sentienceapi-0.92.2.dist-info}/WHEEL +0 -0
  62. {sentienceapi-0.90.12.dist-info → sentienceapi-0.92.2.dist-info}/entry_points.txt +0 -0
  63. {sentienceapi-0.90.12.dist-info → sentienceapi-0.92.2.dist-info}/top_level.txt +0 -0
sentience/visual_agent.py (new file)
@@ -0,0 +1,2056 @@
1
+ """
2
+ Visual Agent - Uses labeled screenshots with vision-capable LLMs
3
+
4
+ This agent extends SentienceAgentAsync to use visual prompts:
5
+ 1. Takes snapshot with screenshot enabled
6
+ 2. Draws bounding boxes and labels element IDs on the screenshot
7
+ 3. Uses anti-collision algorithm to position labels (4 sides + 4 corners)
8
+ 4. Sends labeled screenshot to vision-capable LLM
9
+ 5. Extracts element ID from LLM response
10
+ 6. Clicks the element using click_async
11
+
12
+ Dependencies:
13
+ - Pillow (PIL): Required for image processing and drawing bounding boxes
14
+ Install with: pip install Pillow
15
+ """
16
+
17
+ import base64
18
+ import hashlib
19
+ import io
20
+ import re
21
+ import time
22
+ import uuid
23
+ from pathlib import Path
24
+ from typing import TYPE_CHECKING, Any, Optional
25
+
26
+ from .actions import click, click_async
27
+ from .agent import SentienceAgent, SentienceAgentAsync, _safe_tracer_call
28
+ from .async_api import AsyncSentienceBrowser
29
+ from .browser import SentienceBrowser
30
+ from .llm_provider import LLMProvider, LLMResponse
31
+ from .models import AgentActionResult, Element, Snapshot, SnapshotOptions
32
+ from .snapshot import snapshot
33
+ from .snapshot_diff import SnapshotDiff
34
+ from .trace_event_builder import TraceEventBuilder
35
+
36
+ # Only import PIL types for type checking, not at runtime
37
+ if TYPE_CHECKING:
38
+ from PIL import Image, ImageDraw, ImageFont
39
+ else:
40
+ # Create a dummy type for runtime when PIL is not available
41
+ Image = None
42
+ ImageDraw = None
43
+ ImageFont = None
44
+
45
+ try:
46
+ from PIL import Image as PILImage, ImageDraw as PILImageDraw, ImageFont as PILImageFont
47
+
48
+ PIL_AVAILABLE = True
49
+ except ImportError:
50
+ PIL_AVAILABLE = False
51
+ # Define dummy values so type hints don't fail
52
+ PILImage = None # type: ignore
53
+ PILImageDraw = None # type: ignore
54
+ PILImageFont = None # type: ignore
55
+ # Don't print warning here - it will be printed when the class is instantiated
56
+
57
+
58
+ class SentienceVisualAgentAsync(SentienceAgentAsync):
59
+ """
60
+ Async visual agent that uses labeled screenshots with vision-capable LLMs.
61
+
62
+ Extends SentienceAgentAsync to override act() method with visual prompting.
63
+
64
+ Requirements:
65
+ - Pillow (PIL): Required for image processing and drawing bounding boxes
66
+ Install with: pip install Pillow
67
+ - Vision-capable LLM: Requires an LLM provider that supports vision (e.g., GPT-4o, Claude 3)
68
+ """
69
+
70
+ def __init__(
71
+ self,
72
+ browser: AsyncSentienceBrowser,
73
+ llm: LLMProvider,
74
+ default_snapshot_limit: int = 50,
75
+ verbose: bool = True,
76
+ tracer: Any | None = None,
77
+ config: Any | None = None,
78
+ ):
79
+ """
80
+ Initialize Visual Agent
81
+
82
+ Args:
83
+ browser: AsyncSentienceBrowser instance
84
+ llm: LLM provider (must support vision, e.g., GPT-4o, Claude 3)
85
+ default_snapshot_limit: Default maximum elements to include
86
+ verbose: Print execution logs
87
+ tracer: Optional Tracer instance
88
+ config: Optional AgentConfig
89
+ """
90
+ super().__init__(browser, llm, default_snapshot_limit, verbose, tracer, config)
91
+
92
+ if not PIL_AVAILABLE:
93
+ raise ImportError(
94
+ "PIL/Pillow is required for SentienceVisualAgentAsync. Install with: pip install Pillow"
95
+ )
96
+
97
+ # Track previous snapshot for diff computation
98
+ self._previous_snapshot: Snapshot | None = None
99
+
100
+ def _decode_screenshot(self, screenshot_data_url: str) -> "PILImage.Image":
101
+ """
102
+ Decode base64 screenshot data URL to PIL Image
103
+
104
+ Args:
105
+ screenshot_data_url: Base64-encoded data URL (e.g., "data:image/png;base64,...")
106
+
107
+ Returns:
108
+ PIL Image object
109
+ """
110
+ # Extract base64 data from data URL
111
+ if screenshot_data_url.startswith("data:image/"):
112
+ # Format: "data:image/png;base64,<base64_data>"
113
+ base64_data = screenshot_data_url.split(",", 1)[1]
114
+ else:
115
+ # Assume it's already base64
116
+ base64_data = screenshot_data_url
117
+
118
+ # Decode base64 to bytes
119
+ image_bytes = base64.b64decode(base64_data)
120
+
121
+ # Create PIL Image from bytes
122
+ return PILImage.open(io.BytesIO(image_bytes))
123
+
124
+ def _find_label_position(
125
+ self,
126
+ bbox: dict[str, float],
127
+ existing_labels: list[dict[str, Any]],
128
+ image_width: int,
129
+ image_height: int,
130
+ label_width: int,
131
+ label_height: int,
132
+ ) -> tuple[float, float]:
133
+ """
134
+ Find best position for label using anti-collision algorithm.
135
+
136
+ Tries 8 positions: 4 sides (top, bottom, left, right) + 4 corners (top-left, top-right, bottom-left, bottom-right)
137
+
138
+ Args:
139
+ bbox: Element bounding box {x, y, width, height}
140
+ existing_labels: List of existing label positions {x, y, width, height}
141
+ image_width: Screenshot width
142
+ image_height: Screenshot height
143
+ label_width: Label text width
144
+ label_height: Label text height
145
+
146
+ Returns:
147
+ (x, y) position for label
148
+ """
149
+ x, y, width, height = bbox["x"], bbox["y"], bbox["width"], bbox["height"]
150
+ center_x = x + width / 2
151
+ center_y = y + height / 2
152
+
153
+ # Anti-collision algorithm
154
+ # Define 8 candidate positions (4 sides + 4 corners)
155
+ # Increased distance from element to avoid confusion (15px instead of 5px)
156
+ label_offset = 15 # Increased from 5 to make labels more clearly separate
157
+ candidates = [
158
+ # 4 sides
159
+ (center_x - label_width / 2, y - label_height - label_offset, "top"), # Above element
160
+ (center_x - label_width / 2, y + height + label_offset, "bottom"), # Below element
161
+ (
162
+ x - label_width - label_offset,
163
+ center_y - label_height / 2,
164
+ "left",
165
+ ), # Left of element
166
+ (x + width + label_offset, center_y - label_height / 2, "right"), # Right of element
167
+ # 4 corners
168
+ (
169
+ x - label_width - label_offset,
170
+ y - label_height - label_offset,
171
+ "top-left",
172
+ ), # Top-left corner
173
+ (
174
+ x + width + label_offset,
175
+ y - label_height - label_offset,
176
+ "top-right",
177
+ ), # Top-right corner
178
+ (
179
+ x - label_width - label_offset,
180
+ y + height + label_offset,
181
+ "bottom-left",
182
+ ), # Bottom-left corner
183
+ (
184
+ x + width + label_offset,
185
+ y + height + label_offset,
186
+ "bottom-right",
187
+ ), # Bottom-right corner
188
+ ]
189
+
190
+ # Check each candidate position for collisions
191
+ for candidate_x, candidate_y, _ in candidates:
192
+ # Check bounds
193
+ if candidate_x < 0 or candidate_y < 0:
194
+ continue
195
+ if candidate_x + label_width > image_width or candidate_y + label_height > image_height:
196
+ continue
197
+
198
+ # Check collision with existing labels
199
+ collision = False
200
+ for existing in existing_labels:
201
+ ex, ey, ew, eh = existing["x"], existing["y"], existing["width"], existing["height"]
202
+ # Check if rectangles overlap
203
+ if not (
204
+ candidate_x + label_width < ex
205
+ or candidate_x > ex + ew
206
+ or candidate_y + label_height < ey
207
+ or candidate_y > ey + eh
208
+ ):
209
+ collision = True
210
+ break
211
+
212
+ if not collision:
213
+ return (candidate_x, candidate_y)
214
+
215
+ # If all positions collide, use top position (may overlap but better than nothing)
216
+ return (center_x - label_width / 2, y - label_height - 15)
217
+
218
+ def _draw_labeled_screenshot(
219
+ self,
220
+ snapshot: Snapshot,
221
+ elements: list[Element],
222
+ ) -> "PILImage.Image":
223
+ """
224
+ Draw bounding boxes and labels on screenshot.
225
+
226
+ Args:
227
+ snapshot: Snapshot with screenshot data
228
+ elements: List of elements to draw
229
+
230
+ Returns:
231
+ PIL Image with bounding boxes and labels
232
+ """
233
+ if not snapshot.screenshot:
234
+ raise ValueError("Screenshot not available in snapshot")
235
+
236
+ # Decode screenshot
237
+ img = self._decode_screenshot(snapshot.screenshot)
238
+ draw = PILImageDraw.Draw(img)
239
+
240
+ # Try to load a font, fallback to default if not available
241
+ try:
242
+ # Try to use a system font
243
+ font = PILImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 16)
244
+ except OSError:
245
+ try:
246
+ font = PILImageFont.truetype("arial.ttf", 16)
247
+ except OSError:
248
+ # Use default font if system fonts not available
249
+ font = PILImageFont.load_default()
250
+
251
+ image_width, image_height = img.size
252
+ existing_labels: list[dict[str, Any]] = []
253
+
254
+ # Neon green color: #39FF14 (bright, vibrant green)
255
+ neon_green = "#39FF14"
256
+
257
+ # Draw bounding boxes and labels for each element
258
+ for element in elements:
259
+ bbox = element.bbox
260
+ x, y, width, height = bbox.x, bbox.y, bbox.width, bbox.height
261
+
262
+ # Draw bounding box rectangle (neon green with 2px width)
263
+ draw.rectangle(
264
+ [(x, y), (x + width, y + height)],
265
+ outline=neon_green,
266
+ width=2,
267
+ )
268
+
269
+ # Prepare label text (just the number - keep it simple and compact)
270
+ label_text = str(element.id)
271
+
272
+ # Measure label text size
273
+ bbox_text = draw.textbbox((0, 0), label_text, font=font)
274
+ label_width = bbox_text[2] - bbox_text[0]
275
+ label_height = bbox_text[3] - bbox_text[1]
276
+
277
+ # Find best position for label (anti-collision)
278
+ label_x, label_y = self._find_label_position(
279
+ {"x": x, "y": y, "width": width, "height": height},
280
+ existing_labels,
281
+ image_width,
282
+ image_height,
283
+ label_width + 8, # Add padding
284
+ label_height + 4, # Add padding
285
+ )
286
+
287
+ # Calculate connection points for a clearer visual link
288
+ # Connect from the nearest corner/edge of element to the label
289
+ element_center_x = x + width / 2
290
+ element_center_y = y + height / 2
291
+ label_center_x = label_x + label_width / 2
292
+ label_center_y = label_y + label_height / 2
293
+
294
+ # Determine which edge of the element is closest to the label
295
+ # and draw line from that edge point to the label
296
+ dist_top = abs(label_center_y - y)
297
+ dist_bottom = abs(label_center_y - (y + height))
298
+ dist_left = abs(label_center_x - x)
299
+ dist_right = abs(label_center_x - (x + width))
300
+
301
+ min_dist = min(dist_top, dist_bottom, dist_left, dist_right)
302
+
303
+ if min_dist == dist_top:
304
+ # Label is above - connect from top edge
305
+ line_start = (element_center_x, y)
306
+ elif min_dist == dist_bottom:
307
+ # Label is below - connect from bottom edge
308
+ line_start = (element_center_x, y + height)
309
+ elif min_dist == dist_left:
310
+ # Label is left - connect from left edge
311
+ line_start = (x, element_center_y)
312
+ else:
313
+ # Label is right - connect from right edge
314
+ line_start = (x + width, element_center_y)
315
+
316
+ # Draw connecting line from element edge to label (makes it clear the label belongs to the element)
317
+ draw.line(
318
+ [line_start, (label_center_x, label_center_y)],
319
+ fill=neon_green,
320
+ width=2, # Slightly thicker for better visibility
321
+ )
322
+
323
+ # Draw label background (white with neon green border)
324
+ label_bg_x1 = label_x - 4
325
+ label_bg_y1 = label_y - 2
326
+ label_bg_x2 = label_x + label_width + 4
327
+ label_bg_y2 = label_y + label_height + 2
328
+
329
+ # Draw white background with neon green border (makes label stand out as separate)
330
+ draw.rectangle(
331
+ [(label_bg_x1, label_bg_y1), (label_bg_x2, label_bg_y2)],
332
+ fill="white",
333
+ outline=neon_green,
334
+ width=2, # Thicker border to make it more distinct
335
+ )
336
+
337
+ # Draw label text (black for high contrast)
338
+ draw.text(
339
+ (label_x, label_y),
340
+ label_text,
341
+ fill="black",
342
+ font=font,
343
+ )
344
+
345
+ # Record label position for collision detection
346
+ existing_labels.append(
347
+ {
348
+ "x": label_bg_x1,
349
+ "y": label_bg_y1,
350
+ "width": label_bg_x2 - label_bg_x1,
351
+ "height": label_bg_y2 - label_bg_y1,
352
+ }
353
+ )
354
+
355
+ return img
356
+
357
+ def _encode_image_to_base64(
358
+ self, image: "PILImage.Image", format: str = "PNG", max_size_mb: float = 20.0
359
+ ) -> str:
360
+ """
361
+ Encode PIL Image to base64 data URL with size optimization.
362
+
363
+ Vision LLM APIs typically have size limits (e.g., 20MB for OpenAI).
364
+ This function automatically compresses images if they're too large.
365
+
366
+ Args:
367
+ image: PIL Image object
368
+ format: Image format (PNG or JPEG)
369
+ max_size_mb: Maximum size in MB before compression (default: 20MB)
370
+
371
+ Returns:
372
+ Base64-encoded data URL
373
+ """
374
+ # Convert format for PIL
375
+ pil_format = format.upper()
376
+
377
+ # Try JPEG first for better compression (unless PNG is specifically requested)
378
+ if format.upper() != "PNG":
379
+ pil_format = "JPEG"
380
+ # Convert RGBA to RGB for JPEG
381
+ if image.mode in ("RGBA", "LA", "P"):
382
+ # Create white background
383
+ rgb_image = PILImage.new("RGB", image.size, (255, 255, 255))
384
+ if image.mode == "P":
385
+ image = image.convert("RGBA")
386
+ rgb_image.paste(image, mask=image.split()[-1] if image.mode == "RGBA" else None)
387
+ image = rgb_image
388
+
389
+ buffer = io.BytesIO()
390
+ quality = 95 # Start with high quality
391
+
392
+ # Try to fit within size limit
393
+ for attempt in range(3):
394
+ buffer.seek(0)
395
+ buffer.truncate(0)
396
+
397
+ if pil_format == "JPEG":
398
+ image.save(buffer, format=pil_format, quality=quality, optimize=True)
399
+ else:
400
+ image.save(buffer, format=pil_format, optimize=True)
401
+
402
+ size_mb = len(buffer.getvalue()) / (1024 * 1024)
403
+
404
+ if size_mb <= max_size_mb:
405
+ break
406
+
407
+ # Reduce quality for next attempt
408
+ quality = max(70, quality - 15)
409
+ if self.verbose and attempt == 0:
410
+ print(f" ⚠️ Image size {size_mb:.2f}MB exceeds limit, compressing...")
411
+
412
+ image_bytes = buffer.getvalue()
413
+ base64_data = base64.b64encode(image_bytes).decode("utf-8")
414
+
415
+ final_size_mb = len(image_bytes) / (1024 * 1024)
416
+ if self.verbose:
417
+ print(f" 📸 Image encoded: {final_size_mb:.2f}MB ({len(base64_data)} chars base64)")
418
+
419
+ mime_type = "image/png" if pil_format == "PNG" else "image/jpeg"
420
+ return f"data:{mime_type};base64,{base64_data}"
421
+
422
+ async def _query_llm_with_vision(
423
+ self,
424
+ image_data_url: str,
425
+ goal: str,
426
+ ) -> LLMResponse:
427
+ """
428
+ Query LLM with vision (labeled screenshot).
429
+
430
+ Args:
431
+ image_data_url: Base64-encoded image data URL
432
+ goal: User's goal/task
433
+
434
+ Returns:
435
+ LLMResponse with element ID
436
+ """
437
+ system_prompt = """You are a web automation assistant. You will see a screenshot of a web page with labeled element IDs.
438
+ Each clickable element has:
439
+ - A bright neon green (#39FF14) bounding box around the element
440
+ - A white label box with a number (the element ID) connected by a green line
441
+ - The label is clearly separate from the element (not part of the UI)
442
+
443
+ CRITICAL INSTRUCTIONS:
444
+ 1. Look at the screenshot carefully
445
+ 2. Find the element that matches the user's goal (ignore the white label boxes - they are annotations, not UI elements)
446
+ 3. Follow the green line from that element to find its label box with the ID number
447
+ 4. Respond with ONLY that integer ID number (e.g., "42" or "1567")
448
+ 5. Do NOT include any explanation, reasoning, or other text
449
+ 6. Do NOT say "element 1" or "the first element" - just return the number
450
+ 7. Do NOT confuse the white label box with an interactive element - labels are annotations connected by green lines
451
+
452
+ Example responses:
453
+ - Correct: "42"
454
+ - Correct: "1567"
455
+ - Wrong: "I see element 42"
456
+ - Wrong: "The element ID is 42"
457
+ - Wrong: "42 (the search box)" """
458
+
459
+ user_prompt = f"""Goal: {goal}
460
+
461
+ Look at the screenshot. Each element has a neon green bounding box with a white label showing its ID number.
462
+ Find the element that should be clicked to accomplish this goal.
463
+ Return ONLY the integer ID number from the label, nothing else."""
464
+
465
+ # Check if LLM provider supports vision (OpenAI GPT-4o, Claude, etc.)
466
+ # Vision-capable providers use similar message format with image_url
467
+ if hasattr(self.llm, "client") and hasattr(self.llm.client, "chat"):
468
+ # Vision-capable provider - use vision API
469
+ try:
470
+ from openai import OpenAI
471
+
472
+ # Check if it's OpenAI
473
+ if isinstance(self.llm.client, OpenAI):
474
+ messages = [
475
+ {
476
+ "role": "system",
477
+ "content": system_prompt,
478
+ },
479
+ {
480
+ "role": "user",
481
+ "content": [
482
+ {"type": "text", "text": user_prompt},
483
+ {
484
+ "type": "image_url",
485
+ "image_url": {"url": image_data_url},
486
+ },
487
+ ],
488
+ },
489
+ ]
490
+
491
+ response = self.llm.client.chat.completions.create(
492
+ model=self.llm._model_name,
493
+ messages=messages,
494
+ temperature=0.0,
495
+ # Removed max_tokens to use API default (usually higher limit)
496
+ )
497
+
498
+ content = response.choices[0].message.content or ""
499
+ usage = response.usage
500
+
501
+ from .llm_response_builder import LLMResponseBuilder
502
+
503
+ return LLMResponseBuilder.from_openai_format(
504
+ content=content,
505
+ prompt_tokens=usage.prompt_tokens if usage else None,
506
+ completion_tokens=usage.completion_tokens if usage else None,
507
+ total_tokens=usage.total_tokens if usage else None,
508
+ model_name=response.model,
509
+ finish_reason=response.choices[0].finish_reason,
510
+ )
511
+
512
+ # Check if provider supports vision API (uses OpenAI-compatible format)
513
+ elif hasattr(self.llm, "client") and hasattr(self.llm.client, "chat"):
514
+ # Vision API uses similar format to OpenAI
515
+ if self.verbose:
516
+ print(f" 🔍 Using vision API with model: {self.llm._model_name}")
517
+ print(f" 📐 Image data URL length: {len(image_data_url)} chars")
518
+
519
+ messages = [
520
+ {
521
+ "role": "system",
522
+ "content": system_prompt,
523
+ },
524
+ {
525
+ "role": "user",
526
+ "content": [
527
+ {"type": "text", "text": user_prompt},
528
+ {
529
+ "type": "image_url",
530
+ "image_url": {"url": image_data_url},
531
+ },
532
+ ],
533
+ },
534
+ ]
535
+
536
+ try:
537
+ if self.verbose:
538
+ print(f" 📤 Sending request to vision API...")
539
+ print(f" 📋 Messages structure: {len(messages)} messages")
540
+ print(f" 🖼️ Image URL prefix: {image_data_url[:50]}...")
541
+
542
+ # Removed max_tokens to use API default (usually higher limit)
543
+ # This allows the model to generate complete responses without truncation
544
+ response = self.llm.client.chat.completions.create(
545
+ model=self.llm._model_name,
546
+ messages=messages,
547
+ temperature=0.0,
548
+ # No max_tokens - use API default
549
+ )
550
+
551
+ # Debug: Check response structure
552
+ if self.verbose:
553
+ print(f" 📥 Response received")
554
+ print(f" 📦 Response type: {type(response)}")
555
+ print(
556
+ f" 📦 Choices count: {len(response.choices) if hasattr(response, 'choices') else 0}"
557
+ )
558
+
559
+ if not hasattr(response, "choices") or len(response.choices) == 0:
560
+ raise ValueError("Vision API returned no choices in response")
561
+
562
+ choice = response.choices[0]
563
+ content = (
564
+ choice.message.content if hasattr(choice.message, "content") else None
565
+ )
566
+ finish_reason = (
567
+ choice.finish_reason if hasattr(choice, "finish_reason") else None
568
+ )
569
+
570
+ if self.verbose:
571
+ print(f" 📝 Content: {repr(content)}")
572
+ print(f" 🏁 Finish reason: {finish_reason}")
573
+ if finish_reason:
574
+ print(f" ⚠️ Finish reason indicates: {finish_reason}")
575
+ if finish_reason == "length":
576
+ print(
577
+ f" - Response was truncated (hit API default max_tokens limit)"
578
+ )
579
+ print(
580
+ f" - This might indicate the model needs more tokens or doesn't support vision properly"
581
+ )
582
+ # Even if truncated, there might be partial content
583
+ if content:
584
+ print(
585
+ f" - ⚠️ Partial content received: {repr(content)}"
586
+ )
587
+ elif finish_reason == "content_filter":
588
+ print(f" - Content was filtered by safety filters")
589
+ elif finish_reason == "stop":
590
+ print(f" - Normal completion")
591
+
592
+ # If finish_reason is "length", we might still have partial content
593
+ # Try to use it if available (even if truncated, it might contain the element ID)
594
+ if finish_reason == "length" and content and content.strip():
595
+ if self.verbose:
596
+ print(f" ⚠️ Using truncated response: {repr(content)}")
597
+ # Continue processing with partial content
598
+
599
+ if content is None or content == "":
600
+ error_msg = f"Vision API returned empty content (finish_reason: {finish_reason})"
601
+ if self.verbose:
602
+ print(f" ❌ {error_msg}")
603
+ print(f" 💡 Possible causes:")
604
+ print(
605
+ f" - Model {self.llm._model_name} may not support vision"
606
+ )
607
+ print(f" - Image format might not be supported")
608
+ print(f" - API default max_tokens might be too restrictive")
609
+ print(f" - API response structure might be different")
610
+ if finish_reason == "length":
611
+ print(
612
+ f" - ⚠️ Response was truncated - content might have been cut off"
613
+ )
614
+ print(
615
+ f" - Try increasing max_tokens or check response.choices[0].message for partial content"
616
+ )
617
+ raise ValueError(error_msg)
618
+
619
+ usage = response.usage if hasattr(response, "usage") else None
620
+
621
+ if self.verbose:
622
+ print(f" ✅ Vision API response received")
623
+ print(
624
+ f" 📊 Tokens: {usage.total_tokens if usage else 'N/A'} (prompt: {usage.prompt_tokens if usage else 'N/A'}, completion: {usage.completion_tokens if usage else 'N/A'})"
625
+ )
626
+
627
+ from .llm_response_builder import LLMResponseBuilder
628
+
629
+ return LLMResponseBuilder.from_openai_format(
630
+ content=content,
631
+ prompt_tokens=usage.prompt_tokens if usage else None,
632
+ completion_tokens=usage.completion_tokens if usage else None,
633
+ total_tokens=usage.total_tokens if usage else None,
634
+ model_name=(
635
+ response.model
636
+ if hasattr(response, "model")
637
+ else self.llm._model_name
638
+ ),
639
+ finish_reason=finish_reason,
640
+ )
641
+ except Exception as vision_error:
642
+ if self.verbose:
643
+ print(f" ❌ Vision API error: {vision_error}")
644
+ print(f" 💡 This might indicate:")
645
+ print(f" - Model {self.llm._model_name} doesn't support vision")
646
+ print(f" - Image format/size issue")
647
+ print(f" - API key or permissions issue")
648
+ print(f" 🔄 Attempting fallback to regular generate method...")
649
+
650
+ # Fallback: Try using the regular generate method
651
+ # Some models might need images passed differently
652
+ try:
653
+ # Try embedding image in the prompt as base64
654
+ fallback_prompt = f"{user_prompt}\n\n[Image: {image_data_url[:200]}...]"
655
+ fallback_response = self.llm.generate(
656
+ system_prompt,
657
+ fallback_prompt,
658
+ temperature=0.0,
659
+ # No max_tokens - use API default
660
+ )
661
+ if self.verbose:
662
+ print(f" ⚠️ Using fallback method (may not support vision)")
663
+ return fallback_response
664
+ except Exception as fallback_error:
665
+ if self.verbose:
666
+ print(f" ❌ Fallback also failed: {fallback_error}")
667
+ raise vision_error # Raise original error
668
+ except ImportError:
669
+ # openai or other vision SDK not available
670
+ pass
671
+ except Exception as e:
672
+ if self.verbose:
673
+ print(f"⚠️ Vision API error: {e}, falling back to text-only")
674
+
675
+ # Fallback: Try to pass image via kwargs or use text-only
676
+ # Some providers might accept image in kwargs
677
+ try:
678
+ return self.llm.generate(
679
+ system_prompt,
680
+ f"{user_prompt}\n\n[Image data: {image_data_url[:100]}...]",
681
+ temperature=0.0,
682
+ # No max_tokens - use API default
683
+ )
684
+ except Exception as e:
685
+ raise RuntimeError(
686
+ f"LLM provider {type(self.llm).__name__} may not support vision. "
687
+ f"Error: {e}. Use a vision-capable model like GPT-4o or Claude 3."
688
+ ) from e
689
+
690
+ def _extract_element_id(self, llm_response: str) -> int | None:
691
+ """
692
+ Extract element ID integer from LLM response.
693
+
694
+ Args:
695
+ llm_response: LLM response text
696
+
697
+ Returns:
698
+ Element ID as integer, or None if not found
699
+ """
700
+ if self.verbose:
701
+ print(f"🔍 Raw LLM response: {repr(llm_response)}")
702
+
703
+ # Clean the response - remove leading/trailing whitespace (handles '\n177', '177\n', etc.)
704
+ cleaned = llm_response.strip()
705
+
706
+ if self.verbose:
707
+ print(f" 🧹 After strip: {repr(cleaned)}")
708
+
709
+ # Remove common prefixes that LLMs might add
710
+ prefixes_to_remove = [
711
+ "element",
712
+ "id",
713
+ "the element",
714
+ "element id",
715
+ "the id",
716
+ "click",
717
+ "click on",
718
+ "select",
719
+ "choose",
720
+ ]
721
+ for prefix in prefixes_to_remove:
722
+ if cleaned.lower().startswith(prefix):
723
+ cleaned = cleaned[len(prefix) :].strip()
724
+ # Remove any remaining punctuation
725
+ cleaned = cleaned.lstrip(":.,;!?()[]{}")
726
+ cleaned = cleaned.strip()
727
+ if self.verbose:
728
+ print(f" 🧹 After removing prefix '{prefix}': {repr(cleaned)}")
729
+
730
+ # Try to find all integers in the cleaned response
731
+ numbers = re.findall(r"\d+", cleaned)
732
+
733
+ if self.verbose:
734
+ print(f" 🔢 Numbers found: {numbers}")
735
+
736
+ if numbers:
737
+ # If multiple numbers found, prefer the largest one (likely the actual element ID)
738
+ # Element IDs are typically larger numbers, not small ones like "1"
739
+ try:
740
+ # Convert all to int
741
+ int_numbers = [int(n) for n in numbers]
742
+ if self.verbose:
743
+ print(f" 🔢 As integers: {int_numbers}")
744
+
745
+ # Prefer larger numbers (element IDs are usually > 10)
746
+ # But if only small numbers exist, use the first one
747
+ large_numbers = [n for n in int_numbers if n > 10]
748
+ if large_numbers:
749
+ element_id = max(large_numbers) # Take the largest
750
+ if self.verbose:
751
+ print(f" ✅ Selected largest number > 10: {element_id}")
752
+ else:
753
+ element_id = int_numbers[0] # Fallback to first if all are small
754
+ if self.verbose:
755
+ print(f" ⚠️ All numbers ≤ 10, using first: {element_id}")
756
+
757
+ if self.verbose:
758
+ print(f"✅ Extracted element ID: {element_id} (from {numbers})")
759
+ return element_id
760
+ except ValueError:
761
+ if self.verbose:
762
+ print(f" ❌ Failed to convert numbers to integers")
763
+ pass
764
+
765
+ if self.verbose:
766
+ print(f"⚠️ Could not extract element ID from response: {llm_response}")
767
+ return None
768
+
769
+ def _compute_hash(self, text: str) -> str:
770
+ """Compute SHA256 hash of text."""
771
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()
772
+
773
+ async def act(
774
+ self,
775
+ goal: str,
776
+ max_retries: int = 2,
777
+ snapshot_options: SnapshotOptions | None = None,
778
+ ) -> AgentActionResult:
779
+ """
780
+ Override act() method to use visual prompting with full tracing support.
781
+
782
+ Args:
783
+ goal: User's goal/task
784
+ max_retries: Maximum retry attempts
785
+ snapshot_options: Optional snapshot options (screenshot will be enabled)
786
+
787
+ Returns:
788
+ AgentActionResult
789
+ """
790
+ if self.verbose:
791
+ print(f"\n{'=' * 70}")
792
+ print(f"🤖 Visual Agent Goal: {goal}")
793
+ print(f"{'=' * 70}")
794
+
795
+ # Generate step ID for tracing
796
+ self._step_count += 1
797
+ step_id = f"step-{self._step_count}"
798
+
799
+ # Emit step_start trace event if tracer is enabled
800
+ if self.tracer:
801
+ pre_url = self.browser.page.url if self.browser.page else None
802
+ _safe_tracer_call(
803
+ self.tracer,
804
+ "emit_step_start",
805
+ self.verbose,
806
+ step_id=step_id,
807
+ step_index=self._step_count,
808
+ goal=goal,
809
+ attempt=0,
810
+ pre_url=pre_url,
811
+ )
812
+
813
+ start_time = time.time()
814
+
815
+ try:
816
+ # Ensure screenshot is enabled
817
+ if snapshot_options is None:
818
+ snapshot_options = SnapshotOptions()
819
+
820
+ # Enable screenshot if not already enabled
821
+ if snapshot_options.screenshot is False or snapshot_options.screenshot is None:
822
+ from .models import ScreenshotConfig
823
+
824
+ snapshot_options.screenshot = ScreenshotConfig(format="png")
825
+
826
+ # Set goal if not already provided
827
+ if snapshot_options.goal is None:
828
+ snapshot_options.goal = goal
829
+
830
+ # Set limit if not provided
831
+ if snapshot_options.limit is None:
832
+ snapshot_options.limit = self.default_snapshot_limit
833
+
834
+ if self.verbose:
835
+ print(f"🎯 Goal: {goal}")
836
+ print("📸 Taking snapshot with screenshot...")
837
+
838
+ # 1. Take snapshot with screenshot
839
+ from .snapshot import snapshot_async
840
+
841
+ snap = await snapshot_async(self.browser, snapshot_options)
842
+
843
+ if snap.status != "success":
844
+ raise RuntimeError(f"Snapshot failed: {snap.error}")
845
+
846
+ if not snap.screenshot:
847
+ raise RuntimeError("Screenshot not available in snapshot")
848
+
849
+ # Compute diff_status by comparing with previous snapshot
850
+ elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)
851
+
852
+ # Create snapshot with diff_status populated
853
+ snap_with_diff = Snapshot(
854
+ status=snap.status,
855
+ timestamp=snap.timestamp,
856
+ url=snap.url,
857
+ viewport=snap.viewport,
858
+ elements=elements_with_diff,
859
+ screenshot=snap.screenshot,
860
+ screenshot_format=snap.screenshot_format,
861
+ error=snap.error,
862
+ )
863
+
864
+ # Update previous snapshot for next comparison
865
+ self._previous_snapshot = snap
866
+
867
+ # Emit snapshot trace event if tracer is enabled
868
+ if self.tracer:
869
+ # Build snapshot event data (use snap_with_diff to include diff_status)
870
+ snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
871
+
872
+ # Always include screenshot in trace event for studio viewer compatibility
873
+ if snap.screenshot:
874
+ # Extract base64 string from data URL if needed
875
+ if snap.screenshot.startswith("data:image"):
876
+ # Format: "data:image/jpeg;base64,{base64_string}"
877
+ screenshot_base64 = (
878
+ snap.screenshot.split(",", 1)[1]
879
+ if "," in snap.screenshot
880
+ else snap.screenshot
881
+ )
882
+ else:
883
+ screenshot_base64 = snap.screenshot
884
+
885
+ snapshot_data["screenshot_base64"] = screenshot_base64
886
+ if snap.screenshot_format:
887
+ snapshot_data["screenshot_format"] = snap.screenshot_format
888
+
889
+ _safe_tracer_call(
890
+ self.tracer,
891
+ "emit",
892
+ self.verbose,
893
+ "snapshot",
894
+ snapshot_data,
895
+ step_id=step_id,
896
+ )
897
+
898
+ if self.verbose:
899
+ print(f"✅ Snapshot taken: {len(snap.elements)} elements")
900
+
901
+ # 2. Draw labeled screenshot
902
+ if self.verbose:
903
+ print("🎨 Drawing bounding boxes and labels...")
904
+ print(f" Elements to label: {len(snap.elements)}")
905
+ if len(snap.elements) > 0:
906
+ element_ids = [el.id for el in snap.elements[:10]] # Show first 10
907
+ print(f" Sample element IDs: {element_ids}")
908
+
909
+ labeled_image = self._draw_labeled_screenshot(snap, snap.elements)
910
+
911
+ # Save labeled image to disk for debugging
912
+ # Save to playground/images if running from playground, otherwise use current directory
913
+ try:
914
+ # Try to detect if we're in a playground context
915
+ import sys
916
+
917
+ cwd = Path.cwd()
918
+ playground_path = None
919
+
920
+ # Check if current working directory contains playground
921
+ if (cwd / "playground").exists():
922
+ playground_path = cwd / "playground" / "images"
923
+ else:
924
+ # Check sys.path for playground
925
+ for path_str in sys.path:
926
+ path_obj = Path(path_str)
927
+ if "playground" in str(path_obj) and path_obj.exists():
928
+ # Find the playground directory
929
+ if path_obj.name == "playground":
930
+ playground_path = path_obj / "images"
931
+ break
932
+ elif (path_obj / "playground").exists():
933
+ playground_path = path_obj / "playground" / "images"
934
+ break
935
+
936
+ if playground_path is None:
937
+ # Fallback: use current working directory
938
+ playground_path = cwd / "playground" / "images"
939
+
940
+ images_dir = playground_path
941
+ images_dir.mkdir(parents=True, exist_ok=True)
942
+ image_uuid = str(uuid.uuid4())
943
+ image_filename = f"labeled_screenshot_{image_uuid}.png"
944
+ image_path = images_dir / image_filename
945
+ labeled_image.save(image_path, format="PNG")
946
+ if self.verbose:
947
+ print(f" 💾 Saved labeled screenshot: {image_path.absolute()}")
948
+ except Exception as save_error:
949
+ # Don't fail if image save fails - it's just for debugging
950
+ if self.verbose:
951
+ print(f" ⚠️ Could not save labeled screenshot: {save_error}")
952
+
953
+ # Use JPEG for better compression (smaller file size for vision APIs)
954
+ labeled_image_data_url = self._encode_image_to_base64(
955
+ labeled_image, format="JPEG", max_size_mb=20.0
956
+ )
957
+
958
+ # 3. Query LLM with vision
959
+ if self.verbose:
960
+ print("🧠 Querying LLM with labeled screenshot...")
961
+
962
+ llm_response = await self._query_llm_with_vision(labeled_image_data_url, goal)
963
+
964
+ # Emit LLM query trace event if tracer is enabled
965
+ if self.tracer:
966
+ _safe_tracer_call(
967
+ self.tracer,
968
+ "emit",
969
+ self.verbose,
970
+ "llm_query",
971
+ {
972
+ "prompt_tokens": llm_response.prompt_tokens,
973
+ "completion_tokens": llm_response.completion_tokens,
974
+ "model": llm_response.model_name,
975
+ "response": llm_response.content[:200], # Truncate for brevity
976
+ },
977
+ step_id=step_id,
978
+ )
979
+
980
+ if self.verbose:
981
+ print(f"💭 LLM Response: {llm_response.content}")
982
+
983
+ # Track token usage
984
+ self._track_tokens(goal, llm_response)
985
+
986
+ # 4. Extract element ID
987
+ element_id = self._extract_element_id(llm_response.content)
988
+
989
+ if element_id is None:
990
+ raise ValueError(
991
+ f"Could not extract element ID from LLM response: {llm_response.content}"
992
+ )
993
+
994
+ if self.verbose:
995
+ print(f"🎯 Extracted Element ID: {element_id}")
996
+
997
+ # 5. Click the element
998
+ if self.verbose:
999
+ print(f"🖱️ Clicking element {element_id}...")
1000
+
1001
+ click_result = await click_async(self.browser, element_id)
1002
+
1003
+ duration_ms = int((time.time() - start_time) * 1000)
1004
+
1005
+ # Create AgentActionResult from click result
1006
+ result = AgentActionResult(
1007
+ success=click_result.success,
1008
+ action="click",
1009
+ goal=goal,
1010
+ duration_ms=duration_ms,
1011
+ attempt=0,
1012
+ element_id=element_id,
1013
+ outcome=click_result.outcome,
1014
+ url_changed=click_result.url_changed,
1015
+ error=click_result.error,
1016
+ )
1017
+
1018
+ # Emit action execution trace event if tracer is enabled
1019
+ if self.tracer:
1020
+ post_url = self.browser.page.url if self.browser.page else None
1021
+
1022
+ # Include element data for live overlay visualization
1023
+ elements_data = [
1024
+ {
1025
+ "id": el.id,
1026
+ "bbox": {
1027
+ "x": el.bbox.x,
1028
+ "y": el.bbox.y,
1029
+ "width": el.bbox.width,
1030
+ "height": el.bbox.height,
1031
+ },
1032
+ "role": el.role,
1033
+ "text": el.text[:50] if el.text else "",
1034
+ }
1035
+ for el in snap.elements[:50]
1036
+ ]
1037
+
1038
+ _safe_tracer_call(
1039
+ self.tracer,
1040
+ "emit",
1041
+ self.verbose,
1042
+ "action",
1043
+ {
1044
+ "action": result.action,
1045
+ "element_id": result.element_id,
1046
+ "success": result.success,
1047
+ "outcome": result.outcome,
1048
+ "duration_ms": duration_ms,
1049
+ "post_url": post_url,
1050
+ "elements": elements_data, # Add element data for overlay
1051
+ "target_element_id": result.element_id, # Highlight target in red
1052
+ },
1053
+ step_id=step_id,
1054
+ )
1055
+
1056
+ # Record history
1057
+ self.history.append(
1058
+ {
1059
+ "goal": goal,
1060
+ "action": f"CLICK({element_id})",
1061
+ "result": result.model_dump(), # Store as dict
1062
+ "success": result.success,
1063
+ "attempt": 0,
1064
+ "duration_ms": duration_ms,
1065
+ }
1066
+ )
1067
+
1068
+ if self.verbose:
1069
+ status = "✅" if result.success else "❌"
1070
+ print(f"{status} Completed in {duration_ms}ms")
1071
+
1072
+ # Emit step completion trace event if tracer is enabled
1073
+ if self.tracer:
1074
+ # Get pre_url from step_start (stored in tracer or use current)
1075
+ pre_url = snap.url
1076
+ post_url = self.browser.page.url if self.browser.page else None
1077
+
1078
+ # Compute snapshot digest (simplified - use URL + timestamp)
1079
+ snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"
1080
+
1081
+ # Build LLM data
1082
+ llm_response_text = llm_response.content
1083
+
1084
+ # Build execution data
1085
+ exec_data = {
1086
+ "success": result.success,
1087
+ "outcome": result.outcome,
1088
+ "action": result.action,
1089
+ "element_id": result.element_id,
1090
+ "url_changed": result.url_changed,
1091
+ "duration_ms": duration_ms,
1092
+ }
1093
+
1094
+ # Build verification data (simplified - always pass for now)
1095
+ verify_data = {
1096
+ "passed": result.success,
1097
+ "signals": {
1098
+ "url_changed": result.url_changed or False,
1099
+ },
1100
+ }
1101
+
1102
+ # Build complete step_end event
1103
+ step_end_data = TraceEventBuilder.build_step_end_event(
1104
+ step_id=step_id,
1105
+ step_index=self._step_count,
1106
+ goal=goal,
1107
+ attempt=0,
1108
+ pre_url=pre_url,
1109
+ post_url=post_url or pre_url,
1110
+ snapshot_digest=snapshot_digest,
1111
+ llm_data={
1112
+ "response_text": llm_response_text,
1113
+ "response_hash": f"sha256:{self._compute_hash(llm_response_text)}",
1114
+ },
1115
+ exec_data=exec_data,
1116
+ verify_data=verify_data,
1117
+ )
1118
+
1119
+ _safe_tracer_call(
1120
+ self.tracer,
1121
+ "emit",
1122
+ self.verbose,
1123
+ "step_end",
1124
+ step_end_data,
1125
+ step_id=step_id,
1126
+ )
1127
+
1128
+ return result
1129
+
1130
+ except Exception as e:
1131
+ # Emit error trace event if tracer is enabled
1132
+ if self.tracer:
1133
+ _safe_tracer_call(
1134
+ self.tracer,
1135
+ "emit_error",
1136
+ self.verbose,
1137
+ step_id=step_id,
1138
+ error=str(e),
1139
+ attempt=0,
1140
+ )
1141
+
1142
+ if self.verbose:
1143
+ print(f"❌ Error: {e}")
1144
+
1145
+ # Re-raise the exception
1146
+ raise
1147
+
1148
+
1149
+ class SentienceVisualAgent(SentienceAgent):
1150
+ """
1151
+ Sync visual agent that uses labeled screenshots with vision-capable LLMs.
1152
+
1153
+ Extends SentienceAgent to override act() method with visual prompting.
1154
+
1155
+ Requirements:
1156
+ - Pillow (PIL): Required for image processing and drawing bounding boxes
1157
+ Install with: pip install Pillow
1158
+ - Vision-capable LLM: Requires an LLM provider that supports vision (e.g., GPT-4o, Claude 3)
1159
+ """
1160
+
1161
+ def __init__(
1162
+ self,
1163
+ browser: SentienceBrowser,
1164
+ llm: LLMProvider,
1165
+ default_snapshot_limit: int = 50,
1166
+ verbose: bool = True,
1167
+ tracer: Any | None = None,
1168
+ config: Any | None = None,
1169
+ ):
1170
+ """
1171
+ Initialize Visual Agent
1172
+
1173
+ Args:
1174
+ browser: SentienceBrowser instance
1175
+ llm: LLM provider (must support vision, e.g., GPT-4o, Claude 3)
1176
+ default_snapshot_limit: Default maximum elements to include
1177
+ verbose: Print execution logs
1178
+ tracer: Optional Tracer instance
1179
+ config: Optional AgentConfig
1180
+ """
1181
+ super().__init__(browser, llm, default_snapshot_limit, verbose, tracer, config)
1182
+
1183
+ if not PIL_AVAILABLE:
1184
+ raise ImportError(
1185
+ "PIL/Pillow is required for SentienceVisualAgent. Install with: pip install Pillow"
1186
+ )
1187
+
1188
+ # Track previous snapshot for diff computation
1189
+ self._previous_snapshot: Snapshot | None = None
1190
+
1191
+ def _decode_screenshot(self, screenshot_data_url: str) -> "PILImage.Image":
1192
+ """
1193
+ Decode base64 screenshot data URL to PIL Image
1194
+
1195
+ Args:
1196
+ screenshot_data_url: Base64-encoded data URL (e.g., "data:image/png;base64,...")
1197
+
1198
+ Returns:
1199
+ PIL Image object
1200
+ """
1201
+ # Extract base64 data from data URL
1202
+ if screenshot_data_url.startswith("data:image/"):
1203
+ # Format: "data:image/png;base64,<base64_data>"
1204
+ base64_data = screenshot_data_url.split(",", 1)[1]
1205
+ else:
1206
+ # Assume it's already base64
1207
+ base64_data = screenshot_data_url
1208
+
1209
+ # Decode base64 to bytes
1210
+ image_bytes = base64.b64decode(base64_data)
1211
+
1212
+ # Load image from bytes
1213
+ return PILImage.open(io.BytesIO(image_bytes))
1214
+
1215
+ def _find_label_position(
1216
+ self,
1217
+ element_bbox: dict[str, float],
1218
+ existing_labels: list[dict[str, float]],
1219
+ image_width: int,
1220
+ image_height: int,
1221
+ label_width: int,
1222
+ label_height: int,
1223
+ ) -> tuple[int, int]:
1224
+ """
1225
+ Find best position for label using anti-collision algorithm.
1226
+
1227
+ Tries 8 positions: 4 sides (top, bottom, left, right) + 4 corners.
1228
+ Returns the first position that doesn't collide with existing labels.
1229
+
1230
+ Args:
1231
+ element_bbox: Element bounding box {x, y, width, height}
1232
+ existing_labels: List of existing label bounding boxes
1233
+ image_width: Image width in pixels
1234
+ image_height: Image height in pixels
1235
+ label_width: Label width in pixels
1236
+ label_height: Label height in pixels
1237
+
1238
+ Returns:
1239
+ (x, y) position for label
1240
+ """
1241
+ x, y = element_bbox["x"], element_bbox["y"]
1242
+ width, height = element_bbox["width"], element_bbox["height"]
1243
+
1244
+ # Offset from element edge
1245
+ label_offset = 15 # Increased from 5px for better separation
1246
+
1247
+ # Try 8 positions: top, bottom, left, right, top-left, top-right, bottom-left, bottom-right
1248
+ positions = [
1249
+ (int(x + width / 2 - label_width / 2), int(y - label_height - label_offset)), # Top
1250
+ (int(x + width / 2 - label_width / 2), int(y + height + label_offset)), # Bottom
1251
+ (int(x - label_width - label_offset), int(y + height / 2 - label_height / 2)), # Left
1252
+ (int(x + width + label_offset), int(y + height / 2 - label_height / 2)), # Right
1253
+ (int(x - label_width - label_offset), int(y - label_height - label_offset)), # Top-left
1254
+ (int(x + width + label_offset), int(y - label_height - label_offset)), # Top-right
1255
+ (int(x - label_width - label_offset), int(y + height + label_offset)), # Bottom-left
1256
+ (int(x + width + label_offset), int(y + height + label_offset)), # Bottom-right
1257
+ ]
1258
+
1259
+ # Check each position for collisions
1260
+ for pos_x, pos_y in positions:
1261
+ # Check bounds
1262
+ if (
1263
+ pos_x < 0
1264
+ or pos_y < 0
1265
+ or pos_x + label_width > image_width
1266
+ or pos_y + label_height > image_height
1267
+ ):
1268
+ continue
1269
+
1270
+ # Check collision with existing labels
1271
+ label_bbox = {
1272
+ "x": pos_x,
1273
+ "y": pos_y,
1274
+ "width": label_width,
1275
+ "height": label_height,
1276
+ }
1277
+
1278
+ collision = False
1279
+ for existing in existing_labels:
1280
+ # Simple AABB collision detection
1281
+ if not (
1282
+ label_bbox["x"] + label_bbox["width"] < existing["x"]
1283
+ or label_bbox["x"] > existing["x"] + existing["width"]
1284
+ or label_bbox["y"] + label_bbox["height"] < existing["y"]
1285
+ or label_bbox["y"] > existing["y"] + existing["height"]
1286
+ ):
1287
+ collision = True
1288
+ break
1289
+
1290
+ if not collision:
1291
+ return (pos_x, pos_y)
1292
+
1293
+ # If all positions collide, use top position with increased offset
1294
+ return (int(x + width / 2 - label_width / 2), int(y - label_height - label_offset * 2))
1295
+
1296
+ def _draw_labeled_screenshot(
1297
+ self,
1298
+ snapshot: Snapshot,
1299
+ elements: list[Element],
1300
+ ) -> "PILImage.Image":
1301
+ """
1302
+ Draw labeled screenshot with bounding boxes and element IDs.
1303
+
1304
+ Args:
1305
+ snapshot: Snapshot with screenshot data
1306
+ elements: List of elements to label
1307
+
1308
+ Returns:
1309
+ PIL Image with labels drawn
1310
+ """
1311
+ # Decode screenshot
1312
+ img = self._decode_screenshot(snapshot.screenshot)
1313
+ draw = PILImageDraw.Draw(img)
1314
+
1315
+ # Load font (fallback to default if not available)
1316
+ try:
1317
+ font = PILImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 16)
1318
+ except OSError:
1319
+ try:
1320
+ font = PILImageFont.truetype(
1321
+ "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 16
1322
+ )
1323
+ except OSError:
1324
+ font = PILImageFont.load_default()
1325
+
1326
+ image_width, image_height = img.size
1327
+ existing_labels: list[dict[str, float]] = []
1328
+
1329
+ # Neon green color: #39FF14 (bright, vibrant green)
1330
+ neon_green = "#39FF14"
1331
+
1332
+ for element in elements:
1333
+ bbox = element.bbox
1334
+ x, y, width, height = bbox.x, bbox.y, bbox.width, bbox.height
1335
+
1336
+ # Draw bounding box rectangle (neon green with 2px width)
1337
+ draw.rectangle(
1338
+ [(x, y), (x + width, y + height)],
1339
+ outline=neon_green,
1340
+ width=2,
1341
+ )
1342
+
1343
+ # Prepare label text (just the number - keep it simple and compact)
1344
+ label_text = str(element.id)
1345
+
1346
+ # Measure label text size
1347
+ bbox_text = draw.textbbox((0, 0), label_text, font=font)
1348
+ label_width = bbox_text[2] - bbox_text[0]
1349
+ label_height = bbox_text[3] - bbox_text[1]
1350
+
1351
+ # Find best position for label (anti-collision)
1352
+ label_x, label_y = self._find_label_position(
1353
+ {"x": x, "y": y, "width": width, "height": height},
1354
+ existing_labels,
1355
+ image_width,
1356
+ image_height,
1357
+ label_width + 8, # Add padding
1358
+ label_height + 4, # Add padding
1359
+ )
1360
+
1361
+ # Calculate connection points for a clearer visual link
1362
+ element_center_x = x + width / 2
1363
+ element_center_y = y + height / 2
1364
+ label_center_x = label_x + label_width / 2
1365
+ label_center_y = label_y + label_height / 2
1366
+
1367
+ # Determine which edge of the element is closest to the label
1368
+ dist_top = abs(label_center_y - y)
1369
+ dist_bottom = abs(label_center_y - (y + height))
1370
+ dist_left = abs(label_center_x - x)
1371
+ dist_right = abs(label_center_x - (x + width))
1372
+
1373
+ min_dist = min(dist_top, dist_bottom, dist_left, dist_right)
1374
+
1375
+ if min_dist == dist_top:
1376
+ line_start = (element_center_x, y)
1377
+ elif min_dist == dist_bottom:
1378
+ line_start = (element_center_x, y + height)
1379
+ elif min_dist == dist_left:
1380
+ line_start = (x, element_center_y)
1381
+ else:
1382
+ line_start = (x + width, element_center_y)
1383
+
1384
+ # Draw connecting line from element edge to label
1385
+ draw.line(
1386
+ [line_start, (label_center_x, label_center_y)],
1387
+ fill=neon_green,
1388
+ width=2,
1389
+ )
1390
+
1391
+ # Draw label background (white with neon green border)
1392
+ label_bg_x1 = label_x - 4
1393
+ label_bg_y1 = label_y - 2
1394
+ label_bg_x2 = label_x + label_width + 4
1395
+ label_bg_y2 = label_y + label_height + 2
1396
+
1397
+ draw.rectangle(
1398
+ [(label_bg_x1, label_bg_y1), (label_bg_x2, label_bg_y2)],
1399
+ fill="white",
1400
+ outline=neon_green,
1401
+ width=2,
1402
+ )
1403
+
1404
+ # Draw label text
1405
+ draw.text(
1406
+ (label_x, label_y),
1407
+ label_text,
1408
+ fill="black",
1409
+ font=font,
1410
+ )
1411
+
1412
+ # Record label position for collision detection
1413
+ existing_labels.append(
1414
+ {
1415
+ "x": label_bg_x1,
1416
+ "y": label_bg_y1,
1417
+ "width": label_bg_x2 - label_bg_x1,
1418
+ "height": label_bg_y2 - label_bg_y1,
1419
+ }
1420
+ )
1421
+
1422
+ return img
1423
+
1424
+ def _encode_image_to_base64(
1425
+ self,
1426
+ image: "PILImage.Image",
1427
+ format: str = "PNG",
1428
+ max_size_mb: float = 20.0,
1429
+ ) -> str:
1430
+ """
1431
+ Encode PIL Image to base64 data URL with size optimization.
1432
+
1433
+ Args:
1434
+ image: PIL Image object
1435
+ format: Output format ("PNG" or "JPEG")
1436
+ max_size_mb: Maximum size in MB (will compress if exceeded)
1437
+
1438
+ Returns:
1439
+ Base64-encoded data URL
1440
+ """
1441
+ buffer = io.BytesIO()
1442
+ pil_format = format.upper()
1443
+ quality = 95 # Start with high quality
1444
+
1445
+ # Convert RGBA to RGB for JPEG
1446
+ if pil_format == "JPEG" and image.mode == "RGBA":
1447
+ # Create white background
1448
+ rgb_image = PILImage.new("RGB", image.size, (255, 255, 255))
1449
+ rgb_image.paste(image, mask=image.split()[3]) # Use alpha channel as mask
1450
+ image = rgb_image
1451
+
1452
+ # Try to fit within size limit
1453
+ for attempt in range(3):
1454
+ buffer.seek(0)
1455
+ buffer.truncate(0)
1456
+
1457
+ if pil_format == "JPEG":
1458
+ image.save(buffer, format=pil_format, quality=quality, optimize=True)
1459
+ else:
1460
+ image.save(buffer, format=pil_format, optimize=True)
1461
+
1462
+ size_mb = len(buffer.getvalue()) / (1024 * 1024)
1463
+
1464
+ if size_mb <= max_size_mb:
1465
+ break
1466
+
1467
+ # Reduce quality for next attempt
1468
+ quality = max(70, quality - 15)
1469
+ if self.verbose and attempt == 0:
1470
+ print(f" ⚠️ Image size {size_mb:.2f}MB exceeds limit, compressing...")
1471
+
1472
+ image_bytes = buffer.getvalue()
1473
+ base64_data = base64.b64encode(image_bytes).decode("utf-8")
1474
+
1475
+ final_size_mb = len(image_bytes) / (1024 * 1024)
1476
+ if self.verbose:
1477
+ print(f" 📸 Image encoded: {final_size_mb:.2f}MB ({len(base64_data)} chars base64)")
1478
+
1479
+ mime_type = "image/png" if pil_format == "PNG" else "image/jpeg"
1480
+ return f"data:{mime_type};base64,{base64_data}"
1481
+
1482
+ def _query_llm_with_vision(
1483
+ self,
1484
+ image_data_url: str,
1485
+ goal: str,
1486
+ ) -> LLMResponse:
1487
+ """
1488
+ Query LLM with vision (labeled screenshot) - sync version.
1489
+
1490
+ Args:
1491
+ image_data_url: Base64-encoded image data URL
1492
+ goal: User's goal/task
1493
+
1494
+ Returns:
1495
+ LLMResponse with element ID
1496
+ """
1497
+ # Use the same prompt as async version
1498
+ system_prompt = """You are a web automation assistant. You will see a screenshot of a web page with labeled element IDs.
1499
+ Each clickable element has:
1500
+ - A bright neon green (#39FF14) bounding box around the element
1501
+ - A white label box with a number (the element ID) connected by a green line
1502
+ - The label is clearly separate from the element (not part of the UI)
1503
+
1504
+ CRITICAL INSTRUCTIONS:
1505
+ 1. Look at the screenshot carefully
1506
+ 2. Find the element that matches the user's goal (ignore the white label boxes - they are annotations, not UI elements)
1507
+ 3. Follow the green line from that element to find its label box with the ID number
1508
+ 4. Respond with ONLY that integer ID number (e.g., "42" or "1567")
1509
+ 5. Do NOT include any explanation, reasoning, or other text
1510
+ 6. Do NOT say "element 1" or "the first element" - just return the number
1511
+ 7. Do NOT confuse the white label box with an interactive element - labels are annotations connected by green lines
1512
+
1513
+ Example responses:
1514
+ - Correct: "42"
1515
+ - Correct: "1567"
1516
+ - Wrong: "I see element 42"
1517
+ - Wrong: "The element ID is 42"
1518
+ - Wrong: "42 (the search box)" """
1519
+
1520
+ user_prompt = f"""Goal: {goal}
1521
+
1522
+ Look at the screenshot. Each element has a neon green bounding box with a white label showing its ID number.
1523
+ Find the element that should be clicked to accomplish this goal.
1524
+ Return ONLY the integer ID number from the label, nothing else."""
1525
+
1526
+ # Check if LLM provider supports vision (OpenAI GPT-4o, Claude, etc.)
1527
+ if hasattr(self.llm, "client") and hasattr(self.llm.client, "chat"):
1528
+ # Vision-capable provider - use vision API
1529
+ try:
1530
+ from openai import OpenAI
1531
+
1532
+ # Check if it's OpenAI
1533
+ if isinstance(self.llm.client, OpenAI):
1534
+ messages = [
1535
+ {
1536
+ "role": "system",
1537
+ "content": system_prompt,
1538
+ },
1539
+ {
1540
+ "role": "user",
1541
+ "content": [
1542
+ {"type": "text", "text": user_prompt},
1543
+ {
1544
+ "type": "image_url",
1545
+ "image_url": {"url": image_data_url},
1546
+ },
1547
+ ],
1548
+ },
1549
+ ]
1550
+
1551
+ response = self.llm.client.chat.completions.create(
1552
+ model=self.llm._model_name,
1553
+ messages=messages,
1554
+ temperature=0.0,
1555
+ )
1556
+
1557
+ content = response.choices[0].message.content or ""
1558
+ usage = response.usage
1559
+
1560
+ from .llm_response_builder import LLMResponseBuilder
1561
+
1562
+ return LLMResponseBuilder.from_openai_format(
1563
+ content=content,
1564
+ prompt_tokens=usage.prompt_tokens if usage else None,
1565
+ completion_tokens=usage.completion_tokens if usage else None,
1566
+ total_tokens=usage.total_tokens if usage else None,
1567
+ model_name=response.model,
1568
+ finish_reason=response.choices[0].finish_reason,
1569
+ )
1570
+
1571
+ # Check if provider supports vision API (uses OpenAI-compatible format)
1572
+ elif hasattr(self.llm, "client") and hasattr(self.llm.client, "chat"):
1573
+ if self.verbose:
1574
+ print(f" 🔍 Using vision API with model: {self.llm._model_name}")
1575
+ print(f" 📐 Image data URL length: {len(image_data_url)} chars")
1576
+
1577
+ messages = [
1578
+ {
1579
+ "role": "system",
1580
+ "content": system_prompt,
1581
+ },
1582
+ {
1583
+ "role": "user",
1584
+ "content": [
1585
+ {"type": "text", "text": user_prompt},
1586
+ {
1587
+ "type": "image_url",
1588
+ "image_url": {"url": image_data_url},
1589
+ },
1590
+ ],
1591
+ },
1592
+ ]
1593
+
1594
+ try:
1595
+ if self.verbose:
1596
+ print(f" 📤 Sending request to vision API...")
1597
+
1598
+ response = self.llm.client.chat.completions.create(
1599
+ model=self.llm._model_name,
1600
+ messages=messages,
1601
+ temperature=0.0,
1602
+ )
1603
+
1604
+ if not hasattr(response, "choices") or len(response.choices) == 0:
1605
+ raise ValueError("Vision API returned no choices in response")
1606
+
1607
+ choice = response.choices[0]
1608
+ content = (
1609
+ choice.message.content if hasattr(choice.message, "content") else None
1610
+ )
1611
+ finish_reason = (
1612
+ choice.finish_reason if hasattr(choice, "finish_reason") else None
1613
+ )
1614
+
1615
+ if content is None or content == "":
1616
+ error_msg = f"Vision API returned empty content (finish_reason: {finish_reason})"
1617
+ if self.verbose:
1618
+ print(f" ❌ {error_msg}")
1619
+ raise ValueError(error_msg)
1620
+
1621
+ usage = response.usage if hasattr(response, "usage") else None
1622
+
1623
+ from .llm_response_builder import LLMResponseBuilder
1624
+
1625
+ return LLMResponseBuilder.from_openai_format(
1626
+ content=content,
1627
+ prompt_tokens=usage.prompt_tokens if usage else None,
1628
+ completion_tokens=usage.completion_tokens if usage else None,
1629
+ total_tokens=usage.total_tokens if usage else None,
1630
+ model_name=(
1631
+ response.model
1632
+ if hasattr(response, "model")
1633
+ else self.llm._model_name
1634
+ ),
1635
+ finish_reason=finish_reason,
1636
+ )
1637
+ except Exception as vision_error:
1638
+ if self.verbose:
1639
+ print(f" ❌ Vision API error: {vision_error}")
1640
+ print(f" 🔄 Attempting fallback to regular generate method...")
1641
+
1642
+ # Fallback: Try using the regular generate method
1643
+ try:
1644
+ fallback_prompt = f"{user_prompt}\n\n[Image: {image_data_url[:200]}...]"
1645
+ fallback_response = self.llm.generate(
1646
+ system_prompt,
1647
+ fallback_prompt,
1648
+ temperature=0.0,
1649
+ )
1650
+ if self.verbose:
1651
+ print(f" ⚠️ Using fallback method (may not support vision)")
1652
+ return fallback_response
1653
+ except Exception as fallback_error:
1654
+ if self.verbose:
1655
+ print(f" ❌ Fallback also failed: {fallback_error}")
1656
+ raise vision_error # Raise original error
1657
+ except ImportError:
1658
+ # openai or other vision SDK not available
1659
+ pass
1660
+ except Exception as e:
1661
+ if self.verbose:
1662
+ print(f"⚠️ Vision API error: {e}, falling back to text-only")
1663
+
1664
+ # Fallback: build a text-only prompt that embeds a truncated reference to the image data URL
1665
+ try:
1666
+ return self.llm.generate(
1667
+ system_prompt,
1668
+ f"{user_prompt}\n\n[Image data: {image_data_url[:100]}...]",
1669
+ temperature=0.0,
1670
+ )
1671
+ except Exception as e:
1672
+ raise RuntimeError(
1673
+ f"LLM provider {type(self.llm).__name__} may not support vision. "
1674
+ f"Error: {e}. Use a vision-capable model like GPT-4o or Claude 3."
1675
+ ) from e
1676
+
1677
+ def _extract_element_id(self, llm_response: str) -> int | None:
1678
+ """Extract element ID integer from LLM response (shared with async version)."""
1679
+ return SentienceVisualAgentAsync._extract_element_id(self, llm_response)
1680
+
1681
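_extract_element_id above delegates to the async class, whose implementation is outside this hunk. Since the prompt asks the model to answer with a bare integer, a plausible extraction is simply pulling the first run of digits from the reply; the sketch below is an illustration under that assumption, not the package's actual parser.

import re

def extract_element_id_sketch(llm_response: str) -> int | None:
    """Illustrative only: return the first integer found in the reply, else None."""
    match = re.search(r"\d+", llm_response.strip())
    return int(match.group(0)) if match else None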
+ def _compute_hash(self, text: str) -> str:
1682
+ """Compute SHA256 hash of text."""
1683
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()
1684
+
1685
+ def act(
1686
+ self,
1687
+ goal: str,
1688
+ max_retries: int = 2,
1689
+ snapshot_options: SnapshotOptions | None = None,
1690
+ ) -> AgentActionResult:
1691
+ """
1692
+ Override act() method to use visual prompting with full tracing support.
1693
+
1694
+ Args:
1695
+ goal: User's goal/task
1696
+ max_retries: Maximum retry attempts (accepted for interface compatibility; this visual act() currently makes a single attempt)
1697
+ snapshot_options: Optional snapshot options (screenshot will be enabled)
1698
+
1699
+ Returns:
1700
+ AgentActionResult
1701
+ """
1702
+ if self.verbose:
1703
+ print(f"\n{'=' * 70}")
1704
+ print(f"🤖 Visual Agent Goal: {goal}")
1705
+ print(f"{'=' * 70}")
1706
+
1707
+ # Generate step ID for tracing
1708
+ self._step_count += 1
1709
+ step_id = f"step-{self._step_count}"
1710
+
1711
+ # Emit step_start trace event if tracer is enabled
1712
+ if self.tracer:
1713
+ pre_url = self.browser.page.url if self.browser.page else None
1714
+ _safe_tracer_call(
1715
+ self.tracer,
1716
+ "emit_step_start",
1717
+ self.verbose,
1718
+ step_id=step_id,
1719
+ step_index=self._step_count,
1720
+ goal=goal,
1721
+ attempt=0,
1722
+ pre_url=pre_url,
1723
+ )
1724
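_safe_tracer_call is a module-level helper defined earlier in the file and not visible in this hunk. Judging from its call sites here (tracer, method name, verbose flag, then positional and keyword arguments), it most likely resolves the tracer method by name and swallows tracing errors so they cannot abort an agent step; the sketch below reflects that assumption only.

from typing import Any

def safe_tracer_call_sketch(tracer: Any, method_name: str, verbose: bool, *args: Any, **kwargs: Any) -> None:
    """Assumed behaviour: best-effort tracer call that never raises into the agent loop."""
    try:
        getattr(tracer, method_name)(*args, **kwargs)
    except Exception as exc:  # tracing failures are reported, not propagated
        if verbose:
            print(f"⚠️ Tracer call '{method_name}' failed: {exc}")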
+
1725
+ start_time = time.time()
1726
+
1727
+ try:
1728
+ # Ensure screenshot is enabled
1729
+ if snapshot_options is None:
1730
+ snapshot_options = SnapshotOptions()
1731
+
1732
+ # Enable screenshot if not already enabled
1733
+ if snapshot_options.screenshot is False or snapshot_options.screenshot is None:
1734
+ from .models import ScreenshotConfig
1735
+
1736
+ snapshot_options.screenshot = ScreenshotConfig(format="png")
1737
+
1738
+ # Set goal if not already provided
1739
+ if snapshot_options.goal is None:
1740
+ snapshot_options.goal = goal
1741
+
1742
+ # Set limit if not provided
1743
+ if snapshot_options.limit is None:
1744
+ snapshot_options.limit = self.default_snapshot_limit
1745
+
1746
+ if self.verbose:
1747
+ print(f"🎯 Goal: {goal}")
1748
+ print("📸 Taking snapshot with screenshot...")
1749
+
1750
+ # 1. Take snapshot with screenshot (sync version)
1751
+ snap = snapshot(self.browser, snapshot_options)
1752
+
1753
+ if snap.status != "success":
1754
+ raise RuntimeError(f"Snapshot failed: {snap.error}")
1755
+
1756
+ if not snap.screenshot:
1757
+ raise RuntimeError("Screenshot not available in snapshot")
1758
+
1759
+ # Compute diff_status by comparing with previous snapshot
1760
+ elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)
1761
+
1762
+ # Create snapshot with diff_status populated
1763
+ snap_with_diff = Snapshot(
1764
+ status=snap.status,
1765
+ timestamp=snap.timestamp,
1766
+ url=snap.url,
1767
+ viewport=snap.viewport,
1768
+ elements=elements_with_diff,
1769
+ screenshot=snap.screenshot,
1770
+ screenshot_format=snap.screenshot_format,
1771
+ error=snap.error,
1772
+ )
1773
+
1774
+ # Update previous snapshot for next comparison
1775
+ self._previous_snapshot = snap
1776
+
1777
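SnapshotDiff.compute_diff_status lives in the new sentience/snapshot_diff.py module and is not shown here. Conceptually it compares the fresh element list against the previous snapshot so each element can carry a diff_status; a simplified sketch of that comparison, matching elements by their integer id (the real helper returns elements with diff_status populated rather than a mapping):

def diff_status_sketch(current_elements, previous_snapshot):
    """Illustrative only: classify each current element as 'added' or 'unchanged'."""
    previous_ids = {el.id for el in previous_snapshot.elements} if previous_snapshot else set()
    return {el.id: ("unchanged" if el.id in previous_ids else "added") for el in current_elements}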
+ # Emit snapshot trace event if tracer is enabled
1778
+ if self.tracer:
1779
+ # Build snapshot event data (use snap_with_diff to include diff_status)
1780
+ snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
1781
+
1782
+ # Always include screenshot in trace event for studio viewer compatibility
1783
+ if snap.screenshot:
1784
+ # Extract base64 string from data URL if needed
1785
+ if snap.screenshot.startswith("data:image"):
1786
+ # Format: "data:image/jpeg;base64,{base64_string}"
1787
+ screenshot_base64 = (
1788
+ snap.screenshot.split(",", 1)[1]
1789
+ if "," in snap.screenshot
1790
+ else snap.screenshot
1791
+ )
1792
+ else:
1793
+ screenshot_base64 = snap.screenshot
1794
+
1795
+ snapshot_data["screenshot_base64"] = screenshot_base64
1796
+ if snap.screenshot_format:
1797
+ snapshot_data["screenshot_format"] = snap.screenshot_format
1798
+
1799
+ _safe_tracer_call(
1800
+ self.tracer,
1801
+ "emit",
1802
+ self.verbose,
1803
+ "snapshot",
1804
+ snapshot_data,
1805
+ step_id=step_id,
1806
+ )
1807
+
1808
+ if self.verbose:
1809
+ print(f"✅ Snapshot taken: {len(snap.elements)} elements")
1810
+
1811
+ # 2. Draw labeled screenshot
1812
+ if self.verbose:
1813
+ print("🎨 Drawing bounding boxes and labels...")
1814
+ print(f" Elements to label: {len(snap.elements)}")
1815
+ if len(snap.elements) > 0:
1816
+ element_ids = [el.id for el in snap.elements[:10]] # Show first 10
1817
+ print(f" Sample element IDs: {element_ids}")
1818
+
1819
+ labeled_image = self._draw_labeled_screenshot(snap, snap.elements)
1820
+
1821
+ # Save labeled image to disk for debugging
1822
+ # Save into an existing playground/images directory if one can be located; otherwise create playground/images under the current working directory
1823
+ try:
1824
+ # Try to detect if we're in a playground context
1825
+ import sys
1826
+
1827
+ cwd = Path.cwd()
1828
+ playground_path = None
1829
+
1830
+ # Check if current working directory contains playground
1831
+ if (cwd / "playground").exists():
1832
+ playground_path = cwd / "playground" / "images"
1833
+ else:
1834
+ # Check sys.path for playground
1835
+ for path_str in sys.path:
1836
+ path_obj = Path(path_str)
1837
+ if "playground" in str(path_obj) and path_obj.exists():
1838
+ # Find the playground directory
1839
+ if path_obj.name == "playground":
1840
+ playground_path = path_obj / "images"
1841
+ break
1842
+ elif (path_obj / "playground").exists():
1843
+ playground_path = path_obj / "playground" / "images"
1844
+ break
1845
+
1846
+ if playground_path is None:
1847
+ # Fallback: use current working directory
1848
+ playground_path = cwd / "playground" / "images"
1849
+
1850
+ images_dir = playground_path
1851
+ images_dir.mkdir(parents=True, exist_ok=True)
1852
+ image_uuid = str(uuid.uuid4())
1853
+ image_filename = f"labeled_screenshot_{image_uuid}.png"
1854
+ image_path = images_dir / image_filename
1855
+ labeled_image.save(image_path, format="PNG")
1856
+ if self.verbose:
1857
+ print(f" 💾 Saved labeled screenshot: {image_path.absolute()}")
1858
+ except Exception as save_error:
1859
+ # Don't fail if image save fails - it's just for debugging
1860
+ if self.verbose:
1861
+ print(f" ⚠️ Could not save labeled screenshot: {save_error}")
1862
+
1863
+ # Use JPEG for better compression (smaller file size for vision APIs)
1864
+ labeled_image_data_url = self._encode_image_to_base64(
1865
+ labeled_image, format="JPEG", max_size_mb=20.0
1866
+ )
1867
+
1868
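_encode_image_to_base64 is defined elsewhere in visual_agent.py; the JPEG format and max_size_mb budget above are about keeping the payload acceptable to vision APIs. A minimal sketch of that kind of encoder is shown below; the quality-stepping loop is an assumption for illustration, not the package's actual logic.

import base64
import io

from PIL import Image

def encode_image_sketch(img: Image.Image, max_size_mb: float = 20.0) -> str:
    """Illustrative only: JPEG-encode to a data URL, lowering quality until it fits the budget."""
    encoded = ""
    for quality in (90, 75, 60, 45):
        buf = io.BytesIO()
        img.convert("RGB").save(buf, format="JPEG", quality=quality)
        encoded = base64.b64encode(buf.getvalue()).decode("ascii")
        if len(encoded) <= max_size_mb * 1024 * 1024:
            break
    return f"data:image/jpeg;base64,{encoded}"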
+ # 3. Query LLM with vision (sync version)
1869
+ if self.verbose:
1870
+ print("🧠 Querying LLM with labeled screenshot...")
1871
+
1872
+ llm_response = self._query_llm_with_vision(labeled_image_data_url, goal)
1873
+
1874
+ # Emit LLM query trace event if tracer is enabled
1875
+ if self.tracer:
1876
+ _safe_tracer_call(
1877
+ self.tracer,
1878
+ "emit",
1879
+ self.verbose,
1880
+ "llm_query",
1881
+ {
1882
+ "prompt_tokens": llm_response.prompt_tokens,
1883
+ "completion_tokens": llm_response.completion_tokens,
1884
+ "model": llm_response.model_name,
1885
+ "response": llm_response.content[:200], # Truncate for brevity
1886
+ },
1887
+ step_id=step_id,
1888
+ )
1889
+
1890
+ if self.verbose:
1891
+ print(f"💭 LLM Response: {llm_response.content}")
1892
+
1893
+ # Track token usage
1894
+ self._track_tokens(goal, llm_response)
1895
+
1896
+ # 4. Extract element ID
1897
+ element_id = self._extract_element_id(llm_response.content)
1898
+
1899
+ if element_id is None:
1900
+ raise ValueError(
1901
+ f"Could not extract element ID from LLM response: {llm_response.content}"
1902
+ )
1903
+
1904
+ if self.verbose:
1905
+ print(f"🎯 Extracted Element ID: {element_id}")
1906
+
1907
+ # 5. Click the element (sync version)
1908
+ if self.verbose:
1909
+ print(f"🖱️ Clicking element {element_id}...")
1910
+
1911
+ click_result = click(self.browser, element_id)
1912
+
1913
+ duration_ms = int((time.time() - start_time) * 1000)
1914
+
1915
+ # Create AgentActionResult from click result
1916
+ result = AgentActionResult(
1917
+ success=click_result.success,
1918
+ action="click",
1919
+ goal=goal,
1920
+ duration_ms=duration_ms,
1921
+ attempt=0,
1922
+ element_id=element_id,
1923
+ outcome=click_result.outcome,
1924
+ url_changed=click_result.url_changed,
1925
+ error=click_result.error,
1926
+ )
1927
+
1928
+ # Emit action execution trace event if tracer is enabled
1929
+ if self.tracer:
1930
+ post_url = self.browser.page.url if self.browser.page else None
1931
+
1932
+ # Include element data for live overlay visualization
1933
+ elements_data = [
1934
+ {
1935
+ "id": el.id,
1936
+ "bbox": {
1937
+ "x": el.bbox.x,
1938
+ "y": el.bbox.y,
1939
+ "width": el.bbox.width,
1940
+ "height": el.bbox.height,
1941
+ },
1942
+ "role": el.role,
1943
+ "text": el.text[:50] if el.text else "",
1944
+ }
1945
+ for el in snap.elements[:50]
1946
+ ]
1947
+
1948
+ _safe_tracer_call(
1949
+ self.tracer,
1950
+ "emit",
1951
+ self.verbose,
1952
+ "action",
1953
+ {
1954
+ "action": result.action,
1955
+ "element_id": result.element_id,
1956
+ "success": result.success,
1957
+ "outcome": result.outcome,
1958
+ "duration_ms": duration_ms,
1959
+ "post_url": post_url,
1960
+ "elements": elements_data, # Add element data for overlay
1961
+ "target_element_id": result.element_id, # Highlight target in red
1962
+ },
1963
+ step_id=step_id,
1964
+ )
1965
+
1966
+ # Record history
1967
+ self.history.append(
1968
+ {
1969
+ "goal": goal,
1970
+ "action": f"CLICK({element_id})",
1971
+ "result": result.model_dump(), # Store as dict
1972
+ "success": result.success,
1973
+ "attempt": 0,
1974
+ "duration_ms": duration_ms,
1975
+ }
1976
+ )
1977
+
1978
+ if self.verbose:
1979
+ status = "✅" if result.success else "❌"
1980
+ print(f"{status} Completed in {duration_ms}ms")
1981
+
1982
+ # Emit step completion trace event if tracer is enabled
1983
+ if self.tracer:
1984
+ # Use the snapshot URL (captured before the action ran) as pre_url
1985
+ pre_url = snap.url
1986
+ post_url = self.browser.page.url if self.browser.page else None
1987
+
1988
+ # Compute snapshot digest (simplified - use URL + timestamp)
1989
+ snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"
1990
+
1991
+ # Build LLM data
1992
+ llm_response_text = llm_response.content
1993
+
1994
+ # Build execution data
1995
+ exec_data = {
1996
+ "success": result.success,
1997
+ "outcome": result.outcome,
1998
+ "action": result.action,
1999
+ "element_id": result.element_id,
2000
+ "url_changed": result.url_changed,
2001
+ "duration_ms": duration_ms,
2002
+ }
2003
+
2004
+ # Build verification data (simplified - mirrors the click's success flag)
2005
+ verify_data = {
2006
+ "passed": result.success,
2007
+ "signals": {
2008
+ "url_changed": result.url_changed or False,
2009
+ },
2010
+ }
2011
+
2012
+ # Build complete step_end event
2013
+ step_end_data = TraceEventBuilder.build_step_end_event(
2014
+ step_id=step_id,
2015
+ step_index=self._step_count,
2016
+ goal=goal,
2017
+ attempt=0,
2018
+ pre_url=pre_url,
2019
+ post_url=post_url or pre_url,
2020
+ snapshot_digest=snapshot_digest,
2021
+ llm_data={
2022
+ "response_text": llm_response_text,
2023
+ "response_hash": f"sha256:{self._compute_hash(llm_response_text)}",
2024
+ },
2025
+ exec_data=exec_data,
2026
+ verify_data=verify_data,
2027
+ )
2028
+
2029
+ _safe_tracer_call(
2030
+ self.tracer,
2031
+ "emit",
2032
+ self.verbose,
2033
+ "step_end",
2034
+ step_end_data,
2035
+ step_id=step_id,
2036
+ )
2037
+
2038
+ return result
2039
+
2040
+ except Exception as e:
2041
+ # Emit error trace event if tracer is enabled
2042
+ if self.tracer:
2043
+ _safe_tracer_call(
2044
+ self.tracer,
2045
+ "emit_error",
2046
+ self.verbose,
2047
+ step_id=step_id,
2048
+ error=str(e),
2049
+ attempt=0,
2050
+ )
2051
+
2052
+ if self.verbose:
2053
+ print(f"❌ Error: {e}")
2054
+
2055
+ # Re-raise the exception
2056
+ raise
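To show how the pieces above fit together from the caller's side, here is a hedged usage sketch of the synchronous visual agent's act() method. The import path and the idea that an already-constructed agent is passed in are assumptions; only the SnapshotOptions/ScreenshotConfig fields and AgentActionResult attributes used below appear in this diff.

from sentience.models import ScreenshotConfig, SnapshotOptions  # assumed import path

def run_one_visual_step(agent, goal: str):
    """Illustrative driver: one labeled-screenshot act() call with explicit snapshot options."""
    options = SnapshotOptions()
    options.screenshot = ScreenshotConfig(format="png")  # the agent enables this by default anyway
    options.goal = goal
    options.limit = 50  # cap how many elements get labeled in the screenshot
    result = agent.act(goal, snapshot_options=options)
    print(result.success, result.element_id, result.outcome, result.duration_ms)
    return result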