sentienceapi 0.90.16__py3-none-any.whl → 0.98.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sentienceapi might be problematic.
- sentience/__init__.py +120 -6
- sentience/_extension_loader.py +156 -1
- sentience/action_executor.py +217 -0
- sentience/actions.py +758 -30
- sentience/agent.py +806 -293
- sentience/agent_config.py +3 -0
- sentience/agent_runtime.py +840 -0
- sentience/asserts/__init__.py +70 -0
- sentience/asserts/expect.py +621 -0
- sentience/asserts/query.py +383 -0
- sentience/async_api.py +89 -1141
- sentience/backends/__init__.py +137 -0
- sentience/backends/actions.py +372 -0
- sentience/backends/browser_use_adapter.py +241 -0
- sentience/backends/cdp_backend.py +393 -0
- sentience/backends/exceptions.py +211 -0
- sentience/backends/playwright_backend.py +194 -0
- sentience/backends/protocol.py +216 -0
- sentience/backends/sentience_context.py +469 -0
- sentience/backends/snapshot.py +483 -0
- sentience/base_agent.py +95 -0
- sentience/browser.py +678 -39
- sentience/browser_evaluator.py +299 -0
- sentience/canonicalization.py +207 -0
- sentience/cloud_tracing.py +507 -42
- sentience/constants.py +6 -0
- sentience/conversational_agent.py +77 -43
- sentience/cursor_policy.py +142 -0
- sentience/element_filter.py +136 -0
- sentience/expect.py +98 -2
- sentience/extension/background.js +56 -185
- sentience/extension/content.js +150 -287
- sentience/extension/injected_api.js +1088 -1368
- sentience/extension/manifest.json +1 -1
- sentience/extension/pkg/sentience_core.d.ts +22 -22
- sentience/extension/pkg/sentience_core.js +275 -433
- sentience/extension/pkg/sentience_core_bg.wasm +0 -0
- sentience/extension/release.json +47 -47
- sentience/failure_artifacts.py +241 -0
- sentience/formatting.py +9 -53
- sentience/inspector.py +183 -1
- sentience/integrations/__init__.py +6 -0
- sentience/integrations/langchain/__init__.py +12 -0
- sentience/integrations/langchain/context.py +18 -0
- sentience/integrations/langchain/core.py +326 -0
- sentience/integrations/langchain/tools.py +180 -0
- sentience/integrations/models.py +46 -0
- sentience/integrations/pydanticai/__init__.py +15 -0
- sentience/integrations/pydanticai/deps.py +20 -0
- sentience/integrations/pydanticai/toolset.py +468 -0
- sentience/llm_interaction_handler.py +191 -0
- sentience/llm_provider.py +765 -66
- sentience/llm_provider_utils.py +120 -0
- sentience/llm_response_builder.py +153 -0
- sentience/models.py +595 -3
- sentience/ordinal.py +280 -0
- sentience/overlay.py +109 -2
- sentience/protocols.py +228 -0
- sentience/query.py +67 -5
- sentience/read.py +95 -3
- sentience/recorder.py +223 -3
- sentience/schemas/trace_v1.json +128 -9
- sentience/screenshot.py +48 -2
- sentience/sentience_methods.py +86 -0
- sentience/snapshot.py +599 -55
- sentience/snapshot_diff.py +126 -0
- sentience/text_search.py +120 -5
- sentience/trace_event_builder.py +148 -0
- sentience/trace_file_manager.py +197 -0
- sentience/trace_indexing/index_schema.py +95 -7
- sentience/trace_indexing/indexer.py +105 -48
- sentience/tracer_factory.py +120 -9
- sentience/tracing.py +172 -8
- sentience/utils/__init__.py +40 -0
- sentience/utils/browser.py +46 -0
- sentience/{utils.py → utils/element.py} +3 -42
- sentience/utils/formatting.py +59 -0
- sentience/verification.py +618 -0
- sentience/visual_agent.py +2058 -0
- sentience/wait.py +68 -2
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/METADATA +199 -40
- sentienceapi-0.98.0.dist-info/RECORD +92 -0
- sentience/extension/test-content.js +0 -4
- sentienceapi-0.90.16.dist-info/RECORD +0 -50
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/WHEEL +0 -0
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/entry_points.txt +0 -0
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE +0 -0
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE-APACHE +0 -0
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE-MIT +0 -0
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/top_level.txt +0 -0
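The largest addition is sentience/visual_agent.py (+2058), which introduces SentienceVisualAgentAsync and SentienceVisualAgent: agents that take a screenshot snapshot, draw labeled bounding boxes on it, ask a vision-capable LLM for an element ID, and click that element. A minimal usage sketch is shown below, based only on the constructor and act() signatures visible in the diff that follows; the browser and LLM construction calls are placeholders (assumptions), not a confirmed public API.

    import asyncio

    # Import paths are taken from the module's own imports in the diff below;
    # how to construct the browser and a vision-capable LLM provider is assumed.
    from sentience.async_api import AsyncSentienceBrowser
    from sentience.llm_provider import LLMProvider
    from sentience.visual_agent import SentienceVisualAgentAsync

    async def main() -> None:
        browser = AsyncSentienceBrowser()   # hypothetical construction
        llm: LLMProvider = ...              # fill in a GPT-4o / Claude 3 style provider (assumed)

        # Requires Pillow; the constructor raises ImportError otherwise.
        agent = SentienceVisualAgentAsync(browser, llm, default_snapshot_limit=50, verbose=True)

        # Snapshot -> labeled screenshot -> vision LLM picks an element ID -> click it.
        result = await agent.act("Click the search box")
        print(result.success, result.element_id)

    asyncio.run(main())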
sentience/visual_agent.py
@@ -0,0 +1,2058 @@
"""
Visual Agent - Uses labeled screenshots with vision-capable LLMs

This agent extends SentienceAgentAsync to use visual prompts:
1. Takes snapshot with screenshot enabled
2. Draws bounding boxes and labels element IDs on the screenshot
3. Uses anti-collision algorithm to position labels (4 sides + 4 corners)
4. Sends labeled screenshot to vision-capable LLM
5. Extracts element ID from LLM response
6. Clicks the element using click_async

Dependencies:
- Pillow (PIL): Required for image processing and drawing bounding boxes
  Install with: pip install Pillow
"""

import base64
import hashlib
import io
import re
import time
import uuid
from pathlib import Path
from typing import TYPE_CHECKING, Any, Optional

from .actions import click, click_async
from .agent import SentienceAgent, SentienceAgentAsync, _safe_tracer_call
from .async_api import AsyncSentienceBrowser
from .browser import SentienceBrowser
from .llm_provider import LLMProvider, LLMResponse
from .models import AgentActionResult, Element, Snapshot, SnapshotOptions
from .snapshot import snapshot
from .snapshot_diff import SnapshotDiff
from .trace_event_builder import TraceEventBuilder

# Only import PIL types for type checking, not at runtime
if TYPE_CHECKING:
    from PIL import Image, ImageDraw, ImageFont
else:
    # Create a dummy type for runtime when PIL is not available
    Image = None
    ImageDraw = None
    ImageFont = None

try:
    from PIL import Image as PILImage
    from PIL import ImageDraw as PILImageDraw
    from PIL import ImageFont as PILImageFont

    PIL_AVAILABLE = True
except ImportError:
    PIL_AVAILABLE = False
    # Define dummy values so type hints don't fail
    PILImage = None  # type: ignore
    PILImageDraw = None  # type: ignore
    PILImageFont = None  # type: ignore
    # Don't print warning here - it will be printed when the class is instantiated


class SentienceVisualAgentAsync(SentienceAgentAsync):
    """
    Async visual agent that uses labeled screenshots with vision-capable LLMs.

    Extends SentienceAgentAsync to override act() method with visual prompting.

    Requirements:
    - Pillow (PIL): Required for image processing and drawing bounding boxes
      Install with: pip install Pillow
    - Vision-capable LLM: Requires an LLM provider that supports vision (e.g., GPT-4o, Claude 3)
    """

    def __init__(
        self,
        browser: AsyncSentienceBrowser,
        llm: LLMProvider,
        default_snapshot_limit: int = 50,
        verbose: bool = True,
        tracer: Any | None = None,
        config: Any | None = None,
    ):
        """
        Initialize Visual Agent

        Args:
            browser: AsyncSentienceBrowser instance
            llm: LLM provider (must support vision, e.g., GPT-4o, Claude 3)
            default_snapshot_limit: Default maximum elements to include
            verbose: Print execution logs
            tracer: Optional Tracer instance
            config: Optional AgentConfig
        """
        super().__init__(browser, llm, default_snapshot_limit, verbose, tracer, config)

        if not PIL_AVAILABLE:
            raise ImportError(
                "PIL/Pillow is required for SentienceVisualAgentAsync. Install with: pip install Pillow"
            )

        # Track previous snapshot for diff computation
        self._previous_snapshot: Snapshot | None = None

    def _decode_screenshot(self, screenshot_data_url: str) -> "PILImage.Image":
        """
        Decode base64 screenshot data URL to PIL Image

        Args:
            screenshot_data_url: Base64-encoded data URL (e.g., "data:image/png;base64,...")

        Returns:
            PIL Image object
        """
        # Extract base64 data from data URL
        if screenshot_data_url.startswith("data:image/"):
            # Format: "data:image/png;base64,<base64_data>"
            base64_data = screenshot_data_url.split(",", 1)[1]
        else:
            # Assume it's already base64
            base64_data = screenshot_data_url

        # Decode base64 to bytes
        image_bytes = base64.b64decode(base64_data)

        # Create PIL Image from bytes
        return PILImage.open(io.BytesIO(image_bytes))

    def _find_label_position(
        self,
        bbox: dict[str, float],
        existing_labels: list[dict[str, Any]],
        image_width: int,
        image_height: int,
        label_width: int,
        label_height: int,
    ) -> tuple[float, float]:
        """
        Find best position for label using anti-collision algorithm.

        Tries 8 positions: 4 sides (top, bottom, left, right) + 4 corners (top-left, top-right, bottom-left, bottom-right)

        Args:
            bbox: Element bounding box {x, y, width, height}
            existing_labels: List of existing label positions {x, y, width, height}
            image_width: Screenshot width
            image_height: Screenshot height
            label_width: Label text width
            label_height: Label text height

        Returns:
            (x, y) position for label
        """
        x, y, width, height = bbox["x"], bbox["y"], bbox["width"], bbox["height"]
        center_x = x + width / 2
        center_y = y + height / 2

        # Anti-collision algorithm
        # Define 8 candidate positions (4 sides + 4 corners)
        # Increased distance from element to avoid confusion (15px instead of 5px)
        label_offset = 15  # Increased from 5 to make labels more clearly separate
        candidates = [
            # 4 sides
            (center_x - label_width / 2, y - label_height - label_offset, "top"),  # Above element
            (center_x - label_width / 2, y + height + label_offset, "bottom"),  # Below element
            (
                x - label_width - label_offset,
                center_y - label_height / 2,
                "left",
            ),  # Left of element
            (x + width + label_offset, center_y - label_height / 2, "right"),  # Right of element
            # 4 corners
            (
                x - label_width - label_offset,
                y - label_height - label_offset,
                "top-left",
            ),  # Top-left corner
            (
                x + width + label_offset,
                y - label_height - label_offset,
                "top-right",
            ),  # Top-right corner
            (
                x - label_width - label_offset,
                y + height + label_offset,
                "bottom-left",
            ),  # Bottom-left corner
            (
                x + width + label_offset,
                y + height + label_offset,
                "bottom-right",
            ),  # Bottom-right corner
        ]

        # Check each candidate position for collisions
        for candidate_x, candidate_y, _ in candidates:
            # Check bounds
            if candidate_x < 0 or candidate_y < 0:
                continue
            if candidate_x + label_width > image_width or candidate_y + label_height > image_height:
                continue

            # Check collision with existing labels
            collision = False
            for existing in existing_labels:
                ex, ey, ew, eh = existing["x"], existing["y"], existing["width"], existing["height"]
                # Check if rectangles overlap
                if not (
                    candidate_x + label_width < ex
                    or candidate_x > ex + ew
                    or candidate_y + label_height < ey
                    or candidate_y > ey + eh
                ):
                    collision = True
                    break

            if not collision:
                return (candidate_x, candidate_y)

        # If all positions collide, use top position (may overlap but better than nothing)
        return (center_x - label_width / 2, y - label_height - 15)

    def _draw_labeled_screenshot(
        self,
        snapshot: Snapshot,
        elements: list[Element],
    ) -> "PILImage.Image":
        """
        Draw bounding boxes and labels on screenshot.

        Args:
            snapshot: Snapshot with screenshot data
            elements: List of elements to draw

        Returns:
            PIL Image with bounding boxes and labels
        """
        if not snapshot.screenshot:
            raise ValueError("Screenshot not available in snapshot")

        # Decode screenshot
        img = self._decode_screenshot(snapshot.screenshot)
        draw = PILImageDraw.Draw(img)

        # Try to load a font, fallback to default if not available
        try:
            # Try to use a system font
            font = PILImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 16)
        except:
            try:
                font = PILImageFont.truetype("arial.ttf", 16)
            except:
                # Use default font if system fonts not available
                font = PILImageFont.load_default()

        image_width, image_height = img.size
        existing_labels: list[dict[str, Any]] = []

        # Neon green color: #39FF14 (bright, vibrant green)
        neon_green = "#39FF14"

        # Draw bounding boxes and labels for each element
        for element in elements:
            bbox = element.bbox
            x, y, width, height = bbox.x, bbox.y, bbox.width, bbox.height

            # Draw bounding box rectangle (neon green with 2px width)
            draw.rectangle(
                [(x, y), (x + width, y + height)],
                outline=neon_green,
                width=2,
            )

            # Prepare label text (just the number - keep it simple and compact)
            label_text = str(element.id)

            # Measure label text size
            bbox_text = draw.textbbox((0, 0), label_text, font=font)
            label_width = bbox_text[2] - bbox_text[0]
            label_height = bbox_text[3] - bbox_text[1]

            # Find best position for label (anti-collision)
            label_x, label_y = self._find_label_position(
                {"x": x, "y": y, "width": width, "height": height},
                existing_labels,
                image_width,
                image_height,
                label_width + 8,  # Add padding
                label_height + 4,  # Add padding
            )

            # Calculate connection points for a clearer visual link
            # Connect from the nearest corner/edge of element to the label
            element_center_x = x + width / 2
            element_center_y = y + height / 2
            label_center_x = label_x + label_width / 2
            label_center_y = label_y + label_height / 2

            # Determine which edge of the element is closest to the label
            # and draw line from that edge point to the label
            dist_top = abs(label_center_y - y)
            dist_bottom = abs(label_center_y - (y + height))
            dist_left = abs(label_center_x - x)
            dist_right = abs(label_center_x - (x + width))

            min_dist = min(dist_top, dist_bottom, dist_left, dist_right)

            if min_dist == dist_top:
                # Label is above - connect from top edge
                line_start = (element_center_x, y)
            elif min_dist == dist_bottom:
                # Label is below - connect from bottom edge
                line_start = (element_center_x, y + height)
            elif min_dist == dist_left:
                # Label is left - connect from left edge
                line_start = (x, element_center_y)
            else:
                # Label is right - connect from right edge
                line_start = (x + width, element_center_y)

            # Draw connecting line from element edge to label (makes it clear the label belongs to the element)
            draw.line(
                [line_start, (label_center_x, label_center_y)],
                fill=neon_green,
                width=2,  # Slightly thicker for better visibility
            )

            # Draw label background (white with neon green border)
            label_bg_x1 = label_x - 4
            label_bg_y1 = label_y - 2
            label_bg_x2 = label_x + label_width + 4
            label_bg_y2 = label_y + label_height + 2

            # Draw white background with neon green border (makes label stand out as separate)
            draw.rectangle(
                [(label_bg_x1, label_bg_y1), (label_bg_x2, label_bg_y2)],
                fill="white",
                outline=neon_green,
                width=2,  # Thicker border to make it more distinct
            )

            # Draw label text (black for high contrast)
            draw.text(
                (label_x, label_y),
                label_text,
                fill="black",
                font=font,
            )

            # Record label position for collision detection
            existing_labels.append(
                {
                    "x": label_bg_x1,
                    "y": label_bg_y1,
                    "width": label_bg_x2 - label_bg_x1,
                    "height": label_bg_y2 - label_bg_y1,
                }
            )

        return img

    def _encode_image_to_base64(
        self, image: "PILImage.Image", format: str = "PNG", max_size_mb: float = 20.0
    ) -> str:
        """
        Encode PIL Image to base64 data URL with size optimization.

        Vision LLM APIs typically have size limits (e.g., 20MB for OpenAI).
        This function automatically compresses images if they're too large.

        Args:
            image: PIL Image object
            format: Image format (PNG or JPEG)
            max_size_mb: Maximum size in MB before compression (default: 20MB)

        Returns:
            Base64-encoded data URL
        """
        # Convert format for PIL
        pil_format = format.upper()

        # Try JPEG first for better compression (unless PNG is specifically requested)
        if format.upper() != "PNG":
            pil_format = "JPEG"
            # Convert RGBA to RGB for JPEG
            if image.mode in ("RGBA", "LA", "P"):
                # Create white background
                rgb_image = PILImage.new("RGB", image.size, (255, 255, 255))
                if image.mode == "P":
                    image = image.convert("RGBA")
                rgb_image.paste(image, mask=image.split()[-1] if image.mode == "RGBA" else None)
                image = rgb_image

        buffer = io.BytesIO()
        quality = 95  # Start with high quality

        # Try to fit within size limit
        for attempt in range(3):
            buffer.seek(0)
            buffer.truncate(0)

            if pil_format == "JPEG":
                image.save(buffer, format=pil_format, quality=quality, optimize=True)
            else:
                image.save(buffer, format=pil_format, optimize=True)

            size_mb = len(buffer.getvalue()) / (1024 * 1024)

            if size_mb <= max_size_mb:
                break

            # Reduce quality for next attempt
            quality = max(70, quality - 15)
            if self.verbose and attempt == 0:
                print(f" ⚠️ Image size {size_mb:.2f}MB exceeds limit, compressing...")

        image_bytes = buffer.getvalue()
        base64_data = base64.b64encode(image_bytes).decode("utf-8")

        final_size_mb = len(image_bytes) / (1024 * 1024)
        if self.verbose:
            print(f" 📸 Image encoded: {final_size_mb:.2f}MB ({len(base64_data)} chars base64)")

        mime_type = "image/png" if pil_format == "PNG" else "image/jpeg"
        return f"data:{mime_type};base64,{base64_data}"

    async def _query_llm_with_vision(
        self,
        image_data_url: str,
        goal: str,
    ) -> LLMResponse:
        """
        Query LLM with vision (labeled screenshot).

        Args:
            image_data_url: Base64-encoded image data URL
            goal: User's goal/task

        Returns:
            LLMResponse with element ID
        """
        system_prompt = """You are a web automation assistant. You will see a screenshot of a web page with labeled element IDs.
Each clickable element has:
- A bright neon green (#39FF14) bounding box around the element
- A white label box with a number (the element ID) connected by a green line
- The label is clearly separate from the element (not part of the UI)

CRITICAL INSTRUCTIONS:
1. Look at the screenshot carefully
2. Find the element that matches the user's goal (ignore the white label boxes - they are annotations, not UI elements)
3. Follow the green line from that element to find its label box with the ID number
4. Respond with ONLY that integer ID number (e.g., "42" or "1567")
5. Do NOT include any explanation, reasoning, or other text
6. Do NOT say "element 1" or "the first element" - just return the number
7. Do NOT confuse the white label box with an interactive element - labels are annotations connected by green lines

Example responses:
- Correct: "42"
- Correct: "1567"
- Wrong: "I see element 42"
- Wrong: "The element ID is 42"
- Wrong: "42 (the search box)" """

        user_prompt = f"""Goal: {goal}

Look at the screenshot. Each element has a neon green bounding box with a white label showing its ID number.
Find the element that should be clicked to accomplish this goal.
Return ONLY the integer ID number from the label, nothing else."""

        # Check if LLM provider supports vision (OpenAI GPT-4o, Claude, etc.)
        # Vision-capable providers use similar message format with image_url
        if hasattr(self.llm, "client") and hasattr(self.llm.client, "chat"):
            # Vision-capable provider - use vision API
            try:
                from openai import OpenAI

                # Check if it's OpenAI
                if isinstance(self.llm.client, OpenAI):
                    messages = [
                        {
                            "role": "system",
                            "content": system_prompt,
                        },
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": user_prompt},
                                {
                                    "type": "image_url",
                                    "image_url": {"url": image_data_url},
                                },
                            ],
                        },
                    ]

                    response = self.llm.client.chat.completions.create(
                        model=self.llm._model_name,
                        messages=messages,
                        temperature=0.0,
                        # Removed max_tokens to use API default (usually higher limit)
                    )

                    content = response.choices[0].message.content or ""
                    usage = response.usage

                    from .llm_response_builder import LLMResponseBuilder

                    return LLMResponseBuilder.from_openai_format(
                        content=content,
                        prompt_tokens=usage.prompt_tokens if usage else None,
                        completion_tokens=usage.completion_tokens if usage else None,
                        total_tokens=usage.total_tokens if usage else None,
                        model_name=response.model,
                        finish_reason=response.choices[0].finish_reason,
                    )

                # Check if provider supports vision API (uses OpenAI-compatible format)
                elif hasattr(self.llm, "client") and hasattr(self.llm.client, "chat"):
                    # Vision API uses similar format to OpenAI
                    if self.verbose:
                        print(f" 🔍 Using vision API with model: {self.llm._model_name}")
                        print(f" 📐 Image data URL length: {len(image_data_url)} chars")

                    messages = [
                        {
                            "role": "system",
                            "content": system_prompt,
                        },
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": user_prompt},
                                {
                                    "type": "image_url",
                                    "image_url": {"url": image_data_url},
                                },
                            ],
                        },
                    ]

                    try:
                        if self.verbose:
                            print(f" 📤 Sending request to vision API...")
                            print(f" 📋 Messages structure: {len(messages)} messages")
                            print(f" 🖼️ Image URL prefix: {image_data_url[:50]}...")

                        # Removed max_tokens to use API default (usually higher limit)
                        # This allows the model to generate complete responses without truncation
                        response = self.llm.client.chat.completions.create(
                            model=self.llm._model_name,
                            messages=messages,
                            temperature=0.0,
                            # No max_tokens - use API default
                        )

                        # Debug: Check response structure
                        if self.verbose:
                            print(f" 📥 Response received")
                            print(f" 📦 Response type: {type(response)}")
                            print(
                                f" 📦 Choices count: {len(response.choices) if hasattr(response, 'choices') else 0}"
                            )

                        if not hasattr(response, "choices") or len(response.choices) == 0:
                            raise ValueError("Vision API returned no choices in response")

                        choice = response.choices[0]
                        content = (
                            choice.message.content if hasattr(choice.message, "content") else None
                        )
                        finish_reason = (
                            choice.finish_reason if hasattr(choice, "finish_reason") else None
                        )

                        if self.verbose:
                            print(f" 📝 Content: {repr(content)}")
                            print(f" 🏁 Finish reason: {finish_reason}")
                            if finish_reason:
                                print(f" ⚠️ Finish reason indicates: {finish_reason}")
                                if finish_reason == "length":
                                    print(
                                        f" - Response was truncated (hit API default max_tokens limit)"
                                    )
                                    print(
                                        f" - This might indicate the model needs more tokens or doesn't support vision properly"
                                    )
                                    # Even if truncated, there might be partial content
                                    if content:
                                        print(
                                            f" - ⚠️ Partial content received: {repr(content)}"
                                        )
                                elif finish_reason == "content_filter":
                                    print(f" - Content was filtered by safety filters")
                                elif finish_reason == "stop":
                                    print(f" - Normal completion")

                        # If finish_reason is "length", we might still have partial content
                        # Try to use it if available (even if truncated, it might contain the element ID)
                        if finish_reason == "length" and content and content.strip():
                            if self.verbose:
                                print(f" ⚠️ Using truncated response: {repr(content)}")
                            # Continue processing with partial content

                        if content is None or content == "":
                            error_msg = f"Vision API returned empty content (finish_reason: {finish_reason})"
                            if self.verbose:
                                print(f" ❌ {error_msg}")
                                print(f" 💡 Possible causes:")
                                print(
                                    f" - Model {self.llm._model_name} may not support vision"
                                )
                                print(f" - Image format might not be supported")
                                print(f" - API default max_tokens might be too restrictive")
                                print(f" - API response structure might be different")
                                if finish_reason == "length":
                                    print(
                                        f" - ⚠️ Response was truncated - content might have been cut off"
                                    )
                                    print(
                                        f" - Try increasing max_tokens or check response.choices[0].message for partial content"
                                    )
                            raise ValueError(error_msg)

                        usage = response.usage if hasattr(response, "usage") else None

                        if self.verbose:
                            print(f" ✅ Vision API response received")
                            print(
                                f" 📊 Tokens: {usage.total_tokens if usage else 'N/A'} (prompt: {usage.prompt_tokens if usage else 'N/A'}, completion: {usage.completion_tokens if usage else 'N/A'})"
                            )

                        from .llm_response_builder import LLMResponseBuilder

                        return LLMResponseBuilder.from_openai_format(
                            content=content,
                            prompt_tokens=usage.prompt_tokens if usage else None,
                            completion_tokens=usage.completion_tokens if usage else None,
                            total_tokens=usage.total_tokens if usage else None,
                            model_name=(
                                response.model
                                if hasattr(response, "model")
                                else self.llm._model_name
                            ),
                            finish_reason=finish_reason,
                        )
                    except Exception as vision_error:
                        if self.verbose:
                            print(f" ❌ Vision API error: {vision_error}")
                            print(f" 💡 This might indicate:")
                            print(f" - Model {self.llm._model_name} doesn't support vision")
                            print(f" - Image format/size issue")
                            print(f" - API key or permissions issue")
                            print(f" 🔄 Attempting fallback to regular generate method...")

                        # Fallback: Try using the regular generate method
                        # Some models might need images passed differently
                        try:
                            # Try embedding image in the prompt as base64
                            fallback_prompt = f"{user_prompt}\n\n[Image: {image_data_url[:200]}...]"
                            fallback_response = self.llm.generate(
                                system_prompt,
                                fallback_prompt,
                                temperature=0.0,
                                # No max_tokens - use API default
                            )
                            if self.verbose:
                                print(f" ⚠️ Using fallback method (may not support vision)")
                            return fallback_response
                        except Exception as fallback_error:
                            if self.verbose:
                                print(f" ❌ Fallback also failed: {fallback_error}")
                            raise vision_error  # Raise original error
            except ImportError:
                # openai or other vision SDK not available
                pass
            except Exception as e:
                if self.verbose:
                    print(f"⚠️ Vision API error: {e}, falling back to text-only")

        # Fallback: Try to pass image via kwargs or use text-only
        # Some providers might accept image in kwargs
        try:
            return self.llm.generate(
                system_prompt,
                f"{user_prompt}\n\n[Image data: {image_data_url[:100]}...]",
                temperature=0.0,
                # No max_tokens - use API default
            )
        except Exception as e:
            raise RuntimeError(
                f"LLM provider {type(self.llm).__name__} may not support vision. "
                f"Error: {e}. Use a vision-capable model like GPT-4o or Claude 3."
            ) from e

    def _extract_element_id(self, llm_response: str) -> int | None:
        """
        Extract element ID integer from LLM response.

        Args:
            llm_response: LLM response text

        Returns:
            Element ID as integer, or None if not found
        """
        if self.verbose:
            print(f"🔍 Raw LLM response: {repr(llm_response)}")

        # Clean the response - remove leading/trailing whitespace (handles '\n177', '177\n', etc.)
        cleaned = llm_response.strip()

        if self.verbose:
            print(f" 🧹 After strip: {repr(cleaned)}")

        # Remove common prefixes that LLMs might add
        prefixes_to_remove = [
            "element",
            "id",
            "the element",
            "element id",
            "the id",
            "click",
            "click on",
            "select",
            "choose",
        ]
        for prefix in prefixes_to_remove:
            if cleaned.lower().startswith(prefix):
                cleaned = cleaned[len(prefix) :].strip()
                # Remove any remaining punctuation
                cleaned = cleaned.lstrip(":.,;!?()[]{}")
                cleaned = cleaned.strip()
                if self.verbose:
                    print(f" 🧹 After removing prefix '{prefix}': {repr(cleaned)}")

        # Try to find all integers in the cleaned response
        numbers = re.findall(r"\d+", cleaned)

        if self.verbose:
            print(f" 🔢 Numbers found: {numbers}")

        if numbers:
            # If multiple numbers found, prefer the largest one (likely the actual element ID)
            # Element IDs are typically larger numbers, not small ones like "1"
            try:
                # Convert all to int
                int_numbers = [int(n) for n in numbers]
                if self.verbose:
                    print(f" 🔢 As integers: {int_numbers}")

                # Prefer larger numbers (element IDs are usually > 10)
                # But if only small numbers exist, use the first one
                large_numbers = [n for n in int_numbers if n > 10]
                if large_numbers:
                    element_id = max(large_numbers)  # Take the largest
                    if self.verbose:
                        print(f" ✅ Selected largest number > 10: {element_id}")
                else:
                    element_id = int_numbers[0]  # Fallback to first if all are small
                    if self.verbose:
                        print(f" ⚠️ All numbers ≤ 10, using first: {element_id}")

                if self.verbose:
                    print(f"✅ Extracted element ID: {element_id} (from {numbers})")
                return element_id
            except ValueError:
                if self.verbose:
                    print(f" ❌ Failed to convert numbers to integers")
                pass

        if self.verbose:
            print(f"⚠️ Could not extract element ID from response: {llm_response}")
        return None

    def _compute_hash(self, text: str) -> str:
        """Compute SHA256 hash of text."""
        return hashlib.sha256(text.encode("utf-8")).hexdigest()

    async def act(
        self,
        goal: str,
        max_retries: int = 2,
        snapshot_options: SnapshotOptions | None = None,
    ) -> AgentActionResult:
        """
        Override act() method to use visual prompting with full tracing support.

        Args:
            goal: User's goal/task
            max_retries: Maximum retry attempts
            snapshot_options: Optional snapshot options (screenshot will be enabled)

        Returns:
            AgentActionResult
        """
        if self.verbose:
            print(f"\n{'=' * 70}")
            print(f"🤖 Visual Agent Goal: {goal}")
            print(f"{'=' * 70}")

        # Generate step ID for tracing
        self._step_count += 1
        step_id = f"step-{self._step_count}"

        # Emit step_start trace event if tracer is enabled
        if self.tracer:
            pre_url = self.browser.page.url if self.browser.page else None
            _safe_tracer_call(
                self.tracer,
                "emit_step_start",
                self.verbose,
                step_id=step_id,
                step_index=self._step_count,
                goal=goal,
                attempt=0,
                pre_url=pre_url,
            )

        start_time = time.time()

        try:
            # Ensure screenshot is enabled
            if snapshot_options is None:
                snapshot_options = SnapshotOptions()

            # Enable screenshot if not already enabled
            if snapshot_options.screenshot is False or snapshot_options.screenshot is None:
                from .models import ScreenshotConfig

                snapshot_options.screenshot = ScreenshotConfig(format="png")

            # Set goal if not already provided
            if snapshot_options.goal is None:
                snapshot_options.goal = goal

            # Set limit if not provided
            if snapshot_options.limit is None:
                snapshot_options.limit = self.default_snapshot_limit

            if self.verbose:
                print(f"🎯 Goal: {goal}")
                print("📸 Taking snapshot with screenshot...")

            # 1. Take snapshot with screenshot
            from .snapshot import snapshot_async

            snap = await snapshot_async(self.browser, snapshot_options)

            if snap.status != "success":
                raise RuntimeError(f"Snapshot failed: {snap.error}")

            if not snap.screenshot:
                raise RuntimeError("Screenshot not available in snapshot")

            # Compute diff_status by comparing with previous snapshot
            elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)

            # Create snapshot with diff_status populated
            snap_with_diff = Snapshot(
                status=snap.status,
                timestamp=snap.timestamp,
                url=snap.url,
                viewport=snap.viewport,
                elements=elements_with_diff,
                screenshot=snap.screenshot,
                screenshot_format=snap.screenshot_format,
                error=snap.error,
            )

            # Update previous snapshot for next comparison
            self._previous_snapshot = snap

            # Emit snapshot trace event if tracer is enabled
            if self.tracer:
                # Build snapshot event data (use snap_with_diff to include diff_status)
                snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)

                # Always include screenshot in trace event for studio viewer compatibility
                if snap.screenshot:
                    # Extract base64 string from data URL if needed
                    if snap.screenshot.startswith("data:image"):
                        # Format: "data:image/jpeg;base64,{base64_string}"
                        screenshot_base64 = (
                            snap.screenshot.split(",", 1)[1]
                            if "," in snap.screenshot
                            else snap.screenshot
                        )
                    else:
                        screenshot_base64 = snap.screenshot

                    snapshot_data["screenshot_base64"] = screenshot_base64
                    if snap.screenshot_format:
                        snapshot_data["screenshot_format"] = snap.screenshot_format

                _safe_tracer_call(
                    self.tracer,
                    "emit",
                    self.verbose,
                    "snapshot",
                    snapshot_data,
                    step_id=step_id,
                )

            if self.verbose:
                print(f"✅ Snapshot taken: {len(snap.elements)} elements")

            # 2. Draw labeled screenshot
            if self.verbose:
                print("🎨 Drawing bounding boxes and labels...")
                print(f" Elements to label: {len(snap.elements)}")
                if len(snap.elements) > 0:
                    element_ids = [el.id for el in snap.elements[:10]]  # Show first 10
                    print(f" Sample element IDs: {element_ids}")

            labeled_image = self._draw_labeled_screenshot(snap, snap.elements)

            # Save labeled image to disk for debugging
            # Save to playground/images if running from playground, otherwise use current directory
            try:
                # Try to detect if we're in a playground context
                import sys

                cwd = Path.cwd()
                playground_path = None

                # Check if current working directory contains playground
                if (cwd / "playground").exists():
                    playground_path = cwd / "playground" / "images"
                else:
                    # Check sys.path for playground
                    for path_str in sys.path:
                        path_obj = Path(path_str)
                        if "playground" in str(path_obj) and path_obj.exists():
                            # Find the playground directory
                            if path_obj.name == "playground":
                                playground_path = path_obj / "images"
                                break
                            elif (path_obj / "playground").exists():
                                playground_path = path_obj / "playground" / "images"
                                break

                if playground_path is None:
                    # Fallback: use current working directory
                    playground_path = cwd / "playground" / "images"

                images_dir = playground_path
                images_dir.mkdir(parents=True, exist_ok=True)
                image_uuid = str(uuid.uuid4())
                image_filename = f"labeled_screenshot_{image_uuid}.png"
                image_path = images_dir / image_filename
                labeled_image.save(image_path, format="PNG")
                if self.verbose:
                    print(f" 💾 Saved labeled screenshot: {image_path.absolute()}")
            except Exception as save_error:
                # Don't fail if image save fails - it's just for debugging
                if self.verbose:
                    print(f" ⚠️ Could not save labeled screenshot: {save_error}")

            # Use JPEG for better compression (smaller file size for vision APIs)
            labeled_image_data_url = self._encode_image_to_base64(
                labeled_image, format="JPEG", max_size_mb=20.0
            )

            # 3. Query LLM with vision
            if self.verbose:
                print("🧠 Querying LLM with labeled screenshot...")

            llm_response = await self._query_llm_with_vision(labeled_image_data_url, goal)

            # Emit LLM query trace event if tracer is enabled
            if self.tracer:
                _safe_tracer_call(
                    self.tracer,
                    "emit",
                    self.verbose,
                    "llm_query",
                    {
                        "prompt_tokens": llm_response.prompt_tokens,
                        "completion_tokens": llm_response.completion_tokens,
                        "model": llm_response.model_name,
                        "response": llm_response.content[:200],  # Truncate for brevity
                    },
                    step_id=step_id,
                )

            if self.verbose:
                print(f"💭 LLM Response: {llm_response.content}")

            # Track token usage
            self._track_tokens(goal, llm_response)

            # 4. Extract element ID
            element_id = self._extract_element_id(llm_response.content)

            if element_id is None:
                raise ValueError(
                    f"Could not extract element ID from LLM response: {llm_response.content}"
                )

            if self.verbose:
                print(f"🎯 Extracted Element ID: {element_id}")

            # 5. Click the element
            if self.verbose:
                print(f"🖱️ Clicking element {element_id}...")

            click_result = await click_async(self.browser, element_id)

            duration_ms = int((time.time() - start_time) * 1000)

            # Create AgentActionResult from click result
            result = AgentActionResult(
                success=click_result.success,
                action="click",
                goal=goal,
                duration_ms=duration_ms,
                attempt=0,
                element_id=element_id,
                outcome=click_result.outcome,
                url_changed=click_result.url_changed,
                error=click_result.error,
            )

            # Emit action execution trace event if tracer is enabled
            if self.tracer:
                post_url = self.browser.page.url if self.browser.page else None

                # Include element data for live overlay visualization
                elements_data = [
                    {
                        "id": el.id,
                        "bbox": {
                            "x": el.bbox.x,
                            "y": el.bbox.y,
                            "width": el.bbox.width,
                            "height": el.bbox.height,
                        },
                        "role": el.role,
                        "text": el.text[:50] if el.text else "",
                    }
                    for el in snap.elements[:50]
                ]

                _safe_tracer_call(
                    self.tracer,
                    "emit",
                    self.verbose,
                    "action",
                    {
                        "action": result.action,
                        "element_id": result.element_id,
                        "success": result.success,
                        "outcome": result.outcome,
                        "duration_ms": duration_ms,
                        "post_url": post_url,
                        "elements": elements_data,  # Add element data for overlay
                        "target_element_id": result.element_id,  # Highlight target in red
                    },
                    step_id=step_id,
                )

            # Record history
            self.history.append(
                {
                    "goal": goal,
                    "action": f"CLICK({element_id})",
                    "result": result.model_dump(),  # Store as dict
                    "success": result.success,
                    "attempt": 0,
                    "duration_ms": duration_ms,
                }
            )

            if self.verbose:
                status = "✅" if result.success else "❌"
                print(f"{status} Completed in {duration_ms}ms")

            # Emit step completion trace event if tracer is enabled
            if self.tracer:
                # Get pre_url from step_start (stored in tracer or use current)
                pre_url = snap.url
                post_url = self.browser.page.url if self.browser.page else None

                # Compute snapshot digest (simplified - use URL + timestamp)
                snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"

                # Build LLM data
                llm_response_text = llm_response.content

                # Build execution data
                exec_data = {
                    "success": result.success,
                    "outcome": result.outcome,
                    "action": result.action,
                    "element_id": result.element_id,
                    "url_changed": result.url_changed,
                    "duration_ms": duration_ms,
                }

                # Build verification data (simplified - always pass for now)
                verify_data = {
                    "passed": result.success,
                    "signals": {
                        "url_changed": result.url_changed or False,
                    },
                }

                # Build complete step_end event
                step_end_data = TraceEventBuilder.build_step_end_event(
                    step_id=step_id,
                    step_index=self._step_count,
                    goal=goal,
                    attempt=0,
                    pre_url=pre_url,
                    post_url=post_url or pre_url,
                    snapshot_digest=snapshot_digest,
                    llm_data={
                        "response_text": llm_response_text,
                        "response_hash": f"sha256:{self._compute_hash(llm_response_text)}",
                    },
                    exec_data=exec_data,
                    verify_data=verify_data,
                )

                _safe_tracer_call(
                    self.tracer,
                    "emit",
                    self.verbose,
                    "step_end",
                    step_end_data,
                    step_id=step_id,
                )

            return result

        except Exception as e:
            # Emit error trace event if tracer is enabled
            if self.tracer:
                _safe_tracer_call(
                    self.tracer,
                    "emit_error",
                    self.verbose,
                    step_id=step_id,
                    error=str(e),
                    attempt=0,
                )

            if self.verbose:
                print(f"❌ Error: {e}")

            # Re-raise the exception
            raise

1151
|
+
class SentienceVisualAgent(SentienceAgent):
|
|
1152
|
+
"""
|
|
1153
|
+
Sync visual agent that uses labeled screenshots with vision-capable LLMs.
|
|
1154
|
+
|
|
1155
|
+
Extends SentienceAgent to override act() method with visual prompting.
|
|
1156
|
+
|
|
1157
|
+
Requirements:
|
|
1158
|
+
- Pillow (PIL): Required for image processing and drawing bounding boxes
|
|
1159
|
+
Install with: pip install Pillow
|
|
1160
|
+
- Vision-capable LLM: Requires an LLM provider that supports vision (e.g., GPT-4o, Claude 3)
|
|
1161
|
+
"""
|
|
1162
|
+
|
|
1163
|
+
def __init__(
|
|
1164
|
+
self,
|
|
1165
|
+
browser: SentienceBrowser,
|
|
1166
|
+
llm: LLMProvider,
|
|
1167
|
+
default_snapshot_limit: int = 50,
|
|
1168
|
+
verbose: bool = True,
|
|
1169
|
+
tracer: Any | None = None,
|
|
1170
|
+
config: Any | None = None,
|
|
1171
|
+
):
|
|
1172
|
+
"""
|
|
1173
|
+
Initialize Visual Agent
|
|
1174
|
+
|
|
1175
|
+
Args:
|
|
1176
|
+
browser: SentienceBrowser instance
|
|
1177
|
+
llm: LLM provider (must support vision, e.g., GPT-4o, Claude 3)
|
|
1178
|
+
default_snapshot_limit: Default maximum elements to include
|
|
1179
|
+
verbose: Print execution logs
|
|
1180
|
+
tracer: Optional Tracer instance
|
|
1181
|
+
config: Optional AgentConfig
|
|
1182
|
+
"""
|
|
1183
|
+
super().__init__(browser, llm, default_snapshot_limit, verbose, tracer, config)
|
|
1184
|
+
|
|
1185
|
+
if not PIL_AVAILABLE:
|
|
1186
|
+
raise ImportError(
|
|
1187
|
+
"PIL/Pillow is required for SentienceVisualAgent. Install with: pip install Pillow"
|
|
1188
|
+
)
|
|
1189
|
+
|
|
1190
|
+
# Track previous snapshot for diff computation
|
|
1191
|
+
self._previous_snapshot: Snapshot | None = None
|
|
1192
|
+
|
|
1193
|
+
def _decode_screenshot(self, screenshot_data_url: str) -> "PILImage.Image":
|
|
1194
|
+
"""
|
|
1195
|
+
Decode base64 screenshot data URL to PIL Image
|
|
1196
|
+
|
|
1197
|
+
Args:
|
|
1198
|
+
screenshot_data_url: Base64-encoded data URL (e.g., "data:image/png;base64,...")
|
|
1199
|
+
|
|
1200
|
+
Returns:
|
|
1201
|
+
PIL Image object
|
|
1202
|
+
"""
|
|
1203
|
+
# Extract base64 data from data URL
|
|
1204
|
+
if screenshot_data_url.startswith("data:image/"):
|
|
1205
|
+
# Format: "data:image/png;base64,<base64_data>"
|
|
1206
|
+
base64_data = screenshot_data_url.split(",", 1)[1]
|
|
1207
|
+
else:
|
|
1208
|
+
# Assume it's already base64
|
|
1209
|
+
base64_data = screenshot_data_url
|
|
1210
|
+
|
|
1211
|
+
# Decode base64 to bytes
|
|
1212
|
+
image_bytes = base64.b64decode(base64_data)
|
|
1213
|
+
|
|
1214
|
+
# Load image from bytes
|
|
1215
|
+
return PILImage.open(io.BytesIO(image_bytes))
|
|
1216
|
+
|
|
1217
|
+
def _find_label_position(
|
|
1218
|
+
self,
|
|
1219
|
+
element_bbox: dict[str, float],
|
|
1220
|
+
existing_labels: list[dict[str, float]],
|
|
1221
|
+
image_width: int,
|
|
1222
|
+
image_height: int,
|
|
1223
|
+
label_width: int,
|
|
1224
|
+
label_height: int,
|
|
1225
|
+
) -> tuple[int, int]:
|
|
1226
|
+
"""
|
|
1227
|
+
Find best position for label using anti-collision algorithm.
|
|
1228
|
+
|
|
1229
|
+
Tries 8 positions: 4 sides (top, bottom, left, right) + 4 corners.
|
|
1230
|
+
Returns the first position that doesn't collide with existing labels.
|
|
1231
|
+
|
|
1232
|
+
Args:
|
|
1233
|
+
element_bbox: Element bounding box {x, y, width, height}
|
|
1234
|
+
existing_labels: List of existing label bounding boxes
|
|
1235
|
+
image_width: Image width in pixels
|
|
1236
|
+
image_height: Image height in pixels
|
|
1237
|
+
label_width: Label width in pixels
|
|
1238
|
+
label_height: Label height in pixels
|
|
1239
|
+
|
|
1240
|
+
Returns:
|
|
1241
|
+
(x, y) position for label
|
|
1242
|
+
"""
|
|
1243
|
+
x, y = element_bbox["x"], element_bbox["y"]
|
|
1244
|
+
width, height = element_bbox["width"], element_bbox["height"]
|
|
1245
|
+
|
|
1246
|
+
# Offset from element edge
|
|
1247
|
+
label_offset = 15 # Increased from 5px for better separation
|
|
1248
|
+
|
|
1249
|
+
# Try 8 positions: top, bottom, left, right, top-left, top-right, bottom-left, bottom-right
|
|
1250
|
+
positions = [
|
|
1251
|
+
(int(x + width / 2 - label_width / 2), int(y - label_height - label_offset)), # Top
|
|
1252
|
+
(int(x + width / 2 - label_width / 2), int(y + height + label_offset)), # Bottom
|
|
1253
|
+
(int(x - label_width - label_offset), int(y + height / 2 - label_height / 2)), # Left
|
|
1254
|
+
(int(x + width + label_offset), int(y + height / 2 - label_height / 2)), # Right
|
|
1255
|
+
(int(x - label_width - label_offset), int(y - label_height - label_offset)), # Top-left
|
|
1256
|
+
(int(x + width + label_offset), int(y - label_height - label_offset)), # Top-right
|
|
1257
|
+
(int(x - label_width - label_offset), int(y + height + label_offset)), # Bottom-left
|
|
1258
|
+
(int(x + width + label_offset), int(y + height + label_offset)), # Bottom-right
|
|
1259
|
+
]
|
|
1260
|
+
|
|
1261
|
+
# Check each position for collisions
|
|
1262
|
+
for pos_x, pos_y in positions:
|
|
1263
|
+
# Check bounds
|
|
1264
|
+
if (
|
|
1265
|
+
pos_x < 0
|
|
1266
|
+
or pos_y < 0
|
|
1267
|
+
or pos_x + label_width > image_width
|
|
1268
|
+
or pos_y + label_height > image_height
|
|
1269
|
+
):
|
|
1270
|
+
continue
|
|
1271
|
+
|
|
1272
|
+
# Check collision with existing labels
|
|
1273
|
+
label_bbox = {
|
|
1274
|
+
"x": pos_x,
|
|
1275
|
+
"y": pos_y,
|
|
1276
|
+
"width": label_width,
|
|
1277
|
+
"height": label_height,
|
|
1278
|
+
}
|
|
1279
|
+
|
|
1280
|
+
collision = False
|
|
1281
|
+
for existing in existing_labels:
|
|
1282
|
+
# Simple AABB collision detection
|
|
1283
|
+
if not (
|
|
1284
|
+
label_bbox["x"] + label_bbox["width"] < existing["x"]
|
|
1285
|
+
or label_bbox["x"] > existing["x"] + existing["width"]
|
|
1286
|
+
or label_bbox["y"] + label_bbox["height"] < existing["y"]
|
|
1287
|
+
or label_bbox["y"] > existing["y"] + existing["height"]
|
|
1288
|
+
):
|
|
1289
|
+
collision = True
|
|
1290
|
+
break
|
|
1291
|
+
|
|
1292
|
+
if not collision:
|
|
1293
|
+
return (pos_x, pos_y)
|
|
1294
|
+
|
|
1295
|
+
# If all positions collide, use top position with increased offset
|
|
1296
|
+
return (int(x + width / 2 - label_width / 2), int(y - label_height - label_offset * 2))
|
|
1297
|
+
|
|
1298
|
+
```python
    def _draw_labeled_screenshot(
        self,
        snapshot: Snapshot,
        elements: list[Element],
    ) -> "PILImage.Image":
        """
        Draw labeled screenshot with bounding boxes and element IDs.

        Args:
            snapshot: Snapshot with screenshot data
            elements: List of elements to label

        Returns:
            PIL Image with labels drawn
        """
        # Decode screenshot
        img = self._decode_screenshot(snapshot.screenshot)
        draw = PILImageDraw.Draw(img)

        # Load font (fallback to default if not available)
        try:
            font = PILImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 16)
        except OSError:
            try:
                font = PILImageFont.truetype(
                    "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 16
                )
            except OSError:
                font = PILImageFont.load_default()

        image_width, image_height = img.size
        existing_labels: list[dict[str, float]] = []

        # Neon green color: #39FF14 (bright, vibrant green)
        neon_green = "#39FF14"

        for element in elements:
            bbox = element.bbox
            x, y, width, height = bbox.x, bbox.y, bbox.width, bbox.height

            # Draw bounding box rectangle (neon green with 2px width)
            draw.rectangle(
                [(x, y), (x + width, y + height)],
                outline=neon_green,
                width=2,
            )

            # Prepare label text (just the number - keep it simple and compact)
            label_text = str(element.id)

            # Measure label text size
            bbox_text = draw.textbbox((0, 0), label_text, font=font)
            label_width = bbox_text[2] - bbox_text[0]
            label_height = bbox_text[3] - bbox_text[1]

            # Find best position for label (anti-collision)
            label_x, label_y = self._find_label_position(
                {"x": x, "y": y, "width": width, "height": height},
                existing_labels,
                image_width,
                image_height,
                label_width + 8,  # Add padding
                label_height + 4,  # Add padding
            )

            # Calculate connection points for a clearer visual link
            element_center_x = x + width / 2
            element_center_y = y + height / 2
            label_center_x = label_x + label_width / 2
            label_center_y = label_y + label_height / 2

            # Determine which edge of the element is closest to the label
            dist_top = abs(label_center_y - y)
            dist_bottom = abs(label_center_y - (y + height))
            dist_left = abs(label_center_x - x)
            dist_right = abs(label_center_x - (x + width))

            min_dist = min(dist_top, dist_bottom, dist_left, dist_right)

            if min_dist == dist_top:
                line_start = (element_center_x, y)
            elif min_dist == dist_bottom:
                line_start = (element_center_x, y + height)
            elif min_dist == dist_left:
                line_start = (x, element_center_y)
            else:
                line_start = (x + width, element_center_y)

            # Draw connecting line from element edge to label
            draw.line(
                [line_start, (label_center_x, label_center_y)],
                fill=neon_green,
                width=2,
            )

            # Draw label background (white with neon green border)
            label_bg_x1 = label_x - 4
            label_bg_y1 = label_y - 2
            label_bg_x2 = label_x + label_width + 4
            label_bg_y2 = label_y + label_height + 2

            draw.rectangle(
                [(label_bg_x1, label_bg_y1), (label_bg_x2, label_bg_y2)],
                fill="white",
                outline=neon_green,
                width=2,
            )

            # Draw label text
            draw.text(
                (label_x, label_y),
                label_text,
                fill="black",
                font=font,
            )

            # Record label position for collision detection
            existing_labels.append(
                {
                    "x": label_bg_x1,
                    "y": label_bg_y1,
                    "width": label_bg_x2 - label_bg_x1,
                    "height": label_bg_y2 - label_bg_y1,
                }
            )

        return img
```
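The label size passed to the anti-collision search comes from Pillow's `textbbox` measurement plus the 8x4 px padding seen above. A minimal standalone sketch of that measurement step (hypothetical values, default font instead of the Helvetica/DejaVu fallbacks used in the method):

```python
from PIL import Image, ImageDraw, ImageFont

img = Image.new("RGB", (200, 100), "white")
draw = ImageDraw.Draw(img)
font = ImageFont.load_default()

label_text = "42"
left, top, right, bottom = draw.textbbox((0, 0), label_text, font=font)
label_width = right - left
label_height = bottom - top

# Padded box handed to the placement search, matching label_width + 8 / label_height + 4 above
print((label_width + 8, label_height + 4))
```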
```python
    def _encode_image_to_base64(
        self,
        image: "PILImage.Image",
        format: str = "PNG",
        max_size_mb: float = 20.0,
    ) -> str:
        """
        Encode PIL Image to base64 data URL with size optimization.

        Args:
            image: PIL Image object
            format: Output format ("PNG" or "JPEG")
            max_size_mb: Maximum size in MB (will compress if exceeded)

        Returns:
            Base64-encoded data URL
        """
        buffer = io.BytesIO()
        pil_format = format.upper()
        quality = 95  # Start with high quality

        # Convert RGBA to RGB for JPEG
        if pil_format == "JPEG" and image.mode == "RGBA":
            # Create white background
            rgb_image = Image.new("RGB", image.size, (255, 255, 255))
            rgb_image.paste(image, mask=image.split()[3])  # Use alpha channel as mask
            image = rgb_image

        # Try to fit within size limit
        for attempt in range(3):
            buffer.seek(0)
            buffer.truncate(0)

            if pil_format == "JPEG":
                image.save(buffer, format=pil_format, quality=quality, optimize=True)
            else:
                image.save(buffer, format=pil_format, optimize=True)

            size_mb = len(buffer.getvalue()) / (1024 * 1024)

            if size_mb <= max_size_mb:
                break

            # Reduce quality for next attempt
            quality = max(70, quality - 15)
            if self.verbose and attempt == 0:
                print(f" ⚠️ Image size {size_mb:.2f}MB exceeds limit, compressing...")

        image_bytes = buffer.getvalue()
        base64_data = base64.b64encode(image_bytes).decode("utf-8")

        final_size_mb = len(image_bytes) / (1024 * 1024)
        if self.verbose:
            print(f" 📸 Image encoded: {final_size_mb:.2f}MB ({len(base64_data)} chars base64)")

        mime_type = "image/png" if pil_format == "PNG" else "image/jpeg"
        return f"data:{mime_type};base64,{base64_data}"
```
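The encoder bounds payload size by re-saving at progressively lower JPEG quality (95, then down in steps of 15 with a floor of 70) until the result fits the budget. A self-contained sketch of the same quality-step-down idea, using a hypothetical helper name and a byte budget instead of megabytes:

```python
import base64
import io

from PIL import Image


def encode_under_budget(image: Image.Image, max_bytes: int = 20 * 1024 * 1024) -> str:
    """Standalone sketch of the quality-step-down JPEG encoding used above."""
    quality = 95
    for _ in range(3):
        buffer = io.BytesIO()
        # Convert to RGB so RGBA screenshots can be saved as JPEG
        image.convert("RGB").save(buffer, format="JPEG", quality=quality, optimize=True)
        if buffer.tell() <= max_bytes:
            break
        quality = max(70, quality - 15)  # same floor and step as the method above
    return "data:image/jpeg;base64," + base64.b64encode(buffer.getvalue()).decode("utf-8")


print(encode_under_budget(Image.new("RGB", (64, 64), "white"))[:40])
```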
```python
    def _query_llm_with_vision(
        self,
        image_data_url: str,
        goal: str,
    ) -> LLMResponse:
        """
        Query LLM with vision (labeled screenshot) - sync version.

        Args:
            image_data_url: Base64-encoded image data URL
            goal: User's goal/task

        Returns:
            LLMResponse with element ID
        """
        # Use the same prompt as async version
        system_prompt = """You are a web automation assistant. You will see a screenshot of a web page with labeled element IDs.
Each clickable element has:
- A bright neon green (#39FF14) bounding box around the element
- A white label box with a number (the element ID) connected by a green line
- The label is clearly separate from the element (not part of the UI)

CRITICAL INSTRUCTIONS:
1. Look at the screenshot carefully
2. Find the element that matches the user's goal (ignore the white label boxes - they are annotations, not UI elements)
3. Follow the green line from that element to find its label box with the ID number
4. Respond with ONLY that integer ID number (e.g., "42" or "1567")
5. Do NOT include any explanation, reasoning, or other text
6. Do NOT say "element 1" or "the first element" - just return the number
7. Do NOT confuse the white label box with an interactive element - labels are annotations connected by green lines

Example responses:
- Correct: "42"
- Correct: "1567"
- Wrong: "I see element 42"
- Wrong: "The element ID is 42"
- Wrong: "42 (the search box)" """

        user_prompt = f"""Goal: {goal}

Look at the screenshot. Each element has a neon green bounding box with a white label showing its ID number.
Find the element that should be clicked to accomplish this goal.
Return ONLY the integer ID number from the label, nothing else."""

        # Check if LLM provider supports vision (OpenAI GPT-4o, Claude, etc.)
        if hasattr(self.llm, "client") and hasattr(self.llm.client, "chat"):
            # Vision-capable provider - use vision API
            try:
                from openai import OpenAI

                # Check if it's OpenAI
                if isinstance(self.llm.client, OpenAI):
                    messages = [
                        {
                            "role": "system",
                            "content": system_prompt,
                        },
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": user_prompt},
                                {
                                    "type": "image_url",
                                    "image_url": {"url": image_data_url},
                                },
                            ],
                        },
                    ]

                    response = self.llm.client.chat.completions.create(
                        model=self.llm._model_name,
                        messages=messages,
                        temperature=0.0,
                    )

                    content = response.choices[0].message.content or ""
                    usage = response.usage

                    from .llm_response_builder import LLMResponseBuilder

                    return LLMResponseBuilder.from_openai_format(
                        content=content,
                        prompt_tokens=usage.prompt_tokens if usage else None,
                        completion_tokens=usage.completion_tokens if usage else None,
                        total_tokens=usage.total_tokens if usage else None,
                        model_name=response.model,
                        finish_reason=response.choices[0].finish_reason,
                    )

                # Check if provider supports vision API (uses OpenAI-compatible format)
                elif hasattr(self.llm, "client") and hasattr(self.llm.client, "chat"):
                    if self.verbose:
                        print(f" 🔍 Using vision API with model: {self.llm._model_name}")
                        print(f" 📐 Image data URL length: {len(image_data_url)} chars")

                    messages = [
                        {
                            "role": "system",
                            "content": system_prompt,
                        },
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": user_prompt},
                                {
                                    "type": "image_url",
                                    "image_url": {"url": image_data_url},
                                },
                            ],
                        },
                    ]

                    try:
                        if self.verbose:
                            print(f" 📤 Sending request to vision API...")

                        response = self.llm.client.chat.completions.create(
                            model=self.llm._model_name,
                            messages=messages,
                            temperature=0.0,
                        )

                        if not hasattr(response, "choices") or len(response.choices) == 0:
                            raise ValueError("Vision API returned no choices in response")

                        choice = response.choices[0]
                        content = (
                            choice.message.content if hasattr(choice.message, "content") else None
                        )
                        finish_reason = (
                            choice.finish_reason if hasattr(choice, "finish_reason") else None
                        )

                        if content is None or content == "":
                            error_msg = f"Vision API returned empty content (finish_reason: {finish_reason})"
                            if self.verbose:
                                print(f" ❌ {error_msg}")
                            raise ValueError(error_msg)

                        usage = response.usage if hasattr(response, "usage") else None

                        from .llm_response_builder import LLMResponseBuilder

                        return LLMResponseBuilder.from_openai_format(
                            content=content,
                            prompt_tokens=usage.prompt_tokens if usage else None,
                            completion_tokens=usage.completion_tokens if usage else None,
                            total_tokens=usage.total_tokens if usage else None,
                            model_name=(
                                response.model
                                if hasattr(response, "model")
                                else self.llm._model_name
                            ),
                            finish_reason=finish_reason,
                        )
                    except Exception as vision_error:
                        if self.verbose:
                            print(f" ❌ Vision API error: {vision_error}")
                            print(f" 🔄 Attempting fallback to regular generate method...")

                        # Fallback: Try using the regular generate method
                        try:
                            fallback_prompt = f"{user_prompt}\n\n[Image: {image_data_url[:200]}...]"
                            fallback_response = self.llm.generate(
                                system_prompt,
                                fallback_prompt,
                                temperature=0.0,
                            )
                            if self.verbose:
                                print(f" ⚠️ Using fallback method (may not support vision)")
                            return fallback_response
                        except Exception as fallback_error:
                            if self.verbose:
                                print(f" ❌ Fallback also failed: {fallback_error}")
                            raise vision_error  # Raise original error
            except ImportError:
                # openai or other vision SDK not available
                pass
            except Exception as e:
                if self.verbose:
                    print(f"⚠️ Vision API error: {e}, falling back to text-only")

        # Fallback: Try to pass image via kwargs or use text-only
        try:
            return self.llm.generate(
                system_prompt,
                f"{user_prompt}\n\n[Image data: {image_data_url[:100]}...]",
                temperature=0.0,
            )
        except Exception as e:
            raise RuntimeError(
                f"LLM provider {type(self.llm).__name__} may not support vision. "
                f"Error: {e}. Use a vision-capable model like GPT-4o or Claude 3."
            ) from e
```
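The prompt above constrains the model to answer with a bare integer ID, and the sync `_extract_element_id` below simply delegates to the async agent's parser, which is not part of this hunk. A minimal sketch of the kind of parsing such a helper needs (an assumption for illustration, not the shipped logic):

```python
import re


def extract_element_id(reply: str) -> int | None:
    """Pull the first integer out of a model reply such as '42' or 'The element ID is 42'."""
    match = re.search(r"\d+", reply)
    return int(match.group()) if match else None


assert extract_element_id("42") == 42
assert extract_element_id("The element ID is 1567") == 1567
assert extract_element_id("no id here") is None
```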
```python
    def _extract_element_id(self, llm_response: str) -> int | None:
        """Extract element ID integer from LLM response (shared with async version)."""
        return SentienceVisualAgentAsync._extract_element_id(self, llm_response)

    def _compute_hash(self, text: str) -> str:
        """Compute SHA256 hash of text."""
        return hashlib.sha256(text.encode("utf-8")).hexdigest()
```
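`act()` below uses `_compute_hash` to record a snapshot digest in the `step_end` trace event as `sha256:<hex>` over the pre-action URL concatenated with the snapshot timestamp. A standalone illustration with made-up values (the URL and timestamp here are assumptions):

```python
import hashlib

pre_url = "https://example.com/login"        # assumed value
snapshot_timestamp = "2024-01-01T00:00:00Z"  # assumed value

digest = hashlib.sha256(f"{pre_url}{snapshot_timestamp}".encode("utf-8")).hexdigest()
snapshot_digest = f"sha256:{digest}"
print(snapshot_digest)  # e.g. sha256:3f9a...
```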
```python
    def act(
        self,
        goal: str,
        max_retries: int = 2,
        snapshot_options: SnapshotOptions | None = None,
    ) -> AgentActionResult:
        """
        Override act() method to use visual prompting with full tracing support.

        Args:
            goal: User's goal/task
            max_retries: Maximum retry attempts
            snapshot_options: Optional snapshot options (screenshot will be enabled)

        Returns:
            AgentActionResult
        """
        if self.verbose:
            print(f"\n{'=' * 70}")
            print(f"🤖 Visual Agent Goal: {goal}")
            print(f"{'=' * 70}")

        # Generate step ID for tracing
        self._step_count += 1
        step_id = f"step-{self._step_count}"

        # Emit step_start trace event if tracer is enabled
        if self.tracer:
            pre_url = self.browser.page.url if self.browser.page else None
            _safe_tracer_call(
                self.tracer,
                "emit_step_start",
                self.verbose,
                step_id=step_id,
                step_index=self._step_count,
                goal=goal,
                attempt=0,
                pre_url=pre_url,
            )

        start_time = time.time()

        try:
            # Ensure screenshot is enabled
            if snapshot_options is None:
                snapshot_options = SnapshotOptions()

            # Enable screenshot if not already enabled
            if snapshot_options.screenshot is False or snapshot_options.screenshot is None:
                from .models import ScreenshotConfig

                snapshot_options.screenshot = ScreenshotConfig(format="png")

            # Set goal if not already provided
            if snapshot_options.goal is None:
                snapshot_options.goal = goal

            # Set limit if not provided
            if snapshot_options.limit is None:
                snapshot_options.limit = self.default_snapshot_limit

            if self.verbose:
                print(f"🎯 Goal: {goal}")
                print("📸 Taking snapshot with screenshot...")

            # 1. Take snapshot with screenshot (sync version)
            snap = snapshot(self.browser, snapshot_options)

            if snap.status != "success":
                raise RuntimeError(f"Snapshot failed: {snap.error}")

            if not snap.screenshot:
                raise RuntimeError("Screenshot not available in snapshot")

            # Compute diff_status by comparing with previous snapshot
            elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)

            # Create snapshot with diff_status populated
            snap_with_diff = Snapshot(
                status=snap.status,
                timestamp=snap.timestamp,
                url=snap.url,
                viewport=snap.viewport,
                elements=elements_with_diff,
                screenshot=snap.screenshot,
                screenshot_format=snap.screenshot_format,
                error=snap.error,
            )

            # Update previous snapshot for next comparison
            self._previous_snapshot = snap

            # Emit snapshot trace event if tracer is enabled
            if self.tracer:
                # Build snapshot event data (use snap_with_diff to include diff_status)
                snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)

                # Always include screenshot in trace event for studio viewer compatibility
                if snap.screenshot:
                    # Extract base64 string from data URL if needed
                    if snap.screenshot.startswith("data:image"):
                        # Format: "data:image/jpeg;base64,{base64_string}"
                        screenshot_base64 = (
                            snap.screenshot.split(",", 1)[1]
                            if "," in snap.screenshot
                            else snap.screenshot
                        )
                    else:
                        screenshot_base64 = snap.screenshot

                    snapshot_data["screenshot_base64"] = screenshot_base64
                    if snap.screenshot_format:
                        snapshot_data["screenshot_format"] = snap.screenshot_format

                _safe_tracer_call(
                    self.tracer,
                    "emit",
                    self.verbose,
                    "snapshot",
                    snapshot_data,
                    step_id=step_id,
                )

            if self.verbose:
                print(f"✅ Snapshot taken: {len(snap.elements)} elements")

            # 2. Draw labeled screenshot
            if self.verbose:
                print("🎨 Drawing bounding boxes and labels...")
                print(f" Elements to label: {len(snap.elements)}")
                if len(snap.elements) > 0:
                    element_ids = [el.id for el in snap.elements[:10]]  # Show first 10
                    print(f" Sample element IDs: {element_ids}")

            labeled_image = self._draw_labeled_screenshot(snap, snap.elements)

            # Save labeled image to disk for debugging
            # Save to playground/images if running from playground, otherwise use current directory
            try:
                # Try to detect if we're in a playground context
                import sys

                cwd = Path.cwd()
                playground_path = None

                # Check if current working directory contains playground
                if (cwd / "playground").exists():
                    playground_path = cwd / "playground" / "images"
                else:
                    # Check sys.path for playground
                    for path_str in sys.path:
                        path_obj = Path(path_str)
                        if "playground" in str(path_obj) and path_obj.exists():
                            # Find the playground directory
                            if path_obj.name == "playground":
                                playground_path = path_obj / "images"
                                break
                            elif (path_obj / "playground").exists():
                                playground_path = path_obj / "playground" / "images"
                                break

                if playground_path is None:
                    # Fallback: use current working directory
                    playground_path = cwd / "playground" / "images"

                images_dir = playground_path
                images_dir.mkdir(parents=True, exist_ok=True)
                image_uuid = str(uuid.uuid4())
                image_filename = f"labeled_screenshot_{image_uuid}.png"
                image_path = images_dir / image_filename
                labeled_image.save(image_path, format="PNG")
                if self.verbose:
                    print(f" 💾 Saved labeled screenshot: {image_path.absolute()}")
            except Exception as save_error:
                # Don't fail if image save fails - it's just for debugging
                if self.verbose:
                    print(f" ⚠️ Could not save labeled screenshot: {save_error}")

            # Use JPEG for better compression (smaller file size for vision APIs)
            labeled_image_data_url = self._encode_image_to_base64(
                labeled_image, format="JPEG", max_size_mb=20.0
            )

            # 3. Query LLM with vision (sync version)
            if self.verbose:
                print("🧠 Querying LLM with labeled screenshot...")

            llm_response = self._query_llm_with_vision(labeled_image_data_url, goal)

            # Emit LLM query trace event if tracer is enabled
            if self.tracer:
                _safe_tracer_call(
                    self.tracer,
                    "emit",
                    self.verbose,
                    "llm_query",
                    {
                        "prompt_tokens": llm_response.prompt_tokens,
                        "completion_tokens": llm_response.completion_tokens,
                        "model": llm_response.model_name,
                        "response": llm_response.content[:200],  # Truncate for brevity
                    },
                    step_id=step_id,
                )

            if self.verbose:
                print(f"💭 LLM Response: {llm_response.content}")

            # Track token usage
            self._track_tokens(goal, llm_response)

            # 4. Extract element ID
            element_id = self._extract_element_id(llm_response.content)

            if element_id is None:
                raise ValueError(
                    f"Could not extract element ID from LLM response: {llm_response.content}"
                )

            if self.verbose:
                print(f"🎯 Extracted Element ID: {element_id}")

            # 5. Click the element (sync version)
            if self.verbose:
                print(f"🖱️ Clicking element {element_id}...")

            click_result = click(self.browser, element_id)

            duration_ms = int((time.time() - start_time) * 1000)

            # Create AgentActionResult from click result
            result = AgentActionResult(
                success=click_result.success,
                action="click",
                goal=goal,
                duration_ms=duration_ms,
                attempt=0,
                element_id=element_id,
                outcome=click_result.outcome,
                url_changed=click_result.url_changed,
                error=click_result.error,
            )

            # Emit action execution trace event if tracer is enabled
            if self.tracer:
                post_url = self.browser.page.url if self.browser.page else None

                # Include element data for live overlay visualization
                elements_data = [
                    {
                        "id": el.id,
                        "bbox": {
                            "x": el.bbox.x,
                            "y": el.bbox.y,
                            "width": el.bbox.width,
                            "height": el.bbox.height,
                        },
                        "role": el.role,
                        "text": el.text[:50] if el.text else "",
                    }
                    for el in snap.elements[:50]
                ]

                _safe_tracer_call(
                    self.tracer,
                    "emit",
                    self.verbose,
                    "action",
                    {
                        "action": result.action,
                        "element_id": result.element_id,
                        "success": result.success,
                        "outcome": result.outcome,
                        "duration_ms": duration_ms,
                        "post_url": post_url,
                        "elements": elements_data,  # Add element data for overlay
                        "target_element_id": result.element_id,  # Highlight target in red
                    },
                    step_id=step_id,
                )

            # Record history
            self.history.append(
                {
                    "goal": goal,
                    "action": f"CLICK({element_id})",
                    "result": result.model_dump(),  # Store as dict
                    "success": result.success,
                    "attempt": 0,
                    "duration_ms": duration_ms,
                }
            )

            if self.verbose:
                status = "✅" if result.success else "❌"
                print(f"{status} Completed in {duration_ms}ms")

            # Emit step completion trace event if tracer is enabled
            if self.tracer:
                # Get pre_url from step_start (stored in tracer or use current)
                pre_url = snap.url
                post_url = self.browser.page.url if self.browser.page else None

                # Compute snapshot digest (simplified - use URL + timestamp)
                snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"

                # Build LLM data
                llm_response_text = llm_response.content

                # Build execution data
                exec_data = {
                    "success": result.success,
                    "outcome": result.outcome,
                    "action": result.action,
                    "element_id": result.element_id,
                    "url_changed": result.url_changed,
                    "duration_ms": duration_ms,
                }

                # Build verification data (simplified - always pass for now)
                verify_data = {
                    "passed": result.success,
                    "signals": {
                        "url_changed": result.url_changed or False,
                    },
                }

                # Build complete step_end event
                step_end_data = TraceEventBuilder.build_step_end_event(
                    step_id=step_id,
                    step_index=self._step_count,
                    goal=goal,
                    attempt=0,
                    pre_url=pre_url,
                    post_url=post_url or pre_url,
                    snapshot_digest=snapshot_digest,
                    llm_data={
                        "response_text": llm_response_text,
                        "response_hash": f"sha256:{self._compute_hash(llm_response_text)}",
                    },
                    exec_data=exec_data,
                    verify_data=verify_data,
                )

                _safe_tracer_call(
                    self.tracer,
                    "emit",
                    self.verbose,
                    "step_end",
                    step_end_data,
                    step_id=step_id,
                )

            return result

        except Exception as e:
            # Emit error trace event if tracer is enabled
            if self.tracer:
                _safe_tracer_call(
                    self.tracer,
                    "emit_error",
                    self.verbose,
                    step_id=step_id,
                    error=str(e),
                    attempt=0,
                )

            if self.verbose:
                print(f"❌ Error: {e}")

            # Re-raise the exception
            raise
```
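As a usage sketch of the flow above, assuming an already-constructed sync visual agent instance (the concrete class name and constructor live outside this hunk), a single step returns an `AgentActionResult` whose fields match those populated in `act()`:

```python
# Hypothetical usage; `agent`, its browser, and its LLM provider are assumed to exist.
result = agent.act("Click the 'Sign in' button")

if result.success:
    print(f"Clicked element {result.element_id} in {result.duration_ms}ms")
else:
    print(f"Action failed: {result.error}")
```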