sentienceapi 0.90.16__py3-none-any.whl → 0.98.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sentienceapi might be problematic. Click here for more details.
- sentience/__init__.py +120 -6
- sentience/_extension_loader.py +156 -1
- sentience/action_executor.py +217 -0
- sentience/actions.py +758 -30
- sentience/agent.py +806 -293
- sentience/agent_config.py +3 -0
- sentience/agent_runtime.py +840 -0
- sentience/asserts/__init__.py +70 -0
- sentience/asserts/expect.py +621 -0
- sentience/asserts/query.py +383 -0
- sentience/async_api.py +89 -1141
- sentience/backends/__init__.py +137 -0
- sentience/backends/actions.py +372 -0
- sentience/backends/browser_use_adapter.py +241 -0
- sentience/backends/cdp_backend.py +393 -0
- sentience/backends/exceptions.py +211 -0
- sentience/backends/playwright_backend.py +194 -0
- sentience/backends/protocol.py +216 -0
- sentience/backends/sentience_context.py +469 -0
- sentience/backends/snapshot.py +483 -0
- sentience/base_agent.py +95 -0
- sentience/browser.py +678 -39
- sentience/browser_evaluator.py +299 -0
- sentience/canonicalization.py +207 -0
- sentience/cloud_tracing.py +507 -42
- sentience/constants.py +6 -0
- sentience/conversational_agent.py +77 -43
- sentience/cursor_policy.py +142 -0
- sentience/element_filter.py +136 -0
- sentience/expect.py +98 -2
- sentience/extension/background.js +56 -185
- sentience/extension/content.js +150 -287
- sentience/extension/injected_api.js +1088 -1368
- sentience/extension/manifest.json +1 -1
- sentience/extension/pkg/sentience_core.d.ts +22 -22
- sentience/extension/pkg/sentience_core.js +275 -433
- sentience/extension/pkg/sentience_core_bg.wasm +0 -0
- sentience/extension/release.json +47 -47
- sentience/failure_artifacts.py +241 -0
- sentience/formatting.py +9 -53
- sentience/inspector.py +183 -1
- sentience/integrations/__init__.py +6 -0
- sentience/integrations/langchain/__init__.py +12 -0
- sentience/integrations/langchain/context.py +18 -0
- sentience/integrations/langchain/core.py +326 -0
- sentience/integrations/langchain/tools.py +180 -0
- sentience/integrations/models.py +46 -0
- sentience/integrations/pydanticai/__init__.py +15 -0
- sentience/integrations/pydanticai/deps.py +20 -0
- sentience/integrations/pydanticai/toolset.py +468 -0
- sentience/llm_interaction_handler.py +191 -0
- sentience/llm_provider.py +765 -66
- sentience/llm_provider_utils.py +120 -0
- sentience/llm_response_builder.py +153 -0
- sentience/models.py +595 -3
- sentience/ordinal.py +280 -0
- sentience/overlay.py +109 -2
- sentience/protocols.py +228 -0
- sentience/query.py +67 -5
- sentience/read.py +95 -3
- sentience/recorder.py +223 -3
- sentience/schemas/trace_v1.json +128 -9
- sentience/screenshot.py +48 -2
- sentience/sentience_methods.py +86 -0
- sentience/snapshot.py +599 -55
- sentience/snapshot_diff.py +126 -0
- sentience/text_search.py +120 -5
- sentience/trace_event_builder.py +148 -0
- sentience/trace_file_manager.py +197 -0
- sentience/trace_indexing/index_schema.py +95 -7
- sentience/trace_indexing/indexer.py +105 -48
- sentience/tracer_factory.py +120 -9
- sentience/tracing.py +172 -8
- sentience/utils/__init__.py +40 -0
- sentience/utils/browser.py +46 -0
- sentience/{utils.py → utils/element.py} +3 -42
- sentience/utils/formatting.py +59 -0
- sentience/verification.py +618 -0
- sentience/visual_agent.py +2058 -0
- sentience/wait.py +68 -2
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/METADATA +199 -40
- sentienceapi-0.98.0.dist-info/RECORD +92 -0
- sentience/extension/test-content.js +0 -4
- sentienceapi-0.90.16.dist-info/RECORD +0 -50
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/WHEEL +0 -0
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/entry_points.txt +0 -0
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE +0 -0
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE-APACHE +0 -0
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE-MIT +0 -0
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/top_level.txt +0 -0
sentience/agent.py
CHANGED
|
@@ -3,13 +3,17 @@ Sentience Agent: High-level automation agent using LLM + SDK
|
|
|
3
3
|
Implements observe-think-act loop for natural language commands
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
|
-
import
|
|
6
|
+
import asyncio
|
|
7
|
+
import hashlib
|
|
7
8
|
import time
|
|
8
|
-
from typing import TYPE_CHECKING, Any, Optional
|
|
9
|
-
|
|
10
|
-
from .
|
|
11
|
-
from .
|
|
12
|
-
from .
|
|
9
|
+
from typing import TYPE_CHECKING, Any, Optional, Union
|
|
10
|
+
|
|
11
|
+
from .action_executor import ActionExecutor
|
|
12
|
+
from .agent_config import AgentConfig
|
|
13
|
+
from .base_agent import BaseAgent, BaseAgentAsync
|
|
14
|
+
from .browser import AsyncSentienceBrowser, SentienceBrowser
|
|
15
|
+
from .element_filter import ElementFilter
|
|
16
|
+
from .llm_interaction_handler import LLMInteractionHandler
|
|
13
17
|
from .llm_provider import LLMProvider, LLMResponse
|
|
14
18
|
from .models import (
|
|
15
19
|
ActionHistory,
|
|
@@ -21,13 +25,46 @@ from .models import (
|
|
|
21
25
|
SnapshotOptions,
|
|
22
26
|
TokenStats,
|
|
23
27
|
)
|
|
24
|
-
from .
|
|
28
|
+
from .protocols import AsyncBrowserProtocol, BrowserProtocol
|
|
29
|
+
from .snapshot import snapshot, snapshot_async
|
|
30
|
+
from .snapshot_diff import SnapshotDiff
|
|
31
|
+
from .trace_event_builder import TraceEventBuilder
|
|
25
32
|
|
|
26
33
|
if TYPE_CHECKING:
|
|
27
|
-
from .agent_config import AgentConfig
|
|
28
34
|
from .tracing import Tracer
|
|
29
35
|
|
|
30
36
|
|
|
37
|
+
def _safe_tracer_call(
|
|
38
|
+
tracer: Optional["Tracer"], method_name: str, verbose: bool, *args, **kwargs
|
|
39
|
+
) -> None:
|
|
40
|
+
"""
|
|
41
|
+
Safely call tracer method, catching and logging errors without breaking execution.
|
|
42
|
+
|
|
43
|
+
Args:
|
|
44
|
+
tracer: Tracer instance or None
|
|
45
|
+
method_name: Name of tracer method to call (e.g., "emit", "emit_error")
|
|
46
|
+
verbose: Whether to print error messages
|
|
47
|
+
*args: Positional arguments for the tracer method
|
|
48
|
+
**kwargs: Keyword arguments for the tracer method
|
|
49
|
+
"""
|
|
50
|
+
if not tracer:
|
|
51
|
+
return
|
|
52
|
+
try:
|
|
53
|
+
method = getattr(tracer, method_name)
|
|
54
|
+
if args and kwargs:
|
|
55
|
+
method(*args, **kwargs)
|
|
56
|
+
elif args:
|
|
57
|
+
method(*args)
|
|
58
|
+
elif kwargs:
|
|
59
|
+
method(**kwargs)
|
|
60
|
+
else:
|
|
61
|
+
method()
|
|
62
|
+
except Exception as tracer_error:
|
|
63
|
+
# Tracer errors should not break agent execution
|
|
64
|
+
if verbose:
|
|
65
|
+
print(f"⚠️ Tracer error (non-fatal): {tracer_error}")
|
|
66
|
+
|
|
67
|
+
|
|
31
68
|
class SentienceAgent(BaseAgent):
|
|
32
69
|
"""
|
|
33
70
|
High-level agent that combines Sentience SDK with any LLM provider.
|
|
@@ -54,7 +91,7 @@ class SentienceAgent(BaseAgent):
|
|
|
54
91
|
|
|
55
92
|
def __init__(
|
|
56
93
|
self,
|
|
57
|
-
browser: SentienceBrowser,
|
|
94
|
+
browser: SentienceBrowser | BrowserProtocol,
|
|
58
95
|
llm: LLMProvider,
|
|
59
96
|
default_snapshot_limit: int = 50,
|
|
60
97
|
verbose: bool = True,
|
|
@@ -65,7 +102,8 @@ class SentienceAgent(BaseAgent):
|
|
|
65
102
|
Initialize Sentience Agent
|
|
66
103
|
|
|
67
104
|
Args:
|
|
68
|
-
browser: SentienceBrowser instance
|
|
105
|
+
browser: SentienceBrowser instance or BrowserProtocol-compatible object
|
|
106
|
+
(for testing, can use mock objects that implement BrowserProtocol)
|
|
69
107
|
llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.)
|
|
70
108
|
default_snapshot_limit: Default maximum elements to include in context (default: 50)
|
|
71
109
|
verbose: Print execution logs (default: True)
|
|
@@ -77,8 +115,13 @@ class SentienceAgent(BaseAgent):
|
|
|
77
115
|
self.default_snapshot_limit = default_snapshot_limit
|
|
78
116
|
self.verbose = verbose
|
|
79
117
|
self.tracer = tracer
|
|
80
|
-
self.config = config
|
|
118
|
+
self.config = config or AgentConfig()
|
|
119
|
+
|
|
120
|
+
# Initialize handlers
|
|
121
|
+
self.llm_handler = LLMInteractionHandler(llm)
|
|
122
|
+
self.action_executor = ActionExecutor(browser)
|
|
81
123
|
|
|
124
|
+
# Screenshot sequence counter
|
|
82
125
|
# Execution history
|
|
83
126
|
self.history: list[dict[str, Any]] = []
|
|
84
127
|
|
|
@@ -93,6 +136,27 @@ class SentienceAgent(BaseAgent):
|
|
|
93
136
|
# Step counter for tracing
|
|
94
137
|
self._step_count = 0
|
|
95
138
|
|
|
139
|
+
# Previous snapshot for diff detection
|
|
140
|
+
self._previous_snapshot: Snapshot | None = None
|
|
141
|
+
|
|
142
|
+
def _compute_hash(self, text: str) -> str:
|
|
143
|
+
"""Compute SHA256 hash of text."""
|
|
144
|
+
return hashlib.sha256(text.encode("utf-8")).hexdigest()
|
|
145
|
+
|
|
146
|
+
def _get_element_bbox(self, element_id: int | None, snap: Snapshot) -> dict[str, float] | None:
|
|
147
|
+
"""Get bounding box for an element from snapshot."""
|
|
148
|
+
if element_id is None:
|
|
149
|
+
return None
|
|
150
|
+
for el in snap.elements:
|
|
151
|
+
if el.id == element_id:
|
|
152
|
+
return {
|
|
153
|
+
"x": el.bbox.x,
|
|
154
|
+
"y": el.bbox.y,
|
|
155
|
+
"width": el.bbox.width,
|
|
156
|
+
"height": el.bbox.height,
|
|
157
|
+
}
|
|
158
|
+
return None
|
|
159
|
+
|
|
96
160
|
def act( # noqa: C901
|
|
97
161
|
self,
|
|
98
162
|
goal: str,
|
|
@@ -130,7 +194,10 @@ class SentienceAgent(BaseAgent):
|
|
|
130
194
|
# Emit step_start trace event if tracer is enabled
|
|
131
195
|
if self.tracer:
|
|
132
196
|
pre_url = self.browser.page.url if self.browser.page else None
|
|
133
|
-
|
|
197
|
+
_safe_tracer_call(
|
|
198
|
+
self.tracer,
|
|
199
|
+
"emit_step_start",
|
|
200
|
+
self.verbose,
|
|
134
201
|
step_id=step_id,
|
|
135
202
|
step_index=self._step_count,
|
|
136
203
|
goal=goal,
|
|
@@ -149,66 +216,107 @@ class SentienceAgent(BaseAgent):
|
|
|
149
216
|
if snap_opts.goal is None:
|
|
150
217
|
snap_opts.goal = goal
|
|
151
218
|
|
|
219
|
+
# Apply AgentConfig screenshot settings if not overridden by snapshot_options
|
|
220
|
+
if snapshot_options is None and self.config:
|
|
221
|
+
if self.config.capture_screenshots:
|
|
222
|
+
# Create ScreenshotConfig from AgentConfig
|
|
223
|
+
snap_opts.screenshot = ScreenshotConfig(
|
|
224
|
+
format=self.config.screenshot_format,
|
|
225
|
+
quality=(
|
|
226
|
+
self.config.screenshot_quality
|
|
227
|
+
if self.config.screenshot_format == "jpeg"
|
|
228
|
+
else None
|
|
229
|
+
),
|
|
230
|
+
)
|
|
231
|
+
else:
|
|
232
|
+
snap_opts.screenshot = False
|
|
233
|
+
# Apply show_overlay from AgentConfig
|
|
234
|
+
snap_opts.show_overlay = self.config.show_overlay
|
|
235
|
+
|
|
152
236
|
# Call snapshot with options object (matches TypeScript API)
|
|
153
237
|
snap = snapshot(self.browser, snap_opts)
|
|
154
238
|
|
|
155
239
|
if snap.status != "success":
|
|
156
240
|
raise RuntimeError(f"Snapshot failed: {snap.error}")
|
|
157
241
|
|
|
242
|
+
# Compute diff_status by comparing with previous snapshot
|
|
243
|
+
elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)
|
|
244
|
+
|
|
245
|
+
# Create snapshot with diff_status populated
|
|
246
|
+
snap_with_diff = Snapshot(
|
|
247
|
+
status=snap.status,
|
|
248
|
+
timestamp=snap.timestamp,
|
|
249
|
+
url=snap.url,
|
|
250
|
+
viewport=snap.viewport,
|
|
251
|
+
elements=elements_with_diff,
|
|
252
|
+
screenshot=snap.screenshot,
|
|
253
|
+
screenshot_format=snap.screenshot_format,
|
|
254
|
+
error=snap.error,
|
|
255
|
+
)
|
|
256
|
+
|
|
257
|
+
# Update previous snapshot for next comparison
|
|
258
|
+
self._previous_snapshot = snap
|
|
259
|
+
|
|
158
260
|
# Apply element filtering based on goal
|
|
159
|
-
filtered_elements = self.filter_elements(
|
|
261
|
+
filtered_elements = self.filter_elements(snap_with_diff, goal)
|
|
160
262
|
|
|
161
263
|
# Emit snapshot trace event if tracer is enabled
|
|
162
264
|
if self.tracer:
|
|
163
|
-
#
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
265
|
+
# Build snapshot event data (use snap_with_diff to include diff_status)
|
|
266
|
+
snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
|
|
267
|
+
|
|
268
|
+
# Always include screenshot in trace event for studio viewer compatibility
|
|
269
|
+
# CloudTraceSink will extract and upload screenshots separately, then remove
|
|
270
|
+
# screenshot_base64 from events before uploading the trace file.
|
|
271
|
+
if snap.screenshot:
|
|
272
|
+
# Extract base64 string from data URL if needed
|
|
273
|
+
if snap.screenshot.startswith("data:image"):
|
|
274
|
+
# Format: "data:image/jpeg;base64,{base64_string}"
|
|
275
|
+
screenshot_base64 = (
|
|
276
|
+
snap.screenshot.split(",", 1)[1]
|
|
277
|
+
if "," in snap.screenshot
|
|
278
|
+
else snap.screenshot
|
|
279
|
+
)
|
|
280
|
+
else:
|
|
281
|
+
screenshot_base64 = snap.screenshot
|
|
282
|
+
|
|
283
|
+
snapshot_data["screenshot_base64"] = screenshot_base64
|
|
284
|
+
if snap.screenshot_format:
|
|
285
|
+
snapshot_data["screenshot_format"] = snap.screenshot_format
|
|
286
|
+
|
|
287
|
+
_safe_tracer_call(
|
|
288
|
+
self.tracer,
|
|
289
|
+
"emit",
|
|
290
|
+
self.verbose,
|
|
181
291
|
"snapshot",
|
|
182
|
-
|
|
183
|
-
"url": snap.url,
|
|
184
|
-
"element_count": len(snap.elements),
|
|
185
|
-
"timestamp": snap.timestamp,
|
|
186
|
-
"elements": elements_data, # Add element data for overlay
|
|
187
|
-
},
|
|
292
|
+
snapshot_data,
|
|
188
293
|
step_id=step_id,
|
|
189
294
|
)
|
|
190
295
|
|
|
191
|
-
# Create filtered snapshot
|
|
296
|
+
# Create filtered snapshot (use snap_with_diff to preserve metadata)
|
|
192
297
|
filtered_snap = Snapshot(
|
|
193
|
-
status=
|
|
194
|
-
timestamp=
|
|
195
|
-
url=
|
|
196
|
-
viewport=
|
|
298
|
+
status=snap_with_diff.status,
|
|
299
|
+
timestamp=snap_with_diff.timestamp,
|
|
300
|
+
url=snap_with_diff.url,
|
|
301
|
+
viewport=snap_with_diff.viewport,
|
|
197
302
|
elements=filtered_elements,
|
|
198
|
-
screenshot=
|
|
199
|
-
screenshot_format=
|
|
200
|
-
error=
|
|
303
|
+
screenshot=snap_with_diff.screenshot,
|
|
304
|
+
screenshot_format=snap_with_diff.screenshot_format,
|
|
305
|
+
error=snap_with_diff.error,
|
|
201
306
|
)
|
|
202
307
|
|
|
203
308
|
# 2. GROUND: Format elements for LLM context
|
|
204
|
-
context = self.
|
|
309
|
+
context = self.llm_handler.build_context(filtered_snap, goal)
|
|
205
310
|
|
|
206
311
|
# 3. THINK: Query LLM for next action
|
|
207
|
-
llm_response = self.
|
|
312
|
+
llm_response = self.llm_handler.query_llm(context, goal)
|
|
208
313
|
|
|
209
314
|
# Emit LLM query trace event if tracer is enabled
|
|
210
315
|
if self.tracer:
|
|
211
|
-
|
|
316
|
+
_safe_tracer_call(
|
|
317
|
+
self.tracer,
|
|
318
|
+
"emit",
|
|
319
|
+
self.verbose,
|
|
212
320
|
"llm_query",
|
|
213
321
|
{
|
|
214
322
|
"prompt_tokens": llm_response.prompt_tokens,
|
|
@@ -226,10 +334,10 @@ class SentienceAgent(BaseAgent):
|
|
|
226
334
|
self._track_tokens(goal, llm_response)
|
|
227
335
|
|
|
228
336
|
# Parse action from LLM response
|
|
229
|
-
action_str = self.
|
|
337
|
+
action_str = self.llm_handler.extract_action(llm_response.content)
|
|
230
338
|
|
|
231
339
|
# 4. EXECUTE: Parse and run action
|
|
232
|
-
result_dict = self.
|
|
340
|
+
result_dict = self.action_executor.execute(action_str, filtered_snap)
|
|
233
341
|
|
|
234
342
|
duration_ms = int((time.time() - start_time) * 1000)
|
|
235
343
|
|
|
@@ -247,6 +355,7 @@ class SentienceAgent(BaseAgent):
|
|
|
247
355
|
url_changed=result_dict.get("url_changed"),
|
|
248
356
|
error=result_dict.get("error"),
|
|
249
357
|
message=result_dict.get("message"),
|
|
358
|
+
cursor=result_dict.get("cursor"),
|
|
250
359
|
)
|
|
251
360
|
|
|
252
361
|
# Emit action execution trace event if tracer is enabled
|
|
@@ -269,7 +378,10 @@ class SentienceAgent(BaseAgent):
|
|
|
269
378
|
for el in filtered_snap.elements[:50]
|
|
270
379
|
]
|
|
271
380
|
|
|
272
|
-
|
|
381
|
+
_safe_tracer_call(
|
|
382
|
+
self.tracer,
|
|
383
|
+
"emit",
|
|
384
|
+
self.verbose,
|
|
273
385
|
"action",
|
|
274
386
|
{
|
|
275
387
|
"action": result.action,
|
|
@@ -280,6 +392,7 @@ class SentienceAgent(BaseAgent):
|
|
|
280
392
|
"post_url": post_url,
|
|
281
393
|
"elements": elements_data, # Add element data for overlay
|
|
282
394
|
"target_element_id": result.element_id, # Highlight target in red
|
|
395
|
+
"cursor": result.cursor,
|
|
283
396
|
},
|
|
284
397
|
step_id=step_id,
|
|
285
398
|
)
|
|
@@ -302,13 +415,107 @@ class SentienceAgent(BaseAgent):
|
|
|
302
415
|
|
|
303
416
|
# Emit step completion trace event if tracer is enabled
|
|
304
417
|
if self.tracer:
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
418
|
+
# Get pre_url from step_start (stored in tracer or use current)
|
|
419
|
+
pre_url = snap.url
|
|
420
|
+
post_url = self.browser.page.url if self.browser.page else None
|
|
421
|
+
|
|
422
|
+
# Compute snapshot digest (simplified - use URL + timestamp)
|
|
423
|
+
snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"
|
|
424
|
+
|
|
425
|
+
# Build LLM data
|
|
426
|
+
llm_response_text = llm_response.content
|
|
427
|
+
llm_response_hash = f"sha256:{self._compute_hash(llm_response_text)}"
|
|
428
|
+
llm_data = {
|
|
429
|
+
"response_text": llm_response_text,
|
|
430
|
+
"response_hash": llm_response_hash,
|
|
431
|
+
"usage": {
|
|
432
|
+
"prompt_tokens": llm_response.prompt_tokens or 0,
|
|
433
|
+
"completion_tokens": llm_response.completion_tokens or 0,
|
|
434
|
+
"total_tokens": llm_response.total_tokens or 0,
|
|
311
435
|
},
|
|
436
|
+
}
|
|
437
|
+
|
|
438
|
+
# Build exec data
|
|
439
|
+
exec_data = {
|
|
440
|
+
"success": result.success,
|
|
441
|
+
"action": result.action,
|
|
442
|
+
"outcome": result.outcome
|
|
443
|
+
or (
|
|
444
|
+
f"Action {result.action} executed successfully"
|
|
445
|
+
if result.success
|
|
446
|
+
else f"Action {result.action} failed"
|
|
447
|
+
),
|
|
448
|
+
"duration_ms": duration_ms,
|
|
449
|
+
}
|
|
450
|
+
if result.cursor is not None:
|
|
451
|
+
exec_data["cursor"] = result.cursor
|
|
452
|
+
|
|
453
|
+
# Add optional exec fields
|
|
454
|
+
if result.element_id is not None:
|
|
455
|
+
exec_data["element_id"] = result.element_id
|
|
456
|
+
# Add bounding box if element found
|
|
457
|
+
bbox = self._get_element_bbox(result.element_id, snap)
|
|
458
|
+
if bbox:
|
|
459
|
+
exec_data["bounding_box"] = bbox
|
|
460
|
+
if result.text is not None:
|
|
461
|
+
exec_data["text"] = result.text
|
|
462
|
+
if result.key is not None:
|
|
463
|
+
exec_data["key"] = result.key
|
|
464
|
+
if result.error is not None:
|
|
465
|
+
exec_data["error"] = result.error
|
|
466
|
+
|
|
467
|
+
# Build verify data (simplified - based on success and url_changed)
|
|
468
|
+
verify_passed = result.success and (
|
|
469
|
+
result.url_changed or result.action != "click"
|
|
470
|
+
)
|
|
471
|
+
verify_signals = {
|
|
472
|
+
"url_changed": result.url_changed or False,
|
|
473
|
+
}
|
|
474
|
+
if result.error:
|
|
475
|
+
verify_signals["error"] = result.error
|
|
476
|
+
|
|
477
|
+
# Add elements_found array if element was targeted
|
|
478
|
+
if result.element_id is not None:
|
|
479
|
+
bbox = self._get_element_bbox(result.element_id, snap)
|
|
480
|
+
if bbox:
|
|
481
|
+
verify_signals["elements_found"] = [
|
|
482
|
+
{
|
|
483
|
+
"label": f"Element {result.element_id}",
|
|
484
|
+
"bounding_box": bbox,
|
|
485
|
+
}
|
|
486
|
+
]
|
|
487
|
+
|
|
488
|
+
verify_data = {
|
|
489
|
+
"passed": verify_passed,
|
|
490
|
+
"signals": verify_signals,
|
|
491
|
+
}
|
|
492
|
+
|
|
493
|
+
# Build elements data for pre field (include diff_status from snap_with_diff)
|
|
494
|
+
# Use the same format as build_snapshot_event for consistency
|
|
495
|
+
snapshot_event_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
|
|
496
|
+
pre_elements = snapshot_event_data.get("elements", [])
|
|
497
|
+
|
|
498
|
+
# Build complete step_end event
|
|
499
|
+
step_end_data = TraceEventBuilder.build_step_end_event(
|
|
500
|
+
step_id=step_id,
|
|
501
|
+
step_index=self._step_count,
|
|
502
|
+
goal=goal,
|
|
503
|
+
attempt=attempt,
|
|
504
|
+
pre_url=pre_url,
|
|
505
|
+
post_url=post_url,
|
|
506
|
+
snapshot_digest=snapshot_digest,
|
|
507
|
+
llm_data=llm_data,
|
|
508
|
+
exec_data=exec_data,
|
|
509
|
+
verify_data=verify_data,
|
|
510
|
+
pre_elements=pre_elements,
|
|
511
|
+
)
|
|
512
|
+
|
|
513
|
+
_safe_tracer_call(
|
|
514
|
+
self.tracer,
|
|
515
|
+
"emit",
|
|
516
|
+
self.verbose,
|
|
517
|
+
"step_end",
|
|
518
|
+
step_end_data,
|
|
312
519
|
step_id=step_id,
|
|
313
520
|
)
|
|
314
521
|
|
|
@@ -317,7 +524,14 @@ class SentienceAgent(BaseAgent):
|
|
|
317
524
|
except Exception as e:
|
|
318
525
|
# Emit error trace event if tracer is enabled
|
|
319
526
|
if self.tracer:
|
|
320
|
-
|
|
527
|
+
_safe_tracer_call(
|
|
528
|
+
self.tracer,
|
|
529
|
+
"emit_error",
|
|
530
|
+
self.verbose,
|
|
531
|
+
step_id=step_id,
|
|
532
|
+
error=str(e),
|
|
533
|
+
attempt=attempt,
|
|
534
|
+
)
|
|
321
535
|
|
|
322
536
|
if attempt < max_retries:
|
|
323
537
|
if self.verbose:
|
|
@@ -346,195 +560,573 @@ class SentienceAgent(BaseAgent):
|
|
|
346
560
|
)
|
|
347
561
|
raise RuntimeError(f"Failed after {max_retries} retries: {e}")
|
|
348
562
|
|
|
349
|
-
def
|
|
563
|
+
def _track_tokens(self, goal: str, llm_response: LLMResponse):
|
|
350
564
|
"""
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
Format: [ID] <role> "text" {cues} @ (x,y) (Imp:score)
|
|
565
|
+
Track token usage for analytics
|
|
354
566
|
|
|
355
567
|
Args:
|
|
356
|
-
|
|
357
|
-
|
|
568
|
+
goal: User goal
|
|
569
|
+
llm_response: LLM response with token usage
|
|
570
|
+
"""
|
|
571
|
+
if llm_response.prompt_tokens:
|
|
572
|
+
self._token_usage_raw["total_prompt_tokens"] += llm_response.prompt_tokens
|
|
573
|
+
if llm_response.completion_tokens:
|
|
574
|
+
self._token_usage_raw["total_completion_tokens"] += llm_response.completion_tokens
|
|
575
|
+
if llm_response.total_tokens:
|
|
576
|
+
self._token_usage_raw["total_tokens"] += llm_response.total_tokens
|
|
577
|
+
|
|
578
|
+
self._token_usage_raw["by_action"].append(
|
|
579
|
+
{
|
|
580
|
+
"goal": goal,
|
|
581
|
+
"prompt_tokens": llm_response.prompt_tokens or 0,
|
|
582
|
+
"completion_tokens": llm_response.completion_tokens or 0,
|
|
583
|
+
"total_tokens": llm_response.total_tokens or 0,
|
|
584
|
+
"model": llm_response.model_name,
|
|
585
|
+
}
|
|
586
|
+
)
|
|
587
|
+
|
|
588
|
+
def get_token_stats(self) -> TokenStats:
|
|
589
|
+
"""
|
|
590
|
+
Get token usage statistics
|
|
358
591
|
|
|
359
592
|
Returns:
|
|
360
|
-
|
|
593
|
+
TokenStats with token usage breakdown
|
|
361
594
|
"""
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
if el.visual_cues.is_clickable:
|
|
370
|
-
cues.append("CLICKABLE")
|
|
371
|
-
if el.visual_cues.background_color_name:
|
|
372
|
-
cues.append(f"color:{el.visual_cues.background_color_name}")
|
|
373
|
-
|
|
374
|
-
# Format element line
|
|
375
|
-
cues_str = f" {{{','.join(cues)}}}" if cues else ""
|
|
376
|
-
text_preview = (
|
|
377
|
-
(el.text[:50] + "...") if el.text and len(el.text) > 50 else (el.text or "")
|
|
378
|
-
)
|
|
595
|
+
by_action = [ActionTokenUsage(**action) for action in self._token_usage_raw["by_action"]]
|
|
596
|
+
return TokenStats(
|
|
597
|
+
total_prompt_tokens=self._token_usage_raw["total_prompt_tokens"],
|
|
598
|
+
total_completion_tokens=self._token_usage_raw["total_completion_tokens"],
|
|
599
|
+
total_tokens=self._token_usage_raw["total_tokens"],
|
|
600
|
+
by_action=by_action,
|
|
601
|
+
)
|
|
379
602
|
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
603
|
+
def get_history(self) -> list[ActionHistory]:
|
|
604
|
+
"""
|
|
605
|
+
Get execution history
|
|
606
|
+
|
|
607
|
+
Returns:
|
|
608
|
+
List of ActionHistory entries
|
|
609
|
+
"""
|
|
610
|
+
return [ActionHistory(**h) for h in self.history]
|
|
384
611
|
|
|
385
|
-
|
|
612
|
+
def clear_history(self) -> None:
|
|
613
|
+
"""Clear execution history and reset token counters"""
|
|
614
|
+
self.history.clear()
|
|
615
|
+
self._token_usage_raw = {
|
|
616
|
+
"total_prompt_tokens": 0,
|
|
617
|
+
"total_completion_tokens": 0,
|
|
618
|
+
"total_tokens": 0,
|
|
619
|
+
"by_action": [],
|
|
620
|
+
}
|
|
386
621
|
|
|
387
|
-
def
|
|
622
|
+
def filter_elements(self, snapshot: Snapshot, goal: str | None = None) -> list[Element]:
|
|
388
623
|
"""
|
|
389
|
-
|
|
390
|
-
|
|
624
|
+
Filter elements from snapshot based on goal context.
|
|
625
|
+
|
|
626
|
+
This implementation uses ElementFilter to apply goal-based keyword matching
|
|
627
|
+
to boost relevant elements and filters out irrelevant ones.
|
|
391
628
|
|
|
392
629
|
Args:
|
|
393
|
-
|
|
630
|
+
snapshot: Current page snapshot
|
|
631
|
+
goal: User's goal (can inform filtering)
|
|
394
632
|
|
|
395
633
|
Returns:
|
|
396
|
-
|
|
634
|
+
Filtered list of elements
|
|
397
635
|
"""
|
|
398
|
-
|
|
636
|
+
return ElementFilter.filter_by_goal(snapshot, goal, self.default_snapshot_limit)
|
|
399
637
|
|
|
400
|
-
# Remove markdown code blocks if present
|
|
401
|
-
response = re.sub(r"```[\w]*\n?", "", response)
|
|
402
|
-
response = response.strip()
|
|
403
638
|
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
639
|
+
class SentienceAgentAsync(BaseAgentAsync):
|
|
640
|
+
"""
|
|
641
|
+
High-level async agent that combines Sentience SDK with any LLM provider.
|
|
407
642
|
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
643
|
+
Uses observe-think-act loop to execute natural language commands:
|
|
644
|
+
1. OBSERVE: Get snapshot of current page state
|
|
645
|
+
2. THINK: Query LLM to decide next action
|
|
646
|
+
3. ACT: Execute action using SDK
|
|
411
647
|
|
|
412
|
-
|
|
413
|
-
|
|
648
|
+
Example:
|
|
649
|
+
>>> from sentience.async_api import AsyncSentienceBrowser
|
|
650
|
+
>>> from sentience.agent import SentienceAgentAsync
|
|
651
|
+
>>> from sentience.llm_provider import OpenAIProvider
|
|
652
|
+
>>>
|
|
653
|
+
>>> async with AsyncSentienceBrowser() as browser:
|
|
654
|
+
>>> await browser.goto("https://google.com")
|
|
655
|
+
>>> llm = OpenAIProvider(api_key="openai_key", model="gpt-4o")
|
|
656
|
+
>>> agent = SentienceAgentAsync(browser, llm)
|
|
657
|
+
>>> await agent.act("Click the search box")
|
|
658
|
+
>>> await agent.act("Type 'magic mouse' into the search field")
|
|
659
|
+
>>> await agent.act("Press Enter key")
|
|
660
|
+
"""
|
|
414
661
|
|
|
415
|
-
def
|
|
662
|
+
def __init__(
|
|
663
|
+
self,
|
|
664
|
+
browser: AsyncSentienceBrowser,
|
|
665
|
+
llm: LLMProvider,
|
|
666
|
+
default_snapshot_limit: int = 50,
|
|
667
|
+
verbose: bool = True,
|
|
668
|
+
tracer: Optional["Tracer"] = None,
|
|
669
|
+
config: Optional["AgentConfig"] = None,
|
|
670
|
+
):
|
|
416
671
|
"""
|
|
417
|
-
|
|
672
|
+
Initialize Sentience Agent (async)
|
|
418
673
|
|
|
419
674
|
Args:
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
675
|
+
browser: AsyncSentienceBrowser instance
|
|
676
|
+
llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.)
|
|
677
|
+
default_snapshot_limit: Default maximum elements to include in context (default: 50)
|
|
678
|
+
verbose: Print execution logs (default: True)
|
|
679
|
+
tracer: Optional Tracer instance for execution tracking (default: None)
|
|
680
|
+
config: Optional AgentConfig for advanced configuration (default: None)
|
|
425
681
|
"""
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
VISUAL CUES EXPLAINED:
|
|
434
|
-
- {{PRIMARY}}: Main call-to-action element on the page
|
|
435
|
-
- {{CLICKABLE}}: Element is clickable
|
|
436
|
-
- {{color:X}}: Background color name
|
|
437
|
-
|
|
438
|
-
CRITICAL RESPONSE FORMAT:
|
|
439
|
-
You MUST respond with ONLY ONE of these exact action formats:
|
|
440
|
-
- CLICK(id) - Click element by ID
|
|
441
|
-
- TYPE(id, "text") - Type text into element
|
|
442
|
-
- PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
|
|
443
|
-
- FINISH() - Task complete
|
|
444
|
-
|
|
445
|
-
DO NOT include any explanation, reasoning, or natural language.
|
|
446
|
-
DO NOT use markdown formatting or code blocks.
|
|
447
|
-
DO NOT say "The next step is..." or anything similar.
|
|
448
|
-
|
|
449
|
-
CORRECT Examples:
|
|
450
|
-
CLICK(42)
|
|
451
|
-
TYPE(15, "magic mouse")
|
|
452
|
-
PRESS("Enter")
|
|
453
|
-
FINISH()
|
|
454
|
-
|
|
455
|
-
INCORRECT Examples (DO NOT DO THIS):
|
|
456
|
-
"The next step is to click..."
|
|
457
|
-
"I will type..."
|
|
458
|
-
```CLICK(42)```
|
|
459
|
-
"""
|
|
682
|
+
self.browser = browser
|
|
683
|
+
self.llm = llm
|
|
684
|
+
self.default_snapshot_limit = default_snapshot_limit
|
|
685
|
+
self.verbose = verbose
|
|
686
|
+
self.tracer = tracer
|
|
687
|
+
self.config = config or AgentConfig()
|
|
460
688
|
|
|
461
|
-
|
|
689
|
+
# Initialize handlers
|
|
690
|
+
self.llm_handler = LLMInteractionHandler(llm)
|
|
691
|
+
self.action_executor = ActionExecutor(browser)
|
|
462
692
|
|
|
463
|
-
|
|
693
|
+
# Screenshot sequence counter
|
|
694
|
+
# Execution history
|
|
695
|
+
self.history: list[dict[str, Any]] = []
|
|
464
696
|
|
|
465
|
-
|
|
697
|
+
# Token usage tracking (will be converted to TokenStats on get_token_stats())
|
|
698
|
+
self._token_usage_raw = {
|
|
699
|
+
"total_prompt_tokens": 0,
|
|
700
|
+
"total_completion_tokens": 0,
|
|
701
|
+
"total_tokens": 0,
|
|
702
|
+
"by_action": [],
|
|
703
|
+
}
|
|
704
|
+
|
|
705
|
+
# Step counter for tracing
|
|
706
|
+
self._step_count = 0
|
|
707
|
+
|
|
708
|
+
# Previous snapshot for diff detection
|
|
709
|
+
self._previous_snapshot: Snapshot | None = None
|
|
710
|
+
|
|
711
|
+
def _compute_hash(self, text: str) -> str:
|
|
712
|
+
"""Compute SHA256 hash of text."""
|
|
713
|
+
return hashlib.sha256(text.encode("utf-8")).hexdigest()
|
|
714
|
+
|
|
715
|
+
def _get_element_bbox(self, element_id: int | None, snap: Snapshot) -> dict[str, float] | None:
|
|
716
|
+
"""Get bounding box for an element from snapshot."""
|
|
717
|
+
if element_id is None:
|
|
718
|
+
return None
|
|
719
|
+
for el in snap.elements:
|
|
720
|
+
if el.id == element_id:
|
|
721
|
+
return {
|
|
722
|
+
"x": el.bbox.x,
|
|
723
|
+
"y": el.bbox.y,
|
|
724
|
+
"width": el.bbox.width,
|
|
725
|
+
"height": el.bbox.height,
|
|
726
|
+
}
|
|
727
|
+
return None
|
|
728
|
+
|
|
729
|
+
async def act( # noqa: C901
|
|
730
|
+
self,
|
|
731
|
+
goal: str,
|
|
732
|
+
max_retries: int = 2,
|
|
733
|
+
snapshot_options: SnapshotOptions | None = None,
|
|
734
|
+
) -> AgentActionResult:
|
|
466
735
|
"""
|
|
467
|
-
|
|
736
|
+
Execute a high-level goal using observe → think → act loop (async)
|
|
468
737
|
|
|
469
738
|
Args:
|
|
470
|
-
|
|
471
|
-
|
|
739
|
+
goal: Natural language instruction (e.g., "Click the Sign In button")
|
|
740
|
+
max_retries: Number of retries on failure (default: 2)
|
|
741
|
+
snapshot_options: Optional SnapshotOptions for this specific action
|
|
472
742
|
|
|
473
743
|
Returns:
|
|
474
|
-
|
|
744
|
+
AgentActionResult with execution details
|
|
745
|
+
|
|
746
|
+
Example:
|
|
747
|
+
>>> result = await agent.act("Click the search box")
|
|
748
|
+
>>> print(result.success, result.action, result.element_id)
|
|
749
|
+
True click 42
|
|
475
750
|
"""
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
return {
|
|
481
|
-
"success": result.success,
|
|
482
|
-
"action": "click",
|
|
483
|
-
"element_id": element_id,
|
|
484
|
-
"outcome": result.outcome,
|
|
485
|
-
"url_changed": result.url_changed,
|
|
486
|
-
}
|
|
751
|
+
if self.verbose:
|
|
752
|
+
print(f"\n{'=' * 70}")
|
|
753
|
+
print(f"🤖 Agent Goal: {goal}")
|
|
754
|
+
print(f"{'=' * 70}")
|
|
487
755
|
|
|
488
|
-
#
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
action_str,
|
|
492
|
-
re.IGNORECASE,
|
|
493
|
-
):
|
|
494
|
-
element_id = int(match.group(1))
|
|
495
|
-
text = match.group(2)
|
|
496
|
-
result = type_text(self.browser, element_id, text)
|
|
497
|
-
return {
|
|
498
|
-
"success": result.success,
|
|
499
|
-
"action": "type",
|
|
500
|
-
"element_id": element_id,
|
|
501
|
-
"text": text,
|
|
502
|
-
"outcome": result.outcome,
|
|
503
|
-
}
|
|
756
|
+
# Generate step ID for tracing
|
|
757
|
+
self._step_count += 1
|
|
758
|
+
step_id = f"step-{self._step_count}"
|
|
504
759
|
|
|
505
|
-
#
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
"
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
760
|
+
# Emit step_start trace event if tracer is enabled
|
|
761
|
+
if self.tracer:
|
|
762
|
+
pre_url = self.browser.page.url if self.browser.page else None
|
|
763
|
+
_safe_tracer_call(
|
|
764
|
+
self.tracer,
|
|
765
|
+
"emit_step_start",
|
|
766
|
+
self.verbose,
|
|
767
|
+
step_id=step_id,
|
|
768
|
+
step_index=self._step_count,
|
|
769
|
+
goal=goal,
|
|
770
|
+
attempt=0,
|
|
771
|
+
pre_url=pre_url,
|
|
772
|
+
)
|
|
515
773
|
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
"action": "finish",
|
|
521
|
-
"message": "Task marked as complete",
|
|
522
|
-
}
|
|
774
|
+
for attempt in range(max_retries + 1):
|
|
775
|
+
try:
|
|
776
|
+
# 1. OBSERVE: Get refined semantic snapshot
|
|
777
|
+
start_time = time.time()
|
|
523
778
|
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
779
|
+
# Use provided options or create default
|
|
780
|
+
snap_opts = snapshot_options or SnapshotOptions(limit=self.default_snapshot_limit)
|
|
781
|
+
# Only set goal if not already provided
|
|
782
|
+
if snap_opts.goal is None:
|
|
783
|
+
snap_opts.goal = goal
|
|
529
784
|
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
785
|
+
# Apply AgentConfig screenshot settings if not overridden by snapshot_options
|
|
786
|
+
# Only apply if snapshot_options wasn't provided OR if screenshot wasn't explicitly set
|
|
787
|
+
# (snapshot_options.screenshot defaults to False, so we check if it's still False)
|
|
788
|
+
if self.config and (snapshot_options is None or snap_opts.screenshot is False):
|
|
789
|
+
if self.config.capture_screenshots:
|
|
790
|
+
# Create ScreenshotConfig from AgentConfig
|
|
791
|
+
snap_opts.screenshot = ScreenshotConfig(
|
|
792
|
+
format=self.config.screenshot_format,
|
|
793
|
+
quality=(
|
|
794
|
+
self.config.screenshot_quality
|
|
795
|
+
if self.config.screenshot_format == "jpeg"
|
|
796
|
+
else None
|
|
797
|
+
),
|
|
798
|
+
)
|
|
799
|
+
else:
|
|
800
|
+
snap_opts.screenshot = False
|
|
801
|
+
# Apply show_overlay from AgentConfig
|
|
802
|
+
# Note: User can override by explicitly passing show_overlay in snapshot_options
|
|
803
|
+
snap_opts.show_overlay = self.config.show_overlay
|
|
533
804
|
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
805
|
+
# Call snapshot with options object (matches TypeScript API)
|
|
806
|
+
snap = await snapshot_async(self.browser, snap_opts)
|
|
807
|
+
|
|
808
|
+
if snap.status != "success":
|
|
809
|
+
raise RuntimeError(f"Snapshot failed: {snap.error}")
|
|
810
|
+
|
|
811
|
+
# Compute diff_status by comparing with previous snapshot
|
|
812
|
+
elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)
|
|
813
|
+
|
|
814
|
+
# Create snapshot with diff_status populated
|
|
815
|
+
snap_with_diff = Snapshot(
|
|
816
|
+
status=snap.status,
|
|
817
|
+
timestamp=snap.timestamp,
|
|
818
|
+
url=snap.url,
|
|
819
|
+
viewport=snap.viewport,
|
|
820
|
+
elements=elements_with_diff,
|
|
821
|
+
screenshot=snap.screenshot,
|
|
822
|
+
screenshot_format=snap.screenshot_format,
|
|
823
|
+
error=snap.error,
|
|
824
|
+
)
|
|
825
|
+
|
|
826
|
+
# Update previous snapshot for next comparison
|
|
827
|
+
self._previous_snapshot = snap
|
|
828
|
+
|
|
829
|
+
# Apply element filtering based on goal
|
|
830
|
+
filtered_elements = self.filter_elements(snap_with_diff, goal)
|
|
831
|
+
|
|
832
|
+
# Emit snapshot trace event if tracer is enabled
|
|
833
|
+
if self.tracer:
|
|
834
|
+
# Build snapshot event data (use snap_with_diff to include diff_status)
|
|
835
|
+
snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
|
|
836
|
+
|
|
837
|
+
# Always include screenshot in trace event for studio viewer compatibility
|
|
838
|
+
# CloudTraceSink will extract and upload screenshots separately, then remove
|
|
839
|
+
# screenshot_base64 from events before uploading the trace file.
|
|
840
|
+
if snap.screenshot:
|
|
841
|
+
# Extract base64 string from data URL if needed
|
|
842
|
+
if snap.screenshot.startswith("data:image"):
|
|
843
|
+
# Format: "data:image/jpeg;base64,{base64_string}"
|
|
844
|
+
screenshot_base64 = (
|
|
845
|
+
snap.screenshot.split(",", 1)[1]
|
|
846
|
+
if "," in snap.screenshot
|
|
847
|
+
else snap.screenshot
|
|
848
|
+
)
|
|
849
|
+
else:
|
|
850
|
+
screenshot_base64 = snap.screenshot
|
|
851
|
+
|
|
852
|
+
snapshot_data["screenshot_base64"] = screenshot_base64
|
|
853
|
+
if snap.screenshot_format:
|
|
854
|
+
snapshot_data["screenshot_format"] = snap.screenshot_format
|
|
855
|
+
|
|
856
|
+
_safe_tracer_call(
|
|
857
|
+
self.tracer,
|
|
858
|
+
"emit",
|
|
859
|
+
self.verbose,
|
|
860
|
+
"snapshot",
|
|
861
|
+
snapshot_data,
|
|
862
|
+
step_id=step_id,
|
|
863
|
+
)
|
|
864
|
+
|
|
865
|
+
# Create filtered snapshot (use snap_with_diff to preserve metadata)
|
|
866
|
+
filtered_snap = Snapshot(
|
|
867
|
+
status=snap_with_diff.status,
|
|
868
|
+
timestamp=snap_with_diff.timestamp,
|
|
869
|
+
url=snap_with_diff.url,
|
|
870
|
+
viewport=snap_with_diff.viewport,
|
|
871
|
+
elements=filtered_elements,
|
|
872
|
+
screenshot=snap_with_diff.screenshot,
|
|
873
|
+
screenshot_format=snap_with_diff.screenshot_format,
|
|
874
|
+
error=snap_with_diff.error,
|
|
875
|
+
)
|
|
876
|
+
|
|
877
|
+
# 2. GROUND: Format elements for LLM context
|
|
878
|
+
context = self.llm_handler.build_context(filtered_snap, goal)
|
|
879
|
+
|
|
880
|
+
# 3. THINK: Query LLM for next action
|
|
881
|
+
llm_response = self.llm_handler.query_llm(context, goal)
|
|
882
|
+
|
|
883
|
+
# Emit LLM query trace event if tracer is enabled
|
|
884
|
+
if self.tracer:
|
|
885
|
+
_safe_tracer_call(
|
|
886
|
+
self.tracer,
|
|
887
|
+
"emit",
|
|
888
|
+
self.verbose,
|
|
889
|
+
"llm_query",
|
|
890
|
+
{
|
|
891
|
+
"prompt_tokens": llm_response.prompt_tokens,
|
|
892
|
+
"completion_tokens": llm_response.completion_tokens,
|
|
893
|
+
"model": llm_response.model_name,
|
|
894
|
+
"response": llm_response.content[:200], # Truncate for brevity
|
|
895
|
+
},
|
|
896
|
+
step_id=step_id,
|
|
897
|
+
)
|
|
898
|
+
|
|
899
|
+
if self.verbose:
|
|
900
|
+
print(f"🧠 LLM Decision: {llm_response.content}")
|
|
901
|
+
|
|
902
|
+
# Track token usage
|
|
903
|
+
self._track_tokens(goal, llm_response)
|
|
904
|
+
|
|
905
|
+
# Parse action from LLM response
|
|
906
|
+
action_str = self.llm_handler.extract_action(llm_response.content)
|
|
907
|
+
|
|
908
|
+
# 4. EXECUTE: Parse and run action
|
|
909
|
+
result_dict = await self.action_executor.execute_async(action_str, filtered_snap)
|
|
910
|
+
|
|
911
|
+
duration_ms = int((time.time() - start_time) * 1000)
|
|
912
|
+
|
|
913
|
+
# Create AgentActionResult from execution result
|
|
914
|
+
result = AgentActionResult(
|
|
915
|
+
success=result_dict["success"],
|
|
916
|
+
action=result_dict["action"],
|
|
917
|
+
goal=goal,
|
|
918
|
+
duration_ms=duration_ms,
|
|
919
|
+
attempt=attempt,
|
|
920
|
+
element_id=result_dict.get("element_id"),
|
|
921
|
+
text=result_dict.get("text"),
|
|
922
|
+
key=result_dict.get("key"),
|
|
923
|
+
outcome=result_dict.get("outcome"),
|
|
924
|
+
url_changed=result_dict.get("url_changed"),
|
|
925
|
+
error=result_dict.get("error"),
|
|
926
|
+
message=result_dict.get("message"),
|
|
927
|
+
)
|
|
928
|
+
|
|
929
|
+
# Emit action execution trace event if tracer is enabled
|
|
930
|
+
if self.tracer:
|
|
931
|
+
post_url = self.browser.page.url if self.browser.page else None
|
|
932
|
+
|
|
933
|
+
# Include element data for live overlay visualization
|
|
934
|
+
elements_data = [
|
|
935
|
+
{
|
|
936
|
+
"id": el.id,
|
|
937
|
+
"bbox": {
|
|
938
|
+
"x": el.bbox.x,
|
|
939
|
+
"y": el.bbox.y,
|
|
940
|
+
"width": el.bbox.width,
|
|
941
|
+
"height": el.bbox.height,
|
|
942
|
+
},
|
|
943
|
+
"role": el.role,
|
|
944
|
+
"text": el.text[:50] if el.text else "",
|
|
945
|
+
}
|
|
946
|
+
for el in filtered_snap.elements[:50]
|
|
947
|
+
]
|
|
948
|
+
|
|
949
|
+
_safe_tracer_call(
|
|
950
|
+
self.tracer,
|
|
951
|
+
"emit",
|
|
952
|
+
self.verbose,
|
|
953
|
+
"action",
|
|
954
|
+
{
|
|
955
|
+
"action": result.action,
|
|
956
|
+
"element_id": result.element_id,
|
|
957
|
+
"success": result.success,
|
|
958
|
+
"outcome": result.outcome,
|
|
959
|
+
"duration_ms": duration_ms,
|
|
960
|
+
"post_url": post_url,
|
|
961
|
+
"elements": elements_data, # Add element data for overlay
|
|
962
|
+
"target_element_id": result.element_id, # Highlight target in red
|
|
963
|
+
},
|
|
964
|
+
step_id=step_id,
|
|
965
|
+
)
|
|
966
|
+
|
|
967
|
+
# 5. RECORD: Track history
|
|
968
|
+
self.history.append(
|
|
969
|
+
{
|
|
970
|
+
"goal": goal,
|
|
971
|
+
"action": action_str,
|
|
972
|
+
"result": result.model_dump(), # Store as dict
|
|
973
|
+
"success": result.success,
|
|
974
|
+
"attempt": attempt,
|
|
975
|
+
"duration_ms": duration_ms,
|
|
976
|
+
}
|
|
977
|
+
)
|
|
978
|
+
|
|
979
|
+
if self.verbose:
|
|
980
|
+
status = "✅" if result.success else "❌"
|
|
981
|
+
print(f"{status} Completed in {duration_ms}ms")
|
|
982
|
+
|
|
983
|
+
# Emit step completion trace event if tracer is enabled
|
|
984
|
+
if self.tracer:
|
|
985
|
+
# Get pre_url from step_start (stored in tracer or use current)
|
|
986
|
+
pre_url = snap.url
|
|
987
|
+
post_url = self.browser.page.url if self.browser.page else None
|
|
988
|
+
|
|
989
|
+
# Compute snapshot digest (simplified - use URL + timestamp)
|
|
990
|
+
snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"
|
|
991
|
+
|
|
992
|
+
# Build LLM data
|
|
993
|
+
llm_response_text = llm_response.content
|
|
994
|
+
llm_response_hash = f"sha256:{self._compute_hash(llm_response_text)}"
|
|
995
|
+
llm_data = {
|
|
996
|
+
"response_text": llm_response_text,
|
|
997
|
+
"response_hash": llm_response_hash,
|
|
998
|
+
"usage": {
|
|
999
|
+
"prompt_tokens": llm_response.prompt_tokens or 0,
|
|
1000
|
+
"completion_tokens": llm_response.completion_tokens or 0,
|
|
1001
|
+
"total_tokens": llm_response.total_tokens or 0,
|
|
1002
|
+
},
|
|
1003
|
+
}
|
|
1004
|
+
|
|
1005
|
+
# Build exec data
|
|
1006
|
+
exec_data = {
|
|
1007
|
+
"success": result.success,
|
|
1008
|
+
"action": result.action,
|
|
1009
|
+
"outcome": result.outcome
|
|
1010
|
+
or (
|
|
1011
|
+
f"Action {result.action} executed successfully"
|
|
1012
|
+
if result.success
|
|
1013
|
+
else f"Action {result.action} failed"
|
|
1014
|
+
),
|
|
1015
|
+
"duration_ms": duration_ms,
|
|
1016
|
+
}
|
|
1017
|
+
|
|
1018
|
+
# Add optional exec fields
|
|
1019
|
+
if result.element_id is not None:
|
|
1020
|
+
exec_data["element_id"] = result.element_id
|
|
1021
|
+
# Add bounding box if element found
|
|
1022
|
+
bbox = self._get_element_bbox(result.element_id, snap)
|
|
1023
|
+
if bbox:
|
|
1024
|
+
exec_data["bounding_box"] = bbox
|
|
1025
|
+
if result.text is not None:
|
|
1026
|
+
exec_data["text"] = result.text
|
|
1027
|
+
if result.key is not None:
|
|
1028
|
+
exec_data["key"] = result.key
|
|
1029
|
+
if result.error is not None:
|
|
1030
|
+
exec_data["error"] = result.error
|
|
1031
|
+
|
|
1032
|
+
# Build verify data (simplified - based on success and url_changed)
|
|
1033
|
+
verify_passed = result.success and (
|
|
1034
|
+
result.url_changed or result.action != "click"
|
|
1035
|
+
)
|
|
1036
|
+
verify_signals = {
|
|
1037
|
+
"url_changed": result.url_changed or False,
|
|
1038
|
+
}
|
|
1039
|
+
if result.error:
|
|
1040
|
+
verify_signals["error"] = result.error
|
|
1041
|
+
|
|
1042
|
+
# Add elements_found array if element was targeted
|
|
1043
|
+
if result.element_id is not None:
|
|
1044
|
+
bbox = self._get_element_bbox(result.element_id, snap)
|
|
1045
|
+
if bbox:
|
|
1046
|
+
verify_signals["elements_found"] = [
|
|
1047
|
+
{
|
|
1048
|
+
"label": f"Element {result.element_id}",
|
|
1049
|
+
"bounding_box": bbox,
|
|
1050
|
+
}
|
|
1051
|
+
]
|
|
1052
|
+
|
|
1053
|
+
verify_data = {
|
|
1054
|
+
"passed": verify_passed,
|
|
1055
|
+
"signals": verify_signals,
|
|
1056
|
+
}
|
|
1057
|
+
|
|
1058
|
+
# Build elements data for pre field (include diff_status from snap_with_diff)
|
|
1059
|
+
# Use the same format as build_snapshot_event for consistency
|
|
1060
|
+
snapshot_event_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
|
|
1061
|
+
pre_elements = snapshot_event_data.get("elements", [])
|
|
1062
|
+
|
|
1063
|
+
# Build complete step_end event
|
|
1064
|
+
step_end_data = TraceEventBuilder.build_step_end_event(
|
|
1065
|
+
step_id=step_id,
|
|
1066
|
+
step_index=self._step_count,
|
|
1067
|
+
goal=goal,
|
|
1068
|
+
attempt=attempt,
|
|
1069
|
+
pre_url=pre_url,
|
|
1070
|
+
post_url=post_url,
|
|
1071
|
+
snapshot_digest=snapshot_digest,
|
|
1072
|
+
llm_data=llm_data,
|
|
1073
|
+
exec_data=exec_data,
|
|
1074
|
+
verify_data=verify_data,
|
|
1075
|
+
pre_elements=pre_elements,
|
|
1076
|
+
)
|
|
1077
|
+
|
|
1078
|
+
_safe_tracer_call(
|
|
1079
|
+
self.tracer,
|
|
1080
|
+
"emit",
|
|
1081
|
+
self.verbose,
|
|
1082
|
+
"step_end",
|
|
1083
|
+
step_end_data,
|
|
1084
|
+
step_id=step_id,
|
|
1085
|
+
)
|
|
1086
|
+
|
|
1087
|
+
return result
|
|
1088
|
+
|
|
1089
|
+
except Exception as e:
|
|
1090
|
+
# Emit error trace event if tracer is enabled
|
|
1091
|
+
if self.tracer:
|
|
1092
|
+
_safe_tracer_call(
|
|
1093
|
+
self.tracer,
|
|
1094
|
+
"emit_error",
|
|
1095
|
+
self.verbose,
|
|
1096
|
+
step_id=step_id,
|
|
1097
|
+
error=str(e),
|
|
1098
|
+
attempt=attempt,
|
|
1099
|
+
)
|
|
1100
|
+
|
|
1101
|
+
if attempt < max_retries:
|
|
1102
|
+
if self.verbose:
|
|
1103
|
+
print(f"⚠️ Retry {attempt + 1}/{max_retries}: {e}")
|
|
1104
|
+
await asyncio.sleep(1.0) # Brief delay before retry
|
|
1105
|
+
continue
|
|
1106
|
+
else:
|
|
1107
|
+
# Create error result
|
|
1108
|
+
error_result = AgentActionResult(
|
|
1109
|
+
success=False,
|
|
1110
|
+
action="error",
|
|
1111
|
+
goal=goal,
|
|
1112
|
+
duration_ms=0,
|
|
1113
|
+
attempt=attempt,
|
|
1114
|
+
error=str(e),
|
|
1115
|
+
)
|
|
1116
|
+
self.history.append(
|
|
1117
|
+
{
|
|
1118
|
+
"goal": goal,
|
|
1119
|
+
"action": "error",
|
|
1120
|
+
"result": error_result.model_dump(),
|
|
1121
|
+
"success": False,
|
|
1122
|
+
"attempt": attempt,
|
|
1123
|
+
"duration_ms": 0,
|
|
1124
|
+
}
|
|
1125
|
+
)
|
|
1126
|
+
raise RuntimeError(f"Failed after {max_retries} retries: {e}")
|
|
1127
|
+
|
|
1128
|
+
def _track_tokens(self, goal: str, llm_response: LLMResponse):
|
|
1129
|
+
"""Track token usage for analytics (same as sync version)"""
|
|
538
1130
|
if llm_response.prompt_tokens:
|
|
539
1131
|
self._token_usage_raw["total_prompt_tokens"] += llm_response.prompt_tokens
|
|
540
1132
|
if llm_response.completion_tokens:
|
|
@@ -553,12 +1145,7 @@ INCORRECT Examples (DO NOT DO THIS):
|
|
|
553
1145
|
)
|
|
554
1146
|
|
|
555
1147
|
def get_token_stats(self) -> TokenStats:
|
|
556
|
-
"""
|
|
557
|
-
Get token usage statistics
|
|
558
|
-
|
|
559
|
-
Returns:
|
|
560
|
-
TokenStats with token usage breakdown
|
|
561
|
-
"""
|
|
1148
|
+
"""Get token usage statistics (same as sync version)"""
|
|
562
1149
|
by_action = [ActionTokenUsage(**action) for action in self._token_usage_raw["by_action"]]
|
|
563
1150
|
return TokenStats(
|
|
564
1151
|
total_prompt_tokens=self._token_usage_raw["total_prompt_tokens"],
|
|
@@ -568,16 +1155,11 @@ INCORRECT Examples (DO NOT DO THIS):
|
|
|
568
1155
|
)
|
|
569
1156
|
|
|
570
1157
|
def get_history(self) -> list[ActionHistory]:
|
|
571
|
-
"""
|
|
572
|
-
Get execution history
|
|
573
|
-
|
|
574
|
-
Returns:
|
|
575
|
-
List of ActionHistory entries
|
|
576
|
-
"""
|
|
1158
|
+
"""Get execution history (same as sync version)"""
|
|
577
1159
|
return [ActionHistory(**h) for h in self.history]
|
|
578
1160
|
|
|
579
1161
|
def clear_history(self) -> None:
|
|
580
|
-
"""Clear execution history and reset token counters"""
|
|
1162
|
+
"""Clear execution history and reset token counters (same as sync version)"""
|
|
581
1163
|
self.history.clear()
|
|
582
1164
|
self._token_usage_raw = {
|
|
583
1165
|
"total_prompt_tokens": 0,
|
|
@@ -590,8 +1172,8 @@ INCORRECT Examples (DO NOT DO THIS):
|
|
|
590
1172
|
"""
|
|
591
1173
|
Filter elements from snapshot based on goal context.
|
|
592
1174
|
|
|
593
|
-
This
|
|
594
|
-
relevant elements and filters out irrelevant ones.
|
|
1175
|
+
This implementation uses ElementFilter to apply goal-based keyword matching
|
|
1176
|
+
to boost relevant elements and filters out irrelevant ones.
|
|
595
1177
|
|
|
596
1178
|
Args:
|
|
597
1179
|
snapshot: Current page snapshot
|
|
@@ -600,73 +1182,4 @@ INCORRECT Examples (DO NOT DO THIS):
|
|
|
600
1182
|
Returns:
|
|
601
1183
|
Filtered list of elements
|
|
602
1184
|
"""
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
# If no goal provided, return all elements (up to limit)
|
|
606
|
-
if not goal:
|
|
607
|
-
return elements[: self.default_snapshot_limit]
|
|
608
|
-
|
|
609
|
-
goal_lower = goal.lower()
|
|
610
|
-
|
|
611
|
-
# Extract keywords from goal
|
|
612
|
-
keywords = self._extract_keywords(goal_lower)
|
|
613
|
-
|
|
614
|
-
# Boost elements matching goal keywords
|
|
615
|
-
scored_elements = []
|
|
616
|
-
for el in elements:
|
|
617
|
-
score = el.importance
|
|
618
|
-
|
|
619
|
-
# Boost if element text matches goal
|
|
620
|
-
if el.text and any(kw in el.text.lower() for kw in keywords):
|
|
621
|
-
score += 0.3
|
|
622
|
-
|
|
623
|
-
# Boost if role matches goal intent
|
|
624
|
-
if "click" in goal_lower and el.visual_cues.is_clickable:
|
|
625
|
-
score += 0.2
|
|
626
|
-
if "type" in goal_lower and el.role in ["textbox", "searchbox"]:
|
|
627
|
-
score += 0.2
|
|
628
|
-
if "search" in goal_lower:
|
|
629
|
-
# Filter out non-interactive elements for search tasks
|
|
630
|
-
if el.role in ["link", "img"] and not el.visual_cues.is_primary:
|
|
631
|
-
score -= 0.5
|
|
632
|
-
|
|
633
|
-
scored_elements.append((score, el))
|
|
634
|
-
|
|
635
|
-
# Re-sort by boosted score
|
|
636
|
-
scored_elements.sort(key=lambda x: x[0], reverse=True)
|
|
637
|
-
elements = [el for _, el in scored_elements]
|
|
638
|
-
|
|
639
|
-
return elements[: self.default_snapshot_limit]
|
|
640
|
-
|
|
641
|
-
def _extract_keywords(self, text: str) -> list[str]:
|
|
642
|
-
"""
|
|
643
|
-
Extract meaningful keywords from goal text
|
|
644
|
-
|
|
645
|
-
Args:
|
|
646
|
-
text: Text to extract keywords from
|
|
647
|
-
|
|
648
|
-
Returns:
|
|
649
|
-
List of keywords
|
|
650
|
-
"""
|
|
651
|
-
stopwords = {
|
|
652
|
-
"the",
|
|
653
|
-
"a",
|
|
654
|
-
"an",
|
|
655
|
-
"and",
|
|
656
|
-
"or",
|
|
657
|
-
"but",
|
|
658
|
-
"in",
|
|
659
|
-
"on",
|
|
660
|
-
"at",
|
|
661
|
-
"to",
|
|
662
|
-
"for",
|
|
663
|
-
"of",
|
|
664
|
-
"with",
|
|
665
|
-
"by",
|
|
666
|
-
"from",
|
|
667
|
-
"as",
|
|
668
|
-
"is",
|
|
669
|
-
"was",
|
|
670
|
-
}
|
|
671
|
-
words = text.split()
|
|
672
|
-
return [w for w in words if w not in stopwords and len(w) > 2]
|
|
1185
|
+
return ElementFilter.filter_by_goal(snapshot, goal, self.default_snapshot_limit)
|