sentienceapi 0.90.16__py3-none-any.whl → 0.92.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sentienceapi might be problematic. Click here for more details.
- sentience/__init__.py +14 -5
- sentience/action_executor.py +215 -0
- sentience/actions.py +408 -25
- sentience/agent.py +802 -293
- sentience/agent_config.py +3 -0
- sentience/async_api.py +83 -1142
- sentience/base_agent.py +95 -0
- sentience/browser.py +484 -1
- sentience/browser_evaluator.py +299 -0
- sentience/cloud_tracing.py +457 -33
- sentience/conversational_agent.py +77 -43
- sentience/element_filter.py +136 -0
- sentience/expect.py +98 -2
- sentience/extension/background.js +56 -185
- sentience/extension/content.js +117 -289
- sentience/extension/injected_api.js +799 -1374
- sentience/extension/manifest.json +1 -1
- sentience/extension/pkg/sentience_core.js +190 -396
- sentience/extension/pkg/sentience_core_bg.wasm +0 -0
- sentience/extension/release.json +47 -47
- sentience/formatting.py +9 -53
- sentience/inspector.py +183 -1
- sentience/llm_interaction_handler.py +191 -0
- sentience/llm_provider.py +74 -52
- sentience/llm_provider_utils.py +120 -0
- sentience/llm_response_builder.py +153 -0
- sentience/models.py +60 -1
- sentience/overlay.py +109 -2
- sentience/protocols.py +228 -0
- sentience/query.py +1 -1
- sentience/read.py +95 -3
- sentience/recorder.py +223 -3
- sentience/schemas/trace_v1.json +102 -9
- sentience/screenshot.py +48 -2
- sentience/sentience_methods.py +86 -0
- sentience/snapshot.py +291 -38
- sentience/snapshot_diff.py +141 -0
- sentience/text_search.py +119 -5
- sentience/trace_event_builder.py +129 -0
- sentience/trace_file_manager.py +197 -0
- sentience/trace_indexing/index_schema.py +95 -7
- sentience/trace_indexing/indexer.py +117 -14
- sentience/tracer_factory.py +119 -6
- sentience/tracing.py +172 -8
- sentience/utils/__init__.py +40 -0
- sentience/utils/browser.py +46 -0
- sentience/utils/element.py +257 -0
- sentience/utils/formatting.py +59 -0
- sentience/utils.py +1 -1
- sentience/visual_agent.py +2056 -0
- sentience/wait.py +68 -2
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.92.2.dist-info}/METADATA +2 -1
- sentienceapi-0.92.2.dist-info/RECORD +65 -0
- sentience/extension/test-content.js +0 -4
- sentienceapi-0.90.16.dist-info/RECORD +0 -50
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.92.2.dist-info}/WHEEL +0 -0
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.92.2.dist-info}/entry_points.txt +0 -0
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.92.2.dist-info}/licenses/LICENSE +0 -0
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.92.2.dist-info}/licenses/LICENSE-APACHE +0 -0
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.92.2.dist-info}/licenses/LICENSE-MIT +0 -0
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.92.2.dist-info}/top_level.txt +0 -0
sentience/agent.py
CHANGED
|
@@ -3,13 +3,17 @@ Sentience Agent: High-level automation agent using LLM + SDK
|
|
|
3
3
|
Implements observe-think-act loop for natural language commands
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
|
-
import
|
|
6
|
+
import asyncio
|
|
7
|
+
import hashlib
|
|
7
8
|
import time
|
|
8
|
-
from typing import TYPE_CHECKING, Any, Optional
|
|
9
|
-
|
|
10
|
-
from .
|
|
11
|
-
from .
|
|
12
|
-
from .
|
|
9
|
+
from typing import TYPE_CHECKING, Any, Optional, Union
|
|
10
|
+
|
|
11
|
+
from .action_executor import ActionExecutor
|
|
12
|
+
from .agent_config import AgentConfig
|
|
13
|
+
from .base_agent import BaseAgent, BaseAgentAsync
|
|
14
|
+
from .browser import AsyncSentienceBrowser, SentienceBrowser
|
|
15
|
+
from .element_filter import ElementFilter
|
|
16
|
+
from .llm_interaction_handler import LLMInteractionHandler
|
|
13
17
|
from .llm_provider import LLMProvider, LLMResponse
|
|
14
18
|
from .models import (
|
|
15
19
|
ActionHistory,
|
|
@@ -21,13 +25,46 @@ from .models import (
|
|
|
21
25
|
SnapshotOptions,
|
|
22
26
|
TokenStats,
|
|
23
27
|
)
|
|
24
|
-
from .
|
|
28
|
+
from .protocols import AsyncBrowserProtocol, BrowserProtocol
|
|
29
|
+
from .snapshot import snapshot, snapshot_async
|
|
30
|
+
from .snapshot_diff import SnapshotDiff
|
|
31
|
+
from .trace_event_builder import TraceEventBuilder
|
|
25
32
|
|
|
26
33
|
if TYPE_CHECKING:
|
|
27
|
-
from .agent_config import AgentConfig
|
|
28
34
|
from .tracing import Tracer
|
|
29
35
|
|
|
30
36
|
|
|
37
|
+
def _safe_tracer_call(
|
|
38
|
+
tracer: Optional["Tracer"], method_name: str, verbose: bool, *args, **kwargs
|
|
39
|
+
) -> None:
|
|
40
|
+
"""
|
|
41
|
+
Safely call tracer method, catching and logging errors without breaking execution.
|
|
42
|
+
|
|
43
|
+
Args:
|
|
44
|
+
tracer: Tracer instance or None
|
|
45
|
+
method_name: Name of tracer method to call (e.g., "emit", "emit_error")
|
|
46
|
+
verbose: Whether to print error messages
|
|
47
|
+
*args: Positional arguments for the tracer method
|
|
48
|
+
**kwargs: Keyword arguments for the tracer method
|
|
49
|
+
"""
|
|
50
|
+
if not tracer:
|
|
51
|
+
return
|
|
52
|
+
try:
|
|
53
|
+
method = getattr(tracer, method_name)
|
|
54
|
+
if args and kwargs:
|
|
55
|
+
method(*args, **kwargs)
|
|
56
|
+
elif args:
|
|
57
|
+
method(*args)
|
|
58
|
+
elif kwargs:
|
|
59
|
+
method(**kwargs)
|
|
60
|
+
else:
|
|
61
|
+
method()
|
|
62
|
+
except Exception as tracer_error:
|
|
63
|
+
# Tracer errors should not break agent execution
|
|
64
|
+
if verbose:
|
|
65
|
+
print(f"⚠️ Tracer error (non-fatal): {tracer_error}")
|
|
66
|
+
|
|
67
|
+
|
|
31
68
|
class SentienceAgent(BaseAgent):
|
|
32
69
|
"""
|
|
33
70
|
High-level agent that combines Sentience SDK with any LLM provider.
|
|
@@ -54,7 +91,7 @@ class SentienceAgent(BaseAgent):
|
|
|
54
91
|
|
|
55
92
|
def __init__(
|
|
56
93
|
self,
|
|
57
|
-
browser: SentienceBrowser,
|
|
94
|
+
browser: SentienceBrowser | BrowserProtocol,
|
|
58
95
|
llm: LLMProvider,
|
|
59
96
|
default_snapshot_limit: int = 50,
|
|
60
97
|
verbose: bool = True,
|
|
@@ -65,7 +102,8 @@ class SentienceAgent(BaseAgent):
|
|
|
65
102
|
Initialize Sentience Agent
|
|
66
103
|
|
|
67
104
|
Args:
|
|
68
|
-
browser: SentienceBrowser instance
|
|
105
|
+
browser: SentienceBrowser instance or BrowserProtocol-compatible object
|
|
106
|
+
(for testing, can use mock objects that implement BrowserProtocol)
|
|
69
107
|
llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.)
|
|
70
108
|
default_snapshot_limit: Default maximum elements to include in context (default: 50)
|
|
71
109
|
verbose: Print execution logs (default: True)
|
|
@@ -77,8 +115,13 @@ class SentienceAgent(BaseAgent):
|
|
|
77
115
|
self.default_snapshot_limit = default_snapshot_limit
|
|
78
116
|
self.verbose = verbose
|
|
79
117
|
self.tracer = tracer
|
|
80
|
-
self.config = config
|
|
118
|
+
self.config = config or AgentConfig()
|
|
119
|
+
|
|
120
|
+
# Initialize handlers
|
|
121
|
+
self.llm_handler = LLMInteractionHandler(llm)
|
|
122
|
+
self.action_executor = ActionExecutor(browser)
|
|
81
123
|
|
|
124
|
+
# Screenshot sequence counter
|
|
82
125
|
# Execution history
|
|
83
126
|
self.history: list[dict[str, Any]] = []
|
|
84
127
|
|
|
@@ -93,6 +136,27 @@ class SentienceAgent(BaseAgent):
|
|
|
93
136
|
# Step counter for tracing
|
|
94
137
|
self._step_count = 0
|
|
95
138
|
|
|
139
|
+
# Previous snapshot for diff detection
|
|
140
|
+
self._previous_snapshot: Snapshot | None = None
|
|
141
|
+
|
|
142
|
+
def _compute_hash(self, text: str) -> str:
|
|
143
|
+
"""Compute SHA256 hash of text."""
|
|
144
|
+
return hashlib.sha256(text.encode("utf-8")).hexdigest()
|
|
145
|
+
|
|
146
|
+
def _get_element_bbox(self, element_id: int | None, snap: Snapshot) -> dict[str, float] | None:
    """
    Look up an element's bounding box in a snapshot.

    Args:
        element_id: ID of the element to find, or None
        snap: Snapshot whose ``elements`` are searched

    Returns:
        Dict with "x", "y", "width" and "height" taken from the first element
        whose ``id`` equals ``element_id``; None when ``element_id`` is None
        or no element matches
    """
    if element_id is None:
        return None

    # First match wins, mirroring a front-to-back scan of the element list.
    found = next((el for el in snap.elements if el.id == element_id), None)
    if found is None:
        return None

    return {
        "x": found.bbox.x,
        "y": found.bbox.y,
        "width": found.bbox.width,
        "height": found.bbox.height,
    }
|
|
159
|
+
|
|
96
160
|
def act( # noqa: C901
|
|
97
161
|
self,
|
|
98
162
|
goal: str,
|
|
@@ -130,7 +194,10 @@ class SentienceAgent(BaseAgent):
|
|
|
130
194
|
# Emit step_start trace event if tracer is enabled
|
|
131
195
|
if self.tracer:
|
|
132
196
|
pre_url = self.browser.page.url if self.browser.page else None
|
|
133
|
-
|
|
197
|
+
_safe_tracer_call(
|
|
198
|
+
self.tracer,
|
|
199
|
+
"emit_step_start",
|
|
200
|
+
self.verbose,
|
|
134
201
|
step_id=step_id,
|
|
135
202
|
step_index=self._step_count,
|
|
136
203
|
goal=goal,
|
|
@@ -149,66 +216,107 @@ class SentienceAgent(BaseAgent):
|
|
|
149
216
|
if snap_opts.goal is None:
|
|
150
217
|
snap_opts.goal = goal
|
|
151
218
|
|
|
219
|
+
# Apply AgentConfig screenshot settings if not overridden by snapshot_options
|
|
220
|
+
if snapshot_options is None and self.config:
|
|
221
|
+
if self.config.capture_screenshots:
|
|
222
|
+
# Create ScreenshotConfig from AgentConfig
|
|
223
|
+
snap_opts.screenshot = ScreenshotConfig(
|
|
224
|
+
format=self.config.screenshot_format,
|
|
225
|
+
quality=(
|
|
226
|
+
self.config.screenshot_quality
|
|
227
|
+
if self.config.screenshot_format == "jpeg"
|
|
228
|
+
else None
|
|
229
|
+
),
|
|
230
|
+
)
|
|
231
|
+
else:
|
|
232
|
+
snap_opts.screenshot = False
|
|
233
|
+
# Apply show_overlay from AgentConfig
|
|
234
|
+
snap_opts.show_overlay = self.config.show_overlay
|
|
235
|
+
|
|
152
236
|
# Call snapshot with options object (matches TypeScript API)
|
|
153
237
|
snap = snapshot(self.browser, snap_opts)
|
|
154
238
|
|
|
155
239
|
if snap.status != "success":
|
|
156
240
|
raise RuntimeError(f"Snapshot failed: {snap.error}")
|
|
157
241
|
|
|
242
|
+
# Compute diff_status by comparing with previous snapshot
|
|
243
|
+
elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)
|
|
244
|
+
|
|
245
|
+
# Create snapshot with diff_status populated
|
|
246
|
+
snap_with_diff = Snapshot(
|
|
247
|
+
status=snap.status,
|
|
248
|
+
timestamp=snap.timestamp,
|
|
249
|
+
url=snap.url,
|
|
250
|
+
viewport=snap.viewport,
|
|
251
|
+
elements=elements_with_diff,
|
|
252
|
+
screenshot=snap.screenshot,
|
|
253
|
+
screenshot_format=snap.screenshot_format,
|
|
254
|
+
error=snap.error,
|
|
255
|
+
)
|
|
256
|
+
|
|
257
|
+
# Update previous snapshot for next comparison
|
|
258
|
+
self._previous_snapshot = snap
|
|
259
|
+
|
|
158
260
|
# Apply element filtering based on goal
|
|
159
|
-
filtered_elements = self.filter_elements(
|
|
261
|
+
filtered_elements = self.filter_elements(snap_with_diff, goal)
|
|
160
262
|
|
|
161
263
|
# Emit snapshot trace event if tracer is enabled
|
|
162
264
|
if self.tracer:
|
|
163
|
-
#
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
265
|
+
# Build snapshot event data (use snap_with_diff to include diff_status)
|
|
266
|
+
snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
|
|
267
|
+
|
|
268
|
+
# Always include screenshot in trace event for studio viewer compatibility
|
|
269
|
+
# CloudTraceSink will extract and upload screenshots separately, then remove
|
|
270
|
+
# screenshot_base64 from events before uploading the trace file.
|
|
271
|
+
if snap.screenshot:
|
|
272
|
+
# Extract base64 string from data URL if needed
|
|
273
|
+
if snap.screenshot.startswith("data:image"):
|
|
274
|
+
# Format: "data:image/jpeg;base64,{base64_string}"
|
|
275
|
+
screenshot_base64 = (
|
|
276
|
+
snap.screenshot.split(",", 1)[1]
|
|
277
|
+
if "," in snap.screenshot
|
|
278
|
+
else snap.screenshot
|
|
279
|
+
)
|
|
280
|
+
else:
|
|
281
|
+
screenshot_base64 = snap.screenshot
|
|
282
|
+
|
|
283
|
+
snapshot_data["screenshot_base64"] = screenshot_base64
|
|
284
|
+
if snap.screenshot_format:
|
|
285
|
+
snapshot_data["screenshot_format"] = snap.screenshot_format
|
|
286
|
+
|
|
287
|
+
_safe_tracer_call(
|
|
288
|
+
self.tracer,
|
|
289
|
+
"emit",
|
|
290
|
+
self.verbose,
|
|
181
291
|
"snapshot",
|
|
182
|
-
|
|
183
|
-
"url": snap.url,
|
|
184
|
-
"element_count": len(snap.elements),
|
|
185
|
-
"timestamp": snap.timestamp,
|
|
186
|
-
"elements": elements_data, # Add element data for overlay
|
|
187
|
-
},
|
|
292
|
+
snapshot_data,
|
|
188
293
|
step_id=step_id,
|
|
189
294
|
)
|
|
190
295
|
|
|
191
|
-
# Create filtered snapshot
|
|
296
|
+
# Create filtered snapshot (use snap_with_diff to preserve metadata)
|
|
192
297
|
filtered_snap = Snapshot(
|
|
193
|
-
status=
|
|
194
|
-
timestamp=
|
|
195
|
-
url=
|
|
196
|
-
viewport=
|
|
298
|
+
status=snap_with_diff.status,
|
|
299
|
+
timestamp=snap_with_diff.timestamp,
|
|
300
|
+
url=snap_with_diff.url,
|
|
301
|
+
viewport=snap_with_diff.viewport,
|
|
197
302
|
elements=filtered_elements,
|
|
198
|
-
screenshot=
|
|
199
|
-
screenshot_format=
|
|
200
|
-
error=
|
|
303
|
+
screenshot=snap_with_diff.screenshot,
|
|
304
|
+
screenshot_format=snap_with_diff.screenshot_format,
|
|
305
|
+
error=snap_with_diff.error,
|
|
201
306
|
)
|
|
202
307
|
|
|
203
308
|
# 2. GROUND: Format elements for LLM context
|
|
204
|
-
context = self.
|
|
309
|
+
context = self.llm_handler.build_context(filtered_snap, goal)
|
|
205
310
|
|
|
206
311
|
# 3. THINK: Query LLM for next action
|
|
207
|
-
llm_response = self.
|
|
312
|
+
llm_response = self.llm_handler.query_llm(context, goal)
|
|
208
313
|
|
|
209
314
|
# Emit LLM query trace event if tracer is enabled
|
|
210
315
|
if self.tracer:
|
|
211
|
-
|
|
316
|
+
_safe_tracer_call(
|
|
317
|
+
self.tracer,
|
|
318
|
+
"emit",
|
|
319
|
+
self.verbose,
|
|
212
320
|
"llm_query",
|
|
213
321
|
{
|
|
214
322
|
"prompt_tokens": llm_response.prompt_tokens,
|
|
@@ -226,10 +334,10 @@ class SentienceAgent(BaseAgent):
|
|
|
226
334
|
self._track_tokens(goal, llm_response)
|
|
227
335
|
|
|
228
336
|
# Parse action from LLM response
|
|
229
|
-
action_str = self.
|
|
337
|
+
action_str = self.llm_handler.extract_action(llm_response.content)
|
|
230
338
|
|
|
231
339
|
# 4. EXECUTE: Parse and run action
|
|
232
|
-
result_dict = self.
|
|
340
|
+
result_dict = self.action_executor.execute(action_str, filtered_snap)
|
|
233
341
|
|
|
234
342
|
duration_ms = int((time.time() - start_time) * 1000)
|
|
235
343
|
|
|
@@ -269,7 +377,10 @@ class SentienceAgent(BaseAgent):
|
|
|
269
377
|
for el in filtered_snap.elements[:50]
|
|
270
378
|
]
|
|
271
379
|
|
|
272
|
-
|
|
380
|
+
_safe_tracer_call(
|
|
381
|
+
self.tracer,
|
|
382
|
+
"emit",
|
|
383
|
+
self.verbose,
|
|
273
384
|
"action",
|
|
274
385
|
{
|
|
275
386
|
"action": result.action,
|
|
@@ -302,13 +413,105 @@ class SentienceAgent(BaseAgent):
|
|
|
302
413
|
|
|
303
414
|
# Emit step completion trace event if tracer is enabled
|
|
304
415
|
if self.tracer:
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
416
|
+
# Get pre_url from step_start (stored in tracer or use current)
|
|
417
|
+
pre_url = snap.url
|
|
418
|
+
post_url = self.browser.page.url if self.browser.page else None
|
|
419
|
+
|
|
420
|
+
# Compute snapshot digest (simplified - use URL + timestamp)
|
|
421
|
+
snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"
|
|
422
|
+
|
|
423
|
+
# Build LLM data
|
|
424
|
+
llm_response_text = llm_response.content
|
|
425
|
+
llm_response_hash = f"sha256:{self._compute_hash(llm_response_text)}"
|
|
426
|
+
llm_data = {
|
|
427
|
+
"response_text": llm_response_text,
|
|
428
|
+
"response_hash": llm_response_hash,
|
|
429
|
+
"usage": {
|
|
430
|
+
"prompt_tokens": llm_response.prompt_tokens or 0,
|
|
431
|
+
"completion_tokens": llm_response.completion_tokens or 0,
|
|
432
|
+
"total_tokens": llm_response.total_tokens or 0,
|
|
311
433
|
},
|
|
434
|
+
}
|
|
435
|
+
|
|
436
|
+
# Build exec data
|
|
437
|
+
exec_data = {
|
|
438
|
+
"success": result.success,
|
|
439
|
+
"action": result.action,
|
|
440
|
+
"outcome": result.outcome
|
|
441
|
+
or (
|
|
442
|
+
f"Action {result.action} executed successfully"
|
|
443
|
+
if result.success
|
|
444
|
+
else f"Action {result.action} failed"
|
|
445
|
+
),
|
|
446
|
+
"duration_ms": duration_ms,
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
# Add optional exec fields
|
|
450
|
+
if result.element_id is not None:
|
|
451
|
+
exec_data["element_id"] = result.element_id
|
|
452
|
+
# Add bounding box if element found
|
|
453
|
+
bbox = self._get_element_bbox(result.element_id, snap)
|
|
454
|
+
if bbox:
|
|
455
|
+
exec_data["bounding_box"] = bbox
|
|
456
|
+
if result.text is not None:
|
|
457
|
+
exec_data["text"] = result.text
|
|
458
|
+
if result.key is not None:
|
|
459
|
+
exec_data["key"] = result.key
|
|
460
|
+
if result.error is not None:
|
|
461
|
+
exec_data["error"] = result.error
|
|
462
|
+
|
|
463
|
+
# Build verify data (simplified - based on success and url_changed)
|
|
464
|
+
verify_passed = result.success and (
|
|
465
|
+
result.url_changed or result.action != "click"
|
|
466
|
+
)
|
|
467
|
+
verify_signals = {
|
|
468
|
+
"url_changed": result.url_changed or False,
|
|
469
|
+
}
|
|
470
|
+
if result.error:
|
|
471
|
+
verify_signals["error"] = result.error
|
|
472
|
+
|
|
473
|
+
# Add elements_found array if element was targeted
|
|
474
|
+
if result.element_id is not None:
|
|
475
|
+
bbox = self._get_element_bbox(result.element_id, snap)
|
|
476
|
+
if bbox:
|
|
477
|
+
verify_signals["elements_found"] = [
|
|
478
|
+
{
|
|
479
|
+
"label": f"Element {result.element_id}",
|
|
480
|
+
"bounding_box": bbox,
|
|
481
|
+
}
|
|
482
|
+
]
|
|
483
|
+
|
|
484
|
+
verify_data = {
|
|
485
|
+
"passed": verify_passed,
|
|
486
|
+
"signals": verify_signals,
|
|
487
|
+
}
|
|
488
|
+
|
|
489
|
+
# Build elements data for pre field (include diff_status from snap_with_diff)
|
|
490
|
+
# Use the same format as build_snapshot_event for consistency
|
|
491
|
+
snapshot_event_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
|
|
492
|
+
pre_elements = snapshot_event_data.get("elements", [])
|
|
493
|
+
|
|
494
|
+
# Build complete step_end event
|
|
495
|
+
step_end_data = TraceEventBuilder.build_step_end_event(
|
|
496
|
+
step_id=step_id,
|
|
497
|
+
step_index=self._step_count,
|
|
498
|
+
goal=goal,
|
|
499
|
+
attempt=attempt,
|
|
500
|
+
pre_url=pre_url,
|
|
501
|
+
post_url=post_url,
|
|
502
|
+
snapshot_digest=snapshot_digest,
|
|
503
|
+
llm_data=llm_data,
|
|
504
|
+
exec_data=exec_data,
|
|
505
|
+
verify_data=verify_data,
|
|
506
|
+
pre_elements=pre_elements,
|
|
507
|
+
)
|
|
508
|
+
|
|
509
|
+
_safe_tracer_call(
|
|
510
|
+
self.tracer,
|
|
511
|
+
"emit",
|
|
512
|
+
self.verbose,
|
|
513
|
+
"step_end",
|
|
514
|
+
step_end_data,
|
|
312
515
|
step_id=step_id,
|
|
313
516
|
)
|
|
314
517
|
|
|
@@ -317,7 +520,14 @@ class SentienceAgent(BaseAgent):
|
|
|
317
520
|
except Exception as e:
|
|
318
521
|
# Emit error trace event if tracer is enabled
|
|
319
522
|
if self.tracer:
|
|
320
|
-
|
|
523
|
+
_safe_tracer_call(
|
|
524
|
+
self.tracer,
|
|
525
|
+
"emit_error",
|
|
526
|
+
self.verbose,
|
|
527
|
+
step_id=step_id,
|
|
528
|
+
error=str(e),
|
|
529
|
+
attempt=attempt,
|
|
530
|
+
)
|
|
321
531
|
|
|
322
532
|
if attempt < max_retries:
|
|
323
533
|
if self.verbose:
|
|
@@ -346,195 +556,573 @@ class SentienceAgent(BaseAgent):
|
|
|
346
556
|
)
|
|
347
557
|
raise RuntimeError(f"Failed after {max_retries} retries: {e}")
|
|
348
558
|
|
|
349
|
-
def
|
|
559
|
+
def _track_tokens(self, goal: str, llm_response: LLMResponse):
    """
    Record the token usage of one LLM call in ``self._token_usage_raw``.

    Running totals are increased only for counts that are truthy (a count of
    None or 0 leaves the total untouched); a per-call entry is always appended
    to the "by_action" list, with missing counts stored as 0.

    Args:
        goal: User goal the LLM call was made for
        llm_response: LLM response carrying token counts and the model name
    """
    usage = self._token_usage_raw
    prompt = llm_response.prompt_tokens
    completion = llm_response.completion_tokens
    total = llm_response.total_tokens

    if prompt:
        usage["total_prompt_tokens"] += prompt
    if completion:
        usage["total_completion_tokens"] += completion
    if total:
        usage["total_tokens"] += total

    entry = {
        "goal": goal,
        "prompt_tokens": prompt or 0,
        "completion_tokens": completion or 0,
        "total_tokens": total or 0,
        "model": llm_response.model_name,
    }
    usage["by_action"].append(entry)
|
|
583
|
+
|
|
584
|
+
def get_token_stats(self) -> TokenStats:
    """
    Build a TokenStats summary from the raw usage counters.

    Returns:
        TokenStats holding the accumulated prompt/completion/total token
        counts plus one ActionTokenUsage per recorded "by_action" entry
    """
    usage = self._token_usage_raw
    per_action = [ActionTokenUsage(**entry) for entry in usage["by_action"]]
    return TokenStats(
        total_prompt_tokens=usage["total_prompt_tokens"],
        total_completion_tokens=usage["total_completion_tokens"],
        total_tokens=usage["total_tokens"],
        by_action=per_action,
    )
|
|
379
598
|
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
599
|
+
def get_history(self) -> list[ActionHistory]:
    """
    Return the execution history as typed records.

    Returns:
        A new list with one ActionHistory built from each dict stored in
        ``self.history``, in the same order
    """
    records = self.history
    return [ActionHistory(**record) for record in records]
|
|
384
607
|
|
|
385
|
-
|
|
608
|
+
def clear_history(self) -> None:
    """Empty the execution history in place and reset all token counters to zero."""
    self.history.clear()

    # Fresh counter dict: the three running totals start at 0 and the
    # per-action log starts empty (key order matches the initial layout).
    counter_keys = ("total_prompt_tokens", "total_completion_tokens", "total_tokens")
    self._token_usage_raw = {**dict.fromkeys(counter_keys, 0), "by_action": []}
|
|
386
617
|
|
|
387
|
-
def
|
|
618
|
+
def filter_elements(self, snapshot: Snapshot, goal: str | None = None) -> list[Element]:
    """
    Select the snapshot elements to pass on for the given goal.

    Delegates to ``ElementFilter.filter_by_goal`` with this agent's
    ``default_snapshot_limit``. The filtering logic itself lives in
    ElementFilter and is not visible here (presumably goal-based keyword
    relevance, per the previous docstring - confirm against ElementFilter).

    Args:
        snapshot: Current page snapshot
        goal: User's goal, forwarded to the filter (may be None)

    Returns:
        Filtered list of elements
    """
    limit = self.default_snapshot_limit
    return ElementFilter.filter_by_goal(snapshot, goal, limit)
|
|
399
633
|
|
|
400
|
-
# Remove markdown code blocks if present
|
|
401
|
-
response = re.sub(r"```[\w]*\n?", "", response)
|
|
402
|
-
response = response.strip()
|
|
403
634
|
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
635
|
+
class SentienceAgentAsync(BaseAgentAsync):
|
|
636
|
+
"""
|
|
637
|
+
High-level async agent that combines Sentience SDK with any LLM provider.
|
|
407
638
|
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
639
|
+
Uses observe-think-act loop to execute natural language commands:
|
|
640
|
+
1. OBSERVE: Get snapshot of current page state
|
|
641
|
+
2. THINK: Query LLM to decide next action
|
|
642
|
+
3. ACT: Execute action using SDK
|
|
411
643
|
|
|
412
|
-
|
|
413
|
-
|
|
644
|
+
Example:
|
|
645
|
+
>>> from sentience.async_api import AsyncSentienceBrowser
|
|
646
|
+
>>> from sentience.agent import SentienceAgentAsync
|
|
647
|
+
>>> from sentience.llm_provider import OpenAIProvider
|
|
648
|
+
>>>
|
|
649
|
+
>>> async with AsyncSentienceBrowser() as browser:
|
|
650
|
+
>>> await browser.goto("https://google.com")
|
|
651
|
+
>>> llm = OpenAIProvider(api_key="openai_key", model="gpt-4o")
|
|
652
|
+
>>> agent = SentienceAgentAsync(browser, llm)
|
|
653
|
+
>>> await agent.act("Click the search box")
|
|
654
|
+
>>> await agent.act("Type 'magic mouse' into the search field")
|
|
655
|
+
>>> await agent.act("Press Enter key")
|
|
656
|
+
"""
|
|
414
657
|
|
|
415
|
-
def
|
|
658
|
+
def __init__(
    self,
    browser: AsyncSentienceBrowser | AsyncBrowserProtocol,
    llm: LLMProvider,
    default_snapshot_limit: int = 50,
    verbose: bool = True,
    tracer: Optional["Tracer"] = None,
    config: Optional["AgentConfig"] = None,
):
    """
    Initialize Sentience Agent (async)

    Args:
        browser: AsyncSentienceBrowser instance or AsyncBrowserProtocol-compatible
                 object (annotation widened to mirror the sync agent's
                 ``SentienceBrowser | BrowserProtocol``;
                 NOTE(review): confirm the async helpers accept protocol objects)
        llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.)
        default_snapshot_limit: Default maximum elements to include in context (default: 50)
        verbose: Print execution logs (default: True)
        tracer: Optional Tracer instance for execution tracking (default: None)
        config: Optional AgentConfig for advanced configuration; a default
                AgentConfig() is created when omitted (default: None)
    """
    self.browser = browser
    self.llm = llm
    self.default_snapshot_limit = default_snapshot_limit
    self.verbose = verbose
    self.tracer = tracer
    # Always hold a config object so later code can read settings unconditionally.
    self.config = config or AgentConfig()

    # Initialize handlers
    self.llm_handler = LLMInteractionHandler(llm)
    self.action_executor = ActionExecutor(browser)

    # Execution history
    self.history: list[dict[str, Any]] = []

    # Token usage tracking (will be converted to TokenStats on get_token_stats())
    self._token_usage_raw = {
        "total_prompt_tokens": 0,
        "total_completion_tokens": 0,
        "total_tokens": 0,
        "by_action": [],
    }

    # Step counter for tracing
    self._step_count = 0

    # Previous snapshot for diff detection
    self._previous_snapshot: Snapshot | None = None
|
|
462
706
|
|
|
463
|
-
|
|
707
|
+
def _compute_hash(self, text: str) -> str:
|
|
708
|
+
"""Compute SHA256 hash of text."""
|
|
709
|
+
return hashlib.sha256(text.encode("utf-8")).hexdigest()
|
|
464
710
|
|
|
465
|
-
def
|
|
711
|
+
def _get_element_bbox(self, element_id: int | None, snap: Snapshot) -> dict[str, float] | None:
    """
    Look up an element's bounding box in a snapshot.

    Args:
        element_id: ID of the element to find, or None
        snap: Snapshot whose ``elements`` are searched

    Returns:
        Dict with "x", "y", "width" and "height" taken from the first element
        whose ``id`` equals ``element_id``; None when ``element_id`` is None
        or no element matches
    """
    if element_id is None:
        return None

    # First match wins, mirroring a front-to-back scan of the element list.
    target = next((el for el in snap.elements if el.id == element_id), None)
    if target is None:
        return None

    return {
        "x": target.bbox.x,
        "y": target.bbox.y,
        "width": target.bbox.width,
        "height": target.bbox.height,
    }
|
|
724
|
+
|
|
725
|
+
async def act( # noqa: C901
|
|
726
|
+
self,
|
|
727
|
+
goal: str,
|
|
728
|
+
max_retries: int = 2,
|
|
729
|
+
snapshot_options: SnapshotOptions | None = None,
|
|
730
|
+
) -> AgentActionResult:
|
|
466
731
|
"""
|
|
467
|
-
|
|
732
|
+
Execute a high-level goal using observe → think → act loop (async)
|
|
468
733
|
|
|
469
734
|
Args:
|
|
470
|
-
|
|
471
|
-
|
|
735
|
+
goal: Natural language instruction (e.g., "Click the Sign In button")
|
|
736
|
+
max_retries: Number of retries on failure (default: 2)
|
|
737
|
+
snapshot_options: Optional SnapshotOptions for this specific action
|
|
472
738
|
|
|
473
739
|
Returns:
|
|
474
|
-
|
|
740
|
+
AgentActionResult with execution details
|
|
741
|
+
|
|
742
|
+
Example:
|
|
743
|
+
>>> result = await agent.act("Click the search box")
|
|
744
|
+
>>> print(result.success, result.action, result.element_id)
|
|
745
|
+
True click 42
|
|
475
746
|
"""
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
return {
|
|
481
|
-
"success": result.success,
|
|
482
|
-
"action": "click",
|
|
483
|
-
"element_id": element_id,
|
|
484
|
-
"outcome": result.outcome,
|
|
485
|
-
"url_changed": result.url_changed,
|
|
486
|
-
}
|
|
747
|
+
if self.verbose:
|
|
748
|
+
print(f"\n{'=' * 70}")
|
|
749
|
+
print(f"🤖 Agent Goal: {goal}")
|
|
750
|
+
print(f"{'=' * 70}")
|
|
487
751
|
|
|
488
|
-
#
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
action_str,
|
|
492
|
-
re.IGNORECASE,
|
|
493
|
-
):
|
|
494
|
-
element_id = int(match.group(1))
|
|
495
|
-
text = match.group(2)
|
|
496
|
-
result = type_text(self.browser, element_id, text)
|
|
497
|
-
return {
|
|
498
|
-
"success": result.success,
|
|
499
|
-
"action": "type",
|
|
500
|
-
"element_id": element_id,
|
|
501
|
-
"text": text,
|
|
502
|
-
"outcome": result.outcome,
|
|
503
|
-
}
|
|
752
|
+
# Generate step ID for tracing
|
|
753
|
+
self._step_count += 1
|
|
754
|
+
step_id = f"step-{self._step_count}"
|
|
504
755
|
|
|
505
|
-
#
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
"
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
756
|
+
# Emit step_start trace event if tracer is enabled
|
|
757
|
+
if self.tracer:
|
|
758
|
+
pre_url = self.browser.page.url if self.browser.page else None
|
|
759
|
+
_safe_tracer_call(
|
|
760
|
+
self.tracer,
|
|
761
|
+
"emit_step_start",
|
|
762
|
+
self.verbose,
|
|
763
|
+
step_id=step_id,
|
|
764
|
+
step_index=self._step_count,
|
|
765
|
+
goal=goal,
|
|
766
|
+
attempt=0,
|
|
767
|
+
pre_url=pre_url,
|
|
768
|
+
)
|
|
515
769
|
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
"action": "finish",
|
|
521
|
-
"message": "Task marked as complete",
|
|
522
|
-
}
|
|
770
|
+
for attempt in range(max_retries + 1):
|
|
771
|
+
try:
|
|
772
|
+
# 1. OBSERVE: Get refined semantic snapshot
|
|
773
|
+
start_time = time.time()
|
|
523
774
|
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
775
|
+
# Use provided options or create default
|
|
776
|
+
snap_opts = snapshot_options or SnapshotOptions(limit=self.default_snapshot_limit)
|
|
777
|
+
# Only set goal if not already provided
|
|
778
|
+
if snap_opts.goal is None:
|
|
779
|
+
snap_opts.goal = goal
|
|
529
780
|
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
781
|
+
# Apply AgentConfig screenshot settings if not overridden by snapshot_options
|
|
782
|
+
# Only apply if snapshot_options wasn't provided OR if screenshot wasn't explicitly set
|
|
783
|
+
# (snapshot_options.screenshot defaults to False, so we check if it's still False)
|
|
784
|
+
if self.config and (snapshot_options is None or snap_opts.screenshot is False):
|
|
785
|
+
if self.config.capture_screenshots:
|
|
786
|
+
# Create ScreenshotConfig from AgentConfig
|
|
787
|
+
snap_opts.screenshot = ScreenshotConfig(
|
|
788
|
+
format=self.config.screenshot_format,
|
|
789
|
+
quality=(
|
|
790
|
+
self.config.screenshot_quality
|
|
791
|
+
if self.config.screenshot_format == "jpeg"
|
|
792
|
+
else None
|
|
793
|
+
),
|
|
794
|
+
)
|
|
795
|
+
else:
|
|
796
|
+
snap_opts.screenshot = False
|
|
797
|
+
# Apply show_overlay from AgentConfig
|
|
798
|
+
# Note: User can override by explicitly passing show_overlay in snapshot_options
|
|
799
|
+
snap_opts.show_overlay = self.config.show_overlay
|
|
533
800
|
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
801
|
+
# Call snapshot with options object (matches TypeScript API)
|
|
802
|
+
snap = await snapshot_async(self.browser, snap_opts)
|
|
803
|
+
|
|
804
|
+
if snap.status != "success":
|
|
805
|
+
raise RuntimeError(f"Snapshot failed: {snap.error}")
|
|
806
|
+
|
|
807
|
+
# Compute diff_status by comparing with previous snapshot
|
|
808
|
+
elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)
|
|
809
|
+
|
|
810
|
+
# Create snapshot with diff_status populated
|
|
811
|
+
snap_with_diff = Snapshot(
|
|
812
|
+
status=snap.status,
|
|
813
|
+
timestamp=snap.timestamp,
|
|
814
|
+
url=snap.url,
|
|
815
|
+
viewport=snap.viewport,
|
|
816
|
+
elements=elements_with_diff,
|
|
817
|
+
screenshot=snap.screenshot,
|
|
818
|
+
screenshot_format=snap.screenshot_format,
|
|
819
|
+
error=snap.error,
|
|
820
|
+
)
|
|
821
|
+
|
|
822
|
+
# Update previous snapshot for next comparison
|
|
823
|
+
self._previous_snapshot = snap
|
|
824
|
+
|
|
825
|
+
# Apply element filtering based on goal
|
|
826
|
+
filtered_elements = self.filter_elements(snap_with_diff, goal)
|
|
827
|
+
|
|
828
|
+
# Emit snapshot trace event if tracer is enabled
|
|
829
|
+
if self.tracer:
|
|
830
|
+
# Build snapshot event data (use snap_with_diff to include diff_status)
|
|
831
|
+
snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
|
|
832
|
+
|
|
833
|
+
# Always include screenshot in trace event for studio viewer compatibility
|
|
834
|
+
# CloudTraceSink will extract and upload screenshots separately, then remove
|
|
835
|
+
# screenshot_base64 from events before uploading the trace file.
|
|
836
|
+
if snap.screenshot:
|
|
837
|
+
# Extract base64 string from data URL if needed
|
|
838
|
+
if snap.screenshot.startswith("data:image"):
|
|
839
|
+
# Format: "data:image/jpeg;base64,{base64_string}"
|
|
840
|
+
screenshot_base64 = (
|
|
841
|
+
snap.screenshot.split(",", 1)[1]
|
|
842
|
+
if "," in snap.screenshot
|
|
843
|
+
else snap.screenshot
|
|
844
|
+
)
|
|
845
|
+
else:
|
|
846
|
+
screenshot_base64 = snap.screenshot
|
|
847
|
+
|
|
848
|
+
snapshot_data["screenshot_base64"] = screenshot_base64
|
|
849
|
+
if snap.screenshot_format:
|
|
850
|
+
snapshot_data["screenshot_format"] = snap.screenshot_format
|
|
851
|
+
|
|
852
|
+
_safe_tracer_call(
|
|
853
|
+
self.tracer,
|
|
854
|
+
"emit",
|
|
855
|
+
self.verbose,
|
|
856
|
+
"snapshot",
|
|
857
|
+
snapshot_data,
|
|
858
|
+
step_id=step_id,
|
|
859
|
+
)
|
|
860
|
+
|
|
861
|
+
# Create filtered snapshot (use snap_with_diff to preserve metadata)
|
|
862
|
+
filtered_snap = Snapshot(
|
|
863
|
+
status=snap_with_diff.status,
|
|
864
|
+
timestamp=snap_with_diff.timestamp,
|
|
865
|
+
url=snap_with_diff.url,
|
|
866
|
+
viewport=snap_with_diff.viewport,
|
|
867
|
+
elements=filtered_elements,
|
|
868
|
+
screenshot=snap_with_diff.screenshot,
|
|
869
|
+
screenshot_format=snap_with_diff.screenshot_format,
|
|
870
|
+
error=snap_with_diff.error,
|
|
871
|
+
)
|
|
872
|
+
|
|
873
|
+
# 2. GROUND: Format elements for LLM context
|
|
874
|
+
context = self.llm_handler.build_context(filtered_snap, goal)
|
|
875
|
+
|
|
876
|
+
# 3. THINK: Query LLM for next action
|
|
877
|
+
llm_response = self.llm_handler.query_llm(context, goal)
|
|
878
|
+
|
|
879
|
+
# Emit LLM query trace event if tracer is enabled
|
|
880
|
+
if self.tracer:
|
|
881
|
+
_safe_tracer_call(
|
|
882
|
+
self.tracer,
|
|
883
|
+
"emit",
|
|
884
|
+
self.verbose,
|
|
885
|
+
"llm_query",
|
|
886
|
+
{
|
|
887
|
+
"prompt_tokens": llm_response.prompt_tokens,
|
|
888
|
+
"completion_tokens": llm_response.completion_tokens,
|
|
889
|
+
"model": llm_response.model_name,
|
|
890
|
+
"response": llm_response.content[:200], # Truncate for brevity
|
|
891
|
+
},
|
|
892
|
+
step_id=step_id,
|
|
893
|
+
)
|
|
894
|
+
|
|
895
|
+
if self.verbose:
|
|
896
|
+
print(f"🧠 LLM Decision: {llm_response.content}")
|
|
897
|
+
|
|
898
|
+
# Track token usage
|
|
899
|
+
self._track_tokens(goal, llm_response)
|
|
900
|
+
|
|
901
|
+
# Parse action from LLM response
|
|
902
|
+
action_str = self.llm_handler.extract_action(llm_response.content)
|
|
903
|
+
|
|
904
|
+
# 4. EXECUTE: Parse and run action
|
|
905
|
+
result_dict = await self.action_executor.execute_async(action_str, filtered_snap)
|
|
906
|
+
|
|
907
|
+
duration_ms = int((time.time() - start_time) * 1000)
|
|
908
|
+
|
|
909
|
+
# Create AgentActionResult from execution result
|
|
910
|
+
result = AgentActionResult(
|
|
911
|
+
success=result_dict["success"],
|
|
912
|
+
action=result_dict["action"],
|
|
913
|
+
goal=goal,
|
|
914
|
+
duration_ms=duration_ms,
|
|
915
|
+
attempt=attempt,
|
|
916
|
+
element_id=result_dict.get("element_id"),
|
|
917
|
+
text=result_dict.get("text"),
|
|
918
|
+
key=result_dict.get("key"),
|
|
919
|
+
outcome=result_dict.get("outcome"),
|
|
920
|
+
url_changed=result_dict.get("url_changed"),
|
|
921
|
+
error=result_dict.get("error"),
|
|
922
|
+
message=result_dict.get("message"),
|
|
923
|
+
)
|
|
924
|
+
|
|
925
|
+
# Emit action execution trace event if tracer is enabled
|
|
926
|
+
if self.tracer:
|
|
927
|
+
post_url = self.browser.page.url if self.browser.page else None
|
|
928
|
+
|
|
929
|
+
# Include element data for live overlay visualization
|
|
930
|
+
elements_data = [
|
|
931
|
+
{
|
|
932
|
+
"id": el.id,
|
|
933
|
+
"bbox": {
|
|
934
|
+
"x": el.bbox.x,
|
|
935
|
+
"y": el.bbox.y,
|
|
936
|
+
"width": el.bbox.width,
|
|
937
|
+
"height": el.bbox.height,
|
|
938
|
+
},
|
|
939
|
+
"role": el.role,
|
|
940
|
+
"text": el.text[:50] if el.text else "",
|
|
941
|
+
}
|
|
942
|
+
for el in filtered_snap.elements[:50]
|
|
943
|
+
]
|
|
944
|
+
|
|
945
|
+
_safe_tracer_call(
|
|
946
|
+
self.tracer,
|
|
947
|
+
"emit",
|
|
948
|
+
self.verbose,
|
|
949
|
+
"action",
|
|
950
|
+
{
|
|
951
|
+
"action": result.action,
|
|
952
|
+
"element_id": result.element_id,
|
|
953
|
+
"success": result.success,
|
|
954
|
+
"outcome": result.outcome,
|
|
955
|
+
"duration_ms": duration_ms,
|
|
956
|
+
"post_url": post_url,
|
|
957
|
+
"elements": elements_data, # Add element data for overlay
|
|
958
|
+
"target_element_id": result.element_id, # Highlight target in red
|
|
959
|
+
},
|
|
960
|
+
step_id=step_id,
|
|
961
|
+
)
|
|
962
|
+
|
|
963
|
+
# 5. RECORD: Track history
|
|
964
|
+
self.history.append(
|
|
965
|
+
{
|
|
966
|
+
"goal": goal,
|
|
967
|
+
"action": action_str,
|
|
968
|
+
"result": result.model_dump(), # Store as dict
|
|
969
|
+
"success": result.success,
|
|
970
|
+
"attempt": attempt,
|
|
971
|
+
"duration_ms": duration_ms,
|
|
972
|
+
}
|
|
973
|
+
)
|
|
974
|
+
|
|
975
|
+
if self.verbose:
|
|
976
|
+
status = "✅" if result.success else "❌"
|
|
977
|
+
print(f"{status} Completed in {duration_ms}ms")
|
|
978
|
+
|
|
979
|
+
# Emit step completion trace event if tracer is enabled
|
|
980
|
+
if self.tracer:
|
|
981
|
+
# Get pre_url from step_start (stored in tracer or use current)
|
|
982
|
+
pre_url = snap.url
|
|
983
|
+
post_url = self.browser.page.url if self.browser.page else None
|
|
984
|
+
|
|
985
|
+
# Compute snapshot digest (simplified - use URL + timestamp)
|
|
986
|
+
snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"
|
|
987
|
+
|
|
988
|
+
# Build LLM data
|
|
989
|
+
llm_response_text = llm_response.content
|
|
990
|
+
llm_response_hash = f"sha256:{self._compute_hash(llm_response_text)}"
|
|
991
|
+
llm_data = {
|
|
992
|
+
"response_text": llm_response_text,
|
|
993
|
+
"response_hash": llm_response_hash,
|
|
994
|
+
"usage": {
|
|
995
|
+
"prompt_tokens": llm_response.prompt_tokens or 0,
|
|
996
|
+
"completion_tokens": llm_response.completion_tokens or 0,
|
|
997
|
+
"total_tokens": llm_response.total_tokens or 0,
|
|
998
|
+
},
|
|
999
|
+
}
|
|
1000
|
+
|
|
1001
|
+
# Build exec data
|
|
1002
|
+
exec_data = {
|
|
1003
|
+
"success": result.success,
|
|
1004
|
+
"action": result.action,
|
|
1005
|
+
"outcome": result.outcome
|
|
1006
|
+
or (
|
|
1007
|
+
f"Action {result.action} executed successfully"
|
|
1008
|
+
if result.success
|
|
1009
|
+
else f"Action {result.action} failed"
|
|
1010
|
+
),
|
|
1011
|
+
"duration_ms": duration_ms,
|
|
1012
|
+
}
|
|
1013
|
+
|
|
1014
|
+
# Add optional exec fields
|
|
1015
|
+
if result.element_id is not None:
|
|
1016
|
+
exec_data["element_id"] = result.element_id
|
|
1017
|
+
# Add bounding box if element found
|
|
1018
|
+
bbox = self._get_element_bbox(result.element_id, snap)
|
|
1019
|
+
if bbox:
|
|
1020
|
+
exec_data["bounding_box"] = bbox
|
|
1021
|
+
if result.text is not None:
|
|
1022
|
+
exec_data["text"] = result.text
|
|
1023
|
+
if result.key is not None:
|
|
1024
|
+
exec_data["key"] = result.key
|
|
1025
|
+
if result.error is not None:
|
|
1026
|
+
exec_data["error"] = result.error
|
|
1027
|
+
|
|
1028
|
+
# Build verify data (simplified - based on success and url_changed)
|
|
1029
|
+
verify_passed = result.success and (
|
|
1030
|
+
result.url_changed or result.action != "click"
|
|
1031
|
+
)
|
|
1032
|
+
verify_signals = {
|
|
1033
|
+
"url_changed": result.url_changed or False,
|
|
1034
|
+
}
|
|
1035
|
+
if result.error:
|
|
1036
|
+
verify_signals["error"] = result.error
|
|
1037
|
+
|
|
1038
|
+
# Add elements_found array if element was targeted
|
|
1039
|
+
if result.element_id is not None:
|
|
1040
|
+
bbox = self._get_element_bbox(result.element_id, snap)
|
|
1041
|
+
if bbox:
|
|
1042
|
+
verify_signals["elements_found"] = [
|
|
1043
|
+
{
|
|
1044
|
+
"label": f"Element {result.element_id}",
|
|
1045
|
+
"bounding_box": bbox,
|
|
1046
|
+
}
|
|
1047
|
+
]
|
|
1048
|
+
|
|
1049
|
+
verify_data = {
|
|
1050
|
+
"passed": verify_passed,
|
|
1051
|
+
"signals": verify_signals,
|
|
1052
|
+
}
|
|
1053
|
+
|
|
1054
|
+
# Build elements data for pre field (include diff_status from snap_with_diff)
|
|
1055
|
+
# Use the same format as build_snapshot_event for consistency
|
|
1056
|
+
snapshot_event_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
|
|
1057
|
+
pre_elements = snapshot_event_data.get("elements", [])
|
|
1058
|
+
|
|
1059
|
+
# Build complete step_end event
|
|
1060
|
+
step_end_data = TraceEventBuilder.build_step_end_event(
|
|
1061
|
+
step_id=step_id,
|
|
1062
|
+
step_index=self._step_count,
|
|
1063
|
+
goal=goal,
|
|
1064
|
+
attempt=attempt,
|
|
1065
|
+
pre_url=pre_url,
|
|
1066
|
+
post_url=post_url,
|
|
1067
|
+
snapshot_digest=snapshot_digest,
|
|
1068
|
+
llm_data=llm_data,
|
|
1069
|
+
exec_data=exec_data,
|
|
1070
|
+
verify_data=verify_data,
|
|
1071
|
+
pre_elements=pre_elements,
|
|
1072
|
+
)
|
|
1073
|
+
|
|
1074
|
+
_safe_tracer_call(
|
|
1075
|
+
self.tracer,
|
|
1076
|
+
"emit",
|
|
1077
|
+
self.verbose,
|
|
1078
|
+
"step_end",
|
|
1079
|
+
step_end_data,
|
|
1080
|
+
step_id=step_id,
|
|
1081
|
+
)
|
|
1082
|
+
|
|
1083
|
+
return result
|
|
1084
|
+
|
|
1085
|
+
except Exception as e:
|
|
1086
|
+
# Emit error trace event if tracer is enabled
|
|
1087
|
+
if self.tracer:
|
|
1088
|
+
_safe_tracer_call(
|
|
1089
|
+
self.tracer,
|
|
1090
|
+
"emit_error",
|
|
1091
|
+
self.verbose,
|
|
1092
|
+
step_id=step_id,
|
|
1093
|
+
error=str(e),
|
|
1094
|
+
attempt=attempt,
|
|
1095
|
+
)
|
|
1096
|
+
|
|
1097
|
+
if attempt < max_retries:
|
|
1098
|
+
if self.verbose:
|
|
1099
|
+
print(f"⚠️ Retry {attempt + 1}/{max_retries}: {e}")
|
|
1100
|
+
await asyncio.sleep(1.0) # Brief delay before retry
|
|
1101
|
+
continue
|
|
1102
|
+
else:
|
|
1103
|
+
# Create error result
|
|
1104
|
+
error_result = AgentActionResult(
|
|
1105
|
+
success=False,
|
|
1106
|
+
action="error",
|
|
1107
|
+
goal=goal,
|
|
1108
|
+
duration_ms=0,
|
|
1109
|
+
attempt=attempt,
|
|
1110
|
+
error=str(e),
|
|
1111
|
+
)
|
|
1112
|
+
self.history.append(
|
|
1113
|
+
{
|
|
1114
|
+
"goal": goal,
|
|
1115
|
+
"action": "error",
|
|
1116
|
+
"result": error_result.model_dump(),
|
|
1117
|
+
"success": False,
|
|
1118
|
+
"attempt": attempt,
|
|
1119
|
+
"duration_ms": 0,
|
|
1120
|
+
}
|
|
1121
|
+
)
|
|
1122
|
+
raise RuntimeError(f"Failed after {max_retries} retries: {e}")
|
|
1123
|
+
|
|
1124
|
+
def _track_tokens(self, goal: str, llm_response: LLMResponse):
|
|
1125
|
+
"""Track token usage for analytics (same as sync version)"""
|
|
538
1126
|
if llm_response.prompt_tokens:
|
|
539
1127
|
self._token_usage_raw["total_prompt_tokens"] += llm_response.prompt_tokens
|
|
540
1128
|
if llm_response.completion_tokens:
|
|
@@ -553,12 +1141,7 @@ INCORRECT Examples (DO NOT DO THIS):
|
|
|
553
1141
|
)
|
|
554
1142
|
|
|
555
1143
|
def get_token_stats(self) -> TokenStats:
|
|
556
|
-
"""
|
|
557
|
-
Get token usage statistics
|
|
558
|
-
|
|
559
|
-
Returns:
|
|
560
|
-
TokenStats with token usage breakdown
|
|
561
|
-
"""
|
|
1144
|
+
"""Get token usage statistics (same as sync version)"""
|
|
562
1145
|
by_action = [ActionTokenUsage(**action) for action in self._token_usage_raw["by_action"]]
|
|
563
1146
|
return TokenStats(
|
|
564
1147
|
total_prompt_tokens=self._token_usage_raw["total_prompt_tokens"],
|
|
@@ -568,16 +1151,11 @@ INCORRECT Examples (DO NOT DO THIS):
|
|
|
568
1151
|
)
|
|
569
1152
|
|
|
570
1153
|
def get_history(self) -> list[ActionHistory]:
|
|
571
|
-
"""
|
|
572
|
-
Get execution history
|
|
573
|
-
|
|
574
|
-
Returns:
|
|
575
|
-
List of ActionHistory entries
|
|
576
|
-
"""
|
|
1154
|
+
"""Get execution history (same as sync version)"""
|
|
577
1155
|
return [ActionHistory(**h) for h in self.history]
|
|
578
1156
|
|
|
579
1157
|
def clear_history(self) -> None:
|
|
580
|
-
"""Clear execution history and reset token counters"""
|
|
1158
|
+
"""Clear execution history and reset token counters (same as sync version)"""
|
|
581
1159
|
self.history.clear()
|
|
582
1160
|
self._token_usage_raw = {
|
|
583
1161
|
"total_prompt_tokens": 0,
|
|
@@ -590,8 +1168,8 @@ INCORRECT Examples (DO NOT DO THIS):
|
|
|
590
1168
|
"""
|
|
591
1169
|
Filter elements from snapshot based on goal context.
|
|
592
1170
|
|
|
593
|
-
This
|
|
594
|
-
relevant elements and filters out irrelevant ones.
|
|
1171
|
+
This implementation uses ElementFilter to apply goal-based keyword matching
|
|
1172
|
+
to boost relevant elements and filters out irrelevant ones.
|
|
595
1173
|
|
|
596
1174
|
Args:
|
|
597
1175
|
snapshot: Current page snapshot
|
|
@@ -600,73 +1178,4 @@ INCORRECT Examples (DO NOT DO THIS):
|
|
|
600
1178
|
Returns:
|
|
601
1179
|
Filtered list of elements
|
|
602
1180
|
"""
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
# If no goal provided, return all elements (up to limit)
|
|
606
|
-
if not goal:
|
|
607
|
-
return elements[: self.default_snapshot_limit]
|
|
608
|
-
|
|
609
|
-
goal_lower = goal.lower()
|
|
610
|
-
|
|
611
|
-
# Extract keywords from goal
|
|
612
|
-
keywords = self._extract_keywords(goal_lower)
|
|
613
|
-
|
|
614
|
-
# Boost elements matching goal keywords
|
|
615
|
-
scored_elements = []
|
|
616
|
-
for el in elements:
|
|
617
|
-
score = el.importance
|
|
618
|
-
|
|
619
|
-
# Boost if element text matches goal
|
|
620
|
-
if el.text and any(kw in el.text.lower() for kw in keywords):
|
|
621
|
-
score += 0.3
|
|
622
|
-
|
|
623
|
-
# Boost if role matches goal intent
|
|
624
|
-
if "click" in goal_lower and el.visual_cues.is_clickable:
|
|
625
|
-
score += 0.2
|
|
626
|
-
if "type" in goal_lower and el.role in ["textbox", "searchbox"]:
|
|
627
|
-
score += 0.2
|
|
628
|
-
if "search" in goal_lower:
|
|
629
|
-
# Filter out non-interactive elements for search tasks
|
|
630
|
-
if el.role in ["link", "img"] and not el.visual_cues.is_primary:
|
|
631
|
-
score -= 0.5
|
|
632
|
-
|
|
633
|
-
scored_elements.append((score, el))
|
|
634
|
-
|
|
635
|
-
# Re-sort by boosted score
|
|
636
|
-
scored_elements.sort(key=lambda x: x[0], reverse=True)
|
|
637
|
-
elements = [el for _, el in scored_elements]
|
|
638
|
-
|
|
639
|
-
return elements[: self.default_snapshot_limit]
|
|
640
|
-
|
|
641
|
-
def _extract_keywords(self, text: str) -> list[str]:
|
|
642
|
-
"""
|
|
643
|
-
Extract meaningful keywords from goal text
|
|
644
|
-
|
|
645
|
-
Args:
|
|
646
|
-
text: Text to extract keywords from
|
|
647
|
-
|
|
648
|
-
Returns:
|
|
649
|
-
List of keywords
|
|
650
|
-
"""
|
|
651
|
-
stopwords = {
|
|
652
|
-
"the",
|
|
653
|
-
"a",
|
|
654
|
-
"an",
|
|
655
|
-
"and",
|
|
656
|
-
"or",
|
|
657
|
-
"but",
|
|
658
|
-
"in",
|
|
659
|
-
"on",
|
|
660
|
-
"at",
|
|
661
|
-
"to",
|
|
662
|
-
"for",
|
|
663
|
-
"of",
|
|
664
|
-
"with",
|
|
665
|
-
"by",
|
|
666
|
-
"from",
|
|
667
|
-
"as",
|
|
668
|
-
"is",
|
|
669
|
-
"was",
|
|
670
|
-
}
|
|
671
|
-
words = text.split()
|
|
672
|
-
return [w for w in words if w not in stopwords and len(w) > 2]
|
|
1181
|
+
return ElementFilter.filter_by_goal(snapshot, goal, self.default_snapshot_limit)
|