sentienceapi 0.90.12__py3-none-any.whl → 0.92.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sentienceapi might be problematic. Click here for more details.
- sentience/__init__.py +14 -5
- sentience/_extension_loader.py +40 -0
- sentience/action_executor.py +215 -0
- sentience/actions.py +408 -25
- sentience/agent.py +804 -310
- sentience/agent_config.py +3 -0
- sentience/async_api.py +101 -0
- sentience/base_agent.py +95 -0
- sentience/browser.py +594 -25
- sentience/browser_evaluator.py +299 -0
- sentience/cloud_tracing.py +458 -36
- sentience/conversational_agent.py +79 -45
- sentience/element_filter.py +136 -0
- sentience/expect.py +98 -2
- sentience/extension/background.js +56 -185
- sentience/extension/content.js +117 -289
- sentience/extension/injected_api.js +799 -1374
- sentience/extension/manifest.json +1 -1
- sentience/extension/pkg/sentience_core.js +190 -396
- sentience/extension/pkg/sentience_core_bg.wasm +0 -0
- sentience/extension/release.json +47 -47
- sentience/formatting.py +9 -53
- sentience/inspector.py +183 -1
- sentience/llm_interaction_handler.py +191 -0
- sentience/llm_provider.py +256 -28
- sentience/llm_provider_utils.py +120 -0
- sentience/llm_response_builder.py +153 -0
- sentience/models.py +66 -1
- sentience/overlay.py +109 -2
- sentience/protocols.py +228 -0
- sentience/query.py +1 -1
- sentience/read.py +95 -3
- sentience/recorder.py +223 -3
- sentience/schemas/trace_v1.json +102 -9
- sentience/screenshot.py +48 -2
- sentience/sentience_methods.py +86 -0
- sentience/snapshot.py +309 -64
- sentience/snapshot_diff.py +141 -0
- sentience/text_search.py +119 -5
- sentience/trace_event_builder.py +129 -0
- sentience/trace_file_manager.py +197 -0
- sentience/trace_indexing/index_schema.py +95 -7
- sentience/trace_indexing/indexer.py +117 -14
- sentience/tracer_factory.py +119 -6
- sentience/tracing.py +172 -8
- sentience/utils/__init__.py +40 -0
- sentience/utils/browser.py +46 -0
- sentience/utils/element.py +257 -0
- sentience/utils/formatting.py +59 -0
- sentience/utils.py +1 -1
- sentience/visual_agent.py +2056 -0
- sentience/wait.py +70 -4
- {sentienceapi-0.90.12.dist-info → sentienceapi-0.92.2.dist-info}/METADATA +61 -22
- sentienceapi-0.92.2.dist-info/RECORD +65 -0
- sentienceapi-0.92.2.dist-info/licenses/LICENSE +24 -0
- sentienceapi-0.92.2.dist-info/licenses/LICENSE-APACHE +201 -0
- sentienceapi-0.92.2.dist-info/licenses/LICENSE-MIT +21 -0
- sentience/extension/test-content.js +0 -4
- sentienceapi-0.90.12.dist-info/RECORD +0 -46
- sentienceapi-0.90.12.dist-info/licenses/LICENSE.md +0 -43
- {sentienceapi-0.90.12.dist-info → sentienceapi-0.92.2.dist-info}/WHEEL +0 -0
- {sentienceapi-0.90.12.dist-info → sentienceapi-0.92.2.dist-info}/entry_points.txt +0 -0
- {sentienceapi-0.90.12.dist-info → sentienceapi-0.92.2.dist-info}/top_level.txt +0 -0
sentience/agent.py
CHANGED
|
@@ -3,13 +3,17 @@ Sentience Agent: High-level automation agent using LLM + SDK
|
|
|
3
3
|
Implements observe-think-act loop for natural language commands
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
|
-
import
|
|
6
|
+
import asyncio
|
|
7
|
+
import hashlib
|
|
7
8
|
import time
|
|
8
|
-
from typing import TYPE_CHECKING, Any, Optional
|
|
9
|
-
|
|
10
|
-
from .
|
|
11
|
-
from .
|
|
12
|
-
from .
|
|
9
|
+
from typing import TYPE_CHECKING, Any, Optional, Union
|
|
10
|
+
|
|
11
|
+
from .action_executor import ActionExecutor
|
|
12
|
+
from .agent_config import AgentConfig
|
|
13
|
+
from .base_agent import BaseAgent, BaseAgentAsync
|
|
14
|
+
from .browser import AsyncSentienceBrowser, SentienceBrowser
|
|
15
|
+
from .element_filter import ElementFilter
|
|
16
|
+
from .llm_interaction_handler import LLMInteractionHandler
|
|
13
17
|
from .llm_provider import LLMProvider, LLMResponse
|
|
14
18
|
from .models import (
|
|
15
19
|
ActionHistory,
|
|
@@ -21,13 +25,46 @@ from .models import (
|
|
|
21
25
|
SnapshotOptions,
|
|
22
26
|
TokenStats,
|
|
23
27
|
)
|
|
24
|
-
from .
|
|
28
|
+
from .protocols import AsyncBrowserProtocol, BrowserProtocol
|
|
29
|
+
from .snapshot import snapshot, snapshot_async
|
|
30
|
+
from .snapshot_diff import SnapshotDiff
|
|
31
|
+
from .trace_event_builder import TraceEventBuilder
|
|
25
32
|
|
|
26
33
|
if TYPE_CHECKING:
|
|
27
|
-
from .agent_config import AgentConfig
|
|
28
34
|
from .tracing import Tracer
|
|
29
35
|
|
|
30
36
|
|
|
37
|
+
def _safe_tracer_call(
|
|
38
|
+
tracer: Optional["Tracer"], method_name: str, verbose: bool, *args, **kwargs
|
|
39
|
+
) -> None:
|
|
40
|
+
"""
|
|
41
|
+
Safely call tracer method, catching and logging errors without breaking execution.
|
|
42
|
+
|
|
43
|
+
Args:
|
|
44
|
+
tracer: Tracer instance or None
|
|
45
|
+
method_name: Name of tracer method to call (e.g., "emit", "emit_error")
|
|
46
|
+
verbose: Whether to print error messages
|
|
47
|
+
*args: Positional arguments for the tracer method
|
|
48
|
+
**kwargs: Keyword arguments for the tracer method
|
|
49
|
+
"""
|
|
50
|
+
if not tracer:
|
|
51
|
+
return
|
|
52
|
+
try:
|
|
53
|
+
method = getattr(tracer, method_name)
|
|
54
|
+
if args and kwargs:
|
|
55
|
+
method(*args, **kwargs)
|
|
56
|
+
elif args:
|
|
57
|
+
method(*args)
|
|
58
|
+
elif kwargs:
|
|
59
|
+
method(**kwargs)
|
|
60
|
+
else:
|
|
61
|
+
method()
|
|
62
|
+
except Exception as tracer_error:
|
|
63
|
+
# Tracer errors should not break agent execution
|
|
64
|
+
if verbose:
|
|
65
|
+
print(f"⚠️ Tracer error (non-fatal): {tracer_error}")
|
|
66
|
+
|
|
67
|
+
|
|
31
68
|
class SentienceAgent(BaseAgent):
|
|
32
69
|
"""
|
|
33
70
|
High-level agent that combines Sentience SDK with any LLM provider.
|
|
@@ -54,7 +91,7 @@ class SentienceAgent(BaseAgent):
|
|
|
54
91
|
|
|
55
92
|
def __init__(
|
|
56
93
|
self,
|
|
57
|
-
browser: SentienceBrowser,
|
|
94
|
+
browser: SentienceBrowser | BrowserProtocol,
|
|
58
95
|
llm: LLMProvider,
|
|
59
96
|
default_snapshot_limit: int = 50,
|
|
60
97
|
verbose: bool = True,
|
|
@@ -65,7 +102,8 @@ class SentienceAgent(BaseAgent):
|
|
|
65
102
|
Initialize Sentience Agent
|
|
66
103
|
|
|
67
104
|
Args:
|
|
68
|
-
browser: SentienceBrowser instance
|
|
105
|
+
browser: SentienceBrowser instance or BrowserProtocol-compatible object
|
|
106
|
+
(for testing, can use mock objects that implement BrowserProtocol)
|
|
69
107
|
llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.)
|
|
70
108
|
default_snapshot_limit: Default maximum elements to include in context (default: 50)
|
|
71
109
|
verbose: Print execution logs (default: True)
|
|
@@ -77,8 +115,13 @@ class SentienceAgent(BaseAgent):
|
|
|
77
115
|
self.default_snapshot_limit = default_snapshot_limit
|
|
78
116
|
self.verbose = verbose
|
|
79
117
|
self.tracer = tracer
|
|
80
|
-
self.config = config
|
|
118
|
+
self.config = config or AgentConfig()
|
|
119
|
+
|
|
120
|
+
# Initialize handlers
|
|
121
|
+
self.llm_handler = LLMInteractionHandler(llm)
|
|
122
|
+
self.action_executor = ActionExecutor(browser)
|
|
81
123
|
|
|
124
|
+
# Screenshot sequence counter
|
|
82
125
|
# Execution history
|
|
83
126
|
self.history: list[dict[str, Any]] = []
|
|
84
127
|
|
|
@@ -93,6 +136,27 @@ class SentienceAgent(BaseAgent):
|
|
|
93
136
|
# Step counter for tracing
|
|
94
137
|
self._step_count = 0
|
|
95
138
|
|
|
139
|
+
# Previous snapshot for diff detection
|
|
140
|
+
self._previous_snapshot: Snapshot | None = None
|
|
141
|
+
|
|
142
|
+
def _compute_hash(self, text: str) -> str:
|
|
143
|
+
"""Compute SHA256 hash of text."""
|
|
144
|
+
return hashlib.sha256(text.encode("utf-8")).hexdigest()
|
|
145
|
+
|
|
146
|
+
def _get_element_bbox(self, element_id: int | None, snap: Snapshot) -> dict[str, float] | None:
|
|
147
|
+
"""Get bounding box for an element from snapshot."""
|
|
148
|
+
if element_id is None:
|
|
149
|
+
return None
|
|
150
|
+
for el in snap.elements:
|
|
151
|
+
if el.id == element_id:
|
|
152
|
+
return {
|
|
153
|
+
"x": el.bbox.x,
|
|
154
|
+
"y": el.bbox.y,
|
|
155
|
+
"width": el.bbox.width,
|
|
156
|
+
"height": el.bbox.height,
|
|
157
|
+
}
|
|
158
|
+
return None
|
|
159
|
+
|
|
96
160
|
def act( # noqa: C901
|
|
97
161
|
self,
|
|
98
162
|
goal: str,
|
|
@@ -130,7 +194,10 @@ class SentienceAgent(BaseAgent):
|
|
|
130
194
|
# Emit step_start trace event if tracer is enabled
|
|
131
195
|
if self.tracer:
|
|
132
196
|
pre_url = self.browser.page.url if self.browser.page else None
|
|
133
|
-
|
|
197
|
+
_safe_tracer_call(
|
|
198
|
+
self.tracer,
|
|
199
|
+
"emit_step_start",
|
|
200
|
+
self.verbose,
|
|
134
201
|
step_id=step_id,
|
|
135
202
|
step_index=self._step_count,
|
|
136
203
|
goal=goal,
|
|
@@ -149,81 +216,107 @@ class SentienceAgent(BaseAgent):
|
|
|
149
216
|
if snap_opts.goal is None:
|
|
150
217
|
snap_opts.goal = goal
|
|
151
218
|
|
|
152
|
-
#
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
219
|
+
# Apply AgentConfig screenshot settings if not overridden by snapshot_options
|
|
220
|
+
if snapshot_options is None and self.config:
|
|
221
|
+
if self.config.capture_screenshots:
|
|
222
|
+
# Create ScreenshotConfig from AgentConfig
|
|
223
|
+
snap_opts.screenshot = ScreenshotConfig(
|
|
224
|
+
format=self.config.screenshot_format,
|
|
225
|
+
quality=(
|
|
226
|
+
self.config.screenshot_quality
|
|
227
|
+
if self.config.screenshot_format == "jpeg"
|
|
228
|
+
else None
|
|
229
|
+
),
|
|
230
|
+
)
|
|
231
|
+
else:
|
|
232
|
+
snap_opts.screenshot = False
|
|
233
|
+
# Apply show_overlay from AgentConfig
|
|
234
|
+
snap_opts.show_overlay = self.config.show_overlay
|
|
235
|
+
|
|
236
|
+
# Call snapshot with options object (matches TypeScript API)
|
|
237
|
+
snap = snapshot(self.browser, snap_opts)
|
|
169
238
|
|
|
170
239
|
if snap.status != "success":
|
|
171
240
|
raise RuntimeError(f"Snapshot failed: {snap.error}")
|
|
172
241
|
|
|
242
|
+
# Compute diff_status by comparing with previous snapshot
|
|
243
|
+
elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)
|
|
244
|
+
|
|
245
|
+
# Create snapshot with diff_status populated
|
|
246
|
+
snap_with_diff = Snapshot(
|
|
247
|
+
status=snap.status,
|
|
248
|
+
timestamp=snap.timestamp,
|
|
249
|
+
url=snap.url,
|
|
250
|
+
viewport=snap.viewport,
|
|
251
|
+
elements=elements_with_diff,
|
|
252
|
+
screenshot=snap.screenshot,
|
|
253
|
+
screenshot_format=snap.screenshot_format,
|
|
254
|
+
error=snap.error,
|
|
255
|
+
)
|
|
256
|
+
|
|
257
|
+
# Update previous snapshot for next comparison
|
|
258
|
+
self._previous_snapshot = snap
|
|
259
|
+
|
|
173
260
|
# Apply element filtering based on goal
|
|
174
|
-
filtered_elements = self.filter_elements(
|
|
261
|
+
filtered_elements = self.filter_elements(snap_with_diff, goal)
|
|
175
262
|
|
|
176
263
|
# Emit snapshot trace event if tracer is enabled
|
|
177
264
|
if self.tracer:
|
|
178
|
-
#
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
265
|
+
# Build snapshot event data (use snap_with_diff to include diff_status)
|
|
266
|
+
snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
|
|
267
|
+
|
|
268
|
+
# Always include screenshot in trace event for studio viewer compatibility
|
|
269
|
+
# CloudTraceSink will extract and upload screenshots separately, then remove
|
|
270
|
+
# screenshot_base64 from events before uploading the trace file.
|
|
271
|
+
if snap.screenshot:
|
|
272
|
+
# Extract base64 string from data URL if needed
|
|
273
|
+
if snap.screenshot.startswith("data:image"):
|
|
274
|
+
# Format: "data:image/jpeg;base64,{base64_string}"
|
|
275
|
+
screenshot_base64 = (
|
|
276
|
+
snap.screenshot.split(",", 1)[1]
|
|
277
|
+
if "," in snap.screenshot
|
|
278
|
+
else snap.screenshot
|
|
279
|
+
)
|
|
280
|
+
else:
|
|
281
|
+
screenshot_base64 = snap.screenshot
|
|
282
|
+
|
|
283
|
+
snapshot_data["screenshot_base64"] = screenshot_base64
|
|
284
|
+
if snap.screenshot_format:
|
|
285
|
+
snapshot_data["screenshot_format"] = snap.screenshot_format
|
|
286
|
+
|
|
287
|
+
_safe_tracer_call(
|
|
288
|
+
self.tracer,
|
|
289
|
+
"emit",
|
|
290
|
+
self.verbose,
|
|
196
291
|
"snapshot",
|
|
197
|
-
|
|
198
|
-
"url": snap.url,
|
|
199
|
-
"element_count": len(snap.elements),
|
|
200
|
-
"timestamp": snap.timestamp,
|
|
201
|
-
"elements": elements_data, # Add element data for overlay
|
|
202
|
-
},
|
|
292
|
+
snapshot_data,
|
|
203
293
|
step_id=step_id,
|
|
204
294
|
)
|
|
205
295
|
|
|
206
|
-
# Create filtered snapshot
|
|
296
|
+
# Create filtered snapshot (use snap_with_diff to preserve metadata)
|
|
207
297
|
filtered_snap = Snapshot(
|
|
208
|
-
status=
|
|
209
|
-
timestamp=
|
|
210
|
-
url=
|
|
211
|
-
viewport=
|
|
298
|
+
status=snap_with_diff.status,
|
|
299
|
+
timestamp=snap_with_diff.timestamp,
|
|
300
|
+
url=snap_with_diff.url,
|
|
301
|
+
viewport=snap_with_diff.viewport,
|
|
212
302
|
elements=filtered_elements,
|
|
213
|
-
screenshot=
|
|
214
|
-
screenshot_format=
|
|
215
|
-
error=
|
|
303
|
+
screenshot=snap_with_diff.screenshot,
|
|
304
|
+
screenshot_format=snap_with_diff.screenshot_format,
|
|
305
|
+
error=snap_with_diff.error,
|
|
216
306
|
)
|
|
217
307
|
|
|
218
308
|
# 2. GROUND: Format elements for LLM context
|
|
219
|
-
context = self.
|
|
309
|
+
context = self.llm_handler.build_context(filtered_snap, goal)
|
|
220
310
|
|
|
221
311
|
# 3. THINK: Query LLM for next action
|
|
222
|
-
llm_response = self.
|
|
312
|
+
llm_response = self.llm_handler.query_llm(context, goal)
|
|
223
313
|
|
|
224
314
|
# Emit LLM query trace event if tracer is enabled
|
|
225
315
|
if self.tracer:
|
|
226
|
-
|
|
316
|
+
_safe_tracer_call(
|
|
317
|
+
self.tracer,
|
|
318
|
+
"emit",
|
|
319
|
+
self.verbose,
|
|
227
320
|
"llm_query",
|
|
228
321
|
{
|
|
229
322
|
"prompt_tokens": llm_response.prompt_tokens,
|
|
@@ -241,10 +334,10 @@ class SentienceAgent(BaseAgent):
|
|
|
241
334
|
self._track_tokens(goal, llm_response)
|
|
242
335
|
|
|
243
336
|
# Parse action from LLM response
|
|
244
|
-
action_str = self.
|
|
337
|
+
action_str = self.llm_handler.extract_action(llm_response.content)
|
|
245
338
|
|
|
246
339
|
# 4. EXECUTE: Parse and run action
|
|
247
|
-
result_dict = self.
|
|
340
|
+
result_dict = self.action_executor.execute(action_str, filtered_snap)
|
|
248
341
|
|
|
249
342
|
duration_ms = int((time.time() - start_time) * 1000)
|
|
250
343
|
|
|
@@ -284,7 +377,10 @@ class SentienceAgent(BaseAgent):
|
|
|
284
377
|
for el in filtered_snap.elements[:50]
|
|
285
378
|
]
|
|
286
379
|
|
|
287
|
-
|
|
380
|
+
_safe_tracer_call(
|
|
381
|
+
self.tracer,
|
|
382
|
+
"emit",
|
|
383
|
+
self.verbose,
|
|
288
384
|
"action",
|
|
289
385
|
{
|
|
290
386
|
"action": result.action,
|
|
@@ -317,13 +413,105 @@ class SentienceAgent(BaseAgent):
|
|
|
317
413
|
|
|
318
414
|
# Emit step completion trace event if tracer is enabled
|
|
319
415
|
if self.tracer:
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
416
|
+
# Get pre_url from step_start (stored in tracer or use current)
|
|
417
|
+
pre_url = snap.url
|
|
418
|
+
post_url = self.browser.page.url if self.browser.page else None
|
|
419
|
+
|
|
420
|
+
# Compute snapshot digest (simplified - use URL + timestamp)
|
|
421
|
+
snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"
|
|
422
|
+
|
|
423
|
+
# Build LLM data
|
|
424
|
+
llm_response_text = llm_response.content
|
|
425
|
+
llm_response_hash = f"sha256:{self._compute_hash(llm_response_text)}"
|
|
426
|
+
llm_data = {
|
|
427
|
+
"response_text": llm_response_text,
|
|
428
|
+
"response_hash": llm_response_hash,
|
|
429
|
+
"usage": {
|
|
430
|
+
"prompt_tokens": llm_response.prompt_tokens or 0,
|
|
431
|
+
"completion_tokens": llm_response.completion_tokens or 0,
|
|
432
|
+
"total_tokens": llm_response.total_tokens or 0,
|
|
326
433
|
},
|
|
434
|
+
}
|
|
435
|
+
|
|
436
|
+
# Build exec data
|
|
437
|
+
exec_data = {
|
|
438
|
+
"success": result.success,
|
|
439
|
+
"action": result.action,
|
|
440
|
+
"outcome": result.outcome
|
|
441
|
+
or (
|
|
442
|
+
f"Action {result.action} executed successfully"
|
|
443
|
+
if result.success
|
|
444
|
+
else f"Action {result.action} failed"
|
|
445
|
+
),
|
|
446
|
+
"duration_ms": duration_ms,
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
# Add optional exec fields
|
|
450
|
+
if result.element_id is not None:
|
|
451
|
+
exec_data["element_id"] = result.element_id
|
|
452
|
+
# Add bounding box if element found
|
|
453
|
+
bbox = self._get_element_bbox(result.element_id, snap)
|
|
454
|
+
if bbox:
|
|
455
|
+
exec_data["bounding_box"] = bbox
|
|
456
|
+
if result.text is not None:
|
|
457
|
+
exec_data["text"] = result.text
|
|
458
|
+
if result.key is not None:
|
|
459
|
+
exec_data["key"] = result.key
|
|
460
|
+
if result.error is not None:
|
|
461
|
+
exec_data["error"] = result.error
|
|
462
|
+
|
|
463
|
+
# Build verify data (simplified - based on success and url_changed)
|
|
464
|
+
verify_passed = result.success and (
|
|
465
|
+
result.url_changed or result.action != "click"
|
|
466
|
+
)
|
|
467
|
+
verify_signals = {
|
|
468
|
+
"url_changed": result.url_changed or False,
|
|
469
|
+
}
|
|
470
|
+
if result.error:
|
|
471
|
+
verify_signals["error"] = result.error
|
|
472
|
+
|
|
473
|
+
# Add elements_found array if element was targeted
|
|
474
|
+
if result.element_id is not None:
|
|
475
|
+
bbox = self._get_element_bbox(result.element_id, snap)
|
|
476
|
+
if bbox:
|
|
477
|
+
verify_signals["elements_found"] = [
|
|
478
|
+
{
|
|
479
|
+
"label": f"Element {result.element_id}",
|
|
480
|
+
"bounding_box": bbox,
|
|
481
|
+
}
|
|
482
|
+
]
|
|
483
|
+
|
|
484
|
+
verify_data = {
|
|
485
|
+
"passed": verify_passed,
|
|
486
|
+
"signals": verify_signals,
|
|
487
|
+
}
|
|
488
|
+
|
|
489
|
+
# Build elements data for pre field (include diff_status from snap_with_diff)
|
|
490
|
+
# Use the same format as build_snapshot_event for consistency
|
|
491
|
+
snapshot_event_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
|
|
492
|
+
pre_elements = snapshot_event_data.get("elements", [])
|
|
493
|
+
|
|
494
|
+
# Build complete step_end event
|
|
495
|
+
step_end_data = TraceEventBuilder.build_step_end_event(
|
|
496
|
+
step_id=step_id,
|
|
497
|
+
step_index=self._step_count,
|
|
498
|
+
goal=goal,
|
|
499
|
+
attempt=attempt,
|
|
500
|
+
pre_url=pre_url,
|
|
501
|
+
post_url=post_url,
|
|
502
|
+
snapshot_digest=snapshot_digest,
|
|
503
|
+
llm_data=llm_data,
|
|
504
|
+
exec_data=exec_data,
|
|
505
|
+
verify_data=verify_data,
|
|
506
|
+
pre_elements=pre_elements,
|
|
507
|
+
)
|
|
508
|
+
|
|
509
|
+
_safe_tracer_call(
|
|
510
|
+
self.tracer,
|
|
511
|
+
"emit",
|
|
512
|
+
self.verbose,
|
|
513
|
+
"step_end",
|
|
514
|
+
step_end_data,
|
|
327
515
|
step_id=step_id,
|
|
328
516
|
)
|
|
329
517
|
|
|
@@ -332,7 +520,14 @@ class SentienceAgent(BaseAgent):
|
|
|
332
520
|
except Exception as e:
|
|
333
521
|
# Emit error trace event if tracer is enabled
|
|
334
522
|
if self.tracer:
|
|
335
|
-
|
|
523
|
+
_safe_tracer_call(
|
|
524
|
+
self.tracer,
|
|
525
|
+
"emit_error",
|
|
526
|
+
self.verbose,
|
|
527
|
+
step_id=step_id,
|
|
528
|
+
error=str(e),
|
|
529
|
+
attempt=attempt,
|
|
530
|
+
)
|
|
336
531
|
|
|
337
532
|
if attempt < max_retries:
|
|
338
533
|
if self.verbose:
|
|
@@ -361,195 +556,573 @@ class SentienceAgent(BaseAgent):
|
|
|
361
556
|
)
|
|
362
557
|
raise RuntimeError(f"Failed after {max_retries} retries: {e}")
|
|
363
558
|
|
|
364
|
-
def
|
|
559
|
+
def _track_tokens(self, goal: str, llm_response: LLMResponse):
|
|
365
560
|
"""
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
Format: [ID] <role> "text" {cues} @ (x,y) (Imp:score)
|
|
561
|
+
Track token usage for analytics
|
|
369
562
|
|
|
370
563
|
Args:
|
|
371
|
-
|
|
372
|
-
|
|
564
|
+
goal: User goal
|
|
565
|
+
llm_response: LLM response with token usage
|
|
566
|
+
"""
|
|
567
|
+
if llm_response.prompt_tokens:
|
|
568
|
+
self._token_usage_raw["total_prompt_tokens"] += llm_response.prompt_tokens
|
|
569
|
+
if llm_response.completion_tokens:
|
|
570
|
+
self._token_usage_raw["total_completion_tokens"] += llm_response.completion_tokens
|
|
571
|
+
if llm_response.total_tokens:
|
|
572
|
+
self._token_usage_raw["total_tokens"] += llm_response.total_tokens
|
|
573
|
+
|
|
574
|
+
self._token_usage_raw["by_action"].append(
|
|
575
|
+
{
|
|
576
|
+
"goal": goal,
|
|
577
|
+
"prompt_tokens": llm_response.prompt_tokens or 0,
|
|
578
|
+
"completion_tokens": llm_response.completion_tokens or 0,
|
|
579
|
+
"total_tokens": llm_response.total_tokens or 0,
|
|
580
|
+
"model": llm_response.model_name,
|
|
581
|
+
}
|
|
582
|
+
)
|
|
583
|
+
|
|
584
|
+
def get_token_stats(self) -> TokenStats:
|
|
585
|
+
"""
|
|
586
|
+
Get token usage statistics
|
|
373
587
|
|
|
374
588
|
Returns:
|
|
375
|
-
|
|
589
|
+
TokenStats with token usage breakdown
|
|
376
590
|
"""
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
if el.visual_cues.is_clickable:
|
|
385
|
-
cues.append("CLICKABLE")
|
|
386
|
-
if el.visual_cues.background_color_name:
|
|
387
|
-
cues.append(f"color:{el.visual_cues.background_color_name}")
|
|
388
|
-
|
|
389
|
-
# Format element line
|
|
390
|
-
cues_str = f" {{{','.join(cues)}}}" if cues else ""
|
|
391
|
-
text_preview = (
|
|
392
|
-
(el.text[:50] + "...") if el.text and len(el.text) > 50 else (el.text or "")
|
|
393
|
-
)
|
|
591
|
+
by_action = [ActionTokenUsage(**action) for action in self._token_usage_raw["by_action"]]
|
|
592
|
+
return TokenStats(
|
|
593
|
+
total_prompt_tokens=self._token_usage_raw["total_prompt_tokens"],
|
|
594
|
+
total_completion_tokens=self._token_usage_raw["total_completion_tokens"],
|
|
595
|
+
total_tokens=self._token_usage_raw["total_tokens"],
|
|
596
|
+
by_action=by_action,
|
|
597
|
+
)
|
|
394
598
|
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
599
|
+
def get_history(self) -> list[ActionHistory]:
|
|
600
|
+
"""
|
|
601
|
+
Get execution history
|
|
602
|
+
|
|
603
|
+
Returns:
|
|
604
|
+
List of ActionHistory entries
|
|
605
|
+
"""
|
|
606
|
+
return [ActionHistory(**h) for h in self.history]
|
|
399
607
|
|
|
400
|
-
|
|
608
|
+
def clear_history(self) -> None:
|
|
609
|
+
"""Clear execution history and reset token counters"""
|
|
610
|
+
self.history.clear()
|
|
611
|
+
self._token_usage_raw = {
|
|
612
|
+
"total_prompt_tokens": 0,
|
|
613
|
+
"total_completion_tokens": 0,
|
|
614
|
+
"total_tokens": 0,
|
|
615
|
+
"by_action": [],
|
|
616
|
+
}
|
|
401
617
|
|
|
402
|
-
def
|
|
618
|
+
def filter_elements(self, snapshot: Snapshot, goal: str | None = None) -> list[Element]:
|
|
403
619
|
"""
|
|
404
|
-
|
|
405
|
-
|
|
620
|
+
Filter elements from snapshot based on goal context.
|
|
621
|
+
|
|
622
|
+
This implementation uses ElementFilter to apply goal-based keyword matching
|
|
623
|
+
to boost relevant elements and filters out irrelevant ones.
|
|
406
624
|
|
|
407
625
|
Args:
|
|
408
|
-
|
|
626
|
+
snapshot: Current page snapshot
|
|
627
|
+
goal: User's goal (can inform filtering)
|
|
409
628
|
|
|
410
629
|
Returns:
|
|
411
|
-
|
|
630
|
+
Filtered list of elements
|
|
412
631
|
"""
|
|
413
|
-
|
|
632
|
+
return ElementFilter.filter_by_goal(snapshot, goal, self.default_snapshot_limit)
|
|
414
633
|
|
|
415
|
-
# Remove markdown code blocks if present
|
|
416
|
-
response = re.sub(r"```[\w]*\n?", "", response)
|
|
417
|
-
response = response.strip()
|
|
418
634
|
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
635
|
+
class SentienceAgentAsync(BaseAgentAsync):
|
|
636
|
+
"""
|
|
637
|
+
High-level async agent that combines Sentience SDK with any LLM provider.
|
|
422
638
|
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
639
|
+
Uses observe-think-act loop to execute natural language commands:
|
|
640
|
+
1. OBSERVE: Get snapshot of current page state
|
|
641
|
+
2. THINK: Query LLM to decide next action
|
|
642
|
+
3. ACT: Execute action using SDK
|
|
426
643
|
|
|
427
|
-
|
|
428
|
-
|
|
644
|
+
Example:
|
|
645
|
+
>>> from sentience.async_api import AsyncSentienceBrowser
|
|
646
|
+
>>> from sentience.agent import SentienceAgentAsync
|
|
647
|
+
>>> from sentience.llm_provider import OpenAIProvider
|
|
648
|
+
>>>
|
|
649
|
+
>>> async with AsyncSentienceBrowser() as browser:
|
|
650
|
+
>>> await browser.goto("https://google.com")
|
|
651
|
+
>>> llm = OpenAIProvider(api_key="openai_key", model="gpt-4o")
|
|
652
|
+
>>> agent = SentienceAgentAsync(browser, llm)
|
|
653
|
+
>>> await agent.act("Click the search box")
|
|
654
|
+
>>> await agent.act("Type 'magic mouse' into the search field")
|
|
655
|
+
>>> await agent.act("Press Enter key")
|
|
656
|
+
"""
|
|
429
657
|
|
|
430
|
-
def
|
|
658
|
+
def __init__(
|
|
659
|
+
self,
|
|
660
|
+
browser: AsyncSentienceBrowser,
|
|
661
|
+
llm: LLMProvider,
|
|
662
|
+
default_snapshot_limit: int = 50,
|
|
663
|
+
verbose: bool = True,
|
|
664
|
+
tracer: Optional["Tracer"] = None,
|
|
665
|
+
config: Optional["AgentConfig"] = None,
|
|
666
|
+
):
|
|
431
667
|
"""
|
|
432
|
-
|
|
668
|
+
Initialize Sentience Agent (async)
|
|
433
669
|
|
|
434
670
|
Args:
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
671
|
+
browser: AsyncSentienceBrowser instance
|
|
672
|
+
llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.)
|
|
673
|
+
default_snapshot_limit: Default maximum elements to include in context (default: 50)
|
|
674
|
+
verbose: Print execution logs (default: True)
|
|
675
|
+
tracer: Optional Tracer instance for execution tracking (default: None)
|
|
676
|
+
config: Optional AgentConfig for advanced configuration (default: None)
|
|
440
677
|
"""
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
VISUAL CUES EXPLAINED:
|
|
449
|
-
- {{PRIMARY}}: Main call-to-action element on the page
|
|
450
|
-
- {{CLICKABLE}}: Element is clickable
|
|
451
|
-
- {{color:X}}: Background color name
|
|
452
|
-
|
|
453
|
-
CRITICAL RESPONSE FORMAT:
|
|
454
|
-
You MUST respond with ONLY ONE of these exact action formats:
|
|
455
|
-
- CLICK(id) - Click element by ID
|
|
456
|
-
- TYPE(id, "text") - Type text into element
|
|
457
|
-
- PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
|
|
458
|
-
- FINISH() - Task complete
|
|
459
|
-
|
|
460
|
-
DO NOT include any explanation, reasoning, or natural language.
|
|
461
|
-
DO NOT use markdown formatting or code blocks.
|
|
462
|
-
DO NOT say "The next step is..." or anything similar.
|
|
463
|
-
|
|
464
|
-
CORRECT Examples:
|
|
465
|
-
CLICK(42)
|
|
466
|
-
TYPE(15, "magic mouse")
|
|
467
|
-
PRESS("Enter")
|
|
468
|
-
FINISH()
|
|
469
|
-
|
|
470
|
-
INCORRECT Examples (DO NOT DO THIS):
|
|
471
|
-
"The next step is to click..."
|
|
472
|
-
"I will type..."
|
|
473
|
-
```CLICK(42)```
|
|
474
|
-
"""
|
|
678
|
+
self.browser = browser
|
|
679
|
+
self.llm = llm
|
|
680
|
+
self.default_snapshot_limit = default_snapshot_limit
|
|
681
|
+
self.verbose = verbose
|
|
682
|
+
self.tracer = tracer
|
|
683
|
+
self.config = config or AgentConfig()
|
|
475
684
|
|
|
476
|
-
|
|
685
|
+
# Initialize handlers
|
|
686
|
+
self.llm_handler = LLMInteractionHandler(llm)
|
|
687
|
+
self.action_executor = ActionExecutor(browser)
|
|
477
688
|
|
|
478
|
-
|
|
689
|
+
# Screenshot sequence counter
|
|
690
|
+
# Execution history
|
|
691
|
+
self.history: list[dict[str, Any]] = []
|
|
692
|
+
|
|
693
|
+
# Token usage tracking (will be converted to TokenStats on get_token_stats())
|
|
694
|
+
self._token_usage_raw = {
|
|
695
|
+
"total_prompt_tokens": 0,
|
|
696
|
+
"total_completion_tokens": 0,
|
|
697
|
+
"total_tokens": 0,
|
|
698
|
+
"by_action": [],
|
|
699
|
+
}
|
|
700
|
+
|
|
701
|
+
# Step counter for tracing
|
|
702
|
+
self._step_count = 0
|
|
703
|
+
|
|
704
|
+
# Previous snapshot for diff detection
|
|
705
|
+
self._previous_snapshot: Snapshot | None = None
|
|
479
706
|
|
|
480
|
-
def
|
|
707
|
+
def _compute_hash(self, text: str) -> str:
|
|
708
|
+
"""Compute SHA256 hash of text."""
|
|
709
|
+
return hashlib.sha256(text.encode("utf-8")).hexdigest()
|
|
710
|
+
|
|
711
|
+
def _get_element_bbox(self, element_id: int | None, snap: Snapshot) -> dict[str, float] | None:
|
|
712
|
+
"""Get bounding box for an element from snapshot."""
|
|
713
|
+
if element_id is None:
|
|
714
|
+
return None
|
|
715
|
+
for el in snap.elements:
|
|
716
|
+
if el.id == element_id:
|
|
717
|
+
return {
|
|
718
|
+
"x": el.bbox.x,
|
|
719
|
+
"y": el.bbox.y,
|
|
720
|
+
"width": el.bbox.width,
|
|
721
|
+
"height": el.bbox.height,
|
|
722
|
+
}
|
|
723
|
+
return None
|
|
724
|
+
|
|
725
|
+
async def act( # noqa: C901
|
|
726
|
+
self,
|
|
727
|
+
goal: str,
|
|
728
|
+
max_retries: int = 2,
|
|
729
|
+
snapshot_options: SnapshotOptions | None = None,
|
|
730
|
+
) -> AgentActionResult:
|
|
481
731
|
"""
|
|
482
|
-
|
|
732
|
+
Execute a high-level goal using observe → think → act loop (async)
|
|
483
733
|
|
|
484
734
|
Args:
|
|
485
|
-
|
|
486
|
-
|
|
735
|
+
goal: Natural language instruction (e.g., "Click the Sign In button")
|
|
736
|
+
max_retries: Number of retries on failure (default: 2)
|
|
737
|
+
snapshot_options: Optional SnapshotOptions for this specific action
|
|
487
738
|
|
|
488
739
|
Returns:
|
|
489
|
-
|
|
740
|
+
AgentActionResult with execution details
|
|
741
|
+
|
|
742
|
+
Example:
|
|
743
|
+
>>> result = await agent.act("Click the search box")
|
|
744
|
+
>>> print(result.success, result.action, result.element_id)
|
|
745
|
+
True click 42
|
|
490
746
|
"""
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
return {
|
|
496
|
-
"success": result.success,
|
|
497
|
-
"action": "click",
|
|
498
|
-
"element_id": element_id,
|
|
499
|
-
"outcome": result.outcome,
|
|
500
|
-
"url_changed": result.url_changed,
|
|
501
|
-
}
|
|
747
|
+
if self.verbose:
|
|
748
|
+
print(f"\n{'=' * 70}")
|
|
749
|
+
print(f"🤖 Agent Goal: {goal}")
|
|
750
|
+
print(f"{'=' * 70}")
|
|
502
751
|
|
|
503
|
-
#
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
action_str,
|
|
507
|
-
re.IGNORECASE,
|
|
508
|
-
):
|
|
509
|
-
element_id = int(match.group(1))
|
|
510
|
-
text = match.group(2)
|
|
511
|
-
result = type_text(self.browser, element_id, text)
|
|
512
|
-
return {
|
|
513
|
-
"success": result.success,
|
|
514
|
-
"action": "type",
|
|
515
|
-
"element_id": element_id,
|
|
516
|
-
"text": text,
|
|
517
|
-
"outcome": result.outcome,
|
|
518
|
-
}
|
|
752
|
+
# Generate step ID for tracing
|
|
753
|
+
self._step_count += 1
|
|
754
|
+
step_id = f"step-{self._step_count}"
|
|
519
755
|
|
|
520
|
-
#
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
"
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
756
|
+
# Emit step_start trace event if tracer is enabled
|
|
757
|
+
if self.tracer:
|
|
758
|
+
pre_url = self.browser.page.url if self.browser.page else None
|
|
759
|
+
_safe_tracer_call(
|
|
760
|
+
self.tracer,
|
|
761
|
+
"emit_step_start",
|
|
762
|
+
self.verbose,
|
|
763
|
+
step_id=step_id,
|
|
764
|
+
step_index=self._step_count,
|
|
765
|
+
goal=goal,
|
|
766
|
+
attempt=0,
|
|
767
|
+
pre_url=pre_url,
|
|
768
|
+
)
|
|
530
769
|
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
"action": "finish",
|
|
536
|
-
"message": "Task marked as complete",
|
|
537
|
-
}
|
|
770
|
+
for attempt in range(max_retries + 1):
|
|
771
|
+
try:
|
|
772
|
+
# 1. OBSERVE: Get refined semantic snapshot
|
|
773
|
+
start_time = time.time()
|
|
538
774
|
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
775
|
+
# Use provided options or create default
|
|
776
|
+
snap_opts = snapshot_options or SnapshotOptions(limit=self.default_snapshot_limit)
|
|
777
|
+
# Only set goal if not already provided
|
|
778
|
+
if snap_opts.goal is None:
|
|
779
|
+
snap_opts.goal = goal
|
|
544
780
|
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
781
|
+
# Apply AgentConfig screenshot settings if not overridden by snapshot_options
|
|
782
|
+
# Only apply if snapshot_options wasn't provided OR if screenshot wasn't explicitly set
|
|
783
|
+
# (snapshot_options.screenshot defaults to False, so we check if it's still False)
|
|
784
|
+
if self.config and (snapshot_options is None or snap_opts.screenshot is False):
|
|
785
|
+
if self.config.capture_screenshots:
|
|
786
|
+
# Create ScreenshotConfig from AgentConfig
|
|
787
|
+
snap_opts.screenshot = ScreenshotConfig(
|
|
788
|
+
format=self.config.screenshot_format,
|
|
789
|
+
quality=(
|
|
790
|
+
self.config.screenshot_quality
|
|
791
|
+
if self.config.screenshot_format == "jpeg"
|
|
792
|
+
else None
|
|
793
|
+
),
|
|
794
|
+
)
|
|
795
|
+
else:
|
|
796
|
+
snap_opts.screenshot = False
|
|
797
|
+
# Apply show_overlay from AgentConfig
|
|
798
|
+
# Note: User can override by explicitly passing show_overlay in snapshot_options
|
|
799
|
+
snap_opts.show_overlay = self.config.show_overlay
|
|
800
|
+
|
|
801
|
+
# Call snapshot with options object (matches TypeScript API)
|
|
802
|
+
snap = await snapshot_async(self.browser, snap_opts)
|
|
548
803
|
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
804
|
+
if snap.status != "success":
|
|
805
|
+
raise RuntimeError(f"Snapshot failed: {snap.error}")
|
|
806
|
+
|
|
807
|
+
# Compute diff_status by comparing with previous snapshot
|
|
808
|
+
elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)
|
|
809
|
+
|
|
810
|
+
# Create snapshot with diff_status populated
|
|
811
|
+
snap_with_diff = Snapshot(
|
|
812
|
+
status=snap.status,
|
|
813
|
+
timestamp=snap.timestamp,
|
|
814
|
+
url=snap.url,
|
|
815
|
+
viewport=snap.viewport,
|
|
816
|
+
elements=elements_with_diff,
|
|
817
|
+
screenshot=snap.screenshot,
|
|
818
|
+
screenshot_format=snap.screenshot_format,
|
|
819
|
+
error=snap.error,
|
|
820
|
+
)
|
|
821
|
+
|
|
822
|
+
# Update previous snapshot for next comparison
|
|
823
|
+
self._previous_snapshot = snap
|
|
824
|
+
|
|
825
|
+
# Apply element filtering based on goal
|
|
826
|
+
filtered_elements = self.filter_elements(snap_with_diff, goal)
|
|
827
|
+
|
|
828
|
+
# Emit snapshot trace event if tracer is enabled
|
|
829
|
+
if self.tracer:
|
|
830
|
+
# Build snapshot event data (use snap_with_diff to include diff_status)
|
|
831
|
+
snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
|
|
832
|
+
|
|
833
|
+
# Always include screenshot in trace event for studio viewer compatibility
|
|
834
|
+
# CloudTraceSink will extract and upload screenshots separately, then remove
|
|
835
|
+
# screenshot_base64 from events before uploading the trace file.
|
|
836
|
+
if snap.screenshot:
|
|
837
|
+
# Extract base64 string from data URL if needed
|
|
838
|
+
if snap.screenshot.startswith("data:image"):
|
|
839
|
+
# Format: "data:image/jpeg;base64,{base64_string}"
|
|
840
|
+
screenshot_base64 = (
|
|
841
|
+
snap.screenshot.split(",", 1)[1]
|
|
842
|
+
if "," in snap.screenshot
|
|
843
|
+
else snap.screenshot
|
|
844
|
+
)
|
|
845
|
+
else:
|
|
846
|
+
screenshot_base64 = snap.screenshot
|
|
847
|
+
|
|
848
|
+
snapshot_data["screenshot_base64"] = screenshot_base64
|
|
849
|
+
if snap.screenshot_format:
|
|
850
|
+
snapshot_data["screenshot_format"] = snap.screenshot_format
|
|
851
|
+
|
|
852
|
+
_safe_tracer_call(
|
|
853
|
+
self.tracer,
|
|
854
|
+
"emit",
|
|
855
|
+
self.verbose,
|
|
856
|
+
"snapshot",
|
|
857
|
+
snapshot_data,
|
|
858
|
+
step_id=step_id,
|
|
859
|
+
)
|
|
860
|
+
|
|
861
|
+
# Create filtered snapshot (use snap_with_diff to preserve metadata)
|
|
862
|
+
filtered_snap = Snapshot(
|
|
863
|
+
status=snap_with_diff.status,
|
|
864
|
+
timestamp=snap_with_diff.timestamp,
|
|
865
|
+
url=snap_with_diff.url,
|
|
866
|
+
viewport=snap_with_diff.viewport,
|
|
867
|
+
elements=filtered_elements,
|
|
868
|
+
screenshot=snap_with_diff.screenshot,
|
|
869
|
+
screenshot_format=snap_with_diff.screenshot_format,
|
|
870
|
+
error=snap_with_diff.error,
|
|
871
|
+
)
|
|
872
|
+
|
|
873
|
+
# 2. GROUND: Format elements for LLM context
|
|
874
|
+
context = self.llm_handler.build_context(filtered_snap, goal)
|
|
875
|
+
|
|
876
|
+
# 3. THINK: Query LLM for next action
|
|
877
|
+
llm_response = self.llm_handler.query_llm(context, goal)
|
|
878
|
+
|
|
879
|
+
# Emit LLM query trace event if tracer is enabled
|
|
880
|
+
if self.tracer:
|
|
881
|
+
_safe_tracer_call(
|
|
882
|
+
self.tracer,
|
|
883
|
+
"emit",
|
|
884
|
+
self.verbose,
|
|
885
|
+
"llm_query",
|
|
886
|
+
{
|
|
887
|
+
"prompt_tokens": llm_response.prompt_tokens,
|
|
888
|
+
"completion_tokens": llm_response.completion_tokens,
|
|
889
|
+
"model": llm_response.model_name,
|
|
890
|
+
"response": llm_response.content[:200], # Truncate for brevity
|
|
891
|
+
},
|
|
892
|
+
step_id=step_id,
|
|
893
|
+
)
|
|
894
|
+
|
|
895
|
+
if self.verbose:
|
|
896
|
+
print(f"🧠 LLM Decision: {llm_response.content}")
|
|
897
|
+
|
|
898
|
+
# Track token usage
|
|
899
|
+
self._track_tokens(goal, llm_response)
|
|
900
|
+
|
|
901
|
+
# Parse action from LLM response
|
|
902
|
+
action_str = self.llm_handler.extract_action(llm_response.content)
|
|
903
|
+
|
|
904
|
+
# 4. EXECUTE: Parse and run action
|
|
905
|
+
result_dict = await self.action_executor.execute_async(action_str, filtered_snap)
|
|
906
|
+
|
|
907
|
+
duration_ms = int((time.time() - start_time) * 1000)
|
|
908
|
+
|
|
909
|
+
# Create AgentActionResult from execution result
|
|
910
|
+
result = AgentActionResult(
|
|
911
|
+
success=result_dict["success"],
|
|
912
|
+
action=result_dict["action"],
|
|
913
|
+
goal=goal,
|
|
914
|
+
duration_ms=duration_ms,
|
|
915
|
+
attempt=attempt,
|
|
916
|
+
element_id=result_dict.get("element_id"),
|
|
917
|
+
text=result_dict.get("text"),
|
|
918
|
+
key=result_dict.get("key"),
|
|
919
|
+
outcome=result_dict.get("outcome"),
|
|
920
|
+
url_changed=result_dict.get("url_changed"),
|
|
921
|
+
error=result_dict.get("error"),
|
|
922
|
+
message=result_dict.get("message"),
|
|
923
|
+
)
|
|
924
|
+
|
|
925
|
+
# Emit action execution trace event if tracer is enabled
|
|
926
|
+
if self.tracer:
|
|
927
|
+
post_url = self.browser.page.url if self.browser.page else None
|
|
928
|
+
|
|
929
|
+
# Include element data for live overlay visualization
|
|
930
|
+
elements_data = [
|
|
931
|
+
{
|
|
932
|
+
"id": el.id,
|
|
933
|
+
"bbox": {
|
|
934
|
+
"x": el.bbox.x,
|
|
935
|
+
"y": el.bbox.y,
|
|
936
|
+
"width": el.bbox.width,
|
|
937
|
+
"height": el.bbox.height,
|
|
938
|
+
},
|
|
939
|
+
"role": el.role,
|
|
940
|
+
"text": el.text[:50] if el.text else "",
|
|
941
|
+
}
|
|
942
|
+
for el in filtered_snap.elements[:50]
|
|
943
|
+
]
|
|
944
|
+
|
|
945
|
+
_safe_tracer_call(
|
|
946
|
+
self.tracer,
|
|
947
|
+
"emit",
|
|
948
|
+
self.verbose,
|
|
949
|
+
"action",
|
|
950
|
+
{
|
|
951
|
+
"action": result.action,
|
|
952
|
+
"element_id": result.element_id,
|
|
953
|
+
"success": result.success,
|
|
954
|
+
"outcome": result.outcome,
|
|
955
|
+
"duration_ms": duration_ms,
|
|
956
|
+
"post_url": post_url,
|
|
957
|
+
"elements": elements_data, # Add element data for overlay
|
|
958
|
+
"target_element_id": result.element_id, # Highlight target in red
|
|
959
|
+
},
|
|
960
|
+
step_id=step_id,
|
|
961
|
+
)
|
|
962
|
+
|
|
963
|
+
# 5. RECORD: Track history
|
|
964
|
+
self.history.append(
|
|
965
|
+
{
|
|
966
|
+
"goal": goal,
|
|
967
|
+
"action": action_str,
|
|
968
|
+
"result": result.model_dump(), # Store as dict
|
|
969
|
+
"success": result.success,
|
|
970
|
+
"attempt": attempt,
|
|
971
|
+
"duration_ms": duration_ms,
|
|
972
|
+
}
|
|
973
|
+
)
|
|
974
|
+
|
|
975
|
+
if self.verbose:
|
|
976
|
+
status = "✅" if result.success else "❌"
|
|
977
|
+
print(f"{status} Completed in {duration_ms}ms")
|
|
978
|
+
|
|
979
|
+
# Emit step completion trace event if tracer is enabled
|
|
980
|
+
if self.tracer:
|
|
981
|
+
# Get pre_url from step_start (stored in tracer or use current)
|
|
982
|
+
pre_url = snap.url
|
|
983
|
+
post_url = self.browser.page.url if self.browser.page else None
|
|
984
|
+
|
|
985
|
+
# Compute snapshot digest (simplified - use URL + timestamp)
|
|
986
|
+
snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"
|
|
987
|
+
|
|
988
|
+
# Build LLM data
|
|
989
|
+
llm_response_text = llm_response.content
|
|
990
|
+
llm_response_hash = f"sha256:{self._compute_hash(llm_response_text)}"
|
|
991
|
+
llm_data = {
|
|
992
|
+
"response_text": llm_response_text,
|
|
993
|
+
"response_hash": llm_response_hash,
|
|
994
|
+
"usage": {
|
|
995
|
+
"prompt_tokens": llm_response.prompt_tokens or 0,
|
|
996
|
+
"completion_tokens": llm_response.completion_tokens or 0,
|
|
997
|
+
"total_tokens": llm_response.total_tokens or 0,
|
|
998
|
+
},
|
|
999
|
+
}
|
|
1000
|
+
|
|
1001
|
+
# Build exec data
|
|
1002
|
+
exec_data = {
|
|
1003
|
+
"success": result.success,
|
|
1004
|
+
"action": result.action,
|
|
1005
|
+
"outcome": result.outcome
|
|
1006
|
+
or (
|
|
1007
|
+
f"Action {result.action} executed successfully"
|
|
1008
|
+
if result.success
|
|
1009
|
+
else f"Action {result.action} failed"
|
|
1010
|
+
),
|
|
1011
|
+
"duration_ms": duration_ms,
|
|
1012
|
+
}
|
|
1013
|
+
|
|
1014
|
+
# Add optional exec fields
|
|
1015
|
+
if result.element_id is not None:
|
|
1016
|
+
exec_data["element_id"] = result.element_id
|
|
1017
|
+
# Add bounding box if element found
|
|
1018
|
+
bbox = self._get_element_bbox(result.element_id, snap)
|
|
1019
|
+
if bbox:
|
|
1020
|
+
exec_data["bounding_box"] = bbox
|
|
1021
|
+
if result.text is not None:
|
|
1022
|
+
exec_data["text"] = result.text
|
|
1023
|
+
if result.key is not None:
|
|
1024
|
+
exec_data["key"] = result.key
|
|
1025
|
+
if result.error is not None:
|
|
1026
|
+
exec_data["error"] = result.error
|
|
1027
|
+
|
|
1028
|
+
# Build verify data (simplified - based on success and url_changed)
|
|
1029
|
+
verify_passed = result.success and (
|
|
1030
|
+
result.url_changed or result.action != "click"
|
|
1031
|
+
)
|
|
1032
|
+
verify_signals = {
|
|
1033
|
+
"url_changed": result.url_changed or False,
|
|
1034
|
+
}
|
|
1035
|
+
if result.error:
|
|
1036
|
+
verify_signals["error"] = result.error
|
|
1037
|
+
|
|
1038
|
+
# Add elements_found array if element was targeted
|
|
1039
|
+
if result.element_id is not None:
|
|
1040
|
+
bbox = self._get_element_bbox(result.element_id, snap)
|
|
1041
|
+
if bbox:
|
|
1042
|
+
verify_signals["elements_found"] = [
|
|
1043
|
+
{
|
|
1044
|
+
"label": f"Element {result.element_id}",
|
|
1045
|
+
"bounding_box": bbox,
|
|
1046
|
+
}
|
|
1047
|
+
]
|
|
1048
|
+
|
|
1049
|
+
verify_data = {
|
|
1050
|
+
"passed": verify_passed,
|
|
1051
|
+
"signals": verify_signals,
|
|
1052
|
+
}
|
|
1053
|
+
|
|
1054
|
+
# Build elements data for pre field (include diff_status from snap_with_diff)
|
|
1055
|
+
# Use the same format as build_snapshot_event for consistency
|
|
1056
|
+
snapshot_event_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
|
|
1057
|
+
pre_elements = snapshot_event_data.get("elements", [])
|
|
1058
|
+
|
|
1059
|
+
# Build complete step_end event
|
|
1060
|
+
step_end_data = TraceEventBuilder.build_step_end_event(
|
|
1061
|
+
step_id=step_id,
|
|
1062
|
+
step_index=self._step_count,
|
|
1063
|
+
goal=goal,
|
|
1064
|
+
attempt=attempt,
|
|
1065
|
+
pre_url=pre_url,
|
|
1066
|
+
post_url=post_url,
|
|
1067
|
+
snapshot_digest=snapshot_digest,
|
|
1068
|
+
llm_data=llm_data,
|
|
1069
|
+
exec_data=exec_data,
|
|
1070
|
+
verify_data=verify_data,
|
|
1071
|
+
pre_elements=pre_elements,
|
|
1072
|
+
)
|
|
1073
|
+
|
|
1074
|
+
_safe_tracer_call(
|
|
1075
|
+
self.tracer,
|
|
1076
|
+
"emit",
|
|
1077
|
+
self.verbose,
|
|
1078
|
+
"step_end",
|
|
1079
|
+
step_end_data,
|
|
1080
|
+
step_id=step_id,
|
|
1081
|
+
)
|
|
1082
|
+
|
|
1083
|
+
return result
|
|
1084
|
+
|
|
1085
|
+
except Exception as e:
|
|
1086
|
+
# Emit error trace event if tracer is enabled
|
|
1087
|
+
if self.tracer:
|
|
1088
|
+
_safe_tracer_call(
|
|
1089
|
+
self.tracer,
|
|
1090
|
+
"emit_error",
|
|
1091
|
+
self.verbose,
|
|
1092
|
+
step_id=step_id,
|
|
1093
|
+
error=str(e),
|
|
1094
|
+
attempt=attempt,
|
|
1095
|
+
)
|
|
1096
|
+
|
|
1097
|
+
if attempt < max_retries:
|
|
1098
|
+
if self.verbose:
|
|
1099
|
+
print(f"⚠️ Retry {attempt + 1}/{max_retries}: {e}")
|
|
1100
|
+
await asyncio.sleep(1.0) # Brief delay before retry
|
|
1101
|
+
continue
|
|
1102
|
+
else:
|
|
1103
|
+
# Create error result
|
|
1104
|
+
error_result = AgentActionResult(
|
|
1105
|
+
success=False,
|
|
1106
|
+
action="error",
|
|
1107
|
+
goal=goal,
|
|
1108
|
+
duration_ms=0,
|
|
1109
|
+
attempt=attempt,
|
|
1110
|
+
error=str(e),
|
|
1111
|
+
)
|
|
1112
|
+
self.history.append(
|
|
1113
|
+
{
|
|
1114
|
+
"goal": goal,
|
|
1115
|
+
"action": "error",
|
|
1116
|
+
"result": error_result.model_dump(),
|
|
1117
|
+
"success": False,
|
|
1118
|
+
"attempt": attempt,
|
|
1119
|
+
"duration_ms": 0,
|
|
1120
|
+
}
|
|
1121
|
+
)
|
|
1122
|
+
raise RuntimeError(f"Failed after {max_retries} retries: {e}")
|
|
1123
|
+
|
|
1124
|
+
def _track_tokens(self, goal: str, llm_response: LLMResponse):
|
|
1125
|
+
"""Track token usage for analytics (same as sync version)"""
|
|
553
1126
|
if llm_response.prompt_tokens:
|
|
554
1127
|
self._token_usage_raw["total_prompt_tokens"] += llm_response.prompt_tokens
|
|
555
1128
|
if llm_response.completion_tokens:
|
|
@@ -568,12 +1141,7 @@ INCORRECT Examples (DO NOT DO THIS):
|
|
|
568
1141
|
)
|
|
569
1142
|
|
|
570
1143
|
def get_token_stats(self) -> TokenStats:
|
|
571
|
-
"""
|
|
572
|
-
Get token usage statistics
|
|
573
|
-
|
|
574
|
-
Returns:
|
|
575
|
-
TokenStats with token usage breakdown
|
|
576
|
-
"""
|
|
1144
|
+
"""Get token usage statistics (same as sync version)"""
|
|
577
1145
|
by_action = [ActionTokenUsage(**action) for action in self._token_usage_raw["by_action"]]
|
|
578
1146
|
return TokenStats(
|
|
579
1147
|
total_prompt_tokens=self._token_usage_raw["total_prompt_tokens"],
|
|
@@ -583,16 +1151,11 @@ INCORRECT Examples (DO NOT DO THIS):
|
|
|
583
1151
|
)
|
|
584
1152
|
|
|
585
1153
|
def get_history(self) -> list[ActionHistory]:
|
|
586
|
-
"""
|
|
587
|
-
Get execution history
|
|
588
|
-
|
|
589
|
-
Returns:
|
|
590
|
-
List of ActionHistory entries
|
|
591
|
-
"""
|
|
1154
|
+
"""Get execution history (same as sync version)"""
|
|
592
1155
|
return [ActionHistory(**h) for h in self.history]
|
|
593
1156
|
|
|
594
1157
|
def clear_history(self) -> None:
|
|
595
|
-
"""Clear execution history and reset token counters"""
|
|
1158
|
+
"""Clear execution history and reset token counters (same as sync version)"""
|
|
596
1159
|
self.history.clear()
|
|
597
1160
|
self._token_usage_raw = {
|
|
598
1161
|
"total_prompt_tokens": 0,
|
|
@@ -605,8 +1168,8 @@ INCORRECT Examples (DO NOT DO THIS):
|
|
|
605
1168
|
"""
|
|
606
1169
|
Filter elements from snapshot based on goal context.
|
|
607
1170
|
|
|
608
|
-
This
|
|
609
|
-
relevant elements and filters out irrelevant ones.
|
|
1171
|
+
This implementation uses ElementFilter to apply goal-based keyword matching
|
|
1172
|
+
to boost relevant elements and filters out irrelevant ones.
|
|
610
1173
|
|
|
611
1174
|
Args:
|
|
612
1175
|
snapshot: Current page snapshot
|
|
@@ -615,73 +1178,4 @@ INCORRECT Examples (DO NOT DO THIS):
|
|
|
615
1178
|
Returns:
|
|
616
1179
|
Filtered list of elements
|
|
617
1180
|
"""
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
# If no goal provided, return all elements (up to limit)
|
|
621
|
-
if not goal:
|
|
622
|
-
return elements[: self.default_snapshot_limit]
|
|
623
|
-
|
|
624
|
-
goal_lower = goal.lower()
|
|
625
|
-
|
|
626
|
-
# Extract keywords from goal
|
|
627
|
-
keywords = self._extract_keywords(goal_lower)
|
|
628
|
-
|
|
629
|
-
# Boost elements matching goal keywords
|
|
630
|
-
scored_elements = []
|
|
631
|
-
for el in elements:
|
|
632
|
-
score = el.importance
|
|
633
|
-
|
|
634
|
-
# Boost if element text matches goal
|
|
635
|
-
if el.text and any(kw in el.text.lower() for kw in keywords):
|
|
636
|
-
score += 0.3
|
|
637
|
-
|
|
638
|
-
# Boost if role matches goal intent
|
|
639
|
-
if "click" in goal_lower and el.visual_cues.is_clickable:
|
|
640
|
-
score += 0.2
|
|
641
|
-
if "type" in goal_lower and el.role in ["textbox", "searchbox"]:
|
|
642
|
-
score += 0.2
|
|
643
|
-
if "search" in goal_lower:
|
|
644
|
-
# Filter out non-interactive elements for search tasks
|
|
645
|
-
if el.role in ["link", "img"] and not el.visual_cues.is_primary:
|
|
646
|
-
score -= 0.5
|
|
647
|
-
|
|
648
|
-
scored_elements.append((score, el))
|
|
649
|
-
|
|
650
|
-
# Re-sort by boosted score
|
|
651
|
-
scored_elements.sort(key=lambda x: x[0], reverse=True)
|
|
652
|
-
elements = [el for _, el in scored_elements]
|
|
653
|
-
|
|
654
|
-
return elements[: self.default_snapshot_limit]
|
|
655
|
-
|
|
656
|
-
def _extract_keywords(self, text: str) -> list[str]:
|
|
657
|
-
"""
|
|
658
|
-
Extract meaningful keywords from goal text
|
|
659
|
-
|
|
660
|
-
Args:
|
|
661
|
-
text: Text to extract keywords from
|
|
662
|
-
|
|
663
|
-
Returns:
|
|
664
|
-
List of keywords
|
|
665
|
-
"""
|
|
666
|
-
stopwords = {
|
|
667
|
-
"the",
|
|
668
|
-
"a",
|
|
669
|
-
"an",
|
|
670
|
-
"and",
|
|
671
|
-
"or",
|
|
672
|
-
"but",
|
|
673
|
-
"in",
|
|
674
|
-
"on",
|
|
675
|
-
"at",
|
|
676
|
-
"to",
|
|
677
|
-
"for",
|
|
678
|
-
"of",
|
|
679
|
-
"with",
|
|
680
|
-
"by",
|
|
681
|
-
"from",
|
|
682
|
-
"as",
|
|
683
|
-
"is",
|
|
684
|
-
"was",
|
|
685
|
-
}
|
|
686
|
-
words = text.split()
|
|
687
|
-
return [w for w in words if w not in stopwords and len(w) > 2]
|
|
1181
|
+
return ElementFilter.filter_by_goal(snapshot, goal, self.default_snapshot_limit)
|