sentienceapi-0.95.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sentienceapi might be problematic. Click here for more details.
- sentience/__init__.py +253 -0
- sentience/_extension_loader.py +195 -0
- sentience/action_executor.py +215 -0
- sentience/actions.py +1020 -0
- sentience/agent.py +1181 -0
- sentience/agent_config.py +46 -0
- sentience/agent_runtime.py +424 -0
- sentience/asserts/__init__.py +70 -0
- sentience/asserts/expect.py +621 -0
- sentience/asserts/query.py +383 -0
- sentience/async_api.py +108 -0
- sentience/backends/__init__.py +137 -0
- sentience/backends/actions.py +343 -0
- sentience/backends/browser_use_adapter.py +241 -0
- sentience/backends/cdp_backend.py +393 -0
- sentience/backends/exceptions.py +211 -0
- sentience/backends/playwright_backend.py +194 -0
- sentience/backends/protocol.py +216 -0
- sentience/backends/sentience_context.py +469 -0
- sentience/backends/snapshot.py +427 -0
- sentience/base_agent.py +196 -0
- sentience/browser.py +1215 -0
- sentience/browser_evaluator.py +299 -0
- sentience/canonicalization.py +207 -0
- sentience/cli.py +130 -0
- sentience/cloud_tracing.py +807 -0
- sentience/constants.py +6 -0
- sentience/conversational_agent.py +543 -0
- sentience/element_filter.py +136 -0
- sentience/expect.py +188 -0
- sentience/extension/background.js +104 -0
- sentience/extension/content.js +161 -0
- sentience/extension/injected_api.js +914 -0
- sentience/extension/manifest.json +36 -0
- sentience/extension/pkg/sentience_core.d.ts +51 -0
- sentience/extension/pkg/sentience_core.js +323 -0
- sentience/extension/pkg/sentience_core_bg.wasm +0 -0
- sentience/extension/pkg/sentience_core_bg.wasm.d.ts +10 -0
- sentience/extension/release.json +115 -0
- sentience/formatting.py +15 -0
- sentience/generator.py +202 -0
- sentience/inspector.py +367 -0
- sentience/llm_interaction_handler.py +191 -0
- sentience/llm_provider.py +875 -0
- sentience/llm_provider_utils.py +120 -0
- sentience/llm_response_builder.py +153 -0
- sentience/models.py +846 -0
- sentience/ordinal.py +280 -0
- sentience/overlay.py +222 -0
- sentience/protocols.py +228 -0
- sentience/query.py +303 -0
- sentience/read.py +188 -0
- sentience/recorder.py +589 -0
- sentience/schemas/trace_v1.json +335 -0
- sentience/screenshot.py +100 -0
- sentience/sentience_methods.py +86 -0
- sentience/snapshot.py +706 -0
- sentience/snapshot_diff.py +126 -0
- sentience/text_search.py +262 -0
- sentience/trace_event_builder.py +148 -0
- sentience/trace_file_manager.py +197 -0
- sentience/trace_indexing/__init__.py +27 -0
- sentience/trace_indexing/index_schema.py +199 -0
- sentience/trace_indexing/indexer.py +414 -0
- sentience/tracer_factory.py +322 -0
- sentience/tracing.py +449 -0
- sentience/utils/__init__.py +40 -0
- sentience/utils/browser.py +46 -0
- sentience/utils/element.py +257 -0
- sentience/utils/formatting.py +59 -0
- sentience/utils.py +296 -0
- sentience/verification.py +380 -0
- sentience/visual_agent.py +2058 -0
- sentience/wait.py +139 -0
- sentienceapi-0.95.0.dist-info/METADATA +984 -0
- sentienceapi-0.95.0.dist-info/RECORD +82 -0
- sentienceapi-0.95.0.dist-info/WHEEL +5 -0
- sentienceapi-0.95.0.dist-info/entry_points.txt +2 -0
- sentienceapi-0.95.0.dist-info/licenses/LICENSE +24 -0
- sentienceapi-0.95.0.dist-info/licenses/LICENSE-APACHE +201 -0
- sentienceapi-0.95.0.dist-info/licenses/LICENSE-MIT +21 -0
- sentienceapi-0.95.0.dist-info/top_level.txt +1 -0
sentience/agent.py
ADDED
|
@@ -0,0 +1,1181 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Sentience Agent: High-level automation agent using LLM + SDK
|
|
3
|
+
Implements observe-think-act loop for natural language commands
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import asyncio
|
|
7
|
+
import hashlib
|
|
8
|
+
import time
|
|
9
|
+
from typing import TYPE_CHECKING, Any, Optional, Union
|
|
10
|
+
|
|
11
|
+
from .action_executor import ActionExecutor
|
|
12
|
+
from .agent_config import AgentConfig
|
|
13
|
+
from .base_agent import BaseAgent, BaseAgentAsync
|
|
14
|
+
from .browser import AsyncSentienceBrowser, SentienceBrowser
|
|
15
|
+
from .element_filter import ElementFilter
|
|
16
|
+
from .llm_interaction_handler import LLMInteractionHandler
|
|
17
|
+
from .llm_provider import LLMProvider, LLMResponse
|
|
18
|
+
from .models import (
|
|
19
|
+
ActionHistory,
|
|
20
|
+
ActionTokenUsage,
|
|
21
|
+
AgentActionResult,
|
|
22
|
+
Element,
|
|
23
|
+
ScreenshotConfig,
|
|
24
|
+
Snapshot,
|
|
25
|
+
SnapshotOptions,
|
|
26
|
+
TokenStats,
|
|
27
|
+
)
|
|
28
|
+
from .protocols import AsyncBrowserProtocol, BrowserProtocol
|
|
29
|
+
from .snapshot import snapshot, snapshot_async
|
|
30
|
+
from .snapshot_diff import SnapshotDiff
|
|
31
|
+
from .trace_event_builder import TraceEventBuilder
|
|
32
|
+
|
|
33
|
+
if TYPE_CHECKING:
|
|
34
|
+
from .tracing import Tracer
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _safe_tracer_call(
|
|
38
|
+
tracer: Optional["Tracer"], method_name: str, verbose: bool, *args, **kwargs
|
|
39
|
+
) -> None:
|
|
40
|
+
"""
|
|
41
|
+
Safely call tracer method, catching and logging errors without breaking execution.
|
|
42
|
+
|
|
43
|
+
Args:
|
|
44
|
+
tracer: Tracer instance or None
|
|
45
|
+
method_name: Name of tracer method to call (e.g., "emit", "emit_error")
|
|
46
|
+
verbose: Whether to print error messages
|
|
47
|
+
*args: Positional arguments for the tracer method
|
|
48
|
+
**kwargs: Keyword arguments for the tracer method
|
|
49
|
+
"""
|
|
50
|
+
if not tracer:
|
|
51
|
+
return
|
|
52
|
+
try:
|
|
53
|
+
method = getattr(tracer, method_name)
|
|
54
|
+
if args and kwargs:
|
|
55
|
+
method(*args, **kwargs)
|
|
56
|
+
elif args:
|
|
57
|
+
method(*args)
|
|
58
|
+
elif kwargs:
|
|
59
|
+
method(**kwargs)
|
|
60
|
+
else:
|
|
61
|
+
method()
|
|
62
|
+
except Exception as tracer_error:
|
|
63
|
+
# Tracer errors should not break agent execution
|
|
64
|
+
if verbose:
|
|
65
|
+
print(f"⚠️ Tracer error (non-fatal): {tracer_error}")
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class SentienceAgent(BaseAgent):
|
|
69
|
+
"""
|
|
70
|
+
High-level agent that combines Sentience SDK with any LLM provider.
|
|
71
|
+
|
|
72
|
+
Uses observe-think-act loop to execute natural language commands:
|
|
73
|
+
1. OBSERVE: Get snapshot of current page state
|
|
74
|
+
2. THINK: Query LLM to decide next action
|
|
75
|
+
3. ACT: Execute action using SDK
|
|
76
|
+
|
|
77
|
+
Example:
|
|
78
|
+
>>> from sentience import SentienceBrowser, SentienceAgent
|
|
79
|
+
>>> from sentience.llm_provider import OpenAIProvider
|
|
80
|
+
>>>
|
|
81
|
+
>>> browser = SentienceBrowser(api_key="sentience_key")
|
|
82
|
+
>>> llm = OpenAIProvider(api_key="openai_key", model="gpt-4o")
|
|
83
|
+
>>> agent = SentienceAgent(browser, llm)
|
|
84
|
+
>>>
|
|
85
|
+
>>> with browser:
|
|
86
|
+
>>> browser.page.goto("https://google.com")
|
|
87
|
+
>>> agent.act("Click the search box")
|
|
88
|
+
>>> agent.act("Type 'magic mouse' into the search field")
|
|
89
|
+
>>> agent.act("Press Enter key")
|
|
90
|
+
"""
|
|
91
|
+
|
|
92
|
+
def __init__(
    self,
    browser: SentienceBrowser | BrowserProtocol,
    llm: LLMProvider,
    default_snapshot_limit: int = 50,
    verbose: bool = True,
    tracer: Optional["Tracer"] = None,
    config: Optional["AgentConfig"] = None,
):
    """
    Set up a synchronous Sentience agent.

    Args:
        browser: SentienceBrowser instance, or any BrowserProtocol-compatible
                 object (e.g. a mock used in tests)
        llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.)
        default_snapshot_limit: Maximum number of elements included in the
                                LLM context when no explicit options are given
        verbose: Print execution logs to stdout
        tracer: Optional Tracer instance that receives execution events
        config: Optional AgentConfig; a default AgentConfig() is created
                when omitted (or falsy)
    """
    # Caller-supplied collaborators and settings
    self.browser, self.llm = browser, llm
    self.default_snapshot_limit = default_snapshot_limit
    self.verbose = verbose
    self.tracer = tracer
    self.config = config if config else AgentConfig()

    # Helpers that wrap the LLM conversation and the browser-side actions
    self.llm_handler = LLMInteractionHandler(llm)
    self.action_executor = ActionExecutor(browser)

    # One dict per attempt recorded by act()
    self.history: list[dict[str, Any]] = []

    # Running token totals plus a per-action breakdown; get_token_stats()
    # turns this raw dict into a TokenStats object
    self._token_usage_raw = {
        "total_prompt_tokens": 0,
        "total_completion_tokens": 0,
        "total_tokens": 0,
        "by_action": [],
    }

    # Incremented once per act() call; used to build trace step IDs
    self._step_count = 0

    # Last snapshot seen, kept so the next one can be diffed against it
    self._previous_snapshot: Snapshot | None = None
|
|
141
|
+
|
|
142
|
+
def _compute_hash(self, text: str) -> str:
|
|
143
|
+
"""Compute SHA256 hash of text."""
|
|
144
|
+
return hashlib.sha256(text.encode("utf-8")).hexdigest()
|
|
145
|
+
|
|
146
|
+
def _get_element_bbox(self, element_id: int | None, snap: Snapshot) -> dict[str, float] | None:
|
|
147
|
+
"""Get bounding box for an element from snapshot."""
|
|
148
|
+
if element_id is None:
|
|
149
|
+
return None
|
|
150
|
+
for el in snap.elements:
|
|
151
|
+
if el.id == element_id:
|
|
152
|
+
return {
|
|
153
|
+
"x": el.bbox.x,
|
|
154
|
+
"y": el.bbox.y,
|
|
155
|
+
"width": el.bbox.width,
|
|
156
|
+
"height": el.bbox.height,
|
|
157
|
+
}
|
|
158
|
+
return None
|
|
159
|
+
|
|
160
|
+
def act(  # noqa: C901
    self,
    goal: str,
    max_retries: int = 2,
    snapshot_options: SnapshotOptions | None = None,
) -> AgentActionResult:
    """
    Execute a high-level goal using observe → think → act loop

    Each attempt takes a snapshot, diffs it against the previous one, filters
    the elements for the goal, asks the LLM for an action, executes it and
    records the outcome in ``self.history``. Trace events are emitted through
    ``_safe_tracer_call`` when a tracer is configured, so tracer failures
    never abort the step.

    Args:
        goal: Natural language instruction (e.g., "Click the Sign In button")
        max_retries: Number of retries on failure (default: 2)
        snapshot_options: Optional SnapshotOptions for this specific action.
            The object is shallow-copied before use, so the caller's instance
            is never modified and can be reused across calls.

    Returns:
        AgentActionResult with execution details

    Raises:
        RuntimeError: If every attempt fails; the last underlying exception
            is attached as ``__cause__``.

    Example:
        >>> result = agent.act("Click the search box")
        >>> print(result.success, result.action, result.element_id)
        True click 42
        >>> # Backward compatible dict access
        >>> print(result["element_id"])  # Works but shows deprecation warning
        42
    """
    # Function-scope import: only needed to protect caller-supplied options.
    import copy

    if self.verbose:
        print(f"\n{'=' * 70}")
        print(f"🤖 Agent Goal: {goal}")
        print(f"{'=' * 70}")

    # Generate step ID for tracing
    self._step_count += 1
    step_id = f"step-{self._step_count}"

    # Emit step_start trace event if tracer is enabled
    if self.tracer:
        pre_url = self.browser.page.url if self.browser.page else None
        _safe_tracer_call(
            self.tracer,
            "emit_step_start",
            self.verbose,
            step_id=step_id,
            step_index=self._step_count,
            goal=goal,
            attempt=0,
            pre_url=pre_url,
        )

    for attempt in range(max_retries + 1):
        try:
            # 1. OBSERVE: Get refined semantic snapshot
            start_time = time.time()

            # Use a copy of the provided options (or create defaults) so that
            # filling in the goal below does not leak into the caller's object
            # and pin a stale goal on a reused SnapshotOptions instance.
            snap_opts = (
                copy.copy(snapshot_options)
                if snapshot_options
                else SnapshotOptions(limit=self.default_snapshot_limit)
            )
            # Only set goal if not already provided
            if snap_opts.goal is None:
                snap_opts.goal = goal

            # Apply AgentConfig screenshot settings if not overridden by snapshot_options
            if snapshot_options is None and self.config:
                if self.config.capture_screenshots:
                    # Create ScreenshotConfig from AgentConfig
                    snap_opts.screenshot = ScreenshotConfig(
                        format=self.config.screenshot_format,
                        quality=(
                            self.config.screenshot_quality
                            if self.config.screenshot_format == "jpeg"
                            else None
                        ),
                    )
                else:
                    snap_opts.screenshot = False
                # Apply show_overlay from AgentConfig
                snap_opts.show_overlay = self.config.show_overlay

            # Call snapshot with options object (matches TypeScript API)
            snap = snapshot(self.browser, snap_opts)

            if snap.status != "success":
                raise RuntimeError(f"Snapshot failed: {snap.error}")

            # Compute diff_status by comparing with previous snapshot
            elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)

            # Create snapshot with diff_status populated
            snap_with_diff = Snapshot(
                status=snap.status,
                timestamp=snap.timestamp,
                url=snap.url,
                viewport=snap.viewport,
                elements=elements_with_diff,
                screenshot=snap.screenshot,
                screenshot_format=snap.screenshot_format,
                error=snap.error,
            )

            # Update previous snapshot for next comparison
            self._previous_snapshot = snap

            # Apply element filtering based on goal
            filtered_elements = self.filter_elements(snap_with_diff, goal)

            # Emit snapshot trace event if tracer is enabled
            if self.tracer:
                # Build snapshot event data (use snap_with_diff to include diff_status)
                snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)

                # Always include screenshot in trace event for studio viewer compatibility
                # CloudTraceSink will extract and upload screenshots separately, then remove
                # screenshot_base64 from events before uploading the trace file.
                if snap.screenshot:
                    # Extract base64 string from data URL if needed
                    if snap.screenshot.startswith("data:image"):
                        # Format: "data:image/jpeg;base64,{base64_string}"
                        screenshot_base64 = (
                            snap.screenshot.split(",", 1)[1]
                            if "," in snap.screenshot
                            else snap.screenshot
                        )
                    else:
                        screenshot_base64 = snap.screenshot

                    snapshot_data["screenshot_base64"] = screenshot_base64
                    if snap.screenshot_format:
                        snapshot_data["screenshot_format"] = snap.screenshot_format

                _safe_tracer_call(
                    self.tracer,
                    "emit",
                    self.verbose,
                    "snapshot",
                    snapshot_data,
                    step_id=step_id,
                )

            # Create filtered snapshot (use snap_with_diff to preserve metadata)
            filtered_snap = Snapshot(
                status=snap_with_diff.status,
                timestamp=snap_with_diff.timestamp,
                url=snap_with_diff.url,
                viewport=snap_with_diff.viewport,
                elements=filtered_elements,
                screenshot=snap_with_diff.screenshot,
                screenshot_format=snap_with_diff.screenshot_format,
                error=snap_with_diff.error,
            )

            # 2. GROUND: Format elements for LLM context
            context = self.llm_handler.build_context(filtered_snap, goal)

            # 3. THINK: Query LLM for next action
            llm_response = self.llm_handler.query_llm(context, goal)

            # Emit LLM query trace event if tracer is enabled
            if self.tracer:
                _safe_tracer_call(
                    self.tracer,
                    "emit",
                    self.verbose,
                    "llm_query",
                    {
                        "prompt_tokens": llm_response.prompt_tokens,
                        "completion_tokens": llm_response.completion_tokens,
                        "model": llm_response.model_name,
                        "response": llm_response.content[:200],  # Truncate for brevity
                    },
                    step_id=step_id,
                )

            if self.verbose:
                print(f"🧠 LLM Decision: {llm_response.content}")

            # Track token usage
            self._track_tokens(goal, llm_response)

            # Parse action from LLM response
            action_str = self.llm_handler.extract_action(llm_response.content)

            # 4. EXECUTE: Parse and run action
            result_dict = self.action_executor.execute(action_str, filtered_snap)

            duration_ms = int((time.time() - start_time) * 1000)

            # Create AgentActionResult from execution result
            result = AgentActionResult(
                success=result_dict["success"],
                action=result_dict["action"],
                goal=goal,
                duration_ms=duration_ms,
                attempt=attempt,
                element_id=result_dict.get("element_id"),
                text=result_dict.get("text"),
                key=result_dict.get("key"),
                outcome=result_dict.get("outcome"),
                url_changed=result_dict.get("url_changed"),
                error=result_dict.get("error"),
                message=result_dict.get("message"),
            )

            # Emit action execution trace event if tracer is enabled
            if self.tracer:
                post_url = self.browser.page.url if self.browser.page else None

                # Include element data for live overlay visualization
                elements_data = [
                    {
                        "id": el.id,
                        "bbox": {
                            "x": el.bbox.x,
                            "y": el.bbox.y,
                            "width": el.bbox.width,
                            "height": el.bbox.height,
                        },
                        "role": el.role,
                        "text": el.text[:50] if el.text else "",
                    }
                    for el in filtered_snap.elements[:50]
                ]

                _safe_tracer_call(
                    self.tracer,
                    "emit",
                    self.verbose,
                    "action",
                    {
                        "action": result.action,
                        "element_id": result.element_id,
                        "success": result.success,
                        "outcome": result.outcome,
                        "duration_ms": duration_ms,
                        "post_url": post_url,
                        "elements": elements_data,  # Add element data for overlay
                        "target_element_id": result.element_id,  # Highlight target in red
                    },
                    step_id=step_id,
                )

            # 5. RECORD: Track history
            self.history.append(
                {
                    "goal": goal,
                    "action": action_str,
                    "result": result.model_dump(),  # Store as dict
                    "success": result.success,
                    "attempt": attempt,
                    "duration_ms": duration_ms,
                }
            )

            if self.verbose:
                status = "✅" if result.success else "❌"
                print(f"{status} Completed in {duration_ms}ms")

            # Emit step completion trace event if tracer is enabled
            if self.tracer:
                # pre_url is the URL recorded in the snapshot taken for this attempt
                pre_url = snap.url
                post_url = self.browser.page.url if self.browser.page else None

                # Compute snapshot digest (simplified - use URL + timestamp)
                snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"

                # Build LLM data
                llm_response_text = llm_response.content
                llm_response_hash = f"sha256:{self._compute_hash(llm_response_text)}"
                llm_data = {
                    "response_text": llm_response_text,
                    "response_hash": llm_response_hash,
                    "usage": {
                        "prompt_tokens": llm_response.prompt_tokens or 0,
                        "completion_tokens": llm_response.completion_tokens or 0,
                        "total_tokens": llm_response.total_tokens or 0,
                    },
                }

                # Build exec data
                exec_data = {
                    "success": result.success,
                    "action": result.action,
                    "outcome": result.outcome
                    or (
                        f"Action {result.action} executed successfully"
                        if result.success
                        else f"Action {result.action} failed"
                    ),
                    "duration_ms": duration_ms,
                }

                # Look the target's bounding box up once; it is None when no
                # element was targeted or the element is not in the snapshot.
                target_bbox = self._get_element_bbox(result.element_id, snap)

                # Add optional exec fields
                if result.element_id is not None:
                    exec_data["element_id"] = result.element_id
                    # Add bounding box if element found
                    if target_bbox:
                        exec_data["bounding_box"] = target_bbox
                if result.text is not None:
                    exec_data["text"] = result.text
                if result.key is not None:
                    exec_data["key"] = result.key
                if result.error is not None:
                    exec_data["error"] = result.error

                # Build verify data (simplified - based on success and url_changed)
                verify_passed = result.success and (
                    result.url_changed or result.action != "click"
                )
                verify_signals = {
                    "url_changed": result.url_changed or False,
                }
                if result.error:
                    verify_signals["error"] = result.error

                # Add elements_found array if element was targeted
                if result.element_id is not None and target_bbox:
                    verify_signals["elements_found"] = [
                        {
                            "label": f"Element {result.element_id}",
                            # Separate dict so exec and verify payloads stay independent
                            "bounding_box": dict(target_bbox),
                        }
                    ]

                verify_data = {
                    "passed": verify_passed,
                    "signals": verify_signals,
                }

                # Build elements data for pre field (include diff_status from snap_with_diff)
                # Use the same format as build_snapshot_event for consistency
                snapshot_event_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
                pre_elements = snapshot_event_data.get("elements", [])

                # Build complete step_end event
                step_end_data = TraceEventBuilder.build_step_end_event(
                    step_id=step_id,
                    step_index=self._step_count,
                    goal=goal,
                    attempt=attempt,
                    pre_url=pre_url,
                    post_url=post_url,
                    snapshot_digest=snapshot_digest,
                    llm_data=llm_data,
                    exec_data=exec_data,
                    verify_data=verify_data,
                    pre_elements=pre_elements,
                )

                _safe_tracer_call(
                    self.tracer,
                    "emit",
                    self.verbose,
                    "step_end",
                    step_end_data,
                    step_id=step_id,
                )

            return result

        except Exception as e:
            # Emit error trace event if tracer is enabled
            if self.tracer:
                _safe_tracer_call(
                    self.tracer,
                    "emit_error",
                    self.verbose,
                    step_id=step_id,
                    error=str(e),
                    attempt=attempt,
                )

            if attempt < max_retries:
                if self.verbose:
                    print(f"⚠️  Retry {attempt + 1}/{max_retries}: {e}")
                time.sleep(1.0)  # Brief delay before retry
                continue
            else:
                # Create error result
                error_result = AgentActionResult(
                    success=False,
                    action="error",
                    goal=goal,
                    duration_ms=0,
                    attempt=attempt,
                    error=str(e),
                )
                self.history.append(
                    {
                        "goal": goal,
                        "action": "error",
                        "result": error_result.model_dump(),
                        "success": False,
                        "attempt": attempt,
                        "duration_ms": 0,
                    }
                )
                # Chain explicitly so the root cause is preserved as __cause__
                raise RuntimeError(f"Failed after {max_retries} retries: {e}") from e
|
|
558
|
+
|
|
559
|
+
def _track_tokens(self, goal: str, llm_response: LLMResponse):
|
|
560
|
+
"""
|
|
561
|
+
Track token usage for analytics
|
|
562
|
+
|
|
563
|
+
Args:
|
|
564
|
+
goal: User goal
|
|
565
|
+
llm_response: LLM response with token usage
|
|
566
|
+
"""
|
|
567
|
+
if llm_response.prompt_tokens:
|
|
568
|
+
self._token_usage_raw["total_prompt_tokens"] += llm_response.prompt_tokens
|
|
569
|
+
if llm_response.completion_tokens:
|
|
570
|
+
self._token_usage_raw["total_completion_tokens"] += llm_response.completion_tokens
|
|
571
|
+
if llm_response.total_tokens:
|
|
572
|
+
self._token_usage_raw["total_tokens"] += llm_response.total_tokens
|
|
573
|
+
|
|
574
|
+
self._token_usage_raw["by_action"].append(
|
|
575
|
+
{
|
|
576
|
+
"goal": goal,
|
|
577
|
+
"prompt_tokens": llm_response.prompt_tokens or 0,
|
|
578
|
+
"completion_tokens": llm_response.completion_tokens or 0,
|
|
579
|
+
"total_tokens": llm_response.total_tokens or 0,
|
|
580
|
+
"model": llm_response.model_name,
|
|
581
|
+
}
|
|
582
|
+
)
|
|
583
|
+
|
|
584
|
+
def get_token_stats(self) -> TokenStats:
    """
    Build a TokenStats object from the raw token counters.

    Returns:
        TokenStats holding the accumulated prompt/completion/total token
        counts and one ActionTokenUsage per recorded LLM call
    """
    raw = self._token_usage_raw
    per_action = []
    for record in raw["by_action"]:
        per_action.append(ActionTokenUsage(**record))
    return TokenStats(
        total_prompt_tokens=raw["total_prompt_tokens"],
        total_completion_tokens=raw["total_completion_tokens"],
        total_tokens=raw["total_tokens"],
        by_action=per_action,
    )
|
|
598
|
+
|
|
599
|
+
def get_history(self) -> list[ActionHistory]:
    """
    Return the recorded execution history as typed entries.

    Returns:
        A new list with one ActionHistory built from each stored history dict
    """
    entries = []
    for record in self.history:
        entries.append(ActionHistory(**record))
    return entries
|
|
607
|
+
|
|
608
|
+
def clear_history(self) -> None:
    """Empty the history list in place and start over with fresh token counters"""
    fresh_usage = {
        "total_prompt_tokens": 0,
        "total_completion_tokens": 0,
        "total_tokens": 0,
        "by_action": [],
    }
    self.history.clear()
    self._token_usage_raw = fresh_usage
|
|
617
|
+
|
|
618
|
+
def filter_elements(self, snapshot: Snapshot, goal: str | None = None) -> list[Element]:
    """
    Select the snapshot elements to show to the LLM for a given goal.

    Delegates to ElementFilter.filter_by_goal, passing the agent's
    default_snapshot_limit as the third argument.

    Args:
        snapshot: Current page snapshot
        goal: User's goal, forwarded to the filter (may be None)

    Returns:
        Filtered list of elements
    """
    limit = self.default_snapshot_limit
    selected = ElementFilter.filter_by_goal(snapshot, goal, limit)
    return selected
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
class SentienceAgentAsync(BaseAgentAsync):
|
|
636
|
+
"""
|
|
637
|
+
High-level async agent that combines Sentience SDK with any LLM provider.
|
|
638
|
+
|
|
639
|
+
Uses observe-think-act loop to execute natural language commands:
|
|
640
|
+
1. OBSERVE: Get snapshot of current page state
|
|
641
|
+
2. THINK: Query LLM to decide next action
|
|
642
|
+
3. ACT: Execute action using SDK
|
|
643
|
+
|
|
644
|
+
Example:
|
|
645
|
+
>>> from sentience.async_api import AsyncSentienceBrowser
|
|
646
|
+
>>> from sentience.agent import SentienceAgentAsync
|
|
647
|
+
>>> from sentience.llm_provider import OpenAIProvider
|
|
648
|
+
>>>
|
|
649
|
+
>>> async with AsyncSentienceBrowser() as browser:
|
|
650
|
+
>>> await browser.goto("https://google.com")
|
|
651
|
+
>>> llm = OpenAIProvider(api_key="openai_key", model="gpt-4o")
|
|
652
|
+
>>> agent = SentienceAgentAsync(browser, llm)
|
|
653
|
+
>>> await agent.act("Click the search box")
|
|
654
|
+
>>> await agent.act("Type 'magic mouse' into the search field")
|
|
655
|
+
>>> await agent.act("Press Enter key")
|
|
656
|
+
"""
|
|
657
|
+
|
|
658
|
+
def __init__(
    self,
    browser: AsyncSentienceBrowser,
    llm: LLMProvider,
    default_snapshot_limit: int = 50,
    verbose: bool = True,
    tracer: Optional["Tracer"] = None,
    config: Optional["AgentConfig"] = None,
):
    """
    Set up an asynchronous Sentience agent.

    Args:
        browser: AsyncSentienceBrowser instance
        llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.)
        default_snapshot_limit: Maximum number of elements included in the
                                LLM context when no explicit options are given
        verbose: Print execution logs to stdout
        tracer: Optional Tracer instance that receives execution events
        config: Optional AgentConfig; a default AgentConfig() is created
                when omitted (or falsy)
    """
    # Caller-supplied collaborators and settings
    self.browser, self.llm = browser, llm
    self.default_snapshot_limit = default_snapshot_limit
    self.verbose = verbose
    self.tracer = tracer
    self.config = config if config else AgentConfig()

    # Helpers that wrap the LLM conversation and the browser-side actions
    self.llm_handler = LLMInteractionHandler(llm)
    self.action_executor = ActionExecutor(browser)

    # One dict per recorded attempt
    self.history: list[dict[str, Any]] = []

    # Running token totals plus a per-action breakdown (raw dict form)
    self._token_usage_raw = {
        "total_prompt_tokens": 0,
        "total_completion_tokens": 0,
        "total_tokens": 0,
        "by_action": [],
    }

    # Step counter used to build trace step IDs
    self._step_count = 0

    # Last snapshot seen, kept for diff detection
    self._previous_snapshot: Snapshot | None = None
|
|
706
|
+
|
|
707
|
+
def _compute_hash(self, text: str) -> str:
|
|
708
|
+
"""Compute SHA256 hash of text."""
|
|
709
|
+
return hashlib.sha256(text.encode("utf-8")).hexdigest()
|
|
710
|
+
|
|
711
|
+
def _get_element_bbox(self, element_id: int | None, snap: Snapshot) -> dict[str, float] | None:
|
|
712
|
+
"""Get bounding box for an element from snapshot."""
|
|
713
|
+
if element_id is None:
|
|
714
|
+
return None
|
|
715
|
+
for el in snap.elements:
|
|
716
|
+
if el.id == element_id:
|
|
717
|
+
return {
|
|
718
|
+
"x": el.bbox.x,
|
|
719
|
+
"y": el.bbox.y,
|
|
720
|
+
"width": el.bbox.width,
|
|
721
|
+
"height": el.bbox.height,
|
|
722
|
+
}
|
|
723
|
+
return None
|
|
724
|
+
|
|
725
|
+
async def act(  # noqa: C901
    self,
    goal: str,
    max_retries: int = 2,
    snapshot_options: SnapshotOptions | None = None,
) -> AgentActionResult:
    """
    Execute a high-level goal using observe → think → act loop (async)

    Each attempt takes a snapshot, filters its elements for the goal, asks the
    LLM for an action, executes it, and records the outcome in ``self.history``.
    Any exception raised during an attempt (including a non-success snapshot)
    triggers a retry after a one-second pause until ``max_retries`` is exhausted.
    When a tracer is configured, step_start / snapshot / llm_query / action /
    step_end (or error) events are emitted along the way.

    Args:
        goal: Natural language instruction (e.g., "Click the Sign In button")
        max_retries: Number of retries on failure (default: 2)
        snapshot_options: Optional SnapshotOptions for this specific action.
            The object is shallow-copied before use, so the caller's instance
            is never modified and can safely be reused across calls.

    Returns:
        AgentActionResult with execution details

    Raises:
        RuntimeError: If the final attempt fails; chained to the underlying exception.

    Example:
        >>> result = await agent.act("Click the search box")
        >>> print(result.success, result.action, result.element_id)
        True click 42
    """
    # Function-scope import: only needed to copy caller-supplied snapshot options.
    import copy

    if self.verbose:
        print(f"\n{'=' * 70}")
        print(f"🤖 Agent Goal: {goal}")
        print(f"{'=' * 70}")

    # Generate step ID for tracing
    self._step_count += 1
    step_id = f"step-{self._step_count}"

    # Emit step_start trace event if tracer is enabled
    if self.tracer:
        pre_url = self.browser.page.url if self.browser.page else None
        _safe_tracer_call(
            self.tracer,
            "emit_step_start",
            self.verbose,
            step_id=step_id,
            step_index=self._step_count,
            goal=goal,
            attempt=0,
            pre_url=pre_url,
        )

    for attempt in range(max_retries + 1):
        try:
            # 1. OBSERVE: Get refined semantic snapshot
            start_time = time.time()

            # Use a copy of the provided options or create default ones.
            # Copying matters: goal/screenshot/show_overlay are assigned below, and
            # writing them onto the caller's object would leak this call's goal
            # into any later call that reuses the same options instance.
            if snapshot_options:
                snap_opts = copy.copy(snapshot_options)
            else:
                snap_opts = SnapshotOptions(limit=self.default_snapshot_limit)

            # Only set goal if not already provided
            if snap_opts.goal is None:
                snap_opts.goal = goal

            # Apply AgentConfig screenshot settings if not overridden by snapshot_options
            # Only apply if snapshot_options wasn't provided OR if screenshot wasn't explicitly set
            # (snapshot_options.screenshot defaults to False, so we check if it's still False)
            if self.config and (snapshot_options is None or snap_opts.screenshot is False):
                if self.config.capture_screenshots:
                    # Create ScreenshotConfig from AgentConfig
                    snap_opts.screenshot = ScreenshotConfig(
                        format=self.config.screenshot_format,
                        quality=(
                            self.config.screenshot_quality
                            if self.config.screenshot_format == "jpeg"
                            else None
                        ),
                    )
                else:
                    snap_opts.screenshot = False
                # Apply show_overlay from AgentConfig
                # Note: User can override by explicitly passing show_overlay in snapshot_options
                snap_opts.show_overlay = self.config.show_overlay

            # Call snapshot with options object (matches TypeScript API)
            snap = await snapshot_async(self.browser, snap_opts)

            if snap.status != "success":
                # Raised inside the try block, so it is handled by the retry logic below.
                raise RuntimeError(f"Snapshot failed: {snap.error}")

            # Compute diff_status by comparing with previous snapshot
            elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)

            # Create snapshot with diff_status populated
            snap_with_diff = Snapshot(
                status=snap.status,
                timestamp=snap.timestamp,
                url=snap.url,
                viewport=snap.viewport,
                elements=elements_with_diff,
                screenshot=snap.screenshot,
                screenshot_format=snap.screenshot_format,
                error=snap.error,
            )

            # Update previous snapshot for next comparison
            self._previous_snapshot = snap

            # Apply element filtering based on goal
            filtered_elements = self.filter_elements(snap_with_diff, goal)

            # Emit snapshot trace event if tracer is enabled
            if self.tracer:
                # Build snapshot event data (use snap_with_diff to include diff_status)
                snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)

                # Always include screenshot in trace event for studio viewer compatibility
                # CloudTraceSink will extract and upload screenshots separately, then remove
                # screenshot_base64 from events before uploading the trace file.
                if snap.screenshot:
                    # Extract base64 string from data URL if needed
                    if snap.screenshot.startswith("data:image"):
                        # Format: "data:image/jpeg;base64,{base64_string}"
                        screenshot_base64 = (
                            snap.screenshot.split(",", 1)[1]
                            if "," in snap.screenshot
                            else snap.screenshot
                        )
                    else:
                        screenshot_base64 = snap.screenshot

                    snapshot_data["screenshot_base64"] = screenshot_base64
                    if snap.screenshot_format:
                        snapshot_data["screenshot_format"] = snap.screenshot_format

                _safe_tracer_call(
                    self.tracer,
                    "emit",
                    self.verbose,
                    "snapshot",
                    snapshot_data,
                    step_id=step_id,
                )

            # Create filtered snapshot (use snap_with_diff to preserve metadata)
            filtered_snap = Snapshot(
                status=snap_with_diff.status,
                timestamp=snap_with_diff.timestamp,
                url=snap_with_diff.url,
                viewport=snap_with_diff.viewport,
                elements=filtered_elements,
                screenshot=snap_with_diff.screenshot,
                screenshot_format=snap_with_diff.screenshot_format,
                error=snap_with_diff.error,
            )

            # 2. GROUND: Format elements for LLM context
            context = self.llm_handler.build_context(filtered_snap, goal)

            # 3. THINK: Query LLM for next action
            llm_response = self.llm_handler.query_llm(context, goal)

            # Emit LLM query trace event if tracer is enabled
            if self.tracer:
                _safe_tracer_call(
                    self.tracer,
                    "emit",
                    self.verbose,
                    "llm_query",
                    {
                        "prompt_tokens": llm_response.prompt_tokens,
                        "completion_tokens": llm_response.completion_tokens,
                        "model": llm_response.model_name,
                        "response": llm_response.content[:200],  # Truncate for brevity
                    },
                    step_id=step_id,
                )

            if self.verbose:
                print(f"🧠 LLM Decision: {llm_response.content}")

            # Track token usage
            self._track_tokens(goal, llm_response)

            # Parse action from LLM response
            action_str = self.llm_handler.extract_action(llm_response.content)

            # 4. EXECUTE: Parse and run action
            result_dict = await self.action_executor.execute_async(action_str, filtered_snap)

            duration_ms = int((time.time() - start_time) * 1000)

            # Create AgentActionResult from execution result
            result = AgentActionResult(
                success=result_dict["success"],
                action=result_dict["action"],
                goal=goal,
                duration_ms=duration_ms,
                attempt=attempt,
                element_id=result_dict.get("element_id"),
                text=result_dict.get("text"),
                key=result_dict.get("key"),
                outcome=result_dict.get("outcome"),
                url_changed=result_dict.get("url_changed"),
                error=result_dict.get("error"),
                message=result_dict.get("message"),
            )

            # Emit action execution trace event if tracer is enabled
            if self.tracer:
                post_url = self.browser.page.url if self.browser.page else None

                # Include element data for live overlay visualization
                # (capped at 50 elements, text truncated to 50 characters)
                elements_data = [
                    {
                        "id": el.id,
                        "bbox": {
                            "x": el.bbox.x,
                            "y": el.bbox.y,
                            "width": el.bbox.width,
                            "height": el.bbox.height,
                        },
                        "role": el.role,
                        "text": el.text[:50] if el.text else "",
                    }
                    for el in filtered_snap.elements[:50]
                ]

                _safe_tracer_call(
                    self.tracer,
                    "emit",
                    self.verbose,
                    "action",
                    {
                        "action": result.action,
                        "element_id": result.element_id,
                        "success": result.success,
                        "outcome": result.outcome,
                        "duration_ms": duration_ms,
                        "post_url": post_url,
                        "elements": elements_data,  # Add element data for overlay
                        "target_element_id": result.element_id,  # Highlight target in red
                    },
                    step_id=step_id,
                )

            # 5. RECORD: Track history
            self.history.append(
                {
                    "goal": goal,
                    "action": action_str,
                    "result": result.model_dump(),  # Store as dict
                    "success": result.success,
                    "attempt": attempt,
                    "duration_ms": duration_ms,
                }
            )

            if self.verbose:
                status = "✅" if result.success else "❌"
                print(f"{status} Completed in {duration_ms}ms")

            # Emit step completion trace event if tracer is enabled
            if self.tracer:
                # pre_url for step_end is the URL recorded in this attempt's snapshot
                pre_url = snap.url
                post_url = self.browser.page.url if self.browser.page else None

                # Compute snapshot digest (simplified - use URL + timestamp)
                snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"

                # Build LLM data
                llm_response_text = llm_response.content
                llm_response_hash = f"sha256:{self._compute_hash(llm_response_text)}"
                llm_data = {
                    "response_text": llm_response_text,
                    "response_hash": llm_response_hash,
                    "usage": {
                        "prompt_tokens": llm_response.prompt_tokens or 0,
                        "completion_tokens": llm_response.completion_tokens or 0,
                        "total_tokens": llm_response.total_tokens or 0,
                    },
                }

                # Build exec data
                exec_data = {
                    "success": result.success,
                    "action": result.action,
                    "outcome": result.outcome
                    or (
                        f"Action {result.action} executed successfully"
                        if result.success
                        else f"Action {result.action} failed"
                    ),
                    "duration_ms": duration_ms,
                }

                # Look the target's bounding box up once; _get_element_bbox returns
                # None when no element was targeted or the ID is not in the snapshot.
                target_bbox = self._get_element_bbox(result.element_id, snap)

                # Add optional exec fields
                if result.element_id is not None:
                    exec_data["element_id"] = result.element_id
                    # Add bounding box if element found
                    if target_bbox:
                        exec_data["bounding_box"] = target_bbox
                if result.text is not None:
                    exec_data["text"] = result.text
                if result.key is not None:
                    exec_data["key"] = result.key
                if result.error is not None:
                    exec_data["error"] = result.error

                # Build verify data (simplified - based on success and url_changed)
                verify_passed = result.success and (
                    result.url_changed or result.action != "click"
                )
                verify_signals = {
                    "url_changed": result.url_changed or False,
                }
                if result.error:
                    verify_signals["error"] = result.error

                # Add elements_found array if element was targeted
                if result.element_id is not None and target_bbox:
                    verify_signals["elements_found"] = [
                        {
                            "label": f"Element {result.element_id}",
                            # Separate dict so exec and verify payloads share no state
                            "bounding_box": dict(target_bbox),
                        }
                    ]

                verify_data = {
                    "passed": verify_passed,
                    "signals": verify_signals,
                }

                # Build elements data for pre field (include diff_status from snap_with_diff)
                # Use the same format as build_snapshot_event for consistency
                snapshot_event_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
                pre_elements = snapshot_event_data.get("elements", [])

                # Build complete step_end event
                step_end_data = TraceEventBuilder.build_step_end_event(
                    step_id=step_id,
                    step_index=self._step_count,
                    goal=goal,
                    attempt=attempt,
                    pre_url=pre_url,
                    post_url=post_url,
                    snapshot_digest=snapshot_digest,
                    llm_data=llm_data,
                    exec_data=exec_data,
                    verify_data=verify_data,
                    pre_elements=pre_elements,
                )

                _safe_tracer_call(
                    self.tracer,
                    "emit",
                    self.verbose,
                    "step_end",
                    step_end_data,
                    step_id=step_id,
                )

            return result

        except Exception as e:
            # Deliberately broad: any failure in the attempt is traced and retried.
            # Emit error trace event if tracer is enabled
            if self.tracer:
                _safe_tracer_call(
                    self.tracer,
                    "emit_error",
                    self.verbose,
                    step_id=step_id,
                    error=str(e),
                    attempt=attempt,
                )

            if attempt < max_retries:
                if self.verbose:
                    print(f"⚠️ Retry {attempt + 1}/{max_retries}: {e}")
                await asyncio.sleep(1.0)  # Brief delay before retry
                continue
            else:
                # Create error result
                error_result = AgentActionResult(
                    success=False,
                    action="error",
                    goal=goal,
                    duration_ms=0,
                    attempt=attempt,
                    error=str(e),
                )
                self.history.append(
                    {
                        "goal": goal,
                        "action": "error",
                        "result": error_result.model_dump(),
                        "success": False,
                        "attempt": attempt,
                        "duration_ms": 0,
                    }
                )
                # Chain explicitly so the root cause stays attached to the traceback.
                raise RuntimeError(f"Failed after {max_retries} retries: {e}") from e
|
|
1123
|
+
|
|
1124
|
+
def _track_tokens(self, goal: str, llm_response: LLMResponse):
|
|
1125
|
+
"""Track token usage for analytics (same as sync version)"""
|
|
1126
|
+
if llm_response.prompt_tokens:
|
|
1127
|
+
self._token_usage_raw["total_prompt_tokens"] += llm_response.prompt_tokens
|
|
1128
|
+
if llm_response.completion_tokens:
|
|
1129
|
+
self._token_usage_raw["total_completion_tokens"] += llm_response.completion_tokens
|
|
1130
|
+
if llm_response.total_tokens:
|
|
1131
|
+
self._token_usage_raw["total_tokens"] += llm_response.total_tokens
|
|
1132
|
+
|
|
1133
|
+
self._token_usage_raw["by_action"].append(
|
|
1134
|
+
{
|
|
1135
|
+
"goal": goal,
|
|
1136
|
+
"prompt_tokens": llm_response.prompt_tokens or 0,
|
|
1137
|
+
"completion_tokens": llm_response.completion_tokens or 0,
|
|
1138
|
+
"total_tokens": llm_response.total_tokens or 0,
|
|
1139
|
+
"model": llm_response.model_name,
|
|
1140
|
+
}
|
|
1141
|
+
)
|
|
1142
|
+
|
|
1143
|
+
def get_token_stats(self) -> TokenStats:
    """
    Build a TokenStats object from the raw usage counters.

    Returns:
        TokenStats carrying the three running totals plus one ActionTokenUsage
        per recorded ``by_action`` entry, in recording order.
    """
    raw = self._token_usage_raw

    # Each raw entry is a dict of keyword arguments for ActionTokenUsage.
    per_action = []
    for entry in raw["by_action"]:
        per_action.append(ActionTokenUsage(**entry))

    return TokenStats(
        total_prompt_tokens=raw["total_prompt_tokens"],
        total_completion_tokens=raw["total_completion_tokens"],
        total_tokens=raw["total_tokens"],
        by_action=per_action,
    )
|
|
1152
|
+
|
|
1153
|
+
def get_history(self) -> list[ActionHistory]:
    """
    Return the execution history as ActionHistory objects.

    Returns:
        A new list with one ActionHistory built from each dict in
        ``self.history``, in the order the entries were recorded.
    """
    records = []
    for raw_entry in self.history:
        records.append(ActionHistory(**raw_entry))
    return records
|
|
1156
|
+
|
|
1157
|
+
def clear_history(self) -> None:
    """
    Empty the execution history and zero the token usage counters.

    The history list is cleared in place; ``_token_usage_raw`` is replaced by
    a brand-new dict with all totals at 0 and an empty ``by_action`` list.
    """
    self.history.clear()

    zeroed = dict.fromkeys(
        ("total_prompt_tokens", "total_completion_tokens", "total_tokens"),
        0,
    )
    zeroed["by_action"] = []
    self._token_usage_raw = zeroed
|
|
1166
|
+
|
|
1167
|
+
def filter_elements(self, snapshot: Snapshot, goal: str | None = None) -> list[Element]:
    """
    Select the snapshot elements to use for a given goal.

    Delegates entirely to ``ElementFilter.filter_by_goal``, passing this
    agent's ``default_snapshot_limit`` as the third argument (presumably a cap
    on how many elements are returned; see ElementFilter to confirm).

    Args:
        snapshot: Current page snapshot
        goal: User's goal, forwarded to the filter; may be None

    Returns:
        Filtered list of elements as produced by ElementFilter
    """
    element_limit = self.default_snapshot_limit
    selected = ElementFilter.filter_by_goal(snapshot, goal, element_limit)
    return selected
|