sentienceapi 0.90.16__py3-none-any.whl → 0.98.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sentienceapi might be problematic. Click here for more details.

Files changed (90)
  1. sentience/__init__.py +120 -6
  2. sentience/_extension_loader.py +156 -1
  3. sentience/action_executor.py +217 -0
  4. sentience/actions.py +758 -30
  5. sentience/agent.py +806 -293
  6. sentience/agent_config.py +3 -0
  7. sentience/agent_runtime.py +840 -0
  8. sentience/asserts/__init__.py +70 -0
  9. sentience/asserts/expect.py +621 -0
  10. sentience/asserts/query.py +383 -0
  11. sentience/async_api.py +89 -1141
  12. sentience/backends/__init__.py +137 -0
  13. sentience/backends/actions.py +372 -0
  14. sentience/backends/browser_use_adapter.py +241 -0
  15. sentience/backends/cdp_backend.py +393 -0
  16. sentience/backends/exceptions.py +211 -0
  17. sentience/backends/playwright_backend.py +194 -0
  18. sentience/backends/protocol.py +216 -0
  19. sentience/backends/sentience_context.py +469 -0
  20. sentience/backends/snapshot.py +483 -0
  21. sentience/base_agent.py +95 -0
  22. sentience/browser.py +678 -39
  23. sentience/browser_evaluator.py +299 -0
  24. sentience/canonicalization.py +207 -0
  25. sentience/cloud_tracing.py +507 -42
  26. sentience/constants.py +6 -0
  27. sentience/conversational_agent.py +77 -43
  28. sentience/cursor_policy.py +142 -0
  29. sentience/element_filter.py +136 -0
  30. sentience/expect.py +98 -2
  31. sentience/extension/background.js +56 -185
  32. sentience/extension/content.js +150 -287
  33. sentience/extension/injected_api.js +1088 -1368
  34. sentience/extension/manifest.json +1 -1
  35. sentience/extension/pkg/sentience_core.d.ts +22 -22
  36. sentience/extension/pkg/sentience_core.js +275 -433
  37. sentience/extension/pkg/sentience_core_bg.wasm +0 -0
  38. sentience/extension/release.json +47 -47
  39. sentience/failure_artifacts.py +241 -0
  40. sentience/formatting.py +9 -53
  41. sentience/inspector.py +183 -1
  42. sentience/integrations/__init__.py +6 -0
  43. sentience/integrations/langchain/__init__.py +12 -0
  44. sentience/integrations/langchain/context.py +18 -0
  45. sentience/integrations/langchain/core.py +326 -0
  46. sentience/integrations/langchain/tools.py +180 -0
  47. sentience/integrations/models.py +46 -0
  48. sentience/integrations/pydanticai/__init__.py +15 -0
  49. sentience/integrations/pydanticai/deps.py +20 -0
  50. sentience/integrations/pydanticai/toolset.py +468 -0
  51. sentience/llm_interaction_handler.py +191 -0
  52. sentience/llm_provider.py +765 -66
  53. sentience/llm_provider_utils.py +120 -0
  54. sentience/llm_response_builder.py +153 -0
  55. sentience/models.py +595 -3
  56. sentience/ordinal.py +280 -0
  57. sentience/overlay.py +109 -2
  58. sentience/protocols.py +228 -0
  59. sentience/query.py +67 -5
  60. sentience/read.py +95 -3
  61. sentience/recorder.py +223 -3
  62. sentience/schemas/trace_v1.json +128 -9
  63. sentience/screenshot.py +48 -2
  64. sentience/sentience_methods.py +86 -0
  65. sentience/snapshot.py +599 -55
  66. sentience/snapshot_diff.py +126 -0
  67. sentience/text_search.py +120 -5
  68. sentience/trace_event_builder.py +148 -0
  69. sentience/trace_file_manager.py +197 -0
  70. sentience/trace_indexing/index_schema.py +95 -7
  71. sentience/trace_indexing/indexer.py +105 -48
  72. sentience/tracer_factory.py +120 -9
  73. sentience/tracing.py +172 -8
  74. sentience/utils/__init__.py +40 -0
  75. sentience/utils/browser.py +46 -0
  76. sentience/{utils.py → utils/element.py} +3 -42
  77. sentience/utils/formatting.py +59 -0
  78. sentience/verification.py +618 -0
  79. sentience/visual_agent.py +2058 -0
  80. sentience/wait.py +68 -2
  81. {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/METADATA +199 -40
  82. sentienceapi-0.98.0.dist-info/RECORD +92 -0
  83. sentience/extension/test-content.js +0 -4
  84. sentienceapi-0.90.16.dist-info/RECORD +0 -50
  85. {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/WHEEL +0 -0
  86. {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/entry_points.txt +0 -0
  87. {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE +0 -0
  88. {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE-APACHE +0 -0
  89. {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE-MIT +0 -0
  90. {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/top_level.txt +0 -0
sentience/agent.py CHANGED
@@ -3,13 +3,17 @@ Sentience Agent: High-level automation agent using LLM + SDK
3
3
  Implements observe-think-act loop for natural language commands
4
4
  """
5
5
 
6
- import re
6
+ import asyncio
7
+ import hashlib
7
8
  import time
8
- from typing import TYPE_CHECKING, Any, Optional
9
-
10
- from .actions import click, press, type_text
11
- from .base_agent import BaseAgent
12
- from .browser import SentienceBrowser
9
+ from typing import TYPE_CHECKING, Any, Optional, Union
10
+
11
+ from .action_executor import ActionExecutor
12
+ from .agent_config import AgentConfig
13
+ from .base_agent import BaseAgent, BaseAgentAsync
14
+ from .browser import AsyncSentienceBrowser, SentienceBrowser
15
+ from .element_filter import ElementFilter
16
+ from .llm_interaction_handler import LLMInteractionHandler
13
17
  from .llm_provider import LLMProvider, LLMResponse
14
18
  from .models import (
15
19
  ActionHistory,
@@ -21,13 +25,46 @@ from .models import (
21
25
  SnapshotOptions,
22
26
  TokenStats,
23
27
  )
24
- from .snapshot import snapshot
28
+ from .protocols import AsyncBrowserProtocol, BrowserProtocol
29
+ from .snapshot import snapshot, snapshot_async
30
+ from .snapshot_diff import SnapshotDiff
31
+ from .trace_event_builder import TraceEventBuilder
25
32
 
26
33
  if TYPE_CHECKING:
27
- from .agent_config import AgentConfig
28
34
  from .tracing import Tracer
29
35
 
30
36
 
37
+ def _safe_tracer_call(
38
+ tracer: Optional["Tracer"], method_name: str, verbose: bool, *args, **kwargs
39
+ ) -> None:
40
+ """
41
+ Safely call tracer method, catching and logging errors without breaking execution.
42
+
43
+ Args:
44
+ tracer: Tracer instance or None
45
+ method_name: Name of tracer method to call (e.g., "emit", "emit_error")
46
+ verbose: Whether to print error messages
47
+ *args: Positional arguments for the tracer method
48
+ **kwargs: Keyword arguments for the tracer method
49
+ """
50
+ if not tracer:
51
+ return
52
+ try:
53
+ method = getattr(tracer, method_name)
54
+ if args and kwargs:
55
+ method(*args, **kwargs)
56
+ elif args:
57
+ method(*args)
58
+ elif kwargs:
59
+ method(**kwargs)
60
+ else:
61
+ method()
62
+ except Exception as tracer_error:
63
+ # Tracer errors should not break agent execution
64
+ if verbose:
65
+ print(f"⚠️ Tracer error (non-fatal): {tracer_error}")
66
+
67
+
31
68
  class SentienceAgent(BaseAgent):
32
69
  """
33
70
  High-level agent that combines Sentience SDK with any LLM provider.
@@ -54,7 +91,7 @@ class SentienceAgent(BaseAgent):
54
91
 
55
92
  def __init__(
56
93
  self,
57
- browser: SentienceBrowser,
94
+ browser: SentienceBrowser | BrowserProtocol,
58
95
  llm: LLMProvider,
59
96
  default_snapshot_limit: int = 50,
60
97
  verbose: bool = True,
@@ -65,7 +102,8 @@ class SentienceAgent(BaseAgent):
65
102
  Initialize Sentience Agent
66
103
 
67
104
  Args:
68
- browser: SentienceBrowser instance
105
+ browser: SentienceBrowser instance or BrowserProtocol-compatible object
106
+ (for testing, can use mock objects that implement BrowserProtocol)
69
107
  llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.)
70
108
  default_snapshot_limit: Default maximum elements to include in context (default: 50)
71
109
  verbose: Print execution logs (default: True)
@@ -77,8 +115,13 @@ class SentienceAgent(BaseAgent):
77
115
  self.default_snapshot_limit = default_snapshot_limit
78
116
  self.verbose = verbose
79
117
  self.tracer = tracer
80
- self.config = config
118
+ self.config = config or AgentConfig()
119
+
120
+ # Initialize handlers
121
+ self.llm_handler = LLMInteractionHandler(llm)
122
+ self.action_executor = ActionExecutor(browser)
81
123
 
124
+ # Screenshot sequence counter
82
125
  # Execution history
83
126
  self.history: list[dict[str, Any]] = []
84
127
 
@@ -93,6 +136,27 @@ class SentienceAgent(BaseAgent):
93
136
  # Step counter for tracing
94
137
  self._step_count = 0
95
138
 
139
+ # Previous snapshot for diff detection
140
+ self._previous_snapshot: Snapshot | None = None
141
+
142
+ def _compute_hash(self, text: str) -> str:
143
+ """Compute SHA256 hash of text."""
144
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()
145
+
146
+ def _get_element_bbox(self, element_id: int | None, snap: Snapshot) -> dict[str, float] | None:
147
+ """Get bounding box for an element from snapshot."""
148
+ if element_id is None:
149
+ return None
150
+ for el in snap.elements:
151
+ if el.id == element_id:
152
+ return {
153
+ "x": el.bbox.x,
154
+ "y": el.bbox.y,
155
+ "width": el.bbox.width,
156
+ "height": el.bbox.height,
157
+ }
158
+ return None
159
+
96
160
  def act( # noqa: C901
97
161
  self,
98
162
  goal: str,
@@ -130,7 +194,10 @@ class SentienceAgent(BaseAgent):
130
194
  # Emit step_start trace event if tracer is enabled
131
195
  if self.tracer:
132
196
  pre_url = self.browser.page.url if self.browser.page else None
133
- self.tracer.emit_step_start(
197
+ _safe_tracer_call(
198
+ self.tracer,
199
+ "emit_step_start",
200
+ self.verbose,
134
201
  step_id=step_id,
135
202
  step_index=self._step_count,
136
203
  goal=goal,
@@ -149,66 +216,107 @@ class SentienceAgent(BaseAgent):
149
216
  if snap_opts.goal is None:
150
217
  snap_opts.goal = goal
151
218
 
219
+ # Apply AgentConfig screenshot settings if not overridden by snapshot_options
220
+ if snapshot_options is None and self.config:
221
+ if self.config.capture_screenshots:
222
+ # Create ScreenshotConfig from AgentConfig
223
+ snap_opts.screenshot = ScreenshotConfig(
224
+ format=self.config.screenshot_format,
225
+ quality=(
226
+ self.config.screenshot_quality
227
+ if self.config.screenshot_format == "jpeg"
228
+ else None
229
+ ),
230
+ )
231
+ else:
232
+ snap_opts.screenshot = False
233
+ # Apply show_overlay from AgentConfig
234
+ snap_opts.show_overlay = self.config.show_overlay
235
+
152
236
  # Call snapshot with options object (matches TypeScript API)
153
237
  snap = snapshot(self.browser, snap_opts)
154
238
 
155
239
  if snap.status != "success":
156
240
  raise RuntimeError(f"Snapshot failed: {snap.error}")
157
241
 
242
+ # Compute diff_status by comparing with previous snapshot
243
+ elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)
244
+
245
+ # Create snapshot with diff_status populated
246
+ snap_with_diff = Snapshot(
247
+ status=snap.status,
248
+ timestamp=snap.timestamp,
249
+ url=snap.url,
250
+ viewport=snap.viewport,
251
+ elements=elements_with_diff,
252
+ screenshot=snap.screenshot,
253
+ screenshot_format=snap.screenshot_format,
254
+ error=snap.error,
255
+ )
256
+
257
+ # Update previous snapshot for next comparison
258
+ self._previous_snapshot = snap
259
+
158
260
  # Apply element filtering based on goal
159
- filtered_elements = self.filter_elements(snap, goal)
261
+ filtered_elements = self.filter_elements(snap_with_diff, goal)
160
262
 
161
263
  # Emit snapshot trace event if tracer is enabled
162
264
  if self.tracer:
163
- # Include element data for live overlay visualization
164
- # Use filtered_elements for overlay (only relevant elements)
165
- elements_data = [
166
- {
167
- "id": el.id,
168
- "bbox": {
169
- "x": el.bbox.x,
170
- "y": el.bbox.y,
171
- "width": el.bbox.width,
172
- "height": el.bbox.height,
173
- },
174
- "role": el.role,
175
- "text": el.text[:50] if el.text else "", # Truncate for brevity
176
- }
177
- for el in filtered_elements[:50] # Limit to first 50 for performance
178
- ]
179
-
180
- self.tracer.emit(
265
+ # Build snapshot event data (use snap_with_diff to include diff_status)
266
+ snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
267
+
268
+ # Always include screenshot in trace event for studio viewer compatibility
269
+ # CloudTraceSink will extract and upload screenshots separately, then remove
270
+ # screenshot_base64 from events before uploading the trace file.
271
+ if snap.screenshot:
272
+ # Extract base64 string from data URL if needed
273
+ if snap.screenshot.startswith("data:image"):
274
+ # Format: "data:image/jpeg;base64,{base64_string}"
275
+ screenshot_base64 = (
276
+ snap.screenshot.split(",", 1)[1]
277
+ if "," in snap.screenshot
278
+ else snap.screenshot
279
+ )
280
+ else:
281
+ screenshot_base64 = snap.screenshot
282
+
283
+ snapshot_data["screenshot_base64"] = screenshot_base64
284
+ if snap.screenshot_format:
285
+ snapshot_data["screenshot_format"] = snap.screenshot_format
286
+
287
+ _safe_tracer_call(
288
+ self.tracer,
289
+ "emit",
290
+ self.verbose,
181
291
  "snapshot",
182
- {
183
- "url": snap.url,
184
- "element_count": len(snap.elements),
185
- "timestamp": snap.timestamp,
186
- "elements": elements_data, # Add element data for overlay
187
- },
292
+ snapshot_data,
188
293
  step_id=step_id,
189
294
  )
190
295
 
191
- # Create filtered snapshot
296
+ # Create filtered snapshot (use snap_with_diff to preserve metadata)
192
297
  filtered_snap = Snapshot(
193
- status=snap.status,
194
- timestamp=snap.timestamp,
195
- url=snap.url,
196
- viewport=snap.viewport,
298
+ status=snap_with_diff.status,
299
+ timestamp=snap_with_diff.timestamp,
300
+ url=snap_with_diff.url,
301
+ viewport=snap_with_diff.viewport,
197
302
  elements=filtered_elements,
198
- screenshot=snap.screenshot,
199
- screenshot_format=snap.screenshot_format,
200
- error=snap.error,
303
+ screenshot=snap_with_diff.screenshot,
304
+ screenshot_format=snap_with_diff.screenshot_format,
305
+ error=snap_with_diff.error,
201
306
  )
202
307
 
203
308
  # 2. GROUND: Format elements for LLM context
204
- context = self._build_context(filtered_snap, goal)
309
+ context = self.llm_handler.build_context(filtered_snap, goal)
205
310
 
206
311
  # 3. THINK: Query LLM for next action
207
- llm_response = self._query_llm(context, goal)
312
+ llm_response = self.llm_handler.query_llm(context, goal)
208
313
 
209
314
  # Emit LLM query trace event if tracer is enabled
210
315
  if self.tracer:
211
- self.tracer.emit(
316
+ _safe_tracer_call(
317
+ self.tracer,
318
+ "emit",
319
+ self.verbose,
212
320
  "llm_query",
213
321
  {
214
322
  "prompt_tokens": llm_response.prompt_tokens,
@@ -226,10 +334,10 @@ class SentienceAgent(BaseAgent):
226
334
  self._track_tokens(goal, llm_response)
227
335
 
228
336
  # Parse action from LLM response
229
- action_str = self._extract_action_from_response(llm_response.content)
337
+ action_str = self.llm_handler.extract_action(llm_response.content)
230
338
 
231
339
  # 4. EXECUTE: Parse and run action
232
- result_dict = self._execute_action(action_str, filtered_snap)
340
+ result_dict = self.action_executor.execute(action_str, filtered_snap)
233
341
 
234
342
  duration_ms = int((time.time() - start_time) * 1000)
235
343
 
@@ -247,6 +355,7 @@ class SentienceAgent(BaseAgent):
247
355
  url_changed=result_dict.get("url_changed"),
248
356
  error=result_dict.get("error"),
249
357
  message=result_dict.get("message"),
358
+ cursor=result_dict.get("cursor"),
250
359
  )
251
360
 
252
361
  # Emit action execution trace event if tracer is enabled
@@ -269,7 +378,10 @@ class SentienceAgent(BaseAgent):
269
378
  for el in filtered_snap.elements[:50]
270
379
  ]
271
380
 
272
- self.tracer.emit(
381
+ _safe_tracer_call(
382
+ self.tracer,
383
+ "emit",
384
+ self.verbose,
273
385
  "action",
274
386
  {
275
387
  "action": result.action,
@@ -280,6 +392,7 @@ class SentienceAgent(BaseAgent):
280
392
  "post_url": post_url,
281
393
  "elements": elements_data, # Add element data for overlay
282
394
  "target_element_id": result.element_id, # Highlight target in red
395
+ "cursor": result.cursor,
283
396
  },
284
397
  step_id=step_id,
285
398
  )
@@ -302,13 +415,107 @@ class SentienceAgent(BaseAgent):
302
415
 
303
416
  # Emit step completion trace event if tracer is enabled
304
417
  if self.tracer:
305
- self.tracer.emit(
306
- "step_end",
307
- {
308
- "success": result.success,
309
- "duration_ms": duration_ms,
310
- "action": result.action,
418
+ # Get pre_url from step_start (stored in tracer or use current)
419
+ pre_url = snap.url
420
+ post_url = self.browser.page.url if self.browser.page else None
421
+
422
+ # Compute snapshot digest (simplified - use URL + timestamp)
423
+ snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"
424
+
425
+ # Build LLM data
426
+ llm_response_text = llm_response.content
427
+ llm_response_hash = f"sha256:{self._compute_hash(llm_response_text)}"
428
+ llm_data = {
429
+ "response_text": llm_response_text,
430
+ "response_hash": llm_response_hash,
431
+ "usage": {
432
+ "prompt_tokens": llm_response.prompt_tokens or 0,
433
+ "completion_tokens": llm_response.completion_tokens or 0,
434
+ "total_tokens": llm_response.total_tokens or 0,
311
435
  },
436
+ }
437
+
438
+ # Build exec data
439
+ exec_data = {
440
+ "success": result.success,
441
+ "action": result.action,
442
+ "outcome": result.outcome
443
+ or (
444
+ f"Action {result.action} executed successfully"
445
+ if result.success
446
+ else f"Action {result.action} failed"
447
+ ),
448
+ "duration_ms": duration_ms,
449
+ }
450
+ if result.cursor is not None:
451
+ exec_data["cursor"] = result.cursor
452
+
453
+ # Add optional exec fields
454
+ if result.element_id is not None:
455
+ exec_data["element_id"] = result.element_id
456
+ # Add bounding box if element found
457
+ bbox = self._get_element_bbox(result.element_id, snap)
458
+ if bbox:
459
+ exec_data["bounding_box"] = bbox
460
+ if result.text is not None:
461
+ exec_data["text"] = result.text
462
+ if result.key is not None:
463
+ exec_data["key"] = result.key
464
+ if result.error is not None:
465
+ exec_data["error"] = result.error
466
+
467
+ # Build verify data (simplified - based on success and url_changed)
468
+ verify_passed = result.success and (
469
+ result.url_changed or result.action != "click"
470
+ )
471
+ verify_signals = {
472
+ "url_changed": result.url_changed or False,
473
+ }
474
+ if result.error:
475
+ verify_signals["error"] = result.error
476
+
477
+ # Add elements_found array if element was targeted
478
+ if result.element_id is not None:
479
+ bbox = self._get_element_bbox(result.element_id, snap)
480
+ if bbox:
481
+ verify_signals["elements_found"] = [
482
+ {
483
+ "label": f"Element {result.element_id}",
484
+ "bounding_box": bbox,
485
+ }
486
+ ]
487
+
488
+ verify_data = {
489
+ "passed": verify_passed,
490
+ "signals": verify_signals,
491
+ }
492
+
493
+ # Build elements data for pre field (include diff_status from snap_with_diff)
494
+ # Use the same format as build_snapshot_event for consistency
495
+ snapshot_event_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
496
+ pre_elements = snapshot_event_data.get("elements", [])
497
+
498
+ # Build complete step_end event
499
+ step_end_data = TraceEventBuilder.build_step_end_event(
500
+ step_id=step_id,
501
+ step_index=self._step_count,
502
+ goal=goal,
503
+ attempt=attempt,
504
+ pre_url=pre_url,
505
+ post_url=post_url,
506
+ snapshot_digest=snapshot_digest,
507
+ llm_data=llm_data,
508
+ exec_data=exec_data,
509
+ verify_data=verify_data,
510
+ pre_elements=pre_elements,
511
+ )
512
+
513
+ _safe_tracer_call(
514
+ self.tracer,
515
+ "emit",
516
+ self.verbose,
517
+ "step_end",
518
+ step_end_data,
312
519
  step_id=step_id,
313
520
  )
314
521
 
@@ -317,7 +524,14 @@ class SentienceAgent(BaseAgent):
317
524
  except Exception as e:
318
525
  # Emit error trace event if tracer is enabled
319
526
  if self.tracer:
320
- self.tracer.emit_error(step_id=step_id, error=str(e), attempt=attempt)
527
+ _safe_tracer_call(
528
+ self.tracer,
529
+ "emit_error",
530
+ self.verbose,
531
+ step_id=step_id,
532
+ error=str(e),
533
+ attempt=attempt,
534
+ )
321
535
 
322
536
  if attempt < max_retries:
323
537
  if self.verbose:
@@ -346,195 +560,573 @@ class SentienceAgent(BaseAgent):
346
560
  )
347
561
  raise RuntimeError(f"Failed after {max_retries} retries: {e}")
348
562
 
349
- def _build_context(self, snap: Snapshot, goal: str) -> str:
563
+ def _track_tokens(self, goal: str, llm_response: LLMResponse):
350
564
  """
351
- Convert snapshot elements to token-efficient prompt string
352
-
353
- Format: [ID] <role> "text" {cues} @ (x,y) (Imp:score)
565
+ Track token usage for analytics
354
566
 
355
567
  Args:
356
- snap: Snapshot object
357
- goal: User goal (for context)
568
+ goal: User goal
569
+ llm_response: LLM response with token usage
570
+ """
571
+ if llm_response.prompt_tokens:
572
+ self._token_usage_raw["total_prompt_tokens"] += llm_response.prompt_tokens
573
+ if llm_response.completion_tokens:
574
+ self._token_usage_raw["total_completion_tokens"] += llm_response.completion_tokens
575
+ if llm_response.total_tokens:
576
+ self._token_usage_raw["total_tokens"] += llm_response.total_tokens
577
+
578
+ self._token_usage_raw["by_action"].append(
579
+ {
580
+ "goal": goal,
581
+ "prompt_tokens": llm_response.prompt_tokens or 0,
582
+ "completion_tokens": llm_response.completion_tokens or 0,
583
+ "total_tokens": llm_response.total_tokens or 0,
584
+ "model": llm_response.model_name,
585
+ }
586
+ )
587
+
588
+ def get_token_stats(self) -> TokenStats:
589
+ """
590
+ Get token usage statistics
358
591
 
359
592
  Returns:
360
- Formatted element context string
593
+ TokenStats with token usage breakdown
361
594
  """
362
- lines = []
363
- # Note: elements are already filtered by filter_elements() in act()
364
- for el in snap.elements:
365
- # Extract visual cues
366
- cues = []
367
- if el.visual_cues.is_primary:
368
- cues.append("PRIMARY")
369
- if el.visual_cues.is_clickable:
370
- cues.append("CLICKABLE")
371
- if el.visual_cues.background_color_name:
372
- cues.append(f"color:{el.visual_cues.background_color_name}")
373
-
374
- # Format element line
375
- cues_str = f" {{{','.join(cues)}}}" if cues else ""
376
- text_preview = (
377
- (el.text[:50] + "...") if el.text and len(el.text) > 50 else (el.text or "")
378
- )
595
+ by_action = [ActionTokenUsage(**action) for action in self._token_usage_raw["by_action"]]
596
+ return TokenStats(
597
+ total_prompt_tokens=self._token_usage_raw["total_prompt_tokens"],
598
+ total_completion_tokens=self._token_usage_raw["total_completion_tokens"],
599
+ total_tokens=self._token_usage_raw["total_tokens"],
600
+ by_action=by_action,
601
+ )
379
602
 
380
- lines.append(
381
- f'[{el.id}] <{el.role}> "{text_preview}"{cues_str} '
382
- f"@ ({int(el.bbox.x)},{int(el.bbox.y)}) (Imp:{el.importance})"
383
- )
603
+ def get_history(self) -> list[ActionHistory]:
604
+ """
605
+ Get execution history
606
+
607
+ Returns:
608
+ List of ActionHistory entries
609
+ """
610
+ return [ActionHistory(**h) for h in self.history]
384
611
 
385
- return "\n".join(lines)
612
+ def clear_history(self) -> None:
613
+ """Clear execution history and reset token counters"""
614
+ self.history.clear()
615
+ self._token_usage_raw = {
616
+ "total_prompt_tokens": 0,
617
+ "total_completion_tokens": 0,
618
+ "total_tokens": 0,
619
+ "by_action": [],
620
+ }
386
621
 
387
- def _extract_action_from_response(self, response: str) -> str:
622
+ def filter_elements(self, snapshot: Snapshot, goal: str | None = None) -> list[Element]:
388
623
  """
389
- Extract action command from LLM response, handling cases where
390
- the LLM adds extra explanation despite instructions.
624
+ Filter elements from snapshot based on goal context.
625
+
626
+ This implementation uses ElementFilter to apply goal-based keyword matching
627
+ to boost relevant elements and filters out irrelevant ones.
391
628
 
392
629
  Args:
393
- response: Raw LLM response text
630
+ snapshot: Current page snapshot
631
+ goal: User's goal (can inform filtering)
394
632
 
395
633
  Returns:
396
- Cleaned action command string
634
+ Filtered list of elements
397
635
  """
398
- import re
636
+ return ElementFilter.filter_by_goal(snapshot, goal, self.default_snapshot_limit)
399
637
 
400
- # Remove markdown code blocks if present
401
- response = re.sub(r"```[\w]*\n?", "", response)
402
- response = response.strip()
403
638
 
404
- # Try to find action patterns in the response
405
- # Pattern matches: CLICK(123), TYPE(123, "text"), PRESS("key"), FINISH()
406
- action_pattern = r'(CLICK\s*\(\s*\d+\s*\)|TYPE\s*\(\s*\d+\s*,\s*["\'].*?["\']\s*\)|PRESS\s*\(\s*["\'].*?["\']\s*\)|FINISH\s*\(\s*\))'
639
+ class SentienceAgentAsync(BaseAgentAsync):
640
+ """
641
+ High-level async agent that combines Sentience SDK with any LLM provider.
407
642
 
408
- match = re.search(action_pattern, response, re.IGNORECASE)
409
- if match:
410
- return match.group(1)
643
+ Uses observe-think-act loop to execute natural language commands:
644
+ 1. OBSERVE: Get snapshot of current page state
645
+ 2. THINK: Query LLM to decide next action
646
+ 3. ACT: Execute action using SDK
411
647
 
412
- # If no pattern match, return the original response (will likely fail parsing)
413
- return response
648
+ Example:
649
+ >>> from sentience.async_api import AsyncSentienceBrowser
650
+ >>> from sentience.agent import SentienceAgentAsync
651
+ >>> from sentience.llm_provider import OpenAIProvider
652
+ >>>
653
+ >>> async with AsyncSentienceBrowser() as browser:
654
+ >>> await browser.goto("https://google.com")
655
+ >>> llm = OpenAIProvider(api_key="openai_key", model="gpt-4o")
656
+ >>> agent = SentienceAgentAsync(browser, llm)
657
+ >>> await agent.act("Click the search box")
658
+ >>> await agent.act("Type 'magic mouse' into the search field")
659
+ >>> await agent.act("Press Enter key")
660
+ """
414
661
 
415
- def _query_llm(self, dom_context: str, goal: str) -> LLMResponse:
662
+ def __init__(
663
+ self,
664
+ browser: AsyncSentienceBrowser,
665
+ llm: LLMProvider,
666
+ default_snapshot_limit: int = 50,
667
+ verbose: bool = True,
668
+ tracer: Optional["Tracer"] = None,
669
+ config: Optional["AgentConfig"] = None,
670
+ ):
416
671
  """
417
- Query LLM with standardized prompt template
672
+ Initialize Sentience Agent (async)
418
673
 
419
674
  Args:
420
- dom_context: Formatted element context
421
- goal: User goal
422
-
423
- Returns:
424
- LLMResponse from LLM provider
675
+ browser: AsyncSentienceBrowser instance
676
+ llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.)
677
+ default_snapshot_limit: Default maximum elements to include in context (default: 50)
678
+ verbose: Print execution logs (default: True)
679
+ tracer: Optional Tracer instance for execution tracking (default: None)
680
+ config: Optional AgentConfig for advanced configuration (default: None)
425
681
  """
426
- system_prompt = f"""You are an AI web automation agent.
427
-
428
- GOAL: {goal}
429
-
430
- VISIBLE ELEMENTS (sorted by importance):
431
- {dom_context}
432
-
433
- VISUAL CUES EXPLAINED:
434
- - {{PRIMARY}}: Main call-to-action element on the page
435
- - {{CLICKABLE}}: Element is clickable
436
- - {{color:X}}: Background color name
437
-
438
- CRITICAL RESPONSE FORMAT:
439
- You MUST respond with ONLY ONE of these exact action formats:
440
- - CLICK(id) - Click element by ID
441
- - TYPE(id, "text") - Type text into element
442
- - PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
443
- - FINISH() - Task complete
444
-
445
- DO NOT include any explanation, reasoning, or natural language.
446
- DO NOT use markdown formatting or code blocks.
447
- DO NOT say "The next step is..." or anything similar.
448
-
449
- CORRECT Examples:
450
- CLICK(42)
451
- TYPE(15, "magic mouse")
452
- PRESS("Enter")
453
- FINISH()
454
-
455
- INCORRECT Examples (DO NOT DO THIS):
456
- "The next step is to click..."
457
- "I will type..."
458
- ```CLICK(42)```
459
- """
682
+ self.browser = browser
683
+ self.llm = llm
684
+ self.default_snapshot_limit = default_snapshot_limit
685
+ self.verbose = verbose
686
+ self.tracer = tracer
687
+ self.config = config or AgentConfig()
460
688
 
461
- user_prompt = "Return the single action command:"
689
+ # Initialize handlers
690
+ self.llm_handler = LLMInteractionHandler(llm)
691
+ self.action_executor = ActionExecutor(browser)
462
692
 
463
- return self.llm.generate(system_prompt, user_prompt, temperature=0.0)
693
+ # Screenshot sequence counter
694
+ # Execution history
695
+ self.history: list[dict[str, Any]] = []
464
696
 
465
- def _execute_action(self, action_str: str, snap: Snapshot) -> dict[str, Any]:
697
+ # Token usage tracking (will be converted to TokenStats on get_token_stats())
698
+ self._token_usage_raw = {
699
+ "total_prompt_tokens": 0,
700
+ "total_completion_tokens": 0,
701
+ "total_tokens": 0,
702
+ "by_action": [],
703
+ }
704
+
705
+ # Step counter for tracing
706
+ self._step_count = 0
707
+
708
+ # Previous snapshot for diff detection
709
+ self._previous_snapshot: Snapshot | None = None
710
+
711
+ def _compute_hash(self, text: str) -> str:
712
+ """Compute SHA256 hash of text."""
713
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()
714
+
715
+ def _get_element_bbox(self, element_id: int | None, snap: Snapshot) -> dict[str, float] | None:
716
+ """Get bounding box for an element from snapshot."""
717
+ if element_id is None:
718
+ return None
719
+ for el in snap.elements:
720
+ if el.id == element_id:
721
+ return {
722
+ "x": el.bbox.x,
723
+ "y": el.bbox.y,
724
+ "width": el.bbox.width,
725
+ "height": el.bbox.height,
726
+ }
727
+ return None
728
+
729
+ async def act( # noqa: C901
730
+ self,
731
+ goal: str,
732
+ max_retries: int = 2,
733
+ snapshot_options: SnapshotOptions | None = None,
734
+ ) -> AgentActionResult:
466
735
  """
467
- Parse action string and execute SDK call
736
+ Execute a high-level goal using observe → think → act loop (async)
468
737
 
469
738
  Args:
470
- action_str: Action string from LLM (e.g., "CLICK(42)")
471
- snap: Current snapshot (for context)
739
+ goal: Natural language instruction (e.g., "Click the Sign In button")
740
+ max_retries: Number of retries on failure (default: 2)
741
+ snapshot_options: Optional SnapshotOptions for this specific action
472
742
 
473
743
  Returns:
474
- Execution result dictionary
744
+ AgentActionResult with execution details
745
+
746
+ Example:
747
+ >>> result = await agent.act("Click the search box")
748
+ >>> print(result.success, result.action, result.element_id)
749
+ True click 42
475
750
  """
476
- # Parse CLICK(42)
477
- if match := re.match(r"CLICK\s*\(\s*(\d+)\s*\)", action_str, re.IGNORECASE):
478
- element_id = int(match.group(1))
479
- result = click(self.browser, element_id)
480
- return {
481
- "success": result.success,
482
- "action": "click",
483
- "element_id": element_id,
484
- "outcome": result.outcome,
485
- "url_changed": result.url_changed,
486
- }
751
+ if self.verbose:
752
+ print(f"\n{'=' * 70}")
753
+ print(f"🤖 Agent Goal: {goal}")
754
+ print(f"{'=' * 70}")
487
755
 
488
- # Parse TYPE(42, "hello world")
489
- elif match := re.match(
490
- r'TYPE\s*\(\s*(\d+)\s*,\s*["\']([^"\']*)["\']\s*\)',
491
- action_str,
492
- re.IGNORECASE,
493
- ):
494
- element_id = int(match.group(1))
495
- text = match.group(2)
496
- result = type_text(self.browser, element_id, text)
497
- return {
498
- "success": result.success,
499
- "action": "type",
500
- "element_id": element_id,
501
- "text": text,
502
- "outcome": result.outcome,
503
- }
756
+ # Generate step ID for tracing
757
+ self._step_count += 1
758
+ step_id = f"step-{self._step_count}"
504
759
 
505
- # Parse PRESS("Enter")
506
- elif match := re.match(r'PRESS\s*\(\s*["\']([^"\']+)["\']\s*\)', action_str, re.IGNORECASE):
507
- key = match.group(1)
508
- result = press(self.browser, key)
509
- return {
510
- "success": result.success,
511
- "action": "press",
512
- "key": key,
513
- "outcome": result.outcome,
514
- }
760
+ # Emit step_start trace event if tracer is enabled
761
+ if self.tracer:
762
+ pre_url = self.browser.page.url if self.browser.page else None
763
+ _safe_tracer_call(
764
+ self.tracer,
765
+ "emit_step_start",
766
+ self.verbose,
767
+ step_id=step_id,
768
+ step_index=self._step_count,
769
+ goal=goal,
770
+ attempt=0,
771
+ pre_url=pre_url,
772
+ )
515
773
 
516
- # Parse FINISH()
517
- elif re.match(r"FINISH\s*\(\s*\)", action_str, re.IGNORECASE):
518
- return {
519
- "success": True,
520
- "action": "finish",
521
- "message": "Task marked as complete",
522
- }
774
+ for attempt in range(max_retries + 1):
775
+ try:
776
+ # 1. OBSERVE: Get refined semantic snapshot
777
+ start_time = time.time()
523
778
 
524
- else:
525
- raise ValueError(
526
- f"Unknown action format: {action_str}\n"
527
- f'Expected: CLICK(id), TYPE(id, "text"), PRESS("key"), or FINISH()'
528
- )
779
+ # Use provided options or create default
780
+ snap_opts = snapshot_options or SnapshotOptions(limit=self.default_snapshot_limit)
781
+ # Only set goal if not already provided
782
+ if snap_opts.goal is None:
783
+ snap_opts.goal = goal
529
784
 
530
- def _track_tokens(self, goal: str, llm_response: LLMResponse):
531
- """
532
- Track token usage for analytics
785
+ # Apply AgentConfig screenshot settings if not overridden by snapshot_options
786
+ # Only apply if snapshot_options wasn't provided OR if screenshot wasn't explicitly set
787
+ # (snapshot_options.screenshot defaults to False, so we check if it's still False)
788
+ if self.config and (snapshot_options is None or snap_opts.screenshot is False):
789
+ if self.config.capture_screenshots:
790
+ # Create ScreenshotConfig from AgentConfig
791
+ snap_opts.screenshot = ScreenshotConfig(
792
+ format=self.config.screenshot_format,
793
+ quality=(
794
+ self.config.screenshot_quality
795
+ if self.config.screenshot_format == "jpeg"
796
+ else None
797
+ ),
798
+ )
799
+ else:
800
+ snap_opts.screenshot = False
801
+ # Apply show_overlay from AgentConfig
802
+ # Note: User can override by explicitly passing show_overlay in snapshot_options
803
+ snap_opts.show_overlay = self.config.show_overlay
533
804
 
534
- Args:
535
- goal: User goal
536
- llm_response: LLM response with token usage
537
- """
805
+ # Call snapshot with options object (matches TypeScript API)
806
+ snap = await snapshot_async(self.browser, snap_opts)
807
+
808
+ if snap.status != "success":
809
+ raise RuntimeError(f"Snapshot failed: {snap.error}")
810
+
811
+ # Compute diff_status by comparing with previous snapshot
812
+ elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)
813
+
814
+ # Create snapshot with diff_status populated
815
+ snap_with_diff = Snapshot(
816
+ status=snap.status,
817
+ timestamp=snap.timestamp,
818
+ url=snap.url,
819
+ viewport=snap.viewport,
820
+ elements=elements_with_diff,
821
+ screenshot=snap.screenshot,
822
+ screenshot_format=snap.screenshot_format,
823
+ error=snap.error,
824
+ )
825
+
826
+ # Update previous snapshot for next comparison
827
+ self._previous_snapshot = snap
828
+
829
+ # Apply element filtering based on goal
830
+ filtered_elements = self.filter_elements(snap_with_diff, goal)
831
+
832
+ # Emit snapshot trace event if tracer is enabled
833
+ if self.tracer:
834
+ # Build snapshot event data (use snap_with_diff to include diff_status)
835
+ snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
836
+
837
+ # Always include screenshot in trace event for studio viewer compatibility
838
+ # CloudTraceSink will extract and upload screenshots separately, then remove
839
+ # screenshot_base64 from events before uploading the trace file.
840
+ if snap.screenshot:
841
+ # Extract base64 string from data URL if needed
842
+ if snap.screenshot.startswith("data:image"):
843
+ # Format: "data:image/jpeg;base64,{base64_string}"
844
+ screenshot_base64 = (
845
+ snap.screenshot.split(",", 1)[1]
846
+ if "," in snap.screenshot
847
+ else snap.screenshot
848
+ )
849
+ else:
850
+ screenshot_base64 = snap.screenshot
851
+
852
+ snapshot_data["screenshot_base64"] = screenshot_base64
853
+ if snap.screenshot_format:
854
+ snapshot_data["screenshot_format"] = snap.screenshot_format
855
+
856
+ _safe_tracer_call(
857
+ self.tracer,
858
+ "emit",
859
+ self.verbose,
860
+ "snapshot",
861
+ snapshot_data,
862
+ step_id=step_id,
863
+ )
864
+
865
+ # Create filtered snapshot (use snap_with_diff to preserve metadata)
866
+ filtered_snap = Snapshot(
867
+ status=snap_with_diff.status,
868
+ timestamp=snap_with_diff.timestamp,
869
+ url=snap_with_diff.url,
870
+ viewport=snap_with_diff.viewport,
871
+ elements=filtered_elements,
872
+ screenshot=snap_with_diff.screenshot,
873
+ screenshot_format=snap_with_diff.screenshot_format,
874
+ error=snap_with_diff.error,
875
+ )
876
+
877
+ # 2. GROUND: Format elements for LLM context
878
+ context = self.llm_handler.build_context(filtered_snap, goal)
879
+
880
+ # 3. THINK: Query LLM for next action
881
+ llm_response = self.llm_handler.query_llm(context, goal)
882
+
883
+ # Emit LLM query trace event if tracer is enabled
884
+ if self.tracer:
885
+ _safe_tracer_call(
886
+ self.tracer,
887
+ "emit",
888
+ self.verbose,
889
+ "llm_query",
890
+ {
891
+ "prompt_tokens": llm_response.prompt_tokens,
892
+ "completion_tokens": llm_response.completion_tokens,
893
+ "model": llm_response.model_name,
894
+ "response": llm_response.content[:200], # Truncate for brevity
895
+ },
896
+ step_id=step_id,
897
+ )
898
+
899
+ if self.verbose:
900
+ print(f"🧠 LLM Decision: {llm_response.content}")
901
+
902
+ # Track token usage
903
+ self._track_tokens(goal, llm_response)
904
+
905
+ # Parse action from LLM response
906
+ action_str = self.llm_handler.extract_action(llm_response.content)
907
+
908
+ # 4. EXECUTE: Parse and run action
909
+ result_dict = await self.action_executor.execute_async(action_str, filtered_snap)
910
+
911
+ duration_ms = int((time.time() - start_time) * 1000)
912
+
913
+ # Create AgentActionResult from execution result
914
+ result = AgentActionResult(
915
+ success=result_dict["success"],
916
+ action=result_dict["action"],
917
+ goal=goal,
918
+ duration_ms=duration_ms,
919
+ attempt=attempt,
920
+ element_id=result_dict.get("element_id"),
921
+ text=result_dict.get("text"),
922
+ key=result_dict.get("key"),
923
+ outcome=result_dict.get("outcome"),
924
+ url_changed=result_dict.get("url_changed"),
925
+ error=result_dict.get("error"),
926
+ message=result_dict.get("message"),
927
+ )
928
+
929
+ # Emit action execution trace event if tracer is enabled
930
+ if self.tracer:
931
+ post_url = self.browser.page.url if self.browser.page else None
932
+
933
+ # Include element data for live overlay visualization
934
+ elements_data = [
935
+ {
936
+ "id": el.id,
937
+ "bbox": {
938
+ "x": el.bbox.x,
939
+ "y": el.bbox.y,
940
+ "width": el.bbox.width,
941
+ "height": el.bbox.height,
942
+ },
943
+ "role": el.role,
944
+ "text": el.text[:50] if el.text else "",
945
+ }
946
+ for el in filtered_snap.elements[:50]
947
+ ]
948
+
949
+ _safe_tracer_call(
950
+ self.tracer,
951
+ "emit",
952
+ self.verbose,
953
+ "action",
954
+ {
955
+ "action": result.action,
956
+ "element_id": result.element_id,
957
+ "success": result.success,
958
+ "outcome": result.outcome,
959
+ "duration_ms": duration_ms,
960
+ "post_url": post_url,
961
+ "elements": elements_data, # Add element data for overlay
962
+ "target_element_id": result.element_id, # Highlight target in red
963
+ },
964
+ step_id=step_id,
965
+ )
966
+
967
+ # 5. RECORD: Track history
968
+ self.history.append(
969
+ {
970
+ "goal": goal,
971
+ "action": action_str,
972
+ "result": result.model_dump(), # Store as dict
973
+ "success": result.success,
974
+ "attempt": attempt,
975
+ "duration_ms": duration_ms,
976
+ }
977
+ )
978
+
979
+ if self.verbose:
980
+ status = "✅" if result.success else "❌"
981
+ print(f"{status} Completed in {duration_ms}ms")
982
+
983
+ # Emit step completion trace event if tracer is enabled
984
+ if self.tracer:
985
+ # Get pre_url from step_start (stored in tracer or use current)
986
+ pre_url = snap.url
987
+ post_url = self.browser.page.url if self.browser.page else None
988
+
989
+ # Compute snapshot digest (simplified - use URL + timestamp)
990
+ snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"
991
+
992
+ # Build LLM data
993
+ llm_response_text = llm_response.content
994
+ llm_response_hash = f"sha256:{self._compute_hash(llm_response_text)}"
995
+ llm_data = {
996
+ "response_text": llm_response_text,
997
+ "response_hash": llm_response_hash,
998
+ "usage": {
999
+ "prompt_tokens": llm_response.prompt_tokens or 0,
1000
+ "completion_tokens": llm_response.completion_tokens or 0,
1001
+ "total_tokens": llm_response.total_tokens or 0,
1002
+ },
1003
+ }
1004
+
1005
+ # Build exec data
1006
+ exec_data = {
1007
+ "success": result.success,
1008
+ "action": result.action,
1009
+ "outcome": result.outcome
1010
+ or (
1011
+ f"Action {result.action} executed successfully"
1012
+ if result.success
1013
+ else f"Action {result.action} failed"
1014
+ ),
1015
+ "duration_ms": duration_ms,
1016
+ }
1017
+
1018
+ # Add optional exec fields
1019
+ if result.element_id is not None:
1020
+ exec_data["element_id"] = result.element_id
1021
+ # Add bounding box if element found
1022
+ bbox = self._get_element_bbox(result.element_id, snap)
1023
+ if bbox:
1024
+ exec_data["bounding_box"] = bbox
1025
+ if result.text is not None:
1026
+ exec_data["text"] = result.text
1027
+ if result.key is not None:
1028
+ exec_data["key"] = result.key
1029
+ if result.error is not None:
1030
+ exec_data["error"] = result.error
1031
+
1032
+ # Build verify data (simplified - based on success and url_changed)
1033
+ verify_passed = result.success and (
1034
+ result.url_changed or result.action != "click"
1035
+ )
1036
+ verify_signals = {
1037
+ "url_changed": result.url_changed or False,
1038
+ }
1039
+ if result.error:
1040
+ verify_signals["error"] = result.error
1041
+
1042
+ # Add elements_found array if element was targeted
1043
+ if result.element_id is not None:
1044
+ bbox = self._get_element_bbox(result.element_id, snap)
1045
+ if bbox:
1046
+ verify_signals["elements_found"] = [
1047
+ {
1048
+ "label": f"Element {result.element_id}",
1049
+ "bounding_box": bbox,
1050
+ }
1051
+ ]
1052
+
1053
+ verify_data = {
1054
+ "passed": verify_passed,
1055
+ "signals": verify_signals,
1056
+ }
1057
+
1058
+ # Build elements data for pre field (include diff_status from snap_with_diff)
1059
+ # Use the same format as build_snapshot_event for consistency
1060
+ snapshot_event_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
1061
+ pre_elements = snapshot_event_data.get("elements", [])
1062
+
1063
+ # Build complete step_end event
1064
+ step_end_data = TraceEventBuilder.build_step_end_event(
1065
+ step_id=step_id,
1066
+ step_index=self._step_count,
1067
+ goal=goal,
1068
+ attempt=attempt,
1069
+ pre_url=pre_url,
1070
+ post_url=post_url,
1071
+ snapshot_digest=snapshot_digest,
1072
+ llm_data=llm_data,
1073
+ exec_data=exec_data,
1074
+ verify_data=verify_data,
1075
+ pre_elements=pre_elements,
1076
+ )
1077
+
1078
+ _safe_tracer_call(
1079
+ self.tracer,
1080
+ "emit",
1081
+ self.verbose,
1082
+ "step_end",
1083
+ step_end_data,
1084
+ step_id=step_id,
1085
+ )
1086
+
1087
+ return result
1088
+
1089
+ except Exception as e:
1090
+ # Emit error trace event if tracer is enabled
1091
+ if self.tracer:
1092
+ _safe_tracer_call(
1093
+ self.tracer,
1094
+ "emit_error",
1095
+ self.verbose,
1096
+ step_id=step_id,
1097
+ error=str(e),
1098
+ attempt=attempt,
1099
+ )
1100
+
1101
+ if attempt < max_retries:
1102
+ if self.verbose:
1103
+ print(f"⚠️ Retry {attempt + 1}/{max_retries}: {e}")
1104
+ await asyncio.sleep(1.0) # Brief delay before retry
1105
+ continue
1106
+ else:
1107
+ # Create error result
1108
+ error_result = AgentActionResult(
1109
+ success=False,
1110
+ action="error",
1111
+ goal=goal,
1112
+ duration_ms=0,
1113
+ attempt=attempt,
1114
+ error=str(e),
1115
+ )
1116
+ self.history.append(
1117
+ {
1118
+ "goal": goal,
1119
+ "action": "error",
1120
+ "result": error_result.model_dump(),
1121
+ "success": False,
1122
+ "attempt": attempt,
1123
+ "duration_ms": 0,
1124
+ }
1125
+ )
1126
+ raise RuntimeError(f"Failed after {max_retries} retries: {e}")
1127
+
1128
+ def _track_tokens(self, goal: str, llm_response: LLMResponse):
1129
+ """Track token usage for analytics (same as sync version)"""
538
1130
  if llm_response.prompt_tokens:
539
1131
  self._token_usage_raw["total_prompt_tokens"] += llm_response.prompt_tokens
540
1132
  if llm_response.completion_tokens:
@@ -553,12 +1145,7 @@ INCORRECT Examples (DO NOT DO THIS):
553
1145
  )
554
1146
 
555
1147
  def get_token_stats(self) -> TokenStats:
556
- """
557
- Get token usage statistics
558
-
559
- Returns:
560
- TokenStats with token usage breakdown
561
- """
1148
+ """Get token usage statistics (same as sync version)"""
562
1149
  by_action = [ActionTokenUsage(**action) for action in self._token_usage_raw["by_action"]]
563
1150
  return TokenStats(
564
1151
  total_prompt_tokens=self._token_usage_raw["total_prompt_tokens"],
@@ -568,16 +1155,11 @@ INCORRECT Examples (DO NOT DO THIS):
568
1155
  )
569
1156
 
570
1157
  def get_history(self) -> list[ActionHistory]:
571
- """
572
- Get execution history
573
-
574
- Returns:
575
- List of ActionHistory entries
576
- """
1158
+ """Get execution history (same as sync version)"""
577
1159
  return [ActionHistory(**h) for h in self.history]
578
1160
 
579
1161
  def clear_history(self) -> None:
580
- """Clear execution history and reset token counters"""
1162
+ """Clear execution history and reset token counters (same as sync version)"""
581
1163
  self.history.clear()
582
1164
  self._token_usage_raw = {
583
1165
  "total_prompt_tokens": 0,
@@ -590,8 +1172,8 @@ INCORRECT Examples (DO NOT DO THIS):
590
1172
  """
591
1173
  Filter elements from snapshot based on goal context.
592
1174
 
593
- This default implementation applies goal-based keyword matching to boost
594
- relevant elements and filters out irrelevant ones.
1175
+ This implementation uses ElementFilter to apply goal-based keyword matching
1176
+ to boost relevant elements and filters out irrelevant ones.
595
1177
 
596
1178
  Args:
597
1179
  snapshot: Current page snapshot
@@ -600,73 +1182,4 @@ INCORRECT Examples (DO NOT DO THIS):
600
1182
  Returns:
601
1183
  Filtered list of elements
602
1184
  """
603
- elements = snapshot.elements
604
-
605
- # If no goal provided, return all elements (up to limit)
606
- if not goal:
607
- return elements[: self.default_snapshot_limit]
608
-
609
- goal_lower = goal.lower()
610
-
611
- # Extract keywords from goal
612
- keywords = self._extract_keywords(goal_lower)
613
-
614
- # Boost elements matching goal keywords
615
- scored_elements = []
616
- for el in elements:
617
- score = el.importance
618
-
619
- # Boost if element text matches goal
620
- if el.text and any(kw in el.text.lower() for kw in keywords):
621
- score += 0.3
622
-
623
- # Boost if role matches goal intent
624
- if "click" in goal_lower and el.visual_cues.is_clickable:
625
- score += 0.2
626
- if "type" in goal_lower and el.role in ["textbox", "searchbox"]:
627
- score += 0.2
628
- if "search" in goal_lower:
629
- # Filter out non-interactive elements for search tasks
630
- if el.role in ["link", "img"] and not el.visual_cues.is_primary:
631
- score -= 0.5
632
-
633
- scored_elements.append((score, el))
634
-
635
- # Re-sort by boosted score
636
- scored_elements.sort(key=lambda x: x[0], reverse=True)
637
- elements = [el for _, el in scored_elements]
638
-
639
- return elements[: self.default_snapshot_limit]
640
-
641
- def _extract_keywords(self, text: str) -> list[str]:
642
- """
643
- Extract meaningful keywords from goal text
644
-
645
- Args:
646
- text: Text to extract keywords from
647
-
648
- Returns:
649
- List of keywords
650
- """
651
- stopwords = {
652
- "the",
653
- "a",
654
- "an",
655
- "and",
656
- "or",
657
- "but",
658
- "in",
659
- "on",
660
- "at",
661
- "to",
662
- "for",
663
- "of",
664
- "with",
665
- "by",
666
- "from",
667
- "as",
668
- "is",
669
- "was",
670
- }
671
- words = text.split()
672
- return [w for w in words if w not in stopwords and len(w) > 2]
1185
+ return ElementFilter.filter_by_goal(snapshot, goal, self.default_snapshot_limit)