sentienceapi 0.90.12__py3-none-any.whl → 0.92.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sentienceapi has been flagged as potentially problematic; consult the registry's advisory for this release for more details.

Files changed (63)
  1. sentience/__init__.py +14 -5
  2. sentience/_extension_loader.py +40 -0
  3. sentience/action_executor.py +215 -0
  4. sentience/actions.py +408 -25
  5. sentience/agent.py +804 -310
  6. sentience/agent_config.py +3 -0
  7. sentience/async_api.py +101 -0
  8. sentience/base_agent.py +95 -0
  9. sentience/browser.py +594 -25
  10. sentience/browser_evaluator.py +299 -0
  11. sentience/cloud_tracing.py +458 -36
  12. sentience/conversational_agent.py +79 -45
  13. sentience/element_filter.py +136 -0
  14. sentience/expect.py +98 -2
  15. sentience/extension/background.js +56 -185
  16. sentience/extension/content.js +117 -289
  17. sentience/extension/injected_api.js +799 -1374
  18. sentience/extension/manifest.json +1 -1
  19. sentience/extension/pkg/sentience_core.js +190 -396
  20. sentience/extension/pkg/sentience_core_bg.wasm +0 -0
  21. sentience/extension/release.json +47 -47
  22. sentience/formatting.py +9 -53
  23. sentience/inspector.py +183 -1
  24. sentience/llm_interaction_handler.py +191 -0
  25. sentience/llm_provider.py +256 -28
  26. sentience/llm_provider_utils.py +120 -0
  27. sentience/llm_response_builder.py +153 -0
  28. sentience/models.py +66 -1
  29. sentience/overlay.py +109 -2
  30. sentience/protocols.py +228 -0
  31. sentience/query.py +1 -1
  32. sentience/read.py +95 -3
  33. sentience/recorder.py +223 -3
  34. sentience/schemas/trace_v1.json +102 -9
  35. sentience/screenshot.py +48 -2
  36. sentience/sentience_methods.py +86 -0
  37. sentience/snapshot.py +309 -64
  38. sentience/snapshot_diff.py +141 -0
  39. sentience/text_search.py +119 -5
  40. sentience/trace_event_builder.py +129 -0
  41. sentience/trace_file_manager.py +197 -0
  42. sentience/trace_indexing/index_schema.py +95 -7
  43. sentience/trace_indexing/indexer.py +117 -14
  44. sentience/tracer_factory.py +119 -6
  45. sentience/tracing.py +172 -8
  46. sentience/utils/__init__.py +40 -0
  47. sentience/utils/browser.py +46 -0
  48. sentience/utils/element.py +257 -0
  49. sentience/utils/formatting.py +59 -0
  50. sentience/utils.py +1 -1
  51. sentience/visual_agent.py +2056 -0
  52. sentience/wait.py +70 -4
  53. {sentienceapi-0.90.12.dist-info → sentienceapi-0.92.2.dist-info}/METADATA +61 -22
  54. sentienceapi-0.92.2.dist-info/RECORD +65 -0
  55. sentienceapi-0.92.2.dist-info/licenses/LICENSE +24 -0
  56. sentienceapi-0.92.2.dist-info/licenses/LICENSE-APACHE +201 -0
  57. sentienceapi-0.92.2.dist-info/licenses/LICENSE-MIT +21 -0
  58. sentience/extension/test-content.js +0 -4
  59. sentienceapi-0.90.12.dist-info/RECORD +0 -46
  60. sentienceapi-0.90.12.dist-info/licenses/LICENSE.md +0 -43
  61. {sentienceapi-0.90.12.dist-info → sentienceapi-0.92.2.dist-info}/WHEEL +0 -0
  62. {sentienceapi-0.90.12.dist-info → sentienceapi-0.92.2.dist-info}/entry_points.txt +0 -0
  63. {sentienceapi-0.90.12.dist-info → sentienceapi-0.92.2.dist-info}/top_level.txt +0 -0
sentience/agent.py CHANGED
@@ -3,13 +3,17 @@ Sentience Agent: High-level automation agent using LLM + SDK
3
3
  Implements observe-think-act loop for natural language commands
4
4
  """
5
5
 
6
- import re
6
+ import asyncio
7
+ import hashlib
7
8
  import time
8
- from typing import TYPE_CHECKING, Any, Optional
9
-
10
- from .actions import click, press, type_text
11
- from .base_agent import BaseAgent
12
- from .browser import SentienceBrowser
9
+ from typing import TYPE_CHECKING, Any, Optional, Union
10
+
11
+ from .action_executor import ActionExecutor
12
+ from .agent_config import AgentConfig
13
+ from .base_agent import BaseAgent, BaseAgentAsync
14
+ from .browser import AsyncSentienceBrowser, SentienceBrowser
15
+ from .element_filter import ElementFilter
16
+ from .llm_interaction_handler import LLMInteractionHandler
13
17
  from .llm_provider import LLMProvider, LLMResponse
14
18
  from .models import (
15
19
  ActionHistory,
@@ -21,13 +25,46 @@ from .models import (
21
25
  SnapshotOptions,
22
26
  TokenStats,
23
27
  )
24
- from .snapshot import snapshot
28
+ from .protocols import AsyncBrowserProtocol, BrowserProtocol
29
+ from .snapshot import snapshot, snapshot_async
30
+ from .snapshot_diff import SnapshotDiff
31
+ from .trace_event_builder import TraceEventBuilder
25
32
 
26
33
  if TYPE_CHECKING:
27
- from .agent_config import AgentConfig
28
34
  from .tracing import Tracer
29
35
 
30
36
 
37
+ def _safe_tracer_call(
38
+ tracer: Optional["Tracer"], method_name: str, verbose: bool, *args, **kwargs
39
+ ) -> None:
40
+ """
41
+ Safely call tracer method, catching and logging errors without breaking execution.
42
+
43
+ Args:
44
+ tracer: Tracer instance or None
45
+ method_name: Name of tracer method to call (e.g., "emit", "emit_error")
46
+ verbose: Whether to print error messages
47
+ *args: Positional arguments for the tracer method
48
+ **kwargs: Keyword arguments for the tracer method
49
+ """
50
+ if not tracer:
51
+ return
52
+ try:
53
+ method = getattr(tracer, method_name)
54
+ if args and kwargs:
55
+ method(*args, **kwargs)
56
+ elif args:
57
+ method(*args)
58
+ elif kwargs:
59
+ method(**kwargs)
60
+ else:
61
+ method()
62
+ except Exception as tracer_error:
63
+ # Tracer errors should not break agent execution
64
+ if verbose:
65
+ print(f"⚠️ Tracer error (non-fatal): {tracer_error}")
66
+
67
+
31
68
  class SentienceAgent(BaseAgent):
32
69
  """
33
70
  High-level agent that combines Sentience SDK with any LLM provider.
@@ -54,7 +91,7 @@ class SentienceAgent(BaseAgent):
54
91
 
55
92
  def __init__(
56
93
  self,
57
- browser: SentienceBrowser,
94
+ browser: SentienceBrowser | BrowserProtocol,
58
95
  llm: LLMProvider,
59
96
  default_snapshot_limit: int = 50,
60
97
  verbose: bool = True,
@@ -65,7 +102,8 @@ class SentienceAgent(BaseAgent):
65
102
  Initialize Sentience Agent
66
103
 
67
104
  Args:
68
- browser: SentienceBrowser instance
105
+ browser: SentienceBrowser instance or BrowserProtocol-compatible object
106
+ (for testing, can use mock objects that implement BrowserProtocol)
69
107
  llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.)
70
108
  default_snapshot_limit: Default maximum elements to include in context (default: 50)
71
109
  verbose: Print execution logs (default: True)
@@ -77,8 +115,13 @@ class SentienceAgent(BaseAgent):
77
115
  self.default_snapshot_limit = default_snapshot_limit
78
116
  self.verbose = verbose
79
117
  self.tracer = tracer
80
- self.config = config
118
+ self.config = config or AgentConfig()
119
+
120
+ # Initialize handlers
121
+ self.llm_handler = LLMInteractionHandler(llm)
122
+ self.action_executor = ActionExecutor(browser)
81
123
 
124
+ # Screenshot sequence counter
82
125
  # Execution history
83
126
  self.history: list[dict[str, Any]] = []
84
127
 
@@ -93,6 +136,27 @@ class SentienceAgent(BaseAgent):
93
136
  # Step counter for tracing
94
137
  self._step_count = 0
95
138
 
139
+ # Previous snapshot for diff detection
140
+ self._previous_snapshot: Snapshot | None = None
141
+
142
+ def _compute_hash(self, text: str) -> str:
143
+ """Compute SHA256 hash of text."""
144
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()
145
+
146
+ def _get_element_bbox(self, element_id: int | None, snap: Snapshot) -> dict[str, float] | None:
147
+ """Get bounding box for an element from snapshot."""
148
+ if element_id is None:
149
+ return None
150
+ for el in snap.elements:
151
+ if el.id == element_id:
152
+ return {
153
+ "x": el.bbox.x,
154
+ "y": el.bbox.y,
155
+ "width": el.bbox.width,
156
+ "height": el.bbox.height,
157
+ }
158
+ return None
159
+
96
160
  def act( # noqa: C901
97
161
  self,
98
162
  goal: str,
@@ -130,7 +194,10 @@ class SentienceAgent(BaseAgent):
130
194
  # Emit step_start trace event if tracer is enabled
131
195
  if self.tracer:
132
196
  pre_url = self.browser.page.url if self.browser.page else None
133
- self.tracer.emit_step_start(
197
+ _safe_tracer_call(
198
+ self.tracer,
199
+ "emit_step_start",
200
+ self.verbose,
134
201
  step_id=step_id,
135
202
  step_index=self._step_count,
136
203
  goal=goal,
@@ -149,81 +216,107 @@ class SentienceAgent(BaseAgent):
149
216
  if snap_opts.goal is None:
150
217
  snap_opts.goal = goal
151
218
 
152
- # Convert screenshot config to dict if needed
153
- screenshot_param = snap_opts.screenshot
154
- if isinstance(snap_opts.screenshot, ScreenshotConfig):
155
- screenshot_param = {
156
- "format": snap_opts.screenshot.format,
157
- "quality": snap_opts.screenshot.quality,
158
- }
159
-
160
- # Call snapshot with converted parameters
161
- snap = snapshot(
162
- self.browser,
163
- screenshot=screenshot_param,
164
- limit=snap_opts.limit,
165
- filter=snap_opts.filter.model_dump() if snap_opts.filter else None,
166
- use_api=snap_opts.use_api,
167
- goal=snap_opts.goal, # Pass goal to snapshot
168
- )
219
+ # Apply AgentConfig screenshot settings if not overridden by snapshot_options
220
+ if snapshot_options is None and self.config:
221
+ if self.config.capture_screenshots:
222
+ # Create ScreenshotConfig from AgentConfig
223
+ snap_opts.screenshot = ScreenshotConfig(
224
+ format=self.config.screenshot_format,
225
+ quality=(
226
+ self.config.screenshot_quality
227
+ if self.config.screenshot_format == "jpeg"
228
+ else None
229
+ ),
230
+ )
231
+ else:
232
+ snap_opts.screenshot = False
233
+ # Apply show_overlay from AgentConfig
234
+ snap_opts.show_overlay = self.config.show_overlay
235
+
236
+ # Call snapshot with options object (matches TypeScript API)
237
+ snap = snapshot(self.browser, snap_opts)
169
238
 
170
239
  if snap.status != "success":
171
240
  raise RuntimeError(f"Snapshot failed: {snap.error}")
172
241
 
242
+ # Compute diff_status by comparing with previous snapshot
243
+ elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)
244
+
245
+ # Create snapshot with diff_status populated
246
+ snap_with_diff = Snapshot(
247
+ status=snap.status,
248
+ timestamp=snap.timestamp,
249
+ url=snap.url,
250
+ viewport=snap.viewport,
251
+ elements=elements_with_diff,
252
+ screenshot=snap.screenshot,
253
+ screenshot_format=snap.screenshot_format,
254
+ error=snap.error,
255
+ )
256
+
257
+ # Update previous snapshot for next comparison
258
+ self._previous_snapshot = snap
259
+
173
260
  # Apply element filtering based on goal
174
- filtered_elements = self.filter_elements(snap, goal)
261
+ filtered_elements = self.filter_elements(snap_with_diff, goal)
175
262
 
176
263
  # Emit snapshot trace event if tracer is enabled
177
264
  if self.tracer:
178
- # Include element data for live overlay visualization
179
- # Use filtered_elements for overlay (only relevant elements)
180
- elements_data = [
181
- {
182
- "id": el.id,
183
- "bbox": {
184
- "x": el.bbox.x,
185
- "y": el.bbox.y,
186
- "width": el.bbox.width,
187
- "height": el.bbox.height,
188
- },
189
- "role": el.role,
190
- "text": el.text[:50] if el.text else "", # Truncate for brevity
191
- }
192
- for el in filtered_elements[:50] # Limit to first 50 for performance
193
- ]
194
-
195
- self.tracer.emit(
265
+ # Build snapshot event data (use snap_with_diff to include diff_status)
266
+ snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
267
+
268
+ # Always include screenshot in trace event for studio viewer compatibility
269
+ # CloudTraceSink will extract and upload screenshots separately, then remove
270
+ # screenshot_base64 from events before uploading the trace file.
271
+ if snap.screenshot:
272
+ # Extract base64 string from data URL if needed
273
+ if snap.screenshot.startswith("data:image"):
274
+ # Format: "data:image/jpeg;base64,{base64_string}"
275
+ screenshot_base64 = (
276
+ snap.screenshot.split(",", 1)[1]
277
+ if "," in snap.screenshot
278
+ else snap.screenshot
279
+ )
280
+ else:
281
+ screenshot_base64 = snap.screenshot
282
+
283
+ snapshot_data["screenshot_base64"] = screenshot_base64
284
+ if snap.screenshot_format:
285
+ snapshot_data["screenshot_format"] = snap.screenshot_format
286
+
287
+ _safe_tracer_call(
288
+ self.tracer,
289
+ "emit",
290
+ self.verbose,
196
291
  "snapshot",
197
- {
198
- "url": snap.url,
199
- "element_count": len(snap.elements),
200
- "timestamp": snap.timestamp,
201
- "elements": elements_data, # Add element data for overlay
202
- },
292
+ snapshot_data,
203
293
  step_id=step_id,
204
294
  )
205
295
 
206
- # Create filtered snapshot
296
+ # Create filtered snapshot (use snap_with_diff to preserve metadata)
207
297
  filtered_snap = Snapshot(
208
- status=snap.status,
209
- timestamp=snap.timestamp,
210
- url=snap.url,
211
- viewport=snap.viewport,
298
+ status=snap_with_diff.status,
299
+ timestamp=snap_with_diff.timestamp,
300
+ url=snap_with_diff.url,
301
+ viewport=snap_with_diff.viewport,
212
302
  elements=filtered_elements,
213
- screenshot=snap.screenshot,
214
- screenshot_format=snap.screenshot_format,
215
- error=snap.error,
303
+ screenshot=snap_with_diff.screenshot,
304
+ screenshot_format=snap_with_diff.screenshot_format,
305
+ error=snap_with_diff.error,
216
306
  )
217
307
 
218
308
  # 2. GROUND: Format elements for LLM context
219
- context = self._build_context(filtered_snap, goal)
309
+ context = self.llm_handler.build_context(filtered_snap, goal)
220
310
 
221
311
  # 3. THINK: Query LLM for next action
222
- llm_response = self._query_llm(context, goal)
312
+ llm_response = self.llm_handler.query_llm(context, goal)
223
313
 
224
314
  # Emit LLM query trace event if tracer is enabled
225
315
  if self.tracer:
226
- self.tracer.emit(
316
+ _safe_tracer_call(
317
+ self.tracer,
318
+ "emit",
319
+ self.verbose,
227
320
  "llm_query",
228
321
  {
229
322
  "prompt_tokens": llm_response.prompt_tokens,
@@ -241,10 +334,10 @@ class SentienceAgent(BaseAgent):
241
334
  self._track_tokens(goal, llm_response)
242
335
 
243
336
  # Parse action from LLM response
244
- action_str = self._extract_action_from_response(llm_response.content)
337
+ action_str = self.llm_handler.extract_action(llm_response.content)
245
338
 
246
339
  # 4. EXECUTE: Parse and run action
247
- result_dict = self._execute_action(action_str, filtered_snap)
340
+ result_dict = self.action_executor.execute(action_str, filtered_snap)
248
341
 
249
342
  duration_ms = int((time.time() - start_time) * 1000)
250
343
 
@@ -284,7 +377,10 @@ class SentienceAgent(BaseAgent):
284
377
  for el in filtered_snap.elements[:50]
285
378
  ]
286
379
 
287
- self.tracer.emit(
380
+ _safe_tracer_call(
381
+ self.tracer,
382
+ "emit",
383
+ self.verbose,
288
384
  "action",
289
385
  {
290
386
  "action": result.action,
@@ -317,13 +413,105 @@ class SentienceAgent(BaseAgent):
317
413
 
318
414
  # Emit step completion trace event if tracer is enabled
319
415
  if self.tracer:
320
- self.tracer.emit(
321
- "step_end",
322
- {
323
- "success": result.success,
324
- "duration_ms": duration_ms,
325
- "action": result.action,
416
+ # Get pre_url from step_start (stored in tracer or use current)
417
+ pre_url = snap.url
418
+ post_url = self.browser.page.url if self.browser.page else None
419
+
420
+ # Compute snapshot digest (simplified - use URL + timestamp)
421
+ snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"
422
+
423
+ # Build LLM data
424
+ llm_response_text = llm_response.content
425
+ llm_response_hash = f"sha256:{self._compute_hash(llm_response_text)}"
426
+ llm_data = {
427
+ "response_text": llm_response_text,
428
+ "response_hash": llm_response_hash,
429
+ "usage": {
430
+ "prompt_tokens": llm_response.prompt_tokens or 0,
431
+ "completion_tokens": llm_response.completion_tokens or 0,
432
+ "total_tokens": llm_response.total_tokens or 0,
326
433
  },
434
+ }
435
+
436
+ # Build exec data
437
+ exec_data = {
438
+ "success": result.success,
439
+ "action": result.action,
440
+ "outcome": result.outcome
441
+ or (
442
+ f"Action {result.action} executed successfully"
443
+ if result.success
444
+ else f"Action {result.action} failed"
445
+ ),
446
+ "duration_ms": duration_ms,
447
+ }
448
+
449
+ # Add optional exec fields
450
+ if result.element_id is not None:
451
+ exec_data["element_id"] = result.element_id
452
+ # Add bounding box if element found
453
+ bbox = self._get_element_bbox(result.element_id, snap)
454
+ if bbox:
455
+ exec_data["bounding_box"] = bbox
456
+ if result.text is not None:
457
+ exec_data["text"] = result.text
458
+ if result.key is not None:
459
+ exec_data["key"] = result.key
460
+ if result.error is not None:
461
+ exec_data["error"] = result.error
462
+
463
+ # Build verify data (simplified - based on success and url_changed)
464
+ verify_passed = result.success and (
465
+ result.url_changed or result.action != "click"
466
+ )
467
+ verify_signals = {
468
+ "url_changed": result.url_changed or False,
469
+ }
470
+ if result.error:
471
+ verify_signals["error"] = result.error
472
+
473
+ # Add elements_found array if element was targeted
474
+ if result.element_id is not None:
475
+ bbox = self._get_element_bbox(result.element_id, snap)
476
+ if bbox:
477
+ verify_signals["elements_found"] = [
478
+ {
479
+ "label": f"Element {result.element_id}",
480
+ "bounding_box": bbox,
481
+ }
482
+ ]
483
+
484
+ verify_data = {
485
+ "passed": verify_passed,
486
+ "signals": verify_signals,
487
+ }
488
+
489
+ # Build elements data for pre field (include diff_status from snap_with_diff)
490
+ # Use the same format as build_snapshot_event for consistency
491
+ snapshot_event_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
492
+ pre_elements = snapshot_event_data.get("elements", [])
493
+
494
+ # Build complete step_end event
495
+ step_end_data = TraceEventBuilder.build_step_end_event(
496
+ step_id=step_id,
497
+ step_index=self._step_count,
498
+ goal=goal,
499
+ attempt=attempt,
500
+ pre_url=pre_url,
501
+ post_url=post_url,
502
+ snapshot_digest=snapshot_digest,
503
+ llm_data=llm_data,
504
+ exec_data=exec_data,
505
+ verify_data=verify_data,
506
+ pre_elements=pre_elements,
507
+ )
508
+
509
+ _safe_tracer_call(
510
+ self.tracer,
511
+ "emit",
512
+ self.verbose,
513
+ "step_end",
514
+ step_end_data,
327
515
  step_id=step_id,
328
516
  )
329
517
 
@@ -332,7 +520,14 @@ class SentienceAgent(BaseAgent):
332
520
  except Exception as e:
333
521
  # Emit error trace event if tracer is enabled
334
522
  if self.tracer:
335
- self.tracer.emit_error(step_id=step_id, error=str(e), attempt=attempt)
523
+ _safe_tracer_call(
524
+ self.tracer,
525
+ "emit_error",
526
+ self.verbose,
527
+ step_id=step_id,
528
+ error=str(e),
529
+ attempt=attempt,
530
+ )
336
531
 
337
532
  if attempt < max_retries:
338
533
  if self.verbose:
@@ -361,195 +556,573 @@ class SentienceAgent(BaseAgent):
361
556
  )
362
557
  raise RuntimeError(f"Failed after {max_retries} retries: {e}")
363
558
 
364
- def _build_context(self, snap: Snapshot, goal: str) -> str:
559
+ def _track_tokens(self, goal: str, llm_response: LLMResponse):
365
560
  """
366
- Convert snapshot elements to token-efficient prompt string
367
-
368
- Format: [ID] <role> "text" {cues} @ (x,y) (Imp:score)
561
+ Track token usage for analytics
369
562
 
370
563
  Args:
371
- snap: Snapshot object
372
- goal: User goal (for context)
564
+ goal: User goal
565
+ llm_response: LLM response with token usage
566
+ """
567
+ if llm_response.prompt_tokens:
568
+ self._token_usage_raw["total_prompt_tokens"] += llm_response.prompt_tokens
569
+ if llm_response.completion_tokens:
570
+ self._token_usage_raw["total_completion_tokens"] += llm_response.completion_tokens
571
+ if llm_response.total_tokens:
572
+ self._token_usage_raw["total_tokens"] += llm_response.total_tokens
573
+
574
+ self._token_usage_raw["by_action"].append(
575
+ {
576
+ "goal": goal,
577
+ "prompt_tokens": llm_response.prompt_tokens or 0,
578
+ "completion_tokens": llm_response.completion_tokens or 0,
579
+ "total_tokens": llm_response.total_tokens or 0,
580
+ "model": llm_response.model_name,
581
+ }
582
+ )
583
+
584
+ def get_token_stats(self) -> TokenStats:
585
+ """
586
+ Get token usage statistics
373
587
 
374
588
  Returns:
375
- Formatted element context string
589
+ TokenStats with token usage breakdown
376
590
  """
377
- lines = []
378
- # Note: elements are already filtered by filter_elements() in act()
379
- for el in snap.elements:
380
- # Extract visual cues
381
- cues = []
382
- if el.visual_cues.is_primary:
383
- cues.append("PRIMARY")
384
- if el.visual_cues.is_clickable:
385
- cues.append("CLICKABLE")
386
- if el.visual_cues.background_color_name:
387
- cues.append(f"color:{el.visual_cues.background_color_name}")
388
-
389
- # Format element line
390
- cues_str = f" {{{','.join(cues)}}}" if cues else ""
391
- text_preview = (
392
- (el.text[:50] + "...") if el.text and len(el.text) > 50 else (el.text or "")
393
- )
591
+ by_action = [ActionTokenUsage(**action) for action in self._token_usage_raw["by_action"]]
592
+ return TokenStats(
593
+ total_prompt_tokens=self._token_usage_raw["total_prompt_tokens"],
594
+ total_completion_tokens=self._token_usage_raw["total_completion_tokens"],
595
+ total_tokens=self._token_usage_raw["total_tokens"],
596
+ by_action=by_action,
597
+ )
394
598
 
395
- lines.append(
396
- f'[{el.id}] <{el.role}> "{text_preview}"{cues_str} '
397
- f"@ ({int(el.bbox.x)},{int(el.bbox.y)}) (Imp:{el.importance})"
398
- )
599
+ def get_history(self) -> list[ActionHistory]:
600
+ """
601
+ Get execution history
602
+
603
+ Returns:
604
+ List of ActionHistory entries
605
+ """
606
+ return [ActionHistory(**h) for h in self.history]
399
607
 
400
- return "\n".join(lines)
608
+ def clear_history(self) -> None:
609
+ """Clear execution history and reset token counters"""
610
+ self.history.clear()
611
+ self._token_usage_raw = {
612
+ "total_prompt_tokens": 0,
613
+ "total_completion_tokens": 0,
614
+ "total_tokens": 0,
615
+ "by_action": [],
616
+ }
401
617
 
402
- def _extract_action_from_response(self, response: str) -> str:
618
+ def filter_elements(self, snapshot: Snapshot, goal: str | None = None) -> list[Element]:
403
619
  """
404
- Extract action command from LLM response, handling cases where
405
- the LLM adds extra explanation despite instructions.
620
+ Filter elements from snapshot based on goal context.
621
+
622
+ This implementation uses ElementFilter to apply goal-based keyword matching
623
+ to boost relevant elements and filters out irrelevant ones.
406
624
 
407
625
  Args:
408
- response: Raw LLM response text
626
+ snapshot: Current page snapshot
627
+ goal: User's goal (can inform filtering)
409
628
 
410
629
  Returns:
411
- Cleaned action command string
630
+ Filtered list of elements
412
631
  """
413
- import re
632
+ return ElementFilter.filter_by_goal(snapshot, goal, self.default_snapshot_limit)
414
633
 
415
- # Remove markdown code blocks if present
416
- response = re.sub(r"```[\w]*\n?", "", response)
417
- response = response.strip()
418
634
 
419
- # Try to find action patterns in the response
420
- # Pattern matches: CLICK(123), TYPE(123, "text"), PRESS("key"), FINISH()
421
- action_pattern = r'(CLICK\s*\(\s*\d+\s*\)|TYPE\s*\(\s*\d+\s*,\s*["\'].*?["\']\s*\)|PRESS\s*\(\s*["\'].*?["\']\s*\)|FINISH\s*\(\s*\))'
635
+ class SentienceAgentAsync(BaseAgentAsync):
636
+ """
637
+ High-level async agent that combines Sentience SDK with any LLM provider.
422
638
 
423
- match = re.search(action_pattern, response, re.IGNORECASE)
424
- if match:
425
- return match.group(1)
639
+ Uses observe-think-act loop to execute natural language commands:
640
+ 1. OBSERVE: Get snapshot of current page state
641
+ 2. THINK: Query LLM to decide next action
642
+ 3. ACT: Execute action using SDK
426
643
 
427
- # If no pattern match, return the original response (will likely fail parsing)
428
- return response
644
+ Example:
645
+ >>> from sentience.async_api import AsyncSentienceBrowser
646
+ >>> from sentience.agent import SentienceAgentAsync
647
+ >>> from sentience.llm_provider import OpenAIProvider
648
+ >>>
649
+ >>> async with AsyncSentienceBrowser() as browser:
650
+ >>> await browser.goto("https://google.com")
651
+ >>> llm = OpenAIProvider(api_key="openai_key", model="gpt-4o")
652
+ >>> agent = SentienceAgentAsync(browser, llm)
653
+ >>> await agent.act("Click the search box")
654
+ >>> await agent.act("Type 'magic mouse' into the search field")
655
+ >>> await agent.act("Press Enter key")
656
+ """
429
657
 
430
- def _query_llm(self, dom_context: str, goal: str) -> LLMResponse:
658
+ def __init__(
659
+ self,
660
+ browser: AsyncSentienceBrowser,
661
+ llm: LLMProvider,
662
+ default_snapshot_limit: int = 50,
663
+ verbose: bool = True,
664
+ tracer: Optional["Tracer"] = None,
665
+ config: Optional["AgentConfig"] = None,
666
+ ):
431
667
  """
432
- Query LLM with standardized prompt template
668
+ Initialize Sentience Agent (async)
433
669
 
434
670
  Args:
435
- dom_context: Formatted element context
436
- goal: User goal
437
-
438
- Returns:
439
- LLMResponse from LLM provider
671
+ browser: AsyncSentienceBrowser instance
672
+ llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.)
673
+ default_snapshot_limit: Default maximum elements to include in context (default: 50)
674
+ verbose: Print execution logs (default: True)
675
+ tracer: Optional Tracer instance for execution tracking (default: None)
676
+ config: Optional AgentConfig for advanced configuration (default: None)
440
677
  """
441
- system_prompt = f"""You are an AI web automation agent.
442
-
443
- GOAL: {goal}
444
-
445
- VISIBLE ELEMENTS (sorted by importance):
446
- {dom_context}
447
-
448
- VISUAL CUES EXPLAINED:
449
- - {{PRIMARY}}: Main call-to-action element on the page
450
- - {{CLICKABLE}}: Element is clickable
451
- - {{color:X}}: Background color name
452
-
453
- CRITICAL RESPONSE FORMAT:
454
- You MUST respond with ONLY ONE of these exact action formats:
455
- - CLICK(id) - Click element by ID
456
- - TYPE(id, "text") - Type text into element
457
- - PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
458
- - FINISH() - Task complete
459
-
460
- DO NOT include any explanation, reasoning, or natural language.
461
- DO NOT use markdown formatting or code blocks.
462
- DO NOT say "The next step is..." or anything similar.
463
-
464
- CORRECT Examples:
465
- CLICK(42)
466
- TYPE(15, "magic mouse")
467
- PRESS("Enter")
468
- FINISH()
469
-
470
- INCORRECT Examples (DO NOT DO THIS):
471
- "The next step is to click..."
472
- "I will type..."
473
- ```CLICK(42)```
474
- """
678
+ self.browser = browser
679
+ self.llm = llm
680
+ self.default_snapshot_limit = default_snapshot_limit
681
+ self.verbose = verbose
682
+ self.tracer = tracer
683
+ self.config = config or AgentConfig()
475
684
 
476
- user_prompt = "Return the single action command:"
685
+ # Initialize handlers
686
+ self.llm_handler = LLMInteractionHandler(llm)
687
+ self.action_executor = ActionExecutor(browser)
477
688
 
478
- return self.llm.generate(system_prompt, user_prompt, temperature=0.0)
689
+ # Screenshot sequence counter
690
+ # Execution history
691
+ self.history: list[dict[str, Any]] = []
692
+
693
+ # Token usage tracking (will be converted to TokenStats on get_token_stats())
694
+ self._token_usage_raw = {
695
+ "total_prompt_tokens": 0,
696
+ "total_completion_tokens": 0,
697
+ "total_tokens": 0,
698
+ "by_action": [],
699
+ }
700
+
701
+ # Step counter for tracing
702
+ self._step_count = 0
703
+
704
+ # Previous snapshot for diff detection
705
+ self._previous_snapshot: Snapshot | None = None
479
706
 
480
- def _execute_action(self, action_str: str, snap: Snapshot) -> dict[str, Any]:
707
+ def _compute_hash(self, text: str) -> str:
708
+ """Compute SHA256 hash of text."""
709
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()
710
+
711
+ def _get_element_bbox(self, element_id: int | None, snap: Snapshot) -> dict[str, float] | None:
712
+ """Get bounding box for an element from snapshot."""
713
+ if element_id is None:
714
+ return None
715
+ for el in snap.elements:
716
+ if el.id == element_id:
717
+ return {
718
+ "x": el.bbox.x,
719
+ "y": el.bbox.y,
720
+ "width": el.bbox.width,
721
+ "height": el.bbox.height,
722
+ }
723
+ return None
724
+
725
+ async def act( # noqa: C901
726
+ self,
727
+ goal: str,
728
+ max_retries: int = 2,
729
+ snapshot_options: SnapshotOptions | None = None,
730
+ ) -> AgentActionResult:
481
731
  """
482
- Parse action string and execute SDK call
732
+ Execute a high-level goal using observe → think → act loop (async)
483
733
 
484
734
  Args:
485
- action_str: Action string from LLM (e.g., "CLICK(42)")
486
- snap: Current snapshot (for context)
735
+ goal: Natural language instruction (e.g., "Click the Sign In button")
736
+ max_retries: Number of retries on failure (default: 2)
737
+ snapshot_options: Optional SnapshotOptions for this specific action
487
738
 
488
739
  Returns:
489
- Execution result dictionary
740
+ AgentActionResult with execution details
741
+
742
+ Example:
743
+ >>> result = await agent.act("Click the search box")
744
+ >>> print(result.success, result.action, result.element_id)
745
+ True click 42
490
746
  """
491
- # Parse CLICK(42)
492
- if match := re.match(r"CLICK\s*\(\s*(\d+)\s*\)", action_str, re.IGNORECASE):
493
- element_id = int(match.group(1))
494
- result = click(self.browser, element_id)
495
- return {
496
- "success": result.success,
497
- "action": "click",
498
- "element_id": element_id,
499
- "outcome": result.outcome,
500
- "url_changed": result.url_changed,
501
- }
747
+ if self.verbose:
748
+ print(f"\n{'=' * 70}")
749
+ print(f"🤖 Agent Goal: {goal}")
750
+ print(f"{'=' * 70}")
502
751
 
503
- # Parse TYPE(42, "hello world")
504
- elif match := re.match(
505
- r'TYPE\s*\(\s*(\d+)\s*,\s*["\']([^"\']*)["\']\s*\)',
506
- action_str,
507
- re.IGNORECASE,
508
- ):
509
- element_id = int(match.group(1))
510
- text = match.group(2)
511
- result = type_text(self.browser, element_id, text)
512
- return {
513
- "success": result.success,
514
- "action": "type",
515
- "element_id": element_id,
516
- "text": text,
517
- "outcome": result.outcome,
518
- }
752
+ # Generate step ID for tracing
753
+ self._step_count += 1
754
+ step_id = f"step-{self._step_count}"
519
755
 
520
- # Parse PRESS("Enter")
521
- elif match := re.match(r'PRESS\s*\(\s*["\']([^"\']+)["\']\s*\)', action_str, re.IGNORECASE):
522
- key = match.group(1)
523
- result = press(self.browser, key)
524
- return {
525
- "success": result.success,
526
- "action": "press",
527
- "key": key,
528
- "outcome": result.outcome,
529
- }
756
+ # Emit step_start trace event if tracer is enabled
757
+ if self.tracer:
758
+ pre_url = self.browser.page.url if self.browser.page else None
759
+ _safe_tracer_call(
760
+ self.tracer,
761
+ "emit_step_start",
762
+ self.verbose,
763
+ step_id=step_id,
764
+ step_index=self._step_count,
765
+ goal=goal,
766
+ attempt=0,
767
+ pre_url=pre_url,
768
+ )
530
769
 
531
- # Parse FINISH()
532
- elif re.match(r"FINISH\s*\(\s*\)", action_str, re.IGNORECASE):
533
- return {
534
- "success": True,
535
- "action": "finish",
536
- "message": "Task marked as complete",
537
- }
770
+ for attempt in range(max_retries + 1):
771
+ try:
772
+ # 1. OBSERVE: Get refined semantic snapshot
773
+ start_time = time.time()
538
774
 
539
- else:
540
- raise ValueError(
541
- f"Unknown action format: {action_str}\n"
542
- f'Expected: CLICK(id), TYPE(id, "text"), PRESS("key"), or FINISH()'
543
- )
775
+ # Use provided options or create default
776
+ snap_opts = snapshot_options or SnapshotOptions(limit=self.default_snapshot_limit)
777
+ # Only set goal if not already provided
778
+ if snap_opts.goal is None:
779
+ snap_opts.goal = goal
544
780
 
545
- def _track_tokens(self, goal: str, llm_response: LLMResponse):
546
- """
547
- Track token usage for analytics
781
+ # Apply AgentConfig screenshot settings if not overridden by snapshot_options
782
+ # Only apply if snapshot_options wasn't provided OR if screenshot wasn't explicitly set
783
+ # (snapshot_options.screenshot defaults to False, so we check if it's still False)
784
+ if self.config and (snapshot_options is None or snap_opts.screenshot is False):
785
+ if self.config.capture_screenshots:
786
+ # Create ScreenshotConfig from AgentConfig
787
+ snap_opts.screenshot = ScreenshotConfig(
788
+ format=self.config.screenshot_format,
789
+ quality=(
790
+ self.config.screenshot_quality
791
+ if self.config.screenshot_format == "jpeg"
792
+ else None
793
+ ),
794
+ )
795
+ else:
796
+ snap_opts.screenshot = False
797
+ # Apply show_overlay from AgentConfig
798
+ # Note: User can override by explicitly passing show_overlay in snapshot_options
799
+ snap_opts.show_overlay = self.config.show_overlay
800
+
801
+ # Call snapshot with options object (matches TypeScript API)
802
+ snap = await snapshot_async(self.browser, snap_opts)
548
803
 
549
- Args:
550
- goal: User goal
551
- llm_response: LLM response with token usage
552
- """
804
+ if snap.status != "success":
805
+ raise RuntimeError(f"Snapshot failed: {snap.error}")
806
+
807
+ # Compute diff_status by comparing with previous snapshot
808
+ elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)
809
+
810
+ # Create snapshot with diff_status populated
811
+ snap_with_diff = Snapshot(
812
+ status=snap.status,
813
+ timestamp=snap.timestamp,
814
+ url=snap.url,
815
+ viewport=snap.viewport,
816
+ elements=elements_with_diff,
817
+ screenshot=snap.screenshot,
818
+ screenshot_format=snap.screenshot_format,
819
+ error=snap.error,
820
+ )
821
+
822
+ # Update previous snapshot for next comparison
823
+ self._previous_snapshot = snap
824
+
825
+ # Apply element filtering based on goal
826
+ filtered_elements = self.filter_elements(snap_with_diff, goal)
827
+
828
+ # Emit snapshot trace event if tracer is enabled
829
+ if self.tracer:
830
+ # Build snapshot event data (use snap_with_diff to include diff_status)
831
+ snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
832
+
833
+ # Always include screenshot in trace event for studio viewer compatibility
834
+ # CloudTraceSink will extract and upload screenshots separately, then remove
835
+ # screenshot_base64 from events before uploading the trace file.
836
+ if snap.screenshot:
837
+ # Extract base64 string from data URL if needed
838
+ if snap.screenshot.startswith("data:image"):
839
+ # Format: "data:image/jpeg;base64,{base64_string}"
840
+ screenshot_base64 = (
841
+ snap.screenshot.split(",", 1)[1]
842
+ if "," in snap.screenshot
843
+ else snap.screenshot
844
+ )
845
+ else:
846
+ screenshot_base64 = snap.screenshot
847
+
848
+ snapshot_data["screenshot_base64"] = screenshot_base64
849
+ if snap.screenshot_format:
850
+ snapshot_data["screenshot_format"] = snap.screenshot_format
851
+
852
+ _safe_tracer_call(
853
+ self.tracer,
854
+ "emit",
855
+ self.verbose,
856
+ "snapshot",
857
+ snapshot_data,
858
+ step_id=step_id,
859
+ )
860
+
861
+ # Create filtered snapshot (use snap_with_diff to preserve metadata)
862
+ filtered_snap = Snapshot(
863
+ status=snap_with_diff.status,
864
+ timestamp=snap_with_diff.timestamp,
865
+ url=snap_with_diff.url,
866
+ viewport=snap_with_diff.viewport,
867
+ elements=filtered_elements,
868
+ screenshot=snap_with_diff.screenshot,
869
+ screenshot_format=snap_with_diff.screenshot_format,
870
+ error=snap_with_diff.error,
871
+ )
872
+
873
+ # 2. GROUND: Format elements for LLM context
874
+ context = self.llm_handler.build_context(filtered_snap, goal)
875
+
876
+ # 3. THINK: Query LLM for next action
877
+ llm_response = self.llm_handler.query_llm(context, goal)
878
+
879
+ # Emit LLM query trace event if tracer is enabled
880
+ if self.tracer:
881
+ _safe_tracer_call(
882
+ self.tracer,
883
+ "emit",
884
+ self.verbose,
885
+ "llm_query",
886
+ {
887
+ "prompt_tokens": llm_response.prompt_tokens,
888
+ "completion_tokens": llm_response.completion_tokens,
889
+ "model": llm_response.model_name,
890
+ "response": llm_response.content[:200], # Truncate for brevity
891
+ },
892
+ step_id=step_id,
893
+ )
894
+
895
+ if self.verbose:
896
+ print(f"🧠 LLM Decision: {llm_response.content}")
897
+
898
+ # Track token usage
899
+ self._track_tokens(goal, llm_response)
900
+
901
+ # Parse action from LLM response
902
+ action_str = self.llm_handler.extract_action(llm_response.content)
903
+
904
+ # 4. EXECUTE: Parse and run action
905
+ result_dict = await self.action_executor.execute_async(action_str, filtered_snap)
906
+
907
+ duration_ms = int((time.time() - start_time) * 1000)
908
+
909
+ # Create AgentActionResult from execution result
910
+ result = AgentActionResult(
911
+ success=result_dict["success"],
912
+ action=result_dict["action"],
913
+ goal=goal,
914
+ duration_ms=duration_ms,
915
+ attempt=attempt,
916
+ element_id=result_dict.get("element_id"),
917
+ text=result_dict.get("text"),
918
+ key=result_dict.get("key"),
919
+ outcome=result_dict.get("outcome"),
920
+ url_changed=result_dict.get("url_changed"),
921
+ error=result_dict.get("error"),
922
+ message=result_dict.get("message"),
923
+ )
924
+
925
+ # Emit action execution trace event if tracer is enabled
926
+ if self.tracer:
927
+ post_url = self.browser.page.url if self.browser.page else None
928
+
929
+ # Include element data for live overlay visualization
930
+ elements_data = [
931
+ {
932
+ "id": el.id,
933
+ "bbox": {
934
+ "x": el.bbox.x,
935
+ "y": el.bbox.y,
936
+ "width": el.bbox.width,
937
+ "height": el.bbox.height,
938
+ },
939
+ "role": el.role,
940
+ "text": el.text[:50] if el.text else "",
941
+ }
942
+ for el in filtered_snap.elements[:50]
943
+ ]
944
+
945
+ _safe_tracer_call(
946
+ self.tracer,
947
+ "emit",
948
+ self.verbose,
949
+ "action",
950
+ {
951
+ "action": result.action,
952
+ "element_id": result.element_id,
953
+ "success": result.success,
954
+ "outcome": result.outcome,
955
+ "duration_ms": duration_ms,
956
+ "post_url": post_url,
957
+ "elements": elements_data, # Add element data for overlay
958
+ "target_element_id": result.element_id, # Highlight target in red
959
+ },
960
+ step_id=step_id,
961
+ )
962
+
963
+ # 5. RECORD: Track history
964
+ self.history.append(
965
+ {
966
+ "goal": goal,
967
+ "action": action_str,
968
+ "result": result.model_dump(), # Store as dict
969
+ "success": result.success,
970
+ "attempt": attempt,
971
+ "duration_ms": duration_ms,
972
+ }
973
+ )
974
+
975
+ if self.verbose:
976
+ status = "✅" if result.success else "❌"
977
+ print(f"{status} Completed in {duration_ms}ms")
978
+
979
+ # Emit step completion trace event if tracer is enabled
980
+ if self.tracer:
981
+ # Get pre_url from step_start (stored in tracer or use current)
982
+ pre_url = snap.url
983
+ post_url = self.browser.page.url if self.browser.page else None
984
+
985
+ # Compute snapshot digest (simplified - use URL + timestamp)
986
+ snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"
987
+
988
+ # Build LLM data
989
+ llm_response_text = llm_response.content
990
+ llm_response_hash = f"sha256:{self._compute_hash(llm_response_text)}"
991
+ llm_data = {
992
+ "response_text": llm_response_text,
993
+ "response_hash": llm_response_hash,
994
+ "usage": {
995
+ "prompt_tokens": llm_response.prompt_tokens or 0,
996
+ "completion_tokens": llm_response.completion_tokens or 0,
997
+ "total_tokens": llm_response.total_tokens or 0,
998
+ },
999
+ }
1000
+
1001
+ # Build exec data
1002
+ exec_data = {
1003
+ "success": result.success,
1004
+ "action": result.action,
1005
+ "outcome": result.outcome
1006
+ or (
1007
+ f"Action {result.action} executed successfully"
1008
+ if result.success
1009
+ else f"Action {result.action} failed"
1010
+ ),
1011
+ "duration_ms": duration_ms,
1012
+ }
1013
+
1014
+ # Add optional exec fields
1015
+ if result.element_id is not None:
1016
+ exec_data["element_id"] = result.element_id
1017
+ # Add bounding box if element found
1018
+ bbox = self._get_element_bbox(result.element_id, snap)
1019
+ if bbox:
1020
+ exec_data["bounding_box"] = bbox
1021
+ if result.text is not None:
1022
+ exec_data["text"] = result.text
1023
+ if result.key is not None:
1024
+ exec_data["key"] = result.key
1025
+ if result.error is not None:
1026
+ exec_data["error"] = result.error
1027
+
1028
+ # Build verify data (simplified - based on success and url_changed)
1029
+ verify_passed = result.success and (
1030
+ result.url_changed or result.action != "click"
1031
+ )
1032
+ verify_signals = {
1033
+ "url_changed": result.url_changed or False,
1034
+ }
1035
+ if result.error:
1036
+ verify_signals["error"] = result.error
1037
+
1038
+ # Add elements_found array if element was targeted
1039
+ if result.element_id is not None:
1040
+ bbox = self._get_element_bbox(result.element_id, snap)
1041
+ if bbox:
1042
+ verify_signals["elements_found"] = [
1043
+ {
1044
+ "label": f"Element {result.element_id}",
1045
+ "bounding_box": bbox,
1046
+ }
1047
+ ]
1048
+
1049
+ verify_data = {
1050
+ "passed": verify_passed,
1051
+ "signals": verify_signals,
1052
+ }
1053
+
1054
+ # Build elements data for pre field (include diff_status from snap_with_diff)
1055
+ # Use the same format as build_snapshot_event for consistency
1056
+ snapshot_event_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
1057
+ pre_elements = snapshot_event_data.get("elements", [])
1058
+
1059
+ # Build complete step_end event
1060
+ step_end_data = TraceEventBuilder.build_step_end_event(
1061
+ step_id=step_id,
1062
+ step_index=self._step_count,
1063
+ goal=goal,
1064
+ attempt=attempt,
1065
+ pre_url=pre_url,
1066
+ post_url=post_url,
1067
+ snapshot_digest=snapshot_digest,
1068
+ llm_data=llm_data,
1069
+ exec_data=exec_data,
1070
+ verify_data=verify_data,
1071
+ pre_elements=pre_elements,
1072
+ )
1073
+
1074
+ _safe_tracer_call(
1075
+ self.tracer,
1076
+ "emit",
1077
+ self.verbose,
1078
+ "step_end",
1079
+ step_end_data,
1080
+ step_id=step_id,
1081
+ )
1082
+
1083
+ return result
1084
+
1085
+ except Exception as e:
1086
+ # Emit error trace event if tracer is enabled
1087
+ if self.tracer:
1088
+ _safe_tracer_call(
1089
+ self.tracer,
1090
+ "emit_error",
1091
+ self.verbose,
1092
+ step_id=step_id,
1093
+ error=str(e),
1094
+ attempt=attempt,
1095
+ )
1096
+
1097
+ if attempt < max_retries:
1098
+ if self.verbose:
1099
+ print(f"⚠️ Retry {attempt + 1}/{max_retries}: {e}")
1100
+ await asyncio.sleep(1.0) # Brief delay before retry
1101
+ continue
1102
+ else:
1103
+ # Create error result
1104
+ error_result = AgentActionResult(
1105
+ success=False,
1106
+ action="error",
1107
+ goal=goal,
1108
+ duration_ms=0,
1109
+ attempt=attempt,
1110
+ error=str(e),
1111
+ )
1112
+ self.history.append(
1113
+ {
1114
+ "goal": goal,
1115
+ "action": "error",
1116
+ "result": error_result.model_dump(),
1117
+ "success": False,
1118
+ "attempt": attempt,
1119
+ "duration_ms": 0,
1120
+ }
1121
+ )
1122
+ raise RuntimeError(f"Failed after {max_retries} retries: {e}")
1123
+
1124
+ def _track_tokens(self, goal: str, llm_response: LLMResponse):
1125
+ """Track token usage for analytics (same as sync version)"""
553
1126
  if llm_response.prompt_tokens:
554
1127
  self._token_usage_raw["total_prompt_tokens"] += llm_response.prompt_tokens
555
1128
  if llm_response.completion_tokens:
@@ -568,12 +1141,7 @@ INCORRECT Examples (DO NOT DO THIS):
568
1141
  )
569
1142
 
570
1143
  def get_token_stats(self) -> TokenStats:
571
- """
572
- Get token usage statistics
573
-
574
- Returns:
575
- TokenStats with token usage breakdown
576
- """
1144
+ """Get token usage statistics (same as sync version)"""
577
1145
  by_action = [ActionTokenUsage(**action) for action in self._token_usage_raw["by_action"]]
578
1146
  return TokenStats(
579
1147
  total_prompt_tokens=self._token_usage_raw["total_prompt_tokens"],
@@ -583,16 +1151,11 @@ INCORRECT Examples (DO NOT DO THIS):
583
1151
  )
584
1152
 
585
1153
  def get_history(self) -> list[ActionHistory]:
586
- """
587
- Get execution history
588
-
589
- Returns:
590
- List of ActionHistory entries
591
- """
1154
+ """Get execution history (same as sync version)"""
592
1155
  return [ActionHistory(**h) for h in self.history]
593
1156
 
594
1157
  def clear_history(self) -> None:
595
- """Clear execution history and reset token counters"""
1158
+ """Clear execution history and reset token counters (same as sync version)"""
596
1159
  self.history.clear()
597
1160
  self._token_usage_raw = {
598
1161
  "total_prompt_tokens": 0,
@@ -605,8 +1168,8 @@ INCORRECT Examples (DO NOT DO THIS):
605
1168
  """
606
1169
  Filter elements from snapshot based on goal context.
607
1170
 
608
- This default implementation applies goal-based keyword matching to boost
609
- relevant elements and filters out irrelevant ones.
1171
+ This implementation uses ElementFilter to apply goal-based keyword matching
1172
+ to boost relevant elements and filters out irrelevant ones.
610
1173
 
611
1174
  Args:
612
1175
  snapshot: Current page snapshot
@@ -615,73 +1178,4 @@ INCORRECT Examples (DO NOT DO THIS):
615
1178
  Returns:
616
1179
  Filtered list of elements
617
1180
  """
618
- elements = snapshot.elements
619
-
620
- # If no goal provided, return all elements (up to limit)
621
- if not goal:
622
- return elements[: self.default_snapshot_limit]
623
-
624
- goal_lower = goal.lower()
625
-
626
- # Extract keywords from goal
627
- keywords = self._extract_keywords(goal_lower)
628
-
629
- # Boost elements matching goal keywords
630
- scored_elements = []
631
- for el in elements:
632
- score = el.importance
633
-
634
- # Boost if element text matches goal
635
- if el.text and any(kw in el.text.lower() for kw in keywords):
636
- score += 0.3
637
-
638
- # Boost if role matches goal intent
639
- if "click" in goal_lower and el.visual_cues.is_clickable:
640
- score += 0.2
641
- if "type" in goal_lower and el.role in ["textbox", "searchbox"]:
642
- score += 0.2
643
- if "search" in goal_lower:
644
- # Filter out non-interactive elements for search tasks
645
- if el.role in ["link", "img"] and not el.visual_cues.is_primary:
646
- score -= 0.5
647
-
648
- scored_elements.append((score, el))
649
-
650
- # Re-sort by boosted score
651
- scored_elements.sort(key=lambda x: x[0], reverse=True)
652
- elements = [el for _, el in scored_elements]
653
-
654
- return elements[: self.default_snapshot_limit]
655
-
656
- def _extract_keywords(self, text: str) -> list[str]:
657
- """
658
- Extract meaningful keywords from goal text
659
-
660
- Args:
661
- text: Text to extract keywords from
662
-
663
- Returns:
664
- List of keywords
665
- """
666
- stopwords = {
667
- "the",
668
- "a",
669
- "an",
670
- "and",
671
- "or",
672
- "but",
673
- "in",
674
- "on",
675
- "at",
676
- "to",
677
- "for",
678
- "of",
679
- "with",
680
- "by",
681
- "from",
682
- "as",
683
- "is",
684
- "was",
685
- }
686
- words = text.split()
687
- return [w for w in words if w not in stopwords and len(w) > 2]
1181
+ return ElementFilter.filter_by_goal(snapshot, goal, self.default_snapshot_limit)