sentienceapi-0.90.16-py3-none-any.whl → sentienceapi-0.92.2-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sentienceapi might be problematic. Click here for more details.

Files changed (61)
  1. sentience/__init__.py +14 -5
  2. sentience/action_executor.py +215 -0
  3. sentience/actions.py +408 -25
  4. sentience/agent.py +802 -293
  5. sentience/agent_config.py +3 -0
  6. sentience/async_api.py +83 -1142
  7. sentience/base_agent.py +95 -0
  8. sentience/browser.py +484 -1
  9. sentience/browser_evaluator.py +299 -0
  10. sentience/cloud_tracing.py +457 -33
  11. sentience/conversational_agent.py +77 -43
  12. sentience/element_filter.py +136 -0
  13. sentience/expect.py +98 -2
  14. sentience/extension/background.js +56 -185
  15. sentience/extension/content.js +117 -289
  16. sentience/extension/injected_api.js +799 -1374
  17. sentience/extension/manifest.json +1 -1
  18. sentience/extension/pkg/sentience_core.js +190 -396
  19. sentience/extension/pkg/sentience_core_bg.wasm +0 -0
  20. sentience/extension/release.json +47 -47
  21. sentience/formatting.py +9 -53
  22. sentience/inspector.py +183 -1
  23. sentience/llm_interaction_handler.py +191 -0
  24. sentience/llm_provider.py +74 -52
  25. sentience/llm_provider_utils.py +120 -0
  26. sentience/llm_response_builder.py +153 -0
  27. sentience/models.py +60 -1
  28. sentience/overlay.py +109 -2
  29. sentience/protocols.py +228 -0
  30. sentience/query.py +1 -1
  31. sentience/read.py +95 -3
  32. sentience/recorder.py +223 -3
  33. sentience/schemas/trace_v1.json +102 -9
  34. sentience/screenshot.py +48 -2
  35. sentience/sentience_methods.py +86 -0
  36. sentience/snapshot.py +291 -38
  37. sentience/snapshot_diff.py +141 -0
  38. sentience/text_search.py +119 -5
  39. sentience/trace_event_builder.py +129 -0
  40. sentience/trace_file_manager.py +197 -0
  41. sentience/trace_indexing/index_schema.py +95 -7
  42. sentience/trace_indexing/indexer.py +117 -14
  43. sentience/tracer_factory.py +119 -6
  44. sentience/tracing.py +172 -8
  45. sentience/utils/__init__.py +40 -0
  46. sentience/utils/browser.py +46 -0
  47. sentience/utils/element.py +257 -0
  48. sentience/utils/formatting.py +59 -0
  49. sentience/utils.py +1 -1
  50. sentience/visual_agent.py +2056 -0
  51. sentience/wait.py +68 -2
  52. {sentienceapi-0.90.16.dist-info → sentienceapi-0.92.2.dist-info}/METADATA +2 -1
  53. sentienceapi-0.92.2.dist-info/RECORD +65 -0
  54. sentience/extension/test-content.js +0 -4
  55. sentienceapi-0.90.16.dist-info/RECORD +0 -50
  56. {sentienceapi-0.90.16.dist-info → sentienceapi-0.92.2.dist-info}/WHEEL +0 -0
  57. {sentienceapi-0.90.16.dist-info → sentienceapi-0.92.2.dist-info}/entry_points.txt +0 -0
  58. {sentienceapi-0.90.16.dist-info → sentienceapi-0.92.2.dist-info}/licenses/LICENSE +0 -0
  59. {sentienceapi-0.90.16.dist-info → sentienceapi-0.92.2.dist-info}/licenses/LICENSE-APACHE +0 -0
  60. {sentienceapi-0.90.16.dist-info → sentienceapi-0.92.2.dist-info}/licenses/LICENSE-MIT +0 -0
  61. {sentienceapi-0.90.16.dist-info → sentienceapi-0.92.2.dist-info}/top_level.txt +0 -0
sentience/agent.py CHANGED
@@ -3,13 +3,17 @@ Sentience Agent: High-level automation agent using LLM + SDK
3
3
  Implements observe-think-act loop for natural language commands
4
4
  """
5
5
 
6
- import re
6
+ import asyncio
7
+ import hashlib
7
8
  import time
8
- from typing import TYPE_CHECKING, Any, Optional
9
-
10
- from .actions import click, press, type_text
11
- from .base_agent import BaseAgent
12
- from .browser import SentienceBrowser
9
+ from typing import TYPE_CHECKING, Any, Optional, Union
10
+
11
+ from .action_executor import ActionExecutor
12
+ from .agent_config import AgentConfig
13
+ from .base_agent import BaseAgent, BaseAgentAsync
14
+ from .browser import AsyncSentienceBrowser, SentienceBrowser
15
+ from .element_filter import ElementFilter
16
+ from .llm_interaction_handler import LLMInteractionHandler
13
17
  from .llm_provider import LLMProvider, LLMResponse
14
18
  from .models import (
15
19
  ActionHistory,
@@ -21,13 +25,46 @@ from .models import (
21
25
  SnapshotOptions,
22
26
  TokenStats,
23
27
  )
24
- from .snapshot import snapshot
28
+ from .protocols import AsyncBrowserProtocol, BrowserProtocol
29
+ from .snapshot import snapshot, snapshot_async
30
+ from .snapshot_diff import SnapshotDiff
31
+ from .trace_event_builder import TraceEventBuilder
25
32
 
26
33
  if TYPE_CHECKING:
27
- from .agent_config import AgentConfig
28
34
  from .tracing import Tracer
29
35
 
30
36
 
37
+ def _safe_tracer_call(
38
+ tracer: Optional["Tracer"], method_name: str, verbose: bool, *args, **kwargs
39
+ ) -> None:
40
+ """
41
+ Safely call tracer method, catching and logging errors without breaking execution.
42
+
43
+ Args:
44
+ tracer: Tracer instance or None
45
+ method_name: Name of tracer method to call (e.g., "emit", "emit_error")
46
+ verbose: Whether to print error messages
47
+ *args: Positional arguments for the tracer method
48
+ **kwargs: Keyword arguments for the tracer method
49
+ """
50
+ if not tracer:
51
+ return
52
+ try:
53
+ method = getattr(tracer, method_name)
54
+ if args and kwargs:
55
+ method(*args, **kwargs)
56
+ elif args:
57
+ method(*args)
58
+ elif kwargs:
59
+ method(**kwargs)
60
+ else:
61
+ method()
62
+ except Exception as tracer_error:
63
+ # Tracer errors should not break agent execution
64
+ if verbose:
65
+ print(f"⚠️ Tracer error (non-fatal): {tracer_error}")
66
+
67
+
31
68
  class SentienceAgent(BaseAgent):
32
69
  """
33
70
  High-level agent that combines Sentience SDK with any LLM provider.
@@ -54,7 +91,7 @@ class SentienceAgent(BaseAgent):
54
91
 
55
92
  def __init__(
56
93
  self,
57
- browser: SentienceBrowser,
94
+ browser: SentienceBrowser | BrowserProtocol,
58
95
  llm: LLMProvider,
59
96
  default_snapshot_limit: int = 50,
60
97
  verbose: bool = True,
@@ -65,7 +102,8 @@ class SentienceAgent(BaseAgent):
65
102
  Initialize Sentience Agent
66
103
 
67
104
  Args:
68
- browser: SentienceBrowser instance
105
+ browser: SentienceBrowser instance or BrowserProtocol-compatible object
106
+ (for testing, can use mock objects that implement BrowserProtocol)
69
107
  llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.)
70
108
  default_snapshot_limit: Default maximum elements to include in context (default: 50)
71
109
  verbose: Print execution logs (default: True)
@@ -77,8 +115,13 @@ class SentienceAgent(BaseAgent):
77
115
  self.default_snapshot_limit = default_snapshot_limit
78
116
  self.verbose = verbose
79
117
  self.tracer = tracer
80
- self.config = config
118
+ self.config = config or AgentConfig()
119
+
120
+ # Initialize handlers
121
+ self.llm_handler = LLMInteractionHandler(llm)
122
+ self.action_executor = ActionExecutor(browser)
81
123
 
124
+ # Screenshot sequence counter
82
125
  # Execution history
83
126
  self.history: list[dict[str, Any]] = []
84
127
 
@@ -93,6 +136,27 @@ class SentienceAgent(BaseAgent):
93
136
  # Step counter for tracing
94
137
  self._step_count = 0
95
138
 
139
+ # Previous snapshot for diff detection
140
+ self._previous_snapshot: Snapshot | None = None
141
+
142
+ def _compute_hash(self, text: str) -> str:
143
+ """Compute SHA256 hash of text."""
144
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()
145
+
146
+ def _get_element_bbox(self, element_id: int | None, snap: Snapshot) -> dict[str, float] | None:
147
+ """Get bounding box for an element from snapshot."""
148
+ if element_id is None:
149
+ return None
150
+ for el in snap.elements:
151
+ if el.id == element_id:
152
+ return {
153
+ "x": el.bbox.x,
154
+ "y": el.bbox.y,
155
+ "width": el.bbox.width,
156
+ "height": el.bbox.height,
157
+ }
158
+ return None
159
+
96
160
  def act( # noqa: C901
97
161
  self,
98
162
  goal: str,
@@ -130,7 +194,10 @@ class SentienceAgent(BaseAgent):
130
194
  # Emit step_start trace event if tracer is enabled
131
195
  if self.tracer:
132
196
  pre_url = self.browser.page.url if self.browser.page else None
133
- self.tracer.emit_step_start(
197
+ _safe_tracer_call(
198
+ self.tracer,
199
+ "emit_step_start",
200
+ self.verbose,
134
201
  step_id=step_id,
135
202
  step_index=self._step_count,
136
203
  goal=goal,
@@ -149,66 +216,107 @@ class SentienceAgent(BaseAgent):
149
216
  if snap_opts.goal is None:
150
217
  snap_opts.goal = goal
151
218
 
219
+ # Apply AgentConfig screenshot settings if not overridden by snapshot_options
220
+ if snapshot_options is None and self.config:
221
+ if self.config.capture_screenshots:
222
+ # Create ScreenshotConfig from AgentConfig
223
+ snap_opts.screenshot = ScreenshotConfig(
224
+ format=self.config.screenshot_format,
225
+ quality=(
226
+ self.config.screenshot_quality
227
+ if self.config.screenshot_format == "jpeg"
228
+ else None
229
+ ),
230
+ )
231
+ else:
232
+ snap_opts.screenshot = False
233
+ # Apply show_overlay from AgentConfig
234
+ snap_opts.show_overlay = self.config.show_overlay
235
+
152
236
  # Call snapshot with options object (matches TypeScript API)
153
237
  snap = snapshot(self.browser, snap_opts)
154
238
 
155
239
  if snap.status != "success":
156
240
  raise RuntimeError(f"Snapshot failed: {snap.error}")
157
241
 
242
+ # Compute diff_status by comparing with previous snapshot
243
+ elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)
244
+
245
+ # Create snapshot with diff_status populated
246
+ snap_with_diff = Snapshot(
247
+ status=snap.status,
248
+ timestamp=snap.timestamp,
249
+ url=snap.url,
250
+ viewport=snap.viewport,
251
+ elements=elements_with_diff,
252
+ screenshot=snap.screenshot,
253
+ screenshot_format=snap.screenshot_format,
254
+ error=snap.error,
255
+ )
256
+
257
+ # Update previous snapshot for next comparison
258
+ self._previous_snapshot = snap
259
+
158
260
  # Apply element filtering based on goal
159
- filtered_elements = self.filter_elements(snap, goal)
261
+ filtered_elements = self.filter_elements(snap_with_diff, goal)
160
262
 
161
263
  # Emit snapshot trace event if tracer is enabled
162
264
  if self.tracer:
163
- # Include element data for live overlay visualization
164
- # Use filtered_elements for overlay (only relevant elements)
165
- elements_data = [
166
- {
167
- "id": el.id,
168
- "bbox": {
169
- "x": el.bbox.x,
170
- "y": el.bbox.y,
171
- "width": el.bbox.width,
172
- "height": el.bbox.height,
173
- },
174
- "role": el.role,
175
- "text": el.text[:50] if el.text else "", # Truncate for brevity
176
- }
177
- for el in filtered_elements[:50] # Limit to first 50 for performance
178
- ]
179
-
180
- self.tracer.emit(
265
+ # Build snapshot event data (use snap_with_diff to include diff_status)
266
+ snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
267
+
268
+ # Always include screenshot in trace event for studio viewer compatibility
269
+ # CloudTraceSink will extract and upload screenshots separately, then remove
270
+ # screenshot_base64 from events before uploading the trace file.
271
+ if snap.screenshot:
272
+ # Extract base64 string from data URL if needed
273
+ if snap.screenshot.startswith("data:image"):
274
+ # Format: "data:image/jpeg;base64,{base64_string}"
275
+ screenshot_base64 = (
276
+ snap.screenshot.split(",", 1)[1]
277
+ if "," in snap.screenshot
278
+ else snap.screenshot
279
+ )
280
+ else:
281
+ screenshot_base64 = snap.screenshot
282
+
283
+ snapshot_data["screenshot_base64"] = screenshot_base64
284
+ if snap.screenshot_format:
285
+ snapshot_data["screenshot_format"] = snap.screenshot_format
286
+
287
+ _safe_tracer_call(
288
+ self.tracer,
289
+ "emit",
290
+ self.verbose,
181
291
  "snapshot",
182
- {
183
- "url": snap.url,
184
- "element_count": len(snap.elements),
185
- "timestamp": snap.timestamp,
186
- "elements": elements_data, # Add element data for overlay
187
- },
292
+ snapshot_data,
188
293
  step_id=step_id,
189
294
  )
190
295
 
191
- # Create filtered snapshot
296
+ # Create filtered snapshot (use snap_with_diff to preserve metadata)
192
297
  filtered_snap = Snapshot(
193
- status=snap.status,
194
- timestamp=snap.timestamp,
195
- url=snap.url,
196
- viewport=snap.viewport,
298
+ status=snap_with_diff.status,
299
+ timestamp=snap_with_diff.timestamp,
300
+ url=snap_with_diff.url,
301
+ viewport=snap_with_diff.viewport,
197
302
  elements=filtered_elements,
198
- screenshot=snap.screenshot,
199
- screenshot_format=snap.screenshot_format,
200
- error=snap.error,
303
+ screenshot=snap_with_diff.screenshot,
304
+ screenshot_format=snap_with_diff.screenshot_format,
305
+ error=snap_with_diff.error,
201
306
  )
202
307
 
203
308
  # 2. GROUND: Format elements for LLM context
204
- context = self._build_context(filtered_snap, goal)
309
+ context = self.llm_handler.build_context(filtered_snap, goal)
205
310
 
206
311
  # 3. THINK: Query LLM for next action
207
- llm_response = self._query_llm(context, goal)
312
+ llm_response = self.llm_handler.query_llm(context, goal)
208
313
 
209
314
  # Emit LLM query trace event if tracer is enabled
210
315
  if self.tracer:
211
- self.tracer.emit(
316
+ _safe_tracer_call(
317
+ self.tracer,
318
+ "emit",
319
+ self.verbose,
212
320
  "llm_query",
213
321
  {
214
322
  "prompt_tokens": llm_response.prompt_tokens,
@@ -226,10 +334,10 @@ class SentienceAgent(BaseAgent):
226
334
  self._track_tokens(goal, llm_response)
227
335
 
228
336
  # Parse action from LLM response
229
- action_str = self._extract_action_from_response(llm_response.content)
337
+ action_str = self.llm_handler.extract_action(llm_response.content)
230
338
 
231
339
  # 4. EXECUTE: Parse and run action
232
- result_dict = self._execute_action(action_str, filtered_snap)
340
+ result_dict = self.action_executor.execute(action_str, filtered_snap)
233
341
 
234
342
  duration_ms = int((time.time() - start_time) * 1000)
235
343
 
@@ -269,7 +377,10 @@ class SentienceAgent(BaseAgent):
269
377
  for el in filtered_snap.elements[:50]
270
378
  ]
271
379
 
272
- self.tracer.emit(
380
+ _safe_tracer_call(
381
+ self.tracer,
382
+ "emit",
383
+ self.verbose,
273
384
  "action",
274
385
  {
275
386
  "action": result.action,
@@ -302,13 +413,105 @@ class SentienceAgent(BaseAgent):
302
413
 
303
414
  # Emit step completion trace event if tracer is enabled
304
415
  if self.tracer:
305
- self.tracer.emit(
306
- "step_end",
307
- {
308
- "success": result.success,
309
- "duration_ms": duration_ms,
310
- "action": result.action,
416
+ # Get pre_url from step_start (stored in tracer or use current)
417
+ pre_url = snap.url
418
+ post_url = self.browser.page.url if self.browser.page else None
419
+
420
+ # Compute snapshot digest (simplified - use URL + timestamp)
421
+ snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"
422
+
423
+ # Build LLM data
424
+ llm_response_text = llm_response.content
425
+ llm_response_hash = f"sha256:{self._compute_hash(llm_response_text)}"
426
+ llm_data = {
427
+ "response_text": llm_response_text,
428
+ "response_hash": llm_response_hash,
429
+ "usage": {
430
+ "prompt_tokens": llm_response.prompt_tokens or 0,
431
+ "completion_tokens": llm_response.completion_tokens or 0,
432
+ "total_tokens": llm_response.total_tokens or 0,
311
433
  },
434
+ }
435
+
436
+ # Build exec data
437
+ exec_data = {
438
+ "success": result.success,
439
+ "action": result.action,
440
+ "outcome": result.outcome
441
+ or (
442
+ f"Action {result.action} executed successfully"
443
+ if result.success
444
+ else f"Action {result.action} failed"
445
+ ),
446
+ "duration_ms": duration_ms,
447
+ }
448
+
449
+ # Add optional exec fields
450
+ if result.element_id is not None:
451
+ exec_data["element_id"] = result.element_id
452
+ # Add bounding box if element found
453
+ bbox = self._get_element_bbox(result.element_id, snap)
454
+ if bbox:
455
+ exec_data["bounding_box"] = bbox
456
+ if result.text is not None:
457
+ exec_data["text"] = result.text
458
+ if result.key is not None:
459
+ exec_data["key"] = result.key
460
+ if result.error is not None:
461
+ exec_data["error"] = result.error
462
+
463
+ # Build verify data (simplified - based on success and url_changed)
464
+ verify_passed = result.success and (
465
+ result.url_changed or result.action != "click"
466
+ )
467
+ verify_signals = {
468
+ "url_changed": result.url_changed or False,
469
+ }
470
+ if result.error:
471
+ verify_signals["error"] = result.error
472
+
473
+ # Add elements_found array if element was targeted
474
+ if result.element_id is not None:
475
+ bbox = self._get_element_bbox(result.element_id, snap)
476
+ if bbox:
477
+ verify_signals["elements_found"] = [
478
+ {
479
+ "label": f"Element {result.element_id}",
480
+ "bounding_box": bbox,
481
+ }
482
+ ]
483
+
484
+ verify_data = {
485
+ "passed": verify_passed,
486
+ "signals": verify_signals,
487
+ }
488
+
489
+ # Build elements data for pre field (include diff_status from snap_with_diff)
490
+ # Use the same format as build_snapshot_event for consistency
491
+ snapshot_event_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
492
+ pre_elements = snapshot_event_data.get("elements", [])
493
+
494
+ # Build complete step_end event
495
+ step_end_data = TraceEventBuilder.build_step_end_event(
496
+ step_id=step_id,
497
+ step_index=self._step_count,
498
+ goal=goal,
499
+ attempt=attempt,
500
+ pre_url=pre_url,
501
+ post_url=post_url,
502
+ snapshot_digest=snapshot_digest,
503
+ llm_data=llm_data,
504
+ exec_data=exec_data,
505
+ verify_data=verify_data,
506
+ pre_elements=pre_elements,
507
+ )
508
+
509
+ _safe_tracer_call(
510
+ self.tracer,
511
+ "emit",
512
+ self.verbose,
513
+ "step_end",
514
+ step_end_data,
312
515
  step_id=step_id,
313
516
  )
314
517
 
@@ -317,7 +520,14 @@ class SentienceAgent(BaseAgent):
317
520
  except Exception as e:
318
521
  # Emit error trace event if tracer is enabled
319
522
  if self.tracer:
320
- self.tracer.emit_error(step_id=step_id, error=str(e), attempt=attempt)
523
+ _safe_tracer_call(
524
+ self.tracer,
525
+ "emit_error",
526
+ self.verbose,
527
+ step_id=step_id,
528
+ error=str(e),
529
+ attempt=attempt,
530
+ )
321
531
 
322
532
  if attempt < max_retries:
323
533
  if self.verbose:
@@ -346,195 +556,573 @@ class SentienceAgent(BaseAgent):
346
556
  )
347
557
  raise RuntimeError(f"Failed after {max_retries} retries: {e}")
348
558
 
349
- def _build_context(self, snap: Snapshot, goal: str) -> str:
559
+ def _track_tokens(self, goal: str, llm_response: LLMResponse):
350
560
  """
351
- Convert snapshot elements to token-efficient prompt string
352
-
353
- Format: [ID] <role> "text" {cues} @ (x,y) (Imp:score)
561
+ Track token usage for analytics
354
562
 
355
563
  Args:
356
- snap: Snapshot object
357
- goal: User goal (for context)
564
+ goal: User goal
565
+ llm_response: LLM response with token usage
566
+ """
567
+ if llm_response.prompt_tokens:
568
+ self._token_usage_raw["total_prompt_tokens"] += llm_response.prompt_tokens
569
+ if llm_response.completion_tokens:
570
+ self._token_usage_raw["total_completion_tokens"] += llm_response.completion_tokens
571
+ if llm_response.total_tokens:
572
+ self._token_usage_raw["total_tokens"] += llm_response.total_tokens
573
+
574
+ self._token_usage_raw["by_action"].append(
575
+ {
576
+ "goal": goal,
577
+ "prompt_tokens": llm_response.prompt_tokens or 0,
578
+ "completion_tokens": llm_response.completion_tokens or 0,
579
+ "total_tokens": llm_response.total_tokens or 0,
580
+ "model": llm_response.model_name,
581
+ }
582
+ )
583
+
584
+ def get_token_stats(self) -> TokenStats:
585
+ """
586
+ Get token usage statistics
358
587
 
359
588
  Returns:
360
- Formatted element context string
589
+ TokenStats with token usage breakdown
361
590
  """
362
- lines = []
363
- # Note: elements are already filtered by filter_elements() in act()
364
- for el in snap.elements:
365
- # Extract visual cues
366
- cues = []
367
- if el.visual_cues.is_primary:
368
- cues.append("PRIMARY")
369
- if el.visual_cues.is_clickable:
370
- cues.append("CLICKABLE")
371
- if el.visual_cues.background_color_name:
372
- cues.append(f"color:{el.visual_cues.background_color_name}")
373
-
374
- # Format element line
375
- cues_str = f" {{{','.join(cues)}}}" if cues else ""
376
- text_preview = (
377
- (el.text[:50] + "...") if el.text and len(el.text) > 50 else (el.text or "")
378
- )
591
+ by_action = [ActionTokenUsage(**action) for action in self._token_usage_raw["by_action"]]
592
+ return TokenStats(
593
+ total_prompt_tokens=self._token_usage_raw["total_prompt_tokens"],
594
+ total_completion_tokens=self._token_usage_raw["total_completion_tokens"],
595
+ total_tokens=self._token_usage_raw["total_tokens"],
596
+ by_action=by_action,
597
+ )
379
598
 
380
- lines.append(
381
- f'[{el.id}] <{el.role}> "{text_preview}"{cues_str} '
382
- f"@ ({int(el.bbox.x)},{int(el.bbox.y)}) (Imp:{el.importance})"
383
- )
599
+ def get_history(self) -> list[ActionHistory]:
600
+ """
601
+ Get execution history
602
+
603
+ Returns:
604
+ List of ActionHistory entries
605
+ """
606
+ return [ActionHistory(**h) for h in self.history]
384
607
 
385
- return "\n".join(lines)
608
+ def clear_history(self) -> None:
609
+ """Clear execution history and reset token counters"""
610
+ self.history.clear()
611
+ self._token_usage_raw = {
612
+ "total_prompt_tokens": 0,
613
+ "total_completion_tokens": 0,
614
+ "total_tokens": 0,
615
+ "by_action": [],
616
+ }
386
617
 
387
- def _extract_action_from_response(self, response: str) -> str:
618
+ def filter_elements(self, snapshot: Snapshot, goal: str | None = None) -> list[Element]:
388
619
  """
389
- Extract action command from LLM response, handling cases where
390
- the LLM adds extra explanation despite instructions.
620
+ Filter elements from snapshot based on goal context.
621
+
622
+ This implementation uses ElementFilter to apply goal-based keyword matching
623
+ to boost relevant elements and filters out irrelevant ones.
391
624
 
392
625
  Args:
393
- response: Raw LLM response text
626
+ snapshot: Current page snapshot
627
+ goal: User's goal (can inform filtering)
394
628
 
395
629
  Returns:
396
- Cleaned action command string
630
+ Filtered list of elements
397
631
  """
398
- import re
632
+ return ElementFilter.filter_by_goal(snapshot, goal, self.default_snapshot_limit)
399
633
 
400
- # Remove markdown code blocks if present
401
- response = re.sub(r"```[\w]*\n?", "", response)
402
- response = response.strip()
403
634
 
404
- # Try to find action patterns in the response
405
- # Pattern matches: CLICK(123), TYPE(123, "text"), PRESS("key"), FINISH()
406
- action_pattern = r'(CLICK\s*\(\s*\d+\s*\)|TYPE\s*\(\s*\d+\s*,\s*["\'].*?["\']\s*\)|PRESS\s*\(\s*["\'].*?["\']\s*\)|FINISH\s*\(\s*\))'
635
+ class SentienceAgentAsync(BaseAgentAsync):
636
+ """
637
+ High-level async agent that combines Sentience SDK with any LLM provider.
407
638
 
408
- match = re.search(action_pattern, response, re.IGNORECASE)
409
- if match:
410
- return match.group(1)
639
+ Uses observe-think-act loop to execute natural language commands:
640
+ 1. OBSERVE: Get snapshot of current page state
641
+ 2. THINK: Query LLM to decide next action
642
+ 3. ACT: Execute action using SDK
411
643
 
412
- # If no pattern match, return the original response (will likely fail parsing)
413
- return response
644
+ Example:
645
+ >>> from sentience.async_api import AsyncSentienceBrowser
646
+ >>> from sentience.agent import SentienceAgentAsync
647
+ >>> from sentience.llm_provider import OpenAIProvider
648
+ >>>
649
+ >>> async with AsyncSentienceBrowser() as browser:
650
+ >>> await browser.goto("https://google.com")
651
+ >>> llm = OpenAIProvider(api_key="openai_key", model="gpt-4o")
652
+ >>> agent = SentienceAgentAsync(browser, llm)
653
+ >>> await agent.act("Click the search box")
654
+ >>> await agent.act("Type 'magic mouse' into the search field")
655
+ >>> await agent.act("Press Enter key")
656
+ """
414
657
 
415
- def _query_llm(self, dom_context: str, goal: str) -> LLMResponse:
658
+ def __init__(
659
+ self,
660
+ browser: AsyncSentienceBrowser,
661
+ llm: LLMProvider,
662
+ default_snapshot_limit: int = 50,
663
+ verbose: bool = True,
664
+ tracer: Optional["Tracer"] = None,
665
+ config: Optional["AgentConfig"] = None,
666
+ ):
416
667
  """
417
- Query LLM with standardized prompt template
668
+ Initialize Sentience Agent (async)
418
669
 
419
670
  Args:
420
- dom_context: Formatted element context
421
- goal: User goal
422
-
423
- Returns:
424
- LLMResponse from LLM provider
671
+ browser: AsyncSentienceBrowser instance
672
+ llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.)
673
+ default_snapshot_limit: Default maximum elements to include in context (default: 50)
674
+ verbose: Print execution logs (default: True)
675
+ tracer: Optional Tracer instance for execution tracking (default: None)
676
+ config: Optional AgentConfig for advanced configuration (default: None)
425
677
  """
426
- system_prompt = f"""You are an AI web automation agent.
427
-
428
- GOAL: {goal}
429
-
430
- VISIBLE ELEMENTS (sorted by importance):
431
- {dom_context}
432
-
433
- VISUAL CUES EXPLAINED:
434
- - {{PRIMARY}}: Main call-to-action element on the page
435
- - {{CLICKABLE}}: Element is clickable
436
- - {{color:X}}: Background color name
437
-
438
- CRITICAL RESPONSE FORMAT:
439
- You MUST respond with ONLY ONE of these exact action formats:
440
- - CLICK(id) - Click element by ID
441
- - TYPE(id, "text") - Type text into element
442
- - PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
443
- - FINISH() - Task complete
444
-
445
- DO NOT include any explanation, reasoning, or natural language.
446
- DO NOT use markdown formatting or code blocks.
447
- DO NOT say "The next step is..." or anything similar.
448
-
449
- CORRECT Examples:
450
- CLICK(42)
451
- TYPE(15, "magic mouse")
452
- PRESS("Enter")
453
- FINISH()
454
-
455
- INCORRECT Examples (DO NOT DO THIS):
456
- "The next step is to click..."
457
- "I will type..."
458
- ```CLICK(42)```
459
- """
678
+ self.browser = browser
679
+ self.llm = llm
680
+ self.default_snapshot_limit = default_snapshot_limit
681
+ self.verbose = verbose
682
+ self.tracer = tracer
683
+ self.config = config or AgentConfig()
684
+
685
+ # Initialize handlers
686
+ self.llm_handler = LLMInteractionHandler(llm)
687
+ self.action_executor = ActionExecutor(browser)
688
+
689
+ # Screenshot sequence counter
690
+ # Execution history
691
+ self.history: list[dict[str, Any]] = []
692
+
693
+ # Token usage tracking (will be converted to TokenStats on get_token_stats())
694
+ self._token_usage_raw = {
695
+ "total_prompt_tokens": 0,
696
+ "total_completion_tokens": 0,
697
+ "total_tokens": 0,
698
+ "by_action": [],
699
+ }
460
700
 
461
- user_prompt = "Return the single action command:"
701
+ # Step counter for tracing
702
+ self._step_count = 0
703
+
704
+ # Previous snapshot for diff detection
705
+ self._previous_snapshot: Snapshot | None = None
462
706
 
463
- return self.llm.generate(system_prompt, user_prompt, temperature=0.0)
707
+ def _compute_hash(self, text: str) -> str:
708
+ """Compute SHA256 hash of text."""
709
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()
464
710
 
465
- def _execute_action(self, action_str: str, snap: Snapshot) -> dict[str, Any]:
711
+ def _get_element_bbox(self, element_id: int | None, snap: Snapshot) -> dict[str, float] | None:
712
+ """Get bounding box for an element from snapshot."""
713
+ if element_id is None:
714
+ return None
715
+ for el in snap.elements:
716
+ if el.id == element_id:
717
+ return {
718
+ "x": el.bbox.x,
719
+ "y": el.bbox.y,
720
+ "width": el.bbox.width,
721
+ "height": el.bbox.height,
722
+ }
723
+ return None
724
+
725
+ async def act( # noqa: C901
726
+ self,
727
+ goal: str,
728
+ max_retries: int = 2,
729
+ snapshot_options: SnapshotOptions | None = None,
730
+ ) -> AgentActionResult:
466
731
  """
467
- Parse action string and execute SDK call
732
+ Execute a high-level goal using observe → think → act loop (async)
468
733
 
469
734
  Args:
470
- action_str: Action string from LLM (e.g., "CLICK(42)")
471
- snap: Current snapshot (for context)
735
+ goal: Natural language instruction (e.g., "Click the Sign In button")
736
+ max_retries: Number of retries on failure (default: 2)
737
+ snapshot_options: Optional SnapshotOptions for this specific action
472
738
 
473
739
  Returns:
474
- Execution result dictionary
740
+ AgentActionResult with execution details
741
+
742
+ Example:
743
+ >>> result = await agent.act("Click the search box")
744
+ >>> print(result.success, result.action, result.element_id)
745
+ True click 42
475
746
  """
476
- # Parse CLICK(42)
477
- if match := re.match(r"CLICK\s*\(\s*(\d+)\s*\)", action_str, re.IGNORECASE):
478
- element_id = int(match.group(1))
479
- result = click(self.browser, element_id)
480
- return {
481
- "success": result.success,
482
- "action": "click",
483
- "element_id": element_id,
484
- "outcome": result.outcome,
485
- "url_changed": result.url_changed,
486
- }
747
+ if self.verbose:
748
+ print(f"\n{'=' * 70}")
749
+ print(f"🤖 Agent Goal: {goal}")
750
+ print(f"{'=' * 70}")
487
751
 
488
- # Parse TYPE(42, "hello world")
489
- elif match := re.match(
490
- r'TYPE\s*\(\s*(\d+)\s*,\s*["\']([^"\']*)["\']\s*\)',
491
- action_str,
492
- re.IGNORECASE,
493
- ):
494
- element_id = int(match.group(1))
495
- text = match.group(2)
496
- result = type_text(self.browser, element_id, text)
497
- return {
498
- "success": result.success,
499
- "action": "type",
500
- "element_id": element_id,
501
- "text": text,
502
- "outcome": result.outcome,
503
- }
752
+ # Generate step ID for tracing
753
+ self._step_count += 1
754
+ step_id = f"step-{self._step_count}"
504
755
 
505
- # Parse PRESS("Enter")
506
- elif match := re.match(r'PRESS\s*\(\s*["\']([^"\']+)["\']\s*\)', action_str, re.IGNORECASE):
507
- key = match.group(1)
508
- result = press(self.browser, key)
509
- return {
510
- "success": result.success,
511
- "action": "press",
512
- "key": key,
513
- "outcome": result.outcome,
514
- }
756
+ # Emit step_start trace event if tracer is enabled
757
+ if self.tracer:
758
+ pre_url = self.browser.page.url if self.browser.page else None
759
+ _safe_tracer_call(
760
+ self.tracer,
761
+ "emit_step_start",
762
+ self.verbose,
763
+ step_id=step_id,
764
+ step_index=self._step_count,
765
+ goal=goal,
766
+ attempt=0,
767
+ pre_url=pre_url,
768
+ )
515
769
 
516
- # Parse FINISH()
517
- elif re.match(r"FINISH\s*\(\s*\)", action_str, re.IGNORECASE):
518
- return {
519
- "success": True,
520
- "action": "finish",
521
- "message": "Task marked as complete",
522
- }
770
+ for attempt in range(max_retries + 1):
771
+ try:
772
+ # 1. OBSERVE: Get refined semantic snapshot
773
+ start_time = time.time()
523
774
 
524
- else:
525
- raise ValueError(
526
- f"Unknown action format: {action_str}\n"
527
- f'Expected: CLICK(id), TYPE(id, "text"), PRESS("key"), or FINISH()'
528
- )
775
+ # Use provided options or create default
776
+ snap_opts = snapshot_options or SnapshotOptions(limit=self.default_snapshot_limit)
777
+ # Only set goal if not already provided
778
+ if snap_opts.goal is None:
779
+ snap_opts.goal = goal
529
780
 
530
- def _track_tokens(self, goal: str, llm_response: LLMResponse):
531
- """
532
- Track token usage for analytics
781
+ # Apply AgentConfig screenshot settings if not overridden by snapshot_options
782
+ # Only apply if snapshot_options wasn't provided OR if screenshot wasn't explicitly set
783
+ # (snapshot_options.screenshot defaults to False, so we check if it's still False)
784
+ if self.config and (snapshot_options is None or snap_opts.screenshot is False):
785
+ if self.config.capture_screenshots:
786
+ # Create ScreenshotConfig from AgentConfig
787
+ snap_opts.screenshot = ScreenshotConfig(
788
+ format=self.config.screenshot_format,
789
+ quality=(
790
+ self.config.screenshot_quality
791
+ if self.config.screenshot_format == "jpeg"
792
+ else None
793
+ ),
794
+ )
795
+ else:
796
+ snap_opts.screenshot = False
797
+ # Apply show_overlay from AgentConfig
798
+ # Note: User can override by explicitly passing show_overlay in snapshot_options
799
+ snap_opts.show_overlay = self.config.show_overlay
533
800
 
534
- Args:
535
- goal: User goal
536
- llm_response: LLM response with token usage
537
- """
801
+ # Call snapshot with options object (matches TypeScript API)
802
+ snap = await snapshot_async(self.browser, snap_opts)
803
+
804
+ if snap.status != "success":
805
+ raise RuntimeError(f"Snapshot failed: {snap.error}")
806
+
807
+ # Compute diff_status by comparing with previous snapshot
808
+ elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)
809
+
810
+ # Create snapshot with diff_status populated
811
+ snap_with_diff = Snapshot(
812
+ status=snap.status,
813
+ timestamp=snap.timestamp,
814
+ url=snap.url,
815
+ viewport=snap.viewport,
816
+ elements=elements_with_diff,
817
+ screenshot=snap.screenshot,
818
+ screenshot_format=snap.screenshot_format,
819
+ error=snap.error,
820
+ )
821
+
822
+ # Update previous snapshot for next comparison
823
+ self._previous_snapshot = snap
824
+
825
+ # Apply element filtering based on goal
826
+ filtered_elements = self.filter_elements(snap_with_diff, goal)
827
+
828
+ # Emit snapshot trace event if tracer is enabled
829
+ if self.tracer:
830
+ # Build snapshot event data (use snap_with_diff to include diff_status)
831
+ snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
832
+
833
+ # Always include screenshot in trace event for studio viewer compatibility
834
+ # CloudTraceSink will extract and upload screenshots separately, then remove
835
+ # screenshot_base64 from events before uploading the trace file.
836
+ if snap.screenshot:
837
+ # Extract base64 string from data URL if needed
838
+ if snap.screenshot.startswith("data:image"):
839
+ # Format: "data:image/jpeg;base64,{base64_string}"
840
+ screenshot_base64 = (
841
+ snap.screenshot.split(",", 1)[1]
842
+ if "," in snap.screenshot
843
+ else snap.screenshot
844
+ )
845
+ else:
846
+ screenshot_base64 = snap.screenshot
847
+
848
+ snapshot_data["screenshot_base64"] = screenshot_base64
849
+ if snap.screenshot_format:
850
+ snapshot_data["screenshot_format"] = snap.screenshot_format
851
+
852
+ _safe_tracer_call(
853
+ self.tracer,
854
+ "emit",
855
+ self.verbose,
856
+ "snapshot",
857
+ snapshot_data,
858
+ step_id=step_id,
859
+ )
860
+
861
+ # Create filtered snapshot (use snap_with_diff to preserve metadata)
862
+ filtered_snap = Snapshot(
863
+ status=snap_with_diff.status,
864
+ timestamp=snap_with_diff.timestamp,
865
+ url=snap_with_diff.url,
866
+ viewport=snap_with_diff.viewport,
867
+ elements=filtered_elements,
868
+ screenshot=snap_with_diff.screenshot,
869
+ screenshot_format=snap_with_diff.screenshot_format,
870
+ error=snap_with_diff.error,
871
+ )
872
+
873
+ # 2. GROUND: Format elements for LLM context
874
+ context = self.llm_handler.build_context(filtered_snap, goal)
875
+
876
+ # 3. THINK: Query LLM for next action
877
+ llm_response = self.llm_handler.query_llm(context, goal)
878
+
879
+ # Emit LLM query trace event if tracer is enabled
880
+ if self.tracer:
881
+ _safe_tracer_call(
882
+ self.tracer,
883
+ "emit",
884
+ self.verbose,
885
+ "llm_query",
886
+ {
887
+ "prompt_tokens": llm_response.prompt_tokens,
888
+ "completion_tokens": llm_response.completion_tokens,
889
+ "model": llm_response.model_name,
890
+ "response": llm_response.content[:200], # Truncate for brevity
891
+ },
892
+ step_id=step_id,
893
+ )
894
+
895
+ if self.verbose:
896
+ print(f"🧠 LLM Decision: {llm_response.content}")
897
+
898
+ # Track token usage
899
+ self._track_tokens(goal, llm_response)
900
+
901
+ # Parse action from LLM response
902
+ action_str = self.llm_handler.extract_action(llm_response.content)
903
+
904
+ # 4. EXECUTE: Parse and run action
905
+ result_dict = await self.action_executor.execute_async(action_str, filtered_snap)
906
+
907
+ duration_ms = int((time.time() - start_time) * 1000)
908
+
909
+ # Create AgentActionResult from execution result
910
+ result = AgentActionResult(
911
+ success=result_dict["success"],
912
+ action=result_dict["action"],
913
+ goal=goal,
914
+ duration_ms=duration_ms,
915
+ attempt=attempt,
916
+ element_id=result_dict.get("element_id"),
917
+ text=result_dict.get("text"),
918
+ key=result_dict.get("key"),
919
+ outcome=result_dict.get("outcome"),
920
+ url_changed=result_dict.get("url_changed"),
921
+ error=result_dict.get("error"),
922
+ message=result_dict.get("message"),
923
+ )
924
+
925
+ # Emit action execution trace event if tracer is enabled
926
+ if self.tracer:
927
+ post_url = self.browser.page.url if self.browser.page else None
928
+
929
+ # Include element data for live overlay visualization
930
+ elements_data = [
931
+ {
932
+ "id": el.id,
933
+ "bbox": {
934
+ "x": el.bbox.x,
935
+ "y": el.bbox.y,
936
+ "width": el.bbox.width,
937
+ "height": el.bbox.height,
938
+ },
939
+ "role": el.role,
940
+ "text": el.text[:50] if el.text else "",
941
+ }
942
+ for el in filtered_snap.elements[:50]
943
+ ]
944
+
945
+ _safe_tracer_call(
946
+ self.tracer,
947
+ "emit",
948
+ self.verbose,
949
+ "action",
950
+ {
951
+ "action": result.action,
952
+ "element_id": result.element_id,
953
+ "success": result.success,
954
+ "outcome": result.outcome,
955
+ "duration_ms": duration_ms,
956
+ "post_url": post_url,
957
+ "elements": elements_data, # Add element data for overlay
958
+ "target_element_id": result.element_id, # Highlight target in red
959
+ },
960
+ step_id=step_id,
961
+ )
962
+
963
+ # 5. RECORD: Track history
964
+ self.history.append(
965
+ {
966
+ "goal": goal,
967
+ "action": action_str,
968
+ "result": result.model_dump(), # Store as dict
969
+ "success": result.success,
970
+ "attempt": attempt,
971
+ "duration_ms": duration_ms,
972
+ }
973
+ )
974
+
975
+ if self.verbose:
976
+ status = "✅" if result.success else "❌"
977
+ print(f"{status} Completed in {duration_ms}ms")
978
+
979
+ # Emit step completion trace event if tracer is enabled
980
+ if self.tracer:
981
+ # Get pre_url from step_start (stored in tracer or use current)
982
+ pre_url = snap.url
983
+ post_url = self.browser.page.url if self.browser.page else None
984
+
985
+ # Compute snapshot digest (simplified - use URL + timestamp)
986
+ snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"
987
+
988
+ # Build LLM data
989
+ llm_response_text = llm_response.content
990
+ llm_response_hash = f"sha256:{self._compute_hash(llm_response_text)}"
991
+ llm_data = {
992
+ "response_text": llm_response_text,
993
+ "response_hash": llm_response_hash,
994
+ "usage": {
995
+ "prompt_tokens": llm_response.prompt_tokens or 0,
996
+ "completion_tokens": llm_response.completion_tokens or 0,
997
+ "total_tokens": llm_response.total_tokens or 0,
998
+ },
999
+ }
1000
+
1001
+ # Build exec data
1002
+ exec_data = {
1003
+ "success": result.success,
1004
+ "action": result.action,
1005
+ "outcome": result.outcome
1006
+ or (
1007
+ f"Action {result.action} executed successfully"
1008
+ if result.success
1009
+ else f"Action {result.action} failed"
1010
+ ),
1011
+ "duration_ms": duration_ms,
1012
+ }
1013
+
1014
+ # Add optional exec fields
1015
+ if result.element_id is not None:
1016
+ exec_data["element_id"] = result.element_id
1017
+ # Add bounding box if element found
1018
+ bbox = self._get_element_bbox(result.element_id, snap)
1019
+ if bbox:
1020
+ exec_data["bounding_box"] = bbox
1021
+ if result.text is not None:
1022
+ exec_data["text"] = result.text
1023
+ if result.key is not None:
1024
+ exec_data["key"] = result.key
1025
+ if result.error is not None:
1026
+ exec_data["error"] = result.error
1027
+
1028
+ # Build verify data (simplified - based on success and url_changed)
1029
+ verify_passed = result.success and (
1030
+ result.url_changed or result.action != "click"
1031
+ )
1032
+ verify_signals = {
1033
+ "url_changed": result.url_changed or False,
1034
+ }
1035
+ if result.error:
1036
+ verify_signals["error"] = result.error
1037
+
1038
+ # Add elements_found array if element was targeted
1039
+ if result.element_id is not None:
1040
+ bbox = self._get_element_bbox(result.element_id, snap)
1041
+ if bbox:
1042
+ verify_signals["elements_found"] = [
1043
+ {
1044
+ "label": f"Element {result.element_id}",
1045
+ "bounding_box": bbox,
1046
+ }
1047
+ ]
1048
+
1049
+ verify_data = {
1050
+ "passed": verify_passed,
1051
+ "signals": verify_signals,
1052
+ }
1053
+
1054
+ # Build elements data for pre field (include diff_status from snap_with_diff)
1055
+ # Use the same format as build_snapshot_event for consistency
1056
+ snapshot_event_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
1057
+ pre_elements = snapshot_event_data.get("elements", [])
1058
+
1059
+ # Build complete step_end event
1060
+ step_end_data = TraceEventBuilder.build_step_end_event(
1061
+ step_id=step_id,
1062
+ step_index=self._step_count,
1063
+ goal=goal,
1064
+ attempt=attempt,
1065
+ pre_url=pre_url,
1066
+ post_url=post_url,
1067
+ snapshot_digest=snapshot_digest,
1068
+ llm_data=llm_data,
1069
+ exec_data=exec_data,
1070
+ verify_data=verify_data,
1071
+ pre_elements=pre_elements,
1072
+ )
1073
+
1074
+ _safe_tracer_call(
1075
+ self.tracer,
1076
+ "emit",
1077
+ self.verbose,
1078
+ "step_end",
1079
+ step_end_data,
1080
+ step_id=step_id,
1081
+ )
1082
+
1083
+ return result
1084
+
1085
+ except Exception as e:
1086
+ # Emit error trace event if tracer is enabled
1087
+ if self.tracer:
1088
+ _safe_tracer_call(
1089
+ self.tracer,
1090
+ "emit_error",
1091
+ self.verbose,
1092
+ step_id=step_id,
1093
+ error=str(e),
1094
+ attempt=attempt,
1095
+ )
1096
+
1097
+ if attempt < max_retries:
1098
+ if self.verbose:
1099
+ print(f"⚠️ Retry {attempt + 1}/{max_retries}: {e}")
1100
+ await asyncio.sleep(1.0) # Brief delay before retry
1101
+ continue
1102
+ else:
1103
+ # Create error result
1104
+ error_result = AgentActionResult(
1105
+ success=False,
1106
+ action="error",
1107
+ goal=goal,
1108
+ duration_ms=0,
1109
+ attempt=attempt,
1110
+ error=str(e),
1111
+ )
1112
+ self.history.append(
1113
+ {
1114
+ "goal": goal,
1115
+ "action": "error",
1116
+ "result": error_result.model_dump(),
1117
+ "success": False,
1118
+ "attempt": attempt,
1119
+ "duration_ms": 0,
1120
+ }
1121
+ )
1122
+ raise RuntimeError(f"Failed after {max_retries} retries: {e}")
1123
+
1124
+ def _track_tokens(self, goal: str, llm_response: LLMResponse):
1125
+ """Track token usage for analytics (same as sync version)"""
538
1126
  if llm_response.prompt_tokens:
539
1127
  self._token_usage_raw["total_prompt_tokens"] += llm_response.prompt_tokens
540
1128
  if llm_response.completion_tokens:
@@ -553,12 +1141,7 @@ INCORRECT Examples (DO NOT DO THIS):
553
1141
  )
554
1142
 
555
1143
  def get_token_stats(self) -> TokenStats:
556
- """
557
- Get token usage statistics
558
-
559
- Returns:
560
- TokenStats with token usage breakdown
561
- """
1144
+ """Get token usage statistics (same as sync version)"""
562
1145
  by_action = [ActionTokenUsage(**action) for action in self._token_usage_raw["by_action"]]
563
1146
  return TokenStats(
564
1147
  total_prompt_tokens=self._token_usage_raw["total_prompt_tokens"],
@@ -568,16 +1151,11 @@ INCORRECT Examples (DO NOT DO THIS):
568
1151
  )
569
1152
 
570
1153
  def get_history(self) -> list[ActionHistory]:
571
- """
572
- Get execution history
573
-
574
- Returns:
575
- List of ActionHistory entries
576
- """
1154
+ """Get execution history (same as sync version)"""
577
1155
  return [ActionHistory(**h) for h in self.history]
578
1156
 
579
1157
  def clear_history(self) -> None:
580
- """Clear execution history and reset token counters"""
1158
+ """Clear execution history and reset token counters (same as sync version)"""
581
1159
  self.history.clear()
582
1160
  self._token_usage_raw = {
583
1161
  "total_prompt_tokens": 0,
@@ -590,8 +1168,8 @@ INCORRECT Examples (DO NOT DO THIS):
590
1168
  """
591
1169
  Filter elements from snapshot based on goal context.
592
1170
 
593
- This default implementation applies goal-based keyword matching to boost
594
- relevant elements and filters out irrelevant ones.
1171
+ This implementation uses ElementFilter to apply goal-based keyword matching
1172
+ to boost relevant elements and filters out irrelevant ones.
595
1173
 
596
1174
  Args:
597
1175
  snapshot: Current page snapshot
@@ -600,73 +1178,4 @@ INCORRECT Examples (DO NOT DO THIS):
600
1178
  Returns:
601
1179
  Filtered list of elements
602
1180
  """
603
- elements = snapshot.elements
604
-
605
- # If no goal provided, return all elements (up to limit)
606
- if not goal:
607
- return elements[: self.default_snapshot_limit]
608
-
609
- goal_lower = goal.lower()
610
-
611
- # Extract keywords from goal
612
- keywords = self._extract_keywords(goal_lower)
613
-
614
- # Boost elements matching goal keywords
615
- scored_elements = []
616
- for el in elements:
617
- score = el.importance
618
-
619
- # Boost if element text matches goal
620
- if el.text and any(kw in el.text.lower() for kw in keywords):
621
- score += 0.3
622
-
623
- # Boost if role matches goal intent
624
- if "click" in goal_lower and el.visual_cues.is_clickable:
625
- score += 0.2
626
- if "type" in goal_lower and el.role in ["textbox", "searchbox"]:
627
- score += 0.2
628
- if "search" in goal_lower:
629
- # Filter out non-interactive elements for search tasks
630
- if el.role in ["link", "img"] and not el.visual_cues.is_primary:
631
- score -= 0.5
632
-
633
- scored_elements.append((score, el))
634
-
635
- # Re-sort by boosted score
636
- scored_elements.sort(key=lambda x: x[0], reverse=True)
637
- elements = [el for _, el in scored_elements]
638
-
639
- return elements[: self.default_snapshot_limit]
640
-
641
- def _extract_keywords(self, text: str) -> list[str]:
642
- """
643
- Extract meaningful keywords from goal text
644
-
645
- Args:
646
- text: Text to extract keywords from
647
-
648
- Returns:
649
- List of keywords
650
- """
651
- stopwords = {
652
- "the",
653
- "a",
654
- "an",
655
- "and",
656
- "or",
657
- "but",
658
- "in",
659
- "on",
660
- "at",
661
- "to",
662
- "for",
663
- "of",
664
- "with",
665
- "by",
666
- "from",
667
- "as",
668
- "is",
669
- "was",
670
- }
671
- words = text.split()
672
- return [w for w in words if w not in stopwords and len(w) > 2]
1181
+ return ElementFilter.filter_by_goal(snapshot, goal, self.default_snapshot_limit)