sentienceapi 0.90.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. sentience/__init__.py +153 -0
  2. sentience/actions.py +439 -0
  3. sentience/agent.py +687 -0
  4. sentience/agent_config.py +43 -0
  5. sentience/base_agent.py +101 -0
  6. sentience/browser.py +409 -0
  7. sentience/cli.py +130 -0
  8. sentience/cloud_tracing.py +292 -0
  9. sentience/conversational_agent.py +509 -0
  10. sentience/expect.py +92 -0
  11. sentience/extension/background.js +233 -0
  12. sentience/extension/content.js +298 -0
  13. sentience/extension/injected_api.js +1473 -0
  14. sentience/extension/manifest.json +36 -0
  15. sentience/extension/pkg/sentience_core.d.ts +51 -0
  16. sentience/extension/pkg/sentience_core.js +529 -0
  17. sentience/extension/pkg/sentience_core_bg.wasm +0 -0
  18. sentience/extension/pkg/sentience_core_bg.wasm.d.ts +10 -0
  19. sentience/extension/release.json +115 -0
  20. sentience/extension/test-content.js +4 -0
  21. sentience/formatting.py +59 -0
  22. sentience/generator.py +202 -0
  23. sentience/inspector.py +185 -0
  24. sentience/llm_provider.py +431 -0
  25. sentience/models.py +406 -0
  26. sentience/overlay.py +115 -0
  27. sentience/query.py +303 -0
  28. sentience/read.py +96 -0
  29. sentience/recorder.py +369 -0
  30. sentience/schemas/trace_v1.json +216 -0
  31. sentience/screenshot.py +54 -0
  32. sentience/snapshot.py +282 -0
  33. sentience/text_search.py +150 -0
  34. sentience/trace_indexing/__init__.py +27 -0
  35. sentience/trace_indexing/index_schema.py +111 -0
  36. sentience/trace_indexing/indexer.py +363 -0
  37. sentience/tracer_factory.py +211 -0
  38. sentience/tracing.py +285 -0
  39. sentience/utils.py +296 -0
  40. sentience/wait.py +73 -0
  41. sentienceapi-0.90.11.dist-info/METADATA +878 -0
  42. sentienceapi-0.90.11.dist-info/RECORD +46 -0
  43. sentienceapi-0.90.11.dist-info/WHEEL +5 -0
  44. sentienceapi-0.90.11.dist-info/entry_points.txt +2 -0
  45. sentienceapi-0.90.11.dist-info/licenses/LICENSE.md +43 -0
  46. sentienceapi-0.90.11.dist-info/top_level.txt +1 -0
sentience/agent.py ADDED
@@ -0,0 +1,687 @@
1
+ """
2
+ Sentience Agent: High-level automation agent using LLM + SDK
3
+ Implements observe-think-act loop for natural language commands
4
+ """
5
+
6
+ import re
7
+ import time
8
+ from typing import TYPE_CHECKING, Any, Optional
9
+
10
+ from .actions import click, press, type_text
11
+ from .base_agent import BaseAgent
12
+ from .browser import SentienceBrowser
13
+ from .llm_provider import LLMProvider, LLMResponse
14
+ from .models import (
15
+ ActionHistory,
16
+ ActionTokenUsage,
17
+ AgentActionResult,
18
+ Element,
19
+ ScreenshotConfig,
20
+ Snapshot,
21
+ SnapshotOptions,
22
+ TokenStats,
23
+ )
24
+ from .snapshot import snapshot
25
+
26
+ if TYPE_CHECKING:
27
+ from .agent_config import AgentConfig
28
+ from .tracing import Tracer
29
+
30
+
31
class SentienceAgent(BaseAgent):
    """
    High-level automation agent pairing the Sentience SDK with an LLM provider.

    Each natural-language command is executed through an observe-think-act
    cycle:
      1. OBSERVE - take a snapshot of the current page state
      2. THINK   - ask the LLM which single action to perform next
      3. ACT     - run that action through the SDK

    Example:
        >>> from sentience import SentienceBrowser, SentienceAgent
        >>> from sentience.llm_provider import OpenAIProvider
        >>>
        >>> browser = SentienceBrowser(api_key="sentience_key")
        >>> llm = OpenAIProvider(api_key="openai_key", model="gpt-4o")
        >>> agent = SentienceAgent(browser, llm)
        >>>
        >>> with browser:
        >>>     browser.page.goto("https://google.com")
        >>>     agent.act("Click the search box")
        >>>     agent.act("Type 'magic mouse' into the search field")
        >>>     agent.act("Press Enter key")
    """

    def __init__(
        self,
        browser: SentienceBrowser,
        llm: LLMProvider,
        default_snapshot_limit: int = 50,
        verbose: bool = True,
        tracer: Optional["Tracer"] = None,
        config: Optional["AgentConfig"] = None,
    ):
        """
        Initialize Sentience Agent

        Args:
            browser: SentienceBrowser instance
            llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.)
            default_snapshot_limit: Default maximum elements to include in context (default: 50)
            verbose: Print execution logs (default: True)
            tracer: Optional Tracer instance for execution tracking (default: None)
            config: Optional AgentConfig for advanced configuration (default: None)
        """
        self.browser = browser
        self.llm = llm
        self.default_snapshot_limit = default_snapshot_limit
        self.verbose = verbose
        self.tracer = tracer
        self.config = config

        # Per-action execution log (appended to by act())
        self.history: list[dict[str, Any]] = []

        # Raw token counters; materialized into a TokenStats model by
        # get_token_stats()
        self._token_usage_raw = dict(
            total_prompt_tokens=0,
            total_completion_tokens=0,
            total_tokens=0,
            by_action=[],
        )

        # Monotonically increasing counter used to derive trace step IDs
        self._step_count = 0
96
    def act(  # noqa: C901
        self,
        goal: str,
        max_retries: int = 2,
        snapshot_options: SnapshotOptions | None = None,
    ) -> AgentActionResult:
        """
        Execute a high-level goal using observe → think → act loop

        Args:
            goal: Natural language instruction (e.g., "Click the Sign In button")
            max_retries: Number of retries on failure (default: 2)
            snapshot_options: Optional SnapshotOptions for this specific action

        Returns:
            AgentActionResult with execution details

        Raises:
            RuntimeError: When all max_retries + 1 attempts fail.

        Example:
            >>> result = agent.act("Click the search box")
            >>> print(result.success, result.action, result.element_id)
            True click 42
            >>> # Backward compatible dict access
            >>> print(result["element_id"])  # Works but shows deprecation warning
            42
        """
        if self.verbose:
            print(f"\n{'=' * 70}")
            print(f"🤖 Agent Goal: {goal}")
            print(f"{'=' * 70}")

        # Generate step ID for tracing (1-based, unique per act() call)
        self._step_count += 1
        step_id = f"step-{self._step_count}"

        # Emit step_start trace event if tracer is enabled
        if self.tracer:
            pre_url = self.browser.page.url if self.browser.page else None
            self.tracer.emit_step_start(
                step_id=step_id,
                step_index=self._step_count,
                goal=goal,
                attempt=0,
                pre_url=pre_url,
            )

        for attempt in range(max_retries + 1):
            try:
                # 1. OBSERVE: Get refined semantic snapshot
                start_time = time.time()

                # Use provided options or create default
                snap_opts = snapshot_options or SnapshotOptions(limit=self.default_snapshot_limit)
                # Only set goal if not already provided
                # NOTE(review): when the caller passed snapshot_options, this
                # mutates the caller's object in place — confirm this is
                # intended (a copy may be safer).
                if snap_opts.goal is None:
                    snap_opts.goal = goal

                # Convert screenshot config to dict if needed
                screenshot_param = snap_opts.screenshot
                if isinstance(snap_opts.screenshot, ScreenshotConfig):
                    screenshot_param = {
                        "format": snap_opts.screenshot.format,
                        "quality": snap_opts.screenshot.quality,
                    }

                # Call snapshot with converted parameters
                snap = snapshot(
                    self.browser,
                    screenshot=screenshot_param,
                    limit=snap_opts.limit,
                    filter=snap_opts.filter.model_dump() if snap_opts.filter else None,
                    use_api=snap_opts.use_api,
                    goal=snap_opts.goal,  # Pass goal to snapshot
                )

                if snap.status != "success":
                    raise RuntimeError(f"Snapshot failed: {snap.error}")

                # Apply element filtering based on goal
                filtered_elements = self.filter_elements(snap, goal)

                # Emit snapshot trace event if tracer is enabled
                if self.tracer:
                    # Include element data for live overlay visualization
                    # Use filtered_elements for overlay (only relevant elements)
                    elements_data = [
                        {
                            "id": el.id,
                            "bbox": {
                                "x": el.bbox.x,
                                "y": el.bbox.y,
                                "width": el.bbox.width,
                                "height": el.bbox.height,
                            },
                            "role": el.role,
                            "text": el.text[:50] if el.text else "",  # Truncate for brevity
                        }
                        for el in filtered_elements[:50]  # Limit to first 50 for performance
                    ]

                    self.tracer.emit(
                        "snapshot",
                        {
                            "url": snap.url,
                            "element_count": len(snap.elements),
                            "timestamp": snap.timestamp,
                            "elements": elements_data,  # Add element data for overlay
                        },
                        step_id=step_id,
                    )

                # Create filtered snapshot (same metadata, reduced element list)
                filtered_snap = Snapshot(
                    status=snap.status,
                    timestamp=snap.timestamp,
                    url=snap.url,
                    viewport=snap.viewport,
                    elements=filtered_elements,
                    screenshot=snap.screenshot,
                    screenshot_format=snap.screenshot_format,
                    error=snap.error,
                )

                # 2. GROUND: Format elements for LLM context
                context = self._build_context(filtered_snap, goal)

                # 3. THINK: Query LLM for next action
                llm_response = self._query_llm(context, goal)

                # Emit LLM query trace event if tracer is enabled
                if self.tracer:
                    self.tracer.emit(
                        "llm_query",
                        {
                            "prompt_tokens": llm_response.prompt_tokens,
                            "completion_tokens": llm_response.completion_tokens,
                            "model": llm_response.model_name,
                            "response": llm_response.content[:200],  # Truncate for brevity
                        },
                        step_id=step_id,
                    )

                if self.verbose:
                    print(f"🧠 LLM Decision: {llm_response.content}")

                # Track token usage
                self._track_tokens(goal, llm_response)

                # Parse action from LLM response (strips fences/prose)
                action_str = self._extract_action_from_response(llm_response.content)

                # 4. EXECUTE: Parse and run action
                result_dict = self._execute_action(action_str, filtered_snap)

                duration_ms = int((time.time() - start_time) * 1000)

                # Create AgentActionResult from execution result
                result = AgentActionResult(
                    success=result_dict["success"],
                    action=result_dict["action"],
                    goal=goal,
                    duration_ms=duration_ms,
                    attempt=attempt,
                    element_id=result_dict.get("element_id"),
                    text=result_dict.get("text"),
                    key=result_dict.get("key"),
                    outcome=result_dict.get("outcome"),
                    url_changed=result_dict.get("url_changed"),
                    error=result_dict.get("error"),
                    message=result_dict.get("message"),
                )

                # Emit action execution trace event if tracer is enabled
                if self.tracer:
                    post_url = self.browser.page.url if self.browser.page else None

                    # Include element data for live overlay visualization
                    elements_data = [
                        {
                            "id": el.id,
                            "bbox": {
                                "x": el.bbox.x,
                                "y": el.bbox.y,
                                "width": el.bbox.width,
                                "height": el.bbox.height,
                            },
                            "role": el.role,
                            "text": el.text[:50] if el.text else "",
                        }
                        for el in filtered_snap.elements[:50]
                    ]

                    self.tracer.emit(
                        "action",
                        {
                            "action": result.action,
                            "element_id": result.element_id,
                            "success": result.success,
                            "outcome": result.outcome,
                            "duration_ms": duration_ms,
                            "post_url": post_url,
                            "elements": elements_data,  # Add element data for overlay
                            "target_element_id": result.element_id,  # Highlight target in red
                        },
                        step_id=step_id,
                    )

                # 5. RECORD: Track history
                self.history.append(
                    {
                        "goal": goal,
                        "action": action_str,
                        "result": result.model_dump(),  # Store as dict
                        "success": result.success,
                        "attempt": attempt,
                        "duration_ms": duration_ms,
                    }
                )

                if self.verbose:
                    status = "✅" if result.success else "❌"
                    print(f"{status} Completed in {duration_ms}ms")

                # Emit step completion trace event if tracer is enabled
                if self.tracer:
                    self.tracer.emit(
                        "step_end",
                        {
                            "success": result.success,
                            "duration_ms": duration_ms,
                            "action": result.action,
                        },
                        step_id=step_id,
                    )

                return result

            except Exception as e:
                # Emit error trace event if tracer is enabled
                if self.tracer:
                    self.tracer.emit_error(step_id=step_id, error=str(e), attempt=attempt)

                if attempt < max_retries:
                    if self.verbose:
                        print(f"⚠️ Retry {attempt + 1}/{max_retries}: {e}")
                    time.sleep(1.0)  # Brief delay before retry
                    continue
                else:
                    # Final attempt failed: record an error result, then raise
                    error_result = AgentActionResult(
                        success=False,
                        action="error",
                        goal=goal,
                        duration_ms=0,
                        attempt=attempt,
                        error=str(e),
                    )
                    self.history.append(
                        {
                            "goal": goal,
                            "action": "error",
                            "result": error_result.model_dump(),
                            "success": False,
                            "attempt": attempt,
                            "duration_ms": 0,
                        }
                    )
                    # NOTE(review): consider `raise ... from e` here to
                    # preserve the original traceback chain.
                    raise RuntimeError(f"Failed after {max_retries} retries: {e}")
364
+ def _build_context(self, snap: Snapshot, goal: str) -> str:
365
+ """
366
+ Convert snapshot elements to token-efficient prompt string
367
+
368
+ Format: [ID] <role> "text" {cues} @ (x,y) (Imp:score)
369
+
370
+ Args:
371
+ snap: Snapshot object
372
+ goal: User goal (for context)
373
+
374
+ Returns:
375
+ Formatted element context string
376
+ """
377
+ lines = []
378
+ # Note: elements are already filtered by filter_elements() in act()
379
+ for el in snap.elements:
380
+ # Extract visual cues
381
+ cues = []
382
+ if el.visual_cues.is_primary:
383
+ cues.append("PRIMARY")
384
+ if el.visual_cues.is_clickable:
385
+ cues.append("CLICKABLE")
386
+ if el.visual_cues.background_color_name:
387
+ cues.append(f"color:{el.visual_cues.background_color_name}")
388
+
389
+ # Format element line
390
+ cues_str = f" {{{','.join(cues)}}}" if cues else ""
391
+ text_preview = (
392
+ (el.text[:50] + "...") if el.text and len(el.text) > 50 else (el.text or "")
393
+ )
394
+
395
+ lines.append(
396
+ f'[{el.id}] <{el.role}> "{text_preview}"{cues_str} '
397
+ f"@ ({int(el.bbox.x)},{int(el.bbox.y)}) (Imp:{el.importance})"
398
+ )
399
+
400
+ return "\n".join(lines)
401
+
402
+ def _extract_action_from_response(self, response: str) -> str:
403
+ """
404
+ Extract action command from LLM response, handling cases where
405
+ the LLM adds extra explanation despite instructions.
406
+
407
+ Args:
408
+ response: Raw LLM response text
409
+
410
+ Returns:
411
+ Cleaned action command string
412
+ """
413
+ import re
414
+
415
+ # Remove markdown code blocks if present
416
+ response = re.sub(r"```[\w]*\n?", "", response)
417
+ response = response.strip()
418
+
419
+ # Try to find action patterns in the response
420
+ # Pattern matches: CLICK(123), TYPE(123, "text"), PRESS("key"), FINISH()
421
+ action_pattern = r'(CLICK\s*\(\s*\d+\s*\)|TYPE\s*\(\s*\d+\s*,\s*["\'].*?["\']\s*\)|PRESS\s*\(\s*["\'].*?["\']\s*\)|FINISH\s*\(\s*\))'
422
+
423
+ match = re.search(action_pattern, response, re.IGNORECASE)
424
+ if match:
425
+ return match.group(1)
426
+
427
+ # If no pattern match, return the original response (will likely fail parsing)
428
+ return response
429
+
430
    def _query_llm(self, dom_context: str, goal: str) -> LLMResponse:
        """
        Query LLM with standardized prompt template

        The system prompt embeds the goal and element context and strictly
        constrains the model to a single action command; temperature 0.0 is
        used for deterministic output.

        Args:
            dom_context: Formatted element context (from _build_context)
            goal: User goal

        Returns:
            LLMResponse from LLM provider
        """
        # Prompt content is intentionally unindented: it is an f-string whose
        # text is sent to the LLM verbatim.
        system_prompt = f"""You are an AI web automation agent.

GOAL: {goal}

VISIBLE ELEMENTS (sorted by importance):
{dom_context}

VISUAL CUES EXPLAINED:
- {{PRIMARY}}: Main call-to-action element on the page
- {{CLICKABLE}}: Element is clickable
- {{color:X}}: Background color name

CRITICAL RESPONSE FORMAT:
You MUST respond with ONLY ONE of these exact action formats:
- CLICK(id) - Click element by ID
- TYPE(id, "text") - Type text into element
- PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
- FINISH() - Task complete

DO NOT include any explanation, reasoning, or natural language.
DO NOT use markdown formatting or code blocks.
DO NOT say "The next step is..." or anything similar.

CORRECT Examples:
CLICK(42)
TYPE(15, "magic mouse")
PRESS("Enter")
FINISH()

INCORRECT Examples (DO NOT DO THIS):
"The next step is to click..."
"I will type..."
```CLICK(42)```
"""

        user_prompt = "Return the single action command:"

        return self.llm.generate(system_prompt, user_prompt, temperature=0.0)
480
+ def _execute_action(self, action_str: str, snap: Snapshot) -> dict[str, Any]:
481
+ """
482
+ Parse action string and execute SDK call
483
+
484
+ Args:
485
+ action_str: Action string from LLM (e.g., "CLICK(42)")
486
+ snap: Current snapshot (for context)
487
+
488
+ Returns:
489
+ Execution result dictionary
490
+ """
491
+ # Parse CLICK(42)
492
+ if match := re.match(r"CLICK\s*\(\s*(\d+)\s*\)", action_str, re.IGNORECASE):
493
+ element_id = int(match.group(1))
494
+ result = click(self.browser, element_id)
495
+ return {
496
+ "success": result.success,
497
+ "action": "click",
498
+ "element_id": element_id,
499
+ "outcome": result.outcome,
500
+ "url_changed": result.url_changed,
501
+ }
502
+
503
+ # Parse TYPE(42, "hello world")
504
+ elif match := re.match(
505
+ r'TYPE\s*\(\s*(\d+)\s*,\s*["\']([^"\']*)["\']\s*\)',
506
+ action_str,
507
+ re.IGNORECASE,
508
+ ):
509
+ element_id = int(match.group(1))
510
+ text = match.group(2)
511
+ result = type_text(self.browser, element_id, text)
512
+ return {
513
+ "success": result.success,
514
+ "action": "type",
515
+ "element_id": element_id,
516
+ "text": text,
517
+ "outcome": result.outcome,
518
+ }
519
+
520
+ # Parse PRESS("Enter")
521
+ elif match := re.match(r'PRESS\s*\(\s*["\']([^"\']+)["\']\s*\)', action_str, re.IGNORECASE):
522
+ key = match.group(1)
523
+ result = press(self.browser, key)
524
+ return {
525
+ "success": result.success,
526
+ "action": "press",
527
+ "key": key,
528
+ "outcome": result.outcome,
529
+ }
530
+
531
+ # Parse FINISH()
532
+ elif re.match(r"FINISH\s*\(\s*\)", action_str, re.IGNORECASE):
533
+ return {
534
+ "success": True,
535
+ "action": "finish",
536
+ "message": "Task marked as complete",
537
+ }
538
+
539
+ else:
540
+ raise ValueError(
541
+ f"Unknown action format: {action_str}\n"
542
+ f'Expected: CLICK(id), TYPE(id, "text"), PRESS("key"), or FINISH()'
543
+ )
544
+
545
+ def _track_tokens(self, goal: str, llm_response: LLMResponse):
546
+ """
547
+ Track token usage for analytics
548
+
549
+ Args:
550
+ goal: User goal
551
+ llm_response: LLM response with token usage
552
+ """
553
+ if llm_response.prompt_tokens:
554
+ self._token_usage_raw["total_prompt_tokens"] += llm_response.prompt_tokens
555
+ if llm_response.completion_tokens:
556
+ self._token_usage_raw["total_completion_tokens"] += llm_response.completion_tokens
557
+ if llm_response.total_tokens:
558
+ self._token_usage_raw["total_tokens"] += llm_response.total_tokens
559
+
560
+ self._token_usage_raw["by_action"].append(
561
+ {
562
+ "goal": goal,
563
+ "prompt_tokens": llm_response.prompt_tokens or 0,
564
+ "completion_tokens": llm_response.completion_tokens or 0,
565
+ "total_tokens": llm_response.total_tokens or 0,
566
+ "model": llm_response.model_name,
567
+ }
568
+ )
569
+
570
+ def get_token_stats(self) -> TokenStats:
571
+ """
572
+ Get token usage statistics
573
+
574
+ Returns:
575
+ TokenStats with token usage breakdown
576
+ """
577
+ by_action = [ActionTokenUsage(**action) for action in self._token_usage_raw["by_action"]]
578
+ return TokenStats(
579
+ total_prompt_tokens=self._token_usage_raw["total_prompt_tokens"],
580
+ total_completion_tokens=self._token_usage_raw["total_completion_tokens"],
581
+ total_tokens=self._token_usage_raw["total_tokens"],
582
+ by_action=by_action,
583
+ )
584
+
585
+ def get_history(self) -> list[ActionHistory]:
586
+ """
587
+ Get execution history
588
+
589
+ Returns:
590
+ List of ActionHistory entries
591
+ """
592
+ return [ActionHistory(**h) for h in self.history]
593
+
594
+ def clear_history(self) -> None:
595
+ """Clear execution history and reset token counters"""
596
+ self.history.clear()
597
+ self._token_usage_raw = {
598
+ "total_prompt_tokens": 0,
599
+ "total_completion_tokens": 0,
600
+ "total_tokens": 0,
601
+ "by_action": [],
602
+ }
603
+
604
+ def filter_elements(self, snapshot: Snapshot, goal: str | None = None) -> list[Element]:
605
+ """
606
+ Filter elements from snapshot based on goal context.
607
+
608
+ This default implementation applies goal-based keyword matching to boost
609
+ relevant elements and filters out irrelevant ones.
610
+
611
+ Args:
612
+ snapshot: Current page snapshot
613
+ goal: User's goal (can inform filtering)
614
+
615
+ Returns:
616
+ Filtered list of elements
617
+ """
618
+ elements = snapshot.elements
619
+
620
+ # If no goal provided, return all elements (up to limit)
621
+ if not goal:
622
+ return elements[: self.default_snapshot_limit]
623
+
624
+ goal_lower = goal.lower()
625
+
626
+ # Extract keywords from goal
627
+ keywords = self._extract_keywords(goal_lower)
628
+
629
+ # Boost elements matching goal keywords
630
+ scored_elements = []
631
+ for el in elements:
632
+ score = el.importance
633
+
634
+ # Boost if element text matches goal
635
+ if el.text and any(kw in el.text.lower() for kw in keywords):
636
+ score += 0.3
637
+
638
+ # Boost if role matches goal intent
639
+ if "click" in goal_lower and el.visual_cues.is_clickable:
640
+ score += 0.2
641
+ if "type" in goal_lower and el.role in ["textbox", "searchbox"]:
642
+ score += 0.2
643
+ if "search" in goal_lower:
644
+ # Filter out non-interactive elements for search tasks
645
+ if el.role in ["link", "img"] and not el.visual_cues.is_primary:
646
+ score -= 0.5
647
+
648
+ scored_elements.append((score, el))
649
+
650
+ # Re-sort by boosted score
651
+ scored_elements.sort(key=lambda x: x[0], reverse=True)
652
+ elements = [el for _, el in scored_elements]
653
+
654
+ return elements[: self.default_snapshot_limit]
655
+
656
+ def _extract_keywords(self, text: str) -> list[str]:
657
+ """
658
+ Extract meaningful keywords from goal text
659
+
660
+ Args:
661
+ text: Text to extract keywords from
662
+
663
+ Returns:
664
+ List of keywords
665
+ """
666
+ stopwords = {
667
+ "the",
668
+ "a",
669
+ "an",
670
+ "and",
671
+ "or",
672
+ "but",
673
+ "in",
674
+ "on",
675
+ "at",
676
+ "to",
677
+ "for",
678
+ "of",
679
+ "with",
680
+ "by",
681
+ "from",
682
+ "as",
683
+ "is",
684
+ "was",
685
+ }
686
+ words = text.split()
687
+ return [w for w in words if w not in stopwords and len(w) > 2]