sentienceapi 0.90.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sentienceapi might be problematic. Click here for more details.

Files changed (50) hide show
  1. sentience/__init__.py +153 -0
  2. sentience/_extension_loader.py +40 -0
  3. sentience/actions.py +837 -0
  4. sentience/agent.py +1246 -0
  5. sentience/agent_config.py +43 -0
  6. sentience/async_api.py +101 -0
  7. sentience/base_agent.py +194 -0
  8. sentience/browser.py +1037 -0
  9. sentience/cli.py +130 -0
  10. sentience/cloud_tracing.py +382 -0
  11. sentience/conversational_agent.py +509 -0
  12. sentience/expect.py +188 -0
  13. sentience/extension/background.js +233 -0
  14. sentience/extension/content.js +298 -0
  15. sentience/extension/injected_api.js +1473 -0
  16. sentience/extension/manifest.json +36 -0
  17. sentience/extension/pkg/sentience_core.d.ts +51 -0
  18. sentience/extension/pkg/sentience_core.js +529 -0
  19. sentience/extension/pkg/sentience_core_bg.wasm +0 -0
  20. sentience/extension/pkg/sentience_core_bg.wasm.d.ts +10 -0
  21. sentience/extension/release.json +115 -0
  22. sentience/extension/test-content.js +4 -0
  23. sentience/formatting.py +59 -0
  24. sentience/generator.py +202 -0
  25. sentience/inspector.py +365 -0
  26. sentience/llm_provider.py +637 -0
  27. sentience/models.py +412 -0
  28. sentience/overlay.py +222 -0
  29. sentience/query.py +303 -0
  30. sentience/read.py +185 -0
  31. sentience/recorder.py +589 -0
  32. sentience/schemas/trace_v1.json +216 -0
  33. sentience/screenshot.py +100 -0
  34. sentience/snapshot.py +516 -0
  35. sentience/text_search.py +290 -0
  36. sentience/trace_indexing/__init__.py +27 -0
  37. sentience/trace_indexing/index_schema.py +111 -0
  38. sentience/trace_indexing/indexer.py +357 -0
  39. sentience/tracer_factory.py +211 -0
  40. sentience/tracing.py +285 -0
  41. sentience/utils.py +296 -0
  42. sentience/wait.py +137 -0
  43. sentienceapi-0.90.17.dist-info/METADATA +917 -0
  44. sentienceapi-0.90.17.dist-info/RECORD +50 -0
  45. sentienceapi-0.90.17.dist-info/WHEEL +5 -0
  46. sentienceapi-0.90.17.dist-info/entry_points.txt +2 -0
  47. sentienceapi-0.90.17.dist-info/licenses/LICENSE +24 -0
  48. sentienceapi-0.90.17.dist-info/licenses/LICENSE-APACHE +201 -0
  49. sentienceapi-0.90.17.dist-info/licenses/LICENSE-MIT +21 -0
  50. sentienceapi-0.90.17.dist-info/top_level.txt +1 -0
sentience/agent.py ADDED
@@ -0,0 +1,1246 @@
1
+ """
2
+ Sentience Agent: High-level automation agent using LLM + SDK
3
+ Implements observe-think-act loop for natural language commands
4
+ """
5
+
6
+ import asyncio
7
+ import re
8
+ import time
9
+ from typing import TYPE_CHECKING, Any, Optional
10
+
11
+ from .actions import click, click_async, press, press_async, type_text, type_text_async
12
+ from .base_agent import BaseAgent, BaseAgentAsync
13
+ from .browser import AsyncSentienceBrowser, SentienceBrowser
14
+ from .llm_provider import LLMProvider, LLMResponse
15
+ from .models import (
16
+ ActionHistory,
17
+ ActionTokenUsage,
18
+ AgentActionResult,
19
+ Element,
20
+ ScreenshotConfig,
21
+ Snapshot,
22
+ SnapshotOptions,
23
+ TokenStats,
24
+ )
25
+ from .snapshot import snapshot, snapshot_async
26
+
27
+ if TYPE_CHECKING:
28
+ from .agent_config import AgentConfig
29
+ from .tracing import Tracer
30
+
31
+
32
+ class SentienceAgent(BaseAgent):
33
+ """
34
+ High-level agent that combines Sentience SDK with any LLM provider.
35
+
36
+ Uses observe-think-act loop to execute natural language commands:
37
+ 1. OBSERVE: Get snapshot of current page state
38
+ 2. THINK: Query LLM to decide next action
39
+ 3. ACT: Execute action using SDK
40
+
41
+ Example:
42
+ >>> from sentience import SentienceBrowser, SentienceAgent
43
+ >>> from sentience.llm_provider import OpenAIProvider
44
+ >>>
45
+ >>> browser = SentienceBrowser(api_key="sentience_key")
46
+ >>> llm = OpenAIProvider(api_key="openai_key", model="gpt-4o")
47
+ >>> agent = SentienceAgent(browser, llm)
48
+ >>>
49
+ >>> with browser:
50
+ >>> browser.page.goto("https://google.com")
51
+ >>> agent.act("Click the search box")
52
+ >>> agent.act("Type 'magic mouse' into the search field")
53
+ >>> agent.act("Press Enter key")
54
+ """
55
+
56
+ def __init__(
57
+ self,
58
+ browser: SentienceBrowser,
59
+ llm: LLMProvider,
60
+ default_snapshot_limit: int = 50,
61
+ verbose: bool = True,
62
+ tracer: Optional["Tracer"] = None,
63
+ config: Optional["AgentConfig"] = None,
64
+ ):
65
+ """
66
+ Initialize Sentience Agent
67
+
68
+ Args:
69
+ browser: SentienceBrowser instance
70
+ llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.)
71
+ default_snapshot_limit: Default maximum elements to include in context (default: 50)
72
+ verbose: Print execution logs (default: True)
73
+ tracer: Optional Tracer instance for execution tracking (default: None)
74
+ config: Optional AgentConfig for advanced configuration (default: None)
75
+ """
76
+ self.browser = browser
77
+ self.llm = llm
78
+ self.default_snapshot_limit = default_snapshot_limit
79
+ self.verbose = verbose
80
+ self.tracer = tracer
81
+ self.config = config
82
+
83
+ # Execution history
84
+ self.history: list[dict[str, Any]] = []
85
+
86
+ # Token usage tracking (will be converted to TokenStats on get_token_stats())
87
+ self._token_usage_raw = {
88
+ "total_prompt_tokens": 0,
89
+ "total_completion_tokens": 0,
90
+ "total_tokens": 0,
91
+ "by_action": [],
92
+ }
93
+
94
+ # Step counter for tracing
95
+ self._step_count = 0
96
+
97
+ def act( # noqa: C901
98
+ self,
99
+ goal: str,
100
+ max_retries: int = 2,
101
+ snapshot_options: SnapshotOptions | None = None,
102
+ ) -> AgentActionResult:
103
+ """
104
+ Execute a high-level goal using observe → think → act loop
105
+
106
+ Args:
107
+ goal: Natural language instruction (e.g., "Click the Sign In button")
108
+ max_retries: Number of retries on failure (default: 2)
109
+ snapshot_options: Optional SnapshotOptions for this specific action
110
+
111
+ Returns:
112
+ AgentActionResult with execution details
113
+
114
+ Example:
115
+ >>> result = agent.act("Click the search box")
116
+ >>> print(result.success, result.action, result.element_id)
117
+ True click 42
118
+ >>> # Backward compatible dict access
119
+ >>> print(result["element_id"]) # Works but shows deprecation warning
120
+ 42
121
+ """
122
+ if self.verbose:
123
+ print(f"\n{'=' * 70}")
124
+ print(f"🤖 Agent Goal: {goal}")
125
+ print(f"{'=' * 70}")
126
+
127
+ # Generate step ID for tracing
128
+ self._step_count += 1
129
+ step_id = f"step-{self._step_count}"
130
+
131
+ # Emit step_start trace event if tracer is enabled
132
+ if self.tracer:
133
+ pre_url = self.browser.page.url if self.browser.page else None
134
+ self.tracer.emit_step_start(
135
+ step_id=step_id,
136
+ step_index=self._step_count,
137
+ goal=goal,
138
+ attempt=0,
139
+ pre_url=pre_url,
140
+ )
141
+
142
+ for attempt in range(max_retries + 1):
143
+ try:
144
+ # 1. OBSERVE: Get refined semantic snapshot
145
+ start_time = time.time()
146
+
147
+ # Use provided options or create default
148
+ snap_opts = snapshot_options or SnapshotOptions(limit=self.default_snapshot_limit)
149
+ # Only set goal if not already provided
150
+ if snap_opts.goal is None:
151
+ snap_opts.goal = goal
152
+
153
+ # Call snapshot with options object (matches TypeScript API)
154
+ snap = snapshot(self.browser, snap_opts)
155
+
156
+ if snap.status != "success":
157
+ raise RuntimeError(f"Snapshot failed: {snap.error}")
158
+
159
+ # Apply element filtering based on goal
160
+ filtered_elements = self.filter_elements(snap, goal)
161
+
162
+ # Emit snapshot trace event if tracer is enabled
163
+ if self.tracer:
164
+ # Include element data for live overlay visualization
165
+ # Use filtered_elements for overlay (only relevant elements)
166
+ elements_data = [
167
+ {
168
+ "id": el.id,
169
+ "bbox": {
170
+ "x": el.bbox.x,
171
+ "y": el.bbox.y,
172
+ "width": el.bbox.width,
173
+ "height": el.bbox.height,
174
+ },
175
+ "role": el.role,
176
+ "text": el.text[:50] if el.text else "", # Truncate for brevity
177
+ }
178
+ for el in filtered_elements[:50] # Limit to first 50 for performance
179
+ ]
180
+
181
+ self.tracer.emit(
182
+ "snapshot",
183
+ {
184
+ "url": snap.url,
185
+ "element_count": len(snap.elements),
186
+ "timestamp": snap.timestamp,
187
+ "elements": elements_data, # Add element data for overlay
188
+ },
189
+ step_id=step_id,
190
+ )
191
+
192
+ # Create filtered snapshot
193
+ filtered_snap = Snapshot(
194
+ status=snap.status,
195
+ timestamp=snap.timestamp,
196
+ url=snap.url,
197
+ viewport=snap.viewport,
198
+ elements=filtered_elements,
199
+ screenshot=snap.screenshot,
200
+ screenshot_format=snap.screenshot_format,
201
+ error=snap.error,
202
+ )
203
+
204
+ # 2. GROUND: Format elements for LLM context
205
+ context = self._build_context(filtered_snap, goal)
206
+
207
+ # 3. THINK: Query LLM for next action
208
+ llm_response = self._query_llm(context, goal)
209
+
210
+ # Emit LLM query trace event if tracer is enabled
211
+ if self.tracer:
212
+ self.tracer.emit(
213
+ "llm_query",
214
+ {
215
+ "prompt_tokens": llm_response.prompt_tokens,
216
+ "completion_tokens": llm_response.completion_tokens,
217
+ "model": llm_response.model_name,
218
+ "response": llm_response.content[:200], # Truncate for brevity
219
+ },
220
+ step_id=step_id,
221
+ )
222
+
223
+ if self.verbose:
224
+ print(f"🧠 LLM Decision: {llm_response.content}")
225
+
226
+ # Track token usage
227
+ self._track_tokens(goal, llm_response)
228
+
229
+ # Parse action from LLM response
230
+ action_str = self._extract_action_from_response(llm_response.content)
231
+
232
+ # 4. EXECUTE: Parse and run action
233
+ result_dict = self._execute_action(action_str, filtered_snap)
234
+
235
+ duration_ms = int((time.time() - start_time) * 1000)
236
+
237
+ # Create AgentActionResult from execution result
238
+ result = AgentActionResult(
239
+ success=result_dict["success"],
240
+ action=result_dict["action"],
241
+ goal=goal,
242
+ duration_ms=duration_ms,
243
+ attempt=attempt,
244
+ element_id=result_dict.get("element_id"),
245
+ text=result_dict.get("text"),
246
+ key=result_dict.get("key"),
247
+ outcome=result_dict.get("outcome"),
248
+ url_changed=result_dict.get("url_changed"),
249
+ error=result_dict.get("error"),
250
+ message=result_dict.get("message"),
251
+ )
252
+
253
+ # Emit action execution trace event if tracer is enabled
254
+ if self.tracer:
255
+ post_url = self.browser.page.url if self.browser.page else None
256
+
257
+ # Include element data for live overlay visualization
258
+ elements_data = [
259
+ {
260
+ "id": el.id,
261
+ "bbox": {
262
+ "x": el.bbox.x,
263
+ "y": el.bbox.y,
264
+ "width": el.bbox.width,
265
+ "height": el.bbox.height,
266
+ },
267
+ "role": el.role,
268
+ "text": el.text[:50] if el.text else "",
269
+ }
270
+ for el in filtered_snap.elements[:50]
271
+ ]
272
+
273
+ self.tracer.emit(
274
+ "action",
275
+ {
276
+ "action": result.action,
277
+ "element_id": result.element_id,
278
+ "success": result.success,
279
+ "outcome": result.outcome,
280
+ "duration_ms": duration_ms,
281
+ "post_url": post_url,
282
+ "elements": elements_data, # Add element data for overlay
283
+ "target_element_id": result.element_id, # Highlight target in red
284
+ },
285
+ step_id=step_id,
286
+ )
287
+
288
+ # 5. RECORD: Track history
289
+ self.history.append(
290
+ {
291
+ "goal": goal,
292
+ "action": action_str,
293
+ "result": result.model_dump(), # Store as dict
294
+ "success": result.success,
295
+ "attempt": attempt,
296
+ "duration_ms": duration_ms,
297
+ }
298
+ )
299
+
300
+ if self.verbose:
301
+ status = "✅" if result.success else "❌"
302
+ print(f"{status} Completed in {duration_ms}ms")
303
+
304
+ # Emit step completion trace event if tracer is enabled
305
+ if self.tracer:
306
+ self.tracer.emit(
307
+ "step_end",
308
+ {
309
+ "success": result.success,
310
+ "duration_ms": duration_ms,
311
+ "action": result.action,
312
+ },
313
+ step_id=step_id,
314
+ )
315
+
316
+ return result
317
+
318
+ except Exception as e:
319
+ # Emit error trace event if tracer is enabled
320
+ if self.tracer:
321
+ self.tracer.emit_error(step_id=step_id, error=str(e), attempt=attempt)
322
+
323
+ if attempt < max_retries:
324
+ if self.verbose:
325
+ print(f"⚠️ Retry {attempt + 1}/{max_retries}: {e}")
326
+ time.sleep(1.0) # Brief delay before retry
327
+ continue
328
+ else:
329
+ # Create error result
330
+ error_result = AgentActionResult(
331
+ success=False,
332
+ action="error",
333
+ goal=goal,
334
+ duration_ms=0,
335
+ attempt=attempt,
336
+ error=str(e),
337
+ )
338
+ self.history.append(
339
+ {
340
+ "goal": goal,
341
+ "action": "error",
342
+ "result": error_result.model_dump(),
343
+ "success": False,
344
+ "attempt": attempt,
345
+ "duration_ms": 0,
346
+ }
347
+ )
348
+ raise RuntimeError(f"Failed after {max_retries} retries: {e}")
349
+
350
+ def _build_context(self, snap: Snapshot, goal: str) -> str:
351
+ """
352
+ Convert snapshot elements to token-efficient prompt string
353
+
354
+ Format: [ID] <role> "text" {cues} @ (x,y) (Imp:score)
355
+
356
+ Args:
357
+ snap: Snapshot object
358
+ goal: User goal (for context)
359
+
360
+ Returns:
361
+ Formatted element context string
362
+ """
363
+ lines = []
364
+ # Note: elements are already filtered by filter_elements() in act()
365
+ for el in snap.elements:
366
+ # Extract visual cues
367
+ cues = []
368
+ if el.visual_cues.is_primary:
369
+ cues.append("PRIMARY")
370
+ if el.visual_cues.is_clickable:
371
+ cues.append("CLICKABLE")
372
+ if el.visual_cues.background_color_name:
373
+ cues.append(f"color:{el.visual_cues.background_color_name}")
374
+
375
+ # Format element line
376
+ cues_str = f" {{{','.join(cues)}}}" if cues else ""
377
+ text_preview = (
378
+ (el.text[:50] + "...") if el.text and len(el.text) > 50 else (el.text or "")
379
+ )
380
+
381
+ lines.append(
382
+ f'[{el.id}] <{el.role}> "{text_preview}"{cues_str} '
383
+ f"@ ({int(el.bbox.x)},{int(el.bbox.y)}) (Imp:{el.importance})"
384
+ )
385
+
386
+ return "\n".join(lines)
387
+
388
+ def _extract_action_from_response(self, response: str) -> str:
389
+ """
390
+ Extract action command from LLM response, handling cases where
391
+ the LLM adds extra explanation despite instructions.
392
+
393
+ Args:
394
+ response: Raw LLM response text
395
+
396
+ Returns:
397
+ Cleaned action command string
398
+ """
399
+ import re
400
+
401
+ # Remove markdown code blocks if present
402
+ response = re.sub(r"```[\w]*\n?", "", response)
403
+ response = response.strip()
404
+
405
+ # Try to find action patterns in the response
406
+ # Pattern matches: CLICK(123), TYPE(123, "text"), PRESS("key"), FINISH()
407
+ action_pattern = r'(CLICK\s*\(\s*\d+\s*\)|TYPE\s*\(\s*\d+\s*,\s*["\'].*?["\']\s*\)|PRESS\s*\(\s*["\'].*?["\']\s*\)|FINISH\s*\(\s*\))'
408
+
409
+ match = re.search(action_pattern, response, re.IGNORECASE)
410
+ if match:
411
+ return match.group(1)
412
+
413
+ # If no pattern match, return the original response (will likely fail parsing)
414
+ return response
415
+
416
    def _query_llm(self, dom_context: str, goal: str) -> LLMResponse:
        """
        Query LLM with standardized prompt template.

        The system prompt embeds the goal and the formatted element list and
        instructs the model to reply with exactly one action command
        (CLICK/TYPE/PRESS/FINISH) and nothing else.

        Args:
            dom_context: Formatted element context (one "[id] <role> ..." line per element)
            goal: User goal

        Returns:
            LLMResponse from LLM provider
        """
        # The doubled braces ({{...}}) render as literal single braces in the
        # f-string output.
        system_prompt = f"""You are an AI web automation agent.

GOAL: {goal}

VISIBLE ELEMENTS (sorted by importance):
{dom_context}

VISUAL CUES EXPLAINED:
- {{PRIMARY}}: Main call-to-action element on the page
- {{CLICKABLE}}: Element is clickable
- {{color:X}}: Background color name

CRITICAL RESPONSE FORMAT:
You MUST respond with ONLY ONE of these exact action formats:
- CLICK(id) - Click element by ID
- TYPE(id, "text") - Type text into element
- PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
- FINISH() - Task complete

DO NOT include any explanation, reasoning, or natural language.
DO NOT use markdown formatting or code blocks.
DO NOT say "The next step is..." or anything similar.

CORRECT Examples:
CLICK(42)
TYPE(15, "magic mouse")
PRESS("Enter")
FINISH()

INCORRECT Examples (DO NOT DO THIS):
"The next step is to click..."
"I will type..."
```CLICK(42)```
"""

        user_prompt = "Return the single action command:"

        # temperature=0.0 → deterministic action selection
        return self.llm.generate(system_prompt, user_prompt, temperature=0.0)
465
+
466
+ def _execute_action(self, action_str: str, snap: Snapshot) -> dict[str, Any]:
467
+ """
468
+ Parse action string and execute SDK call
469
+
470
+ Args:
471
+ action_str: Action string from LLM (e.g., "CLICK(42)")
472
+ snap: Current snapshot (for context)
473
+
474
+ Returns:
475
+ Execution result dictionary
476
+ """
477
+ # Parse CLICK(42)
478
+ if match := re.match(r"CLICK\s*\(\s*(\d+)\s*\)", action_str, re.IGNORECASE):
479
+ element_id = int(match.group(1))
480
+ result = click(self.browser, element_id)
481
+ return {
482
+ "success": result.success,
483
+ "action": "click",
484
+ "element_id": element_id,
485
+ "outcome": result.outcome,
486
+ "url_changed": result.url_changed,
487
+ }
488
+
489
+ # Parse TYPE(42, "hello world")
490
+ elif match := re.match(
491
+ r'TYPE\s*\(\s*(\d+)\s*,\s*["\']([^"\']*)["\']\s*\)',
492
+ action_str,
493
+ re.IGNORECASE,
494
+ ):
495
+ element_id = int(match.group(1))
496
+ text = match.group(2)
497
+ result = type_text(self.browser, element_id, text)
498
+ return {
499
+ "success": result.success,
500
+ "action": "type",
501
+ "element_id": element_id,
502
+ "text": text,
503
+ "outcome": result.outcome,
504
+ }
505
+
506
+ # Parse PRESS("Enter")
507
+ elif match := re.match(r'PRESS\s*\(\s*["\']([^"\']+)["\']\s*\)', action_str, re.IGNORECASE):
508
+ key = match.group(1)
509
+ result = press(self.browser, key)
510
+ return {
511
+ "success": result.success,
512
+ "action": "press",
513
+ "key": key,
514
+ "outcome": result.outcome,
515
+ }
516
+
517
+ # Parse FINISH()
518
+ elif re.match(r"FINISH\s*\(\s*\)", action_str, re.IGNORECASE):
519
+ return {
520
+ "success": True,
521
+ "action": "finish",
522
+ "message": "Task marked as complete",
523
+ }
524
+
525
+ else:
526
+ raise ValueError(
527
+ f"Unknown action format: {action_str}\n"
528
+ f'Expected: CLICK(id), TYPE(id, "text"), PRESS("key"), or FINISH()'
529
+ )
530
+
531
+ def _track_tokens(self, goal: str, llm_response: LLMResponse):
532
+ """
533
+ Track token usage for analytics
534
+
535
+ Args:
536
+ goal: User goal
537
+ llm_response: LLM response with token usage
538
+ """
539
+ if llm_response.prompt_tokens:
540
+ self._token_usage_raw["total_prompt_tokens"] += llm_response.prompt_tokens
541
+ if llm_response.completion_tokens:
542
+ self._token_usage_raw["total_completion_tokens"] += llm_response.completion_tokens
543
+ if llm_response.total_tokens:
544
+ self._token_usage_raw["total_tokens"] += llm_response.total_tokens
545
+
546
+ self._token_usage_raw["by_action"].append(
547
+ {
548
+ "goal": goal,
549
+ "prompt_tokens": llm_response.prompt_tokens or 0,
550
+ "completion_tokens": llm_response.completion_tokens or 0,
551
+ "total_tokens": llm_response.total_tokens or 0,
552
+ "model": llm_response.model_name,
553
+ }
554
+ )
555
+
556
+ def get_token_stats(self) -> TokenStats:
557
+ """
558
+ Get token usage statistics
559
+
560
+ Returns:
561
+ TokenStats with token usage breakdown
562
+ """
563
+ by_action = [ActionTokenUsage(**action) for action in self._token_usage_raw["by_action"]]
564
+ return TokenStats(
565
+ total_prompt_tokens=self._token_usage_raw["total_prompt_tokens"],
566
+ total_completion_tokens=self._token_usage_raw["total_completion_tokens"],
567
+ total_tokens=self._token_usage_raw["total_tokens"],
568
+ by_action=by_action,
569
+ )
570
+
571
+ def get_history(self) -> list[ActionHistory]:
572
+ """
573
+ Get execution history
574
+
575
+ Returns:
576
+ List of ActionHistory entries
577
+ """
578
+ return [ActionHistory(**h) for h in self.history]
579
+
580
+ def clear_history(self) -> None:
581
+ """Clear execution history and reset token counters"""
582
+ self.history.clear()
583
+ self._token_usage_raw = {
584
+ "total_prompt_tokens": 0,
585
+ "total_completion_tokens": 0,
586
+ "total_tokens": 0,
587
+ "by_action": [],
588
+ }
589
+
590
+ def filter_elements(self, snapshot: Snapshot, goal: str | None = None) -> list[Element]:
591
+ """
592
+ Filter elements from snapshot based on goal context.
593
+
594
+ This default implementation applies goal-based keyword matching to boost
595
+ relevant elements and filters out irrelevant ones.
596
+
597
+ Args:
598
+ snapshot: Current page snapshot
599
+ goal: User's goal (can inform filtering)
600
+
601
+ Returns:
602
+ Filtered list of elements
603
+ """
604
+ elements = snapshot.elements
605
+
606
+ # If no goal provided, return all elements (up to limit)
607
+ if not goal:
608
+ return elements[: self.default_snapshot_limit]
609
+
610
+ goal_lower = goal.lower()
611
+
612
+ # Extract keywords from goal
613
+ keywords = self._extract_keywords(goal_lower)
614
+
615
+ # Boost elements matching goal keywords
616
+ scored_elements = []
617
+ for el in elements:
618
+ score = el.importance
619
+
620
+ # Boost if element text matches goal
621
+ if el.text and any(kw in el.text.lower() for kw in keywords):
622
+ score += 0.3
623
+
624
+ # Boost if role matches goal intent
625
+ if "click" in goal_lower and el.visual_cues.is_clickable:
626
+ score += 0.2
627
+ if "type" in goal_lower and el.role in ["textbox", "searchbox"]:
628
+ score += 0.2
629
+ if "search" in goal_lower:
630
+ # Filter out non-interactive elements for search tasks
631
+ if el.role in ["link", "img"] and not el.visual_cues.is_primary:
632
+ score -= 0.5
633
+
634
+ scored_elements.append((score, el))
635
+
636
+ # Re-sort by boosted score
637
+ scored_elements.sort(key=lambda x: x[0], reverse=True)
638
+ elements = [el for _, el in scored_elements]
639
+
640
+ return elements[: self.default_snapshot_limit]
641
+
642
+ def _extract_keywords(self, text: str) -> list[str]:
643
+ """
644
+ Extract meaningful keywords from goal text
645
+
646
+ Args:
647
+ text: Text to extract keywords from
648
+
649
+ Returns:
650
+ List of keywords
651
+ """
652
+ stopwords = {
653
+ "the",
654
+ "a",
655
+ "an",
656
+ "and",
657
+ "or",
658
+ "but",
659
+ "in",
660
+ "on",
661
+ "at",
662
+ "to",
663
+ "for",
664
+ "of",
665
+ "with",
666
+ "by",
667
+ "from",
668
+ "as",
669
+ "is",
670
+ "was",
671
+ }
672
+ words = text.split()
673
+ return [w for w in words if w not in stopwords and len(w) > 2]
674
+
675
+
676
+ class SentienceAgentAsync(BaseAgentAsync):
677
+ """
678
+ High-level async agent that combines Sentience SDK with any LLM provider.
679
+
680
+ Uses observe-think-act loop to execute natural language commands:
681
+ 1. OBSERVE: Get snapshot of current page state
682
+ 2. THINK: Query LLM to decide next action
683
+ 3. ACT: Execute action using SDK
684
+
685
+ Example:
686
+ >>> from sentience.async_api import AsyncSentienceBrowser
687
+ >>> from sentience.agent import SentienceAgentAsync
688
+ >>> from sentience.llm_provider import OpenAIProvider
689
+ >>>
690
+ >>> async with AsyncSentienceBrowser() as browser:
691
+ >>> await browser.goto("https://google.com")
692
+ >>> llm = OpenAIProvider(api_key="openai_key", model="gpt-4o")
693
+ >>> agent = SentienceAgentAsync(browser, llm)
694
+ >>> await agent.act("Click the search box")
695
+ >>> await agent.act("Type 'magic mouse' into the search field")
696
+ >>> await agent.act("Press Enter key")
697
+ """
698
+
699
+ def __init__(
700
+ self,
701
+ browser: AsyncSentienceBrowser,
702
+ llm: LLMProvider,
703
+ default_snapshot_limit: int = 50,
704
+ verbose: bool = True,
705
+ tracer: Optional["Tracer"] = None,
706
+ config: Optional["AgentConfig"] = None,
707
+ ):
708
+ """
709
+ Initialize Sentience Agent (async)
710
+
711
+ Args:
712
+ browser: AsyncSentienceBrowser instance
713
+ llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.)
714
+ default_snapshot_limit: Default maximum elements to include in context (default: 50)
715
+ verbose: Print execution logs (default: True)
716
+ tracer: Optional Tracer instance for execution tracking (default: None)
717
+ config: Optional AgentConfig for advanced configuration (default: None)
718
+ """
719
+ self.browser = browser
720
+ self.llm = llm
721
+ self.default_snapshot_limit = default_snapshot_limit
722
+ self.verbose = verbose
723
+ self.tracer = tracer
724
+ self.config = config
725
+
726
+ # Execution history
727
+ self.history: list[dict[str, Any]] = []
728
+
729
+ # Token usage tracking (will be converted to TokenStats on get_token_stats())
730
+ self._token_usage_raw = {
731
+ "total_prompt_tokens": 0,
732
+ "total_completion_tokens": 0,
733
+ "total_tokens": 0,
734
+ "by_action": [],
735
+ }
736
+
737
+ # Step counter for tracing
738
+ self._step_count = 0
739
+
740
+ async def act( # noqa: C901
741
+ self,
742
+ goal: str,
743
+ max_retries: int = 2,
744
+ snapshot_options: SnapshotOptions | None = None,
745
+ ) -> AgentActionResult:
746
+ """
747
+ Execute a high-level goal using observe → think → act loop (async)
748
+
749
+ Args:
750
+ goal: Natural language instruction (e.g., "Click the Sign In button")
751
+ max_retries: Number of retries on failure (default: 2)
752
+ snapshot_options: Optional SnapshotOptions for this specific action
753
+
754
+ Returns:
755
+ AgentActionResult with execution details
756
+
757
+ Example:
758
+ >>> result = await agent.act("Click the search box")
759
+ >>> print(result.success, result.action, result.element_id)
760
+ True click 42
761
+ """
762
+ if self.verbose:
763
+ print(f"\n{'=' * 70}")
764
+ print(f"🤖 Agent Goal: {goal}")
765
+ print(f"{'=' * 70}")
766
+
767
+ # Generate step ID for tracing
768
+ self._step_count += 1
769
+ step_id = f"step-{self._step_count}"
770
+
771
+ # Emit step_start trace event if tracer is enabled
772
+ if self.tracer:
773
+ pre_url = self.browser.page.url if self.browser.page else None
774
+ self.tracer.emit_step_start(
775
+ step_id=step_id,
776
+ step_index=self._step_count,
777
+ goal=goal,
778
+ attempt=0,
779
+ pre_url=pre_url,
780
+ )
781
+
782
+ for attempt in range(max_retries + 1):
783
+ try:
784
+ # 1. OBSERVE: Get refined semantic snapshot
785
+ start_time = time.time()
786
+
787
+ # Use provided options or create default
788
+ snap_opts = snapshot_options or SnapshotOptions(limit=self.default_snapshot_limit)
789
+ # Only set goal if not already provided
790
+ if snap_opts.goal is None:
791
+ snap_opts.goal = goal
792
+
793
+ # Call snapshot with options object (matches TypeScript API)
794
+ snap = await snapshot_async(self.browser, snap_opts)
795
+
796
+ if snap.status != "success":
797
+ raise RuntimeError(f"Snapshot failed: {snap.error}")
798
+
799
+ # Apply element filtering based on goal
800
+ filtered_elements = self.filter_elements(snap, goal)
801
+
802
+ # Emit snapshot trace event if tracer is enabled
803
+ if self.tracer:
804
+ # Include element data for live overlay visualization
805
+ # Use filtered_elements for overlay (only relevant elements)
806
+ elements_data = [
807
+ {
808
+ "id": el.id,
809
+ "bbox": {
810
+ "x": el.bbox.x,
811
+ "y": el.bbox.y,
812
+ "width": el.bbox.width,
813
+ "height": el.bbox.height,
814
+ },
815
+ "role": el.role,
816
+ "text": el.text[:50] if el.text else "", # Truncate for brevity
817
+ }
818
+ for el in filtered_elements[:50] # Limit to first 50 for performance
819
+ ]
820
+
821
+ self.tracer.emit(
822
+ "snapshot",
823
+ {
824
+ "url": snap.url,
825
+ "element_count": len(snap.elements),
826
+ "timestamp": snap.timestamp,
827
+ "elements": elements_data, # Add element data for overlay
828
+ },
829
+ step_id=step_id,
830
+ )
831
+
832
+ # Create filtered snapshot
833
+ filtered_snap = Snapshot(
834
+ status=snap.status,
835
+ timestamp=snap.timestamp,
836
+ url=snap.url,
837
+ viewport=snap.viewport,
838
+ elements=filtered_elements,
839
+ screenshot=snap.screenshot,
840
+ screenshot_format=snap.screenshot_format,
841
+ error=snap.error,
842
+ )
843
+
844
+ # 2. GROUND: Format elements for LLM context
845
+ context = self._build_context(filtered_snap, goal)
846
+
847
+ # 3. THINK: Query LLM for next action
848
+ llm_response = self._query_llm(context, goal)
849
+
850
+ # Emit LLM query trace event if tracer is enabled
851
+ if self.tracer:
852
+ self.tracer.emit(
853
+ "llm_query",
854
+ {
855
+ "prompt_tokens": llm_response.prompt_tokens,
856
+ "completion_tokens": llm_response.completion_tokens,
857
+ "model": llm_response.model_name,
858
+ "response": llm_response.content[:200], # Truncate for brevity
859
+ },
860
+ step_id=step_id,
861
+ )
862
+
863
+ if self.verbose:
864
+ print(f"🧠 LLM Decision: {llm_response.content}")
865
+
866
+ # Track token usage
867
+ self._track_tokens(goal, llm_response)
868
+
869
+ # Parse action from LLM response
870
+ action_str = self._extract_action_from_response(llm_response.content)
871
+
872
+ # 4. EXECUTE: Parse and run action
873
+ result_dict = await self._execute_action(action_str, filtered_snap)
874
+
875
+ duration_ms = int((time.time() - start_time) * 1000)
876
+
877
+ # Create AgentActionResult from execution result
878
+ result = AgentActionResult(
879
+ success=result_dict["success"],
880
+ action=result_dict["action"],
881
+ goal=goal,
882
+ duration_ms=duration_ms,
883
+ attempt=attempt,
884
+ element_id=result_dict.get("element_id"),
885
+ text=result_dict.get("text"),
886
+ key=result_dict.get("key"),
887
+ outcome=result_dict.get("outcome"),
888
+ url_changed=result_dict.get("url_changed"),
889
+ error=result_dict.get("error"),
890
+ message=result_dict.get("message"),
891
+ )
892
+
893
+ # Emit action execution trace event if tracer is enabled
894
+ if self.tracer:
895
+ post_url = self.browser.page.url if self.browser.page else None
896
+
897
+ # Include element data for live overlay visualization
898
+ elements_data = [
899
+ {
900
+ "id": el.id,
901
+ "bbox": {
902
+ "x": el.bbox.x,
903
+ "y": el.bbox.y,
904
+ "width": el.bbox.width,
905
+ "height": el.bbox.height,
906
+ },
907
+ "role": el.role,
908
+ "text": el.text[:50] if el.text else "",
909
+ }
910
+ for el in filtered_snap.elements[:50]
911
+ ]
912
+
913
+ self.tracer.emit(
914
+ "action",
915
+ {
916
+ "action": result.action,
917
+ "element_id": result.element_id,
918
+ "success": result.success,
919
+ "outcome": result.outcome,
920
+ "duration_ms": duration_ms,
921
+ "post_url": post_url,
922
+ "elements": elements_data, # Add element data for overlay
923
+ "target_element_id": result.element_id, # Highlight target in red
924
+ },
925
+ step_id=step_id,
926
+ )
927
+
928
+ # 5. RECORD: Track history
929
+ self.history.append(
930
+ {
931
+ "goal": goal,
932
+ "action": action_str,
933
+ "result": result.model_dump(), # Store as dict
934
+ "success": result.success,
935
+ "attempt": attempt,
936
+ "duration_ms": duration_ms,
937
+ }
938
+ )
939
+
940
+ if self.verbose:
941
+ status = "✅" if result.success else "❌"
942
+ print(f"{status} Completed in {duration_ms}ms")
943
+
944
+ # Emit step completion trace event if tracer is enabled
945
+ if self.tracer:
946
+ self.tracer.emit(
947
+ "step_end",
948
+ {
949
+ "success": result.success,
950
+ "duration_ms": duration_ms,
951
+ "action": result.action,
952
+ },
953
+ step_id=step_id,
954
+ )
955
+
956
+ return result
957
+
958
+ except Exception as e:
959
+ # Emit error trace event if tracer is enabled
960
+ if self.tracer:
961
+ self.tracer.emit_error(step_id=step_id, error=str(e), attempt=attempt)
962
+
963
+ if attempt < max_retries:
964
+ if self.verbose:
965
+ print(f"⚠️ Retry {attempt + 1}/{max_retries}: {e}")
966
+ await asyncio.sleep(1.0) # Brief delay before retry
967
+ continue
968
+ else:
969
+ # Create error result
970
+ error_result = AgentActionResult(
971
+ success=False,
972
+ action="error",
973
+ goal=goal,
974
+ duration_ms=0,
975
+ attempt=attempt,
976
+ error=str(e),
977
+ )
978
+ self.history.append(
979
+ {
980
+ "goal": goal,
981
+ "action": "error",
982
+ "result": error_result.model_dump(),
983
+ "success": False,
984
+ "attempt": attempt,
985
+ "duration_ms": 0,
986
+ }
987
+ )
988
+ raise RuntimeError(f"Failed after {max_retries} retries: {e}")
989
+
990
+ def _build_context(self, snap: Snapshot, goal: str) -> str:
991
+ """Convert snapshot elements to token-efficient prompt string (same as sync version)"""
992
+ lines = []
993
+ # Note: elements are already filtered by filter_elements() in act()
994
+ for el in snap.elements:
995
+ # Extract visual cues
996
+ cues = []
997
+ if el.visual_cues.is_primary:
998
+ cues.append("PRIMARY")
999
+ if el.visual_cues.is_clickable:
1000
+ cues.append("CLICKABLE")
1001
+ if el.visual_cues.background_color_name:
1002
+ cues.append(f"color:{el.visual_cues.background_color_name}")
1003
+
1004
+ # Format element line
1005
+ cues_str = f" {{{','.join(cues)}}}" if cues else ""
1006
+ text_preview = (
1007
+ (el.text[:50] + "...") if el.text and len(el.text) > 50 else (el.text or "")
1008
+ )
1009
+
1010
+ lines.append(
1011
+ f'[{el.id}] <{el.role}> "{text_preview}"{cues_str} '
1012
+ f"@ ({int(el.bbox.x)},{int(el.bbox.y)}) (Imp:{el.importance})"
1013
+ )
1014
+
1015
+ return "\n".join(lines)
1016
+
1017
+ def _extract_action_from_response(self, response: str) -> str:
1018
+ """Extract action command from LLM response (same as sync version)"""
1019
+ # Remove markdown code blocks if present
1020
+ response = re.sub(r"```[\w]*\n?", "", response)
1021
+ response = response.strip()
1022
+
1023
+ # Try to find action patterns in the response
1024
+ # Pattern matches: CLICK(123), TYPE(123, "text"), PRESS("key"), FINISH()
1025
+ action_pattern = r'(CLICK\s*\(\s*\d+\s*\)|TYPE\s*\(\s*\d+\s*,\s*["\'].*?["\']\s*\)|PRESS\s*\(\s*["\'].*?["\']\s*\)|FINISH\s*\(\s*\))'
1026
+
1027
+ match = re.search(action_pattern, response, re.IGNORECASE)
1028
+ if match:
1029
+ return match.group(1)
1030
+
1031
+ # If no pattern match, return the original response (will likely fail parsing)
1032
+ return response
1033
+
1034
+ def _query_llm(self, dom_context: str, goal: str) -> LLMResponse:
1035
+ """Query LLM with standardized prompt template (same as sync version)"""
1036
+ system_prompt = f"""You are an AI web automation agent.
1037
+
1038
+ GOAL: {goal}
1039
+
1040
+ VISIBLE ELEMENTS (sorted by importance):
1041
+ {dom_context}
1042
+
1043
+ VISUAL CUES EXPLAINED:
1044
+ - {{PRIMARY}}: Main call-to-action element on the page
1045
+ - {{CLICKABLE}}: Element is clickable
1046
+ - {{color:X}}: Background color name
1047
+
1048
+ CRITICAL RESPONSE FORMAT:
1049
+ You MUST respond with ONLY ONE of these exact action formats:
1050
+ - CLICK(id) - Click element by ID
1051
+ - TYPE(id, "text") - Type text into element
1052
+ - PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
1053
+ - FINISH() - Task complete
1054
+
1055
+ DO NOT include any explanation, reasoning, or natural language.
1056
+ DO NOT use markdown formatting or code blocks.
1057
+ DO NOT say "The next step is..." or anything similar.
1058
+
1059
+ CORRECT Examples:
1060
+ CLICK(42)
1061
+ TYPE(15, "magic mouse")
1062
+ PRESS("Enter")
1063
+ FINISH()
1064
+
1065
+ INCORRECT Examples (DO NOT DO THIS):
1066
+ "The next step is to click..."
1067
+ "I will type..."
1068
+ ```CLICK(42)```
1069
+ """
1070
+
1071
+ user_prompt = "Return the single action command:"
1072
+
1073
+ return self.llm.generate(system_prompt, user_prompt, temperature=0.0)
1074
+
1075
+ async def _execute_action(self, action_str: str, snap: Snapshot) -> dict[str, Any]:
1076
+ """
1077
+ Parse action string and execute SDK call (async)
1078
+
1079
+ Args:
1080
+ action_str: Action string from LLM (e.g., "CLICK(42)")
1081
+ snap: Current snapshot (for context)
1082
+
1083
+ Returns:
1084
+ Execution result dictionary
1085
+ """
1086
+ # Parse CLICK(42)
1087
+ if match := re.match(r"CLICK\s*\(\s*(\d+)\s*\)", action_str, re.IGNORECASE):
1088
+ element_id = int(match.group(1))
1089
+ result = await click_async(self.browser, element_id)
1090
+ return {
1091
+ "success": result.success,
1092
+ "action": "click",
1093
+ "element_id": element_id,
1094
+ "outcome": result.outcome,
1095
+ "url_changed": result.url_changed,
1096
+ }
1097
+
1098
+ # Parse TYPE(42, "hello world")
1099
+ elif match := re.match(
1100
+ r'TYPE\s*\(\s*(\d+)\s*,\s*["\']([^"\']*)["\']\s*\)',
1101
+ action_str,
1102
+ re.IGNORECASE,
1103
+ ):
1104
+ element_id = int(match.group(1))
1105
+ text = match.group(2)
1106
+ result = await type_text_async(self.browser, element_id, text)
1107
+ return {
1108
+ "success": result.success,
1109
+ "action": "type",
1110
+ "element_id": element_id,
1111
+ "text": text,
1112
+ "outcome": result.outcome,
1113
+ }
1114
+
1115
+ # Parse PRESS("Enter")
1116
+ elif match := re.match(r'PRESS\s*\(\s*["\']([^"\']+)["\']\s*\)', action_str, re.IGNORECASE):
1117
+ key = match.group(1)
1118
+ result = await press_async(self.browser, key)
1119
+ return {
1120
+ "success": result.success,
1121
+ "action": "press",
1122
+ "key": key,
1123
+ "outcome": result.outcome,
1124
+ }
1125
+
1126
+ # Parse FINISH()
1127
+ elif re.match(r"FINISH\s*\(\s*\)", action_str, re.IGNORECASE):
1128
+ return {
1129
+ "success": True,
1130
+ "action": "finish",
1131
+ "message": "Task marked as complete",
1132
+ }
1133
+
1134
+ else:
1135
+ raise ValueError(
1136
+ f"Unknown action format: {action_str}\n"
1137
+ f'Expected: CLICK(id), TYPE(id, "text"), PRESS("key"), or FINISH()'
1138
+ )
1139
+
1140
+ def _track_tokens(self, goal: str, llm_response: LLMResponse):
1141
+ """Track token usage for analytics (same as sync version)"""
1142
+ if llm_response.prompt_tokens:
1143
+ self._token_usage_raw["total_prompt_tokens"] += llm_response.prompt_tokens
1144
+ if llm_response.completion_tokens:
1145
+ self._token_usage_raw["total_completion_tokens"] += llm_response.completion_tokens
1146
+ if llm_response.total_tokens:
1147
+ self._token_usage_raw["total_tokens"] += llm_response.total_tokens
1148
+
1149
+ self._token_usage_raw["by_action"].append(
1150
+ {
1151
+ "goal": goal,
1152
+ "prompt_tokens": llm_response.prompt_tokens or 0,
1153
+ "completion_tokens": llm_response.completion_tokens or 0,
1154
+ "total_tokens": llm_response.total_tokens or 0,
1155
+ "model": llm_response.model_name,
1156
+ }
1157
+ )
1158
+
1159
+ def get_token_stats(self) -> TokenStats:
1160
+ """Get token usage statistics (same as sync version)"""
1161
+ by_action = [ActionTokenUsage(**action) for action in self._token_usage_raw["by_action"]]
1162
+ return TokenStats(
1163
+ total_prompt_tokens=self._token_usage_raw["total_prompt_tokens"],
1164
+ total_completion_tokens=self._token_usage_raw["total_completion_tokens"],
1165
+ total_tokens=self._token_usage_raw["total_tokens"],
1166
+ by_action=by_action,
1167
+ )
1168
+
1169
+ def get_history(self) -> list[ActionHistory]:
1170
+ """Get execution history (same as sync version)"""
1171
+ return [ActionHistory(**h) for h in self.history]
1172
+
1173
+ def clear_history(self) -> None:
1174
+ """Clear execution history and reset token counters (same as sync version)"""
1175
+ self.history.clear()
1176
+ self._token_usage_raw = {
1177
+ "total_prompt_tokens": 0,
1178
+ "total_completion_tokens": 0,
1179
+ "total_tokens": 0,
1180
+ "by_action": [],
1181
+ }
1182
+
1183
+ def filter_elements(self, snapshot: Snapshot, goal: str | None = None) -> list[Element]:
1184
+ """Filter elements from snapshot based on goal context (same as sync version)"""
1185
+ elements = snapshot.elements
1186
+
1187
+ # If no goal provided, return all elements (up to limit)
1188
+ if not goal:
1189
+ return elements[: self.default_snapshot_limit]
1190
+
1191
+ goal_lower = goal.lower()
1192
+
1193
+ # Extract keywords from goal
1194
+ keywords = self._extract_keywords(goal_lower)
1195
+
1196
+ # Boost elements matching goal keywords
1197
+ scored_elements = []
1198
+ for el in elements:
1199
+ score = el.importance
1200
+
1201
+ # Boost if element text matches goal
1202
+ if el.text and any(kw in el.text.lower() for kw in keywords):
1203
+ score += 0.3
1204
+
1205
+ # Boost if role matches goal intent
1206
+ if "click" in goal_lower and el.visual_cues.is_clickable:
1207
+ score += 0.2
1208
+ if "type" in goal_lower and el.role in ["textbox", "searchbox"]:
1209
+ score += 0.2
1210
+ if "search" in goal_lower:
1211
+ # Filter out non-interactive elements for search tasks
1212
+ if el.role in ["link", "img"] and not el.visual_cues.is_primary:
1213
+ score -= 0.5
1214
+
1215
+ scored_elements.append((score, el))
1216
+
1217
+ # Re-sort by boosted score
1218
+ scored_elements.sort(key=lambda x: x[0], reverse=True)
1219
+ elements = [el for _, el in scored_elements]
1220
+
1221
+ return elements[: self.default_snapshot_limit]
1222
+
1223
+ def _extract_keywords(self, text: str) -> list[str]:
1224
+ """Extract meaningful keywords from goal text (same as sync version)"""
1225
+ stopwords = {
1226
+ "the",
1227
+ "a",
1228
+ "an",
1229
+ "and",
1230
+ "or",
1231
+ "but",
1232
+ "in",
1233
+ "on",
1234
+ "at",
1235
+ "to",
1236
+ "for",
1237
+ "of",
1238
+ "with",
1239
+ "by",
1240
+ "from",
1241
+ "as",
1242
+ "is",
1243
+ "was",
1244
+ }
1245
+ words = text.split()
1246
+ return [w for w in words if w not in stopwords and len(w) > 2]