sentienceapi 0.95.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sentienceapi might be problematic. Click here for more details.

Files changed (82) hide show
  1. sentience/__init__.py +253 -0
  2. sentience/_extension_loader.py +195 -0
  3. sentience/action_executor.py +215 -0
  4. sentience/actions.py +1020 -0
  5. sentience/agent.py +1181 -0
  6. sentience/agent_config.py +46 -0
  7. sentience/agent_runtime.py +424 -0
  8. sentience/asserts/__init__.py +70 -0
  9. sentience/asserts/expect.py +621 -0
  10. sentience/asserts/query.py +383 -0
  11. sentience/async_api.py +108 -0
  12. sentience/backends/__init__.py +137 -0
  13. sentience/backends/actions.py +343 -0
  14. sentience/backends/browser_use_adapter.py +241 -0
  15. sentience/backends/cdp_backend.py +393 -0
  16. sentience/backends/exceptions.py +211 -0
  17. sentience/backends/playwright_backend.py +194 -0
  18. sentience/backends/protocol.py +216 -0
  19. sentience/backends/sentience_context.py +469 -0
  20. sentience/backends/snapshot.py +427 -0
  21. sentience/base_agent.py +196 -0
  22. sentience/browser.py +1215 -0
  23. sentience/browser_evaluator.py +299 -0
  24. sentience/canonicalization.py +207 -0
  25. sentience/cli.py +130 -0
  26. sentience/cloud_tracing.py +807 -0
  27. sentience/constants.py +6 -0
  28. sentience/conversational_agent.py +543 -0
  29. sentience/element_filter.py +136 -0
  30. sentience/expect.py +188 -0
  31. sentience/extension/background.js +104 -0
  32. sentience/extension/content.js +161 -0
  33. sentience/extension/injected_api.js +914 -0
  34. sentience/extension/manifest.json +36 -0
  35. sentience/extension/pkg/sentience_core.d.ts +51 -0
  36. sentience/extension/pkg/sentience_core.js +323 -0
  37. sentience/extension/pkg/sentience_core_bg.wasm +0 -0
  38. sentience/extension/pkg/sentience_core_bg.wasm.d.ts +10 -0
  39. sentience/extension/release.json +115 -0
  40. sentience/formatting.py +15 -0
  41. sentience/generator.py +202 -0
  42. sentience/inspector.py +367 -0
  43. sentience/llm_interaction_handler.py +191 -0
  44. sentience/llm_provider.py +875 -0
  45. sentience/llm_provider_utils.py +120 -0
  46. sentience/llm_response_builder.py +153 -0
  47. sentience/models.py +846 -0
  48. sentience/ordinal.py +280 -0
  49. sentience/overlay.py +222 -0
  50. sentience/protocols.py +228 -0
  51. sentience/query.py +303 -0
  52. sentience/read.py +188 -0
  53. sentience/recorder.py +589 -0
  54. sentience/schemas/trace_v1.json +335 -0
  55. sentience/screenshot.py +100 -0
  56. sentience/sentience_methods.py +86 -0
  57. sentience/snapshot.py +706 -0
  58. sentience/snapshot_diff.py +126 -0
  59. sentience/text_search.py +262 -0
  60. sentience/trace_event_builder.py +148 -0
  61. sentience/trace_file_manager.py +197 -0
  62. sentience/trace_indexing/__init__.py +27 -0
  63. sentience/trace_indexing/index_schema.py +199 -0
  64. sentience/trace_indexing/indexer.py +414 -0
  65. sentience/tracer_factory.py +322 -0
  66. sentience/tracing.py +449 -0
  67. sentience/utils/__init__.py +40 -0
  68. sentience/utils/browser.py +46 -0
  69. sentience/utils/element.py +257 -0
  70. sentience/utils/formatting.py +59 -0
  71. sentience/utils.py +296 -0
  72. sentience/verification.py +380 -0
  73. sentience/visual_agent.py +2058 -0
  74. sentience/wait.py +139 -0
  75. sentienceapi-0.95.0.dist-info/METADATA +984 -0
  76. sentienceapi-0.95.0.dist-info/RECORD +82 -0
  77. sentienceapi-0.95.0.dist-info/WHEEL +5 -0
  78. sentienceapi-0.95.0.dist-info/entry_points.txt +2 -0
  79. sentienceapi-0.95.0.dist-info/licenses/LICENSE +24 -0
  80. sentienceapi-0.95.0.dist-info/licenses/LICENSE-APACHE +201 -0
  81. sentienceapi-0.95.0.dist-info/licenses/LICENSE-MIT +21 -0
  82. sentienceapi-0.95.0.dist-info/top_level.txt +1 -0
sentience/agent.py ADDED
@@ -0,0 +1,1181 @@
1
+ """
2
+ Sentience Agent: High-level automation agent using LLM + SDK
3
+ Implements observe-think-act loop for natural language commands
4
+ """
5
+
6
+ import asyncio
7
+ import hashlib
8
+ import time
9
+ from typing import TYPE_CHECKING, Any, Optional, Union
10
+
11
+ from .action_executor import ActionExecutor
12
+ from .agent_config import AgentConfig
13
+ from .base_agent import BaseAgent, BaseAgentAsync
14
+ from .browser import AsyncSentienceBrowser, SentienceBrowser
15
+ from .element_filter import ElementFilter
16
+ from .llm_interaction_handler import LLMInteractionHandler
17
+ from .llm_provider import LLMProvider, LLMResponse
18
+ from .models import (
19
+ ActionHistory,
20
+ ActionTokenUsage,
21
+ AgentActionResult,
22
+ Element,
23
+ ScreenshotConfig,
24
+ Snapshot,
25
+ SnapshotOptions,
26
+ TokenStats,
27
+ )
28
+ from .protocols import AsyncBrowserProtocol, BrowserProtocol
29
+ from .snapshot import snapshot, snapshot_async
30
+ from .snapshot_diff import SnapshotDiff
31
+ from .trace_event_builder import TraceEventBuilder
32
+
33
+ if TYPE_CHECKING:
34
+ from .tracing import Tracer
35
+
36
+
37
+ def _safe_tracer_call(
38
+ tracer: Optional["Tracer"], method_name: str, verbose: bool, *args, **kwargs
39
+ ) -> None:
40
+ """
41
+ Safely call tracer method, catching and logging errors without breaking execution.
42
+
43
+ Args:
44
+ tracer: Tracer instance or None
45
+ method_name: Name of tracer method to call (e.g., "emit", "emit_error")
46
+ verbose: Whether to print error messages
47
+ *args: Positional arguments for the tracer method
48
+ **kwargs: Keyword arguments for the tracer method
49
+ """
50
+ if not tracer:
51
+ return
52
+ try:
53
+ method = getattr(tracer, method_name)
54
+ if args and kwargs:
55
+ method(*args, **kwargs)
56
+ elif args:
57
+ method(*args)
58
+ elif kwargs:
59
+ method(**kwargs)
60
+ else:
61
+ method()
62
+ except Exception as tracer_error:
63
+ # Tracer errors should not break agent execution
64
+ if verbose:
65
+ print(f"⚠️ Tracer error (non-fatal): {tracer_error}")
66
+
67
+
68
+ class SentienceAgent(BaseAgent):
69
+ """
70
+ High-level agent that combines Sentience SDK with any LLM provider.
71
+
72
+ Uses observe-think-act loop to execute natural language commands:
73
+ 1. OBSERVE: Get snapshot of current page state
74
+ 2. THINK: Query LLM to decide next action
75
+ 3. ACT: Execute action using SDK
76
+
77
+ Example:
78
+ >>> from sentience import SentienceBrowser, SentienceAgent
79
+ >>> from sentience.llm_provider import OpenAIProvider
80
+ >>>
81
+ >>> browser = SentienceBrowser(api_key="sentience_key")
82
+ >>> llm = OpenAIProvider(api_key="openai_key", model="gpt-4o")
83
+ >>> agent = SentienceAgent(browser, llm)
84
+ >>>
85
+ >>> with browser:
86
+ >>> browser.page.goto("https://google.com")
87
+ >>> agent.act("Click the search box")
88
+ >>> agent.act("Type 'magic mouse' into the search field")
89
+ >>> agent.act("Press Enter key")
90
+ """
91
+
92
def __init__(
    self,
    browser: SentienceBrowser | BrowserProtocol,
    llm: LLMProvider,
    default_snapshot_limit: int = 50,
    verbose: bool = True,
    tracer: Optional["Tracer"] = None,
    config: Optional["AgentConfig"] = None,
):
    """
    Set up a synchronous Sentience agent.

    Args:
        browser: SentienceBrowser instance or any BrowserProtocol-compatible
                 object (e.g. a mock used in tests)
        llm: LLM provider used to decide the next action
        default_snapshot_limit: Maximum number of elements kept in the LLM
                                context when no SnapshotOptions are given (default: 50)
        verbose: Print execution logs (default: True)
        tracer: Optional Tracer instance for execution tracking (default: None)
        config: Optional AgentConfig; a default AgentConfig() is created
                when omitted (default: None)
    """
    # Collaborators and settings supplied by the caller
    self.browser = browser
    self.llm = llm
    self.default_snapshot_limit = default_snapshot_limit
    self.verbose = verbose
    self.tracer = tracer
    self.config = config or AgentConfig()

    # Helpers that talk to the LLM and drive the browser
    self.llm_handler = LLMInteractionHandler(llm)
    self.action_executor = ActionExecutor(browser)

    # One dict per act() call, appended in execution order
    self.history: list[dict[str, Any]] = []

    # Running token totals plus a per-action breakdown
    # (turned into a TokenStats object by get_token_stats())
    self._token_usage_raw = dict(
        total_prompt_tokens=0,
        total_completion_tokens=0,
        total_tokens=0,
        by_action=[],
    )

    # Incremented once per act() call; used to build trace step ids
    self._step_count = 0

    # Last raw snapshot, kept so the next one can be diffed against it
    self._previous_snapshot: Snapshot | None = None
141
+
142
+ def _compute_hash(self, text: str) -> str:
143
+ """Compute SHA256 hash of text."""
144
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()
145
+
146
+ def _get_element_bbox(self, element_id: int | None, snap: Snapshot) -> dict[str, float] | None:
147
+ """Get bounding box for an element from snapshot."""
148
+ if element_id is None:
149
+ return None
150
+ for el in snap.elements:
151
+ if el.id == element_id:
152
+ return {
153
+ "x": el.bbox.x,
154
+ "y": el.bbox.y,
155
+ "width": el.bbox.width,
156
+ "height": el.bbox.height,
157
+ }
158
+ return None
159
+
160
def act(  # noqa: C901
    self,
    goal: str,
    max_retries: int = 2,
    snapshot_options: SnapshotOptions | None = None,
) -> AgentActionResult:
    """
    Execute a high-level goal using observe → think → act loop

    Each attempt takes a snapshot, filters its elements for the goal, asks the
    LLM for an action, executes it and appends the outcome to ``self.history``.
    When a tracer is configured, step_start / snapshot / llm_query / action /
    step_end events (or an error event) are emitted along the way.

    Args:
        goal: Natural language instruction (e.g., "Click the Sign In button")
        max_retries: Number of retries on failure (default: 2)
        snapshot_options: Optional SnapshotOptions for this specific action.
            A shallow copy is used internally, so the caller's object is not
            modified and can be reused for later calls with a different goal.

    Returns:
        AgentActionResult with execution details

    Raises:
        RuntimeError: If the final attempt raised an exception; the original
            exception is chained as ``__cause__``.

    Example:
        >>> result = agent.act("Click the search box")
        >>> print(result.success, result.action, result.element_id)
        True click 42
        >>> # Backward compatible dict access
        >>> print(result["element_id"])  # Works but shows deprecation warning
        42
    """
    # Local import: only needed to protect caller-supplied options from mutation
    import copy

    if self.verbose:
        print(f"\n{'=' * 70}")
        print(f"🤖 Agent Goal: {goal}")
        print(f"{'=' * 70}")

    # Generate step ID for tracing
    self._step_count += 1
    step_id = f"step-{self._step_count}"

    # Emit step_start trace event if tracer is enabled
    if self.tracer:
        pre_url = self.browser.page.url if self.browser.page else None
        _safe_tracer_call(
            self.tracer,
            "emit_step_start",
            self.verbose,
            step_id=step_id,
            step_index=self._step_count,
            goal=goal,
            attempt=0,
            pre_url=pre_url,
        )

    for attempt in range(max_retries + 1):
        try:
            # 1. OBSERVE: Get refined semantic snapshot
            start_time = time.time()

            # Use provided options or create default. Work on a shallow copy of
            # caller-supplied options: assigning the goal on the caller's object
            # would make a reused options instance keep the first goal forever.
            # NOTE(review): assumes SnapshotOptions supports copy.copy — confirm.
            if snapshot_options:
                snap_opts = copy.copy(snapshot_options)
            else:
                snap_opts = SnapshotOptions(limit=self.default_snapshot_limit)
            # Only set goal if not already provided
            if snap_opts.goal is None:
                snap_opts.goal = goal

            # Apply AgentConfig screenshot settings if not overridden by snapshot_options
            if snapshot_options is None and self.config:
                if self.config.capture_screenshots:
                    # Create ScreenshotConfig from AgentConfig
                    snap_opts.screenshot = ScreenshotConfig(
                        format=self.config.screenshot_format,
                        quality=(
                            self.config.screenshot_quality
                            if self.config.screenshot_format == "jpeg"
                            else None
                        ),
                    )
                else:
                    snap_opts.screenshot = False
                # Apply show_overlay from AgentConfig
                snap_opts.show_overlay = self.config.show_overlay

            # Call snapshot with options object (matches TypeScript API)
            snap = snapshot(self.browser, snap_opts)

            if snap.status != "success":
                raise RuntimeError(f"Snapshot failed: {snap.error}")

            # Compute diff_status by comparing with previous snapshot
            elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)

            # Create snapshot with diff_status populated
            snap_with_diff = Snapshot(
                status=snap.status,
                timestamp=snap.timestamp,
                url=snap.url,
                viewport=snap.viewport,
                elements=elements_with_diff,
                screenshot=snap.screenshot,
                screenshot_format=snap.screenshot_format,
                error=snap.error,
            )

            # Update previous snapshot for next comparison
            self._previous_snapshot = snap

            # Apply element filtering based on goal
            filtered_elements = self.filter_elements(snap_with_diff, goal)

            # Emit snapshot trace event if tracer is enabled
            if self.tracer:
                # Build snapshot event data (use snap_with_diff to include diff_status)
                snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)

                # Always include screenshot in trace event for studio viewer compatibility
                # CloudTraceSink will extract and upload screenshots separately, then remove
                # screenshot_base64 from events before uploading the trace file.
                if snap.screenshot:
                    # Extract base64 string from data URL if needed
                    if snap.screenshot.startswith("data:image"):
                        # Format: "data:image/jpeg;base64,{base64_string}"
                        screenshot_base64 = (
                            snap.screenshot.split(",", 1)[1]
                            if "," in snap.screenshot
                            else snap.screenshot
                        )
                    else:
                        screenshot_base64 = snap.screenshot

                    snapshot_data["screenshot_base64"] = screenshot_base64
                    if snap.screenshot_format:
                        snapshot_data["screenshot_format"] = snap.screenshot_format

                _safe_tracer_call(
                    self.tracer,
                    "emit",
                    self.verbose,
                    "snapshot",
                    snapshot_data,
                    step_id=step_id,
                )

            # Create filtered snapshot (use snap_with_diff to preserve metadata)
            filtered_snap = Snapshot(
                status=snap_with_diff.status,
                timestamp=snap_with_diff.timestamp,
                url=snap_with_diff.url,
                viewport=snap_with_diff.viewport,
                elements=filtered_elements,
                screenshot=snap_with_diff.screenshot,
                screenshot_format=snap_with_diff.screenshot_format,
                error=snap_with_diff.error,
            )

            # 2. GROUND: Format elements for LLM context
            context = self.llm_handler.build_context(filtered_snap, goal)

            # 3. THINK: Query LLM for next action
            llm_response = self.llm_handler.query_llm(context, goal)

            # Emit LLM query trace event if tracer is enabled
            if self.tracer:
                _safe_tracer_call(
                    self.tracer,
                    "emit",
                    self.verbose,
                    "llm_query",
                    {
                        "prompt_tokens": llm_response.prompt_tokens,
                        "completion_tokens": llm_response.completion_tokens,
                        "model": llm_response.model_name,
                        "response": llm_response.content[:200],  # Truncate for brevity
                    },
                    step_id=step_id,
                )

            if self.verbose:
                print(f"🧠 LLM Decision: {llm_response.content}")

            # Track token usage
            self._track_tokens(goal, llm_response)

            # Parse action from LLM response
            action_str = self.llm_handler.extract_action(llm_response.content)

            # 4. EXECUTE: Parse and run action
            result_dict = self.action_executor.execute(action_str, filtered_snap)

            duration_ms = int((time.time() - start_time) * 1000)

            # Create AgentActionResult from execution result
            result = AgentActionResult(
                success=result_dict["success"],
                action=result_dict["action"],
                goal=goal,
                duration_ms=duration_ms,
                attempt=attempt,
                element_id=result_dict.get("element_id"),
                text=result_dict.get("text"),
                key=result_dict.get("key"),
                outcome=result_dict.get("outcome"),
                url_changed=result_dict.get("url_changed"),
                error=result_dict.get("error"),
                message=result_dict.get("message"),
            )

            # Emit action execution trace event if tracer is enabled
            if self.tracer:
                post_url = self.browser.page.url if self.browser.page else None

                # Include element data for live overlay visualization
                elements_data = [
                    {
                        "id": el.id,
                        "bbox": {
                            "x": el.bbox.x,
                            "y": el.bbox.y,
                            "width": el.bbox.width,
                            "height": el.bbox.height,
                        },
                        "role": el.role,
                        "text": el.text[:50] if el.text else "",
                    }
                    for el in filtered_snap.elements[:50]
                ]

                _safe_tracer_call(
                    self.tracer,
                    "emit",
                    self.verbose,
                    "action",
                    {
                        "action": result.action,
                        "element_id": result.element_id,
                        "success": result.success,
                        "outcome": result.outcome,
                        "duration_ms": duration_ms,
                        "post_url": post_url,
                        "elements": elements_data,  # Add element data for overlay
                        "target_element_id": result.element_id,  # Highlight target in red
                    },
                    step_id=step_id,
                )

            # 5. RECORD: Track history
            self.history.append(
                {
                    "goal": goal,
                    "action": action_str,
                    "result": result.model_dump(),  # Store as dict
                    "success": result.success,
                    "attempt": attempt,
                    "duration_ms": duration_ms,
                }
            )

            if self.verbose:
                status = "✅" if result.success else "❌"
                print(f"{status} Completed in {duration_ms}ms")

            # Emit step completion trace event if tracer is enabled
            if self.tracer:
                # The snapshot URL is used as the step's pre-action URL
                pre_url = snap.url
                post_url = self.browser.page.url if self.browser.page else None

                # Compute snapshot digest (simplified - use URL + timestamp)
                snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"

                # Build LLM data
                llm_response_text = llm_response.content
                llm_response_hash = f"sha256:{self._compute_hash(llm_response_text)}"
                llm_data = {
                    "response_text": llm_response_text,
                    "response_hash": llm_response_hash,
                    "usage": {
                        "prompt_tokens": llm_response.prompt_tokens or 0,
                        "completion_tokens": llm_response.completion_tokens or 0,
                        "total_tokens": llm_response.total_tokens or 0,
                    },
                }

                # Build exec data
                exec_data = {
                    "success": result.success,
                    "action": result.action,
                    "outcome": result.outcome
                    or (
                        f"Action {result.action} executed successfully"
                        if result.success
                        else f"Action {result.action} failed"
                    ),
                    "duration_ms": duration_ms,
                }

                # Look the target's bounding box up once; it is used by both the
                # exec and verify sections (None when no element was targeted
                # or the id is not present in the snapshot).
                target_bbox = self._get_element_bbox(result.element_id, snap)

                # Add optional exec fields
                if result.element_id is not None:
                    exec_data["element_id"] = result.element_id
                    # Add bounding box if element found
                    if target_bbox:
                        exec_data["bounding_box"] = target_bbox
                if result.text is not None:
                    exec_data["text"] = result.text
                if result.key is not None:
                    exec_data["key"] = result.key
                if result.error is not None:
                    exec_data["error"] = result.error

                # Build verify data (simplified - based on success and url_changed)
                verify_passed = result.success and (
                    result.url_changed or result.action != "click"
                )
                verify_signals = {
                    "url_changed": result.url_changed or False,
                }
                if result.error:
                    verify_signals["error"] = result.error

                # Add elements_found array if element was targeted
                if result.element_id is not None and target_bbox:
                    verify_signals["elements_found"] = [
                        {
                            "label": f"Element {result.element_id}",
                            # Separate dict so exec and verify data stay independent
                            "bounding_box": dict(target_bbox),
                        }
                    ]

                verify_data = {
                    "passed": verify_passed,
                    "signals": verify_signals,
                }

                # Build elements data for pre field (include diff_status from snap_with_diff)
                # Use the same format as build_snapshot_event for consistency
                snapshot_event_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
                pre_elements = snapshot_event_data.get("elements", [])

                # Build complete step_end event
                step_end_data = TraceEventBuilder.build_step_end_event(
                    step_id=step_id,
                    step_index=self._step_count,
                    goal=goal,
                    attempt=attempt,
                    pre_url=pre_url,
                    post_url=post_url,
                    snapshot_digest=snapshot_digest,
                    llm_data=llm_data,
                    exec_data=exec_data,
                    verify_data=verify_data,
                    pre_elements=pre_elements,
                )

                _safe_tracer_call(
                    self.tracer,
                    "emit",
                    self.verbose,
                    "step_end",
                    step_end_data,
                    step_id=step_id,
                )

            return result

        except Exception as e:
            # Emit error trace event if tracer is enabled
            if self.tracer:
                _safe_tracer_call(
                    self.tracer,
                    "emit_error",
                    self.verbose,
                    step_id=step_id,
                    error=str(e),
                    attempt=attempt,
                )

            if attempt < max_retries:
                if self.verbose:
                    print(f"⚠️ Retry {attempt + 1}/{max_retries}: {e}")
                time.sleep(1.0)  # Brief delay before retry
                continue

            # Retries exhausted: record the failure, then surface it
            error_result = AgentActionResult(
                success=False,
                action="error",
                goal=goal,
                duration_ms=0,
                attempt=attempt,
                error=str(e),
            )
            self.history.append(
                {
                    "goal": goal,
                    "action": "error",
                    "result": error_result.model_dump(),
                    "success": False,
                    "attempt": attempt,
                    "duration_ms": 0,
                }
            )
            # Chain explicitly so the original exception is the reported cause
            raise RuntimeError(f"Failed after {max_retries} retries: {e}") from e
558
+
559
+ def _track_tokens(self, goal: str, llm_response: LLMResponse):
560
+ """
561
+ Track token usage for analytics
562
+
563
+ Args:
564
+ goal: User goal
565
+ llm_response: LLM response with token usage
566
+ """
567
+ if llm_response.prompt_tokens:
568
+ self._token_usage_raw["total_prompt_tokens"] += llm_response.prompt_tokens
569
+ if llm_response.completion_tokens:
570
+ self._token_usage_raw["total_completion_tokens"] += llm_response.completion_tokens
571
+ if llm_response.total_tokens:
572
+ self._token_usage_raw["total_tokens"] += llm_response.total_tokens
573
+
574
+ self._token_usage_raw["by_action"].append(
575
+ {
576
+ "goal": goal,
577
+ "prompt_tokens": llm_response.prompt_tokens or 0,
578
+ "completion_tokens": llm_response.completion_tokens or 0,
579
+ "total_tokens": llm_response.total_tokens or 0,
580
+ "model": llm_response.model_name,
581
+ }
582
+ )
583
+
584
def get_token_stats(self) -> TokenStats:
    """
    Build a TokenStats object from the raw token counters.

    Returns:
        TokenStats holding the running totals and one ActionTokenUsage per
        recorded LLM call
    """
    raw = self._token_usage_raw
    per_action = []
    for entry in raw["by_action"]:
        per_action.append(ActionTokenUsage(**entry))
    return TokenStats(
        total_prompt_tokens=raw["total_prompt_tokens"],
        total_completion_tokens=raw["total_completion_tokens"],
        total_tokens=raw["total_tokens"],
        by_action=per_action,
    )
598
+
599
def get_history(self) -> list[ActionHistory]:
    """
    Return the execution history as typed entries.

    Returns:
        One ActionHistory per recorded history dict, in execution order
    """
    entries: list[ActionHistory] = []
    for record in self.history:
        entries.append(ActionHistory(**record))
    return entries
607
+
608
def clear_history(self) -> None:
    """Empty the history list in place and start token counting from zero"""
    self.history.clear()
    # A fresh dict (not an in-place reset) replaces the old counters
    self._token_usage_raw = dict(
        total_prompt_tokens=0,
        total_completion_tokens=0,
        total_tokens=0,
        by_action=[],
    )
617
+
618
def filter_elements(self, snapshot: Snapshot, goal: str | None = None) -> list[Element]:
    """
    Select the snapshot elements to show to the LLM.

    Delegates to ElementFilter.filter_by_goal, passing the agent's
    default_snapshot_limit as the third argument.

    Args:
        snapshot: Current page snapshot
        goal: User's goal, forwarded to the filter (may be None)

    Returns:
        Filtered list of elements
    """
    limit = self.default_snapshot_limit
    selected = ElementFilter.filter_by_goal(snapshot, goal, limit)
    return selected
633
+
634
+
635
+ class SentienceAgentAsync(BaseAgentAsync):
636
+ """
637
+ High-level async agent that combines Sentience SDK with any LLM provider.
638
+
639
+ Uses observe-think-act loop to execute natural language commands:
640
+ 1. OBSERVE: Get snapshot of current page state
641
+ 2. THINK: Query LLM to decide next action
642
+ 3. ACT: Execute action using SDK
643
+
644
+ Example:
645
+ >>> from sentience.async_api import AsyncSentienceBrowser
646
+ >>> from sentience.agent import SentienceAgentAsync
647
+ >>> from sentience.llm_provider import OpenAIProvider
648
+ >>>
649
+ >>> async with AsyncSentienceBrowser() as browser:
650
+ >>> await browser.goto("https://google.com")
651
+ >>> llm = OpenAIProvider(api_key="openai_key", model="gpt-4o")
652
+ >>> agent = SentienceAgentAsync(browser, llm)
653
+ >>> await agent.act("Click the search box")
654
+ >>> await agent.act("Type 'magic mouse' into the search field")
655
+ >>> await agent.act("Press Enter key")
656
+ """
657
+
658
def __init__(
    self,
    browser: AsyncSentienceBrowser,
    llm: LLMProvider,
    default_snapshot_limit: int = 50,
    verbose: bool = True,
    tracer: Optional["Tracer"] = None,
    config: Optional["AgentConfig"] = None,
):
    """
    Set up an asynchronous Sentience agent.

    Args:
        browser: AsyncSentienceBrowser instance
        llm: LLM provider used to decide the next action
        default_snapshot_limit: Maximum number of elements kept in the LLM
                                context when no SnapshotOptions are given (default: 50)
        verbose: Print execution logs (default: True)
        tracer: Optional Tracer instance for execution tracking (default: None)
        config: Optional AgentConfig; a default AgentConfig() is created
                when omitted (default: None)
    """
    # Collaborators and settings supplied by the caller
    self.browser = browser
    self.llm = llm
    self.default_snapshot_limit = default_snapshot_limit
    self.verbose = verbose
    self.tracer = tracer
    self.config = config or AgentConfig()

    # Helpers that talk to the LLM and drive the browser
    self.llm_handler = LLMInteractionHandler(llm)
    self.action_executor = ActionExecutor(browser)

    # One dict per act() call, appended in execution order
    self.history: list[dict[str, Any]] = []

    # Running token totals plus a per-action breakdown
    # (turned into a TokenStats object by get_token_stats())
    self._token_usage_raw = dict(
        total_prompt_tokens=0,
        total_completion_tokens=0,
        total_tokens=0,
        by_action=[],
    )

    # Incremented once per act() call; used to build trace step ids
    self._step_count = 0

    # Last raw snapshot, kept so the next one can be diffed against it
    self._previous_snapshot: Snapshot | None = None
706
+
707
+ def _compute_hash(self, text: str) -> str:
708
+ """Compute SHA256 hash of text."""
709
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()
710
+
711
+ def _get_element_bbox(self, element_id: int | None, snap: Snapshot) -> dict[str, float] | None:
712
+ """Get bounding box for an element from snapshot."""
713
+ if element_id is None:
714
+ return None
715
+ for el in snap.elements:
716
+ if el.id == element_id:
717
+ return {
718
+ "x": el.bbox.x,
719
+ "y": el.bbox.y,
720
+ "width": el.bbox.width,
721
+ "height": el.bbox.height,
722
+ }
723
+ return None
724
+
725
+ async def act( # noqa: C901
726
+ self,
727
+ goal: str,
728
+ max_retries: int = 2,
729
+ snapshot_options: SnapshotOptions | None = None,
730
+ ) -> AgentActionResult:
731
+ """
732
+ Execute a high-level goal using observe → think → act loop (async)
733
+
734
+ Args:
735
+ goal: Natural language instruction (e.g., "Click the Sign In button")
736
+ max_retries: Number of retries on failure (default: 2)
737
+ snapshot_options: Optional SnapshotOptions for this specific action
738
+
739
+ Returns:
740
+ AgentActionResult with execution details
741
+
742
+ Example:
743
+ >>> result = await agent.act("Click the search box")
744
+ >>> print(result.success, result.action, result.element_id)
745
+ True click 42
746
+ """
747
+ if self.verbose:
748
+ print(f"\n{'=' * 70}")
749
+ print(f"🤖 Agent Goal: {goal}")
750
+ print(f"{'=' * 70}")
751
+
752
+ # Generate step ID for tracing
753
+ self._step_count += 1
754
+ step_id = f"step-{self._step_count}"
755
+
756
+ # Emit step_start trace event if tracer is enabled
757
+ if self.tracer:
758
+ pre_url = self.browser.page.url if self.browser.page else None
759
+ _safe_tracer_call(
760
+ self.tracer,
761
+ "emit_step_start",
762
+ self.verbose,
763
+ step_id=step_id,
764
+ step_index=self._step_count,
765
+ goal=goal,
766
+ attempt=0,
767
+ pre_url=pre_url,
768
+ )
769
+
770
+ for attempt in range(max_retries + 1):
771
+ try:
772
+ # 1. OBSERVE: Get refined semantic snapshot
773
+ start_time = time.time()
774
+
775
+ # Use provided options or create default
776
+ snap_opts = snapshot_options or SnapshotOptions(limit=self.default_snapshot_limit)
777
+ # Only set goal if not already provided
778
+ if snap_opts.goal is None:
779
+ snap_opts.goal = goal
780
+
781
+ # Apply AgentConfig screenshot settings if not overridden by snapshot_options
782
+ # Only apply if snapshot_options wasn't provided OR if screenshot wasn't explicitly set
783
+ # (snapshot_options.screenshot defaults to False, so we check if it's still False)
784
+ if self.config and (snapshot_options is None or snap_opts.screenshot is False):
785
+ if self.config.capture_screenshots:
786
+ # Create ScreenshotConfig from AgentConfig
787
+ snap_opts.screenshot = ScreenshotConfig(
788
+ format=self.config.screenshot_format,
789
+ quality=(
790
+ self.config.screenshot_quality
791
+ if self.config.screenshot_format == "jpeg"
792
+ else None
793
+ ),
794
+ )
795
+ else:
796
+ snap_opts.screenshot = False
797
+ # Apply show_overlay from AgentConfig
798
+ # Note: User can override by explicitly passing show_overlay in snapshot_options
799
+ snap_opts.show_overlay = self.config.show_overlay
800
+
801
+ # Call snapshot with options object (matches TypeScript API)
802
+ snap = await snapshot_async(self.browser, snap_opts)
803
+
804
+ if snap.status != "success":
805
+ raise RuntimeError(f"Snapshot failed: {snap.error}")
806
+
807
+ # Compute diff_status by comparing with previous snapshot
808
+ elements_with_diff = SnapshotDiff.compute_diff_status(snap, self._previous_snapshot)
809
+
810
+ # Create snapshot with diff_status populated
811
+ snap_with_diff = Snapshot(
812
+ status=snap.status,
813
+ timestamp=snap.timestamp,
814
+ url=snap.url,
815
+ viewport=snap.viewport,
816
+ elements=elements_with_diff,
817
+ screenshot=snap.screenshot,
818
+ screenshot_format=snap.screenshot_format,
819
+ error=snap.error,
820
+ )
821
+
822
+ # Update previous snapshot for next comparison
823
+ self._previous_snapshot = snap
824
+
825
+ # Apply element filtering based on goal
826
+ filtered_elements = self.filter_elements(snap_with_diff, goal)
827
+
828
+ # Emit snapshot trace event if tracer is enabled
829
+ if self.tracer:
830
+ # Build snapshot event data (use snap_with_diff to include diff_status)
831
+ snapshot_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
832
+
833
+ # Always include screenshot in trace event for studio viewer compatibility
834
+ # CloudTraceSink will extract and upload screenshots separately, then remove
835
+ # screenshot_base64 from events before uploading the trace file.
836
+ if snap.screenshot:
837
+ # Extract base64 string from data URL if needed
838
+ if snap.screenshot.startswith("data:image"):
839
+ # Format: "data:image/jpeg;base64,{base64_string}"
840
+ screenshot_base64 = (
841
+ snap.screenshot.split(",", 1)[1]
842
+ if "," in snap.screenshot
843
+ else snap.screenshot
844
+ )
845
+ else:
846
+ screenshot_base64 = snap.screenshot
847
+
848
+ snapshot_data["screenshot_base64"] = screenshot_base64
849
+ if snap.screenshot_format:
850
+ snapshot_data["screenshot_format"] = snap.screenshot_format
851
+
852
+ _safe_tracer_call(
853
+ self.tracer,
854
+ "emit",
855
+ self.verbose,
856
+ "snapshot",
857
+ snapshot_data,
858
+ step_id=step_id,
859
+ )
860
+
861
+ # Create filtered snapshot (use snap_with_diff to preserve metadata)
862
+ filtered_snap = Snapshot(
863
+ status=snap_with_diff.status,
864
+ timestamp=snap_with_diff.timestamp,
865
+ url=snap_with_diff.url,
866
+ viewport=snap_with_diff.viewport,
867
+ elements=filtered_elements,
868
+ screenshot=snap_with_diff.screenshot,
869
+ screenshot_format=snap_with_diff.screenshot_format,
870
+ error=snap_with_diff.error,
871
+ )
872
+
873
+ # 2. GROUND: Format elements for LLM context
874
+ context = self.llm_handler.build_context(filtered_snap, goal)
875
+
876
+ # 3. THINK: Query LLM for next action
877
+ llm_response = self.llm_handler.query_llm(context, goal)
878
+
879
+ # Emit LLM query trace event if tracer is enabled
880
+ if self.tracer:
881
+ _safe_tracer_call(
882
+ self.tracer,
883
+ "emit",
884
+ self.verbose,
885
+ "llm_query",
886
+ {
887
+ "prompt_tokens": llm_response.prompt_tokens,
888
+ "completion_tokens": llm_response.completion_tokens,
889
+ "model": llm_response.model_name,
890
+ "response": llm_response.content[:200], # Truncate for brevity
891
+ },
892
+ step_id=step_id,
893
+ )
894
+
895
+ if self.verbose:
896
+ print(f"🧠 LLM Decision: {llm_response.content}")
897
+
898
+ # Track token usage
899
+ self._track_tokens(goal, llm_response)
900
+
901
+ # Parse action from LLM response
902
+ action_str = self.llm_handler.extract_action(llm_response.content)
903
+
904
+ # 4. EXECUTE: Parse and run action
905
+ result_dict = await self.action_executor.execute_async(action_str, filtered_snap)
906
+
907
+ duration_ms = int((time.time() - start_time) * 1000)
908
+
909
+ # Create AgentActionResult from execution result
910
+ result = AgentActionResult(
911
+ success=result_dict["success"],
912
+ action=result_dict["action"],
913
+ goal=goal,
914
+ duration_ms=duration_ms,
915
+ attempt=attempt,
916
+ element_id=result_dict.get("element_id"),
917
+ text=result_dict.get("text"),
918
+ key=result_dict.get("key"),
919
+ outcome=result_dict.get("outcome"),
920
+ url_changed=result_dict.get("url_changed"),
921
+ error=result_dict.get("error"),
922
+ message=result_dict.get("message"),
923
+ )
924
+
925
+ # Emit action execution trace event if tracer is enabled
926
+ if self.tracer:
927
+ post_url = self.browser.page.url if self.browser.page else None
928
+
929
+ # Include element data for live overlay visualization
930
+ elements_data = [
931
+ {
932
+ "id": el.id,
933
+ "bbox": {
934
+ "x": el.bbox.x,
935
+ "y": el.bbox.y,
936
+ "width": el.bbox.width,
937
+ "height": el.bbox.height,
938
+ },
939
+ "role": el.role,
940
+ "text": el.text[:50] if el.text else "",
941
+ }
942
+ for el in filtered_snap.elements[:50]
943
+ ]
944
+
945
+ _safe_tracer_call(
946
+ self.tracer,
947
+ "emit",
948
+ self.verbose,
949
+ "action",
950
+ {
951
+ "action": result.action,
952
+ "element_id": result.element_id,
953
+ "success": result.success,
954
+ "outcome": result.outcome,
955
+ "duration_ms": duration_ms,
956
+ "post_url": post_url,
957
+ "elements": elements_data, # Add element data for overlay
958
+ "target_element_id": result.element_id, # Highlight target in red
959
+ },
960
+ step_id=step_id,
961
+ )
962
+
963
+ # 5. RECORD: Track history
964
+ self.history.append(
965
+ {
966
+ "goal": goal,
967
+ "action": action_str,
968
+ "result": result.model_dump(), # Store as dict
969
+ "success": result.success,
970
+ "attempt": attempt,
971
+ "duration_ms": duration_ms,
972
+ }
973
+ )
974
+
975
+ if self.verbose:
976
+ status = "✅" if result.success else "❌"
977
+ print(f"{status} Completed in {duration_ms}ms")
978
+
979
+ # Emit step completion trace event if tracer is enabled
980
+ if self.tracer:
981
+ # Get pre_url from step_start (stored in tracer or use current)
982
+ pre_url = snap.url
983
+ post_url = self.browser.page.url if self.browser.page else None
984
+
985
+ # Compute snapshot digest (simplified - use URL + timestamp)
986
+ snapshot_digest = f"sha256:{self._compute_hash(f'{pre_url}{snap.timestamp}')}"
987
+
988
+ # Build LLM data
989
+ llm_response_text = llm_response.content
990
+ llm_response_hash = f"sha256:{self._compute_hash(llm_response_text)}"
991
+ llm_data = {
992
+ "response_text": llm_response_text,
993
+ "response_hash": llm_response_hash,
994
+ "usage": {
995
+ "prompt_tokens": llm_response.prompt_tokens or 0,
996
+ "completion_tokens": llm_response.completion_tokens or 0,
997
+ "total_tokens": llm_response.total_tokens or 0,
998
+ },
999
+ }
1000
+
1001
+ # Build exec data
1002
+ exec_data = {
1003
+ "success": result.success,
1004
+ "action": result.action,
1005
+ "outcome": result.outcome
1006
+ or (
1007
+ f"Action {result.action} executed successfully"
1008
+ if result.success
1009
+ else f"Action {result.action} failed"
1010
+ ),
1011
+ "duration_ms": duration_ms,
1012
+ }
1013
+
1014
+ # Add optional exec fields
1015
+ if result.element_id is not None:
1016
+ exec_data["element_id"] = result.element_id
1017
+ # Add bounding box if element found
1018
+ bbox = self._get_element_bbox(result.element_id, snap)
1019
+ if bbox:
1020
+ exec_data["bounding_box"] = bbox
1021
+ if result.text is not None:
1022
+ exec_data["text"] = result.text
1023
+ if result.key is not None:
1024
+ exec_data["key"] = result.key
1025
+ if result.error is not None:
1026
+ exec_data["error"] = result.error
1027
+
1028
+ # Build verify data (simplified - based on success and url_changed)
1029
+ verify_passed = result.success and (
1030
+ result.url_changed or result.action != "click"
1031
+ )
1032
+ verify_signals = {
1033
+ "url_changed": result.url_changed or False,
1034
+ }
1035
+ if result.error:
1036
+ verify_signals["error"] = result.error
1037
+
1038
+ # Add elements_found array if element was targeted
1039
+ if result.element_id is not None:
1040
+ bbox = self._get_element_bbox(result.element_id, snap)
1041
+ if bbox:
1042
+ verify_signals["elements_found"] = [
1043
+ {
1044
+ "label": f"Element {result.element_id}",
1045
+ "bounding_box": bbox,
1046
+ }
1047
+ ]
1048
+
1049
+ verify_data = {
1050
+ "passed": verify_passed,
1051
+ "signals": verify_signals,
1052
+ }
1053
+
1054
+ # Build elements data for pre field (include diff_status from snap_with_diff)
1055
+ # Use the same format as build_snapshot_event for consistency
1056
+ snapshot_event_data = TraceEventBuilder.build_snapshot_event(snap_with_diff)
1057
+ pre_elements = snapshot_event_data.get("elements", [])
1058
+
1059
+ # Build complete step_end event
1060
+ step_end_data = TraceEventBuilder.build_step_end_event(
1061
+ step_id=step_id,
1062
+ step_index=self._step_count,
1063
+ goal=goal,
1064
+ attempt=attempt,
1065
+ pre_url=pre_url,
1066
+ post_url=post_url,
1067
+ snapshot_digest=snapshot_digest,
1068
+ llm_data=llm_data,
1069
+ exec_data=exec_data,
1070
+ verify_data=verify_data,
1071
+ pre_elements=pre_elements,
1072
+ )
1073
+
1074
+ _safe_tracer_call(
1075
+ self.tracer,
1076
+ "emit",
1077
+ self.verbose,
1078
+ "step_end",
1079
+ step_end_data,
1080
+ step_id=step_id,
1081
+ )
1082
+
1083
+ return result
1084
+
1085
+ except Exception as e:
1086
+ # Emit error trace event if tracer is enabled
1087
+ if self.tracer:
1088
+ _safe_tracer_call(
1089
+ self.tracer,
1090
+ "emit_error",
1091
+ self.verbose,
1092
+ step_id=step_id,
1093
+ error=str(e),
1094
+ attempt=attempt,
1095
+ )
1096
+
1097
+ if attempt < max_retries:
1098
+ if self.verbose:
1099
+ print(f"⚠️ Retry {attempt + 1}/{max_retries}: {e}")
1100
+ await asyncio.sleep(1.0) # Brief delay before retry
1101
+ continue
1102
+ else:
1103
+ # Create error result
1104
+ error_result = AgentActionResult(
1105
+ success=False,
1106
+ action="error",
1107
+ goal=goal,
1108
+ duration_ms=0,
1109
+ attempt=attempt,
1110
+ error=str(e),
1111
+ )
1112
+ self.history.append(
1113
+ {
1114
+ "goal": goal,
1115
+ "action": "error",
1116
+ "result": error_result.model_dump(),
1117
+ "success": False,
1118
+ "attempt": attempt,
1119
+ "duration_ms": 0,
1120
+ }
1121
+ )
1122
+ raise RuntimeError(f"Failed after {max_retries} retries: {e}")
1123
+
1124
def _track_tokens(self, goal: str, llm_response: LLMResponse):
    """
    Record the token usage reported by one LLM response (same as sync version).

    Each count that is present and non-zero is added to the matching running
    total in ``self._token_usage_raw``. A per-action entry is then appended to
    its ``"by_action"`` list, with missing counts stored as 0.

    Args:
        goal: Goal text the LLM call was made for; stored in the per-action entry
        llm_response: Response whose ``prompt_tokens``, ``completion_tokens``,
            ``total_tokens`` and ``model_name`` attributes are read
    """
    usage = self._token_usage_raw
    prompt_count = llm_response.prompt_tokens
    completion_count = llm_response.completion_tokens
    total_count = llm_response.total_tokens

    # Falsy counts (None or 0) leave the running totals untouched.
    running_totals = (
        ("total_prompt_tokens", prompt_count),
        ("total_completion_tokens", completion_count),
        ("total_tokens", total_count),
    )
    for total_key, amount in running_totals:
        if amount:
            usage[total_key] += amount

    action_entry = {
        "goal": goal,
        "prompt_tokens": prompt_count or 0,
        "completion_tokens": completion_count or 0,
        "total_tokens": total_count or 0,
        "model": llm_response.model_name,
    }
    usage["by_action"].append(action_entry)
1142
+
1143
def get_token_stats(self) -> TokenStats:
    """
    Return the accumulated token usage as a ``TokenStats`` object (same as sync version).

    Every raw dictionary in ``self._token_usage_raw["by_action"]`` is converted
    to an ``ActionTokenUsage``; the three running totals are passed through
    unchanged.

    Returns:
        ``TokenStats`` built from the current contents of ``self._token_usage_raw``
    """
    raw_usage = self._token_usage_raw
    per_action = [ActionTokenUsage(**entry) for entry in raw_usage["by_action"]]
    totals = {
        total_key: raw_usage[total_key]
        for total_key in (
            "total_prompt_tokens",
            "total_completion_tokens",
            "total_tokens",
        )
    }
    return TokenStats(**totals, by_action=per_action)
1152
+
1153
def get_history(self) -> list[ActionHistory]:
    """
    Return the execution history as ``ActionHistory`` objects (same as sync version).

    Each dictionary stored in ``self.history`` is expanded into the keyword
    arguments of one ``ActionHistory``; the stored dictionaries are not modified.
    """
    history_records = [ActionHistory(**record) for record in self.history]
    return history_records
1156
+
1157
def clear_history(self) -> None:
    """
    Forget all recorded steps and restart token accounting from zero
    (same as sync version).
    """
    # The history container is emptied in place, not replaced.
    self.history.clear()

    # The usage record, by contrast, is rebound to a brand-new dictionary.
    fresh_usage = dict.fromkeys(
        ("total_prompt_tokens", "total_completion_tokens", "total_tokens"),
        0,
    )
    fresh_usage["by_action"] = []
    self._token_usage_raw = fresh_usage
1166
+
1167
def filter_elements(self, snapshot: Snapshot, goal: str | None = None) -> list[Element]:
    """
    Select the elements of a snapshot that matter for the given goal.

    The work is delegated to ``ElementFilter.filter_by_goal``, which applies
    goal-based keyword matching to boost relevant elements and drop
    irrelevant ones. This agent's ``default_snapshot_limit`` is passed as the
    third argument.

    Args:
        snapshot: Current page snapshot whose elements are filtered
        goal: User's goal, forwarded to the filter (may be None)

    Returns:
        Filtered list of elements
    """
    apply_goal_filter = ElementFilter.filter_by_goal
    return apply_goal_filter(snapshot, goal, self.default_snapshot_limit)