sentienceapi 0.92.2__py3-none-any.whl → 0.98.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sentienceapi might be problematic. Click here for more details.

Files changed (64) hide show
  1. sentience/__init__.py +107 -2
  2. sentience/_extension_loader.py +156 -1
  3. sentience/action_executor.py +2 -0
  4. sentience/actions.py +354 -9
  5. sentience/agent.py +4 -0
  6. sentience/agent_runtime.py +840 -0
  7. sentience/asserts/__init__.py +70 -0
  8. sentience/asserts/expect.py +621 -0
  9. sentience/asserts/query.py +383 -0
  10. sentience/async_api.py +8 -1
  11. sentience/backends/__init__.py +137 -0
  12. sentience/backends/actions.py +372 -0
  13. sentience/backends/browser_use_adapter.py +241 -0
  14. sentience/backends/cdp_backend.py +393 -0
  15. sentience/backends/exceptions.py +211 -0
  16. sentience/backends/playwright_backend.py +194 -0
  17. sentience/backends/protocol.py +216 -0
  18. sentience/backends/sentience_context.py +469 -0
  19. sentience/backends/snapshot.py +483 -0
  20. sentience/browser.py +230 -74
  21. sentience/canonicalization.py +207 -0
  22. sentience/cloud_tracing.py +65 -24
  23. sentience/constants.py +6 -0
  24. sentience/cursor_policy.py +142 -0
  25. sentience/extension/content.js +35 -0
  26. sentience/extension/injected_api.js +310 -15
  27. sentience/extension/manifest.json +1 -1
  28. sentience/extension/pkg/sentience_core.d.ts +22 -22
  29. sentience/extension/pkg/sentience_core.js +192 -144
  30. sentience/extension/pkg/sentience_core_bg.wasm +0 -0
  31. sentience/extension/release.json +29 -29
  32. sentience/failure_artifacts.py +241 -0
  33. sentience/integrations/__init__.py +6 -0
  34. sentience/integrations/langchain/__init__.py +12 -0
  35. sentience/integrations/langchain/context.py +18 -0
  36. sentience/integrations/langchain/core.py +326 -0
  37. sentience/integrations/langchain/tools.py +180 -0
  38. sentience/integrations/models.py +46 -0
  39. sentience/integrations/pydanticai/__init__.py +15 -0
  40. sentience/integrations/pydanticai/deps.py +20 -0
  41. sentience/integrations/pydanticai/toolset.py +468 -0
  42. sentience/llm_provider.py +695 -18
  43. sentience/models.py +536 -3
  44. sentience/ordinal.py +280 -0
  45. sentience/query.py +66 -4
  46. sentience/schemas/trace_v1.json +27 -1
  47. sentience/snapshot.py +384 -93
  48. sentience/snapshot_diff.py +39 -54
  49. sentience/text_search.py +1 -0
  50. sentience/trace_event_builder.py +20 -1
  51. sentience/trace_indexing/indexer.py +3 -49
  52. sentience/tracer_factory.py +1 -3
  53. sentience/verification.py +618 -0
  54. sentience/visual_agent.py +3 -1
  55. {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/METADATA +198 -40
  56. sentienceapi-0.98.0.dist-info/RECORD +92 -0
  57. sentience/utils.py +0 -296
  58. sentienceapi-0.92.2.dist-info/RECORD +0 -65
  59. {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/WHEEL +0 -0
  60. {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/entry_points.txt +0 -0
  61. {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE +0 -0
  62. {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE-APACHE +0 -0
  63. {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE-MIT +0 -0
  64. {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,840 @@
1
+ """
2
+ Agent runtime for verification loop support.
3
+
4
+ This module provides a thin runtime wrapper that combines:
5
+ 1. Browser session management (via BrowserBackend protocol)
6
+ 2. Snapshot/query helpers
7
+ 3. Tracer for event emission
8
+ 4. Assertion/verification methods
9
+
10
+ The AgentRuntime is designed to be used in agent verification loops where
11
+ you need to repeatedly take snapshots, execute actions, and verify results.
12
+
13
+ Example usage with browser-use:
14
+ from browser_use import BrowserSession, BrowserProfile
15
+ from sentience import get_extension_dir
16
+ from sentience.backends import BrowserUseAdapter
17
+ from sentience.agent_runtime import AgentRuntime
18
+ from sentience.verification import url_matches, exists
19
+ from sentience.tracing import Tracer, JsonlTraceSink
20
+
21
+ # Setup browser-use with Sentience extension
22
+ profile = BrowserProfile(args=[f"--load-extension={get_extension_dir()}"])
23
+ session = BrowserSession(browser_profile=profile)
24
+ await session.start()
25
+
26
+ # Create adapter and backend
27
+ adapter = BrowserUseAdapter(session)
28
+ backend = await adapter.create_backend()
29
+
30
+ # Navigate using browser-use
31
+ page = await session.get_current_page()
32
+ await page.goto("https://example.com")
33
+
34
+ # Create runtime with backend
35
+ sink = JsonlTraceSink("trace.jsonl")
36
+ tracer = Tracer(run_id="test-run", sink=sink)
37
+ runtime = AgentRuntime(backend=backend, tracer=tracer)
38
+
39
+ # Take snapshot and run assertions
40
+ await runtime.snapshot()
41
+ runtime.assert_(url_matches(r"example\\.com"), label="on_homepage")
42
+ runtime.assert_(exists("role=button"), label="has_buttons")
43
+
44
+ # Check if task is done
45
+ if runtime.assert_done(exists("text~'Success'"), label="task_complete"):
46
+ print("Task completed!")
47
+
48
+ Example usage with AsyncSentienceBrowser (backward compatible):
49
+ from sentience import AsyncSentienceBrowser
50
+ from sentience.agent_runtime import AgentRuntime
51
+
52
+ async with AsyncSentienceBrowser() as browser:
53
+ page = await browser.new_page()
54
+ await page.goto("https://example.com")
55
+
56
+ runtime = await AgentRuntime.from_sentience_browser(
57
+ browser=browser,
58
+ page=page,
59
+ tracer=tracer,
60
+ )
61
+ await runtime.snapshot()
62
+ """
63
+
64
+ from __future__ import annotations
65
+
66
+ import asyncio
67
+ import difflib
68
+ import time
69
+ import uuid
70
+ from dataclasses import dataclass
71
+ from typing import TYPE_CHECKING, Any
72
+
73
+ from .failure_artifacts import FailureArtifactBuffer, FailureArtifactsOptions
74
+ from .models import Snapshot, SnapshotOptions
75
+ from .verification import AssertContext, AssertOutcome, Predicate
76
+
77
+ if TYPE_CHECKING:
78
+ from playwright.async_api import Page
79
+
80
+ from .backends.protocol import BrowserBackend
81
+ from .browser import AsyncSentienceBrowser
82
+ from .tracing import Tracer
83
+
84
+
85
+ class AgentRuntime:
86
+ """
87
+ Runtime wrapper for agent verification loops.
88
+
89
+ Provides ergonomic methods for:
90
+ - snapshot(): Take page snapshot
91
+ - assert_(): Evaluate assertion predicates
92
+ - assert_done(): Assert task completion (required assertion)
93
+
94
+ The runtime manages assertion state per step and emits verification events
95
+ to the tracer for Studio timeline display.
96
+
97
+ Attributes:
98
+ backend: BrowserBackend instance for browser operations
99
+ tracer: Tracer for event emission
100
+ step_id: Current step identifier
101
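+ step_index: Current step index (starts at 0; begin_step() increments it unless an explicit index is given, so the first auto-numbered step is 1)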
102
+ last_snapshot: Most recent snapshot (for assertion context)
103
+ """
104
+
105
def __init__(
    self,
    backend: BrowserBackend,
    tracer: Tracer,
    snapshot_options: SnapshotOptions | None = None,
    sentience_api_key: str | None = None,
):
    """
    Initialize agent runtime with any BrowserBackend-compatible browser.

    Args:
        backend: Browser backend used for URL lookups, screenshots and snapshots.
        tracer: Tracer that receives the verification events.
        snapshot_options: Default options for snapshots. The caller's object is
            not modified: when an API key has to be injected, a copy is used.
        sentience_api_key: Optional API key. When given, it is stored on the
            default snapshot options and `use_api` is switched on unless the
            options already set it explicitly.
    """
    self.backend = backend
    self.tracer = tracer

    # Build default snapshot options with API key if provided
    default_opts = snapshot_options or SnapshotOptions()
    if sentience_api_key:
        if default_opts is snapshot_options:
            # Copy first so the key does not leak into an options object the
            # caller may reuse for another runtime.
            default_opts = snapshot_options.model_copy()
        default_opts.sentience_api_key = sentience_api_key
        if default_opts.use_api is None:
            default_opts.use_api = True
    self._snapshot_options = default_opts

    # Step tracking
    self.step_id: str | None = None
    self.step_index: int = 0

    # Snapshot state
    self.last_snapshot: Snapshot | None = None

    # Failure artifacts (Phase 1)
    self._artifact_buffer: FailureArtifactBuffer | None = None
    self._artifact_timer_task: asyncio.Task | None = None

    # Cached URL (updated by get_url())
    self._cached_url: str | None = None

    # Assertions accumulated during current step
    self._assertions_this_step: list[dict[str, Any]] = []

    # Task completion tracking
    self._task_done: bool = False
    self._task_done_label: str | None = None
155
+
156
@classmethod
async def from_sentience_browser(
    cls,
    browser: AsyncSentienceBrowser,
    page: Page,
    tracer: Tracer,
    snapshot_options: SnapshotOptions | None = None,
    sentience_api_key: str | None = None,
) -> AgentRuntime:
    """
    Build an AgentRuntime around an AsyncSentienceBrowser and one of its pages.

    The page is wrapped in a PlaywrightBackend, and both the browser and the
    page are stored on the runtime so that snapshot() can delegate to the
    browser's own snapshot method.

    Args:
        browser: AsyncSentienceBrowser instance
        page: Playwright Page for browser interaction
        tracer: Tracer for emitting verification events
        snapshot_options: Default options for snapshots
        sentience_api_key: API key for Pro/Enterprise tier

    Returns:
        The configured AgentRuntime instance
    """
    # Imported lazily, as in the rest of this module's backend helpers.
    from .backends.playwright_backend import PlaywrightBackend

    runtime = cls(
        backend=PlaywrightBackend(page),
        tracer=tracer,
        snapshot_options=snapshot_options,
        sentience_api_key=sentience_api_key,
    )
    # snapshot() detects these two attributes to pick the legacy path.
    runtime._legacy_browser, runtime._legacy_page = browser, page
    return runtime
194
+
195
def _ctx(self) -> AssertContext:
    """
    Assemble the context object handed to assertion predicates.

    The URL comes from the last snapshot when one exists; otherwise the
    cached URL is used if it is non-empty, else None.

    Returns:
        AssertContext carrying the last snapshot, the URL and the step id
    """
    snap = self.last_snapshot
    if snap is not None:
        current_url = snap.url
    else:
        current_url = self._cached_url or None

    return AssertContext(snapshot=snap, url=current_url, step_id=self.step_id)
213
+
214
async def get_url(self) -> str:
    """
    Ask the backend for the current page URL and remember it.

    Returns:
        Current page URL (also stored in the runtime's URL cache)
    """
    current = await self.backend.get_url()
    self._cached_url = current
    return current
224
+
225
async def snapshot(self, **kwargs: Any) -> Snapshot:
    """
    Capture the current page state and store it as last_snapshot.

    Assertions evaluated afterwards use the stored snapshot as context.

    Args:
        **kwargs: Per-call snapshot options. On the backend path they
            override the runtime's default options; on the legacy browser
            path they are forwarded unchanged to the browser's snapshot().

    Returns:
        Snapshot of current page state
    """
    # Runtimes built by from_sentience_browser() carry both legacy attributes.
    is_legacy = all(hasattr(self, attr) for attr in ("_legacy_browser", "_legacy_page"))
    if is_legacy:
        self.last_snapshot = await self._legacy_browser.snapshot(self._legacy_page, **kwargs)
        return self.last_snapshot

    # Backend-agnostic path
    from .backends.snapshot import snapshot as backend_snapshot

    # Defaults first, call-specific overrides second.
    merged = {**self._snapshot_options.model_dump(exclude_none=True), **kwargs}
    self.last_snapshot = await backend_snapshot(self.backend, options=SnapshotOptions(**merged))
    return self.last_snapshot
257
+
258
async def enable_failure_artifacts(
    self,
    options: FailureArtifactsOptions | None = None,
) -> None:
    """
    Enable failure artifact buffer (Phase 1).

    Creates a fresh FailureArtifactBuffer for the tracer's run id and, when
    the options request a positive fps, starts a background task that
    captures frames periodically. Calling this again replaces the buffer and
    restarts the capture task.

    Args:
        options: Artifact options; defaults to FailureArtifactsOptions().
    """
    # A capture loop left over from an earlier call would otherwise keep
    # running with no reference left to cancel it.
    if self._artifact_timer_task:
        self._artifact_timer_task.cancel()
        self._artifact_timer_task = None

    opts = options or FailureArtifactsOptions()
    self._artifact_buffer = FailureArtifactBuffer(
        run_id=self.tracer.run_id,
        options=opts,
    )
    if opts.fps > 0:
        self._artifact_timer_task = asyncio.create_task(self._artifact_timer_loop())
272
+
273
def disable_failure_artifacts(self) -> None:
    """
    Stop the background frame-capture task, if one is running.

    Only the timer task is cancelled and forgotten here; the artifact buffer
    attribute itself is left untouched.
    """
    timer = self._artifact_timer_task
    if not timer:
        return
    timer.cancel()
    self._artifact_timer_task = None
280
+
281
async def record_action(
    self,
    action: str,
    *,
    url: str | None = None,
) -> None:
    """
    Add an action to the artifact timeline, optionally capturing a frame.

    Does nothing when no artifact buffer is enabled.

    Args:
        action: Action description passed to the buffer's record_step()
        url: Optional URL stored with the step
    """
    buffer = self._artifact_buffer
    if not buffer:
        return

    step_info = {
        "action": action,
        "step_id": self.step_id,
        "step_index": self.step_index,
        "url": url,
    }
    buffer.record_step(**step_info)

    # A frame is captured only when the buffer options ask for it.
    if self._artifact_buffer.options.capture_on_action:
        await self._capture_artifact_frame()
300
+
301
async def _capture_artifact_frame(self) -> None:
    """
    Take a PNG screenshot via the backend and append it to the artifact buffer.

    Best effort: without a buffer nothing happens, and a screenshot that
    raises is skipped silently.
    """
    if not self._artifact_buffer:
        return

    try:
        png = await self.backend.screenshot_png()
    except Exception:
        # Deliberately ignored so a failed capture does not propagate.
        return

    # The buffer attribute is re-read after the await, matching the
    # sequence of the capture loop.
    self._artifact_buffer.add_frame(png, fmt="png")
309
+
310
async def _artifact_timer_loop(self) -> None:
    """
    Capture frames forever at the buffer's configured fps until cancelled.

    Returns immediately when no buffer is enabled. Cancellation is absorbed,
    so the coroutine finishes normally.
    """
    buffer = self._artifact_buffer
    if not buffer:
        return

    # fps is clamped to at least 0.001 to avoid dividing by zero.
    delay = 1.0 / max(0.001, buffer.options.fps)
    try:
        while True:
            await self._capture_artifact_frame()
            await asyncio.sleep(delay)
    except asyncio.CancelledError:
        return
320
+
321
def finalize_run(self, *, success: bool) -> None:
    """
    Close out the artifact buffer when a run ends.

    On failure the buffer is persisted through the failure path. On success
    it is persisted only when persist_mode is "always", and then cleaned up
    either way. Does nothing when no buffer is enabled.

    Args:
        success: Whether the run succeeded
    """
    buffer = self._artifact_buffer
    if not buffer:
        return

    if not success:
        self._persist_failure_artifacts(reason="finalize_failure")
        return

    if buffer.options.persist_mode == "always":
        snap = self.last_snapshot
        buffer.persist(
            reason="success",
            status="success",
            snapshot=snap,
            diagnostics=getattr(snap, "diagnostics", None),
            metadata=self._artifact_metadata(),
        )
    buffer.cleanup()
339
+
340
def _persist_failure_artifacts(self, *, reason: str) -> None:
    """
    Persist the artifact buffer with status "failure", then clean it up.

    When the buffer's persist_mode is "onFail", background capture is
    stopped afterwards. Does nothing when no buffer is enabled.

    Args:
        reason: Reason string forwarded to the buffer's persist()
    """
    buffer = self._artifact_buffer
    if not buffer:
        return

    snap = self.last_snapshot
    buffer.persist(
        reason=reason,
        status="failure",
        snapshot=snap,
        diagnostics=getattr(snap, "diagnostics", None),
        metadata=self._artifact_metadata(),
    )
    buffer.cleanup()

    if buffer.options.persist_mode == "onFail":
        self.disable_failure_artifacts()
353
+
354
def _artifact_metadata(self) -> dict[str, Any]:
    """
    Describe the run for persisted artifacts.

    Returns:
        Dict with the backend's class name under "backend" and, under "url",
        the last snapshot's URL, or the non-empty cached URL when no snapshot
        exists, or None.
    """
    snap = self.last_snapshot
    if snap is not None:
        page_url = snap.url
    else:
        page_url = self._cached_url or None

    return {"backend": self.backend.__class__.__name__, "url": page_url}
364
+
365
def begin_step(self, goal: str, step_index: int | None = None) -> str:
    """
    Start a new verification step.

    Drops the assertions collected for the previous step, assigns a fresh
    UUID4 step id, and sets the step index: the explicit value when one is
    given, otherwise the previous index plus one.

    Args:
        goal: Description of what this step aims to achieve (not used by
            this method itself)
        step_index: Optional explicit step index

    Returns:
        The newly generated step_id
    """
    self._assertions_this_step = []

    new_id = str(uuid.uuid4())
    self.step_id = new_id

    if step_index is None:
        self.step_index += 1
    else:
        self.step_index = step_index

    return self.step_id
394
+
395
def assert_(
    self,
    predicate: Predicate,
    label: str,
    required: bool = False,
) -> bool:
    """
    Run a predicate against the current assertion context and record the result.

    The outcome is accumulated for the current step and emitted as a
    'verification' event. A failed required assertion additionally triggers
    failure-artifact persistence.

    Args:
        predicate: Predicate function to evaluate
        label: Human-readable label for this assertion
        required: If True, a failure persists failure artifacts (default: False)

    Returns:
        The outcome's `passed` value
    """
    result = predicate(self._ctx())

    self._record_outcome(
        outcome=result,
        label=label,
        required=required,
        kind="assert",
        record_in_step=True,
    )

    failed_required = required and not result.passed
    if failed_required:
        self._persist_failure_artifacts(reason=f"assert_failed:{label}")

    return result.passed
427
+
428
def check(self, predicate: Predicate, label: str, required: bool = False) -> AssertionHandle:
    """
    Wrap a predicate in an AssertionHandle for `.once()` / `.eventually()` use.

    Nothing is evaluated here; the handle only stores this runtime, the
    predicate, the label and the required flag.
    """
    handle = AssertionHandle(
        runtime=self,
        predicate=predicate,
        label=label,
        required=required,
    )
    return handle
436
+
437
def assert_done(
    self,
    predicate: Predicate,
    label: str,
) -> bool:
    """
    Assert task completion (required assertion).

    Evaluates the predicate through assert_() with required=True. When it
    passes, the task is marked as done, the label is remembered, and a
    'verification' event of kind "task_done" is emitted.

    Args:
        predicate: Predicate function to evaluate
        label: Human-readable label for this assertion

    Returns:
        True if task is complete (assertion passed), False otherwise
    """
    # Delegate to assert_(); the runtime has no assertTrue() method, so the
    # previous call raised AttributeError before anything was evaluated.
    ok = self.assert_(predicate, label=label, required=True)
    if ok:
        self._task_done = True
        self._task_done_label = label

        # Emit task_done verification event
        self.tracer.emit(
            "verification",
            data={
                "kind": "task_done",
                "passed": True,
                "label": label,
            },
            step_id=self.step_id,
        )

    return ok
475
+
476
def _record_outcome(
    self,
    *,
    outcome: Any,
    label: str,
    required: bool,
    kind: str,
    record_in_step: bool,
    extra: dict[str, Any] | None = None,
) -> None:
    """
    Emit a 'verification' event for an outcome and optionally keep it for step_end.

    Args:
        outcome: Object exposing `passed`, `reason` and `details`
        label: Assertion label
        required: Whether the assertion is required
        kind: Value stored under "kind" in the emitted event
        record_in_step: If True, the record is appended to this step's assertions
        extra: Additional keys merged into the record (they may override
            the standard ones)
    """
    passed = bool(outcome.passed)
    details = dict(outcome.details or {})

    # Failed selector-driven assertions get nearest-match suggestions,
    # unless the details already carry some.
    needs_hints = not outcome.passed and self.last_snapshot is not None and "selector" in details
    if needs_hints:
        selector = str(details.get("selector") or "")
        details.setdefault("nearest_matches", self._nearest_matches(selector, limit=3))

    record = {
        "label": label,
        "passed": passed,
        "required": required,
        "reason": str(outcome.reason or ""),
        "details": details,
        **(extra or {}),
    }

    if record_in_step:
        self._assertions_this_step.append(record)

    event_data = {"kind": kind, "passed": passed, **record}
    self.tracer.emit("verification", data=event_data, step_id=self.step_id)
518
+
519
def _nearest_matches(self, selector: str, *, limit: int = 3) -> list[dict[str, Any]]:
    """
    Suggest the snapshot elements whose name/text is most similar to a selector.

    Similarity is the difflib SequenceMatcher ratio between the lower-cased,
    stripped selector and the element's lower-cased name (or its text when
    the name is empty). Elements with neither are skipped.

    Args:
        selector: Selector string of the failed assertion
        limit: Maximum number of suggestions

    Returns:
        Up to `limit` dicts with id, role, text, name (both cut to 80
        characters) and score, best match first; empty when there is no
        snapshot or the selector is blank.
    """
    snap = self.last_snapshot
    if snap is None:
        return []

    needle = selector.lower().strip()
    if not needle:
        return []

    candidates: list[tuple[float, Any]] = []
    for element in snap.elements:
        caption = (getattr(element, "name", None) or getattr(element, "text", None) or "").strip()
        if caption:
            ratio = difflib.SequenceMatcher(None, needle, caption.lower()).ratio()
            candidates.append((ratio, element))

    # Stable sort: equally scored elements keep their snapshot order.
    ranked = sorted(candidates, key=lambda pair: pair[0], reverse=True)

    return [
        {
            "id": getattr(element, "id", None),
            "role": getattr(element, "role", None),
            "text": (getattr(element, "text", "") or "")[:80],
            "name": (getattr(element, "name", "") or "")[:80],
            "score": round(float(ratio), 4),
        }
        for ratio, element in ranked[:limit]
    ]
551
+
552
def get_assertions_for_step_end(self) -> dict[str, Any]:
    """
    Build the assertions payload for step_end.data.verify.signals.

    Returns:
        Dict with a copy of this step's assertions under 'assertions'; the
        'task_done' and 'task_done_label' keys are added only once the task
        has been marked done.
    """
    payload: dict[str, Any] = {"assertions": list(self._assertions_this_step)}

    if self._task_done:
        payload.update(task_done=True, task_done_label=self._task_done_label)

    return payload
568
+
569
def flush_assertions(self) -> list[dict[str, Any]]:
    """
    Return a copy of the current step's assertions and reset the step's list.
    """
    drained = self._assertions_this_step.copy()
    self._assertions_this_step = []
    return drained
576
+
577
@property
def is_task_done(self) -> bool:
    """Whether a passing assert_done() has marked the task as done."""
    done = self._task_done
    return done
581
+
582
def reset_task_done(self) -> None:
    """Clear the task-done flag and its label (for multi-task runs)."""
    self._task_done, self._task_done_label = False, None
586
+
587
def all_assertions_passed(self) -> bool:
    """Return True when no assertion recorded for the current step failed (vacuously True if none)."""
    return not any(not record["passed"] for record in self._assertions_this_step)
590
+
591
def required_assertions_passed(self) -> bool:
    """Return True when every required assertion of the current step passed (vacuously True if none)."""
    return all(
        record["passed"]
        for record in self._assertions_this_step
        if record.get("required")
    )
595
+
596
+
597
+ @dataclass
598
+ class AssertionHandle:
599
+ runtime: AgentRuntime
600
+ predicate: Predicate
601
+ label: str
602
+ required: bool = False
603
+
604
def once(self) -> bool:
    """Evaluate the predicate a single time by delegating to runtime.assert_()."""
    runtime = self.runtime
    return runtime.assert_(self.predicate, label=self.label, required=self.required)
607
+
608
async def eventually(
    self,
    *,
    timeout_s: float = 10.0,
    poll_s: float = 0.25,
    min_confidence: float | None = None,
    max_snapshot_attempts: int = 3,
    snapshot_kwargs: dict[str, Any] | None = None,
    vision_provider: Any | None = None,
    vision_system_prompt: str | None = None,
    vision_user_prompt: str | None = None,
) -> bool:
    """
    Retry until the predicate passes or timeout is reached.

    Each attempt takes a fresh snapshot through the runtime and then either
    evaluates the predicate or, when the snapshot's confidence is below
    `min_confidence`, records a low-confidence outcome instead.

    Intermediate attempts emit verification events but do NOT accumulate in step_end assertions.
    Final result is accumulated once.

    Args:
        timeout_s: Seconds from the start of the call until the deadline.
            At least one attempt is always made before the deadline is checked.
        poll_s: Seconds slept between attempts.
        min_confidence: If set, a snapshot whose diagnostics carry a numeric
            `confidence` below this value skips predicate evaluation.
            Snapshots without diagnostics/confidence are never gated.
        max_snapshot_attempts: Once this many snapshots have been taken and
            the current one is low-confidence, the loop stops (vision
            fallback if available, otherwise a "snapshot_exhausted" failure).
        snapshot_kwargs: Keyword arguments forwarded to runtime.snapshot().
        vision_provider: Optional object; used only at exhaustion and only if
            its `supports_vision()` returns a truthy value. Its
            `generate_with_image(...)` is called with the prompts and a
            base64 PNG screenshot.
        vision_system_prompt: Overrides the default system prompt.
        vision_user_prompt: Overrides the default user prompt (which embeds
            the assertion label).

    Returns:
        True if the predicate passed, or if the vision fallback's answer
        starts with "yes"; False on timeout, exhaustion or a negative
        vision answer.
    """
    deadline = time.monotonic() + timeout_s
    attempt = 0
    # Counts every snapshot taken, not only the low-confidence ones.
    snapshot_attempt = 0
    last_outcome = None

    while True:
        attempt += 1
        await self.runtime.snapshot(**(snapshot_kwargs or {}))
        snapshot_attempt += 1

        # Optional: gate predicate evaluation on snapshot confidence.
        # If diagnostics are missing, we don't block (backward compatible).
        confidence = None
        diagnostics = None
        if self.runtime.last_snapshot is not None:
            diagnostics = getattr(self.runtime.last_snapshot, "diagnostics", None)
            if diagnostics is not None:
                confidence = getattr(diagnostics, "confidence", None)

        if (
            min_confidence is not None
            and confidence is not None
            and isinstance(confidence, (int, float))
            and confidence < min_confidence
        ):
            # Low-confidence snapshot: the predicate is not evaluated on this attempt.
            last_outcome = AssertOutcome(
                passed=False,
                reason=f"Snapshot confidence {confidence:.3f} < min_confidence {min_confidence:.3f}",
                details={
                    "reason_code": "snapshot_low_confidence",
                    "confidence": confidence,
                    "min_confidence": min_confidence,
                    "snapshot_attempt": snapshot_attempt,
                    "diagnostics": (
                        diagnostics.model_dump()
                        if hasattr(diagnostics, "model_dump")
                        else diagnostics
                    ),
                },
            )

            # Emit attempt event (not recorded in step_end)
            self.runtime._record_outcome(
                outcome=last_outcome,
                label=self.label,
                required=self.required,
                kind="assert",
                record_in_step=False,
                extra={
                    "eventually": True,
                    "attempt": attempt,
                    "snapshot_attempt": snapshot_attempt,
                },
            )

            if snapshot_attempt >= max_snapshot_attempts:
                # Optional: vision fallback as last resort (Phase 2-lite).
                # This keeps the assertion surface invariant; only the perception layer changes.
                if (
                    vision_provider is not None
                    and getattr(vision_provider, "supports_vision", lambda: False)()
                ):
                    try:
                        import base64

                        png_bytes = await self.runtime.backend.screenshot_png()
                        image_b64 = base64.b64encode(png_bytes).decode("utf-8")

                        sys_prompt = vision_system_prompt or (
                            "You are a strict visual verifier. Answer only YES or NO."
                        )
                        user_prompt = vision_user_prompt or (
                            f"Given the screenshot, is the following condition satisfied?\n\n{self.label}\n\nAnswer YES or NO."
                        )

                        # NOTE(review): called without await — presumably a
                        # synchronous provider API; confirm against the provider.
                        resp = vision_provider.generate_with_image(
                            sys_prompt,
                            user_prompt,
                            image_base64=image_b64,
                            temperature=0.0,
                        )
                        # Anything that does not start with "yes" counts as a failure.
                        text = (resp.content or "").strip().lower()
                        passed = text.startswith("yes")

                        final_outcome = AssertOutcome(
                            passed=passed,
                            reason="vision_fallback_yes" if passed else "vision_fallback_no",
                            details={
                                "reason_code": (
                                    "vision_fallback_pass" if passed else "vision_fallback_fail"
                                ),
                                "vision_response": resp.content,
                                "min_confidence": min_confidence,
                                "snapshot_attempts": snapshot_attempt,
                            },
                        )
                        self.runtime._record_outcome(
                            outcome=final_outcome,
                            label=self.label,
                            required=self.required,
                            kind="assert",
                            record_in_step=True,
                            extra={
                                "eventually": True,
                                "attempt": attempt,
                                "snapshot_attempt": snapshot_attempt,
                                "final": True,
                                "vision_fallback": True,
                            },
                        )
                        if self.required and not passed:
                            self.runtime._persist_failure_artifacts(
                                reason=f"assert_eventually_failed:{self.label}"
                            )
                        return passed
                    except Exception as e:
                        # If vision fallback fails, fall through to snapshot_exhausted.
                        # NOTE(review): the error is stored on last_outcome, which was
                        # already emitted above; the final outcome below copies only
                        # "diagnostics", so "vision_error" does not reach any event.
                        last_outcome.details["vision_error"] = str(e)

                final_outcome = AssertOutcome(
                    passed=False,
                    reason=f"Snapshot exhausted after {snapshot_attempt} attempt(s) below min_confidence {min_confidence:.3f}",
                    details={
                        "reason_code": "snapshot_exhausted",
                        "confidence": confidence,
                        "min_confidence": min_confidence,
                        "snapshot_attempts": snapshot_attempt,
                        "diagnostics": last_outcome.details.get("diagnostics"),
                    },
                )
                self.runtime._record_outcome(
                    outcome=final_outcome,
                    label=self.label,
                    required=self.required,
                    kind="assert",
                    record_in_step=True,
                    extra={
                        "eventually": True,
                        "attempt": attempt,
                        "snapshot_attempt": snapshot_attempt,
                        "final": True,
                        "exhausted": True,
                    },
                )
                if self.required:
                    self.runtime._persist_failure_artifacts(
                        reason=f"assert_eventually_failed:{self.label}"
                    )
                return False

            if time.monotonic() >= deadline:
                # Timed out while still below min_confidence: record the
                # low-confidence outcome as the final result.
                self.runtime._record_outcome(
                    outcome=last_outcome,
                    label=self.label,
                    required=self.required,
                    kind="assert",
                    record_in_step=True,
                    extra={
                        "eventually": True,
                        "attempt": attempt,
                        "snapshot_attempt": snapshot_attempt,
                        "final": True,
                        "timeout": True,
                    },
                )
                if self.required:
                    self.runtime._persist_failure_artifacts(
                        reason=f"assert_eventually_timeout:{self.label}"
                    )
                return False

            await asyncio.sleep(poll_s)
            continue

        # Snapshot accepted: evaluate the predicate against it.
        last_outcome = self.predicate(self.runtime._ctx())

        # Emit attempt event (not recorded in step_end)
        self.runtime._record_outcome(
            outcome=last_outcome,
            label=self.label,
            required=self.required,
            kind="assert",
            record_in_step=False,
            extra={"eventually": True, "attempt": attempt},
        )

        if last_outcome.passed:
            # Record final success once
            self.runtime._record_outcome(
                outcome=last_outcome,
                label=self.label,
                required=self.required,
                kind="assert",
                record_in_step=True,
                extra={"eventually": True, "attempt": attempt, "final": True},
            )
            return True

        if time.monotonic() >= deadline:
            # Record final failure once
            self.runtime._record_outcome(
                outcome=last_outcome,
                label=self.label,
                required=self.required,
                kind="assert",
                record_in_step=True,
                extra={"eventually": True, "attempt": attempt, "final": True, "timeout": True},
            )
            if self.required:
                self.runtime._persist_failure_artifacts(
                    reason=f"assert_eventually_timeout:{self.label}"
                )
            return False

        await asyncio.sleep(poll_s)