sentienceapi 0.92.2__py3-none-any.whl → 0.98.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sentienceapi might be problematic. Click here for more details.
- sentience/__init__.py +107 -2
- sentience/_extension_loader.py +156 -1
- sentience/action_executor.py +2 -0
- sentience/actions.py +354 -9
- sentience/agent.py +4 -0
- sentience/agent_runtime.py +840 -0
- sentience/asserts/__init__.py +70 -0
- sentience/asserts/expect.py +621 -0
- sentience/asserts/query.py +383 -0
- sentience/async_api.py +8 -1
- sentience/backends/__init__.py +137 -0
- sentience/backends/actions.py +372 -0
- sentience/backends/browser_use_adapter.py +241 -0
- sentience/backends/cdp_backend.py +393 -0
- sentience/backends/exceptions.py +211 -0
- sentience/backends/playwright_backend.py +194 -0
- sentience/backends/protocol.py +216 -0
- sentience/backends/sentience_context.py +469 -0
- sentience/backends/snapshot.py +483 -0
- sentience/browser.py +230 -74
- sentience/canonicalization.py +207 -0
- sentience/cloud_tracing.py +65 -24
- sentience/constants.py +6 -0
- sentience/cursor_policy.py +142 -0
- sentience/extension/content.js +35 -0
- sentience/extension/injected_api.js +310 -15
- sentience/extension/manifest.json +1 -1
- sentience/extension/pkg/sentience_core.d.ts +22 -22
- sentience/extension/pkg/sentience_core.js +192 -144
- sentience/extension/pkg/sentience_core_bg.wasm +0 -0
- sentience/extension/release.json +29 -29
- sentience/failure_artifacts.py +241 -0
- sentience/integrations/__init__.py +6 -0
- sentience/integrations/langchain/__init__.py +12 -0
- sentience/integrations/langchain/context.py +18 -0
- sentience/integrations/langchain/core.py +326 -0
- sentience/integrations/langchain/tools.py +180 -0
- sentience/integrations/models.py +46 -0
- sentience/integrations/pydanticai/__init__.py +15 -0
- sentience/integrations/pydanticai/deps.py +20 -0
- sentience/integrations/pydanticai/toolset.py +468 -0
- sentience/llm_provider.py +695 -18
- sentience/models.py +536 -3
- sentience/ordinal.py +280 -0
- sentience/query.py +66 -4
- sentience/schemas/trace_v1.json +27 -1
- sentience/snapshot.py +384 -93
- sentience/snapshot_diff.py +39 -54
- sentience/text_search.py +1 -0
- sentience/trace_event_builder.py +20 -1
- sentience/trace_indexing/indexer.py +3 -49
- sentience/tracer_factory.py +1 -3
- sentience/verification.py +618 -0
- sentience/visual_agent.py +3 -1
- {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/METADATA +198 -40
- sentienceapi-0.98.0.dist-info/RECORD +92 -0
- sentience/utils.py +0 -296
- sentienceapi-0.92.2.dist-info/RECORD +0 -65
- {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/WHEEL +0 -0
- {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/entry_points.txt +0 -0
- {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE +0 -0
- {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE-APACHE +0 -0
- {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE-MIT +0 -0
- {sentienceapi-0.92.2.dist-info → sentienceapi-0.98.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,840 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Agent runtime for verification loop support.
|
|
3
|
+
|
|
4
|
+
This module provides a thin runtime wrapper that combines:
|
|
5
|
+
1. Browser session management (via BrowserBackend protocol)
|
|
6
|
+
2. Snapshot/query helpers
|
|
7
|
+
3. Tracer for event emission
|
|
8
|
+
4. Assertion/verification methods
|
|
9
|
+
|
|
10
|
+
The AgentRuntime is designed to be used in agent verification loops where
|
|
11
|
+
you need to repeatedly take snapshots, execute actions, and verify results.
|
|
12
|
+
|
|
13
|
+
Example usage with browser-use:
|
|
14
|
+
from browser_use import BrowserSession, BrowserProfile
|
|
15
|
+
from sentience import get_extension_dir
|
|
16
|
+
from sentience.backends import BrowserUseAdapter
|
|
17
|
+
from sentience.agent_runtime import AgentRuntime
|
|
18
|
+
from sentience.verification import url_matches, exists
|
|
19
|
+
from sentience.tracing import Tracer, JsonlTraceSink
|
|
20
|
+
|
|
21
|
+
# Setup browser-use with Sentience extension
|
|
22
|
+
profile = BrowserProfile(args=[f"--load-extension={get_extension_dir()}"])
|
|
23
|
+
session = BrowserSession(browser_profile=profile)
|
|
24
|
+
await session.start()
|
|
25
|
+
|
|
26
|
+
# Create adapter and backend
|
|
27
|
+
adapter = BrowserUseAdapter(session)
|
|
28
|
+
backend = await adapter.create_backend()
|
|
29
|
+
|
|
30
|
+
# Navigate using browser-use
|
|
31
|
+
page = await session.get_current_page()
|
|
32
|
+
await page.goto("https://example.com")
|
|
33
|
+
|
|
34
|
+
# Create runtime with backend
|
|
35
|
+
sink = JsonlTraceSink("trace.jsonl")
|
|
36
|
+
tracer = Tracer(run_id="test-run", sink=sink)
|
|
37
|
+
runtime = AgentRuntime(backend=backend, tracer=tracer)
|
|
38
|
+
|
|
39
|
+
# Take snapshot and run assertions
|
|
40
|
+
await runtime.snapshot()
|
|
41
|
+
runtime.assert_(url_matches(r"example\\.com"), label="on_homepage")
|
|
42
|
+
runtime.assert_(exists("role=button"), label="has_buttons")
|
|
43
|
+
|
|
44
|
+
# Check if task is done
|
|
45
|
+
if runtime.assert_done(exists("text~'Success'"), label="task_complete"):
|
|
46
|
+
print("Task completed!")
|
|
47
|
+
|
|
48
|
+
Example usage with AsyncSentienceBrowser (backward compatible):
|
|
49
|
+
from sentience import AsyncSentienceBrowser
|
|
50
|
+
from sentience.agent_runtime import AgentRuntime
|
|
51
|
+
|
|
52
|
+
async with AsyncSentienceBrowser() as browser:
|
|
53
|
+
page = await browser.new_page()
|
|
54
|
+
await page.goto("https://example.com")
|
|
55
|
+
|
|
56
|
+
runtime = await AgentRuntime.from_sentience_browser(
|
|
57
|
+
browser=browser,
|
|
58
|
+
page=page,
|
|
59
|
+
tracer=tracer,
|
|
60
|
+
)
|
|
61
|
+
await runtime.snapshot()
|
|
62
|
+
"""
|
|
63
|
+
|
|
64
|
+
from __future__ import annotations
|
|
65
|
+
|
|
66
|
+
import asyncio
|
|
67
|
+
import difflib
|
|
68
|
+
import time
|
|
69
|
+
import uuid
|
|
70
|
+
from dataclasses import dataclass
|
|
71
|
+
from typing import TYPE_CHECKING, Any
|
|
72
|
+
|
|
73
|
+
from .failure_artifacts import FailureArtifactBuffer, FailureArtifactsOptions
|
|
74
|
+
from .models import Snapshot, SnapshotOptions
|
|
75
|
+
from .verification import AssertContext, AssertOutcome, Predicate
|
|
76
|
+
|
|
77
|
+
if TYPE_CHECKING:
|
|
78
|
+
from playwright.async_api import Page
|
|
79
|
+
|
|
80
|
+
from .backends.protocol import BrowserBackend
|
|
81
|
+
from .browser import AsyncSentienceBrowser
|
|
82
|
+
from .tracing import Tracer
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class AgentRuntime:
    """
    Runtime wrapper for agent verification loops.

    Provides ergonomic methods for:
    - snapshot(): Take page snapshot
    - assert_(): Evaluate assertion predicates
    - assert_done(): Assert task completion (required assertion)

    The runtime manages assertion state per step and emits verification events
    to the tracer for Studio timeline display.

    Attributes:
        backend: BrowserBackend instance for browser operations
        tracer: Tracer for event emission
        step_id: Current step identifier (None until begin_step() is called)
        step_index: Current step index (starts at 0; begin_step() increments it
            unless an explicit index is supplied)
        last_snapshot: Most recent snapshot (for assertion context)
    """

    def __init__(
        self,
        backend: BrowserBackend,
        tracer: Tracer,
        snapshot_options: SnapshotOptions | None = None,
        sentience_api_key: str | None = None,
    ):
        """
        Initialize agent runtime with any BrowserBackend-compatible browser.

        Args:
            backend: Any browser implementing BrowserBackend protocol.
                Examples:
                - CDPBackendV0 (for browser-use via BrowserUseAdapter)
                - PlaywrightBackend (future, for direct Playwright)
            tracer: Tracer for emitting verification events
            snapshot_options: Default options for snapshots
            sentience_api_key: API key for Pro/Enterprise tier (enables Gateway refinement)
        """
        self.backend = backend
        self.tracer = tracer

        # Build default snapshot options with API key if provided.
        # NOTE: when the caller passes snapshot_options, that same object is
        # stored and updated in place below (no copy is made).
        default_opts = snapshot_options or SnapshotOptions()
        if sentience_api_key:
            default_opts.sentience_api_key = sentience_api_key
            # Only switch the API on when the caller left use_api unset.
            if default_opts.use_api is None:
                default_opts.use_api = True
        self._snapshot_options = default_opts

        # Step tracking
        self.step_id: str | None = None
        self.step_index: int = 0

        # Snapshot state
        self.last_snapshot: Snapshot | None = None

        # Failure artifacts (Phase 1)
        self._artifact_buffer: FailureArtifactBuffer | None = None
        self._artifact_timer_task: asyncio.Task | None = None

        # Cached URL (updated on explicit get_url call)
        self._cached_url: str | None = None

        # Assertions accumulated during current step
        self._assertions_this_step: list[dict[str, Any]] = []

        # Task completion tracking
        self._task_done: bool = False
        self._task_done_label: str | None = None

    @classmethod
    async def from_sentience_browser(
        cls,
        browser: AsyncSentienceBrowser,
        page: Page,
        tracer: Tracer,
        snapshot_options: SnapshotOptions | None = None,
        sentience_api_key: str | None = None,
    ) -> AgentRuntime:
        """
        Create AgentRuntime from AsyncSentienceBrowser (backward compatibility).

        This factory method wraps an AsyncSentienceBrowser + Page combination
        into the new BrowserBackend-based AgentRuntime.

        Args:
            browser: AsyncSentienceBrowser instance
            page: Playwright Page for browser interaction
            tracer: Tracer for emitting verification events
            snapshot_options: Default options for snapshots
            sentience_api_key: API key for Pro/Enterprise tier

        Returns:
            AgentRuntime instance
        """
        # Imported lazily so the Playwright backend is only loaded on this path.
        from .backends.playwright_backend import PlaywrightBackend

        backend = PlaywrightBackend(page)
        runtime = cls(
            backend=backend,
            tracer=tracer,
            snapshot_options=snapshot_options,
            sentience_api_key=sentience_api_key,
        )
        # Store browser reference for snapshot() to use; the presence of these
        # two attributes is what routes snapshot() through the legacy path.
        runtime._legacy_browser = browser
        runtime._legacy_page = page
        return runtime

    def _ctx(self) -> AssertContext:
        """
        Build assertion context from current state.

        The URL comes from the last snapshot when one exists, otherwise from
        the URL cached by get_url() (or None when neither is available).

        Returns:
            AssertContext with current snapshot and URL
        """
        url = None
        if self.last_snapshot is not None:
            url = self.last_snapshot.url
        elif self._cached_url:
            url = self._cached_url

        return AssertContext(
            snapshot=self.last_snapshot,
            url=url,
            step_id=self.step_id,
        )

    async def get_url(self) -> str:
        """
        Get current page URL from the backend and cache it.

        Returns:
            Current page URL
        """
        url = await self.backend.get_url()
        self._cached_url = url
        return url

    async def snapshot(self, **kwargs: Any) -> Snapshot:
        """
        Take a snapshot of the current page state.

        This updates last_snapshot which is used as context for assertions.

        Args:
            **kwargs: Override default snapshot options for this call.
                Common options:
                - limit: Maximum elements to return
                - goal: Task goal for ordinal support
                - screenshot: Include screenshot
                - show_overlay: Show visual overlay

        Returns:
            Snapshot of current page state
        """
        # Check if using legacy browser (backward compat).
        # On this path only **kwargs are forwarded; the default snapshot
        # options stored on the runtime are not merged in.
        if hasattr(self, "_legacy_browser") and hasattr(self, "_legacy_page"):
            self.last_snapshot = await self._legacy_browser.snapshot(self._legacy_page, **kwargs)
            return self.last_snapshot

        # Use backend-agnostic snapshot
        from .backends.snapshot import snapshot as backend_snapshot

        # Merge default options with call-specific kwargs (kwargs win)
        options_dict = self._snapshot_options.model_dump(exclude_none=True)
        options_dict.update(kwargs)
        options = SnapshotOptions(**options_dict)

        self.last_snapshot = await backend_snapshot(self.backend, options=options)
        return self.last_snapshot

    async def enable_failure_artifacts(
        self,
        options: FailureArtifactsOptions | None = None,
    ) -> None:
        """
        Enable failure artifact buffer (Phase 1).

        Creates a new FailureArtifactBuffer for the tracer's run_id and, when
        options.fps is positive, starts a background task that captures frames
        periodically.

        Args:
            options: Artifact options; defaults to FailureArtifactsOptions()
        """
        opts = options or FailureArtifactsOptions()
        self._artifact_buffer = FailureArtifactBuffer(
            run_id=self.tracer.run_id,
            options=opts,
        )
        if opts.fps > 0:
            self._artifact_timer_task = asyncio.create_task(self._artifact_timer_loop())

    def disable_failure_artifacts(self) -> None:
        """
        Stop background frame capture.

        Cancels the timer task if one is running. The artifact buffer itself
        is left in place.
        """
        if self._artifact_timer_task:
            self._artifact_timer_task.cancel()
            self._artifact_timer_task = None

    async def record_action(
        self,
        action: str,
        *,
        url: str | None = None,
    ) -> None:
        """
        Record an action in the artifact timeline and capture a frame if enabled.

        No-op when failure artifacts have not been enabled.

        Args:
            action: Action description passed to the artifact buffer
            url: Optional URL associated with the action
        """
        if not self._artifact_buffer:
            return
        self._artifact_buffer.record_step(
            action=action,
            step_id=self.step_id,
            step_index=self.step_index,
            url=url,
        )
        if self._artifact_buffer.options.capture_on_action:
            await self._capture_artifact_frame()

    async def _capture_artifact_frame(self) -> None:
        """Best-effort: take a PNG screenshot and add it to the artifact buffer."""
        if not self._artifact_buffer:
            return
        try:
            image_bytes = await self.backend.screenshot_png()
        except Exception:
            # Deliberately best-effort: a failed screenshot must not break the run.
            return
        self._artifact_buffer.add_frame(image_bytes, fmt="png")

    async def _artifact_timer_loop(self) -> None:
        """Capture a frame every 1/fps seconds until the task is cancelled."""
        if not self._artifact_buffer:
            return
        # Clamp fps to avoid division by zero / absurdly long intervals.
        interval = 1.0 / max(0.001, self._artifact_buffer.options.fps)
        try:
            while True:
                await self._capture_artifact_frame()
                await asyncio.sleep(interval)
        except asyncio.CancelledError:
            return

    def finalize_run(self, *, success: bool) -> None:
        """
        Finalize artifact buffer at end of run.

        On success the buffer is persisted only when persist_mode is "always",
        then cleaned up. On failure the failure artifacts are persisted.

        Args:
            success: Whether the run succeeded
        """
        if not self._artifact_buffer:
            return
        if success:
            if self._artifact_buffer.options.persist_mode == "always":
                self._artifact_buffer.persist(
                    reason="success",
                    status="success",
                    snapshot=self.last_snapshot,
                    diagnostics=getattr(self.last_snapshot, "diagnostics", None),
                    metadata=self._artifact_metadata(),
                )
            self._artifact_buffer.cleanup()
        else:
            self._persist_failure_artifacts(reason="finalize_failure")

    def _persist_failure_artifacts(self, *, reason: str) -> None:
        """Persist the artifact buffer with status "failure", then clean it up."""
        if not self._artifact_buffer:
            return
        self._artifact_buffer.persist(
            reason=reason,
            status="failure",
            snapshot=self.last_snapshot,
            diagnostics=getattr(self.last_snapshot, "diagnostics", None),
            metadata=self._artifact_metadata(),
        )
        self._artifact_buffer.cleanup()
        # In "onFail" mode, stop background capture after the first failure.
        if self._artifact_buffer.options.persist_mode == "onFail":
            self.disable_failure_artifacts()

    def _artifact_metadata(self) -> dict[str, Any]:
        """Return backend class name and best-known URL for persisted artifacts."""
        url = None
        if self.last_snapshot is not None:
            url = self.last_snapshot.url
        elif self._cached_url:
            url = self._cached_url
        return {
            "backend": self.backend.__class__.__name__,
            "url": url,
        }

    def begin_step(self, goal: str, step_index: int | None = None) -> str:
        """
        Begin a new step in the verification loop.

        This:
        - Generates a new step_id
        - Clears assertions from previous step
        - Increments step_index (or uses provided value)

        Because step_index starts at 0 and is incremented before use, the
        first auto-indexed step has index 1.

        Args:
            goal: Description of what this step aims to achieve
                (accepted for API symmetry; not used by this method)
            step_index: Optional explicit step index (otherwise auto-increments)

        Returns:
            Generated step_id
        """
        # Clear previous step state
        self._assertions_this_step = []

        # Generate new step_id
        self.step_id = str(uuid.uuid4())

        # Update step index
        if step_index is not None:
            self.step_index = step_index
        else:
            self.step_index += 1

        return self.step_id

    def assert_(
        self,
        predicate: Predicate,
        label: str,
        required: bool = False,
    ) -> bool:
        """
        Evaluate an assertion against current snapshot state.

        The assertion result is:
        1. Accumulated for inclusion in step_end.data.verify.signals.assertions
        2. Emitted as a dedicated 'verification' event for Studio timeline

        A failed required assertion also persists failure artifacts (when
        enabled).

        Args:
            predicate: Predicate function to evaluate
            label: Human-readable label for this assertion
            required: If True, this assertion gates step success (default: False)

        Returns:
            True if assertion passed, False otherwise
        """
        outcome = predicate(self._ctx())
        self._record_outcome(
            outcome=outcome,
            label=label,
            required=required,
            kind="assert",
            record_in_step=True,
        )
        if required and not outcome.passed:
            self._persist_failure_artifacts(reason=f"assert_failed:{label}")
        return outcome.passed

    def check(self, predicate: Predicate, label: str, required: bool = False) -> AssertionHandle:
        """
        Create an AssertionHandle for fluent `.once()` / `.eventually()` usage.

        This does NOT evaluate the predicate immediately.
        """

        return AssertionHandle(runtime=self, predicate=predicate, label=label, required=required)

    def assert_done(
        self,
        predicate: Predicate,
        label: str,
    ) -> bool:
        """
        Assert task completion (required assertion).

        This is a convenience wrapper for assert_() with required=True.
        When the assertion passes, it marks the task as done.

        Use this for final verification that the agent's goal is complete.

        Args:
            predicate: Predicate function to evaluate
            label: Human-readable label for this assertion

        Returns:
            True if task is complete (assertion passed), False otherwise
        """
        # Convenience wrapper for assert_ with required=True.
        # (Previously called a non-existent ``assertTrue`` method, which raised
        # AttributeError on every call.)
        ok = self.assert_(predicate, label=label, required=True)
        if ok:
            self._task_done = True
            self._task_done_label = label

            # Emit task_done verification event
            self.tracer.emit(
                "verification",
                data={
                    "kind": "task_done",
                    "passed": True,
                    "label": label,
                },
                step_id=self.step_id,
            )

        return ok

    def _record_outcome(
        self,
        *,
        outcome: Any,
        label: str,
        required: bool,
        kind: str,
        record_in_step: bool,
        extra: dict[str, Any] | None = None,
    ) -> None:
        """
        Internal helper: emit verification event and optionally accumulate for step_end.

        Args:
            outcome: Object exposing ``passed``, ``reason`` and ``details``
            label: Human-readable assertion label
            required: Whether the assertion gates step success
            kind: Value for the event's "kind" field
            record_in_step: If True, also append the record to this step's assertions
            extra: Optional extra keys merged into the record
        """
        # Copy so the caller's outcome.details is not mutated below.
        details = dict(outcome.details or {})

        # Failure intelligence: nearest matches for selector-driven assertions
        if not outcome.passed and self.last_snapshot is not None and "selector" in details:
            selector = str(details.get("selector") or "")
            details.setdefault("nearest_matches", self._nearest_matches(selector, limit=3))

        record = {
            "label": label,
            "passed": bool(outcome.passed),
            "required": required,
            "reason": str(outcome.reason or ""),
            "details": details,
        }
        if extra:
            record.update(extra)

        if record_in_step:
            self._assertions_this_step.append(record)

        self.tracer.emit(
            "verification",
            data={
                "kind": kind,
                "passed": bool(outcome.passed),
                **record,
            },
            step_id=self.step_id,
        )

    def _nearest_matches(self, selector: str, *, limit: int = 3) -> list[dict[str, Any]]:
        """
        Best-effort nearest match suggestions for debugging failed selector assertions.

        Compares the lower-cased selector string against each snapshot
        element's name (or text when name is empty) with difflib and returns
        the top ``limit`` elements by similarity ratio.
        """
        if self.last_snapshot is None:
            return []

        s = selector.lower().strip()
        if not s:
            return []

        scored: list[tuple[float, Any]] = []
        for el in self.last_snapshot.elements:
            hay = (getattr(el, "name", None) or getattr(el, "text", None) or "").strip()
            if not hay:
                continue
            score = difflib.SequenceMatcher(None, s, hay.lower()).ratio()
            scored.append((score, el))

        # Sort on the score only so elements themselves never get compared.
        scored.sort(key=lambda t: t[0], reverse=True)
        out: list[dict[str, Any]] = []
        for score, el in scored[:limit]:
            out.append(
                {
                    "id": getattr(el, "id", None),
                    "role": getattr(el, "role", None),
                    "text": (getattr(el, "text", "") or "")[:80],
                    "name": (getattr(el, "name", "") or "")[:80],
                    "score": round(float(score), 4),
                }
            )
        return out

    def get_assertions_for_step_end(self) -> dict[str, Any]:
        """
        Get assertions data for inclusion in step_end.data.verify.signals.

        Returns:
            Dictionary with an 'assertions' key, plus 'task_done' and
            'task_done_label' keys when the task has been marked done
        """
        result: dict[str, Any] = {
            "assertions": self._assertions_this_step.copy(),
        }

        if self._task_done:
            result["task_done"] = True
            result["task_done_label"] = self._task_done_label

        return result

    def flush_assertions(self) -> list[dict[str, Any]]:
        """
        Get and clear assertions for current step.
        """
        assertions = self._assertions_this_step.copy()
        self._assertions_this_step = []
        return assertions

    @property
    def is_task_done(self) -> bool:
        """Check if task has been marked as done via assert_done()."""
        return self._task_done

    def reset_task_done(self) -> None:
        """Reset task_done state (for multi-task runs)."""
        self._task_done = False
        self._task_done_label = None

    def all_assertions_passed(self) -> bool:
        """Return True if all assertions in current step passed (or none)."""
        return all(a["passed"] for a in self._assertions_this_step)

    def required_assertions_passed(self) -> bool:
        """Return True if all required assertions in current step passed (or none)."""
        required = [a for a in self._assertions_this_step if a.get("required")]
        return all(a["passed"] for a in required)
|
|
595
|
+
|
|
596
|
+
|
|
597
|
+
@dataclass
class AssertionHandle:
    """
    Deferred assertion bound to a runtime, predicate and label.

    Created by AgentRuntime.check(); nothing is evaluated until once() or
    eventually() is called.
    """

    # Runtime whose snapshot state, tracer and artifact buffer are used.
    runtime: AgentRuntime
    # Callable taking the runtime's assertion context and returning an outcome.
    predicate: Predicate
    # Human-readable label attached to every emitted event.
    label: str
    # If True, a final failure also persists failure artifacts.
    required: bool = False

    def once(self) -> bool:
        """Evaluate once (same behavior as runtime.assert_)."""
        return self.runtime.assert_(self.predicate, label=self.label, required=self.required)

    async def eventually(
        self,
        *,
        timeout_s: float = 10.0,
        poll_s: float = 0.25,
        min_confidence: float | None = None,
        max_snapshot_attempts: int = 3,
        snapshot_kwargs: dict[str, Any] | None = None,
        vision_provider: Any | None = None,
        vision_system_prompt: str | None = None,
        vision_user_prompt: str | None = None,
    ) -> bool:
        """
        Retry until the predicate passes or timeout is reached.

        Intermediate attempts emit verification events but do NOT accumulate in step_end assertions.
        Final result is accumulated once.

        Args:
            timeout_s: Overall time budget in seconds (checked after each attempt)
            poll_s: Delay between attempts in seconds
            min_confidence: If set, a snapshot whose diagnostics report a lower
                confidence is treated as a failed attempt without evaluating
                the predicate
            max_snapshot_attempts: Number of snapshots after which a
                low-confidence snapshot ends the loop (vision fallback or
                "snapshot_exhausted" failure)
            snapshot_kwargs: Extra keyword arguments for runtime.snapshot()
            vision_provider: Optional provider used as a last-resort visual
                verifier when snapshots stay below min_confidence
            vision_system_prompt: Override for the vision system prompt
            vision_user_prompt: Override for the vision user prompt

        Returns:
            True if the predicate (or the vision fallback) passed, False otherwise
        """
        deadline = time.monotonic() + timeout_s
        attempt = 0
        snapshot_attempt = 0
        last_outcome = None

        while True:
            attempt += 1
            await self.runtime.snapshot(**(snapshot_kwargs or {}))
            snapshot_attempt += 1

            # Optional: gate predicate evaluation on snapshot confidence.
            # If diagnostics are missing, we don't block (backward compatible).
            confidence = None
            diagnostics = None
            if self.runtime.last_snapshot is not None:
                diagnostics = getattr(self.runtime.last_snapshot, "diagnostics", None)
                if diagnostics is not None:
                    confidence = getattr(diagnostics, "confidence", None)

            if (
                min_confidence is not None
                and confidence is not None
                and isinstance(confidence, (int, float))
                and confidence < min_confidence
            ):
                last_outcome = AssertOutcome(
                    passed=False,
                    reason=f"Snapshot confidence {confidence:.3f} < min_confidence {min_confidence:.3f}",
                    details={
                        "reason_code": "snapshot_low_confidence",
                        "confidence": confidence,
                        "min_confidence": min_confidence,
                        "snapshot_attempt": snapshot_attempt,
                        "diagnostics": (
                            diagnostics.model_dump()
                            if hasattr(diagnostics, "model_dump")
                            else diagnostics
                        ),
                    },
                )

                # Emit attempt event (not recorded in step_end)
                self.runtime._record_outcome(
                    outcome=last_outcome,
                    label=self.label,
                    required=self.required,
                    kind="assert",
                    record_in_step=False,
                    extra={
                        "eventually": True,
                        "attempt": attempt,
                        "snapshot_attempt": snapshot_attempt,
                    },
                )

                if snapshot_attempt >= max_snapshot_attempts:
                    # Set when the vision fallback raised, so the error can be
                    # surfaced in the final "snapshot_exhausted" outcome.
                    vision_error: str | None = None

                    # Optional: vision fallback as last resort (Phase 2-lite).
                    # This keeps the assertion surface invariant; only the perception layer changes.
                    if (
                        vision_provider is not None
                        and getattr(vision_provider, "supports_vision", lambda: False)()
                    ):
                        try:
                            import base64

                            png_bytes = await self.runtime.backend.screenshot_png()
                            image_b64 = base64.b64encode(png_bytes).decode("utf-8")

                            sys_prompt = vision_system_prompt or (
                                "You are a strict visual verifier. Answer only YES or NO."
                            )
                            user_prompt = vision_user_prompt or (
                                f"Given the screenshot, is the following condition satisfied?\n\n{self.label}\n\nAnswer YES or NO."
                            )

                            resp = vision_provider.generate_with_image(
                                sys_prompt,
                                user_prompt,
                                image_base64=image_b64,
                                temperature=0.0,
                            )
                            text = (resp.content or "").strip().lower()
                            passed = text.startswith("yes")

                            final_outcome = AssertOutcome(
                                passed=passed,
                                reason="vision_fallback_yes" if passed else "vision_fallback_no",
                                details={
                                    "reason_code": (
                                        "vision_fallback_pass" if passed else "vision_fallback_fail"
                                    ),
                                    "vision_response": resp.content,
                                    "min_confidence": min_confidence,
                                    "snapshot_attempts": snapshot_attempt,
                                },
                            )
                            self.runtime._record_outcome(
                                outcome=final_outcome,
                                label=self.label,
                                required=self.required,
                                kind="assert",
                                record_in_step=True,
                                extra={
                                    "eventually": True,
                                    "attempt": attempt,
                                    "snapshot_attempt": snapshot_attempt,
                                    "final": True,
                                    "vision_fallback": True,
                                },
                            )
                            if self.required and not passed:
                                self.runtime._persist_failure_artifacts(
                                    reason=f"assert_eventually_failed:{self.label}"
                                )
                            return passed
                        except Exception as e:
                            # If vision fallback fails, fall through to snapshot_exhausted.
                            vision_error = str(e)
                            last_outcome.details["vision_error"] = vision_error

                    exhausted_details = {
                        "reason_code": "snapshot_exhausted",
                        "confidence": confidence,
                        "min_confidence": min_confidence,
                        "snapshot_attempts": snapshot_attempt,
                        "diagnostics": last_outcome.details.get("diagnostics"),
                    }
                    # last_outcome was already emitted above and is not recorded
                    # again, so carry the vision error into the final outcome;
                    # otherwise it would never reach the trace.
                    if vision_error is not None:
                        exhausted_details["vision_error"] = vision_error

                    final_outcome = AssertOutcome(
                        passed=False,
                        reason=f"Snapshot exhausted after {snapshot_attempt} attempt(s) below min_confidence {min_confidence:.3f}",
                        details=exhausted_details,
                    )
                    self.runtime._record_outcome(
                        outcome=final_outcome,
                        label=self.label,
                        required=self.required,
                        kind="assert",
                        record_in_step=True,
                        extra={
                            "eventually": True,
                            "attempt": attempt,
                            "snapshot_attempt": snapshot_attempt,
                            "final": True,
                            "exhausted": True,
                        },
                    )
                    if self.required:
                        self.runtime._persist_failure_artifacts(
                            reason=f"assert_eventually_failed:{self.label}"
                        )
                    return False

                if time.monotonic() >= deadline:
                    self.runtime._record_outcome(
                        outcome=last_outcome,
                        label=self.label,
                        required=self.required,
                        kind="assert",
                        record_in_step=True,
                        extra={
                            "eventually": True,
                            "attempt": attempt,
                            "snapshot_attempt": snapshot_attempt,
                            "final": True,
                            "timeout": True,
                        },
                    )
                    if self.required:
                        self.runtime._persist_failure_artifacts(
                            reason=f"assert_eventually_timeout:{self.label}"
                        )
                    return False

                # Low-confidence snapshot: wait and re-snapshot without
                # evaluating the predicate.
                await asyncio.sleep(poll_s)
                continue

            last_outcome = self.predicate(self.runtime._ctx())

            # Emit attempt event (not recorded in step_end)
            self.runtime._record_outcome(
                outcome=last_outcome,
                label=self.label,
                required=self.required,
                kind="assert",
                record_in_step=False,
                extra={"eventually": True, "attempt": attempt},
            )

            if last_outcome.passed:
                # Record final success once
                self.runtime._record_outcome(
                    outcome=last_outcome,
                    label=self.label,
                    required=self.required,
                    kind="assert",
                    record_in_step=True,
                    extra={"eventually": True, "attempt": attempt, "final": True},
                )
                return True

            if time.monotonic() >= deadline:
                # Record final failure once
                self.runtime._record_outcome(
                    outcome=last_outcome,
                    label=self.label,
                    required=self.required,
                    kind="assert",
                    record_in_step=True,
                    extra={"eventually": True, "attempt": attempt, "final": True, "timeout": True},
                )
                if self.required:
                    self.runtime._persist_failure_artifacts(
                        reason=f"assert_eventually_timeout:{self.label}"
                    )
                return False

            await asyncio.sleep(poll_s)
|