sentienceapi 0.90.16__py3-none-any.whl → 0.98.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sentienceapi might be problematic. Click here for more details.
- sentience/__init__.py +120 -6
- sentience/_extension_loader.py +156 -1
- sentience/action_executor.py +217 -0
- sentience/actions.py +758 -30
- sentience/agent.py +806 -293
- sentience/agent_config.py +3 -0
- sentience/agent_runtime.py +840 -0
- sentience/asserts/__init__.py +70 -0
- sentience/asserts/expect.py +621 -0
- sentience/asserts/query.py +383 -0
- sentience/async_api.py +89 -1141
- sentience/backends/__init__.py +137 -0
- sentience/backends/actions.py +372 -0
- sentience/backends/browser_use_adapter.py +241 -0
- sentience/backends/cdp_backend.py +393 -0
- sentience/backends/exceptions.py +211 -0
- sentience/backends/playwright_backend.py +194 -0
- sentience/backends/protocol.py +216 -0
- sentience/backends/sentience_context.py +469 -0
- sentience/backends/snapshot.py +483 -0
- sentience/base_agent.py +95 -0
- sentience/browser.py +678 -39
- sentience/browser_evaluator.py +299 -0
- sentience/canonicalization.py +207 -0
- sentience/cloud_tracing.py +507 -42
- sentience/constants.py +6 -0
- sentience/conversational_agent.py +77 -43
- sentience/cursor_policy.py +142 -0
- sentience/element_filter.py +136 -0
- sentience/expect.py +98 -2
- sentience/extension/background.js +56 -185
- sentience/extension/content.js +150 -287
- sentience/extension/injected_api.js +1088 -1368
- sentience/extension/manifest.json +1 -1
- sentience/extension/pkg/sentience_core.d.ts +22 -22
- sentience/extension/pkg/sentience_core.js +275 -433
- sentience/extension/pkg/sentience_core_bg.wasm +0 -0
- sentience/extension/release.json +47 -47
- sentience/failure_artifacts.py +241 -0
- sentience/formatting.py +9 -53
- sentience/inspector.py +183 -1
- sentience/integrations/__init__.py +6 -0
- sentience/integrations/langchain/__init__.py +12 -0
- sentience/integrations/langchain/context.py +18 -0
- sentience/integrations/langchain/core.py +326 -0
- sentience/integrations/langchain/tools.py +180 -0
- sentience/integrations/models.py +46 -0
- sentience/integrations/pydanticai/__init__.py +15 -0
- sentience/integrations/pydanticai/deps.py +20 -0
- sentience/integrations/pydanticai/toolset.py +468 -0
- sentience/llm_interaction_handler.py +191 -0
- sentience/llm_provider.py +765 -66
- sentience/llm_provider_utils.py +120 -0
- sentience/llm_response_builder.py +153 -0
- sentience/models.py +595 -3
- sentience/ordinal.py +280 -0
- sentience/overlay.py +109 -2
- sentience/protocols.py +228 -0
- sentience/query.py +67 -5
- sentience/read.py +95 -3
- sentience/recorder.py +223 -3
- sentience/schemas/trace_v1.json +128 -9
- sentience/screenshot.py +48 -2
- sentience/sentience_methods.py +86 -0
- sentience/snapshot.py +599 -55
- sentience/snapshot_diff.py +126 -0
- sentience/text_search.py +120 -5
- sentience/trace_event_builder.py +148 -0
- sentience/trace_file_manager.py +197 -0
- sentience/trace_indexing/index_schema.py +95 -7
- sentience/trace_indexing/indexer.py +105 -48
- sentience/tracer_factory.py +120 -9
- sentience/tracing.py +172 -8
- sentience/utils/__init__.py +40 -0
- sentience/utils/browser.py +46 -0
- sentience/{utils.py → utils/element.py} +3 -42
- sentience/utils/formatting.py +59 -0
- sentience/verification.py +618 -0
- sentience/visual_agent.py +2058 -0
- sentience/wait.py +68 -2
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/METADATA +199 -40
- sentienceapi-0.98.0.dist-info/RECORD +92 -0
- sentience/extension/test-content.js +0 -4
- sentienceapi-0.90.16.dist-info/RECORD +0 -50
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/WHEEL +0 -0
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/entry_points.txt +0 -0
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE +0 -0
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE-APACHE +0 -0
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/licenses/LICENSE-MIT +0 -0
- {sentienceapi-0.90.16.dist-info → sentienceapi-0.98.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,468 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import re
|
|
5
|
+
import time
|
|
6
|
+
from typing import Annotated, Any, Literal
|
|
7
|
+
|
|
8
|
+
from pydantic import Field
|
|
9
|
+
|
|
10
|
+
from sentience.actions import (
|
|
11
|
+
click_async,
|
|
12
|
+
click_rect_async,
|
|
13
|
+
press_async,
|
|
14
|
+
scroll_to_async,
|
|
15
|
+
type_text_async,
|
|
16
|
+
)
|
|
17
|
+
from sentience.integrations.models import AssertionResult, BrowserState, ElementSummary
|
|
18
|
+
from sentience.models import ReadResult, SnapshotOptions, TextRectSearchResult
|
|
19
|
+
from sentience.read import read_async
|
|
20
|
+
from sentience.snapshot import snapshot_async
|
|
21
|
+
from sentience.text_search import find_text_rect_async
|
|
22
|
+
from sentience.trace_event_builder import TraceEventBuilder
|
|
23
|
+
|
|
24
|
+
from .deps import SentiencePydanticDeps
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def register_sentience_tools(agent: Any) -> dict[str, Any]:
    """
    Register Sentience tools on a PydanticAI agent.

    This function is intentionally lightweight and avoids importing `pydantic_ai`
    at module import time. It expects `agent` to provide a `.tool` decorator
    compatible with PydanticAI's `Agent.tool`.

    Every tool reads its browser from `ctx.deps.browser`. When `ctx.deps.tracer`
    is set, each tool call is additionally wrapped in step_start / step_end
    (or error) trace events. Tracing is strictly best-effort: a tracing failure
    never changes a tool's result or raises into the agent run.

    Args:
        agent: Object exposing a `.tool` decorator.

    Returns:
        Mapping of tool name -> the decorated tool callable as returned by
        `agent.tool` (useful for tests).
    """
    # Local import keeps this function self-contained (stdlib only).
    import logging

    logger = logging.getLogger(__name__)

    # Per-agent counter for tool call steps (for tracing)
    step_counter = {"n": 0}

    def _safe_tracer_call(tracer: Any, method_name: str, *args, **kwargs) -> None:
        """Call `tracer.<method_name>(*args, **kwargs)` without ever raising."""
        try:
            getattr(tracer, method_name)(*args, **kwargs)
        except Exception:
            # Tracing must be non-fatal for tool execution; keep a debug
            # breadcrumb so broken tracers are still diagnosable.
            logger.debug("Sentience tracer call %r failed", method_name, exc_info=True)

    def _current_url(deps: Any, default: Any = None) -> Any:
        """Return `deps.browser.page.url`, or `default` when there is no page/url."""
        page = getattr(deps.browser, "page", None)
        if page is None:
            return default
        return getattr(page, "url", default)

    def _infer_success(result: Any) -> bool | None:
        """Best-effort success inference from a tool result (None = unknown)."""
        if hasattr(result, "success"):
            return bool(getattr(result, "success"))
        if hasattr(result, "status"):
            return getattr(result, "status") == "success"
        if isinstance(result, dict):
            if "success" in result:
                try:
                    return bool(result.get("success"))
                except Exception:
                    return None
            if "status" in result:
                return result.get("status") == "success"
        return None

    def _emit_step_end(
        tracer: Any,
        deps: Any,
        *,
        tool_name: str,
        step_id: str,
        step_index: int,
        pre_url: Any,
        duration_ms: int,
        result: Any,
        exec_meta: dict[str, Any],
    ) -> None:
        """Build and emit the step_end event; any failure is logged and dropped."""
        try:
            post_url = _current_url(deps, pre_url)
            success = _infer_success(result)

            exec_data = {"tool": tool_name, "duration_ms": duration_ms, **exec_meta}
            if success is not None:
                exec_data["success"] = success

            # Assertion-style results (built in this file with `passed=...`)
            # carry their own verdict; otherwise fall back to the inferred
            # success, defaulting to True when nothing is known.
            if hasattr(result, "passed"):
                verified = bool(getattr(result, "passed"))
            elif success is not None:
                verified = success
            else:
                verified = True

            step_end_data = TraceEventBuilder.build_step_end_event(
                step_id=step_id,
                step_index=step_index,
                goal=f"tool:{tool_name}",
                attempt=0,
                pre_url=pre_url or "",
                post_url=post_url or "",
                snapshot_digest=None,
                llm_data={},
                exec_data=exec_data,
                verify_data={"passed": verified, "signals": {}},
            )
        except Exception:
            # The tool itself succeeded; a tracing problem must not mask that.
            logger.debug("Failed to build step_end trace for %r", tool_name, exc_info=True)
            return
        _safe_tracer_call(tracer, "emit", "step_end", step_end_data, step_id=step_id)

    async def _trace_tool_call(ctx: Any, tool_name: str, exec_coro, exec_meta: dict[str, Any]):
        """
        Wrap a tool execution with Sentience tracing if a tracer is present in deps.

        `exec_coro` is a zero-argument coroutine function performing the tool's
        work; `exec_meta` is merged into the step_end `exec` payload. Exceptions
        raised by the tool are reported via `emit_error` and re-raised unchanged.
        """
        deps: SentiencePydanticDeps = ctx.deps
        tracer = deps.tracer

        if not tracer:
            # No tracer configured: run the tool with zero tracing overhead.
            return await exec_coro()

        pre_url = _current_url(deps)

        # Initialize run_start once (best-effort)
        if getattr(tracer, "started_at", None) is None:
            _safe_tracer_call(
                tracer,
                "emit_run_start",
                agent="PydanticAI+SentienceToolset",
                llm_model=None,
                config={"integration": "pydanticai"},
            )

        step_counter["n"] += 1
        step_index = step_counter["n"]
        step_id = f"tool-{step_index}:{tool_name}"
        _safe_tracer_call(
            tracer,
            "emit_step_start",
            step_id=step_id,
            step_index=step_index,
            goal=f"tool:{tool_name}",
            attempt=0,
            pre_url=pre_url,
        )

        # Monotonic clock: durations must not be skewed by wall-clock changes.
        started = time.monotonic()
        try:
            # Only the tool itself lives in this try, so tracing failures are
            # never mistaken for (or reported as) tool failures.
            result = await exec_coro()
        except Exception as e:
            _safe_tracer_call(tracer, "emit_error", step_id=step_id, error=str(e), attempt=0)
            raise

        duration_ms = int((time.monotonic() - started) * 1000)
        _emit_step_end(
            tracer,
            deps,
            tool_name=tool_name,
            step_id=step_id,
            step_index=step_index,
            pre_url=pre_url,
            duration_ms=duration_ms,
            result=result,
            exec_meta=exec_meta,
        )
        return result

    # NOTE: tool signatures and tool docstrings below are kept verbatim on
    # purpose. PydanticAI builds each tool's schema/description from them (see
    # the PydanticAI function-tools documentation), so editing them would change
    # what the model sees.

    @agent.tool
    async def snapshot_state(
        ctx: Any,
        limit: Annotated[int, Field(ge=1, le=500)] = 50,
        include_screenshot: bool = False,
    ) -> BrowserState:
        """
        Take a bounded snapshot of the current page and return a small typed summary.
        """

        async def _run():
            deps: SentiencePydanticDeps = ctx.deps
            opts = SnapshotOptions(limit=limit, screenshot=include_screenshot)
            snap = await snapshot_async(deps.browser, opts)
            # A snapshot without a `status` attribute is treated as successful.
            if getattr(snap, "status", "success") != "success":
                raise RuntimeError(getattr(snap, "error", None) or "snapshot failed")
            elements = [
                ElementSummary(
                    id=e.id,
                    role=e.role,
                    text=e.text,
                    importance=e.importance,
                    bbox=e.bbox,
                )
                for e in snap.elements
            ]
            return BrowserState(url=snap.url, elements=elements)

        return await _trace_tool_call(
            ctx,
            "snapshot_state",
            _run,
            {"limit": limit, "include_screenshot": include_screenshot},
        )

    @agent.tool
    async def read_page(
        ctx: Any,
        format: Literal["raw", "text", "markdown"] = "text",
        enhance_markdown: bool = True,
    ) -> ReadResult:
        """
        Read page content as raw HTML, text, or markdown.
        """

        async def _run():
            deps: SentiencePydanticDeps = ctx.deps
            return await read_async(
                deps.browser,
                output_format=format,
                enhance_markdown=enhance_markdown,
            )

        return await _trace_tool_call(
            ctx,
            "read_page",
            _run,
            {"format": format, "enhance_markdown": enhance_markdown},
        )

    @agent.tool
    async def click(
        ctx: Any,
        element_id: Annotated[int, Field(ge=0)],
    ):
        """
        Click an element by Sentience element id (from snapshot).
        """

        async def _run():
            deps: SentiencePydanticDeps = ctx.deps
            return await click_async(deps.browser, element_id)

        return await _trace_tool_call(ctx, "click", _run, {"element_id": element_id})

    @agent.tool
    async def type_text(
        ctx: Any,
        element_id: Annotated[int, Field(ge=0)],
        text: str,
        delay_ms: Annotated[float, Field(ge=0, le=250)] = 0,
    ):
        """
        Type text into an element by Sentience element id (from snapshot).
        """

        async def _run():
            deps: SentiencePydanticDeps = ctx.deps
            return await type_text_async(deps.browser, element_id, text, delay_ms=delay_ms)

        # NOTE: we intentionally don't trace full `text` to avoid accidental PII leakage
        return await _trace_tool_call(
            ctx,
            "type_text",
            _run,
            {"element_id": element_id, "delay_ms": delay_ms},
        )

    @agent.tool
    async def press_key(
        ctx: Any,
        key: str,
    ):
        """
        Press a keyboard key (Enter, Escape, Tab, etc.).
        """

        async def _run():
            deps: SentiencePydanticDeps = ctx.deps
            return await press_async(deps.browser, key)

        return await _trace_tool_call(ctx, "press_key", _run, {"key": key})

    @agent.tool
    async def scroll_to(
        ctx: Any,
        element_id: Annotated[int, Field(ge=0)],
        behavior: Literal["smooth", "instant", "auto"] = "smooth",
        block: Literal["start", "center", "end", "nearest"] = "center",
    ):
        """
        Scroll an element into view by Sentience element id (from snapshot).
        """

        async def _run():
            deps: SentiencePydanticDeps = ctx.deps
            return await scroll_to_async(deps.browser, element_id, behavior=behavior, block=block)

        return await _trace_tool_call(
            ctx,
            "scroll_to",
            _run,
            {"element_id": element_id, "behavior": behavior, "block": block},
        )

    @agent.tool
    async def navigate(
        ctx: Any,
        url: Annotated[str, Field(min_length=1)],
    ) -> dict[str, Any]:
        """
        Navigate to a URL using Playwright page.goto via AsyncSentienceBrowser.
        """

        async def _run():
            deps: SentiencePydanticDeps = ctx.deps
            await deps.browser.goto(url)
            # Report the URL the page ended up on; fall back to the requested
            # one when the browser exposes no page/url.
            post_url = _current_url(deps)
            return {"success": True, "url": post_url or url}

        return await _trace_tool_call(ctx, "navigate", _run, {"url": url})

    @agent.tool
    async def click_rect(
        ctx: Any,
        *,
        x: Annotated[float, Field()],
        y: Annotated[float, Field()],
        width: Annotated[float, Field(gt=0)],
        height: Annotated[float, Field(gt=0)],
        button: Literal["left", "right", "middle"] = "left",
        click_count: Annotated[int, Field(ge=1, le=3)] = 1,
    ):
        """
        Click by pixel coordinates (rectangle), useful with `find_text_rect`.
        """

        async def _run():
            deps: SentiencePydanticDeps = ctx.deps
            # click_rect_async is given the rectangle with short "w"/"h" keys.
            return await click_rect_async(
                deps.browser,
                {"x": x, "y": y, "w": width, "h": height},
                button=button,
                click_count=click_count,
            )

        return await _trace_tool_call(
            ctx,
            "click_rect",
            _run,
            {
                "x": x,
                "y": y,
                "width": width,
                "height": height,
                "button": button,
                "click_count": click_count,
            },
        )

    @agent.tool
    async def find_text_rect(
        ctx: Any,
        text: Annotated[str, Field(min_length=1)],
        case_sensitive: bool = False,
        whole_word: bool = False,
        max_results: Annotated[int, Field(ge=1, le=100)] = 10,
    ) -> TextRectSearchResult:
        """
        Find text occurrences and return pixel coordinates.
        """

        async def _run():
            deps: SentiencePydanticDeps = ctx.deps
            return await find_text_rect_async(
                deps.browser,
                text,
                case_sensitive=case_sensitive,
                whole_word=whole_word,
                max_results=max_results,
            )

        return await _trace_tool_call(
            ctx,
            "find_text_rect",
            _run,
            {
                "query": text,
                "case_sensitive": case_sensitive,
                "whole_word": whole_word,
                "max_results": max_results,
            },
        )

    @agent.tool
    async def verify_url_matches(
        ctx: Any,
        pattern: Annotated[str, Field(min_length=1)],
        flags: int = 0,
    ) -> AssertionResult:
        """
        Verify the current page URL matches a regex pattern.
        """

        async def _run():
            deps: SentiencePydanticDeps = ctx.deps
            if not deps.browser.page:
                return AssertionResult(passed=False, reason="Browser not started (page is None)")

            url = deps.browser.page.url
            # `re.search`: the pattern may match anywhere in the URL.
            ok = re.search(pattern, url, flags) is not None
            return AssertionResult(
                passed=ok,
                reason="" if ok else f"URL did not match pattern. url={url!r} pattern={pattern!r}",
                details={"url": url, "pattern": pattern},
            )

        return await _trace_tool_call(
            ctx,
            "verify_url_matches",
            _run,
            {"pattern": pattern},
        )

    @agent.tool
    async def verify_text_present(
        ctx: Any,
        text: Annotated[str, Field(min_length=1)],
        *,
        format: Literal["text", "markdown", "raw"] = "text",
        case_sensitive: bool = False,
    ) -> AssertionResult:
        """
        Verify a text substring is present in `read_page()` output.
        """

        async def _run():
            deps: SentiencePydanticDeps = ctx.deps
            result = await read_async(deps.browser, output_format=format, enhance_markdown=True)
            if result.status != "success":
                return AssertionResult(
                    passed=False, reason=f"read failed: {result.error}", details={}
                )

            # Case-insensitive search lowercases both sides.
            haystack = result.content if case_sensitive else result.content.lower()
            needle = text if case_sensitive else text.lower()
            ok = needle in haystack
            return AssertionResult(
                passed=ok,
                reason="" if ok else f"Text not present: {text!r}",
                details={"format": format, "query": text, "length": result.length},
            )

        return await _trace_tool_call(
            ctx,
            "verify_text_present",
            _run,
            {"query": text, "format": format},
        )

    @agent.tool
    async def assert_eventually_url_matches(
        ctx: Any,
        pattern: Annotated[str, Field(min_length=1)],
        *,
        timeout_s: Annotated[float, Field(gt=0)] = 10.0,
        poll_s: Annotated[float, Field(gt=0)] = 0.25,
        flags: int = 0,
    ) -> AssertionResult:
        """
        Retry until the page URL matches `pattern` or timeout is reached.
        """
        deadline = time.monotonic() + timeout_s
        while True:
            # Each attempt goes through `verify_url_matches`, so every poll is
            # traced as its own tool step when a tracer is configured.
            last = await verify_url_matches(ctx, pattern, flags)
            if last.passed:
                return last
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                # Out of time: report the most recent failed attempt.
                return last
            # Never sleep past the deadline; the final attempt lands on it.
            await asyncio.sleep(min(poll_s, remaining))

    return {
        "snapshot_state": snapshot_state,
        "read_page": read_page,
        "click": click,
        "type_text": type_text,
        "press_key": press_key,
        "scroll_to": scroll_to,
        "navigate": navigate,
        "click_rect": click_rect,
        "find_text_rect": find_text_rect,
        "verify_url_matches": verify_url_matches,
        "verify_text_present": verify_text_present,
        "assert_eventually_url_matches": assert_eventually_url_matches,
    }
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LLM Interaction Handler for Sentience Agent.
|
|
3
|
+
|
|
4
|
+
Handles all LLM-related operations: context building, querying, and response parsing.
|
|
5
|
+
This separates LLM interaction concerns from action execution.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
|
|
10
|
+
from .llm_provider import LLMProvider, LLMResponse
|
|
11
|
+
from .models import Snapshot
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class LLMInteractionHandler:
    """
    Handles LLM queries and response parsing for Sentience Agent.

    This class encapsulates all LLM interaction logic, making it easier to:
    - Test LLM interactions independently
    - Swap LLM providers without changing agent code
    - Modify prompt templates in one place
    """

    # Markdown code fence. A language tag is only consumed when it is
    # terminated by a newline, so an inline fence such as "```CLICK(42)```"
    # keeps the action word instead of losing it as a bogus "language".
    _CODE_FENCE_RE = re.compile(r"```(?:\w*\r?\n)?")

    # Pattern matches: CLICK(123), TYPE(123, "text"), PRESS("key"), FINISH()
    _ACTION_RE = re.compile(
        r'(CLICK\s*\(\s*\d+\s*\)|TYPE\s*\(\s*\d+\s*,\s*["\'].*?["\']\s*\)|PRESS\s*\(\s*["\'].*?["\']\s*\)|FINISH\s*\(\s*\))',
        re.IGNORECASE,
    )

    def __init__(self, llm: LLMProvider):
        """
        Initialize LLM interaction handler.

        Args:
            llm: LLM provider instance (OpenAIProvider, AnthropicProvider, etc.)
        """
        self.llm = llm

    def build_context(self, snap: Snapshot, goal: str | None = None) -> str:
        """
        Convert snapshot elements to token-efficient prompt string.

        Format: [ID] <role> "text" {cues} @ position size:WxH importance:score [status]

        Args:
            snap: Snapshot object
            goal: Optional user goal (for context, currently unused but kept for API consistency)

        Returns:
            Formatted element context string, one line per element, joined
            with newlines (empty string when there are no elements)
        """
        lines = []
        for el in snap.elements:
            # Skip REMOVED elements - they're not actionable and shouldn't be in LLM context
            if el.diff_status == "REMOVED":
                continue

            # Extract visual cues
            cues: list[str] = []
            if el.visual_cues.is_primary:
                cues.append("PRIMARY")
            if el.visual_cues.is_clickable:
                cues.append("CLICKABLE")
            if el.visual_cues.background_color_name:
                cues.append(f"color:{el.visual_cues.background_color_name}")

            # Rendered as " {A,B}" or omitted entirely when there are no cues
            cues_str = f" {{{','.join(cues)}}}" if cues else ""

            # Quote the text and mark truncation beyond 50 characters
            text_preview = ""
            if el.text:
                if len(el.text) > 50:
                    text_preview = f'"{el.text[:50]}..."'
                else:
                    text_preview = f'"{el.text}"'

            # Build position and size info (truncated to whole pixels)
            x, y = int(el.bbox.x), int(el.bbox.y)
            width, height = int(el.bbox.width), int(el.bbox.height)
            position_str = f"@ ({x},{y})"
            size_str = f"size:{width}x{height}"

            # Build status indicators (only include if relevant)
            status_parts = []
            if not el.in_viewport:
                status_parts.append("not_in_viewport")
            if el.is_occluded:
                status_parts.append("occluded")
            if el.diff_status:
                status_parts.append(f"diff:{el.diff_status}")
            status_str = f" [{','.join(status_parts)}]" if status_parts else ""

            # Format: [ID] <role> "text" {cues} @ (x,y) size:WxH importance:score [status]
            lines.append(
                f"[{el.id}] <{el.role}> {text_preview}{cues_str} "
                f"{position_str} {size_str} importance:{el.importance}{status_str}"
            )

        return "\n".join(lines)

    def query_llm(self, dom_context: str, goal: str) -> LLMResponse:
        """
        Query LLM with standardized prompt template.

        The goal and element context are embedded in the system prompt; the
        user prompt is a fixed one-line request, and the provider is called
        with temperature 0.0.

        Args:
            dom_context: Formatted element context from build_context()
            goal: User goal

        Returns:
            LLMResponse from LLM provider
        """
        system_prompt = f"""You are an AI web automation agent.

GOAL: {goal}

VISIBLE ELEMENTS (sorted by importance):
{dom_context}

VISUAL CUES EXPLAINED:
After the text, you may see visual cues in curly braces like {{CLICKABLE}} or {{PRIMARY,CLICKABLE,color:white}}:
- PRIMARY: Main call-to-action element on the page
- CLICKABLE: Element is clickable/interactive
- color:X: Background color name (e.g., color:white, color:blue)
Multiple cues are comma-separated inside the braces: {{CLICKABLE,color:white}}

ELEMENT FORMAT EXPLAINED:
Each element line follows this format:
[ID] <role> "text" {{cues}} @ (x,y) size:WxH importance:score [status]

Example: [346] <button> "Computer Accessories" {{CLICKABLE,color:white}} @ (664,100) size:150x40 importance:811

Breaking down each part:
- [ID]: The number in brackets is the element ID - use this EXACT number in CLICK/TYPE commands
Example: If you see [346], use CLICK(346) or TYPE(346, "text")
- <role>: Element type (button, link, textbox, etc.)
- "text": Visible text content (truncated with "..." if long)
- {{cues}}: Optional visual cues in curly braces (e.g., {{CLICKABLE}}, {{PRIMARY,CLICKABLE}}, {{CLICKABLE,color:white}})
If no cues, this part is omitted entirely
- @ (x,y): Element position in pixels from top-left corner
- size:WxH: Element dimensions (width x height in pixels)
- importance: Score indicating element relevance (higher = more important)
- [status]: Optional status flags in brackets (not_in_viewport, occluded, diff:ADDED/MODIFIED/etc)

CRITICAL RESPONSE FORMAT:
You MUST respond with ONLY ONE of these exact action formats:
- CLICK(id) - Click element by ID (use the number from [ID] brackets)
- TYPE(id, "text") - Type text into element (use the number from [ID] brackets)
- PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
- FINISH() - Task complete

DO NOT include any explanation, reasoning, or natural language.
DO NOT use markdown formatting or code blocks.
DO NOT say "The next step is..." or anything similar.

CORRECT Examples (matching element IDs from the list above):
If element is [346] <button> "Click me" → respond: CLICK(346)
If element is [15] <textbox> "Search" → respond: TYPE(15, "magic mouse")
PRESS("Enter")
FINISH()

INCORRECT Examples (DO NOT DO THIS):
"The next step is to click..."
"I will type..."
```CLICK(42)```
"""

        user_prompt = "Return the single action command:"

        return self.llm.generate(system_prompt, user_prompt, temperature=0.0)

    def extract_action(self, response: str) -> str:
        """
        Extract action command from LLM response.

        Handles cases where the LLM adds extra explanation or wraps the action
        in Markdown code fences despite instructions. The first action found
        (matched case-insensitively) is returned exactly as written.

        Args:
            response: Raw LLM response text

        Returns:
            Cleaned action command string (e.g., "CLICK(42)", "TYPE(15, \"text\")").
            If no action pattern is found, the fence-stripped, whitespace-trimmed
            response is returned unchanged (will likely fail parsing downstream).
        """
        # Remove markdown code fences if present, without eating an action
        # that directly follows an inline fence.
        cleaned = self._CODE_FENCE_RE.sub("", response).strip()

        match = self._ACTION_RE.search(cleaned)
        if match:
            return match.group(1)

        # If no pattern match, return the cleaned response as-is
        return cleaned
|