sentienceapi 0.90.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sentienceapi might be problematic. Click here for more details.
- sentience/__init__.py +153 -0
- sentience/_extension_loader.py +40 -0
- sentience/actions.py +837 -0
- sentience/agent.py +1246 -0
- sentience/agent_config.py +43 -0
- sentience/async_api.py +101 -0
- sentience/base_agent.py +194 -0
- sentience/browser.py +1037 -0
- sentience/cli.py +130 -0
- sentience/cloud_tracing.py +382 -0
- sentience/conversational_agent.py +509 -0
- sentience/expect.py +188 -0
- sentience/extension/background.js +233 -0
- sentience/extension/content.js +298 -0
- sentience/extension/injected_api.js +1473 -0
- sentience/extension/manifest.json +36 -0
- sentience/extension/pkg/sentience_core.d.ts +51 -0
- sentience/extension/pkg/sentience_core.js +529 -0
- sentience/extension/pkg/sentience_core_bg.wasm +0 -0
- sentience/extension/pkg/sentience_core_bg.wasm.d.ts +10 -0
- sentience/extension/release.json +115 -0
- sentience/extension/test-content.js +4 -0
- sentience/formatting.py +59 -0
- sentience/generator.py +202 -0
- sentience/inspector.py +365 -0
- sentience/llm_provider.py +637 -0
- sentience/models.py +412 -0
- sentience/overlay.py +222 -0
- sentience/query.py +303 -0
- sentience/read.py +185 -0
- sentience/recorder.py +589 -0
- sentience/schemas/trace_v1.json +216 -0
- sentience/screenshot.py +100 -0
- sentience/snapshot.py +516 -0
- sentience/text_search.py +290 -0
- sentience/trace_indexing/__init__.py +27 -0
- sentience/trace_indexing/index_schema.py +111 -0
- sentience/trace_indexing/indexer.py +357 -0
- sentience/tracer_factory.py +211 -0
- sentience/tracing.py +285 -0
- sentience/utils.py +296 -0
- sentience/wait.py +137 -0
- sentienceapi-0.90.17.dist-info/METADATA +917 -0
- sentienceapi-0.90.17.dist-info/RECORD +50 -0
- sentienceapi-0.90.17.dist-info/WHEEL +5 -0
- sentienceapi-0.90.17.dist-info/entry_points.txt +2 -0
- sentienceapi-0.90.17.dist-info/licenses/LICENSE +24 -0
- sentienceapi-0.90.17.dist-info/licenses/LICENSE-APACHE +201 -0
- sentienceapi-0.90.17.dist-info/licenses/LICENSE-MIT +21 -0
- sentienceapi-0.90.17.dist-info/top_level.txt +1 -0
sentience/agent.py
ADDED
|
@@ -0,0 +1,1246 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Sentience Agent: High-level automation agent using LLM + SDK
|
|
3
|
+
Implements observe-think-act loop for natural language commands
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import asyncio
|
|
7
|
+
import re
|
|
8
|
+
import time
|
|
9
|
+
from typing import TYPE_CHECKING, Any, Optional
|
|
10
|
+
|
|
11
|
+
from .actions import click, click_async, press, press_async, type_text, type_text_async
|
|
12
|
+
from .base_agent import BaseAgent, BaseAgentAsync
|
|
13
|
+
from .browser import AsyncSentienceBrowser, SentienceBrowser
|
|
14
|
+
from .llm_provider import LLMProvider, LLMResponse
|
|
15
|
+
from .models import (
|
|
16
|
+
ActionHistory,
|
|
17
|
+
ActionTokenUsage,
|
|
18
|
+
AgentActionResult,
|
|
19
|
+
Element,
|
|
20
|
+
ScreenshotConfig,
|
|
21
|
+
Snapshot,
|
|
22
|
+
SnapshotOptions,
|
|
23
|
+
TokenStats,
|
|
24
|
+
)
|
|
25
|
+
from .snapshot import snapshot, snapshot_async
|
|
26
|
+
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from .agent_config import AgentConfig
|
|
29
|
+
from .tracing import Tracer
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class SentienceAgent(BaseAgent):
|
|
33
|
+
"""
|
|
34
|
+
High-level agent that combines Sentience SDK with any LLM provider.
|
|
35
|
+
|
|
36
|
+
Uses observe-think-act loop to execute natural language commands:
|
|
37
|
+
1. OBSERVE: Get snapshot of current page state
|
|
38
|
+
2. THINK: Query LLM to decide next action
|
|
39
|
+
3. ACT: Execute action using SDK
|
|
40
|
+
|
|
41
|
+
Example:
|
|
42
|
+
>>> from sentience import SentienceBrowser, SentienceAgent
|
|
43
|
+
>>> from sentience.llm_provider import OpenAIProvider
|
|
44
|
+
>>>
|
|
45
|
+
>>> browser = SentienceBrowser(api_key="sentience_key")
|
|
46
|
+
>>> llm = OpenAIProvider(api_key="openai_key", model="gpt-4o")
|
|
47
|
+
>>> agent = SentienceAgent(browser, llm)
|
|
48
|
+
>>>
|
|
49
|
+
>>> with browser:
|
|
50
|
+
>>> browser.page.goto("https://google.com")
|
|
51
|
+
>>> agent.act("Click the search box")
|
|
52
|
+
>>> agent.act("Type 'magic mouse' into the search field")
|
|
53
|
+
>>> agent.act("Press Enter key")
|
|
54
|
+
"""
|
|
55
|
+
|
|
56
|
+
def __init__(
|
|
57
|
+
self,
|
|
58
|
+
browser: SentienceBrowser,
|
|
59
|
+
llm: LLMProvider,
|
|
60
|
+
default_snapshot_limit: int = 50,
|
|
61
|
+
verbose: bool = True,
|
|
62
|
+
tracer: Optional["Tracer"] = None,
|
|
63
|
+
config: Optional["AgentConfig"] = None,
|
|
64
|
+
):
|
|
65
|
+
"""
|
|
66
|
+
Initialize Sentience Agent
|
|
67
|
+
|
|
68
|
+
Args:
|
|
69
|
+
browser: SentienceBrowser instance
|
|
70
|
+
llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.)
|
|
71
|
+
default_snapshot_limit: Default maximum elements to include in context (default: 50)
|
|
72
|
+
verbose: Print execution logs (default: True)
|
|
73
|
+
tracer: Optional Tracer instance for execution tracking (default: None)
|
|
74
|
+
config: Optional AgentConfig for advanced configuration (default: None)
|
|
75
|
+
"""
|
|
76
|
+
self.browser = browser
|
|
77
|
+
self.llm = llm
|
|
78
|
+
self.default_snapshot_limit = default_snapshot_limit
|
|
79
|
+
self.verbose = verbose
|
|
80
|
+
self.tracer = tracer
|
|
81
|
+
self.config = config
|
|
82
|
+
|
|
83
|
+
# Execution history
|
|
84
|
+
self.history: list[dict[str, Any]] = []
|
|
85
|
+
|
|
86
|
+
# Token usage tracking (will be converted to TokenStats on get_token_stats())
|
|
87
|
+
self._token_usage_raw = {
|
|
88
|
+
"total_prompt_tokens": 0,
|
|
89
|
+
"total_completion_tokens": 0,
|
|
90
|
+
"total_tokens": 0,
|
|
91
|
+
"by_action": [],
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
# Step counter for tracing
|
|
95
|
+
self._step_count = 0
|
|
96
|
+
|
|
97
|
+
def act(  # noqa: C901
    self,
    goal: str,
    max_retries: int = 2,
    snapshot_options: SnapshotOptions | None = None,
) -> AgentActionResult:
    """
    Execute a high-level goal using observe → think → act loop

    Args:
        goal: Natural language instruction (e.g., "Click the Sign In button")
        max_retries: Number of retries on failure (default: 2)
        snapshot_options: Optional SnapshotOptions for this specific action

    Returns:
        AgentActionResult with execution details

    Raises:
        RuntimeError: If all attempts fail (after ``max_retries`` retries),
            wrapping the last underlying exception's message.

    Example:
        >>> result = agent.act("Click the search box")
        >>> print(result.success, result.action, result.element_id)
        True click 42
        >>> # Backward compatible dict access
        >>> print(result["element_id"])  # Works but shows deprecation warning
        42
    """
    if self.verbose:
        print(f"\n{'=' * 70}")
        print(f"🤖 Agent Goal: {goal}")
        print(f"{'=' * 70}")

    # Generate step ID for tracing (one step per act() call, not per attempt)
    self._step_count += 1
    step_id = f"step-{self._step_count}"

    # Emit step_start trace event if tracer is enabled
    if self.tracer:
        pre_url = self.browser.page.url if self.browser.page else None
        self.tracer.emit_step_start(
            step_id=step_id,
            step_index=self._step_count,
            goal=goal,
            attempt=0,
            pre_url=pre_url,
        )

    for attempt in range(max_retries + 1):
        try:
            # 1. OBSERVE: Get refined semantic snapshot
            start_time = time.time()

            # Use provided options or create default
            snap_opts = snapshot_options or SnapshotOptions(limit=self.default_snapshot_limit)
            # Only set goal if not already provided
            # NOTE(review): when snapshot_options is caller-supplied, this
            # mutates the caller's object — confirm intended.
            if snap_opts.goal is None:
                snap_opts.goal = goal

            # Call snapshot with options object (matches TypeScript API)
            snap = snapshot(self.browser, snap_opts)

            if snap.status != "success":
                raise RuntimeError(f"Snapshot failed: {snap.error}")

            # Apply element filtering based on goal
            filtered_elements = self.filter_elements(snap, goal)

            # Emit snapshot trace event if tracer is enabled
            if self.tracer:
                # Include element data for live overlay visualization
                # Use filtered_elements for overlay (only relevant elements)
                elements_data = [
                    {
                        "id": el.id,
                        "bbox": {
                            "x": el.bbox.x,
                            "y": el.bbox.y,
                            "width": el.bbox.width,
                            "height": el.bbox.height,
                        },
                        "role": el.role,
                        "text": el.text[:50] if el.text else "",  # Truncate for brevity
                    }
                    for el in filtered_elements[:50]  # Limit to first 50 for performance
                ]

                self.tracer.emit(
                    "snapshot",
                    {
                        "url": snap.url,
                        "element_count": len(snap.elements),
                        "timestamp": snap.timestamp,
                        "elements": elements_data,  # Add element data for overlay
                    },
                    step_id=step_id,
                )

            # Create filtered snapshot (same metadata, filtered element list)
            filtered_snap = Snapshot(
                status=snap.status,
                timestamp=snap.timestamp,
                url=snap.url,
                viewport=snap.viewport,
                elements=filtered_elements,
                screenshot=snap.screenshot,
                screenshot_format=snap.screenshot_format,
                error=snap.error,
            )

            # 2. GROUND: Format elements for LLM context
            context = self._build_context(filtered_snap, goal)

            # 3. THINK: Query LLM for next action
            llm_response = self._query_llm(context, goal)

            # Emit LLM query trace event if tracer is enabled
            if self.tracer:
                self.tracer.emit(
                    "llm_query",
                    {
                        "prompt_tokens": llm_response.prompt_tokens,
                        "completion_tokens": llm_response.completion_tokens,
                        "model": llm_response.model_name,
                        "response": llm_response.content[:200],  # Truncate for brevity
                    },
                    step_id=step_id,
                )

            if self.verbose:
                print(f"🧠 LLM Decision: {llm_response.content}")

            # Track token usage
            self._track_tokens(goal, llm_response)

            # Parse action from LLM response (strips fences/explanations)
            action_str = self._extract_action_from_response(llm_response.content)

            # 4. EXECUTE: Parse and run action (raises ValueError on unknown format)
            result_dict = self._execute_action(action_str, filtered_snap)

            duration_ms = int((time.time() - start_time) * 1000)

            # Create AgentActionResult from execution result
            result = AgentActionResult(
                success=result_dict["success"],
                action=result_dict["action"],
                goal=goal,
                duration_ms=duration_ms,
                attempt=attempt,
                element_id=result_dict.get("element_id"),
                text=result_dict.get("text"),
                key=result_dict.get("key"),
                outcome=result_dict.get("outcome"),
                url_changed=result_dict.get("url_changed"),
                error=result_dict.get("error"),
                message=result_dict.get("message"),
            )

            # Emit action execution trace event if tracer is enabled
            if self.tracer:
                post_url = self.browser.page.url if self.browser.page else None

                # Include element data for live overlay visualization
                elements_data = [
                    {
                        "id": el.id,
                        "bbox": {
                            "x": el.bbox.x,
                            "y": el.bbox.y,
                            "width": el.bbox.width,
                            "height": el.bbox.height,
                        },
                        "role": el.role,
                        "text": el.text[:50] if el.text else "",
                    }
                    for el in filtered_snap.elements[:50]
                ]

                self.tracer.emit(
                    "action",
                    {
                        "action": result.action,
                        "element_id": result.element_id,
                        "success": result.success,
                        "outcome": result.outcome,
                        "duration_ms": duration_ms,
                        "post_url": post_url,
                        "elements": elements_data,  # Add element data for overlay
                        "target_element_id": result.element_id,  # Highlight target in red
                    },
                    step_id=step_id,
                )

            # 5. RECORD: Track history
            self.history.append(
                {
                    "goal": goal,
                    "action": action_str,
                    "result": result.model_dump(),  # Store as dict
                    "success": result.success,
                    "attempt": attempt,
                    "duration_ms": duration_ms,
                }
            )

            if self.verbose:
                status = "✅" if result.success else "❌"
                print(f"{status} Completed in {duration_ms}ms")

            # Emit step completion trace event if tracer is enabled
            if self.tracer:
                self.tracer.emit(
                    "step_end",
                    {
                        "success": result.success,
                        "duration_ms": duration_ms,
                        "action": result.action,
                    },
                    step_id=step_id,
                )

            # NOTE: returns even when result.success is False — only exceptions retry.
            return result

        except Exception as e:
            # Emit error trace event if tracer is enabled
            if self.tracer:
                self.tracer.emit_error(step_id=step_id, error=str(e), attempt=attempt)

            if attempt < max_retries:
                if self.verbose:
                    print(f"⚠️ Retry {attempt + 1}/{max_retries}: {e}")
                time.sleep(1.0)  # Brief delay before retry
                continue
            else:
                # Create error result (recorded in history before re-raising)
                error_result = AgentActionResult(
                    success=False,
                    action="error",
                    goal=goal,
                    duration_ms=0,
                    attempt=attempt,
                    error=str(e),
                )
                self.history.append(
                    {
                        "goal": goal,
                        "action": "error",
                        "result": error_result.model_dump(),
                        "success": False,
                        "attempt": attempt,
                        "duration_ms": 0,
                    }
                )
                raise RuntimeError(f"Failed after {max_retries} retries: {e}")
|
|
349
|
+
|
|
350
|
+
def _build_context(self, snap: Snapshot, goal: str) -> str:
|
|
351
|
+
"""
|
|
352
|
+
Convert snapshot elements to token-efficient prompt string
|
|
353
|
+
|
|
354
|
+
Format: [ID] <role> "text" {cues} @ (x,y) (Imp:score)
|
|
355
|
+
|
|
356
|
+
Args:
|
|
357
|
+
snap: Snapshot object
|
|
358
|
+
goal: User goal (for context)
|
|
359
|
+
|
|
360
|
+
Returns:
|
|
361
|
+
Formatted element context string
|
|
362
|
+
"""
|
|
363
|
+
lines = []
|
|
364
|
+
# Note: elements are already filtered by filter_elements() in act()
|
|
365
|
+
for el in snap.elements:
|
|
366
|
+
# Extract visual cues
|
|
367
|
+
cues = []
|
|
368
|
+
if el.visual_cues.is_primary:
|
|
369
|
+
cues.append("PRIMARY")
|
|
370
|
+
if el.visual_cues.is_clickable:
|
|
371
|
+
cues.append("CLICKABLE")
|
|
372
|
+
if el.visual_cues.background_color_name:
|
|
373
|
+
cues.append(f"color:{el.visual_cues.background_color_name}")
|
|
374
|
+
|
|
375
|
+
# Format element line
|
|
376
|
+
cues_str = f" {{{','.join(cues)}}}" if cues else ""
|
|
377
|
+
text_preview = (
|
|
378
|
+
(el.text[:50] + "...") if el.text and len(el.text) > 50 else (el.text or "")
|
|
379
|
+
)
|
|
380
|
+
|
|
381
|
+
lines.append(
|
|
382
|
+
f'[{el.id}] <{el.role}> "{text_preview}"{cues_str} '
|
|
383
|
+
f"@ ({int(el.bbox.x)},{int(el.bbox.y)}) (Imp:{el.importance})"
|
|
384
|
+
)
|
|
385
|
+
|
|
386
|
+
return "\n".join(lines)
|
|
387
|
+
|
|
388
|
+
def _extract_action_from_response(self, response: str) -> str:
|
|
389
|
+
"""
|
|
390
|
+
Extract action command from LLM response, handling cases where
|
|
391
|
+
the LLM adds extra explanation despite instructions.
|
|
392
|
+
|
|
393
|
+
Args:
|
|
394
|
+
response: Raw LLM response text
|
|
395
|
+
|
|
396
|
+
Returns:
|
|
397
|
+
Cleaned action command string
|
|
398
|
+
"""
|
|
399
|
+
import re
|
|
400
|
+
|
|
401
|
+
# Remove markdown code blocks if present
|
|
402
|
+
response = re.sub(r"```[\w]*\n?", "", response)
|
|
403
|
+
response = response.strip()
|
|
404
|
+
|
|
405
|
+
# Try to find action patterns in the response
|
|
406
|
+
# Pattern matches: CLICK(123), TYPE(123, "text"), PRESS("key"), FINISH()
|
|
407
|
+
action_pattern = r'(CLICK\s*\(\s*\d+\s*\)|TYPE\s*\(\s*\d+\s*,\s*["\'].*?["\']\s*\)|PRESS\s*\(\s*["\'].*?["\']\s*\)|FINISH\s*\(\s*\))'
|
|
408
|
+
|
|
409
|
+
match = re.search(action_pattern, response, re.IGNORECASE)
|
|
410
|
+
if match:
|
|
411
|
+
return match.group(1)
|
|
412
|
+
|
|
413
|
+
# If no pattern match, return the original response (will likely fail parsing)
|
|
414
|
+
return response
|
|
415
|
+
|
|
416
|
+
def _query_llm(self, dom_context: str, goal: str) -> LLMResponse:
|
|
417
|
+
"""
|
|
418
|
+
Query LLM with standardized prompt template
|
|
419
|
+
|
|
420
|
+
Args:
|
|
421
|
+
dom_context: Formatted element context
|
|
422
|
+
goal: User goal
|
|
423
|
+
|
|
424
|
+
Returns:
|
|
425
|
+
LLMResponse from LLM provider
|
|
426
|
+
"""
|
|
427
|
+
system_prompt = f"""You are an AI web automation agent.
|
|
428
|
+
|
|
429
|
+
GOAL: {goal}
|
|
430
|
+
|
|
431
|
+
VISIBLE ELEMENTS (sorted by importance):
|
|
432
|
+
{dom_context}
|
|
433
|
+
|
|
434
|
+
VISUAL CUES EXPLAINED:
|
|
435
|
+
- {{PRIMARY}}: Main call-to-action element on the page
|
|
436
|
+
- {{CLICKABLE}}: Element is clickable
|
|
437
|
+
- {{color:X}}: Background color name
|
|
438
|
+
|
|
439
|
+
CRITICAL RESPONSE FORMAT:
|
|
440
|
+
You MUST respond with ONLY ONE of these exact action formats:
|
|
441
|
+
- CLICK(id) - Click element by ID
|
|
442
|
+
- TYPE(id, "text") - Type text into element
|
|
443
|
+
- PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
|
|
444
|
+
- FINISH() - Task complete
|
|
445
|
+
|
|
446
|
+
DO NOT include any explanation, reasoning, or natural language.
|
|
447
|
+
DO NOT use markdown formatting or code blocks.
|
|
448
|
+
DO NOT say "The next step is..." or anything similar.
|
|
449
|
+
|
|
450
|
+
CORRECT Examples:
|
|
451
|
+
CLICK(42)
|
|
452
|
+
TYPE(15, "magic mouse")
|
|
453
|
+
PRESS("Enter")
|
|
454
|
+
FINISH()
|
|
455
|
+
|
|
456
|
+
INCORRECT Examples (DO NOT DO THIS):
|
|
457
|
+
"The next step is to click..."
|
|
458
|
+
"I will type..."
|
|
459
|
+
```CLICK(42)```
|
|
460
|
+
"""
|
|
461
|
+
|
|
462
|
+
user_prompt = "Return the single action command:"
|
|
463
|
+
|
|
464
|
+
return self.llm.generate(system_prompt, user_prompt, temperature=0.0)
|
|
465
|
+
|
|
466
|
+
def _execute_action(self, action_str: str, snap: Snapshot) -> dict[str, Any]:
|
|
467
|
+
"""
|
|
468
|
+
Parse action string and execute SDK call
|
|
469
|
+
|
|
470
|
+
Args:
|
|
471
|
+
action_str: Action string from LLM (e.g., "CLICK(42)")
|
|
472
|
+
snap: Current snapshot (for context)
|
|
473
|
+
|
|
474
|
+
Returns:
|
|
475
|
+
Execution result dictionary
|
|
476
|
+
"""
|
|
477
|
+
# Parse CLICK(42)
|
|
478
|
+
if match := re.match(r"CLICK\s*\(\s*(\d+)\s*\)", action_str, re.IGNORECASE):
|
|
479
|
+
element_id = int(match.group(1))
|
|
480
|
+
result = click(self.browser, element_id)
|
|
481
|
+
return {
|
|
482
|
+
"success": result.success,
|
|
483
|
+
"action": "click",
|
|
484
|
+
"element_id": element_id,
|
|
485
|
+
"outcome": result.outcome,
|
|
486
|
+
"url_changed": result.url_changed,
|
|
487
|
+
}
|
|
488
|
+
|
|
489
|
+
# Parse TYPE(42, "hello world")
|
|
490
|
+
elif match := re.match(
|
|
491
|
+
r'TYPE\s*\(\s*(\d+)\s*,\s*["\']([^"\']*)["\']\s*\)',
|
|
492
|
+
action_str,
|
|
493
|
+
re.IGNORECASE,
|
|
494
|
+
):
|
|
495
|
+
element_id = int(match.group(1))
|
|
496
|
+
text = match.group(2)
|
|
497
|
+
result = type_text(self.browser, element_id, text)
|
|
498
|
+
return {
|
|
499
|
+
"success": result.success,
|
|
500
|
+
"action": "type",
|
|
501
|
+
"element_id": element_id,
|
|
502
|
+
"text": text,
|
|
503
|
+
"outcome": result.outcome,
|
|
504
|
+
}
|
|
505
|
+
|
|
506
|
+
# Parse PRESS("Enter")
|
|
507
|
+
elif match := re.match(r'PRESS\s*\(\s*["\']([^"\']+)["\']\s*\)', action_str, re.IGNORECASE):
|
|
508
|
+
key = match.group(1)
|
|
509
|
+
result = press(self.browser, key)
|
|
510
|
+
return {
|
|
511
|
+
"success": result.success,
|
|
512
|
+
"action": "press",
|
|
513
|
+
"key": key,
|
|
514
|
+
"outcome": result.outcome,
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
# Parse FINISH()
|
|
518
|
+
elif re.match(r"FINISH\s*\(\s*\)", action_str, re.IGNORECASE):
|
|
519
|
+
return {
|
|
520
|
+
"success": True,
|
|
521
|
+
"action": "finish",
|
|
522
|
+
"message": "Task marked as complete",
|
|
523
|
+
}
|
|
524
|
+
|
|
525
|
+
else:
|
|
526
|
+
raise ValueError(
|
|
527
|
+
f"Unknown action format: {action_str}\n"
|
|
528
|
+
f'Expected: CLICK(id), TYPE(id, "text"), PRESS("key"), or FINISH()'
|
|
529
|
+
)
|
|
530
|
+
|
|
531
|
+
def _track_tokens(self, goal: str, llm_response: LLMResponse):
|
|
532
|
+
"""
|
|
533
|
+
Track token usage for analytics
|
|
534
|
+
|
|
535
|
+
Args:
|
|
536
|
+
goal: User goal
|
|
537
|
+
llm_response: LLM response with token usage
|
|
538
|
+
"""
|
|
539
|
+
if llm_response.prompt_tokens:
|
|
540
|
+
self._token_usage_raw["total_prompt_tokens"] += llm_response.prompt_tokens
|
|
541
|
+
if llm_response.completion_tokens:
|
|
542
|
+
self._token_usage_raw["total_completion_tokens"] += llm_response.completion_tokens
|
|
543
|
+
if llm_response.total_tokens:
|
|
544
|
+
self._token_usage_raw["total_tokens"] += llm_response.total_tokens
|
|
545
|
+
|
|
546
|
+
self._token_usage_raw["by_action"].append(
|
|
547
|
+
{
|
|
548
|
+
"goal": goal,
|
|
549
|
+
"prompt_tokens": llm_response.prompt_tokens or 0,
|
|
550
|
+
"completion_tokens": llm_response.completion_tokens or 0,
|
|
551
|
+
"total_tokens": llm_response.total_tokens or 0,
|
|
552
|
+
"model": llm_response.model_name,
|
|
553
|
+
}
|
|
554
|
+
)
|
|
555
|
+
|
|
556
|
+
def get_token_stats(self) -> TokenStats:
    """
    Build a TokenStats summary from the raw token counters.

    Returns:
        TokenStats with aggregate totals and a per-action breakdown.
    """
    raw = self._token_usage_raw
    per_action = [ActionTokenUsage(**entry) for entry in raw["by_action"]]
    return TokenStats(
        total_prompt_tokens=raw["total_prompt_tokens"],
        total_completion_tokens=raw["total_completion_tokens"],
        total_tokens=raw["total_tokens"],
        by_action=per_action,
    )
|
|
570
|
+
|
|
571
|
+
def get_history(self) -> list[ActionHistory]:
    """
    Return the execution history as typed records.

    Returns:
        List of ActionHistory entries, one per act() invocation.
    """
    return [ActionHistory(**entry) for entry in self.history]
|
|
579
|
+
|
|
580
|
+
def clear_history(self) -> None:
    """Reset the execution history and all token counters to a pristine state."""
    self.history.clear()
    # Rebuild the raw usage dict (same keys/order as __init__).
    fresh: dict[str, Any] = {
        key: 0
        for key in ("total_prompt_tokens", "total_completion_tokens", "total_tokens")
    }
    fresh["by_action"] = []
    self._token_usage_raw = fresh
|
|
589
|
+
|
|
590
|
+
def filter_elements(self, snapshot: Snapshot, goal: str | None = None) -> list[Element]:
    """
    Filter and re-rank snapshot elements using goal keywords.

    Keyword hits in an element's text boost its score; alignment between
    the goal's intent ("click", "type", "search") and the element's role
    adjusts it further. Without a goal, elements pass through unchanged
    up to the configured limit.

    Args:
        snapshot: Current page snapshot
        goal: User's goal (can inform filtering)

    Returns:
        Filtered list of elements (at most ``default_snapshot_limit``)
    """
    limit = self.default_snapshot_limit
    candidates = snapshot.elements

    # No goal: nothing to rank against, just cap the count.
    if not goal:
        return candidates[:limit]

    goal_lower = goal.lower()
    keywords = self._extract_keywords(goal_lower)

    def relevance(el):
        score = el.importance
        # Direct keyword hit in the element's text.
        if el.text and any(kw in el.text.lower() for kw in keywords):
            score += 0.3
        # Role/intent alignment with the goal.
        if "click" in goal_lower and el.visual_cues.is_clickable:
            score += 0.2
        if "type" in goal_lower and el.role in ["textbox", "searchbox"]:
            score += 0.2
        if "search" in goal_lower:
            # Demote non-interactive elements for search tasks.
            if el.role in ["link", "img"] and not el.visual_cues.is_primary:
                score -= 0.5
        return score

    # Stable sort: ties keep their original snapshot order, same as before.
    ranked = sorted(candidates, key=relevance, reverse=True)
    return ranked[:limit]
|
|
641
|
+
|
|
642
|
+
def _extract_keywords(self, text: str) -> list[str]:
|
|
643
|
+
"""
|
|
644
|
+
Extract meaningful keywords from goal text
|
|
645
|
+
|
|
646
|
+
Args:
|
|
647
|
+
text: Text to extract keywords from
|
|
648
|
+
|
|
649
|
+
Returns:
|
|
650
|
+
List of keywords
|
|
651
|
+
"""
|
|
652
|
+
stopwords = {
|
|
653
|
+
"the",
|
|
654
|
+
"a",
|
|
655
|
+
"an",
|
|
656
|
+
"and",
|
|
657
|
+
"or",
|
|
658
|
+
"but",
|
|
659
|
+
"in",
|
|
660
|
+
"on",
|
|
661
|
+
"at",
|
|
662
|
+
"to",
|
|
663
|
+
"for",
|
|
664
|
+
"of",
|
|
665
|
+
"with",
|
|
666
|
+
"by",
|
|
667
|
+
"from",
|
|
668
|
+
"as",
|
|
669
|
+
"is",
|
|
670
|
+
"was",
|
|
671
|
+
}
|
|
672
|
+
words = text.split()
|
|
673
|
+
return [w for w in words if w not in stopwords and len(w) > 2]
|
|
674
|
+
|
|
675
|
+
|
|
676
|
+
class SentienceAgentAsync(BaseAgentAsync):
|
|
677
|
+
"""
|
|
678
|
+
High-level async agent that combines Sentience SDK with any LLM provider.
|
|
679
|
+
|
|
680
|
+
Uses observe-think-act loop to execute natural language commands:
|
|
681
|
+
1. OBSERVE: Get snapshot of current page state
|
|
682
|
+
2. THINK: Query LLM to decide next action
|
|
683
|
+
3. ACT: Execute action using SDK
|
|
684
|
+
|
|
685
|
+
Example:
|
|
686
|
+
>>> from sentience.async_api import AsyncSentienceBrowser
|
|
687
|
+
>>> from sentience.agent import SentienceAgentAsync
|
|
688
|
+
>>> from sentience.llm_provider import OpenAIProvider
|
|
689
|
+
>>>
|
|
690
|
+
>>> async with AsyncSentienceBrowser() as browser:
|
|
691
|
+
>>> await browser.goto("https://google.com")
|
|
692
|
+
>>> llm = OpenAIProvider(api_key="openai_key", model="gpt-4o")
|
|
693
|
+
>>> agent = SentienceAgentAsync(browser, llm)
|
|
694
|
+
>>> await agent.act("Click the search box")
|
|
695
|
+
>>> await agent.act("Type 'magic mouse' into the search field")
|
|
696
|
+
>>> await agent.act("Press Enter key")
|
|
697
|
+
"""
|
|
698
|
+
|
|
699
|
+
def __init__(
|
|
700
|
+
self,
|
|
701
|
+
browser: AsyncSentienceBrowser,
|
|
702
|
+
llm: LLMProvider,
|
|
703
|
+
default_snapshot_limit: int = 50,
|
|
704
|
+
verbose: bool = True,
|
|
705
|
+
tracer: Optional["Tracer"] = None,
|
|
706
|
+
config: Optional["AgentConfig"] = None,
|
|
707
|
+
):
|
|
708
|
+
"""
|
|
709
|
+
Initialize Sentience Agent (async)
|
|
710
|
+
|
|
711
|
+
Args:
|
|
712
|
+
browser: AsyncSentienceBrowser instance
|
|
713
|
+
llm: LLM provider (OpenAIProvider, AnthropicProvider, etc.)
|
|
714
|
+
default_snapshot_limit: Default maximum elements to include in context (default: 50)
|
|
715
|
+
verbose: Print execution logs (default: True)
|
|
716
|
+
tracer: Optional Tracer instance for execution tracking (default: None)
|
|
717
|
+
config: Optional AgentConfig for advanced configuration (default: None)
|
|
718
|
+
"""
|
|
719
|
+
self.browser = browser
|
|
720
|
+
self.llm = llm
|
|
721
|
+
self.default_snapshot_limit = default_snapshot_limit
|
|
722
|
+
self.verbose = verbose
|
|
723
|
+
self.tracer = tracer
|
|
724
|
+
self.config = config
|
|
725
|
+
|
|
726
|
+
# Execution history
|
|
727
|
+
self.history: list[dict[str, Any]] = []
|
|
728
|
+
|
|
729
|
+
# Token usage tracking (will be converted to TokenStats on get_token_stats())
|
|
730
|
+
self._token_usage_raw = {
|
|
731
|
+
"total_prompt_tokens": 0,
|
|
732
|
+
"total_completion_tokens": 0,
|
|
733
|
+
"total_tokens": 0,
|
|
734
|
+
"by_action": [],
|
|
735
|
+
}
|
|
736
|
+
|
|
737
|
+
# Step counter for tracing
|
|
738
|
+
self._step_count = 0
|
|
739
|
+
|
|
740
|
+
async def act(  # noqa: C901
    self,
    goal: str,
    max_retries: int = 2,
    snapshot_options: SnapshotOptions | None = None,
) -> AgentActionResult:
    """
    Execute a high-level goal using observe → think → act loop (async)

    Per attempt: take a semantic snapshot, filter/rank elements against the
    goal, build a compact text context, ask the LLM for exactly one action
    command, parse it, and execute it via the async action helpers. Trace
    events are emitted at each stage when a tracer is attached.

    Args:
        goal: Natural language instruction (e.g., "Click the Sign In button")
        max_retries: Number of retries on failure (default: 2)
        snapshot_options: Optional SnapshotOptions for this specific action

    Returns:
        AgentActionResult with execution details

    Raises:
        RuntimeError: If all max_retries + 1 attempts fail (the last error is
            also recorded in self.history as an "error" entry).

    Example:
        >>> result = await agent.act("Click the search box")
        >>> print(result.success, result.action, result.element_id)
        True click 42
    """
    if self.verbose:
        print(f"\n{'=' * 70}")
        print(f"🤖 Agent Goal: {goal}")
        print(f"{'=' * 70}")

    # Generate step ID for tracing — one step per act() call, shared by all retries
    self._step_count += 1
    step_id = f"step-{self._step_count}"

    # Emit step_start trace event if tracer is enabled
    if self.tracer:
        pre_url = self.browser.page.url if self.browser.page else None
        self.tracer.emit_step_start(
            step_id=step_id,
            step_index=self._step_count,
            goal=goal,
            attempt=0,
            pre_url=pre_url,
        )

    for attempt in range(max_retries + 1):
        try:
            # 1. OBSERVE: Get refined semantic snapshot
            start_time = time.time()

            # Use provided options or create default
            snap_opts = snapshot_options or SnapshotOptions(limit=self.default_snapshot_limit)
            # Only set goal if not already provided (caller-supplied goal wins)
            if snap_opts.goal is None:
                snap_opts.goal = goal

            # Call snapshot with options object (matches TypeScript API)
            snap = await snapshot_async(self.browser, snap_opts)

            if snap.status != "success":
                raise RuntimeError(f"Snapshot failed: {snap.error}")

            # Apply element filtering based on goal
            filtered_elements = self.filter_elements(snap, goal)

            # Emit snapshot trace event if tracer is enabled
            if self.tracer:
                # Include element data for live overlay visualization
                # Use filtered_elements for overlay (only relevant elements)
                elements_data = [
                    {
                        "id": el.id,
                        "bbox": {
                            "x": el.bbox.x,
                            "y": el.bbox.y,
                            "width": el.bbox.width,
                            "height": el.bbox.height,
                        },
                        "role": el.role,
                        "text": el.text[:50] if el.text else "",  # Truncate for brevity
                    }
                    for el in filtered_elements[:50]  # Limit to first 50 for performance
                ]

                self.tracer.emit(
                    "snapshot",
                    {
                        "url": snap.url,
                        "element_count": len(snap.elements),
                        "timestamp": snap.timestamp,
                        "elements": elements_data,  # Add element data for overlay
                    },
                    step_id=step_id,
                )

            # Create filtered snapshot (same metadata, only the ranked elements)
            filtered_snap = Snapshot(
                status=snap.status,
                timestamp=snap.timestamp,
                url=snap.url,
                viewport=snap.viewport,
                elements=filtered_elements,
                screenshot=snap.screenshot,
                screenshot_format=snap.screenshot_format,
                error=snap.error,
            )

            # 2. GROUND: Format elements for LLM context
            context = self._build_context(filtered_snap, goal)

            # 3. THINK: Query LLM for next action
            llm_response = self._query_llm(context, goal)

            # Emit LLM query trace event if tracer is enabled
            if self.tracer:
                self.tracer.emit(
                    "llm_query",
                    {
                        "prompt_tokens": llm_response.prompt_tokens,
                        "completion_tokens": llm_response.completion_tokens,
                        "model": llm_response.model_name,
                        "response": llm_response.content[:200],  # Truncate for brevity
                    },
                    step_id=step_id,
                )

            if self.verbose:
                print(f"🧠 LLM Decision: {llm_response.content}")

            # Track token usage
            self._track_tokens(goal, llm_response)

            # Parse action from LLM response
            action_str = self._extract_action_from_response(llm_response.content)

            # 4. EXECUTE: Parse and run action
            result_dict = await self._execute_action(action_str, filtered_snap)

            duration_ms = int((time.time() - start_time) * 1000)

            # Create AgentActionResult from execution result
            result = AgentActionResult(
                success=result_dict["success"],
                action=result_dict["action"],
                goal=goal,
                duration_ms=duration_ms,
                attempt=attempt,
                element_id=result_dict.get("element_id"),
                text=result_dict.get("text"),
                key=result_dict.get("key"),
                outcome=result_dict.get("outcome"),
                url_changed=result_dict.get("url_changed"),
                error=result_dict.get("error"),
                message=result_dict.get("message"),
            )

            # Emit action execution trace event if tracer is enabled
            if self.tracer:
                post_url = self.browser.page.url if self.browser.page else None

                # Include element data for live overlay visualization
                elements_data = [
                    {
                        "id": el.id,
                        "bbox": {
                            "x": el.bbox.x,
                            "y": el.bbox.y,
                            "width": el.bbox.width,
                            "height": el.bbox.height,
                        },
                        "role": el.role,
                        "text": el.text[:50] if el.text else "",
                    }
                    for el in filtered_snap.elements[:50]
                ]

                self.tracer.emit(
                    "action",
                    {
                        "action": result.action,
                        "element_id": result.element_id,
                        "success": result.success,
                        "outcome": result.outcome,
                        "duration_ms": duration_ms,
                        "post_url": post_url,
                        "elements": elements_data,  # Add element data for overlay
                        "target_element_id": result.element_id,  # Highlight target in red
                    },
                    step_id=step_id,
                )

            # 5. RECORD: Track history
            self.history.append(
                {
                    "goal": goal,
                    "action": action_str,
                    "result": result.model_dump(),  # Store as dict
                    "success": result.success,
                    "attempt": attempt,
                    "duration_ms": duration_ms,
                }
            )

            if self.verbose:
                status = "✅" if result.success else "❌"
                print(f"{status} Completed in {duration_ms}ms")

            # Emit step completion trace event if tracer is enabled
            if self.tracer:
                self.tracer.emit(
                    "step_end",
                    {
                        "success": result.success,
                        "duration_ms": duration_ms,
                        "action": result.action,
                    },
                    step_id=step_id,
                )

            return result

        except Exception as e:
            # Emit error trace event if tracer is enabled
            if self.tracer:
                self.tracer.emit_error(step_id=step_id, error=str(e), attempt=attempt)

            if attempt < max_retries:
                if self.verbose:
                    print(f"⚠️ Retry {attempt + 1}/{max_retries}: {e}")
                await asyncio.sleep(1.0)  # Brief delay before retry
                continue
            else:
                # Create error result (recorded in history; the call itself raises)
                error_result = AgentActionResult(
                    success=False,
                    action="error",
                    goal=goal,
                    duration_ms=0,
                    attempt=attempt,
                    error=str(e),
                )
                self.history.append(
                    {
                        "goal": goal,
                        "action": "error",
                        "result": error_result.model_dump(),
                        "success": False,
                        "attempt": attempt,
                        "duration_ms": 0,
                    }
                )
                raise RuntimeError(f"Failed after {max_retries} retries: {e}")
|
|
989
|
+
|
|
990
|
+
def _build_context(self, snap: Snapshot, goal: str) -> str:
    """Render snapshot elements as a compact, token-efficient listing for the LLM.

    Elements are assumed to be pre-filtered/ranked by filter_elements() in act().
    """
    rendered: list[str] = []
    for element in snap.elements:
        # Collect visual-cue tags for this element
        vc = element.visual_cues
        cue_tags: list[str] = []
        if vc.is_primary:
            cue_tags.append("PRIMARY")
        if vc.is_clickable:
            cue_tags.append("CLICKABLE")
        if vc.background_color_name:
            cue_tags.append(f"color:{vc.background_color_name}")
        suffix = f" {{{','.join(cue_tags)}}}" if cue_tags else ""

        # Clamp text to 50 chars with an ellipsis; None becomes empty string
        raw_text = element.text or ""
        preview = raw_text[:50] + "..." if len(raw_text) > 50 else raw_text

        rendered.append(
            f'[{element.id}] <{element.role}> "{preview}"{suffix} '
            f"@ ({int(element.bbox.x)},{int(element.bbox.y)}) (Imp:{element.importance})"
        )

    return "\n".join(rendered)
|
|
1016
|
+
|
|
1017
|
+
def _extract_action_from_response(self, response: str) -> str:
    """Pull a single action command out of a raw LLM reply.

    Strips markdown code fences, then searches for the first recognizable
    command. If nothing matches, the cleaned reply is returned unchanged so
    the downstream parser can raise a descriptive error.
    """
    # Drop any ``` fences the model may have wrapped the command in
    cleaned = re.sub(r"```[\w]*\n?", "", response).strip()

    # Recognized shapes: CLICK(123), TYPE(123, "text"), PRESS("key"), FINISH()
    command_re = r'(CLICK\s*\(\s*\d+\s*\)|TYPE\s*\(\s*\d+\s*,\s*["\'].*?["\']\s*\)|PRESS\s*\(\s*["\'].*?["\']\s*\)|FINISH\s*\(\s*\))'
    found = re.search(command_re, cleaned, re.IGNORECASE)

    return found.group(1) if found else cleaned
|
|
1033
|
+
|
|
1034
|
+
def _query_llm(self, dom_context: str, goal: str) -> LLMResponse:
    """Query LLM with standardized prompt template (same as sync version).

    Args:
        dom_context: Element listing produced by _build_context()
        goal: Natural-language goal injected into the prompt

    Returns:
        LLMResponse from self.llm.generate(); its .content is expected to be
        a single action command (CLICK/TYPE/PRESS/FINISH).
    """
    # NOTE: {{...}} below are literal braces in the rendered prompt, not
    # f-string placeholders. The prompt text is runtime behavior — keep exact.
    system_prompt = f"""You are an AI web automation agent.

GOAL: {goal}

VISIBLE ELEMENTS (sorted by importance):
{dom_context}

VISUAL CUES EXPLAINED:
- {{PRIMARY}}: Main call-to-action element on the page
- {{CLICKABLE}}: Element is clickable
- {{color:X}}: Background color name

CRITICAL RESPONSE FORMAT:
You MUST respond with ONLY ONE of these exact action formats:
- CLICK(id) - Click element by ID
- TYPE(id, "text") - Type text into element
- PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
- FINISH() - Task complete

DO NOT include any explanation, reasoning, or natural language.
DO NOT use markdown formatting or code blocks.
DO NOT say "The next step is..." or anything similar.

CORRECT Examples:
CLICK(42)
TYPE(15, "magic mouse")
PRESS("Enter")
FINISH()

INCORRECT Examples (DO NOT DO THIS):
"The next step is to click..."
"I will type..."
```CLICK(42)```
"""

    user_prompt = "Return the single action command:"

    # temperature=0.0 for deterministic action selection
    return self.llm.generate(system_prompt, user_prompt, temperature=0.0)
|
|
1074
|
+
|
|
1075
|
+
async def _execute_action(self, action_str: str, snap: Snapshot) -> dict[str, Any]:
    """
    Parse action string and execute SDK call (async)

    Args:
        action_str: Action string from LLM (e.g., "CLICK(42)")
        snap: Current snapshot (for context)

    Returns:
        Execution result dictionary

    Raises:
        ValueError: If action_str matches none of the supported formats.
    """
    # Parse CLICK(42)
    if match := re.match(r"CLICK\s*\(\s*(\d+)\s*\)", action_str, re.IGNORECASE):
        element_id = int(match.group(1))
        result = await click_async(self.browser, element_id)
        return {
            "success": result.success,
            "action": "click",
            "element_id": element_id,
            "outcome": result.outcome,
            "url_changed": result.url_changed,
        }

    # Parse TYPE(42, "hello world")
    # The backreference \2 requires the closing quote to match the opening
    # one, and allows the *other* quote character inside the text
    # (e.g. TYPE(3, "it's here")). The previous pattern used [^"\']* which
    # rejected any text containing a quote/apostrophe, so commands that
    # _extract_action_from_response accepted could still fail here with
    # "Unknown action format".
    elif match := re.match(
        r'TYPE\s*\(\s*(\d+)\s*,\s*(["\'])(.*?)\2\s*\)',
        action_str,
        re.IGNORECASE,
    ):
        element_id = int(match.group(1))
        text = match.group(3)
        result = await type_text_async(self.browser, element_id, text)
        return {
            "success": result.success,
            "action": "type",
            "element_id": element_id,
            "text": text,
            "outcome": result.outcome,
        }

    # Parse PRESS("Enter") — same matching-quote rule as TYPE above
    elif match := re.match(r'PRESS\s*\(\s*(["\'])(.+?)\1\s*\)', action_str, re.IGNORECASE):
        key = match.group(2)
        result = await press_async(self.browser, key)
        return {
            "success": result.success,
            "action": "press",
            "key": key,
            "outcome": result.outcome,
        }

    # Parse FINISH()
    elif re.match(r"FINISH\s*\(\s*\)", action_str, re.IGNORECASE):
        return {
            "success": True,
            "action": "finish",
            "message": "Task marked as complete",
        }

    else:
        raise ValueError(
            f"Unknown action format: {action_str}\n"
            'Expected: CLICK(id), TYPE(id, "text"), PRESS("key"), or FINISH()'
        )
|
|
1139
|
+
|
|
1140
|
+
def _track_tokens(self, goal: str, llm_response: LLMResponse):
    """Fold one LLM call's token counts into the running totals and per-action log."""
    usage = self._token_usage_raw

    # Accumulate each counter only when the provider reported a value
    for counter, amount in (
        ("total_prompt_tokens", llm_response.prompt_tokens),
        ("total_completion_tokens", llm_response.completion_tokens),
        ("total_tokens", llm_response.total_tokens),
    ):
        if amount:
            usage[counter] += amount

    # Per-action record; missing counts are stored as 0
    usage["by_action"].append(
        {
            "goal": goal,
            "prompt_tokens": llm_response.prompt_tokens or 0,
            "completion_tokens": llm_response.completion_tokens or 0,
            "total_tokens": llm_response.total_tokens or 0,
            "model": llm_response.model_name,
        }
    )
|
|
1158
|
+
|
|
1159
|
+
def get_token_stats(self) -> TokenStats:
    """Summarize accumulated token usage as a TokenStats model."""
    raw = self._token_usage_raw
    per_action = [ActionTokenUsage(**entry) for entry in raw["by_action"]]
    return TokenStats(
        total_prompt_tokens=raw["total_prompt_tokens"],
        total_completion_tokens=raw["total_completion_tokens"],
        total_tokens=raw["total_tokens"],
        by_action=per_action,
    )
|
|
1168
|
+
|
|
1169
|
+
def get_history(self) -> list[ActionHistory]:
    """Return the recorded execution history as typed ActionHistory models."""
    return [ActionHistory(**entry) for entry in self.history]
|
|
1172
|
+
|
|
1173
|
+
def clear_history(self) -> None:
    """Drop all recorded actions and zero out the token counters."""
    self.history.clear()
    # Fresh counter dict, matching the shape created in __init__
    self._token_usage_raw = dict(
        total_prompt_tokens=0,
        total_completion_tokens=0,
        total_tokens=0,
        by_action=[],
    )
|
|
1182
|
+
|
|
1183
|
+
def filter_elements(self, snapshot: Snapshot, goal: str | None = None) -> list[Element]:
|
|
1184
|
+
"""Filter elements from snapshot based on goal context (same as sync version)"""
|
|
1185
|
+
elements = snapshot.elements
|
|
1186
|
+
|
|
1187
|
+
# If no goal provided, return all elements (up to limit)
|
|
1188
|
+
if not goal:
|
|
1189
|
+
return elements[: self.default_snapshot_limit]
|
|
1190
|
+
|
|
1191
|
+
goal_lower = goal.lower()
|
|
1192
|
+
|
|
1193
|
+
# Extract keywords from goal
|
|
1194
|
+
keywords = self._extract_keywords(goal_lower)
|
|
1195
|
+
|
|
1196
|
+
# Boost elements matching goal keywords
|
|
1197
|
+
scored_elements = []
|
|
1198
|
+
for el in elements:
|
|
1199
|
+
score = el.importance
|
|
1200
|
+
|
|
1201
|
+
# Boost if element text matches goal
|
|
1202
|
+
if el.text and any(kw in el.text.lower() for kw in keywords):
|
|
1203
|
+
score += 0.3
|
|
1204
|
+
|
|
1205
|
+
# Boost if role matches goal intent
|
|
1206
|
+
if "click" in goal_lower and el.visual_cues.is_clickable:
|
|
1207
|
+
score += 0.2
|
|
1208
|
+
if "type" in goal_lower and el.role in ["textbox", "searchbox"]:
|
|
1209
|
+
score += 0.2
|
|
1210
|
+
if "search" in goal_lower:
|
|
1211
|
+
# Filter out non-interactive elements for search tasks
|
|
1212
|
+
if el.role in ["link", "img"] and not el.visual_cues.is_primary:
|
|
1213
|
+
score -= 0.5
|
|
1214
|
+
|
|
1215
|
+
scored_elements.append((score, el))
|
|
1216
|
+
|
|
1217
|
+
# Re-sort by boosted score
|
|
1218
|
+
scored_elements.sort(key=lambda x: x[0], reverse=True)
|
|
1219
|
+
elements = [el for _, el in scored_elements]
|
|
1220
|
+
|
|
1221
|
+
return elements[: self.default_snapshot_limit]
|
|
1222
|
+
|
|
1223
|
+
def _extract_keywords(self, text: str) -> list[str]:
    """Keep goal words that are long enough (>2 chars) and not common stopwords."""
    stopwords = frozenset(
        "the a an and or but in on at to for of with by from as is was".split()
    )
    return [word for word in text.split() if len(word) > 2 and word not in stopwords]
|