camel-ai 0.2.68__py3-none-any.whl → 0.2.69a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of camel-ai might be problematic. Click here for more details.
- camel/__init__.py +1 -1
- camel/agents/chat_agent.py +170 -11
- camel/configs/vllm_config.py +2 -0
- camel/datagen/self_improving_cot.py +1 -1
- camel/memories/context_creators/score_based.py +129 -87
- camel/runtimes/configs.py +11 -11
- camel/runtimes/daytona_runtime.py +4 -4
- camel/runtimes/docker_runtime.py +6 -6
- camel/runtimes/remote_http_runtime.py +5 -5
- camel/societies/workforce/prompts.py +13 -12
- camel/societies/workforce/single_agent_worker.py +252 -22
- camel/societies/workforce/utils.py +10 -2
- camel/societies/workforce/worker.py +21 -45
- camel/societies/workforce/workforce.py +36 -15
- camel/tasks/task.py +18 -12
- camel/toolkits/__init__.py +2 -0
- camel/toolkits/aci_toolkit.py +19 -19
- camel/toolkits/arxiv_toolkit.py +6 -6
- camel/toolkits/dappier_toolkit.py +5 -5
- camel/toolkits/file_write_toolkit.py +10 -10
- camel/toolkits/github_toolkit.py +3 -3
- camel/toolkits/non_visual_browser_toolkit/__init__.py +18 -0
- camel/toolkits/non_visual_browser_toolkit/actions.py +196 -0
- camel/toolkits/non_visual_browser_toolkit/agent.py +278 -0
- camel/toolkits/non_visual_browser_toolkit/browser_non_visual_toolkit.py +363 -0
- camel/toolkits/non_visual_browser_toolkit/nv_browser_session.py +175 -0
- camel/toolkits/non_visual_browser_toolkit/snapshot.js +188 -0
- camel/toolkits/non_visual_browser_toolkit/snapshot.py +164 -0
- camel/toolkits/pptx_toolkit.py +4 -4
- camel/toolkits/sympy_toolkit.py +1 -1
- camel/toolkits/task_planning_toolkit.py +3 -3
- camel/toolkits/thinking_toolkit.py +1 -1
- {camel_ai-0.2.68.dist-info → camel_ai-0.2.69a1.dist-info}/METADATA +1 -1
- {camel_ai-0.2.68.dist-info → camel_ai-0.2.69a1.dist-info}/RECORD +36 -29
- {camel_ai-0.2.68.dist-info → camel_ai-0.2.69a1.dist-info}/WHEEL +0 -0
- {camel_ai-0.2.68.dist-info → camel_ai-0.2.69a1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
14
|
+
import json
|
|
15
|
+
import logging
|
|
16
|
+
import re
|
|
17
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional
|
|
18
|
+
|
|
19
|
+
from camel.models import BaseModelBackend, ModelFactory
|
|
20
|
+
from camel.types import ModelPlatformType, ModelType
|
|
21
|
+
|
|
22
|
+
from .actions import ActionExecutor
|
|
23
|
+
from .nv_browser_session import NVBrowserSession
|
|
24
|
+
|
|
25
|
+
if TYPE_CHECKING:
|
|
26
|
+
from camel.agents import ChatAgent
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class PlaywrightLLMAgent:
    """High-level orchestration: snapshot ↔ LLM ↔ action executor.

    The agent owns an :obj:`NVBrowserSession`, asks an LLM (through a
    lazily created CAMEL ``ChatAgent``) for a plan plus the next action,
    executes that action on the session and repeats until the LLM answers
    with a ``finish`` action or the step budget is used up.

    Args:
        user_data_dir (Optional[str]): Forwarded to :obj:`NVBrowserSession`.
            (default: :obj:`None`)
        headless (bool): Forwarded to :obj:`NVBrowserSession`.
            (default: :obj:`False`)
        model_backend (Optional[BaseModelBackend]): Model used for planning.
            When :obj:`None`, an OpenAI ``GPT_4O_MINI`` backend with
            ``temperature=0`` and ``top_p=1`` is created.
            (default: :obj:`None`)
    """

    # System prompt as class constant to avoid recreation.
    # NOTE(review): the stray quote characters and escaped newlines below are
    # part of the prompt text exactly as shipped (it looks like a former
    # implicitly-concatenated string pasted into a triple-quoted literal).
    # They are sent to the model verbatim, so they are left untouched here.
    SYSTEM_PROMPT = """
    You are a web automation assistant.

    " Analyse the page snapshot and create a short high-level plan, "
    "then output the FIRST action to start with.\n\n"
    "Return a JSON object in *exactly* this shape:\n"
    "Action format json_object examples:\n"
    "{\n \"plan\": [\"Step 1\", \"Step 2\"],\n \"action\": {\n \"type\":
    \"click\",\n \"ref\": \"e1\"\n }\n}\n\n"
    "If task is already complete:\n"
    "{\n \"plan\": [],\n \"action\": {\n \"type\": \"finish\",
    \n \"ref\": null,\n \"summary\": \"Task was already completed. Summary
    of what was found...\"\n }\n}"

    Available action types:
    - 'click': {"type": "click", "ref": "e1"} or {"type": "click", "text":
    "Button Text"} or {"type": "click", "selector": "button"}
    - 'type': {"type": "type", "ref": "e1", "text": "search text"} or {"type":
    "type", "selector": "input", "text": "search text"}
    - 'select': {"type": "select", "ref": "e1", "value": "option"} or {"type":
    "select", "selector": "select", "value": "option"}
    - 'wait': {"type": "wait", "timeout": 2000} or {"type": "wait", "selector":
    "#element"}
    - 'scroll': {"type": "scroll", "direction": "down", "amount": 300}
    - 'enter': {"type": "enter", "ref": "e1"} or {"type": "enter", "selector":
    "input[name=q]"} or {"type": "enter"}
    - 'navigate': {"type": "navigate", "url": "https://example.com"}
    - 'finish': {"type": "finish", "ref": null, "summary": "task completion
    summary"}

    IMPORTANT:
    - For 'click': Use 'ref' from snapshot, or 'text' for visible text,
    or 'selector' for CSS selectors
    - For 'type'/'select': Use 'ref' from snapshot or 'selector' for CSS selectors
    - Only use 'ref' values that exist in the snapshot (e.g., ref=e1, ref=e2, etc.)
    - Use 'finish' when the task is completed successfully with a summary of
    what was accomplished
    - Use 'enter' to press the Enter key (optionally focus an element first)
    - Use 'navigate' to open a new URL before interacting further
    - click can choose radio, checkbox...
    """  # noqa: E501

    # Prefix of the string the session returns for a diff snapshot when the
    # page structure did not change.
    _NO_CHANGE_PREFIX = "- Page Snapshot (no structural changes)"

    # Every "{" is a candidate start of the JSON object in an LLM reply.
    _OBJECT_START = re.compile(r"\{")

    def __init__(
        self,
        *,
        user_data_dir: Optional[str] = None,
        headless: bool = False,
        model_backend: Optional["BaseModelBackend"] = None,
    ):
        self._session = NVBrowserSession(
            headless=headless, user_data_dir=user_data_dir
        )

        # One entry per executed action: {"action", "result", "success"}.
        self.action_history: List[Dict[str, Any]] = []

        if model_backend is None:
            model_backend = ModelFactory.create(
                model_platform=ModelPlatformType.OPENAI,
                model_type=ModelType.GPT_4O_MINI,
                model_config_dict={"temperature": 0, "top_p": 1},
            )
        self.model_backend = model_backend

        # Reuse ChatAgent instance to avoid recreation overhead; it is built
        # on first use by ``_get_chat_agent``.
        self._chat_agent: Optional["ChatAgent"] = None

    async def navigate(self, url: str) -> str:
        """Open ``url`` in the session and return a refreshed snapshot.

        Args:
            url (str): Destination URL.

        Returns:
            str: The snapshot returned by the session, or a message starting
                with ``"Error: could not navigate"`` when anything raised.
        """
        try:
            # NVBrowserSession handles waits internally
            await self._session.visit(url)
            # Log only once the visit actually succeeded.
            logger.debug("Navigated to URL: %s", url)
            return await self._session.get_snapshot(force_refresh=True)
        except Exception as exc:
            # Best effort by design: failures are reported as a string so the
            # planning loop can feed them back to the LLM.
            logger.warning("Navigation to %s failed: %s", url, exc)
            return f"Error: could not navigate - {exc}"

    def _get_chat_agent(self) -> "ChatAgent":
        """Get or create the ChatAgent instance."""
        # Imported here rather than at module level (the module only imports
        # ChatAgent under TYPE_CHECKING).
        from camel.agents import ChatAgent

        if self._chat_agent is None:
            self._chat_agent = ChatAgent(
                system_message=self.SYSTEM_PROMPT, model=self.model_backend
            )
        return self._chat_agent

    def _safe_parse_json(self, content: str) -> Dict[str, Any]:
        r"""Safely parse a JSON object from an LLM response.

        Strategy:
            1. Parse the whole response; accept it only if it is an object.
            2. Otherwise try to decode a JSON value starting at every ``{``
               in the text and return the first one that is an object. This
               copes with surrounding prose, code fences, arbitrary nesting
               depth and braces inside string values.
            3. Fall back to a ``finish`` action describing the parse error.

        Args:
            content (str): Raw text returned by the LLM.

        Returns:
            Dict[str, Any]: The decoded object, or the default
                ``{"plan": [...], "action": {"type": "finish", ...}}``
                structure when nothing could be decoded.
        """
        # First attempt: direct parsing
        try:
            parsed = json.loads(content)
        except json.JSONDecodeError:
            parsed = None
        # Callers use ``.get`` on the result, so a top-level list/str/number
        # is not an acceptable answer even though it is valid JSON.
        if isinstance(parsed, dict):
            return parsed

        # Second attempt: decode from each opening brace, left to right
        decoder = json.JSONDecoder()
        for brace in self._OBJECT_START.finditer(content):
            try:
                candidate, _end = decoder.raw_decode(content, brace.start())
            except json.JSONDecodeError:
                continue
            if isinstance(candidate, dict):
                return candidate

        # Fallback: return default structure
        logger.warning(
            "Could not parse JSON from LLM response: %s", content[:200]
        )
        return {
            "plan": ["Could not parse response"],
            "action": {
                "type": "finish",
                "ref": None,
                "summary": "Parsing error",
            },
        }

    def _llm_call(
        self,
        prompt: str,
        snapshot: str,
        is_initial: bool,
        history: Optional[List[Dict[str, Any]]] = None,
    ) -> Dict[str, Any]:
        """Call the LLM (via CAMEL ChatAgent) to get plan & next action.

        Args:
            prompt (str): The task description.
            snapshot (str): Page snapshot text shown to the model.
            is_initial (bool): When ``True`` the action history is omitted
                from the message.
            history (Optional[List[Dict[str, Any]]]): Executed actions, each
                with ``action``, ``result`` and ``success`` keys.
                (default: :obj:`None`)

        Returns:
            Dict[str, Any]: The parsed response (see ``_safe_parse_json``).
        """
        # Build user message
        if is_initial:
            user_content = f"Snapshot:\n{snapshot}\n\nTask: {prompt}"
        else:
            hist_lines = [
                f"{idx}. {'✅' if entry['success'] else '❌'} "
                f"{entry['action'].get('type')} -> {entry['result']}"
                for idx, entry in enumerate(history or [], start=1)
            ]
            user_content = (
                f"Snapshot:\n{snapshot}\n\nHistory:\n"
                + "\n".join(hist_lines)
                + f"\n\nTask: {prompt}"
            )

        # Run ChatAgent
        # NOTE(review): ``step`` is a synchronous call made from the async
        # planning loop - confirm whether blocking the event loop is intended.
        response = self._get_chat_agent().step(user_content)
        content = response.msgs[0].content if response.msgs else "{}"

        # Safely parse JSON response
        return self._safe_parse_json(content)

    async def process_command(self, prompt: str, max_steps: int = 15):
        """Plan and execute actions until the task finishes.

        Args:
            prompt (str): The task description given to the LLM.
            max_steps (int): Maximum number of actions to execute.
                (default: :obj:`15`)
        """
        # initial full snapshot
        full_snapshot = await self._session.get_snapshot()
        assert self._session.snapshot is not None
        meta = self._session.snapshot.last_info
        logger.info("Initial snapshot priorities=%s", meta["priorities"])
        logger.debug("Full snapshot:\n%s", full_snapshot)

        plan_resp = self._llm_call(
            prompt, full_snapshot or "", is_initial=True
        )
        plan = plan_resp.get("plan", [])
        action = plan_resp.get("action")

        logger.info("Plan generated: %s", json.dumps(plan, ensure_ascii=False))

        steps = 0
        while action and steps < max_steps:
            # The LLM occasionally answers with a bare string or list here;
            # anything but an object cannot be executed.
            if not isinstance(action, dict):
                logger.warning("Ignoring malformed action: %r", action)
                break

            if action.get("type") == "finish":
                logger.info("Task finished: %s", action.get("summary", "Done"))
                break

            result = await self._run_action(action)
            logger.debug("Executed action: %s | Result: %s", action, result)

            if action.get("type") == "navigate":
                # ``navigate`` returns a whole page snapshot on success, so a
                # substring test for "Error" would misreport any page that
                # merely contains that word. Only its own error prefix counts.
                success = not result.startswith("Error:")
                if success:
                    # Show the new page as the snapshot and keep the history
                    # line short instead of embedding the full page in it.
                    full_snapshot = result
                    result = f"Navigated to {action.get('url', '')}"
            else:
                success = "Error" not in result

            self.action_history.append(
                {"action": action, "result": result, "success": success}
            )

            diff_snapshot = await self._session.get_snapshot(
                force_refresh=ActionExecutor.should_update_snapshot(action),
                diff_only=True,
            )
            assert self._session.snapshot is not None
            meta = self._session.snapshot.last_info
            logger.debug(
                "Snapshot after action (diff=%s):\n%s",
                meta["is_diff"],
                diff_snapshot,
            )

            # Update full snapshot if page changed
            if meta["is_diff"] and not diff_snapshot.startswith(
                self._NO_CHANGE_PREFIX
            ):
                full_snapshot = self._session.snapshot.snapshot_data or ""

            action = self._llm_call(
                prompt,
                full_snapshot or "",
                is_initial=False,
                history=self.action_history,
            ).get("action")
            steps += 1

        if (
            steps >= max_steps
            and isinstance(action, dict)
            and action.get("type") != "finish"
        ):
            logger.warning(
                "Stopped after max_steps=%d before the task finished",
                max_steps,
            )

        logger.info("Process completed with %d steps", steps)

    async def _run_action(self, action: Dict[str, Any]) -> str:
        """Execute one action and return its result string.

        ``navigate`` actions are handled by :meth:`navigate`; every other
        action is passed to the session's ``exec_action``.
        """
        if action.get("type") == "navigate":
            return await self.navigate(action.get("url", ""))
        return await self._session.exec_action(action)

    async def close(self):
        """Close the underlying browser session."""
        await self._session.close()
|
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from typing import Any, Dict, List, Optional
|
|
17
|
+
|
|
18
|
+
from camel.models import BaseModelBackend
|
|
19
|
+
from camel.toolkits.base import BaseToolkit
|
|
20
|
+
from camel.toolkits.function_tool import FunctionTool
|
|
21
|
+
|
|
22
|
+
from .agent import PlaywrightLLMAgent
|
|
23
|
+
|
|
24
|
+
# session wrapper
|
|
25
|
+
from .nv_browser_session import NVBrowserSession
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class BrowserNonVisualToolkit(BaseToolkit):
    r"""A lightweight, *non-visual* browser toolkit exposing primitive
    Playwright actions as CAMEL `FunctionTool`s.

    Args:
        headless (bool): Forwarded to :obj:`NVBrowserSession`.
            (default: :obj:`True`)
        user_data_dir (Optional[str]): Forwarded to :obj:`NVBrowserSession`.
            (default: :obj:`None`)
        web_agent_model (Optional[BaseModelBackend]): Model for the optional
            high-level :obj:`PlaywrightLLMAgent`. When :obj:`None`, the
            ``solve_task`` tool is not registered. (default: :obj:`None`)
    """

    # Strong references to cleanup tasks scheduled from ``__del__``; the event
    # loop itself only keeps weak references to tasks.
    _cleanup_tasks: set = set()

    def __init__(
        self,
        *,
        headless: bool = True,
        user_data_dir: Optional[str] = None,
        web_agent_model: Optional[BaseModelBackend] = None,
    ) -> None:
        super().__init__()
        self._headless = headless
        self._user_data_dir = user_data_dir
        # Enables the optional ``solve_task`` tool (see ``_ensure_agent`` and
        # ``get_tools``).
        self.web_agent_model = web_agent_model

        # Encapsulated browser session
        self._session = NVBrowserSession(
            headless=headless, user_data_dir=user_data_dir
        )

        # Optional higher-level agent (only if user supplies model)
        self._agent: Optional[PlaywrightLLMAgent] = None

    def __del__(self):
        r"""Ensure cleanup when toolkit is garbage collected."""
        # Note: __del__ cannot be async, so we schedule cleanup if needed
        import asyncio

        # ``__init__`` may have failed before the session was created.
        if not hasattr(self, "_session"):
            return

        try:
            try:
                loop = asyncio.get_running_loop()
            except RuntimeError:
                # No loop is running in this thread: run cleanup to completion
                asyncio.run(self.close_browser())
            else:
                # Don't wait for completion to avoid blocking; keep the task
                # referenced until it is done so it cannot be collected
                # mid-flight.
                task = loop.create_task(self.close_browser())
                type(self)._cleanup_tasks.add(task)
                task.add_done_callback(type(self)._cleanup_tasks.discard)
        except Exception:
            pass  # Don't fail during garbage collection

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    async def _ensure_browser(self):
        r"""Make sure the session's browser has been started."""
        await self._session.ensure_browser()

    async def _require_page(self):
        r"""Start the browser if needed and return the session's page."""
        await self._session.ensure_browser()
        return await self._session.get_page()

    def _validate_ref(self, ref: str, method_name: str) -> None:
        r"""Validate that ref parameter is a non-empty string.

        Args:
            ref (str): Element reference to check.
            method_name (str): Name of the calling tool, used in the error.

        Raises:
            ValueError: If ``ref`` is empty or not a string.
        """
        if not ref or not isinstance(ref, str):
            raise ValueError(
                f"{method_name}(): 'ref' must be a non-empty string, "
                f"got: {ref}"
            )

    # ------------------------------------------------------------------
    # Tool implementations
    # ------------------------------------------------------------------
    async def open_browser(
        self, start_url: Optional[str] = None
    ) -> Dict[str, str]:
        r"""Launch a Playwright browser session.

        Args:
            start_url (Optional[str]): If provided, the page will navigate to
                this URL immediately after the browser launches.

        Returns:
            Dict[str, str]: Keys: ``result`` for action outcome,
                ``snapshot`` for full DOM snapshot.
        """
        await self._session.ensure_browser()
        if start_url:
            return await self.visit_page(start_url)
        # If no start_url provided, still capture initial snapshot
        snapshot = await self._session.get_snapshot(
            force_refresh=True, diff_only=False
        )
        return {"result": "Browser session started.", "snapshot": snapshot}

    async def close_browser(self) -> str:
        r"""Terminate the current browser session and free all resources.

        Returns:
            str: Confirmation message.
        """
        # Close agent if it exists
        if self._agent is not None:
            try:
                await self._agent.close()
            except Exception:
                pass  # Don't fail if agent cleanup fails
            self._agent = None

        # Close session
        await self._session.close()
        return "Browser session closed."

    # Navigation / page state ------------------------------------------------
    async def visit_page(self, url: str) -> Dict[str, str]:
        r"""Navigate the current page to the specified URL.

        Args:
            url (str): The destination URL.

        Returns:
            Dict[str, str]: Keys: ``result`` for action outcome,
                ``snapshot`` for full DOM snapshot.

        Raises:
            ValueError: If ``url`` is empty or not a string.
        """
        if not url or not isinstance(url, str):
            raise ValueError("visit_page(): 'url' must be a non-empty string")

        nav_result = await self._session.visit(url)
        snapshot = await self._session.get_snapshot(
            force_refresh=True, diff_only=False
        )
        return {"result": nav_result, "snapshot": snapshot}

    async def get_page_snapshot(
        self, *, force_refresh: bool = False, diff_only: bool = False
    ) -> str:
        r"""Capture a YAML-like structural snapshot of the DOM.

        Args:
            force_refresh (bool): When ``True`` always re-generate the
                snapshot even if the URL has not changed.
            diff_only (bool): If ``True`` return only the diff relative to the
                previous snapshot.

        Returns:
            str: Formatted snapshot string.
        """
        return await self._session.get_snapshot(
            force_refresh=force_refresh, diff_only=diff_only
        )

    # Element-level wrappers -------------------------------------------------
    async def click(self, *, ref: str) -> Dict[str, str]:
        r"""Click an element identified by ``ref``

        Args:
            ref (str): Element reference ID extracted from snapshot (e.g.
                ``"e3"``).

        Returns:
            Dict[str, str]: Result message from ``ActionExecutor``.
        """
        self._validate_ref(ref, "click")

        action: Dict[str, Any] = {"type": "click", "ref": ref}
        return await self._exec_with_snapshot(action)

    async def type(self, *, ref: str, text: str) -> Dict[str, str]:
        r"""Type text into an input or textarea element.

        Args:
            ref (str): Element reference ID extracted from snapshot (e.g.
                ``"e3"``).
            text (str): The text to enter.

        Returns:
            Dict[str, str]: Execution result message.
        """
        self._validate_ref(ref, "type")

        action: Dict[str, Any] = {"type": "type", "ref": ref, "text": text}
        return await self._exec_with_snapshot(action)

    async def select(self, *, ref: str, value: str) -> Dict[str, str]:
        r"""Select an option in a ``<select>`` element.

        Args:
            ref (str): Element reference ID.
            value (str): The value / option to select.

        Returns:
            Dict[str, str]: Execution result message.
        """
        self._validate_ref(ref, "select")

        action: Dict[str, Any] = {"type": "select", "ref": ref, "value": value}
        return await self._exec_with_snapshot(action)

    async def scroll(self, *, direction: str, amount: int) -> Dict[str, str]:
        r"""Scroll the page.

        Args:
            direction (str): ``"down"`` or ``"up"``.
            amount (int): Pixel distance to scroll.

        Returns:
            Dict[str, str]: Execution result message.

        Raises:
            ValueError: If ``direction`` is neither ``"up"`` nor ``"down"``.
        """
        if direction not in ("up", "down"):
            raise ValueError("scroll(): 'direction' must be 'up' or 'down'")

        action: Dict[str, Any] = {
            "type": "scroll",
            "direction": direction,
            "amount": amount,
        }
        return await self._exec_with_snapshot(action)

    async def wait(
        self,
        *,
        timeout_ms: Optional[int] = None,
        selector: Optional[str] = None,
    ) -> Dict[str, str]:
        r"""Explicit wait utility.

        Args:
            timeout_ms (Optional[int]): Milliseconds to sleep.
            selector (Optional[str]): Wait until this CSS selector appears
                in DOM.

        Returns:
            Dict[str, str]: Execution result message.
        """
        # Default to 1 000 ms sleep when no arguments provided
        if timeout_ms is None and selector is None:
            timeout_ms = 1000

        action: Dict[str, Any] = {"type": "wait"}
        if timeout_ms is not None:
            action["timeout"] = timeout_ms
        if selector is not None:
            action["selector"] = selector
        return await self._exec_with_snapshot(action)

    async def extract(self, *, ref: str) -> Dict[str, str]:
        r"""Extract text content from an element.

        Args:
            ref (str): Element reference ID obtained from snapshot.

        Returns:
            Dict[str, str]: Extracted text or error message.
        """
        self._validate_ref(ref, "extract")
        return await self._exec_with_snapshot({"type": "extract", "ref": ref})

    async def enter(self, *, ref: str) -> Dict[str, str]:
        r"""Press the Enter key.

        Args:
            ref (str): Element reference ID to focus before pressing.

        Returns:
            Dict[str, str]: Execution result message.
        """
        self._validate_ref(ref, "enter")

        action: Dict[str, Any] = {"type": "enter", "ref": ref}
        return await self._exec_with_snapshot(action)

    # Helper to run through ActionExecutor
    async def _exec(self, action: Dict[str, Any]) -> str:
        r"""Run ``action`` on the session and return its result string."""
        return await self._session.exec_action(action)

    async def _exec_with_snapshot(
        self, action: Dict[str, Any]
    ) -> Dict[str, str]:
        r"""Execute action and, if DOM structure changed, include snapshot
        diff.

        Args:
            action (Dict[str, Any]): Action description passed to the
                session's ``exec_action``.

        Returns:
            Dict[str, str]: Always contains ``result``; contains
                ``snapshot`` (a diff) only when the action type calls for a
                snapshot update and the diff reports structural changes.
        """
        result = await self._session.exec_action(action)

        # Only capture diff if action type typically changes DOM
        from .actions import ActionExecutor

        if not ActionExecutor.should_update_snapshot(action):
            return {"result": result}

        # Capture structural diff to previous snapshot
        diff = await self._session.get_snapshot(
            force_refresh=True, diff_only=True
        )

        if diff.startswith("- Page Snapshot (no structural changes)"):
            return {"result": result}

        return {"result": result, "snapshot": diff}

    # ------------------------------------------------------------------
    # Optional PlaywrightLLMAgent helpers
    # ------------------------------------------------------------------
    def _ensure_agent(self) -> PlaywrightLLMAgent:
        r"""Create PlaywrightLLMAgent on first use if `web_agent_model`
        provided.

        Returns:
            PlaywrightLLMAgent: The cached agent instance.

        Raises:
            RuntimeError: If no ``web_agent_model`` was supplied.
        """
        if self.web_agent_model is None:
            raise RuntimeError(
                "web_agent_model not supplied - high-level task planning is "
                "unavailable."
            )

        if self._agent is None:
            # NOTE(review): the agent is built from the same settings rather
            # than from ``self._session``, so it creates a session of its own
            # - confirm that a second session is intended.
            self._agent = PlaywrightLLMAgent(
                headless=self._headless,
                user_data_dir=self._user_data_dir,
                model_backend=self.web_agent_model,
            )
        return self._agent

    async def solve_task(
        self, task_prompt: str, start_url: str, max_steps: int = 15
    ) -> str:
        r"""Use LLM agent to autonomously complete the task (requires
        `web_agent_model`).

        Args:
            task_prompt (str): Description of the task to complete.
            start_url (str): URL to open before the agent starts working.
            max_steps (int): Maximum number of actions the agent may execute.

        Returns:
            str: Completion notice, or the navigation error message if
                ``start_url`` could not be opened.
        """
        agent = self._ensure_agent()

        # ``navigate`` reports failures as a string starting with "Error:";
        # planning against a page that never loaded would be pointless.
        nav_result = await agent.navigate(start_url)
        if nav_result.startswith("Error:"):
            return nav_result

        await agent.process_command(task_prompt, max_steps=max_steps)
        return "Task processing finished - see stdout for detailed trace."

    # ------------------------------------------------------------------
    # Toolkit registration
    # ------------------------------------------------------------------
    def get_tools(self) -> List[FunctionTool]:
        r"""Return the toolkit's tools.

        Returns:
            List[FunctionTool]: The primitive browser tools, plus
                ``solve_task`` when a ``web_agent_model`` was supplied.
        """
        base_tools = [
            FunctionTool(self.open_browser),
            FunctionTool(self.close_browser),
            FunctionTool(self.visit_page),
            FunctionTool(self.get_page_snapshot),
            FunctionTool(self.click),
            FunctionTool(self.type),
            FunctionTool(self.select),
            FunctionTool(self.scroll),
            FunctionTool(self.wait),
            FunctionTool(self.extract),
            FunctionTool(self.enter),
        ]

        if self.web_agent_model is not None:
            base_tools.append(FunctionTool(self.solve_task))

        return base_tools
|