camel-ai 0.2.60__py3-none-any.whl → 0.2.62__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of camel-ai might be problematic. Click here for more details.
- camel/__init__.py +1 -1
- camel/agents/chat_agent.py +159 -8
- camel/agents/mcp_agent.py +5 -5
- camel/configs/anthropic_config.py +6 -5
- camel/{data_collector → data_collectors}/alpaca_collector.py +1 -1
- camel/{data_collector → data_collectors}/sharegpt_collector.py +1 -1
- camel/datagen/evol_instruct/scorer.py +22 -23
- camel/datagen/evol_instruct/templates.py +46 -46
- camel/datasets/static_dataset.py +144 -0
- camel/loaders/__init__.py +5 -2
- camel/loaders/chunkr_reader.py +117 -91
- camel/loaders/mistral_reader.py +148 -0
- camel/memories/blocks/chat_history_block.py +1 -2
- camel/models/model_manager.py +7 -3
- camel/retrievers/auto_retriever.py +20 -1
- camel/{runtime → runtimes}/daytona_runtime.py +1 -1
- camel/{runtime → runtimes}/docker_runtime.py +1 -1
- camel/{runtime → runtimes}/llm_guard_runtime.py +2 -2
- camel/{runtime → runtimes}/remote_http_runtime.py +1 -1
- camel/{runtime → runtimes}/ubuntu_docker_runtime.py +1 -1
- camel/societies/workforce/base.py +7 -3
- camel/societies/workforce/single_agent_worker.py +2 -1
- camel/societies/workforce/worker.py +5 -3
- camel/societies/workforce/workforce.py +65 -24
- camel/storages/__init__.py +2 -0
- camel/storages/vectordb_storages/__init__.py +2 -0
- camel/storages/vectordb_storages/faiss.py +712 -0
- camel/toolkits/__init__.py +4 -0
- camel/toolkits/async_browser_toolkit.py +75 -523
- camel/toolkits/bohrium_toolkit.py +318 -0
- camel/toolkits/browser_toolkit.py +215 -538
- camel/toolkits/browser_toolkit_commons.py +568 -0
- camel/toolkits/file_write_toolkit.py +80 -31
- camel/toolkits/mcp_toolkit.py +477 -665
- camel/toolkits/pptx_toolkit.py +777 -0
- camel/toolkits/wolfram_alpha_toolkit.py +5 -1
- camel/types/enums.py +13 -1
- camel/utils/__init__.py +2 -0
- camel/utils/commons.py +27 -0
- camel/utils/mcp_client.py +979 -0
- {camel_ai-0.2.60.dist-info → camel_ai-0.2.62.dist-info}/METADATA +14 -1
- {camel_ai-0.2.60.dist-info → camel_ai-0.2.62.dist-info}/RECORD +53 -47
- /camel/{data_collector → data_collectors}/__init__.py +0 -0
- /camel/{data_collector → data_collectors}/base.py +0 -0
- /camel/{runtime → runtimes}/__init__.py +0 -0
- /camel/{runtime → runtimes}/api.py +0 -0
- /camel/{runtime → runtimes}/base.py +0 -0
- /camel/{runtime → runtimes}/configs.py +0 -0
- /camel/{runtime → runtimes}/utils/__init__.py +0 -0
- /camel/{runtime → runtimes}/utils/function_risk_toolkit.py +0 -0
- /camel/{runtime → runtimes}/utils/ignore_risk_toolkit.py +0 -0
- {camel_ai-0.2.60.dist-info → camel_ai-0.2.62.dist-info}/WHEEL +0 -0
- {camel_ai-0.2.60.dist-info → camel_ai-0.2.62.dist-info}/licenses/LICENSE +0 -0
|
@@ -17,9 +17,7 @@ from __future__ import annotations
|
|
|
17
17
|
|
|
18
18
|
import datetime
|
|
19
19
|
import io
|
|
20
|
-
import json
|
|
21
20
|
import os
|
|
22
|
-
import random
|
|
23
21
|
import re
|
|
24
22
|
import shutil
|
|
25
23
|
import time
|
|
@@ -28,18 +26,16 @@ from copy import deepcopy
|
|
|
28
26
|
from typing import (
|
|
29
27
|
TYPE_CHECKING,
|
|
30
28
|
Any,
|
|
31
|
-
BinaryIO,
|
|
32
29
|
Dict,
|
|
33
30
|
List,
|
|
34
31
|
Literal,
|
|
35
32
|
Optional,
|
|
36
33
|
Tuple,
|
|
37
|
-
TypedDict,
|
|
38
34
|
Union,
|
|
39
35
|
cast,
|
|
40
36
|
)
|
|
41
37
|
|
|
42
|
-
from PIL import Image
|
|
38
|
+
from PIL import Image
|
|
43
39
|
|
|
44
40
|
from camel.logger import get_logger
|
|
45
41
|
from camel.messages import BaseMessage
|
|
@@ -54,7 +50,34 @@ from camel.utils import (
|
|
|
54
50
|
sanitize_filename,
|
|
55
51
|
)
|
|
56
52
|
|
|
53
|
+
# Import shared components from browser_toolkit_commons
|
|
54
|
+
from .browser_toolkit_commons import (
|
|
55
|
+
ACTION_WITH_FEEDBACK_LIST,
|
|
56
|
+
AVAILABLE_ACTIONS_PROMPT,
|
|
57
|
+
GET_FINAL_ANSWER_PROMPT_TEMPLATE,
|
|
58
|
+
OBSERVE_PROMPT_TEMPLATE,
|
|
59
|
+
PLANNING_AGENT_SYSTEM_PROMPT,
|
|
60
|
+
TASK_PLANNING_PROMPT_TEMPLATE,
|
|
61
|
+
TASK_REPLANNING_PROMPT_TEMPLATE,
|
|
62
|
+
WEB_AGENT_SYSTEM_PROMPT,
|
|
63
|
+
InteractiveRegion,
|
|
64
|
+
VisualViewport,
|
|
65
|
+
_add_set_of_mark,
|
|
66
|
+
_parse_json_output,
|
|
67
|
+
_reload_image,
|
|
68
|
+
interactive_region_from_dict,
|
|
69
|
+
visual_viewport_from_dict,
|
|
70
|
+
)
|
|
71
|
+
|
|
57
72
|
if TYPE_CHECKING:
|
|
73
|
+
from playwright.sync_api import (
|
|
74
|
+
Browser,
|
|
75
|
+
BrowserContext,
|
|
76
|
+
FloatRect,
|
|
77
|
+
Page,
|
|
78
|
+
Playwright,
|
|
79
|
+
)
|
|
80
|
+
|
|
58
81
|
from camel.agents import ChatAgent
|
|
59
82
|
|
|
60
83
|
logger = get_logger(__name__)
|
|
@@ -62,82 +85,6 @@ logger = get_logger(__name__)
|
|
|
62
85
|
TOP_NO_LABEL_ZONE = 20
|
|
63
86
|
|
|
64
87
|
|
|
65
|
-
AVAILABLE_ACTIONS_PROMPT = """
|
|
66
|
-
1. `fill_input_id(identifier: Union[str, int], text: str)`: Fill an input
|
|
67
|
-
field (e.g. search box) with the given text and press Enter.
|
|
68
|
-
2. `click_id(identifier: Union[str, int])`: Click an element with the given ID.
|
|
69
|
-
3. `hover_id(identifier: Union[str, int])`: Hover over an element with the
|
|
70
|
-
given ID.
|
|
71
|
-
4. `download_file_id(identifier: Union[str, int])`: Download a file with the
|
|
72
|
-
given ID. It returns the path to the downloaded file. If the file is
|
|
73
|
-
successfully downloaded, you can stop the simulation and report the path to
|
|
74
|
-
the downloaded file for further processing.
|
|
75
|
-
5. `scroll_to_bottom()`: Scroll to the bottom of the page.
|
|
76
|
-
6. `scroll_to_top()`: Scroll to the top of the page.
|
|
77
|
-
7. `scroll_up()`: Scroll up the page. It is suitable when you want to see the
|
|
78
|
-
elements above the current viewport.
|
|
79
|
-
8. `scroll_down()`: Scroll down the page. It is suitable when you want to see
|
|
80
|
-
the elements below the current viewport. If the webpage does not change, It
|
|
81
|
-
means that the webpage has scrolled to the bottom.
|
|
82
|
-
9. `back()`: Navigate back to the previous page. This is useful when you want
|
|
83
|
-
to go back to the previous page, as current page is not useful.
|
|
84
|
-
10. `stop()`: Stop the action process, because the task is completed or failed
|
|
85
|
-
(impossible to find the answer). In this situation, you should provide your
|
|
86
|
-
answer in your output.
|
|
87
|
-
11. `get_url()`: Get the current URL of the current page.
|
|
88
|
-
12. `find_text_on_page(search_text: str)`: Find the next given text on the
|
|
89
|
-
current whole page, and scroll the page to the targeted text. It is equivalent
|
|
90
|
-
to pressing Ctrl + F and searching for the text, and is powerful when you want
|
|
91
|
-
to fast-check whether the current page contains some specific text.
|
|
92
|
-
13. `visit_page(url: str)`: Go to the specific url page.
|
|
93
|
-
14. `click_blank_area()`: Click a blank area of the page to unfocus the
|
|
94
|
-
current element. It is useful when you have clicked an element but it cannot
|
|
95
|
-
unfocus itself (e.g. Menu bar) to automatically render the updated webpage.
|
|
96
|
-
15. `ask_question_about_video(question: str)`: Ask a question about the
|
|
97
|
-
current webpage which contains video, e.g. youtube websites.
|
|
98
|
-
"""
|
|
99
|
-
|
|
100
|
-
ACTION_WITH_FEEDBACK_LIST = [
|
|
101
|
-
'ask_question_about_video',
|
|
102
|
-
'download_file_id',
|
|
103
|
-
'find_text_on_page',
|
|
104
|
-
]
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
# Code from magentic-one
|
|
108
|
-
class DOMRectangle(TypedDict):
|
|
109
|
-
x: Union[int, float]
|
|
110
|
-
y: Union[int, float]
|
|
111
|
-
width: Union[int, float]
|
|
112
|
-
height: Union[int, float]
|
|
113
|
-
top: Union[int, float]
|
|
114
|
-
right: Union[int, float]
|
|
115
|
-
bottom: Union[int, float]
|
|
116
|
-
left: Union[int, float]
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
class VisualViewport(TypedDict):
|
|
120
|
-
height: Union[int, float]
|
|
121
|
-
width: Union[int, float]
|
|
122
|
-
offsetLeft: Union[int, float]
|
|
123
|
-
offsetTop: Union[int, float]
|
|
124
|
-
pageLeft: Union[int, float]
|
|
125
|
-
pageTop: Union[int, float]
|
|
126
|
-
scale: Union[int, float]
|
|
127
|
-
clientWidth: Union[int, float]
|
|
128
|
-
clientHeight: Union[int, float]
|
|
129
|
-
scrollWidth: Union[int, float]
|
|
130
|
-
scrollHeight: Union[int, float]
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
class InteractiveRegion(TypedDict):
|
|
134
|
-
tag_name: str
|
|
135
|
-
role: str
|
|
136
|
-
aria_name: str
|
|
137
|
-
v_scrollable: bool
|
|
138
|
-
rects: List[DOMRectangle]
|
|
139
|
-
|
|
140
|
-
|
|
141
88
|
def _get_str(d: Any, k: str) -> str:
|
|
142
89
|
r"""Safely retrieve a string value from a dictionary."""
|
|
143
90
|
if k not in d:
|
|
@@ -171,270 +118,6 @@ def _get_bool(d: Any, k: str) -> bool:
|
|
|
171
118
|
)
|
|
172
119
|
|
|
173
120
|
|
|
174
|
-
def _parse_json_output(text: str) -> Dict[str, Any]:
|
|
175
|
-
r"""Extract JSON output from a string."""
|
|
176
|
-
|
|
177
|
-
markdown_pattern = r'```(?:json)?\s*(.*?)\s*```'
|
|
178
|
-
markdown_match = re.search(markdown_pattern, text, re.DOTALL)
|
|
179
|
-
if markdown_match:
|
|
180
|
-
text = markdown_match.group(1).strip()
|
|
181
|
-
|
|
182
|
-
triple_quotes_pattern = r'"""(?:json)?\s*(.*?)\s*"""'
|
|
183
|
-
triple_quotes_match = re.search(triple_quotes_pattern, text, re.DOTALL)
|
|
184
|
-
if triple_quotes_match:
|
|
185
|
-
text = triple_quotes_match.group(1).strip()
|
|
186
|
-
|
|
187
|
-
try:
|
|
188
|
-
return json.loads(text)
|
|
189
|
-
except json.JSONDecodeError:
|
|
190
|
-
try:
|
|
191
|
-
fixed_text = re.sub(
|
|
192
|
-
r'`([^`]*?)`(?=\s*[:,\[\]{}]|$)', r'"\1"', text
|
|
193
|
-
)
|
|
194
|
-
return json.loads(fixed_text)
|
|
195
|
-
except json.JSONDecodeError:
|
|
196
|
-
result = {}
|
|
197
|
-
try:
|
|
198
|
-
bool_pattern = r'"(\w+)"\s*:\s*(true|false)'
|
|
199
|
-
for match in re.finditer(bool_pattern, text, re.IGNORECASE):
|
|
200
|
-
key, value = match.groups()
|
|
201
|
-
result[key] = value.lower() == "true"
|
|
202
|
-
|
|
203
|
-
str_pattern = r'"(\w+)"\s*:\s*"([^"]*)"'
|
|
204
|
-
for match in re.finditer(str_pattern, text):
|
|
205
|
-
key, value = match.groups()
|
|
206
|
-
result[key] = value
|
|
207
|
-
|
|
208
|
-
num_pattern = r'"(\w+)"\s*:\s*(-?\d+(?:\.\d+)?)'
|
|
209
|
-
for match in re.finditer(num_pattern, text):
|
|
210
|
-
key, value = match.groups()
|
|
211
|
-
try:
|
|
212
|
-
result[key] = int(value)
|
|
213
|
-
except ValueError:
|
|
214
|
-
result[key] = float(value)
|
|
215
|
-
|
|
216
|
-
empty_str_pattern = r'"(\w+)"\s*:\s*""'
|
|
217
|
-
for match in re.finditer(empty_str_pattern, text):
|
|
218
|
-
key = match.group(1)
|
|
219
|
-
result[key] = ""
|
|
220
|
-
|
|
221
|
-
if result:
|
|
222
|
-
return result
|
|
223
|
-
|
|
224
|
-
logger.warning(f"Failed to parse JSON output: {text}")
|
|
225
|
-
return {}
|
|
226
|
-
except Exception as e:
|
|
227
|
-
logger.warning(f"Error while extracting fields from JSON: {e}")
|
|
228
|
-
return {}
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
def _reload_image(image: Image.Image) -> Image.Image:
|
|
232
|
-
buffer = io.BytesIO()
|
|
233
|
-
image.save(buffer, format="PNG")
|
|
234
|
-
buffer.seek(0)
|
|
235
|
-
return Image.open(buffer)
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
def dom_rectangle_from_dict(rect: Dict[str, Any]) -> DOMRectangle:
|
|
239
|
-
r"""Create a DOMRectangle object from a dictionary."""
|
|
240
|
-
return DOMRectangle(
|
|
241
|
-
x=_get_number(rect, "x"),
|
|
242
|
-
y=_get_number(rect, "y"),
|
|
243
|
-
width=_get_number(rect, "width"),
|
|
244
|
-
height=_get_number(rect, "height"),
|
|
245
|
-
top=_get_number(rect, "top"),
|
|
246
|
-
right=_get_number(rect, "right"),
|
|
247
|
-
bottom=_get_number(rect, "bottom"),
|
|
248
|
-
left=_get_number(rect, "left"),
|
|
249
|
-
)
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
def interactive_region_from_dict(region: Dict[str, Any]) -> InteractiveRegion:
|
|
253
|
-
r"""Create an :class:`InteractiveRegion` object from a dictionary."""
|
|
254
|
-
typed_rects: List[DOMRectangle] = []
|
|
255
|
-
for rect in region["rects"]:
|
|
256
|
-
typed_rects.append(dom_rectangle_from_dict(rect))
|
|
257
|
-
|
|
258
|
-
return InteractiveRegion(
|
|
259
|
-
tag_name=_get_str(region, "tag_name"),
|
|
260
|
-
role=_get_str(region, "role"),
|
|
261
|
-
aria_name=_get_str(region, "aria-name"),
|
|
262
|
-
v_scrollable=_get_bool(region, "v-scrollable"),
|
|
263
|
-
rects=typed_rects,
|
|
264
|
-
)
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
def visual_viewport_from_dict(viewport: Dict[str, Any]) -> VisualViewport:
|
|
268
|
-
r"""Create a :class:`VisualViewport` object from a dictionary."""
|
|
269
|
-
return VisualViewport(
|
|
270
|
-
height=_get_number(viewport, "height"),
|
|
271
|
-
width=_get_number(viewport, "width"),
|
|
272
|
-
offsetLeft=_get_number(viewport, "offsetLeft"),
|
|
273
|
-
offsetTop=_get_number(viewport, "offsetTop"),
|
|
274
|
-
pageLeft=_get_number(viewport, "pageLeft"),
|
|
275
|
-
pageTop=_get_number(viewport, "pageTop"),
|
|
276
|
-
scale=_get_number(viewport, "scale"),
|
|
277
|
-
clientWidth=_get_number(viewport, "clientWidth"),
|
|
278
|
-
clientHeight=_get_number(viewport, "clientHeight"),
|
|
279
|
-
scrollWidth=_get_number(viewport, "scrollWidth"),
|
|
280
|
-
scrollHeight=_get_number(viewport, "scrollHeight"),
|
|
281
|
-
)
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
def add_set_of_mark(
|
|
285
|
-
screenshot: Union[bytes, Image.Image, io.BufferedIOBase],
|
|
286
|
-
ROIs: Dict[str, InteractiveRegion],
|
|
287
|
-
) -> Tuple[Image.Image, List[str], List[str], List[str]]:
|
|
288
|
-
if isinstance(screenshot, Image.Image):
|
|
289
|
-
return _add_set_of_mark(screenshot, ROIs)
|
|
290
|
-
|
|
291
|
-
if isinstance(screenshot, bytes):
|
|
292
|
-
screenshot = io.BytesIO(screenshot)
|
|
293
|
-
|
|
294
|
-
image = Image.open(cast(BinaryIO, screenshot))
|
|
295
|
-
comp, visible_rects, rects_above, rects_below = _add_set_of_mark(
|
|
296
|
-
image, ROIs
|
|
297
|
-
)
|
|
298
|
-
image.close()
|
|
299
|
-
return comp, visible_rects, rects_above, rects_below
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
def _add_set_of_mark(
|
|
303
|
-
screenshot: Image.Image, ROIs: Dict[str, InteractiveRegion]
|
|
304
|
-
) -> Tuple[Image.Image, List[str], List[str], List[str]]:
|
|
305
|
-
r"""Add a set of marks to the screenshot.
|
|
306
|
-
|
|
307
|
-
Args:
|
|
308
|
-
screenshot (Image.Image): The screenshot to add marks to.
|
|
309
|
-
ROIs (Dict[str, InteractiveRegion]): The regions to add marks to.
|
|
310
|
-
|
|
311
|
-
Returns:
|
|
312
|
-
Tuple[Image.Image, List[str], List[str], List[str]]: A tuple
|
|
313
|
-
containing the screenshot with marked ROIs, ROIs fully within the
|
|
314
|
-
images, ROIs located above the visible area, and ROIs located below
|
|
315
|
-
the visible area.
|
|
316
|
-
"""
|
|
317
|
-
visible_rects: List[str] = list()
|
|
318
|
-
rects_above: List[str] = list() # Scroll up to see
|
|
319
|
-
rects_below: List[str] = list() # Scroll down to see
|
|
320
|
-
|
|
321
|
-
fnt = ImageFont.load_default(14)
|
|
322
|
-
base = screenshot.convert("L").convert("RGBA")
|
|
323
|
-
overlay = Image.new("RGBA", base.size)
|
|
324
|
-
|
|
325
|
-
draw = ImageDraw.Draw(overlay)
|
|
326
|
-
for r in ROIs:
|
|
327
|
-
for rect in ROIs[r]["rects"]:
|
|
328
|
-
# Empty rectangles
|
|
329
|
-
if not rect or rect["width"] == 0 or rect["height"] == 0:
|
|
330
|
-
continue
|
|
331
|
-
|
|
332
|
-
# TODO: add scroll left and right?
|
|
333
|
-
horizontal_center = (rect["right"] + rect["left"]) / 2.0
|
|
334
|
-
vertical_center = (rect["top"] + rect["bottom"]) / 2.0
|
|
335
|
-
is_within_horizon = 0 <= horizontal_center < base.size[0]
|
|
336
|
-
is_above_viewport = vertical_center < 0
|
|
337
|
-
is_below_viewport = vertical_center >= base.size[1]
|
|
338
|
-
|
|
339
|
-
if is_within_horizon:
|
|
340
|
-
if is_above_viewport:
|
|
341
|
-
rects_above.append(r)
|
|
342
|
-
elif is_below_viewport:
|
|
343
|
-
rects_below.append(r)
|
|
344
|
-
else: # Fully visible
|
|
345
|
-
visible_rects.append(r)
|
|
346
|
-
_draw_roi(draw, int(r), fnt, rect)
|
|
347
|
-
|
|
348
|
-
comp = Image.alpha_composite(base, overlay)
|
|
349
|
-
overlay.close()
|
|
350
|
-
return comp, visible_rects, rects_above, rects_below
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
def _draw_roi(
|
|
354
|
-
draw: ImageDraw.ImageDraw,
|
|
355
|
-
idx: int,
|
|
356
|
-
font: ImageFont.FreeTypeFont | ImageFont.ImageFont,
|
|
357
|
-
rect: DOMRectangle,
|
|
358
|
-
) -> None:
|
|
359
|
-
r"""Draw a ROI on the image.
|
|
360
|
-
|
|
361
|
-
Args:
|
|
362
|
-
draw (ImageDraw.ImageDraw): The draw object.
|
|
363
|
-
idx (int): The index of the ROI.
|
|
364
|
-
font (ImageFont.FreeTypeFont | ImageFont.ImageFont): The font.
|
|
365
|
-
rect (DOMRectangle): The DOM rectangle.
|
|
366
|
-
"""
|
|
367
|
-
color = _get_random_color(idx)
|
|
368
|
-
text_color = _get_text_color(color)
|
|
369
|
-
|
|
370
|
-
roi = ((rect["left"], rect["top"]), (rect["right"], rect["bottom"]))
|
|
371
|
-
|
|
372
|
-
label_location = (rect["right"], rect["top"])
|
|
373
|
-
label_anchor = "rb"
|
|
374
|
-
|
|
375
|
-
if label_location[1] <= TOP_NO_LABEL_ZONE:
|
|
376
|
-
label_location = (rect["right"], rect["bottom"])
|
|
377
|
-
label_anchor = "rt"
|
|
378
|
-
|
|
379
|
-
draw.rectangle(
|
|
380
|
-
roi, outline=color, fill=(color[0], color[1], color[2], 48), width=2
|
|
381
|
-
)
|
|
382
|
-
|
|
383
|
-
bbox = draw.textbbox(
|
|
384
|
-
label_location,
|
|
385
|
-
str(idx),
|
|
386
|
-
font=font,
|
|
387
|
-
anchor=label_anchor,
|
|
388
|
-
align="center",
|
|
389
|
-
)
|
|
390
|
-
bbox = (bbox[0] - 3, bbox[1] - 3, bbox[2] + 3, bbox[3] + 3)
|
|
391
|
-
draw.rectangle(bbox, fill=color)
|
|
392
|
-
|
|
393
|
-
draw.text(
|
|
394
|
-
label_location,
|
|
395
|
-
str(idx),
|
|
396
|
-
fill=text_color,
|
|
397
|
-
font=font,
|
|
398
|
-
anchor=label_anchor,
|
|
399
|
-
align="center",
|
|
400
|
-
)
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
def _get_text_color(
|
|
404
|
-
bg_color: Tuple[int, int, int, int],
|
|
405
|
-
) -> Tuple[int, int, int, int]:
|
|
406
|
-
r"""Determine the ideal text color (black or white) for contrast.
|
|
407
|
-
|
|
408
|
-
Args:
|
|
409
|
-
bg_color: The background color (R, G, B, A).
|
|
410
|
-
|
|
411
|
-
Returns:
|
|
412
|
-
A tuple representing black or white color for text.
|
|
413
|
-
"""
|
|
414
|
-
luminance = bg_color[0] * 0.3 + bg_color[1] * 0.59 + bg_color[2] * 0.11
|
|
415
|
-
return (0, 0, 0, 255) if luminance > 120 else (255, 255, 255, 255)
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
def _get_random_color(identifier: int) -> Tuple[int, int, int, int]:
|
|
419
|
-
r"""Generate a consistent random RGBA color based on the identifier.
|
|
420
|
-
|
|
421
|
-
Args:
|
|
422
|
-
identifier: The ID used as a seed to ensure color consistency.
|
|
423
|
-
|
|
424
|
-
Returns:
|
|
425
|
-
A tuple representing (R, G, B, A) values.
|
|
426
|
-
"""
|
|
427
|
-
rnd = random.Random(int(identifier))
|
|
428
|
-
r = rnd.randint(0, 255)
|
|
429
|
-
g = rnd.randint(125, 255)
|
|
430
|
-
b = rnd.randint(0, 50)
|
|
431
|
-
color = [r, g, b]
|
|
432
|
-
# TODO: check why shuffle is needed?
|
|
433
|
-
rnd.shuffle(color)
|
|
434
|
-
color.append(255)
|
|
435
|
-
return cast(Tuple[int, int, int, int], tuple(color))
|
|
436
|
-
|
|
437
|
-
|
|
438
121
|
class BaseBrowser:
|
|
439
122
|
def __init__(
|
|
440
123
|
self,
|
|
@@ -453,7 +136,8 @@ class BaseBrowser:
|
|
|
453
136
|
"chromium".
|
|
454
137
|
cookie_json_path (Optional[str]): Path to a JSON file containing
|
|
455
138
|
authentication cookies and browser storage state. If provided
|
|
456
|
-
and the file exists, the browser will load this state to
|
|
139
|
+
and the file exists, the browser will load this state to
|
|
140
|
+
maintain
|
|
457
141
|
authenticated sessions without requiring manual login.
|
|
458
142
|
|
|
459
143
|
Returns:
|
|
@@ -463,12 +147,14 @@ class BaseBrowser:
|
|
|
463
147
|
sync_playwright,
|
|
464
148
|
)
|
|
465
149
|
|
|
466
|
-
self.history:
|
|
150
|
+
self.history: List[Any] = []
|
|
467
151
|
self.headless = headless
|
|
468
152
|
self.channel = channel
|
|
469
153
|
self._ensure_browser_installed()
|
|
470
|
-
self.playwright = sync_playwright().start()
|
|
471
|
-
self.page_history:
|
|
154
|
+
self.playwright: Playwright = sync_playwright().start()
|
|
155
|
+
self.page_history: List[
|
|
156
|
+
str
|
|
157
|
+
] = [] # stores the history of visited pages
|
|
472
158
|
self.cookie_json_path = cookie_json_path
|
|
473
159
|
|
|
474
160
|
# Set the cache directory
|
|
@@ -487,10 +173,18 @@ class BaseBrowser:
|
|
|
487
173
|
raise FileNotFoundError(
|
|
488
174
|
f"Page script file not found at path: {page_script_path}"
|
|
489
175
|
)
|
|
176
|
+
self.browser: Optional[Browser] = None
|
|
177
|
+
self.context: Optional[BrowserContext] = None
|
|
178
|
+
self.page: Optional[Page] = None
|
|
179
|
+
self.page_url: Optional[str] = None
|
|
180
|
+
self.web_agent_model: Optional[BaseModelBackend] = (
|
|
181
|
+
None # Added for type hinting
|
|
182
|
+
)
|
|
490
183
|
|
|
491
184
|
def init(self) -> None:
|
|
492
185
|
r"""Initialize the browser."""
|
|
493
186
|
# Launch the browser, if headless is False, the browser will display
|
|
187
|
+
assert self.playwright is not None
|
|
494
188
|
self.browser = self.playwright.chromium.launch(
|
|
495
189
|
headless=self.headless, channel=self.channel
|
|
496
190
|
)
|
|
@@ -498,6 +192,7 @@ class BaseBrowser:
|
|
|
498
192
|
# Check if cookie file exists before using it to maintain
|
|
499
193
|
# authenticated sessions. This prevents errors when the cookie file
|
|
500
194
|
# doesn't exist
|
|
195
|
+
assert self.browser is not None
|
|
501
196
|
if self.cookie_json_path and os.path.exists(self.cookie_json_path):
|
|
502
197
|
self.context = self.browser.new_context(
|
|
503
198
|
accept_downloads=True, storage_state=self.cookie_json_path
|
|
@@ -507,6 +202,7 @@ class BaseBrowser:
|
|
|
507
202
|
accept_downloads=True,
|
|
508
203
|
)
|
|
509
204
|
# Create a new page
|
|
205
|
+
assert self.context is not None
|
|
510
206
|
self.page = self.context.new_page()
|
|
511
207
|
|
|
512
208
|
def clean_cache(self) -> None:
|
|
@@ -517,7 +213,7 @@ class BaseBrowser:
|
|
|
517
213
|
def _wait_for_load(self, timeout: int = 20) -> None:
|
|
518
214
|
r"""Wait for a certain amount of time for the page to load."""
|
|
519
215
|
timeout_ms = timeout * 1000
|
|
520
|
-
|
|
216
|
+
assert self.page is not None
|
|
521
217
|
self.page.wait_for_load_state("load", timeout=timeout_ms)
|
|
522
218
|
|
|
523
219
|
# TODO: check if this is needed
|
|
@@ -525,13 +221,14 @@ class BaseBrowser:
|
|
|
525
221
|
|
|
526
222
|
def click_blank_area(self) -> None:
|
|
527
223
|
r"""Click a blank area of the page to unfocus the current element."""
|
|
224
|
+
assert self.page is not None
|
|
528
225
|
self.page.mouse.click(0, 0)
|
|
529
226
|
self._wait_for_load()
|
|
530
227
|
|
|
531
228
|
@retry_on_error()
|
|
532
229
|
def visit_page(self, url: str) -> None:
|
|
533
230
|
r"""Visit a page with the given URL."""
|
|
534
|
-
|
|
231
|
+
assert self.page is not None
|
|
535
232
|
self.page.goto(url)
|
|
536
233
|
self._wait_for_load()
|
|
537
234
|
self.page_url = url
|
|
@@ -548,7 +245,8 @@ class BaseBrowser:
|
|
|
548
245
|
"""
|
|
549
246
|
current_url = self.get_url()
|
|
550
247
|
|
|
551
|
-
# Confirm with user before proceeding due to potential slow
|
|
248
|
+
# Confirm with user before proceeding due to potential slow
|
|
249
|
+
# processing time
|
|
552
250
|
confirmation_message = (
|
|
553
251
|
f"Do you want to analyze the video on the current "
|
|
554
252
|
f"page({current_url})? This operation may take a long time.(y/n): "
|
|
@@ -559,7 +257,10 @@ class BaseBrowser:
|
|
|
559
257
|
return "User cancelled the video analysis."
|
|
560
258
|
|
|
561
259
|
model = None
|
|
562
|
-
if
|
|
260
|
+
if (
|
|
261
|
+
hasattr(self, 'web_agent_model')
|
|
262
|
+
and self.web_agent_model is not None
|
|
263
|
+
):
|
|
563
264
|
model = self.web_agent_model
|
|
564
265
|
|
|
565
266
|
video_analyzer = VideoAnalysisToolkit(model=model)
|
|
@@ -581,6 +282,7 @@ class BaseBrowser:
|
|
|
581
282
|
image and the path to the image file if saved, otherwise
|
|
582
283
|
:obj:`None`.
|
|
583
284
|
"""
|
|
285
|
+
assert self.page is not None
|
|
584
286
|
image_data = self.page.screenshot(timeout=60000)
|
|
585
287
|
image = Image.open(io.BytesIO(image_data))
|
|
586
288
|
|
|
@@ -588,6 +290,7 @@ class BaseBrowser:
|
|
|
588
290
|
if save_image:
|
|
589
291
|
# Get url name to form a file name
|
|
590
292
|
# Use urlparser for a safer extraction the url name
|
|
293
|
+
assert self.page_url is not None
|
|
591
294
|
parsed_url = urllib.parse.urlparse(self.page_url)
|
|
592
295
|
# Max length is set to 241 as there are 10 characters for the
|
|
593
296
|
# timestamp and 4 characters for the file extension:
|
|
@@ -615,17 +318,24 @@ class BaseBrowser:
|
|
|
615
318
|
Returns:
|
|
616
319
|
List[str]: A list of paths to the screenshot files.
|
|
617
320
|
"""
|
|
618
|
-
screenshots = []
|
|
619
|
-
|
|
321
|
+
screenshots: List[str] = [] # Ensure screenshots is typed
|
|
322
|
+
assert self.page is not None
|
|
323
|
+
scroll_height_eval = self.page.evaluate("document.body.scrollHeight")
|
|
324
|
+
scroll_height = cast(
|
|
325
|
+
float, scroll_height_eval
|
|
326
|
+
) # Ensure scroll_height is
|
|
327
|
+
# float
|
|
328
|
+
|
|
620
329
|
assert self.page.viewport_size is not None
|
|
621
330
|
viewport_height = self.page.viewport_size["height"]
|
|
622
|
-
|
|
623
|
-
|
|
331
|
+
current_scroll_eval = self.page.evaluate("window.scrollY")
|
|
332
|
+
current_scroll = cast(float, current_scroll_eval)
|
|
333
|
+
# screenshot_index = 1 # This variable is not used
|
|
624
334
|
|
|
625
335
|
max_height = scroll_height - viewport_height
|
|
626
336
|
scroll_step = int(viewport_height * scroll_ratio)
|
|
627
337
|
|
|
628
|
-
last_height = 0
|
|
338
|
+
last_height = 0.0 # Initialize last_height as float
|
|
629
339
|
|
|
630
340
|
while True:
|
|
631
341
|
logger.debug(
|
|
@@ -634,19 +344,22 @@ class BaseBrowser:
|
|
|
634
344
|
)
|
|
635
345
|
|
|
636
346
|
_, file_path = self.get_screenshot(save_image=True)
|
|
637
|
-
|
|
347
|
+
if file_path is not None: # Ensure file_path is not None before
|
|
348
|
+
# appending
|
|
349
|
+
screenshots.append(file_path)
|
|
638
350
|
|
|
639
351
|
self.page.evaluate(f"window.scrollBy(0, {scroll_step})")
|
|
640
352
|
# Allow time for content to load
|
|
641
353
|
time.sleep(0.5)
|
|
642
354
|
|
|
643
|
-
|
|
355
|
+
current_scroll_eval = self.page.evaluate("window.scrollY")
|
|
356
|
+
current_scroll = cast(float, current_scroll_eval)
|
|
644
357
|
# Break if there is no significant scroll
|
|
645
358
|
if abs(current_scroll - last_height) < viewport_height * 0.1:
|
|
646
359
|
break
|
|
647
360
|
|
|
648
361
|
last_height = current_scroll
|
|
649
|
-
screenshot_index += 1
|
|
362
|
+
# screenshot_index += 1 # This variable is not used
|
|
650
363
|
|
|
651
364
|
return screenshots
|
|
652
365
|
|
|
@@ -656,13 +369,17 @@ class BaseBrowser:
|
|
|
656
369
|
Returns:
|
|
657
370
|
VisualViewport: The visual viewport of the current page.
|
|
658
371
|
"""
|
|
372
|
+
assert self.page is not None
|
|
659
373
|
try:
|
|
660
374
|
self.page.evaluate(self.page_script)
|
|
661
375
|
except Exception as e:
|
|
662
376
|
logger.warning(f"Error evaluating page script: {e}")
|
|
663
377
|
|
|
378
|
+
visual_viewport_eval = self.page.evaluate(
|
|
379
|
+
"MultimodalWebSurfer.getVisualViewport();"
|
|
380
|
+
)
|
|
664
381
|
return visual_viewport_from_dict(
|
|
665
|
-
|
|
382
|
+
cast(Dict[str, Any], visual_viewport_eval)
|
|
666
383
|
)
|
|
667
384
|
|
|
668
385
|
def get_interactive_elements(self) -> Dict[str, InteractiveRegion]:
|
|
@@ -671,6 +388,7 @@ class BaseBrowser:
|
|
|
671
388
|
Returns:
|
|
672
389
|
Dict[str, InteractiveRegion]: A dictionary of interactive elements.
|
|
673
390
|
"""
|
|
391
|
+
assert self.page is not None
|
|
674
392
|
try:
|
|
675
393
|
self.page.evaluate(self.page_script)
|
|
676
394
|
except Exception as e:
|
|
@@ -685,7 +403,7 @@ class BaseBrowser:
|
|
|
685
403
|
for k in result:
|
|
686
404
|
typed_results[k] = interactive_region_from_dict(result[k])
|
|
687
405
|
|
|
688
|
-
return typed_results
|
|
406
|
+
return typed_results
|
|
689
407
|
|
|
690
408
|
def get_som_screenshot(
|
|
691
409
|
self,
|
|
@@ -699,7 +417,8 @@ class BaseBrowser:
|
|
|
699
417
|
directory.
|
|
700
418
|
|
|
701
419
|
Returns:
|
|
702
|
-
Tuple[Image.Image, Union[str, None]]: A tuple containing the
|
|
420
|
+
Tuple[Image.Image, Union[str, None]]: A tuple containing the
|
|
421
|
+
screenshot image
|
|
703
422
|
and an optional path to the image file if saved, otherwise
|
|
704
423
|
:obj:`None`.
|
|
705
424
|
"""
|
|
@@ -709,11 +428,12 @@ class BaseBrowser:
|
|
|
709
428
|
rects = self.get_interactive_elements()
|
|
710
429
|
|
|
711
430
|
file_path: str | None = None
|
|
712
|
-
comp, _, _, _ =
|
|
431
|
+
comp, _, _, _ = _add_set_of_mark(
|
|
713
432
|
screenshot,
|
|
714
|
-
rects,
|
|
433
|
+
rects,
|
|
715
434
|
)
|
|
716
435
|
if save_image:
|
|
436
|
+
assert self.page_url is not None
|
|
717
437
|
parsed_url = urllib.parse.urlparse(self.page_url)
|
|
718
438
|
# Max length is set to 241 as there are 10 characters for the
|
|
719
439
|
# timestamp and 4 characters for the file extension:
|
|
@@ -730,25 +450,30 @@ class BaseBrowser:
|
|
|
730
450
|
|
|
731
451
|
def scroll_up(self) -> None:
|
|
732
452
|
r"""Scroll up the page."""
|
|
453
|
+
assert self.page is not None
|
|
733
454
|
self.page.keyboard.press("PageUp")
|
|
734
455
|
|
|
735
456
|
def scroll_down(self) -> None:
|
|
736
457
|
r"""Scroll down the page."""
|
|
458
|
+
assert self.page is not None
|
|
737
459
|
self.page.keyboard.press("PageDown")
|
|
738
460
|
|
|
739
461
|
def get_url(self) -> str:
|
|
740
462
|
r"""Get the URL of the current page."""
|
|
463
|
+
assert self.page is not None
|
|
741
464
|
return self.page.url
|
|
742
465
|
|
|
743
466
|
def click_id(self, identifier: Union[str, int]) -> None:
|
|
744
467
|
r"""Click an element with the given identifier."""
|
|
468
|
+
assert self.page is not None
|
|
745
469
|
if isinstance(identifier, int):
|
|
746
470
|
identifier = str(identifier)
|
|
747
471
|
target = self.page.locator(f"[__elementId='{identifier}']")
|
|
748
472
|
|
|
749
473
|
try:
|
|
750
474
|
target.wait_for(timeout=5000)
|
|
751
|
-
except
|
|
475
|
+
except Exception as e: # Consider using playwright specific
|
|
476
|
+
# TimeoutError
|
|
752
477
|
logger.debug(f"Error during click operation: {e}")
|
|
753
478
|
raise ValueError("No such element.") from None
|
|
754
479
|
|
|
@@ -757,7 +482,13 @@ class BaseBrowser:
|
|
|
757
482
|
new_page = None
|
|
758
483
|
try:
|
|
759
484
|
with self.page.expect_event("popup", timeout=1000) as page_info:
|
|
760
|
-
box
|
|
485
|
+
box: Optional[FloatRect] = target.bounding_box()
|
|
486
|
+
if box is None:
|
|
487
|
+
logger.warning(
|
|
488
|
+
f"Bounding box not found for element '{identifier}'. "
|
|
489
|
+
f"Cannot click."
|
|
490
|
+
)
|
|
491
|
+
return
|
|
761
492
|
self.page.mouse.click(
|
|
762
493
|
box["x"] + box["width"] / 2, box["y"] + box["height"] / 2
|
|
763
494
|
)
|
|
@@ -768,7 +499,8 @@ class BaseBrowser:
|
|
|
768
499
|
self.page_history.append(deepcopy(self.page.url))
|
|
769
500
|
self.page = new_page
|
|
770
501
|
|
|
771
|
-
except
|
|
502
|
+
except Exception as e: # Consider using playwright specific
|
|
503
|
+
# TimeoutError
|
|
772
504
|
logger.debug(f"Error during click operation: {e}")
|
|
773
505
|
pass
|
|
774
506
|
|
|
@@ -776,6 +508,7 @@ class BaseBrowser:
|
|
|
776
508
|
|
|
777
509
|
def extract_url_content(self) -> str:
|
|
778
510
|
r"""Extract the content of the current page."""
|
|
511
|
+
assert self.page is not None
|
|
779
512
|
content = self.page.content()
|
|
780
513
|
return content
|
|
781
514
|
|
|
@@ -784,17 +517,17 @@ class BaseBrowser:
|
|
|
784
517
|
|
|
785
518
|
Args:
|
|
786
519
|
identifier (str): The identifier of the file to download.
|
|
787
|
-
file_path (str): The path to save the downloaded file.
|
|
788
520
|
|
|
789
521
|
Returns:
|
|
790
522
|
str: The result of the action.
|
|
791
523
|
"""
|
|
792
|
-
|
|
524
|
+
assert self.page is not None
|
|
793
525
|
if isinstance(identifier, int):
|
|
794
526
|
identifier = str(identifier)
|
|
795
527
|
try:
|
|
796
528
|
target = self.page.locator(f"[__elementId='{identifier}']")
|
|
797
|
-
except
|
|
529
|
+
except Exception as e: # Consider using playwright specific
|
|
530
|
+
# TimeoutError
|
|
798
531
|
logger.debug(f"Error during download operation: {e}")
|
|
799
532
|
logger.warning(
|
|
800
533
|
f"Element with identifier '{identifier}' not found."
|
|
@@ -803,7 +536,7 @@ class BaseBrowser:
|
|
|
803
536
|
|
|
804
537
|
target.scroll_into_view_if_needed()
|
|
805
538
|
|
|
806
|
-
|
|
539
|
+
file_path_val = os.path.join(self.cache_dir)
|
|
807
540
|
self._wait_for_load()
|
|
808
541
|
|
|
809
542
|
try:
|
|
@@ -812,12 +545,13 @@ class BaseBrowser:
|
|
|
812
545
|
download = download_info.value
|
|
813
546
|
file_name = download.suggested_filename
|
|
814
547
|
|
|
815
|
-
|
|
816
|
-
download.save_as(
|
|
548
|
+
file_path_val = os.path.join(file_path_val, file_name)
|
|
549
|
+
download.save_as(file_path_val)
|
|
817
550
|
|
|
818
|
-
return f"Downloaded file to path '{
|
|
551
|
+
return f"Downloaded file to path '{file_path_val}'."
|
|
819
552
|
|
|
820
|
-
except
|
|
553
|
+
except Exception as e: # Consider using playwright specific
|
|
554
|
+
# TimeoutError
|
|
821
555
|
logger.debug(f"Error during download operation: {e}")
|
|
822
556
|
return f"Failed to download file with identifier '{identifier}'."
|
|
823
557
|
|
|
@@ -831,12 +565,14 @@ class BaseBrowser:
|
|
|
831
565
|
Returns:
|
|
832
566
|
str: The result of the action.
|
|
833
567
|
"""
|
|
568
|
+
assert self.page is not None
|
|
834
569
|
if isinstance(identifier, int):
|
|
835
570
|
identifier = str(identifier)
|
|
836
571
|
|
|
837
572
|
try:
|
|
838
573
|
target = self.page.locator(f"[__elementId='{identifier}']")
|
|
839
|
-
except
|
|
574
|
+
except Exception as e: # Consider using playwright specific
|
|
575
|
+
# TimeoutError
|
|
840
576
|
logger.debug(f"Error during fill operation: {e}")
|
|
841
577
|
logger.warning(
|
|
842
578
|
f"Element with identifier '{identifier}' not found."
|
|
@@ -847,7 +583,8 @@ class BaseBrowser:
|
|
|
847
583
|
target.focus()
|
|
848
584
|
try:
|
|
849
585
|
target.fill(text)
|
|
850
|
-
except
|
|
586
|
+
except Exception as e: # Consider using playwright specific
|
|
587
|
+
# TimeoutError
|
|
851
588
|
logger.debug(f"Error during fill operation: {e}")
|
|
852
589
|
target.press_sequentially(text)
|
|
853
590
|
|
|
@@ -859,11 +596,13 @@ class BaseBrowser:
|
|
|
859
596
|
)
|
|
860
597
|
|
|
861
598
|
def scroll_to_bottom(self) -> str:
|
|
599
|
+
assert self.page is not None
|
|
862
600
|
self.page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
|
|
863
601
|
self._wait_for_load()
|
|
864
602
|
return "Scrolled to the bottom of the page."
|
|
865
603
|
|
|
866
604
|
def scroll_to_top(self) -> str:
|
|
605
|
+
assert self.page is not None
|
|
867
606
|
self.page.evaluate("window.scrollTo(0, 0);")
|
|
868
607
|
self._wait_for_load()
|
|
869
608
|
return "Scrolled to the top of the page."
|
|
@@ -877,11 +616,13 @@ class BaseBrowser:
|
|
|
877
616
|
Returns:
|
|
878
617
|
str: The result of the action.
|
|
879
618
|
"""
|
|
619
|
+
assert self.page is not None
|
|
880
620
|
if isinstance(identifier, int):
|
|
881
621
|
identifier = str(identifier)
|
|
882
622
|
try:
|
|
883
623
|
target = self.page.locator(f"[__elementId='{identifier}']")
|
|
884
|
-
except
|
|
624
|
+
except Exception as e: # Consider using playwright specific
|
|
625
|
+
# TimeoutError
|
|
885
626
|
logger.debug(f"Error during hover operation: {e}")
|
|
886
627
|
logger.warning(
|
|
887
628
|
f"Element with identifier '{identifier}' not found."
|
|
@@ -899,15 +640,18 @@ class BaseBrowser:
|
|
|
899
640
|
the text.
|
|
900
641
|
"""
|
|
901
642
|
# ruff: noqa: E501
|
|
643
|
+
assert self.page is not None
|
|
902
644
|
script = f"""
|
|
903
|
-
(function() {{
|
|
645
|
+
(function() {{
|
|
904
646
|
let text = "{search_text}";
|
|
905
647
|
let found = window.find(text);
|
|
906
648
|
if (!found) {{
|
|
907
|
-
let elements = document.querySelectorAll("*:not(script):not(
|
|
649
|
+
let elements = document.querySelectorAll("*:not(script):not(
|
|
650
|
+
style)");
|
|
908
651
|
for (let el of elements) {{
|
|
909
652
|
if (el.innerText && el.innerText.includes(text)) {{
|
|
910
|
-
el.scrollIntoView({{behavior: "smooth", block:
|
|
653
|
+
el.scrollIntoView({{behavior: "smooth", block:
|
|
654
|
+
"center"}});
|
|
911
655
|
el.style.backgroundColor = "yellow";
|
|
912
656
|
el.style.border = '2px solid red';
|
|
913
657
|
return true;
|
|
@@ -918,7 +662,8 @@ class BaseBrowser:
|
|
|
918
662
|
return true;
|
|
919
663
|
}})();
|
|
920
664
|
"""
|
|
921
|
-
|
|
665
|
+
found_eval = self.page.evaluate(script)
|
|
666
|
+
found = cast(bool, found_eval) # Ensure found is bool
|
|
922
667
|
self._wait_for_load()
|
|
923
668
|
if found:
|
|
924
669
|
return f"Found text '{search_text}' on the page."
|
|
@@ -927,7 +672,7 @@ class BaseBrowser:
|
|
|
927
672
|
|
|
928
673
|
def back(self):
|
|
929
674
|
r"""Navigate back to the previous page."""
|
|
930
|
-
|
|
675
|
+
assert self.page is not None
|
|
931
676
|
page_url_before = self.page.url
|
|
932
677
|
self.page.go_back()
|
|
933
678
|
|
|
@@ -945,15 +690,21 @@ class BaseBrowser:
|
|
|
945
690
|
self._wait_for_load()
|
|
946
691
|
|
|
947
692
|
def close(self):
|
|
693
|
+
assert self.browser is not None
|
|
948
694
|
self.browser.close()
|
|
695
|
+
if self.playwright:
|
|
696
|
+
self.playwright.stop() # Stop playwright instance
|
|
949
697
|
|
|
950
698
|
# ruff: noqa: E501
|
|
951
699
|
def show_interactive_elements(self):
|
|
952
700
|
r"""Show simple interactive elements on the current page."""
|
|
701
|
+
assert self.page is not None
|
|
953
702
|
self.page.evaluate(self.page_script)
|
|
954
703
|
self.page.evaluate("""
|
|
955
704
|
() => {
|
|
956
|
-
document.querySelectorAll('a, button, input, select, textarea,
|
|
705
|
+
document.querySelectorAll('a, button, input, select, textarea,
|
|
706
|
+
[tabindex]:not([tabindex="-1"]),
|
|
707
|
+
[contenteditable="true"]').forEach(el => {
|
|
957
708
|
el.style.border = '2px solid red';
|
|
958
709
|
});
|
|
959
710
|
}
|
|
@@ -963,6 +714,7 @@ class BaseBrowser:
|
|
|
963
714
|
def get_webpage_content(self) -> str:
|
|
964
715
|
from html2text import html2text
|
|
965
716
|
|
|
717
|
+
assert self.page is not None
|
|
966
718
|
self._wait_for_load()
|
|
967
719
|
html_content = self.page.content()
|
|
968
720
|
|
|
@@ -1048,25 +800,32 @@ class BrowserToolkit(BaseToolkit):
|
|
|
1048
800
|
(default: :obj:`"en`")
|
|
1049
801
|
cookie_json_path (Optional[str]): Path to a JSON file containing
|
|
1050
802
|
authentication cookies and browser storage state. If provided
|
|
1051
|
-
and the file exists, the browser will load this state to
|
|
803
|
+
and the file exists, the browser will load this state to
|
|
804
|
+
maintain
|
|
1052
805
|
authenticated sessions without requiring manual login.
|
|
1053
806
|
(default: :obj:`None`)
|
|
1054
807
|
"""
|
|
1055
|
-
|
|
808
|
+
super().__init__() # Call to super().__init__() added
|
|
1056
809
|
self.browser = BaseBrowser(
|
|
1057
810
|
headless=headless,
|
|
1058
811
|
cache_dir=cache_dir,
|
|
1059
812
|
channel=channel,
|
|
1060
813
|
cookie_json_path=cookie_json_path,
|
|
1061
814
|
)
|
|
815
|
+
self.browser.web_agent_model = web_agent_model # Pass model to
|
|
816
|
+
# BaseBrowser instance
|
|
1062
817
|
|
|
1063
818
|
self.history_window = history_window
|
|
1064
819
|
self.web_agent_model = web_agent_model
|
|
1065
820
|
self.planning_agent_model = planning_agent_model
|
|
1066
821
|
self.output_language = output_language
|
|
1067
822
|
|
|
1068
|
-
self.history:
|
|
1069
|
-
self.web_agent
|
|
823
|
+
self.history: List[Dict[str, Any]] = [] # Typed history list
|
|
824
|
+
self.web_agent: ChatAgent
|
|
825
|
+
self.planning_agent: ChatAgent
|
|
826
|
+
self.web_agent, self.planning_agent = self._initialize_agent(
|
|
827
|
+
web_agent_model, planning_agent_model
|
|
828
|
+
)
|
|
1070
829
|
|
|
1071
830
|
def _reset(self):
|
|
1072
831
|
self.web_agent.reset()
|
|
@@ -1074,43 +833,40 @@ class BrowserToolkit(BaseToolkit):
|
|
|
1074
833
|
self.history = []
|
|
1075
834
|
os.makedirs(self.browser.cache_dir, exist_ok=True)
|
|
1076
835
|
|
|
1077
|
-
def _initialize_agent(
|
|
836
|
+
def _initialize_agent(
|
|
837
|
+
self,
|
|
838
|
+
web_agent_model_backend: Optional[BaseModelBackend],
|
|
839
|
+
planning_agent_model_backend: Optional[BaseModelBackend],
|
|
840
|
+
) -> Tuple[ChatAgent, ChatAgent]:
|
|
1078
841
|
r"""Initialize the agent."""
|
|
1079
842
|
from camel.agents import ChatAgent
|
|
1080
843
|
|
|
1081
|
-
if
|
|
1082
|
-
|
|
844
|
+
if web_agent_model_backend is None:
|
|
845
|
+
web_agent_model_instance = ModelFactory.create(
|
|
1083
846
|
model_platform=ModelPlatformType.OPENAI,
|
|
1084
847
|
model_type=ModelType.GPT_4_1,
|
|
1085
848
|
model_config_dict={"temperature": 0, "top_p": 1},
|
|
1086
849
|
)
|
|
1087
850
|
else:
|
|
1088
|
-
|
|
851
|
+
web_agent_model_instance = web_agent_model_backend
|
|
1089
852
|
|
|
1090
|
-
if
|
|
853
|
+
if planning_agent_model_backend is None:
|
|
1091
854
|
planning_model = ModelFactory.create(
|
|
1092
855
|
model_platform=ModelPlatformType.OPENAI,
|
|
1093
856
|
model_type=ModelType.O3_MINI,
|
|
1094
857
|
)
|
|
1095
858
|
else:
|
|
1096
|
-
planning_model =
|
|
859
|
+
planning_model = planning_agent_model_backend
|
|
1097
860
|
|
|
1098
|
-
system_prompt =
|
|
1099
|
-
You are a helpful web agent that can assist users in browsing the web.
|
|
1100
|
-
Given a high-level task, you can leverage predefined browser tools to help
|
|
1101
|
-
users achieve their goals.
|
|
1102
|
-
"""
|
|
861
|
+
system_prompt = WEB_AGENT_SYSTEM_PROMPT
|
|
1103
862
|
|
|
1104
863
|
web_agent = ChatAgent(
|
|
1105
864
|
system_message=system_prompt,
|
|
1106
|
-
model=
|
|
865
|
+
model=web_agent_model_instance,
|
|
1107
866
|
output_language=self.output_language,
|
|
1108
867
|
)
|
|
1109
868
|
|
|
1110
|
-
planning_system_prompt =
|
|
1111
|
-
You are a helpful planning agent that can assist users in planning complex
|
|
1112
|
-
tasks which need multi-step browser interaction.
|
|
1113
|
-
"""
|
|
869
|
+
planning_system_prompt = PLANNING_AGENT_SYSTEM_PROMPT
|
|
1114
870
|
|
|
1115
871
|
planning_agent = ChatAgent(
|
|
1116
872
|
system_message=planning_system_prompt,
|
|
@@ -1123,96 +879,24 @@ tasks which need multi-step browser interaction.
|
|
|
1123
879
|
def _observe(
|
|
1124
880
|
self, task_prompt: str, detailed_plan: Optional[str] = None
|
|
1125
881
|
) -> Tuple[str, str, str]:
|
|
1126
|
-
r"""Let agent observe the current environment, and get the next
|
|
882
|
+
r"""Let agent observe the current environment, and get the next
|
|
883
|
+
action."""
|
|
1127
884
|
|
|
1128
|
-
|
|
885
|
+
detailed_plan_prompt_str = ""
|
|
1129
886
|
|
|
1130
887
|
if detailed_plan is not None:
|
|
1131
|
-
|
|
888
|
+
detailed_plan_prompt_str = f"""
|
|
1132
889
|
Here is a plan about how to solve the task step-by-step which you must follow:
|
|
1133
890
|
<detailed_plan>{detailed_plan}<detailed_plan>
|
|
1134
891
|
"""
|
|
1135
892
|
|
|
1136
|
-
observe_prompt =
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
the browser, and provide the next appropriate action to take.
|
|
1144
|
-
|
|
1145
|
-
{detailed_plan_prompt}
|
|
1146
|
-
|
|
1147
|
-
Here are the current available browser functions you can use:
|
|
1148
|
-
{AVAILABLE_ACTIONS_PROMPT}
|
|
1149
|
-
|
|
1150
|
-
Here are the latest {self.history_window} trajectory (at most) you have taken:
|
|
1151
|
-
<history>
|
|
1152
|
-
{self.history[-self.history_window :]}
|
|
1153
|
-
</history>
|
|
1154
|
-
|
|
1155
|
-
Your output should be in json format, including the following fields:
|
|
1156
|
-
- `observation`: The detailed image description about the current viewport. Do
|
|
1157
|
-
not over-confident about the correctness of the history actions. You should
|
|
1158
|
-
always check the current viewport to make sure the correctness of the next
|
|
1159
|
-
action.
|
|
1160
|
-
- `reasoning`: The reasoning about the next action you want to take, and the
|
|
1161
|
-
possible obstacles you may encounter, and how to solve them. Do not forget to
|
|
1162
|
-
check the history actions to avoid the same mistakes.
|
|
1163
|
-
- `action_code`: The action code you want to take. It is only one step action
|
|
1164
|
-
code, without any other texts (such as annotation)
|
|
1165
|
-
|
|
1166
|
-
Here is two example of the output:
|
|
1167
|
-
```json
|
|
1168
|
-
{{
|
|
1169
|
-
"observation": [IMAGE_DESCRIPTION],
|
|
1170
|
-
"reasoning": [YOUR_REASONING],
|
|
1171
|
-
"action_code": "fill_input_id([ID], [TEXT])"
|
|
1172
|
-
}}
|
|
1173
|
-
|
|
1174
|
-
{{
|
|
1175
|
-
"observation": "The current page is a CAPTCHA verification page on Amazon. It asks the user to ..",
|
|
1176
|
-
"reasoning": "To proceed with the task of searching for products, I need to complete..",
|
|
1177
|
-
"action_code": "fill_input_id(3, 'AUXPMR')"
|
|
1178
|
-
}}
|
|
1179
|
-
|
|
1180
|
-
Here are some tips for you:
|
|
1181
|
-
- Never forget the overall question: **{task_prompt}**
|
|
1182
|
-
- Maybe after a certain operation (e.g. click_id), the page content has not
|
|
1183
|
-
changed. You can check whether the action step is successful by looking at the
|
|
1184
|
-
`success` of the action step in the history. If successful, it means that the
|
|
1185
|
-
page content is indeed the same after the click. You need to try other methods.
|
|
1186
|
-
- If using one way to solve the problem is not successful, try other ways.
|
|
1187
|
-
Make sure your provided ID is correct!
|
|
1188
|
-
- Some cases are very complex and need to be achieve by an iterative process.
|
|
1189
|
-
You can use the `back()` function to go back to the previous page to try other
|
|
1190
|
-
methods.
|
|
1191
|
-
- There are many links on the page, which may be useful for solving the
|
|
1192
|
-
problem. You can use the `click_id()` function to click on the link to see if
|
|
1193
|
-
it is useful.
|
|
1194
|
-
- Always keep in mind that your action must be based on the ID shown in the
|
|
1195
|
-
current image or viewport, not the ID shown in the history.
|
|
1196
|
-
- Do not use `stop()` lightly. Always remind yourself that the image only
|
|
1197
|
-
shows a part of the full page. If you cannot find the answer, try to use
|
|
1198
|
-
functions like `scroll_up()` and `scroll_down()` to check the full content of
|
|
1199
|
-
the webpage before doing anything else, because the answer or next key step
|
|
1200
|
-
may be hidden in the content below.
|
|
1201
|
-
- If the webpage needs human verification, you must avoid processing it.
|
|
1202
|
-
Please use `back()` to go back to the previous page, and try other ways.
|
|
1203
|
-
- If you have tried everything and still cannot resolve the issue, please stop
|
|
1204
|
-
the simulation, and report issues you have encountered.
|
|
1205
|
-
- Check the history actions carefully, detect whether you have repeatedly made
|
|
1206
|
-
the same actions or not.
|
|
1207
|
-
- When dealing with wikipedia revision history related tasks, you need to
|
|
1208
|
-
think about the solution flexibly. First, adjust the browsing history
|
|
1209
|
-
displayed on a single page to the maximum, and then make use of the
|
|
1210
|
-
find_text_on_page function. This is extremely useful which can quickly locate
|
|
1211
|
-
the text you want to find and skip massive amount of useless information.
|
|
1212
|
-
- Flexibly use interactive elements like slide down selection bar to filter
|
|
1213
|
-
out the information you need. Sometimes they are extremely useful.
|
|
1214
|
-
```
|
|
1215
|
-
"""
|
|
893
|
+
observe_prompt = OBSERVE_PROMPT_TEMPLATE.format(
|
|
894
|
+
task_prompt=task_prompt,
|
|
895
|
+
detailed_plan_prompt=detailed_plan_prompt_str,
|
|
896
|
+
AVAILABLE_ACTIONS_PROMPT=AVAILABLE_ACTIONS_PROMPT,
|
|
897
|
+
history_window=self.history_window,
|
|
898
|
+
history=self.history[-self.history_window :],
|
|
899
|
+
)
|
|
1216
900
|
|
|
1217
901
|
# get current state
|
|
1218
902
|
som_screenshot, _ = self.browser.get_som_screenshot(save_image=True)
|
|
@@ -1226,7 +910,8 @@ out the information you need. Sometimes they are extremely useful.
|
|
|
1226
910
|
|
|
1227
911
|
resp_content = resp.msgs[0].content
|
|
1228
912
|
|
|
1229
|
-
resp_dict = _parse_json_output(resp_content)
|
|
913
|
+
resp_dict = _parse_json_output(resp_content, logger) # Pass logger to
|
|
914
|
+
# _parse_json_output
|
|
1230
915
|
observation_result: str = resp_dict.get("observation", "")
|
|
1231
916
|
reasoning_result: str = resp_dict.get("reasoning", "")
|
|
1232
917
|
action_code: str = resp_dict.get("action_code", "")
|
|
@@ -1247,7 +932,10 @@ out the information you need. Sometimes they are extremely useful.
|
|
|
1247
932
|
id_part = (
|
|
1248
933
|
parts[0].replace("fill_input_id(", "").strip()
|
|
1249
934
|
)
|
|
1250
|
-
action_code =
|
|
935
|
+
action_code = (
|
|
936
|
+
f"fill_input_id({id_part}, 'Please "
|
|
937
|
+
f"fill the text here.')"
|
|
938
|
+
)
|
|
1251
939
|
|
|
1252
940
|
action_code = action_code.replace("`", "").strip()
|
|
1253
941
|
|
|
@@ -1349,43 +1037,36 @@ out the information you need. Sometimes they are extremely useful.
|
|
|
1349
1037
|
)
|
|
1350
1038
|
|
|
1351
1039
|
def _get_final_answer(self, task_prompt: str) -> str:
|
|
1352
|
-
r"""Get the final answer based on the task prompt and current
|
|
1353
|
-
|
|
1040
|
+
r"""Get the final answer based on the task prompt and current
|
|
1041
|
+
browser state.
|
|
1042
|
+
It is used when the agent thinks that the task can be completed
|
|
1043
|
+
without any further action, and answer can be directly found in the
|
|
1044
|
+
current viewport.
|
|
1354
1045
|
"""
|
|
1355
1046
|
|
|
1356
|
-
prompt =
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
<history>{self.history}</history>
|
|
1360
|
-
Please find the final answer, or give valuable insights and founds (e.g. if previous actions contain downloading files, your output should include the path of the downloaded file) about the overall task: <task>{task_prompt}</task>
|
|
1361
|
-
"""
|
|
1047
|
+
prompt = GET_FINAL_ANSWER_PROMPT_TEMPLATE.format(
|
|
1048
|
+
history=self.history, task_prompt=task_prompt
|
|
1049
|
+
)
|
|
1362
1050
|
|
|
1363
1051
|
message = BaseMessage.make_user_message(
|
|
1364
1052
|
role_name='user',
|
|
1365
1053
|
content=prompt,
|
|
1366
1054
|
)
|
|
1367
|
-
|
|
1055
|
+
self.web_agent.reset() # Reset before step
|
|
1368
1056
|
resp = self.web_agent.step(message)
|
|
1369
1057
|
return resp.msgs[0].content
|
|
1370
1058
|
|
|
1371
1059
|
def _task_planning(self, task_prompt: str, start_url: str) -> str:
|
|
1372
1060
|
r"""Plan the task based on the given task prompt."""
|
|
1373
1061
|
|
|
1374
|
-
|
|
1375
|
-
|
|
1376
|
-
|
|
1377
|
-
<task>{task_prompt}</task>
|
|
1378
|
-
According to the problem above, if we use browser interaction, what is the general process of the interaction after visiting the webpage `{start_url}`?
|
|
1379
|
-
|
|
1380
|
-
Please note that it can be viewed as Partially Observable MDP. Do not over-confident about your plan.
|
|
1381
|
-
Please first restate the task in detail, and then provide a detailed plan to solve the task.
|
|
1382
|
-
"""
|
|
1383
|
-
# Here are some tips for you: Please note that we can only see a part of the full page because of the limited viewport after an action. Thus, do not forget to use methods like `scroll_up()` and `scroll_down()` to check the full content of the webpage, because the answer or next key step may be hidden in the content below.
|
|
1062
|
+
planning_prompt = TASK_PLANNING_PROMPT_TEMPLATE.format(
|
|
1063
|
+
task_prompt=task_prompt, start_url=start_url
|
|
1064
|
+
)
|
|
1384
1065
|
|
|
1385
1066
|
message = BaseMessage.make_user_message(
|
|
1386
1067
|
role_name='user', content=planning_prompt
|
|
1387
1068
|
)
|
|
1388
|
-
|
|
1069
|
+
self.planning_agent.reset() # Reset before step
|
|
1389
1070
|
resp = self.planning_agent.step(message)
|
|
1390
1071
|
return resp.msgs[0].content
|
|
1391
1072
|
|
|
@@ -1399,35 +1080,26 @@ Please first restate the task in detail, and then provide a detailed plan to sol
|
|
|
1399
1080
|
detailed_plan (str): The detailed plan to replan.
|
|
1400
1081
|
|
|
1401
1082
|
Returns:
|
|
1402
|
-
Tuple[bool, str]: A tuple containing a boolean indicating
|
|
1083
|
+
Tuple[bool, str]: A tuple containing a boolean indicating
|
|
1084
|
+
whether the task needs to be replanned, and the replanned schema.
|
|
1403
1085
|
"""
|
|
1404
1086
|
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
In order to solve the task, we made a detailed plan previously. Here is the detailed plan:
|
|
1412
|
-
<detailed plan>{detailed_plan}</detailed plan>
|
|
1413
|
-
|
|
1414
|
-
According to the task above, we have made a series of observations, reasonings, and actions. Here are the latest {self.history_window} trajectory (at most) we have taken:
|
|
1415
|
-
<history>{self.history[-self.history_window :]}</history>
|
|
1416
|
-
|
|
1417
|
-
However, the task is not completed yet. As the task is partially observable, we may need to replan the task based on the current state of the browser if necessary.
|
|
1418
|
-
Now please carefully examine the current task planning schema, and our history actions, and then judge whether the task needs to be fundamentally replanned. If so, please provide a detailed replanned schema (including the restated overall task).
|
|
1419
|
-
|
|
1420
|
-
Your output should be in json format, including the following fields:
|
|
1421
|
-
- `if_need_replan`: bool, A boolean value indicating whether the task needs to be fundamentally replanned.
|
|
1422
|
-
- `replanned_schema`: str, The replanned schema for the task, which should not be changed too much compared with the original one. If the task does not need to be replanned, the value should be an empty string.
|
|
1423
|
-
"""
|
|
1087
|
+
replanning_prompt = TASK_REPLANNING_PROMPT_TEMPLATE.format(
|
|
1088
|
+
task_prompt=task_prompt,
|
|
1089
|
+
detailed_plan=detailed_plan,
|
|
1090
|
+
history_window=self.history_window,
|
|
1091
|
+
history=self.history[-self.history_window :],
|
|
1092
|
+
)
|
|
1424
1093
|
# Reset the history message of planning_agent.
|
|
1425
1094
|
self.planning_agent.reset()
|
|
1426
1095
|
resp = self.planning_agent.step(replanning_prompt)
|
|
1427
|
-
resp_dict = _parse_json_output(
|
|
1096
|
+
resp_dict = _parse_json_output(
|
|
1097
|
+
resp.msgs[0].content, logger
|
|
1098
|
+
) # Pass logger
|
|
1428
1099
|
|
|
1429
|
-
|
|
1430
|
-
|
|
1100
|
+
if_need_replan_eval = resp_dict.get("if_need_replan", False)
|
|
1101
|
+
if_need_replan = cast(bool, if_need_replan_eval) # Ensure bool
|
|
1102
|
+
replanned_schema: str = resp_dict.get("replanned_schema", "")
|
|
1431
1103
|
|
|
1432
1104
|
if if_need_replan:
|
|
1433
1105
|
return True, replanned_schema
|
|
@@ -1466,10 +1138,10 @@ Your output should be in json format, including the following fields:
|
|
|
1466
1138
|
logger.debug(f"Observation: {observation}")
|
|
1467
1139
|
logger.debug(f"Reasoning: {reasoning}")
|
|
1468
1140
|
logger.debug(f"Action code: {action_code}")
|
|
1469
|
-
|
|
1141
|
+
trajectory_info: Dict[str, Any]
|
|
1470
1142
|
if "stop" in action_code:
|
|
1471
1143
|
task_completed = True
|
|
1472
|
-
trajectory_info = {
|
|
1144
|
+
trajectory_info = { # Typed trajectory_info
|
|
1473
1145
|
"round": i,
|
|
1474
1146
|
"observation": observation,
|
|
1475
1147
|
"thought": reasoning,
|
|
@@ -1486,7 +1158,7 @@ Your output should be in json format, including the following fields:
|
|
|
1486
1158
|
if not success:
|
|
1487
1159
|
logger.warning(f"Error while executing the action: {info}")
|
|
1488
1160
|
|
|
1489
|
-
trajectory_info = {
|
|
1161
|
+
trajectory_info = { # Typed trajectory_info
|
|
1490
1162
|
"round": i,
|
|
1491
1163
|
"observation": observation,
|
|
1492
1164
|
"thought": reasoning,
|
|
@@ -1505,15 +1177,20 @@ Your output should be in json format, including the following fields:
|
|
|
1505
1177
|
detailed_plan = replanned_schema
|
|
1506
1178
|
logger.debug(f"Replanned schema: {replanned_schema}")
|
|
1507
1179
|
|
|
1180
|
+
simulation_result: str
|
|
1508
1181
|
if not task_completed:
|
|
1509
1182
|
simulation_result = f"""
|
|
1510
|
-
The task is not completed within the round limit. Please
|
|
1183
|
+
The task is not completed within the round limit. Please
|
|
1184
|
+
check the last round {self.history_window} information to
|
|
1185
|
+
see if there is any useful information:
|
|
1511
1186
|
<history>{self.history[-self.history_window :]}</history>
|
|
1512
1187
|
"""
|
|
1513
1188
|
|
|
1514
1189
|
else:
|
|
1515
1190
|
simulation_result = self._get_final_answer(task_prompt)
|
|
1516
1191
|
|
|
1192
|
+
self.browser.close() # Close browser after task completion or limit
|
|
1193
|
+
# reached
|
|
1517
1194
|
return simulation_result
|
|
1518
1195
|
|
|
1519
1196
|
def get_tools(self) -> List[FunctionTool]:
|