camel-ai 0.2.59__py3-none-any.whl → 0.2.61__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of camel-ai might be problematic. Click here for more details.
- camel/__init__.py +1 -1
- camel/agents/chat_agent.py +158 -7
- camel/configs/anthropic_config.py +6 -5
- camel/configs/cohere_config.py +1 -1
- camel/configs/mistral_config.py +1 -1
- camel/configs/openai_config.py +3 -0
- camel/configs/reka_config.py +1 -1
- camel/configs/samba_config.py +2 -2
- camel/datagen/cot_datagen.py +29 -34
- camel/datagen/evol_instruct/scorer.py +22 -23
- camel/datagen/evol_instruct/templates.py +46 -46
- camel/datasets/static_dataset.py +144 -0
- camel/embeddings/jina_embedding.py +8 -1
- camel/embeddings/sentence_transformers_embeddings.py +2 -2
- camel/embeddings/vlm_embedding.py +9 -2
- camel/loaders/__init__.py +5 -2
- camel/loaders/chunkr_reader.py +117 -91
- camel/loaders/mistral_reader.py +148 -0
- camel/memories/blocks/chat_history_block.py +1 -2
- camel/memories/records.py +3 -0
- camel/messages/base.py +15 -3
- camel/models/azure_openai_model.py +1 -0
- camel/models/model_factory.py +2 -2
- camel/models/model_manager.py +7 -3
- camel/retrievers/bm25_retriever.py +1 -2
- camel/retrievers/hybrid_retrival.py +2 -2
- camel/societies/workforce/workforce.py +65 -24
- camel/storages/__init__.py +2 -0
- camel/storages/vectordb_storages/__init__.py +2 -0
- camel/storages/vectordb_storages/faiss.py +712 -0
- camel/storages/vectordb_storages/oceanbase.py +1 -2
- camel/toolkits/__init__.py +2 -0
- camel/toolkits/async_browser_toolkit.py +80 -524
- camel/toolkits/bohrium_toolkit.py +318 -0
- camel/toolkits/browser_toolkit.py +221 -541
- camel/toolkits/browser_toolkit_commons.py +568 -0
- camel/toolkits/dalle_toolkit.py +4 -0
- camel/toolkits/excel_toolkit.py +8 -2
- camel/toolkits/file_write_toolkit.py +76 -29
- camel/toolkits/github_toolkit.py +43 -25
- camel/toolkits/image_analysis_toolkit.py +3 -0
- camel/toolkits/jina_reranker_toolkit.py +194 -77
- camel/toolkits/mcp_toolkit.py +134 -16
- camel/toolkits/page_script.js +40 -28
- camel/toolkits/twitter_toolkit.py +6 -1
- camel/toolkits/video_analysis_toolkit.py +3 -0
- camel/toolkits/video_download_toolkit.py +3 -0
- camel/toolkits/wolfram_alpha_toolkit.py +51 -23
- camel/types/enums.py +27 -6
- camel/utils/__init__.py +2 -0
- camel/utils/commons.py +27 -0
- {camel_ai-0.2.59.dist-info → camel_ai-0.2.61.dist-info}/METADATA +17 -9
- {camel_ai-0.2.59.dist-info → camel_ai-0.2.61.dist-info}/RECORD +55 -51
- {camel_ai-0.2.59.dist-info → camel_ai-0.2.61.dist-info}/WHEEL +0 -0
- {camel_ai-0.2.59.dist-info → camel_ai-0.2.61.dist-info}/licenses/LICENSE +0 -0
|
@@ -12,11 +12,12 @@
|
|
|
12
12
|
# limitations under the License.
|
|
13
13
|
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
14
14
|
|
|
15
|
+
# Enables postponed evaluation of annotations (for string-based type hints)
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
15
18
|
import datetime
|
|
16
19
|
import io
|
|
17
|
-
import json
|
|
18
20
|
import os
|
|
19
|
-
import random
|
|
20
21
|
import re
|
|
21
22
|
import shutil
|
|
22
23
|
import time
|
|
@@ -25,21 +26,17 @@ from copy import deepcopy
|
|
|
25
26
|
from typing import (
|
|
26
27
|
TYPE_CHECKING,
|
|
27
28
|
Any,
|
|
28
|
-
BinaryIO,
|
|
29
29
|
Dict,
|
|
30
30
|
List,
|
|
31
31
|
Literal,
|
|
32
32
|
Optional,
|
|
33
33
|
Tuple,
|
|
34
|
-
TypedDict,
|
|
35
34
|
Union,
|
|
36
35
|
cast,
|
|
37
36
|
)
|
|
38
37
|
|
|
39
|
-
from PIL import Image
|
|
38
|
+
from PIL import Image
|
|
40
39
|
|
|
41
|
-
if TYPE_CHECKING:
|
|
42
|
-
from camel.agents import ChatAgent
|
|
43
40
|
from camel.logger import get_logger
|
|
44
41
|
from camel.messages import BaseMessage
|
|
45
42
|
from camel.models import BaseModelBackend, ModelFactory
|
|
@@ -53,85 +50,39 @@ from camel.utils import (
|
|
|
53
50
|
sanitize_filename,
|
|
54
51
|
)
|
|
55
52
|
|
|
56
|
-
|
|
53
|
+
# Import shared components from browser_toolkit_commons
|
|
54
|
+
from .browser_toolkit_commons import (
|
|
55
|
+
ACTION_WITH_FEEDBACK_LIST,
|
|
56
|
+
AVAILABLE_ACTIONS_PROMPT,
|
|
57
|
+
GET_FINAL_ANSWER_PROMPT_TEMPLATE,
|
|
58
|
+
OBSERVE_PROMPT_TEMPLATE,
|
|
59
|
+
PLANNING_AGENT_SYSTEM_PROMPT,
|
|
60
|
+
TASK_PLANNING_PROMPT_TEMPLATE,
|
|
61
|
+
TASK_REPLANNING_PROMPT_TEMPLATE,
|
|
62
|
+
WEB_AGENT_SYSTEM_PROMPT,
|
|
63
|
+
InteractiveRegion,
|
|
64
|
+
VisualViewport,
|
|
65
|
+
_add_set_of_mark,
|
|
66
|
+
_parse_json_output,
|
|
67
|
+
_reload_image,
|
|
68
|
+
interactive_region_from_dict,
|
|
69
|
+
visual_viewport_from_dict,
|
|
70
|
+
)
|
|
57
71
|
|
|
58
|
-
|
|
72
|
+
if TYPE_CHECKING:
|
|
73
|
+
from playwright.sync_api import (
|
|
74
|
+
Browser,
|
|
75
|
+
BrowserContext,
|
|
76
|
+
FloatRect,
|
|
77
|
+
Page,
|
|
78
|
+
Playwright,
|
|
79
|
+
)
|
|
59
80
|
|
|
81
|
+
from camel.agents import ChatAgent
|
|
82
|
+
|
|
83
|
+
logger = get_logger(__name__)
|
|
60
84
|
|
|
61
|
-
|
|
62
|
-
1. `fill_input_id(identifier: Union[str, int], text: str)`: Fill an input
|
|
63
|
-
field (e.g. search box) with the given text and press Enter.
|
|
64
|
-
2. `click_id(identifier: Union[str, int])`: Click an element with the given ID.
|
|
65
|
-
3. `hover_id(identifier: Union[str, int])`: Hover over an element with the
|
|
66
|
-
given ID.
|
|
67
|
-
4. `download_file_id(identifier: Union[str, int])`: Download a file with the
|
|
68
|
-
given ID. It returns the path to the downloaded file. If the file is
|
|
69
|
-
successfully downloaded, you can stop the simulation and report the path to
|
|
70
|
-
the downloaded file for further processing.
|
|
71
|
-
5. `scroll_to_bottom()`: Scroll to the bottom of the page.
|
|
72
|
-
6. `scroll_to_top()`: Scroll to the top of the page.
|
|
73
|
-
7. `scroll_up()`: Scroll up the page. It is suitable when you want to see the
|
|
74
|
-
elements above the current viewport.
|
|
75
|
-
8. `scroll_down()`: Scroll down the page. It is suitable when you want to see
|
|
76
|
-
the elements below the current viewport. If the webpage does not change, It
|
|
77
|
-
means that the webpage has scrolled to the bottom.
|
|
78
|
-
9. `back()`: Navigate back to the previous page. This is useful when you want
|
|
79
|
-
to go back to the previous page, as current page is not useful.
|
|
80
|
-
10. `stop()`: Stop the action process, because the task is completed or failed
|
|
81
|
-
(impossible to find the answer). In this situation, you should provide your
|
|
82
|
-
answer in your output.
|
|
83
|
-
11. `get_url()`: Get the current URL of the current page.
|
|
84
|
-
12. `find_text_on_page(search_text: str)`: Find the next given text on the
|
|
85
|
-
current whole page, and scroll the page to the targeted text. It is equivalent
|
|
86
|
-
to pressing Ctrl + F and searching for the text, and is powerful when you want
|
|
87
|
-
to fast-check whether the current page contains some specific text.
|
|
88
|
-
13. `visit_page(url: str)`: Go to the specific url page.
|
|
89
|
-
14. `click_blank_area()`: Click a blank area of the page to unfocus the
|
|
90
|
-
current element. It is useful when you have clicked an element but it cannot
|
|
91
|
-
unfocus itself (e.g. Menu bar) to automatically render the updated webpage.
|
|
92
|
-
15. `ask_question_about_video(question: str)`: Ask a question about the
|
|
93
|
-
current webpage which contains video, e.g. youtube websites.
|
|
94
|
-
"""
|
|
95
|
-
|
|
96
|
-
ACTION_WITH_FEEDBACK_LIST = [
|
|
97
|
-
'ask_question_about_video',
|
|
98
|
-
'download_file_id',
|
|
99
|
-
'find_text_on_page',
|
|
100
|
-
]
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
# Code from magentic-one
|
|
104
|
-
class DOMRectangle(TypedDict):
|
|
105
|
-
x: Union[int, float]
|
|
106
|
-
y: Union[int, float]
|
|
107
|
-
width: Union[int, float]
|
|
108
|
-
height: Union[int, float]
|
|
109
|
-
top: Union[int, float]
|
|
110
|
-
right: Union[int, float]
|
|
111
|
-
bottom: Union[int, float]
|
|
112
|
-
left: Union[int, float]
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
class VisualViewport(TypedDict):
|
|
116
|
-
height: Union[int, float]
|
|
117
|
-
width: Union[int, float]
|
|
118
|
-
offsetLeft: Union[int, float]
|
|
119
|
-
offsetTop: Union[int, float]
|
|
120
|
-
pageLeft: Union[int, float]
|
|
121
|
-
pageTop: Union[int, float]
|
|
122
|
-
scale: Union[int, float]
|
|
123
|
-
clientWidth: Union[int, float]
|
|
124
|
-
clientHeight: Union[int, float]
|
|
125
|
-
scrollWidth: Union[int, float]
|
|
126
|
-
scrollHeight: Union[int, float]
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
class InteractiveRegion(TypedDict):
|
|
130
|
-
tag_name: str
|
|
131
|
-
role: str
|
|
132
|
-
aria_name: str
|
|
133
|
-
v_scrollable: bool
|
|
134
|
-
rects: List[DOMRectangle]
|
|
85
|
+
TOP_NO_LABEL_ZONE = 20
|
|
135
86
|
|
|
136
87
|
|
|
137
88
|
def _get_str(d: Any, k: str) -> str:
|
|
@@ -167,270 +118,6 @@ def _get_bool(d: Any, k: str) -> bool:
|
|
|
167
118
|
)
|
|
168
119
|
|
|
169
120
|
|
|
170
|
-
def _parse_json_output(text: str) -> Dict[str, Any]:
|
|
171
|
-
r"""Extract JSON output from a string."""
|
|
172
|
-
|
|
173
|
-
markdown_pattern = r'```(?:json)?\s*(.*?)\s*```'
|
|
174
|
-
markdown_match = re.search(markdown_pattern, text, re.DOTALL)
|
|
175
|
-
if markdown_match:
|
|
176
|
-
text = markdown_match.group(1).strip()
|
|
177
|
-
|
|
178
|
-
triple_quotes_pattern = r'"""(?:json)?\s*(.*?)\s*"""'
|
|
179
|
-
triple_quotes_match = re.search(triple_quotes_pattern, text, re.DOTALL)
|
|
180
|
-
if triple_quotes_match:
|
|
181
|
-
text = triple_quotes_match.group(1).strip()
|
|
182
|
-
|
|
183
|
-
try:
|
|
184
|
-
return json.loads(text)
|
|
185
|
-
except json.JSONDecodeError:
|
|
186
|
-
try:
|
|
187
|
-
fixed_text = re.sub(
|
|
188
|
-
r'`([^`]*?)`(?=\s*[:,\[\]{}]|$)', r'"\1"', text
|
|
189
|
-
)
|
|
190
|
-
return json.loads(fixed_text)
|
|
191
|
-
except json.JSONDecodeError:
|
|
192
|
-
result = {}
|
|
193
|
-
try:
|
|
194
|
-
bool_pattern = r'"(\w+)"\s*:\s*(true|false)'
|
|
195
|
-
for match in re.finditer(bool_pattern, text, re.IGNORECASE):
|
|
196
|
-
key, value = match.groups()
|
|
197
|
-
result[key] = value.lower() == "true"
|
|
198
|
-
|
|
199
|
-
str_pattern = r'"(\w+)"\s*:\s*"([^"]*)"'
|
|
200
|
-
for match in re.finditer(str_pattern, text):
|
|
201
|
-
key, value = match.groups()
|
|
202
|
-
result[key] = value
|
|
203
|
-
|
|
204
|
-
num_pattern = r'"(\w+)"\s*:\s*(-?\d+(?:\.\d+)?)'
|
|
205
|
-
for match in re.finditer(num_pattern, text):
|
|
206
|
-
key, value = match.groups()
|
|
207
|
-
try:
|
|
208
|
-
result[key] = int(value)
|
|
209
|
-
except ValueError:
|
|
210
|
-
result[key] = float(value)
|
|
211
|
-
|
|
212
|
-
empty_str_pattern = r'"(\w+)"\s*:\s*""'
|
|
213
|
-
for match in re.finditer(empty_str_pattern, text):
|
|
214
|
-
key = match.group(1)
|
|
215
|
-
result[key] = ""
|
|
216
|
-
|
|
217
|
-
if result:
|
|
218
|
-
return result
|
|
219
|
-
|
|
220
|
-
logger.warning(f"Failed to parse JSON output: {text}")
|
|
221
|
-
return {}
|
|
222
|
-
except Exception as e:
|
|
223
|
-
logger.warning(f"Error while extracting fields from JSON: {e}")
|
|
224
|
-
return {}
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
def _reload_image(image: Image.Image) -> Image.Image:
|
|
228
|
-
buffer = io.BytesIO()
|
|
229
|
-
image.save(buffer, format="PNG")
|
|
230
|
-
buffer.seek(0)
|
|
231
|
-
return Image.open(buffer)
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
def dom_rectangle_from_dict(rect: Dict[str, Any]) -> DOMRectangle:
|
|
235
|
-
r"""Create a DOMRectangle object from a dictionary."""
|
|
236
|
-
return DOMRectangle(
|
|
237
|
-
x=_get_number(rect, "x"),
|
|
238
|
-
y=_get_number(rect, "y"),
|
|
239
|
-
width=_get_number(rect, "width"),
|
|
240
|
-
height=_get_number(rect, "height"),
|
|
241
|
-
top=_get_number(rect, "top"),
|
|
242
|
-
right=_get_number(rect, "right"),
|
|
243
|
-
bottom=_get_number(rect, "bottom"),
|
|
244
|
-
left=_get_number(rect, "left"),
|
|
245
|
-
)
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
def interactive_region_from_dict(region: Dict[str, Any]) -> InteractiveRegion:
|
|
249
|
-
r"""Create an :class:`InteractiveRegion` object from a dictionary."""
|
|
250
|
-
typed_rects: List[DOMRectangle] = []
|
|
251
|
-
for rect in region["rects"]:
|
|
252
|
-
typed_rects.append(dom_rectangle_from_dict(rect))
|
|
253
|
-
|
|
254
|
-
return InteractiveRegion(
|
|
255
|
-
tag_name=_get_str(region, "tag_name"),
|
|
256
|
-
role=_get_str(region, "role"),
|
|
257
|
-
aria_name=_get_str(region, "aria-name"),
|
|
258
|
-
v_scrollable=_get_bool(region, "v-scrollable"),
|
|
259
|
-
rects=typed_rects,
|
|
260
|
-
)
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
def visual_viewport_from_dict(viewport: Dict[str, Any]) -> VisualViewport:
|
|
264
|
-
r"""Create a :class:`VisualViewport` object from a dictionary."""
|
|
265
|
-
return VisualViewport(
|
|
266
|
-
height=_get_number(viewport, "height"),
|
|
267
|
-
width=_get_number(viewport, "width"),
|
|
268
|
-
offsetLeft=_get_number(viewport, "offsetLeft"),
|
|
269
|
-
offsetTop=_get_number(viewport, "offsetTop"),
|
|
270
|
-
pageLeft=_get_number(viewport, "pageLeft"),
|
|
271
|
-
pageTop=_get_number(viewport, "pageTop"),
|
|
272
|
-
scale=_get_number(viewport, "scale"),
|
|
273
|
-
clientWidth=_get_number(viewport, "clientWidth"),
|
|
274
|
-
clientHeight=_get_number(viewport, "clientHeight"),
|
|
275
|
-
scrollWidth=_get_number(viewport, "scrollWidth"),
|
|
276
|
-
scrollHeight=_get_number(viewport, "scrollHeight"),
|
|
277
|
-
)
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
def add_set_of_mark(
|
|
281
|
-
screenshot: Union[bytes, Image.Image, io.BufferedIOBase],
|
|
282
|
-
ROIs: Dict[str, InteractiveRegion],
|
|
283
|
-
) -> Tuple[Image.Image, List[str], List[str], List[str]]:
|
|
284
|
-
if isinstance(screenshot, Image.Image):
|
|
285
|
-
return _add_set_of_mark(screenshot, ROIs)
|
|
286
|
-
|
|
287
|
-
if isinstance(screenshot, bytes):
|
|
288
|
-
screenshot = io.BytesIO(screenshot)
|
|
289
|
-
|
|
290
|
-
image = Image.open(cast(BinaryIO, screenshot))
|
|
291
|
-
comp, visible_rects, rects_above, rects_below = _add_set_of_mark(
|
|
292
|
-
image, ROIs
|
|
293
|
-
)
|
|
294
|
-
image.close()
|
|
295
|
-
return comp, visible_rects, rects_above, rects_below
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
def _add_set_of_mark(
|
|
299
|
-
screenshot: Image.Image, ROIs: Dict[str, InteractiveRegion]
|
|
300
|
-
) -> Tuple[Image.Image, List[str], List[str], List[str]]:
|
|
301
|
-
r"""Add a set of marks to the screenshot.
|
|
302
|
-
|
|
303
|
-
Args:
|
|
304
|
-
screenshot (Image.Image): The screenshot to add marks to.
|
|
305
|
-
ROIs (Dict[str, InteractiveRegion]): The regions to add marks to.
|
|
306
|
-
|
|
307
|
-
Returns:
|
|
308
|
-
Tuple[Image.Image, List[str], List[str], List[str]]: A tuple
|
|
309
|
-
containing the screenshot with marked ROIs, ROIs fully within the
|
|
310
|
-
images, ROIs located above the visible area, and ROIs located below
|
|
311
|
-
the visible area.
|
|
312
|
-
"""
|
|
313
|
-
visible_rects: List[str] = list()
|
|
314
|
-
rects_above: List[str] = list() # Scroll up to see
|
|
315
|
-
rects_below: List[str] = list() # Scroll down to see
|
|
316
|
-
|
|
317
|
-
fnt = ImageFont.load_default(14)
|
|
318
|
-
base = screenshot.convert("L").convert("RGBA")
|
|
319
|
-
overlay = Image.new("RGBA", base.size)
|
|
320
|
-
|
|
321
|
-
draw = ImageDraw.Draw(overlay)
|
|
322
|
-
for r in ROIs:
|
|
323
|
-
for rect in ROIs[r]["rects"]:
|
|
324
|
-
# Empty rectangles
|
|
325
|
-
if not rect or rect["width"] == 0 or rect["height"] == 0:
|
|
326
|
-
continue
|
|
327
|
-
|
|
328
|
-
# TODO: add scroll left and right?
|
|
329
|
-
horizontal_center = (rect["right"] + rect["left"]) / 2.0
|
|
330
|
-
vertical_center = (rect["top"] + rect["bottom"]) / 2.0
|
|
331
|
-
is_within_horizon = 0 <= horizontal_center < base.size[0]
|
|
332
|
-
is_above_viewport = vertical_center < 0
|
|
333
|
-
is_below_viewport = vertical_center >= base.size[1]
|
|
334
|
-
|
|
335
|
-
if is_within_horizon:
|
|
336
|
-
if is_above_viewport:
|
|
337
|
-
rects_above.append(r)
|
|
338
|
-
elif is_below_viewport:
|
|
339
|
-
rects_below.append(r)
|
|
340
|
-
else: # Fully visible
|
|
341
|
-
visible_rects.append(r)
|
|
342
|
-
_draw_roi(draw, int(r), fnt, rect)
|
|
343
|
-
|
|
344
|
-
comp = Image.alpha_composite(base, overlay)
|
|
345
|
-
overlay.close()
|
|
346
|
-
return comp, visible_rects, rects_above, rects_below
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
def _draw_roi(
|
|
350
|
-
draw: ImageDraw.ImageDraw,
|
|
351
|
-
idx: int,
|
|
352
|
-
font: ImageFont.FreeTypeFont | ImageFont.ImageFont,
|
|
353
|
-
rect: DOMRectangle,
|
|
354
|
-
) -> None:
|
|
355
|
-
r"""Draw a ROI on the image.
|
|
356
|
-
|
|
357
|
-
Args:
|
|
358
|
-
draw (ImageDraw.ImageDraw): The draw object.
|
|
359
|
-
idx (int): The index of the ROI.
|
|
360
|
-
font (ImageFont.FreeTypeFont | ImageFont.ImageFont): The font.
|
|
361
|
-
rect (DOMRectangle): The DOM rectangle.
|
|
362
|
-
"""
|
|
363
|
-
color = _get_random_color(idx)
|
|
364
|
-
text_color = _get_text_color(color)
|
|
365
|
-
|
|
366
|
-
roi = ((rect["left"], rect["top"]), (rect["right"], rect["bottom"]))
|
|
367
|
-
|
|
368
|
-
label_location = (rect["right"], rect["top"])
|
|
369
|
-
label_anchor = "rb"
|
|
370
|
-
|
|
371
|
-
if label_location[1] <= TOP_NO_LABEL_ZONE:
|
|
372
|
-
label_location = (rect["right"], rect["bottom"])
|
|
373
|
-
label_anchor = "rt"
|
|
374
|
-
|
|
375
|
-
draw.rectangle(
|
|
376
|
-
roi, outline=color, fill=(color[0], color[1], color[2], 48), width=2
|
|
377
|
-
)
|
|
378
|
-
|
|
379
|
-
bbox = draw.textbbox(
|
|
380
|
-
label_location,
|
|
381
|
-
str(idx),
|
|
382
|
-
font=font,
|
|
383
|
-
anchor=label_anchor,
|
|
384
|
-
align="center",
|
|
385
|
-
)
|
|
386
|
-
bbox = (bbox[0] - 3, bbox[1] - 3, bbox[2] + 3, bbox[3] + 3)
|
|
387
|
-
draw.rectangle(bbox, fill=color)
|
|
388
|
-
|
|
389
|
-
draw.text(
|
|
390
|
-
label_location,
|
|
391
|
-
str(idx),
|
|
392
|
-
fill=text_color,
|
|
393
|
-
font=font,
|
|
394
|
-
anchor=label_anchor,
|
|
395
|
-
align="center",
|
|
396
|
-
)
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
def _get_text_color(
|
|
400
|
-
bg_color: Tuple[int, int, int, int],
|
|
401
|
-
) -> Tuple[int, int, int, int]:
|
|
402
|
-
r"""Determine the ideal text color (black or white) for contrast.
|
|
403
|
-
|
|
404
|
-
Args:
|
|
405
|
-
bg_color: The background color (R, G, B, A).
|
|
406
|
-
|
|
407
|
-
Returns:
|
|
408
|
-
A tuple representing black or white color for text.
|
|
409
|
-
"""
|
|
410
|
-
luminance = bg_color[0] * 0.3 + bg_color[1] * 0.59 + bg_color[2] * 0.11
|
|
411
|
-
return (0, 0, 0, 255) if luminance > 120 else (255, 255, 255, 255)
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
def _get_random_color(identifier: int) -> Tuple[int, int, int, int]:
|
|
415
|
-
r"""Generate a consistent random RGBA color based on the identifier.
|
|
416
|
-
|
|
417
|
-
Args:
|
|
418
|
-
identifier: The ID used as a seed to ensure color consistency.
|
|
419
|
-
|
|
420
|
-
Returns:
|
|
421
|
-
A tuple representing (R, G, B, A) values.
|
|
422
|
-
"""
|
|
423
|
-
rnd = random.Random(int(identifier))
|
|
424
|
-
r = rnd.randint(0, 255)
|
|
425
|
-
g = rnd.randint(125, 255)
|
|
426
|
-
b = rnd.randint(0, 50)
|
|
427
|
-
color = [r, g, b]
|
|
428
|
-
# TODO: check why shuffle is needed?
|
|
429
|
-
rnd.shuffle(color)
|
|
430
|
-
color.append(255)
|
|
431
|
-
return cast(Tuple[int, int, int, int], tuple(color))
|
|
432
|
-
|
|
433
|
-
|
|
434
121
|
class BaseBrowser:
|
|
435
122
|
def __init__(
|
|
436
123
|
self,
|
|
@@ -449,7 +136,8 @@ class BaseBrowser:
|
|
|
449
136
|
"chromium".
|
|
450
137
|
cookie_json_path (Optional[str]): Path to a JSON file containing
|
|
451
138
|
authentication cookies and browser storage state. If provided
|
|
452
|
-
and the file exists, the browser will load this state to
|
|
139
|
+
and the file exists, the browser will load this state to
|
|
140
|
+
maintain
|
|
453
141
|
authenticated sessions without requiring manual login.
|
|
454
142
|
|
|
455
143
|
Returns:
|
|
@@ -459,12 +147,14 @@ class BaseBrowser:
|
|
|
459
147
|
sync_playwright,
|
|
460
148
|
)
|
|
461
149
|
|
|
462
|
-
self.history:
|
|
150
|
+
self.history: List[Any] = []
|
|
463
151
|
self.headless = headless
|
|
464
152
|
self.channel = channel
|
|
465
153
|
self._ensure_browser_installed()
|
|
466
|
-
self.playwright = sync_playwright().start()
|
|
467
|
-
self.page_history:
|
|
154
|
+
self.playwright: Playwright = sync_playwright().start()
|
|
155
|
+
self.page_history: List[
|
|
156
|
+
str
|
|
157
|
+
] = [] # stores the history of visited pages
|
|
468
158
|
self.cookie_json_path = cookie_json_path
|
|
469
159
|
|
|
470
160
|
# Set the cache directory
|
|
@@ -483,10 +173,18 @@ class BaseBrowser:
|
|
|
483
173
|
raise FileNotFoundError(
|
|
484
174
|
f"Page script file not found at path: {page_script_path}"
|
|
485
175
|
)
|
|
176
|
+
self.browser: Optional[Browser] = None
|
|
177
|
+
self.context: Optional[BrowserContext] = None
|
|
178
|
+
self.page: Optional[Page] = None
|
|
179
|
+
self.page_url: Optional[str] = None
|
|
180
|
+
self.web_agent_model: Optional[BaseModelBackend] = (
|
|
181
|
+
None # Added for type hinting
|
|
182
|
+
)
|
|
486
183
|
|
|
487
184
|
def init(self) -> None:
|
|
488
185
|
r"""Initialize the browser."""
|
|
489
186
|
# Launch the browser, if headless is False, the browser will display
|
|
187
|
+
assert self.playwright is not None
|
|
490
188
|
self.browser = self.playwright.chromium.launch(
|
|
491
189
|
headless=self.headless, channel=self.channel
|
|
492
190
|
)
|
|
@@ -494,6 +192,7 @@ class BaseBrowser:
|
|
|
494
192
|
# Check if cookie file exists before using it to maintain
|
|
495
193
|
# authenticated sessions. This prevents errors when the cookie file
|
|
496
194
|
# doesn't exist
|
|
195
|
+
assert self.browser is not None
|
|
497
196
|
if self.cookie_json_path and os.path.exists(self.cookie_json_path):
|
|
498
197
|
self.context = self.browser.new_context(
|
|
499
198
|
accept_downloads=True, storage_state=self.cookie_json_path
|
|
@@ -503,6 +202,7 @@ class BaseBrowser:
|
|
|
503
202
|
accept_downloads=True,
|
|
504
203
|
)
|
|
505
204
|
# Create a new page
|
|
205
|
+
assert self.context is not None
|
|
506
206
|
self.page = self.context.new_page()
|
|
507
207
|
|
|
508
208
|
def clean_cache(self) -> None:
|
|
@@ -513,7 +213,7 @@ class BaseBrowser:
|
|
|
513
213
|
def _wait_for_load(self, timeout: int = 20) -> None:
|
|
514
214
|
r"""Wait for a certain amount of time for the page to load."""
|
|
515
215
|
timeout_ms = timeout * 1000
|
|
516
|
-
|
|
216
|
+
assert self.page is not None
|
|
517
217
|
self.page.wait_for_load_state("load", timeout=timeout_ms)
|
|
518
218
|
|
|
519
219
|
# TODO: check if this is needed
|
|
@@ -521,13 +221,14 @@ class BaseBrowser:
|
|
|
521
221
|
|
|
522
222
|
def click_blank_area(self) -> None:
|
|
523
223
|
r"""Click a blank area of the page to unfocus the current element."""
|
|
224
|
+
assert self.page is not None
|
|
524
225
|
self.page.mouse.click(0, 0)
|
|
525
226
|
self._wait_for_load()
|
|
526
227
|
|
|
527
228
|
@retry_on_error()
|
|
528
229
|
def visit_page(self, url: str) -> None:
|
|
529
230
|
r"""Visit a page with the given URL."""
|
|
530
|
-
|
|
231
|
+
assert self.page is not None
|
|
531
232
|
self.page.goto(url)
|
|
532
233
|
self._wait_for_load()
|
|
533
234
|
self.page_url = url
|
|
@@ -544,7 +245,8 @@ class BaseBrowser:
|
|
|
544
245
|
"""
|
|
545
246
|
current_url = self.get_url()
|
|
546
247
|
|
|
547
|
-
# Confirm with user before proceeding due to potential slow
|
|
248
|
+
# Confirm with user before proceeding due to potential slow
|
|
249
|
+
# processing time
|
|
548
250
|
confirmation_message = (
|
|
549
251
|
f"Do you want to analyze the video on the current "
|
|
550
252
|
f"page({current_url})? This operation may take a long time.(y/n): "
|
|
@@ -555,7 +257,10 @@ class BaseBrowser:
|
|
|
555
257
|
return "User cancelled the video analysis."
|
|
556
258
|
|
|
557
259
|
model = None
|
|
558
|
-
if
|
|
260
|
+
if (
|
|
261
|
+
hasattr(self, 'web_agent_model')
|
|
262
|
+
and self.web_agent_model is not None
|
|
263
|
+
):
|
|
559
264
|
model = self.web_agent_model
|
|
560
265
|
|
|
561
266
|
video_analyzer = VideoAnalysisToolkit(model=model)
|
|
@@ -577,7 +282,7 @@ class BaseBrowser:
|
|
|
577
282
|
image and the path to the image file if saved, otherwise
|
|
578
283
|
:obj:`None`.
|
|
579
284
|
"""
|
|
580
|
-
|
|
285
|
+
assert self.page is not None
|
|
581
286
|
image_data = self.page.screenshot(timeout=60000)
|
|
582
287
|
image = Image.open(io.BytesIO(image_data))
|
|
583
288
|
|
|
@@ -585,6 +290,7 @@ class BaseBrowser:
|
|
|
585
290
|
if save_image:
|
|
586
291
|
# Get url name to form a file name
|
|
587
292
|
# Use urlparser for a safer extraction the url name
|
|
293
|
+
assert self.page_url is not None
|
|
588
294
|
parsed_url = urllib.parse.urlparse(self.page_url)
|
|
589
295
|
# Max length is set to 241 as there are 10 characters for the
|
|
590
296
|
# timestamp and 4 characters for the file extension:
|
|
@@ -612,17 +318,24 @@ class BaseBrowser:
|
|
|
612
318
|
Returns:
|
|
613
319
|
List[str]: A list of paths to the screenshot files.
|
|
614
320
|
"""
|
|
615
|
-
screenshots = []
|
|
616
|
-
|
|
321
|
+
screenshots: List[str] = [] # Ensure screenshots is typed
|
|
322
|
+
assert self.page is not None
|
|
323
|
+
scroll_height_eval = self.page.evaluate("document.body.scrollHeight")
|
|
324
|
+
scroll_height = cast(
|
|
325
|
+
float, scroll_height_eval
|
|
326
|
+
) # Ensure scroll_height is
|
|
327
|
+
# float
|
|
328
|
+
|
|
617
329
|
assert self.page.viewport_size is not None
|
|
618
330
|
viewport_height = self.page.viewport_size["height"]
|
|
619
|
-
|
|
620
|
-
|
|
331
|
+
current_scroll_eval = self.page.evaluate("window.scrollY")
|
|
332
|
+
current_scroll = cast(float, current_scroll_eval)
|
|
333
|
+
# screenshot_index = 1 # This variable is not used
|
|
621
334
|
|
|
622
335
|
max_height = scroll_height - viewport_height
|
|
623
336
|
scroll_step = int(viewport_height * scroll_ratio)
|
|
624
337
|
|
|
625
|
-
last_height = 0
|
|
338
|
+
last_height = 0.0 # Initialize last_height as float
|
|
626
339
|
|
|
627
340
|
while True:
|
|
628
341
|
logger.debug(
|
|
@@ -631,19 +344,22 @@ class BaseBrowser:
|
|
|
631
344
|
)
|
|
632
345
|
|
|
633
346
|
_, file_path = self.get_screenshot(save_image=True)
|
|
634
|
-
|
|
347
|
+
if file_path is not None: # Ensure file_path is not None before
|
|
348
|
+
# appending
|
|
349
|
+
screenshots.append(file_path)
|
|
635
350
|
|
|
636
351
|
self.page.evaluate(f"window.scrollBy(0, {scroll_step})")
|
|
637
352
|
# Allow time for content to load
|
|
638
353
|
time.sleep(0.5)
|
|
639
354
|
|
|
640
|
-
|
|
355
|
+
current_scroll_eval = self.page.evaluate("window.scrollY")
|
|
356
|
+
current_scroll = cast(float, current_scroll_eval)
|
|
641
357
|
# Break if there is no significant scroll
|
|
642
358
|
if abs(current_scroll - last_height) < viewport_height * 0.1:
|
|
643
359
|
break
|
|
644
360
|
|
|
645
361
|
last_height = current_scroll
|
|
646
|
-
screenshot_index += 1
|
|
362
|
+
# screenshot_index += 1 # This variable is not used
|
|
647
363
|
|
|
648
364
|
return screenshots
|
|
649
365
|
|
|
@@ -653,13 +369,17 @@ class BaseBrowser:
|
|
|
653
369
|
Returns:
|
|
654
370
|
VisualViewport: The visual viewport of the current page.
|
|
655
371
|
"""
|
|
372
|
+
assert self.page is not None
|
|
656
373
|
try:
|
|
657
374
|
self.page.evaluate(self.page_script)
|
|
658
375
|
except Exception as e:
|
|
659
376
|
logger.warning(f"Error evaluating page script: {e}")
|
|
660
377
|
|
|
378
|
+
visual_viewport_eval = self.page.evaluate(
|
|
379
|
+
"MultimodalWebSurfer.getVisualViewport();"
|
|
380
|
+
)
|
|
661
381
|
return visual_viewport_from_dict(
|
|
662
|
-
|
|
382
|
+
cast(Dict[str, Any], visual_viewport_eval)
|
|
663
383
|
)
|
|
664
384
|
|
|
665
385
|
def get_interactive_elements(self) -> Dict[str, InteractiveRegion]:
|
|
@@ -668,6 +388,7 @@ class BaseBrowser:
|
|
|
668
388
|
Returns:
|
|
669
389
|
Dict[str, InteractiveRegion]: A dictionary of interactive elements.
|
|
670
390
|
"""
|
|
391
|
+
assert self.page is not None
|
|
671
392
|
try:
|
|
672
393
|
self.page.evaluate(self.page_script)
|
|
673
394
|
except Exception as e:
|
|
@@ -682,7 +403,7 @@ class BaseBrowser:
|
|
|
682
403
|
for k in result:
|
|
683
404
|
typed_results[k] = interactive_region_from_dict(result[k])
|
|
684
405
|
|
|
685
|
-
return typed_results
|
|
406
|
+
return typed_results
|
|
686
407
|
|
|
687
408
|
def get_som_screenshot(
|
|
688
409
|
self,
|
|
@@ -696,7 +417,8 @@ class BaseBrowser:
|
|
|
696
417
|
directory.
|
|
697
418
|
|
|
698
419
|
Returns:
|
|
699
|
-
Tuple[Image.Image, Union[str, None]]: A tuple containing the
|
|
420
|
+
Tuple[Image.Image, Union[str, None]]: A tuple containing the
|
|
421
|
+
screenshot image
|
|
700
422
|
and an optional path to the image file if saved, otherwise
|
|
701
423
|
:obj:`None`.
|
|
702
424
|
"""
|
|
@@ -706,11 +428,12 @@ class BaseBrowser:
|
|
|
706
428
|
rects = self.get_interactive_elements()
|
|
707
429
|
|
|
708
430
|
file_path: str | None = None
|
|
709
|
-
comp, _, _, _ =
|
|
431
|
+
comp, _, _, _ = _add_set_of_mark(
|
|
710
432
|
screenshot,
|
|
711
|
-
rects,
|
|
433
|
+
rects,
|
|
712
434
|
)
|
|
713
435
|
if save_image:
|
|
436
|
+
assert self.page_url is not None
|
|
714
437
|
parsed_url = urllib.parse.urlparse(self.page_url)
|
|
715
438
|
# Max length is set to 241 as there are 10 characters for the
|
|
716
439
|
# timestamp and 4 characters for the file extension:
|
|
@@ -727,25 +450,30 @@ class BaseBrowser:
|
|
|
727
450
|
|
|
728
451
|
def scroll_up(self) -> None:
|
|
729
452
|
r"""Scroll up the page."""
|
|
453
|
+
assert self.page is not None
|
|
730
454
|
self.page.keyboard.press("PageUp")
|
|
731
455
|
|
|
732
456
|
def scroll_down(self) -> None:
|
|
733
457
|
r"""Scroll down the page."""
|
|
458
|
+
assert self.page is not None
|
|
734
459
|
self.page.keyboard.press("PageDown")
|
|
735
460
|
|
|
736
461
|
def get_url(self) -> str:
|
|
737
462
|
r"""Get the URL of the current page."""
|
|
463
|
+
assert self.page is not None
|
|
738
464
|
return self.page.url
|
|
739
465
|
|
|
740
466
|
def click_id(self, identifier: Union[str, int]) -> None:
|
|
741
467
|
r"""Click an element with the given identifier."""
|
|
468
|
+
assert self.page is not None
|
|
742
469
|
if isinstance(identifier, int):
|
|
743
470
|
identifier = str(identifier)
|
|
744
471
|
target = self.page.locator(f"[__elementId='{identifier}']")
|
|
745
472
|
|
|
746
473
|
try:
|
|
747
474
|
target.wait_for(timeout=5000)
|
|
748
|
-
except
|
|
475
|
+
except Exception as e: # Consider using playwright specific
|
|
476
|
+
# TimeoutError
|
|
749
477
|
logger.debug(f"Error during click operation: {e}")
|
|
750
478
|
raise ValueError("No such element.") from None
|
|
751
479
|
|
|
@@ -754,7 +482,13 @@ class BaseBrowser:
|
|
|
754
482
|
new_page = None
|
|
755
483
|
try:
|
|
756
484
|
with self.page.expect_event("popup", timeout=1000) as page_info:
|
|
757
|
-
box
|
|
485
|
+
box: Optional[FloatRect] = target.bounding_box()
|
|
486
|
+
if box is None:
|
|
487
|
+
logger.warning(
|
|
488
|
+
f"Bounding box not found for element '{identifier}'. "
|
|
489
|
+
f"Cannot click."
|
|
490
|
+
)
|
|
491
|
+
return
|
|
758
492
|
self.page.mouse.click(
|
|
759
493
|
box["x"] + box["width"] / 2, box["y"] + box["height"] / 2
|
|
760
494
|
)
|
|
@@ -765,7 +499,8 @@ class BaseBrowser:
|
|
|
765
499
|
self.page_history.append(deepcopy(self.page.url))
|
|
766
500
|
self.page = new_page
|
|
767
501
|
|
|
768
|
-
except
|
|
502
|
+
except Exception as e: # Consider using playwright specific
|
|
503
|
+
# TimeoutError
|
|
769
504
|
logger.debug(f"Error during click operation: {e}")
|
|
770
505
|
pass
|
|
771
506
|
|
|
@@ -773,6 +508,7 @@ class BaseBrowser:
|
|
|
773
508
|
|
|
774
509
|
def extract_url_content(self) -> str:
|
|
775
510
|
r"""Extract the content of the current page."""
|
|
511
|
+
assert self.page is not None
|
|
776
512
|
content = self.page.content()
|
|
777
513
|
return content
|
|
778
514
|
|
|
@@ -781,17 +517,17 @@ class BaseBrowser:
|
|
|
781
517
|
|
|
782
518
|
Args:
|
|
783
519
|
identifier (str): The identifier of the file to download.
|
|
784
|
-
file_path (str): The path to save the downloaded file.
|
|
785
520
|
|
|
786
521
|
Returns:
|
|
787
522
|
str: The result of the action.
|
|
788
523
|
"""
|
|
789
|
-
|
|
524
|
+
assert self.page is not None
|
|
790
525
|
if isinstance(identifier, int):
|
|
791
526
|
identifier = str(identifier)
|
|
792
527
|
try:
|
|
793
528
|
target = self.page.locator(f"[__elementId='{identifier}']")
|
|
794
|
-
except
|
|
529
|
+
except Exception as e: # Consider using playwright specific
|
|
530
|
+
# TimeoutError
|
|
795
531
|
logger.debug(f"Error during download operation: {e}")
|
|
796
532
|
logger.warning(
|
|
797
533
|
f"Element with identifier '{identifier}' not found."
|
|
@@ -800,7 +536,7 @@ class BaseBrowser:
|
|
|
800
536
|
|
|
801
537
|
target.scroll_into_view_if_needed()
|
|
802
538
|
|
|
803
|
-
|
|
539
|
+
file_path_val = os.path.join(self.cache_dir)
|
|
804
540
|
self._wait_for_load()
|
|
805
541
|
|
|
806
542
|
try:
|
|
@@ -809,12 +545,13 @@ class BaseBrowser:
|
|
|
809
545
|
download = download_info.value
|
|
810
546
|
file_name = download.suggested_filename
|
|
811
547
|
|
|
812
|
-
|
|
813
|
-
download.save_as(
|
|
548
|
+
file_path_val = os.path.join(file_path_val, file_name)
|
|
549
|
+
download.save_as(file_path_val)
|
|
814
550
|
|
|
815
|
-
return f"Downloaded file to path '{
|
|
551
|
+
return f"Downloaded file to path '{file_path_val}'."
|
|
816
552
|
|
|
817
|
-
except
|
|
553
|
+
except Exception as e: # Consider using playwright specific
|
|
554
|
+
# TimeoutError
|
|
818
555
|
logger.debug(f"Error during download operation: {e}")
|
|
819
556
|
return f"Failed to download file with identifier '{identifier}'."
|
|
820
557
|
|
|
@@ -828,12 +565,14 @@ class BaseBrowser:
|
|
|
828
565
|
Returns:
|
|
829
566
|
str: The result of the action.
|
|
830
567
|
"""
|
|
568
|
+
assert self.page is not None
|
|
831
569
|
if isinstance(identifier, int):
|
|
832
570
|
identifier = str(identifier)
|
|
833
571
|
|
|
834
572
|
try:
|
|
835
573
|
target = self.page.locator(f"[__elementId='{identifier}']")
|
|
836
|
-
except
|
|
574
|
+
except Exception as e: # Consider using playwright specific
|
|
575
|
+
# TimeoutError
|
|
837
576
|
logger.debug(f"Error during fill operation: {e}")
|
|
838
577
|
logger.warning(
|
|
839
578
|
f"Element with identifier '{identifier}' not found."
|
|
@@ -844,7 +583,8 @@ class BaseBrowser:
|
|
|
844
583
|
target.focus()
|
|
845
584
|
try:
|
|
846
585
|
target.fill(text)
|
|
847
|
-
except
|
|
586
|
+
except Exception as e: # Consider using playwright specific
|
|
587
|
+
# TimeoutError
|
|
848
588
|
logger.debug(f"Error during fill operation: {e}")
|
|
849
589
|
target.press_sequentially(text)
|
|
850
590
|
|
|
@@ -856,11 +596,13 @@ class BaseBrowser:
|
|
|
856
596
|
)
|
|
857
597
|
|
|
858
598
|
def scroll_to_bottom(self) -> str:
|
|
599
|
+
assert self.page is not None
|
|
859
600
|
self.page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
|
|
860
601
|
self._wait_for_load()
|
|
861
602
|
return "Scrolled to the bottom of the page."
|
|
862
603
|
|
|
863
604
|
def scroll_to_top(self) -> str:
|
|
605
|
+
assert self.page is not None
|
|
864
606
|
self.page.evaluate("window.scrollTo(0, 0);")
|
|
865
607
|
self._wait_for_load()
|
|
866
608
|
return "Scrolled to the top of the page."
|
|
@@ -874,11 +616,13 @@ class BaseBrowser:
|
|
|
874
616
|
Returns:
|
|
875
617
|
str: The result of the action.
|
|
876
618
|
"""
|
|
619
|
+
assert self.page is not None
|
|
877
620
|
if isinstance(identifier, int):
|
|
878
621
|
identifier = str(identifier)
|
|
879
622
|
try:
|
|
880
623
|
target = self.page.locator(f"[__elementId='{identifier}']")
|
|
881
|
-
except
|
|
624
|
+
except Exception as e: # Consider using playwright specific
|
|
625
|
+
# TimeoutError
|
|
882
626
|
logger.debug(f"Error during hover operation: {e}")
|
|
883
627
|
logger.warning(
|
|
884
628
|
f"Element with identifier '{identifier}' not found."
|
|
@@ -896,15 +640,18 @@ class BaseBrowser:
|
|
|
896
640
|
the text.
|
|
897
641
|
"""
|
|
898
642
|
# ruff: noqa: E501
|
|
643
|
+
assert self.page is not None
|
|
899
644
|
script = f"""
|
|
900
|
-
(function() {{
|
|
645
|
+
(function() {{
|
|
901
646
|
let text = "{search_text}";
|
|
902
647
|
let found = window.find(text);
|
|
903
648
|
if (!found) {{
|
|
904
|
-
let elements = document.querySelectorAll("*:not(script):not(
|
|
649
|
+
let elements = document.querySelectorAll("*:not(script):not(
|
|
650
|
+
style)");
|
|
905
651
|
for (let el of elements) {{
|
|
906
652
|
if (el.innerText && el.innerText.includes(text)) {{
|
|
907
|
-
el.scrollIntoView({{behavior: "smooth", block:
|
|
653
|
+
el.scrollIntoView({{behavior: "smooth", block:
|
|
654
|
+
"center"}});
|
|
908
655
|
el.style.backgroundColor = "yellow";
|
|
909
656
|
el.style.border = '2px solid red';
|
|
910
657
|
return true;
|
|
@@ -915,7 +662,8 @@ class BaseBrowser:
|
|
|
915
662
|
return true;
|
|
916
663
|
}})();
|
|
917
664
|
"""
|
|
918
|
-
|
|
665
|
+
found_eval = self.page.evaluate(script)
|
|
666
|
+
found = cast(bool, found_eval) # Ensure found is bool
|
|
919
667
|
self._wait_for_load()
|
|
920
668
|
if found:
|
|
921
669
|
return f"Found text '{search_text}' on the page."
|
|
@@ -924,7 +672,7 @@ class BaseBrowser:
|
|
|
924
672
|
|
|
925
673
|
def back(self):
|
|
926
674
|
r"""Navigate back to the previous page."""
|
|
927
|
-
|
|
675
|
+
assert self.page is not None
|
|
928
676
|
page_url_before = self.page.url
|
|
929
677
|
self.page.go_back()
|
|
930
678
|
|
|
@@ -942,15 +690,21 @@ class BaseBrowser:
|
|
|
942
690
|
self._wait_for_load()
|
|
943
691
|
|
|
944
692
|
def close(self):
|
|
693
|
+
assert self.browser is not None
|
|
945
694
|
self.browser.close()
|
|
695
|
+
if self.playwright:
|
|
696
|
+
self.playwright.stop() # Stop playwright instance
|
|
946
697
|
|
|
947
698
|
# ruff: noqa: E501
|
|
948
699
|
def show_interactive_elements(self):
|
|
949
700
|
r"""Show simple interactive elements on the current page."""
|
|
701
|
+
assert self.page is not None
|
|
950
702
|
self.page.evaluate(self.page_script)
|
|
951
703
|
self.page.evaluate("""
|
|
952
704
|
() => {
|
|
953
|
-
document.querySelectorAll('a, button, input, select, textarea,
|
|
705
|
+
document.querySelectorAll('a, button, input, select, textarea,
|
|
706
|
+
[tabindex]:not([tabindex="-1"]),
|
|
707
|
+
[contenteditable="true"]').forEach(el => {
|
|
954
708
|
el.style.border = '2px solid red';
|
|
955
709
|
});
|
|
956
710
|
}
|
|
@@ -960,6 +714,7 @@ class BaseBrowser:
|
|
|
960
714
|
def get_webpage_content(self) -> str:
|
|
961
715
|
from html2text import html2text
|
|
962
716
|
|
|
717
|
+
assert self.page is not None
|
|
963
718
|
self._wait_for_load()
|
|
964
719
|
html_content = self.page.content()
|
|
965
720
|
|
|
@@ -1045,25 +800,32 @@ class BrowserToolkit(BaseToolkit):
|
|
|
1045
800
|
(default: :obj:`"en`")
|
|
1046
801
|
cookie_json_path (Optional[str]): Path to a JSON file containing
|
|
1047
802
|
authentication cookies and browser storage state. If provided
|
|
1048
|
-
and the file exists, the browser will load this state to
|
|
803
|
+
and the file exists, the browser will load this state to
|
|
804
|
+
maintain
|
|
1049
805
|
authenticated sessions without requiring manual login.
|
|
1050
806
|
(default: :obj:`None`)
|
|
1051
807
|
"""
|
|
1052
|
-
|
|
808
|
+
super().__init__() # Call to super().__init__() added
|
|
1053
809
|
self.browser = BaseBrowser(
|
|
1054
810
|
headless=headless,
|
|
1055
811
|
cache_dir=cache_dir,
|
|
1056
812
|
channel=channel,
|
|
1057
813
|
cookie_json_path=cookie_json_path,
|
|
1058
814
|
)
|
|
815
|
+
self.browser.web_agent_model = web_agent_model # Pass model to
|
|
816
|
+
# BaseBrowser instance
|
|
1059
817
|
|
|
1060
818
|
self.history_window = history_window
|
|
1061
819
|
self.web_agent_model = web_agent_model
|
|
1062
820
|
self.planning_agent_model = planning_agent_model
|
|
1063
821
|
self.output_language = output_language
|
|
1064
822
|
|
|
1065
|
-
self.history:
|
|
1066
|
-
self.web_agent
|
|
823
|
+
self.history: List[Dict[str, Any]] = [] # Typed history list
|
|
824
|
+
self.web_agent: ChatAgent
|
|
825
|
+
self.planning_agent: ChatAgent
|
|
826
|
+
self.web_agent, self.planning_agent = self._initialize_agent(
|
|
827
|
+
web_agent_model, planning_agent_model
|
|
828
|
+
)
|
|
1067
829
|
|
|
1068
830
|
def _reset(self):
|
|
1069
831
|
self.web_agent.reset()
|
|
@@ -1071,43 +833,40 @@ class BrowserToolkit(BaseToolkit):
|
|
|
1071
833
|
self.history = []
|
|
1072
834
|
os.makedirs(self.browser.cache_dir, exist_ok=True)
|
|
1073
835
|
|
|
1074
|
-
def _initialize_agent(
|
|
836
|
+
def _initialize_agent(
|
|
837
|
+
self,
|
|
838
|
+
web_agent_model_backend: Optional[BaseModelBackend],
|
|
839
|
+
planning_agent_model_backend: Optional[BaseModelBackend],
|
|
840
|
+
) -> Tuple[ChatAgent, ChatAgent]:
|
|
1075
841
|
r"""Initialize the agent."""
|
|
1076
842
|
from camel.agents import ChatAgent
|
|
1077
843
|
|
|
1078
|
-
if
|
|
1079
|
-
|
|
844
|
+
if web_agent_model_backend is None:
|
|
845
|
+
web_agent_model_instance = ModelFactory.create(
|
|
1080
846
|
model_platform=ModelPlatformType.OPENAI,
|
|
1081
847
|
model_type=ModelType.GPT_4_1,
|
|
1082
848
|
model_config_dict={"temperature": 0, "top_p": 1},
|
|
1083
849
|
)
|
|
1084
850
|
else:
|
|
1085
|
-
|
|
851
|
+
web_agent_model_instance = web_agent_model_backend
|
|
1086
852
|
|
|
1087
|
-
if
|
|
853
|
+
if planning_agent_model_backend is None:
|
|
1088
854
|
planning_model = ModelFactory.create(
|
|
1089
855
|
model_platform=ModelPlatformType.OPENAI,
|
|
1090
856
|
model_type=ModelType.O3_MINI,
|
|
1091
857
|
)
|
|
1092
858
|
else:
|
|
1093
|
-
planning_model =
|
|
859
|
+
planning_model = planning_agent_model_backend
|
|
1094
860
|
|
|
1095
|
-
system_prompt =
|
|
1096
|
-
You are a helpful web agent that can assist users in browsing the web.
|
|
1097
|
-
Given a high-level task, you can leverage predefined browser tools to help
|
|
1098
|
-
users achieve their goals.
|
|
1099
|
-
"""
|
|
861
|
+
system_prompt = WEB_AGENT_SYSTEM_PROMPT
|
|
1100
862
|
|
|
1101
863
|
web_agent = ChatAgent(
|
|
1102
864
|
system_message=system_prompt,
|
|
1103
|
-
model=
|
|
865
|
+
model=web_agent_model_instance,
|
|
1104
866
|
output_language=self.output_language,
|
|
1105
867
|
)
|
|
1106
868
|
|
|
1107
|
-
planning_system_prompt =
|
|
1108
|
-
You are a helpful planning agent that can assist users in planning complex
|
|
1109
|
-
tasks which need multi-step browser interaction.
|
|
1110
|
-
"""
|
|
869
|
+
planning_system_prompt = PLANNING_AGENT_SYSTEM_PROMPT
|
|
1111
870
|
|
|
1112
871
|
planning_agent = ChatAgent(
|
|
1113
872
|
system_message=planning_system_prompt,
|
|
@@ -1120,96 +879,24 @@ tasks which need multi-step browser interaction.
|
|
|
1120
879
|
def _observe(
|
|
1121
880
|
self, task_prompt: str, detailed_plan: Optional[str] = None
|
|
1122
881
|
) -> Tuple[str, str, str]:
|
|
1123
|
-
r"""Let agent observe the current environment, and get the next
|
|
882
|
+
r"""Let agent observe the current environment, and get the next
|
|
883
|
+
action."""
|
|
1124
884
|
|
|
1125
|
-
|
|
885
|
+
detailed_plan_prompt_str = ""
|
|
1126
886
|
|
|
1127
887
|
if detailed_plan is not None:
|
|
1128
|
-
|
|
888
|
+
detailed_plan_prompt_str = f"""
|
|
1129
889
|
Here is a plan about how to solve the task step-by-step which you must follow:
|
|
1130
890
|
<detailed_plan>{detailed_plan}<detailed_plan>
|
|
1131
891
|
"""
|
|
1132
892
|
|
|
1133
|
-
observe_prompt =
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
the browser, and provide the next appropriate action to take.
|
|
1141
|
-
|
|
1142
|
-
{detailed_plan_prompt}
|
|
1143
|
-
|
|
1144
|
-
Here are the current available browser functions you can use:
|
|
1145
|
-
{AVAILABLE_ACTIONS_PROMPT}
|
|
1146
|
-
|
|
1147
|
-
Here are the latest {self.history_window} trajectory (at most) you have taken:
|
|
1148
|
-
<history>
|
|
1149
|
-
{self.history[-self.history_window :]}
|
|
1150
|
-
</history>
|
|
1151
|
-
|
|
1152
|
-
Your output should be in json format, including the following fields:
|
|
1153
|
-
- `observation`: The detailed image description about the current viewport. Do
|
|
1154
|
-
not over-confident about the correctness of the history actions. You should
|
|
1155
|
-
always check the current viewport to make sure the correctness of the next
|
|
1156
|
-
action.
|
|
1157
|
-
- `reasoning`: The reasoning about the next action you want to take, and the
|
|
1158
|
-
possible obstacles you may encounter, and how to solve them. Do not forget to
|
|
1159
|
-
check the history actions to avoid the same mistakes.
|
|
1160
|
-
- `action_code`: The action code you want to take. It is only one step action
|
|
1161
|
-
code, without any other texts (such as annotation)
|
|
1162
|
-
|
|
1163
|
-
Here is two example of the output:
|
|
1164
|
-
```json
|
|
1165
|
-
{{
|
|
1166
|
-
"observation": [IMAGE_DESCRIPTION],
|
|
1167
|
-
"reasoning": [YOUR_REASONING],
|
|
1168
|
-
"action_code": "fill_input_id([ID], [TEXT])"
|
|
1169
|
-
}}
|
|
1170
|
-
|
|
1171
|
-
{{
|
|
1172
|
-
"observation": "The current page is a CAPTCHA verification page on Amazon. It asks the user to ..",
|
|
1173
|
-
"reasoning": "To proceed with the task of searching for products, I need to complete..",
|
|
1174
|
-
"action_code": "fill_input_id(3, 'AUXPMR')"
|
|
1175
|
-
}}
|
|
1176
|
-
|
|
1177
|
-
Here are some tips for you:
|
|
1178
|
-
- Never forget the overall question: **{task_prompt}**
|
|
1179
|
-
- Maybe after a certain operation (e.g. click_id), the page content has not
|
|
1180
|
-
changed. You can check whether the action step is successful by looking at the
|
|
1181
|
-
`success` of the action step in the history. If successful, it means that the
|
|
1182
|
-
page content is indeed the same after the click. You need to try other methods.
|
|
1183
|
-
- If using one way to solve the problem is not successful, try other ways.
|
|
1184
|
-
Make sure your provided ID is correct!
|
|
1185
|
-
- Some cases are very complex and need to be achieve by an iterative process.
|
|
1186
|
-
You can use the `back()` function to go back to the previous page to try other
|
|
1187
|
-
methods.
|
|
1188
|
-
- There are many links on the page, which may be useful for solving the
|
|
1189
|
-
problem. You can use the `click_id()` function to click on the link to see if
|
|
1190
|
-
it is useful.
|
|
1191
|
-
- Always keep in mind that your action must be based on the ID shown in the
|
|
1192
|
-
current image or viewport, not the ID shown in the history.
|
|
1193
|
-
- Do not use `stop()` lightly. Always remind yourself that the image only
|
|
1194
|
-
shows a part of the full page. If you cannot find the answer, try to use
|
|
1195
|
-
functions like `scroll_up()` and `scroll_down()` to check the full content of
|
|
1196
|
-
the webpage before doing anything else, because the answer or next key step
|
|
1197
|
-
may be hidden in the content below.
|
|
1198
|
-
- If the webpage needs human verification, you must avoid processing it.
|
|
1199
|
-
Please use `back()` to go back to the previous page, and try other ways.
|
|
1200
|
-
- If you have tried everything and still cannot resolve the issue, please stop
|
|
1201
|
-
the simulation, and report issues you have encountered.
|
|
1202
|
-
- Check the history actions carefully, detect whether you have repeatedly made
|
|
1203
|
-
the same actions or not.
|
|
1204
|
-
- When dealing with wikipedia revision history related tasks, you need to
|
|
1205
|
-
think about the solution flexibly. First, adjust the browsing history
|
|
1206
|
-
displayed on a single page to the maximum, and then make use of the
|
|
1207
|
-
find_text_on_page function. This is extremely useful which can quickly locate
|
|
1208
|
-
the text you want to find and skip massive amount of useless information.
|
|
1209
|
-
- Flexibly use interactive elements like slide down selection bar to filter
|
|
1210
|
-
out the information you need. Sometimes they are extremely useful.
|
|
1211
|
-
```
|
|
1212
|
-
"""
|
|
893
|
+
observe_prompt = OBSERVE_PROMPT_TEMPLATE.format(
|
|
894
|
+
task_prompt=task_prompt,
|
|
895
|
+
detailed_plan_prompt=detailed_plan_prompt_str,
|
|
896
|
+
AVAILABLE_ACTIONS_PROMPT=AVAILABLE_ACTIONS_PROMPT,
|
|
897
|
+
history_window=self.history_window,
|
|
898
|
+
history=self.history[-self.history_window :],
|
|
899
|
+
)
|
|
1213
900
|
|
|
1214
901
|
# get current state
|
|
1215
902
|
som_screenshot, _ = self.browser.get_som_screenshot(save_image=True)
|
|
@@ -1223,7 +910,8 @@ out the information you need. Sometimes they are extremely useful.
|
|
|
1223
910
|
|
|
1224
911
|
resp_content = resp.msgs[0].content
|
|
1225
912
|
|
|
1226
|
-
resp_dict = _parse_json_output(resp_content)
|
|
913
|
+
resp_dict = _parse_json_output(resp_content, logger) # Pass logger to
|
|
914
|
+
# _parse_json_output
|
|
1227
915
|
observation_result: str = resp_dict.get("observation", "")
|
|
1228
916
|
reasoning_result: str = resp_dict.get("reasoning", "")
|
|
1229
917
|
action_code: str = resp_dict.get("action_code", "")
|
|
@@ -1244,7 +932,10 @@ out the information you need. Sometimes they are extremely useful.
|
|
|
1244
932
|
id_part = (
|
|
1245
933
|
parts[0].replace("fill_input_id(", "").strip()
|
|
1246
934
|
)
|
|
1247
|
-
action_code =
|
|
935
|
+
action_code = (
|
|
936
|
+
f"fill_input_id({id_part}, 'Please "
|
|
937
|
+
f"fill the text here.')"
|
|
938
|
+
)
|
|
1248
939
|
|
|
1249
940
|
action_code = action_code.replace("`", "").strip()
|
|
1250
941
|
|
|
@@ -1346,43 +1037,36 @@ out the information you need. Sometimes they are extremely useful.
|
|
|
1346
1037
|
)
|
|
1347
1038
|
|
|
1348
1039
|
def _get_final_answer(self, task_prompt: str) -> str:
|
|
1349
|
-
r"""Get the final answer based on the task prompt and current
|
|
1350
|
-
|
|
1040
|
+
r"""Get the final answer based on the task prompt and current
|
|
1041
|
+
browser state.
|
|
1042
|
+
It is used when the agent thinks that the task can be completed
|
|
1043
|
+
without any further action, and answer can be directly found in the
|
|
1044
|
+
current viewport.
|
|
1351
1045
|
"""
|
|
1352
1046
|
|
|
1353
|
-
prompt =
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
<history>{self.history}</history>
|
|
1357
|
-
Please find the final answer, or give valuable insights and founds (e.g. if previous actions contain downloading files, your output should include the path of the downloaded file) about the overall task: <task>{task_prompt}</task>
|
|
1358
|
-
"""
|
|
1047
|
+
prompt = GET_FINAL_ANSWER_PROMPT_TEMPLATE.format(
|
|
1048
|
+
history=self.history, task_prompt=task_prompt
|
|
1049
|
+
)
|
|
1359
1050
|
|
|
1360
1051
|
message = BaseMessage.make_user_message(
|
|
1361
1052
|
role_name='user',
|
|
1362
1053
|
content=prompt,
|
|
1363
1054
|
)
|
|
1364
|
-
|
|
1055
|
+
self.web_agent.reset() # Reset before step
|
|
1365
1056
|
resp = self.web_agent.step(message)
|
|
1366
1057
|
return resp.msgs[0].content
|
|
1367
1058
|
|
|
1368
1059
|
def _task_planning(self, task_prompt: str, start_url: str) -> str:
|
|
1369
1060
|
r"""Plan the task based on the given task prompt."""
|
|
1370
1061
|
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
<task>{task_prompt}</task>
|
|
1375
|
-
According to the problem above, if we use browser interaction, what is the general process of the interaction after visiting the webpage `{start_url}`?
|
|
1376
|
-
|
|
1377
|
-
Please note that it can be viewed as Partially Observable MDP. Do not over-confident about your plan.
|
|
1378
|
-
Please first restate the task in detail, and then provide a detailed plan to solve the task.
|
|
1379
|
-
"""
|
|
1380
|
-
# Here are some tips for you: Please note that we can only see a part of the full page because of the limited viewport after an action. Thus, do not forget to use methods like `scroll_up()` and `scroll_down()` to check the full content of the webpage, because the answer or next key step may be hidden in the content below.
|
|
1062
|
+
planning_prompt = TASK_PLANNING_PROMPT_TEMPLATE.format(
|
|
1063
|
+
task_prompt=task_prompt, start_url=start_url
|
|
1064
|
+
)
|
|
1381
1065
|
|
|
1382
1066
|
message = BaseMessage.make_user_message(
|
|
1383
1067
|
role_name='user', content=planning_prompt
|
|
1384
1068
|
)
|
|
1385
|
-
|
|
1069
|
+
self.planning_agent.reset() # Reset before step
|
|
1386
1070
|
resp = self.planning_agent.step(message)
|
|
1387
1071
|
return resp.msgs[0].content
|
|
1388
1072
|
|
|
@@ -1396,35 +1080,26 @@ Please first restate the task in detail, and then provide a detailed plan to sol
|
|
|
1396
1080
|
detailed_plan (str): The detailed plan to replan.
|
|
1397
1081
|
|
|
1398
1082
|
Returns:
|
|
1399
|
-
Tuple[bool, str]: A tuple containing a boolean indicating
|
|
1083
|
+
Tuple[bool, str]: A tuple containing a boolean indicating
|
|
1084
|
+
whether the task needs to be replanned, and the replanned schema.
|
|
1400
1085
|
"""
|
|
1401
1086
|
|
|
1402
|
-
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
In order to solve the task, we made a detailed plan previously. Here is the detailed plan:
|
|
1409
|
-
<detailed plan>{detailed_plan}</detailed plan>
|
|
1410
|
-
|
|
1411
|
-
According to the task above, we have made a series of observations, reasonings, and actions. Here are the latest {self.history_window} trajectory (at most) we have taken:
|
|
1412
|
-
<history>{self.history[-self.history_window :]}</history>
|
|
1413
|
-
|
|
1414
|
-
However, the task is not completed yet. As the task is partially observable, we may need to replan the task based on the current state of the browser if necessary.
|
|
1415
|
-
Now please carefully examine the current task planning schema, and our history actions, and then judge whether the task needs to be fundamentally replanned. If so, please provide a detailed replanned schema (including the restated overall task).
|
|
1416
|
-
|
|
1417
|
-
Your output should be in json format, including the following fields:
|
|
1418
|
-
- `if_need_replan`: bool, A boolean value indicating whether the task needs to be fundamentally replanned.
|
|
1419
|
-
- `replanned_schema`: str, The replanned schema for the task, which should not be changed too much compared with the original one. If the task does not need to be replanned, the value should be an empty string.
|
|
1420
|
-
"""
|
|
1087
|
+
replanning_prompt = TASK_REPLANNING_PROMPT_TEMPLATE.format(
|
|
1088
|
+
task_prompt=task_prompt,
|
|
1089
|
+
detailed_plan=detailed_plan,
|
|
1090
|
+
history_window=self.history_window,
|
|
1091
|
+
history=self.history[-self.history_window :],
|
|
1092
|
+
)
|
|
1421
1093
|
# Reset the history message of planning_agent.
|
|
1422
1094
|
self.planning_agent.reset()
|
|
1423
1095
|
resp = self.planning_agent.step(replanning_prompt)
|
|
1424
|
-
resp_dict = _parse_json_output(
|
|
1096
|
+
resp_dict = _parse_json_output(
|
|
1097
|
+
resp.msgs[0].content, logger
|
|
1098
|
+
) # Pass logger
|
|
1425
1099
|
|
|
1426
|
-
|
|
1427
|
-
|
|
1100
|
+
if_need_replan_eval = resp_dict.get("if_need_replan", False)
|
|
1101
|
+
if_need_replan = cast(bool, if_need_replan_eval) # Ensure bool
|
|
1102
|
+
replanned_schema: str = resp_dict.get("replanned_schema", "")
|
|
1428
1103
|
|
|
1429
1104
|
if if_need_replan:
|
|
1430
1105
|
return True, replanned_schema
|
|
@@ -1463,10 +1138,10 @@ Your output should be in json format, including the following fields:
|
|
|
1463
1138
|
logger.debug(f"Observation: {observation}")
|
|
1464
1139
|
logger.debug(f"Reasoning: {reasoning}")
|
|
1465
1140
|
logger.debug(f"Action code: {action_code}")
|
|
1466
|
-
|
|
1141
|
+
trajectory_info: Dict[str, Any]
|
|
1467
1142
|
if "stop" in action_code:
|
|
1468
1143
|
task_completed = True
|
|
1469
|
-
trajectory_info = {
|
|
1144
|
+
trajectory_info = { # Typed trajectory_info
|
|
1470
1145
|
"round": i,
|
|
1471
1146
|
"observation": observation,
|
|
1472
1147
|
"thought": reasoning,
|
|
@@ -1483,7 +1158,7 @@ Your output should be in json format, including the following fields:
|
|
|
1483
1158
|
if not success:
|
|
1484
1159
|
logger.warning(f"Error while executing the action: {info}")
|
|
1485
1160
|
|
|
1486
|
-
trajectory_info = {
|
|
1161
|
+
trajectory_info = { # Typed trajectory_info
|
|
1487
1162
|
"round": i,
|
|
1488
1163
|
"observation": observation,
|
|
1489
1164
|
"thought": reasoning,
|
|
@@ -1502,15 +1177,20 @@ Your output should be in json format, including the following fields:
|
|
|
1502
1177
|
detailed_plan = replanned_schema
|
|
1503
1178
|
logger.debug(f"Replanned schema: {replanned_schema}")
|
|
1504
1179
|
|
|
1180
|
+
simulation_result: str
|
|
1505
1181
|
if not task_completed:
|
|
1506
1182
|
simulation_result = f"""
|
|
1507
|
-
The task is not completed within the round limit. Please
|
|
1183
|
+
The task is not completed within the round limit. Please
|
|
1184
|
+
check the last round {self.history_window} information to
|
|
1185
|
+
see if there is any useful information:
|
|
1508
1186
|
<history>{self.history[-self.history_window :]}</history>
|
|
1509
1187
|
"""
|
|
1510
1188
|
|
|
1511
1189
|
else:
|
|
1512
1190
|
simulation_result = self._get_final_answer(task_prompt)
|
|
1513
1191
|
|
|
1192
|
+
self.browser.close() # Close browser after task completion or limit
|
|
1193
|
+
# reached
|
|
1514
1194
|
return simulation_result
|
|
1515
1195
|
|
|
1516
1196
|
def get_tools(self) -> List[FunctionTool]:
|