myagent-ai 1.2.2 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,24 @@
1
1
  """
2
- skills/browser_skill.py - 浏览器操作技能
3
- =========================================
4
- 提供浏览器自动化操作功能(使用 Playwright)。
2
+ skills/browser_skill.py - 浏览器自动化技能 (完整版)
3
+ ===================================================
4
+ 基于 Playwright 的完整浏览器自动化,支持持久会话、多标签页、JS 执行、截图等。
5
+ 所有浏览器技能共享同一个浏览器实例,实现跨操作的持久会话。
6
+
7
+ Skills:
8
+ - BrowserOpenSkill: 打开 URL,返回结构化页面信息
9
+ - BrowserClickSkill: 通过 CSS/文本选择器点击元素
10
+ - BrowserFillSkill: 通过 CSS/文本选择器填写输入框
11
+ - BrowserScreenshotSkill: 截取当前页面截图,返回路径供 VLM 分析
12
+ - BrowserEvalSkill: 在页面上执行 JavaScript
13
+ - BrowserNavigateSkill: 浏览器导航(前进、后退、新标签页)
14
+ - BrowserCloseSkill: 关闭当前页面或浏览器
5
15
  """
6
16
  from __future__ import annotations
7
17
 
8
- from typing import Optional, List
18
+ import asyncio
19
+ import os
20
+ import time
21
+ from typing import Any, Dict, List, Optional
9
22
 
10
23
  from core.logger import get_logger
11
24
  from skills.base import Skill, SkillResult, SkillParameter
@@ -13,134 +26,745 @@ from skills.base import Skill, SkillResult, SkillParameter
13
26
  logger = get_logger("myagent.skills.browser")
14
27
 
15
28
 
29
class BrowserSession:
    """
    Manager for one shared, persistent Playwright browser.

    All state is stored in class attributes, so every browser skill works on
    the same Chromium instance and the same set of tabs. The browser is
    launched lazily on first use. Methods that mutate the shared state
    serialize on a single ``asyncio.Lock``.
    """

    _browser: Any = None  # Playwright Browser instance (None until first launch)
    _playwright: Any = None  # driver object returned by async_playwright().start()
    _pages: Dict[str, Any] = {}  # page_id -> Page mapping (multi-tab support)
    _active_page_id: str = ""  # ID of the tab skills act on by default
    _lock: Optional[asyncio.Lock] = None  # created lazily by _get_lock()

    @classmethod
    def _get_lock(cls) -> asyncio.Lock:
        """Return the shared lock, creating it on first use.

        There is no ``await`` between the check and the assignment, so
        coroutines on one event loop cannot end up with two different locks.
        """
        if cls._lock is None:
            cls._lock = asyncio.Lock()
        return cls._lock

    @classmethod
    async def get_browser(cls) -> Any:
        """Return the shared browser, launching headless Chromium if needed.

        A launch happens on the first call and again whenever the previous
        browser is no longer connected. On relaunch the tab registry is
        reset, because pages that belonged to the old browser are unusable,
        and the previous driver is stopped so it is not leaked.
        """
        from playwright.async_api import async_playwright

        async with cls._get_lock():
            if cls._browser is None or not cls._browser.is_connected():
                # Pages of a dead browser must not be handed out again.
                cls._pages.clear()
                cls._active_page_id = ""

                if cls._playwright is not None:
                    try:
                        await cls._playwright.stop()
                    except Exception as exc:
                        logger.warning(f"停止旧的 Playwright 实例失败(已忽略): {exc}")
                    cls._playwright = None

                cls._playwright = await async_playwright().start()
                cls._browser = await cls._playwright.chromium.launch(
                    headless=True,
                    args=["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"],
                )
                logger.info("浏览器实例已启动 (Chromium, headless)")
            return cls._browser

    @classmethod
    async def get_page(cls, page_id: str = "default") -> Any:
        """Return the tab registered as ``page_id`` and make it the active tab.

        A new tab is created when the ID is unknown or when the registered
        page has been closed in the meantime. New tabs get a 30 s default
        timeout for both actions and navigations.
        """
        # Must run before taking the lock: get_browser() acquires the same
        # lock and asyncio.Lock is not reentrant.
        browser = await cls.get_browser()

        async with cls._get_lock():
            page = cls._pages.get(page_id)
            if page is None or page.is_closed():
                page = await browser.new_page()
                page.set_default_timeout(30000)
                page.set_default_navigation_timeout(30000)
                cls._pages[page_id] = page
                logger.info(f"新标签页已创建: {page_id}")
            cls._active_page_id = page_id
            return page

    @classmethod
    async def get_active_page(cls) -> tuple[Any, str]:
        """Return ``(page, page_id)`` for the active tab.

        Falls back to the ``"default"`` tab (creating it if necessary) when
        no tab is active, the active ID is not registered, or the active
        page has already been closed.
        """
        active_id = cls._active_page_id
        page = cls._pages.get(active_id) if active_id else None
        if page is None or page.is_closed():
            return await cls.get_page("default"), "default"
        return page, active_id

    @classmethod
    async def close_page(cls, page_id: str = "") -> SkillResult:
        """Close one tab; an empty ``page_id`` means the active tab.

        When the closed tab was the active one, the first remaining tab (in
        registry order) becomes active. Returns a failed ``SkillResult``
        when there is no matching tab to close.
        """
        async with cls._get_lock():
            page_id = page_id or cls._active_page_id
            if not page_id or page_id not in cls._pages:
                return SkillResult(success=False, error="没有可关闭的页面")

            await cls._pages[page_id].close()
            del cls._pages[page_id]
            logger.info(f"标签页已关闭: {page_id}")

            if page_id == cls._active_page_id:
                # Promote another tab so later skills still have a target.
                cls._active_page_id = next(iter(cls._pages), "")
                message = f"标签页 '{page_id}' 已关闭,当前活跃: {cls._active_page_id or '无'}"
            else:
                message = f"标签页 '{page_id}' 已关闭"

            return SkillResult(
                success=True,
                message=message,
                data={"remaining_tabs": list(cls._pages.keys())},
            )

    @classmethod
    async def close_browser(cls) -> SkillResult:
        """Close every tab, the browser and the Playwright driver.

        Closing individual tabs is best-effort: a failure is logged and the
        shutdown continues. The shared references are always reset, even if
        closing the browser raises, so the next ``get_browser()`` call
        starts from a clean state.
        """
        async with cls._get_lock():
            for page in list(cls._pages.values()):
                try:
                    await page.close()
                except Exception as exc:
                    logger.warning(f"关闭标签页时出错(已忽略): {exc}")
            cls._pages.clear()
            cls._active_page_id = ""

            try:
                if cls._browser:
                    await cls._browser.close()
                    logger.info("浏览器实例已关闭")
            finally:
                cls._browser = None
                try:
                    if cls._playwright:
                        await cls._playwright.stop()
                finally:
                    cls._playwright = None

            return SkillResult(success=True, message="浏览器已完全关闭")

    @classmethod
    def _generate_screenshot_path(cls) -> str:
        """Build a unique PNG path under ``/tmp`` for a new screenshot.

        The name keeps the ``myagent_gui_screenshot_<timestamp>`` prefix and
        appends a short random suffix, because the timestamp only has
        one-second resolution and two screenshots taken within the same
        second would otherwise overwrite each other.
        """
        import uuid

        timestamp = time.strftime("%Y%m%d_%H%M%S")
        suffix = uuid.uuid4().hex[:8]
        return f"/tmp/myagent_gui_screenshot_{timestamp}_{suffix}.png"
154
+
155
+
16
156
class BrowserOpenSkill(Skill):
    """
    Open a URL and return structured information about the loaded page.

    The shared browser is started on demand. Navigation happens in the tab
    named by ``page_id``; when ``page_id`` is empty the currently active tab
    is reused, as the parameter description states. The result contains the
    page title, visible text, visible links, forms and headings.
    """

    name = "browser_open"
    description = (
        "使用浏览器打开指定 URL,返回页面的结构化信息(标题、可见文本、链接、表单等)。"
        "浏览器会保持持久会话,后续操作可在同一页面继续。"
    )
    category = "browser"
    parameters = [
        SkillParameter("url", "string", "要打开的网页 URL(必须以 http:// 或 https:// 开头)", required=True),
        SkillParameter("wait", "integer", "页面加载后额外等待时间(毫秒),用于等待动态内容渲染", required=False, default=3000),
        SkillParameter("page_id", "string", "在指定标签页打开(留空则使用当前活跃标签页)", required=False, default=""),
        SkillParameter("wait_until", "string", "导航等待策略: domcontentloaded/load/networkidle/commit", required=False, default="domcontentloaded",
                       enum=["domcontentloaded", "load", "networkidle", "commit"]),
    ]

    # Runs inside the page and returns a JSON-serializable summary. Every
    # list and text field is capped so the result stays bounded in size.
    _PAGE_INFO_JS = """() => {
        // 提取可见文本(去除隐藏元素)
        const allText = document.body ? document.body.innerText : '';

        // 提取所有链接
        const links = Array.from(document.querySelectorAll('a[href]'))
            .filter(a => a.offsetParent !== null)  // 只取可见链接
            .slice(0, 50)
            .map(a => ({
                text: a.innerText.trim().substring(0, 100),
                href: a.href,
            }))
            .filter(l => l.text);

        // 提取表单信息
        const forms = Array.from(document.querySelectorAll('form'))
            .slice(0, 20)
            .map(form => {
                const inputs = Array.from(form.querySelectorAll('input, textarea, select'))
                    .map(el => ({
                        tag: el.tagName.toLowerCase(),
                        type: el.type || '',
                        name: el.name || '',
                        placeholder: el.placeholder || '',
                        id: el.id || '',
                    }));
                return {
                    action: form.action || '',
                    method: (form.method || 'GET').toUpperCase(),
                    inputs: inputs.slice(0, 20),
                };
            });

        // 提取标题层级(用于理解页面结构)
        const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4'))
            .map(h => ({
                tag: h.tagName,
                text: h.innerText.trim().substring(0, 200),
            }))
            .filter(h => h.text)
            .slice(0, 20);

        return {
            title: document.title || '',
            text: allText.substring(0, 20000),
            links: links,
            forms: forms,
            headings: headings,
            url: window.location.href,
        };
    }"""

    async def execute(
        self,
        url: str = "",
        wait: int = 3000,
        page_id: str = "",
        wait_until: str = "domcontentloaded",
        **kwargs,
    ) -> SkillResult:
        """Navigate to ``url`` and extract structured page information.

        Args:
            url: Address to open; an empty value yields a failed result.
            wait: Extra delay in milliseconds after navigation, giving
                dynamic content time to render. Skipped when not positive.
            page_id: Tab to navigate. Empty means the active tab.
            wait_until: Playwright navigation wait strategy.

        Returns:
            A ``SkillResult`` whose ``data`` is the dict produced by the
            in-page extraction script, or a failed result carrying the
            error text.
        """
        try:
            # Imported only to check that Playwright is available.
            from playwright.async_api import async_playwright
        except ImportError:
            # Try to install the missing dependency automatically.
            from core.deps_checker import ensure_skill_deps
            if not ensure_skill_deps("browser"):
                return SkillResult(
                    success=False,
                    error="Playwright 安装失败,请手动运行: pip install playwright && playwright install chromium",
                )
            from playwright.async_api import async_playwright

        if not url:
            return SkillResult(success=False, error="缺少必需参数: url")

        try:
            if page_id:
                page = await BrowserSession.get_page(page_id)
            else:
                # An empty page_id means "stay in the active tab"; jumping to
                # the "default" tab here would silently switch tabs.
                page, _ = await BrowserSession.get_active_page()

            await page.goto(url, wait_until=wait_until, timeout=30000)

            if wait and wait > 0:
                await page.wait_for_timeout(wait)

            page_info = await page.evaluate(self._PAGE_INFO_JS)

            title = page_info.get('title', '未知页面')
            text_len = len(page_info.get('text', ''))
            link_count = len(page_info.get('links', []))
            return SkillResult(
                success=True,
                data=page_info,
                message=f"已打开: {title} (文本 {text_len} 字符, {link_count} 个链接)",
            )
        except Exception as e:
            logger.error(f"浏览器打开失败: {e}")
            return SkillResult(success=False, error=f"浏览器打开失败: {e}")
73
273
 
74
274
 
75
275
class BrowserClickSkill(Skill):
    """
    Click an element on the active page.

    The element is located either by CSS selector (``selector``) or by its
    visible text (``text``, using Playwright's ``text=`` selector engine).
    When both are given, ``selector`` wins.
    """

    name = "browser_click"
    description = (
        "在当前浏览器页面中点击元素。支持 CSS 选择器(selector)或可见文本(text)定位。"
        "点击后会等待 1 秒让页面响应。"
    )
    category = "browser"
    parameters = [
        SkillParameter("selector", "string", "目标元素的 CSS 选择器(如 'button#submit', 'a.login-link', 'input[type=submit]')", required=False, default=""),
        SkillParameter("text", "string", "目标元素的可见文本(如 '登录', '搜索', 'Submit')", required=False, default=""),
        SkillParameter("wait_after", "integer", "点击后等待时间(毫秒),用于等待页面响应", required=False, default=1000),
        SkillParameter("double_click", "boolean", "是否双击(默认单击)", required=False, default=False),
    ]

    async def execute(
        self,
        selector: str = "",
        text: str = "",
        wait_after: int = 1000,
        double_click: bool = False,
        **kwargs,
    ) -> SkillResult:
        """Locate an element, click it and report the resulting page state.

        Args:
            selector: CSS selector of the target; takes precedence.
            text: Visible text of the target, used when no selector is given.
            wait_after: Delay in milliseconds after the click.
            double_click: Double-click instead of single-click.

        Returns:
            A ``SkillResult`` with the page title and the first 3000
            characters of visible text. If the click worked but the page
            state cannot be read afterwards (for example because the click
            started a navigation), the result is still successful and the
            unavailable fields are empty.
        """
        try:
            # Imported only to check that Playwright is available.
            from playwright.async_api import async_playwright
        except ImportError:
            from core.deps_checker import ensure_skill_deps
            if not ensure_skill_deps("browser"):
                return SkillResult(
                    success=False,
                    error="Playwright 安装失败,请手动运行: pip install playwright && playwright install chromium",
                )
            from playwright.async_api import async_playwright

        if not selector and not text:
            return SkillResult(success=False, error="必须提供 selector 或 text 参数来定位元素")

        try:
            page, _ = await BrowserSession.get_active_page()

            # Both locating modes share the same wait-then-click sequence.
            target = page.locator(selector if selector else f"text={text}").first
            await target.wait_for(state="visible", timeout=10000)
            if double_click:
                await target.dblclick()
            else:
                await target.click()

            if wait_after and wait_after > 0:
                await page.wait_for_timeout(wait_after)

            # The click has already succeeded here. Reading the page can
            # still fail while a navigation replaces the document, so treat
            # the snapshot as best-effort instead of failing the click.
            title = ""
            visible_text = ""
            try:
                title = await page.title()
                visible_text = await page.evaluate(
                    "() => document.body ? document.body.innerText.substring(0, 3000) : ''"
                )
            except Exception as snapshot_error:
                logger.warning(f"点击后获取页面状态失败(已忽略): {snapshot_error}")

            click_desc = "双击" if double_click else "点击"
            loc_desc = f"选择器 '{selector}'" if selector else f"文本 '{text}'"

            return SkillResult(
                success=True,
                data={"title": title, "text_preview": visible_text},
                message=f"已{click_desc} {loc_desc},当前页面: {title}",
            )
        except Exception as e:
            logger.error(f"点击元素失败: {e}")
            return SkillResult(
                success=False,
                error=f"点击元素失败: {e}(请检查选择器 '{selector}' 或文本 '{text}' 是否正确)",
            )
111
365
 
112
366
 
113
367
class BrowserFillSkill(Skill):
    """
    Fill a form field on the active page.

    The field is located by CSS selector, or by associated text: first a
    ``<label>`` containing the text, then a field whose ``placeholder``
    contains it. By default the field is cleared before the new value is
    written.
    """

    name = "browser_fill"
    description = (
        "在当前浏览器页面的输入框中填写内容。支持 CSS 选择器(selector)或文本标签定位。"
        "填写前会自动清空原有内容。"
    )
    category = "browser"
    parameters = [
        SkillParameter("selector", "string", "输入框的 CSS 选择器(如 'input#username', 'textarea[name=content]')", required=False, default=""),
        SkillParameter("text", "string", "输入框关联的可见文本标签(如 '用户名', 'Email')", required=False, default=""),
        SkillParameter("value", "string", "要填写的值(文本内容)", required=True),
        SkillParameter("clear_first", "boolean", "是否先清空输入框(默认 true)", required=False, default=True),
        SkillParameter("press_enter", "boolean", "填写后是否按回车键(默认 false)", required=False, default=False),
    ]

    # Runs inside the page. Returns the matching field element or null.
    # A label that matches but has no usable field does not end the search.
    _FIND_FIELD_BY_TEXT_JS = """(text) => {
        const FIELDS = 'input, textarea, select';
        // 方式1: 查找 label 标签
        for (const label of document.querySelectorAll('label')) {
            if (!label.innerText.trim().includes(text)) continue;
            const target = (label.htmlFor && document.getElementById(label.htmlFor))
                || label.querySelector(FIELDS);
            if (target) return target;
        }
        // 方式2: 查找 placeholder 包含文本的输入框
        for (const input of document.querySelectorAll(FIELDS)) {
            if (input.placeholder && input.placeholder.includes(text)) return input;
        }
        return null;
    }"""

    async def execute(
        self,
        selector: str = "",
        text: str = "",
        value: str = "",
        clear_first: bool = True,
        press_enter: bool = False,
        **kwargs,
    ) -> SkillResult:
        """Locate a field and write ``value`` into it.

        Args:
            selector: CSS selector of the field; takes precedence.
            text: Label or placeholder text, used when no selector is given.
            value: Text to write; an empty value yields a failed result.
            clear_first: Clear the field before writing.
            press_enter: Press Enter in the field after writing.

        Returns:
            A successful ``SkillResult`` describing what was filled, or a
            failed one when the field is not found or Playwright raises.
        """
        try:
            # Imported only to check that Playwright is available.
            from playwright.async_api import async_playwright
        except ImportError:
            from core.deps_checker import ensure_skill_deps
            if not ensure_skill_deps("browser"):
                return SkillResult(
                    success=False,
                    error="Playwright 安装失败,请手动运行: pip install playwright && playwright install chromium",
                )
            from playwright.async_api import async_playwright

        if not value:
            return SkillResult(success=False, error="缺少必需参数: value")
        if not selector and not text:
            return SkillResult(success=False, error="必须提供 selector 或 text 参数来定位输入框")

        loc_desc = f"选择器 '{selector}'" if selector else f"文本标签 '{text}'"

        try:
            page, _ = await BrowserSession.get_active_page()

            if selector:
                field = page.locator(selector).first
                await field.wait_for(state="visible", timeout=10000)
            else:
                # evaluate_handle() always returns a JSHandle, even when the
                # script returns null, and a plain JSHandle has no fill() or
                # press(). as_element() gives an ElementHandle, or None when
                # the handle does not point at an element.
                handle = await page.evaluate_handle(self._FIND_FIELD_BY_TEXT_JS, text)
                field = handle.as_element()
                if field is None:
                    await handle.dispose()
                    return SkillResult(success=False, error=f"未找到输入框: {loc_desc}")

            # Locator and ElementHandle both provide fill() and press().
            if clear_first:
                await field.fill("")
            await field.fill(value)

            if press_enter:
                await field.press("Enter")

            await page.wait_for_timeout(1000)

            return SkillResult(
                success=True,
                message=f"已在 {loc_desc} 填写内容({len(value)} 字符)",
            )
        except Exception as e:
            logger.error(f"填写输入框失败: {e}")
            return SkillResult(success=False, error=f"填写输入框失败: {e}")
467
+
468
+
469
class BrowserScreenshotSkill(Skill):
    """
    Take a screenshot of the active page.

    The PNG is written to the path produced by
    ``BrowserSession._generate_screenshot_path()`` and that path is returned,
    so it can be handed to a VLM skill for visual analysis. Either a single
    element, the viewport, or the full scrollable page can be captured.
    """

    name = "browser_screenshot"
    description = (
        "截取当前浏览器页面的截图,保存为 PNG 文件并返回文件路径。"
        "可用于 VLM 视觉分析。支持全页面截图或仅截取可视区域。"
    )
    category = "browser"
    parameters = [
        SkillParameter("full_page", "boolean", "是否截取整个页面(包括滚动区域),默认仅截取可视区域", required=False, default=False),
        SkillParameter("selector", "string", "截取特定元素的截图(CSS 选择器),留空则截取整个页面", required=False, default=""),
    ]

    async def execute(
        self,
        full_page: bool = False,
        selector: str = "",
        **kwargs,
    ) -> SkillResult:
        """Capture the screenshot and describe the written file.

        Args:
            full_page: Capture the whole scrollable page instead of the
                viewport. Not used when ``selector`` is given.
            selector: CSS selector of a single element to capture.

        Returns:
            A ``SkillResult`` carrying the file path, its size in bytes and
            the page title, or a failed result with the error text.
        """
        try:
            # Imported only to check that Playwright is available.
            from playwright.async_api import async_playwright
        except ImportError:
            from core.deps_checker import ensure_skill_deps
            if not ensure_skill_deps("browser"):
                return SkillResult(
                    success=False,
                    error="Playwright 安装失败,请手动运行: pip install playwright && playwright install chromium",
                )
            from playwright.async_api import async_playwright

        try:
            active_page, _ = await BrowserSession.get_active_page()
            out_path = BrowserSession._generate_screenshot_path()

            if not selector:
                # Whole page or just the viewport, depending on full_page.
                await active_page.screenshot(path=out_path, full_page=full_page)
            else:
                # Capture only the first element matching the selector.
                target = active_page.locator(selector).first
                await target.wait_for(state="visible", timeout=10000)
                await target.screenshot(path=out_path)

            size_in_bytes = os.path.getsize(out_path)

            details = {
                "screenshot_path": out_path,
                "file_size_bytes": size_in_bytes,
                "full_page": full_page,
                "element_selector": selector or None,
                "page_title": await active_page.title(),
            }
            return SkillResult(
                success=True,
                data=details,
                message=f"截图已保存: {out_path} ({size_in_bytes} 字节)",
                files=[out_path],
            )
        except Exception as e:
            logger.error(f"截图失败: {e}")
            return SkillResult(success=False, error=f"截图失败: {e}")
538
+
539
+
540
class BrowserEvalSkill(Skill):
    """
    Run custom JavaScript in the active page and return its result.

    Accepted forms of ``code``:
      * an expression or statement list (``document.title``) - the value of
        the last expression is returned;
      * a function expression (``() => 1 + 1``) - Playwright calls it;
      * a function *body* containing ``return`` (``return 1 + 1``) - it is
        wrapped in an async function so the ``return`` becomes legal.

    NOTE(review): the code is executed as supplied in whatever page is
    currently open; the skill is marked ``dangerous`` for that reason.
    """

    name = "browser_eval"
    description = (
        "在当前浏览器页面中执行 JavaScript 代码并返回结果。"
        "代码应返回一个可序列化的值(字符串、数字、对象等)。"
        "可用于提取页面数据、修改 DOM、与页面 API 交互等高级操作。"
    )
    category = "browser"
    dangerous = True
    parameters = [
        SkillParameter("code", "string", "要执行的 JavaScript 代码(应包含 return 语句以返回结果)", required=True),
        SkillParameter("wait_after", "integer", "执行后等待时间(毫秒)", required=False, default=500),
    ]

    # Error text Chromium reports for a `return` outside of any function.
    _ILLEGAL_RETURN = "Illegal return statement"

    @staticmethod
    def _as_function_body(code: str) -> str:
        """Wrap ``code`` as the body of an async arrow function.

        The newlines keep a trailing ``//`` comment in ``code`` from
        swallowing the closing brace.
        """
        return f"async () => {{\n{code}\n}}"

    async def execute(
        self,
        code: str = "",
        wait_after: int = 500,
        **kwargs,
    ) -> SkillResult:
        """Evaluate ``code`` in the active page.

        Args:
            code: JavaScript source; an empty value yields a failed result.
            wait_after: Delay in milliseconds after the evaluation.

        Returns:
            A ``SkillResult`` with the evaluated value under
            ``data["result"]``, or a failed result with the error text.
        """
        try:
            # Imported only to check that Playwright is available.
            from playwright.async_api import async_playwright
        except ImportError:
            from core.deps_checker import ensure_skill_deps
            if not ensure_skill_deps("browser"):
                return SkillResult(
                    success=False,
                    error="Playwright 安装失败,请手动运行: pip install playwright && playwright install chromium",
                )
            from playwright.async_api import async_playwright

        if not code:
            return SkillResult(success=False, error="缺少必需参数: code")

        try:
            page, _ = await BrowserSession.get_active_page()
            source = code.strip()

            try:
                # Expressions and function expressions work as-is.
                result = await page.evaluate(source)
            except Exception as first_error:
                # A top-level `return` is rejected when the script is
                # parsed, before any of it runs, so retrying cannot repeat
                # side effects. Any other error is a real failure.
                if self._ILLEGAL_RETURN not in str(first_error):
                    raise
                result = await page.evaluate(self._as_function_body(source))

            if wait_after and wait_after > 0:
                await page.wait_for_timeout(wait_after)

            return SkillResult(
                success=True,
                data={"result": result},
                message=f"JavaScript 执行成功,返回类型: {type(result).__name__}",
            )
        except Exception as e:
            logger.error(f"JavaScript 执行失败: {e}")
            return SkillResult(
                success=False,
                error=f"JavaScript 执行失败: {e}",
            )
603
+
604
+
605
class BrowserNavigateSkill(Skill):
    """
    Browser navigation: history movement and tab management.

    Supported actions:
      - back: go to the previous history entry
      - forward: go to the next history entry
      - new_tab: open a tab (optionally navigating it to ``url``)
      - switch_tab: make an existing tab the active one
      - list_tabs: list all registered tabs
    """

    name = "browser_navigate"
    description = (
        "控制浏览器导航操作:前进、后退、打开新标签页、切换标签页、列出标签页。"
        "浏览器保持持久会话,标签页之间可自由切换。"
    )
    category = "browser"
    parameters = [
        SkillParameter("action", "string", "导航操作类型", required=True,
                       enum=["back", "forward", "new_tab", "switch_tab", "list_tabs"]),
        SkillParameter("url", "string", "新标签页的 URL(仅 action=new_tab 时有效)", required=False, default=""),
        SkillParameter("page_id", "string", "目标标签页 ID(仅 switch_tab 时有效)", required=False, default=""),
    ]

    async def execute(
        self,
        action: str = "",
        url: str = "",
        page_id: str = "",
        **kwargs,
    ) -> SkillResult:
        """Dispatch ``action`` to the matching navigation helper.

        Args:
            action: One of back / forward / new_tab / switch_tab / list_tabs.
            url: Initial URL for ``new_tab``.
            page_id: Tab ID for ``switch_tab``; optional fixed ID for
                ``new_tab``.

        Returns:
            The helper's ``SkillResult``, or a failed result for a missing
            or unknown action or when Playwright raises.
        """
        try:
            # Imported only to check that Playwright is available.
            from playwright.async_api import async_playwright
        except ImportError:
            from core.deps_checker import ensure_skill_deps
            if not ensure_skill_deps("browser"):
                return SkillResult(
                    success=False,
                    error="Playwright 安装失败,请手动运行: pip install playwright && playwright install chromium",
                )
            from playwright.async_api import async_playwright

        if not action:
            return SkillResult(success=False, error="缺少必需参数: action")

        try:
            if action in ("back", "forward"):
                return await self._move_in_history(action)
            if action == "new_tab":
                return await self._open_tab(url, page_id)
            if action == "switch_tab":
                return await self._switch_tab(page_id)
            if action == "list_tabs":
                return await self._list_tabs()
            return SkillResult(success=False, error=f"未知导航操作: {action}")
        except Exception as e:
            logger.error(f"浏览器导航失败: {e}")
            return SkillResult(success=False, error=f"浏览器导航失败: {e}")

    async def _move_in_history(self, direction: str) -> SkillResult:
        """Go one step back or forward in the active tab's history."""
        page, _ = await BrowserSession.get_active_page()
        going_back = direction == "back"
        url_before = page.url

        step = page.go_back if going_back else page.go_forward
        response = await step(wait_until="domcontentloaded", timeout=15000)

        # Playwright returns no response when there is no entry to move to.
        # The URL comparison keeps navigations that legitimately produce no
        # response but do change the page from being reported as failures.
        if response is None and page.url == url_before:
            reason = "无法后退: 当前标签页没有更早的历史记录" if going_back else "无法前进: 当前标签页没有更新的历史记录"
            return SkillResult(success=False, error=reason)

        title = await page.title()
        current_url = page.url
        verb = "已后退到" if going_back else "已前进到"
        return SkillResult(
            success=True,
            data={"title": title, "url": current_url},
            message=f"{verb}: {title} ({current_url})",
        )

    async def _open_tab(self, url: str, page_id: str) -> SkillResult:
        """Open a tab under ``page_id`` (or a generated ID) and activate it."""
        import uuid

        new_id = page_id or f"tab_{uuid.uuid4().hex[:8]}"
        page = await BrowserSession.get_page(new_id)
        if url:
            await page.goto(url, wait_until="domcontentloaded", timeout=30000)
        title = await page.title()
        tabs = list(BrowserSession._pages.keys())
        return SkillResult(
            success=True,
            data={"page_id": new_id, "title": title, "url": url, "all_tabs": tabs},
            message=f"新标签页 '{new_id}' 已打开{f',已导航到 {url}' if url else ''}",
        )

    async def _switch_tab(self, page_id: str) -> SkillResult:
        """Make the tab registered as ``page_id`` the active tab."""
        if not page_id:
            return SkillResult(success=False, error="switch_tab 需要指定 page_id 参数")
        if page_id not in BrowserSession._pages:
            available = list(BrowserSession._pages.keys())
            return SkillResult(
                success=False,
                error=f"标签页 '{page_id}' 不存在。可用标签页: {available}",
            )
        BrowserSession._active_page_id = page_id
        page = BrowserSession._pages[page_id]
        title = await page.title()
        current_url = page.url
        return SkillResult(
            success=True,
            data={"page_id": page_id, "title": title, "url": current_url},
            message=f"已切换到标签页 '{page_id}': {title}",
        )

    async def _list_tabs(self) -> SkillResult:
        """Describe every registered tab (title, URL, active flag)."""
        tabs_info = {}
        # Iterate over a snapshot: the loop awaits, and another task may add
        # or remove tabs meanwhile, which would break iteration of the live
        # dict.
        for pid, tab in list(BrowserSession._pages.items()):
            try:
                title = await tab.title()
                tabs_info[pid] = {
                    "title": title,
                    "url": tab.url,
                    "is_active": pid == BrowserSession._active_page_id,
                }
            except Exception:
                # Keep the entry visible when the tab cannot be queried.
                tabs_info[pid] = {"title": "(无法获取)", "url": "(无法获取)", "is_active": False}
        return SkillResult(
            success=True,
            data={"tabs": tabs_info, "active_tab": BrowserSession._active_page_id},
            message=f"共 {len(tabs_info)} 个标签页,当前活跃: {BrowserSession._active_page_id}",
        )
733
+
734
+
735
class BrowserCloseSkill(Skill):
    """
    Close a single tab or the whole browser.

    Both operations are delegated to ``BrowserSession``: ``close_page`` for
    a tab and ``close_browser`` for the browser with all of its tabs.
    """

    name = "browser_close"
    description = (
        "关闭当前浏览器标签页或整个浏览器。"
        "关闭标签页时,如果有其他标签页会自动切换。"
        "关闭浏览器会终止所有持久会话。"
    )
    category = "browser"
    parameters = [
        SkillParameter("target", "string", "关闭目标: 'tab' 关闭当前标签页, 'browser' 关闭整个浏览器", required=False, default="tab",
                       enum=["tab", "browser"]),
        SkillParameter("page_id", "string", "要关闭的标签页 ID(留空则关闭当前标签页,仅 target=tab 时有效)", required=False, default=""),
    ]

    async def execute(
        self,
        target: str = "tab",
        page_id: str = "",
        **kwargs,
    ) -> SkillResult:
        """Close the requested target.

        Args:
            target: ``"browser"`` closes everything; any other value is
                treated as a tab close.
            page_id: Tab to close; passed through to
                ``BrowserSession.close_page``.

        Returns:
            The ``SkillResult`` produced by ``BrowserSession``, or a failed
            result when closing raises.
        """
        try:
            if target != "browser":
                return await BrowserSession.close_page(page_id)
            return await BrowserSession.close_browser()
        except Exception as exc:
            logger.error(f"关闭浏览器失败: {exc}")
            return SkillResult(success=False, error=f"关闭失败: {exc}")