mooncat-browser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. package/README.md +213 -0
  2. package/browser-op/backend/browserd.cjs +1004 -0
  3. package/browser-op/backend/rpc-client.cjs +64 -0
  4. package/browser-op/backend/state.cjs +51 -0
  5. package/browser-op/cdp/capture-inject.js +426 -0
  6. package/browser-op/cdp/capture-inject.ts +426 -0
  7. package/browser-op/cdp/capture-service.cjs +172 -0
  8. package/browser-op/cdp/chrome-launcher.cjs +370 -0
  9. package/browser-op/cdp/chrome-path.cjs +57 -0
  10. package/browser-op/cdp/state.cjs +89 -0
  11. package/browser-op/extension/extension-detect.cjs +228 -0
  12. package/browser-op/extension/server.cjs +197 -0
  13. package/browser-op/extension/service.cjs +228 -0
  14. package/browser-op/extension/state.cjs +78 -0
  15. package/browser-op/index.cjs +389 -0
  16. package/browser-op/package.json +17 -0
  17. package/browser-op/py/behavior.py +138 -0
  18. package/browser-op/py/browser.py +340 -0
  19. package/browser-op/py/captcha.py +115 -0
  20. package/browser-op/py/crawler.py +125 -0
  21. package/browser-op/py/examples/01_open_and_probe.py +48 -0
  22. package/browser-op/py/examples/02_reuse_and_probe.py +66 -0
  23. package/browser-op/py/examples/03_interact.py +66 -0
  24. package/browser-op/py/find.py +150 -0
  25. package/browser-op/py/honeypot.py +73 -0
  26. package/browser-op/py/humanize.py +392 -0
  27. package/browser-op/py/image.py +186 -0
  28. package/browser-op/py/interact.py +193 -0
  29. package/browser-op/py/markdown.py +38 -0
  30. package/browser-op/py/pyproject.toml +32 -0
  31. package/browser-op/py/ready.py +208 -0
  32. package/browser-op/py/scroll.py +180 -0
  33. package/browser-op/py/upload.py +103 -0
  34. package/browser-op/py/visual_target.py +47 -0
  35. package/browser-op/py/visualize.py +91 -0
  36. package/browser-op/state.cjs +63 -0
  37. package/browser-op/web/behavior.js +153 -0
  38. package/browser-op/web/browser.js +231 -0
  39. package/browser-op/web/captcha.js +85 -0
  40. package/browser-op/web/crawler.js +109 -0
  41. package/browser-op/web/find.js +147 -0
  42. package/browser-op/web/honeypot.js +68 -0
  43. package/browser-op/web/humanize.js +522 -0
  44. package/browser-op/web/image.js +177 -0
  45. package/browser-op/web/interact.js +169 -0
  46. package/browser-op/web/markdown.js +80 -0
  47. package/browser-op/web/ready.js +295 -0
  48. package/browser-op/web/scroll.js +167 -0
  49. package/browser-op/web/upload.js +116 -0
  50. package/browser-op/web/visual-runtime.inject.cjs +6 -0
  51. package/browser-op/webplater/.env.example +7 -0
  52. package/browser-op/webplater/ARCHITECTURE.md +102 -0
  53. package/browser-op/webplater/dist/chrome-mv3/assets/popup-BUZEUmsx.css +1 -0
  54. package/browser-op/webplater/dist/chrome-mv3/background.js +2 -0
  55. package/browser-op/webplater/dist/chrome-mv3/capture.js +310 -0
  56. package/browser-op/webplater/dist/chrome-mv3/chunks/_virtual_wxt-html-plugins-DPbbfBKe.js +1 -0
  57. package/browser-op/webplater/dist/chrome-mv3/chunks/offscreen-CFXYw9Mo.js +1 -0
  58. package/browser-op/webplater/dist/chrome-mv3/chunks/popup-C-lpxZZO.js +1 -0
  59. package/browser-op/webplater/dist/chrome-mv3/content-scripts/content.js +7 -0
  60. package/browser-op/webplater/dist/chrome-mv3/manifest.json +1 -0
  61. package/browser-op/webplater/dist/chrome-mv3/offscreen.html +16 -0
  62. package/browser-op/webplater/dist/chrome-mv3/popup.html +31 -0
  63. package/browser-op/webplater/entrypoints/background.ts +938 -0
  64. package/browser-op/webplater/entrypoints/content.ts +1150 -0
  65. package/browser-op/webplater/entrypoints/offscreen/index.html +15 -0
  66. package/browser-op/webplater/entrypoints/offscreen/main.ts +161 -0
  67. package/browser-op/webplater/entrypoints/popup/index.html +29 -0
  68. package/browser-op/webplater/entrypoints/popup/main.ts +61 -0
  69. package/browser-op/webplater/entrypoints/popup/style.css +100 -0
  70. package/browser-op/webplater/lib/snapshot.ts +352 -0
  71. package/browser-op/webplater/package.json +29 -0
  72. package/browser-op/webplater/pnpm-lock.yaml +3411 -0
  73. package/browser-op/webplater/public/capture.js +310 -0
  74. package/browser-op/webplater/scripts/publish-extension.mjs +176 -0
  75. package/browser-op/webplater/tsconfig.json +19 -0
  76. package/browser-op/webplater/wxt.config.ts +34 -0
  77. package/dist/actions.md +102 -0
  78. package/dist/cli.d.ts +2 -0
  79. package/dist/cli.d.ts.map +1 -0
  80. package/dist/cli.js +278 -0
  81. package/dist/cli.js.map +1 -0
  82. package/dist/client.d.ts +94 -0
  83. package/dist/client.d.ts.map +1 -0
  84. package/dist/client.js +277 -0
  85. package/dist/client.js.map +1 -0
  86. package/dist/config.d.ts +61 -0
  87. package/dist/config.d.ts.map +1 -0
  88. package/dist/config.js +119 -0
  89. package/dist/config.js.map +1 -0
  90. package/dist/protocol.d.ts +195 -0
  91. package/dist/protocol.d.ts.map +1 -0
  92. package/dist/protocol.js +11 -0
  93. package/dist/protocol.js.map +1 -0
  94. package/dist/server.d.ts +66 -0
  95. package/dist/server.d.ts.map +1 -0
  96. package/dist/server.js +259 -0
  97. package/dist/server.js.map +1 -0
  98. package/package.json +78 -0
  99. package/schemas/browser.clearCookies.schema.json +13 -0
  100. package/schemas/browser.close.schema.json +9 -0
  101. package/schemas/browser.getCookies.schema.json +13 -0
  102. package/schemas/browser.getDownload.schema.json +15 -0
  103. package/schemas/browser.health.schema.json +9 -0
  104. package/schemas/browser.listDownloads.schema.json +16 -0
  105. package/schemas/browser.listTabs.schema.json +9 -0
  106. package/schemas/browser.newTab.schema.json +15 -0
  107. package/schemas/browser.open.schema.json +15 -0
  108. package/schemas/browser.operate.schema.json +15 -0
  109. package/schemas/browser.reuseTab.schema.json +15 -0
  110. package/schemas/browser.setCookies.schema.json +15 -0
  111. package/schemas/browser.waitFor.schema.json +15 -0
  112. package/schemas/browser.waitForDownload.schema.json +15 -0
  113. package/skills/browser/SKILL.md +110 -0
  114. package/skills/browser/references/collect.md +163 -0
  115. package/skills/browser/references/high-risk.md +161 -0
  116. package/skills/browser/references/operate-actions.md +92 -0
  117. package/skills/browser/references/probing.md +302 -0
@@ -0,0 +1,340 @@
1
+ """
2
+ browserd Python 客户端 — 复刻 lib/web/browser.js 的 facade 模式.
3
+
4
+ 连接 PM2 Runtime 管理的 browserd, 通过 HTTP JSON-RPC (127.0.0.1:17322/rpc) 通信.
5
+ 业务脚本退出不影响 Chrome (browserd 持有它, 常驻).
6
+
7
+ 用法:
8
+ from browser import Browser
9
+ b = Browser()
10
+ bh = b.open(route_mode='auto') # open the browser (browserd must already be running; it is NOT auto-started)
11
+ ph = b.find_or_new_tab('example.com') # 找/开 tab
12
+ b.operate(ph, action='click', selector='#btn')
13
+ data = b.operate(ph, action='evaluate', source='() => document.title')
14
+ b.close()
15
+
16
+ 设计原则 (对齐 browser.js):
17
+ - handle 是纯数据, 绝不判断路由 (browser.js 铁律 1)
18
+ - 会话长生命周期, open 一次跨多步复用 (铁律 2)
19
+ - 步进落盘, 不一次做完 (铁律 3)
20
+ - Python 侧零路由判断, 全透传给 browserd
21
+ """
22
+ from __future__ import annotations
23
+ import json
24
+ import os
25
+ import subprocess
26
+ import time
27
+ from pathlib import Path
28
+ from typing import Any
29
+
30
+ import httpx
31
+
32
+ # ─── 常量 (对齐 rpc-client.cjs) ───
33
+ IPC_HOST = "127.0.0.1"
34
+ IPC_PORT = 17322
35
+ RPC_URL = f"http://{IPC_HOST}:{IPC_PORT}/rpc"
36
+ HEALTH_URL = f"http://{IPC_HOST}:{IPC_PORT}/health"
37
+
38
+ # browserd.cjs 路径: 本文件在 lib/py/, browserd 在 lib/backend/
39
+ def _resolve_browserd_path() -> Path:
40
+ here = Path(__file__).resolve()
41
+ candidates = [
42
+ # Source and production runtime layout:
43
+ # libs/browser/py/browser.py -> libs/browser/backend/browserd.cjs
44
+ here.parent.parent / "backend" / "browserd.cjs",
45
+ # Distributed skillpack layout:
46
+ # browser/lib/py/browser.py -> browser/lib/backend/browserd.cjs
47
+ here.parent.parent.parent / "lib" / "backend" / "browserd.cjs",
48
+ ]
49
+
50
+ for candidate in candidates:
51
+ if candidate.exists():
52
+ return candidate
53
+
54
+ return candidates[0]
55
+
56
+
57
+ BROWSERD_PATH = _resolve_browserd_path()
58
+
59
+ # JSON-RPC 请求 id (Python 客户端单线程顺序递增即可)
60
+ _rpc_seq = 0
61
+
62
+
63
class BrowserdError(RuntimeError):
    """Raised when a browserd RPC call or availability check fails."""
65
+
66
+
67
def _next_id() -> int:
    """Advance the module-level JSON-RPC request counter and return its new value."""
    global _rpc_seq
    _rpc_seq = _rpc_seq + 1
    return _rpc_seq
71
+
72
+
73
def _health(timeout_ms: int = 2000) -> dict | None:
    """Probe the browserd health endpoint.

    Args:
        timeout_ms: Request timeout in milliseconds.

    Returns:
        The decoded JSON body when the endpoint answers HTTP 200, otherwise
        ``None`` (non-200 status, transport failure, or a body that is not
        valid JSON).
    """
    try:
        response = httpx.get(HEALTH_URL, timeout=timeout_ms / 1000)
        if response.status_code != 200:
            return None
        return response.json()
    except (httpx.HTTPError, OSError, ValueError):
        # ValueError covers json.JSONDecodeError: something other than
        # browserd may be listening on the port and answering 200 with a
        # non-JSON body; that must count as "not alive", not crash the probe.
        return None
82
+
83
+
84
def _rpc(method: str, params: dict | None = None, timeout_ms: int = 60000) -> Any:
    """Send one JSON-RPC request to browserd and return its ``result``.

    Args:
        method: RPC method name.
        params: Method parameters; ``None`` is sent as an empty object.
        timeout_ms: HTTP timeout in milliseconds.

    Returns:
        The ``result`` field of the response envelope.

    Raises:
        BrowserdError: On transport failure, an undecodable or non-object
            response body, or a response whose ``ok`` field is falsy.
    """
    payload = {"id": _next_id(), "method": method, "params": params or {}}
    try:
        response = httpx.post(RPC_URL, json=payload, timeout=timeout_ms / 1000)
        data = response.json()
    except (httpx.HTTPError, ValueError) as e:
        # ValueError is the base of json.JSONDecodeError and also covers
        # body decoding errors.
        raise BrowserdError(f"rpc 通信失败 ({method}): {e}") from e

    # A syntactically valid but non-object body (list, null, ...) would
    # otherwise surface as AttributeError on .get().
    if not isinstance(data, dict):
        raise BrowserdError(f"{method}: unexpected rpc response: {data!r}")
    if not data.get("ok"):
        raise BrowserdError(f"{method}: {data.get('error', 'rpc failed')}")
    return data.get("result")
96
+
97
+
98
def _ensure_browserd(timeout_ms: int = 15000) -> None:
    """Verify that browserd is reachable; never start it.

    The daemon is not launched from here: when the health probe fails, an
    error is raised telling the user to start it with
    ``mooncat services start``.

    Args:
        timeout_ms: Unused by the current implementation (the probe uses a
            fixed 2 s timeout); kept so existing callers keep working.

    Raises:
        BrowserdError: When the health probe gets no answer.
    """
    # Compare against None rather than truthiness: a live daemon answering
    # with an empty JSON object must still count as running.
    if _health(2000) is not None:
        return

    raise BrowserdError(
        "browserd 未运行. 它现在由 PM2 Runtime 管理, 不再自动启动.\n"
        "请先运行: mooncat services start\n"
        "或单独起: mooncat services restart browserd"
    )
111
+
112
+
113
+ def _find_node() -> str | None:
114
+ """找 node 可执行文件."""
115
+ for name in ("node", "node.exe"):
116
+ for p in os.environ.get("PATH", "").split(os.pathsep):
117
+ f = Path(p) / name
118
+ if f.exists():
119
+ return str(f)
120
+ return None
121
+
122
+
123
class Browser:
    """Client facade for browserd, mirroring the functions exported by browser.js.

    Typical lifecycle::

        b = Browser()
        bh = b.open()                        # -> browserHandle
        ph = b.find_or_new_tab('url')        # -> pageHandle
        b.operate(ph, action='click', ...)   # act on the page
        b.close()                            # only when the work is finished

    ``open()`` does NOT start browserd; it only checks that the daemon is
    reachable and raises ``BrowserdError`` otherwise. Handles are plain data
    and are passed through to browserd untouched.
    """

    def __init__(self) -> None:
        # Set by a successful open() on *this* instance and cleared by
        # close(); it is local bookkeeping, not a query of browserd's state.
        self._browser_open = False

    def _require_open(self, message: str) -> None:
        """Raise ``BrowserdError(message)`` unless open() has succeeded."""
        if not self._browser_open:
            raise BrowserdError(message)

    # --- top-level RPCs (aligned with browser.js) ---

    def open(
        self,
        route_mode: str = "auto",
        headless: bool = False,
        executable_path: str | None = None,
        profile_root: str | None = None,
        user_data_dir: str | None = None,
    ) -> dict:
        """Open the browser through browserd and return the browserHandle.

        Args:
            route_mode: 'auto' | 'cdp' | 'plugin' (the original notes
                recommend plugin/extension for high-risk sites).
            headless: Forwarded as ``headless``.
            executable_path: Forwarded as ``executablePath`` when truthy.
            profile_root: Forwarded as ``profileRoot`` when truthy.
            user_data_dir: Forwarded as ``userDataDir`` when truthy.

        Raises:
            BrowserdError: When browserd is not running or the RPC fails.
        """
        _ensure_browserd()
        params: dict[str, Any] = {"routeMode": route_mode, "headless": headless}
        optional = {
            "executablePath": executable_path,
            "profileRoot": profile_root,
            "userDataDir": user_data_dir,
        }
        params.update({key: value for key, value in optional.items() if value})
        # Launching a browser is slow, so this call gets a 2-minute timeout.
        handle = _rpc("open", params, timeout_ms=120000)
        self._browser_open = True
        return handle

    def new_tab(self, url: str = "") -> dict:
        """Create a tab and return its pageHandle (plain data)."""
        self._require_open("browser not open; call open() first")
        return _rpc("newTab", {"url": url})

    def list_tabs(self) -> list[dict]:
        """List all tabs; per the original notes each entry carries pageHandle/url/title/active."""
        self._require_open("browser not open; call open() first")
        return _rpc("listTabs", {})

    def find_or_new_tab(
        self,
        url: str,
        match: str = "includes",
        new_if_missing: bool = True,
    ) -> dict | None:
        """Find an existing tab by URL, optionally opening one when absent.

        The lookup is implemented client-side on top of ``list_tabs``.

        Args:
            url: URL (or URL fragment) to look for.
            match: 'exact' for equality; any other value (default
                'includes') does a substring test.
            new_if_missing: When no tab matches, open a new one (True) or
                return ``None`` (False).

        Returns:
            The pageHandle of the first matching tab that has one, a new
            tab's pageHandle, or ``None``.
        """
        exact = match == "exact"
        for tab in self.list_tabs():
            # `or ""` also covers tabs reporting url as null, which would
            # otherwise raise TypeError in the substring test.
            tab_url = tab.get("url") or ""
            matched = (tab_url == url) if exact else (url in tab_url)
            if matched and tab.get("pageHandle"):
                return tab["pageHandle"]
        if not new_if_missing:
            return None
        return self.new_tab(url)

    def operate(
        self,
        page_handle: dict,
        action: str,
        timeout_ms: int = 60000,
        **params: Any,
    ) -> Any:
        """Run one action on a page.

        Args:
            page_handle: pageHandle dict; must contain a truthy ``pageId``.
            action: Action name (goto/click/fill/type/press/snapshot/
                evaluate/...; the full set is defined by browserd).
            timeout_ms: HTTP timeout for this RPC, in milliseconds.
            **params: Action parameters, forwarded verbatim
                (selector/value/source/x/y/...).

        Raises:
            BrowserdError: When the browser is not open, the handle is
                invalid, or the RPC fails.
        """
        self._require_open("browser not open")
        if not page_handle or not page_handle.get("pageId"):
            raise BrowserdError("invalid pageHandle: missing pageId")
        request = {"pageHandle": page_handle, "action": action, "params": params}
        return _rpc("operate", request, timeout_ms=timeout_ms)

    def close(self) -> dict:
        """Send the ``close`` RPC; call only when the work is finished.

        Returns a local "already closed" note, without any RPC, when this
        instance never opened the browser. The original notes state that
        the RPC closes both the browser and the browserd daemon; that is
        server-side behavior not visible from this client.
        """
        if not self._browser_open:
            return {"ok": True, "note": "already closed"}
        result = _rpc("close", {})
        self._browser_open = False
        return result

    # --- cookies (keyed by browserHandle) ---

    def get_cookies(self, browser_handle: dict | None = None) -> list[dict]:
        """Fetch cookies via the ``getCookies`` RPC."""
        return _rpc("getCookies", {"browserHandle": browser_handle or {}})

    def set_cookies(self, cookies: list[dict], browser_handle: dict | None = None) -> dict:
        """Store ``cookies`` via the ``setCookies`` RPC."""
        return _rpc("setCookies", {"cookies": cookies, "browserHandle": browser_handle or {}})

    def clear_cookies(self, browser_handle: dict | None = None) -> dict:
        """Clear cookies via the ``clearCookies`` RPC."""
        return _rpc("clearCookies", {"browserHandle": browser_handle or {}})

    # --- convenience wrappers (sugar for common actions) ---

    def goto(self, page_handle: dict, url: str) -> Any:
        """Navigate the page to ``url``."""
        return self.operate(page_handle, "goto", url=url)

    def click(self, page_handle: dict, selector: str, x: float | None = None, y: float | None = None) -> Any:
        """Click ``selector``; ``x``/``y`` are forwarded only when given."""
        args: dict[str, Any] = {"selector": selector}
        if x is not None:
            args["x"] = x
        if y is not None:
            args["y"] = y
        return self.operate(page_handle, "click", **args)

    def fill(self, page_handle: dict, selector: str, value: str) -> Any:
        """Fill ``selector`` with ``value``."""
        return self.operate(page_handle, "fill", selector=selector, value=value)

    def press(self, page_handle: dict, selector: str, key: str) -> Any:
        """Press ``key`` on ``selector``."""
        return self.operate(page_handle, "press", selector=selector, key=key)

    def status(self, page_handle: dict) -> dict:
        """Run the ``status`` action on the page."""
        return self.operate(page_handle, "status")  # type: ignore[return-value]

    def snapshot(self, page_handle: dict, **opts: Any) -> Any:
        """Return the aria tree (for DOM probing)."""
        return self.operate(page_handle, "snapshot", **opts)

    def evaluate(self, page_handle: dict, source: str, args: list | None = None) -> Any:
        """Run page JavaScript.

        ``source`` must be a function expression such as
        ``'() => document.title'``; ``args`` is forwarded only when given.
        """
        call: dict[str, Any] = {"source": source}
        if args is not None:
            call["args"] = args
        return self.operate(page_handle, "evaluate", **call)

    def screenshot(self, page_handle: dict) -> Any:
        """Run the ``screenshot`` action on the page."""
        return self.operate(page_handle, "screenshot")

    def inner_text(self, page_handle: dict, selector: str) -> str:
        """Return the inner text of ``selector``."""
        return self.operate(page_handle, "innerText", selector=selector)  # type: ignore[return-value]

    def wait_for_selector(self, page_handle: dict, selector: str, timeout: int | None = None) -> Any:
        """Wait for ``selector``; ``timeout`` is forwarded only when given."""
        args: dict[str, Any] = {"selector": selector}
        if timeout is not None:
            args["timeout"] = timeout
        return self.operate(page_handle, "waitForSelector", **args)
283
+
284
+
285
+ # ─── 模块级接口 (对齐 JS 的 operate/listTabs 等, 供 find/scroll/ready/humanize 等子模块用) ───
286
+ # 用法: from browser import operate, list_tabs, new_tab, open_browser, find_or_new_tab, close_browser
287
+
288
+ _default_browser: Browser | None = None
289
+
290
+
291
def _get_browser() -> Browser:
    """Return the shared module-level Browser, creating it on first use."""
    global _default_browser
    if _default_browser is not None:
        return _default_browser
    _default_browser = Browser()
    return _default_browser
296
+
297
+
298
def open_browser(**kwargs) -> dict:
    """Module-level counterpart of the JS ``open``: open via the shared Browser."""
    shared = _get_browser()
    return shared.open(**kwargs)
301
+
302
+
303
def new_tab(url: str = "") -> dict:
    """Module-level counterpart of the JS ``newTab``: create a tab via the shared Browser."""
    shared = _get_browser()
    return shared.new_tab(url)
306
+
307
+
308
def list_tabs(browser_handle: dict | None = None) -> list[dict]:
    """Module-level counterpart of the JS ``listTabs``.

    ``browser_handle`` is accepted but not used; the shared Browser's
    ``list_tabs`` is called without arguments.
    """
    shared = _get_browser()
    return shared.list_tabs()
311
+
312
+
313
def find_or_new_tab(url: str, match: str = "includes", new_if_missing: bool = True) -> dict | None:
    """Module-level counterpart of the JS ``findOrNewTab``, delegating to the shared Browser."""
    shared = _get_browser()
    return shared.find_or_new_tab(url, match, new_if_missing)
316
+
317
+
318
def operate(page_handle: dict, params: dict) -> Any:
    """Module-level operate, shaped like the JS ``operate(pageHandle, actionObj)``.

    ``params`` must contain an ``'action'`` key; every other key is an
    action parameter::

        operate(ph, {"action": "click", "selector": "#btn"})
        operate(ph, {"action": "evaluate", "source": "() => 1", "_skipVisualize": True})

    The keys ``_skipVisualize``, ``_skipHumanize`` and ``label`` are dropped
    here and never forwarded (the original notes say only the JS side uses
    them).

    Raises:
        ValueError: When ``params`` has no truthy ``'action'``.
    """
    local_only = ("action", "_skipVisualize", "_skipHumanize", "label")
    shared = _get_browser()
    action = params.get("action")
    if not action:
        raise ValueError("operate: params 缺 action")
    # Forward everything except the local-control keys.
    forwarded = {}
    for key, value in params.items():
        if key in local_only:
            continue
        forwarded[key] = value
    return shared.operate(page_handle, action, **forwarded)
336
+
337
+
338
def close_browser() -> dict:
    """Module-level counterpart of the JS ``close``: close via the shared Browser."""
    shared = _get_browser()
    return shared.close()
@@ -0,0 +1,115 @@
1
+ """web/captcha — 验证码检测(多信号交叉验证,路由无关)
2
+
3
+ 忠实复刻 lib/web/captcha.js。
4
+ 信号:URL 关键词 / DOM selector / 页面标题关键词 / 预期内容缺失。
5
+ 基于 operate({action:'status'}) + operate({action:'evaluate'})。
6
+
7
+ 依赖:browser(operate)
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ import sys
13
+ from typing import Any
14
+
15
+ sys.path.insert(0, __file__.rsplit("\\", 1)[0] if "\\" in __file__ else __file__.rsplit("/", 1)[0])
16
+ from browser import operate # noqa: E402
17
+
18
+ URL_KEYWORDS = ['captcha', 'verify', 'geetest', 'dingxiang', 'challenge', 'slider', 'recaptcha', 'hcaptcha']
19
+ TITLE_KEYWORDS = ['验证', 'captcha', 'security', 'verify', '人机', '安全验证']
20
+ DOM_SELECTORS = [
21
+ '.geetest_', '#geetest', '.yidun', '.dx_captcha', '#captcha',
22
+ 'iframe[src*="captcha"]', 'iframe[src*="recaptcha"]', 'iframe[src*="hcaptcha"]',
23
+ '.g-recaptcha', '.h-captcha',
24
+ ]
25
+
26
+
27
+ # 探测页面是否含特定 captcha DOM
28
+ def _dom_probe_source(selectors: list) -> str:
29
+ return ('() => {\n'
30
+ ' const sels = ' + json.dumps(selectors, separators=(',', ':')) + '\n'
31
+ ' const found = []\n'
32
+ ' for (const s of sels) { if (document.querySelector(s)) found.push(s) }\n'
33
+ ' return found\n'
34
+ '}')
35
+
36
+
37
def detect(page_handle: dict, options: dict | None = None) -> dict:
    """Detect a captcha page by cross-checking several signals.

    Signals:
        1. ``url-keyword``: the page URL contains one of ``URL_KEYWORDS``.
        2. ``title-keyword``: the title contains one of ``TITLE_KEYWORDS``.
        3. ``dom-selector``: one of ``DOM_SELECTORS`` matches in the page.
        4. ``expected-missing`` (weak): ``options['expectedSelector']`` is
           given but matches nothing.

    Args:
        page_handle: pageHandle of the page to inspect.
        options: Optional dict; ``expectedSelector`` is a selector that a
            normal page should contain.

    Returns:
        ``{isCaptcha, confidence, signals, evidence}``. Confidence is 0.95
        with two or more strong signals, 0.7 with one, 0.3 with only the
        weak signal, else 0; ``isCaptcha`` is true at >= 0.7.

    Raises:
        ValueError: When ``page_handle`` is falsy.
    """
    if options is None:
        options = {}
    if not page_handle:
        raise ValueError('detect: pageHandle required')

    signals: list[str] = []
    evidence: list[str] = []

    status = operate(page_handle, {'action': 'status'})
    if not isinstance(status, dict):
        # A non-dict status carries no url/title to inspect.
        status = {}

    # Signal 1: URL keyword.
    url = status.get('url') or status.get('value') or ''
    url_lower = url.lower()
    url_hit = next((k for k in URL_KEYWORDS if k in url_lower), None)
    if url_hit:
        signals.append('url-keyword')
        evidence.append('url contains "' + url_hit + '"')

    # Signal 2: title keyword.
    title = status.get('title') or ''
    title_lower = title.lower()
    title_hit = next((k for k in TITLE_KEYWORDS if k.lower() in title_lower), None)
    if title_hit:
        signals.append('title-keyword')
        evidence.append('title contains "' + title_hit + '"')

    # Signal 3: captcha DOM. The result may come bare or wrapped in {'value': ...}.
    found = operate(page_handle, {'action': 'evaluate', 'source': _dom_probe_source(DOM_SELECTORS)})
    if isinstance(found, list):
        found_arr = found
    elif isinstance(found, dict) and isinstance(found.get('value'), list):
        found_arr = found['value']
    else:
        found_arr = []
    if found_arr:
        signals.append('dom-selector')
        evidence.append('selectors: ' + ', '.join(str(s) for s in found_arr))

    # Signal 4: expected content missing (weak signal).
    expected = options.get('expectedSelector')
    if expected:
        # Embed the selector in the source as a JSON string literal. The
        # previous source was '(s) => ...' with no argument supplied, so the
        # probe never looked for the requested selector.
        probe = '() => !!document.querySelector(' + json.dumps(expected) + ')'
        exp = operate(page_handle, {'action': 'evaluate', 'source': probe})
        if isinstance(exp, bool):
            present = exp
        elif isinstance(exp, dict) and isinstance(exp.get('value'), bool):
            present = exp['value']
        elif isinstance(exp, dict) and isinstance(exp.get('result'), bool):
            present = exp['result']
        else:
            present = bool(exp)
        if not present:
            signals.append('expected-missing')
            evidence.append('expected "' + str(expected) + '" not found')

    # Confidence.
    confidence = 0
    strong = sum(1 for s in signals if s != 'expected-missing')
    if strong >= 2:
        confidence = 0.95
    elif strong == 1:
        confidence = 0.7
    elif 'expected-missing' in signals:
        confidence = 0.3

    return {'isCaptcha': confidence >= 0.7, 'confidence': confidence, 'signals': signals, 'evidence': evidence}
@@ -0,0 +1,125 @@
1
+ """web/crawler — 轻量爬虫 (URL 去重 + 深度控制 + 节流, 路由无关).
2
+
3
+ 忠实复刻 lib/web/crawler.js。
4
+ 基于 operate (goto + evaluate 提取链接/内容) 组合, 复用同一个 pageHandle (顺序导航)。
5
+
6
+ 依赖: browser (operate)
7
+ """
8
+ from __future__ import annotations
9
+ import json
10
+ import time
11
+ from typing import Any, Callable
12
+ from urllib.parse import urlparse
13
+
14
+ import sys
15
+ sys.path.insert(0, __file__.rsplit("\\", 1)[0] if "\\" in __file__ else __file__.rsplit("/", 1)[0])
16
+ from browser import operate # noqa: E402
17
+
18
+
19
+ def _extract_source(base_domain: str) -> str:
20
+ return f"""() => {{
21
+ const title = document.title || "";
22
+ const text = (document.body && document.body.innerText) ? document.body.innerText.slice(0, 8000) : "";
23
+ const links = [];
24
+ const a = document.querySelectorAll("a[href]");
25
+ for (const el of a) {{
26
+ try {{
27
+ const u = new URL(el.href, location.href);
28
+ if (u.protocol !== "http:" && u.protocol !== "https:") continue;
29
+ links.push(u.href);
30
+ }} catch {{}}
31
+ }}
32
+ const base = {json.dumps(base_domain or "")};
33
+ const filtered = base ? links.filter((l) => {{ try {{ return new URL(l).hostname === base }} catch {{ return false }} }}) : links;
34
+ return {{ title, text, links: [...new Set(filtered)].slice(0, 200) }};
35
+ }}()"""
36
+
37
+
38
def create_crawler(page_handle: dict, options: dict | None = None) -> dict:
    """Create a crawler bound to one page handle.

    Pages are visited breadth-first by navigating the same ``page_handle``
    sequentially.

    Args:
        page_handle: pageHandle used for every navigation.
        options: Optional settings:
            maxPages: maximum number of result rows (default 50)
            maxDepth: maximum link depth (default 3)
            delay: pause between pages, in seconds (default 0.5)
            retries: retries after a failed fetch (default 2)
            sameDomain: restrict to the start URL's hostname (default True)

    Returns:
        ``{"crawl": crawl, "results": results}``. ``crawl(start_url)``
        appends to and returns the shared ``results`` list; each row is
        ``{url, title, html, depth}`` (``html`` holds the extracted text),
        plus ``error`` on failure.

    Raises:
        ValueError: When ``page_handle`` is falsy.
    """
    # Local import: deque is not among this module's top-level imports.
    from collections import deque

    if not page_handle:
        raise ValueError("create_crawler: page_handle required")
    opts = options or {}
    max_pages = opts.get("maxPages", 50)
    max_depth = opts.get("maxDepth", 3)
    delay_sec = opts.get("delay", 0.5)
    retries = opts.get("retries", 2)
    same_domain = opts.get("sameDomain", True)

    results: list[dict] = []

    def hostname_of(url: str) -> str:
        """Return the URL's hostname, or "" when it has none or cannot be parsed."""
        try:
            return urlparse(url).hostname or ""
        except (ValueError, TypeError, AttributeError):
            return ""

    def fetch_page(url: str, depth: int) -> dict:
        """Navigate to ``url`` and extract ``{title, text, links}``, with retries."""
        base_domain = hostname_of(url) if same_domain else ""
        last_err = None
        for attempt in range(retries + 1):
            try:
                operate(page_handle, {"action": "goto", "url": url, "timeout": 15000})
                operate(page_handle, {"action": "waitForLoadState", "state": "complete"})
                data = operate(page_handle, {"action": "evaluate", "source": _extract_source(base_domain)})
            except Exception as e:  # any RPC/navigation failure is retryable
                last_err = e
                # Back off only when another attempt follows.
                if attempt < retries:
                    time.sleep(0.5)
                continue

            # The result may come bare or wrapped in {"value": ...}.
            payload = {}
            if isinstance(data, dict):
                if "title" in data or "links" in data:
                    payload = data
                elif isinstance(data.get("value"), dict):
                    payload = data["value"]
            links = payload.get("links", [])
            if not isinstance(links, list):
                # A malformed link list must not turn a successful fetch
                # into a failure further down.
                links = []
            return {"title": payload.get("title", ""), "text": payload.get("text", ""), "links": links}
        raise last_err or RuntimeError("fetchPage failed")

    def crawl(start_url: str) -> list[dict]:
        """Crawl breadth-first from ``start_url`` and return the shared results list."""
        queue = deque([(start_url, 0)])
        visited = set()
        start_domain = hostname_of(start_url)

        while queue and len(results) < max_pages:
            url, depth = queue.popleft()
            if url in visited:
                continue
            visited.add(url)
            if depth > max_depth:
                continue

            try:
                page = fetch_page(url, depth)
            except Exception:
                # Best effort: one bad page is recorded, not fatal to the crawl.
                results.append({"url": url, "title": "", "html": "", "depth": depth, "error": "fetch failed"})
            else:
                results.append({"url": url, "title": page["title"], "html": page["text"], "depth": depth})
                # Children sit at depth + 1; skip enqueueing ones that would
                # be discarded by the depth check anyway.
                if depth < max_depth:
                    for link in page["links"]:
                        if link in visited:
                            continue
                        if same_domain and start_domain and hostname_of(link) != start_domain:
                            continue
                        queue.append((link, depth + 1))

            if delay_sec > 0:
                time.sleep(delay_sec)

        return results

    return {"crawl": crawl, "results": results}
@@ -0,0 +1,48 @@
1
+ """
2
+ 样例 01: 最小打开 + 探查 (对齐 e2e_steps/01_cdp_open.js)
3
+
4
+ 演示 Python 客户端最基本的: open → new_tab → status → evaluate → close
5
+ 适合第一次接触 Python 客户端的人跑通。
6
+
7
+ 跑: python lib/py/examples/01_open_and_probe.py
8
+ """
9
+ import sys
10
+ import os
11
+ import time
12
+
13
+ # 让 examples 能 import 上级目录的 browser 模块
14
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
15
+
16
+ from browser import Browser # noqa: E402
17
+
18
+
19
def main() -> None:
    """Run the minimal demo: open -> new_tab -> status -> evaluate -> close."""
    client = Browser()

    # 1. Open the browser (route mode "auto"). browserd must already be
    #    running; open() raises otherwise.
    print("[01] open browser...")
    browser_handle = client.open(route_mode="auto", headless=False)
    print("[01] mode:", browser_handle.get("mode"))

    # 2. Open a new tab on example.com.
    print("[01] new tab → example.com")
    page = client.new_tab("https://example.com")
    # Fixed wait for the page load; demo only. Real code should use
    # wait_for_selector.
    time.sleep(3)

    # 3. Probe: status.
    page_status = client.status(page)
    print("[01] status:", page_status)

    # 4. Probe: evaluate page JS to read the title and the first <h1>.
    title = client.evaluate(page, source="() => document.title")
    h1 = client.evaluate(page, source='() => (document.querySelector("h1")||{}).innerText || ""')
    print(f"[01] title={title!r} h1={h1!r}")

    # 5. Close once the demo is over; a long-lived agent would keep the
    #    session and reuse it.
    print("[01] close")
    client.close()
    print("[01] DONE")
45
+
46
+
47
+ if __name__ == "__main__":
48
+ main()
@@ -0,0 +1,66 @@
1
+ """
2
+ 样例 02: 复用已有 tab + DOM 探查 (对齐 findOrNewTab + snapshot/evaluate)
3
+
4
+ 演示 Python 客户端的核心探查能力:
5
+ - find_or_new_tab (只找不开, 不重复打开已有 tab)
6
+ - snapshot (aria 树, DOM 结构探查)
7
+ - evaluate (跑页面 JS 取数据)
8
+
9
+ 这是 agent/采集场景最常用的模式: 先看页面长什么样, 再决定怎么操作。
10
+
11
+ 跑: python lib/py/examples/02_reuse_and_probe.py
12
+ """
13
+ import sys
14
+ import os
15
+ import time
16
+
17
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
18
+
19
+ from browser import Browser # noqa: E402
20
+
21
+
22
def main() -> None:
    """Demo: reuse an existing tab, then probe it with snapshot and evaluate."""
    client = Browser()
    print("[02] open...")
    client.open(route_mode="auto", headless=False)

    # 1. find_or_new_tab: reuse an existing example.com tab, open one
    #    otherwise. The full URL including the scheme is passed so that the
    #    substring match does not hit an extension proxy tab, whose URL
    #    looks like chrome-extension://xxx/example.com.
    page = client.find_or_new_tab("https://example.com", match="includes")
    print("[02] pageHandle:", page)

    # Switch to that tab and wait for it to finish loading.
    client.operate(page, action="activate")
    client.operate(page, action="waitForLoadState", state="complete")
    time.sleep(2)

    # 2. snapshot: aria tree, used to inspect the DOM structure.
    snap = client.snapshot(page)
    if not (snap and isinstance(snap, dict)):
        print("[02] snapshot:", snap)
    else:
        yaml = snap.get("yaml", "")
        print(f"[02] snapshot aria 树长度: {len(yaml)} 字符")
        print("[02] aria 树前 300 字:")
        print(yaml[:300])

    # 3. evaluate: pull structured data out of the page.
    first_links_js = """() => {
        return [...document.querySelectorAll('a')].map(a => ({
            text: (a.innerText || '').trim(),
            href: a.href
        })).slice(0, 5)
    }"""
    links = client.evaluate(page, source=first_links_js)
    print(f"[02] 页面链接 (前5): {links}")

    # The browser is deliberately left open for later reuse.
    print("[02] 浏览器保持打开 (长生命周期复用)")
64
+
65
+ if __name__ == "__main__":
66
+ main()