mooncat-browser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +213 -0
- package/browser-op/backend/browserd.cjs +1004 -0
- package/browser-op/backend/rpc-client.cjs +64 -0
- package/browser-op/backend/state.cjs +51 -0
- package/browser-op/cdp/capture-inject.js +426 -0
- package/browser-op/cdp/capture-inject.ts +426 -0
- package/browser-op/cdp/capture-service.cjs +172 -0
- package/browser-op/cdp/chrome-launcher.cjs +370 -0
- package/browser-op/cdp/chrome-path.cjs +57 -0
- package/browser-op/cdp/state.cjs +89 -0
- package/browser-op/extension/extension-detect.cjs +228 -0
- package/browser-op/extension/server.cjs +197 -0
- package/browser-op/extension/service.cjs +228 -0
- package/browser-op/extension/state.cjs +78 -0
- package/browser-op/index.cjs +389 -0
- package/browser-op/package.json +17 -0
- package/browser-op/py/behavior.py +138 -0
- package/browser-op/py/browser.py +340 -0
- package/browser-op/py/captcha.py +115 -0
- package/browser-op/py/crawler.py +125 -0
- package/browser-op/py/examples/01_open_and_probe.py +48 -0
- package/browser-op/py/examples/02_reuse_and_probe.py +66 -0
- package/browser-op/py/examples/03_interact.py +66 -0
- package/browser-op/py/find.py +150 -0
- package/browser-op/py/honeypot.py +73 -0
- package/browser-op/py/humanize.py +392 -0
- package/browser-op/py/image.py +186 -0
- package/browser-op/py/interact.py +193 -0
- package/browser-op/py/markdown.py +38 -0
- package/browser-op/py/pyproject.toml +32 -0
- package/browser-op/py/ready.py +208 -0
- package/browser-op/py/scroll.py +180 -0
- package/browser-op/py/upload.py +103 -0
- package/browser-op/py/visual_target.py +47 -0
- package/browser-op/py/visualize.py +91 -0
- package/browser-op/state.cjs +63 -0
- package/browser-op/web/behavior.js +153 -0
- package/browser-op/web/browser.js +231 -0
- package/browser-op/web/captcha.js +85 -0
- package/browser-op/web/crawler.js +109 -0
- package/browser-op/web/find.js +147 -0
- package/browser-op/web/honeypot.js +68 -0
- package/browser-op/web/humanize.js +522 -0
- package/browser-op/web/image.js +177 -0
- package/browser-op/web/interact.js +169 -0
- package/browser-op/web/markdown.js +80 -0
- package/browser-op/web/ready.js +295 -0
- package/browser-op/web/scroll.js +167 -0
- package/browser-op/web/upload.js +116 -0
- package/browser-op/web/visual-runtime.inject.cjs +6 -0
- package/browser-op/webplater/.env.example +7 -0
- package/browser-op/webplater/ARCHITECTURE.md +102 -0
- package/browser-op/webplater/dist/chrome-mv3/assets/popup-BUZEUmsx.css +1 -0
- package/browser-op/webplater/dist/chrome-mv3/background.js +2 -0
- package/browser-op/webplater/dist/chrome-mv3/capture.js +310 -0
- package/browser-op/webplater/dist/chrome-mv3/chunks/_virtual_wxt-html-plugins-DPbbfBKe.js +1 -0
- package/browser-op/webplater/dist/chrome-mv3/chunks/offscreen-CFXYw9Mo.js +1 -0
- package/browser-op/webplater/dist/chrome-mv3/chunks/popup-C-lpxZZO.js +1 -0
- package/browser-op/webplater/dist/chrome-mv3/content-scripts/content.js +7 -0
- package/browser-op/webplater/dist/chrome-mv3/manifest.json +1 -0
- package/browser-op/webplater/dist/chrome-mv3/offscreen.html +16 -0
- package/browser-op/webplater/dist/chrome-mv3/popup.html +31 -0
- package/browser-op/webplater/entrypoints/background.ts +938 -0
- package/browser-op/webplater/entrypoints/content.ts +1150 -0
- package/browser-op/webplater/entrypoints/offscreen/index.html +15 -0
- package/browser-op/webplater/entrypoints/offscreen/main.ts +161 -0
- package/browser-op/webplater/entrypoints/popup/index.html +29 -0
- package/browser-op/webplater/entrypoints/popup/main.ts +61 -0
- package/browser-op/webplater/entrypoints/popup/style.css +100 -0
- package/browser-op/webplater/lib/snapshot.ts +352 -0
- package/browser-op/webplater/package.json +29 -0
- package/browser-op/webplater/pnpm-lock.yaml +3411 -0
- package/browser-op/webplater/public/capture.js +310 -0
- package/browser-op/webplater/scripts/publish-extension.mjs +176 -0
- package/browser-op/webplater/tsconfig.json +19 -0
- package/browser-op/webplater/wxt.config.ts +34 -0
- package/dist/actions.md +102 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +278 -0
- package/dist/cli.js.map +1 -0
- package/dist/client.d.ts +94 -0
- package/dist/client.d.ts.map +1 -0
- package/dist/client.js +277 -0
- package/dist/client.js.map +1 -0
- package/dist/config.d.ts +61 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +119 -0
- package/dist/config.js.map +1 -0
- package/dist/protocol.d.ts +195 -0
- package/dist/protocol.d.ts.map +1 -0
- package/dist/protocol.js +11 -0
- package/dist/protocol.js.map +1 -0
- package/dist/server.d.ts +66 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +259 -0
- package/dist/server.js.map +1 -0
- package/package.json +78 -0
- package/schemas/browser.clearCookies.schema.json +13 -0
- package/schemas/browser.close.schema.json +9 -0
- package/schemas/browser.getCookies.schema.json +13 -0
- package/schemas/browser.getDownload.schema.json +15 -0
- package/schemas/browser.health.schema.json +9 -0
- package/schemas/browser.listDownloads.schema.json +16 -0
- package/schemas/browser.listTabs.schema.json +9 -0
- package/schemas/browser.newTab.schema.json +15 -0
- package/schemas/browser.open.schema.json +15 -0
- package/schemas/browser.operate.schema.json +15 -0
- package/schemas/browser.reuseTab.schema.json +15 -0
- package/schemas/browser.setCookies.schema.json +15 -0
- package/schemas/browser.waitFor.schema.json +15 -0
- package/schemas/browser.waitForDownload.schema.json +15 -0
- package/skills/browser/SKILL.md +110 -0
- package/skills/browser/references/collect.md +163 -0
- package/skills/browser/references/high-risk.md +161 -0
- package/skills/browser/references/operate-actions.md +92 -0
- package/skills/browser/references/probing.md +302 -0
|
@@ -0,0 +1,340 @@
|
|
|
1
|
+
"""
|
|
2
|
+
browserd Python 客户端 — 复刻 lib/web/browser.js 的 facade 模式.
|
|
3
|
+
|
|
4
|
+
连接 PM2 Runtime 管理的 browserd, 通过 HTTP JSON-RPC (127.0.0.1:17322/rpc) 通信.
|
|
5
|
+
业务脚本退出不影响 Chrome (browserd 持有它, 常驻).
|
|
6
|
+
|
|
7
|
+
用法:
|
|
8
|
+
from browser import Browser
|
|
9
|
+
b = Browser()
|
|
10
|
+
bh = b.open(route_mode='auto') # browserd must already be running (managed by PM2 Runtime; not auto-started); opens the browser
|
|
11
|
+
ph = b.find_or_new_tab('example.com') # 找/开 tab
|
|
12
|
+
b.operate(ph, action='click', selector='#btn')
|
|
13
|
+
data = b.operate(ph, action='evaluate', source='() => document.title')
|
|
14
|
+
b.close()
|
|
15
|
+
|
|
16
|
+
设计原则 (对齐 browser.js):
|
|
17
|
+
- handle 是纯数据, 绝不判断路由 (browser.js 铁律 1)
|
|
18
|
+
- 会话长生命周期, open 一次跨多步复用 (铁律 2)
|
|
19
|
+
- 步进落盘, 不一次做完 (铁律 3)
|
|
20
|
+
- Python 侧零路由判断, 全透传给 browserd
|
|
21
|
+
"""
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
import json
|
|
24
|
+
import os
|
|
25
|
+
import subprocess
|
|
26
|
+
import time
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
from typing import Any
|
|
29
|
+
|
|
30
|
+
import httpx
|
|
31
|
+
|
|
32
|
+
# ─── 常量 (对齐 rpc-client.cjs) ───
|
|
33
|
+
IPC_HOST = "127.0.0.1"
|
|
34
|
+
IPC_PORT = 17322
|
|
35
|
+
RPC_URL = f"http://{IPC_HOST}:{IPC_PORT}/rpc"
|
|
36
|
+
HEALTH_URL = f"http://{IPC_HOST}:{IPC_PORT}/health"
|
|
37
|
+
|
|
38
|
+
# browserd.cjs 路径: 本文件在 lib/py/, browserd 在 lib/backend/
|
|
39
|
+
def _resolve_browserd_path() -> Path:
    """Locate ``browserd.cjs`` relative to this file.

    Two directory layouts are probed in order and the first path that exists
    is returned. When neither exists, the first layout's path is returned
    anyway, so the result is not guaranteed to exist on disk.
    """
    py_dir = Path(__file__).resolve().parent
    layouts = (
        # Source and production runtime layout:
        #   libs/browser/py/browser.py -> libs/browser/backend/browserd.cjs
        py_dir.parent / "backend" / "browserd.cjs",
        # Distributed skillpack layout:
        #   browser/lib/py/browser.py -> browser/lib/backend/browserd.cjs
        py_dir.parent.parent / "lib" / "backend" / "browserd.cjs",
    )
    return next((path for path in layouts if path.exists()), layouts[0])
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
BROWSERD_PATH = _resolve_browserd_path()
|
|
58
|
+
|
|
59
|
+
# JSON-RPC 请求 id (Python 客户端单线程顺序递增即可)
|
|
60
|
+
_rpc_seq = 0
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class BrowserdError(RuntimeError):
    """Raised when a browserd RPC call fails or browserd is not reachable."""
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _next_id() -> int:
    """Return the next JSON-RPC request id.

    Increments the module-level counter ``_rpc_seq`` by one and returns the
    new value, so ids grow sequentially with each call.
    """
    global _rpc_seq
    bumped = _rpc_seq + 1
    _rpc_seq = bumped
    return bumped
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _health(timeout_ms: int = 2000) -> dict | None:
    """Probe browserd's health endpoint.

    Args:
        timeout_ms: HTTP timeout for the probe, in milliseconds.

    Returns:
        The decoded JSON body when the endpoint answers with HTTP 200,
        otherwise ``None`` (non-200 status, transport error, or a body that
        is not valid JSON).
    """
    try:
        r = httpx.get(HEALTH_URL, timeout=timeout_ms / 1000)
        if r.status_code == 200:
            return r.json()
    except (httpx.HTTPError, OSError, ValueError):
        # ValueError covers json.JSONDecodeError from r.json(): a 200 reply
        # with a non-JSON body means "not a healthy browserd", not a crash.
        pass
    return None
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _rpc(method: str, params: dict | None = None, timeout_ms: int = 60000) -> Any:
    """Send one JSON-RPC request to browserd and return its ``result``.

    Args:
        method: RPC method name.
        params: Method parameters; ``None`` is sent as an empty object.
        timeout_ms: HTTP timeout for the request, in milliseconds.

    Returns:
        The ``result`` field of the response (``None`` when absent).

    Raises:
        BrowserdError: on transport failure, an undecodable or non-object
            response body, or a response whose ``ok`` field is falsy.
    """
    payload = {"id": _next_id(), "method": method, "params": params or {}}
    try:
        r = httpx.post(RPC_URL, json=payload, timeout=timeout_ms / 1000)
        data = r.json()
    except (httpx.HTTPError, json.JSONDecodeError) as e:
        raise BrowserdError(f"rpc 通信失败 ({method}): {e}") from e

    # Valid JSON that is not an object (list, string, null, ...) would blow up
    # on .get() with AttributeError; report it as an RPC failure instead.
    if not isinstance(data, dict):
        raise BrowserdError(
            f"rpc 通信失败 ({method}): unexpected response type {type(data).__name__}"
        )

    if not data.get("ok"):
        raise BrowserdError(f"{method}: {data.get('error', 'rpc failed')}")
    return data.get("result")
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _ensure_browserd(timeout_ms: int = 15000) -> None:
    """Verify that browserd is running; never start it.

    The process lifecycle is managed by PM2 Runtime, so this function only
    probes the health endpoint and raises when there is no answer.

    Args:
        timeout_ms: Accepted for backward compatibility but not used; the
            health probe always runs with a fixed 2000 ms timeout.

    Raises:
        BrowserdError: when the health probe returns ``None``; the message
            tells the user to run ``mooncat services start``.
    """
    # _health() signals "down" with None. Compare against None explicitly so
    # that an empty (falsy) health payload still counts as "running".
    if _health(2000) is not None:
        return

    raise BrowserdError(
        "browserd 未运行. 它现在由 PM2 Runtime 管理, 不再自动启动.\n"
        "请先运行: mooncat services start\n"
        "或单独起: mooncat services restart browserd"
    )
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _find_node() -> str | None:
    """Find the ``node`` executable on PATH.

    Tries the names ``node`` and then ``node.exe``.

    Returns:
        The path of the first match as a string, or ``None`` when no
        executable with either name is found.
    """
    # Function-scope import: shutil is not among this module's top-level imports.
    import shutil

    # shutil.which only reports regular, executable files, so a directory or a
    # non-executable file that merely happens to be called "node" is skipped.
    for name in ("node", "node.exe"):
        located = shutil.which(name)
        if located:
            return located
    return None
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class Browser:
    """Client facade for browserd (mirrors the functions exported by browser.js).

    Lifecycle:
        b = Browser()
        bh = b.open()                        # browserd must already be running -> browserHandle
        ph = b.find_or_new_tab('url')        # -> pageHandle
        b.operate(ph, action='click', ...)   # act on the page
        b.close()                            # only when the whole job is finished

    Handles returned by browserd are plain data and are passed back verbatim;
    this class makes no routing decisions of its own.
    """

    def __init__(self) -> None:
        # True once open() has succeeded on this instance, False again after close().
        self._browser_open = False

    def _require_open(self) -> None:
        """Raise BrowserdError unless open() has succeeded on this instance."""
        if not self._browser_open:
            raise BrowserdError("browser not open; call open() first")

    # ─── Top-level RPCs (aligned with browser.js) ───

    def open(
        self,
        route_mode: str = "auto",
        headless: bool = False,
        executable_path: str | None = None,
        profile_root: str | None = None,
        user_data_dir: str | None = None,
    ) -> dict:
        """Open the browser through browserd and return the browserHandle.

        browserd itself is not started here: ``_ensure_browserd`` raises
        BrowserdError when the daemon is not reachable.

        Args:
            route_mode: 'auto' | 'cdp' | 'plugin' (the original notes recommend
                plugin/extension for high-risk sites).
            headless: Forwarded as ``headless``.
            executable_path: Forwarded as ``executablePath`` when truthy.
            profile_root: Forwarded as ``profileRoot`` when truthy.
            user_data_dir: Forwarded as ``userDataDir`` when truthy.

        Returns:
            The browserHandle (plain data) returned by the ``open`` RPC.
        """
        _ensure_browserd()
        params: dict[str, Any] = {"routeMode": route_mode, "headless": headless}
        optional = {
            "executablePath": executable_path,
            "profileRoot": profile_root,
            "userDataDir": user_data_dir,
        }
        # Only send the optional settings the caller actually provided.
        params.update({key: value for key, value in optional.items() if value})
        bh = _rpc("open", params, timeout_ms=120000)  # opening is slow: allow 2 minutes
        self._browser_open = True
        return bh

    def new_tab(self, url: str = "") -> dict:
        """Create a new tab and return its pageHandle (plain data)."""
        self._require_open()
        return _rpc("newTab", {"url": url})

    def list_tabs(self) -> list[dict]:
        """List all tabs; each entry carries pageHandle/url/title/active."""
        self._require_open()
        return _rpc("listTabs", {})

    def find_or_new_tab(
        self,
        url: str,
        match: str = "includes",
        new_if_missing: bool = True,
    ) -> dict | None:
        """Find an existing tab by URL, optionally opening one when none matches.

        The search is done client-side over ``list_tabs()``.

        Args:
            url: URL (or URL fragment) to look for.
            match: 'exact' compares for equality; any other value (default
                'includes') tests whether ``url`` is a substring of the tab URL.
            new_if_missing: When no tab matches, open a new tab (True) or
                return ``None`` (False).

        Returns:
            The matching or newly created pageHandle, or ``None`` when nothing
            matched and ``new_if_missing`` is False.
        """
        for tab in self.list_tabs():
            # A tab may report url=None; treat it as empty rather than crashing
            # on the substring test.
            tab_url = tab.get("url") or ""
            hit = (tab_url == url) if match == "exact" else (url in tab_url)
            handle = tab.get("pageHandle")
            if hit and handle:
                return handle
        if not new_if_missing:
            return None
        return self.new_tab(url)

    def operate(
        self,
        page_handle: dict,
        action: str,
        timeout_ms: int = 60000,
        **params: Any,
    ) -> Any:
        """Run one action on a page (aligned with browser.js ``operate``).

        Args:
            page_handle: pageHandle of the target page; must contain ``pageId``.
            action: Action name; the full set is defined in browserd.cjs
                (goto/click/fill/type/press/snapshot/evaluate/...).
            timeout_ms: HTTP timeout of the RPC call, in milliseconds.
            **params: Action parameters (selector/value/source/x/y/...),
                forwarded unchanged.

        Returns:
            Whatever the ``operate`` RPC returns for the action.

        Raises:
            BrowserdError: if the browser is not open, the pageHandle lacks a
                ``pageId``, or the RPC fails.
        """
        self._require_open()
        if not page_handle or not page_handle.get("pageId"):
            raise BrowserdError("invalid pageHandle: missing pageId")
        return _rpc(
            "operate",
            {"pageHandle": page_handle, "action": action, "params": params},
            timeout_ms=timeout_ms,
        )

    def close(self) -> dict:
        """Send the ``close`` RPC; call only when the whole job is finished.

        Per the original notes this closes the browser and the browserd daemon.
        When this instance never opened the browser, no RPC is sent and an
        "already closed" note is returned.
        """
        if not self._browser_open:
            return {"ok": True, "note": "already closed"}
        result = _rpc("close", {})
        self._browser_open = False
        return result

    # ─── Cookies (scoped to a browserHandle) ───

    def get_cookies(self, browser_handle: dict | None = None) -> list[dict]:
        """Call the ``getCookies`` RPC for the given browserHandle (empty dict when omitted)."""
        return _rpc("getCookies", {"browserHandle": browser_handle or {}})

    def set_cookies(self, cookies: list[dict], browser_handle: dict | None = None) -> dict:
        """Call the ``setCookies`` RPC with ``cookies`` for the given browserHandle."""
        return _rpc("setCookies", {"cookies": cookies, "browserHandle": browser_handle or {}})

    def clear_cookies(self, browser_handle: dict | None = None) -> dict:
        """Call the ``clearCookies`` RPC for the given browserHandle."""
        return _rpc("clearCookies", {"browserHandle": browser_handle or {}})

    # ─── Convenience wrappers (sugar over operate() for common actions) ───

    def goto(self, page_handle: dict, url: str) -> Any:
        """Run the ``goto`` action with ``url``."""
        return self.operate(page_handle, "goto", url=url)  # type: ignore[arg-type]

    def click(self, page_handle: dict, selector: str, x: float | None = None, y: float | None = None) -> Any:
        """Run the ``click`` action on ``selector``; ``x``/``y`` are sent only when given."""
        p: dict[str, Any] = {"selector": selector}
        if x is not None:
            p["x"] = x
        if y is not None:
            p["y"] = y
        return self.operate(page_handle, "click", **p)

    def fill(self, page_handle: dict, selector: str, value: str) -> Any:
        """Run the ``fill`` action on ``selector`` with ``value``."""
        return self.operate(page_handle, "fill", selector=selector, value=value)

    def press(self, page_handle: dict, selector: str, key: str) -> Any:
        """Run the ``press`` action on ``selector`` with ``key``."""
        return self.operate(page_handle, "press", selector=selector, key=key)

    def status(self, page_handle: dict) -> dict:
        """Run the ``status`` action and return its result."""
        return self.operate(page_handle, "status")  # type: ignore[return-value]

    def snapshot(self, page_handle: dict, **opts: Any) -> Any:
        """Run the ``snapshot`` action (described in the original notes as the ARIA tree, used to inspect the DOM)."""
        return self.operate(page_handle, "snapshot", **opts)

    def evaluate(self, page_handle: dict, source: str, args: list | None = None) -> Any:
        """Run page JavaScript via the ``evaluate`` action.

        ``source`` must be a function expression such as
        ``'() => document.title'``; ``args`` is sent only when not None.
        """
        p: dict[str, Any] = {"source": source}
        if args is not None:
            p["args"] = args
        return self.operate(page_handle, "evaluate", **p)

    def screenshot(self, page_handle: dict) -> Any:
        """Run the ``screenshot`` action."""
        return self.operate(page_handle, "screenshot")

    def inner_text(self, page_handle: dict, selector: str) -> str:
        """Run the ``innerText`` action on ``selector``."""
        return self.operate(page_handle, "innerText", selector=selector)  # type: ignore[return-value]

    def wait_for_selector(self, page_handle: dict, selector: str, timeout: int | None = None) -> Any:
        """Run the ``waitForSelector`` action; ``timeout`` is sent only when given."""
        p: dict[str, Any] = {"selector": selector}
        if timeout is not None:
            p["timeout"] = timeout
        return self.operate(page_handle, "waitForSelector", **p)
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
# ─── 模块级接口 (对齐 JS 的 operate/listTabs 等, 供 find/scroll/ready/humanize 等子模块用) ───
|
|
286
|
+
# 用法: from browser import operate, list_tabs, new_tab, open_browser, find_or_new_tab, close_browser
|
|
287
|
+
|
|
288
|
+
_default_browser: Browser | None = None
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def _get_browser() -> Browser:
    """Return the shared module-level Browser, creating it on first use."""
    global _default_browser
    if _default_browser is not None:
        return _default_browser
    _default_browser = Browser()
    return _default_browser
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def open_browser(**kwargs) -> dict:
    """Module-level ``open`` (aligned with the JS ``open``).

    Forwards every keyword argument to ``open()`` of the shared default Browser.
    """
    shared = _get_browser()
    return shared.open(**kwargs)
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def new_tab(url: str = "") -> dict:
    """Module-level ``newTab`` (aligned with the JS ``newTab``).

    Opens a tab for ``url`` on the shared default Browser and returns what it returns.
    """
    shared = _get_browser()
    return shared.new_tab(url)
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def list_tabs(browser_handle: dict | None = None) -> list[dict]:
    """Module-level ``listTabs`` (aligned with the JS ``listTabs``).

    ``browser_handle`` is accepted for signature parity but is not used; the
    tabs are always listed through the shared default Browser.
    """
    shared = _get_browser()
    return shared.list_tabs()
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def find_or_new_tab(url: str, match: str = "includes", new_if_missing: bool = True) -> dict | None:
    """Module-level ``findOrNewTab`` (aligned with the JS ``findOrNewTab``).

    Delegates to ``find_or_new_tab`` of the shared default Browser, passing
    the three arguments positionally.
    """
    shared = _get_browser()
    return shared.find_or_new_tab(url, match, new_if_missing)
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def operate(page_handle: dict, params: dict) -> Any:
    """Module-level ``operate`` (aligned with the JS ``operate(pageHandle, actionObj)``).

    ``params`` is a dict that must contain an ``'action'`` key; the remaining
    keys are the action's parameters:
        operate(ph, {"action": "click", "selector": "#btn"})
        operate(ph, {"action": "evaluate", "source": "() => 1", "_skipVisualize": True})

    Keys that are never forwarded to browserd:
        ``_skipVisualize`` / ``_skipHumanize`` / ``label`` (per the original
        notes they are only meaningful on the JS side).

    Raises:
        ValueError: when ``params`` has no truthy ``'action'``.
    """
    shared = _get_browser()
    action = params.get("action")
    if not action:
        raise ValueError("operate: params 缺 action")
    # Strip the action name and the local-only control keys; everything else
    # is an action parameter.
    local_only = ("action", "_skipVisualize", "_skipHumanize", "label")
    forwarded = {}
    for key, value in params.items():
        if key in local_only:
            continue
        forwarded[key] = value
    return shared.operate(page_handle, action, **forwarded)
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def close_browser() -> dict:
    """Module-level ``close`` (aligned with the JS ``close``).

    Calls ``close()`` on the shared default Browser and returns its result.
    """
    shared = _get_browser()
    return shared.close()
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""web/captcha — 验证码检测(多信号交叉验证,路由无关)
|
|
2
|
+
|
|
3
|
+
忠实复刻 lib/web/captcha.js。
|
|
4
|
+
信号:URL 关键词 / DOM selector / 页面标题关键词 / 预期内容缺失。
|
|
5
|
+
基于 operate({action:'status'}) + operate({action:'evaluate'})。
|
|
6
|
+
|
|
7
|
+
依赖:browser(operate)
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import sys
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
sys.path.insert(0, __file__.rsplit("\\", 1)[0] if "\\" in __file__ else __file__.rsplit("/", 1)[0])
|
|
16
|
+
from browser import operate # noqa: E402
|
|
17
|
+
|
|
18
|
+
URL_KEYWORDS = ['captcha', 'verify', 'geetest', 'dingxiang', 'challenge', 'slider', 'recaptcha', 'hcaptcha']
|
|
19
|
+
TITLE_KEYWORDS = ['验证', 'captcha', 'security', 'verify', '人机', '安全验证']
|
|
20
|
+
DOM_SELECTORS = [
|
|
21
|
+
'.geetest_', '#geetest', '.yidun', '.dx_captcha', '#captcha',
|
|
22
|
+
'iframe[src*="captcha"]', 'iframe[src*="recaptcha"]', 'iframe[src*="hcaptcha"]',
|
|
23
|
+
'.g-recaptcha', '.h-captcha',
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# 探测页面是否含特定 captcha DOM
|
|
28
|
+
def _dom_probe_source(selectors: list) -> str:
    """Build the JS function source that reports which selectors match the page.

    The selector list is embedded as a compact JSON array; the generated
    function returns the subset for which ``document.querySelector`` finds
    an element.
    """
    selector_literal = json.dumps(selectors, separators=(',', ':'))
    js_lines = [
        '() => {',
        ' const sels = ' + selector_literal,
        ' const found = []',
        ' for (const s of sels) { if (document.querySelector(s)) found.push(s) }',
        ' return found',
        '}',
    ]
    return '\n'.join(js_lines)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def detect(page_handle: dict, options: dict | None = None) -> dict:
    """Detect a captcha page by cross-checking several signals.

    Signals (the first three are "strong", the last one "weak"):
        url-keyword       the page URL contains one of URL_KEYWORDS
        title-keyword     the page title contains one of TITLE_KEYWORDS
        dom-selector      at least one of DOM_SELECTORS matches in the page
        expected-missing  ``options['expectedSelector']`` is not on the page

    Args:
        page_handle: pageHandle of the page to inspect.
        options: Optional settings.
            expectedSelector: a selector that a normal page should contain;
                its absence counts as the weak signal.

    Returns:
        ``{isCaptcha, confidence, signals, evidence}``. ``confidence`` is 0.95
        for two or more strong signals, 0.7 for exactly one, 0.3 for the weak
        signal alone, and 0 otherwise; ``isCaptcha`` is ``confidence >= 0.7``.

    Raises:
        ValueError: when ``page_handle`` is empty.
    """
    if options is None:
        options = {}
    if not page_handle:
        raise ValueError('detect: pageHandle required')

    signals: list[str] = []
    evidence: list[str] = []

    # Signal 1: URL keyword (the status result may carry the URL under 'url' or 'value').
    st = operate(page_handle, {'action': 'status'})
    url = (st and (st.get('url') or st.get('value'))) or ''
    url_lower = url.lower()
    url_hit = next((k for k in URL_KEYWORDS if k in url_lower), None)
    if url_hit:
        signals.append('url-keyword')
        evidence.append('url contains "' + url_hit + '"')

    # Signal 2: title keyword.
    title = (st and st.get('title')) or ''
    title_lower = title.lower()
    title_hit = next((k for k in TITLE_KEYWORDS if k.lower() in title_lower), None)
    if title_hit:
        signals.append('title-keyword')
        evidence.append('title contains "' + title_hit + '"')

    # Signal 3: captcha DOM. The result is either the list itself or wrapped as {'value': [...]}.
    found = operate(page_handle, {'action': 'evaluate', 'source': _dom_probe_source(DOM_SELECTORS)})
    if isinstance(found, list):
        found_arr = found
    elif isinstance(found, dict) and isinstance(found.get('value'), list):
        found_arr = found['value']
    else:
        found_arr = []
    if found_arr:
        signals.append('dom-selector')
        evidence.append('selectors: ' + ', '.join(str(s) for s in found_arr))

    # Signal 4: expected content missing (weak signal).
    expected = options.get('expectedSelector')
    if expected:
        # The selector must be passed through 'args': without it the JS
        # parameter `s` is undefined and the probe never looks at the selector.
        exp = operate(page_handle, {
            'action': 'evaluate',
            'source': '(s) => !!document.querySelector(s)',
            'args': [expected],
        })
        if isinstance(exp, bool):
            present = exp
        elif isinstance(exp, dict) and isinstance(exp.get('value'), bool):
            present = exp['value']
        elif isinstance(exp, dict) and isinstance(exp.get('result'), bool):
            present = exp['result']
        else:
            present = bool(exp)
        if not present:
            signals.append('expected-missing')
            evidence.append(f'expected "{expected}" not found')

    # Confidence.
    confidence = 0
    strong = sum(1 for s in signals if s != 'expected-missing')
    weak = 1 if 'expected-missing' in signals else 0
    if strong >= 2:
        confidence = 0.95
    elif strong == 1:
        confidence = 0.7
    elif weak == 1:
        confidence = 0.3

    return {'isCaptcha': confidence >= 0.7, 'confidence': confidence, 'signals': signals, 'evidence': evidence}
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""web/crawler — 轻量爬虫 (URL 去重 + 深度控制 + 节流, 路由无关).
|
|
2
|
+
|
|
3
|
+
忠实复刻 lib/web/crawler.js。
|
|
4
|
+
基于 operate (goto + evaluate 提取链接/内容) 组合, 复用同一个 pageHandle (顺序导航)。
|
|
5
|
+
|
|
6
|
+
依赖: browser (operate)
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
import json
|
|
10
|
+
import time
|
|
11
|
+
from typing import Any, Callable
|
|
12
|
+
from urllib.parse import urlparse
|
|
13
|
+
|
|
14
|
+
import sys
|
|
15
|
+
sys.path.insert(0, __file__.rsplit("\\", 1)[0] if "\\" in __file__ else __file__.rsplit("/", 1)[0])
|
|
16
|
+
from browser import operate # noqa: E402
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _extract_source(base_domain: str) -> str:
    """Build the JS function source that extracts title, text and links from a page.

    The generated function returns ``{title, text, links}``: ``text`` is the
    first 8000 characters of ``document.body.innerText``; ``links`` are the
    de-duplicated http(s) ``a[href]`` targets (at most 200), restricted to
    ``base_domain`` when it is non-empty.

    Args:
        base_domain: Hostname that links must match; empty/None disables the filter.

    Returns:
        A JS arrow-function expression. It is deliberately NOT followed by
        ``()``: ``() => {...}()`` is a JavaScript syntax error, and every other
        ``evaluate`` source in this package is a bare function expression.
    """
    return f"""() => {{
  const title = document.title || "";
  const text = (document.body && document.body.innerText) ? document.body.innerText.slice(0, 8000) : "";
  const links = [];
  const a = document.querySelectorAll("a[href]");
  for (const el of a) {{
    try {{
      const u = new URL(el.href, location.href);
      if (u.protocol !== "http:" && u.protocol !== "https:") continue;
      links.push(u.href);
    }} catch {{}}
  }}
  const base = {json.dumps(base_domain or "")};
  const filtered = base ? links.filter((l) => {{ try {{ return new URL(l).hostname === base }} catch {{ return false }} }}) : links;
  return {{ title, text, links: [...new Set(filtered)].slice(0, 200) }};
}}"""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def create_crawler(page_handle: dict, options: dict | None = None) -> dict:
    """Create a crawler bound to one pageHandle.

    Pages are visited breadth-first by navigating the same page sequentially.

    Args:
        page_handle: pageHandle used for every navigation.
        options: Optional settings.
            maxPages: maximum number of result entries (default 50)
            maxDepth: maximum link depth, the start URL being depth 0 (default 3)
            delay: pause between pages, in seconds (default 0.5)
            retries: extra attempts after a failed fetch (default 2)
            sameDomain: only follow links on the start URL's hostname (default True)

    Returns:
        ``{"crawl": crawl, "results": results}``. ``crawl(start_url)`` appends
        one entry per visited URL to ``results`` and returns that same list.
        Entries are ``{url, title, html, depth}`` (``html`` holds the extracted
        page text) plus ``error: "fetch failed"`` when every attempt failed.
        ``results`` is shared by all ``crawl`` calls on this crawler.

    Raises:
        ValueError: when ``page_handle`` is empty.
    """
    if not page_handle:
        raise ValueError("create_crawler: page_handle required")
    opts = options or {}
    max_pages = opts.get("maxPages", 50)
    max_depth = opts.get("maxDepth", 3)
    delay_sec = opts.get("delay", 0.5)
    retries = opts.get("retries", 2)
    same_domain = opts.get("sameDomain", True)

    results: list[dict] = []

    def host_of(url: Any) -> str:
        """Return the hostname of ``url``, or "" when it cannot be parsed."""
        try:
            return urlparse(url).hostname or ""
        except (ValueError, TypeError, AttributeError):
            return ""

    def fetch_page(url: str, depth: int) -> dict:
        """Navigate to ``url`` and extract title/text/links, retrying on failure."""
        # Loop-invariant: the link filter depends only on the URL being fetched.
        base_domain = host_of(url) if same_domain else ""
        last_err = None
        attempts = retries + 1
        for attempt in range(attempts):
            try:
                operate(page_handle, {"action": "goto", "url": url, "timeout": 15000})
                operate(page_handle, {"action": "waitForLoadState", "state": "complete"})
                data = operate(page_handle, {"action": "evaluate", "source": _extract_source(base_domain)})
                # The payload is either the object itself or wrapped as {"value": {...}}.
                d: Any = {}
                if isinstance(data, dict):
                    if "title" in data or "links" in data:
                        d = data
                    elif "value" in data:
                        d = data["value"]
                if not isinstance(d, dict):
                    raise RuntimeError("fetchPage: unexpected evaluate result")
                links = d.get("links", [])
                if not isinstance(links, list):
                    links = []
                return {"title": d.get("title", ""), "text": d.get("text", ""), "links": links}
            except Exception as e:  # best-effort boundary: any failure triggers a retry
                last_err = e
                # Pause only when another attempt follows.
                if attempt + 1 < attempts:
                    time.sleep(0.5)
        raise last_err or RuntimeError("fetchPage failed")

    def crawl(start_url: str) -> list[dict]:
        """Crawl breadth-first from ``start_url``; return the shared results list."""
        queue = [{"url": start_url, "depth": 0}]
        visited = set()
        start_domain = host_of(start_url)

        while queue and len(results) < max_pages:
            item = queue.pop(0)
            url = item["url"]
            depth = item["depth"]
            if url in visited:
                continue
            visited.add(url)
            if depth > max_depth:
                continue

            try:
                page = fetch_page(url, depth)
            except Exception:  # record the failure and keep crawling
                results.append({"url": url, "title": "", "html": "", "depth": depth, "error": "fetch failed"})
            else:
                results.append({"url": url, "title": page["title"], "html": page["text"], "depth": depth})
                # Links found at the maximum depth could never be fetched, so
                # do not enqueue them at all.
                if depth < max_depth:
                    for link in page["links"]:
                        if not isinstance(link, str) or link in visited:
                            continue
                        if same_domain and start_domain and host_of(link) != start_domain:
                            continue
                        queue.append({"url": link, "depth": depth + 1})

            # Throttle only when another page is going to be processed.
            if delay_sec > 0 and queue and len(results) < max_pages:
                time.sleep(delay_sec)

        return results

    return {"crawl": crawl, "results": results}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""
|
|
2
|
+
样例 01: 最小打开 + 探查 (对齐 e2e_steps/01_cdp_open.js)
|
|
3
|
+
|
|
4
|
+
演示 Python 客户端最基本的: open → new_tab → status → evaluate → close
|
|
5
|
+
适合第一次接触 Python 客户端的人跑通。
|
|
6
|
+
|
|
7
|
+
跑: python lib/py/examples/01_open_and_probe.py
|
|
8
|
+
"""
|
|
9
|
+
import sys
|
|
10
|
+
import os
|
|
11
|
+
import time
|
|
12
|
+
|
|
13
|
+
# 让 examples 能 import 上级目录的 browser 模块
|
|
14
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
|
15
|
+
|
|
16
|
+
from browser import Browser # noqa: E402
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def main() -> None:
    """Run the minimal demo: open -> new tab -> status -> evaluate -> close."""
    browser = Browser()

    # Step 1: open the browser with route_mode "auto", not headless.
    print("[01] open browser...")
    handle = browser.open(route_mode="auto", headless=False)
    print("[01] mode:", handle.get("mode"))

    # Step 2: open a new tab on example.com.
    print("[01] new tab → example.com")
    page = browser.new_tab("https://example.com")
    # Fixed wait for the page to load (demo only; use wait_for_selector in production).
    time.sleep(3)

    # Step 3: probe the page status.
    print("[01] status:", browser.status(page))

    # Step 4: probe with evaluate: document title and the first <h1> text.
    page_title = browser.evaluate(page, source="() => document.title")
    heading = browser.evaluate(page, source='() => (document.querySelector("h1")||{}).innerText || ""')
    print(f"[01] title={page_title!r} h1={heading!r}")

    # Step 5: close at the end of the demo (long-lived agents keep the browser open instead).
    print("[01] close")
    browser.close()
    print("[01] DONE")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
if __name__ == "__main__":
|
|
48
|
+
main()
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""
|
|
2
|
+
样例 02: 复用已有 tab + DOM 探查 (对齐 findOrNewTab + snapshot/evaluate)
|
|
3
|
+
|
|
4
|
+
演示 Python 客户端的核心探查能力:
|
|
5
|
+
- find_or_new_tab (只找不开, 不重复打开已有 tab)
|
|
6
|
+
- snapshot (aria 树, DOM 结构探查)
|
|
7
|
+
- evaluate (跑页面 JS 取数据)
|
|
8
|
+
|
|
9
|
+
这是 agent/采集场景最常用的模式: 先看页面长什么样, 再决定怎么操作。
|
|
10
|
+
|
|
11
|
+
跑: python lib/py/examples/02_reuse_and_probe.py
|
|
12
|
+
"""
|
|
13
|
+
import sys
|
|
14
|
+
import os
|
|
15
|
+
import time
|
|
16
|
+
|
|
17
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
|
18
|
+
|
|
19
|
+
from browser import Browser # noqa: E402
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def main() -> None:
    """Run the reuse-and-probe demo: find/open a tab, snapshot it, evaluate JS."""
    browser = Browser()
    print("[02] open...")
    browser.open(route_mode="auto", headless=False)

    # Step 1: reuse an existing example.com tab if there is one, otherwise open it
    # (browser.js principle: reuse tabs, do not blindly call newTab).
    # The full 'https://example.com' (scheme included) is passed on purpose:
    # per the original note, proxy tabs have URLs like
    # chrome-extension://xxx/example.com, which a bare 'example.com'
    # substring would also hit.
    page = browser.find_or_new_tab("https://example.com", match="includes")
    print("[02] pageHandle:", page)

    # Bring the tab to the front and wait for it to finish loading.
    browser.operate(page, action="activate")
    browser.operate(page, action="waitForLoadState", state="complete")
    time.sleep(2)

    # Step 2: snapshot (ARIA tree) to inspect the DOM structure.
    snap = browser.snapshot(page)
    if isinstance(snap, dict) and snap:
        aria_yaml = snap.get("yaml", "")
        print(f"[02] snapshot aria 树长度: {len(aria_yaml)} 字符")
        print("[02] aria 树前 300 字:")
        print(aria_yaml[:300])
    else:
        print("[02] snapshot:", snap)

    # Step 3: evaluate page JS to collect structured data (first five links).
    first_links = browser.evaluate(
        page,
        source="""() => {
return [...document.querySelectorAll('a')].map(a => ({
text: (a.innerText || '').trim(),
href: a.href
})).slice(0, 5)
}""",
    )
    print(f"[02] 页面链接 (前5): {first_links}")

    # The browser is intentionally left open (long-lived session for later steps).
    print("[02] 浏览器保持打开 (长生命周期复用)")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
if __name__ == "__main__":
|
|
66
|
+
main()
|