mooncat-browser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +213 -0
- package/browser-op/backend/browserd.cjs +1004 -0
- package/browser-op/backend/rpc-client.cjs +64 -0
- package/browser-op/backend/state.cjs +51 -0
- package/browser-op/cdp/capture-inject.js +426 -0
- package/browser-op/cdp/capture-inject.ts +426 -0
- package/browser-op/cdp/capture-service.cjs +172 -0
- package/browser-op/cdp/chrome-launcher.cjs +370 -0
- package/browser-op/cdp/chrome-path.cjs +57 -0
- package/browser-op/cdp/state.cjs +89 -0
- package/browser-op/extension/extension-detect.cjs +228 -0
- package/browser-op/extension/server.cjs +197 -0
- package/browser-op/extension/service.cjs +228 -0
- package/browser-op/extension/state.cjs +78 -0
- package/browser-op/index.cjs +389 -0
- package/browser-op/package.json +17 -0
- package/browser-op/py/behavior.py +138 -0
- package/browser-op/py/browser.py +340 -0
- package/browser-op/py/captcha.py +115 -0
- package/browser-op/py/crawler.py +125 -0
- package/browser-op/py/examples/01_open_and_probe.py +48 -0
- package/browser-op/py/examples/02_reuse_and_probe.py +66 -0
- package/browser-op/py/examples/03_interact.py +66 -0
- package/browser-op/py/find.py +150 -0
- package/browser-op/py/honeypot.py +73 -0
- package/browser-op/py/humanize.py +392 -0
- package/browser-op/py/image.py +186 -0
- package/browser-op/py/interact.py +193 -0
- package/browser-op/py/markdown.py +38 -0
- package/browser-op/py/pyproject.toml +32 -0
- package/browser-op/py/ready.py +208 -0
- package/browser-op/py/scroll.py +180 -0
- package/browser-op/py/upload.py +103 -0
- package/browser-op/py/visual_target.py +47 -0
- package/browser-op/py/visualize.py +91 -0
- package/browser-op/state.cjs +63 -0
- package/browser-op/web/behavior.js +153 -0
- package/browser-op/web/browser.js +231 -0
- package/browser-op/web/captcha.js +85 -0
- package/browser-op/web/crawler.js +109 -0
- package/browser-op/web/find.js +147 -0
- package/browser-op/web/honeypot.js +68 -0
- package/browser-op/web/humanize.js +522 -0
- package/browser-op/web/image.js +177 -0
- package/browser-op/web/interact.js +169 -0
- package/browser-op/web/markdown.js +80 -0
- package/browser-op/web/ready.js +295 -0
- package/browser-op/web/scroll.js +167 -0
- package/browser-op/web/upload.js +116 -0
- package/browser-op/web/visual-runtime.inject.cjs +6 -0
- package/browser-op/webplater/.env.example +7 -0
- package/browser-op/webplater/ARCHITECTURE.md +102 -0
- package/browser-op/webplater/dist/chrome-mv3/assets/popup-BUZEUmsx.css +1 -0
- package/browser-op/webplater/dist/chrome-mv3/background.js +2 -0
- package/browser-op/webplater/dist/chrome-mv3/capture.js +310 -0
- package/browser-op/webplater/dist/chrome-mv3/chunks/_virtual_wxt-html-plugins-DPbbfBKe.js +1 -0
- package/browser-op/webplater/dist/chrome-mv3/chunks/offscreen-CFXYw9Mo.js +1 -0
- package/browser-op/webplater/dist/chrome-mv3/chunks/popup-C-lpxZZO.js +1 -0
- package/browser-op/webplater/dist/chrome-mv3/content-scripts/content.js +7 -0
- package/browser-op/webplater/dist/chrome-mv3/manifest.json +1 -0
- package/browser-op/webplater/dist/chrome-mv3/offscreen.html +16 -0
- package/browser-op/webplater/dist/chrome-mv3/popup.html +31 -0
- package/browser-op/webplater/entrypoints/background.ts +938 -0
- package/browser-op/webplater/entrypoints/content.ts +1150 -0
- package/browser-op/webplater/entrypoints/offscreen/index.html +15 -0
- package/browser-op/webplater/entrypoints/offscreen/main.ts +161 -0
- package/browser-op/webplater/entrypoints/popup/index.html +29 -0
- package/browser-op/webplater/entrypoints/popup/main.ts +61 -0
- package/browser-op/webplater/entrypoints/popup/style.css +100 -0
- package/browser-op/webplater/lib/snapshot.ts +352 -0
- package/browser-op/webplater/package.json +29 -0
- package/browser-op/webplater/pnpm-lock.yaml +3411 -0
- package/browser-op/webplater/public/capture.js +310 -0
- package/browser-op/webplater/scripts/publish-extension.mjs +176 -0
- package/browser-op/webplater/tsconfig.json +19 -0
- package/browser-op/webplater/wxt.config.ts +34 -0
- package/dist/actions.md +102 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +278 -0
- package/dist/cli.js.map +1 -0
- package/dist/client.d.ts +94 -0
- package/dist/client.d.ts.map +1 -0
- package/dist/client.js +277 -0
- package/dist/client.js.map +1 -0
- package/dist/config.d.ts +61 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +119 -0
- package/dist/config.js.map +1 -0
- package/dist/protocol.d.ts +195 -0
- package/dist/protocol.d.ts.map +1 -0
- package/dist/protocol.js +11 -0
- package/dist/protocol.js.map +1 -0
- package/dist/server.d.ts +66 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +259 -0
- package/dist/server.js.map +1 -0
- package/package.json +78 -0
- package/schemas/browser.clearCookies.schema.json +13 -0
- package/schemas/browser.close.schema.json +9 -0
- package/schemas/browser.getCookies.schema.json +13 -0
- package/schemas/browser.getDownload.schema.json +15 -0
- package/schemas/browser.health.schema.json +9 -0
- package/schemas/browser.listDownloads.schema.json +16 -0
- package/schemas/browser.listTabs.schema.json +9 -0
- package/schemas/browser.newTab.schema.json +15 -0
- package/schemas/browser.open.schema.json +15 -0
- package/schemas/browser.operate.schema.json +15 -0
- package/schemas/browser.reuseTab.schema.json +15 -0
- package/schemas/browser.setCookies.schema.json +15 -0
- package/schemas/browser.waitFor.schema.json +15 -0
- package/schemas/browser.waitForDownload.schema.json +15 -0
- package/skills/browser/SKILL.md +110 -0
- package/skills/browser/references/collect.md +163 -0
- package/skills/browser/references/high-risk.md +161 -0
- package/skills/browser/references/operate-actions.md +92 -0
- package/skills/browser/references/probing.md +302 -0
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
// web/browser — browserd client facade
|
|
2
|
+
//
|
|
3
|
+
// 对外 API 不变: open / newTab / operate / listTabs / close 等。
|
|
4
|
+
// 但底层不再是进程内 BrowserOp, 而是自动管理一个 detached browserd 后端进程。
|
|
5
|
+
// 业务脚本退出不影响 Chrome (browserd 持有它)。
|
|
6
|
+
//
|
|
7
|
+
// 生命周期:
|
|
8
|
+
// open() → 自动 spawn browserd (如不存在) → IPC open → 返回 browserHandle (纯数据)
|
|
9
|
+
// newTab/operate/listTabs → IPC 调 browserd
|
|
10
|
+
// close() → IPC close → browserd 关 Chrome + 退出
|
|
11
|
+
// 业务脚本退出 → 什么都不做 (browserd 常驻)
|
|
12
|
+
|
|
13
|
+
'use strict'
|
|
14
|
+
|
|
15
|
+
const { spawn } = require('node:child_process')
|
|
16
|
+
const path = require('node:path')
|
|
17
|
+
const os = require('node:os')
|
|
18
|
+
|
|
19
|
+
const { rpc, health } = require('../backend/rpc-client.cjs')
|
|
20
|
+
|
|
21
|
+
// Package root: this file lives in <root>/<lib dir>/web/, so the root is two levels up.
// Used as the working directory of the spawned backend.
const SKILL_ROOT = path.resolve(__dirname, '..', '..')

// Backend entry point. Resolved relative to this file (sibling "backend" directory,
// the same place '../backend/rpc-client.cjs' is required from) instead of a
// hard-coded "<root>/lib/backend" path, so it keeps working whatever the parent
// directory is called in the published package.
const BROWSERD_PATH = path.join(__dirname, '..', 'backend', 'browserd.cjs')

// Whether browserd has been confirmed alive from this process (avoids re-spawning).
let _browserdReady = false
|
|
27
|
+
|
|
28
|
+
// ─── ensureBrowserd: 确保后端存活 ───
|
|
29
|
+
|
|
30
|
+
/**
 * Make sure the browserd backend is reachable, spawning it detached if needed.
 *
 * Flow: probe health once; if it does not answer with `ok`, spawn
 * `node BROWSERD_PATH` as a detached, unref'ed child and poll health every
 * 300ms until it answers or `timeoutMs` elapses.
 *
 * @param {number} [timeoutMs=15000] how long to wait for the spawned backend
 * @returns {Promise<void>}
 * @throws {Error} if the child cannot be spawned, or no healthy answer arrives in time
 */
async function ensureBrowserd (timeoutMs = 15000) {
  // Fast path: a backend is already running.
  const first = await health(2000)
  if (first && first.ok) {
    _browserdReady = true
    return
  }

  // Not alive -> detached spawn so the backend outlives this script.
  const child = spawn(process.execPath, [BROWSERD_PATH], {
    cwd: SKILL_ROOT,
    stdio: 'ignore',
    detached: true,
    windowsHide: true,
    env: process.env
  })
  // Without a listener a failed spawn emits an unhandled 'error' event and
  // crashes the caller; capture it and report it from the polling loop instead.
  let spawnError = null
  child.once('error', (err) => { spawnError = err })
  child.unref()

  const failIfSpawnBroke = () => {
    if (spawnError) {
      throw new Error('browserd failed to spawn: ' + (spawnError.message || spawnError))
    }
  }

  // Poll health until the deadline.
  const deadline = Date.now() + timeoutMs
  while (Date.now() < deadline) {
    await new Promise((resolve) => setTimeout(resolve, 300))
    failIfSpawnBroke()
    const probe = await health(1000)
    if (probe && probe.ok) {
      _browserdReady = true
      return
    }
  }

  failIfSpawnBroke()
  throw new Error('browserd failed to start (no /health after ' + timeoutMs + 'ms)')
}
|
|
61
|
+
|
|
62
|
+
// ─── 对外 API (签名不变) ───
|
|
63
|
+
|
|
64
|
+
/**
|
|
65
|
+
* 打开浏览器 (自动连接或启动 browserd 后端)。
|
|
66
|
+
* @param {Object} options
|
|
67
|
+
* mode: 'humanize' → 开启拟人行为注入(高危平台必用);调用方显式 humanAct
|
|
68
|
+
* riskLevel: 'low'|'medium'|'high';不传则按 url 自动 detect(高危域名→high)
|
|
69
|
+
* @returns {Promise<browserHandle>} 纯数据: { mode, notice, ownerId, humanize? }
|
|
70
|
+
*/
|
|
71
|
+
async function open (options = {}) {
|
|
72
|
+
await ensureBrowserd()
|
|
73
|
+
const bh = await rpc('open', {
|
|
74
|
+
routeMode: options.routeMode || 'auto',
|
|
75
|
+
headless: options.headless || false,
|
|
76
|
+
executablePath: options.executablePath,
|
|
77
|
+
profileRoot: options.profileRoot,
|
|
78
|
+
userDataDir: options.userDataDir
|
|
79
|
+
})
|
|
80
|
+
|
|
81
|
+
// 拟人模式:挂 humanizer 到 browserHandle,newTab 继承,operate 自动注入
|
|
82
|
+
if (options.mode === 'humanize') {
|
|
83
|
+
const humanize = require('./humanize')
|
|
84
|
+
const level = options.riskLevel || humanize.detectRiskLevel(undefined)
|
|
85
|
+
bh.humanize = humanize.createHumanizer({ riskLevel: level })
|
|
86
|
+
bh.notice = bh.notice || { level: 'info', message: '拟人模式已开启 (riskLevel=' + bh.humanize.riskLevel + ')' }
|
|
87
|
+
}
|
|
88
|
+
// 可视化在后端默认开启 (resolver 单源保证高亮=执行对象), 这里不挂任何标记。
|
|
89
|
+
return bh
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
/**
 * Create a new tab through the backend and return its pageHandle.
 * The humanizer (if any) is copied from the browserHandle so every tab shares
 * the same instance.
 *
 * @param {Object} browserHandle handle returned by open()
 * @param {Object} [params]
 * @param {string} [params.url=''] url to open in the new tab
 * @returns {Promise<Object>} pageHandle
 * @throws {Error} when browserHandle is missing
 */
async function newTab (browserHandle, params = {}) {
  if (!browserHandle) {
    throw new Error('newTab: browserHandle required')
  }
  const targetUrl = params.url || ''
  const pageHandle = await rpc('newTab', { url: targetUrl })
  const { humanize } = browserHandle
  if (humanize) {
    pageHandle.humanize = humanize
  }
  return pageHandle
}
|
|
103
|
+
|
|
104
|
+
/**
 * List the tabs known to the backend.
 *
 * @param {Object} browserHandle handle returned by open() (only checked for presence)
 * @returns {Promise<*>} whatever the backend's `listTabs` call returns
 * @throws {Error} when browserHandle is missing
 */
async function listTabs (browserHandle) {
  if (browserHandle) {
    return rpc('listTabs', {})
  }
  throw new Error('listTabs: browserHandle required')
}
|
|
111
|
+
|
|
112
|
+
/**
 * Reuse an already-open tab whose url matches, otherwise open a new one.
 * Avoids opening the same page over and over.
 *
 * @param {Object} browserHandle handle returned by open()
 * @param {Object} params
 * @param {string} params.url text matched against each open tab's url
 * @param {string} [params.match='includes'] 'includes' (substring) or 'exact'
 * @param {boolean} [params.newIfMissing=true] when false, return null instead of opening a tab
 * @param {string} [params.openUrl] url used for the new tab (defaults to params.url)
 * @returns {Promise<Object|null>} pageHandle of the reused or new tab, or null
 * @throws {Error} when browserHandle or params.url is missing
 */
async function findOrNewTab (browserHandle, params = {}) {
  if (!browserHandle) throw new Error('findOrNewTab: browserHandle required')
  const wanted = params.url || ''
  if (!wanted) throw new Error('findOrNewTab: url required')

  const mode = params.match || 'includes'
  const matches = mode === 'exact'
    ? (tabUrl) => tabUrl === wanted
    : (tabUrl) => tabUrl.includes(wanted)

  const openTabs = await rpc('listTabs', {})
  let reused = null
  for (const tab of openTabs) {
    if (matches(tab.url || '')) {
      reused = tab
      break
    }
  }

  if (!reused) {
    if (params.newIfMissing === false) return null
    return newTab(browserHandle, { url: params.openUrl || wanted })
  }

  // Reused tab: copy the humanizer over by hand (newTab does this for new tabs).
  const pageHandle = reused.pageHandle
  if (browserHandle.humanize) pageHandle.humanize = browserHandle.humanize
  return pageHandle
}
|
|
150
|
+
|
|
151
|
+
/**
 * Run one action on a page. Single entry point, pure pass-through to browserd.
 *
 * The internal-only flags `_skipHumanize` / `_skipVisualize` are stripped and
 * never reach the backend; every other property of `actionObj` except `action`
 * is sent as the action's params.
 *
 * @param {Object} pageHandle handle returned by newTab / findOrNewTab
 * @param {Object} actionObj e.g. { action: 'click', selector: '#x', ... }
 * @returns {Promise<*>} the backend's result for the action
 * @throws {Error} when pageHandle or actionObj.action is missing
 */
async function operate (pageHandle, actionObj) {
  if (!pageHandle) throw new Error('operate: pageHandle required')
  if (!actionObj || !actionObj.action) throw new Error('operate: action required')

  // Destructure the internal flags only to keep them out of `params`.
  // eslint-disable-next-line no-unused-vars
  const { action, _skipHumanize, _skipVisualize, ...params } = actionObj

  return rpc('operate', { pageHandle, action, params })
}
|
|
173
|
+
|
|
174
|
+
/**
 * Ask browserd to close (the `close` RPC) and reset the local ready flag.
 * Does nothing when no browserHandle is given. RPC failures are ignored on
 * purpose: the backend may already have exited.
 *
 * @param {Object} browserHandle handle returned by open()
 * @returns {Promise<void>}
 */
async function close (browserHandle) {
  if (browserHandle) {
    try {
      await rpc('close', {})
    } catch (_ignored) {
      // browserd may already be gone; nothing to do.
    }
    _browserdReady = false
  }
}
|
|
186
|
+
|
|
187
|
+
// ─── cookie (browserHandle 作用域) ───
|
|
188
|
+
|
|
189
|
+
/**
 * Forward a `getCookies` request to browserd.
 * @param {Object} browserHandle not used by this wrapper
 * @param {Object} [params] sent to the backend unchanged
 * @returns {Promise<*>} backend result
 */
async function getCookies (browserHandle, params = {}) {
  const method = 'getCookies'
  return rpc(method, params)
}

/**
 * Forward a `setCookies` request to browserd.
 * @param {Object} browserHandle not used by this wrapper
 * @param {Object} [params] sent to the backend unchanged
 * @returns {Promise<*>} backend result
 */
async function setCookies (browserHandle, params = {}) {
  const method = 'setCookies'
  return rpc(method, params)
}

/**
 * Forward a `clearCookies` request to browserd.
 * @param {Object} browserHandle not used by this wrapper
 * @param {Object} [params] sent to the backend unchanged
 * @returns {Promise<*>} backend result
 */
async function clearCookies (browserHandle, params = {}) {
  const method = 'clearCookies'
  return rpc(method, params)
}
|
|
198
|
+
|
|
199
|
+
// ─── 模块加载 (辅助模块, 不走 browserd) ───
|
|
200
|
+
|
|
201
|
+
// Lazy wrapper around ./humanize createHumanizer; yields null if loading or the call throws.
const createHumanizer = (opts) => {
  try {
    const humanizeModule = require('./humanize')
    return humanizeModule.createHumanizer(opts)
  } catch {
    return null
  }
}

// Lazy wrapper around ./humanize detectRiskLevel; falls back to 'medium' on any error.
const detectRiskLevel = (url) => {
  try {
    const humanizeModule = require('./humanize')
    return humanizeModule.detectRiskLevel(url)
  } catch {
    return 'medium'
  }
}
|
|
207
|
+
|
|
208
|
+
module.exports = {
|
|
209
|
+
open,
|
|
210
|
+
newTab,
|
|
211
|
+
findOrNewTab,
|
|
212
|
+
listTabs,
|
|
213
|
+
operate,
|
|
214
|
+
close,
|
|
215
|
+
getCookies,
|
|
216
|
+
setCookies,
|
|
217
|
+
clearCookies,
|
|
218
|
+
createHumanizer,
|
|
219
|
+
detectRiskLevel,
|
|
220
|
+
ensureBrowserd,
|
|
221
|
+
// 重新导出辅助模块 (兼容现有 require 路径)
|
|
222
|
+
createBehavior: (opts) => { try { return require('./behavior')(opts) } catch { return null } },
|
|
223
|
+
detect: (opts) => { try { return require('./captcha').detect(opts) } catch { return null } },
|
|
224
|
+
createCrawler: (opts) => { try { return require('./crawler')(opts) } catch { return null } },
|
|
225
|
+
scan: (opts) => { try { return require('./honeypot').scan(opts) } catch { return null } },
|
|
226
|
+
captureImages: (opts) => { try { return require('./image').captureImages(opts) } catch { return null } },
|
|
227
|
+
highlight: (opts) => { try { return require('./interact').highlight(opts) } catch {} },
|
|
228
|
+
toast: (opts) => { try { return require('./interact').toast(opts) } catch {} },
|
|
229
|
+
htmlToMarkdown: (html) => { try { return require('./markdown').htmlToMarkdown(html) } catch { return html } },
|
|
230
|
+
uploadFiles: (opts) => { try { return require('./upload').uploadFiles(opts) } catch { return null } }
|
|
231
|
+
}
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
// -*- coding: utf-8 -*-
|
|
2
|
+
//
|
|
3
|
+
// web/captcha — 验证码检测(多信号交叉验证,路由无关)
|
|
4
|
+
//
|
|
5
|
+
// 完整复刻自 bee/resources/sandbox/builtin-libs/web/captcha.js。
|
|
6
|
+
// 信号:URL 关键词 / DOM selector / 页面标题关键词 / 预期内容缺失。
|
|
7
|
+
// 基于 operate({action:'status'}) + operate({action:'evaluate'})。
|
|
8
|
+
//
|
|
9
|
+
// 依赖:./browser(operate)
|
|
10
|
+
|
|
11
|
+
'use strict'
|
|
12
|
+
|
|
13
|
+
const { operate } = require('./browser')
|
|
14
|
+
|
|
15
|
+
// Lower-case substrings that mark a url as a captcha / verification page.
const URL_KEYWORDS = ['captcha', 'verify', 'geetest', 'dingxiang', 'challenge', 'slider', 'recaptcha', 'hcaptcha']
// Substrings (compared case-insensitively) that mark a page title as a captcha page.
const TITLE_KEYWORDS = ['验证', 'captcha', 'security', 'verify', '人机', '安全验证']
// CSS selectors whose presence in the DOM counts as a captcha signal.
// GeeTest is matched by class-name substring: the previous '.geetest_' only
// matched an element whose class token is exactly "geetest_".
// NOTE(review): assumes GeeTest widgets use prefixed classes such as "geetest_holder" - confirm.
const DOM_SELECTORS = [
  '[class*="geetest_"]', '#geetest', '.yidun', '.dx_captcha', '#captcha',
  'iframe[src*="captcha"]', 'iframe[src*="recaptcha"]', 'iframe[src*="hcaptcha"]',
  '.g-recaptcha', '.h-captcha'
]

// Builds the source of a page-side function that returns the subset of
// `selectors` matching at least one element. The list is embedded as JSON.
const DOM_PROBE_SOURCE = (selectors) => '() => {\n' +
  '  const sels = ' + JSON.stringify(selectors) + '\n' +
  '  const found = []\n' +
  '  for (const s of sels) { if (document.querySelector(s)) found.push(s) }\n' +
  '  return found\n' +
  '}'
|
|
30
|
+
|
|
31
|
+
/**
 * Multi-signal captcha detection.
 *
 * Signals: url keyword, title keyword, captcha DOM selector (strong) and
 * "expected content missing" (weak, only when options.expectedSelector is given).
 * Confidence: 2+ strong signals -> 0.95, 1 strong -> 0.7, weak only -> 0.3,
 * none -> 0. isCaptcha is true when confidence >= 0.7.
 *
 * @param {object} pageHandle single page handle
 * @param {object} [options]
 * @param {string} [options.expectedSelector] selector a normal page should contain
 * @returns {Promise<{isCaptcha: boolean, confidence: number, signals: string[], evidence: string[]}>}
 * @throws {Error} when pageHandle is missing
 */
async function detect (pageHandle, options = {}) {
  if (!pageHandle) throw new Error('detect: pageHandle required')

  const signals = []
  const evidence = []
  // Status fields are not guaranteed to be strings; anything else counts as empty.
  const asText = (v) => (typeof v === 'string' ? v : '')

  // Signal 1: url keyword
  const st = await operate(pageHandle, { action: 'status' })
  const url = asText(st && (st.url || st.value)).toLowerCase()
  const urlHit = URL_KEYWORDS.find((k) => url.includes(k))
  if (urlHit) { signals.push('url-keyword'); evidence.push('url contains "' + urlHit + '"') }

  // Signal 2: title keyword
  const title = asText(st && st.title).toLowerCase()
  const titleHit = TITLE_KEYWORDS.find((k) => title.includes(k.toLowerCase()))
  if (titleHit) { signals.push('title-keyword'); evidence.push('title contains "' + titleHit + '"') }

  // Signal 3: captcha DOM (result may be bare or wrapped in { value })
  const found = await operate(pageHandle, { action: 'evaluate', source: DOM_PROBE_SOURCE(DOM_SELECTORS) })
  const foundArr = Array.isArray(found) ? found : (found && Array.isArray(found.value) ? found.value : [])
  if (foundArr.length > 0) { signals.push('dom-selector'); evidence.push('selectors: ' + foundArr.join(', ')) }

  // Signal 4: expected content missing (weak)
  if (options.expectedSelector) {
    // The selector is embedded as a JSON literal. The previous source was
    // '(s) => ...' with nothing ever supplying `s`, so the probe never looked
    // for the requested selector.
    const probeSource = '() => !!document.querySelector(' + JSON.stringify(options.expectedSelector) + ')'
    const exp = await operate(pageHandle, { action: 'evaluate', source: probeSource })
    let present
    if (typeof exp === 'boolean') present = exp
    else if (exp && typeof exp.value === 'boolean') present = exp.value
    else if (exp && typeof exp.result === 'boolean') present = exp.result
    else present = !!exp
    if (!present) { signals.push('expected-missing'); evidence.push('expected "' + options.expectedSelector + '" not found') }
  }

  // Confidence
  const strong = signals.filter((s) => s !== 'expected-missing').length
  const weak = signals.includes('expected-missing') ? 1 : 0
  let confidence = 0
  if (strong >= 2) confidence = 0.95
  else if (strong === 1) confidence = 0.7
  else if (weak === 1) confidence = 0.3

  return { isCaptcha: confidence >= 0.7, confidence, signals, evidence }
}
|
|
84
|
+
|
|
85
|
+
module.exports = { detect }
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
// -*- coding: utf-8 -*-
|
|
2
|
+
//
|
|
3
|
+
// web/crawler — 轻量爬虫(URL 去重 + 深度控制 + 节流,路由无关)
|
|
4
|
+
//
|
|
5
|
+
// 完整复刻自 bee/resources/sandbox/builtin-libs/web/crawler.js。
|
|
6
|
+
// 基于 operate(goto + evaluate 提取链接/内容)组合,复用同一个 pageHandle(顺序导航)。
|
|
7
|
+
//
|
|
8
|
+
// 依赖:./browser(operate)
|
|
9
|
+
|
|
10
|
+
'use strict'
|
|
11
|
+
|
|
12
|
+
const { operate } = require('./browser')
|
|
13
|
+
|
|
14
|
+
// Builds the source of a page-side function that extracts the page title, the
// first 8000 characters of body text and up to 200 distinct http(s) links.
// When `baseDomain` is non-empty only links on that exact hostname are kept.
// Fragments are stripped before de-duplication so "#section" variants of one
// page are not reported (and later crawled) as separate pages.
const EXTRACT_SOURCE = (baseDomain) => '() => {\n' +
  '  const title = document.title || ""\n' +
  '  const text = (document.body && document.body.innerText) ? document.body.innerText.slice(0, 8000) : ""\n' +
  '  const links = []\n' +
  '  const a = document.querySelectorAll("a[href]")\n' +
  '  for (const el of a) {\n' +
  '    try {\n' +
  '      const u = new URL(el.href, location.href)\n' +
  '      if (u.protocol !== "http:" && u.protocol !== "https:") continue\n' +
  '      u.hash = ""\n' +
  '      links.push(u.href)\n' +
  '    } catch {}\n' +
  '  }\n' +
  '  const base = ' + JSON.stringify(baseDomain || '') + '\n' +
  '  const filtered = base ? links.filter((l) => { try { return new URL(l).hostname === base } catch { return false } }) : links\n' +
  '  return { title, text, links: [...new Set(filtered)].slice(0, 200) }\n' +
  '}'
|
|
31
|
+
|
|
32
|
+
/**
 * Create a crawler bound to one page handle (pages are visited sequentially
 * by navigating that same page).
 *
 * @param {object} pageHandle single page handle
 * @param {object} [options]
 * @param {number} [options.maxPages=50] stop once this many results were collected
 * @param {number} [options.maxDepth=3] maximum link depth (start url is depth 0)
 * @param {number} [options.delay=0.5] pause between pages, in seconds
 * @param {number} [options.retries=2] extra attempts after a failed page load
 * @param {boolean} [options.sameDomain=true] only follow links on the start url's hostname
 * @returns {{ crawl: function(string): Promise<Array<{url, title, html, depth, error?}>>, results: Array }}
 *   `results` is the same array crawl() fills and returns; it accumulates
 *   across crawl() calls. Note that `html` holds the extracted body TEXT
 *   (innerText), not markup. Failed pages are recorded with `error: 'fetch failed'`.
 * @throws {Error} when pageHandle is missing
 */
function createCrawler (pageHandle, options = {}) {
  if (!pageHandle) throw new Error('createCrawler: pageHandle required')
  const maxPages = options.maxPages != null ? options.maxPages : 50
  const maxDepth = options.maxDepth != null ? options.maxDepth : 3
  const delaySec = options.delay != null ? options.delay : 0.5
  const retries = options.retries != null ? options.retries : 2
  const sameDomain = options.sameDomain != null ? options.sameDomain : true

  const results = []
  const sleep = (ms) => new Promise((r) => setTimeout(r, ms))

  // Navigate to `url` and extract { title, text, links }, retrying on failure.
  async function fetchPage (url) {
    let lastErr = null
    for (let attempt = 0; attempt <= retries; attempt++) {
      try {
        await operate(pageHandle, { action: 'goto', url, timeout: 15000 })
        // Best effort: a failed wait must not fail the page.
        await operate(pageHandle, { action: 'waitForLoadState', state: 'complete' }).catch(() => {})
        let baseDomain = ''
        try { baseDomain = sameDomain ? new URL(url).hostname : '' } catch {}
        const data = await operate(pageHandle, { action: 'evaluate', source: EXTRACT_SOURCE(baseDomain) })
        // evaluate may return the object bare, or wrapped in { value } / { result }
        const d = (data && data.value && typeof data.value === 'object') ? data.value
          : (data && typeof data === 'object' && !Array.isArray(data) && (data.title || data.links)) ? data
            : (data && data.result) || {}
        return { title: d.title || '', text: d.text || '', links: d.links || [] }
      } catch (e) {
        lastErr = e
        // Back off only when another attempt follows; waiting after the last
        // one just delayed the failure.
        if (attempt < retries) await sleep(500)
      }
    }
    throw lastErr || new Error('fetchPage failed')
  }

  // Breadth-first crawl starting at `startUrl`.
  async function crawl (startUrl) {
    const queue = [{ url: startUrl, depth: 0 }]
    const visited = new Set()
    const startDomain = (() => { try { return new URL(startUrl).hostname } catch { return '' } })()

    while (queue.length > 0 && results.length < maxPages) {
      const { url, depth } = queue.shift()
      if (visited.has(url)) continue
      visited.add(url)
      if (depth > maxDepth) continue

      try {
        const page = await fetchPage(url)
        results.push({ url, title: page.title, html: page.text, depth })
        // Children beyond maxDepth would be discarded on dequeue; skip queuing them.
        if (depth + 1 <= maxDepth) {
          for (const link of page.links) {
            if (visited.has(link)) continue
            if (sameDomain && startDomain) {
              try { if (new URL(link).hostname !== startDomain) continue } catch { continue }
            }
            queue.push({ url: link, depth: depth + 1 })
          }
        }
      } catch {
        results.push({ url, title: '', html: '', depth, error: 'fetch failed' })
      }

      if (delaySec > 0) await sleep(delaySec * 1000)
    }
    return results
  }

  return { crawl, results }
}
|
|
108
|
+
|
|
109
|
+
module.exports = { createCrawler }
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
// -*- coding: utf-8 -*-
|
|
2
|
+
//
|
|
3
|
+
// web/find — 元素定位(路由无关)
|
|
4
|
+
//
|
|
5
|
+
// 通用能力: 按文本/selector 找元素, 返回坐标 + CSS path。
|
|
6
|
+
// 解决 SPA 两大定位问题:
|
|
7
|
+
// ① 标题被拆成多 span → 叶子精确匹配失败 → 兜底取包含匹配最深层
|
|
8
|
+
// ② aria snapshot 抓不到 (CSS Modules 非语义 DOM) → 不依赖 aria-ref
|
|
9
|
+
//
|
|
10
|
+
// clickByText: 基于此的点击封装 (sycm 这类页面文本是唯一稳定位)
|
|
11
|
+
//
|
|
12
|
+
// 依赖:./browser(operate)
|
|
13
|
+
|
|
14
|
+
'use strict'
|
|
15
|
+
|
|
16
|
+
const { operate } = require('./browser')
|
|
17
|
+
|
|
18
|
+
/**
 * Find an element on the page and return its rectangle plus a short CSS path.
 *
 * Text strategy (copes with titles split across several spans):
 *   1. a leaf element (no element children) whose trimmed innerText equals `text`
 *   2. otherwise any element whose innerText contains `text` (or, with
 *      exact=true, whose trimmed innerText equals it), preferring the one with
 *      the fewest descendants, i.e. the tightest wrapper around the text
 * With visibleOnly=true, invisible elements are skipped while searching, so a
 * hidden duplicate earlier in the document no longer hides a visible match.
 *
 * @param {Object} pageHandle
 * @param {object} criteria
 * @param {string} [criteria.text] text to look for
 * @param {boolean} [criteria.exact=false] require the fallback match to be exact
 * @param {string} [criteria.selector] CSS selector (takes precedence over text)
 * @param {boolean} [criteria.visibleOnly=false] only return a visible element
 * @returns {Promise<{found:true, x:number, y:number, width:number, height:number, path:string, text:string}|{found:false}>}
 *   coordinates come from getBoundingClientRect (viewport-relative), rounded.
 *   Any evaluate failure yields { found: false }.
 * @throws {Error} when neither text nor selector is given
 */
async function findElement (pageHandle, criteria = {}) {
  const { text, exact, selector, visibleOnly } = criteria
  if (!text && !selector) throw new Error('findElement: text 或 selector 必须传一个')

  const source = `(() => {
      const text = ${JSON.stringify(text || '')}
      const exact = ${!!exact}
      const selector = ${JSON.stringify(selector || '')}
      const visibleOnly = ${!!visibleOnly}
      const isVisible = (el) => {
        if (!el) return false
        const r = el.getBoundingClientRect()
        const s = getComputedStyle(el)
        return r.width > 0 && r.height > 0 && s.visibility !== 'hidden' && s.display !== 'none' && s.opacity !== '0'
      }
      const eligible = (el) => !visibleOnly || isVisible(el)
      const trimmed = (el) => (el.innerText || '').trim()
      let hit = null
      if (selector) {
        hit = visibleOnly
          ? ([...document.querySelectorAll(selector)].find(isVisible) || null)
          : document.querySelector(selector)
      } else if (text) {
        const all = [...document.querySelectorAll('*')]
        // 1. exact leaf
        hit = all.find(el => el.children.length === 0 && trimmed(el) === text && eligible(el))
        // 2. fallback: containing element with the fewest descendants
        if (!hit) {
          const cands = all
            .filter(el => (el.innerText || '').includes(text) && (!exact || trimmed(el) === text) && eligible(el))
            .map(el => ({ el, size: el.querySelectorAll('*').length }))
          cands.sort((a, b) => a.size - b.size)
          hit = cands.length ? cands[0].el : null
        }
      }
      if (!hit) return { found: false }
      // short CSS path, at most 4 levels, stopping below <body>
      const path = []
      let cur = hit
      let depth = 0
      while (cur && cur !== document.body && depth < 4) {
        let seg = cur.tagName.toLowerCase()
        if (cur.id) seg += '#' + cur.id
        else if (cur.className) {
          const c = (cur.className || '').toString().trim().split(/\\s+/).filter(x => x).slice(0, 1).join('.')
          if (c) seg += '.' + c
        }
        path.unshift(seg)
        cur = cur.parentElement
        depth++
      }
      const r = hit.getBoundingClientRect()
      return {
        found: true,
        x: Math.round(r.x), y: Math.round(r.y),
        width: Math.round(r.width), height: Math.round(r.height),
        path: path.join(' > '),
        text: (hit.innerText || '').trim().slice(0, 60)
      }
    })()`

  const raw = await operate(pageHandle, {
    action: 'evaluate',
    source,
    _skipVisualize: true
  }).catch(() => null)

  // evaluate may hand the object back bare or wrapped in { value } / { result };
  // accept whichever layer actually carries the boolean 'found' flag.
  const layers = [raw, raw && raw.value, raw && raw.result]
  const result = layers.find((c) => c && typeof c === 'object' && typeof c.found === 'boolean')
  return result || { found: false }
}
|
|
93
|
+
|
|
94
|
+
/**
 * Click an element located by its text.
 *
 * The element is first located with findElement; if it is not found and
 * opts.scrollIntoView is not false, the page is scrolled down in small random
 * steps (at most 40) and the lookup repeated until it appears or the bottom is
 * reached. The click itself re-resolves the element inside the page:
 *   1. a leaf whose trimmed text equals `text`
 *   2. unless exact=true, a leaf whose trimmed text starts with `text`
 *      (tolerates suffixes such as "Visitors >")
 *   3. the tightest element containing the text (equal to it when exact=true),
 *      which covers titles split across several spans
 * and then clicks its nearest clickable ancestor (button, a, [role=button],
 * [onclick]) or the element itself.
 *
 * @param {Object} pageHandle
 * @param {string} text
 * @param {object} [opts]
 * @param {boolean} [opts.exact=false] true = no prefix matching
 * @param {boolean} [opts.scrollIntoView=true] false disables the scroll-down probing
 * @returns {Promise<{clicked:boolean, path?:string, text?:string, error?:string}>}
 */
async function clickByText (pageHandle, text, opts = {}) {
  const exact = !!opts.exact

  // evaluate may return a bare boolean or wrap it in { value } / { result }.
  const asBool = (res, fallback) => {
    if (typeof res === 'boolean') return res
    if (res && typeof res.value === 'boolean') return res.value
    if (res && typeof res.result === 'boolean') return res.result
    return fallback
  }

  let el = await findElement(pageHandle, { text, exact })

  // Not found: probe downwards, the element may sit in a lazily loaded area.
  if (!el.found && opts.scrollIntoView !== false) {
    for (let i = 0; i < 40; i++) {
      await operate(pageHandle, {
        action: 'evaluate',
        source: '() => { window.scrollBy(0, 40 + Math.floor(Math.random()*30)); return null }',
        _skipVisualize: true
      })
      await new Promise(r => setTimeout(r, 60 + Math.random() * 60))
      el = await findElement(pageHandle, { text, exact })
      if (el.found) break
      const bottomRes = await operate(pageHandle, { action: 'evaluate', source: '()=>(window.innerHeight + (window.scrollY||0) + 5) >= document.body.scrollHeight', _skipVisualize: true }).catch(() => false)
      if (asBool(bottomRes, !!bottomRes)) break
    }
  }
  if (!el.found) return { clicked: false, error: 'not found: ' + text }

  // Click by re-resolving the text inside the page rather than via el.path
  // (the path may end on a span that does not react to clicks).
  const needle = JSON.stringify(text)
  const prefixFallback = exact ? '' : ` || leaves.find(e => trimmed(e).startsWith(${needle}))`
  const containerTest = exact ? `trimmed(e) === ${needle}` : `(e.innerText || '').includes(${needle})`
  const clickRes = await operate(pageHandle, {
    action: 'evaluate',
    source: `(() => {
      const all = [...document.querySelectorAll('*')]
      const trimmed = (e) => (e.innerText || '').trim()
      const leaves = all.filter(e => e.children.length === 0)
      let hit = leaves.find(e => trimmed(e) === ${needle})${prefixFallback}
      if (!hit) {
        // same fallback findElement uses: tightest element around the text
        const containers = all
          .filter(e => ${containerTest})
          .map(e => ({ e, size: e.querySelectorAll('*').length }))
        containers.sort((a, b) => a.size - b.size)
        hit = containers.length ? containers[0].e : null
      }
      if (!hit) return false
      // click the clickable ancestor (button/a), not just the span
      const clickable = hit.closest('button, a, [role=button], [onclick]') || hit
      // highlight the very element that gets clicked
      if (window.visualMark) window.visualMark(clickable)
      clickable.click()
      return true
    })()`,
    _skipVisualize: true
  })
  // Anything but an explicit false counts as clicked (unchanged for bare results).
  return { clicked: asBool(clickRes, true), path: el.path, text: el.text }
}
|
|
146
|
+
|
|
147
|
+
module.exports = { findElement, clickByText }
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
// -*- coding: utf-8 -*-
|
|
2
|
+
//
|
|
3
|
+
// web/honeypot — 蜜罐陷阱检测(路由无关)
|
|
4
|
+
//
|
|
5
|
+
// 完整复刻自 bee/resources/sandbox/builtin-libs/web/honeypot.js。
|
|
6
|
+
// 扫描页面隐藏的陷阱元素(隐藏 link/input/button),避免触发反爬。
|
|
7
|
+
// 基于 operate({action:'evaluate'}) 跑页面 JS。
|
|
8
|
+
//
|
|
9
|
+
// 依赖:./browser(operate)
|
|
10
|
+
|
|
11
|
+
'use strict'
|
|
12
|
+
|
|
13
|
+
const { operate } = require('./browser')
|
|
14
|
+
|
|
15
|
+
// Source of the page-side probe that lists hidden "trap" elements.
// An element (a[href], input, button, [role=button], [onclick]) is reported when
// it, or one of its first three ancestors, is display:none, opacity:0,
// visibility:hidden, zero-sized, or positioned off the top/left of the DOCUMENT.
// getBoundingClientRect() is viewport-relative, so the scroll offset is added
// back before the off-page test; otherwise ordinary content that was merely
// scrolled out of view would be reported as a trap.
const SCAN_SOURCE = '() => {\n' +
  '  const traps = []\n' +
  '  const isHidden = (el) => {\n' +
  '    const s = getComputedStyle(el)\n' +
  '    if (s.display === "none") return "display:none"\n' +
  '    if (parseFloat(s.opacity) === 0) return "opacity:0"\n' +
  '    if (s.visibility === "hidden") return "visibility:hidden"\n' +
  '    const r = el.getBoundingClientRect()\n' +
  '    if (r.width === 0 || r.height === 0) return "zero-size"\n' +
  '    if (r.bottom + (window.scrollY || 0) < 0) return "offscreen-top"\n' +
  '    if (r.right + (window.scrollX || 0) < 0) return "offscreen-left"\n' +
  '    return null\n' +
  '  }\n' +
  '  const walk = (el, depth) => {\n' +
  '    if (!el || depth > 3) return null\n' +
  '    const reason = isHidden(el)\n' +
  '    if (reason) return reason + (depth > 0 ? "(parent)" : "")\n' +
  '    return walk(el.parentElement, depth + 1)\n' +
  '  }\n' +
  '  const sel = "a[href], input, button, [role=button], [onclick]"\n' +
  '  document.querySelectorAll(sel).forEach((el) => {\n' +
  '    const reason = walk(el, 0)\n' +
  '    if (reason) {\n' +
  '      traps.push({\n' +
  '        tag: el.tagName.toLowerCase(),\n' +
  '        text: (el.innerText || el.value || "").slice(0, 80),\n' +
  '        href: el.getAttribute("href") || null,\n' +
  '        name: el.getAttribute("name") || null,\n' +
  '        type: el.getAttribute("type") || null,\n' +
  '        reason,\n' +
  '        rect: (() => { const r = el.getBoundingClientRect(); return { x: r.x, y: r.y, w: r.width, h: r.height } })(),\n' +
  '      })\n' +
  '    }\n' +
  '  })\n' +
  '  return traps\n' +
  '}'
|
|
53
|
+
|
|
54
|
+
/**
 * Scan the page for hidden trap elements by evaluating SCAN_SOURCE in it.
 *
 * @param {object} pageHandle single page handle (from newTab)
 * @returns {Promise<Array<{tag,text,href,name,type,reason,rect}>>} the trap list;
 *   an empty array when the result has no recognisable array in it
 * @throws {Error} when pageHandle is missing
 */
async function scan (pageHandle) {
  if (!pageHandle) {
    throw new Error('scan: pageHandle required')
  }
  const outcome = await operate(pageHandle, { action: 'evaluate', source: SCAN_SOURCE })
  // The result may be the bare array or an envelope carrying it under
  // `value` or `result`; accept all three shapes.
  if (Array.isArray(outcome)) return outcome
  if (outcome && Array.isArray(outcome.value)) return outcome.value
  if (outcome && Array.isArray(outcome.result)) return outcome.result
  return []
}
|
|
67
|
+
|
|
68
|
+
module.exports = { scan }
|