@jackwener/opencli 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLI-CREATOR.md +102 -5
- package/README.md +5 -5
- package/README.zh-CN.md +5 -5
- package/SKILL.md +15 -1
- package/dist/browser.d.ts +5 -1
- package/dist/browser.js +33 -5
- package/dist/clis/bilibili/subtitle.d.ts +1 -0
- package/dist/clis/bilibili/subtitle.js +86 -0
- package/dist/explore.js +50 -0
- package/dist/main.js +2 -2
- package/dist/pipeline/steps/browser.js +4 -8
- package/dist/pipeline/steps/fetch.js +19 -6
- package/dist/pipeline/steps/tap.js +8 -6
- package/dist/types.d.ts +5 -1
- package/package.json +1 -1
- package/src/browser.ts +33 -6
- package/src/clis/bilibili/subtitle.ts +100 -0
- package/src/explore.ts +51 -0
- package/src/main.ts +2 -2
- package/src/pipeline/steps/browser.ts +4 -7
- package/src/pipeline/steps/fetch.ts +22 -6
- package/src/pipeline/steps/tap.ts +8 -6
- package/src/types.ts +1 -1
package/CLI-CREATOR.md
CHANGED
|
@@ -57,8 +57,9 @@ opencli bilibili hot -v # 查看已有命令的 pipeline 每步数据流
|
|
|
57
57
|
|
|
58
58
|
1. **后缀爆破法 (`.json`)**: 像 Reddit 这样复杂的网站,只要在其 URL 后加上 `.json`(例如 `/r/all.json`),就能在带 Cookie 的情况下直接利用 `fetch` 拿到极其干净的 REST 数据(Tier 2 Cookie 策略极速秒杀)。另外如功能完备的**雪球 (xueqiu)** 也可以走这种纯 API 的方式极简获取,成为你构建简单 YAML 的黄金标杆。
|
|
59
59
|
2. **全局状态查找法 (`__INITIAL_STATE__`)**: 许多服务端渲染 (SSR) 的网站(如小红书、Bilibili)会将首页或详情页的完整数据挂载到全局 window 对象上。与其去拦截网络请求,不如直接 `page.evaluate('() => window.__INITIAL_STATE__')` 获取整个数据树。
|
|
60
|
-
3.
|
|
61
|
-
4.
|
|
60
|
+
3. **主动交互触发法 (Active Interaction)**: 很多深层 API(如视频字幕、评论下的回复)是懒加载的。在静态抓包找不到数据时,尝试在 `evaluate` 步骤或手动打断点时,主动去**点击(Click)页面上的对应按钮**(如"CC"、"展开全部"),从而诱发隐藏的 Network Fetch。
|
|
61
|
+
4. **框架探测与 Store Action 截断**: 如果站点使用 Vue + Pinia,可以使用 `tap` 步骤调用 action,让前端框架代替你完成复杂的鉴权签名封装。
|
|
62
|
+
5. **底层 XHR/Fetch 拦截**: 最后手段,当上述都不行时,使用 TypeScript 适配器进行无侵入式的请求抓取。
|
|
62
63
|
|
|
63
64
|
### 1d. 框架检测
|
|
64
65
|
|
|
@@ -411,6 +412,35 @@ cli({
|
|
|
411
412
|
|
|
412
413
|
> **拦截核心思路**:不自己构造签名,而是利用 `installInterceptor` 劫持网站自己的 `XMLHttpRequest` 和 `fetch`,让网站发请求,我们直接在底层取出解析好的 `response.json()`。
|
|
413
414
|
|
|
415
|
+
#### 进阶场景 1: 级联请求 (Cascading Requests) 与鉴权绕过
|
|
416
|
+
|
|
417
|
+
部分数据需要通过非常复杂的级联请求才能拿到(例如 B 站获取视频字幕:先用 `bvid` 获取核心 `cid`,再用 `cid` 获取带签名/Wbi 校验的字幕列表拉取地址,最后 fetch 真实的 CDN 资源)。在此类场景中,你必须在一个 `evaluate` 块内部,或者在 TypeScript Node 端编排整个请求链条:
|
|
418
|
+
|
|
419
|
+
```typescript
|
|
420
|
+
// 真实场景:B站获取视频字幕的级联获取思路
|
|
421
|
+
const subtitleUrls = await page.evaluate(async (bvid) => {
|
|
422
|
+
// Step 1: 拿 CID (通常可以通过页面全局状态极速提取)
|
|
423
|
+
const cid = window.__INITIAL_STATE__?.videoData?.cid;
|
|
424
|
+
|
|
425
|
+
// Step 2: 依据 BVID 和 CID 拿字幕配置 (可能需要携带 W_RID 签名或依赖浏览器当前登录状态 Cookie)
|
|
426
|
+
const res = await fetch(`/x/player/wbi/v2?bvid=${bvid}&cid=${cid}`, { credentials: 'include' });
|
|
427
|
+
const data = await res.json();
|
|
428
|
+
|
|
429
|
+
// Step 3: 风控拦截/未登录降级空值检测 (Anti-Bot Empty Value Detection) ⚠️ 极其重要
|
|
430
|
+
// 很多大厂 API 只要签名失败或无强登录 Cookie 依然会返回 HTTP 200,但把关键 URL 设为 ""
|
|
431
|
+
const firstSubUrl = data.data?.subtitle?.subtitles?.[0]?.subtitle_url;
|
|
432
|
+
if (!firstSubUrl) {
|
|
433
|
+
throw new Error('被风控降级或需登录:拿不到真实的 subtitle_url,请检查 Cookie 状态 (Tier 2/3)');
|
|
434
|
+
}
|
|
435
|
+
|
|
436
|
+
return firstSubUrl;
|
|
437
|
+
}, kwargs.bvid);
|
|
438
|
+
|
|
439
|
+
// Step 4: 拉取最终的 CDN 静态文件 (无鉴权)
|
|
440
|
+
const finalRes = await fetch(subtitleUrls.startsWith('//') ? 'https:' + subtitleUrls : subtitleUrls);
|
|
441
|
+
const subtitles = await finalRes.json();
|
|
442
|
+
```
|
|
443
|
+
|
|
414
444
|
---
|
|
415
445
|
|
|
416
446
|
## Step 4: 测试
|
|
@@ -539,6 +569,70 @@ git push
|
|
|
539
569
|
|
|
540
570
|
---
|
|
541
571
|
|
|
572
|
+
## 进阶模式: 级联请求 (Cascading Requests)
|
|
573
|
+
|
|
574
|
+
当目标数据需要多步 API 链式获取时(如 `BVID → CID → 字幕列表 → 字幕内容`),必须使用 **TS 适配器**。YAML 无法处理这种多步逻辑。
|
|
575
|
+
|
|
576
|
+
### 模板代码
|
|
577
|
+
|
|
578
|
+
```typescript
|
|
579
|
+
import { cli, Strategy } from '../../registry.js';
|
|
580
|
+
import type { IPage } from '../../types.js';
|
|
581
|
+
import { apiGet } from '../../bilibili.js'; // 复用平台 SDK
|
|
582
|
+
|
|
583
|
+
cli({
|
|
584
|
+
site: 'bilibili',
|
|
585
|
+
name: 'subtitle',
|
|
586
|
+
strategy: Strategy.COOKIE,
|
|
587
|
+
args: [{ name: 'bvid', required: true }],
|
|
588
|
+
columns: ['index', 'from', 'to', 'content'],
|
|
589
|
+
func: async (page: IPage | null, kwargs: any) => {
|
|
590
|
+
if (!page) throw new Error('Requires browser');
|
|
591
|
+
|
|
592
|
+
// Step 1: 建立 Session
|
|
593
|
+
await page.goto(`https://www.bilibili.com/video/${kwargs.bvid}/`);
|
|
594
|
+
|
|
595
|
+
// Step 2: 从页面提取中间 ID (__INITIAL_STATE__)
|
|
596
|
+
const cid = await page.evaluate(`(async () => {
|
|
597
|
+
return window.__INITIAL_STATE__?.videoData?.cid;
|
|
598
|
+
})()`);
|
|
599
|
+
if (!cid) throw new Error('无法提取 CID');
|
|
600
|
+
|
|
601
|
+
// Step 3: 用中间 ID 调用下一级 API (自动 Wbi 签名)
|
|
602
|
+
const payload = await apiGet(page, '/x/player/wbi/v2', {
|
|
603
|
+
params: { bvid: kwargs.bvid, cid },
|
|
604
|
+
signed: true, // ← 自动生成 w_rid
|
|
605
|
+
});
|
|
606
|
+
|
|
607
|
+
// Step 4: 检测风控降级 (空值断言)
|
|
608
|
+
const subtitles = payload.data?.subtitle?.subtitles || [];
|
|
609
|
+
const url = subtitles[0]?.subtitle_url;
|
|
610
|
+
if (!url) throw new Error('subtitle_url 为空,疑似风控降级');
|
|
611
|
+
|
|
612
|
+
// Step 5: 拉取最终数据 (CDN JSON)
|
|
613
|
+
const items = await page.evaluate(`(async () => {
|
|
614
|
+
const res = await fetch(${JSON.stringify(url.startsWith('//') ? 'https:' + url : url)});
|
|
615
|
+
const json = await res.json();
|
|
616
|
+
return { data: json.body || json };
|
|
617
|
+
})()`);
|
|
618
|
+
|
|
619
|
+
return items.data.map((item, idx) => ({ ... }));
|
|
620
|
+
},
|
|
621
|
+
});
|
|
622
|
+
```
|
|
623
|
+
|
|
624
|
+
### 关键要点
|
|
625
|
+
|
|
626
|
+
| 步骤 | 注意事项 |
|
|
627
|
+
|------|----------|
|
|
628
|
+
| 提取中间 ID | 优先从 `__INITIAL_STATE__` 拿,避免额外 API 调用 |
|
|
629
|
+
| Wbi 签名 | B 站 `/wbi/` 接口**强制校验** `w_rid`,纯 `fetch` 会被 403 |
|
|
630
|
+
| 空值断言 | 即使 HTTP 200,核心字段可能为空串(风控降级) |
|
|
631
|
+
| CDN URL | 常以 `//` 开头,记得补 `https:` |
|
|
632
|
+
| `JSON.stringify` | 拼接 URL 到 evaluate 时必须用它转义,避免注入 |
|
|
633
|
+
|
|
634
|
+
---
|
|
635
|
+
|
|
542
636
|
## 常见陷阱
|
|
543
637
|
|
|
544
638
|
| 陷阱 | 表现 | 解决方案 |
|
|
@@ -553,6 +647,8 @@ git push
|
|
|
553
647
|
| TS evaluate 格式 | `() => {}` 报 `result is not a function` | TS 中 `page.evaluate()` 必须用 IIFE:`(async () => { ... })()` |
|
|
554
648
|
| 页面异步加载 | evaluate 拿到空数据(store state 还没更新) | 在 evaluate 内用 polling 等待数据出现,或增加 `wait` 时间 |
|
|
555
649
|
| YAML 内嵌大段 JS | 调试困难,字符串转义问题 | 超过 10 行 JS 的命令改用 TS adapter |
|
|
650
|
+
| **风控被拦截(伪200)** | 获取到的 JSON 里核心数据是 `""` (空串) | 极易被误判。必须添加断言!无核心数据立刻要求升级鉴权 Tier 并重新配置 Cookie |
|
|
651
|
+
| **找不到目标 API** | `explore` 工具评分靠前的端点都拿不到深层数据 | 先点击页面按钮诱发懒加载请求,再结合 `getInterceptedRequests` 获取 |
|
|
556
652
|
|
|
557
653
|
---
|
|
558
654
|
|
|
@@ -565,9 +661,10 @@ git push
|
|
|
565
661
|
opencli generate https://www.example.com --goal "hot"
|
|
566
662
|
|
|
567
663
|
# 或分步执行:
|
|
568
|
-
opencli explore https://www.example.com --site mysite
|
|
569
|
-
opencli
|
|
570
|
-
opencli
|
|
664
|
+
opencli explore https://www.example.com --site mysite # 发现 API
|
|
665
|
+
opencli explore https://www.example.com --auto --click "字幕,CC" # 模拟点击触发懒加载 API
|
|
666
|
+
opencli synthesize mysite # 生成候选 YAML
|
|
667
|
+
opencli verify mysite/hot --smoke # 冒烟测试
|
|
571
668
|
```
|
|
572
669
|
|
|
573
670
|
生成的候选 YAML 保存在 `.opencli/explore/mysite/candidates/`,可直接复制到 `src/clis/mysite/` 并微调。
|
package/README.md
CHANGED
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
|
|
8
8
|
[](https://www.npmjs.com/package/@jackwener/opencli)
|
|
9
9
|
|
|
10
|
-
A CLI tool that turns **any website** into a command-line interface. **
|
|
10
|
+
A CLI tool that turns **any website** into a command-line interface. **46 commands** across **17 sites** — bilibili, zhihu, xiaohongshu, twitter, reddit, xueqiu, github, v2ex, hackernews, bbc, weibo, boss, yahoo-finance, reuters, smzdm, ctrip, youtube — powered by browser session reuse and AI-native discovery.
|
|
11
11
|
|
|
12
12
|
## ✨ Highlights
|
|
13
13
|
|
|
@@ -82,12 +82,12 @@ Public API commands (`hackernews`, `github search`, `v2ex`) need no browser at a
|
|
|
82
82
|
|
|
83
83
|
| Site | Commands | Mode |
|
|
84
84
|
|------|----------|------|
|
|
85
|
-
| **bilibili** | `hot` `search` `me` `favorite` `history` `feed` `user-videos` | 🔐 Browser |
|
|
85
|
+
| **bilibili** | `hot` `search` `me` `favorite` `history` `feed` `user-videos` `subtitle` `dynamic` `ranking` | 🔐 Browser |
|
|
86
86
|
| **zhihu** | `hot` `search` `question` | 🔐 Browser |
|
|
87
|
-
| **xiaohongshu** | `search` `notifications` `feed` | 🔐 Browser |
|
|
87
|
+
| **xiaohongshu** | `search` `notifications` `feed` `me` `user` | 🔐 Browser |
|
|
88
88
|
| **xueqiu** | `feed` `hot-stock` `hot` `search` `stock` `watchlist` | 🔐 Browser |
|
|
89
|
-
| **twitter** | `trending` `bookmarks` | 🔐 Browser |
|
|
90
|
-
| **reddit** | `hot` | 🔐 Browser |
|
|
89
|
+
| **twitter** | `trending` `bookmarks` `profile` `search` `timeline` | 🔐 Browser |
|
|
90
|
+
| **reddit** | `hot` `frontpage` `search` `subreddit` | 🔐 Browser |
|
|
91
91
|
| **weibo** | `hot` | 🔐 Browser |
|
|
92
92
|
| **boss** | `search` | 🔐 Browser |
|
|
93
93
|
| **youtube** | `search` | 🔐 Browser |
|
package/README.zh-CN.md
CHANGED
|
@@ -11,7 +11,7 @@ OpenCLI 通过 Chrome 浏览器 + [Playwright MCP Bridge](https://github.com/nic
|
|
|
11
11
|
|
|
12
12
|
## ✨ 亮点
|
|
13
13
|
|
|
14
|
-
- 🌐 **
|
|
14
|
+
- 🌐 **46 个命令,17 个站点** — B站、知乎、小红书、Twitter、Reddit、雪球(xueqiu)、GitHub、V2EX、Hacker News、BBC、微博、BOSS直聘、Yahoo Finance、路透社、什么值得买、携程、YouTube
|
|
15
15
|
- 🔐 **零风控** — 复用 Chrome 登录态,无需存储任何凭证
|
|
16
16
|
- 🤖 **AI 原生** — `explore` 自动发现 API,`synthesize` 生成适配器,`cascade` 探测认证策略
|
|
17
17
|
- 🚀 **动态加载引擎** — 只需将 `.ts` 或 `.yaml` 适配器放入 `clis/` 文件夹即可自动注册生效
|
|
@@ -83,12 +83,12 @@ npm install -g @jackwener/opencli@latest
|
|
|
83
83
|
|
|
84
84
|
| 站点 | 命令 | 模式 |
|
|
85
85
|
|------|------|------|
|
|
86
|
-
| **bilibili** | `hot` `search` `me` `favorite` `history` `feed` `user-videos` | 🔐 浏览器 |
|
|
86
|
+
| **bilibili** | `hot` `search` `me` `favorite` `history` `feed` `user-videos` `subtitle` `dynamic` `ranking` | 🔐 浏览器 |
|
|
87
87
|
| **zhihu** | `hot` `search` `question` | 🔐 浏览器 |
|
|
88
|
-
| **xiaohongshu** | `search` `notifications` `feed` | 🔐 浏览器 |
|
|
88
|
+
| **xiaohongshu** | `search` `notifications` `feed` `me` `user` | 🔐 浏览器 |
|
|
89
89
|
| **xueqiu** | `feed` `hot-stock` `hot` `search` `stock` `watchlist` | 🔐 浏览器 |
|
|
90
|
-
| **twitter** | `trending` `bookmarks` | 🔐 浏览器 |
|
|
91
|
-
| **reddit** | `hot` | 🔐 浏览器 |
|
|
90
|
+
| **twitter** | `trending` `bookmarks` `profile` `search` `timeline` | 🔐 浏览器 |
|
|
91
|
+
| **reddit** | `hot` `frontpage` `search` `subreddit` | 🔐 浏览器 |
|
|
92
92
|
| **weibo** | `hot` | 🔐 浏览器 |
|
|
93
93
|
| **boss** | `search` | 🔐 浏览器 |
|
|
94
94
|
| **youtube** | `search` | 🔐 浏览器 |
|
package/SKILL.md
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: opencli
|
|
3
3
|
description: "OpenCLI — Make any website your CLI. Zero risk, AI-powered, reuse Chrome login."
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.4.0
|
|
5
5
|
author: jackwener
|
|
6
6
|
tags: [cli, browser, web, mcp, playwright, bilibili, zhihu, twitter, github, v2ex, hackernews, reddit, xiaohongshu, xueqiu, AI, agent]
|
|
7
7
|
---
|
|
@@ -49,6 +49,9 @@ opencli bilibili favorite # 我的收藏
|
|
|
49
49
|
opencli bilibili history --limit 20 # 观看历史
|
|
50
50
|
opencli bilibili feed --limit 10 # 动态时间线
|
|
51
51
|
opencli bilibili user-videos --uid 12345 # 用户投稿
|
|
52
|
+
opencli bilibili subtitle --bvid BV1xxx # 获取视频字幕 (支持 --lang zh-CN)
|
|
53
|
+
opencli bilibili dynamic --limit 10 # 动态
|
|
54
|
+
opencli bilibili ranking --limit 10 # 排行榜
|
|
52
55
|
|
|
53
56
|
# 知乎 (browser)
|
|
54
57
|
opencli zhihu hot --limit 10 # 知乎热榜
|
|
@@ -59,6 +62,8 @@ opencli zhihu question --id 34816524 # 问题详情和回答
|
|
|
59
62
|
opencli xiaohongshu search --keyword "美食" # 搜索笔记
|
|
60
63
|
opencli xiaohongshu notifications # 通知(mentions/likes/connections)
|
|
61
64
|
opencli xiaohongshu feed --limit 10 # 推荐 Feed
|
|
65
|
+
opencli xiaohongshu me # 我的信息
|
|
66
|
+
opencli xiaohongshu user --uid xxx # 用户主页
|
|
62
67
|
|
|
63
68
|
# 雪球 Xueqiu (browser)
|
|
64
69
|
opencli xueqiu hot-stock --limit 10 # 雪球热门股票榜
|
|
@@ -73,10 +78,16 @@ opencli github search --keyword "cli" # 搜索仓库
|
|
|
73
78
|
# Twitter/X (browser)
|
|
74
79
|
opencli twitter trending --limit 10 # 热门话题
|
|
75
80
|
opencli twitter bookmarks --limit 20 # 获取收藏的书签推文
|
|
81
|
+
opencli twitter search --keyword "AI" # 搜索推文
|
|
82
|
+
opencli twitter profile --username elonmusk # 用户资料
|
|
83
|
+
opencli twitter timeline --limit 20 # 时间线
|
|
76
84
|
|
|
77
85
|
# Reddit (browser)
|
|
78
86
|
opencli reddit hot --limit 10 # 热门帖子
|
|
79
87
|
opencli reddit hot --subreddit programming # 指定子版块
|
|
88
|
+
opencli reddit frontpage --limit 10 # 首页
|
|
89
|
+
opencli reddit search --keyword "AI" # 搜索
|
|
90
|
+
opencli reddit subreddit --name rust # 子版块浏览
|
|
80
91
|
|
|
81
92
|
# V2EX (public)
|
|
82
93
|
opencli v2ex hot --limit 10 # 热门话题
|
|
@@ -135,6 +146,9 @@ opencli generate <url> --goal "hot"
|
|
|
135
146
|
# Strategy Cascade: auto-probe PUBLIC → COOKIE → HEADER
|
|
136
147
|
opencli cascade <api-url>
|
|
137
148
|
|
|
149
|
+
# Explore with interactive fuzzing (click buttons to trigger lazy APIs)
|
|
150
|
+
opencli explore <url> --auto --click "字幕,CC,评论"
|
|
151
|
+
|
|
138
152
|
# Verify: smoke-test a generated adapter
|
|
139
153
|
opencli verify <site/name> --smoke
|
|
140
154
|
```
|
package/dist/browser.d.ts
CHANGED
|
@@ -23,7 +23,11 @@ export declare class Page implements IPage {
|
|
|
23
23
|
click(ref: string): Promise<void>;
|
|
24
24
|
typeText(ref: string, text: string): Promise<void>;
|
|
25
25
|
pressKey(key: string): Promise<void>;
|
|
26
|
-
wait(
|
|
26
|
+
wait(options: number | {
|
|
27
|
+
text?: string;
|
|
28
|
+
time?: number;
|
|
29
|
+
timeout?: number;
|
|
30
|
+
}): Promise<void>;
|
|
27
31
|
tabs(): Promise<any>;
|
|
28
32
|
closeTab(index?: number): Promise<void>;
|
|
29
33
|
newTab(): Promise<void>;
|
package/dist/browser.js
CHANGED
|
@@ -110,8 +110,14 @@ export class Page {
|
|
|
110
110
|
async pressKey(key) {
|
|
111
111
|
await this.call('tools/call', { name: 'browser_press_key', arguments: { key } });
|
|
112
112
|
}
|
|
113
|
-
async wait(
|
|
114
|
-
|
|
113
|
+
async wait(options) {
|
|
114
|
+
if (typeof options === 'number') {
|
|
115
|
+
await this.call('tools/call', { name: 'browser_wait_for', arguments: { time: options } });
|
|
116
|
+
}
|
|
117
|
+
else {
|
|
118
|
+
// Pass directly to native wait_for, which supports natively awaiting text strings without heavy DOM polling
|
|
119
|
+
await this.call('tools/call', { name: 'browser_wait_for', arguments: options });
|
|
120
|
+
}
|
|
115
121
|
}
|
|
116
122
|
async tabs() {
|
|
117
123
|
return this.call('tools/call', { name: 'browser_tabs', arguments: { action: 'list' } });
|
|
@@ -137,10 +143,32 @@ export class Page {
|
|
|
137
143
|
async autoScroll(options = {}) {
|
|
138
144
|
const times = options.times ?? 3;
|
|
139
145
|
const delayMs = options.delayMs ?? 2000;
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
146
|
+
const js = `
|
|
147
|
+
async () => {
|
|
148
|
+
const maxTimes = ${times};
|
|
149
|
+
const maxWaitMs = ${delayMs};
|
|
150
|
+
for (let i = 0; i < maxTimes; i++) {
|
|
151
|
+
const lastHeight = document.body.scrollHeight;
|
|
152
|
+
window.scrollTo(0, lastHeight);
|
|
153
|
+
await new Promise(resolve => {
|
|
154
|
+
let timeoutId;
|
|
155
|
+
const observer = new MutationObserver(() => {
|
|
156
|
+
if (document.body.scrollHeight > lastHeight) {
|
|
157
|
+
clearTimeout(timeoutId);
|
|
158
|
+
observer.disconnect();
|
|
159
|
+
setTimeout(resolve, 100); // Small debounce for rendering
|
|
160
|
+
}
|
|
161
|
+
});
|
|
162
|
+
observer.observe(document.body, { childList: true, subtree: true });
|
|
163
|
+
timeoutId = setTimeout(() => {
|
|
164
|
+
observer.disconnect();
|
|
165
|
+
resolve(null);
|
|
166
|
+
}, maxWaitMs);
|
|
167
|
+
});
|
|
143
168
|
}
|
|
169
|
+
}
|
|
170
|
+
`;
|
|
171
|
+
await this.evaluate(js);
|
|
144
172
|
}
|
|
145
173
|
async installInterceptor(pattern) {
|
|
146
174
|
const js = `
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import { cli, Strategy } from '../../registry.js';
|
|
2
|
+
import { apiGet } from '../../bilibili.js';
|
|
3
|
+
cli({
|
|
4
|
+
site: 'bilibili',
|
|
5
|
+
name: 'subtitle',
|
|
6
|
+
description: '获取 Bilibili 视频的字幕',
|
|
7
|
+
strategy: Strategy.COOKIE,
|
|
8
|
+
args: [
|
|
9
|
+
{ name: 'bvid', required: true },
|
|
10
|
+
{ name: 'lang', required: false, help: '字幕语言代码 (如 zh-CN, en-US, ai-zh),默认取第一个' },
|
|
11
|
+
],
|
|
12
|
+
columns: ['index', 'from', 'to', 'content'],
|
|
13
|
+
func: async (page, kwargs) => {
|
|
14
|
+
if (!page)
|
|
15
|
+
throw new Error('Requires browser');
|
|
16
|
+
// 1. 先前往视频详情页 (建立有鉴权的 Session,且这里不需要加载完整个视频)
|
|
17
|
+
await page.goto(`https://www.bilibili.com/video/${kwargs.bvid}/`);
|
|
18
|
+
// 2. 利用 __INITIAL_STATE__ 获取基础信息,拿 CID
|
|
19
|
+
const cid = await page.evaluate(`(async () => {
|
|
20
|
+
const state = window.__INITIAL_STATE__ || {};
|
|
21
|
+
return state?.videoData?.cid;
|
|
22
|
+
})()`);
|
|
23
|
+
if (!cid) {
|
|
24
|
+
throw new Error('无法在页面中提取到当前视频的 CID,请检查页面是否正常加载。');
|
|
25
|
+
}
|
|
26
|
+
// 3. 在 Node 端使用 apiGet 获取带 Wbi 签名的字幕列表
|
|
27
|
+
// 之前纯靠 evaluate 里的 fetch 会失败,因为 B 站 /wbi/ 开头的接口强校验 w_rid,未签名直接被风控返回 403 HTML
|
|
28
|
+
const payload = await apiGet(page, '/x/player/wbi/v2', {
|
|
29
|
+
params: { bvid: kwargs.bvid, cid },
|
|
30
|
+
signed: true, // 开启 wbi_sign 自动签名
|
|
31
|
+
});
|
|
32
|
+
if (payload.code !== 0) {
|
|
33
|
+
throw new Error(`获取视频播放信息失败: ${payload.message} (${payload.code})`);
|
|
34
|
+
}
|
|
35
|
+
const subtitles = payload.data?.subtitle?.subtitles || [];
|
|
36
|
+
if (subtitles.length === 0) {
|
|
37
|
+
throw new Error('此视频没有发现外挂或智能字幕。');
|
|
38
|
+
}
|
|
39
|
+
// 4. 选择目标字幕语言
|
|
40
|
+
const target = kwargs.lang
|
|
41
|
+
? subtitles.find((s) => s.lan === kwargs.lang) || subtitles[0]
|
|
42
|
+
: subtitles[0];
|
|
43
|
+
const targetSubUrl = target.subtitle_url;
|
|
44
|
+
if (!targetSubUrl || targetSubUrl === '') {
|
|
45
|
+
throw new Error('[风控拦截/未登录] 获取到的 subtitle_url 为空!请确保 CLI 已成功登录且风控未封锁此账号。');
|
|
46
|
+
}
|
|
47
|
+
const finalUrl = targetSubUrl.startsWith('//') ? 'https:' + targetSubUrl : targetSubUrl;
|
|
48
|
+
// 5. 解析并拉取 CDN 的 JSON 文件
|
|
49
|
+
const fetchJs = `
|
|
50
|
+
(async () => {
|
|
51
|
+
const url = ${JSON.stringify(finalUrl)};
|
|
52
|
+
const res = await fetch(url);
|
|
53
|
+
const text = await res.text();
|
|
54
|
+
|
|
55
|
+
if (text.startsWith('<!DOCTYPE') || text.startsWith('<html')) {
|
|
56
|
+
return { error: 'HTML', text: text.substring(0, 100), url };
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
try {
|
|
60
|
+
const subJson = JSON.parse(text);
|
|
61
|
+
// B站真实返回格式是 { font_size: 0.4, font_color: "#FFFFFF", background_alpha: 0.5, background_color: "#9C27B0", Stroke: "none", type: "json" , body: [{from: 0, to: 0, content: ""}] }
|
|
62
|
+
if (Array.isArray(subJson?.body)) return { success: true, data: subJson.body };
|
|
63
|
+
if (Array.isArray(subJson)) return { success: true, data: subJson };
|
|
64
|
+
return { error: 'UNKNOWN_JSON', data: subJson };
|
|
65
|
+
} catch (e) {
|
|
66
|
+
return { error: 'PARSE_FAILED', text: text.substring(0, 100) };
|
|
67
|
+
}
|
|
68
|
+
})()
|
|
69
|
+
`;
|
|
70
|
+
const items = await page.evaluate(fetchJs);
|
|
71
|
+
if (items?.error) {
|
|
72
|
+
throw new Error(`字幕获取失败: ${items.error}${items.text ? ' — ' + items.text : ''}`);
|
|
73
|
+
}
|
|
74
|
+
const finalItems = items?.data || [];
|
|
75
|
+
if (!Array.isArray(finalItems)) {
|
|
76
|
+
throw new Error('解析到的字幕列表对象不符合数组格式');
|
|
77
|
+
}
|
|
78
|
+
// 6. 数据映射
|
|
79
|
+
return finalItems.map((item, idx) => ({
|
|
80
|
+
index: idx + 1,
|
|
81
|
+
from: Number(item.from || 0).toFixed(2) + 's',
|
|
82
|
+
to: Number(item.to || 0).toFixed(2) + 's',
|
|
83
|
+
content: item.content
|
|
84
|
+
}));
|
|
85
|
+
},
|
|
86
|
+
});
|
package/dist/explore.js
CHANGED
|
@@ -175,6 +175,9 @@ function scoreEndpoint(ep) {
|
|
|
175
175
|
s += 2;
|
|
176
176
|
if (ep.status === 200)
|
|
177
177
|
s += 2;
|
|
178
|
+
// Anti-Bot Empty Value Detection: penalize JSON endpoints returning empty data
|
|
179
|
+
if (ep.responseAnalysis && ep.responseAnalysis.itemCount === 0 && ep.contentType.includes('json'))
|
|
180
|
+
s -= 3;
|
|
178
181
|
return s;
|
|
179
182
|
}
|
|
180
183
|
function inferCapabilityName(url, goal) {
|
|
@@ -266,6 +269,28 @@ const STORE_DISCOVER_JS = `
|
|
|
266
269
|
return stores;
|
|
267
270
|
}
|
|
268
271
|
`;
|
|
272
|
+
// ── Auto-Interaction (Fuzzing) ─────────────────────────────────────────────
|
|
273
|
+
const INTERACT_FUZZ_JS = `
|
|
274
|
+
async () => {
|
|
275
|
+
const sleep = ms => new Promise(r => setTimeout(r, ms));
|
|
276
|
+
const clickables = Array.from(document.querySelectorAll(
|
|
277
|
+
'button, [role="button"], [role="tab"], .tab, .btn, a[href="javascript:void(0)"], a[href="#"]'
|
|
278
|
+
)).slice(0, 15); // limit to 15 to avoid endless loops
|
|
279
|
+
|
|
280
|
+
let clicked = 0;
|
|
281
|
+
for (const el of clickables) {
|
|
282
|
+
try {
|
|
283
|
+
const rect = el.getBoundingClientRect();
|
|
284
|
+
if (rect.width > 0 && rect.height > 0) {
|
|
285
|
+
el.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true, view: window }));
|
|
286
|
+
clicked++;
|
|
287
|
+
await sleep(300); // give it time to trigger network
|
|
288
|
+
}
|
|
289
|
+
} catch {}
|
|
290
|
+
}
|
|
291
|
+
return clicked;
|
|
292
|
+
}
|
|
293
|
+
`;
|
|
269
294
|
// ── Main explore function ──────────────────────────────────────────────────
|
|
270
295
|
export async function exploreUrl(url, opts) {
|
|
271
296
|
const waitSeconds = opts.waitSeconds ?? 3.0;
|
|
@@ -283,6 +308,31 @@ export async function exploreUrl(url, opts) {
|
|
|
283
308
|
catch { }
|
|
284
309
|
await page.wait(1);
|
|
285
310
|
}
|
|
311
|
+
// Step 2.5: Interactive Fuzzing (if requested)
|
|
312
|
+
if (opts.auto) {
|
|
313
|
+
try {
|
|
314
|
+
// First: targeted clicks by label (e.g. "字幕", "CC", "评论")
|
|
315
|
+
if (opts.clickLabels?.length) {
|
|
316
|
+
for (const label of opts.clickLabels) {
|
|
317
|
+
const safeLabel = label.replace(/'/g, "\\'");
|
|
318
|
+
await page.evaluate(`
|
|
319
|
+
(() => {
|
|
320
|
+
const el = [...document.querySelectorAll('button, [role="button"], [role="tab"], a, span')]
|
|
321
|
+
.find(e => e.textContent && e.textContent.trim().includes('${safeLabel}'));
|
|
322
|
+
if (el) el.click();
|
|
323
|
+
})()
|
|
324
|
+
`);
|
|
325
|
+
await page.wait(1);
|
|
326
|
+
}
|
|
327
|
+
}
|
|
328
|
+
// Then: blind fuzzing on generic interactive elements
|
|
329
|
+
const clicks = await page.evaluate(INTERACT_FUZZ_JS);
|
|
330
|
+
await page.wait(2); // wait for XHRs to settle
|
|
331
|
+
}
|
|
332
|
+
catch (e) {
|
|
333
|
+
// fuzzing is best-effort, don't fail the whole explore
|
|
334
|
+
}
|
|
335
|
+
}
|
|
286
336
|
// Step 3: Read page metadata
|
|
287
337
|
const metadata = await readPageMetadata(page);
|
|
288
338
|
// Step 4: Capture network traffic
|
package/dist/main.js
CHANGED
|
@@ -56,8 +56,8 @@ program.command('validate').description('Validate CLI definitions').argument('[t
|
|
|
56
56
|
.action(async (target) => { const { validateClisWithTarget, renderValidationReport } = await import('./validate.js'); console.log(renderValidationReport(validateClisWithTarget([BUILTIN_CLIS, USER_CLIS], target))); });
|
|
57
57
|
program.command('verify').description('Validate + smoke test').argument('[target]').option('--smoke', 'Run smoke tests', false)
|
|
58
58
|
.action(async (target, opts) => { const { verifyClis, renderVerifyReport } = await import('./verify.js'); const r = await verifyClis({ builtinClis: BUILTIN_CLIS, userClis: USER_CLIS, target, smoke: opts.smoke }); console.log(renderVerifyReport(r)); process.exitCode = r.ok ? 0 : 1; });
|
|
59
|
-
program.command('explore').alias('probe').description('Explore a website: discover APIs, stores, and recommend strategies').argument('<url>').option('--site <name>').option('--goal <text>').option('--wait <s>', '', '3')
|
|
60
|
-
.action(async (url, opts) => { const { exploreUrl, renderExploreSummary } = await import('./explore.js'); console.log(renderExploreSummary(await exploreUrl(url, { BrowserFactory: PlaywrightMCP, site: opts.site, goal: opts.goal, waitSeconds: parseFloat(opts.wait) }))); });
|
|
59
|
+
program.command('explore').alias('probe').description('Explore a website: discover APIs, stores, and recommend strategies').argument('<url>').option('--site <name>').option('--goal <text>').option('--wait <s>', '', '3').option('--auto', 'Enable interactive fuzzing (simulate clicks to trigger lazy APIs)').option('--click <labels>', 'Comma-separated labels to click before fuzzing (e.g. "字幕,CC,评论")')
|
|
60
|
+
.action(async (url, opts) => { const { exploreUrl, renderExploreSummary } = await import('./explore.js'); const clickLabels = opts.click ? opts.click.split(',').map((s) => s.trim()) : undefined; console.log(renderExploreSummary(await exploreUrl(url, { BrowserFactory: PlaywrightMCP, site: opts.site, goal: opts.goal, waitSeconds: parseFloat(opts.wait), auto: opts.auto, clickLabels }))); });
|
|
61
61
|
program.command('synthesize').description('Synthesize CLIs from explore').argument('<target>').option('--top <n>', '', '3')
|
|
62
62
|
.action(async (target, opts) => { const { synthesizeFromExplore, renderSynthesizeSummary } = await import('./synthesize.js'); console.log(renderSynthesizeSummary(synthesizeFromExplore(target, { top: parseInt(opts.top) }))); });
|
|
63
63
|
program.command('generate').description('One-shot: explore → synthesize → register').argument('<url>').option('--goal <text>').option('--site <name>')
|
|
@@ -27,14 +27,10 @@ export async function stepWait(page, params, data, args) {
|
|
|
27
27
|
await page.wait(params);
|
|
28
28
|
else if (typeof params === 'object' && params) {
|
|
29
29
|
if ('text' in params) {
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
if (typeof snap === 'string' && snap.includes(params.text))
|
|
35
|
-
break;
|
|
36
|
-
await page.wait(0.5);
|
|
37
|
-
}
|
|
30
|
+
await page.wait({
|
|
31
|
+
text: String(render(params.text, { args, data })),
|
|
32
|
+
timeout: params.timeout
|
|
33
|
+
});
|
|
38
34
|
}
|
|
39
35
|
else if ('time' in params)
|
|
40
36
|
await page.wait(Number(params.time));
|
|
@@ -2,6 +2,20 @@
|
|
|
2
2
|
* Pipeline step: fetch — HTTP API requests.
|
|
3
3
|
*/
|
|
4
4
|
import { render } from '../template.js';
|
|
5
|
+
/**
 * Simple async concurrency limiter.
 *
 * Maps `fn` over `items` while keeping at most `limit` calls in flight, and
 * returns the results in input order (`results[i]` is the value of
 * `fn(items[i], i)`).
 *
 * @param items Array of inputs to process.
 * @param limit Maximum number of concurrent `fn` calls. Anything that is not a
 *   number >= 1 (0, negatives, fractions below 1, NaN, non-numbers) is treated
 *   as 1 so that every item is still processed.
 * @param fn Async mapper invoked as `fn(item, index)`.
 * @returns Promise resolving to the array of mapped results; it rejects if any
 *   `fn` call rejects.
 */
async function mapConcurrent(items, limit, fn) {
    const results = new Array(items.length);
    // Clamp the worker count to at least 1. Without this, a limit of 0, a
    // negative value or NaN spawns no workers and the function silently
    // returns an array of holes without ever calling `fn`.
    const requested = Math.floor(Number(limit));
    const workerCount = Math.min(requested >= 1 ? requested : 1, items.length);
    // Shared cursor: each worker claims the next unprocessed index. The claim
    // (`index++`) happens synchronously, so no two workers take the same item.
    let index = 0;
    async function worker() {
        while (index < items.length) {
            const i = index++;
            results[i] = await fn(items[i], i);
        }
    }
    await Promise.all(Array.from({ length: workerCount }, () => worker()));
    return results;
}
|
|
5
19
|
/** Single URL fetch helper */
|
|
6
20
|
async function fetchSingle(page, url, method, queryParams, headers, args, data) {
|
|
7
21
|
const renderedParams = {};
|
|
@@ -38,12 +52,11 @@ export async function stepFetch(page, params, data, args) {
|
|
|
38
52
|
const urlTemplate = String(urlOrObj);
|
|
39
53
|
// Per-item fetch when data is array and URL references item
|
|
40
54
|
if (Array.isArray(data) && urlTemplate.includes('item')) {
|
|
41
|
-
const
|
|
42
|
-
|
|
43
|
-
const itemUrl = String(render(urlTemplate, { args, data, item
|
|
44
|
-
|
|
45
|
-
}
|
|
46
|
-
return results;
|
|
55
|
+
const concurrency = typeof params?.concurrency === 'number' ? params.concurrency : 5;
|
|
56
|
+
return mapConcurrent(data, concurrency, async (item, index) => {
|
|
57
|
+
const itemUrl = String(render(urlTemplate, { args, data, item, index }));
|
|
58
|
+
return fetchSingle(page, itemUrl, method, queryParams, headers, args, data);
|
|
59
|
+
});
|
|
47
60
|
}
|
|
48
61
|
const url = render(urlOrObj, { args, data });
|
|
49
62
|
return fetchSingle(page, String(url), method, queryParams, headers, args, data);
|
|
@@ -36,6 +36,8 @@ export async function stepTap(page, params, data, args) {
|
|
|
36
36
|
async () => {
|
|
37
37
|
// ── 1. Setup capture proxy (fetch + XHR dual interception) ──
|
|
38
38
|
let captured = null;
|
|
39
|
+
let captureResolve;
|
|
40
|
+
const capturePromise = new Promise(r => { captureResolve = r; });
|
|
39
41
|
const capturePattern = ${JSON.stringify(capturePattern)};
|
|
40
42
|
|
|
41
43
|
// Intercept fetch API
|
|
@@ -46,7 +48,7 @@ export async function stepTap(page, params, data, args) {
|
|
|
46
48
|
const url = typeof fetchArgs[0] === 'string' ? fetchArgs[0]
|
|
47
49
|
: fetchArgs[0] instanceof Request ? fetchArgs[0].url : String(fetchArgs[0]);
|
|
48
50
|
if (capturePattern && url.includes(capturePattern) && !captured) {
|
|
49
|
-
try { captured = await resp.clone().json(); } catch {}
|
|
51
|
+
try { captured = await resp.clone().json(); captureResolve(); } catch {}
|
|
50
52
|
}
|
|
51
53
|
} catch {}
|
|
52
54
|
return resp;
|
|
@@ -65,13 +67,13 @@ export async function stepTap(page, params, data, args) {
|
|
|
65
67
|
const origHandler = xhr.onreadystatechange;
|
|
66
68
|
xhr.onreadystatechange = function() {
|
|
67
69
|
if (xhr.readyState === 4 && !captured) {
|
|
68
|
-
try { captured = JSON.parse(xhr.responseText); } catch {}
|
|
70
|
+
try { captured = JSON.parse(xhr.responseText); captureResolve(); } catch {}
|
|
69
71
|
}
|
|
70
72
|
if (origHandler) origHandler.apply(this, arguments);
|
|
71
73
|
};
|
|
72
74
|
const origOnload = xhr.onload;
|
|
73
75
|
xhr.onload = function() {
|
|
74
|
-
if (!captured) { try { captured = JSON.parse(xhr.responseText); } catch {} }
|
|
76
|
+
if (!captured) { try { captured = JSON.parse(xhr.responseText); captureResolve(); } catch {} }
|
|
75
77
|
if (origOnload) origOnload.apply(this, arguments);
|
|
76
78
|
};
|
|
77
79
|
}
|
|
@@ -111,9 +113,9 @@ export async function stepTap(page, params, data, args) {
|
|
|
111
113
|
await ${actionCall};
|
|
112
114
|
|
|
113
115
|
// ── 4. Wait for network response ──
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
await
|
|
116
|
+
if (!captured) {
|
|
117
|
+
const timeoutPromise = new Promise(r => setTimeout(r, ${timeout} * 1000));
|
|
118
|
+
await Promise.race([capturePromise, timeoutPromise]);
|
|
117
119
|
}
|
|
118
120
|
} finally {
|
|
119
121
|
// ── 5. Always restore originals ──
|
package/dist/types.d.ts
CHANGED
|
@@ -16,7 +16,11 @@ export interface IPage {
|
|
|
16
16
|
click(ref: string): Promise<void>;
|
|
17
17
|
typeText(ref: string, text: string): Promise<void>;
|
|
18
18
|
pressKey(key: string): Promise<void>;
|
|
19
|
-
wait(
|
|
19
|
+
wait(options: number | {
|
|
20
|
+
text?: string;
|
|
21
|
+
time?: number;
|
|
22
|
+
timeout?: number;
|
|
23
|
+
}): Promise<void>;
|
|
20
24
|
tabs(): Promise<any>;
|
|
21
25
|
closeTab(index?: number): Promise<void>;
|
|
22
26
|
newTab(): Promise<void>;
|
package/package.json
CHANGED
package/src/browser.ts
CHANGED
|
@@ -104,8 +104,13 @@ export class Page implements IPage {
|
|
|
104
104
|
await this.call('tools/call', { name: 'browser_press_key', arguments: { key } });
|
|
105
105
|
}
|
|
106
106
|
|
|
107
|
-
async wait(
|
|
108
|
-
|
|
107
|
+
/**
 * Wait by delegating to the `browser_wait_for` tool.
 *
 * @param options Either a bare number, forwarded as `{ time: <number> }`, or an options
 *   object (`text` / `time` / `timeout`) that is forwarded unchanged as the tool arguments.
 */
async wait(options: number | { text?: string; time?: number; timeout?: number }): Promise<void> {
  // An object is passed straight through so the tool itself can wait for text natively,
  // instead of this class polling the DOM.
  const toolArguments = typeof options === 'number' ? { time: options } : options;
  await this.call('tools/call', { name: 'browser_wait_for', arguments: toolArguments });
}
|
|
110
115
|
|
|
111
116
|
async tabs(): Promise<any> {
|
|
@@ -139,10 +144,32 @@ export class Page implements IPage {
|
|
|
139
144
|
/**
 * Repeatedly scroll the page to the bottom to trigger lazy-loaded content.
 *
 * The scrolling runs inside the page via `evaluate`. After each scroll the script waits
 * until a DOM mutation makes `document.body.scrollHeight` grow (plus a 100 ms render
 * debounce), or until `delayMs` elapses, whichever comes first.
 *
 * @param options.times   Number of scroll rounds (default 3).
 * @param options.delayMs Maximum wait per round in milliseconds (default 2000).
 */
async autoScroll(options: { times?: number; delayMs?: number } = {}): Promise<void> {
  const scrollRounds = options.times ?? 3;
  const roundTimeoutMs = options.delayMs ?? 2000;
  await this.evaluate(`
    async () => {
      const maxTimes = ${scrollRounds};
      const maxWaitMs = ${roundTimeoutMs};
      for (let i = 0; i < maxTimes; i++) {
        const lastHeight = document.body.scrollHeight;
        window.scrollTo(0, lastHeight);
        await new Promise(resolve => {
          let timeoutId;
          const observer = new MutationObserver(() => {
            if (document.body.scrollHeight > lastHeight) {
              clearTimeout(timeoutId);
              observer.disconnect();
              setTimeout(resolve, 100); // Small debounce for rendering
            }
          });
          observer.observe(document.body, { childList: true, subtree: true });
          timeoutId = setTimeout(() => {
            observer.disconnect();
            resolve(null);
          }, maxWaitMs);
        });
      }
    }
  `);
}
|
|
147
174
|
|
|
148
175
|
async installInterceptor(pattern: string): Promise<void> {
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import { cli, Strategy } from '../../registry.js';
|
|
2
|
+
import type { IPage } from '../../types.js';
|
|
3
|
+
import { apiGet } from '../../bilibili.js';
|
|
4
|
+
|
|
5
|
+
/**
 * `bilibili subtitle` command: fetch the subtitle track of a Bilibili video.
 *
 * Flow: open the video page, read the CID from `window.__INITIAL_STATE__`, request the
 * Wbi-signed player info through `apiGet`, pick a subtitle track, download its JSON from
 * inside the page, and map each cue to `{ index, from, to, content }`.
 *
 * Arguments:
 *  - `bvid` (required): video id interpolated into the video page URL.
 *  - `lang` (optional): subtitle language code; when omitted, or when no track matches,
 *    the first available track is used.
 *
 * Throws an `Error` when no browser page is available, the CID cannot be read, the player
 * API returns a non-zero code, no subtitles exist, the subtitle URL is empty, or the
 * downloaded payload is not a recognisable subtitle JSON.
 */
cli({
  site: 'bilibili',
  name: 'subtitle',
  description: '获取 Bilibili 视频的字幕',
  strategy: Strategy.COOKIE,
  args: [
    { name: 'bvid', required: true },
    { name: 'lang', required: false, help: '字幕语言代码 (如 zh-CN, en-US, ai-zh),默认取第一个' },
  ],
  columns: ['index', 'from', 'to', 'content'],
  func: async (page: IPage | null, kwargs: any) => {
    if (!page) throw new Error('Requires browser');

    // 1. Open the video page first (establishes the authenticated session; the video
    //    itself does not need to finish loading).
    await page.goto(`https://www.bilibili.com/video/${kwargs.bvid}/`);

    // 2. Read basic info from __INITIAL_STATE__ to obtain the CID.
    const cid = await page.evaluate(`(async () => {
      const state = window.__INITIAL_STATE__ || {};
      return state?.videoData?.cid;
    })()`);

    if (!cid) {
      throw new Error('无法在页面中提取到当前视频的 CID,请检查页面是否正常加载。');
    }

    // 3. Fetch the subtitle list from the Node side via apiGet with Wbi signing.
    //    A plain fetch inside evaluate used to fail: Bilibili's /wbi/ endpoints strictly
    //    validate w_rid, and unsigned requests are rejected by risk control with a 403 HTML page.
    const payload = await apiGet(page, '/x/player/wbi/v2', {
      params: { bvid: kwargs.bvid, cid },
      signed: true, // enable automatic wbi_sign signing
    });

    if (payload.code !== 0) {
      throw new Error(`获取视频播放信息失败: ${payload.message} (${payload.code})`);
    }

    const subtitles = payload.data?.subtitle?.subtitles || [];
    if (subtitles.length === 0) {
      throw new Error('此视频没有发现外挂或智能字幕。');
    }

    // 4. Pick the target subtitle language; fall back to the first track when the
    //    requested language is absent.
    const target = kwargs.lang
      ? subtitles.find((s: any) => s.lan === kwargs.lang) || subtitles[0]
      : subtitles[0];

    const targetSubUrl = target.subtitle_url;
    if (!targetSubUrl) {
      throw new Error('[风控拦截/未登录] 获取到的 subtitle_url 为空!请确保 CLI 已成功登录且风控未封锁此账号。');
    }

    // Normalise both protocol-relative (`//host/...`) and plain `http://` URLs to HTTPS.
    // The fetch below runs inside the HTTPS video page, where an `http://` request would
    // be blocked as mixed content.
    const finalUrl = targetSubUrl.replace(/^(?:http:)?\/\//, 'https://');

    // 5. Download and parse the subtitle JSON from the CDN, inside the page context.
    const fetchJs = `
      (async () => {
        const url = ${JSON.stringify(finalUrl)};
        const res = await fetch(url);
        const text = await res.text();

        if (text.startsWith('<!DOCTYPE') || text.startsWith('<html')) {
          return { error: 'HTML', text: text.substring(0, 100), url };
        }

        try {
          const subJson = JSON.parse(text);
          // B站真实返回格式是 { font_size: 0.4, font_color: "#FFFFFF", background_alpha: 0.5, background_color: "#9C27B0", Stroke: "none", type: "json" , body: [{from: 0, to: 0, content: ""}] }
          if (Array.isArray(subJson?.body)) return { success: true, data: subJson.body };
          if (Array.isArray(subJson)) return { success: true, data: subJson };
          return { error: 'UNKNOWN_JSON', data: subJson };
        } catch (e) {
          return { error: 'PARSE_FAILED', text: text.substring(0, 100) };
        }
      })()
    `;
    const items = await page.evaluate(fetchJs);

    if (items?.error) {
      throw new Error(`字幕获取失败: ${items.error}${items.text ? ' — ' + items.text : ''}`);
    }

    const finalItems = items?.data || [];
    if (!Array.isArray(finalItems)) {
      throw new Error('解析到的字幕列表对象不符合数组格式');
    }

    // 6. Map cues to output rows (1-based index, times formatted as seconds with 2 decimals).
    return finalItems.map((item: any, idx: number) => ({
      index: idx + 1,
      from: Number(item.from || 0).toFixed(2) + 's',
      to: Number(item.to || 0).toFixed(2) + 's',
      content: item.content
    }));
  },
});
|
package/src/explore.ts
CHANGED
|
@@ -184,6 +184,8 @@ function scoreEndpoint(ep: { contentType: string; responseAnalysis: any; pattern
|
|
|
184
184
|
if (ep.hasPaginationParam) s += 2;
|
|
185
185
|
if (ep.hasLimitParam) s += 2;
|
|
186
186
|
if (ep.status === 200) s += 2;
|
|
187
|
+
// Anti-Bot Empty Value Detection: penalize JSON endpoints returning empty data
|
|
188
|
+
if (ep.responseAnalysis && ep.responseAnalysis.itemCount === 0 && ep.contentType.includes('json')) s -= 3;
|
|
187
189
|
return s;
|
|
188
190
|
}
|
|
189
191
|
|
|
@@ -277,6 +279,30 @@ export interface DiscoveredStore {
|
|
|
277
279
|
stateKeys: string[];
|
|
278
280
|
}
|
|
279
281
|
|
|
282
|
+
// ── Auto-Interaction (Fuzzing) ─────────────────────────────────────────────
|
|
283
|
+
|
|
284
|
+
/**
 * In-page script (source text of an async arrow function) used for blind interaction fuzzing.
 *
 * It collects generic clickable elements, dispatches a synthetic `click` on those that have
 * a non-empty bounding box, pauses 300 ms after each click, and returns the number of
 * clicks performed.
 *
 * The 15-click cap counts actual clicks rather than candidate elements, so hidden elements
 * that appear early in DOM order no longer exhaust the budget before any visible control
 * is reached. Visibility is still checked at click time.
 */
const INTERACT_FUZZ_JS = `
  async () => {
    const sleep = ms => new Promise(r => setTimeout(r, ms));
    const MAX_CLICKS = 15; // limit to 15 clicks to keep a fuzz pass bounded
    const candidates = Array.from(document.querySelectorAll(
      'button, [role="button"], [role="tab"], .tab, .btn, a[href="javascript:void(0)"], a[href="#"]'
    ));

    let clicked = 0;
    for (const el of candidates) {
      if (clicked >= MAX_CLICKS) break;
      try {
        const rect = el.getBoundingClientRect();
        if (rect.width > 0 && rect.height > 0) {
          el.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true, view: window }));
          clicked++;
          await sleep(300); // give it time to trigger network
        }
      } catch {}
    }
    return clicked;
  }
`;
|
|
305
|
+
|
|
280
306
|
// ── Main explore function ──────────────────────────────────────────────────
|
|
281
307
|
|
|
282
308
|
export async function exploreUrl(
|
|
@@ -300,6 +326,31 @@ export async function exploreUrl(
|
|
|
300
326
|
// Step 2: Auto-scroll to trigger lazy loading (use keyboard since page.scroll may not exist)
|
|
301
327
|
for (let i = 0; i < 3; i++) { try { await page.pressKey('End'); } catch {} await page.wait(1); }
|
|
302
328
|
|
|
329
|
+
// Step 2.5: Interactive Fuzzing (if requested)
|
|
330
|
+
if (opts.auto) {
|
|
331
|
+
try {
|
|
332
|
+
// First: targeted clicks by label (e.g. "字幕", "CC", "评论")
|
|
333
|
+
if (opts.clickLabels?.length) {
|
|
334
|
+
for (const label of opts.clickLabels) {
|
|
335
|
+
const safeLabel = label.replace(/'/g, "\\'");
|
|
336
|
+
await page.evaluate(`
|
|
337
|
+
(() => {
|
|
338
|
+
const el = [...document.querySelectorAll('button, [role="button"], [role="tab"], a, span')]
|
|
339
|
+
.find(e => e.textContent && e.textContent.trim().includes('${safeLabel}'));
|
|
340
|
+
if (el) el.click();
|
|
341
|
+
})()
|
|
342
|
+
`);
|
|
343
|
+
await page.wait(1);
|
|
344
|
+
}
|
|
345
|
+
}
|
|
346
|
+
// Then: blind fuzzing on generic interactive elements
|
|
347
|
+
const clicks = await page.evaluate(INTERACT_FUZZ_JS);
|
|
348
|
+
await page.wait(2); // wait for XHRs to settle
|
|
349
|
+
} catch (e) {
|
|
350
|
+
// fuzzing is best-effort, don't fail the whole explore
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
|
|
303
354
|
// Step 3: Read page metadata
|
|
304
355
|
const metadata = await readPageMetadata(page);
|
|
305
356
|
|
package/src/main.ts
CHANGED
|
@@ -53,8 +53,8 @@ program.command('validate').description('Validate CLI definitions').argument('[t
|
|
|
53
53
|
program.command('verify').description('Validate + smoke test').argument('[target]').option('--smoke', 'Run smoke tests', false)
|
|
54
54
|
.action(async (target, opts) => { const { verifyClis, renderVerifyReport } = await import('./verify.js'); const r = await verifyClis({ builtinClis: BUILTIN_CLIS, userClis: USER_CLIS, target, smoke: opts.smoke }); console.log(renderVerifyReport(r)); process.exitCode = r.ok ? 0 : 1; });
|
|
55
55
|
|
|
56
|
-
program.command('explore').alias('probe').description('Explore a website: discover APIs, stores, and recommend strategies').argument('<url>').option('--site <name>').option('--goal <text>').option('--wait <s>', '', '3')
|
|
57
|
-
.action(async (url, opts) => { const { exploreUrl, renderExploreSummary } = await import('./explore.js'); console.log(renderExploreSummary(await exploreUrl(url, { BrowserFactory: PlaywrightMCP, site: opts.site, goal: opts.goal, waitSeconds: parseFloat(opts.wait) }))); });
|
|
56
|
+
// `explore` (alias `probe`): discover a site's APIs and stores and print a summary.
// `--auto` enables interactive fuzzing; `--click` supplies comma-separated labels to click first.
program.command('explore').alias('probe').description('Explore a website: discover APIs, stores, and recommend strategies').argument('<url>').option('--site <name>').option('--goal <text>').option('--wait <s>', '', '3').option('--auto', 'Enable interactive fuzzing (simulate clicks to trigger lazy APIs)').option('--click <labels>', 'Comma-separated labels to click before fuzzing (e.g. "字幕,CC,评论")')
  .action(async (url, opts) => {
    const { exploreUrl, renderExploreSummary } = await import('./explore.js');
    // Drop empty labels (e.g. from "a,,b" or a trailing comma): an empty string is contained
    // in every element's text, so it would make the label matcher click an arbitrary element.
    const clickLabels = opts.click
      ? opts.click.split(',').map((s: string) => s.trim()).filter((s: string) => s.length > 0)
      : undefined;
    const summary = await exploreUrl(url, {
      BrowserFactory: PlaywrightMCP,
      site: opts.site,
      goal: opts.goal,
      waitSeconds: parseFloat(opts.wait),
      auto: opts.auto,
      clickLabels,
    });
    console.log(renderExploreSummary(summary));
  });
|
|
58
58
|
|
|
59
59
|
program.command('synthesize').description('Synthesize CLIs from explore').argument('<target>').option('--top <n>', '', '3')
|
|
60
60
|
.action(async (target, opts) => { const { synthesizeFromExplore, renderSynthesizeSummary } = await import('./synthesize.js'); console.log(renderSynthesizeSummary(synthesizeFromExplore(target, { top: parseInt(opts.top) }))); });
|
|
@@ -31,13 +31,10 @@ export async function stepWait(page: IPage, params: any, data: any, args: Record
|
|
|
31
31
|
if (typeof params === 'number') await page.wait(params);
|
|
32
32
|
else if (typeof params === 'object' && params) {
|
|
33
33
|
if ('text' in params) {
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
if (typeof snap === 'string' && snap.includes(params.text)) break;
|
|
39
|
-
await page.wait(0.5);
|
|
40
|
-
}
|
|
34
|
+
await page.wait({
|
|
35
|
+
text: String(render(params.text, { args, data })),
|
|
36
|
+
timeout: params.timeout
|
|
37
|
+
});
|
|
41
38
|
} else if ('time' in params) await page.wait(Number(params.time));
|
|
42
39
|
} else if (typeof params === 'string') await page.wait(Number(render(params, { args, data })));
|
|
43
40
|
return data;
|
|
@@ -5,6 +5,23 @@
|
|
|
5
5
|
import type { IPage } from '../../types.js';
|
|
6
6
|
import { render } from '../template.js';
|
|
7
7
|
|
|
8
|
+
/**
 * Map `items` through an async `fn` with at most `limit` calls in flight at once.
 *
 * Results are returned in input order regardless of completion order. If any call
 * rejects, the returned promise rejects with that error.
 *
 * @param items Input values.
 * @param limit Maximum number of concurrent calls. Values below 1 (including 0, negatives
 *   and NaN) are treated as 1 so that every item is still processed; fractional values
 *   are rounded down.
 * @param fn    Async mapper receiving the item and its index.
 * @returns The mapped results, index-aligned with `items`.
 */
async function mapConcurrent<T, R>(items: T[], limit: number, fn: (item: T, index: number) => Promise<R>): Promise<R[]> {
  const results: R[] = new Array(items.length);
  // Without this clamp a limit of 0 / negative / NaN would start no workers and silently
  // return an array of holes without ever calling `fn`.
  const effectiveLimit = limit >= 1 ? Math.floor(limit) : 1;
  const workerCount = Math.min(effectiveLimit, items.length);
  let index = 0;

  // Each worker pulls the next unclaimed index until the input is exhausted.
  async function worker(): Promise<void> {
    while (index < items.length) {
      const i = index++;
      results[i] = await fn(items[i], i);
    }
  }

  const workers = Array.from({ length: workerCount }, () => worker());
  await Promise.all(workers);
  return results;
}
|
|
24
|
+
|
|
8
25
|
/** Single URL fetch helper */
|
|
9
26
|
async function fetchSingle(
|
|
10
27
|
page: IPage | null, url: string, method: string,
|
|
@@ -48,12 +65,11 @@ export async function stepFetch(page: IPage | null, params: any, data: any, args
|
|
|
48
65
|
|
|
49
66
|
// Per-item fetch when data is array and URL references item
|
|
50
67
|
if (Array.isArray(data) && urlTemplate.includes('item')) {
|
|
51
|
-
const
|
|
52
|
-
|
|
53
|
-
const itemUrl = String(render(urlTemplate, { args, data, item
|
|
54
|
-
|
|
55
|
-
}
|
|
56
|
-
return results;
|
|
68
|
+
const concurrency = typeof params?.concurrency === 'number' ? params.concurrency : 5;
|
|
69
|
+
return mapConcurrent(data, concurrency, async (item, index) => {
|
|
70
|
+
const itemUrl = String(render(urlTemplate, { args, data, item, index }));
|
|
71
|
+
return fetchSingle(page, itemUrl, method, queryParams, headers, args, data);
|
|
72
|
+
});
|
|
57
73
|
}
|
|
58
74
|
const url = render(urlOrObj, { args, data });
|
|
59
75
|
return fetchSingle(page, String(url), method, queryParams, headers, args, data);
|
|
@@ -42,6 +42,8 @@ export async function stepTap(page: IPage, params: any, data: any, args: Record<
|
|
|
42
42
|
async () => {
|
|
43
43
|
// ── 1. Setup capture proxy (fetch + XHR dual interception) ──
|
|
44
44
|
let captured = null;
|
|
45
|
+
let captureResolve;
|
|
46
|
+
const capturePromise = new Promise(r => { captureResolve = r; });
|
|
45
47
|
const capturePattern = ${JSON.stringify(capturePattern)};
|
|
46
48
|
|
|
47
49
|
// Intercept fetch API
|
|
@@ -52,7 +54,7 @@ export async function stepTap(page: IPage, params: any, data: any, args: Record<
|
|
|
52
54
|
const url = typeof fetchArgs[0] === 'string' ? fetchArgs[0]
|
|
53
55
|
: fetchArgs[0] instanceof Request ? fetchArgs[0].url : String(fetchArgs[0]);
|
|
54
56
|
if (capturePattern && url.includes(capturePattern) && !captured) {
|
|
55
|
-
try { captured = await resp.clone().json(); } catch {}
|
|
57
|
+
try { captured = await resp.clone().json(); captureResolve(); } catch {}
|
|
56
58
|
}
|
|
57
59
|
} catch {}
|
|
58
60
|
return resp;
|
|
@@ -71,13 +73,13 @@ export async function stepTap(page: IPage, params: any, data: any, args: Record<
|
|
|
71
73
|
const origHandler = xhr.onreadystatechange;
|
|
72
74
|
xhr.onreadystatechange = function() {
|
|
73
75
|
if (xhr.readyState === 4 && !captured) {
|
|
74
|
-
try { captured = JSON.parse(xhr.responseText); } catch {}
|
|
76
|
+
try { captured = JSON.parse(xhr.responseText); captureResolve(); } catch {}
|
|
75
77
|
}
|
|
76
78
|
if (origHandler) origHandler.apply(this, arguments);
|
|
77
79
|
};
|
|
78
80
|
const origOnload = xhr.onload;
|
|
79
81
|
xhr.onload = function() {
|
|
80
|
-
if (!captured) { try { captured = JSON.parse(xhr.responseText); } catch {} }
|
|
82
|
+
if (!captured) { try { captured = JSON.parse(xhr.responseText); captureResolve(); } catch {} }
|
|
81
83
|
if (origOnload) origOnload.apply(this, arguments);
|
|
82
84
|
};
|
|
83
85
|
}
|
|
@@ -117,9 +119,9 @@ export async function stepTap(page: IPage, params: any, data: any, args: Record<
|
|
|
117
119
|
await ${actionCall};
|
|
118
120
|
|
|
119
121
|
// ── 4. Wait for network response ──
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
await
|
|
122
|
+
if (!captured) {
|
|
123
|
+
const timeoutPromise = new Promise(r => setTimeout(r, ${timeout} * 1000));
|
|
124
|
+
await Promise.race([capturePromise, timeoutPromise]);
|
|
123
125
|
}
|
|
124
126
|
} finally {
|
|
125
127
|
// ── 5. Always restore originals ──
|
package/src/types.ts
CHANGED
|
@@ -12,7 +12,7 @@ export interface IPage {
|
|
|
12
12
|
click(ref: string): Promise<void>;
|
|
13
13
|
typeText(ref: string, text: string): Promise<void>;
|
|
14
14
|
pressKey(key: string): Promise<void>;
|
|
15
|
-
wait(
|
|
15
|
+
wait(options: number | { text?: string; time?: number; timeout?: number }): Promise<void>;
|
|
16
16
|
tabs(): Promise<any>;
|
|
17
17
|
closeTab(index?: number): Promise<void>;
|
|
18
18
|
newTab(): Promise<void>;
|