@jackwener/opencli 1.7.6 → 1.7.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -8
- package/README.zh-CN.md +14 -8
- package/cli-manifest.json +325 -11
- package/clis/51job/company.js +125 -0
- package/clis/51job/detail.js +108 -0
- package/clis/51job/hot.js +55 -0
- package/clis/51job/search.js +79 -0
- package/clis/51job/utils.js +302 -0
- package/clis/51job/utils.test.js +69 -0
- package/clis/bilibili/video.js +11 -4
- package/clis/bilibili/video.test.js +51 -0
- package/clis/chatgpt/image.js +1 -1
- package/clis/deepseek/ask.js +19 -13
- package/clis/deepseek/ask.test.js +93 -1
- package/clis/deepseek/utils.js +108 -23
- package/clis/deepseek/utils.test.js +109 -1
- package/clis/gemini/image.js +1 -1
- package/clis/instagram/download.js +1 -1
- package/clis/twitter/likes.js +3 -2
- package/clis/twitter/search.js +4 -2
- package/clis/twitter/search.test.js +4 -0
- package/clis/twitter/shared.js +28 -0
- package/clis/twitter/shared.test.js +96 -0
- package/clis/twitter/thread.js +3 -1
- package/clis/twitter/timeline.js +3 -2
- package/clis/twitter/tweets.js +3 -2
- package/clis/twitter/tweets.test.js +1 -1
- package/clis/web/read.js +25 -5
- package/clis/web/read.test.js +76 -0
- package/clis/weread/ai-outline.js +170 -0
- package/clis/weread/ai-outline.test.js +83 -0
- package/clis/weread/book.js +57 -44
- package/clis/weread/commands.test.js +24 -0
- package/clis/xiaoyuzhou/podcast-episodes.js +2 -2
- package/clis/xiaoyuzhou/podcast-episodes.test.js +78 -0
- package/dist/src/browser/analyze.d.ts +103 -0
- package/dist/src/browser/analyze.js +230 -0
- package/dist/src/browser/analyze.test.d.ts +1 -0
- package/dist/src/browser/analyze.test.js +164 -0
- package/dist/src/browser/article-extract.d.ts +57 -0
- package/dist/src/browser/article-extract.e2e.test.d.ts +1 -0
- package/dist/src/browser/article-extract.e2e.test.js +105 -0
- package/dist/src/browser/article-extract.js +169 -0
- package/dist/src/browser/article-extract.test.d.ts +1 -0
- package/dist/src/browser/article-extract.test.js +94 -0
- package/dist/src/browser/cdp.js +11 -2
- package/dist/src/browser/verify-fixture.d.ts +59 -0
- package/dist/src/browser/verify-fixture.js +213 -0
- package/dist/src/browser/verify-fixture.test.d.ts +1 -0
- package/dist/src/browser/verify-fixture.test.js +161 -0
- package/dist/src/cli.d.ts +32 -0
- package/dist/src/cli.js +333 -43
- package/dist/src/cli.test.js +257 -1
- package/dist/src/daemon.d.ts +3 -2
- package/dist/src/daemon.js +16 -4
- package/dist/src/daemon.test.d.ts +1 -0
- package/dist/src/daemon.test.js +19 -0
- package/dist/src/download/article-download.d.ts +12 -0
- package/dist/src/download/article-download.js +141 -17
- package/dist/src/download/article-download.test.js +196 -0
- package/dist/src/download/index.js +73 -86
- package/dist/src/errors.js +4 -2
- package/dist/src/errors.test.js +13 -0
- package/dist/src/launcher.d.ts +1 -1
- package/dist/src/launcher.js +3 -3
- package/dist/src/output.js +1 -1
- package/dist/src/output.test.js +6 -0
- package/package.json +5 -1
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import { beforeEach, describe, expect, it, vi } from 'vitest';
|
|
2
|
+
import { getRegistry } from '@jackwener/opencli/registry';
|
|
3
|
+
import './ai-outline.js';
|
|
4
|
+
|
|
5
|
+
/**
 * Tests for the `weread/ai-outline` command: registered default format, the
 * auth-expired error mapping, and `--raw` row output with depth filtering.
 */
describe('weread ai-outline', () => {
    const command = getRegistry().get('weread/ai-outline');

    // Page double whose getCookies() resolves with the wr_vid cookie on the
    // first call and the wr_name cookie on the second call.
    const makePage = () => ({
        getCookies: vi.fn()
            .mockResolvedValueOnce([{ name: 'wr_vid', value: 'vid123', domain: '.weread.qq.com' }])
            .mockResolvedValueOnce([{ name: 'wr_name', value: 'alice', domain: '.weread.qq.com' }]),
    });

    // Minimal fetch Response double: HTTP 200 with the given JSON payload.
    const jsonResponse = (payload) => ({
        ok: true,
        status: 200,
        json: () => Promise.resolve(payload),
    });

    beforeEach(() => {
        vi.restoreAllMocks();
        // restoreAllMocks() does not undo vi.stubGlobal(); unstub explicitly so
        // a fetch stub installed by one test cannot leak into the next one.
        vi.unstubAllGlobals();
    });

    it('registers ai-outline with plain default output', () => {
        expect(command?.defaultFormat).toBe('plain');
    });

    it('maps chapterInfos auth-expired responses to AUTH_REQUIRED', async () => {
        expect(command?.func).toBeTypeOf('function');
        const page = makePage();
        // errcode -2012 is the payload this test treats as "login expired".
        vi.stubGlobal('fetch', vi.fn().mockResolvedValue(jsonResponse({ errcode: -2012, errmsg: '登录超时' })));
        await expect(command.func(page, { 'book-id': 'book-1' })).rejects.toMatchObject({
            code: 'AUTH_REQUIRED',
            message: 'Not logged in to WeRead',
        });
    });

    it('returns structured rows for --raw and respects depth filtering', async () => {
        expect(command?.func).toBeTypeOf('function');
        const page = makePage();
        const fetchMock = vi.fn()
            // First request: chapter list.
            .mockResolvedValueOnce(jsonResponse({
                data: [{
                    updated: [
                        { chapterUid: 'c1', title: '第一章' },
                    ],
                }],
            }))
            // Second request: outline items for the chapter.
            .mockResolvedValueOnce(jsonResponse({
                itemsArray: [{
                    chapterUid: 'c1',
                    items: [
                        { level: 2, uiIdx: '1', text: '主题一' },
                        { level: 3, uiIdx: '1.1', text: '要点一' },
                        { level: 4, text: '细节一' },
                    ],
                }],
            }));
        vi.stubGlobal('fetch', fetchMock);
        const rows = await command.func(page, { 'book-id': 'book-1', raw: true, depth: 3, limit: 10 });
        // The level-4 item is absent: depth 3 filters it out.
        expect(rows).toEqual([
            { chapter: '第一章', idx: '1', level: 2, text: '主题一' },
            { chapter: '第一章', idx: '1.1', level: 3, text: '要点一' },
        ]);
        expect(fetchMock).toHaveBeenNthCalledWith(1, 'https://weread.qq.com/web/book/chapterInfos', expect.objectContaining({
            method: 'POST',
            headers: expect.objectContaining({
                Cookie: 'wr_name=alice; wr_vid=vid123',
            }),
        }));
        expect(fetchMock).toHaveBeenNthCalledWith(2, 'https://weread.qq.com/web/book/outline', expect.objectContaining({
            method: 'POST',
        }));
    });
});
|
package/clis/weread/book.js
CHANGED
|
@@ -37,6 +37,61 @@ function countSearchIdentities(entries) {
|
|
|
37
37
|
}
|
|
38
38
|
return counts;
|
|
39
39
|
}
|
|
40
|
+
/**
 * Derive a book title from a WeRead `document.title` string.
 *
 * Only the strict "<title> - 微信读书" shape is accepted. If the part before
 * the suffix still contains a " - " separator the title is ambiguous and an
 * empty string is returned instead of a guess.
 *
 * @param {unknown} rawTitle - Raw document title; falsy values count as empty.
 * @returns {string} The trimmed title, or '' when the shape does not match.
 */
export function strictTitleFromWereadDocumentTitle(rawTitle) {
    const WEREAD_SUFFIX = ' - 微信读书';
    const SEPARATOR = ' - ';
    const cleaned = String(rawTitle || '').trim();
    if (cleaned.endsWith(WEREAD_SUFFIX)) {
        const candidate = cleaned
            .substring(0, cleaned.length - WEREAD_SUFFIX.length)
            .trim();
        // A leftover separator means more than one segment preceded the suffix.
        if (candidate.indexOf(SEPARATOR) === -1) {
            return candidate;
        }
    }
    return '';
}
|
|
50
|
+
/**
 * Extract book metadata from the DOM of a WeRead reader page.
 *
 * IMPORTANT: `loadReaderFallbackResult` serializes this function with
 * `Function.prototype.toString()` and evaluates it inside the page, so it
 * must be self-contained: it may only use its `doc` argument and its own
 * locals. Referencing a module-level helper would throw a ReferenceError in
 * the page. For that reason the strict document-title parsing is inlined
 * here rather than calling `strictTitleFromWereadDocumentTitle`.
 *
 * @param {Document} doc - Document (or a minimal stand-in exposing
 *   `querySelector`, and optionally `title`, `body.innerText`, `scripts`).
 * @returns {{title: string, author: string, publisher: string, intro: string,
 *   category: string, rating: string, metadataReady: boolean}} Extracted
 *   fields; any field that cannot be found is ''. `metadataReady` is true
 *   when a title or an author was found.
 */
export function extractReaderFallbackMetadata(doc) {
    const text = (node) => node?.textContent?.trim() || '';
    // Text of the first selector that yields non-empty content, else ''.
    const firstText = (...selectors) => {
        for (const selector of selectors) {
            const value = text(doc.querySelector(selector));
            if (value)
                return value;
        }
        return '';
    };
    // Inlined strict "<title> - 微信读书" parser (see the note above). A
    // remaining " - " separator makes the title ambiguous, so return ''.
    const strictDocumentTitle = (rawTitle) => {
        const suffix = ' - 微信读书';
        const normalized = String(rawTitle || '').trim();
        if (!normalized.endsWith(suffix))
            return '';
        const base = normalized.slice(0, -suffix.length).trim();
        return base.includes(' - ') ? '' : base;
    };
    const bodyText = doc.body?.innerText?.replace(/\s+/g, ' ').trim() || '';
    const extractRating = () => {
        const match = bodyText.match(/微信读书推荐值\s*([0-9.]+%)/);
        return match ? match[1] : '';
    };
    const extractPublisher = () => {
        const direct = text(doc.querySelector('.introDialog_content_pub_line'));
        // Only trust the line when it carries the "出版社" label.
        return direct.startsWith('出版社') ? direct.replace(/^出版社\s*/, '').trim() : '';
    };
    const extractIntro = () => firstText(
        '.horizontalReaderCoverPage_content_bookInfo_intro',
        '.wr_flyleaf_page_bookIntro_content',
        '.introDialog_content_intro_para',
    );
    // Category is read from the first inline script that mentions "category".
    const categorySource = Array.from(doc.scripts || [])
        .map((script) => script.textContent || '')
        .find((scriptText) => scriptText.includes('"category"')) || '';
    const categoryMatch = categorySource.match(/"category"\s*:\s*"([^"]+)"/);
    // Prefer DOM title nodes; fall back to the strict document title only.
    const title = firstText(
        '.horizontalReaderCoverPage_content_bookTitle',
        '.wr_flyleaf_page_bookInfo_bookTitle',
        '.outline_book_detail_header_title',
        '.readerTopBar_title_link',
    ) || strictDocumentTitle(doc.title || '');
    // The author is never guessed from document.title.
    const author = firstText(
        '.horizontalReaderCoverPage_content_author',
        '.wr_flyleaf_page_bookInfo_author',
        '.outline_book_detail_header_author',
    );
    return {
        title,
        author,
        publisher: extractPublisher(),
        intro: extractIntro(),
        category: categoryMatch ? categoryMatch[1].trim() : '',
        rating: extractRating(),
        metadataReady: Boolean(title || author),
    };
}
|
|
40
95
|
/**
|
|
41
96
|
* Reuse the public search page as a last-resort reader URL source when the
|
|
42
97
|
* cached shelf page cannot provide a trustworthy bookId-to-reader mapping.
|
|
@@ -108,51 +163,9 @@ async function resolveSearchReaderUrl(title, author) {
|
|
|
108
163
|
*/
|
|
109
164
|
async function loadReaderFallbackResult(page, readerUrl) {
|
|
110
165
|
await page.goto(readerUrl);
|
|
111
|
-
await page.wait({ selector: '.horizontalReaderCoverPage_content_bookTitle, .wr_flyleaf_page_bookInfo_bookTitle', timeout: 10 });
|
|
166
|
+
await page.wait({ selector: '.horizontalReaderCoverPage_content_bookTitle, .wr_flyleaf_page_bookInfo_bookTitle, .readerTopBar_title_link', timeout: 10 });
|
|
112
167
|
const result = await page.evaluate(`
|
|
113
|
-
(()
|
|
114
|
-
const text = (node) => node?.textContent?.trim() || '';
|
|
115
|
-
const bodyText = document.body?.innerText?.replace(/\\s+/g, ' ').trim() || '';
|
|
116
|
-
const titleSelector = '.horizontalReaderCoverPage_content_bookTitle, .wr_flyleaf_page_bookInfo_bookTitle';
|
|
117
|
-
const authorSelector = '.horizontalReaderCoverPage_content_author, .wr_flyleaf_page_bookInfo_author';
|
|
118
|
-
const extractRating = () => {
|
|
119
|
-
const match = bodyText.match(/微信读书推荐值\\s*([0-9.]+%)/);
|
|
120
|
-
return match ? match[1] : '';
|
|
121
|
-
};
|
|
122
|
-
const extractPublisher = () => {
|
|
123
|
-
const direct = text(document.querySelector('.introDialog_content_pub_line'));
|
|
124
|
-
return direct.startsWith('出版社') ? direct.replace(/^出版社\\s*/, '').trim() : '';
|
|
125
|
-
};
|
|
126
|
-
const extractIntro = () => {
|
|
127
|
-
const selectors = [
|
|
128
|
-
'.horizontalReaderCoverPage_content_bookInfo_intro',
|
|
129
|
-
'.wr_flyleaf_page_bookIntro_content',
|
|
130
|
-
'.introDialog_content_intro_para',
|
|
131
|
-
];
|
|
132
|
-
for (const selector of selectors) {
|
|
133
|
-
const value = text(document.querySelector(selector));
|
|
134
|
-
if (value) return value;
|
|
135
|
-
}
|
|
136
|
-
return '';
|
|
137
|
-
};
|
|
138
|
-
|
|
139
|
-
const categorySource = Array.from(document.scripts)
|
|
140
|
-
.map((script) => script.textContent || '')
|
|
141
|
-
.find((scriptText) => scriptText.includes('"category"')) || '';
|
|
142
|
-
const categoryMatch = categorySource.match(/"category"\\s*:\\s*"([^"]+)"/);
|
|
143
|
-
const title = text(document.querySelector(titleSelector));
|
|
144
|
-
const author = text(document.querySelector(authorSelector));
|
|
145
|
-
|
|
146
|
-
return {
|
|
147
|
-
title,
|
|
148
|
-
author,
|
|
149
|
-
publisher: extractPublisher(),
|
|
150
|
-
intro: extractIntro(),
|
|
151
|
-
category: categoryMatch ? categoryMatch[1].trim() : '',
|
|
152
|
-
rating: extractRating(),
|
|
153
|
-
metadataReady: Boolean(title || author),
|
|
154
|
-
};
|
|
155
|
-
})()
|
|
168
|
+
(${extractReaderFallbackMetadata.toString()})(document)
|
|
156
169
|
`);
|
|
157
170
|
return {
|
|
158
171
|
title: String(result?.title || '').trim(),
|
|
@@ -14,6 +14,7 @@ import { getRegistry } from '@jackwener/opencli/registry';
|
|
|
14
14
|
import './book.js';
|
|
15
15
|
import './highlights.js';
|
|
16
16
|
import './notes.js';
|
|
17
|
+
import { extractReaderFallbackMetadata, strictTitleFromWereadDocumentTitle } from './book.js';
|
|
17
18
|
describe('weread book-id positional args', () => {
|
|
18
19
|
const book = getRegistry().get('weread/book');
|
|
19
20
|
const highlights = getRegistry().get('weread/highlights');
|
|
@@ -356,6 +357,29 @@ describe('weread book-id positional args', () => {
|
|
|
356
357
|
message: 'Not logged in to WeRead',
|
|
357
358
|
});
|
|
358
359
|
});
|
|
360
|
+
it('does not guess author from document.title when the reader page skips cover metadata', async () => {
|
|
361
|
+
const nodes = new Map([
|
|
362
|
+
['.readerTopBar_title_link', { textContent: 'Part 1 - Part 2' }],
|
|
363
|
+
['.introDialog_content_pub_line', { textContent: '出版社 测试出版社' }],
|
|
364
|
+
['.introDialog_content_intro_para', { textContent: '测试简介。' }],
|
|
365
|
+
]);
|
|
366
|
+
const mockDocument = {
|
|
367
|
+
title: 'Part 1 - Part 2 - 作者甲 - 微信读书',
|
|
368
|
+
body: { innerText: '微信读书推荐值 88.8%' },
|
|
369
|
+
scripts: [],
|
|
370
|
+
querySelector: (selector) => nodes.get(selector) || null,
|
|
371
|
+
};
|
|
372
|
+
expect(strictTitleFromWereadDocumentTitle(mockDocument.title)).toBe('');
|
|
373
|
+
expect(extractReaderFallbackMetadata(mockDocument)).toEqual({
|
|
374
|
+
title: 'Part 1 - Part 2',
|
|
375
|
+
author: '',
|
|
376
|
+
publisher: '测试出版社',
|
|
377
|
+
intro: '测试简介。',
|
|
378
|
+
category: '',
|
|
379
|
+
rating: '88.8%',
|
|
380
|
+
metadataReady: true,
|
|
381
|
+
});
|
|
382
|
+
});
|
|
359
383
|
it('passes the positional book-id to highlights', async () => {
|
|
360
384
|
mockFetchPrivateApi.mockResolvedValue({ updated: [] });
|
|
361
385
|
await highlights.func({}, { 'book-id': 'abc', limit: 5 });
|
|
@@ -20,9 +20,9 @@ cli({
|
|
|
20
20
|
throw new CliError('INVALID_ARGUMENT', 'limit must be a positive integer', 'Example: --limit 5');
|
|
21
21
|
}
|
|
22
22
|
const credentials = loadXiaoyuzhouCredentials();
|
|
23
|
-
const response = await requestXiaoyuzhouJson('/v1/
|
|
23
|
+
const response = await requestXiaoyuzhouJson('/v1/episode/list', {
|
|
24
24
|
method: 'POST',
|
|
25
|
-
body: { pid: args.id, limit: requestedLimit },
|
|
25
|
+
body: { pid: args.id, order: 'desc', limit: requestedLimit },
|
|
26
26
|
credentials,
|
|
27
27
|
});
|
|
28
28
|
const episodes = response.data ?? [];
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import { beforeAll, beforeEach, describe, expect, it, vi } from 'vitest';
|
|
2
|
+
import { getRegistry } from '@jackwener/opencli/registry';
|
|
3
|
+
|
|
4
|
+
// Mock functions are created through vi.hoisted() so they can be referenced
// from the vi.mock() factory below (per Vitest's documented hoisting of
// vi.mock calls above imports).
const { mockRequestJson, mockLoadCredentials } = vi.hoisted(() => ({
    mockRequestJson: vi.fn(),
    mockLoadCredentials: vi.fn(),
}));

// Replace only the two auth helpers the command calls; every other export of
// ./auth.js keeps its real implementation.
vi.mock('./auth.js', async () => {
    const actual = await vi.importActual('./auth.js');
    return {
        ...actual,
        requestXiaoyuzhouJson: mockRequestJson,
        loadXiaoyuzhouCredentials: mockLoadCredentials,
    };
});

// Import the command module after the mock is declared; the tests below then
// look the command up in the registry.
await import('./podcast-episodes.js');

// Registry entry under test, resolved once in beforeAll.
let cmd;

beforeAll(() => {
    cmd = getRegistry().get('xiaoyuzhou/podcast-episodes');
    // Fail early if the command was not registered with a callable func.
    expect(cmd?.func).toBeTypeOf('function');
});
|
|
26
|
+
|
|
27
|
+
/**
 * Behavior tests for `xiaoyuzhou/podcast-episodes`: the request sent to the
 * episode-list endpoint, the row mapping, and argument validation.
 */
describe('xiaoyuzhou podcast-episodes', () => {
    const PODCAST_ID = 'podcast-1';

    beforeEach(() => {
        // Start every test with clean mocks and a fixed credential pair.
        for (const mock of [mockRequestJson, mockLoadCredentials]) {
            mock.mockReset();
        }
        mockLoadCredentials.mockReturnValue({ access_token: 'access', refresh_token: 'refresh' });
    });

    it('calls the fixed episode list endpoint with desc ordering', async () => {
        const apiEpisode = {
            eid: 'ep-1',
            title: 'Episode 1',
            duration: 3661,
            playCount: 42,
            pubDate: '2026-04-20T10:00:00.000Z',
        };
        mockRequestJson.mockResolvedValue({ data: [apiEpisode] });

        const args = { id: PODCAST_ID, limit: 3 };
        const result = await cmd.func(null, args);

        const expectedRequest = {
            method: 'POST',
            body: { pid: 'podcast-1', order: 'desc', limit: 3 },
            credentials: { access_token: 'access', refresh_token: 'refresh' },
        };
        expect(mockRequestJson).toHaveBeenCalledWith('/v1/episode/list', expectedRequest);

        // 3661 seconds is rendered as '61:01'; the date is cut to YYYY-MM-DD.
        const expectedRow = {
            eid: 'ep-1',
            title: 'Episode 1',
            duration: '61:01',
            plays: 42,
            date: '2026-04-20',
        };
        expect(result).toEqual([expectedRow]);
    });

    it('rejects non-positive limits before hitting the API', async () => {
        const args = { id: PODCAST_ID, limit: 0 };
        const expectedError = {
            code: 'INVALID_ARGUMENT',
            message: 'limit must be a positive integer',
        };
        await expect(cmd.func(null, args)).rejects.toMatchObject(expectedError);
        expect(mockRequestJson).not.toHaveBeenCalled();
    });
});
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `browser analyze <url>` — turn site-recon guesswork into deterministic CLI output.
|
|
3
|
+
*
|
|
4
|
+
* When an agent starts a new adapter, the first question is "which pattern am
|
|
5
|
+
* I looking at?" (A/B/C/D/E from site-recon docs) and "will Node-side fetch
|
|
6
|
+
* work, or will anti-bot middleware block me?". Today the agent has to open
|
|
7
|
+
* the page, poke `network`, try cURL, fail, guess again. This module condenses
|
|
8
|
+
* that into one call that returns a classification + evidence.
|
|
9
|
+
*
|
|
10
|
+
* Kept pure (no page imports) so the bulk is unit-testable; the CLI wrapper
|
|
11
|
+
* drives a real page, feeds the resulting signals here, and prints the verdict.
|
|
12
|
+
*/
|
|
13
|
+
import type { CliCommand } from '../registry.js';
|
|
14
|
+
export interface PageSignals {
|
|
15
|
+
/** URL we navigated to (may redirect; both fields help agents notice that). */
|
|
16
|
+
requestedUrl: string;
|
|
17
|
+
finalUrl: string;
|
|
18
|
+
/** document.cookie split into names; value not needed for detection. */
|
|
19
|
+
cookieNames: string[];
|
|
20
|
+
/**
|
|
21
|
+
* Response bodies captured during the navigation + first few seconds.
|
|
22
|
+
* We only need enough body text to spot WAF markers; the CLI truncates
|
|
23
|
+
* per-entry before feeding us.
|
|
24
|
+
*/
|
|
25
|
+
networkEntries: Array<{
|
|
26
|
+
url: string;
|
|
27
|
+
status: number;
|
|
28
|
+
contentType: string;
|
|
29
|
+
/** First N chars of body; null when not available. */
|
|
30
|
+
bodyPreview: string | null;
|
|
31
|
+
}>;
|
|
32
|
+
/**
|
|
33
|
+
* Which globals the page exposes on `window`. We don't care about the values,
|
|
34
|
+
* just presence — distinguishes Pattern B (SSR state) from Pattern A.
|
|
35
|
+
*/
|
|
36
|
+
initialState: {
|
|
37
|
+
__INITIAL_STATE__: boolean;
|
|
38
|
+
__NUXT__: boolean;
|
|
39
|
+
__NEXT_DATA__: boolean;
|
|
40
|
+
__APOLLO_STATE__: boolean;
|
|
41
|
+
};
|
|
42
|
+
/** Document title — only for the human-debug `summary` field. */
|
|
43
|
+
title: string;
|
|
44
|
+
}
|
|
45
|
+
export type AntiBotVendor = 'aliyun_waf' | 'cloudflare' | 'akamai' | 'geetest' | 'unknown';
|
|
46
|
+
export interface AntiBotVerdict {
|
|
47
|
+
detected: boolean;
|
|
48
|
+
vendor: AntiBotVendor | null;
|
|
49
|
+
evidence: string[];
|
|
50
|
+
/** One-line imperative instruction for the agent. */
|
|
51
|
+
implication: string;
|
|
52
|
+
}
|
|
53
|
+
export declare function detectAntiBot(signals: PageSignals): AntiBotVerdict;
|
|
54
|
+
export type Pattern = 'A' | 'B' | 'C' | 'D' | 'E' | 'unknown';
|
|
55
|
+
export interface PatternVerdict {
|
|
56
|
+
pattern: Pattern;
|
|
57
|
+
reason: string;
|
|
58
|
+
/** How many JSON XHR/fetch responses we saw during navigation. */
|
|
59
|
+
json_responses: number;
|
|
60
|
+
/** Count of responses with HTTP status 401 or 403 (any content type) — hint for token-gated (Pattern D). */
|
|
61
|
+
auth_failures: number;
|
|
62
|
+
}
|
|
63
|
+
/**
|
|
64
|
+
* Apply the decision tree from `site-recon.md` mechanically.
|
|
65
|
+
*
|
|
66
|
+
* B beats A when initial-state globals are present: even if the page fetches
|
|
67
|
+
* more data via XHR afterwards, the SSR payload is the highest-leverage source.
|
|
68
|
+
* D (token-gated) dominates when we see 401/403 on what looks like API
|
|
69
|
+
* endpoints — without that, an authenticated route looks identical to A.
|
|
70
|
+
*/
|
|
71
|
+
export declare function classifyPattern(signals: PageSignals): PatternVerdict;
|
|
72
|
+
export interface NearestAdapter {
|
|
73
|
+
site: string;
|
|
74
|
+
example_commands: string[];
|
|
75
|
+
reason: string;
|
|
76
|
+
}
|
|
77
|
+
/**
|
|
78
|
+
* Find existing adapters that target the same site.
|
|
79
|
+
*
|
|
80
|
+
* Keep the hostname match simple — agents extend naming conventions
|
|
81
|
+
* differently per site, so we match on the registered `domain` field and fall
|
|
82
|
+
* back to site-name containment. Returning `null` is fine; agents can always
|
|
83
|
+
* read site-memory docs.
|
|
84
|
+
*/
|
|
85
|
+
export declare function findNearestAdapter(finalUrl: string, registry: Map<string, CliCommand>): NearestAdapter | null;
|
|
86
|
+
export interface AnalyzeReport {
|
|
87
|
+
requested_url: string;
|
|
88
|
+
final_url: string;
|
|
89
|
+
title: string;
|
|
90
|
+
pattern: PatternVerdict;
|
|
91
|
+
anti_bot: AntiBotVerdict;
|
|
92
|
+
initial_state: PageSignals['initialState'];
|
|
93
|
+
nearest_adapter: NearestAdapter | null;
|
|
94
|
+
recommended_next_step: string;
|
|
95
|
+
}
|
|
96
|
+
/**
|
|
97
|
+
* Synthesize the verdict from collected signals + registry.
|
|
98
|
+
*
|
|
99
|
+
* The `recommended_next_step` is deliberately a single imperative
|
|
100
|
+
* sentence — agents act on it directly instead of re-deriving advice from
|
|
101
|
+
* the structured fields.
|
|
102
|
+
*/
|
|
103
|
+
export declare function analyzeSite(signals: PageSignals, registry: Map<string, CliCommand>): AnalyzeReport;
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `browser analyze <url>` — turn site-recon guesswork into deterministic CLI output.
|
|
3
|
+
*
|
|
4
|
+
* When an agent starts a new adapter, the first question is "which pattern am
|
|
5
|
+
* I looking at?" (A/B/C/D/E from site-recon docs) and "will Node-side fetch
|
|
6
|
+
* work, or will anti-bot middleware block me?". Today the agent has to open
|
|
7
|
+
* the page, poke `network`, try cURL, fail, guess again. This module condenses
|
|
8
|
+
* that into one call that returns a classification + evidence.
|
|
9
|
+
*
|
|
10
|
+
* Kept pure (no page imports) so the bulk is unit-testable; the CLI wrapper
|
|
11
|
+
* drives a real page, feeds the resulting signals here, and prints the verdict.
|
|
12
|
+
*/
|
|
13
|
+
/**
 * WAF vendors we detect from cookie names and response-body markers alone.
 *
 * Signatures are evaluated in table order. When more than one vendor matches
 * (rare), the first matching vendor in this table is the one reported, and
 * the evidence of every matching vendor is kept, primary vendor first.
 */
const WAF_SIGNATURES = [
    {
        vendor: 'aliyun_waf',
        cookiePatterns: [/^acw_sc__v2$/, /^acw_tc$/, /^ssxmod_itna/],
        bodyPatterns: [/arg1\s*=\s*['"][0-9A-F]{30,}/, /\/ntc_captcha\//i],
        implication: 'Direct Node-side fetch/curl will return the slider HTML. Validate the endpoint in browser context first; HTML COOKIE adapters still finish with Node-side fetch + page.getCookies.',
    },
    {
        vendor: 'cloudflare',
        cookiePatterns: [/^__cf_bm$/, /^cf_clearance$/, /^__cfduid$/],
        bodyPatterns: [/Cloudflare Ray ID/i, /Checking your browser before accessing/i, /cf-chl-/i],
        implication: 'Cloudflare bot check. Start from a real browser session; probe in browser context first. HTML COOKIE adapters still finish with Node-side fetch + page.getCookies.',
    },
    {
        vendor: 'akamai',
        cookiePatterns: [/^_abck$/, /^bm_sz$/, /^bm_sv$/],
        bodyPatterns: [/akamai/i],
        implication: 'Akamai Bot Manager. Probe in browser context first; keep final HTML COOKIE adapters on Node-side fetch + page.getCookies.',
    },
    {
        vendor: 'geetest',
        cookiePatterns: [],
        bodyPatterns: [/geetest/i, /gt_captcha/i],
        implication: 'Geetest slider/puzzle captcha. Agent cannot bypass programmatically — requires UI strategy or human-in-loop.',
    },
];
/**
 * Detect known anti-bot middleware from collected page signals.
 *
 * For each signature, every cookie pattern contributes the first matching
 * cookie name (`cookie:<name>`) and every body pattern contributes the first
 * network entry whose `bodyPreview` matches (`body:<url>`). Entries with a
 * null/empty `bodyPreview` are ignored.
 *
 * @param {{cookieNames: string[], networkEntries: Array<{url: string, bodyPreview: string | null}>}} signals
 *   Collected page signals; only `cookieNames` and `networkEntries` are read.
 * @returns {{detected: boolean, vendor: string | null, evidence: string[], implication: string}}
 *   Verdict. `vendor`/`implication` come from the first matching signature;
 *   `evidence` lists the de-duplicated hits of all matching signatures,
 *   primary vendor first.
 */
export function detectAntiBot(signals) {
    let primary = null;
    // A Set keeps insertion order and drops repeats, e.g. two body patterns
    // of one vendor matching the same response.
    const evidence = new Set();
    for (const sig of WAF_SIGNATURES) {
        const hits = [];
        for (const pat of sig.cookiePatterns) {
            const hit = signals.cookieNames.find((c) => pat.test(c));
            if (hit)
                hits.push(`cookie:${hit}`);
        }
        for (const pat of sig.bodyPatterns) {
            const entry = signals.networkEntries.find((e) => e.bodyPreview && pat.test(e.bodyPreview));
            if (entry)
                hits.push(`body:${entry.url}`);
        }
        if (hits.length === 0)
            continue;
        // The first matching signature decides vendor and implication; later
        // matches only add evidence.
        if (!primary)
            primary = sig;
        for (const hit of hits)
            evidence.add(hit);
    }
    if (!primary) {
        return {
            detected: false,
            vendor: null,
            evidence: [],
            implication: 'No known anti-bot signatures. Try Node-side COOKIE fetch first; if endpoint validation is blocked, retry from browser context.',
        };
    }
    return {
        detected: true,
        vendor: primary.vendor,
        evidence: [...evidence],
        implication: primary.implication,
    };
}
|
|
83
|
+
/**
 * Classify a page into a site-recon pattern (A/B/C/D) from collected signals.
 *
 * Precedence, highest first:
 *   D — at least two 401/403 responses and at least one JSON response;
 *   B — any of the four SSR state globals is present;
 *   A — at least one JSON response;
 *   C — none of the above (E is never produced here; it cannot be detected
 *       without watching WebSocket frames).
 *
 * @param {{networkEntries: Array<{status: number, contentType: string}>, initialState: Record<string, boolean>}} signals
 * @returns {{pattern: string, reason: string, json_responses: number, auth_failures: number}}
 */
export function classifyPattern(signals) {
    const { networkEntries, initialState } = signals;
    let jsonCount = 0;
    let authFailureCount = 0;
    for (const entry of networkEntries) {
        if (/json/i.test(entry.contentType)) {
            jsonCount += 1;
        }
        // Auth failures are counted across all responses, not only JSON ones.
        if (entry.status === 401 || entry.status === 403) {
            authFailureCount += 1;
        }
    }
    // Every verdict carries the same two counters.
    const verdict = (pattern, reason) => ({
        pattern,
        reason,
        json_responses: jsonCount,
        auth_failures: authFailureCount,
    });
    if (authFailureCount >= 2 && jsonCount >= 1) {
        return verdict('D', `${authFailureCount} auth-failing API responses seen — endpoint is token-gated`);
    }
    const knownGlobals = ['__INITIAL_STATE__', '__NUXT__', '__NEXT_DATA__', '__APOLLO_STATE__'];
    if (knownGlobals.some((name) => initialState[name])) {
        const present = [];
        for (const [name, flag] of Object.entries(initialState)) {
            if (flag) {
                present.push(name);
            }
        }
        return verdict('B', `SSR state global present: ${present.join(', ')}`);
    }
    if (jsonCount >= 1) {
        return verdict('A', `${jsonCount} JSON XHR/fetch responses observed — classic API pattern`);
    }
    // No API and no SSR state: default to HTML scrape. Upgrading to E is left
    // to the agent if it sees WebSocket traffic.
    return verdict('C', 'No JSON XHR and no SSR state — HTML scrape (Pattern C); escalate to E manually if WebSocket traffic appears');
}
|
|
136
|
+
/**
 * Find existing adapters that target the same site as `finalUrl`.
 *
 * A command matches when any of the following holds:
 *   - its registered `domain` equals the host or is a parent domain of it;
 *   - its `domain` equals the host's last-two-label suffix ("apex") or is a
 *     subdomain of it;
 *   - its `site` name equals the second-to-last host label;
 *   - its `site` name is contained in the host.
 *
 * Domain comparisons are made on label boundaries, so `example.com` does not
 * match `ample.com` or `notexample.com`. Commands without a site name are
 * ignored, because an empty name would be "contained" in every host.
 *
 * @param {string} finalUrl - URL the browser ended up on.
 * @param {Map<string, {site: string, name: string, domain?: string}>} registry
 *   Registered commands; only the values are inspected.
 * @returns {{site: string, example_commands: string[], reason: string} | null}
 *   The matching site with the most commands (first one wins ties) and up to
 *   five of its commands, or null when the URL is invalid or nothing matches.
 */
export function findNearestAdapter(finalUrl, registry) {
    let host;
    try {
        host = new URL(finalUrl).hostname;
    }
    catch {
        return null;
    }
    // Strip leading www.; 'www' as a site identifier is never what an adapter uses.
    const cleanedHost = host.replace(/^www\./, '');
    // Last two labels ("apex") and the second-to-last label, for fuzzy matching.
    const parts = cleanedHost.split('.');
    const apex = parts.slice(-2).join('.');
    const siteKey = parts.length > 1 ? parts[parts.length - 2] : cleanedHost;
    // True when `name` is `parent` itself or a subdomain of it.
    const isSameOrSubdomain = (name, parent) => name === parent || name.endsWith(`.${parent}`);
    const hits = new Map();
    for (const cmd of registry.values()) {
        const site = typeof cmd.site === 'string' ? cmd.site.toLowerCase() : '';
        if (!site)
            continue;
        const domain = cmd.domain?.toLowerCase();
        const domainMatches = Boolean(domain) &&
            (isSameOrSubdomain(cleanedHost, domain) || isSameOrSubdomain(domain, apex));
        const siteMatches = domainMatches ||
            site === siteKey ||
            cleanedHost.includes(site);
        if (siteMatches) {
            const list = hits.get(cmd.site) ?? [];
            list.push(cmd);
            hits.set(cmd.site, list);
        }
    }
    if (hits.size === 0)
        return null;
    // Pick the site with the most commands — likely the most-developed adapter,
    // and the best reference for a new command on the same host.
    let best = null;
    for (const entry of hits) {
        if (!best || entry[1].length > best[1].length)
            best = entry;
    }
    if (!best)
        return null;
    return {
        site: best[0],
        example_commands: best[1].slice(0, 5).map((c) => `${c.site} ${c.name}`),
        reason: `${best[1].length} existing adapter${best[1].length === 1 ? '' : 's'} target this site — reuse strategy/cookie config`,
    };
}
|
|
187
|
+
/**
 * Advice sentence for a classified pattern when no anti-bot layer was found.
 * Unrecognized pattern codes get the generic "inspect manually" advice.
 */
function nextStepForPattern(patternCode) {
    switch (patternCode) {
        case 'A':
            return 'Pick the most specific JSON endpoint from `opencli browser network` and try a bare Node fetch with cookies; escalate to browser-context fetch only if blocked.';
        case 'B':
            return 'Read the SSR global via `opencli browser eval "JSON.stringify(window.__INITIAL_STATE__ ?? window.__NUXT__ ?? window.__NEXT_DATA__ ?? window.__APOLLO_STATE__)"` — no API needed.';
        case 'C':
            return 'No API visible — use SSR HTML scrape (e.g. `opencli browser extract`) against the rendered page.';
        case 'D':
            return 'Endpoints need auth. Re-open the page from a signed-in session, then retry analyze; see `field-decode-playbook` §4 for token tracing.';
        case 'E':
            return 'WebSocket stream detected — find the underlying HTTP poll/long-poll endpoint; raw WS is not supported.';
        default:
            return 'No strong signal. Manually inspect `opencli browser network --all` and pick a pattern.';
    }
}
/**
 * Build the full analyze report from collected signals and the registry.
 *
 * `recommended_next_step` is one imperative sentence: the anti-bot
 * implication when a WAF was detected, otherwise the advice for the
 * classified pattern.
 *
 * @param {object} signals - Page signals; `requestedUrl`, `finalUrl`,
 *   `title` and `initialState` are copied into the report.
 * @param {Map<string, object>} registry - Registered commands, used to find
 *   the nearest existing adapter.
 * @returns {object} Report with `requested_url`, `final_url`, `title`,
 *   `pattern`, `anti_bot`, `initial_state`, `nearest_adapter` and
 *   `recommended_next_step`.
 */
export function analyzeSite(signals, registry) {
    const patternVerdict = classifyPattern(signals);
    const antiBotVerdict = detectAntiBot(signals);
    const nearestAdapter = findNearestAdapter(signals.finalUrl, registry);
    // Anti-bot advice takes priority over pattern-specific advice.
    const recommendation = antiBotVerdict.detected
        ? antiBotVerdict.implication
        : nextStepForPattern(patternVerdict.pattern);
    return {
        requested_url: signals.requestedUrl,
        final_url: signals.finalUrl,
        title: signals.title,
        pattern: patternVerdict,
        anti_bot: antiBotVerdict,
        initial_state: signals.initialState,
        nearest_adapter: nearestAdapter,
        recommended_next_step: recommendation,
    };
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|