@jackwener/opencli 1.7.6 → 1.7.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/README.md +17 -8
  2. package/README.zh-CN.md +14 -8
  3. package/cli-manifest.json +325 -11
  4. package/clis/51job/company.js +125 -0
  5. package/clis/51job/detail.js +108 -0
  6. package/clis/51job/hot.js +55 -0
  7. package/clis/51job/search.js +79 -0
  8. package/clis/51job/utils.js +302 -0
  9. package/clis/51job/utils.test.js +69 -0
  10. package/clis/bilibili/video.js +11 -4
  11. package/clis/bilibili/video.test.js +51 -0
  12. package/clis/chatgpt/image.js +1 -1
  13. package/clis/deepseek/ask.js +19 -13
  14. package/clis/deepseek/ask.test.js +93 -1
  15. package/clis/deepseek/utils.js +108 -23
  16. package/clis/deepseek/utils.test.js +109 -1
  17. package/clis/gemini/image.js +1 -1
  18. package/clis/instagram/download.js +1 -1
  19. package/clis/twitter/likes.js +3 -2
  20. package/clis/twitter/search.js +4 -2
  21. package/clis/twitter/search.test.js +4 -0
  22. package/clis/twitter/shared.js +28 -0
  23. package/clis/twitter/shared.test.js +96 -0
  24. package/clis/twitter/thread.js +3 -1
  25. package/clis/twitter/timeline.js +3 -2
  26. package/clis/twitter/tweets.js +3 -2
  27. package/clis/twitter/tweets.test.js +1 -1
  28. package/clis/web/read.js +25 -5
  29. package/clis/web/read.test.js +76 -0
  30. package/clis/weread/ai-outline.js +170 -0
  31. package/clis/weread/ai-outline.test.js +83 -0
  32. package/clis/weread/book.js +57 -44
  33. package/clis/weread/commands.test.js +24 -0
  34. package/clis/xiaoyuzhou/podcast-episodes.js +2 -2
  35. package/clis/xiaoyuzhou/podcast-episodes.test.js +78 -0
  36. package/dist/src/browser/analyze.d.ts +103 -0
  37. package/dist/src/browser/analyze.js +230 -0
  38. package/dist/src/browser/analyze.test.d.ts +1 -0
  39. package/dist/src/browser/analyze.test.js +164 -0
  40. package/dist/src/browser/article-extract.d.ts +57 -0
  41. package/dist/src/browser/article-extract.e2e.test.d.ts +1 -0
  42. package/dist/src/browser/article-extract.e2e.test.js +105 -0
  43. package/dist/src/browser/article-extract.js +169 -0
  44. package/dist/src/browser/article-extract.test.d.ts +1 -0
  45. package/dist/src/browser/article-extract.test.js +94 -0
  46. package/dist/src/browser/cdp.js +11 -2
  47. package/dist/src/browser/verify-fixture.d.ts +59 -0
  48. package/dist/src/browser/verify-fixture.js +213 -0
  49. package/dist/src/browser/verify-fixture.test.d.ts +1 -0
  50. package/dist/src/browser/verify-fixture.test.js +161 -0
  51. package/dist/src/cli.d.ts +32 -0
  52. package/dist/src/cli.js +333 -43
  53. package/dist/src/cli.test.js +257 -1
  54. package/dist/src/daemon.d.ts +3 -2
  55. package/dist/src/daemon.js +16 -4
  56. package/dist/src/daemon.test.d.ts +1 -0
  57. package/dist/src/daemon.test.js +19 -0
  58. package/dist/src/download/article-download.d.ts +12 -0
  59. package/dist/src/download/article-download.js +141 -17
  60. package/dist/src/download/article-download.test.js +196 -0
  61. package/dist/src/download/index.js +73 -86
  62. package/dist/src/errors.js +4 -2
  63. package/dist/src/errors.test.js +13 -0
  64. package/dist/src/launcher.d.ts +1 -1
  65. package/dist/src/launcher.js +3 -3
  66. package/dist/src/output.js +1 -1
  67. package/dist/src/output.test.js +6 -0
  68. package/package.json +5 -1
@@ -0,0 +1,164 @@
1
+ import { describe, it, expect } from 'vitest';
2
+ import { analyzeSite, detectAntiBot, classifyPattern, findNearestAdapter, } from './analyze.js';
3
/**
 * Build a baseline page-signals fixture for the analyzer tests.
 *
 * Every call returns fresh objects/arrays, so tests can mutate the result
 * without leaking state into each other.
 *
 * @param {object} [overrides] - Fields that replace the baseline values.
 *   A partial `initialState` object is merged over the all-`false` defaults
 *   rather than replacing them wholesale, so `{ initialState: { __NUXT__: true } }`
 *   still yields every SSR flag.
 * @returns {object} The signals fixture.
 */
function mkSignals(overrides = {}) {
    const defaultInitialState = {
        __INITIAL_STATE__: false,
        __NUXT__: false,
        __NEXT_DATA__: false,
        __APOLLO_STATE__: false,
    };
    const signals = {
        requestedUrl: 'https://example.com/',
        finalUrl: 'https://example.com/',
        cookieNames: [],
        networkEntries: [],
        initialState: defaultInitialState,
        title: 'Example',
        ...overrides,
    };
    // Only merge real objects; an explicit null/undefined/primitive override is
    // passed through untouched so tests can still exercise malformed input.
    const stateOverride = overrides?.initialState;
    if (stateOverride !== null && typeof stateOverride === 'object') {
        signals.initialState = { ...defaultInitialState, ...stateOverride };
    }
    return signals;
}
19
/**
 * Create a minimal command-registry entry for the adapter-matching tests.
 *
 * @param {string} site - Site identifier the command belongs to.
 * @param {string} name - Command name.
 * @param {string} [domain] - Registered domain; left `undefined` when omitted.
 * @returns {object} Entry with an empty description and no args.
 */
function mkCmd(site, name, domain) {
    const entry = { site, name, description: '', domain, args: [] };
    return entry;
}
28
+ describe('detectAntiBot', () => {
29
+ it('flags Aliyun WAF from cookie', () => {
30
+ const v = detectAntiBot(mkSignals({ cookieNames: ['JSESSIONID', 'acw_sc__v2'] }));
31
+ expect(v.detected).toBe(true);
32
+ expect(v.vendor).toBe('aliyun_waf');
33
+ expect(v.evidence).toContain('cookie:acw_sc__v2');
34
+ expect(v.implication).toMatch(/browser context/i);
35
+ });
36
+ it('flags Aliyun WAF from challenge HTML body', () => {
37
+ const v = detectAntiBot(mkSignals({
38
+ networkEntries: [
39
+ {
40
+ url: 'https://x.com/',
41
+ status: 200,
42
+ contentType: 'text/html',
43
+ bodyPreview: "var arg1 = 'A1B2C3D4E5F6A7B8C9D0E1F2A3B4C5D6';",
44
+ },
45
+ ],
46
+ }));
47
+ expect(v.detected).toBe(true);
48
+ expect(v.vendor).toBe('aliyun_waf');
49
+ });
50
+ it('flags Cloudflare from cf_clearance cookie', () => {
51
+ const v = detectAntiBot(mkSignals({ cookieNames: ['cf_clearance'] }));
52
+ expect(v.vendor).toBe('cloudflare');
53
+ expect(v.implication).toMatch(/Cloudflare/i);
54
+ });
55
+ it('flags Akamai from _abck cookie', () => {
56
+ const v = detectAntiBot(mkSignals({ cookieNames: ['_abck', 'bm_sz'] }));
57
+ expect(v.vendor).toBe('akamai');
58
+ });
59
+ it('returns no-match verdict with actionable fallback advice', () => {
60
+ const v = detectAntiBot(mkSignals());
61
+ expect(v.detected).toBe(false);
62
+ expect(v.vendor).toBeNull();
63
+ expect(v.implication).toMatch(/Node-side COOKIE fetch first/);
64
+ });
65
+ });
66
+ describe('classifyPattern', () => {
67
+ it('returns A for JSON-heavy pages without SSR state', () => {
68
+ const v = classifyPattern(mkSignals({
69
+ networkEntries: [
70
+ { url: 'https://x.com/api/a', status: 200, contentType: 'application/json', bodyPreview: '{}' },
71
+ { url: 'https://x.com/api/b', status: 200, contentType: 'application/json;charset=utf-8', bodyPreview: '{}' },
72
+ ],
73
+ }));
74
+ expect(v.pattern).toBe('A');
75
+ expect(v.json_responses).toBe(2);
76
+ });
77
+ it('returns B when __INITIAL_STATE__ is present, beating JSON signals', () => {
78
+ const v = classifyPattern(mkSignals({
79
+ initialState: { __INITIAL_STATE__: true, __NUXT__: false, __NEXT_DATA__: false, __APOLLO_STATE__: false },
80
+ networkEntries: [
81
+ { url: 'https://x.com/api/a', status: 200, contentType: 'application/json', bodyPreview: '{}' },
82
+ ],
83
+ }));
84
+ expect(v.pattern).toBe('B');
85
+ });
86
+ it('returns D when auth failures dominate', () => {
87
+ const v = classifyPattern(mkSignals({
88
+ networkEntries: [
89
+ { url: 'https://x.com/api/a', status: 401, contentType: 'application/json', bodyPreview: '' },
90
+ { url: 'https://x.com/api/b', status: 403, contentType: 'application/json', bodyPreview: '' },
91
+ ],
92
+ }));
93
+ expect(v.pattern).toBe('D');
94
+ expect(v.auth_failures).toBe(2);
95
+ });
96
+ it('returns C by default for static pages', () => {
97
+ const v = classifyPattern(mkSignals());
98
+ expect(v.pattern).toBe('C');
99
+ });
100
+ });
101
+ describe('findNearestAdapter', () => {
102
+ it('matches by domain suffix', () => {
103
+ const reg = new Map([
104
+ ['51job search', mkCmd('51job', 'search', '51job.com')],
105
+ ['51job detail', mkCmd('51job', 'detail', '51job.com')],
106
+ ['xueqiu search', mkCmd('xueqiu', 'search', 'xueqiu.com')],
107
+ ]);
108
+ const v = findNearestAdapter('https://jobs.51job.com/', reg);
109
+ expect(v?.site).toBe('51job');
110
+ expect(v?.example_commands).toContain('51job search');
111
+ });
112
+ it('falls back to site-name containment when no domain is registered', () => {
113
+ const reg = new Map([
114
+ ['51job search', mkCmd('51job', 'search')],
115
+ ]);
116
+ const v = findNearestAdapter('https://we.51job.com/', reg);
117
+ expect(v?.site).toBe('51job');
118
+ });
119
+ it('returns null when no adapter matches', () => {
120
+ const reg = new Map([
121
+ ['xueqiu search', mkCmd('xueqiu', 'search', 'xueqiu.com')],
122
+ ]);
123
+ const v = findNearestAdapter('https://random-site.io/', reg);
124
+ expect(v).toBeNull();
125
+ });
126
+ it('prefers the site with the most commands', () => {
127
+ const reg = new Map([
128
+ ['a search', mkCmd('a', 'search', 'a.com')],
129
+ ['b search', mkCmd('b', 'search', 'a.com')],
130
+ ['b detail', mkCmd('b', 'detail', 'a.com')],
131
+ ['b company', mkCmd('b', 'company', 'a.com')],
132
+ ]);
133
+ const v = findNearestAdapter('https://jobs.a.com/', reg);
134
+ expect(v?.site).toBe('b');
135
+ });
136
+ });
137
+ describe('analyzeSite', () => {
138
+ it('recommends browser-context fetch when WAF is detected', () => {
139
+ const report = analyzeSite(mkSignals({ cookieNames: ['acw_sc__v2'] }), new Map());
140
+ expect(report.anti_bot.vendor).toBe('aliyun_waf');
141
+ expect(report.recommended_next_step).toMatch(/browser context/i);
142
+ });
143
+ it('recommends reading SSR state when Pattern B fires', () => {
144
+ const report = analyzeSite(mkSignals({
145
+ initialState: { __INITIAL_STATE__: false, __NUXT__: true, __NEXT_DATA__: false, __APOLLO_STATE__: false },
146
+ }), new Map());
147
+ expect(report.pattern.pattern).toBe('B');
148
+ expect(report.recommended_next_step).toMatch(/__NUXT__|__INITIAL_STATE__|__NEXT_DATA__/);
149
+ });
150
+ it('includes __APOLLO_STATE__ in Pattern B next-step guidance', () => {
151
+ const report = analyzeSite(mkSignals({
152
+ initialState: { __INITIAL_STATE__: false, __NUXT__: false, __NEXT_DATA__: false, __APOLLO_STATE__: true },
153
+ }), new Map());
154
+ expect(report.pattern.pattern).toBe('B');
155
+ expect(report.recommended_next_step).toMatch(/__APOLLO_STATE__/);
156
+ });
157
+ it('includes nearest_adapter when the registry has a match', () => {
158
+ const reg = new Map([
159
+ ['51job search', mkCmd('51job', 'search', '51job.com')],
160
+ ]);
161
+ const report = analyzeSite(mkSignals({ finalUrl: 'https://we.51job.com/' }), reg);
162
+ expect(report.nearest_adapter?.site).toBe('51job');
163
+ });
164
+ });
@@ -0,0 +1,57 @@
1
+ /**
2
+ * Article extraction via Readability — generic `page → article HTML` pipeline.
3
+ *
4
+ * Complements `src/browser/extract.ts`: that one takes a caller-supplied
5
+ * selector. This one works with zero configuration on arbitrary article pages
6
+ * (blogs, news, docs) by running `@mozilla/readability` inside the page
7
+ * context via CDP evaluate.
8
+ *
9
+ * Pipeline:
10
+ * 1. Short-circuit non-HTML documents (`text/plain`, JSON, XML) — a page
11
+ * renderer wrapping a plain-text file would pollute the DOM pipeline.
12
+ * 2. Short-circuit the "body is a single <pre>" case, which browsers use
13
+ * when loading *.txt / *.md over file:// or raw.githubusercontent.com.
14
+ * 3. Deep-clone the document, apply caller-supplied `cleanSelectors` to the
15
+ * clone (preserves live page state for subsequent snapshot/click).
16
+ * 4. Inject Readability + isProbablyReaderable sources into the page,
17
+ * parse on the clone. `isProbablyReaderable` gates the parse unless
18
+ * `force: true`.
19
+ * 5. On Readability miss, walk a fallback selector chain
20
+ * (main → [role="main"] → #main-content → … → body) and return the
21
+ * first root with at least 80 characters of text.
22
+ *
23
+ * Readability runs in the page's own window because it needs real DOM APIs
24
+ * (getComputedStyle, treeWalker). Running it Node-side would require jsdom —
25
+ * a heavy dep the rest of OpenCLI doesn't need.
26
+ */
27
+ export interface ExtractArticleOptions {
28
+ /** CSS selectors removed from the cloned document before Readability runs. */
29
+ cleanSelectors?: string[];
30
+ /** Fallback chain when Readability fails. Defaults to `DEFAULT_FALLBACK_SELECTORS`. */
31
+ fallbackSelectors?: string[];
32
+ /** Bypass `isProbablyReaderable` and always attempt a parse. */
33
+ force?: boolean;
34
+ }
35
+ export type ExtractSource = 'readability' | 'fallback' | 'raw-text' | 'pre';
36
+ export interface ExtractedArticle {
37
+ html: string;
38
+ title: string;
39
+ byline?: string;
40
+ publishedTime?: string;
41
+ siteName?: string;
42
+ source: ExtractSource;
43
+ }
44
+ export declare const DEFAULT_FALLBACK_SELECTORS: string[];
45
+ /**
46
+ * Build the JS expression evaluated in-page to extract the article. Exported
47
+ * for testability — callers on the host side should use `extractArticle`.
48
+ */
49
+ export declare function buildExtractArticleJs(options?: ExtractArticleOptions): string;
50
+ export interface PageLike {
51
+ evaluate(js: string): Promise<unknown>;
52
+ }
53
+ /**
54
+ * Run the extract pipeline on the given page. Returns `null` when no usable
55
+ * content is found (Readability miss + empty fallback chain).
56
+ */
57
+ export declare function extractArticle(page: PageLike, options?: ExtractArticleOptions): Promise<ExtractedArticle | null>;
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,105 @@
1
+ import { afterEach, describe, expect, it } from 'vitest';
2
+ import { JSDOM } from 'jsdom';
3
+ import * as fs from 'node:fs';
4
+ import * as os from 'node:os';
5
+ import * as path from 'node:path';
6
+ import { fileURLToPath } from 'node:url';
7
+ import { buildExtractArticleJs } from './article-extract.js';
8
+ import { downloadArticle } from '../download/article-download.js';
9
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
10
+ const fixturesDir = path.join(__dirname, '__fixtures__', 'article-extract');
11
+ const tempDirs = [];
12
+ afterEach(() => {
13
+ for (const dir of tempDirs)
14
+ fs.rmSync(dir, { recursive: true, force: true });
15
+ tempDirs.length = 0;
16
+ });
17
/**
 * Read a fixture file from the article-extract fixtures directory as UTF-8 text.
 *
 * @param {string} name - File name relative to the fixtures directory.
 * @returns {string} The fixture contents.
 */
function loadFixture(name) {
    const fixturePath = path.join(fixturesDir, name);
    return fs.readFileSync(fixturePath, 'utf8');
}
20
/**
 * Escape the three characters that are significant in HTML text content
 * (`&`, `<`, `>`), so fixture text can be embedded inside a `<pre>` element.
 *
 * @param {string} text - Raw text to escape.
 * @returns {string} Text with `&`, `<` and `>` replaced by entities.
 */
function escapeHtml(text) {
    const entities = new Map([
        ['&', '&amp;'],
        ['<', '&lt;'],
        ['>', '&gt;'],
    ]);
    return text.replace(/[&<>]/g, (match) => entities.get(match));
}
23
/**
 * Evaluate the generated extraction script inside a jsdom window.
 *
 * The DOM is always parsed as `text/html`; when `contentType` is given, the
 * document's `contentType` property is overridden afterwards so the script's
 * non-HTML short-circuit can be exercised.
 *
 * @param {string} html - Markup to load into the window.
 * @param {string} url - URL the document reports.
 * @param {object} [options] - Options forwarded to `buildExtractArticleJs`.
 * @param {string} [contentType] - Value to report as `document.contentType`.
 * @returns {*} Whatever the in-page script evaluates to.
 */
function runExtract(html, url, options = {}, contentType) {
    const { window } = new JSDOM(html, {
        url,
        contentType: 'text/html',
        pretendToBeVisual: true,
        runScripts: 'outside-only',
    });
    if (contentType) {
        const descriptor = { value: contentType, configurable: true };
        Object.defineProperty(window.document, 'contentType', descriptor);
    }
    const script = buildExtractArticleJs(options);
    return window.eval(script);
}
38
/**
 * Render an extracted article to Markdown through `downloadArticle` and return
 * the saved file's contents.
 *
 * A fresh temp directory is created per call and recorded in `tempDirs` so the
 * `afterEach` hook can remove it. Asserts that the first download result
 * reports `success` before reading the file.
 *
 * @param {object} article - Extracted article; `title` and `html` are read.
 * @param {string} url - Source URL passed along as `sourceUrl`.
 * @param {object} [options] - Only `cleanSelectors` is forwarded.
 * @returns {Promise<string>} The generated Markdown.
 */
async function renderMarkdown(article, url, options = {}) {
    const prefix = path.join(os.tmpdir(), 'opencli-article-e2e-');
    const tempDir = await fs.promises.mkdtemp(prefix);
    tempDirs.push(tempDir);
    const articleInput = {
        title: article.title || 'untitled',
        contentHtml: article.html,
        sourceUrl: url,
    };
    const downloadOptions = {
        output: tempDir,
        downloadImages: false,
        cleanSelectors: options.cleanSelectors,
    };
    const result = await downloadArticle(articleInput, downloadOptions);
    const first = result[0];
    expect(first.status).toBe('success');
    return fs.readFileSync(first.saved, 'utf8');
}
53
+ describe('article extract → markdown e2e fixtures', () => {
54
+ it('extracts a Wikipedia article fixture and keeps infobox/reference noise out of markdown', async () => {
55
+ const url = 'https://en.wikipedia.org/wiki/Markdown';
56
+ const cleanSelectors = ['.infobox', '.navbox', '.reference', '.mw-editsection', '.metadata'];
57
+ const article = runExtract(loadFixture('wikipedia-markdown.html'), url, { cleanSelectors });
58
+ expect(article?.source).toBe('readability');
59
+ expect(article?.title).toBe('Markdown');
60
+ if (!article)
61
+ throw new Error('expected extracted article');
62
+ const md = await renderMarkdown(article, url, { cleanSelectors });
63
+ expect(md).toContain('lightweight markup language');
64
+ expect(md).toContain('John Gruber');
65
+ expect(md).not.toContain('Syntax description');
66
+ expect(md).not.toContain('Standard file extension');
67
+ });
68
+ it('extracts a Deno blog fixture, preserves embedded iframes as markdown links, and drops page chrome', async () => {
69
+ const url = 'https://deno.com/blog/v2.0';
70
+ const article = runExtract(loadFixture('deno-v2.html'), url);
71
+ expect(article?.source).toBe('readability');
72
+ expect(article?.title).toBe('Announcing Deno 2 | Deno');
73
+ if (!article)
74
+ throw new Error('expected extracted article');
75
+ const md = await renderMarkdown(article, url);
76
+ expect(md).toContain('## Announcing Deno 2');
77
+ expect(md).toContain('The web is humanity’s largest software platform');
78
+ expect(md).toMatch(/\]\(https:\/\/www\.youtube(?:-nocookie)?\.com\/embed\/[^)]+\)/);
79
+ expect(md).not.toContain('Skip to main content');
80
+ });
81
+ it('short-circuits non-HTML raw text pages end-to-end', async () => {
82
+ const url = 'https://raw.githubusercontent.com/openai/openai-cookbook/main/README.md';
83
+ const text = loadFixture('openai-cookbook-readme.txt');
84
+ const html = `<html><head><title>OpenAI Cookbook README</title></head><body><pre>${escapeHtml(text)}</pre></body></html>`;
85
+ const article = runExtract(html, url, {}, 'text/plain');
86
+ expect(article?.source).toBe('raw-text');
87
+ if (!article)
88
+ throw new Error('expected extracted article');
89
+ const md = await renderMarkdown(article, url);
90
+ expect(md).toContain('OPENAI\\_API\\_KEY');
91
+ expect(md).toContain('Example code and guides for accomplishing common tasks');
92
+ });
93
+ it('short-circuits a single-pre document end-to-end', async () => {
94
+ const url = 'https://raw.githubusercontent.com/openai/openai-cookbook/main/README.md';
95
+ const text = loadFixture('openai-cookbook-readme.txt');
96
+ const html = `<html><head><title>OpenAI Cookbook README</title></head><body><pre>${escapeHtml(text)}</pre></body></html>`;
97
+ const article = runExtract(html, url);
98
+ expect(article?.source).toBe('pre');
99
+ if (!article)
100
+ throw new Error('expected extracted article');
101
+ const md = await renderMarkdown(article, url);
102
+ expect(md).toContain('OPENAI\\_API\\_KEY');
103
+ expect(md).toContain('Most code examples are written in Python');
104
+ });
105
+ });
@@ -0,0 +1,169 @@
1
+ /**
2
+ * Article extraction via Readability — generic `page → article HTML` pipeline.
3
+ *
4
+ * Complements `src/browser/extract.ts`: that one takes a caller-supplied
5
+ * selector. This one works with zero configuration on arbitrary article pages
6
+ * (blogs, news, docs) by running `@mozilla/readability` inside the page
7
+ * context via CDP evaluate.
8
+ *
9
+ * Pipeline:
10
+ * 1. Short-circuit non-HTML documents (`text/plain`, JSON, XML) — a page
11
+ * renderer wrapping a plain-text file would pollute the DOM pipeline.
12
+ * 2. Short-circuit the "body is a single <pre>" case, which browsers use
13
+ * when loading *.txt / *.md over file:// or raw.githubusercontent.com.
14
+ * 3. Deep-clone the document, apply caller-supplied `cleanSelectors` to the
15
+ * clone (preserves live page state for subsequent snapshot/click).
16
+ * 4. Inject Readability + isProbablyReaderable sources into the page,
17
+ * parse on the clone. `isProbablyReaderable` gates the parse unless
18
+ * `force: true`.
19
+ * 5. On Readability miss, walk a fallback selector chain
20
+ * (main → [role="main"] → #main-content → … → body) and return the
21
+ * first root with at least 80 characters of text.
22
+ *
23
+ * Readability runs in the page's own window because it needs real DOM APIs
24
+ * (getComputedStyle, treeWalker). Running it Node-side would require jsdom —
25
+ * a heavy dep the rest of OpenCLI doesn't need.
26
+ */
27
+ import * as fs from 'node:fs';
28
+ import { createRequire } from 'node:module';
29
+ const requireFromHere = createRequire(import.meta.url);
30
+ let cachedSources = null;
31
/**
 * Load the source text of the two `@mozilla/readability` scripts, reading them
 * from disk on the first call and serving the module-level cache afterwards.
 *
 * @returns {{ readability: string, readerable: string }} Library sources.
 */
function readabilitySources() {
    if (!cachedSources) {
        // Resolve both module paths before touching the filesystem.
        const [readabilityPath, readerablePath] = [
            '@mozilla/readability/Readability.js',
            '@mozilla/readability/Readability-readerable.js',
        ].map((specifier) => requireFromHere.resolve(specifier));
        cachedSources = {
            readability: fs.readFileSync(readabilityPath, 'utf8'),
            readerable: fs.readFileSync(readerablePath, 'utf8'),
        };
    }
    return cachedSources;
}
42
+ export const DEFAULT_FALLBACK_SELECTORS = [
43
+ 'main',
44
+ '[role="main"]',
45
+ '#main-content',
46
+ '#main',
47
+ '#content',
48
+ '.content',
49
+ 'article',
50
+ 'body',
51
+ ];
52
+ const MIN_FALLBACK_TEXT_LENGTH = 80;
53
/**
 * Build the JS expression evaluated in-page to extract the article. Exported
 * for testability — callers on the host side should use `extractArticle`.
 *
 * The returned source is a self-invoking arrow function. When evaluated in a
 * document it yields `{ source, html, title, ... }` or `null`:
 *   - `raw-text`    — the document's content type is not HTML/XHTML,
 *   - `pre`         — the body's only element child is a `<pre>`,
 *   - `readability` — Readability produced content,
 *   - `fallback`    — the first fallback selector with enough text matched.
 *
 * @param {object} [options] - Extraction options; `null` is treated as `{}`.
 * @param {string[]} [options.cleanSelectors] - Selectors removed from the
 *   cloned document before parsing. Invalid selectors are ignored in-page.
 * @param {string[]} [options.fallbackSelectors] - Fallback chain; defaults to
 *   `DEFAULT_FALLBACK_SELECTORS`.
 * @param {boolean} [options.force] - Skip the `isProbablyReaderable` gate.
 * @returns {string} JavaScript source of the extraction expression.
 */
export function buildExtractArticleJs(options = {}) {
    const { readability, readerable } = readabilitySources();
    // The default parameter only covers `undefined`; tolerate `null` as well.
    const opts = options ?? {};
    const cleanSelectors = opts.cleanSelectors ?? [];
    const fallbackSelectors = opts.fallbackSelectors ?? DEFAULT_FALLBACK_SELECTORS;
    const force = !!opts.force;
    // Library sources contain backticks and ${...} fragments, so we embed them
    // as JSON-encoded string literals and eval them inside a Function() scope.
    // This isolates their var declarations from the outer IIFE without polluting
    // window globals.
    const readabilityLit = JSON.stringify(readability);
    const readerableLit = JSON.stringify(readerable);
    const cleanLit = JSON.stringify(cleanSelectors);
    const fallbackLit = JSON.stringify(fallbackSelectors);
    const forceLit = JSON.stringify(force);
    return [
        '(() => {',
        '  const cleanSelectors = ' + cleanLit + ';',
        '  const fallbackSelectors = ' + fallbackLit + ';',
        '  const force = ' + forceLit + ';',
        '  const minFallbackText = ' + MIN_FALLBACK_TEXT_LENGTH + ';',
        '  const readabilitySrc = ' + readabilityLit + ';',
        '  const readerableSrc = ' + readerableLit + ';',
        '',
        '  function escapeHtml(s) {',
        '    return String(s).replace(/[&<>]/g, c => ({ "&": "&amp;", "<": "&lt;", ">": "&gt;" }[c]));',
        '  }',
        '',
        '  // Short-circuit 1: non-HTML document',
        '  const ct = document.contentType || "";',
        '  if (ct && ct !== "text/html" && ct !== "application/xhtml+xml") {',
        '    // XML documents have no <body>; fall back to the root element text.',
        '    const root = document.body || document.documentElement;',
        '    const body = root ? (root.textContent || "") : "";',
        '    return { source: "raw-text", html: "<pre>" + escapeHtml(body) + "</pre>", title: document.title || "" };',
        '  }',
        '',
        '  // Short-circuit 2: body is a single <pre>',
        '  if (document.body) {',
        '    const kids = document.body.children;',
        '    if (kids.length === 1 && kids[0] && kids[0].tagName === "PRE") {',
        '      return { source: "pre", html: document.body.outerHTML, title: document.title || "" };',
        '    }',
        '  }',
        '',
        '  // Deep-clone + adapter-supplied dirty-node removal',
        '  const cloneDoc = document.cloneNode(true);',
        '  for (const sel of cleanSelectors) {',
        '    try { for (const n of cloneDoc.querySelectorAll(sel)) n.remove(); }',
        '    catch (e) { /* ignore invalid selector */ }',
        '  }',
        '',
        '  // Inject Readability into an isolated Function scope and extract the',
        '  // constructors we need. Library sources use their own module.exports',
        '  // guard (if typeof module === "object"), which is falsy here.',
        '  const libs = (new Function(',
        '    readabilitySrc + "\\n" + readerableSrc + "\\nreturn {" +',
        '    " Readability: typeof Readability !== \\"undefined\\" ? Readability : null," +',
        '    " isProbablyReaderable: typeof isProbablyReaderable !== \\"undefined\\" ? isProbablyReaderable : null" +',
        '    " };"',
        '  ))();',
        '  const Readability = libs.Readability;',
        '  const isProbablyReaderable = libs.isProbablyReaderable;',
        '',
        '  const readerableOk = force || (typeof isProbablyReaderable === "function" ? isProbablyReaderable(cloneDoc) : true);',
        '  let article = null;',
        '  if (readerableOk && typeof Readability === "function") {',
        '    try { article = new Readability(cloneDoc).parse(); } catch (e) { article = null; }',
        '  }',
        '  if (article && article.content) {',
        '    return {',
        '      source: "readability",',
        '      html: article.content,',
        '      title: article.title || document.title || "",',
        '      byline: article.byline || undefined,',
        '      publishedTime: article.publishedTime || undefined,',
        '      siteName: article.siteName || undefined,',
        '    };',
        '  }',
        '',
        '  // Fallback chain',
        '  for (const sel of fallbackSelectors) {',
        '    let el = null;',
        '    try { el = cloneDoc.querySelector(sel); } catch (e) { continue; }',
        '    if (!el) continue;',
        '    const text = (el.textContent || "").trim();',
        '    if (text.length < minFallbackText) continue;',
        '    return { source: "fallback", html: el.outerHTML, title: document.title || "" };',
        '  }',
        '',
        '  return null;',
        '})()',
    ].join('\n');
}
148
/**
 * Run the extract pipeline on the given page. Returns `null` when no usable
 * content is found (Readability miss + empty fallback chain) or when the page
 * returns a payload without string `html` and `source` fields.
 *
 * Optional metadata (`byline`, `publishedTime`, `siteName`) is copied only
 * when it is a non-empty string, so the result always matches the declared
 * `ExtractedArticle` shape even if the page hands back unexpected values.
 *
 * @param {{ evaluate(js: string): Promise<unknown> }} page - Page to evaluate in.
 * @param {object} [options] - Forwarded to `buildExtractArticleJs`.
 * @returns {Promise<object|null>} Normalized article, or `null`.
 */
export async function extractArticle(page, options = {}) {
    const js = buildExtractArticleJs(options);
    const raw = await page.evaluate(js);
    if (raw === null || typeof raw !== 'object')
        return null;
    if (typeof raw.html !== 'string' || typeof raw.source !== 'string')
        return null;
    const article = {
        html: raw.html,
        title: typeof raw.title === 'string' ? raw.title : '',
    };
    for (const key of ['byline', 'publishedTime', 'siteName']) {
        const value = raw[key];
        // Same guard as `title`: never leak non-string metadata to callers.
        if (typeof value === 'string' && value !== '')
            article[key] = value;
    }
    article.source = raw.source;
    return article;
}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,94 @@
1
+ import { describe, expect, it } from 'vitest';
2
+ import { buildExtractArticleJs, extractArticle, DEFAULT_FALLBACK_SELECTORS, } from './article-extract.js';
3
/**
 * Create a stub page whose `evaluate` resolves to a canned response and
 * records the last script it was given on `lastJs`.
 *
 * The page object is captured in a closure rather than reached through `this`,
 * so `evaluate` keeps working when it is destructured or passed as a callback.
 *
 * @param {*} response - Value every `evaluate` call resolves to.
 * @returns {{ lastJs: string|null, evaluate(js: string): Promise<*> }} Stub page.
 */
function fakePage(response) {
    const page = {
        lastJs: null,
        async evaluate(js) {
            page.lastJs = js;
            return response;
        },
    };
    return page;
}
14
+ describe('buildExtractArticleJs', () => {
15
+ it('embeds Readability + Readerable sources once per evaluation', () => {
16
+ const js = buildExtractArticleJs();
17
+ // Both libs should be inlined (matched by identifying strings from the
18
+ // upstream @mozilla/readability sources).
19
+ expect(js).toContain('function Readability(doc, options)');
20
+ expect(js).toContain('function isProbablyReaderable');
21
+ });
22
+ it('serializes caller-supplied options into the evaluated JS', () => {
23
+ const js = buildExtractArticleJs({
24
+ cleanSelectors: ['.ads', '#banner'],
25
+ fallbackSelectors: ['article', 'body'],
26
+ force: true,
27
+ });
28
+ expect(js).toContain('[".ads","#banner"]');
29
+ expect(js).toContain('["article","body"]');
30
+ expect(js).toContain('const force = true;');
31
+ });
32
+ it('uses the default fallback chain when none is supplied', () => {
33
+ const js = buildExtractArticleJs();
34
+ for (const sel of DEFAULT_FALLBACK_SELECTORS) {
35
+ expect(js).toContain(JSON.stringify(sel));
36
+ }
37
+ });
38
+ it('runs fallback selection against the cleaned clone', () => {
39
+ const js = buildExtractArticleJs({ cleanSelectors: ['.noise'] });
40
+ expect(js).toContain('el = cloneDoc.querySelector(sel);');
41
+ expect(js).not.toContain('el = document.querySelector(sel);');
42
+ });
43
+ it('produces syntactically valid JavaScript', () => {
44
+ // Parsing via the Function constructor rejects any syntax error in the
45
+ // generated code — including accidental template-literal break-outs from
46
+ // the embedded Readability sources.
47
+ expect(() => new Function(buildExtractArticleJs())).not.toThrow();
48
+ expect(() => new Function(buildExtractArticleJs({ force: true }))).not.toThrow();
49
+ expect(() => new Function(buildExtractArticleJs({
50
+ cleanSelectors: ['.a', '.b'],
51
+ fallbackSelectors: ['main', 'body'],
52
+ }))).not.toThrow();
53
+ });
54
+ });
55
+ describe('extractArticle (host-side)', () => {
56
+ it('returns a normalized ExtractedArticle when the page responds with one', async () => {
57
+ const page = fakePage({
58
+ source: 'readability',
59
+ html: '<p>hello</p>',
60
+ title: 'Hello',
61
+ byline: 'Alice',
62
+ publishedTime: '2026-04-22',
63
+ siteName: 'Example',
64
+ });
65
+ const res = await extractArticle(page);
66
+ expect(res).toEqual({
67
+ source: 'readability',
68
+ html: '<p>hello</p>',
69
+ title: 'Hello',
70
+ byline: 'Alice',
71
+ publishedTime: '2026-04-22',
72
+ siteName: 'Example',
73
+ });
74
+ });
75
+ it('drops undefined optional fields cleanly', async () => {
76
+ const page = fakePage({ source: 'fallback', html: '<main>x</main>', title: 't' });
77
+ const res = await extractArticle(page);
78
+ expect(res).toEqual({ source: 'fallback', html: '<main>x</main>', title: 't' });
79
+ expect(res).not.toHaveProperty('byline');
80
+ expect(res).not.toHaveProperty('publishedTime');
81
+ });
82
+ it('returns null on a missing body or malformed payload', async () => {
83
+ expect(await extractArticle(fakePage(null))).toBeNull();
84
+ expect(await extractArticle(fakePage('oops'))).toBeNull();
85
+ expect(await extractArticle(fakePage({ source: 'readability' }))).toBeNull();
86
+ expect(await extractArticle(fakePage({ html: '<p>x</p>' }))).toBeNull();
87
+ });
88
+ it('defaults title to empty string when the page omits it', async () => {
89
+ const page = fakePage({ source: 'pre', html: '<body><pre>x</pre></body>' });
90
+ const res = await extractArticle(page);
91
+ expect(res?.title).toBe('');
92
+ expect(res?.source).toBe('pre');
93
+ });
94
+ });
@@ -91,7 +91,12 @@ export class CDPBridge {
91
91
  }
92
92
  }
93
93
  }
94
- catch { }
94
+ catch (err) {
95
+ if (process.env.OPENCLI_VERBOSE) {
96
+ // eslint-disable-next-line no-console
97
+ console.error('[cdp] Failed to parse WebSocket message:', err instanceof Error ? err.message : err);
98
+ }
99
+ }
95
100
  });
96
101
  });
97
102
  }
@@ -253,8 +258,12 @@ class CDPPage extends BasePage {
253
258
  this._networkEntries[idx].responseBodyFullSize = fullSize;
254
259
  this._networkEntries[idx].responseBodyTruncated = truncated;
255
260
  }
256
- }).catch(() => {
261
+ }).catch((err) => {
257
262
  // Body unavailable for some requests (e.g. uploads) — non-fatal
263
+ if (process.env.OPENCLI_VERBOSE) {
264
+ // eslint-disable-next-line no-console
265
+ console.error(`[cdp] getResponseBody failed for ${p.requestId}:`, err instanceof Error ? err.message : err);
266
+ }
258
267
  }).finally(() => {
259
268
  this._pendingBodyFetches.delete(bodyFetch);
260
269
  });