@jackwener/opencli 1.7.6 → 1.7.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. package/README.md +17 -8
  2. package/README.zh-CN.md +14 -8
  3. package/cli-manifest.json +469 -11
  4. package/clis/51job/company.js +125 -0
  5. package/clis/51job/detail.js +108 -0
  6. package/clis/51job/hot.js +55 -0
  7. package/clis/51job/search.js +79 -0
  8. package/clis/51job/utils.js +302 -0
  9. package/clis/51job/utils.test.js +69 -0
  10. package/clis/amazon/discussion.js +37 -6
  11. package/clis/amazon/discussion.test.js +147 -32
  12. package/clis/bilibili/video.js +11 -4
  13. package/clis/bilibili/video.test.js +51 -0
  14. package/clis/chatgpt/image.js +1 -1
  15. package/clis/chatgpt-app/ask.js +3 -19
  16. package/clis/chatgpt-app/ax.js +132 -1
  17. package/clis/chatgpt-app/ax.test.js +23 -0
  18. package/clis/chatgpt-app/send.js +2 -21
  19. package/clis/deepseek/ask.js +50 -18
  20. package/clis/deepseek/ask.test.js +195 -2
  21. package/clis/deepseek/utils.js +113 -29
  22. package/clis/deepseek/utils.test.js +109 -1
  23. package/clis/gemini/image.js +1 -1
  24. package/clis/instagram/download.js +1 -1
  25. package/clis/powerchina/search.js +250 -0
  26. package/clis/powerchina/search.test.js +67 -0
  27. package/clis/sinafinance/stock.js +5 -2
  28. package/clis/sinafinance/stock.test.js +59 -0
  29. package/clis/toutiao/articles.js +81 -0
  30. package/clis/toutiao/articles.test.js +23 -0
  31. package/clis/twitter/likes.js +3 -2
  32. package/clis/twitter/search.js +4 -2
  33. package/clis/twitter/search.test.js +4 -0
  34. package/clis/twitter/shared.js +28 -0
  35. package/clis/twitter/shared.test.js +96 -0
  36. package/clis/twitter/thread.js +3 -1
  37. package/clis/twitter/timeline.js +3 -2
  38. package/clis/twitter/tweets.js +3 -2
  39. package/clis/twitter/tweets.test.js +1 -1
  40. package/clis/web/read.js +25 -5
  41. package/clis/web/read.test.js +76 -0
  42. package/clis/weixin/create-draft.js +225 -0
  43. package/clis/weixin/drafts.js +65 -0
  44. package/clis/weixin/drafts.test.js +65 -0
  45. package/clis/weread/ai-outline.js +170 -0
  46. package/clis/weread/ai-outline.test.js +83 -0
  47. package/clis/weread/book.js +57 -44
  48. package/clis/weread/commands.test.js +24 -0
  49. package/clis/xiaoyuzhou/podcast-episodes.js +2 -2
  50. package/clis/xiaoyuzhou/podcast-episodes.test.js +78 -0
  51. package/dist/src/browser/analyze.d.ts +103 -0
  52. package/dist/src/browser/analyze.js +230 -0
  53. package/dist/src/browser/analyze.test.d.ts +1 -0
  54. package/dist/src/browser/analyze.test.js +164 -0
  55. package/dist/src/browser/article-extract.d.ts +57 -0
  56. package/dist/src/browser/article-extract.e2e.test.d.ts +1 -0
  57. package/dist/src/browser/article-extract.e2e.test.js +105 -0
  58. package/dist/src/browser/article-extract.js +169 -0
  59. package/dist/src/browser/article-extract.test.d.ts +1 -0
  60. package/dist/src/browser/article-extract.test.js +94 -0
  61. package/dist/src/browser/cdp.js +11 -2
  62. package/dist/src/browser/verify-fixture.d.ts +59 -0
  63. package/dist/src/browser/verify-fixture.js +213 -0
  64. package/dist/src/browser/verify-fixture.test.d.ts +1 -0
  65. package/dist/src/browser/verify-fixture.test.js +161 -0
  66. package/dist/src/cli.d.ts +32 -0
  67. package/dist/src/cli.js +333 -43
  68. package/dist/src/cli.test.js +257 -1
  69. package/dist/src/commanderAdapter.js +12 -0
  70. package/dist/src/commanderAdapter.test.js +11 -0
  71. package/dist/src/daemon.d.ts +3 -2
  72. package/dist/src/daemon.js +16 -4
  73. package/dist/src/daemon.test.d.ts +1 -0
  74. package/dist/src/daemon.test.js +19 -0
  75. package/dist/src/download/article-download.d.ts +12 -0
  76. package/dist/src/download/article-download.js +141 -17
  77. package/dist/src/download/article-download.test.js +196 -0
  78. package/dist/src/download/index.js +73 -86
  79. package/dist/src/errors.js +4 -2
  80. package/dist/src/errors.test.js +13 -0
  81. package/dist/src/launcher.d.ts +1 -1
  82. package/dist/src/launcher.js +3 -3
  83. package/dist/src/output.js +1 -1
  84. package/dist/src/output.test.js +6 -0
  85. package/package.json +5 -1
@@ -0,0 +1,230 @@
1
+ /**
2
+ * `browser analyze <url>` — turn site-recon guesswork into deterministic CLI output.
3
+ *
4
+ * When an agent starts a new adapter, the first question is "which pattern am
5
+ * I looking at?" (A/B/C/D/E from site-recon docs) and "will Node-side fetch
6
+ * work, or will anti-bot middleware block me?". Today the agent has to open
7
+ * the page, poke `network`, try cURL, fail, guess again. This module condenses
8
+ * that into one call that returns a classification + evidence.
9
+ *
10
+ * Kept pure (no page imports) so the bulk is unit-testable; the CLI wrapper
11
+ * drives a real page, feeds the resulting signals here, and prints the verdict.
12
+ */
13
/**
 * WAF vendors we can reliably detect from cookies + response body markers
 * alone. The table is scanned top to bottom and the first vendor with any hit
 * becomes the reported `vendor`, so higher-signal vendors must be listed
 * first. Evidence from every matching vendor is kept.
 */
const WAF_SIGNATURES = [
    {
        vendor: 'aliyun_waf',
        cookiePatterns: [/^acw_sc__v2$/, /^acw_tc$/, /^ssxmod_itna/],
        bodyPatterns: [/arg1\s*=\s*['"][0-9A-F]{30,}/, /\/ntc_captcha\//i],
        implication: 'Direct Node-side fetch/curl will return the slider HTML. Validate the endpoint in browser context first; HTML COOKIE adapters still finish with Node-side fetch + page.getCookies.',
    },
    {
        vendor: 'cloudflare',
        cookiePatterns: [/^__cf_bm$/, /^cf_clearance$/, /^__cfduid$/],
        bodyPatterns: [/Cloudflare Ray ID/i, /Checking your browser before accessing/i, /cf-chl-/i],
        implication: 'Cloudflare bot check. Start from a real browser session; probe in browser context first. HTML COOKIE adapters still finish with Node-side fetch + page.getCookies.',
    },
    {
        vendor: 'akamai',
        cookiePatterns: [/^_abck$/, /^bm_sz$/, /^bm_sv$/],
        bodyPatterns: [/akamai/i],
        implication: 'Akamai Bot Manager. Probe in browser context first; keep final HTML COOKIE adapters on Node-side fetch + page.getCookies.',
    },
    {
        vendor: 'geetest',
        cookiePatterns: [],
        bodyPatterns: [/geetest/i, /gt_captcha/i],
        implication: 'Geetest slider/puzzle captcha. Agent cannot bypass programmatically — requires UI strategy or human-in-loop.',
    },
];
/**
 * Scan collected page signals for known anti-bot / WAF fingerprints.
 *
 * Each cookie pattern contributes at most one `cookie:<name>` hit (the first
 * cookie name it matches) and each body pattern at most one `body:<url>` hit
 * (the first network entry whose `bodyPreview` matches).
 *
 * @param {{ cookieNames: string[], networkEntries: Array<{ url: string, bodyPreview?: string }> }} signals
 *   Signals gathered from the page.
 * @returns {{ detected: boolean, vendor: string | null, evidence: string[], implication: string }}
 *   `vendor`/`implication` come from the first matching signature in table
 *   order; `evidence` lists the hits of every matching signature, in table
 *   order.
 */
export function detectAntiBot(signals) {
    const evidence = [];
    let match = null;
    for (const sig of WAF_SIGNATURES) {
        const hits = [];
        for (const pat of sig.cookiePatterns) {
            const hit = signals.cookieNames.find((c) => pat.test(c));
            if (hit)
                hits.push(`cookie:${hit}`);
        }
        for (const pat of sig.bodyPatterns) {
            const entry = signals.networkEntries.find((e) => e.bodyPreview && pat.test(e.bodyPreview));
            if (entry)
                hits.push(`body:${entry.url}`);
        }
        if (hits.length === 0)
            continue;
        // The first matching signature decides the verdict; later matches only
        // add evidence, so co-deployed vendors are not silently dropped.
        if (!match)
            match = sig;
        evidence.push(...hits);
    }
    if (!match) {
        return {
            detected: false,
            vendor: null,
            evidence: [],
            implication: 'No known anti-bot signatures. Try Node-side COOKIE fetch first; if endpoint validation is blocked, retry from browser context.',
        };
    }
    return {
        detected: true,
        vendor: match.vendor,
        evidence,
        implication: match.implication,
    };
}
83
/**
 * Mechanical rendering of the `site-recon.md` decision tree.
 *
 * Precedence is D > B > A > C:
 *   - D when at least two 401/403 responses and at least one JSON response
 *     were observed;
 *   - B when any of the four known SSR state globals is flagged;
 *   - A when at least one JSON response was observed;
 *   - C otherwise (E is never produced here — it cannot be detected without
 *     watching WebSocket frames, so the caller upgrades manually).
 *
 * @param {{ networkEntries: Array<{ status: number, contentType: string }>, initialState: Record<string, unknown> }} signals
 * @returns {{ pattern: string, reason: string, json_responses: number, auth_failures: number }}
 */
export function classifyPattern(signals) {
    let jsonCount = 0;
    let authFailCount = 0;
    for (const entry of signals.networkEntries) {
        if (/json/i.test(entry.contentType))
            jsonCount += 1;
        if (entry.status === 401 || entry.status === 403)
            authFailCount += 1;
    }
    const ssrState = signals.initialState;
    const ssrSeen = ['__INITIAL_STATE__', '__NUXT__', '__NEXT_DATA__', '__APOLLO_STATE__']
        .some((key) => ssrState[key]);
    // Every verdict carries the same two counters; only pattern/reason vary.
    const verdict = (pattern, reason) => ({
        pattern,
        reason,
        json_responses: jsonCount,
        auth_failures: authFailCount,
    });
    if (authFailCount >= 2 && jsonCount >= 1) {
        return verdict('D', `${authFailCount} auth-failing API responses seen — endpoint is token-gated`);
    }
    if (ssrSeen) {
        const present = Object.keys(ssrState).filter((key) => ssrState[key]);
        return verdict('B', `SSR state global present: ${present.join(', ')}`);
    }
    if (jsonCount >= 1) {
        return verdict('A', `${jsonCount} JSON XHR/fetch responses observed — classic API pattern`);
    }
    return verdict('C', 'No JSON XHR and no SSR state — HTML scrape (Pattern C); escalate to E manually if WebSocket traffic appears');
}
136
/**
 * Find existing adapters that target the same site.
 *
 * Keep the hostname match simple — agents extend naming conventions
 * differently per site, so we match on the registered `domain` field and fall
 * back to site-name containment. Returning `null` is fine; agents can always
 * read site-memory docs.
 *
 * Matching rules, any of which is sufficient:
 *   1. the host equals the adapter domain or is a subdomain of it;
 *   2. the adapter domain equals the host's last-two-label apex or is a
 *      subdomain of it;
 *   3. the adapter site name equals the host's second-to-last label;
 *   4. the host contains the adapter site name.
 * Rules 1-2 compare on label boundaries, so `51job.com` does not match an
 * adapter registered for `job.com`.
 *
 * NOTE(review): the apex is taken as the last two labels, so hosts under
 * multi-label public suffixes (e.g. `*.co.uk`) share an apex; fixing that
 * would need a public-suffix list.
 *
 * @param {string} finalUrl Absolute URL of the page under analysis.
 * @param {Map<string, { site: string, name: string, domain?: string }>} registry
 *   Registered commands; only `.values()` is consumed.
 * @returns {{ site: string, example_commands: string[], reason: string } | null}
 *   The matching site with the most commands (first seen wins ties), or `null`
 *   when the URL cannot be parsed, has no hostname, or nothing matches.
 */
export function findNearestAdapter(finalUrl, registry) {
    let host;
    try {
        host = new URL(finalUrl).hostname.toLowerCase();
    }
    catch {
        return null;
    }
    // Strip leading www.; 'www' as a site identifier is never what an adapter uses.
    const cleanedHost = host.replace(/^www\./, '');
    // Host-less URLs (file:, about:, data:) cannot belong to any adapter. Without
    // this guard the empty apex would suffix-match every registered domain.
    if (cleanedHost === '')
        return null;
    // Extract apex (xx.com) and registrable parts for fuzzy match.
    const parts = cleanedHost.split('.');
    const apex = parts.slice(-2).join('.');
    const siteKey = parts.length > 1 ? parts[parts.length - 2] : cleanedHost;
    const hits = new Map();
    for (const cmd of registry.values()) {
        const site = typeof cmd.site === 'string' ? cmd.site.toLowerCase() : '';
        // An empty site name would make the containment test match every host.
        if (site === '')
            continue;
        const domain = normalizeAdapterDomain(cmd.domain);
        const domainMatches = domain !== '' &&
            (isSameOrSubdomain(cleanedHost, domain) || isSameOrSubdomain(domain, apex));
        if (domainMatches || site === siteKey || cleanedHost.includes(site)) {
            const list = hits.get(cmd.site) ?? [];
            list.push(cmd);
            hits.set(cmd.site, list);
        }
    }
    if (hits.size === 0)
        return null;
    // Pick the site with the most commands — likely the most-developed adapter,
    // and the best reference for a new command on the same host.
    let best = null;
    for (const entry of hits) {
        if (!best || entry[1].length > best[1].length)
            best = entry;
    }
    const [bestSite, commands] = best;
    return {
        site: bestSite,
        example_commands: commands.slice(0, 5).map((c) => `${c.site} ${c.name}`),
        reason: `${commands.length} existing adapter${commands.length === 1 ? '' : 's'} target this site — reuse strategy/cookie config`,
    };
}
/** Reduce a registered `domain` value to a bare lowercase hostname ('' when unusable). */
function normalizeAdapterDomain(raw) {
    if (typeof raw !== 'string')
        return '';
    return raw
        .trim()
        .toLowerCase()
        .replace(/^[a-z][a-z0-9+.-]*:\/\//, '') // tolerate a scheme prefix
        .replace(/[/:?#].*$/, '') // drop port, path, query, fragment
        .replace(/^\.+/, '') // cookie-style leading dot
        .replace(/^www\./, '');
}
/** True when `host` is `parent` itself or ends with `.parent` (label-boundary suffix match). */
function isSameOrSubdomain(host, parent) {
    return host === parent || host.endsWith(`.${parent}`);
}
187
/**
 * Combine the pattern classification, anti-bot detection and nearest-adapter
 * lookup into a single report.
 *
 * `recommended_next_step` is one imperative sentence so agents can act on it
 * directly. A detected anti-bot vendor takes priority: its implication text
 * is used regardless of the classified pattern.
 *
 * @param signals Collected page signals (URLs, title, cookies, network entries, SSR flags).
 * @param registry Command registry passed through to `findNearestAdapter`.
 * @returns Report object with snake_case keys.
 */
export function analyzeSite(signals, registry) {
    const pattern = classifyPattern(signals);
    const antiBot = detectAntiBot(signals);
    const nearest = findNearestAdapter(signals.finalUrl, registry);
    const next = antiBot.detected
        ? antiBot.implication
        : nextStepForPattern(pattern.pattern);
    const report = {
        requested_url: signals.requestedUrl,
        final_url: signals.finalUrl,
        title: signals.title,
        pattern,
        anti_bot: antiBot,
        initial_state: signals.initialState,
        nearest_adapter: nearest,
        recommended_next_step: next,
    };
    return report;
}
/** Map a pattern letter to its canned advice; unknown letters get the generic fallback. */
function nextStepForPattern(letter) {
    switch (letter) {
        case 'A':
            return 'Pick the most specific JSON endpoint from `opencli browser network` and try a bare Node fetch with cookies; escalate to browser-context fetch only if blocked.';
        case 'B':
            return 'Read the SSR global via `opencli browser eval "JSON.stringify(window.__INITIAL_STATE__ ?? window.__NUXT__ ?? window.__NEXT_DATA__ ?? window.__APOLLO_STATE__)"` — no API needed.';
        case 'C':
            return 'No API visible — use SSR HTML scrape (e.g. `opencli browser extract`) against the rendered page.';
        case 'D':
            return 'Endpoints need auth. Re-open the page from a signed-in session, then retry analyze; see `field-decode-playbook` §4 for token tracing.';
        case 'E':
            return 'WebSocket stream detected — find the underlying HTTP poll/long-poll endpoint; raw WS is not supported.';
        default:
            return 'No strong signal. Manually inspect `opencli browser network --all` and pick a pattern.';
    }
}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,164 @@
1
+ import { describe, it, expect } from 'vitest';
2
+ import { analyzeSite, detectAntiBot, classifyPattern, findNearestAdapter, } from './analyze.js';
3
/**
 * Build a baseline signals fixture (no cookies, no network entries, every SSR
 * flag false) and shallow-merge `overrides` on top of it.
 */
function mkSignals(overrides = {}) {
    const baseline = {
        requestedUrl: 'https://example.com/',
        finalUrl: 'https://example.com/',
        cookieNames: [],
        networkEntries: [],
        initialState: {
            __INITIAL_STATE__: false,
            __NUXT__: false,
            __NEXT_DATA__: false,
            __APOLLO_STATE__: false,
        },
        title: 'Example',
    };
    return Object.assign(baseline, overrides);
}
19
/**
 * Build a minimal command record for registry fixtures. `domain` is optional
 * and is stored as `undefined` when omitted.
 */
function mkCmd(site, name, domain) {
    const cmd = { site, name };
    cmd.description = '';
    cmd.domain = domain;
    cmd.args = [];
    return cmd;
}
28
// detectAntiBot: cookie- and body-based vendor detection plus the no-match verdict.
describe('detectAntiBot', () => {
    it('flags Aliyun WAF from cookie', () => {
        const signals = mkSignals({ cookieNames: ['JSESSIONID', 'acw_sc__v2'] });
        const verdict = detectAntiBot(signals);
        expect(verdict.detected).toBe(true);
        expect(verdict.vendor).toBe('aliyun_waf');
        expect(verdict.evidence).toContain('cookie:acw_sc__v2');
        expect(verdict.implication).toMatch(/browser context/i);
    });
    it('flags Aliyun WAF from challenge HTML body', () => {
        // 32 uppercase hex characters satisfy the `arg1 = '<hex>'` challenge marker.
        const challengePage = {
            url: 'https://x.com/',
            status: 200,
            contentType: 'text/html',
            bodyPreview: "var arg1 = 'A1B2C3D4E5F6A7B8C9D0E1F2A3B4C5D6';",
        };
        const verdict = detectAntiBot(mkSignals({ networkEntries: [challengePage] }));
        expect(verdict.detected).toBe(true);
        expect(verdict.vendor).toBe('aliyun_waf');
    });
    it('flags Cloudflare from cf_clearance cookie', () => {
        const verdict = detectAntiBot(mkSignals({ cookieNames: ['cf_clearance'] }));
        expect(verdict.vendor).toBe('cloudflare');
        expect(verdict.implication).toMatch(/Cloudflare/i);
    });
    it('flags Akamai from _abck cookie', () => {
        const verdict = detectAntiBot(mkSignals({ cookieNames: ['_abck', 'bm_sz'] }));
        expect(verdict.vendor).toBe('akamai');
    });
    it('returns no-match verdict with actionable fallback advice', () => {
        const verdict = detectAntiBot(mkSignals());
        expect(verdict.detected).toBe(false);
        expect(verdict.vendor).toBeNull();
        expect(verdict.implication).toMatch(/Node-side COOKIE fetch first/);
    });
});
66
// classifyPattern: one case per reachable verdict (A, B, D, C).
describe('classifyPattern', () => {
    // Network-entry fixture; key order mirrors the shape used throughout this file.
    const entry = (url, status, contentType, bodyPreview) => ({ url, status, contentType, bodyPreview });
    it('returns A for JSON-heavy pages without SSR state', () => {
        const networkEntries = [
            entry('https://x.com/api/a', 200, 'application/json', '{}'),
            entry('https://x.com/api/b', 200, 'application/json;charset=utf-8', '{}'),
        ];
        const verdict = classifyPattern(mkSignals({ networkEntries }));
        expect(verdict.pattern).toBe('A');
        expect(verdict.json_responses).toBe(2);
    });
    it('returns B when __INITIAL_STATE__ is present, beating JSON signals', () => {
        const initialState = { __INITIAL_STATE__: true, __NUXT__: false, __NEXT_DATA__: false, __APOLLO_STATE__: false };
        const networkEntries = [entry('https://x.com/api/a', 200, 'application/json', '{}')];
        const verdict = classifyPattern(mkSignals({ initialState, networkEntries }));
        expect(verdict.pattern).toBe('B');
    });
    it('returns D when auth failures dominate', () => {
        const networkEntries = [
            entry('https://x.com/api/a', 401, 'application/json', ''),
            entry('https://x.com/api/b', 403, 'application/json', ''),
        ];
        const verdict = classifyPattern(mkSignals({ networkEntries }));
        expect(verdict.pattern).toBe('D');
        expect(verdict.auth_failures).toBe(2);
    });
    it('returns C by default for static pages', () => {
        const verdict = classifyPattern(mkSignals());
        expect(verdict.pattern).toBe('C');
    });
});
101
// findNearestAdapter: domain suffix match, site-name fallback, miss, and tie-breaking by command count.
describe('findNearestAdapter', () => {
    // Registry keys follow the "<site> <name>" convention used by the fixtures below.
    const registryOf = (...cmds) => new Map(cmds.map((cmd) => [`${cmd.site} ${cmd.name}`, cmd]));
    it('matches by domain suffix', () => {
        const registry = registryOf(
            mkCmd('51job', 'search', '51job.com'),
            mkCmd('51job', 'detail', '51job.com'),
            mkCmd('xueqiu', 'search', 'xueqiu.com'),
        );
        const nearest = findNearestAdapter('https://jobs.51job.com/', registry);
        expect(nearest?.site).toBe('51job');
        expect(nearest?.example_commands).toContain('51job search');
    });
    it('falls back to site-name containment when no domain is registered', () => {
        const registry = registryOf(mkCmd('51job', 'search'));
        const nearest = findNearestAdapter('https://we.51job.com/', registry);
        expect(nearest?.site).toBe('51job');
    });
    it('returns null when no adapter matches', () => {
        const registry = registryOf(mkCmd('xueqiu', 'search', 'xueqiu.com'));
        const nearest = findNearestAdapter('https://random-site.io/', registry);
        expect(nearest).toBeNull();
    });
    it('prefers the site with the most commands', () => {
        const registry = registryOf(
            mkCmd('a', 'search', 'a.com'),
            mkCmd('b', 'search', 'a.com'),
            mkCmd('b', 'detail', 'a.com'),
            mkCmd('b', 'company', 'a.com'),
        );
        const nearest = findNearestAdapter('https://jobs.a.com/', registry);
        expect(nearest?.site).toBe('b');
    });
});
137
// analyzeSite: the next-step advice follows the WAF verdict first, then the pattern.
describe('analyzeSite', () => {
    // SSR flag fixture with every global off except the ones named in `on`.
    const ssrFlags = (on = {}) => ({
        __INITIAL_STATE__: false,
        __NUXT__: false,
        __NEXT_DATA__: false,
        __APOLLO_STATE__: false,
        ...on,
    });
    it('recommends browser-context fetch when WAF is detected', () => {
        const signals = mkSignals({ cookieNames: ['acw_sc__v2'] });
        const report = analyzeSite(signals, new Map());
        expect(report.anti_bot.vendor).toBe('aliyun_waf');
        expect(report.recommended_next_step).toMatch(/browser context/i);
    });
    it('recommends reading SSR state when Pattern B fires', () => {
        const signals = mkSignals({ initialState: ssrFlags({ __NUXT__: true }) });
        const report = analyzeSite(signals, new Map());
        expect(report.pattern.pattern).toBe('B');
        expect(report.recommended_next_step).toMatch(/__NUXT__|__INITIAL_STATE__|__NEXT_DATA__/);
    });
    it('includes __APOLLO_STATE__ in Pattern B next-step guidance', () => {
        const signals = mkSignals({ initialState: ssrFlags({ __APOLLO_STATE__: true }) });
        const report = analyzeSite(signals, new Map());
        expect(report.pattern.pattern).toBe('B');
        expect(report.recommended_next_step).toMatch(/__APOLLO_STATE__/);
    });
    it('includes nearest_adapter when the registry has a match', () => {
        const registry = new Map([
            ['51job search', mkCmd('51job', 'search', '51job.com')],
        ]);
        const signals = mkSignals({ finalUrl: 'https://we.51job.com/' });
        const report = analyzeSite(signals, registry);
        expect(report.nearest_adapter?.site).toBe('51job');
    });
});
@@ -0,0 +1,57 @@
1
+ /**
2
+ * Article extraction via Readability — generic `page → article HTML` pipeline.
3
+ *
4
+ * Complements `src/browser/extract.ts`: that one takes a caller-supplied
5
+ * selector. This one works with zero configuration on arbitrary article pages
6
+ * (blogs, news, docs) by running `@mozilla/readability` inside the page
7
+ * context via CDP evaluate.
8
+ *
9
+ * Pipeline:
10
+ * 1. Short-circuit non-HTML documents (`text/plain`, JSON, XML) — a page
11
+ * renderer wrapping a plain-text file would pollute the DOM pipeline.
12
+ * 2. Short-circuit the "body is a single <pre>" case, which browsers use
13
+ * when loading *.txt / *.md over file:// or raw.githubusercontent.com.
14
+ * 3. Deep-clone the document, apply caller-supplied `cleanSelectors` to the
15
+ * clone (preserves live page state for subsequent snapshot/click).
16
+ * 4. Inject Readability + isProbablyReaderable sources into the page,
17
+ * parse on the clone. `isProbablyReaderable` gates the parse unless
18
+ * `force: true`.
19
+ * 5. On Readability miss, walk a fallback selector chain
20
+ * (main → [role="main"] → #main-content → … → body) and return the
21
+ * first root with >80 characters of text.
22
+ *
23
+ * Readability runs in the page's own window because it needs real DOM APIs
24
+ * (getComputedStyle, treeWalker). Running it Node-side would require jsdom —
25
+ * a heavy dep the rest of OpenCLI doesn't need.
26
+ */
27
+ export interface ExtractArticleOptions {
28
+ /** CSS selectors removed from the cloned document before Readability runs. */
29
+ cleanSelectors?: string[];
30
+ /** Fallback chain when Readability fails. Defaults to the common structural ids. */
31
+ fallbackSelectors?: string[];
32
+ /** Bypass `isProbablyReaderable` and always attempt a parse. */
33
+ force?: boolean;
34
+ }
35
+ export type ExtractSource = 'readability' | 'fallback' | 'raw-text' | 'pre';
36
+ export interface ExtractedArticle {
37
+ html: string;
38
+ title: string;
39
+ byline?: string;
40
+ publishedTime?: string;
41
+ siteName?: string;
42
+ source: ExtractSource;
43
+ }
44
+ export declare const DEFAULT_FALLBACK_SELECTORS: string[];
45
+ /**
46
+ * Build the JS expression evaluated in-page to extract the article. Exported
47
+ * for testability — callers on the host side should use `extractArticle`.
48
+ */
49
+ export declare function buildExtractArticleJs(options?: ExtractArticleOptions): string;
50
+ export interface PageLike {
51
+ evaluate(js: string): Promise<unknown>;
52
+ }
53
+ /**
54
+ * Run the extract pipeline on the given page. Returns `null` when no usable
55
+ * content is found (Readability miss + empty fallback chain).
56
+ */
57
+ export declare function extractArticle(page: PageLike, options?: ExtractArticleOptions): Promise<ExtractedArticle | null>;
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,105 @@
1
+ import { afterEach, describe, expect, it } from 'vitest';
2
+ import { JSDOM } from 'jsdom';
3
+ import * as fs from 'node:fs';
4
+ import * as os from 'node:os';
5
+ import * as path from 'node:path';
6
+ import { fileURLToPath } from 'node:url';
7
+ import { buildExtractArticleJs } from './article-extract.js';
8
+ import { downloadArticle } from '../download/article-download.js';
9
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
10
+ const fixturesDir = path.join(__dirname, '__fixtures__', 'article-extract');
11
+ const tempDirs = [];
12
// Remove every temp directory created during the test, then reset the list.
afterEach(() => {
    tempDirs.forEach((dir) => {
        fs.rmSync(dir, { recursive: true, force: true });
    });
    tempDirs.splice(0, tempDirs.length);
});
17
/** Read a fixture file from the article-extract fixtures directory as UTF-8 text. */
function loadFixture(name) {
    const fixturePath = path.join(fixturesDir, name);
    return fs.readFileSync(fixturePath, 'utf8');
}
20
/**
 * Escape `&`, `<` and `>` so arbitrary text can be embedded as HTML text
 * content. Quotes are left untouched.
 */
function escapeHtml(text) {
    // `&` must go first so the entities produced for `<` and `>` are not re-escaped.
    return text
        .replaceAll('&', '&amp;')
        .replaceAll('<', '&lt;')
        .replaceAll('>', '&gt;');
}
23
/**
 * Evaluate the in-page extraction script against `html` loaded into jsdom at
 * `url`. The DOM is always parsed as `text/html`; when `contentType` is given,
 * `document.contentType` is overridden afterwards so the script sees that
 * value instead.
 */
function runExtract(html, url, options = {}, contentType) {
    const jsdomOptions = {
        url,
        contentType: 'text/html',
        pretendToBeVisual: true,
        runScripts: 'outside-only',
    };
    const pageWindow = new JSDOM(html, jsdomOptions).window;
    if (contentType) {
        const descriptor = { value: contentType, configurable: true };
        Object.defineProperty(pageWindow.document, 'contentType', descriptor);
    }
    const script = buildExtractArticleJs(options);
    return pageWindow.eval(script);
}
38
/**
 * Run `downloadArticle` on an extracted article inside a fresh temp directory
 * (registered in `tempDirs` for cleanup), assert the first result succeeded,
 * and return the saved file's contents.
 */
async function renderMarkdown(article, url, options = {}) {
    const prefix = path.join(os.tmpdir(), 'opencli-article-e2e-');
    const outDir = await fs.promises.mkdtemp(prefix);
    tempDirs.push(outDir);
    const payload = {
        title: article.title || 'untitled',
        contentHtml: article.html,
        sourceUrl: url,
    };
    const settings = {
        output: outDir,
        downloadImages: false,
        cleanSelectors: options.cleanSelectors,
    };
    const results = await downloadArticle(payload, settings);
    const first = results[0];
    expect(first.status).toBe('success');
    return fs.readFileSync(first.saved, 'utf8');
}
53
// End-to-end: fixture HTML -> in-page extract script (via jsdom) -> markdown on disk.
describe('article extract → markdown e2e fixtures', () => {
    const RAW_README_URL = 'https://raw.githubusercontent.com/openai/openai-cookbook/main/README.md';
    // Wrap the README fixture the way a browser renders a plain-text response: one <pre> in <body>.
    const readmeAsPrePage = () => {
        const text = loadFixture('openai-cookbook-readme.txt');
        return `<html><head><title>OpenAI Cookbook README</title></head><body><pre>${escapeHtml(text)}</pre></body></html>`;
    };
    it('extracts a Wikipedia article fixture and keeps infobox/reference noise out of markdown', async () => {
        const pageUrl = 'https://en.wikipedia.org/wiki/Markdown';
        const cleanSelectors = ['.infobox', '.navbox', '.reference', '.mw-editsection', '.metadata'];
        const extracted = runExtract(loadFixture('wikipedia-markdown.html'), pageUrl, { cleanSelectors });
        expect(extracted?.source).toBe('readability');
        expect(extracted?.title).toBe('Markdown');
        if (!extracted) {
            throw new Error('expected extracted article');
        }
        const markdown = await renderMarkdown(extracted, pageUrl, { cleanSelectors });
        expect(markdown).toContain('lightweight markup language');
        expect(markdown).toContain('John Gruber');
        expect(markdown).not.toContain('Syntax description');
        expect(markdown).not.toContain('Standard file extension');
    });
    it('extracts a Deno blog fixture, preserves embedded iframes as markdown links, and drops page chrome', async () => {
        const pageUrl = 'https://deno.com/blog/v2.0';
        const extracted = runExtract(loadFixture('deno-v2.html'), pageUrl);
        expect(extracted?.source).toBe('readability');
        expect(extracted?.title).toBe('Announcing Deno 2 | Deno');
        if (!extracted) {
            throw new Error('expected extracted article');
        }
        const markdown = await renderMarkdown(extracted, pageUrl);
        expect(markdown).toContain('## Announcing Deno 2');
        expect(markdown).toContain('The web is humanity’s largest software platform');
        expect(markdown).toMatch(/\]\(https:\/\/www\.youtube(?:-nocookie)?\.com\/embed\/[^)]+\)/);
        expect(markdown).not.toContain('Skip to main content');
    });
    it('short-circuits non-HTML raw text pages end-to-end', async () => {
        const extracted = runExtract(readmeAsPrePage(), RAW_README_URL, {}, 'text/plain');
        expect(extracted?.source).toBe('raw-text');
        if (!extracted) {
            throw new Error('expected extracted article');
        }
        const markdown = await renderMarkdown(extracted, RAW_README_URL);
        expect(markdown).toContain('OPENAI\\_API\\_KEY');
        expect(markdown).toContain('Example code and guides for accomplishing common tasks');
    });
    it('short-circuits a single-pre document end-to-end', async () => {
        const extracted = runExtract(readmeAsPrePage(), RAW_README_URL);
        expect(extracted?.source).toBe('pre');
        if (!extracted) {
            throw new Error('expected extracted article');
        }
        const markdown = await renderMarkdown(extracted, RAW_README_URL);
        expect(markdown).toContain('OPENAI\\_API\\_KEY');
        expect(markdown).toContain('Most code examples are written in Python');
    });
});