@jackwener/opencli 1.7.5 → 1.7.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. package/README.md +22 -10
  2. package/README.zh-CN.md +18 -9
  3. package/cli-manifest.json +401 -11
  4. package/clis/51job/company.js +125 -0
  5. package/clis/51job/detail.js +108 -0
  6. package/clis/51job/hot.js +55 -0
  7. package/clis/51job/search.js +79 -0
  8. package/clis/51job/utils.js +302 -0
  9. package/clis/51job/utils.test.js +69 -0
  10. package/clis/bilibili/video.js +68 -0
  11. package/clis/bilibili/video.test.js +132 -0
  12. package/clis/chatgpt/image.js +1 -1
  13. package/clis/deepseek/ask.js +37 -11
  14. package/clis/deepseek/ask.test.js +165 -0
  15. package/clis/deepseek/utils.js +192 -24
  16. package/clis/deepseek/utils.test.js +145 -0
  17. package/clis/gemini/image.js +1 -1
  18. package/clis/instagram/download.js +1 -1
  19. package/clis/jianyu/search.js +139 -3
  20. package/clis/jianyu/search.test.js +25 -0
  21. package/clis/jianyu/shared/procurement-detail.js +15 -0
  22. package/clis/jianyu/shared/procurement-detail.test.js +12 -0
  23. package/clis/twitter/likes.js +3 -2
  24. package/clis/twitter/search.js +4 -2
  25. package/clis/twitter/search.test.js +4 -0
  26. package/clis/twitter/shared.js +35 -2
  27. package/clis/twitter/shared.test.js +96 -0
  28. package/clis/twitter/thread.js +3 -1
  29. package/clis/twitter/timeline.js +3 -2
  30. package/clis/twitter/tweets.js +219 -0
  31. package/clis/twitter/tweets.test.js +125 -0
  32. package/clis/web/read.js +25 -5
  33. package/clis/web/read.test.js +76 -0
  34. package/clis/weread/ai-outline.js +170 -0
  35. package/clis/weread/ai-outline.test.js +83 -0
  36. package/clis/weread/book.js +57 -44
  37. package/clis/weread/commands.test.js +24 -0
  38. package/clis/xiaoyuzhou/podcast-episodes.js +2 -2
  39. package/clis/xiaoyuzhou/podcast-episodes.test.js +78 -0
  40. package/clis/youtube/channel.js +35 -0
  41. package/dist/src/browser/analyze.d.ts +103 -0
  42. package/dist/src/browser/analyze.js +230 -0
  43. package/dist/src/browser/analyze.test.d.ts +1 -0
  44. package/dist/src/browser/analyze.test.js +164 -0
  45. package/dist/src/browser/article-extract.d.ts +57 -0
  46. package/dist/src/browser/article-extract.e2e.test.d.ts +1 -0
  47. package/dist/src/browser/article-extract.e2e.test.js +105 -0
  48. package/dist/src/browser/article-extract.js +169 -0
  49. package/dist/src/browser/article-extract.test.d.ts +1 -0
  50. package/dist/src/browser/article-extract.test.js +94 -0
  51. package/dist/src/browser/base-page.d.ts +13 -3
  52. package/dist/src/browser/base-page.js +35 -25
  53. package/dist/src/browser/cdp.d.ts +1 -0
  54. package/dist/src/browser/cdp.js +23 -5
  55. package/dist/src/browser/compound.d.ts +59 -0
  56. package/dist/src/browser/compound.js +112 -0
  57. package/dist/src/browser/compound.test.d.ts +1 -0
  58. package/dist/src/browser/compound.test.js +175 -0
  59. package/dist/src/browser/dom-snapshot.d.ts +7 -0
  60. package/dist/src/browser/dom-snapshot.js +76 -3
  61. package/dist/src/browser/dom-snapshot.test.js +65 -0
  62. package/dist/src/browser/extract.d.ts +69 -0
  63. package/dist/src/browser/extract.js +132 -0
  64. package/dist/src/browser/extract.test.d.ts +1 -0
  65. package/dist/src/browser/extract.test.js +129 -0
  66. package/dist/src/browser/find.d.ts +76 -0
  67. package/dist/src/browser/find.js +179 -0
  68. package/dist/src/browser/find.test.d.ts +1 -0
  69. package/dist/src/browser/find.test.js +120 -0
  70. package/dist/src/browser/html-tree.d.ts +75 -0
  71. package/dist/src/browser/html-tree.js +112 -0
  72. package/dist/src/browser/html-tree.test.d.ts +1 -0
  73. package/dist/src/browser/html-tree.test.js +181 -0
  74. package/dist/src/browser/network-cache.d.ts +48 -0
  75. package/dist/src/browser/network-cache.js +66 -0
  76. package/dist/src/browser/network-cache.test.d.ts +1 -0
  77. package/dist/src/browser/network-cache.test.js +58 -0
  78. package/dist/src/browser/network-key.d.ts +22 -0
  79. package/dist/src/browser/network-key.js +66 -0
  80. package/dist/src/browser/network-key.test.d.ts +1 -0
  81. package/dist/src/browser/network-key.test.js +49 -0
  82. package/dist/src/browser/shape-filter.d.ts +52 -0
  83. package/dist/src/browser/shape-filter.js +101 -0
  84. package/dist/src/browser/shape-filter.test.d.ts +1 -0
  85. package/dist/src/browser/shape-filter.test.js +101 -0
  86. package/dist/src/browser/shape.d.ts +23 -0
  87. package/dist/src/browser/shape.js +95 -0
  88. package/dist/src/browser/shape.test.d.ts +1 -0
  89. package/dist/src/browser/shape.test.js +82 -0
  90. package/dist/src/browser/target-errors.d.ts +14 -1
  91. package/dist/src/browser/target-errors.js +13 -0
  92. package/dist/src/browser/target-errors.test.js +39 -6
  93. package/dist/src/browser/target-resolver.d.ts +57 -10
  94. package/dist/src/browser/target-resolver.js +195 -75
  95. package/dist/src/browser/target-resolver.test.js +80 -5
  96. package/dist/src/browser/verify-fixture.d.ts +59 -0
  97. package/dist/src/browser/verify-fixture.js +213 -0
  98. package/dist/src/browser/verify-fixture.test.d.ts +1 -0
  99. package/dist/src/browser/verify-fixture.test.js +161 -0
  100. package/dist/src/cli.d.ts +32 -0
  101. package/dist/src/cli.js +936 -141
  102. package/dist/src/cli.test.js +1051 -1
  103. package/dist/src/daemon.d.ts +3 -2
  104. package/dist/src/daemon.js +16 -4
  105. package/dist/src/daemon.test.d.ts +1 -0
  106. package/dist/src/daemon.test.js +19 -0
  107. package/dist/src/download/article-download.d.ts +12 -0
  108. package/dist/src/download/article-download.js +141 -17
  109. package/dist/src/download/article-download.test.js +196 -0
  110. package/dist/src/download/index.js +73 -86
  111. package/dist/src/errors.js +4 -2
  112. package/dist/src/errors.test.js +13 -0
  113. package/dist/src/execution.js +7 -2
  114. package/dist/src/execution.test.js +54 -0
  115. package/dist/src/launcher.d.ts +1 -1
  116. package/dist/src/launcher.js +3 -3
  117. package/dist/src/main.js +16 -0
  118. package/dist/src/output.js +1 -1
  119. package/dist/src/output.test.js +6 -0
  120. package/dist/src/types.d.ts +18 -3
  121. package/package.json +5 -1
@@ -9,7 +9,8 @@
9
9
  * 1. Origin check — reject HTTP/WS from non chrome-extension:// origins
10
10
  * 2. Custom header — require X-OpenCLI header (browsers can't send it
11
11
  * without CORS preflight, which we deny)
12
- * 3. No CORS headers — responses never include Access-Control-Allow-Origin
12
+ * 3. No CORS headers on command endpoints — only /ping is readable from the
13
+ * Browser Bridge extension origin so the extension can probe daemon reachability
13
14
  * 4. Body size limit — 1 MB max to prevent OOM
14
15
  * 5. WebSocket verifyClient — reject upgrade before connection is established
15
16
  *
@@ -18,4 +19,4 @@
18
19
  * - Persistent — stays alive until explicit shutdown, SIGTERM, or uninstall
19
20
  * - Listens on localhost:19825
20
21
  */
21
- export {};
22
+ export declare function getResponseCorsHeaders(pathname: string, origin?: string): Record<string, string> | undefined;
@@ -9,7 +9,8 @@
9
9
  * 1. Origin check — reject HTTP/WS from non chrome-extension:// origins
10
10
  * 2. Custom header — require X-OpenCLI header (browsers can't send it
11
11
  * without CORS preflight, which we deny)
12
- * 3. No CORS headers — responses never include Access-Control-Allow-Origin
12
+ * 3. No CORS headers on command endpoints — only /ping is readable from the
13
+ * Browser Bridge extension origin so the extension can probe daemon reachability
13
14
  * 4. Body size limit — 1 MB max to prevent OOM
14
15
  * 5. WebSocket verifyClient — reject upgrade before connection is established
15
16
  *
@@ -60,10 +61,20 @@ function readBody(req) {
60
61
  reject(err); });
61
62
  });
62
63
  }
63
- function jsonResponse(res, status, data) {
64
- res.writeHead(status, { 'Content-Type': 'application/json' });
64
+ function jsonResponse(res, status, data, extraHeaders) {
65
+ res.writeHead(status, { 'Content-Type': 'application/json', ...extraHeaders });
65
66
  res.end(JSON.stringify(data));
66
67
  }
68
+ export function getResponseCorsHeaders(pathname, origin) {
69
+ if (pathname !== '/ping')
70
+ return undefined;
71
+ if (!origin || !origin.startsWith('chrome-extension://'))
72
+ return undefined;
73
+ return {
74
+ 'Access-Control-Allow-Origin': origin,
75
+ Vary: 'Origin',
76
+ };
77
+ }
67
78
  async function handleRequest(req, res) {
68
79
  // ─── Security: Origin & custom-header check ──────────────────────
69
80
  // Block browser-based CSRF: browsers always send an Origin header on
@@ -93,7 +104,7 @@ async function handleRequest(req, res) {
93
104
  // Timing side-channels can reveal daemon presence to local processes, which
94
105
  // is an accepted risk given the daemon is loopback-only and short-lived.
95
106
  if (req.method === 'GET' && pathname === '/ping') {
96
- jsonResponse(res, 200, { ok: true });
107
+ jsonResponse(res, 200, { ok: true }, getResponseCorsHeaders(pathname, origin));
97
108
  return;
98
109
  }
99
110
  // Require custom header on all other HTTP requests. Browsers cannot attach
@@ -272,6 +283,7 @@ wss.on('connection', (ws) => {
272
283
  if (extensionWs === ws) {
273
284
  extensionWs = null;
274
285
  extensionVersion = null;
286
+ extensionCompatRange = null;
275
287
  // Reject pending requests in case 'close' does not follow this 'error'
276
288
  for (const [, p] of pending) {
277
289
  clearTimeout(p.timer);
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,19 @@
1
+ import { describe, expect, it } from 'vitest';
2
+ import { getResponseCorsHeaders } from './daemon.js';
3
+ describe('getResponseCorsHeaders', () => {
4
+ it('allows the Browser Bridge extension origin to read /ping', () => {
5
+ expect(getResponseCorsHeaders('/ping', 'chrome-extension://abc123')).toEqual({
6
+ 'Access-Control-Allow-Origin': 'chrome-extension://abc123',
7
+ Vary: 'Origin',
8
+ });
9
+ });
10
+ it('does not add CORS headers for ordinary web origins', () => {
11
+ expect(getResponseCorsHeaders('/ping', 'https://example.com')).toBeUndefined();
12
+ });
13
+ it('does not add CORS headers when origin is absent', () => {
14
+ expect(getResponseCorsHeaders('/ping')).toBeUndefined();
15
+ });
16
+ it('does not add CORS headers for command endpoints even from the extension origin', () => {
17
+ expect(getResponseCorsHeaders('/command', 'chrome-extension://abc123')).toBeUndefined();
18
+ });
19
+ });
@@ -37,6 +37,18 @@ export interface ArticleDownloadOptions {
37
37
  detectImageExt?: (url: string) => string;
38
38
  /** Custom frontmatter labels (default: Chinese labels) */
39
39
  frontmatterLabels?: FrontmatterLabels;
40
+ /**
41
+ * Extra CSS selectors removed from the article before Turndown conversion.
42
+ * Use this to drop site-specific noise the adapter can't always trim upstream
43
+ * (e.g. zhihu 折叠卡, weixin 赞赏栏, wiki infobox).
44
+ */
45
+ cleanSelectors?: string[];
46
+ /**
47
+ * Write the markdown to `process.stdout` instead of a file on disk. Image
48
+ * download and directory creation are skipped — remote image URLs are kept
49
+ * as-is so the output is self-contained when piped.
50
+ */
51
+ stdout?: boolean;
40
52
  }
41
53
  export interface ArticleDownloadResult {
42
54
  title: string;
@@ -8,6 +8,7 @@
8
8
  import * as fs from 'node:fs';
9
9
  import * as path from 'node:path';
10
10
  import TurndownService from 'turndown';
11
+ import { gfm } from 'turndown-plugin-gfm';
11
12
  import { httpDownload, sanitizeFilename } from './index.js';
12
13
  import { formatBytes } from './progress.js';
13
14
  const IMAGE_CONCURRENCY = 5;
@@ -19,22 +20,127 @@ const DEFAULT_LABELS = {
19
20
  // ============================================================
20
21
  // Markdown Conversion
21
22
  // ============================================================
22
- function createTurndown(configure) {
23
+ // Nodes that never carry article content. Turndown keeps them by default — if an
24
+ // adapter's contentHtml extraction misses one, CSS / scripts / widget markup
25
+ // ends up inline in the .md. Strip them unconditionally at the converter level.
26
+ // `svg` is not in HTMLElementTagNameMap, so we type-narrow manually.
27
+ // `header/footer/nav/aside` cover page chrome that adapters occasionally
28
+ // forget to trim — the article's own title/author/publishTime are supplied
29
+ // as separate fields on ArticleData, so duplicated nodes are redundant.
30
+ // `iframe` is NOT in this set — it's handled by a dedicated rule below that
31
+ // degrades to a link so embedded content (YouTube, Twitter, CodePen …) keeps
32
+ // a reachable URL in the exported markdown.
33
+ const STRIPPED_TAGS = [
34
+ 'script', 'style', 'noscript',
35
+ 'canvas',
36
+ 'form', 'button', 'dialog',
37
+ 'header', 'footer', 'nav', 'aside',
38
+ ];
39
+ function createTurndown(configure, cleanSelectors) {
23
40
  const td = new TurndownService({
24
41
  headingStyle: 'atx',
25
42
  codeBlockStyle: 'fenced',
26
43
  bulletListMarker: '-',
27
44
  });
45
+ td.use(gfm);
46
+ td.remove(STRIPPED_TAGS);
47
+ // turndown-plugin-gfm@1.0.2 emits single-tilde strikethrough (`~x~`), which
48
+ // is not the canonical GFM form. Override it so exported markdown is
49
+ // portable across common renderers.
50
+ td.addRule('canonicalStrikethrough', {
51
+ filter: (node) => ['DEL', 'S', 'STRIKE'].includes(node.nodeName),
52
+ replacement: (content) => `~~${content}~~`,
53
+ });
54
+ // SVG isn't in the static HTML tag map; match by name with a custom filter.
55
+ td.addRule('stripSvg', {
56
+ filter: (node) => node.nodeName === 'svg' || node.nodeName === 'SVG',
57
+ replacement: () => '',
58
+ });
28
59
  td.addRule('linebreak', {
29
60
  filter: 'br',
30
61
  replacement: () => '\n',
31
62
  });
63
+ // Inline base64 images would land as huge `![](data:image/...;base64,...)`
64
+ // strings that the image downloader can't localize. Drop them.
65
+ td.addRule('ignoreBase64Images', {
66
+ filter: (node) => {
67
+ if (node.nodeName !== 'IMG')
68
+ return false;
69
+ const src = node.getAttribute?.('src') ?? '';
70
+ return src.startsWith('data:');
71
+ },
72
+ replacement: () => '',
73
+ });
74
+ // Markdown has no native video/audio primitive. Emit inline HTML so
75
+ // renderers that support it (GitHub, VS Code preview …) still play the
76
+ // media; viewers that don't simply show the tag as text, which is still
77
+ // more information than dropping the node outright.
78
+ td.addRule('videoElement', {
79
+ filter: (node) => node.nodeName === 'VIDEO',
80
+ replacement: (_content, node) => {
81
+ const el = node;
82
+ const src = el.getAttribute('src')
83
+ || el.querySelector('source')?.getAttribute('src')
84
+ || '';
85
+ if (!src)
86
+ return '';
87
+ const poster = el.getAttribute('poster') || '';
88
+ return `\n<video src="${src}" controls${poster ? ` poster="${poster}"` : ''}></video>\n`;
89
+ },
90
+ });
91
+ td.addRule('audioElement', {
92
+ filter: (node) => node.nodeName === 'AUDIO',
93
+ replacement: (_content, node) => {
94
+ const el = node;
95
+ const src = el.getAttribute('src')
96
+ || el.querySelector('source')?.getAttribute('src')
97
+ || '';
98
+ return src ? `\n<audio src="${src}" controls></audio>\n` : '';
99
+ },
100
+ });
101
+ // Iframes (YouTube, Twitter, CodePen …) degrade to a markdown link so the
102
+ // embedded resource is still reachable from the exported file.
103
+ td.addRule('iframeToLink', {
104
+ filter: (node) => node.nodeName === 'IFRAME',
105
+ replacement: (_content, node) => {
106
+ const el = node;
107
+ const src = el.getAttribute('src') || '';
108
+ if (!src)
109
+ return '';
110
+ const title = el.getAttribute('title') || 'Embedded content';
111
+ return `\n[${title}](${src})\n`;
112
+ },
113
+ });
114
+ // Per-adapter dirty-node removal. Adapters know their site's specific noise
115
+ // (zhihu 折叠卡, weixin 赞赏栏, wiki 折叠 infobox …); we keep the default set
116
+ // empty so the generic converter stays untouched.
117
+ const selectorRules = (cleanSelectors ?? [])
118
+ .map(sel => sel.trim())
119
+ .filter(Boolean);
120
+ if (selectorRules.length > 0) {
121
+ td.addRule('cleanSelectors', {
122
+ filter: (node) => {
123
+ const match = node.matches;
124
+ if (typeof match !== 'function')
125
+ return false;
126
+ return selectorRules.some((sel) => {
127
+ try {
128
+ return match.call(node, sel);
129
+ }
130
+ catch {
131
+ return false;
132
+ }
133
+ });
134
+ },
135
+ replacement: () => '',
136
+ });
137
+ }
32
138
  if (configure)
33
139
  configure(td);
34
140
  return td;
35
141
  }
36
- function convertToMarkdown(contentHtml, codeBlocks, configure) {
37
- const td = createTurndown(configure);
142
+ function convertToMarkdown(contentHtml, codeBlocks, configure, cleanSelectors) {
143
+ const td = createTurndown(configure, cleanSelectors);
38
144
  let md = td.turndown(contentHtml);
39
145
  // Restore code block placeholders
40
146
  codeBlocks.forEach((block, i) => {
@@ -44,8 +150,12 @@ function convertToMarkdown(contentHtml, codeBlocks, configure) {
44
150
  });
45
151
  // Clean up
46
152
  md = md.replace(/\u00a0/g, ' ');
47
- md = md.replace(/\n{4,}/g, '\n\n\n');
153
+ // Turndown leaves behind lone dashes / middle dots when list bullets or
154
+ // decorative separators lose their surrounding inline context.
155
+ md = md.replace(/^[ \t]*[-·][ \t]*$/gm, '');
156
+ md = md.replace(/^[ \t]+$/gm, '');
48
157
  md = md.replace(/[ \t]+$/gm, '');
158
+ md = md.replace(/\n{3,}/g, '\n\n');
49
159
  return md;
50
160
  }
51
161
  function replaceImageUrls(md, urlMap) {
@@ -120,7 +230,7 @@ async function downloadImages(imgUrls, imgDir, headers, detectExt) {
120
230
  * 6. File write
121
231
  */
122
232
  export async function downloadArticle(data, options) {
123
- const { output, downloadImages: shouldDownloadImages = true, imageHeaders, maxTitleLength = 80, configureTurndown, detectImageExt, frontmatterLabels, } = options;
233
+ const { output, downloadImages: shouldDownloadImages = true, imageHeaders, maxTitleLength = 80, configureTurndown, detectImageExt, frontmatterLabels, cleanSelectors, stdout = false, } = options;
124
234
  const labels = { ...DEFAULT_LABELS, ...frontmatterLabels };
125
235
  if (!data.title) {
126
236
  return [{
@@ -143,33 +253,47 @@ export async function downloadArticle(data, options) {
143
253
  }];
144
254
  }
145
255
  // Convert HTML to Markdown
146
- let markdown = convertToMarkdown(data.contentHtml, data.codeBlocks || [], configureTurndown);
147
- // Prepare output directory
256
+ let markdown = convertToMarkdown(data.contentHtml, data.codeBlocks || [], configureTurndown, cleanSelectors);
148
257
  const safeTitle = sanitizeFilename(data.title, maxTitleLength);
149
- const articleDir = path.join(output, safeTitle);
150
- fs.mkdirSync(articleDir, { recursive: true });
151
- // Download images
152
- if (shouldDownloadImages && data.imageUrls && data.imageUrls.length > 0) {
258
+ // Download images only when writing to disk. In stdout mode remote URLs
259
+ // stay intact so the piped output is self-contained.
260
+ if (!stdout && shouldDownloadImages && data.imageUrls && data.imageUrls.length > 0) {
261
+ const articleDir = path.join(output, safeTitle);
262
+ fs.mkdirSync(articleDir, { recursive: true });
153
263
  const imagesDir = path.join(articleDir, 'images');
154
264
  fs.mkdirSync(imagesDir, { recursive: true });
155
265
  const urlMap = await downloadImages(data.imageUrls, imagesDir, imageHeaders, detectImageExt);
156
266
  markdown = replaceImageUrls(markdown, urlMap);
157
267
  }
158
- // Build frontmatter with customizable labels
159
- const headerLines = [`# ${data.title}`, ''];
268
+ // Build frontmatter with customizable labels.
269
+ // Shape: `# Title\n[> meta\n...]\n---\n\n<markdown>` — exactly one blank
270
+ // line separates every section, so we never produce ≥3 consecutive newlines.
271
+ const headerLines = [`# ${data.title}`];
160
272
  if (data.author)
161
273
  headerLines.push(`> ${labels.author}: ${data.author}`);
162
274
  if (data.publishTime)
163
275
  headerLines.push(`> ${labels.publishTime}: ${data.publishTime}`);
164
276
  if (data.sourceUrl)
165
277
  headerLines.push(`> ${labels.sourceUrl}: ${data.sourceUrl}`);
166
- headerLines.push('', '---', '');
167
- const fullContent = headerLines.join('\n') + markdown;
168
- // Write file
278
+ const frontmatter = headerLines.join('\n') + '\n\n---\n\n';
279
+ const fullContent = frontmatter + markdown;
280
+ const size = Buffer.byteLength(fullContent, 'utf-8');
281
+ if (stdout) {
282
+ process.stdout.write(fullContent.endsWith('\n') ? fullContent : fullContent + '\n');
283
+ return [{
284
+ title: data.title,
285
+ author: data.author || '-',
286
+ publish_time: data.publishTime || '-',
287
+ status: 'success',
288
+ size: formatBytes(size),
289
+ saved: '-',
290
+ }];
291
+ }
292
+ const articleDir = path.join(output, safeTitle);
293
+ fs.mkdirSync(articleDir, { recursive: true });
169
294
  const filename = `${safeTitle}.md`;
170
295
  const filePath = path.join(articleDir, filename);
171
296
  fs.writeFileSync(filePath, fullContent, 'utf-8');
172
- const size = Buffer.byteLength(fullContent, 'utf-8');
173
297
  return [{
174
298
  title: data.title,
175
299
  author: data.author || '-',
@@ -15,6 +15,20 @@ afterEach(() => {
15
15
  }
16
16
  tempDirs.length = 0;
17
17
  });
18
+ async function runAndRead(contentHtml, opts = {}) {
19
+ const tempDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-article-'));
20
+ tempDirs.push(tempDir);
21
+ const result = await downloadArticle({
22
+ title: 'Test Article',
23
+ contentHtml,
24
+ }, {
25
+ output: tempDir,
26
+ downloadImages: false,
27
+ ...(opts.cleanSelectors && { cleanSelectors: opts.cleanSelectors }),
28
+ });
29
+ expect(result[0].status).toBe('success');
30
+ return fs.readFileSync(result[0].saved, 'utf8');
31
+ }
18
32
  describe('downloadArticle', () => {
19
33
  it('returns the saved markdown file path on success', async () => {
20
34
  const tempDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-article-'));
@@ -36,4 +50,186 @@ describe('downloadArticle', () => {
36
50
  expect(fs.existsSync(result[0].saved)).toBe(true);
37
51
  expect(fs.readFileSync(result[0].saved, 'utf8')).toContain('Hello world');
38
52
  });
53
+ describe('markdown pipeline', () => {
54
+ it('converts GFM tables', async () => {
55
+ const md = await runAndRead('<table><thead><tr><th>a</th><th>b</th></tr></thead>' +
56
+ '<tbody><tr><td>1</td><td>2</td></tr></tbody></table>');
57
+ expect(md).toMatch(/\|\s*a\s*\|\s*b\s*\|/);
58
+ expect(md).toMatch(/\|\s*---\s*\|\s*---\s*\|/);
59
+ expect(md).toMatch(/\|\s*1\s*\|\s*2\s*\|/);
60
+ });
61
+ it('converts strikethrough and task lists', async () => {
62
+ const md = await runAndRead('<p><del>gone</del></p>' +
63
+ '<ul><li><input type="checkbox" checked>done</li><li><input type="checkbox">todo</li></ul>');
64
+ expect(md).toContain('~~gone~~');
65
+ expect(md).toContain('[x] done');
66
+ expect(md).toContain('[ ] todo');
67
+ });
68
+ it('strips script / style / noscript / form but keeps iframe as a link', async () => {
69
+ const md = await runAndRead('<p>keep</p>' +
70
+ '<script>alert(1)</script>' +
71
+ '<style>.x{color:red}</style>' +
72
+ '<noscript>nojs</noscript>' +
73
+ '<iframe src="https://www.youtube.com/embed/abc" title="Demo video"></iframe>' +
74
+ '<form><button>click</button></form>');
75
+ expect(md).toContain('keep');
76
+ expect(md).not.toContain('alert');
77
+ expect(md).not.toContain('color:red');
78
+ expect(md).not.toContain('nojs');
79
+ expect(md).not.toContain('click');
80
+ // Iframe degrades to a link preserving the embedded URL.
81
+ expect(md).toContain('[Demo video](https://www.youtube.com/embed/abc)');
82
+ });
83
+ it('strips SVG nodes entirely', async () => {
84
+ const md = await runAndRead('<p>before</p><svg><circle cx="5" cy="5" r="4"/></svg><p>after</p>');
85
+ expect(md).toContain('before');
86
+ expect(md).toContain('after');
87
+ expect(md).not.toContain('svg');
88
+ expect(md).not.toContain('circle');
89
+ });
90
+ it('drops base64 data URI images but keeps regular images', async () => {
91
+ const md = await runAndRead('<p><img alt="inline" src="data:image/png;base64,iVBORw0KGgo="></p>' +
92
+ '<p><img alt="keep" src="https://example.com/a.jpg"></p>');
93
+ expect(md).not.toContain('data:image');
94
+ expect(md).toContain('![keep](https://example.com/a.jpg)');
95
+ });
96
+ it('collapses 3+ blank lines and strips lone bullet / middle-dot residue', async () => {
97
+ const md = await runAndRead('<p>top</p>' +
98
+ '<p>-</p>' +
99
+ '<p>·</p>' +
100
+ '<p>bottom</p>');
101
+ expect(md).not.toMatch(/\n{3,}/);
102
+ expect(md).not.toMatch(/^\s*-\s*$/m);
103
+ expect(md).not.toMatch(/^\s*·\s*$/m);
104
+ expect(md).toContain('top');
105
+ expect(md).toContain('bottom');
106
+ });
107
+ it('strips page chrome (header / footer / nav / aside)', async () => {
108
+ const md = await runAndRead('<header><p>page-header-text</p></header>' +
109
+ '<nav><a href="/">home-link</a></nav>' +
110
+ '<p>article-body</p>' +
111
+ '<aside><p>sidebar-text</p></aside>' +
112
+ '<footer><p>page-footer-text</p></footer>');
113
+ expect(md).toContain('article-body');
114
+ expect(md).not.toContain('page-header-text');
115
+ expect(md).not.toContain('home-link');
116
+ expect(md).not.toContain('sidebar-text');
117
+ expect(md).not.toContain('page-footer-text');
118
+ });
119
+ it('cleanSelectors removes matching nodes before conversion', async () => {
120
+ const md = await runAndRead('<p>keep-me</p>' +
121
+ '<div class="vote-card">折叠卡</div>' +
122
+ '<section class="reward-panel">赞赏栏</section>' +
123
+ '<p>also-keep</p>', { cleanSelectors: ['.vote-card', '.reward-panel'] });
124
+ expect(md).toContain('keep-me');
125
+ expect(md).toContain('also-keep');
126
+ expect(md).not.toContain('折叠卡');
127
+ expect(md).not.toContain('赞赏栏');
128
+ });
129
+ it('cleanSelectors silently ignores invalid selectors', async () => {
130
+ const md = await runAndRead('<p>survives</p><div class="x">and-this-too</div>', { cleanSelectors: ['!!!not-a-valid-selector', '.missing'] });
131
+ expect(md).toContain('survives');
132
+ expect(md).toContain('and-this-too');
133
+ });
134
+ it('cleanSelectors keeps valid selectors active when one selector is invalid', async () => {
135
+ const md = await runAndRead('<p>keep</p><div class="vote-card">strip-me</div><p>also-keep</p>', { cleanSelectors: ['!!!not-a-valid-selector', '.vote-card'] });
136
+ expect(md).toContain('keep');
137
+ expect(md).toContain('also-keep');
138
+ expect(md).not.toContain('strip-me');
139
+ });
140
+ it('preserves <video> as inline HTML with src + poster', async () => {
141
+ const md = await runAndRead('<p>before</p>' +
142
+ '<video src="https://cdn.example.com/clip.mp4" poster="https://cdn.example.com/poster.jpg"></video>' +
143
+ '<p>after</p>');
144
+ expect(md).toContain('<video src="https://cdn.example.com/clip.mp4" controls poster="https://cdn.example.com/poster.jpg"></video>');
145
+ expect(md).toContain('before');
146
+ expect(md).toContain('after');
147
+ });
148
+ it('falls back to <source> inside <video> when src attribute is absent', async () => {
149
+ const md = await runAndRead('<video><source src="https://cdn.example.com/clip.mp4" type="video/mp4"></video>');
150
+ expect(md).toContain('<video src="https://cdn.example.com/clip.mp4" controls></video>');
151
+ });
152
+ it('drops <video> with no src and no <source>', async () => {
153
+ const md = await runAndRead('<p>before</p><video></video><p>after</p>');
154
+ expect(md).not.toContain('<video');
155
+ expect(md).toContain('before');
156
+ expect(md).toContain('after');
157
+ });
158
+ it('preserves <audio> as inline HTML', async () => {
159
+ const md = await runAndRead('<audio src="https://cdn.example.com/podcast.mp3"></audio>');
160
+ expect(md).toContain('<audio src="https://cdn.example.com/podcast.mp3" controls></audio>');
161
+ });
162
+ it('degrades <iframe> to a markdown link with title', async () => {
163
+ const md = await runAndRead('<iframe src="https://codepen.io/pen/abc" title="Live demo"></iframe>');
164
+ expect(md).toContain('[Live demo](https://codepen.io/pen/abc)');
165
+ });
166
+ it('defaults iframe title to "Embedded content" when missing', async () => {
167
+ const md = await runAndRead('<iframe src="https://example.com/embed"></iframe>');
168
+ expect(md).toContain('[Embedded content](https://example.com/embed)');
169
+ });
170
+ it('drops <iframe> with no src', async () => {
171
+ const md = await runAndRead('<p>before</p><iframe></iframe><p>after</p>');
172
+ expect(md).not.toContain('iframe');
173
+ expect(md).toContain('before');
174
+ expect(md).toContain('after');
175
+ });
176
+ });
177
+ describe('stdout mode', () => {
178
+ it('writes markdown to process.stdout and skips file write', async () => {
179
+ const tempDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-article-'));
180
+ tempDirs.push(tempDir);
181
+ const chunks = [];
182
+ const originalWrite = process.stdout.write.bind(process.stdout);
183
+ process.stdout.write = ((chunk) => {
184
+ chunks.push(typeof chunk === 'string' ? chunk : Buffer.from(chunk).toString('utf8'));
185
+ return true;
186
+ });
187
+ try {
188
+ const result = await downloadArticle({
189
+ title: 'Piped',
190
+ contentHtml: '<p>Streaming body</p>',
191
+ sourceUrl: 'https://example.com/a',
192
+ }, {
193
+ output: tempDir,
194
+ stdout: true,
195
+ });
196
+ expect(result[0].status).toBe('success');
197
+ expect(result[0].saved).toBe('-');
198
+ expect(fs.readdirSync(tempDir)).toHaveLength(0);
199
+ const emitted = chunks.join('');
200
+ expect(emitted).toContain('# Piped');
201
+ expect(emitted).toContain('Streaming body');
202
+ expect(emitted.endsWith('\n')).toBe(true);
203
+ }
204
+ finally {
205
+ process.stdout.write = originalWrite;
206
+ }
207
+ });
208
+ it('keeps remote image URLs intact in stdout mode (no download)', async () => {
209
+ const tempDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-article-'));
210
+ tempDirs.push(tempDir);
211
+ const chunks = [];
212
+ const originalWrite = process.stdout.write.bind(process.stdout);
213
+ process.stdout.write = ((chunk) => {
214
+ chunks.push(typeof chunk === 'string' ? chunk : Buffer.from(chunk).toString('utf8'));
215
+ return true;
216
+ });
217
+ try {
218
+ await downloadArticle({
219
+ title: 'WithImage',
220
+ contentHtml: '<p><img src="https://example.com/a.jpg"></p>',
221
+ imageUrls: ['https://example.com/a.jpg'],
222
+ }, {
223
+ output: tempDir,
224
+ downloadImages: true,
225
+ stdout: true,
226
+ });
227
+ expect(fs.readdirSync(tempDir)).toHaveLength(0);
228
+ expect(chunks.join('')).toContain('https://example.com/a.jpg');
229
+ }
230
+ finally {
231
+ process.stdout.write = originalWrite;
232
+ }
233
+ });
234
+ });
39
235
  });