@jackwener/opencli 1.7.5 → 1.7.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -10
- package/README.zh-CN.md +18 -9
- package/cli-manifest.json +401 -11
- package/clis/51job/company.js +125 -0
- package/clis/51job/detail.js +108 -0
- package/clis/51job/hot.js +55 -0
- package/clis/51job/search.js +79 -0
- package/clis/51job/utils.js +302 -0
- package/clis/51job/utils.test.js +69 -0
- package/clis/bilibili/video.js +68 -0
- package/clis/bilibili/video.test.js +132 -0
- package/clis/chatgpt/image.js +1 -1
- package/clis/deepseek/ask.js +37 -11
- package/clis/deepseek/ask.test.js +165 -0
- package/clis/deepseek/utils.js +192 -24
- package/clis/deepseek/utils.test.js +145 -0
- package/clis/gemini/image.js +1 -1
- package/clis/instagram/download.js +1 -1
- package/clis/jianyu/search.js +139 -3
- package/clis/jianyu/search.test.js +25 -0
- package/clis/jianyu/shared/procurement-detail.js +15 -0
- package/clis/jianyu/shared/procurement-detail.test.js +12 -0
- package/clis/twitter/likes.js +3 -2
- package/clis/twitter/search.js +4 -2
- package/clis/twitter/search.test.js +4 -0
- package/clis/twitter/shared.js +35 -2
- package/clis/twitter/shared.test.js +96 -0
- package/clis/twitter/thread.js +3 -1
- package/clis/twitter/timeline.js +3 -2
- package/clis/twitter/tweets.js +219 -0
- package/clis/twitter/tweets.test.js +125 -0
- package/clis/web/read.js +25 -5
- package/clis/web/read.test.js +76 -0
- package/clis/weread/ai-outline.js +170 -0
- package/clis/weread/ai-outline.test.js +83 -0
- package/clis/weread/book.js +57 -44
- package/clis/weread/commands.test.js +24 -0
- package/clis/xiaoyuzhou/podcast-episodes.js +2 -2
- package/clis/xiaoyuzhou/podcast-episodes.test.js +78 -0
- package/clis/youtube/channel.js +35 -0
- package/dist/src/browser/analyze.d.ts +103 -0
- package/dist/src/browser/analyze.js +230 -0
- package/dist/src/browser/analyze.test.d.ts +1 -0
- package/dist/src/browser/analyze.test.js +164 -0
- package/dist/src/browser/article-extract.d.ts +57 -0
- package/dist/src/browser/article-extract.e2e.test.d.ts +1 -0
- package/dist/src/browser/article-extract.e2e.test.js +105 -0
- package/dist/src/browser/article-extract.js +169 -0
- package/dist/src/browser/article-extract.test.d.ts +1 -0
- package/dist/src/browser/article-extract.test.js +94 -0
- package/dist/src/browser/base-page.d.ts +13 -3
- package/dist/src/browser/base-page.js +35 -25
- package/dist/src/browser/cdp.d.ts +1 -0
- package/dist/src/browser/cdp.js +23 -5
- package/dist/src/browser/compound.d.ts +59 -0
- package/dist/src/browser/compound.js +112 -0
- package/dist/src/browser/compound.test.d.ts +1 -0
- package/dist/src/browser/compound.test.js +175 -0
- package/dist/src/browser/dom-snapshot.d.ts +7 -0
- package/dist/src/browser/dom-snapshot.js +76 -3
- package/dist/src/browser/dom-snapshot.test.js +65 -0
- package/dist/src/browser/extract.d.ts +69 -0
- package/dist/src/browser/extract.js +132 -0
- package/dist/src/browser/extract.test.d.ts +1 -0
- package/dist/src/browser/extract.test.js +129 -0
- package/dist/src/browser/find.d.ts +76 -0
- package/dist/src/browser/find.js +179 -0
- package/dist/src/browser/find.test.d.ts +1 -0
- package/dist/src/browser/find.test.js +120 -0
- package/dist/src/browser/html-tree.d.ts +75 -0
- package/dist/src/browser/html-tree.js +112 -0
- package/dist/src/browser/html-tree.test.d.ts +1 -0
- package/dist/src/browser/html-tree.test.js +181 -0
- package/dist/src/browser/network-cache.d.ts +48 -0
- package/dist/src/browser/network-cache.js +66 -0
- package/dist/src/browser/network-cache.test.d.ts +1 -0
- package/dist/src/browser/network-cache.test.js +58 -0
- package/dist/src/browser/network-key.d.ts +22 -0
- package/dist/src/browser/network-key.js +66 -0
- package/dist/src/browser/network-key.test.d.ts +1 -0
- package/dist/src/browser/network-key.test.js +49 -0
- package/dist/src/browser/shape-filter.d.ts +52 -0
- package/dist/src/browser/shape-filter.js +101 -0
- package/dist/src/browser/shape-filter.test.d.ts +1 -0
- package/dist/src/browser/shape-filter.test.js +101 -0
- package/dist/src/browser/shape.d.ts +23 -0
- package/dist/src/browser/shape.js +95 -0
- package/dist/src/browser/shape.test.d.ts +1 -0
- package/dist/src/browser/shape.test.js +82 -0
- package/dist/src/browser/target-errors.d.ts +14 -1
- package/dist/src/browser/target-errors.js +13 -0
- package/dist/src/browser/target-errors.test.js +39 -6
- package/dist/src/browser/target-resolver.d.ts +57 -10
- package/dist/src/browser/target-resolver.js +195 -75
- package/dist/src/browser/target-resolver.test.js +80 -5
- package/dist/src/browser/verify-fixture.d.ts +59 -0
- package/dist/src/browser/verify-fixture.js +213 -0
- package/dist/src/browser/verify-fixture.test.d.ts +1 -0
- package/dist/src/browser/verify-fixture.test.js +161 -0
- package/dist/src/cli.d.ts +32 -0
- package/dist/src/cli.js +936 -141
- package/dist/src/cli.test.js +1051 -1
- package/dist/src/daemon.d.ts +3 -2
- package/dist/src/daemon.js +16 -4
- package/dist/src/daemon.test.d.ts +1 -0
- package/dist/src/daemon.test.js +19 -0
- package/dist/src/download/article-download.d.ts +12 -0
- package/dist/src/download/article-download.js +141 -17
- package/dist/src/download/article-download.test.js +196 -0
- package/dist/src/download/index.js +73 -86
- package/dist/src/errors.js +4 -2
- package/dist/src/errors.test.js +13 -0
- package/dist/src/execution.js +7 -2
- package/dist/src/execution.test.js +54 -0
- package/dist/src/launcher.d.ts +1 -1
- package/dist/src/launcher.js +3 -3
- package/dist/src/main.js +16 -0
- package/dist/src/output.js +1 -1
- package/dist/src/output.test.js +6 -0
- package/dist/src/types.d.ts +18 -3
- package/package.json +5 -1
package/dist/src/daemon.d.ts
CHANGED
|
@@ -9,7 +9,8 @@
|
|
|
9
9
|
* 1. Origin check — reject HTTP/WS from non chrome-extension:// origins
|
|
10
10
|
* 2. Custom header — require X-OpenCLI header (browsers can't send it
|
|
11
11
|
* without CORS preflight, which we deny)
|
|
12
|
-
* 3. No CORS headers —
|
|
12
|
+
* 3. No CORS headers on command endpoints — only /ping is readable from the
|
|
13
|
+
* Browser Bridge extension origin so the extension can probe daemon reachability
|
|
13
14
|
* 4. Body size limit — 1 MB max to prevent OOM
|
|
14
15
|
* 5. WebSocket verifyClient — reject upgrade before connection is established
|
|
15
16
|
*
|
|
@@ -18,4 +19,4 @@
|
|
|
18
19
|
* - Persistent — stays alive until explicit shutdown, SIGTERM, or uninstall
|
|
19
20
|
* - Listens on localhost:19825
|
|
20
21
|
*/
|
|
21
|
-
export
|
|
22
|
+
export declare function getResponseCorsHeaders(pathname: string, origin?: string): Record<string, string> | undefined;
|
package/dist/src/daemon.js
CHANGED
|
@@ -9,7 +9,8 @@
|
|
|
9
9
|
* 1. Origin check — reject HTTP/WS from non chrome-extension:// origins
|
|
10
10
|
* 2. Custom header — require X-OpenCLI header (browsers can't send it
|
|
11
11
|
* without CORS preflight, which we deny)
|
|
12
|
-
* 3. No CORS headers —
|
|
12
|
+
* 3. No CORS headers on command endpoints — only /ping is readable from the
|
|
13
|
+
* Browser Bridge extension origin so the extension can probe daemon reachability
|
|
13
14
|
* 4. Body size limit — 1 MB max to prevent OOM
|
|
14
15
|
* 5. WebSocket verifyClient — reject upgrade before connection is established
|
|
15
16
|
*
|
|
@@ -60,10 +61,20 @@ function readBody(req) {
|
|
|
60
61
|
reject(err); });
|
|
61
62
|
});
|
|
62
63
|
}
|
|
63
|
-
function jsonResponse(res, status, data) {
|
|
64
|
-
res.writeHead(status, { 'Content-Type': 'application/json' });
|
|
64
|
+
/**
 * Write a JSON response: status line and headers first, then the serialized body.
 *
 * @param {object} res - Response object exposing `writeHead` and `end`.
 * @param {number} status - HTTP status code passed to `writeHead`.
 * @param {*} data - Value serialized with `JSON.stringify` and sent as the body.
 * @param {Record<string, string>} [extraHeaders] - Optional headers merged after
 *   `Content-Type`, so a same-named key here takes precedence.
 */
function jsonResponse(res, status, data, extraHeaders) {
    const headers = Object.assign({ 'Content-Type': 'application/json' }, extraHeaders);
    res.writeHead(status, headers);
    const body = JSON.stringify(data);
    res.end(body);
}
|
|
68
|
+
/**
 * Decide which CORS headers, if any, accompany a response.
 *
 * Headers are returned only when the request path is exactly `/ping` and the
 * Origin value starts with `chrome-extension://`. In that case the origin is
 * echoed back in `Access-Control-Allow-Origin` together with `Vary: Origin`.
 * Any other path, a missing/empty origin, or a different scheme yields
 * `undefined`.
 *
 * @param {string} pathname - Request path.
 * @param {string} [origin] - Value of the request's Origin header, if present.
 * @returns {Record<string, string> | undefined} Headers to add, or `undefined`.
 */
export function getResponseCorsHeaders(pathname, origin) {
    if (pathname === '/ping' && origin && origin.startsWith('chrome-extension://')) {
        return { 'Access-Control-Allow-Origin': origin, Vary: 'Origin' };
    }
    return undefined;
}
|
|
67
78
|
async function handleRequest(req, res) {
|
|
68
79
|
// ─── Security: Origin & custom-header check ──────────────────────
|
|
69
80
|
// Block browser-based CSRF: browsers always send an Origin header on
|
|
@@ -93,7 +104,7 @@ async function handleRequest(req, res) {
|
|
|
93
104
|
// Timing side-channels can reveal daemon presence to local processes, which
|
|
94
105
|
// is an accepted risk given the daemon is loopback-only and short-lived.
|
|
95
106
|
if (req.method === 'GET' && pathname === '/ping') {
|
|
96
|
-
jsonResponse(res, 200, { ok: true });
|
|
107
|
+
jsonResponse(res, 200, { ok: true }, getResponseCorsHeaders(pathname, origin));
|
|
97
108
|
return;
|
|
98
109
|
}
|
|
99
110
|
// Require custom header on all other HTTP requests. Browsers cannot attach
|
|
@@ -272,6 +283,7 @@ wss.on('connection', (ws) => {
|
|
|
272
283
|
if (extensionWs === ws) {
|
|
273
284
|
extensionWs = null;
|
|
274
285
|
extensionVersion = null;
|
|
286
|
+
extensionCompatRange = null;
|
|
275
287
|
// Reject pending requests in case 'close' does not follow this 'error'
|
|
276
288
|
for (const [, p] of pending) {
|
|
277
289
|
clearTimeout(p.timer);
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import { describe, expect, it } from 'vitest';
|
|
2
|
+
import { getResponseCorsHeaders } from './daemon.js';
|
|
3
|
+
describe('getResponseCorsHeaders', () => {
|
|
4
|
+
it('allows the Browser Bridge extension origin to read /ping', () => {
|
|
5
|
+
expect(getResponseCorsHeaders('/ping', 'chrome-extension://abc123')).toEqual({
|
|
6
|
+
'Access-Control-Allow-Origin': 'chrome-extension://abc123',
|
|
7
|
+
Vary: 'Origin',
|
|
8
|
+
});
|
|
9
|
+
});
|
|
10
|
+
it('does not add CORS headers for ordinary web origins', () => {
|
|
11
|
+
expect(getResponseCorsHeaders('/ping', 'https://example.com')).toBeUndefined();
|
|
12
|
+
});
|
|
13
|
+
it('does not add CORS headers when origin is absent', () => {
|
|
14
|
+
expect(getResponseCorsHeaders('/ping')).toBeUndefined();
|
|
15
|
+
});
|
|
16
|
+
it('does not add CORS headers for command endpoints even from the extension origin', () => {
|
|
17
|
+
expect(getResponseCorsHeaders('/command', 'chrome-extension://abc123')).toBeUndefined();
|
|
18
|
+
});
|
|
19
|
+
});
|
|
@@ -37,6 +37,18 @@ export interface ArticleDownloadOptions {
|
|
|
37
37
|
detectImageExt?: (url: string) => string;
|
|
38
38
|
/** Custom frontmatter labels (default: Chinese labels) */
|
|
39
39
|
frontmatterLabels?: FrontmatterLabels;
|
|
40
|
+
/**
|
|
41
|
+
* Extra CSS selectors removed from the article before Turndown conversion.
|
|
42
|
+
* Use this to drop site-specific noise the adapter can't always trim upstream
|
|
43
|
+
* (e.g. zhihu 折叠卡, weixin 赞赏栏, wiki infobox).
|
|
44
|
+
*/
|
|
45
|
+
cleanSelectors?: string[];
|
|
46
|
+
/**
|
|
47
|
+
* Write the markdown to `process.stdout` instead of a file on disk. Image
|
|
48
|
+
* download and directory creation are skipped — remote image URLs are kept
|
|
49
|
+
* as-is so the output is self-contained when piped.
|
|
50
|
+
*/
|
|
51
|
+
stdout?: boolean;
|
|
40
52
|
}
|
|
41
53
|
export interface ArticleDownloadResult {
|
|
42
54
|
title: string;
|
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
import * as fs from 'node:fs';
|
|
9
9
|
import * as path from 'node:path';
|
|
10
10
|
import TurndownService from 'turndown';
|
|
11
|
+
import { gfm } from 'turndown-plugin-gfm';
|
|
11
12
|
import { httpDownload, sanitizeFilename } from './index.js';
|
|
12
13
|
import { formatBytes } from './progress.js';
|
|
13
14
|
const IMAGE_CONCURRENCY = 5;
|
|
@@ -19,22 +20,127 @@ const DEFAULT_LABELS = {
|
|
|
19
20
|
// ============================================================
|
|
20
21
|
// Markdown Conversion
|
|
21
22
|
// ============================================================
|
|
22
|
-
|
|
23
|
+
// Nodes that never carry article content. Turndown keeps them by default — if an
|
|
24
|
+
// adapter's contentHtml extraction misses one, CSS / scripts / widget markup
|
|
25
|
+
// ends up inline in the .md. Strip them unconditionally at the converter level.
|
|
26
|
+
// `svg` is not in HTMLElementTagNameMap, so we type-narrow manually.
|
|
27
|
+
// `header/footer/nav/aside` cover page chrome that adapters occasionally
|
|
28
|
+
// forget to trim — the article's own title/author/publishTime are supplied
|
|
29
|
+
// as separate fields on ArticleData, so duplicated nodes are redundant.
|
|
30
|
+
// `iframe` is NOT in this set — it's handled by a dedicated rule below that
|
|
31
|
+
// degrades to a link so embedded content (YouTube, Twitter, CodePen …) keeps
|
|
32
|
+
// a reachable URL in the exported markdown.
|
|
33
|
+
// Tag names removed outright during conversion, grouped by the kind of noise they carry.
const STRIPPED_TAGS = [
    // Non-content payloads
    'script',
    'style',
    'noscript',
    // Drawing surface
    'canvas',
    // Interactive widgets
    'form',
    'button',
    'dialog',
    // Page chrome
    'header',
    'footer',
    'nav',
    'aside',
];
|
|
39
|
+
/** Escape `value` so it can sit inside a double-quoted HTML attribute. */
function escapeHtmlAttribute(value) {
    return String(value)
        .replace(/&/g, '&amp;')
        .replace(/"/g, '&quot;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;');
}

/** Backslash-escape brackets and flatten whitespace so a markdown link label cannot end early. */
function escapeMarkdownLinkText(text) {
    return String(text)
        .replace(/[\\\[\]]/g, '\\$&')
        .replace(/\s+/g, ' ')
        .trim();
}

/** Percent-encode the ASCII characters that would terminate a markdown link destination. */
function encodeMarkdownLinkDestination(url) {
    return String(url)
        .trim()
        .replace(/[ \t\r\n()<>]/g, (ch) => `%${ch.charCodeAt(0).toString(16).toUpperCase().padStart(2, '0')}`);
}

/** Read a media element's URL from its own `src`, else from its first `<source>` child. */
function resolveMediaSrc(el) {
    return el.getAttribute('src')
        || el.querySelector('source')?.getAttribute('src')
        || '';
}

/**
 * Build the TurndownService used to convert article HTML to markdown.
 *
 * The instance uses ATX headings, fenced code blocks and `-` bullets, loads the
 * GFM plugin, removes every tag listed in STRIPPED_TAGS, and registers rules
 * for strikethrough, SVG, `<br>`, data-URI images, video, audio and iframes.
 *
 * Attribute values read from the article (`src`, `poster`, `title`) come from
 * untrusted page content, so they are escaped before being written into the
 * generated HTML / markdown.
 *
 * @param {(td: TurndownService) => void} [configure] - Optional hook invoked
 *   with the instance after all built-in rules are registered.
 * @param {string[]} [cleanSelectors] - Optional CSS selectors; nodes matching
 *   any of them are dropped. Entries are trimmed and empty ones ignored.
 * @returns {TurndownService} The configured converter.
 */
function createTurndown(configure, cleanSelectors) {
    const td = new TurndownService({
        headingStyle: 'atx',
        codeBlockStyle: 'fenced',
        bulletListMarker: '-',
    });
    td.use(gfm);
    td.remove(STRIPPED_TAGS);
    // turndown-plugin-gfm@1.0.2 emits single-tilde strikethrough (`~x~`), which
    // is not the canonical GFM form. Override it so exported markdown is
    // portable across common renderers.
    td.addRule('canonicalStrikethrough', {
        filter: (node) => ['DEL', 'S', 'STRIKE'].includes(node.nodeName),
        replacement: (content) => `~~${content}~~`,
    });
    // SVG isn't in the static HTML tag map; match by name (either case) with a
    // custom filter.
    td.addRule('stripSvg', {
        filter: (node) => node.nodeName.toLowerCase() === 'svg',
        replacement: () => '',
    });
    td.addRule('linebreak', {
        filter: 'br',
        replacement: () => '\n',
    });
    // Inline base64 images would land as huge `![](data:image/...;base64,...)`
    // strings that the image downloader can't localize. Drop them. The scheme
    // is matched case-insensitively and with leading whitespace tolerated.
    td.addRule('ignoreBase64Images', {
        filter: (node) => {
            if (node.nodeName !== 'IMG')
                return false;
            const src = node.getAttribute?.('src') ?? '';
            return /^\s*data:/i.test(src);
        },
        replacement: () => '',
    });
    // Markdown has no native video/audio primitive. Emit inline HTML so
    // renderers that support it (GitHub, VS Code preview ...) still play the
    // media; viewers that don't simply show the tag as text, which is still
    // more information than dropping the node outright.
    td.addRule('videoElement', {
        filter: (node) => node.nodeName === 'VIDEO',
        replacement: (_content, node) => {
            const src = resolveMediaSrc(node);
            if (!src)
                return '';
            const poster = node.getAttribute('poster') || '';
            const posterAttr = poster ? ` poster="${escapeHtmlAttribute(poster)}"` : '';
            return `\n<video src="${escapeHtmlAttribute(src)}" controls${posterAttr}></video>\n`;
        },
    });
    td.addRule('audioElement', {
        filter: (node) => node.nodeName === 'AUDIO',
        replacement: (_content, node) => {
            const src = resolveMediaSrc(node);
            return src ? `\n<audio src="${escapeHtmlAttribute(src)}" controls></audio>\n` : '';
        },
    });
    // Iframes (YouTube, Twitter, CodePen ...) degrade to a markdown link so the
    // embedded resource is still reachable from the exported file.
    td.addRule('iframeToLink', {
        filter: (node) => node.nodeName === 'IFRAME',
        replacement: (_content, node) => {
            const src = encodeMarkdownLinkDestination(node.getAttribute('src') || '');
            if (!src)
                return '';
            const title = escapeMarkdownLinkText(node.getAttribute('title') || '') || 'Embedded content';
            return `\n[${title}](${src})\n`;
        },
    });
    // Per-adapter dirty-node removal. Adapters know their site's specific noise
    // (zhihu collapsed cards, weixin reward bars, wiki collapsed infoboxes ...);
    // the default set is empty so the generic converter stays untouched.
    const selectorRules = (cleanSelectors ?? [])
        .map((sel) => sel.trim())
        .filter(Boolean);
    if (selectorRules.length > 0) {
        td.addRule('cleanSelectors', {
            filter: (node) => {
                const match = node.matches;
                if (typeof match !== 'function')
                    return false;
                return selectorRules.some((sel) => {
                    try {
                        return match.call(node, sel);
                    }
                    catch {
                        // Deliberate best-effort: an invalid selector is
                        // treated as "no match" so the remaining selectors
                        // keep working.
                        return false;
                    }
                });
            },
            replacement: () => '',
        });
    }
    if (configure)
        configure(td);
    return td;
}
|
|
36
|
-
function convertToMarkdown(contentHtml, codeBlocks, configure) {
|
|
37
|
-
const td = createTurndown(configure);
|
|
142
|
+
function convertToMarkdown(contentHtml, codeBlocks, configure, cleanSelectors) {
|
|
143
|
+
const td = createTurndown(configure, cleanSelectors);
|
|
38
144
|
let md = td.turndown(contentHtml);
|
|
39
145
|
// Restore code block placeholders
|
|
40
146
|
codeBlocks.forEach((block, i) => {
|
|
@@ -44,8 +150,12 @@ function convertToMarkdown(contentHtml, codeBlocks, configure) {
|
|
|
44
150
|
});
|
|
45
151
|
// Clean up
|
|
46
152
|
md = md.replace(/\u00a0/g, ' ');
|
|
47
|
-
|
|
153
|
+
// Turndown leaves behind lone dashes / middle dots when list bullets or
|
|
154
|
+
// decorative separators lose their surrounding inline context.
|
|
155
|
+
md = md.replace(/^[ \t]*[-·][ \t]*$/gm, '');
|
|
156
|
+
md = md.replace(/^[ \t]+$/gm, '');
|
|
48
157
|
md = md.replace(/[ \t]+$/gm, '');
|
|
158
|
+
md = md.replace(/\n{3,}/g, '\n\n');
|
|
49
159
|
return md;
|
|
50
160
|
}
|
|
51
161
|
function replaceImageUrls(md, urlMap) {
|
|
@@ -120,7 +230,7 @@ async function downloadImages(imgUrls, imgDir, headers, detectExt) {
|
|
|
120
230
|
* 6. File write
|
|
121
231
|
*/
|
|
122
232
|
export async function downloadArticle(data, options) {
|
|
123
|
-
const { output, downloadImages: shouldDownloadImages = true, imageHeaders, maxTitleLength = 80, configureTurndown, detectImageExt, frontmatterLabels, } = options;
|
|
233
|
+
const { output, downloadImages: shouldDownloadImages = true, imageHeaders, maxTitleLength = 80, configureTurndown, detectImageExt, frontmatterLabels, cleanSelectors, stdout = false, } = options;
|
|
124
234
|
const labels = { ...DEFAULT_LABELS, ...frontmatterLabels };
|
|
125
235
|
if (!data.title) {
|
|
126
236
|
return [{
|
|
@@ -143,33 +253,47 @@ export async function downloadArticle(data, options) {
|
|
|
143
253
|
}];
|
|
144
254
|
}
|
|
145
255
|
// Convert HTML to Markdown
|
|
146
|
-
let markdown = convertToMarkdown(data.contentHtml, data.codeBlocks || [], configureTurndown);
|
|
147
|
-
// Prepare output directory
|
|
256
|
+
let markdown = convertToMarkdown(data.contentHtml, data.codeBlocks || [], configureTurndown, cleanSelectors);
|
|
148
257
|
const safeTitle = sanitizeFilename(data.title, maxTitleLength);
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
258
|
+
// Download images only when writing to disk. In stdout mode remote URLs
|
|
259
|
+
// stay intact so the piped output is self-contained.
|
|
260
|
+
if (!stdout && shouldDownloadImages && data.imageUrls && data.imageUrls.length > 0) {
|
|
261
|
+
const articleDir = path.join(output, safeTitle);
|
|
262
|
+
fs.mkdirSync(articleDir, { recursive: true });
|
|
153
263
|
const imagesDir = path.join(articleDir, 'images');
|
|
154
264
|
fs.mkdirSync(imagesDir, { recursive: true });
|
|
155
265
|
const urlMap = await downloadImages(data.imageUrls, imagesDir, imageHeaders, detectImageExt);
|
|
156
266
|
markdown = replaceImageUrls(markdown, urlMap);
|
|
157
267
|
}
|
|
158
|
-
// Build frontmatter with customizable labels
|
|
159
|
-
|
|
268
|
+
// Build frontmatter with customizable labels.
|
|
269
|
+
// Shape: `# Title\n[> meta\n...]\n---\n\n<markdown>` — exactly one blank
|
|
270
|
+
// line separates every section, so we never produce ≥3 consecutive newlines.
|
|
271
|
+
const headerLines = [`# ${data.title}`];
|
|
160
272
|
if (data.author)
|
|
161
273
|
headerLines.push(`> ${labels.author}: ${data.author}`);
|
|
162
274
|
if (data.publishTime)
|
|
163
275
|
headerLines.push(`> ${labels.publishTime}: ${data.publishTime}`);
|
|
164
276
|
if (data.sourceUrl)
|
|
165
277
|
headerLines.push(`> ${labels.sourceUrl}: ${data.sourceUrl}`);
|
|
166
|
-
headerLines.
|
|
167
|
-
const fullContent =
|
|
168
|
-
|
|
278
|
+
const frontmatter = headerLines.join('\n') + '\n\n---\n\n';
|
|
279
|
+
const fullContent = frontmatter + markdown;
|
|
280
|
+
const size = Buffer.byteLength(fullContent, 'utf-8');
|
|
281
|
+
if (stdout) {
|
|
282
|
+
process.stdout.write(fullContent.endsWith('\n') ? fullContent : fullContent + '\n');
|
|
283
|
+
return [{
|
|
284
|
+
title: data.title,
|
|
285
|
+
author: data.author || '-',
|
|
286
|
+
publish_time: data.publishTime || '-',
|
|
287
|
+
status: 'success',
|
|
288
|
+
size: formatBytes(size),
|
|
289
|
+
saved: '-',
|
|
290
|
+
}];
|
|
291
|
+
}
|
|
292
|
+
const articleDir = path.join(output, safeTitle);
|
|
293
|
+
fs.mkdirSync(articleDir, { recursive: true });
|
|
169
294
|
const filename = `${safeTitle}.md`;
|
|
170
295
|
const filePath = path.join(articleDir, filename);
|
|
171
296
|
fs.writeFileSync(filePath, fullContent, 'utf-8');
|
|
172
|
-
const size = Buffer.byteLength(fullContent, 'utf-8');
|
|
173
297
|
return [{
|
|
174
298
|
title: data.title,
|
|
175
299
|
author: data.author || '-',
|
|
@@ -15,6 +15,20 @@ afterEach(() => {
|
|
|
15
15
|
}
|
|
16
16
|
tempDirs.length = 0;
|
|
17
17
|
});
|
|
18
|
+
/**
 * Test helper: run `downloadArticle` on `contentHtml` in a fresh temp directory
 * and return the text of the markdown file it reports as saved.
 *
 * The temp directory is recorded in `tempDirs`. Image downloading is disabled,
 * and `cleanSelectors` is forwarded only when the caller supplies a truthy value.
 *
 * @param {string} contentHtml - Article body HTML to convert.
 * @param {{ cleanSelectors?: string[] }} [opts] - Optional overrides.
 * @returns {Promise<string>} Contents of the written markdown file.
 */
async function runAndRead(contentHtml, opts = {}) {
    const prefix = path.join(os.tmpdir(), 'opencli-article-');
    const outDir = await fs.promises.mkdtemp(prefix);
    tempDirs.push(outDir);
    const article = { title: 'Test Article', contentHtml };
    const options = { output: outDir, downloadImages: false };
    if (opts.cleanSelectors) {
        options.cleanSelectors = opts.cleanSelectors;
    }
    const [entry] = await downloadArticle(article, options);
    expect(entry.status).toBe('success');
    return fs.readFileSync(entry.saved, 'utf8');
}
|
|
18
32
|
describe('downloadArticle', () => {
|
|
19
33
|
it('returns the saved markdown file path on success', async () => {
|
|
20
34
|
const tempDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-article-'));
|
|
@@ -36,4 +50,186 @@ describe('downloadArticle', () => {
|
|
|
36
50
|
expect(fs.existsSync(result[0].saved)).toBe(true);
|
|
37
51
|
expect(fs.readFileSync(result[0].saved, 'utf8')).toContain('Hello world');
|
|
38
52
|
});
|
|
53
|
+
describe('markdown pipeline', () => {
|
|
54
|
+
it('converts GFM tables', async () => {
|
|
55
|
+
const md = await runAndRead('<table><thead><tr><th>a</th><th>b</th></tr></thead>' +
|
|
56
|
+
'<tbody><tr><td>1</td><td>2</td></tr></tbody></table>');
|
|
57
|
+
expect(md).toMatch(/\|\s*a\s*\|\s*b\s*\|/);
|
|
58
|
+
expect(md).toMatch(/\|\s*---\s*\|\s*---\s*\|/);
|
|
59
|
+
expect(md).toMatch(/\|\s*1\s*\|\s*2\s*\|/);
|
|
60
|
+
});
|
|
61
|
+
it('converts strikethrough and task lists', async () => {
|
|
62
|
+
const md = await runAndRead('<p><del>gone</del></p>' +
|
|
63
|
+
'<ul><li><input type="checkbox" checked>done</li><li><input type="checkbox">todo</li></ul>');
|
|
64
|
+
expect(md).toContain('~~gone~~');
|
|
65
|
+
expect(md).toContain('[x] done');
|
|
66
|
+
expect(md).toContain('[ ] todo');
|
|
67
|
+
});
|
|
68
|
+
it('strips script / style / noscript / form but keeps iframe as a link', async () => {
|
|
69
|
+
const md = await runAndRead('<p>keep</p>' +
|
|
70
|
+
'<script>alert(1)</script>' +
|
|
71
|
+
'<style>.x{color:red}</style>' +
|
|
72
|
+
'<noscript>nojs</noscript>' +
|
|
73
|
+
'<iframe src="https://www.youtube.com/embed/abc" title="Demo video"></iframe>' +
|
|
74
|
+
'<form><button>click</button></form>');
|
|
75
|
+
expect(md).toContain('keep');
|
|
76
|
+
expect(md).not.toContain('alert');
|
|
77
|
+
expect(md).not.toContain('color:red');
|
|
78
|
+
expect(md).not.toContain('nojs');
|
|
79
|
+
expect(md).not.toContain('click');
|
|
80
|
+
// Iframe degrades to a link preserving the embedded URL.
|
|
81
|
+
expect(md).toContain('[Demo video](https://www.youtube.com/embed/abc)');
|
|
82
|
+
});
|
|
83
|
+
it('strips SVG nodes entirely', async () => {
|
|
84
|
+
const md = await runAndRead('<p>before</p><svg><circle cx="5" cy="5" r="4"/></svg><p>after</p>');
|
|
85
|
+
expect(md).toContain('before');
|
|
86
|
+
expect(md).toContain('after');
|
|
87
|
+
expect(md).not.toContain('svg');
|
|
88
|
+
expect(md).not.toContain('circle');
|
|
89
|
+
});
|
|
90
|
+
it('drops base64 data URI images but keeps regular images', async () => {
    const md = await runAndRead('<p><img alt="inline" src="data:image/png;base64,iVBORw0KGgo="></p>' +
        '<p><img alt="keep" src="https://example.com/a.jpg"></p>');
    expect(md).not.toContain('data:image');
    // Assert on the full markdown image syntax; an empty-string needle would
    // pass for any output and never prove the regular image survived.
    expect(md).toContain('![keep](https://example.com/a.jpg)');
});
|
|
96
|
+
it('collapses 3+ blank lines and strips lone bullet / middle-dot residue', async () => {
|
|
97
|
+
const md = await runAndRead('<p>top</p>' +
|
|
98
|
+
'<p>-</p>' +
|
|
99
|
+
'<p>·</p>' +
|
|
100
|
+
'<p>bottom</p>');
|
|
101
|
+
expect(md).not.toMatch(/\n{3,}/);
|
|
102
|
+
expect(md).not.toMatch(/^\s*-\s*$/m);
|
|
103
|
+
expect(md).not.toMatch(/^\s*·\s*$/m);
|
|
104
|
+
expect(md).toContain('top');
|
|
105
|
+
expect(md).toContain('bottom');
|
|
106
|
+
});
|
|
107
|
+
it('strips page chrome (header / footer / nav / aside)', async () => {
|
|
108
|
+
const md = await runAndRead('<header><p>page-header-text</p></header>' +
|
|
109
|
+
'<nav><a href="/">home-link</a></nav>' +
|
|
110
|
+
'<p>article-body</p>' +
|
|
111
|
+
'<aside><p>sidebar-text</p></aside>' +
|
|
112
|
+
'<footer><p>page-footer-text</p></footer>');
|
|
113
|
+
expect(md).toContain('article-body');
|
|
114
|
+
expect(md).not.toContain('page-header-text');
|
|
115
|
+
expect(md).not.toContain('home-link');
|
|
116
|
+
expect(md).not.toContain('sidebar-text');
|
|
117
|
+
expect(md).not.toContain('page-footer-text');
|
|
118
|
+
});
|
|
119
|
+
it('cleanSelectors removes matching nodes before conversion', async () => {
|
|
120
|
+
const md = await runAndRead('<p>keep-me</p>' +
|
|
121
|
+
'<div class="vote-card">折叠卡</div>' +
|
|
122
|
+
'<section class="reward-panel">赞赏栏</section>' +
|
|
123
|
+
'<p>also-keep</p>', { cleanSelectors: ['.vote-card', '.reward-panel'] });
|
|
124
|
+
expect(md).toContain('keep-me');
|
|
125
|
+
expect(md).toContain('also-keep');
|
|
126
|
+
expect(md).not.toContain('折叠卡');
|
|
127
|
+
expect(md).not.toContain('赞赏栏');
|
|
128
|
+
});
|
|
129
|
+
it('cleanSelectors silently ignores invalid selectors', async () => {
|
|
130
|
+
const md = await runAndRead('<p>survives</p><div class="x">and-this-too</div>', { cleanSelectors: ['!!!not-a-valid-selector', '.missing'] });
|
|
131
|
+
expect(md).toContain('survives');
|
|
132
|
+
expect(md).toContain('and-this-too');
|
|
133
|
+
});
|
|
134
|
+
it('cleanSelectors keeps valid selectors active when one selector is invalid', async () => {
|
|
135
|
+
const md = await runAndRead('<p>keep</p><div class="vote-card">strip-me</div><p>also-keep</p>', { cleanSelectors: ['!!!not-a-valid-selector', '.vote-card'] });
|
|
136
|
+
expect(md).toContain('keep');
|
|
137
|
+
expect(md).toContain('also-keep');
|
|
138
|
+
expect(md).not.toContain('strip-me');
|
|
139
|
+
});
|
|
140
|
+
it('preserves <video> as inline HTML with src + poster', async () => {
|
|
141
|
+
const md = await runAndRead('<p>before</p>' +
|
|
142
|
+
'<video src="https://cdn.example.com/clip.mp4" poster="https://cdn.example.com/poster.jpg"></video>' +
|
|
143
|
+
'<p>after</p>');
|
|
144
|
+
expect(md).toContain('<video src="https://cdn.example.com/clip.mp4" controls poster="https://cdn.example.com/poster.jpg"></video>');
|
|
145
|
+
expect(md).toContain('before');
|
|
146
|
+
expect(md).toContain('after');
|
|
147
|
+
});
|
|
148
|
+
it('falls back to <source> inside <video> when src attribute is absent', async () => {
|
|
149
|
+
const md = await runAndRead('<video><source src="https://cdn.example.com/clip.mp4" type="video/mp4"></video>');
|
|
150
|
+
expect(md).toContain('<video src="https://cdn.example.com/clip.mp4" controls></video>');
|
|
151
|
+
});
|
|
152
|
+
it('drops <video> with no src and no <source>', async () => {
|
|
153
|
+
const md = await runAndRead('<p>before</p><video></video><p>after</p>');
|
|
154
|
+
expect(md).not.toContain('<video');
|
|
155
|
+
expect(md).toContain('before');
|
|
156
|
+
expect(md).toContain('after');
|
|
157
|
+
});
|
|
158
|
+
it('preserves <audio> as inline HTML', async () => {
|
|
159
|
+
const md = await runAndRead('<audio src="https://cdn.example.com/podcast.mp3"></audio>');
|
|
160
|
+
expect(md).toContain('<audio src="https://cdn.example.com/podcast.mp3" controls></audio>');
|
|
161
|
+
});
|
|
162
|
+
it('degrades <iframe> to a markdown link with title', async () => {
|
|
163
|
+
const md = await runAndRead('<iframe src="https://codepen.io/pen/abc" title="Live demo"></iframe>');
|
|
164
|
+
expect(md).toContain('[Live demo](https://codepen.io/pen/abc)');
|
|
165
|
+
});
|
|
166
|
+
it('defaults iframe title to "Embedded content" when missing', async () => {
|
|
167
|
+
const md = await runAndRead('<iframe src="https://example.com/embed"></iframe>');
|
|
168
|
+
expect(md).toContain('[Embedded content](https://example.com/embed)');
|
|
169
|
+
});
|
|
170
|
+
it('drops <iframe> with no src', async () => {
|
|
171
|
+
const md = await runAndRead('<p>before</p><iframe></iframe><p>after</p>');
|
|
172
|
+
expect(md).not.toContain('iframe');
|
|
173
|
+
expect(md).toContain('before');
|
|
174
|
+
expect(md).toContain('after');
|
|
175
|
+
});
|
|
176
|
+
});
|
|
177
|
+
describe('stdout mode', () => {
|
|
178
|
+
it('writes markdown to process.stdout and skips file write', async () => {
|
|
179
|
+
const tempDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-article-'));
|
|
180
|
+
tempDirs.push(tempDir);
|
|
181
|
+
const chunks = [];
|
|
182
|
+
const originalWrite = process.stdout.write.bind(process.stdout);
|
|
183
|
+
process.stdout.write = ((chunk) => {
|
|
184
|
+
chunks.push(typeof chunk === 'string' ? chunk : Buffer.from(chunk).toString('utf8'));
|
|
185
|
+
return true;
|
|
186
|
+
});
|
|
187
|
+
try {
|
|
188
|
+
const result = await downloadArticle({
|
|
189
|
+
title: 'Piped',
|
|
190
|
+
contentHtml: '<p>Streaming body</p>',
|
|
191
|
+
sourceUrl: 'https://example.com/a',
|
|
192
|
+
}, {
|
|
193
|
+
output: tempDir,
|
|
194
|
+
stdout: true,
|
|
195
|
+
});
|
|
196
|
+
expect(result[0].status).toBe('success');
|
|
197
|
+
expect(result[0].saved).toBe('-');
|
|
198
|
+
expect(fs.readdirSync(tempDir)).toHaveLength(0);
|
|
199
|
+
const emitted = chunks.join('');
|
|
200
|
+
expect(emitted).toContain('# Piped');
|
|
201
|
+
expect(emitted).toContain('Streaming body');
|
|
202
|
+
expect(emitted.endsWith('\n')).toBe(true);
|
|
203
|
+
}
|
|
204
|
+
finally {
|
|
205
|
+
process.stdout.write = originalWrite;
|
|
206
|
+
}
|
|
207
|
+
});
|
|
208
|
+
it('keeps remote image URLs intact in stdout mode (no download)', async () => {
|
|
209
|
+
const tempDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-article-'));
|
|
210
|
+
tempDirs.push(tempDir);
|
|
211
|
+
const chunks = [];
|
|
212
|
+
const originalWrite = process.stdout.write.bind(process.stdout);
|
|
213
|
+
process.stdout.write = ((chunk) => {
|
|
214
|
+
chunks.push(typeof chunk === 'string' ? chunk : Buffer.from(chunk).toString('utf8'));
|
|
215
|
+
return true;
|
|
216
|
+
});
|
|
217
|
+
try {
|
|
218
|
+
await downloadArticle({
|
|
219
|
+
title: 'WithImage',
|
|
220
|
+
contentHtml: '<p><img src="https://example.com/a.jpg"></p>',
|
|
221
|
+
imageUrls: ['https://example.com/a.jpg'],
|
|
222
|
+
}, {
|
|
223
|
+
output: tempDir,
|
|
224
|
+
downloadImages: true,
|
|
225
|
+
stdout: true,
|
|
226
|
+
});
|
|
227
|
+
expect(fs.readdirSync(tempDir)).toHaveLength(0);
|
|
228
|
+
expect(chunks.join('')).toContain('https://example.com/a.jpg');
|
|
229
|
+
}
|
|
230
|
+
finally {
|
|
231
|
+
process.stdout.write = originalWrite;
|
|
232
|
+
}
|
|
233
|
+
});
|
|
234
|
+
});
|
|
39
235
|
});
|