@jackwener/opencli 1.7.6 → 1.7.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. package/README.md +17 -8
  2. package/README.zh-CN.md +14 -8
  3. package/cli-manifest.json +325 -11
  4. package/clis/51job/company.js +125 -0
  5. package/clis/51job/detail.js +108 -0
  6. package/clis/51job/hot.js +55 -0
  7. package/clis/51job/search.js +79 -0
  8. package/clis/51job/utils.js +302 -0
  9. package/clis/51job/utils.test.js +69 -0
  10. package/clis/bilibili/video.js +11 -4
  11. package/clis/bilibili/video.test.js +51 -0
  12. package/clis/chatgpt/image.js +1 -1
  13. package/clis/deepseek/ask.js +19 -13
  14. package/clis/deepseek/ask.test.js +93 -1
  15. package/clis/deepseek/utils.js +108 -23
  16. package/clis/deepseek/utils.test.js +109 -1
  17. package/clis/gemini/image.js +1 -1
  18. package/clis/instagram/download.js +1 -1
  19. package/clis/twitter/likes.js +3 -2
  20. package/clis/twitter/search.js +4 -2
  21. package/clis/twitter/search.test.js +4 -0
  22. package/clis/twitter/shared.js +28 -0
  23. package/clis/twitter/shared.test.js +96 -0
  24. package/clis/twitter/thread.js +3 -1
  25. package/clis/twitter/timeline.js +3 -2
  26. package/clis/twitter/tweets.js +3 -2
  27. package/clis/twitter/tweets.test.js +1 -1
  28. package/clis/web/read.js +25 -5
  29. package/clis/web/read.test.js +76 -0
  30. package/clis/weread/ai-outline.js +170 -0
  31. package/clis/weread/ai-outline.test.js +83 -0
  32. package/clis/weread/book.js +57 -44
  33. package/clis/weread/commands.test.js +24 -0
  34. package/clis/xiaoyuzhou/podcast-episodes.js +2 -2
  35. package/clis/xiaoyuzhou/podcast-episodes.test.js +78 -0
  36. package/dist/src/browser/analyze.d.ts +103 -0
  37. package/dist/src/browser/analyze.js +230 -0
  38. package/dist/src/browser/analyze.test.d.ts +1 -0
  39. package/dist/src/browser/analyze.test.js +164 -0
  40. package/dist/src/browser/article-extract.d.ts +57 -0
  41. package/dist/src/browser/article-extract.e2e.test.d.ts +1 -0
  42. package/dist/src/browser/article-extract.e2e.test.js +105 -0
  43. package/dist/src/browser/article-extract.js +169 -0
  44. package/dist/src/browser/article-extract.test.d.ts +1 -0
  45. package/dist/src/browser/article-extract.test.js +94 -0
  46. package/dist/src/browser/cdp.js +11 -2
  47. package/dist/src/browser/verify-fixture.d.ts +59 -0
  48. package/dist/src/browser/verify-fixture.js +213 -0
  49. package/dist/src/browser/verify-fixture.test.d.ts +1 -0
  50. package/dist/src/browser/verify-fixture.test.js +161 -0
  51. package/dist/src/cli.d.ts +32 -0
  52. package/dist/src/cli.js +333 -43
  53. package/dist/src/cli.test.js +257 -1
  54. package/dist/src/daemon.d.ts +3 -2
  55. package/dist/src/daemon.js +16 -4
  56. package/dist/src/daemon.test.d.ts +1 -0
  57. package/dist/src/daemon.test.js +19 -0
  58. package/dist/src/download/article-download.d.ts +12 -0
  59. package/dist/src/download/article-download.js +141 -17
  60. package/dist/src/download/article-download.test.js +196 -0
  61. package/dist/src/download/index.js +73 -86
  62. package/dist/src/errors.js +4 -2
  63. package/dist/src/errors.test.js +13 -0
  64. package/dist/src/launcher.d.ts +1 -1
  65. package/dist/src/launcher.js +3 -3
  66. package/dist/src/output.js +1 -1
  67. package/dist/src/output.test.js +6 -0
  68. package/package.json +5 -1
@@ -17,7 +17,7 @@ vi.mock('./browser/index.js', () => {
17
17
  },
18
18
  };
19
19
  });
20
- import { createProgram, findPackageRoot, resolveBrowserVerifyInvocation } from './cli.js';
20
+ import { createProgram, findPackageRoot, normalizeVerifyRows, renderVerifyPreview, resolveBrowserVerifyInvocation } from './cli.js';
21
21
  describe('resolveBrowserVerifyInvocation', () => {
22
22
  it('prefers the built entry declared in package metadata', () => {
23
23
  const projectRoot = path.join('repo-root');
@@ -102,6 +102,7 @@ describe('browser tab targeting commands', () => {
102
102
  getActivePage: vi.fn().mockReturnValue('tab-1'),
103
103
  getCurrentUrl: vi.fn().mockResolvedValue('https://one.example'),
104
104
  startNetworkCapture: vi.fn().mockResolvedValue(true),
105
+ getCookies: vi.fn().mockResolvedValue([]),
105
106
  evaluate: vi.fn().mockResolvedValue({ ok: true }),
106
107
  tabs: vi.fn().mockResolvedValue([
107
108
  { index: 0, page: 'tab-1', url: 'https://one.example', title: 'one', active: true },
@@ -117,6 +118,15 @@ describe('browser tab targeting commands', () => {
117
118
  readNetworkCapture: vi.fn().mockResolvedValue([]),
118
119
  };
119
120
  });
121
+ function lastJsonLog() {
122
+ const calls = consoleLogSpy.mock.calls;
123
+ if (calls.length === 0)
124
+ throw new Error('Expected at least one console.log call');
125
+ const last = calls[calls.length - 1][0];
126
+ if (typeof last !== 'string')
127
+ throw new Error(`Expected string arg to console.log, got ${typeof last}`);
128
+ return JSON.parse(last);
129
+ }
120
130
  it('binds browser commands to an explicit target tab via --tab', async () => {
121
131
  const program = createProgram('', '');
122
132
  await program.parseAsync(['node', 'opencli', 'browser', 'eval', '--tab', 'tab-2', 'document.title']);
@@ -233,6 +243,203 @@ describe('browser tab targeting commands', () => {
233
243
  expect(browserState.page?.closeTab).not.toHaveBeenCalled();
234
244
  expect(stderrSpy.mock.calls.flat().join('\n')).toContain('Target tab tab-stale is not part of the current browser session');
235
245
  });
246
+ it('browser analyze merges HttpOnly cookie names from page.getCookies and drains stale capture before verdict', async () => {
247
+ browserState.page = {
248
+ goto: vi.fn().mockResolvedValue(undefined),
249
+ wait: vi.fn().mockResolvedValue(undefined),
250
+ setActivePage: vi.fn(),
251
+ getActivePage: vi.fn().mockReturnValue('tab-1'),
252
+ getCurrentUrl: vi.fn().mockResolvedValue('https://target.example'),
253
+ startNetworkCapture: vi.fn().mockResolvedValue(true),
254
+ getCookies: vi.fn().mockResolvedValue([{ name: 'cf_clearance', value: 'x', domain: '.target.example' }]),
255
+ evaluate: vi.fn().mockResolvedValue({
256
+ cookieNames: [],
257
+ initialState: {
258
+ __INITIAL_STATE__: false,
259
+ __NUXT__: false,
260
+ __NEXT_DATA__: false,
261
+ __APOLLO_STATE__: false,
262
+ },
263
+ title: 'Target',
264
+ finalUrl: 'https://target.example/',
265
+ }),
266
+ tabs: vi.fn().mockResolvedValue([{ index: 0, page: 'tab-1', url: 'https://target.example', title: 'Target', active: true }]),
267
+ readNetworkCapture: vi.fn()
268
+ .mockResolvedValueOnce([
269
+ {
270
+ url: 'https://stale.example/api/old',
271
+ method: 'GET',
272
+ responseStatus: 200,
273
+ responseContentType: 'application/json',
274
+ responsePreview: '{"stale":true}',
275
+ },
276
+ ])
277
+ .mockResolvedValueOnce([
278
+ {
279
+ url: 'https://target.example/waf',
280
+ method: 'GET',
281
+ responseStatus: 403,
282
+ responseContentType: 'text/html',
283
+ responsePreview: 'Cloudflare Ray ID',
284
+ },
285
+ ]),
286
+ };
287
+ const program = createProgram('', '');
288
+ await program.parseAsync(['node', 'opencli', 'browser', 'analyze', 'https://target.example/']);
289
+ const out = lastJsonLog();
290
+ expect(browserState.page?.readNetworkCapture).toHaveBeenCalledTimes(2);
291
+ expect(out.anti_bot.vendor).toBe('cloudflare');
292
+ expect(out.anti_bot.evidence).toContain('cookie:cf_clearance');
293
+ });
294
+ it('browser analyze falls back to interceptor buffer when network capture is unsupported', async () => {
295
+ let bufferReads = 0;
296
+ browserState.page = {
297
+ goto: vi.fn().mockResolvedValue(undefined),
298
+ wait: vi.fn().mockResolvedValue(undefined),
299
+ setActivePage: vi.fn(),
300
+ getActivePage: vi.fn().mockReturnValue('tab-1'),
301
+ getCurrentUrl: vi.fn().mockResolvedValue('https://target.example'),
302
+ startNetworkCapture: vi.fn().mockResolvedValue(false),
303
+ getCookies: vi.fn().mockResolvedValue([{ name: 'cf_clearance', value: 'x', domain: '.target.example' }]),
304
+ evaluate: vi.fn().mockImplementation(async (arg) => {
305
+ if (typeof arg === 'string' && arg.includes('document.cookie')) {
306
+ return {
307
+ cookieNames: [],
308
+ initialState: {
309
+ __INITIAL_STATE__: false,
310
+ __NUXT__: false,
311
+ __NEXT_DATA__: false,
312
+ __APOLLO_STATE__: false,
313
+ },
314
+ title: 'Target',
315
+ finalUrl: 'https://target.example/',
316
+ };
317
+ }
318
+ if (typeof arg === 'string' && arg.includes('window.__opencli_net = []')) {
319
+ bufferReads += 1;
320
+ if (bufferReads === 1) {
321
+ return JSON.stringify([
322
+ {
323
+ url: 'https://stale.example/api/old',
324
+ method: 'GET',
325
+ status: 200,
326
+ size: 12,
327
+ ct: 'application/json',
328
+ body: { stale: true },
329
+ },
330
+ ]);
331
+ }
332
+ return JSON.stringify([
333
+ {
334
+ url: 'https://target.example/waf',
335
+ method: 'GET',
336
+ status: 403,
337
+ size: 17,
338
+ ct: 'text/html',
339
+ body: 'Cloudflare Ray ID',
340
+ },
341
+ ]);
342
+ }
343
+ return undefined;
344
+ }),
345
+ tabs: vi.fn().mockResolvedValue([{ index: 0, page: 'tab-1', url: 'https://target.example', title: 'Target', active: true }]),
346
+ readNetworkCapture: vi.fn().mockResolvedValue([]),
347
+ };
348
+ const program = createProgram('', '');
349
+ await program.parseAsync(['node', 'opencli', 'browser', 'analyze', 'https://target.example/']);
350
+ const out = lastJsonLog();
351
+ expect(browserState.page?.readNetworkCapture).toHaveBeenCalledTimes(2);
352
+ expect(bufferReads).toBe(2);
353
+ expect(out.anti_bot.vendor).toBe('cloudflare');
354
+ expect(out.anti_bot.evidence).toContain('cookie:cf_clearance');
355
+ expect(out.anti_bot.evidence).toContain('body:https://target.example/waf');
356
+ });
357
+ it('browser wait xhr starts capture, injects interceptor on fallback, and ignores stale ring entries', async () => {
358
+ browserState.page = {
359
+ goto: vi.fn().mockResolvedValue(undefined),
360
+ wait: vi.fn().mockResolvedValue(undefined),
361
+ setActivePage: vi.fn(),
362
+ getActivePage: vi.fn().mockReturnValue('tab-1'),
363
+ getCurrentUrl: vi.fn().mockResolvedValue('https://target.example'),
364
+ startNetworkCapture: vi.fn().mockResolvedValue(false),
365
+ evaluate: vi.fn().mockResolvedValue(undefined),
366
+ tabs: vi.fn().mockResolvedValue([{ index: 0, page: 'tab-1', url: 'https://target.example', title: 'Target', active: true }]),
367
+ readNetworkCapture: vi.fn()
368
+ .mockResolvedValueOnce([
369
+ {
370
+ url: 'https://stale.example/api/old',
371
+ method: 'GET',
372
+ responseStatus: 200,
373
+ responseContentType: 'application/json',
374
+ responsePreview: '{"stale":true}',
375
+ },
376
+ ])
377
+ .mockResolvedValueOnce([
378
+ {
379
+ url: 'https://target.example/api/target',
380
+ method: 'GET',
381
+ responseStatus: 200,
382
+ responseContentType: 'application/json',
383
+ responsePreview: '{"ok":true}',
384
+ },
385
+ ]),
386
+ };
387
+ const program = createProgram('', '');
388
+ await program.parseAsync(['node', 'opencli', 'browser', 'wait', 'xhr', '/api/target', '--timeout', '900']);
389
+ const out = lastJsonLog();
390
+ expect(browserState.page?.startNetworkCapture).toHaveBeenCalledTimes(1);
391
+ expect(browserState.page?.evaluate).toHaveBeenCalledWith(expect.stringContaining('window.__opencli_net'));
392
+ expect(browserState.page?.readNetworkCapture).toHaveBeenCalledTimes(2);
393
+ expect(out.matched.url).toBe('https://target.example/api/target');
394
+ });
395
+ it('browser wait xhr reads interceptor buffer when network capture is unsupported', async () => {
396
+ let bufferReads = 0;
397
+ browserState.page = {
398
+ goto: vi.fn().mockResolvedValue(undefined),
399
+ wait: vi.fn().mockResolvedValue(undefined),
400
+ setActivePage: vi.fn(),
401
+ getActivePage: vi.fn().mockReturnValue('tab-1'),
402
+ getCurrentUrl: vi.fn().mockResolvedValue('https://target.example'),
403
+ startNetworkCapture: vi.fn().mockResolvedValue(false),
404
+ evaluate: vi.fn().mockImplementation(async (arg) => {
405
+ if (typeof arg === 'string' && arg.includes('window.__opencli_net = []')) {
406
+ bufferReads += 1;
407
+ if (bufferReads === 1) {
408
+ return JSON.stringify([
409
+ {
410
+ url: 'https://stale.example/api/old',
411
+ method: 'GET',
412
+ status: 200,
413
+ size: 12,
414
+ ct: 'application/json',
415
+ body: { stale: true },
416
+ },
417
+ ]);
418
+ }
419
+ return JSON.stringify([
420
+ {
421
+ url: 'https://target.example/api/target',
422
+ method: 'GET',
423
+ status: 200,
424
+ size: 11,
425
+ ct: 'application/json',
426
+ body: { ok: true },
427
+ },
428
+ ]);
429
+ }
430
+ return undefined;
431
+ }),
432
+ tabs: vi.fn().mockResolvedValue([{ index: 0, page: 'tab-1', url: 'https://target.example', title: 'Target', active: true }]),
433
+ readNetworkCapture: vi.fn().mockResolvedValue([]),
434
+ };
435
+ const program = createProgram('', '');
436
+ await program.parseAsync(['node', 'opencli', 'browser', 'wait', 'xhr', '/api/target', '--timeout', '900']);
437
+ const out = lastJsonLog();
438
+ expect(browserState.page?.startNetworkCapture).toHaveBeenCalledTimes(1);
439
+ expect(browserState.page?.readNetworkCapture).toHaveBeenCalledTimes(2);
440
+ expect(bufferReads).toBe(2);
441
+ expect(out.matched.url).toBe('https://target.example/api/target');
442
+ });
236
443
  });
237
444
  describe('browser network command', () => {
238
445
  const consoleLogSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
@@ -1045,3 +1252,52 @@ describe('findPackageRoot', () => {
1045
1252
  expect(findPackageRoot(cliFile, (candidate) => exists.has(candidate))).toBe(packageRoot);
1046
1253
  });
1047
1254
  });
1255
+ describe('normalizeVerifyRows', () => {
1256
+ it('returns an empty array for null / primitives', () => {
1257
+ expect(normalizeVerifyRows(null)).toEqual([]);
1258
+ expect(normalizeVerifyRows(undefined)).toEqual([]);
1259
+ expect(normalizeVerifyRows('hello')).toEqual([]);
1260
+ });
1261
+ it('passes through array-of-objects', () => {
1262
+ const rows = [{ a: 1 }, { a: 2 }];
1263
+ expect(normalizeVerifyRows(rows)).toEqual(rows);
1264
+ });
1265
+ it('wraps array-of-primitives as { value } rows', () => {
1266
+ expect(normalizeVerifyRows([1, 'two', null])).toEqual([
1267
+ { value: 1 }, { value: 'two' }, { value: null },
1268
+ ]);
1269
+ });
1270
+ it('unwraps common envelope shapes', () => {
1271
+ expect(normalizeVerifyRows({ rows: [{ a: 1 }] })).toEqual([{ a: 1 }]);
1272
+ expect(normalizeVerifyRows({ items: [{ b: 2 }] })).toEqual([{ b: 2 }]);
1273
+ expect(normalizeVerifyRows({ data: [{ c: 3 }] })).toEqual([{ c: 3 }]);
1274
+ expect(normalizeVerifyRows({ results: [{ d: 4 }] })).toEqual([{ d: 4 }]);
1275
+ });
1276
+ it('wraps a single object as a one-row array', () => {
1277
+ expect(normalizeVerifyRows({ ok: true })).toEqual([{ ok: true }]);
1278
+ });
1279
+ });
1280
+ describe('renderVerifyPreview', () => {
1281
+ it('emits a placeholder for empty rows', () => {
1282
+ expect(renderVerifyPreview([])).toContain('no rows');
1283
+ });
1284
+ it('prints column headers followed by row cells', () => {
1285
+ const out = renderVerifyPreview([{ a: 'x', b: 1 }, { a: 'y', b: 2 }]);
1286
+ const lines = out.split('\n');
1287
+ expect(lines[0]).toContain('a');
1288
+ expect(lines[0]).toContain('b');
1289
+ expect(lines.some((l) => l.includes('x') && l.includes('1'))).toBe(true);
1290
+ expect(lines.some((l) => l.includes('y') && l.includes('2'))).toBe(true);
1291
+ });
1292
+ it('truncates long cells and reports hidden rows / columns', () => {
1293
+ const rows = Array.from({ length: 15 }, (_, i) => ({
1294
+ a: i, b: 'x'.repeat(100), c: i, d: i, e: i, f: i, g: i, h: i,
1295
+ }));
1296
+ const out = renderVerifyPreview(rows, { maxRows: 5, maxCols: 3, cellMax: 10 });
1297
+ expect(out).toContain('and 10 more row');
1298
+ expect(out).toContain('more column');
1299
+ // cell gets truncated
1300
+ expect(out).toContain('xxxxxxxxxx');
1301
+ expect(out).not.toContain('xxxxxxxxxxx'); // never 11 consecutive
1302
+ });
1303
+ });
@@ -9,7 +9,8 @@
9
9
  * 1. Origin check — reject HTTP/WS from non chrome-extension:// origins
10
10
  * 2. Custom header — require X-OpenCLI header (browsers can't send it
11
11
  * without CORS preflight, which we deny)
12
- * 3. No CORS headers — responses never include Access-Control-Allow-Origin
12
+ * 3. No CORS headers on command endpoints — only /ping is readable from the
13
+ * Browser Bridge extension origin so the extension can probe daemon reachability
13
14
  * 4. Body size limit — 1 MB max to prevent OOM
14
15
  * 5. WebSocket verifyClient — reject upgrade before connection is established
15
16
  *
@@ -18,4 +19,4 @@
18
19
  * - Persistent — stays alive until explicit shutdown, SIGTERM, or uninstall
19
20
  * - Listens on localhost:19825
20
21
  */
21
- export {};
22
+ export declare function getResponseCorsHeaders(pathname: string, origin?: string): Record<string, string> | undefined;
@@ -9,7 +9,8 @@
9
9
  * 1. Origin check — reject HTTP/WS from non chrome-extension:// origins
10
10
  * 2. Custom header — require X-OpenCLI header (browsers can't send it
11
11
  * without CORS preflight, which we deny)
12
- * 3. No CORS headers — responses never include Access-Control-Allow-Origin
12
+ * 3. No CORS headers on command endpoints — only /ping is readable from the
13
+ * Browser Bridge extension origin so the extension can probe daemon reachability
13
14
  * 4. Body size limit — 1 MB max to prevent OOM
14
15
  * 5. WebSocket verifyClient — reject upgrade before connection is established
15
16
  *
@@ -60,10 +61,20 @@ function readBody(req) {
60
61
  reject(err); });
61
62
  });
62
63
  }
63
- function jsonResponse(res, status, data) {
64
- res.writeHead(status, { 'Content-Type': 'application/json' });
64
+ function jsonResponse(res, status, data, extraHeaders) {
65
+ res.writeHead(status, { 'Content-Type': 'application/json', ...extraHeaders });
65
66
  res.end(JSON.stringify(data));
66
67
  }
68
+ export function getResponseCorsHeaders(pathname, origin) {
69
+ if (pathname !== '/ping')
70
+ return undefined;
71
+ if (!origin || !origin.startsWith('chrome-extension://'))
72
+ return undefined;
73
+ return {
74
+ 'Access-Control-Allow-Origin': origin,
75
+ Vary: 'Origin',
76
+ };
77
+ }
67
78
  async function handleRequest(req, res) {
68
79
  // ─── Security: Origin & custom-header check ──────────────────────
69
80
  // Block browser-based CSRF: browsers always send an Origin header on
@@ -93,7 +104,7 @@ async function handleRequest(req, res) {
93
104
  // Timing side-channels can reveal daemon presence to local processes, which
94
105
  // is an accepted risk given the daemon is loopback-only and short-lived.
95
106
  if (req.method === 'GET' && pathname === '/ping') {
96
- jsonResponse(res, 200, { ok: true });
107
+ jsonResponse(res, 200, { ok: true }, getResponseCorsHeaders(pathname, origin));
97
108
  return;
98
109
  }
99
110
  // Require custom header on all other HTTP requests. Browsers cannot attach
@@ -272,6 +283,7 @@ wss.on('connection', (ws) => {
272
283
  if (extensionWs === ws) {
273
284
  extensionWs = null;
274
285
  extensionVersion = null;
286
+ extensionCompatRange = null;
275
287
  // Reject pending requests in case 'close' does not follow this 'error'
276
288
  for (const [, p] of pending) {
277
289
  clearTimeout(p.timer);
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,19 @@
1
+ import { describe, expect, it } from 'vitest';
2
+ import { getResponseCorsHeaders } from './daemon.js';
3
+ describe('getResponseCorsHeaders', () => {
4
+ it('allows the Browser Bridge extension origin to read /ping', () => {
5
+ expect(getResponseCorsHeaders('/ping', 'chrome-extension://abc123')).toEqual({
6
+ 'Access-Control-Allow-Origin': 'chrome-extension://abc123',
7
+ Vary: 'Origin',
8
+ });
9
+ });
10
+ it('does not add CORS headers for ordinary web origins', () => {
11
+ expect(getResponseCorsHeaders('/ping', 'https://example.com')).toBeUndefined();
12
+ });
13
+ it('does not add CORS headers when origin is absent', () => {
14
+ expect(getResponseCorsHeaders('/ping')).toBeUndefined();
15
+ });
16
+ it('does not add CORS headers for command endpoints even from the extension origin', () => {
17
+ expect(getResponseCorsHeaders('/command', 'chrome-extension://abc123')).toBeUndefined();
18
+ });
19
+ });
@@ -37,6 +37,18 @@ export interface ArticleDownloadOptions {
37
37
  detectImageExt?: (url: string) => string;
38
38
  /** Custom frontmatter labels (default: Chinese labels) */
39
39
  frontmatterLabels?: FrontmatterLabels;
40
+ /**
41
+ * Extra CSS selectors removed from the article before Turndown conversion.
42
+ * Use this to drop site-specific noise the adapter can't always trim upstream
43
+ * (e.g. zhihu 折叠卡 (collapsed cards), weixin 赞赏栏 (reward bar), wiki infobox).
44
+ */
45
+ cleanSelectors?: string[];
46
+ /**
47
+ * Write the markdown to `process.stdout` instead of a file on disk. Image
48
+ * download and directory creation are skipped — remote image URLs are kept
49
+ * as-is so the output is self-contained when piped.
50
+ */
51
+ stdout?: boolean;
40
52
  }
41
53
  export interface ArticleDownloadResult {
42
54
  title: string;
@@ -8,6 +8,7 @@
8
8
  import * as fs from 'node:fs';
9
9
  import * as path from 'node:path';
10
10
  import TurndownService from 'turndown';
11
+ import { gfm } from 'turndown-plugin-gfm';
11
12
  import { httpDownload, sanitizeFilename } from './index.js';
12
13
  import { formatBytes } from './progress.js';
13
14
  const IMAGE_CONCURRENCY = 5;
@@ -19,22 +20,127 @@ const DEFAULT_LABELS = {
19
20
  // ============================================================
20
21
  // Markdown Conversion
21
22
  // ============================================================
22
- function createTurndown(configure) {
23
+ // Nodes that never carry article content. Turndown keeps them by default — if an
24
+ // adapter's contentHtml extraction misses one, CSS / scripts / widget markup
25
+ // ends up inline in the .md. Strip them unconditionally at the converter level.
26
+ // `svg` is not in HTMLElementTagNameMap, so we type-narrow manually.
27
+ // `header/footer/nav/aside` cover page chrome that adapters occasionally
28
+ // forget to trim — the article's own title/author/publishTime are supplied
29
+ // as separate fields on ArticleData, so duplicated nodes are redundant.
30
+ // `iframe` is NOT in this set — it's handled by a dedicated rule below that
31
+ // degrades to a link so embedded content (YouTube, Twitter, CodePen …) keeps
32
+ // a reachable URL in the exported markdown.
33
+ const STRIPPED_TAGS = [
34
+ 'script', 'style', 'noscript',
35
+ 'canvas',
36
+ 'form', 'button', 'dialog',
37
+ 'header', 'footer', 'nav', 'aside',
38
+ ];
39
+ function createTurndown(configure, cleanSelectors) {
23
40
  const td = new TurndownService({
24
41
  headingStyle: 'atx',
25
42
  codeBlockStyle: 'fenced',
26
43
  bulletListMarker: '-',
27
44
  });
45
+ td.use(gfm);
46
+ td.remove(STRIPPED_TAGS);
47
+ // turndown-plugin-gfm@1.0.2 emits single-tilde strikethrough (`~x~`), which
48
+ // is not the canonical GFM form. Override it so exported markdown is
49
+ // portable across common renderers.
50
+ td.addRule('canonicalStrikethrough', {
51
+ filter: (node) => ['DEL', 'S', 'STRIKE'].includes(node.nodeName),
52
+ replacement: (content) => `~~${content}~~`,
53
+ });
54
+ // SVG isn't in the static HTML tag map; match by name with a custom filter.
55
+ td.addRule('stripSvg', {
56
+ filter: (node) => node.nodeName === 'svg' || node.nodeName === 'SVG',
57
+ replacement: () => '',
58
+ });
28
59
  td.addRule('linebreak', {
29
60
  filter: 'br',
30
61
  replacement: () => '\n',
31
62
  });
63
+ // Inline base64 images would land as huge `![](data:image/...;base64,...)`
64
+ // strings that the image downloader can't localize. Drop them.
65
+ td.addRule('ignoreBase64Images', {
66
+ filter: (node) => {
67
+ if (node.nodeName !== 'IMG')
68
+ return false;
69
+ const src = node.getAttribute?.('src') ?? '';
70
+ return src.startsWith('data:');
71
+ },
72
+ replacement: () => '',
73
+ });
74
+ // Markdown has no native video/audio primitive. Emit inline HTML so
75
+ // renderers that support it (GitHub, VS Code preview …) still play the
76
+ // media; viewers that don't simply show the tag as text, which is still
77
+ // more information than dropping the node outright.
78
+ td.addRule('videoElement', {
79
+ filter: (node) => node.nodeName === 'VIDEO',
80
+ replacement: (_content, node) => {
81
+ const el = node;
82
+ const src = el.getAttribute('src')
83
+ || el.querySelector('source')?.getAttribute('src')
84
+ || '';
85
+ if (!src)
86
+ return '';
87
+ const poster = el.getAttribute('poster') || '';
88
+ return `\n<video src="${src}" controls${poster ? ` poster="${poster}"` : ''}></video>\n`;
89
+ },
90
+ });
91
+ td.addRule('audioElement', {
92
+ filter: (node) => node.nodeName === 'AUDIO',
93
+ replacement: (_content, node) => {
94
+ const el = node;
95
+ const src = el.getAttribute('src')
96
+ || el.querySelector('source')?.getAttribute('src')
97
+ || '';
98
+ return src ? `\n<audio src="${src}" controls></audio>\n` : '';
99
+ },
100
+ });
101
+ // Iframes (YouTube, Twitter, CodePen …) degrade to a markdown link so the
102
+ // embedded resource is still reachable from the exported file.
103
+ td.addRule('iframeToLink', {
104
+ filter: (node) => node.nodeName === 'IFRAME',
105
+ replacement: (_content, node) => {
106
+ const el = node;
107
+ const src = el.getAttribute('src') || '';
108
+ if (!src)
109
+ return '';
110
+ const title = el.getAttribute('title') || 'Embedded content';
111
+ return `\n[${title}](${src})\n`;
112
+ },
113
+ });
114
+ // Per-adapter dirty-node removal. Adapters know their site's specific noise
115
+ // (zhihu 折叠卡 / collapsed cards, weixin 赞赏栏 / reward bar, wiki 折叠 / collapsed infobox …); we keep the default set
116
+ // empty so the generic converter stays untouched.
117
+ const selectorRules = (cleanSelectors ?? [])
118
+ .map(sel => sel.trim())
119
+ .filter(Boolean);
120
+ if (selectorRules.length > 0) {
121
+ td.addRule('cleanSelectors', {
122
+ filter: (node) => {
123
+ const match = node.matches;
124
+ if (typeof match !== 'function')
125
+ return false;
126
+ return selectorRules.some((sel) => {
127
+ try {
128
+ return match.call(node, sel);
129
+ }
130
+ catch {
131
+ return false;
132
+ }
133
+ });
134
+ },
135
+ replacement: () => '',
136
+ });
137
+ }
32
138
  if (configure)
33
139
  configure(td);
34
140
  return td;
35
141
  }
36
- function convertToMarkdown(contentHtml, codeBlocks, configure) {
37
- const td = createTurndown(configure);
142
+ function convertToMarkdown(contentHtml, codeBlocks, configure, cleanSelectors) {
143
+ const td = createTurndown(configure, cleanSelectors);
38
144
  let md = td.turndown(contentHtml);
39
145
  // Restore code block placeholders
40
146
  codeBlocks.forEach((block, i) => {
@@ -44,8 +150,12 @@ function convertToMarkdown(contentHtml, codeBlocks, configure) {
44
150
  });
45
151
  // Clean up
46
152
  md = md.replace(/\u00a0/g, ' ');
47
- md = md.replace(/\n{4,}/g, '\n\n\n');
153
+ // Turndown leaves behind lone dashes / middle dots when list bullets or
154
+ // decorative separators lose their surrounding inline context.
155
+ md = md.replace(/^[ \t]*[-·][ \t]*$/gm, '');
156
+ md = md.replace(/^[ \t]+$/gm, '');
48
157
  md = md.replace(/[ \t]+$/gm, '');
158
+ md = md.replace(/\n{3,}/g, '\n\n');
49
159
  return md;
50
160
  }
51
161
  function replaceImageUrls(md, urlMap) {
@@ -120,7 +230,7 @@ async function downloadImages(imgUrls, imgDir, headers, detectExt) {
120
230
  * 6. File write
121
231
  */
122
232
  export async function downloadArticle(data, options) {
123
- const { output, downloadImages: shouldDownloadImages = true, imageHeaders, maxTitleLength = 80, configureTurndown, detectImageExt, frontmatterLabels, } = options;
233
+ const { output, downloadImages: shouldDownloadImages = true, imageHeaders, maxTitleLength = 80, configureTurndown, detectImageExt, frontmatterLabels, cleanSelectors, stdout = false, } = options;
124
234
  const labels = { ...DEFAULT_LABELS, ...frontmatterLabels };
125
235
  if (!data.title) {
126
236
  return [{
@@ -143,33 +253,47 @@ export async function downloadArticle(data, options) {
143
253
  }];
144
254
  }
145
255
  // Convert HTML to Markdown
146
- let markdown = convertToMarkdown(data.contentHtml, data.codeBlocks || [], configureTurndown);
147
- // Prepare output directory
256
+ let markdown = convertToMarkdown(data.contentHtml, data.codeBlocks || [], configureTurndown, cleanSelectors);
148
257
  const safeTitle = sanitizeFilename(data.title, maxTitleLength);
149
- const articleDir = path.join(output, safeTitle);
150
- fs.mkdirSync(articleDir, { recursive: true });
151
- // Download images
152
- if (shouldDownloadImages && data.imageUrls && data.imageUrls.length > 0) {
258
+ // Download images only when writing to disk. In stdout mode remote URLs
259
+ // stay intact so the piped output is self-contained.
260
+ if (!stdout && shouldDownloadImages && data.imageUrls && data.imageUrls.length > 0) {
261
+ const articleDir = path.join(output, safeTitle);
262
+ fs.mkdirSync(articleDir, { recursive: true });
153
263
  const imagesDir = path.join(articleDir, 'images');
154
264
  fs.mkdirSync(imagesDir, { recursive: true });
155
265
  const urlMap = await downloadImages(data.imageUrls, imagesDir, imageHeaders, detectImageExt);
156
266
  markdown = replaceImageUrls(markdown, urlMap);
157
267
  }
158
- // Build frontmatter with customizable labels
159
- const headerLines = [`# ${data.title}`, ''];
268
+ // Build frontmatter with customizable labels.
269
+ // Shape: `# Title\n[> meta\n...]\n---\n\n<markdown>` — exactly one blank
270
+ // line separates every section, so we never produce ≥3 consecutive newlines.
271
+ const headerLines = [`# ${data.title}`];
160
272
  if (data.author)
161
273
  headerLines.push(`> ${labels.author}: ${data.author}`);
162
274
  if (data.publishTime)
163
275
  headerLines.push(`> ${labels.publishTime}: ${data.publishTime}`);
164
276
  if (data.sourceUrl)
165
277
  headerLines.push(`> ${labels.sourceUrl}: ${data.sourceUrl}`);
166
- headerLines.push('', '---', '');
167
- const fullContent = headerLines.join('\n') + markdown;
168
- // Write file
278
+ const frontmatter = headerLines.join('\n') + '\n\n---\n\n';
279
+ const fullContent = frontmatter + markdown;
280
+ const size = Buffer.byteLength(fullContent, 'utf-8');
281
+ if (stdout) {
282
+ process.stdout.write(fullContent.endsWith('\n') ? fullContent : fullContent + '\n');
283
+ return [{
284
+ title: data.title,
285
+ author: data.author || '-',
286
+ publish_time: data.publishTime || '-',
287
+ status: 'success',
288
+ size: formatBytes(size),
289
+ saved: '-',
290
+ }];
291
+ }
292
+ const articleDir = path.join(output, safeTitle);
293
+ fs.mkdirSync(articleDir, { recursive: true });
169
294
  const filename = `${safeTitle}.md`;
170
295
  const filePath = path.join(articleDir, filename);
171
296
  fs.writeFileSync(filePath, fullContent, 'utf-8');
172
- const size = Buffer.byteLength(fullContent, 'utf-8');
173
297
  return [{
174
298
  title: data.title,
175
299
  author: data.author || '-',