@kadaliao/geektime-downloader 1.1.4 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/download.js CHANGED
@@ -6,7 +6,10 @@ import chalk from 'chalk';
6
6
  import ora from 'ora';
7
7
  import fs from 'fs/promises';
8
8
  import path from 'path';
9
- import { fileURLToPath } from 'url';
9
+ import { fileURLToPath, pathToFileURL } from 'url';
10
+ import { load as loadHtml } from 'cheerio';
11
+ import crypto from 'crypto';
12
+ import mime from 'mime-types';
10
13
  import { createRequire } from 'module';
11
14
  import * as pdfLib from 'pdf-lib';
12
15
  import { outlinePdfFactory } from '@lillallol/outline-pdf';
@@ -245,9 +248,335 @@ const PRINT_FIX_CSS = `
245
248
  }
246
249
  `;
247
250
 
251
+ // 代码高亮彩色语法(覆盖Prism/Highlight.js常见class)
252
+ const CODE_HIGHLIGHT_CSS = `
253
+ pre[class*="language-"],
254
+ code[class*="language-"],
255
+ pre code,
256
+ code.hljs,
257
+ pre.hljs {
258
+ color: #2d2d2d;
259
+ background: #f7f7f7;
260
+ }
261
+ .token.comment,
262
+ .token.prolog,
263
+ .token.doctype,
264
+ .token.cdata,
265
+ .hljs-comment,
266
+ .hljs-quote {
267
+ color: #6a737d;
268
+ font-style: italic;
269
+ }
270
+ .token.punctuation,
271
+ .hljs-punctuation {
272
+ color: #5e6687;
273
+ }
274
+ .token.property,
275
+ .token.tag,
276
+ .token.constant,
277
+ .token.symbol,
278
+ .token.deleted,
279
+ .hljs-keyword,
280
+ .hljs-selector-tag,
281
+ .hljs-subst,
282
+ .hljs-attribute {
283
+ color: #d73a49;
284
+ }
285
+ .token.boolean,
286
+ .token.number,
287
+ .token.selector,
288
+ .token.attr-name,
289
+ .token.char,
290
+ .token.builtin,
291
+ .token.inserted,
292
+ .hljs-number,
293
+ .hljs-literal,
294
+ .hljs-variable,
295
+ .hljs-template-variable {
296
+ color: #b76bff;
297
+ }
298
+ .token.string,
299
+ .token.attr-value,
300
+ .token.operator,
301
+ .token.entity,
302
+ .token.url,
303
+ .token.statement,
304
+ .token.regex,
305
+ .token.important,
306
+ .token.variable,
307
+ .token.bold,
308
+ .hljs-string,
309
+ .hljs-doctag,
310
+ .hljs-addition {
311
+ color: #22863a;
312
+ }
313
+ .token.function,
314
+ .token.class-name,
315
+ .token.keyword,
316
+ .hljs-title,
317
+ .hljs-section,
318
+ .hljs-type,
319
+ .hljs-selector-id,
320
+ .hljs-selector-class {
321
+ color: #005cc5;
322
+ }
323
+ .token.operator,
324
+ .token.entity,
325
+ .token.url,
326
+ .hljs-bullet,
327
+ .hljs-built_in,
328
+ .hljs-builtin-name,
329
+ .hljs-link {
330
+ color: #e36209;
331
+ }
332
+ .token.italic {
333
+ font-style: italic;
334
+ }
335
+ .token.bold {
336
+ font-weight: 600;
337
+ }
338
+ .token.deleted,
339
+ .hljs-deletion {
340
+ color: #b31d28;
341
+ }
342
+ `;
343
+
248
344
  const GEEKTIME_BASE_URL = 'https://time.geekbang.org';
249
- const ARTICLE_API_URL = `${GEEKTIME_BASE_URL}/serv/v1/article`;
250
345
  const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
346
+ const EPUB_IMAGE_BATCH_SIZE = 5;
347
+ const TEMP_ASSET_PREFIX = '__epub_assets__';
348
+ const ARTICLE_CONTENT_SELECTORS = [
349
+ '#article-content',
350
+ '#article-content-container',
351
+ '.article-content',
352
+ '.article-detail',
353
+ '.article-detail-content',
354
+ '.article-content__body',
355
+ '.Index_articleContent_QBG5G',
356
+ '.ArticleContent_articleContent',
357
+ 'article .content',
358
+ 'main article',
359
+ '.content-container article'
360
+ ];
361
+ const ARTICLE_REMOVAL_SELECTORS = [
362
+ 'nav', 'header', 'footer', 'aside',
363
+ '.comment', '.comments', '.Index_comment', '.CommentArea', '.comment-area', '.CommentWrapper', '.Comment-module', '.CommentList',
364
+ '#comments', '#comment', '[data-section="comment"]',
365
+ '.recommend', '.recommendation', '.related', '.advertisement', '.ad', '.banner',
366
+ '.subscribe', '.subscription', '.toolbar', '.Index_shareIcons_1vtJa',
367
+ '.keyboard-wrapper', '.app-download', '.article-actions', '.article-bottom',
368
+ '.note', '.notes', '.annotation', '.translation', '.trans', '.translator',
369
+ '.audio', '.audio-player', '.voice', '.player', '.geek-player', '.podcast', '.radio',
370
+ '.AudioPlayer', '.VoicePlayer', '.AudioWrapper', '.voice-player',
371
+ '.reward', '.appreciate', '.appreciation', '.donate', '.sponsor', '.thanks', '.support',
372
+ '.qrcode', '.qr-code', '.qr', '.promotion', '.promo', '.ad-banner',
373
+ '.copyright', '.statement', '.disclaimer',
374
+ '.app-download-banner', '.article-plugin', '.article-notification', '.float-bar',
375
+ '.article-plugin-wrapper',
376
+ '[class*="Share"]', '[data-widget="audio"]', '[data-widget="Audio"]',
377
+ 'audio', 'video',
378
+ '[class*="Note"]', '[class*="note"]', '[class*="Translation"]', '[class*="translation"]',
379
+ '[class*="Audio"]', '[class*="audio"]', '[class*="Reward"]', '[class*="reward"]',
380
+ '[data-plugin]', '[data-track]', '[data-track-section]', '[data-translation]', '[data-audio]',
381
+ '[data-role="toolbar"]',
382
+ 'button[data-role="comment"]',
383
+ 'script[data-role="plugin"]',
384
+ '.ArticleBottomBar',
385
+ '.bottom-toolbar'
386
+ ];
387
+ const ARTICLE_PLUGIN_KEYWORDS = [
388
+ 'note', 'translation', 'audio', 'player', 'reward', 'donate',
389
+ 'appreciation', 'sponsor', 'qrcode', 'toolbar', 'plugin',
390
+ 'copyright', 'geeknote', 'bilingual', 'comment'
391
+ ];
392
+ const ARTICLE_MINDMAP_SELECTORS = [
393
+ '.mindmap', '.mind-map', '.MindMap', '.Mind-map',
394
+ '[data-type="mindmap"]', '[data-role="mindmap"]', '[data-widget="mindmap"]',
395
+ '[class*="MindMap"]', '[class*="mindMap"]'
396
+ ];
397
+ const PDF_BASE_CSS = `
398
+ body {
399
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "PingFang SC", "Hiragino Sans GB", "Microsoft YaHei", sans-serif;
400
+ margin: 0;
401
+ padding: 0;
402
+ background: #fff;
403
+ color: #1f2329;
404
+ }
405
+ .article-pdf-wrapper {
406
+ max-width: 860px;
407
+ margin: 0 auto;
408
+ padding: 48px 56px 60px;
409
+ }
410
+ .article-title {
411
+ font-size: 32px;
412
+ font-weight: 600;
413
+ margin-bottom: 16px;
414
+ line-height: 1.3;
415
+ color: #111;
416
+ }
417
+ .article-meta {
418
+ color: #7f8c8d;
419
+ font-size: 14px;
420
+ margin-bottom: 32px;
421
+ }
422
+ .article-content p,
423
+ .article-content div {
424
+ margin: 1.1em 0;
425
+ line-height: 1.9;
426
+ font-size: 16px;
427
+ }
428
+ .article-content p + p,
429
+ .article-content div + p,
430
+ .article-content p + div {
431
+ margin-top: 1.6em;
432
+ }
433
+ .article-content h2,
434
+ .article-content h3,
435
+ .article-content h4 {
436
+ margin-top: 2.2em;
437
+ margin-bottom: 1em;
438
+ font-weight: 600;
439
+ color: #111;
440
+ }
441
+ .article-content h2 {
442
+ font-size: 26px;
443
+ }
444
+ .article-content h3 {
445
+ font-size: 22px;
446
+ }
447
+ .article-content h4 {
448
+ font-size: 18px;
449
+ }
450
+ .article-content img {
451
+ max-width: 100%;
452
+ margin: 1.2em auto;
453
+ display: block;
454
+ border-radius: 4px;
455
+ }
456
+ .article-content blockquote {
457
+ margin: 1.3em 0;
458
+ padding: 0.8em 1.2em;
459
+ border-left: 4px solid #d0d7de;
460
+ background: #f8fafc;
461
+ color: #4b5563;
462
+ }
463
+ .article-content ul,
464
+ .article-content ol {
465
+ margin: 1em 0;
466
+ padding-left: 2em;
467
+ }
468
+ .article-content pre {
469
+ background: #0b1220;
470
+ color: #d9e2ff;
471
+ border-radius: 6px;
472
+ padding: 16px 20px;
473
+ overflow: auto;
474
+ margin: 1.4em 0;
475
+ font-size: 14px;
476
+ line-height: 1.6;
477
+ }
478
+ .article-content pre code {
479
+ background: transparent;
480
+ border: none;
481
+ padding: 0;
482
+ color: inherit;
483
+ }
484
+ .article-content code {
485
+ font-family: "Fira Code", "SFMono-Regular", Consolas, "Liberation Mono", Menlo, monospace;
486
+ background: rgba(15, 23, 42, 0.08);
487
+ border-radius: 4px;
488
+ padding: 0.2em 0.4em;
489
+ }
490
+ .article-content hr {
491
+ border: none;
492
+ border-top: 1px solid #e5e7eb;
493
+ margin: 2.4em 0;
494
+ }
495
+ `;
496
+
497
/**
 * Reports whether a path can be accessed by the current process.
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} `true` when `fs.access` succeeds, `false` when it rejects.
 */
async function fileExists(filePath) {
  // Every access failure (missing path, permissions, ...) is reported as "does not exist".
  return fs.access(filePath).then(
    () => true,
    () => false,
  );
}
505
+
506
/**
 * Maps a loosely formatted SameSite value onto the canonical
 * `'Lax' | 'Strict' | 'None'` spelling.
 *
 * Matching is case-insensitive and substring based, so browser-extension
 * exports such as `no_restriction` are accepted as well.
 *
 * @param {unknown} value - Raw SameSite value taken from a cookie JSON entry.
 * @returns {'Lax' | 'Strict' | 'None' | undefined} Canonical value, or
 *   `undefined` when the input is empty or not recognised.
 */
function normalizeCookieSameSite(value) {
  if (!value) return undefined;
  const lower = value.toString().toLowerCase();
  // 'no_restriction' contains the substring 'strict' ("re-strict-ion"), so it
  // has to be tested before the 'strict' check or it is reported as 'Strict'.
  if (lower.includes('no_restriction')) return 'None';
  if (lower.includes('lax')) return 'Lax';
  if (lower.includes('strict')) return 'Strict';
  if (lower.includes('none')) return 'None';
  return undefined;
}
514
+
515
/**
 * Returns a usable cookie domain, falling back to `.geekbang.org`.
 *
 * @param {unknown} domain - Domain taken from a cookie JSON entry.
 * @returns {string} The trimmed domain, or `.geekbang.org` when the input is
 *   not a string or is empty/whitespace-only.
 */
function normalizeCookieDomain(domain) {
  const trimmed = typeof domain === 'string' ? domain.trim() : '';
  // A whitespace-only domain would otherwise become '' and yield a cookie with no domain.
  return trimmed || '.geekbang.org';
}
521
+
522
/**
 * Reads a JSON cookie export and converts it into cookie objects plus a
 * ready-to-send `Cookie` header value.
 *
 * Entries without a string `name` or with an `undefined` `value` are skipped.
 * An expiry is copied from `expires` or `expirationDate` only when it converts
 * to a finite number.
 *
 * @param {string} filePath - Absolute path, or a path relative to the current working directory.
 * @returns {Promise<{cookieHeader: string, cookies: object[], absolutePath: string}>}
 *   The `name=value; ...` header, the normalised cookie objects, and the resolved file path.
 * @throws {Error} When the file cannot be read, is not valid JSON, is not an
 *   array, or contains no usable cookie entry. Read/parse failures keep the
 *   underlying error as `cause`.
 */
async function loadCookiesFromJsonFile(filePath) {
  const absolutePath = path.isAbsolute(filePath) ? filePath : path.resolve(process.cwd(), filePath);

  let raw;
  try {
    raw = await fs.readFile(absolutePath, 'utf-8');
  } catch (error) {
    throw new Error(`无法读取 cookie JSON 文件: ${error.message}`, { cause: error });
  }

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (error) {
    throw new Error(`cookie JSON 解析失败: ${error.message}`, { cause: error });
  }

  if (!Array.isArray(parsed)) {
    throw new Error('cookie JSON 必须是数组格式');
  }

  const cookies = parsed
    .filter(item => item && typeof item.name === 'string' && item.value !== undefined)
    .map(item => {
      const cookie = {
        name: item.name,
        value: typeof item.value === 'string' ? item.value : String(item.value ?? ''),
        domain: normalizeCookieDomain(item.domain),
        path: item.path || '/',
        secure: Boolean(item.secure),
        httpOnly: Boolean(item.httpOnly)
      };

      const sameSite = normalizeCookieSameSite(item.sameSite);
      if (sameSite) {
        cookie.sameSite = sameSite;
      }

      // Attach the expiry in the same pass (no index re-alignment needed) and
      // drop values that are not numeric instead of storing NaN.
      const rawExpiry = item.expires || item.expirationDate;
      if (rawExpiry) {
        const expires = Math.floor(Number(rawExpiry));
        if (Number.isFinite(expires)) {
          cookie.expires = expires;
        }
      }

      return cookie;
    });

  if (cookies.length === 0) {
    throw new Error('cookie JSON 中没有有效的 cookie 项');
  }

  const cookieHeader = cookies.map(cookie => `${cookie.name}=${cookie.value}`).join('; ');

  return { cookieHeader, cookies, absolutePath };
}
251
580
 
252
581
  // 解析 cookie 字符串
253
582
  function parseCookies(cookieString) {
@@ -272,96 +601,308 @@ function normalizeArticleHtml(html = '') {
272
601
  .replace(/href='\/\//gi, "href='https://");
273
602
  }
274
603
 
275
- async function fetchArticleData(context, articleId) {
276
- const maxAttempts = 3;
277
- const refererUrl = `${GEEKTIME_BASE_URL}/column/article/${articleId}`;
278
- let lastError = null;
604
/**
 * Turns an `<img src>` value into an absolute URL that can be fetched.
 *
 * @param {string} [rawSrc=''] - Raw attribute value.
 * @returns {string | null} Absolute URL, or `null` for empty values,
 *   `data:`/`blob:` sources, and values `new URL` cannot resolve.
 */
function resolveImageUrl(rawSrc = '') {
  const src = rawSrc ? rawSrc.trim() : '';

  // Nothing to fetch: empty value or content that is already inline.
  if (src === '' || src.startsWith('data:') || src.startsWith('blob:')) {
    return null;
  }

  // Protocol-relative URL.
  if (src.startsWith('//')) {
    return `https:${src}`;
  }

  // Site-root-relative path.
  if (src.startsWith('/')) {
    return `${GEEKTIME_BASE_URL}${src}`;
  }

  // Already absolute http(s); returned untouched.
  if (/^https?:/i.test(src)) {
    return src;
  }

  // Anything else is resolved against the site base URL.
  try {
    return new URL(src, GEEKTIME_BASE_URL).toString();
  } catch {
    return null;
  }
}
279
625
 
280
- for (let attempt = 1; attempt <= maxAttempts; attempt++) {
626
/**
 * Downloads a binary resource through `context.request.get`.
 *
 * Sends the default user agent, an image-oriented `accept` header, the site
 * base URL as referer and, when set, the module-level cookie header.
 *
 * @param {object} context - Object exposing `request.get(url, options)`.
 * @param {string} url - Absolute URL to download.
 * @returns {Promise<{buffer: *, contentType: string, finalUrl: string}>}
 *   Response body, its `content-type` header ('' when absent) and `response.url()`.
 * @throws {Error} `HTTP <status> <statusText>` when the response is not OK.
 */
async function fetchBinaryWithContext(context, url) {
  const requestHeaders = {
    'user-agent': DEFAULT_USER_AGENT,
    'accept': 'image/avif,image/webp,image/apng,image/*,*/*;q=0.8',
    'referer': GEEKTIME_BASE_URL
  };
  if (globalCookieHeader) {
    requestHeaders.cookie = globalCookieHeader;
  }

  const response = await context.request.get(url, {
    headers: requestHeaders,
    failOnStatusCode: true
  });
  if (!response.ok()) {
    throw new Error(`HTTP ${response.status()} ${response.statusText()}`);
  }

  const buffer = await response.body();
  const contentType = response.headers()['content-type'] || '';
  return { buffer, contentType, finalUrl: response.url() };
}
645
+
646
+ function determineImageExtension(resourceUrl = '', contentType = '') {
647
+ let ext = '';
648
+ if (resourceUrl) {
281
649
  try {
282
- const response = await context.request.post(ARTICLE_API_URL, {
283
- headers: {
284
- 'user-agent': DEFAULT_USER_AGENT,
285
- 'content-type': 'application/json',
286
- 'accept': 'application/json, text/plain, */*',
287
- 'origin': GEEKTIME_BASE_URL,
288
- 'referer': refererUrl,
289
- 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
290
- ...(globalCookieHeader ? { 'cookie': globalCookieHeader } : {})
291
- },
292
- data: {
293
- id: String(articleId),
294
- include_neighbors: true,
295
- is_freelyread: true
296
- }
297
- });
650
+ const { pathname } = new URL(resourceUrl);
651
+ ext = path.extname(pathname).replace('.', '');
652
+ } catch {
653
+ ext = '';
654
+ }
655
+ }
656
+ if (!ext && contentType) {
657
+ ext = (mime.extension(contentType) || '').toString();
658
+ }
659
+ if (!ext) {
660
+ ext = 'bin';
661
+ }
662
+ return ext.toLowerCase();
663
+ }
298
664
 
299
- const bodyText = await response.text();
665
/**
 * Downloads one image and stores it in `assetsDir`.
 *
 * The file is named `article_<NNN>_<hash>.<ext>`, where `NNN` is the 1-based,
 * zero-padded article index and `hash` is the first 10 hex characters of the
 * MD5 of `normalizedUrl` (used only to derive a file name).
 *
 * @param {object} context - Passed through to `fetchBinaryWithContext`.
 * @param {string} normalizedUrl - Absolute image URL.
 * @param {string} assetsDir - Directory that receives the file.
 * @param {number} articleIndex - Zero-based index of the owning article.
 * @returns {Promise<{fileUrl: string, localPath: string}>} `file://` URL and filesystem path of the saved image.
 */
async function downloadImageToLocal(context, normalizedUrl, assetsDir, articleIndex) {
  const fetched = await fetchBinaryWithContext(context, normalizedUrl);
  // Prefer the URL reported by the response when choosing the extension.
  const extension = determineImageExtension(fetched.finalUrl || normalizedUrl, fetched.contentType);

  const urlDigest = crypto.createHash('md5').update(normalizedUrl).digest('hex').slice(0, 10);
  const articleTag = String(articleIndex + 1).padStart(3, '0');
  const localPath = path.join(assetsDir, `article_${articleTag}_${urlDigest}.${extension}`);

  await fs.writeFile(localPath, fetched.buffer);

  return {
    fileUrl: pathToFileURL(localPath).href,
    localPath
  };
}
300
677
 
301
- if (!response.ok()) {
302
- throw new Error(`API请求失败: ${response.status()} ${response.statusText()} - ${bodyText.slice(0, 160)}`);
303
- }
678
/**
 * Converts a SameSite value into the lowercase vocabulary written to the
 * cookie JSON file: `strict`, `lax`, `no_restriction` or `unspecified`.
 *
 * @param {unknown} value - SameSite value of a cookie (any casing).
 * @returns {'strict' | 'lax' | 'no_restriction' | 'unspecified'} Export value;
 *   `unspecified` for empty or unrecognised input.
 */
function mapSameSiteForExport(value) {
  if (!value) return 'unspecified';
  const lower = value.toString().toLowerCase();
  // 'no_restriction' contains the substring 'strict', so recognise it first;
  // this keeps the mapping stable when fed an already exported value.
  if (lower.includes('no_restriction')) return 'no_restriction';
  if (lower.includes('strict')) return 'strict';
  if (lower.includes('lax')) return 'lax';
  if (lower.includes('none')) return 'no_restriction';
  return 'unspecified';
}
686
+
687
/**
 * Rebuilds the module-level `globalCookieHeader` from `context.cookies()`.
 *
 * Best effort: a missing context, an empty cookie list, or any error while
 * reading cookies leaves the current header unchanged.
 *
 * @param {object} context - Object exposing an async `cookies()` method.
 * @returns {Promise<void>}
 */
async function updateGlobalCookieHeaderFromContext(context) {
  if (!context) {
    return;
  }
  try {
    const jar = await context.cookies();
    const pairs = (jar || []).map(({ name, value }) => `${name}=${value}`);
    const joined = pairs.join('; ');
    // An empty result never overwrites an existing header.
    if (joined) {
      globalCookieHeader = joined;
    }
  } catch {
    // ignore
  }
}
702
+
703
/**
 * Writes the cookies from `context.cookies()` to `targetPath` as a JSON array
 * in browser-extension export shape (`expirationDate`, `hostOnly`, `session`,
 * `storeId`, ...).
 *
 * Best effort: nothing is written when the context, the target path, or the
 * cookie list is missing/empty; failures are logged as a warning, not thrown.
 *
 * @param {object} context - Object exposing an async `cookies()` method.
 * @param {string} targetPath - Destination JSON file.
 * @returns {Promise<void>}
 */
async function persistCookiesToFile(context, targetPath) {
  if (!context || !targetPath) return;
  try {
    const cookies = await context.cookies();
    if (!cookies || cookies.length === 0) {
      return;
    }
    const serialized = cookies.map(cookie => {
      // Playwright reports session cookies with `expires: -1`, which is truthy;
      // only a positive timestamp counts as a real expiry.
      const hasExpiry = typeof cookie.expires === 'number' && cookie.expires > 0;
      return {
        domain: cookie.domain,
        expirationDate: hasExpiry ? cookie.expires : undefined,
        hostOnly: !cookie.domain.startsWith('.'),
        httpOnly: cookie.httpOnly,
        name: cookie.name,
        path: cookie.path,
        sameSite: mapSameSiteForExport(cookie.sameSite),
        secure: cookie.secure,
        session: !hasExpiry,
        storeId: '0',
        value: cookie.value
      };
    });
    await fs.writeFile(targetPath, JSON.stringify(serialized, null, 2), 'utf-8');
    console.log(chalk.gray(`🍪 已刷新 Cookie → ${targetPath}`));
  } catch (error) {
    console.log(chalk.yellow(`⚠️ 保存 Cookie 失败: ${error.message}`));
  }
}
304
729
 
305
- let json;
730
/**
 * Decodes a base64 `data:` URI and writes it to `assetsDir`.
 *
 * The file is named `article_<NNN>_inline_<MMM>.<ext>`: `NNN` is the 1-based
 * zero-padded article index, `MMM` the zero-padded `dataIndex`, and `ext` is
 * derived from the MIME type (`bin` when unknown).
 *
 * @param {string} dataUri - `data:<mime>;base64,<payload>` string.
 * @param {string} assetsDir - Directory that receives the file.
 * @param {number} articleIndex - Zero-based index of the owning article.
 * @param {number} dataIndex - Index of the inline image within the article.
 * @returns {Promise<string | null>} `file://` URL of the saved image, or `null`
 *   when the input is not a base64 data URI or decodes to nothing.
 */
async function saveDataUriImage(dataUri, assetsDir, articleIndex, dataIndex) {
  if (typeof dataUri !== 'string' || dataUri === '') {
    return null;
  }

  const parts = /^data:(.+?);base64,(.+)$/i.exec(dataUri);
  if (parts === null) {
    return null;
  }
  const mimeType = parts[1] || 'application/octet-stream';
  const payload = parts[2];

  let decoded;
  try {
    decoded = Buffer.from(payload, 'base64');
  } catch {
    return null;
  }
  // A payload with no valid base64 characters decodes to an empty buffer.
  if (!decoded || decoded.length === 0) {
    return null;
  }

  const extension = mime.extension(mimeType) || 'bin';
  const articleTag = String(articleIndex + 1).padStart(3, '0');
  const inlineTag = String(dataIndex).padStart(3, '0');
  const target = path.join(assetsDir, `article_${articleTag}_inline_${inlineTag}.${extension}`);

  await fs.writeFile(target, decoded);
  return pathToFileURL(target).href;
}
755
+
756
+ async function rewriteImagesWithLocalFiles(context, htmlContent, assetsDir, articleIndex, sharedCache) {
757
+ if (!htmlContent || htmlContent.indexOf('<img') === -1) {
758
+ return { html: htmlContent, replaced: 0 };
759
+ }
760
+
761
+ const $ = loadHtml(htmlContent, { decodeEntities: false });
762
+ const images = $('img');
763
+ if (images.length === 0) {
764
+ return { html: htmlContent, replaced: 0 };
765
+ }
766
+
767
+ const pendingDownloads = new Map();
768
+ const dataUriImages = [];
769
+
770
+ images.each((_, element) => {
771
+ const originalSrc = $(element).attr('src') || '';
772
+ if (/^data:/i.test(originalSrc.trim())) {
773
+ dataUriImages.push({ element, src: originalSrc.trim() });
774
+ return;
775
+ }
776
+ const normalizedUrl = resolveImageUrl(originalSrc);
777
+ if (!normalizedUrl) {
778
+ return;
779
+ }
780
+ if (sharedCache.has(normalizedUrl)) {
781
+ return;
782
+ }
783
+ if (!pendingDownloads.has(normalizedUrl)) {
784
+ pendingDownloads.set(normalizedUrl, null);
785
+ }
786
+ });
787
+
788
+ const downloadTargets = Array.from(pendingDownloads.keys());
789
+ for (let i = 0; i < downloadTargets.length; i += EPUB_IMAGE_BATCH_SIZE) {
790
+ const batch = downloadTargets.slice(i, i + EPUB_IMAGE_BATCH_SIZE).map(async (targetUrl) => {
306
791
  try {
307
- json = JSON.parse(bodyText);
308
- } catch (parseError) {
309
- throw new Error(`API响应解析失败: ${parseError.message} - ${bodyText.slice(0, 160)}`);
792
+ const info = await downloadImageToLocal(context, targetUrl, assetsDir, articleIndex);
793
+ sharedCache.set(targetUrl, info.fileUrl);
794
+ pendingDownloads.set(targetUrl, info.fileUrl);
795
+ } catch (error) {
796
+ console.log(chalk.yellow(` ⚠️ 图片下载失败: ${targetUrl} (${error.message})`));
797
+ pendingDownloads.set(targetUrl, null);
310
798
  }
799
+ });
800
+ await Promise.all(batch);
801
+ }
311
802
 
312
- if (!json || json.code !== 0 || !json.data) {
313
- throw new Error(`无法获取完整文章内容: ${bodyText.slice(0, 160)}`);
314
- }
803
+ images.each((_, element) => {
804
+ const originalSrc = $(element).attr('src') || '';
805
+ if (/^data:/i.test(originalSrc.trim())) {
806
+ return;
807
+ }
808
+ const normalizedUrl = resolveImageUrl(originalSrc);
809
+ if (!normalizedUrl) {
810
+ return;
811
+ }
812
+ const localUrl = sharedCache.get(normalizedUrl) || pendingDownloads.get(normalizedUrl);
813
+ if (localUrl) {
814
+ $(element).attr('src', localUrl);
815
+ }
816
+ });
315
817
 
316
- if (!json.data.article_content) {
317
- throw new Error('文章内容为空,可能需要更新 Cookie 或重新获取权限');
818
+ let processedInlineImages = 0;
819
+ for (let i = 0; i < dataUriImages.length; i++) {
820
+ const item = dataUriImages[i];
821
+ try {
822
+ const localUrl = await saveDataUriImage(item.src, assetsDir, articleIndex, i);
823
+ if (localUrl) {
824
+ $(item.element).attr('src', localUrl);
825
+ processedInlineImages++;
826
+ } else {
827
+ $(item.element).remove();
318
828
  }
319
-
320
- return json.data;
321
829
  } catch (error) {
322
- lastError = error;
323
- if (attempt < maxAttempts) {
324
- await new Promise(resolve => setTimeout(resolve, attempt * 700));
830
+ console.log(chalk.yellow(` ⚠️ 内联图片处理失败: ${error.message}`));
831
+ $(item.element).remove();
832
+ }
833
+ }
834
+
835
+ const finalHtml = $.root().html() || htmlContent;
836
+
837
+ return {
838
+ html: finalHtml,
839
+ replaced: downloadTargets.length + processedInlineImages
840
+ };
841
+ }
842
+
843
+ async function rewriteEpubContentImages(context, contentResults, assetsDir) {
844
+ const cache = new Map();
845
+ let processedArticles = 0;
846
+ let processedImages = 0;
847
+
848
+ const spinner = ora('正在缓存 EPUB 图片...').start();
849
+
850
+ const updatedResults = [];
851
+ for (let i = 0; i < contentResults.length; i++) {
852
+ const result = contentResults[i];
853
+ if (!result || !result.success || !result.content) {
854
+ updatedResults.push(result);
855
+ continue;
856
+ }
857
+ try {
858
+ const { html, replaced } = await rewriteImagesWithLocalFiles(context, result.content, assetsDir, i, cache);
859
+ processedImages += replaced;
860
+ if (replaced > 0) {
861
+ processedArticles++;
325
862
  }
863
+ updatedResults.push({ ...result, content: html });
864
+ } catch (error) {
865
+ spinner.stop();
866
+ console.log(chalk.yellow(`⚠️ 处理第 ${i + 1} 篇文章图片失败: ${error.message}`));
867
+ spinner.start();
868
+ updatedResults.push(result);
326
869
  }
327
870
  }
328
871
 
329
- throw lastError || new Error('未知错误导致文章内容获取失败');
872
+ if (processedImages === 0) {
873
+ spinner.stop();
874
+ console.log(chalk.gray('📷 没有检测到需要缓存的图片'));
875
+ } else {
876
+ spinner.succeed(`已缓存 EPUB 图片: ${processedImages} 张(${processedArticles} 篇文章)`);
877
+ }
878
+
879
+ return updatedResults;
880
+ }
881
+
882
/**
 * Creates a uniquely named asset directory under `baseDir`.
 *
 * The name is `<TEMP_ASSET_PREFIX>_<base36 timestamp>_<random hex>`.
 *
 * @param {string} baseDir - Parent directory.
 * @returns {Promise<string>} Path of the created directory.
 */
async function createTempAssetsDir(baseDir) {
  const stamp = Date.now().toString(36);
  const nonce = Math.random().toString(16).slice(2, 8);
  const target = path.join(baseDir, `${TEMP_ASSET_PREFIX}_${stamp}_${nonce}`);
  // `recursive` also creates missing parents.
  await fs.mkdir(target, { recursive: true });
  return target;
}
887
+
888
/**
 * Recursively removes a temporary asset directory.
 *
 * Best effort: a falsy `dir` is a no-op and a removal failure is only logged.
 *
 * @param {string} [dir] - Directory to delete.
 * @returns {Promise<void>}
 */
async function cleanupTempAssetsDir(dir) {
  if (dir) {
    try {
      // `force` makes a missing directory a no-op rather than an error.
      await fs.rm(dir, { force: true, recursive: true });
    } catch (cleanupError) {
      console.log(chalk.gray(`清理临时目录失败: ${cleanupError.message}`));
    }
  }
}
331
896
 
332
897
  async function sanitizeArticleHtml(page, rawHtml) {
333
- return page.evaluate((html) => {
898
+ return page.evaluate(({ html, removalSelectors, pluginKeywords, mindmapSelectors }) => {
334
899
  const template = document.createElement('template');
335
900
  template.innerHTML = html;
336
901
 
337
- const removalSelectors = [
338
- 'nav', 'header', 'footer', 'aside',
339
- '.comment', '.comments', '.Index_comment',
340
- '.recommend', '.recommendation', '.related', '.advertisement', '.ad', '.banner',
341
- '.subscribe', '.subscription', '.toolbar', '.Index_shareIcons_1vtJa',
342
- '.keyboard-wrapper', '.app-download', '.article-actions', '.article-bottom',
343
- '.note', '.notes', '.annotation', '.translation', '.trans', '.translator',
344
- '.audio', '.audio-player', '.voice', '.player', '.geek-player', '.podcast', '.radio',
345
- '.reward', '.appreciate', '.appreciation', '.donate', '.sponsor', '.thanks', '.support',
346
- '.qrcode', '.qr-code', '.qr', '.promotion', '.promo', '.ad-banner',
347
- '.copyright', '.statement', '.disclaimer',
348
- '.app-download-banner', '.article-plugin', '.article-notification', '.float-bar',
349
- 'audio', 'video',
350
- '[class*="Note"]', '[class*="note"]', '[class*="Translation"]', '[class*="translation"]',
351
- '[class*="Audio"]', '[class*="audio"]', '[class*="Reward"]', '[class*="reward"]',
352
- '[data-plugin]', '[data-track]', '[data-track-section]', '[data-translation]', '[data-audio]',
353
- '[data-role="toolbar"]',
354
- 'button', 'iframe', 'script', 'style'
355
- ];
356
902
  removalSelectors.forEach(selector => {
357
903
  template.content.querySelectorAll(selector).forEach(el => el.remove());
358
904
  });
359
905
 
360
- const pluginKeywords = [
361
- 'note', 'translation', 'audio', 'player', 'reward', 'donate',
362
- 'appreciation', 'sponsor', 'qrcode', 'toolbar', 'plugin',
363
- 'copyright', 'geeknote', 'bilingual'
364
- ];
365
906
  const pluginElements = Array.from(template.content.querySelectorAll('*')).filter(el => {
366
907
  const className = (el.className || '').toString().toLowerCase();
367
908
  const idValue = (el.id || '').toString().toLowerCase();
@@ -372,11 +913,6 @@ async function sanitizeArticleHtml(page, rawHtml) {
372
913
  });
373
914
  pluginElements.forEach(el => el.remove());
374
915
 
375
- const mindmapSelectors = [
376
- '.mindmap', '.mind-map', '.MindMap', '.Mind-map',
377
- '[data-type="mindmap"]', '[data-role="mindmap"]', '[data-widget="mindmap"]',
378
- '[class*="MindMap"]', '[class*="mindMap"]'
379
- ];
380
916
  mindmapSelectors.forEach(selector => {
381
917
  template.content.querySelectorAll(selector).forEach(el => el.remove());
382
918
  });
@@ -435,7 +971,16 @@ async function sanitizeArticleHtml(page, rawHtml) {
435
971
  });
436
972
 
437
973
  return template.innerHTML;
438
- }, rawHtml);
974
+ }, {
975
+ html: rawHtml,
976
+ removalSelectors: ARTICLE_REMOVAL_SELECTORS,
977
+ pluginKeywords: ARTICLE_PLUGIN_KEYWORDS,
978
+ mindmapSelectors: ARTICLE_MINDMAP_SELECTORS
979
+ });
980
+ }
981
+
982
/**
 * Collapses every run of whitespace into a single space and strips leading
 * and trailing whitespace.
 *
 * @param {string} [text=''] - Text to normalise.
 * @returns {string} Normalised text.
 */
function normalizeTextContent(text = '') {
  const words = text.split(/\s+/).filter(Boolean);
  return words.join(' ');
}
440
985
 
441
986
  function escapeHtml(text = '') {
@@ -447,59 +992,650 @@ function escapeHtml(text = '') {
447
992
  .replace(/'/g, '&#39;');
448
993
  }
449
994
 
450
- function buildPrintableHtml(title, sanitizedHtml) {
451
- const baseCss = `
452
- body {
453
- font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
454
- font-size: 16px;
455
- line-height: 1.8;
456
- color: #1f2329;
457
- margin: 0;
458
- padding: 40px;
459
- background: #fff;
460
- }
461
-
462
- .article-print-wrapper {
463
- max-width: 900px;
464
- margin: 0 auto;
465
- }
466
-
467
- .article-print-wrapper h1 {
468
- font-size: 32px;
469
- line-height: 1.4;
470
- margin-bottom: 24px;
471
- }
472
-
473
- a {
474
- color: #0f5ef2;
475
- text-decoration: none;
476
- }
477
-
478
- pre {
479
- background: #f7f7f7;
480
- padding: 16px;
481
- border-radius: 6px;
482
- overflow: auto;
995
/**
 * Drop the first h1/h2 of an article fragment when its text equals the
 * article title, so the title is not printed twice.
 *
 * @param {string} html - Article HTML fragment.
 * @param {string} [title=''] - Article title to compare against.
 * @returns {string} The re-serialized HTML, or the input unchanged when
 *   either argument is empty or parsing fails.
 */
function removeDuplicateTitle(html, title = '') {
  const expectedTitle = html && title ? normalizeTextContent(title) : '';
  if (!expectedTitle) {
    return html;
  }

  try {
    const $ = loadHtml(html, { decodeEntities: false });
    const $heading = $('h1, h2').first();
    const isDuplicate =
      $heading.length > 0 && normalizeTextContent($heading.text()) === expectedTitle;
    if (isDuplicate) {
      $heading.remove();
    }
    return $.root().html() || html;
  } catch {
    // Best effort: fall back to the untouched markup.
    return html;
  }
}
485
1017
 
1018
/**
 * Assemble the standalone HTML document that is rendered to PDF.
 *
 * @param {string} title - Article title; HTML-escaped before insertion.
 * @param {string} sanitizedHtml - Article body markup, inserted as-is.
 * @param {string} [articleMeta=''] - Optional meta line shown under the
 *   title; HTML-escaped, omitted entirely when empty.
 * @returns {string} Complete HTML document source.
 */
function buildPdfHtml(title, sanitizedHtml, articleMeta = '') {
  const metaBlock = articleMeta
    ? `<div class="article-meta">${escapeHtml(articleMeta)}</div>`
    : '';
  const styleSheet = `${PDF_BASE_CSS}${PRINT_FIX_CSS}${CODE_HIGHLIGHT_CSS}`;

  return `
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<base href="${GEEKTIME_BASE_URL}">
<style>${styleSheet}</style>
</head>
<body>
<article class="article-pdf-wrapper">
<section class="article-content">
<h1 class="article-title">${escapeHtml(title)}</h1>
${metaBlock}
${sanitizedHtml}
</section>
</article>
</body>
</html>`;
}
502
1038
 
1039
/**
 * Rewrite code samples in an article fragment into a uniform
 * `<pre class="code-block"><code>…</code></pre>` shape.
 *
 * Passes, in order:
 *  1. block-like `<code>` elements not already inside `<pre>` are wrapped;
 *  2. every `<pre>` gets the `code-block` class and an inner `<code>`;
 *  3. elements whose class looks code-related and that hold no block
 *     children are wrapped when their text passes `isLikelyCodeText`;
 *  4. a `<figure>` whose only child is a `<pre>` is replaced by that `<pre>`;
 *  5. highlighter containers are flattened to plain text with line breaks;
 *  6. simplebar wrapper elements are unwrapped and scrollbar tracks removed.
 *
 * @param {string} html - Article HTML fragment.
 * @returns {string} The transformed HTML; the input is returned unchanged
 *   when it is empty or when any step throws.
 */
function enhanceCodeBlocks(html) {
  if (!html) return html;
  try {
    const $ = loadHtml(html, { decodeEntities: false });

    // Replace $source with <pre class="code-block"><code>innerHtml</code></pre>.
    const wrapCodeElement = ($source, innerHtml) => {
      const wrapper = $('<pre class="code-block"></pre>');
      const codeEl = $('<code></code>').html(innerHtml);
      wrapper.append(codeEl);
      $source.replaceWith(wrapper);
    };

    $('code').each((_, element) => {
      const $el = $(element);
      const parent = $el.parent();
      const text = $el.text() || '';
      const isBlocky = text.includes('\n') || text.length > 120 || $el.html().includes('<br');
      // Compare the parent tag case-insensitively (as the rest of this
      // function does) so a <code> already inside <pre> is never wrapped
      // in a second <pre>.
      const parentTag = (parent[0]?.tagName || parent[0]?.name || '').toLowerCase();
      if (isBlocky && parent.length && parentTag !== 'pre') {
        wrapCodeElement($el, $el.html());
      }
    });

    $('pre').each((_, element) => {
      const $el = $(element);
      if (!$el.hasClass('code-block')) {
        $el.addClass('code-block');
      }
      if ($el.find('code').length === 0) {
        const text = $el.html();
        $el.empty().append($('<code></code>').html(text));
      }
    });

    const codeLikeSelectors = [
      '[class*="code"]',
      '[class*="Code"]',
      '[class*="code-block"]',
      '[class*="CodeBlock"]',
      '[class*="hljs"]',
      '[class*="language-"]',
      '.highlight',
      '.prism-code'
    ];
    const blockTags = ['P', 'DIV', 'SECTION', 'ARTICLE', 'UL', 'OL', 'TABLE', 'IMG', 'FIGURE'];
    const isLikelyCodeText = (text = '') => {
      const trimmed = text.trim();
      if (trimmed.length === 0) return false;
      if (trimmed.length > 1200) return false;
      return trimmed.includes('\n') || trimmed.includes('{') || trimmed.includes(';') || trimmed.includes(' ');
    };
    $(codeLikeSelectors.join(',')).each((_, element) => {
      const $el = $(element);
      // Skip anything that is, or already contains, a <pre>.
      if ($el.is('pre') || $el.find('pre').length > 0) {
        return;
      }
      const hasBlockChildren = blockTags.some(tag => $el.find(tag).length > 0);
      if (hasBlockChildren) {
        return;
      }
      const text = $el.text() || '';
      if (!isLikelyCodeText(text)) {
        return;
      }
      wrapCodeElement($el, $el.html());
    });

    $('figure').each((_, element) => {
      const $el = $(element);
      if ($el.find('pre').length === 1 && $el.children().length === 1) {
        $el.replaceWith($el.find('pre').first());
      }
    });

    const highlightSelectors = [
      '[class*="hljs"]',
      '[class*="language-"]',
      '.simplebar-content',
      '[data-language]',
      '[data-code-block]',
      '[class*="RichContent"]'
    ];
    const containerClassHints = ['simplebar', 'code', 'hljs', 'prism', 'syntax', 'monaco', 'ace', 'terminal', 'shell'];
    const containerStyleHints = ['white-space: pre', 'white-space:pre', 'font-family: monospace', 'font-family:monospace'];
    const inlineTags = new Set(['span', 'code', 'em', 'strong', 'b', 'i', 'u', 'a', 'label']);
    const newlineTags = new Set(['DIV', 'P', 'LI', 'SECTION', 'ARTICLE', 'FIGURE', 'PRE', 'CODE', 'BR', 'TR', 'TD', 'TH']);
    const looksLikeCodeBlock = (text = '') => {
      if (!text) return false;
      const trimmed = text.trim();
      if (!trimmed) return false;
      if (trimmed.includes('\n')) return true;
      const keywords = ['{', '}', ';', '=>', '->', '#!', 'SELECT ', 'INSERT ', 'docker ', 'kubectl ', 'sudo ', 'printf', 'def ', 'class ', 'function ', 'const ', 'let ', 'var ', 'public ', 'private ', 'import ', 'package ', 'namespace ', 'http '];
      return keywords.some(keyword => trimmed.includes(keyword));
    };
    // Concatenate descendant text, appending '\n' after every element whose
    // tag is listed in newlineTags (childless ones contribute just '\n').
    const getTextWithBreaks = (node) => {
      if (!node) return '';
      if (node.type === 'text') {
        return node.data || '';
      }
      if (!node.children || node.children.length === 0) {
        return newlineTags.has((node.tagName || node.name || '').toUpperCase()) ? '\n' : '';
      }
      let text = '';
      for (const child of node.children) {
        text += getTextWithBreaks(child);
      }
      if (newlineTags.has((node.tagName || node.name || '').toUpperCase())) {
        text += '\n';
      }
      return text;
    };
    // Unify line endings, replace NBSP and tabs, strip trailing whitespace,
    // drop leading/trailing blank lines and collapse consecutive blank lines.
    const normalizeCodeText = (text = '') => {
      const lines = text
        .replace(/\r\n?/g, '\n')
        .split('\n')
        .map(line => line.replace(/\u00a0/g, ' ').replace(/\t/g, ' ').replace(/\s+$/, ''));
      while (lines.length && !lines[0].trim()) {
        lines.shift();
      }
      while (lines.length && !lines[lines.length - 1].trim()) {
        lines.pop();
      }
      const result = [];
      let previousBlank = false;
      for (const line of lines) {
        const isBlank = line.trim().length === 0;
        if (isBlank && previousBlank) {
          continue;
        }
        result.push(line);
        previousBlank = isBlank;
      }
      return result.join('\n').trim();
    };
    // Replace $target with a plain-text code block; returns whether it did.
    const convertToCodeBlock = ($target) => {
      if (!$target || !$target.length) {
        return false;
      }
      const rawText = getTextWithBreaks($target[0]) || '';
      const normalized = normalizeCodeText(rawText);
      if (!looksLikeCodeBlock(normalized)) {
        return false;
      }
      const $pre = $('<pre class="code-block"></pre>');
      const $code = $('<code></code>').text(normalized);
      $pre.append($code);
      $target.replaceWith($pre);
      return true;
    };
    const processedCandidates = new Set();
    $(highlightSelectors.join(',')).each((_, node) => {
      const $start = $(node);
      if (!$start || !$start.length) {
        return;
      }
      // Walk up at most 8 levels and keep the outermost non-inline ancestor
      // (or the node itself) carrying a code-related class or style hint.
      let $candidate = null;
      let $current = $start;
      for (let depth = 0; depth < 8 && $current && $current.length; depth++) {
        const rawTag = ($current[0]?.tagName || $current[0]?.name || '').toLowerCase();
        const classAttr = ($current.attr('class') || '').toLowerCase();
        const styleAttr = ($current.attr('style') || '').toLowerCase();
        const hasClassHint = containerClassHints.some(keyword => classAttr.includes(keyword));
        const hasStyleHint = containerStyleHints.some(keyword => styleAttr.includes(keyword));
        if (!inlineTags.has(rawTag) && (hasClassHint || hasStyleHint)) {
          $candidate = $current;
        }
        $current = $current.parent();
      }
      if (!$candidate || !$candidate.length || $candidate.is('pre')) {
        return;
      }
      const key = $candidate[0];
      if (processedCandidates.has(key)) {
        return;
      }
      if (convertToCodeBlock($candidate)) {
        processedCandidates.add(key);
      }
    });

    const simplebarWrappers = [
      '.simplebar-wrapper',
      '.simplebar-height-auto-observer-wrapper',
      '.simplebar-height-auto-observer',
      '.simplebar-mask',
      '.simplebar-offset',
      '.simplebar-content-wrapper',
      '.simplebar-placeholder'
    ];
    simplebarWrappers.forEach(selector => {
      $(selector).each((_, element) => {
        const $el = $(element);
        // Unwrap wrappers that hold a code block or have no visible text.
        if ($el.find('pre.code-block').length > 0 || !$el.text().trim()) {
          $el.replaceWith($el.contents());
        }
      });
    });
    $('.simplebar-track, .simplebar-scrollbar').remove();

    return $.root().html() || html;
  } catch {
    // Best effort: any failure leaves the article markup untouched.
    return html;
  }
}
1240
+
1241
/**
 * Scan the visible text of the current page for login, purchase or
 * permission prompts.
 *
 * @param {object} page - Page handle exposing `evaluate(fn)`.
 * @returns {Promise<string|null>} The message of the first matching check,
 *   or null when the page has no text or nothing matches.
 */
async function detectAccessIssuesOnPage(page) {
  return page.evaluate(() => {
    const rawText = (document.body && document.body.innerText) || '';
    const haystack = rawText.replace(/\s+/g, ' ').trim().toLowerCase();
    if (!haystack) {
      return null;
    }

    // Ordered by priority: the first rule with a matching keyword wins.
    const rules = [
      {
        keywords: ['请先登录', '重新登录', '立即登录', '登录后'],
        message: '页面提示需要登录,Cookie 可能已失效或未正确导入'
      },
      {
        keywords: ['试看结束', '购买专栏', '立即订阅', '购买课程', '仅对付费用户开放', '开通会员'],
        message: '检测到购买/试看提示,可能未订阅该专栏或 Cookie 已失效'
      },
      {
        keywords: ['暂无权限', '没有权限', '权限不足'],
        message: '账号没有访问该专栏的权限'
      }
    ];

    const hit = rules.find(rule =>
      rule.keywords.some(keyword => haystack.includes(keyword.toLowerCase()))
    );
    return hit ? hit.message : null;
  });
}
1278
+
1279
/**
 * Poll the page until one of ARTICLE_CONTENT_SELECTORS matches an element.
 *
 * @param {object} page - Page handle exposing `$` and `waitForTimeout`.
 * @param {number} [timeout=60000] - Polling budget in milliseconds.
 * @returns {Promise<string|null>} The first selector that matched, or null
 *   when the budget ran out.
 */
async function waitForArticleContentSelector(page, timeout = 60000) {
  const deadline = Date.now() + timeout;

  while (Date.now() < deadline) {
    for (const candidate of ARTICLE_CONTENT_SELECTORS) {
      const element = await page.$(candidate);
      if (!element) {
        continue;
      }
      // Only existence matters; release the handle straight away.
      await element.dispose();
      return candidate;
    }
    await page.waitForTimeout(300);
  }

  return null;
}
1293
+
1294
/**
 * Scroll the page down in fixed steps, then jump back to the top.
 *
 * Scrolling stops once the viewport bottom is within 50px of the document
 * height or after `maxIterations` steps, whichever comes first.
 *
 * @param {object} page - Page handle exposing `evaluate(fn, arg)`.
 * @param {object} [options]
 * @param {number} [options.step=400] - Pixels scrolled per tick.
 * @param {number} [options.delay=120] - Milliseconds between ticks.
 * @param {number} [options.maxIterations=80] - Upper bound on ticks.
 * @returns {Promise<void>}
 */
async function autoScrollArticle(page, { step = 400, delay = 120, maxIterations = 80 } = {}) {
  // Runs inside the page, so it must not close over anything from this module.
  const scrollThroughPage = (settings) => new Promise((done) => {
    let ticks = 0;
    const handle = setInterval(() => {
      window.scrollBy(0, settings.step);
      ticks += 1;

      const atBottom = window.scrollY + window.innerHeight >= document.body.scrollHeight - 50;
      if (!atBottom && ticks < settings.maxIterations) {
        return;
      }

      clearInterval(handle);
      window.scrollTo(0, 0);
      done();
    }, settings.delay);
  });

  await page.evaluate(scrollThroughPage, { step, delay, maxIterations });
}
1311
+
1312
/**
 * Open an article page and return its cleaned body HTML.
 *
 * Steps: navigate, wait (best effort) for network idle, scroll through the
 * page, locate the content container, clone it and strip share/audio/comment
 * widgets, resolve image URLs, then run the result through
 * normalizeArticleHtml, sanitizeArticleHtml, removeDuplicateTitle and
 * enhanceCodeBlocks.
 *
 * @param {object} page - Browser page used for navigation and evaluation.
 * @param {object} article - Article descriptor; reads `url`, `id`,
 *   `originalTitle` and `title`.
 * @param {number} [timeout=60000] - Timeout in ms for navigation and for
 *   the content-selector wait.
 * @returns {Promise<string>} Cleaned article HTML.
 * @throws {Error} When navigation fails, the response is not OK, the content
 *   container cannot be found, or the extracted content is empty. Errors
 *   that wrap another error carry it as `cause`.
 */
async function fetchArticleContentFromPage(page, article, timeout = 60000) {
  const targetUrl = article.url || `${GEEKTIME_BASE_URL}/column/article/${article.id}`;
  let response;
  try {
    response = await page.goto(targetUrl, { waitUntil: 'domcontentloaded', timeout });
  } catch (error) {
    // Keep the original error reachable for debugging; the message is unchanged.
    throw new Error(`页面加载失败: ${error.message}`, { cause: error });
  }

  if (response && !response.ok()) {
    throw new Error(`页面响应异常: HTTP ${response.status()} ${response.statusText()}`);
  }

  try {
    await page.waitForLoadState('networkidle', { timeout: Math.min(10000, timeout) });
  } catch {
    // Some pages never reach network idle; a timeout here is not fatal.
  }

  await autoScrollArticle(page);
  await page.waitForTimeout(500);

  const selector = await waitForArticleContentSelector(page, timeout);
  if (!selector) {
    // Prefer a specific access diagnosis over the generic failure message.
    const issue = await detectAccessIssuesOnPage(page);
    if (issue) {
      throw new Error(issue);
    }
    throw new Error('未能定位到文章正文,请重试或检查 Cookie 是否有效');
  }

  let extraction;
  try {
    extraction = await page.$eval(selector, (el) => {
      // Work on a clone so the live page is left untouched.
      const clone = el.cloneNode(true);
      const removalSelectors = [
        '.article-share',
        '.article-actions',
        '.article-copyright',
        '.article-bottom',
        '.reward',
        '.share',
        '.Index_recommend',
        '.recommend',
        '.audio-player',
        '.AudioPlayer',
        '.voice-player',
        '.VoicePlayer',
        '.audio-wrapper',
        '.AudioWrapper',
        '.geek-player',
        '.Player',
        '.plugin',
        '.Plugin',
        '[data-widget="audio"]',
        '[data-widget="Audio"]',
        '[data-role="audio"]',
        '.comment-area',
        '.CommentArea',
        '.comment-wrapper',
        '.CommentWrapper',
        '#comments',
        '#comment',
        '.comments',
        '.Comments'
      ];
      removalSelectors.forEach(sel => {
        clone.querySelectorAll(sel).forEach(node => node.remove());
      });

      // Resolve a URL-ish attribute value to an absolute URL; returns ''
      // for empty, blob: or unparsable values, and data:/http(s) unchanged.
      const toAbsoluteUrl = (value) => {
        if (!value || typeof value !== 'string') {
          return '';
        }
        const trimmed = value.trim();
        if (!trimmed) {
          return '';
        }
        if (trimmed.startsWith('blob:')) {
          return '';
        }
        if (trimmed.startsWith('data:')) {
          return trimmed;
        }
        if (/^https?:/i.test(trimmed)) {
          return trimmed;
        }
        if (trimmed.startsWith('//')) {
          return `${location.protocol}${trimmed}`;
        }
        try {
          const url = new URL(trimmed, location.href);
          return url.href;
        } catch {
          return '';
        }
      };

      // Attributes consulted, in order, when `src` yields no usable URL.
      const imageFallbackAttrs = [
        'data-src',
        'data-original',
        'data-actualsrc',
        'data-url',
        'data-image',
        'data-origin',
        'data-thumbnail',
        'data-bigimgsrc',
        'data-download',
        'data-href'
      ];

      clone.querySelectorAll('img').forEach(img => {
        let finalSrc = toAbsoluteUrl(img.getAttribute('src'));
        if (!finalSrc) {
          for (const attr of imageFallbackAttrs) {
            const candidate = toAbsoluteUrl(img.getAttribute(attr));
            if (candidate) {
              finalSrc = candidate;
              break;
            }
          }
        }

        if (!finalSrc) {
          // No usable source at all: drop the image.
          img.remove();
        } else {
          img.setAttribute('src', finalSrc);
        }
      });

      const textLength = clone.innerText ? clone.innerText.trim().length : 0;
      return {
        html: clone.innerHTML,
        textLength
      };
    });
  } catch (error) {
    throw new Error(`读取文章内容失败: ${error.message}`, { cause: error });
  }

  // Fewer than 20 characters of text is treated as "no real content".
  if (!extraction || !extraction.html || extraction.textLength < 20) {
    const issue = await detectAccessIssuesOnPage(page);
    if (issue) {
      throw new Error(issue);
    }
    throw new Error('正文内容为空,可能是 Cookie 失效或只获取到试看内容');
  }

  const normalizedHtml = normalizeArticleHtml(extraction.html);
  const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);

  if (!sanitizedHtml || sanitizedHtml.trim().length === 0) {
    throw new Error('正文清洗后为空,可能是页面结构变化');
  }

  const cleaned = removeDuplicateTitle(sanitizedHtml, article.originalTitle || article.title || '');
  return enhanceCodeBlocks(cleaned);
}
1470
+
1471
/**
 * Decide from an error message whether fetching the article is worth
 * retrying.
 *
 * @param {string} [message=''] - Error message to classify.
 * @returns {boolean} true for empty messages and for messages without any
 *   login/subscription/permission keyword; false otherwise.
 */
function isRetryableContentError(message = '') {
  if (!message) {
    return true;
  }

  const haystack = message.toLowerCase();
  // Any of these points at an account problem that a retry cannot fix.
  const permanentMarkers = [
    'cookie', '登录', '登陆', '订阅', '试看', '权限', '购买', '未授权', '无权限'
  ];
  for (const marker of permanentMarkers) {
    if (haystack.includes(marker)) {
      return false;
    }
  }
  return true;
}
1479
+
1480
/**
 * Fetch an article's cleaned HTML, retrying failures that look transient.
 *
 * Between attempts the page is pointed at about:blank (errors ignored) and
 * the wait grows linearly: `delayMs * attemptNumber`.
 *
 * @param {object} page - Browser page reused for every attempt.
 * @param {object} article - Article descriptor passed through to the fetch.
 * @param {object} [options]
 * @param {number} [options.timeout=60000] - Per-attempt timeout in ms.
 * @param {number} [options.maxAttempts=3] - Maximum number of attempts.
 * @param {number} [options.delayMs=1500] - Base delay between attempts.
 * @returns {Promise<string>} Cleaned article HTML.
 * @throws {Error} The original error when it is not retryable or the last
 *   attempt failed.
 */
async function fetchArticleContentWithRetry(page, article, options = {}) {
  const { timeout = 60000, maxAttempts = 3, delayMs = 1500 } = options;

  let lastFailure = null;
  let attempt = 1;

  while (attempt <= maxAttempts) {
    try {
      if (attempt !== 1) {
        await page.waitForTimeout(400);
      }
      return await fetchArticleContentFromPage(page, article, timeout);
    } catch (failure) {
      lastFailure = failure;
      const reason = failure?.message || '';
      const canRetry = isRetryableContentError(reason);
      if (!canRetry || attempt === maxAttempts) {
        throw failure;
      }

      const backoff = delayMs * attempt;
      if (process.env.DEBUG) {
        console.log(chalk.gray(`重试文章 ${article.id} (第${attempt}次失败: ${reason}),等待 ${backoff}ms`));
      }

      try {
        // Reset the tab so the next attempt starts from a clean document.
        await page.goto('about:blank', { waitUntil: 'domcontentloaded', timeout: 5000 });
      } catch {
        // Resetting is best effort only.
      }
      await page.waitForTimeout(backoff);
    }
    attempt += 1;
  }

  throw lastFailure || new Error('无法获取文章内容');
}
1516
+
1517
/**
 * Build an article list from the links present in the current page.
 *
 * Anchors whose href contains `/column/article/<digits>` are collected
 * (catalog-style containers first, any anchor last), de-duplicated by
 * element and then by article id.
 *
 * @param {object} page - Page handle exposing `evaluate(fn, arg)`.
 * @returns {Promise<Array<object>>} Entries with `id`, `article_title`,
 *   `article_sharetitle`, `url`, `section_name`, `chapter_index` and
 *   `originalIndex` (the last two derive from the anchor's position in the
 *   collected list, so skipped anchors leave gaps).
 */
async function extractArticlesFromPageDom(page) {
  return page.evaluate((baseUrl) => {
    const anchorQueries = [
      '[class*="catalog"] a[href*="/column/article/"]',
      '[class*="directory"] a[href*="/column/article/"]',
      '[class*="Catalogue"] a[href*="/column/article/"]',
      '[class*="Catalog"] a[href*="/column/article/"]',
      'nav a[href*="/column/article/"]',
      'a[href*="/column/article/"]'
    ];
    const sectionQuery = '[data-section],[data-chapter],[class*="section"],[class*="Section"],[class*="chapter"],[class*="Chapter"]';

    const squash = (text) => (text || '').replace(/\s+/g, ' ').trim();

    // A Set keeps first-seen order while dropping repeated elements.
    const uniqueAnchors = new Set();
    for (const query of anchorQueries) {
      document.querySelectorAll(query).forEach(node => uniqueAnchors.add(node));
    }
    const anchors = [...uniqueAnchors];
    if (anchors.length === 0) {
      return [];
    }

    const resolveUrl = (href) => {
      try {
        return new URL(href, baseUrl).toString();
      } catch {
        return href.startsWith('/') ? `${baseUrl.replace(/\/$/, '')}${href}` : href;
      }
    };

    const readTitle = (anchor, id) => {
      const direct = squash(anchor.innerText || anchor.textContent || anchor.getAttribute('title') || '');
      if (direct) {
        return direct;
      }
      const titleNode = anchor.querySelector('[class*="title"], span, div');
      const nested = titleNode ? squash(titleNode.textContent) : '';
      return nested || `文章_${id}`;
    };

    const readSection = (anchor) => {
      const sectionNode = anchor.closest(sectionQuery);
      if (!sectionNode) {
        return '';
      }
      return squash(
        sectionNode.getAttribute('data-section') ||
        sectionNode.getAttribute('data-chapter') ||
        sectionNode.getAttribute('data-title') ||
        sectionNode.querySelector('h2, h3, h4, .title, .section-title')?.textContent ||
        ''
      );
    };

    const knownIds = new Set();
    const found = [];

    for (const [position, anchor] of anchors.entries()) {
      const href = anchor.getAttribute('href') || '';
      const idMatch = href.match(/column\/article\/(\d+)/i);
      if (!idMatch) {
        continue;
      }

      const id = parseInt(idMatch[1], 10);
      if (!id || knownIds.has(id)) {
        continue;
      }
      knownIds.add(id);

      const title = readTitle(anchor, id);
      found.push({
        id,
        article_title: title,
        article_sharetitle: title,
        url: resolveUrl(href),
        section_name: readSection(anchor),
        chapter_index: position + 1,
        originalIndex: position
      });
    }

    return found;
  }, GEEKTIME_BASE_URL);
}
1608
+
1609
/**
 * Read the column author's name from the current page.
 *
 * A list of author/teacher selectors is tried first; the
 * `<meta name="author">` tag is the fallback.
 *
 * @param {object} page - Page handle exposing `evaluate(fn)`.
 * @returns {Promise<string|null>} The trimmed author name, or null when
 *   nothing is found or the evaluation throws.
 */
async function extractColumnAuthorFromPage(page) {
  const readAuthor = () => {
    const authorQueries = [
      '.author-name',
      '.author',
      '.teacher-name',
      '.lecturer-name',
      '.Index_teacherName',
      '.ProductHeader_teacherName',
      '.ColumnIntro_teacher__name',
      '.ColumnIntro_author__name'
    ];
    for (const query of authorQueries) {
      const name = document.querySelector(query)?.textContent?.trim();
      if (name) {
        return name;
      }
    }

    const meta = document.querySelector('meta[name="author"]');
    return meta && meta.content ? meta.content.trim() : null;
  };

  try {
    return await page.evaluate(readAuthor);
  } catch {
    // The author is optional; callers fall back to a default.
    return null;
  }
}
1638
+
503
1639
  // 获取专栏所有文章列表(通过API)
504
1640
  function getValueByPath(obj, path) {
505
1641
  if (!obj || !path) return undefined;
@@ -602,45 +1738,53 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
602
1738
  let columnInfoHandler = null;
603
1739
 
604
1740
  // 用于同步的 Promise
605
- const articlesPromise = new Promise((resolve, reject) => {
606
- articlesHandler = async (response) => {
607
- const url = response.url();
608
- // 监听文章列表 API
609
- if (url.includes('/serv/v1/column/articles')) {
610
- try {
611
- const data = await response.json();
612
- if (process.env.DEBUG) {
613
- console.log(chalk.gray('\n收到文章列表API响应'));
1741
+ const articlesPromise = Promise.race([
1742
+ new Promise((resolve) => {
1743
+ articlesHandler = async (response) => {
1744
+ const url = response.url();
1745
+ // 监听文章列表 API
1746
+ if (url.includes('/serv/v1/column/articles')) {
1747
+ try {
1748
+ const data = await response.json();
1749
+ if (process.env.DEBUG) {
1750
+ console.log(chalk.gray('\n收到文章列表API响应'));
1751
+ }
1752
+ resolve(data);
1753
+ } catch (e) {
1754
+ console.error('解析文章列表API失败:', e);
1755
+ resolve(null);
614
1756
  }
615
- resolve(data);
616
- } catch (e) {
617
- console.error('解析文章列表API失败:', e);
618
1757
  }
619
- }
620
- };
621
- page.on('response', articlesHandler);
622
- });
623
-
624
- const columnInfoPromise = new Promise((resolve) => {
625
- columnInfoHandler = async (response) => {
626
- const url = response.url();
627
- // 监听专栏详情相关的 API
628
- if (url.includes('/serv/v1/column/intro') ||
629
- url.includes('/serv/v3/column/info') ||
630
- url.includes('/serv/v1/column/detail')) {
631
- try {
632
- const data = await response.json();
633
- if (process.env.DEBUG) {
634
- console.log(chalk.gray(`收到专栏信息API响应: ${url}`));
1758
+ };
1759
+ page.on('response', articlesHandler);
1760
+ }),
1761
+ new Promise(resolve => setTimeout(() => resolve(null), 30000))
1762
+ ]);
1763
+
1764
+ const columnInfoPromise = Promise.race([
1765
+ new Promise((resolve) => {
1766
+ columnInfoHandler = async (response) => {
1767
+ const url = response.url();
1768
+ // 监听专栏详情相关的 API
1769
+ if (url.includes('/serv/v1/column/intro') ||
1770
+ url.includes('/serv/v3/column/info') ||
1771
+ url.includes('/serv/v1/column/detail')) {
1772
+ try {
1773
+ const data = await response.json();
1774
+ if (process.env.DEBUG) {
1775
+ console.log(chalk.gray(`收到专栏信息API响应: ${url}`));
1776
+ }
1777
+ resolve(data);
1778
+ } catch (e) {
1779
+ console.error('解析专栏信息API失败:', e);
1780
+ resolve(null);
635
1781
  }
636
- resolve(data);
637
- } catch (e) {
638
- console.error('解析专栏信息API失败:', e);
639
1782
  }
640
- }
641
- };
642
- page.on('response', columnInfoHandler);
643
- });
1783
+ };
1784
+ page.on('response', columnInfoHandler);
1785
+ }),
1786
+ new Promise(resolve => setTimeout(() => resolve(null), 5000))
1787
+ ]);
644
1788
 
645
1789
  try {
646
1790
  // 先设置监听器,再访问页面
@@ -649,23 +1793,13 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
649
1793
 
650
1794
  spinner.text = '正在获取文章列表...';
651
1795
 
652
- // 等待文章列表 API(必须的)
653
- articlesData = await Promise.race([
654
- articlesPromise,
655
- new Promise((_, reject) => setTimeout(() => reject(new Error('文章列表API调用超时')), 30000))
656
- ]);
1796
+ // 等待文章列表 API(如果失败将返回 null)
1797
+ articlesData = await articlesPromise;
657
1798
 
658
- // 尝试等待专栏信息 API(可选的,5秒超时)
659
- try {
660
- columnInfoData = await Promise.race([
661
- columnInfoPromise,
662
- new Promise((_, reject) => setTimeout(() => reject(new Error('timeout')), 5000))
663
- ]);
664
- } catch (e) {
665
- // 获取专栏信息失败不是致命错误
666
- if (process.env.DEBUG) {
667
- console.log(chalk.gray('未获取到专栏信息API响应(将使用其他方法)'));
668
- }
1799
+ // 尝试等待专栏信息 API(可选)
1800
+ columnInfoData = await columnInfoPromise;
1801
+ if (!columnInfoData && process.env.DEBUG) {
1802
+ console.log(chalk.gray('未获取到专栏信息API响应(将使用其他方法)'));
669
1803
  }
670
1804
 
671
1805
  } catch (error) {
@@ -694,32 +1828,47 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
694
1828
  }
695
1829
  }
696
1830
 
697
- if (!articlesData || !articlesData.data || !articlesData.data.list) {
698
- spinner.fail('API响应数据格式错误');
1831
+ let useDomExtraction = false;
1832
+ let domArticles = [];
1833
+
1834
+ if (!articlesData || !articlesData.data || !Array.isArray(articlesData.data.list) || articlesData.data.list.length === 0) {
1835
+ spinner.text = 'API 不可用,尝试从页面解析文章列表...';
1836
+ try {
1837
+ domArticles = await extractArticlesFromPageDom(page);
1838
+ } catch (error) {
1839
+ if (process.env.DEBUG) {
1840
+ console.log(chalk.gray(`DOM文章提取失败: ${error.message}`));
1841
+ }
1842
+ }
1843
+
1844
+ if (!domArticles || domArticles.length === 0) {
1845
+ spinner.fail('无法获取文章列表');
1846
+
1847
+ if (!articlesData) {
1848
+ console.log(chalk.yellow('\n⚠️ 未能从接口或页面获取文章列表\n'));
1849
+ console.log(chalk.cyan('可能的原因:'));
1850
+ console.log(chalk.gray(' 1. Cookie 已过期或无效 - 请重新获取 Cookie'));
1851
+ console.log(chalk.gray(' 2. 页面结构发生变化 - 请联系开发者更新解析逻辑'));
1852
+ console.log(chalk.gray(' 3. 网络连接问题或URL无效\n'));
1853
+ } else if (articlesData.code === -3000 || articlesData.code === -3001) {
1854
+ console.log(chalk.red('\n❌ Cookie 已失效\n'));
1855
+ console.log(chalk.cyan('📖 请重新获取 Cookie:'));
1856
+ console.log(chalk.gray(' 1. 浏览器登录极客时间'));
1857
+ console.log(chalk.gray(' 2. 按 F12 打开开发者工具'));
1858
+ console.log(chalk.gray(' 3. Network 标签 → 刷新页面'));
1859
+ console.log(chalk.gray(' 4. 点击任意请求 → 复制 Cookie\n'));
1860
+ } else if (articlesData.error) {
1861
+ console.log(chalk.yellow(`\n⚠️ API 返回错误: ${articlesData.error.msg || articlesData.error}\n`));
1862
+ }
699
1863
 
700
- // 智能判断可能的原因
701
- if (!articlesData) {
702
- console.log(chalk.yellow('\n⚠️ 未能获取到文章列表数据\n'));
703
- console.log(chalk.cyan('可能的原因:'));
704
- console.log(chalk.gray(' 1. Cookie 已过期或无效 - 请重新获取 Cookie'));
705
- console.log(chalk.gray(' 2. 网络连接问题 - 请检查网络'));
706
- console.log(chalk.gray(' 3. 专栏 ID 不正确 - 请检查 URL\n'));
707
- } else if (articlesData.code === -3000 || articlesData.code === -3001) {
708
- console.log(chalk.red('\n❌ Cookie 已失效\n'));
709
- console.log(chalk.cyan('📖 请重新获取 Cookie:'));
710
- console.log(chalk.gray(' 1. 浏览器登录极客时间'));
711
- console.log(chalk.gray(' 2. 按 F12 打开开发者工具'));
712
- console.log(chalk.gray(' 3. Network 标签 → 刷新页面'));
713
- console.log(chalk.gray(' 4. 点击任意请求 → 复制 Cookie\n'));
714
- } else if (articlesData.error) {
715
- console.log(chalk.yellow(`\n⚠️ API 返回错误: ${articlesData.error.msg || articlesData.error}\n`));
1864
+ return { articles: [], columnTitle: 'unknown', columnAuthor: '极客时间' };
716
1865
  }
717
1866
 
718
- return { articles: [], columnTitle: 'unknown' };
1867
+ useDomExtraction = true;
719
1868
  }
720
1869
 
721
1870
  // 调试信息:记录完整的API响应结构(仅在环境变量DEBUG存在时)
722
- if (process.env.DEBUG) {
1871
+ if (!useDomExtraction && process.env.DEBUG) {
723
1872
  console.log(chalk.gray('\n========== 文章列表 API 响应数据 =========='));
724
1873
  console.log(chalk.gray(JSON.stringify(articlesData.data, null, 2)));
725
1874
  if (columnInfoData) {
@@ -742,7 +1891,7 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
742
1891
  }
743
1892
 
744
1893
  // 方法2: 从文章列表 API 数据中获取
745
- if (!columnTitle || columnTitle === '专栏' || columnTitle === '极客时间') {
1894
+ if ((!columnTitle || columnTitle === '专栏' || columnTitle === '极客时间') && articlesData && articlesData.data) {
746
1895
  columnTitle = articlesData.data.column_title
747
1896
  || articlesData.data.column_subtitle
748
1897
  || articlesData.data.title
@@ -826,10 +1975,15 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
826
1975
  console.log(chalk.gray(` 提取的专栏名: ${columnTitle}\n`));
827
1976
  }
828
1977
 
829
- const columnAuthor = extractColumnAuthor(columnInfoData, articlesData) || '极客时间';
1978
+ let columnAuthor = '极客时间';
1979
+ if (!useDomExtraction && articlesData) {
1980
+ columnAuthor = extractColumnAuthor(columnInfoData, articlesData) || '极客时间';
1981
+ } else {
1982
+ columnAuthor = await extractColumnAuthorFromPage(page) || '极客时间';
1983
+ }
830
1984
 
831
1985
  // 解析文章列表
832
- const rawArticles = articlesData.data.list;
1986
+ const rawArticles = useDomExtraction ? domArticles : (articlesData.data.list || []);
833
1987
 
834
1988
  const articles = rawArticles.map((article, index) => {
835
1989
  const title = article.article_title || article.article_sharetitle || 'Untitled';
@@ -844,7 +1998,7 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
844
1998
 
845
1999
  return {
846
2000
  title: cleanTitle,
847
- url: `https://time.geekbang.org/column/article/${id}`,
2001
+ url: article.url || `${GEEKTIME_BASE_URL}/column/article/${id}`,
848
2002
  originalTitle: title,
849
2003
  id: id,
850
2004
  sectionName: article.section_name || '',
@@ -888,7 +2042,7 @@ async function downloadWithConcurrency(context, articles, outputDir, concurrency
888
2042
  const article = articles[index];
889
2043
 
890
2044
  try {
891
- const result = await downloadArticleSilent(page, article, outputDir, index + 1, total);
2045
+ const result = await downloadArticleSilent(page, article, outputDir, index + 1, total, timeout);
892
2046
  results[index] = result;
893
2047
  completed++;
894
2048
 
@@ -943,52 +2097,20 @@ async function downloadWithConcurrency(context, articles, outputDir, concurrency
943
2097
  }
944
2098
 
945
2099
  // 下载单篇文章为 PDF(静默模式,不显示单独的spinner)
946
- async function downloadArticleSilent(page, article, outputDir, index, total) {
2100
+ async function downloadArticleSilent(page, article, outputDir, index, total, timeout = 60000) {
947
2101
  try {
948
2102
  if (process.env.DEBUG) {
949
2103
  console.log(chalk.gray(`[silent] 准备处理文章 ${article.id} - ${article.originalTitle || article.title}`));
950
2104
  }
951
- const articleData = await fetchArticleData(page.context(), article.id);
952
- if (process.env.DEBUG) {
953
- console.log(chalk.gray(`[silent] 已获取文章数据 ${article.id}`));
954
- }
955
- const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
956
- const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
957
- if (process.env.DEBUG) {
958
- console.log(chalk.gray(`[silent] 已完成内容清洗 ${article.id}`));
959
- }
960
- const printableHtml = buildPrintableHtml(article.originalTitle || article.title, sanitizedHtml);
2105
+ const sanitizedHtml = await fetchArticleContentWithRetry(page, article, { timeout });
2106
+ const meta = article.sectionName ? `章节:${article.sectionName}` : '';
2107
+ const printableHtml = buildPdfHtml(article.originalTitle || article.title, sanitizedHtml, meta);
961
2108
 
962
2109
  await page.setContent(printableHtml, { waitUntil: 'domcontentloaded' });
963
- if (process.env.DEBUG) {
964
- console.log(chalk.gray(`[silent] 已设置页面内容 ${article.id}`));
965
- }
966
- if (process.env.DEBUG) {
967
- console.log(chalk.gray(`[silent] 等待图片初步加载 ${article.id}`));
968
- }
969
- try {
970
- await page.waitForFunction(() => {
971
- const imgs = Array.from(document.images || []);
972
- if (imgs.length === 0) {
973
- return true;
974
- }
975
- return imgs.every(img => img.complete);
976
- }, { timeout: 30000 });
977
- } catch (waitError) {
978
- if (process.env.DEBUG) {
979
- console.log(chalk.gray(`[silent] 图片初步加载等待超时 ${article.id}: ${waitError?.message || waitError}`));
980
- }
981
- }
982
2110
  try {
983
2111
  await page.waitForLoadState('networkidle', { timeout: 5000 });
984
- if (process.env.DEBUG) {
985
- console.log(chalk.gray(`[silent] networkidle 完成 ${article.id}`));
986
- }
987
2112
  } catch {
988
- // 忽略由于没有额外资源导致的延时
989
- if (process.env.DEBUG) {
990
- console.log(chalk.gray(`[silent] networkidle 超时(已忽略) ${article.id}`));
991
- }
2113
+ // ignore
992
2114
  }
993
2115
 
994
2116
  // 优化图片大小:将大图片转换为合适的尺寸,减小PDF体积
@@ -1080,7 +2202,7 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
1080
2202
  }
1081
2203
 
1082
2204
  // 等待图片处理完成
1083
- await page.waitForTimeout(30000);
2205
+ await page.waitForTimeout(1200);
1084
2206
  if (process.env.DEBUG) {
1085
2207
  console.log(chalk.gray(`[silent] 已准备生成PDF ${article.id}`));
1086
2208
  }
@@ -1098,7 +2220,7 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
1098
2220
  bottom: '20mm',
1099
2221
  left: '15mm'
1100
2222
  },
1101
- printBackground: false, // 关闭背景打印,显著减小文件大小
2223
+ printBackground: true,
1102
2224
  preferCSSPageSize: false
1103
2225
  });
1104
2226
  if (process.env.DEBUG) {
@@ -1116,20 +2238,19 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
1116
2238
  }
1117
2239
 
1118
2240
  // 下载单篇文章为 PDF
1119
- async function downloadArticle(page, article, outputDir, index, total) {
2241
+ async function downloadArticle(page, article, outputDir, index, total, timeout = 60000) {
1120
2242
  const spinner = ora(`[${index}/${total}] 正在下载: ${article.title}`).start();
1121
2243
 
1122
2244
  try {
1123
- const articleData = await fetchArticleData(page.context(), article.id);
1124
- const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
1125
- const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
1126
- const printableHtml = buildPrintableHtml(article.originalTitle || article.title, sanitizedHtml);
2245
+ const sanitizedHtml = await fetchArticleContentWithRetry(page, article, { timeout });
2246
+ const meta = article.sectionName ? `章节:${article.sectionName}` : '';
2247
+ const printableHtml = buildPdfHtml(article.originalTitle || article.title, sanitizedHtml, meta);
1127
2248
 
1128
2249
  await page.setContent(printableHtml, { waitUntil: 'domcontentloaded' });
1129
2250
  try {
1130
2251
  await page.waitForLoadState('networkidle', { timeout: 5000 });
1131
2252
  } catch {
1132
- // 没有额外资源加载时忽略
2253
+ // 忽略
1133
2254
  }
1134
2255
 
1135
2256
  // 优化图片大小:将大图片转换为合适的尺寸,减小PDF体积
@@ -1204,7 +2325,7 @@ async function downloadArticle(page, article, outputDir, index, total) {
1204
2325
  bottom: '20mm',
1205
2326
  left: '15mm'
1206
2327
  },
1207
- printBackground: false, // 关闭背景打印,显著减小文件大小
2328
+ printBackground: true,
1208
2329
  preferCSSPageSize: false
1209
2330
  });
1210
2331
 
@@ -1327,11 +2448,9 @@ async function mergePDFs(outputDir, columnTitle, articles, deleteAfterMerge = fa
1327
2448
  }
1328
2449
 
1329
2450
  // 提取单篇文章的 HTML 内容(用于 EPUB 生成)
1330
- async function extractArticleContent(page, article, index, total) {
2451
+ async function extractArticleContent(page, article, index, total, timeout = 60000) {
1331
2452
  try {
1332
- const articleData = await fetchArticleData(page.context(), article.id);
1333
- const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
1334
- const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
2453
+ const sanitizedHtml = await fetchArticleContentWithRetry(page, article, { timeout });
1335
2454
 
1336
2455
  if (!sanitizedHtml) {
1337
2456
  throw new Error('未能提取到文章内容');
@@ -1385,7 +2504,7 @@ async function extractWithConcurrency(context, articles, concurrency = 5, delay
1385
2504
  const article = articles[index];
1386
2505
 
1387
2506
  try {
1388
- const result = await extractArticleContent(page, article, index + 1, total);
2507
+ const result = await extractArticleContent(page, article, index + 1, total, timeout);
1389
2508
  results[index] = result;
1390
2509
  completed++;
1391
2510
 
@@ -1500,41 +2619,43 @@ async function generateEPUB(outputDir, columnTitle, columnAuthor, articles, cont
1500
2619
  margin: 1.5em 0;
1501
2620
  padding: 0;
1502
2621
  }
1503
- p {
2622
+ p, div {
1504
2623
  margin: 1.2em 0;
1505
2624
  text-indent: 0;
1506
- line-height: 1.8;
2625
+ line-height: 1.9;
1507
2626
  word-wrap: break-word;
1508
2627
  overflow-wrap: break-word;
1509
2628
  display: block;
1510
2629
  page-break-inside: avoid;
1511
2630
  }
1512
- /* 确保段落之间有明显间隔 */
1513
- p + p {
1514
- margin-top: 1.5em;
2631
+ p + p,
2632
+ div + p,
2633
+ p + div {
2634
+ margin-top: 1.6em;
1515
2635
  }
1516
2636
  /* 代码块样式 */
1517
2637
  pre {
1518
- background-color: #f6f8fa;
2638
+ background-color: #0b1220;
2639
+ color: #d9e2ff;
1519
2640
  border: 1px solid #e1e4e8;
1520
2641
  border-radius: 6px;
1521
- padding: 16px;
2642
+ padding: 18px 20px;
1522
2643
  overflow-x: auto;
1523
2644
  margin: 1em 0;
1524
- line-height: 1.5;
2645
+ line-height: 1.6;
1525
2646
  font-size: 14px;
1526
2647
  white-space: pre-wrap;
1527
2648
  word-wrap: break-word;
1528
- font-family: 'Monaco', 'Menlo', 'Consolas', 'Courier New', monospace;
2649
+ font-family: 'Fira Code', 'Monaco', 'Menlo', 'Consolas', 'Courier New', monospace;
1529
2650
  page-break-inside: avoid;
1530
2651
  }
1531
2652
  code {
1532
- font-family: 'Monaco', 'Menlo', 'Consolas', 'Courier New', monospace;
2653
+ font-family: 'Fira Code', 'Monaco', 'Menlo', 'Consolas', 'Courier New', monospace;
1533
2654
  font-size: 0.9em;
1534
- background-color: #f6f8fa;
2655
+ background-color: rgba(15, 23, 42, 0.1);
1535
2656
  padding: 0.2em 0.4em;
1536
2657
  border-radius: 3px;
1537
- border: 1px solid #e1e4e8;
2658
+ border: 1px solid rgba(15, 23, 42, 0.1);
1538
2659
  }
1539
2660
  pre code {
1540
2661
  background-color: transparent;
@@ -1645,12 +2766,13 @@ async function generateEPUB(outputDir, columnTitle, columnAuthor, articles, cont
1645
2766
  async function main(options) {
1646
2767
  console.log(chalk.bold.cyan('\n🚀 极客时间专栏下载器\n'));
1647
2768
 
1648
- // 获取配置:优先级 命令行 > 配置文件
2769
+ // 获取配置:优先级 命令行 > 配置文件 > 默认 cookies.json
1649
2770
  let cookie = options.cookie;
2771
+ let cookieFile = options.cookieFile;
1650
2772
  let columnUrl = options.url;
1651
2773
 
1652
- // 如果命令行没有提供,尝试从配置文件读取
1653
- if (!cookie || !columnUrl) {
2774
+ // 如果命令行没有提供所需信息,尝试从配置文件读取
2775
+ if (!cookie || !columnUrl || !cookieFile) {
1654
2776
  // 使用当前工作目录的config.json,而不是脚本所在目录
1655
2777
  const configPath = path.join(process.cwd(), 'config.json');
1656
2778
  try {
@@ -1660,22 +2782,37 @@ async function main(options) {
1660
2782
  // 使用配置文件中的值作为默认值
1661
2783
  if (!cookie) cookie = config.cookie;
1662
2784
  if (!columnUrl) columnUrl = config.columnUrl;
2785
+ if (!cookieFile) cookieFile = config.cookieFile;
1663
2786
  } catch (error) {
1664
2787
  // 配置文件不存在或读取失败,不是致命错误
1665
2788
  // 只有在命令行也没提供时才报错
1666
2789
  }
1667
2790
  }
1668
2791
 
2792
+ // 如果没有cookie字符串但存在 cookies.json 文件,自动使用
2793
+ if (!cookie && !cookieFile) {
2794
+ const defaultCookieJsonPath = path.join(process.cwd(), 'cookies.json');
2795
+ if (await fileExists(defaultCookieJsonPath)) {
2796
+ cookieFile = defaultCookieJsonPath;
2797
+ }
2798
+ }
2799
+
2800
+ const cookieSavePath = cookieFile || path.join(process.cwd(), 'cookies.json');
2801
+
1669
2802
  // 验证必要参数
1670
- if (!cookie) {
2803
+ if (!cookie && !cookieFile) {
1671
2804
  console.error(chalk.red('❌ 缺少 Cookie!'));
1672
2805
  console.log(chalk.yellow('\n请通过以下方式之一提供 Cookie:'));
1673
2806
  console.log(chalk.gray('1. 命令行参数:--cookie "你的cookie字符串"'));
1674
2807
  console.log(chalk.gray('2. 配置文件 config.json:'));
1675
2808
  console.log(chalk.gray(' {'));
1676
2809
  console.log(chalk.gray(' "cookie": "你的cookie字符串",'));
1677
- console.log(chalk.gray(' "columnUrl": "https://time.geekbang.org/column/article/xxxxx"'));
1678
- console.log(chalk.gray(' }\n'));
2810
+ console.log(chalk.gray(' "columnUrl": "https://time.geekbang.org/column/article/xxxxx",'));
2811
+ console.log(chalk.gray(' "cookieFile": "cookies.json" // 可选,导入JSON文件'));
2812
+ console.log(chalk.gray(' }'));
2813
+ console.log(chalk.gray('3. 提供 Cookie JSON 文件:'));
2814
+ console.log(chalk.gray(' - 命令行参数:--cookie-file ./cookies.json'));
2815
+ console.log(chalk.gray(' - 或将 cookies.json 放到当前目录\n'));
1679
2816
  process.exit(1);
1680
2817
  }
1681
2818
 
@@ -1724,16 +2861,42 @@ async function main(options) {
1724
2861
  userAgent: DEFAULT_USER_AGENT
1725
2862
  });
1726
2863
 
1727
- // 兼容用户直接复制整行"Cookie: xxx"
1728
- let normalizedCookie = cookie.trim();
1729
- if (/^cookie:/i.test(normalizedCookie)) {
1730
- normalizedCookie = normalizedCookie.replace(/^cookie:\s*/i, '');
2864
+ let normalizedCookie = '';
2865
+ let cookiesForContext = [];
2866
+
2867
+ if (cookie) {
2868
+ normalizedCookie = cookie.trim();
2869
+ if (/^cookie:/i.test(normalizedCookie)) {
2870
+ normalizedCookie = normalizedCookie.replace(/^cookie:\s*/i, '');
2871
+ }
2872
+ cookiesForContext = parseCookies(normalizedCookie);
2873
+ } else if (cookieFile) {
2874
+ try {
2875
+ const { cookieHeader, cookies, absolutePath } = await loadCookiesFromJsonFile(cookieFile);
2876
+ normalizedCookie = cookieHeader.trim();
2877
+ cookiesForContext = cookies;
2878
+ console.log(chalk.gray(`🍪 已从 ${absolutePath} 导入 Cookie`));
2879
+ } catch (error) {
2880
+ console.error(chalk.red(`❌ 读取 Cookie JSON 失败: ${error.message}`));
2881
+ process.exit(1);
2882
+ }
1731
2883
  }
2884
+
1732
2885
  globalCookieHeader = normalizedCookie;
1733
2886
 
1734
2887
  // 设置 cookies
1735
- const cookies = parseCookies(normalizedCookie);
1736
- await context.addCookies(cookies);
2888
+ await context.addCookies(cookiesForContext);
2889
+ await updateGlobalCookieHeaderFromContext(context);
2890
+ context.on('response', (response) => {
2891
+ try {
2892
+ const headers = response.headers();
2893
+ if (headers && headers['set-cookie']) {
2894
+ updateGlobalCookieHeaderFromContext(context);
2895
+ }
2896
+ } catch {
2897
+ // ignore
2898
+ }
2899
+ });
1737
2900
 
1738
2901
  // 确保所有极客时间域名的请求都携带原始Cookie串,避免Playwright丢失关键字段
1739
2902
  await context.route('**/*', (route) => {
@@ -1755,9 +2918,12 @@ async function main(options) {
1755
2918
  }
1756
2919
 
1757
2920
  const headers = {
1758
- ...request.headers(),
1759
- cookie: normalizedCookie
2921
+ ...request.headers()
1760
2922
  };
2923
+ const outgoingCookieHeader = globalCookieHeader || normalizedCookie;
2924
+ if (outgoingCookieHeader) {
2925
+ headers.cookie = outgoingCookieHeader;
2926
+ }
1761
2927
  route.continue({ headers });
1762
2928
  });
1763
2929
 
@@ -1843,7 +3009,10 @@ async function main(options) {
1843
3009
  const successCount = results.filter(r => r.success).length;
1844
3010
  const failCount = results.filter(r => !r.success).length;
1845
3011
  const timeoutCount = results.filter(r =>
1846
- !r.success && r.error && (r.error.includes('timeout') || r.error.includes('Timeout'))
3012
+ !r.success && r.error && /timeout/i.test(r.error)
3013
+ ).length;
3014
+ const authIssueCount = results.filter(r =>
3015
+ !r.success && r.error && /(Cookie|登录|登陆|订阅|权限|试看|购买)/i.test(r.error)
1847
3016
  ).length;
1848
3017
 
1849
3018
  console.log(chalk.bold.cyan('\n📊 PDF 下载统计\n'));
@@ -1857,6 +3026,11 @@ async function main(options) {
1857
3026
  console.log(chalk.gray(' 1. Cookie 已失效 - 请重新获取 Cookie'));
1858
3027
  console.log(chalk.gray(' 2. 网络连接慢 - 尝试使用 --timeout 120000 增加超时时间'));
1859
3028
  console.log(chalk.gray(' 3. 需要登录或权限不足 - 确认已购买该专栏\n'));
3029
+ } else if (authIssueCount > 0) {
3030
+ console.log(chalk.yellow('⚠️ 检测到登录或权限相关异常\n'));
3031
+ console.log(chalk.gray(' 1. 在浏览器中重新登录极客时间,进入该专栏任意文章'));
3032
+ console.log(chalk.gray(' 2. 复制最新的 Cookie(或重新导出 cookies.json)'));
3033
+ console.log(chalk.gray(' 3. 使用新的 --cookie 或 --cookie-file 参数后重试\n'));
1860
3034
  }
1861
3035
 
1862
3036
  // 合并 PDF
@@ -1900,7 +3074,10 @@ async function main(options) {
1900
3074
  const successCount = contentResults.filter(r => r.success).length;
1901
3075
  const failCount = contentResults.filter(r => !r.success).length;
1902
3076
  const timeoutCount = contentResults.filter(r =>
1903
- !r.success && r.error && (r.error.includes('Cookie') || r.error.includes('timeout') || r.error.includes('Timeout'))
3077
+ !r.success && r.error && /timeout/i.test(r.error)
3078
+ ).length;
3079
+ const authIssueCount = contentResults.filter(r =>
3080
+ !r.success && r.error && /(Cookie|登录|登陆|订阅|权限|试看|购买)/i.test(r.error)
1904
3081
  ).length;
1905
3082
 
1906
3083
  console.log(chalk.bold.cyan('\n📊 EPUB 提取统计\n'));
@@ -1913,19 +3090,42 @@ async function main(options) {
1913
3090
  console.log(chalk.gray(' 1. Cookie 已失效 - 请重新获取 Cookie'));
1914
3091
  console.log(chalk.gray(' 2. 网络连接慢 - 尝试使用 --timeout 120000 增加超时时间'));
1915
3092
  console.log(chalk.gray(' 3. 需要登录或权限不足 - 确认已购买该专栏\n'));
3093
+ } else if (authIssueCount > 0) {
3094
+ console.log(chalk.yellow('⚠️ 检测到登录/权限问题,建议步骤:\n'));
3095
+ console.log(chalk.gray(' 1. 浏览器重新登录极客时间并打开该专栏文章'));
3096
+ console.log(chalk.gray(' 2. 重新复制最新 Cookie 或导出 cookies.json'));
3097
+ console.log(chalk.gray(' 3. 更新 --cookie 或 --cookie-file 后再次执行\n'));
1916
3098
  }
1917
3099
 
1918
3100
  // 生成 EPUB
1919
3101
  if (successCount > 0) {
1920
- const epubPath = await generateEPUB(
1921
- outputDir,
1922
- columnTitle,
1923
- columnAuthor,
1924
- articlesToDownload,
1925
- contentResults
3102
+ const hasImageContent = contentResults.some(result =>
3103
+ result && result.success && typeof result.content === 'string' && result.content.includes('<img')
1926
3104
  );
1927
- if (epubPath) {
1928
- console.log(chalk.green(`\n✅ EPUB 生成完成: ${epubPath}\n`));
3105
+
3106
+ let processedContent = contentResults;
3107
+ let tempAssetsDir = null;
3108
+
3109
+ try {
3110
+ if (hasImageContent) {
3111
+ tempAssetsDir = await createTempAssetsDir(outputDir);
3112
+ processedContent = await rewriteEpubContentImages(context, contentResults, tempAssetsDir);
3113
+ }
3114
+
3115
+ const epubPath = await generateEPUB(
3116
+ outputDir,
3117
+ columnTitle,
3118
+ columnAuthor,
3119
+ articlesToDownload,
3120
+ processedContent
3121
+ );
3122
+ if (epubPath) {
3123
+ console.log(chalk.green(`\n✅ EPUB 生成完成: ${epubPath}\n`));
3124
+ }
3125
+ } finally {
3126
+ if (tempAssetsDir) {
3127
+ await cleanupTempAssetsDir(tempAssetsDir);
3128
+ }
1929
3129
  }
1930
3130
  }
1931
3131
  }
@@ -1942,6 +3142,11 @@ async function main(options) {
1942
3142
  }
1943
3143
  process.exit(1);
1944
3144
  } finally {
3145
+ try {
3146
+ await persistCookiesToFile(context, cookieSavePath);
3147
+ } catch {
3148
+ // ignore
3149
+ }
1945
3150
  // 确保浏览器完全关闭
1946
3151
  try {
1947
3152
  if (browser && !isShuttingDown) {
@@ -1961,6 +3166,7 @@ program
1961
3166
  .version(version)
1962
3167
  .option('-u, --url <url>', '专栏文章URL(任意一篇)')
1963
3168
  .option('-c, --cookie <cookie>', 'Cookie字符串(用于认证)')
3169
+ .option('--cookie-file <path>', '从 JSON 文件导入 Cookie(如 chrome 扩展导出的 cookies.json)')
1964
3170
  .option('-o, --output <dir>', '输出目录', './downloads')
1965
3171
  .option('-f, --format <format>', '输出格式: pdf, epub, both', 'pdf')
1966
3172
  .option('--headless <boolean>', '无头模式', true)