@kadaliao/geektime-downloader 1.1.3 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/download.js CHANGED
@@ -6,7 +6,10 @@ import chalk from 'chalk';
6
6
  import ora from 'ora';
7
7
  import fs from 'fs/promises';
8
8
  import path from 'path';
9
- import { fileURLToPath } from 'url';
9
+ import { fileURLToPath, pathToFileURL } from 'url';
10
+ import { load as loadHtml } from 'cheerio';
11
+ import crypto from 'crypto';
12
+ import mime from 'mime-types';
10
13
  import { createRequire } from 'module';
11
14
  import * as pdfLib from 'pdf-lib';
12
15
  import { outlinePdfFactory } from '@lillallol/outline-pdf';
@@ -245,9 +248,335 @@ const PRINT_FIX_CSS = `
245
248
  }
246
249
  `;
247
250
 
251
+ // 代码高亮彩色语法(覆盖Prism/Highlight.js常见class)
252
+ const CODE_HIGHLIGHT_CSS = `
253
+ pre[class*="language-"],
254
+ code[class*="language-"],
255
+ pre code,
256
+ code.hljs,
257
+ pre.hljs {
258
+ color: #2d2d2d;
259
+ background: #f7f7f7;
260
+ }
261
+ .token.comment,
262
+ .token.prolog,
263
+ .token.doctype,
264
+ .token.cdata,
265
+ .hljs-comment,
266
+ .hljs-quote {
267
+ color: #6a737d;
268
+ font-style: italic;
269
+ }
270
+ .token.punctuation,
271
+ .hljs-punctuation {
272
+ color: #5e6687;
273
+ }
274
+ .token.property,
275
+ .token.tag,
276
+ .token.constant,
277
+ .token.symbol,
278
+ .token.deleted,
279
+ .hljs-keyword,
280
+ .hljs-selector-tag,
281
+ .hljs-subst,
282
+ .hljs-attribute {
283
+ color: #d73a49;
284
+ }
285
+ .token.boolean,
286
+ .token.number,
287
+ .token.selector,
288
+ .token.attr-name,
289
+ .token.char,
290
+ .token.builtin,
291
+ .token.inserted,
292
+ .hljs-number,
293
+ .hljs-literal,
294
+ .hljs-variable,
295
+ .hljs-template-variable {
296
+ color: #b76bff;
297
+ }
298
+ .token.string,
299
+ .token.attr-value,
300
+ .token.operator,
301
+ .token.entity,
302
+ .token.url,
303
+ .token.statement,
304
+ .token.regex,
305
+ .token.important,
306
+ .token.variable,
307
+ .token.bold,
308
+ .hljs-string,
309
+ .hljs-doctag,
310
+ .hljs-addition {
311
+ color: #22863a;
312
+ }
313
+ .token.function,
314
+ .token.class-name,
315
+ .token.keyword,
316
+ .hljs-title,
317
+ .hljs-section,
318
+ .hljs-type,
319
+ .hljs-selector-id,
320
+ .hljs-selector-class {
321
+ color: #005cc5;
322
+ }
323
+ .token.operator,
324
+ .token.entity,
325
+ .token.url,
326
+ .hljs-bullet,
327
+ .hljs-built_in,
328
+ .hljs-builtin-name,
329
+ .hljs-link {
330
+ color: #e36209;
331
+ }
332
+ .token.italic {
333
+ font-style: italic;
334
+ }
335
+ .token.bold {
336
+ font-weight: 600;
337
+ }
338
+ .token.deleted,
339
+ .hljs-deletion {
340
+ color: #b31d28;
341
+ }
342
+ `;
343
+
248
344
  const GEEKTIME_BASE_URL = 'https://time.geekbang.org';
249
- const ARTICLE_API_URL = `${GEEKTIME_BASE_URL}/serv/v1/article`;
250
345
  const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
346
+ const EPUB_IMAGE_BATCH_SIZE = 5;
347
+ const TEMP_ASSET_PREFIX = '__epub_assets__';
348
+ const ARTICLE_CONTENT_SELECTORS = [
349
+ '#article-content',
350
+ '#article-content-container',
351
+ '.article-content',
352
+ '.article-detail',
353
+ '.article-detail-content',
354
+ '.article-content__body',
355
+ '.Index_articleContent_QBG5G',
356
+ '.ArticleContent_articleContent',
357
+ 'article .content',
358
+ 'main article',
359
+ '.content-container article'
360
+ ];
361
+ const ARTICLE_REMOVAL_SELECTORS = [
362
+ 'nav', 'header', 'footer', 'aside',
363
+ '.comment', '.comments', '.Index_comment', '.CommentArea', '.comment-area', '.CommentWrapper', '.Comment-module', '.CommentList',
364
+ '#comments', '#comment', '[data-section="comment"]',
365
+ '.recommend', '.recommendation', '.related', '.advertisement', '.ad', '.banner',
366
+ '.subscribe', '.subscription', '.toolbar', '.Index_shareIcons_1vtJa',
367
+ '.keyboard-wrapper', '.app-download', '.article-actions', '.article-bottom',
368
+ '.note', '.notes', '.annotation', '.translation', '.trans', '.translator',
369
+ '.audio', '.audio-player', '.voice', '.player', '.geek-player', '.podcast', '.radio',
370
+ '.AudioPlayer', '.VoicePlayer', '.AudioWrapper', '.voice-player',
371
+ '.reward', '.appreciate', '.appreciation', '.donate', '.sponsor', '.thanks', '.support',
372
+ '.qrcode', '.qr-code', '.qr', '.promotion', '.promo', '.ad-banner',
373
+ '.copyright', '.statement', '.disclaimer',
374
+ '.app-download-banner', '.article-plugin', '.article-notification', '.float-bar',
375
+ '.article-plugin-wrapper',
376
+ '[class*="Share"]', '[data-widget="audio"]', '[data-widget="Audio"]',
377
+ 'audio', 'video',
378
+ '[class*="Note"]', '[class*="note"]', '[class*="Translation"]', '[class*="translation"]',
379
+ '[class*="Audio"]', '[class*="audio"]', '[class*="Reward"]', '[class*="reward"]',
380
+ '[data-plugin]', '[data-track]', '[data-track-section]', '[data-translation]', '[data-audio]',
381
+ '[data-role="toolbar"]',
382
+ 'button[data-role="comment"]',
383
+ 'script[data-role="plugin"]',
384
+ '.ArticleBottomBar',
385
+ '.bottom-toolbar'
386
+ ];
387
+ const ARTICLE_PLUGIN_KEYWORDS = [
388
+ 'note', 'translation', 'audio', 'player', 'reward', 'donate',
389
+ 'appreciation', 'sponsor', 'qrcode', 'toolbar', 'plugin',
390
+ 'copyright', 'geeknote', 'bilingual', 'comment'
391
+ ];
392
+ const ARTICLE_MINDMAP_SELECTORS = [
393
+ '.mindmap', '.mind-map', '.MindMap', '.Mind-map',
394
+ '[data-type="mindmap"]', '[data-role="mindmap"]', '[data-widget="mindmap"]',
395
+ '[class*="MindMap"]', '[class*="mindMap"]'
396
+ ];
397
+ const PDF_BASE_CSS = `
398
+ body {
399
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "PingFang SC", "Hiragino Sans GB", "Microsoft YaHei", sans-serif;
400
+ margin: 0;
401
+ padding: 0;
402
+ background: #fff;
403
+ color: #1f2329;
404
+ }
405
+ .article-pdf-wrapper {
406
+ max-width: 860px;
407
+ margin: 0 auto;
408
+ padding: 48px 56px 60px;
409
+ }
410
+ .article-title {
411
+ font-size: 32px;
412
+ font-weight: 600;
413
+ margin-bottom: 16px;
414
+ line-height: 1.3;
415
+ color: #111;
416
+ }
417
+ .article-meta {
418
+ color: #7f8c8d;
419
+ font-size: 14px;
420
+ margin-bottom: 32px;
421
+ }
422
+ .article-content p,
423
+ .article-content div {
424
+ margin: 1.1em 0;
425
+ line-height: 1.9;
426
+ font-size: 16px;
427
+ }
428
+ .article-content p + p,
429
+ .article-content div + p,
430
+ .article-content p + div {
431
+ margin-top: 1.6em;
432
+ }
433
+ .article-content h2,
434
+ .article-content h3,
435
+ .article-content h4 {
436
+ margin-top: 2.2em;
437
+ margin-bottom: 1em;
438
+ font-weight: 600;
439
+ color: #111;
440
+ }
441
+ .article-content h2 {
442
+ font-size: 26px;
443
+ }
444
+ .article-content h3 {
445
+ font-size: 22px;
446
+ }
447
+ .article-content h4 {
448
+ font-size: 18px;
449
+ }
450
+ .article-content img {
451
+ max-width: 100%;
452
+ margin: 1.2em auto;
453
+ display: block;
454
+ border-radius: 4px;
455
+ }
456
+ .article-content blockquote {
457
+ margin: 1.3em 0;
458
+ padding: 0.8em 1.2em;
459
+ border-left: 4px solid #d0d7de;
460
+ background: #f8fafc;
461
+ color: #4b5563;
462
+ }
463
+ .article-content ul,
464
+ .article-content ol {
465
+ margin: 1em 0;
466
+ padding-left: 2em;
467
+ }
468
+ .article-content pre {
469
+ background: #0b1220;
470
+ color: #d9e2ff;
471
+ border-radius: 6px;
472
+ padding: 16px 20px;
473
+ overflow: auto;
474
+ margin: 1.4em 0;
475
+ font-size: 14px;
476
+ line-height: 1.6;
477
+ }
478
+ .article-content pre code {
479
+ background: transparent;
480
+ border: none;
481
+ padding: 0;
482
+ color: inherit;
483
+ }
484
+ .article-content code {
485
+ font-family: "Fira Code", "SFMono-Regular", Consolas, "Liberation Mono", Menlo, monospace;
486
+ background: rgba(15, 23, 42, 0.08);
487
+ border-radius: 4px;
488
+ padding: 0.2em 0.4em;
489
+ }
490
+ .article-content hr {
491
+ border: none;
492
+ border-top: 1px solid #e5e7eb;
493
+ margin: 2.4em 0;
494
+ }
495
+ `;
496
+
497
/**
 * Reports whether a path can be accessed on disk.
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} `true` when `fs.access` succeeds, `false` when it
 *   rejects for any reason.
 */
async function fileExists(filePath) {
  return fs.access(filePath).then(
    () => true,
    () => false,
  );
}
505
+
506
+ function normalizeCookieSameSite(value) {
507
+ if (!value) return undefined;
508
+ const lower = value.toString().toLowerCase();
509
+ if (lower.includes('lax')) return 'Lax';
510
+ if (lower.includes('strict')) return 'Strict';
511
+ if (lower.includes('none') || lower.includes('no_restriction')) return 'None';
512
+ return undefined;
513
+ }
514
+
515
/**
 * Returns a usable cookie domain, falling back to `.geekbang.org`.
 *
 * @param {*} domain - Domain taken from an imported cookie entry.
 * @returns {string} The trimmed domain, or `'.geekbang.org'` when the input is
 *   not a string or is empty after trimming.
 */
function normalizeCookieDomain(domain) {
  // A whitespace-only domain must also fall back; otherwise the cookie would
  // be created with an empty domain.
  const trimmed = typeof domain === 'string' ? domain.trim() : '';
  return trimmed || '.geekbang.org';
}
521
+
522
/**
 * Reads a JSON cookie export and converts it into cookie objects plus a
 * ready-to-send `Cookie` header.
 *
 * Entries without a string `name` or with an undefined `value` are skipped.
 * The expiry is taken from `expires` or `expirationDate` and is only set when
 * it converts to a finite number.
 *
 * @param {string} filePath - Absolute path, or a path relative to `process.cwd()`.
 * @returns {Promise<{cookieHeader: string, cookies: object[], absolutePath: string}>}
 *   The `name=value; ...` header, the normalised cookies and the resolved path.
 * @throws {Error} When the file cannot be read, is not valid JSON, is not an
 *   array, or contains no usable cookie entry.
 */
async function loadCookiesFromJsonFile(filePath) {
  const absolutePath = path.isAbsolute(filePath) ? filePath : path.resolve(process.cwd(), filePath);

  let raw;
  try {
    raw = await fs.readFile(absolutePath, 'utf-8');
  } catch (error) {
    throw new Error(`无法读取 cookie JSON 文件: ${error.message}`, { cause: error });
  }

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (error) {
    throw new Error(`cookie JSON 解析失败: ${error.message}`, { cause: error });
  }

  if (!Array.isArray(parsed)) {
    throw new Error('cookie JSON 必须是数组格式');
  }

  const cookies = parsed
    .filter((item) => item && typeof item.name === 'string' && item.value !== undefined)
    .map((item) => {
      const cookie = {
        name: item.name,
        value: typeof item.value === 'string' ? item.value : String(item.value ?? ''),
        domain: normalizeCookieDomain(item.domain),
        path: item.path || '/',
        secure: Boolean(item.secure),
        httpOnly: Boolean(item.httpOnly),
      };

      const sameSite = normalizeCookieSameSite(item.sameSite);
      if (sameSite) {
        cookie.sameSite = sameSite;
      }

      // Attach the expiry in the same pass as the rest of the cookie, and drop
      // values that do not convert to a number instead of storing NaN.
      const rawExpiry = item.expires || item.expirationDate;
      if (rawExpiry) {
        const expires = Math.floor(Number(rawExpiry));
        if (Number.isFinite(expires)) {
          cookie.expires = expires;
        }
      }

      return cookie;
    });

  if (cookies.length === 0) {
    throw new Error('cookie JSON 中没有有效的 cookie 项');
  }

  const cookieHeader = cookies.map((cookie) => `${cookie.name}=${cookie.value}`).join('; ');

  return { cookieHeader, cookies, absolutePath };
}
251
580
 
252
581
  // 解析 cookie 字符串
253
582
  function parseCookies(cookieString) {
@@ -272,96 +601,308 @@ function normalizeArticleHtml(html = '') {
272
601
  .replace(/href='\/\//gi, "href='https://");
273
602
  }
274
603
 
275
- async function fetchArticleData(context, articleId) {
276
- const maxAttempts = 3;
277
- const refererUrl = `${GEEKTIME_BASE_URL}/column/article/${articleId}`;
278
- let lastError = null;
604
/**
 * Turns an `<img src>` value into an absolute URL that can be downloaded.
 *
 * Protocol-relative URLs get `https:`, root-relative paths and other relative
 * references are resolved against `GEEKTIME_BASE_URL`, and `http(s)` URLs are
 * returned as they are (after trimming).
 *
 * @param {string} [rawSrc=''] - Raw `src` attribute value.
 * @returns {(string|null)} The absolute URL, or `null` for empty, `data:` and
 *   `blob:` sources and for references that cannot be parsed.
 */
function resolveImageUrl(rawSrc = '') {
  const candidate = rawSrc ? rawSrc.trim() : '';
  const isInlineSource = ['data:', 'blob:'].some((scheme) => candidate.startsWith(scheme));
  if (candidate === '' || isInlineSource) {
    return null;
  }

  if (candidate.startsWith('//')) {
    return `https:${candidate}`;
  }
  if (candidate.startsWith('/')) {
    return `${GEEKTIME_BASE_URL}${candidate}`;
  }
  if (/^https?:/i.test(candidate)) {
    return candidate;
  }

  try {
    return new URL(candidate, GEEKTIME_BASE_URL).toString();
  } catch {
    return null;
  }
}
279
625
 
280
- for (let attempt = 1; attempt <= maxAttempts; attempt++) {
626
/**
 * Downloads a binary resource through the given context's request client.
 *
 * Sends the default user agent, an image `accept` header and the site as
 * referer; the module-level cookie header is added when one is set.
 *
 * @param {object} context - Object exposing `request.get(url, options)`.
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<{buffer: *, contentType: string, finalUrl: string}>} The
 *   response body, its `content-type` header (or `''`) and the response URL.
 * @throws {Error} `HTTP <status> <statusText>` when the response is not OK;
 *   errors raised by the request itself propagate unchanged.
 */
async function fetchBinaryWithContext(context, url) {
  const requestHeaders = {
    'user-agent': DEFAULT_USER_AGENT,
    'accept': 'image/avif,image/webp,image/apng,image/*,*/*;q=0.8',
    'referer': GEEKTIME_BASE_URL,
  };
  if (globalCookieHeader) {
    requestHeaders.cookie = globalCookieHeader;
  }

  const response = await context.request.get(url, {
    headers: requestHeaders,
    failOnStatusCode: true,
  });
  if (!response.ok()) {
    throw new Error(`HTTP ${response.status()} ${response.statusText()}`);
  }

  const buffer = await response.body();
  const contentType = response.headers()['content-type'] || '';
  return { buffer, contentType, finalUrl: response.url() };
}
645
+
646
/**
 * Picks a file extension for a downloaded image.
 *
 * The extension of the URL path is preferred. When the URL has none, or its
 * suffix is not purely alphanumeric, the extension is derived from the
 * content type instead. `'bin'` is the last resort.
 *
 * @param {string} [resourceUrl=''] - URL the image was fetched from.
 * @param {string} [contentType=''] - `Content-Type` header of the response.
 * @returns {string} Lower-case extension without the leading dot.
 */
function determineImageExtension(resourceUrl = '', contentType = '') {
  let ext = '';
  if (resourceUrl) {
    try {
      const { pathname } = new URL(resourceUrl);
      ext = path.extname(pathname).slice(1);
    } catch {
      ext = '';
    }
  }

  // Suffixes such as "jpg!large" are not real extensions; discard them so the
  // content type gets a chance to decide.
  if (!/^[a-z0-9]+$/i.test(ext)) {
    ext = '';
  }

  if (!ext && contentType) {
    ext = String(mime.extension(contentType) || '');
  }

  return (ext || 'bin').toLowerCase();
}
298
664
 
299
- const bodyText = await response.text();
665
/**
 * Downloads one image and stores it in the assets directory.
 *
 * The file is named `article_<NNN>_<hash>.<ext>`, where `NNN` is the 1-based
 * article number padded to three digits and `hash` is the first ten hex
 * characters of the MD5 of the requested URL.
 *
 * @param {object} context - Context passed on to `fetchBinaryWithContext`.
 * @param {string} normalizedUrl - Absolute image URL.
 * @param {string} assetsDir - Directory the file is written into.
 * @param {number} articleIndex - Zero-based article index.
 * @returns {Promise<{fileUrl: string, localPath: string}>} The `file://` URL
 *   and the filesystem path of the saved image.
 */
async function downloadImageToLocal(context, normalizedUrl, assetsDir, articleIndex) {
  const fetched = await fetchBinaryWithContext(context, normalizedUrl);
  const extension = determineImageExtension(fetched.finalUrl || normalizedUrl, fetched.contentType);
  const urlDigest = crypto.createHash('md5').update(normalizedUrl).digest('hex').slice(0, 10);
  const articleTag = String(articleIndex + 1).padStart(3, '0');
  const localPath = path.join(assetsDir, `article_${articleTag}_${urlDigest}.${extension}`);

  await fs.writeFile(localPath, fetched.buffer);

  return {
    fileUrl: pathToFileURL(localPath).href,
    localPath,
  };
}
300
677
 
301
- if (!response.ok()) {
302
- throw new Error(`API请求失败: ${response.status()} ${response.statusText()} - ${bodyText.slice(0, 160)}`);
303
- }
678
/**
 * Converts a SameSite value into the lower-case vocabulary written to the
 * exported cookie JSON.
 *
 * Matching is by case-insensitive substring, tried in the order strict, lax,
 * none.
 *
 * @param {*} value - SameSite value (anything with a `toString()`).
 * @returns {('strict'|'lax'|'no_restriction'|'unspecified')} The export value;
 *   `'unspecified'` for falsy or unrecognised input.
 */
function mapSameSiteForExport(value) {
  const fallback = 'unspecified';
  if (!value) {
    return fallback;
  }

  const text = value.toString().toLowerCase();
  const table = [
    ['strict', 'strict'],
    ['lax', 'lax'],
    ['none', 'no_restriction'],
  ];
  for (const [needle, exported] of table) {
    if (text.includes(needle)) {
      return exported;
    }
  }
  return fallback;
}
686
+
687
/**
 * Refreshes the module-level cookie header from the cookies currently held by
 * the context.
 *
 * The header is left untouched when there is no context, when the context has
 * no cookies, or when reading them fails.
 *
 * @param {object} context - Object exposing an async `cookies()` method.
 * @returns {Promise<void>}
 */
async function updateGlobalCookieHeaderFromContext(context) {
  if (!context) {
    return;
  }
  try {
    const jar = (await context.cookies()) || [];
    const pairs = [];
    for (const { name, value } of jar) {
      pairs.push(`${name}=${value}`);
    }
    const joined = pairs.join('; ');
    if (joined) {
      globalCookieHeader = joined;
    }
  } catch {
    // Best effort: a failed refresh keeps the previous header.
  }
}
702
+
703
/**
 * Writes the context's current cookies to a JSON file.
 *
 * Each entry carries `domain`, `expirationDate`, `hostOnly`, `httpOnly`,
 * `name`, `path`, `sameSite`, `secure`, `session`, `storeId` and `value`.
 * Nothing is written when the context or path is missing or when the context
 * has no cookies. Failures are logged as a warning and never thrown.
 *
 * @param {object} context - Object exposing an async `cookies()` method.
 * @param {string} targetPath - File the JSON is written to.
 * @returns {Promise<void>}
 */
async function persistCookiesToFile(context, targetPath) {
  if (!context || !targetPath) {
    return;
  }
  try {
    const cookies = await context.cookies();
    if (!cookies || cookies.length === 0) {
      return;
    }

    const serialized = cookies.map((cookie) => {
      // Playwright reports session cookies with `expires: -1`, which is
      // truthy. Only a positive timestamp counts as a real expiry.
      const hasExpiry = typeof cookie.expires === 'number' && cookie.expires > 0;
      return {
        domain: cookie.domain,
        expirationDate: hasExpiry ? cookie.expires : undefined,
        hostOnly: !cookie.domain.startsWith('.'),
        httpOnly: cookie.httpOnly,
        name: cookie.name,
        path: cookie.path,
        sameSite: mapSameSiteForExport(cookie.sameSite),
        secure: cookie.secure,
        session: !hasExpiry,
        storeId: '0',
        value: cookie.value,
      };
    });

    await fs.writeFile(targetPath, JSON.stringify(serialized, null, 2), 'utf-8');
    console.log(chalk.gray(`🍪 已刷新 Cookie → ${targetPath}`));
  } catch (error) {
    console.log(chalk.yellow(`⚠️ 保存 Cookie 失败: ${error.message}`));
  }
}
304
729
 
305
- let json;
730
/**
 * Decodes a base64 `data:` URI and stores it as a file in the assets
 * directory.
 *
 * The file is named `article_<NNN>_inline_<MMM>.<ext>`, where `NNN` is the
 * 1-based article number and `MMM` the inline image index, both padded to
 * three digits. The extension comes from the URI's media type, or `bin`.
 *
 * @param {string} dataUri - The `data:<type>;base64,<payload>` string.
 * @param {string} assetsDir - Directory the file is written into.
 * @param {number} articleIndex - Zero-based article index.
 * @param {number} dataIndex - Index of the inline image within the article.
 * @returns {Promise<(string|null)>} The `file://` URL of the saved image, or
 *   `null` when the input is not a base64 data URI or decodes to nothing.
 */
async function saveDataUriImage(dataUri, assetsDir, articleIndex, dataIndex) {
  if (!dataUri || typeof dataUri !== 'string') {
    return null;
  }

  // The payload is matched with [\s\S] so that base64 data wrapped over
  // several lines is still accepted; `.` would stop at the first line break.
  const match = dataUri.match(/^data:(.+?);base64,([\s\S]+)$/i);
  if (!match) {
    return null;
  }
  const mimeType = match[1] || 'application/octet-stream';
  const base64Data = match[2];

  let buffer;
  try {
    buffer = Buffer.from(base64Data, 'base64');
  } catch {
    return null;
  }
  if (!buffer || buffer.length === 0) {
    return null;
  }

  const ext = mime.extension(mimeType) || 'bin';
  const articleTag = String(articleIndex + 1).padStart(3, '0');
  const inlineTag = String(dataIndex).padStart(3, '0');
  const filepath = path.join(assetsDir, `article_${articleTag}_inline_${inlineTag}.${ext}`);
  await fs.writeFile(filepath, buffer);
  return pathToFileURL(filepath).href;
}
755
+
756
/**
 * Downloads the images referenced by an HTML string and points each `<img>`
 * at its local `file://` copy.
 *
 * Remote images are fetched in batches of `EPUB_IMAGE_BATCH_SIZE`; URLs already
 * in `sharedCache` are reused without another download. Base64 `data:` images
 * are written to disk as well, and removed from the markup when they cannot be
 * saved. An image whose download fails keeps its original `src`.
 *
 * @param {object} context - Context passed on to `downloadImageToLocal`.
 * @param {string} htmlContent - HTML to process.
 * @param {string} assetsDir - Directory the image files are written into.
 * @param {number} articleIndex - Zero-based article index, used in file names.
 * @param {Map<string, string>} sharedCache - Remote URL to `file://` URL; new
 *   downloads are added to it.
 * @returns {Promise<{html: string, replaced: number}>} The rewritten HTML and
 *   the number of images stored by this call (successful downloads plus saved
 *   inline images).
 */
async function rewriteImagesWithLocalFiles(context, htmlContent, assetsDir, articleIndex, sharedCache) {
  if (!htmlContent || !htmlContent.includes('<img')) {
    return { html: htmlContent, replaced: 0 };
  }

  const $ = loadHtml(htmlContent, { decodeEntities: false });
  const images = $('img');
  if (images.length === 0) {
    return { html: htmlContent, replaced: 0 };
  }

  // First pass: collect the distinct remote URLs that still need downloading
  // and set the inline data URIs aside.
  const pendingDownloads = new Map();
  const dataUriImages = [];

  images.each((_, element) => {
    const originalSrc = $(element).attr('src') || '';
    if (/^data:/i.test(originalSrc.trim())) {
      dataUriImages.push({ element, src: originalSrc.trim() });
      return;
    }
    const normalizedUrl = resolveImageUrl(originalSrc);
    if (!normalizedUrl || sharedCache.has(normalizedUrl)) {
      return;
    }
    if (!pendingDownloads.has(normalizedUrl)) {
      pendingDownloads.set(normalizedUrl, null);
    }
  });

  const downloadTargets = Array.from(pendingDownloads.keys());
  // Only downloads that succeed are counted, so the reported total matches
  // the files that exist on disk.
  let downloadedCount = 0;
  for (let i = 0; i < downloadTargets.length; i += EPUB_IMAGE_BATCH_SIZE) {
    const batch = downloadTargets.slice(i, i + EPUB_IMAGE_BATCH_SIZE).map(async (targetUrl) => {
      try {
        const info = await downloadImageToLocal(context, targetUrl, assetsDir, articleIndex);
        sharedCache.set(targetUrl, info.fileUrl);
        pendingDownloads.set(targetUrl, info.fileUrl);
        downloadedCount++;
      } catch (error) {
        console.log(chalk.yellow(` ⚠️ 图片下载失败: ${targetUrl} (${error.message})`));
        pendingDownloads.set(targetUrl, null);
      }
    });
    await Promise.all(batch);
  }

  // Second pass: swap in the local URL wherever one is available.
  images.each((_, element) => {
    const originalSrc = $(element).attr('src') || '';
    if (/^data:/i.test(originalSrc.trim())) {
      return;
    }
    const normalizedUrl = resolveImageUrl(originalSrc);
    if (!normalizedUrl) {
      return;
    }
    const localUrl = sharedCache.get(normalizedUrl) || pendingDownloads.get(normalizedUrl);
    if (localUrl) {
      $(element).attr('src', localUrl);
    }
  });

  let processedInlineImages = 0;
  for (let i = 0; i < dataUriImages.length; i++) {
    const item = dataUriImages[i];
    try {
      const localUrl = await saveDataUriImage(item.src, assetsDir, articleIndex, i);
      if (localUrl) {
        $(item.element).attr('src', localUrl);
        processedInlineImages++;
      } else {
        $(item.element).remove();
      }
    } catch (error) {
      console.log(chalk.yellow(` ⚠️ 内联图片处理失败: ${error.message}`));
      $(item.element).remove();
    }
  }

  const finalHtml = $.root().html() || htmlContent;

  return {
    html: finalHtml,
    replaced: downloadedCount + processedInlineImages,
  };
}
842
+
843
/**
 * Runs `rewriteImagesWithLocalFiles` over every successfully fetched article
 * and reports progress through a spinner.
 *
 * One URL cache is shared by all articles. Entries that are missing, failed
 * or have no content are passed through untouched, as are articles whose
 * image processing throws (a warning is printed for those).
 *
 * @param {object} context - Context passed on to the image rewriter.
 * @param {Array<object>} contentResults - Results with `success` and `content`.
 * @param {string} assetsDir - Directory the image files are written into.
 * @returns {Promise<Array<object>>} Results in the original order, with
 *   `content` replaced where the rewrite succeeded.
 */
async function rewriteEpubContentImages(context, contentResults, assetsDir) {
  const urlCache = new Map();
  const rewritten = [];
  let articleCount = 0;
  let imageCount = 0;

  const spinner = ora('正在缓存 EPUB 图片...').start();

  for (const [index, entry] of contentResults.entries()) {
    const isUsable = Boolean(entry && entry.success && entry.content);
    if (!isUsable) {
      rewritten.push(entry);
      continue;
    }

    try {
      const outcome = await rewriteImagesWithLocalFiles(context, entry.content, assetsDir, index, urlCache);
      imageCount += outcome.replaced;
      if (outcome.replaced > 0) {
        articleCount++;
      }
      rewritten.push({ ...entry, content: outcome.html });
    } catch (error) {
      // Pause the spinner so the warning is printed on a line of its own.
      spinner.stop();
      console.log(chalk.yellow(`⚠️ 处理第 ${index + 1} 篇文章图片失败: ${error.message}`));
      spinner.start();
      rewritten.push(entry);
    }
  }

  if (imageCount === 0) {
    spinner.stop();
    console.log(chalk.gray('📷 没有检测到需要缓存的图片'));
  } else {
    spinner.succeed(`已缓存 EPUB 图片: ${imageCount} 张(${articleCount} 篇文章)`);
  }

  return rewritten;
}
881
+
882
/**
 * Creates a uniquely named scratch directory for downloaded assets.
 *
 * The name is `TEMP_ASSET_PREFIX`, the current time in base 36 and six random
 * hex characters, joined by underscores.
 *
 * @param {string} baseDir - Parent directory.
 * @returns {Promise<string>} Path of the created directory.
 */
async function createTempAssetsDir(baseDir) {
  const timePart = Date.now().toString(36);
  const randomPart = Math.random().toString(16).slice(2, 8);
  const tempDir = path.join(baseDir, `${TEMP_ASSET_PREFIX}_${timePart}_${randomPart}`);
  await fs.mkdir(tempDir, { recursive: true });
  return tempDir;
}
887
+
888
/**
 * Deletes a scratch directory and everything in it.
 *
 * Does nothing for a falsy path. A failed removal is logged and not thrown.
 *
 * @param {string} dir - Directory to remove.
 * @returns {Promise<void>}
 */
async function cleanupTempAssetsDir(dir) {
  if (dir) {
    await fs.rm(dir, { recursive: true, force: true }).catch((error) => {
      console.log(chalk.gray(`清理临时目录失败: ${error.message}`));
    });
  }
}
331
896
 
332
897
  async function sanitizeArticleHtml(page, rawHtml) {
333
- return page.evaluate((html) => {
898
+ return page.evaluate(({ html, removalSelectors, pluginKeywords, mindmapSelectors }) => {
334
899
  const template = document.createElement('template');
335
900
  template.innerHTML = html;
336
901
 
337
- const removalSelectors = [
338
- 'nav', 'header', 'footer', 'aside',
339
- '.comment', '.comments', '.Index_comment',
340
- '.recommend', '.recommendation', '.related', '.advertisement', '.ad', '.banner',
341
- '.subscribe', '.subscription', '.toolbar', '.Index_shareIcons_1vtJa',
342
- '.keyboard-wrapper', '.app-download', '.article-actions', '.article-bottom',
343
- '.note', '.notes', '.annotation', '.translation', '.trans', '.translator',
344
- '.audio', '.audio-player', '.voice', '.player', '.geek-player', '.podcast', '.radio',
345
- '.reward', '.appreciate', '.appreciation', '.donate', '.sponsor', '.thanks', '.support',
346
- '.qrcode', '.qr-code', '.qr', '.promotion', '.promo', '.ad-banner',
347
- '.copyright', '.statement', '.disclaimer',
348
- '.app-download-banner', '.article-plugin', '.article-notification', '.float-bar',
349
- 'audio', 'video',
350
- '[class*="Note"]', '[class*="note"]', '[class*="Translation"]', '[class*="translation"]',
351
- '[class*="Audio"]', '[class*="audio"]', '[class*="Reward"]', '[class*="reward"]',
352
- '[data-plugin]', '[data-track]', '[data-track-section]', '[data-translation]', '[data-audio]',
353
- '[data-role="toolbar"]',
354
- 'button', 'iframe', 'script', 'style'
355
- ];
356
902
  removalSelectors.forEach(selector => {
357
903
  template.content.querySelectorAll(selector).forEach(el => el.remove());
358
904
  });
359
905
 
360
- const pluginKeywords = [
361
- 'note', 'translation', 'audio', 'player', 'reward', 'donate',
362
- 'appreciation', 'sponsor', 'qrcode', 'toolbar', 'plugin',
363
- 'copyright', 'geeknote', 'bilingual'
364
- ];
365
906
  const pluginElements = Array.from(template.content.querySelectorAll('*')).filter(el => {
366
907
  const className = (el.className || '').toString().toLowerCase();
367
908
  const idValue = (el.id || '').toString().toLowerCase();
@@ -372,11 +913,6 @@ async function sanitizeArticleHtml(page, rawHtml) {
372
913
  });
373
914
  pluginElements.forEach(el => el.remove());
374
915
 
375
- const mindmapSelectors = [
376
- '.mindmap', '.mind-map', '.MindMap', '.Mind-map',
377
- '[data-type="mindmap"]', '[data-role="mindmap"]', '[data-widget="mindmap"]',
378
- '[class*="MindMap"]', '[class*="mindMap"]'
379
- ];
380
916
  mindmapSelectors.forEach(selector => {
381
917
  template.content.querySelectorAll(selector).forEach(el => el.remove());
382
918
  });
@@ -428,15 +964,23 @@ async function sanitizeArticleHtml(page, rawHtml) {
428
964
 
429
965
  const images = template.content ? template.content.querySelectorAll('img') : [];
430
966
  images.forEach(img => {
431
- if (!img.getAttribute('loading')) {
432
- img.setAttribute('loading', 'lazy');
433
- }
967
+ img.setAttribute('loading', 'eager');
968
+ img.setAttribute('decoding', 'sync');
434
969
  img.style.maxWidth = '100%';
435
970
  img.style.height = 'auto';
436
971
  });
437
972
 
438
973
  return template.innerHTML;
439
- }, rawHtml);
974
+ }, {
975
+ html: rawHtml,
976
+ removalSelectors: ARTICLE_REMOVAL_SELECTORS,
977
+ pluginKeywords: ARTICLE_PLUGIN_KEYWORDS,
978
+ mindmapSelectors: ARTICLE_MINDMAP_SELECTORS
979
+ });
980
+ }
981
+
982
/**
 * Collapses every run of whitespace into a single space and strips leading
 * and trailing whitespace.
 *
 * @param {string} [text=''] - Text to normalise.
 * @returns {string} The normalised text.
 */
function normalizeTextContent(text = '') {
  const words = text.split(/\s+/).filter((word) => word !== '');
  return words.join(' ');
}
441
985
 
442
986
  function escapeHtml(text = '') {
@@ -448,59 +992,650 @@ function escapeHtml(text = '') {
448
992
  .replace(/'/g, '&#39;');
449
993
  }
450
994
 
451
- function buildPrintableHtml(title, sanitizedHtml) {
452
- const baseCss = `
453
- body {
454
- font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
455
- font-size: 16px;
456
- line-height: 1.8;
457
- color: #1f2329;
458
- margin: 0;
459
- padding: 40px;
460
- background: #fff;
461
- }
462
-
463
- .article-print-wrapper {
464
- max-width: 900px;
465
- margin: 0 auto;
466
- }
467
-
468
- .article-print-wrapper h1 {
469
- font-size: 32px;
470
- line-height: 1.4;
471
- margin-bottom: 24px;
472
- }
473
-
474
- a {
475
- color: #0f5ef2;
476
- text-decoration: none;
477
- }
478
-
479
- pre {
480
- background: #f7f7f7;
481
- padding: 16px;
482
- border-radius: 6px;
483
- overflow: auto;
995
/**
 * Remove the first <h1>/<h2> of an article body when its text equals the
 * article title, so the title is not printed twice once the body is placed
 * under a separately rendered title heading.
 *
 * Best effort: any parsing failure returns the input unchanged.
 *
 * @param {string} html - Article body HTML fragment.
 * @param {string} [title=''] - Article title to compare against.
 * @returns {string} The HTML without the duplicated heading, or `html` itself
 *   when nothing could be parsed or serialised.
 */
function removeDuplicateTitle(html, title = '') {
  if (!html || !title) {
    return html;
  }
  const normalizedTitle = normalizeTextContent(title);
  if (!normalizedTitle) {
    return html;
  }
  try {
    // Third argument `false` parses the input as a fragment. In the default
    // document mode cheerio wraps the content in <html><head></head><body>,
    // and `$.root().html()` would leak that wrapper into the article body.
    const $ = loadHtml(html, { decodeEntities: false }, false);
    const firstHeading = $('h1, h2').first();
    if (firstHeading.length) {
      const headingText = normalizeTextContent(firstHeading.text());
      if (headingText && headingText === normalizedTitle) {
        firstHeading.remove();
      }
    }
    return $.root().html() || html;
  } catch {
    // Deliberate fallback: keep the original markup rather than fail the download.
    return html;
  }
}
486
1017
 
1018
/**
 * Build the complete HTML document string that is later rendered to PDF.
 *
 * The document sets <base href> to GEEKTIME_BASE_URL and inlines
 * PDF_BASE_CSS, PRINT_FIX_CSS and CODE_HIGHLIGHT_CSS in a single <style>.
 * `title` and `articleMeta` are passed through escapeHtml; `sanitizedHtml`
 * is interpolated verbatim, so it must already be safe markup.
 *
 * @param {string} title - Article title, rendered as the <h1>.
 * @param {string} sanitizedHtml - Article body markup, inserted unescaped.
 * @param {string} [articleMeta=''] - Optional meta line; the
 *   `.article-meta` element is omitted when this is empty.
 * @returns {string} The HTML document source.
 */
+ function buildPdfHtml(title, sanitizedHtml, articleMeta = '') {
487
1019
  return `
488
1020
  <!DOCTYPE html>
489
1021
  <html lang="zh-CN">
490
1022
  <head>
491
1023
  <meta charset="utf-8">
492
1024
  <base href="${GEEKTIME_BASE_URL}">
493
- <style>${baseCss}${PRINT_FIX_CSS}</style>
1025
+ <style>${PDF_BASE_CSS}${PRINT_FIX_CSS}${CODE_HIGHLIGHT_CSS}</style>
494
1026
  </head>
495
1027
  <body>
496
- <div class="article-print-wrapper">
497
- <h1>${escapeHtml(title)}</h1>
498
- ${sanitizedHtml}
499
- </div>
1028
+ <article class="article-pdf-wrapper">
1029
+ <section class="article-content">
1030
+ <h1 class="article-title">${escapeHtml(title)}</h1>
1031
+ ${articleMeta ? `<div class="article-meta">${escapeHtml(articleMeta)}</div>` : ''}
1032
+ ${sanitizedHtml}
1033
+ </section>
1034
+ </article>
500
1035
  </body>
501
1036
  </html>`;
502
1037
  }
503
1038
 
1039
/**
 * Normalise code snippets inside article HTML into
 * `<pre class="code-block"><code>…</code></pre>` blocks.
 *
 * Passes, in order:
 *   1. block-like <code> elements that are not a direct child of <pre> get wrapped;
 *   2. every <pre> gets the `code-block` class and an inner <code>;
 *   3. elements with code-like class names are wrapped when their text looks like code;
 *   4. a <figure> whose only child holds its only <pre> is replaced by that <pre>;
 *   5. highlighted containers are flattened to plain text with line breaks;
 *   6. simplebar wrapper elements are unwrapped and scrollbar tracks removed.
 *
 * Best effort: any failure returns the input unchanged.
 *
 * @param {string} html - Article body HTML fragment.
 * @returns {string} The rewritten HTML, or `html` itself on failure/empty input.
 */
function enhanceCodeBlocks(html) {
  if (!html) return html;
  try {
    // Fragment mode (third argument `false`): document mode would wrap the
    // content in <html><head></head><body> and `$.root().html()` would
    // return that wrapper together with the article body.
    const $ = loadHtml(html, { decodeEntities: false }, false);

    // Cheerio exposes HTML tag names in lower case, so normalise before comparing.
    const lowerTagName = (node) => ((node && (node.tagName || node.name)) || '').toLowerCase();

    const wrapCodeElement = ($source, innerHtml) => {
      const wrapper = $('<pre class="code-block"></pre>');
      const codeEl = $('<code></code>').html(innerHtml);
      wrapper.append(codeEl);
      $source.replaceWith(wrapper);
    };

    // Pass 1: promote multi-line / long <code> elements to real blocks.
    $('code').each((_, element) => {
      const $el = $(element);
      const parent = $el.parent();
      const text = $el.text() || '';
      const isBlocky = text.includes('\n') || text.length > 120 || $el.html().includes('<br');
      // Comparing against upper-case 'PRE' never matched, which nested a
      // second <pre> inside every existing <pre><code> pair.
      if (isBlocky && parent.length && lowerTagName(parent[0]) !== 'pre') {
        wrapCodeElement($el, $el.html());
      }
    });

    // Pass 2: make sure every <pre> is tagged and contains a <code>.
    $('pre').each((_, element) => {
      const $el = $(element);
      if (!$el.hasClass('code-block')) {
        $el.addClass('code-block');
      }
      if ($el.find('code').length === 0) {
        const text = $el.html();
        $el.empty().append($('<code></code>').html(text));
      }
    });

    // Pass 3: wrap elements whose class names suggest they hold code.
    const codeLikeSelectors = [
      '[class*="code"]',
      '[class*="Code"]',
      '[class*="code-block"]',
      '[class*="CodeBlock"]',
      '[class*="hljs"]',
      '[class*="language-"]',
      '.highlight',
      '.prism-code'
    ];
    const blockTags = ['P', 'DIV', 'SECTION', 'ARTICLE', 'UL', 'OL', 'TABLE', 'IMG', 'FIGURE'];
    const isLikelyCodeText = (text = '') => {
      const trimmed = text.trim();
      if (trimmed.length === 0) return false;
      if (trimmed.length > 1200) return false;
      return trimmed.includes('\n') || trimmed.includes('{') || trimmed.includes(';') || trimmed.includes(' ');
    };
    $(codeLikeSelectors.join(',')).each((_, element) => {
      const $el = $(element);
      // Skip anything that is, contains, or already lives inside a <pre>;
      // wrapping a <code>/<span> inside a <pre> would nest code blocks.
      if ($el.closest('pre').length > 0 || $el.find('pre').length > 0) {
        return;
      }
      const hasBlockChildren = blockTags.some(tag => $el.find(tag).length > 0);
      if (hasBlockChildren) {
        return;
      }
      const text = $el.text() || '';
      if (!isLikelyCodeText(text)) {
        return;
      }
      wrapCodeElement($el, $el.html());
    });

    // Pass 4: a <figure> that only wraps a single <pre> is replaced by it.
    $('figure').each((_, element) => {
      const $el = $(element);
      if ($el.find('pre').length === 1 && $el.children().length === 1) {
        $el.replaceWith($el.find('pre').first());
      }
    });

    // Pass 5: flatten highlighted containers into plain-text code blocks.
    const highlightSelectors = [
      '[class*="hljs"]',
      '[class*="language-"]',
      '.simplebar-content',
      '[data-language]',
      '[data-code-block]',
      '[class*="RichContent"]'
    ];
    const containerClassHints = ['simplebar', 'code', 'hljs', 'prism', 'syntax', 'monaco', 'ace', 'terminal', 'shell'];
    const containerStyleHints = ['white-space: pre', 'white-space:pre', 'font-family: monospace', 'font-family:monospace'];
    const inlineTags = new Set(['span', 'code', 'em', 'strong', 'b', 'i', 'u', 'a', 'label']);
    const newlineTags = new Set(['DIV', 'P', 'LI', 'SECTION', 'ARTICLE', 'FIGURE', 'PRE', 'CODE', 'BR', 'TR', 'TD', 'TH']);
    const looksLikeCodeBlock = (text = '') => {
      if (!text) return false;
      const trimmed = text.trim();
      if (!trimmed) return false;
      if (trimmed.includes('\n')) return true;
      const keywords = ['{', '}', ';', '=>', '->', '#!', 'SELECT ', 'INSERT ', 'docker ', 'kubectl ', 'sudo ', 'printf', 'def ', 'class ', 'function ', 'const ', 'let ', 'var ', 'public ', 'private ', 'import ', 'package ', 'namespace ', 'http '];
      return keywords.some(keyword => trimmed.includes(keyword));
    };
    // Collect text recursively, emitting '\n' after each block-level tag.
    const getTextWithBreaks = (node) => {
      if (!node) return '';
      if (node.type === 'text') {
        return node.data || '';
      }
      if (!node.children || node.children.length === 0) {
        return newlineTags.has((node.tagName || node.name || '').toUpperCase()) ? '\n' : '';
      }
      let text = '';
      for (const child of node.children) {
        text += getTextWithBreaks(child);
      }
      if (newlineTags.has((node.tagName || node.name || '').toUpperCase())) {
        text += '\n';
      }
      return text;
    };
    // Unify line endings, strip trailing whitespace, drop leading/trailing
    // blank lines and collapse consecutive blank lines into one.
    const normalizeCodeText = (text = '') => {
      const lines = text
        .replace(/\r\n?/g, '\n')
        .split('\n')
        .map(line => line.replace(/\u00a0/g, ' ').replace(/\t/g, ' ').replace(/\s+$/, ''));
      while (lines.length && !lines[0].trim()) {
        lines.shift();
      }
      while (lines.length && !lines[lines.length - 1].trim()) {
        lines.pop();
      }
      const result = [];
      let previousBlank = false;
      for (const line of lines) {
        const isBlank = line.trim().length === 0;
        if (isBlank && previousBlank) {
          continue;
        }
        result.push(line);
        previousBlank = isBlank;
      }
      return result.join('\n').trim();
    };
    const convertToCodeBlock = ($target) => {
      if (!$target || !$target.length) {
        return false;
      }
      const rawText = getTextWithBreaks($target[0]) || '';
      const normalized = normalizeCodeText(rawText);
      if (!looksLikeCodeBlock(normalized)) {
        return false;
      }
      const $pre = $('<pre class="code-block"></pre>');
      const $code = $('<code></code>').text(normalized);
      $pre.append($code);
      $target.replaceWith($pre);
      return true;
    };
    const processedCandidates = new Set();
    $(highlightSelectors.join(',')).each((_, node) => {
      const $start = $(node);
      if (!$start || !$start.length) {
        return;
      }
      // Walk up at most 8 levels and keep the outermost non-inline ancestor
      // whose class or inline style hints at a code container.
      let $candidate = null;
      let $current = $start;
      for (let depth = 0; depth < 8 && $current && $current.length; depth++) {
        const rawTag = ($current[0]?.tagName || $current[0]?.name || '').toLowerCase();
        const classAttr = ($current.attr('class') || '').toLowerCase();
        const styleAttr = ($current.attr('style') || '').toLowerCase();
        const hasClassHint = containerClassHints.some(keyword => classAttr.includes(keyword));
        const hasStyleHint = containerStyleHints.some(keyword => styleAttr.includes(keyword));
        if (!inlineTags.has(rawTag) && (hasClassHint || hasStyleHint)) {
          $candidate = $current;
        }
        $current = $current.parent();
      }
      if (!$candidate || !$candidate.length || $candidate.is('pre')) {
        return;
      }
      const key = $candidate[0];
      if (processedCandidates.has(key)) {
        return;
      }
      if (convertToCodeBlock($candidate)) {
        processedCandidates.add(key);
      }
    });

    // Pass 6: unwrap simplebar scaffolding around code blocks / empty wrappers.
    const simplebarWrappers = [
      '.simplebar-wrapper',
      '.simplebar-height-auto-observer-wrapper',
      '.simplebar-height-auto-observer',
      '.simplebar-mask',
      '.simplebar-offset',
      '.simplebar-content-wrapper',
      '.simplebar-placeholder'
    ];
    simplebarWrappers.forEach(selector => {
      $(selector).each((_, element) => {
        const $el = $(element);
        if ($el.find('pre.code-block').length > 0 || !$el.text().trim()) {
          $el.replaceWith($el.contents());
        }
      });
    });
    $('.simplebar-track, .simplebar-scrollbar').remove();

    return $.root().html() || html;
  } catch {
    // Deliberate fallback: keep the original markup rather than fail the download.
    return html;
  }
}
1240
+
1241
/**
 * Scan the visible text of the current page for login / paywall / permission
 * prompts and return a diagnostic message for the first rule that matches.
 *
 * @param {object} page - Page object exposing `evaluate(fn)`.
 * @returns {Promise<string|null>} The message of the first matching rule, or
 *   `null` when the page has no text or no rule matches.
 */
async function detectAccessIssuesOnPage(page) {
  const scanBodyText = () => {
    const rawText = document.body ? (document.body.innerText || '') : '';
    const flattened = rawText ? rawText.replace(/\s+/g, ' ').trim() : '';
    if (!flattened) {
      return null;
    }

    // Rules are evaluated in order; the first rule with any keyword hit wins.
    const rules = [
      {
        keywords: ['请先登录', '重新登录', '立即登录', '登录后'],
        message: '页面提示需要登录,Cookie 可能已失效或未正确导入'
      },
      {
        keywords: ['试看结束', '购买专栏', '立即订阅', '购买课程', '仅对付费用户开放', '开通会员'],
        message: '检测到购买/试看提示,可能未订阅该专栏或 Cookie 已失效'
      },
      {
        keywords: ['暂无权限', '没有权限', '权限不足'],
        message: '账号没有访问该专栏的权限'
      }
    ];

    const haystack = flattened.toLowerCase();
    const matchedRule = rules.find(rule =>
      rule.keywords.some(word => haystack.includes(word.toLowerCase()))
    );
    return matchedRule ? matchedRule.message : null;
  };

  return page.evaluate(scanBodyText);
}
1278
+
1279
/**
 * Poll the page until one of ARTICLE_CONTENT_SELECTORS matches an element.
 *
 * Selectors are tried in list order on every round; rounds are separated by
 * a 300 ms pause.
 *
 * @param {object} page - Page object exposing `$` and `waitForTimeout`.
 * @param {number} [timeout=60000] - Maximum polling time in milliseconds.
 * @returns {Promise<string|null>} The first selector that matched, or `null`
 *   when the timeout elapsed without a match.
 */
async function waitForArticleContentSelector(page, timeout = 60000) {
  const startedAt = Date.now();
  const withinBudget = () => (Date.now() - startedAt) < timeout;

  while (withinBudget()) {
    for (const candidate of ARTICLE_CONTENT_SELECTORS) {
      const found = await page.$(candidate);
      if (!found) {
        continue;
      }
      // Only existence matters; release the handle before returning.
      await found.dispose();
      return candidate;
    }
    await page.waitForTimeout(300);
  }

  return null;
}
1293
+
1294
/**
 * Scroll the page down step by step, then jump back to the top.
 *
 * Scrolling stops once the viewport bottom is within 50px of the end of
 * `document.body` or after `maxIterations` steps, whichever comes first.
 *
 * @param {object} page - Page object exposing `evaluate(fn, arg)`.
 * @param {object} [options]
 * @param {number} [options.step=400] - Pixels scrolled per tick.
 * @param {number} [options.delay=120] - Milliseconds between ticks.
 * @param {number} [options.maxIterations=80] - Upper bound on ticks.
 * @returns {Promise<void>}
 */
async function autoScrollArticle(page, { step = 400, delay = 120, maxIterations = 80 } = {}) {
  // Runs inside the page, so it must not capture anything from this scope.
  const scrollThroughPage = (settings) => new Promise((done) => {
    let ticks = 0;
    const handle = setInterval(() => {
      window.scrollBy(0, settings.step);
      ticks += 1;

      const viewportBottom = window.scrollY + window.innerHeight;
      const atBottom = viewportBottom >= document.body.scrollHeight - 50;
      if (!atBottom && ticks < settings.maxIterations) {
        return;
      }

      clearInterval(handle);
      window.scrollTo(0, 0);
      done();
    }, settings.delay);
  });

  await page.evaluate(scrollThroughPage, { step, delay, maxIterations });
}
1311
+
1312
/**
 * Open an article page, extract its body from the rendered DOM and return
 * cleaned HTML ready for PDF rendering.
 *
 * Steps: navigate, wait (best effort) for network idle, scroll through the
 * page, locate the content container, clone and strip it in-page, then run
 * normalizeArticleHtml, sanitizeArticleHtml, removeDuplicateTitle and
 * enhanceCodeBlocks on the result.
 *
 * @param {object} page - Page object used for navigation and evaluation.
 * @param {object} article - Article record; reads `url`, `id`,
 *   `originalTitle` and `title`.
 * @param {number} [timeout=60000] - Timeout in ms for navigation and for
 *   locating the content selector.
 * @returns {Promise<string>} Cleaned article HTML.
 * @throws {Error} When navigation fails, the response is not OK, the content
 *   cannot be located or read, or the content is (or becomes) empty.
 */
async function fetchArticleContentFromPage(page, article, timeout = 60000) {
  const targetUrl = article.url || `${GEEKTIME_BASE_URL}/column/article/${article.id}`;

  // Prefer a specific access diagnosis over the generic fallback message.
  const throwWithAccessHint = async (fallbackMessage) => {
    const issue = await detectAccessIssuesOnPage(page);
    throw new Error(issue || fallbackMessage);
  };

  // Runs inside the page: works on a clone so the live DOM stays untouched.
  const extractInPage = (el) => {
    const clone = el.cloneNode(true);

    const removalSelectors = [
      '.article-share',
      '.article-actions',
      '.article-copyright',
      '.article-bottom',
      '.reward',
      '.share',
      '.Index_recommend',
      '.recommend',
      '.audio-player',
      '.AudioPlayer',
      '.voice-player',
      '.VoicePlayer',
      '.audio-wrapper',
      '.AudioWrapper',
      '.geek-player',
      '.Player',
      '.plugin',
      '.Plugin',
      '[data-widget="audio"]',
      '[data-widget="Audio"]',
      '[data-role="audio"]',
      '.comment-area',
      '.CommentArea',
      '.comment-wrapper',
      '.CommentWrapper',
      '#comments',
      '#comment',
      '.comments',
      '.Comments'
    ];
    for (const sel of removalSelectors) {
      clone.querySelectorAll(sel).forEach(node => node.remove());
    }

    // Resolve a raw attribute value to an absolute URL; '' means unusable.
    const toAbsoluteUrl = (value) => {
      if (!value || typeof value !== 'string') {
        return '';
      }
      const trimmed = value.trim();
      if (!trimmed || trimmed.startsWith('blob:')) {
        return '';
      }
      if (trimmed.startsWith('data:') || /^https?:/i.test(trimmed)) {
        return trimmed;
      }
      if (trimmed.startsWith('//')) {
        return `${location.protocol}${trimmed}`;
      }
      try {
        return new URL(trimmed, location.href).href;
      } catch {
        return '';
      }
    };

    // `src` first, then the lazy-loading attributes in this order.
    const imageSourceAttrs = [
      'src',
      'data-src',
      'data-original',
      'data-actualsrc',
      'data-url',
      'data-image',
      'data-origin',
      'data-thumbnail',
      'data-bigimgsrc',
      'data-download',
      'data-href'
    ];

    clone.querySelectorAll('img').forEach(img => {
      let resolved = '';
      for (const attr of imageSourceAttrs) {
        resolved = toAbsoluteUrl(img.getAttribute(attr));
        if (resolved) {
          break;
        }
      }

      if (resolved) {
        img.setAttribute('src', resolved);
      } else {
        img.remove();
      }
    });

    return {
      html: clone.innerHTML,
      textLength: clone.innerText ? clone.innerText.trim().length : 0
    };
  };

  let response;
  try {
    response = await page.goto(targetUrl, { waitUntil: 'domcontentloaded', timeout });
  } catch (error) {
    throw new Error(`页面加载失败: ${error.message}`);
  }

  if (response && !response.ok()) {
    throw new Error(`页面响应异常: HTTP ${response.status()} ${response.statusText()}`);
  }

  try {
    await page.waitForLoadState('networkidle', { timeout: Math.min(10000, timeout) });
  } catch {
    // Some pages never go idle; a networkidle timeout is not an error here.
  }

  await autoScrollArticle(page);
  await page.waitForTimeout(500);

  const selector = await waitForArticleContentSelector(page, timeout);
  if (!selector) {
    await throwWithAccessHint('未能定位到文章正文,请重试或检查 Cookie 是否有效');
  }

  let extraction;
  try {
    extraction = await page.$eval(selector, extractInPage);
  } catch (error) {
    throw new Error(`读取文章内容失败: ${error.message}`);
  }

  const hasUsableBody = Boolean(extraction && extraction.html) && !(extraction.textLength < 20);
  if (!hasUsableBody) {
    await throwWithAccessHint('正文内容为空,可能是 Cookie 失效或只获取到试看内容');
  }

  const normalizedHtml = normalizeArticleHtml(extraction.html);
  const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
  if (!sanitizedHtml || sanitizedHtml.trim().length === 0) {
    throw new Error('正文清洗后为空,可能是页面结构变化');
  }

  const withoutDuplicateTitle = removeDuplicateTitle(
    sanitizedHtml,
    article.originalTitle || article.title || ''
  );
  return enhanceCodeBlocks(withoutDuplicateTitle);
}
1470
+
1471
/**
 * Decide whether a content-fetch failure is worth retrying.
 *
 * Messages mentioning cookies, login, subscription, trial, permission or
 * purchase are treated as permanent; everything else, including an empty
 * message, is retryable.
 *
 * @param {string} [message=''] - Error message to classify.
 * @returns {boolean} `true` when the error may succeed on retry.
 */
function isRetryableContentError(message = '') {
  if (!message) {
    return true;
  }

  const haystack = message.toLowerCase();
  const permanentMarkers = [
    'cookie', '登录', '登陆', '订阅', '试看', '权限', '购买', '未授权', '无权限'
  ];
  for (const marker of permanentMarkers) {
    if (haystack.includes(marker)) {
      return false;
    }
  }
  return true;
}
1479
+
1480
/**
 * Fetch article content with retries for transient failures.
 *
 * Errors classified as non-retryable by isRetryableContentError, and the
 * error of the last attempt, are rethrown as-is. Between attempts the page
 * is reset to about:blank and the wait grows linearly (`delayMs * attempt`).
 *
 * @param {object} page - Page object used for the fetch.
 * @param {object} article - Article record passed to fetchArticleContentFromPage.
 * @param {object} [options]
 * @param {number} [options.timeout=60000] - Per-attempt timeout in ms.
 * @param {number} [options.maxAttempts=3] - Maximum number of attempts.
 * @param {number} [options.delayMs=1500] - Base back-off delay in ms.
 * @returns {Promise<string>} Cleaned article HTML.
 * @throws {Error} The last error seen, or a generic error when no attempt ran.
 */
async function fetchArticleContentWithRetry(page, article, options = {}) {
  const timeout = options.timeout === undefined ? 60000 : options.timeout;
  const maxAttempts = options.maxAttempts === undefined ? 3 : options.maxAttempts;
  const delayMs = options.delayMs === undefined ? 1500 : options.delayMs;

  let lastError = null;
  let attempt = 1;

  while (attempt <= maxAttempts) {
    try {
      // Short settle pause before every retry (not before the first try).
      if (attempt > 1) {
        await page.waitForTimeout(400);
      }
      return await fetchArticleContentFromPage(page, article, timeout);
    } catch (error) {
      lastError = error;
      const message = error?.message || '';
      const isFinalAttempt = attempt === maxAttempts;
      if (isFinalAttempt || !isRetryableContentError(message)) {
        throw error;
      }

      const waitTime = delayMs * attempt;
      if (process.env.DEBUG) {
        console.log(chalk.gray(`重试文章 ${article.id} (第${attempt}次失败: ${message}),等待 ${waitTime}ms`));
      }

      try {
        // Reset the page so the next attempt starts from a clean document.
        await page.goto('about:blank', { waitUntil: 'domcontentloaded', timeout: 5000 });
      } catch {
        // Best effort only; a failed reset must not abort the retry loop.
      }
      await page.waitForTimeout(waitTime);
    }
    attempt += 1;
  }

  throw lastError || new Error('无法获取文章内容');
}
1516
+
1517
/**
 * Build an article list from the links present in the current page DOM.
 *
 * Anchors are gathered with selectors ordered from catalog-specific to
 * generic; each anchor is kept once and each article id is reported once.
 * `chapter_index` / `originalIndex` reflect the anchor's position in the
 * collected anchor list, not in the returned list.
 *
 * @param {object} page - Page object exposing `evaluate(fn, arg)`.
 * @returns {Promise<object[]>} Records with `id`, `article_title`,
 *   `article_sharetitle`, `url`, `section_name`, `chapter_index` and
 *   `originalIndex`; empty when no article link is found.
 */
async function extractArticlesFromPageDom(page) {
  const collectInPage = (baseUrl) => {
    const selectors = [
      '[class*="catalog"] a[href*="/column/article/"]',
      '[class*="directory"] a[href*="/column/article/"]',
      '[class*="Catalogue"] a[href*="/column/article/"]',
      '[class*="Catalog"] a[href*="/column/article/"]',
      'nav a[href*="/column/article/"]',
      'a[href*="/column/article/"]'
    ];

    // A Set keeps first-insertion order while dropping repeated elements.
    const anchorSet = new Set();
    for (const selector of selectors) {
      document.querySelectorAll(selector).forEach(node => anchorSet.add(node));
    }
    const collectedAnchors = [...anchorSet];
    if (collectedAnchors.length === 0) {
      return [];
    }

    const cleanText = (text) => (text || '').replace(/\s+/g, ' ').trim();
    const seenIds = new Set();
    const articles = [];

    for (const [index, anchor] of collectedAnchors.entries()) {
      const href = anchor.getAttribute('href') || '';
      const match = href.match(/column\/article\/(\d+)/i);
      if (!match) {
        continue;
      }

      const id = parseInt(match[1], 10);
      if (!id || seenIds.has(id)) {
        continue;
      }
      seenIds.add(id);

      let title = cleanText(anchor.innerText || anchor.textContent || anchor.getAttribute('title') || '');
      if (!title) {
        const titleNode = anchor.querySelector('[class*="title"], span, div');
        if (titleNode) {
          title = cleanText(titleNode.textContent);
        }
      }
      if (!title) {
        title = `文章_${id}`;
      }

      let absoluteUrl = href;
      try {
        absoluteUrl = new URL(href, baseUrl).toString();
      } catch {
        // Fall back to plain concatenation for root-relative links.
        if (href.startsWith('/')) {
          absoluteUrl = `${baseUrl.replace(/\/$/, '')}${href}`;
        }
      }

      let sectionName = '';
      const sectionNode = anchor.closest('[data-section],[data-chapter],[class*="section"],[class*="Section"],[class*="chapter"],[class*="Chapter"]');
      if (sectionNode) {
        sectionName = cleanText(
          sectionNode.getAttribute('data-section') ||
          sectionNode.getAttribute('data-chapter') ||
          sectionNode.getAttribute('data-title') ||
          sectionNode.querySelector('h2, h3, h4, .title, .section-title')?.textContent ||
          ''
        );
      }

      articles.push({
        id,
        article_title: title,
        article_sharetitle: title,
        url: absoluteUrl,
        section_name: sectionName,
        chapter_index: index + 1,
        originalIndex: index
      });
    }

    return articles;
  };

  return page.evaluate(collectInPage, GEEKTIME_BASE_URL);
}
1608
+
1609
/**
 * Read the column author's name from the current page.
 *
 * Tries a list of author/teacher selectors in order and returns the first
 * non-blank text; otherwise falls back to `<meta name="author">`.
 *
 * @param {object} page - Page object exposing `evaluate(fn)`.
 * @returns {Promise<string|null>} The trimmed author name, or `null` when
 *   none is found or the evaluation fails.
 */
async function extractColumnAuthorFromPage(page) {
  const readAuthorInPage = () => {
    const candidates = [
      '.author-name',
      '.author',
      '.teacher-name',
      '.lecturer-name',
      '.Index_teacherName',
      '.ProductHeader_teacherName',
      '.ColumnIntro_teacher__name',
      '.ColumnIntro_author__name'
    ];
    for (const candidate of candidates) {
      const rawText = document.querySelector(candidate)?.textContent;
      const trimmed = rawText ? rawText.trim() : '';
      if (trimmed) {
        return trimmed;
      }
    }

    const metaAuthor = document.querySelector('meta[name="author"]');
    return metaAuthor && metaAuthor.content ? metaAuthor.content.trim() : null;
  };

  try {
    return await page.evaluate(readAuthorInPage);
  } catch {
    // The author is optional metadata; callers substitute a default.
    return null;
  }
}
1638
+
504
1639
  // 获取专栏所有文章列表(通过API)
505
1640
  function getValueByPath(obj, path) {
506
1641
  if (!obj || !path) return undefined;
@@ -603,45 +1738,53 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
603
1738
  let columnInfoHandler = null;
604
1739
 
605
1740
  // 用于同步的 Promise
606
- const articlesPromise = new Promise((resolve, reject) => {
607
- articlesHandler = async (response) => {
608
- const url = response.url();
609
- // 监听文章列表 API
610
- if (url.includes('/serv/v1/column/articles')) {
611
- try {
612
- const data = await response.json();
613
- if (process.env.DEBUG) {
614
- console.log(chalk.gray('\n收到文章列表API响应'));
1741
+ const articlesPromise = Promise.race([
1742
+ new Promise((resolve) => {
1743
+ articlesHandler = async (response) => {
1744
+ const url = response.url();
1745
+ // 监听文章列表 API
1746
+ if (url.includes('/serv/v1/column/articles')) {
1747
+ try {
1748
+ const data = await response.json();
1749
+ if (process.env.DEBUG) {
1750
+ console.log(chalk.gray('\n收到文章列表API响应'));
1751
+ }
1752
+ resolve(data);
1753
+ } catch (e) {
1754
+ console.error('解析文章列表API失败:', e);
1755
+ resolve(null);
615
1756
  }
616
- resolve(data);
617
- } catch (e) {
618
- console.error('解析文章列表API失败:', e);
619
1757
  }
620
- }
621
- };
622
- page.on('response', articlesHandler);
623
- });
624
-
625
- const columnInfoPromise = new Promise((resolve) => {
626
- columnInfoHandler = async (response) => {
627
- const url = response.url();
628
- // 监听专栏详情相关的 API
629
- if (url.includes('/serv/v1/column/intro') ||
630
- url.includes('/serv/v3/column/info') ||
631
- url.includes('/serv/v1/column/detail')) {
632
- try {
633
- const data = await response.json();
634
- if (process.env.DEBUG) {
635
- console.log(chalk.gray(`收到专栏信息API响应: ${url}`));
1758
+ };
1759
+ page.on('response', articlesHandler);
1760
+ }),
1761
+ new Promise(resolve => setTimeout(() => resolve(null), 30000))
1762
+ ]);
1763
+
1764
+ const columnInfoPromise = Promise.race([
1765
+ new Promise((resolve) => {
1766
+ columnInfoHandler = async (response) => {
1767
+ const url = response.url();
1768
+ // 监听专栏详情相关的 API
1769
+ if (url.includes('/serv/v1/column/intro') ||
1770
+ url.includes('/serv/v3/column/info') ||
1771
+ url.includes('/serv/v1/column/detail')) {
1772
+ try {
1773
+ const data = await response.json();
1774
+ if (process.env.DEBUG) {
1775
+ console.log(chalk.gray(`收到专栏信息API响应: ${url}`));
1776
+ }
1777
+ resolve(data);
1778
+ } catch (e) {
1779
+ console.error('解析专栏信息API失败:', e);
1780
+ resolve(null);
636
1781
  }
637
- resolve(data);
638
- } catch (e) {
639
- console.error('解析专栏信息API失败:', e);
640
1782
  }
641
- }
642
- };
643
- page.on('response', columnInfoHandler);
644
- });
1783
+ };
1784
+ page.on('response', columnInfoHandler);
1785
+ }),
1786
+ new Promise(resolve => setTimeout(() => resolve(null), 5000))
1787
+ ]);
645
1788
 
646
1789
  try {
647
1790
  // 先设置监听器,再访问页面
@@ -650,23 +1793,13 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
650
1793
 
651
1794
  spinner.text = '正在获取文章列表...';
652
1795
 
653
- // 等待文章列表 API(必须的)
654
- articlesData = await Promise.race([
655
- articlesPromise,
656
- new Promise((_, reject) => setTimeout(() => reject(new Error('文章列表API调用超时')), 30000))
657
- ]);
1796
+ // 等待文章列表 API(如果失败将返回 null)
1797
+ articlesData = await articlesPromise;
658
1798
 
659
- // 尝试等待专栏信息 API(可选的,5秒超时)
660
- try {
661
- columnInfoData = await Promise.race([
662
- columnInfoPromise,
663
- new Promise((_, reject) => setTimeout(() => reject(new Error('timeout')), 5000))
664
- ]);
665
- } catch (e) {
666
- // 获取专栏信息失败不是致命错误
667
- if (process.env.DEBUG) {
668
- console.log(chalk.gray('未获取到专栏信息API响应(将使用其他方法)'));
669
- }
1799
+ // 尝试等待专栏信息 API(可选)
1800
+ columnInfoData = await columnInfoPromise;
1801
+ if (!columnInfoData && process.env.DEBUG) {
1802
+ console.log(chalk.gray('未获取到专栏信息API响应(将使用其他方法)'));
670
1803
  }
671
1804
 
672
1805
  } catch (error) {
@@ -695,32 +1828,47 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
695
1828
  }
696
1829
  }
697
1830
 
698
- if (!articlesData || !articlesData.data || !articlesData.data.list) {
699
- spinner.fail('API响应数据格式错误');
1831
+ let useDomExtraction = false;
1832
+ let domArticles = [];
700
1833
 
701
- // 智能判断可能的原因
702
- if (!articlesData) {
703
- console.log(chalk.yellow('\n⚠️ 未能获取到文章列表数据\n'));
704
- console.log(chalk.cyan('可能的原因:'));
705
- console.log(chalk.gray(' 1. Cookie 已过期或无效 - 请重新获取 Cookie'));
706
- console.log(chalk.gray(' 2. 网络连接问题 - 请检查网络'));
707
- console.log(chalk.gray(' 3. 专栏 ID 不正确 - 请检查 URL\n'));
708
- } else if (articlesData.code === -3000 || articlesData.code === -3001) {
709
- console.log(chalk.red('\n❌ Cookie 已失效\n'));
710
- console.log(chalk.cyan('📖 请重新获取 Cookie:'));
711
- console.log(chalk.gray(' 1. 浏览器登录极客时间'));
712
- console.log(chalk.gray(' 2. 按 F12 打开开发者工具'));
713
- console.log(chalk.gray(' 3. Network 标签 → 刷新页面'));
714
- console.log(chalk.gray(' 4. 点击任意请求 → 复制 Cookie\n'));
715
- } else if (articlesData.error) {
716
- console.log(chalk.yellow(`\n⚠️ API 返回错误: ${articlesData.error.msg || articlesData.error}\n`));
1834
+ if (!articlesData || !articlesData.data || !Array.isArray(articlesData.data.list) || articlesData.data.list.length === 0) {
1835
+ spinner.text = 'API 不可用,尝试从页面解析文章列表...';
1836
+ try {
1837
+ domArticles = await extractArticlesFromPageDom(page);
1838
+ } catch (error) {
1839
+ if (process.env.DEBUG) {
1840
+ console.log(chalk.gray(`DOM文章提取失败: ${error.message}`));
1841
+ }
717
1842
  }
718
1843
 
719
- return { articles: [], columnTitle: 'unknown' };
1844
+ if (!domArticles || domArticles.length === 0) {
1845
+ spinner.fail('无法获取文章列表');
1846
+
1847
+ if (!articlesData) {
1848
+ console.log(chalk.yellow('\n⚠️ 未能从接口或页面获取文章列表\n'));
1849
+ console.log(chalk.cyan('可能的原因:'));
1850
+ console.log(chalk.gray(' 1. Cookie 已过期或无效 - 请重新获取 Cookie'));
1851
+ console.log(chalk.gray(' 2. 页面结构发生变化 - 请联系开发者更新解析逻辑'));
1852
+ console.log(chalk.gray(' 3. 网络连接问题或URL无效\n'));
1853
+ } else if (articlesData.code === -3000 || articlesData.code === -3001) {
1854
+ console.log(chalk.red('\n❌ Cookie 已失效\n'));
1855
+ console.log(chalk.cyan('📖 请重新获取 Cookie:'));
1856
+ console.log(chalk.gray(' 1. 浏览器登录极客时间'));
1857
+ console.log(chalk.gray(' 2. 按 F12 打开开发者工具'));
1858
+ console.log(chalk.gray(' 3. Network 标签 → 刷新页面'));
1859
+ console.log(chalk.gray(' 4. 点击任意请求 → 复制 Cookie\n'));
1860
+ } else if (articlesData.error) {
1861
+ console.log(chalk.yellow(`\n⚠️ API 返回错误: ${articlesData.error.msg || articlesData.error}\n`));
1862
+ }
1863
+
1864
+ return { articles: [], columnTitle: 'unknown', columnAuthor: '极客时间' };
1865
+ }
1866
+
1867
+ useDomExtraction = true;
720
1868
  }
721
1869
 
722
1870
  // 调试信息:记录完整的API响应结构(仅在环境变量DEBUG存在时)
723
- if (process.env.DEBUG) {
1871
+ if (!useDomExtraction && process.env.DEBUG) {
724
1872
  console.log(chalk.gray('\n========== 文章列表 API 响应数据 =========='));
725
1873
  console.log(chalk.gray(JSON.stringify(articlesData.data, null, 2)));
726
1874
  if (columnInfoData) {
@@ -743,7 +1891,7 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
743
1891
  }
744
1892
 
745
1893
  // 方法2: 从文章列表 API 数据中获取
746
- if (!columnTitle || columnTitle === '专栏' || columnTitle === '极客时间') {
1894
+ if ((!columnTitle || columnTitle === '专栏' || columnTitle === '极客时间') && articlesData && articlesData.data) {
747
1895
  columnTitle = articlesData.data.column_title
748
1896
  || articlesData.data.column_subtitle
749
1897
  || articlesData.data.title
@@ -827,10 +1975,15 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
827
1975
  console.log(chalk.gray(` 提取的专栏名: ${columnTitle}\n`));
828
1976
  }
829
1977
 
830
- const columnAuthor = extractColumnAuthor(columnInfoData, articlesData) || '极客时间';
1978
+ let columnAuthor = '极客时间';
1979
+ if (!useDomExtraction && articlesData) {
1980
+ columnAuthor = extractColumnAuthor(columnInfoData, articlesData) || '极客时间';
1981
+ } else {
1982
+ columnAuthor = await extractColumnAuthorFromPage(page) || '极客时间';
1983
+ }
831
1984
 
832
1985
  // 解析文章列表
833
- const rawArticles = articlesData.data.list;
1986
+ const rawArticles = useDomExtraction ? domArticles : (articlesData.data.list || []);
834
1987
 
835
1988
  const articles = rawArticles.map((article, index) => {
836
1989
  const title = article.article_title || article.article_sharetitle || 'Untitled';
@@ -845,7 +1998,7 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
845
1998
 
846
1999
  return {
847
2000
  title: cleanTitle,
848
- url: `https://time.geekbang.org/column/article/${id}`,
2001
+ url: article.url || `${GEEKTIME_BASE_URL}/column/article/${id}`,
849
2002
  originalTitle: title,
850
2003
  id: id,
851
2004
  sectionName: article.section_name || '',
@@ -889,7 +2042,7 @@ async function downloadWithConcurrency(context, articles, outputDir, concurrency
889
2042
  const article = articles[index];
890
2043
 
891
2044
  try {
892
- const result = await downloadArticleSilent(page, article, outputDir, index + 1, total);
2045
+ const result = await downloadArticleSilent(page, article, outputDir, index + 1, total, timeout);
893
2046
  results[index] = result;
894
2047
  completed++;
895
2048
 
@@ -944,36 +2097,20 @@ async function downloadWithConcurrency(context, articles, outputDir, concurrency
944
2097
  }
945
2098
 
946
2099
  // 下载单篇文章为 PDF(静默模式,不显示单独的spinner)
947
- async function downloadArticleSilent(page, article, outputDir, index, total) {
2100
+ async function downloadArticleSilent(page, article, outputDir, index, total, timeout = 60000) {
948
2101
  try {
949
2102
  if (process.env.DEBUG) {
950
2103
  console.log(chalk.gray(`[silent] 准备处理文章 ${article.id} - ${article.originalTitle || article.title}`));
951
2104
  }
952
- const articleData = await fetchArticleData(page.context(), article.id);
953
- if (process.env.DEBUG) {
954
- console.log(chalk.gray(`[silent] 已获取文章数据 ${article.id}`));
955
- }
956
- const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
957
- const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
958
- if (process.env.DEBUG) {
959
- console.log(chalk.gray(`[silent] 已完成内容清洗 ${article.id}`));
960
- }
961
- const printableHtml = buildPrintableHtml(article.originalTitle || article.title, sanitizedHtml);
2105
+ const sanitizedHtml = await fetchArticleContentWithRetry(page, article, { timeout });
2106
+ const meta = article.sectionName ? `章节:${article.sectionName}` : '';
2107
+ const printableHtml = buildPdfHtml(article.originalTitle || article.title, sanitizedHtml, meta);
962
2108
 
963
2109
  await page.setContent(printableHtml, { waitUntil: 'domcontentloaded' });
964
- if (process.env.DEBUG) {
965
- console.log(chalk.gray(`[silent] 已设置页面内容 ${article.id}`));
966
- }
967
2110
  try {
968
2111
  await page.waitForLoadState('networkidle', { timeout: 5000 });
969
- if (process.env.DEBUG) {
970
- console.log(chalk.gray(`[silent] networkidle 完成 ${article.id}`));
971
- }
972
2112
  } catch {
973
- // 忽略由于没有额外资源导致的延时
974
- if (process.env.DEBUG) {
975
- console.log(chalk.gray(`[silent] networkidle 超时(已忽略) ${article.id}`));
976
- }
2113
+ // ignore
977
2114
  }
978
2115
 
979
2116
  // 优化图片大小:将大图片转换为合适的尺寸,减小PDF体积
@@ -991,7 +2128,7 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
991
2128
  resolve();
992
2129
  }
993
2130
  };
994
- const attachTimeout = () => setTimeout(safeResolve, 3000);
2131
+ const attachTimeout = () => setTimeout(safeResolve, 15000);
995
2132
  let fallbackTimer = null;
996
2133
 
997
2134
  // 如果图片还未加载完成,等待加载
@@ -1065,7 +2202,7 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
1065
2202
  }
1066
2203
 
1067
2204
  // 等待图片处理完成
1068
- await page.waitForTimeout(1000);
2205
+ await page.waitForTimeout(1200);
1069
2206
  if (process.env.DEBUG) {
1070
2207
  console.log(chalk.gray(`[silent] 已准备生成PDF ${article.id}`));
1071
2208
  }
@@ -1083,7 +2220,7 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
1083
2220
  bottom: '20mm',
1084
2221
  left: '15mm'
1085
2222
  },
1086
- printBackground: false, // 关闭背景打印,显著减小文件大小
2223
+ printBackground: true,
1087
2224
  preferCSSPageSize: false
1088
2225
  });
1089
2226
  if (process.env.DEBUG) {
@@ -1101,20 +2238,19 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
1101
2238
  }
1102
2239
 
1103
2240
  // 下载单篇文章为 PDF
1104
- async function downloadArticle(page, article, outputDir, index, total) {
2241
+ async function downloadArticle(page, article, outputDir, index, total, timeout = 60000) {
1105
2242
  const spinner = ora(`[${index}/${total}] 正在下载: ${article.title}`).start();
1106
2243
 
1107
2244
  try {
1108
- const articleData = await fetchArticleData(page.context(), article.id);
1109
- const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
1110
- const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
1111
- const printableHtml = buildPrintableHtml(article.originalTitle || article.title, sanitizedHtml);
2245
+ const sanitizedHtml = await fetchArticleContentWithRetry(page, article, { timeout });
2246
+ const meta = article.sectionName ? `章节:${article.sectionName}` : '';
2247
+ const printableHtml = buildPdfHtml(article.originalTitle || article.title, sanitizedHtml, meta);
1112
2248
 
1113
2249
  await page.setContent(printableHtml, { waitUntil: 'domcontentloaded' });
1114
2250
  try {
1115
2251
  await page.waitForLoadState('networkidle', { timeout: 5000 });
1116
2252
  } catch {
1117
- // 没有额外资源加载时忽略
2253
+ // 忽略
1118
2254
  }
1119
2255
 
1120
2256
  // 优化图片大小:将大图片转换为合适的尺寸,减小PDF体积
@@ -1189,7 +2325,7 @@ async function downloadArticle(page, article, outputDir, index, total) {
1189
2325
  bottom: '20mm',
1190
2326
  left: '15mm'
1191
2327
  },
1192
- printBackground: false, // 关闭背景打印,显著减小文件大小
2328
+ printBackground: true,
1193
2329
  preferCSSPageSize: false
1194
2330
  });
1195
2331
 
@@ -1312,11 +2448,9 @@ async function mergePDFs(outputDir, columnTitle, articles, deleteAfterMerge = fa
1312
2448
  }
1313
2449
 
1314
2450
  // 提取单篇文章的 HTML 内容(用于 EPUB 生成)
1315
- async function extractArticleContent(page, article, index, total) {
2451
+ async function extractArticleContent(page, article, index, total, timeout = 60000) {
1316
2452
  try {
1317
- const articleData = await fetchArticleData(page.context(), article.id);
1318
- const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
1319
- const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
2453
+ const sanitizedHtml = await fetchArticleContentWithRetry(page, article, { timeout });
1320
2454
 
1321
2455
  if (!sanitizedHtml) {
1322
2456
  throw new Error('未能提取到文章内容');
@@ -1370,7 +2504,7 @@ async function extractWithConcurrency(context, articles, concurrency = 5, delay
1370
2504
  const article = articles[index];
1371
2505
 
1372
2506
  try {
1373
- const result = await extractArticleContent(page, article, index + 1, total);
2507
+ const result = await extractArticleContent(page, article, index + 1, total, timeout);
1374
2508
  results[index] = result;
1375
2509
  completed++;
1376
2510
 
@@ -1485,41 +2619,43 @@ async function generateEPUB(outputDir, columnTitle, columnAuthor, articles, cont
1485
2619
  margin: 1.5em 0;
1486
2620
  padding: 0;
1487
2621
  }
1488
- p {
2622
+ p, div {
1489
2623
  margin: 1.2em 0;
1490
2624
  text-indent: 0;
1491
- line-height: 1.8;
2625
+ line-height: 1.9;
1492
2626
  word-wrap: break-word;
1493
2627
  overflow-wrap: break-word;
1494
2628
  display: block;
1495
2629
  page-break-inside: avoid;
1496
2630
  }
1497
- /* 确保段落之间有明显间隔 */
1498
- p + p {
1499
- margin-top: 1.5em;
2631
+ p + p,
2632
+ div + p,
2633
+ p + div {
2634
+ margin-top: 1.6em;
1500
2635
  }
1501
2636
  /* 代码块样式 */
1502
2637
  pre {
1503
- background-color: #f6f8fa;
2638
+ background-color: #0b1220;
2639
+ color: #d9e2ff;
1504
2640
  border: 1px solid #e1e4e8;
1505
2641
  border-radius: 6px;
1506
- padding: 16px;
2642
+ padding: 18px 20px;
1507
2643
  overflow-x: auto;
1508
2644
  margin: 1em 0;
1509
- line-height: 1.5;
2645
+ line-height: 1.6;
1510
2646
  font-size: 14px;
1511
2647
  white-space: pre-wrap;
1512
2648
  word-wrap: break-word;
1513
- font-family: 'Monaco', 'Menlo', 'Consolas', 'Courier New', monospace;
2649
+ font-family: 'Fira Code', 'Monaco', 'Menlo', 'Consolas', 'Courier New', monospace;
1514
2650
  page-break-inside: avoid;
1515
2651
  }
1516
2652
  code {
1517
- font-family: 'Monaco', 'Menlo', 'Consolas', 'Courier New', monospace;
2653
+ font-family: 'Fira Code', 'Monaco', 'Menlo', 'Consolas', 'Courier New', monospace;
1518
2654
  font-size: 0.9em;
1519
- background-color: #f6f8fa;
2655
+ background-color: rgba(15, 23, 42, 0.1);
1520
2656
  padding: 0.2em 0.4em;
1521
2657
  border-radius: 3px;
1522
- border: 1px solid #e1e4e8;
2658
+ border: 1px solid rgba(15, 23, 42, 0.1);
1523
2659
  }
1524
2660
  pre code {
1525
2661
  background-color: transparent;
@@ -1630,12 +2766,13 @@ async function generateEPUB(outputDir, columnTitle, columnAuthor, articles, cont
1630
2766
  async function main(options) {
1631
2767
  console.log(chalk.bold.cyan('\n🚀 极客时间专栏下载器\n'));
1632
2768
 
1633
- // 获取配置:优先级 命令行 > 配置文件
2769
+ // 获取配置:优先级 命令行 > 配置文件 > 默认 cookies.json
1634
2770
  let cookie = options.cookie;
2771
+ let cookieFile = options.cookieFile;
1635
2772
  let columnUrl = options.url;
1636
2773
 
1637
- // 如果命令行没有提供,尝试从配置文件读取
1638
- if (!cookie || !columnUrl) {
2774
+ // 如果命令行没有提供所需信息,尝试从配置文件读取
2775
+ if (!cookie || !columnUrl || !cookieFile) {
1639
2776
  // 使用当前工作目录的config.json,而不是脚本所在目录
1640
2777
  const configPath = path.join(process.cwd(), 'config.json');
1641
2778
  try {
@@ -1645,22 +2782,37 @@ async function main(options) {
1645
2782
  // 使用配置文件中的值作为默认值
1646
2783
  if (!cookie) cookie = config.cookie;
1647
2784
  if (!columnUrl) columnUrl = config.columnUrl;
2785
+ if (!cookieFile) cookieFile = config.cookieFile;
1648
2786
  } catch (error) {
1649
2787
  // 配置文件不存在或读取失败,不是致命错误
1650
2788
  // 只有在命令行也没提供时才报错
1651
2789
  }
1652
2790
  }
1653
2791
 
2792
+ // 如果没有cookie字符串但存在 cookies.json 文件,自动使用
2793
+ if (!cookie && !cookieFile) {
2794
+ const defaultCookieJsonPath = path.join(process.cwd(), 'cookies.json');
2795
+ if (await fileExists(defaultCookieJsonPath)) {
2796
+ cookieFile = defaultCookieJsonPath;
2797
+ }
2798
+ }
2799
+
2800
+ const cookieSavePath = cookieFile || path.join(process.cwd(), 'cookies.json');
2801
+
1654
2802
  // 验证必要参数
1655
- if (!cookie) {
2803
+ if (!cookie && !cookieFile) {
1656
2804
  console.error(chalk.red('❌ 缺少 Cookie!'));
1657
2805
  console.log(chalk.yellow('\n请通过以下方式之一提供 Cookie:'));
1658
2806
  console.log(chalk.gray('1. 命令行参数:--cookie "你的cookie字符串"'));
1659
2807
  console.log(chalk.gray('2. 配置文件 config.json:'));
1660
2808
  console.log(chalk.gray(' {'));
1661
2809
  console.log(chalk.gray(' "cookie": "你的cookie字符串",'));
1662
- console.log(chalk.gray(' "columnUrl": "https://time.geekbang.org/column/article/xxxxx"'));
1663
- console.log(chalk.gray(' }\n'));
2810
+ console.log(chalk.gray(' "columnUrl": "https://time.geekbang.org/column/article/xxxxx",'));
2811
+ console.log(chalk.gray(' "cookieFile": "cookies.json" // 可选,导入JSON文件'));
2812
+ console.log(chalk.gray(' }'));
2813
+ console.log(chalk.gray('3. 提供 Cookie JSON 文件:'));
2814
+ console.log(chalk.gray(' - 命令行参数:--cookie-file ./cookies.json'));
2815
+ console.log(chalk.gray(' - 或将 cookies.json 放到当前目录\n'));
1664
2816
  process.exit(1);
1665
2817
  }
1666
2818
 
@@ -1709,16 +2861,42 @@ async function main(options) {
1709
2861
  userAgent: DEFAULT_USER_AGENT
1710
2862
  });
1711
2863
 
1712
- // 兼容用户直接复制整行"Cookie: xxx"
1713
- let normalizedCookie = cookie.trim();
1714
- if (/^cookie:/i.test(normalizedCookie)) {
1715
- normalizedCookie = normalizedCookie.replace(/^cookie:\s*/i, '');
2864
+ let normalizedCookie = '';
2865
+ let cookiesForContext = [];
2866
+
2867
+ if (cookie) {
2868
+ normalizedCookie = cookie.trim();
2869
+ if (/^cookie:/i.test(normalizedCookie)) {
2870
+ normalizedCookie = normalizedCookie.replace(/^cookie:\s*/i, '');
2871
+ }
2872
+ cookiesForContext = parseCookies(normalizedCookie);
2873
+ } else if (cookieFile) {
2874
+ try {
2875
+ const { cookieHeader, cookies, absolutePath } = await loadCookiesFromJsonFile(cookieFile);
2876
+ normalizedCookie = cookieHeader.trim();
2877
+ cookiesForContext = cookies;
2878
+ console.log(chalk.gray(`🍪 已从 ${absolutePath} 导入 Cookie`));
2879
+ } catch (error) {
2880
+ console.error(chalk.red(`❌ 读取 Cookie JSON 失败: ${error.message}`));
2881
+ process.exit(1);
2882
+ }
1716
2883
  }
2884
+
1717
2885
  globalCookieHeader = normalizedCookie;
1718
2886
 
1719
2887
  // 设置 cookies
1720
- const cookies = parseCookies(normalizedCookie);
1721
- await context.addCookies(cookies);
2888
+ await context.addCookies(cookiesForContext);
2889
+ await updateGlobalCookieHeaderFromContext(context);
2890
+ context.on('response', (response) => {
2891
+ try {
2892
+ const headers = response.headers();
2893
+ if (headers && headers['set-cookie']) {
2894
+ updateGlobalCookieHeaderFromContext(context);
2895
+ }
2896
+ } catch {
2897
+ // ignore
2898
+ }
2899
+ });
1722
2900
 
1723
2901
  // 确保所有极客时间域名的请求都携带原始Cookie串,避免Playwright丢失关键字段
1724
2902
  await context.route('**/*', (route) => {
@@ -1740,9 +2918,12 @@ async function main(options) {
1740
2918
  }
1741
2919
 
1742
2920
  const headers = {
1743
- ...request.headers(),
1744
- cookie: normalizedCookie
2921
+ ...request.headers()
1745
2922
  };
2923
+ const outgoingCookieHeader = globalCookieHeader || normalizedCookie;
2924
+ if (outgoingCookieHeader) {
2925
+ headers.cookie = outgoingCookieHeader;
2926
+ }
1746
2927
  route.continue({ headers });
1747
2928
  });
1748
2929
 
@@ -1828,7 +3009,10 @@ async function main(options) {
1828
3009
  const successCount = results.filter(r => r.success).length;
1829
3010
  const failCount = results.filter(r => !r.success).length;
1830
3011
  const timeoutCount = results.filter(r =>
1831
- !r.success && r.error && (r.error.includes('timeout') || r.error.includes('Timeout'))
3012
+ !r.success && r.error && /timeout/i.test(r.error)
3013
+ ).length;
3014
+ const authIssueCount = results.filter(r =>
3015
+ !r.success && r.error && /(Cookie|登录|登陆|订阅|权限|试看|购买)/i.test(r.error)
1832
3016
  ).length;
1833
3017
 
1834
3018
  console.log(chalk.bold.cyan('\n📊 PDF 下载统计\n'));
@@ -1842,6 +3026,11 @@ async function main(options) {
1842
3026
  console.log(chalk.gray(' 1. Cookie 已失效 - 请重新获取 Cookie'));
1843
3027
  console.log(chalk.gray(' 2. 网络连接慢 - 尝试使用 --timeout 120000 增加超时时间'));
1844
3028
  console.log(chalk.gray(' 3. 需要登录或权限不足 - 确认已购买该专栏\n'));
3029
+ } else if (authIssueCount > 0) {
3030
+ console.log(chalk.yellow('⚠️ 检测到登录或权限相关异常\n'));
3031
+ console.log(chalk.gray(' 1. 在浏览器中重新登录极客时间,进入该专栏任意文章'));
3032
+ console.log(chalk.gray(' 2. 复制最新的 Cookie(或重新导出 cookies.json)'));
3033
+ console.log(chalk.gray(' 3. 使用新的 --cookie 或 --cookie-file 参数后重试\n'));
1845
3034
  }
1846
3035
 
1847
3036
  // 合并 PDF
@@ -1885,7 +3074,10 @@ async function main(options) {
1885
3074
  const successCount = contentResults.filter(r => r.success).length;
1886
3075
  const failCount = contentResults.filter(r => !r.success).length;
1887
3076
  const timeoutCount = contentResults.filter(r =>
1888
- !r.success && r.error && (r.error.includes('Cookie') || r.error.includes('timeout') || r.error.includes('Timeout'))
3077
+ !r.success && r.error && /timeout/i.test(r.error)
3078
+ ).length;
3079
+ const authIssueCount = contentResults.filter(r =>
3080
+ !r.success && r.error && /(Cookie|登录|登陆|订阅|权限|试看|购买)/i.test(r.error)
1889
3081
  ).length;
1890
3082
 
1891
3083
  console.log(chalk.bold.cyan('\n📊 EPUB 提取统计\n'));
@@ -1898,19 +3090,42 @@ async function main(options) {
1898
3090
  console.log(chalk.gray(' 1. Cookie 已失效 - 请重新获取 Cookie'));
1899
3091
  console.log(chalk.gray(' 2. 网络连接慢 - 尝试使用 --timeout 120000 增加超时时间'));
1900
3092
  console.log(chalk.gray(' 3. 需要登录或权限不足 - 确认已购买该专栏\n'));
3093
+ } else if (authIssueCount > 0) {
3094
+ console.log(chalk.yellow('⚠️ 检测到登录/权限问题,建议步骤:\n'));
3095
+ console.log(chalk.gray(' 1. 浏览器重新登录极客时间并打开该专栏文章'));
3096
+ console.log(chalk.gray(' 2. 重新复制最新 Cookie 或导出 cookies.json'));
3097
+ console.log(chalk.gray(' 3. 更新 --cookie 或 --cookie-file 后再次执行\n'));
1901
3098
  }
1902
3099
 
1903
3100
  // 生成 EPUB
1904
3101
  if (successCount > 0) {
1905
- const epubPath = await generateEPUB(
1906
- outputDir,
1907
- columnTitle,
1908
- columnAuthor,
1909
- articlesToDownload,
1910
- contentResults
3102
+ const hasImageContent = contentResults.some(result =>
3103
+ result && result.success && typeof result.content === 'string' && result.content.includes('<img')
1911
3104
  );
1912
- if (epubPath) {
1913
- console.log(chalk.green(`\n✅ EPUB 生成完成: ${epubPath}\n`));
3105
+
3106
+ let processedContent = contentResults;
3107
+ let tempAssetsDir = null;
3108
+
3109
+ try {
3110
+ if (hasImageContent) {
3111
+ tempAssetsDir = await createTempAssetsDir(outputDir);
3112
+ processedContent = await rewriteEpubContentImages(context, contentResults, tempAssetsDir);
3113
+ }
3114
+
3115
+ const epubPath = await generateEPUB(
3116
+ outputDir,
3117
+ columnTitle,
3118
+ columnAuthor,
3119
+ articlesToDownload,
3120
+ processedContent
3121
+ );
3122
+ if (epubPath) {
3123
+ console.log(chalk.green(`\n✅ EPUB 生成完成: ${epubPath}\n`));
3124
+ }
3125
+ } finally {
3126
+ if (tempAssetsDir) {
3127
+ await cleanupTempAssetsDir(tempAssetsDir);
3128
+ }
1914
3129
  }
1915
3130
  }
1916
3131
  }
@@ -1927,6 +3142,11 @@ async function main(options) {
1927
3142
  }
1928
3143
  process.exit(1);
1929
3144
  } finally {
3145
+ try {
3146
+ await persistCookiesToFile(context, cookieSavePath);
3147
+ } catch {
3148
+ // ignore
3149
+ }
1930
3150
  // 确保浏览器完全关闭
1931
3151
  try {
1932
3152
  if (browser && !isShuttingDown) {
@@ -1946,6 +3166,7 @@ program
1946
3166
  .version(version)
1947
3167
  .option('-u, --url <url>', '专栏文章URL(任意一篇)')
1948
3168
  .option('-c, --cookie <cookie>', 'Cookie字符串(用于认证)')
3169
+ .option('--cookie-file <path>', '从 JSON 文件导入 Cookie(如 chrome 扩展导出的 cookies.json)')
1949
3170
  .option('-o, --output <dir>', '输出目录', './downloads')
1950
3171
  .option('-f, --format <format>', '输出格式: pdf, epub, both', 'pdf')
1951
3172
  .option('--headless <boolean>', '无头模式', true)