@kadaliao/geektime-downloader 1.1.1 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/download.js +379 -666
  2. package/package.json +1 -1
package/download.js CHANGED
@@ -7,6 +7,7 @@ import ora from 'ora';
7
7
  import fs from 'fs/promises';
8
8
  import path from 'path';
9
9
  import { fileURLToPath } from 'url';
10
+ import { createRequire } from 'module';
10
11
  import * as pdfLib from 'pdf-lib';
11
12
  import { outlinePdfFactory } from '@lillallol/outline-pdf';
12
13
  import epubGenMemory from 'epub-gen-memory';
@@ -14,8 +15,11 @@ import epubGenMemory from 'epub-gen-memory';
14
15
  const { PDFDocument } = pdfLib;
15
16
  const outlinePdf = outlinePdfFactory(pdfLib);
16
17
  const epub = epubGenMemory.default || epubGenMemory;
18
+ const require = createRequire(import.meta.url);
19
+ const { version } = require('./package.json');
17
20
 
18
21
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
22
+ let globalCookieHeader = '';
19
23
 
20
24
  // 全局变量:跟踪当前浏览器实例和是否正在关闭
21
25
  let globalBrowser = null;
@@ -241,6 +245,10 @@ const PRINT_FIX_CSS = `
241
245
  }
242
246
  `;
243
247
 
248
+ const GEEKTIME_BASE_URL = 'https://time.geekbang.org';
249
+ const ARTICLE_API_URL = `${GEEKTIME_BASE_URL}/serv/v1/article`;
250
+ const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
251
+
244
252
  // 解析 cookie 字符串
245
253
  function parseCookies(cookieString) {
246
254
  return cookieString.split(';').map(cookie => {
@@ -254,6 +262,245 @@ function parseCookies(cookieString) {
254
262
  });
255
263
  }
256
264
 
265
/**
 * Cleans up raw article HTML before it is sanitized.
 *
 * Two things happen, in this order:
 *   1. the `<!-- [[[read_end]]] -->` marker comment is removed;
 *   2. protocol-relative `src` / `href` attribute values (`//host/path`,
 *      single- or double-quoted) are rewritten to explicit `https://` URLs.
 *      The attribute name is emitted in lower case, whatever its input case.
 *
 * @param {string} [html=''] - Raw article HTML.
 * @returns {string} The normalized HTML, or '' when the input is empty or
 *   is not a string.
 */
function normalizeArticleHtml(html = '') {
  // A truthy non-string (number, object, ...) has no usable markup and would
  // throw on `.replace`, so treat it the same way as empty input.
  if (typeof html !== 'string' || html === '') return '';

  return html
    .replace(/<!--\s*\[\[\[read_end]]\]\s*-->/gi, '')
    .replace(
      /(src|href)=(["'])\/\//gi,
      (_match, attr, quote) => `${attr.toLowerCase()}=${quote}https://`
    );
}
274
+
275
/**
 * Fetches the full payload of one article from the article API.
 *
 * The request is a JSON POST to `ARTICLE_API_URL` issued through
 * `context.request.post`. When the module-level `globalCookieHeader` is
 * non-empty it is sent as the `cookie` header. Any failure (non-OK HTTP
 * status, unparsable body, `code !== 0`, missing `data`, empty
 * `article_content`) is retried with a linearly growing pause of
 * `attempt * retryDelayMs` milliseconds between attempts.
 *
 * @param {object} context - Object exposing `request.post(url, options)`;
 *   the callers visible in this file pass `page.context()`. The response
 *   must provide `text()`, `ok()`, `status()` and `statusText()`.
 * @param {string|number} articleId - Article id; sent as a string.
 * @param {object} [options]
 * @param {number} [options.maxAttempts=3] - Total number of attempts.
 * @param {number} [options.retryDelayMs=700] - Back-off unit in milliseconds.
 * @returns {Promise<object>} The `data` object of the API response; its
 *   `article_content` is guaranteed to be truthy.
 * @throws {Error} The error of the last failed attempt, or a generic error
 *   when no attempt was made (`maxAttempts < 1`).
 */
async function fetchArticleData(context, articleId, { maxAttempts = 3, retryDelayMs = 700 } = {}) {
  const refererUrl = `${GEEKTIME_BASE_URL}/column/article/${articleId}`;
  let lastError = null;

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      const response = await context.request.post(ARTICLE_API_URL, {
        headers: {
          'user-agent': DEFAULT_USER_AGENT,
          'content-type': 'application/json',
          'accept': 'application/json, text/plain, */*',
          'origin': GEEKTIME_BASE_URL,
          'referer': refererUrl,
          'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
          // Read on every attempt so a cookie refreshed between retries is used.
          ...(globalCookieHeader ? { 'cookie': globalCookieHeader } : {})
        },
        data: {
          id: String(articleId),
          include_neighbors: true,
          is_freelyread: true
        }
      });

      // Read the body once as text so it can be quoted in every error message.
      const bodyText = await response.text();

      if (!response.ok()) {
        throw new Error(`API请求失败: ${response.status()} ${response.statusText()} - ${bodyText.slice(0, 160)}`);
      }

      let json;
      try {
        json = JSON.parse(bodyText);
      } catch (parseError) {
        throw new Error(`API响应解析失败: ${parseError.message} - ${bodyText.slice(0, 160)}`);
      }

      if (!json || json.code !== 0 || !json.data) {
        throw new Error(`无法获取完整文章内容: ${bodyText.slice(0, 160)}`);
      }

      if (!json.data.article_content) {
        throw new Error('文章内容为空,可能需要更新 Cookie 或重新获取权限');
      }

      return json.data;
    } catch (error) {
      lastError = error;
      // No pause after the final attempt: the error is rethrown right away.
      if (attempt < maxAttempts) {
        await new Promise(resolve => setTimeout(resolve, attempt * retryDelayMs));
      }
    }
  }

  throw lastError || new Error('未知错误导致文章内容获取失败');
}
331
+
332
/**
 * Reduces article HTML to plain content markup, using the DOM of `page`.
 *
 * Everything runs inside `page.evaluate`, so the callback must stay
 * self-contained (it cannot see variables of this module). Steps:
 *   1. remove elements matching a fixed list of non-content selectors;
 *   2. remove elements whose class, id, role or data-* values contain one of
 *      the plugin keywords (substring match, case-insensitive);
 *   3. remove mind-map widgets, including svg/canvas/object/embed elements
 *      whose class, id or data-type contains "mind";
 *   4. unwrap every element whose tag is not on the allow-list (its children
 *      are kept) and strip every attribute that is not on the allow-list;
 *   5. give images `loading="lazy"`, `max-width: 100%` and `height: auto`.
 *
 * @param {object} page - Object exposing `evaluate(fn, arg)` that runs `fn`
 *   where `document` is available (the callers pass a Playwright page).
 * @param {string} rawHtml - HTML fragment to sanitize.
 * @returns {Promise<string>} The sanitized HTML fragment.
 */
async function sanitizeArticleHtml(page, rawHtml) {
  return page.evaluate((html) => {
    // Work on a <template> that is never attached to the document.
    const template = document.createElement('template');
    template.innerHTML = html;
    const root = template.content || template;

    const removalSelectors = [
      'nav', 'header', 'footer', 'aside',
      '.comment', '.comments', '.Index_comment',
      '.recommend', '.recommendation', '.related', '.advertisement', '.ad', '.banner',
      '.subscribe', '.subscription', '.toolbar', '.Index_shareIcons_1vtJa',
      '.keyboard-wrapper', '.app-download', '.article-actions', '.article-bottom',
      '.note', '.notes', '.annotation', '.translation', '.trans', '.translator',
      '.audio', '.audio-player', '.voice', '.player', '.geek-player', '.podcast', '.radio',
      '.reward', '.appreciate', '.appreciation', '.donate', '.sponsor', '.thanks', '.support',
      '.qrcode', '.qr-code', '.qr', '.promotion', '.promo', '.ad-banner',
      '.copyright', '.statement', '.disclaimer',
      '.app-download-banner', '.article-plugin', '.article-notification', '.float-bar',
      'audio', 'video',
      '[class*="Note"]', '[class*="note"]', '[class*="Translation"]', '[class*="translation"]',
      '[class*="Audio"]', '[class*="audio"]', '[class*="Reward"]', '[class*="reward"]',
      '[data-plugin]', '[data-track]', '[data-track-section]', '[data-translation]', '[data-audio]',
      '[data-role="toolbar"]',
      'button', 'iframe', 'script', 'style'
    ];
    removalSelectors.forEach(selector => {
      root.querySelectorAll(selector).forEach(el => el.remove());
    });

    // Second pass: substring match on class/id/role/data-* values, which
    // catches names the selector list above does not spell out.
    const pluginKeywords = [
      'note', 'translation', 'audio', 'player', 'reward', 'donate',
      'appreciation', 'sponsor', 'qrcode', 'toolbar', 'plugin',
      'copyright', 'geeknote', 'bilingual'
    ];
    const pluginElements = Array.from(root.querySelectorAll('*')).filter(el => {
      const className = (el.className || '').toString().toLowerCase();
      const idValue = (el.id || '').toString().toLowerCase();
      const roleValue = (el.getAttribute && el.getAttribute('role')) ? el.getAttribute('role').toLowerCase() : '';
      const datasetValues = el.dataset ? Object.values(el.dataset).join(' ').toLowerCase() : '';
      const combined = `${className} ${idValue} ${roleValue} ${datasetValues}`;
      return pluginKeywords.some(keyword => combined.includes(keyword));
    });
    pluginElements.forEach(el => el.remove());

    const mindmapSelectors = [
      '.mindmap', '.mind-map', '.MindMap', '.Mind-map',
      '[data-type="mindmap"]', '[data-role="mindmap"]', '[data-widget="mindmap"]',
      '[class*="MindMap"]', '[class*="mindMap"]'
    ];
    mindmapSelectors.forEach(selector => {
      root.querySelectorAll(selector).forEach(el => el.remove());
    });
    const vectorCandidates = Array.from(root.querySelectorAll('svg, canvas, object, embed'));
    vectorCandidates.forEach(el => {
      // On SVG elements className is an object; the string is in baseVal.
      const className = typeof el.className === 'object' ? el.className.baseVal : (el.className || '');
      const meta = `${className} ${el.id || ''} ${el.getAttribute('data-type') || ''}`.toLowerCase();
      // "mind" also covers "mindmap" and "mind-map".
      if (meta.includes('mind')) {
        el.remove();
      }
    });

    const allowedTags = new Set([
      'P', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6',
      'UL', 'OL', 'LI',
      'BLOCKQUOTE', 'PRE', 'CODE',
      'IMG', 'TABLE', 'THEAD', 'TBODY', 'TR', 'TH', 'TD', 'FIGURE', 'FIGCAPTION',
      'STRONG', 'EM', 'B', 'I', 'SPAN', 'DIV', 'BR', 'HR',
      'A', 'SUP', 'SUB'
    ]);

    // SECTION and ARTICLE are not in allowedTags, so they are unwrapped before
    // this set is consulted; only DIV and FIGURE ever get the display style.
    const blockDisplayTags = new Set(['DIV', 'SECTION', 'ARTICLE', 'FIGURE']);
    const allowedAttributes = new Set(['href', 'src', 'alt', 'title', 'class', 'style', 'target', 'rel']);

    function sanitizeNode(node) {
      const pending = Array.from(node.children || []);
      while (pending.length > 0) {
        const child = pending.shift();

        if (!allowedTags.has(child.tagName)) {
          // Unwrapping promotes the element's children into `node`. Queue them
          // ahead of the remaining siblings so they are sanitized as well;
          // otherwise anything nested in a disallowed wrapper is never checked.
          const promoted = Array.from(child.children);
          child.replaceWith(...child.childNodes);
          pending.unshift(...promoted);
          continue;
        }

        if (blockDisplayTags.has(child.tagName)) {
          child.style.display = 'block';
        }

        // Copy the attribute list first: removing while iterating the live
        // NamedNodeMap would skip entries.
        const attributes = Array.from(child.attributes);
        for (const attr of attributes) {
          if (!allowedAttributes.has(attr.name.toLowerCase())) {
            child.removeAttribute(attr.name);
          }
        }

        sanitizeNode(child);
      }
    }

    sanitizeNode(root);

    root.querySelectorAll('img').forEach(img => {
      if (!img.getAttribute('loading')) {
        img.setAttribute('loading', 'lazy');
      }
      img.style.maxWidth = '100%';
      img.style.height = 'auto';
    });

    return template.innerHTML;
  }, rawHtml);
}
441
+
442
/**
 * Escapes the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) so
 * the text can be placed safely in element content or a quoted attribute.
 *
 * @param {*} [text=''] - Value to escape. `null`/`undefined` yield ''; any
 *   other non-string value is converted with `String()` first.
 * @returns {string} The escaped text.
 */
function escapeHtml(text = '') {
  // The default parameter only covers `undefined`; `null` and non-string
  // values (e.g. a numeric title) would throw on `.replace`.
  if (text == null) return '';

  const entities = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  // A single pass cannot escape the `&` of an entity it has just produced.
  return String(text).replace(/[&<>"']/g, (char) => entities[char]);
}
450
+
451
/**
 * Wraps sanitized article markup in a standalone HTML document for printing.
 *
 * The document sets `<base href>` to `GEEKTIME_BASE_URL`, embeds the base
 * stylesheet defined here followed by `PRINT_FIX_CSS`, and renders the title
 * as an `<h1>` above the article body.
 *
 * @param {string} title - Article title; HTML-escaped before insertion.
 * @param {string} sanitizedHtml - Article body markup, inserted verbatim
 *   (callers pass the output of `sanitizeArticleHtml`).
 * @returns {string} The complete HTML document.
 */
function buildPrintableHtml(title, sanitizedHtml) {
  const pageCss = `
    body {
      font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
      font-size: 16px;
      line-height: 1.8;
      color: #1f2329;
      margin: 0;
      padding: 40px;
      background: #fff;
    }

    .article-print-wrapper {
      max-width: 900px;
      margin: 0 auto;
    }

    .article-print-wrapper h1 {
      font-size: 32px;
      line-height: 1.4;
      margin-bottom: 24px;
    }

    a {
      color: #0f5ef2;
      text-decoration: none;
    }

    pre {
      background: #f7f7f7;
      padding: 16px;
      border-radius: 6px;
      overflow: auto;
    }
  `;

  // The title is the only untrusted plain text placed in the markup.
  const headingText = escapeHtml(title);
  const styleSheet = `${pageCss}${PRINT_FIX_CSS}`;

  return `
    <!DOCTYPE html>
    <html lang="zh-CN">
      <head>
        <meta charset="utf-8">
        <base href="${GEEKTIME_BASE_URL}">
        <style>${styleSheet}</style>
      </head>
      <body>
        <div class="article-print-wrapper">
          <h1>${headingText}</h1>
          ${sanitizedHtml}
        </div>
      </body>
    </html>`;
}
503
+
257
504
  // 获取专栏所有文章列表(通过API)
258
505
  function getValueByPath(obj, path) {
259
506
  if (!obj || !path) return undefined;
@@ -699,128 +946,67 @@ async function downloadWithConcurrency(context, articles, outputDir, concurrency
699
946
  // 下载单篇文章为 PDF(静默模式,不显示单独的spinner)
700
947
  async function downloadArticleSilent(page, article, outputDir, index, total) {
701
948
  try {
702
- // 访问文章页面
703
- await page.goto(article.url, { waitUntil: 'networkidle' });
704
- await page.waitForTimeout(2000);
705
-
706
- // 注入打印修复样式
707
- await page.addStyleTag({ content: PRINT_FIX_CSS });
708
-
709
- // 激进的布局重构:提取正文并重建页面结构
710
- await page.evaluate((titleText) => {
711
- // 1. 找到文章正文内容
712
- const articleContent = document.querySelector('.Index_articleContent_QBG5G, .article-content, article, [class*="articleContent"]');
713
-
714
- if (articleContent) {
715
- // 2. 克隆正文内容
716
- const contentClone = articleContent.cloneNode(true);
717
-
718
- // 3. 清空body的所有内容
719
- document.body.innerHTML = '';
720
-
721
- // 4. 重置body样式为全宽
722
- document.body.style.margin = '0';
723
- document.body.style.padding = '0';
724
- document.body.style.width = '100%';
725
- document.body.style.maxWidth = 'none';
726
- document.body.style.boxSizing = 'border-box';
727
-
728
- // 5. 创建一个简单的容器
729
- const wrapper = document.createElement('div');
730
- wrapper.style.width = '100%';
731
- wrapper.style.maxWidth = '100%';
732
- wrapper.style.margin = '0';
733
- wrapper.style.padding = '0';
734
- wrapper.style.boxSizing = 'border-box';
735
-
736
- // 6. 创建标题元素(使用传入的标题文本)
737
- if (titleText) {
738
- const titleElement = document.createElement('h1');
739
- titleElement.textContent = titleText;
740
- // 设置标题样式
741
- titleElement.style.fontSize = '32px';
742
- titleElement.style.fontWeight = 'bold';
743
- titleElement.style.marginBottom = '30px';
744
- titleElement.style.marginTop = '0';
745
- titleElement.style.lineHeight = '1.4';
746
- titleElement.style.color = '#000';
747
- wrapper.appendChild(titleElement);
748
- }
749
-
750
- // 7. 将正文插入容器
751
- wrapper.appendChild(contentClone);
752
-
753
- // 8. 将容器插入body
754
- document.body.appendChild(wrapper);
755
-
756
- // 9. 确保正文内容使用全宽且不溢出
757
- contentClone.style.width = '100%';
758
- contentClone.style.maxWidth = '100%';
759
- contentClone.style.margin = '0';
760
- contentClone.style.padding = '0';
761
- contentClone.style.boxSizing = 'border-box';
762
- contentClone.style.overflowWrap = 'break-word';
763
- contentClone.style.wordBreak = 'break-word';
764
- } else {
765
- // 如果找不到正文,使用原有的删除方法
766
- const selectors = [
767
- 'aside',
768
- '[class*="leftSide"]',
769
- '[class*="LeftSide"]',
770
- '[class*="sidebar"]',
771
- '[class*="Sidebar"]',
772
- '[class*="side_"]',
773
- '[class*="catalog"]',
774
- '[class*="directory"]',
775
- '[class*="toc"]',
776
- '[class*="outline"]',
777
- '[class*="Outline"]',
778
- 'nav',
779
- '[class*="nav"]',
780
- '[class*="Nav"]',
781
- '[class*="rightSide"]',
782
- '[class*="RightSide"]',
783
- '[class*="comment"]',
784
- '[class*="recommend"]',
785
- '[class*="footer"]',
786
- '[class*="bottom"]'
787
- ];
949
+ if (process.env.DEBUG) {
950
+ console.log(chalk.gray(`[silent] 准备处理文章 ${article.id} - ${article.originalTitle || article.title}`));
951
+ }
952
+ const articleData = await fetchArticleData(page.context(), article.id);
953
+ if (process.env.DEBUG) {
954
+ console.log(chalk.gray(`[silent] 已获取文章数据 ${article.id}`));
955
+ }
956
+ const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
957
+ const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
958
+ if (process.env.DEBUG) {
959
+ console.log(chalk.gray(`[silent] 已完成内容清洗 ${article.id}`));
960
+ }
961
+ const printableHtml = buildPrintableHtml(article.originalTitle || article.title, sanitizedHtml);
788
962
 
789
- selectors.forEach(selector => {
790
- try {
791
- const elements = document.querySelectorAll(selector);
792
- elements.forEach(el => el.remove());
793
- } catch (e) {
794
- // 忽略无效选择器
795
- }
796
- });
963
+ await page.setContent(printableHtml, { waitUntil: 'domcontentloaded' });
964
+ if (process.env.DEBUG) {
965
+ console.log(chalk.gray(`[silent] 已设置页面内容 ${article.id}`));
966
+ }
967
+ try {
968
+ await page.waitForLoadState('networkidle', { timeout: 5000 });
969
+ if (process.env.DEBUG) {
970
+ console.log(chalk.gray(`[silent] networkidle 完成 ${article.id}`));
797
971
  }
798
-
799
- // 额外:删除所有包含"大纲"的元素
800
- const allElements = document.querySelectorAll('*');
801
- allElements.forEach(el => {
802
- const text = el.textContent || el.innerText || '';
803
- if (text.trim() === '大纲' ||
804
- (text.length < 200 && text.includes('大纲') && el.children.length <= 10)) {
805
- el.remove();
806
- }
807
- });
808
- }, article.originalTitle || article.title);
809
-
810
- // 等待文章内容加载
811
- await page.waitForSelector('.Index_articleContent_QBG5G, .content');
972
+ } catch {
973
+ // 忽略由于没有额外资源导致的延时
974
+ if (process.env.DEBUG) {
975
+ console.log(chalk.gray(`[silent] networkidle 超时(已忽略) ${article.id}`));
976
+ }
977
+ }
812
978
 
813
979
  // 优化图片大小:将大图片转换为合适的尺寸,减小PDF体积
980
+ if (process.env.DEBUG) {
981
+ console.log(chalk.gray(`[silent] 开始处理图片 ${article.id}`));
982
+ }
814
983
  await page.evaluate(() => {
815
984
  const images = document.querySelectorAll('img');
816
985
  const promises = Array.from(images).map(img => {
817
986
  return new Promise((resolve) => {
987
+ let resolved = false;
988
+ const safeResolve = () => {
989
+ if (!resolved) {
990
+ resolved = true;
991
+ resolve();
992
+ }
993
+ };
994
+ const attachTimeout = () => setTimeout(safeResolve, 3000);
995
+ let fallbackTimer = null;
996
+
818
997
  // 如果图片还未加载完成,等待加载
819
998
  if (!img.complete) {
820
- img.onload = () => processImage(img, resolve);
821
- img.onerror = () => resolve(); // 图片加载失败,跳过
999
+ fallbackTimer = attachTimeout();
1000
+ img.onload = () => {
1001
+ if (fallbackTimer) clearTimeout(fallbackTimer);
1002
+ processImage(img, safeResolve);
1003
+ };
1004
+ img.onerror = () => {
1005
+ if (fallbackTimer) clearTimeout(fallbackTimer);
1006
+ safeResolve(); // 图片加载失败,跳过
1007
+ };
822
1008
  } else {
823
- processImage(img, resolve);
1009
+ processImage(img, safeResolve);
824
1010
  }
825
1011
  });
826
1012
  });
@@ -848,12 +1034,21 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
848
1034
  ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
849
1035
 
850
1036
  // 转换为压缩后的data URL
1037
+ let hasResolved = false;
1038
+ const finalize = () => {
1039
+ if (!hasResolved) {
1040
+ hasResolved = true;
1041
+ resolve();
1042
+ }
1043
+ };
851
1044
  canvas.toBlob((blob) => {
852
- const url = URL.createObjectURL(blob);
853
- img.src = url;
1045
+ if (blob) {
1046
+ const url = URL.createObjectURL(blob);
1047
+ img.src = url;
1048
+ }
854
1049
  img.style.width = maxWidth + 'px';
855
1050
  img.style.height = 'auto';
856
- resolve();
1051
+ finalize();
857
1052
  }, 'image/jpeg', quality);
858
1053
  } catch (e) {
859
1054
  // 如果压缩失败,至少限制大小
@@ -865,9 +1060,15 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
865
1060
 
866
1061
  return Promise.all(promises);
867
1062
  });
1063
+ if (process.env.DEBUG) {
1064
+ console.log(chalk.gray(`[silent] 图片处理完成 ${article.id}`));
1065
+ }
868
1066
 
869
1067
  // 等待图片处理完成
870
1068
  await page.waitForTimeout(1000);
1069
+ if (process.env.DEBUG) {
1070
+ console.log(chalk.gray(`[silent] 已准备生成PDF ${article.id}`));
1071
+ }
871
1072
 
872
1073
  // 生成 PDF
873
1074
  const filename = `${String(index).padStart(3, '0')}_${article.title}.pdf`;
@@ -885,10 +1086,16 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
885
1086
  printBackground: false, // 关闭背景打印,显著减小文件大小
886
1087
  preferCSSPageSize: false
887
1088
  });
1089
+ if (process.env.DEBUG) {
1090
+ console.log(chalk.gray(`[silent] PDF生成完成 ${article.id}`));
1091
+ }
888
1092
 
889
1093
  return { success: true, title: article.title };
890
1094
 
891
1095
  } catch (error) {
1096
+ if (process.env.DEBUG) {
1097
+ console.log(chalk.red(`[silent] 文章 ${article.id} 失败: ${error.message}`));
1098
+ }
892
1099
  return { success: false, title: article.title, error: error.message };
893
1100
  }
894
1101
  }
@@ -898,116 +1105,17 @@ async function downloadArticle(page, article, outputDir, index, total) {
898
1105
  const spinner = ora(`[${index}/${total}] 正在下载: ${article.title}`).start();
899
1106
 
900
1107
  try {
901
- // 访问文章页面
902
- await page.goto(article.url, { waitUntil: 'networkidle' });
903
- await page.waitForTimeout(2000);
904
-
905
- // 注入打印修复样式
906
- await page.addStyleTag({ content: PRINT_FIX_CSS });
907
-
908
- // 激进的布局重构:提取正文并重建页面结构
909
- await page.evaluate((titleText) => {
910
- // 1. 找到文章正文内容
911
- const articleContent = document.querySelector('.Index_articleContent_QBG5G, .article-content, article, [class*="articleContent"]');
912
-
913
- if (articleContent) {
914
- // 2. 克隆正文内容
915
- const contentClone = articleContent.cloneNode(true);
916
-
917
- // 3. 清空body的所有内容
918
- document.body.innerHTML = '';
919
-
920
- // 4. 重置body样式为全宽
921
- document.body.style.margin = '0';
922
- document.body.style.padding = '0';
923
- document.body.style.width = '100%';
924
- document.body.style.maxWidth = 'none';
925
- document.body.style.boxSizing = 'border-box';
926
-
927
- // 5. 创建一个简单的容器
928
- const wrapper = document.createElement('div');
929
- wrapper.style.width = '100%';
930
- wrapper.style.maxWidth = '100%';
931
- wrapper.style.margin = '0';
932
- wrapper.style.padding = '0';
933
- wrapper.style.boxSizing = 'border-box';
934
-
935
- // 6. 创建标题元素(使用传入的标题文本)
936
- if (titleText) {
937
- const titleElement = document.createElement('h1');
938
- titleElement.textContent = titleText;
939
- // 设置标题样式
940
- titleElement.style.fontSize = '32px';
941
- titleElement.style.fontWeight = 'bold';
942
- titleElement.style.marginBottom = '30px';
943
- titleElement.style.marginTop = '0';
944
- titleElement.style.lineHeight = '1.4';
945
- titleElement.style.color = '#000';
946
- wrapper.appendChild(titleElement);
947
- }
948
-
949
- // 7. 将正文插入容器
950
- wrapper.appendChild(contentClone);
1108
+ const articleData = await fetchArticleData(page.context(), article.id);
1109
+ const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
1110
+ const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
1111
+ const printableHtml = buildPrintableHtml(article.originalTitle || article.title, sanitizedHtml);
951
1112
 
952
- // 8. 将容器插入body
953
- document.body.appendChild(wrapper);
954
-
955
- // 9. 确保正文内容使用全宽且不溢出
956
- contentClone.style.width = '100%';
957
- contentClone.style.maxWidth = '100%';
958
- contentClone.style.margin = '0';
959
- contentClone.style.padding = '0';
960
- contentClone.style.boxSizing = 'border-box';
961
- contentClone.style.overflowWrap = 'break-word';
962
- contentClone.style.wordBreak = 'break-word';
963
- } else {
964
- // 如果找不到正文,使用原有的删除方法
965
- const selectors = [
966
- 'aside',
967
- '[class*="leftSide"]',
968
- '[class*="LeftSide"]',
969
- '[class*="sidebar"]',
970
- '[class*="Sidebar"]',
971
- '[class*="side_"]',
972
- '[class*="catalog"]',
973
- '[class*="directory"]',
974
- '[class*="toc"]',
975
- '[class*="outline"]',
976
- '[class*="Outline"]',
977
- 'nav',
978
- '[class*="nav"]',
979
- '[class*="Nav"]',
980
- '[class*="rightSide"]',
981
- '[class*="RightSide"]',
982
- '[class*="comment"]',
983
- '[class*="recommend"]',
984
- '[class*="footer"]',
985
- '[class*="bottom"]'
986
- ];
987
-
988
- selectors.forEach(selector => {
989
- try {
990
- const elements = document.querySelectorAll(selector);
991
- elements.forEach(el => el.remove());
992
- } catch (e) {
993
- // 忽略无效选择器
994
- }
995
- });
996
- }
997
-
998
- // 额外:删除所有包含"大纲"的元素
999
- const allElements = document.querySelectorAll('*');
1000
- allElements.forEach(el => {
1001
- const text = el.textContent || el.innerText || '';
1002
- if (text.trim() === '大纲' ||
1003
- (text.length < 200 && text.includes('大纲') && el.children.length <= 10)) {
1004
- el.remove();
1005
- }
1006
- });
1007
- }, article.originalTitle || article.title);
1008
-
1009
- // 等待文章内容加载
1010
- await page.waitForSelector('.Index_articleContent_QBG5G, .content');
1113
+ await page.setContent(printableHtml, { waitUntil: 'domcontentloaded' });
1114
+ try {
1115
+ await page.waitForLoadState('networkidle', { timeout: 5000 });
1116
+ } catch {
1117
+ // 没有额外资源加载时忽略
1118
+ }
1011
1119
 
1012
1120
  // 优化图片大小:将大图片转换为合适的尺寸,减小PDF体积
1013
1121
  await page.evaluate(() => {
@@ -1206,460 +1314,32 @@ async function mergePDFs(outputDir, columnTitle, articles, deleteAfterMerge = fa
1206
1314
  // 提取单篇文章的 HTML 内容(用于 EPUB 生成)
1207
1315
  async function extractArticleContent(page, article, index, total) {
1208
1316
  try {
1209
- // 访问文章页面
1210
- await page.goto(article.url, { waitUntil: 'networkidle' });
1211
-
1212
- // 等待文章内容加载
1213
- await page.waitForSelector('.Index_articleContent_QBG5G, .content', { timeout: 60000 });
1214
-
1215
- // 关键:等待文章完整内容加载,而不是试看内容
1216
- // 滚动页面以触发懒加载内容
1217
- await page.evaluate(async () => {
1218
- await new Promise((resolve) => {
1219
- let totalHeight = 0;
1220
- const distance = 100;
1221
- const timer = setInterval(() => {
1222
- const scrollHeight = document.body.scrollHeight;
1223
- window.scrollBy(0, distance);
1224
- totalHeight += distance;
1225
-
1226
- if (totalHeight >= scrollHeight) {
1227
- clearInterval(timer);
1228
- resolve();
1229
- }
1230
- }, 100);
1231
- });
1232
- });
1233
-
1234
- // 再等待一段时间,确保内容完全加载
1235
- await page.waitForTimeout(3000);
1236
-
1237
- // 提取文章 HTML 内容
1238
- const content = await page.evaluate(() => {
1239
- // 找到文章正文内容
1240
- const articleContent = document.querySelector('.Index_articleContent_QBG5G, .article-content, article, [class*="articleContent"]');
1241
-
1242
- if (!articleContent) {
1243
- return null;
1244
- }
1245
-
1246
- // 克隆正文以避免修改原始DOM
1247
- const contentClone = articleContent.cloneNode(true);
1248
-
1249
- // 白名单策略:只保留正文核心元素
1250
- // 允许的元素标签
1251
- const allowedTags = new Set([
1252
- 'P', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', // 段落和标题
1253
- 'UL', 'OL', 'LI', // 列表
1254
- 'BLOCKQUOTE', // 引用
1255
- 'PRE', 'CODE', // 代码
1256
- 'IMG', // 图片
1257
- 'TABLE', 'THEAD', 'TBODY', 'TR', 'TH', 'TD', // 表格
1258
- 'A', // 链接
1259
- 'STRONG', 'B', 'EM', 'I', 'U', // 强调和样式
1260
- 'BR', 'HR', // 换行和分隔线
1261
- 'FIGURE', 'FIGCAPTION', 'DETAILS', 'SUMMARY',
1262
- 'SPAN', 'DIV', 'SECTION', 'ARTICLE' // 容器(可能包含文本)
1263
- ]);
1264
-
1265
- // 在清理前,移除常见的非正文区域
1266
- const removalSelectors = [
1267
- 'nav', 'header', 'footer', 'aside',
1268
- '.comment', '.comments', '.Index_comment',
1269
- '.recommend', '.recommendation', '.related', '.advertisement', '.ad', '.banner',
1270
- '.subscribe', '.subscription', '.toolbar', '.Index_shareIcons_1vtJa',
1271
- '.keyboard-wrapper', '.app-download', '.article-actions', '.article-bottom',
1272
- '.note', '.notes', '.annotation', '.translation', '.trans', '.translator',
1273
- '.audio', '.audio-player', '.voice', '.player', '.geek-player', '.podcast', '.radio',
1274
- '.reward', '.appreciate', '.appreciation', '.donate', '.sponsor', '.thanks', '.support',
1275
- '.qrcode', '.qr-code', '.qr', '.promotion', '.promo', '.ad-banner',
1276
- '.copyright', '.statement', '.disclaimer',
1277
- '.app-download-banner', '.article-plugin', '.article-notification', '.float-bar',
1278
- 'audio', 'video',
1279
- '[class*="Note"]', '[class*="note"]', '[class*="Translation"]', '[class*="translation"]',
1280
- '[class*="Audio"]', '[class*="audio"]', '[class*="Reward"]', '[class*="reward"]',
1281
- '[data-plugin]', '[data-track]', '[data-track-section]', '[data-translation]', '[data-audio]',
1282
- '[data-role="toolbar"]',
1283
- 'button', 'iframe', 'script', 'style'
1284
- ];
1285
- removalSelectors.forEach(selector => {
1286
- contentClone.querySelectorAll(selector).forEach(el => el.remove());
1287
- });
1288
-
1289
- // 根据关键词进一步移除插件类元素
1290
- const pluginKeywords = [
1291
- 'note', 'translation', 'audio', 'player', 'reward', 'donate',
1292
- 'appreciation', 'sponsor', 'qrcode', 'toolbar', 'plugin',
1293
- 'copyright', 'geeknote', 'bilingual'
1294
- ];
1295
- const pluginElements = Array.from(contentClone.querySelectorAll('*')).filter(el => {
1296
- const className = (el.className || '').toString().toLowerCase();
1297
- const idValue = (el.id || '').toString().toLowerCase();
1298
- const roleValue = (el.getAttribute && el.getAttribute('role')) ? el.getAttribute('role').toLowerCase() : '';
1299
- const datasetValues = el.dataset ? Object.values(el.dataset).join(' ').toLowerCase() : '';
1300
- const combined = `${className} ${idValue} ${roleValue} ${datasetValues}`;
1301
- return pluginKeywords.some(keyword => combined.includes(keyword));
1302
- });
1303
- pluginElements.forEach(el => el.remove());
1304
-
1305
- // 移除 MindMap 等 SVG/Canvas 思维导图内容(阅读器无法正确渲染)
1306
- const mindmapSelectors = [
1307
- '.mindmap', '.mind-map', '.MindMap', '.Mind-map',
1308
- '[data-type="mindmap"]', '[data-role="mindmap"]', '[data-widget="mindmap"]',
1309
- '[class*="MindMap"]', '[class*="mindMap"]'
1310
- ];
1311
- mindmapSelectors.forEach(selector => {
1312
- contentClone.querySelectorAll(selector).forEach(el => el.remove());
1313
- });
1314
- const vectorCandidates = Array.from(contentClone.querySelectorAll('svg, canvas, object, embed'));
1315
- vectorCandidates.forEach(el => {
1316
- const className = typeof el.className === 'object' ? el.className.baseVal : (el.className || '');
1317
- const meta = `${className} ${el.id || ''} ${el.getAttribute('data-type') || ''}`.toLowerCase();
1318
- if (meta.includes('mind') || meta.includes('mindmap') || meta.includes('mind-map')) {
1319
- el.remove();
1320
- }
1321
- });
1322
-
1323
- // 将富文本中的代码块结构转换为标准 <pre><code>
1324
- const blockSeparatorTags = new Set([
1325
- 'P','DIV','SECTION','ARTICLE','UL','OL','LI','FIGURE','FIGCAPTION',
1326
- 'TABLE','THEAD','TBODY','TR','TD'
1327
- ]);
1328
-
1329
- function collectCodeText(node) {
1330
- const parts = [];
1331
-
1332
- const ensureNewline = () => {
1333
- if (!parts.length) {
1334
- parts.push('\n');
1335
- return;
1336
- }
1337
- if (!parts[parts.length - 1].endsWith('\n')) {
1338
- parts.push('\n');
1339
- }
1340
- };
1341
-
1342
- const traverse = (current) => {
1343
- if (!current) {
1344
- return;
1345
- }
1346
- if (current.nodeType === Node.TEXT_NODE) {
1347
- const textValue = current.textContent.replace(/\u00A0/g, ' ');
1348
- if (textValue) {
1349
- parts.push(textValue);
1350
- }
1351
- return;
1352
- }
1353
- if (current.nodeType !== Node.ELEMENT_NODE) {
1354
- return;
1355
- }
1356
- const tag = current.tagName.toUpperCase();
1357
- if (tag === 'BR') {
1358
- ensureNewline();
1359
- return;
1360
- }
1361
- Array.from(current.childNodes).forEach(traverse);
1362
- if (blockSeparatorTags.has(tag)) {
1363
- ensureNewline();
1364
- }
1365
- };
1366
-
1367
- traverse(node);
1368
- let text = parts.join('');
1369
- text = text
1370
- .replace(/\r\n/g, '\n')
1371
- .replace(/\n{3,}/g, '\n\n')
1372
- .replace(/[ \t]+\n/g, '\n')
1373
- .replace(/\n+$/g, '\n');
1374
- return text.trim() ? text : '';
1375
- }
1376
-
1377
- const codeLikeSelectors = [
1378
- '[data-slate-type="code"]',
1379
- '[data-slate-node="code"]',
1380
- '[data-code-block]',
1381
- '[data-code]',
1382
- '[data-code-language]',
1383
- '[class*="code-block"]',
1384
- '[class*="CodeBlock"]'
1385
- ];
1386
- const codeCandidates = new Set();
1387
- codeLikeSelectors.forEach(selector => {
1388
- contentClone.querySelectorAll(selector).forEach(el => codeCandidates.add(el));
1389
- });
1390
- const replaceWithPre = (element) => {
1391
- if (!element || !element.parentNode) {
1392
- return;
1393
- }
1394
- const codeText = collectCodeText(element);
1395
- if (!codeText) {
1396
- element.remove();
1397
- return;
1398
- }
1399
- const pre = document.createElement('pre');
1400
- const code = document.createElement('code');
1401
- code.textContent = codeText;
1402
- pre.appendChild(code);
1403
- element.parentNode.replaceChild(pre, element);
1404
- };
1405
- codeCandidates.forEach(el => {
1406
- if (el.tagName && el.tagName.toUpperCase() === 'PRE') {
1407
- return;
1408
- }
1409
- replaceWithPre(el);
1410
- });
1411
-
1412
- const multilineInlineCodes = Array.from(contentClone.querySelectorAll('code')).filter(codeEl => {
1413
- const parent = codeEl.parentElement;
1414
- return parent && parent.tagName.toUpperCase() !== 'PRE' && codeEl.textContent.includes('\n');
1415
- });
1416
- multilineInlineCodes.forEach(codeEl => {
1417
- const codeText = collectCodeText(codeEl);
1418
- if (!codeText) {
1419
- codeEl.remove();
1420
- return;
1421
- }
1422
- const pre = document.createElement('pre');
1423
- const innerCode = document.createElement('code');
1424
- innerCode.textContent = codeText;
1425
- pre.appendChild(innerCode);
1426
- codeEl.parentNode.replaceChild(pre, codeEl);
1427
- });
1428
-
1429
- // 递归清理函数:移除不在白名单中的元素
1430
- function cleanElement(element) {
1431
- const children = Array.from(element.childNodes);
1432
-
1433
- for (const child of children) {
1434
- if (child.nodeType === Node.ELEMENT_NODE) {
1435
- const tagName = child.tagName.toUpperCase();
1436
-
1437
- if (!allowedTags.has(tagName)) {
1438
- // 先递归处理子节点
1439
- cleanElement(child);
1440
-
1441
- if (child.childNodes.length > 0) {
1442
- while (child.firstChild) {
1443
- element.insertBefore(child.firstChild, child);
1444
- }
1445
- child.remove();
1446
- } else {
1447
- const textContent = (child.textContent || '').trim();
1448
- if (textContent) {
1449
- const textNode = document.createTextNode(textContent + ' ');
1450
- element.insertBefore(textNode, child);
1451
- }
1452
- child.remove();
1453
- }
1454
- } else {
1455
- cleanElement(child);
1456
- }
1457
- }
1458
- }
1459
- }
1460
-
1461
- cleanElement(contentClone);
1462
-
1463
- // 移除所有style属性,避免样式冲突
1464
- const allElements = contentClone.querySelectorAll('*');
1465
- allElements.forEach(el => {
1466
- el.removeAttribute('style');
1467
- el.removeAttribute('class');
1468
- el.removeAttribute('id');
1469
- el.removeAttribute('onclick');
1470
- el.removeAttribute('onload');
1471
- });
1472
-
1473
- // 处理图片URL
1474
- const images = contentClone.querySelectorAll('img');
1475
- const adKeywordLower = ['ad', 'advert', 'banner', 'qrcode', 'qr-code', 'reward', 'donate', 'appdownload', 'app-download', 'sponsor', 'thanks'];
1476
- const adKeywordCn = ['广告', '二维码', '赞赏', '打赏', '版权', '推广'];
1477
- images.forEach(img => {
1478
- let src = img.getAttribute('src');
1479
- const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') || img.getAttribute('data-lazy-src');
1480
-
1481
- if (dataSrc && (dataSrc.startsWith('http://') || dataSrc.startsWith('https://'))) {
1482
- src = dataSrc;
1483
- img.setAttribute('src', src);
1484
- }
1485
-
1486
- if (!src || src.startsWith('blob:') || src.startsWith('data:')) {
1487
- img.remove();
1488
- return;
1489
- }
1490
-
1491
- if (!src.startsWith('http://') && !src.startsWith('https://')) {
1492
- try {
1493
- const absoluteUrl = new URL(src, window.location.href).href;
1494
- img.setAttribute('src', absoluteUrl);
1495
- src = absoluteUrl;
1496
- } catch (e) {
1497
- img.remove();
1498
- }
1499
- }
1317
+ const articleData = await fetchArticleData(page.context(), article.id);
1318
+ const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
1319
+ const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
1500
1320
 
1501
- const altText = img.getAttribute('alt') || '';
1502
- const altLower = altText.toLowerCase();
1503
- const srcLower = (src || '').toLowerCase();
1504
- if (
1505
- adKeywordLower.some(keyword => srcLower.includes(keyword)) ||
1506
- adKeywordLower.some(keyword => altLower.includes(keyword)) ||
1507
- adKeywordCn.some(keyword => altText.includes(keyword))
1508
- ) {
1509
- img.remove();
1510
- return;
1511
- }
1512
-
1513
- // 清理图片属性
1514
- const imgAttrs = img.attributes;
1515
- for (let i = imgAttrs.length - 1; i >= 0; i--) {
1516
- const attrName = imgAttrs[i].name;
1517
- if (attrName !== 'src' && attrName !== 'alt') {
1518
- img.removeAttribute(attrName);
1519
- }
1520
- }
1521
- });
1522
-
1523
- // 清理空的div和span
1524
- const containers = contentClone.querySelectorAll('div, span');
1525
- containers.forEach(container => {
1526
- if (!container.textContent.trim() && !container.querySelector('img, pre, code, table')) {
1527
- container.remove();
1528
- }
1529
- });
1530
-
1531
- // 将只包含纯文本的 div 转换为段落,避免没有段间距
1532
- const blockLikeTags = new Set(['P','UL','OL','LI','TABLE','PRE','BLOCKQUOTE','H1','H2','H3','H4','H5','H6','IMG','SECTION','ARTICLE','FIGURE','FIGCAPTION','DETAILS','SUMMARY']);
1533
- const textContainers = Array.from(contentClone.querySelectorAll('div, section, article')).reverse();
1534
- textContainers.forEach(container => {
1535
- if (container === contentClone) {
1536
- return;
1537
- }
1538
-
1539
- if (!container.textContent.trim()) {
1540
- return;
1541
- }
1542
-
1543
- if (container.querySelector('img, pre, table, ul, ol, blockquote, h1, h2, h3, h4, h5, h6, figure')) {
1544
- return;
1545
- }
1546
-
1547
- const hasBlockChildren = Array.from(container.children).some(child => blockLikeTags.has(child.tagName?.toUpperCase()));
1548
- if (hasBlockChildren) {
1549
- return;
1550
- }
1551
-
1552
- const paragraph = document.createElement('p');
1553
- paragraph.innerHTML = container.innerHTML;
1554
- container.parentNode.replaceChild(paragraph, container);
1555
- });
1556
-
1557
- // 包装直接挂在容器下的文本或行内节点,避免散乱文本没有段落间距
1558
- const inlineTags = new Set(['A','SPAN','STRONG','B','EM','I','U','CODE','SMALL','SUB','SUP','MARK']);
1559
-
1560
- function wrapInlineChildren(element) {
1561
- const tagName = element.tagName ? element.tagName.toUpperCase() : '';
1562
- if (['P','LI','PRE','CODE','TABLE','THEAD','TBODY','TR'].includes(tagName)) {
1563
- return;
1564
- }
1565
-
1566
- const childNodes = Array.from(element.childNodes);
1567
- let buffer = [];
1568
-
1569
- const flushBuffer = (referenceNode) => {
1570
- if (!buffer.length) {
1571
- return;
1572
- }
1573
- const paragraph = document.createElement('p');
1574
- buffer.forEach(node => paragraph.appendChild(node));
1575
- element.insertBefore(paragraph, referenceNode);
1576
- buffer = [];
1577
- };
1578
-
1579
- for (const node of childNodes) {
1580
- if (node.nodeType === Node.TEXT_NODE) {
1581
- if (node.textContent.trim()) {
1582
- buffer.push(node);
1583
- } else {
1584
- element.removeChild(node);
1585
- }
1586
- continue;
1587
- }
1588
-
1589
- if (node.nodeType === Node.ELEMENT_NODE) {
1590
- const childTag = node.tagName.toUpperCase();
1591
- if (inlineTags.has(childTag) || childTag === 'BR') {
1592
- buffer.push(node);
1593
- continue;
1594
- }
1595
-
1596
- flushBuffer(node);
1597
- wrapInlineChildren(node);
1598
- continue;
1599
- }
1600
-
1601
- flushBuffer(node);
1602
- }
1603
-
1604
- flushBuffer(null);
1605
- }
1606
-
1607
- wrapInlineChildren(contentClone);
1608
-
1609
- // 移除尾部的版权/广告声明
1610
- const footerKeywords = ['版权', '未经许可', '未经授权', '不得转载', '未经允许', 'All Rights Reserved', '最终解释权', '转载'];
1611
- const trailingElements = Array.from(contentClone.querySelectorAll('p, div, section')).slice(-6);
1612
- trailingElements.forEach(el => {
1613
- const text = (el.textContent || '').trim();
1614
- if (!text) {
1615
- return;
1616
- }
1617
- if (text.length <= 200 && footerKeywords.some(keyword => text.includes(keyword))) {
1618
- el.remove();
1619
- }
1620
- });
1621
-
1622
- // 处理代码块
1623
- const codeBlocks = contentClone.querySelectorAll('pre');
1624
- codeBlocks.forEach(block => {
1625
- const codeText = collectCodeText(block);
1626
- if (!codeText) {
1627
- block.remove();
1628
- return;
1629
- }
1630
- let codeInside = block.querySelector('code');
1631
- if (!codeInside) {
1632
- codeInside = document.createElement('code');
1633
- block.appendChild(codeInside);
1634
- }
1635
- codeInside.textContent = codeText;
1636
- });
1637
-
1638
- return contentClone.innerHTML;
1639
- });
1321
+ if (!sanitizedHtml) {
1322
+ throw new Error('未能提取到文章内容');
1323
+ }
1640
1324
 
1641
1325
  return {
1642
1326
  success: true,
1643
1327
  title: article.originalTitle || article.title,
1644
- content: content || `<p>内容提取失败</p>`
1328
+ content: sanitizedHtml
1645
1329
  };
1646
1330
 
1647
1331
  } catch (error) {
1648
- // 判断是否可能是 Cookie 失效
1649
- let errorMessage = error.message;
1650
- if (error.message.includes('Timeout') || error.message.includes('timeout')) {
1651
- errorMessage = 'Cookie 可能已失效或页面加载超时';
1652
- }
1653
-
1332
+ console.error(`[${index}/${total}] 提取文章内容失败: ${article.originalTitle || article.title}`, error);
1654
1333
  return {
1655
1334
  success: false,
1656
1335
  title: article.originalTitle || article.title,
1657
- content: `<p>下载失败: ${errorMessage}</p>`,
1658
- error: errorMessage
1336
+ error: error.message,
1337
+ content: ''
1659
1338
  };
1660
1339
  }
1661
1340
  }
1662
1341
 
1342
+
1663
1343
  // 并发提取文章内容(用于 EPUB)
1664
1344
  async function extractWithConcurrency(context, articles, concurrency = 5, delay = 2000, timeout = 60000) {
1665
1345
  const results = [];
@@ -1766,7 +1446,7 @@ async function generateEPUB(outputDir, columnTitle, columnAuthor, articles, cont
1766
1446
  return null;
1767
1447
  }
1768
1448
 
1769
- const options = {
1449
+ const options = {
1770
1450
  title: columnTitle,
1771
1451
  author: columnAuthor || '极客时间',
1772
1452
  publisher: '极客时间',
@@ -2026,13 +1706,46 @@ async function main(options) {
2026
1706
  globalBrowser = browser;
2027
1707
 
2028
1708
  const context = await browser.newContext({
2029
- userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
1709
+ userAgent: DEFAULT_USER_AGENT
2030
1710
  });
2031
1711
 
1712
+ // 兼容用户直接复制整行"Cookie: xxx"
1713
+ let normalizedCookie = cookie.trim();
1714
+ if (/^cookie:/i.test(normalizedCookie)) {
1715
+ normalizedCookie = normalizedCookie.replace(/^cookie:\s*/i, '');
1716
+ }
1717
+ globalCookieHeader = normalizedCookie;
1718
+
2032
1719
  // 设置 cookies
2033
- const cookies = parseCookies(cookie);
1720
+ const cookies = parseCookies(normalizedCookie);
2034
1721
  await context.addCookies(cookies);
2035
1722
 
1723
+ // 确保所有极客时间域名的请求都携带原始Cookie串,避免Playwright丢失关键字段
1724
+ await context.route('**/*', (route) => {
1725
+ const request = route.request();
1726
+ let url;
1727
+ try {
1728
+ url = new URL(request.url());
1729
+ } catch {
1730
+ return route.continue();
1731
+ }
1732
+
1733
+ const hostname = url.hostname || '';
1734
+ const isGeekbangDomain =
1735
+ hostname === 'geekbang.org' ||
1736
+ hostname.endsWith('.geekbang.org');
1737
+
1738
+ if (!isGeekbangDomain) {
1739
+ return route.continue();
1740
+ }
1741
+
1742
+ const headers = {
1743
+ ...request.headers(),
1744
+ cookie: normalizedCookie
1745
+ };
1746
+ route.continue({ headers });
1747
+ });
1748
+
2036
1749
  const page = await context.newPage();
2037
1750
 
2038
1751
  try {
@@ -2230,7 +1943,7 @@ async function main(options) {
2230
1943
  program
2231
1944
  .name('geektime-dl')
2232
1945
  .description('批量下载极客时间专栏文章为PDF或EPUB')
2233
- .version('1.1.0')
1946
+ .version(version)
2234
1947
  .option('-u, --url <url>', '专栏文章URL(任意一篇)')
2235
1948
  .option('-c, --cookie <cookie>', 'Cookie字符串(用于认证)')
2236
1949
  .option('-o, --output <dir>', '输出目录', './downloads')
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@kadaliao/geektime-downloader",
3
- "version": "1.1.1",
3
+ "version": "1.1.3",
4
4
  "description": "极客时间专栏文章批量下载工具 - 支持一键下载整个专栏为PDF或EPUB",
5
5
  "type": "module",
6
6
  "main": "download.js",