@kadaliao/geektime-downloader 1.1.2 → 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/download.js CHANGED
@@ -19,6 +19,7 @@ const require = createRequire(import.meta.url);
19
19
  const { version } = require('./package.json');
20
20
 
21
21
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
22
// Raw Cookie header string. When non-empty, fetchArticleData sends it verbatim
// as the `cookie` request header; the empty string means "send no explicit cookie".
// NOTE(review): the code that assigns this value is outside the visible excerpt.
let globalCookieHeader = '';
22
23
 
23
24
  // 全局变量:跟踪当前浏览器实例和是否正在关闭
24
25
  let globalBrowser = null;
@@ -244,6 +245,10 @@ const PRINT_FIX_CSS = `
244
245
  }
245
246
  `;
246
247
 
248
// Site origin: used to build the article API URL, the Origin/Referer request
// headers, and the <base href> of the generated printable page.
const GEEKTIME_BASE_URL = 'https://time.geekbang.org';
// Article-detail endpoint that fetchArticleData POSTs to.
const ARTICLE_API_URL = `${GEEKTIME_BASE_URL}/serv/v1/article`;
// User-agent string sent with article API requests.
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
251
+
247
252
  // 解析 cookie 字符串
248
253
  function parseCookies(cookieString) {
249
254
  return cookieString.split(';').map(cookie => {
@@ -257,6 +262,244 @@ function parseCookies(cookieString) {
257
262
  });
258
263
  }
259
264
 
265
/**
 * Normalize article HTML before it is sanitized and rendered.
 *
 * - Removes the `<!-- [[[read_end]]] -->` marker comment.
 * - Rewrites protocol-relative `src`/`href` values (`//host/path`) to `https://`.
 *   Any attribute whose name ends in `src`/`href` (e.g. `data-src`) is rewritten
 *   too, and whitespace around `=` is tolerated.
 *
 * @param {string} [html=''] Raw article HTML.
 * @returns {string} Normalized HTML; `''` for any falsy input.
 */
function normalizeArticleHtml(html = '') {
  if (!html) return '';
  // Coerce so a truthy non-string value does not throw on `.replace`.
  return String(html)
    .replace(/<!--\s*\[\[\[read_end\]\]\]\s*-->/gi, '')
    .replace(
      /(src|href)\s*=\s*(["'])\/\//gi,
      (_match, attr, quote) => `${attr.toLowerCase()}=${quote}https://`
    );
}
274
+
275
/**
 * Fetch one article's data from the article API, retrying on failure.
 *
 * Any failure (non-2xx status, unparsable JSON, non-zero `code`, missing `data`
 * or empty `article_content`) counts as a failed attempt. After the last
 * attempt the most recent error is rethrown.
 *
 * @param {object} context Object exposing `request.post(url, options)`; callers
 *   in this file pass `page.context()`.
 * @param {string|number} articleId Article id; sent as a string in the request body.
 * @param {object} [options]
 * @param {number} [options.maxAttempts=3] Total number of attempts.
 * @param {number} [options.retryDelayMs=700] Back-off unit; the wait after
 *   attempt `k` is `k * retryDelayMs` milliseconds.
 * @returns {Promise<object>} The `data` object of the API response.
 * @throws {Error} The error of the last failed attempt.
 */
async function fetchArticleData(context, articleId, options = {}) {
  const { maxAttempts = 3, retryDelayMs = 700 } = options;
  const refererUrl = `${GEEKTIME_BASE_URL}/column/article/${articleId}`;
  let lastError = null;

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    let response = null;
    try {
      response = await context.request.post(ARTICLE_API_URL, {
        headers: {
          'user-agent': DEFAULT_USER_AGENT,
          'content-type': 'application/json',
          'accept': 'application/json, text/plain, */*',
          'origin': GEEKTIME_BASE_URL,
          'referer': refererUrl,
          'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
          // Only send an explicit cookie header when one was configured.
          ...(globalCookieHeader ? { 'cookie': globalCookieHeader } : {})
        },
        data: {
          id: String(articleId),
          include_neighbors: true,
          is_freelyread: true
        }
      });

      // Read the body first so error messages can quote a snippet of it.
      const bodyText = await response.text();

      if (!response.ok()) {
        throw new Error(`API请求失败: ${response.status()} ${response.statusText()} - ${bodyText.slice(0, 160)}`);
      }

      let json;
      try {
        json = JSON.parse(bodyText);
      } catch (parseError) {
        throw new Error(`API响应解析失败: ${parseError.message} - ${bodyText.slice(0, 160)}`);
      }

      if (!json || json.code !== 0 || !json.data) {
        throw new Error(`无法获取完整文章内容: ${bodyText.slice(0, 160)}`);
      }

      if (!json.data.article_content) {
        throw new Error('文章内容为空,可能需要更新 Cookie 或重新获取权限');
      }

      return json.data;
    } catch (error) {
      lastError = error;
    } finally {
      // Release the response body; otherwise it is retained until the context
      // is closed, which adds up when many articles are downloaded in one run.
      if (response) {
        try {
          await response.dispose();
        } catch {
          // Best-effort cleanup: a dispose failure must not mask the real outcome.
        }
      }
    }

    if (attempt < maxAttempts) {
      // Linear back-off before the next attempt.
      await new Promise(resolve => setTimeout(resolve, attempt * retryDelayMs));
    }
  }

  throw lastError || new Error('未知错误导致文章内容获取失败');
}
331
+
332
/**
 * Strip non-article markup from article HTML inside the browser page.
 *
 * The callback handed to `page.evaluate` runs in the page, so it must be fully
 * self-contained: it cannot reference anything from this module's scope.
 * Steps, in order:
 *   1. parse the HTML into an inert <template>;
 *   2. remove known non-content elements by selector;
 *   3. remove elements whose class/id/role/data-* values contain a plugin keyword;
 *   4. remove mind-map widgets and mind-map-looking svg/canvas/object/embed nodes;
 *   5. unwrap every element whose tag is not allow-listed and drop every
 *      attribute that is not allow-listed;
 *   6. force images to load eagerly and fit the page width.
 *
 * @param {object} page Object exposing `evaluate(fn, arg)` (a Playwright Page in this file).
 * @param {string} rawHtml Article HTML to clean.
 * @returns {Promise<string>} The sanitized HTML fragment.
 */
async function sanitizeArticleHtml(page, rawHtml) {
  return page.evaluate((html) => {
    const template = document.createElement('template');
    template.innerHTML = html;
    const root = template.content;

    const removeAll = (selectors) => {
      selectors.forEach(selector => {
        root.querySelectorAll(selector).forEach(el => el.remove());
      });
    };

    removeAll([
      'nav', 'header', 'footer', 'aside',
      '.comment', '.comments', '.Index_comment',
      '.recommend', '.recommendation', '.related', '.advertisement', '.ad', '.banner',
      '.subscribe', '.subscription', '.toolbar', '.Index_shareIcons_1vtJa',
      '.keyboard-wrapper', '.app-download', '.article-actions', '.article-bottom',
      '.note', '.notes', '.annotation', '.translation', '.trans', '.translator',
      '.audio', '.audio-player', '.voice', '.player', '.geek-player', '.podcast', '.radio',
      '.reward', '.appreciate', '.appreciation', '.donate', '.sponsor', '.thanks', '.support',
      '.qrcode', '.qr-code', '.qr', '.promotion', '.promo', '.ad-banner',
      '.copyright', '.statement', '.disclaimer',
      '.app-download-banner', '.article-plugin', '.article-notification', '.float-bar',
      'audio', 'video',
      '[class*="Note"]', '[class*="note"]', '[class*="Translation"]', '[class*="translation"]',
      '[class*="Audio"]', '[class*="audio"]', '[class*="Reward"]', '[class*="reward"]',
      '[data-plugin]', '[data-track]', '[data-track-section]', '[data-translation]', '[data-audio]',
      '[data-role="toolbar"]',
      'button', 'iframe', 'script', 'style'
    ]);

    // Keyword sweep over class / id / role / data-* values (case-insensitive).
    const pluginKeywords = [
      'note', 'translation', 'audio', 'player', 'reward', 'donate',
      'appreciation', 'sponsor', 'qrcode', 'toolbar', 'plugin',
      'copyright', 'geeknote', 'bilingual'
    ];
    Array.from(root.querySelectorAll('*'))
      .filter(el => {
        // Read the attribute rather than `className`: on SVG elements
        // `className` is an SVGAnimatedString object, not the class text.
        const classValue = el.getAttribute('class') || '';
        const idValue = el.id || '';
        const roleValue = el.getAttribute('role') || '';
        const datasetValues = el.dataset ? Object.values(el.dataset).join(' ') : '';
        const combined = `${classValue} ${idValue} ${roleValue} ${datasetValues}`.toLowerCase();
        return pluginKeywords.some(keyword => combined.includes(keyword));
      })
      .forEach(el => el.remove());

    removeAll([
      '.mindmap', '.mind-map', '.MindMap', '.Mind-map',
      '[data-type="mindmap"]', '[data-role="mindmap"]', '[data-widget="mindmap"]',
      '[class*="MindMap"]', '[class*="mindMap"]'
    ]);
    root.querySelectorAll('svg, canvas, object, embed').forEach(el => {
      const className = typeof el.className === 'object' ? el.className.baseVal : (el.className || '');
      const meta = `${className} ${el.id || ''} ${el.getAttribute('data-type') || ''}`.toLowerCase();
      // 'mind' also covers 'mindmap' and 'mind-map'.
      if (meta.includes('mind')) {
        el.remove();
      }
    });

    const allowedTags = new Set([
      'P', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6',
      'UL', 'OL', 'LI',
      'BLOCKQUOTE', 'PRE', 'CODE',
      'IMG', 'TABLE', 'THEAD', 'TBODY', 'TR', 'TH', 'TD', 'FIGURE', 'FIGCAPTION',
      'STRONG', 'EM', 'B', 'I', 'SPAN', 'DIV', 'BR', 'HR',
      'A', 'SUP', 'SUB'
    ]);
    // SECTION and ARTICLE are not in allowedTags, so they are unwrapped before
    // this set is consulted; only DIV and FIGURE can actually match here.
    const blockDisplayTags = new Set(['DIV', 'SECTION', 'ARTICLE', 'FIGURE']);
    const allowedAttributes = new Set(['href', 'src', 'alt', 'title', 'class', 'style', 'target', 'rel']);

    function sanitizeNode(node) {
      for (const child of Array.from(node.children || [])) {
        // Clean the subtree FIRST. If the child is then unwrapped, everything it
        // promotes into `node` has already been sanitized. Unwrapping before
        // recursing would let the promoted nodes escape, because they are not
        // part of the snapshot being iterated.
        sanitizeNode(child);

        if (!allowedTags.has(child.tagName)) {
          child.replaceWith(...child.childNodes);
          continue;
        }

        if (blockDisplayTags.has(child.tagName)) {
          child.style.display = 'block';
        }

        for (const attr of Array.from(child.attributes)) {
          if (!allowedAttributes.has(attr.name.toLowerCase())) {
            child.removeAttribute(attr.name);
          }
        }
      }
    }

    sanitizeNode(root);

    // Set after sanitizing so `loading`/`decoding` are not stripped again.
    root.querySelectorAll('img').forEach(img => {
      img.setAttribute('loading', 'eager');
      img.setAttribute('decoding', 'sync');
      img.style.maxWidth = '100%';
      img.style.height = 'auto';
    });

    return template.innerHTML;
  }, rawHtml);
}
440
+
441
/**
 * Escape text for safe inclusion in HTML element content or attribute values.
 *
 * Replaces `&`, `<`, `>`, `"` and `'` with their entities in a single pass.
 * `null`/`undefined` yield `''`; other non-string values are stringified first.
 *
 * @param {*} [text=''] Value to escape.
 * @returns {string} The escaped string.
 */
function escapeHtml(text = '') {
  const entities = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;'
  };
  return String(text ?? '').replace(/[&<>"']/g, (ch) => entities[ch]);
}
449
+
450
/**
 * Wrap sanitized article HTML in a complete, print-oriented HTML document.
 *
 * The title is HTML-escaped and used for both the document <title> and the
 * leading <h1>. `sanitizedHtml` is inserted verbatim, so it must already be
 * sanitized. A <base> pointing at GEEKTIME_BASE_URL lets relative URLs in the
 * article resolve against the site.
 *
 * @param {string} title Article title (plain text).
 * @param {string} sanitizedHtml Already-sanitized article body HTML.
 * @returns {string} Full HTML document source.
 */
function buildPrintableHtml(title, sanitizedHtml) {
  const safeTitle = escapeHtml(title);

  const baseCss = `
    body {
      font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
      font-size: 16px;
      line-height: 1.8;
      color: #1f2329;
      margin: 0;
      padding: 40px;
      background: #fff;
    }

    .article-print-wrapper {
      max-width: 900px;
      margin: 0 auto;
    }

    .article-print-wrapper h1 {
      font-size: 32px;
      line-height: 1.4;
      margin-bottom: 24px;
    }

    a {
      color: #0f5ef2;
      text-decoration: none;
    }

    pre {
      background: #f7f7f7;
      padding: 16px;
      border-radius: 6px;
      overflow: auto;
    }
  `;

  return `<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<title>${safeTitle}</title>
<base href="${GEEKTIME_BASE_URL}">
<style>${baseCss}${PRINT_FIX_CSS}</style>
</head>
<body>
<div class="article-print-wrapper">
<h1>${safeTitle}</h1>
${sanitizedHtml}
</div>
</body>
</html>`;
}
502
+
260
503
  // 获取专栏所有文章列表(通过API)
261
504
  function getValueByPath(obj, path) {
262
505
  if (!obj || !path) return undefined;
@@ -702,128 +945,83 @@ async function downloadWithConcurrency(context, articles, outputDir, concurrency
702
945
  // 下载单篇文章为 PDF(静默模式,不显示单独的spinner)
703
946
  async function downloadArticleSilent(page, article, outputDir, index, total) {
704
947
  try {
705
- // 访问文章页面
706
- await page.goto(article.url, { waitUntil: 'networkidle' });
707
- await page.waitForTimeout(2000);
708
-
709
- // 注入打印修复样式
710
- await page.addStyleTag({ content: PRINT_FIX_CSS });
711
-
712
- // 激进的布局重构:提取正文并重建页面结构
713
- await page.evaluate((titleText) => {
714
- // 1. 找到文章正文内容
715
- const articleContent = document.querySelector('.Index_articleContent_QBG5G, .article-content, article, [class*="articleContent"]');
716
-
717
- if (articleContent) {
718
- // 2. 克隆正文内容
719
- const contentClone = articleContent.cloneNode(true);
720
-
721
- // 3. 清空body的所有内容
722
- document.body.innerHTML = '';
723
-
724
- // 4. 重置body样式为全宽
725
- document.body.style.margin = '0';
726
- document.body.style.padding = '0';
727
- document.body.style.width = '100%';
728
- document.body.style.maxWidth = 'none';
729
- document.body.style.boxSizing = 'border-box';
730
-
731
- // 5. 创建一个简单的容器
732
- const wrapper = document.createElement('div');
733
- wrapper.style.width = '100%';
734
- wrapper.style.maxWidth = '100%';
735
- wrapper.style.margin = '0';
736
- wrapper.style.padding = '0';
737
- wrapper.style.boxSizing = 'border-box';
738
-
739
- // 6. 创建标题元素(使用传入的标题文本)
740
- if (titleText) {
741
- const titleElement = document.createElement('h1');
742
- titleElement.textContent = titleText;
743
- // 设置标题样式
744
- titleElement.style.fontSize = '32px';
745
- titleElement.style.fontWeight = 'bold';
746
- titleElement.style.marginBottom = '30px';
747
- titleElement.style.marginTop = '0';
748
- titleElement.style.lineHeight = '1.4';
749
- titleElement.style.color = '#000';
750
- wrapper.appendChild(titleElement);
751
- }
752
-
753
- // 7. 将正文插入容器
754
- wrapper.appendChild(contentClone);
755
-
756
- // 8. 将容器插入body
757
- document.body.appendChild(wrapper);
758
-
759
- // 9. 确保正文内容使用全宽且不溢出
760
- contentClone.style.width = '100%';
761
- contentClone.style.maxWidth = '100%';
762
- contentClone.style.margin = '0';
763
- contentClone.style.padding = '0';
764
- contentClone.style.boxSizing = 'border-box';
765
- contentClone.style.overflowWrap = 'break-word';
766
- contentClone.style.wordBreak = 'break-word';
767
- } else {
768
- // 如果找不到正文,使用原有的删除方法
769
- const selectors = [
770
- 'aside',
771
- '[class*="leftSide"]',
772
- '[class*="LeftSide"]',
773
- '[class*="sidebar"]',
774
- '[class*="Sidebar"]',
775
- '[class*="side_"]',
776
- '[class*="catalog"]',
777
- '[class*="directory"]',
778
- '[class*="toc"]',
779
- '[class*="outline"]',
780
- '[class*="Outline"]',
781
- 'nav',
782
- '[class*="nav"]',
783
- '[class*="Nav"]',
784
- '[class*="rightSide"]',
785
- '[class*="RightSide"]',
786
- '[class*="comment"]',
787
- '[class*="recommend"]',
788
- '[class*="footer"]',
789
- '[class*="bottom"]'
790
- ];
791
-
792
- selectors.forEach(selector => {
793
- try {
794
- const elements = document.querySelectorAll(selector);
795
- elements.forEach(el => el.remove());
796
- } catch (e) {
797
- // 忽略无效选择器
798
- }
799
- });
800
- }
948
+ if (process.env.DEBUG) {
949
+ console.log(chalk.gray(`[silent] 准备处理文章 ${article.id} - ${article.originalTitle || article.title}`));
950
+ }
951
+ const articleData = await fetchArticleData(page.context(), article.id);
952
+ if (process.env.DEBUG) {
953
+ console.log(chalk.gray(`[silent] 已获取文章数据 ${article.id}`));
954
+ }
955
+ const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
956
+ const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
957
+ if (process.env.DEBUG) {
958
+ console.log(chalk.gray(`[silent] 已完成内容清洗 ${article.id}`));
959
+ }
960
+ const printableHtml = buildPrintableHtml(article.originalTitle || article.title, sanitizedHtml);
801
961
 
802
- // 额外:删除所有包含"大纲"的元素
803
- const allElements = document.querySelectorAll('*');
804
- allElements.forEach(el => {
805
- const text = el.textContent || el.innerText || '';
806
- if (text.trim() === '大纲' ||
807
- (text.length < 200 && text.includes('大纲') && el.children.length <= 10)) {
808
- el.remove();
962
+ await page.setContent(printableHtml, { waitUntil: 'domcontentloaded' });
963
+ if (process.env.DEBUG) {
964
+ console.log(chalk.gray(`[silent] 已设置页面内容 ${article.id}`));
965
+ }
966
+ if (process.env.DEBUG) {
967
+ console.log(chalk.gray(`[silent] 等待图片初步加载 ${article.id}`));
968
+ }
969
+ try {
970
+ await page.waitForFunction(() => {
971
+ const imgs = Array.from(document.images || []);
972
+ if (imgs.length === 0) {
973
+ return true;
809
974
  }
810
- });
811
- }, article.originalTitle || article.title);
812
-
813
- // 等待文章内容加载
814
- await page.waitForSelector('.Index_articleContent_QBG5G, .content');
975
+ return imgs.every(img => img.complete);
976
+ }, { timeout: 30000 });
977
+ } catch (waitError) {
978
+ if (process.env.DEBUG) {
979
+ console.log(chalk.gray(`[silent] 图片初步加载等待超时 ${article.id}: ${waitError?.message || waitError}`));
980
+ }
981
+ }
982
+ try {
983
+ await page.waitForLoadState('networkidle', { timeout: 5000 });
984
+ if (process.env.DEBUG) {
985
+ console.log(chalk.gray(`[silent] networkidle 完成 ${article.id}`));
986
+ }
987
+ } catch {
988
+ // 忽略由于没有额外资源导致的延时
989
+ if (process.env.DEBUG) {
990
+ console.log(chalk.gray(`[silent] networkidle 超时(已忽略) ${article.id}`));
991
+ }
992
+ }
815
993
 
816
994
  // 优化图片大小:将大图片转换为合适的尺寸,减小PDF体积
995
+ if (process.env.DEBUG) {
996
+ console.log(chalk.gray(`[silent] 开始处理图片 ${article.id}`));
997
+ }
817
998
  await page.evaluate(() => {
818
999
  const images = document.querySelectorAll('img');
819
1000
  const promises = Array.from(images).map(img => {
820
1001
  return new Promise((resolve) => {
1002
+ let resolved = false;
1003
+ const safeResolve = () => {
1004
+ if (!resolved) {
1005
+ resolved = true;
1006
+ resolve();
1007
+ }
1008
+ };
1009
+ const attachTimeout = () => setTimeout(safeResolve, 15000);
1010
+ let fallbackTimer = null;
1011
+
821
1012
  // 如果图片还未加载完成,等待加载
822
1013
  if (!img.complete) {
823
- img.onload = () => processImage(img, resolve);
824
- img.onerror = () => resolve(); // 图片加载失败,跳过
1014
+ fallbackTimer = attachTimeout();
1015
+ img.onload = () => {
1016
+ if (fallbackTimer) clearTimeout(fallbackTimer);
1017
+ processImage(img, safeResolve);
1018
+ };
1019
+ img.onerror = () => {
1020
+ if (fallbackTimer) clearTimeout(fallbackTimer);
1021
+ safeResolve(); // 图片加载失败,跳过
1022
+ };
825
1023
  } else {
826
- processImage(img, resolve);
1024
+ processImage(img, safeResolve);
827
1025
  }
828
1026
  });
829
1027
  });
@@ -851,12 +1049,21 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
851
1049
  ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
852
1050
 
853
1051
  // 转换为压缩后的data URL
1052
+ let hasResolved = false;
1053
+ const finalize = () => {
1054
+ if (!hasResolved) {
1055
+ hasResolved = true;
1056
+ resolve();
1057
+ }
1058
+ };
854
1059
  canvas.toBlob((blob) => {
855
- const url = URL.createObjectURL(blob);
856
- img.src = url;
1060
+ if (blob) {
1061
+ const url = URL.createObjectURL(blob);
1062
+ img.src = url;
1063
+ }
857
1064
  img.style.width = maxWidth + 'px';
858
1065
  img.style.height = 'auto';
859
- resolve();
1066
+ finalize();
860
1067
  }, 'image/jpeg', quality);
861
1068
  } catch (e) {
862
1069
  // 如果压缩失败,至少限制大小
@@ -868,9 +1075,15 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
868
1075
 
869
1076
  return Promise.all(promises);
870
1077
  });
1078
+ if (process.env.DEBUG) {
1079
+ console.log(chalk.gray(`[silent] 图片处理完成 ${article.id}`));
1080
+ }
871
1081
 
872
1082
  // 等待图片处理完成
873
- await page.waitForTimeout(1000);
1083
+ await page.waitForTimeout(30000);
1084
+ if (process.env.DEBUG) {
1085
+ console.log(chalk.gray(`[silent] 已准备生成PDF ${article.id}`));
1086
+ }
874
1087
 
875
1088
  // 生成 PDF
876
1089
  const filename = `${String(index).padStart(3, '0')}_${article.title}.pdf`;
@@ -888,10 +1101,16 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
888
1101
  printBackground: false, // 关闭背景打印,显著减小文件大小
889
1102
  preferCSSPageSize: false
890
1103
  });
1104
+ if (process.env.DEBUG) {
1105
+ console.log(chalk.gray(`[silent] PDF生成完成 ${article.id}`));
1106
+ }
891
1107
 
892
1108
  return { success: true, title: article.title };
893
1109
 
894
1110
  } catch (error) {
1111
+ if (process.env.DEBUG) {
1112
+ console.log(chalk.red(`[silent] 文章 ${article.id} 失败: ${error.message}`));
1113
+ }
895
1114
  return { success: false, title: article.title, error: error.message };
896
1115
  }
897
1116
  }
@@ -901,116 +1120,17 @@ async function downloadArticle(page, article, outputDir, index, total) {
901
1120
  const spinner = ora(`[${index}/${total}] 正在下载: ${article.title}`).start();
902
1121
 
903
1122
  try {
904
- // 访问文章页面
905
- await page.goto(article.url, { waitUntil: 'networkidle' });
906
- await page.waitForTimeout(2000);
907
-
908
- // 注入打印修复样式
909
- await page.addStyleTag({ content: PRINT_FIX_CSS });
910
-
911
- // 激进的布局重构:提取正文并重建页面结构
912
- await page.evaluate((titleText) => {
913
- // 1. 找到文章正文内容
914
- const articleContent = document.querySelector('.Index_articleContent_QBG5G, .article-content, article, [class*="articleContent"]');
915
-
916
- if (articleContent) {
917
- // 2. 克隆正文内容
918
- const contentClone = articleContent.cloneNode(true);
919
-
920
- // 3. 清空body的所有内容
921
- document.body.innerHTML = '';
922
-
923
- // 4. 重置body样式为全宽
924
- document.body.style.margin = '0';
925
- document.body.style.padding = '0';
926
- document.body.style.width = '100%';
927
- document.body.style.maxWidth = 'none';
928
- document.body.style.boxSizing = 'border-box';
929
-
930
- // 5. 创建一个简单的容器
931
- const wrapper = document.createElement('div');
932
- wrapper.style.width = '100%';
933
- wrapper.style.maxWidth = '100%';
934
- wrapper.style.margin = '0';
935
- wrapper.style.padding = '0';
936
- wrapper.style.boxSizing = 'border-box';
937
-
938
- // 6. 创建标题元素(使用传入的标题文本)
939
- if (titleText) {
940
- const titleElement = document.createElement('h1');
941
- titleElement.textContent = titleText;
942
- // 设置标题样式
943
- titleElement.style.fontSize = '32px';
944
- titleElement.style.fontWeight = 'bold';
945
- titleElement.style.marginBottom = '30px';
946
- titleElement.style.marginTop = '0';
947
- titleElement.style.lineHeight = '1.4';
948
- titleElement.style.color = '#000';
949
- wrapper.appendChild(titleElement);
950
- }
951
-
952
- // 7. 将正文插入容器
953
- wrapper.appendChild(contentClone);
954
-
955
- // 8. 将容器插入body
956
- document.body.appendChild(wrapper);
957
-
958
- // 9. 确保正文内容使用全宽且不溢出
959
- contentClone.style.width = '100%';
960
- contentClone.style.maxWidth = '100%';
961
- contentClone.style.margin = '0';
962
- contentClone.style.padding = '0';
963
- contentClone.style.boxSizing = 'border-box';
964
- contentClone.style.overflowWrap = 'break-word';
965
- contentClone.style.wordBreak = 'break-word';
966
- } else {
967
- // 如果找不到正文,使用原有的删除方法
968
- const selectors = [
969
- 'aside',
970
- '[class*="leftSide"]',
971
- '[class*="LeftSide"]',
972
- '[class*="sidebar"]',
973
- '[class*="Sidebar"]',
974
- '[class*="side_"]',
975
- '[class*="catalog"]',
976
- '[class*="directory"]',
977
- '[class*="toc"]',
978
- '[class*="outline"]',
979
- '[class*="Outline"]',
980
- 'nav',
981
- '[class*="nav"]',
982
- '[class*="Nav"]',
983
- '[class*="rightSide"]',
984
- '[class*="RightSide"]',
985
- '[class*="comment"]',
986
- '[class*="recommend"]',
987
- '[class*="footer"]',
988
- '[class*="bottom"]'
989
- ];
990
-
991
- selectors.forEach(selector => {
992
- try {
993
- const elements = document.querySelectorAll(selector);
994
- elements.forEach(el => el.remove());
995
- } catch (e) {
996
- // 忽略无效选择器
997
- }
998
- });
999
- }
1000
-
1001
- // 额外:删除所有包含"大纲"的元素
1002
- const allElements = document.querySelectorAll('*');
1003
- allElements.forEach(el => {
1004
- const text = el.textContent || el.innerText || '';
1005
- if (text.trim() === '大纲' ||
1006
- (text.length < 200 && text.includes('大纲') && el.children.length <= 10)) {
1007
- el.remove();
1008
- }
1009
- });
1010
- }, article.originalTitle || article.title);
1123
+ const articleData = await fetchArticleData(page.context(), article.id);
1124
+ const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
1125
+ const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
1126
+ const printableHtml = buildPrintableHtml(article.originalTitle || article.title, sanitizedHtml);
1011
1127
 
1012
- // 等待文章内容加载
1013
- await page.waitForSelector('.Index_articleContent_QBG5G, .content');
1128
+ await page.setContent(printableHtml, { waitUntil: 'domcontentloaded' });
1129
+ try {
1130
+ await page.waitForLoadState('networkidle', { timeout: 5000 });
1131
+ } catch {
1132
+ // 没有额外资源加载时忽略
1133
+ }
1014
1134
 
1015
1135
  // 优化图片大小:将大图片转换为合适的尺寸,减小PDF体积
1016
1136
  await page.evaluate(() => {
@@ -1209,460 +1329,32 @@ async function mergePDFs(outputDir, columnTitle, articles, deleteAfterMerge = fa
1209
1329
  // 提取单篇文章的 HTML 内容(用于 EPUB 生成)
1210
1330
  async function extractArticleContent(page, article, index, total) {
1211
1331
  try {
1212
- // 访问文章页面
1213
- await page.goto(article.url, { waitUntil: 'networkidle' });
1214
-
1215
- // 等待文章内容加载
1216
- await page.waitForSelector('.Index_articleContent_QBG5G, .content', { timeout: 60000 });
1217
-
1218
- // 关键:等待文章完整内容加载,而不是试看内容
1219
- // 滚动页面以触发懒加载内容
1220
- await page.evaluate(async () => {
1221
- await new Promise((resolve) => {
1222
- let totalHeight = 0;
1223
- const distance = 100;
1224
- const timer = setInterval(() => {
1225
- const scrollHeight = document.body.scrollHeight;
1226
- window.scrollBy(0, distance);
1227
- totalHeight += distance;
1228
-
1229
- if (totalHeight >= scrollHeight) {
1230
- clearInterval(timer);
1231
- resolve();
1232
- }
1233
- }, 100);
1234
- });
1235
- });
1236
-
1237
- // 再等待一段时间,确保内容完全加载
1238
- await page.waitForTimeout(3000);
1239
-
1240
- // 提取文章 HTML 内容
1241
- const content = await page.evaluate(() => {
1242
- // 找到文章正文内容
1243
- const articleContent = document.querySelector('.Index_articleContent_QBG5G, .article-content, article, [class*="articleContent"]');
1244
-
1245
- if (!articleContent) {
1246
- return null;
1247
- }
1248
-
1249
- // 克隆正文以避免修改原始DOM
1250
- const contentClone = articleContent.cloneNode(true);
1251
-
1252
- // 白名单策略:只保留正文核心元素
1253
- // 允许的元素标签
1254
- const allowedTags = new Set([
1255
- 'P', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', // 段落和标题
1256
- 'UL', 'OL', 'LI', // 列表
1257
- 'BLOCKQUOTE', // 引用
1258
- 'PRE', 'CODE', // 代码
1259
- 'IMG', // 图片
1260
- 'TABLE', 'THEAD', 'TBODY', 'TR', 'TH', 'TD', // 表格
1261
- 'A', // 链接
1262
- 'STRONG', 'B', 'EM', 'I', 'U', // 强调和样式
1263
- 'BR', 'HR', // 换行和分隔线
1264
- 'FIGURE', 'FIGCAPTION', 'DETAILS', 'SUMMARY',
1265
- 'SPAN', 'DIV', 'SECTION', 'ARTICLE' // 容器(可能包含文本)
1266
- ]);
1267
-
1268
- // 在清理前,移除常见的非正文区域
1269
- const removalSelectors = [
1270
- 'nav', 'header', 'footer', 'aside',
1271
- '.comment', '.comments', '.Index_comment',
1272
- '.recommend', '.recommendation', '.related', '.advertisement', '.ad', '.banner',
1273
- '.subscribe', '.subscription', '.toolbar', '.Index_shareIcons_1vtJa',
1274
- '.keyboard-wrapper', '.app-download', '.article-actions', '.article-bottom',
1275
- '.note', '.notes', '.annotation', '.translation', '.trans', '.translator',
1276
- '.audio', '.audio-player', '.voice', '.player', '.geek-player', '.podcast', '.radio',
1277
- '.reward', '.appreciate', '.appreciation', '.donate', '.sponsor', '.thanks', '.support',
1278
- '.qrcode', '.qr-code', '.qr', '.promotion', '.promo', '.ad-banner',
1279
- '.copyright', '.statement', '.disclaimer',
1280
- '.app-download-banner', '.article-plugin', '.article-notification', '.float-bar',
1281
- 'audio', 'video',
1282
- '[class*="Note"]', '[class*="note"]', '[class*="Translation"]', '[class*="translation"]',
1283
- '[class*="Audio"]', '[class*="audio"]', '[class*="Reward"]', '[class*="reward"]',
1284
- '[data-plugin]', '[data-track]', '[data-track-section]', '[data-translation]', '[data-audio]',
1285
- '[data-role="toolbar"]',
1286
- 'button', 'iframe', 'script', 'style'
1287
- ];
1288
- removalSelectors.forEach(selector => {
1289
- contentClone.querySelectorAll(selector).forEach(el => el.remove());
1290
- });
1291
-
1292
- // 根据关键词进一步移除插件类元素
1293
- const pluginKeywords = [
1294
- 'note', 'translation', 'audio', 'player', 'reward', 'donate',
1295
- 'appreciation', 'sponsor', 'qrcode', 'toolbar', 'plugin',
1296
- 'copyright', 'geeknote', 'bilingual'
1297
- ];
1298
- const pluginElements = Array.from(contentClone.querySelectorAll('*')).filter(el => {
1299
- const className = (el.className || '').toString().toLowerCase();
1300
- const idValue = (el.id || '').toString().toLowerCase();
1301
- const roleValue = (el.getAttribute && el.getAttribute('role')) ? el.getAttribute('role').toLowerCase() : '';
1302
- const datasetValues = el.dataset ? Object.values(el.dataset).join(' ').toLowerCase() : '';
1303
- const combined = `${className} ${idValue} ${roleValue} ${datasetValues}`;
1304
- return pluginKeywords.some(keyword => combined.includes(keyword));
1305
- });
1306
- pluginElements.forEach(el => el.remove());
1307
-
1308
- // 移除 MindMap 等 SVG/Canvas 思维导图内容(阅读器无法正确渲染)
1309
- const mindmapSelectors = [
1310
- '.mindmap', '.mind-map', '.MindMap', '.Mind-map',
1311
- '[data-type="mindmap"]', '[data-role="mindmap"]', '[data-widget="mindmap"]',
1312
- '[class*="MindMap"]', '[class*="mindMap"]'
1313
- ];
1314
- mindmapSelectors.forEach(selector => {
1315
- contentClone.querySelectorAll(selector).forEach(el => el.remove());
1316
- });
1317
- const vectorCandidates = Array.from(contentClone.querySelectorAll('svg, canvas, object, embed'));
1318
- vectorCandidates.forEach(el => {
1319
- const className = typeof el.className === 'object' ? el.className.baseVal : (el.className || '');
1320
- const meta = `${className} ${el.id || ''} ${el.getAttribute('data-type') || ''}`.toLowerCase();
1321
- if (meta.includes('mind') || meta.includes('mindmap') || meta.includes('mind-map')) {
1322
- el.remove();
1323
- }
1324
- });
1325
-
1326
- // 将富文本中的代码块结构转换为标准 <pre><code>
1327
- const blockSeparatorTags = new Set([
1328
- 'P','DIV','SECTION','ARTICLE','UL','OL','LI','FIGURE','FIGCAPTION',
1329
- 'TABLE','THEAD','TBODY','TR','TD'
1330
- ]);
1331
-
1332
- function collectCodeText(node) {
1333
- const parts = [];
1334
-
1335
- const ensureNewline = () => {
1336
- if (!parts.length) {
1337
- parts.push('\n');
1338
- return;
1339
- }
1340
- if (!parts[parts.length - 1].endsWith('\n')) {
1341
- parts.push('\n');
1342
- }
1343
- };
1344
-
1345
- const traverse = (current) => {
1346
- if (!current) {
1347
- return;
1348
- }
1349
- if (current.nodeType === Node.TEXT_NODE) {
1350
- const textValue = current.textContent.replace(/\u00A0/g, ' ');
1351
- if (textValue) {
1352
- parts.push(textValue);
1353
- }
1354
- return;
1355
- }
1356
- if (current.nodeType !== Node.ELEMENT_NODE) {
1357
- return;
1358
- }
1359
- const tag = current.tagName.toUpperCase();
1360
- if (tag === 'BR') {
1361
- ensureNewline();
1362
- return;
1363
- }
1364
- Array.from(current.childNodes).forEach(traverse);
1365
- if (blockSeparatorTags.has(tag)) {
1366
- ensureNewline();
1367
- }
1368
- };
1369
-
1370
- traverse(node);
1371
- let text = parts.join('');
1372
- text = text
1373
- .replace(/\r\n/g, '\n')
1374
- .replace(/\n{3,}/g, '\n\n')
1375
- .replace(/[ \t]+\n/g, '\n')
1376
- .replace(/\n+$/g, '\n');
1377
- return text.trim() ? text : '';
1378
- }
1379
-
1380
- const codeLikeSelectors = [
1381
- '[data-slate-type="code"]',
1382
- '[data-slate-node="code"]',
1383
- '[data-code-block]',
1384
- '[data-code]',
1385
- '[data-code-language]',
1386
- '[class*="code-block"]',
1387
- '[class*="CodeBlock"]'
1388
- ];
1389
- const codeCandidates = new Set();
1390
- codeLikeSelectors.forEach(selector => {
1391
- contentClone.querySelectorAll(selector).forEach(el => codeCandidates.add(el));
1392
- });
1393
- const replaceWithPre = (element) => {
1394
- if (!element || !element.parentNode) {
1395
- return;
1396
- }
1397
- const codeText = collectCodeText(element);
1398
- if (!codeText) {
1399
- element.remove();
1400
- return;
1401
- }
1402
- const pre = document.createElement('pre');
1403
- const code = document.createElement('code');
1404
- code.textContent = codeText;
1405
- pre.appendChild(code);
1406
- element.parentNode.replaceChild(pre, element);
1407
- };
1408
- codeCandidates.forEach(el => {
1409
- if (el.tagName && el.tagName.toUpperCase() === 'PRE') {
1410
- return;
1411
- }
1412
- replaceWithPre(el);
1413
- });
1414
-
1415
- const multilineInlineCodes = Array.from(contentClone.querySelectorAll('code')).filter(codeEl => {
1416
- const parent = codeEl.parentElement;
1417
- return parent && parent.tagName.toUpperCase() !== 'PRE' && codeEl.textContent.includes('\n');
1418
- });
1419
- multilineInlineCodes.forEach(codeEl => {
1420
- const codeText = collectCodeText(codeEl);
1421
- if (!codeText) {
1422
- codeEl.remove();
1423
- return;
1424
- }
1425
- const pre = document.createElement('pre');
1426
- const innerCode = document.createElement('code');
1427
- innerCode.textContent = codeText;
1428
- pre.appendChild(innerCode);
1429
- codeEl.parentNode.replaceChild(pre, codeEl);
1430
- });
1431
-
1432
- // 递归清理函数:移除不在白名单中的元素
1433
- function cleanElement(element) {
1434
- const children = Array.from(element.childNodes);
1435
-
1436
- for (const child of children) {
1437
- if (child.nodeType === Node.ELEMENT_NODE) {
1438
- const tagName = child.tagName.toUpperCase();
1439
-
1440
- if (!allowedTags.has(tagName)) {
1441
- // 先递归处理子节点
1442
- cleanElement(child);
1443
-
1444
- if (child.childNodes.length > 0) {
1445
- while (child.firstChild) {
1446
- element.insertBefore(child.firstChild, child);
1447
- }
1448
- child.remove();
1449
- } else {
1450
- const textContent = (child.textContent || '').trim();
1451
- if (textContent) {
1452
- const textNode = document.createTextNode(textContent + ' ');
1453
- element.insertBefore(textNode, child);
1454
- }
1455
- child.remove();
1456
- }
1457
- } else {
1458
- cleanElement(child);
1459
- }
1460
- }
1461
- }
1462
- }
1463
-
1464
- cleanElement(contentClone);
1465
-
1466
- // 移除所有style属性,避免样式冲突
1467
- const allElements = contentClone.querySelectorAll('*');
1468
- allElements.forEach(el => {
1469
- el.removeAttribute('style');
1470
- el.removeAttribute('class');
1471
- el.removeAttribute('id');
1472
- el.removeAttribute('onclick');
1473
- el.removeAttribute('onload');
1474
- });
1475
-
1476
- // 处理图片URL
1477
- const images = contentClone.querySelectorAll('img');
1478
- const adKeywordLower = ['ad', 'advert', 'banner', 'qrcode', 'qr-code', 'reward', 'donate', 'appdownload', 'app-download', 'sponsor', 'thanks'];
1479
- const adKeywordCn = ['广告', '二维码', '赞赏', '打赏', '版权', '推广'];
1480
- images.forEach(img => {
1481
- let src = img.getAttribute('src');
1482
- const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') || img.getAttribute('data-lazy-src');
1483
-
1484
- if (dataSrc && (dataSrc.startsWith('http://') || dataSrc.startsWith('https://'))) {
1485
- src = dataSrc;
1486
- img.setAttribute('src', src);
1487
- }
1488
-
1489
- if (!src || src.startsWith('blob:') || src.startsWith('data:')) {
1490
- img.remove();
1491
- return;
1492
- }
1493
-
1494
- if (!src.startsWith('http://') && !src.startsWith('https://')) {
1495
- try {
1496
- const absoluteUrl = new URL(src, window.location.href).href;
1497
- img.setAttribute('src', absoluteUrl);
1498
- src = absoluteUrl;
1499
- } catch (e) {
1500
- img.remove();
1501
- }
1502
- }
1503
-
1504
- const altText = img.getAttribute('alt') || '';
1505
- const altLower = altText.toLowerCase();
1506
- const srcLower = (src || '').toLowerCase();
1507
- if (
1508
- adKeywordLower.some(keyword => srcLower.includes(keyword)) ||
1509
- adKeywordLower.some(keyword => altLower.includes(keyword)) ||
1510
- adKeywordCn.some(keyword => altText.includes(keyword))
1511
- ) {
1512
- img.remove();
1513
- return;
1514
- }
1515
-
1516
- // 清理图片属性
1517
- const imgAttrs = img.attributes;
1518
- for (let i = imgAttrs.length - 1; i >= 0; i--) {
1519
- const attrName = imgAttrs[i].name;
1520
- if (attrName !== 'src' && attrName !== 'alt') {
1521
- img.removeAttribute(attrName);
1522
- }
1523
- }
1524
- });
1525
-
1526
- // 清理空的div和span
1527
- const containers = contentClone.querySelectorAll('div, span');
1528
- containers.forEach(container => {
1529
- if (!container.textContent.trim() && !container.querySelector('img, pre, code, table')) {
1530
- container.remove();
1531
- }
1532
- });
1533
-
1534
- // 将只包含纯文本的 div 转换为段落,避免没有段间距
1535
- const blockLikeTags = new Set(['P','UL','OL','LI','TABLE','PRE','BLOCKQUOTE','H1','H2','H3','H4','H5','H6','IMG','SECTION','ARTICLE','FIGURE','FIGCAPTION','DETAILS','SUMMARY']);
1536
- const textContainers = Array.from(contentClone.querySelectorAll('div, section, article')).reverse();
1537
- textContainers.forEach(container => {
1538
- if (container === contentClone) {
1539
- return;
1540
- }
1541
-
1542
- if (!container.textContent.trim()) {
1543
- return;
1544
- }
1545
-
1546
- if (container.querySelector('img, pre, table, ul, ol, blockquote, h1, h2, h3, h4, h5, h6, figure')) {
1547
- return;
1548
- }
1332
+ const articleData = await fetchArticleData(page.context(), article.id);
1333
+ const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
1334
+ const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
1549
1335
 
1550
- const hasBlockChildren = Array.from(container.children).some(child => blockLikeTags.has(child.tagName?.toUpperCase()));
1551
- if (hasBlockChildren) {
1552
- return;
1553
- }
1554
-
1555
- const paragraph = document.createElement('p');
1556
- paragraph.innerHTML = container.innerHTML;
1557
- container.parentNode.replaceChild(paragraph, container);
1558
- });
1559
-
1560
- // 包装直接挂在容器下的文本或行内节点,避免散乱文本没有段落间距
1561
- const inlineTags = new Set(['A','SPAN','STRONG','B','EM','I','U','CODE','SMALL','SUB','SUP','MARK']);
1562
-
1563
- function wrapInlineChildren(element) {
1564
- const tagName = element.tagName ? element.tagName.toUpperCase() : '';
1565
- if (['P','LI','PRE','CODE','TABLE','THEAD','TBODY','TR'].includes(tagName)) {
1566
- return;
1567
- }
1568
-
1569
- const childNodes = Array.from(element.childNodes);
1570
- let buffer = [];
1571
-
1572
- const flushBuffer = (referenceNode) => {
1573
- if (!buffer.length) {
1574
- return;
1575
- }
1576
- const paragraph = document.createElement('p');
1577
- buffer.forEach(node => paragraph.appendChild(node));
1578
- element.insertBefore(paragraph, referenceNode);
1579
- buffer = [];
1580
- };
1581
-
1582
- for (const node of childNodes) {
1583
- if (node.nodeType === Node.TEXT_NODE) {
1584
- if (node.textContent.trim()) {
1585
- buffer.push(node);
1586
- } else {
1587
- element.removeChild(node);
1588
- }
1589
- continue;
1590
- }
1591
-
1592
- if (node.nodeType === Node.ELEMENT_NODE) {
1593
- const childTag = node.tagName.toUpperCase();
1594
- if (inlineTags.has(childTag) || childTag === 'BR') {
1595
- buffer.push(node);
1596
- continue;
1597
- }
1598
-
1599
- flushBuffer(node);
1600
- wrapInlineChildren(node);
1601
- continue;
1602
- }
1603
-
1604
- flushBuffer(node);
1605
- }
1606
-
1607
- flushBuffer(null);
1608
- }
1609
-
1610
- wrapInlineChildren(contentClone);
1611
-
1612
- // 移除尾部的版权/广告声明
1613
- const footerKeywords = ['版权', '未经许可', '未经授权', '不得转载', '未经允许', 'All Rights Reserved', '最终解释权', '转载'];
1614
- const trailingElements = Array.from(contentClone.querySelectorAll('p, div, section')).slice(-6);
1615
- trailingElements.forEach(el => {
1616
- const text = (el.textContent || '').trim();
1617
- if (!text) {
1618
- return;
1619
- }
1620
- if (text.length <= 200 && footerKeywords.some(keyword => text.includes(keyword))) {
1621
- el.remove();
1622
- }
1623
- });
1624
-
1625
- // 处理代码块
1626
- const codeBlocks = contentClone.querySelectorAll('pre');
1627
- codeBlocks.forEach(block => {
1628
- const codeText = collectCodeText(block);
1629
- if (!codeText) {
1630
- block.remove();
1631
- return;
1632
- }
1633
- let codeInside = block.querySelector('code');
1634
- if (!codeInside) {
1635
- codeInside = document.createElement('code');
1636
- block.appendChild(codeInside);
1637
- }
1638
- codeInside.textContent = codeText;
1639
- });
1640
-
1641
- return contentClone.innerHTML;
1642
- });
1336
+ if (!sanitizedHtml) {
1337
+ throw new Error('未能提取到文章内容');
1338
+ }
1643
1339
 
1644
1340
  return {
1645
1341
  success: true,
1646
1342
  title: article.originalTitle || article.title,
1647
- content: content || `<p>内容提取失败</p>`
1343
+ content: sanitizedHtml
1648
1344
  };
1649
1345
 
1650
1346
  } catch (error) {
1651
- // 判断是否可能是 Cookie 失效
1652
- let errorMessage = error.message;
1653
- if (error.message.includes('Timeout') || error.message.includes('timeout')) {
1654
- errorMessage = 'Cookie 可能已失效或页面加载超时';
1655
- }
1656
-
1347
+ console.error(`[${index}/${total}] 提取文章内容失败: ${article.originalTitle || article.title}`, error);
1657
1348
  return {
1658
1349
  success: false,
1659
1350
  title: article.originalTitle || article.title,
1660
- content: `<p>下载失败: ${errorMessage}</p>`,
1661
- error: errorMessage
1351
+ error: error.message,
1352
+ content: ''
1662
1353
  };
1663
1354
  }
1664
1355
  }
1665
1356
 
1357
+
1666
1358
  // 并发提取文章内容(用于 EPUB)
1667
1359
  async function extractWithConcurrency(context, articles, concurrency = 5, delay = 2000, timeout = 60000) {
1668
1360
  const results = [];
@@ -1769,7 +1461,7 @@ async function generateEPUB(outputDir, columnTitle, columnAuthor, articles, cont
1769
1461
  return null;
1770
1462
  }
1771
1463
 
1772
- const options = {
1464
+ const options = {
1773
1465
  title: columnTitle,
1774
1466
  author: columnAuthor || '极客时间',
1775
1467
  publisher: '极客时间',
@@ -2029,13 +1721,46 @@ async function main(options) {
2029
1721
  globalBrowser = browser;
2030
1722
 
2031
1723
  const context = await browser.newContext({
2032
- userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
1724
+ userAgent: DEFAULT_USER_AGENT
2033
1725
  });
2034
1726
 
1727
+ // 兼容用户直接复制整行"Cookie: xxx"
1728
+ let normalizedCookie = cookie.trim();
1729
+ if (/^cookie:/i.test(normalizedCookie)) {
1730
+ normalizedCookie = normalizedCookie.replace(/^cookie:\s*/i, '');
1731
+ }
1732
+ globalCookieHeader = normalizedCookie;
1733
+
2035
1734
  // 设置 cookies
2036
- const cookies = parseCookies(cookie);
1735
+ const cookies = parseCookies(normalizedCookie);
2037
1736
  await context.addCookies(cookies);
2038
1737
 
1738
+ // 确保所有极客时间域名的请求都携带原始Cookie串,避免Playwright丢失关键字段
1739
+ await context.route('**/*', (route) => {
1740
+ const request = route.request();
1741
+ let url;
1742
+ try {
1743
+ url = new URL(request.url());
1744
+ } catch {
1745
+ return route.continue();
1746
+ }
1747
+
1748
+ const hostname = url.hostname || '';
1749
+ const isGeekbangDomain =
1750
+ hostname === 'geekbang.org' ||
1751
+ hostname.endsWith('.geekbang.org');
1752
+
1753
+ if (!isGeekbangDomain) {
1754
+ return route.continue();
1755
+ }
1756
+
1757
+ const headers = {
1758
+ ...request.headers(),
1759
+ cookie: normalizedCookie
1760
+ };
1761
+ route.continue({ headers });
1762
+ });
1763
+
2039
1764
  const page = await context.newPage();
2040
1765
 
2041
1766
  try {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@kadaliao/geektime-downloader",
3
- "version": "1.1.2",
3
+ "version": "1.1.4",
4
4
  "description": "极客时间专栏文章批量下载工具 - 支持一键下载整个专栏为PDF或EPUB",
5
5
  "type": "module",
6
6
  "main": "download.js",