@kadaliao/geektime-downloader 1.1.2 → 1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/download.js +391 -666
- package/package.json +1 -1
- package/kadaliao-geektime-downloader-1.1.1.tgz +0 -0
package/download.js
CHANGED
|
@@ -19,6 +19,7 @@ const require = createRequire(import.meta.url);
|
|
|
19
19
|
const { version } = require('./package.json');
|
|
20
20
|
|
|
21
21
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
22
|
+
// Raw Cookie header value; fetchArticleData attaches it to article API
// requests only when it is non-empty. Starts empty here and is presumably
// assigned elsewhere in the file (assignment not visible in this diff).
let globalCookieHeader = '';
|
|
22
23
|
|
|
23
24
|
// 全局变量:跟踪当前浏览器实例和是否正在关闭
|
|
24
25
|
let globalBrowser = null;
|
|
@@ -244,6 +245,10 @@ const PRINT_FIX_CSS = `
|
|
|
244
245
|
}
|
|
245
246
|
`;
|
|
246
247
|
|
|
248
|
+
// Site origin: used to build the article API URL and the per-article referer,
// sent as the `origin` header, and set as <base href> in the printable page.
const GEEKTIME_BASE_URL = 'https://time.geekbang.org';
|
|
249
|
+
// Endpoint that fetchArticleData POSTs to in order to retrieve one article's data.
const ARTICLE_API_URL = `${GEEKTIME_BASE_URL}/serv/v1/article`;
|
|
250
|
+
// Desktop Chrome 120 (macOS) user-agent string sent with article API requests.
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
|
|
251
|
+
|
|
247
252
|
// 解析 cookie 字符串
|
|
248
253
|
function parseCookies(cookieString) {
|
|
249
254
|
return cookieString.split(';').map(cookie => {
|
|
@@ -257,6 +262,244 @@ function parseCookies(cookieString) {
|
|
|
257
262
|
});
|
|
258
263
|
}
|
|
259
264
|
|
|
265
|
+
/**
 * Normalize article HTML returned by the API before sanitizing it.
 *
 * - Strips the `<!-- [[[read_end]]] -->` marker comment.
 * - Rewrites protocol-relative `src` / `href` values (`//host/...`) to
 *   absolute `https://host/...` URLs.
 *
 * @param {string} [html=''] Raw article HTML.
 * @returns {string} Normalized HTML, or '' when the input is empty/falsy.
 */
function normalizeArticleHtml(html = '') {
  if (!html) return '';
  return html
    .replace(/<!--\s*\[\[\[read_end\]\]\]\s*-->/gi, '')
    // One pass over both attributes and both quote styles. Whitespace around
    // `=` is tolerated because `src = "//x"` is valid HTML. The match is not
    // anchored on a word boundary, so `data-src="//..."` is rewritten as well.
    .replace(/(src|href)(\s*=\s*)(["'])\/\//gi, '$1$2$3https://');
}
|
|
274
|
+
|
|
275
|
+
/**
 * Fetch one article's data from the article API, retrying on any failure.
 *
 * Makes up to three attempts, pausing `attempt * 700` ms between them.
 * An attempt fails when the HTTP status is not OK, the body is not valid
 * JSON, the payload's `code` is not 0, `data` is missing, or
 * `data.article_content` is empty.
 *
 * @param {object} context Object exposing `request.post(url, options)`;
 *   used here like a Playwright BrowserContext (assumption - confirm at call sites).
 * @param {string|number} articleId Article identifier; sent as a string.
 * @returns {Promise<object>} The `data` object of the API response.
 * @throws {Error} The error from the last failed attempt.
 */
async function fetchArticleData(context, articleId) {
  const maxAttempts = 3;
  const refererUrl = `${GEEKTIME_BASE_URL}/column/article/${articleId}`;
  let lastError = null;

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      const response = await context.request.post(ARTICLE_API_URL, {
        headers: {
          'user-agent': DEFAULT_USER_AGENT,
          'content-type': 'application/json',
          'accept': 'application/json, text/plain, */*',
          'origin': GEEKTIME_BASE_URL,
          'referer': refererUrl,
          'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
          // Only send a cookie header when one has been configured.
          ...(globalCookieHeader ? { 'cookie': globalCookieHeader } : {}),
        },
        data: {
          id: String(articleId),
          include_neighbors: true,
          is_freelyread: true,
        },
      });

      try {
        const bodyText = await response.text();

        if (!response.ok()) {
          throw new Error(`API请求失败: ${response.status()} ${response.statusText()} - ${bodyText.slice(0, 160)}`);
        }

        let json;
        try {
          json = JSON.parse(bodyText);
        } catch (parseError) {
          throw new Error(
            `API响应解析失败: ${parseError.message} - ${bodyText.slice(0, 160)}`,
            { cause: parseError }
          );
        }

        if (!json || json.code !== 0 || !json.data) {
          throw new Error(`无法获取完整文章内容: ${bodyText.slice(0, 160)}`);
        }

        if (!json.data.article_content) {
          throw new Error('文章内容为空,可能需要更新 Cookie 或重新获取权限');
        }

        return json.data;
      } finally {
        // Release the response body. Without this it stays in memory until
        // the context closes, which adds up when downloading a whole column.
        await response.dispose();
      }
    } catch (error) {
      lastError = error;
      if (attempt < maxAttempts) {
        // Linear back-off: 700 ms, then 1400 ms.
        await new Promise((resolve) => setTimeout(resolve, attempt * 700));
      }
    }
  }

  throw lastError || new Error('未知错误导致文章内容获取失败');
}
|
|
331
|
+
|
|
332
|
+
/**
 * Strip non-article chrome and unsafe markup from article HTML.
 *
 * The work runs inside the browser page via `page.evaluate`, so the callback
 * must stay self-contained: it cannot reference anything from this module.
 * Steps, in order:
 *   1. remove known non-content elements by selector;
 *   2. remove elements whose class/id/role/data-* values contain a plugin keyword;
 *   3. remove mind-map widgets;
 *   4. unwrap every tag outside the allow-list (children are kept) and drop
 *      every attribute outside the attribute allow-list;
 *   5. force images to load eagerly and fit the page width.
 *
 * @param {object} page Object exposing `evaluate(fn, arg)` that runs `fn`
 *   where a DOM `document` is available (used like a Playwright Page).
 * @param {string} rawHtml Article HTML to clean.
 * @returns {Promise<string>} The sanitized HTML.
 */
async function sanitizeArticleHtml(page, rawHtml) {
  return page.evaluate((html) => {
    // Parse into a detached <template> fragment and clean it there.
    const template = document.createElement('template');
    template.innerHTML = html;
    const root = template.content;

    const removalSelectors = [
      'nav', 'header', 'footer', 'aside',
      '.comment', '.comments', '.Index_comment',
      '.recommend', '.recommendation', '.related', '.advertisement', '.ad', '.banner',
      '.subscribe', '.subscription', '.toolbar', '.Index_shareIcons_1vtJa',
      '.keyboard-wrapper', '.app-download', '.article-actions', '.article-bottom',
      '.note', '.notes', '.annotation', '.translation', '.trans', '.translator',
      '.audio', '.audio-player', '.voice', '.player', '.geek-player', '.podcast', '.radio',
      '.reward', '.appreciate', '.appreciation', '.donate', '.sponsor', '.thanks', '.support',
      '.qrcode', '.qr-code', '.qr', '.promotion', '.promo', '.ad-banner',
      '.copyright', '.statement', '.disclaimer',
      '.app-download-banner', '.article-plugin', '.article-notification', '.float-bar',
      'audio', 'video',
      '[class*="Note"]', '[class*="note"]', '[class*="Translation"]', '[class*="translation"]',
      '[class*="Audio"]', '[class*="audio"]', '[class*="Reward"]', '[class*="reward"]',
      '[data-plugin]', '[data-track]', '[data-track-section]', '[data-translation]', '[data-audio]',
      '[data-role="toolbar"]',
      'button', 'iframe', 'script', 'style',
    ];
    root.querySelectorAll(removalSelectors.join(', ')).forEach((el) => el.remove());

    const pluginKeywords = [
      'note', 'translation', 'audio', 'player', 'reward', 'donate',
      'appreciation', 'sponsor', 'qrcode', 'toolbar', 'plugin',
      'copyright', 'geeknote', 'bilingual',
    ];
    Array.from(root.querySelectorAll('*'))
      .filter((el) => {
        // Read the class attribute rather than `className`: on SVG elements
        // `className` is an SVGAnimatedString object, not a string.
        const classValue = el.getAttribute('class') || '';
        const roleValue = el.getAttribute('role') || '';
        const datasetValues = el.dataset ? Object.values(el.dataset).join(' ') : '';
        const combined = `${classValue} ${el.id || ''} ${roleValue} ${datasetValues}`.toLowerCase();
        return pluginKeywords.some((keyword) => combined.includes(keyword));
      })
      .forEach((el) => el.remove());

    const mindmapSelectors = [
      '.mindmap', '.mind-map', '.MindMap', '.Mind-map',
      '[data-type="mindmap"]', '[data-role="mindmap"]', '[data-widget="mindmap"]',
      '[class*="MindMap"]', '[class*="mindMap"]',
    ];
    root.querySelectorAll(mindmapSelectors.join(', ')).forEach((el) => el.remove());

    root.querySelectorAll('svg, canvas, object, embed').forEach((el) => {
      const meta = `${el.getAttribute('class') || ''} ${el.id || ''} ${el.getAttribute('data-type') || ''}`.toLowerCase();
      // 'mind' also covers 'mindmap' and 'mind-map'.
      if (meta.includes('mind')) {
        el.remove();
      }
    });

    const allowedTags = new Set([
      'P', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6',
      'UL', 'OL', 'LI',
      'BLOCKQUOTE', 'PRE', 'CODE',
      'IMG', 'TABLE', 'THEAD', 'TBODY', 'TR', 'TH', 'TD', 'FIGURE', 'FIGCAPTION',
      'STRONG', 'EM', 'B', 'I', 'SPAN', 'DIV', 'BR', 'HR',
      'A', 'SUP', 'SUB',
    ]);

    // SECTION and ARTICLE are not in allowedTags, so they are unwrapped before
    // this set is consulted; only DIV and FIGURE actually get display:block.
    const blockDisplayTags = new Set(['DIV', 'SECTION', 'ARTICLE', 'FIGURE']);
    const allowedAttributes = new Set(['href', 'src', 'alt', 'title', 'class', 'style', 'target', 'rel']);

    function sanitizeNode(node) {
      // Work queue instead of a one-off snapshot: children hoisted out of an
      // unwrapped element must still be checked, otherwise their disallowed
      // attributes and nested disallowed tags would slip through.
      const pending = Array.from(node.children || []);
      while (pending.length > 0) {
        const child = pending.shift();

        if (!allowedTags.has(child.tagName)) {
          const hoisted = Array.from(child.children);
          child.replaceWith(...child.childNodes);
          pending.unshift(...hoisted); // keep document order
          continue;
        }

        if (blockDisplayTags.has(child.tagName)) {
          child.style.display = 'block';
        }

        for (const attr of Array.from(child.attributes)) {
          if (!allowedAttributes.has(attr.name.toLowerCase())) {
            child.removeAttribute(attr.name);
          }
        }

        sanitizeNode(child);
      }
    }

    sanitizeNode(root);

    // Applied after the attribute pass on purpose: 'loading' and 'decoding'
    // are not in the attribute allow-list.
    root.querySelectorAll('img').forEach((img) => {
      img.setAttribute('loading', 'eager');
      img.setAttribute('decoding', 'sync');
      img.style.maxWidth = '100%';
      img.style.height = 'auto';
    });

    return template.innerHTML;
  }, rawHtml);
}
|
|
440
|
+
|
|
441
|
+
/**
 * Escape text for safe insertion into HTML element content or a quoted
 * attribute value.
 *
 * Replaces `&`, `<`, `>`, `"` and `'` with their HTML entities in a single
 * pass, so an `&` produced by one replacement is never escaped again.
 *
 * @param {*} [text=''] Value to escape; null/undefined become '', anything
 *   else is converted with String().
 * @returns {string} The escaped text.
 */
function escapeHtml(text = '') {
  const entities = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  return String(text ?? '').replace(/[&<>"']/g, (ch) => entities[ch]);
}
|
|
449
|
+
|
|
450
|
+
/**
 * Assemble the standalone HTML document that is loaded into the page and
 * printed to PDF.
 *
 * The title is HTML-escaped. `sanitizedHtml` is inserted verbatim, so it must
 * already have been cleaned (see sanitizeArticleHtml). A <base> element
 * pointing at the site origin lets relative URLs in the article resolve.
 *
 * @param {string} title Article title, rendered as the page's <h1>.
 * @param {string} sanitizedHtml Pre-sanitized article body markup.
 * @returns {string} Complete HTML document source.
 */
function buildPrintableHtml(title, sanitizedHtml) {
  const baseCss = `
    body {
      font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
      font-size: 16px;
      line-height: 1.8;
      color: #1f2329;
      margin: 0;
      padding: 40px;
      background: #fff;
    }

    .article-print-wrapper {
      max-width: 900px;
      margin: 0 auto;
    }

    .article-print-wrapper h1 {
      font-size: 32px;
      line-height: 1.4;
      margin-bottom: 24px;
    }

    a {
      color: #0f5ef2;
      text-decoration: none;
    }

    pre {
      background: #f7f7f7;
      padding: 16px;
      border-radius: 6px;
      overflow: auto;
    }
  `;

  return `
<!DOCTYPE html>
<html lang="zh-CN">
<head>
  <meta charset="utf-8">
  <base href="${GEEKTIME_BASE_URL}">
  <style>${baseCss}${PRINT_FIX_CSS}</style>
</head>
<body>
  <div class="article-print-wrapper">
    <h1>${escapeHtml(title)}</h1>
    ${sanitizedHtml}
  </div>
</body>
</html>`;
}
|
|
502
|
+
|
|
260
503
|
// 获取专栏所有文章列表(通过API)
|
|
261
504
|
function getValueByPath(obj, path) {
|
|
262
505
|
if (!obj || !path) return undefined;
|
|
@@ -702,128 +945,83 @@ async function downloadWithConcurrency(context, articles, outputDir, concurrency
|
|
|
702
945
|
// 下载单篇文章为 PDF(静默模式,不显示单独的spinner)
|
|
703
946
|
async function downloadArticleSilent(page, article, outputDir, index, total) {
|
|
704
947
|
try {
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
await page
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
// 2. 克隆正文内容
|
|
719
|
-
const contentClone = articleContent.cloneNode(true);
|
|
720
|
-
|
|
721
|
-
// 3. 清空body的所有内容
|
|
722
|
-
document.body.innerHTML = '';
|
|
723
|
-
|
|
724
|
-
// 4. 重置body样式为全宽
|
|
725
|
-
document.body.style.margin = '0';
|
|
726
|
-
document.body.style.padding = '0';
|
|
727
|
-
document.body.style.width = '100%';
|
|
728
|
-
document.body.style.maxWidth = 'none';
|
|
729
|
-
document.body.style.boxSizing = 'border-box';
|
|
730
|
-
|
|
731
|
-
// 5. 创建一个简单的容器
|
|
732
|
-
const wrapper = document.createElement('div');
|
|
733
|
-
wrapper.style.width = '100%';
|
|
734
|
-
wrapper.style.maxWidth = '100%';
|
|
735
|
-
wrapper.style.margin = '0';
|
|
736
|
-
wrapper.style.padding = '0';
|
|
737
|
-
wrapper.style.boxSizing = 'border-box';
|
|
738
|
-
|
|
739
|
-
// 6. 创建标题元素(使用传入的标题文本)
|
|
740
|
-
if (titleText) {
|
|
741
|
-
const titleElement = document.createElement('h1');
|
|
742
|
-
titleElement.textContent = titleText;
|
|
743
|
-
// 设置标题样式
|
|
744
|
-
titleElement.style.fontSize = '32px';
|
|
745
|
-
titleElement.style.fontWeight = 'bold';
|
|
746
|
-
titleElement.style.marginBottom = '30px';
|
|
747
|
-
titleElement.style.marginTop = '0';
|
|
748
|
-
titleElement.style.lineHeight = '1.4';
|
|
749
|
-
titleElement.style.color = '#000';
|
|
750
|
-
wrapper.appendChild(titleElement);
|
|
751
|
-
}
|
|
752
|
-
|
|
753
|
-
// 7. 将正文插入容器
|
|
754
|
-
wrapper.appendChild(contentClone);
|
|
755
|
-
|
|
756
|
-
// 8. 将容器插入body
|
|
757
|
-
document.body.appendChild(wrapper);
|
|
758
|
-
|
|
759
|
-
// 9. 确保正文内容使用全宽且不溢出
|
|
760
|
-
contentClone.style.width = '100%';
|
|
761
|
-
contentClone.style.maxWidth = '100%';
|
|
762
|
-
contentClone.style.margin = '0';
|
|
763
|
-
contentClone.style.padding = '0';
|
|
764
|
-
contentClone.style.boxSizing = 'border-box';
|
|
765
|
-
contentClone.style.overflowWrap = 'break-word';
|
|
766
|
-
contentClone.style.wordBreak = 'break-word';
|
|
767
|
-
} else {
|
|
768
|
-
// 如果找不到正文,使用原有的删除方法
|
|
769
|
-
const selectors = [
|
|
770
|
-
'aside',
|
|
771
|
-
'[class*="leftSide"]',
|
|
772
|
-
'[class*="LeftSide"]',
|
|
773
|
-
'[class*="sidebar"]',
|
|
774
|
-
'[class*="Sidebar"]',
|
|
775
|
-
'[class*="side_"]',
|
|
776
|
-
'[class*="catalog"]',
|
|
777
|
-
'[class*="directory"]',
|
|
778
|
-
'[class*="toc"]',
|
|
779
|
-
'[class*="outline"]',
|
|
780
|
-
'[class*="Outline"]',
|
|
781
|
-
'nav',
|
|
782
|
-
'[class*="nav"]',
|
|
783
|
-
'[class*="Nav"]',
|
|
784
|
-
'[class*="rightSide"]',
|
|
785
|
-
'[class*="RightSide"]',
|
|
786
|
-
'[class*="comment"]',
|
|
787
|
-
'[class*="recommend"]',
|
|
788
|
-
'[class*="footer"]',
|
|
789
|
-
'[class*="bottom"]'
|
|
790
|
-
];
|
|
791
|
-
|
|
792
|
-
selectors.forEach(selector => {
|
|
793
|
-
try {
|
|
794
|
-
const elements = document.querySelectorAll(selector);
|
|
795
|
-
elements.forEach(el => el.remove());
|
|
796
|
-
} catch (e) {
|
|
797
|
-
// 忽略无效选择器
|
|
798
|
-
}
|
|
799
|
-
});
|
|
800
|
-
}
|
|
948
|
+
if (process.env.DEBUG) {
|
|
949
|
+
console.log(chalk.gray(`[silent] 准备处理文章 ${article.id} - ${article.originalTitle || article.title}`));
|
|
950
|
+
}
|
|
951
|
+
const articleData = await fetchArticleData(page.context(), article.id);
|
|
952
|
+
if (process.env.DEBUG) {
|
|
953
|
+
console.log(chalk.gray(`[silent] 已获取文章数据 ${article.id}`));
|
|
954
|
+
}
|
|
955
|
+
const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
|
|
956
|
+
const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
|
|
957
|
+
if (process.env.DEBUG) {
|
|
958
|
+
console.log(chalk.gray(`[silent] 已完成内容清洗 ${article.id}`));
|
|
959
|
+
}
|
|
960
|
+
const printableHtml = buildPrintableHtml(article.originalTitle || article.title, sanitizedHtml);
|
|
801
961
|
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
962
|
+
await page.setContent(printableHtml, { waitUntil: 'domcontentloaded' });
|
|
963
|
+
if (process.env.DEBUG) {
|
|
964
|
+
console.log(chalk.gray(`[silent] 已设置页面内容 ${article.id}`));
|
|
965
|
+
}
|
|
966
|
+
if (process.env.DEBUG) {
|
|
967
|
+
console.log(chalk.gray(`[silent] 等待图片初步加载 ${article.id}`));
|
|
968
|
+
}
|
|
969
|
+
try {
|
|
970
|
+
await page.waitForFunction(() => {
|
|
971
|
+
const imgs = Array.from(document.images || []);
|
|
972
|
+
if (imgs.length === 0) {
|
|
973
|
+
return true;
|
|
809
974
|
}
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
975
|
+
return imgs.every(img => img.complete);
|
|
976
|
+
}, { timeout: 30000 });
|
|
977
|
+
} catch (waitError) {
|
|
978
|
+
if (process.env.DEBUG) {
|
|
979
|
+
console.log(chalk.gray(`[silent] 图片初步加载等待超时 ${article.id}: ${waitError?.message || waitError}`));
|
|
980
|
+
}
|
|
981
|
+
}
|
|
982
|
+
try {
|
|
983
|
+
await page.waitForLoadState('networkidle', { timeout: 5000 });
|
|
984
|
+
if (process.env.DEBUG) {
|
|
985
|
+
console.log(chalk.gray(`[silent] networkidle 完成 ${article.id}`));
|
|
986
|
+
}
|
|
987
|
+
} catch {
|
|
988
|
+
// 忽略由于没有额外资源导致的延时
|
|
989
|
+
if (process.env.DEBUG) {
|
|
990
|
+
console.log(chalk.gray(`[silent] networkidle 超时(已忽略) ${article.id}`));
|
|
991
|
+
}
|
|
992
|
+
}
|
|
815
993
|
|
|
816
994
|
// 优化图片大小:将大图片转换为合适的尺寸,减小PDF体积
|
|
995
|
+
if (process.env.DEBUG) {
|
|
996
|
+
console.log(chalk.gray(`[silent] 开始处理图片 ${article.id}`));
|
|
997
|
+
}
|
|
817
998
|
await page.evaluate(() => {
|
|
818
999
|
const images = document.querySelectorAll('img');
|
|
819
1000
|
const promises = Array.from(images).map(img => {
|
|
820
1001
|
return new Promise((resolve) => {
|
|
1002
|
+
let resolved = false;
|
|
1003
|
+
const safeResolve = () => {
|
|
1004
|
+
if (!resolved) {
|
|
1005
|
+
resolved = true;
|
|
1006
|
+
resolve();
|
|
1007
|
+
}
|
|
1008
|
+
};
|
|
1009
|
+
const attachTimeout = () => setTimeout(safeResolve, 15000);
|
|
1010
|
+
let fallbackTimer = null;
|
|
1011
|
+
|
|
821
1012
|
// 如果图片还未加载完成,等待加载
|
|
822
1013
|
if (!img.complete) {
|
|
823
|
-
|
|
824
|
-
img.
|
|
1014
|
+
fallbackTimer = attachTimeout();
|
|
1015
|
+
img.onload = () => {
|
|
1016
|
+
if (fallbackTimer) clearTimeout(fallbackTimer);
|
|
1017
|
+
processImage(img, safeResolve);
|
|
1018
|
+
};
|
|
1019
|
+
img.onerror = () => {
|
|
1020
|
+
if (fallbackTimer) clearTimeout(fallbackTimer);
|
|
1021
|
+
safeResolve(); // 图片加载失败,跳过
|
|
1022
|
+
};
|
|
825
1023
|
} else {
|
|
826
|
-
processImage(img,
|
|
1024
|
+
processImage(img, safeResolve);
|
|
827
1025
|
}
|
|
828
1026
|
});
|
|
829
1027
|
});
|
|
@@ -851,12 +1049,21 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
|
|
|
851
1049
|
ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
|
|
852
1050
|
|
|
853
1051
|
// 转换为压缩后的data URL
|
|
1052
|
+
let hasResolved = false;
|
|
1053
|
+
const finalize = () => {
|
|
1054
|
+
if (!hasResolved) {
|
|
1055
|
+
hasResolved = true;
|
|
1056
|
+
resolve();
|
|
1057
|
+
}
|
|
1058
|
+
};
|
|
854
1059
|
canvas.toBlob((blob) => {
|
|
855
|
-
|
|
856
|
-
|
|
1060
|
+
if (blob) {
|
|
1061
|
+
const url = URL.createObjectURL(blob);
|
|
1062
|
+
img.src = url;
|
|
1063
|
+
}
|
|
857
1064
|
img.style.width = maxWidth + 'px';
|
|
858
1065
|
img.style.height = 'auto';
|
|
859
|
-
|
|
1066
|
+
finalize();
|
|
860
1067
|
}, 'image/jpeg', quality);
|
|
861
1068
|
} catch (e) {
|
|
862
1069
|
// 如果压缩失败,至少限制大小
|
|
@@ -868,9 +1075,15 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
|
|
|
868
1075
|
|
|
869
1076
|
return Promise.all(promises);
|
|
870
1077
|
});
|
|
1078
|
+
if (process.env.DEBUG) {
|
|
1079
|
+
console.log(chalk.gray(`[silent] 图片处理完成 ${article.id}`));
|
|
1080
|
+
}
|
|
871
1081
|
|
|
872
1082
|
// 等待图片处理完成
|
|
873
|
-
await page.waitForTimeout(
|
|
1083
|
+
await page.waitForTimeout(30000);
|
|
1084
|
+
if (process.env.DEBUG) {
|
|
1085
|
+
console.log(chalk.gray(`[silent] 已准备生成PDF ${article.id}`));
|
|
1086
|
+
}
|
|
874
1087
|
|
|
875
1088
|
// 生成 PDF
|
|
876
1089
|
const filename = `${String(index).padStart(3, '0')}_${article.title}.pdf`;
|
|
@@ -888,10 +1101,16 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
|
|
|
888
1101
|
printBackground: false, // 关闭背景打印,显著减小文件大小
|
|
889
1102
|
preferCSSPageSize: false
|
|
890
1103
|
});
|
|
1104
|
+
if (process.env.DEBUG) {
|
|
1105
|
+
console.log(chalk.gray(`[silent] PDF生成完成 ${article.id}`));
|
|
1106
|
+
}
|
|
891
1107
|
|
|
892
1108
|
return { success: true, title: article.title };
|
|
893
1109
|
|
|
894
1110
|
} catch (error) {
|
|
1111
|
+
if (process.env.DEBUG) {
|
|
1112
|
+
console.log(chalk.red(`[silent] 文章 ${article.id} 失败: ${error.message}`));
|
|
1113
|
+
}
|
|
895
1114
|
return { success: false, title: article.title, error: error.message };
|
|
896
1115
|
}
|
|
897
1116
|
}
|
|
@@ -901,116 +1120,17 @@ async function downloadArticle(page, article, outputDir, index, total) {
|
|
|
901
1120
|
const spinner = ora(`[${index}/${total}] 正在下载: ${article.title}`).start();
|
|
902
1121
|
|
|
903
1122
|
try {
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
await page
|
|
907
|
-
|
|
908
|
-
// 注入打印修复样式
|
|
909
|
-
await page.addStyleTag({ content: PRINT_FIX_CSS });
|
|
910
|
-
|
|
911
|
-
// 激进的布局重构:提取正文并重建页面结构
|
|
912
|
-
await page.evaluate((titleText) => {
|
|
913
|
-
// 1. 找到文章正文内容
|
|
914
|
-
const articleContent = document.querySelector('.Index_articleContent_QBG5G, .article-content, article, [class*="articleContent"]');
|
|
915
|
-
|
|
916
|
-
if (articleContent) {
|
|
917
|
-
// 2. 克隆正文内容
|
|
918
|
-
const contentClone = articleContent.cloneNode(true);
|
|
919
|
-
|
|
920
|
-
// 3. 清空body的所有内容
|
|
921
|
-
document.body.innerHTML = '';
|
|
922
|
-
|
|
923
|
-
// 4. 重置body样式为全宽
|
|
924
|
-
document.body.style.margin = '0';
|
|
925
|
-
document.body.style.padding = '0';
|
|
926
|
-
document.body.style.width = '100%';
|
|
927
|
-
document.body.style.maxWidth = 'none';
|
|
928
|
-
document.body.style.boxSizing = 'border-box';
|
|
929
|
-
|
|
930
|
-
// 5. 创建一个简单的容器
|
|
931
|
-
const wrapper = document.createElement('div');
|
|
932
|
-
wrapper.style.width = '100%';
|
|
933
|
-
wrapper.style.maxWidth = '100%';
|
|
934
|
-
wrapper.style.margin = '0';
|
|
935
|
-
wrapper.style.padding = '0';
|
|
936
|
-
wrapper.style.boxSizing = 'border-box';
|
|
937
|
-
|
|
938
|
-
// 6. 创建标题元素(使用传入的标题文本)
|
|
939
|
-
if (titleText) {
|
|
940
|
-
const titleElement = document.createElement('h1');
|
|
941
|
-
titleElement.textContent = titleText;
|
|
942
|
-
// 设置标题样式
|
|
943
|
-
titleElement.style.fontSize = '32px';
|
|
944
|
-
titleElement.style.fontWeight = 'bold';
|
|
945
|
-
titleElement.style.marginBottom = '30px';
|
|
946
|
-
titleElement.style.marginTop = '0';
|
|
947
|
-
titleElement.style.lineHeight = '1.4';
|
|
948
|
-
titleElement.style.color = '#000';
|
|
949
|
-
wrapper.appendChild(titleElement);
|
|
950
|
-
}
|
|
951
|
-
|
|
952
|
-
// 7. 将正文插入容器
|
|
953
|
-
wrapper.appendChild(contentClone);
|
|
954
|
-
|
|
955
|
-
// 8. 将容器插入body
|
|
956
|
-
document.body.appendChild(wrapper);
|
|
957
|
-
|
|
958
|
-
// 9. 确保正文内容使用全宽且不溢出
|
|
959
|
-
contentClone.style.width = '100%';
|
|
960
|
-
contentClone.style.maxWidth = '100%';
|
|
961
|
-
contentClone.style.margin = '0';
|
|
962
|
-
contentClone.style.padding = '0';
|
|
963
|
-
contentClone.style.boxSizing = 'border-box';
|
|
964
|
-
contentClone.style.overflowWrap = 'break-word';
|
|
965
|
-
contentClone.style.wordBreak = 'break-word';
|
|
966
|
-
} else {
|
|
967
|
-
// 如果找不到正文,使用原有的删除方法
|
|
968
|
-
const selectors = [
|
|
969
|
-
'aside',
|
|
970
|
-
'[class*="leftSide"]',
|
|
971
|
-
'[class*="LeftSide"]',
|
|
972
|
-
'[class*="sidebar"]',
|
|
973
|
-
'[class*="Sidebar"]',
|
|
974
|
-
'[class*="side_"]',
|
|
975
|
-
'[class*="catalog"]',
|
|
976
|
-
'[class*="directory"]',
|
|
977
|
-
'[class*="toc"]',
|
|
978
|
-
'[class*="outline"]',
|
|
979
|
-
'[class*="Outline"]',
|
|
980
|
-
'nav',
|
|
981
|
-
'[class*="nav"]',
|
|
982
|
-
'[class*="Nav"]',
|
|
983
|
-
'[class*="rightSide"]',
|
|
984
|
-
'[class*="RightSide"]',
|
|
985
|
-
'[class*="comment"]',
|
|
986
|
-
'[class*="recommend"]',
|
|
987
|
-
'[class*="footer"]',
|
|
988
|
-
'[class*="bottom"]'
|
|
989
|
-
];
|
|
990
|
-
|
|
991
|
-
selectors.forEach(selector => {
|
|
992
|
-
try {
|
|
993
|
-
const elements = document.querySelectorAll(selector);
|
|
994
|
-
elements.forEach(el => el.remove());
|
|
995
|
-
} catch (e) {
|
|
996
|
-
// 忽略无效选择器
|
|
997
|
-
}
|
|
998
|
-
});
|
|
999
|
-
}
|
|
1000
|
-
|
|
1001
|
-
// 额外:删除所有包含"大纲"的元素
|
|
1002
|
-
const allElements = document.querySelectorAll('*');
|
|
1003
|
-
allElements.forEach(el => {
|
|
1004
|
-
const text = el.textContent || el.innerText || '';
|
|
1005
|
-
if (text.trim() === '大纲' ||
|
|
1006
|
-
(text.length < 200 && text.includes('大纲') && el.children.length <= 10)) {
|
|
1007
|
-
el.remove();
|
|
1008
|
-
}
|
|
1009
|
-
});
|
|
1010
|
-
}, article.originalTitle || article.title);
|
|
1123
|
+
const articleData = await fetchArticleData(page.context(), article.id);
|
|
1124
|
+
const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
|
|
1125
|
+
const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
|
|
1126
|
+
const printableHtml = buildPrintableHtml(article.originalTitle || article.title, sanitizedHtml);
|
|
1011
1127
|
|
|
1012
|
-
|
|
1013
|
-
|
|
1128
|
+
await page.setContent(printableHtml, { waitUntil: 'domcontentloaded' });
|
|
1129
|
+
try {
|
|
1130
|
+
await page.waitForLoadState('networkidle', { timeout: 5000 });
|
|
1131
|
+
} catch {
|
|
1132
|
+
// 没有额外资源加载时忽略
|
|
1133
|
+
}
|
|
1014
1134
|
|
|
1015
1135
|
// 优化图片大小:将大图片转换为合适的尺寸,减小PDF体积
|
|
1016
1136
|
await page.evaluate(() => {
|
|
@@ -1209,460 +1329,32 @@ async function mergePDFs(outputDir, columnTitle, articles, deleteAfterMerge = fa
|
|
|
1209
1329
|
// 提取单篇文章的 HTML 内容(用于 EPUB 生成)
|
|
1210
1330
|
async function extractArticleContent(page, article, index, total) {
|
|
1211
1331
|
try {
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
// 等待文章内容加载
|
|
1216
|
-
await page.waitForSelector('.Index_articleContent_QBG5G, .content', { timeout: 60000 });
|
|
1217
|
-
|
|
1218
|
-
// 关键:等待文章完整内容加载,而不是试看内容
|
|
1219
|
-
// 滚动页面以触发懒加载内容
|
|
1220
|
-
await page.evaluate(async () => {
|
|
1221
|
-
await new Promise((resolve) => {
|
|
1222
|
-
let totalHeight = 0;
|
|
1223
|
-
const distance = 100;
|
|
1224
|
-
const timer = setInterval(() => {
|
|
1225
|
-
const scrollHeight = document.body.scrollHeight;
|
|
1226
|
-
window.scrollBy(0, distance);
|
|
1227
|
-
totalHeight += distance;
|
|
1228
|
-
|
|
1229
|
-
if (totalHeight >= scrollHeight) {
|
|
1230
|
-
clearInterval(timer);
|
|
1231
|
-
resolve();
|
|
1232
|
-
}
|
|
1233
|
-
}, 100);
|
|
1234
|
-
});
|
|
1235
|
-
});
|
|
1236
|
-
|
|
1237
|
-
// 再等待一段时间,确保内容完全加载
|
|
1238
|
-
await page.waitForTimeout(3000);
|
|
1239
|
-
|
|
1240
|
-
// 提取文章 HTML 内容
|
|
1241
|
-
const content = await page.evaluate(() => {
|
|
1242
|
-
// 找到文章正文内容
|
|
1243
|
-
const articleContent = document.querySelector('.Index_articleContent_QBG5G, .article-content, article, [class*="articleContent"]');
|
|
1244
|
-
|
|
1245
|
-
if (!articleContent) {
|
|
1246
|
-
return null;
|
|
1247
|
-
}
|
|
1248
|
-
|
|
1249
|
-
// 克隆正文以避免修改原始DOM
|
|
1250
|
-
const contentClone = articleContent.cloneNode(true);
|
|
1251
|
-
|
|
1252
|
-
// 白名单策略:只保留正文核心元素
|
|
1253
|
-
// 允许的元素标签
|
|
1254
|
-
const allowedTags = new Set([
|
|
1255
|
-
'P', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', // 段落和标题
|
|
1256
|
-
'UL', 'OL', 'LI', // 列表
|
|
1257
|
-
'BLOCKQUOTE', // 引用
|
|
1258
|
-
'PRE', 'CODE', // 代码
|
|
1259
|
-
'IMG', // 图片
|
|
1260
|
-
'TABLE', 'THEAD', 'TBODY', 'TR', 'TH', 'TD', // 表格
|
|
1261
|
-
'A', // 链接
|
|
1262
|
-
'STRONG', 'B', 'EM', 'I', 'U', // 强调和样式
|
|
1263
|
-
'BR', 'HR', // 换行和分隔线
|
|
1264
|
-
'FIGURE', 'FIGCAPTION', 'DETAILS', 'SUMMARY',
|
|
1265
|
-
'SPAN', 'DIV', 'SECTION', 'ARTICLE' // 容器(可能包含文本)
|
|
1266
|
-
]);
|
|
1267
|
-
|
|
1268
|
-
// 在清理前,移除常见的非正文区域
|
|
1269
|
-
const removalSelectors = [
|
|
1270
|
-
'nav', 'header', 'footer', 'aside',
|
|
1271
|
-
'.comment', '.comments', '.Index_comment',
|
|
1272
|
-
'.recommend', '.recommendation', '.related', '.advertisement', '.ad', '.banner',
|
|
1273
|
-
'.subscribe', '.subscription', '.toolbar', '.Index_shareIcons_1vtJa',
|
|
1274
|
-
'.keyboard-wrapper', '.app-download', '.article-actions', '.article-bottom',
|
|
1275
|
-
'.note', '.notes', '.annotation', '.translation', '.trans', '.translator',
|
|
1276
|
-
'.audio', '.audio-player', '.voice', '.player', '.geek-player', '.podcast', '.radio',
|
|
1277
|
-
'.reward', '.appreciate', '.appreciation', '.donate', '.sponsor', '.thanks', '.support',
|
|
1278
|
-
'.qrcode', '.qr-code', '.qr', '.promotion', '.promo', '.ad-banner',
|
|
1279
|
-
'.copyright', '.statement', '.disclaimer',
|
|
1280
|
-
'.app-download-banner', '.article-plugin', '.article-notification', '.float-bar',
|
|
1281
|
-
'audio', 'video',
|
|
1282
|
-
'[class*="Note"]', '[class*="note"]', '[class*="Translation"]', '[class*="translation"]',
|
|
1283
|
-
'[class*="Audio"]', '[class*="audio"]', '[class*="Reward"]', '[class*="reward"]',
|
|
1284
|
-
'[data-plugin]', '[data-track]', '[data-track-section]', '[data-translation]', '[data-audio]',
|
|
1285
|
-
'[data-role="toolbar"]',
|
|
1286
|
-
'button', 'iframe', 'script', 'style'
|
|
1287
|
-
];
|
|
1288
|
-
removalSelectors.forEach(selector => {
|
|
1289
|
-
contentClone.querySelectorAll(selector).forEach(el => el.remove());
|
|
1290
|
-
});
|
|
1291
|
-
|
|
1292
|
-
// 根据关键词进一步移除插件类元素
|
|
1293
|
-
const pluginKeywords = [
|
|
1294
|
-
'note', 'translation', 'audio', 'player', 'reward', 'donate',
|
|
1295
|
-
'appreciation', 'sponsor', 'qrcode', 'toolbar', 'plugin',
|
|
1296
|
-
'copyright', 'geeknote', 'bilingual'
|
|
1297
|
-
];
|
|
1298
|
-
const pluginElements = Array.from(contentClone.querySelectorAll('*')).filter(el => {
|
|
1299
|
-
const className = (el.className || '').toString().toLowerCase();
|
|
1300
|
-
const idValue = (el.id || '').toString().toLowerCase();
|
|
1301
|
-
const roleValue = (el.getAttribute && el.getAttribute('role')) ? el.getAttribute('role').toLowerCase() : '';
|
|
1302
|
-
const datasetValues = el.dataset ? Object.values(el.dataset).join(' ').toLowerCase() : '';
|
|
1303
|
-
const combined = `${className} ${idValue} ${roleValue} ${datasetValues}`;
|
|
1304
|
-
return pluginKeywords.some(keyword => combined.includes(keyword));
|
|
1305
|
-
});
|
|
1306
|
-
pluginElements.forEach(el => el.remove());
|
|
1307
|
-
|
|
1308
|
-
// 移除 MindMap 等 SVG/Canvas 思维导图内容(阅读器无法正确渲染)
|
|
1309
|
-
const mindmapSelectors = [
|
|
1310
|
-
'.mindmap', '.mind-map', '.MindMap', '.Mind-map',
|
|
1311
|
-
'[data-type="mindmap"]', '[data-role="mindmap"]', '[data-widget="mindmap"]',
|
|
1312
|
-
'[class*="MindMap"]', '[class*="mindMap"]'
|
|
1313
|
-
];
|
|
1314
|
-
mindmapSelectors.forEach(selector => {
|
|
1315
|
-
contentClone.querySelectorAll(selector).forEach(el => el.remove());
|
|
1316
|
-
});
|
|
1317
|
-
const vectorCandidates = Array.from(contentClone.querySelectorAll('svg, canvas, object, embed'));
|
|
1318
|
-
vectorCandidates.forEach(el => {
|
|
1319
|
-
const className = typeof el.className === 'object' ? el.className.baseVal : (el.className || '');
|
|
1320
|
-
const meta = `${className} ${el.id || ''} ${el.getAttribute('data-type') || ''}`.toLowerCase();
|
|
1321
|
-
if (meta.includes('mind') || meta.includes('mindmap') || meta.includes('mind-map')) {
|
|
1322
|
-
el.remove();
|
|
1323
|
-
}
|
|
1324
|
-
});
|
|
1325
|
-
|
|
1326
|
-
// 将富文本中的代码块结构转换为标准 <pre><code>
|
|
1327
|
-
const blockSeparatorTags = new Set([
|
|
1328
|
-
'P','DIV','SECTION','ARTICLE','UL','OL','LI','FIGURE','FIGCAPTION',
|
|
1329
|
-
'TABLE','THEAD','TBODY','TR','TD'
|
|
1330
|
-
]);
|
|
1331
|
-
|
|
1332
|
-
/**
 * Flatten a DOM subtree into plain text suitable for a <pre><code> block.
 *
 * Text nodes are appended verbatim (non-breaking spaces become regular
 * spaces), <br> elements and the end of any element listed in the outer
 * `blockSeparatorTags` set force a line break, and the result is normalised:
 * CR/CRLF become LF, trailing spaces/tabs on each line are dropped, runs of
 * blank lines are collapsed to a single blank line and the text ends with at
 * most one newline.
 *
 * @param {Node|null|undefined} node - Root of the subtree to read.
 * @returns {string} The normalised text, or '' when it is only whitespace.
 */
function collectCodeText(node) {
  const parts = [];

  // Append a line break unless the collected text already ends with one.
  const ensureNewline = () => {
    const last = parts[parts.length - 1];
    if (last === undefined || !last.endsWith('\n')) {
      parts.push('\n');
    }
  };

  const traverse = (current) => {
    if (!current) {
      return;
    }
    if (current.nodeType === Node.TEXT_NODE) {
      const textValue = current.textContent.replace(/\u00A0/g, ' ');
      if (textValue) {
        parts.push(textValue);
      }
      return;
    }
    if (current.nodeType !== Node.ELEMENT_NODE) {
      return;
    }
    const tag = current.tagName.toUpperCase();
    if (tag === 'BR') {
      ensureNewline();
      return;
    }
    Array.from(current.childNodes).forEach((child) => traverse(child));
    if (blockSeparatorTags.has(tag)) {
      ensureNewline();
    }
  };

  traverse(node);

  // Order matters: trailing whitespace must be stripped BEFORE blank lines are
  // collapsed, otherwise lines holding only spaces/tabs hide runs of newlines
  // from the collapse step and survive as three or more consecutive breaks.
  const text = parts
    .join('')
    .replace(/\r\n?/g, '\n')
    .replace(/[ \t]+\n/g, '\n')
    .replace(/\n{3,}/g, '\n\n')
    .replace(/\n+$/, '\n');

  return text.trim() ? text : '';
}
|
|
1379
|
-
|
|
1380
|
-
const codeLikeSelectors = [
|
|
1381
|
-
'[data-slate-type="code"]',
|
|
1382
|
-
'[data-slate-node="code"]',
|
|
1383
|
-
'[data-code-block]',
|
|
1384
|
-
'[data-code]',
|
|
1385
|
-
'[data-code-language]',
|
|
1386
|
-
'[class*="code-block"]',
|
|
1387
|
-
'[class*="CodeBlock"]'
|
|
1388
|
-
];
|
|
1389
|
-
const codeCandidates = new Set();
|
|
1390
|
-
codeLikeSelectors.forEach(selector => {
|
|
1391
|
-
contentClone.querySelectorAll(selector).forEach(el => codeCandidates.add(el));
|
|
1392
|
-
});
|
|
1393
|
-
// Swap a code-like element for a standard <pre><code> pair holding its text.
// Detached or missing elements are ignored; elements whose collected text is
// empty are removed instead of being replaced.
const replaceWithPre = (element) => {
  const hasParent = Boolean(element && element.parentNode);
  if (!hasParent) {
    return;
  }

  const collected = collectCodeText(element);
  if (collected) {
    const preNode = document.createElement('pre');
    const codeNode = document.createElement('code');
    codeNode.textContent = collected;
    preNode.appendChild(codeNode);
    element.parentNode.replaceChild(preNode, element);
    return;
  }

  element.remove();
};
|
|
1408
|
-
codeCandidates.forEach(el => {
|
|
1409
|
-
if (el.tagName && el.tagName.toUpperCase() === 'PRE') {
|
|
1410
|
-
return;
|
|
1411
|
-
}
|
|
1412
|
-
replaceWithPre(el);
|
|
1413
|
-
});
|
|
1414
|
-
|
|
1415
|
-
const multilineInlineCodes = Array.from(contentClone.querySelectorAll('code')).filter(codeEl => {
|
|
1416
|
-
const parent = codeEl.parentElement;
|
|
1417
|
-
return parent && parent.tagName.toUpperCase() !== 'PRE' && codeEl.textContent.includes('\n');
|
|
1418
|
-
});
|
|
1419
|
-
multilineInlineCodes.forEach(codeEl => {
|
|
1420
|
-
const codeText = collectCodeText(codeEl);
|
|
1421
|
-
if (!codeText) {
|
|
1422
|
-
codeEl.remove();
|
|
1423
|
-
return;
|
|
1424
|
-
}
|
|
1425
|
-
const pre = document.createElement('pre');
|
|
1426
|
-
const innerCode = document.createElement('code');
|
|
1427
|
-
innerCode.textContent = codeText;
|
|
1428
|
-
pre.appendChild(innerCode);
|
|
1429
|
-
codeEl.parentNode.replaceChild(pre, codeEl);
|
|
1430
|
-
});
|
|
1431
|
-
|
|
1432
|
-
// 递归清理函数:移除不在白名单中的元素
|
|
1433
|
-
/**
 * Recursively unwrap every descendant element whose tag is not whitelisted.
 *
 * Descendants are cleaned depth-first. A non-whitelisted element is replaced
 * by its (already cleaned) children, which keep their order and position; a
 * non-whitelisted element without children is simply removed. Text and other
 * non-element nodes are left untouched.
 *
 * Relies on `allowedTags` from the enclosing scope (not visible in this
 * block; used here via `.has()` with upper-case tag names).
 *
 * @param {Element} element - Container whose subtree is cleaned in place.
 * @returns {void}
 */
function cleanElement(element) {
  // Iterate over a snapshot: unwrapping rewrites element.childNodes.
  for (const child of Array.from(element.childNodes)) {
    if (child.nodeType !== Node.ELEMENT_NODE) {
      continue;
    }

    // Clean the subtree first so anything hoisted below is already valid.
    cleanElement(child);

    if (allowedTags.has(child.tagName.toUpperCase())) {
      continue;
    }

    // Hoist the children in front of the wrapper, then drop the wrapper.
    // A childless element has an empty textContent, so there is nothing to
    // preserve in that case and the loop body is skipped.
    while (child.firstChild) {
      element.insertBefore(child.firstChild, child);
    }
    child.remove();
  }
}
|
|
1463
|
-
|
|
1464
|
-
cleanElement(contentClone);
|
|
1465
|
-
|
|
1466
|
-
// 移除所有style属性,避免样式冲突
|
|
1467
|
-
const allElements = contentClone.querySelectorAll('*');
|
|
1468
|
-
allElements.forEach(el => {
|
|
1469
|
-
el.removeAttribute('style');
|
|
1470
|
-
el.removeAttribute('class');
|
|
1471
|
-
el.removeAttribute('id');
|
|
1472
|
-
el.removeAttribute('onclick');
|
|
1473
|
-
el.removeAttribute('onload');
|
|
1474
|
-
});
|
|
1475
|
-
|
|
1476
|
-
// 处理图片URL
|
|
1477
|
-
const images = contentClone.querySelectorAll('img');
|
|
1478
|
-
const adKeywordLower = ['ad', 'advert', 'banner', 'qrcode', 'qr-code', 'reward', 'donate', 'appdownload', 'app-download', 'sponsor', 'thanks'];
|
|
1479
|
-
const adKeywordCn = ['广告', '二维码', '赞赏', '打赏', '版权', '推广'];
|
|
1480
|
-
images.forEach(img => {
|
|
1481
|
-
let src = img.getAttribute('src');
|
|
1482
|
-
const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') || img.getAttribute('data-lazy-src');
|
|
1483
|
-
|
|
1484
|
-
if (dataSrc && (dataSrc.startsWith('http://') || dataSrc.startsWith('https://'))) {
|
|
1485
|
-
src = dataSrc;
|
|
1486
|
-
img.setAttribute('src', src);
|
|
1487
|
-
}
|
|
1488
|
-
|
|
1489
|
-
if (!src || src.startsWith('blob:') || src.startsWith('data:')) {
|
|
1490
|
-
img.remove();
|
|
1491
|
-
return;
|
|
1492
|
-
}
|
|
1493
|
-
|
|
1494
|
-
if (!src.startsWith('http://') && !src.startsWith('https://')) {
|
|
1495
|
-
try {
|
|
1496
|
-
const absoluteUrl = new URL(src, window.location.href).href;
|
|
1497
|
-
img.setAttribute('src', absoluteUrl);
|
|
1498
|
-
src = absoluteUrl;
|
|
1499
|
-
} catch (e) {
|
|
1500
|
-
img.remove();
|
|
1501
|
-
}
|
|
1502
|
-
}
|
|
1503
|
-
|
|
1504
|
-
const altText = img.getAttribute('alt') || '';
|
|
1505
|
-
const altLower = altText.toLowerCase();
|
|
1506
|
-
const srcLower = (src || '').toLowerCase();
|
|
1507
|
-
if (
|
|
1508
|
-
adKeywordLower.some(keyword => srcLower.includes(keyword)) ||
|
|
1509
|
-
adKeywordLower.some(keyword => altLower.includes(keyword)) ||
|
|
1510
|
-
adKeywordCn.some(keyword => altText.includes(keyword))
|
|
1511
|
-
) {
|
|
1512
|
-
img.remove();
|
|
1513
|
-
return;
|
|
1514
|
-
}
|
|
1515
|
-
|
|
1516
|
-
// 清理图片属性
|
|
1517
|
-
const imgAttrs = img.attributes;
|
|
1518
|
-
for (let i = imgAttrs.length - 1; i >= 0; i--) {
|
|
1519
|
-
const attrName = imgAttrs[i].name;
|
|
1520
|
-
if (attrName !== 'src' && attrName !== 'alt') {
|
|
1521
|
-
img.removeAttribute(attrName);
|
|
1522
|
-
}
|
|
1523
|
-
}
|
|
1524
|
-
});
|
|
1525
|
-
|
|
1526
|
-
// 清理空的div和span
|
|
1527
|
-
const containers = contentClone.querySelectorAll('div, span');
|
|
1528
|
-
containers.forEach(container => {
|
|
1529
|
-
if (!container.textContent.trim() && !container.querySelector('img, pre, code, table')) {
|
|
1530
|
-
container.remove();
|
|
1531
|
-
}
|
|
1532
|
-
});
|
|
1533
|
-
|
|
1534
|
-
// 将只包含纯文本的 div 转换为段落,避免没有段间距
|
|
1535
|
-
const blockLikeTags = new Set(['P','UL','OL','LI','TABLE','PRE','BLOCKQUOTE','H1','H2','H3','H4','H5','H6','IMG','SECTION','ARTICLE','FIGURE','FIGCAPTION','DETAILS','SUMMARY']);
|
|
1536
|
-
const textContainers = Array.from(contentClone.querySelectorAll('div, section, article')).reverse();
|
|
1537
|
-
textContainers.forEach(container => {
|
|
1538
|
-
if (container === contentClone) {
|
|
1539
|
-
return;
|
|
1540
|
-
}
|
|
1541
|
-
|
|
1542
|
-
if (!container.textContent.trim()) {
|
|
1543
|
-
return;
|
|
1544
|
-
}
|
|
1545
|
-
|
|
1546
|
-
if (container.querySelector('img, pre, table, ul, ol, blockquote, h1, h2, h3, h4, h5, h6, figure')) {
|
|
1547
|
-
return;
|
|
1548
|
-
}
|
|
1332
|
+
const articleData = await fetchArticleData(page.context(), article.id);
|
|
1333
|
+
const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
|
|
1334
|
+
const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
|
|
1549
1335
|
|
|
1550
|
-
|
|
1551
|
-
|
|
1552
|
-
|
|
1553
|
-
}
|
|
1554
|
-
|
|
1555
|
-
const paragraph = document.createElement('p');
|
|
1556
|
-
paragraph.innerHTML = container.innerHTML;
|
|
1557
|
-
container.parentNode.replaceChild(paragraph, container);
|
|
1558
|
-
});
|
|
1559
|
-
|
|
1560
|
-
// 包装直接挂在容器下的文本或行内节点,避免散乱文本没有段落间距
|
|
1561
|
-
const inlineTags = new Set(['A','SPAN','STRONG','B','EM','I','U','CODE','SMALL','SUB','SUP','MARK']);
|
|
1562
|
-
|
|
1563
|
-
/**
 * Wrap loose text and inline nodes that sit directly under a container in
 * <p> elements so they receive paragraph spacing.
 *
 * Consecutive non-blank text nodes, <br> elements and elements whose tag is
 * in the outer `inlineTags` set are grouped into one paragraph; whitespace-only
 * text nodes are removed; any other element ends the current group and is
 * processed recursively.
 *
 * Elements that must not receive a <p> child are left untouched: paragraphs,
 * list items, code containers, table structure, and - new in this version -
 * headings and <summary>, which only accept phrasing content. Previously the
 * recursion turned `<h2>Title</h2>` into `<h2><p>Title</p></h2>`.
 *
 * @param {Element} element - Container processed in place.
 * @returns {void}
 */
function wrapInlineChildren(element) {
  const untouchedTags = new Set([
    'P', 'LI', 'PRE', 'CODE', 'TABLE', 'THEAD', 'TBODY', 'TR',
    'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'SUMMARY'
  ]);
  const tagName = element.tagName ? element.tagName.toUpperCase() : '';
  if (untouchedTags.has(tagName)) {
    return;
  }

  let pending = [];

  // Move the pending inline run into a new <p> placed before referenceNode
  // (or appended at the end when referenceNode is null).
  const flushBefore = (referenceNode) => {
    if (pending.length === 0) {
      return;
    }
    const paragraph = document.createElement('p');
    pending.forEach((node) => paragraph.appendChild(node));
    element.insertBefore(paragraph, referenceNode);
    pending = [];
  };

  // Iterate over a snapshot: wrapping rewrites element.childNodes.
  for (const node of Array.from(element.childNodes)) {
    if (node.nodeType === Node.TEXT_NODE) {
      if (node.textContent.trim()) {
        pending.push(node);
      } else {
        element.removeChild(node);
      }
      continue;
    }

    if (node.nodeType === Node.ELEMENT_NODE) {
      const childTag = node.tagName.toUpperCase();
      if (inlineTags.has(childTag) || childTag === 'BR') {
        pending.push(node);
        continue;
      }

      // A block-level child closes the current inline run.
      flushBefore(node);
      wrapInlineChildren(node);
      continue;
    }

    // Any other node type (e.g. a comment) also closes the current run.
    flushBefore(node);
  }

  flushBefore(null);
}
|
|
1609
|
-
|
|
1610
|
-
wrapInlineChildren(contentClone);
|
|
1611
|
-
|
|
1612
|
-
// 移除尾部的版权/广告声明
|
|
1613
|
-
const footerKeywords = ['版权', '未经许可', '未经授权', '不得转载', '未经允许', 'All Rights Reserved', '最终解释权', '转载'];
|
|
1614
|
-
const trailingElements = Array.from(contentClone.querySelectorAll('p, div, section')).slice(-6);
|
|
1615
|
-
trailingElements.forEach(el => {
|
|
1616
|
-
const text = (el.textContent || '').trim();
|
|
1617
|
-
if (!text) {
|
|
1618
|
-
return;
|
|
1619
|
-
}
|
|
1620
|
-
if (text.length <= 200 && footerKeywords.some(keyword => text.includes(keyword))) {
|
|
1621
|
-
el.remove();
|
|
1622
|
-
}
|
|
1623
|
-
});
|
|
1624
|
-
|
|
1625
|
-
// 处理代码块
|
|
1626
|
-
const codeBlocks = contentClone.querySelectorAll('pre');
|
|
1627
|
-
codeBlocks.forEach(block => {
|
|
1628
|
-
const codeText = collectCodeText(block);
|
|
1629
|
-
if (!codeText) {
|
|
1630
|
-
block.remove();
|
|
1631
|
-
return;
|
|
1632
|
-
}
|
|
1633
|
-
let codeInside = block.querySelector('code');
|
|
1634
|
-
if (!codeInside) {
|
|
1635
|
-
codeInside = document.createElement('code');
|
|
1636
|
-
block.appendChild(codeInside);
|
|
1637
|
-
}
|
|
1638
|
-
codeInside.textContent = codeText;
|
|
1639
|
-
});
|
|
1640
|
-
|
|
1641
|
-
return contentClone.innerHTML;
|
|
1642
|
-
});
|
|
1336
|
+
if (!sanitizedHtml) {
|
|
1337
|
+
throw new Error('未能提取到文章内容');
|
|
1338
|
+
}
|
|
1643
1339
|
|
|
1644
1340
|
return {
|
|
1645
1341
|
success: true,
|
|
1646
1342
|
title: article.originalTitle || article.title,
|
|
1647
|
-
content:
|
|
1343
|
+
content: sanitizedHtml
|
|
1648
1344
|
};
|
|
1649
1345
|
|
|
1650
1346
|
} catch (error) {
|
|
1651
|
-
|
|
1652
|
-
let errorMessage = error.message;
|
|
1653
|
-
if (error.message.includes('Timeout') || error.message.includes('timeout')) {
|
|
1654
|
-
errorMessage = 'Cookie 可能已失效或页面加载超时';
|
|
1655
|
-
}
|
|
1656
|
-
|
|
1347
|
+
console.error(`[${index}/${total}] 提取文章内容失败: ${article.originalTitle || article.title}`, error);
|
|
1657
1348
|
return {
|
|
1658
1349
|
success: false,
|
|
1659
1350
|
title: article.originalTitle || article.title,
|
|
1660
|
-
|
|
1661
|
-
|
|
1351
|
+
error: error.message,
|
|
1352
|
+
content: ''
|
|
1662
1353
|
};
|
|
1663
1354
|
}
|
|
1664
1355
|
}
|
|
1665
1356
|
|
|
1357
|
+
|
|
1666
1358
|
// 并发提取文章内容(用于 EPUB)
|
|
1667
1359
|
async function extractWithConcurrency(context, articles, concurrency = 5, delay = 2000, timeout = 60000) {
|
|
1668
1360
|
const results = [];
|
|
@@ -1769,7 +1461,7 @@ async function generateEPUB(outputDir, columnTitle, columnAuthor, articles, cont
|
|
|
1769
1461
|
return null;
|
|
1770
1462
|
}
|
|
1771
1463
|
|
|
1772
|
-
|
|
1464
|
+
const options = {
|
|
1773
1465
|
title: columnTitle,
|
|
1774
1466
|
author: columnAuthor || '极客时间',
|
|
1775
1467
|
publisher: '极客时间',
|
|
@@ -2029,13 +1721,46 @@ async function main(options) {
|
|
|
2029
1721
|
globalBrowser = browser;
|
|
2030
1722
|
|
|
2031
1723
|
const context = await browser.newContext({
|
|
2032
|
-
userAgent:
|
|
1724
|
+
userAgent: DEFAULT_USER_AGENT
|
|
2033
1725
|
});
|
|
2034
1726
|
|
|
1727
|
+
// 兼容用户直接复制整行"Cookie: xxx"
|
|
1728
|
+
let normalizedCookie = cookie.trim();
|
|
1729
|
+
if (/^cookie:/i.test(normalizedCookie)) {
|
|
1730
|
+
normalizedCookie = normalizedCookie.replace(/^cookie:\s*/i, '');
|
|
1731
|
+
}
|
|
1732
|
+
globalCookieHeader = normalizedCookie;
|
|
1733
|
+
|
|
2035
1734
|
// 设置 cookies
|
|
2036
|
-
const cookies = parseCookies(
|
|
1735
|
+
const cookies = parseCookies(normalizedCookie);
|
|
2037
1736
|
await context.addCookies(cookies);
|
|
2038
1737
|
|
|
1738
|
+
// 确保所有极客时间域名的请求都携带原始Cookie串,避免Playwright丢失关键字段
|
|
1739
|
+
await context.route('**/*', (route) => {
|
|
1740
|
+
const request = route.request();
|
|
1741
|
+
let url;
|
|
1742
|
+
try {
|
|
1743
|
+
url = new URL(request.url());
|
|
1744
|
+
} catch {
|
|
1745
|
+
return route.continue();
|
|
1746
|
+
}
|
|
1747
|
+
|
|
1748
|
+
const hostname = url.hostname || '';
|
|
1749
|
+
const isGeekbangDomain =
|
|
1750
|
+
hostname === 'geekbang.org' ||
|
|
1751
|
+
hostname.endsWith('.geekbang.org');
|
|
1752
|
+
|
|
1753
|
+
if (!isGeekbangDomain) {
|
|
1754
|
+
return route.continue();
|
|
1755
|
+
}
|
|
1756
|
+
|
|
1757
|
+
const headers = {
|
|
1758
|
+
...request.headers(),
|
|
1759
|
+
cookie: normalizedCookie
|
|
1760
|
+
};
|
|
1761
|
+
route.continue({ headers });
|
|
1762
|
+
});
|
|
1763
|
+
|
|
2039
1764
|
const page = await context.newPage();
|
|
2040
1765
|
|
|
2041
1766
|
try {
|
package/package.json
CHANGED
|
Binary file
|