@kadaliao/geektime-downloader 1.1.2 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/download.js +375 -665
- package/package.json +1 -1
- package/kadaliao-geektime-downloader-1.1.1.tgz +0 -0
package/download.js
CHANGED
|
@@ -19,6 +19,7 @@ const require = createRequire(import.meta.url);
|
|
|
19
19
|
const { version } = require('./package.json');
|
|
20
20
|
|
|
21
21
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
22
|
+
// Raw Cookie header value; when non-empty, fetchArticleData sends it as the `cookie` header on article API requests.
let globalCookieHeader = '';
|
|
22
23
|
|
|
23
24
|
// 全局变量:跟踪当前浏览器实例和是否正在关闭
|
|
24
25
|
let globalBrowser = null;
|
|
@@ -244,6 +245,10 @@ const PRINT_FIX_CSS = `
|
|
|
244
245
|
}
|
|
245
246
|
`;
|
|
246
247
|
|
|
248
|
+
// Site origin; used to build the article API URL, the origin/referer request headers and the <base href> of printable pages.
const GEEKTIME_BASE_URL = 'https://time.geekbang.org';
|
|
249
|
+
// Article detail endpoint that fetchArticleData POSTs to.
const ARTICLE_API_URL = `${GEEKTIME_BASE_URL}/serv/v1/article`;
|
|
250
|
+
// User-Agent header sent with article API requests.
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
|
|
251
|
+
|
|
247
252
|
// 解析 cookie 字符串
|
|
248
253
|
function parseCookies(cookieString) {
|
|
249
254
|
return cookieString.split(';').map(cookie => {
|
|
@@ -257,6 +262,245 @@ function parseCookies(cookieString) {
|
|
|
257
262
|
});
|
|
258
263
|
}
|
|
259
264
|
|
|
265
|
+
/**
 * Normalizes raw article HTML before it is sanitized.
 *
 * - Removes the `<!-- [[[read_end]]] -->` marker comment.
 * - Rewrites protocol-relative `src` / `href` values (`//host/...`) to
 *   `https://host/...`; the attribute name is emitted in lower case.
 *
 * @param {string} [html=''] Raw article HTML.
 * @returns {string} Normalized HTML, or '' for empty or non-string input.
 */
function normalizeArticleHtml(html = '') {
  // Non-string values have no usable .replace(); treat them as "no content"
  // instead of throwing a TypeError.
  if (typeof html !== 'string' || html === '') return '';

  return html
    .replace(/<!--\s*\[\[\[read_end\]\]\]\s*-->/gi, '')
    // One pass covers src/href with either quote style.
    .replace(
      /(src|href)=(["'])\/\//gi,
      (_match, attr, quote) => `${attr.toLowerCase()}=${quote}https://`,
    );
}
|
|
274
|
+
|
|
275
|
+
/**
 * Fetches one article's data from the article API, retrying on any failure.
 *
 * Up to three attempts are made; after a failed attempt the function waits
 * `attempt * 700` ms before trying again.
 *
 * @param {object} context Browser context exposing `request.post()` and
 *   returning a response with `text()`, `ok()`, `status()`, `statusText()`
 *   and `dispose()`.
 * @param {string|number} articleId Article identifier.
 * @returns {Promise<object>} The `data` object of the API response; it always
 *   has a non-empty `article_content`.
 * @throws {Error} The error of the last failed attempt.
 */
async function fetchArticleData(context, articleId) {
  const maxAttempts = 3;
  const refererUrl = `${GEEKTIME_BASE_URL}/column/article/${articleId}`;
  let lastError = null;

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      const response = await context.request.post(ARTICLE_API_URL, {
        headers: {
          'user-agent': DEFAULT_USER_AGENT,
          'content-type': 'application/json',
          'accept': 'application/json, text/plain, */*',
          'origin': GEEKTIME_BASE_URL,
          'referer': refererUrl,
          'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
          // Only send a cookie header when one has been configured.
          ...(globalCookieHeader ? { 'cookie': globalCookieHeader } : {}),
        },
        data: {
          id: String(articleId),
          include_neighbors: true,
          is_freelyread: true,
        },
      });

      let bodyText;
      let isOk;
      let status;
      let statusText;
      try {
        bodyText = await response.text();
        isOk = response.ok();
        status = response.status();
        statusText = response.statusText();
      } finally {
        // Release the response body now rather than holding it until the
        // context closes. Best-effort: a cleanup failure must not turn a
        // successful fetch into a retry.
        try {
          await response.dispose();
        } catch {
          // ignore
        }
      }

      if (!isOk) {
        throw new Error(`API请求失败: ${status} ${statusText} - ${bodyText.slice(0, 160)}`);
      }

      let json;
      try {
        json = JSON.parse(bodyText);
      } catch (parseError) {
        throw new Error(`API响应解析失败: ${parseError.message} - ${bodyText.slice(0, 160)}`);
      }

      if (!json || json.code !== 0 || !json.data) {
        throw new Error(`无法获取完整文章内容: ${bodyText.slice(0, 160)}`);
      }

      if (!json.data.article_content) {
        throw new Error('文章内容为空,可能需要更新 Cookie 或重新获取权限');
      }

      return json.data;
    } catch (error) {
      lastError = error;
      if (attempt < maxAttempts) {
        // Linear back-off between attempts.
        await new Promise((resolve) => setTimeout(resolve, attempt * 700));
      }
    }
  }

  throw lastError || new Error('未知错误导致文章内容获取失败');
}
|
|
331
|
+
|
|
332
|
+
/**
 * Cleans article HTML inside the browser page and returns the result.
 *
 * Runs in the page via `page.evaluate`. The markup is parsed into a
 * `<template>` fragment, then:
 *  1. elements matching the removal selectors are deleted;
 *  2. elements whose class, id, role or data-* values contain a plugin
 *     keyword are deleted;
 *  3. mind-map containers and mind-map svg/canvas/object/embed are deleted;
 *  4. every remaining element is checked against a tag allow-list
 *     (disallowed elements are unwrapped, keeping their children) and an
 *     attribute allow-list;
 *  5. images get `loading="lazy"` (if unset) and responsive sizing.
 *
 * @param {object} page Page object exposing `evaluate(fn, arg)`.
 * @param {string} rawHtml Article HTML to clean.
 * @returns {Promise<string>} Sanitized HTML.
 */
async function sanitizeArticleHtml(page, rawHtml) {
  return page.evaluate((html) => {
    const template = document.createElement('template');
    template.innerHTML = html;
    const root = template.content;

    // Read the class attribute directly: `className` is not a string on SVG
    // elements, so stringifying it never exposes the real class names.
    const classOf = (el) => el.getAttribute('class') || '';

    const removalSelectors = [
      'nav', 'header', 'footer', 'aside',
      '.comment', '.comments', '.Index_comment',
      '.recommend', '.recommendation', '.related', '.advertisement', '.ad', '.banner',
      '.subscribe', '.subscription', '.toolbar', '.Index_shareIcons_1vtJa',
      '.keyboard-wrapper', '.app-download', '.article-actions', '.article-bottom',
      '.note', '.notes', '.annotation', '.translation', '.trans', '.translator',
      '.audio', '.audio-player', '.voice', '.player', '.geek-player', '.podcast', '.radio',
      '.reward', '.appreciate', '.appreciation', '.donate', '.sponsor', '.thanks', '.support',
      '.qrcode', '.qr-code', '.qr', '.promotion', '.promo', '.ad-banner',
      '.copyright', '.statement', '.disclaimer',
      '.app-download-banner', '.article-plugin', '.article-notification', '.float-bar',
      'audio', 'video',
      '[class*="Note"]', '[class*="note"]', '[class*="Translation"]', '[class*="translation"]',
      '[class*="Audio"]', '[class*="audio"]', '[class*="Reward"]', '[class*="reward"]',
      '[data-plugin]', '[data-track]', '[data-track-section]', '[data-translation]', '[data-audio]',
      '[data-role="toolbar"]',
      'button', 'iframe', 'script', 'style',
    ];
    root.querySelectorAll(removalSelectors.join(', ')).forEach((el) => el.remove());

    const pluginKeywords = [
      'note', 'translation', 'audio', 'player', 'reward', 'donate',
      'appreciation', 'sponsor', 'qrcode', 'toolbar', 'plugin',
      'copyright', 'geeknote', 'bilingual',
    ];
    for (const el of Array.from(root.querySelectorAll('*'))) {
      const haystack = [
        classOf(el),
        el.id || '',
        el.getAttribute('role') || '',
        el.dataset ? Object.values(el.dataset).join(' ') : '',
      ].join(' ').toLowerCase();
      if (pluginKeywords.some((keyword) => haystack.includes(keyword))) {
        el.remove();
      }
    }

    const mindmapSelectors = [
      '.mindmap', '.mind-map', '.MindMap', '.Mind-map',
      '[data-type="mindmap"]', '[data-role="mindmap"]', '[data-widget="mindmap"]',
      '[class*="MindMap"]', '[class*="mindMap"]',
    ];
    root.querySelectorAll(mindmapSelectors.join(', ')).forEach((el) => el.remove());

    for (const el of Array.from(root.querySelectorAll('svg, canvas, object, embed'))) {
      const meta = `${classOf(el)} ${el.id || ''} ${el.getAttribute('data-type') || ''}`.toLowerCase();
      // 'mind' also covers 'mindmap' and 'mind-map'.
      if (meta.includes('mind')) {
        el.remove();
      }
    }

    const allowedTags = new Set([
      'P', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6',
      'UL', 'OL', 'LI',
      'BLOCKQUOTE', 'PRE', 'CODE',
      'IMG', 'TABLE', 'THEAD', 'TBODY', 'TR', 'TH', 'TD', 'FIGURE', 'FIGCAPTION',
      'STRONG', 'EM', 'B', 'I', 'SPAN', 'DIV', 'BR', 'HR',
      'A', 'SUP', 'SUB',
    ]);
    // SECTION and ARTICLE are not in allowedTags, so they are unwrapped before
    // this set is consulted; only DIV and FIGURE ever get display:block here.
    const blockDisplayTags = new Set(['DIV', 'SECTION', 'ARTICLE', 'FIGURE']);
    const allowedAttributes = new Set(['href', 'src', 'alt', 'title', 'class', 'style', 'target', 'rel']);

    // Walk every element with an explicit stack. Element children are captured
    // BEFORE a disallowed element is unwrapped, so hoisted children are still
    // visited and filtered instead of slipping through unsanitized.
    function sanitizeNode(node) {
      const pending = Array.from(node.children || []);
      while (pending.length > 0) {
        const el = pending.pop();
        const nested = Array.from(el.children);

        if (!allowedTags.has(el.tagName)) {
          // Drop the wrapper but keep its content in place.
          el.replaceWith(...el.childNodes);
        } else {
          if (blockDisplayTags.has(el.tagName)) {
            el.style.display = 'block';
          }
          for (const attr of Array.from(el.attributes)) {
            if (!allowedAttributes.has(attr.name.toLowerCase())) {
              el.removeAttribute(attr.name);
            }
          }
        }

        pending.push(...nested);
      }
    }

    sanitizeNode(root);

    root.querySelectorAll('img').forEach((img) => {
      // NOTE(review): lazy loading may defer off-screen images; confirm they
      // are loaded before the PDF is rendered.
      if (!img.getAttribute('loading')) {
        img.setAttribute('loading', 'lazy');
      }
      img.style.maxWidth = '100%';
      img.style.height = 'auto';
    });

    return template.innerHTML;
  }, rawHtml);
}
|
|
441
|
+
|
|
442
|
+
/**
 * Escapes the five HTML-significant characters so text can be embedded in
 * element content or quoted attribute values.
 *
 * @param {string} [text=''] Text to escape; null/undefined are treated as ''.
 * @returns {string} Text with `&`, `<`, `>`, `"` and `'` replaced by entities.
 */
function escapeHtml(text = '') {
  const entities = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  // A single pass means '&' inside freshly produced entities is never re-escaped.
  return String(text ?? '').replace(/[&<>"']/g, (ch) => entities[ch]);
}
|
|
450
|
+
|
|
451
|
+
/**
 * Wraps sanitized article HTML in a standalone, print-ready HTML document.
 *
 * The title is HTML-escaped and used for both `<title>` and the top `<h1>`.
 * `sanitizedHtml` is inserted verbatim, so it must already be sanitized.
 * `<base href>` points at the site origin so relative URLs in the article
 * resolve against it.
 *
 * @param {string} title Article title (plain text).
 * @param {string} [sanitizedHtml=''] Already-sanitized article body HTML.
 * @returns {string} Complete HTML document.
 */
function buildPrintableHtml(title, sanitizedHtml = '') {
  const safeTitle = escapeHtml(title);
  // Avoid printing the literal text "null"/"undefined" when the body is missing.
  const bodyHtml = sanitizedHtml ?? '';

  const baseCss = `
    body {
      font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
      font-size: 16px;
      line-height: 1.8;
      color: #1f2329;
      margin: 0;
      padding: 40px;
      background: #fff;
    }

    .article-print-wrapper {
      max-width: 900px;
      margin: 0 auto;
    }

    .article-print-wrapper h1 {
      font-size: 32px;
      line-height: 1.4;
      margin-bottom: 24px;
    }

    a {
      color: #0f5ef2;
      text-decoration: none;
    }

    pre {
      background: #f7f7f7;
      padding: 16px;
      border-radius: 6px;
      overflow: auto;
    }
  `;

  return `
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<title>${safeTitle}</title>
<base href="${GEEKTIME_BASE_URL}">
<style>${baseCss}${PRINT_FIX_CSS}</style>
</head>
<body>
<div class="article-print-wrapper">
<h1>${safeTitle}</h1>
${bodyHtml}
</div>
</body>
</html>`;
}
|
|
503
|
+
|
|
260
504
|
// 获取专栏所有文章列表(通过API)
|
|
261
505
|
function getValueByPath(obj, path) {
|
|
262
506
|
if (!obj || !path) return undefined;
|
|
@@ -702,128 +946,67 @@ async function downloadWithConcurrency(context, articles, outputDir, concurrency
|
|
|
702
946
|
// 下载单篇文章为 PDF(静默模式,不显示单独的spinner)
|
|
703
947
|
async function downloadArticleSilent(page, article, outputDir, index, total) {
|
|
704
948
|
try {
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
await page
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
// 2. 克隆正文内容
|
|
719
|
-
const contentClone = articleContent.cloneNode(true);
|
|
720
|
-
|
|
721
|
-
// 3. 清空body的所有内容
|
|
722
|
-
document.body.innerHTML = '';
|
|
723
|
-
|
|
724
|
-
// 4. 重置body样式为全宽
|
|
725
|
-
document.body.style.margin = '0';
|
|
726
|
-
document.body.style.padding = '0';
|
|
727
|
-
document.body.style.width = '100%';
|
|
728
|
-
document.body.style.maxWidth = 'none';
|
|
729
|
-
document.body.style.boxSizing = 'border-box';
|
|
730
|
-
|
|
731
|
-
// 5. 创建一个简单的容器
|
|
732
|
-
const wrapper = document.createElement('div');
|
|
733
|
-
wrapper.style.width = '100%';
|
|
734
|
-
wrapper.style.maxWidth = '100%';
|
|
735
|
-
wrapper.style.margin = '0';
|
|
736
|
-
wrapper.style.padding = '0';
|
|
737
|
-
wrapper.style.boxSizing = 'border-box';
|
|
738
|
-
|
|
739
|
-
// 6. 创建标题元素(使用传入的标题文本)
|
|
740
|
-
if (titleText) {
|
|
741
|
-
const titleElement = document.createElement('h1');
|
|
742
|
-
titleElement.textContent = titleText;
|
|
743
|
-
// 设置标题样式
|
|
744
|
-
titleElement.style.fontSize = '32px';
|
|
745
|
-
titleElement.style.fontWeight = 'bold';
|
|
746
|
-
titleElement.style.marginBottom = '30px';
|
|
747
|
-
titleElement.style.marginTop = '0';
|
|
748
|
-
titleElement.style.lineHeight = '1.4';
|
|
749
|
-
titleElement.style.color = '#000';
|
|
750
|
-
wrapper.appendChild(titleElement);
|
|
751
|
-
}
|
|
752
|
-
|
|
753
|
-
// 7. 将正文插入容器
|
|
754
|
-
wrapper.appendChild(contentClone);
|
|
755
|
-
|
|
756
|
-
// 8. 将容器插入body
|
|
757
|
-
document.body.appendChild(wrapper);
|
|
758
|
-
|
|
759
|
-
// 9. 确保正文内容使用全宽且不溢出
|
|
760
|
-
contentClone.style.width = '100%';
|
|
761
|
-
contentClone.style.maxWidth = '100%';
|
|
762
|
-
contentClone.style.margin = '0';
|
|
763
|
-
contentClone.style.padding = '0';
|
|
764
|
-
contentClone.style.boxSizing = 'border-box';
|
|
765
|
-
contentClone.style.overflowWrap = 'break-word';
|
|
766
|
-
contentClone.style.wordBreak = 'break-word';
|
|
767
|
-
} else {
|
|
768
|
-
// 如果找不到正文,使用原有的删除方法
|
|
769
|
-
const selectors = [
|
|
770
|
-
'aside',
|
|
771
|
-
'[class*="leftSide"]',
|
|
772
|
-
'[class*="LeftSide"]',
|
|
773
|
-
'[class*="sidebar"]',
|
|
774
|
-
'[class*="Sidebar"]',
|
|
775
|
-
'[class*="side_"]',
|
|
776
|
-
'[class*="catalog"]',
|
|
777
|
-
'[class*="directory"]',
|
|
778
|
-
'[class*="toc"]',
|
|
779
|
-
'[class*="outline"]',
|
|
780
|
-
'[class*="Outline"]',
|
|
781
|
-
'nav',
|
|
782
|
-
'[class*="nav"]',
|
|
783
|
-
'[class*="Nav"]',
|
|
784
|
-
'[class*="rightSide"]',
|
|
785
|
-
'[class*="RightSide"]',
|
|
786
|
-
'[class*="comment"]',
|
|
787
|
-
'[class*="recommend"]',
|
|
788
|
-
'[class*="footer"]',
|
|
789
|
-
'[class*="bottom"]'
|
|
790
|
-
];
|
|
949
|
+
if (process.env.DEBUG) {
|
|
950
|
+
console.log(chalk.gray(`[silent] 准备处理文章 ${article.id} - ${article.originalTitle || article.title}`));
|
|
951
|
+
}
|
|
952
|
+
const articleData = await fetchArticleData(page.context(), article.id);
|
|
953
|
+
if (process.env.DEBUG) {
|
|
954
|
+
console.log(chalk.gray(`[silent] 已获取文章数据 ${article.id}`));
|
|
955
|
+
}
|
|
956
|
+
const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
|
|
957
|
+
const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
|
|
958
|
+
if (process.env.DEBUG) {
|
|
959
|
+
console.log(chalk.gray(`[silent] 已完成内容清洗 ${article.id}`));
|
|
960
|
+
}
|
|
961
|
+
const printableHtml = buildPrintableHtml(article.originalTitle || article.title, sanitizedHtml);
|
|
791
962
|
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
});
|
|
963
|
+
await page.setContent(printableHtml, { waitUntil: 'domcontentloaded' });
|
|
964
|
+
if (process.env.DEBUG) {
|
|
965
|
+
console.log(chalk.gray(`[silent] 已设置页面内容 ${article.id}`));
|
|
966
|
+
}
|
|
967
|
+
try {
|
|
968
|
+
await page.waitForLoadState('networkidle', { timeout: 5000 });
|
|
969
|
+
if (process.env.DEBUG) {
|
|
970
|
+
console.log(chalk.gray(`[silent] networkidle 完成 ${article.id}`));
|
|
800
971
|
}
|
|
801
|
-
|
|
802
|
-
//
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
(text.length < 200 && text.includes('大纲') && el.children.length <= 10)) {
|
|
808
|
-
el.remove();
|
|
809
|
-
}
|
|
810
|
-
});
|
|
811
|
-
}, article.originalTitle || article.title);
|
|
812
|
-
|
|
813
|
-
// 等待文章内容加载
|
|
814
|
-
await page.waitForSelector('.Index_articleContent_QBG5G, .content');
|
|
972
|
+
} catch {
|
|
973
|
+
// 忽略由于没有额外资源导致的延时
|
|
974
|
+
if (process.env.DEBUG) {
|
|
975
|
+
console.log(chalk.gray(`[silent] networkidle 超时(已忽略) ${article.id}`));
|
|
976
|
+
}
|
|
977
|
+
}
|
|
815
978
|
|
|
816
979
|
// 优化图片大小:将大图片转换为合适的尺寸,减小PDF体积
|
|
980
|
+
if (process.env.DEBUG) {
|
|
981
|
+
console.log(chalk.gray(`[silent] 开始处理图片 ${article.id}`));
|
|
982
|
+
}
|
|
817
983
|
await page.evaluate(() => {
|
|
818
984
|
const images = document.querySelectorAll('img');
|
|
819
985
|
const promises = Array.from(images).map(img => {
|
|
820
986
|
return new Promise((resolve) => {
|
|
987
|
+
let resolved = false;
|
|
988
|
+
const safeResolve = () => {
|
|
989
|
+
if (!resolved) {
|
|
990
|
+
resolved = true;
|
|
991
|
+
resolve();
|
|
992
|
+
}
|
|
993
|
+
};
|
|
994
|
+
const attachTimeout = () => setTimeout(safeResolve, 3000);
|
|
995
|
+
let fallbackTimer = null;
|
|
996
|
+
|
|
821
997
|
// 如果图片还未加载完成,等待加载
|
|
822
998
|
if (!img.complete) {
|
|
823
|
-
|
|
824
|
-
img.
|
|
999
|
+
fallbackTimer = attachTimeout();
|
|
1000
|
+
img.onload = () => {
|
|
1001
|
+
if (fallbackTimer) clearTimeout(fallbackTimer);
|
|
1002
|
+
processImage(img, safeResolve);
|
|
1003
|
+
};
|
|
1004
|
+
img.onerror = () => {
|
|
1005
|
+
if (fallbackTimer) clearTimeout(fallbackTimer);
|
|
1006
|
+
safeResolve(); // 图片加载失败,跳过
|
|
1007
|
+
};
|
|
825
1008
|
} else {
|
|
826
|
-
processImage(img,
|
|
1009
|
+
processImage(img, safeResolve);
|
|
827
1010
|
}
|
|
828
1011
|
});
|
|
829
1012
|
});
|
|
@@ -851,12 +1034,21 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
|
|
|
851
1034
|
ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
|
|
852
1035
|
|
|
853
1036
|
// 转换为压缩后的data URL
|
|
1037
|
+
let hasResolved = false;
|
|
1038
|
+
const finalize = () => {
|
|
1039
|
+
if (!hasResolved) {
|
|
1040
|
+
hasResolved = true;
|
|
1041
|
+
resolve();
|
|
1042
|
+
}
|
|
1043
|
+
};
|
|
854
1044
|
canvas.toBlob((blob) => {
|
|
855
|
-
|
|
856
|
-
|
|
1045
|
+
if (blob) {
|
|
1046
|
+
const url = URL.createObjectURL(blob);
|
|
1047
|
+
img.src = url;
|
|
1048
|
+
}
|
|
857
1049
|
img.style.width = maxWidth + 'px';
|
|
858
1050
|
img.style.height = 'auto';
|
|
859
|
-
|
|
1051
|
+
finalize();
|
|
860
1052
|
}, 'image/jpeg', quality);
|
|
861
1053
|
} catch (e) {
|
|
862
1054
|
// 如果压缩失败,至少限制大小
|
|
@@ -868,9 +1060,15 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
|
|
|
868
1060
|
|
|
869
1061
|
return Promise.all(promises);
|
|
870
1062
|
});
|
|
1063
|
+
if (process.env.DEBUG) {
|
|
1064
|
+
console.log(chalk.gray(`[silent] 图片处理完成 ${article.id}`));
|
|
1065
|
+
}
|
|
871
1066
|
|
|
872
1067
|
// 等待图片处理完成
|
|
873
1068
|
await page.waitForTimeout(1000);
|
|
1069
|
+
if (process.env.DEBUG) {
|
|
1070
|
+
console.log(chalk.gray(`[silent] 已准备生成PDF ${article.id}`));
|
|
1071
|
+
}
|
|
874
1072
|
|
|
875
1073
|
// 生成 PDF
|
|
876
1074
|
const filename = `${String(index).padStart(3, '0')}_${article.title}.pdf`;
|
|
@@ -888,10 +1086,16 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
|
|
|
888
1086
|
printBackground: false, // 关闭背景打印,显著减小文件大小
|
|
889
1087
|
preferCSSPageSize: false
|
|
890
1088
|
});
|
|
1089
|
+
if (process.env.DEBUG) {
|
|
1090
|
+
console.log(chalk.gray(`[silent] PDF生成完成 ${article.id}`));
|
|
1091
|
+
}
|
|
891
1092
|
|
|
892
1093
|
return { success: true, title: article.title };
|
|
893
1094
|
|
|
894
1095
|
} catch (error) {
|
|
1096
|
+
if (process.env.DEBUG) {
|
|
1097
|
+
console.log(chalk.red(`[silent] 文章 ${article.id} 失败: ${error.message}`));
|
|
1098
|
+
}
|
|
895
1099
|
return { success: false, title: article.title, error: error.message };
|
|
896
1100
|
}
|
|
897
1101
|
}
|
|
@@ -901,116 +1105,17 @@ async function downloadArticle(page, article, outputDir, index, total) {
|
|
|
901
1105
|
const spinner = ora(`[${index}/${total}] 正在下载: ${article.title}`).start();
|
|
902
1106
|
|
|
903
1107
|
try {
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
await page
|
|
907
|
-
|
|
908
|
-
// 注入打印修复样式
|
|
909
|
-
await page.addStyleTag({ content: PRINT_FIX_CSS });
|
|
910
|
-
|
|
911
|
-
// 激进的布局重构:提取正文并重建页面结构
|
|
912
|
-
await page.evaluate((titleText) => {
|
|
913
|
-
// 1. 找到文章正文内容
|
|
914
|
-
const articleContent = document.querySelector('.Index_articleContent_QBG5G, .article-content, article, [class*="articleContent"]');
|
|
915
|
-
|
|
916
|
-
if (articleContent) {
|
|
917
|
-
// 2. 克隆正文内容
|
|
918
|
-
const contentClone = articleContent.cloneNode(true);
|
|
919
|
-
|
|
920
|
-
// 3. 清空body的所有内容
|
|
921
|
-
document.body.innerHTML = '';
|
|
922
|
-
|
|
923
|
-
// 4. 重置body样式为全宽
|
|
924
|
-
document.body.style.margin = '0';
|
|
925
|
-
document.body.style.padding = '0';
|
|
926
|
-
document.body.style.width = '100%';
|
|
927
|
-
document.body.style.maxWidth = 'none';
|
|
928
|
-
document.body.style.boxSizing = 'border-box';
|
|
929
|
-
|
|
930
|
-
// 5. 创建一个简单的容器
|
|
931
|
-
const wrapper = document.createElement('div');
|
|
932
|
-
wrapper.style.width = '100%';
|
|
933
|
-
wrapper.style.maxWidth = '100%';
|
|
934
|
-
wrapper.style.margin = '0';
|
|
935
|
-
wrapper.style.padding = '0';
|
|
936
|
-
wrapper.style.boxSizing = 'border-box';
|
|
937
|
-
|
|
938
|
-
// 6. 创建标题元素(使用传入的标题文本)
|
|
939
|
-
if (titleText) {
|
|
940
|
-
const titleElement = document.createElement('h1');
|
|
941
|
-
titleElement.textContent = titleText;
|
|
942
|
-
// 设置标题样式
|
|
943
|
-
titleElement.style.fontSize = '32px';
|
|
944
|
-
titleElement.style.fontWeight = 'bold';
|
|
945
|
-
titleElement.style.marginBottom = '30px';
|
|
946
|
-
titleElement.style.marginTop = '0';
|
|
947
|
-
titleElement.style.lineHeight = '1.4';
|
|
948
|
-
titleElement.style.color = '#000';
|
|
949
|
-
wrapper.appendChild(titleElement);
|
|
950
|
-
}
|
|
951
|
-
|
|
952
|
-
// 7. 将正文插入容器
|
|
953
|
-
wrapper.appendChild(contentClone);
|
|
1108
|
+
const articleData = await fetchArticleData(page.context(), article.id);
|
|
1109
|
+
const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
|
|
1110
|
+
const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
|
|
1111
|
+
const printableHtml = buildPrintableHtml(article.originalTitle || article.title, sanitizedHtml);
|
|
954
1112
|
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
contentClone.style.margin = '0';
|
|
962
|
-
contentClone.style.padding = '0';
|
|
963
|
-
contentClone.style.boxSizing = 'border-box';
|
|
964
|
-
contentClone.style.overflowWrap = 'break-word';
|
|
965
|
-
contentClone.style.wordBreak = 'break-word';
|
|
966
|
-
} else {
|
|
967
|
-
// 如果找不到正文,使用原有的删除方法
|
|
968
|
-
const selectors = [
|
|
969
|
-
'aside',
|
|
970
|
-
'[class*="leftSide"]',
|
|
971
|
-
'[class*="LeftSide"]',
|
|
972
|
-
'[class*="sidebar"]',
|
|
973
|
-
'[class*="Sidebar"]',
|
|
974
|
-
'[class*="side_"]',
|
|
975
|
-
'[class*="catalog"]',
|
|
976
|
-
'[class*="directory"]',
|
|
977
|
-
'[class*="toc"]',
|
|
978
|
-
'[class*="outline"]',
|
|
979
|
-
'[class*="Outline"]',
|
|
980
|
-
'nav',
|
|
981
|
-
'[class*="nav"]',
|
|
982
|
-
'[class*="Nav"]',
|
|
983
|
-
'[class*="rightSide"]',
|
|
984
|
-
'[class*="RightSide"]',
|
|
985
|
-
'[class*="comment"]',
|
|
986
|
-
'[class*="recommend"]',
|
|
987
|
-
'[class*="footer"]',
|
|
988
|
-
'[class*="bottom"]'
|
|
989
|
-
];
|
|
990
|
-
|
|
991
|
-
selectors.forEach(selector => {
|
|
992
|
-
try {
|
|
993
|
-
const elements = document.querySelectorAll(selector);
|
|
994
|
-
elements.forEach(el => el.remove());
|
|
995
|
-
} catch (e) {
|
|
996
|
-
// 忽略无效选择器
|
|
997
|
-
}
|
|
998
|
-
});
|
|
999
|
-
}
|
|
1000
|
-
|
|
1001
|
-
// 额外:删除所有包含"大纲"的元素
|
|
1002
|
-
const allElements = document.querySelectorAll('*');
|
|
1003
|
-
allElements.forEach(el => {
|
|
1004
|
-
const text = el.textContent || el.innerText || '';
|
|
1005
|
-
if (text.trim() === '大纲' ||
|
|
1006
|
-
(text.length < 200 && text.includes('大纲') && el.children.length <= 10)) {
|
|
1007
|
-
el.remove();
|
|
1008
|
-
}
|
|
1009
|
-
});
|
|
1010
|
-
}, article.originalTitle || article.title);
|
|
1011
|
-
|
|
1012
|
-
// 等待文章内容加载
|
|
1013
|
-
await page.waitForSelector('.Index_articleContent_QBG5G, .content');
|
|
1113
|
+
await page.setContent(printableHtml, { waitUntil: 'domcontentloaded' });
|
|
1114
|
+
try {
|
|
1115
|
+
await page.waitForLoadState('networkidle', { timeout: 5000 });
|
|
1116
|
+
} catch {
|
|
1117
|
+
// 没有额外资源加载时忽略
|
|
1118
|
+
}
|
|
1014
1119
|
|
|
1015
1120
|
// 优化图片大小:将大图片转换为合适的尺寸,减小PDF体积
|
|
1016
1121
|
await page.evaluate(() => {
|
|
@@ -1209,460 +1314,32 @@ async function mergePDFs(outputDir, columnTitle, articles, deleteAfterMerge = fa
|
|
|
1209
1314
|
// 提取单篇文章的 HTML 内容(用于 EPUB 生成)
|
|
1210
1315
|
async function extractArticleContent(page, article, index, total) {
|
|
1211
1316
|
try {
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
// 等待文章内容加载
|
|
1216
|
-
await page.waitForSelector('.Index_articleContent_QBG5G, .content', { timeout: 60000 });
|
|
1217
|
-
|
|
1218
|
-
// 关键:等待文章完整内容加载,而不是试看内容
|
|
1219
|
-
// 滚动页面以触发懒加载内容
|
|
1220
|
-
await page.evaluate(async () => {
|
|
1221
|
-
await new Promise((resolve) => {
|
|
1222
|
-
let totalHeight = 0;
|
|
1223
|
-
const distance = 100;
|
|
1224
|
-
const timer = setInterval(() => {
|
|
1225
|
-
const scrollHeight = document.body.scrollHeight;
|
|
1226
|
-
window.scrollBy(0, distance);
|
|
1227
|
-
totalHeight += distance;
|
|
1228
|
-
|
|
1229
|
-
if (totalHeight >= scrollHeight) {
|
|
1230
|
-
clearInterval(timer);
|
|
1231
|
-
resolve();
|
|
1232
|
-
}
|
|
1233
|
-
}, 100);
|
|
1234
|
-
});
|
|
1235
|
-
});
|
|
1236
|
-
|
|
1237
|
-
// 再等待一段时间,确保内容完全加载
|
|
1238
|
-
await page.waitForTimeout(3000);
|
|
1239
|
-
|
|
1240
|
-
// 提取文章 HTML 内容
|
|
1241
|
-
const content = await page.evaluate(() => {
|
|
1242
|
-
// 找到文章正文内容
|
|
1243
|
-
const articleContent = document.querySelector('.Index_articleContent_QBG5G, .article-content, article, [class*="articleContent"]');
|
|
1244
|
-
|
|
1245
|
-
if (!articleContent) {
|
|
1246
|
-
return null;
|
|
1247
|
-
}
|
|
1248
|
-
|
|
1249
|
-
// 克隆正文以避免修改原始DOM
|
|
1250
|
-
const contentClone = articleContent.cloneNode(true);
|
|
1251
|
-
|
|
1252
|
-
// 白名单策略:只保留正文核心元素
|
|
1253
|
-
// 允许的元素标签
|
|
1254
|
-
const allowedTags = new Set([
|
|
1255
|
-
'P', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', // 段落和标题
|
|
1256
|
-
'UL', 'OL', 'LI', // 列表
|
|
1257
|
-
'BLOCKQUOTE', // 引用
|
|
1258
|
-
'PRE', 'CODE', // 代码
|
|
1259
|
-
'IMG', // 图片
|
|
1260
|
-
'TABLE', 'THEAD', 'TBODY', 'TR', 'TH', 'TD', // 表格
|
|
1261
|
-
'A', // 链接
|
|
1262
|
-
'STRONG', 'B', 'EM', 'I', 'U', // 强调和样式
|
|
1263
|
-
'BR', 'HR', // 换行和分隔线
|
|
1264
|
-
'FIGURE', 'FIGCAPTION', 'DETAILS', 'SUMMARY',
|
|
1265
|
-
'SPAN', 'DIV', 'SECTION', 'ARTICLE' // 容器(可能包含文本)
|
|
1266
|
-
]);
|
|
1267
|
-
|
|
1268
|
-
// 在清理前,移除常见的非正文区域
|
|
1269
|
-
const removalSelectors = [
|
|
1270
|
-
'nav', 'header', 'footer', 'aside',
|
|
1271
|
-
'.comment', '.comments', '.Index_comment',
|
|
1272
|
-
'.recommend', '.recommendation', '.related', '.advertisement', '.ad', '.banner',
|
|
1273
|
-
'.subscribe', '.subscription', '.toolbar', '.Index_shareIcons_1vtJa',
|
|
1274
|
-
'.keyboard-wrapper', '.app-download', '.article-actions', '.article-bottom',
|
|
1275
|
-
'.note', '.notes', '.annotation', '.translation', '.trans', '.translator',
|
|
1276
|
-
'.audio', '.audio-player', '.voice', '.player', '.geek-player', '.podcast', '.radio',
|
|
1277
|
-
'.reward', '.appreciate', '.appreciation', '.donate', '.sponsor', '.thanks', '.support',
|
|
1278
|
-
'.qrcode', '.qr-code', '.qr', '.promotion', '.promo', '.ad-banner',
|
|
1279
|
-
'.copyright', '.statement', '.disclaimer',
|
|
1280
|
-
'.app-download-banner', '.article-plugin', '.article-notification', '.float-bar',
|
|
1281
|
-
'audio', 'video',
|
|
1282
|
-
'[class*="Note"]', '[class*="note"]', '[class*="Translation"]', '[class*="translation"]',
|
|
1283
|
-
'[class*="Audio"]', '[class*="audio"]', '[class*="Reward"]', '[class*="reward"]',
|
|
1284
|
-
'[data-plugin]', '[data-track]', '[data-track-section]', '[data-translation]', '[data-audio]',
|
|
1285
|
-
'[data-role="toolbar"]',
|
|
1286
|
-
'button', 'iframe', 'script', 'style'
|
|
1287
|
-
];
|
|
1288
|
-
removalSelectors.forEach(selector => {
|
|
1289
|
-
contentClone.querySelectorAll(selector).forEach(el => el.remove());
|
|
1290
|
-
});
|
|
1291
|
-
|
|
1292
|
-
// 根据关键词进一步移除插件类元素
|
|
1293
|
-
const pluginKeywords = [
|
|
1294
|
-
'note', 'translation', 'audio', 'player', 'reward', 'donate',
|
|
1295
|
-
'appreciation', 'sponsor', 'qrcode', 'toolbar', 'plugin',
|
|
1296
|
-
'copyright', 'geeknote', 'bilingual'
|
|
1297
|
-
];
|
|
1298
|
-
const pluginElements = Array.from(contentClone.querySelectorAll('*')).filter(el => {
|
|
1299
|
-
const className = (el.className || '').toString().toLowerCase();
|
|
1300
|
-
const idValue = (el.id || '').toString().toLowerCase();
|
|
1301
|
-
const roleValue = (el.getAttribute && el.getAttribute('role')) ? el.getAttribute('role').toLowerCase() : '';
|
|
1302
|
-
const datasetValues = el.dataset ? Object.values(el.dataset).join(' ').toLowerCase() : '';
|
|
1303
|
-
const combined = `${className} ${idValue} ${roleValue} ${datasetValues}`;
|
|
1304
|
-
return pluginKeywords.some(keyword => combined.includes(keyword));
|
|
1305
|
-
});
|
|
1306
|
-
pluginElements.forEach(el => el.remove());
|
|
1307
|
-
|
|
1308
|
-
// 移除 MindMap 等 SVG/Canvas 思维导图内容(阅读器无法正确渲染)
|
|
1309
|
-
const mindmapSelectors = [
|
|
1310
|
-
'.mindmap', '.mind-map', '.MindMap', '.Mind-map',
|
|
1311
|
-
'[data-type="mindmap"]', '[data-role="mindmap"]', '[data-widget="mindmap"]',
|
|
1312
|
-
'[class*="MindMap"]', '[class*="mindMap"]'
|
|
1313
|
-
];
|
|
1314
|
-
mindmapSelectors.forEach(selector => {
|
|
1315
|
-
contentClone.querySelectorAll(selector).forEach(el => el.remove());
|
|
1316
|
-
});
|
|
1317
|
-
const vectorCandidates = Array.from(contentClone.querySelectorAll('svg, canvas, object, embed'));
|
|
1318
|
-
vectorCandidates.forEach(el => {
|
|
1319
|
-
const className = typeof el.className === 'object' ? el.className.baseVal : (el.className || '');
|
|
1320
|
-
const meta = `${className} ${el.id || ''} ${el.getAttribute('data-type') || ''}`.toLowerCase();
|
|
1321
|
-
if (meta.includes('mind') || meta.includes('mindmap') || meta.includes('mind-map')) {
|
|
1322
|
-
el.remove();
|
|
1323
|
-
}
|
|
1324
|
-
});
|
|
1325
|
-
|
|
1326
|
-
// 将富文本中的代码块结构转换为标准 <pre><code>
|
|
1327
|
-
const blockSeparatorTags = new Set([
|
|
1328
|
-
'P','DIV','SECTION','ARTICLE','UL','OL','LI','FIGURE','FIGCAPTION',
|
|
1329
|
-
'TABLE','THEAD','TBODY','TR','TD'
|
|
1330
|
-
]);
|
|
1331
|
-
|
|
1332
|
-
function collectCodeText(node) {
|
|
1333
|
-
const parts = [];
|
|
1334
|
-
|
|
1335
|
-
const ensureNewline = () => {
|
|
1336
|
-
if (!parts.length) {
|
|
1337
|
-
parts.push('\n');
|
|
1338
|
-
return;
|
|
1339
|
-
}
|
|
1340
|
-
if (!parts[parts.length - 1].endsWith('\n')) {
|
|
1341
|
-
parts.push('\n');
|
|
1342
|
-
}
|
|
1343
|
-
};
|
|
1344
|
-
|
|
1345
|
-
const traverse = (current) => {
|
|
1346
|
-
if (!current) {
|
|
1347
|
-
return;
|
|
1348
|
-
}
|
|
1349
|
-
if (current.nodeType === Node.TEXT_NODE) {
|
|
1350
|
-
const textValue = current.textContent.replace(/\u00A0/g, ' ');
|
|
1351
|
-
if (textValue) {
|
|
1352
|
-
parts.push(textValue);
|
|
1353
|
-
}
|
|
1354
|
-
return;
|
|
1355
|
-
}
|
|
1356
|
-
if (current.nodeType !== Node.ELEMENT_NODE) {
|
|
1357
|
-
return;
|
|
1358
|
-
}
|
|
1359
|
-
const tag = current.tagName.toUpperCase();
|
|
1360
|
-
if (tag === 'BR') {
|
|
1361
|
-
ensureNewline();
|
|
1362
|
-
return;
|
|
1363
|
-
}
|
|
1364
|
-
Array.from(current.childNodes).forEach(traverse);
|
|
1365
|
-
if (blockSeparatorTags.has(tag)) {
|
|
1366
|
-
ensureNewline();
|
|
1367
|
-
}
|
|
1368
|
-
};
|
|
1369
|
-
|
|
1370
|
-
traverse(node);
|
|
1371
|
-
let text = parts.join('');
|
|
1372
|
-
text = text
|
|
1373
|
-
.replace(/\r\n/g, '\n')
|
|
1374
|
-
.replace(/\n{3,}/g, '\n\n')
|
|
1375
|
-
.replace(/[ \t]+\n/g, '\n')
|
|
1376
|
-
.replace(/\n+$/g, '\n');
|
|
1377
|
-
return text.trim() ? text : '';
|
|
1378
|
-
}
|
|
1379
|
-
|
|
1380
|
-
const codeLikeSelectors = [
|
|
1381
|
-
'[data-slate-type="code"]',
|
|
1382
|
-
'[data-slate-node="code"]',
|
|
1383
|
-
'[data-code-block]',
|
|
1384
|
-
'[data-code]',
|
|
1385
|
-
'[data-code-language]',
|
|
1386
|
-
'[class*="code-block"]',
|
|
1387
|
-
'[class*="CodeBlock"]'
|
|
1388
|
-
];
|
|
1389
|
-
const codeCandidates = new Set();
|
|
1390
|
-
codeLikeSelectors.forEach(selector => {
|
|
1391
|
-
contentClone.querySelectorAll(selector).forEach(el => codeCandidates.add(el));
|
|
1392
|
-
});
|
|
1393
|
-
const replaceWithPre = (element) => {
|
|
1394
|
-
if (!element || !element.parentNode) {
|
|
1395
|
-
return;
|
|
1396
|
-
}
|
|
1397
|
-
const codeText = collectCodeText(element);
|
|
1398
|
-
if (!codeText) {
|
|
1399
|
-
element.remove();
|
|
1400
|
-
return;
|
|
1401
|
-
}
|
|
1402
|
-
const pre = document.createElement('pre');
|
|
1403
|
-
const code = document.createElement('code');
|
|
1404
|
-
code.textContent = codeText;
|
|
1405
|
-
pre.appendChild(code);
|
|
1406
|
-
element.parentNode.replaceChild(pre, element);
|
|
1407
|
-
};
|
|
1408
|
-
codeCandidates.forEach(el => {
|
|
1409
|
-
if (el.tagName && el.tagName.toUpperCase() === 'PRE') {
|
|
1410
|
-
return;
|
|
1411
|
-
}
|
|
1412
|
-
replaceWithPre(el);
|
|
1413
|
-
});
|
|
1414
|
-
|
|
1415
|
-
const multilineInlineCodes = Array.from(contentClone.querySelectorAll('code')).filter(codeEl => {
|
|
1416
|
-
const parent = codeEl.parentElement;
|
|
1417
|
-
return parent && parent.tagName.toUpperCase() !== 'PRE' && codeEl.textContent.includes('\n');
|
|
1418
|
-
});
|
|
1419
|
-
multilineInlineCodes.forEach(codeEl => {
|
|
1420
|
-
const codeText = collectCodeText(codeEl);
|
|
1421
|
-
if (!codeText) {
|
|
1422
|
-
codeEl.remove();
|
|
1423
|
-
return;
|
|
1424
|
-
}
|
|
1425
|
-
const pre = document.createElement('pre');
|
|
1426
|
-
const innerCode = document.createElement('code');
|
|
1427
|
-
innerCode.textContent = codeText;
|
|
1428
|
-
pre.appendChild(innerCode);
|
|
1429
|
-
codeEl.parentNode.replaceChild(pre, codeEl);
|
|
1430
|
-
});
|
|
1431
|
-
|
|
1432
|
-
// 递归清理函数:移除不在白名单中的元素
|
|
1433
|
-
function cleanElement(element) {
|
|
1434
|
-
const children = Array.from(element.childNodes);
|
|
1435
|
-
|
|
1436
|
-
for (const child of children) {
|
|
1437
|
-
if (child.nodeType === Node.ELEMENT_NODE) {
|
|
1438
|
-
const tagName = child.tagName.toUpperCase();
|
|
1439
|
-
|
|
1440
|
-
if (!allowedTags.has(tagName)) {
|
|
1441
|
-
// 先递归处理子节点
|
|
1442
|
-
cleanElement(child);
|
|
1443
|
-
|
|
1444
|
-
if (child.childNodes.length > 0) {
|
|
1445
|
-
while (child.firstChild) {
|
|
1446
|
-
element.insertBefore(child.firstChild, child);
|
|
1447
|
-
}
|
|
1448
|
-
child.remove();
|
|
1449
|
-
} else {
|
|
1450
|
-
const textContent = (child.textContent || '').trim();
|
|
1451
|
-
if (textContent) {
|
|
1452
|
-
const textNode = document.createTextNode(textContent + ' ');
|
|
1453
|
-
element.insertBefore(textNode, child);
|
|
1454
|
-
}
|
|
1455
|
-
child.remove();
|
|
1456
|
-
}
|
|
1457
|
-
} else {
|
|
1458
|
-
cleanElement(child);
|
|
1459
|
-
}
|
|
1460
|
-
}
|
|
1461
|
-
}
|
|
1462
|
-
}
|
|
1463
|
-
|
|
1464
|
-
cleanElement(contentClone);
|
|
1465
|
-
|
|
1466
|
-
// 移除所有style属性,避免样式冲突
|
|
1467
|
-
const allElements = contentClone.querySelectorAll('*');
|
|
1468
|
-
allElements.forEach(el => {
|
|
1469
|
-
el.removeAttribute('style');
|
|
1470
|
-
el.removeAttribute('class');
|
|
1471
|
-
el.removeAttribute('id');
|
|
1472
|
-
el.removeAttribute('onclick');
|
|
1473
|
-
el.removeAttribute('onload');
|
|
1474
|
-
});
|
|
1475
|
-
|
|
1476
|
-
// 处理图片URL
|
|
1477
|
-
const images = contentClone.querySelectorAll('img');
|
|
1478
|
-
const adKeywordLower = ['ad', 'advert', 'banner', 'qrcode', 'qr-code', 'reward', 'donate', 'appdownload', 'app-download', 'sponsor', 'thanks'];
|
|
1479
|
-
const adKeywordCn = ['广告', '二维码', '赞赏', '打赏', '版权', '推广'];
|
|
1480
|
-
images.forEach(img => {
|
|
1481
|
-
let src = img.getAttribute('src');
|
|
1482
|
-
const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') || img.getAttribute('data-lazy-src');
|
|
1483
|
-
|
|
1484
|
-
if (dataSrc && (dataSrc.startsWith('http://') || dataSrc.startsWith('https://'))) {
|
|
1485
|
-
src = dataSrc;
|
|
1486
|
-
img.setAttribute('src', src);
|
|
1487
|
-
}
|
|
1488
|
-
|
|
1489
|
-
if (!src || src.startsWith('blob:') || src.startsWith('data:')) {
|
|
1490
|
-
img.remove();
|
|
1491
|
-
return;
|
|
1492
|
-
}
|
|
1493
|
-
|
|
1494
|
-
if (!src.startsWith('http://') && !src.startsWith('https://')) {
|
|
1495
|
-
try {
|
|
1496
|
-
const absoluteUrl = new URL(src, window.location.href).href;
|
|
1497
|
-
img.setAttribute('src', absoluteUrl);
|
|
1498
|
-
src = absoluteUrl;
|
|
1499
|
-
} catch (e) {
|
|
1500
|
-
img.remove();
|
|
1501
|
-
}
|
|
1502
|
-
}
|
|
1317
|
+
const articleData = await fetchArticleData(page.context(), article.id);
|
|
1318
|
+
const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
|
|
1319
|
+
const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
|
|
1503
1320
|
|
|
1504
|
-
|
|
1505
|
-
|
|
1506
|
-
|
|
1507
|
-
if (
|
|
1508
|
-
adKeywordLower.some(keyword => srcLower.includes(keyword)) ||
|
|
1509
|
-
adKeywordLower.some(keyword => altLower.includes(keyword)) ||
|
|
1510
|
-
adKeywordCn.some(keyword => altText.includes(keyword))
|
|
1511
|
-
) {
|
|
1512
|
-
img.remove();
|
|
1513
|
-
return;
|
|
1514
|
-
}
|
|
1515
|
-
|
|
1516
|
-
// 清理图片属性
|
|
1517
|
-
const imgAttrs = img.attributes;
|
|
1518
|
-
for (let i = imgAttrs.length - 1; i >= 0; i--) {
|
|
1519
|
-
const attrName = imgAttrs[i].name;
|
|
1520
|
-
if (attrName !== 'src' && attrName !== 'alt') {
|
|
1521
|
-
img.removeAttribute(attrName);
|
|
1522
|
-
}
|
|
1523
|
-
}
|
|
1524
|
-
});
|
|
1525
|
-
|
|
1526
|
-
// 清理空的div和span
|
|
1527
|
-
const containers = contentClone.querySelectorAll('div, span');
|
|
1528
|
-
containers.forEach(container => {
|
|
1529
|
-
if (!container.textContent.trim() && !container.querySelector('img, pre, code, table')) {
|
|
1530
|
-
container.remove();
|
|
1531
|
-
}
|
|
1532
|
-
});
|
|
1533
|
-
|
|
1534
|
-
// 将只包含纯文本的 div 转换为段落,避免没有段间距
|
|
1535
|
-
const blockLikeTags = new Set(['P','UL','OL','LI','TABLE','PRE','BLOCKQUOTE','H1','H2','H3','H4','H5','H6','IMG','SECTION','ARTICLE','FIGURE','FIGCAPTION','DETAILS','SUMMARY']);
|
|
1536
|
-
const textContainers = Array.from(contentClone.querySelectorAll('div, section, article')).reverse();
|
|
1537
|
-
textContainers.forEach(container => {
|
|
1538
|
-
if (container === contentClone) {
|
|
1539
|
-
return;
|
|
1540
|
-
}
|
|
1541
|
-
|
|
1542
|
-
if (!container.textContent.trim()) {
|
|
1543
|
-
return;
|
|
1544
|
-
}
|
|
1545
|
-
|
|
1546
|
-
if (container.querySelector('img, pre, table, ul, ol, blockquote, h1, h2, h3, h4, h5, h6, figure')) {
|
|
1547
|
-
return;
|
|
1548
|
-
}
|
|
1549
|
-
|
|
1550
|
-
const hasBlockChildren = Array.from(container.children).some(child => blockLikeTags.has(child.tagName?.toUpperCase()));
|
|
1551
|
-
if (hasBlockChildren) {
|
|
1552
|
-
return;
|
|
1553
|
-
}
|
|
1554
|
-
|
|
1555
|
-
const paragraph = document.createElement('p');
|
|
1556
|
-
paragraph.innerHTML = container.innerHTML;
|
|
1557
|
-
container.parentNode.replaceChild(paragraph, container);
|
|
1558
|
-
});
|
|
1559
|
-
|
|
1560
|
-
// 包装直接挂在容器下的文本或行内节点,避免散乱文本没有段落间距
|
|
1561
|
-
const inlineTags = new Set(['A','SPAN','STRONG','B','EM','I','U','CODE','SMALL','SUB','SUP','MARK']);
|
|
1562
|
-
|
|
1563
|
-
function wrapInlineChildren(element) {
|
|
1564
|
-
const tagName = element.tagName ? element.tagName.toUpperCase() : '';
|
|
1565
|
-
if (['P','LI','PRE','CODE','TABLE','THEAD','TBODY','TR'].includes(tagName)) {
|
|
1566
|
-
return;
|
|
1567
|
-
}
|
|
1568
|
-
|
|
1569
|
-
const childNodes = Array.from(element.childNodes);
|
|
1570
|
-
let buffer = [];
|
|
1571
|
-
|
|
1572
|
-
const flushBuffer = (referenceNode) => {
|
|
1573
|
-
if (!buffer.length) {
|
|
1574
|
-
return;
|
|
1575
|
-
}
|
|
1576
|
-
const paragraph = document.createElement('p');
|
|
1577
|
-
buffer.forEach(node => paragraph.appendChild(node));
|
|
1578
|
-
element.insertBefore(paragraph, referenceNode);
|
|
1579
|
-
buffer = [];
|
|
1580
|
-
};
|
|
1581
|
-
|
|
1582
|
-
for (const node of childNodes) {
|
|
1583
|
-
if (node.nodeType === Node.TEXT_NODE) {
|
|
1584
|
-
if (node.textContent.trim()) {
|
|
1585
|
-
buffer.push(node);
|
|
1586
|
-
} else {
|
|
1587
|
-
element.removeChild(node);
|
|
1588
|
-
}
|
|
1589
|
-
continue;
|
|
1590
|
-
}
|
|
1591
|
-
|
|
1592
|
-
if (node.nodeType === Node.ELEMENT_NODE) {
|
|
1593
|
-
const childTag = node.tagName.toUpperCase();
|
|
1594
|
-
if (inlineTags.has(childTag) || childTag === 'BR') {
|
|
1595
|
-
buffer.push(node);
|
|
1596
|
-
continue;
|
|
1597
|
-
}
|
|
1598
|
-
|
|
1599
|
-
flushBuffer(node);
|
|
1600
|
-
wrapInlineChildren(node);
|
|
1601
|
-
continue;
|
|
1602
|
-
}
|
|
1603
|
-
|
|
1604
|
-
flushBuffer(node);
|
|
1605
|
-
}
|
|
1606
|
-
|
|
1607
|
-
flushBuffer(null);
|
|
1608
|
-
}
|
|
1609
|
-
|
|
1610
|
-
wrapInlineChildren(contentClone);
|
|
1611
|
-
|
|
1612
|
-
// 移除尾部的版权/广告声明
|
|
1613
|
-
const footerKeywords = ['版权', '未经许可', '未经授权', '不得转载', '未经允许', 'All Rights Reserved', '最终解释权', '转载'];
|
|
1614
|
-
const trailingElements = Array.from(contentClone.querySelectorAll('p, div, section')).slice(-6);
|
|
1615
|
-
trailingElements.forEach(el => {
|
|
1616
|
-
const text = (el.textContent || '').trim();
|
|
1617
|
-
if (!text) {
|
|
1618
|
-
return;
|
|
1619
|
-
}
|
|
1620
|
-
if (text.length <= 200 && footerKeywords.some(keyword => text.includes(keyword))) {
|
|
1621
|
-
el.remove();
|
|
1622
|
-
}
|
|
1623
|
-
});
|
|
1624
|
-
|
|
1625
|
-
// 处理代码块
|
|
1626
|
-
const codeBlocks = contentClone.querySelectorAll('pre');
|
|
1627
|
-
codeBlocks.forEach(block => {
|
|
1628
|
-
const codeText = collectCodeText(block);
|
|
1629
|
-
if (!codeText) {
|
|
1630
|
-
block.remove();
|
|
1631
|
-
return;
|
|
1632
|
-
}
|
|
1633
|
-
let codeInside = block.querySelector('code');
|
|
1634
|
-
if (!codeInside) {
|
|
1635
|
-
codeInside = document.createElement('code');
|
|
1636
|
-
block.appendChild(codeInside);
|
|
1637
|
-
}
|
|
1638
|
-
codeInside.textContent = codeText;
|
|
1639
|
-
});
|
|
1640
|
-
|
|
1641
|
-
return contentClone.innerHTML;
|
|
1642
|
-
});
|
|
1321
|
+
if (!sanitizedHtml) {
|
|
1322
|
+
throw new Error('未能提取到文章内容');
|
|
1323
|
+
}
|
|
1643
1324
|
|
|
1644
1325
|
return {
|
|
1645
1326
|
success: true,
|
|
1646
1327
|
title: article.originalTitle || article.title,
|
|
1647
|
-
content:
|
|
1328
|
+
content: sanitizedHtml
|
|
1648
1329
|
};
|
|
1649
1330
|
|
|
1650
1331
|
} catch (error) {
|
|
1651
|
-
|
|
1652
|
-
let errorMessage = error.message;
|
|
1653
|
-
if (error.message.includes('Timeout') || error.message.includes('timeout')) {
|
|
1654
|
-
errorMessage = 'Cookie 可能已失效或页面加载超时';
|
|
1655
|
-
}
|
|
1656
|
-
|
|
1332
|
+
console.error(`[${index}/${total}] 提取文章内容失败: ${article.originalTitle || article.title}`, error);
|
|
1657
1333
|
return {
|
|
1658
1334
|
success: false,
|
|
1659
1335
|
title: article.originalTitle || article.title,
|
|
1660
|
-
|
|
1661
|
-
|
|
1336
|
+
error: error.message,
|
|
1337
|
+
content: ''
|
|
1662
1338
|
};
|
|
1663
1339
|
}
|
|
1664
1340
|
}
|
|
1665
1341
|
|
|
1342
|
+
|
|
1666
1343
|
// 并发提取文章内容(用于 EPUB)
|
|
1667
1344
|
async function extractWithConcurrency(context, articles, concurrency = 5, delay = 2000, timeout = 60000) {
|
|
1668
1345
|
const results = [];
|
|
@@ -1769,7 +1446,7 @@ async function generateEPUB(outputDir, columnTitle, columnAuthor, articles, cont
|
|
|
1769
1446
|
return null;
|
|
1770
1447
|
}
|
|
1771
1448
|
|
|
1772
|
-
|
|
1449
|
+
const options = {
|
|
1773
1450
|
title: columnTitle,
|
|
1774
1451
|
author: columnAuthor || '极客时间',
|
|
1775
1452
|
publisher: '极客时间',
|
|
@@ -2029,13 +1706,46 @@ async function main(options) {
|
|
|
2029
1706
|
globalBrowser = browser;
|
|
2030
1707
|
|
|
2031
1708
|
const context = await browser.newContext({
|
|
2032
|
-
userAgent:
|
|
1709
|
+
userAgent: DEFAULT_USER_AGENT
|
|
2033
1710
|
});
|
|
2034
1711
|
|
|
1712
|
+
// 兼容用户直接复制整行"Cookie: xxx"
|
|
1713
|
+
let normalizedCookie = cookie.trim();
|
|
1714
|
+
if (/^cookie:/i.test(normalizedCookie)) {
|
|
1715
|
+
normalizedCookie = normalizedCookie.replace(/^cookie:\s*/i, '');
|
|
1716
|
+
}
|
|
1717
|
+
globalCookieHeader = normalizedCookie;
|
|
1718
|
+
|
|
2035
1719
|
// 设置 cookies
|
|
2036
|
-
const cookies = parseCookies(
|
|
1720
|
+
const cookies = parseCookies(normalizedCookie);
|
|
2037
1721
|
await context.addCookies(cookies);
|
|
2038
1722
|
|
|
1723
|
+
// 确保所有极客时间域名的请求都携带原始Cookie串,避免Playwright丢失关键字段
|
|
1724
|
+
await context.route('**/*', (route) => {
|
|
1725
|
+
const request = route.request();
|
|
1726
|
+
let url;
|
|
1727
|
+
try {
|
|
1728
|
+
url = new URL(request.url());
|
|
1729
|
+
} catch {
|
|
1730
|
+
return route.continue();
|
|
1731
|
+
}
|
|
1732
|
+
|
|
1733
|
+
const hostname = url.hostname || '';
|
|
1734
|
+
const isGeekbangDomain =
|
|
1735
|
+
hostname === 'geekbang.org' ||
|
|
1736
|
+
hostname.endsWith('.geekbang.org');
|
|
1737
|
+
|
|
1738
|
+
if (!isGeekbangDomain) {
|
|
1739
|
+
return route.continue();
|
|
1740
|
+
}
|
|
1741
|
+
|
|
1742
|
+
const headers = {
|
|
1743
|
+
...request.headers(),
|
|
1744
|
+
cookie: normalizedCookie
|
|
1745
|
+
};
|
|
1746
|
+
route.continue({ headers });
|
|
1747
|
+
});
|
|
1748
|
+
|
|
2039
1749
|
const page = await context.newPage();
|
|
2040
1750
|
|
|
2041
1751
|
try {
|
package/package.json
CHANGED
|
Binary file
|