@kadaliao/geektime-downloader 1.1.4 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/README.md +30 -0
- package/config.example.json +1 -0
- package/cookies.json +405 -0
- package/download.js +1481 -275
- package/package.json +3 -1
package/download.js
CHANGED
|
@@ -6,7 +6,10 @@ import chalk from 'chalk';
|
|
|
6
6
|
import ora from 'ora';
|
|
7
7
|
import fs from 'fs/promises';
|
|
8
8
|
import path from 'path';
|
|
9
|
-
import { fileURLToPath } from 'url';
|
|
9
|
+
import { fileURLToPath, pathToFileURL } from 'url';
|
|
10
|
+
import { load as loadHtml } from 'cheerio';
|
|
11
|
+
import crypto from 'crypto';
|
|
12
|
+
import mime from 'mime-types';
|
|
10
13
|
import { createRequire } from 'module';
|
|
11
14
|
import * as pdfLib from 'pdf-lib';
|
|
12
15
|
import { outlinePdfFactory } from '@lillallol/outline-pdf';
|
|
@@ -245,9 +248,335 @@ const PRINT_FIX_CSS = `
|
|
|
245
248
|
}
|
|
246
249
|
`;
|
|
247
250
|
|
|
251
|
+
// 代码高亮彩色语法(覆盖Prism/Highlight.js常见class)
|
|
252
|
+
const CODE_HIGHLIGHT_CSS = `
|
|
253
|
+
pre[class*="language-"],
|
|
254
|
+
code[class*="language-"],
|
|
255
|
+
pre code,
|
|
256
|
+
code.hljs,
|
|
257
|
+
pre.hljs {
|
|
258
|
+
color: #2d2d2d;
|
|
259
|
+
background: #f7f7f7;
|
|
260
|
+
}
|
|
261
|
+
.token.comment,
|
|
262
|
+
.token.prolog,
|
|
263
|
+
.token.doctype,
|
|
264
|
+
.token.cdata,
|
|
265
|
+
.hljs-comment,
|
|
266
|
+
.hljs-quote {
|
|
267
|
+
color: #6a737d;
|
|
268
|
+
font-style: italic;
|
|
269
|
+
}
|
|
270
|
+
.token.punctuation,
|
|
271
|
+
.hljs-punctuation {
|
|
272
|
+
color: #5e6687;
|
|
273
|
+
}
|
|
274
|
+
.token.property,
|
|
275
|
+
.token.tag,
|
|
276
|
+
.token.constant,
|
|
277
|
+
.token.symbol,
|
|
278
|
+
.token.deleted,
|
|
279
|
+
.hljs-keyword,
|
|
280
|
+
.hljs-selector-tag,
|
|
281
|
+
.hljs-subst,
|
|
282
|
+
.hljs-attribute {
|
|
283
|
+
color: #d73a49;
|
|
284
|
+
}
|
|
285
|
+
.token.boolean,
|
|
286
|
+
.token.number,
|
|
287
|
+
.token.selector,
|
|
288
|
+
.token.attr-name,
|
|
289
|
+
.token.char,
|
|
290
|
+
.token.builtin,
|
|
291
|
+
.token.inserted,
|
|
292
|
+
.hljs-number,
|
|
293
|
+
.hljs-literal,
|
|
294
|
+
.hljs-variable,
|
|
295
|
+
.hljs-template-variable {
|
|
296
|
+
color: #b76bff;
|
|
297
|
+
}
|
|
298
|
+
.token.string,
|
|
299
|
+
.token.attr-value,
|
|
300
|
+
.token.operator,
|
|
301
|
+
.token.entity,
|
|
302
|
+
.token.url,
|
|
303
|
+
.token.statement,
|
|
304
|
+
.token.regex,
|
|
305
|
+
.token.important,
|
|
306
|
+
.token.variable,
|
|
307
|
+
.token.bold,
|
|
308
|
+
.hljs-string,
|
|
309
|
+
.hljs-doctag,
|
|
310
|
+
.hljs-addition {
|
|
311
|
+
color: #22863a;
|
|
312
|
+
}
|
|
313
|
+
.token.function,
|
|
314
|
+
.token.class-name,
|
|
315
|
+
.token.keyword,
|
|
316
|
+
.hljs-title,
|
|
317
|
+
.hljs-section,
|
|
318
|
+
.hljs-type,
|
|
319
|
+
.hljs-selector-id,
|
|
320
|
+
.hljs-selector-class {
|
|
321
|
+
color: #005cc5;
|
|
322
|
+
}
|
|
323
|
+
.token.operator,
|
|
324
|
+
.token.entity,
|
|
325
|
+
.token.url,
|
|
326
|
+
.hljs-bullet,
|
|
327
|
+
.hljs-built_in,
|
|
328
|
+
.hljs-builtin-name,
|
|
329
|
+
.hljs-link {
|
|
330
|
+
color: #e36209;
|
|
331
|
+
}
|
|
332
|
+
.token.italic {
|
|
333
|
+
font-style: italic;
|
|
334
|
+
}
|
|
335
|
+
.token.bold {
|
|
336
|
+
font-weight: 600;
|
|
337
|
+
}
|
|
338
|
+
.token.deleted,
|
|
339
|
+
.hljs-deletion {
|
|
340
|
+
color: #b31d28;
|
|
341
|
+
}
|
|
342
|
+
`;
|
|
343
|
+
|
|
248
344
|
const GEEKTIME_BASE_URL = 'https://time.geekbang.org';
|
|
249
|
-
const ARTICLE_API_URL = `${GEEKTIME_BASE_URL}/serv/v1/article`;
|
|
250
345
|
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
|
|
346
|
+
// Number of image downloads started together per batch when localising article images.
const EPUB_IMAGE_BATCH_SIZE = 5;
// Name prefix of the temporary directory that holds downloaded image assets.
const TEMP_ASSET_PREFIX = '__epub_assets__';
// CSS selectors for candidate article-body containers.
// NOTE(review): presumably probed in order until one matches — confirm against the lookup code.
const ARTICLE_CONTENT_SELECTORS = [
  '#article-content',
  '#article-content-container',
  '.article-content',
  '.article-detail',
  '.article-detail-content',
  '.article-content__body',
  '.Index_articleContent_QBG5G',
  '.ArticleContent_articleContent',
  'article .content',
  'main article',
  '.content-container article'
];
// CSS selectors whose matching elements are removed from article HTML by sanitizeArticleHtml
// (navigation, comments, ads, audio players, toolbars and similar page chrome).
const ARTICLE_REMOVAL_SELECTORS = [
  'nav', 'header', 'footer', 'aside',
  '.comment', '.comments', '.Index_comment', '.CommentArea', '.comment-area', '.CommentWrapper', '.Comment-module', '.CommentList',
  '#comments', '#comment', '[data-section="comment"]',
  '.recommend', '.recommendation', '.related', '.advertisement', '.ad', '.banner',
  '.subscribe', '.subscription', '.toolbar', '.Index_shareIcons_1vtJa',
  '.keyboard-wrapper', '.app-download', '.article-actions', '.article-bottom',
  '.note', '.notes', '.annotation', '.translation', '.trans', '.translator',
  '.audio', '.audio-player', '.voice', '.player', '.geek-player', '.podcast', '.radio',
  '.AudioPlayer', '.VoicePlayer', '.AudioWrapper', '.voice-player',
  '.reward', '.appreciate', '.appreciation', '.donate', '.sponsor', '.thanks', '.support',
  '.qrcode', '.qr-code', '.qr', '.promotion', '.promo', '.ad-banner',
  '.copyright', '.statement', '.disclaimer',
  '.app-download-banner', '.article-plugin', '.article-notification', '.float-bar',
  '.article-plugin-wrapper',
  '[class*="Share"]', '[data-widget="audio"]', '[data-widget="Audio"]',
  'audio', 'video',
  '[class*="Note"]', '[class*="note"]', '[class*="Translation"]', '[class*="translation"]',
  '[class*="Audio"]', '[class*="audio"]', '[class*="Reward"]', '[class*="reward"]',
  '[data-plugin]', '[data-track]', '[data-track-section]', '[data-translation]', '[data-audio]',
  '[data-role="toolbar"]',
  'button[data-role="comment"]',
  'script[data-role="plugin"]',
  '.ArticleBottomBar',
  '.bottom-toolbar'
];
// Lower-case keywords handed to sanitizeArticleHtml as `pluginKeywords`.
// NOTE(review): they appear to be matched against each element's lower-cased class name and id — confirm.
const ARTICLE_PLUGIN_KEYWORDS = [
  'note', 'translation', 'audio', 'player', 'reward', 'donate',
  'appreciation', 'sponsor', 'qrcode', 'toolbar', 'plugin',
  'copyright', 'geeknote', 'bilingual', 'comment'
];
// CSS selectors for mind-map widgets; matching elements are removed by sanitizeArticleHtml.
const ARTICLE_MINDMAP_SELECTORS = [
  '.mindmap', '.mind-map', '.MindMap', '.Mind-map',
  '[data-type="mindmap"]', '[data-role="mindmap"]', '[data-widget="mindmap"]',
  '[class*="MindMap"]', '[class*="mindMap"]'
];
|
|
397
|
+
const PDF_BASE_CSS = `
|
|
398
|
+
body {
|
|
399
|
+
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "PingFang SC", "Hiragino Sans GB", "Microsoft YaHei", sans-serif;
|
|
400
|
+
margin: 0;
|
|
401
|
+
padding: 0;
|
|
402
|
+
background: #fff;
|
|
403
|
+
color: #1f2329;
|
|
404
|
+
}
|
|
405
|
+
.article-pdf-wrapper {
|
|
406
|
+
max-width: 860px;
|
|
407
|
+
margin: 0 auto;
|
|
408
|
+
padding: 48px 56px 60px;
|
|
409
|
+
}
|
|
410
|
+
.article-title {
|
|
411
|
+
font-size: 32px;
|
|
412
|
+
font-weight: 600;
|
|
413
|
+
margin-bottom: 16px;
|
|
414
|
+
line-height: 1.3;
|
|
415
|
+
color: #111;
|
|
416
|
+
}
|
|
417
|
+
.article-meta {
|
|
418
|
+
color: #7f8c8d;
|
|
419
|
+
font-size: 14px;
|
|
420
|
+
margin-bottom: 32px;
|
|
421
|
+
}
|
|
422
|
+
.article-content p,
|
|
423
|
+
.article-content div {
|
|
424
|
+
margin: 1.1em 0;
|
|
425
|
+
line-height: 1.9;
|
|
426
|
+
font-size: 16px;
|
|
427
|
+
}
|
|
428
|
+
.article-content p + p,
|
|
429
|
+
.article-content div + p,
|
|
430
|
+
.article-content p + div {
|
|
431
|
+
margin-top: 1.6em;
|
|
432
|
+
}
|
|
433
|
+
.article-content h2,
|
|
434
|
+
.article-content h3,
|
|
435
|
+
.article-content h4 {
|
|
436
|
+
margin-top: 2.2em;
|
|
437
|
+
margin-bottom: 1em;
|
|
438
|
+
font-weight: 600;
|
|
439
|
+
color: #111;
|
|
440
|
+
}
|
|
441
|
+
.article-content h2 {
|
|
442
|
+
font-size: 26px;
|
|
443
|
+
}
|
|
444
|
+
.article-content h3 {
|
|
445
|
+
font-size: 22px;
|
|
446
|
+
}
|
|
447
|
+
.article-content h4 {
|
|
448
|
+
font-size: 18px;
|
|
449
|
+
}
|
|
450
|
+
.article-content img {
|
|
451
|
+
max-width: 100%;
|
|
452
|
+
margin: 1.2em auto;
|
|
453
|
+
display: block;
|
|
454
|
+
border-radius: 4px;
|
|
455
|
+
}
|
|
456
|
+
.article-content blockquote {
|
|
457
|
+
margin: 1.3em 0;
|
|
458
|
+
padding: 0.8em 1.2em;
|
|
459
|
+
border-left: 4px solid #d0d7de;
|
|
460
|
+
background: #f8fafc;
|
|
461
|
+
color: #4b5563;
|
|
462
|
+
}
|
|
463
|
+
.article-content ul,
|
|
464
|
+
.article-content ol {
|
|
465
|
+
margin: 1em 0;
|
|
466
|
+
padding-left: 2em;
|
|
467
|
+
}
|
|
468
|
+
.article-content pre {
|
|
469
|
+
background: #0b1220;
|
|
470
|
+
color: #d9e2ff;
|
|
471
|
+
border-radius: 6px;
|
|
472
|
+
padding: 16px 20px;
|
|
473
|
+
overflow: auto;
|
|
474
|
+
margin: 1.4em 0;
|
|
475
|
+
font-size: 14px;
|
|
476
|
+
line-height: 1.6;
|
|
477
|
+
}
|
|
478
|
+
.article-content pre code {
|
|
479
|
+
background: transparent;
|
|
480
|
+
border: none;
|
|
481
|
+
padding: 0;
|
|
482
|
+
color: inherit;
|
|
483
|
+
}
|
|
484
|
+
.article-content code {
|
|
485
|
+
font-family: "Fira Code", "SFMono-Regular", Consolas, "Liberation Mono", Menlo, monospace;
|
|
486
|
+
background: rgba(15, 23, 42, 0.08);
|
|
487
|
+
border-radius: 4px;
|
|
488
|
+
padding: 0.2em 0.4em;
|
|
489
|
+
}
|
|
490
|
+
.article-content hr {
|
|
491
|
+
border: none;
|
|
492
|
+
border-top: 1px solid #e5e7eb;
|
|
493
|
+
margin: 2.4em 0;
|
|
494
|
+
}
|
|
495
|
+
`;
|
|
496
|
+
|
|
497
|
+
/**
 * Reports whether a path is accessible on disk.
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} `true` when `fs.access` succeeds, `false` on any failure.
 */
async function fileExists(filePath) {
  return fs.access(filePath).then(
    () => true,
    () => false,
  );
}
|
|
505
|
+
|
|
506
|
+
function normalizeCookieSameSite(value) {
|
|
507
|
+
if (!value) return undefined;
|
|
508
|
+
const lower = value.toString().toLowerCase();
|
|
509
|
+
if (lower.includes('lax')) return 'Lax';
|
|
510
|
+
if (lower.includes('strict')) return 'Strict';
|
|
511
|
+
if (lower.includes('none') || lower.includes('no_restriction')) return 'None';
|
|
512
|
+
return undefined;
|
|
513
|
+
}
|
|
514
|
+
|
|
515
|
+
/**
 * Returns a usable cookie domain, falling back to the site-wide default.
 *
 * A whitespace-only string is treated the same as a missing domain; otherwise
 * it would be trimmed to '' and produce a cookie with an empty domain.
 *
 * @param {*} domain - Domain taken from a cookie export entry.
 * @returns {string} The trimmed domain, or '.geekbang.org' when none is usable.
 */
function normalizeCookieDomain(domain) {
  const trimmed = typeof domain === 'string' ? domain.trim() : '';
  return trimmed || '.geekbang.org';
}
|
|
521
|
+
|
|
522
|
+
/**
 * Loads a browser-extension style cookie export (a JSON array) from disk.
 *
 * Entries without a string `name` or with an undefined `value` are skipped.
 * Each remaining entry is normalised to
 * `{ name, value, domain, path, secure, httpOnly[, sameSite][, expires] }`.
 *
 * @param {string} filePath - Absolute path, or a path relative to the current working directory.
 * @returns {Promise<{cookieHeader: string, cookies: object[], absolutePath: string}>}
 *   `cookieHeader` is the `name=value; ...` string built from the normalised cookies.
 * @throws {Error} When the file cannot be read, is not valid JSON, is not an array,
 *   or contains no usable cookie entry. Read/parse errors carry the original error as `cause`.
 */
async function loadCookiesFromJsonFile(filePath) {
  const absolutePath = path.isAbsolute(filePath) ? filePath : path.resolve(process.cwd(), filePath);

  let raw;
  try {
    raw = await fs.readFile(absolutePath, 'utf-8');
  } catch (error) {
    throw new Error(`无法读取 cookie JSON 文件: ${error.message}`, { cause: error });
  }

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (error) {
    throw new Error(`cookie JSON 解析失败: ${error.message}`, { cause: error });
  }

  if (!Array.isArray(parsed)) {
    throw new Error('cookie JSON 必须是数组格式');
  }

  const cookies = parsed
    .filter((item) => item && typeof item.name === 'string' && item.value !== undefined)
    .map((item) => {
      const cookie = {
        name: item.name,
        value: typeof item.value === 'string' ? item.value : String(item.value ?? ''),
        domain: normalizeCookieDomain(item.domain),
        path: item.path || '/',
        secure: Boolean(item.secure),
        httpOnly: Boolean(item.httpOnly),
      };

      const sameSite = normalizeCookieSameSite(item.sameSite);
      if (sameSite) {
        cookie.sameSite = sameSite;
      }

      // Exports use either `expires` or `expirationDate`. A value that is not
      // numeric is dropped rather than stored as NaN.
      const rawExpiry = item.expires || item.expirationDate;
      if (rawExpiry) {
        const expires = Math.floor(Number(rawExpiry));
        if (Number.isFinite(expires)) {
          cookie.expires = expires;
        }
      }

      return cookie;
    });

  if (cookies.length === 0) {
    throw new Error('cookie JSON 中没有有效的 cookie 项');
  }

  const cookieHeader = cookies.map((cookie) => `${cookie.name}=${cookie.value}`).join('; ');

  return { cookieHeader, cookies, absolutePath };
}
|
|
251
580
|
|
|
252
581
|
// 解析 cookie 字符串
|
|
253
582
|
function parseCookies(cookieString) {
|
|
@@ -272,96 +601,308 @@ function normalizeArticleHtml(html = '') {
|
|
|
272
601
|
.replace(/href='\/\//gi, "href='https://");
|
|
273
602
|
}
|
|
274
603
|
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
604
|
+
/**
 * Turns an `<img src>` value into an absolute http(s) URL.
 *
 * @param {string} [rawSrc=''] - The raw `src` attribute.
 * @returns {string|null} Absolute URL, or `null` for empty input, `data:` / `blob:`
 *   sources, and values that cannot be resolved against the site base URL.
 */
function resolveImageUrl(rawSrc = '') {
  const src = rawSrc ? rawSrc.trim() : '';

  // Nothing to download for empty, inline or blob sources.
  if (src === '' || src.startsWith('data:') || src.startsWith('blob:')) return null;

  // Protocol-relative URL: force https.
  if (src.startsWith('//')) return `https:${src}`;

  // Root-relative path: prefix the site origin.
  if (src.startsWith('/')) return `${GEEKTIME_BASE_URL}${src}`;

  // Already absolute.
  if (/^https?:/i.test(src)) return src;

  // Anything else is resolved relative to the site base URL.
  try {
    return new URL(src, GEEKTIME_BASE_URL).toString();
  } catch {
    return null;
  }
}
|
|
279
625
|
|
|
280
|
-
|
|
626
|
+
/**
 * Downloads a binary resource through the browser context's request API.
 *
 * The request sends the default user agent, an image `accept` header, the site
 * as referer, and the global cookie header when one is set.
 *
 * @param {object} context - Object exposing `request.get(url, options)`
 *   (presumably a Playwright BrowserContext — confirm against the caller).
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<{buffer: Buffer, contentType: string, finalUrl: string}>}
 *   Response body, its `content-type` header ('' when absent) and the response URL.
 * @throws {Error} When the request fails or the response status is not OK.
 */
async function fetchBinaryWithContext(context, url) {
  const headers = {
    'user-agent': DEFAULT_USER_AGENT,
    'accept': 'image/avif,image/webp,image/apng,image/*,*/*;q=0.8',
    'referer': GEEKTIME_BASE_URL,
    ...(globalCookieHeader ? { 'cookie': globalCookieHeader } : {})
  };
  const response = await context.request.get(url, { headers, failOnStatusCode: true });
  try {
    // Defensive: failOnStatusCode is requested above, but the status is still checked.
    if (!response.ok()) {
      throw new Error(`HTTP ${response.status()} ${response.statusText()}`);
    }
    const buffer = await response.body();
    const headersMap = response.headers();
    return {
      buffer,
      contentType: headersMap['content-type'] || '',
      finalUrl: response.url()
    };
  } finally {
    // Release the stored response body now instead of keeping every downloaded
    // image in memory until the context closes. Cleanup is best-effort: a failed
    // dispose must not turn a successful download into an error.
    await response.dispose().catch(() => {});
  }
}
|
|
645
|
+
|
|
646
|
+
/**
 * Chooses a lower-case file extension (without the dot) for a downloaded image.
 *
 * Preference order: the extension of the URL's pathname, then the extension
 * the mime lookup gives for `contentType`, then the literal 'bin'.
 *
 * @param {string} [resourceUrl=''] - URL the resource was fetched from.
 * @param {string} [contentType=''] - Value of the response `content-type` header.
 * @returns {string} Extension such as 'png', or 'bin' when nothing can be derived.
 */
function determineImageExtension(resourceUrl = '', contentType = '') {
  const fromUrl = () => {
    if (!resourceUrl) return '';
    try {
      // extname yields '' or '.ext'; drop the leading dot.
      return path.extname(new URL(resourceUrl).pathname).slice(1);
    } catch {
      // An unparsable URL contributes no extension.
      return '';
    }
  };
  const fromContentType = () => (contentType ? (mime.extension(contentType) || '').toString() : '');

  const ext = fromUrl() || fromContentType() || 'bin';
  return ext.toLowerCase();
}
|
|
298
664
|
|
|
299
|
-
|
|
665
|
+
/**
 * Downloads one image and stores it in the assets directory.
 *
 * The file is named `article_<NNN>_<hash>.<ext>`, where NNN is the 1-based,
 * zero-padded article number and hash is the first 10 hex characters of the
 * MD5 of the normalised URL (used only to build the name).
 *
 * @param {object} context - Forwarded to fetchBinaryWithContext.
 * @param {string} normalizedUrl - Absolute image URL.
 * @param {string} assetsDir - Directory the image is written to.
 * @param {number} articleIndex - Zero-based index of the owning article.
 * @returns {Promise<{fileUrl: string, localPath: string}>} `file://` URL and filesystem path of the saved image.
 */
async function downloadImageToLocal(context, normalizedUrl, assetsDir, articleIndex) {
  const fetched = await fetchBinaryWithContext(context, normalizedUrl);
  const extension = determineImageExtension(fetched.finalUrl || normalizedUrl, fetched.contentType);

  const digest = crypto.createHash('md5').update(normalizedUrl).digest('hex');
  const sequence = String(articleIndex + 1).padStart(3, '0');
  const localPath = path.join(assetsDir, `article_${sequence}_${digest.slice(0, 10)}.${extension}`);

  await fs.writeFile(localPath, fetched.buffer);

  return {
    fileUrl: pathToFileURL(localPath).href,
    localPath,
  };
}
|
|
300
677
|
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
678
|
+
/**
 * Converts a SameSite value to the lower-case spelling written to the cookie export file.
 *
 * @param {*} value - SameSite value of a cookie.
 * @returns {('strict'|'lax'|'no_restriction'|'unspecified')} Export spelling;
 *   'unspecified' for falsy or unrecognised input.
 */
function mapSameSiteForExport(value) {
  if (!value) return 'unspecified';

  const text = value.toString().toLowerCase();
  // Checked in this order: strict, lax, none.
  const table = [
    ['strict', 'strict'],
    ['lax', 'lax'],
    ['none', 'no_restriction'],
  ];
  const hit = table.find(([needle]) => text.includes(needle));
  return hit ? hit[1] : 'unspecified';
}
|
|
686
|
+
|
|
687
|
+
/**
 * Refreshes the module-level `globalCookieHeader` from the context's current cookies.
 *
 * Best-effort: the header is left untouched when there is no context, when
 * the context reports no cookies, or when reading the cookies fails.
 *
 * @param {object} context - Object exposing an async `cookies()` method.
 * @returns {Promise<void>}
 */
async function updateGlobalCookieHeaderFromContext(context) {
  if (!context) return;
  try {
    const jar = await context.cookies();
    if (!jar || jar.length === 0) return;

    const pairs = [];
    for (const entry of jar) {
      pairs.push(`${entry.name}=${entry.value}`);
    }
    const joined = pairs.join('; ');
    if (joined) {
      globalCookieHeader = joined;
    }
  } catch {
    // Deliberately ignored: a failed refresh keeps the previous header.
  }
}
|
|
702
|
+
|
|
703
|
+
/**
 * Writes the context's current cookies to a JSON file in browser-extension export format.
 *
 * Best-effort: a failure is logged as a warning and never thrown. Nothing is
 * written when the context or target path is missing, or when there are no cookies.
 *
 * @param {object} context - Object exposing an async `cookies()` method.
 * @param {string} targetPath - File to (over)write with the serialised cookies.
 * @returns {Promise<void>}
 */
async function persistCookiesToFile(context, targetPath) {
  if (!context || !targetPath) return;
  try {
    const cookies = await context.cookies();
    if (!cookies || cookies.length === 0) {
      return;
    }
    const serialized = cookies.map((cookie) => {
      // Session cookies are reported with expires = -1. A plain truthiness
      // test would treat -1 as a real expiry, so only a positive number counts.
      const hasExpiry = typeof cookie.expires === 'number' && cookie.expires > 0;
      return {
        domain: cookie.domain,
        expirationDate: hasExpiry ? cookie.expires : undefined,
        hostOnly: !cookie.domain.startsWith('.'),
        httpOnly: cookie.httpOnly,
        name: cookie.name,
        path: cookie.path,
        sameSite: mapSameSiteForExport(cookie.sameSite),
        secure: cookie.secure,
        session: !hasExpiry,
        storeId: '0',
        value: cookie.value
      };
    });
    await fs.writeFile(targetPath, JSON.stringify(serialized, null, 2), 'utf-8');
    console.log(chalk.gray(`🍪 已刷新 Cookie → ${targetPath}`));
  } catch (error) {
    console.log(chalk.yellow(`⚠️  保存 Cookie 失败: ${error.message}`));
  }
}
|
|
304
729
|
|
|
305
|
-
|
|
730
|
+
/**
 * Decodes a base64 `data:` URI and writes it to the assets directory.
 *
 * The file is named `article_<NNN>_inline_<MMM>.<ext>` (NNN = 1-based article
 * number, MMM = inline image index, both zero-padded to three digits).
 *
 * @param {string} dataUri - A `data:<mime>;base64,<payload>` URI.
 * @param {string} assetsDir - Directory the image is written to.
 * @param {number} articleIndex - Zero-based index of the owning article.
 * @param {number} dataIndex - Index of this inline image within the article.
 * @returns {Promise<string|null>} `file://` URL of the written file, or `null` when the
 *   input is not a string, is not a base64 data URI, or decodes to zero bytes.
 */
async function saveDataUriImage(dataUri, assetsDir, articleIndex, dataIndex) {
  if (typeof dataUri !== 'string' || dataUri === '') return null;

  const parsed = /^data:(.+?);base64,(.+)$/i.exec(dataUri);
  if (parsed === null) return null;

  const [, declaredType, payload] = parsed;
  const mimeType = declaredType || 'application/octet-stream';

  let bytes;
  try {
    bytes = Buffer.from(payload, 'base64');
  } catch {
    return null;
  }
  if (!bytes || bytes.length === 0) return null;

  const pad3 = (n) => String(n).padStart(3, '0');
  const ext = mime.extension(mimeType) || 'bin';
  const filepath = path.join(assetsDir, `article_${pad3(articleIndex + 1)}_inline_${pad3(dataIndex)}.${ext}`);

  await fs.writeFile(filepath, bytes);
  return pathToFileURL(filepath).href;
}
|
|
755
|
+
|
|
756
|
+
/**
 * Downloads every remote image referenced by an article and points its `src`
 * at the local copy; inline base64 images are written to disk as well.
 *
 * Remote images are downloaded in batches of EPUB_IMAGE_BATCH_SIZE. A failed
 * download is logged and the original `src` is kept. An inline image that
 * cannot be saved is removed from the markup.
 *
 * @param {object} context - Forwarded to downloadImageToLocal.
 * @param {string} htmlContent - Article HTML.
 * @param {string} assetsDir - Directory that receives the image files.
 * @param {number} articleIndex - Zero-based index of the article (used in file names).
 * @param {Map<string, string>} sharedCache - URL → local `file://` URL map shared across
 *   articles; it is read and updated here so each URL is downloaded once.
 * @returns {Promise<{html: string, replaced: number}>} Rewritten HTML and the number of
 *   images actually stored by this call (successful downloads plus saved inline images;
 *   failed downloads and cache hits are not counted).
 */
async function rewriteImagesWithLocalFiles(context, htmlContent, assetsDir, articleIndex, sharedCache) {
  if (!htmlContent || !htmlContent.includes('<img')) {
    return { html: htmlContent, replaced: 0 };
  }

  const $ = loadHtml(htmlContent, { decodeEntities: false });
  const images = $('img');
  if (images.length === 0) {
    return { html: htmlContent, replaced: 0 };
  }

  // Classify every <img> once: inline data URIs, and remote images with their resolved URL.
  const remoteImages = [];
  const dataUriImages = [];
  const downloadTargets = new Set();

  images.each((_, element) => {
    const src = ($(element).attr('src') || '').trim();
    if (/^data:/i.test(src)) {
      dataUriImages.push({ element, src });
      return;
    }
    const normalizedUrl = resolveImageUrl(src);
    if (!normalizedUrl) {
      return;
    }
    remoteImages.push({ element, normalizedUrl });
    if (!sharedCache.has(normalizedUrl)) {
      downloadTargets.add(normalizedUrl);
    }
  });

  // Download uncached URLs in bounded batches; count only the ones that succeed.
  const queue = [...downloadTargets];
  let downloadedImages = 0;
  for (let i = 0; i < queue.length; i += EPUB_IMAGE_BATCH_SIZE) {
    const batch = queue.slice(i, i + EPUB_IMAGE_BATCH_SIZE).map(async (targetUrl) => {
      try {
        const info = await downloadImageToLocal(context, targetUrl, assetsDir, articleIndex);
        sharedCache.set(targetUrl, info.fileUrl);
        downloadedImages++;
      } catch (error) {
        console.log(chalk.yellow(`  ⚠️ 图片下载失败: ${targetUrl} (${error.message})`));
      }
    });
    await Promise.all(batch);
  }

  // Point each remote image at its local copy when one exists.
  for (const { element, normalizedUrl } of remoteImages) {
    const localUrl = sharedCache.get(normalizedUrl);
    if (localUrl) {
      $(element).attr('src', localUrl);
    }
  }

  let processedInlineImages = 0;
  for (let i = 0; i < dataUriImages.length; i++) {
    const item = dataUriImages[i];
    try {
      const localUrl = await saveDataUriImage(item.src, assetsDir, articleIndex, i);
      if (localUrl) {
        $(item.element).attr('src', localUrl);
        processedInlineImages++;
      } else {
        $(item.element).remove();
      }
    } catch (error) {
      console.log(chalk.yellow(`  ⚠️ 内联图片处理失败: ${error.message}`));
      $(item.element).remove();
    }
  }

  const finalHtml = $.root().html() || htmlContent;

  return {
    html: finalHtml,
    replaced: downloadedImages + processedInlineImages
  };
}
|
|
842
|
+
|
|
843
|
+
/**
 * Runs image localisation over every successfully fetched article.
 *
 * Entries that are missing, unsuccessful or have no content are passed through
 * untouched. An article whose image processing throws is logged and also passed
 * through unchanged. One URL cache is shared across all articles.
 *
 * @param {object} context - Forwarded to rewriteImagesWithLocalFiles.
 * @param {Array<object>} contentResults - Per-article results (`success`, `content`, ...).
 * @param {string} assetsDir - Directory that receives the image files.
 * @returns {Promise<Array<object>>} New array, same order, with `content` replaced where images were rewritten.
 */
async function rewriteEpubContentImages(context, contentResults, assetsDir) {
  const urlCache = new Map();
  let articleCount = 0;
  let imageCount = 0;

  const spinner = ora('正在缓存 EPUB 图片...').start();
  const output = [];

  for (const [index, entry] of contentResults.entries()) {
    const usable = Boolean(entry && entry.success && entry.content);
    if (!usable) {
      output.push(entry);
      continue;
    }

    try {
      const outcome = await rewriteImagesWithLocalFiles(context, entry.content, assetsDir, index, urlCache);
      imageCount += outcome.replaced;
      articleCount += outcome.replaced > 0 ? 1 : 0;
      output.push({ ...entry, content: outcome.html });
    } catch (error) {
      // Pause the spinner so the warning is printed on its own line.
      spinner.stop();
      console.log(chalk.yellow(`⚠️  处理第 ${index + 1} 篇文章图片失败: ${error.message}`));
      spinner.start();
      output.push(entry);
    }
  }

  if (imageCount !== 0) {
    spinner.succeed(`已缓存 EPUB 图片: ${imageCount} 张(${articleCount} 篇文章)`);
  } else {
    spinner.stop();
    console.log(chalk.gray('📷 没有检测到需要缓存的图片'));
  }

  return output;
}
|
|
881
|
+
|
|
882
|
+
/**
 * Creates a uniquely named temporary assets directory under `baseDir`.
 *
 * The name is `<TEMP_ASSET_PREFIX>_<timestamp base36>_<random hex>`.
 *
 * @param {string} baseDir - Parent directory.
 * @returns {Promise<string>} Path of the created directory.
 */
async function createTempAssetsDir(baseDir) {
  const stamp = Date.now().toString(36);
  const salt = Math.random().toString(16).slice(2, 8);
  const tempDir = path.join(baseDir, [TEMP_ASSET_PREFIX, stamp, salt].join('_'));
  await fs.mkdir(tempDir, { recursive: true });
  return tempDir;
}
|
|
887
|
+
|
|
888
|
+
/**
 * Recursively deletes a temporary assets directory.
 *
 * Best-effort: a failure is logged and never thrown; a falsy `dir` is a no-op.
 *
 * @param {string} [dir] - Directory to remove.
 * @returns {Promise<void>}
 */
async function cleanupTempAssetsDir(dir) {
  if (dir) {
    await fs.rm(dir, { recursive: true, force: true }).catch((error) => {
      console.log(chalk.gray(`清理临时目录失败: ${error.message}`));
    });
  }
}
|
|
331
896
|
|
|
332
897
|
async function sanitizeArticleHtml(page, rawHtml) {
|
|
333
|
-
return page.evaluate((html) => {
|
|
898
|
+
return page.evaluate(({ html, removalSelectors, pluginKeywords, mindmapSelectors }) => {
|
|
334
899
|
const template = document.createElement('template');
|
|
335
900
|
template.innerHTML = html;
|
|
336
901
|
|
|
337
|
-
const removalSelectors = [
|
|
338
|
-
'nav', 'header', 'footer', 'aside',
|
|
339
|
-
'.comment', '.comments', '.Index_comment',
|
|
340
|
-
'.recommend', '.recommendation', '.related', '.advertisement', '.ad', '.banner',
|
|
341
|
-
'.subscribe', '.subscription', '.toolbar', '.Index_shareIcons_1vtJa',
|
|
342
|
-
'.keyboard-wrapper', '.app-download', '.article-actions', '.article-bottom',
|
|
343
|
-
'.note', '.notes', '.annotation', '.translation', '.trans', '.translator',
|
|
344
|
-
'.audio', '.audio-player', '.voice', '.player', '.geek-player', '.podcast', '.radio',
|
|
345
|
-
'.reward', '.appreciate', '.appreciation', '.donate', '.sponsor', '.thanks', '.support',
|
|
346
|
-
'.qrcode', '.qr-code', '.qr', '.promotion', '.promo', '.ad-banner',
|
|
347
|
-
'.copyright', '.statement', '.disclaimer',
|
|
348
|
-
'.app-download-banner', '.article-plugin', '.article-notification', '.float-bar',
|
|
349
|
-
'audio', 'video',
|
|
350
|
-
'[class*="Note"]', '[class*="note"]', '[class*="Translation"]', '[class*="translation"]',
|
|
351
|
-
'[class*="Audio"]', '[class*="audio"]', '[class*="Reward"]', '[class*="reward"]',
|
|
352
|
-
'[data-plugin]', '[data-track]', '[data-track-section]', '[data-translation]', '[data-audio]',
|
|
353
|
-
'[data-role="toolbar"]',
|
|
354
|
-
'button', 'iframe', 'script', 'style'
|
|
355
|
-
];
|
|
356
902
|
removalSelectors.forEach(selector => {
|
|
357
903
|
template.content.querySelectorAll(selector).forEach(el => el.remove());
|
|
358
904
|
});
|
|
359
905
|
|
|
360
|
-
const pluginKeywords = [
|
|
361
|
-
'note', 'translation', 'audio', 'player', 'reward', 'donate',
|
|
362
|
-
'appreciation', 'sponsor', 'qrcode', 'toolbar', 'plugin',
|
|
363
|
-
'copyright', 'geeknote', 'bilingual'
|
|
364
|
-
];
|
|
365
906
|
const pluginElements = Array.from(template.content.querySelectorAll('*')).filter(el => {
|
|
366
907
|
const className = (el.className || '').toString().toLowerCase();
|
|
367
908
|
const idValue = (el.id || '').toString().toLowerCase();
|
|
@@ -372,11 +913,6 @@ async function sanitizeArticleHtml(page, rawHtml) {
|
|
|
372
913
|
});
|
|
373
914
|
pluginElements.forEach(el => el.remove());
|
|
374
915
|
|
|
375
|
-
const mindmapSelectors = [
|
|
376
|
-
'.mindmap', '.mind-map', '.MindMap', '.Mind-map',
|
|
377
|
-
'[data-type="mindmap"]', '[data-role="mindmap"]', '[data-widget="mindmap"]',
|
|
378
|
-
'[class*="MindMap"]', '[class*="mindMap"]'
|
|
379
|
-
];
|
|
380
916
|
mindmapSelectors.forEach(selector => {
|
|
381
917
|
template.content.querySelectorAll(selector).forEach(el => el.remove());
|
|
382
918
|
});
|
|
@@ -435,7 +971,16 @@ async function sanitizeArticleHtml(page, rawHtml) {
|
|
|
435
971
|
});
|
|
436
972
|
|
|
437
973
|
return template.innerHTML;
|
|
438
|
-
},
|
|
974
|
+
}, {
|
|
975
|
+
html: rawHtml,
|
|
976
|
+
removalSelectors: ARTICLE_REMOVAL_SELECTORS,
|
|
977
|
+
pluginKeywords: ARTICLE_PLUGIN_KEYWORDS,
|
|
978
|
+
mindmapSelectors: ARTICLE_MINDMAP_SELECTORS
|
|
979
|
+
});
|
|
980
|
+
}
|
|
981
|
+
|
|
982
|
+
/**
 * Collapses every run of whitespace to a single space and trims both ends.
 *
 * @param {string} [text=''] - Text to normalise.
 * @returns {string} The normalised text.
 */
function normalizeTextContent(text = '') {
  const words = text.split(/\s+/).filter((word) => word !== '');
  return words.join(' ');
}
|
|
440
985
|
|
|
441
986
|
function escapeHtml(text = '') {
|
|
@@ -447,59 +992,650 @@ function escapeHtml(text = '') {
|
|
|
447
992
|
.replace(/'/g, ''');
|
|
448
993
|
}
|
|
449
994
|
|
|
450
|
-
function
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
.article-print-wrapper h1 {
|
|
468
|
-
font-size: 32px;
|
|
469
|
-
line-height: 1.4;
|
|
470
|
-
margin-bottom: 24px;
|
|
471
|
-
}
|
|
472
|
-
|
|
473
|
-
a {
|
|
474
|
-
color: #0f5ef2;
|
|
475
|
-
text-decoration: none;
|
|
476
|
-
}
|
|
477
|
-
|
|
478
|
-
pre {
|
|
479
|
-
background: #f7f7f7;
|
|
480
|
-
padding: 16px;
|
|
481
|
-
border-radius: 6px;
|
|
482
|
-
overflow: auto;
|
|
995
|
+
/**
 * Drops the first h1/h2 of the article body when it merely repeats the title.
 *
 * Comparison is done on whitespace-normalised text. Any parsing failure
 * returns the input unchanged.
 *
 * @param {string} html - Article HTML.
 * @param {string} [title=''] - Article title.
 * @returns {string} The re-serialised HTML, or the original `html` when either argument
 *   is empty, the title normalises to '', or parsing fails.
 */
function removeDuplicateTitle(html, title = '') {
  const wanted = html && title ? normalizeTextContent(title) : '';
  if (!wanted) {
    return html;
  }

  try {
    const $ = loadHtml(html, { decodeEntities: false });
    const heading = $('h1, h2').first();
    // `wanted` is non-empty here, so equality also implies a non-empty heading.
    const repeatsTitle = heading.length > 0 && normalizeTextContent(heading.text()) === wanted;
    if (repeatsTitle) {
      heading.remove();
    }
    return $.root().html() || html;
  } catch {
    return html;
  }
}
|
|
485
1017
|
|
|
1018
|
+
function buildPdfHtml(title, sanitizedHtml, articleMeta = '') {
|
|
486
1019
|
return `
|
|
487
1020
|
<!DOCTYPE html>
|
|
488
1021
|
<html lang="zh-CN">
|
|
489
1022
|
<head>
|
|
490
1023
|
<meta charset="utf-8">
|
|
491
1024
|
<base href="${GEEKTIME_BASE_URL}">
|
|
492
|
-
<style>${
|
|
1025
|
+
<style>${PDF_BASE_CSS}${PRINT_FIX_CSS}${CODE_HIGHLIGHT_CSS}</style>
|
|
493
1026
|
</head>
|
|
494
1027
|
<body>
|
|
495
|
-
<
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
</div
|
|
1028
|
+
<article class="article-pdf-wrapper">
|
|
1029
|
+
<section class="article-content">
|
|
1030
|
+
<h1 class="article-title">${escapeHtml(title)}</h1>
|
|
1031
|
+
${articleMeta ? `<div class="article-meta">${escapeHtml(articleMeta)}</div>` : ''}
|
|
1032
|
+
${sanitizedHtml}
|
|
1033
|
+
</section>
|
|
1034
|
+
</article>
|
|
499
1035
|
</body>
|
|
500
1036
|
</html>`;
|
|
501
1037
|
}
|
|
502
1038
|
|
|
1039
|
+
/**
 * Normalise code snippets in article HTML so that every code block ends up as
 * `<pre class="code-block"><code>…</code></pre>`.
 *
 * The function runs several heuristic passes over the parsed document:
 *   1. block-like `<code>` elements outside any `<pre>` are wrapped in a `<pre>`;
 *   2. every `<pre>` gets the `code-block` class and an inner `<code>`;
 *   3. elements whose class hints at code are wrapped when their text looks like code;
 *   4. `<figure>` elements containing only a `<pre>` are unwrapped;
 *   5. highlighter / scroll containers are flattened into plain-text code blocks;
 *   6. leftover simplebar scaffolding is removed.
 *
 * Fixes compared with the previous version: passes 1 and 3 used to re-wrap
 * elements that already live inside a `<pre>` (pass 1 compared the parent tag
 * name against upper-case 'PRE', which the rest of this function treats as
 * case-variable), producing nested `<pre>` elements.
 *
 * @param {string} html - Article HTML fragment.
 * @returns {string} Transformed HTML, or the input unchanged when it is falsy
 *   or when any step throws.
 */
function enhanceCodeBlocks(html) {
  if (!html) return html;
  try {
    const $ = loadHtml(html, { decodeEntities: false });

    // True when the node is a <pre> or has a <pre> ancestor.
    const isInsidePre = ($node) => $node.closest('pre').length > 0;

    // Replace $source with <pre class="code-block"><code>innerHtml</code></pre>.
    const wrapCodeElement = ($source, innerHtml) => {
      const wrapper = $('<pre class="code-block"></pre>');
      const codeEl = $('<code></code>').html(innerHtml);
      wrapper.append(codeEl);
      $source.replaceWith(wrapper);
    };

    // Pass 1: promote multi-line / long <code> elements to block level.
    $('code').each((_, element) => {
      const $el = $(element);
      const parent = $el.parent();
      const text = $el.text() || '';
      const markup = $el.html() || '';
      const isBlocky = text.includes('\n') || text.length > 120 || markup.includes('<br');
      // Skip code that already sits inside a <pre>; wrapping it again would nest <pre> elements.
      if (isBlocky && parent.length && !isInsidePre($el)) {
        wrapCodeElement($el, markup);
      }
    });

    // Pass 2: make every <pre> a styled code block with an inner <code>.
    $('pre').each((_, element) => {
      const $el = $(element);
      if (!$el.hasClass('code-block')) {
        $el.addClass('code-block');
      }
      if ($el.find('code').length === 0) {
        const text = $el.html();
        $el.empty().append($('<code></code>').html(text));
      }
    });

    // Pass 3: wrap elements whose class name suggests they hold code.
    const codeLikeSelectors = [
      '[class*="code"]',
      '[class*="Code"]',
      '[class*="code-block"]',
      '[class*="CodeBlock"]',
      '[class*="hljs"]',
      '[class*="language-"]',
      '.highlight',
      '.prism-code'
    ];
    const blockTags = ['P', 'DIV', 'SECTION', 'ARTICLE', 'UL', 'OL', 'TABLE', 'IMG', 'FIGURE'];
    const isLikelyCodeText = (text = '') => {
      const trimmed = text.trim();
      if (trimmed.length === 0) return false;
      if (trimmed.length > 1200) return false;
      return trimmed.includes('\n') || trimmed.includes('{') || trimmed.includes(';') || trimmed.includes(' ');
    };
    $(codeLikeSelectors.join(',')).each((_, element) => {
      const $el = $(element);
      // Leave anything that is, contains, or is nested in a <pre> alone
      // (e.g. highlighter spans inside an existing code block).
      if (isInsidePre($el) || $el.find('pre').length > 0) {
        return;
      }
      const hasBlockChildren = blockTags.some(tag => $el.find(tag).length > 0);
      if (hasBlockChildren) {
        return;
      }
      const text = $el.text() || '';
      if (!isLikelyCodeText(text)) {
        return;
      }
      wrapCodeElement($el, $el.html());
    });

    // Pass 4: unwrap <figure> elements whose only child is a <pre>.
    $('figure').each((_, element) => {
      const $el = $(element);
      if ($el.find('pre').length === 1 && $el.children().length === 1) {
        $el.replaceWith($el.find('pre').first());
      }
    });

    // Pass 5: flatten highlighter / scroll containers into plain-text code blocks.
    const highlightSelectors = [
      '[class*="hljs"]',
      '[class*="language-"]',
      '.simplebar-content',
      '[data-language]',
      '[data-code-block]',
      '[class*="RichContent"]'
    ];
    const containerClassHints = ['simplebar', 'code', 'hljs', 'prism', 'syntax', 'monaco', 'ace', 'terminal', 'shell'];
    const containerStyleHints = ['white-space: pre', 'white-space:pre', 'font-family: monospace', 'font-family:monospace'];
    const inlineTags = new Set(['span', 'code', 'em', 'strong', 'b', 'i', 'u', 'a', 'label']);
    const newlineTags = new Set(['DIV', 'P', 'LI', 'SECTION', 'ARTICLE', 'FIGURE', 'PRE', 'CODE', 'BR', 'TR', 'TD', 'TH']);

    // Any multi-line text, or single-line text containing a code-ish token, counts as code.
    const looksLikeCodeBlock = (text = '') => {
      if (!text) return false;
      const trimmed = text.trim();
      if (!trimmed) return false;
      if (trimmed.includes('\n')) return true;
      const keywords = ['{', '}', ';', '=>', '->', '#!', 'SELECT ', 'INSERT ', 'docker ', 'kubectl ', 'sudo ', 'printf', 'def ', 'class ', 'function ', 'const ', 'let ', 'var ', 'public ', 'private ', 'import ', 'package ', 'namespace ', 'http '];
      return keywords.some(keyword => trimmed.includes(keyword));
    };

    // Collect text recursively, appending '\n' after each element listed in newlineTags.
    const getTextWithBreaks = (node) => {
      if (!node) return '';
      if (node.type === 'text') {
        return node.data || '';
      }
      const endsLine = newlineTags.has((node.tagName || node.name || '').toUpperCase());
      if (!node.children || node.children.length === 0) {
        return endsLine ? '\n' : '';
      }
      let text = '';
      for (const child of node.children) {
        text += getTextWithBreaks(child);
      }
      if (endsLine) {
        text += '\n';
      }
      return text;
    };

    // Normalise line endings, NBSPs and tabs; strip trailing whitespace; drop
    // leading/trailing blank lines; collapse runs of blank lines into one.
    const normalizeCodeText = (text = '') => {
      const lines = text
        .replace(/\r\n?/g, '\n')
        .split('\n')
        .map(line => line.replace(/\u00a0/g, ' ').replace(/\t/g, ' ').replace(/\s+$/, ''));
      while (lines.length && !lines[0].trim()) {
        lines.shift();
      }
      while (lines.length && !lines[lines.length - 1].trim()) {
        lines.pop();
      }
      const result = [];
      let previousBlank = false;
      for (const line of lines) {
        const isBlank = line.trim().length === 0;
        if (isBlank && previousBlank) {
          continue;
        }
        result.push(line);
        previousBlank = isBlank;
      }
      return result.join('\n').trim();
    };

    // Replace $target with a plain-text code block; returns whether it was replaced.
    const convertToCodeBlock = ($target) => {
      if (!$target || !$target.length) {
        return false;
      }
      const rawText = getTextWithBreaks($target[0]) || '';
      const normalized = normalizeCodeText(rawText);
      if (!looksLikeCodeBlock(normalized)) {
        return false;
      }
      const $pre = $('<pre class="code-block"></pre>');
      const $code = $('<code></code>').text(normalized);
      $pre.append($code);
      $target.replaceWith($pre);
      return true;
    };

    const processedCandidates = new Set();
    $(highlightSelectors.join(',')).each((_, node) => {
      const $start = $(node);
      if (!$start || !$start.length) {
        return;
      }
      // Walk up at most 8 levels; each later match overwrites the previous
      // one, so the outermost hinted non-inline ancestor wins.
      let $candidate = null;
      let $current = $start;
      for (let depth = 0; depth < 8 && $current && $current.length; depth++) {
        const rawTag = ($current[0]?.tagName || $current[0]?.name || '').toLowerCase();
        const classAttr = ($current.attr('class') || '').toLowerCase();
        const styleAttr = ($current.attr('style') || '').toLowerCase();
        const hasClassHint = containerClassHints.some(keyword => classAttr.includes(keyword));
        const hasStyleHint = containerStyleHints.some(keyword => styleAttr.includes(keyword));
        if (!inlineTags.has(rawTag) && (hasClassHint || hasStyleHint)) {
          $candidate = $current;
        }
        $current = $current.parent();
      }
      if (!$candidate || !$candidate.length || $candidate.is('pre')) {
        return;
      }
      const key = $candidate[0];
      if (processedCandidates.has(key)) {
        return;
      }
      if (convertToCodeBlock($candidate)) {
        processedCandidates.add(key);
      }
    });

    // Pass 6: strip simplebar scaffolding that wraps a code block or holds no text.
    const simplebarWrappers = [
      '.simplebar-wrapper',
      '.simplebar-height-auto-observer-wrapper',
      '.simplebar-height-auto-observer',
      '.simplebar-mask',
      '.simplebar-offset',
      '.simplebar-content-wrapper',
      '.simplebar-placeholder'
    ];
    simplebarWrappers.forEach(selector => {
      $(selector).each((_, element) => {
        const $el = $(element);
        if ($el.find('pre.code-block').length > 0 || !$el.text().trim()) {
          $el.replaceWith($el.contents());
        }
      });
    });
    $('.simplebar-track, .simplebar-scrollbar').remove();

    return $.root().html() || html;
  } catch {
    // Best effort: on any failure hand back the untouched input.
    return html;
  }
}
|
|
1240
|
+
|
|
1241
|
+
/**
 * Scan the visible text of the current page for login / paywall / permission
 * prompts.
 *
 * @param {object} page - Browser page exposing `evaluate(fn)`.
 * @returns {Promise<string|null>} The message of the first rule whose keyword
 *   appears in the page text, or null when the page has no text or no rule matches.
 */
async function detectAccessIssuesOnPage(page) {
  // The callback runs in the page context, so it must stay self-contained.
  return page.evaluate(() => {
    const rawText = (document.body && document.body.innerText) || '';
    const collapsed = rawText.replace(/\s+/g, ' ').trim();
    if (!collapsed) {
      return null;
    }

    // Rules are evaluated in order; the first one with any matching keyword wins.
    const rules = [
      {
        keywords: ['请先登录', '重新登录', '立即登录', '登录后'],
        message: '页面提示需要登录,Cookie 可能已失效或未正确导入'
      },
      {
        keywords: ['试看结束', '购买专栏', '立即订阅', '购买课程', '仅对付费用户开放', '开通会员'],
        message: '检测到购买/试看提示,可能未订阅该专栏或 Cookie 已失效'
      },
      {
        keywords: ['暂无权限', '没有权限', '权限不足'],
        message: '账号没有访问该专栏的权限'
      }
    ];

    const haystack = collapsed.toLowerCase();
    const matched = rules.find(rule =>
      rule.keywords.some(word => haystack.includes(word.toLowerCase()))
    );
    return matched ? matched.message : null;
  });
}
|
|
1278
|
+
|
|
1279
|
+
/**
 * Poll the page until one of ARTICLE_CONTENT_SELECTORS matches an element.
 *
 * Selectors are tried in list order on every round; rounds are separated by a
 * 300 ms pause.
 *
 * @param {object} page - Browser page exposing `$` and `waitForTimeout`.
 * @param {number} [timeout=60000] - Polling budget in milliseconds.
 * @returns {Promise<string|null>} The first selector that matched, or null
 *   once the budget is used up.
 */
async function waitForArticleContentSelector(page, timeout = 60000) {
  const startedAt = Date.now();
  const elapsed = () => Date.now() - startedAt;

  while (elapsed() < timeout) {
    for (const candidate of ARTICLE_CONTENT_SELECTORS) {
      const found = await page.$(candidate);
      if (!found) {
        continue;
      }
      // Only the selector is needed; release the element handle right away.
      await found.dispose();
      return candidate;
    }
    await page.waitForTimeout(300);
  }

  return null;
}
|
|
1293
|
+
|
|
1294
|
+
/**
 * Scroll the page downwards in fixed steps, then jump back to the top.
 *
 * Scrolling stops as soon as the viewport bottom is within 50px of
 * `document.body.scrollHeight` or after `maxIterations` steps.
 *
 * @param {object} page - Browser page exposing `evaluate(fn, arg)`.
 * @param {object} [options]
 * @param {number} [options.step=400] - Pixels scrolled per tick.
 * @param {number} [options.delay=120] - Milliseconds between ticks.
 * @param {number} [options.maxIterations=80] - Upper bound on the number of ticks.
 * @returns {Promise<void>}
 */
async function autoScrollArticle(page, { step = 400, delay = 120, maxIterations = 80 } = {}) {
  const scrollOptions = { step, delay, maxIterations };

  // The callback runs in the page context, so it only uses its own argument.
  await page.evaluate((opts) => new Promise((done) => {
    let ticks = 0;
    const handle = setInterval(() => {
      window.scrollBy(0, opts.step);
      ticks += 1;

      const viewportBottom = window.scrollY + window.innerHeight;
      const atBottom = viewportBottom >= document.body.scrollHeight - 50;
      if (!atBottom && ticks < opts.maxIterations) {
        return;
      }

      clearInterval(handle);
      window.scrollTo(0, 0);
      done();
    }, opts.delay);
  }), scrollOptions);
}
|
|
1311
|
+
|
|
1312
|
+
/**
 * Open an article page in the browser and return its cleaned body HTML.
 *
 * Steps: navigate, wait (best effort) for network idle, scroll through the
 * page, locate the content container, clone it in the page context while
 * stripping share/audio/comment widgets and fixing image URLs, then run the
 * result through normalizeArticleHtml, sanitizeArticleHtml,
 * removeDuplicateTitle and enhanceCodeBlocks.
 *
 * Wrapped navigation / extraction errors keep the underlying exception on
 * `error.cause` so the original stack is not lost.
 *
 * @param {object} page - Browser page.
 * @param {{id: (number|string), url?: string, title?: string, originalTitle?: string}} article
 *   Article descriptor; `url` is used when present, otherwise a URL is built from `id`.
 * @param {number} [timeout=60000] - Timeout in ms for navigation and for locating the content.
 * @returns {Promise<string>} Cleaned article HTML.
 * @throws {Error} When navigation fails, the response is not OK, the content
 *   cannot be located or read, or the content is (nearly) empty. When the page
 *   shows an access prompt, that prompt's message is thrown instead.
 */
async function fetchArticleContentFromPage(page, article, timeout = 60000) {
  const targetUrl = article.url || `${GEEKTIME_BASE_URL}/column/article/${article.id}`;
  let response;
  try {
    response = await page.goto(targetUrl, { waitUntil: 'domcontentloaded', timeout });
  } catch (error) {
    throw new Error(`页面加载失败: ${error.message}`, { cause: error });
  }

  if (response && !response.ok()) {
    throw new Error(`页面响应异常: HTTP ${response.status()} ${response.statusText()}`);
  }

  try {
    await page.waitForLoadState('networkidle', { timeout: Math.min(10000, timeout) });
  } catch {
    // Some pages never go network-idle; a timeout here is deliberately ignored.
  }

  await autoScrollArticle(page);
  await page.waitForTimeout(500);

  const selector = await waitForArticleContentSelector(page, timeout);
  if (!selector) {
    // Prefer a specific access-related message over the generic one.
    const issue = await detectAccessIssuesOnPage(page);
    if (issue) {
      throw new Error(issue);
    }
    throw new Error('未能定位到文章正文,请重试或检查 Cookie 是否有效');
  }

  let extraction;
  try {
    // The callback runs in the page context, so it must stay self-contained.
    extraction = await page.$eval(selector, (el) => {
      // Work on a clone so the live page is left untouched.
      const clone = el.cloneNode(true);
      const removalSelectors = [
        '.article-share',
        '.article-actions',
        '.article-copyright',
        '.article-bottom',
        '.reward',
        '.share',
        '.Index_recommend',
        '.recommend',
        '.audio-player',
        '.AudioPlayer',
        '.voice-player',
        '.VoicePlayer',
        '.audio-wrapper',
        '.AudioWrapper',
        '.geek-player',
        '.Player',
        '.plugin',
        '.Plugin',
        '[data-widget="audio"]',
        '[data-widget="Audio"]',
        '[data-role="audio"]',
        '.comment-area',
        '.CommentArea',
        '.comment-wrapper',
        '.CommentWrapper',
        '#comments',
        '#comment',
        '.comments',
        '.Comments'
      ];
      removalSelectors.forEach(sel => {
        clone.querySelectorAll(sel).forEach(node => node.remove());
      });

      // Resolve a URL-ish attribute value to an absolute URL; '' means "unusable".
      const toAbsoluteUrl = (value) => {
        if (!value || typeof value !== 'string') {
          return '';
        }
        const trimmed = value.trim();
        if (!trimmed) {
          return '';
        }
        // blob: URLs are rejected as unusable.
        if (trimmed.startsWith('blob:')) {
          return '';
        }
        if (trimmed.startsWith('data:')) {
          return trimmed;
        }
        if (/^https?:/i.test(trimmed)) {
          return trimmed;
        }
        if (trimmed.startsWith('//')) {
          return `${location.protocol}${trimmed}`;
        }
        try {
          const url = new URL(trimmed, location.href);
          return url.href;
        } catch {
          return '';
        }
      };

      // Alternate image-source attributes, tried in order when `src` is unusable.
      const imageFallbackAttrs = [
        'data-src',
        'data-original',
        'data-actualsrc',
        'data-url',
        'data-image',
        'data-origin',
        'data-thumbnail',
        'data-bigimgsrc',
        'data-download',
        'data-href'
      ];

      clone.querySelectorAll('img').forEach(img => {
        let finalSrc = toAbsoluteUrl(img.getAttribute('src'));
        if (!finalSrc) {
          for (const attr of imageFallbackAttrs) {
            const candidate = toAbsoluteUrl(img.getAttribute(attr));
            if (candidate) {
              finalSrc = candidate;
              break;
            }
          }
        }

        // Images without any usable source are dropped.
        if (!finalSrc) {
          img.remove();
        } else {
          img.setAttribute('src', finalSrc);
        }
      });

      const textLength = clone.innerText ? clone.innerText.trim().length : 0;
      return {
        html: clone.innerHTML,
        textLength
      };
    });
  } catch (error) {
    throw new Error(`读取文章内容失败: ${error.message}`, { cause: error });
  }

  // Fewer than 20 characters of text is treated as "no real content".
  if (!extraction || !extraction.html || extraction.textLength < 20) {
    const issue = await detectAccessIssuesOnPage(page);
    if (issue) {
      throw new Error(issue);
    }
    throw new Error('正文内容为空,可能是 Cookie 失效或只获取到试看内容');
  }

  const normalizedHtml = normalizeArticleHtml(extraction.html);
  const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);

  if (!sanitizedHtml || sanitizedHtml.trim().length === 0) {
    throw new Error('正文清洗后为空,可能是页面结构变化');
  }

  const cleaned = removeDuplicateTitle(sanitizedHtml, article.originalTitle || article.title || '');
  return enhanceCodeBlocks(cleaned);
}
|
|
1470
|
+
|
|
1471
|
+
/**
 * Decide whether a content-fetch failure is worth retrying.
 *
 * An empty message counts as retryable. Otherwise the lower-cased message is
 * checked against a list of login / subscription / permission markers; any
 * hit makes the error non-retryable.
 *
 * @param {string} [message=''] - Error message to classify.
 * @returns {boolean} True when the error should be retried.
 */
function isRetryableContentError(message = '') {
  if (!message) {
    return true;
  }

  const haystack = message.toLowerCase();
  const fatalMarkers = [
    'cookie', '登录', '登陆', '订阅', '试看', '权限', '购买', '未授权', '无权限'
  ];

  for (const marker of fatalMarkers) {
    if (haystack.includes(marker)) {
      return false;
    }
  }
  return true;
}
|
|
1479
|
+
|
|
1480
|
+
/**
 * Fetch an article's content, retrying failures that look transient.
 *
 * Each retry is preceded by a reset of the page to about:blank and a pause of
 * `delayMs * attempt` ms. Errors classified as non-retryable by
 * isRetryableContentError, and the failure of the final attempt, are rethrown
 * unchanged.
 *
 * @param {object} page - Browser page.
 * @param {object} article - Article descriptor passed to fetchArticleContentFromPage.
 * @param {object} [options]
 * @param {number} [options.timeout=60000] - Per-attempt timeout in ms.
 * @param {number} [options.maxAttempts=3] - Maximum number of attempts.
 * @param {number} [options.delayMs=1500] - Base back-off delay in ms.
 * @returns {Promise<string>} Cleaned article HTML.
 * @throws {Error} The last error seen, or a generic error if no attempt ran.
 */
async function fetchArticleContentWithRetry(page, article, options = {}) {
  const { timeout = 60000, maxAttempts = 3, delayMs = 1500 } = options;

  let lastError = null;
  let attemptNo = 1;

  while (attemptNo <= maxAttempts) {
    try {
      // Short settle pause before every attempt after the first.
      if (attemptNo > 1) {
        await page.waitForTimeout(400);
      }
      return await fetchArticleContentFromPage(page, article, timeout);
    } catch (failure) {
      lastError = failure;
      const reason = failure?.message || '';
      const outOfAttempts = attemptNo === maxAttempts;
      if (outOfAttempts || !isRetryableContentError(reason)) {
        throw failure;
      }

      const backoff = delayMs * attemptNo;
      if (process.env.DEBUG) {
        console.log(chalk.gray(`重试文章 ${article.id} (第${attemptNo}次失败: ${reason}),等待 ${backoff}ms`));
      }

      try {
        await page.goto('about:blank', { waitUntil: 'domcontentloaded', timeout: 5000 });
      } catch {
        // Resetting the page is best effort; ignore failures.
      }
      await page.waitForTimeout(backoff);
    }
    attemptNo += 1;
  }

  throw lastError || new Error('无法获取文章内容');
}
|
|
1516
|
+
|
|
1517
|
+
/**
 * Build an article list by scraping article links from the current page.
 *
 * Anchors are gathered selector by selector (each element once), then reduced
 * to one entry per numeric article id found in the href.
 *
 * @param {object} page - Browser page exposing `evaluate(fn, arg)`.
 * @returns {Promise<Array<{id: number, article_title: string,
 *   article_sharetitle: string, url: string, section_name: string,
 *   chapter_index: number, originalIndex: number}>>} Articles in collection
 *   order; empty when no matching link exists. The two index fields refer to
 *   the anchor's position in the collected anchor list.
 */
async function extractArticlesFromPageDom(page) {
  // The callback runs in the page context, so it only uses its own argument.
  return page.evaluate((baseUrl) => {
    const linkSelectors = [
      '[class*="catalog"] a[href*="/column/article/"]',
      '[class*="directory"] a[href*="/column/article/"]',
      '[class*="Catalogue"] a[href*="/column/article/"]',
      '[class*="Catalog"] a[href*="/column/article/"]',
      'nav a[href*="/column/article/"]',
      'a[href*="/column/article/"]'
    ];

    // A Set keeps insertion order and drops elements matched by several selectors.
    const uniqueAnchors = new Set();
    for (const linkSelector of linkSelectors) {
      document.querySelectorAll(linkSelector).forEach(node => uniqueAnchors.add(node));
    }
    if (uniqueAnchors.size === 0) {
      return [];
    }

    const collapse = (text) => (text || '').replace(/\s+/g, ' ').trim();
    const sectionSelector = '[data-section],[data-chapter],[class*="section"],[class*="Section"],[class*="chapter"],[class*="Chapter"]';
    const knownIds = new Set();
    const found = [];

    for (const [position, anchor] of Array.from(uniqueAnchors).entries()) {
      const href = anchor.getAttribute('href') || '';
      const idMatch = /column\/article\/(\d+)/i.exec(href);
      if (!idMatch) {
        continue;
      }

      const id = parseInt(idMatch[1], 10);
      if (!id || knownIds.has(id)) {
        continue;
      }
      knownIds.add(id);

      // Title: anchor text first, then a nested title-like node, then a placeholder.
      let title = collapse(anchor.innerText || anchor.textContent || anchor.getAttribute('title') || '');
      if (!title) {
        const titleNode = anchor.querySelector('[class*="title"], span, div');
        if (titleNode) {
          title = collapse(titleNode.textContent);
        }
      }
      title = title || `文章_${id}`;

      let absoluteUrl;
      try {
        absoluteUrl = new URL(href, baseUrl).toString();
      } catch {
        // Fall back to manual joining for root-relative hrefs; otherwise keep the raw href.
        absoluteUrl = href.startsWith('/')
          ? `${baseUrl.replace(/\/$/, '')}${href}`
          : href;
      }

      const sectionNode = anchor.closest(sectionSelector);
      const sectionName = sectionNode
        ? collapse(
          sectionNode.getAttribute('data-section') ||
          sectionNode.getAttribute('data-chapter') ||
          sectionNode.getAttribute('data-title') ||
          sectionNode.querySelector('h2, h3, h4, .title, .section-title')?.textContent ||
          ''
        )
        : '';

      found.push({
        id,
        article_title: title,
        article_sharetitle: title,
        url: absoluteUrl,
        section_name: sectionName,
        chapter_index: position + 1,
        originalIndex: position
      });
    }

    return found;
  }, GEEKTIME_BASE_URL);
}
|
|
1608
|
+
|
|
1609
|
+
/**
 * Read the column author's name from the current page.
 *
 * Tries a list of author-related selectors in order and returns the first
 * non-blank text; falls back to `<meta name="author">`.
 *
 * @param {object} page - Browser page exposing `evaluate(fn)`.
 * @returns {Promise<string|null>} Trimmed author name, or null when nothing
 *   is found or the page evaluation fails.
 */
async function extractColumnAuthorFromPage(page) {
  try {
    // The callback runs in the page context, so it must stay self-contained.
    return await page.evaluate(() => {
      const authorSelectors = [
        '.author-name',
        '.author',
        '.teacher-name',
        '.lecturer-name',
        '.Index_teacherName',
        '.ProductHeader_teacherName',
        '.ColumnIntro_teacher__name',
        '.ColumnIntro_author__name'
      ];

      for (const authorSelector of authorSelectors) {
        const node = document.querySelector(authorSelector);
        const name = node && node.textContent ? node.textContent.trim() : '';
        if (name) {
          return name;
        }
      }

      const metaTag = document.querySelector('meta[name="author"]');
      if (!metaTag || !metaTag.content) {
        return null;
      }
      return metaTag.content.trim();
    });
  } catch {
    // Author lookup is optional; any evaluation failure yields null.
    return null;
  }
}
|
|
1638
|
+
|
|
503
1639
|
// 获取专栏所有文章列表(通过API)
|
|
504
1640
|
function getValueByPath(obj, path) {
|
|
505
1641
|
if (!obj || !path) return undefined;
|
|
@@ -602,45 +1738,53 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
|
|
|
602
1738
|
let columnInfoHandler = null;
|
|
603
1739
|
|
|
604
1740
|
// 用于同步的 Promise
|
|
605
|
-
const articlesPromise =
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
1741
|
+
const articlesPromise = Promise.race([
|
|
1742
|
+
new Promise((resolve) => {
|
|
1743
|
+
articlesHandler = async (response) => {
|
|
1744
|
+
const url = response.url();
|
|
1745
|
+
// 监听文章列表 API
|
|
1746
|
+
if (url.includes('/serv/v1/column/articles')) {
|
|
1747
|
+
try {
|
|
1748
|
+
const data = await response.json();
|
|
1749
|
+
if (process.env.DEBUG) {
|
|
1750
|
+
console.log(chalk.gray('\n收到文章列表API响应'));
|
|
1751
|
+
}
|
|
1752
|
+
resolve(data);
|
|
1753
|
+
} catch (e) {
|
|
1754
|
+
console.error('解析文章列表API失败:', e);
|
|
1755
|
+
resolve(null);
|
|
614
1756
|
}
|
|
615
|
-
resolve(data);
|
|
616
|
-
} catch (e) {
|
|
617
|
-
console.error('解析文章列表API失败:', e);
|
|
618
1757
|
}
|
|
619
|
-
}
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
url.includes('/serv/v1/column/
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
1758
|
+
};
|
|
1759
|
+
page.on('response', articlesHandler);
|
|
1760
|
+
}),
|
|
1761
|
+
new Promise(resolve => setTimeout(() => resolve(null), 30000))
|
|
1762
|
+
]);
|
|
1763
|
+
|
|
1764
|
+
const columnInfoPromise = Promise.race([
|
|
1765
|
+
new Promise((resolve) => {
|
|
1766
|
+
columnInfoHandler = async (response) => {
|
|
1767
|
+
const url = response.url();
|
|
1768
|
+
// 监听专栏详情相关的 API
|
|
1769
|
+
if (url.includes('/serv/v1/column/intro') ||
|
|
1770
|
+
url.includes('/serv/v3/column/info') ||
|
|
1771
|
+
url.includes('/serv/v1/column/detail')) {
|
|
1772
|
+
try {
|
|
1773
|
+
const data = await response.json();
|
|
1774
|
+
if (process.env.DEBUG) {
|
|
1775
|
+
console.log(chalk.gray(`收到专栏信息API响应: ${url}`));
|
|
1776
|
+
}
|
|
1777
|
+
resolve(data);
|
|
1778
|
+
} catch (e) {
|
|
1779
|
+
console.error('解析专栏信息API失败:', e);
|
|
1780
|
+
resolve(null);
|
|
635
1781
|
}
|
|
636
|
-
resolve(data);
|
|
637
|
-
} catch (e) {
|
|
638
|
-
console.error('解析专栏信息API失败:', e);
|
|
639
1782
|
}
|
|
640
|
-
}
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
1783
|
+
};
|
|
1784
|
+
page.on('response', columnInfoHandler);
|
|
1785
|
+
}),
|
|
1786
|
+
new Promise(resolve => setTimeout(() => resolve(null), 5000))
|
|
1787
|
+
]);
|
|
644
1788
|
|
|
645
1789
|
try {
|
|
646
1790
|
// 先设置监听器,再访问页面
|
|
@@ -649,23 +1793,13 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
|
|
|
649
1793
|
|
|
650
1794
|
spinner.text = '正在获取文章列表...';
|
|
651
1795
|
|
|
652
|
-
// 等待文章列表 API
|
|
653
|
-
articlesData = await
|
|
654
|
-
articlesPromise,
|
|
655
|
-
new Promise((_, reject) => setTimeout(() => reject(new Error('文章列表API调用超时')), 30000))
|
|
656
|
-
]);
|
|
1796
|
+
// 等待文章列表 API(如果失败将返回 null)
|
|
1797
|
+
articlesData = await articlesPromise;
|
|
657
1798
|
|
|
658
|
-
// 尝试等待专栏信息 API
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
new Promise((_, reject) => setTimeout(() => reject(new Error('timeout')), 5000))
|
|
663
|
-
]);
|
|
664
|
-
} catch (e) {
|
|
665
|
-
// 获取专栏信息失败不是致命错误
|
|
666
|
-
if (process.env.DEBUG) {
|
|
667
|
-
console.log(chalk.gray('未获取到专栏信息API响应(将使用其他方法)'));
|
|
668
|
-
}
|
|
1799
|
+
// 尝试等待专栏信息 API(可选)
|
|
1800
|
+
columnInfoData = await columnInfoPromise;
|
|
1801
|
+
if (!columnInfoData && process.env.DEBUG) {
|
|
1802
|
+
console.log(chalk.gray('未获取到专栏信息API响应(将使用其他方法)'));
|
|
669
1803
|
}
|
|
670
1804
|
|
|
671
1805
|
} catch (error) {
|
|
@@ -694,32 +1828,47 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
|
|
|
694
1828
|
}
|
|
695
1829
|
}
|
|
696
1830
|
|
|
697
|
-
|
|
698
|
-
|
|
1831
|
+
let useDomExtraction = false;
|
|
1832
|
+
let domArticles = [];
|
|
1833
|
+
|
|
1834
|
+
if (!articlesData || !articlesData.data || !Array.isArray(articlesData.data.list) || articlesData.data.list.length === 0) {
|
|
1835
|
+
spinner.text = 'API 不可用,尝试从页面解析文章列表...';
|
|
1836
|
+
try {
|
|
1837
|
+
domArticles = await extractArticlesFromPageDom(page);
|
|
1838
|
+
} catch (error) {
|
|
1839
|
+
if (process.env.DEBUG) {
|
|
1840
|
+
console.log(chalk.gray(`DOM文章提取失败: ${error.message}`));
|
|
1841
|
+
}
|
|
1842
|
+
}
|
|
1843
|
+
|
|
1844
|
+
if (!domArticles || domArticles.length === 0) {
|
|
1845
|
+
spinner.fail('无法获取文章列表');
|
|
1846
|
+
|
|
1847
|
+
if (!articlesData) {
|
|
1848
|
+
console.log(chalk.yellow('\n⚠️ 未能从接口或页面获取文章列表\n'));
|
|
1849
|
+
console.log(chalk.cyan('可能的原因:'));
|
|
1850
|
+
console.log(chalk.gray(' 1. Cookie 已过期或无效 - 请重新获取 Cookie'));
|
|
1851
|
+
console.log(chalk.gray(' 2. 页面结构发生变化 - 请联系开发者更新解析逻辑'));
|
|
1852
|
+
console.log(chalk.gray(' 3. 网络连接问题或URL无效\n'));
|
|
1853
|
+
} else if (articlesData.code === -3000 || articlesData.code === -3001) {
|
|
1854
|
+
console.log(chalk.red('\n❌ Cookie 已失效\n'));
|
|
1855
|
+
console.log(chalk.cyan('📖 请重新获取 Cookie:'));
|
|
1856
|
+
console.log(chalk.gray(' 1. 浏览器登录极客时间'));
|
|
1857
|
+
console.log(chalk.gray(' 2. 按 F12 打开开发者工具'));
|
|
1858
|
+
console.log(chalk.gray(' 3. Network 标签 → 刷新页面'));
|
|
1859
|
+
console.log(chalk.gray(' 4. 点击任意请求 → 复制 Cookie\n'));
|
|
1860
|
+
} else if (articlesData.error) {
|
|
1861
|
+
console.log(chalk.yellow(`\n⚠️ API 返回错误: ${articlesData.error.msg || articlesData.error}\n`));
|
|
1862
|
+
}
|
|
699
1863
|
|
|
700
|
-
|
|
701
|
-
if (!articlesData) {
|
|
702
|
-
console.log(chalk.yellow('\n⚠️ 未能获取到文章列表数据\n'));
|
|
703
|
-
console.log(chalk.cyan('可能的原因:'));
|
|
704
|
-
console.log(chalk.gray(' 1. Cookie 已过期或无效 - 请重新获取 Cookie'));
|
|
705
|
-
console.log(chalk.gray(' 2. 网络连接问题 - 请检查网络'));
|
|
706
|
-
console.log(chalk.gray(' 3. 专栏 ID 不正确 - 请检查 URL\n'));
|
|
707
|
-
} else if (articlesData.code === -3000 || articlesData.code === -3001) {
|
|
708
|
-
console.log(chalk.red('\n❌ Cookie 已失效\n'));
|
|
709
|
-
console.log(chalk.cyan('📖 请重新获取 Cookie:'));
|
|
710
|
-
console.log(chalk.gray(' 1. 浏览器登录极客时间'));
|
|
711
|
-
console.log(chalk.gray(' 2. 按 F12 打开开发者工具'));
|
|
712
|
-
console.log(chalk.gray(' 3. Network 标签 → 刷新页面'));
|
|
713
|
-
console.log(chalk.gray(' 4. 点击任意请求 → 复制 Cookie\n'));
|
|
714
|
-
} else if (articlesData.error) {
|
|
715
|
-
console.log(chalk.yellow(`\n⚠️ API 返回错误: ${articlesData.error.msg || articlesData.error}\n`));
|
|
1864
|
+
return { articles: [], columnTitle: 'unknown', columnAuthor: '极客时间' };
|
|
716
1865
|
}
|
|
717
1866
|
|
|
718
|
-
|
|
1867
|
+
useDomExtraction = true;
|
|
719
1868
|
}
|
|
720
1869
|
|
|
721
1870
|
// 调试信息:记录完整的API响应结构(仅在环境变量DEBUG存在时)
|
|
722
|
-
if (process.env.DEBUG) {
|
|
1871
|
+
if (!useDomExtraction && process.env.DEBUG) {
|
|
723
1872
|
console.log(chalk.gray('\n========== 文章列表 API 响应数据 =========='));
|
|
724
1873
|
console.log(chalk.gray(JSON.stringify(articlesData.data, null, 2)));
|
|
725
1874
|
if (columnInfoData) {
|
|
@@ -742,7 +1891,7 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
|
|
|
742
1891
|
}
|
|
743
1892
|
|
|
744
1893
|
// 方法2: 从文章列表 API 数据中获取
|
|
745
|
-
if (!columnTitle || columnTitle === '专栏' || columnTitle === '极客时间') {
|
|
1894
|
+
if ((!columnTitle || columnTitle === '专栏' || columnTitle === '极客时间') && articlesData && articlesData.data) {
|
|
746
1895
|
columnTitle = articlesData.data.column_title
|
|
747
1896
|
|| articlesData.data.column_subtitle
|
|
748
1897
|
|| articlesData.data.title
|
|
@@ -826,10 +1975,15 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
|
|
|
826
1975
|
console.log(chalk.gray(` 提取的专栏名: ${columnTitle}\n`));
|
|
827
1976
|
}
|
|
828
1977
|
|
|
829
|
-
|
|
1978
|
+
let columnAuthor = '极客时间';
|
|
1979
|
+
if (!useDomExtraction && articlesData) {
|
|
1980
|
+
columnAuthor = extractColumnAuthor(columnInfoData, articlesData) || '极客时间';
|
|
1981
|
+
} else {
|
|
1982
|
+
columnAuthor = await extractColumnAuthorFromPage(page) || '极客时间';
|
|
1983
|
+
}
|
|
830
1984
|
|
|
831
1985
|
// 解析文章列表
|
|
832
|
-
const rawArticles = articlesData.data.list;
|
|
1986
|
+
const rawArticles = useDomExtraction ? domArticles : (articlesData.data.list || []);
|
|
833
1987
|
|
|
834
1988
|
const articles = rawArticles.map((article, index) => {
|
|
835
1989
|
const title = article.article_title || article.article_sharetitle || 'Untitled';
|
|
@@ -844,7 +1998,7 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
|
|
|
844
1998
|
|
|
845
1999
|
return {
|
|
846
2000
|
title: cleanTitle,
|
|
847
|
-
url:
|
|
2001
|
+
url: article.url || `${GEEKTIME_BASE_URL}/column/article/${id}`,
|
|
848
2002
|
originalTitle: title,
|
|
849
2003
|
id: id,
|
|
850
2004
|
sectionName: article.section_name || '',
|
|
@@ -888,7 +2042,7 @@ async function downloadWithConcurrency(context, articles, outputDir, concurrency
|
|
|
888
2042
|
const article = articles[index];
|
|
889
2043
|
|
|
890
2044
|
try {
|
|
891
|
-
const result = await downloadArticleSilent(page, article, outputDir, index + 1, total);
|
|
2045
|
+
const result = await downloadArticleSilent(page, article, outputDir, index + 1, total, timeout);
|
|
892
2046
|
results[index] = result;
|
|
893
2047
|
completed++;
|
|
894
2048
|
|
|
@@ -943,52 +2097,20 @@ async function downloadWithConcurrency(context, articles, outputDir, concurrency
|
|
|
943
2097
|
}
|
|
944
2098
|
|
|
945
2099
|
// 下载单篇文章为 PDF(静默模式,不显示单独的spinner)
|
|
946
|
-
async function downloadArticleSilent(page, article, outputDir, index, total) {
|
|
2100
|
+
async function downloadArticleSilent(page, article, outputDir, index, total, timeout = 60000) {
|
|
947
2101
|
try {
|
|
948
2102
|
if (process.env.DEBUG) {
|
|
949
2103
|
console.log(chalk.gray(`[silent] 准备处理文章 ${article.id} - ${article.originalTitle || article.title}`));
|
|
950
2104
|
}
|
|
951
|
-
const
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
}
|
|
955
|
-
const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
|
|
956
|
-
const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
|
|
957
|
-
if (process.env.DEBUG) {
|
|
958
|
-
console.log(chalk.gray(`[silent] 已完成内容清洗 ${article.id}`));
|
|
959
|
-
}
|
|
960
|
-
const printableHtml = buildPrintableHtml(article.originalTitle || article.title, sanitizedHtml);
|
|
2105
|
+
const sanitizedHtml = await fetchArticleContentWithRetry(page, article, { timeout });
|
|
2106
|
+
const meta = article.sectionName ? `章节:${article.sectionName}` : '';
|
|
2107
|
+
const printableHtml = buildPdfHtml(article.originalTitle || article.title, sanitizedHtml, meta);
|
|
961
2108
|
|
|
962
2109
|
await page.setContent(printableHtml, { waitUntil: 'domcontentloaded' });
|
|
963
|
-
if (process.env.DEBUG) {
|
|
964
|
-
console.log(chalk.gray(`[silent] 已设置页面内容 ${article.id}`));
|
|
965
|
-
}
|
|
966
|
-
if (process.env.DEBUG) {
|
|
967
|
-
console.log(chalk.gray(`[silent] 等待图片初步加载 ${article.id}`));
|
|
968
|
-
}
|
|
969
|
-
try {
|
|
970
|
-
await page.waitForFunction(() => {
|
|
971
|
-
const imgs = Array.from(document.images || []);
|
|
972
|
-
if (imgs.length === 0) {
|
|
973
|
-
return true;
|
|
974
|
-
}
|
|
975
|
-
return imgs.every(img => img.complete);
|
|
976
|
-
}, { timeout: 30000 });
|
|
977
|
-
} catch (waitError) {
|
|
978
|
-
if (process.env.DEBUG) {
|
|
979
|
-
console.log(chalk.gray(`[silent] 图片初步加载等待超时 ${article.id}: ${waitError?.message || waitError}`));
|
|
980
|
-
}
|
|
981
|
-
}
|
|
982
2110
|
try {
|
|
983
2111
|
await page.waitForLoadState('networkidle', { timeout: 5000 });
|
|
984
|
-
if (process.env.DEBUG) {
|
|
985
|
-
console.log(chalk.gray(`[silent] networkidle 完成 ${article.id}`));
|
|
986
|
-
}
|
|
987
2112
|
} catch {
|
|
988
|
-
//
|
|
989
|
-
if (process.env.DEBUG) {
|
|
990
|
-
console.log(chalk.gray(`[silent] networkidle 超时(已忽略) ${article.id}`));
|
|
991
|
-
}
|
|
2113
|
+
// ignore
|
|
992
2114
|
}
|
|
993
2115
|
|
|
994
2116
|
// 优化图片大小:将大图片转换为合适的尺寸,减小PDF体积
|
|
@@ -1080,7 +2202,7 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
|
|
|
1080
2202
|
}
|
|
1081
2203
|
|
|
1082
2204
|
// 等待图片处理完成
|
|
1083
|
-
await page.waitForTimeout(
|
|
2205
|
+
await page.waitForTimeout(1200);
|
|
1084
2206
|
if (process.env.DEBUG) {
|
|
1085
2207
|
console.log(chalk.gray(`[silent] 已准备生成PDF ${article.id}`));
|
|
1086
2208
|
}
|
|
@@ -1098,7 +2220,7 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
|
|
|
1098
2220
|
bottom: '20mm',
|
|
1099
2221
|
left: '15mm'
|
|
1100
2222
|
},
|
|
1101
|
-
printBackground:
|
|
2223
|
+
printBackground: true,
|
|
1102
2224
|
preferCSSPageSize: false
|
|
1103
2225
|
});
|
|
1104
2226
|
if (process.env.DEBUG) {
|
|
@@ -1116,20 +2238,19 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
|
|
|
1116
2238
|
}
|
|
1117
2239
|
|
|
1118
2240
|
// 下载单篇文章为 PDF
|
|
1119
|
-
async function downloadArticle(page, article, outputDir, index, total) {
|
|
2241
|
+
async function downloadArticle(page, article, outputDir, index, total, timeout = 60000) {
|
|
1120
2242
|
const spinner = ora(`[${index}/${total}] 正在下载: ${article.title}`).start();
|
|
1121
2243
|
|
|
1122
2244
|
try {
|
|
1123
|
-
const
|
|
1124
|
-
const
|
|
1125
|
-
const
|
|
1126
|
-
const printableHtml = buildPrintableHtml(article.originalTitle || article.title, sanitizedHtml);
|
|
2245
|
+
const sanitizedHtml = await fetchArticleContentWithRetry(page, article, { timeout });
|
|
2246
|
+
const meta = article.sectionName ? `章节:${article.sectionName}` : '';
|
|
2247
|
+
const printableHtml = buildPdfHtml(article.originalTitle || article.title, sanitizedHtml, meta);
|
|
1127
2248
|
|
|
1128
2249
|
await page.setContent(printableHtml, { waitUntil: 'domcontentloaded' });
|
|
1129
2250
|
try {
|
|
1130
2251
|
await page.waitForLoadState('networkidle', { timeout: 5000 });
|
|
1131
2252
|
} catch {
|
|
1132
|
-
//
|
|
2253
|
+
// 忽略
|
|
1133
2254
|
}
|
|
1134
2255
|
|
|
1135
2256
|
// 优化图片大小:将大图片转换为合适的尺寸,减小PDF体积
|
|
@@ -1204,7 +2325,7 @@ async function downloadArticle(page, article, outputDir, index, total) {
|
|
|
1204
2325
|
bottom: '20mm',
|
|
1205
2326
|
left: '15mm'
|
|
1206
2327
|
},
|
|
1207
|
-
printBackground:
|
|
2328
|
+
printBackground: true,
|
|
1208
2329
|
preferCSSPageSize: false
|
|
1209
2330
|
});
|
|
1210
2331
|
|
|
@@ -1327,11 +2448,9 @@ async function mergePDFs(outputDir, columnTitle, articles, deleteAfterMerge = fa
|
|
|
1327
2448
|
}
|
|
1328
2449
|
|
|
1329
2450
|
// 提取单篇文章的 HTML 内容(用于 EPUB 生成)
|
|
1330
|
-
async function extractArticleContent(page, article, index, total) {
|
|
2451
|
+
async function extractArticleContent(page, article, index, total, timeout = 60000) {
|
|
1331
2452
|
try {
|
|
1332
|
-
const
|
|
1333
|
-
const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
|
|
1334
|
-
const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
|
|
2453
|
+
const sanitizedHtml = await fetchArticleContentWithRetry(page, article, { timeout });
|
|
1335
2454
|
|
|
1336
2455
|
if (!sanitizedHtml) {
|
|
1337
2456
|
throw new Error('未能提取到文章内容');
|
|
@@ -1385,7 +2504,7 @@ async function extractWithConcurrency(context, articles, concurrency = 5, delay
|
|
|
1385
2504
|
const article = articles[index];
|
|
1386
2505
|
|
|
1387
2506
|
try {
|
|
1388
|
-
const result = await extractArticleContent(page, article, index + 1, total);
|
|
2507
|
+
const result = await extractArticleContent(page, article, index + 1, total, timeout);
|
|
1389
2508
|
results[index] = result;
|
|
1390
2509
|
completed++;
|
|
1391
2510
|
|
|
@@ -1500,41 +2619,43 @@ async function generateEPUB(outputDir, columnTitle, columnAuthor, articles, cont
|
|
|
1500
2619
|
margin: 1.5em 0;
|
|
1501
2620
|
padding: 0;
|
|
1502
2621
|
}
|
|
1503
|
-
p {
|
|
2622
|
+
p, div {
|
|
1504
2623
|
margin: 1.2em 0;
|
|
1505
2624
|
text-indent: 0;
|
|
1506
|
-
line-height: 1.
|
|
2625
|
+
line-height: 1.9;
|
|
1507
2626
|
word-wrap: break-word;
|
|
1508
2627
|
overflow-wrap: break-word;
|
|
1509
2628
|
display: block;
|
|
1510
2629
|
page-break-inside: avoid;
|
|
1511
2630
|
}
|
|
1512
|
-
|
|
1513
|
-
|
|
1514
|
-
|
|
2631
|
+
p + p,
|
|
2632
|
+
div + p,
|
|
2633
|
+
p + div {
|
|
2634
|
+
margin-top: 1.6em;
|
|
1515
2635
|
}
|
|
1516
2636
|
/* 代码块样式 */
|
|
1517
2637
|
pre {
|
|
1518
|
-
background-color: #
|
|
2638
|
+
background-color: #0b1220;
|
|
2639
|
+
color: #d9e2ff;
|
|
1519
2640
|
border: 1px solid #e1e4e8;
|
|
1520
2641
|
border-radius: 6px;
|
|
1521
|
-
padding:
|
|
2642
|
+
padding: 18px 20px;
|
|
1522
2643
|
overflow-x: auto;
|
|
1523
2644
|
margin: 1em 0;
|
|
1524
|
-
line-height: 1.
|
|
2645
|
+
line-height: 1.6;
|
|
1525
2646
|
font-size: 14px;
|
|
1526
2647
|
white-space: pre-wrap;
|
|
1527
2648
|
word-wrap: break-word;
|
|
1528
|
-
font-family: 'Monaco', 'Menlo', 'Consolas', 'Courier New', monospace;
|
|
2649
|
+
font-family: 'Fira Code', 'Monaco', 'Menlo', 'Consolas', 'Courier New', monospace;
|
|
1529
2650
|
page-break-inside: avoid;
|
|
1530
2651
|
}
|
|
1531
2652
|
code {
|
|
1532
|
-
font-family: 'Monaco', 'Menlo', 'Consolas', 'Courier New', monospace;
|
|
2653
|
+
font-family: 'Fira Code', 'Monaco', 'Menlo', 'Consolas', 'Courier New', monospace;
|
|
1533
2654
|
font-size: 0.9em;
|
|
1534
|
-
background-color:
|
|
2655
|
+
background-color: rgba(15, 23, 42, 0.1);
|
|
1535
2656
|
padding: 0.2em 0.4em;
|
|
1536
2657
|
border-radius: 3px;
|
|
1537
|
-
border: 1px solid
|
|
2658
|
+
border: 1px solid rgba(15, 23, 42, 0.1);
|
|
1538
2659
|
}
|
|
1539
2660
|
pre code {
|
|
1540
2661
|
background-color: transparent;
|
|
@@ -1645,12 +2766,13 @@ async function generateEPUB(outputDir, columnTitle, columnAuthor, articles, cont
|
|
|
1645
2766
|
async function main(options) {
|
|
1646
2767
|
console.log(chalk.bold.cyan('\n🚀 极客时间专栏下载器\n'));
|
|
1647
2768
|
|
|
1648
|
-
// 获取配置:优先级 命令行 > 配置文件
|
|
2769
|
+
// 获取配置:优先级 命令行 > 配置文件 > 默认 cookies.json
|
|
1649
2770
|
let cookie = options.cookie;
|
|
2771
|
+
let cookieFile = options.cookieFile;
|
|
1650
2772
|
let columnUrl = options.url;
|
|
1651
2773
|
|
|
1652
|
-
//
|
|
1653
|
-
if (!cookie || !columnUrl) {
|
|
2774
|
+
// 如果命令行没有提供所需信息,尝试从配置文件读取
|
|
2775
|
+
if (!cookie || !columnUrl || !cookieFile) {
|
|
1654
2776
|
// 使用当前工作目录的config.json,而不是脚本所在目录
|
|
1655
2777
|
const configPath = path.join(process.cwd(), 'config.json');
|
|
1656
2778
|
try {
|
|
@@ -1660,22 +2782,37 @@ async function main(options) {
|
|
|
1660
2782
|
// 使用配置文件中的值作为默认值
|
|
1661
2783
|
if (!cookie) cookie = config.cookie;
|
|
1662
2784
|
if (!columnUrl) columnUrl = config.columnUrl;
|
|
2785
|
+
if (!cookieFile) cookieFile = config.cookieFile;
|
|
1663
2786
|
} catch (error) {
|
|
1664
2787
|
// 配置文件不存在或读取失败,不是致命错误
|
|
1665
2788
|
// 只有在命令行也没提供时才报错
|
|
1666
2789
|
}
|
|
1667
2790
|
}
|
|
1668
2791
|
|
|
2792
|
+
// 如果没有cookie字符串但存在 cookies.json 文件,自动使用
|
|
2793
|
+
if (!cookie && !cookieFile) {
|
|
2794
|
+
const defaultCookieJsonPath = path.join(process.cwd(), 'cookies.json');
|
|
2795
|
+
if (await fileExists(defaultCookieJsonPath)) {
|
|
2796
|
+
cookieFile = defaultCookieJsonPath;
|
|
2797
|
+
}
|
|
2798
|
+
}
|
|
2799
|
+
|
|
2800
|
+
const cookieSavePath = cookieFile || path.join(process.cwd(), 'cookies.json');
|
|
2801
|
+
|
|
1669
2802
|
// 验证必要参数
|
|
1670
|
-
if (!cookie) {
|
|
2803
|
+
if (!cookie && !cookieFile) {
|
|
1671
2804
|
console.error(chalk.red('❌ 缺少 Cookie!'));
|
|
1672
2805
|
console.log(chalk.yellow('\n请通过以下方式之一提供 Cookie:'));
|
|
1673
2806
|
console.log(chalk.gray('1. 命令行参数:--cookie "你的cookie字符串"'));
|
|
1674
2807
|
console.log(chalk.gray('2. 配置文件 config.json:'));
|
|
1675
2808
|
console.log(chalk.gray(' {'));
|
|
1676
2809
|
console.log(chalk.gray(' "cookie": "你的cookie字符串",'));
|
|
1677
|
-
console.log(chalk.gray(' "columnUrl": "https://time.geekbang.org/column/article/xxxxx"'));
|
|
1678
|
-
console.log(chalk.gray('
|
|
2810
|
+
console.log(chalk.gray(' "columnUrl": "https://time.geekbang.org/column/article/xxxxx",'));
|
|
2811
|
+
console.log(chalk.gray(' "cookieFile": "cookies.json" // 可选,导入JSON文件'));
|
|
2812
|
+
console.log(chalk.gray(' }'));
|
|
2813
|
+
console.log(chalk.gray('3. 提供 Cookie JSON 文件:'));
|
|
2814
|
+
console.log(chalk.gray(' - 命令行参数:--cookie-file ./cookies.json'));
|
|
2815
|
+
console.log(chalk.gray(' - 或将 cookies.json 放到当前目录\n'));
|
|
1679
2816
|
process.exit(1);
|
|
1680
2817
|
}
|
|
1681
2818
|
|
|
@@ -1724,16 +2861,42 @@ async function main(options) {
|
|
|
1724
2861
|
userAgent: DEFAULT_USER_AGENT
|
|
1725
2862
|
});
|
|
1726
2863
|
|
|
1727
|
-
|
|
1728
|
-
let
|
|
1729
|
-
|
|
1730
|
-
|
|
2864
|
+
let normalizedCookie = '';
|
|
2865
|
+
let cookiesForContext = [];
|
|
2866
|
+
|
|
2867
|
+
if (cookie) {
|
|
2868
|
+
normalizedCookie = cookie.trim();
|
|
2869
|
+
if (/^cookie:/i.test(normalizedCookie)) {
|
|
2870
|
+
normalizedCookie = normalizedCookie.replace(/^cookie:\s*/i, '');
|
|
2871
|
+
}
|
|
2872
|
+
cookiesForContext = parseCookies(normalizedCookie);
|
|
2873
|
+
} else if (cookieFile) {
|
|
2874
|
+
try {
|
|
2875
|
+
const { cookieHeader, cookies, absolutePath } = await loadCookiesFromJsonFile(cookieFile);
|
|
2876
|
+
normalizedCookie = cookieHeader.trim();
|
|
2877
|
+
cookiesForContext = cookies;
|
|
2878
|
+
console.log(chalk.gray(`🍪 已从 ${absolutePath} 导入 Cookie`));
|
|
2879
|
+
} catch (error) {
|
|
2880
|
+
console.error(chalk.red(`❌ 读取 Cookie JSON 失败: ${error.message}`));
|
|
2881
|
+
process.exit(1);
|
|
2882
|
+
}
|
|
1731
2883
|
}
|
|
2884
|
+
|
|
1732
2885
|
globalCookieHeader = normalizedCookie;
|
|
1733
2886
|
|
|
1734
2887
|
// 设置 cookies
|
|
1735
|
-
|
|
1736
|
-
await context
|
|
2888
|
+
await context.addCookies(cookiesForContext);
|
|
2889
|
+
await updateGlobalCookieHeaderFromContext(context);
|
|
2890
|
+
context.on('response', (response) => {
|
|
2891
|
+
try {
|
|
2892
|
+
const headers = response.headers();
|
|
2893
|
+
if (headers && headers['set-cookie']) {
|
|
2894
|
+
updateGlobalCookieHeaderFromContext(context);
|
|
2895
|
+
}
|
|
2896
|
+
} catch {
|
|
2897
|
+
// ignore
|
|
2898
|
+
}
|
|
2899
|
+
});
|
|
1737
2900
|
|
|
1738
2901
|
// 确保所有极客时间域名的请求都携带原始Cookie串,避免Playwright丢失关键字段
|
|
1739
2902
|
await context.route('**/*', (route) => {
|
|
@@ -1755,9 +2918,12 @@ async function main(options) {
|
|
|
1755
2918
|
}
|
|
1756
2919
|
|
|
1757
2920
|
const headers = {
|
|
1758
|
-
...request.headers()
|
|
1759
|
-
cookie: normalizedCookie
|
|
2921
|
+
...request.headers()
|
|
1760
2922
|
};
|
|
2923
|
+
const outgoingCookieHeader = globalCookieHeader || normalizedCookie;
|
|
2924
|
+
if (outgoingCookieHeader) {
|
|
2925
|
+
headers.cookie = outgoingCookieHeader;
|
|
2926
|
+
}
|
|
1761
2927
|
route.continue({ headers });
|
|
1762
2928
|
});
|
|
1763
2929
|
|
|
@@ -1843,7 +3009,10 @@ async function main(options) {
|
|
|
1843
3009
|
const successCount = results.filter(r => r.success).length;
|
|
1844
3010
|
const failCount = results.filter(r => !r.success).length;
|
|
1845
3011
|
const timeoutCount = results.filter(r =>
|
|
1846
|
-
!r.success && r.error &&
|
|
3012
|
+
!r.success && r.error && /timeout/i.test(r.error)
|
|
3013
|
+
).length;
|
|
3014
|
+
const authIssueCount = results.filter(r =>
|
|
3015
|
+
!r.success && r.error && /(Cookie|登录|登陆|订阅|权限|试看|购买)/i.test(r.error)
|
|
1847
3016
|
).length;
|
|
1848
3017
|
|
|
1849
3018
|
console.log(chalk.bold.cyan('\n📊 PDF 下载统计\n'));
|
|
@@ -1857,6 +3026,11 @@ async function main(options) {
|
|
|
1857
3026
|
console.log(chalk.gray(' 1. Cookie 已失效 - 请重新获取 Cookie'));
|
|
1858
3027
|
console.log(chalk.gray(' 2. 网络连接慢 - 尝试使用 --timeout 120000 增加超时时间'));
|
|
1859
3028
|
console.log(chalk.gray(' 3. 需要登录或权限不足 - 确认已购买该专栏\n'));
|
|
3029
|
+
} else if (authIssueCount > 0) {
|
|
3030
|
+
console.log(chalk.yellow('⚠️ 检测到登录或权限相关异常\n'));
|
|
3031
|
+
console.log(chalk.gray(' 1. 在浏览器中重新登录极客时间,进入该专栏任意文章'));
|
|
3032
|
+
console.log(chalk.gray(' 2. 复制最新的 Cookie(或重新导出 cookies.json)'));
|
|
3033
|
+
console.log(chalk.gray(' 3. 使用新的 --cookie 或 --cookie-file 参数后重试\n'));
|
|
1860
3034
|
}
|
|
1861
3035
|
|
|
1862
3036
|
// 合并 PDF
|
|
@@ -1900,7 +3074,10 @@ async function main(options) {
|
|
|
1900
3074
|
const successCount = contentResults.filter(r => r.success).length;
|
|
1901
3075
|
const failCount = contentResults.filter(r => !r.success).length;
|
|
1902
3076
|
const timeoutCount = contentResults.filter(r =>
|
|
1903
|
-
!r.success && r.error &&
|
|
3077
|
+
!r.success && r.error && /timeout/i.test(r.error)
|
|
3078
|
+
).length;
|
|
3079
|
+
const authIssueCount = contentResults.filter(r =>
|
|
3080
|
+
!r.success && r.error && /(Cookie|登录|登陆|订阅|权限|试看|购买)/i.test(r.error)
|
|
1904
3081
|
).length;
|
|
1905
3082
|
|
|
1906
3083
|
console.log(chalk.bold.cyan('\n📊 EPUB 提取统计\n'));
|
|
@@ -1913,19 +3090,42 @@ async function main(options) {
|
|
|
1913
3090
|
console.log(chalk.gray(' 1. Cookie 已失效 - 请重新获取 Cookie'));
|
|
1914
3091
|
console.log(chalk.gray(' 2. 网络连接慢 - 尝试使用 --timeout 120000 增加超时时间'));
|
|
1915
3092
|
console.log(chalk.gray(' 3. 需要登录或权限不足 - 确认已购买该专栏\n'));
|
|
3093
|
+
} else if (authIssueCount > 0) {
|
|
3094
|
+
console.log(chalk.yellow('⚠️ 检测到登录/权限问题,建议步骤:\n'));
|
|
3095
|
+
console.log(chalk.gray(' 1. 浏览器重新登录极客时间并打开该专栏文章'));
|
|
3096
|
+
console.log(chalk.gray(' 2. 重新复制最新 Cookie 或导出 cookies.json'));
|
|
3097
|
+
console.log(chalk.gray(' 3. 更新 --cookie 或 --cookie-file 后再次执行\n'));
|
|
1916
3098
|
}
|
|
1917
3099
|
|
|
1918
3100
|
// 生成 EPUB
|
|
1919
3101
|
if (successCount > 0) {
|
|
1920
|
-
const
|
|
1921
|
-
|
|
1922
|
-
columnTitle,
|
|
1923
|
-
columnAuthor,
|
|
1924
|
-
articlesToDownload,
|
|
1925
|
-
contentResults
|
|
3102
|
+
const hasImageContent = contentResults.some(result =>
|
|
3103
|
+
result && result.success && typeof result.content === 'string' && result.content.includes('<img')
|
|
1926
3104
|
);
|
|
1927
|
-
|
|
1928
|
-
|
|
3105
|
+
|
|
3106
|
+
let processedContent = contentResults;
|
|
3107
|
+
let tempAssetsDir = null;
|
|
3108
|
+
|
|
3109
|
+
try {
|
|
3110
|
+
if (hasImageContent) {
|
|
3111
|
+
tempAssetsDir = await createTempAssetsDir(outputDir);
|
|
3112
|
+
processedContent = await rewriteEpubContentImages(context, contentResults, tempAssetsDir);
|
|
3113
|
+
}
|
|
3114
|
+
|
|
3115
|
+
const epubPath = await generateEPUB(
|
|
3116
|
+
outputDir,
|
|
3117
|
+
columnTitle,
|
|
3118
|
+
columnAuthor,
|
|
3119
|
+
articlesToDownload,
|
|
3120
|
+
processedContent
|
|
3121
|
+
);
|
|
3122
|
+
if (epubPath) {
|
|
3123
|
+
console.log(chalk.green(`\n✅ EPUB 生成完成: ${epubPath}\n`));
|
|
3124
|
+
}
|
|
3125
|
+
} finally {
|
|
3126
|
+
if (tempAssetsDir) {
|
|
3127
|
+
await cleanupTempAssetsDir(tempAssetsDir);
|
|
3128
|
+
}
|
|
1929
3129
|
}
|
|
1930
3130
|
}
|
|
1931
3131
|
}
|
|
@@ -1942,6 +3142,11 @@ async function main(options) {
|
|
|
1942
3142
|
}
|
|
1943
3143
|
process.exit(1);
|
|
1944
3144
|
} finally {
|
|
3145
|
+
try {
|
|
3146
|
+
await persistCookiesToFile(context, cookieSavePath);
|
|
3147
|
+
} catch {
|
|
3148
|
+
// ignore
|
|
3149
|
+
}
|
|
1945
3150
|
// 确保浏览器完全关闭
|
|
1946
3151
|
try {
|
|
1947
3152
|
if (browser && !isShuttingDown) {
|
|
@@ -1961,6 +3166,7 @@ program
|
|
|
1961
3166
|
.version(version)
|
|
1962
3167
|
.option('-u, --url <url>', '专栏文章URL(任意一篇)')
|
|
1963
3168
|
.option('-c, --cookie <cookie>', 'Cookie字符串(用于认证)')
|
|
3169
|
+
.option('--cookie-file <path>', '从 JSON 文件导入 Cookie(如 chrome 扩展导出的 cookies.json)')
|
|
1964
3170
|
.option('-o, --output <dir>', '输出目录', './downloads')
|
|
1965
3171
|
.option('-f, --format <format>', '输出格式: pdf, epub, both', 'pdf')
|
|
1966
3172
|
.option('--headless <boolean>', '无头模式', true)
|