@kadaliao/geektime-downloader 1.1.3 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/README.md +30 -0
- package/config.example.json +1 -0
- package/cookies.json +405 -0
- package/download.js +1484 -263
- package/package.json +3 -1
package/download.js
CHANGED
|
@@ -6,7 +6,10 @@ import chalk from 'chalk';
|
|
|
6
6
|
import ora from 'ora';
|
|
7
7
|
import fs from 'fs/promises';
|
|
8
8
|
import path from 'path';
|
|
9
|
-
import { fileURLToPath } from 'url';
|
|
9
|
+
import { fileURLToPath, pathToFileURL } from 'url';
|
|
10
|
+
import { load as loadHtml } from 'cheerio';
|
|
11
|
+
import crypto from 'crypto';
|
|
12
|
+
import mime from 'mime-types';
|
|
10
13
|
import { createRequire } from 'module';
|
|
11
14
|
import * as pdfLib from 'pdf-lib';
|
|
12
15
|
import { outlinePdfFactory } from '@lillallol/outline-pdf';
|
|
@@ -245,9 +248,335 @@ const PRINT_FIX_CSS = `
|
|
|
245
248
|
}
|
|
246
249
|
`;
|
|
247
250
|
|
|
251
|
+
// Syntax-highlighting colours for code blocks. Covers the common class names
// emitted by Prism (`.token.*`) and Highlight.js (`.hljs-*`).
// Every selector is listed in exactly one colour rule, so a token's colour
// never depends on which of two equal-specificity rules comes last.
// `.token.italic` / `.token.bold` only add emphasis on top of the colour.
const CODE_HIGHLIGHT_CSS = `
pre[class*="language-"],
code[class*="language-"],
pre code,
code.hljs,
pre.hljs {
  color: #2d2d2d;
  background: #f7f7f7;
}
.token.comment,
.token.prolog,
.token.doctype,
.token.cdata,
.hljs-comment,
.hljs-quote {
  color: #6a737d;
  font-style: italic;
}
.token.punctuation,
.hljs-punctuation {
  color: #5e6687;
}
.token.property,
.token.tag,
.token.constant,
.token.symbol,
.hljs-keyword,
.hljs-selector-tag,
.hljs-subst,
.hljs-attribute {
  color: #d73a49;
}
.token.boolean,
.token.number,
.token.selector,
.token.attr-name,
.token.char,
.token.builtin,
.token.inserted,
.hljs-number,
.hljs-literal,
.hljs-variable,
.hljs-template-variable {
  color: #b76bff;
}
.token.string,
.token.attr-value,
.token.statement,
.token.regex,
.token.important,
.token.variable,
.token.bold,
.hljs-string,
.hljs-doctag,
.hljs-addition {
  color: #22863a;
}
.token.function,
.token.class-name,
.token.keyword,
.hljs-title,
.hljs-section,
.hljs-type,
.hljs-selector-id,
.hljs-selector-class {
  color: #005cc5;
}
.token.operator,
.token.entity,
.token.url,
.hljs-bullet,
.hljs-built_in,
.hljs-builtin-name,
.hljs-link {
  color: #e36209;
}
.token.italic {
  font-style: italic;
}
.token.bold {
  font-weight: 600;
}
.token.deleted,
.hljs-deletion {
  color: #b31d28;
}
`;
|
|
343
|
+
|
|
248
344
|
const GEEKTIME_BASE_URL = 'https://time.geekbang.org';
|
|
249
|
-
const ARTICLE_API_URL = `${GEEKTIME_BASE_URL}/serv/v1/article`;
|
|
250
345
|
// User-agent string sent with outgoing HTTP requests (a desktop Chrome UA).
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
// Maximum number of images downloaded concurrently in one batch.
const EPUB_IMAGE_BATCH_SIZE = 5;
// Name prefix of the temporary directory that holds downloaded image assets.
const TEMP_ASSET_PREFIX = '__epub_assets__';
|
|
348
|
+
// Candidate CSS selectors for the element that holds an article's body.
const ARTICLE_CONTENT_SELECTORS = [
  // id-based containers
  '#article-content', '#article-content-container',
  // generic class names
  '.article-content', '.article-detail', '.article-detail-content', '.article-content__body',
  // build-generated class names
  '.Index_articleContent_QBG5G', '.ArticleContent_articleContent',
  // structural fallbacks
  'article .content', 'main article', '.content-container article'
];
|
|
361
|
+
// CSS selectors whose matching elements are stripped from article HTML
// before export (passed to sanitizeArticleHtml as `removalSelectors`).
const ARTICLE_REMOVAL_SELECTORS = [
  // page chrome
  'nav', 'header', 'footer', 'aside',
  // comment areas
  '.comment', '.comments', '.Index_comment', '.CommentArea', '.comment-area',
  '.CommentWrapper', '.Comment-module', '.CommentList',
  '#comments', '#comment', '[data-section="comment"]',
  // recommendations and advertising
  '.recommend', '.recommendation', '.related', '.advertisement', '.ad', '.banner',
  // subscription prompts, toolbars, share icons
  '.subscribe', '.subscription', '.toolbar', '.Index_shareIcons_1vtJa',
  '.keyboard-wrapper', '.app-download', '.article-actions', '.article-bottom',
  // notes and translations
  '.note', '.notes', '.annotation', '.translation', '.trans', '.translator',
  // audio / voice players
  '.audio', '.audio-player', '.voice', '.player', '.geek-player', '.podcast', '.radio',
  '.AudioPlayer', '.VoicePlayer', '.AudioWrapper', '.voice-player',
  // rewards and donations
  '.reward', '.appreciate', '.appreciation', '.donate', '.sponsor', '.thanks', '.support',
  // QR codes and promotions
  '.qrcode', '.qr-code', '.qr', '.promotion', '.promo', '.ad-banner',
  // legal notices
  '.copyright', '.statement', '.disclaimer',
  // plugins, notifications, floating bars
  '.app-download-banner', '.article-plugin', '.article-notification', '.float-bar',
  '.article-plugin-wrapper',
  '[class*="Share"]', '[data-widget="audio"]', '[data-widget="Audio"]',
  // media elements
  'audio', 'video',
  // substring class matches
  '[class*="Note"]', '[class*="note"]', '[class*="Translation"]', '[class*="translation"]',
  '[class*="Audio"]', '[class*="audio"]', '[class*="Reward"]', '[class*="reward"]',
  // data-attribute markers
  '[data-plugin]', '[data-track]', '[data-track-section]', '[data-translation]', '[data-audio]',
  '[data-role="toolbar"]',
  'button[data-role="comment"]',
  'script[data-role="plugin"]',
  // bottom bars
  '.ArticleBottomBar',
  '.bottom-toolbar'
];
|
|
387
|
+
// Keywords passed to sanitizeArticleHtml as `pluginKeywords`; used there to
// pick out plugin-like elements by their class name / id.
const ARTICLE_PLUGIN_KEYWORDS = [
  'note',
  'translation',
  'audio',
  'player',
  'reward',
  'donate',
  'appreciation',
  'sponsor',
  'qrcode',
  'toolbar',
  'plugin',
  'copyright',
  'geeknote',
  'bilingual',
  'comment'
];
|
|
392
|
+
// CSS selectors for embedded mind-map widgets; matching elements are removed
// from the article HTML by sanitizeArticleHtml.
const ARTICLE_MINDMAP_SELECTORS = [
  // exact class names
  '.mindmap',
  '.mind-map',
  '.MindMap',
  '.Mind-map',
  // data attributes
  '[data-type="mindmap"]',
  '[data-role="mindmap"]',
  '[data-widget="mindmap"]',
  // substring class matches
  '[class*="MindMap"]',
  '[class*="mindMap"]'
];
|
|
397
|
+
const PDF_BASE_CSS = `
|
|
398
|
+
body {
|
|
399
|
+
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "PingFang SC", "Hiragino Sans GB", "Microsoft YaHei", sans-serif;
|
|
400
|
+
margin: 0;
|
|
401
|
+
padding: 0;
|
|
402
|
+
background: #fff;
|
|
403
|
+
color: #1f2329;
|
|
404
|
+
}
|
|
405
|
+
.article-pdf-wrapper {
|
|
406
|
+
max-width: 860px;
|
|
407
|
+
margin: 0 auto;
|
|
408
|
+
padding: 48px 56px 60px;
|
|
409
|
+
}
|
|
410
|
+
.article-title {
|
|
411
|
+
font-size: 32px;
|
|
412
|
+
font-weight: 600;
|
|
413
|
+
margin-bottom: 16px;
|
|
414
|
+
line-height: 1.3;
|
|
415
|
+
color: #111;
|
|
416
|
+
}
|
|
417
|
+
.article-meta {
|
|
418
|
+
color: #7f8c8d;
|
|
419
|
+
font-size: 14px;
|
|
420
|
+
margin-bottom: 32px;
|
|
421
|
+
}
|
|
422
|
+
.article-content p,
|
|
423
|
+
.article-content div {
|
|
424
|
+
margin: 1.1em 0;
|
|
425
|
+
line-height: 1.9;
|
|
426
|
+
font-size: 16px;
|
|
427
|
+
}
|
|
428
|
+
.article-content p + p,
|
|
429
|
+
.article-content div + p,
|
|
430
|
+
.article-content p + div {
|
|
431
|
+
margin-top: 1.6em;
|
|
432
|
+
}
|
|
433
|
+
.article-content h2,
|
|
434
|
+
.article-content h3,
|
|
435
|
+
.article-content h4 {
|
|
436
|
+
margin-top: 2.2em;
|
|
437
|
+
margin-bottom: 1em;
|
|
438
|
+
font-weight: 600;
|
|
439
|
+
color: #111;
|
|
440
|
+
}
|
|
441
|
+
.article-content h2 {
|
|
442
|
+
font-size: 26px;
|
|
443
|
+
}
|
|
444
|
+
.article-content h3 {
|
|
445
|
+
font-size: 22px;
|
|
446
|
+
}
|
|
447
|
+
.article-content h4 {
|
|
448
|
+
font-size: 18px;
|
|
449
|
+
}
|
|
450
|
+
.article-content img {
|
|
451
|
+
max-width: 100%;
|
|
452
|
+
margin: 1.2em auto;
|
|
453
|
+
display: block;
|
|
454
|
+
border-radius: 4px;
|
|
455
|
+
}
|
|
456
|
+
.article-content blockquote {
|
|
457
|
+
margin: 1.3em 0;
|
|
458
|
+
padding: 0.8em 1.2em;
|
|
459
|
+
border-left: 4px solid #d0d7de;
|
|
460
|
+
background: #f8fafc;
|
|
461
|
+
color: #4b5563;
|
|
462
|
+
}
|
|
463
|
+
.article-content ul,
|
|
464
|
+
.article-content ol {
|
|
465
|
+
margin: 1em 0;
|
|
466
|
+
padding-left: 2em;
|
|
467
|
+
}
|
|
468
|
+
.article-content pre {
|
|
469
|
+
background: #0b1220;
|
|
470
|
+
color: #d9e2ff;
|
|
471
|
+
border-radius: 6px;
|
|
472
|
+
padding: 16px 20px;
|
|
473
|
+
overflow: auto;
|
|
474
|
+
margin: 1.4em 0;
|
|
475
|
+
font-size: 14px;
|
|
476
|
+
line-height: 1.6;
|
|
477
|
+
}
|
|
478
|
+
.article-content pre code {
|
|
479
|
+
background: transparent;
|
|
480
|
+
border: none;
|
|
481
|
+
padding: 0;
|
|
482
|
+
color: inherit;
|
|
483
|
+
}
|
|
484
|
+
.article-content code {
|
|
485
|
+
font-family: "Fira Code", "SFMono-Regular", Consolas, "Liberation Mono", Menlo, monospace;
|
|
486
|
+
background: rgba(15, 23, 42, 0.08);
|
|
487
|
+
border-radius: 4px;
|
|
488
|
+
padding: 0.2em 0.4em;
|
|
489
|
+
}
|
|
490
|
+
.article-content hr {
|
|
491
|
+
border: none;
|
|
492
|
+
border-top: 1px solid #e5e7eb;
|
|
493
|
+
margin: 2.4em 0;
|
|
494
|
+
}
|
|
495
|
+
`;
|
|
496
|
+
|
|
497
|
+
/**
 * Reports whether `filePath` can be accessed.
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} `true` when `fs.access` succeeds, `false` on any failure.
 */
async function fileExists(filePath) {
  let reachable = true;
  try {
    await fs.access(filePath);
  } catch {
    // Any access failure is reported as "does not exist".
    reachable = false;
  }
  return reachable;
}
|
|
505
|
+
|
|
506
|
+
/**
 * Maps a SameSite value from a cookie export to `'Lax'`, `'Strict'` or `'None'`.
 *
 * @param {*} value - Raw `sameSite` field; matched case-insensitively by substring.
 * @returns {('Lax'|'Strict'|'None'|undefined)} The normalized value, or
 *   `undefined` when the input is empty or not recognised.
 */
function normalizeCookieSameSite(value) {
  if (!value) return undefined;
  const lower = value.toString().toLowerCase();
  // "no_restriction" contains the substring "strict" (no_re-STRICT-ion), so the
  // None spellings must be tested before the Strict one.
  if (lower.includes('no_restriction') || lower.includes('none')) return 'None';
  if (lower.includes('lax')) return 'Lax';
  if (lower.includes('strict')) return 'Strict';
  return undefined;
}
|
|
514
|
+
|
|
515
|
+
/**
 * Returns a usable cookie domain.
 *
 * @param {*} domain - Raw `domain` field from a cookie export.
 * @returns {string} The trimmed domain, or `'.geekbang.org'` when the input is
 *   missing, not a string, or blank after trimming.
 */
function normalizeCookieDomain(domain) {
  const trimmed = typeof domain === 'string' ? domain.trim() : '';
  // A whitespace-only value must not yield an empty domain.
  return trimmed || '.geekbang.org';
}
|
|
521
|
+
|
|
522
|
+
/**
 * Loads cookies from a JSON file containing an array of cookie objects.
 *
 * Entries without a string `name` or with an undefined `value` are skipped.
 * Each kept entry is normalized to `{ name, value, domain, path, secure,
 * httpOnly }`, plus `sameSite` and `expires` when they can be derived.
 *
 * @param {string} filePath - Absolute path, or a path relative to the current working directory.
 * @returns {Promise<{cookieHeader: string, cookies: object[], absolutePath: string}>}
 *   `cookieHeader` is the `name=value; ...` string built from the cookies.
 * @throws {Error} When the file cannot be read, is not valid JSON, is not an
 *   array, or contains no usable cookie entry.
 */
async function loadCookiesFromJsonFile(filePath) {
  const absolutePath = path.isAbsolute(filePath) ? filePath : path.resolve(process.cwd(), filePath);

  let raw;
  try {
    raw = await fs.readFile(absolutePath, 'utf-8');
  } catch (error) {
    throw new Error(`无法读取 cookie JSON 文件: ${error.message}`, { cause: error });
  }

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (error) {
    throw new Error(`cookie JSON 解析失败: ${error.message}`, { cause: error });
  }

  if (!Array.isArray(parsed)) {
    throw new Error('cookie JSON 必须是数组格式');
  }

  const cookies = parsed
    .filter((item) => item && typeof item.name === 'string' && item.value !== undefined)
    .map((item) => {
      const cookie = {
        name: item.name,
        value: typeof item.value === 'string' ? item.value : String(item.value ?? ''),
        domain: normalizeCookieDomain(item.domain),
        path: item.path || '/',
        secure: Boolean(item.secure),
        httpOnly: Boolean(item.httpOnly)
      };

      const sameSite = normalizeCookieSameSite(item.sameSite);
      if (sameSite) {
        cookie.sameSite = sameSite;
      }

      // Both `expires` and `expirationDate` spellings are accepted.
      // Values that do not convert to a finite number are dropped, not stored as NaN.
      const rawExpiry = item.expires || item.expirationDate;
      if (rawExpiry) {
        const expires = Math.floor(Number(rawExpiry));
        if (Number.isFinite(expires)) {
          cookie.expires = expires;
        }
      }

      return cookie;
    });

  if (cookies.length === 0) {
    throw new Error('cookie JSON 中没有有效的 cookie 项');
  }

  const cookieHeader = cookies.map((cookie) => `${cookie.name}=${cookie.value}`).join('; ');

  return { cookieHeader, cookies, absolutePath };
}
|
|
251
580
|
|
|
252
581
|
// 解析 cookie 字符串
|
|
253
582
|
function parseCookies(cookieString) {
|
|
@@ -272,96 +601,308 @@ function normalizeArticleHtml(html = '') {
|
|
|
272
601
|
.replace(/href='\/\//gi, "href='https://");
|
|
273
602
|
}
|
|
274
603
|
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
604
|
+
/**
 * Turns an `<img src>` value into an absolute http(s) URL.
 *
 * @param {string} [rawSrc=''] - Raw attribute value.
 * @returns {string|null} The absolute URL, or `null` for empty, `data:` and
 *   `blob:` sources and for values that cannot be resolved.
 */
function resolveImageUrl(rawSrc = '') {
  if (!rawSrc) return null;

  const src = rawSrc.trim();
  const isInline = src.startsWith('data:') || src.startsWith('blob:');
  if (src === '' || isInline) return null;

  // Protocol-relative URL: default to https.
  if (src.startsWith('//')) return `https:${src}`;

  // Root-relative path: prefix the site origin.
  if (src.startsWith('/')) return `${GEEKTIME_BASE_URL}${src}`;

  // Already absolute.
  if (/^https?:/i.test(src)) return src;

  // Anything else is resolved against the site origin.
  try {
    return new URL(src, GEEKTIME_BASE_URL).toString();
  } catch {
    return null;
  }
}
|
|
279
625
|
|
|
280
|
-
|
|
626
|
+
/**
 * Downloads a binary resource through the browser context's request API.
 *
 * The request carries the default user agent, an image `accept` header, the
 * site origin as referer and, when set, the global cookie header.
 *
 * @param {object} context - Object exposing `request.get(url, options)`
 *   (presumably a Playwright BrowserContext — confirm against the caller).
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<{buffer: Buffer, contentType: string, finalUrl: string}>}
 *   Body, `content-type` header (`''` when absent) and the URL reported by the response.
 * @throws {Error} When the response is not OK or the request itself fails.
 */
async function fetchBinaryWithContext(context, url) {
  const headers = {
    'user-agent': DEFAULT_USER_AGENT,
    'accept': 'image/avif,image/webp,image/apng,image/*,*/*;q=0.8',
    'referer': GEEKTIME_BASE_URL,
    ...(globalCookieHeader ? { 'cookie': globalCookieHeader } : {})
  };
  const response = await context.request.get(url, { headers, failOnStatusCode: true });
  try {
    if (!response.ok()) {
      throw new Error(`HTTP ${response.status()} ${response.statusText()}`);
    }
    const buffer = await response.body();
    const headersMap = response.headers();
    return {
      buffer,
      contentType: headersMap['content-type'] || '',
      finalUrl: response.url()
    };
  } finally {
    // Release the response body once it has been copied out; otherwise it is
    // retained until the context is closed. Guarded for response objects
    // without dispose().
    if (typeof response.dispose === 'function') {
      await response.dispose();
    }
  }
}
|
|
645
|
+
|
|
646
|
+
/**
 * Chooses a file extension for a downloaded image.
 *
 * The extension in the URL path is preferred, then one derived from the
 * Content-Type, then `'bin'`.
 *
 * @param {string} [resourceUrl=''] - URL the image was fetched from.
 * @param {string} [contentType=''] - Value of the `content-type` response header.
 * @returns {string} Lower-case extension without a leading dot.
 */
function determineImageExtension(resourceUrl = '', contentType = '') {
  let ext = '';
  if (resourceUrl) {
    try {
      const { pathname } = new URL(resourceUrl);
      // Keep only the leading alphanumeric run, so suffixes appended to the
      // file name (e.g. "photo.jpg!large") do not leak into the extension.
      const leading = /^[a-z0-9]+/i.exec(path.extname(pathname).slice(1));
      ext = leading ? leading[0] : '';
    } catch {
      // Unparseable URL: fall through to the Content-Type.
      ext = '';
    }
  }
  if (!ext && contentType) {
    ext = String(mime.extension(contentType) || '');
  }
  return (ext || 'bin').toLowerCase();
}
|
|
298
664
|
|
|
299
|
-
|
|
665
|
+
/**
 * Downloads one image and stores it in `assetsDir`.
 *
 * The file is named `article_<NNN>_<hash>.<ext>`: `NNN` is the 1-based,
 * zero-padded article number and `hash` the first 10 hex characters of the
 * MD5 of `normalizedUrl` (used only to build a file name).
 *
 * @param {object} context - Passed through to fetchBinaryWithContext.
 * @param {string} normalizedUrl - Absolute image URL.
 * @param {string} assetsDir - Directory that receives the file.
 * @param {number} articleIndex - Zero-based article index.
 * @returns {Promise<{fileUrl: string, localPath: string}>} `file://` URL and path of the saved image.
 */
async function downloadImageToLocal(context, normalizedUrl, assetsDir, articleIndex) {
  const fetched = await fetchBinaryWithContext(context, normalizedUrl);
  const extension = determineImageExtension(fetched.finalUrl || normalizedUrl, fetched.contentType);

  const digest = crypto.createHash('md5').update(normalizedUrl).digest('hex').slice(0, 10);
  const articleNumber = String(articleIndex + 1).padStart(3, '0');
  const localPath = path.join(assetsDir, `article_${articleNumber}_${digest}.${extension}`);

  await fs.writeFile(localPath, fetched.buffer);

  return {
    fileUrl: pathToFileURL(localPath).href,
    localPath
  };
}
|
|
300
677
|
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
678
|
+
/**
 * Converts a cookie SameSite value to the spelling written to the cookie JSON
 * file: `'strict'`, `'lax'`, `'no_restriction'` or `'unspecified'`.
 *
 * @param {*} value - SameSite value; matched case-insensitively by substring.
 * @returns {string} The export spelling; `'unspecified'` for empty or unknown input.
 */
function mapSameSiteForExport(value) {
  if (!value) return 'unspecified';
  const lower = value.toString().toLowerCase();
  // "no_restriction" contains the substring "strict", so the None spellings
  // have to be recognised before the Strict test.
  if (lower.includes('no_restriction') || lower.includes('none')) return 'no_restriction';
  if (lower.includes('strict')) return 'strict';
  if (lower.includes('lax')) return 'lax';
  return 'unspecified';
}
|
|
686
|
+
|
|
687
|
+
/**
 * Rebuilds the module-level `globalCookieHeader` from the cookies currently
 * held by `context`.
 *
 * Best effort: a missing context, an empty cookie list or any error leaves
 * the header unchanged.
 *
 * @param {object} context - Object exposing an async `cookies()` method.
 * @returns {Promise<void>}
 */
async function updateGlobalCookieHeaderFromContext(context) {
  if (!context) return;
  try {
    const jar = await context.cookies();
    if (!jar || jar.length === 0) return;

    const pairs = [];
    for (const entry of jar) {
      pairs.push(`${entry.name}=${entry.value}`);
    }
    const joined = pairs.join('; ');
    if (joined) {
      globalCookieHeader = joined;
    }
  } catch {
    // Deliberately ignored: refreshing the header is optional.
  }
}
|
|
702
|
+
|
|
703
|
+
/**
 * Writes the context's current cookies to `targetPath` as a JSON array with
 * the fields `domain`, `expirationDate`, `hostOnly`, `httpOnly`, `name`,
 * `path`, `sameSite`, `secure`, `session`, `storeId` and `value`.
 *
 * Best effort: failures are logged as a warning and never thrown. Nothing is
 * written when there is no context, no target path, or no cookie.
 *
 * @param {object} context - Object exposing an async `cookies()` method.
 * @param {string} targetPath - Destination JSON file.
 * @returns {Promise<void>}
 */
async function persistCookiesToFile(context, targetPath) {
  if (!context || !targetPath) return;
  try {
    const cookies = await context.cookies();
    if (!cookies || cookies.length === 0) {
      return;
    }
    const serialized = cookies.map((cookie) => {
      // Session cookies are reported with expires = -1 (truthy), so a plain
      // truthiness test would export them as persistent cookies.
      const hasExpiry = Number.isFinite(cookie.expires) && cookie.expires > 0;
      return {
        domain: cookie.domain,
        expirationDate: hasExpiry ? cookie.expires : undefined,
        hostOnly: !cookie.domain.startsWith('.'),
        httpOnly: cookie.httpOnly,
        name: cookie.name,
        path: cookie.path,
        sameSite: mapSameSiteForExport(cookie.sameSite),
        secure: cookie.secure,
        session: !hasExpiry,
        storeId: '0',
        value: cookie.value
      };
    });
    await fs.writeFile(targetPath, JSON.stringify(serialized, null, 2), 'utf-8');
    console.log(chalk.gray(`🍪 已刷新 Cookie → ${targetPath}`));
  } catch (error) {
    console.log(chalk.yellow(`⚠️ 保存 Cookie 失败: ${error.message}`));
  }
}
|
|
304
729
|
|
|
305
|
-
|
|
730
|
+
/**
 * Saves the payload of a `data:` URI as a file in `assetsDir`.
 *
 * Both base64 payloads (`data:image/png;base64,...`) and percent-encoded
 * payloads (`data:image/svg+xml,%3Csvg...`) are supported. The file is named
 * `article_<NNN>_inline_<MMM>.<ext>`, with `ext` derived from the MIME type
 * (`bin` when unknown).
 *
 * @param {string} dataUri - The `data:` URI.
 * @param {string} assetsDir - Directory that receives the file.
 * @param {number} articleIndex - Zero-based article index (written 1-based).
 * @param {number} dataIndex - Index of the inline image within the article.
 * @returns {Promise<string|null>} `file://` URL of the saved file, or `null`
 *   when the input is not a data URI or decodes to nothing.
 */
async function saveDataUriImage(dataUri, assetsDir, articleIndex, dataIndex) {
  if (!dataUri || typeof dataUri !== 'string') {
    return null;
  }
  const trimmed = dataUri.trim();
  const commaAt = trimmed.indexOf(',');
  if (!/^data:/i.test(trimmed) || commaAt === -1) {
    return null;
  }

  // Header layout: data:<mime>[;param]*[;base64],<payload>
  const [rawType, ...params] = trimmed.slice(5, commaAt).split(';');
  const mimeType = rawType.trim() || 'application/octet-stream';
  const isBase64 = params.some((param) => param.trim().toLowerCase() === 'base64');
  const payload = trimmed.slice(commaAt + 1);

  let buffer;
  if (isBase64) {
    buffer = Buffer.from(payload, 'base64');
  } else {
    let text;
    try {
      text = decodeURIComponent(payload);
    } catch {
      // Payload contains a stray '%': keep it verbatim rather than drop the image.
      text = payload;
    }
    buffer = Buffer.from(text, 'utf-8');
  }
  if (buffer.length === 0) {
    return null;
  }

  const ext = mime.extension(mimeType) || 'bin';
  const filename = `article_${String(articleIndex + 1).padStart(3, '0')}_inline_${String(dataIndex).padStart(3, '0')}.${ext}`;
  const filepath = path.join(assetsDir, filename);
  await fs.writeFile(filepath, buffer);
  return pathToFileURL(filepath).href;
}
|
|
755
|
+
|
|
756
|
+
/**
 * Replaces the `src` of every `<img>` in `htmlContent` with a local `file://` URL.
 *
 * Remote images are downloaded in batches of EPUB_IMAGE_BATCH_SIZE and recorded
 * in `sharedCache` (URL → file URL), so a URL is fetched only once across
 * articles. `data:` images are written to disk via saveDataUriImage; those that
 * cannot be saved are removed from the document. Images whose download fails
 * keep their original `src`.
 *
 * @param {object} context - Passed through to downloadImageToLocal.
 * @param {string} htmlContent - Article HTML.
 * @param {string} assetsDir - Directory that receives the image files.
 * @param {number} articleIndex - Zero-based article index, used in file names.
 * @param {Map<string, string>} sharedCache - Cache of already downloaded images.
 * @returns {Promise<{html: string, replaced: number}>} Rewritten HTML and the
 *   number of images stored by this call (successful downloads plus saved
 *   inline images).
 */
async function rewriteImagesWithLocalFiles(context, htmlContent, assetsDir, articleIndex, sharedCache) {
  if (!htmlContent || !htmlContent.includes('<img')) {
    return { html: htmlContent, replaced: 0 };
  }

  const $ = loadHtml(htmlContent, { decodeEntities: false });
  const images = $('img');
  if (images.length === 0) {
    return { html: htmlContent, replaced: 0 };
  }

  // URL → local file URL (null until downloaded, and when the download failed).
  const pendingDownloads = new Map();
  const dataUriImages = [];

  // Pass 1: collect inline images and the distinct remote URLs not yet cached.
  images.each((_, element) => {
    const originalSrc = ($(element).attr('src') || '').trim();
    if (/^data:/i.test(originalSrc)) {
      dataUriImages.push({ element, src: originalSrc });
      return;
    }
    const normalizedUrl = resolveImageUrl(originalSrc);
    if (!normalizedUrl || sharedCache.has(normalizedUrl)) {
      return;
    }
    if (!pendingDownloads.has(normalizedUrl)) {
      pendingDownloads.set(normalizedUrl, null);
    }
  });

  // Pass 2: download in bounded batches; one failure never aborts the batch.
  const downloadTargets = Array.from(pendingDownloads.keys());
  let downloadedCount = 0;
  for (let i = 0; i < downloadTargets.length; i += EPUB_IMAGE_BATCH_SIZE) {
    const batch = downloadTargets.slice(i, i + EPUB_IMAGE_BATCH_SIZE).map(async (targetUrl) => {
      try {
        const info = await downloadImageToLocal(context, targetUrl, assetsDir, articleIndex);
        sharedCache.set(targetUrl, info.fileUrl);
        pendingDownloads.set(targetUrl, info.fileUrl);
        // Only successful downloads count; failed attempts used to be counted too.
        downloadedCount++;
      } catch (error) {
        console.log(chalk.yellow(` ⚠️ 图片下载失败: ${targetUrl} (${error.message})`));
        pendingDownloads.set(targetUrl, null);
      }
    });
    await Promise.all(batch);
  }

  // Pass 3: point remote images at their local copies.
  images.each((_, element) => {
    const originalSrc = ($(element).attr('src') || '').trim();
    if (/^data:/i.test(originalSrc)) {
      return;
    }
    const normalizedUrl = resolveImageUrl(originalSrc);
    if (!normalizedUrl) {
      return;
    }
    const localUrl = sharedCache.get(normalizedUrl) || pendingDownloads.get(normalizedUrl);
    if (localUrl) {
      $(element).attr('src', localUrl);
    }
  });

  // Pass 4: materialise inline images; drop those that cannot be saved.
  let processedInlineImages = 0;
  for (let i = 0; i < dataUriImages.length; i++) {
    const item = dataUriImages[i];
    try {
      const localUrl = await saveDataUriImage(item.src, assetsDir, articleIndex, i);
      if (localUrl) {
        $(item.element).attr('src', localUrl);
        processedInlineImages++;
      } else {
        $(item.element).remove();
      }
    } catch (error) {
      console.log(chalk.yellow(` ⚠️ 内联图片处理失败: ${error.message}`));
      $(item.element).remove();
    }
  }

  const finalHtml = $.root().html() || htmlContent;

  return {
    html: finalHtml,
    replaced: downloadedCount + processedInlineImages
  };
}
|
|
842
|
+
|
|
843
|
+
/**
 * Runs rewriteImagesWithLocalFiles over every successful article result,
 * sharing one download cache across all articles, and reports progress with a spinner.
 *
 * Results that are missing, unsuccessful or without content are passed
 * through untouched, as are results whose image processing throws.
 *
 * @param {object} context - Passed through to rewriteImagesWithLocalFiles.
 * @param {Array<object|null>} contentResults - Article results (`success`, `content`, ...).
 * @param {string} assetsDir - Directory that receives the image files.
 * @returns {Promise<Array<object|null>>} New array, same order, with `content` rewritten where possible.
 */
async function rewriteEpubContentImages(context, contentResults, assetsDir) {
  const imageCache = new Map();
  const stats = { articles: 0, images: 0 };
  const spinner = ora('正在缓存 EPUB 图片...').start();
  const updatedResults = [];

  for (const [index, result] of contentResults.entries()) {
    const usable = Boolean(result && result.success && result.content);
    if (!usable) {
      updatedResults.push(result);
      continue;
    }

    try {
      const outcome = await rewriteImagesWithLocalFiles(context, result.content, assetsDir, index, imageCache);
      stats.images += outcome.replaced;
      if (outcome.replaced > 0) {
        stats.articles++;
      }
      updatedResults.push({ ...result, content: outcome.html });
    } catch (error) {
      // Pause the spinner so the warning is not overwritten by its animation.
      spinner.stop();
      console.log(chalk.yellow(`⚠️ 处理第 ${index + 1} 篇文章图片失败: ${error.message}`));
      spinner.start();
      updatedResults.push(result);
    }
  }

  if (stats.images !== 0) {
    spinner.succeed(`已缓存 EPUB 图片: ${stats.images} 张(${stats.articles} 篇文章)`);
  } else {
    spinner.stop();
    console.log(chalk.gray('📷 没有检测到需要缓存的图片'));
  }

  return updatedResults;
}
|
|
881
|
+
|
|
882
|
+
/**
 * Creates a uniquely named temporary assets directory inside `baseDir`.
 *
 * The name is `<TEMP_ASSET_PREFIX>_<base-36 timestamp>_<random hex>`; missing
 * parent directories are created as well.
 *
 * @param {string} baseDir - Parent directory.
 * @returns {Promise<string>} Path of the created directory.
 */
async function createTempAssetsDir(baseDir) {
  const stamp = Date.now().toString(36);
  const nonce = Math.random().toString(16).slice(2, 8);
  const dirName = `${TEMP_ASSET_PREFIX}_${stamp}_${nonce}`;
  const tempDir = path.join(baseDir, dirName);
  await fs.mkdir(tempDir, { recursive: true });
  return tempDir;
}
|
|
887
|
+
|
|
888
|
+
/**
 * Recursively deletes a temporary assets directory.
 *
 * Does nothing for an empty `dir`. A deletion failure is logged and never thrown.
 *
 * @param {string} dir - Directory to remove.
 * @returns {Promise<void>}
 */
async function cleanupTempAssetsDir(dir) {
  if (!dir) {
    return;
  }
  const removal = { recursive: true, force: true };
  try {
    await fs.rm(dir, removal);
  } catch (error) {
    const warning = `清理临时目录失败: ${error.message}`;
    console.log(chalk.gray(warning));
  }
}
|
|
331
896
|
|
|
332
897
|
async function sanitizeArticleHtml(page, rawHtml) {
|
|
333
|
-
return page.evaluate((html) => {
|
|
898
|
+
return page.evaluate(({ html, removalSelectors, pluginKeywords, mindmapSelectors }) => {
|
|
334
899
|
const template = document.createElement('template');
|
|
335
900
|
template.innerHTML = html;
|
|
336
901
|
|
|
337
|
-
const removalSelectors = [
|
|
338
|
-
'nav', 'header', 'footer', 'aside',
|
|
339
|
-
'.comment', '.comments', '.Index_comment',
|
|
340
|
-
'.recommend', '.recommendation', '.related', '.advertisement', '.ad', '.banner',
|
|
341
|
-
'.subscribe', '.subscription', '.toolbar', '.Index_shareIcons_1vtJa',
|
|
342
|
-
'.keyboard-wrapper', '.app-download', '.article-actions', '.article-bottom',
|
|
343
|
-
'.note', '.notes', '.annotation', '.translation', '.trans', '.translator',
|
|
344
|
-
'.audio', '.audio-player', '.voice', '.player', '.geek-player', '.podcast', '.radio',
|
|
345
|
-
'.reward', '.appreciate', '.appreciation', '.donate', '.sponsor', '.thanks', '.support',
|
|
346
|
-
'.qrcode', '.qr-code', '.qr', '.promotion', '.promo', '.ad-banner',
|
|
347
|
-
'.copyright', '.statement', '.disclaimer',
|
|
348
|
-
'.app-download-banner', '.article-plugin', '.article-notification', '.float-bar',
|
|
349
|
-
'audio', 'video',
|
|
350
|
-
'[class*="Note"]', '[class*="note"]', '[class*="Translation"]', '[class*="translation"]',
|
|
351
|
-
'[class*="Audio"]', '[class*="audio"]', '[class*="Reward"]', '[class*="reward"]',
|
|
352
|
-
'[data-plugin]', '[data-track]', '[data-track-section]', '[data-translation]', '[data-audio]',
|
|
353
|
-
'[data-role="toolbar"]',
|
|
354
|
-
'button', 'iframe', 'script', 'style'
|
|
355
|
-
];
|
|
356
902
|
removalSelectors.forEach(selector => {
|
|
357
903
|
template.content.querySelectorAll(selector).forEach(el => el.remove());
|
|
358
904
|
});
|
|
359
905
|
|
|
360
|
-
const pluginKeywords = [
|
|
361
|
-
'note', 'translation', 'audio', 'player', 'reward', 'donate',
|
|
362
|
-
'appreciation', 'sponsor', 'qrcode', 'toolbar', 'plugin',
|
|
363
|
-
'copyright', 'geeknote', 'bilingual'
|
|
364
|
-
];
|
|
365
906
|
const pluginElements = Array.from(template.content.querySelectorAll('*')).filter(el => {
|
|
366
907
|
const className = (el.className || '').toString().toLowerCase();
|
|
367
908
|
const idValue = (el.id || '').toString().toLowerCase();
|
|
@@ -372,11 +913,6 @@ async function sanitizeArticleHtml(page, rawHtml) {
|
|
|
372
913
|
});
|
|
373
914
|
pluginElements.forEach(el => el.remove());
|
|
374
915
|
|
|
375
|
-
const mindmapSelectors = [
|
|
376
|
-
'.mindmap', '.mind-map', '.MindMap', '.Mind-map',
|
|
377
|
-
'[data-type="mindmap"]', '[data-role="mindmap"]', '[data-widget="mindmap"]',
|
|
378
|
-
'[class*="MindMap"]', '[class*="mindMap"]'
|
|
379
|
-
];
|
|
380
916
|
mindmapSelectors.forEach(selector => {
|
|
381
917
|
template.content.querySelectorAll(selector).forEach(el => el.remove());
|
|
382
918
|
});
|
|
@@ -428,15 +964,23 @@ async function sanitizeArticleHtml(page, rawHtml) {
|
|
|
428
964
|
|
|
429
965
|
const images = template.content ? template.content.querySelectorAll('img') : [];
|
|
430
966
|
images.forEach(img => {
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
}
|
|
967
|
+
img.setAttribute('loading', 'eager');
|
|
968
|
+
img.setAttribute('decoding', 'sync');
|
|
434
969
|
img.style.maxWidth = '100%';
|
|
435
970
|
img.style.height = 'auto';
|
|
436
971
|
});
|
|
437
972
|
|
|
438
973
|
return template.innerHTML;
|
|
439
|
-
},
|
|
974
|
+
}, {
|
|
975
|
+
html: rawHtml,
|
|
976
|
+
removalSelectors: ARTICLE_REMOVAL_SELECTORS,
|
|
977
|
+
pluginKeywords: ARTICLE_PLUGIN_KEYWORDS,
|
|
978
|
+
mindmapSelectors: ARTICLE_MINDMAP_SELECTORS
|
|
979
|
+
});
|
|
980
|
+
}
|
|
981
|
+
|
|
982
|
+
/**
 * Collapses every run of whitespace to a single space and trims both ends.
 *
 * @param {string} [text=''] - Text to normalize.
 * @returns {string} The normalized text.
 */
function normalizeTextContent(text = '') {
  const collapsed = text.split(/\s+/).join(' ');
  return collapsed.trim();
}
|
|
441
985
|
|
|
442
986
|
function escapeHtml(text = '') {
|
|
@@ -448,59 +992,650 @@ function escapeHtml(text = '') {
|
|
|
448
992
|
.replace(/'/g, ''');
|
|
449
993
|
}
|
|
450
994
|
|
|
451
|
-
function
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
.article-print-wrapper h1 {
|
|
469
|
-
font-size: 32px;
|
|
470
|
-
line-height: 1.4;
|
|
471
|
-
margin-bottom: 24px;
|
|
472
|
-
}
|
|
473
|
-
|
|
474
|
-
a {
|
|
475
|
-
color: #0f5ef2;
|
|
476
|
-
text-decoration: none;
|
|
477
|
-
}
|
|
478
|
-
|
|
479
|
-
pre {
|
|
480
|
-
background: #f7f7f7;
|
|
481
|
-
padding: 16px;
|
|
482
|
-
border-radius: 6px;
|
|
483
|
-
overflow: auto;
|
|
995
|
+
/**
 * Removes the first `<h1>`/`<h2>` of `html` when its text equals `title`
 * (both compared after whitespace normalization).
 *
 * @param {string} html - Article HTML.
 * @param {string} [title=''] - Article title.
 * @returns {string} The HTML as re-serialized by the parser, or the input
 *   unchanged when `html`/`title` is empty or parsing fails.
 */
function removeDuplicateTitle(html, title = '') {
  if (!html || !title) return html;

  const wanted = normalizeTextContent(title);
  if (!wanted) return html;

  try {
    const $ = loadHtml(html, { decodeEntities: false });
    const heading = $('h1, h2').first();
    const repeatsTitle = heading.length > 0 && normalizeTextContent(heading.text()) === wanted;
    if (repeatsTitle) {
      heading.remove();
    }
    return $.root().html() || html;
  } catch {
    // Parsing problems must not lose the article: fall back to the input.
    return html;
  }
}
|
|
486
1017
|
|
|
1018
|
+
/**
 * Builds the standalone HTML document that is rendered to PDF.
 *
 * The title and meta line are HTML-escaped; `sanitizedHtml` is inserted
 * verbatim. The page embeds PDF_BASE_CSS, PRINT_FIX_CSS and CODE_HIGHLIGHT_CSS
 * and sets `<base href>` to the site origin.
 *
 * @param {string} title - Article title.
 * @param {string} sanitizedHtml - Article body HTML.
 * @param {string} [articleMeta=''] - Optional meta line shown under the title.
 * @returns {string} Complete HTML document.
 */
function buildPdfHtml(title, sanitizedHtml, articleMeta = '') {
  const styles = `${PDF_BASE_CSS}${PRINT_FIX_CSS}${CODE_HIGHLIGHT_CSS}`;
  let metaBlock = '';
  if (articleMeta) {
    metaBlock = `<div class="article-meta">${escapeHtml(articleMeta)}</div>`;
  }
  return `
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<base href="${GEEKTIME_BASE_URL}">
<style>${styles}</style>
</head>
<body>
<article class="article-pdf-wrapper">
<section class="article-content">
<h1 class="article-title">${escapeHtml(title)}</h1>
${metaBlock}
${sanitizedHtml}
</section>
</article>
</body>
</html>`;
}
|
|
503
1038
|
|
|
1039
|
+
/**
 * Normalizes code snippets in article markup into `<pre class="code-block"><code>…</code></pre>`.
 *
 * Passes, in order:
 *  1. Block-like <code> elements (multi-line, longer than 120 chars, or containing <br>)
 *     that are not already inside a <pre> are wrapped in a <pre>.
 *  2. Every <pre> gets the `code-block` class and an inner <code> if it has none.
 *  3. Elements whose class name hints at code (and that are not in or around a <pre>)
 *     are wrapped when their text looks like code.
 *  4. <figure> elements whose only child is a <pre> are replaced by that <pre>.
 *  5. Highlighter / rich-editor containers are flattened to plain text inside a <pre>.
 *  6. Leftover simplebar scaffolding is unwrapped or removed.
 *
 * @param {string} html - Article body markup (a fragment).
 * @returns {string} Rewritten markup, or the input unchanged if it is empty,
 *   serializes to nothing, or processing throws.
 */
function enhanceCodeBlocks(html) {
  if (!html) return html;
  try {
    // Fragment mode (third argument `false`) keeps cheerio from wrapping the
    // result in <html><head></head><body>.
    const $ = loadHtml(html, { decodeEntities: false }, false);

    // True when the node is a <pre> or has a <pre> ancestor. Tag names in the
    // parsed tree are lowercase, so this is checked with a selector rather
    // than by comparing `tagName` against 'PRE' (which never matched and
    // produced nested <pre> elements).
    const isInsidePre = ($node) => $node.closest('pre').length > 0;

    const wrapCodeElement = ($source, innerHtml) => {
      const wrapper = $('<pre class="code-block"></pre>');
      const codeEl = $('<code></code>').html(innerHtml);
      wrapper.append(codeEl);
      $source.replaceWith(wrapper);
    };

    // Pass 1: promote block-like inline <code> to a real code block.
    $('code').each((_, element) => {
      const $el = $(element);
      if (isInsidePre($el)) {
        return;
      }
      const text = $el.text() || '';
      const markup = $el.html() || '';
      const isBlocky = text.includes('\n') || text.length > 120 || markup.includes('<br');
      if (isBlocky) {
        wrapCodeElement($el, markup);
      }
    });

    // Pass 2: make every <pre> a uniform `pre.code-block > code`.
    $('pre').each((_, element) => {
      const $el = $(element);
      if (!$el.hasClass('code-block')) {
        $el.addClass('code-block');
      }
      if ($el.find('code').length === 0) {
        const inner = $el.html();
        $el.empty().append($('<code></code>').html(inner));
      }
    });

    // Pass 3: class-name heuristics for code containers that are not <pre>/<code>.
    const codeLikeSelectors = [
      '[class*="code"]',
      '[class*="Code"]',
      '[class*="code-block"]',
      '[class*="CodeBlock"]',
      '[class*="hljs"]',
      '[class*="language-"]',
      '.highlight',
      '.prism-code'
    ];
    // Lowercase: these are used as tag selectors against an HTML tree.
    const blockTags = ['p', 'div', 'section', 'article', 'ul', 'ol', 'table', 'img', 'figure'];
    const isLikelyCodeText = (text = '') => {
      const trimmed = text.trim();
      if (trimmed.length === 0) return false;
      if (trimmed.length > 1200) return false;
      // NOTE(review): the last literal is reproduced as a single space, as it appears in the
      // reviewed source; the rendering may have collapsed whitespace — confirm against the file.
      return trimmed.includes('\n') || trimmed.includes('{') || trimmed.includes(';') || trimmed.includes(' ');
    };
    $(codeLikeSelectors.join(',')).each((_, element) => {
      const $el = $(element);
      // Skip anything that is, contains, or already lives inside a <pre>. Without the
      // "inside" check, `<code class="language-x">` and highlighter token spans within
      // an existing block were each rewrapped in their own nested <pre>.
      if (isInsidePre($el) || $el.find('pre').length > 0) {
        return;
      }
      const hasBlockChildren = blockTags.some(tag => $el.find(tag).length > 0);
      if (hasBlockChildren) {
        return;
      }
      const text = $el.text() || '';
      if (!isLikelyCodeText(text)) {
        return;
      }
      wrapCodeElement($el, $el.html());
    });

    // Pass 4: a <figure> that only wraps one <pre> adds nothing; unwrap it.
    $('figure').each((_, element) => {
      const $el = $(element);
      if ($el.find('pre').length === 1 && $el.children().length === 1) {
        $el.replaceWith($el.find('pre').first());
      }
    });

    // Pass 5: flatten highlighter / editor containers to plain text.
    const highlightSelectors = [
      '[class*="hljs"]',
      '[class*="language-"]',
      '.simplebar-content',
      '[data-language]',
      '[data-code-block]',
      '[class*="RichContent"]'
    ];
    const containerClassHints = ['simplebar', 'code', 'hljs', 'prism', 'syntax', 'monaco', 'ace', 'terminal', 'shell'];
    const containerStyleHints = ['white-space: pre', 'white-space:pre', 'font-family: monospace', 'font-family:monospace'];
    const inlineTags = new Set(['span', 'code', 'em', 'strong', 'b', 'i', 'u', 'a', 'label']);
    const newlineTags = new Set(['DIV', 'P', 'LI', 'SECTION', 'ARTICLE', 'FIGURE', 'PRE', 'CODE', 'BR', 'TR', 'TD', 'TH']);

    const looksLikeCodeBlock = (text = '') => {
      if (!text) return false;
      const trimmed = text.trim();
      if (!trimmed) return false;
      if (trimmed.includes('\n')) return true;
      const keywords = ['{', '}', ';', '=>', '->', '#!', 'SELECT ', 'INSERT ', 'docker ', 'kubectl ', 'sudo ', 'printf', 'def ', 'class ', 'function ', 'const ', 'let ', 'var ', 'public ', 'private ', 'import ', 'package ', 'namespace ', 'http '];
      return keywords.some(keyword => trimmed.includes(keyword));
    };

    // Concatenates descendant text, appending '\n' after each element listed in newlineTags.
    const getTextWithBreaks = (node) => {
      if (!node) return '';
      if (node.type === 'text') {
        return node.data || '';
      }
      const tag = (node.tagName || node.name || '').toUpperCase();
      if (!node.children || node.children.length === 0) {
        return newlineTags.has(tag) ? '\n' : '';
      }
      let text = '';
      for (const child of node.children) {
        text += getTextWithBreaks(child);
      }
      if (newlineTags.has(tag)) {
        text += '\n';
      }
      return text;
    };

    // Unifies line endings, strips trailing whitespace per line, trims leading/trailing
    // blank lines and collapses runs of blank lines into one.
    const normalizeCodeText = (text = '') => {
      // NOTE(review): tab replacement is reproduced as a single space, as it appears in the
      // reviewed source; the rendering may have collapsed whitespace — confirm against the file.
      const lines = text
        .replace(/\r\n?/g, '\n')
        .split('\n')
        .map(line => line.replace(/\u00a0/g, ' ').replace(/\t/g, ' ').replace(/\s+$/, ''));
      while (lines.length && !lines[0].trim()) {
        lines.shift();
      }
      while (lines.length && !lines[lines.length - 1].trim()) {
        lines.pop();
      }
      const result = [];
      let previousBlank = false;
      for (const line of lines) {
        const isBlank = line.trim().length === 0;
        if (isBlank && previousBlank) {
          continue;
        }
        result.push(line);
        previousBlank = isBlank;
      }
      return result.join('\n').trim();
    };

    // Replaces $target with a plain-text code block; returns whether it did.
    const convertToCodeBlock = ($target) => {
      if (!$target || !$target.length) {
        return false;
      }
      const rawText = getTextWithBreaks($target[0]) || '';
      const normalized = normalizeCodeText(rawText);
      if (!looksLikeCodeBlock(normalized)) {
        return false;
      }
      const $pre = $('<pre class="code-block"></pre>');
      const $code = $('<code></code>').text(normalized);
      $pre.append($code);
      $target.replaceWith($pre);
      return true;
    };

    const processedCandidates = new Set();
    $(highlightSelectors.join(',')).each((_, node) => {
      const $start = $(node);
      if (!$start || !$start.length) {
        return;
      }
      // Walk up at most 8 levels and keep the OUTERMOST non-inline ancestor whose
      // class or inline style hints at a code container.
      let $candidate = null;
      let $current = $start;
      for (let depth = 0; depth < 8 && $current && $current.length; depth++) {
        const rawTag = ($current[0]?.tagName || $current[0]?.name || '').toLowerCase();
        const classAttr = ($current.attr('class') || '').toLowerCase();
        const styleAttr = ($current.attr('style') || '').toLowerCase();
        const hasClassHint = containerClassHints.some(keyword => classAttr.includes(keyword));
        const hasStyleHint = containerStyleHints.some(keyword => styleAttr.includes(keyword));
        if (!inlineTags.has(rawTag) && (hasClassHint || hasStyleHint)) {
          $candidate = $current;
        }
        $current = $current.parent();
      }
      if (!$candidate || !$candidate.length || $candidate.is('pre')) {
        return;
      }
      const key = $candidate[0];
      if (processedCandidates.has(key)) {
        return;
      }
      if (convertToCodeBlock($candidate)) {
        processedCandidates.add(key);
      }
    });

    // Pass 6: unwrap simplebar scaffolding that wraps a code block or is empty,
    // then drop its scrollbar elements.
    const simplebarWrappers = [
      '.simplebar-wrapper',
      '.simplebar-height-auto-observer-wrapper',
      '.simplebar-height-auto-observer',
      '.simplebar-mask',
      '.simplebar-offset',
      '.simplebar-content-wrapper',
      '.simplebar-placeholder'
    ];
    simplebarWrappers.forEach(selector => {
      $(selector).each((_, element) => {
        const $el = $(element);
        if ($el.find('pre.code-block').length > 0 || !$el.text().trim()) {
          $el.replaceWith($el.contents());
        }
      });
    });
    $('.simplebar-track, .simplebar-scrollbar').remove();

    return $.root().html() || html;
  } catch {
    // Best effort: formatting problems must never block the download.
    return html;
  }
}
|
|
1240
|
+
|
|
1241
|
+
/**
 * Scans the visible text of the current page for login / paywall / permission
 * prompts.
 *
 * @param {object} page - Object exposing `evaluate(fn)`; the callback reads `document.body.innerText`.
 * @returns {Promise<string|null>} The message of the first rule (in declaration
 *   order) with a keyword found in the page text, or null when the page has
 *   no text or no rule matches.
 */
async function detectAccessIssuesOnPage(page) {
  return page.evaluate(() => {
    const visibleText = document.body?.innerText || '';
    // Collapse all whitespace runs so the text is a single line.
    const flattened = visibleText.replace(/\s+/g, ' ').trim();
    if (!flattened) {
      return null;
    }

    const rules = [
      {
        message: '页面提示需要登录,Cookie 可能已失效或未正确导入',
        keywords: ['请先登录', '重新登录', '立即登录', '登录后']
      },
      {
        message: '检测到购买/试看提示,可能未订阅该专栏或 Cookie 已失效',
        keywords: ['试看结束', '购买专栏', '立即订阅', '购买课程', '仅对付费用户开放', '开通会员']
      },
      {
        message: '账号没有访问该专栏的权限',
        keywords: ['暂无权限', '没有权限', '权限不足']
      }
    ];

    const haystack = flattened.toLowerCase();
    const hit = rules.find(rule =>
      rule.keywords.some(word => haystack.includes(word.toLowerCase()))
    );
    return hit ? hit.message : null;
  });
}
|
|
1278
|
+
|
|
1279
|
+
/**
 * Polls the page until one of ARTICLE_CONTENT_SELECTORS matches an element.
 *
 * Selectors are probed one after another in list order; between rounds the
 * function waits 300 ms via `page.waitForTimeout`.
 *
 * @param {object} page - Object exposing `$(selector)` and `waitForTimeout(ms)`.
 * @param {number} [timeout=60000] - Polling budget in milliseconds.
 * @returns {Promise<string|null>} The first selector that matched, or null if
 *   the budget ran out.
 */
async function waitForArticleContentSelector(page, timeout = 60000) {
  const startedAt = Date.now();
  const withinBudget = () => (Date.now() - startedAt) < timeout;

  // One probing round: first selector with a live element wins.
  const probeOnce = async () => {
    for (const candidate of ARTICLE_CONTENT_SELECTORS) {
      const handle = await page.$(candidate);
      if (!handle) {
        continue;
      }
      // Only presence matters; release the handle right away.
      await handle.dispose();
      return candidate;
    }
    return null;
  };

  while (withinBudget()) {
    const matched = await probeOnce();
    if (matched !== null) {
      return matched;
    }
    await page.waitForTimeout(300);
  }
  return null;
}
|
|
1293
|
+
|
|
1294
|
+
/**
 * Scrolls the page downwards in fixed steps, then jumps back to the top.
 *
 * Scrolling stops as soon as the viewport bottom is within 50px of the
 * document height or after `maxIterations` steps, whichever comes first.
 *
 * @param {object} page - Object exposing `evaluate(fn, arg)`; the callback uses `window` and `document`.
 * @param {object} [options]
 * @param {number} [options.step=400] - Pixels scrolled per tick.
 * @param {number} [options.delay=120] - Milliseconds between ticks.
 * @param {number} [options.maxIterations=80] - Upper bound on the number of ticks.
 * @returns {Promise<void>} Resolves once scrolling has finished.
 */
async function autoScrollArticle(page, { step = 400, delay = 120, maxIterations = 80 } = {}) {
  await page.evaluate(({ step, delay, maxIterations }) => {
    return new Promise((resolve) => {
      let iterations = 0;
      let timer = null;

      const finish = () => {
        clearInterval(timer);
        try {
          window.scrollTo(0, 0);
        } catch {
          // Resetting the scroll position is cosmetic; never block on it.
        }
        resolve();
      };

      timer = setInterval(() => {
        try {
          window.scrollBy(0, step);
          iterations += 1;
          // `document.body` can be null; fall back to the root element and
          // finally to 0, which ends the scroll immediately.
          const scrollRoot = document.body || document.documentElement;
          const pageHeight = scrollRoot ? scrollRoot.scrollHeight : 0;
          const reachedBottom = window.scrollY + window.innerHeight >= pageHeight - 50;
          if (reachedBottom || iterations >= maxIterations) {
            finish();
          }
        } catch {
          // An exception inside the timer callback would otherwise leave this
          // promise pending forever and hang page.evaluate().
          finish();
        }
      }, delay);
    });
  }, { step, delay, maxIterations });
}
|
|
1311
|
+
|
|
1312
|
+
/**
 * Opens an article page and returns its cleaned body markup.
 *
 * Steps: navigate, best-effort wait for network idle (capped at 10 s),
 * auto-scroll, wait for one of the article content selectors, clone and strip
 * the content node in the page, then run the result through
 * normalizeArticleHtml, sanitizeArticleHtml, removeDuplicateTitle and
 * enhanceCodeBlocks.
 *
 * @param {object} page - Playwright-style page.
 * @param {{id: (number|string), url?: string, title?: string, originalTitle?: string}} article
 *   Article descriptor; `url` falls back to `${GEEKTIME_BASE_URL}/column/article/${id}`.
 * @param {number} [timeout=60000] - Timeout in ms for navigation and for the selector wait.
 * @returns {Promise<string>} Cleaned article markup.
 * @throws {Error} When navigation fails, the response is not OK, the content
 *   node is missing or nearly empty, or sanitizing leaves nothing. Errors that
 *   wrap a lower-level failure carry it as `cause`.
 */
async function fetchArticleContentFromPage(page, article, timeout = 60000) {
  const targetUrl = article.url || `${GEEKTIME_BASE_URL}/column/article/${article.id}`;

  // Prefer a specific access diagnosis (login / paywall / permission) over
  // the generic fallback message.
  const failWithAccessHint = async (fallbackMessage) => {
    const issue = await detectAccessIssuesOnPage(page);
    throw new Error(issue || fallbackMessage);
  };

  let response;
  try {
    response = await page.goto(targetUrl, { waitUntil: 'domcontentloaded', timeout });
  } catch (error) {
    throw new Error(`页面加载失败: ${error.message}`, { cause: error });
  }

  if (response && !response.ok()) {
    throw new Error(`页面响应异常: HTTP ${response.status()} ${response.statusText()}`);
  }

  try {
    await page.waitForLoadState('networkidle', { timeout: Math.min(10000, timeout) });
  } catch {
    // Some pages never go idle (or issue no further requests); this wait is optional.
  }

  await autoScrollArticle(page);
  await page.waitForTimeout(500);

  const selector = await waitForArticleContentSelector(page, timeout);
  if (!selector) {
    await failWithAccessHint('未能定位到文章正文,请重试或检查 Cookie 是否有效');
  }

  let extraction;
  try {
    // Runs in the page: must be self-contained (no references to outer scope).
    extraction = await page.$eval(selector, (el) => {
      const clone = el.cloneNode(true);

      // Non-article chrome: share bars, players, recommendations, comments.
      const removalSelectors = [
        '.article-share',
        '.article-actions',
        '.article-copyright',
        '.article-bottom',
        '.reward',
        '.share',
        '.Index_recommend',
        '.recommend',
        '.audio-player',
        '.AudioPlayer',
        '.voice-player',
        '.VoicePlayer',
        '.audio-wrapper',
        '.AudioWrapper',
        '.geek-player',
        '.Player',
        '.plugin',
        '.Plugin',
        '[data-widget="audio"]',
        '[data-widget="Audio"]',
        '[data-role="audio"]',
        '.comment-area',
        '.CommentArea',
        '.comment-wrapper',
        '.CommentWrapper',
        '#comments',
        '#comment',
        '.comments',
        '.Comments'
      ];
      removalSelectors.forEach(sel => {
        clone.querySelectorAll(sel).forEach(node => node.remove());
      });

      // Returns an absolute URL, the value itself for data:/http(s) URLs, or ''
      // for empty, blob: and unparsable values.
      const toAbsoluteUrl = (value) => {
        if (!value || typeof value !== 'string') {
          return '';
        }
        const trimmed = value.trim();
        if (!trimmed) {
          return '';
        }
        if (trimmed.startsWith('blob:')) {
          return '';
        }
        if (trimmed.startsWith('data:')) {
          return trimmed;
        }
        if (/^https?:/i.test(trimmed)) {
          return trimmed;
        }
        if (trimmed.startsWith('//')) {
          return `${location.protocol}${trimmed}`;
        }
        try {
          const url = new URL(trimmed, location.href);
          return url.href;
        } catch {
          return '';
        }
      };

      // Attributes consulted, in order, when `src` yields no usable URL.
      const imageFallbackAttrs = [
        'data-src',
        'data-original',
        'data-actualsrc',
        'data-url',
        'data-image',
        'data-origin',
        'data-thumbnail',
        'data-bigimgsrc',
        'data-download',
        'data-href'
      ];

      clone.querySelectorAll('img').forEach(img => {
        let finalSrc = toAbsoluteUrl(img.getAttribute('src'));
        if (!finalSrc) {
          for (const attr of imageFallbackAttrs) {
            const candidate = toAbsoluteUrl(img.getAttribute(attr));
            if (candidate) {
              finalSrc = candidate;
              break;
            }
          }
        }

        if (!finalSrc) {
          // No resolvable source at all: drop the image.
          img.remove();
        } else {
          img.setAttribute('src', finalSrc);
        }
      });

      const textLength = clone.innerText ? clone.innerText.trim().length : 0;
      return {
        html: clone.innerHTML,
        textLength
      };
    });
  } catch (error) {
    throw new Error(`读取文章内容失败: ${error.message}`, { cause: error });
  }

  // Fewer than 20 characters of text is treated as "no article".
  if (!extraction || !extraction.html || extraction.textLength < 20) {
    await failWithAccessHint('正文内容为空,可能是 Cookie 失效或只获取到试看内容');
  }

  const normalizedHtml = normalizeArticleHtml(extraction.html);
  const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);

  if (!sanitizedHtml || sanitizedHtml.trim().length === 0) {
    throw new Error('正文清洗后为空,可能是页面结构变化');
  }

  const cleaned = removeDuplicateTitle(sanitizedHtml, article.originalTitle || article.title || '');
  return enhanceCodeBlocks(cleaned);
}
|
|
1470
|
+
|
|
1471
|
+
/**
 * Decides from an error message whether fetching the article is worth retrying.
 *
 * Messages mentioning cookies, login, subscription, trial, purchase or
 * permissions are treated as permanent; the match is a case-insensitive
 * substring test. An empty message counts as retryable.
 *
 * @param {string} [message=''] - Error message to classify.
 * @returns {boolean} false when the message contains any permanent-failure marker.
 */
function isRetryableContentError(message = '') {
  if (!message) {
    return true;
  }
  const haystack = message.toLowerCase();
  const permanentMarkers = [
    'cookie', '登录', '登陆', '订阅', '试看', '权限', '购买', '未授权', '无权限'
  ];
  for (const marker of permanentMarkers) {
    if (haystack.includes(marker)) {
      return false;
    }
  }
  return true;
}
|
|
1479
|
+
|
|
1480
|
+
/**
 * Fetches an article's content, retrying failures that look transient.
 *
 * Before every attempt after the first the page is given 400 ms to settle.
 * After a retryable failure the page is sent to about:blank (errors ignored)
 * and the function waits `delayMs * attempt` ms before the next attempt.
 *
 * @param {object} page - Playwright-style page.
 * @param {object} article - Article descriptor handed to fetchArticleContentFromPage.
 * @param {object} [options]
 * @param {number} [options.timeout=60000] - Per-attempt timeout in ms.
 * @param {number} [options.maxAttempts=3] - Maximum number of attempts.
 * @param {number} [options.delayMs=1500] - Base back-off in ms, multiplied by the attempt number.
 * @returns {Promise<string>} Cleaned article markup.
 * @throws {Error} The failure of the last attempt, or the first one that
 *   isRetryableContentError classifies as permanent.
 */
async function fetchArticleContentWithRetry(page, article, options = {}) {
  const { timeout = 60000, maxAttempts = 3, delayMs = 1500 } = options;

  // Leave the failed page so the next attempt starts from a clean document.
  const resetPage = async () => {
    try {
      await page.goto('about:blank', { waitUntil: 'domcontentloaded', timeout: 5000 });
    } catch {
      // The reset is optional; a failure here must not mask the real error.
    }
  };

  let lastError = null;
  let attempt = 0;

  while (attempt < maxAttempts) {
    attempt += 1;
    try {
      if (attempt > 1) {
        await page.waitForTimeout(400);
      }
      return await fetchArticleContentFromPage(page, article, timeout);
    } catch (error) {
      lastError = error;
      const message = error?.message || '';
      const retryable = isRetryableContentError(message);
      const exhausted = attempt === maxAttempts;
      if (!retryable || exhausted) {
        throw error;
      }

      const waitTime = delayMs * attempt;
      if (process.env.DEBUG) {
        console.log(chalk.gray(`重试文章 ${article.id} (第${attempt}次失败: ${message}),等待 ${waitTime}ms`));
      }
      await resetPage();
      await page.waitForTimeout(waitTime);
    }
  }

  // Only reached when no attempt ran or threw its own error.
  throw lastError || new Error('无法获取文章内容');
}
|
|
1516
|
+
|
|
1517
|
+
/**
 * Builds the article list from links found in the page DOM.
 *
 * Anchors whose href contains `/column/article/` are collected (catalog-like
 * containers first, then any anchor), de-duplicated by element and then by
 * article id. Each entry uses the field names of the article-list API items.
 *
 * @param {object} page - Object exposing `evaluate(fn, arg)`; the callback uses `document`.
 * @returns {Promise<Array<{id: number, article_title: string, article_sharetitle: string,
 *   url: string, section_name: string, chapter_index: number, originalIndex: number}>>}
 *   Articles in DOM discovery order; empty when no matching link exists.
 *   `originalIndex` / `chapter_index` are the anchor's position (0- / 1-based)
 *   among all collected anchors, so they may have gaps.
 */
async function extractArticlesFromPageDom(page) {
  return page.evaluate((baseUrl) => {
    const linkSelectors = [
      '[class*="catalog"] a[href*="/column/article/"]',
      '[class*="directory"] a[href*="/column/article/"]',
      '[class*="Catalogue"] a[href*="/column/article/"]',
      '[class*="Catalog"] a[href*="/column/article/"]',
      'nav a[href*="/column/article/"]',
      'a[href*="/column/article/"]'
    ];

    // A Set keeps first-seen order while dropping anchors matched by several selectors.
    const uniqueAnchors = new Set();
    for (const css of linkSelectors) {
      for (const node of document.querySelectorAll(css)) {
        uniqueAnchors.add(node);
      }
    }

    const squash = (text) => (text || '').replace(/\s+/g, ' ').trim();

    const resolveUrl = (href) => {
      try {
        return new URL(href, baseUrl).toString();
      } catch {
        return href.startsWith('/') ? `${baseUrl.replace(/\/$/, '')}${href}` : href;
      }
    };

    const readTitle = (anchor, id) => {
      const direct = squash(anchor.innerText || anchor.textContent || anchor.getAttribute('title') || '');
      if (direct) {
        return direct;
      }
      const titleNode = anchor.querySelector('[class*="title"], span, div');
      const nested = titleNode ? squash(titleNode.textContent) : '';
      return nested || `文章_${id}`;
    };

    const readSectionName = (anchor) => {
      const sectionNode = anchor.closest('[data-section],[data-chapter],[class*="section"],[class*="Section"],[class*="chapter"],[class*="Chapter"]');
      if (!sectionNode) {
        return '';
      }
      return squash(
        sectionNode.getAttribute('data-section') ||
        sectionNode.getAttribute('data-chapter') ||
        sectionNode.getAttribute('data-title') ||
        sectionNode.querySelector('h2, h3, h4, .title, .section-title')?.textContent ||
        ''
      );
    };

    const knownIds = new Set();
    const found = [];
    for (const [position, anchor] of Array.from(uniqueAnchors).entries()) {
      const href = anchor.getAttribute('href') || '';
      const idMatch = href.match(/column\/article\/(\d+)/i);
      if (!idMatch) {
        continue;
      }
      const id = parseInt(idMatch[1], 10);
      if (!id || knownIds.has(id)) {
        continue;
      }
      knownIds.add(id);

      const title = readTitle(anchor, id);
      found.push({
        id,
        article_title: title,
        article_sharetitle: title,
        url: resolveUrl(href),
        section_name: readSectionName(anchor),
        chapter_index: position + 1,
        originalIndex: position
      });
    }
    return found;
  }, GEEKTIME_BASE_URL);
}
|
|
1608
|
+
|
|
1609
|
+
/**
 * Reads the column author's name from the current page.
 *
 * Tries a list of author/teacher class selectors in order and returns the
 * trimmed text of the first element that has any; otherwise falls back to
 * `<meta name="author">`.
 *
 * @param {object} page - Object exposing `evaluate(fn)`; the callback uses `document`.
 * @returns {Promise<string|null>} The author name, or null when nothing is
 *   found or the evaluation throws.
 */
async function extractColumnAuthorFromPage(page) {
  try {
    return await page.evaluate(() => {
      const authorSelectors = [
        '.author-name',
        '.author',
        '.teacher-name',
        '.lecturer-name',
        '.Index_teacherName',
        '.ProductHeader_teacherName',
        '.ColumnIntro_teacher__name',
        '.ColumnIntro_author__name'
      ];
      for (const css of authorSelectors) {
        const name = document.querySelector(css)?.textContent?.trim();
        if (name) {
          return name;
        }
      }
      const meta = document.querySelector('meta[name="author"]');
      return meta?.content ? meta.content.trim() : null;
    });
  } catch {
    // The author is optional metadata; callers supply their own default.
    return null;
  }
}
|
|
1638
|
+
|
|
504
1639
|
// 获取专栏所有文章列表(通过API)
|
|
505
1640
|
function getValueByPath(obj, path) {
|
|
506
1641
|
if (!obj || !path) return undefined;
|
|
@@ -603,45 +1738,53 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
|
|
|
603
1738
|
let columnInfoHandler = null;
|
|
604
1739
|
|
|
605
1740
|
// 用于同步的 Promise
|
|
606
|
-
const articlesPromise =
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
1741
|
+
const articlesPromise = Promise.race([
|
|
1742
|
+
new Promise((resolve) => {
|
|
1743
|
+
articlesHandler = async (response) => {
|
|
1744
|
+
const url = response.url();
|
|
1745
|
+
// 监听文章列表 API
|
|
1746
|
+
if (url.includes('/serv/v1/column/articles')) {
|
|
1747
|
+
try {
|
|
1748
|
+
const data = await response.json();
|
|
1749
|
+
if (process.env.DEBUG) {
|
|
1750
|
+
console.log(chalk.gray('\n收到文章列表API响应'));
|
|
1751
|
+
}
|
|
1752
|
+
resolve(data);
|
|
1753
|
+
} catch (e) {
|
|
1754
|
+
console.error('解析文章列表API失败:', e);
|
|
1755
|
+
resolve(null);
|
|
615
1756
|
}
|
|
616
|
-
resolve(data);
|
|
617
|
-
} catch (e) {
|
|
618
|
-
console.error('解析文章列表API失败:', e);
|
|
619
1757
|
}
|
|
620
|
-
}
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
url.includes('/serv/v1/column/
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
1758
|
+
};
|
|
1759
|
+
page.on('response', articlesHandler);
|
|
1760
|
+
}),
|
|
1761
|
+
new Promise(resolve => setTimeout(() => resolve(null), 30000))
|
|
1762
|
+
]);
|
|
1763
|
+
|
|
1764
|
+
const columnInfoPromise = Promise.race([
|
|
1765
|
+
new Promise((resolve) => {
|
|
1766
|
+
columnInfoHandler = async (response) => {
|
|
1767
|
+
const url = response.url();
|
|
1768
|
+
// 监听专栏详情相关的 API
|
|
1769
|
+
if (url.includes('/serv/v1/column/intro') ||
|
|
1770
|
+
url.includes('/serv/v3/column/info') ||
|
|
1771
|
+
url.includes('/serv/v1/column/detail')) {
|
|
1772
|
+
try {
|
|
1773
|
+
const data = await response.json();
|
|
1774
|
+
if (process.env.DEBUG) {
|
|
1775
|
+
console.log(chalk.gray(`收到专栏信息API响应: ${url}`));
|
|
1776
|
+
}
|
|
1777
|
+
resolve(data);
|
|
1778
|
+
} catch (e) {
|
|
1779
|
+
console.error('解析专栏信息API失败:', e);
|
|
1780
|
+
resolve(null);
|
|
636
1781
|
}
|
|
637
|
-
resolve(data);
|
|
638
|
-
} catch (e) {
|
|
639
|
-
console.error('解析专栏信息API失败:', e);
|
|
640
1782
|
}
|
|
641
|
-
}
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
1783
|
+
};
|
|
1784
|
+
page.on('response', columnInfoHandler);
|
|
1785
|
+
}),
|
|
1786
|
+
new Promise(resolve => setTimeout(() => resolve(null), 5000))
|
|
1787
|
+
]);
|
|
645
1788
|
|
|
646
1789
|
try {
|
|
647
1790
|
// 先设置监听器,再访问页面
|
|
@@ -650,23 +1793,13 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
|
|
|
650
1793
|
|
|
651
1794
|
spinner.text = '正在获取文章列表...';
|
|
652
1795
|
|
|
653
|
-
// 等待文章列表 API
|
|
654
|
-
articlesData = await
|
|
655
|
-
articlesPromise,
|
|
656
|
-
new Promise((_, reject) => setTimeout(() => reject(new Error('文章列表API调用超时')), 30000))
|
|
657
|
-
]);
|
|
1796
|
+
// 等待文章列表 API(如果失败将返回 null)
|
|
1797
|
+
articlesData = await articlesPromise;
|
|
658
1798
|
|
|
659
|
-
// 尝试等待专栏信息 API
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
new Promise((_, reject) => setTimeout(() => reject(new Error('timeout')), 5000))
|
|
664
|
-
]);
|
|
665
|
-
} catch (e) {
|
|
666
|
-
// 获取专栏信息失败不是致命错误
|
|
667
|
-
if (process.env.DEBUG) {
|
|
668
|
-
console.log(chalk.gray('未获取到专栏信息API响应(将使用其他方法)'));
|
|
669
|
-
}
|
|
1799
|
+
// 尝试等待专栏信息 API(可选)
|
|
1800
|
+
columnInfoData = await columnInfoPromise;
|
|
1801
|
+
if (!columnInfoData && process.env.DEBUG) {
|
|
1802
|
+
console.log(chalk.gray('未获取到专栏信息API响应(将使用其他方法)'));
|
|
670
1803
|
}
|
|
671
1804
|
|
|
672
1805
|
} catch (error) {
|
|
@@ -695,32 +1828,47 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
|
|
|
695
1828
|
}
|
|
696
1829
|
}
|
|
697
1830
|
|
|
698
|
-
|
|
699
|
-
|
|
1831
|
+
let useDomExtraction = false;
|
|
1832
|
+
let domArticles = [];
|
|
700
1833
|
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
console.log(chalk.red('\n❌ Cookie 已失效\n'));
|
|
710
|
-
console.log(chalk.cyan('📖 请重新获取 Cookie:'));
|
|
711
|
-
console.log(chalk.gray(' 1. 浏览器登录极客时间'));
|
|
712
|
-
console.log(chalk.gray(' 2. 按 F12 打开开发者工具'));
|
|
713
|
-
console.log(chalk.gray(' 3. Network 标签 → 刷新页面'));
|
|
714
|
-
console.log(chalk.gray(' 4. 点击任意请求 → 复制 Cookie\n'));
|
|
715
|
-
} else if (articlesData.error) {
|
|
716
|
-
console.log(chalk.yellow(`\n⚠️ API 返回错误: ${articlesData.error.msg || articlesData.error}\n`));
|
|
1834
|
+
if (!articlesData || !articlesData.data || !Array.isArray(articlesData.data.list) || articlesData.data.list.length === 0) {
|
|
1835
|
+
spinner.text = 'API 不可用,尝试从页面解析文章列表...';
|
|
1836
|
+
try {
|
|
1837
|
+
domArticles = await extractArticlesFromPageDom(page);
|
|
1838
|
+
} catch (error) {
|
|
1839
|
+
if (process.env.DEBUG) {
|
|
1840
|
+
console.log(chalk.gray(`DOM文章提取失败: ${error.message}`));
|
|
1841
|
+
}
|
|
717
1842
|
}
|
|
718
1843
|
|
|
719
|
-
|
|
1844
|
+
if (!domArticles || domArticles.length === 0) {
|
|
1845
|
+
spinner.fail('无法获取文章列表');
|
|
1846
|
+
|
|
1847
|
+
if (!articlesData) {
|
|
1848
|
+
console.log(chalk.yellow('\n⚠️ 未能从接口或页面获取文章列表\n'));
|
|
1849
|
+
console.log(chalk.cyan('可能的原因:'));
|
|
1850
|
+
console.log(chalk.gray(' 1. Cookie 已过期或无效 - 请重新获取 Cookie'));
|
|
1851
|
+
console.log(chalk.gray(' 2. 页面结构发生变化 - 请联系开发者更新解析逻辑'));
|
|
1852
|
+
console.log(chalk.gray(' 3. 网络连接问题或URL无效\n'));
|
|
1853
|
+
} else if (articlesData.code === -3000 || articlesData.code === -3001) {
|
|
1854
|
+
console.log(chalk.red('\n❌ Cookie 已失效\n'));
|
|
1855
|
+
console.log(chalk.cyan('📖 请重新获取 Cookie:'));
|
|
1856
|
+
console.log(chalk.gray(' 1. 浏览器登录极客时间'));
|
|
1857
|
+
console.log(chalk.gray(' 2. 按 F12 打开开发者工具'));
|
|
1858
|
+
console.log(chalk.gray(' 3. Network 标签 → 刷新页面'));
|
|
1859
|
+
console.log(chalk.gray(' 4. 点击任意请求 → 复制 Cookie\n'));
|
|
1860
|
+
} else if (articlesData.error) {
|
|
1861
|
+
console.log(chalk.yellow(`\n⚠️ API 返回错误: ${articlesData.error.msg || articlesData.error}\n`));
|
|
1862
|
+
}
|
|
1863
|
+
|
|
1864
|
+
return { articles: [], columnTitle: 'unknown', columnAuthor: '极客时间' };
|
|
1865
|
+
}
|
|
1866
|
+
|
|
1867
|
+
useDomExtraction = true;
|
|
720
1868
|
}
|
|
721
1869
|
|
|
722
1870
|
// 调试信息:记录完整的API响应结构(仅在环境变量DEBUG存在时)
|
|
723
|
-
if (process.env.DEBUG) {
|
|
1871
|
+
if (!useDomExtraction && process.env.DEBUG) {
|
|
724
1872
|
console.log(chalk.gray('\n========== 文章列表 API 响应数据 =========='));
|
|
725
1873
|
console.log(chalk.gray(JSON.stringify(articlesData.data, null, 2)));
|
|
726
1874
|
if (columnInfoData) {
|
|
@@ -743,7 +1891,7 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
|
|
|
743
1891
|
}
|
|
744
1892
|
|
|
745
1893
|
// 方法2: 从文章列表 API 数据中获取
|
|
746
|
-
if (!columnTitle || columnTitle === '专栏' || columnTitle === '极客时间') {
|
|
1894
|
+
if ((!columnTitle || columnTitle === '专栏' || columnTitle === '极客时间') && articlesData && articlesData.data) {
|
|
747
1895
|
columnTitle = articlesData.data.column_title
|
|
748
1896
|
|| articlesData.data.column_subtitle
|
|
749
1897
|
|| articlesData.data.title
|
|
@@ -827,10 +1975,15 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
|
|
|
827
1975
|
console.log(chalk.gray(` 提取的专栏名: ${columnTitle}\n`));
|
|
828
1976
|
}
|
|
829
1977
|
|
|
830
|
-
|
|
1978
|
+
let columnAuthor = '极客时间';
|
|
1979
|
+
if (!useDomExtraction && articlesData) {
|
|
1980
|
+
columnAuthor = extractColumnAuthor(columnInfoData, articlesData) || '极客时间';
|
|
1981
|
+
} else {
|
|
1982
|
+
columnAuthor = await extractColumnAuthorFromPage(page) || '极客时间';
|
|
1983
|
+
}
|
|
831
1984
|
|
|
832
1985
|
// 解析文章列表
|
|
833
|
-
const rawArticles = articlesData.data.list;
|
|
1986
|
+
const rawArticles = useDomExtraction ? domArticles : (articlesData.data.list || []);
|
|
834
1987
|
|
|
835
1988
|
const articles = rawArticles.map((article, index) => {
|
|
836
1989
|
const title = article.article_title || article.article_sharetitle || 'Untitled';
|
|
@@ -845,7 +1998,7 @@ async function getArticleList(page, columnUrl, timeout = 60000) {
|
|
|
845
1998
|
|
|
846
1999
|
return {
|
|
847
2000
|
title: cleanTitle,
|
|
848
|
-
url:
|
|
2001
|
+
url: article.url || `${GEEKTIME_BASE_URL}/column/article/${id}`,
|
|
849
2002
|
originalTitle: title,
|
|
850
2003
|
id: id,
|
|
851
2004
|
sectionName: article.section_name || '',
|
|
@@ -889,7 +2042,7 @@ async function downloadWithConcurrency(context, articles, outputDir, concurrency
|
|
|
889
2042
|
const article = articles[index];
|
|
890
2043
|
|
|
891
2044
|
try {
|
|
892
|
-
const result = await downloadArticleSilent(page, article, outputDir, index + 1, total);
|
|
2045
|
+
const result = await downloadArticleSilent(page, article, outputDir, index + 1, total, timeout);
|
|
893
2046
|
results[index] = result;
|
|
894
2047
|
completed++;
|
|
895
2048
|
|
|
@@ -944,36 +2097,20 @@ async function downloadWithConcurrency(context, articles, outputDir, concurrency
|
|
|
944
2097
|
}
|
|
945
2098
|
|
|
946
2099
|
// 下载单篇文章为 PDF(静默模式,不显示单独的spinner)
|
|
947
|
-
async function downloadArticleSilent(page, article, outputDir, index, total) {
|
|
2100
|
+
async function downloadArticleSilent(page, article, outputDir, index, total, timeout = 60000) {
|
|
948
2101
|
try {
|
|
949
2102
|
if (process.env.DEBUG) {
|
|
950
2103
|
console.log(chalk.gray(`[silent] 准备处理文章 ${article.id} - ${article.originalTitle || article.title}`));
|
|
951
2104
|
}
|
|
952
|
-
const
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
}
|
|
956
|
-
const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
|
|
957
|
-
const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
|
|
958
|
-
if (process.env.DEBUG) {
|
|
959
|
-
console.log(chalk.gray(`[silent] 已完成内容清洗 ${article.id}`));
|
|
960
|
-
}
|
|
961
|
-
const printableHtml = buildPrintableHtml(article.originalTitle || article.title, sanitizedHtml);
|
|
2105
|
+
const sanitizedHtml = await fetchArticleContentWithRetry(page, article, { timeout });
|
|
2106
|
+
const meta = article.sectionName ? `章节:${article.sectionName}` : '';
|
|
2107
|
+
const printableHtml = buildPdfHtml(article.originalTitle || article.title, sanitizedHtml, meta);
|
|
962
2108
|
|
|
963
2109
|
await page.setContent(printableHtml, { waitUntil: 'domcontentloaded' });
|
|
964
|
-
if (process.env.DEBUG) {
|
|
965
|
-
console.log(chalk.gray(`[silent] 已设置页面内容 ${article.id}`));
|
|
966
|
-
}
|
|
967
2110
|
try {
|
|
968
2111
|
await page.waitForLoadState('networkidle', { timeout: 5000 });
|
|
969
|
-
if (process.env.DEBUG) {
|
|
970
|
-
console.log(chalk.gray(`[silent] networkidle 完成 ${article.id}`));
|
|
971
|
-
}
|
|
972
2112
|
} catch {
|
|
973
|
-
//
|
|
974
|
-
if (process.env.DEBUG) {
|
|
975
|
-
console.log(chalk.gray(`[silent] networkidle 超时(已忽略) ${article.id}`));
|
|
976
|
-
}
|
|
2113
|
+
// ignore
|
|
977
2114
|
}
|
|
978
2115
|
|
|
979
2116
|
// 优化图片大小:将大图片转换为合适的尺寸,减小PDF体积
|
|
@@ -991,7 +2128,7 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
|
|
|
991
2128
|
resolve();
|
|
992
2129
|
}
|
|
993
2130
|
};
|
|
994
|
-
const attachTimeout = () => setTimeout(safeResolve,
|
|
2131
|
+
const attachTimeout = () => setTimeout(safeResolve, 15000);
|
|
995
2132
|
let fallbackTimer = null;
|
|
996
2133
|
|
|
997
2134
|
// 如果图片还未加载完成,等待加载
|
|
@@ -1065,7 +2202,7 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
|
|
|
1065
2202
|
}
|
|
1066
2203
|
|
|
1067
2204
|
// 等待图片处理完成
|
|
1068
|
-
await page.waitForTimeout(
|
|
2205
|
+
await page.waitForTimeout(1200);
|
|
1069
2206
|
if (process.env.DEBUG) {
|
|
1070
2207
|
console.log(chalk.gray(`[silent] 已准备生成PDF ${article.id}`));
|
|
1071
2208
|
}
|
|
@@ -1083,7 +2220,7 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
|
|
|
1083
2220
|
bottom: '20mm',
|
|
1084
2221
|
left: '15mm'
|
|
1085
2222
|
},
|
|
1086
|
-
printBackground:
|
|
2223
|
+
printBackground: true,
|
|
1087
2224
|
preferCSSPageSize: false
|
|
1088
2225
|
});
|
|
1089
2226
|
if (process.env.DEBUG) {
|
|
@@ -1101,20 +2238,19 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
|
|
|
1101
2238
|
}
|
|
1102
2239
|
|
|
1103
2240
|
// 下载单篇文章为 PDF
|
|
1104
|
-
async function downloadArticle(page, article, outputDir, index, total) {
|
|
2241
|
+
async function downloadArticle(page, article, outputDir, index, total, timeout = 60000) {
|
|
1105
2242
|
const spinner = ora(`[${index}/${total}] 正在下载: ${article.title}`).start();
|
|
1106
2243
|
|
|
1107
2244
|
try {
|
|
1108
|
-
const
|
|
1109
|
-
const
|
|
1110
|
-
const
|
|
1111
|
-
const printableHtml = buildPrintableHtml(article.originalTitle || article.title, sanitizedHtml);
|
|
2245
|
+
const sanitizedHtml = await fetchArticleContentWithRetry(page, article, { timeout });
|
|
2246
|
+
const meta = article.sectionName ? `章节:${article.sectionName}` : '';
|
|
2247
|
+
const printableHtml = buildPdfHtml(article.originalTitle || article.title, sanitizedHtml, meta);
|
|
1112
2248
|
|
|
1113
2249
|
await page.setContent(printableHtml, { waitUntil: 'domcontentloaded' });
|
|
1114
2250
|
try {
|
|
1115
2251
|
await page.waitForLoadState('networkidle', { timeout: 5000 });
|
|
1116
2252
|
} catch {
|
|
1117
|
-
//
|
|
2253
|
+
// 忽略
|
|
1118
2254
|
}
|
|
1119
2255
|
|
|
1120
2256
|
// 优化图片大小:将大图片转换为合适的尺寸,减小PDF体积
|
|
@@ -1189,7 +2325,7 @@ async function downloadArticle(page, article, outputDir, index, total) {
|
|
|
1189
2325
|
bottom: '20mm',
|
|
1190
2326
|
left: '15mm'
|
|
1191
2327
|
},
|
|
1192
|
-
printBackground:
|
|
2328
|
+
printBackground: true,
|
|
1193
2329
|
preferCSSPageSize: false
|
|
1194
2330
|
});
|
|
1195
2331
|
|
|
@@ -1312,11 +2448,9 @@ async function mergePDFs(outputDir, columnTitle, articles, deleteAfterMerge = fa
|
|
|
1312
2448
|
}
|
|
1313
2449
|
|
|
1314
2450
|
// 提取单篇文章的 HTML 内容(用于 EPUB 生成)
|
|
1315
|
-
async function extractArticleContent(page, article, index, total) {
|
|
2451
|
+
async function extractArticleContent(page, article, index, total, timeout = 60000) {
|
|
1316
2452
|
try {
|
|
1317
|
-
const
|
|
1318
|
-
const normalizedHtml = normalizeArticleHtml(articleData.article_content || '');
|
|
1319
|
-
const sanitizedHtml = await sanitizeArticleHtml(page, normalizedHtml);
|
|
2453
|
+
const sanitizedHtml = await fetchArticleContentWithRetry(page, article, { timeout });
|
|
1320
2454
|
|
|
1321
2455
|
if (!sanitizedHtml) {
|
|
1322
2456
|
throw new Error('未能提取到文章内容');
|
|
@@ -1370,7 +2504,7 @@ async function extractWithConcurrency(context, articles, concurrency = 5, delay
|
|
|
1370
2504
|
const article = articles[index];
|
|
1371
2505
|
|
|
1372
2506
|
try {
|
|
1373
|
-
const result = await extractArticleContent(page, article, index + 1, total);
|
|
2507
|
+
const result = await extractArticleContent(page, article, index + 1, total, timeout);
|
|
1374
2508
|
results[index] = result;
|
|
1375
2509
|
completed++;
|
|
1376
2510
|
|
|
@@ -1485,41 +2619,43 @@ async function generateEPUB(outputDir, columnTitle, columnAuthor, articles, cont
|
|
|
1485
2619
|
margin: 1.5em 0;
|
|
1486
2620
|
padding: 0;
|
|
1487
2621
|
}
|
|
1488
|
-
p {
|
|
2622
|
+
p, div {
|
|
1489
2623
|
margin: 1.2em 0;
|
|
1490
2624
|
text-indent: 0;
|
|
1491
|
-
line-height: 1.
|
|
2625
|
+
line-height: 1.9;
|
|
1492
2626
|
word-wrap: break-word;
|
|
1493
2627
|
overflow-wrap: break-word;
|
|
1494
2628
|
display: block;
|
|
1495
2629
|
page-break-inside: avoid;
|
|
1496
2630
|
}
|
|
1497
|
-
|
|
1498
|
-
|
|
1499
|
-
|
|
2631
|
+
p + p,
|
|
2632
|
+
div + p,
|
|
2633
|
+
p + div {
|
|
2634
|
+
margin-top: 1.6em;
|
|
1500
2635
|
}
|
|
1501
2636
|
/* 代码块样式 */
|
|
1502
2637
|
pre {
|
|
1503
|
-
background-color: #
|
|
2638
|
+
background-color: #0b1220;
|
|
2639
|
+
color: #d9e2ff;
|
|
1504
2640
|
border: 1px solid #e1e4e8;
|
|
1505
2641
|
border-radius: 6px;
|
|
1506
|
-
padding:
|
|
2642
|
+
padding: 18px 20px;
|
|
1507
2643
|
overflow-x: auto;
|
|
1508
2644
|
margin: 1em 0;
|
|
1509
|
-
line-height: 1.
|
|
2645
|
+
line-height: 1.6;
|
|
1510
2646
|
font-size: 14px;
|
|
1511
2647
|
white-space: pre-wrap;
|
|
1512
2648
|
word-wrap: break-word;
|
|
1513
|
-
font-family: 'Monaco', 'Menlo', 'Consolas', 'Courier New', monospace;
|
|
2649
|
+
font-family: 'Fira Code', 'Monaco', 'Menlo', 'Consolas', 'Courier New', monospace;
|
|
1514
2650
|
page-break-inside: avoid;
|
|
1515
2651
|
}
|
|
1516
2652
|
code {
|
|
1517
|
-
font-family: 'Monaco', 'Menlo', 'Consolas', 'Courier New', monospace;
|
|
2653
|
+
font-family: 'Fira Code', 'Monaco', 'Menlo', 'Consolas', 'Courier New', monospace;
|
|
1518
2654
|
font-size: 0.9em;
|
|
1519
|
-
background-color:
|
|
2655
|
+
background-color: rgba(15, 23, 42, 0.1);
|
|
1520
2656
|
padding: 0.2em 0.4em;
|
|
1521
2657
|
border-radius: 3px;
|
|
1522
|
-
border: 1px solid
|
|
2658
|
+
border: 1px solid rgba(15, 23, 42, 0.1);
|
|
1523
2659
|
}
|
|
1524
2660
|
pre code {
|
|
1525
2661
|
background-color: transparent;
|
|
@@ -1630,12 +2766,13 @@ async function generateEPUB(outputDir, columnTitle, columnAuthor, articles, cont
|
|
|
1630
2766
|
async function main(options) {
|
|
1631
2767
|
console.log(chalk.bold.cyan('\n🚀 极客时间专栏下载器\n'));
|
|
1632
2768
|
|
|
1633
|
-
// 获取配置:优先级 命令行 > 配置文件
|
|
2769
|
+
// 获取配置:优先级 命令行 > 配置文件 > 默认 cookies.json
|
|
1634
2770
|
let cookie = options.cookie;
|
|
2771
|
+
let cookieFile = options.cookieFile;
|
|
1635
2772
|
let columnUrl = options.url;
|
|
1636
2773
|
|
|
1637
|
-
//
|
|
1638
|
-
if (!cookie || !columnUrl) {
|
|
2774
|
+
// 如果命令行没有提供所需信息,尝试从配置文件读取
|
|
2775
|
+
if (!cookie || !columnUrl || !cookieFile) {
|
|
1639
2776
|
// 使用当前工作目录的config.json,而不是脚本所在目录
|
|
1640
2777
|
const configPath = path.join(process.cwd(), 'config.json');
|
|
1641
2778
|
try {
|
|
@@ -1645,22 +2782,37 @@ async function main(options) {
|
|
|
1645
2782
|
// 使用配置文件中的值作为默认值
|
|
1646
2783
|
if (!cookie) cookie = config.cookie;
|
|
1647
2784
|
if (!columnUrl) columnUrl = config.columnUrl;
|
|
2785
|
+
if (!cookieFile) cookieFile = config.cookieFile;
|
|
1648
2786
|
} catch (error) {
|
|
1649
2787
|
// 配置文件不存在或读取失败,不是致命错误
|
|
1650
2788
|
// 只有在命令行也没提供时才报错
|
|
1651
2789
|
}
|
|
1652
2790
|
}
|
|
1653
2791
|
|
|
2792
|
+
// 如果没有cookie字符串但存在 cookies.json 文件,自动使用
|
|
2793
|
+
if (!cookie && !cookieFile) {
|
|
2794
|
+
const defaultCookieJsonPath = path.join(process.cwd(), 'cookies.json');
|
|
2795
|
+
if (await fileExists(defaultCookieJsonPath)) {
|
|
2796
|
+
cookieFile = defaultCookieJsonPath;
|
|
2797
|
+
}
|
|
2798
|
+
}
|
|
2799
|
+
|
|
2800
|
+
const cookieSavePath = cookieFile || path.join(process.cwd(), 'cookies.json');
|
|
2801
|
+
|
|
1654
2802
|
// 验证必要参数
|
|
1655
|
-
if (!cookie) {
|
|
2803
|
+
if (!cookie && !cookieFile) {
|
|
1656
2804
|
console.error(chalk.red('❌ 缺少 Cookie!'));
|
|
1657
2805
|
console.log(chalk.yellow('\n请通过以下方式之一提供 Cookie:'));
|
|
1658
2806
|
console.log(chalk.gray('1. 命令行参数:--cookie "你的cookie字符串"'));
|
|
1659
2807
|
console.log(chalk.gray('2. 配置文件 config.json:'));
|
|
1660
2808
|
console.log(chalk.gray(' {'));
|
|
1661
2809
|
console.log(chalk.gray(' "cookie": "你的cookie字符串",'));
|
|
1662
|
-
console.log(chalk.gray(' "columnUrl": "https://time.geekbang.org/column/article/xxxxx"'));
|
|
1663
|
-
console.log(chalk.gray('
|
|
2810
|
+
console.log(chalk.gray(' "columnUrl": "https://time.geekbang.org/column/article/xxxxx",'));
|
|
2811
|
+
console.log(chalk.gray(' "cookieFile": "cookies.json" // 可选,导入JSON文件'));
|
|
2812
|
+
console.log(chalk.gray(' }'));
|
|
2813
|
+
console.log(chalk.gray('3. 提供 Cookie JSON 文件:'));
|
|
2814
|
+
console.log(chalk.gray(' - 命令行参数:--cookie-file ./cookies.json'));
|
|
2815
|
+
console.log(chalk.gray(' - 或将 cookies.json 放到当前目录\n'));
|
|
1664
2816
|
process.exit(1);
|
|
1665
2817
|
}
|
|
1666
2818
|
|
|
@@ -1709,16 +2861,42 @@ async function main(options) {
|
|
|
1709
2861
|
userAgent: DEFAULT_USER_AGENT
|
|
1710
2862
|
});
|
|
1711
2863
|
|
|
1712
|
-
|
|
1713
|
-
let
|
|
1714
|
-
|
|
1715
|
-
|
|
2864
|
+
let normalizedCookie = '';
|
|
2865
|
+
let cookiesForContext = [];
|
|
2866
|
+
|
|
2867
|
+
if (cookie) {
|
|
2868
|
+
normalizedCookie = cookie.trim();
|
|
2869
|
+
if (/^cookie:/i.test(normalizedCookie)) {
|
|
2870
|
+
normalizedCookie = normalizedCookie.replace(/^cookie:\s*/i, '');
|
|
2871
|
+
}
|
|
2872
|
+
cookiesForContext = parseCookies(normalizedCookie);
|
|
2873
|
+
} else if (cookieFile) {
|
|
2874
|
+
try {
|
|
2875
|
+
const { cookieHeader, cookies, absolutePath } = await loadCookiesFromJsonFile(cookieFile);
|
|
2876
|
+
normalizedCookie = cookieHeader.trim();
|
|
2877
|
+
cookiesForContext = cookies;
|
|
2878
|
+
console.log(chalk.gray(`🍪 已从 ${absolutePath} 导入 Cookie`));
|
|
2879
|
+
} catch (error) {
|
|
2880
|
+
console.error(chalk.red(`❌ 读取 Cookie JSON 失败: ${error.message}`));
|
|
2881
|
+
process.exit(1);
|
|
2882
|
+
}
|
|
1716
2883
|
}
|
|
2884
|
+
|
|
1717
2885
|
globalCookieHeader = normalizedCookie;
|
|
1718
2886
|
|
|
1719
2887
|
// 设置 cookies
|
|
1720
|
-
|
|
1721
|
-
await context
|
|
2888
|
+
await context.addCookies(cookiesForContext);
|
|
2889
|
+
await updateGlobalCookieHeaderFromContext(context);
|
|
2890
|
+
context.on('response', (response) => {
|
|
2891
|
+
try {
|
|
2892
|
+
const headers = response.headers();
|
|
2893
|
+
if (headers && headers['set-cookie']) {
|
|
2894
|
+
updateGlobalCookieHeaderFromContext(context);
|
|
2895
|
+
}
|
|
2896
|
+
} catch {
|
|
2897
|
+
// ignore
|
|
2898
|
+
}
|
|
2899
|
+
});
|
|
1722
2900
|
|
|
1723
2901
|
// 确保所有极客时间域名的请求都携带原始Cookie串,避免Playwright丢失关键字段
|
|
1724
2902
|
await context.route('**/*', (route) => {
|
|
@@ -1740,9 +2918,12 @@ async function main(options) {
|
|
|
1740
2918
|
}
|
|
1741
2919
|
|
|
1742
2920
|
const headers = {
|
|
1743
|
-
...request.headers()
|
|
1744
|
-
cookie: normalizedCookie
|
|
2921
|
+
...request.headers()
|
|
1745
2922
|
};
|
|
2923
|
+
const outgoingCookieHeader = globalCookieHeader || normalizedCookie;
|
|
2924
|
+
if (outgoingCookieHeader) {
|
|
2925
|
+
headers.cookie = outgoingCookieHeader;
|
|
2926
|
+
}
|
|
1746
2927
|
route.continue({ headers });
|
|
1747
2928
|
});
|
|
1748
2929
|
|
|
@@ -1828,7 +3009,10 @@ async function main(options) {
|
|
|
1828
3009
|
const successCount = results.filter(r => r.success).length;
|
|
1829
3010
|
const failCount = results.filter(r => !r.success).length;
|
|
1830
3011
|
const timeoutCount = results.filter(r =>
|
|
1831
|
-
!r.success && r.error &&
|
|
3012
|
+
!r.success && r.error && /timeout/i.test(r.error)
|
|
3013
|
+
).length;
|
|
3014
|
+
const authIssueCount = results.filter(r =>
|
|
3015
|
+
!r.success && r.error && /(Cookie|登录|登陆|订阅|权限|试看|购买)/i.test(r.error)
|
|
1832
3016
|
).length;
|
|
1833
3017
|
|
|
1834
3018
|
console.log(chalk.bold.cyan('\n📊 PDF 下载统计\n'));
|
|
@@ -1842,6 +3026,11 @@ async function main(options) {
|
|
|
1842
3026
|
console.log(chalk.gray(' 1. Cookie 已失效 - 请重新获取 Cookie'));
|
|
1843
3027
|
console.log(chalk.gray(' 2. 网络连接慢 - 尝试使用 --timeout 120000 增加超时时间'));
|
|
1844
3028
|
console.log(chalk.gray(' 3. 需要登录或权限不足 - 确认已购买该专栏\n'));
|
|
3029
|
+
} else if (authIssueCount > 0) {
|
|
3030
|
+
console.log(chalk.yellow('⚠️ 检测到登录或权限相关异常\n'));
|
|
3031
|
+
console.log(chalk.gray(' 1. 在浏览器中重新登录极客时间,进入该专栏任意文章'));
|
|
3032
|
+
console.log(chalk.gray(' 2. 复制最新的 Cookie(或重新导出 cookies.json)'));
|
|
3033
|
+
console.log(chalk.gray(' 3. 使用新的 --cookie 或 --cookie-file 参数后重试\n'));
|
|
1845
3034
|
}
|
|
1846
3035
|
|
|
1847
3036
|
// 合并 PDF
|
|
@@ -1885,7 +3074,10 @@ async function main(options) {
|
|
|
1885
3074
|
const successCount = contentResults.filter(r => r.success).length;
|
|
1886
3075
|
const failCount = contentResults.filter(r => !r.success).length;
|
|
1887
3076
|
const timeoutCount = contentResults.filter(r =>
|
|
1888
|
-
!r.success && r.error &&
|
|
3077
|
+
!r.success && r.error && /timeout/i.test(r.error)
|
|
3078
|
+
).length;
|
|
3079
|
+
const authIssueCount = contentResults.filter(r =>
|
|
3080
|
+
!r.success && r.error && /(Cookie|登录|登陆|订阅|权限|试看|购买)/i.test(r.error)
|
|
1889
3081
|
).length;
|
|
1890
3082
|
|
|
1891
3083
|
console.log(chalk.bold.cyan('\n📊 EPUB 提取统计\n'));
|
|
@@ -1898,19 +3090,42 @@ async function main(options) {
|
|
|
1898
3090
|
console.log(chalk.gray(' 1. Cookie 已失效 - 请重新获取 Cookie'));
|
|
1899
3091
|
console.log(chalk.gray(' 2. 网络连接慢 - 尝试使用 --timeout 120000 增加超时时间'));
|
|
1900
3092
|
console.log(chalk.gray(' 3. 需要登录或权限不足 - 确认已购买该专栏\n'));
|
|
3093
|
+
} else if (authIssueCount > 0) {
|
|
3094
|
+
console.log(chalk.yellow('⚠️ 检测到登录/权限问题,建议步骤:\n'));
|
|
3095
|
+
console.log(chalk.gray(' 1. 浏览器重新登录极客时间并打开该专栏文章'));
|
|
3096
|
+
console.log(chalk.gray(' 2. 重新复制最新 Cookie 或导出 cookies.json'));
|
|
3097
|
+
console.log(chalk.gray(' 3. 更新 --cookie 或 --cookie-file 后再次执行\n'));
|
|
1901
3098
|
}
|
|
1902
3099
|
|
|
1903
3100
|
// 生成 EPUB
|
|
1904
3101
|
if (successCount > 0) {
|
|
1905
|
-
const
|
|
1906
|
-
|
|
1907
|
-
columnTitle,
|
|
1908
|
-
columnAuthor,
|
|
1909
|
-
articlesToDownload,
|
|
1910
|
-
contentResults
|
|
3102
|
+
const hasImageContent = contentResults.some(result =>
|
|
3103
|
+
result && result.success && typeof result.content === 'string' && result.content.includes('<img')
|
|
1911
3104
|
);
|
|
1912
|
-
|
|
1913
|
-
|
|
3105
|
+
|
|
3106
|
+
let processedContent = contentResults;
|
|
3107
|
+
let tempAssetsDir = null;
|
|
3108
|
+
|
|
3109
|
+
try {
|
|
3110
|
+
if (hasImageContent) {
|
|
3111
|
+
tempAssetsDir = await createTempAssetsDir(outputDir);
|
|
3112
|
+
processedContent = await rewriteEpubContentImages(context, contentResults, tempAssetsDir);
|
|
3113
|
+
}
|
|
3114
|
+
|
|
3115
|
+
const epubPath = await generateEPUB(
|
|
3116
|
+
outputDir,
|
|
3117
|
+
columnTitle,
|
|
3118
|
+
columnAuthor,
|
|
3119
|
+
articlesToDownload,
|
|
3120
|
+
processedContent
|
|
3121
|
+
);
|
|
3122
|
+
if (epubPath) {
|
|
3123
|
+
console.log(chalk.green(`\n✅ EPUB 生成完成: ${epubPath}\n`));
|
|
3124
|
+
}
|
|
3125
|
+
} finally {
|
|
3126
|
+
if (tempAssetsDir) {
|
|
3127
|
+
await cleanupTempAssetsDir(tempAssetsDir);
|
|
3128
|
+
}
|
|
1914
3129
|
}
|
|
1915
3130
|
}
|
|
1916
3131
|
}
|
|
@@ -1927,6 +3142,11 @@ async function main(options) {
|
|
|
1927
3142
|
}
|
|
1928
3143
|
process.exit(1);
|
|
1929
3144
|
} finally {
|
|
3145
|
+
try {
|
|
3146
|
+
await persistCookiesToFile(context, cookieSavePath);
|
|
3147
|
+
} catch {
|
|
3148
|
+
// ignore
|
|
3149
|
+
}
|
|
1930
3150
|
// 确保浏览器完全关闭
|
|
1931
3151
|
try {
|
|
1932
3152
|
if (browser && !isShuttingDown) {
|
|
@@ -1946,6 +3166,7 @@ program
|
|
|
1946
3166
|
.version(version)
|
|
1947
3167
|
.option('-u, --url <url>', '专栏文章URL(任意一篇)')
|
|
1948
3168
|
.option('-c, --cookie <cookie>', 'Cookie字符串(用于认证)')
|
|
3169
|
+
.option('--cookie-file <path>', '从 JSON 文件导入 Cookie(如 chrome 扩展导出的 cookies.json)')
|
|
1949
3170
|
.option('-o, --output <dir>', '输出目录', './downloads')
|
|
1950
3171
|
.option('-f, --format <format>', '输出格式: pdf, epub, both', 'pdf')
|
|
1951
3172
|
.option('--headless <boolean>', '无头模式', true)
|