@kadaliao/geektime-downloader 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/download.js +170 -40
- package/package.json +1 -1
package/download.js
CHANGED
|
@@ -237,39 +237,90 @@ function parseCookies(cookieString) {
|
|
|
237
237
|
});
|
|
238
238
|
}
|
|
239
239
|
|
|
240
|
-
//
|
|
240
|
+
// 获取专栏所有文章列表(通过API)
|
|
241
241
|
async function getArticleList(page, columnUrl) {
|
|
242
|
-
const spinner = ora('
|
|
242
|
+
const spinner = ora('正在获取专栏信息...').start();
|
|
243
243
|
|
|
244
|
-
//
|
|
244
|
+
// 从 URL 提取专栏 ID
|
|
245
|
+
let columnId = null;
|
|
246
|
+
const urlMatch = columnUrl.match(/\/column\/intro\/(\d+)|\/column\/article\/(\d+)/);
|
|
247
|
+
if (urlMatch) {
|
|
248
|
+
columnId = urlMatch[1] || urlMatch[2];
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
// 监听多个API响应
|
|
245
252
|
let articlesData = null;
|
|
246
|
-
let
|
|
253
|
+
let columnInfoData = null;
|
|
254
|
+
let articlesHandler = null;
|
|
255
|
+
let columnInfoHandler = null;
|
|
247
256
|
|
|
248
|
-
|
|
249
|
-
|
|
257
|
+
// 用于同步的 Promise
|
|
258
|
+
const articlesPromise = new Promise((resolve, reject) => {
|
|
259
|
+
articlesHandler = async (response) => {
|
|
250
260
|
const url = response.url();
|
|
261
|
+
// 监听文章列表 API
|
|
251
262
|
if (url.includes('/serv/v1/column/articles')) {
|
|
252
263
|
try {
|
|
253
264
|
const data = await response.json();
|
|
265
|
+
if (process.env.DEBUG) {
|
|
266
|
+
console.log(chalk.gray('\n收到文章列表API响应'));
|
|
267
|
+
}
|
|
268
|
+
resolve(data);
|
|
269
|
+
} catch (e) {
|
|
270
|
+
console.error('解析文章列表API失败:', e);
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
};
|
|
274
|
+
page.on('response', articlesHandler);
|
|
275
|
+
});
|
|
276
|
+
|
|
277
|
+
const columnInfoPromise = new Promise((resolve) => {
|
|
278
|
+
columnInfoHandler = async (response) => {
|
|
279
|
+
const url = response.url();
|
|
280
|
+
// 监听专栏详情相关的 API
|
|
281
|
+
if (url.includes('/serv/v1/column/intro') ||
|
|
282
|
+
url.includes('/serv/v3/column/info') ||
|
|
283
|
+
url.includes('/serv/v1/column/detail')) {
|
|
284
|
+
try {
|
|
285
|
+
const data = await response.json();
|
|
286
|
+
if (process.env.DEBUG) {
|
|
287
|
+
console.log(chalk.gray(`收到专栏信息API响应: ${url}`));
|
|
288
|
+
}
|
|
254
289
|
resolve(data);
|
|
255
290
|
} catch (e) {
|
|
256
|
-
console.error('
|
|
257
|
-
reject(e);
|
|
291
|
+
console.error('解析专栏信息API失败:', e);
|
|
258
292
|
}
|
|
259
293
|
}
|
|
260
294
|
};
|
|
261
|
-
page.on('response',
|
|
295
|
+
page.on('response', columnInfoHandler);
|
|
262
296
|
});
|
|
263
297
|
|
|
264
298
|
try {
|
|
265
|
-
//
|
|
266
|
-
|
|
299
|
+
// 先设置监听器,再访问页面
|
|
300
|
+
spinner.text = '正在加载页面...';
|
|
301
|
+
await page.goto(columnUrl, { waitUntil: 'networkidle', timeout: 30000 });
|
|
302
|
+
|
|
303
|
+
spinner.text = '正在获取文章列表...';
|
|
267
304
|
|
|
268
|
-
//
|
|
305
|
+
// 等待文章列表 API(必须的)
|
|
269
306
|
articlesData = await Promise.race([
|
|
270
|
-
|
|
271
|
-
new Promise((_, reject) => setTimeout(() => reject(new Error('API调用超时')),
|
|
307
|
+
articlesPromise,
|
|
308
|
+
new Promise((_, reject) => setTimeout(() => reject(new Error('文章列表API调用超时')), 30000))
|
|
272
309
|
]);
|
|
310
|
+
|
|
311
|
+
// 尝试等待专栏信息 API(可选的,5秒超时)
|
|
312
|
+
try {
|
|
313
|
+
columnInfoData = await Promise.race([
|
|
314
|
+
columnInfoPromise,
|
|
315
|
+
new Promise((_, reject) => setTimeout(() => reject(new Error('timeout')), 5000))
|
|
316
|
+
]);
|
|
317
|
+
} catch (e) {
|
|
318
|
+
// 获取专栏信息失败不是致命错误
|
|
319
|
+
if (process.env.DEBUG) {
|
|
320
|
+
console.log(chalk.gray('未获取到专栏信息API响应(将使用其他方法)'));
|
|
321
|
+
}
|
|
322
|
+
}
|
|
323
|
+
|
|
273
324
|
} catch (error) {
|
|
274
325
|
// 如果是因为浏览器关闭导致的错误,静默处理
|
|
275
326
|
if (isShuttingDown || error.message.includes('Target page, context or browser has been closed')) {
|
|
@@ -279,10 +330,17 @@ async function getArticleList(page, columnUrl) {
|
|
|
279
330
|
spinner.fail('获取文章列表失败');
|
|
280
331
|
throw error;
|
|
281
332
|
} finally {
|
|
282
|
-
//
|
|
283
|
-
if (
|
|
333
|
+
// 确保移除所有监听器,防止内存泄漏
|
|
334
|
+
if (articlesHandler) {
|
|
335
|
+
try {
|
|
336
|
+
page.off('response', articlesHandler);
|
|
337
|
+
} catch (e) {
|
|
338
|
+
// 忽略page已关闭的错误
|
|
339
|
+
}
|
|
340
|
+
}
|
|
341
|
+
if (columnInfoHandler) {
|
|
284
342
|
try {
|
|
285
|
-
page.off('response',
|
|
343
|
+
page.off('response', columnInfoHandler);
|
|
286
344
|
} catch (e) {
|
|
287
345
|
// 忽略page已关闭的错误
|
|
288
346
|
}
|
|
@@ -294,21 +352,47 @@ async function getArticleList(page, columnUrl) {
|
|
|
294
352
|
return { articles: [], columnTitle: 'unknown' };
|
|
295
353
|
}
|
|
296
354
|
|
|
297
|
-
//
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
const firstArticle = articlesData.data.list[0];
|
|
307
|
-
columnTitle = firstArticle.column_title || firstArticle.product_title;
|
|
355
|
+
// 调试信息:记录完整的API响应结构(仅在环境变量DEBUG存在时)
|
|
356
|
+
if (process.env.DEBUG) {
|
|
357
|
+
console.log(chalk.gray('\n========== 文章列表 API 响应数据 =========='));
|
|
358
|
+
console.log(chalk.gray(JSON.stringify(articlesData.data, null, 2)));
|
|
359
|
+
if (columnInfoData) {
|
|
360
|
+
console.log(chalk.gray('\n========== 专栏信息 API 响应数据 =========='));
|
|
361
|
+
console.log(chalk.gray(JSON.stringify(columnInfoData.data, null, 2)));
|
|
362
|
+
}
|
|
363
|
+
console.log(chalk.gray('=========================================\n'));
|
|
308
364
|
}
|
|
309
365
|
|
|
310
|
-
//
|
|
311
|
-
|
|
366
|
+
// 获取专栏标题 - 优先从专栏信息API获取
|
|
367
|
+
let columnTitle = '';
|
|
368
|
+
|
|
369
|
+
// 方法1(最优先): 从专栏信息 API 数据中获取
|
|
370
|
+
if (columnInfoData && columnInfoData.data) {
|
|
371
|
+
columnTitle = columnInfoData.data.title
|
|
372
|
+
|| columnInfoData.data.column_title
|
|
373
|
+
|| columnInfoData.data.name
|
|
374
|
+
|| columnInfoData.data.product_title
|
|
375
|
+
|| columnInfoData.data.subtitle;
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
// 方法2: 从文章列表 API 数据中获取
|
|
379
|
+
if (!columnTitle || columnTitle === '专栏' || columnTitle === '极客时间') {
|
|
380
|
+
columnTitle = articlesData.data.column_title
|
|
381
|
+
|| articlesData.data.column_subtitle
|
|
382
|
+
|| articlesData.data.title
|
|
383
|
+
|| articlesData.data.name
|
|
384
|
+
|| articlesData.data.columnTitle
|
|
385
|
+
|| articlesData.data.product_title;
|
|
386
|
+
|
|
387
|
+
// 如果还是没有,尝试从第一篇文章的信息中提取
|
|
388
|
+
if (!columnTitle && articlesData.data.list && articlesData.data.list.length > 0) {
|
|
389
|
+
const firstArticle = articlesData.data.list[0];
|
|
390
|
+
columnTitle = firstArticle.column_title || firstArticle.product_title;
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
// 方法3: 从页面标题提取
|
|
395
|
+
if (!columnTitle || columnTitle === '专栏' || columnTitle === '极客时间') {
|
|
312
396
|
try {
|
|
313
397
|
const pageTitle = await page.title();
|
|
314
398
|
// 页面标题格式通常是:"文章标题 - 专栏名称 - 极客时间"
|
|
@@ -321,8 +405,45 @@ async function getArticleList(page, columnUrl) {
|
|
|
321
405
|
}
|
|
322
406
|
}
|
|
323
407
|
|
|
324
|
-
//
|
|
325
|
-
columnTitle
|
|
408
|
+
// 方法4: 从页面DOM中提取
|
|
409
|
+
if (!columnTitle || columnTitle === '专栏' || columnTitle === '极客时间') {
|
|
410
|
+
try {
|
|
411
|
+
columnTitle = await page.evaluate(() => {
|
|
412
|
+
// 尝试多个可能的选择器
|
|
413
|
+
const selectors = [
|
|
414
|
+
'.column-title',
|
|
415
|
+
'.product-title',
|
|
416
|
+
'[class*="columnTitle"]',
|
|
417
|
+
'[class*="productTitle"]',
|
|
418
|
+
'h1.title',
|
|
419
|
+
'.bread-crumb a:last-child'
|
|
420
|
+
];
|
|
421
|
+
|
|
422
|
+
for (const selector of selectors) {
|
|
423
|
+
const element = document.querySelector(selector);
|
|
424
|
+
if (element && element.textContent && element.textContent.trim()) {
|
|
425
|
+
return element.textContent.trim();
|
|
426
|
+
}
|
|
427
|
+
}
|
|
428
|
+
return null;
|
|
429
|
+
});
|
|
430
|
+
} catch (e) {
|
|
431
|
+
console.error('从页面DOM提取失败:', e);
|
|
432
|
+
}
|
|
433
|
+
}
|
|
434
|
+
|
|
435
|
+
// 方法5: 使用专栏ID(如果提取到了)
|
|
436
|
+
if (!columnTitle || columnTitle === '专栏' || columnTitle === '极客时间') {
|
|
437
|
+
if (columnId) {
|
|
438
|
+
columnTitle = `专栏_${columnId}`;
|
|
439
|
+
}
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
// 最后的默认值(添加时间戳避免冲突)
|
|
443
|
+
if (!columnTitle || columnTitle === '专栏' || columnTitle === '极客时间') {
|
|
444
|
+
const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
|
|
445
|
+
columnTitle = `专栏_${timestamp}`;
|
|
446
|
+
}
|
|
326
447
|
|
|
327
448
|
// 清理标题
|
|
328
449
|
columnTitle = columnTitle
|
|
@@ -341,7 +462,8 @@ async function getArticleList(page, columnUrl) {
|
|
|
341
462
|
|
|
342
463
|
// 解析文章列表
|
|
343
464
|
const rawArticles = articlesData.data.list;
|
|
344
|
-
|
|
465
|
+
|
|
466
|
+
const articles = rawArticles.map((article, index) => {
|
|
345
467
|
const title = article.article_title || article.article_sharetitle || 'Untitled';
|
|
346
468
|
const id = article.id;
|
|
347
469
|
|
|
@@ -356,7 +478,10 @@ async function getArticleList(page, columnUrl) {
|
|
|
356
478
|
title: cleanTitle,
|
|
357
479
|
url: `https://time.geekbang.org/column/article/${id}`,
|
|
358
480
|
originalTitle: title,
|
|
359
|
-
id: id
|
|
481
|
+
id: id,
|
|
482
|
+
sectionName: article.section_name || '',
|
|
483
|
+
chapterIndex: article.chapter_index || 0,
|
|
484
|
+
originalIndex: index
|
|
360
485
|
};
|
|
361
486
|
});
|
|
362
487
|
|
|
@@ -555,7 +680,7 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
|
|
|
555
680
|
}, article.originalTitle || article.title);
|
|
556
681
|
|
|
557
682
|
// 等待文章内容加载
|
|
558
|
-
await page.waitForSelector('.Index_articleContent_QBG5G, .content', { timeout:
|
|
683
|
+
await page.waitForSelector('.Index_articleContent_QBG5G, .content', { timeout: 30000 });
|
|
559
684
|
|
|
560
685
|
// 生成 PDF
|
|
561
686
|
const filename = `${String(index).padStart(3, '0')}_${article.title}.pdf`;
|
|
@@ -694,7 +819,7 @@ async function downloadArticle(page, article, outputDir, index, total) {
|
|
|
694
819
|
}, article.originalTitle || article.title);
|
|
695
820
|
|
|
696
821
|
// 等待文章内容加载
|
|
697
|
-
await page.waitForSelector('.Index_articleContent_QBG5G, .content', { timeout:
|
|
822
|
+
await page.waitForSelector('.Index_articleContent_QBG5G, .content', { timeout: 30000 });
|
|
698
823
|
|
|
699
824
|
// 生成 PDF
|
|
700
825
|
const filename = `${String(index).padStart(3, '0')}_${article.title}.pdf`;
|
|
@@ -878,11 +1003,11 @@ async function main(options) {
|
|
|
878
1003
|
|
|
879
1004
|
console.log(chalk.gray(`📄 专栏地址: ${columnUrl}`));
|
|
880
1005
|
|
|
881
|
-
//
|
|
882
|
-
const
|
|
883
|
-
await fs.mkdir(
|
|
1006
|
+
// 创建基础输出目录(相对于当前工作目录)
|
|
1007
|
+
const baseOutputDir = options.output || path.join(process.cwd(), 'downloads');
|
|
1008
|
+
await fs.mkdir(baseOutputDir, { recursive: true });
|
|
884
1009
|
|
|
885
|
-
console.log(chalk.gray(`📁
|
|
1010
|
+
console.log(chalk.gray(`📁 基础输出目录: ${baseOutputDir}\n`));
|
|
886
1011
|
|
|
887
1012
|
// 启动浏览器
|
|
888
1013
|
let browser;
|
|
@@ -928,6 +1053,11 @@ async function main(options) {
|
|
|
928
1053
|
return;
|
|
929
1054
|
}
|
|
930
1055
|
|
|
1056
|
+
// 为该专栏创建专用文件夹
|
|
1057
|
+
const outputDir = path.join(baseOutputDir, columnTitle);
|
|
1058
|
+
await fs.mkdir(outputDir, { recursive: true });
|
|
1059
|
+
console.log(chalk.gray(`📁 专栏输出目录: ${outputDir}\n`));
|
|
1060
|
+
|
|
931
1061
|
// 如果是 dry-run 模式,只显示列表
|
|
932
1062
|
if (options.dryRun) {
|
|
933
1063
|
console.log(chalk.cyan('\n📋 文章列表(预览模式):\n'));
|