@kadaliao/geektime-downloader 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/download.js +165 -39
- package/package.json +1 -1
package/download.js
CHANGED
|
@@ -239,37 +239,88 @@ function parseCookies(cookieString) {
|
|
|
239
239
|
|
|
240
240
|
// 获取专栏所有文章列表(通过API)
|
|
241
241
|
async function getArticleList(page, columnUrl) {
|
|
242
|
-
const spinner = ora('
|
|
242
|
+
const spinner = ora('正在获取专栏信息...').start();
|
|
243
243
|
|
|
244
|
-
//
|
|
244
|
+
// 从 URL 提取专栏 ID
|
|
245
|
+
let columnId = null;
|
|
246
|
+
const urlMatch = columnUrl.match(/\/column\/intro\/(\d+)|\/column\/article\/(\d+)/);
|
|
247
|
+
if (urlMatch) {
|
|
248
|
+
columnId = urlMatch[1] || urlMatch[2];
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
// 监听多个API响应
|
|
245
252
|
let articlesData = null;
|
|
246
|
-
let
|
|
253
|
+
let columnInfoData = null;
|
|
254
|
+
let articlesHandler = null;
|
|
255
|
+
let columnInfoHandler = null;
|
|
247
256
|
|
|
248
|
-
|
|
249
|
-
|
|
257
|
+
// 用于同步的 Promise
|
|
258
|
+
const articlesPromise = new Promise((resolve, reject) => {
|
|
259
|
+
articlesHandler = async (response) => {
|
|
250
260
|
const url = response.url();
|
|
261
|
+
// 监听文章列表 API
|
|
251
262
|
if (url.includes('/serv/v1/column/articles')) {
|
|
252
263
|
try {
|
|
253
264
|
const data = await response.json();
|
|
265
|
+
if (process.env.DEBUG) {
|
|
266
|
+
console.log(chalk.gray('\n收到文章列表API响应'));
|
|
267
|
+
}
|
|
268
|
+
resolve(data);
|
|
269
|
+
} catch (e) {
|
|
270
|
+
console.error('解析文章列表API失败:', e);
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
};
|
|
274
|
+
page.on('response', articlesHandler);
|
|
275
|
+
});
|
|
276
|
+
|
|
277
|
+
const columnInfoPromise = new Promise((resolve) => {
|
|
278
|
+
columnInfoHandler = async (response) => {
|
|
279
|
+
const url = response.url();
|
|
280
|
+
// 监听专栏详情相关的 API
|
|
281
|
+
if (url.includes('/serv/v1/column/intro') ||
|
|
282
|
+
url.includes('/serv/v3/column/info') ||
|
|
283
|
+
url.includes('/serv/v1/column/detail')) {
|
|
284
|
+
try {
|
|
285
|
+
const data = await response.json();
|
|
286
|
+
if (process.env.DEBUG) {
|
|
287
|
+
console.log(chalk.gray(`收到专栏信息API响应: ${url}`));
|
|
288
|
+
}
|
|
254
289
|
resolve(data);
|
|
255
290
|
} catch (e) {
|
|
256
|
-
console.error('
|
|
257
|
-
reject(e);
|
|
291
|
+
console.error('解析专栏信息API失败:', e);
|
|
258
292
|
}
|
|
259
293
|
}
|
|
260
294
|
};
|
|
261
|
-
page.on('response',
|
|
295
|
+
page.on('response', columnInfoHandler);
|
|
262
296
|
});
|
|
263
297
|
|
|
264
298
|
try {
|
|
265
|
-
//
|
|
266
|
-
|
|
299
|
+
// 先设置监听器,再访问页面
|
|
300
|
+
spinner.text = '正在加载页面...';
|
|
301
|
+
await page.goto(columnUrl, { waitUntil: 'networkidle', timeout: 30000 });
|
|
302
|
+
|
|
303
|
+
spinner.text = '正在获取文章列表...';
|
|
267
304
|
|
|
268
|
-
//
|
|
305
|
+
// 等待文章列表 API(必须的)
|
|
269
306
|
articlesData = await Promise.race([
|
|
270
|
-
|
|
271
|
-
new Promise((_, reject) => setTimeout(() => reject(new Error('API调用超时')),
|
|
307
|
+
articlesPromise,
|
|
308
|
+
new Promise((_, reject) => setTimeout(() => reject(new Error('文章列表API调用超时')), 30000))
|
|
272
309
|
]);
|
|
310
|
+
|
|
311
|
+
// 尝试等待专栏信息 API(可选的,5秒超时)
|
|
312
|
+
try {
|
|
313
|
+
columnInfoData = await Promise.race([
|
|
314
|
+
columnInfoPromise,
|
|
315
|
+
new Promise((_, reject) => setTimeout(() => reject(new Error('timeout')), 5000))
|
|
316
|
+
]);
|
|
317
|
+
} catch (e) {
|
|
318
|
+
// 获取专栏信息失败不是致命错误
|
|
319
|
+
if (process.env.DEBUG) {
|
|
320
|
+
console.log(chalk.gray('未获取到专栏信息API响应(将使用其他方法)'));
|
|
321
|
+
}
|
|
322
|
+
}
|
|
323
|
+
|
|
273
324
|
} catch (error) {
|
|
274
325
|
// 如果是因为浏览器关闭导致的错误,静默处理
|
|
275
326
|
if (isShuttingDown || error.message.includes('Target page, context or browser has been closed')) {
|
|
@@ -279,10 +330,17 @@ async function getArticleList(page, columnUrl) {
|
|
|
279
330
|
spinner.fail('获取文章列表失败');
|
|
280
331
|
throw error;
|
|
281
332
|
} finally {
|
|
282
|
-
//
|
|
283
|
-
if (
|
|
333
|
+
// 确保移除所有监听器,防止内存泄漏
|
|
334
|
+
if (articlesHandler) {
|
|
284
335
|
try {
|
|
285
|
-
page.off('response',
|
|
336
|
+
page.off('response', articlesHandler);
|
|
337
|
+
} catch (e) {
|
|
338
|
+
// 忽略page已关闭的错误
|
|
339
|
+
}
|
|
340
|
+
}
|
|
341
|
+
if (columnInfoHandler) {
|
|
342
|
+
try {
|
|
343
|
+
page.off('response', columnInfoHandler);
|
|
286
344
|
} catch (e) {
|
|
287
345
|
// 忽略page已关闭的错误
|
|
288
346
|
}
|
|
@@ -294,21 +352,47 @@ async function getArticleList(page, columnUrl) {
|
|
|
294
352
|
return { articles: [], columnTitle: 'unknown' };
|
|
295
353
|
}
|
|
296
354
|
|
|
297
|
-
//
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
const firstArticle = articlesData.data.list[0];
|
|
307
|
-
columnTitle = firstArticle.column_title || firstArticle.product_title;
|
|
355
|
+
// 调试信息:记录完整的API响应结构(仅在环境变量DEBUG存在时)
|
|
356
|
+
if (process.env.DEBUG) {
|
|
357
|
+
console.log(chalk.gray('\n========== 文章列表 API 响应数据 =========='));
|
|
358
|
+
console.log(chalk.gray(JSON.stringify(articlesData.data, null, 2)));
|
|
359
|
+
if (columnInfoData) {
|
|
360
|
+
console.log(chalk.gray('\n========== 专栏信息 API 响应数据 =========='));
|
|
361
|
+
console.log(chalk.gray(JSON.stringify(columnInfoData.data, null, 2)));
|
|
362
|
+
}
|
|
363
|
+
console.log(chalk.gray('=========================================\n'));
|
|
308
364
|
}
|
|
309
365
|
|
|
310
|
-
//
|
|
311
|
-
|
|
366
|
+
// 获取专栏标题 - 优先从专栏信息API获取
|
|
367
|
+
let columnTitle = '';
|
|
368
|
+
|
|
369
|
+
// 方法1(最优先): 从专栏信息 API 数据中获取
|
|
370
|
+
if (columnInfoData && columnInfoData.data) {
|
|
371
|
+
columnTitle = columnInfoData.data.title
|
|
372
|
+
|| columnInfoData.data.column_title
|
|
373
|
+
|| columnInfoData.data.name
|
|
374
|
+
|| columnInfoData.data.product_title
|
|
375
|
+
|| columnInfoData.data.subtitle;
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
// 方法2: 从文章列表 API 数据中获取
|
|
379
|
+
if (!columnTitle || columnTitle === '专栏' || columnTitle === '极客时间') {
|
|
380
|
+
columnTitle = articlesData.data.column_title
|
|
381
|
+
|| articlesData.data.column_subtitle
|
|
382
|
+
|| articlesData.data.title
|
|
383
|
+
|| articlesData.data.name
|
|
384
|
+
|| articlesData.data.columnTitle
|
|
385
|
+
|| articlesData.data.product_title;
|
|
386
|
+
|
|
387
|
+
// 如果还是没有,尝试从第一篇文章的信息中提取
|
|
388
|
+
if (!columnTitle && articlesData.data.list && articlesData.data.list.length > 0) {
|
|
389
|
+
const firstArticle = articlesData.data.list[0];
|
|
390
|
+
columnTitle = firstArticle.column_title || firstArticle.product_title;
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
// 方法3: 从页面标题提取
|
|
395
|
+
if (!columnTitle || columnTitle === '专栏' || columnTitle === '极客时间') {
|
|
312
396
|
try {
|
|
313
397
|
const pageTitle = await page.title();
|
|
314
398
|
// 页面标题格式通常是:"文章标题 - 专栏名称 - 极客时间"
|
|
@@ -321,8 +405,45 @@ async function getArticleList(page, columnUrl) {
|
|
|
321
405
|
}
|
|
322
406
|
}
|
|
323
407
|
|
|
324
|
-
//
|
|
325
|
-
columnTitle
|
|
408
|
+
// 方法4: 从页面DOM中提取
|
|
409
|
+
if (!columnTitle || columnTitle === '专栏' || columnTitle === '极客时间') {
|
|
410
|
+
try {
|
|
411
|
+
columnTitle = await page.evaluate(() => {
|
|
412
|
+
// 尝试多个可能的选择器
|
|
413
|
+
const selectors = [
|
|
414
|
+
'.column-title',
|
|
415
|
+
'.product-title',
|
|
416
|
+
'[class*="columnTitle"]',
|
|
417
|
+
'[class*="productTitle"]',
|
|
418
|
+
'h1.title',
|
|
419
|
+
'.bread-crumb a:last-child'
|
|
420
|
+
];
|
|
421
|
+
|
|
422
|
+
for (const selector of selectors) {
|
|
423
|
+
const element = document.querySelector(selector);
|
|
424
|
+
if (element && element.textContent && element.textContent.trim()) {
|
|
425
|
+
return element.textContent.trim();
|
|
426
|
+
}
|
|
427
|
+
}
|
|
428
|
+
return null;
|
|
429
|
+
});
|
|
430
|
+
} catch (e) {
|
|
431
|
+
console.error('从页面DOM提取失败:', e);
|
|
432
|
+
}
|
|
433
|
+
}
|
|
434
|
+
|
|
435
|
+
// 方法5: 使用专栏ID(如果提取到了)
|
|
436
|
+
if (!columnTitle || columnTitle === '专栏' || columnTitle === '极客时间') {
|
|
437
|
+
if (columnId) {
|
|
438
|
+
columnTitle = `专栏_${columnId}`;
|
|
439
|
+
}
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
// 最后的默认值(添加时间戳避免冲突)
|
|
443
|
+
if (!columnTitle || columnTitle === '专栏' || columnTitle === '极客时间') {
|
|
444
|
+
const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
|
|
445
|
+
columnTitle = `专栏_${timestamp}`;
|
|
446
|
+
}
|
|
326
447
|
|
|
327
448
|
// 清理标题
|
|
328
449
|
columnTitle = columnTitle
|
|
@@ -555,7 +676,7 @@ async function downloadArticleSilent(page, article, outputDir, index, total) {
|
|
|
555
676
|
}, article.originalTitle || article.title);
|
|
556
677
|
|
|
557
678
|
// 等待文章内容加载
|
|
558
|
-
await page.waitForSelector('.Index_articleContent_QBG5G, .content', { timeout:
|
|
679
|
+
await page.waitForSelector('.Index_articleContent_QBG5G, .content', { timeout: 30000 });
|
|
559
680
|
|
|
560
681
|
// 生成 PDF
|
|
561
682
|
const filename = `${String(index).padStart(3, '0')}_${article.title}.pdf`;
|
|
@@ -694,7 +815,7 @@ async function downloadArticle(page, article, outputDir, index, total) {
|
|
|
694
815
|
}, article.originalTitle || article.title);
|
|
695
816
|
|
|
696
817
|
// 等待文章内容加载
|
|
697
|
-
await page.waitForSelector('.Index_articleContent_QBG5G, .content', { timeout:
|
|
818
|
+
await page.waitForSelector('.Index_articleContent_QBG5G, .content', { timeout: 30000 });
|
|
698
819
|
|
|
699
820
|
// 生成 PDF
|
|
700
821
|
const filename = `${String(index).padStart(3, '0')}_${article.title}.pdf`;
|
|
@@ -878,11 +999,11 @@ async function main(options) {
|
|
|
878
999
|
|
|
879
1000
|
console.log(chalk.gray(`📄 专栏地址: ${columnUrl}`));
|
|
880
1001
|
|
|
881
|
-
//
|
|
882
|
-
const
|
|
883
|
-
await fs.mkdir(
|
|
1002
|
+
// 创建基础输出目录(相对于当前工作目录)
|
|
1003
|
+
const baseOutputDir = options.output || path.join(process.cwd(), 'downloads');
|
|
1004
|
+
await fs.mkdir(baseOutputDir, { recursive: true });
|
|
884
1005
|
|
|
885
|
-
console.log(chalk.gray(`📁
|
|
1006
|
+
console.log(chalk.gray(`📁 基础输出目录: ${baseOutputDir}\n`));
|
|
886
1007
|
|
|
887
1008
|
// 启动浏览器
|
|
888
1009
|
let browser;
|
|
@@ -928,6 +1049,11 @@ async function main(options) {
|
|
|
928
1049
|
return;
|
|
929
1050
|
}
|
|
930
1051
|
|
|
1052
|
+
// 为该专栏创建专用文件夹
|
|
1053
|
+
const outputDir = path.join(baseOutputDir, columnTitle);
|
|
1054
|
+
await fs.mkdir(outputDir, { recursive: true });
|
|
1055
|
+
console.log(chalk.gray(`📁 专栏输出目录: ${outputDir}\n`));
|
|
1056
|
+
|
|
931
1057
|
// 如果是 dry-run 模式,只显示列表
|
|
932
1058
|
if (options.dryRun) {
|
|
933
1059
|
console.log(chalk.cyan('\n📋 文章列表(预览模式):\n'));
|
|
@@ -950,7 +1076,7 @@ async function main(options) {
|
|
|
950
1076
|
}
|
|
951
1077
|
|
|
952
1078
|
// 并发下载
|
|
953
|
-
const concurrency = parseInt(options.concurrency) ||
|
|
1079
|
+
const concurrency = parseInt(options.concurrency) || 5;
|
|
954
1080
|
if (concurrency > 1) {
|
|
955
1081
|
console.log(chalk.gray(`📊 并发数: ${concurrency}\n`));
|
|
956
1082
|
}
|
|
@@ -1019,7 +1145,7 @@ program
|
|
|
1019
1145
|
.option('-o, --output <dir>', '输出目录', './downloads')
|
|
1020
1146
|
.option('--headless <boolean>', '无头模式', true)
|
|
1021
1147
|
.option('--delay <ms>', '每篇文章之间的延迟(ms)', '2000')
|
|
1022
|
-
.option('--concurrency <number>', '并发下载数量', '
|
|
1148
|
+
.option('--concurrency <number>', '并发下载数量', '5')
|
|
1023
1149
|
.option('--dry-run', '预览模式,只显示文章列表')
|
|
1024
1150
|
.option('--limit <number>', '限制下载数量(用于测试)')
|
|
1025
1151
|
.option('--no-merge', '禁用PDF合并(默认会合并所有文章为一个PDF)')
|