tt-help-cli-ycl 1.0.5 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/auto-core.cjs +288 -0
- package/src/data-store.cjs +65 -0
- package/src/get-user-videos-core.cjs +165 -0
- package/src/get-user-videos.cjs +59 -0
- package/src/lib/args.js +227 -1
- package/src/lib/auto-browser.mjs +11 -0
- package/src/lib/constants.js +46 -0
- package/src/lib/explore.js +27 -1
- package/src/lib/fetcher.js +20 -10
- package/src/lib/get-user-videos-browser.mjs +6 -0
- package/src/lib/io.js +63 -0
- package/src/lib/scrape-browser.mjs +6 -0
- package/src/main.mjs +391 -18
- package/src/results/user-videos-bar.lar.lar.moeta.json +37 -0
- package/src/scraper/core.cjs +192 -0
- package/src/scraper/index.cjs +93 -0
- package/src/scraper/modules/comment-extractor.cjs +122 -0
- package/src/scraper/modules/page-helpers.cjs +422 -0
- package/src/scraper/modules/video-scanner.cjs +43 -0
- package/src/watch/public/index.html +266 -0
- package/src/watch/server.mjs +145 -0
package/src/main.mjs
CHANGED
|
@@ -4,7 +4,9 @@ import { fetchExplore } from './lib/explore.js';
|
|
|
4
4
|
import { processUrl } from './lib/scrape.js';
|
|
5
5
|
import { deduplicate, formatOutput } from './lib/output.js';
|
|
6
6
|
import { parseFilter, applyFilter, formatFilterDescription } from './lib/filter.js';
|
|
7
|
+
import { createProgressBar, calculateConcurrency, createMultiProgressBars, renderMultiProgressBars, clearProgressBars } from './lib/io.js';
|
|
7
8
|
import { writeFileSync, readFileSync, existsSync } from 'fs';
|
|
9
|
+
import { startWatchServer, openBrowser } from './watch/server.mjs';
|
|
8
10
|
|
|
9
11
|
function showConfig(urls, outputFile) {
|
|
10
12
|
const lines = [...CONFIG_TEXT];
|
|
@@ -72,6 +74,10 @@ function handleConfig(action, value) {
|
|
|
72
74
|
process.exit(1);
|
|
73
75
|
}
|
|
74
76
|
|
|
77
|
+
/**
 * Pause for a randomly chosen interval.
 *
 * @returns {Promise<void>} Resolves after at least 200 ms and less than 800 ms.
 */
function randomDelay() {
  return new Promise((resolve) => {
    const waitMs = Math.random() * 600 + 200;
    setTimeout(resolve, waitMs);
  });
}
|
|
80
|
+
|
|
75
81
|
function cleanError(msg) {
|
|
76
82
|
return msg
|
|
77
83
|
.replace(/\x1b\[[0-9;]*m/g, '')
|
|
@@ -112,16 +118,46 @@ async function runExplore(exploreCount, urls, proxyUrl, outputFile, outputFormat
|
|
|
112
118
|
if (urls.length > 0) {
|
|
113
119
|
const errors = [];
|
|
114
120
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
121
|
+
const concurrency = calculateConcurrency(urls.length);
|
|
122
|
+
const bars = createMultiProgressBars(concurrency);
|
|
123
|
+
|
|
124
|
+
const slots = Array.from({ length: concurrency }, () => []);
|
|
125
|
+
urls.forEach((url, i) => slots[i % concurrency].push(url));
|
|
126
|
+
|
|
127
|
+
bars.forEach((bar, i) => {
|
|
128
|
+
bar.total = slots[i].length;
|
|
129
|
+
bar.status = slots[i].length > 0 ? 'running' : 'done';
|
|
130
|
+
});
|
|
131
|
+
|
|
132
|
+
renderMultiProgressBars(bars);
|
|
133
|
+
|
|
134
|
+
const workers = slots.map(async (slotUrls, slotIndex) => {
|
|
135
|
+
for (const url of slotUrls) {
|
|
136
|
+
bars[slotIndex].url = url;
|
|
137
|
+
renderMultiProgressBars(bars);
|
|
138
|
+
|
|
139
|
+
await randomDelay();
|
|
140
|
+
|
|
141
|
+
try {
|
|
142
|
+
const results = await processUrl(url, proxyUrl);
|
|
143
|
+
allResults.push(...results);
|
|
144
|
+
bars[slotIndex].current++;
|
|
145
|
+
bars[slotIndex].status = 'running';
|
|
146
|
+
} catch (err) {
|
|
147
|
+
errors.push({ url, message: err.message });
|
|
148
|
+
bars[slotIndex].current++;
|
|
149
|
+
bars[slotIndex].status = 'error';
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
renderMultiProgressBars(bars);
|
|
123
153
|
}
|
|
124
|
-
|
|
154
|
+
bars[slotIndex].status = bars[slotIndex].current === bars[slotIndex].total ? 'done' : 'error';
|
|
155
|
+
renderMultiProgressBars(bars);
|
|
156
|
+
});
|
|
157
|
+
|
|
158
|
+
await Promise.all(workers);
|
|
159
|
+
|
|
160
|
+
clearProgressBars();
|
|
125
161
|
console.log();
|
|
126
162
|
|
|
127
163
|
if (errors.length > 0) {
|
|
@@ -173,16 +209,52 @@ async function runScrape(urls, proxyUrl, outputFile, outputFormat, filter) {
|
|
|
173
209
|
const allResults = [];
|
|
174
210
|
const errors = [];
|
|
175
211
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
const results = await processUrl(urls[i], proxyUrl);
|
|
181
|
-
allResults.push(...results);
|
|
182
|
-
} catch (err) {
|
|
183
|
-
errors.push({ url: urls[i], message: err.message });
|
|
212
|
+
if (urls.length === 0) {
|
|
213
|
+
console.log('\n未获取到数据');
|
|
214
|
+
if (outputFile) {
|
|
215
|
+
writeFileSync(outputFile, '[]', 'utf-8');
|
|
184
216
|
}
|
|
217
|
+
return;
|
|
185
218
|
}
|
|
219
|
+
|
|
220
|
+
const concurrency = calculateConcurrency(urls.length);
|
|
221
|
+
const bars = createMultiProgressBars(concurrency);
|
|
222
|
+
|
|
223
|
+
const slots = Array.from({ length: concurrency }, () => []);
|
|
224
|
+
urls.forEach((url, i) => slots[i % concurrency].push(url));
|
|
225
|
+
|
|
226
|
+
bars.forEach((bar, i) => {
|
|
227
|
+
bar.total = slots[i].length;
|
|
228
|
+
bar.status = slots[i].length > 0 ? 'running' : 'done';
|
|
229
|
+
});
|
|
230
|
+
|
|
231
|
+
renderMultiProgressBars(bars);
|
|
232
|
+
|
|
233
|
+
const workers = slots.map(async (slotUrls, slotIndex) => {
|
|
234
|
+
for (const url of slotUrls) {
|
|
235
|
+
bars[slotIndex].url = url;
|
|
236
|
+
renderMultiProgressBars(bars);
|
|
237
|
+
|
|
238
|
+
try {
|
|
239
|
+
const results = await processUrl(url, proxyUrl);
|
|
240
|
+
allResults.push(...results);
|
|
241
|
+
bars[slotIndex].current++;
|
|
242
|
+
bars[slotIndex].status = 'running';
|
|
243
|
+
} catch (err) {
|
|
244
|
+
errors.push({ url, message: err.message });
|
|
245
|
+
bars[slotIndex].current++;
|
|
246
|
+
bars[slotIndex].status = 'error';
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
renderMultiProgressBars(bars);
|
|
250
|
+
}
|
|
251
|
+
bars[slotIndex].status = bars[slotIndex].current === bars[slotIndex].total ? 'done' : 'error';
|
|
252
|
+
renderMultiProgressBars(bars);
|
|
253
|
+
});
|
|
254
|
+
|
|
255
|
+
await Promise.all(workers);
|
|
256
|
+
|
|
257
|
+
clearProgressBars();
|
|
186
258
|
console.log();
|
|
187
259
|
|
|
188
260
|
const uniqueResults = deduplicate(allResults);
|
|
@@ -242,8 +314,309 @@ async function runScrape(urls, proxyUrl, outputFile, outputFormat, filter) {
|
|
|
242
314
|
}
|
|
243
315
|
}
|
|
244
316
|
|
|
317
|
+
/**
 * Run the `scrape` subcommand: hand one starting video URL to the browser
 * scraper and emit the result as pretty-printed JSON.
 *
 * The JSON goes to `outputFile` when given, otherwise to stdout. Usage text,
 * progress and the final summary go to stderr. Exits the process with code 1
 * when no URL was supplied or when scraping/writing fails.
 *
 * @param {object} options - Parsed CLI options.
 * @param {string} [options.scrapeUrl] - Video URL to start from (required).
 * @param {*} [options.scrapePreset] - Forwarded to the scraper as `preset`.
 * @param {*} [options.scrapeMaxVideos] - Forwarded as `maxVideos`.
 * @param {*} [options.scrapeMaxComments] - Forwarded as `maxComments`.
 * @param {*} [options.scrapeSwitchDelay] - Forwarded as `switchMax`.
 * @param {*} [options.scrapeCommentDelay] - Forwarded as `commentMax`.
 * @param {string} [options.outputFile] - Destination file for the JSON.
 * @returns {Promise<void>}
 */
async function handleScrape(options) {
  const { scrapeUrl, scrapePreset, scrapeMaxVideos, scrapeMaxComments, scrapeSwitchDelay, scrapeCommentDelay, outputFile } = options;

  if (!scrapeUrl) {
    console.error('用法: tt-help scrape <视频URL> [preset] [最大视频数] [最大评论数] [-o 输出路径]');
    console.error('预设: fast, normal, slow, stealth');
    console.error('选项: -o, --output <路径> 输出到文件(默认输出到 stdout)');
    console.error(' --switch-delay <ms> 视频切换延迟(毫秒)');
    console.error(' --comment-delay <ms> 评论滚动延迟(毫秒)');
    process.exit(1);
  }

  const { runScrape } = await import('./lib/scrape-browser.mjs');

  let browser;
  let failed = false;
  try {
    const { output, browser: b } = await runScrape({
      videoUrl: scrapeUrl,
      maxVideos: scrapeMaxVideos,
      maxComments: scrapeMaxComments,
      preset: scrapePreset,
      switchMax: scrapeSwitchDelay,
      commentMax: scrapeCommentDelay,
      log: console.error,
    });
    browser = b;

    const json = JSON.stringify(output, null, 2);
    if (outputFile) {
      writeFileSync(outputFile, json, 'utf-8');
      console.error(`结果已写入: ${outputFile}`);
    } else {
      process.stdout.write(json + '\n');
    }

    const stats = output.stats;
    console.error(`\n共 ${stats.totalVideos} 个视频, ${stats.uniqueVideoAuthors} 个视频作者, ${stats.uniqueCommentAuthors} 个评论作者`);
  } catch (err) {
    console.error(`浏览器抓取失败: ${err.message}`);
    // Do not call process.exit() here: it terminates immediately and would
    // skip the `finally` block, leaving the browser open.
    failed = true;
  } finally {
    if (browser) await browser.close().catch(() => {});
  }

  if (failed) process.exit(1);
}
|
|
361
|
+
|
|
362
|
+
/**
 * Run the `watch` subcommand: start the watch server for an existing data
 * file and open it in a browser.
 *
 * Prints usage and exits with code 1 when no data file was given or the file
 * does not exist. Once the server is up, a one-shot SIGINT handler closes it
 * and exits with code 0.
 *
 * @param {object} options - Parsed CLI options.
 * @param {string} [options.outputFile] - Data file to serve (required).
 * @param {*} [options.watchPort] - Port passed through to `startWatchServer`.
 * @returns {Promise<void>}
 */
async function handleWatch(options) {
  const { outputFile: dataFile, watchPort: requestedPort } = options;

  if (!dataFile) {
    const usage = [
      '用法: tt-help watch -o <数据文件> [-p 端口]',
      '示例: tt-help watch -o data.json',
      ' tt-help watch -o data.json -p 8080',
    ];
    for (const line of usage) {
      console.error(line);
    }
    process.exit(1);
  }

  const fileMissing = !existsSync(dataFile);
  if (fileMissing) {
    console.error(`文件不存在: ${dataFile}`);
    process.exit(1);
  }

  const { server: liveServer, port: boundPort } = await startWatchServer(dataFile, requestedPort);
  openBrowser(boundPort);

  // Ctrl+C: stop the server, then leave with a success status.
  const shutDown = () => {
    liveServer.close();
    process.exit(0);
  };
  process.once('SIGINT', shutDown);

  console.error(`按 Ctrl+C 停止监控服务`);
}
|
|
387
|
+
|
|
388
|
+
/**
 * Run the `auto` subcommand: work through a queue of usernames, record what
 * `processUser` returns for each one in the data store, and append newly
 * discovered video/comment authors to the same queue.
 *
 * The queue starts with the de-duplicated command-line usernames, followed by
 * the store's pending users that are not marked restricted. The store is saved
 * after every processed user. When `outputFile` is not set, the full user list
 * is written to stdout as JSON at the end; all progress goes to stderr.
 *
 * Exits the process with code 1 when `--watch` is requested without an output
 * file, or when processing fails. The browser and the optional watch server
 * are closed before that exit.
 *
 * @param {object} options - Parsed CLI options.
 * @param {string[]} options.autoUsernames - Usernames given on the command line.
 * @param {*} [options.autoCollectMax] - Forwarded to `processUser` as `collectMax`.
 * @param {*} [options.autoScrapeDepth] - Forwarded as `scrapeDepth`.
 * @param {*} [options.autoMaxComments] - Forwarded as `maxComments`.
 * @param {*} [options.autoPreset] - Forwarded as `preset`.
 * @param {*} [options.autoSwitchDelay] - Forwarded as `switchMax`.
 * @param {*} [options.autoCommentDelay] - Forwarded as `commentMax`.
 * @param {string} [options.outputFile] - File backing the data store.
 * @param {boolean} [options.autoWatch] - Also start the watch server.
 * @param {number} [options.autoWatchPort] - Watch server port (falls back to 3000).
 * @returns {Promise<void>}
 */
async function handleAuto(options) {
  const { autoUsernames, autoCollectMax, autoScrapeDepth, autoMaxComments, autoPreset, autoSwitchDelay, autoCommentDelay, outputFile, autoWatch, autoWatchPort } = options;

  const runOptions = {
    collectMax: autoCollectMax,
    scrapeDepth: autoScrapeDepth,
    maxComments: autoMaxComments,
    preset: autoPreset,
    switchMax: autoSwitchDelay,
    commentMax: autoCommentDelay,
  };

  // Data source (the store is a .cjs module, loaded through createRequire).
  const { createRequire } = await import('module');
  const require = createRequire(import.meta.url);
  const { createStore } = require('./data-store.cjs');
  const store = createStore(outputFile);

  // Build the queue: command-line usernames go first, pending users from the
  // store are appended after them. `queued` mirrors `queue` so membership
  // checks stay O(1) while the queue grows during processing.
  const queue = [...new Set(autoUsernames)];
  const queued = new Set(queue);
  const enqueue = (uniqueId) => {
    if (queued.has(uniqueId)) return;
    queued.add(uniqueId);
    queue.push(uniqueId);
  };

  const pendingFromStore = store.getPendingUsers().filter(u => !u.restricted);
  pendingFromStore.forEach(u => enqueue(u.uniqueId));

  if (queue.length === 0) {
    console.error('没有待处理的用户');
    return;
  }

  console.error(`队列: ${queue.length} 个用户待处理`);
  if (autoUsernames.length > 0) {
    console.error(` 命令行: @${autoUsernames.join(', @')}`);
  }
  if (pendingFromStore.length > 0) {
    console.error(` 数据源: ${pendingFromStore.length} 个未处理用户`);
  }

  // Watch server
  let watchServer = null;
  let watchPort = null;
  if (autoWatch) {
    if (!outputFile) {
      console.error('--watch 需要指定 -o 输出文件');
      process.exit(1);
    }
    ({ server: watchServer, port: watchPort } = await startWatchServer(outputFile, autoWatchPort || 3000));
    openBrowser(watchPort);
  }

  // Start the browser
  const { ensureBrowserReady, processUser } = await import('./lib/auto-browser.mjs');

  let browser;
  try {
    browser = await ensureBrowserReady();
  } catch (err) {
    // Nothing below runs without a browser, so release the watch server
    // before handing the error to the caller.
    if (watchServer) watchServer.close();
    throw err;
  }

  let failed = false;
  try {
    // Reuse an already open TikTok page when there is one; otherwise open a
    // new page in the first context (or in a fresh context).
    const contexts = browser.contexts();
    let page = null;
    for (const ctx of contexts) {
      for (const p of ctx.pages()) {
        if (p.url().includes('tiktok.com')) {
          page = p;
          break;
        }
      }
      if (page) break;
    }
    if (!page) {
      const defaultCtx = contexts[0] || await browser.newContext();
      page = await defaultCtx.newPage();
    }

    let processedCount = 0;
    let errorCount = 0;

    // Index loop on purpose: the queue grows while it is being walked.
    for (let i = 0; i < queue.length; i++) {
      const username = queue[i];
      console.error(`\n[${i + 1}/${queue.length}] 处理 @${username}...`);

      const result = await processUser(page, username, { ...runOptions, browser }, console.error);

      if (result.restricted) {
        store.addUser({
          uniqueId: username,
          restricted: true,
          sources: ['restricted'],
        });
        store.save();
        continue;
      }

      if (result.error) {
        errorCount++;
        store.addUser({
          uniqueId: username,
          error: result.error,
          sources: ['error'],
        });
        store.save();
        continue;
      }

      // Write the user's info (always merged in, whether or not it already exists).
      const userEntry = {
        uniqueId: username,
        ...result.userInfo,
        sources: ['processed'],
      };
      store.addUser(userEntry);

      // Discovered video authors (always merged in, whether or not they already exist).
      for (const va of result.discoveredVideoAuthors) {
        store.addUser({
          uniqueId: va.uniqueId,
          nickname: va.nickname,
          locationCreated: va.locationCreated,
          sources: ['video'],
        });
        // Only users without a follower count yet are queued for processing.
        const known = store.getUser(va.uniqueId);
        if (!known || !known.followerCount) {
          enqueue(va.uniqueId);
        }
      }

      // Discovered comment authors
      for (const ca of result.discoveredCommentAuthors) {
        const caId = ca.replace(/^@/, '');
        store.addUser({
          uniqueId: caId,
          sources: ['comment'],
        });
        const known = store.getUser(caId);
        if (!known || !known.followerCount) {
          enqueue(caId);
        }
      }

      processedCount++;
      store.save();
      console.error(` 已保存,当前共 ${store.getAllUsers().length} 个用户`);
    }

    const output = store.getAllUsers();
    if (outputFile) {
      console.error(`\n完成: ${processedCount} 个用户已处理, ${errorCount} 个出错, 共 ${output.length} 个用户`);
      console.error(`数据已保存到: ${outputFile}`);
    } else {
      const json = JSON.stringify(output, null, 2);
      process.stdout.write(json + '\n');
    }
  } catch (err) {
    console.error(`自动抓取失败: ${err.message}`);
    // Do not call process.exit() here: it terminates immediately and would
    // skip the `finally` block, leaving the browser open.
    failed = true;
  } finally {
    await browser.close().catch(() => {});
    if (watchServer) {
      watchServer.close();
      console.error(`Watch 监控服务已停止: http://127.0.0.1:${watchPort}`);
    }
  }

  if (failed) process.exit(1);
}
|
|
554
|
+
|
|
555
|
+
/**
 * Run the `videos` subcommand: fetch a user's video list through the browser
 * helper and emit it as pretty-printed JSON.
 *
 * The JSON goes to `outputFile` when given, otherwise to stdout. Usage text,
 * progress and the final summary go to stderr. Exits the process with code 1
 * when no username was supplied or when fetching/writing fails.
 *
 * @param {object} options - Parsed CLI options.
 * @param {string} [options.videosUsername] - Username to look up (required).
 * @param {*} [options.videosMax] - Forwarded to the helper as `maxVideos`.
 * @param {string} [options.outputFile] - Destination file for the JSON.
 * @returns {Promise<void>}
 */
async function handleVideos(options) {
  const { videosUsername, videosMax, outputFile } = options;

  if (!videosUsername) {
    console.error('用法: tt-help videos <用户名> [最大视频数] [-o 输出路径]');
    console.error('示例: tt-help videos bar.lar.lar.moeta 1000');
    console.error(' tt-help videos username 50 -o videos.json');
    console.error('');
    console.error('选项: -o, --output <路径> 输出到文件(默认输出到 stdout)');
    process.exit(1);
  }

  const { runGetUserVideos } = await import('./lib/get-user-videos-browser.mjs');

  let browser;
  let failed = false;
  try {
    const { output, browser: b } = await runGetUserVideos({
      username: videosUsername,
      maxVideos: videosMax,
      log: console.error,
    });
    browser = b;

    const json = JSON.stringify(output, null, 2);
    if (outputFile) {
      writeFileSync(outputFile, json, 'utf-8');
      console.error(`结果已写入: ${outputFile}`);
    } else {
      process.stdout.write(json + '\n');
    }

    const videoCount = output.videos.length;
    console.error(`\n共 ${videoCount} 个视频, 用户: @${videosUsername}`);
  } catch (err) {
    console.error(`获取用户视频失败: ${err.message}`);
    // Do not call process.exit() here: it terminates immediately and would
    // skip the `finally` block, leaving the browser open.
    failed = true;
  } finally {
    if (browser) await browser.close().catch(() => {});
  }

  if (failed) process.exit(1);
}
|
|
595
|
+
|
|
245
596
|
async function main() {
|
|
246
|
-
const
|
|
597
|
+
const parsed = parseArgs();
|
|
598
|
+
|
|
599
|
+
if (parsed.subcommand === 'scrape') {
|
|
600
|
+
await handleScrape(parsed);
|
|
601
|
+
return;
|
|
602
|
+
}
|
|
603
|
+
|
|
604
|
+
if (parsed.subcommand === 'videos') {
|
|
605
|
+
await handleVideos(parsed);
|
|
606
|
+
return;
|
|
607
|
+
}
|
|
608
|
+
|
|
609
|
+
if (parsed.subcommand === 'auto') {
|
|
610
|
+
await handleAuto(parsed);
|
|
611
|
+
return;
|
|
612
|
+
}
|
|
613
|
+
|
|
614
|
+
if (parsed.subcommand === 'watch') {
|
|
615
|
+
await handleWatch(parsed);
|
|
616
|
+
return;
|
|
617
|
+
}
|
|
618
|
+
|
|
619
|
+
const { urls, outputFile, outputFormat, exploreCount, showConfig: showCfg, showHelp, customProxy, configAction, configValue, pipeMode, filterStr } = parsed;
|
|
247
620
|
const proxyUrl = customProxy || proxy;
|
|
248
621
|
const filter = parseFilter(filterStr);
|
|
249
622
|
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
{
|
|
2
|
+
"user": {
|
|
3
|
+
"uniqueId": "bar.lar.lar.moeta",
|
|
4
|
+
"secUid": "MS4wLjABAAAA3cgKTWvKfga0JAWeakAzx3zQ-aFAC8RuQvxD4HQFraKKsc_TbOIyMo3_ofVlXofV",
|
|
5
|
+
"nickname": "Bar Lar Lar Moetain",
|
|
6
|
+
"ttSeller": false,
|
|
7
|
+
"verified": false,
|
|
8
|
+
"followerCount": 24000,
|
|
9
|
+
"videoCount": 749,
|
|
10
|
+
"followingCount": 4293,
|
|
11
|
+
"heartCount": 254300,
|
|
12
|
+
"signature": ""
|
|
13
|
+
},
|
|
14
|
+
"totalVideos": 5,
|
|
15
|
+
"videos": [
|
|
16
|
+
{
|
|
17
|
+
"id": "7638231799084158228",
|
|
18
|
+
"url": "https://www.tiktok.com/@bar.lar.lar.moeta/video/7638231799084158228"
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"id": "7638162444698914068",
|
|
22
|
+
"url": "https://www.tiktok.com/@bar.lar.lar.moeta/video/7638162444698914068"
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"id": "7638116251767819541",
|
|
26
|
+
"url": "https://www.tiktok.com/@bar.lar.lar.moeta/video/7638116251767819541"
|
|
27
|
+
},
|
|
28
|
+
{
|
|
29
|
+
"id": "7638069637321690388",
|
|
30
|
+
"url": "https://www.tiktok.com/@bar.lar.lar.moeta/video/7638069637321690388"
|
|
31
|
+
},
|
|
32
|
+
{
|
|
33
|
+
"id": "7637927171025112341",
|
|
34
|
+
"url": "https://www.tiktok.com/@bar.lar.lar.moeta/video/7637927171025112341"
|
|
35
|
+
}
|
|
36
|
+
]
|
|
37
|
+
}
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
const {
|
|
2
|
+
closeCommentPanel,
|
|
3
|
+
delay,
|
|
4
|
+
ensureBrowserReady,
|
|
5
|
+
ensureTikTokPage,
|
|
6
|
+
setDelayConfig,
|
|
7
|
+
getDelayConfig,
|
|
8
|
+
retryWithBackoff,
|
|
9
|
+
} = require("./modules/page-helpers.cjs");
|
|
10
|
+
const { extractCommentAuthors } = require("./modules/comment-extractor.cjs");
|
|
11
|
+
|
|
12
|
+
/**
 * Collect the author details and comment authors of the video currently
 * shown on `page`.
 *
 * Waits up to 10 s for the video meta element (a timeout is ignored), reads
 * the author id from the page URL, a nickname from the first suitable
 * "Author" element and `locationCreated` from the page HTML, then extracts
 * the comment authors and closes the comment panel.
 *
 * @param {object} page - Browser page positioned on a video.
 * @param {number} maxComments - Limit passed to `extractCommentAuthors`.
 * @param {Function} log - Accepted for interface compatibility; not used here.
 * @returns {Promise<{videoAuthor: string, uniqueId: string, nickname: (string|undefined), locationCreated: (string|undefined), commentUsers: string[]}>}
 *   `commentUsers` is de-duplicated, in first-seen order.
 * @throws {Error} When the author id cannot be read from the page URL.
 */
async function scrapeSingleVideo(page, maxComments, log) {
  const delays = getDelayConfig();
  // Same pause before reading the page and after closing the comment panel.
  const pause = () => delay(Math.round(delays.commentMax * 0.3), delays.commentMax);

  await page
    .waitForSelector('[class*="VideoMeta"]', { timeout: 10000 })
    .catch(() => {});
  await pause();

  // Runs inside the page, so it must not reference anything from this scope.
  const pageInfo = await page.evaluate(() => {
    const info = {};

    const urlMatch = window.location.href.match(/\/@([^\/]+)\/video/);
    if (urlMatch) info.uniqueId = urlMatch[1];

    for (const node of document.querySelectorAll('[class*="Author"]')) {
      const label = (node.textContent || "").trim();
      if (!label || label.includes("TikTok") || label.includes("Share")) continue;
      info.nickname = label;
      break;
    }

    const locationMatch = document.documentElement.outerHTML.match(/"locationCreated":"([^"]*)/);
    if (locationMatch) info.locationCreated = locationMatch[1];

    return info;
  });

  if (!pageInfo.uniqueId) {
    throw new Error("无法获取视频作者");
  }
  const videoAuthor = "@" + pageInfo.uniqueId;

  const rawCommentUsers = await extractCommentAuthors(page, maxComments);
  await closeCommentPanel(page);
  await pause();

  return {
    videoAuthor,
    uniqueId: pageInfo.uniqueId,
    nickname: pageInfo.nickname,
    locationCreated: pageInfo.locationCreated,
    commentUsers: Array.from(new Set(rawCommentUsers)),
  };
}
|
|
61
|
+
|
|
62
|
+
/**
 * Scroll the feed down by 700px so the next video comes into view.
 * Scrolls the column list container when present, otherwise the window.
 */
function scrollToNextVideo(page) {
  // Runs inside the page, so it must not reference anything from this scope.
  return page.evaluate(() => {
    const container = document.querySelector(
      '[class*="ColumnListContainer"]',
    );
    if (container) container.scrollTop += 700;
    else window.scrollBy(0, 700);
  });
}

/**
 * Scrape up to `maxVideos` consecutive videos starting at `videoUrl`,
 * collecting each video's author and the authors of its comments.
 *
 * Delay settings come from `preset` when given, otherwise from
 * `switchMax`/`commentMax` (a missing one falls back to 5000/3000 ms);
 * with none of them the current delay config is left as is.
 *
 * When both `browser` and `page` are supplied they are used as-is and never
 * closed here. Otherwise a browser is obtained via `ensureBrowserReady`; on
 * success it is returned for the caller to close, and on failure it is closed
 * here before the error is rethrown (the caller never receives the handle in
 * that case).
 *
 * A video that fails to scrape is logged and skipped.
 *
 * @param {object} options
 * @param {string} options.videoUrl - Video URL to start from.
 * @param {number} [options.maxVideos=20] - Number of videos to visit.
 * @param {number} [options.maxComments=999] - Passed to `scrapeSingleVideo`.
 * @param {*} [options.preset=null] - Passed to `setDelayConfig` when set.
 * @param {number} [options.switchMax=null] - Delay setting used between videos.
 * @param {number} [options.commentMax=null] - Delay setting used around comment work.
 * @param {Function} [options.log=console.error] - Progress logger.
 * @param {object} [options.browser=null] - Externally owned browser.
 * @param {object} [options.page=null] - Externally owned page.
 * @returns {Promise<{output: object, browser: object, isExternal: boolean}>}
 *   `output` holds `videoDetails` (one entry per distinct author),
 *   `commentUsers` (sorted, unique), `allCommentAuthorsList` and `stats`.
 */
async function runScrape(options) {
  const {
    videoUrl,
    maxVideos = 20,
    maxComments = 999,
    preset = null,
    switchMax = null,
    commentMax = null,
    log = console.error,
    browser: externalBrowser = null,
    page: externalPage = null,
  } = options;

  if (preset) {
    setDelayConfig(preset);
  } else if (switchMax || commentMax) {
    setDelayConfig({
      switchMax: switchMax || 5000,
      commentMax: commentMax || 3000,
    });
  }

  const config = getDelayConfig();

  let browser, page;
  const isExternal = !!(externalBrowser && externalPage);

  if (isExternal) {
    browser = externalBrowser;
    page = externalPage;
  } else {
    log(`视频地址: ${videoUrl}`);
    log(
      `视频数: ${maxVideos}, 评论数: ${maxComments}, 切换延迟: ${config.switchMax}ms, 评论延迟: ${config.commentMax}ms`,
    );

    browser = await ensureBrowserReady();
    try {
      page = await ensureTikTokPage(browser, videoUrl);
    } catch (e) {
      await browser.close().catch(() => {});
      throw e;
    }
  }

  try {
    await retryWithBackoff(() => page.goto(videoUrl, { waitUntil: "load", timeout: 30000 }), { log });
    await delay(Math.round(config.switchMax * 0.5), config.switchMax);
    await closeCommentPanel(page);
    await delay(Math.round(config.commentMax * 0.5), config.commentMax);

    const allResults = [];
    const videoAuthors = new Set();
    const commentUsers = new Set();
    const allCommentAuthorsList = [];

    for (let i = 0; i < maxVideos; i++) {
      await delay(Math.round(config.commentMax * 0.3), config.commentMax);

      let result;
      try {
        result = await scrapeSingleVideo(page, maxComments, log);
      } catch (e) {
        log(`[${i + 1}/${maxVideos}] 跳过: ${e.message}`);
        if (i < maxVideos - 1) {
          await scrollToNextVideo(page);
          await delay(Math.round(config.switchMax * 0.5), config.switchMax);
        }
        continue;
      }
      allResults.push(result);
      videoAuthors.add(result.videoAuthor);
      result.commentUsers.forEach((u) => commentUsers.add(u));
      allCommentAuthorsList.push(...result.commentUsers);

      // Progress line for the first video and then every fifth one.
      if ((i + 1) % 5 === 0 || i === 0) {
        log(
          `[${i + 1}/${maxVideos}] ${result.videoAuthor} | 昵称: ${result.nickname || "-"} | 评论用户: ${result.commentUsers.length}`,
        );
      }

      if (i < maxVideos - 1) {
        await scrollToNextVideo(page);
        await delay(2000, config.switchMax);
      }
    }

    log(
      `\n结果: 视频作者 ${videoAuthors.size} | 评论用户 ${commentUsers.size} | 总评论 ${allCommentAuthorsList.length}`,
    );

    // Keep the first result seen for each author.
    const videoDetails = new Map();
    for (const r of allResults) {
      if (!videoDetails.has(r.videoAuthor)) {
        videoDetails.set(r.videoAuthor, {
          videoAuthor: r.videoAuthor,
          uniqueId: r.uniqueId,
          nickname: r.nickname,
          locationCreated: r.locationCreated,
        });
      }
    }

    const output = {
      videoDetails: [...videoDetails.values()],
      commentUsers: [...commentUsers].sort(),
      allCommentAuthorsList,
      stats: {
        totalVideos: allResults.length,
        uniqueVideoAuthors: videoAuthors.size,
        uniqueCommentAuthors: commentUsers.size,
      },
    };

    return { output, browser, isExternal };
  } catch (e) {
    // A browser obtained here is only handed to the caller on success, so it
    // has to be released on failure. Externally owned browsers are left alone.
    if (!isExternal) await browser.close().catch(() => {});
    throw e;
  }
}
|
|
191
|
+
|
|
192
|
+
module.exports = { scrapeSingleVideo, runScrape };
|