tt-help-cli-ycl 1.3.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/auto-core.mjs +174 -0
- package/src/cli/auto.js +94 -0
- package/src/cli/explore.js +117 -0
- package/src/cli/progress.js +111 -0
- package/src/cli/scrape.js +47 -0
- package/src/cli/utils.js +18 -0
- package/src/cli/videos.js +41 -0
- package/src/cli/watch.js +28 -0
- package/src/data-store.mjs +213 -0
- package/src/{explore-core.cjs → explore-core.mjs} +148 -157
- package/src/{get-user-videos-core.cjs → get-user-videos-core.mjs} +6 -23
- package/src/lib/args.js +19 -38
- package/src/lib/auto-browser.mjs +5 -12
- package/src/lib/browser/anti-detect.js +23 -0
- package/src/lib/browser/cdp.js +142 -0
- package/src/lib/browser/launch.js +43 -0
- package/src/lib/browser/page.js +62 -0
- package/src/lib/constants.js +13 -95
- package/src/lib/delay.js +54 -0
- package/src/lib/explore.js +16 -123
- package/src/lib/fetcher.js +3 -18
- package/src/lib/get-user-videos-browser.mjs +1 -6
- package/src/lib/io.js +8 -30
- package/src/lib/parser.js +1 -1
- package/src/lib/retry.js +44 -0
- package/src/lib/scrape-browser.mjs +1 -6
- package/src/lib/scrape.js +5 -4
- package/src/lib/url.js +52 -0
- package/src/main.mjs +59 -822
- package/src/scraper/{core.cjs → core.mjs} +25 -57
- package/src/scraper/modules/{comment-extractor.cjs → comment-extractor.mjs} +23 -15
- package/src/scraper/modules/follow-extractor.mjs +121 -0
- package/src/scraper/modules/{guess-extractor.cjs → guess-extractor.mjs} +3 -5
- package/src/scraper/modules/page-error-detector.mjs +68 -0
- package/src/scraper/modules/page-helpers.mjs +44 -0
- package/src/scraper/modules/scroll-collector.mjs +189 -0
- package/src/watch/public/index.html +139 -64
- package/src/watch/server.mjs +234 -153
- package/src/auto-core.cjs +0 -367
- package/src/data-store.cjs +0 -69
- package/src/get-user-videos.cjs +0 -59
- package/src/scraper/index.cjs +0 -97
- package/src/scraper/modules/follow-extractor.cjs +0 -112
- package/src/scraper/modules/page-helpers.cjs +0 -422
- package/src/scraper/modules/scroll-collector.cjs +0 -173
- package/src/scraper/modules/video-scanner.cjs +0 -43
|
@@ -1,422 +0,0 @@
|
|
|
1
|
-
const { chromium } = require('playwright');
|
|
2
|
-
const { exec } = require('child_process');
|
|
3
|
-
const http = require('http');
|
|
4
|
-
const os = require('os');
|
|
5
|
-
const path = require('path');
|
|
6
|
-
|
|
7
|
-
// Number of characters extractUserSection keeps, counted from the first
// "uniqueId" marker found in the page HTML.
const USER_SECTION_SIZE = 12000;

// Local port on which the browser's remote-debugging (CDP) endpoint is probed.
const CDP_PORT = 9222;
|
|
9
|
-
|
|
10
|
-
// --- Retry utilities ---

// Regex sources matched case-insensitively against an error's message; any
// match marks the error as transient and worth retrying.
const RETRYABLE_PATTERNS = [
  'interrupted',
  'Navigation.*interrupted',
  'net::',
  'ECONN',
  'ETIMEDOUT',
  'ENOTFOUND',
  'EAI_AGAIN',
  'ESOCKETRESET',
  'connection.*refused',
  'connection.*reset',
  'failed.*navigate',
  'target.*closed',
  'crash',
];

// Compiled once at load time instead of on every isRetryableError call.
// No `g`/`y` flag, so `.test()` stays stateless and the regexes are safe to share.
const RETRYABLE_REGEXES = RETRYABLE_PATTERNS.map((source) => new RegExp(source, 'i'));

/**
 * Decide whether an error looks transient (network hiccup, interrupted
 * navigation, closed target, crash) based on its message text.
 *
 * @param {*} error - An Error, a string, or any other thrown value.
 * @returns {boolean} true when the message matches one of RETRYABLE_PATTERNS;
 *   false for falsy input or when nothing matches.
 */
function isRetryableError(error) {
  if (!error) return false;
  // String() keeps non-string `message` values (e.g. numeric codes) from throwing.
  const msg = String(error.message || error);
  return RETRYABLE_REGEXES.some((re) => re.test(msg));
}
|
|
33
|
-
|
|
34
|
-
/**
 * Call `fn`, retrying with exponential backoff while it fails with an error
 * that isRetryableError accepts.
 *
 * `fn` is always attempted at least once, even when `maxRetries` is zero,
 * negative or not a number.
 *
 * @param {Function} fn - Operation to run; may return a value or a promise.
 * @param {Object} [options]
 * @param {number} [options.maxRetries=3] - Retries allowed after the first attempt.
 * @param {number} [options.baseDelay=3000] - Base wait in ms, doubled on each retry.
 * @param {Function} [options.log] - Optional sink for a progress line before each wait.
 * @returns {Promise<*>} Whatever `fn` resolves to.
 * @throws The last error from `fn` once retries are exhausted or the error is
 *   not retryable.
 */
async function retryWithBackoff(fn, { maxRetries = 3, baseDelay = 3000, log } = {}) {
  for (let attempt = 0; ; attempt++) {
    try {
      return await fn();
    } catch (error) {
      // `!(attempt < maxRetries)` also stops when maxRetries is NaN, so a bad
      // option can never turn this into an endless loop.
      if (!(attempt < maxRetries) || !isRetryableError(error)) {
        throw error;
      }
      // Up to 2s of random jitter is added on top of the doubled base delay.
      const jitter = Math.random() * 2000;
      const waitTime = baseDelay * Math.pow(2, attempt) + jitter;
      if (log) {
        log(` [重试] ${attempt + 1}/${maxRetries},${Math.round(waitTime / 1000)}s 后重试...`);
      }
      await delay(Math.round(waitTime), Math.round(waitTime));
    }
  }
}
|
|
54
|
-
|
|
55
|
-
// Preset delay configurations (milliseconds).
const DELAY_PRESETS = {
  fast: { switchMax: 300, commentMax: 200, fast: true },
  normal: { switchMax: 1500, commentMax: 800 },
  slow: { switchMax: 3000, commentMax: 2000 },
  stealth: { switchMax: 5000, commentMax: 3500 },
};

// Currently active delay configuration, shared by every helper in this module.
const delayConfig = {
  switchMax: 2500,
  commentMax: 1500,
  fast: false,
};

/**
 * Update the active delay configuration.
 *
 * @param {string|Object} config - A preset name (case-insensitive), or an
 *   object with optional `switchMax`, `commentMax` and `fast`. In the object
 *   form, falsy `switchMax`/`commentMax` leave the current value untouched and
 *   `fast` is reset to false unless truthy. Any other value (including null)
 *   is ignored.
 * @throws {Error} When a string is given that is not a known preset name.
 */
function setDelayConfig(config) {
  if (typeof config === 'string') {
    const name = config.toLowerCase();
    // Own-property check: names such as "constructor" must not resolve to
    // members inherited from Object.prototype.
    const preset = Object.prototype.hasOwnProperty.call(DELAY_PRESETS, name)
      ? DELAY_PRESETS[name]
      : undefined;
    if (!preset) {
      throw new Error(
        `未知的延迟预设: ${config}\n` +
        `可用预设: ${Object.keys(DELAY_PRESETS).join(', ')}`
      );
    }
    delayConfig.switchMax = preset.switchMax;
    delayConfig.commentMax = preset.commentMax;
    delayConfig.fast = preset.fast || false;
  } else if (config !== null && typeof config === 'object') {
    // `typeof null` is 'object', hence the explicit null guard above.
    if (config.switchMax) delayConfig.switchMax = config.switchMax;
    if (config.commentMax) delayConfig.commentMax = config.commentMax;
    delayConfig.fast = config.fast || false;
  }
}

/**
 * @returns {Object} A shallow copy of the active delay configuration.
 */
function getDelayConfig() {
  return { ...delayConfig };
}

/**
 * @returns {Object} The preset table itself (not a copy).
 */
function listDelayPresets() {
  return DELAY_PRESETS;
}

/**
 * Wait for a random whole number of milliseconds between `min` and `max`
 * (inclusive, arguments may be given in either order). When the active
 * configuration has `fast` set, the wait is 0 ms.
 *
 * @param {number} min
 * @param {number} max
 * @returns {Promise<void>} Resolves after the wait.
 */
function delay(min, max) {
  const lo = Math.min(min, max);
  const hi = Math.max(min, max);
  const ms = delayConfig.fast ? 0 : Math.floor(Math.random() * (hi - lo + 1)) + lo;
  return new Promise((resolve) => setTimeout(resolve, ms));
}
|
|
108
|
-
|
|
109
|
-
/**
 * Click the first tab-bar item whose text contains "评论" (comments), then
 * wait a random time between half of `delayConfig.commentMax` and the full value.
 *
 * @param {Page} page - Playwright page object.
 * @returns {Promise<void>}
 */
async function openCommentPanel(page) {
  await page.evaluate(() => {
    const candidates = Array.from(document.querySelectorAll('[class*="tabbar-item"]'));
    const commentTab = candidates.find((node) => node.textContent?.includes('评论'));
    if (commentTab) {
      commentTab.click();
    }
  });

  const upperBound = delayConfig.commentMax;
  const lowerBound = Math.round(upperBound * 0.5);
  await delay(lowerBound, upperBound);
}
|
|
122
|
-
|
|
123
|
-
/**
 * Click the last-child `div` of the tab container inside the right-hand panel,
 * if all three elements exist. Does nothing when any of them is missing.
 *
 * @param {Page} page - Playwright page object.
 * @returns {Promise<void>}
 */
async function closeCommentPanel(page) {
  await page.evaluate(() => {
    const panel = document.querySelector('[class*="RightPanelContainer"]');
    if (!panel) return;

    const tabs = panel.querySelector('[class*="TabContainer"]');
    if (!tabs) return;

    const overlay = tabs.querySelector('div:last-child');
    if (!overlay) return;

    overlay.click();
  });
}
|
|
135
|
-
|
|
136
|
-
/**
 * Read the video author's handle from the page URL.
 *
 * @param {Page} page - Playwright page object.
 * @returns {Promise<string|null>} "@handle" taken from an ".../@handle/video..."
 *   URL, or null when the URL does not have that shape.
 */
async function getVideoAuthor(page) {
  const handle = await page.evaluate(() => {
    const found = /@([^/]+)\/video/.exec(window.location.href);
    if (!found) {
      return null;
    }
    return '@' + found[1];
  });
  return handle;
}
|
|
142
|
-
|
|
143
|
-
/**
 * Scroll the column-list container down by 700 px (when it exists), then wait
 * a random time between half of `delayConfig.switchMax` and the full value.
 *
 * @param {Page} page - Playwright page object.
 * @returns {Promise<void>}
 */
async function swipeNextVideo(page) {
  await page.evaluate(() => {
    const feed = document.querySelector('[class*="ColumnListContainer"]');
    if (!feed) return;
    feed.scrollTop = feed.scrollTop + 700;
  });

  const upperBound = delayConfig.switchMax;
  await delay(Math.round(upperBound * 0.5), upperBound);
}
|
|
153
|
-
|
|
154
|
-
/**
 * Cut out the slice of HTML that starts at the first `"uniqueId"` marker and
 * is at most USER_SECTION_SIZE characters long.
 *
 * @param {string} html - Page source.
 * @returns {string|null} The slice, or null when the marker is absent.
 */
function extractUserSection(html) {
  const start = html.indexOf('"uniqueId"');
  return start >= 0 ? html.slice(start, start + USER_SECTION_SIZE) : null;
}
|
|
159
|
-
|
|
160
|
-
/**
 * Decode the body of a JSON string literal (the text between the quotes).
 * Falls back to the legacy `\n` / `\\` replacement when the text is not valid
 * JSON string content (for example when it contains a raw line break).
 */
function decodeJsonStringBody(raw) {
  try {
    return JSON.parse(`"${raw}"`);
  } catch {
    return raw.replace(/\\n/g, '\n').replace(/\\\\/g, '\\');
  }
}

/**
 * Pull user profile fields out of a fragment of serialized JSON using
 * regular expressions (the fragment is not required to be parseable JSON).
 *
 * Fields read:
 *  - `uniqueId`, `uid`, `secUid`: raw text up to the next double quote.
 *  - `nickname`, `signature`: full string literal, with JSON escapes
 *    (`\"`, `\\`, `\n`, `\uXXXX`, ...) decoded.
 *  - `ttSeller`, `verified`: booleans; the key is always present and is
 *    `undefined` when the field is not found.
 *  - `followerCount`, `followingCount`, `heartCount`, `videoCount`,
 *    `diggCount`, `createTime`: base-10 integers.
 *  - `avatarLarger`: raw text with `\u002F` turned back into `/`.
 *
 * For every field only the first occurrence in `section` is used.
 *
 * @param {string} section - Fragment such as the one extractUserSection returns.
 * @returns {Object} Object holding whichever fields were found.
 */
function parseUserSection(section) {
  const data = {};

  for (const key of ['uniqueId', 'uid', 'secUid']) {
    const m = section.match(new RegExp(`"${key}":"([^"]*)`));
    if (m) data[key] = m[1];
  }

  for (const key of ['nickname', 'signature']) {
    // The value pattern accepts escaped characters, so an escaped quote inside
    // the text does not end the match early.
    const m = section.match(new RegExp(`"${key}":"((?:[^"\\\\]|\\\\.)*)"`));
    if (m) data[key] = decodeJsonStringBody(m[1]);
  }

  for (const key of ['ttSeller', 'verified']) {
    const m = section.match(new RegExp(`"${key}":\\s*(true|false)`));
    data[key] = m ? m[1] === 'true' : undefined;
  }

  for (const key of ['followerCount', 'followingCount', 'heartCount', 'videoCount', 'diggCount']) {
    const m = section.match(new RegExp(`"${key}":(\\d+)`));
    if (m) data[key] = parseInt(m[1], 10);
  }

  const mt = section.match(/"createTime":(\d+)/);
  if (mt) data.createTime = parseInt(mt[1], 10);

  const ma = section.match(/"avatarLarger":"([^"]*)/);
  if (ma) data.avatarLarger = ma[1].replace(/\\u002F/g, '/');

  return data;
}
|
|
194
|
-
|
|
195
|
-
/**
 * Find the first `"locationCreated":"..."` entry in the HTML.
 *
 * @param {string} html - Page source.
 * @returns {string|null} The text up to the next double quote, or null when
 *   the key is not present.
 */
function extractLocationCreated(html) {
  const found = /"locationCreated":"([^"]*)/.exec(html);
  if (found === null) {
    return null;
  }
  return found[1];
}
|
|
199
|
-
|
|
200
|
-
/**
 * Check whether the CDP port is ready.
 *
 * Sends a GET to `http://127.0.0.1:<CDP_PORT>/json`.
 *
 * @returns {Promise<boolean>} true when the response completes with status
 *   200; false on any other status, on a request error, or after 3 seconds
 *   without socket activity. The promise never rejects.
 */
function checkCDPPort() {
  const endpoint = `http://127.0.0.1:${CDP_PORT}/json`;

  return new Promise((settle) => {
    const request = http.get(endpoint, (response) => {
      // The body is irrelevant; drain it so that 'end' fires.
      response.resume();
      response.on('end', () => {
        settle(response.statusCode === 200);
      });
    });

    request.on('error', () => {
      settle(false);
    });

    request.setTimeout(3000, () => {
      settle(false);
      request.destroy();
    });
  });
}
|
|
213
|
-
|
|
214
|
-
/**
 * Check whether the Edge main process was started with a --user-data-dir argument.
 *
 * Runs a platform-specific process-listing pipeline that filters for
 * "user-data-dir".
 *
 * @returns {Promise<boolean>} true when the command succeeds and prints
 *   something; false otherwise. The promise never rejects.
 */
function checkEdgeArgs() {
  let command;
  switch (os.platform()) {
    case 'darwin':
      command = 'ps aux | grep "[M]icrosoft Edge" | grep -v "Helper\\|crashpad" | grep "user-data-dir"';
      break;
    case 'win32':
      command = 'wmic process where "name like \"%msedge%\"" get commandline | findstr "user-data-dir"';
      break;
    default:
      command = 'ps aux | grep "[m]sedge" | grep -v "Helper\\|crashpad" | grep "user-data-dir"';
  }

  return new Promise((settle) => {
    exec(command, (failure, output) => {
      // grep/findstr exit non-zero when nothing matches, which surfaces as `failure`.
      const matched = !failure && output.trim().length > 0;
      settle(matched);
    });
  });
}
|
|
233
|
-
|
|
234
|
-
/**
 * Get the name used to start the Edge browser on the current platform.
 *
 * @returns {string} `"Microsoft Edge"` (including the double quotes) on
 *   macOS, `msedge.exe` on Windows, `msedge` everywhere else.
 */
function getEdgePath() {
  switch (os.platform()) {
    case 'darwin':
      return '"Microsoft Edge"';
    case 'win32':
      return 'msedge.exe';
    default:
      return 'msedge';
  }
}
|
|
247
|
-
|
|
248
|
-
/**
 * Kill any Edge browser processes that are already running.
 *
 * Best effort: the result of the kill command is ignored and the returned
 * promise always resolves. On macOS the command also removes the
 * `Singleton*` files from Edge's cache directory.
 *
 * @returns {Promise<void>}
 */
function killEdgeProcesses() {
  let command;
  switch (os.platform()) {
    case 'darwin':
      command = 'killall -9 "Microsoft Edge" 2>/dev/null; rm -f ~/Library/Caches/Microsoft\\ Edge/Singleton*; true';
      break;
    case 'win32':
      command = 'taskkill /F /IM msedge.exe 2>nul || exit 0';
      break;
    default:
      command = 'pkill -9 -f msedge 2>/dev/null; true';
  }

  return new Promise((done) => {
    // Outcome deliberately ignored: "nothing to kill" is not a failure here.
    exec(command, () => done());
  });
}
|
|
269
|
-
|
|
270
|
-
/**
 * Launch the Edge browser with the CDP debugging port enabled.
 *
 * The browser is started with `--remote-debugging-port=<CDP_PORT>` and a
 * dedicated `--user-data-dir`.
 *
 * NOTE(review): the profile directory is built from the macOS-style
 * "Library/Application Support" path on every platform. It is left unchanged
 * so existing profiles keep being used — confirm whether that is intended on
 * Windows and Linux.
 *
 * @returns {Promise<void>} Resolves once the launch command has returned.
 * @throws {Error} (rejection) When the launch command fails.
 */
function launchEdgeWithCDP() {
  return new Promise((resolve, reject) => {
    const platform = os.platform();
    const userDataDir = path.join(os.homedir(), 'Library', 'Application Support', 'Microsoft Edge For Testing');
    const flags = `--remote-debugging-port=${CDP_PORT} --user-data-dir="${userDataDir}"`;

    let command;
    if (platform === 'darwin') {
      command = `open -a ${getEdgePath()} --args ${flags}`;
    } else if (platform === 'win32') {
      command = `start msedge ${flags}`;
    } else {
      // exec() only invokes its callback after the child's stdio pipes close.
      // A backgrounded browser that inherits those pipes would keep them open
      // for its whole lifetime, so its output is redirected away.
      command = `msedge ${flags} > /dev/null 2>&1 &`;
    }

    exec(command, (err) => {
      if (err) {
        reject(new Error(`启动 Edge 浏览器失败: ${err.message}`));
        return;
      }
      resolve();
    });
  });
}
|
|
297
|
-
|
|
298
|
-
/**
 * Wait for the CDP port to become ready, for at most `timeout` milliseconds.
 *
 * Polls checkCDPPort, sleeping `interval` ms after each unsuccessful probe.
 *
 * @param {number} [timeout=30000] - Total time budget in milliseconds.
 * @param {number} [interval=1000] - Pause between probes in milliseconds.
 * @returns {Promise<boolean>} true as soon as a probe succeeds; false when the
 *   budget runs out first.
 */
async function waitForCDP(timeout = 30000, interval = 1000) {
  const startedAt = Date.now();
  const sleep = (ms) => new Promise((wake) => setTimeout(wake, ms));

  for (;;) {
    if (Date.now() - startedAt >= timeout) {
      return false;
    }
    if (await checkCDPPort()) {
      return true;
    }
    await sleep(interval);
  }
}
|
|
310
|
-
|
|
311
|
-
/**
 * Make sure the browser is running and its CDP port is usable.
 *
 * Flow:
 *  1. Probe the CDP port. If it answers but Edge was not started with a
 *     user-data-dir argument, kill Edge, wait 3 s and mark it for relaunch.
 *  2. If a launch is needed: kill Edge first when it is running (then wait
 *     3 s), start it in debugging mode and wait for the port to come up.
 *  3. Connect Playwright over CDP.
 *
 * @returns {Promise<Browser>} The Playwright browser instance.
 * @throws {Error} When the CDP port does not become ready in time, or when
 *   launching or connecting fails.
 */
async function ensureBrowserReady() {
  const pause = (ms) => new Promise((wake) => setTimeout(wake, ms));

  let needLaunch = !(await checkCDPPort());

  if (!needLaunch && !(await checkEdgeArgs())) {
    console.error(`Edge 已运行但启动参数不完整,正在重启...`);
    await killEdgeProcesses();
    await pause(3000);
    needLaunch = true;
  }

  if (needLaunch) {
    if (await isEdgeRunning()) {
      console.error(`Edge 已运行但 CDP 端口 ${CDP_PORT} 未启用,正在重启...`);
      await killEdgeProcesses();
      await pause(3000);
    } else {
      console.error(`CDP 端口 ${CDP_PORT} 未就绪,正在启动 Edge 浏览器...`);
    }

    await launchEdgeWithCDP();
    console.error('等待浏览器启动...');

    if (!(await waitForCDP())) {
      throw new Error(
        `等待 CDP 端口 ${CDP_PORT} 超时。请确认 Edge 浏览器已安装,\n` +
        '或手动启动: Microsoft Edge --remote-debugging-port=9222'
      );
    }
    console.error('浏览器启动成功');
  }

  return chromium.connectOverCDP(`http://127.0.0.1:${CDP_PORT}`);
}
|
|
355
|
-
|
|
356
|
-
/**
 * Check whether the Edge browser is currently running.
 *
 * Runs a platform-specific process lookup; the answer comes from the
 * command's exit status only.
 *
 * @returns {Promise<boolean>} true when the lookup command succeeds; false
 *   otherwise. The promise never rejects.
 */
function isEdgeRunning() {
  let command;
  switch (os.platform()) {
    case 'darwin':
      command = 'ps aux | grep -q "[M]icrosoft Edge.app/Contents/MacOS/Microsoft Edge" 2>/dev/null';
      break;
    case 'win32':
      command = 'tasklist /FI "IMAGENAME eq msedge.exe" 2>nul | findstr /I msedge';
      break;
    default:
      command = 'pgrep -f msedge > /dev/null 2>&1';
  }

  return new Promise((settle) => {
    exec(command, (failure) => settle(!failure));
  });
}
|
|
375
|
-
|
|
376
|
-
/**
 * Find an open TikTok page in the connected browser, or open one.
 *
 * Every page of every context is scanned and the first whose URL contains
 * "tiktok.com" is returned as-is (no navigation). When none exists, a new page
 * is opened in the first context — a context is created if the browser has
 * none — and navigated to `url` with retries.
 *
 * @param {Browser} browser - Playwright browser instance.
 * @param {string} url - Address to load when a new page has to be opened.
 * @returns {Promise<Page>} The existing or newly opened page.
 * @throws Whatever the navigation throws once its retries are exhausted.
 */
async function ensureTikTokPage(browser, url) {
  const contexts = browser.contexts();

  for (const ctx of contexts) {
    const existing = ctx.pages().find((p) => p.url().includes('tiktok.com'));
    if (existing) return existing;
  }

  console.error('未找到 TikTok 页面,正在打开...');
  // A browser without any context used to crash here on `undefined.newPage()`.
  const targetCtx = contexts[0] ?? (await browser.newContext());
  const page = await targetCtx.newPage();
  await retryWithBackoff(() => page.goto(url, { waitUntil: 'load', timeout: 30000 }));
  await delay(Math.round(delayConfig.switchMax * 0.5), delayConfig.switchMax);
  console.error('TikTok 页面已打开');

  return page;
}
|
|
404
|
-
|
|
405
|
-
module.exports = {
|
|
406
|
-
delay,
|
|
407
|
-
openCommentPanel,
|
|
408
|
-
closeCommentPanel,
|
|
409
|
-
getVideoAuthor,
|
|
410
|
-
swipeNextVideo,
|
|
411
|
-
extractUserSection,
|
|
412
|
-
parseUserSection,
|
|
413
|
-
extractLocationCreated,
|
|
414
|
-
ensureBrowserReady,
|
|
415
|
-
ensureTikTokPage,
|
|
416
|
-
isEdgeRunning,
|
|
417
|
-
setDelayConfig,
|
|
418
|
-
getDelayConfig,
|
|
419
|
-
listDelayPresets,
|
|
420
|
-
retryWithBackoff,
|
|
421
|
-
isRetryableError,
|
|
422
|
-
};
|
|
@@ -1,173 +0,0 @@
|
|
|
1
|
-
const { delay } = require('./page-helpers.cjs');
|
|
2
|
-
|
|
3
|
-
/**
 * Generic scroll collector built on a three-phase scroll strategy.
 *
 * Scrolling is handled here; what to collect is supplied by the caller
 * through `collectFn`.
 *
 * @param {Page} page - Playwright page object.
 * @param {Object} options
 * @param {string} [options.container] - Selector of the scroll container; window is used when omitted.
 * @param {boolean} [options.findScrollable] - After finding the container, walk up to a scrollable ancestor.
 * @param {Function} options.collectFn - Runs in the page context with (containerEl, extraArgs) and
 *   returns { items: any[] }, the raw data seen this round. A missing or empty
 *   result counts as "no items".
 * @param {*} [options.extraArgs] - Passed through as collectFn's second argument.
 * @param {number[]} [options.delayRange] - Wait after each scroll round, [min, max]; default [800, 1500].
 * @param {number} [options.maxItems] - Stop once this many (de-duplicated) items are held; when omitted,
 *   scrolling continues until the stale rule stops it. The final round is not truncated.
 * @param {number} [options.maxRounds] - Upper bound on scroll rounds (safety net); default 200.
 * @param {number} [options.staleThreshold] - Stop after this many consecutive rounds without new items; default 3.
 * @param {Function} [options.uniqueKey] - (item) => key used for de-duplication; none when omitted.
 * @param {Function} [options.onRound] - Per-round callback (round, newItems, allItems).
 * @returns {Promise<any[]>} Everything collected (de-duplicated when uniqueKey is given).
 * @throws {Error} When collectFn is missing.
 */
async function scrollAndCollect(page, options) {
  const {
    container,
    findScrollable = false,
    collectFn,
    extraArgs,
    delayRange = [800, 1500],
    maxItems,
    maxRounds = 200,
    staleThreshold = 3,
    uniqueKey,
    onRound,
  } = options;

  if (!collectFn) {
    throw new Error('collectFn is required');
  }

  // The collector is shipped to the page as source text because functions
  // cannot be passed across the evaluate boundary.
  const fnStr = typeof collectFn === 'function' ? collectFn.toString() : collectFn;
  const allItems = [];
  const seenKeys = uniqueKey ? new Set() : null;
  let staleCount = 0;

  for (let round = 0; round < maxRounds; round++) {
    // 1. Three-phase scroll.
    await threePhaseScroll(page, { container, findScrollable });

    // 2. Give the content time to load.
    await delay(delayRange[0], delayRange[1]);

    // 3. Collect data inside the page.
    const result = await page.evaluate(({ fn: fnSource, containerSelector, findScrollableFlag, args }) => {
      let el;
      if (!containerSelector) {
        el = window;
      } else {
        el = document.querySelector(containerSelector);
        if (!el) {
          el = window;
        } else if (findScrollableFlag) {
          // Walk up to the nearest ancestor that overflows; fall back to <body>.
          let current = el;
          let found = false;
          while (current && current !== document.body) {
            if (current.scrollHeight > current.clientHeight + 10) {
              el = current;
              found = true;
              break;
            }
            current = current.parentElement;
          }
          if (!found) {
            el = document.body;
          }
        }
      }

      // SECURITY NOTE: evaluates caller-supplied source inside the page.
      // collectFn must only ever come from trusted code, never from user input.
      const fn = eval('(' + fnSource + ')');
      return fn(el, args);
    }, { fn: fnStr, containerSelector: container, findScrollableFlag: findScrollable, args: extraArgs });

    // A collector that returns nothing must not abort the whole run.
    const raw = result?.items ?? [];

    // 4. De-duplicate: keep only what is new this round.
    const newItems = uniqueKey
      ? raw.filter((item) => {
          const key = uniqueKey(item);
          if (seenKeys.has(key)) return false;
          seenKeys.add(key);
          return true;
        })
      : raw;

    allItems.push(...newItems);

    // 5. Notify the caller.
    if (onRound) {
      onRound(round, newItems, allItems);
    }

    // 6. Stop when maxItems has been reached.
    if (maxItems !== undefined && allItems.length >= maxItems) {
      break;
    }

    // 7. Stale check, based on this round's new-item count.
    if (newItems.length === 0) {
      staleCount++;
      if (staleCount >= staleThreshold) {
        break;
      }
    } else {
      staleCount = 0;
    }
  }

  return allItems;
}
|
|
120
|
-
|
|
121
|
-
/**
 * Three-phase scroll: scroll to the end, back off a little, scroll to the end again.
 * Used to trigger TikTok's IntersectionObserver-based lazy loading.
 *
 * The scroll target is resolved inside the page: window when no selector is
 * given or nothing matches it; otherwise the matched element, or — with
 * `findScrollable` — its nearest overflowing ancestor, falling back to <body>.
 *
 * @param {Page} page - Playwright page object.
 * @param {Object} opts
 * @param {string} [opts.container] - Selector of the scroll container.
 * @param {boolean} [opts.findScrollable] - Walk up to a scrollable ancestor.
 * @returns {Promise<void>}
 */
async function threePhaseScroll(page, { container, findScrollable }) {
  // Everything inside the callback runs in the page, so it must not reference
  // anything from the surrounding Node scope.
  await page.evaluate(async ({ container: selector, findScrollable: climb }) => {
    const pause = (min, max) =>
      new Promise((wake) => setTimeout(wake, min + Math.random() * (max - min)));

    const resolveTarget = () => {
      if (!selector) return window;
      const start = document.querySelector(selector);
      if (!start) return window;
      if (!climb) return start;
      for (let node = start; node && node !== document.body; node = node.parentElement) {
        if (node.scrollHeight > node.clientHeight + 10) return node;
      }
      return document.body;
    };

    const target = resolveTarget();

    if (target === window) {
      window.scrollBy(0, window.innerHeight);
      await pause(400, 800);
      window.scrollBy(0, -200);
      await pause(200, 400);
      window.scrollBy(0, window.innerHeight);
      return;
    }

    target.scrollTop = target.scrollHeight;
    await pause(400, 800);
    target.scrollTop -= 100 + Math.random() * 100;
    await pause(200, 400);
    target.scrollTop = target.scrollHeight;
  }, { container, findScrollable });
}
|
|
170
|
-
|
|
171
|
-
module.exports = {
|
|
172
|
-
scrollAndCollect,
|
|
173
|
-
};
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
const { swipeNextVideo, getVideoAuthor, closeCommentPanel, getDelayConfig } = require('./page-helpers.cjs');
|
|
2
|
-
const { extractCommentAuthors } = require('./comment-extractor');
|
|
3
|
-
const { delay } = require('./page-helpers.cjs');
|
|
4
|
-
|
|
5
|
-
/**
 * Scrape the video currently shown: its author plus the distinct authors of
 * its comments.
 *
 * After the comments are read the comment panel is closed, followed by a
 * random wait between 30% of the configured `commentMax` and the full value.
 *
 * @param {Page} page - Playwright page object.
 * @param {number} [maxComments=10] - Forwarded to extractCommentAuthors.
 * @returns {Promise<{videoAuthor: string, commentAuthors: string[]}>}
 *   Comment authors are listed once each, in first-seen order.
 * @throws {Error} When the video author cannot be determined.
 */
async function scrapeSingleVideo(page, maxComments = 10) {
  const videoAuthor = await getVideoAuthor(page);
  if (!videoAuthor) {
    throw new Error('无法获取视频作者');
  }

  const comments = await extractCommentAuthors(page, maxComments);
  await closeCommentPanel(page);

  const { commentMax } = getDelayConfig();
  await delay(Math.round(commentMax * 0.3), commentMax);

  const distinct = new Set();
  for (const comment of comments) {
    distinct.add(comment.author);
  }

  return { videoAuthor, commentAuthors: Array.from(distinct) };
}
|
|
23
|
-
|
|
24
|
-
/**
 * Scrape a run of consecutive videos, swiping to the next one between scrapes.
 *
 * The number of videos visited is `maxComments * 5`; a progress line is
 * printed for each. No swipe follows the last video. An error from any single
 * scrape aborts the whole run.
 *
 * @param {Page} page - Playwright page object.
 * @param {number} [maxComments=10] - Per-video comment limit; also drives how
 *   many videos are visited.
 * @returns {Promise<Array<{videoAuthor: string, commentAuthors: string[]}>>}
 *   One entry per scraped video, in visiting order.
 */
async function scanAndScrape(page, maxComments = 10) {
  const total = maxComments * 5;
  const results = [];

  let visited = 0;
  while (visited < total) {
    const entry = await scrapeSingleVideo(page, maxComments);
    results.push(entry);
    console.log(`[${results.length}] ${entry.videoAuthor} | 评论作者: ${entry.commentAuthors.length}`);

    visited += 1;
    if (visited < total) {
      await swipeNextVideo(page);
    }
  }

  return results;
}
|
|
39
|
-
|
|
40
|
-
module.exports = {
|
|
41
|
-
scrapeSingleVideo,
|
|
42
|
-
scanAndScrape,
|
|
43
|
-
};
|