tt-help-cli-ycl 1.3.0 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -17
- package/cli.js +9 -9
- package/package.json +44 -44
- package/src/cli/auto.js +94 -0
- package/src/cli/explore.js +117 -0
- package/src/cli/progress.js +111 -0
- package/src/cli/scrape.js +47 -0
- package/src/cli/utils.js +18 -0
- package/src/cli/videos.js +41 -0
- package/src/cli/watch.js +28 -0
- package/src/lib/args.js +386 -397
- package/src/lib/browser/anti-detect.js +23 -0
- package/src/lib/browser/cdp.js +142 -0
- package/src/lib/browser/launch.js +43 -0
- package/src/lib/browser/page.js +80 -0
- package/src/lib/constants.js +85 -168
- package/src/lib/delay.js +54 -0
- package/src/lib/explore-fetch.js +118 -0
- package/src/lib/fetcher.js +45 -60
- package/src/lib/filter.js +66 -66
- package/src/lib/io.js +54 -76
- package/src/lib/output.js +80 -80
- package/src/lib/parser.js +47 -47
- package/src/lib/retry.js +44 -0
- package/src/lib/scrape.js +40 -39
- package/src/lib/url.js +52 -0
- package/src/main.mjs +199 -962
- package/src/results/user-videos-bar.lar.lar.moeta.json +37 -0
- package/src/scraper/auto-core.mjs +183 -0
- package/src/scraper/{core.cjs → core.mjs} +188 -214
- package/src/{explore-core.cjs → scraper/explore-core.mjs} +44 -42
- package/src/scraper/modules/captcha-handler.mjs +114 -0
- package/src/scraper/modules/comment-extractor.mjs +69 -0
- package/src/scraper/modules/follow-extractor.mjs +121 -0
- package/src/scraper/modules/{guess-extractor.cjs → guess-extractor.mjs} +51 -53
- package/src/scraper/modules/page-error-detector.mjs +70 -0
- package/src/scraper/modules/page-helpers.mjs +46 -0
- package/src/scraper/modules/scroll-collector.mjs +189 -0
- package/src/{get-user-videos-core.cjs → videos/core.mjs} +126 -143
- package/src/watch/data-store.mjs +239 -0
- package/src/watch/public/index.html +446 -271
- package/src/watch/server.mjs +257 -153
- package/src/auto-core.cjs +0 -367
- package/src/data-store.cjs +0 -69
- package/src/get-user-videos.cjs +0 -59
- package/src/lib/auto-browser.mjs +0 -13
- package/src/lib/explore.js +0 -225
- package/src/lib/get-user-videos-browser.mjs +0 -6
- package/src/lib/scrape-browser.mjs +0 -6
- package/src/scraper/index.cjs +0 -97
- package/src/scraper/modules/comment-extractor.cjs +0 -49
- package/src/scraper/modules/follow-extractor.cjs +0 -112
- package/src/scraper/modules/page-helpers.cjs +0 -422
- package/src/scraper/modules/scroll-collector.cjs +0 -173
- package/src/scraper/modules/video-scanner.cjs +0 -43
- package/src/test-auto-follow.cjs +0 -109
- package/src/test-extractors.cjs +0 -75
- package/src/test-follow.cjs +0 -41
|
@@ -1,27 +1,29 @@
|
|
|
1
|
-
|
|
1
|
+
import {
|
|
2
2
|
delay,
|
|
3
3
|
ensureBrowserReady,
|
|
4
4
|
setDelayConfig,
|
|
5
5
|
closeCommentPanel,
|
|
6
6
|
retryWithBackoff,
|
|
7
|
-
|
|
8
|
-
|
|
7
|
+
detectPageError,
|
|
8
|
+
isLoggedIn,
|
|
9
|
+
} from './modules/page-helpers.mjs';
|
|
10
|
+
export { ensureBrowserReady };
|
|
11
|
+
import {
|
|
9
12
|
getUserInfo,
|
|
10
13
|
collectVideos,
|
|
11
|
-
|
|
12
|
-
}
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
const { extractGuessVideos } = require('./scraper/modules/guess-extractor.cjs');
|
|
14
|
+
} from '../videos/core.mjs';
|
|
15
|
+
import { scrapeSingleVideo } from './core.mjs';
|
|
16
|
+
import { extractFollowAndFollowers } from './modules/follow-extractor.mjs';
|
|
17
|
+
import { extractCommentAuthors } from './modules/comment-extractor.mjs';
|
|
18
|
+
import { extractGuessVideos } from './modules/guess-extractor.mjs';
|
|
17
19
|
|
|
18
20
|
async function processExplore(page, username, options, log) {
|
|
19
21
|
const {
|
|
20
22
|
maxComments = 0,
|
|
21
23
|
maxGuess = 0,
|
|
22
24
|
enableFollow = true,
|
|
23
|
-
maxFollowing =
|
|
24
|
-
maxFollowers =
|
|
25
|
+
maxFollowing = 5,
|
|
26
|
+
maxFollowers = 5,
|
|
25
27
|
location = 'ES',
|
|
26
28
|
} = options;
|
|
27
29
|
|
|
@@ -48,35 +50,13 @@ async function processExplore(page, username, options, log) {
|
|
|
48
50
|
await page.waitForSelector('[class*="DivVideoList"]', { timeout: 10000 }).catch(() => {});
|
|
49
51
|
await delay(1000, 2000);
|
|
50
52
|
|
|
51
|
-
|
|
52
|
-
log(` 获取用户信息...`);
|
|
53
|
+
log(' 获取用户信息...');
|
|
53
54
|
const info = await getUserInfo(page);
|
|
54
55
|
if (info) {
|
|
55
56
|
result.userInfo = info;
|
|
56
57
|
log(` 用户: ${info.nickname || username} | 粉丝: ${info.followerCount || '-'} | 视频: ${info.videoCount || '-'}`);
|
|
57
58
|
}
|
|
58
59
|
|
|
59
|
-
// 2. 获取关注+粉丝(在滚动前执行,避免按钮被滚出视口)
|
|
60
|
-
if (enableFollow) {
|
|
61
|
-
try {
|
|
62
|
-
log(` 获取关注/粉丝...`);
|
|
63
|
-
const { following, followers } = await extractFollowAndFollowers(
|
|
64
|
-
page,
|
|
65
|
-
{ maxFollowing, maxFollowers, log }
|
|
66
|
-
);
|
|
67
|
-
result.discoveredFollowing = following || [];
|
|
68
|
-
result.discoveredFollowers = followers || [];
|
|
69
|
-
result.hasFollowData = true;
|
|
70
|
-
log(` 关注: ${result.discoveredFollowing.length}, 粉丝: ${result.discoveredFollowers.length}`);
|
|
71
|
-
} catch (e) {
|
|
72
|
-
log(` 关注/粉丝提取失败: ${e.message}`);
|
|
73
|
-
result.hasFollowData = false;
|
|
74
|
-
result.discoveredFollowing = [];
|
|
75
|
-
result.discoveredFollowers = [];
|
|
76
|
-
}
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
// 3. 获取视频列表
|
|
80
60
|
const videoList = await collectVideos(page, username, 1, log);
|
|
81
61
|
const videoArray = videoList ? [...videoList.values()] : [];
|
|
82
62
|
result.collectedVideos = videoArray.length;
|
|
@@ -84,17 +64,42 @@ async function processExplore(page, username, options, log) {
|
|
|
84
64
|
if (videoArray.length <= 0) {
|
|
85
65
|
result.processed = true;
|
|
86
66
|
result.noVideo = true;
|
|
87
|
-
const
|
|
88
|
-
if (
|
|
67
|
+
const pageError = await detectPageError(page);
|
|
68
|
+
if (pageError) {
|
|
89
69
|
result.restricted = true;
|
|
90
|
-
log(` @${username}
|
|
70
|
+
log(` @${username} 页面受限(${pageError}),标记跳过`);
|
|
91
71
|
} else {
|
|
92
72
|
log(` @${username} 没有视频,标记已处理`);
|
|
93
73
|
}
|
|
94
74
|
return result;
|
|
95
75
|
}
|
|
96
76
|
|
|
97
|
-
|
|
77
|
+
if (enableFollow) {
|
|
78
|
+
const loggedIn = await isLoggedIn(page);
|
|
79
|
+
if (!loggedIn) {
|
|
80
|
+
log(' [跳过] 获取关注/粉丝:未登录,请先登录 TikTok');
|
|
81
|
+
result.hasFollowData = false;
|
|
82
|
+
result.discoveredFollowing = [];
|
|
83
|
+
result.discoveredFollowers = [];
|
|
84
|
+
} else {
|
|
85
|
+
try {
|
|
86
|
+
log(' 获取关注/粉丝...');
|
|
87
|
+
const { following, followers } = await extractFollowAndFollowers(
|
|
88
|
+
page, { maxFollowing, maxFollowers, log }
|
|
89
|
+
);
|
|
90
|
+
result.discoveredFollowing = following || [];
|
|
91
|
+
result.discoveredFollowers = followers || [];
|
|
92
|
+
result.hasFollowData = true;
|
|
93
|
+
log(` 关注: ${result.discoveredFollowing.length}, 粉丝: ${result.discoveredFollowers.length}`);
|
|
94
|
+
} catch (e) {
|
|
95
|
+
log(` 关注/粉丝提取失败: ${e.message}`);
|
|
96
|
+
result.hasFollowData = false;
|
|
97
|
+
result.discoveredFollowing = [];
|
|
98
|
+
result.discoveredFollowers = [];
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
|
|
98
103
|
const firstVideo = videoArray[0];
|
|
99
104
|
const videoUrl = firstVideo.href.startsWith('http')
|
|
100
105
|
? firstVideo.href
|
|
@@ -104,12 +109,10 @@ async function processExplore(page, username, options, log) {
|
|
|
104
109
|
await retryWithBackoff(() => page.goto(videoUrl, { waitUntil: 'domcontentloaded', timeout: 30000 }), { log });
|
|
105
110
|
await delay(1500, 2500);
|
|
106
111
|
|
|
107
|
-
// 5. 获取视频信息(含 locationCreated)
|
|
108
112
|
const videoData = await scrapeSingleVideo(page, 0, 0, log, 'NEVER_MATCH');
|
|
109
113
|
result.locationCreated = videoData.locationCreated || null;
|
|
110
114
|
log(` 视频作者: ${videoData.videoAuthor} | 国家: ${result.locationCreated || '未知'}`);
|
|
111
115
|
|
|
112
|
-
// 6. 判断是否为目标国家
|
|
113
116
|
const isTargetLocation = result.locationCreated === location;
|
|
114
117
|
|
|
115
118
|
if (isTargetLocation) {
|
|
@@ -145,7 +148,6 @@ async function processExplore(page, username, options, log) {
|
|
|
145
148
|
}
|
|
146
149
|
|
|
147
150
|
result.processed = true;
|
|
148
|
-
|
|
149
151
|
} catch (e) {
|
|
150
152
|
result.error = e.message;
|
|
151
153
|
log(` [错误] ${e.message}`);
|
|
@@ -154,4 +156,4 @@ async function processExplore(page, username, options, log) {
|
|
|
154
156
|
return result;
|
|
155
157
|
}
|
|
156
158
|
|
|
157
|
-
|
|
159
|
+
export { processExplore };
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
/**
 * Look for the captcha container (`.captcha-verify-container`) in the page.
 *
 * @param {object} page - Page handle exposing `evaluate(fn)`; the callback runs
 *   where `document` is available.
 * @returns {Promise<null | {exists: true, visible: boolean,
 *   rect: {x: number, y: number, w: number, h: number}}>} `null` when the
 *   container is absent, otherwise its rounded bounding box and a visibility flag.
 */
export async function detectCaptcha(page) {
  return page.evaluate(() => {
    const container = document.querySelector('.captcha-verify-container');
    if (!container) return null;

    const r = container.getBoundingClientRect();

    // `offsetParent` is null for `position: fixed` elements even when they are
    // painted, so an overlay-style captcha would be reported as hidden. Fall
    // back to "has a non-empty box and is not hidden via CSS".
    const style = getComputedStyle(container);
    const rendered =
      r.width > 0 &&
      r.height > 0 &&
      style.display !== 'none' &&
      style.visibility !== 'hidden';

    return {
      exists: true,
      visible: container.offsetParent !== null || rendered,
      rect: {
        x: Math.round(r.x),
        y: Math.round(r.y),
        w: Math.round(r.width),
        h: Math.round(r.height),
      },
    };
  });
}
|
|
19
|
+
|
|
20
|
+
/**
 * Press the captcha dialog's close button (`#captcha_close_button`) by
 * dispatching a synthetic mousedown / mouseup / click sequence on it.
 *
 * @param {object} page - Page handle exposing `evaluate(fn)`.
 * @returns {Promise<{success: boolean, reason?: string}>} `success: false` with
 *   a reason when the button is not in the document.
 */
export async function closeCaptcha(page) {
  // Kept self-contained: the callback is handed to page.evaluate.
  const pressCloseButton = () => {
    const button = document.getElementById('captcha_close_button');
    if (!button) {
      return { success: false, reason: 'close button not found' };
    }

    for (const type of ['mousedown', 'mouseup', 'click']) {
      button.dispatchEvent(new MouseEvent(type, { bubbles: true }));
    }

    return { success: true };
  };

  return page.evaluate(pressCloseButton);
}
|
|
32
|
+
|
|
33
|
+
/**
 * Detect a captcha and, if one is present, try to dismiss it.
 *
 * Sequence: detect, pause `waitMs`, press the close button, pause one second,
 * detect again.
 *
 * @param {object} page - Page handle passed through to detectCaptcha / closeCaptcha.
 * @param {{waitMs?: number}} [options] - `waitMs` is the pause before closing (default 2000).
 * @returns {Promise<{detected: boolean, closed: boolean, reason?: string}>}
 *   `reason` is only present when the close button could not be pressed.
 */
export async function handleCaptcha(page, options = {}) {
  const { waitMs = 2000 } = options;
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  const found = await detectCaptcha(page);
  if (!found) {
    return { detected: false, closed: false };
  }

  await pause(waitMs);

  const { success, reason } = await closeCaptcha(page);
  if (!success) {
    return { detected: true, closed: false, reason };
  }

  await pause(1000);

  // The captcha counts as closed only if the container is gone afterwards.
  const leftover = await detectCaptcha(page);
  return { detected: true, closed: !leftover };
}
|
|
49
|
+
|
|
50
|
+
/**
 * Open `url` in a fresh browser context and return the page with its context.
 *
 * The caller owns the returned context and is responsible for closing it. If
 * creating the page or navigating fails, the context is closed here before the
 * error is rethrown so it does not leak.
 *
 * @param {object} browser - Browser handle exposing `newContext()`.
 * @param {string} url - Address to navigate to.
 * @param {{waitMs?: number}} [options] - `waitMs` is the settle time after
 *   navigation (default 3000).
 * @returns {Promise<{page: object, context: object}>}
 * @throws Whatever `newContext`, `newPage` or `goto` rejects with.
 */
export async function getIncognitoPage(browser, url, options = {}) {
  const { waitMs = 3000 } = options;
  const context = await browser.newContext();

  try {
    const page = await context.newPage();
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });
    await new Promise((resolve) => setTimeout(resolve, waitMs));
    return { page, context };
  } catch (err) {
    // Best-effort cleanup; the navigation error is the one worth reporting.
    try {
      await context.close();
    } catch {
      // ignore secondary failure while closing
    }
    throw err;
  }
}
|
|
58
|
+
|
|
59
|
+
/**
 * If a captcha is showing, poll until it disappears or `waitMs` elapses.
 *
 * On timeout the close button is pressed once and the function returns with
 * `resolved: false`.
 *
 * @param {object} page - Page handle passed to detectCaptcha / closeCaptcha.
 * @param {{waitMs?: number, pollInterval?: number, log?: Function}} [options]
 *   `waitMs` total budget (default 180000), `pollInterval` pause between checks
 *   (default 5000), `log` optional message sink.
 * @returns {Promise<{detected: boolean, resolved: boolean, waited: number}>}
 *   `waited` is in whole seconds; on timeout it is the nominal `waitMs`.
 */
export async function waitAndGetCaptcha(page, options = {}) {
  const { waitMs = 180000, pollInterval = 5000, log } = options;
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  const report = (message) => {
    if (log) log(message);
  };

  if (!(await detectCaptcha(page))) {
    return { detected: false, resolved: false, waited: 0 };
  }

  report(' 检测到验证码,等待用户手动输入...');

  const startTime = Date.now();
  const deadline = startTime + waitMs;

  while (Date.now() < deadline) {
    await sleep(pollInterval);

    const stillShowing = await detectCaptcha(page);
    if (stillShowing) continue;

    const elapsedSeconds = Math.round((Date.now() - startTime) / 1000);
    report(` 验证码已解决(等待 ${elapsedSeconds}s)`);
    return { detected: true, resolved: true, waited: elapsedSeconds };
  }

  const waited = Math.round(waitMs / 1000);
  report(` 验证码等待超时(${waited}s),继续执行`);

  // After the timeout, try to dismiss the captcha dialog.
  await closeCaptcha(page);
  await sleep(1000);

  return { detected: true, resolved: false, waited };
}
|
|
89
|
+
|
|
90
|
+
/**
 * Click the visible button whose text is exactly "评论", wait, then dismiss a
 * captcha if one appeared.
 *
 * `clicked` reflects whether a matching button was actually found and clicked;
 * previously it was reported as `true` unconditionally.
 *
 * @param {object} page - Page handle exposing `evaluate(fn)`.
 * @param {{waitMs?: number}} [options] - `waitMs` is the pause after the click
 *   attempt (default 3000).
 * @returns {Promise<{clicked: boolean, captchaDetected: boolean, captchaClosed: boolean}>}
 */
export async function safeClickComment(page, options = {}) {
  const { waitMs = 3000 } = options;

  // Click the comment button; report back whether one was found.
  const clicked = await page.evaluate(() => {
    const all = document.querySelectorAll('button');
    for (const el of all) {
      if (/^评论$/.test(el.textContent?.trim()) && el.offsetParent !== null && el.getBoundingClientRect().width > 0) {
        el.click();
        return true;
      }
    }
    return false;
  });

  await new Promise((resolve) => setTimeout(resolve, waitMs));

  // Detect and close a captcha if the click triggered one.
  const captcha = await detectCaptcha(page);
  if (captcha) {
    const result = await handleCaptcha(page);
    return { clicked, captchaDetected: true, captchaClosed: result.closed };
  }

  return { clicked, captchaDetected: false, captchaClosed: false };
}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import { delay, getDelayConfig, closeCommentPanel } from "./page-helpers.mjs";
|
|
2
|
+
import { scrollAndCollect } from "./scroll-collector.mjs";
|
|
3
|
+
import { waitAndGetCaptcha } from "./captcha-handler.mjs";
|
|
4
|
+
|
|
5
|
+
/**
 * Click the "评论" tab, give the user a chance to solve a captcha, then wait
 * (best effort) for the comment list to exist and contain at least one child.
 *
 * Both waits swallow their timeout, so this resolves even when the list never
 * appears.
 *
 * @param {object} page - Page handle exposing `locator`, `waitForSelector` and
 *   `waitForFunction`.
 * @returns {Promise<void>}
 */
async function openCommentPanel(page) {
  const ignore = () => {};
  const listSelector = '[class*="CommentListContainer"]';

  await page
    .locator('[class*="tabbar-item"]')
    .filter({ hasText: "评论" })
    .first()
    .click();

  // Short pause so the page can render.
  await new Promise((resolve) => setTimeout(resolve, 2000));

  // Captcha check: waits up to three minutes for it to go away.
  const captchaOptions = { waitMs: 180000, pollInterval: 5000, log: console.error };
  await waitAndGetCaptcha(page, captchaOptions);

  await page.waitForSelector(listSelector, { timeout: 5000 }).catch(ignore);

  const listHasChildren = () => {
    const list = document.querySelector('[class*="CommentListContainer"]');
    return list && list.children.length > 0;
  };
  await page.waitForFunction(listHasChildren, { timeout: 10000 }).catch(ignore);
}
|
|
33
|
+
|
|
34
|
+
/**
 * Open the comment panel and collect the handles of comment authors.
 *
 * @param {object} page - Page handle passed to the panel and scroll helpers.
 * @param {number} [maxComments=10] - Upper bound on returned handles.
 * @returns {Promise<string[]>} Handles in the form "@name", at most `maxComments`.
 */
async function extractCommentAuthors(page, maxComments = 10) {
  await openCommentPanel(page);

  const config = getDelayConfig();
  const allAuthors = await scrollAndCollect(page, {
    container: '[class*="CommentMain"]',
    findScrollable: true,
    // Written to be self-contained (reads only `document`).
    // NOTE(review): presumably scrollAndCollect evaluates this in the page — confirm.
    collectFn: (container) => {
      const list = document.querySelector('[class*="CommentListContainer"]');
      if (!list) return { items: [] };
      const authors = [];
      Array.from(list.children).forEach((wrapper) => {
        const link = wrapper.querySelector(
          '[class*="UsernameContentWrapper"] a',
        );
        if (link) {
          const href = link.href || link.getAttribute("href");
          // Stop at "?" and "#" as well as "/", so a profile link such as
          // "/@name?lang=en" yields "@name" rather than "@name?lang=en".
          const m = href && href.match(/@([^/?#]+)/);
          if (m) authors.push("@" + m[1]);
        }
      });
      return { items: authors };
    },
    uniqueKey: (a) => a,
    maxItems: maxComments,
    delayRange: [Math.round(config.commentMax * 0.3), config.commentMax],
    staleThreshold: 2,
  });

  await closeCommentPanel(page);
  await delay(Math.round(config.commentMax * 0.3), config.commentMax);

  return allAuthors.slice(0, maxComments);
}
|
|
68
|
+
|
|
69
|
+
export { extractCommentAuthors };
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import { delay, getDelayConfig } from "./page-helpers.mjs";
|
|
2
|
+
import { scrollAndCollect } from "./scroll-collector.mjs";
|
|
3
|
+
|
|
4
|
+
// Link texts that createUserCollectFn skips when gathering users.
// NOTE(review): nothing in this module as shown reads this module-level binding;
// createUserCollectFn declares its own identical list inside the callback it
// returns (presumably because that callback is evaluated in the page and cannot
// close over module scope — confirm against scroll-collector). Keep the two
// lists in sync, or drop this copy if it is unused.
const FILTER_WORDS = ["主页", "已关注", "粉丝", "推荐"];
|
|
5
|
+
|
|
6
|
+
/**
 * Wait until the user-list container holds at least `minChildren` children.
 * A timeout (or any other rejection from the wait) is swallowed.
 *
 * @param {object} page - Page handle exposing `waitForFunction`.
 * @param {number} [minChildren=1] - Minimum child count to wait for.
 * @param {number} [timeout=15000] - Wait budget in milliseconds.
 * @returns {Promise<void>}
 */
async function waitForListContent(page, minChildren = 1, timeout = 15000) {
  const hasEnoughRows = (min) => {
    const container = document.querySelector("[class*=DivUserListContainer]");
    return container && container.children.length >= min;
  };
  const ignoreFailure = () => {};

  await page
    .waitForFunction(hasEnoughRows, minChildren, { timeout })
    .catch(ignoreFailure);
}
|
|
20
|
+
|
|
21
|
+
/**
 * Open the following/followers dialog by clicking the parent of the
 * `[data-e2e=following]` element, then wait for the user list to show up.
 *
 * @param {object} page - Page handle exposing `$` and `waitForSelector`.
 * @returns {Promise<void>}
 * @throws {Error} When the trigger element is missing, or the list container
 *   does not appear within five seconds.
 */
async function openFollowModal(page) {
  const trigger = await page.$("[data-e2e=following]");
  if (!trigger) {
    throw new Error(
      "未找到 [data-e2e=following] 元素,请确认当前页面为用户主页",
    );
  }

  // The click target is the element's parent, not the element itself.
  await trigger.evaluate((node) => node.parentElement.click());

  try {
    await page.waitForSelector("[class*=DivUserListContainer]", { timeout: 5000 });
  } catch {
    throw new Error("关注弹窗未出现 DivUserListContainer");
  }

  await waitForListContent(page, 1, 3000);
}
|
|
36
|
+
|
|
37
|
+
/**
 * Click the first tab whose text contains "粉丝", then wait briefly for the
 * list to have content.
 *
 * @param {object} page - Page handle exposing `evaluate`.
 * @returns {Promise<void>}
 * @throws When no such tab exists (the error is raised inside `page.evaluate`).
 */
async function switchToFollowersTab(page) {
  const clickFollowersTab = () => {
    const followersTab = Array.from(
      document.querySelectorAll("[class*=DivTabItem]"),
    ).find((tab) => tab.textContent?.includes("粉丝"));

    if (!followersTab) {
      throw new Error("未找到粉丝 Tab");
    }
    followersTab.click();
  };

  await page.evaluate(clickFollowersTab);
  await waitForListContent(page, 1, 3000);
}
|
|
50
|
+
|
|
51
|
+
/**
 * Click the dialog's close button if it exists, then pause for 500 ms.
 *
 * @param {object} page - Page handle exposing `evaluate` and `waitForTimeout`.
 * @returns {Promise<void>}
 */
async function closeFollowModal(page) {
  const clickCloseIfPresent = () => {
    const button = document.querySelector("[data-e2e=follow-popup-close]");
    if (button) {
      button.click();
    }
  };

  await page.evaluate(clickCloseIfPresent);
  await page.waitForTimeout(500);
}
|
|
58
|
+
|
|
59
|
+
/**
 * Build the collector callback used to read users out of the follow dialog.
 *
 * The returned function scans profile links (`a[href*="/@"]`) under the
 * `[class*=eyhy6180]` element when present, otherwise under the whole document.
 * A link is kept when its trimmed text is longer than two characters, is not
 * one of the UI labels, and its handle has not been seen yet in this pass.
 *
 * @returns {(container: unknown) => {items: Array<{handle: string, displayName: string}>}}
 *   The `container` argument is accepted but not used.
 */
function createUserCollectFn() {
  // Everything the callback needs is declared inside it; it reads only `document`.
  return (container) => {
    const ignoredLabels = ["主页", "已关注", "粉丝", "推荐"];
    const scope = document.querySelector("[class*=eyhy6180]") || document;
    const byHandle = new Map();

    scope.querySelectorAll('a[href*="/@"]').forEach((anchor) => {
      const found = anchor.href.match(/@([^/?]+)/);
      if (!found) return;

      const handle = "@" + decodeURIComponent(found[1]);
      const displayName = (anchor.textContent || "").trim();

      const tooShort = displayName.length <= 2;
      if (tooShort || ignoredLabels.includes(displayName) || byHandle.has(handle)) {
        return;
      }
      byHandle.set(handle, { handle, displayName });
    });

    return { items: [...byHandle.values()] };
  };
}
|
|
81
|
+
|
|
82
|
+
/**
 * Scroll the user list of the open dialog and collect up to `maxUsers` users.
 *
 * The scroll delay range is derived from the `commentMax` delay setting, with
 * floors of 300 ms (lower bound) and 800 ms (upper bound).
 *
 * @param {object} page - Page handle passed to scrollAndCollect.
 * @param {number} maxUsers - Upper bound on returned users.
 * @returns {Promise<Array<{handle: string, displayName: string}>>}
 */
async function extractUsersFromModal(page, maxUsers) {
  const { commentMax } = getDelayConfig();
  const delayRange = [
    Math.max(300, Math.round(commentMax * 0.3)),
    Math.max(800, commentMax),
  ];

  const collected = await scrollAndCollect(page, {
    container: "[class*=DivUserListContainer]",
    findScrollable: false,
    collectFn: createUserCollectFn(),
    uniqueKey: (user) => user.handle,
    maxItems: maxUsers,
    delayRange,
    staleThreshold: 2,
  });

  return collected.slice(0, maxUsers);
}
|
|
99
|
+
|
|
100
|
+
/**
 * Read the "following" and "followers" lists from a profile page.
 *
 * Opens the follow dialog, collects the following list, switches to the
 * followers tab, collects that list, then closes the dialog. If collecting or
 * switching tabs fails, the dialog is closed (best effort) before the error is
 * rethrown, so a failure does not leave it open over the page.
 *
 * @param {object} page - Page handle for the profile page.
 * @param {{maxFollowing?: number, maxFollowers?: number, log?: Function}} [options]
 *   Per-list caps (default 999 each) and an optional message sink.
 * @returns {Promise<{following: Array<[string, string]>, followers: Array<[string, string]>}>}
 *   Each entry is a `[handle, displayName]` pair.
 * @throws Whatever opening the dialog, collecting, or switching tabs throws.
 */
async function extractFollowAndFollowers(page, options = {}) {
  const { maxFollowing = 999, maxFollowers = 999, log = () => {} } = options;

  await openFollowModal(page);

  let following;
  let followers;
  try {
    following = await extractUsersFromModal(page, maxFollowing);
    log(` 已关注: ${following.length}`);

    await switchToFollowersTab(page);

    followers = await extractUsersFromModal(page, maxFollowers);
    log(` 粉丝: ${followers.length}`);
  } catch (err) {
    // Do not let a cleanup failure hide the original error.
    await closeFollowModal(page).catch(() => {});
    throw err;
  }

  await closeFollowModal(page);

  return {
    following: following.map((u) => [u.handle, u.displayName]),
    followers: followers.map((u) => [u.handle, u.displayName]),
  };
}
|
|
120
|
+
|
|
121
|
+
export { extractFollowAndFollowers };
|
|
@@ -1,53 +1,51 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
/**
 * Click the "猜你喜欢" tab, pause for a delay derived from the `commentMax`
 * setting, then wait (best effort, 5 s) for a `[class*="Related"]` element.
 *
 * @param {object} page - Page handle exposing `locator` and `waitForSelector`.
 * @returns {Promise<void>}
 */
async function openGuessTab(page) {
  const guessTab = page
    .locator('[class*="tabbar-item"]')
    .filter({ hasText: /猜你喜欢/i })
    .first();
  await guessTab.click();

  const config = getDelayConfig();
  await delay(Math.round(config.commentMax * 0.5), config.commentMax);

  const ignoreTimeout = () => {};
  await page.waitForSelector('[class*="Related"]', { timeout: 5000 }).catch(ignoreTimeout);
}
|
|
12
|
-
|
|
13
|
-
/**
 * Open the "猜你喜欢" tab and collect the videos listed there.
 *
 * @param {object} page - Page handle passed to the tab and scroll helpers.
 * @param {number} [maxVideos=10] - Upper bound on returned videos.
 * @returns {Promise<Array<{author: string, videoId: string, url: string, title: string}>>}
 *   `title` is always the empty string.
 */
async function extractGuessVideos(page, maxVideos = 10) {
  await openGuessTab(page);

  const config = getDelayConfig();
  const shortDelay = Math.round(config.commentMax * 0.3);

  // Reads one card; yields nothing when the card has no parsable video link.
  const allVideos = await scrollAndCollect(page, {
    container: '[class*="Related"]',
    findScrollable: true,
    collectFn: (container) => {
      const items = [];
      for (const card of Array.from(container.querySelectorAll('[class*="DivItemContainer"]'))) {
        const anchor = card.querySelector('a[href*="/video/"]');
        if (!anchor) continue;
        const href = anchor.href || anchor.getAttribute('href');
        const parts = href && href.match(/@([^/]+)\/video\/(\d+)/);
        if (!parts) continue;
        items.push({ author: '@' + parts[1], videoId: parts[2], url: href, title: '' });
      }
      return { items };
    },
    uniqueKey: (v) => v.videoId,
    maxItems: maxVideos,
    delayRange: [shortDelay, config.commentMax],
    staleThreshold: 3,
  });

  await closeCommentPanel(page);
  await delay(Math.round(config.commentMax * 0.3), config.commentMax);

  return allVideos.slice(0, maxVideos);
}
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
extractGuessVideos,
|
|
53
|
-
};
|
|
1
|
+
import { delay, getDelayConfig, closeCommentPanel } from './page-helpers.mjs';
|
|
2
|
+
import { scrollAndCollect } from './scroll-collector.mjs';
|
|
3
|
+
|
|
4
|
+
/**
 * Switch the side panel to the "猜你喜欢" tab.
 *
 * After clicking, waits a delay between half of and the full `commentMax`
 * setting, then waits up to five seconds for a `[class*="Related"]` element;
 * a timeout on that last wait is ignored.
 *
 * @param {object} page - Page handle exposing `locator` and `waitForSelector`.
 * @returns {Promise<void>}
 */
async function openGuessTab(page) {
  const allTabs = page.locator('[class*="tabbar-item"]');
  await allTabs.filter({ hasText: /猜你喜欢/i }).first().click();

  const { commentMax } = getDelayConfig();
  await delay(Math.round(commentMax * 0.5), commentMax);

  await page
    .waitForSelector('[class*="Related"]', { timeout: 5000 })
    .catch(() => {});
}
|
|
12
|
+
|
|
13
|
+
/**
 * Collect the videos shown under the "猜你喜欢" tab.
 *
 * Opens the tab, scrolls the `[class*="Related"]` container while gathering
 * video links, closes the panel, pauses, and returns at most `maxVideos`.
 *
 * @param {object} page - Page handle passed to the tab and scroll helpers.
 * @param {number} [maxVideos=10] - Upper bound on returned videos.
 * @returns {Promise<Array<{author: string, videoId: string, url: string, title: string}>>}
 *   `author` is "@name" taken from the link; `title` is always ''.
 */
async function extractGuessVideos(page, maxVideos = 10) {
  await openGuessTab(page);

  const config = getDelayConfig();

  // Self-contained: uses only its `container` argument.
  const collectFromContainer = (container) => {
    const cards = Array.from(container.querySelectorAll('[class*="DivItemContainer"]'));
    const items = [];
    for (const card of cards) {
      const anchor = card.querySelector('a[href*="/video/"]');
      if (!anchor) continue;

      const href = anchor.href || anchor.getAttribute('href');
      const parts = href && href.match(/@([^/]+)\/video\/(\d+)/);
      if (!parts) continue;

      items.push({
        author: '@' + parts[1],
        videoId: parts[2],
        url: href,
        title: '',
      });
    }
    return { items };
  };

  const allVideos = await scrollAndCollect(page, {
    container: '[class*="Related"]',
    findScrollable: true,
    collectFn: collectFromContainer,
    uniqueKey: (video) => video.videoId,
    maxItems: maxVideos,
    delayRange: [Math.round(config.commentMax * 0.3), config.commentMax],
    staleThreshold: 3,
  });

  await closeCommentPanel(page);
  await delay(Math.round(config.commentMax * 0.3), config.commentMax);

  return allVideos.slice(0, maxVideos);
}
|
|
50
|
+
|
|
51
|
+
export { extractGuessVideos };
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
// Phrase table used by detectPageError: each key is the error type that gets
// returned, each value the phrases that indicate it. Matching is a
// case-insensitive substring test against the page text, and the first key (in
// declaration order) with a matching phrase wins.
const PATTERNS = {
  // Login walls and private-account notices.
  login_required: [
    "登录 TikTok", "登录后查看", "查看需登录",
    "Log in to TikTok", "Login to TikTok",
    "观众管理功能", "Viewer management",
    "私密账号", "私密状态",
  ],
  // Captcha / human-verification prompts.
  captcha: [
    "captcha", "verify", "验证码", "点击下一步",
    "Press and hold", "slide to verify",
    "滑动验证", "人机验证", "安全验证",
  ],
  // Throttling messages.
  rate_limited: [
    "访问过于频繁", "操作过于频繁",
    "too many requests", "rate limit",
    "稍后再试", "try again later", "请稍后再来",
  ],
  // Content unavailable notices.
  region_blocked: [
    "地区限制", "not available in your",
    "此内容不可用", "content not available", "currently unavailable",
    "抱歉,此内容", "此页面不可用",
  ],
  // Missing page / empty content notices.
  not_found: [
    "页面不存在", "page not found", "找不到",
    "Couldn't find this", "nothing here",
    "此页面不存在", "没有内容", "发起对话", "0 条评论",
  ],
};
|
|
54
|
+
|
|
55
|
+
/**
 * Classify the current page by scanning its visible text for known phrases.
 *
 * The lower-cased `document.body.innerText` is checked against every phrase in
 * PATTERNS (also lower-cased); the key of the first group containing a match
 * is returned.
 *
 * @param {object} page - Page handle exposing `evaluate(fn, arg)`.
 * @returns {Promise<string | null>} The matching PATTERNS key, or `null` when
 *   no phrase is found.
 */
export async function detectPageError(page) {
  // Receives the phrase table as an argument rather than closing over it.
  const classify = (patterns) => {
    const haystack = document.body.innerText.toLowerCase();

    const hit = Object.entries(patterns).find(([, phrases]) =>
      phrases.some((phrase) => haystack.includes(phrase.toLowerCase())),
    );

    return hit ? hit[0] : null;
  };

  return page.evaluate(classify, PATTERNS);
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import {
|
|
2
|
+
delay,
|
|
3
|
+
getDelayConfig,
|
|
4
|
+
setDelayConfig,
|
|
5
|
+
listDelayPresets,
|
|
6
|
+
DELAY_PRESETS,
|
|
7
|
+
} from '../../lib/delay.js';
|
|
8
|
+
import { ensureBrowserReady } from '../../lib/browser/cdp.js';
|
|
9
|
+
import {
|
|
10
|
+
ensureTikTokPage,
|
|
11
|
+
closeCommentPanel,
|
|
12
|
+
findTikTokPage,
|
|
13
|
+
getOrCreatePage,
|
|
14
|
+
isLoggedIn,
|
|
15
|
+
} from '../../lib/browser/page.js';
|
|
16
|
+
import { retryWithBackoff, isRetryableError } from '../../lib/retry.js';
|
|
17
|
+
import {
|
|
18
|
+
extractUserSection,
|
|
19
|
+
parseUserSection,
|
|
20
|
+
extractLocationCreated,
|
|
21
|
+
USER_SECTION_SIZE,
|
|
22
|
+
} from '../../lib/parser.js';
|
|
23
|
+
import { detectPageError } from './page-error-detector.mjs';
|
|
24
|
+
|
|
25
|
+
export {
|
|
26
|
+
delay,
|
|
27
|
+
setDelayConfig,
|
|
28
|
+
getDelayConfig,
|
|
29
|
+
listDelayPresets,
|
|
30
|
+
DELAY_PRESETS,
|
|
31
|
+
ensureBrowserReady,
|
|
32
|
+
ensureTikTokPage,
|
|
33
|
+
closeCommentPanel,
|
|
34
|
+
findTikTokPage,
|
|
35
|
+
getOrCreatePage,
|
|
36
|
+
isLoggedIn,
|
|
37
|
+
retryWithBackoff,
|
|
38
|
+
isRetryableError,
|
|
39
|
+
extractUserSection,
|
|
40
|
+
parseUserSection,
|
|
41
|
+
extractLocationCreated,
|
|
42
|
+
USER_SECTION_SIZE,
|
|
43
|
+
detectPageError,
|
|
44
|
+
};
|
|
45
|
+
|
|
46
|
+
// Port number exported alongside the re-exported helpers above.
// NOTE(review): 9222 is the conventional Chrome remote-debugging port, so this
// is presumably the CDP endpoint used by ../../lib/browser/cdp.js — confirm the
// two agree, since nothing in this module reads the value.
export const CDP_PORT = 9222;
|