tt-help-cli-ycl 1.0.5 → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "tt-help-cli-ycl",
3
- "version": "1.0.5",
3
+ "version": "1.0.7",
4
4
  "description": "TikTok user & video data scraper - extract ttSeller, verified, locationCreated from HTML source",
5
5
  "type": "module",
6
6
  "bin": {
@@ -0,0 +1,288 @@
1
+ const {
2
+ delay,
3
+ ensureBrowserReady,
4
+ ensureTikTokPage,
5
+ setDelayConfig,
6
+ getDelayConfig,
7
+ closeCommentPanel,
8
+ retryWithBackoff,
9
+ } = require('./scraper/modules/page-helpers.cjs');
10
+ const {
11
+ getUserInfo,
12
+ collectVideos,
13
+ isPageRestricted,
14
+ } = require('./get-user-videos-core.cjs');
15
+ const { runScrape } = require('./scraper/core.cjs');
16
+
17
/**
 * Merge freshly scraped fields into an existing user record and return the
 * result as a new object. Neither argument is modified.
 *
 * Merge rules, applied per key of `incoming`:
 * - `_sources` on `incoming` is ignored (provenance is tracked via `source`).
 * - `undefined`, `null` and `''` never overwrite anything.
 * - When both sides hold a number, the larger one is kept.
 * - Otherwise the incoming value only fills a slot that is currently empty
 *   (`undefined`, `null` or `''`).
 *
 * @param {object} existing - Record accumulated so far (may be `{}`).
 * @param {object} [incoming] - Newly observed fields; a missing value is
 *   treated as an empty object.
 * @param {string} [source] - Provenance tag appended to `_sources` if it is
 *   truthy and not already present.
 * @returns {object} The merged record.
 */
function mergeUserInfo(existing, incoming, source) {
  const merged = { ...existing };

  // The spread above is shallow: without this copy, pushing a new source
  // below would also modify the `_sources` array owned by `existing`.
  if (Array.isArray(merged._sources)) {
    merged._sources = [...merged._sources];
  }

  const isEmpty = (value) => value === undefined || value === null || value === '';

  for (const [key, value] of Object.entries(incoming || {})) {
    if (key === '_sources') continue;
    if (isEmpty(value)) continue;

    if (typeof value === 'number' && typeof merged[key] === 'number') {
      merged[key] = Math.max(merged[key], value);
    } else if (isEmpty(merged[key])) {
      merged[key] = value;
    }
  }

  if (source) {
    if (!merged._sources) merged._sources = [];
    if (!merged._sources.includes(source)) merged._sources.push(source);
  }

  return merged;
}
34
+
35
/**
 * Turn a video href (absolute or site-relative) into an absolute TikTok URL.
 */
function absoluteVideoUrl(href) {
  return href.startsWith('http') ? href : `https://www.tiktok.com${href}`;
}

/**
 * Convert the internal uniqueId -> record map into the public user list:
 * `_sources` is renamed to `sources`, the seed user is placed first, and
 * users that carry profile info (nickname or follower count) come before
 * bare ids.
 */
function toSortedUserList(users) {
  const list = [...users.values()].map((user) => {
    const { _sources, ...rest } = user;
    return { ...rest, sources: _sources || [] };
  });

  // The comparator must read `sources`: `_sources` no longer exists on the
  // mapped objects, so checking it would never detect the seed user.
  const isSeed = (user) => user.sources.includes('seed');
  const hasInfo = (user) => Boolean(user.nickname || user.followerCount);

  list.sort((a, b) => {
    if (isSeed(a) !== isSeed(b)) return isSeed(a) ? -1 : 1;
    if (hasInfo(a) !== hasInfo(b)) return hasInfo(a) ? -1 : 1;
    return 0;
  });

  return list;
}

/**
 * "Auto" mode: start from one seed profile, collect up to `collectMax` of its
 * videos, run `runScrape` from each of them, and aggregate every user that
 * was seen (seed, video authors, commenters).
 *
 * @param {object} options
 * @param {string} options.username - Seed handle; the first '@' is removed.
 * @param {number} [options.collectMax=1] - Max seed videos to start from.
 * @param {number} [options.scrapeDepth=50] - Passed to `runScrape` as `maxVideos`.
 * @param {number} [options.maxComments=200] - Passed to `runScrape`.
 * @param {*} [options.preset=null] - Delay preset forwarded to `setDelayConfig`.
 * @param {?number} [options.switchMax=null] - Custom delay bound (used when no preset).
 * @param {?number} [options.commentMax=null] - Custom delay bound (used when no preset).
 * @param {Function} [options.log=console.error] - Progress logger.
 * @returns {Promise<{output: (object|Array<object>), browser: object}>}
 *   NOTE: `output` is an object `{ seedUser, users, stats }` when the seed has
 *   no videos, and a plain array of user records otherwise. That shape
 *   difference is pre-existing and kept for backward compatibility.
 * @throws Re-throws any navigation or scrape error after closing the browser,
 *   because on failure the caller never receives the browser handle.
 */
async function runAuto(options) {
  const {
    username,
    collectMax = 1,
    scrapeDepth = 50,
    maxComments = 200,
    preset = null,
    switchMax = null,
    commentMax = null,
    log = console.error,
  } = options;

  if (preset) {
    setDelayConfig(preset);
  } else if (switchMax || commentMax) {
    setDelayConfig({
      switchMax: switchMax || 5000,
      commentMax: commentMax || 3000,
    });
  }

  const cleanUsername = username.replace('@', '');
  const profileUrl = `https://www.tiktok.com/@${cleanUsername}`;

  log(`auto 模式: @${cleanUsername}`);
  log(`收集视频数: ${collectMax}, 每个滑动: ${scrapeDepth}次, 每视频评论数: ${maxComments}`);

  const browser = await ensureBrowserReady();

  try {
    const page = await ensureTikTokPage(browser, profileUrl);

    // [1/3] Load the seed profile and read its user info.
    log(`\n[1/3] 获取 @${cleanUsername} 的用户信息和视频列表...`);
    await retryWithBackoff(() => page.goto(profileUrl, { waitUntil: 'load', timeout: 30000 }), { log });
    // Best effort: the list container may be absent (e.g. restricted page).
    await page.waitForSelector('[class*="DivVideoList"]', { timeout: 10000 }).catch(() => {});
    await delay(1000, 2000);

    const seedUserInfo = await getUserInfo(page);
    if (!seedUserInfo.uniqueId) {
      seedUserInfo.uniqueId = cleanUsername;
    }
    log(`种子用户: ${seedUserInfo.nickname || seedUserInfo.uniqueId} (粉丝: ${seedUserInfo.followerCount || '-'})`);

    // [2/3] Collect the seed user's video list.
    const videos = await collectVideos(page, cleanUsername, collectMax, log);
    const videoList = Array.from(videos.values()).slice(0, collectMax);
    log(`获取到 ${videoList.length} 个视频`);

    if (videoList.length === 0) {
      const restricted = await isPageRestricted(page);
      if (restricted) {
        log('种子用户页面受限(需登录),结束');
      } else {
        log('没有获取到视频,结束');
      }
      const output = {
        seedUser: { ...seedUserInfo, sources: ['seed'], restricted },
        users: [{ ...seedUserInfo, sources: ['seed'], restricted }],
        stats: {
          totalVideos: 0,
          totalUsers: 1,
          fromSeed: 1,
          fromVideo: 0,
          fromComment: 0,
        },
      };
      return { output, browser };
    }

    // [3/3] Run a scrape session starting from each collected video.
    log(`\n[3/3] 开始循环抓取(${videoList.length} 个视频,每个滑动 ${scrapeDepth} 次)...`);

    const users = new Map();
    users.set(seedUserInfo.uniqueId, mergeUserInfo({}, seedUserInfo, 'seed'));

    for (let i = 0; i < videoList.length; i++) {
      const videoUrl = absoluteVideoUrl(videoList[i].href);

      log(`\n[${i + 1}/${videoList.length}] ${videoUrl}`);

      const { output: scrapeOutput } = await runScrape({
        videoUrl,
        maxVideos: scrapeDepth,
        maxComments,
        preset,
        switchMax,
        commentMax,
        log,
        browser,
        page,
      });

      // A scrape may yield no output or omit either list; treat as empty.
      const videoDetails = (scrapeOutput && scrapeOutput.videoDetails) || [];
      const commentUsers = (scrapeOutput && scrapeOutput.commentUsers) || [];

      // Merge video author info.
      for (const vd of videoDetails) {
        // Entries without an id cannot be keyed or looked up later.
        if (!vd || !vd.uniqueId) continue;
        users.set(vd.uniqueId, mergeUserInfo(users.get(vd.uniqueId) || {}, vd, 'video'));
      }

      // Add commenters that have not been seen through another channel.
      for (const cu of commentUsers) {
        if (!cu || users.has(cu)) continue;
        users.set(cu, mergeUserInfo({}, { uniqueId: cu }, 'comment'));
      }
    }

    // Build the output list.
    const usersList = toSortedUserList(users);
    const output = usersList;

    log(`\n结果: ${usersList.length} 个用户`);

    return { output, browser };
  } catch (e) {
    await browser.close().catch(() => {});
    throw e;
  }
}
180
+
181
/**
 * Visit one user's profile, read their info, collect up to `collectMax`
 * videos and run `runScrape` from each of them, gathering the video authors
 * and commenters discovered along the way.
 *
 * The function never rejects: any error is logged and reported through
 * `error` on the returned object.
 *
 * @param {object} page - Browser page used for all navigation.
 * @param {string} username - Handle to visit (without '@').
 * @param {object} options - collectMax (1), scrapeDepth (50), maxComments (200),
 *   preset ('fast'), switchMax (null), commentMax (null), browser (null).
 * @param {Function} log - Progress logger.
 * @returns {Promise<object>} `{ userInfo, collectedVideos, discoveredVideoAuthors,
 *   discoveredCommentAuthors, error }`, plus `restricted` when no videos were found.
 */
async function processUser(page, username, options, log) {
  const {
    collectMax = 1,
    scrapeDepth = 50,
    maxComments = 200,
    preset = 'fast',
    switchMax = null,
    commentMax = null,
    browser = null,
  } = options;

  const outcome = {
    userInfo: null,
    collectedVideos: [],
    discoveredVideoAuthors: [],
    discoveredCommentAuthors: [],
    error: null,
  };

  // Site-relative hrefs are prefixed with the TikTok origin.
  const toFullUrl = (href) => (href.startsWith('http') ? href : `https://www.tiktok.com${href}`);

  try {
    log(`\n[processUser] 访问 @${username}...`);

    const openProfile = () => page.goto(`https://www.tiktok.com/@${username}`, {
      waitUntil: 'load',
      timeout: 30000,
    });
    await retryWithBackoff(openProfile, { log });
    // Best effort: a missing video list is handled further down.
    await page.waitForSelector('[class*="DivVideoList"]', { timeout: 10000 }).catch(() => {});
    await delay(1000, 2000);

    const profile = await getUserInfo(page);
    outcome.userInfo = profile;
    if (!profile.uniqueId) {
      profile.uniqueId = username;
    }
    log(` 昵称: ${profile.nickname || '-'} | 粉丝: ${profile.followerCount || 0}`);

    const found = await collectVideos(page, username, collectMax, log);
    const picked = Array.from(found.values()).slice(0, collectMax);
    outcome.collectedVideos = picked.map((item) => ({
      videoId: item.id,
      videoUrl: item.href,
    }));

    // No videos: record whether the page is login-restricted and stop.
    if (picked.length === 0) {
      const restricted = await isPageRestricted(page);
      outcome.restricted = restricted;
      log(restricted
        ? ` @${username} 页面受限(需登录),标记跳过`
        : ` @${username} 没有视频,跳过 scrape`);
      return outcome;
    }

    const authorsById = new Map();
    const commenters = new Set();

    for (let idx = 0; idx < picked.length; idx++) {
      const videoUrl = toFullUrl(picked[idx].href);
      log(` [${idx + 1}/${picked.length}] 开始 scrape: ${videoUrl} (深度 ${scrapeDepth})`);

      const scrapeResult = await runScrape({
        videoUrl,
        maxVideos: scrapeDepth,
        maxComments,
        preset,
        switchMax,
        commentMax,
        browser,
        page,
        log,
      });
      const scraped = scrapeResult.output;

      if (scraped && scraped.videoDetails) {
        for (const vd of scraped.videoDetails) {
          // First sighting wins; later duplicates are ignored.
          if (authorsById.has(vd.uniqueId)) continue;
          authorsById.set(vd.uniqueId, {
            uniqueId: vd.uniqueId,
            nickname: vd.nickname,
            locationCreated: vd.locationCreated,
          });
        }
      }

      if (scraped && scraped.commentUsers) {
        for (const commenter of scraped.commentUsers) {
          commenters.add(commenter);
        }
      }
    }

    outcome.discoveredVideoAuthors = Array.from(authorsById.values());
    outcome.discoveredCommentAuthors = Array.from(commenters);

    log(` 发现: ${outcome.discoveredVideoAuthors.length} 个视频作者, ${outcome.discoveredCommentAuthors.length} 个评论作者`);
  } catch (e) {
    outcome.error = e.message;
    log(` [错误] ${e.message}`);
  }

  return outcome;
}
287
+
288
+ module.exports = { runAuto, processUser };
@@ -0,0 +1,65 @@
1
+ const fs = require('fs');
2
+ const path = require('path');
3
+
4
/**
 * Create a small JSON-file-backed store of user records keyed by `uniqueId`.
 *
 * On creation the file (if given and present) is read; anything that is not a
 * JSON array yields an empty store, and array entries that are not plain
 * objects are dropped so later lookups cannot crash on them.
 *
 * @param {string} [filePath] - Backing file. When omitted the store is
 *   memory-only and `save()` is a no-op.
 * @returns {{save: Function, getUser: Function, addUser: Function,
 *   getPendingUsers: Function, getAllUsers: Function,
 *   getProcessedUsers: Function, data: Array<object>}}
 *   `data` is the live backing array (same reference `getAllUsers()` returns).
 */
function createStore(filePath) {
  // Resolve once so that loading and saving always target the same file,
  // even if the process changes its working directory in between.
  const resolvedPath = filePath ? path.resolve(filePath) : null;
  const isRecord = (entry) => entry !== null && typeof entry === 'object' && !Array.isArray(entry);

  let data = [];

  if (resolvedPath && fs.existsSync(resolvedPath)) {
    try {
      const parsed = JSON.parse(fs.readFileSync(resolvedPath, 'utf-8'));
      data = Array.isArray(parsed) ? parsed.filter(isRecord) : [];
    } catch (e) {
      // An unreadable or corrupt file is reported and treated as empty.
      console.error(`[data-store] 读取文件失败: ${e.message}`);
      data = [];
    }
  }

  /** Write the whole store to the backing file as pretty-printed JSON. */
  function save() {
    if (!resolvedPath) return;
    // Create missing parent directories so a fresh output path works.
    fs.mkdirSync(path.dirname(resolvedPath), { recursive: true });
    fs.writeFileSync(resolvedPath, JSON.stringify(data, null, 2), 'utf-8');
  }

  /** Return the record with the given `uniqueId`, or `undefined`. */
  function getUser(uid) {
    return data.find((u) => u.uniqueId === uid);
  }

  /**
   * Insert a user, or merge into the existing record with the same
   * `uniqueId`: non-empty incoming fields overwrite, `sources` are unioned.
   */
  function addUser(user) {
    const existing = getUser(user.uniqueId);
    if (!existing) {
      data.push(user);
      return;
    }

    for (const key of Object.keys(user)) {
      if (key === 'uniqueId' || key === 'sources') continue;
      const value = user[key];
      if (value !== undefined && value !== null && value !== '') {
        existing[key] = value;
      }
    }

    if (Array.isArray(user.sources)) {
      existing.sources = [...new Set([...(existing.sources || []), ...user.sources])];
    }
  }

  /** Users whose `followerCount` has not been filled in yet. */
  function getPendingUsers() {
    return data.filter((u) => u.followerCount === undefined);
  }

  /** The live array of all records. */
  function getAllUsers() {
    return data;
  }

  /** Users that already have a `followerCount`. */
  function getProcessedUsers() {
    return data.filter((u) => u.followerCount !== undefined);
  }

  return { save, getUser, addUser, getPendingUsers, getAllUsers, getProcessedUsers, data };
}
64
+
65
+ module.exports = { createStore };
@@ -0,0 +1,165 @@
1
+ const { delay, ensureBrowserReady, ensureTikTokPage, retryWithBackoff } = require('./scraper/modules/page-helpers.cjs');
2
+
3
/**
 * Extract profile fields for the user shown on the current page by matching
 * JSON fragments in the page's HTML source.
 *
 * The callback runs inside the page via `page.evaluate`, so it must be fully
 * self-contained (no references to outer variables).
 *
 * @param {object} page - Page exposing `evaluate(fn)`.
 * @returns {Promise<object>} Object with whichever of these were found:
 *   `uniqueId` (from the URL), `secUid`, `nickname`, `ttSeller`, `verified`,
 *   `followerCount`, `videoCount`, `followingCount`, `heartCount`,
 *   `signature`, `locationCreated`. The first match in the HTML wins.
 */
async function getUserInfo(page) {
  return await page.evaluate(() => {
    const html = document.documentElement.outerHTML;
    const result = {};

    // Stop at '?', '#' or '/' so a query string or fragment never ends up
    // in the handle (e.g. "/@name?lang=en").
    const handleMatch = window.location.href.match(/\/@([^\/?#]+)/);
    if (handleMatch) result.uniqueId = handleMatch[1];

    const patterns = {
      secUid: /"secUid":"([^"]+)"/,
      nickname: /"nickname":"((?:[^"\\]|\\.)*)"/,
      ttSeller: /"ttSeller":\s*(true|false)/,
      verified: /"verified":\s*(true|false)/,
      followerCount: /"followerCount":(\d+)/,
      videoCount: /"videoCount":(\d+)/,
      followingCount: /"followingCount":(\d+)/,
      heartCount: /"heartCount":(\d+)/,
      signature: /"signature":"((?:[^"\\]|\\.)*)"/,
      locationCreated: /"locationCreated":"([^"]*)/,
    };

    const boolKeys = ['ttSeller', 'verified'];
    const numKeys = ['followerCount', 'videoCount', 'followingCount', 'heartCount'];

    // The captured text is the body of a JSON string literal, so JSON.parse
    // decodes every escape (\", \\, \n, \uXXXX) correctly. If the fragment
    // is not valid JSON, fall back to the supplied legacy handling.
    const decodeJsonString = (raw, fallback) => {
      try {
        return JSON.parse(`"${raw}"`);
      } catch (err) {
        return fallback(raw);
      }
    };
    const legacySignature = (raw) => raw.replace(/\\n/g, '\n').replace(/\\\\/g, '\\');
    const keepRaw = (raw) => raw;

    for (const [key, pat] of Object.entries(patterns)) {
      const match = html.match(pat);
      if (!match) continue;

      if (boolKeys.includes(key)) {
        result[key] = match[1] === 'true';
      } else if (numKeys.includes(key)) {
        result[key] = parseInt(match[1], 10);
      } else if (key === 'signature') {
        result[key] = decodeJsonString(match[1], legacySignature);
      } else if (key === 'nickname') {
        result[key] = decodeJsonString(match[1], keepRaw);
      } else {
        result[key] = match[1];
      }
    }

    return result;
  });
}
40
+
41
/**
 * Scroll a profile page and gather the distinct video links that belong to
 * `username`.
 *
 * Each round scans all anchors whose href contains `/@<username>/video/`,
 * then scrolls by 700px and waits 2-3 seconds. The loop ends when at least
 * `maxVideos` videos are known, after 5 consecutive rounds without anything
 * new, or after 500 rounds.
 *
 * @param {object} page - Page exposing `evaluate`.
 * @param {string} username - Handle whose videos are collected.
 * @param {number} maxVideos - Stop threshold (the map may hold more entries).
 * @param {Function} log - Progress logger.
 * @returns {Promise<Map<string, {id: string, href: string}>>} Videos by id,
 *   in discovery order.
 */
async function collectVideos(page, username, maxVideos, log) {
  const seen = new Map();

  // Scan the DOM once and register unseen videos; resolves to how many
  // entries were added in this pass.
  const scanPage = async () => {
    const found = await page.evaluate((handle) => {
      const needle = '/@' + handle + '/video/';
      const hits = [];
      for (const anchor of Array.from(document.querySelectorAll('a'))) {
        const href = anchor.getAttribute('href') || '';
        if (!href.includes(needle)) continue;
        const idMatch = href.match(/\/video\/(\d+)/);
        if (idMatch) hits.push({ id: idMatch[1], href });
      }
      return hits;
    }, username);

    const sizeBefore = seen.size;
    for (const entry of found) {
      if (!seen.has(entry.id)) seen.set(entry.id, entry);
    }
    return seen.size - sizeBefore;
  };

  let lastLoggedTotal = 0;
  let idleRounds = 0;
  let round = 0;

  while (round < 500) {
    const added = await scanPage();

    if (added === 0) {
      idleRounds += 1;
    } else {
      idleRounds = 0;
      // Log on a jump of more than 10 videos, or on every 10th round.
      const bigJump = seen.size > lastLoggedTotal + 10;
      if (bigJump || round % 10 === 0) {
        log(`滚动 ${round}: ${seen.size} 个视频 (新增 ${added})`);
        lastLoggedTotal = seen.size;
      }
    }

    if (seen.size >= maxVideos) {
      log(`达到上限 ${maxVideos},实际收集 ${seen.size} 个(页面一次性加载)`);
      break;
    }

    if (idleRounds >= 5) {
      log(`连续 ${idleRounds} 次无新增,停止`);
      break;
    }

    // Scroll the list container when present, otherwise the window itself.
    await page.evaluate(() => {
      const listBox = document.querySelector('[class*="ColumnListContainer"]');
      if (!listBox) {
        window.scrollBy(0, 700);
        return;
      }
      listBox.scrollTop += 700;
    });
    await delay(2000, 3000);

    round += 1;
  }

  return seen;
}
105
+
106
/**
 * Open a user's profile, read their info and collect their video links.
 *
 * @param {object} options
 * @param {string} options.username - Handle; a leading '@' is stripped.
 * @param {number} [options.maxVideos=5] - Maximum number of videos returned.
 * @param {Function} [options.log=console.error] - Progress logger.
 * @returns {Promise<{output: {user: object, totalVideos: number,
 *   videos: Array<{id: string, url: string}>}, browser: object}>}
 *   The caller owns `browser` and is responsible for closing it.
 * @throws Re-throws any failure after closing the browser, since on failure
 *   the caller never receives the handle and could not close it itself.
 */
async function runGetUserVideos(options) {
  const {
    username,
    maxVideos = 5,
    log = console.error,
  } = options;

  // Accept "@name" as well as "name", matching what the auto mode tolerates.
  const handle = typeof username === 'string' ? username.replace(/^@/, '') : username;
  const url = `https://www.tiktok.com/@${handle}`;

  log(`用户: @${handle}`);
  log(`URL: ${url}`);
  log(`最大视频数: ${maxVideos}\n`);

  log('连接浏览器...');
  const browser = await ensureBrowserReady();

  try {
    const page = await ensureTikTokPage(browser, url);

    await retryWithBackoff(() => page.goto(url, { waitUntil: 'load', timeout: 30000 }), { log });
    await delay(3000, 5000);
    // Best effort: profiles without a visible list are handled by the
    // collector simply finding nothing.
    await page.waitForSelector('[class*="DivVideoList"]', { timeout: 10000 }).catch(() => {});

    log('获取用户信息...');
    const userInfo = await getUserInfo(page);
    log('用户信息: ' + JSON.stringify(userInfo, null, 2));

    log('\n开始滚动收集视频...');
    const videos = await collectVideos(page, handle, maxVideos, log);
    const allVideos = Array.from(videos.values());

    log(`\n总计: ${allVideos.length} 个视频`);

    const output = {
      user: userInfo,
      // The collector may overshoot the limit; report what is returned.
      totalVideos: Math.min(allVideos.length, maxVideos),
      videos: allVideos.slice(0, maxVideos).map((v) => ({
        id: v.id,
        url: v.href.startsWith('http') ? v.href : `https://www.tiktok.com${v.href}`,
      })),
    };

    return { output, browser };
  } catch (e) {
    await browser.close().catch(() => {});
    throw e;
  }
}
155
+
156
/**
 * Report whether the current page shows one of the known login / audience
 * restriction messages in its visible body text.
 *
 * @param {object} page - Page exposing `evaluate(fn)`.
 * @returns {Promise<boolean>} `true` when any marker text is present.
 */
async function isPageRestricted(page) {
  return await page.evaluate(() => {
    const markers = ['登录 TikTok', '观众管理功能', 'Login to TikTok'];
    const visibleText = document.body.innerText;
    return markers.some((marker) => visibleText.includes(marker));
  });
}
164
+
165
+ module.exports = { getUserInfo, collectVideos, runGetUserVideos, isPageRestricted };
@@ -0,0 +1,59 @@
1
+ const { runGetUserVideos } = require('./get-user-videos-core.cjs');
2
+ const fs = require('fs');
3
+ const path = require('path');
4
+
5
/**
 * CLI entry point: `node get-user-videos.cjs <username> [maxVideos] [-o path]`.
 *
 * Prints the collected result as JSON to stdout, or writes it to the file
 * given with `-o/--output` (parent directories are created). Progress and
 * errors go to stderr. Exits with status 1 on bad usage or failure.
 */
async function main() {
  const rawArgs = process.argv.slice(2);

  const printUsage = () => {
    console.error('用法: node get-user-videos.cjs <用户名> [最大视频数] [-o 输出路径]');
    console.error('示例: node get-user-videos.cjs bar.lar.lar.moeta 1000');
    console.error(' node get-user-videos.cjs username 50 -o videos.json');
    console.error('');
    console.error('选项: -o, --output <路径> 输出到文件; 不指定则输出到 stdout');
  };

  // Separate the -o/--output option from positional arguments.
  let outputPath = null;
  const args = [];
  for (let i = 0; i < rawArgs.length; i++) {
    if (rawArgs[i] === '-o' || rawArgs[i] === '--output') {
      const value = rawArgs[i + 1];
      if (value === undefined) {
        // The flag was given without a path: previously this silently fell
        // back to stdout, hiding the mistake.
        printUsage();
        process.exit(1);
      }
      outputPath = value;
      i += 1;
    } else {
      args.push(rawArgs[i]);
    }
  }

  if (args.length < 1) {
    printUsage();
    process.exit(1);
  }

  const username = args[0].replace('@', '');
  const maxVideos = Number.parseInt(args[1], 10) || 5;

  let browser;
  let failed = false;
  try {
    const { output, browser: b } = await runGetUserVideos({
      username,
      maxVideos,
      log: console.error,
    });
    browser = b;

    const json = JSON.stringify(output, null, 2);
    if (outputPath) {
      const resultFile = path.resolve(outputPath);
      fs.mkdirSync(path.dirname(resultFile), { recursive: true });
      fs.writeFileSync(resultFile, json, 'utf-8');
      console.error(`已保存到 ${resultFile}`);
    } else {
      process.stdout.write(json + '\n');
    }
  } catch (err) {
    console.error(err.message);
    // Do not exit here: process.exit() would skip the finally block below
    // and leave the browser open.
    failed = true;
  } finally {
    if (browser) await browser.close().catch(() => {});
  }

  if (failed) process.exit(1);
}
55
+
56
// Last-resort handler: report anything that escapes main() and exit non-zero.
main().catch(function reportFatal(err) {
  console.error(err.message);
  process.exit(1);
});