tt-help-cli-ycl 1.3.93 → 1.3.95
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/cli/comments.js +49 -24
- package/src/cli/tag.js +239 -91
- package/src/lib/args.js +23 -0
- package/src/lib/browser/cdp.js +5 -1
- package/src/lib/constants.js +15 -0
- package/src/lib/tag-fetcher.js +69 -63
- package/src/watch/data-store.js +635 -2404
- package/src/watch/data-store.js.bak +5091 -0
- package/src/watch/data-store.js.bak2 +5019 -0
- package/src/watch/db-columns.js +160 -0
- package/src/watch/db-crud.js +458 -0
- package/src/watch/db-mappers.js +128 -0
- package/src/watch/db-raw-jobs.js +235 -0
- package/src/watch/db-schema.js +367 -0
- package/src/watch/db-stats.js +235 -0
- package/src/watch/db-tags.js +348 -0
- package/src/watch/llm-scoring.js +235 -0
- package/src/watch/public/app.js +47 -0
- package/src/watch/public/index.html +6 -0
- package/src/watch/server.js +34 -1
- package/src/watch/tag-service.js +142 -11
package/src/lib/args.js
CHANGED
|
@@ -668,6 +668,8 @@ function parseCommentsArgs(args) {
|
|
|
668
668
|
let commentsParallel = 1;
|
|
669
669
|
let commentsInterval = 10;
|
|
670
670
|
let commentsServer = defaultServer;
|
|
671
|
+
let commentsBasePort = 9222;
|
|
672
|
+
let commentsProxy = null;
|
|
671
673
|
|
|
672
674
|
const positional = [];
|
|
673
675
|
|
|
@@ -683,6 +685,10 @@ function parseCommentsArgs(args) {
|
|
|
683
685
|
commentsServer = args[++i];
|
|
684
686
|
} else if (arg === "-m" || arg === "--max-comments") {
|
|
685
687
|
commentsMax = parseInt(args[++i]) || 20;
|
|
688
|
+
} else if (arg === "--base-port") {
|
|
689
|
+
commentsBasePort = parseInt(args[++i]) || 9222;
|
|
690
|
+
} else if (arg === "--proxy") {
|
|
691
|
+
commentsProxy = args[++i];
|
|
686
692
|
} else {
|
|
687
693
|
positional.push(arg);
|
|
688
694
|
}
|
|
@@ -701,6 +707,8 @@ function parseCommentsArgs(args) {
|
|
|
701
707
|
commentsParallel,
|
|
702
708
|
commentsInterval,
|
|
703
709
|
commentsServer,
|
|
710
|
+
commentsBasePort,
|
|
711
|
+
commentsProxy,
|
|
704
712
|
urls: [],
|
|
705
713
|
outputFormat: "json",
|
|
706
714
|
exploreCount: 0,
|
|
@@ -731,8 +739,11 @@ function parseTagArgs(args) {
|
|
|
731
739
|
let isDiscover = false;
|
|
732
740
|
let isScore = false;
|
|
733
741
|
let isScoreAll = false;
|
|
742
|
+
let scoreAllPort = 9222;
|
|
743
|
+
let scoreProxy = null;
|
|
734
744
|
let scoreTag = null;
|
|
735
745
|
let scoreCountries = null;
|
|
746
|
+
let scorePort = 9222;
|
|
736
747
|
|
|
737
748
|
for (let i = 0; i < args.length; i++) {
|
|
738
749
|
const arg = args[i];
|
|
@@ -775,6 +786,14 @@ function parseTagArgs(args) {
|
|
|
775
786
|
.split(",")
|
|
776
787
|
.map((s) => s.trim().toUpperCase())
|
|
777
788
|
.filter(Boolean);
|
|
789
|
+
} else if (arg === "--port") {
|
|
790
|
+
if (isScoreAll) {
|
|
791
|
+
scoreAllPort = parseInt(args[++i]) || 9222;
|
|
792
|
+
} else {
|
|
793
|
+
scorePort = parseInt(args[++i]) || 9222;
|
|
794
|
+
}
|
|
795
|
+
} else if (arg === "--proxy") {
|
|
796
|
+
scoreProxy = args[++i];
|
|
778
797
|
} else if (arg === "-p" || arg === "--prompt") {
|
|
779
798
|
discoverPrompt = args[++i];
|
|
780
799
|
} else if (!arg.startsWith("-")) {
|
|
@@ -837,6 +856,8 @@ function parseTagArgs(args) {
|
|
|
837
856
|
tag: scoreTag,
|
|
838
857
|
countries: scoreCountries,
|
|
839
858
|
serverUrl,
|
|
859
|
+
port: scorePort,
|
|
860
|
+
proxy: scoreProxy,
|
|
840
861
|
},
|
|
841
862
|
urls: [],
|
|
842
863
|
outputFormat: "json",
|
|
@@ -858,6 +879,8 @@ function parseTagArgs(args) {
|
|
|
858
879
|
countries: scoreCountries,
|
|
859
880
|
serverUrl,
|
|
860
881
|
autoDiscover,
|
|
882
|
+
port: scoreAllPort,
|
|
883
|
+
proxy: scoreProxy,
|
|
861
884
|
},
|
|
862
885
|
urls: [],
|
|
863
886
|
outputFormat: "json",
|
package/src/lib/browser/cdp.js
CHANGED
|
@@ -208,7 +208,11 @@ export { killEdgeProcesses };
|
|
|
208
208
|
|
|
209
209
|
export async function ensureBrowserReady(options = {}) {
|
|
210
210
|
const port = options.port || DEFAULT_CDP_PORT;
|
|
211
|
-
const userDataDir =
|
|
211
|
+
const userDataDir =
|
|
212
|
+
options.userDataDir ||
|
|
213
|
+
(port !== DEFAULT_CDP_PORT
|
|
214
|
+
? `${DEFAULT_USER_DATA_DIR}_p${port}`
|
|
215
|
+
: DEFAULT_USER_DATA_DIR);
|
|
212
216
|
const proxyServer = options.proxyServer || null;
|
|
213
217
|
const isCustom = port !== DEFAULT_CDP_PORT || !!options.userDataDir;
|
|
214
218
|
|
package/src/lib/constants.js
CHANGED
|
@@ -208,6 +208,21 @@ const HELP_TEXT = [
|
|
|
208
208
|
" POST /api/tiktok/lookup 同时获取视频和作者信息 { videoUrl: string }",
|
|
209
209
|
" 示例: tt-help webserver -p 3000",
|
|
210
210
|
"",
|
|
211
|
+
" comments [选项]",
|
|
212
|
+
" 评论采集:从视频评论中发现新用户,来源标记为 comment",
|
|
213
|
+
" 自动模式:循环从服务端取视频任务,抓评论,提交新用户",
|
|
214
|
+
" 手动模式:指定视频URL,采集评论后保存",
|
|
215
|
+
" 选项:",
|
|
216
|
+
" -p, --parallel 并行数(默认: 1)",
|
|
217
|
+
" -i, --interval 空闲间隔上限秒(默认: 10,实际 1~N 随机)",
|
|
218
|
+
" -s, --server 服务端地址(默认: http://127.0.0.1:3001)",
|
|
219
|
+
" -m, --max-comments 每视频最大评论数(默认: 200)",
|
|
220
|
+
" --base-port 浏览器CDP端口(默认: 9222)",
|
|
221
|
+
" --save 去重后保存到服务端,来源标记为 comment",
|
|
222
|
+
" 示例: tt-help comments",
|
|
223
|
+
" tt-help comments -p 2 -i 15 --base-port 9223",
|
|
224
|
+
" tt-help comments <视频URL> 50 --save",
|
|
225
|
+
"",
|
|
211
226
|
" tag <标签名> [...] [选项]",
|
|
212
227
|
" 抓取标签页视频和作者(旧版 CLI 模式)",
|
|
213
228
|
" 选项:",
|
package/src/lib/tag-fetcher.js
CHANGED
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
import { chromium } from "playwright";
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
2
|
+
import { ensureBrowserReady } from "./browser/cdp.js";
|
|
3
|
+
import { getOrCreatePage } from "./browser/page.js";
|
|
4
4
|
import { TikTokScraper } from "./tiktok-scraper.mjs";
|
|
5
5
|
|
|
6
6
|
const TAG_URL = "https://www.tiktok.com/tag";
|
|
7
|
-
const USER_AGENT =
|
|
8
|
-
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
|
|
9
7
|
const SCROLL_INTERVAL = 3000;
|
|
10
8
|
const MAX_STALE_ROUNDS = 3;
|
|
11
9
|
|
|
@@ -13,33 +11,7 @@ function sleep(ms) {
|
|
|
13
11
|
return new Promise((r) => setTimeout(r, ms));
|
|
14
12
|
}
|
|
15
13
|
|
|
16
|
-
function
|
|
17
|
-
return detectBrowser();
|
|
18
|
-
}
|
|
19
|
-
|
|
20
|
-
async function launchBrowser(browserPath) {
|
|
21
|
-
const opts = {
|
|
22
|
-
headless: true,
|
|
23
|
-
args: [
|
|
24
|
-
"--no-sandbox",
|
|
25
|
-
"--disable-blink-features=AutomationControlled",
|
|
26
|
-
"--disable-dev-shm-usage",
|
|
27
|
-
],
|
|
28
|
-
};
|
|
29
|
-
if (browserPath) opts.executablePath = browserPath;
|
|
30
|
-
|
|
31
|
-
try {
|
|
32
|
-
return await chromium.launch(opts);
|
|
33
|
-
} catch {
|
|
34
|
-
if (browserPath) {
|
|
35
|
-
opts.executablePath = undefined;
|
|
36
|
-
return await chromium.launch(opts);
|
|
37
|
-
}
|
|
38
|
-
throw new Error("无法启动浏览器");
|
|
39
|
-
}
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
async function extractItemData(item) {
|
|
14
|
+
function extractItemData(item) {
|
|
43
15
|
return {
|
|
44
16
|
id: item.id || "",
|
|
45
17
|
desc: (item.desc || "").trim(),
|
|
@@ -59,43 +31,35 @@ async function extractItemData(item) {
|
|
|
59
31
|
}
|
|
60
32
|
|
|
61
33
|
/**
|
|
62
|
-
* 获取 TikTok
|
|
34
|
+
* 获取 TikTok 标签页下的所有视频和作者(通过 CDP 连接已登录的 Edge 浏览器)
|
|
63
35
|
* @param {string} tag - 标签名称(不含 # 号)
|
|
64
36
|
* @param {object} [options]
|
|
65
37
|
* @param {number} [options.timeout=300000] - 最大等待时间 (ms)
|
|
66
|
-
* @param {
|
|
67
|
-
* @param {string} [options.
|
|
38
|
+
* @param {number} [options.port=9222] - CDP 端口
|
|
39
|
+
* @param {string} [options.userDataDir] - 用户数据目录
|
|
68
40
|
* @param {Function} [options.onProgress] - 进度回调 ({ videos, authors })
|
|
69
41
|
* @returns {Promise<{ tag: string, challengeId: string, totalPosts: number, videos: Array, uniqueAuthors: string[] }>}
|
|
70
42
|
*/
|
|
71
43
|
export async function fetchTagData(tag, options = {}) {
|
|
72
44
|
const {
|
|
73
45
|
timeout = 300000,
|
|
74
|
-
|
|
75
|
-
|
|
46
|
+
port = 9222,
|
|
47
|
+
userDataDir,
|
|
48
|
+
proxyServer,
|
|
76
49
|
onProgress,
|
|
77
50
|
} = options;
|
|
78
51
|
|
|
79
|
-
const
|
|
80
|
-
if (
|
|
81
|
-
|
|
82
|
-
"未找到可用的浏览器,请设置 browserPath 或安装 Chrome/Edge",
|
|
83
|
-
);
|
|
84
|
-
}
|
|
52
|
+
const cdpOptions = { port };
|
|
53
|
+
if (userDataDir) cdpOptions.userDataDir = userDataDir;
|
|
54
|
+
if (proxyServer) cdpOptions.proxyServer = proxyServer;
|
|
85
55
|
|
|
86
|
-
const browser = await
|
|
56
|
+
const browser = await ensureBrowserReady(cdpOptions);
|
|
57
|
+
const page = await getOrCreatePage(browser);
|
|
87
58
|
|
|
88
59
|
try {
|
|
89
|
-
const context = await browser.newContext({
|
|
90
|
-
viewport: { width: 1280, height: 900 },
|
|
91
|
-
userAgent: USER_AGENT,
|
|
92
|
-
locale,
|
|
93
|
-
});
|
|
94
|
-
await context.addInitScript(getAntiDetectScript());
|
|
95
|
-
const page = await context.newPage();
|
|
96
|
-
|
|
97
60
|
let challengeInfo = null;
|
|
98
61
|
const rawVideos = [];
|
|
62
|
+
const seenVideoIds = new Set();
|
|
99
63
|
const authors = new Set();
|
|
100
64
|
|
|
101
65
|
page.on("response", async (resp) => {
|
|
@@ -114,9 +78,13 @@ export async function fetchTagData(tag, options = {}) {
|
|
|
114
78
|
const body = await resp.json();
|
|
115
79
|
if (!body?.itemList) return;
|
|
116
80
|
for (const item of body.itemList) {
|
|
117
|
-
const
|
|
118
|
-
if (
|
|
119
|
-
|
|
81
|
+
const vid = item.id || "";
|
|
82
|
+
if (vid && !seenVideoIds.has(vid)) {
|
|
83
|
+
seenVideoIds.add(vid);
|
|
84
|
+
const uid = item.author?.uniqueId || "";
|
|
85
|
+
if (uid) authors.add(uid);
|
|
86
|
+
rawVideos.push(extractItemData(item));
|
|
87
|
+
}
|
|
120
88
|
}
|
|
121
89
|
if (onProgress) {
|
|
122
90
|
onProgress({ videos: rawVideos.length, authors: authors.size });
|
|
@@ -137,11 +105,49 @@ export async function fetchTagData(tag, options = {}) {
|
|
|
137
105
|
|
|
138
106
|
await page.waitForTimeout(3000);
|
|
139
107
|
|
|
108
|
+
// 检测页面异常:验证码、网络错误、标签不存在
|
|
140
109
|
const pageError = await page.evaluate(() => {
|
|
141
110
|
const text = document.body?.innerText || "";
|
|
142
|
-
if (
|
|
111
|
+
if (
|
|
112
|
+
document.querySelector(
|
|
113
|
+
".captcha-verify-container, #captcha_close_button",
|
|
114
|
+
)
|
|
115
|
+
)
|
|
116
|
+
return "captcha";
|
|
117
|
+
if (
|
|
118
|
+
text.includes("Verify you are a robot") ||
|
|
119
|
+
text.includes("Please rotate")
|
|
120
|
+
)
|
|
121
|
+
return "captcha";
|
|
122
|
+
if (
|
|
123
|
+
text.includes("找不到此话题标签") ||
|
|
124
|
+
text.includes("This hashtag is unavailable")
|
|
125
|
+
)
|
|
126
|
+
return "tag_not_found";
|
|
127
|
+
if (
|
|
128
|
+
text.includes("Something went wrong") ||
|
|
129
|
+
text.includes("Network error")
|
|
130
|
+
)
|
|
131
|
+
return "page_error";
|
|
143
132
|
return null;
|
|
144
133
|
});
|
|
134
|
+
|
|
135
|
+
if (pageError === "captcha") {
|
|
136
|
+
throw new Error("⚠️ 遇到验证码,请手动处理后重试");
|
|
137
|
+
}
|
|
138
|
+
if (pageError === "tag_not_found") {
|
|
139
|
+
// 标签不存在 → dead
|
|
140
|
+
return {
|
|
141
|
+
tag,
|
|
142
|
+
challengeId: "",
|
|
143
|
+
totalPosts: 0,
|
|
144
|
+
videoCount: 0,
|
|
145
|
+
uniqueAuthorCount: 0,
|
|
146
|
+
videos: [],
|
|
147
|
+
uniqueAuthors: [],
|
|
148
|
+
error: "tag_not_found",
|
|
149
|
+
};
|
|
150
|
+
}
|
|
145
151
|
if (pageError) {
|
|
146
152
|
throw new Error("标签页加载失败,TikTok 返回了错误页面");
|
|
147
153
|
}
|
|
@@ -156,18 +162,16 @@ export async function fetchTagData(tag, options = {}) {
|
|
|
156
162
|
await page.evaluate(() => window.scrollBy(0, 3000));
|
|
157
163
|
await sleep(SCROLL_INTERVAL);
|
|
158
164
|
|
|
159
|
-
if (
|
|
165
|
+
if (seenVideoIds.size === lastCount) {
|
|
160
166
|
staleRounds++;
|
|
161
167
|
} else {
|
|
162
168
|
staleRounds = 0;
|
|
163
|
-
lastCount =
|
|
169
|
+
lastCount = seenVideoIds.size;
|
|
164
170
|
}
|
|
165
171
|
}
|
|
166
172
|
|
|
167
|
-
|
|
168
|
-
const uniqueVideos = rawVideos
|
|
169
|
-
seen.has(v.id) ? false : (seen.add(v.id), true),
|
|
170
|
-
);
|
|
173
|
+
// 已在采集时去重,rawVideos 即为唯一视频列表
|
|
174
|
+
const uniqueVideos = rawVideos;
|
|
171
175
|
|
|
172
176
|
const totalPosts = challengeInfo?.stats?.videoCount || 0;
|
|
173
177
|
|
|
@@ -181,7 +185,7 @@ export async function fetchTagData(tag, options = {}) {
|
|
|
181
185
|
uniqueAuthors: [...authors],
|
|
182
186
|
};
|
|
183
187
|
} finally {
|
|
184
|
-
|
|
188
|
+
// 不关闭 page 和 browser,由用户自行关闭
|
|
185
189
|
}
|
|
186
190
|
}
|
|
187
191
|
|
|
@@ -200,11 +204,13 @@ export async function enrichVideosWithLocation(videos, options = {}) {
|
|
|
200
204
|
mode = "videos",
|
|
201
205
|
poolSize = 3,
|
|
202
206
|
maxRetries = 3,
|
|
207
|
+
proxyServer,
|
|
203
208
|
onProgress,
|
|
204
209
|
existingScraper,
|
|
205
210
|
} = options;
|
|
206
211
|
|
|
207
|
-
const scraper =
|
|
212
|
+
const scraper =
|
|
213
|
+
existingScraper || new TikTokScraper({ poolSize, proxyServer });
|
|
208
214
|
const ownsScraper = !existingScraper;
|
|
209
215
|
if (ownsScraper) await scraper.init();
|
|
210
216
|
|