tt-help-cli-ycl 1.3.93 → 1.3.94

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
package/src/lib/args.js CHANGED
@@ -668,6 +668,8 @@ function parseCommentsArgs(args) {
668
668
  let commentsParallel = 1;
669
669
  let commentsInterval = 10;
670
670
  let commentsServer = defaultServer;
671
+ let commentsBasePort = 9222;
672
+ let commentsProxy = null;
671
673
 
672
674
  const positional = [];
673
675
 
@@ -683,6 +685,10 @@ function parseCommentsArgs(args) {
683
685
  commentsServer = args[++i];
684
686
  } else if (arg === "-m" || arg === "--max-comments") {
685
687
  commentsMax = parseInt(args[++i]) || 20;
688
+ } else if (arg === "--base-port") {
689
+ commentsBasePort = parseInt(args[++i]) || 9222;
690
+ } else if (arg === "--proxy") {
691
+ commentsProxy = args[++i];
686
692
  } else {
687
693
  positional.push(arg);
688
694
  }
@@ -701,6 +707,8 @@ function parseCommentsArgs(args) {
701
707
  commentsParallel,
702
708
  commentsInterval,
703
709
  commentsServer,
710
+ commentsBasePort,
711
+ commentsProxy,
704
712
  urls: [],
705
713
  outputFormat: "json",
706
714
  exploreCount: 0,
@@ -731,8 +739,11 @@ function parseTagArgs(args) {
731
739
  let isDiscover = false;
732
740
  let isScore = false;
733
741
  let isScoreAll = false;
742
+ let scoreAllPort = 9222;
743
+ let scoreProxy = null;
734
744
  let scoreTag = null;
735
745
  let scoreCountries = null;
746
+ let scorePort = 9222;
736
747
 
737
748
  for (let i = 0; i < args.length; i++) {
738
749
  const arg = args[i];
@@ -775,6 +786,14 @@ function parseTagArgs(args) {
775
786
  .split(",")
776
787
  .map((s) => s.trim().toUpperCase())
777
788
  .filter(Boolean);
789
+ } else if (arg === "--port") {
790
+ if (isScoreAll) {
791
+ scoreAllPort = parseInt(args[++i]) || 9222;
792
+ } else {
793
+ scorePort = parseInt(args[++i]) || 9222;
794
+ }
795
+ } else if (arg === "--proxy") {
796
+ scoreProxy = args[++i];
778
797
  } else if (arg === "-p" || arg === "--prompt") {
779
798
  discoverPrompt = args[++i];
780
799
  } else if (!arg.startsWith("-")) {
@@ -837,6 +856,8 @@ function parseTagArgs(args) {
837
856
  tag: scoreTag,
838
857
  countries: scoreCountries,
839
858
  serverUrl,
859
+ port: scorePort,
860
+ proxy: scoreProxy,
840
861
  },
841
862
  urls: [],
842
863
  outputFormat: "json",
@@ -858,6 +879,8 @@ function parseTagArgs(args) {
858
879
  countries: scoreCountries,
859
880
  serverUrl,
860
881
  autoDiscover,
882
+ port: scoreAllPort,
883
+ proxy: scoreProxy,
861
884
  },
862
885
  urls: [],
863
886
  outputFormat: "json",
@@ -208,7 +208,10 @@ export { killEdgeProcesses };
208
208
 
209
209
  export async function ensureBrowserReady(options = {}) {
210
210
  const port = options.port || DEFAULT_CDP_PORT;
211
- const userDataDir = options.userDataDir || DEFAULT_USER_DATA_DIR;
211
+ const baseDir = options.userDataDir || DEFAULT_USER_DATA_DIR;
212
+ // 非默认端口时,userDataDir 加上 _p{port} 后缀
213
+ const userDataDir =
214
+ port !== DEFAULT_CDP_PORT ? `${baseDir}_p${port}` : baseDir;
212
215
  const proxyServer = options.proxyServer || null;
213
216
  const isCustom = port !== DEFAULT_CDP_PORT || !!options.userDataDir;
214
217
 
@@ -208,6 +208,21 @@ const HELP_TEXT = [
208
208
  " POST /api/tiktok/lookup 同时获取视频和作者信息 { videoUrl: string }",
209
209
  " 示例: tt-help webserver -p 3000",
210
210
  "",
211
+ " comments [选项]",
212
+ " 评论采集:从视频评论中发现新用户,来源标记为 comment",
213
+ " 自动模式:循环从服务端取视频任务,抓评论,提交新用户",
214
+ " 手动模式:指定视频URL,采集评论后保存",
215
+ " 选项:",
216
+ " -p, --parallel 并行数(默认: 1)",
217
+ " -i, --interval 空闲间隔上限秒(默认: 10,实际 1~N 随机)",
218
+ " -s, --server 服务端地址(默认: http://127.0.0.1:3001)",
219
+ " -m, --max-comments 每视频最大评论数(默认: 200)",
220
+ " --base-port 浏览器CDP端口(默认: 9222)",
221
+ " --save 去重后保存到服务端,来源标记为 comment",
222
+ " 示例: tt-help comments",
223
+ " tt-help comments -p 2 -i 15 --base-port 9223",
224
+ " tt-help comments <视频URL> 50 --save",
225
+ "",
211
226
  " tag <标签名> [...] [选项]",
212
227
  " 抓取标签页视频和作者(旧版 CLI 模式)",
213
228
  " 选项:",
@@ -1,11 +1,9 @@
1
1
  import { chromium } from "playwright";
2
- import { detectBrowser } from "./browser/launch.js";
3
- import { getAntiDetectScript } from "./browser/anti-detect.js";
2
+ import { ensureBrowserReady } from "./browser/cdp.js";
3
+ import { getOrCreatePage } from "./browser/page.js";
4
4
  import { TikTokScraper } from "./tiktok-scraper.mjs";
5
5
 
6
6
  const TAG_URL = "https://www.tiktok.com/tag";
7
- const USER_AGENT =
8
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
9
7
  const SCROLL_INTERVAL = 3000;
10
8
  const MAX_STALE_ROUNDS = 3;
11
9
 
@@ -13,33 +11,7 @@ function sleep(ms) {
13
11
  return new Promise((r) => setTimeout(r, ms));
14
12
  }
15
13
 
16
- function findBrowser() {
17
- return detectBrowser();
18
- }
19
-
20
- async function launchBrowser(browserPath) {
21
- const opts = {
22
- headless: true,
23
- args: [
24
- "--no-sandbox",
25
- "--disable-blink-features=AutomationControlled",
26
- "--disable-dev-shm-usage",
27
- ],
28
- };
29
- if (browserPath) opts.executablePath = browserPath;
30
-
31
- try {
32
- return await chromium.launch(opts);
33
- } catch {
34
- if (browserPath) {
35
- opts.executablePath = undefined;
36
- return await chromium.launch(opts);
37
- }
38
- throw new Error("无法启动浏览器");
39
- }
40
- }
41
-
42
- async function extractItemData(item) {
14
+ function extractItemData(item) {
43
15
  return {
44
16
  id: item.id || "",
45
17
  desc: (item.desc || "").trim(),
@@ -59,43 +31,35 @@ async function extractItemData(item) {
59
31
  }
60
32
 
61
33
  /**
62
- * 获取 TikTok 标签页下的所有视频和作者
34
+ * 获取 TikTok 标签页下的所有视频和作者(通过 CDP 连接已登录的 Edge 浏览器)
63
35
  * @param {string} tag - 标签名称(不含 # 号)
64
36
  * @param {object} [options]
65
37
  * @param {number} [options.timeout=300000] - 最大等待时间 (ms)
66
- * @param {string} [options.browserPath] - 浏览器可执行文件路径,不传则自动探测
67
- * @param {string} [options.locale='en-US'] - 页面语言
38
+ * @param {number} [options.port=9222] - CDP 端口
39
+ * @param {string} [options.userDataDir] - 用户数据目录
68
40
  * @param {Function} [options.onProgress] - 进度回调 ({ videos, authors })
69
41
  * @returns {Promise<{ tag: string, challengeId: string, totalPosts: number, videos: Array, uniqueAuthors: string[] }>}
70
42
  */
71
43
  export async function fetchTagData(tag, options = {}) {
72
44
  const {
73
45
  timeout = 300000,
74
- browserPath: customBrowserPath,
75
- locale = "en-US",
46
+ port = 9222,
47
+ userDataDir,
48
+ proxyServer,
76
49
  onProgress,
77
50
  } = options;
78
51
 
79
- const browserPath = customBrowserPath || findBrowser();
80
- if (!browserPath) {
81
- throw new Error(
82
- "未找到可用的浏览器,请设置 browserPath 或安装 Chrome/Edge",
83
- );
84
- }
52
+ const cdpOptions = { port };
53
+ if (userDataDir) cdpOptions.userDataDir = userDataDir;
54
+ if (proxyServer) cdpOptions.proxyServer = proxyServer;
85
55
 
86
- const browser = await launchBrowser(browserPath);
56
+ const browser = await ensureBrowserReady(cdpOptions);
57
+ const page = await getOrCreatePage(browser);
87
58
 
88
59
  try {
89
- const context = await browser.newContext({
90
- viewport: { width: 1280, height: 900 },
91
- userAgent: USER_AGENT,
92
- locale,
93
- });
94
- await context.addInitScript(getAntiDetectScript());
95
- const page = await context.newPage();
96
-
97
60
  let challengeInfo = null;
98
61
  const rawVideos = [];
62
+ const seenVideoIds = new Set();
99
63
  const authors = new Set();
100
64
 
101
65
  page.on("response", async (resp) => {
@@ -114,9 +78,13 @@ export async function fetchTagData(tag, options = {}) {
114
78
  const body = await resp.json();
115
79
  if (!body?.itemList) return;
116
80
  for (const item of body.itemList) {
117
- const uid = item.author?.uniqueId || "";
118
- if (uid) authors.add(uid);
119
- rawVideos.push(await extractItemData(item));
81
+ const vid = item.id || "";
82
+ if (vid && !seenVideoIds.has(vid)) {
83
+ seenVideoIds.add(vid);
84
+ const uid = item.author?.uniqueId || "";
85
+ if (uid) authors.add(uid);
86
+ rawVideos.push(extractItemData(item));
87
+ }
120
88
  }
121
89
  if (onProgress) {
122
90
  onProgress({ videos: rawVideos.length, authors: authors.size });
@@ -137,11 +105,49 @@ export async function fetchTagData(tag, options = {}) {
137
105
 
138
106
  await page.waitForTimeout(3000);
139
107
 
108
+ // 检测页面异常:验证码、网络错误、标签不存在
140
109
  const pageError = await page.evaluate(() => {
141
110
  const text = document.body?.innerText || "";
142
- if (text.includes("Something went wrong")) return "page_error";
111
+ if (
112
+ document.querySelector(
113
+ ".captcha-verify-container, #captcha_close_button",
114
+ )
115
+ )
116
+ return "captcha";
117
+ if (
118
+ text.includes("Verify you are a robot") ||
119
+ text.includes("Please rotate")
120
+ )
121
+ return "captcha";
122
+ if (
123
+ text.includes("找不到此话题标签") ||
124
+ text.includes("This hashtag is unavailable")
125
+ )
126
+ return "tag_not_found";
127
+ if (
128
+ text.includes("Something went wrong") ||
129
+ text.includes("Network error")
130
+ )
131
+ return "page_error";
143
132
  return null;
144
133
  });
134
+
135
+ if (pageError === "captcha") {
136
+ throw new Error("⚠️ 遇到验证码,请手动处理后重试");
137
+ }
138
+ if (pageError === "tag_not_found") {
139
+ // 标签不存在 → dead
140
+ return {
141
+ tag,
142
+ challengeId: "",
143
+ totalPosts: 0,
144
+ videoCount: 0,
145
+ uniqueAuthorCount: 0,
146
+ videos: [],
147
+ uniqueAuthors: [],
148
+ error: "tag_not_found",
149
+ };
150
+ }
145
151
  if (pageError) {
146
152
  throw new Error("标签页加载失败,TikTok 返回了错误页面");
147
153
  }
@@ -156,18 +162,16 @@ export async function fetchTagData(tag, options = {}) {
156
162
  await page.evaluate(() => window.scrollBy(0, 3000));
157
163
  await sleep(SCROLL_INTERVAL);
158
164
 
159
- if (rawVideos.length === lastCount) {
165
+ if (seenVideoIds.size === lastCount) {
160
166
  staleRounds++;
161
167
  } else {
162
168
  staleRounds = 0;
163
- lastCount = rawVideos.length;
169
+ lastCount = seenVideoIds.size;
164
170
  }
165
171
  }
166
172
 
167
- const seen = new Set();
168
- const uniqueVideos = rawVideos.filter((v) =>
169
- seen.has(v.id) ? false : (seen.add(v.id), true),
170
- );
173
+ // 已在采集时去重,rawVideos 即为唯一视频列表
174
+ const uniqueVideos = rawVideos;
171
175
 
172
176
  const totalPosts = challengeInfo?.stats?.videoCount || 0;
173
177
 
@@ -181,7 +185,7 @@ export async function fetchTagData(tag, options = {}) {
181
185
  uniqueAuthors: [...authors],
182
186
  };
183
187
  } finally {
184
- await browser.close();
188
+ // 不关闭 page 和 browser,由用户自行关闭
185
189
  }
186
190
  }
187
191
 
@@ -200,11 +204,13 @@ export async function enrichVideosWithLocation(videos, options = {}) {
200
204
  mode = "videos",
201
205
  poolSize = 3,
202
206
  maxRetries = 3,
207
+ proxyServer,
203
208
  onProgress,
204
209
  existingScraper,
205
210
  } = options;
206
211
 
207
- const scraper = existingScraper || new TikTokScraper({ poolSize });
212
+ const scraper =
213
+ existingScraper || new TikTokScraper({ poolSize, proxyServer });
208
214
  const ownsScraper = !existingScraper;
209
215
  if (ownsScraper) await scraper.init();
210
216