tt-help-cli-ycl 1.3.91 → 1.3.93

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "tt-help-cli-ycl",
3
- "version": "1.3.91",
3
+ "version": "1.3.93",
4
4
  "description": "TikTok user & video data scraper - extract ttSeller, verified, locationCreated from HTML source",
5
5
  "type": "module",
6
6
  "bin": {
package/src/cli/tag.js CHANGED
@@ -1,4 +1,5 @@
1
1
  import { writeFileSync } from "fs";
2
+ import { randomUUID } from "crypto";
2
3
  import { fetchTagData, enrichVideosWithLocation } from "../lib/tag-fetcher.js";
3
4
  import { TikTokScraper } from "../lib/tiktok-scraper.mjs";
4
5
  import {
@@ -11,7 +12,22 @@ import { server as cfgServer } from "../lib/constants.js";
11
12
  const ALL_COUNTRIES = DEFAULT_TARGET_LOCATIONS;
12
13
  const DEFAULT_SERVER = cfgServer || "http://127.0.0.1:3000";
13
14
 
14
- async function pushToServer(serverUrl, filteredAuthors, videos) {
15
+ // Build the client-tracking request headers (X-Client-Id, X-Client-Info) passed to fetch; entries in `extra` are spread in last
16
+ function buildClientHeaders(clientId, meta, extra = {}) {
17
+ return {
18
+ "X-Client-Id": clientId,
19
+ "X-Client-Info": JSON.stringify(meta),
20
+ ...extra,
21
+ };
22
+ }
23
+
24
+ async function pushToServer(
25
+ serverUrl,
26
+ filteredAuthors,
27
+ videos,
28
+ clientId,
29
+ meta,
30
+ ) {
15
31
  const users = filteredAuthors.map((author) => {
16
32
  const video = videos.find((v) => v.authorUniqueId === author);
17
33
  return {
@@ -23,7 +39,9 @@ async function pushToServer(serverUrl, filteredAuthors, videos) {
23
39
 
24
40
  const res = await fetch(`${serverUrl}/api/raw-users`, {
25
41
  method: "POST",
26
- headers: { "Content-Type": "application/json" },
42
+ headers: buildClientHeaders(clientId, meta, {
43
+ "Content-Type": "application/json",
44
+ }),
27
45
  body: JSON.stringify({ users }),
28
46
  });
29
47
  const data = await res.json();
@@ -412,11 +430,13 @@ export async function handleScore(parsed) {
412
430
  console.log(JSON.stringify(result, null, 2));
413
431
  }
414
432
 
415
- async function reportToServer(baseUrl, result) {
433
+ async function reportToServer(baseUrl, result, clientId, meta) {
416
434
  try {
417
435
  const res = await fetch(`${baseUrl}/api/tags/score-result`, {
418
436
  method: "POST",
419
- headers: { "Content-Type": "application/json" },
437
+ headers: buildClientHeaders(clientId, meta, {
438
+ "Content-Type": "application/json",
439
+ }),
420
440
  body: JSON.stringify(result),
421
441
  });
422
442
  const data = await res.json();
@@ -464,17 +484,24 @@ export async function handleScoreAll(parsed) {
464
484
  let emptyRounds = 0; // 连续无任务的轮数
465
485
  const DISCOVER_AFTER_EMPTY = 3; // 连续 3 轮无任务时触发 discover
466
486
 
487
+ // 生成客户端 ID,用于服务端追踪
488
+ const clientId = randomUUID();
489
+ const clientMeta = { type: "scoring" };
490
+
467
491
  // 复用 TikTokScraper 实例,避免每次 enrich 都启动/关闭 headless 浏览器
468
492
  const enrichScraper = new TikTokScraper({ poolSize: 3 });
469
493
  await enrichScraper.init();
470
494
  log("✅ TikTokScraper 已就绪 (enrich 复用)");
495
+ log(` 客户端 ID: ${clientId.substring(0, 8)}...`);
471
496
  log("");
472
497
 
473
498
  try {
474
499
  while (true) {
475
500
  try {
476
501
  // 从服务端取下一个 new 标签
477
- const tagsRes = await fetch(`${baseUrl}/api/tags?status=new&limit=1`);
502
+ const tagsRes = await fetch(`${baseUrl}/api/tags?status=new&limit=1`, {
503
+ headers: buildClientHeaders(clientId, clientMeta),
504
+ });
478
505
  const tagsData = await tagsRes.json();
479
506
  if (!tagsData.tags || tagsData.tags.length === 0) {
480
507
  emptyRounds++;
@@ -488,6 +515,7 @@ export async function handleScoreAll(parsed) {
488
515
  try {
489
516
  const discRes = await fetch(
490
517
  `${baseUrl}/api/tags/discover?country=${country}&count=5`,
518
+ { headers: buildClientHeaders(clientId, clientMeta) },
491
519
  );
492
520
  const discData = await discRes.json();
493
521
  if (discData.inserted) {
@@ -530,10 +558,12 @@ export async function handleScoreAll(parsed) {
530
558
  error: null,
531
559
  };
532
560
 
533
- // 锁定 tag
561
+ // 锁定 tag(meta 中不放入 tag,避免非 ASCII 字符导致 header ByteString 报错)
534
562
  const claimRes = await fetch(`${baseUrl}/api/tags/claim`, {
535
563
  method: "POST",
536
- headers: { "Content-Type": "application/json" },
564
+ headers: buildClientHeaders(clientId, clientMeta, {
565
+ "Content-Type": "application/json",
566
+ }),
537
567
  body: JSON.stringify({ tag }),
538
568
  });
539
569
  const claimData = await claimRes.json();
@@ -546,7 +576,7 @@ export async function handleScoreAll(parsed) {
546
576
  log(` ⚠️ 无法锁定 (${claimData.error}),标记为 dead 并跳过`);
547
577
  result.error = claimData.error;
548
578
  result.status = "dead";
549
- await reportToServer(baseUrl, result);
579
+ await reportToServer(baseUrl, result, clientId, clientMeta);
550
580
  totalScored++;
551
581
  continue;
552
582
  }
@@ -570,7 +600,7 @@ export async function handleScoreAll(parsed) {
570
600
  log(" ⚠️ 无视频,标记 dead");
571
601
  result.status = "dead";
572
602
  result.error = "no videos found";
573
- await reportToServer(baseUrl, result);
603
+ await reportToServer(baseUrl, result, clientId, clientMeta);
574
604
  totalScored++;
575
605
  continue;
576
606
  }
@@ -602,12 +632,14 @@ export async function handleScoreAll(parsed) {
602
632
  baseUrl,
603
633
  [...matchedAuthorSet],
604
634
  videos,
635
+ clientId,
636
+ clientMeta,
605
637
  );
606
638
  result.pushedUsers = pushResult.added || 0;
607
639
  }
608
640
 
609
641
  // 上报结果
610
- await reportToServer(baseUrl, result);
642
+ await reportToServer(baseUrl, result, clientId, clientMeta);
611
643
 
612
644
  totalScored++;
613
645
  const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
@@ -643,12 +675,17 @@ export async function handleScoreAll(parsed) {
643
675
  }
644
676
  log(` ❌ 失败: ${e.message}`);
645
677
  try {
646
- await reportToServer(baseUrl, {
647
- tag: "",
648
- status: "error",
649
- score: 0,
650
- error: e.message,
651
- });
678
+ await reportToServer(
679
+ baseUrl,
680
+ {
681
+ tag: "",
682
+ status: "error",
683
+ score: 0,
684
+ error: e.message,
685
+ },
686
+ clientId,
687
+ clientMeta,
688
+ );
652
689
  } catch {}
653
690
  totalScored++;
654
691
  }
@@ -2871,6 +2871,8 @@ Standards: 90-100=clear match, 70-89=likely, 50-69=possible, 20-49=low, 0-19=unl
2871
2871
  // 如果启用 LLM 打分,先采样一批进行评分(累积模式:按猜测国家分组,使用偏移量记忆避免重复采样)
2872
2872
  if (useLlm && normalizedLocations && normalizedLocations.length > 0) {
2873
2873
  const llmMinReturn = options.llmMinReturn ?? 60; // 最少返回合格数
2874
+ const llmMinTagReturn = options.llmMinTagReturn ?? 30; // tag 最少合格数
2875
+ const llmMinNonTagReturn = options.llmMinNonTagReturn ?? 30; // 非 tag 最少合格数
2874
2876
  const maxBatches = options.llmMaxBatches ?? 10; // 最多采样轮次,防止无限循环
2875
2877
 
2876
2878
  // 打印当前偏移量状态
@@ -2878,7 +2880,7 @@ Standards: 90-100=clear match, 70-89=likely, 50-69=possible, 20-49=low, 0-19=unl
2878
2880
  .map(([k, v]) => `${k}:${v}`)
2879
2881
  .join(", ");
2880
2882
  console.error(
2881
- `[data-store] LLM 打分开始: 符合条件 ${count} 条,每批 ${llmSampleSize} 条,最低分 ${llmMinScore},最少返回 ${llmMinReturn} 条`,
2883
+ `[data-store] LLM 打分开始: 符合条件 ${count} 条,每批 ${llmSampleSize} 条,最低分 ${llmMinScore},tag 最少 ${llmMinTagReturn},非 tag 最少 ${llmMinNonTagReturn}`,
2882
2884
  );
2883
2885
  if (offsetSummary) {
2884
2886
  console.error(`[data-store] 偏移量记忆: ${offsetSummary}`);
@@ -2886,7 +2888,8 @@ Standards: 90-100=clear match, 70-89=likely, 50-69=possible, 20-49=low, 0-19=unl
2886
2888
 
2887
2889
  // 返回 Promise,调用方需要 await
2888
2890
  return (async () => {
2889
- const allQualified = [];
2891
+ const allTagQualified = []; // tag 合格列表(直接合格)
2892
+ const allNonTagQualified = []; // 非 tag 合格列表(LLM 打分合格)
2890
2893
  const allScores = [];
2891
2894
 
2892
2895
  // 按猜测国家分组处理,每个国家使用独立的偏移量
@@ -2949,7 +2952,7 @@ Standards: 90-100=clear match, 70-89=likely, 50-69=possible, 20-49=low, 0-19=unl
2949
2952
 
2950
2953
  // tag 来源直接加入合格列表
2951
2954
  if (tagSamples.length > 0) {
2952
- allQualified.push(...tagSamples.map((s) => s.unique_id));
2955
+ allTagQualified.push(...tagSamples.map((s) => s.unique_id));
2953
2956
  console.error(
2954
2957
  `[data-store] ${location}: 本批 ${tagSamples.length} 条 tag 来源任务跳过 LLM 打分直接合格`,
2955
2958
  );
@@ -2964,47 +2967,51 @@ Standards: 90-100=clear match, 70-89=likely, 50-69=possible, 20-49=low, 0-19=unl
2964
2967
  DEFAULT_TARGET_LOCATIONS,
2965
2968
  );
2966
2969
  batchQualified = scores.filter((s) => s.score >= llmMinScore);
2970
+ allNonTagQualified.push(...batchQualified.map((s) => s.uniqueId));
2967
2971
  }
2968
2972
 
2969
2973
  allScores.push(...scores);
2970
- allQualified.push(...batchQualified.map((s) => s.uniqueId));
2971
2974
 
2972
2975
  totalBatches++;
2976
+ const totalQualified = allTagQualified.length + allNonTagQualified.length;
2973
2977
  console.error(
2974
- `[data-store] ${location} 第 ${batch + 1} 批: 采样 ${samples.length} 条,本批合格 ${batchQualified.length} 条,累计合格 ${allQualified.length} 条`,
2978
+ `[data-store] ${location} 第 ${batch + 1} 批: 采样 ${samples.length} 条,tag 合格 ${allTagQualified.length},非 tag 合格 ${allNonTagQualified.length},累计 ${totalQualified} 条`,
2975
2979
  );
2976
2980
 
2977
2981
  // 更新偏移量记忆
2978
2982
  offset += samples.length;
2979
2983
  llmSampleOffsets.set(location, offset);
2980
2984
 
2981
- // 合格数已达到最小返回阈值,停止采样
2982
- if (allQualified.length >= llmMinReturn) break;
2985
+ // 检查是否两个类型都达到阈值,都达到才停止
2986
+ const tagReached = allTagQualified.length >= llmMinTagReturn;
2987
+ const nonTagReached = allNonTagQualified.length >= llmMinNonTagReturn;
2988
+ if (tagReached && nonTagReached) {
2989
+ console.error(
2990
+ `[data-store] 两类任务均已达标 (tag: ${allTagQualified.length}/${llmMinTagReturn}, 非 tag: ${allNonTagQualified.length}/${llmMinNonTagReturn}),停止采样`,
2991
+ );
2992
+ break;
2993
+ }
2983
2994
  }
2984
2995
 
2985
- // 合格数已达到最小返回阈值,停止所有国家的采样
2986
- if (allQualified.length >= llmMinReturn) break;
2996
+ // 检查是否两个类型都达到阈值,都达到才停止所有国家采样
2997
+ const tagReachedGlobal = allTagQualified.length >= llmMinTagReturn;
2998
+ const nonTagReachedGlobal = allNonTagQualified.length >= llmMinNonTagReturn;
2999
+ if (tagReachedGlobal && nonTagReachedGlobal) break;
2987
3000
  }
2988
3001
 
2989
- // 分离 tag 合格和非 tag 合格
2990
- // tag 任务直接合格(不在 allScores 中),非 tag 任务走 LLM 打分
2991
- const tagQualified = allQualified.filter(
2992
- (uid) => !allScores.find((s) => s.uniqueId === uid),
2993
- );
2994
- const nonTagQualifiedScores = allScores
2995
- .filter((s) => s.score >= llmMinScore)
2996
- .sort((a, b) => b.score - a.score);
2997
- const nonTagQualified = nonTagQualifiedScores.map((s) => s.uniqueId);
2998
-
3002
+ // 最终合格列表:tag 优先 + 非 tag 按分数排序
2999
3003
  // 限制 tag 占比:最多占 safeLimit 的 70%,留 30% 给非 tag
3000
3004
  const tagMaxCount = Math.floor(safeLimit * 0.7);
3001
- const tagCount = Math.min(tagQualified.length, tagMaxCount);
3005
+ const tagCount = Math.min(allTagQualified.length, tagMaxCount);
3002
3006
  const nonTagMaxCount = safeLimit - tagCount;
3003
- const finalNonTagQualified = nonTagQualified.slice(0, nonTagMaxCount);
3004
3007
 
3005
- // 最终合格列表:tag 优先 + 非 tag 按分数排序
3008
+ const nonTagQualifiedScores = allScores
3009
+ .filter((s) => s.score >= llmMinScore)
3010
+ .sort((a, b) => b.score - a.score);
3011
+ const finalNonTagQualified = nonTagQualifiedScores.slice(0, nonTagMaxCount).map((s) => s.uniqueId);
3012
+
3006
3013
  const qualified = [
3007
- ...tagQualified.slice(0, tagCount),
3014
+ ...allTagQualified.slice(0, tagCount),
3008
3015
  ...finalNonTagQualified,
3009
3016
  ];
3010
3017
 
@@ -255,8 +255,14 @@ function renderActiveClients(clients) {
255
255
  const tbody = document.getElementById("activeClientsBody");
256
256
  if (!section || !bar) return;
257
257
 
258
- const types = ["explore", "refresh", "attach", "comments"];
259
- const labels = { explore: "Explore", refresh: "Refresh", attach: "Attach", comments: "Comments" };
258
+ const types = ["explore", "refresh", "attach", "comments", "scoring"];
259
+ const labels = {
260
+ explore: "Explore",
261
+ refresh: "Refresh",
262
+ attach: "Attach",
263
+ comments: "Comments",
264
+ scoring: "Scoring",
265
+ };
260
266
  const grouped = {};
261
267
  for (const c of clients) {
262
268
  if (!grouped[c.type]) grouped[c.type] = [];
@@ -314,9 +320,7 @@ function showClientDetail(type, clients) {
314
320
  tbody.innerHTML = clients
315
321
  .map((c) => {
316
322
  const cid = c.clientId ? c.clientId.substring(0, 8) : "-";
317
- const ipPort = c.ip
318
- ? c.ip + (c.port ? ":" + c.port : "")
319
- : "-";
323
+ const ipPort = c.ip ? c.ip + (c.port ? ":" + c.port : "") : "-";
320
324
  const userId = c.userId || "-";
321
325
  const last = formatRelativeTime(c.lastSeen);
322
326
  return `<tr>
@@ -93,6 +93,7 @@ function inferClientType(routePath) {
93
93
  if (routePath.startsWith("/api/redo-job")) return "refresh";
94
94
  if (routePath.startsWith("/api/user-update-tasks")) return "attach";
95
95
  if (routePath.startsWith("/api/comment-task")) return "comments";
96
+ if (routePath.startsWith("/api/tags")) return "scoring";
96
97
  if (
97
98
  routePath.startsWith("/api/job") ||
98
99
  routePath.startsWith("/api/explore-new")