tt-help-cli-ycl 1.3.90 → 1.3.91

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "tt-help-cli-ycl",
3
- "version": "1.3.90",
3
+ "version": "1.3.91",
4
4
  "description": "TikTok user & video data scraper - extract ttSeller, verified, locationCreated from HTML source",
5
5
  "type": "module",
6
6
  "bin": {
package/src/cli/tag.js CHANGED
@@ -472,61 +472,64 @@ export async function handleScoreAll(parsed) {
472
472
 
473
473
  try {
474
474
  while (true) {
475
- // 从服务端取下一个 new 标签
476
- const tagsRes = await fetch(`${baseUrl}/api/tags?status=new&limit=1`);
477
- const tagsData = await tagsRes.json();
478
- if (!tagsData.tags || tagsData.tags.length === 0) {
479
- emptyRounds++;
480
-
481
- // 自动发现:连续 N 轮无任务时自动生成标签
482
- if (autoDiscover && emptyRounds >= DISCOVER_AFTER_EMPTY) {
483
- log(
484
- `🔍 连续 ${emptyRounds} 轮无待打分标签,自动为 ${targetCountries.length} 个国家生成标签...`,
485
- );
486
- for (const country of targetCountries) {
487
- try {
488
- const discRes = await fetch(
489
- `${baseUrl}/api/tags/discover?country=${country}&count=5`,
490
- );
491
- const discData = await discRes.json();
492
- if (discData.inserted) {
493
- log(` ${country}: 新增 ${discData.inserted} 个`);
475
+ try {
476
+ // 从服务端取下一个 new 标签
477
+ const tagsRes = await fetch(`${baseUrl}/api/tags?status=new&limit=1`);
478
+ const tagsData = await tagsRes.json();
479
+ if (!tagsData.tags || tagsData.tags.length === 0) {
480
+ emptyRounds++;
481
+
482
+ // 自动发现:连续 N 轮无任务时自动生成标签
483
+ if (autoDiscover && emptyRounds >= DISCOVER_AFTER_EMPTY) {
484
+ log(
485
+ `🔍 连续 ${emptyRounds} 轮无待打分标签,自动为 ${targetCountries.length} 个国家生成标签...`,
486
+ );
487
+ for (const country of targetCountries) {
488
+ try {
489
+ const discRes = await fetch(
490
+ `${baseUrl}/api/tags/discover?country=${country}&count=5`,
491
+ );
492
+ const discData = await discRes.json();
493
+ if (discData.inserted) {
494
+ log(` ${country}: 新增 ${discData.inserted} 个`);
495
+ }
496
+ } catch (e) {
497
+ log(` ${country}: 请求失败 (${e.message})`);
494
498
  }
495
- } catch (e) {
496
- log(` ${country}: 请求失败 (${e.message})`);
497
499
  }
500
+ emptyRounds = 0; // 重置计数器
501
+ // 等 3 秒让服务端处理完
502
+ await new Promise((r) => setTimeout(r, 3000));
503
+ continue;
498
504
  }
499
- emptyRounds = 0; // 重置计数器
500
- // 3 秒让服务端处理完
501
- await new Promise((r) => setTimeout(r, 3000));
505
+ log(`⏳ 暂无待打分标签(连续 ${emptyRounds} 轮),10 秒后重试...`);
506
+ await new Promise((r) => setTimeout(r, 10000));
502
507
  continue;
503
508
  }
504
- log(`⏳ 暂无待打分标签(连续 ${emptyRounds} 轮),10 秒后重试...`);
505
- await new Promise((r) => setTimeout(r, 10000));
506
- continue;
507
- }
508
-
509
- // 有任务了,重置计数器
510
- emptyRounds = 0;
511
-
512
- const tag = tagsData.tags[0].tag.replace(/^#+/, "").trim().toLowerCase();
513
- const startTime = Date.now();
514
509
 
515
- log(`[${totalScored + 1}] 正在打分 #${tag} ...`);
510
+ // 有任务了,重置计数器
511
+ emptyRounds = 0;
512
+
513
+ const tag = tagsData.tags[0].tag
514
+ .replace(/^#+/, "")
515
+ .trim()
516
+ .toLowerCase();
517
+ const startTime = Date.now();
518
+
519
+ log(`[${totalScored + 1}] 正在打分 #${tag} ...`);
520
+
521
+ const result = {
522
+ tag,
523
+ status: "error",
524
+ score: 0,
525
+ totalPosts: 0,
526
+ authorCount: 0,
527
+ matchedAuthors: 0,
528
+ matchedCountries: [],
529
+ pushedUsers: 0,
530
+ error: null,
531
+ };
516
532
 
517
- const result = {
518
- tag,
519
- status: "error",
520
- score: 0,
521
- totalPosts: 0,
522
- authorCount: 0,
523
- matchedAuthors: 0,
524
- matchedCountries: [],
525
- pushedUsers: 0,
526
- error: null,
527
- };
528
-
529
- try {
530
533
  // 锁定 tag
531
534
  const claimRes = await fetch(`${baseUrl}/api/tags/claim`, {
532
535
  method: "POST",
@@ -624,10 +627,28 @@ export async function handleScoreAll(parsed) {
624
627
  );
625
628
  log("");
626
629
  } catch (e) {
630
+ // 区分网络错误和业务错误
631
+ const isNetworkError =
632
+ e.code === "ECONNREFUSED" ||
633
+ e.code === "ENOTFOUND" ||
634
+ e.code === "ECONNRESET" ||
635
+ (e.message &&
636
+ (e.message.includes("ECONNREFUSED") ||
637
+ e.message.includes("fetch failed") ||
638
+ e.message.includes("network")));
639
+ if (isNetworkError) {
640
+ log(` ⚠️ 服务端连接失败 (${e.message}),15 秒后重试...`);
641
+ await new Promise((r) => setTimeout(r, 15000));
642
+ continue;
643
+ }
627
644
  log(` ❌ 失败: ${e.message}`);
628
- result.error = e.message;
629
645
  try {
630
- await reportToServer(baseUrl, result);
646
+ await reportToServer(baseUrl, {
647
+ tag: "",
648
+ status: "error",
649
+ score: 0,
650
+ error: e.message,
651
+ });
631
652
  } catch {}
632
653
  totalScored++;
633
654
  }
@@ -2223,6 +2223,58 @@ export function createStore(filePath, options = {}) {
2223
2223
  if (filePath) {
2224
2224
  // 初始化 SQLite 用户表(用于判重)
2225
2225
  initUserDb(filePath);
2226
+ // 从数据库恢复偏移量
2227
+ loadLlmSampleOffsets();
2228
+ }
2229
+
2230
+ /**
2231
+ * 从数据库加载 LLM 采样偏移量
2232
+ */
2233
+ function loadLlmSampleOffsets() {
2234
+ try {
2235
+ const row = db
2236
+ .prepare(`SELECT offsets FROM _llm_sample_offsets LIMIT 1`)
2237
+ .get();
2238
+ if (row && row.offsets) {
2239
+ const parsed = JSON.parse(row.offsets);
2240
+ if (parsed && typeof parsed === "object") {
2241
+ Object.entries(parsed).forEach(([k, v]) => {
2242
+ llmSampleOffsets.set(k, v);
2243
+ });
2244
+ console.error(
2245
+ `[data-store] 已恢复 LLM 采样偏移量: ${Array.from(
2246
+ llmSampleOffsets.entries(),
2247
+ )
2248
+ .map(([k, v]) => `${k}:${v}`)
2249
+ .join(", ")}`,
2250
+ );
2251
+ }
2252
+ }
2253
+ } catch (e) {
2254
+ // 表不存在或解析失败,使用空偏移量
2255
+ console.error(
2256
+ `[data-store] 加载 LLM 采样偏移量失败,使用空偏移量: ${e.message}`,
2257
+ );
2258
+ }
2259
+ }
2260
+
2261
+ /**
2262
+ * 将 LLM 采样偏移量持久化到数据库
2263
+ */
2264
+ function saveLlmSampleOffsets() {
2265
+ try {
2266
+ const offsetsJson = JSON.stringify(Object.fromEntries(llmSampleOffsets));
2267
+ // 表不存在则创建
2268
+ db.prepare(
2269
+ `CREATE TABLE IF NOT EXISTS _llm_sample_offsets (id INTEGER PRIMARY KEY CHECK (id = 1), offsets TEXT)`,
2270
+ ).run();
2271
+ // 插入或更新
2272
+ db.prepare(
2273
+ `INSERT OR REPLACE INTO _llm_sample_offsets (id, offsets) VALUES (1, ?)`,
2274
+ ).run(offsetsJson);
2275
+ } catch (e) {
2276
+ console.error(`[data-store] 保存 LLM 采样偏移量失败: ${e.message}`);
2277
+ }
2226
2278
  }
2227
2279
 
2228
2280
  // stats 缓存
@@ -2383,15 +2435,97 @@ export function createStore(filePath, options = {}) {
2383
2435
  }
2384
2436
 
2385
2437
  function flushSave() {
2438
+ // Database mode: persist the LLM sample offsets (no backup is performed here; stopBackup() is what calls backupDatabase())
2439
+ if (db && dbPath) {
2440
+ try {
2441
+ saveLlmSampleOffsets();
2442
+ } catch (e) {
2443
+ console.error(`[data-store] 保存 LLM 偏移量失败: ${e.message}`);
2444
+ }
2445
+ }
2386
2446
  return Promise.resolve();
2387
2447
  }
2388
2448
 
2389
- function saveVideos() {
2390
- return;
2449
+ /**
2450
+ * 数据库备份:使用 SQLite BACKUP 命令,保留最新 maxBackups 个备份
2451
+ * @param {number} maxBackups - 保留的备份数量,默认 3
2452
+ * @returns {string|null} 备份文件路径,失败返回 null
2453
+ */
2454
+ function backupDatabase(maxBackups = 3) {
2455
+ if (!db || !dbPath) {
2456
+ console.error("[data-store] 数据库未初始化,跳过备份");
2457
+ return null;
2458
+ }
2459
+
2460
+ try {
2461
+ // Build the backup file name: <baseName>-<15-digit UTC timestamp>.db, e.g. result-202606270944001.db
2462
+ const now = new Date();
2463
+ const timestamp = now
2464
+ .toISOString()
2465
+ .replace(/[-:T.]/g, "")
2466
+ .slice(0, 15); // YYYYMMDDHHmmss plus the first millisecond digit (15 digits, matching \d{15} in cleanupOldBackups)
2467
+ const baseName = path.basename(dbPath, ".db");
2468
+ const backupName = `${baseName}-${timestamp}.db`;
2469
+ const backupDir = path.dirname(dbPath);
2470
+ const backupPath = path.join(backupDir, backupName);
2471
+
2472
+ console.error(`[data-store] 正在备份数据库: ${backupName}`);
2473
+
2474
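+ // Uses the better-sqlite3 backup API. NOTE(review): the documented signature is db.backup(destinationPath[, options]) and it returns a Promise — confirm the call below really writes to backupPath and finishes before backupDb.close()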
2475
+ const backupDb = new Database(backupPath);
2476
+ db.backup("main", backupDb, "main");
2477
+ backupDb.close();
2478
+
2479
+ // 验证备份文件大小
2480
+ const stat = fs.statSync(backupPath);
2481
+ const sizeMB = (stat.size / 1024 / 1024).toFixed(2);
2482
+ console.error(`[data-store] 备份完成: ${backupName} (${sizeMB} MB)`);
2483
+
2484
+ // 清理旧备份:保留最新 maxBackups 个
2485
+ cleanupOldBackups(backupDir, baseName, maxBackups);
2486
+
2487
+ return backupPath;
2488
+ } catch (e) {
2489
+ console.error(`[data-store] 备份失败: ${e.message}`);
2490
+ return null;
2491
+ }
2492
+ }
2493
+
2494
+ /**
2495
+ * 清理旧备份文件,保留最新 maxBackups 个
2496
+ */
2497
+ function cleanupOldBackups(backupDir, baseName, maxBackups) {
2498
+ try {
2499
+ // Find all backup files: baseName-<15 digits>.db (YYYYMMDDHHmmss plus one millisecond digit)
2500
+ const pattern = new RegExp(`^${baseName}-\\d{15}\\.db$`);
2501
+ const backups = fs
2502
+ .readdirSync(backupDir)
2503
+ .filter((f) => pattern.test(f))
2504
+ .sort() // 按时间戳排序(ASCII 排序 = 时间排序)
2505
+ .reverse(); // 最新的在前
2506
+
2507
+ if (backups.length > maxBackups) {
2508
+ const toDelete = backups.slice(maxBackups);
2509
+ for (const file of toDelete) {
2510
+ const filePath = path.join(backupDir, file);
2511
+ fs.unlinkSync(filePath);
2512
+ console.error(`[data-store] 已清理旧备份: ${file}`);
2513
+ }
2514
+ }
2515
+
2516
+ console.error(
2517
+ `[data-store] 备份清理完成: 保留 ${Math.min(backups.length, maxBackups)} / ${backups.length} 个备份`,
2518
+ );
2519
+ } catch (e) {
2520
+ console.error(`[data-store] 清理旧备份失败: ${e.message}`);
2521
+ }
2391
2522
  }
2392
2523
 
2393
2524
  function stopBackup() {
2394
- return;
2525
+ // 退出时执行备份
2526
+ if (db && dbPath) {
2527
+ backupDatabase();
2528
+ }
2395
2529
  }
2396
2530
 
2397
2531
  function getUser(uid) {
@@ -2795,7 +2929,9 @@ Standards: 90-100=clear match, 70-89=likely, 50-69=possible, 20-49=low, 0-19=unl
2795
2929
  .prepare(
2796
2930
  `
2797
2931
  SELECT * FROM raw_jobs WHERE ${whereSql} AND guessed_location = ?
2798
- ORDER BY COALESCE(video_count, 0) DESC, created_at DESC
2932
+ ORDER BY
2933
+ CASE WHEN sources LIKE '%tag%' THEN 0 ELSE 1 END,
2934
+ COALESCE(video_count, 0) DESC, created_at DESC
2799
2935
  LIMIT ? OFFSET ?
2800
2936
  `,
2801
2937
  )
@@ -2803,11 +2939,32 @@ Standards: 90-100=clear match, 70-89=likely, 50-69=possible, 20-49=low, 0-19=unl
2803
2939
 
2804
2940
  if (samples.length === 0) break;
2805
2941
 
2806
- const scores = await scoreJobsBatch(
2807
- samples,
2808
- DEFAULT_TARGET_LOCATIONS,
2942
+ // 分离 tag 来源和非 tag 来源:tag 来源跳过 LLM 打分直接合格
2943
+ const tagSamples = samples.filter((s) =>
2944
+ (s.sources || "").includes("tag"),
2809
2945
  );
2810
- const batchQualified = scores.filter((s) => s.score >= llmMinScore);
2946
+ const nonTagSamples = samples.filter(
2947
+ (s) => !(s.sources || "").includes("tag"),
2948
+ );
2949
+
2950
+ // tag 来源直接加入合格列表
2951
+ if (tagSamples.length > 0) {
2952
+ allQualified.push(...tagSamples.map((s) => s.unique_id));
2953
+ console.error(
2954
+ `[data-store] ${location}: 本批 ${tagSamples.length} 条 tag 来源任务跳过 LLM 打分直接合格`,
2955
+ );
2956
+ }
2957
+
2958
+ // 非 tag 来源走 LLM 打分
2959
+ let batchQualified = [];
2960
+ let scores = [];
2961
+ if (nonTagSamples.length > 0) {
2962
+ scores = await scoreJobsBatch(
2963
+ nonTagSamples,
2964
+ DEFAULT_TARGET_LOCATIONS,
2965
+ );
2966
+ batchQualified = scores.filter((s) => s.score >= llmMinScore);
2967
+ }
2811
2968
 
2812
2969
  allScores.push(...scores);
2813
2970
  allQualified.push(...batchQualified.map((s) => s.uniqueId));
@@ -2829,12 +2986,27 @@ Standards: 90-100=clear match, 70-89=likely, 50-69=possible, 20-49=low, 0-19=unl
2829
2986
  if (allQualified.length >= llmMinReturn) break;
2830
2987
  }
2831
2988
 
2832
- // 按分数降序排序,取前 safeLimit
2833
- const qualifiedScores = allScores
2989
+ // 分离 tag 合格和非 tag 合格
2990
+ // tag 任务直接合格(不在 allScores 中),非 tag 任务走 LLM 打分
2991
+ const tagQualified = allQualified.filter(
2992
+ (uid) => !allScores.find((s) => s.uniqueId === uid),
2993
+ );
2994
+ const nonTagQualifiedScores = allScores
2834
2995
  .filter((s) => s.score >= llmMinScore)
2835
- .sort((a, b) => b.score - a.score)
2836
- .slice(0, safeLimit);
2837
- const qualified = qualifiedScores.map((s) => s.uniqueId);
2996
+ .sort((a, b) => b.score - a.score);
2997
+ const nonTagQualified = nonTagQualifiedScores.map((s) => s.uniqueId);
2998
+
2999
+ // 限制 tag 占比:最多占 safeLimit 的 70%,留 30% 给非 tag
3000
+ const tagMaxCount = Math.floor(safeLimit * 0.7);
3001
+ const tagCount = Math.min(tagQualified.length, tagMaxCount);
3002
+ const nonTagMaxCount = safeLimit - tagCount;
3003
+ const finalNonTagQualified = nonTagQualified.slice(0, nonTagMaxCount);
3004
+
3005
+ // 最终合格列表:tag 优先 + 非 tag 按分数排序
3006
+ const qualified = [
3007
+ ...tagQualified.slice(0, tagCount),
3008
+ ...finalNonTagQualified,
3009
+ ];
2838
3010
 
2839
3011
  if (!qualified.length) {
2840
3012
  console.error(
@@ -2881,6 +3053,9 @@ Standards: 90-100=clear match, 70-89=likely, 50-69=possible, 20-49=low, 0-19=unl
2881
3053
  moveTxn();
2882
3054
  markStatsDirty();
2883
3055
 
3056
+ // 持久化偏移量到数据库
3057
+ saveLlmSampleOffsets();
3058
+
2884
3059
  // 打印最终偏移量状态
2885
3060
  const finalOffsetSummary = Array.from(llmSampleOffsets.entries())
2886
3061
  .map(([k, v]) => `${k}:${v}`)
@@ -2920,7 +3095,9 @@ Standards: 90-100=clear match, 70-89=likely, 50-69=possible, 20-49=low, 0-19=unl
2920
3095
  status_code, latest_video_time, user_create_time
2921
3096
  FROM raw_jobs
2922
3097
  WHERE ${whereSql}
2923
- ORDER BY COALESCE(video_count, 0) DESC, created_at DESC
3098
+ ORDER BY
3099
+ CASE WHEN sources LIKE '%tag%' THEN 0 ELSE 1 END,
3100
+ COALESCE(video_count, 0) DESC, created_at DESC
2924
3101
  LIMIT ?
2925
3102
  `,
2926
3103
  ).run(...args, safeLimit);
@@ -2932,7 +3109,9 @@ Standards: 90-100=clear match, 70-89=likely, 50-69=possible, 20-49=low, 0-19=unl
2932
3109
  WHERE unique_id IN (
2933
3110
  SELECT unique_id FROM raw_jobs
2934
3111
  WHERE ${whereSql}
2935
- ORDER BY COALESCE(video_count, 0) DESC, created_at DESC
3112
+ ORDER BY
3113
+ CASE WHEN sources LIKE '%tag%' THEN 0 ELSE 1 END,
3114
+ COALESCE(video_count, 0) DESC, created_at DESC
2936
3115
  LIMIT ?
2937
3116
  )
2938
3117
  `,
@@ -4274,7 +4453,12 @@ Standards: 90-100=clear match, 70-89=likely, 50-69=possible, 20-49=low, 0-19=unl
4274
4453
  sqlParams.push(...targetCountries);
4275
4454
  }
4276
4455
 
4277
- sql += ` ORDER BY created_at ASC, unique_id ASC LIMIT ?`;
4456
+ // 优先级:sources 包含 "tag" 的任务优先,其余按 created_at 排序
4457
+ sql += ` ORDER BY
4458
+ CASE WHEN sources LIKE '%tag%' THEN 0 ELSE 1 END,
4459
+ created_at ASC,
4460
+ unique_id ASC
4461
+ LIMIT ?`;
4278
4462
  sqlParams.push(l);
4279
4463
 
4280
4464
  const rows = db.prepare(sql).all(...sqlParams);
@@ -4322,6 +4506,13 @@ Standards: 90-100=clear match, 70-89=likely, 50-69=possible, 20-49=low, 0-19=unl
4322
4506
  }
4323
4507
  return false;
4324
4508
  })
4509
+ .sort((a, b) => {
4510
+ // 优先级:sources 包含 "tag" 的任务优先
4511
+ const aIsTag = (a.sources || "").includes("tag");
4512
+ const bIsTag = (b.sources || "").includes("tag");
4513
+ if (aIsTag !== bIsTag) return aIsTag ? -1 : 1;
4514
+ return (a.createdAt || 0) - (b.createdAt || 0);
4515
+ })
4325
4516
  .slice(0, l);
4326
4517
  // 接受任务时 userUpdateCount + 1
4327
4518
  pending.forEach((u) => {
@@ -4835,6 +5026,7 @@ Standards: 90-100=clear match, 70-89=likely, 50-69=possible, 20-49=low, 0-19=unl
4835
5026
  commitCommentTask,
4836
5027
  debugClaimNextJob,
4837
5028
  stopBackup,
5029
+ backupDatabase, // 手动备份数据库
4838
5030
  rawQuery,
4839
5031
  getLlmSampleOffsets, // 获取 LLM 采样偏移量状态
4840
5032
  // Tag 发现与打分
@@ -1250,7 +1250,10 @@ export function startWatchServer(
1250
1250
  console.error("[server] HTTP 服务已关闭");
1251
1251
  });
1252
1252
  await store.flushSave();
1253
- console.error("[server] 数据已保存,退出");
1253
+ console.error("[server] 数据已保存");
1254
+ // 备份数据库
1255
+ store.stopBackup();
1256
+ console.error("[server] 退出");
1254
1257
  process.exit(0);
1255
1258
  }
1256
1259