tt-help-cli-ycl 1.3.80 → 1.3.82

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,10 @@
1
1
  import fs from "fs";
2
2
  import path from "path";
3
3
  import Database from "better-sqlite3";
4
- import { isLocationInList } from "../lib/target-locations.js";
4
+ import {
5
+ isLocationInList,
6
+ DEFAULT_TARGET_LOCATIONS,
7
+ } from "../lib/target-locations.js";
5
8
 
6
9
  // SQLite 用户表(用于判重)
7
10
  let db = null;
@@ -143,6 +146,12 @@ function initUserDb(filePath) {
143
146
  if (!existingJobColumns.has("bio_link")) {
144
147
  db.exec(`ALTER TABLE jobs ADD COLUMN bio_link TEXT`);
145
148
  }
149
+ if (!existingJobColumns.has("top_video_play_count")) {
150
+ db.exec(`ALTER TABLE jobs ADD COLUMN top_video_play_count INTEGER`);
151
+ }
152
+ if (!existingJobColumns.has("top_video_href")) {
153
+ db.exec(`ALTER TABLE jobs ADD COLUMN top_video_href TEXT`);
154
+ }
146
155
  db.exec(`
147
156
  CREATE TABLE IF NOT EXISTS jobs_base (
148
157
  unique_id TEXT PRIMARY KEY,
@@ -650,13 +659,24 @@ function getDashboardStatsFromDb(targetLocations = []) {
650
659
  AND instr(COALESCE(sources, ''), '"guess"') = 0
651
660
  AND instr(COALESCE(sources, ''), '"following"') = 0
652
661
  AND instr(COALESCE(sources, ''), '"follower"') = 0
653
- THEN 1 ELSE 0 END) as seed,
654
- SUM(CASE WHEN COALESCE(tt_seller, '') = '' AND COALESCE(user_update_count, 0) <= 0 THEN 1 ELSE 0 END) as userUpdateTasks
662
+ THEN 1 ELSE 0 END) as seed
655
663
  FROM jobs
656
664
  `,
657
665
  )
658
666
  .get(...targetParams);
659
667
 
668
+ // userUpdateTasks 单独从 jobs_base 统计
669
+ const userUpdateTasksRow = db
670
+ .prepare(
671
+ `
672
+ SELECT COUNT(*) as userUpdateTasks
673
+ FROM jobs_base
674
+ WHERE COALESCE(tt_seller, '') = ''
675
+ AND COALESCE(user_update_count, 0) <= 0
676
+ `,
677
+ )
678
+ .get();
679
+
660
680
  // countryStats 和 targetCountryStats 需要 GROUP BY,保留为独立查询
661
681
  const countryStats = db
662
682
  .prepare(
@@ -712,7 +732,7 @@ function getDashboardStatsFromDb(targetLocations = []) {
712
732
  restrictedUsers: aggregateRow.restricted,
713
733
  errorUsers: aggregateRow.error,
714
734
  targetUsers: aggregateRow.targetUsers,
715
- userUpdateTasks: aggregateRow.userUpdateTasks,
735
+ userUpdateTasks: userUpdateTasksRow.userUpdateTasks,
716
736
  targetCountryStats,
717
737
  countryStats,
718
738
  sourceStats: {
@@ -761,8 +781,8 @@ function getUserUpdateByCountryFromDb() {
761
781
  SELECT
762
782
  COALESCE(guessed_location, '未知') as country,
763
783
  COUNT(*) as count
764
- FROM jobs
765
- WHERE COALESCE(tt_seller, '') = ''
784
+ FROM jobs_base
785
+ WHERE tt_seller IS NULL
766
786
  AND COALESCE(user_update_count, 0) <= 0
767
787
  GROUP BY COALESCE(guessed_location, '未知')
768
788
  ORDER BY count DESC
@@ -782,8 +802,8 @@ function getAttachStuckByCountryFromDb() {
782
802
  SELECT
783
803
  COALESCE(guessed_location, '未知') as country,
784
804
  COUNT(*) as count
785
- FROM jobs
786
- WHERE COALESCE(tt_seller, '') = ''
805
+ FROM jobs_base
806
+ WHERE tt_seller IS NULL
787
807
  AND COALESCE(user_update_count, 0) = 1
788
808
  GROUP BY COALESCE(guessed_location, '未知')
789
809
  ORDER BY count DESC
@@ -816,7 +836,7 @@ function restoreAttachStuckByCountry(country) {
816
836
  .prepare(
817
837
  `
818
838
  SELECT COUNT(*) as c
819
- FROM jobs
839
+ FROM jobs_base
820
840
  WHERE ${whereSql}
821
841
  `,
822
842
  )
@@ -828,7 +848,7 @@ function restoreAttachStuckByCountry(country) {
828
848
 
829
849
  db.prepare(
830
850
  `
831
- UPDATE jobs
851
+ UPDATE jobs_base
832
852
  SET user_update_count = 0,
833
853
  updated_at = ?,
834
854
  claimed_by = NULL,
@@ -920,11 +940,36 @@ function moveJobsToRawByCountry(scope, country) {
920
940
  };
921
941
  }
922
942
 
943
+ // pending 操作 jobs 表(与 getPendingByCountryFromDb 数据源一致)
944
+ // userUpdate 操作 jobs_base 表(与 getUserUpdateByCountryFromDb 数据源一致)
945
+ let sourceTable = "";
923
946
  let scopeWhere = "";
947
+ let columns = "";
948
+
924
949
  if (normalizedScope === "pending") {
925
- scopeWhere = `status = 'pending' AND COALESCE(user_update_count, 0) >= 2`;
950
+ sourceTable = "jobs";
951
+ scopeWhere = `status = 'pending'`;
952
+ columns = `
953
+ unique_id, nickname, status, sources, claimed_by, claimed_at,
954
+ error, pinned, no_video, restricted, user_update_count,
955
+ tt_seller, verified, video_count, comment_count,
956
+ guessed_location, location_created, follower_count,
957
+ following_count, heart_count, refresh_time, processed,
958
+ processed_at, created_at, updated_at, region, signature,
959
+ sec_uid, latest_video_time
960
+ `;
926
961
  } else if (normalizedScope === "userUpdate") {
927
- scopeWhere = `COALESCE(tt_seller, '') = '' AND COALESCE(user_update_count, 0) <= 0`;
962
+ sourceTable = "jobs_base";
963
+ scopeWhere = `tt_seller IS NULL AND COALESCE(user_update_count, 0) <= 0`;
964
+ columns = `
965
+ unique_id, nickname, status, sources, claimed_by, claimed_at,
966
+ error, pinned, no_video, restricted, user_update_count,
967
+ tt_seller, verified, video_count, comment_count,
968
+ guessed_location, location_created, follower_count,
969
+ following_count, heart_count, refresh_time, processed,
970
+ processed_at, created_at, updated_at, region, signature,
971
+ sec_uid, latest_video_time
972
+ `;
928
973
  } else {
929
974
  return {
930
975
  moved: 0,
@@ -943,7 +988,7 @@ function moveJobsToRawByCountry(scope, country) {
943
988
  .prepare(
944
989
  `
945
990
  SELECT COUNT(*) as c
946
- FROM jobs
991
+ FROM ${sourceTable}
947
992
  WHERE ${whereSql}
948
993
  `,
949
994
  )
@@ -957,74 +1002,18 @@ function moveJobsToRawByCountry(scope, country) {
957
1002
  db.prepare(
958
1003
  `
959
1004
  INSERT OR REPLACE INTO raw_jobs (
960
- unique_id,
961
- nickname,
962
- status,
963
- sources,
964
- claimed_by,
965
- claimed_at,
966
- error,
967
- pinned,
968
- no_video,
969
- restricted,
970
- user_update_count,
971
- tt_seller,
972
- verified,
973
- video_count,
974
- comment_count,
975
- guessed_location,
976
- location_created,
977
- follower_count,
978
- following_count,
979
- heart_count,
980
- refresh_time,
981
- processed,
982
- processed_at,
983
- created_at,
984
- updated_at,
985
- region,
986
- signature,
987
- sec_uid,
988
- latest_video_time
1005
+ ${columns}
989
1006
  )
990
1007
  SELECT
991
- unique_id,
992
- nickname,
993
- status,
994
- sources,
995
- claimed_by,
996
- claimed_at,
997
- error,
998
- pinned,
999
- no_video,
1000
- restricted,
1001
- user_update_count,
1002
- tt_seller,
1003
- verified,
1004
- video_count,
1005
- comment_count,
1006
- guessed_location,
1007
- location_created,
1008
- follower_count,
1009
- following_count,
1010
- heart_count,
1011
- refresh_time,
1012
- processed,
1013
- processed_at,
1014
- created_at,
1015
- updated_at,
1016
- region,
1017
- signature,
1018
- sec_uid,
1019
- latest_video_time
1020
- FROM jobs
1008
+ ${columns}
1009
+ FROM ${sourceTable}
1021
1010
  WHERE ${whereSql}
1022
1011
  `,
1023
1012
  ).run(targetCountry);
1024
1013
 
1025
1014
  db.prepare(
1026
1015
  `
1027
- DELETE FROM jobs
1016
+ DELETE FROM ${sourceTable}
1028
1017
  WHERE ${whereSql}
1029
1018
  `,
1030
1019
  ).run(targetCountry);
@@ -1527,6 +1516,8 @@ function getTargetUsersByCountryFromDb(targetLocations = [], options = {}) {
1527
1516
  modified_at,
1528
1517
  latest_video_time,
1529
1518
  refresh_time,
1519
+ top_video_play_count,
1520
+ top_video_href,
1530
1521
  status,
1531
1522
  sources
1532
1523
  FROM jobs
@@ -1675,6 +1666,8 @@ const writableJobColumns = new Set([
1675
1666
  "sec_uid",
1676
1667
  "status_code",
1677
1668
  "latest_video_time",
1669
+ "top_video_play_count",
1670
+ "top_video_href",
1678
1671
  ]);
1679
1672
 
1680
1673
  function normalizeJobValue(column, value) {
@@ -1718,6 +1711,13 @@ function getJobRow(uniqueId) {
1718
1711
  return db.prepare("SELECT * FROM jobs WHERE unique_id = ?").get(uniqueId);
1719
1712
  }
1720
1713
 
1714
/**
 * Fetch a single row of the `jobs_base` table by its `unique_id`.
 *
 * @param {string} uniqueId - Value matched against the `unique_id` column.
 * @returns {Object|null|undefined} The matching row, `null` when the
 *   database handle has not been initialised, or whatever the prepared
 *   statement's `.get()` yields when no row matches.
 */
function getJobBaseRow(uniqueId) {
  if (db) {
    const lookup = db.prepare("SELECT * FROM jobs_base WHERE unique_id = ?");
    return lookup.get(uniqueId);
  }
  return null;
}
1720
+
1721
1721
  function getJob(uniqueId) {
1722
1722
  return mapJobRow(getJobRow(uniqueId));
1723
1723
  }
@@ -1795,6 +1795,43 @@ function inferStatus(u) {
1795
1795
  return "pending";
1796
1796
  }
1797
1797
 
1798
/**
 * Apply a partial profile update to one row of `jobs_base`.
 *
 * Each key of `info` is converted with `camelToSnake`; `bio` is treated as an
 * alias of `signature`; columns outside `writableJobColumns` are dropped.
 * `undefined` and empty-string values are skipped so they never overwrite
 * stored data. `updated_at` is always refreshed.
 *
 * @param {string} uniqueId - `unique_id` of the row to update.
 * @param {Object} info - Field/value pairs (camelCase or snake_case keys).
 * @param {boolean} [incrementCount=true] - When true, `user_update_count`
 *   is set to the stored value plus one.
 * @returns {{ ok: true, userUpdateCount: number } | { error: string }}
 *   Success marker with the resulting counter, or an error descriptor when
 *   the database is not ready or the row does not exist.
 */
function updateJobBaseInfo(uniqueId, info, incrementCount = true) {
  if (!db) {
    return { error: "db not initialized" };
  }

  const current = getJobBaseRow(uniqueId);
  if (!current) {
    return { error: "user not found" };
  }

  const pending = {};
  Object.entries(info || {}).forEach(([field, raw]) => {
    const isIdentifier = field === "uniqueId" || field === "unique_id";
    if (isIdentifier || raw === undefined || raw === "") {
      return;
    }
    const snake = camelToSnake(field);
    // Field alias: bio -> signature.
    const target = snake === "bio" ? "signature" : snake;
    if (writableJobColumns.has(target)) {
      pending[target] = normalizeJobValue(target, raw);
    }
  });

  pending.updated_at = Date.now();
  if (incrementCount) {
    pending.user_update_count = (current.user_update_count || 0) + 1;
  }

  const names = Object.keys(pending);
  if (names.length > 0) {
    const assignments = names.map((name) => `${name} = ?`).join(", ");
    const params = names.map((name) => pending[name]);
    db.prepare(
      `UPDATE jobs_base SET ${assignments} WHERE unique_id = ?`,
    ).run(...params, uniqueId);
  }

  return {
    ok: true,
    userUpdateCount:
      pending.user_update_count ?? current.user_update_count ?? 0,
  };
}
1834
+
1798
1835
  function addJobBaseToDb(user) {
1799
1836
  if (!db) return;
1800
1837
  const now = Date.now();
@@ -1895,16 +1932,30 @@ function addJob(user) {
1895
1932
  writeTxn(user);
1896
1933
  }
1897
1934
 
1898
- export function createStore(filePath) {
1935
+ export function createStore(filePath, options = {}) {
1899
1936
  if (!filePath) {
1900
1937
  throw new Error("createStore requires an explicit .db path");
1901
1938
  }
1939
+
1940
+ // refillJobsFromRaw 的 LLM 打分配置(自动补充任务时使用)
1941
+ const refillLlmConfig = {
1942
+ llmScore: false,
1943
+ llmMinScore: 60,
1944
+ llmSampleSize: 100,
1945
+ ...options.refillLlm,
1946
+ };
1947
+
1902
1948
  let data = [];
1903
1949
  // uniqueId → index 内存索引,O(1) 查找
1904
1950
  let uidIndex = new Map();
1905
1951
  let clientErrors = new Map();
1906
1952
  // 客户端登录状态:userId → boolean
1907
1953
  let clientLoginStatus = new Map();
1954
+ // refill 锁:防止多个 claimNextJob 同时触发 LLM refill
1955
+ let refillLock = null; // Promise | null
1956
+ // LLM 采样偏移量记忆:按猜测国家记录上次查询位置,避免重复采样
1957
+ // 格式: { "ES": 300, "PL": 500, "NL": 400 }
1958
+ let llmSampleOffsets = new Map();
1908
1959
  if (filePath) {
1909
1960
  // 初始化 SQLite 用户表(用于判重)
1910
1961
  initUserDb(filePath);
@@ -2162,7 +2213,439 @@ export function createStore(filePath) {
2162
2213
  return data;
2163
2214
  }
2164
2215
 
2165
- function claimNextJob(
2216
/**
 * Extract a `{ score, reason }` payload from the raw text an LLM returned.
 *
 * Tiers, tried in order until one yields a value:
 *   1. the whole text parsed as JSON;
 *   2. the outermost `{ ... }` span parsed as JSON;
 *   3. that span after curly quotes are straightened and whitespace collapsed;
 *   4. regex extraction of `score` / `reason` (reason may hold stray quotes);
 *   5. strict regex extraction, used when no `{ ... }` span exists at all.
 *
 * @param {string} content - Raw assistant message text.
 * @returns {*} The parsed value (normally an object with `score`/`reason`),
 *   or `null` when no tier could extract anything.
 */
function parseLlmScoreContent(content) {
  const tryJson = (text) => {
    try {
      return { parsed: true, value: JSON.parse(text) };
    } catch {
      // Not valid JSON at this tier; the caller falls through to the next one.
      return { parsed: false, value: null };
    }
  };

  // Tier 1: direct parse.
  const direct = tryJson(content);
  if (direct.parsed) return direct.value;

  const braced = content.match(/\{[\s\S]*\}/);
  if (braced) {
    // Tier 2: parse only the {...} span.
    const inner = tryJson(braced[0]);
    if (inner.parsed) return inner.value;

    // Tier 3: repair common formatting problems, then parse.
    const cleaned = braced[0]
      .replace(/[\u201C\u201D]/g, '"') // curly double quotes -> straight quotes
      .replace(/\s+/g, " ") // collapse redundant whitespace
      .trim();
    const repaired = tryJson(cleaned);
    if (repaired.parsed) return repaired.value;

    // Tier 4: pull score and reason out of the text directly.
    const looseScore = content.match(/"?score"?\s*:\s*(\d+)/i);
    if (looseScore) {
      let reason = "解析降级";
      const reasonKeyPos = content.search(/"?reason"?\s*:\s*"/i);
      if (reasonKeyPos !== -1) {
        const afterKey = content.substring(reasonKeyPos);
        const colonPos = afterKey.indexOf(":");
        const valueStart = afterKey.indexOf('"', colonPos + 1) + 1;
        // The reason value runs up to the last "}" of the original text.
        const reasonEnd =
          content.lastIndexOf("}") - reasonKeyPos - valueStart;
        if (reasonEnd > 0) {
          reason = afterKey
            .substring(valueStart, valueStart + reasonEnd)
            .trim();
          // Strip one leading / trailing quote left over from the slice.
          if (reason.startsWith('"')) reason = reason.substring(1);
          if (reason.endsWith('"')) {
            reason = reason.substring(0, reason.length - 1);
          }
        }
      }
      // The regex guarantees digits, so a score of 0 must stay 0.
      return { score: Number.parseInt(looseScore[1], 10), reason };
    }
  }

  // Tier 5: strict regex extraction (e.g. output truncated before "}").
  const strictScore = content.match(/"score"\s*:\s*(\d+)/);
  if (strictScore) {
    const strictReason = content.match(/"reason"\s*:\s*"([^"]*)"/);
    return {
      score: Number.parseInt(strictScore[1], 10),
      reason: strictReason ? strictReason[1] : "解析降级 - 宽松模式",
    };
  }

  return null;
}

/**
 * Ask the LLM chat-completions endpoint how likely a job's owner is from any
 * of the target countries, on a 0-100 scale.
 *
 * Transport errors, non-2xx responses and unparseable replies are logged and
 * mapped to the neutral default score 50 instead of rejecting.
 *
 * @param {Object} job - A raw_jobs record; `unique_id`, `nickname`,
 *   `signature`, `region`, `guessed_location`, `location_created` and
 *   `bio_link` are interpolated into the prompt.
 * @param {string[]} targetLocations - Target country list shown to the model.
 * @returns {Promise<{ uniqueId: string, score: number, reason: string }>}
 *   Score clamped to [0, 100] plus the model's short justification.
 */
async function scoreJobLocation(job, targetLocations) {
  const prompt = `
你是一个 TikTok 用户数据分析助手。请根据以下用户信息,判断该用户是否来自以下**任意一个**目标国家。

目标国家列表: ${targetLocations.join(", ")}

重要:
- 用户只要来自上述**任意一个**国家就算匹配。
- guessed_location 是系统初步猜测的结果,**仅供参考**,不要完全依赖它。
- 请综合用户名、昵称、签名、位置等信息做判断。

用户信息:
- 用户名: ${job.unique_id || "未知"}
- 昵称: ${job.nickname || "未知"}
- 签名: ${job.signature || "未知"}
- 地区: ${job.region || "未知"}
- 猜测国家(参考): ${job.guessed_location || "未知"}
- 位置信息: ${job.location_created || "未知"}
- 主页链接: ${job.bio_link || "未知"}

返回 JSON(仅返回 JSON,无其他内容):
{"score": 0-100, "reason": "English only, under 50 chars, no quotes/brackets"}

Standards: 90-100=clear match, 70-89=likely, 50-69=possible, 20-49=low, 0-19=unlikely
`;

  try {
    // Prefer undici's fetch; fall back to the runtime's global fetch when the
    // package cannot be resolved, so a missing module does not fail the batch.
    let llmFetch;
    try {
      ({ fetch: llmFetch } = await import("undici"));
    } catch (importError) {
      if (typeof globalThis.fetch !== "function") throw importError;
      llmFetch = (...fetchArgs) => globalThis.fetch(...fetchArgs);
    }

    const apiKey = process.env.APIKEY || "";
    // NOTE(review): hard-coded plain-HTTP endpoint; the bearer token and the
    // user data in the prompt travel unencrypted. Consider HTTPS / config.
    const response = await llmFetch(
      "http://82.156.52.214:18000/v1/chat/completions",
      {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          Authorization: `Bearer ${apiKey}`,
        },
        body: JSON.stringify({
          model: "zc-fast",
          messages: [{ role: "user", content: prompt }],
          max_tokens: 512,
          temperature: 0.1,
        }),
      },
    );
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }

    const result = await response.json();
    const rawContent = result.choices?.[0]?.message?.content;
    const content = typeof rawContent === "string" ? rawContent : "";

    const parsed = parseLlmScoreContent(content);
    const rawScore =
      parsed && typeof parsed === "object" ? parsed.score : undefined;
    // Accept numeric strings such as "85"; reject everything non-finite.
    const score =
      typeof rawScore === "string" && rawScore.trim() !== ""
        ? Number(rawScore)
        : rawScore;

    if (typeof score === "number" && Number.isFinite(score)) {
      return {
        uniqueId: job.unique_id,
        score: Math.max(0, Math.min(100, score)),
        reason: typeof parsed.reason === "string" ? parsed.reason : "",
      };
    }

    // Every parsing tier failed: report the neutral default score.
    console.error(
      `[scoreJobLocation] JSON 解析失败 (${job.unique_id}): ${content.substring(0, 100)}`,
    );
    return {
      uniqueId: job.unique_id,
      score: 50,
      reason: "LLM 响应解析失败,使用默认分",
    };
  } catch (e) {
    console.error(
      `[scoreJobLocation] LLM 调用失败 (${job.unique_id}): ${e.message}`,
    );
    return {
      uniqueId: job.unique_id,
      score: 50,
      reason: `LLM 调用异常: ${e.message}`,
    };
  }
}
2365
+
2366
/**
 * Score a list of jobs with `scoreJobLocation`, running at most `batchSize`
 * requests concurrently and preserving input order in the output.
 *
 * @param {Object[]} jobs - raw_jobs records to score.
 * @param {string[]} targetLocations - Target country list passed to each call.
 * @param {number} [batchSize=10] - Concurrent requests per batch. Values that
 *   are not a number >= 1 fall back to 10 (a zero or negative step would
 *   otherwise keep the loop from ever advancing).
 * @returns {Promise<Array<{ uniqueId: string, score: number, reason: string }>>}
 */
async function scoreJobsBatch(jobs, targetLocations, batchSize = 10) {
  const requested = Math.floor(Number(batchSize));
  const chunkSize =
    Number.isFinite(requested) && requested >= 1 ? requested : 10;

  const results = [];
  for (let start = 0; start < jobs.length; start += chunkSize) {
    const chunk = jobs.slice(start, start + chunkSize);
    // Batches run one after another; jobs inside a batch run in parallel.
    const chunkResults = await Promise.all(
      chunk.map((job) => scoreJobLocation(job, targetLocations)),
    );
    results.push(...chunkResults);
  }
  return results;
}
2384
+
2385
/**
 * Move a batch of eligible rows from `raw_jobs` into `jobs` as pending work.
 *
 * A row is eligible when its video, follower and following counts are all
 * positive and, if `locations` is given, its `guessed_location` matches one
 * of them case-insensitively.
 *
 * Two modes:
 *  - Plain (default): the newest eligible rows, up to `limit`, are copied to
 *    `jobs` and deleted from `raw_jobs` in one transaction. Returns
 *    synchronously.
 *  - LLM-filtered (`options.llmScore` true AND `locations` non-empty): rows
 *    are sampled per country in pages, scored via `scoreJobsBatch`, and only
 *    rows scoring at least `llmMinScore` are moved. Returns a Promise. The
 *    per-country page position is kept in `llmSampleOffsets` so later calls
 *    continue where this one stopped.
 *
 * NOTE(review): scoring is always done against `DEFAULT_TARGET_LOCATIONS`,
 * not against the `locations` argument — confirm this is intended.
 *
 * @param {string[]|null} [locations=null] - Country codes; null = no filter.
 * @param {number} [limit=500] - Maximum rows to move (clamped to 1..2000).
 * @param {Object} [options]
 * @param {boolean} [options.llmScore=false] - Enable LLM filtering.
 * @param {number} [options.llmMinScore=60] - Minimum score to qualify.
 * @param {number} [options.llmSampleSize=100] - Rows per sampling page.
 * @param {number} [options.llmMinReturn=60] - Stop sampling once this many
 *   rows have qualified.
 * @param {number} [options.llmMaxBatches=10] - Page cap per country.
 * @returns {{ moved: number, error?: string }
 *   | Promise<{ moved: number, scored: number, qualified: number, scores: Object[] }>}
 */
function refillJobsFromRaw(locations = null, limit = 500, options = {}) {
  if (!db) {
    return { moved: 0, error: "db not ready" };
  }

  const safeLimit = Math.max(
    1,
    Math.min(2000, Number.parseInt(limit, 10) || 500),
  );
  const normalizedLocations = locations
    ? locations.map((loc) => String(loc).trim().toUpperCase()).filter(Boolean)
    : null;

  const useLlm = !!options.llmScore;
  const llmMinScore = options.llmMinScore ?? 60;
  const llmSampleSize = options.llmSampleSize ?? 100;

  // Eligibility filter shared by every query below.
  const conditions = [
    "COALESCE(video_count, 0) > 0",
    "COALESCE(follower_count, 0) > 0",
    "COALESCE(following_count, 0) > 0",
  ];
  const args = [];

  if (normalizedLocations && normalizedLocations.length > 0) {
    conditions.push(
      `UPPER(COALESCE(guessed_location, '')) IN (${normalizedLocations.map(() => "?").join(", ")})`,
    );
    args.push(...normalizedLocations);
  }

  const whereSql = conditions.join(" AND ");

  // Copy statement shared by both modes; callers append their own WHERE.
  const insertFromRawSql = `
    INSERT OR IGNORE INTO jobs (
      unique_id, nickname, status, sources, pinned,
      tt_seller, verified, video_count, comment_count,
      guessed_location, location_created, confirmed_location,
      follower_count, following_count, heart_count,
      created_at, updated_at, region, signature, bio_link, sec_uid,
      status_code, latest_video_time
    )
    SELECT
      unique_id, nickname, 'pending', sources, pinned,
      tt_seller, verified, video_count, comment_count,
      guessed_location, location_created, confirmed_location,
      follower_count, following_count, heart_count,
      created_at, updated_at, region, signature, bio_link, sec_uid,
      status_code, latest_video_time
    FROM raw_jobs
  `;

  // Count eligible rows up front so an empty pool exits early.
  const count =
    db
      .prepare(`SELECT COUNT(*) as c FROM raw_jobs WHERE ${whereSql}`)
      .get(...args)?.c || 0;

  if (!count) {
    return { moved: 0 };
  }

  if (useLlm && normalizedLocations && normalizedLocations.length > 0) {
    const llmMinReturn = options.llmMinReturn ?? 60; // qualified rows wanted
    const maxBatches = options.llmMaxBatches ?? 10; // page cap per country
    const describeOffsets = () =>
      Array.from(llmSampleOffsets.entries())
        .map(([k, v]) => `${k}:${v}`)
        .join(", ");

    // Per-country filter uses the same case-insensitive comparison as the
    // overall filter, so every counted row can actually be sampled.
    const locationWhereSql = `${whereSql} AND UPPER(COALESCE(guessed_location, '')) = ?`;

    const offsetSummary = describeOffsets();
    console.error(
      `[data-store] LLM 打分开始: 符合条件 ${count} 条,每批 ${llmSampleSize} 条,最低分 ${llmMinScore},最少返回 ${llmMinReturn} 条`,
    );
    if (offsetSummary) {
      console.error(`[data-store] 偏移量记忆: ${offsetSummary}`);
    }

    // LLM mode is asynchronous: the caller must await the returned Promise.
    return (async () => {
      const allScores = [];
      const locationBySampleId = new Map();
      let qualifiedCount = 0;

      for (const location of normalizedLocations) {
        let offset = llmSampleOffsets.get(location) || 0;
        const locationArgs = [...args, location];
        const locationCount =
          db
            .prepare(
              `SELECT COUNT(*) as c FROM raw_jobs WHERE ${locationWhereSql}`,
            )
            .get(...locationArgs)?.c || 0;

        if (locationCount === 0) {
          console.error(
            `[data-store] 国家 ${location}: raw_jobs 中无数据,跳过`,
          );
          continue;
        }

        // A saved offset past the end means a full pass finished: restart.
        if (offset >= locationCount) {
          offset = 0;
          llmSampleOffsets.set(location, 0);
        }

        console.error(
          `[data-store] 国家 ${location}: 共 ${locationCount} 条,从偏移量 ${offset} 开始`,
        );

        for (let batch = 0; batch < maxBatches; batch++) {
          const remaining = locationCount - offset;
          if (remaining <= 0) break;

          const sampleLimit = Math.min(llmSampleSize, remaining);
          const samples = db
            .prepare(
              `
              SELECT * FROM raw_jobs WHERE ${locationWhereSql}
              ORDER BY created_at DESC
              LIMIT ? OFFSET ?
            `,
            )
            .all(...locationArgs, sampleLimit, offset);

          if (samples.length === 0) break;

          for (const sample of samples) {
            locationBySampleId.set(sample.unique_id, location);
          }

          const scores = await scoreJobsBatch(
            samples,
            DEFAULT_TARGET_LOCATIONS,
          );
          const batchQualified = scores.filter((s) => s.score >= llmMinScore);

          allScores.push(...scores);
          qualifiedCount += batchQualified.length;

          console.error(
            `[data-store] ${location} 第 ${batch + 1} 批: 采样 ${samples.length} 条,本批合格 ${batchQualified.length} 条,累计合格 ${qualifiedCount} 条`,
          );

          // Remember how far this country has been scanned.
          offset += samples.length;
          llmSampleOffsets.set(location, offset);

          if (qualifiedCount >= llmMinReturn) break;
        }

        // Enough qualified rows overall: stop sampling further countries.
        if (qualifiedCount >= llmMinReturn) break;
      }

      // Best scores first, capped at safeLimit.
      const qualified = allScores
        .filter((s) => s.score >= llmMinScore)
        .sort((a, b) => b.score - a.score)
        .slice(0, safeLimit)
        .map((s) => s.uniqueId);

      if (!qualified.length) {
        console.error(
          `[data-store] LLM 打分后无符合条件的任务(阈值: ${llmMinScore},共采样 ${allScores.length} 条)`,
        );
        return {
          moved: 0,
          scored: allScores.length,
          qualified: 0,
          scores: allScores,
        };
      }

      const placeholders = qualified.map(() => "?").join(", ");
      const moveTxn = db.transaction(() => {
        db.prepare(
          `${insertFromRawSql} WHERE unique_id IN (${placeholders})`,
        ).run(...qualified);

        db.prepare(
          `DELETE FROM raw_jobs WHERE unique_id IN (${placeholders})`,
        ).run(...qualified);
      });

      moveTxn();
      markStatsDirty();

      // Each deleted row sat before its country's saved offset, so the rows
      // after it moved up by one; pull the offset back to avoid skipping
      // that many unscored rows on the next call.
      for (const movedId of qualified) {
        const movedLocation = locationBySampleId.get(movedId);
        if (movedLocation === undefined) continue;
        const savedOffset = llmSampleOffsets.get(movedLocation) || 0;
        llmSampleOffsets.set(movedLocation, Math.max(0, savedOffset - 1));
      }

      console.error(
        `[data-store] LLM 打分完成: 共采样 ${allScores.length} 条,合格 ${qualified.length} 条,已移动到 jobs`,
      );
      console.error(`[data-store] 偏移量记忆更新: ${describeOffsets()}`);
      return {
        moved: qualified.length,
        scored: allScores.length,
        qualified: qualified.length,
        scores: allScores,
      };
    })();
  }

  // Plain mode: copy then delete the same newest-first slice atomically.
  const moveTxn = db.transaction(() => {
    db.prepare(
      `
      ${insertFromRawSql}
      WHERE ${whereSql}
      ORDER BY created_at DESC
      LIMIT ?
    `,
    ).run(...args, safeLimit);

    // The sub-select repeats the INSERT's filter and ordering so the rows
    // deleted are the rows that were just copied.
    db.prepare(
      `
      DELETE FROM raw_jobs
      WHERE unique_id IN (
        SELECT unique_id FROM raw_jobs
        WHERE ${whereSql}
        ORDER BY created_at DESC
        LIMIT ?
      )
    `,
    ).run(...args, safeLimit);
  });

  moveTxn();
  markStatsDirty();

  return { moved: Math.min(count, safeLimit) };
}
2647
+
2648
+ async function claimNextJob(
2166
2649
  userId,
2167
2650
  expireMs = 5 * 60 * 1000,
2168
2651
  locations = null,
@@ -2438,6 +2921,71 @@ export function createStore(filePath) {
2438
2921
  return claimRow(ranked);
2439
2922
  }
2440
2923
  }
2924
+ // 尝试从 raw_jobs 毛料库补充任务(使用 createStore 时配置的 LLM 打分)
2925
+ // 使用锁防止多个请求同时触发 LLM refill
2926
+ if (refillLock) {
2927
+ // 已有 refill 在进行中,等待完成后重新尝试领取
2928
+ await refillLock;
2929
+ for (const requireVideo of [true, false]) {
2930
+ const pinned = findPinnedPending(requireVideo);
2931
+ if (pinned) {
2932
+ return claimRow(pinned);
2933
+ }
2934
+ const ranked = findPrioritizedPending(requireVideo);
2935
+ if (ranked) {
2936
+ return claimRow(ranked);
2937
+ }
2938
+ }
2939
+ return null;
2940
+ }
2941
+ const refillResult = (async () => {
2942
+ refillLock = Promise.resolve(); // 占位
2943
+ const result = refillJobsFromRaw(
2944
+ normalizedLocations.length ? normalizedLocations : null,
2945
+ 500,
2946
+ refillLlmConfig,
2947
+ );
2948
+ // refillJobsFromRaw 在 LLM 模式下返回 Promise
2949
+ if (result && typeof result.then === "function") {
2950
+ return result.finally(() => {
2951
+ refillLock = null;
2952
+ });
2953
+ }
2954
+ return result;
2955
+ })();
2956
+ if (refillResult && typeof refillResult.then === "function") {
2957
+ const awaited = await refillResult;
2958
+ if (awaited.moved > 0) {
2959
+ console.error(
2960
+ `[data-store] 从 raw_jobs 补充了 ${awaited.moved} 条任务到 jobs`,
2961
+ );
2962
+ for (const requireVideo of [true, false]) {
2963
+ const pinned = findPinnedPending(requireVideo);
2964
+ if (pinned) {
2965
+ return claimRow(pinned);
2966
+ }
2967
+ const ranked = findPrioritizedPending(requireVideo);
2968
+ if (ranked) {
2969
+ return claimRow(ranked);
2970
+ }
2971
+ }
2972
+ }
2973
+ } else if (refillResult.moved > 0) {
2974
+ console.error(
2975
+ `[data-store] 从 raw_jobs 补充了 ${refillResult.moved} 条任务到 jobs`,
2976
+ );
2977
+ for (const requireVideo of [true, false]) {
2978
+ const pinned = findPinnedPending(requireVideo);
2979
+ if (pinned) {
2980
+ return claimRow(pinned);
2981
+ }
2982
+ const ranked = findPrioritizedPending(requireVideo);
2983
+ if (ranked) {
2984
+ return claimRow(ranked);
2985
+ }
2986
+ }
2987
+ }
2988
+
2441
2989
  return null;
2442
2990
  }
2443
2991
 
@@ -3033,6 +3581,7 @@ export function createStore(filePath) {
3033
3581
  "discoveredFollowers",
3034
3582
  "uniqueId",
3035
3583
  "sources",
3584
+ "topRecentVideo", // 单独处理,不进入通用循环
3036
3585
  ];
3037
3586
  for (const key of Object.keys(result)) {
3038
3587
  if (extraFields.includes(key)) continue;
@@ -3044,6 +3593,11 @@ export function createStore(filePath) {
3044
3593
  user[key] = result[key];
3045
3594
  }
3046
3595
  }
3596
+ // 将 topRecentVideo 对象展开为扁平字段
3597
+ if (result.topRecentVideo && typeof result.topRecentVideo === "object") {
3598
+ user.topVideoPlayCount = result.topRecentVideo.playCount || null;
3599
+ user.topVideoHref = result.topRecentVideo.href || null;
3600
+ }
3047
3601
  user.sources = [...new Set([...(user.sources || []), "processed"])];
3048
3602
  }
3049
3603
  if (user.status !== oldStatus) markStatsDirty();
@@ -3285,6 +3839,11 @@ export function createStore(filePath) {
3285
3839
  }
3286
3840
  }
3287
3841
  }
3842
+ // 将 topRecentVideo 对象展开为扁平字段
3843
+ if (result.topRecentVideo && typeof result.topRecentVideo === "object") {
3844
+ user.topVideoPlayCount = result.topRecentVideo.playCount || null;
3845
+ user.topVideoHref = result.topRecentVideo.href || null;
3846
+ }
3288
3847
  const newUsers = processDiscoveredUsers(result);
3289
3848
  const ret = updateJobInfo(uniqueId, user, false);
3290
3849
  if (ret.error) return { saved: false, error: ret.error };
@@ -3305,6 +3864,11 @@ export function createStore(filePath) {
3305
3864
  }
3306
3865
  }
3307
3866
  }
3867
+ // 将 topRecentVideo 对象展开为扁平字段
3868
+ if (result.topRecentVideo && typeof result.topRecentVideo === "object") {
3869
+ user.topVideoPlayCount = result.topRecentVideo.playCount || null;
3870
+ user.topVideoHref = result.topRecentVideo.href || null;
3871
+ }
3308
3872
  const newUsers = processDiscoveredUsers(result);
3309
3873
  return { saved: true, newUsers };
3310
3874
  }
@@ -3375,7 +3939,7 @@ export function createStore(filePath) {
3375
3939
 
3376
3940
  let sql = `
3377
3941
  SELECT *
3378
- FROM jobs
3942
+ FROM jobs_base
3379
3943
  WHERE COALESCE(tt_seller, '') = ''
3380
3944
  AND COALESCE(user_update_count, 0) <= 0
3381
3945
  `;
@@ -3395,7 +3959,7 @@ export function createStore(filePath) {
3395
3959
  const now = Date.now();
3396
3960
  const bumpStmt = db.prepare(
3397
3961
  `
3398
- UPDATE jobs
3962
+ UPDATE jobs_base
3399
3963
  SET user_update_count = COALESCE(user_update_count, 0) + 1,
3400
3964
  updated_at = ?
3401
3965
  WHERE unique_id = ?
@@ -3526,7 +4090,8 @@ export function createStore(filePath) {
3526
4090
  function batchUpdateUserInfo(updates) {
3527
4091
  if (db) {
3528
4092
  const results = [];
3529
- const moveList = [];
4093
+ const rawMoveList = [];
4094
+ const sellerMoveList = [];
3530
4095
 
3531
4096
  const txn = db.transaction((items) => {
3532
4097
  items.forEach((item) => {
@@ -3536,13 +4101,13 @@ export function createStore(filePath) {
3536
4101
  let updateResult;
3537
4102
  if (info && info.error && info.statusCode !== undefined) {
3538
4103
  // 只更新 status_code,不更新其他字段
3539
- updateResult = updateJobInfo(
4104
+ updateResult = updateJobBaseInfo(
3540
4105
  uniqueId,
3541
4106
  { statusCode: info.statusCode },
3542
4107
  true,
3543
4108
  );
3544
4109
  } else {
3545
- updateResult = updateJobInfo(uniqueId, info, true);
4110
+ updateResult = updateJobBaseInfo(uniqueId, info, true);
3546
4111
  }
3547
4112
 
3548
4113
  if (updateResult.error) {
@@ -3550,34 +4115,66 @@ export function createStore(filePath) {
3550
4115
  return;
3551
4116
  }
3552
4117
 
3553
- // 检查 tt_seller:非商家则标记为需要移动到毛料表
3554
- const row = getJobRow(uniqueId);
4118
+ // 检查 tt_seller:商家移到 jobs,非商家移到 raw_jobs
4119
+ const row = getJobBaseRow(uniqueId);
3555
4120
  const ttSeller = row ? row.tt_seller : null;
3556
4121
  if (ttSeller) {
3557
- // 商家:保持当前逻辑
4122
+ // 商家:标记移动到 jobs
3558
4123
  results.push({
3559
4124
  uniqueId,
3560
4125
  ok: true,
3561
4126
  userUpdateCount: updateResult.userUpdateCount,
4127
+ _movedToJobs: true,
3562
4128
  });
4129
+ sellerMoveList.push(uniqueId);
3563
4130
  } else {
3564
- // 非商家:标记移动
4131
+ // 非商家:标记移动到 raw_jobs
3565
4132
  results.push({
3566
4133
  uniqueId,
3567
4134
  ok: true,
3568
4135
  userUpdateCount: updateResult.userUpdateCount,
3569
4136
  _movedToRaw: true,
3570
4137
  });
3571
- moveList.push(uniqueId);
4138
+ rawMoveList.push(uniqueId);
3572
4139
  }
3573
4140
  });
3574
4141
  });
3575
4142
  txn(updates);
3576
4143
 
3577
- // 批量移动非商家用户到 raw_jobs(优化:一次 SQL 搞定)
3578
- if (moveList.length > 0) {
3579
- const placeholders = moveList.map(() => "?").join(",");
3580
- // 批量 INSERT 到 raw_jobs
4144
+ // 批量移动商家用户到 jobs
4145
+ if (sellerMoveList.length > 0) {
4146
+ const placeholders = sellerMoveList.map(() => "?").join(",");
4147
+ db.prepare(
4148
+ `
4149
+ INSERT OR REPLACE INTO jobs (
4150
+ unique_id, nickname, status, sources, claimed_by, claimed_at,
4151
+ error, pinned, no_video, restricted, user_update_count,
4152
+ tt_seller, verified, video_count, comment_count,
4153
+ guessed_location, location_created, confirmed_location, modified_at,
4154
+ follower_count, following_count, heart_count, refresh_time,
4155
+ processed, processed_at, created_at, updated_at,
4156
+ region, signature, bio_link, sec_uid, status_code, latest_video_time
4157
+ )
4158
+ SELECT
4159
+ unique_id, nickname, status, sources, claimed_by, claimed_at,
4160
+ error, pinned, no_video, restricted, user_update_count,
4161
+ tt_seller, verified, video_count, comment_count,
4162
+ guessed_location, location_created, confirmed_location, modified_at,
4163
+ follower_count, following_count, heart_count, refresh_time,
4164
+ processed, processed_at, created_at, updated_at,
4165
+ region, signature, bio_link, sec_uid, status_code, latest_video_time
4166
+ FROM jobs_base WHERE unique_id IN (${placeholders})
4167
+ `,
4168
+ ).run(...sellerMoveList);
4169
+
4170
+ db.prepare(
4171
+ `DELETE FROM jobs_base WHERE unique_id IN (${placeholders})`,
4172
+ ).run(...sellerMoveList);
4173
+ }
4174
+
4175
+ // 批量移动非商家用户到 raw_jobs
4176
+ if (rawMoveList.length > 0) {
4177
+ const placeholders = rawMoveList.map(() => "?").join(",");
3581
4178
  db.prepare(
3582
4179
  `
3583
4180
  INSERT OR REPLACE INTO raw_jobs (
@@ -3597,19 +4194,18 @@ export function createStore(filePath) {
3597
4194
  follower_count, following_count, heart_count, refresh_time,
3598
4195
  processed, processed_at, created_at, updated_at,
3599
4196
  region, signature, bio_link, sec_uid, status_code, latest_video_time
3600
- FROM jobs WHERE unique_id IN (${placeholders})
4197
+ FROM jobs_base WHERE unique_id IN (${placeholders})
3601
4198
  `,
3602
- ).run(...moveList);
4199
+ ).run(...rawMoveList);
3603
4200
 
3604
- // 批量 DELETE 从 jobs
3605
- db.prepare(`DELETE FROM jobs WHERE unique_id IN (${placeholders})`).run(
3606
- ...moveList,
3607
- );
4201
+ db.prepare(
4202
+ `DELETE FROM jobs_base WHERE unique_id IN (${placeholders})`,
4203
+ ).run(...rawMoveList);
3608
4204
  }
3609
4205
 
3610
4206
  // 清理内部标记
3611
4207
  return results.map((r) => {
3612
- const { _movedToRaw, ...rest } = r;
4208
+ const { _movedToRaw, _movedToJobs, ...rest } = r;
3613
4209
  return rest;
3614
4210
  });
3615
4211
  }
@@ -3879,6 +4475,9 @@ export function createStore(filePath) {
3879
4475
  getStats,
3880
4476
  getStatusGroups,
3881
4477
  markGroupsDirty,
4478
+ refillJobsFromRaw,
4479
+ scoreJobLocation,
4480
+ scoreJobsBatch,
3882
4481
  claimNextJob,
3883
4482
  commitJob,
3884
4483
  commitNewExplore,
@@ -3904,6 +4503,12 @@ export function createStore(filePath) {
3904
4503
  debugClaimNextJob,
3905
4504
  stopBackup,
3906
4505
  rawQuery,
4506
+ getLlmSampleOffsets, // 获取 LLM 采样偏移量状态
3907
4507
  data,
3908
4508
  };
4509
+
4510
/**
 * Snapshot of the LLM sampling offsets as a plain object.
 *
 * @returns {Object<string, number>} One property per entry of the
 *   `llmSampleOffsets` map, holding that entry's saved offset.
 */
function getLlmSampleOffsets() {
  const offsetPairs = [...llmSampleOffsets];
  return Object.fromEntries(offsetPairs);
}
3909
4514
  }