tt-help-cli-ycl 1.3.81 → 1.3.83

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,10 @@
1
1
  import fs from "fs";
2
2
  import path from "path";
3
3
  import Database from "better-sqlite3";
4
- import { isLocationInList } from "../lib/target-locations.js";
4
+ import {
5
+ isLocationInList,
6
+ DEFAULT_TARGET_LOCATIONS,
7
+ } from "../lib/target-locations.js";
5
8
 
6
9
  // SQLite 用户表(用于判重)
7
10
  let db = null;
@@ -143,6 +146,12 @@ function initUserDb(filePath) {
143
146
  if (!existingJobColumns.has("bio_link")) {
144
147
  db.exec(`ALTER TABLE jobs ADD COLUMN bio_link TEXT`);
145
148
  }
149
+ if (!existingJobColumns.has("top_video_play_count")) {
150
+ db.exec(`ALTER TABLE jobs ADD COLUMN top_video_play_count INTEGER`);
151
+ }
152
+ if (!existingJobColumns.has("top_video_href")) {
153
+ db.exec(`ALTER TABLE jobs ADD COLUMN top_video_href TEXT`);
154
+ }
146
155
  db.exec(`
147
156
  CREATE TABLE IF NOT EXISTS jobs_base (
148
157
  unique_id TEXT PRIMARY KEY,
@@ -773,7 +782,7 @@ function getUserUpdateByCountryFromDb() {
773
782
  COALESCE(guessed_location, '未知') as country,
774
783
  COUNT(*) as count
775
784
  FROM jobs_base
776
- WHERE COALESCE(tt_seller, '') = ''
785
+ WHERE tt_seller IS NULL
777
786
  AND COALESCE(user_update_count, 0) <= 0
778
787
  GROUP BY COALESCE(guessed_location, '未知')
779
788
  ORDER BY count DESC
@@ -794,7 +803,7 @@ function getAttachStuckByCountryFromDb() {
794
803
  COALESCE(guessed_location, '未知') as country,
795
804
  COUNT(*) as count
796
805
  FROM jobs_base
797
- WHERE COALESCE(tt_seller, '') = ''
806
+ WHERE tt_seller IS NULL
798
807
  AND COALESCE(user_update_count, 0) = 1
799
808
  GROUP BY COALESCE(guessed_location, '未知')
800
809
  ORDER BY count DESC
@@ -931,11 +940,36 @@ function moveJobsToRawByCountry(scope, country) {
931
940
  };
932
941
  }
933
942
 
943
+ // pending 操作 jobs 表(与 getPendingByCountryFromDb 数据源一致)
944
+ // userUpdate 操作 jobs_base 表(与 getUserUpdateByCountryFromDb 数据源一致)
945
+ let sourceTable = "";
934
946
  let scopeWhere = "";
947
+ let columns = "";
948
+
935
949
  if (normalizedScope === "pending") {
936
- scopeWhere = `status = 'pending' AND COALESCE(user_update_count, 0) >= 2`;
950
+ sourceTable = "jobs";
951
+ scopeWhere = `status = 'pending'`;
952
+ columns = `
953
+ unique_id, nickname, status, sources, claimed_by, claimed_at,
954
+ error, pinned, no_video, restricted, user_update_count,
955
+ tt_seller, verified, video_count, comment_count,
956
+ guessed_location, location_created, follower_count,
957
+ following_count, heart_count, refresh_time, processed,
958
+ processed_at, created_at, updated_at, region, signature,
959
+ sec_uid, latest_video_time
960
+ `;
937
961
  } else if (normalizedScope === "userUpdate") {
938
- scopeWhere = `COALESCE(tt_seller, '') = '' AND COALESCE(user_update_count, 0) <= 0`;
962
+ sourceTable = "jobs_base";
963
+ scopeWhere = `tt_seller IS NULL AND COALESCE(user_update_count, 0) <= 0`;
964
+ columns = `
965
+ unique_id, nickname, status, sources, claimed_by, claimed_at,
966
+ error, pinned, no_video, restricted, user_update_count,
967
+ tt_seller, verified, video_count, comment_count,
968
+ guessed_location, location_created, follower_count,
969
+ following_count, heart_count, refresh_time, processed,
970
+ processed_at, created_at, updated_at, region, signature,
971
+ sec_uid, latest_video_time
972
+ `;
939
973
  } else {
940
974
  return {
941
975
  moved: 0,
@@ -954,7 +988,7 @@ function moveJobsToRawByCountry(scope, country) {
954
988
  .prepare(
955
989
  `
956
990
  SELECT COUNT(*) as c
957
- FROM jobs_base
991
+ FROM ${sourceTable}
958
992
  WHERE ${whereSql}
959
993
  `,
960
994
  )
@@ -968,74 +1002,18 @@ function moveJobsToRawByCountry(scope, country) {
968
1002
  db.prepare(
969
1003
  `
970
1004
  INSERT OR REPLACE INTO raw_jobs (
971
- unique_id,
972
- nickname,
973
- status,
974
- sources,
975
- claimed_by,
976
- claimed_at,
977
- error,
978
- pinned,
979
- no_video,
980
- restricted,
981
- user_update_count,
982
- tt_seller,
983
- verified,
984
- video_count,
985
- comment_count,
986
- guessed_location,
987
- location_created,
988
- follower_count,
989
- following_count,
990
- heart_count,
991
- refresh_time,
992
- processed,
993
- processed_at,
994
- created_at,
995
- updated_at,
996
- region,
997
- signature,
998
- sec_uid,
999
- latest_video_time
1005
+ ${columns}
1000
1006
  )
1001
1007
  SELECT
1002
- unique_id,
1003
- nickname,
1004
- status,
1005
- sources,
1006
- claimed_by,
1007
- claimed_at,
1008
- error,
1009
- pinned,
1010
- no_video,
1011
- restricted,
1012
- user_update_count,
1013
- tt_seller,
1014
- verified,
1015
- video_count,
1016
- comment_count,
1017
- guessed_location,
1018
- location_created,
1019
- follower_count,
1020
- following_count,
1021
- heart_count,
1022
- refresh_time,
1023
- processed,
1024
- processed_at,
1025
- created_at,
1026
- updated_at,
1027
- region,
1028
- signature,
1029
- sec_uid,
1030
- latest_video_time
1031
- FROM jobs_base
1008
+ ${columns}
1009
+ FROM ${sourceTable}
1032
1010
  WHERE ${whereSql}
1033
1011
  `,
1034
1012
  ).run(targetCountry);
1035
1013
 
1036
1014
  db.prepare(
1037
1015
  `
1038
- DELETE FROM jobs_base
1016
+ DELETE FROM ${sourceTable}
1039
1017
  WHERE ${whereSql}
1040
1018
  `,
1041
1019
  ).run(targetCountry);
@@ -1538,6 +1516,8 @@ function getTargetUsersByCountryFromDb(targetLocations = [], options = {}) {
1538
1516
  modified_at,
1539
1517
  latest_video_time,
1540
1518
  refresh_time,
1519
+ top_video_play_count,
1520
+ top_video_href,
1541
1521
  status,
1542
1522
  sources
1543
1523
  FROM jobs
@@ -1686,6 +1666,8 @@ const writableJobColumns = new Set([
1686
1666
  "sec_uid",
1687
1667
  "status_code",
1688
1668
  "latest_video_time",
1669
+ "top_video_play_count",
1670
+ "top_video_href",
1689
1671
  ]);
1690
1672
 
1691
1673
  function normalizeJobValue(column, value) {
@@ -1950,16 +1932,30 @@ function addJob(user) {
1950
1932
  writeTxn(user);
1951
1933
  }
1952
1934
 
1953
- export function createStore(filePath) {
1935
+ export function createStore(filePath, options = {}) {
1954
1936
  if (!filePath) {
1955
1937
  throw new Error("createStore requires an explicit .db path");
1956
1938
  }
1939
+
1940
+ // refillJobsFromRaw 的 LLM 打分配置(自动补充任务时使用)
1941
+ const refillLlmConfig = {
1942
+ llmScore: false,
1943
+ llmMinScore: 60,
1944
+ llmSampleSize: 100,
1945
+ ...options.refillLlm,
1946
+ };
1947
+
1957
1948
  let data = [];
1958
1949
  // uniqueId → index 内存索引,O(1) 查找
1959
1950
  let uidIndex = new Map();
1960
1951
  let clientErrors = new Map();
1961
1952
  // 客户端登录状态:userId → boolean
1962
1953
  let clientLoginStatus = new Map();
1954
+ // refill 锁:防止多个 claimNextJob 同时触发 LLM refill
1955
+ let refillLock = null; // Promise | null
1956
+ // LLM 采样偏移量记忆:按猜测国家记录上次查询位置,避免重复采样
1957
+ // 格式: { "ES": 300, "PL": 500, "NL": 400 }
1958
+ let llmSampleOffsets = new Map();
1963
1959
  if (filePath) {
1964
1960
  // 初始化 SQLite 用户表(用于判重)
1965
1961
  initUserDb(filePath);
@@ -2217,7 +2213,439 @@ export function createStore(filePath) {
2217
2213
  return data;
2218
2214
  }
2219
2215
 
2220
- function claimNextJob(
2216
/**
 * Ask the LLM how likely a single raw job belongs to ANY of the target
 * countries and return a 0-100 score.
 *
 * Failures after the prompt is built (module load, network, HTTP status,
 * unparsable reply) never reject: they are logged and answered with the
 * neutral default score of 50 so one bad row cannot abort a whole batch.
 *
 * NOTE(review): the endpoint is plain HTTP on a fixed IP and carries the
 * bearer token from process.env.APIKEY unencrypted — confirm this is acceptable.
 *
 * @param {Object} job - One raw_jobs row (unique_id, nickname, signature, ...).
 * @param {string[]} targetLocations - Target country list shown to the model.
 * @param {Object} [options] - Optional settings.
 * @param {Function} [options.fetch] - fetch-compatible function to use instead
 *   of undici's fetch (custom transport, tests).
 * @returns {Promise<{ uniqueId: string, score: number, reason: string }>}
 */
async function scoreJobLocation(job, targetLocations, options = {}) {
  const prompt = `
你是一个 TikTok 用户数据分析助手。请根据以下用户信息,判断该用户是否来自以下**任意一个**目标国家。

目标国家列表: ${targetLocations.join(", ")}

重要:
- 用户只要来自上述**任意一个**国家就算匹配。
- guessed_location 是系统初步猜测的结果,**仅供参考**,不要完全依赖它。
- 请综合用户名、昵称、签名、位置等信息做判断。

用户信息:
- 用户名: ${job.unique_id || "未知"}
- 昵称: ${job.nickname || "未知"}
- 签名: ${job.signature || "未知"}
- 地区: ${job.region || "未知"}
- 猜测国家(参考): ${job.guessed_location || "未知"}
- 位置信息: ${job.location_created || "未知"}
- 主页链接: ${job.bio_link || "未知"}

返回 JSON(仅返回 JSON,无其他内容):
{"score": 0-100, "reason": "English only, under 50 chars, no quotes/brackets"}

Standards: 90-100=clear match, 70-89=likely, 50-69=possible, 20-49=low, 0-19=unlikely
`;

  try {
    // Resolved inside the try so a missing undici module degrades to the
    // default score instead of rejecting.
    const doFetch = options.fetch ?? (await import("undici")).fetch;
    const apiKey = process.env.APIKEY || "";
    const response = await doFetch(
      "http://82.156.52.214:18000/v1/chat/completions",
      {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          Authorization: `Bearer ${apiKey}`,
        },
        body: JSON.stringify({
          model: "zc-fast",
          messages: [{ role: "user", content: prompt }],
          max_tokens: 512,
          temperature: 0.1,
        }),
      },
    );

    // An error status carries no completion; report it as a call failure
    // rather than as a confusing "parse failed".
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }

    const result = await response.json();
    const content = result.choices?.[0]?.message?.content || "";

    const parsed = parseLlmScoreContent(content);
    const score = toScoreNumber(parsed?.score);
    if (score !== null) {
      return {
        uniqueId: job.unique_id,
        score: Math.max(0, Math.min(100, score)),
        reason: parsed.reason || "",
      };
    }

    // Every parsing strategy failed: fall back to the default score.
    console.error(
      `[scoreJobLocation] JSON 解析失败 (${job.unique_id}): ${content.substring(0, 100)}`,
    );
    return {
      uniqueId: job.unique_id,
      score: 50,
      reason: "LLM 响应解析失败,使用默认分",
    };
  } catch (e) {
    console.error(
      `[scoreJobLocation] LLM 调用失败 (${job.unique_id}): ${e.message}`,
    );
    return {
      uniqueId: job.unique_id,
      score: 50,
      reason: `LLM 调用异常: ${e.message}`,
    };
  }
}

// Parse the LLM reply into `{ score, reason }` using progressively looser
// strategies; returns null when nothing usable is found.
function parseLlmScoreContent(content) {
  // Attempt 1: the reply is pure JSON.
  try {
    return JSON.parse(content);
  } catch {
    // Not pure JSON — try the fallbacks below.
  }

  // Attempt 2: JSON object embedded in surrounding text.
  const braced = content.match(/\{[\s\S]*\}/);
  if (braced) {
    try {
      return JSON.parse(braced[0]);
    } catch {
      // Still malformed — clean it up next.
    }

    // Attempt 3: normalise curly quotes and whitespace, then parse again.
    const cleaned = braced[0]
      .replace(/[\u201C\u201D]/g, '"')
      .replace(/\s+/g, " ")
      .trim();
    try {
      return JSON.parse(cleaned);
    } catch {
      // Give up on JSON.parse — scrape the fields instead.
    }

    // Attempt 4: scrape score/reason; the reason may contain stray quotes.
    const looseScore = content.match(/"?score"?\s*:\s*(\d+)/i);
    if (looseScore) {
      return {
        // Radix 10 and no `|| 50`: a genuine score of 0 must stay 0.
        score: Number.parseInt(looseScore[1], 10),
        reason: extractLooseReason(content),
      };
    }
  }

  // Attempt 5: strict key regexes on the raw text.
  const strictScore = content.match(/"score"\s*:\s*(\d+)/);
  if (strictScore) {
    const reasonMatch = content.match(/"reason"\s*:\s*"([^"]*)"/);
    return {
      score: Number.parseInt(strictScore[1], 10),
      reason: reasonMatch ? reasonMatch[1] : "解析降级 - 宽松模式",
    };
  }

  return null;
}

// Take everything between the opening quote of the "reason" value and the
// last `}` of the reply, then strip one wrapping quote on each side.
function extractLooseReason(content) {
  const fallback = "解析降级";
  const keyPos = content.search(/"?reason"?\s*:\s*"/i);
  if (keyPos === -1) return fallback;

  const afterKey = content.substring(keyPos);
  const valueStart = afterKey.indexOf('"', afterKey.indexOf(":") + 1) + 1;
  const valueEnd = content.lastIndexOf("}") - keyPos;
  if (valueEnd - valueStart <= 0) return fallback;

  let reason = afterKey.substring(valueStart, valueEnd).trim();
  if (reason.startsWith('"')) reason = reason.substring(1);
  if (reason.endsWith('"')) reason = reason.substring(0, reason.length - 1);
  return reason;
}

// Accept numbers and numeric strings (models sometimes quote the score);
// anything else yields null.
function toScoreNumber(value) {
  if (typeof value === "number") {
    return Number.isNaN(value) ? null : value;
  }
  if (typeof value === "string" && value.trim() !== "") {
    const numeric = Number(value);
    return Number.isNaN(numeric) ? null : numeric;
  }
  return null;
}
2365
+
2366
/**
 * Score many jobs with the LLM, running at most `batchSize` requests at a
 * time. Batches run one after another; results keep the order of `jobs`.
 *
 * @param {Object[]} jobs - raw_jobs rows to score.
 * @param {string[]} targetLocations - Target country list.
 * @param {number} [batchSize=10] - Concurrent requests per batch. Values that
 *   are not a number >= 1 fall back to 10 (a step of 0 would never advance).
 * @returns {Promise<Array<{ uniqueId: string, score: number, reason: string }>>}
 */
async function scoreJobsBatch(jobs, targetLocations, batchSize = 10) {
  const requestedSize = Math.floor(Number(batchSize));
  const chunkSize = requestedSize >= 1 ? requestedSize : 10;

  const results = [];
  for (let start = 0; start < jobs.length; start += chunkSize) {
    const chunk = jobs.slice(start, start + chunkSize);
    const chunkResults = await Promise.all(
      chunk.map((job) => scoreJobLocation(job, targetLocations)),
    );
    results.push(...chunkResults);
  }
  return results;
}
2384
+
2385
/**
 * Move a batch of eligible rows from raw_jobs into jobs (inserted with status
 * 'pending') and delete them from raw_jobs.
 *
 * Eligible rows have video_count, follower_count and following_count all > 0
 * and, when `locations` is given, a guessed_location in that list
 * (compared case-insensitively).
 *
 * Two modes:
 * - Plain (default): the newest rows by created_at are moved synchronously.
 * - LLM (options.llmScore truthy AND `locations` non-empty): a Promise is
 *   returned. Rows are sampled per country, scored via scoreJobsBatch, and only
 *   rows scoring >= llmMinScore are moved (best scores first, capped at limit).
 *   Sampling positions are remembered per country in llmSampleOffsets.
 *
 * @param {string[]|null} locations - Target countries; null/empty = no filter.
 * @param {number} limit - Max rows to move, clamped to 1..2000 (default 500).
 * @param {Object} options - Optional settings.
 * @param {boolean} [options.llmScore=false] - Enable LLM score filtering.
 * @param {number} [options.llmMinScore=60] - Minimum score for a row to qualify.
 * @param {number} [options.llmSampleSize=100] - Rows sampled per LLM round (>= 1).
 * @param {number} [options.llmMinReturn=60] - Stop sampling once this many rows qualify.
 * @param {number} [options.llmMaxBatches=10] - Max sampling rounds per country.
 * @returns {{ moved: number, error?: string } |
 *   Promise<{ moved: number, scored: number, qualified: number, scores: Object[] }>}
 *   `moved` is the number of rows removed from raw_jobs.
 */
function refillJobsFromRaw(locations = null, limit = 500, options = {}) {
  if (!db) {
    return { moved: 0, error: "db not ready" };
  }

  const safeLimit = Math.max(
    1,
    Math.min(2000, Number.parseInt(limit, 10) || 500),
  );
  const normalizedLocations = locations
    ? [
        ...new Set(
          locations
            .map((loc) => String(loc).trim().toUpperCase())
            .filter(Boolean),
        ),
      ]
    : null;
  const hasLocations =
    normalizedLocations !== null && normalizedLocations.length > 0;

  const useLlm = Boolean(options.llmScore);
  const llmMinScore = options.llmMinScore ?? 60;
  // Clamp to >= 1: SQLite treats a negative LIMIT as "no limit".
  const llmSampleSize = Math.max(
    1,
    Number.parseInt(options.llmSampleSize ?? 100, 10) || 100,
  );

  // Build the shared WHERE clause.
  const conditions = [
    "COALESCE(video_count, 0) > 0",
    "COALESCE(follower_count, 0) > 0",
    "COALESCE(following_count, 0) > 0",
  ];
  const args = [];
  if (hasLocations) {
    conditions.push(
      `UPPER(COALESCE(guessed_location, '')) IN (${normalizedLocations.map(() => "?").join(", ")})`,
    );
    args.push(...normalizedLocations);
  }
  const whereSql = conditions.join(" AND ");

  // Copy the given rows into jobs and delete them from raw_jobs in one
  // transaction. Both statements target the SAME explicit id list, so a row
  // can never be deleted without having been offered to the INSERT.
  // Returns the number of rows deleted from raw_jobs.
  const moveByIds = (ids) => {
    const placeholders = ids.map(() => "?").join(", ");
    const moveTxn = db.transaction(() => {
      db.prepare(
        `
        INSERT OR IGNORE INTO jobs (
          unique_id, nickname, status, sources, pinned,
          tt_seller, verified, video_count, comment_count,
          guessed_location, location_created, confirmed_location,
          follower_count, following_count, heart_count,
          created_at, updated_at, region, signature, bio_link, sec_uid,
          status_code, latest_video_time
        )
        SELECT
          unique_id, nickname, 'pending', sources, pinned,
          tt_seller, verified, video_count, comment_count,
          guessed_location, location_created, confirmed_location,
          follower_count, following_count, heart_count,
          created_at, updated_at, region, signature, bio_link, sec_uid,
          status_code, latest_video_time
        FROM raw_jobs
        WHERE unique_id IN (${placeholders})
        `,
      ).run(...ids);

      return db
        .prepare(`DELETE FROM raw_jobs WHERE unique_id IN (${placeholders})`)
        .run(...ids).changes;
    });
    return moveTxn();
  };

  const formatOffsets = () =>
    Array.from(llmSampleOffsets.entries())
      .map(([k, v]) => `${k}:${v}`)
      .join(", ");

  // Count eligible rows.
  const count =
    db
      .prepare(`SELECT COUNT(*) as c FROM raw_jobs WHERE ${whereSql}`)
      .get(...args)?.c || 0;

  if (!count) {
    return { moved: 0 };
  }

  // LLM mode: sample per country, remembering the offset reached so the next
  // call continues where this one stopped.
  if (useLlm && hasLocations) {
    const llmMinReturn = options.llmMinReturn ?? 60;
    const maxBatches = options.llmMaxBatches ?? 10; // bounds the sampling loop

    const offsetSummary = formatOffsets();
    console.error(
      `[data-store] LLM 打分开始: 符合条件 ${count} 条,每批 ${llmSampleSize} 条,最低分 ${llmMinScore},最少返回 ${llmMinReturn} 条`,
    );
    if (offsetSummary) {
      console.error(`[data-store] 偏移量记忆: ${offsetSummary}`);
    }

    // Returns a Promise; the caller must await it.
    return (async () => {
      const allScores = [];
      // unique_id → country it was sampled under (used to rewind offsets).
      const locationById = new Map();
      let qualifiedCount = 0;

      // Same case-insensitive comparison as the shared WHERE clause, so the
      // per-country count/sample cannot miss rows stored in lower case.
      const locationWhere = `${whereSql} AND UPPER(COALESCE(guessed_location, '')) = ?`;

      for (const location of normalizedLocations) {
        let offset = llmSampleOffsets.get(location) || 0;

        const locationArgs = [...args, location];
        const locationCount =
          db
            .prepare(`SELECT COUNT(*) as c FROM raw_jobs WHERE ${locationWhere}`)
            .get(...locationArgs)?.c || 0;

        if (locationCount === 0) {
          console.error(
            `[data-store] 国家 ${location}: raw_jobs 中无数据,跳过`,
          );
          continue;
        }

        // Past the end: one full pass is done, start over.
        if (offset >= locationCount) {
          offset = 0;
          llmSampleOffsets.set(location, 0);
        }

        console.error(
          `[data-store] 国家 ${location}: 共 ${locationCount} 条,从偏移量 ${offset} 开始`,
        );

        for (let batch = 0; batch < maxBatches; batch++) {
          const remaining = locationCount - offset;
          if (remaining <= 0) break;

          const sampleLimit = Math.min(llmSampleSize, remaining);
          const samples = db
            .prepare(
              `
              SELECT * FROM raw_jobs WHERE ${locationWhere}
              ORDER BY created_at DESC
              LIMIT ? OFFSET ?
              `,
            )
            .all(...locationArgs, sampleLimit, offset);

          if (samples.length === 0) break;

          // NOTE(review): scoring uses DEFAULT_TARGET_LOCATIONS rather than
          // the requested `locations` — confirm this is intended.
          const scores = await scoreJobsBatch(samples, DEFAULT_TARGET_LOCATIONS);
          const batchQualified = scores.filter((s) => s.score >= llmMinScore);

          allScores.push(...scores);
          qualifiedCount += batchQualified.length;
          for (const sample of samples) {
            locationById.set(sample.unique_id, location);
          }

          console.error(
            `[data-store] ${location} 第 ${batch + 1} 批: 采样 ${samples.length} 条,本批合格 ${batchQualified.length} 条,累计合格 ${qualifiedCount} 条`,
          );

          offset += samples.length;
          llmSampleOffsets.set(location, offset);

          // Enough qualified rows: stop sampling this country.
          if (qualifiedCount >= llmMinReturn) break;
        }

        // Enough qualified rows: stop sampling the remaining countries too.
        if (qualifiedCount >= llmMinReturn) break;
      }

      // Best scores first, capped at safeLimit.
      const qualified = allScores
        .filter((s) => s.score >= llmMinScore)
        .sort((a, b) => b.score - a.score)
        .slice(0, safeLimit)
        .map((s) => s.uniqueId);

      if (!qualified.length) {
        console.error(
          `[data-store] LLM 打分后无符合条件的任务(阈值: ${llmMinScore},共采样 ${allScores.length} 条)`,
        );
        return {
          moved: 0,
          scored: allScores.length,
          qualified: 0,
          scores: allScores,
        };
      }

      const moved = moveByIds(qualified);
      markStatsDirty();

      // Every moved row sat BEFORE its country's stored offset. Deleting it
      // shifts the remaining rows up, so rewind the offset by the same
      // amount; otherwise the next call would skip that many unscored rows.
      for (const id of new Set(qualified)) {
        const location = locationById.get(id);
        if (location === undefined) continue;
        const current = llmSampleOffsets.get(location) ?? 0;
        llmSampleOffsets.set(location, Math.max(0, current - 1));
      }

      console.error(
        `[data-store] LLM 打分完成: 共采样 ${allScores.length} 条,合格 ${qualified.length} 条,已移动到 jobs`,
      );
      console.error(`[data-store] 偏移量记忆更新: ${formatOffsets()}`);
      return {
        moved,
        scored: allScores.length,
        qualified: qualified.length,
        scores: [...allScores],
      };
    })();
  }

  // Plain mode: pick the ids once, then insert + delete exactly those ids.
  const ids = db
    .prepare(
      `
      SELECT unique_id FROM raw_jobs
      WHERE ${whereSql}
      ORDER BY created_at DESC
      LIMIT ?
      `,
    )
    .all(...args, safeLimit)
    .map((row) => row.unique_id);

  if (ids.length === 0) {
    return { moved: 0 };
  }

  const moved = moveByIds(ids);
  markStatsDirty();
  return { moved };
}
2647
+
2648
+ async function claimNextJob(
2221
2649
  userId,
2222
2650
  expireMs = 5 * 60 * 1000,
2223
2651
  locations = null,
@@ -2493,6 +2921,71 @@ export function createStore(filePath) {
2493
2921
  return claimRow(ranked);
2494
2922
  }
2495
2923
  }
2924
+ // 尝试从 raw_jobs 毛料库补充任务(使用 createStore 时配置的 LLM 打分)
2925
+ // 使用锁防止多个请求同时触发 LLM refill
2926
+ if (refillLock) {
2927
+ // 已有 refill 在进行中,等待完成后重新尝试领取
2928
+ await refillLock;
2929
+ for (const requireVideo of [true, false]) {
2930
+ const pinned = findPinnedPending(requireVideo);
2931
+ if (pinned) {
2932
+ return claimRow(pinned);
2933
+ }
2934
+ const ranked = findPrioritizedPending(requireVideo);
2935
+ if (ranked) {
2936
+ return claimRow(ranked);
2937
+ }
2938
+ }
2939
+ return null;
2940
+ }
2941
+ const refillResult = (async () => {
2942
+ refillLock = Promise.resolve(); // 占位
2943
+ const result = refillJobsFromRaw(
2944
+ normalizedLocations.length ? normalizedLocations : null,
2945
+ 500,
2946
+ refillLlmConfig,
2947
+ );
2948
+ // refillJobsFromRaw 在 LLM 模式下返回 Promise
2949
+ if (result && typeof result.then === "function") {
2950
+ return result.finally(() => {
2951
+ refillLock = null;
2952
+ });
2953
+ }
2954
+ return result;
2955
+ })();
2956
+ if (refillResult && typeof refillResult.then === "function") {
2957
+ const awaited = await refillResult;
2958
+ if (awaited.moved > 0) {
2959
+ console.error(
2960
+ `[data-store] 从 raw_jobs 补充了 ${awaited.moved} 条任务到 jobs`,
2961
+ );
2962
+ for (const requireVideo of [true, false]) {
2963
+ const pinned = findPinnedPending(requireVideo);
2964
+ if (pinned) {
2965
+ return claimRow(pinned);
2966
+ }
2967
+ const ranked = findPrioritizedPending(requireVideo);
2968
+ if (ranked) {
2969
+ return claimRow(ranked);
2970
+ }
2971
+ }
2972
+ }
2973
+ } else if (refillResult.moved > 0) {
2974
+ console.error(
2975
+ `[data-store] 从 raw_jobs 补充了 ${refillResult.moved} 条任务到 jobs`,
2976
+ );
2977
+ for (const requireVideo of [true, false]) {
2978
+ const pinned = findPinnedPending(requireVideo);
2979
+ if (pinned) {
2980
+ return claimRow(pinned);
2981
+ }
2982
+ const ranked = findPrioritizedPending(requireVideo);
2983
+ if (ranked) {
2984
+ return claimRow(ranked);
2985
+ }
2986
+ }
2987
+ }
2988
+
2496
2989
  return null;
2497
2990
  }
2498
2991
 
@@ -3088,6 +3581,7 @@ export function createStore(filePath) {
3088
3581
  "discoveredFollowers",
3089
3582
  "uniqueId",
3090
3583
  "sources",
3584
+ "topRecentVideo", // 单独处理,不进入通用循环
3091
3585
  ];
3092
3586
  for (const key of Object.keys(result)) {
3093
3587
  if (extraFields.includes(key)) continue;
@@ -3099,6 +3593,11 @@ export function createStore(filePath) {
3099
3593
  user[key] = result[key];
3100
3594
  }
3101
3595
  }
3596
+ // 将 topRecentVideo 对象展开为扁平字段
3597
+ if (result.topRecentVideo && typeof result.topRecentVideo === "object") {
3598
+ user.topVideoPlayCount = result.topRecentVideo.playCount || null;
3599
+ user.topVideoHref = result.topRecentVideo.href || null;
3600
+ }
3102
3601
  user.sources = [...new Set([...(user.sources || []), "processed"])];
3103
3602
  }
3104
3603
  if (user.status !== oldStatus) markStatsDirty();
@@ -3340,6 +3839,11 @@ export function createStore(filePath) {
3340
3839
  }
3341
3840
  }
3342
3841
  }
3842
+ // 将 topRecentVideo 对象展开为扁平字段
3843
+ if (result.topRecentVideo && typeof result.topRecentVideo === "object") {
3844
+ user.topVideoPlayCount = result.topRecentVideo.playCount || null;
3845
+ user.topVideoHref = result.topRecentVideo.href || null;
3846
+ }
3343
3847
  const newUsers = processDiscoveredUsers(result);
3344
3848
  const ret = updateJobInfo(uniqueId, user, false);
3345
3849
  if (ret.error) return { saved: false, error: ret.error };
@@ -3360,6 +3864,11 @@ export function createStore(filePath) {
3360
3864
  }
3361
3865
  }
3362
3866
  }
3867
+ // 将 topRecentVideo 对象展开为扁平字段
3868
+ if (result.topRecentVideo && typeof result.topRecentVideo === "object") {
3869
+ user.topVideoPlayCount = result.topRecentVideo.playCount || null;
3870
+ user.topVideoHref = result.topRecentVideo.href || null;
3871
+ }
3363
3872
  const newUsers = processDiscoveredUsers(result);
3364
3873
  return { saved: true, newUsers };
3365
3874
  }
@@ -3966,6 +4475,9 @@ export function createStore(filePath) {
3966
4475
  getStats,
3967
4476
  getStatusGroups,
3968
4477
  markGroupsDirty,
4478
+ refillJobsFromRaw,
4479
+ scoreJobLocation,
4480
+ scoreJobsBatch,
3969
4481
  claimNextJob,
3970
4482
  commitJob,
3971
4483
  commitNewExplore,
@@ -3991,6 +4503,12 @@ export function createStore(filePath) {
3991
4503
  debugClaimNextJob,
3992
4504
  stopBackup,
3993
4505
  rawQuery,
4506
+ getLlmSampleOffsets, // 获取 LLM 采样偏移量状态
3994
4507
  data,
3995
4508
  };
4509
+
4510
// Helper: expose the per-country LLM sampling offsets as a plain object
// snapshot (country code → offset) built from the llmSampleOffsets Map.
function getLlmSampleOffsets() {
  const offsetPairs = [...llmSampleOffsets.entries()];
  return Object.fromEntries(offsetPairs);
}
3996
4514
  }