tt-help-cli-ycl 1.3.99 → 1.3.100

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "tt-help-cli-ycl",
3
- "version": "1.3.99",
3
+ "version": "1.3.100",
4
4
  "description": "TikTok user & video data scraper - extract ttSeller, verified, locationCreated from HTML source",
5
5
  "type": "module",
6
6
  "bin": {
package/src/cli/tag.js CHANGED
@@ -5,6 +5,7 @@ import { fetchTagData, enrichVideosWithLocation } from "../lib/tag-fetcher.js";
5
5
  import { killEdgeProcesses, ensureBrowserReady } from "../lib/browser/cdp.js";
6
6
  import { getOrCreatePage } from "../lib/browser/page.js";
7
7
  import { TikTokScraper } from "../lib/tiktok-scraper.mjs";
8
+ import { CDNBlockedError } from "../lib/parse-ssr.mjs";
8
9
  import {
9
10
  DEFAULT_TARGET_LOCATIONS,
10
11
  isLocationInList,
@@ -42,6 +43,18 @@ function formatMemoryUsage(mem = process.memoryUsage()) {
42
43
  return `rss:${(mem.rss / 1024 / 1024).toFixed(0)}MB heap:${(mem.heapUsed / 1024 / 1024).toFixed(0)}MB ext:${(mem.external / 1024 / 1024).toFixed(0)}MB ab:${(mem.arrayBuffers / 1024 / 1024).toFixed(0)}MB`;
43
44
  }
44
45
 
46
/**
 * Choose how long to pause after the CDN has started rejecting requests.
 *
 * The long cooldown is returned when the caller flags an explicit
 * "too many requests" response, or when the blocked share of the batch is
 * strictly greater than the ratio threshold; otherwise the short cooldown is
 * returned. The option defaults (30 % threshold, 120 s long, 60 s short) are
 * the values that used to be hard-coded, so existing three-argument calls
 * produce the same results as before.
 *
 * @param {number} blockedCount - Number of requests in the batch that were blocked.
 * @param {number} totalCount - Number of requests attempted. Values below 1 are
 *   treated as 1 so the ratio never divides by zero.
 * @param {boolean} [isTooManyRequests=false] - When truthy, skip the ratio check
 *   and return the long cooldown directly.
 * @param {object} [options] - Optional tuning knobs.
 * @param {number} [options.ratioThreshold=0.3] - Blocked ratio above which the
 *   long cooldown applies (strict comparison).
 * @param {number} [options.longCooldownSec=120] - Long cooldown, in seconds.
 * @param {number} [options.shortCooldownSec=60] - Short cooldown, in seconds.
 * @returns {number} Cooldown duration in seconds.
 */
function getCdnCooldownSeconds(
  blockedCount,
  totalCount,
  isTooManyRequests = false,
  { ratioThreshold = 0.3, longCooldownSec = 120, shortCooldownSec = 60 } = {},
) {
  if (isTooManyRequests) return longCooldownSec;

  const blockedRatio = blockedCount / Math.max(totalCount, 1);

  // A non-finite ratio (e.g. missing counts) fails the comparison and therefore
  // falls through to the short cooldown. Note that a fully blocked batch
  // (blockedCount === totalCount) always lands on the long cooldown, with or
  // without the isTooManyRequests flag.
  return blockedRatio > ratioThreshold ? longCooldownSec : shortCooldownSec;
}
51
+
52
/**
 * Wait out a cooldown period, then run the two recycle callbacks in order.
 *
 * @param {number} cooldownSec - Cooldown length in seconds (converted to milliseconds for the timer).
 * @param {Function} recyclePage - Called with no arguments and awaited once the cooldown has elapsed.
 * @param {Function} maybeRecycleForMemory - Called with no arguments and awaited after `recyclePage` settles.
 * @returns {Promise<void>} Resolves after both callbacks have completed; a rejection
 *   from either callback propagates to the caller, and a rejection from
 *   `recyclePage` means `maybeRecycleForMemory` is never invoked.
 */
async function cooldownAndRecycle(cooldownSec, recyclePage, maybeRecycleForMemory) {
  const delayMs = cooldownSec * 1000;
  await new Promise((resolve) => {
    setTimeout(resolve, delayMs);
  });

  // Strictly sequential: the second step only starts after the first resolves.
  for (const step of [recyclePage, maybeRecycleForMemory]) {
    await step();
  }
}
57
+
45
58
  // 构建带客户端追踪 header 的 fetch 封装
46
59
  function buildClientHeaders(clientId, meta, extra = {}) {
47
60
  return {
@@ -906,12 +919,11 @@ export async function handleScoreAll(parsed) {
906
919
  // CDN 限流检测:有拦截则冷却 + 重启 scraper
907
920
  const cdnBlocked = enriched.cdnBlockedCount || 0;
908
921
  if (cdnBlocked > 0) {
909
- const cdnRatio = cdnBlocked / (videos.length || 1);
910
- const coolSec = cdnRatio > 0.3 ? 120 : 60;
922
+ const coolSec = getCdnCooldownSeconds(cdnBlocked, videos.length);
911
923
  log(
912
- ` ⚠️ CDN 限流: ${cdnBlocked}/${videos.length} (${(cdnRatio * 100).toFixed(0)}%),冷却 ${coolSec} 秒后重启 scraper`,
924
+ ` ⚠️ CDN 限流: ${cdnBlocked}/${videos.length} (${((cdnBlocked / Math.max(videos.length, 1)) * 100).toFixed(0)}%),冷却 ${coolSec} 秒后重启 scraper`,
913
925
  );
914
- await new Promise((r) => setTimeout(r, coolSec * 1000));
926
+ await cooldownAndRecycle(coolSec, recyclePage, maybeRecycleForMemory);
915
927
  log(` 正在重启 TikTokScraper...`);
916
928
  await enrichScraper.restart();
917
929
  log(` ✅ TikTokScraper 已重启`);
@@ -961,6 +973,17 @@ export async function handleScoreAll(parsed) {
961
973
  await maybeRecycleForMemory();
962
974
  await randomDelay(3000, 7000);
963
975
  } catch (e) {
976
+ if (e instanceof CDNBlockedError || /HTTP\s+(403|429)/.test(e.message)) {
977
+ log(` ⚠️ CDN 被封: ${e.message}`);
978
+ result.status = "dead";
979
+ result.error = "cdn_blocked";
980
+ await reportToServer(baseUrl, result, clientId, clientMeta);
981
+ totalScored++;
982
+ const cooldownSec = getCdnCooldownSeconds(1, 1, /429/.test(e.message));
983
+ log(` 冷却 ${cooldownSec} 秒后再继续...`);
984
+ await cooldownAndRecycle(cooldownSec, recyclePage, maybeRecycleForMemory);
985
+ continue;
986
+ }
964
987
  // 区分网络错误和业务错误
965
988
  const isNetworkError =
966
989
  e.code === "ECONNREFUSED" ||
@@ -107,6 +107,9 @@ export async function fetchTagData(tag, options = {}) {
107
107
  timeout: 30000,
108
108
  });
109
109
 
110
+ if (resp.status() === 403 || resp.status() === 429) {
111
+ throw new CDNBlockedError(`标签页返回 HTTP ${resp.status()}`);
112
+ }
110
113
  if (resp.status() !== 200) {
111
114
  throw new Error(`标签页返回 HTTP ${resp.status()}`);
112
115
  }
@@ -1160,59 +1160,112 @@ export function createStore(filePath, options = {}) {
1160
1160
  })();
1161
1161
  }
1162
1162
 
1163
- // 常规移动:INSERT + DELETE 事务
1164
- const moveTxn = getDb().transaction(() => {
1165
- getDb()
1166
- .prepare(
1167
- `
1168
- INSERT OR IGNORE INTO jobs (
1169
- unique_id, nickname, status, sources, pinned,
1170
- tt_seller, verified, video_count, comment_count,
1171
- guessed_location, location_created, confirmed_location,
1172
- follower_count, following_count, heart_count,
1173
- created_at, updated_at, region, signature, bio_link, sec_uid,
1174
- status_code, latest_video_time, user_create_time
1175
- )
1176
- SELECT
1177
- unique_id, nickname, 'pending', sources, pinned,
1178
- tt_seller, verified, video_count, comment_count,
1179
- guessed_location, location_created, confirmed_location,
1180
- follower_count, following_count, heart_count,
1181
- created_at, updated_at, region, signature, bio_link, sec_uid,
1182
- status_code, latest_video_time, user_create_time
1183
- FROM raw_jobs
1184
- WHERE ${whereSql}
1163
+ // 常规移动:多国家时先按国家均衡补充,再用全局兜底补齐剩余额度
1164
+ const insertFromRawSql = `
1165
+ INSERT OR IGNORE INTO jobs (
1166
+ unique_id, nickname, status, sources, pinned,
1167
+ tt_seller, verified, video_count, comment_count,
1168
+ guessed_location, location_created, confirmed_location,
1169
+ follower_count, following_count, heart_count,
1170
+ created_at, updated_at, region, signature, bio_link, sec_uid,
1171
+ status_code, latest_video_time, user_create_time
1172
+ )
1173
+ SELECT
1174
+ unique_id, nickname, 'pending', sources, pinned,
1175
+ tt_seller, verified, video_count, comment_count,
1176
+ guessed_location, location_created, confirmed_location,
1177
+ follower_count, following_count, heart_count,
1178
+ created_at, updated_at, region, signature, bio_link, sec_uid,
1179
+ status_code, latest_video_time, user_create_time
1180
+ FROM raw_jobs
1181
+ WHERE __WHERE__
1182
+ ORDER BY
1183
+ CASE WHEN sources LIKE '%tag%' THEN 0 ELSE 1 END,
1184
+ COALESCE(video_count, 0) DESC, created_at DESC
1185
+ LIMIT ?
1186
+ `;
1187
+
1188
+ const deleteFromRawSql = `
1189
+ DELETE FROM raw_jobs
1190
+ WHERE unique_id IN (
1191
+ SELECT unique_id FROM raw_jobs
1192
+ WHERE __WHERE__
1185
1193
  ORDER BY
1186
1194
  CASE WHEN sources LIKE '%tag%' THEN 0 ELSE 1 END,
1187
1195
  COALESCE(video_count, 0) DESC, created_at DESC
1188
1196
  LIMIT ?
1189
- `,
1190
- )
1191
- .run(...args, safeLimit);
1197
+ )
1198
+ `;
1192
1199
 
1193
- // 删除已移动的记录:用子查询匹配刚 INSERT 的 unique_id
1194
- getDb()
1195
- .prepare(
1196
- `
1197
- DELETE FROM raw_jobs
1198
- WHERE unique_id IN (
1199
- SELECT unique_id FROM raw_jobs
1200
- WHERE ${whereSql}
1201
- ORDER BY
1202
- CASE WHEN sources LIKE '%tag%' THEN 0 ELSE 1 END,
1203
- COALESCE(video_count, 0) DESC, created_at DESC
1204
- LIMIT ?
1205
- )
1206
- `,
1207
- )
1208
- .run(...args, safeLimit);
1200
+ const uniqueLocations = normalizedLocations
1201
+ ? Array.from(new Set(normalizedLocations))
1202
+ : [];
1203
+ const shouldBalanceByCountry = uniqueLocations.length > 1;
1204
+
1205
+ const moveTxn = getDb().transaction(() => {
1206
+ let moved = 0;
1207
+ const movedByCountry = {};
1208
+
1209
+ if (shouldBalanceByCountry) {
1210
+ const baseQuota = Math.floor(safeLimit / uniqueLocations.length);
1211
+ const remainder = safeLimit % uniqueLocations.length;
1212
+
1213
+ for (let i = 0; i < uniqueLocations.length; i++) {
1214
+ if (moved >= safeLimit) break;
1215
+
1216
+ const location = uniqueLocations[i];
1217
+ const quota = baseQuota + (i < remainder ? 1 : 0);
1218
+ const currentLimit = Math.max(0, Math.min(quota, safeLimit - moved));
1219
+ if (!currentLimit) continue;
1220
+
1221
+ const locationWhere = `${whereSql} AND UPPER(COALESCE(guessed_location, '')) = ?`;
1222
+ const locationArgs = [...args, location];
1223
+
1224
+ getDb()
1225
+ .prepare(insertFromRawSql.replace("__WHERE__", locationWhere))
1226
+ .run(...locationArgs, currentLimit);
1227
+ const del = getDb()
1228
+ .prepare(deleteFromRawSql.replace("__WHERE__", locationWhere))
1229
+ .run(...locationArgs, currentLimit);
1230
+
1231
+ const movedThisCountry = del?.changes || 0;
1232
+ moved += movedThisCountry;
1233
+ movedByCountry[location] = movedThisCountry;
1234
+ }
1235
+
1236
+ // 某些国家库存不足时,用全局查询补齐剩余额度(仍受 whereSql 国家范围约束)
1237
+ const remaining = safeLimit - moved;
1238
+ if (remaining > 0) {
1239
+ getDb()
1240
+ .prepare(insertFromRawSql.replace("__WHERE__", whereSql))
1241
+ .run(...args, remaining);
1242
+ const del = getDb()
1243
+ .prepare(deleteFromRawSql.replace("__WHERE__", whereSql))
1244
+ .run(...args, remaining);
1245
+ moved += del?.changes || 0;
1246
+ }
1247
+
1248
+ console.error(
1249
+ `[data-store] refill 国家均衡: ${uniqueLocations
1250
+ .map((loc) => `${loc}:${movedByCountry[loc] || 0}`)
1251
+ .join(", ")} | total=${moved}`,
1252
+ );
1253
+ } else {
1254
+ getDb()
1255
+ .prepare(insertFromRawSql.replace("__WHERE__", whereSql))
1256
+ .run(...args, safeLimit);
1257
+ const del = getDb()
1258
+ .prepare(deleteFromRawSql.replace("__WHERE__", whereSql))
1259
+ .run(...args, safeLimit);
1260
+ moved = del?.changes || 0;
1261
+ }
1262
+
1263
+ return moved;
1209
1264
  });
1210
1265
 
1211
- moveTxn();
1266
+ const moved = moveTxn();
1212
1267
  markStatsDirty();
1213
-
1214
- const actualMoved = Math.min(count, safeLimit);
1215
- return { moved: actualMoved };
1268
+ return { moved };
1216
1269
  }
1217
1270
 
1218
1271
  async function claimNextJob(
@@ -1512,8 +1565,7 @@ export function createStore(filePath, options = {}) {
1512
1565
  }
1513
1566
  return null;
1514
1567
  }
1515
- const refillResult = (async () => {
1516
- refillLock = Promise.resolve(); // 占位
1568
+ const refillPromise = (async () => {
1517
1569
  const result = refillJobsFromRaw(
1518
1570
  normalizedLocations.length ? normalizedLocations : null,
1519
1571
  500,
@@ -1521,30 +1573,17 @@ export function createStore(filePath, options = {}) {
1521
1573
  );
1522
1574
  // refillJobsFromRaw 在 LLM 模式下返回 Promise
1523
1575
  if (result && typeof result.then === "function") {
1524
- return result.finally(() => {
1525
- refillLock = null;
1526
- });
1576
+ return await result;
1527
1577
  }
1528
1578
  return result;
1529
1579
  })();
1530
- if (refillResult && typeof refillResult.then === "function") {
1531
- const awaited = await refillResult;
1532
- if (awaited.moved > 0) {
1533
- console.error(
1534
- `[data-store] 从 raw_jobs 补充了 ${awaited.moved} 条任务到 jobs`,
1535
- );
1536
- for (const requireVideo of [true, false]) {
1537
- const pinned = findPinnedPending(requireVideo);
1538
- if (pinned) {
1539
- return claimRow(pinned);
1540
- }
1541
- const ranked = findPrioritizedPending(requireVideo);
1542
- if (ranked) {
1543
- return claimRow(ranked);
1544
- }
1545
- }
1546
- }
1547
- } else if (refillResult.moved > 0) {
1580
+ // 让并发请求等待同一个 refill,并且无论成功/失败都释放锁
1581
+ refillLock = refillPromise.finally(() => {
1582
+ refillLock = null;
1583
+ });
1584
+
1585
+ const refillResult = await refillLock;
1586
+ if (refillResult.moved > 0) {
1548
1587
  console.error(
1549
1588
  `[data-store] 从 raw_jobs 补充了 ${refillResult.moved} 条任务到 jobs`,
1550
1589
  );