@tikomni/skills 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (451)
  1. package/.skill-package-allowlist.txt +4 -0
  2. package/LICENSE +21 -0
  3. package/README.md +167 -0
  4. package/README.zh-CN.md +167 -0
  5. package/bin/tikomni-skills.js +127 -0
  6. package/env.example +160 -0
  7. package/lib/installer.js +176 -0
  8. package/package.json +44 -0
  9. package/skills/creator-analysis/SKILL.md +71 -0
  10. package/skills/creator-analysis/agents/openai.yaml +4 -0
  11. package/skills/creator-analysis/env.example +36 -0
  12. package/skills/creator-analysis/references/api-capability-index.md +92 -0
  13. package/skills/creator-analysis/references/api-contracts/asr-api.md +130 -0
  14. package/skills/creator-analysis/references/api-contracts/bilibili-app-api.md +776 -0
  15. package/skills/creator-analysis/references/api-contracts/bilibili-web-api.md +2017 -0
  16. package/skills/creator-analysis/references/api-contracts/demo-api.md +717 -0
  17. package/skills/creator-analysis/references/api-contracts/douyin-app-v3-api.md +3594 -0
  18. package/skills/creator-analysis/references/api-contracts/douyin-billboard-api.md +2274 -0
  19. package/skills/creator-analysis/references/api-contracts/douyin-creator-api.md +1575 -0
  20. package/skills/creator-analysis/references/api-contracts/douyin-creator-v2-api.md +3254 -0
  21. package/skills/creator-analysis/references/api-contracts/douyin-search-api.md +4118 -0
  22. package/skills/creator-analysis/references/api-contracts/douyin-web-api.md +5544 -0
  23. package/skills/creator-analysis/references/api-contracts/douyin-xingtu-api.md +1916 -0
  24. package/skills/creator-analysis/references/api-contracts/douyin-xingtu-v2-api.md +1540 -0
  25. package/skills/creator-analysis/references/api-contracts/health-check.md +69 -0
  26. package/skills/creator-analysis/references/api-contracts/hybrid-parsing.md +78 -0
  27. package/skills/creator-analysis/references/api-contracts/instagram-v1-api.md +2256 -0
  28. package/skills/creator-analysis/references/api-contracts/instagram-v2-api.md +2011 -0
  29. package/skills/creator-analysis/references/api-contracts/instagram-v3-api.md +2630 -0
  30. package/skills/creator-analysis/references/api-contracts/ios-shortcut.md +44 -0
  31. package/skills/creator-analysis/references/api-contracts/kuaishou-app-api.md +1518 -0
  32. package/skills/creator-analysis/references/api-contracts/kuaishou-web-api.md +1242 -0
  33. package/skills/creator-analysis/references/api-contracts/lemon8-app-api.md +1088 -0
  34. package/skills/creator-analysis/references/api-contracts/linkedin-web-api.md +1949 -0
  35. package/skills/creator-analysis/references/api-contracts/media-ingest-api.md +126 -0
  36. package/skills/creator-analysis/references/api-contracts/pipixia-app-api.md +1142 -0
  37. package/skills/creator-analysis/references/api-contracts/reddit-app-api.md +2025 -0
  38. package/skills/creator-analysis/references/api-contracts/sora2-api.md +2266 -0
  39. package/skills/creator-analysis/references/api-contracts/temp-mail-api.md +208 -0
  40. package/skills/creator-analysis/references/api-contracts/threads-web-api.md +897 -0
  41. package/skills/creator-analysis/references/api-contracts/tikhub-downloader-api.md +134 -0
  42. package/skills/creator-analysis/references/api-contracts/tikhub-user-api.md +494 -0
  43. package/skills/creator-analysis/references/api-contracts/tiktok-ads-api.md +5947 -0
  44. package/skills/creator-analysis/references/api-contracts/tiktok-analytics-api.md +968 -0
  45. package/skills/creator-analysis/references/api-contracts/tiktok-app-v3-api.md +5735 -0
  46. package/skills/creator-analysis/references/api-contracts/tiktok-creator-api.md +1951 -0
  47. package/skills/creator-analysis/references/api-contracts/tiktok-interaction-api.md +742 -0
  48. package/skills/creator-analysis/references/api-contracts/tiktok-shop-web-api.md +1890 -0
  49. package/skills/creator-analysis/references/api-contracts/tiktok-web-api.md +4448 -0
  50. package/skills/creator-analysis/references/api-contracts/toutiao-app-api.md +342 -0
  51. package/skills/creator-analysis/references/api-contracts/toutiao-web-api.md +143 -0
  52. package/skills/creator-analysis/references/api-contracts/twitter-web-api.md +989 -0
  53. package/skills/creator-analysis/references/api-contracts/wechat-channels-api.md +809 -0
  54. package/skills/creator-analysis/references/api-contracts/wechat-media-platform-web-api.md +677 -0
  55. package/skills/creator-analysis/references/api-contracts/weibo-app-api.md +1547 -0
  56. package/skills/creator-analysis/references/api-contracts/weibo-web-api.md +798 -0
  57. package/skills/creator-analysis/references/api-contracts/weibo-web-v2-api.md +2459 -0
  58. package/skills/creator-analysis/references/api-contracts/xiaohongshu-app-api.md +1291 -0
  59. package/skills/creator-analysis/references/api-contracts/xiaohongshu-app-v2-api.md +1683 -0
  60. package/skills/creator-analysis/references/api-contracts/xiaohongshu-web-api.md +1324 -0
  61. package/skills/creator-analysis/references/api-contracts/xiaohongshu-web-v2-api.md +1209 -0
  62. package/skills/creator-analysis/references/api-contracts/xigua-app-v2-api.md +489 -0
  63. package/skills/creator-analysis/references/api-contracts/youtube-web-api.md +2636 -0
  64. package/skills/creator-analysis/references/api-contracts/youtube-web-v2-api.md +2660 -0
  65. package/skills/creator-analysis/references/api-contracts/zhihu-web-api.md +2315 -0
  66. package/skills/creator-analysis/references/api-tags/asr-api.md +100 -0
  67. package/skills/creator-analysis/references/api-tags/bilibili-app-api.md +482 -0
  68. package/skills/creator-analysis/references/api-tags/bilibili-web-api.md +1267 -0
  69. package/skills/creator-analysis/references/api-tags/demo-api.md +365 -0
  70. package/skills/creator-analysis/references/api-tags/douyin-app-v3-api.md +2012 -0
  71. package/skills/creator-analysis/references/api-tags/douyin-billboard-api.md +1428 -0
  72. package/skills/creator-analysis/references/api-tags/douyin-creator-api.md +694 -0
  73. package/skills/creator-analysis/references/api-tags/douyin-creator-v2-api.md +694 -0
  74. package/skills/creator-analysis/references/api-tags/douyin-search-api.md +1059 -0
  75. package/skills/creator-analysis/references/api-tags/douyin-web-api.md +3314 -0
  76. package/skills/creator-analysis/references/api-tags/douyin-xingtu-api.md +935 -0
  77. package/skills/creator-analysis/references/api-tags/douyin-xingtu-v2-api.md +925 -0
  78. package/skills/creator-analysis/references/api-tags/health-check.md +40 -0
  79. package/skills/creator-analysis/references/api-tags/hybrid-parsing.md +57 -0
  80. package/skills/creator-analysis/references/api-tags/instagram-v1-api.md +1224 -0
  81. package/skills/creator-analysis/references/api-tags/instagram-v2-api.md +1147 -0
  82. package/skills/creator-analysis/references/api-tags/instagram-v3-api.md +1123 -0
  83. package/skills/creator-analysis/references/api-tags/ios-shortcut.md +45 -0
  84. package/skills/creator-analysis/references/api-tags/kuaishou-app-api.md +846 -0
  85. package/skills/creator-analysis/references/api-tags/kuaishou-web-api.md +551 -0
  86. package/skills/creator-analysis/references/api-tags/lemon8-app-api.md +687 -0
  87. package/skills/creator-analysis/references/api-tags/linkedin-web-api.md +1105 -0
  88. package/skills/creator-analysis/references/api-tags/media-ingest-api.md +112 -0
  89. package/skills/creator-analysis/references/api-tags/pipixia-app-api.md +721 -0
  90. package/skills/creator-analysis/references/api-tags/reddit-app-api.md +1057 -0
  91. package/skills/creator-analysis/references/api-tags/sora2-api.md +737 -0
  92. package/skills/creator-analysis/references/api-tags/temp-mail-api.md +136 -0
  93. package/skills/creator-analysis/references/api-tags/threads-web-api.md +472 -0
  94. package/skills/creator-analysis/references/api-tags/tikhub-downloader-api.md +65 -0
  95. package/skills/creator-analysis/references/api-tags/tikhub-user-api.md +253 -0
  96. package/skills/creator-analysis/references/api-tags/tiktok-ads-api.md +1393 -0
  97. package/skills/creator-analysis/references/api-tags/tiktok-analytics-api.md +179 -0
  98. package/skills/creator-analysis/references/api-tags/tiktok-app-v3-api.md +3264 -0
  99. package/skills/creator-analysis/references/api-tags/tiktok-creator-api.md +709 -0
  100. package/skills/creator-analysis/references/api-tags/tiktok-interaction-api.md +366 -0
  101. package/skills/creator-analysis/references/api-tags/tiktok-shop-web-api.md +663 -0
  102. package/skills/creator-analysis/references/api-tags/tiktok-web-api.md +2516 -0
  103. package/skills/creator-analysis/references/api-tags/toutiao-app-api.md +220 -0
  104. package/skills/creator-analysis/references/api-tags/toutiao-web-api.md +96 -0
  105. package/skills/creator-analysis/references/api-tags/twitter-web-api.md +562 -0
  106. package/skills/creator-analysis/references/api-tags/wechat-channels-api.md +405 -0
  107. package/skills/creator-analysis/references/api-tags/wechat-media-platform-web-api.md +431 -0
  108. package/skills/creator-analysis/references/api-tags/weibo-app-api.md +851 -0
  109. package/skills/creator-analysis/references/api-tags/weibo-web-api.md +470 -0
  110. package/skills/creator-analysis/references/api-tags/weibo-web-v2-api.md +1405 -0
  111. package/skills/creator-analysis/references/api-tags/xiaohongshu-app-api.md +534 -0
  112. package/skills/creator-analysis/references/api-tags/xiaohongshu-app-v2-api.md +934 -0
  113. package/skills/creator-analysis/references/api-tags/xiaohongshu-web-api.md +757 -0
  114. package/skills/creator-analysis/references/api-tags/xiaohongshu-web-v2-api.md +762 -0
  115. package/skills/creator-analysis/references/api-tags/xigua-app-v2-api.md +308 -0
  116. package/skills/creator-analysis/references/api-tags/youtube-web-api.md +934 -0
  117. package/skills/creator-analysis/references/api-tags/youtube-web-v2-api.md +717 -0
  118. package/skills/creator-analysis/references/api-tags/zhihu-web-api.md +1384 -0
  119. package/skills/creator-analysis/references/asr-orchestration.md +33 -0
  120. package/skills/creator-analysis/references/config-templates/defaults.yaml +60 -0
  121. package/skills/creator-analysis/references/contracts/creator-card-fields.md +23 -0
  122. package/skills/creator-analysis/references/contracts/work-card-fields.md +32 -0
  123. package/skills/creator-analysis/references/platform-guides/douyin.md +49 -0
  124. package/skills/creator-analysis/references/platform-guides/generic.md +46 -0
  125. package/skills/creator-analysis/references/platform-guides/xiaohongshu.md +54 -0
  126. package/skills/creator-analysis/references/prompt-contracts/asr-clean.md +28 -0
  127. package/skills/creator-analysis/references/prompt-contracts/author-analysis-v2.md +46 -0
  128. package/skills/creator-analysis/references/prompt-contracts/author-analysis.md +49 -0
  129. package/skills/creator-analysis/references/prompt-contracts/cta.md +24 -0
  130. package/skills/creator-analysis/references/prompt-contracts/hook.md +25 -0
  131. package/skills/creator-analysis/references/prompt-contracts/insight.md +47 -0
  132. package/skills/creator-analysis/references/prompt-contracts/sampled-work-batch-explanations.md +30 -0
  133. package/skills/creator-analysis/references/prompt-contracts/structure.md +25 -0
  134. package/skills/creator-analysis/references/prompt-contracts/style.md +27 -0
  135. package/skills/creator-analysis/references/prompt-contracts/summary.md +29 -0
  136. package/skills/creator-analysis/references/prompt-contracts/topic.md +29 -0
  137. package/skills/creator-analysis/references/schemas/author-analysis-input-v1.schema.json +325 -0
  138. package/skills/creator-analysis/references/schemas/author-analysis-v2.schema.json +158 -0
  139. package/skills/creator-analysis/references/schemas/sampled-work-batch-explanations.schema.json +41 -0
  140. package/skills/creator-analysis/references/service-guides/asr-u2-u3-fallback.md +75 -0
  141. package/skills/creator-analysis/references/workflow.md +18 -0
  142. package/skills/creator-analysis/scripts/__init__.py +0 -0
  143. package/skills/creator-analysis/scripts/author_home/__init__.py +0 -0
  144. package/skills/creator-analysis/scripts/author_home/adapters/__init__.py +0 -0
  145. package/skills/creator-analysis/scripts/author_home/adapters/platform_adapters.py +299 -0
  146. package/skills/creator-analysis/scripts/author_home/analyzers/__init__.py +0 -0
  147. package/skills/creator-analysis/scripts/author_home/analyzers/author_analysis_v2_support.py +1122 -0
  148. package/skills/creator-analysis/scripts/author_home/analyzers/prompt_first_analyzers.py +260 -0
  149. package/skills/creator-analysis/scripts/author_home/analyzers/sampled_work_batch_explainer.py +260 -0
  150. package/skills/creator-analysis/scripts/author_home/asr/__init__.py +5 -0
  151. package/skills/creator-analysis/scripts/author_home/asr/home_asr.py +961 -0
  152. package/skills/creator-analysis/scripts/author_home/builders/__init__.py +0 -0
  153. package/skills/creator-analysis/scripts/author_home/builders/home_builders.py +149 -0
  154. package/skills/creator-analysis/scripts/author_home/collectors/__init__.py +0 -0
  155. package/skills/creator-analysis/scripts/author_home/collectors/homepage_collectors.py +636 -0
  156. package/skills/creator-analysis/scripts/author_home/orchestrator/__init__.py +0 -0
  157. package/skills/creator-analysis/scripts/author_home/orchestrator/run_author_analysis.py +491 -0
  158. package/skills/creator-analysis/scripts/author_home/orchestrator/work_analysis_artifacts.py +553 -0
  159. package/skills/creator-analysis/scripts/author_home/schema.py +417 -0
  160. package/skills/creator-analysis/scripts/core/__init__.py +0 -0
  161. package/skills/creator-analysis/scripts/core/analysis_pipeline.py +133 -0
  162. package/skills/creator-analysis/scripts/core/bootstrap_env.py +35 -0
  163. package/skills/creator-analysis/scripts/core/config_loader.py +418 -0
  164. package/skills/creator-analysis/scripts/core/extract_pipeline.py +173 -0
  165. package/skills/creator-analysis/scripts/core/progress_report.py +111 -0
  166. package/skills/creator-analysis/scripts/core/storage_router.py +253 -0
  167. package/skills/creator-analysis/scripts/core/tikomni_common.py +588 -0
  168. package/skills/creator-analysis/scripts/pipeline/__init__.py +0 -0
  169. package/skills/creator-analysis/scripts/pipeline/asr/__init__.py +0 -0
  170. package/skills/creator-analysis/scripts/pipeline/asr/asr_pipeline.py +1189 -0
  171. package/skills/creator-analysis/scripts/pipeline/asr/poll_u2_task.py +95 -0
  172. package/skills/creator-analysis/scripts/platform/__init__.py +0 -0
  173. package/skills/creator-analysis/scripts/platform/douyin/__init__.py +0 -0
  174. package/skills/creator-analysis/scripts/platform/douyin/douyin_video_type_matrix.py +224 -0
  175. package/skills/creator-analysis/scripts/platform/douyin/run_douyin_single_video.py +1208 -0
  176. package/skills/creator-analysis/scripts/platform/douyin/select_low_quality_video_url.py +200 -0
  177. package/skills/creator-analysis/scripts/platform/xiaohongshu/__init__.py +0 -0
  178. package/skills/creator-analysis/scripts/platform/xiaohongshu/run_xiaohongshu_extract.py +2128 -0
  179. package/skills/creator-analysis/scripts/writers/__init__.py +0 -0
  180. package/skills/creator-analysis/scripts/writers/write_author_homepage_samples.py +106 -0
  181. package/skills/creator-analysis/scripts/writers/write_benchmark_card.py +1402 -0
  182. package/skills/meta-capability/SKILL.md +69 -0
  183. package/skills/meta-capability/agents/openai.yaml +4 -0
  184. package/skills/meta-capability/env.example +42 -0
  185. package/skills/meta-capability/references/api-capability-index.md +92 -0
  186. package/skills/meta-capability/references/api-contracts/asr-api.md +130 -0
  187. package/skills/meta-capability/references/api-contracts/bilibili-app-api.md +776 -0
  188. package/skills/meta-capability/references/api-contracts/bilibili-web-api.md +2017 -0
  189. package/skills/meta-capability/references/api-contracts/demo-api.md +717 -0
  190. package/skills/meta-capability/references/api-contracts/douyin-app-v3-api.md +3594 -0
  191. package/skills/meta-capability/references/api-contracts/douyin-billboard-api.md +2274 -0
  192. package/skills/meta-capability/references/api-contracts/douyin-creator-api.md +1575 -0
  193. package/skills/meta-capability/references/api-contracts/douyin-creator-v2-api.md +3254 -0
  194. package/skills/meta-capability/references/api-contracts/douyin-search-api.md +4118 -0
  195. package/skills/meta-capability/references/api-contracts/douyin-web-api.md +5544 -0
  196. package/skills/meta-capability/references/api-contracts/douyin-xingtu-api.md +1916 -0
  197. package/skills/meta-capability/references/api-contracts/douyin-xingtu-v2-api.md +1540 -0
  198. package/skills/meta-capability/references/api-contracts/health-check.md +69 -0
  199. package/skills/meta-capability/references/api-contracts/hybrid-parsing.md +78 -0
  200. package/skills/meta-capability/references/api-contracts/instagram-v1-api.md +2256 -0
  201. package/skills/meta-capability/references/api-contracts/instagram-v2-api.md +2011 -0
  202. package/skills/meta-capability/references/api-contracts/instagram-v3-api.md +2630 -0
  203. package/skills/meta-capability/references/api-contracts/ios-shortcut.md +44 -0
  204. package/skills/meta-capability/references/api-contracts/kuaishou-app-api.md +1518 -0
  205. package/skills/meta-capability/references/api-contracts/kuaishou-web-api.md +1242 -0
  206. package/skills/meta-capability/references/api-contracts/lemon8-app-api.md +1088 -0
  207. package/skills/meta-capability/references/api-contracts/linkedin-web-api.md +1949 -0
  208. package/skills/meta-capability/references/api-contracts/media-ingest-api.md +126 -0
  209. package/skills/meta-capability/references/api-contracts/pipixia-app-api.md +1142 -0
  210. package/skills/meta-capability/references/api-contracts/reddit-app-api.md +2025 -0
  211. package/skills/meta-capability/references/api-contracts/sora2-api.md +2266 -0
  212. package/skills/meta-capability/references/api-contracts/temp-mail-api.md +208 -0
  213. package/skills/meta-capability/references/api-contracts/threads-web-api.md +897 -0
  214. package/skills/meta-capability/references/api-contracts/tikhub-downloader-api.md +134 -0
  215. package/skills/meta-capability/references/api-contracts/tikhub-user-api.md +494 -0
  216. package/skills/meta-capability/references/api-contracts/tiktok-ads-api.md +5947 -0
  217. package/skills/meta-capability/references/api-contracts/tiktok-analytics-api.md +968 -0
  218. package/skills/meta-capability/references/api-contracts/tiktok-app-v3-api.md +5735 -0
  219. package/skills/meta-capability/references/api-contracts/tiktok-creator-api.md +1951 -0
  220. package/skills/meta-capability/references/api-contracts/tiktok-interaction-api.md +742 -0
  221. package/skills/meta-capability/references/api-contracts/tiktok-shop-web-api.md +1890 -0
  222. package/skills/meta-capability/references/api-contracts/tiktok-web-api.md +4448 -0
  223. package/skills/meta-capability/references/api-contracts/toutiao-app-api.md +342 -0
  224. package/skills/meta-capability/references/api-contracts/toutiao-web-api.md +143 -0
  225. package/skills/meta-capability/references/api-contracts/twitter-web-api.md +989 -0
  226. package/skills/meta-capability/references/api-contracts/wechat-channels-api.md +809 -0
  227. package/skills/meta-capability/references/api-contracts/wechat-media-platform-web-api.md +677 -0
  228. package/skills/meta-capability/references/api-contracts/weibo-app-api.md +1547 -0
  229. package/skills/meta-capability/references/api-contracts/weibo-web-api.md +798 -0
  230. package/skills/meta-capability/references/api-contracts/weibo-web-v2-api.md +2459 -0
  231. package/skills/meta-capability/references/api-contracts/xiaohongshu-app-api.md +1291 -0
  232. package/skills/meta-capability/references/api-contracts/xiaohongshu-app-v2-api.md +1683 -0
  233. package/skills/meta-capability/references/api-contracts/xiaohongshu-web-api.md +1324 -0
  234. package/skills/meta-capability/references/api-contracts/xiaohongshu-web-v2-api.md +1209 -0
  235. package/skills/meta-capability/references/api-contracts/xigua-app-v2-api.md +489 -0
  236. package/skills/meta-capability/references/api-contracts/youtube-web-api.md +2636 -0
  237. package/skills/meta-capability/references/api-contracts/youtube-web-v2-api.md +2660 -0
  238. package/skills/meta-capability/references/api-contracts/zhihu-web-api.md +2315 -0
  239. package/skills/meta-capability/references/api-tags/asr-api.md +100 -0
  240. package/skills/meta-capability/references/api-tags/bilibili-app-api.md +482 -0
  241. package/skills/meta-capability/references/api-tags/bilibili-web-api.md +1267 -0
  242. package/skills/meta-capability/references/api-tags/demo-api.md +365 -0
  243. package/skills/meta-capability/references/api-tags/douyin-app-v3-api.md +2012 -0
  244. package/skills/meta-capability/references/api-tags/douyin-billboard-api.md +1428 -0
  245. package/skills/meta-capability/references/api-tags/douyin-creator-api.md +694 -0
  246. package/skills/meta-capability/references/api-tags/douyin-creator-v2-api.md +694 -0
  247. package/skills/meta-capability/references/api-tags/douyin-search-api.md +1059 -0
  248. package/skills/meta-capability/references/api-tags/douyin-web-api.md +3314 -0
  249. package/skills/meta-capability/references/api-tags/douyin-xingtu-api.md +935 -0
  250. package/skills/meta-capability/references/api-tags/douyin-xingtu-v2-api.md +925 -0
  251. package/skills/meta-capability/references/api-tags/health-check.md +40 -0
  252. package/skills/meta-capability/references/api-tags/hybrid-parsing.md +57 -0
  253. package/skills/meta-capability/references/api-tags/instagram-v1-api.md +1224 -0
  254. package/skills/meta-capability/references/api-tags/instagram-v2-api.md +1147 -0
  255. package/skills/meta-capability/references/api-tags/instagram-v3-api.md +1123 -0
  256. package/skills/meta-capability/references/api-tags/ios-shortcut.md +45 -0
  257. package/skills/meta-capability/references/api-tags/kuaishou-app-api.md +846 -0
  258. package/skills/meta-capability/references/api-tags/kuaishou-web-api.md +551 -0
  259. package/skills/meta-capability/references/api-tags/lemon8-app-api.md +687 -0
  260. package/skills/meta-capability/references/api-tags/linkedin-web-api.md +1105 -0
  261. package/skills/meta-capability/references/api-tags/media-ingest-api.md +112 -0
  262. package/skills/meta-capability/references/api-tags/pipixia-app-api.md +721 -0
  263. package/skills/meta-capability/references/api-tags/reddit-app-api.md +1057 -0
  264. package/skills/meta-capability/references/api-tags/sora2-api.md +737 -0
  265. package/skills/meta-capability/references/api-tags/temp-mail-api.md +136 -0
  266. package/skills/meta-capability/references/api-tags/threads-web-api.md +472 -0
  267. package/skills/meta-capability/references/api-tags/tikhub-downloader-api.md +65 -0
  268. package/skills/meta-capability/references/api-tags/tikhub-user-api.md +253 -0
  269. package/skills/meta-capability/references/api-tags/tiktok-ads-api.md +1393 -0
  270. package/skills/meta-capability/references/api-tags/tiktok-analytics-api.md +179 -0
  271. package/skills/meta-capability/references/api-tags/tiktok-app-v3-api.md +3264 -0
  272. package/skills/meta-capability/references/api-tags/tiktok-creator-api.md +709 -0
  273. package/skills/meta-capability/references/api-tags/tiktok-interaction-api.md +366 -0
  274. package/skills/meta-capability/references/api-tags/tiktok-shop-web-api.md +663 -0
  275. package/skills/meta-capability/references/api-tags/tiktok-web-api.md +2516 -0
  276. package/skills/meta-capability/references/api-tags/toutiao-app-api.md +220 -0
  277. package/skills/meta-capability/references/api-tags/toutiao-web-api.md +96 -0
  278. package/skills/meta-capability/references/api-tags/twitter-web-api.md +562 -0
  279. package/skills/meta-capability/references/api-tags/wechat-channels-api.md +405 -0
  280. package/skills/meta-capability/references/api-tags/wechat-media-platform-web-api.md +431 -0
  281. package/skills/meta-capability/references/api-tags/weibo-app-api.md +851 -0
  282. package/skills/meta-capability/references/api-tags/weibo-web-api.md +470 -0
  283. package/skills/meta-capability/references/api-tags/weibo-web-v2-api.md +1405 -0
  284. package/skills/meta-capability/references/api-tags/xiaohongshu-app-api.md +534 -0
  285. package/skills/meta-capability/references/api-tags/xiaohongshu-app-v2-api.md +934 -0
  286. package/skills/meta-capability/references/api-tags/xiaohongshu-web-api.md +757 -0
  287. package/skills/meta-capability/references/api-tags/xiaohongshu-web-v2-api.md +762 -0
  288. package/skills/meta-capability/references/api-tags/xigua-app-v2-api.md +308 -0
  289. package/skills/meta-capability/references/api-tags/youtube-web-api.md +934 -0
  290. package/skills/meta-capability/references/api-tags/youtube-web-v2-api.md +717 -0
  291. package/skills/meta-capability/references/api-tags/zhihu-web-api.md +1384 -0
  292. package/skills/meta-capability/references/config-templates/defaults.yaml +18 -0
  293. package/skills/meta-capability/references/dispatch.md +27 -0
  294. package/skills/meta-capability/references/execution-guidelines.md +25 -0
  295. package/skills/meta-capability/references/implemented-route-map.md +177 -0
  296. package/skills/meta-capability/references/service-guides/asr-u2-u3-fallback.md +75 -0
  297. package/skills/meta-capability/scripts/__init__.py +1 -0
  298. package/skills/meta-capability/scripts/call_route.py +141 -0
  299. package/skills/meta-capability/scripts/core/__init__.py +1 -0
  300. package/skills/meta-capability/scripts/core/bootstrap_env.py +32 -0
  301. package/skills/meta-capability/scripts/core/config_loader.py +204 -0
  302. package/skills/meta-capability/scripts/core/tikomni_common.py +443 -0
  303. package/skills/meta-capability/scripts/test_auth.py +98 -0
  304. package/skills/single-work-analysis/SKILL.md +62 -0
  305. package/skills/single-work-analysis/agents/openai.yaml +4 -0
  306. package/skills/single-work-analysis/env.example +36 -0
  307. package/skills/single-work-analysis/references/api-capability-index.md +92 -0
  308. package/skills/single-work-analysis/references/api-contracts/asr-api.md +130 -0
  309. package/skills/single-work-analysis/references/api-contracts/bilibili-app-api.md +776 -0
  310. package/skills/single-work-analysis/references/api-contracts/bilibili-web-api.md +2017 -0
  311. package/skills/single-work-analysis/references/api-contracts/demo-api.md +717 -0
  312. package/skills/single-work-analysis/references/api-contracts/douyin-app-v3-api.md +3594 -0
  313. package/skills/single-work-analysis/references/api-contracts/douyin-billboard-api.md +2274 -0
  314. package/skills/single-work-analysis/references/api-contracts/douyin-creator-api.md +1575 -0
  315. package/skills/single-work-analysis/references/api-contracts/douyin-creator-v2-api.md +3254 -0
  316. package/skills/single-work-analysis/references/api-contracts/douyin-search-api.md +4118 -0
  317. package/skills/single-work-analysis/references/api-contracts/douyin-web-api.md +5544 -0
  318. package/skills/single-work-analysis/references/api-contracts/douyin-xingtu-api.md +1916 -0
  319. package/skills/single-work-analysis/references/api-contracts/douyin-xingtu-v2-api.md +1540 -0
  320. package/skills/single-work-analysis/references/api-contracts/health-check.md +69 -0
  321. package/skills/single-work-analysis/references/api-contracts/hybrid-parsing.md +78 -0
  322. package/skills/single-work-analysis/references/api-contracts/instagram-v1-api.md +2256 -0
  323. package/skills/single-work-analysis/references/api-contracts/instagram-v2-api.md +2011 -0
  324. package/skills/single-work-analysis/references/api-contracts/instagram-v3-api.md +2630 -0
  325. package/skills/single-work-analysis/references/api-contracts/ios-shortcut.md +44 -0
  326. package/skills/single-work-analysis/references/api-contracts/kuaishou-app-api.md +1518 -0
  327. package/skills/single-work-analysis/references/api-contracts/kuaishou-web-api.md +1242 -0
  328. package/skills/single-work-analysis/references/api-contracts/lemon8-app-api.md +1088 -0
  329. package/skills/single-work-analysis/references/api-contracts/linkedin-web-api.md +1949 -0
  330. package/skills/single-work-analysis/references/api-contracts/media-ingest-api.md +126 -0
  331. package/skills/single-work-analysis/references/api-contracts/pipixia-app-api.md +1142 -0
  332. package/skills/single-work-analysis/references/api-contracts/reddit-app-api.md +2025 -0
  333. package/skills/single-work-analysis/references/api-contracts/sora2-api.md +2266 -0
  334. package/skills/single-work-analysis/references/api-contracts/temp-mail-api.md +208 -0
  335. package/skills/single-work-analysis/references/api-contracts/threads-web-api.md +897 -0
  336. package/skills/single-work-analysis/references/api-contracts/tikhub-downloader-api.md +134 -0
  337. package/skills/single-work-analysis/references/api-contracts/tikhub-user-api.md +494 -0
  338. package/skills/single-work-analysis/references/api-contracts/tiktok-ads-api.md +5947 -0
  339. package/skills/single-work-analysis/references/api-contracts/tiktok-analytics-api.md +968 -0
  340. package/skills/single-work-analysis/references/api-contracts/tiktok-app-v3-api.md +5735 -0
  341. package/skills/single-work-analysis/references/api-contracts/tiktok-creator-api.md +1951 -0
  342. package/skills/single-work-analysis/references/api-contracts/tiktok-interaction-api.md +742 -0
  343. package/skills/single-work-analysis/references/api-contracts/tiktok-shop-web-api.md +1890 -0
  344. package/skills/single-work-analysis/references/api-contracts/tiktok-web-api.md +4448 -0
  345. package/skills/single-work-analysis/references/api-contracts/toutiao-app-api.md +342 -0
  346. package/skills/single-work-analysis/references/api-contracts/toutiao-web-api.md +143 -0
  347. package/skills/single-work-analysis/references/api-contracts/twitter-web-api.md +989 -0
  348. package/skills/single-work-analysis/references/api-contracts/wechat-channels-api.md +809 -0
  349. package/skills/single-work-analysis/references/api-contracts/wechat-media-platform-web-api.md +677 -0
  350. package/skills/single-work-analysis/references/api-contracts/weibo-app-api.md +1547 -0
  351. package/skills/single-work-analysis/references/api-contracts/weibo-web-api.md +798 -0
  352. package/skills/single-work-analysis/references/api-contracts/weibo-web-v2-api.md +2459 -0
  353. package/skills/single-work-analysis/references/api-contracts/xiaohongshu-app-api.md +1291 -0
  354. package/skills/single-work-analysis/references/api-contracts/xiaohongshu-app-v2-api.md +1683 -0
  355. package/skills/single-work-analysis/references/api-contracts/xiaohongshu-web-api.md +1324 -0
  356. package/skills/single-work-analysis/references/api-contracts/xiaohongshu-web-v2-api.md +1209 -0
  357. package/skills/single-work-analysis/references/api-contracts/xigua-app-v2-api.md +489 -0
  358. package/skills/single-work-analysis/references/api-contracts/youtube-web-api.md +2636 -0
  359. package/skills/single-work-analysis/references/api-contracts/youtube-web-v2-api.md +2660 -0
  360. package/skills/single-work-analysis/references/api-contracts/zhihu-web-api.md +2315 -0
  361. package/skills/single-work-analysis/references/api-tags/asr-api.md +100 -0
  362. package/skills/single-work-analysis/references/api-tags/bilibili-app-api.md +482 -0
  363. package/skills/single-work-analysis/references/api-tags/bilibili-web-api.md +1267 -0
  364. package/skills/single-work-analysis/references/api-tags/demo-api.md +365 -0
  365. package/skills/single-work-analysis/references/api-tags/douyin-app-v3-api.md +2012 -0
  366. package/skills/single-work-analysis/references/api-tags/douyin-billboard-api.md +1428 -0
  367. package/skills/single-work-analysis/references/api-tags/douyin-creator-api.md +694 -0
  368. package/skills/single-work-analysis/references/api-tags/douyin-creator-v2-api.md +694 -0
  369. package/skills/single-work-analysis/references/api-tags/douyin-search-api.md +1059 -0
  370. package/skills/single-work-analysis/references/api-tags/douyin-web-api.md +3314 -0
  371. package/skills/single-work-analysis/references/api-tags/douyin-xingtu-api.md +935 -0
  372. package/skills/single-work-analysis/references/api-tags/douyin-xingtu-v2-api.md +925 -0
  373. package/skills/single-work-analysis/references/api-tags/health-check.md +40 -0
  374. package/skills/single-work-analysis/references/api-tags/hybrid-parsing.md +57 -0
  375. package/skills/single-work-analysis/references/api-tags/instagram-v1-api.md +1224 -0
  376. package/skills/single-work-analysis/references/api-tags/instagram-v2-api.md +1147 -0
  377. package/skills/single-work-analysis/references/api-tags/instagram-v3-api.md +1123 -0
  378. package/skills/single-work-analysis/references/api-tags/ios-shortcut.md +45 -0
  379. package/skills/single-work-analysis/references/api-tags/kuaishou-app-api.md +846 -0
  380. package/skills/single-work-analysis/references/api-tags/kuaishou-web-api.md +551 -0
  381. package/skills/single-work-analysis/references/api-tags/lemon8-app-api.md +687 -0
  382. package/skills/single-work-analysis/references/api-tags/linkedin-web-api.md +1105 -0
  383. package/skills/single-work-analysis/references/api-tags/media-ingest-api.md +112 -0
  384. package/skills/single-work-analysis/references/api-tags/pipixia-app-api.md +721 -0
  385. package/skills/single-work-analysis/references/api-tags/reddit-app-api.md +1057 -0
  386. package/skills/single-work-analysis/references/api-tags/sora2-api.md +737 -0
  387. package/skills/single-work-analysis/references/api-tags/temp-mail-api.md +136 -0
  388. package/skills/single-work-analysis/references/api-tags/threads-web-api.md +472 -0
  389. package/skills/single-work-analysis/references/api-tags/tikhub-downloader-api.md +65 -0
  390. package/skills/single-work-analysis/references/api-tags/tikhub-user-api.md +253 -0
  391. package/skills/single-work-analysis/references/api-tags/tiktok-ads-api.md +1393 -0
  392. package/skills/single-work-analysis/references/api-tags/tiktok-analytics-api.md +179 -0
  393. package/skills/single-work-analysis/references/api-tags/tiktok-app-v3-api.md +3264 -0
  394. package/skills/single-work-analysis/references/api-tags/tiktok-creator-api.md +709 -0
  395. package/skills/single-work-analysis/references/api-tags/tiktok-interaction-api.md +366 -0
  396. package/skills/single-work-analysis/references/api-tags/tiktok-shop-web-api.md +663 -0
  397. package/skills/single-work-analysis/references/api-tags/tiktok-web-api.md +2516 -0
  398. package/skills/single-work-analysis/references/api-tags/toutiao-app-api.md +220 -0
  399. package/skills/single-work-analysis/references/api-tags/toutiao-web-api.md +96 -0
  400. package/skills/single-work-analysis/references/api-tags/twitter-web-api.md +562 -0
  401. package/skills/single-work-analysis/references/api-tags/wechat-channels-api.md +405 -0
  402. package/skills/single-work-analysis/references/api-tags/wechat-media-platform-web-api.md +431 -0
  403. package/skills/single-work-analysis/references/api-tags/weibo-app-api.md +851 -0
  404. package/skills/single-work-analysis/references/api-tags/weibo-web-api.md +470 -0
  405. package/skills/single-work-analysis/references/api-tags/weibo-web-v2-api.md +1405 -0
  406. package/skills/single-work-analysis/references/api-tags/xiaohongshu-app-api.md +534 -0
  407. package/skills/single-work-analysis/references/api-tags/xiaohongshu-app-v2-api.md +934 -0
  408. package/skills/single-work-analysis/references/api-tags/xiaohongshu-web-api.md +757 -0
  409. package/skills/single-work-analysis/references/api-tags/xiaohongshu-web-v2-api.md +762 -0
  410. package/skills/single-work-analysis/references/api-tags/xigua-app-v2-api.md +308 -0
  411. package/skills/single-work-analysis/references/api-tags/youtube-web-api.md +934 -0
  412. package/skills/single-work-analysis/references/api-tags/youtube-web-v2-api.md +717 -0
  413. package/skills/single-work-analysis/references/api-tags/zhihu-web-api.md +1384 -0
  414. package/skills/single-work-analysis/references/asr-and-fallback.md +20 -0
  415. package/skills/single-work-analysis/references/config-templates/defaults.yaml +58 -0
  416. package/skills/single-work-analysis/references/contracts/work-card-fields.md +41 -0
  417. package/skills/single-work-analysis/references/platform-guides/douyin.md +47 -0
  418. package/skills/single-work-analysis/references/platform-guides/generic.md +43 -0
  419. package/skills/single-work-analysis/references/platform-guides/xiaohongshu.md +54 -0
  420. package/skills/single-work-analysis/references/prompt-contracts/asr-clean.md +28 -0
  421. package/skills/single-work-analysis/references/prompt-contracts/cta.md +24 -0
  422. package/skills/single-work-analysis/references/prompt-contracts/hook.md +25 -0
  423. package/skills/single-work-analysis/references/prompt-contracts/insight.md +47 -0
  424. package/skills/single-work-analysis/references/prompt-contracts/structure.md +25 -0
  425. package/skills/single-work-analysis/references/prompt-contracts/style.md +27 -0
  426. package/skills/single-work-analysis/references/prompt-contracts/summary.md +29 -0
  427. package/skills/single-work-analysis/references/prompt-contracts/topic.md +29 -0
  428. package/skills/single-work-analysis/references/schemas/work-card.schema.json +39 -0
  429. package/skills/single-work-analysis/references/service-guides/asr-u2-u3-fallback.md +75 -0
  430. package/skills/single-work-analysis/scripts/__init__.py +0 -0
  431. package/skills/single-work-analysis/scripts/core/__init__.py +0 -0
  432. package/skills/single-work-analysis/scripts/core/analysis_pipeline.py +133 -0
  433. package/skills/single-work-analysis/scripts/core/bootstrap_env.py +35 -0
  434. package/skills/single-work-analysis/scripts/core/config_loader.py +418 -0
  435. package/skills/single-work-analysis/scripts/core/extract_pipeline.py +173 -0
  436. package/skills/single-work-analysis/scripts/core/progress_report.py +111 -0
  437. package/skills/single-work-analysis/scripts/core/storage_router.py +253 -0
  438. package/skills/single-work-analysis/scripts/core/tikomni_common.py +588 -0
  439. package/skills/single-work-analysis/scripts/pipeline/__init__.py +0 -0
  440. package/skills/single-work-analysis/scripts/pipeline/asr/__init__.py +0 -0
  441. package/skills/single-work-analysis/scripts/pipeline/asr/asr_pipeline.py +1189 -0
  442. package/skills/single-work-analysis/scripts/pipeline/asr/poll_u2_task.py +95 -0
  443. package/skills/single-work-analysis/scripts/platform/__init__.py +0 -0
  444. package/skills/single-work-analysis/scripts/platform/douyin/__init__.py +0 -0
  445. package/skills/single-work-analysis/scripts/platform/douyin/douyin_video_type_matrix.py +224 -0
  446. package/skills/single-work-analysis/scripts/platform/douyin/run_douyin_single_video.py +1233 -0
  447. package/skills/single-work-analysis/scripts/platform/douyin/select_low_quality_video_url.py +200 -0
  448. package/skills/single-work-analysis/scripts/platform/xiaohongshu/__init__.py +0 -0
  449. package/skills/single-work-analysis/scripts/platform/xiaohongshu/run_xiaohongshu_extract.py +2156 -0
  450. package/skills/single-work-analysis/scripts/writers/__init__.py +0 -0
  451. package/skills/single-work-analysis/scripts/writers/write_benchmark_card.py +1402 -0
@@ -0,0 +1,961 @@
1
+ #!/usr/bin/env python3
2
+ """Author-home ASR enrichment (batch + checkpoint + idempotent dedupe)."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import json
7
+ import re
8
+ import urllib.request
9
+ from typing import Any, Dict, List, Optional, Tuple
10
+
11
+ from scripts.core.progress_report import ProgressReporter
12
+ from scripts.core.tikomni_common import normalize_text
13
+ from scripts.pipeline.asr.asr_pipeline import (
14
+ clamp_u2_batch_submit_size,
15
+ normalize_media_url,
16
+ run_u2_asr_batch_with_timeout_retry,
17
+ run_u2_asr_candidates_with_timeout_retry,
18
+ )
19
+
20
# Batch sizing for U2 ASR submissions: the default request size and the hard
# cap handed to clamp_u2_batch_submit_size.
DEFAULT_BATCH_SUBMIT_SIZE = 50
MAX_BATCH_SUBMIT_SIZE = 100

# Duration window (milliseconds) enforced by the U2 gate: a work passes only
# when its duration is strictly above the minimum and at most the maximum.
U2_GATE_MIN_DURATION_MS = 13000
U2_GATE_MAX_DURATION_MS = 1800000

# Human-readable form of the gate rule recorded in trace entries. It is built
# from the thresholds above so the text cannot drift from what is enforced.
U2_GATE_RULE = (
    f"is_video && {U2_GATE_MIN_DURATION_MS}<duration_ms<={U2_GATE_MAX_DURATION_MS}"
    " && video_download_url_present"
)
25
+
26
+
27
+ def _to_int_or_none(value: Any) -> Optional[int]:
28
+ try:
29
+ if isinstance(value, bool):
30
+ return int(value)
31
+ if isinstance(value, (int, float)):
32
+ parsed = int(value)
33
+ return parsed if parsed > 0 else None
34
+ text = normalize_text(value)
35
+ if not text:
36
+ return None
37
+ parsed = int(float(text.replace(",", "")))
38
+ return parsed if parsed > 0 else None
39
+ except Exception:
40
+ return None
41
+
42
+
43
+ def _to_bool(value: Any) -> Optional[bool]:
44
+ if isinstance(value, bool):
45
+ return value
46
+ if isinstance(value, (int, float)):
47
+ return bool(int(value))
48
+ text = normalize_text(value).lower()
49
+ if not text:
50
+ return None
51
+ if text in {"1", "true", "yes", "y", "video"}:
52
+ return True
53
+ if text in {"0", "false", "no", "n", "image", "photo", "note"}:
54
+ return False
55
+ return None
56
+
57
+
58
def _resolve_is_video(work: Dict[str, Any], *, platform: str) -> bool:
    """Decide whether *work* should be treated as a video.

    Signals are consulted in order and the first decisive one wins:
    the explicit ``is_video`` flag, then ``work_modality``, then
    ``content_type``, then the platform (``"douyin"`` defaults to video),
    and finally the ``type`` / ``note_type`` hint inside ``raw_ref``.
    When nothing is decisive the work is treated as not a video.
    """
    flag = _to_bool(work.get("is_video"))
    if flag is not None:
        return flag

    modality = normalize_text(work.get("work_modality")).lower()
    if modality in ("video", "text"):
        return modality == "video"

    kind = normalize_text(work.get("content_type")).lower()
    if kind in {"video", "mixed", "mix", "video_note", "note_video"}:
        return True
    if kind in {"image", "photo", "album", "note", "text"}:
        return False

    if platform == "douyin":
        return True

    ref = work.get("raw_ref")
    if not isinstance(ref, dict):
        ref = {}
    hint = normalize_text(ref.get("type") or ref.get("note_type")).lower()
    hints_video = hint in {"video", "0", "normal", "mixed", "mix"}
    hints_image = hint in {"image", "1", "note", "photo"}
    # The two hint sets are disjoint; unrecognised hints resolve to not-video.
    return hints_video and not hints_image
86
+
87
+
88
def _evaluate_u2_gate(work: Dict[str, Any], *, platform: str) -> Dict[str, Any]:
    """Check whether *work* may be submitted to U2 ASR.

    Returns a dict holding the verdict (``can_u2``), the ``gate_reason``
    (``"pass"`` or a ``"skip:..."`` code) and the inputs the verdict was based
    on: ``is_video``, ``duration_ms``, ``video_download_url`` and
    ``video_download_url_present``.
    """
    is_video = _resolve_is_video(work, platform=platform)
    duration_ms = _to_int_or_none(work.get("duration_ms"))
    video_download_url = normalize_text(work.get("video_download_url") or work.get("video_down_url"))

    def _first_failing_check() -> str:
        # Checks run in a fixed order; the first one that fails names the reason.
        if not is_video:
            return "skip:not_video"
        if duration_ms is None:
            return "skip:duration_missing"
        if duration_ms <= U2_GATE_MIN_DURATION_MS:
            return "skip:duration_too_short"
        if duration_ms > U2_GATE_MAX_DURATION_MS:
            return "skip:duration_too_long"
        if not video_download_url:
            return "skip:video_download_url_missing"
        return "pass"

    gate_reason = _first_failing_check()
    return {
        "can_u2": gate_reason == "pass",
        "gate_reason": gate_reason,
        "is_video": is_video,
        "duration_ms": duration_ms,
        "video_download_url": video_download_url,
        "video_download_url_present": bool(video_download_url),
    }
114
+
115
+
116
+ def _clean_text(text: Any) -> str:
117
+ if text is None:
118
+ return ""
119
+ lines = [normalize_text(line) for line in str(text).splitlines()]
120
+ lines = [line for line in lines if line]
121
+ return "\n".join(lines).strip()
122
+
123
+
124
+ def _subtitle_text_from_raw(raw: str) -> str:
125
+ content = (raw or "").strip()
126
+ if not content:
127
+ return ""
128
+
129
+ try:
130
+ payload = json.loads(content)
131
+ candidates: List[str] = []
132
+ if isinstance(payload, list):
133
+ for item in payload:
134
+ if isinstance(item, dict):
135
+ for key in ("text", "content", "sentence", "line"):
136
+ value = normalize_text(item.get(key))
137
+ if value:
138
+ candidates.append(value)
139
+ elif isinstance(payload, dict):
140
+ stack = [payload]
141
+ while stack:
142
+ node = stack.pop(0)
143
+ if isinstance(node, dict):
144
+ for key, value in node.items():
145
+ if key in {"text", "content", "sentence", "line"} and isinstance(value, str):
146
+ cleaned = normalize_text(value)
147
+ if cleaned:
148
+ candidates.append(cleaned)
149
+ elif isinstance(value, (dict, list)):
150
+ stack.append(value)
151
+ elif isinstance(node, list):
152
+ stack.extend(node)
153
+ if candidates:
154
+ return "\n".join(list(dict.fromkeys(candidates))).strip()
155
+ except Exception:
156
+ pass
157
+
158
+ lines: List[str] = []
159
+ for line in content.splitlines():
160
+ stripped = line.strip()
161
+ if not stripped:
162
+ continue
163
+ if re.match(r"^\d+$", stripped):
164
+ continue
165
+ if re.match(r"^\d{1,2}:\d{2}(?::\d{2})?(?:[\.,]\d{1,3})?\s*-->\s*\d{1,2}:\d{2}(?::\d{2})?(?:[\.,]\d{1,3})?$", stripped):
166
+ continue
167
+ lines.append(stripped)
168
+ return "\n".join(lines).strip()
169
+
170
+
171
+ def _fetch_subtitle_text(urls: List[str], timeout_ms: int) -> str:
172
+ for url in urls:
173
+ text = normalize_text(url)
174
+ if not text:
175
+ continue
176
+ try:
177
+ req = urllib.request.Request(text, method="GET")
178
+ with urllib.request.urlopen(req, timeout=max(timeout_ms / 1000.0, 1.0)) as response:
179
+ raw = response.read().decode("utf-8", errors="replace")
180
+ parsed = _subtitle_text_from_raw(raw)
181
+ if parsed:
182
+ return parsed
183
+ except Exception:
184
+ continue
185
+ return ""
186
+
187
+
188
def _invalid_subtitle_reason(text: str) -> Optional[str]:
    """Return why *text* is unusable as a subtitle, or ``None`` if it is fine.

    Checks run in this order and the first failure is reported:
    ``subtitle_empty``, ``subtitle_too_short`` (fewer than 20 characters after
    cleaning), ``subtitle_garbled`` (more than 35% unexpected characters),
    ``subtitle_timeline_only`` (only timing/number lines) and
    ``subtitle_noise_repeated`` (four or more lines, all identical).
    """
    cleaned = _clean_text(text)
    if not cleaned:
        return "subtitle_empty"
    if len(cleaned) < 20:
        return "subtitle_too_short"

    flattened = cleaned.replace("\n", "")
    char_total = len(flattened)
    if char_total <= 0:
        return "subtitle_empty"

    allowed_punct = " ,。!?;:“”‘’()()《》、,.!?;:-_/\\\"'\t"
    unexpected = sum(
        1
        for ch in flattened
        if not (ch.isalnum() or "\u4e00" <= ch <= "\u9fff" or ch in allowed_punct)
    )
    if unexpected / max(char_total, 1) > 0.35:
        return "subtitle_garbled"

    timing_pattern = r"^\d{1,2}:\d{2}(?::\d{2})?(?:[\.,]\d{1,3})?\s*-->\s*\d{1,2}:\d{2}(?::\d{2})?(?:[\.,]\d{1,3})?$"
    filler_pattern = r"^[\d:\-\.,\s>]+$"
    stripped_lines = [entry.strip() for entry in cleaned.splitlines()]
    stripped_lines = [entry for entry in stripped_lines if entry]
    timeline_count = sum(
        1
        for entry in stripped_lines
        if re.match(timing_pattern, entry) or re.match(filler_pattern, entry)
    )
    spoken_count = len(stripped_lines) - timeline_count
    if timeline_count > 0 and spoken_count == 0:
        return "subtitle_timeline_only"

    # The repetition check compares the lines as they are, without stripping.
    present_lines = [entry for entry in cleaned.splitlines() if entry.strip()]
    if len(present_lines) >= 4 and len(set(present_lines)) <= 1:
        return "subtitle_noise_repeated"

    return None
232
+
233
+
234
def _run_u2_for_work(
    *,
    platform: str,
    work: Dict[str, Any],
    base_url: str,
    token: str,
    timeout_ms: int,
    poll_interval_sec: float,
    max_polls: int,
    submit_max_retries: int,
    submit_backoff_ms: int,
    timeout_retry_enabled: bool,
    timeout_retry_max_retries: int,
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Run U2 ASR for a single work and return ``(result_fields, trace_entry)``.

    The work is first checked with ``_evaluate_u2_gate``. When the gate
    rejects it, the fallback result for the gate reason is returned together
    with an ``author_home.asr.u2_gate`` trace entry and no request is made.
    Otherwise the gated download URL is passed as the only candidate to
    ``run_u2_asr_candidates_with_timeout_retry`` and the cleaned transcript
    decides between the success and the failure field set.
    """
    gate = _evaluate_u2_gate(work, platform=platform)
    work_id = work.get("platform_work_id")

    if not gate.get("can_u2"):
        reason = normalize_text(gate.get("gate_reason")) or "skip:unknown"
        skipped_fields = _fallback_none_result(reason)
        gate_trace = {
            "step": "author_home.asr.u2_gate",
            "platform_work_id": work_id,
            "ok": False,
            "can_u2": False,
            "gate_reason": reason,
            "rule": U2_GATE_RULE,
            "is_video": gate.get("is_video"),
            "duration_ms": gate.get("duration_ms"),
            "video_download_url_present": gate.get("video_download_url_present"),
        }
        return skipped_fields, gate_trace

    source_url = normalize_text(gate.get("video_download_url"))

    bundle = run_u2_asr_candidates_with_timeout_retry(
        base_url=base_url,
        token=token,
        timeout_ms=timeout_ms,
        candidates=[source_url],
        submit_max_retries=submit_max_retries,
        submit_backoff_ms=submit_backoff_ms,
        poll_interval_sec=poll_interval_sec,
        max_polls=max_polls,
        timeout_retry_enabled=timeout_retry_enabled,
        timeout_retry_max_retries=timeout_retry_max_retries,
    )
    polled = bundle.get("poll_result", {})
    if not isinstance(polled, dict):
        polled = {}
    transcript = _clean_text(polled.get("transcript_text"))

    trace = {
        "step": "author_home.asr.u2",
        "platform_work_id": work_id,
        # Success needs both an ok poll result and a non-empty transcript.
        "ok": bool(polled.get("ok") and transcript),
        "task_status": polled.get("task_status"),
        "error_reason": polled.get("error_reason"),
        "selected_video_url": bundle.get("chosen_candidate") or source_url,
        "rounds": bundle.get("rounds", []),
        "candidate_attempts": bundle.get("candidate_attempts", []),
        "timeout_retry": bundle.get("timeout_retry", {}),
        "gate_reason": "pass",
        "rule": U2_GATE_RULE,
    }

    if not transcript:
        failed_fields = {
            "subtitle_raw": "",
            "subtitle_source": "missing",
            "asr_raw": "",
            "asr_clean": "",
            "primary_text": "",
            "primary_text_source": "asr_clean",
            "analysis_eligibility": "incomplete",
            "analysis_exclusion_reason": "video_asr_unavailable",
            "asr_status": "failed",
            "asr_error_reason": normalize_text(polled.get("error_reason")) or "u2_failed",
            "asr_source": "fallback_none",
        }
        return failed_fields, trace

    succeeded_fields = {
        "subtitle_raw": transcript,
        "subtitle_source": "external_asr",
        "asr_raw": transcript,
        "asr_clean": transcript,
        "primary_text": transcript,
        "primary_text_source": "asr_clean",
        "analysis_eligibility": "eligible",
        "analysis_exclusion_reason": "",
        "asr_status": "success",
        "asr_error_reason": "",
        "asr_source": "external_asr",
    }
    return succeeded_fields, trace
322
+
323
+
324
def _iter_xhs_interface_text_candidates(work: Dict[str, Any]) -> List[Tuple[str, str]]:
    """List subtitle-like texts already present on *work*.

    Returns ``(source_label, cleaned_text)`` pairs in lookup order: the work's
    own ``subtitle_raw`` and ``asr_raw``, then ``raw_ref.subtitle_inline``,
    then a fixed set of subtitle/transcript keys on ``raw_ref.raw_item``.
    Empty texts are skipped and a text seen earlier is not repeated.
    """
    raw_ref = work.get("raw_ref")
    if not isinstance(raw_ref, dict):
        raw_ref = {}
    raw_item = raw_ref.get("raw_item")
    if not isinstance(raw_item, dict):
        raw_item = {}

    # Strict subtitle separation: only subtitle/transcript-like fields are allowed here.
    lookups: List[Tuple[str, Any]] = [
        ("work.subtitle_raw", work.get("subtitle_raw")),
        ("work.asr_raw", work.get("asr_raw")),
        ("raw_ref.subtitle_inline", raw_ref.get("subtitle_inline")),
    ]
    raw_item_keys = (
        "subtitle_text",
        "subtitle",
        "subtitles",
        "captions",
        "caption_text",
        "transcript",
        "transcript_text",
        "subtitle_content",
        "subtitle_list",
        "subtitleList",
        "srt",
        "vtt",
    )
    lookups.extend((f"raw_item.{key}", raw_item.get(key)) for key in raw_item_keys)

    unique: List[Tuple[str, str]] = []
    seen_texts = set()
    for source, value in lookups:
        cleaned = _clean_text(value)
        if not cleaned or cleaned in seen_texts:
            continue
        seen_texts.add(cleaned)
        unique.append((source, cleaned))
    return unique
364
+
365
+
366
def _resolve_xhs_subtitle(work: Dict[str, Any], timeout_ms: int) -> Tuple[str, str, List[str], str]:
    """Find subtitle text for *work*, preferring data already on the work.

    Returns ``(text, origin, subtitle_urls, field)``. The first interface
    candidate that passes ``_invalid_subtitle_reason`` is returned with origin
    ``"interface"`` and its source label. Otherwise the URLs in
    ``raw_ref.subtitle_urls`` are fetched and the cleaned result (possibly
    empty) is returned with origin ``"url"`` and field ``"subtitle_url"``.
    """
    raw_ref = work.get("raw_ref")
    if not isinstance(raw_ref, dict):
        raw_ref = {}
    listed = raw_ref.get("subtitle_urls")
    if not isinstance(listed, list):
        listed = []
    subtitle_urls: List[str] = []
    for item in listed:
        normalized = normalize_text(item)
        if normalized:
            subtitle_urls.append(normalized)

    first_valid = next(
        (
            (source, candidate)
            for source, candidate in _iter_xhs_interface_text_candidates(work)
            if _invalid_subtitle_reason(candidate) is None
        ),
        None,
    )
    if first_valid is not None:
        source, candidate = first_valid
        return candidate, "interface", subtitle_urls, source

    downloaded = _fetch_subtitle_text(subtitle_urls, timeout_ms=timeout_ms)
    return _clean_text(downloaded), "url", subtitle_urls, "subtitle_url"
377
+
378
+
379
+ def _dedupe_works_by_platform_id(works: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], int]:
380
+ deduped: List[Dict[str, Any]] = []
381
+ seen = set()
382
+ duplicates = 0
383
+
384
+ for work in works:
385
+ if not isinstance(work, dict):
386
+ continue
387
+ platform_work_id = normalize_text(work.get("platform_work_id"))
388
+ dedupe_key = platform_work_id or f"anonymous-{len(deduped)}"
389
+ if dedupe_key in seen:
390
+ duplicates += 1
391
+ continue
392
+ seen.add(dedupe_key)
393
+ deduped.append(work)
394
+
395
+ return deduped, duplicates
396
+
397
+
398
def _fallback_none_result(reason: str) -> Dict[str, Any]:
    """Build the field set for a work that ends up with no transcript.

    The normalized *reason* is recorded as both the exclusion reason and the
    ASR error reason. When it is empty they default to
    ``"video_asr_unavailable"`` and ``"asr_failed"`` respectively.
    """
    why = normalize_text(reason)
    fields: Dict[str, Any] = dict.fromkeys(
        ("subtitle_raw", "subtitle_source", "asr_raw", "asr_clean", "primary_text"),
        "",
    )
    fields["subtitle_source"] = "missing"
    fields["primary_text_source"] = "asr_clean"
    fields["analysis_eligibility"] = "incomplete"
    fields["analysis_exclusion_reason"] = why or "video_asr_unavailable"
    fields["asr_status"] = "failed"
    fields["asr_error_reason"] = why or "asr_failed"
    fields["asr_source"] = "fallback_none"
    return fields
412
+
413
+
414
def _mark_text_work_ready(work: Dict[str, Any]) -> Dict[str, Any]:
    """Build the field set for a work that needs no ASR.

    The normalized ``caption_raw`` becomes the primary text. The work is
    ``eligible`` when that caption is non-empty, otherwise ``incomplete`` with
    reason ``caption_raw_missing``. ASR is marked ``not_applicable``.
    """
    caption = normalize_text(work.get("caption_raw"))
    if caption:
        eligibility, exclusion = "eligible", ""
    else:
        eligibility, exclusion = "incomplete", "caption_raw_missing"
    return {
        "primary_text": caption,
        "primary_text_source": "caption_raw",
        "analysis_eligibility": eligibility,
        "analysis_exclusion_reason": exclusion,
        "asr_status": "not_applicable",
        "asr_error_reason": "",
        "asr_source": "fallback_none",
    }
425
+
426
+
427
+ def _run_u2_batch_for_entries(
428
+ *,
429
+ batch_id: str,
430
+ entries: List[Dict[str, Any]],
431
+ base_url: str,
432
+ token: str,
433
+ timeout_ms: int,
434
+ poll_interval_sec: float,
435
+ max_polls: int,
436
+ submit_max_retries: int,
437
+ submit_backoff_ms: int,
438
+ timeout_retry_enabled: bool,
439
+ timeout_retry_max_retries: int,
440
+ ) -> Dict[str, Any]:
441
+ url_to_entries: Dict[str, List[Dict[str, Any]]] = {}
442
+ unique_urls: List[str] = []
443
+
444
+ for entry in entries:
445
+ normalized_url = normalize_media_url(entry.get("video_download_url") or entry.get("video_down_url"))
446
+ if not normalized_url:
447
+ continue
448
+ entry["normalized_video_url"] = normalized_url
449
+ if normalized_url not in url_to_entries:
450
+ url_to_entries[normalized_url] = []
451
+ unique_urls.append(normalized_url)
452
+ url_to_entries[normalized_url].append(entry)
453
+
454
+ if not unique_urls:
455
+ return {
456
+ "trace": [
457
+ {
458
+ "step": "author_home.asr.batch.submitted",
459
+ "batch_id": batch_id,
460
+ "ok": False,
461
+ "error_reason": "batch_no_valid_urls",
462
+ "batch_size": len(entries),
463
+ }
464
+ ],
465
+ "unmapped_entries": list(entries),
466
+ "mapped_count": 0,
467
+ "task_metrics": {},
468
+ "submitted": False,
469
+ "completed": False,
470
+ }
471
+
472
+ bundle = run_u2_asr_batch_with_timeout_retry(
473
+ base_url=base_url,
474
+ token=token,
475
+ timeout_ms=timeout_ms,
476
+ file_urls=unique_urls,
477
+ submit_max_retries=submit_max_retries,
478
+ submit_backoff_ms=submit_backoff_ms,
479
+ poll_interval_sec=poll_interval_sec,
480
+ max_polls=max_polls,
481
+ timeout_retry_enabled=timeout_retry_enabled,
482
+ timeout_retry_max_retries=timeout_retry_max_retries,
483
+ )
484
+
485
+ submit_bundle = bundle.get("submit_bundle") if isinstance(bundle.get("submit_bundle"), dict) else {}
486
+ submit_response = submit_bundle.get("submit_response") if isinstance(submit_bundle.get("submit_response"), dict) else {}
487
+ poll_result = bundle.get("poll_result") if isinstance(bundle.get("poll_result"), dict) else {}
488
+ task_metrics = bundle.get("task_metrics") if isinstance(bundle.get("task_metrics"), dict) else {}
489
+ submit_accepted = bool(submit_response.get("ok") and submit_bundle.get("task_id"))
490
+
491
+ trace: List[Dict[str, Any]] = [
492
+ {
493
+ "step": "author_home.asr.batch.submitted",
494
+ "batch_id": batch_id,
495
+ "ok": submit_accepted,
496
+ "batch_size": len(entries),
497
+ "batch_unique_urls": len(unique_urls),
498
+ "task_id": submit_bundle.get("task_id"),
499
+ "submit_status": submit_bundle.get("final_submit_status"),
500
+ "error_reason": submit_response.get("error_reason"),
501
+ }
502
+ ]
503
+
504
+ batch_progress = bundle.get("batch_progress") if isinstance(bundle.get("batch_progress"), dict) else {}
505
+ batch_complete = bool(bundle.get("batch_complete") or poll_result.get("batch_complete"))
506
+
507
+ trace.append(
508
+ {
509
+ "step": "author_home.asr.batch.completed",
510
+ "batch_id": batch_id,
511
+ "ok": batch_complete,
512
+ "task_status": poll_result.get("task_status"),
513
+ "error_reason": poll_result.get("error_reason"),
514
+ "task_metrics": task_metrics,
515
+ "batch_progress": batch_progress,
516
+ "batch_complete": batch_complete,
517
+ }
518
+ )
519
+
520
+ mapped_results = bundle.get("mapped_results") if isinstance(bundle.get("mapped_results"), dict) else {}
521
+ mapped_count = 0
522
+ unmapped_entries: List[Dict[str, Any]] = []
523
+
524
+ batch_error = normalize_text(poll_result.get("error_reason")) or "batch_result_unmapped"
525
+
526
+ for normalized_url, grouped_entries in url_to_entries.items():
527
+ mapped_item = mapped_results.get(normalized_url) if isinstance(mapped_results, dict) else None
528
+ transcript = _clean_text(mapped_item.get("transcript_text")) if isinstance(mapped_item, dict) else ""
529
+ mapped_status = normalize_text(mapped_item.get("task_status") if isinstance(mapped_item, dict) else "").upper()
530
+ mapped_error = normalize_text(mapped_item.get("error_reason") if isinstance(mapped_item, dict) else "")
531
+ mapped_ok = bool(mapped_item.get("ok")) if isinstance(mapped_item, dict) else False
532
+
533
+ if (mapped_ok or mapped_status in {"SUCCEEDED", "SUCCESS", "COMPLETED", "DONE"}) and transcript:
534
+ for entry in grouped_entries:
535
+ entry["work"].update(
536
+ {
537
+ "asr_raw": transcript,
538
+ "asr_clean": transcript,
539
+ "primary_text": transcript,
540
+ "primary_text_source": "asr_clean",
541
+ "analysis_eligibility": "eligible",
542
+ "analysis_exclusion_reason": "",
543
+ "asr_status": "success",
544
+ "asr_error_reason": "",
545
+ "asr_source": "external_asr",
546
+ }
547
+ )
548
+ mapped_count += 1
549
+ else:
550
+ fallback_reason = mapped_error or batch_error or ("u2_batch_incomplete" if not batch_complete else "batch_result_unmapped")
551
+ for entry in grouped_entries:
552
+ entry["fallback_reason"] = fallback_reason
553
+ unmapped_entries.append(entry)
554
+
555
+ trace.append(
556
+ {
557
+ "step": "author_home.asr.batch.mapped",
558
+ "batch_id": batch_id,
559
+ "mapped_count": mapped_count,
560
+ "mapped_urls": len([key for key in url_to_entries.keys() if isinstance(mapped_results.get(key), dict)]),
561
+ }
562
+ )
563
+
564
+ if unmapped_entries:
565
+ trace.append(
566
+ {
567
+ "step": "author_home.asr.batch.unmapped",
568
+ "batch_id": batch_id,
569
+ "unmapped_count": len(unmapped_entries),
570
+ "reason": normalize_text(unmapped_entries[0].get("fallback_reason")) if unmapped_entries else "batch_result_unmapped",
571
+ }
572
+ )
573
+
574
+ return {
575
+ "trace": trace,
576
+ "unmapped_entries": unmapped_entries,
577
+ "mapped_count": mapped_count,
578
+ "task_metrics": task_metrics,
579
+ "batch_progress": batch_progress,
580
+ "batch_complete": batch_complete,
581
+ "submitted": submit_accepted,
582
+ "completed": batch_complete,
583
+ }
584
+
585
+
586
+ def enrich_author_home_asr(
587
+ *,
588
+ platform: str,
589
+ works: List[Dict[str, Any]],
590
+ base_url: str,
591
+ token: str,
592
+ timeout_ms: int,
593
+ poll_interval_sec: float = 3.0,
594
+ max_polls: int = 30,
595
+ douyin_submit_max_retries: int = 2,
596
+ douyin_submit_backoff_ms: int = 1500,
597
+ xhs_submit_max_retries: int = 0,
598
+ xhs_submit_backoff_ms: int = 0,
599
+ timeout_retry_enabled: bool = True,
600
+ timeout_retry_max_retries: int = 3,
601
+ batch_size: int = DEFAULT_BATCH_SUBMIT_SIZE,
602
+ checkpoint: Optional[Dict[str, Any]] = None,
603
+ progress: Optional[ProgressReporter] = None,
604
+ ) -> Dict[str, Any]:
605
+ trace: List[Dict[str, Any]] = []
606
+ deduped_works, duplicate_count = _dedupe_works_by_platform_id(works)
607
+
608
+ checkpoint_in = checkpoint if isinstance(checkpoint, dict) else {}
609
+ completed_ids = {
610
+ normalize_text(item)
611
+ for item in (checkpoint_in.get("completed_work_ids") or [])
612
+ if normalize_text(item)
613
+ }
614
+
615
+ requested_batch = int(batch_size or DEFAULT_BATCH_SUBMIT_SIZE)
616
+ effective_batch = clamp_u2_batch_submit_size(
617
+ requested_batch,
618
+ default=DEFAULT_BATCH_SUBMIT_SIZE,
619
+ hard_limit=MAX_BATCH_SUBMIT_SIZE,
620
+ )
621
+
622
+ trace.append(
623
+ {
624
+ "step": "author_home.asr.init",
625
+ "platform": platform,
626
+ "input_count": len(works),
627
+ "deduped_count": len(deduped_works),
628
+ "duplicate_count": duplicate_count,
629
+ "resume_completed": len(completed_ids),
630
+ "requested_batch_size": requested_batch,
631
+ "batch_size": effective_batch,
632
+ "batch_size_clamped": requested_batch != effective_batch,
633
+ "batch_submit_hard_limit": MAX_BATCH_SUBMIT_SIZE,
634
+ }
635
+ )
636
+ if progress is not None:
637
+ progress.started(
638
+ stage="author_home.asr",
639
+ message="author_home asr enrichment started",
640
+ data={
641
+ "input_count": len(works),
642
+ "deduped_count": len(deduped_works),
643
+ "resume_completed": len(completed_ids),
644
+ "batch_size": effective_batch,
645
+ },
646
+ )
647
+
648
+ queue: List[Dict[str, Any]] = []
649
+ for work in deduped_works:
650
+ work_id = normalize_text(work.get("platform_work_id"))
651
+ if work_id and work_id in completed_ids:
652
+ continue
653
+ queue.append(work)
654
+
655
+ trace.append(
656
+ {
657
+ "step": "author_home.asr.collected_works",
658
+ "platform": platform,
659
+ "queued_count": len(queue),
660
+ }
661
+ )
662
+
663
+ batch_total = (len(queue) + effective_batch - 1) // effective_batch if queue else 0
664
+ if progress is not None:
665
+ progress.progress(
666
+ stage="author_home.asr.queue",
667
+ message="author_home asr queue prepared",
668
+ data={"queued_count": len(queue), "batch_total": batch_total},
669
+ )
670
+
671
+ success_count = 0
672
+ fallback_none_count = 0
673
+ submitted_batches = 0
674
+ completed_batches = 0
675
+ batch_mapped_count = 0
676
+ batch_unmapped_count = 0
677
+ fallback_single_count = 0
678
+
679
+ for batch_index in range(batch_total):
680
+ batch = queue[batch_index * effective_batch : (batch_index + 1) * effective_batch]
681
+ batch_id = f"batch-{batch_index + 1:03d}"
682
+ if progress is not None:
683
+ progress.progress(
684
+ stage="author_home.asr.batch",
685
+ message="processing author_home asr batch",
686
+ data={"batch_id": batch_id, "batch_index": batch_index + 1, "batch_total": batch_total, "batch_size": len(batch)},
687
+ )
688
+
689
+ batch_u2_entries: List[Dict[str, Any]] = []
690
+
691
+ for work in batch:
692
+ work_id = normalize_text(work.get("platform_work_id"))
693
+
694
+ if platform == "douyin":
695
+ gate = _evaluate_u2_gate(work, platform=platform)
696
+ trace.append(
697
+ {
698
+ "step": "author_home.asr.u2_gate",
699
+ "batch_id": batch_id,
700
+ "platform_work_id": work_id,
701
+ "ok": bool(gate.get("can_u2")),
702
+ "can_u2": bool(gate.get("can_u2")),
703
+ "gate_reason": gate.get("gate_reason"),
704
+ "rule": U2_GATE_RULE,
705
+ "is_video": gate.get("is_video"),
706
+ "duration_ms": gate.get("duration_ms"),
707
+ "video_download_url_present": gate.get("video_download_url_present"),
708
+ }
709
+ )
710
+
711
+ if not gate.get("can_u2"):
712
+ work.update(_fallback_none_result(str(gate.get("gate_reason") or "skip:unknown")))
713
+ else:
714
+ batch_u2_entries.append(
715
+ {
716
+ "work": work,
717
+ "work_id": work_id,
718
+ "video_download_url": gate.get("video_download_url"),
719
+ "fallback_reason": "batch_result_unmapped",
720
+ }
721
+ )
722
+ continue
723
+
724
+ if not _resolve_is_video(work, platform=platform):
725
+ work.update(_mark_text_work_ready(work))
726
+ trace.append(
727
+ {
728
+ "step": "author_home.asr.skip",
729
+ "batch_id": batch_id,
730
+ "platform_work_id": work_id,
731
+ "ok": True,
732
+ "reason": "text_work_no_asr_required",
733
+ }
734
+ )
735
+ continue
736
+
737
+ subtitle_text, subtitle_source, subtitle_urls, subtitle_field = _resolve_xhs_subtitle(work, timeout_ms=timeout_ms)
738
+ subtitle_invalid = _invalid_subtitle_reason(subtitle_text)
739
+ if subtitle_invalid is None:
740
+ work.update(
741
+ {
742
+ "subtitle_raw": subtitle_text,
743
+ "subtitle_source": "native_subtitle",
744
+ "asr_raw": subtitle_text,
745
+ "asr_clean": subtitle_text,
746
+ "primary_text": subtitle_text,
747
+ "primary_text_source": "asr_clean",
748
+ "analysis_eligibility": "eligible",
749
+ "analysis_exclusion_reason": "",
750
+ "asr_status": "success",
751
+ "asr_error_reason": "",
752
+ "asr_source": "native_subtitle",
753
+ }
754
+ )
755
+ trace.append(
756
+ {
757
+ "step": "author_home.asr.xhs_subtitle",
758
+ "batch_id": batch_id,
759
+ "platform_work_id": work_id,
760
+ "ok": True,
761
+ "subtitle_source": subtitle_source,
762
+ "subtitle_field": subtitle_field,
763
+ "subtitle_url_count": len(subtitle_urls),
764
+ }
765
+ )
766
+ else:
767
+ trace.append(
768
+ {
769
+ "step": "author_home.asr.xhs_subtitle",
770
+ "batch_id": batch_id,
771
+ "platform_work_id": work_id,
772
+ "ok": False,
773
+ "error_reason": subtitle_invalid,
774
+ "subtitle_source": subtitle_source,
775
+ "subtitle_field": subtitle_field,
776
+ "subtitle_url_count": len(subtitle_urls),
777
+ }
778
+ )
779
+
780
+ gate = _evaluate_u2_gate(work, platform=platform)
781
+ trace.append(
782
+ {
783
+ "step": "author_home.asr.u2_gate",
784
+ "batch_id": batch_id,
785
+ "platform_work_id": work_id,
786
+ "ok": bool(gate.get("can_u2")),
787
+ "can_u2": bool(gate.get("can_u2")),
788
+ "gate_reason": gate.get("gate_reason"),
789
+ "rule": U2_GATE_RULE,
790
+ "is_video": gate.get("is_video"),
791
+ "duration_ms": gate.get("duration_ms"),
792
+ "video_download_url_present": gate.get("video_download_url_present"),
793
+ "subtitle_invalid": subtitle_invalid,
794
+ }
795
+ )
796
+
797
+ if not gate.get("can_u2"):
798
+ work.update(_fallback_none_result(str(gate.get("gate_reason") or "skip:unknown")))
799
+ else:
800
+ batch_u2_entries.append(
801
+ {
802
+ "work": work,
803
+ "work_id": work_id,
804
+ "video_download_url": gate.get("video_download_url"),
805
+ "fallback_reason": f"xhs_subtitle_invalid:{subtitle_invalid}",
806
+ }
807
+ )
808
+
809
+ fallback_entries: List[Dict[str, Any]] = []
810
+ if batch_u2_entries:
811
+ batch_bundle = _run_u2_batch_for_entries(
812
+ batch_id=batch_id,
813
+ entries=batch_u2_entries,
814
+ base_url=base_url,
815
+ token=token,
816
+ timeout_ms=timeout_ms,
817
+ poll_interval_sec=poll_interval_sec,
818
+ max_polls=max_polls,
819
+ submit_max_retries=max(0, int(douyin_submit_max_retries if platform == "douyin" else xhs_submit_max_retries)),
820
+ submit_backoff_ms=max(0, int(douyin_submit_backoff_ms if platform == "douyin" else xhs_submit_backoff_ms)),
821
+ timeout_retry_enabled=timeout_retry_enabled,
822
+ timeout_retry_max_retries=max(0, int(timeout_retry_max_retries)),
823
+ )
824
+ trace.extend(batch_bundle.get("trace") if isinstance(batch_bundle.get("trace"), list) else [])
825
+
826
+ if batch_bundle.get("submitted"):
827
+ submitted_batches += 1
828
+ if batch_bundle.get("completed"):
829
+ completed_batches += 1
830
+
831
+ batch_mapped_count += int(batch_bundle.get("mapped_count") or 0)
832
+ fallback_entries = list(batch_bundle.get("unmapped_entries") or [])
833
+ batch_unmapped_count += len(fallback_entries)
834
+
835
+ for fallback_entry in fallback_entries:
836
+ fallback_work = fallback_entry.get("work")
837
+ if not isinstance(fallback_work, dict):
838
+ continue
839
+
840
+ retry_result, retry_trace = _run_u2_for_work(
841
+ platform=platform,
842
+ work=fallback_work,
843
+ base_url=base_url,
844
+ token=token,
845
+ timeout_ms=timeout_ms,
846
+ poll_interval_sec=poll_interval_sec,
847
+ max_polls=max_polls,
848
+ submit_max_retries=max(0, int(douyin_submit_max_retries if platform == "douyin" else xhs_submit_max_retries)),
849
+ submit_backoff_ms=max(0, int(douyin_submit_backoff_ms if platform == "douyin" else xhs_submit_backoff_ms)),
850
+ timeout_retry_enabled=timeout_retry_enabled,
851
+ timeout_retry_max_retries=max(0, int(timeout_retry_max_retries)),
852
+ )
853
+ retry_trace["step"] = "author_home.asr.batch.fallback"
854
+ retry_trace["batch_id"] = batch_id
855
+ retry_trace["fallback_trigger_reason"] = fallback_entry.get("fallback_reason")
856
+ trace.append(retry_trace)
857
+ fallback_single_count += 1
858
+ fallback_work.update(retry_result)
859
+
860
+ batch_success = 0
861
+ batch_failed = 0
862
+ for work in batch:
863
+ work_id = normalize_text(work.get("platform_work_id"))
864
+ if work_id:
865
+ completed_ids.add(work_id)
866
+
867
+ if str(work.get("analysis_eligibility") or "") == "eligible":
868
+ success_count += 1
869
+ batch_success += 1
870
+ else:
871
+ fallback_none_count += 1
872
+ batch_failed += 1
873
+
874
+ trace.append(
875
+ {
876
+ "step": "author_home.asr.batch_done",
877
+ "batch_id": batch_id,
878
+ "batch_index": batch_index + 1,
879
+ "batch_total": batch_total,
880
+ "batch_size": len(batch),
881
+ "batch_success": batch_success,
882
+ "batch_failed": batch_failed,
883
+ "fallback_singles": fallback_single_count,
884
+ }
885
+ )
886
+ if progress is not None:
887
+ progress.progress(
888
+ stage="author_home.asr.batch",
889
+ message="author_home asr batch finished",
890
+ data={
891
+ "batch_id": batch_id,
892
+ "batch_index": batch_index + 1,
893
+ "batch_total": batch_total,
894
+ "batch_success": batch_success,
895
+ "batch_failed": batch_failed,
896
+ },
897
+ )
898
+
899
+ failed_work_ids = sorted(
900
+ list(
901
+ {
902
+ normalize_text(work.get("platform_work_id"))
903
+ for work in deduped_works
904
+ if isinstance(work, dict)
905
+ and normalize_text(work.get("platform_work_id"))
906
+ and str(work.get("analysis_eligibility") or "") != "eligible"
907
+ }
908
+ )
909
+ )
910
+
911
+ checkpoint_out = {
912
+ "platform": platform,
913
+ "completed_work_ids": sorted(completed_ids),
914
+ "failed_work_ids": failed_work_ids,
915
+ "batch_size": effective_batch,
916
+ "batches_total": batch_total,
917
+ "batches_submitted": submitted_batches,
918
+ "batches_completed": completed_batches,
919
+ "batch_mapped": batch_mapped_count,
920
+ "batch_unmapped": batch_unmapped_count,
921
+ "fallback_singles": fallback_single_count,
922
+ "total_works": len(deduped_works),
923
+ "processed_works": len(completed_ids),
924
+ # backward-compatible checkpoint fields
925
+ "refill_attempted": fallback_single_count,
926
+ }
927
+
928
+ stats = {
929
+ "total": len(deduped_works),
930
+ "success": success_count,
931
+ "fallback_none": fallback_none_count,
932
+ "duplicates_dropped": duplicate_count,
933
+ "submitted_batches": submitted_batches,
934
+ "completed_batches": completed_batches,
935
+ "batch_mapped": batch_mapped_count,
936
+ "batch_unmapped": batch_unmapped_count,
937
+ "fallback_singles": fallback_single_count,
938
+ # backward-compatible stats fields
939
+ "refill_attempted": fallback_single_count,
940
+ "refill_failed": len(failed_work_ids),
941
+ }
942
+
943
+ if progress is not None:
944
+ progress.done(
945
+ stage="author_home.asr",
946
+ message="author_home asr enrichment finished",
947
+ data={
948
+ "total": len(deduped_works),
949
+ "success": success_count,
950
+ "fallback_none": fallback_none_count,
951
+ "submitted_batches": submitted_batches,
952
+ "completed_batches": completed_batches,
953
+ },
954
+ )
955
+
956
+ return {
957
+ "works": deduped_works,
958
+ "trace": trace,
959
+ "checkpoint": checkpoint_out,
960
+ "stats": stats,
961
+ }