@tikomni/skills 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (451)
  1. package/.skill-package-allowlist.txt +4 -0
  2. package/LICENSE +21 -0
  3. package/README.md +167 -0
  4. package/README.zh-CN.md +167 -0
  5. package/bin/tikomni-skills.js +127 -0
  6. package/env.example +160 -0
  7. package/lib/installer.js +176 -0
  8. package/package.json +44 -0
  9. package/skills/creator-analysis/SKILL.md +71 -0
  10. package/skills/creator-analysis/agents/openai.yaml +4 -0
  11. package/skills/creator-analysis/env.example +36 -0
  12. package/skills/creator-analysis/references/api-capability-index.md +92 -0
  13. package/skills/creator-analysis/references/api-contracts/asr-api.md +130 -0
  14. package/skills/creator-analysis/references/api-contracts/bilibili-app-api.md +776 -0
  15. package/skills/creator-analysis/references/api-contracts/bilibili-web-api.md +2017 -0
  16. package/skills/creator-analysis/references/api-contracts/demo-api.md +717 -0
  17. package/skills/creator-analysis/references/api-contracts/douyin-app-v3-api.md +3594 -0
  18. package/skills/creator-analysis/references/api-contracts/douyin-billboard-api.md +2274 -0
  19. package/skills/creator-analysis/references/api-contracts/douyin-creator-api.md +1575 -0
  20. package/skills/creator-analysis/references/api-contracts/douyin-creator-v2-api.md +3254 -0
  21. package/skills/creator-analysis/references/api-contracts/douyin-search-api.md +4118 -0
  22. package/skills/creator-analysis/references/api-contracts/douyin-web-api.md +5544 -0
  23. package/skills/creator-analysis/references/api-contracts/douyin-xingtu-api.md +1916 -0
  24. package/skills/creator-analysis/references/api-contracts/douyin-xingtu-v2-api.md +1540 -0
  25. package/skills/creator-analysis/references/api-contracts/health-check.md +69 -0
  26. package/skills/creator-analysis/references/api-contracts/hybrid-parsing.md +78 -0
  27. package/skills/creator-analysis/references/api-contracts/instagram-v1-api.md +2256 -0
  28. package/skills/creator-analysis/references/api-contracts/instagram-v2-api.md +2011 -0
  29. package/skills/creator-analysis/references/api-contracts/instagram-v3-api.md +2630 -0
  30. package/skills/creator-analysis/references/api-contracts/ios-shortcut.md +44 -0
  31. package/skills/creator-analysis/references/api-contracts/kuaishou-app-api.md +1518 -0
  32. package/skills/creator-analysis/references/api-contracts/kuaishou-web-api.md +1242 -0
  33. package/skills/creator-analysis/references/api-contracts/lemon8-app-api.md +1088 -0
  34. package/skills/creator-analysis/references/api-contracts/linkedin-web-api.md +1949 -0
  35. package/skills/creator-analysis/references/api-contracts/media-ingest-api.md +126 -0
  36. package/skills/creator-analysis/references/api-contracts/pipixia-app-api.md +1142 -0
  37. package/skills/creator-analysis/references/api-contracts/reddit-app-api.md +2025 -0
  38. package/skills/creator-analysis/references/api-contracts/sora2-api.md +2266 -0
  39. package/skills/creator-analysis/references/api-contracts/temp-mail-api.md +208 -0
  40. package/skills/creator-analysis/references/api-contracts/threads-web-api.md +897 -0
  41. package/skills/creator-analysis/references/api-contracts/tikhub-downloader-api.md +134 -0
  42. package/skills/creator-analysis/references/api-contracts/tikhub-user-api.md +494 -0
  43. package/skills/creator-analysis/references/api-contracts/tiktok-ads-api.md +5947 -0
  44. package/skills/creator-analysis/references/api-contracts/tiktok-analytics-api.md +968 -0
  45. package/skills/creator-analysis/references/api-contracts/tiktok-app-v3-api.md +5735 -0
  46. package/skills/creator-analysis/references/api-contracts/tiktok-creator-api.md +1951 -0
  47. package/skills/creator-analysis/references/api-contracts/tiktok-interaction-api.md +742 -0
  48. package/skills/creator-analysis/references/api-contracts/tiktok-shop-web-api.md +1890 -0
  49. package/skills/creator-analysis/references/api-contracts/tiktok-web-api.md +4448 -0
  50. package/skills/creator-analysis/references/api-contracts/toutiao-app-api.md +342 -0
  51. package/skills/creator-analysis/references/api-contracts/toutiao-web-api.md +143 -0
  52. package/skills/creator-analysis/references/api-contracts/twitter-web-api.md +989 -0
  53. package/skills/creator-analysis/references/api-contracts/wechat-channels-api.md +809 -0
  54. package/skills/creator-analysis/references/api-contracts/wechat-media-platform-web-api.md +677 -0
  55. package/skills/creator-analysis/references/api-contracts/weibo-app-api.md +1547 -0
  56. package/skills/creator-analysis/references/api-contracts/weibo-web-api.md +798 -0
  57. package/skills/creator-analysis/references/api-contracts/weibo-web-v2-api.md +2459 -0
  58. package/skills/creator-analysis/references/api-contracts/xiaohongshu-app-api.md +1291 -0
  59. package/skills/creator-analysis/references/api-contracts/xiaohongshu-app-v2-api.md +1683 -0
  60. package/skills/creator-analysis/references/api-contracts/xiaohongshu-web-api.md +1324 -0
  61. package/skills/creator-analysis/references/api-contracts/xiaohongshu-web-v2-api.md +1209 -0
  62. package/skills/creator-analysis/references/api-contracts/xigua-app-v2-api.md +489 -0
  63. package/skills/creator-analysis/references/api-contracts/youtube-web-api.md +2636 -0
  64. package/skills/creator-analysis/references/api-contracts/youtube-web-v2-api.md +2660 -0
  65. package/skills/creator-analysis/references/api-contracts/zhihu-web-api.md +2315 -0
  66. package/skills/creator-analysis/references/api-tags/asr-api.md +100 -0
  67. package/skills/creator-analysis/references/api-tags/bilibili-app-api.md +482 -0
  68. package/skills/creator-analysis/references/api-tags/bilibili-web-api.md +1267 -0
  69. package/skills/creator-analysis/references/api-tags/demo-api.md +365 -0
  70. package/skills/creator-analysis/references/api-tags/douyin-app-v3-api.md +2012 -0
  71. package/skills/creator-analysis/references/api-tags/douyin-billboard-api.md +1428 -0
  72. package/skills/creator-analysis/references/api-tags/douyin-creator-api.md +694 -0
  73. package/skills/creator-analysis/references/api-tags/douyin-creator-v2-api.md +694 -0
  74. package/skills/creator-analysis/references/api-tags/douyin-search-api.md +1059 -0
  75. package/skills/creator-analysis/references/api-tags/douyin-web-api.md +3314 -0
  76. package/skills/creator-analysis/references/api-tags/douyin-xingtu-api.md +935 -0
  77. package/skills/creator-analysis/references/api-tags/douyin-xingtu-v2-api.md +925 -0
  78. package/skills/creator-analysis/references/api-tags/health-check.md +40 -0
  79. package/skills/creator-analysis/references/api-tags/hybrid-parsing.md +57 -0
  80. package/skills/creator-analysis/references/api-tags/instagram-v1-api.md +1224 -0
  81. package/skills/creator-analysis/references/api-tags/instagram-v2-api.md +1147 -0
  82. package/skills/creator-analysis/references/api-tags/instagram-v3-api.md +1123 -0
  83. package/skills/creator-analysis/references/api-tags/ios-shortcut.md +45 -0
  84. package/skills/creator-analysis/references/api-tags/kuaishou-app-api.md +846 -0
  85. package/skills/creator-analysis/references/api-tags/kuaishou-web-api.md +551 -0
  86. package/skills/creator-analysis/references/api-tags/lemon8-app-api.md +687 -0
  87. package/skills/creator-analysis/references/api-tags/linkedin-web-api.md +1105 -0
  88. package/skills/creator-analysis/references/api-tags/media-ingest-api.md +112 -0
  89. package/skills/creator-analysis/references/api-tags/pipixia-app-api.md +721 -0
  90. package/skills/creator-analysis/references/api-tags/reddit-app-api.md +1057 -0
  91. package/skills/creator-analysis/references/api-tags/sora2-api.md +737 -0
  92. package/skills/creator-analysis/references/api-tags/temp-mail-api.md +136 -0
  93. package/skills/creator-analysis/references/api-tags/threads-web-api.md +472 -0
  94. package/skills/creator-analysis/references/api-tags/tikhub-downloader-api.md +65 -0
  95. package/skills/creator-analysis/references/api-tags/tikhub-user-api.md +253 -0
  96. package/skills/creator-analysis/references/api-tags/tiktok-ads-api.md +1393 -0
  97. package/skills/creator-analysis/references/api-tags/tiktok-analytics-api.md +179 -0
  98. package/skills/creator-analysis/references/api-tags/tiktok-app-v3-api.md +3264 -0
  99. package/skills/creator-analysis/references/api-tags/tiktok-creator-api.md +709 -0
  100. package/skills/creator-analysis/references/api-tags/tiktok-interaction-api.md +366 -0
  101. package/skills/creator-analysis/references/api-tags/tiktok-shop-web-api.md +663 -0
  102. package/skills/creator-analysis/references/api-tags/tiktok-web-api.md +2516 -0
  103. package/skills/creator-analysis/references/api-tags/toutiao-app-api.md +220 -0
  104. package/skills/creator-analysis/references/api-tags/toutiao-web-api.md +96 -0
  105. package/skills/creator-analysis/references/api-tags/twitter-web-api.md +562 -0
  106. package/skills/creator-analysis/references/api-tags/wechat-channels-api.md +405 -0
  107. package/skills/creator-analysis/references/api-tags/wechat-media-platform-web-api.md +431 -0
  108. package/skills/creator-analysis/references/api-tags/weibo-app-api.md +851 -0
  109. package/skills/creator-analysis/references/api-tags/weibo-web-api.md +470 -0
  110. package/skills/creator-analysis/references/api-tags/weibo-web-v2-api.md +1405 -0
  111. package/skills/creator-analysis/references/api-tags/xiaohongshu-app-api.md +534 -0
  112. package/skills/creator-analysis/references/api-tags/xiaohongshu-app-v2-api.md +934 -0
  113. package/skills/creator-analysis/references/api-tags/xiaohongshu-web-api.md +757 -0
  114. package/skills/creator-analysis/references/api-tags/xiaohongshu-web-v2-api.md +762 -0
  115. package/skills/creator-analysis/references/api-tags/xigua-app-v2-api.md +308 -0
  116. package/skills/creator-analysis/references/api-tags/youtube-web-api.md +934 -0
  117. package/skills/creator-analysis/references/api-tags/youtube-web-v2-api.md +717 -0
  118. package/skills/creator-analysis/references/api-tags/zhihu-web-api.md +1384 -0
  119. package/skills/creator-analysis/references/asr-orchestration.md +33 -0
  120. package/skills/creator-analysis/references/config-templates/defaults.yaml +60 -0
  121. package/skills/creator-analysis/references/contracts/creator-card-fields.md +23 -0
  122. package/skills/creator-analysis/references/contracts/work-card-fields.md +32 -0
  123. package/skills/creator-analysis/references/platform-guides/douyin.md +49 -0
  124. package/skills/creator-analysis/references/platform-guides/generic.md +46 -0
  125. package/skills/creator-analysis/references/platform-guides/xiaohongshu.md +54 -0
  126. package/skills/creator-analysis/references/prompt-contracts/asr-clean.md +28 -0
  127. package/skills/creator-analysis/references/prompt-contracts/author-analysis-v2.md +46 -0
  128. package/skills/creator-analysis/references/prompt-contracts/author-analysis.md +49 -0
  129. package/skills/creator-analysis/references/prompt-contracts/cta.md +24 -0
  130. package/skills/creator-analysis/references/prompt-contracts/hook.md +25 -0
  131. package/skills/creator-analysis/references/prompt-contracts/insight.md +47 -0
  132. package/skills/creator-analysis/references/prompt-contracts/sampled-work-batch-explanations.md +30 -0
  133. package/skills/creator-analysis/references/prompt-contracts/structure.md +25 -0
  134. package/skills/creator-analysis/references/prompt-contracts/style.md +27 -0
  135. package/skills/creator-analysis/references/prompt-contracts/summary.md +29 -0
  136. package/skills/creator-analysis/references/prompt-contracts/topic.md +29 -0
  137. package/skills/creator-analysis/references/schemas/author-analysis-input-v1.schema.json +325 -0
  138. package/skills/creator-analysis/references/schemas/author-analysis-v2.schema.json +158 -0
  139. package/skills/creator-analysis/references/schemas/sampled-work-batch-explanations.schema.json +41 -0
  140. package/skills/creator-analysis/references/service-guides/asr-u2-u3-fallback.md +75 -0
  141. package/skills/creator-analysis/references/workflow.md +18 -0
  142. package/skills/creator-analysis/scripts/__init__.py +0 -0
  143. package/skills/creator-analysis/scripts/author_home/__init__.py +0 -0
  144. package/skills/creator-analysis/scripts/author_home/adapters/__init__.py +0 -0
  145. package/skills/creator-analysis/scripts/author_home/adapters/platform_adapters.py +299 -0
  146. package/skills/creator-analysis/scripts/author_home/analyzers/__init__.py +0 -0
  147. package/skills/creator-analysis/scripts/author_home/analyzers/author_analysis_v2_support.py +1122 -0
  148. package/skills/creator-analysis/scripts/author_home/analyzers/prompt_first_analyzers.py +260 -0
  149. package/skills/creator-analysis/scripts/author_home/analyzers/sampled_work_batch_explainer.py +260 -0
  150. package/skills/creator-analysis/scripts/author_home/asr/__init__.py +5 -0
  151. package/skills/creator-analysis/scripts/author_home/asr/home_asr.py +961 -0
  152. package/skills/creator-analysis/scripts/author_home/builders/__init__.py +0 -0
  153. package/skills/creator-analysis/scripts/author_home/builders/home_builders.py +149 -0
  154. package/skills/creator-analysis/scripts/author_home/collectors/__init__.py +0 -0
  155. package/skills/creator-analysis/scripts/author_home/collectors/homepage_collectors.py +636 -0
  156. package/skills/creator-analysis/scripts/author_home/orchestrator/__init__.py +0 -0
  157. package/skills/creator-analysis/scripts/author_home/orchestrator/run_author_analysis.py +491 -0
  158. package/skills/creator-analysis/scripts/author_home/orchestrator/work_analysis_artifacts.py +553 -0
  159. package/skills/creator-analysis/scripts/author_home/schema.py +417 -0
  160. package/skills/creator-analysis/scripts/core/__init__.py +0 -0
  161. package/skills/creator-analysis/scripts/core/analysis_pipeline.py +133 -0
  162. package/skills/creator-analysis/scripts/core/bootstrap_env.py +35 -0
  163. package/skills/creator-analysis/scripts/core/config_loader.py +418 -0
  164. package/skills/creator-analysis/scripts/core/extract_pipeline.py +173 -0
  165. package/skills/creator-analysis/scripts/core/progress_report.py +111 -0
  166. package/skills/creator-analysis/scripts/core/storage_router.py +253 -0
  167. package/skills/creator-analysis/scripts/core/tikomni_common.py +588 -0
  168. package/skills/creator-analysis/scripts/pipeline/__init__.py +0 -0
  169. package/skills/creator-analysis/scripts/pipeline/asr/__init__.py +0 -0
  170. package/skills/creator-analysis/scripts/pipeline/asr/asr_pipeline.py +1189 -0
  171. package/skills/creator-analysis/scripts/pipeline/asr/poll_u2_task.py +95 -0
  172. package/skills/creator-analysis/scripts/platform/__init__.py +0 -0
  173. package/skills/creator-analysis/scripts/platform/douyin/__init__.py +0 -0
  174. package/skills/creator-analysis/scripts/platform/douyin/douyin_video_type_matrix.py +224 -0
  175. package/skills/creator-analysis/scripts/platform/douyin/run_douyin_single_video.py +1208 -0
  176. package/skills/creator-analysis/scripts/platform/douyin/select_low_quality_video_url.py +200 -0
  177. package/skills/creator-analysis/scripts/platform/xiaohongshu/__init__.py +0 -0
  178. package/skills/creator-analysis/scripts/platform/xiaohongshu/run_xiaohongshu_extract.py +2128 -0
  179. package/skills/creator-analysis/scripts/writers/__init__.py +0 -0
  180. package/skills/creator-analysis/scripts/writers/write_author_homepage_samples.py +106 -0
  181. package/skills/creator-analysis/scripts/writers/write_benchmark_card.py +1402 -0
  182. package/skills/meta-capability/SKILL.md +69 -0
  183. package/skills/meta-capability/agents/openai.yaml +4 -0
  184. package/skills/meta-capability/env.example +42 -0
  185. package/skills/meta-capability/references/api-capability-index.md +92 -0
  186. package/skills/meta-capability/references/api-contracts/asr-api.md +130 -0
  187. package/skills/meta-capability/references/api-contracts/bilibili-app-api.md +776 -0
  188. package/skills/meta-capability/references/api-contracts/bilibili-web-api.md +2017 -0
  189. package/skills/meta-capability/references/api-contracts/demo-api.md +717 -0
  190. package/skills/meta-capability/references/api-contracts/douyin-app-v3-api.md +3594 -0
  191. package/skills/meta-capability/references/api-contracts/douyin-billboard-api.md +2274 -0
  192. package/skills/meta-capability/references/api-contracts/douyin-creator-api.md +1575 -0
  193. package/skills/meta-capability/references/api-contracts/douyin-creator-v2-api.md +3254 -0
  194. package/skills/meta-capability/references/api-contracts/douyin-search-api.md +4118 -0
  195. package/skills/meta-capability/references/api-contracts/douyin-web-api.md +5544 -0
  196. package/skills/meta-capability/references/api-contracts/douyin-xingtu-api.md +1916 -0
  197. package/skills/meta-capability/references/api-contracts/douyin-xingtu-v2-api.md +1540 -0
  198. package/skills/meta-capability/references/api-contracts/health-check.md +69 -0
  199. package/skills/meta-capability/references/api-contracts/hybrid-parsing.md +78 -0
  200. package/skills/meta-capability/references/api-contracts/instagram-v1-api.md +2256 -0
  201. package/skills/meta-capability/references/api-contracts/instagram-v2-api.md +2011 -0
  202. package/skills/meta-capability/references/api-contracts/instagram-v3-api.md +2630 -0
  203. package/skills/meta-capability/references/api-contracts/ios-shortcut.md +44 -0
  204. package/skills/meta-capability/references/api-contracts/kuaishou-app-api.md +1518 -0
  205. package/skills/meta-capability/references/api-contracts/kuaishou-web-api.md +1242 -0
  206. package/skills/meta-capability/references/api-contracts/lemon8-app-api.md +1088 -0
  207. package/skills/meta-capability/references/api-contracts/linkedin-web-api.md +1949 -0
  208. package/skills/meta-capability/references/api-contracts/media-ingest-api.md +126 -0
  209. package/skills/meta-capability/references/api-contracts/pipixia-app-api.md +1142 -0
  210. package/skills/meta-capability/references/api-contracts/reddit-app-api.md +2025 -0
  211. package/skills/meta-capability/references/api-contracts/sora2-api.md +2266 -0
  212. package/skills/meta-capability/references/api-contracts/temp-mail-api.md +208 -0
  213. package/skills/meta-capability/references/api-contracts/threads-web-api.md +897 -0
  214. package/skills/meta-capability/references/api-contracts/tikhub-downloader-api.md +134 -0
  215. package/skills/meta-capability/references/api-contracts/tikhub-user-api.md +494 -0
  216. package/skills/meta-capability/references/api-contracts/tiktok-ads-api.md +5947 -0
  217. package/skills/meta-capability/references/api-contracts/tiktok-analytics-api.md +968 -0
  218. package/skills/meta-capability/references/api-contracts/tiktok-app-v3-api.md +5735 -0
  219. package/skills/meta-capability/references/api-contracts/tiktok-creator-api.md +1951 -0
  220. package/skills/meta-capability/references/api-contracts/tiktok-interaction-api.md +742 -0
  221. package/skills/meta-capability/references/api-contracts/tiktok-shop-web-api.md +1890 -0
  222. package/skills/meta-capability/references/api-contracts/tiktok-web-api.md +4448 -0
  223. package/skills/meta-capability/references/api-contracts/toutiao-app-api.md +342 -0
  224. package/skills/meta-capability/references/api-contracts/toutiao-web-api.md +143 -0
  225. package/skills/meta-capability/references/api-contracts/twitter-web-api.md +989 -0
  226. package/skills/meta-capability/references/api-contracts/wechat-channels-api.md +809 -0
  227. package/skills/meta-capability/references/api-contracts/wechat-media-platform-web-api.md +677 -0
  228. package/skills/meta-capability/references/api-contracts/weibo-app-api.md +1547 -0
  229. package/skills/meta-capability/references/api-contracts/weibo-web-api.md +798 -0
  230. package/skills/meta-capability/references/api-contracts/weibo-web-v2-api.md +2459 -0
  231. package/skills/meta-capability/references/api-contracts/xiaohongshu-app-api.md +1291 -0
  232. package/skills/meta-capability/references/api-contracts/xiaohongshu-app-v2-api.md +1683 -0
  233. package/skills/meta-capability/references/api-contracts/xiaohongshu-web-api.md +1324 -0
  234. package/skills/meta-capability/references/api-contracts/xiaohongshu-web-v2-api.md +1209 -0
  235. package/skills/meta-capability/references/api-contracts/xigua-app-v2-api.md +489 -0
  236. package/skills/meta-capability/references/api-contracts/youtube-web-api.md +2636 -0
  237. package/skills/meta-capability/references/api-contracts/youtube-web-v2-api.md +2660 -0
  238. package/skills/meta-capability/references/api-contracts/zhihu-web-api.md +2315 -0
  239. package/skills/meta-capability/references/api-tags/asr-api.md +100 -0
  240. package/skills/meta-capability/references/api-tags/bilibili-app-api.md +482 -0
  241. package/skills/meta-capability/references/api-tags/bilibili-web-api.md +1267 -0
  242. package/skills/meta-capability/references/api-tags/demo-api.md +365 -0
  243. package/skills/meta-capability/references/api-tags/douyin-app-v3-api.md +2012 -0
  244. package/skills/meta-capability/references/api-tags/douyin-billboard-api.md +1428 -0
  245. package/skills/meta-capability/references/api-tags/douyin-creator-api.md +694 -0
  246. package/skills/meta-capability/references/api-tags/douyin-creator-v2-api.md +694 -0
  247. package/skills/meta-capability/references/api-tags/douyin-search-api.md +1059 -0
  248. package/skills/meta-capability/references/api-tags/douyin-web-api.md +3314 -0
  249. package/skills/meta-capability/references/api-tags/douyin-xingtu-api.md +935 -0
  250. package/skills/meta-capability/references/api-tags/douyin-xingtu-v2-api.md +925 -0
  251. package/skills/meta-capability/references/api-tags/health-check.md +40 -0
  252. package/skills/meta-capability/references/api-tags/hybrid-parsing.md +57 -0
  253. package/skills/meta-capability/references/api-tags/instagram-v1-api.md +1224 -0
  254. package/skills/meta-capability/references/api-tags/instagram-v2-api.md +1147 -0
  255. package/skills/meta-capability/references/api-tags/instagram-v3-api.md +1123 -0
  256. package/skills/meta-capability/references/api-tags/ios-shortcut.md +45 -0
  257. package/skills/meta-capability/references/api-tags/kuaishou-app-api.md +846 -0
  258. package/skills/meta-capability/references/api-tags/kuaishou-web-api.md +551 -0
  259. package/skills/meta-capability/references/api-tags/lemon8-app-api.md +687 -0
  260. package/skills/meta-capability/references/api-tags/linkedin-web-api.md +1105 -0
  261. package/skills/meta-capability/references/api-tags/media-ingest-api.md +112 -0
  262. package/skills/meta-capability/references/api-tags/pipixia-app-api.md +721 -0
  263. package/skills/meta-capability/references/api-tags/reddit-app-api.md +1057 -0
  264. package/skills/meta-capability/references/api-tags/sora2-api.md +737 -0
  265. package/skills/meta-capability/references/api-tags/temp-mail-api.md +136 -0
  266. package/skills/meta-capability/references/api-tags/threads-web-api.md +472 -0
  267. package/skills/meta-capability/references/api-tags/tikhub-downloader-api.md +65 -0
  268. package/skills/meta-capability/references/api-tags/tikhub-user-api.md +253 -0
  269. package/skills/meta-capability/references/api-tags/tiktok-ads-api.md +1393 -0
  270. package/skills/meta-capability/references/api-tags/tiktok-analytics-api.md +179 -0
  271. package/skills/meta-capability/references/api-tags/tiktok-app-v3-api.md +3264 -0
  272. package/skills/meta-capability/references/api-tags/tiktok-creator-api.md +709 -0
  273. package/skills/meta-capability/references/api-tags/tiktok-interaction-api.md +366 -0
  274. package/skills/meta-capability/references/api-tags/tiktok-shop-web-api.md +663 -0
  275. package/skills/meta-capability/references/api-tags/tiktok-web-api.md +2516 -0
  276. package/skills/meta-capability/references/api-tags/toutiao-app-api.md +220 -0
  277. package/skills/meta-capability/references/api-tags/toutiao-web-api.md +96 -0
  278. package/skills/meta-capability/references/api-tags/twitter-web-api.md +562 -0
  279. package/skills/meta-capability/references/api-tags/wechat-channels-api.md +405 -0
  280. package/skills/meta-capability/references/api-tags/wechat-media-platform-web-api.md +431 -0
  281. package/skills/meta-capability/references/api-tags/weibo-app-api.md +851 -0
  282. package/skills/meta-capability/references/api-tags/weibo-web-api.md +470 -0
  283. package/skills/meta-capability/references/api-tags/weibo-web-v2-api.md +1405 -0
  284. package/skills/meta-capability/references/api-tags/xiaohongshu-app-api.md +534 -0
  285. package/skills/meta-capability/references/api-tags/xiaohongshu-app-v2-api.md +934 -0
  286. package/skills/meta-capability/references/api-tags/xiaohongshu-web-api.md +757 -0
  287. package/skills/meta-capability/references/api-tags/xiaohongshu-web-v2-api.md +762 -0
  288. package/skills/meta-capability/references/api-tags/xigua-app-v2-api.md +308 -0
  289. package/skills/meta-capability/references/api-tags/youtube-web-api.md +934 -0
  290. package/skills/meta-capability/references/api-tags/youtube-web-v2-api.md +717 -0
  291. package/skills/meta-capability/references/api-tags/zhihu-web-api.md +1384 -0
  292. package/skills/meta-capability/references/config-templates/defaults.yaml +18 -0
  293. package/skills/meta-capability/references/dispatch.md +27 -0
  294. package/skills/meta-capability/references/execution-guidelines.md +25 -0
  295. package/skills/meta-capability/references/implemented-route-map.md +177 -0
  296. package/skills/meta-capability/references/service-guides/asr-u2-u3-fallback.md +75 -0
  297. package/skills/meta-capability/scripts/__init__.py +1 -0
  298. package/skills/meta-capability/scripts/call_route.py +141 -0
  299. package/skills/meta-capability/scripts/core/__init__.py +1 -0
  300. package/skills/meta-capability/scripts/core/bootstrap_env.py +32 -0
  301. package/skills/meta-capability/scripts/core/config_loader.py +204 -0
  302. package/skills/meta-capability/scripts/core/tikomni_common.py +443 -0
  303. package/skills/meta-capability/scripts/test_auth.py +98 -0
  304. package/skills/single-work-analysis/SKILL.md +62 -0
  305. package/skills/single-work-analysis/agents/openai.yaml +4 -0
  306. package/skills/single-work-analysis/env.example +36 -0
  307. package/skills/single-work-analysis/references/api-capability-index.md +92 -0
  308. package/skills/single-work-analysis/references/api-contracts/asr-api.md +130 -0
  309. package/skills/single-work-analysis/references/api-contracts/bilibili-app-api.md +776 -0
  310. package/skills/single-work-analysis/references/api-contracts/bilibili-web-api.md +2017 -0
  311. package/skills/single-work-analysis/references/api-contracts/demo-api.md +717 -0
  312. package/skills/single-work-analysis/references/api-contracts/douyin-app-v3-api.md +3594 -0
  313. package/skills/single-work-analysis/references/api-contracts/douyin-billboard-api.md +2274 -0
  314. package/skills/single-work-analysis/references/api-contracts/douyin-creator-api.md +1575 -0
  315. package/skills/single-work-analysis/references/api-contracts/douyin-creator-v2-api.md +3254 -0
  316. package/skills/single-work-analysis/references/api-contracts/douyin-search-api.md +4118 -0
  317. package/skills/single-work-analysis/references/api-contracts/douyin-web-api.md +5544 -0
  318. package/skills/single-work-analysis/references/api-contracts/douyin-xingtu-api.md +1916 -0
  319. package/skills/single-work-analysis/references/api-contracts/douyin-xingtu-v2-api.md +1540 -0
  320. package/skills/single-work-analysis/references/api-contracts/health-check.md +69 -0
  321. package/skills/single-work-analysis/references/api-contracts/hybrid-parsing.md +78 -0
  322. package/skills/single-work-analysis/references/api-contracts/instagram-v1-api.md +2256 -0
  323. package/skills/single-work-analysis/references/api-contracts/instagram-v2-api.md +2011 -0
  324. package/skills/single-work-analysis/references/api-contracts/instagram-v3-api.md +2630 -0
  325. package/skills/single-work-analysis/references/api-contracts/ios-shortcut.md +44 -0
  326. package/skills/single-work-analysis/references/api-contracts/kuaishou-app-api.md +1518 -0
  327. package/skills/single-work-analysis/references/api-contracts/kuaishou-web-api.md +1242 -0
  328. package/skills/single-work-analysis/references/api-contracts/lemon8-app-api.md +1088 -0
  329. package/skills/single-work-analysis/references/api-contracts/linkedin-web-api.md +1949 -0
  330. package/skills/single-work-analysis/references/api-contracts/media-ingest-api.md +126 -0
  331. package/skills/single-work-analysis/references/api-contracts/pipixia-app-api.md +1142 -0
  332. package/skills/single-work-analysis/references/api-contracts/reddit-app-api.md +2025 -0
  333. package/skills/single-work-analysis/references/api-contracts/sora2-api.md +2266 -0
  334. package/skills/single-work-analysis/references/api-contracts/temp-mail-api.md +208 -0
  335. package/skills/single-work-analysis/references/api-contracts/threads-web-api.md +897 -0
  336. package/skills/single-work-analysis/references/api-contracts/tikhub-downloader-api.md +134 -0
  337. package/skills/single-work-analysis/references/api-contracts/tikhub-user-api.md +494 -0
  338. package/skills/single-work-analysis/references/api-contracts/tiktok-ads-api.md +5947 -0
  339. package/skills/single-work-analysis/references/api-contracts/tiktok-analytics-api.md +968 -0
  340. package/skills/single-work-analysis/references/api-contracts/tiktok-app-v3-api.md +5735 -0
  341. package/skills/single-work-analysis/references/api-contracts/tiktok-creator-api.md +1951 -0
  342. package/skills/single-work-analysis/references/api-contracts/tiktok-interaction-api.md +742 -0
  343. package/skills/single-work-analysis/references/api-contracts/tiktok-shop-web-api.md +1890 -0
  344. package/skills/single-work-analysis/references/api-contracts/tiktok-web-api.md +4448 -0
  345. package/skills/single-work-analysis/references/api-contracts/toutiao-app-api.md +342 -0
  346. package/skills/single-work-analysis/references/api-contracts/toutiao-web-api.md +143 -0
  347. package/skills/single-work-analysis/references/api-contracts/twitter-web-api.md +989 -0
  348. package/skills/single-work-analysis/references/api-contracts/wechat-channels-api.md +809 -0
  349. package/skills/single-work-analysis/references/api-contracts/wechat-media-platform-web-api.md +677 -0
  350. package/skills/single-work-analysis/references/api-contracts/weibo-app-api.md +1547 -0
  351. package/skills/single-work-analysis/references/api-contracts/weibo-web-api.md +798 -0
  352. package/skills/single-work-analysis/references/api-contracts/weibo-web-v2-api.md +2459 -0
  353. package/skills/single-work-analysis/references/api-contracts/xiaohongshu-app-api.md +1291 -0
  354. package/skills/single-work-analysis/references/api-contracts/xiaohongshu-app-v2-api.md +1683 -0
  355. package/skills/single-work-analysis/references/api-contracts/xiaohongshu-web-api.md +1324 -0
  356. package/skills/single-work-analysis/references/api-contracts/xiaohongshu-web-v2-api.md +1209 -0
  357. package/skills/single-work-analysis/references/api-contracts/xigua-app-v2-api.md +489 -0
  358. package/skills/single-work-analysis/references/api-contracts/youtube-web-api.md +2636 -0
  359. package/skills/single-work-analysis/references/api-contracts/youtube-web-v2-api.md +2660 -0
  360. package/skills/single-work-analysis/references/api-contracts/zhihu-web-api.md +2315 -0
  361. package/skills/single-work-analysis/references/api-tags/asr-api.md +100 -0
  362. package/skills/single-work-analysis/references/api-tags/bilibili-app-api.md +482 -0
  363. package/skills/single-work-analysis/references/api-tags/bilibili-web-api.md +1267 -0
  364. package/skills/single-work-analysis/references/api-tags/demo-api.md +365 -0
  365. package/skills/single-work-analysis/references/api-tags/douyin-app-v3-api.md +2012 -0
  366. package/skills/single-work-analysis/references/api-tags/douyin-billboard-api.md +1428 -0
  367. package/skills/single-work-analysis/references/api-tags/douyin-creator-api.md +694 -0
  368. package/skills/single-work-analysis/references/api-tags/douyin-creator-v2-api.md +694 -0
  369. package/skills/single-work-analysis/references/api-tags/douyin-search-api.md +1059 -0
  370. package/skills/single-work-analysis/references/api-tags/douyin-web-api.md +3314 -0
  371. package/skills/single-work-analysis/references/api-tags/douyin-xingtu-api.md +935 -0
  372. package/skills/single-work-analysis/references/api-tags/douyin-xingtu-v2-api.md +925 -0
  373. package/skills/single-work-analysis/references/api-tags/health-check.md +40 -0
  374. package/skills/single-work-analysis/references/api-tags/hybrid-parsing.md +57 -0
  375. package/skills/single-work-analysis/references/api-tags/instagram-v1-api.md +1224 -0
  376. package/skills/single-work-analysis/references/api-tags/instagram-v2-api.md +1147 -0
  377. package/skills/single-work-analysis/references/api-tags/instagram-v3-api.md +1123 -0
  378. package/skills/single-work-analysis/references/api-tags/ios-shortcut.md +45 -0
  379. package/skills/single-work-analysis/references/api-tags/kuaishou-app-api.md +846 -0
  380. package/skills/single-work-analysis/references/api-tags/kuaishou-web-api.md +551 -0
  381. package/skills/single-work-analysis/references/api-tags/lemon8-app-api.md +687 -0
  382. package/skills/single-work-analysis/references/api-tags/linkedin-web-api.md +1105 -0
  383. package/skills/single-work-analysis/references/api-tags/media-ingest-api.md +112 -0
  384. package/skills/single-work-analysis/references/api-tags/pipixia-app-api.md +721 -0
  385. package/skills/single-work-analysis/references/api-tags/reddit-app-api.md +1057 -0
  386. package/skills/single-work-analysis/references/api-tags/sora2-api.md +737 -0
  387. package/skills/single-work-analysis/references/api-tags/temp-mail-api.md +136 -0
  388. package/skills/single-work-analysis/references/api-tags/threads-web-api.md +472 -0
  389. package/skills/single-work-analysis/references/api-tags/tikhub-downloader-api.md +65 -0
  390. package/skills/single-work-analysis/references/api-tags/tikhub-user-api.md +253 -0
  391. package/skills/single-work-analysis/references/api-tags/tiktok-ads-api.md +1393 -0
  392. package/skills/single-work-analysis/references/api-tags/tiktok-analytics-api.md +179 -0
  393. package/skills/single-work-analysis/references/api-tags/tiktok-app-v3-api.md +3264 -0
  394. package/skills/single-work-analysis/references/api-tags/tiktok-creator-api.md +709 -0
  395. package/skills/single-work-analysis/references/api-tags/tiktok-interaction-api.md +366 -0
  396. package/skills/single-work-analysis/references/api-tags/tiktok-shop-web-api.md +663 -0
  397. package/skills/single-work-analysis/references/api-tags/tiktok-web-api.md +2516 -0
  398. package/skills/single-work-analysis/references/api-tags/toutiao-app-api.md +220 -0
  399. package/skills/single-work-analysis/references/api-tags/toutiao-web-api.md +96 -0
  400. package/skills/single-work-analysis/references/api-tags/twitter-web-api.md +562 -0
  401. package/skills/single-work-analysis/references/api-tags/wechat-channels-api.md +405 -0
  402. package/skills/single-work-analysis/references/api-tags/wechat-media-platform-web-api.md +431 -0
  403. package/skills/single-work-analysis/references/api-tags/weibo-app-api.md +851 -0
  404. package/skills/single-work-analysis/references/api-tags/weibo-web-api.md +470 -0
  405. package/skills/single-work-analysis/references/api-tags/weibo-web-v2-api.md +1405 -0
  406. package/skills/single-work-analysis/references/api-tags/xiaohongshu-app-api.md +534 -0
  407. package/skills/single-work-analysis/references/api-tags/xiaohongshu-app-v2-api.md +934 -0
  408. package/skills/single-work-analysis/references/api-tags/xiaohongshu-web-api.md +757 -0
  409. package/skills/single-work-analysis/references/api-tags/xiaohongshu-web-v2-api.md +762 -0
  410. package/skills/single-work-analysis/references/api-tags/xigua-app-v2-api.md +308 -0
  411. package/skills/single-work-analysis/references/api-tags/youtube-web-api.md +934 -0
  412. package/skills/single-work-analysis/references/api-tags/youtube-web-v2-api.md +717 -0
  413. package/skills/single-work-analysis/references/api-tags/zhihu-web-api.md +1384 -0
  414. package/skills/single-work-analysis/references/asr-and-fallback.md +20 -0
  415. package/skills/single-work-analysis/references/config-templates/defaults.yaml +58 -0
  416. package/skills/single-work-analysis/references/contracts/work-card-fields.md +41 -0
  417. package/skills/single-work-analysis/references/platform-guides/douyin.md +47 -0
  418. package/skills/single-work-analysis/references/platform-guides/generic.md +43 -0
  419. package/skills/single-work-analysis/references/platform-guides/xiaohongshu.md +54 -0
  420. package/skills/single-work-analysis/references/prompt-contracts/asr-clean.md +28 -0
  421. package/skills/single-work-analysis/references/prompt-contracts/cta.md +24 -0
  422. package/skills/single-work-analysis/references/prompt-contracts/hook.md +25 -0
  423. package/skills/single-work-analysis/references/prompt-contracts/insight.md +47 -0
  424. package/skills/single-work-analysis/references/prompt-contracts/structure.md +25 -0
  425. package/skills/single-work-analysis/references/prompt-contracts/style.md +27 -0
  426. package/skills/single-work-analysis/references/prompt-contracts/summary.md +29 -0
  427. package/skills/single-work-analysis/references/prompt-contracts/topic.md +29 -0
  428. package/skills/single-work-analysis/references/schemas/work-card.schema.json +39 -0
  429. package/skills/single-work-analysis/references/service-guides/asr-u2-u3-fallback.md +75 -0
  430. package/skills/single-work-analysis/scripts/__init__.py +0 -0
  431. package/skills/single-work-analysis/scripts/core/__init__.py +0 -0
  432. package/skills/single-work-analysis/scripts/core/analysis_pipeline.py +133 -0
  433. package/skills/single-work-analysis/scripts/core/bootstrap_env.py +35 -0
  434. package/skills/single-work-analysis/scripts/core/config_loader.py +418 -0
  435. package/skills/single-work-analysis/scripts/core/extract_pipeline.py +173 -0
  436. package/skills/single-work-analysis/scripts/core/progress_report.py +111 -0
  437. package/skills/single-work-analysis/scripts/core/storage_router.py +253 -0
  438. package/skills/single-work-analysis/scripts/core/tikomni_common.py +588 -0
  439. package/skills/single-work-analysis/scripts/pipeline/__init__.py +0 -0
  440. package/skills/single-work-analysis/scripts/pipeline/asr/__init__.py +0 -0
  441. package/skills/single-work-analysis/scripts/pipeline/asr/asr_pipeline.py +1189 -0
  442. package/skills/single-work-analysis/scripts/pipeline/asr/poll_u2_task.py +95 -0
  443. package/skills/single-work-analysis/scripts/platform/__init__.py +0 -0
  444. package/skills/single-work-analysis/scripts/platform/douyin/__init__.py +0 -0
  445. package/skills/single-work-analysis/scripts/platform/douyin/douyin_video_type_matrix.py +224 -0
  446. package/skills/single-work-analysis/scripts/platform/douyin/run_douyin_single_video.py +1233 -0
  447. package/skills/single-work-analysis/scripts/platform/douyin/select_low_quality_video_url.py +200 -0
  448. package/skills/single-work-analysis/scripts/platform/xiaohongshu/__init__.py +0 -0
  449. package/skills/single-work-analysis/scripts/platform/xiaohongshu/run_xiaohongshu_extract.py +2156 -0
  450. package/skills/single-work-analysis/scripts/writers/__init__.py +0 -0
  451. package/skills/single-work-analysis/scripts/writers/write_benchmark_card.py +1402 -0
@@ -0,0 +1,2156 @@
1
#!/usr/bin/env python3
"""Xiaohongshu extraction: APP V2 -> APP V1 -> WEB_V2 -> WEB."""

# Fix: the module docstring previously sat *after* the bootstrap block, making
# it a dead string expression (``__doc__`` stayed None). It must be the first
# statement; the shebang is not a statement, so this placement is valid.

# When executed directly (no package context), walk up from this file until
# the skill root (the directory containing scripts/core/bootstrap_env.py) is
# found and prepend it to sys.path so absolute ``scripts.*`` imports resolve.
if __package__ in {None, ""}:
    import sys
    from pathlib import Path

    _self = Path(__file__).resolve()
    for _parent in _self.parents:
        if (_parent / "scripts" / "core" / "bootstrap_env.py").is_file():
            sys.path.insert(0, str(_parent))
            break

from scripts.core.bootstrap_env import bootstrap_for_direct_run

# Finish environment setup (side effects only) before the heavy imports below.
bootstrap_for_direct_run(__file__, __package__)
18
+
19
+ import argparse
20
+ import hashlib
21
+ import json
22
+ import re
23
+ import urllib.parse
24
+ import urllib.request
25
+ from datetime import datetime
26
+ from pathlib import Path
27
+ from typing import Any, Dict, List, Optional, Tuple
28
+
29
+ from scripts.pipeline.asr.asr_pipeline import run_u2_asr_candidates_with_timeout_retry
30
+ from scripts.core.config_loader import config_get, load_tikomni_config, resolve_storage_paths
31
+ from scripts.core.progress_report import ProgressReporter
32
+ from scripts.core.storage_router import render_output_filename, resolve_json_filename_pattern
33
+ from scripts.core.extract_pipeline import build_api_trace, resolve_trace_error_context
34
+ from scripts.core.tikomni_common import (
35
+ call_json_api,
36
+ deep_find_all,
37
+ deep_find_first,
38
+ normalize_text,
39
+ resolve_runtime,
40
+ summarize_content,
41
+ write_json_stdout,
42
+ )
43
+ from scripts.writers.write_benchmark_card import write_benchmark_card
44
+
45
# Backend API endpoint paths, in the order the extractor prefers them:
# APP V2 (per-content-type detail), then APP V1, then the WEB_V2 feed
# variants, then the plain WEB note endpoint as the last fallback.
APP_V2_VIDEO_ENDPOINT = "/api/u1/v1/xiaohongshu/app_v2/get_video_note_detail"
APP_V2_IMAGE_ENDPOINT = "/api/u1/v1/xiaohongshu/app_v2/get_image_note_detail"
APP_V2_MIXED_ENDPOINT = "/api/u1/v1/xiaohongshu/app_v2/get_mixed_note_detail"
APP_V1_ENDPOINT = "/api/u1/v1/xiaohongshu/app/get_note_info"
WEB_V2_V2_ENDPOINT = "/api/u1/v1/xiaohongshu/web_v2/fetch_feed_notes_v2"
WEB_V2_V3_ENDPOINT = "/api/u1/v1/xiaohongshu/web_v2/fetch_feed_notes_v3"
WEB_ENDPOINT = "/api/u1/v1/xiaohongshu/web/get_note_info_v7"
# U2 ASR gate bounds: a note qualifies only when it is a video strictly
# longer than 13s and at most 30min, with a download URL present
# (see _evaluate_u2_gate_for_xhs; U2_GATE_RULE is the human-readable form).
U2_GATE_MIN_DURATION_MS = 13000
U2_GATE_MAX_DURATION_MS = 1800000
U2_GATE_RULE = "is_video && 13000<duration_ms<=1800000 && video_download_url_present"
55
+
56
+
57
def _format_published_date(value: Any) -> str:
    """Render *value* (unix seconds) as YYYY-MM-DD, or "N/A" when unusable."""
    seconds = _to_int_or_none(value)
    if seconds is not None:
        try:
            return datetime.fromtimestamp(seconds).strftime("%Y-%m-%d")
        except Exception:
            # Out-of-range timestamps fall through to the sentinel.
            pass
    return "N/A"
65
+
66
+
67
def _to_int_or_none(value: Any) -> Optional[int]:
    """Best-effort coercion to a positive int; None when non-positive or unparseable.

    NOTE: bools are returned as 0/1 without the positivity filter, matching the
    original contract.
    """
    try:
        if isinstance(value, bool):
            return int(value)
        if isinstance(value, (int, float)):
            number = int(value)
        else:
            text = normalize_text(value)
            if not text:
                return None
            # Accept thousands separators such as "1,234".
            number = int(float(text.replace(",", "")))
        return number if number > 0 else None
    except Exception:
        return None
81
+
82
+
83
def _evaluate_u2_gate_for_xhs(*, note_content_type: str, duration_ms: Any, video_down_url: Optional[str]) -> Dict[str, Any]:
    """Decide whether a note qualifies for the U2 ASR pipeline.

    Pass requires video/mixed content, 13s < duration <= 30min, and a
    non-empty download URL — mirroring U2_GATE_RULE. The first failing
    check determines the skip reason.
    """
    kind = normalize_text(note_content_type).lower()
    video_like = kind in {"video", "mixed"}
    duration = _to_int_or_none(duration_ms)
    download_url = normalize_text(video_down_url)

    if not video_like:
        reason = "skip:not_video"
    elif duration is None:
        reason = "skip:duration_missing"
    elif duration <= U2_GATE_MIN_DURATION_MS:
        reason = "skip:duration_too_short"
    elif duration > U2_GATE_MAX_DURATION_MS:
        reason = "skip:duration_too_long"
    elif not download_url:
        reason = "skip:video_download_url_missing"
    else:
        reason = "pass"

    return {
        "can_u2": reason == "pass",
        "gate_reason": reason,
        "is_video": video_like,
        "duration_ms": duration,
        "video_down_url": download_url,
        "video_download_url": download_url,
        "video_download_url_present": bool(download_url),
    }
111
+
112
+
113
def _safe_slug(value: Optional[str], fallback: str = "unknown") -> str:
    """Lowercase, filesystem-safe slug of *value*, capped at 64 chars."""
    text = normalize_text(value)
    if not text:
        return fallback
    # Collapse any run of unsafe characters into a single dash.
    collapsed = re.sub(r"[^a-zA-Z0-9_-]+", "-", text)
    slug = collapsed.strip("-").lower()[:64]
    return slug or fallback
119
+
120
+
121
def _traceable_identifier(source_input: Dict[str, Optional[str]], note_id: Optional[str]) -> str:
    """Stable identifier for output filenames: note id, share-text hash, or marker."""
    if note_id:
        return _safe_slug(note_id)
    share_text = normalize_text(source_input.get("share_text"))
    if share_text:
        # Short, deterministic fingerprint of the share text.
        return "url-" + hashlib.sha1(share_text.encode("utf-8")).hexdigest()[:10]
    return "missing_input"
129
+
130
+
131
def _build_persist_payload(
    *,
    result: Dict[str, Any],
    source_input: Dict[str, Optional[str]],
    note_id: Optional[str],
    status: str,
    written_at: datetime,
) -> Dict[str, Any]:
    """Assemble the JSON artifact persisted for one extraction run.

    Layers: ``meta`` (bookkeeping), ``summary`` (analysis digest),
    ``normalized`` (stable cross-platform fields), ``raw`` (full result).
    """
    return {
        "meta": {
            "written_at": written_at.isoformat(timespec="seconds"),
            "status": status,
            "platform": "xiaohongshu",
            "identifier": _traceable_identifier(source_input, note_id),
        },
        "summary": {
            "summary": result.get("summary", ""),
            "insights": result.get("insights", []),
            "confidence": result.get("confidence"),
            "error_reason": result.get("error_reason"),
        },
        "normalized": {
            "platform": "xiaohongshu",
            "content_kind": result.get("content_kind", "note"),
            # Fall back to the caller-resolved note id when the result lacks one.
            "note_id": result.get("note_id") or note_id,
            "note_content_type": result.get("note_content_type"),
            "text_source": result.get("text_source"),
            "request_id": result.get("request_id"),
            "source": source_input,
        },
        "raw": result,
    }
165
+
166
+
167
def _persist_output_artifact(
    *,
    result: Dict[str, Any],
    source_input: Dict[str, Optional[str]],
    note_id: Optional[str],
    storage_config: Optional[Dict[str, Any]],
    persist_output: bool,
) -> Dict[str, Any]:
    """Write the extraction result to disk and return a small outcome dict.

    Artifacts are routed to ``errors_root/<YYYYMMDD>`` when the result carries
    an ``error_reason``, otherwise to ``results_root/<YYYYMMDD>``. Storage-path
    resolution failures are reported in the return value instead of raised;
    the final file write itself is not guarded and may still raise.
    """
    if not persist_output:
        return {"enabled": False, "skipped": True, "reason": "disabled_by_flag"}

    try:
        paths = resolve_storage_paths(storage_config or {})
    except Exception as error:
        # Persistence is best-effort: surface the failure to the caller.
        return {"enabled": True, "ok": False, "error": f"resolve_storage_paths_failed:{error}"}

    now = datetime.now()
    date_key = now.strftime("%Y%m%d")
    timestamp = now.strftime("%Y%m%dT%H%M%S")
    identifier = _traceable_identifier(source_input, note_id)
    has_error = bool(result.get("error_reason"))
    status = "error" if has_error else "success"

    # Errors and successes land in separate, date-partitioned trees.
    if has_error:
        target_dir = Path(paths.get("errors_root", "")) / date_key
    else:
        target_dir = Path(paths.get("results_root", "")) / date_key

    target_dir.mkdir(parents=True, exist_ok=True)
    file_name = render_output_filename(
        pattern=resolve_json_filename_pattern(storage_config),
        context={
            "prefix": status,
            "platform": "xiaohongshu",
            "card_type": "single_work_result",
            # No author/title context here, so the traceable identifier
            # stands in for both slug slots.
            "author_slug": identifier,
            "title_slug": identifier,
            "identifier": identifier,
            "timestamp": timestamp,
            "date": date_key,
            "ext": ".json",
        },
        default_filename=f"{timestamp}-xiaohongshu-{identifier}.json",
        default_ext=".json",
    )
    file_path = target_dir / file_name

    payload = _build_persist_payload(
        result=result,
        source_input=source_input,
        note_id=note_id,
        status=status,
        written_at=now,
    )
    file_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")

    return {
        "enabled": True,
        "ok": True,
        "status": status,
        "path": str(file_path),
    }
229
+
230
+
231
def _finalize_result(
    *,
    result: Dict[str, Any],
    source_input: Dict[str, Optional[str]],
    note_id: Optional[str],
    storage_config: Optional[Dict[str, Any]],
    persist_output: bool,
) -> Dict[str, Any]:
    """Attach the persistence outcome to *result* (in place) and return it."""
    persist_info = _persist_output_artifact(
        result=result,
        source_input=source_input,
        note_id=note_id,
        storage_config=storage_config,
        persist_output=persist_output,
    )
    result["output_persist"] = persist_info
    return result
247
+
248
+
249
def _normalize_input(input_value: Optional[str], share_text: Optional[str], note_id: Optional[str]) -> Dict[str, Optional[str]]:
    """Fold the generic input value into the share_text / note_id slots.

    Explicit share_text/note_id always win. A bare *input_value* is treated
    as a share URL when it starts with http(s)://, otherwise as a note id.
    """
    share = normalize_text(share_text) or None
    note = normalize_text(note_id) or None

    if input_value and share is None and note is None:
        candidate = input_value.strip()
        if candidate.startswith(("http://", "https://")):
            share = candidate
        else:
            note = candidate

    return {"share_text": share, "note_id": note}
264
+
265
+
266
+ def _extract_note_id_from_share(share_text: Optional[str]) -> Optional[str]:
267
+ if not share_text:
268
+ return None
269
+ text = share_text.strip()
270
+ patterns = [
271
+ r"/explore/([0-9a-zA-Z]+)",
272
+ r"/discovery/item/([0-9a-zA-Z]+)",
273
+ r"note_id=([0-9a-zA-Z]+)",
274
+ ]
275
+ for pattern in patterns:
276
+ match = re.search(pattern, text)
277
+ if match:
278
+ return match.group(1)
279
+ return None
280
+
281
+
282
def _resolve_note_id(payload: Any, source_input: Dict[str, Optional[str]]) -> Optional[str]:
    """Resolve the note id: explicit input, payload keys, then URL parsing."""
    explicit = source_input.get("note_id")
    if explicit:
        return explicit

    # Canonical id keys in the payload. The >=16-char floor avoids picking
    # up short, unrelated numeric fields deep in the response.
    for key in ("note_id", "noteid", "item_id", "itemId"):
        text = normalize_text(deep_find_first(payload, [key]))
        if text and len(text) >= 16:
            return text

    # Try every canonical webpage URL found anywhere in the payload.
    for key in ("webpage_url", "share_url", "url"):
        for candidate in deep_find_all(payload, [key]):
            parsed = _extract_note_id_from_share(normalize_text(candidate))
            if parsed:
                return parsed

    # Last resort: the caller-supplied share text.
    return _extract_note_id_from_share(source_input.get("share_text"))
309
+
310
+
311
+ def _is_short_share_url(share_text: Optional[str]) -> bool:
312
+ if not share_text:
313
+ return False
314
+ try:
315
+ host = urllib.parse.urlparse(share_text).netloc.lower()
316
+ except Exception:
317
+ return False
318
+ return "xhslink.com" in host
319
+
320
+
321
def _app_response_has_core_fields(response_data: Any) -> bool:
    """True when the APP payload carries subtitle or video essentials.

    Payloads with only weak image frames return False so the caller keeps
    probing WEB_V2 routes for better media fidelity.
    """
    if _extract_subtitle_inline_text(response_data):
        return True
    if _extract_subtitle_urls(response_data):
        return True
    return bool(_extract_video_candidates(response_data))
327
+
328
+
329
def _route_field_completeness(payload: Any, source_input: Dict[str, Optional[str]]) -> Dict[str, Any]:
    """Score how many of the six tracked fields a route's payload filled.

    Returns a report with the per-field hit map, filled/total counts, a
    fill ratio, the list of missing *core* fields (note_id, title_or_desc,
    media), and ``core_ready`` — True only when all core fields are present.
    """
    note_id_hit = bool(_resolve_note_id(payload, source_input))
    title_hit = bool(
        _pick_text_from_paths(
            payload,
            [["title"], ["desc"], ["content"], ["note", "title"], ["note", "desc"], ["note", "content"]],
        )
    )
    author_hit = bool(
        _pick_text_from_paths(
            payload,
            [
                ["nickname"],
                ["author_nickname"],
                ["user_nickname"],
                ["author", "nickname"],
                ["user", "nickname"],
                ["author", "name"],
                ["user", "name"],
            ],
        )
    )
    # Any video or image candidate counts as media.
    media_hit = bool(_extract_video_candidates(payload) or _extract_image_candidates(payload))
    subtitle_hit = bool(_extract_subtitle_inline_text(payload)) or bool(_extract_subtitle_urls(payload))
    # A single positive engagement counter is enough to mark metrics present.
    metrics_hit = any(
        _pick_int_from_paths(payload, [path], prefer_positive=True) is not None
        for path in (
            ["digg_count"],
            ["liked_count"],
            ["like_count"],
            ["comment_count"],
            ["collect_count"],
            ["share_count"],
            ["view_count"],
            ["play_count"],
        )
    )

    fields = {
        "note_id": note_id_hit,
        "title_or_desc": title_hit,
        "author": author_hit,
        "media": media_hit,
        "subtitle": subtitle_hit,
        "metrics": metrics_hit,
    }
    filled_count = sum(1 for hit in fields.values() if hit)
    missing_core = [key for key in ("note_id", "title_or_desc", "media") if not fields.get(key)]
    return {
        "fields": fields,
        "filled_count": filled_count,
        "total_fields": len(fields),
        "ratio": round(filled_count / max(len(fields), 1), 3),
        "missing_core": missing_core,
        "core_ready": not missing_core,
    }
385
+
386
+
387
def _route_success_for_note(response: Dict[str, Any], source_input: Dict[str, Optional[str]]) -> bool:
    """A route succeeds when the call was ok and all core fields are present.

    The completeness report is cached on the response under
    "_field_completeness" so repeated checks are cheap.
    """
    if not response.get("ok"):
        return False
    report = response.get("_field_completeness")
    if not isinstance(report, dict):
        report = _route_field_completeness(response.get("data"), source_input)
        response["_field_completeness"] = report
    return bool(report.get("core_ready"))
395
+
396
+
397
def _pick_text_from_paths(payload: Any, paths: List[List[str]]) -> str:
    """First non-empty scalar text found along *paths*, else ""."""
    for candidate_path in paths:
        value = deep_find_first(payload, candidate_path)
        if isinstance(value, (dict, list)):
            # Containers are never usable as display text.
            continue
        text = normalize_text(value)
        if text:
            return text
    return ""
406
+
407
+
408
+ def _to_int(value: Any) -> Optional[int]:
409
+ if isinstance(value, bool):
410
+ return int(value)
411
+ if isinstance(value, int):
412
+ return value
413
+ if isinstance(value, float):
414
+ return int(value)
415
+ if isinstance(value, str):
416
+ text = value.strip()
417
+ if text.isdigit() or (text.startswith("-") and text[1:].isdigit()):
418
+ return int(text)
419
+ return None
420
+
421
+
422
+ def _extract_value_by_path(payload: Any, path: List[str]) -> Optional[Any]:
423
+ if not path:
424
+ return None
425
+
426
+ def _walk(node: Any, idx: int) -> Optional[Any]:
427
+ if idx >= len(path):
428
+ if node in (None, "", [], {}):
429
+ return None
430
+ return node
431
+
432
+ key = path[idx]
433
+ if isinstance(node, dict):
434
+ if key in node:
435
+ hit = _walk(node.get(key), idx + 1)
436
+ if hit is not None:
437
+ return hit
438
+ for value in node.values():
439
+ hit = _walk(value, idx)
440
+ if hit is not None:
441
+ return hit
442
+ return None
443
+
444
+ if isinstance(node, list):
445
+ for item in node:
446
+ hit = _walk(item, idx)
447
+ if hit is not None:
448
+ return hit
449
+ return None
450
+
451
+ return None
452
+
453
+ return _walk(payload, 0)
454
+
455
+
456
+ def _normalize_unix_sec(value: int) -> int:
457
+ # 13-digit timestamps are milliseconds.
458
+ if value > 1_000_000_000_000:
459
+ return value // 1000
460
+ return value
461
+
462
+
463
def _pick_int_with_source_from_paths(
    payload: Any,
    paths: List[List[str]],
    *,
    prefer_positive: bool = False,
    normalize_unix_sec: bool = False,
) -> Tuple[Optional[int], str]:
    """First parseable int along *paths* plus the dotted path that matched.

    Tries strict path extraction first, then a deep key search. Returns
    (None, "") when nothing parses (or survives the positivity filter).
    """
    for candidate_path in paths:
        raw = _extract_value_by_path(payload, candidate_path)
        if raw is None:
            raw = deep_find_first(payload, candidate_path)
        number = _to_int(raw)
        if number is None:
            continue
        if normalize_unix_sec:
            number = _normalize_unix_sec(number)
        if prefer_positive and number <= 0:
            continue
        return number, ".".join(candidate_path)
    return None, ""
483
+
484
+
485
def _pick_int_from_paths(
    payload: Any,
    paths: List[List[str]],
    *,
    prefer_positive: bool = False,
    normalize_unix_sec: bool = False,
) -> Optional[int]:
    """Like _pick_int_with_source_from_paths, discarding the source path."""
    number, _source = _pick_int_with_source_from_paths(
        payload,
        paths,
        prefer_positive=prefer_positive,
        normalize_unix_sec=normalize_unix_sec,
    )
    return number
499
+
500
+
501
+ def _dedupe_keep_order(values: List[str]) -> List[str]:
502
+ output: List[str] = []
503
+ seen = set()
504
+ for value in values:
505
+ if value in seen:
506
+ continue
507
+ seen.add(value)
508
+ output.append(value)
509
+ return output
510
+
511
+
512
def _clean_tag_text(value: Any) -> str:
    """Normalize a raw tag: strip '#' wrappers and a trailing topic marker."""
    text = normalize_text(value)
    if not text:
        return ""
    stripped = text.strip().strip("#")
    # Drop the trailing topic marker some payloads append, then re-strip
    # in case it exposed another '#'.
    stripped = re.sub(r"\[话题\]$", "", stripped)
    return stripped.strip().strip("#")
520
+
521
+
522
def _append_tag(raw: Any, output: List[str], seen: set) -> None:
    """Append the cleaned tag to *output* unless it is empty or already seen."""
    tag = _clean_tag_text(raw)
    if tag and tag not in seen:
        seen.add(tag)
        output.append(tag)
528
+
529
+
530
def _extract_tags_from_container(value: Any, output: List[str], seen: set) -> None:
    """Recursively harvest tag names from strings, lists, and tag dicts."""
    if isinstance(value, str):
        _append_tag(value, output, seen)
    elif isinstance(value, list):
        for element in value:
            _extract_tags_from_container(element, output, seen)
    elif isinstance(value, dict):
        # Tag objects expose their display name under one of these keys.
        for name_key in ("name", "tag_name", "topic_name", "hashtag_name"):
            _append_tag(value.get(name_key), output, seen)
541
+
542
+
543
def _extract_xhs_tags(payload: Any) -> List[str]:
    """Collect note tags: explicit tag lists first, then topic keys and inline '#...#'."""
    # Primary: explicit tag-list containers; when present they win outright.
    primary: List[str] = []
    primary_seen: set = set()
    for container_key in ("tagList", "taglist", "tag_list"):
        for container in deep_find_all(payload, [container_key]):
            _extract_tags_from_container(container, primary, primary_seen)
    if primary:
        return primary

    # Fallback: topic/hashtag keys plus "#...#" markers inside the caption.
    fallback: List[str] = []
    seen: set = set()
    for topic_key in ("topics", "hash_tag", "hashTag", "head_tags", "foot_tags"):
        for container in deep_find_all(payload, [topic_key]):
            _extract_tags_from_container(container, fallback, seen)

    for text in deep_find_all(payload, ["desc", "content"]):
        if isinstance(text, str):
            for inline_tag in re.findall(r"#([^#\n\r]+?)#", text):
                _append_tag(inline_tag, fallback, seen)

    return fallback
565
+
566
+
567
+ def _build_candidate_merge_sources(*, app_candidates: List[str], enrich_candidates: List[str], app_label: str) -> List[str]:
568
+ sources: List[str] = []
569
+ if app_candidates:
570
+ sources.append(app_label)
571
+ if enrich_candidates:
572
+ sources.append("web_v2_enrich")
573
+ return sources
574
+
575
+
576
def _extract_xhs_metadata(
    *,
    payload: Any,
    source_input: Dict[str, Optional[str]],
    selected_video_url: Optional[str],
    selected_image_urls: List[str],
) -> Dict[str, Any]:
    """Pull the normalized metadata card fields out of a raw note payload.

    Every field is resolved through ordered path lists (first hit wins),
    falling back to caller-provided values (share text, selected media URLs)
    where the payload is silent. Missing ints stay None; missing text stays "".
    """
    share_from_source = normalize_text(source_input.get("share_text"))

    title = _pick_text_from_paths(payload, [["title"], ["note", "title"], ["desc"], ["content"]])
    author = _pick_text_from_paths(
        payload,
        [
            ["nickname"],
            ["author_nickname"],
            ["user_nickname"],
            ["author", "nickname"],
            ["user", "nickname"],
            ["author", "name"],
            ["user", "name"],
        ],
    )

    # Publish-time keys across APP/WEB payload shapes; the *_sec variants are
    # preferred, and note/note_list/noteList wrappers are each probed.
    create_time_paths = [
        ["create_time_sec"],
        ["create_time"],
        ["publish_time_sec"],
        ["publish_time"],
        ["time"],
        ["timestamp"],
        ["createTime"],
        ["publishTime"],
        ["note", "create_time_sec"],
        ["note", "create_time"],
        ["note", "createTime"],
        ["note", "publish_time_sec"],
        ["note", "publish_time"],
        ["note", "publishTime"],
        ["note", "time"],
        ["note", "timestamp"],
        ["note_list", "create_time_sec"],
        ["note_list", "create_time"],
        ["note_list", "createTime"],
        ["note_list", "publish_time_sec"],
        ["note_list", "publish_time"],
        ["note_list", "publishTime"],
        ["note_list", "time"],
        ["note_list", "timestamp"],
        ["noteList", "create_time_sec"],
        ["noteList", "create_time"],
        ["noteList", "createTime"],
        ["noteList", "publish_time_sec"],
        ["noteList", "publish_time"],
        ["noteList", "publishTime"],
        ["noteList", "time"],
        ["noteList", "timestamp"],
    ]
    create_time_sec, create_time_source = _pick_int_with_source_from_paths(
        payload,
        create_time_paths,
        prefer_positive=True,
        normalize_unix_sec=True,
    )
    duration_ms = _pick_int_from_paths(
        payload,
        [["duration_ms"], ["duration"], ["duration_sec"], ["video", "duration"], ["note", "duration"]],
    )
    # Heuristic: values under 10000 are assumed to be seconds — TODO confirm
    # this threshold against real payloads with sub-10s millisecond values.
    if duration_ms is not None and duration_ms > 0 and duration_ms < 10000:
        duration_ms *= 1000

    share_url = _pick_text_from_paths(payload, [["share_url"], ["webpage_url"], ["url"], ["share_link"], ["share_text"]])
    source_url = _pick_text_from_paths(payload, [["source_url"], ["webpage_url"], ["url"], ["share_url"]])
    # Fall back to the caller's share text when the payload has no URLs.
    if not share_url:
        share_url = share_from_source
    if not source_url:
        source_url = share_url or share_from_source

    cover_image = _pick_text_from_paths(payload, [["cover_image"], ["cover_url"], ["cover"], ["image", "url"], ["origin_cover"]])
    if not cover_image and selected_image_urls:
        cover_image = selected_image_urls[0]

    video_down_url = _pick_text_from_paths(
        payload,
        [
            ["video_down_url"],
            ["original_video_url"],
            ["video_url"],
            ["play_url"],
            ["master_url"],
            ["selected_video_url"],
        ],
    )
    if not video_down_url:
        video_down_url = normalize_text(selected_video_url)

    xhs_user_id = _pick_text_from_paths(
        payload,
        [["author", "userid"], ["author", "user_id"], ["user", "userid"], ["user", "user_id"], ["user_id"], ["userid"], ["id"]],
    )
    # Prefer the red_id handle; fall back to nickname, then the author field.
    author_handle = _pick_text_from_paths(
        payload,
        [["author", "red_id"], ["user", "red_id"], ["red_id"], ["author", "nickname"], ["user", "nickname"], ["nickname"]],
    ) or author

    xhs_sec_token = _pick_text_from_paths(
        payload,
        [["xhs_sec_token"], ["xsec_token"], ["xsecToken"], ["note", "xsecToken"], ["user", "xsecToken"], ["user", "xsec_token"]],
    )
    # If no token field exists, try to recover it from the xsec_token query
    # parameter of any known URL.
    if not xhs_sec_token:
        for url_text in [share_url, source_url, share_from_source]:
            text = normalize_text(url_text)
            if not text:
                continue
            try:
                query = urllib.parse.urlparse(text).query
                xhs_sec_token = urllib.parse.parse_qs(query).get("xsec_token", [""])[0]
            except Exception:
                xhs_sec_token = ""
            if normalize_text(xhs_sec_token):
                break

    # Several fields are intentionally emitted under multiple aliases
    # (e.g. video_down_url/video_download_url) for downstream consumers.
    return {
        "title": title,
        "caption_raw": _pick_text_from_paths(payload, [["desc"], ["content"], ["note", "desc"], ["note", "content"]]),
        "author": author,
        "author_handle": author_handle,
        "platform_author_id": xhs_user_id,
        "author_platform_id": xhs_user_id,
        "xhs_user_id": xhs_user_id,
        "xhs_sec_token": normalize_text(xhs_sec_token),
        "create_time_sec": create_time_sec,
        "publish_time": create_time_sec,
        "publish_time_source": create_time_source or "unknown",
        "duration_ms": duration_ms,
        "tags": _extract_xhs_tags(payload),
        "digg_count": _pick_int_from_paths(payload, [["digg_count"], ["liked_count"], ["like_count"], ["likes"]]),
        "comment_count": _pick_int_from_paths(payload, [["comment_count"], ["comments_count"], ["comments"]]),
        "collect_count": _pick_int_from_paths(payload, [["collect_count"], ["collected_count"], ["favorite_count"]]),
        "share_count": _pick_int_from_paths(payload, [["share_count"], ["shared_count"]]),
        "share_url": share_url,
        "source_url": source_url,
        "cover_image": cover_image,
        "video_down_url": video_down_url,
        "video_download_url": video_down_url,
    }
721
+
722
+
723
def _is_sparse_metadata(metadata_fields: Dict[str, Any]) -> bool:
    """Sparse when title/author/create-time is missing or no engagement metric is set."""
    if not normalize_text(metadata_fields.get("title")):
        return True
    if not normalize_text(metadata_fields.get("author")):
        return True
    if metadata_fields.get("create_time_sec") is None:
        return True
    # Sparse when every engagement counter is absent.
    return all(
        metadata_fields.get(metric) is None
        for metric in ("digg_count", "comment_count", "collect_count", "share_count")
    )
732
+
733
+
734
def _append_missing_metadata_fields(missing_fields: List[Dict[str, str]], metadata_fields: Dict[str, Any]) -> None:
    """Append a "missing_metadata" record for every empty metadata field.

    Mutates *missing_fields* in place and never reports the same field
    name twice (entries already present are respected).
    """
    already_reported = {entry.get("field") for entry in missing_fields if isinstance(entry, dict)}

    def _report(field_name: str) -> None:
        # At most one entry per field name.
        if field_name not in already_reported:
            already_reported.add(field_name)
            missing_fields.append({"field": field_name, "reason": "missing_metadata"})

    text_fields = (
        "title",
        "author",
        "author_handle",
        "platform_author_id",
        "xhs_user_id",
        "xhs_sec_token",
        "share_url",
        "source_url",
        "cover_image",
        "video_down_url",
    )
    for field_name in text_fields:
        if not normalize_text(metadata_fields.get(field_name)):
            _report(field_name)

    numeric_fields = ("create_time_sec", "duration_ms", "digg_count", "comment_count", "collect_count", "share_count")
    for field_name in numeric_fields:
        if metadata_fields.get(field_name) is None:
            _report(field_name)
761
+
762
+
763
def _fetch_sparse_metadata_enrich(
    *,
    base_url: str,
    token: str,
    timeout_ms: int,
    source_input: Dict[str, Optional[str]],
    note_id: Optional[str],
) -> Dict[str, Any]:
    """Fetch a WEB_V2 payload used to enrich a sparse APP-route result.

    The short-url endpoint is preferred when the share link is a short
    URL; otherwise the note-id endpoint is used. When neither identifier
    is available a synthetic failure record is returned instead of making
    a request.
    """
    share_text = source_input.get("share_text")
    resolved_note_id = note_id or source_input.get("note_id") or _extract_note_id_from_share(share_text)

    route: Optional[Tuple[str, Dict[str, Any], str]] = None
    if _is_short_share_url(share_text) and share_text:
        route = (WEB_V2_V3_ENDPOINT, {"short_url": share_text}, "web_v2_v3_sparse_enrich")
    elif resolved_note_id:
        route = (WEB_V2_V2_ENDPOINT, {"note_id": resolved_note_id}, "web_v2_v2_sparse_enrich")

    if route is None:
        return {
            "ok": False,
            "error": "missing_share_text_and_note_id_for_sparse_enrich",
            "_endpoint": None,
            "_route_label": "web_v2_sparse_enrich_skipped",
        }

    endpoint, params, route_label = route
    response = call_json_api(
        base_url=base_url,
        path=endpoint,
        token=token,
        method="GET",
        timeout_ms=timeout_ms,
        params=params,
    )
    response["_endpoint"] = endpoint
    response["_route_label"] = route_label
    return response
806
+
807
+
808
def _fetch_note_info(*, base_url: str, token: str, timeout_ms: int, source_input: Dict[str, Optional[str]]) -> Dict[str, Any]:
    """Fetch note details, cascading APP_V2 -> APP_V1 -> WEB_V2 -> WEB routes.

    Every attempt is annotated with "_endpoint", "_route_label", a
    "_field_completeness" report, and (for non-primary routes) a
    "fallback_trigger_reason", then logged in the winning response under
    "_attempts". The first route that satisfies _route_success_for_note is
    returned; if none does, the final WEB route response is returned
    regardless of outcome.
    """
    attempts: List[Dict[str, Any]] = []

    share_text = source_input.get("share_text")
    note_id = source_input.get("note_id") or _extract_note_id_from_share(share_text)

    def _call(path: str, params: Dict[str, Any], label: str, fallback_reason: Optional[str] = None) -> Dict[str, Any]:
        # Issue one GET, annotate the response with routing metadata, and record it in `attempts`.
        response = call_json_api(
            base_url=base_url,
            path=path,
            token=token,
            method="GET",
            timeout_ms=timeout_ms,
            params=params,
        )
        response["_endpoint"] = path
        response["_route_label"] = label
        if fallback_reason:
            response["fallback_trigger_reason"] = fallback_reason
        # Failed calls get a zeroed completeness stub so downstream tracing never sees None.
        response["_field_completeness"] = _route_field_completeness(response.get("data"), source_input) if response.get("ok") else {
            "fields": {},
            "filled_count": 0,
            "total_fields": 0,
            "ratio": 0.0,
            "missing_core": ["note_id", "title_or_desc", "media"],
            "core_ready": False,
        }
        attempts.append({"label": label, "endpoint": path, "response": response})
        return response

    app_params: Dict[str, Any] = {}
    if share_text:
        app_params["share_text"] = share_text
    if note_id:
        app_params["note_id"] = note_id

    # Primary routes: the three media-specific APP_V2 endpoints, tried in order.
    app_v2_attempts = [
        (APP_V2_VIDEO_ENDPOINT, "app_v2_video"),
        (APP_V2_IMAGE_ENDPOINT, "app_v2_image"),
        (APP_V2_MIXED_ENDPOINT, "app_v2_mixed"),
    ]
    next_reason: Optional[str] = None

    for path, label in app_v2_attempts:
        app_v2_response = _call(path, app_params, label, fallback_reason=next_reason)
        if _route_success_for_note(app_v2_response, source_input):
            app_v2_response["_attempts"] = attempts
            return app_v2_response
        # An "ok" response that still fails the success check was merely incomplete.
        if app_v2_response.get("ok"):
            app_v2_response["fallback_trigger_reason"] = "field_completeness_below_threshold"
        # Carry the reason this attempt fell through onto the next attempt's response.
        next_reason = "field_completeness_below_threshold" if app_v2_response.get("ok") else (
            "primary_timeout_retry_exhausted" if app_v2_response.get("timeout_retry_exhausted") else "primary_non_timeout_failure"
        )

    # Secondary route: legacy APP_V1 endpoint with the same parameters.
    app_response = _call(APP_V1_ENDPOINT, app_params, "app_v1", fallback_reason=next_reason)
    if _route_success_for_note(app_response, source_input):
        app_response["_attempts"] = attempts
        return app_response
    if app_response.get("ok"):
        app_response["fallback_trigger_reason"] = "field_completeness_below_threshold"

    app_fallback_reason = (
        "field_completeness_below_threshold"
        if app_response.get("ok")
        else ("primary_timeout_retry_exhausted" if app_response.get("timeout_retry_exhausted") else "primary_non_timeout_failure")
    )
    is_short = _is_short_share_url(share_text)

    # WEB_V2 short-url route, only meaningful for short share links.
    if is_short and share_text:
        v3_response = _call(
            WEB_V2_V3_ENDPOINT,
            {"short_url": share_text},
            "web_v2_v3_short",
            fallback_reason=app_fallback_reason,
        )
        if v3_response.get("ok"):
            v3_response["_attempts"] = attempts
            return v3_response

    # WEB_V2 note-id route when a note id could be resolved.
    if note_id:
        v2_response = _call(
            WEB_V2_V2_ENDPOINT,
            {"note_id": note_id},
            "web_v2_v2_note_id",
            fallback_reason=app_fallback_reason,
        )
        if v2_response.get("ok"):
            v2_response["_attempts"] = attempts
            return v2_response

    web_params: Dict[str, Any] = {}
    if share_text:
        web_params["share_text"] = share_text
    if note_id:
        web_params["note_id"] = note_id

    # Last resort: the generic WEB endpoint; its response is returned even on failure.
    web_response = _call(WEB_ENDPOINT, web_params, "web_v7", fallback_reason=app_fallback_reason)
    web_response["_attempts"] = attempts
    return web_response
907
+
908
+
909
def _extract_subtitle_urls(payload: Any) -> List[str]:
    """Collect absolute subtitle-file URLs from *payload*.

    Scans the known subtitle key variants, keeps only http(s) URLs, and
    de-duplicates while preserving first-seen order.
    """
    found: List[str] = []
    subtitle_keys = ("subtitle_url", "subtitleUrl", "srt_url", "srtUrl", "vtt_url", "vttUrl")
    for subtitle_key in subtitle_keys:
        for candidate in deep_find_all(payload, [subtitle_key]):
            if not isinstance(candidate, str):
                continue
            trimmed = candidate.strip()
            if trimmed.startswith(("http://", "https://")):
                found.append(trimmed)

    # dict.fromkeys keeps the first occurrence of each URL in order.
    return list(dict.fromkeys(found))
925
+
926
+
927
def _extract_subtitle_inline_text(payload: Any) -> str:
    """Flatten inline subtitle fragments embedded in *payload* into one text block.

    Looks inside the known subtitle containers, normalizes every string
    fragment, de-duplicates in order, and joins with newlines.
    """
    collected: List[str] = []

    def _take(candidate: Any) -> None:
        # Keep only non-empty normalized strings.
        if isinstance(candidate, str):
            cleaned = normalize_text(candidate)
            if cleaned:
                collected.append(cleaned)

    for container in deep_find_all(payload, ["subtitles", "subtitle_list", "subtitleList"]):
        if isinstance(container, list):
            for entry in container:
                if isinstance(entry, dict):
                    for field in ("text", "content", "sentence", "line"):
                        _take(entry.get(field))
                else:
                    _take(entry)
        elif isinstance(container, dict):
            for field in ("text", "content"):
                _take(container.get(field))

    return "\n".join(dict.fromkeys(collected)).strip()
949
+
950
+
951
def _subtitle_text_from_raw(raw: str) -> str:
    """Convert a raw subtitle document (JSON, SRT, or WebVTT) into plain text.

    Successfully-parsed JSON payloads have sentence text pulled from the
    usual container keys; anything else is treated line-by-line, dropping
    cue numbers, timing lines, and WebVTT headers. Lines are de-duplicated
    in order.
    """
    if not raw:
        return ""
    raw = raw.strip()
    if not raw:
        return ""

    if raw.startswith(("{", "[")):
        parsed: Any = None
        parse_ok = False
        try:
            parsed = json.loads(raw)
            parse_ok = True
        except Exception:
            # Not actually JSON; fall through to the line-based scrub below.
            parse_ok = False
        if parse_ok:
            containers: List[Any] = []
            if isinstance(parsed, dict):
                for container_key in ("segments", "subtitles", "data", "result", "body"):
                    maybe_list = parsed.get(container_key)
                    if isinstance(maybe_list, list):
                        containers.append(maybe_list)
            elif isinstance(parsed, list):
                containers.append(parsed)
            fragments: List[str] = []
            for container in containers:
                for segment in container:
                    if not isinstance(segment, dict):
                        continue
                    snippet = segment.get("text") or segment.get("content") or segment.get("sentence")
                    if isinstance(snippet, str):
                        cleaned = normalize_text(snippet)
                        if cleaned:
                            fragments.append(cleaned)
            return "\n".join(dict.fromkeys(fragments)).strip()

    plain_lines: List[str] = []
    for source_line in raw.splitlines():
        stripped = source_line.strip()
        # Drop structural SRT/WebVTT noise: blanks, headers, cue indices, timings.
        if not stripped or stripped.upper() == "WEBVTT" or stripped.startswith("NOTE"):
            continue
        if "-->" in stripped or re.match(r"^\d+$", stripped):
            continue
        cleaned = normalize_text(stripped)
        if cleaned:
            plain_lines.append(cleaned)

    return "\n".join(dict.fromkeys(plain_lines)).strip()
1000
+
1001
+
1002
+ def _fetch_subtitle_text(urls: List[str], timeout_ms: int) -> str:
1003
+ for url in urls:
1004
+ try:
1005
+ req = urllib.request.Request(url=url, method="GET")
1006
+ with urllib.request.urlopen(req, timeout=max(timeout_ms / 1000.0, 1.0)) as response:
1007
+ raw = response.read().decode("utf-8", errors="replace")
1008
+ text = _subtitle_text_from_raw(raw)
1009
+ if text:
1010
+ return text
1011
+ except Exception:
1012
+ continue
1013
+ return ""
1014
+
1015
+
1016
+ def _url_likely_image(url: str) -> bool:
1017
+ lower = url.lower()
1018
+ image_tokens = [
1019
+ ".jpg",
1020
+ ".jpeg",
1021
+ ".png",
1022
+ ".webp",
1023
+ "_jpg_",
1024
+ "_png_",
1025
+ "imageview2",
1026
+ "imagemogr2",
1027
+ "redimage",
1028
+ "frame/",
1029
+ "sns-img",
1030
+ "sns-webpic",
1031
+ "notes_pre_post",
1032
+ ]
1033
+ return any(token in lower for token in image_tokens)
1034
+
1035
+
1036
def _url_likely_video(url: str) -> bool:
    """Heuristic: does *url* look like a playable video/audio stream?

    URLs that already match the image heuristic are rejected before the
    token scan, so image CDN links never count as video.
    """
    if _url_likely_image(url):
        return False
    haystack = url.lower()
    markers = (
        ".mp4", ".m3u8", ".m4a", ".mp3",
        "video", "play", "stream", "master",
        "sns-video", "redvideo", "vod", "/audio/",
    )
    return any(marker in haystack for marker in markers)
1055
+
1056
+
1057
+ def _video_quality_hint(url: str) -> int:
1058
+ lower = url.lower()
1059
+ score = 9999
1060
+
1061
+ query = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
1062
+ for key in ("w", "width", "ratio", "quality", "qn"):
1063
+ values = query.get(key)
1064
+ if not values:
1065
+ continue
1066
+ value = str(values[0]).lower()
1067
+ m = re.search(r"(\d{3,4})", value)
1068
+ if m:
1069
+ score = min(score, int(m.group(1)))
1070
+
1071
+ for token, value in (("240p", 240), ("360p", 360), ("480p", 480), ("540p", 540), ("576p", 576), ("720p", 720), ("1080p", 1080), ("2k", 2000), ("4k", 4000)):
1072
+ if token in lower:
1073
+ score = min(score, value)
1074
+
1075
+ return score
1076
+
1077
+
1078
def _extract_video_candidates(payload: Any) -> List[str]:
    """Collect candidate video/audio URLs from *payload*, best quality first.

    URLs are gathered from a fixed priority list of well-known keys
    (strings, lists of strings, and dicts carrying "url"/"play_url"),
    de-duplicated in discovery order, filtered to video-looking URLs, and
    sorted by the quality hint. Returns [] when nothing video-like exists.
    """
    key_priority = [
        "master_url",
        "masterUrl",
        "video_url",
        "play_url",
        "origin_video_key",
        "origin_video_url",
        "video_play_url",
        "audio_url",
        "note_sound_info",
        "url",
    ]

    def _absolute(candidate: Any) -> Optional[str]:
        # Accept only absolute http(s) URLs.
        if isinstance(candidate, str):
            text = candidate.strip()
            if text.startswith(("http://", "https://")):
                return text
        return None

    candidates: List[str] = []
    for key in key_priority:
        for value in deep_find_all(payload, [key]):
            if isinstance(value, str):
                url = _absolute(value)
                if url:
                    candidates.append(url)
            elif isinstance(value, list):
                for item in value:
                    url = _absolute(item)
                    if url:
                        candidates.append(url)
            elif isinstance(value, dict):
                url = _absolute(value.get("url") or value.get("play_url"))
                if url:
                    candidates.append(url)

    # De-duplicate keeping first-seen order, then keep only video-looking URLs.
    video_only = [u for u in dict.fromkeys(candidates) if _url_likely_video(u)]
    if not video_only:
        return []

    # sorted() is stable, so ties on the quality hint keep discovery order;
    # the previous list.index() tie-breaker was a redundant O(n^2) step.
    return sorted(video_only, key=_video_quality_hint)
1126
+
1127
+
1128
+ def _collect_urls(value: Any) -> List[str]:
1129
+ out: List[str] = []
1130
+ if isinstance(value, str):
1131
+ v = value.strip()
1132
+ if v.startswith("http://") or v.startswith("https://"):
1133
+ out.append(v)
1134
+ elif isinstance(value, list):
1135
+ for item in value:
1136
+ out.extend(_collect_urls(item))
1137
+ elif isinstance(value, dict):
1138
+ for key in ("url", "urlDefault", "url_default", "urlPre", "url_pre", "original"):
1139
+ if key in value:
1140
+ out.extend(_collect_urls(value.get(key)))
1141
+ return out
1142
+
1143
+
1144
def _dedupe_image_urls(urls: List[str]) -> List[str]:
    """Drop duplicate URLs (first occurrence wins) and keep only image-looking ones."""
    return [url for url in dict.fromkeys(urls) if _url_likely_image(url)]
1154
+
1155
+
1156
def _extract_image_candidates_with_strategy(payload: Any) -> Tuple[List[str], str]:
    """Return (image_urls, strategy) using a four-tier quality preference.

    Tiers, best first: "original" full-quality assets, "wb_dft" scene URLs
    from WEB_V2 infoList entries, "default" representative URLs, and a
    generic "fallback" sweep over common image keys. The first tier that
    yields any URL wins; every tier is de-duplicated and image-filtered
    via _dedupe_image_urls.
    """
    # Priority 1: original image set
    originals = _dedupe_image_urls(deep_find_all(payload, ["original"]))
    if originals:
        return originals, "original"

    # Priority 2: WEB_V2 infoList entries whose imageScene is WB_DFT
    dft_urls: List[str] = []
    for key in ("imageList", "images_list"):
        image_lists = deep_find_all(payload, [key])
        for image_list in image_lists:
            if not isinstance(image_list, list):
                continue
            for item in image_list:
                if not isinstance(item, dict):
                    continue
                info_list = item.get("infoList") or item.get("info_list")
                if isinstance(info_list, list):
                    for info in info_list:
                        if not isinstance(info, dict):
                            continue
                        # Scene markers appear in both camelCase and snake_case payloads.
                        scene = str(info.get("imageScene") or info.get("image_scene") or "").upper()
                        if scene == "WB_DFT":
                            dft_urls.extend(_collect_urls(info.get("url")))
    dft_urls = _dedupe_image_urls(dft_urls)
    if dft_urls:
        return dft_urls, "wb_dft"

    # Priority 3: default representative image URLs
    default_urls: List[str] = []
    for key in ("urlDefault", "url_default", "urlPre", "url_pre"):
        default_urls.extend(_collect_urls(deep_find_all(payload, [key])))
    default_urls = _dedupe_image_urls(default_urls)
    if default_urls:
        return default_urls, "default"

    # Priority 4: generic fallback (single quality group intended)
    generic: List[str] = []
    for key in ("url", "url_list", "origin_image", "origin_image_url", "cover", "thumb", "image_url"):
        generic.extend(_collect_urls(deep_find_all(payload, [key])))
    generic = _dedupe_image_urls(generic)
    return generic, "fallback"
1198
+
1199
+
1200
def _extract_image_candidates(payload: Any) -> List[str]:
    """Image URLs only, discarding which selection strategy produced them."""
    candidate_urls, _strategy = _extract_image_candidates_with_strategy(payload)
    return candidate_urls
1203
+
1204
+
1205
def _extract_note_type_field(payload: Any) -> str:
    """Locate the note "type" discriminator across the known payload schemas.

    Checks the WEB_V2 shape (note.type) first, then the APP shape
    (note_list[]/noteList[] entries), and finally any bare "type" scalar
    restricted to the expected values. Returns "" when nothing matches.
    """
    # WEB_V2 schema: note.type
    for note_container in deep_find_all(payload, ["note"]):
        if not isinstance(note_container, dict):
            continue
        candidate = normalize_text(note_container.get("type")).lower()
        if candidate:
            return candidate

    # APP schema: note_list[].type / noteList[].type
    for list_key in ("note_list", "noteList"):
        for note_entries in deep_find_all(payload, [list_key]):
            if not isinstance(note_entries, list):
                continue
            for entry in note_entries:
                if isinstance(entry, dict):
                    candidate = normalize_text(entry.get("type")).lower()
                    if candidate:
                        return candidate

    # Strict fallback: only trust scalar values we expect.
    for raw_value in deep_find_all(payload, ["type"]):
        candidate = normalize_text(raw_value).lower()
        if candidate in {"video", "normal", "image"}:
            return candidate

    return ""
1232
+
1233
+
1234
def _detect_note_content_type(payload: Any, video_candidates: List[str], image_candidates: List[str]) -> str:
    """Classify a note as "video", "image", "mixed", or "unknown".

    The declared type field wins when present; otherwise the decision
    falls back to which media candidates (or an inline audio track) were
    actually discovered.
    """
    declared = _extract_note_type_field(payload)
    if declared == "video":
        return "video"
    if declared == "normal":
        return "image"
    if "video" in declared:
        return "video"
    if "image" in declared:
        return "image"

    # A note-level audio track counts as video-like content.
    sound_url = normalize_text(deep_find_first(payload, ["note_sound_info", "url"])).lower()
    audio_markers = (".m4a", ".mp3", "/audio/")
    has_audio_track = bool(sound_url) and any(marker in sound_url for marker in audio_markers)

    video_like = bool(video_candidates) or has_audio_track
    image_like = bool(image_candidates)
    if video_like and image_like:
        return "mixed"
    if video_like:
        return "video"
    if image_like:
        return "image"
    return "unknown"
1257
+
1258
+
1259
+ def _guess_ext_from_url(url: str) -> str:
1260
+ parsed = urllib.parse.urlparse(url)
1261
+ path = parsed.path.lower()
1262
+ for ext in [".jpg", ".jpeg", ".png", ".webp", ".gif"]:
1263
+ if path.endswith(ext):
1264
+ return ext
1265
+ return ".jpg"
1266
+
1267
+
1268
def _download_images(
    *,
    urls: List[str],
    timeout_ms: int,
    source_input: Dict[str, Optional[str]],
    note_id: Optional[str],
    storage_config: Optional[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Download up to 30 image URLs into a per-note assets directory.

    Each entry in the returned list records the target path and whether
    the download succeeded; individual failures carry the stringified
    error instead of raising.
    """
    if not urls:
        return []

    day_stamp = datetime.now().strftime("%Y%m%d")
    note_key = _traceable_identifier(source_input, note_id)
    try:
        storage_paths = resolve_storage_paths(storage_config or {})
        assets_dir = Path(storage_paths.get("runs_root", "")) / "assets" / day_stamp / note_key
    except Exception:
        # Storage resolution failed: fall back to a local default layout.
        assets_dir = Path("./tikomni-output/_runs/assets") / day_stamp / note_key

    assets_dir.mkdir(parents=True, exist_ok=True)
    timeout_sec = max(timeout_ms / 1000.0, 1.0)
    reports: List[Dict[str, Any]] = []

    for position, image_url in enumerate(urls[:30], start=1):
        target = assets_dir / f"image-{position:02d}{_guess_ext_from_url(image_url)}"
        try:
            request = urllib.request.Request(url=image_url, method="GET")
            with urllib.request.urlopen(request, timeout=timeout_sec) as http_response:
                payload = http_response.read()
            target.write_bytes(payload)
            reports.append({"index": position, "url": image_url, "path": str(target), "ok": True})
        except Exception as download_error:
            reports.append({"index": position, "url": image_url, "path": str(target), "ok": False, "error": str(download_error)})

    return reports
1301
+
1302
+
1303
def _build_result(
    *,
    source_input: Dict[str, Optional[str]],
    raw_content: str,
    confidence: str,
    error_reason: Optional[str],
    extract_trace: List[Dict[str, Any]],
    fallback_trace: List[Dict[str, Any]],
    request_id: Optional[str],
    text_source: str,
    note_id: Optional[str],
    subtitle_hit: bool,
    u2_task_id: Optional[str],
    u2_task_status: Optional[str],
    note_content_type: str,
    analysis_mode: str,
    selected_video_url: Optional[str],
    selected_video_candidates: List[str],
    selected_image_urls: List[str],
    downloaded_assets: List[Dict[str, Any]],
    missing_fields: Optional[List[Dict[str, str]]] = None,
    metadata_fields: Optional[Dict[str, Any]] = None,
    asr_source: Optional[str] = None,
) -> Dict[str, Any]:
    """Assemble the flat result payload for one xiaohongshu note extraction.

    Combines the extracted text, metadata fields, media selections, and
    tracing information into a single dict, deriving the summary/insights
    block, the ASR provenance, the work modality, and the analysis
    eligibility along the way. Keys are emitted in a fixed order.
    """
    metadata = metadata_fields or {}
    summary_block = summarize_content(raw_content, source=f"xiaohongshu:{text_source}")
    insights = list(summary_block.get("insights", []))
    # Surface the media/mode decisions as extra insight lines.
    insights.extend([
        f"note_content_type={note_content_type}",
        f"analysis_mode={analysis_mode}",
        f"selected_image_count={len(selected_image_urls)}",
    ])

    # Derive ASR provenance from the text source when the caller did not supply it.
    resolved_asr_source = normalize_text(asr_source)
    if not resolved_asr_source:
        if text_source == "subtitle":
            resolved_asr_source = "native_subtitle"
        elif text_source == "u2":
            resolved_asr_source = "external_asr"
        else:
            resolved_asr_source = "fallback_none"

    # Video-like notes analyze the transcript; text notes prefer the caption.
    work_modality = "video" if normalize_text(note_content_type).lower() in {"video", "mixed"} else "text"
    caption_raw = normalize_text(metadata.get("caption_raw"))
    primary_text = raw_content if work_modality == "video" else (caption_raw or raw_content)
    primary_text_source = "asr_clean" if work_modality == "video" else "caption_raw"
    analysis_eligibility = "eligible" if primary_text else "incomplete"
    analysis_exclusion_reason = "" if analysis_eligibility == "eligible" else ("video_asr_unavailable" if work_modality == "video" else "caption_raw_missing")

    return {
        "platform": "xiaohongshu",
        "content_kind": "note",
        "source": source_input,
        "note_id": note_id,
        "note_content_type": note_content_type,
        "analysis_mode": analysis_mode,
        "subtitle_hit": subtitle_hit,
        "text_source": text_source,
        "asr_source": resolved_asr_source,
        "u2_task_id": u2_task_id,
        "u2_task_status": u2_task_status,
        "selected_video_url": selected_video_url,
        "selected_video_candidates": selected_video_candidates,
        "selected_image_urls": selected_image_urls,
        "title": metadata.get("title"),
        "caption_raw": caption_raw,
        "author": metadata.get("author"),
        "create_time_sec": metadata.get("create_time_sec"),
        "publish_time": metadata.get("publish_time"),
        "published_date": _format_published_date(metadata.get("publish_time")),
        "publish_time_source": metadata.get("publish_time_source"),
        "duration_ms": metadata.get("duration_ms"),
        "tags": metadata.get("tags", []),
        "digg_count": metadata.get("digg_count"),
        "comment_count": metadata.get("comment_count"),
        "collect_count": metadata.get("collect_count"),
        "share_count": metadata.get("share_count"),
        "share_url": metadata.get("share_url"),
        "source_url": metadata.get("source_url"),
        "cover_image": metadata.get("cover_image"),
        "video_down_url": metadata.get("video_down_url"),
        "video_download_url": metadata.get("video_download_url") or metadata.get("video_down_url"),
        "work_modality": work_modality,
        "author_handle": metadata.get("author_handle"),
        "platform_author_id": metadata.get("platform_author_id") or metadata.get("author_platform_id"),
        "xhs_user_id": metadata.get("xhs_user_id"),
        "xhs_sec_token": metadata.get("xhs_sec_token"),
        "downloaded_assets": downloaded_assets,
        "raw_content": raw_content,
        "primary_text": primary_text,
        "primary_text_source": primary_text_source,
        "analysis_eligibility": analysis_eligibility,
        "analysis_exclusion_reason": analysis_exclusion_reason,
        "summary": summary_block["summary"],
        "insights": insights,
        "confidence": confidence,
        "error_reason": error_reason,
        "missing_fields": missing_fields or [],
        "extract_trace": extract_trace,
        "fallback_trace": fallback_trace,
        "request_id": request_id,
    }
1405
+
1406
+
1407
+ def run_xiaohongshu_extract(
1408
+ *,
1409
+ input_value: Optional[str],
1410
+ share_text: Optional[str],
1411
+ note_id: Optional[str],
1412
+ env_file: Optional[str],
1413
+ api_key_env: str,
1414
+ base_url: Optional[str],
1415
+ timeout_ms: Optional[int],
1416
+ poll_interval_sec: float,
1417
+ max_polls: int,
1418
+ u2_submit_max_retries: int,
1419
+ u2_submit_backoff_ms: int,
1420
+ u2_timeout_retry_enabled: bool,
1421
+ u2_timeout_retry_max_retries: int,
1422
+ force_u2_fallback: bool,
1423
+ write_card: bool,
1424
+ card_type: str,
1425
+ card_root: Optional[str],
1426
+ storage_config: Optional[Dict[str, Any]] = None,
1427
+ allow_process_env: bool = False,
1428
+ persist_output: bool = True,
1429
+ progress: Optional[ProgressReporter] = None,
1430
+ ) -> Dict[str, Any]:
1431
+ if not write_card or not persist_output:
1432
+ raise ValueError(
1433
+ f"fixed_pipeline_requires_full_persistence:xiaohongshu:note:write_card={bool(write_card)}:persist_output={bool(persist_output)}"
1434
+ )
1435
+
1436
+ source_input = _normalize_input(input_value, share_text, note_id)
1437
+ if progress is not None:
1438
+ progress.started(stage="note.workflow", message="xiaohongshu note workflow started")
1439
+ metadata_fields: Dict[str, Any] = {}
1440
+ if not source_input["share_text"] and not source_input["note_id"]:
1441
+ result = _build_result(
1442
+ source_input=source_input,
1443
+ raw_content="",
1444
+ confidence="low",
1445
+ error_reason="missing_share_text_or_note_id",
1446
+ extract_trace=[],
1447
+ fallback_trace=[],
1448
+ request_id=None,
1449
+ text_source="none",
1450
+ note_id=None,
1451
+ subtitle_hit=False,
1452
+ u2_task_id=None,
1453
+ u2_task_status="UNKNOWN",
1454
+ note_content_type="unknown",
1455
+ analysis_mode="none",
1456
+ selected_video_url=None,
1457
+ selected_video_candidates=[],
1458
+ selected_image_urls=[],
1459
+ downloaded_assets=[],
1460
+ missing_fields=[{"field": "share_text_or_note_id", "reason": "missing_input"}],
1461
+ metadata_fields=metadata_fields,
1462
+ )
1463
+ if write_card:
1464
+ result["card_write"] = write_benchmark_card(
1465
+ payload=result,
1466
+ platform="xiaohongshu",
1467
+ card_type=card_type,
1468
+ card_root=card_root,
1469
+ content_kind="note",
1470
+ storage_config=storage_config,
1471
+ )
1472
+ return _finalize_result(
1473
+ result=result,
1474
+ source_input=source_input,
1475
+ note_id=None,
1476
+ storage_config=storage_config,
1477
+ persist_output=persist_output,
1478
+ )
1479
+
1480
+ runtime = resolve_runtime(
1481
+ env_file=env_file,
1482
+ api_key_env=api_key_env,
1483
+ base_url=base_url,
1484
+ timeout_ms=timeout_ms,
1485
+ allow_process_env=allow_process_env,
1486
+ )
1487
+
1488
+ trace: List[Dict[str, Any]] = []
1489
+
1490
+ if progress is not None:
1491
+ progress.progress(stage="note.fetch", message="fetching xiaohongshu note payload")
1492
+ note_response = _fetch_note_info(
1493
+ base_url=runtime["base_url"],
1494
+ token=runtime["token"],
1495
+ timeout_ms=runtime["timeout_ms"],
1496
+ source_input=source_input,
1497
+ )
1498
+
1499
+ attempts = note_response.get("_attempts") or []
1500
+ for index, attempt in enumerate(attempts, start=1):
1501
+ response = attempt.get("response") if isinstance(attempt, dict) else None
1502
+ endpoint = attempt.get("endpoint") if isinstance(attempt, dict) else None
1503
+ label = attempt.get("label") if isinstance(attempt, dict) else None
1504
+ if not isinstance(response, dict):
1505
+ continue
1506
+ step = "u1_get_note_info_effective" if index == len(attempts) else f"u1_get_note_info_attempt_{index}"
1507
+ trace.append(
1508
+ build_api_trace(
1509
+ step=step,
1510
+ endpoint=endpoint,
1511
+ response=response,
1512
+ extra={
1513
+ "route_label": label,
1514
+ "attempt": index,
1515
+ "chosen_route": note_response.get("_route_label"),
1516
+ "field_completeness": response.get("_field_completeness"),
1517
+ },
1518
+ )
1519
+ )
1520
+
1521
+ trace.append(
1522
+ {
1523
+ "step": "u1_get_note_info_route_decision",
1524
+ "chosen_route": note_response.get("_route_label"),
1525
+ "request_id": note_response.get("request_id"),
1526
+ "field_completeness": note_response.get("_field_completeness"),
1527
+ "attempt_count": len(attempts),
1528
+ }
1529
+ )
1530
+
1531
+ if not note_response.get("ok"):
1532
+ error_ctx = resolve_trace_error_context(
1533
+ responses=[note_response],
1534
+ extract_trace=trace,
1535
+ default_error_reason="u1_get_note_info_failed",
1536
+ )
1537
+ result = _build_result(
1538
+ source_input=source_input,
1539
+ raw_content="",
1540
+ confidence="low",
1541
+ error_reason=error_ctx.get("error_reason"),
1542
+ extract_trace=trace,
1543
+ fallback_trace=error_ctx.get("fallback_trace", []),
1544
+ request_id=error_ctx.get("request_id"),
1545
+ text_source="none",
1546
+ note_id=source_input.get("note_id"),
1547
+ subtitle_hit=False,
1548
+ u2_task_id=None,
1549
+ u2_task_status="UNKNOWN",
1550
+ note_content_type="unknown",
1551
+ analysis_mode="none",
1552
+ selected_video_url=None,
1553
+ selected_video_candidates=[],
1554
+ selected_image_urls=[],
1555
+ downloaded_assets=[],
1556
+ missing_fields=[{"field": "u1_note_info", "reason": "all_routes_failed"}],
1557
+ metadata_fields=metadata_fields,
1558
+ )
1559
+ if write_card:
1560
+ result["card_write"] = write_benchmark_card(
1561
+ payload=result,
1562
+ platform="xiaohongshu",
1563
+ card_type=card_type,
1564
+ card_root=card_root,
1565
+ content_kind="note",
1566
+ storage_config=storage_config,
1567
+ )
1568
+ return _finalize_result(
1569
+ result=result,
1570
+ source_input=source_input,
1571
+ note_id=source_input.get("note_id"),
1572
+ storage_config=storage_config,
1573
+ persist_output=persist_output,
1574
+ )
1575
+
1576
+ effective_payload = note_response.get("data")
1577
+ app_route_success = str(note_response.get("_route_label") or "").startswith("app")
1578
+ metadata_enrich_on_sparse = bool(config_get(storage_config or {}, "xhs.metadata_enrich_on_sparse", True))
1579
+
1580
+ initial_metadata = _extract_xhs_metadata(
1581
+ payload=effective_payload,
1582
+ source_input=source_input,
1583
+ selected_video_url=None,
1584
+ selected_image_urls=[],
1585
+ )
1586
+ sparse_metadata_detected = bool(app_route_success and metadata_enrich_on_sparse and _is_sparse_metadata(initial_metadata))
1587
+ metadata_enrich_hit = False
1588
+ enrich_response: Optional[Dict[str, Any]] = None
1589
+ enrich_payload: Any = None
1590
+
1591
+ if sparse_metadata_detected:
1592
+ enrich_response = _fetch_sparse_metadata_enrich(
1593
+ base_url=runtime["base_url"],
1594
+ token=runtime["token"],
1595
+ timeout_ms=runtime["timeout_ms"],
1596
+ source_input=source_input,
1597
+ note_id=source_input.get("note_id"),
1598
+ )
1599
+ trace.append(
1600
+ build_api_trace(
1601
+ step="u1_sparse_metadata_enrich",
1602
+ endpoint=enrich_response.get("_endpoint"),
1603
+ response=enrich_response,
1604
+ extra={"route_label": enrich_response.get("_route_label")},
1605
+ )
1606
+ )
1607
+ if enrich_response.get("ok"):
1608
+ metadata_enrich_hit = True
1609
+ enrich_payload = enrich_response.get("data")
1610
+ effective_payload = {"app": note_response.get("data"), "web_v2_enrich": enrich_payload}
1611
+
1612
+ resolved_note_id = _resolve_note_id(effective_payload, source_input)
1613
+
1614
+ title = normalize_text(deep_find_first(effective_payload, ["title"]))
1615
+ desc = normalize_text(deep_find_first(effective_payload, ["desc", "content"]))
1616
+ caption_text = "\n".join([t for t in [title, desc] if t]).strip()
1617
+
1618
+ subtitle_inline_text = "" if force_u2_fallback else _extract_subtitle_inline_text(effective_payload)
1619
+ subtitle_urls = [] if force_u2_fallback else _extract_subtitle_urls(effective_payload)
1620
+ subtitle_url_text = "" if force_u2_fallback else _fetch_subtitle_text(subtitle_urls, runtime["timeout_ms"])
1621
+ subtitle_text = subtitle_inline_text or subtitle_url_text
1622
+
1623
+ app_video_candidates = _extract_video_candidates(note_response.get("data"))
1624
+ app_image_candidates, image_quality_strategy = _extract_image_candidates_with_strategy(note_response.get("data"))
1625
+ enrich_video_candidates = _extract_video_candidates(enrich_payload) if metadata_enrich_hit else []
1626
+ enrich_image_candidates = _extract_image_candidates(enrich_payload) if metadata_enrich_hit else []
1627
+
1628
+ video_candidates = _dedupe_keep_order(app_video_candidates + enrich_video_candidates)
1629
+ image_candidates = _dedupe_keep_order(app_image_candidates + enrich_image_candidates)
1630
+
1631
+ selected_video_url = video_candidates[0] if video_candidates else None
1632
+ type_field_value = _extract_note_type_field(effective_payload)
1633
+ note_content_type = _detect_note_content_type(effective_payload, video_candidates, image_candidates)
1634
+
1635
+ metadata_fields = _extract_xhs_metadata(
1636
+ payload=effective_payload,
1637
+ source_input=source_input,
1638
+ selected_video_url=selected_video_url,
1639
+ selected_image_urls=image_candidates,
1640
+ )
1641
+
1642
+ missing_fields: List[Dict[str, str]] = []
1643
+ _append_missing_metadata_fields(missing_fields, metadata_fields)
1644
+
1645
+ trace.append(
1646
+ {
1647
+ "step": "media_probe",
1648
+ "type_field_value": type_field_value,
1649
+ "note_content_type": note_content_type,
1650
+ "video_candidate_count": len(video_candidates),
1651
+ "image_candidate_count": len(image_candidates),
1652
+ "image_quality_strategy": image_quality_strategy,
1653
+ "subtitle_hit": bool(subtitle_text),
1654
+ "subtitle_url_count": len(subtitle_urls),
1655
+ "force_u2_fallback": force_u2_fallback,
1656
+ "sparse_metadata_detected": sparse_metadata_detected,
1657
+ "metadata_enrich_hit": metadata_enrich_hit,
1658
+ "candidate_merge_sources": {
1659
+ "video": _build_candidate_merge_sources(
1660
+ app_candidates=app_video_candidates,
1661
+ enrich_candidates=enrich_video_candidates,
1662
+ app_label="app",
1663
+ ),
1664
+ "image": _build_candidate_merge_sources(
1665
+ app_candidates=app_image_candidates,
1666
+ enrich_candidates=enrich_image_candidates,
1667
+ app_label="app",
1668
+ ),
1669
+ },
1670
+ }
1671
+ )
1672
+
1673
+ u2_gate = _evaluate_u2_gate_for_xhs(
1674
+ note_content_type=note_content_type,
1675
+ duration_ms=metadata_fields.get("duration_ms"),
1676
+ video_down_url=metadata_fields.get("video_down_url") or selected_video_url,
1677
+ )
1678
+ trace.append(
1679
+ {
1680
+ "step": "u2_gate",
1681
+ "can_u2": bool(u2_gate.get("can_u2")),
1682
+ "gate_reason": u2_gate.get("gate_reason"),
1683
+ "rule": U2_GATE_RULE,
1684
+ "is_video": u2_gate.get("is_video"),
1685
+ "duration_ms": u2_gate.get("duration_ms"),
1686
+ "video_download_url_present": u2_gate.get("video_download_url_present"),
1687
+ }
1688
+ )
1689
+
1690
+ # Video-note path: aligned with douyin single-video pipeline (subtitle-first difference retained).
1691
+ if note_content_type in {"video", "mixed"}:
1692
+ if subtitle_text:
1693
+ success_ctx = resolve_trace_error_context(
1694
+ responses=[note_response],
1695
+ extract_trace=trace,
1696
+ explicit_error_reason=None,
1697
+ explicit_request_id=note_response.get("request_id"),
1698
+ )
1699
+ result = _build_result(
1700
+ source_input=source_input,
1701
+ raw_content=subtitle_text,
1702
+ confidence="high",
1703
+ error_reason=None,
1704
+ extract_trace=trace,
1705
+ fallback_trace=success_ctx.get("fallback_trace", []),
1706
+ request_id=success_ctx.get("request_id"),
1707
+ text_source="subtitle",
1708
+ note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1709
+ subtitle_hit=True,
1710
+ u2_task_id=None,
1711
+ u2_task_status="SKIPPED",
1712
+ note_content_type=note_content_type,
1713
+ analysis_mode="video_full",
1714
+ selected_video_url=selected_video_url,
1715
+ selected_video_candidates=video_candidates,
1716
+ selected_image_urls=image_candidates,
1717
+ downloaded_assets=[],
1718
+ missing_fields=missing_fields,
1719
+ metadata_fields=metadata_fields,
1720
+ )
1721
+ if write_card:
1722
+ result["card_write"] = write_benchmark_card(
1723
+ payload=result,
1724
+ platform="xiaohongshu",
1725
+ card_type=card_type,
1726
+ card_root=card_root,
1727
+ content_kind="single_video",
1728
+ storage_config=storage_config,
1729
+ )
1730
+ return _finalize_result(
1731
+ result=result,
1732
+ source_input=source_input,
1733
+ note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1734
+ storage_config=storage_config,
1735
+ persist_output=persist_output,
1736
+ )
1737
+
1738
+ if not u2_gate.get("can_u2"):
1739
+ gate_reason = normalize_text(u2_gate.get("gate_reason")) or "skip:unknown"
1740
+ if gate_reason == "skip:duration_missing":
1741
+ missing_fields.append({"field": "duration_ms", "reason": gate_reason})
1742
+ elif gate_reason in {"skip:duration_too_short", "skip:duration_too_long"}:
1743
+ missing_fields.append({"field": "duration_ms", "reason": gate_reason})
1744
+ elif gate_reason == "skip:video_download_url_missing":
1745
+ missing_fields.append({"field": "video_download_url", "reason": gate_reason})
1746
+ elif gate_reason == "skip:not_video":
1747
+ missing_fields.append({"field": "note_content_type", "reason": gate_reason})
1748
+
1749
+ error_ctx = resolve_trace_error_context(
1750
+ responses=[note_response],
1751
+ extract_trace=trace,
1752
+ default_error_reason=gate_reason,
1753
+ )
1754
+ fallback_text = caption_text
1755
+ result = _build_result(
1756
+ source_input=source_input,
1757
+ raw_content=fallback_text,
1758
+ confidence="medium" if fallback_text else "low",
1759
+ error_reason=None if fallback_text else error_ctx.get("error_reason"),
1760
+ extract_trace=trace,
1761
+ fallback_trace=error_ctx.get("fallback_trace", []),
1762
+ request_id=error_ctx.get("request_id"),
1763
+ text_source="caption_fallback" if fallback_text else "none",
1764
+ note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1765
+ subtitle_hit=False,
1766
+ u2_task_id=None,
1767
+ u2_task_status="SKIPPED",
1768
+ note_content_type=note_content_type,
1769
+ analysis_mode="video_full",
1770
+ selected_video_url=u2_gate.get("video_down_url") or selected_video_url,
1771
+ selected_video_candidates=video_candidates,
1772
+ selected_image_urls=image_candidates,
1773
+ downloaded_assets=[],
1774
+ missing_fields=missing_fields,
1775
+ metadata_fields=metadata_fields,
1776
+ )
1777
+ if write_card:
1778
+ result["card_write"] = write_benchmark_card(
1779
+ payload=result,
1780
+ platform="xiaohongshu",
1781
+ card_type=card_type,
1782
+ card_root=card_root,
1783
+ content_kind="single_video",
1784
+ storage_config=storage_config,
1785
+ )
1786
+ return _finalize_result(
1787
+ result=result,
1788
+ source_input=source_input,
1789
+ note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1790
+ storage_config=storage_config,
1791
+ persist_output=persist_output,
1792
+ )
1793
+
1794
+ u2_candidates = _dedupe_keep_order([u2_gate.get("video_down_url")] + list(video_candidates))
1795
+ if progress is not None:
1796
+ progress.progress(
1797
+ stage="note.u2",
1798
+ message="starting xiaohongshu u2 flow",
1799
+ data={"candidate_count": len(u2_candidates)},
1800
+ )
1801
+ u2_bundle = run_u2_asr_candidates_with_timeout_retry(
1802
+ base_url=runtime["base_url"],
1803
+ token=runtime["token"],
1804
+ timeout_ms=runtime["timeout_ms"],
1805
+ candidates=u2_candidates,
1806
+ submit_max_retries=u2_submit_max_retries,
1807
+ submit_backoff_ms=u2_submit_backoff_ms,
1808
+ poll_interval_sec=poll_interval_sec,
1809
+ max_polls=max_polls,
1810
+ timeout_retry_enabled=u2_timeout_retry_enabled,
1811
+ timeout_retry_max_retries=u2_timeout_retry_max_retries,
1812
+ )
1813
+ submit_bundle = u2_bundle.get("submit_bundle", {})
1814
+ submit_response = submit_bundle.get("submit_response", {})
1815
+ task_id = submit_bundle.get("task_id")
1816
+ poll_result = u2_bundle.get("poll_result", {})
1817
+ selected_video_url = u2_bundle.get("chosen_candidate") or selected_video_url
1818
+ if selected_video_url and not normalize_text(metadata_fields.get("video_down_url")):
1819
+ metadata_fields["video_down_url"] = selected_video_url
1820
+
1821
+ trace.append(
1822
+ {
1823
+ "step": "u2_asr_timeout_retry",
1824
+ "endpoint": "/api/u2/v1/services/audio/asr/transcription + /api/u2/v1/tasks/{task_id}",
1825
+ "selected_video_url": selected_video_url,
1826
+ "selected_video_candidates": u2_candidates,
1827
+ "candidate_attempts": u2_bundle.get("candidate_attempts", []),
1828
+ "submit_retries_config": {
1829
+ "u2_submit_max_retries": max(0, int(u2_submit_max_retries)),
1830
+ "u2_submit_backoff_ms": max(0, int(u2_submit_backoff_ms)),
1831
+ },
1832
+ "timeout_retry": u2_bundle.get("timeout_retry", {}),
1833
+ "rounds": u2_bundle.get("rounds", []),
1834
+ "final_task_id": poll_result.get("task_id") or task_id,
1835
+ "final_task_status": poll_result.get("task_status"),
1836
+ "final_error_reason": poll_result.get("error_reason"),
1837
+ }
1838
+ )
1839
+ if progress is not None:
1840
+ (progress.done if poll_result.get("ok") else progress.failed)(
1841
+ stage="note.u2",
1842
+ message="xiaohongshu u2 flow finished" if poll_result.get("ok") else "xiaohongshu u2 flow failed",
1843
+ data={
1844
+ "task_id": poll_result.get("task_id") or task_id,
1845
+ "task_status": poll_result.get("task_status"),
1846
+ "error_reason": poll_result.get("error_reason"),
1847
+ },
1848
+ )
1849
+
1850
+ if not poll_result.get("ok") and (
1851
+ not submit_response.get("ok") or not (poll_result.get("task_id") or task_id)
1852
+ ):
1853
+ error_ctx = resolve_trace_error_context(
1854
+ responses=[poll_result, submit_response, note_response],
1855
+ extract_trace=trace,
1856
+ default_error_reason="u2_submit_failed_or_missing_task_id",
1857
+ explicit_request_id=(
1858
+ poll_result.get("request_id")
1859
+ or submit_response.get("request_id")
1860
+ or note_response.get("request_id")
1861
+ ),
1862
+ )
1863
+ fallback_text = caption_text
1864
+ if fallback_text:
1865
+ missing_fields.append({"field": "asr_transcript", "reason": f"u2_failed:{error_ctx.get('error_reason')}"})
1866
+ else:
1867
+ missing_fields.append({"field": "raw_content", "reason": "u2_failed_and_caption_missing"})
1868
+ result = _build_result(
1869
+ source_input=source_input,
1870
+ raw_content=fallback_text,
1871
+ confidence="medium" if fallback_text else "low",
1872
+ error_reason=None if fallback_text else error_ctx.get("error_reason"),
1873
+ extract_trace=trace,
1874
+ fallback_trace=error_ctx.get("fallback_trace", []),
1875
+ request_id=error_ctx.get("request_id"),
1876
+ text_source="caption_fallback" if fallback_text else "u2",
1877
+ note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1878
+ subtitle_hit=False,
1879
+ u2_task_id=poll_result.get("task_id") or task_id,
1880
+ u2_task_status=poll_result.get("task_status") or "UNKNOWN",
1881
+ note_content_type=note_content_type,
1882
+ analysis_mode="video_full",
1883
+ selected_video_url=selected_video_url,
1884
+ selected_video_candidates=u2_candidates,
1885
+ selected_image_urls=image_candidates,
1886
+ downloaded_assets=[],
1887
+ missing_fields=missing_fields,
1888
+ metadata_fields=metadata_fields,
1889
+ )
1890
+ if write_card:
1891
+ result["card_write"] = write_benchmark_card(
1892
+ payload=result,
1893
+ platform="xiaohongshu",
1894
+ card_type=card_type,
1895
+ card_root=card_root,
1896
+ content_kind="single_video",
1897
+ storage_config=storage_config,
1898
+ )
1899
+ return _finalize_result(
1900
+ result=result,
1901
+ source_input=source_input,
1902
+ note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1903
+ storage_config=storage_config,
1904
+ persist_output=persist_output,
1905
+ )
1906
+
1907
+ raw_content = poll_result.get("transcript_text", "") if poll_result.get("ok") else ""
1908
+ final_ctx = resolve_trace_error_context(
1909
+ responses=[poll_result, submit_response, note_response],
1910
+ extract_trace=trace,
1911
+ explicit_error_reason=poll_result.get("error_reason"),
1912
+ explicit_request_id=poll_result.get("request_id") or submit_response.get("request_id") or note_response.get("request_id"),
1913
+ )
1914
+ result = _build_result(
1915
+ source_input=source_input,
1916
+ raw_content=raw_content,
1917
+ confidence="high" if poll_result.get("ok") and raw_content else "low",
1918
+ error_reason=final_ctx.get("error_reason"),
1919
+ extract_trace=trace,
1920
+ fallback_trace=final_ctx.get("fallback_trace", []),
1921
+ request_id=final_ctx.get("request_id"),
1922
+ text_source="u2",
1923
+ note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1924
+ subtitle_hit=False,
1925
+ u2_task_id=poll_result.get("task_id") or task_id,
1926
+ u2_task_status=poll_result.get("task_status"),
1927
+ note_content_type=note_content_type,
1928
+ analysis_mode="video_full",
1929
+ selected_video_url=selected_video_url,
1930
+ selected_video_candidates=u2_candidates,
1931
+ selected_image_urls=image_candidates,
1932
+ downloaded_assets=[],
1933
+ missing_fields=missing_fields,
1934
+ metadata_fields=metadata_fields,
1935
+ )
1936
+
1937
+ if write_card:
1938
+ result["card_write"] = write_benchmark_card(
1939
+ payload=result,
1940
+ platform="xiaohongshu",
1941
+ card_type=card_type,
1942
+ card_root=card_root,
1943
+ content_kind="single_video",
1944
+ storage_config=storage_config,
1945
+ )
1946
+
1947
+ return _finalize_result(
1948
+ result=result,
1949
+ source_input=source_input,
1950
+ note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1951
+ storage_config=storage_config,
1952
+ persist_output=persist_output,
1953
+ )
1954
+
1955
+ # Image-note path, strategy B: download images + light text analysis + write card.
1956
+ raw_content = caption_text
1957
+
1958
+ downloaded_assets = _download_images(
1959
+ urls=image_candidates,
1960
+ timeout_ms=runtime["timeout_ms"],
1961
+ source_input=source_input,
1962
+ note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1963
+ storage_config=storage_config,
1964
+ )
1965
+
1966
+ if not image_candidates:
1967
+ missing_fields.append({"field": "selected_image_urls", "reason": "image_note_but_no_image_url"})
1968
+ if not raw_content:
1969
+ missing_fields.append({"field": "raw_content", "reason": "title_and_desc_missing"})
1970
+
1971
+ success_ctx = resolve_trace_error_context(
1972
+ responses=[note_response],
1973
+ extract_trace=trace,
1974
+ explicit_error_reason=None,
1975
+ explicit_request_id=note_response.get("request_id"),
1976
+ )
1977
+
1978
+ result = _build_result(
1979
+ source_input=source_input,
1980
+ raw_content=raw_content,
1981
+ confidence="high" if raw_content else "medium",
1982
+ error_reason=None,
1983
+ extract_trace=trace,
1984
+ fallback_trace=success_ctx.get("fallback_trace", []),
1985
+ request_id=success_ctx.get("request_id"),
1986
+ text_source="caption",
1987
+ note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1988
+ subtitle_hit=False,
1989
+ u2_task_id=None,
1990
+ u2_task_status="SKIPPED",
1991
+ note_content_type="image" if note_content_type == "unknown" else note_content_type,
1992
+ analysis_mode="image_light_analysis",
1993
+ selected_video_url=None,
1994
+ selected_video_candidates=video_candidates,
1995
+ selected_image_urls=image_candidates,
1996
+ downloaded_assets=downloaded_assets,
1997
+ missing_fields=missing_fields,
1998
+ metadata_fields=metadata_fields,
1999
+ )
2000
+
2001
+ if write_card:
2002
+ result["card_write"] = write_benchmark_card(
2003
+ payload=result,
2004
+ platform="xiaohongshu",
2005
+ card_type=card_type,
2006
+ card_root=card_root,
2007
+ content_kind="note",
2008
+ storage_config=storage_config,
2009
+ )
2010
+
2011
+ finalized = _finalize_result(
2012
+ result=result,
2013
+ source_input=source_input,
2014
+ note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
2015
+ storage_config=storage_config,
2016
+ persist_output=persist_output,
2017
+ )
2018
+ if progress is not None:
2019
+ final_event = progress.failed if finalized.get("error_reason") else progress.done
2020
+ final_event(
2021
+ stage="note.workflow",
2022
+ message="xiaohongshu note workflow finished" if not finalized.get("error_reason") else "xiaohongshu note workflow failed",
2023
+ data={
2024
+ "request_id": finalized.get("request_id"),
2025
+ "card_write_ok": bool((finalized.get("card_write") or {}).get("ok")),
2026
+ "output_persist_ok": bool((finalized.get("output_persist") or {}).get("ok")),
2027
+ "text_source": finalized.get("text_source"),
2028
+ },
2029
+ )
2030
+ return finalized
2031
+
2032
+
2033
def main() -> None:
    """Command-line entry point for the xiaohongshu extraction chain.

    Parses CLI flags, layers them over the runtime YAML config (CLI values
    win), invokes ``run_xiaohongshu_extract``, writes the JSON result to
    stdout, and exits 0 on success / 1 when the result carries an
    ``error_reason``.
    """
    cli = argparse.ArgumentParser(description="Run xiaohongshu extraction chain")
    cli.add_argument("input", nargs="?", default=None, help="Share URL or note_id")
    cli.add_argument("--share-text", default=None, help="Xiaohongshu share URL/text")
    cli.add_argument("--note-id", default=None, help="Xiaohongshu note_id")
    cli.add_argument("--config", default=None, help="Runtime config YAML path")
    cli.add_argument("--env-file", default=None, help="Optional env file path")
    cli.add_argument("--allow-process-env", action="store_true", help="Allow process env to override .env/.env.local")
    cli.add_argument("--api-key-env", default=None, help="API key env variable name")
    cli.add_argument("--base-url", default=None, help="Tikomni base URL")
    cli.add_argument("--timeout-ms", type=int, default=None, help="Request timeout ms")
    cli.add_argument("--poll-interval-sec", type=float, default=None, help="U2 polling interval seconds")
    cli.add_argument("--max-polls", type=int, default=None, help="Max U2 polls")
    cli.add_argument(
        "--u2-submit-max-retries",
        type=int,
        default=None,
        help="Max retries for retriable U2 submit failures",
    )
    cli.add_argument(
        "--u2-submit-backoff-ms",
        type=int,
        default=None,
        help="Base backoff ms for retriable U2 submit failures (exponential)",
    )
    cli.add_argument(
        "--u2-timeout-retry-enabled",
        type=str,
        choices=["true", "false"],
        default=None,
        help="Enable conservative retry only when U2 polling times out",
    )
    cli.add_argument(
        "--u2-timeout-retry-max-retries",
        type=int,
        default=None,
        help="Conservative max retries for U2 timeout-only retry (0~3)",
    )
    cli.add_argument("--force-u2-fallback", action="store_true", help="Skip subtitle usage and force U2 fallback (test)")
    cli.add_argument("--card-type", choices=["work", "author", "author_sample_work"], default="work", help="Primary card type")
    cli.add_argument("--card-root", default=None, help="Card root (absolute); falls back to TIKOMNI_CARD_ROOT when writing cards")
    opts = cli.parse_args()

    config, _ = load_tikomni_config(
        opts.config,
        env_file=opts.env_file,
        allow_process_env=opts.allow_process_env,
    )

    def from_config(path, default):
        # Small convenience wrapper over config_get bound to the loaded config.
        return config_get(config, path, default)

    # String-valued settings use `or` on purpose: an empty-string CLI value
    # also falls back to the config, matching the original semantics.
    env_file_path = opts.env_file or from_config("runtime.env_file", None)
    api_key_env_name = opts.api_key_env or from_config("runtime.auth_env_key", "TIKOMNI_API_KEY")
    service_base_url = opts.base_url or from_config("runtime.base_url", None)

    # Numeric/boolean settings fall back to config only when the flag was omitted.
    timeout_ms = opts.timeout_ms if opts.timeout_ms is not None else from_config("runtime.timeout_ms", None)
    poll_interval = (
        opts.poll_interval_sec
        if opts.poll_interval_sec is not None
        else from_config("asr_strategy.poll_interval_sec", 3.0)
    )
    poll_limit = opts.max_polls if opts.max_polls is not None else from_config("asr_strategy.max_polls", 30)
    submit_retry_max = (
        opts.u2_submit_max_retries
        if opts.u2_submit_max_retries is not None
        else from_config("asr_strategy.submit_retry.xiaohongshu_note.max_retries", 0)
    )
    submit_backoff = (
        opts.u2_submit_backoff_ms
        if opts.u2_submit_backoff_ms is not None
        else from_config("asr_strategy.submit_retry.xiaohongshu_note.backoff_ms", 0)
    )
    timeout_retry_on = (
        (str(opts.u2_timeout_retry_enabled).lower() == "true")
        if opts.u2_timeout_retry_enabled is not None
        else bool(from_config("asr_strategy.u2_timeout_retry.enabled", True))
    )
    timeout_retry_max = (
        opts.u2_timeout_retry_max_retries
        if opts.u2_timeout_retry_max_retries is not None
        else from_config("asr_strategy.u2_timeout_retry.max_retries", 3)
    )

    try:
        result = run_xiaohongshu_extract(
            input_value=opts.input,
            share_text=opts.share_text,
            note_id=opts.note_id,
            env_file=env_file_path,
            api_key_env=api_key_env_name,
            base_url=service_base_url,
            timeout_ms=timeout_ms,
            poll_interval_sec=float(poll_interval),
            max_polls=int(poll_limit),
            u2_submit_max_retries=int(submit_retry_max),
            u2_submit_backoff_ms=int(submit_backoff),
            u2_timeout_retry_enabled=bool(timeout_retry_on),
            u2_timeout_retry_max_retries=int(timeout_retry_max),
            force_u2_fallback=opts.force_u2_fallback,
            write_card=True,
            card_type=opts.card_type,
            card_root=opts.card_root,
            storage_config=config,
            allow_process_env=opts.allow_process_env,
            persist_output=True,
        )
    except ValueError as error:
        # Runtime bootstrap failures are surfaced as a structured
        # low-confidence result on stdout instead of a traceback.
        result = {
            "platform": "xiaohongshu",
            "content_kind": "note",
            "raw_content": "",
            "summary": "",
            "insights": ["source=xiaohongshu:runtime", "runtime_not_ready"],
            "confidence": "low",
            "error_reason": str(error),
            "missing_fields": [],
            "extract_trace": [],
            "fallback_trace": [],
            "request_id": None,
        }

    write_json_stdout(result)
    raise SystemExit(0 if not result.get("error_reason") else 1)


if __name__ == "__main__":
    main()