@tikomni/skills 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (480)
  1. package/.skill-package-allowlist.txt +1 -3
  2. package/README.md +41 -49
  3. package/README.zh-CN.md +43 -51
  4. package/bin/tikomni-skills.js +2 -2
  5. package/env.example +37 -56
  6. package/package.json +7 -3
  7. package/skills/social-media-crawl/SKILL.md +53 -0
  8. package/skills/social-media-crawl/agents/openai.yaml +5 -0
  9. package/skills/social-media-crawl/references/contracts/output-envelope.md +22 -0
  10. package/skills/social-media-crawl/references/contracts/work-fact-card-fields.md +48 -0
  11. package/skills/social-media-crawl/references/guides/generic-mcp-objects.md +30 -0
  12. package/skills/social-media-crawl/references/mcp-usage-contract.md +30 -0
  13. package/skills/social-media-crawl/references/pipelines/douyin-creator-home.md +7 -0
  14. package/skills/social-media-crawl/references/pipelines/douyin-single-work.md +7 -0
  15. package/skills/social-media-crawl/references/pipelines/xiaohongshu-creator-home.md +7 -0
  16. package/skills/social-media-crawl/references/pipelines/xiaohongshu-single-work.md +7 -0
  17. package/skills/social-media-crawl/references/schemas/creator-profile.schema.json +33 -0
  18. package/skills/social-media-crawl/references/schemas/output-envelope.schema.json +41 -0
  19. package/skills/social-media-crawl/references/schemas/work-collection.schema.json +29 -0
  20. package/skills/social-media-crawl/references/schemas/work-fact-card.schema.json +67 -0
  21. package/skills/social-media-crawl/references/service-guides/u2-u3-mandatory-fallback.md +21 -0
  22. package/skills/social-media-crawl/scripts/__init__.py +2 -0
  23. package/skills/social-media-crawl/scripts/core/__init__.py +2 -0
  24. package/skills/{creator-analysis/scripts/pipeline/asr → social-media-crawl/scripts/core}/asr_pipeline.py +252 -9
  25. package/skills/social-media-crawl/scripts/core/completeness.py +83 -0
  26. package/skills/{single-work-analysis → social-media-crawl}/scripts/core/config_loader.py +108 -167
  27. package/skills/social-media-crawl/scripts/core/mcp_dispatch.py +145 -0
  28. package/skills/social-media-crawl/scripts/core/object_detection.py +63 -0
  29. package/skills/{creator-analysis/scripts/pipeline/asr → social-media-crawl/scripts/core}/poll_u2_task.py +6 -2
  30. package/skills/{single-work-analysis → social-media-crawl}/scripts/core/progress_report.py +32 -0
  31. package/skills/social-media-crawl/scripts/core/storage_router.py +160 -0
  32. package/skills/{creator-analysis → social-media-crawl}/scripts/core/tikomni_common.py +13 -3
  33. package/skills/social-media-crawl/scripts/core/u3_fallback.py +328 -0
  34. package/skills/social-media-crawl/scripts/pipelines/__init__.py +2 -0
  35. package/skills/social-media-crawl/scripts/pipelines/douyin_creator_home_helpers.py +35 -0
  36. package/skills/social-media-crawl/scripts/pipelines/douyin_platform_adapter.py +7 -0
  37. package/skills/{creator-analysis/scripts/author_home/asr → social-media-crawl/scripts/pipelines}/home_asr.py +1 -1
  38. package/skills/{creator-analysis/scripts/author_home/adapters → social-media-crawl/scripts/pipelines}/platform_adapters.py +8 -2
  39. package/skills/social-media-crawl/scripts/pipelines/run_douyin_creator_home.py +237 -0
  40. package/skills/{single-work-analysis/scripts/platform/douyin/run_douyin_single_video.py → social-media-crawl/scripts/pipelines/run_douyin_single_work.py} +282 -174
  41. package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_creator_home.py +237 -0
  42. package/skills/{single-work-analysis/scripts/platform/xiaohongshu/run_xiaohongshu_extract.py → social-media-crawl/scripts/pipelines/run_xiaohongshu_single_work.py} +290 -141
  43. package/skills/{creator-analysis/scripts/author_home → social-media-crawl/scripts/pipelines}/schema.py +1 -1
  44. package/skills/social-media-crawl/scripts/pipelines/xiaohongshu_creator_home_helpers.py +35 -0
  45. package/skills/social-media-crawl/scripts/pipelines/xiaohongshu_platform_adapter.py +7 -0
  46. package/skills/social-media-crawl/scripts/writers/__init__.py +1 -0
  47. package/skills/social-media-crawl/scripts/writers/write_work_fact_card.py +391 -0
  48. package/skills/creator-analysis/SKILL.md +0 -95
  49. package/skills/creator-analysis/agents/openai.yaml +0 -4
  50. package/skills/creator-analysis/env.example +0 -36
  51. package/skills/creator-analysis/references/api-capability-index.md +0 -92
  52. package/skills/creator-analysis/references/api-contracts/asr-api.md +0 -130
  53. package/skills/creator-analysis/references/api-contracts/bilibili-app-api.md +0 -776
  54. package/skills/creator-analysis/references/api-contracts/bilibili-web-api.md +0 -2017
  55. package/skills/creator-analysis/references/api-contracts/demo-api.md +0 -717
  56. package/skills/creator-analysis/references/api-contracts/douyin-app-v3-api.md +0 -3594
  57. package/skills/creator-analysis/references/api-contracts/douyin-billboard-api.md +0 -2274
  58. package/skills/creator-analysis/references/api-contracts/douyin-creator-api.md +0 -1575
  59. package/skills/creator-analysis/references/api-contracts/douyin-creator-v2-api.md +0 -3254
  60. package/skills/creator-analysis/references/api-contracts/douyin-search-api.md +0 -4118
  61. package/skills/creator-analysis/references/api-contracts/douyin-web-api.md +0 -5544
  62. package/skills/creator-analysis/references/api-contracts/douyin-xingtu-api.md +0 -1916
  63. package/skills/creator-analysis/references/api-contracts/douyin-xingtu-v2-api.md +0 -1540
  64. package/skills/creator-analysis/references/api-contracts/health-check.md +0 -69
  65. package/skills/creator-analysis/references/api-contracts/hybrid-parsing.md +0 -78
  66. package/skills/creator-analysis/references/api-contracts/instagram-v1-api.md +0 -2256
  67. package/skills/creator-analysis/references/api-contracts/instagram-v2-api.md +0 -2011
  68. package/skills/creator-analysis/references/api-contracts/instagram-v3-api.md +0 -2630
  69. package/skills/creator-analysis/references/api-contracts/ios-shortcut.md +0 -44
  70. package/skills/creator-analysis/references/api-contracts/kuaishou-app-api.md +0 -1518
  71. package/skills/creator-analysis/references/api-contracts/kuaishou-web-api.md +0 -1242
  72. package/skills/creator-analysis/references/api-contracts/lemon8-app-api.md +0 -1088
  73. package/skills/creator-analysis/references/api-contracts/linkedin-web-api.md +0 -1949
  74. package/skills/creator-analysis/references/api-contracts/media-ingest-api.md +0 -126
  75. package/skills/creator-analysis/references/api-contracts/pipixia-app-api.md +0 -1142
  76. package/skills/creator-analysis/references/api-contracts/reddit-app-api.md +0 -2025
  77. package/skills/creator-analysis/references/api-contracts/sora2-api.md +0 -2266
  78. package/skills/creator-analysis/references/api-contracts/temp-mail-api.md +0 -208
  79. package/skills/creator-analysis/references/api-contracts/threads-web-api.md +0 -897
  80. package/skills/creator-analysis/references/api-contracts/tikhub-downloader-api.md +0 -134
  81. package/skills/creator-analysis/references/api-contracts/tikhub-user-api.md +0 -494
  82. package/skills/creator-analysis/references/api-contracts/tiktok-ads-api.md +0 -5947
  83. package/skills/creator-analysis/references/api-contracts/tiktok-analytics-api.md +0 -968
  84. package/skills/creator-analysis/references/api-contracts/tiktok-app-v3-api.md +0 -5735
  85. package/skills/creator-analysis/references/api-contracts/tiktok-creator-api.md +0 -1951
  86. package/skills/creator-analysis/references/api-contracts/tiktok-interaction-api.md +0 -742
  87. package/skills/creator-analysis/references/api-contracts/tiktok-shop-web-api.md +0 -1890
  88. package/skills/creator-analysis/references/api-contracts/tiktok-web-api.md +0 -4448
  89. package/skills/creator-analysis/references/api-contracts/toutiao-app-api.md +0 -342
  90. package/skills/creator-analysis/references/api-contracts/toutiao-web-api.md +0 -143
  91. package/skills/creator-analysis/references/api-contracts/twitter-web-api.md +0 -989
  92. package/skills/creator-analysis/references/api-contracts/wechat-channels-api.md +0 -809
  93. package/skills/creator-analysis/references/api-contracts/wechat-media-platform-web-api.md +0 -677
  94. package/skills/creator-analysis/references/api-contracts/weibo-app-api.md +0 -1547
  95. package/skills/creator-analysis/references/api-contracts/weibo-web-api.md +0 -798
  96. package/skills/creator-analysis/references/api-contracts/weibo-web-v2-api.md +0 -2459
  97. package/skills/creator-analysis/references/api-contracts/xiaohongshu-app-api.md +0 -1291
  98. package/skills/creator-analysis/references/api-contracts/xiaohongshu-app-v2-api.md +0 -1683
  99. package/skills/creator-analysis/references/api-contracts/xiaohongshu-web-api.md +0 -1324
  100. package/skills/creator-analysis/references/api-contracts/xiaohongshu-web-v2-api.md +0 -1209
  101. package/skills/creator-analysis/references/api-contracts/xigua-app-v2-api.md +0 -489
  102. package/skills/creator-analysis/references/api-contracts/youtube-web-api.md +0 -2636
  103. package/skills/creator-analysis/references/api-contracts/youtube-web-v2-api.md +0 -2660
  104. package/skills/creator-analysis/references/api-contracts/zhihu-web-api.md +0 -2315
  105. package/skills/creator-analysis/references/api-tags/asr-api.md +0 -100
  106. package/skills/creator-analysis/references/api-tags/bilibili-app-api.md +0 -482
  107. package/skills/creator-analysis/references/api-tags/bilibili-web-api.md +0 -1267
  108. package/skills/creator-analysis/references/api-tags/demo-api.md +0 -365
  109. package/skills/creator-analysis/references/api-tags/douyin-app-v3-api.md +0 -2012
  110. package/skills/creator-analysis/references/api-tags/douyin-billboard-api.md +0 -1428
  111. package/skills/creator-analysis/references/api-tags/douyin-creator-api.md +0 -694
  112. package/skills/creator-analysis/references/api-tags/douyin-creator-v2-api.md +0 -694
  113. package/skills/creator-analysis/references/api-tags/douyin-search-api.md +0 -1059
  114. package/skills/creator-analysis/references/api-tags/douyin-web-api.md +0 -3314
  115. package/skills/creator-analysis/references/api-tags/douyin-xingtu-api.md +0 -935
  116. package/skills/creator-analysis/references/api-tags/douyin-xingtu-v2-api.md +0 -925
  117. package/skills/creator-analysis/references/api-tags/health-check.md +0 -40
  118. package/skills/creator-analysis/references/api-tags/hybrid-parsing.md +0 -57
  119. package/skills/creator-analysis/references/api-tags/instagram-v1-api.md +0 -1224
  120. package/skills/creator-analysis/references/api-tags/instagram-v2-api.md +0 -1147
  121. package/skills/creator-analysis/references/api-tags/instagram-v3-api.md +0 -1123
  122. package/skills/creator-analysis/references/api-tags/ios-shortcut.md +0 -45
  123. package/skills/creator-analysis/references/api-tags/kuaishou-app-api.md +0 -846
  124. package/skills/creator-analysis/references/api-tags/kuaishou-web-api.md +0 -551
  125. package/skills/creator-analysis/references/api-tags/lemon8-app-api.md +0 -687
  126. package/skills/creator-analysis/references/api-tags/linkedin-web-api.md +0 -1105
  127. package/skills/creator-analysis/references/api-tags/media-ingest-api.md +0 -112
  128. package/skills/creator-analysis/references/api-tags/pipixia-app-api.md +0 -721
  129. package/skills/creator-analysis/references/api-tags/reddit-app-api.md +0 -1057
  130. package/skills/creator-analysis/references/api-tags/sora2-api.md +0 -737
  131. package/skills/creator-analysis/references/api-tags/temp-mail-api.md +0 -136
  132. package/skills/creator-analysis/references/api-tags/threads-web-api.md +0 -472
  133. package/skills/creator-analysis/references/api-tags/tikhub-downloader-api.md +0 -65
  134. package/skills/creator-analysis/references/api-tags/tikhub-user-api.md +0 -253
  135. package/skills/creator-analysis/references/api-tags/tiktok-ads-api.md +0 -1393
  136. package/skills/creator-analysis/references/api-tags/tiktok-analytics-api.md +0 -179
  137. package/skills/creator-analysis/references/api-tags/tiktok-app-v3-api.md +0 -3264
  138. package/skills/creator-analysis/references/api-tags/tiktok-creator-api.md +0 -709
  139. package/skills/creator-analysis/references/api-tags/tiktok-interaction-api.md +0 -366
  140. package/skills/creator-analysis/references/api-tags/tiktok-shop-web-api.md +0 -663
  141. package/skills/creator-analysis/references/api-tags/tiktok-web-api.md +0 -2516
  142. package/skills/creator-analysis/references/api-tags/toutiao-app-api.md +0 -220
  143. package/skills/creator-analysis/references/api-tags/toutiao-web-api.md +0 -96
  144. package/skills/creator-analysis/references/api-tags/twitter-web-api.md +0 -562
  145. package/skills/creator-analysis/references/api-tags/wechat-channels-api.md +0 -405
  146. package/skills/creator-analysis/references/api-tags/wechat-media-platform-web-api.md +0 -431
  147. package/skills/creator-analysis/references/api-tags/weibo-app-api.md +0 -851
  148. package/skills/creator-analysis/references/api-tags/weibo-web-api.md +0 -470
  149. package/skills/creator-analysis/references/api-tags/weibo-web-v2-api.md +0 -1405
  150. package/skills/creator-analysis/references/api-tags/xiaohongshu-app-api.md +0 -534
  151. package/skills/creator-analysis/references/api-tags/xiaohongshu-app-v2-api.md +0 -934
  152. package/skills/creator-analysis/references/api-tags/xiaohongshu-web-api.md +0 -757
  153. package/skills/creator-analysis/references/api-tags/xiaohongshu-web-v2-api.md +0 -762
  154. package/skills/creator-analysis/references/api-tags/xigua-app-v2-api.md +0 -308
  155. package/skills/creator-analysis/references/api-tags/youtube-web-api.md +0 -934
  156. package/skills/creator-analysis/references/api-tags/youtube-web-v2-api.md +0 -717
  157. package/skills/creator-analysis/references/api-tags/zhihu-web-api.md +0 -1384
  158. package/skills/creator-analysis/references/asr-orchestration.md +0 -33
  159. package/skills/creator-analysis/references/config-templates/defaults.yaml +0 -60
  160. package/skills/creator-analysis/references/contracts/creator-card-fields.md +0 -25
  161. package/skills/creator-analysis/references/contracts/work-card-fields.md +0 -68
  162. package/skills/creator-analysis/references/platform-guides/douyin.md +0 -54
  163. package/skills/creator-analysis/references/platform-guides/generic.md +0 -50
  164. package/skills/creator-analysis/references/platform-guides/xiaohongshu.md +0 -69
  165. package/skills/creator-analysis/references/prompt-contracts/asr-clean.md +0 -28
  166. package/skills/creator-analysis/references/prompt-contracts/author-analysis-v2.md +0 -46
  167. package/skills/creator-analysis/references/prompt-contracts/author-analysis.md +0 -49
  168. package/skills/creator-analysis/references/prompt-contracts/cta.md +0 -24
  169. package/skills/creator-analysis/references/prompt-contracts/hook.md +0 -25
  170. package/skills/creator-analysis/references/prompt-contracts/insight.md +0 -47
  171. package/skills/creator-analysis/references/prompt-contracts/sampled-work-batch-explanations.md +0 -30
  172. package/skills/creator-analysis/references/prompt-contracts/structure.md +0 -25
  173. package/skills/creator-analysis/references/prompt-contracts/style.md +0 -27
  174. package/skills/creator-analysis/references/prompt-contracts/summary.md +0 -29
  175. package/skills/creator-analysis/references/prompt-contracts/topic.md +0 -29
  176. package/skills/creator-analysis/references/schemas/author-analysis-input-v1.schema.json +0 -325
  177. package/skills/creator-analysis/references/schemas/author-analysis-v2.schema.json +0 -287
  178. package/skills/creator-analysis/references/schemas/sampled-work-batch-explanations.schema.json +0 -41
  179. package/skills/creator-analysis/references/service-guides/asr-u2-u3-fallback.md +0 -75
  180. package/skills/creator-analysis/references/workflow.md +0 -23
  181. package/skills/creator-analysis/scripts/__init__.py +0 -0
  182. package/skills/creator-analysis/scripts/author_home/__init__.py +0 -0
  183. package/skills/creator-analysis/scripts/author_home/adapters/__init__.py +0 -0
  184. package/skills/creator-analysis/scripts/author_home/analyzers/__init__.py +0 -0
  185. package/skills/creator-analysis/scripts/author_home/analyzers/author_analysis_v2_support.py +0 -1165
  186. package/skills/creator-analysis/scripts/author_home/analyzers/prompt_first_analyzers.py +0 -447
  187. package/skills/creator-analysis/scripts/author_home/analyzers/sampled_work_batch_explainer.py +0 -331
  188. package/skills/creator-analysis/scripts/author_home/asr/__init__.py +0 -5
  189. package/skills/creator-analysis/scripts/author_home/builders/__init__.py +0 -0
  190. package/skills/creator-analysis/scripts/author_home/builders/home_builders.py +0 -213
  191. package/skills/creator-analysis/scripts/author_home/collectors/__init__.py +0 -0
  192. package/skills/creator-analysis/scripts/author_home/orchestrator/__init__.py +0 -0
  193. package/skills/creator-analysis/scripts/author_home/orchestrator/run_author_analysis.py +0 -834
  194. package/skills/creator-analysis/scripts/author_home/orchestrator/work_analysis_artifacts.py +0 -609
  195. package/skills/creator-analysis/scripts/core/__init__.py +0 -0
  196. package/skills/creator-analysis/scripts/core/analysis_pipeline.py +0 -133
  197. package/skills/creator-analysis/scripts/core/config_loader.py +0 -418
  198. package/skills/creator-analysis/scripts/core/progress_report.py +0 -111
  199. package/skills/creator-analysis/scripts/core/storage_router.py +0 -256
  200. package/skills/creator-analysis/scripts/pipeline/__init__.py +0 -0
  201. package/skills/creator-analysis/scripts/pipeline/asr/__init__.py +0 -0
  202. package/skills/creator-analysis/scripts/platform/__init__.py +0 -0
  203. package/skills/creator-analysis/scripts/platform/douyin/__init__.py +0 -0
  204. package/skills/creator-analysis/scripts/platform/douyin/run_douyin_single_video.py +0 -1208
  205. package/skills/creator-analysis/scripts/platform/xiaohongshu/__init__.py +0 -0
  206. package/skills/creator-analysis/scripts/platform/xiaohongshu/run_xiaohongshu_extract.py +0 -2128
  207. package/skills/creator-analysis/scripts/writers/__init__.py +0 -0
  208. package/skills/creator-analysis/scripts/writers/write_author_homepage_samples.py +0 -107
  209. package/skills/creator-analysis/scripts/writers/write_benchmark_card.py +0 -1579
  210. package/skills/meta-capability/SKILL.md +0 -69
  211. package/skills/meta-capability/agents/openai.yaml +0 -4
  212. package/skills/meta-capability/env.example +0 -42
  213. package/skills/meta-capability/references/api-capability-index.md +0 -92
  214. package/skills/meta-capability/references/api-contracts/asr-api.md +0 -130
  215. package/skills/meta-capability/references/api-contracts/bilibili-app-api.md +0 -776
  216. package/skills/meta-capability/references/api-contracts/bilibili-web-api.md +0 -2017
  217. package/skills/meta-capability/references/api-contracts/demo-api.md +0 -717
  218. package/skills/meta-capability/references/api-contracts/douyin-app-v3-api.md +0 -3594
  219. package/skills/meta-capability/references/api-contracts/douyin-billboard-api.md +0 -2274
  220. package/skills/meta-capability/references/api-contracts/douyin-creator-api.md +0 -1575
  221. package/skills/meta-capability/references/api-contracts/douyin-creator-v2-api.md +0 -3254
  222. package/skills/meta-capability/references/api-contracts/douyin-search-api.md +0 -4118
  223. package/skills/meta-capability/references/api-contracts/douyin-web-api.md +0 -5544
  224. package/skills/meta-capability/references/api-contracts/douyin-xingtu-api.md +0 -1916
  225. package/skills/meta-capability/references/api-contracts/douyin-xingtu-v2-api.md +0 -1540
  226. package/skills/meta-capability/references/api-contracts/health-check.md +0 -69
  227. package/skills/meta-capability/references/api-contracts/hybrid-parsing.md +0 -78
  228. package/skills/meta-capability/references/api-contracts/instagram-v1-api.md +0 -2256
  229. package/skills/meta-capability/references/api-contracts/instagram-v2-api.md +0 -2011
  230. package/skills/meta-capability/references/api-contracts/instagram-v3-api.md +0 -2630
  231. package/skills/meta-capability/references/api-contracts/ios-shortcut.md +0 -44
  232. package/skills/meta-capability/references/api-contracts/kuaishou-app-api.md +0 -1518
  233. package/skills/meta-capability/references/api-contracts/kuaishou-web-api.md +0 -1242
  234. package/skills/meta-capability/references/api-contracts/lemon8-app-api.md +0 -1088
  235. package/skills/meta-capability/references/api-contracts/linkedin-web-api.md +0 -1949
  236. package/skills/meta-capability/references/api-contracts/media-ingest-api.md +0 -126
  237. package/skills/meta-capability/references/api-contracts/pipixia-app-api.md +0 -1142
  238. package/skills/meta-capability/references/api-contracts/reddit-app-api.md +0 -2025
  239. package/skills/meta-capability/references/api-contracts/sora2-api.md +0 -2266
  240. package/skills/meta-capability/references/api-contracts/temp-mail-api.md +0 -208
  241. package/skills/meta-capability/references/api-contracts/threads-web-api.md +0 -897
  242. package/skills/meta-capability/references/api-contracts/tikhub-downloader-api.md +0 -134
  243. package/skills/meta-capability/references/api-contracts/tikhub-user-api.md +0 -494
  244. package/skills/meta-capability/references/api-contracts/tiktok-ads-api.md +0 -5947
  245. package/skills/meta-capability/references/api-contracts/tiktok-analytics-api.md +0 -968
  246. package/skills/meta-capability/references/api-contracts/tiktok-app-v3-api.md +0 -5735
  247. package/skills/meta-capability/references/api-contracts/tiktok-creator-api.md +0 -1951
  248. package/skills/meta-capability/references/api-contracts/tiktok-interaction-api.md +0 -742
  249. package/skills/meta-capability/references/api-contracts/tiktok-shop-web-api.md +0 -1890
  250. package/skills/meta-capability/references/api-contracts/tiktok-web-api.md +0 -4448
  251. package/skills/meta-capability/references/api-contracts/toutiao-app-api.md +0 -342
  252. package/skills/meta-capability/references/api-contracts/toutiao-web-api.md +0 -143
  253. package/skills/meta-capability/references/api-contracts/twitter-web-api.md +0 -989
  254. package/skills/meta-capability/references/api-contracts/wechat-channels-api.md +0 -809
  255. package/skills/meta-capability/references/api-contracts/wechat-media-platform-web-api.md +0 -677
  256. package/skills/meta-capability/references/api-contracts/weibo-app-api.md +0 -1547
  257. package/skills/meta-capability/references/api-contracts/weibo-web-api.md +0 -798
  258. package/skills/meta-capability/references/api-contracts/weibo-web-v2-api.md +0 -2459
  259. package/skills/meta-capability/references/api-contracts/xiaohongshu-app-api.md +0 -1291
  260. package/skills/meta-capability/references/api-contracts/xiaohongshu-app-v2-api.md +0 -1683
  261. package/skills/meta-capability/references/api-contracts/xiaohongshu-web-api.md +0 -1324
  262. package/skills/meta-capability/references/api-contracts/xiaohongshu-web-v2-api.md +0 -1209
  263. package/skills/meta-capability/references/api-contracts/xigua-app-v2-api.md +0 -489
  264. package/skills/meta-capability/references/api-contracts/youtube-web-api.md +0 -2636
  265. package/skills/meta-capability/references/api-contracts/youtube-web-v2-api.md +0 -2660
  266. package/skills/meta-capability/references/api-contracts/zhihu-web-api.md +0 -2315
  267. package/skills/meta-capability/references/api-tags/asr-api.md +0 -100
  268. package/skills/meta-capability/references/api-tags/bilibili-app-api.md +0 -482
  269. package/skills/meta-capability/references/api-tags/bilibili-web-api.md +0 -1267
  270. package/skills/meta-capability/references/api-tags/demo-api.md +0 -365
  271. package/skills/meta-capability/references/api-tags/douyin-app-v3-api.md +0 -2012
  272. package/skills/meta-capability/references/api-tags/douyin-billboard-api.md +0 -1428
  273. package/skills/meta-capability/references/api-tags/douyin-creator-api.md +0 -694
  274. package/skills/meta-capability/references/api-tags/douyin-creator-v2-api.md +0 -694
  275. package/skills/meta-capability/references/api-tags/douyin-search-api.md +0 -1059
  276. package/skills/meta-capability/references/api-tags/douyin-web-api.md +0 -3314
  277. package/skills/meta-capability/references/api-tags/douyin-xingtu-api.md +0 -935
  278. package/skills/meta-capability/references/api-tags/douyin-xingtu-v2-api.md +0 -925
  279. package/skills/meta-capability/references/api-tags/health-check.md +0 -40
  280. package/skills/meta-capability/references/api-tags/hybrid-parsing.md +0 -57
  281. package/skills/meta-capability/references/api-tags/instagram-v1-api.md +0 -1224
  282. package/skills/meta-capability/references/api-tags/instagram-v2-api.md +0 -1147
  283. package/skills/meta-capability/references/api-tags/instagram-v3-api.md +0 -1123
  284. package/skills/meta-capability/references/api-tags/ios-shortcut.md +0 -45
  285. package/skills/meta-capability/references/api-tags/kuaishou-app-api.md +0 -846
  286. package/skills/meta-capability/references/api-tags/kuaishou-web-api.md +0 -551
  287. package/skills/meta-capability/references/api-tags/lemon8-app-api.md +0 -687
  288. package/skills/meta-capability/references/api-tags/linkedin-web-api.md +0 -1105
  289. package/skills/meta-capability/references/api-tags/media-ingest-api.md +0 -112
  290. package/skills/meta-capability/references/api-tags/pipixia-app-api.md +0 -721
  291. package/skills/meta-capability/references/api-tags/reddit-app-api.md +0 -1057
  292. package/skills/meta-capability/references/api-tags/sora2-api.md +0 -737
  293. package/skills/meta-capability/references/api-tags/temp-mail-api.md +0 -136
  294. package/skills/meta-capability/references/api-tags/threads-web-api.md +0 -472
  295. package/skills/meta-capability/references/api-tags/tikhub-downloader-api.md +0 -65
  296. package/skills/meta-capability/references/api-tags/tikhub-user-api.md +0 -253
  297. package/skills/meta-capability/references/api-tags/tiktok-ads-api.md +0 -1393
  298. package/skills/meta-capability/references/api-tags/tiktok-analytics-api.md +0 -179
  299. package/skills/meta-capability/references/api-tags/tiktok-app-v3-api.md +0 -3264
  300. package/skills/meta-capability/references/api-tags/tiktok-creator-api.md +0 -709
  301. package/skills/meta-capability/references/api-tags/tiktok-interaction-api.md +0 -366
  302. package/skills/meta-capability/references/api-tags/tiktok-shop-web-api.md +0 -663
  303. package/skills/meta-capability/references/api-tags/tiktok-web-api.md +0 -2516
  304. package/skills/meta-capability/references/api-tags/toutiao-app-api.md +0 -220
  305. package/skills/meta-capability/references/api-tags/toutiao-web-api.md +0 -96
  306. package/skills/meta-capability/references/api-tags/twitter-web-api.md +0 -562
  307. package/skills/meta-capability/references/api-tags/wechat-channels-api.md +0 -405
  308. package/skills/meta-capability/references/api-tags/wechat-media-platform-web-api.md +0 -431
  309. package/skills/meta-capability/references/api-tags/weibo-app-api.md +0 -851
  310. package/skills/meta-capability/references/api-tags/weibo-web-api.md +0 -470
  311. package/skills/meta-capability/references/api-tags/weibo-web-v2-api.md +0 -1405
  312. package/skills/meta-capability/references/api-tags/xiaohongshu-app-api.md +0 -534
  313. package/skills/meta-capability/references/api-tags/xiaohongshu-app-v2-api.md +0 -934
  314. package/skills/meta-capability/references/api-tags/xiaohongshu-web-api.md +0 -757
  315. package/skills/meta-capability/references/api-tags/xiaohongshu-web-v2-api.md +0 -762
  316. package/skills/meta-capability/references/api-tags/xigua-app-v2-api.md +0 -308
  317. package/skills/meta-capability/references/api-tags/youtube-web-api.md +0 -934
  318. package/skills/meta-capability/references/api-tags/youtube-web-v2-api.md +0 -717
  319. package/skills/meta-capability/references/api-tags/zhihu-web-api.md +0 -1384
  320. package/skills/meta-capability/references/config-templates/defaults.yaml +0 -18
  321. package/skills/meta-capability/references/dispatch.md +0 -27
  322. package/skills/meta-capability/references/execution-guidelines.md +0 -25
  323. package/skills/meta-capability/references/implemented-route-map.md +0 -177
  324. package/skills/meta-capability/references/service-guides/asr-u2-u3-fallback.md +0 -75
  325. package/skills/meta-capability/scripts/__init__.py +0 -1
  326. package/skills/meta-capability/scripts/call_route.py +0 -141
  327. package/skills/meta-capability/scripts/core/__init__.py +0 -1
  328. package/skills/meta-capability/scripts/core/bootstrap_env.py +0 -32
  329. package/skills/meta-capability/scripts/core/config_loader.py +0 -204
  330. package/skills/meta-capability/scripts/core/tikomni_common.py +0 -443
  331. package/skills/meta-capability/scripts/test_auth.py +0 -98
  332. package/skills/single-work-analysis/SKILL.md +0 -62
  333. package/skills/single-work-analysis/agents/openai.yaml +0 -4
  334. package/skills/single-work-analysis/env.example +0 -36
  335. package/skills/single-work-analysis/references/api-capability-index.md +0 -92
  336. package/skills/single-work-analysis/references/api-contracts/asr-api.md +0 -130
  337. package/skills/single-work-analysis/references/api-contracts/bilibili-app-api.md +0 -776
  338. package/skills/single-work-analysis/references/api-contracts/bilibili-web-api.md +0 -2017
  339. package/skills/single-work-analysis/references/api-contracts/demo-api.md +0 -717
  340. package/skills/single-work-analysis/references/api-contracts/douyin-app-v3-api.md +0 -3594
  341. package/skills/single-work-analysis/references/api-contracts/douyin-billboard-api.md +0 -2274
  342. package/skills/single-work-analysis/references/api-contracts/douyin-creator-api.md +0 -1575
  343. package/skills/single-work-analysis/references/api-contracts/douyin-creator-v2-api.md +0 -3254
  344. package/skills/single-work-analysis/references/api-contracts/douyin-search-api.md +0 -4118
  345. package/skills/single-work-analysis/references/api-contracts/douyin-web-api.md +0 -5544
  346. package/skills/single-work-analysis/references/api-contracts/douyin-xingtu-api.md +0 -1916
  347. package/skills/single-work-analysis/references/api-contracts/douyin-xingtu-v2-api.md +0 -1540
  348. package/skills/single-work-analysis/references/api-contracts/health-check.md +0 -69
  349. package/skills/single-work-analysis/references/api-contracts/hybrid-parsing.md +0 -78
  350. package/skills/single-work-analysis/references/api-contracts/instagram-v1-api.md +0 -2256
  351. package/skills/single-work-analysis/references/api-contracts/instagram-v2-api.md +0 -2011
  352. package/skills/single-work-analysis/references/api-contracts/instagram-v3-api.md +0 -2630
  353. package/skills/single-work-analysis/references/api-contracts/ios-shortcut.md +0 -44
  354. package/skills/single-work-analysis/references/api-contracts/kuaishou-app-api.md +0 -1518
  355. package/skills/single-work-analysis/references/api-contracts/kuaishou-web-api.md +0 -1242
  356. package/skills/single-work-analysis/references/api-contracts/lemon8-app-api.md +0 -1088
  357. package/skills/single-work-analysis/references/api-contracts/linkedin-web-api.md +0 -1949
  358. package/skills/single-work-analysis/references/api-contracts/media-ingest-api.md +0 -126
  359. package/skills/single-work-analysis/references/api-contracts/pipixia-app-api.md +0 -1142
  360. package/skills/single-work-analysis/references/api-contracts/reddit-app-api.md +0 -2025
  361. package/skills/single-work-analysis/references/api-contracts/sora2-api.md +0 -2266
  362. package/skills/single-work-analysis/references/api-contracts/temp-mail-api.md +0 -208
  363. package/skills/single-work-analysis/references/api-contracts/threads-web-api.md +0 -897
  364. package/skills/single-work-analysis/references/api-contracts/tikhub-downloader-api.md +0 -134
  365. package/skills/single-work-analysis/references/api-contracts/tikhub-user-api.md +0 -494
  366. package/skills/single-work-analysis/references/api-contracts/tiktok-ads-api.md +0 -5947
  367. package/skills/single-work-analysis/references/api-contracts/tiktok-analytics-api.md +0 -968
  368. package/skills/single-work-analysis/references/api-contracts/tiktok-app-v3-api.md +0 -5735
  369. package/skills/single-work-analysis/references/api-contracts/tiktok-creator-api.md +0 -1951
  370. package/skills/single-work-analysis/references/api-contracts/tiktok-interaction-api.md +0 -742
  371. package/skills/single-work-analysis/references/api-contracts/tiktok-shop-web-api.md +0 -1890
  372. package/skills/single-work-analysis/references/api-contracts/tiktok-web-api.md +0 -4448
  373. package/skills/single-work-analysis/references/api-contracts/toutiao-app-api.md +0 -342
  374. package/skills/single-work-analysis/references/api-contracts/toutiao-web-api.md +0 -143
  375. package/skills/single-work-analysis/references/api-contracts/twitter-web-api.md +0 -989
  376. package/skills/single-work-analysis/references/api-contracts/wechat-channels-api.md +0 -809
  377. package/skills/single-work-analysis/references/api-contracts/wechat-media-platform-web-api.md +0 -677
  378. package/skills/single-work-analysis/references/api-contracts/weibo-app-api.md +0 -1547
  379. package/skills/single-work-analysis/references/api-contracts/weibo-web-api.md +0 -798
  380. package/skills/single-work-analysis/references/api-contracts/weibo-web-v2-api.md +0 -2459
  381. package/skills/single-work-analysis/references/api-contracts/xiaohongshu-app-api.md +0 -1291
  382. package/skills/single-work-analysis/references/api-contracts/xiaohongshu-app-v2-api.md +0 -1683
  383. package/skills/single-work-analysis/references/api-contracts/xiaohongshu-web-api.md +0 -1324
  384. package/skills/single-work-analysis/references/api-contracts/xiaohongshu-web-v2-api.md +0 -1209
  385. package/skills/single-work-analysis/references/api-contracts/xigua-app-v2-api.md +0 -489
  386. package/skills/single-work-analysis/references/api-contracts/youtube-web-api.md +0 -2636
  387. package/skills/single-work-analysis/references/api-contracts/youtube-web-v2-api.md +0 -2660
  388. package/skills/single-work-analysis/references/api-contracts/zhihu-web-api.md +0 -2315
  389. package/skills/single-work-analysis/references/api-tags/asr-api.md +0 -100
  390. package/skills/single-work-analysis/references/api-tags/bilibili-app-api.md +0 -482
  391. package/skills/single-work-analysis/references/api-tags/bilibili-web-api.md +0 -1267
  392. package/skills/single-work-analysis/references/api-tags/demo-api.md +0 -365
  393. package/skills/single-work-analysis/references/api-tags/douyin-app-v3-api.md +0 -2012
  394. package/skills/single-work-analysis/references/api-tags/douyin-billboard-api.md +0 -1428
  395. package/skills/single-work-analysis/references/api-tags/douyin-creator-api.md +0 -694
  396. package/skills/single-work-analysis/references/api-tags/douyin-creator-v2-api.md +0 -694
  397. package/skills/single-work-analysis/references/api-tags/douyin-search-api.md +0 -1059
  398. package/skills/single-work-analysis/references/api-tags/douyin-web-api.md +0 -3314
  399. package/skills/single-work-analysis/references/api-tags/douyin-xingtu-api.md +0 -935
  400. package/skills/single-work-analysis/references/api-tags/douyin-xingtu-v2-api.md +0 -925
  401. package/skills/single-work-analysis/references/api-tags/health-check.md +0 -40
  402. package/skills/single-work-analysis/references/api-tags/hybrid-parsing.md +0 -57
  403. package/skills/single-work-analysis/references/api-tags/instagram-v1-api.md +0 -1224
  404. package/skills/single-work-analysis/references/api-tags/instagram-v2-api.md +0 -1147
  405. package/skills/single-work-analysis/references/api-tags/instagram-v3-api.md +0 -1123
  406. package/skills/single-work-analysis/references/api-tags/ios-shortcut.md +0 -45
  407. package/skills/single-work-analysis/references/api-tags/kuaishou-app-api.md +0 -846
  408. package/skills/single-work-analysis/references/api-tags/kuaishou-web-api.md +0 -551
  409. package/skills/single-work-analysis/references/api-tags/lemon8-app-api.md +0 -687
  410. package/skills/single-work-analysis/references/api-tags/linkedin-web-api.md +0 -1105
  411. package/skills/single-work-analysis/references/api-tags/media-ingest-api.md +0 -112
  412. package/skills/single-work-analysis/references/api-tags/pipixia-app-api.md +0 -721
  413. package/skills/single-work-analysis/references/api-tags/reddit-app-api.md +0 -1057
  414. package/skills/single-work-analysis/references/api-tags/sora2-api.md +0 -737
  415. package/skills/single-work-analysis/references/api-tags/temp-mail-api.md +0 -136
  416. package/skills/single-work-analysis/references/api-tags/threads-web-api.md +0 -472
  417. package/skills/single-work-analysis/references/api-tags/tikhub-downloader-api.md +0 -65
  418. package/skills/single-work-analysis/references/api-tags/tikhub-user-api.md +0 -253
  419. package/skills/single-work-analysis/references/api-tags/tiktok-ads-api.md +0 -1393
  420. package/skills/single-work-analysis/references/api-tags/tiktok-analytics-api.md +0 -179
  421. package/skills/single-work-analysis/references/api-tags/tiktok-app-v3-api.md +0 -3264
  422. package/skills/single-work-analysis/references/api-tags/tiktok-creator-api.md +0 -709
  423. package/skills/single-work-analysis/references/api-tags/tiktok-interaction-api.md +0 -366
  424. package/skills/single-work-analysis/references/api-tags/tiktok-shop-web-api.md +0 -663
  425. package/skills/single-work-analysis/references/api-tags/tiktok-web-api.md +0 -2516
  426. package/skills/single-work-analysis/references/api-tags/toutiao-app-api.md +0 -220
  427. package/skills/single-work-analysis/references/api-tags/toutiao-web-api.md +0 -96
  428. package/skills/single-work-analysis/references/api-tags/twitter-web-api.md +0 -562
  429. package/skills/single-work-analysis/references/api-tags/wechat-channels-api.md +0 -405
  430. package/skills/single-work-analysis/references/api-tags/wechat-media-platform-web-api.md +0 -431
  431. package/skills/single-work-analysis/references/api-tags/weibo-app-api.md +0 -851
  432. package/skills/single-work-analysis/references/api-tags/weibo-web-api.md +0 -470
  433. package/skills/single-work-analysis/references/api-tags/weibo-web-v2-api.md +0 -1405
  434. package/skills/single-work-analysis/references/api-tags/xiaohongshu-app-api.md +0 -534
  435. package/skills/single-work-analysis/references/api-tags/xiaohongshu-app-v2-api.md +0 -934
  436. package/skills/single-work-analysis/references/api-tags/xiaohongshu-web-api.md +0 -757
  437. package/skills/single-work-analysis/references/api-tags/xiaohongshu-web-v2-api.md +0 -762
  438. package/skills/single-work-analysis/references/api-tags/xigua-app-v2-api.md +0 -308
  439. package/skills/single-work-analysis/references/api-tags/youtube-web-api.md +0 -934
  440. package/skills/single-work-analysis/references/api-tags/youtube-web-v2-api.md +0 -717
  441. package/skills/single-work-analysis/references/api-tags/zhihu-web-api.md +0 -1384
  442. package/skills/single-work-analysis/references/asr-and-fallback.md +0 -20
  443. package/skills/single-work-analysis/references/config-templates/defaults.yaml +0 -58
  444. package/skills/single-work-analysis/references/contracts/work-card-fields.md +0 -41
  445. package/skills/single-work-analysis/references/platform-guides/douyin.md +0 -47
  446. package/skills/single-work-analysis/references/platform-guides/generic.md +0 -43
  447. package/skills/single-work-analysis/references/platform-guides/xiaohongshu.md +0 -54
  448. package/skills/single-work-analysis/references/prompt-contracts/asr-clean.md +0 -28
  449. package/skills/single-work-analysis/references/prompt-contracts/cta.md +0 -24
  450. package/skills/single-work-analysis/references/prompt-contracts/hook.md +0 -25
  451. package/skills/single-work-analysis/references/prompt-contracts/insight.md +0 -47
  452. package/skills/single-work-analysis/references/prompt-contracts/structure.md +0 -25
  453. package/skills/single-work-analysis/references/prompt-contracts/style.md +0 -27
  454. package/skills/single-work-analysis/references/prompt-contracts/summary.md +0 -29
  455. package/skills/single-work-analysis/references/prompt-contracts/topic.md +0 -29
  456. package/skills/single-work-analysis/references/schemas/work-card.schema.json +0 -39
  457. package/skills/single-work-analysis/references/service-guides/asr-u2-u3-fallback.md +0 -75
  458. package/skills/single-work-analysis/scripts/__init__.py +0 -0
  459. package/skills/single-work-analysis/scripts/core/__init__.py +0 -0
  460. package/skills/single-work-analysis/scripts/core/analysis_pipeline.py +0 -133
  461. package/skills/single-work-analysis/scripts/core/bootstrap_env.py +0 -35
  462. package/skills/single-work-analysis/scripts/core/extract_pipeline.py +0 -173
  463. package/skills/single-work-analysis/scripts/core/storage_router.py +0 -253
  464. package/skills/single-work-analysis/scripts/core/tikomni_common.py +0 -588
  465. package/skills/single-work-analysis/scripts/pipeline/__init__.py +0 -0
  466. package/skills/single-work-analysis/scripts/pipeline/asr/__init__.py +0 -0
  467. package/skills/single-work-analysis/scripts/pipeline/asr/asr_pipeline.py +0 -1189
  468. package/skills/single-work-analysis/scripts/pipeline/asr/poll_u2_task.py +0 -95
  469. package/skills/single-work-analysis/scripts/platform/__init__.py +0 -0
  470. package/skills/single-work-analysis/scripts/platform/douyin/__init__.py +0 -0
  471. package/skills/single-work-analysis/scripts/platform/douyin/douyin_video_type_matrix.py +0 -224
  472. package/skills/single-work-analysis/scripts/platform/douyin/select_low_quality_video_url.py +0 -200
  473. package/skills/single-work-analysis/scripts/platform/xiaohongshu/__init__.py +0 -0
  474. package/skills/single-work-analysis/scripts/writers/__init__.py +0 -0
  475. package/skills/single-work-analysis/scripts/writers/write_benchmark_card.py +0 -1402
  476. /package/skills/{creator-analysis → social-media-crawl}/scripts/core/bootstrap_env.py +0 -0
  477. /package/skills/{creator-analysis → social-media-crawl}/scripts/core/extract_pipeline.py +0 -0
  478. /package/skills/{creator-analysis/scripts/platform/douyin → social-media-crawl/scripts/pipelines}/douyin_video_type_matrix.py +0 -0
  479. /package/skills/{creator-analysis/scripts/author_home/collectors → social-media-crawl/scripts/pipelines}/homepage_collectors.py +0 -0
  480. /package/skills/{creator-analysis/scripts/platform/douyin → social-media-crawl/scripts/pipelines}/select_low_quality_video_url.py +0 -0
@@ -1,2128 +0,0 @@
1
#!/usr/bin/env python3

# Direct-run bootstrap: when this file is executed as a plain script (no
# package context), walk upward from the file's location until a directory
# containing scripts/core/bootstrap_env.py is found and prepend it to
# sys.path, so the absolute "scripts.*" imports below can resolve.
if __package__ in {None, ""}:
    import sys
    from pathlib import Path

    _self = Path(__file__).resolve()
    for _parent in _self.parents:
        if (_parent / "scripts" / "core" / "bootstrap_env.py").is_file():
            sys.path.insert(0, str(_parent))
            break

# NOTE(review): because executable code precedes it, this string literal is
# NOT treated as the module docstring — it is a bare expression statement;
# confirm whether it should move to the top of the file.
"""Xiaohongshu extraction: APP V2 -> APP V1 -> WEB_V2 -> WEB."""

from scripts.core.bootstrap_env import bootstrap_for_direct_run

# Completes environment setup for direct execution (see bootstrap_env).
bootstrap_for_direct_run(__file__, __package__)
18
-
19
- import argparse
20
- import hashlib
21
- import json
22
- import re
23
- import urllib.parse
24
- import urllib.request
25
- from datetime import datetime
26
- from pathlib import Path
27
- from typing import Any, Dict, List, Optional, Tuple
28
-
29
- from scripts.pipeline.asr.asr_pipeline import run_u2_asr_candidates_with_timeout_retry
30
- from scripts.core.config_loader import config_get, load_tikomni_config, resolve_storage_paths
31
- from scripts.core.progress_report import ProgressReporter
32
- from scripts.core.storage_router import render_output_filename, resolve_json_filename_pattern
33
- from scripts.core.extract_pipeline import build_api_trace, resolve_trace_error_context
34
- from scripts.core.tikomni_common import (
35
- call_json_api,
36
- deep_find_all,
37
- deep_find_first,
38
- normalize_text,
39
- resolve_runtime,
40
- summarize_content,
41
- write_json_stdout,
42
- )
43
- from scripts.writers.write_benchmark_card import write_benchmark_card
44
-
45
# Xiaohongshu note-detail endpoints, listed in the probe order described by
# the module docstring: APP V2 (one endpoint per note content type), then
# APP V1, then WEB_V2 (feed v2 / v3), then WEB v7.
APP_V2_VIDEO_ENDPOINT = "/api/u1/v1/xiaohongshu/app_v2/get_video_note_detail"
APP_V2_IMAGE_ENDPOINT = "/api/u1/v1/xiaohongshu/app_v2/get_image_note_detail"
APP_V2_MIXED_ENDPOINT = "/api/u1/v1/xiaohongshu/app_v2/get_mixed_note_detail"
APP_V1_ENDPOINT = "/api/u1/v1/xiaohongshu/app/get_note_info"
WEB_V2_V2_ENDPOINT = "/api/u1/v1/xiaohongshu/web_v2/fetch_feed_notes_v2"
WEB_V2_V3_ENDPOINT = "/api/u1/v1/xiaohongshu/web_v2/fetch_feed_notes_v3"
WEB_ENDPOINT = "/api/u1/v1/xiaohongshu/web/get_note_info_v7"
# U2 ASR gate bounds (enforced by _evaluate_u2_gate_for_xhs): a note passes
# only when it is video/mixed, strictly longer than 13s, at most 30min, and
# exposes a downloadable video URL.
U2_GATE_MIN_DURATION_MS = 13000  # 13 seconds (exclusive lower bound)
U2_GATE_MAX_DURATION_MS = 1800000  # 30 minutes (inclusive upper bound)
U2_GATE_RULE = "is_video && 13000<duration_ms<=1800000 && video_down_url_present"
55
-
56
-
57
def _to_int_or_none(value: Any) -> Optional[int]:
    """Best-effort coercion of *value* to a positive int.

    Booleans pass through as 0/1; other numbers and numeric strings (commas
    allowed as thousands separators) are truncated to int and returned only
    when strictly positive. Anything unparsable yields None.
    """
    try:
        if isinstance(value, bool):
            return int(value)
        if isinstance(value, (int, float)):
            number = int(value)
        else:
            cleaned = normalize_text(value)
            if not cleaned:
                return None
            number = int(float(cleaned.replace(",", "")))
        return number if number > 0 else None
    except Exception:
        # Deliberate best-effort: any parse failure maps to "no value".
        return None
71
-
72
-
73
def _evaluate_u2_gate_for_xhs(*, note_content_type: str, duration_ms: Any, video_down_url: Optional[str]) -> Dict[str, Any]:
    """Decide whether a note qualifies for the U2 ASR step.

    A note passes only when it is video-like ("video"/"mixed"), its duration
    sits in (U2_GATE_MIN_DURATION_MS, U2_GATE_MAX_DURATION_MS], and a
    downloadable video URL is present. Returns a dict with the verdict,
    the first failing reason, and the normalized inputs.
    """
    kind = normalize_text(note_content_type).lower()
    video_like = kind in {"video", "mixed"}
    duration = _to_int_or_none(duration_ms)
    down_url = normalize_text(video_down_url)

    # Ordered checks; the first failure determines the reported reason.
    checks = (
        (not video_like, "skip:not_video"),
        (duration is None, "skip:duration_missing"),
        (duration is not None and duration <= U2_GATE_MIN_DURATION_MS, "skip:duration_too_short"),
        (duration is not None and duration > U2_GATE_MAX_DURATION_MS, "skip:duration_too_long"),
        (not down_url, "skip:video_down_url_missing"),
    )
    gate_reason = next((label for failed, label in checks if failed), "pass")

    return {
        "can_u2": gate_reason == "pass",
        "gate_reason": gate_reason,
        "is_video": video_like,
        "duration_ms": duration,
        "video_down_url": down_url,
        "video_down_url_present": bool(down_url),
    }
100
-
101
-
102
def _safe_slug(value: Optional[str], fallback: str = "unknown") -> str:
    """Turn *value* into a filesystem-safe slug, or *fallback* if empty.

    Non [a-zA-Z0-9_-] runs collapse to a single dash; the result is
    dash-trimmed, lowercased, and capped at 64 characters.
    """
    cleaned = normalize_text(value)
    if not cleaned:
        return fallback
    candidate = re.sub(r"[^a-zA-Z0-9_-]+", "-", cleaned)
    candidate = candidate.strip("-").lower()[:64]
    return candidate if candidate else fallback
108
-
109
-
110
def _traceable_identifier(source_input: Dict[str, Optional[str]], note_id: Optional[str]) -> str:
    """Build a stable identifier for output filenames.

    Prefers the slugged note id; otherwise derives a short SHA-1 digest of
    the share text; falls back to "missing_input" when neither exists.
    """
    if note_id:
        return _safe_slug(note_id)
    share_text = normalize_text(source_input.get("share_text"))
    if share_text:
        digest = hashlib.sha1(share_text.encode("utf-8")).hexdigest()
        return f"url-{digest[:10]}"
    return "missing_input"
118
-
119
-
120
def _build_persist_payload(
    *,
    result: Dict[str, Any],
    source_input: Dict[str, Optional[str]],
    note_id: Optional[str],
    status: str,
    written_at: datetime,
) -> Dict[str, Any]:
    """Assemble the JSON artifact persisted for a single-work extraction.

    Layout: "meta" (write metadata), "summary" (analysis digest),
    "normalized" (canonical note fields), "raw" (the untouched result).
    """
    return {
        "meta": {
            "written_at": written_at.isoformat(timespec="seconds"),
            "status": status,
            "platform": "xiaohongshu",
            "identifier": _traceable_identifier(source_input, note_id),
        },
        "summary": {
            "summary": result.get("summary", ""),
            "insights": result.get("insights", []),
            "confidence": result.get("confidence"),
            "error_reason": result.get("error_reason"),
        },
        "normalized": {
            "platform": "xiaohongshu",
            "content_kind": result.get("content_kind", "note"),
            # Prefer the id resolved during extraction over the caller's hint.
            "note_id": result.get("note_id") or note_id,
            "note_content_type": result.get("note_content_type"),
            "text_source": result.get("text_source"),
            "request_id": result.get("request_id"),
            "source": source_input,
        },
        "raw": result,
    }
154
-
155
-
156
def _persist_output_artifact(
    *,
    result: Dict[str, Any],
    source_input: Dict[str, Optional[str]],
    note_id: Optional[str],
    storage_config: Optional[Dict[str, Any]],
    persist_output: bool,
) -> Dict[str, Any]:
    """Write the extraction result to disk as a dated JSON artifact.

    Returns a small status dict describing what happened (disabled, failed
    to resolve storage paths, or the written file's path). Results with an
    ``error_reason`` go under the errors root; everything else under the
    results root, each in a YYYYMMDD subdirectory.
    """
    if not persist_output:
        return {"enabled": False, "skipped": True, "reason": "disabled_by_flag"}

    try:
        paths = resolve_storage_paths(storage_config or {})
    except Exception as error:
        # Persisting is best-effort: report the failure instead of raising.
        return {"enabled": True, "ok": False, "error": f"resolve_storage_paths_failed:{error}"}

    now = datetime.now()  # NOTE(review): local time, not UTC — confirm intended.
    date_key = now.strftime("%Y%m%d")
    timestamp = now.strftime("%Y%m%dT%H%M%S")
    identifier = _traceable_identifier(source_input, note_id)
    has_error = bool(result.get("error_reason"))
    status = "error" if has_error else "success"

    # Route errored results to a separate tree so successes stay clean.
    if has_error:
        target_dir = Path(paths.get("errors_root", "")) / date_key
    else:
        target_dir = Path(paths.get("results_root", "")) / date_key

    target_dir.mkdir(parents=True, exist_ok=True)
    file_name = render_output_filename(
        pattern=resolve_json_filename_pattern(storage_config),
        context={
            "prefix": status,
            "platform": "xiaohongshu",
            "card_type": "single_work_result",
            # No separate author/title slugs here; reuse the identifier.
            "author_slug": identifier,
            "title_slug": identifier,
            "identifier": identifier,
            "timestamp": timestamp,
            "date": date_key,
            "ext": ".json",
        },
        default_filename=f"{timestamp}-xiaohongshu-{identifier}.json",
        default_ext=".json",
    )
    file_path = target_dir / file_name

    payload = _build_persist_payload(
        result=result,
        source_input=source_input,
        note_id=note_id,
        status=status,
        written_at=now,
    )
    file_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")

    return {
        "enabled": True,
        "ok": True,
        "status": status,
        "path": str(file_path),
    }
218
-
219
-
220
def _finalize_result(
    *,
    result: Dict[str, Any],
    source_input: Dict[str, Optional[str]],
    note_id: Optional[str],
    storage_config: Optional[Dict[str, Any]],
    persist_output: bool,
) -> Dict[str, Any]:
    """Attach the persistence outcome to *result* and return it (mutated)."""
    persist_info = _persist_output_artifact(
        result=result,
        source_input=source_input,
        note_id=note_id,
        storage_config=storage_config,
        persist_output=persist_output,
    )
    result["output_persist"] = persist_info
    return result
236
-
237
-
238
def _normalize_input(input_value: Optional[str], share_text: Optional[str], note_id: Optional[str]) -> Dict[str, Optional[str]]:
    """Normalize the three possible CLI inputs into {share_text, note_id}.

    When only the generic *input_value* is given, it is classified by shape:
    http(s) URLs become the share text, anything else the note id.
    """
    share = normalize_text(share_text) or None
    resolved_id = normalize_text(note_id) or None

    if input_value and share is None and resolved_id is None:
        candidate = input_value.strip()
        if candidate.startswith(("http://", "https://")):
            share = candidate
        else:
            resolved_id = candidate

    return {"share_text": share, "note_id": resolved_id}
253
-
254
-
255
- def _extract_note_id_from_share(share_text: Optional[str]) -> Optional[str]:
256
- if not share_text:
257
- return None
258
- text = share_text.strip()
259
- patterns = [
260
- r"/explore/([0-9a-zA-Z]+)",
261
- r"/discovery/item/([0-9a-zA-Z]+)",
262
- r"note_id=([0-9a-zA-Z]+)",
263
- ]
264
- for pattern in patterns:
265
- match = re.search(pattern, text)
266
- if match:
267
- return match.group(1)
268
- return None
269
-
270
-
271
def _resolve_note_id(payload: Any, source_input: Dict[str, Optional[str]]) -> Optional[str]:
    """Resolve the note id from the best available source, in priority order.

    Order: caller-supplied note_id, canonical id keys anywhere in the
    payload, ids parsed from webpage/share URLs in the payload, and finally
    the id parsed from the original share text. Returns None if all fail.
    """
    # Priority 1: explicit source input
    if source_input.get("note_id"):
        return source_input.get("note_id")

    # Priority 2: canonical keys from payload
    for key in ["note_id", "noteid", "item_id", "itemId"]:
        value = deep_find_first(payload, [key])
        text = normalize_text(value)
        # Length guard: only accept plausibly full-length ids — short hits
        # are likely unrelated numeric fields. (Heuristic threshold.)
        if text and len(text) >= 16:
            return text

    # Priority 3: parse from canonical webpage URLs
    for key in ["webpage_url", "share_url", "url"]:
        values = deep_find_all(payload, [key])
        for value in values:
            text = normalize_text(value)
            hit = _extract_note_id_from_share(text)
            if hit:
                return hit

    # Priority 4: parse from source share text
    hit = _extract_note_id_from_share(source_input.get("share_text"))
    if hit:
        return hit

    return None
298
-
299
-
300
- def _is_short_share_url(share_text: Optional[str]) -> bool:
301
- if not share_text:
302
- return False
303
- try:
304
- host = urllib.parse.urlparse(share_text).netloc.lower()
305
- except Exception:
306
- return False
307
- return "xhslink.com" in host
308
-
309
-
310
def _app_response_has_core_fields(response_data: Any) -> bool:
    """Check whether an APP response already carries subtitle or video data.

    For the APP-first strategy, a response with only weak image frames (no
    subtitle and no video core) should trigger further WEB_V2 probing to
    improve media fidelity.
    """
    has_subtitle = bool(
        _extract_subtitle_inline_text(response_data) or _extract_subtitle_urls(response_data)
    )
    has_video = bool(_extract_video_candidates(response_data))
    return has_subtitle or has_video
316
-
317
-
318
def _route_field_completeness(payload: Any, source_input: Dict[str, Optional[str]]) -> Dict[str, Any]:
    """Score how completely a route's payload fills the six tracked fields.

    Fields checked: note_id, title_or_desc, author, media, subtitle, metrics.
    Returns the per-field hit map, fill count/ratio, the list of missing
    *core* fields (note_id, title_or_desc, media), and a ``core_ready`` flag
    used by routing to decide whether this response suffices.
    """
    note_id_hit = bool(_resolve_note_id(payload, source_input))
    title_hit = bool(
        _pick_text_from_paths(
            payload,
            [["title"], ["desc"], ["content"], ["note", "title"], ["note", "desc"], ["note", "content"]],
        )
    )
    author_hit = bool(
        _pick_text_from_paths(
            payload,
            [
                ["nickname"],
                ["author_nickname"],
                ["user_nickname"],
                ["author", "nickname"],
                ["user", "nickname"],
                ["author", "name"],
                ["user", "name"],
            ],
        )
    )
    # Media counts if either video or image candidates exist.
    media_hit = bool(_extract_video_candidates(payload) or _extract_image_candidates(payload))
    subtitle_hit = bool(_extract_subtitle_inline_text(payload)) or bool(_extract_subtitle_urls(payload))
    # Metrics: any single positive engagement counter is enough.
    metrics_hit = any(
        _pick_int_from_paths(payload, [path], prefer_positive=True) is not None
        for path in (
            ["digg_count"],
            ["liked_count"],
            ["like_count"],
            ["comment_count"],
            ["collect_count"],
            ["share_count"],
            ["view_count"],
            ["play_count"],
        )
    )

    fields = {
        "note_id": note_id_hit,
        "title_or_desc": title_hit,
        "author": author_hit,
        "media": media_hit,
        "subtitle": subtitle_hit,
        "metrics": metrics_hit,
    }
    filled_count = sum(1 for hit in fields.values() if hit)
    missing_core = [key for key in ("note_id", "title_or_desc", "media") if not fields.get(key)]
    return {
        "fields": fields,
        "filled_count": filled_count,
        "total_fields": len(fields),
        "ratio": round(filled_count / max(len(fields), 1), 3),
        "missing_core": missing_core,
        "core_ready": not missing_core,
    }
374
-
375
-
376
def _route_success_for_note(response: Dict[str, Any], source_input: Dict[str, Optional[str]]) -> bool:
    """Return True when *response* is ok and its core fields are complete.

    The completeness evaluation is cached on the response under
    ``_field_completeness`` so repeated routing checks stay cheap.
    """
    if not response.get("ok"):
        return False
    cached = response.get("_field_completeness")
    if isinstance(cached, dict):
        completeness = cached
    else:
        completeness = _route_field_completeness(response.get("data"), source_input)
        response["_field_completeness"] = completeness
    return bool(completeness.get("core_ready"))
384
-
385
-
386
def _pick_text_from_paths(payload: Any, paths: List[List[str]]) -> str:
    """Return the first non-empty scalar text found under any of *paths*.

    Container hits (dict/list) are skipped; "" when nothing matches.
    """
    for candidate_path in paths:
        raw_value = deep_find_first(payload, candidate_path)
        if not isinstance(raw_value, (dict, list)):
            text = normalize_text(raw_value)
            if text:
                return text
    return ""
395
-
396
-
397
- def _to_int(value: Any) -> Optional[int]:
398
- if isinstance(value, bool):
399
- return int(value)
400
- if isinstance(value, int):
401
- return value
402
- if isinstance(value, float):
403
- return int(value)
404
- if isinstance(value, str):
405
- text = value.strip()
406
- if text.isdigit() or (text.startswith("-") and text[1:].isdigit()):
407
- return int(text)
408
- return None
409
-
410
-
411
def _extract_value_by_path(payload: Any, path: List[str]) -> Optional[Any]:
    """Depth-first search for the key sequence *path* inside a nested payload.

    Unlike a strict lookup, consecutive path segments may be separated by any
    number of intermediate dict/list levels: if a dict lacks the current key,
    the search descends into every value still looking for that same key.
    A terminal value counts only if it is non-empty (None/""/[]/{} are
    rejected). Returns the first match in traversal order, or None.
    """
    if not path:
        return None

    def _walk(node: Any, idx: int) -> Optional[Any]:
        # All segments consumed: accept the node unless it compares equal
        # to one of the "empty" sentinels.
        if idx >= len(path):
            if node in (None, "", [], {}):
                return None
            return node

        key = path[idx]
        if isinstance(node, dict):
            # Prefer an exact key hit at this level...
            if key in node:
                hit = _walk(node.get(key), idx + 1)
                if hit is not None:
                    return hit
            # ...then retry the SAME segment one level deeper in every value
            # (including node[key] again, at the unadvanced index).
            for value in node.values():
                hit = _walk(value, idx)
                if hit is not None:
                    return hit
            return None

        if isinstance(node, list):
            # Lists are transparent: scan items for the current segment.
            for item in node:
                hit = _walk(item, idx)
                if hit is not None:
                    return hit
            return None

        # Scalars cannot satisfy a remaining path segment.
        return None

    return _walk(payload, 0)
443
-
444
-
445
- def _normalize_unix_sec(value: int) -> int:
446
- # 13-digit timestamps are milliseconds.
447
- if value > 1_000_000_000_000:
448
- return value // 1000
449
- return value
450
-
451
-
452
def _pick_int_with_source_from_paths(
    payload: Any,
    paths: List[List[str]],
    *,
    prefer_positive: bool = False,
    normalize_unix_sec: bool = False,
) -> Tuple[Optional[int], str]:
    """Return the first int found under any of *paths* plus its dotted path.

    For each candidate path, the strict structured walk is tried first and
    the project-wide ``deep_find_first`` is used as a fallback. The raw hit
    is coerced via ``_to_int``; with *normalize_unix_sec*, millisecond
    epochs collapse to seconds; with *prefer_positive*, non-positive values
    are skipped. Returns ``(None, "")`` when nothing matches.
    """
    for path in paths:
        value = _extract_value_by_path(payload, path)
        if value is None:
            # Fall back to the looser project helper when the walk misses.
            value = deep_find_first(payload, path)
        parsed = _to_int(value)
        if parsed is None:
            continue
        if normalize_unix_sec:
            parsed = _normalize_unix_sec(parsed)
        if prefer_positive and parsed <= 0:
            continue
        return parsed, ".".join(path)
    return None, ""
472
-
473
-
474
def _pick_int_from_paths(
    payload: Any,
    paths: List[List[str]],
    *,
    prefer_positive: bool = False,
    normalize_unix_sec: bool = False,
) -> Optional[int]:
    """Convenience wrapper that discards the source-path half of
    ``_pick_int_with_source_from_paths``."""
    parsed, _source = _pick_int_with_source_from_paths(
        payload,
        paths,
        prefer_positive=prefer_positive,
        normalize_unix_sec=normalize_unix_sec,
    )
    return parsed
488
-
489
-
490
- def _dedupe_keep_order(values: List[str]) -> List[str]:
491
- output: List[str] = []
492
- seen = set()
493
- for value in values:
494
- if value in seen:
495
- continue
496
- seen.add(value)
497
- output.append(value)
498
- return output
499
-
500
-
501
def _clean_tag_text(value: Any) -> str:
    """Normalize a raw tag: strip surrounding '#' and a trailing topic marker.

    Returns "" for empty/non-text input.
    """
    cleaned = normalize_text(value)
    if not cleaned:
        return ""
    cleaned = cleaned.strip().strip("#")
    # Drop Xiaohongshu's trailing topic suffix, then re-trim hashes.
    cleaned = re.sub(r"\[话题\]$", "", cleaned)
    return cleaned.strip().strip("#")
509
-
510
-
511
def _append_tag(raw: Any, output: List[str], seen: set) -> None:
    """Clean *raw* and append it to *output* unless empty or already seen."""
    tag = _clean_tag_text(raw)
    if tag and tag not in seen:
        seen.add(tag)
        output.append(tag)
517
-
518
-
519
def _extract_tags_from_container(value: Any, output: List[str], seen: set) -> None:
    """Recursively harvest tag strings from *value* into *output*.

    Strings are appended directly, lists are walked element by element,
    and dicts contribute their known tag-name fields.
    """
    if isinstance(value, str):
        _append_tag(value, output, seen)
    elif isinstance(value, list):
        for entry in value:
            _extract_tags_from_container(entry, output, seen)
    elif isinstance(value, dict):
        for field in ("name", "tag_name", "topic_name", "hashtag_name"):
            _append_tag(value.get(field), output, seen)
530
-
531
-
532
def _extract_xhs_tags(payload: Any) -> List[str]:
    """Collect note tags from the payload, preferring structured tag lists.

    If any tagList/taglist/tag_list container yields tags, those win
    outright. Otherwise, fall back to topic/hashtag containers plus
    '#...#' spans scraped from desc/content text. Order of first
    appearance is preserved; duplicates are dropped.
    """
    primary_tags: List[str] = []
    primary_seen: set = set()
    for key in ("tagList", "taglist", "tag_list"):
        for value in deep_find_all(payload, [key]):
            _extract_tags_from_container(value, primary_tags, primary_seen)
    if primary_tags:
        # Structured tag lists are authoritative; skip the fallbacks.
        return primary_tags

    tags: List[str] = []
    seen: set = set()
    for key in ("topics", "hash_tag", "hashTag", "head_tags", "foot_tags"):
        for value in deep_find_all(payload, [key]):
            _extract_tags_from_container(value, tags, seen)

    # Last resort: pull '#tag#' spans out of the free-text description.
    for desc in deep_find_all(payload, ["desc", "content"]):
        if not isinstance(desc, str):
            continue
        for match in re.findall(r"#([^#\n\r]+?)#", desc):
            _append_tag(match, tags, seen)

    return tags
554
-
555
-
556
- def _build_candidate_merge_sources(*, app_candidates: List[str], enrich_candidates: List[str], app_label: str) -> List[str]:
557
- sources: List[str] = []
558
- if app_candidates:
559
- sources.append(app_label)
560
- if enrich_candidates:
561
- sources.append("web_v2_enrich")
562
- return sources
563
-
564
-
565
- def _extract_xhs_metadata(
566
- *,
567
- payload: Any,
568
- source_input: Dict[str, Optional[str]],
569
- selected_video_url: Optional[str],
570
- selected_image_urls: List[str],
571
- ) -> Dict[str, Any]:
572
- share_from_source = normalize_text(source_input.get("share_text"))
573
-
574
- title = _pick_text_from_paths(payload, [["title"], ["note", "title"], ["desc"], ["content"]])
575
- author = _pick_text_from_paths(
576
- payload,
577
- [
578
- ["nickname"],
579
- ["author_nickname"],
580
- ["user_nickname"],
581
- ["author", "nickname"],
582
- ["user", "nickname"],
583
- ["author", "name"],
584
- ["user", "name"],
585
- ],
586
- )
587
-
588
- create_time_paths = [
589
- ["create_time_sec"],
590
- ["create_time"],
591
- ["publish_time_sec"],
592
- ["publish_time"],
593
- ["time"],
594
- ["timestamp"],
595
- ["createTime"],
596
- ["publishTime"],
597
- ["note", "create_time_sec"],
598
- ["note", "create_time"],
599
- ["note", "createTime"],
600
- ["note", "publish_time_sec"],
601
- ["note", "publish_time"],
602
- ["note", "publishTime"],
603
- ["note", "time"],
604
- ["note", "timestamp"],
605
- ["note_list", "create_time_sec"],
606
- ["note_list", "create_time"],
607
- ["note_list", "createTime"],
608
- ["note_list", "publish_time_sec"],
609
- ["note_list", "publish_time"],
610
- ["note_list", "publishTime"],
611
- ["note_list", "time"],
612
- ["note_list", "timestamp"],
613
- ["noteList", "create_time_sec"],
614
- ["noteList", "create_time"],
615
- ["noteList", "createTime"],
616
- ["noteList", "publish_time_sec"],
617
- ["noteList", "publish_time"],
618
- ["noteList", "publishTime"],
619
- ["noteList", "time"],
620
- ["noteList", "timestamp"],
621
- ]
622
- create_time_sec, create_time_source = _pick_int_with_source_from_paths(
623
- payload,
624
- create_time_paths,
625
- prefer_positive=True,
626
- normalize_unix_sec=True,
627
- )
628
- duration_ms = _pick_int_from_paths(
629
- payload,
630
- [["duration_ms"], ["duration"], ["duration_sec"], ["video", "duration"], ["note", "duration"]],
631
- )
632
- if duration_ms is not None and duration_ms > 0 and duration_ms < 10000:
633
- duration_ms *= 1000
634
-
635
- share_url = _pick_text_from_paths(payload, [["share_url"], ["webpage_url"], ["url"], ["share_link"], ["share_text"]])
636
- source_url = _pick_text_from_paths(payload, [["source_url"], ["webpage_url"], ["url"], ["share_url"]])
637
- if not share_url:
638
- share_url = share_from_source
639
- if not source_url:
640
- source_url = share_url or share_from_source
641
-
642
- cover_image = _pick_text_from_paths(payload, [["cover_image"], ["cover_url"], ["cover"], ["image", "url"], ["origin_cover"]])
643
- if not cover_image and selected_image_urls:
644
- cover_image = selected_image_urls[0]
645
-
646
- video_down_url = _pick_text_from_paths(
647
- payload,
648
- [
649
- ["video_down_url"],
650
- ["original_video_url"],
651
- ["video_url"],
652
- ["play_url"],
653
- ["master_url"],
654
- ["selected_video_url"],
655
- ],
656
- )
657
- if not video_down_url:
658
- video_down_url = normalize_text(selected_video_url)
659
-
660
- xhs_user_id = _pick_text_from_paths(
661
- payload,
662
- [["author", "userid"], ["author", "user_id"], ["user", "userid"], ["user", "user_id"], ["user_id"], ["userid"], ["id"]],
663
- )
664
- author_handle = _pick_text_from_paths(
665
- payload,
666
- [["author", "red_id"], ["user", "red_id"], ["red_id"], ["author", "nickname"], ["user", "nickname"], ["nickname"]],
667
- ) or author
668
-
669
- xhs_sec_token = _pick_text_from_paths(
670
- payload,
671
- [["xhs_sec_token"], ["xsec_token"], ["xsecToken"], ["note", "xsecToken"], ["user", "xsecToken"], ["user", "xsec_token"]],
672
- )
673
- if not xhs_sec_token:
674
- for url_text in [share_url, source_url, share_from_source]:
675
- text = normalize_text(url_text)
676
- if not text:
677
- continue
678
- try:
679
- query = urllib.parse.urlparse(text).query
680
- xhs_sec_token = urllib.parse.parse_qs(query).get("xsec_token", [""])[0]
681
- except Exception:
682
- xhs_sec_token = ""
683
- if normalize_text(xhs_sec_token):
684
- break
685
-
686
- return {
687
- "title": title,
688
- "author": author,
689
- "author_handle": author_handle,
690
- "platform_author_id": xhs_user_id,
691
- "author_platform_id": xhs_user_id,
692
- "xhs_user_id": xhs_user_id,
693
- "xhs_sec_token": normalize_text(xhs_sec_token),
694
- "create_time_sec": create_time_sec,
695
- "publish_time": create_time_sec,
696
- "publish_time_source": create_time_source or "unknown",
697
- "duration_ms": duration_ms,
698
- "tags": _extract_xhs_tags(payload),
699
- "digg_count": _pick_int_from_paths(payload, [["digg_count"], ["liked_count"], ["like_count"], ["likes"]]),
700
- "comment_count": _pick_int_from_paths(payload, [["comment_count"], ["comments_count"], ["comments"]]),
701
- "collect_count": _pick_int_from_paths(payload, [["collect_count"], ["collected_count"], ["favorite_count"]]),
702
- "share_count": _pick_int_from_paths(payload, [["share_count"], ["shared_count"]]),
703
- "share_url": share_url,
704
- "source_url": source_url,
705
- "cover_image": cover_image,
706
- "video_down_url": video_down_url,
707
- }
708
-
709
-
710
def _is_sparse_metadata(metadata_fields: Dict[str, Any]) -> bool:
    """Return True when extracted note metadata is too thin to trust as-is.

    A record is sparse if any of title/author/create_time_sec is absent, or
    if every engagement counter is missing.
    """
    for text_field in ("title", "author"):
        if not normalize_text(metadata_fields.get(text_field)):
            return True
    if metadata_fields.get("create_time_sec") is None:
        return True
    engagement_keys = ("digg_count", "comment_count", "collect_count", "share_count")
    # Sparse when no engagement metric at all was populated.
    return all(metadata_fields.get(key) is None for key in engagement_keys)
719
-
720
-
721
def _append_missing_metadata_fields(missing_fields: List[Dict[str, str]], metadata_fields: Dict[str, Any]) -> None:
    """Append a `missing_metadata` entry for each absent metadata field.

    Mutates `missing_fields` in place and never duplicates a field that is
    already reported (whatever its original reason).
    """
    already_reported = {entry.get("field") for entry in missing_fields if isinstance(entry, dict)}

    def _report(field_name: str) -> None:
        if field_name not in already_reported:
            already_reported.add(field_name)
            missing_fields.append({"field": field_name, "reason": "missing_metadata"})

    # Text-valued fields count as missing when they normalize to empty.
    text_fields = (
        "title",
        "author",
        "author_handle",
        "platform_author_id",
        "xhs_user_id",
        "xhs_sec_token",
        "share_url",
        "source_url",
        "cover_image",
        "video_down_url",
    )
    for field_name in text_fields:
        if not normalize_text(metadata_fields.get(field_name)):
            _report(field_name)

    # Numeric fields count as missing only when they are literally None.
    numeric_fields = ("create_time_sec", "duration_ms", "digg_count", "comment_count", "collect_count", "share_count")
    for field_name in numeric_fields:
        if metadata_fields.get(field_name) is None:
            _report(field_name)
748
-
749
-
750
def _fetch_sparse_metadata_enrich(
    *,
    base_url: str,
    token: str,
    timeout_ms: int,
    source_input: Dict[str, Optional[str]],
    note_id: Optional[str],
) -> Dict[str, Any]:
    """Fetch a richer note payload from the WEB_V2 endpoints for sparse metadata.

    Prefers the short-share-URL route (v3); otherwise falls back to the
    note-id route (v2). Returns an error envelope when neither input exists.
    """
    share_text = source_input.get("share_text")
    resolved_note_id = note_id or source_input.get("note_id") or _extract_note_id_from_share(share_text)

    def _enrich_call(path: str, params: Dict[str, Any], label: str) -> Dict[str, Any]:
        # Tag the response so trace entries can attribute the route used.
        response = call_json_api(
            base_url=base_url,
            path=path,
            token=token,
            method="GET",
            timeout_ms=timeout_ms,
            params=params,
        )
        response["_endpoint"] = path
        response["_route_label"] = label
        return response

    if _is_short_share_url(share_text) and share_text:
        return _enrich_call(WEB_V2_V3_ENDPOINT, {"short_url": share_text}, "web_v2_v3_sparse_enrich")

    if resolved_note_id:
        return _enrich_call(WEB_V2_V2_ENDPOINT, {"note_id": resolved_note_id}, "web_v2_v2_sparse_enrich")

    return {
        "ok": False,
        "error": "missing_share_text_and_note_id_for_sparse_enrich",
        "_endpoint": None,
        "_route_label": "web_v2_sparse_enrich_skipped",
    }
793
-
794
-
795
def _fetch_note_info(*, base_url: str, token: str, timeout_ms: int, source_input: Dict[str, Optional[str]]) -> Dict[str, Any]:
    """Fetch the note payload, walking a fixed route-fallback chain.

    Route order: APP v2 (video -> image -> mixed), then APP v1, then WEB_V2
    v3 (short share URL only), then WEB_V2 v2 (note_id only), and finally
    WEB v7 as the last resort. Every attempt is recorded in ``_attempts``;
    each response is annotated with ``_endpoint``, ``_route_label``,
    ``_field_completeness`` and, for fallbacks, ``fallback_trigger_reason``.
    """
    attempts: List[Dict[str, Any]] = []

    share_text = source_input.get("share_text")
    # note_id may be given directly or recoverable from the share text.
    note_id = source_input.get("note_id") or _extract_note_id_from_share(share_text)

    def _call(path: str, params: Dict[str, Any], label: str, fallback_reason: Optional[str] = None) -> Dict[str, Any]:
        # One annotated API call; also appends itself to the attempts log.
        response = call_json_api(
            base_url=base_url,
            path=path,
            token=token,
            method="GET",
            timeout_ms=timeout_ms,
            params=params,
        )
        response["_endpoint"] = path
        response["_route_label"] = label
        if fallback_reason:
            response["fallback_trigger_reason"] = fallback_reason
        # Failed calls get an all-missing completeness stub so downstream
        # trace consumers always see the same shape.
        response["_field_completeness"] = _route_field_completeness(response.get("data"), source_input) if response.get("ok") else {
            "fields": {},
            "filled_count": 0,
            "total_fields": 0,
            "ratio": 0.0,
            "missing_core": ["note_id", "title_or_desc", "media"],
            "core_ready": False,
        }
        attempts.append({"label": label, "endpoint": path, "response": response})
        return response

    app_params: Dict[str, Any] = {}
    if share_text:
        app_params["share_text"] = share_text
    if note_id:
        app_params["note_id"] = note_id

    app_v2_attempts = [
        (APP_V2_VIDEO_ENDPOINT, "app_v2_video"),
        (APP_V2_IMAGE_ENDPOINT, "app_v2_image"),
        (APP_V2_MIXED_ENDPOINT, "app_v2_mixed"),
    ]
    next_reason: Optional[str] = None

    # Phase 1: APP v2 routes; an "ok" response can still fall through when
    # its field completeness is below threshold.
    for path, label in app_v2_attempts:
        app_v2_response = _call(path, app_params, label, fallback_reason=next_reason)
        if _route_success_for_note(app_v2_response, source_input):
            app_v2_response["_attempts"] = attempts
            return app_v2_response
        if app_v2_response.get("ok"):
            app_v2_response["fallback_trigger_reason"] = "field_completeness_below_threshold"
        next_reason = "field_completeness_below_threshold" if app_v2_response.get("ok") else (
            "primary_timeout_retry_exhausted" if app_v2_response.get("timeout_retry_exhausted") else "primary_non_timeout_failure"
        )

    # Phase 2: APP v1 with the same parameters.
    app_response = _call(APP_V1_ENDPOINT, app_params, "app_v1", fallback_reason=next_reason)
    if _route_success_for_note(app_response, source_input):
        app_response["_attempts"] = attempts
        return app_response
    if app_response.get("ok"):
        app_response["fallback_trigger_reason"] = "field_completeness_below_threshold"

    app_fallback_reason = (
        "field_completeness_below_threshold"
        if app_response.get("ok")
        else ("primary_timeout_retry_exhausted" if app_response.get("timeout_retry_exhausted") else "primary_non_timeout_failure")
    )
    is_short = _is_short_share_url(share_text)

    # Phase 3: WEB_V2 v3 only works with a short share URL.
    if is_short and share_text:
        v3_response = _call(
            WEB_V2_V3_ENDPOINT,
            {"short_url": share_text},
            "web_v2_v3_short",
            fallback_reason=app_fallback_reason,
        )
        if v3_response.get("ok"):
            v3_response["_attempts"] = attempts
            return v3_response

    # Phase 4: WEB_V2 v2 only works with a resolved note_id.
    if note_id:
        v2_response = _call(
            WEB_V2_V2_ENDPOINT,
            {"note_id": note_id},
            "web_v2_v2_note_id",
            fallback_reason=app_fallback_reason,
        )
        if v2_response.get("ok"):
            v2_response["_attempts"] = attempts
            return v2_response

    # Phase 5: last resort — WEB v7 is returned whether it succeeded or not.
    web_params: Dict[str, Any] = {}
    if share_text:
        web_params["share_text"] = share_text
    if note_id:
        web_params["note_id"] = note_id

    web_response = _call(WEB_ENDPOINT, web_params, "web_v7", fallback_reason=app_fallback_reason)
    web_response["_attempts"] = attempts
    return web_response
894
-
895
-
896
def _extract_subtitle_urls(payload: Any) -> List[str]:
    """Collect absolute subtitle-file URLs found anywhere in the payload."""
    found: List[str] = []
    for field in ("subtitle_url", "subtitleUrl", "srt_url", "srtUrl", "vtt_url", "vttUrl"):
        for value in deep_find_all(payload, [field]):
            if not isinstance(value, str):
                continue
            candidate = value.strip()
            # Only absolute http(s) URLs are usable for download.
            if candidate.startswith(("http://", "https://")):
                found.append(candidate)
    # De-duplicate while preserving first-seen order.
    return list(dict.fromkeys(found))
912
-
913
-
914
def _extract_subtitle_inline_text(payload: Any) -> str:
    """Join inline subtitle fragments embedded in the payload into one block.

    Looks inside subtitles/subtitle_list/subtitleList containers; list items
    may be dicts (text/content/sentence/line fields) or plain strings.
    """
    fragments: List[str] = []

    def _push(candidate: Any) -> None:
        # Keep only strings that survive normalization.
        if isinstance(candidate, str):
            cleaned = normalize_text(candidate)
            if cleaned:
                fragments.append(cleaned)

    for container in deep_find_all(payload, ["subtitles", "subtitle_list", "subtitleList"]):
        if isinstance(container, list):
            for entry in container:
                if isinstance(entry, dict):
                    for field in ("text", "content", "sentence", "line"):
                        _push(entry.get(field))
                else:
                    _push(entry)
        elif isinstance(container, dict):
            for field in ("text", "content"):
                _push(container.get(field))

    # De-duplicate lines, preserving first-seen order.
    return "\n".join(dict.fromkeys(fragments)).strip()
936
-
937
-
938
def _subtitle_text_from_raw(raw: str) -> str:
    """Convert a raw subtitle payload into de-duplicated plain-text lines.

    Two shapes are handled: JSON documents carrying segment lists
    (segments/subtitles/data/result/body), and SRT/WebVTT-style text where
    headers, cue numbers, timing lines and NOTE blocks are stripped.
    """
    if not raw:
        return ""

    raw = raw.strip()
    if not raw:
        return ""

    def _texts_from(items: Any) -> List[str]:
        # Pull the spoken-line field out of each segment dict.
        picked: List[str] = []
        for item in items:
            if not isinstance(item, dict):
                continue
            candidate = item.get("text") or item.get("content") or item.get("sentence")
            if isinstance(candidate, str):
                cleaned = normalize_text(candidate)
                if cleaned:
                    picked.append(cleaned)
        return picked

    if raw[0] in "{[":
        try:
            parsed = json.loads(raw)
        except Exception:
            parsed = None
        if parsed is not None:
            segments: List[str] = []
            if isinstance(parsed, dict):
                for container_key in ("segments", "subtitles", "data", "result", "body"):
                    maybe_list = parsed.get(container_key)
                    if isinstance(maybe_list, list):
                        segments.extend(_texts_from(maybe_list))
            elif isinstance(parsed, list):
                segments.extend(_texts_from(parsed))
            # A successfully parsed JSON document ends here, even when empty.
            return "\n".join(dict.fromkeys(segments)).strip()

    kept: List[str] = []
    for line in raw.splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        # Drop SRT/WebVTT structural lines: header, NOTE blocks, cue
        # numbers, and timestamp ranges.
        if stripped.upper() == "WEBVTT" or stripped.startswith("NOTE"):
            continue
        if "-->" in stripped or re.match(r"^\d+$", stripped):
            continue
        cleaned = normalize_text(stripped)
        if cleaned:
            kept.append(cleaned)

    return "\n".join(dict.fromkeys(kept)).strip()
987
-
988
-
989
def _fetch_subtitle_text(urls: List[str], timeout_ms: int) -> str:
    """Download candidate subtitle URLs in order; return the first usable text.

    Any download or parse failure silently moves on to the next URL; an
    empty string means no URL produced text.
    """
    timeout_sec = max(timeout_ms / 1000.0, 1.0)
    for url in urls:
        try:
            request = urllib.request.Request(url=url, method="GET")
            with urllib.request.urlopen(request, timeout=timeout_sec) as response:
                body = response.read().decode("utf-8", errors="replace")
            text = _subtitle_text_from_raw(body)
        except Exception:
            continue
        if text:
            return text
    return ""
1001
-
1002
-
1003
- def _url_likely_image(url: str) -> bool:
1004
- lower = url.lower()
1005
- image_tokens = [
1006
- ".jpg",
1007
- ".jpeg",
1008
- ".png",
1009
- ".webp",
1010
- "_jpg_",
1011
- "_png_",
1012
- "imageview2",
1013
- "imagemogr2",
1014
- "redimage",
1015
- "frame/",
1016
- "sns-img",
1017
- "sns-webpic",
1018
- "notes_pre_post",
1019
- ]
1020
- return any(token in lower for token in image_tokens)
1021
-
1022
-
1023
def _url_likely_video(url: str) -> bool:
    """Heuristic: does this URL look like a video/audio stream?

    Anything that already looks like an image is rejected outright.
    """
    if _url_likely_image(url):
        return False
    lowered = url.lower()
    media_markers = (
        ".mp4",
        ".m3u8",
        ".m4a",
        ".mp3",
        "video",
        "play",
        "stream",
        "master",
        "sns-video",
        "redvideo",
        "vod",
        "/audio/",
    )
    return any(marker in lowered for marker in media_markers)
1042
-
1043
-
1044
- def _video_quality_hint(url: str) -> int:
1045
- lower = url.lower()
1046
- score = 9999
1047
-
1048
- query = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
1049
- for key in ("w", "width", "ratio", "quality", "qn"):
1050
- values = query.get(key)
1051
- if not values:
1052
- continue
1053
- value = str(values[0]).lower()
1054
- m = re.search(r"(\d{3,4})", value)
1055
- if m:
1056
- score = min(score, int(m.group(1)))
1057
-
1058
- for token, value in (("240p", 240), ("360p", 360), ("480p", 480), ("540p", 540), ("576p", 576), ("720p", 720), ("1080p", 1080), ("2k", 2000), ("4k", 4000)):
1059
- if token in lower:
1060
- score = min(score, value)
1061
-
1062
- return score
1063
-
1064
-
1065
def _extract_video_candidates(payload: Any) -> List[str]:
    """Collect candidate video/audio URLs from a note payload, best-first.

    Walks a fixed priority list of keys anywhere in the payload, accepts
    absolute http(s) URLs (bare strings, lists of strings, or dicts with a
    url/play_url field), de-duplicates them, keeps only video-looking URLs,
    and sorts by the quality hint (lower score first).
    """
    candidates: List[str] = []
    key_priority = [
        "master_url",
        "masterUrl",
        "video_url",
        "play_url",
        "origin_video_key",
        "origin_video_url",
        "video_play_url",
        "audio_url",
        "note_sound_info",
        "url",
    ]

    def _add(raw: Any) -> None:
        # Accept only absolute http(s) URL strings.
        if isinstance(raw, str):
            text = raw.strip()
            if text.startswith(("http://", "https://")):
                candidates.append(text)

    for key in key_priority:
        for value in deep_find_all(payload, [key]):
            if isinstance(value, list):
                for item in value:
                    _add(item)
            elif isinstance(value, dict):
                _add(value.get("url") or value.get("play_url"))
            else:
                _add(value)

    # De-duplicate while preserving first-seen (priority) order.
    unique = list(dict.fromkeys(candidates))

    video_only = [u for u in unique if _url_likely_video(u)]
    if not video_only:
        return []

    # sorted() is stable, so equal-quality URLs keep their priority order;
    # the previous explicit video_only.index(u) tiebreak was a redundant
    # O(n^2) lookup and has been removed.
    return sorted(video_only, key=_video_quality_hint)
1113
-
1114
-
1115
- def _collect_urls(value: Any) -> List[str]:
1116
- out: List[str] = []
1117
- if isinstance(value, str):
1118
- v = value.strip()
1119
- if v.startswith("http://") or v.startswith("https://"):
1120
- out.append(v)
1121
- elif isinstance(value, list):
1122
- for item in value:
1123
- out.extend(_collect_urls(item))
1124
- elif isinstance(value, dict):
1125
- for key in ("url", "urlDefault", "url_default", "urlPre", "url_pre", "original"):
1126
- if key in value:
1127
- out.extend(_collect_urls(value.get(key)))
1128
- return out
1129
-
1130
-
1131
def _dedupe_image_urls(urls: List[str]) -> List[str]:
    """Filter to image-looking URLs, de-duplicated in first-seen order."""
    seen: set = set()
    images: List[str] = []
    for candidate in urls:
        if candidate in seen:
            continue
        seen.add(candidate)
        if _url_likely_image(candidate):
            images.append(candidate)
    return images
1141
-
1142
-
1143
def _extract_image_candidates_with_strategy(payload: Any) -> Tuple[List[str], str]:
    """Extract image URLs from the payload plus the strategy label that found them.

    Strategies are tried in fixed priority order; the first one yielding any
    image URL wins. Returned labels: "original", "wb_dft", "default",
    "fallback" (the fallback label is returned even when its list is empty).
    """
    # Priority 1: explicit original-resolution image set.
    originals = _dedupe_image_urls(deep_find_all(payload, ["original"]))
    if originals:
        return originals, "original"

    # Priority 2: WEB_V2 schema — imageList entries carry an infoList whose
    # WB_DFT scene holds the default-rendition URL.
    dft_urls: List[str] = []
    for key in ("imageList", "images_list"):
        image_lists = deep_find_all(payload, [key])
        for image_list in image_lists:
            if not isinstance(image_list, list):
                continue
            for item in image_list:
                if not isinstance(item, dict):
                    continue
                info_list = item.get("infoList") or item.get("info_list")
                if isinstance(info_list, list):
                    for info in info_list:
                        if not isinstance(info, dict):
                            continue
                        scene = str(info.get("imageScene") or info.get("image_scene") or "").upper()
                        if scene == "WB_DFT":
                            dft_urls.extend(_collect_urls(info.get("url")))
    dft_urls = _dedupe_image_urls(dft_urls)
    if dft_urls:
        return dft_urls, "wb_dft"

    # Priority 3: default/preview representative image URL fields.
    default_urls: List[str] = []
    for key in ("urlDefault", "url_default", "urlPre", "url_pre"):
        default_urls.extend(_collect_urls(deep_find_all(payload, [key])))
    default_urls = _dedupe_image_urls(default_urls)
    if default_urls:
        return default_urls, "default"

    # Priority 4: generic fallback over common URL-bearing keys
    # (single quality group intended).
    generic: List[str] = []
    for key in ("url", "url_list", "origin_image", "origin_image_url", "cover", "thumb", "image_url"):
        generic.extend(_collect_urls(deep_find_all(payload, [key])))
    generic = _dedupe_image_urls(generic)
    return generic, "fallback"
1185
-
1186
-
1187
def _extract_image_candidates(payload: Any) -> List[str]:
    """Image URLs only, discarding which extraction strategy produced them."""
    candidates, _strategy = _extract_image_candidates_with_strategy(payload)
    return candidates
1190
-
1191
-
1192
def _extract_note_type_field(payload: Any) -> str:
    """Pull the note `type` string from the known payload schemas.

    Checks the WEB_V2 shape (note.type), then the APP shape
    (note_list[].type / noteList[].type), and finally any bare "type" key —
    the last is trusted only for the expected scalar values.
    """
    # WEB_V2 schema stores the type on the note object itself.
    for note_obj in deep_find_all(payload, ["note"]):
        if not isinstance(note_obj, dict):
            continue
        found = normalize_text(note_obj.get("type")).lower()
        if found:
            return found

    # APP schema stores it per entry of note_list / noteList.
    for list_key in ("note_list", "noteList"):
        for entries in deep_find_all(payload, [list_key]):
            if not isinstance(entries, list):
                continue
            for entry in entries:
                if not isinstance(entry, dict):
                    continue
                found = normalize_text(entry.get("type")).lower()
                if found:
                    return found

    # Strict fallback: only accept expected scalar values.
    for raw_value in deep_find_all(payload, ["type"]):
        found = normalize_text(raw_value).lower()
        if found in {"video", "normal", "image"}:
            return found

    return ""
1219
-
1220
-
1221
def _detect_note_content_type(payload: Any, video_candidates: List[str], image_candidates: List[str]) -> str:
    """Classify a note as "video", "image", "mixed" or "unknown".

    The declared type field wins when present; otherwise the decision falls
    back to which media candidates (or inline note audio) were found.
    """
    declared = _extract_note_type_field(payload)
    if declared == "video" or "video" in declared:
        return "video"
    if declared == "normal" or "image" in declared:
        return "image"

    # Note audio alone counts as video-like content.
    sound_url = normalize_text(deep_find_first(payload, ["note_sound_info", "url"])).lower()
    audio_present = bool(sound_url and any(marker in sound_url for marker in [".m4a", ".mp3", "/audio/"]))

    video_like = bool(video_candidates) or audio_present
    image_like = bool(image_candidates)
    if video_like and image_like:
        return "mixed"
    if video_like:
        return "video"
    if image_like:
        return "image"
    return "unknown"
1244
-
1245
-
1246
- def _guess_ext_from_url(url: str) -> str:
1247
- parsed = urllib.parse.urlparse(url)
1248
- path = parsed.path.lower()
1249
- for ext in [".jpg", ".jpeg", ".png", ".webp", ".gif"]:
1250
- if path.endswith(ext):
1251
- return ext
1252
- return ".jpg"
1253
-
1254
-
1255
def _download_images(
    *,
    urls: List[str],
    timeout_ms: int,
    source_input: Dict[str, Optional[str]],
    note_id: Optional[str],
    storage_config: Optional[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Download up to 30 images into a dated, per-note assets directory.

    Returns one record per attempted URL: {index, url, path, ok[, error]}.
    Individual download failures are recorded, not raised.
    """
    if not urls:
        return []

    try:
        storage_paths = resolve_storage_paths(storage_config or {})
        asset_dir = Path(storage_paths.get("runs_root", "")) / "assets" / datetime.now().strftime("%Y%m%d") / _traceable_identifier(source_input, note_id)
    except Exception:
        # Storage resolution failed — fall back to the local default tree.
        asset_dir = Path("./tikomni-output/_runs/assets") / datetime.now().strftime("%Y%m%d") / _traceable_identifier(source_input, note_id)

    asset_dir.mkdir(parents=True, exist_ok=True)
    timeout_sec = max(timeout_ms / 1000.0, 1.0)
    records: List[Dict[str, Any]] = []

    for position, image_url in enumerate(urls[:30], start=1):
        target = asset_dir / f"image-{position:02d}{_guess_ext_from_url(image_url)}"
        try:
            request = urllib.request.Request(url=image_url, method="GET")
            with urllib.request.urlopen(request, timeout=timeout_sec) as response:
                target.write_bytes(response.read())
            records.append({"index": position, "url": image_url, "path": str(target), "ok": True})
        except Exception as error:
            records.append({"index": position, "url": image_url, "path": str(target), "ok": False, "error": str(error)})

    return records
1288
-
1289
-
1290
def _build_result(
    *,
    source_input: Dict[str, Optional[str]],
    raw_content: str,
    confidence: str,
    error_reason: Optional[str],
    extract_trace: List[Dict[str, Any]],
    fallback_trace: List[Dict[str, Any]],
    request_id: Optional[str],
    text_source: str,
    note_id: Optional[str],
    subtitle_hit: bool,
    u2_task_id: Optional[str],
    u2_task_status: Optional[str],
    note_content_type: str,
    analysis_mode: str,
    selected_video_url: Optional[str],
    selected_video_candidates: List[str],
    selected_image_urls: List[str],
    downloaded_assets: List[Dict[str, Any]],
    missing_fields: Optional[List[Dict[str, str]]] = None,
    metadata_fields: Optional[Dict[str, Any]] = None,
    asr_source: Optional[str] = None,
) -> Dict[str, Any]:
    """Assemble the final xiaohongshu note result envelope.

    Summarizes `raw_content`, appends run diagnostics to the insights list,
    resolves the effective ASR source, and flattens the metadata fields into
    the fixed top-level key set consumed downstream. The key set and order of
    the returned dict are part of the output contract.
    """
    metadata = metadata_fields or {}
    summary_block = summarize_content(raw_content, source=f"xiaohongshu:{text_source}")
    insights = list(summary_block.get("insights", []))
    # Run diagnostics are surfaced as extra insight lines.
    insights.extend([
        f"note_content_type={note_content_type}",
        f"analysis_mode={analysis_mode}",
        f"selected_image_count={len(selected_image_urls)}",
    ])

    # Derive the ASR source from the text source when not given explicitly.
    resolved_asr_source = normalize_text(asr_source)
    if not resolved_asr_source:
        if text_source == "subtitle":
            resolved_asr_source = "xhs_subtitle"
        elif text_source == "u2":
            resolved_asr_source = "u2"
        else:
            resolved_asr_source = "fallback_none"

    return {
        "platform": "xiaohongshu",
        "content_kind": "note",
        "source": source_input,
        "note_id": note_id,
        "note_content_type": note_content_type,
        "analysis_mode": analysis_mode,
        "subtitle_hit": subtitle_hit,
        "text_source": text_source,
        "asr_source": resolved_asr_source,
        "u2_task_id": u2_task_id,
        "u2_task_status": u2_task_status,
        "selected_video_url": selected_video_url,
        "selected_video_candidates": selected_video_candidates,
        "selected_image_urls": selected_image_urls,
        "title": metadata.get("title"),
        "author": metadata.get("author"),
        "create_time_sec": metadata.get("create_time_sec"),
        "publish_time": metadata.get("publish_time"),
        "publish_time_source": metadata.get("publish_time_source"),
        "duration_ms": metadata.get("duration_ms"),
        "tags": metadata.get("tags", []),
        "digg_count": metadata.get("digg_count"),
        "comment_count": metadata.get("comment_count"),
        "collect_count": metadata.get("collect_count"),
        "share_count": metadata.get("share_count"),
        "share_url": metadata.get("share_url"),
        "source_url": metadata.get("source_url"),
        "cover_image": metadata.get("cover_image"),
        "video_down_url": metadata.get("video_down_url"),
        "author_handle": metadata.get("author_handle"),
        # Either spelling of the author id may be present in metadata.
        "platform_author_id": metadata.get("platform_author_id") or metadata.get("author_platform_id"),
        "xhs_user_id": metadata.get("xhs_user_id"),
        "xhs_sec_token": metadata.get("xhs_sec_token"),
        "downloaded_assets": downloaded_assets,
        "raw_content": raw_content,
        "summary": summary_block["summary"],
        "insights": insights,
        "confidence": confidence,
        "error_reason": error_reason,
        "missing_fields": missing_fields or [],
        "extract_trace": extract_trace,
        "fallback_trace": fallback_trace,
        "request_id": request_id,
    }
1377
-
1378
-
1379
- def run_xiaohongshu_extract(
1380
- *,
1381
- input_value: Optional[str],
1382
- share_text: Optional[str],
1383
- note_id: Optional[str],
1384
- env_file: Optional[str],
1385
- api_key_env: str,
1386
- base_url: Optional[str],
1387
- timeout_ms: Optional[int],
1388
- poll_interval_sec: float,
1389
- max_polls: int,
1390
- u2_submit_max_retries: int,
1391
- u2_submit_backoff_ms: int,
1392
- u2_timeout_retry_enabled: bool,
1393
- u2_timeout_retry_max_retries: int,
1394
- force_u2_fallback: bool,
1395
- write_card: bool,
1396
- card_type: str,
1397
- card_root: Optional[str],
1398
- storage_config: Optional[Dict[str, Any]] = None,
1399
- allow_process_env: bool = False,
1400
- persist_output: bool = True,
1401
- progress: Optional[ProgressReporter] = None,
1402
- ) -> Dict[str, Any]:
1403
- if not write_card or not persist_output:
1404
- raise ValueError(
1405
- f"fixed_pipeline_requires_full_persistence:xiaohongshu:note:write_card={bool(write_card)}:persist_output={bool(persist_output)}"
1406
- )
1407
-
1408
- source_input = _normalize_input(input_value, share_text, note_id)
1409
- if progress is not None:
1410
- progress.started(stage="note.workflow", message="xiaohongshu note workflow started")
1411
- metadata_fields: Dict[str, Any] = {}
1412
- if not source_input["share_text"] and not source_input["note_id"]:
1413
- result = _build_result(
1414
- source_input=source_input,
1415
- raw_content="",
1416
- confidence="low",
1417
- error_reason="missing_share_text_or_note_id",
1418
- extract_trace=[],
1419
- fallback_trace=[],
1420
- request_id=None,
1421
- text_source="none",
1422
- note_id=None,
1423
- subtitle_hit=False,
1424
- u2_task_id=None,
1425
- u2_task_status="UNKNOWN",
1426
- note_content_type="unknown",
1427
- analysis_mode="none",
1428
- selected_video_url=None,
1429
- selected_video_candidates=[],
1430
- selected_image_urls=[],
1431
- downloaded_assets=[],
1432
- missing_fields=[{"field": "share_text_or_note_id", "reason": "missing_input"}],
1433
- metadata_fields=metadata_fields,
1434
- )
1435
- if write_card:
1436
- result["card_write"] = write_benchmark_card(
1437
- payload=result,
1438
- platform="xiaohongshu",
1439
- card_type=card_type,
1440
- card_root=card_root,
1441
- content_kind="note",
1442
- storage_config=storage_config,
1443
- )
1444
- return _finalize_result(
1445
- result=result,
1446
- source_input=source_input,
1447
- note_id=None,
1448
- storage_config=storage_config,
1449
- persist_output=persist_output,
1450
- )
1451
-
1452
- runtime = resolve_runtime(
1453
- env_file=env_file,
1454
- api_key_env=api_key_env,
1455
- base_url=base_url,
1456
- timeout_ms=timeout_ms,
1457
- allow_process_env=allow_process_env,
1458
- )
1459
-
1460
- trace: List[Dict[str, Any]] = []
1461
-
1462
- if progress is not None:
1463
- progress.progress(stage="note.fetch", message="fetching xiaohongshu note payload")
1464
- note_response = _fetch_note_info(
1465
- base_url=runtime["base_url"],
1466
- token=runtime["token"],
1467
- timeout_ms=runtime["timeout_ms"],
1468
- source_input=source_input,
1469
- )
1470
-
1471
- attempts = note_response.get("_attempts") or []
1472
- for index, attempt in enumerate(attempts, start=1):
1473
- response = attempt.get("response") if isinstance(attempt, dict) else None
1474
- endpoint = attempt.get("endpoint") if isinstance(attempt, dict) else None
1475
- label = attempt.get("label") if isinstance(attempt, dict) else None
1476
- if not isinstance(response, dict):
1477
- continue
1478
- step = "u1_get_note_info_effective" if index == len(attempts) else f"u1_get_note_info_attempt_{index}"
1479
- trace.append(
1480
- build_api_trace(
1481
- step=step,
1482
- endpoint=endpoint,
1483
- response=response,
1484
- extra={
1485
- "route_label": label,
1486
- "attempt": index,
1487
- "chosen_route": note_response.get("_route_label"),
1488
- "field_completeness": response.get("_field_completeness"),
1489
- },
1490
- )
1491
- )
1492
-
1493
- trace.append(
1494
- {
1495
- "step": "u1_get_note_info_route_decision",
1496
- "chosen_route": note_response.get("_route_label"),
1497
- "request_id": note_response.get("request_id"),
1498
- "field_completeness": note_response.get("_field_completeness"),
1499
- "attempt_count": len(attempts),
1500
- }
1501
- )
1502
-
1503
- if not note_response.get("ok"):
1504
- error_ctx = resolve_trace_error_context(
1505
- responses=[note_response],
1506
- extract_trace=trace,
1507
- default_error_reason="u1_get_note_info_failed",
1508
- )
1509
- result = _build_result(
1510
- source_input=source_input,
1511
- raw_content="",
1512
- confidence="low",
1513
- error_reason=error_ctx.get("error_reason"),
1514
- extract_trace=trace,
1515
- fallback_trace=error_ctx.get("fallback_trace", []),
1516
- request_id=error_ctx.get("request_id"),
1517
- text_source="none",
1518
- note_id=source_input.get("note_id"),
1519
- subtitle_hit=False,
1520
- u2_task_id=None,
1521
- u2_task_status="UNKNOWN",
1522
- note_content_type="unknown",
1523
- analysis_mode="none",
1524
- selected_video_url=None,
1525
- selected_video_candidates=[],
1526
- selected_image_urls=[],
1527
- downloaded_assets=[],
1528
- missing_fields=[{"field": "u1_note_info", "reason": "all_routes_failed"}],
1529
- metadata_fields=metadata_fields,
1530
- )
1531
- if write_card:
1532
- result["card_write"] = write_benchmark_card(
1533
- payload=result,
1534
- platform="xiaohongshu",
1535
- card_type=card_type,
1536
- card_root=card_root,
1537
- content_kind="note",
1538
- storage_config=storage_config,
1539
- )
1540
- return _finalize_result(
1541
- result=result,
1542
- source_input=source_input,
1543
- note_id=source_input.get("note_id"),
1544
- storage_config=storage_config,
1545
- persist_output=persist_output,
1546
- )
1547
-
1548
- effective_payload = note_response.get("data")
1549
- app_route_success = str(note_response.get("_route_label") or "").startswith("app")
1550
- metadata_enrich_on_sparse = bool(config_get(storage_config or {}, "xhs.metadata_enrich_on_sparse", True))
1551
-
1552
- initial_metadata = _extract_xhs_metadata(
1553
- payload=effective_payload,
1554
- source_input=source_input,
1555
- selected_video_url=None,
1556
- selected_image_urls=[],
1557
- )
1558
- sparse_metadata_detected = bool(app_route_success and metadata_enrich_on_sparse and _is_sparse_metadata(initial_metadata))
1559
- metadata_enrich_hit = False
1560
- enrich_response: Optional[Dict[str, Any]] = None
1561
- enrich_payload: Any = None
1562
-
1563
- if sparse_metadata_detected:
1564
- enrich_response = _fetch_sparse_metadata_enrich(
1565
- base_url=runtime["base_url"],
1566
- token=runtime["token"],
1567
- timeout_ms=runtime["timeout_ms"],
1568
- source_input=source_input,
1569
- note_id=source_input.get("note_id"),
1570
- )
1571
- trace.append(
1572
- build_api_trace(
1573
- step="u1_sparse_metadata_enrich",
1574
- endpoint=enrich_response.get("_endpoint"),
1575
- response=enrich_response,
1576
- extra={"route_label": enrich_response.get("_route_label")},
1577
- )
1578
- )
1579
- if enrich_response.get("ok"):
1580
- metadata_enrich_hit = True
1581
- enrich_payload = enrich_response.get("data")
1582
- effective_payload = {"app": note_response.get("data"), "web_v2_enrich": enrich_payload}
1583
-
1584
- resolved_note_id = _resolve_note_id(effective_payload, source_input)
1585
-
1586
- title = normalize_text(deep_find_first(effective_payload, ["title"]))
1587
- desc = normalize_text(deep_find_first(effective_payload, ["desc", "content"]))
1588
- caption_text = "\n".join([t for t in [title, desc] if t]).strip()
1589
-
1590
- subtitle_inline_text = "" if force_u2_fallback else _extract_subtitle_inline_text(effective_payload)
1591
- subtitle_urls = [] if force_u2_fallback else _extract_subtitle_urls(effective_payload)
1592
- subtitle_url_text = "" if force_u2_fallback else _fetch_subtitle_text(subtitle_urls, runtime["timeout_ms"])
1593
- subtitle_text = subtitle_inline_text or subtitle_url_text
1594
-
1595
- app_video_candidates = _extract_video_candidates(note_response.get("data"))
1596
- app_image_candidates, image_quality_strategy = _extract_image_candidates_with_strategy(note_response.get("data"))
1597
- enrich_video_candidates = _extract_video_candidates(enrich_payload) if metadata_enrich_hit else []
1598
- enrich_image_candidates = _extract_image_candidates(enrich_payload) if metadata_enrich_hit else []
1599
-
1600
- video_candidates = _dedupe_keep_order(app_video_candidates + enrich_video_candidates)
1601
- image_candidates = _dedupe_keep_order(app_image_candidates + enrich_image_candidates)
1602
-
1603
- selected_video_url = video_candidates[0] if video_candidates else None
1604
- type_field_value = _extract_note_type_field(effective_payload)
1605
- note_content_type = _detect_note_content_type(effective_payload, video_candidates, image_candidates)
1606
-
1607
- metadata_fields = _extract_xhs_metadata(
1608
- payload=effective_payload,
1609
- source_input=source_input,
1610
- selected_video_url=selected_video_url,
1611
- selected_image_urls=image_candidates,
1612
- )
1613
-
1614
- missing_fields: List[Dict[str, str]] = []
1615
- _append_missing_metadata_fields(missing_fields, metadata_fields)
1616
-
1617
- trace.append(
1618
- {
1619
- "step": "media_probe",
1620
- "type_field_value": type_field_value,
1621
- "note_content_type": note_content_type,
1622
- "video_candidate_count": len(video_candidates),
1623
- "image_candidate_count": len(image_candidates),
1624
- "image_quality_strategy": image_quality_strategy,
1625
- "subtitle_hit": bool(subtitle_text),
1626
- "subtitle_url_count": len(subtitle_urls),
1627
- "force_u2_fallback": force_u2_fallback,
1628
- "sparse_metadata_detected": sparse_metadata_detected,
1629
- "metadata_enrich_hit": metadata_enrich_hit,
1630
- "candidate_merge_sources": {
1631
- "video": _build_candidate_merge_sources(
1632
- app_candidates=app_video_candidates,
1633
- enrich_candidates=enrich_video_candidates,
1634
- app_label="app",
1635
- ),
1636
- "image": _build_candidate_merge_sources(
1637
- app_candidates=app_image_candidates,
1638
- enrich_candidates=enrich_image_candidates,
1639
- app_label="app",
1640
- ),
1641
- },
1642
- }
1643
- )
1644
-
1645
- u2_gate = _evaluate_u2_gate_for_xhs(
1646
- note_content_type=note_content_type,
1647
- duration_ms=metadata_fields.get("duration_ms"),
1648
- video_down_url=metadata_fields.get("video_down_url") or selected_video_url,
1649
- )
1650
- trace.append(
1651
- {
1652
- "step": "u2_gate",
1653
- "can_u2": bool(u2_gate.get("can_u2")),
1654
- "gate_reason": u2_gate.get("gate_reason"),
1655
- "rule": U2_GATE_RULE,
1656
- "is_video": u2_gate.get("is_video"),
1657
- "duration_ms": u2_gate.get("duration_ms"),
1658
- "video_down_url_present": u2_gate.get("video_down_url_present"),
1659
- }
1660
- )
1661
-
1662
- # Video-note path: aligned with douyin single-video pipeline (subtitle-first difference retained).
1663
- if note_content_type in {"video", "mixed"}:
1664
- if subtitle_text:
1665
- success_ctx = resolve_trace_error_context(
1666
- responses=[note_response],
1667
- extract_trace=trace,
1668
- explicit_error_reason=None,
1669
- explicit_request_id=note_response.get("request_id"),
1670
- )
1671
- result = _build_result(
1672
- source_input=source_input,
1673
- raw_content=subtitle_text,
1674
- confidence="high",
1675
- error_reason=None,
1676
- extract_trace=trace,
1677
- fallback_trace=success_ctx.get("fallback_trace", []),
1678
- request_id=success_ctx.get("request_id"),
1679
- text_source="subtitle",
1680
- note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1681
- subtitle_hit=True,
1682
- u2_task_id=None,
1683
- u2_task_status="SKIPPED",
1684
- note_content_type=note_content_type,
1685
- analysis_mode="video_full",
1686
- selected_video_url=selected_video_url,
1687
- selected_video_candidates=video_candidates,
1688
- selected_image_urls=image_candidates,
1689
- downloaded_assets=[],
1690
- missing_fields=missing_fields,
1691
- metadata_fields=metadata_fields,
1692
- )
1693
- if write_card:
1694
- result["card_write"] = write_benchmark_card(
1695
- payload=result,
1696
- platform="xiaohongshu",
1697
- card_type=card_type,
1698
- card_root=card_root,
1699
- content_kind="single_video",
1700
- storage_config=storage_config,
1701
- )
1702
- return _finalize_result(
1703
- result=result,
1704
- source_input=source_input,
1705
- note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1706
- storage_config=storage_config,
1707
- persist_output=persist_output,
1708
- )
1709
-
1710
- if not u2_gate.get("can_u2"):
1711
- gate_reason = normalize_text(u2_gate.get("gate_reason")) or "skip:unknown"
1712
- if gate_reason == "skip:duration_missing":
1713
- missing_fields.append({"field": "duration_ms", "reason": gate_reason})
1714
- elif gate_reason in {"skip:duration_too_short", "skip:duration_too_long"}:
1715
- missing_fields.append({"field": "duration_ms", "reason": gate_reason})
1716
- elif gate_reason == "skip:video_down_url_missing":
1717
- missing_fields.append({"field": "video_down_url", "reason": gate_reason})
1718
- elif gate_reason == "skip:not_video":
1719
- missing_fields.append({"field": "note_content_type", "reason": gate_reason})
1720
-
1721
- error_ctx = resolve_trace_error_context(
1722
- responses=[note_response],
1723
- extract_trace=trace,
1724
- default_error_reason=gate_reason,
1725
- )
1726
- fallback_text = caption_text
1727
- result = _build_result(
1728
- source_input=source_input,
1729
- raw_content=fallback_text,
1730
- confidence="medium" if fallback_text else "low",
1731
- error_reason=None if fallback_text else error_ctx.get("error_reason"),
1732
- extract_trace=trace,
1733
- fallback_trace=error_ctx.get("fallback_trace", []),
1734
- request_id=error_ctx.get("request_id"),
1735
- text_source="caption_fallback" if fallback_text else "none",
1736
- note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1737
- subtitle_hit=False,
1738
- u2_task_id=None,
1739
- u2_task_status="SKIPPED",
1740
- note_content_type=note_content_type,
1741
- analysis_mode="video_full",
1742
- selected_video_url=u2_gate.get("video_down_url") or selected_video_url,
1743
- selected_video_candidates=video_candidates,
1744
- selected_image_urls=image_candidates,
1745
- downloaded_assets=[],
1746
- missing_fields=missing_fields,
1747
- metadata_fields=metadata_fields,
1748
- )
1749
- if write_card:
1750
- result["card_write"] = write_benchmark_card(
1751
- payload=result,
1752
- platform="xiaohongshu",
1753
- card_type=card_type,
1754
- card_root=card_root,
1755
- content_kind="single_video",
1756
- storage_config=storage_config,
1757
- )
1758
- return _finalize_result(
1759
- result=result,
1760
- source_input=source_input,
1761
- note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1762
- storage_config=storage_config,
1763
- persist_output=persist_output,
1764
- )
1765
-
1766
- u2_candidates = _dedupe_keep_order([u2_gate.get("video_down_url")] + list(video_candidates))
1767
- if progress is not None:
1768
- progress.progress(
1769
- stage="note.u2",
1770
- message="starting xiaohongshu u2 flow",
1771
- data={"candidate_count": len(u2_candidates)},
1772
- )
1773
- u2_bundle = run_u2_asr_candidates_with_timeout_retry(
1774
- base_url=runtime["base_url"],
1775
- token=runtime["token"],
1776
- timeout_ms=runtime["timeout_ms"],
1777
- candidates=u2_candidates,
1778
- submit_max_retries=u2_submit_max_retries,
1779
- submit_backoff_ms=u2_submit_backoff_ms,
1780
- poll_interval_sec=poll_interval_sec,
1781
- max_polls=max_polls,
1782
- timeout_retry_enabled=u2_timeout_retry_enabled,
1783
- timeout_retry_max_retries=u2_timeout_retry_max_retries,
1784
- )
1785
- submit_bundle = u2_bundle.get("submit_bundle", {})
1786
- submit_response = submit_bundle.get("submit_response", {})
1787
- task_id = submit_bundle.get("task_id")
1788
- poll_result = u2_bundle.get("poll_result", {})
1789
- selected_video_url = u2_bundle.get("chosen_candidate") or selected_video_url
1790
- if selected_video_url and not normalize_text(metadata_fields.get("video_down_url")):
1791
- metadata_fields["video_down_url"] = selected_video_url
1792
-
1793
- trace.append(
1794
- {
1795
- "step": "u2_asr_timeout_retry",
1796
- "endpoint": "/api/u2/v1/services/audio/asr/transcription + /api/u2/v1/tasks/{task_id}",
1797
- "selected_video_url": selected_video_url,
1798
- "selected_video_candidates": u2_candidates,
1799
- "candidate_attempts": u2_bundle.get("candidate_attempts", []),
1800
- "submit_retries_config": {
1801
- "u2_submit_max_retries": max(0, int(u2_submit_max_retries)),
1802
- "u2_submit_backoff_ms": max(0, int(u2_submit_backoff_ms)),
1803
- },
1804
- "timeout_retry": u2_bundle.get("timeout_retry", {}),
1805
- "rounds": u2_bundle.get("rounds", []),
1806
- "final_task_id": poll_result.get("task_id") or task_id,
1807
- "final_task_status": poll_result.get("task_status"),
1808
- "final_error_reason": poll_result.get("error_reason"),
1809
- }
1810
- )
1811
- if progress is not None:
1812
- (progress.done if poll_result.get("ok") else progress.failed)(
1813
- stage="note.u2",
1814
- message="xiaohongshu u2 flow finished" if poll_result.get("ok") else "xiaohongshu u2 flow failed",
1815
- data={
1816
- "task_id": poll_result.get("task_id") or task_id,
1817
- "task_status": poll_result.get("task_status"),
1818
- "error_reason": poll_result.get("error_reason"),
1819
- },
1820
- )
1821
-
1822
- if not poll_result.get("ok") and (
1823
- not submit_response.get("ok") or not (poll_result.get("task_id") or task_id)
1824
- ):
1825
- error_ctx = resolve_trace_error_context(
1826
- responses=[poll_result, submit_response, note_response],
1827
- extract_trace=trace,
1828
- default_error_reason="u2_submit_failed_or_missing_task_id",
1829
- explicit_request_id=(
1830
- poll_result.get("request_id")
1831
- or submit_response.get("request_id")
1832
- or note_response.get("request_id")
1833
- ),
1834
- )
1835
- fallback_text = caption_text
1836
- if fallback_text:
1837
- missing_fields.append({"field": "asr_transcript", "reason": f"u2_failed:{error_ctx.get('error_reason')}"})
1838
- else:
1839
- missing_fields.append({"field": "raw_content", "reason": "u2_failed_and_caption_missing"})
1840
- result = _build_result(
1841
- source_input=source_input,
1842
- raw_content=fallback_text,
1843
- confidence="medium" if fallback_text else "low",
1844
- error_reason=None if fallback_text else error_ctx.get("error_reason"),
1845
- extract_trace=trace,
1846
- fallback_trace=error_ctx.get("fallback_trace", []),
1847
- request_id=error_ctx.get("request_id"),
1848
- text_source="caption_fallback" if fallback_text else "u2",
1849
- note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1850
- subtitle_hit=False,
1851
- u2_task_id=poll_result.get("task_id") or task_id,
1852
- u2_task_status=poll_result.get("task_status") or "UNKNOWN",
1853
- note_content_type=note_content_type,
1854
- analysis_mode="video_full",
1855
- selected_video_url=selected_video_url,
1856
- selected_video_candidates=u2_candidates,
1857
- selected_image_urls=image_candidates,
1858
- downloaded_assets=[],
1859
- missing_fields=missing_fields,
1860
- metadata_fields=metadata_fields,
1861
- )
1862
- if write_card:
1863
- result["card_write"] = write_benchmark_card(
1864
- payload=result,
1865
- platform="xiaohongshu",
1866
- card_type=card_type,
1867
- card_root=card_root,
1868
- content_kind="single_video",
1869
- storage_config=storage_config,
1870
- )
1871
- return _finalize_result(
1872
- result=result,
1873
- source_input=source_input,
1874
- note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1875
- storage_config=storage_config,
1876
- persist_output=persist_output,
1877
- )
1878
-
1879
- raw_content = poll_result.get("transcript_text", "") if poll_result.get("ok") else ""
1880
- final_ctx = resolve_trace_error_context(
1881
- responses=[poll_result, submit_response, note_response],
1882
- extract_trace=trace,
1883
- explicit_error_reason=poll_result.get("error_reason"),
1884
- explicit_request_id=poll_result.get("request_id") or submit_response.get("request_id") or note_response.get("request_id"),
1885
- )
1886
- result = _build_result(
1887
- source_input=source_input,
1888
- raw_content=raw_content,
1889
- confidence="high" if poll_result.get("ok") and raw_content else "low",
1890
- error_reason=final_ctx.get("error_reason"),
1891
- extract_trace=trace,
1892
- fallback_trace=final_ctx.get("fallback_trace", []),
1893
- request_id=final_ctx.get("request_id"),
1894
- text_source="u2",
1895
- note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1896
- subtitle_hit=False,
1897
- u2_task_id=poll_result.get("task_id") or task_id,
1898
- u2_task_status=poll_result.get("task_status"),
1899
- note_content_type=note_content_type,
1900
- analysis_mode="video_full",
1901
- selected_video_url=selected_video_url,
1902
- selected_video_candidates=u2_candidates,
1903
- selected_image_urls=image_candidates,
1904
- downloaded_assets=[],
1905
- missing_fields=missing_fields,
1906
- metadata_fields=metadata_fields,
1907
- )
1908
-
1909
- if write_card:
1910
- result["card_write"] = write_benchmark_card(
1911
- payload=result,
1912
- platform="xiaohongshu",
1913
- card_type=card_type,
1914
- card_root=card_root,
1915
- content_kind="single_video",
1916
- storage_config=storage_config,
1917
- )
1918
-
1919
- return _finalize_result(
1920
- result=result,
1921
- source_input=source_input,
1922
- note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1923
- storage_config=storage_config,
1924
- persist_output=persist_output,
1925
- )
1926
-
1927
- # Image-note path, strategy B: download images + light text analysis + write card.
1928
- raw_content = caption_text
1929
-
1930
- downloaded_assets = _download_images(
1931
- urls=image_candidates,
1932
- timeout_ms=runtime["timeout_ms"],
1933
- source_input=source_input,
1934
- note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1935
- storage_config=storage_config,
1936
- )
1937
-
1938
- if not image_candidates:
1939
- missing_fields.append({"field": "selected_image_urls", "reason": "image_note_but_no_image_url"})
1940
- if not raw_content:
1941
- missing_fields.append({"field": "raw_content", "reason": "title_and_desc_missing"})
1942
-
1943
- success_ctx = resolve_trace_error_context(
1944
- responses=[note_response],
1945
- extract_trace=trace,
1946
- explicit_error_reason=None,
1947
- explicit_request_id=note_response.get("request_id"),
1948
- )
1949
-
1950
- result = _build_result(
1951
- source_input=source_input,
1952
- raw_content=raw_content,
1953
- confidence="high" if raw_content else "medium",
1954
- error_reason=None,
1955
- extract_trace=trace,
1956
- fallback_trace=success_ctx.get("fallback_trace", []),
1957
- request_id=success_ctx.get("request_id"),
1958
- text_source="caption",
1959
- note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1960
- subtitle_hit=False,
1961
- u2_task_id=None,
1962
- u2_task_status="SKIPPED",
1963
- note_content_type="image" if note_content_type == "unknown" else note_content_type,
1964
- analysis_mode="image_light_analysis",
1965
- selected_video_url=None,
1966
- selected_video_candidates=video_candidates,
1967
- selected_image_urls=image_candidates,
1968
- downloaded_assets=downloaded_assets,
1969
- missing_fields=missing_fields,
1970
- metadata_fields=metadata_fields,
1971
- )
1972
-
1973
- if write_card:
1974
- result["card_write"] = write_benchmark_card(
1975
- payload=result,
1976
- platform="xiaohongshu",
1977
- card_type=card_type,
1978
- card_root=card_root,
1979
- content_kind="note",
1980
- storage_config=storage_config,
1981
- )
1982
-
1983
- finalized = _finalize_result(
1984
- result=result,
1985
- source_input=source_input,
1986
- note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
1987
- storage_config=storage_config,
1988
- persist_output=persist_output,
1989
- )
1990
- if progress is not None:
1991
- final_event = progress.failed if finalized.get("error_reason") else progress.done
1992
- final_event(
1993
- stage="note.workflow",
1994
- message="xiaohongshu note workflow finished" if not finalized.get("error_reason") else "xiaohongshu note workflow failed",
1995
- data={
1996
- "request_id": finalized.get("request_id"),
1997
- "card_write_ok": bool((finalized.get("card_write") or {}).get("ok")),
1998
- "output_persist_ok": bool((finalized.get("output_persist") or {}).get("ok")),
1999
- "text_source": finalized.get("text_source"),
2000
- },
2001
- )
2002
- return finalized
2003
-
2004
-
2005
def main() -> None:
    """CLI entry point for the xiaohongshu extraction chain.

    Parses command-line flags, merges them with the runtime YAML config
    (an explicitly provided CLI value always wins), runs the extraction
    workflow, writes the result as JSON to stdout, and exits with status
    0 on success or 1 when the result carries an ``error_reason``.
    """
    parser = argparse.ArgumentParser(description="Run xiaohongshu extraction chain")
    parser.add_argument("input", nargs="?", default=None, help="Share URL or note_id")
    parser.add_argument("--share-text", default=None, help="Xiaohongshu share URL/text")
    parser.add_argument("--note-id", default=None, help="Xiaohongshu note_id")
    parser.add_argument("--config", default=None, help="Runtime config YAML path")
    parser.add_argument("--env-file", default=None, help="Optional env file path")
    parser.add_argument("--allow-process-env", action="store_true", help="Allow process env to override .env/.env.local")
    parser.add_argument("--api-key-env", default=None, help="API key env variable name")
    parser.add_argument("--base-url", default=None, help="Tikomni base URL")
    parser.add_argument("--timeout-ms", type=int, default=None, help="Request timeout ms")
    parser.add_argument("--poll-interval-sec", type=float, default=None, help="U2 polling interval seconds")
    parser.add_argument("--max-polls", type=int, default=None, help="Max U2 polls")
    parser.add_argument("--u2-submit-max-retries", type=int, default=None, help="Max retries for retriable U2 submit failures")
    parser.add_argument("--u2-submit-backoff-ms", type=int, default=None, help="Base backoff ms for retriable U2 submit failures (exponential)")
    parser.add_argument("--u2-timeout-retry-enabled", type=str, choices=["true", "false"], default=None, help="Enable conservative retry only when U2 polling times out")
    parser.add_argument("--u2-timeout-retry-max-retries", type=int, default=None, help="Conservative max retries for U2 timeout-only retry (0~3)")
    parser.add_argument("--force-u2-fallback", action="store_true", help="Skip subtitle usage and force U2 fallback (test)")
    parser.add_argument("--card-type", choices=["work", "author", "author_sample_work"], default="work", help="Primary card type")
    parser.add_argument("--card-root", default=None, help="Card root (absolute); falls back to TIKOMNI_CARD_ROOT when writing cards")
    args = parser.parse_args()

    config, _ = load_tikomni_config(
        args.config,
        env_file=args.env_file,
        allow_process_env=args.allow_process_env,
    )

    def _cli_or_cfg(cli_value: Any, key: str, default: Any) -> Any:
        # A CLI flag wins whenever it was explicitly supplied (is not None);
        # otherwise fall back to the config key with the given default.
        return cli_value if cli_value is not None else config_get(config, key, default)

    # These three use truthy-or fallback deliberately: an empty-string CLI
    # value also falls through to the config-derived value.
    resolved_env_file = args.env_file or config_get(config, "runtime.env_file", None)
    api_key_env = args.api_key_env or config_get(config, "runtime.auth_env_key", "TIKOMNI_API_KEY")
    base_url = args.base_url or config_get(config, "runtime.base_url", None)

    timeout_ms = _cli_or_cfg(args.timeout_ms, "runtime.timeout_ms", None)
    poll_interval_sec = _cli_or_cfg(args.poll_interval_sec, "asr_strategy.poll_interval_sec", 3.0)
    max_polls = _cli_or_cfg(args.max_polls, "asr_strategy.max_polls", 30)
    u2_submit_max_retries = _cli_or_cfg(
        args.u2_submit_max_retries,
        "asr_strategy.submit_retry.xiaohongshu_note.max_retries",
        0,
    )
    u2_submit_backoff_ms = _cli_or_cfg(
        args.u2_submit_backoff_ms,
        "asr_strategy.submit_retry.xiaohongshu_note.backoff_ms",
        0,
    )
    # The CLI flag is a "true"/"false" string; the config value is coerced to bool.
    if args.u2_timeout_retry_enabled is not None:
        u2_timeout_retry_enabled = str(args.u2_timeout_retry_enabled).lower() == "true"
    else:
        u2_timeout_retry_enabled = bool(config_get(config, "asr_strategy.u2_timeout_retry.enabled", True))
    u2_timeout_retry_max_retries = _cli_or_cfg(
        args.u2_timeout_retry_max_retries,
        "asr_strategy.u2_timeout_retry.max_retries",
        3,
    )

    try:
        result = run_xiaohongshu_extract(
            input_value=args.input,
            share_text=args.share_text,
            note_id=args.note_id,
            env_file=resolved_env_file,
            api_key_env=api_key_env,
            base_url=base_url,
            timeout_ms=timeout_ms,
            poll_interval_sec=float(poll_interval_sec),
            max_polls=int(max_polls),
            u2_submit_max_retries=int(u2_submit_max_retries),
            u2_submit_backoff_ms=int(u2_submit_backoff_ms),
            u2_timeout_retry_enabled=bool(u2_timeout_retry_enabled),
            u2_timeout_retry_max_retries=int(u2_timeout_retry_max_retries),
            force_u2_fallback=args.force_u2_fallback,
            write_card=True,
            card_type=args.card_type,
            card_root=args.card_root,
            storage_config=config,
            allow_process_env=args.allow_process_env,
            persist_output=True,
        )
    except ValueError as error:
        # Runtime-bootstrap failures surface as a structured low-confidence
        # payload instead of a traceback, so downstream consumers still get JSON.
        result = {
            "platform": "xiaohongshu",
            "content_kind": "note",
            "raw_content": "",
            "summary": "",
            "insights": ["source=xiaohongshu:runtime", "runtime_not_ready"],
            "confidence": "low",
            "error_reason": str(error),
            "missing_fields": [],
            "extract_trace": [],
            "fallback_trace": [],
            "request_id": None,
        }

    write_json_stdout(result)
    exit_code = 1 if result.get("error_reason") else 0
    raise SystemExit(exit_code)
2125
-
2126
-
2127
# Script entry point: delegate to main(), which exits via SystemExit.
if __name__ == "__main__":
    main()