@tikomni/skills 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tikomni/skills",
3
- "version": "0.1.6",
3
+ "version": "0.1.7",
4
4
  "description": "TikOmni skill installer CLI for structured social media crawling in Codex, Claude Code, and OpenClaw",
5
5
  "license": "MIT",
6
6
  "homepage": "https://github.com/mark-ly-wang/TikOmni-Skills#readme",
@@ -32,7 +32,8 @@ description: Use this skill when the user asks about social media links, posts,
32
32
  - 先看用户给的对象是什么,再选固定 pipeline 或 MCP 工具。
33
33
  - 优先返回事实字段,不补充主观分析。
34
34
  - 视频文本优先使用平台原生字幕;拿不到再走 ASR。
35
- - 默认输出以结构化 JSON 为主。
35
+ - 默认输出以结构化 JSON 和Markdown并主动落库。
36
+ - 当固定pipeline时,默认执行抓取并落库(强制)。
36
37
 
37
38
  ## Workflow
38
39
 
@@ -1014,6 +1014,159 @@ def run_u2_asr_candidates_with_timeout_retry(
1014
1014
  return final_bundle
1015
1015
 
1016
1016
 
1017
def run_u3_then_u2_asr_candidates_with_timeout_retry(
    *,
    base_url: str,
    token: str,
    timeout_ms: int,
    candidates: List[str],
    submit_max_retries: int,
    submit_backoff_ms: int,
    poll_interval_sec: float,
    max_polls: int,
    timeout_retry_enabled: bool = True,
    timeout_retry_max_retries: int = 3,
    pending_timeout_sec: int = DEFAULT_U2_PENDING_TIMEOUT_SEC,
    progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
) -> Dict[str, Any]:
    """Bridge each media candidate through U3, then run U2 ASR on the public URL.

    For every normalized candidate URL this first calls
    ``run_u3_public_url_fallback`` to obtain a publicly reachable media URL,
    then submits that URL to ``run_u2_asr_with_timeout_retry`` (with U2's own
    internal U3 fallback disabled, since U3 already ran here).

    Candidate policy:
      * invalid candidates are recorded and skipped;
      * a U3 bridge failure records a failure bundle and tries the next
        candidate;
      * a U2 failure with ``INVALID_SOURCE_URL`` tries the next candidate;
      * any other U2 outcome (success or hard failure) stops the loop.

    Returns the final U2 bundle augmented with ``u3_bridge``,
    ``candidate_attempts``, ``chosen_candidate``, ``chosen_public_url`` and
    ``normalized_candidates``. If no candidate was usable, ``poll_result``
    carries ``error_reason="no_candidates"``.
    """
    normalized_candidates = normalize_media_candidates(candidates)
    attempts: List[Dict[str, Any]] = []

    # Default result used when every candidate is skipped before any U3/U2 call.
    final_bundle: Dict[str, Any] = {
        "submit_bundle": {},
        "poll_result": {"ok": False, "task_status": "UNKNOWN", "error_reason": "no_candidates"},
        "rounds": [],
        "timeout_retry": {
            "enabled": bool(timeout_retry_enabled),
            # Retry budget is clamped to the 0..3 range.
            "configured_max_retries": max(0, min(3, int(timeout_retry_max_retries))),
            "triggered": False,
            "result": "not_triggered",
        },
        # U2's built-in U3 fallback is intentionally reported as disabled:
        # this function performs the U3 bridge itself (see u3_fallback_enabled=False below).
        "u3_fallback": {
            "enabled": False,
            "triggered": False,
            "ok": False,
            "result": "not_triggered",
            "public_url": "",
            "trace": [],
        },
    }
    chosen_url: Optional[str] = None
    chosen_public_url: Optional[str] = None

    for index, candidate in enumerate(normalized_candidates, start=1):
        valid = is_valid_u2_media_candidate(candidate)
        if not valid:
            # Record the skip so callers can audit why a candidate was not tried.
            attempts.append(
                {
                    "index": index,
                    "candidate": candidate,
                    "valid": False,
                    "result": "skipped_non_media_candidate",
                }
            )
            continue

        # Step 1: U3 bridge — exchange the (possibly private) source URL for a
        # public one that the U2 ASR service can fetch.
        u3_result = run_u3_public_url_fallback(
            base_url=base_url,
            token=token,
            timeout_ms=timeout_ms,
            source_url=candidate,
        )
        u3_bundle = {
            "enabled": True,
            "triggered": True,
            "ok": bool(u3_result.get("ok")),
            "result": "u3_completed" if u3_result.get("ok") else "u3_failed",
            "public_url": normalize_media_url(u3_result.get("public_url")),
            "request_id": u3_result.get("request_id"),
            "error_reason": u3_result.get("error_reason"),
            "trace": u3_result.get("trace", []),
        }

        attempts.append(
            {
                "index": index,
                "candidate": candidate,
                "valid": True,
                "u3_bridge": u3_bundle,
            }
        )

        if not u3_bundle.get("ok") or not u3_bundle.get("public_url"):
            # U3 failed for this candidate: remember the failure shape (so the
            # last failure is reported if no later candidate succeeds), then
            # move on to the next candidate.
            final_bundle = {
                "submit_bundle": {},
                "poll_result": {
                    "ok": False,
                    "task_status": "UNKNOWN",
                    "error_reason": u3_bundle.get("error_reason") or "u3_bridge_failed",
                    "request_id": u3_bundle.get("request_id"),
                    "trace": list(u3_bundle.get("trace", [])),
                },
                "rounds": [],
                "timeout_retry": {
                    "enabled": bool(timeout_retry_enabled),
                    "configured_max_retries": max(0, min(3, int(timeout_retry_max_retries))),
                    "triggered": False,
                    "result": "not_triggered",
                },
                "u3_fallback": {
                    "enabled": False,
                    "triggered": False,
                    "ok": False,
                    "result": "not_triggered",
                    "public_url": "",
                    "trace": [],
                },
                "u3_bridge": u3_bundle,
            }
            continue

        # Step 2: U2 ASR on the public URL. U2's own U3 fallback is disabled
        # because the bridge already happened above.
        bundle = run_u2_asr_with_timeout_retry(
            base_url=base_url,
            token=token,
            timeout_ms=timeout_ms,
            video_url=str(u3_bundle.get("public_url")),
            submit_max_retries=submit_max_retries,
            submit_backoff_ms=submit_backoff_ms,
            poll_interval_sec=poll_interval_sec,
            max_polls=max_polls,
            timeout_retry_enabled=timeout_retry_enabled,
            timeout_retry_max_retries=timeout_retry_max_retries,
            pending_timeout_sec=pending_timeout_sec,
            u3_fallback_enabled=False,
            progress_callback=progress_callback,
        )
        poll_result = bundle.get("poll_result", {})
        error_reason = str(poll_result.get("error_reason") or "")
        ok = bool(poll_result.get("ok"))

        # Enrich the attempt record appended above with the U2 outcome.
        attempts[-1].update(
            {
                "ok": ok,
                "error_reason": error_reason,
                "task_status": poll_result.get("task_status"),
                "u2_public_url": u3_bundle.get("public_url"),
            }
        )

        # Shallow copy so the annotations below don't mutate U2's bundle.
        final_bundle = dict(bundle)
        final_bundle["u3_bridge"] = u3_bundle
        chosen_url = candidate
        chosen_public_url = str(u3_bundle.get("public_url") or "")
        if ok:
            break
        if error_reason == "INVALID_SOURCE_URL":
            # Only this specific error warrants trying the next candidate.
            continue
        break

    final_bundle["candidate_attempts"] = attempts
    final_bundle["chosen_candidate"] = chosen_url
    final_bundle["chosen_public_url"] = chosen_public_url
    final_bundle["normalized_candidates"] = normalized_candidates
    return final_bundle
1168
+
1169
+
1017
1170
  def run_u2_asr_batch_with_timeout_retry(
1018
1171
  *,
1019
1172
  base_url: str,
@@ -16,9 +16,11 @@ from scripts.core.asr_pipeline import (
16
16
  run_u2_asr_batch_with_timeout_retry,
17
17
  run_u2_asr_candidates_with_timeout_retry,
18
18
  )
19
+ from scripts.core.u3_fallback import run_u3_public_url_fallback
19
20
 
20
21
  DEFAULT_BATCH_SUBMIT_SIZE = 50
21
22
  MAX_BATCH_SUBMIT_SIZE = 100
23
+ XHS_U3_U2_BATCH_SIZE = 20
22
24
  U2_GATE_MIN_DURATION_MS = 13000
23
25
  U2_GATE_MAX_DURATION_MS = 1800000
24
26
  U2_GATE_RULE = "is_video && 13000<duration_ms<=1800000 && video_download_url_present"
@@ -406,23 +408,31 @@ def _resolve_xhs_subtitle(work: Dict[str, Any], timeout_ms: int) -> Dict[str, An
406
408
  }
407
409
  invalid_reasons.append({"field": source, "reason": invalid_reason})
408
410
 
409
- fetched = _fetch_subtitle_text(subtitle_urls, timeout_ms=timeout_ms)
410
- cleaned = _clean_text(fetched)
411
- fetched_invalid = _invalid_subtitle_reason(cleaned)
412
- if fetched_invalid is not None and subtitle_urls:
413
- invalid_reasons.append({"field": "subtitle_url", "reason": fetched_invalid})
411
+ subtitle_text = _fetch_subtitle_text(subtitle_urls, timeout_ms)
412
+ if subtitle_text:
413
+ invalid_reason = _invalid_subtitle_reason(subtitle_text)
414
+ if invalid_reason is None:
415
+ return {
416
+ "text": subtitle_text,
417
+ "subtitle_source": "subtitle_url",
418
+ "subtitle_field": "raw_ref.subtitle_urls",
419
+ "subtitle_urls": subtitle_urls,
420
+ "invalid_reasons": invalid_reasons,
421
+ "failure_category": "",
422
+ }
423
+ invalid_reasons.append({"field": "raw_ref.subtitle_urls", "reason": invalid_reason})
414
424
 
415
425
  return {
416
- "text": cleaned,
417
- "subtitle_source": "url" if subtitle_urls else "missing",
418
- "subtitle_field": "subtitle_url" if subtitle_urls else "",
426
+ "text": "",
427
+ "subtitle_source": "missing",
428
+ "subtitle_field": "",
419
429
  "subtitle_urls": subtitle_urls,
420
430
  "invalid_reasons": invalid_reasons,
421
431
  "failure_category": _classify_xhs_subtitle_failure(
422
432
  work=work,
423
433
  interface_candidates=interface_candidates,
424
434
  subtitle_urls=subtitle_urls,
425
- invalid_reason=fetched_invalid or "subtitle_empty",
435
+ invalid_reason="subtitle_empty",
426
436
  ),
427
437
  }
428
438
 
@@ -462,6 +472,127 @@ def _fallback_none_result(reason: str) -> Dict[str, Any]:
462
472
  }
463
473
 
464
474
 
475
def _run_xhs_u3_then_u2_batch_for_entries(
    *,
    batch_id: str,
    entries: List[Dict[str, Any]],
    base_url: str,
    token: str,
    timeout_ms: int,
    poll_interval_sec: float,
    max_polls: int,
    submit_max_retries: int,
    submit_backoff_ms: int,
    timeout_retry_enabled: bool,
    timeout_retry_max_retries: int,
) -> Dict[str, Any]:
    """Run the Xiaohongshu U3->U2 ASR pipeline for one batch of work entries.

    Phase 1: for each entry, bridge its ``video_download_url`` through
    ``run_u3_public_url_fallback`` to obtain a public URL. Entries without a
    source URL or with a failed bridge are marked with a fallback result
    (mutating ``entry["work"]`` in place) and counted as U3 failures.

    Phase 2: all successfully bridged entries are submitted as one U2 batch via
    ``_run_u2_batch_for_entries``; any works the batch could not map back are
    also marked with a fallback result.

    Each entry is expected to carry ``work`` (dict, mutated in place),
    ``work_id``, ``video_download_url`` and ``subtitle_invalid`` — assumed
    shape from the call site; entries whose ``work`` is not a dict are
    silently ignored.

    Returns a summary dict with the combined ``trace``, U2 batch flags
    (``submitted``/``completed``/``mapped_count``/``unmapped_count``) and U3
    counters (``u3_ready_count``/``u3_failed_count``).
    """
    trace: List[Dict[str, Any]] = []
    u2_entries: List[Dict[str, Any]] = []
    u3_failed_count = 0

    # --- Phase 1: U3 bridge per entry ---------------------------------------
    for entry in entries:
        work = entry.get("work")
        if not isinstance(work, dict):
            continue

        source_url = normalize_media_url(entry.get("video_download_url") or work.get("video_download_url") or work.get("video_down_url"))
        work_id = normalize_text(entry.get("work_id") or work.get("platform_work_id"))
        subtitle_invalid = normalize_text(entry.get("subtitle_invalid")) or "subtitle_missing"

        if not source_url:
            # No downloadable video URL: mark the work as skipped and trace it.
            work.update(_fallback_none_result("skip:video_download_url_missing"))
            trace.append(
                {
                    "step": "author_home.asr.xhs_u3",
                    "batch_id": batch_id,
                    "platform_work_id": work_id,
                    "ok": False,
                    "error_reason": "skip:video_download_url_missing",
                    "subtitle_invalid": subtitle_invalid,
                    "public_url_present": False,
                }
            )
            u3_failed_count += 1
            continue

        u3_result = run_u3_public_url_fallback(
            base_url=base_url,
            token=token,
            timeout_ms=timeout_ms,
            source_url=source_url,
        )
        public_url = normalize_media_url(u3_result.get("public_url"))
        # Trace every bridge attempt, successful or not.
        trace.append(
            {
                "step": "author_home.asr.xhs_u3",
                "batch_id": batch_id,
                "platform_work_id": work_id,
                "ok": bool(u3_result.get("ok") and public_url),
                "error_reason": u3_result.get("error_reason"),
                "subtitle_invalid": subtitle_invalid,
                "source_url": source_url,
                "public_url_present": bool(public_url),
                "u3_trace": u3_result.get("trace", []),
            }
        )

        if not u3_result.get("ok") or not public_url:
            work.update(_fallback_none_result(normalize_text(u3_result.get("error_reason")) or "u3_bridge_failed"))
            u3_failed_count += 1
            continue

        # Bridge succeeded: queue this work for the U2 batch, using the public
        # URL as the download URL U2 will fetch.
        u2_entries.append(
            {
                "work": work,
                "work_id": work_id,
                "video_download_url": public_url,
                # Reason applied later if U2 also fails to map this work.
                "fallback_reason": f"xhs_u3_then_u2_failed:{subtitle_invalid}",
                "u3_public_url": public_url,
            }
        )

    # --- Phase 2: one U2 batch over all bridged entries ----------------------
    # Empty-shaped bundle used when nothing survived the U3 bridge.
    batch_bundle = {
        "trace": [],
        "submitted": False,
        "completed": False,
        "mapped_count": 0,
        "unmapped_entries": [],
        "batch_progress": {},
    }
    if u2_entries:
        batch_bundle = _run_u2_batch_for_entries(
            batch_id=batch_id,
            entries=u2_entries,
            base_url=base_url,
            token=token,
            timeout_ms=timeout_ms,
            poll_interval_sec=poll_interval_sec,
            max_polls=max_polls,
            submit_max_retries=submit_max_retries,
            submit_backoff_ms=submit_backoff_ms,
            timeout_retry_enabled=timeout_retry_enabled,
            timeout_retry_max_retries=timeout_retry_max_retries,
        )
        # Defensive: only extend with the batch trace when it is really a list.
        trace.extend(batch_bundle.get("trace") if isinstance(batch_bundle.get("trace"), list) else [])

    # Works U2 could not map back get their queued fallback reason applied.
    unmapped_entries = list(batch_bundle.get("unmapped_entries") or [])
    for entry in unmapped_entries:
        work = entry.get("work")
        if not isinstance(work, dict):
            continue
        work.update(_fallback_none_result(normalize_text(entry.get("fallback_reason")) or "xhs_u3_then_u2_failed"))

    return {
        "trace": trace,
        "submitted": bool(batch_bundle.get("submitted")),
        "completed": bool(batch_bundle.get("completed")),
        "mapped_count": int(batch_bundle.get("mapped_count") or 0),
        "unmapped_count": len(unmapped_entries),
        "u3_ready_count": len(u2_entries),
        "u3_failed_count": u3_failed_count,
    }
594
+
595
+
465
596
  def _mark_text_work_ready(work: Dict[str, Any]) -> Dict[str, Any]:
466
597
  caption_raw = normalize_text(work.get("caption_raw"))
467
598
  return {
@@ -669,6 +800,8 @@ def enrich_author_home_asr(
669
800
  default=DEFAULT_BATCH_SUBMIT_SIZE,
670
801
  hard_limit=MAX_BATCH_SUBMIT_SIZE,
671
802
  )
803
+ if platform == "xiaohongshu":
804
+ effective_batch = min(effective_batch, XHS_U3_U2_BATCH_SIZE)
672
805
 
673
806
  trace.append(
674
807
  {
@@ -738,6 +871,7 @@ def enrich_author_home_asr(
738
871
  )
739
872
 
740
873
  batch_u2_entries: List[Dict[str, Any]] = []
874
+ batch_xhs_u3_entries: List[Dict[str, Any]] = []
741
875
 
742
876
  for work in batch:
743
877
  work_id = normalize_text(work.get("platform_work_id"))
@@ -834,7 +968,6 @@ def enrich_author_home_asr(
834
968
  "invalid_reasons": subtitle_probe.get("invalid_reasons"),
835
969
  }
836
970
  )
837
-
838
971
  gate = _evaluate_u2_gate(work, platform=platform)
839
972
  trace.append(
840
973
  {
@@ -848,19 +981,17 @@ def enrich_author_home_asr(
848
981
  "is_video": gate.get("is_video"),
849
982
  "duration_ms": gate.get("duration_ms"),
850
983
  "video_download_url_present": gate.get("video_download_url_present"),
851
- "subtitle_invalid": subtitle_invalid,
852
984
  }
853
985
  )
854
-
855
986
  if not gate.get("can_u2"):
856
987
  work.update(_fallback_none_result(str(gate.get("gate_reason") or "skip:unknown")))
857
988
  else:
858
- batch_u2_entries.append(
989
+ batch_xhs_u3_entries.append(
859
990
  {
860
991
  "work": work,
861
992
  "work_id": work_id,
862
993
  "video_download_url": gate.get("video_download_url"),
863
- "fallback_reason": f"xhs_subtitle_invalid:{subtitle_invalid}",
994
+ "subtitle_invalid": subtitle_invalid,
864
995
  }
865
996
  )
866
997
 
@@ -890,6 +1021,28 @@ def enrich_author_home_asr(
890
1021
  fallback_entries = list(batch_bundle.get("unmapped_entries") or [])
891
1022
  batch_unmapped_count += len(fallback_entries)
892
1023
 
1024
+ if batch_xhs_u3_entries:
1025
+ xhs_batch_bundle = _run_xhs_u3_then_u2_batch_for_entries(
1026
+ batch_id=batch_id,
1027
+ entries=batch_xhs_u3_entries,
1028
+ base_url=base_url,
1029
+ token=token,
1030
+ timeout_ms=timeout_ms,
1031
+ poll_interval_sec=poll_interval_sec,
1032
+ max_polls=max_polls,
1033
+ submit_max_retries=max(0, int(xhs_submit_max_retries)),
1034
+ submit_backoff_ms=max(0, int(xhs_submit_backoff_ms)),
1035
+ timeout_retry_enabled=timeout_retry_enabled,
1036
+ timeout_retry_max_retries=max(0, int(timeout_retry_max_retries)),
1037
+ )
1038
+ trace.extend(xhs_batch_bundle.get("trace") if isinstance(xhs_batch_bundle.get("trace"), list) else [])
1039
+ if xhs_batch_bundle.get("submitted"):
1040
+ submitted_batches += 1
1041
+ if xhs_batch_bundle.get("completed"):
1042
+ completed_batches += 1
1043
+ batch_mapped_count += int(xhs_batch_bundle.get("mapped_count") or 0)
1044
+ batch_unmapped_count += int(xhs_batch_bundle.get("unmapped_count") or 0)
1045
+
893
1046
  for fallback_entry in fallback_entries:
894
1047
  fallback_work = fallback_entry.get("work")
895
1048
  if not isinstance(fallback_work, dict):
@@ -217,7 +217,8 @@ def _extract_xhs_subtitle_inline(item: Dict[str, Any]) -> str:
217
217
 
218
218
 
219
219
  def _extract_xhs_subtitle_urls(item: Dict[str, Any]) -> List[str]:
220
- return _pick_http_urls(
220
+ preferred_language_keys = ("source", "zh-CN", "zh_CN", "zh-Hans", "zh", "zh-Hant", "zh-TW", "zh-HK")
221
+ urls = _pick_http_urls(
221
222
  item,
222
223
  [
223
224
  "subtitle_url",
@@ -233,6 +234,42 @@ def _extract_xhs_subtitle_urls(item: Dict[str, Any]) -> List[str]:
233
234
  ],
234
235
  )
235
236
 
237
+ def _append(value: Any) -> None:
238
+ text = _t(value)
239
+ if text.startswith("http://") or text.startswith("https://"):
240
+ urls.append(text)
241
+
242
+ def _walk(node: Any) -> None:
243
+ if isinstance(node, dict):
244
+ for key in preferred_language_keys:
245
+ if key in node and isinstance(node.get(key), (dict, list)):
246
+ _walk(node.get(key))
247
+ _append(node.get("url"))
248
+ _append(node.get("src"))
249
+ for key, value in node.items():
250
+ if key in preferred_language_keys:
251
+ continue
252
+ if isinstance(value, (dict, list)):
253
+ _walk(value)
254
+ elif isinstance(node, list):
255
+ for item in node:
256
+ if isinstance(item, (dict, list)):
257
+ _walk(item)
258
+ else:
259
+ _append(item)
260
+
261
+ for container in deep_find_all(item, ["subtitles", "subtitle_list", "subtitleList"]):
262
+ _walk(container)
263
+
264
+ deduped: List[str] = []
265
+ seen = set()
266
+ for url in urls:
267
+ if url in seen:
268
+ continue
269
+ seen.add(url)
270
+ deduped.append(url)
271
+ return deduped
272
+
236
273
 
237
274
  def _extract_xhs_work_modality(item: Dict[str, Any], *, video_download_url: str, subtitle_inline: str) -> str:
238
275
  content_type_raw = _t(_first(item, ["type", "note_type", "model_type"])).lower()
@@ -27,7 +27,7 @@ from datetime import datetime
27
27
  from pathlib import Path
28
28
  from typing import Any, Dict, List, Optional, Tuple
29
29
 
30
- from scripts.core.asr_pipeline import derive_asr_clean_text, run_u2_asr_candidates_with_timeout_retry
30
+ from scripts.core.asr_pipeline import derive_asr_clean_text, run_u3_then_u2_asr_candidates_with_timeout_retry
31
31
  from scripts.core.config_loader import config_get, load_tikomni_config
32
32
  from scripts.core.progress_report import ProgressReporter, build_progress_reporter
33
33
  from scripts.core.extract_pipeline import build_api_trace, resolve_trace_error_context
@@ -576,6 +576,14 @@ def _extract_xhs_metadata(
576
576
  ["noteList", "publishTime"],
577
577
  ["noteList", "time"],
578
578
  ["noteList", "timestamp"],
579
+ ["data", "data", "create_time_sec"],
580
+ ["data", "data", "create_time"],
581
+ ["data", "data", "createTime"],
582
+ ["data", "data", "publish_time_sec"],
583
+ ["data", "data", "publish_time"],
584
+ ["data", "data", "publishTime"],
585
+ ["data", "data", "time"],
586
+ ["data", "data", "timestamp"],
579
587
  ]
580
588
  create_time_sec, create_time_source = _pick_int_with_source_from_paths(
581
589
  payload,
@@ -930,12 +938,39 @@ def _fetch_note_info(
930
938
 
931
939
  def _extract_subtitle_urls(payload: Any) -> List[str]:
932
940
  urls: List[str] = []
941
+ preferred_language_keys = ("source", "zh-CN", "zh_CN", "zh-Hans", "zh", "zh-Hant", "zh-TW", "zh-HK")
942
+
943
+ def _append_url(value: Any) -> None:
944
+ if isinstance(value, str):
945
+ text = value.strip()
946
+ if text.startswith("http://") or text.startswith("https://"):
947
+ urls.append(text)
948
+
949
+ def _walk_subtitle_container(node: Any) -> None:
950
+ if isinstance(node, dict):
951
+ for key in preferred_language_keys:
952
+ if key in node and isinstance(node.get(key), (dict, list)):
953
+ _walk_subtitle_container(node.get(key))
954
+ _append_url(node.get("url"))
955
+ _append_url(node.get("src"))
956
+ for key, value in node.items():
957
+ if key in preferred_language_keys:
958
+ continue
959
+ if isinstance(value, (dict, list)):
960
+ _walk_subtitle_container(value)
961
+ elif isinstance(node, list):
962
+ for item in node:
963
+ if isinstance(item, (dict, list)):
964
+ _walk_subtitle_container(item)
965
+ else:
966
+ _append_url(item)
967
+
933
968
  for key in ["subtitle_url", "subtitleUrl", "srt_url", "srtUrl", "vtt_url", "vttUrl"]:
934
969
  for value in deep_find_all(payload, [key]):
935
- if isinstance(value, str):
936
- text = value.strip()
937
- if text.startswith("http://") or text.startswith("https://"):
938
- urls.append(text)
970
+ _append_url(value)
971
+
972
+ for container in deep_find_all(payload, ["subtitles", "subtitle_list", "subtitleList"]):
973
+ _walk_subtitle_container(container)
939
974
 
940
975
  unique: List[str] = []
941
976
  seen = set()
@@ -1672,8 +1707,9 @@ def run_xiaohongshu_extract(
1672
1707
 
1673
1708
  subtitle_inline_text = "" if force_u2_fallback else _extract_subtitle_inline_text(effective_payload)
1674
1709
  subtitle_urls = [] if force_u2_fallback else _extract_subtitle_urls(effective_payload)
1675
- subtitle_url_text = "" if force_u2_fallback else _fetch_subtitle_text(subtitle_urls, runtime["timeout_ms"])
1676
- subtitle_text = subtitle_inline_text or subtitle_url_text
1710
+ subtitle_text = subtitle_inline_text
1711
+ if not subtitle_text and subtitle_urls:
1712
+ subtitle_text = _fetch_subtitle_text(subtitle_urls, runtime["timeout_ms"])
1677
1713
 
1678
1714
  app_video_candidates = _extract_video_candidates(note_response.get("data"))
1679
1715
  app_image_candidates, image_quality_strategy = _extract_image_candidates_with_strategy(note_response.get("data"))
@@ -1869,11 +1905,11 @@ def run_xiaohongshu_extract(
1869
1905
  if progress is not None:
1870
1906
  progress.progress(
1871
1907
  stage="note.u2",
1872
- message="starting xiaohongshu u2 flow",
1908
+ message="starting xiaohongshu u3->u2 flow",
1873
1909
  data={"candidate_count": len(u2_candidates), "timeout_ms": u2_timeout_ms},
1874
1910
  )
1875
1911
  u2_started_at = time.perf_counter()
1876
- u2_bundle = run_u2_asr_candidates_with_timeout_retry(
1912
+ u2_bundle = run_u3_then_u2_asr_candidates_with_timeout_retry(
1877
1913
  base_url=runtime["base_url"],
1878
1914
  token=runtime["token"],
1879
1915
  timeout_ms=u2_timeout_ms,
@@ -1914,16 +1950,18 @@ def run_xiaohongshu_extract(
1914
1950
 
1915
1951
  trace.append(
1916
1952
  {
1917
- "step": "u2_asr_timeout_retry",
1953
+ "step": "u3_then_u2_asr",
1918
1954
  "endpoint": "/api/u2/v1/services/audio/asr/transcription + /api/u2/v1/tasks/{task_id}",
1919
1955
  "selected_video_url": selected_video_url,
1920
1956
  "selected_video_candidates": u2_candidates,
1957
+ "chosen_public_url": u2_bundle.get("chosen_public_url"),
1921
1958
  "candidate_attempts": u2_bundle.get("candidate_attempts", []),
1922
1959
  "submit_retries_config": {
1923
1960
  "u2_submit_max_retries": max(0, int(u2_submit_max_retries)),
1924
1961
  "u2_submit_backoff_ms": max(0, int(u2_submit_backoff_ms)),
1925
1962
  },
1926
1963
  "timeout_retry": u2_bundle.get("timeout_retry", {}),
1964
+ "u3_bridge": u2_bundle.get("u3_bridge", {}),
1927
1965
  "u3_fallback": u2_bundle.get("u3_fallback", {}),
1928
1966
  "rounds": u2_bundle.get("rounds", []),
1929
1967
  "final_task_id": poll_result.get("task_id") or task_id,
@@ -1934,7 +1972,7 @@ def run_xiaohongshu_extract(
1934
1972
  if progress is not None:
1935
1973
  (progress.done if poll_result.get("ok") else progress.failed)(
1936
1974
  stage="note.u2",
1937
- message="xiaohongshu u2 flow finished" if poll_result.get("ok") else "xiaohongshu u2 flow failed",
1975
+ message="xiaohongshu u3->u2 flow finished" if poll_result.get("ok") else "xiaohongshu u3->u2 flow failed",
1938
1976
  data={
1939
1977
  "task_id": poll_result.get("task_id") or task_id,
1940
1978
  "task_status": poll_result.get("task_status"),
@@ -246,6 +246,9 @@ def build_work_fact_card(payload: Dict[str, Any], platform: Optional[str] = None
246
246
  "caption_raw": caption_raw,
247
247
  "subtitle_raw": _safe_text(payload.get("subtitle_raw")),
248
248
  "work_modality": _resolve_work_modality(payload),
249
+ "publish_time": payload.get("publish_time"),
250
+ "create_time_sec": payload.get("create_time_sec"),
251
+ "publish_time_source": _safe_text(payload.get("publish_time_source")),
249
252
  "published_date": _resolve_published_date(payload),
250
253
  "digg_count": _safe_int(payload.get("digg_count")),
251
254
  "comment_count": _safe_int(payload.get("comment_count")),