@tikomni/skills 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tikomni/skills",
3
- "version": "0.1.6",
3
+ "version": "0.1.8",
4
4
  "description": "TikOmni skill installer CLI for structured social media crawling in Codex, Claude Code, and OpenClaw",
5
5
  "license": "MIT",
6
6
  "homepage": "https://github.com/mark-ly-wang/TikOmni-Skills#readme",
@@ -32,7 +32,8 @@ description: Use this skill when the user asks about social media links, posts,
32
32
  - 先看用户给的对象是什么,再选固定 pipeline 或 MCP 工具。
33
33
  - 优先返回事实字段,不补充主观分析。
34
34
  - 视频文本优先使用平台原生字幕;拿不到再走 ASR。
35
- - 默认输出以结构化 JSON 为主。
35
+ - 默认输出结构化 JSON 和 Markdown,并主动落库。
36
+ - 当选用固定 pipeline 时,默认执行抓取并落库(强制)。
36
37
 
37
38
  ## Workflow
38
39
 
@@ -1014,6 +1014,159 @@ def run_u2_asr_candidates_with_timeout_retry(
1014
1014
  return final_bundle
1015
1015
 
1016
1016
 
1017
def run_u3_then_u2_asr_candidates_with_timeout_retry(
    *,
    base_url: str,
    token: str,
    timeout_ms: int,
    candidates: List[str],
    submit_max_retries: int,
    submit_backoff_ms: int,
    poll_interval_sec: float,
    max_polls: int,
    timeout_retry_enabled: bool = True,
    timeout_retry_max_retries: int = 3,
    pending_timeout_sec: int = DEFAULT_U2_PENDING_TIMEOUT_SEC,
    progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
) -> Dict[str, Any]:
    """Bridge each media candidate through U3, then run U2 ASR on the result.

    Candidates are normalized with ``normalize_media_candidates`` and tried in
    order. Each candidate accepted by ``is_valid_u2_media_candidate`` is sent
    to ``run_u3_public_url_fallback``; when that yields a public URL, the URL
    is handed to ``run_u2_asr_with_timeout_retry`` (with its own U3 fallback
    disabled, since the bridge already ran).

    Iteration stops at the first U2 attempt that either succeeds or fails
    with a reason other than ``"INVALID_SOURCE_URL"``. Invalid candidates,
    U3 failures and ``"INVALID_SOURCE_URL"`` results move on to the next
    candidate.

    Args:
        base_url: Forwarded to the U3 and U2 helpers.
        token: Forwarded to the U3 and U2 helpers.
        timeout_ms: Forwarded to the U3 and U2 helpers.
        candidates: Raw media URL candidates, tried in normalized order.
        submit_max_retries: Forwarded to the U2 helper.
        submit_backoff_ms: Forwarded to the U2 helper.
        poll_interval_sec: Forwarded to the U2 helper.
        max_polls: Forwarded to the U2 helper.
        timeout_retry_enabled: Forwarded to the U2 helper and echoed in the
            placeholder bundle.
        timeout_retry_max_retries: Forwarded unchanged to the U2 helper; the
            placeholder bundle reports it clamped to the range 0..3.
        pending_timeout_sec: Forwarded to the U2 helper.
        progress_callback: Forwarded to the U2 helper.

    Returns:
        The bundle for the last candidate that reached U3: either a shallow
        copy of the U2 bundle or, when the U3 bridge failed, a placeholder
        bundle carrying the U3 error. Both carry ``"u3_bridge"``. When no
        candidate reached U3, a ``"no_candidates"`` placeholder is returned.
        ``"candidate_attempts"``, ``"chosen_candidate"``,
        ``"chosen_public_url"`` and ``"normalized_candidates"`` are always
        added. The ``chosen_*`` values are ``None`` unless the returned
        bundle comes from a U2 run.
    """
    normalized_candidates = normalize_media_candidates(candidates)
    attempts: List[Dict[str, Any]] = []
    # Loop-invariant; only used for reporting in placeholder bundles.
    configured_max_retries = max(0, min(3, int(timeout_retry_max_retries)))

    def _placeholder_bundle(poll_result: Dict[str, Any]) -> Dict[str, Any]:
        # Shape of a bundle for which U2 was never submitted.
        return {
            "submit_bundle": {},
            "poll_result": poll_result,
            "rounds": [],
            "timeout_retry": {
                "enabled": bool(timeout_retry_enabled),
                "configured_max_retries": configured_max_retries,
                "triggered": False,
                "result": "not_triggered",
            },
            "u3_fallback": {
                "enabled": False,
                "triggered": False,
                "ok": False,
                "result": "not_triggered",
                "public_url": "",
                "trace": [],
            },
        }

    final_bundle: Dict[str, Any] = _placeholder_bundle(
        {"ok": False, "task_status": "UNKNOWN", "error_reason": "no_candidates"}
    )
    chosen_url: Optional[str] = None
    chosen_public_url: Optional[str] = None

    for index, candidate in enumerate(normalized_candidates, start=1):
        if not is_valid_u2_media_candidate(candidate):
            attempts.append(
                {
                    "index": index,
                    "candidate": candidate,
                    "valid": False,
                    "result": "skipped_non_media_candidate",
                }
            )
            continue

        u3_result = run_u3_public_url_fallback(
            base_url=base_url,
            token=token,
            timeout_ms=timeout_ms,
            source_url=candidate,
        )
        u3_ok = bool(u3_result.get("ok"))
        public_url = normalize_media_url(u3_result.get("public_url"))
        u3_bundle = {
            "enabled": True,
            "triggered": True,
            "ok": u3_ok,
            "result": "u3_completed" if u3_ok else "u3_failed",
            "public_url": public_url,
            "request_id": u3_result.get("request_id"),
            "error_reason": u3_result.get("error_reason"),
            # A present-but-None trace must not break list(...) below.
            "trace": u3_result.get("trace") or [],
        }

        attempt: Dict[str, Any] = {
            "index": index,
            "candidate": candidate,
            "valid": True,
            "u3_bridge": u3_bundle,
        }
        attempts.append(attempt)

        if not u3_ok or not public_url:
            final_bundle = _placeholder_bundle(
                {
                    "ok": False,
                    "task_status": "UNKNOWN",
                    "error_reason": u3_bundle.get("error_reason") or "u3_bridge_failed",
                    "request_id": u3_bundle.get("request_id"),
                    "trace": list(u3_bundle["trace"]),
                }
            )
            final_bundle["u3_bridge"] = u3_bundle
            # The bundle now describes a candidate that never reached U2, so
            # do not keep reporting an earlier candidate as the chosen one.
            chosen_url = None
            chosen_public_url = None
            continue

        bundle = run_u2_asr_with_timeout_retry(
            base_url=base_url,
            token=token,
            timeout_ms=timeout_ms,
            video_url=str(public_url),
            submit_max_retries=submit_max_retries,
            submit_backoff_ms=submit_backoff_ms,
            poll_interval_sec=poll_interval_sec,
            max_polls=max_polls,
            timeout_retry_enabled=timeout_retry_enabled,
            timeout_retry_max_retries=timeout_retry_max_retries,
            pending_timeout_sec=pending_timeout_sec,
            u3_fallback_enabled=False,
            progress_callback=progress_callback,
        )
        poll_result = bundle.get("poll_result", {})
        error_reason = str(poll_result.get("error_reason") or "")
        ok = bool(poll_result.get("ok"))

        attempt.update(
            {
                "ok": ok,
                "error_reason": error_reason,
                "task_status": poll_result.get("task_status"),
                "u2_public_url": public_url,
            }
        )

        final_bundle = dict(bundle)
        final_bundle["u3_bridge"] = u3_bundle
        chosen_url = candidate
        chosen_public_url = str(public_url)

        # Only an invalid-source rejection justifies trying another candidate;
        # success or any other failure ends the search.
        if not ok and error_reason == "INVALID_SOURCE_URL":
            continue
        break

    final_bundle["candidate_attempts"] = attempts
    final_bundle["chosen_candidate"] = chosen_url
    final_bundle["chosen_public_url"] = chosen_public_url
    final_bundle["normalized_candidates"] = normalized_candidates
    return final_bundle
1168
+
1169
+
1017
1170
  def run_u2_asr_batch_with_timeout_retry(
1018
1171
  *,
1019
1172
  base_url: str,
@@ -0,0 +1,151 @@
1
+ #!/usr/bin/env python3
2
+ """Shared Douyin metadata extraction helpers."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import re
7
+ from typing import Any, Dict, Optional
8
+
9
+ from scripts.core.tikomni_common import normalize_text
10
+
11
# Placeholder strings that normalize_douyin_author_handle rejects as a handle.
# Entries are lower-case because the lookup lower-cases the candidate first.
INVALID_AUTHOR_HANDLE_VALUES = {"0", "unknown", "none", "null", "nil", "na", "n/a"}
# Matches text with an optional leading "@" that ends in one of the listed
# "...的原声" ("original sound") suffixes; used by
# looks_like_douyin_music_title to spot audio-track titles.
MUSIC_TITLE_PATTERN = re.compile(r"^@?.+?(?:创作的原声|作品使用的原声|的原声)$")
13
+
14
+
15
+ def _safe_int(value: Any) -> Optional[int]:
16
+ if value is None:
17
+ return None
18
+ if isinstance(value, bool):
19
+ return int(value)
20
+ if isinstance(value, int):
21
+ return value
22
+ if isinstance(value, float):
23
+ return int(value)
24
+
25
+ text = normalize_text(value)
26
+ if not text:
27
+ return None
28
+ try:
29
+ return int(float(text.replace(",", "")))
30
+ except Exception:
31
+ return None
32
+
33
+
34
def normalize_douyin_author_handle(*values: Any) -> str:
    """Return the first usable author handle among *values*.

    Each value is passed through ``normalize_text``; empty results and
    placeholders listed in ``INVALID_AUTHOR_HANDLE_VALUES`` (compared
    lower-cased) are skipped. Returns "" when nothing qualifies.
    """
    for raw in values:
        handle = normalize_text(raw)
        if handle and handle.lower() not in INVALID_AUTHOR_HANDLE_VALUES:
            return handle
    return ""
43
+
44
+
45
def looks_like_douyin_music_title(value: Any) -> bool:
    """Tell whether *value*, once normalized, matches ``MUSIC_TITLE_PATTERN``.

    Empty normalized text is never considered a music title.
    """
    normalized = normalize_text(value)
    if normalized:
        return MUSIC_TITLE_PATTERN.match(normalized) is not None
    return False
50
+
51
+
52
def extract_douyin_caption(item: Dict[str, Any]) -> str:
    """Return the first non-empty caption-like field of *item*.

    Fields are checked in priority order (``desc`` first, ``title`` last)
    after ``normalize_text``. Returns "" for non-dict input or when every
    field is empty.
    """
    if not isinstance(item, dict):
        return ""
    caption_keys = ("desc", "caption", "content", "item_title", "preview_title", "title")
    # Lazy generator: normalization stops at the first non-empty field.
    normalized_fields = (normalize_text(item.get(name)) for name in caption_keys)
    return next((field for field in normalized_fields if field), "")
60
+
61
+
62
def title_from_douyin_caption(caption: Any) -> str:
    """Derive a title from a caption by cutting it at the first hashtag.

    The caption is normalized, then everything from the first ``#tag``
    (including whitespace before it) onward is dropped. When nothing is left
    before the hashtag, the full normalized caption is returned instead.
    Empty captions give "".
    """
    normalized = normalize_text(caption)
    if not normalized:
        return ""

    before_hashtag = re.split(r"\s*#\S+", normalized, maxsplit=1)[0].strip()
    return before_hashtag if before_hashtag else normalized
71
+
72
+
73
def extract_douyin_title(item: Dict[str, Any]) -> str:
    """Pick the work title from *item*, falling back to its caption.

    Only title-like fields on the work object itself are read; a nested
    ``music.title`` is an audio title, not the work title. A field that
    looks like a music title is skipped whenever a caption-derived title is
    available. Returns "" for non-dict input.
    """
    if not isinstance(item, dict):
        return ""

    fallback_title = title_from_douyin_caption(extract_douyin_caption(item))
    for field_name in ("item_title", "preview_title", "title"):
        value = normalize_text(item.get(field_name))
        if value and not (looks_like_douyin_music_title(value) and fallback_title):
            return value
    return fallback_title
88
+
89
+
90
def extract_douyin_author(item: Dict[str, Any]) -> Dict[str, Optional[str]]:
    """Collect author identity fields from a Douyin work object.

    Args:
        item: The work object. A non-dict value is treated as an empty
            object, matching the guards in ``extract_douyin_caption`` and
            ``extract_douyin_title``.

    Returns:
        A dict with the keys ``author_handle``, ``platform_author_id``,
        ``author_platform_id``, ``douyin_sec_uid``,
        ``douyin_aweme_author_id``, ``unique_id``, ``nickname`` and
        ``signature``. Every value is normalized text or None when missing.
    """
    if not isinstance(item, dict):
        item = {}
    author = item.get("author")
    if not isinstance(author, dict):
        author = {}

    item_author_user_id = normalize_text(item.get("author_user_id"))

    # Prefer ids on the nested author object, then the work-level id.
    author_platform_id = (
        normalize_text(author.get("uid"))
        or normalize_text(author.get("id"))
        or item_author_user_id
    )
    # The nickname is the last-resort handle when no id-like field is usable.
    author_handle = normalize_douyin_author_handle(
        author.get("unique_id"),
        author.get("short_id"),
        author.get("douyin_id"),
        author.get("display_id"),
        author.get("nickname"),
    )
    douyin_sec_uid = normalize_text(author.get("sec_uid"))
    douyin_aweme_author_id = item_author_user_id or author_platform_id

    return {
        "author_handle": author_handle or None,
        # Both spellings are emitted with the same value.
        "platform_author_id": author_platform_id or None,
        "author_platform_id": author_platform_id or None,
        "douyin_sec_uid": douyin_sec_uid or None,
        "douyin_aweme_author_id": douyin_aweme_author_id or None,
        "unique_id": normalize_text(author.get("unique_id")) or None,
        "nickname": normalize_text(author.get("nickname")) or None,
        "signature": normalize_text(author.get("signature")) or None,
    }
118
+
119
+
120
def extract_douyin_metrics(item: Dict[str, Any]) -> Dict[str, Optional[int]]:
    """Collect engagement counters from a Douyin work object.

    Each counter is looked up under its accepted key names, checking the
    nested ``statistics`` dict before the work object itself for every key.

    Args:
        item: The work object. A non-dict value is treated as an empty
            object, matching the guards in ``extract_douyin_caption`` and
            ``extract_douyin_title``.

    Returns:
        A dict with ``digg_count``, ``comment_count``, ``collect_count`` and
        ``share_count`` (0 when absent) and ``play_count`` (None when
        absent). A non-positive ``play_count`` is replaced with None when
        any other counter is positive.
    """
    if not isinstance(item, dict):
        item = {}
    statistics = item.get("statistics")
    if not isinstance(statistics, dict):
        statistics = {}

    def metric(*keys: str, default: Optional[int] = 0) -> Optional[int]:
        for key in keys:
            for source in (statistics, item):
                value = _safe_int(source.get(key))
                if value is not None:
                    return value
        return default

    metrics = {
        "digg_count": metric("digg_count", "like_count"),
        "comment_count": metric("comment_count"),
        "collect_count": metric("collect_count"),
        "share_count": metric("share_count", "forward_count"),
        "play_count": metric("play_count", "view_count", default=None),
    }

    play_count = metrics["play_count"]
    engagement_floor = max(
        int(metrics[name] or 0)
        for name in ("digg_count", "comment_count", "collect_count", "share_count")
    )
    # A work with interactions cannot have zero plays; report it as unknown.
    if play_count is not None and int(play_count) <= 0 and engagement_floor > 0:
        metrics["play_count"] = None

    return metrics