@tikomni/skills 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/skills/social-media-crawl/SKILL.md +2 -1
- package/skills/social-media-crawl/scripts/core/asr_pipeline.py +153 -0
- package/skills/social-media-crawl/scripts/pipelines/douyin_metadata.py +151 -0
- package/skills/social-media-crawl/scripts/pipelines/home_asr.py +207 -51
- package/skills/social-media-crawl/scripts/pipelines/homepage_collectors.py +5 -11
- package/skills/social-media-crawl/scripts/pipelines/input_contracts.py +318 -0
- package/skills/social-media-crawl/scripts/pipelines/media_url_rules.py +86 -0
- package/skills/social-media-crawl/scripts/pipelines/platform_adapters.py +115 -31
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_creator_home.py +84 -6
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_single_work.py +79 -73
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_creator_home.py +84 -6
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_single_work.py +135 -71
- package/skills/social-media-crawl/scripts/writers/write_work_fact_card.py +8 -3
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tikomni/skills",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.8",
|
|
4
4
|
"description": "TikOmni skill installer CLI for structured social media crawling in Codex, Claude Code, and OpenClaw",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"homepage": "https://github.com/mark-ly-wang/TikOmni-Skills#readme",
|
|
@@ -32,7 +32,8 @@ description: Use this skill when the user asks about social media links, posts,
|
|
|
32
32
|
- 先看用户给的对象是什么,再选固定 pipeline 或 MCP 工具。
|
|
33
33
|
- 优先返回事实字段,不补充主观分析。
|
|
34
34
|
- 视频文本优先使用平台原生字幕;拿不到再走 ASR。
|
|
35
|
-
- 默认输出以结构化 JSON
|
|
35
|
+
- 默认输出以结构化 JSON 和Markdown并主动落库。
|
|
36
|
+
- 当固定pipeline时,默认执行抓取并落库(强制)。
|
|
36
37
|
|
|
37
38
|
## Workflow
|
|
38
39
|
|
|
@@ -1014,6 +1014,159 @@ def run_u2_asr_candidates_with_timeout_retry(
|
|
|
1014
1014
|
return final_bundle
|
|
1015
1015
|
|
|
1016
1016
|
|
|
1017
|
+
def run_u3_then_u2_asr_candidates_with_timeout_retry(
    *,
    base_url: str,
    token: str,
    timeout_ms: int,
    candidates: List[str],
    submit_max_retries: int,
    submit_backoff_ms: int,
    poll_interval_sec: float,
    max_polls: int,
    timeout_retry_enabled: bool = True,
    timeout_retry_max_retries: int = 3,
    pending_timeout_sec: int = DEFAULT_U2_PENDING_TIMEOUT_SEC,
    progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
) -> Dict[str, Any]:
    """Run ASR over *candidates*, bridging each through U3 before U2.

    For every valid media candidate, first call ``run_u3_public_url_fallback``
    to obtain a public URL, then submit that URL to U2 ASR via
    ``run_u2_asr_with_timeout_retry`` (with U2's own internal U3 fallback
    disabled, since the bridge already happened here).

    Iteration stops at the first candidate whose U2 poll succeeds.  A U2
    failure with error ``INVALID_SOURCE_URL`` moves on to the next
    candidate; any other U2 failure aborts the loop.  A U3 bridge failure
    records a synthetic failure bundle and tries the next candidate.

    Returns the last U2 result bundle (or a synthetic failure bundle when
    no candidate reached U2), augmented with ``candidate_attempts``,
    ``chosen_candidate``, ``chosen_public_url`` and
    ``normalized_candidates`` keys, plus a ``u3_bridge`` sub-dict when a
    bridge was attempted.
    """
    normalized_candidates = normalize_media_candidates(candidates)
    attempts: List[Dict[str, Any]] = []

    # Default result used when the candidate list is empty or every
    # candidate is skipped before producing a U2 bundle.
    final_bundle: Dict[str, Any] = {
        "submit_bundle": {},
        "poll_result": {"ok": False, "task_status": "UNKNOWN", "error_reason": "no_candidates"},
        "rounds": [],
        "timeout_retry": {
            "enabled": bool(timeout_retry_enabled),
            # Retry budget is clamped to the 0..3 range.
            "configured_max_retries": max(0, min(3, int(timeout_retry_max_retries))),
            "triggered": False,
            "result": "not_triggered",
        },
        "u3_fallback": {
            "enabled": False,
            "triggered": False,
            "ok": False,
            "result": "not_triggered",
            "public_url": "",
            "trace": [],
        },
    }
    chosen_url: Optional[str] = None
    chosen_public_url: Optional[str] = None

    for index, candidate in enumerate(normalized_candidates, start=1):
        valid = is_valid_u2_media_candidate(candidate)
        if not valid:
            # Non-media candidates are recorded but never submitted.
            attempts.append(
                {
                    "index": index,
                    "candidate": candidate,
                    "valid": False,
                    "result": "skipped_non_media_candidate",
                }
            )
            continue

        # Step 1: bridge the candidate through U3 to get a public URL.
        u3_result = run_u3_public_url_fallback(
            base_url=base_url,
            token=token,
            timeout_ms=timeout_ms,
            source_url=candidate,
        )
        u3_bundle = {
            "enabled": True,
            "triggered": True,
            "ok": bool(u3_result.get("ok")),
            "result": "u3_completed" if u3_result.get("ok") else "u3_failed",
            "public_url": normalize_media_url(u3_result.get("public_url")),
            "request_id": u3_result.get("request_id"),
            "error_reason": u3_result.get("error_reason"),
            "trace": u3_result.get("trace", []),
        }

        attempts.append(
            {
                "index": index,
                "candidate": candidate,
                "valid": True,
                "u3_bridge": u3_bundle,
            }
        )

        if not u3_bundle.get("ok") or not u3_bundle.get("public_url"):
            # U3 bridge failed: synthesize a failure bundle carrying the
            # U3 error details, then try the next candidate.
            final_bundle = {
                "submit_bundle": {},
                "poll_result": {
                    "ok": False,
                    "task_status": "UNKNOWN",
                    "error_reason": u3_bundle.get("error_reason") or "u3_bridge_failed",
                    "request_id": u3_bundle.get("request_id"),
                    "trace": list(u3_bundle.get("trace", [])),
                },
                "rounds": [],
                "timeout_retry": {
                    "enabled": bool(timeout_retry_enabled),
                    "configured_max_retries": max(0, min(3, int(timeout_retry_max_retries))),
                    "triggered": False,
                    "result": "not_triggered",
                },
                "u3_fallback": {
                    "enabled": False,
                    "triggered": False,
                    "ok": False,
                    "result": "not_triggered",
                    "public_url": "",
                    "trace": [],
                },
                "u3_bridge": u3_bundle,
            }
            continue

        # Step 2: submit the bridged public URL to U2 ASR.  U2's own U3
        # fallback is disabled because the bridge was already performed.
        bundle = run_u2_asr_with_timeout_retry(
            base_url=base_url,
            token=token,
            timeout_ms=timeout_ms,
            video_url=str(u3_bundle.get("public_url")),
            submit_max_retries=submit_max_retries,
            submit_backoff_ms=submit_backoff_ms,
            poll_interval_sec=poll_interval_sec,
            max_polls=max_polls,
            timeout_retry_enabled=timeout_retry_enabled,
            timeout_retry_max_retries=timeout_retry_max_retries,
            pending_timeout_sec=pending_timeout_sec,
            u3_fallback_enabled=False,
            progress_callback=progress_callback,
        )
        poll_result = bundle.get("poll_result", {})
        error_reason = str(poll_result.get("error_reason") or "")
        ok = bool(poll_result.get("ok"))

        # Enrich the attempt record with the U2 outcome.
        attempts[-1].update(
            {
                "ok": ok,
                "error_reason": error_reason,
                "task_status": poll_result.get("task_status"),
                "u2_public_url": u3_bundle.get("public_url"),
            }
        )

        final_bundle = dict(bundle)
        final_bundle["u3_bridge"] = u3_bundle
        chosen_url = candidate
        chosen_public_url = str(u3_bundle.get("public_url") or "")
        if ok:
            break
        if error_reason == "INVALID_SOURCE_URL":
            # Bad source URL: the next candidate may still work.
            continue
        # Any other U2 failure is treated as terminal for this batch.
        break

    final_bundle["candidate_attempts"] = attempts
    final_bundle["chosen_candidate"] = chosen_url
    final_bundle["chosen_public_url"] = chosen_public_url
    final_bundle["normalized_candidates"] = normalized_candidates
    return final_bundle
|
|
1168
|
+
|
|
1169
|
+
|
|
1017
1170
|
def run_u2_asr_batch_with_timeout_retry(
|
|
1018
1171
|
*,
|
|
1019
1172
|
base_url: str,
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Shared Douyin metadata extraction helpers."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import re
|
|
7
|
+
from typing import Any, Dict, Optional
|
|
8
|
+
|
|
9
|
+
from scripts.core.tikomni_common import normalize_text
|
|
10
|
+
|
|
11
|
+
INVALID_AUTHOR_HANDLE_VALUES = {"0", "unknown", "none", "null", "nil", "na", "n/a"}
|
|
12
|
+
MUSIC_TITLE_PATTERN = re.compile(r"^@?.+?(?:创作的原声|作品使用的原声|的原声)$")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _safe_int(value: Any) -> Optional[int]:
|
|
16
|
+
if value is None:
|
|
17
|
+
return None
|
|
18
|
+
if isinstance(value, bool):
|
|
19
|
+
return int(value)
|
|
20
|
+
if isinstance(value, int):
|
|
21
|
+
return value
|
|
22
|
+
if isinstance(value, float):
|
|
23
|
+
return int(value)
|
|
24
|
+
|
|
25
|
+
text = normalize_text(value)
|
|
26
|
+
if not text:
|
|
27
|
+
return None
|
|
28
|
+
try:
|
|
29
|
+
return int(float(text.replace(",", "")))
|
|
30
|
+
except Exception:
|
|
31
|
+
return None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def normalize_douyin_author_handle(*values: Any) -> str:
    """Return the first candidate that normalizes to a usable handle, else ''."""
    for raw in values:
        handle = normalize_text(raw)
        # Skip empty results and known placeholder values ("0", "unknown", ...).
        if handle and handle.lower() not in INVALID_AUTHOR_HANDLE_VALUES:
            return handle
    return ""
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def looks_like_douyin_music_title(value: Any) -> bool:
    """True when *value* reads like a Douyin auto-generated sound/music title."""
    text = normalize_text(value)
    return bool(text) and MUSIC_TITLE_PATTERN.match(text) is not None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def extract_douyin_caption(item: Dict[str, Any]) -> str:
    """Return the first non-empty caption-like field from a Douyin work dict."""
    if not isinstance(item, dict):
        return ""
    caption_keys = ("desc", "caption", "content", "item_title", "preview_title", "title")
    return next(
        (text for text in (normalize_text(item.get(key)) for key in caption_keys) if text),
        "",
    )
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def title_from_douyin_caption(caption: Any) -> str:
    """Derive a title from a caption by cutting everything from the first hashtag."""
    text = normalize_text(caption)
    if not text:
        return ""
    before_hashtags = re.split(r"\s*#\S+", text, maxsplit=1)[0].strip()
    # A caption made of nothing but hashtags falls back to the full text.
    return before_hashtags or text
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def extract_douyin_title(item: Dict[str, Any]) -> str:
    """Pick the work title, avoiding sound titles when a caption-derived one exists.

    Only title-like fields on the work object itself are considered;
    nested ``music.title`` is an audio title, not the work title.
    """
    if not isinstance(item, dict):
        return ""

    fallback = title_from_douyin_caption(extract_douyin_caption(item))
    for field in ("item_title", "preview_title", "title"):
        candidate = normalize_text(item.get(field))
        # Prefer an explicit field unless it looks like a music title and a
        # caption-derived title is available instead.
        if candidate and not (looks_like_douyin_music_title(candidate) and fallback):
            return candidate
    return fallback
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def extract_douyin_author(item: Dict[str, Any]) -> Dict[str, Optional[str]]:
    """Extract author identity fields from a Douyin work dict.

    Every value is normalized text or ``None`` when absent/empty.
    """
    raw_author = item.get("author")
    author = raw_author if isinstance(raw_author, dict) else {}

    # Numeric platform id: prefer the author object, fall back to the
    # flattened item field.
    platform_id = (
        normalize_text(author.get("uid"))
        or normalize_text(author.get("id"))
        or normalize_text(item.get("author_user_id"))
    )
    # Handle: first non-placeholder candidate in priority order.
    handle = normalize_douyin_author_handle(
        author.get("unique_id"),
        author.get("short_id"),
        author.get("douyin_id"),
        author.get("display_id"),
        author.get("nickname"),
    )
    sec_uid = normalize_text(author.get("sec_uid"))
    aweme_author_id = normalize_text(item.get("author_user_id")) or platform_id

    return {
        "author_handle": handle or None,
        "platform_author_id": platform_id or None,
        "author_platform_id": platform_id or None,
        "douyin_sec_uid": sec_uid or None,
        "douyin_aweme_author_id": aweme_author_id or None,
        "unique_id": normalize_text(author.get("unique_id")) or None,
        "nickname": normalize_text(author.get("nickname")) or None,
        "signature": normalize_text(author.get("signature")) or None,
    }
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def extract_douyin_metrics(item: Dict[str, Any]) -> Dict[str, Optional[int]]:
    """Collect engagement counters from a Douyin work dict.

    Each counter is read from the nested ``statistics`` block first, then
    from the item's top level.  A play count of zero alongside non-zero
    engagement is treated as "unknown" (``None``) rather than a real zero.
    """
    raw_stats = item.get("statistics")
    stats = raw_stats if isinstance(raw_stats, dict) else {}

    def _first_metric(*keys: str, default: Optional[int] = 0) -> Optional[int]:
        # For each key, try statistics before the flattened item fields.
        for key in keys:
            for source in (stats, item):
                parsed = _safe_int(source.get(key))
                if parsed is not None:
                    return parsed
        return default

    metrics: Dict[str, Optional[int]] = {
        "digg_count": _first_metric("digg_count", "like_count"),
        "comment_count": _first_metric("comment_count"),
        "collect_count": _first_metric("collect_count"),
        "share_count": _first_metric("share_count", "forward_count"),
        "play_count": _first_metric("play_count", "view_count", default=None),
    }

    play_count = metrics.get("play_count")
    engagement_floor = max(
        int(metrics.get("digg_count") or 0),
        int(metrics.get("comment_count") or 0),
        int(metrics.get("collect_count") or 0),
        int(metrics.get("share_count") or 0),
    )
    # Zero plays but non-zero engagement is implausible -> mark unknown.
    if play_count is not None and int(play_count) <= 0 and engagement_floor > 0:
        metrics["play_count"] = None

    return metrics
|