@tikomni/skills 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -2
- package/skills/single-work-analysis/env.example +3 -3
- package/skills/single-work-analysis/references/config-templates/defaults.yaml +8 -19
- package/skills/single-work-analysis/references/prompt-contracts/{insight.md → analysis-bundle.md} +43 -8
- package/skills/single-work-analysis/scripts/core/analysis_adapter.py +384 -0
- package/skills/single-work-analysis/scripts/core/analysis_pipeline.py +399 -76
- package/skills/single-work-analysis/scripts/core/config_loader.py +18 -42
- package/skills/single-work-analysis/scripts/core/progress_report.py +163 -16
- package/skills/single-work-analysis/scripts/core/storage_router.py +24 -57
- package/skills/single-work-analysis/scripts/core/tikomni_common.py +13 -3
- package/skills/single-work-analysis/scripts/pipeline/asr/asr_pipeline.py +154 -7
- package/skills/single-work-analysis/scripts/pipeline/asr/poll_u2_task.py +3 -1
- package/skills/single-work-analysis/scripts/platform/douyin/run_douyin_single_video.py +243 -44
- package/skills/single-work-analysis/scripts/platform/xiaohongshu/run_xiaohongshu_extract.py +263 -25
- package/skills/single-work-analysis/scripts/writers/write_benchmark_card.py +244 -894
- package/skills/single-work-analysis/references/prompt-contracts/asr-clean.md +0 -28
- package/skills/single-work-analysis/references/prompt-contracts/cta.md +0 -24
- package/skills/single-work-analysis/references/prompt-contracts/hook.md +0 -25
- package/skills/single-work-analysis/references/prompt-contracts/structure.md +0 -25
- package/skills/single-work-analysis/references/prompt-contracts/style.md +0 -27
- package/skills/single-work-analysis/references/prompt-contracts/summary.md +0 -29
- package/skills/single-work-analysis/references/prompt-contracts/topic.md +0 -29
|
@@ -2,11 +2,13 @@
|
|
|
2
2
|
"""Shared ASR pipeline helpers for runner scripts."""
|
|
3
3
|
|
|
4
4
|
import json
|
|
5
|
+
import re
|
|
6
|
+
import threading
|
|
5
7
|
import time
|
|
6
8
|
import urllib.error
|
|
7
9
|
import urllib.request
|
|
8
10
|
from urllib.parse import urlparse, urlunparse
|
|
9
|
-
from typing import Any, Dict, List, Optional
|
|
11
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
10
12
|
|
|
11
13
|
from scripts.core.tikomni_common import (
|
|
12
14
|
call_json_api,
|
|
@@ -83,6 +85,7 @@ def submit_u2_asr_batch_with_retry(
|
|
|
83
85
|
file_urls: List[str],
|
|
84
86
|
max_retries: int,
|
|
85
87
|
backoff_ms: int,
|
|
88
|
+
progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
|
86
89
|
) -> Dict[str, Any]:
|
|
87
90
|
retries = max(0, int(max_retries))
|
|
88
91
|
base_backoff = max(0, int(backoff_ms))
|
|
@@ -110,20 +113,75 @@ def submit_u2_asr_batch_with_retry(
|
|
|
110
113
|
final_task_id: Optional[str] = None
|
|
111
114
|
final_submit_status = "failed_unknown"
|
|
112
115
|
|
|
116
|
+
def _emit_submit_progress(event: Dict[str, Any]) -> None:
|
|
117
|
+
if progress_callback is None:
|
|
118
|
+
return
|
|
119
|
+
try:
|
|
120
|
+
progress_callback(event)
|
|
121
|
+
except Exception:
|
|
122
|
+
pass
|
|
123
|
+
|
|
113
124
|
for attempt in range(1, max_attempts + 1):
|
|
114
125
|
wait_ms = 0 if attempt == 1 else base_backoff * (2 ** (attempt - 2))
|
|
115
126
|
if wait_ms > 0:
|
|
116
127
|
time.sleep(wait_ms / 1000.0)
|
|
117
128
|
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
129
|
+
_emit_submit_progress(
|
|
130
|
+
{
|
|
131
|
+
"phase": "submit",
|
|
132
|
+
"state": "started",
|
|
133
|
+
"attempt": attempt,
|
|
134
|
+
"wait_ms": wait_ms,
|
|
135
|
+
"candidate_count": len(limited_urls),
|
|
136
|
+
}
|
|
123
137
|
)
|
|
138
|
+
|
|
139
|
+
heartbeat_stop = threading.Event()
|
|
140
|
+
|
|
141
|
+
def _heartbeat() -> None:
|
|
142
|
+
while not heartbeat_stop.wait(5.0):
|
|
143
|
+
_emit_submit_progress(
|
|
144
|
+
{
|
|
145
|
+
"phase": "submit",
|
|
146
|
+
"state": "heartbeat",
|
|
147
|
+
"attempt": attempt,
|
|
148
|
+
"wait_ms": wait_ms,
|
|
149
|
+
"candidate_count": len(limited_urls),
|
|
150
|
+
}
|
|
151
|
+
)
|
|
152
|
+
|
|
153
|
+
heartbeat_thread = threading.Thread(target=_heartbeat, daemon=True)
|
|
154
|
+
heartbeat_thread.start()
|
|
155
|
+
|
|
156
|
+
try:
|
|
157
|
+
submit_response = submit_u2_asr_batch(
|
|
158
|
+
base_url=base_url,
|
|
159
|
+
token=token,
|
|
160
|
+
timeout_ms=timeout_ms,
|
|
161
|
+
file_urls=limited_urls,
|
|
162
|
+
)
|
|
163
|
+
finally:
|
|
164
|
+
heartbeat_stop.set()
|
|
165
|
+
heartbeat_thread.join(timeout=0.2)
|
|
124
166
|
task_id = extract_task_id(submit_response.get("data"))
|
|
125
167
|
retriable = is_retriable_submit_failure(submit_response)
|
|
126
168
|
|
|
169
|
+
_emit_submit_progress(
|
|
170
|
+
{
|
|
171
|
+
"phase": "submit",
|
|
172
|
+
"state": "finished",
|
|
173
|
+
"attempt": attempt,
|
|
174
|
+
"wait_ms": wait_ms,
|
|
175
|
+
"candidate_count": len(limited_urls),
|
|
176
|
+
"task_id": task_id,
|
|
177
|
+
"status_code": submit_response.get("status_code"),
|
|
178
|
+
"ok": bool(submit_response.get("ok")),
|
|
179
|
+
"error_reason": submit_response.get("error_reason"),
|
|
180
|
+
"request_id": submit_response.get("request_id"),
|
|
181
|
+
"retriable": retriable,
|
|
182
|
+
}
|
|
183
|
+
)
|
|
184
|
+
|
|
127
185
|
retry_chain.append(
|
|
128
186
|
{
|
|
129
187
|
"attempt": attempt,
|
|
@@ -172,6 +230,7 @@ def submit_u2_asr_with_retry(
|
|
|
172
230
|
video_url: str,
|
|
173
231
|
max_retries: int,
|
|
174
232
|
backoff_ms: int,
|
|
233
|
+
progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
|
175
234
|
) -> Dict[str, Any]:
|
|
176
235
|
return submit_u2_asr_batch_with_retry(
|
|
177
236
|
base_url=base_url,
|
|
@@ -180,13 +239,55 @@ def submit_u2_asr_with_retry(
|
|
|
180
239
|
file_urls=[video_url],
|
|
181
240
|
max_retries=max_retries,
|
|
182
241
|
backoff_ms=backoff_ms,
|
|
242
|
+
progress_callback=progress_callback,
|
|
183
243
|
)
|
|
184
244
|
|
|
185
245
|
|
|
186
246
|
def clean_transcript_text(raw_text: Any) -> str:
    """Return the normalized form of a transcript value.

    ``None`` maps to an empty string; every other value is handed to
    ``normalize_text`` (defined outside this view) and its result is
    returned as-is.
    """
    return "" if raw_text is None else normalize_text(raw_text)
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _ensure_sentence_end(text: str) -> str:
|
|
253
|
+
if not text:
|
|
254
|
+
return text
|
|
255
|
+
if text[-1] in "。!?!?" or text.endswith("..."):
|
|
256
|
+
return text
|
|
257
|
+
return f"{text}。"
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def derive_asr_clean_text(asr_raw: Any, legacy_clean: Any = None) -> str:
    """Build a readable, paragraphed transcript from raw ASR text.

    Args:
        asr_raw: Raw transcript value; preferred source.
        legacy_clean: Fallback value used only when ``asr_raw`` cleans to
            an empty string.

    Returns:
        The cleaned transcript: filler words removed, one sentence per
        line, sentences grouped three to a paragraph, paragraphs separated
        by a blank line. Returns ``""`` when both inputs clean to nothing.
    """
    base = clean_transcript_text(asr_raw) or clean_transcript_text(legacy_clean)
    if not base:
        return ""

    # Filler removal. Note that \b only matches next to a non-word character
    # and CJK characters count as word characters, so the first pattern only
    # strips fillers that stand alone (e.g. delimited by spaces/punctuation).
    denoised = re.sub(r"\b(嗯|啊|呃|额|那个|这个|然后|就是)\b", " ", base)
    denoised = re.sub(r"(嗯+|啊+|呃+)", " ", denoised)
    denoised = re.sub(r"(就是就是|然后然后|这个这个|那个那个)", " ", denoised)
    denoised = re.sub(r"\s+", " ", denoised).strip()

    # Split on sentence terminators. The full-width "!", "?" and ";" are
    # written as \uXXXX escapes so they cannot be silently width-folded to
    # their ASCII twins, which would leave such sentences unsplit.
    pieces = re.split(r"[。!?\uff01\uff1f;\uff1b\n]+", denoised)
    units = [clean_transcript_text(piece) for piece in pieces]
    sentences = [_ensure_sentence_end(unit) for unit in units if unit]
    if not sentences:
        # Nothing but punctuation/whitespace survived; return what is left
        # (an empty string stays empty).
        return _ensure_sentence_end(denoised)

    # Three sentences per paragraph; a lone trailing sentence is folded into
    # the previous paragraph instead of standing on its own.
    chunks = [sentences[start:start + 3] for start in range(0, len(sentences), 3)]
    if len(chunks) > 1 and len(chunks[-1]) == 1:
        orphan = chunks.pop()
        chunks[-1].extend(orphan)

    return "\n\n".join("\n".join(chunk) for chunk in chunks)
|
|
190
291
|
|
|
191
292
|
|
|
192
293
|
def extract_u2_task_metrics(payload: Any) -> Dict[str, Any]:
|
|
@@ -612,6 +713,7 @@ def poll_u2_task_core(
|
|
|
612
713
|
max_polls: int,
|
|
613
714
|
require_batch_complete: bool = False,
|
|
614
715
|
expected_total: int = 0,
|
|
716
|
+
progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
|
615
717
|
) -> Dict[str, Any]:
|
|
616
718
|
trace = []
|
|
617
719
|
last_request_id = None
|
|
@@ -666,6 +768,25 @@ def poll_u2_task_core(
|
|
|
666
768
|
}
|
|
667
769
|
)
|
|
668
770
|
|
|
771
|
+
if progress_callback is not None:
|
|
772
|
+
try:
|
|
773
|
+
progress_callback(
|
|
774
|
+
{
|
|
775
|
+
"attempt": attempt,
|
|
776
|
+
"task_id": task_id,
|
|
777
|
+
"task_status": status or "UNKNOWN",
|
|
778
|
+
"platform_task_status": platform_status or "UNKNOWN",
|
|
779
|
+
"pending_count": pending_count,
|
|
780
|
+
"request_id": response.get("request_id"),
|
|
781
|
+
"status_code": response.get("status_code"),
|
|
782
|
+
"ok": bool(response.get("ok")),
|
|
783
|
+
"error_reason": response.get("error_reason"),
|
|
784
|
+
"batch_progress": batch_progress,
|
|
785
|
+
}
|
|
786
|
+
)
|
|
787
|
+
except Exception:
|
|
788
|
+
pass
|
|
789
|
+
|
|
669
790
|
if not response.get("ok"):
|
|
670
791
|
if attempt < max_polls:
|
|
671
792
|
time.sleep(max(poll_interval_sec, 0.2))
|
|
@@ -797,6 +918,7 @@ def run_u2_asr_candidates_with_timeout_retry(
|
|
|
797
918
|
max_polls: int,
|
|
798
919
|
timeout_retry_enabled: bool = True,
|
|
799
920
|
timeout_retry_max_retries: int = 3,
|
|
921
|
+
progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
|
800
922
|
) -> Dict[str, Any]:
|
|
801
923
|
normalized_candidates = normalize_media_candidates(candidates)
|
|
802
924
|
attempts: List[Dict[str, Any]] = []
|
|
@@ -831,6 +953,7 @@ def run_u2_asr_candidates_with_timeout_retry(
|
|
|
831
953
|
max_polls=max_polls,
|
|
832
954
|
timeout_retry_enabled=timeout_retry_enabled,
|
|
833
955
|
timeout_retry_max_retries=timeout_retry_max_retries,
|
|
956
|
+
progress_callback=progress_callback,
|
|
834
957
|
)
|
|
835
958
|
poll_result = bundle.get("poll_result", {})
|
|
836
959
|
error_reason = str(poll_result.get("error_reason") or "")
|
|
@@ -871,6 +994,7 @@ def run_u2_asr_batch_with_timeout_retry(
|
|
|
871
994
|
max_polls: int,
|
|
872
995
|
timeout_retry_enabled: bool = True,
|
|
873
996
|
timeout_retry_max_retries: int = 3,
|
|
997
|
+
progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
|
874
998
|
) -> Dict[str, Any]:
|
|
875
999
|
normalized_urls = normalize_media_candidates(file_urls)
|
|
876
1000
|
limited_urls = normalized_urls[:U2_BATCH_SUBMIT_HARD_LIMIT]
|
|
@@ -921,6 +1045,7 @@ def run_u2_asr_batch_with_timeout_retry(
|
|
|
921
1045
|
}
|
|
922
1046
|
|
|
923
1047
|
for round_index in range(1, max_rounds + 1):
|
|
1048
|
+
submit_started_at = time.perf_counter()
|
|
924
1049
|
submit_bundle = submit_u2_asr_batch_with_retry(
|
|
925
1050
|
base_url=base_url,
|
|
926
1051
|
token=token,
|
|
@@ -928,12 +1053,16 @@ def run_u2_asr_batch_with_timeout_retry(
|
|
|
928
1053
|
file_urls=limited_urls,
|
|
929
1054
|
max_retries=submit_max_retries,
|
|
930
1055
|
backoff_ms=submit_backoff_ms,
|
|
1056
|
+
progress_callback=progress_callback,
|
|
931
1057
|
)
|
|
1058
|
+
submit_duration_ms = int((time.perf_counter() - submit_started_at) * 1000)
|
|
932
1059
|
submit_response = submit_bundle.get("submit_response", {})
|
|
933
1060
|
task_id = submit_bundle.get("task_id")
|
|
934
1061
|
|
|
935
1062
|
poll_result: Dict[str, Any]
|
|
1063
|
+
poll_duration_ms = 0
|
|
936
1064
|
if submit_response.get("ok") and task_id:
|
|
1065
|
+
poll_started_at = time.perf_counter()
|
|
937
1066
|
poll_result = poll_u2_task_core(
|
|
938
1067
|
base_url=base_url,
|
|
939
1068
|
token=token,
|
|
@@ -943,7 +1072,9 @@ def run_u2_asr_batch_with_timeout_retry(
|
|
|
943
1072
|
max_polls=max_polls,
|
|
944
1073
|
require_batch_complete=True,
|
|
945
1074
|
expected_total=len(limited_urls),
|
|
1075
|
+
progress_callback=progress_callback,
|
|
946
1076
|
)
|
|
1077
|
+
poll_duration_ms = int((time.perf_counter() - poll_started_at) * 1000)
|
|
947
1078
|
else:
|
|
948
1079
|
poll_result = {
|
|
949
1080
|
"ok": False,
|
|
@@ -975,6 +1106,7 @@ def run_u2_asr_batch_with_timeout_retry(
|
|
|
975
1106
|
"error_reason": submit_response.get("error_reason"),
|
|
976
1107
|
"retry_chain": submit_bundle.get("retry_chain", []),
|
|
977
1108
|
"file_url_count": len(limited_urls),
|
|
1109
|
+
"duration_ms": submit_duration_ms,
|
|
978
1110
|
},
|
|
979
1111
|
"poll": {
|
|
980
1112
|
"task_id": poll_result.get("task_id") or task_id,
|
|
@@ -986,6 +1118,7 @@ def run_u2_asr_batch_with_timeout_retry(
|
|
|
986
1118
|
"task_metrics": poll_result.get("task_metrics", {}),
|
|
987
1119
|
"batch_complete": bool(poll_result.get("batch_complete")),
|
|
988
1120
|
"batch_progress": poll_result.get("batch_progress", {}),
|
|
1121
|
+
"duration_ms": poll_duration_ms,
|
|
989
1122
|
},
|
|
990
1123
|
}
|
|
991
1124
|
)
|
|
@@ -1071,6 +1204,8 @@ def run_u2_asr_batch_with_timeout_retry(
|
|
|
1071
1204
|
"task_metrics": final_poll_result.get("task_metrics") if isinstance(final_poll_result.get("task_metrics"), dict) else extract_u2_task_metrics(raw_task_payload),
|
|
1072
1205
|
"batch_progress": final_poll_result.get("batch_progress") if isinstance(final_poll_result.get("batch_progress"), dict) else build_u2_batch_progress(payload=raw_task_payload, expected_total=len(limited_urls)),
|
|
1073
1206
|
"batch_complete": bool(final_poll_result.get("batch_complete")),
|
|
1207
|
+
"submit_duration_ms": _safe_int((rounds[-1].get("submit") if rounds else {}).get("duration_ms")),
|
|
1208
|
+
"poll_duration_ms": _safe_int((rounds[-1].get("poll") if rounds else {}).get("duration_ms")),
|
|
1074
1209
|
}
|
|
1075
1210
|
|
|
1076
1211
|
|
|
@@ -1086,6 +1221,7 @@ def run_u2_asr_with_timeout_retry(
|
|
|
1086
1221
|
max_polls: int,
|
|
1087
1222
|
timeout_retry_enabled: bool = True,
|
|
1088
1223
|
timeout_retry_max_retries: int = 3,
|
|
1224
|
+
progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
|
1089
1225
|
) -> Dict[str, Any]:
|
|
1090
1226
|
video_url = normalize_media_url(video_url)
|
|
1091
1227
|
conservative_retries = max(0, min(3, int(timeout_retry_max_retries)))
|
|
@@ -1103,6 +1239,7 @@ def run_u2_asr_with_timeout_retry(
|
|
|
1103
1239
|
timeout_retry_result = "not_triggered"
|
|
1104
1240
|
|
|
1105
1241
|
for round_index in range(1, max_rounds + 1):
|
|
1242
|
+
submit_started_at = time.perf_counter()
|
|
1106
1243
|
submit_bundle = submit_u2_asr_with_retry(
|
|
1107
1244
|
base_url=base_url,
|
|
1108
1245
|
token=token,
|
|
@@ -1110,12 +1247,16 @@ def run_u2_asr_with_timeout_retry(
|
|
|
1110
1247
|
video_url=video_url,
|
|
1111
1248
|
max_retries=submit_max_retries,
|
|
1112
1249
|
backoff_ms=submit_backoff_ms,
|
|
1250
|
+
progress_callback=progress_callback,
|
|
1113
1251
|
)
|
|
1252
|
+
submit_duration_ms = int((time.perf_counter() - submit_started_at) * 1000)
|
|
1114
1253
|
submit_response = submit_bundle.get("submit_response", {})
|
|
1115
1254
|
task_id = submit_bundle.get("task_id")
|
|
1116
1255
|
|
|
1117
1256
|
poll_result: Dict[str, Any]
|
|
1257
|
+
poll_duration_ms = 0
|
|
1118
1258
|
if submit_response.get("ok") and task_id:
|
|
1259
|
+
poll_started_at = time.perf_counter()
|
|
1119
1260
|
poll_result = poll_u2_task_core(
|
|
1120
1261
|
base_url=base_url,
|
|
1121
1262
|
token=token,
|
|
@@ -1123,7 +1264,9 @@ def run_u2_asr_with_timeout_retry(
|
|
|
1123
1264
|
task_id=str(task_id),
|
|
1124
1265
|
poll_interval_sec=poll_interval_sec,
|
|
1125
1266
|
max_polls=max_polls,
|
|
1267
|
+
progress_callback=progress_callback,
|
|
1126
1268
|
)
|
|
1269
|
+
poll_duration_ms = int((time.perf_counter() - poll_started_at) * 1000)
|
|
1127
1270
|
else:
|
|
1128
1271
|
poll_result = {
|
|
1129
1272
|
"ok": False,
|
|
@@ -1145,6 +1288,7 @@ def run_u2_asr_with_timeout_retry(
|
|
|
1145
1288
|
"ok": submit_response.get("ok"),
|
|
1146
1289
|
"error_reason": submit_response.get("error_reason"),
|
|
1147
1290
|
"retry_chain": submit_bundle.get("retry_chain", []),
|
|
1291
|
+
"duration_ms": submit_duration_ms,
|
|
1148
1292
|
},
|
|
1149
1293
|
"poll": {
|
|
1150
1294
|
"task_id": poll_result.get("task_id") or task_id,
|
|
@@ -1153,6 +1297,7 @@ def run_u2_asr_with_timeout_retry(
|
|
|
1153
1297
|
"ok": poll_result.get("ok"),
|
|
1154
1298
|
"error_reason": poll_result.get("error_reason"),
|
|
1155
1299
|
"attempts": len(poll_result.get("trace", [])),
|
|
1300
|
+
"duration_ms": poll_duration_ms,
|
|
1156
1301
|
},
|
|
1157
1302
|
}
|
|
1158
1303
|
)
|
|
@@ -1186,4 +1331,6 @@ def run_u2_asr_with_timeout_retry(
|
|
|
1186
1331
|
"triggered": timeout_retry_triggered,
|
|
1187
1332
|
"result": timeout_retry_result,
|
|
1188
1333
|
},
|
|
1334
|
+
"submit_duration_ms": _safe_int((rounds[-1].get("submit") if rounds else {}).get("duration_ms")),
|
|
1335
|
+
"poll_duration_ms": _safe_int((rounds[-1].get("poll") if rounds else {}).get("duration_ms")),
|
|
1189
1336
|
}
|
|
@@ -17,7 +17,7 @@ from scripts.core.bootstrap_env import bootstrap_for_direct_run
|
|
|
17
17
|
bootstrap_for_direct_run(__file__, __package__)
|
|
18
18
|
|
|
19
19
|
import argparse
|
|
20
|
-
from typing import Any, Dict
|
|
20
|
+
from typing import Any, Callable, Dict, Optional
|
|
21
21
|
|
|
22
22
|
from scripts.pipeline.asr.asr_pipeline import poll_u2_task_core
|
|
23
23
|
from scripts.core.tikomni_common import extract_error_reason, resolve_runtime, write_json_stdout
|
|
@@ -31,6 +31,7 @@ def poll_u2_task(
|
|
|
31
31
|
task_id: str,
|
|
32
32
|
poll_interval_sec: float,
|
|
33
33
|
max_polls: int,
|
|
34
|
+
progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
|
34
35
|
) -> Dict[str, Any]:
|
|
35
36
|
return poll_u2_task_core(
|
|
36
37
|
base_url=base_url,
|
|
@@ -39,6 +40,7 @@ def poll_u2_task(
|
|
|
39
40
|
task_id=task_id,
|
|
40
41
|
poll_interval_sec=poll_interval_sec,
|
|
41
42
|
max_polls=max_polls,
|
|
43
|
+
progress_callback=progress_callback,
|
|
42
44
|
)
|
|
43
45
|
|
|
44
46
|
|