@tikomni/skills 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22) hide show
  1. package/package.json +4 -2
  2. package/skills/single-work-analysis/env.example +3 -3
  3. package/skills/single-work-analysis/references/config-templates/defaults.yaml +8 -19
  4. package/skills/single-work-analysis/references/prompt-contracts/{insight.md → analysis-bundle.md} +43 -8
  5. package/skills/single-work-analysis/scripts/core/analysis_adapter.py +384 -0
  6. package/skills/single-work-analysis/scripts/core/analysis_pipeline.py +399 -76
  7. package/skills/single-work-analysis/scripts/core/config_loader.py +18 -42
  8. package/skills/single-work-analysis/scripts/core/progress_report.py +163 -16
  9. package/skills/single-work-analysis/scripts/core/storage_router.py +24 -57
  10. package/skills/single-work-analysis/scripts/core/tikomni_common.py +13 -3
  11. package/skills/single-work-analysis/scripts/pipeline/asr/asr_pipeline.py +154 -7
  12. package/skills/single-work-analysis/scripts/pipeline/asr/poll_u2_task.py +3 -1
  13. package/skills/single-work-analysis/scripts/platform/douyin/run_douyin_single_video.py +243 -44
  14. package/skills/single-work-analysis/scripts/platform/xiaohongshu/run_xiaohongshu_extract.py +263 -25
  15. package/skills/single-work-analysis/scripts/writers/write_benchmark_card.py +244 -894
  16. package/skills/single-work-analysis/references/prompt-contracts/asr-clean.md +0 -28
  17. package/skills/single-work-analysis/references/prompt-contracts/cta.md +0 -24
  18. package/skills/single-work-analysis/references/prompt-contracts/hook.md +0 -25
  19. package/skills/single-work-analysis/references/prompt-contracts/structure.md +0 -25
  20. package/skills/single-work-analysis/references/prompt-contracts/style.md +0 -27
  21. package/skills/single-work-analysis/references/prompt-contracts/summary.md +0 -29
  22. package/skills/single-work-analysis/references/prompt-contracts/topic.md +0 -29
@@ -2,11 +2,13 @@
2
2
  """Shared ASR pipeline helpers for runner scripts."""
3
3
 
4
4
  import json
5
+ import re
6
+ import threading
5
7
  import time
6
8
  import urllib.error
7
9
  import urllib.request
8
10
  from urllib.parse import urlparse, urlunparse
9
- from typing import Any, Dict, List, Optional
11
+ from typing import Any, Callable, Dict, List, Optional
10
12
 
11
13
  from scripts.core.tikomni_common import (
12
14
  call_json_api,
@@ -83,6 +85,7 @@ def submit_u2_asr_batch_with_retry(
83
85
  file_urls: List[str],
84
86
  max_retries: int,
85
87
  backoff_ms: int,
88
+ progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
86
89
  ) -> Dict[str, Any]:
87
90
  retries = max(0, int(max_retries))
88
91
  base_backoff = max(0, int(backoff_ms))
@@ -110,20 +113,75 @@ def submit_u2_asr_batch_with_retry(
110
113
  final_task_id: Optional[str] = None
111
114
  final_submit_status = "failed_unknown"
112
115
 
116
+ def _emit_submit_progress(event: Dict[str, Any]) -> None:
117
+ if progress_callback is None:
118
+ return
119
+ try:
120
+ progress_callback(event)
121
+ except Exception:
122
+ pass
123
+
113
124
  for attempt in range(1, max_attempts + 1):
114
125
  wait_ms = 0 if attempt == 1 else base_backoff * (2 ** (attempt - 2))
115
126
  if wait_ms > 0:
116
127
  time.sleep(wait_ms / 1000.0)
117
128
 
118
- submit_response = submit_u2_asr_batch(
119
- base_url=base_url,
120
- token=token,
121
- timeout_ms=timeout_ms,
122
- file_urls=limited_urls,
129
+ _emit_submit_progress(
130
+ {
131
+ "phase": "submit",
132
+ "state": "started",
133
+ "attempt": attempt,
134
+ "wait_ms": wait_ms,
135
+ "candidate_count": len(limited_urls),
136
+ }
123
137
  )
138
+
139
+ heartbeat_stop = threading.Event()
140
+
141
+ def _heartbeat() -> None:
142
+ while not heartbeat_stop.wait(5.0):
143
+ _emit_submit_progress(
144
+ {
145
+ "phase": "submit",
146
+ "state": "heartbeat",
147
+ "attempt": attempt,
148
+ "wait_ms": wait_ms,
149
+ "candidate_count": len(limited_urls),
150
+ }
151
+ )
152
+
153
+ heartbeat_thread = threading.Thread(target=_heartbeat, daemon=True)
154
+ heartbeat_thread.start()
155
+
156
+ try:
157
+ submit_response = submit_u2_asr_batch(
158
+ base_url=base_url,
159
+ token=token,
160
+ timeout_ms=timeout_ms,
161
+ file_urls=limited_urls,
162
+ )
163
+ finally:
164
+ heartbeat_stop.set()
165
+ heartbeat_thread.join(timeout=0.2)
124
166
  task_id = extract_task_id(submit_response.get("data"))
125
167
  retriable = is_retriable_submit_failure(submit_response)
126
168
 
169
+ _emit_submit_progress(
170
+ {
171
+ "phase": "submit",
172
+ "state": "finished",
173
+ "attempt": attempt,
174
+ "wait_ms": wait_ms,
175
+ "candidate_count": len(limited_urls),
176
+ "task_id": task_id,
177
+ "status_code": submit_response.get("status_code"),
178
+ "ok": bool(submit_response.get("ok")),
179
+ "error_reason": submit_response.get("error_reason"),
180
+ "request_id": submit_response.get("request_id"),
181
+ "retriable": retriable,
182
+ }
183
+ )
184
+
127
185
  retry_chain.append(
128
186
  {
129
187
  "attempt": attempt,
@@ -172,6 +230,7 @@ def submit_u2_asr_with_retry(
172
230
  video_url: str,
173
231
  max_retries: int,
174
232
  backoff_ms: int,
233
+ progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
175
234
  ) -> Dict[str, Any]:
176
235
  return submit_u2_asr_batch_with_retry(
177
236
  base_url=base_url,
@@ -180,13 +239,55 @@ def submit_u2_asr_with_retry(
180
239
  file_urls=[video_url],
181
240
  max_retries=max_retries,
182
241
  backoff_ms=backoff_ms,
242
+ progress_callback=progress_callback,
183
243
  )
184
244
 
185
245
 
186
246
  def clean_transcript_text(raw_text: Any) -> str:
187
247
  if raw_text is None:
188
248
  return ""
189
- return str(raw_text).strip()
249
+ return normalize_text(raw_text)
250
+
251
+
252
+ def _ensure_sentence_end(text: str) -> str:
253
+ if not text:
254
+ return text
255
+ if text[-1] in "。!?!?" or text.endswith("..."):
256
+ return text
257
+ return f"{text}。"
258
+
259
+
260
def derive_asr_clean_text(asr_raw: Any, legacy_clean: Any = None) -> str:
    """Build a paragraphed transcript from raw ASR output.

    The working text is ``asr_raw`` passed through ``clean_transcript_text``;
    when that comes back empty, ``legacy_clean`` is cleaned and used instead.
    Filler tokens are stripped, the remainder is split into sentences, each
    sentence is given a terminator by ``_ensure_sentence_end``, and the
    sentences are grouped three to a paragraph.

    Args:
        asr_raw: Raw transcript value; preferred source.
        legacy_clean: Fallback value used only when ``asr_raw`` cleans to an
            empty string.

    Returns:
        Paragraphs separated by a blank line with one sentence per line, or
        ``""`` when neither input yields any text.
    """
    base = clean_transcript_text(asr_raw) or clean_transcript_text(legacy_clean)
    if not base:
        return ""

    # NOTE: ``\b`` only matches at a \w/\W transition and CJK characters are
    # \w, so this pass removes a filler only when it is delimited by spaces or
    # punctuation, not when it is embedded in running Chinese text.
    denoised = re.sub(r"\b(嗯|啊|呃|额|那个|这个|然后|就是)\b", " ", base)
    # Runs of pure interjections are dropped wherever they occur.
    denoised = re.sub(r"(嗯+|啊+|呃+)", " ", denoised)
    # Immediate stutter-style repetitions of a filler word.
    denoised = re.sub(r"(就是就是|然后然后|这个这个|那个那个)", " ", denoised)
    denoised = re.sub(r"\s+", " ", denoised).strip()

    # Split on ASCII and full-width sentence punctuation. The full-width
    # marks are written as escapes (U+FF01 "!", U+FF1F "?", U+FF1B ";") so
    # they cannot be folded into their ASCII look-alikes by tooling.
    pieces = re.split(r"[。\uff01\uff1f!?\uff1b;\n]+", denoised)
    units = [clean_transcript_text(piece) for piece in pieces]
    sentences = [_ensure_sentence_end(unit) for unit in units if unit]
    if not sentences:
        # Nothing but delimiters (or nothing at all) survived the split.
        return _ensure_sentence_end(denoised)

    # Three sentences per paragraph; a lone trailing sentence is folded into
    # the previous paragraph instead of standing on its own.
    chunks = [sentences[start:start + 3] for start in range(0, len(sentences), 3)]
    if len(chunks) > 1 and len(chunks[-1]) == 1:
        chunks[-2].extend(chunks.pop())

    return "\n\n".join("\n".join(chunk) for chunk in chunks)
190
291
 
191
292
 
192
293
  def extract_u2_task_metrics(payload: Any) -> Dict[str, Any]:
@@ -612,6 +713,7 @@ def poll_u2_task_core(
612
713
  max_polls: int,
613
714
  require_batch_complete: bool = False,
614
715
  expected_total: int = 0,
716
+ progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
615
717
  ) -> Dict[str, Any]:
616
718
  trace = []
617
719
  last_request_id = None
@@ -666,6 +768,25 @@ def poll_u2_task_core(
666
768
  }
667
769
  )
668
770
 
771
+ if progress_callback is not None:
772
+ try:
773
+ progress_callback(
774
+ {
775
+ "attempt": attempt,
776
+ "task_id": task_id,
777
+ "task_status": status or "UNKNOWN",
778
+ "platform_task_status": platform_status or "UNKNOWN",
779
+ "pending_count": pending_count,
780
+ "request_id": response.get("request_id"),
781
+ "status_code": response.get("status_code"),
782
+ "ok": bool(response.get("ok")),
783
+ "error_reason": response.get("error_reason"),
784
+ "batch_progress": batch_progress,
785
+ }
786
+ )
787
+ except Exception:
788
+ pass
789
+
669
790
  if not response.get("ok"):
670
791
  if attempt < max_polls:
671
792
  time.sleep(max(poll_interval_sec, 0.2))
@@ -797,6 +918,7 @@ def run_u2_asr_candidates_with_timeout_retry(
797
918
  max_polls: int,
798
919
  timeout_retry_enabled: bool = True,
799
920
  timeout_retry_max_retries: int = 3,
921
+ progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
800
922
  ) -> Dict[str, Any]:
801
923
  normalized_candidates = normalize_media_candidates(candidates)
802
924
  attempts: List[Dict[str, Any]] = []
@@ -831,6 +953,7 @@ def run_u2_asr_candidates_with_timeout_retry(
831
953
  max_polls=max_polls,
832
954
  timeout_retry_enabled=timeout_retry_enabled,
833
955
  timeout_retry_max_retries=timeout_retry_max_retries,
956
+ progress_callback=progress_callback,
834
957
  )
835
958
  poll_result = bundle.get("poll_result", {})
836
959
  error_reason = str(poll_result.get("error_reason") or "")
@@ -871,6 +994,7 @@ def run_u2_asr_batch_with_timeout_retry(
871
994
  max_polls: int,
872
995
  timeout_retry_enabled: bool = True,
873
996
  timeout_retry_max_retries: int = 3,
997
+ progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
874
998
  ) -> Dict[str, Any]:
875
999
  normalized_urls = normalize_media_candidates(file_urls)
876
1000
  limited_urls = normalized_urls[:U2_BATCH_SUBMIT_HARD_LIMIT]
@@ -921,6 +1045,7 @@ def run_u2_asr_batch_with_timeout_retry(
921
1045
  }
922
1046
 
923
1047
  for round_index in range(1, max_rounds + 1):
1048
+ submit_started_at = time.perf_counter()
924
1049
  submit_bundle = submit_u2_asr_batch_with_retry(
925
1050
  base_url=base_url,
926
1051
  token=token,
@@ -928,12 +1053,16 @@ def run_u2_asr_batch_with_timeout_retry(
928
1053
  file_urls=limited_urls,
929
1054
  max_retries=submit_max_retries,
930
1055
  backoff_ms=submit_backoff_ms,
1056
+ progress_callback=progress_callback,
931
1057
  )
1058
+ submit_duration_ms = int((time.perf_counter() - submit_started_at) * 1000)
932
1059
  submit_response = submit_bundle.get("submit_response", {})
933
1060
  task_id = submit_bundle.get("task_id")
934
1061
 
935
1062
  poll_result: Dict[str, Any]
1063
+ poll_duration_ms = 0
936
1064
  if submit_response.get("ok") and task_id:
1065
+ poll_started_at = time.perf_counter()
937
1066
  poll_result = poll_u2_task_core(
938
1067
  base_url=base_url,
939
1068
  token=token,
@@ -943,7 +1072,9 @@ def run_u2_asr_batch_with_timeout_retry(
943
1072
  max_polls=max_polls,
944
1073
  require_batch_complete=True,
945
1074
  expected_total=len(limited_urls),
1075
+ progress_callback=progress_callback,
946
1076
  )
1077
+ poll_duration_ms = int((time.perf_counter() - poll_started_at) * 1000)
947
1078
  else:
948
1079
  poll_result = {
949
1080
  "ok": False,
@@ -975,6 +1106,7 @@ def run_u2_asr_batch_with_timeout_retry(
975
1106
  "error_reason": submit_response.get("error_reason"),
976
1107
  "retry_chain": submit_bundle.get("retry_chain", []),
977
1108
  "file_url_count": len(limited_urls),
1109
+ "duration_ms": submit_duration_ms,
978
1110
  },
979
1111
  "poll": {
980
1112
  "task_id": poll_result.get("task_id") or task_id,
@@ -986,6 +1118,7 @@ def run_u2_asr_batch_with_timeout_retry(
986
1118
  "task_metrics": poll_result.get("task_metrics", {}),
987
1119
  "batch_complete": bool(poll_result.get("batch_complete")),
988
1120
  "batch_progress": poll_result.get("batch_progress", {}),
1121
+ "duration_ms": poll_duration_ms,
989
1122
  },
990
1123
  }
991
1124
  )
@@ -1071,6 +1204,8 @@ def run_u2_asr_batch_with_timeout_retry(
1071
1204
  "task_metrics": final_poll_result.get("task_metrics") if isinstance(final_poll_result.get("task_metrics"), dict) else extract_u2_task_metrics(raw_task_payload),
1072
1205
  "batch_progress": final_poll_result.get("batch_progress") if isinstance(final_poll_result.get("batch_progress"), dict) else build_u2_batch_progress(payload=raw_task_payload, expected_total=len(limited_urls)),
1073
1206
  "batch_complete": bool(final_poll_result.get("batch_complete")),
1207
+ "submit_duration_ms": _safe_int((rounds[-1].get("submit") if rounds else {}).get("duration_ms")),
1208
+ "poll_duration_ms": _safe_int((rounds[-1].get("poll") if rounds else {}).get("duration_ms")),
1074
1209
  }
1075
1210
 
1076
1211
 
@@ -1086,6 +1221,7 @@ def run_u2_asr_with_timeout_retry(
1086
1221
  max_polls: int,
1087
1222
  timeout_retry_enabled: bool = True,
1088
1223
  timeout_retry_max_retries: int = 3,
1224
+ progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
1089
1225
  ) -> Dict[str, Any]:
1090
1226
  video_url = normalize_media_url(video_url)
1091
1227
  conservative_retries = max(0, min(3, int(timeout_retry_max_retries)))
@@ -1103,6 +1239,7 @@ def run_u2_asr_with_timeout_retry(
1103
1239
  timeout_retry_result = "not_triggered"
1104
1240
 
1105
1241
  for round_index in range(1, max_rounds + 1):
1242
+ submit_started_at = time.perf_counter()
1106
1243
  submit_bundle = submit_u2_asr_with_retry(
1107
1244
  base_url=base_url,
1108
1245
  token=token,
@@ -1110,12 +1247,16 @@ def run_u2_asr_with_timeout_retry(
1110
1247
  video_url=video_url,
1111
1248
  max_retries=submit_max_retries,
1112
1249
  backoff_ms=submit_backoff_ms,
1250
+ progress_callback=progress_callback,
1113
1251
  )
1252
+ submit_duration_ms = int((time.perf_counter() - submit_started_at) * 1000)
1114
1253
  submit_response = submit_bundle.get("submit_response", {})
1115
1254
  task_id = submit_bundle.get("task_id")
1116
1255
 
1117
1256
  poll_result: Dict[str, Any]
1257
+ poll_duration_ms = 0
1118
1258
  if submit_response.get("ok") and task_id:
1259
+ poll_started_at = time.perf_counter()
1119
1260
  poll_result = poll_u2_task_core(
1120
1261
  base_url=base_url,
1121
1262
  token=token,
@@ -1123,7 +1264,9 @@ def run_u2_asr_with_timeout_retry(
1123
1264
  task_id=str(task_id),
1124
1265
  poll_interval_sec=poll_interval_sec,
1125
1266
  max_polls=max_polls,
1267
+ progress_callback=progress_callback,
1126
1268
  )
1269
+ poll_duration_ms = int((time.perf_counter() - poll_started_at) * 1000)
1127
1270
  else:
1128
1271
  poll_result = {
1129
1272
  "ok": False,
@@ -1145,6 +1288,7 @@ def run_u2_asr_with_timeout_retry(
1145
1288
  "ok": submit_response.get("ok"),
1146
1289
  "error_reason": submit_response.get("error_reason"),
1147
1290
  "retry_chain": submit_bundle.get("retry_chain", []),
1291
+ "duration_ms": submit_duration_ms,
1148
1292
  },
1149
1293
  "poll": {
1150
1294
  "task_id": poll_result.get("task_id") or task_id,
@@ -1153,6 +1297,7 @@ def run_u2_asr_with_timeout_retry(
1153
1297
  "ok": poll_result.get("ok"),
1154
1298
  "error_reason": poll_result.get("error_reason"),
1155
1299
  "attempts": len(poll_result.get("trace", [])),
1300
+ "duration_ms": poll_duration_ms,
1156
1301
  },
1157
1302
  }
1158
1303
  )
@@ -1186,4 +1331,6 @@ def run_u2_asr_with_timeout_retry(
1186
1331
  "triggered": timeout_retry_triggered,
1187
1332
  "result": timeout_retry_result,
1188
1333
  },
1334
+ "submit_duration_ms": _safe_int((rounds[-1].get("submit") if rounds else {}).get("duration_ms")),
1335
+ "poll_duration_ms": _safe_int((rounds[-1].get("poll") if rounds else {}).get("duration_ms")),
1189
1336
  }
@@ -17,7 +17,7 @@ from scripts.core.bootstrap_env import bootstrap_for_direct_run
17
17
  bootstrap_for_direct_run(__file__, __package__)
18
18
 
19
19
  import argparse
20
- from typing import Any, Dict
20
+ from typing import Any, Callable, Dict, Optional
21
21
 
22
22
  from scripts.pipeline.asr.asr_pipeline import poll_u2_task_core
23
23
  from scripts.core.tikomni_common import extract_error_reason, resolve_runtime, write_json_stdout
@@ -31,6 +31,7 @@ def poll_u2_task(
31
31
  task_id: str,
32
32
  poll_interval_sec: float,
33
33
  max_polls: int,
34
+ progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
34
35
  ) -> Dict[str, Any]:
35
36
  return poll_u2_task_core(
36
37
  base_url=base_url,
@@ -39,6 +40,7 @@ def poll_u2_task(
39
40
  task_id=task_id,
40
41
  poll_interval_sec=poll_interval_sec,
41
42
  max_polls=max_polls,
43
+ progress_callback=progress_callback,
42
44
  )
43
45
 
44
46