@tikomni/skills 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -2
- package/skills/single-work-analysis/env.example +3 -3
- package/skills/single-work-analysis/references/config-templates/defaults.yaml +8 -19
- package/skills/single-work-analysis/references/prompt-contracts/{insight.md → analysis-bundle.md} +43 -8
- package/skills/single-work-analysis/scripts/core/analysis_adapter.py +384 -0
- package/skills/single-work-analysis/scripts/core/analysis_pipeline.py +399 -76
- package/skills/single-work-analysis/scripts/core/config_loader.py +18 -42
- package/skills/single-work-analysis/scripts/core/progress_report.py +163 -16
- package/skills/single-work-analysis/scripts/core/storage_router.py +24 -57
- package/skills/single-work-analysis/scripts/core/tikomni_common.py +13 -3
- package/skills/single-work-analysis/scripts/pipeline/asr/asr_pipeline.py +154 -7
- package/skills/single-work-analysis/scripts/pipeline/asr/poll_u2_task.py +3 -1
- package/skills/single-work-analysis/scripts/platform/douyin/run_douyin_single_video.py +243 -44
- package/skills/single-work-analysis/scripts/platform/xiaohongshu/run_xiaohongshu_extract.py +263 -25
- package/skills/single-work-analysis/scripts/writers/write_benchmark_card.py +244 -894
- package/skills/single-work-analysis/references/prompt-contracts/asr-clean.md +0 -28
- package/skills/single-work-analysis/references/prompt-contracts/cta.md +0 -24
- package/skills/single-work-analysis/references/prompt-contracts/hook.md +0 -25
- package/skills/single-work-analysis/references/prompt-contracts/structure.md +0 -25
- package/skills/single-work-analysis/references/prompt-contracts/style.md +0 -27
- package/skills/single-work-analysis/references/prompt-contracts/summary.md +0 -29
- package/skills/single-work-analysis/references/prompt-contracts/topic.md +0 -29
package/skills/single-work-analysis/scripts/core/progress_report.py

@@ -1,11 +1,5 @@
 #!/usr/bin/env python3
-"""Structured stderr progress reporting for long-running TikOmni workflows.
-
-Design goals:
-- emit machine-readable progress events to stderr only
-- keep final JSON stdout contract untouched
-- offer a tiny shared helper so handlers do not duplicate logging logic
-"""
+"""Structured stderr progress reporting for long-running TikOmni workflows."""
 
 from __future__ import annotations
 
@@ -13,9 +7,95 @@ import json
 import sys
 from datetime import datetime, timezone
 from typing import Any, Dict, Optional
+from urllib.parse import urlparse
 
 
 VALID_EVENTS = {"started", "progress", "done", "failed"}
+_MASKED_TEXT = "<redacted>"
+_MAX_TEXT_PREVIEW = 180
+_MAX_LIST_ITEMS = 8
+_SENSITIVE_KEYS = {
+    "api_key",
+    "authorization",
+    "cookie",
+    "cookies",
+    "set_cookie",
+    "token",
+    "xsec_token",
+}
+_LONG_TEXT_KEYS = {
+    "asr_clean",
+    "asr_raw",
+    "prompt",
+    "prompt_text",
+    "raw_content",
+    "stderr",
+    "stdout",
+    "transcript",
+    "transcript_text",
+}
+
+
+def _looks_like_url(text: str) -> bool:
+    return text.startswith("http://") or text.startswith("https://")
+
+
+def _mask_key(key: str) -> bool:
+    lowered = key.lower()
+    if lowered in _SENSITIVE_KEYS:
+        return True
+    return any(token in lowered for token in ("api_key", "token", "cookie", "authorization"))
+
+
+def _sanitize_url(text: str) -> str:
+    try:
+        parsed = urlparse(text)
+    except Exception:
+        return text[:_MAX_TEXT_PREVIEW]
+    if not parsed.scheme or not parsed.netloc:
+        return text[:_MAX_TEXT_PREVIEW]
+    return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
+
+
+def _truncate_text(text: str) -> str:
+    if len(text) <= _MAX_TEXT_PREVIEW:
+        return text
+    return f"{text[:_MAX_TEXT_PREVIEW]}…(len={len(text)})"
+
+
+def _sanitize_scalar(value: Any, *, key: str = "") -> Any:
+    if value is None or isinstance(value, (bool, int, float)):
+        return value
+
+    text = str(value)
+    if _mask_key(key):
+        return _MASKED_TEXT
+    if key.lower() in _LONG_TEXT_KEYS:
+        return f"<redacted:{key}:len={len(text)}>"
+    if _looks_like_url(text):
+        return _sanitize_url(text)
+    return _truncate_text(text)
+
+
+def _sanitize_payload(value: Any, *, key: str = "") -> Any:
+    if isinstance(value, dict):
+        sanitized: Dict[str, Any] = {}
+        for child_key, child_value in value.items():
+            child_key_text = str(child_key)
+            if _mask_key(child_key_text):
+                sanitized[child_key_text] = _MASKED_TEXT
+                continue
+            sanitized[child_key_text] = _sanitize_payload(child_value, key=child_key_text)
+        return sanitized
+
+    if isinstance(value, list):
+        items = value[:_MAX_LIST_ITEMS]
+        sanitized_items = [_sanitize_payload(item, key=key) for item in items]
+        if len(value) > _MAX_LIST_ITEMS:
+            sanitized_items.append(f"...({len(value) - _MAX_LIST_ITEMS} more)")
+        return sanitized_items
+
+    return _sanitize_scalar(value, key=key)
 
 
 class ProgressReporter:
@@ -36,12 +116,12 @@ class ProgressReporter:
         self.run_id = str(run_id or f"{self.platform}.{self.content_kind}")
         self.scope = str(scope or "workflow")
         self.enabled = bool(enabled)
-        self.defaults = dict(defaults or {})
+        self.defaults = _sanitize_payload(dict(defaults or {}))
 
     def child(self, *, scope: str, defaults: Optional[Dict[str, Any]] = None) -> "ProgressReporter":
         merged = dict(self.defaults)
         if defaults:
-            merged.update(defaults)
+            merged.update(_sanitize_payload(defaults))
         return ProgressReporter(
             workflow=self.workflow,
             platform=self.platform,
@@ -70,13 +150,12 @@
             "stage": str(stage or "unknown"),
         }
         if message:
-            payload["message"] = str(message)
-
-        payload["data"] = dict(self.defaults)
+            payload["message"] = _truncate_text(str(message))
+        merged_data = dict(self.defaults)
         if isinstance(data, dict) and data:
-
-
-            payload["data"] =
+            merged_data.update(_sanitize_payload(data))
+        if merged_data:
+            payload["data"] = merged_data
         sys.stderr.write(json.dumps(payload, ensure_ascii=False) + "\n")
         sys.stderr.flush()
 
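Taken together, the new module-level helpers mean that any defaults or per-event data now pass through a redaction layer before reaching stderr: credential-like keys are masked, known long-text fields (transcripts, prompts, raw stdout/stderr) become a length marker, URLs lose their query string and fragment, scalar previews are capped at 180 characters, and lists at 8 items. A self-contained sketch of those rules, simplified for illustration rather than the package module itself:

    from urllib.parse import urlparse

    MASKED = "<redacted>"
    MAX_PREVIEW = 180
    SENSITIVE = ("api_key", "token", "cookie", "authorization")
    LONG_TEXT = {"transcript", "prompt", "stdout", "stderr"}

    def sanitize(value, key=""):
        # Dicts: mask credential-like keys, recurse into everything else.
        if isinstance(value, dict):
            return {
                str(k): (MASKED if any(t in str(k).lower() for t in SENSITIVE) else sanitize(v, str(k)))
                for k, v in value.items()
            }
        # Lists: keep the first 8 items and note how many were dropped.
        if isinstance(value, list):
            out = [sanitize(v, key) for v in value[:8]]
            if len(value) > 8:
                out.append(f"...({len(value) - 8} more)")
            return out
        if value is None or isinstance(value, (bool, int, float)):
            return value
        text = str(value)
        if key.lower() in LONG_TEXT:
            # Long free-text fields are replaced by a length marker.
            return f"<redacted:{key}:len={len(text)}>"
        if text.startswith(("http://", "https://")):
            # URLs keep scheme, host and path only.
            p = urlparse(text)
            return f"{p.scheme}://{p.netloc}{p.path}"
        return text if len(text) <= MAX_PREVIEW else f"{text[:MAX_PREVIEW]}...(len={len(text)})"

    sample = {"xsec_token": "abc", "share_url": "https://example.com/v/1?sig=secret", "transcript": "x" * 5000}
    print(sanitize(sample))
    # {'xsec_token': '<redacted>', 'share_url': 'https://example.com/v/1', 'transcript': '<redacted:transcript:len=5000>'}

The packaged helpers cover a longer key list (cookies, set_cookie, xsec_token, asr_raw, prompt_text, raw_content) and are applied both to the reporter's constructor defaults and to each event's data, as the hunks above and below show.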
@@ -92,6 +171,74 @@
     def failed(self, *, stage: str, message: str = "", data: Optional[Dict[str, Any]] = None) -> None:
         self.emit("failed", stage=stage, message=message, data=data)
 
+    def heartbeat(self, *, stage: str, message: str = "", data: Optional[Dict[str, Any]] = None) -> None:
+        heartbeat_data = dict(data or {})
+        heartbeat_data["heartbeat"] = True
+        self.progress(stage=stage, message=message or "heartbeat", data=heartbeat_data)
+
+    def http_event(
+        self,
+        *,
+        stage: str,
+        endpoint: str,
+        response: Optional[Dict[str, Any]],
+        summary: Optional[Dict[str, Any]] = None,
+        route_label: Optional[str] = None,
+    ) -> None:
+        payload: Dict[str, Any] = {
+            "kind": "http",
+            "endpoint": endpoint,
+        }
+        if route_label:
+            payload["route_label"] = route_label
+        if isinstance(response, dict):
+            payload.update(
+                {
+                    "ok": bool(response.get("ok")),
+                    "status_code": response.get("status_code"),
+                    "request_id": response.get("request_id"),
+                    "attempt": int(response.get("retry_attempt", 0)) + 1,
+                    "fallback_trigger_reason": response.get("fallback_trigger_reason"),
+                    "timeout_retry_exhausted": bool(response.get("timeout_retry_exhausted")),
+                }
+            )
+        if isinstance(summary, dict) and summary:
+            payload["summary"] = summary
+        event_fn = self.done if isinstance(response, dict) and response.get("ok") else self.failed
+        event_fn(stage=stage, message="http request finished", data=payload)
+
+    def subprocess_event(
+        self,
+        *,
+        stage: str,
+        provider: str,
+        operation: str,
+        event: str,
+        duration_ms: Optional[int] = None,
+        exit_code: Optional[int] = None,
+        summary: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        payload: Dict[str, Any] = {
+            "kind": "subprocess",
+            "provider": provider,
+            "operation": operation,
+        }
+        if duration_ms is not None:
+            payload["duration_ms"] = int(duration_ms)
+        if exit_code is not None:
+            payload["exit_code"] = int(exit_code)
+        if isinstance(summary, dict) and summary:
+            payload["summary"] = summary
+
+        if event == "started":
+            self.started(stage=stage, message="subprocess started", data=payload)
+        elif event == "done":
+            self.done(stage=stage, message="subprocess finished", data=payload)
+        elif event == "failed":
+            self.failed(stage=stage, message="subprocess failed", data=payload)
+        else:
+            self.progress(stage=stage, message="subprocess progress", data=payload)
+
 
 def build_progress_reporter(
     *,
@@ -107,5 +254,5 @@ def build_progress_reporter(
         content_kind=content_kind,
         run_id=f"{platform}.{content_kind}",
         enabled=enabled,
-        defaults={"input_value": str(input_value or "")
+        defaults={"input_value": _sanitize_scalar(str(input_value or ""), key="input_value")},
     )
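The other half of the progress_report.py change adds convenience emitters: heartbeat() for long-running polls, http_event(), which routes to done() or failed() based on response["ok"] and records status code, request id, attempt count and fallback reason, and subprocess_event(), which maps a started/done/failed event name onto the matching reporter call. A hypothetical caller sketch, assuming the module is importable as progress_report and that build_progress_reporter accepts roughly the keyword arguments visible above (the workflow argument in particular is an assumption, not shown in this hunk):

    # Hypothetical caller sketch; argument names for build_progress_reporter are
    # partly inferred from the hunk above.
    from progress_report import build_progress_reporter

    reporter = build_progress_reporter(
        workflow="single-work-analysis",  # assumed parameter name
        platform="douyin",
        content_kind="single_video",
        enabled=True,
        input_value="https://v.douyin.com/xxxx/",
    )

    # Long poll loops can emit a heartbeat so callers can see the task is still alive.
    reporter.heartbeat(stage="asr.poll", data={"task_id": "demo-task"})

    # One structured event per HTTP call; response["ok"] decides between done() and failed().
    reporter.http_event(
        stage="asr.submit",
        endpoint="https://api.example.com/v1/tasks",
        response={"ok": True, "status_code": 200, "retry_attempt": 0, "request_id": "req-1"},
        route_label="primary",
    )

    # Subprocess lifecycle: "started"/"done"/"failed" map onto the matching reporter calls.
    reporter.subprocess_event(
        stage="download",
        provider="downloader",
        operation="fetch_video",
        event="done",
        duration_ms=5300,
        exit_code=0,
    )

Because these helpers delegate to started()/progress()/done()/failed(), their data payloads go through the same sanitization shown earlier.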
package/skills/single-work-analysis/scripts/core/storage_router.py

@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-"""Storage routing helpers for
+"""Storage routing helpers for single-work card outputs."""
 
 from __future__ import annotations
 
@@ -8,53 +8,33 @@ import re
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 
+
 DEFAULT_CARD_TYPE_ROUTES: Dict[str, Dict[str, Any]] = {
     "work": {
         "prefix": "CBV",
         "parts": ["内容系统", "对标研究", "作品卡"],
     },
-    "author": {
-        "prefix": "CBA",
-        "parts": ["内容系统", "对标研究", "作者卡"],
-    },
-    "author_sample_work": {
-        "prefix": "CBV",
-        "parts": ["内容系统", "对标研究", "作者样本卡", "{platform}-{author_slug}"],
-    },
 }
 
 DEFAULT_CONTENT_KIND_CARD_TYPE: Dict[str, str] = {
     "single_video": "work",
+    "note": "work",
     "work": "work",
-    "author_home": "author_sample_work",
-    "author_sample_work": "author_sample_work",
-    "author_analysis": "author",
-}
-
-CARD_TYPE_ALIASES: Dict[str, str] = {
-    "sample": "author_sample_work",
-    "sample_work": "author_sample_work",
-    "homepage_sample": "author_sample_work",
-    "author_homepage_sample": "author_sample_work",
-    "author_home": "author_sample_work",
-    "author_analysis": "author",
 }
 
 CONTENT_KIND_ALIASES: Dict[str, str] = {
-    "
-    "author_homepage_sample": "author_home",
-    "homepage_sample": "author_home",
-    "analysis_author": "author_analysis",
+    "single-work": "work",
 }
 
+DEFAULT_CARD_FILENAME_PATTERN = "{prefix}-{platform}-{author_slug}-{title_slug}{ext}"
+DEFAULT_JSON_FILENAME_PATTERN = "{timestamp}-{platform}-{identifier}{ext}"
+_INVALID_FILENAME_CHARS = re.compile(r"[\\\\/:*?\"<>|]+")
+_SPACE_RUN = re.compile(r"\s+")
+
 
 def normalize_card_type(card_type: str) -> str:
     normalized = (card_type or "").strip().lower().replace("-", "_")
-    if normalized
-        normalized = CARD_TYPE_ALIASES[normalized]
-    if normalized in {"work", "author", "author_sample_work"}:
-        return normalized
-    return "work"
+    return "work" if normalized == "work" else "work"
 
 
 def normalize_content_kind(content_kind: Optional[str]) -> str:
@@ -75,14 +55,13 @@ def _configured_content_kind_map(storage_config: Optional[Dict[str, Any]]) -> Di
     routes = _storage_routes_cfg(storage_config)
     configured = routes.get("content_kind_card_type")
     if not isinstance(configured, dict):
-        return DEFAULT_CONTENT_KIND_CARD_TYPE
+        return dict(DEFAULT_CONTENT_KIND_CARD_TYPE)
 
     merged = dict(DEFAULT_CONTENT_KIND_CARD_TYPE)
     for key, value in configured.items():
-
-
-
-        merged[k] = v
+        normalized_key = normalize_content_kind(str(key))
+        if normalized_key:
+            merged[normalized_key] = normalize_card_type(str(value))
     return merged
 
 
@@ -90,32 +69,24 @@ def _configured_card_routes(storage_config: Optional[Dict[str, Any]]) -> Dict[st
     routes = _storage_routes_cfg(storage_config)
     configured = routes.get("card_type_routes")
     if not isinstance(configured, dict):
-        return DEFAULT_CARD_TYPE_ROUTES
+        return {key: dict(value) for key, value in DEFAULT_CARD_TYPE_ROUTES.items()}
 
-    merged
+    merged = {key: dict(value) for key, value in DEFAULT_CARD_TYPE_ROUTES.items()}
     for key, value in configured.items():
         card_type = normalize_card_type(str(key))
-        if not isinstance(value, dict):
+        if card_type != "work" or not isinstance(value, dict):
            continue
-
-        prefix = value.get("prefix")
         parts = value.get("parts")
-        if not isinstance(parts, list) or not all(isinstance(
+        if not isinstance(parts, list) or not all(isinstance(item, str) and item for item in parts):
            continue
-
-        merged[
-            "prefix":
+        prefix = str(value.get("prefix") or merged["work"]["prefix"])
+        merged["work"] = {
+            "prefix": prefix,
             "parts": parts,
         }
     return merged
 
 
-DEFAULT_CARD_FILENAME_PATTERN = "{prefix}-{author_slug}-{title_slug}{ext}"
-DEFAULT_JSON_FILENAME_PATTERN = "{timestamp}-{platform}-{identifier}{ext}"
-_INVALID_FILENAME_CHARS = re.compile(r"[\\\\/:*?\"<>|]+")
-_SPACE_RUN = re.compile(r"\s+")
-
-
 def _sanitize_filename_token(value: Any, fallback: str = "item") -> str:
     text = str(value or "").strip()
     if not text:
@@ -185,12 +156,8 @@ def resolve_effective_card_type(
     if not normalized_content_kind:
         return normalized_card_type
 
-
-    mapped
-    if mapped is not None:
-        return normalize_card_type(str(mapped))
-
-    return normalized_card_type
+    mapped = _configured_content_kind_map(storage_config).get(normalized_content_kind)
+    return normalize_card_type(str(mapped)) if mapped is not None else normalized_card_type
 
 
 def render_route_parts(parts: List[str], *, context: Dict[str, str]) -> List[str]:
@@ -232,7 +199,7 @@ def build_card_output_path(
     directory = os.path.join(card_root, *rendered_parts)
     os.makedirs(directory, exist_ok=True)
 
-    default_filename = f"{prefix}-{author_slug}-{title_slug}.md"
+    default_filename = f"{prefix}-{platform}-{author_slug}-{title_slug}.md"
     filename = render_output_filename(
         pattern=resolve_card_filename_pattern(storage_config),
         context={
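storage_router.py is cut down to a single route: the author and author-sample route tables and the card-type aliases are removed, normalize_card_type collapses every input to "work", the note content kind and the single-work alias map to the same route, and both the default filename pattern and build_card_output_path now embed the platform in the card filename. A rough sketch of the path that comes out under the new defaults (the values are invented; the real code goes through render_output_filename and filename-token sanitization):

    import os

    DEFAULT_CARD_TYPE_ROUTES = {
        "work": {"prefix": "CBV", "parts": ["内容系统", "对标研究", "作品卡"]},
    }
    DEFAULT_CARD_FILENAME_PATTERN = "{prefix}-{platform}-{author_slug}-{title_slug}{ext}"

    def sketch_card_path(card_root, platform, author_slug, title_slug):
        # Every card type now resolves to the single "work" route.
        route = DEFAULT_CARD_TYPE_ROUTES["work"]
        directory = os.path.join(card_root, *route["parts"])
        filename = DEFAULT_CARD_FILENAME_PATTERN.format(
            prefix=route["prefix"],
            platform=platform,
            author_slug=author_slug,
            title_slug=title_slug,
            ext=".md",
        )
        return os.path.join(directory, filename)

    print(sketch_card_path("cards", "douyin", "some-author", "some-title"))
    # cards/内容系统/对标研究/作品卡/CBV-douyin-some-author-some-title.md

For installs that keep the default pattern, the {platform} token is the visible change: cards still land in the same directory, but the filename now begins with the prefix followed by the platform.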
package/skills/single-work-analysis/scripts/core/tikomni_common.py

@@ -69,9 +69,19 @@ def _resolve_env_file_path(env_file: Optional[str]) -> Path:
         return (skills_root / ".env").resolve()
 
     candidate = Path(env_file).expanduser()
-    if
-
-
+    if candidate.is_absolute():
+        return candidate.resolve()
+
+    search_roots = [
+        Path.cwd(),
+        get_repo_root(),
+        skills_root,
+    ]
+    for root in search_roots:
+        resolved = (root / candidate).resolve()
+        if resolved.exists():
+            return resolved
+    return (Path.cwd() / candidate).resolve()
 
 
 def _infer_default_env_paths(primary_env_file: Optional[str]) -> Tuple[Path, Path]:
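The tikomni_common.py hunk reworks how a relative env_file path is resolved: an absolute path is used as given, otherwise the same relative path is tried under the current working directory, the repository root, and the skills root, in that order, with a cwd-relative fallback when none of them exists yet. A standalone sketch of that search order (repo_root and skills_root stand in for the module's own lookups such as get_repo_root()):

    from pathlib import Path

    def resolve_env_file(env_file: str, repo_root: Path, skills_root: Path) -> Path:
        candidate = Path(env_file).expanduser()
        if candidate.is_absolute():
            return candidate.resolve()
        # Same relative path tried under several roots; the first existing hit wins.
        for root in (Path.cwd(), repo_root, skills_root):
            resolved = (root / candidate).resolve()
            if resolved.exists():
                return resolved
        # Nothing exists yet: fall back to a cwd-relative location.
        return (Path.cwd() / candidate).resolve()

    print(resolve_env_file(".env", Path("/repo"), Path("/repo/skills")))

A relative env_file value therefore works whether the scripts are launched from the repo root or from inside the skill directory; note that an existing file in the current working directory wins over one at the repo root.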