@tikomni/skills 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -2
- package/skills/single-work-analysis/env.example +3 -3
- package/skills/single-work-analysis/references/config-templates/defaults.yaml +8 -19
- package/skills/single-work-analysis/references/prompt-contracts/{insight.md → analysis-bundle.md} +43 -8
- package/skills/single-work-analysis/scripts/core/analysis_adapter.py +384 -0
- package/skills/single-work-analysis/scripts/core/analysis_pipeline.py +399 -76
- package/skills/single-work-analysis/scripts/core/config_loader.py +18 -42
- package/skills/single-work-analysis/scripts/core/progress_report.py +163 -16
- package/skills/single-work-analysis/scripts/core/storage_router.py +24 -57
- package/skills/single-work-analysis/scripts/core/tikomni_common.py +13 -3
- package/skills/single-work-analysis/scripts/pipeline/asr/asr_pipeline.py +154 -7
- package/skills/single-work-analysis/scripts/pipeline/asr/poll_u2_task.py +3 -1
- package/skills/single-work-analysis/scripts/platform/douyin/run_douyin_single_video.py +243 -44
- package/skills/single-work-analysis/scripts/platform/xiaohongshu/run_xiaohongshu_extract.py +263 -25
- package/skills/single-work-analysis/scripts/writers/write_benchmark_card.py +244 -894
- package/skills/single-work-analysis/references/prompt-contracts/asr-clean.md +0 -28
- package/skills/single-work-analysis/references/prompt-contracts/cta.md +0 -24
- package/skills/single-work-analysis/references/prompt-contracts/hook.md +0 -25
- package/skills/single-work-analysis/references/prompt-contracts/structure.md +0 -25
- package/skills/single-work-analysis/references/prompt-contracts/style.md +0 -27
- package/skills/single-work-analysis/references/prompt-contracts/summary.md +0 -29
- package/skills/single-work-analysis/references/prompt-contracts/topic.md +0 -29
|
@@ -10,33 +10,42 @@ if __package__ in {None, ""}:
|
|
|
10
10
|
sys.path.insert(0, str(_parent))
|
|
11
11
|
break
|
|
12
12
|
|
|
13
|
-
"""Write benchmark markdown cards into card root zones."""
|
|
13
|
+
"""Write single-work benchmark markdown cards into card root zones."""
|
|
14
14
|
|
|
15
15
|
import argparse
|
|
16
16
|
import datetime as dt
|
|
17
17
|
import json
|
|
18
18
|
import os
|
|
19
19
|
import re
|
|
20
|
+
import time
|
|
20
21
|
import unicodedata
|
|
21
22
|
from pathlib import Path
|
|
22
23
|
from typing import Any, Dict, List, Optional
|
|
23
24
|
|
|
24
25
|
try:
|
|
25
26
|
from zoneinfo import ZoneInfo
|
|
26
|
-
except Exception: # pragma: no cover
|
|
27
|
+
except Exception: # pragma: no cover
|
|
27
28
|
ZoneInfo = None
|
|
28
29
|
|
|
29
|
-
from scripts.core.analysis_pipeline import
|
|
30
|
+
from scripts.core.analysis_pipeline import (
|
|
31
|
+
DEFAULT_MODULE_SECTIONS,
|
|
32
|
+
build_analysis_sections,
|
|
33
|
+
ensure_analysis_sections_schema,
|
|
34
|
+
)
|
|
30
35
|
from scripts.core.config_loader import load_tikomni_config
|
|
36
|
+
from scripts.core.progress_report import ProgressReporter
|
|
31
37
|
from scripts.core.storage_router import build_card_output_path, normalize_card_type, resolve_effective_card_type
|
|
32
38
|
from scripts.core.tikomni_common import normalize_text, read_json_file, write_json_stdout
|
|
39
|
+
from scripts.pipeline.asr.asr_pipeline import derive_asr_clean_text
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
CARD_TYPES = ["work"]
|
|
43
|
+
|
|
33
44
|
|
|
34
45
|
def resolve_default_card_root() -> str:
|
|
35
46
|
raw = os.getenv("TIKOMNI_CARD_ROOT", "").strip()
|
|
36
47
|
if not raw:
|
|
37
|
-
raise ValueError(
|
|
38
|
-
"missing_card_root: set --card-root or define TIKOMNI_CARD_ROOT in .env/.env.local"
|
|
39
|
-
)
|
|
48
|
+
raise ValueError("missing_card_root: set --card-root or define TIKOMNI_CARD_ROOT in .env/.env.local")
|
|
40
49
|
|
|
41
50
|
candidate = Path(raw).expanduser()
|
|
42
51
|
if not candidate.is_absolute():
|
|
@@ -44,18 +53,7 @@ def resolve_default_card_root() -> str:
|
|
|
44
53
|
return str(candidate.resolve())
|
|
45
54
|
|
|
46
55
|
|
|
47
|
-
# Keep import-time compatibility for other scripts without crashing when env is absent.
|
|
48
56
|
DEFAULT_CARD_ROOT = ""
|
|
49
|
-
CARD_TYPES = ["work", "author", "author_sample_work"]
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
def _normalize_lines(value: Any) -> List[str]:
|
|
53
|
-
if isinstance(value, list):
|
|
54
|
-
return [normalize_text(item) for item in value if normalize_text(item)]
|
|
55
|
-
if isinstance(value, str):
|
|
56
|
-
text = normalize_text(value)
|
|
57
|
-
return [text] if text else []
|
|
58
|
-
return []
|
|
59
57
|
|
|
60
58
|
|
|
61
59
|
def _safe_int(value: Any, default: int = 0) -> int:
|
|
@@ -67,11 +65,13 @@ def _safe_int(value: Any, default: int = 0) -> int:
|
|
|
67
65
|
return value
|
|
68
66
|
if isinstance(value, float):
|
|
69
67
|
return int(value)
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
68
|
+
text = str(value).strip()
|
|
69
|
+
if not text:
|
|
70
|
+
return default
|
|
71
|
+
try:
|
|
72
|
+
return int(float(text))
|
|
73
|
+
except Exception:
|
|
74
|
+
return default
|
|
75
75
|
|
|
76
76
|
|
|
77
77
|
def _safe_optional_int(value: Any) -> Optional[int]:
|
|
@@ -83,18 +83,38 @@ def _safe_optional_int(value: Any) -> Optional[int]:
|
|
|
83
83
|
return value
|
|
84
84
|
if isinstance(value, float):
|
|
85
85
|
return int(value)
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
86
|
+
text = str(value).strip()
|
|
87
|
+
if not text:
|
|
88
|
+
return None
|
|
89
|
+
try:
|
|
90
|
+
return int(float(text))
|
|
91
|
+
except Exception:
|
|
92
|
+
return None
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _display_metric(value: Optional[int]) -> str:
|
|
96
|
+
return "N/A" if value is None else str(value)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _source_dict(payload: Dict[str, Any]) -> Dict[str, Any]:
|
|
100
|
+
source = payload.get("source")
|
|
101
|
+
return source if isinstance(source, dict) else {}
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _pick_text(payload: Dict[str, Any], keys: List[str], source_keys: Optional[List[str]] = None) -> str:
    """Return the first non-empty normalized text found under the given keys.

    Lookup order:
      1. ``payload`` itself, trying each name in *keys* in order;
      2. the nested ``source`` dict (via ``_source_dict``), trying each name
         in *source_keys*, or *keys* again when *source_keys* is None/empty.

    Every candidate is passed through ``normalize_text``; the first truthy
    result wins. Returns ``""`` when nothing matches.
    """
    nested = _source_dict(payload)
    search_plan = (
        (payload, keys),
        (nested, source_keys or keys),
    )
    for container, names in search_plan:
        for name in names:
            found = normalize_text(container.get(name))
            if found:
                return found
    return ""
|
|
93
115
|
|
|
94
116
|
|
|
95
117
|
def _to_unix_sec(value: Any) -> int:
|
|
96
|
-
if value is None:
|
|
97
|
-
return 0
|
|
98
118
|
parsed = _safe_int(value, default=0)
|
|
99
119
|
if parsed <= 0:
|
|
100
120
|
return 0
|
|
@@ -117,168 +137,47 @@ def _format_shanghai_datetime(value: Any) -> str:
|
|
|
117
137
|
return ""
|
|
118
138
|
|
|
119
139
|
|
|
120
|
-
def _resolve_publish_time(payload: Dict[str, Any], create_time_sec: int) -> Dict[str, str]:
|
|
121
|
-
publish_time_text = normalize_text(payload.get("publish_time_text"))
|
|
122
|
-
if publish_time_text:
|
|
123
|
-
return {"publish_time_text": publish_time_text, "publish_time_source": "payload.publish_time_text"}
|
|
124
|
-
|
|
125
|
-
source = _source_dict(payload)
|
|
126
|
-
candidates = [
|
|
127
|
-
("payload.publish_time", payload.get("publish_time")),
|
|
128
|
-
("payload.create_time", payload.get("create_time")),
|
|
129
|
-
("source.publish_time", source.get("publish_time")),
|
|
130
|
-
("source.create_time", source.get("create_time")),
|
|
131
|
-
("source.time", source.get("time")),
|
|
132
|
-
]
|
|
133
|
-
for source_key, raw in candidates:
|
|
134
|
-
text = normalize_text(raw)
|
|
135
|
-
if not text:
|
|
136
|
-
continue
|
|
137
|
-
ts_text = _format_shanghai_datetime(raw)
|
|
138
|
-
if ts_text:
|
|
139
|
-
return {"publish_time_text": ts_text, "publish_time_source": source_key}
|
|
140
|
-
return {"publish_time_text": text, "publish_time_source": source_key}
|
|
141
|
-
|
|
142
|
-
fallback_text = _format_shanghai_datetime(create_time_sec)
|
|
143
|
-
if fallback_text:
|
|
144
|
-
return {"publish_time_text": fallback_text, "publish_time_source": "create_time_sec"}
|
|
145
|
-
|
|
146
|
-
return {"publish_time_text": "未知", "publish_time_source": "unknown"}
|
|
147
|
-
|
|
148
|
-
|
|
149
140
|
def _resolve_published_date(payload: Dict[str, Any], create_time_sec: int) -> str:
|
|
150
141
|
published_date = normalize_text(payload.get("published_date"))
|
|
151
142
|
if published_date:
|
|
152
143
|
return published_date
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
return "N/A"
|
|
157
|
-
return text[:10]
|
|
144
|
+
publish_time_text = normalize_text(payload.get("publish_time_text"))
|
|
145
|
+
if publish_time_text:
|
|
146
|
+
return publish_time_text[:10]
|
|
158
147
|
|
|
148
|
+
source = _source_dict(payload)
|
|
149
|
+
for key in ("publish_time", "create_time", "time"):
|
|
150
|
+
text = _format_shanghai_datetime(payload.get(key))
|
|
151
|
+
if text:
|
|
152
|
+
return text[:10]
|
|
153
|
+
text = _format_shanghai_datetime(source.get(key))
|
|
154
|
+
if text:
|
|
155
|
+
return text[:10]
|
|
159
156
|
|
|
160
|
-
|
|
161
|
-
if
|
|
162
|
-
return "N/A"
|
|
163
|
-
return str(value)
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
def _source_dict(payload: Dict[str, Any]) -> Dict[str, Any]:
|
|
167
|
-
source = payload.get("source")
|
|
168
|
-
return source if isinstance(source, dict) else {}
|
|
157
|
+
fallback = _format_shanghai_datetime(create_time_sec)
|
|
158
|
+
return fallback[:10] if fallback else "N/A"
|
|
169
159
|
|
|
170
160
|
|
|
171
161
|
def _extract_duration_ms(payload: Dict[str, Any]) -> int:
|
|
172
162
|
source = _source_dict(payload)
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
base = source if from_source else payload
|
|
176
|
-
for key in keys:
|
|
163
|
+
for base in (payload, source):
|
|
164
|
+
for key in ("duration_ms", "duration", "duration_sec"):
|
|
177
165
|
value = _safe_int(base.get(key), default=0)
|
|
178
166
|
if value > 0:
|
|
179
|
-
return value
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
duration_ms = _pick_int(["duration_ms"])
|
|
183
|
-
if duration_ms <= 0:
|
|
184
|
-
duration_ms = _pick_int(["duration_ms"], from_source=True)
|
|
185
|
-
|
|
186
|
-
if duration_ms <= 0:
|
|
187
|
-
raw_duration = _pick_int(["duration", "duration_sec"])
|
|
188
|
-
if raw_duration <= 0:
|
|
189
|
-
raw_duration = _pick_int(["duration", "duration_sec"], from_source=True)
|
|
190
|
-
if raw_duration > 0:
|
|
191
|
-
duration_ms = raw_duration * 1000 if raw_duration < 10000 else raw_duration
|
|
192
|
-
|
|
193
|
-
return duration_ms
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
def _ensure_sentence_end(text: str) -> str:
|
|
197
|
-
if not text:
|
|
198
|
-
return text
|
|
199
|
-
if text[-1] in "。!?!?" or text.endswith("..."):
|
|
200
|
-
return text
|
|
201
|
-
return f"{text}。"
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
def _clean_asr_text(raw: str, provided_clean: str) -> str:
|
|
205
|
-
"""ASR_CLEAN prompt-contracts/asr-clean.md@v1
|
|
206
|
-
|
|
207
|
-
Steps:
|
|
208
|
-
1) base select: provided_clean > raw
|
|
209
|
-
2) denoise: remove filler/repetition/whitespace noise
|
|
210
|
-
3) sentence split + punctuation restore
|
|
211
|
-
4) paragraphize: one sentence per line, 2-4 sentences per paragraph
|
|
212
|
-
"""
|
|
213
|
-
base = normalize_text(provided_clean) or normalize_text(raw)
|
|
214
|
-
if not base:
|
|
215
|
-
return ""
|
|
216
|
-
|
|
217
|
-
# step2: 去噪(口头禅/重复)
|
|
218
|
-
base = re.sub(r"\b(嗯|啊|呃|额|那个|这个|然后|就是)\b", " ", base)
|
|
219
|
-
base = re.sub(r"(嗯+|啊+|呃+)", " ", base)
|
|
220
|
-
base = re.sub(r"(就是就是|然后然后|这个这个|那个那个)", " ", base)
|
|
221
|
-
base = re.sub(r"\s+", " ", base).strip()
|
|
222
|
-
|
|
223
|
-
# step3: 断句 + 句尾标点
|
|
224
|
-
units = [normalize_text(part) for part in re.split(r"[。!?!?;;\n]+", base)]
|
|
225
|
-
sentences = [_ensure_sentence_end(unit) for unit in units if unit]
|
|
226
|
-
if not sentences:
|
|
227
|
-
fallback = _ensure_sentence_end(base)
|
|
228
|
-
return fallback if fallback else ""
|
|
229
|
-
|
|
230
|
-
# step4: 每句一行;每段 2~4 句(默认 3 句)
|
|
231
|
-
paragraphs: List[str] = []
|
|
232
|
-
bucket: List[str] = []
|
|
233
|
-
for sentence in sentences:
|
|
234
|
-
bucket.append(sentence)
|
|
235
|
-
if len(bucket) >= 3:
|
|
236
|
-
paragraphs.append("\n".join(bucket))
|
|
237
|
-
bucket = []
|
|
238
|
-
|
|
239
|
-
if bucket:
|
|
240
|
-
if len(bucket) == 1 and paragraphs:
|
|
241
|
-
paragraphs[-1] = f"{paragraphs[-1]}\n{bucket[0]}"
|
|
242
|
-
else:
|
|
243
|
-
paragraphs.append("\n".join(bucket))
|
|
244
|
-
|
|
245
|
-
return "\n\n".join(paragraphs)
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
def _pick_text(payload: Dict[str, Any], keys: List[str], source_keys: Optional[List[str]] = None) -> str:
|
|
249
|
-
source = _source_dict(payload)
|
|
250
|
-
for key in keys:
|
|
251
|
-
text = normalize_text(payload.get(key))
|
|
252
|
-
if text:
|
|
253
|
-
return text
|
|
254
|
-
for key in (source_keys or keys):
|
|
255
|
-
text = normalize_text(source.get(key))
|
|
256
|
-
if text:
|
|
257
|
-
return text
|
|
258
|
-
return ""
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
def _extract_platform_work_id(payload: Dict[str, Any]) -> str:
|
|
262
|
-
return _pick_text(
|
|
263
|
-
payload,
|
|
264
|
-
["platform_work_id", "aweme_id", "note_id", "item_id", "id"],
|
|
265
|
-
["platform_work_id", "aweme_id", "note_id", "item_id", "id"],
|
|
266
|
-
)
|
|
167
|
+
return value * 1000 if key != "duration_ms" and value < 10000 else value
|
|
168
|
+
return 0
|
|
267
169
|
|
|
268
170
|
|
|
269
171
|
def _extract_author(payload: Dict[str, Any]) -> Dict[str, str]:
|
|
270
172
|
author_raw = payload.get("author")
|
|
271
173
|
author = author_raw if isinstance(author_raw, dict) else {}
|
|
272
|
-
|
|
273
174
|
source = _source_dict(payload)
|
|
274
175
|
source_author = source.get("author") if isinstance(source.get("author"), dict) else {}
|
|
275
176
|
|
|
276
|
-
|
|
277
|
-
nickname
|
|
278
|
-
normalize_text(
|
|
279
|
-
|
|
280
|
-
or normalize_text(source_author.get("nickname"))
|
|
281
|
-
)
|
|
177
|
+
nickname = normalize_text(author.get("nickname"))
|
|
178
|
+
if not nickname and isinstance(author_raw, str):
|
|
179
|
+
nickname = normalize_text(author_raw)
|
|
180
|
+
nickname = nickname or normalize_text(source_author.get("nickname"))
|
|
282
181
|
|
|
283
182
|
author_handle = (
|
|
284
183
|
normalize_text(payload.get("author_handle"))
|
|
@@ -294,37 +193,10 @@ def _extract_author(payload: Dict[str, Any]) -> Dict[str, str]:
|
|
|
294
193
|
or normalize_text(source_author.get("platform_author_id"))
|
|
295
194
|
or normalize_text(source_author.get("author_platform_id"))
|
|
296
195
|
)
|
|
297
|
-
|
|
298
|
-
xhs_user_id = (
|
|
299
|
-
normalize_text(payload.get("xhs_user_id"))
|
|
300
|
-
or normalize_text(author.get("xhs_user_id"))
|
|
301
|
-
or normalize_text(source_author.get("xhs_user_id"))
|
|
302
|
-
)
|
|
303
|
-
xhs_sec_token = (
|
|
304
|
-
normalize_text(payload.get("xhs_sec_token"))
|
|
305
|
-
or normalize_text(author.get("xhs_sec_token"))
|
|
306
|
-
or normalize_text(source_author.get("xhs_sec_token"))
|
|
307
|
-
)
|
|
308
|
-
|
|
309
|
-
douyin_sec_uid = (
|
|
310
|
-
normalize_text(payload.get("douyin_sec_uid"))
|
|
311
|
-
or normalize_text(author.get("douyin_sec_uid"))
|
|
312
|
-
or normalize_text(source_author.get("douyin_sec_uid"))
|
|
313
|
-
)
|
|
314
|
-
douyin_aweme_author_id = (
|
|
315
|
-
normalize_text(payload.get("douyin_aweme_author_id"))
|
|
316
|
-
or normalize_text(author.get("douyin_aweme_author_id"))
|
|
317
|
-
or normalize_text(source_author.get("douyin_aweme_author_id"))
|
|
318
|
-
)
|
|
319
|
-
|
|
320
196
|
return {
|
|
321
197
|
"nickname": nickname,
|
|
322
198
|
"author_handle": author_handle,
|
|
323
199
|
"platform_author_id": platform_author_id,
|
|
324
|
-
"xhs_user_id": xhs_user_id,
|
|
325
|
-
"xhs_sec_token": xhs_sec_token,
|
|
326
|
-
"douyin_sec_uid": douyin_sec_uid,
|
|
327
|
-
"douyin_aweme_author_id": douyin_aweme_author_id,
|
|
328
200
|
}
|
|
329
201
|
|
|
330
202
|
|
|
@@ -343,11 +215,11 @@ def _clean_for_filename(text: str) -> str:
|
|
|
343
215
|
normalized = normalized.replace("\n", " ").replace("\r", " ")
|
|
344
216
|
|
|
345
217
|
kept: List[str] = []
|
|
346
|
-
for
|
|
347
|
-
|
|
348
|
-
if _is_cjk(
|
|
349
|
-
kept.append(
|
|
350
|
-
elif
|
|
218
|
+
for char in normalized:
|
|
219
|
+
category = unicodedata.category(char)
|
|
220
|
+
if _is_cjk(char) or char.isalnum() or char in {" ", "-", "_"}:
|
|
221
|
+
kept.append(char)
|
|
222
|
+
elif category.startswith("Z"):
|
|
351
223
|
kept.append(" ")
|
|
352
224
|
|
|
353
225
|
compact = "".join(kept)
|
|
@@ -359,25 +231,29 @@ def _clean_for_filename(text: str) -> str:
|
|
|
359
231
|
def _clip_with_min(text: str, min_len: int, max_len: int, fallback: str) -> str:
    """Clean *text* for filename use and force its length into [min_len, max_len].

    Both *text* and *fallback* are passed through ``_clean_for_filename``.
    When the cleaned text is empty the cleaned fallback is used. While the
    result is shorter than *min_len* it is padded, in order, with the cleaned
    fallback, then "内容速览", then (after truncation) "作品卡"; every step
    truncates to *max_len*. If the result is still empty, the truncated
    cleaned fallback is returned.
    """
    result = _clean_for_filename(text)
    cleaned_fallback = _clean_for_filename(fallback)
    result = result or cleaned_fallback

    # First two padding rounds: the fallback itself, then a generic filler.
    for filler in (cleaned_fallback, "内容速览"):
        if len(result) < min_len:
            result = (result + filler)[:max_len]

    result = result[:max_len]
    # Last-resort filler once the hard upper bound has been applied.
    if len(result) < min_len:
        result = (result + "作品卡")[:max_len]

    if result:
        return result[:max_len]
    return cleaned_fallback[:max_len]
|
|
374
244
|
|
|
375
245
|
|
|
376
|
-
def
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
246
|
+
def _extract_platform_work_id(payload: Dict[str, Any]) -> str:
    """Return the work id found in *payload* or its ``source`` dict.

    Delegates to ``_pick_text`` with the same candidate key list for both
    the top-level payload and the nested source, tried in this order:
    ``platform_work_id``, ``aweme_id``, ``note_id``, ``item_id``, ``id``.
    """
    id_keys = ["platform_work_id", "aweme_id", "note_id", "item_id", "id"]
    # Pass an independent copy for the source lookup, as the original did.
    return _pick_text(payload, id_keys, list(id_keys))
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _pick_author_slug(payload: Dict[str, Any]) -> str:
    """Build a short author slug for use in card filenames.

    Uses the first truthy value among the author's ``nickname``,
    ``author_handle`` and ``platform_author_id`` (as returned by
    ``_extract_author``), falling back to "作者". The value is clipped via
    ``_clip_with_min`` to 2–18 characters; if the clipped slug is still
    shorter than 2 characters, "作者" is returned.
    """
    info = _extract_author(payload)
    label = "作者"
    # Fields are read lazily, in priority order; stop at the first truthy one.
    for field in ("nickname", "author_handle", "platform_author_id"):
        candidate = info[field]
        if candidate:
            label = candidate
            break

    clipped = _clip_with_min(label, min_len=2, max_len=18, fallback="作者")
    if len(clipped) < 2:
        return "作者"
    return clipped
|
|
383
259
|
|
|
@@ -433,37 +309,62 @@ def _extract_tags(payload: Dict[str, Any]) -> List[str]:
|
|
|
433
309
|
tags = [normalize_text(item).lstrip("#") for item in value if normalize_text(item)]
|
|
434
310
|
if tags:
|
|
435
311
|
return list(dict.fromkeys(tags))
|
|
436
|
-
|
|
437
312
|
return []
|
|
438
313
|
|
|
439
314
|
|
|
315
|
+
def _format_duration(duration_ms: int) -> str:
|
|
316
|
+
if duration_ms <= 0:
|
|
317
|
+
return "未知"
|
|
318
|
+
total_sec = duration_ms // 1000
|
|
319
|
+
minute, second = divmod(total_sec, 60)
|
|
320
|
+
if minute:
|
|
321
|
+
return f"{minute}分{second:02d}秒"
|
|
322
|
+
return f"{second}秒"
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def _analysis_status_from_sections(analysis_sections: Dict[str, Any]) -> Dict[str, Any]:
    """Summarize the ``meta`` of an analysis-sections dict as a status record.

    Status resolution, first match wins:
      * ``meta["llm_used"]`` truthy            -> "completed"
      * no reason, or "analysis_mode_local"    -> "skipped"
      * reason contains "timeout"              -> "timeout"
      * reason contains "unavailable"          -> "unavailable"
      * anything else                          -> "failed"

    Returns a dict with keys ``status``, ``provider`` (defaults to "local"),
    ``reason`` (``None`` when empty), ``duration_ms``, ``llm_used`` and
    ``degraded``.
    """
    raw_meta = analysis_sections.get("meta")
    meta = raw_meta if isinstance(raw_meta, dict) else {}
    reason = normalize_text(meta.get("reason"))
    llm_used = bool(meta.get("llm_used"))

    if llm_used:
        status = "completed"
    elif not reason or reason == "analysis_mode_local":
        status = "skipped"
    else:
        status = "failed"
        # The status label is the marker itself; "timeout" takes precedence.
        for marker in ("timeout", "unavailable"):
            if marker in reason:
                status = marker
                break

    return {
        "status": status,
        "provider": normalize_text(analysis_sections.get("provider")) or "local",
        "reason": reason or None,
        "duration_ms": _safe_int(meta.get("duration_ms"), default=0),
        "llm_used": llm_used,
        "degraded": bool(meta.get("degraded")),
    }
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def _has_meaningful_analysis_sections(value: Any) -> bool:
|
|
349
|
+
if not isinstance(value, dict):
|
|
350
|
+
return False
|
|
351
|
+
modules = value.get("modules")
|
|
352
|
+
if not isinstance(modules, dict):
|
|
353
|
+
return False
|
|
354
|
+
return any(bool(normalize_text(item)) for items in modules.values() if isinstance(items, list) for item in items)
|
|
355
|
+
|
|
356
|
+
|
|
440
357
|
def _extract_required_fields(payload: Dict[str, Any], platform: str) -> Dict[str, Any]:
|
|
441
358
|
author = _extract_author(payload)
|
|
442
|
-
|
|
443
359
|
title = _pick_text(payload, ["title", "desc"], ["title", "desc"])
|
|
444
360
|
caption_raw = normalize_text(payload.get("caption_raw") or payload.get("desc"))
|
|
445
361
|
platform_work_id = _extract_platform_work_id(payload)
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
["source_url", "share_url", "url"],
|
|
450
|
-
["source_url", "share_url", "url", "share_text"],
|
|
451
|
-
)
|
|
452
|
-
share_url = _pick_text(
|
|
453
|
-
payload,
|
|
454
|
-
["share_url", "canonical_share_url"],
|
|
455
|
-
["share_url", "canonical_share_url", "url", "source_url", "share_text"],
|
|
456
|
-
) or source_url
|
|
457
|
-
|
|
458
|
-
cover_image = _pick_text(
|
|
459
|
-
payload,
|
|
460
|
-
["cover_image", "cover_url", "cover"],
|
|
461
|
-
["cover_image", "cover_url", "cover", "origin_cover"],
|
|
462
|
-
)
|
|
362
|
+
source_url = _pick_text(payload, ["source_url", "share_url", "url"], ["source_url", "share_url", "url"])
|
|
363
|
+
share_url = _pick_text(payload, ["share_url", "canonical_share_url"], ["share_url", "canonical_share_url", "url"]) or source_url
|
|
364
|
+
cover_image = _pick_text(payload, ["cover_image", "cover_url", "cover"], ["cover_image", "cover_url", "cover"])
|
|
463
365
|
selected_images = payload.get("selected_image_urls")
|
|
464
366
|
if not cover_image and isinstance(selected_images, list) and selected_images:
|
|
465
367
|
cover_image = normalize_text(selected_images[0])
|
|
466
|
-
|
|
467
368
|
video_download_url = _pick_text(
|
|
468
369
|
payload,
|
|
469
370
|
["video_download_url", "video_down_url", "selected_video_url", "original_video_url", "video_url", "download_url"],
|
|
@@ -476,41 +377,26 @@ def _extract_required_fields(payload: Dict[str, Any], platform: str) -> Dict[str
|
|
|
476
377
|
if create_time_sec <= 0:
|
|
477
378
|
create_time_sec = _to_unix_sec(_source_dict(payload).get("create_time"))
|
|
478
379
|
|
|
479
|
-
|
|
480
|
-
comment_count = _safe_int(payload.get("comment_count"), default=0)
|
|
481
|
-
collect_count = _safe_int(payload.get("collect_count"), default=0)
|
|
482
|
-
share_count = _safe_int(payload.get("share_count"), default=0)
|
|
483
|
-
play_count = _safe_optional_int(payload.get("play_count"))
|
|
484
|
-
|
|
485
|
-
summary = normalize_text(payload.get("summary"))
|
|
486
|
-
raw_content = normalize_text(payload.get("raw_content"))
|
|
487
|
-
primary_text = normalize_text(payload.get("primary_text"))
|
|
380
|
+
raw_content = normalize_text(payload.get("asr_raw") or payload.get("raw_content"))
|
|
488
381
|
provided_asr_clean = normalize_text(payload.get("asr_clean"))
|
|
489
|
-
asr_clean =
|
|
490
|
-
|
|
491
|
-
duration_ms = _extract_duration_ms(payload)
|
|
492
|
-
|
|
493
|
-
category = normalize_text(payload.get("category"))
|
|
494
|
-
if not category:
|
|
495
|
-
category = "观点"
|
|
496
|
-
|
|
497
|
-
hot_score = _safe_int(payload.get("hot_score"), default=0)
|
|
498
|
-
if hot_score <= 0:
|
|
499
|
-
hot_score = digg_count + comment_count * 2 + collect_count * 3 + share_count * 4
|
|
382
|
+
asr_clean = derive_asr_clean_text(raw_content, provided_asr_clean)
|
|
500
383
|
|
|
501
384
|
work_modality = normalize_text(payload.get("work_modality"))
|
|
502
385
|
if not work_modality:
|
|
503
386
|
work_modality = "video" if video_download_url or raw_content else "text"
|
|
504
387
|
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
if primary_text_source_raw in {"asr_clean", "caption_raw"}
|
|
510
|
-
else ("asr_clean" if work_modality == "video" else "caption_raw")
|
|
511
|
-
)
|
|
388
|
+
primary_text_source = normalize_text(payload.get("primary_text_source"))
|
|
389
|
+
if primary_text_source not in {"asr_clean", "caption_raw"}:
|
|
390
|
+
primary_text_source = "asr_clean" if work_modality == "video" else "caption_raw"
|
|
391
|
+
primary_text = normalize_text(payload.get("primary_text"))
|
|
512
392
|
if not primary_text:
|
|
513
|
-
primary_text = asr_clean if primary_text_source == "asr_clean" else
|
|
393
|
+
primary_text = asr_clean if primary_text_source == "asr_clean" else (caption_raw or raw_content)
|
|
394
|
+
|
|
395
|
+
analysis_sections = ensure_analysis_sections_schema(
|
|
396
|
+
payload.get("analysis_sections") if isinstance(payload.get("analysis_sections"), dict) else {},
|
|
397
|
+
provider="local",
|
|
398
|
+
llm_used=False,
|
|
399
|
+
)
|
|
514
400
|
|
|
515
401
|
return {
|
|
516
402
|
"title": title,
|
|
@@ -524,414 +410,28 @@ def _extract_required_fields(payload: Dict[str, Any], platform: str) -> Dict[str
|
|
|
524
410
|
"source_url": source_url,
|
|
525
411
|
"cover_image": cover_image,
|
|
526
412
|
"video_download_url": video_download_url,
|
|
527
|
-
"published_date":
|
|
528
|
-
"duration_ms":
|
|
529
|
-
"digg_count": digg_count,
|
|
530
|
-
"comment_count": comment_count,
|
|
531
|
-
"collect_count": collect_count,
|
|
532
|
-
"share_count": share_count,
|
|
533
|
-
"play_count": play_count,
|
|
413
|
+
"published_date": _resolve_published_date(payload, create_time_sec),
|
|
414
|
+
"duration_ms": _extract_duration_ms(payload),
|
|
415
|
+
"digg_count": _safe_int(payload.get("digg_count"), default=0),
|
|
416
|
+
"comment_count": _safe_int(payload.get("comment_count"), default=0),
|
|
417
|
+
"collect_count": _safe_int(payload.get("collect_count"), default=0),
|
|
418
|
+
"share_count": _safe_int(payload.get("share_count"), default=0),
|
|
419
|
+
"play_count": _safe_optional_int(payload.get("play_count")),
|
|
534
420
|
"tags": _extract_tags(payload),
|
|
535
421
|
"work_modality": work_modality,
|
|
536
|
-
"category": category,
|
|
422
|
+
"category": normalize_text(payload.get("category")) or "观点",
|
|
537
423
|
"content_kind": normalize_text(payload.get("content_kind")),
|
|
538
|
-
"summary": summary,
|
|
539
|
-
"hot_score": hot_score,
|
|
424
|
+
"summary": normalize_text(payload.get("summary")),
|
|
540
425
|
"raw_content": raw_content,
|
|
541
|
-
"
|
|
426
|
+
"asr_raw": raw_content,
|
|
542
427
|
"asr_clean": asr_clean,
|
|
543
|
-
"
|
|
428
|
+
"primary_text": primary_text,
|
|
429
|
+
"primary_text_source": primary_text_source,
|
|
544
430
|
"request_id": payload.get("request_id"),
|
|
545
431
|
"confidence": normalize_text(payload.get("confidence")) or "low",
|
|
546
432
|
"error_reason": payload.get("error_reason"),
|
|
547
433
|
"extract_trace": payload.get("extract_trace", []),
|
|
548
|
-
"analysis_sections":
|
|
549
|
-
"analysis_output": payload.get("analysis_output") if isinstance(payload.get("analysis_output"), dict) else {},
|
|
550
|
-
"author_analysis_v2": payload.get("author_analysis_v2") if isinstance(payload.get("author_analysis_v2"), dict) else {},
|
|
551
|
-
"author_analysis_input_v1": payload.get("author_analysis_input_v1") if isinstance(payload.get("author_analysis_input_v1"), dict) else {},
|
|
552
|
-
"sampled_work_explanations": payload.get("sampled_work_explanations") if isinstance(payload.get("sampled_work_explanations"), dict) else {},
|
|
553
|
-
"author_card_highlights": payload.get("author_card_highlights") if isinstance(payload.get("author_card_highlights"), dict) else {},
|
|
554
|
-
"validation": payload.get("validation") if isinstance(payload.get("validation"), dict) else {},
|
|
555
|
-
"business_score": _safe_int(payload.get("business_score"), default=0),
|
|
556
|
-
"benchmark_gap_score": _safe_int(payload.get("benchmark_gap_score"), default=0),
|
|
557
|
-
"style_radar": payload.get("style_radar") if isinstance(payload.get("style_radar"), dict) else {},
|
|
558
|
-
"core_contradictions": payload.get("core_contradictions") if isinstance(payload.get("core_contradictions"), list) else [],
|
|
559
|
-
"recommendations": payload.get("recommendations") if isinstance(payload.get("recommendations"), list) else [],
|
|
560
|
-
"business_analysis": normalize_text(payload.get("business_analysis")),
|
|
561
|
-
"benchmark_analysis": normalize_text(payload.get("benchmark_analysis")),
|
|
562
|
-
"nickname": normalize_text(payload.get("nickname")),
|
|
563
|
-
"ip_location": normalize_text(payload.get("ip_location")),
|
|
564
|
-
"signature": normalize_text(payload.get("signature")),
|
|
565
|
-
"avatar_url": normalize_text(payload.get("avatar_url")),
|
|
566
|
-
"fans_count": _safe_optional_int(payload.get("fans_count")),
|
|
567
|
-
"liked_count": _safe_optional_int(payload.get("liked_count")),
|
|
568
|
-
"collected_count": _safe_optional_int(payload.get("collected_count")),
|
|
569
|
-
"works_count": _safe_optional_int(payload.get("works_count")),
|
|
570
|
-
"verified": payload.get("verified") if isinstance(payload.get("verified"), bool) else None,
|
|
571
|
-
"snapshot_at": normalize_text(payload.get("snapshot_at")),
|
|
572
|
-
}
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
def _format_create_time(create_time_sec: int) -> str:
|
|
576
|
-
text = _format_shanghai_datetime(create_time_sec)
|
|
577
|
-
if text:
|
|
578
|
-
return text
|
|
579
|
-
if create_time_sec <= 0:
|
|
580
|
-
return "未知"
|
|
581
|
-
return str(create_time_sec)
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
def _format_duration(duration_ms: int) -> str:
|
|
585
|
-
if duration_ms <= 0:
|
|
586
|
-
return "未知"
|
|
587
|
-
total_sec = duration_ms // 1000
|
|
588
|
-
minute, second = divmod(total_sec, 60)
|
|
589
|
-
if minute:
|
|
590
|
-
return f"{minute}分{second:02d}秒"
|
|
591
|
-
return f"{second}秒"
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
def _sentence_units(text: str) -> List[str]:
|
|
595
|
-
if not text:
|
|
596
|
-
return []
|
|
597
|
-
return [normalize_text(x) for x in re.split(r"[。!?!?;;\\n]+", text) if normalize_text(x)]
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
def _first_sentence(text: str) -> str:
|
|
601
|
-
units = _sentence_units(text)
|
|
602
|
-
return units[0] if units else ""
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
def _hit_count(text: str, keywords: List[str]) -> int:
|
|
606
|
-
if not text:
|
|
607
|
-
return 0
|
|
608
|
-
return sum(1 for token in keywords if token in text)
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
def _top_keywords(text: str, candidates: List[str], topn: int = 3) -> List[str]:
|
|
612
|
-
if not text:
|
|
613
|
-
return []
|
|
614
|
-
scored = []
|
|
615
|
-
for token in candidates:
|
|
616
|
-
count = text.count(token)
|
|
617
|
-
if count > 0:
|
|
618
|
-
scored.append((count, token))
|
|
619
|
-
scored.sort(key=lambda x: (-x[0], len(x[1])))
|
|
620
|
-
return [token for _, token in scored[:topn]]
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
def _score_from_hits(hits: int, full_score_hits: int = 4) -> int:
|
|
624
|
-
if hits <= 0:
|
|
625
|
-
return 2
|
|
626
|
-
if hits >= full_score_hits:
|
|
627
|
-
return 5
|
|
628
|
-
return min(5, hits + 2)
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
def _analyze_topic(fields: Dict[str, Any]) -> Dict[str, Any]:
|
|
633
|
-
title = normalize_text(fields.get("title") or "")
|
|
634
|
-
asr = normalize_text(fields.get("asr_clean") or "")
|
|
635
|
-
category = normalize_text(fields.get("category") or "")
|
|
636
|
-
text = f"{title} {asr}"
|
|
637
|
-
|
|
638
|
-
if not text.strip():
|
|
639
|
-
return {
|
|
640
|
-
"score": 2,
|
|
641
|
-
"lines": ["- 类型:数据不足。", "- 细分主题:数据不足。", "- 受众痛点:数据不足,需补充标题或ASR。"],
|
|
642
|
-
"gaps": ["补齐标题或ASR文本,才能完成选题分类与主题归因"],
|
|
643
|
-
"evidence": "输入文本缺失",
|
|
644
|
-
}
|
|
645
|
-
|
|
646
|
-
type_rules = {
|
|
647
|
-
"流量型": ["热点", "挑战", "反转", "揭秘", "真相", "别再", "为什么", "踩坑", "3秒", "爆款"],
|
|
648
|
-
"人设型": ["我是", "我们", "日常", "系列", "分享", "经历", "成长", "复盘", "带你", "我"],
|
|
649
|
-
"营销型": ["领取", "私信", "咨询", "下单", "课程", "优惠", "链接", "报名", "合作", "购买"],
|
|
650
|
-
}
|
|
651
|
-
type_scores = {name: _hit_count(text, kws) for name, kws in type_rules.items()}
|
|
652
|
-
|
|
653
|
-
if category in ["教程", "知识", "方法"]:
|
|
654
|
-
type_scores["营销型"] += 1
|
|
655
|
-
if category in ["观点", "人设", "日常"]:
|
|
656
|
-
type_scores["人设型"] += 1
|
|
657
|
-
|
|
658
|
-
main_type = max(type_scores, key=lambda k: type_scores[k])
|
|
659
|
-
main_hits = type_scores[main_type]
|
|
660
|
-
|
|
661
|
-
theme_candidates = [
|
|
662
|
-
"AI", "智能体", "变现", "副业", "教程", "工作流", "流量", "涨粉", "投流", "口播", "脚本", "工具", "私域", "创业", "营销",
|
|
663
|
-
]
|
|
664
|
-
themes = _top_keywords(text, theme_candidates, topn=3)
|
|
665
|
-
pain_candidates = ["不会", "焦虑", "卡住", "没流量", "转化", "不会写", "不会做", "时间不够", "担心", "风险"]
|
|
666
|
-
pains = _top_keywords(text, pain_candidates, topn=2)
|
|
667
|
-
|
|
668
|
-
lines = [
|
|
669
|
-
f"- 基础类型:{main_type}(命中信号 {main_hits} 个)。",
|
|
670
|
-
f"- 细分主题:{'、'.join(themes) if themes else '数据不足(未检测到显著主题词)'}。",
|
|
671
|
-
f"- 受众痛点:{'、'.join(pains) if pains else '以“快速落地/降低门槛”为主(显性痛点词不足)'}。",
|
|
672
|
-
]
|
|
673
|
-
|
|
674
|
-
return {
|
|
675
|
-
"score": _score_from_hits(main_hits),
|
|
676
|
-
"lines": lines,
|
|
677
|
-
"gaps": [] if themes else ["补充更完整ASR,提高细分主题识别稳定性"],
|
|
678
|
-
"evidence": f"类型命中分布={type_scores}",
|
|
679
|
-
}
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
def _analyze_style(fields: Dict[str, Any]) -> Dict[str, Any]:
|
|
683
|
-
asr = normalize_text(fields.get("asr_clean") or "")
|
|
684
|
-
title = normalize_text(fields.get("title") or "")
|
|
685
|
-
text = f"{title} {asr}".strip()
|
|
686
|
-
units = _sentence_units(asr)
|
|
687
|
-
|
|
688
|
-
if not text:
|
|
689
|
-
return {
|
|
690
|
-
"score": 2,
|
|
691
|
-
"lines": ["- 人设匹配:数据不足。", "- 句式结构:数据不足。", "- 语气与情绪:数据不足。"],
|
|
692
|
-
"gaps": ["补齐ASR文本后再做文风拆解"],
|
|
693
|
-
"evidence": "输入文本缺失",
|
|
694
|
-
}
|
|
695
|
-
|
|
696
|
-
avg_len = int(sum(len(u) for u in units) / max(1, len(units))) if units else 0
|
|
697
|
-
if avg_len <= 14:
|
|
698
|
-
length_type = "短句为主"
|
|
699
|
-
elif avg_len <= 24:
|
|
700
|
-
length_type = "中短句混合"
|
|
701
|
-
else:
|
|
702
|
-
length_type = "中长句为主"
|
|
703
|
-
|
|
704
|
-
q_count = text.count("?") + text.count("?")
|
|
705
|
-
e_count = text.count("!") + text.count("!")
|
|
706
|
-
statement_count = max(0, len(units) - q_count - e_count)
|
|
707
|
-
persona_hits = _hit_count(text, ["我", "我们", "你", "大家", "朋友们", "聪明的你"])
|
|
708
|
-
rhetoric_hits = _hit_count(text, ["不是", "而是", "其实", "真的", "一定", "必须", "先", "再"])
|
|
709
|
-
|
|
710
|
-
lines = [
|
|
711
|
-
f"- 句式结构:{length_type},平均句长约 {avg_len} 字。",
|
|
712
|
-
f"- 语气分布:疑问 {q_count} / 感叹 {e_count} / 陈述 {statement_count}。",
|
|
713
|
-
f"- 人设与修辞:人设代词命中 {persona_hits} 次,强调/转折词命中 {rhetoric_hits} 次。",
|
|
714
|
-
]
|
|
715
|
-
|
|
716
|
-
strength_hits = int(avg_len > 0) + int(persona_hits > 0) + int(rhetoric_hits > 0)
|
|
717
|
-
return {
|
|
718
|
-
"score": _score_from_hits(strength_hits, full_score_hits=3),
|
|
719
|
-
"lines": lines,
|
|
720
|
-
"gaps": [] if units else ["ASR分句失败,建议人工复核"],
|
|
721
|
-
"evidence": f"avg_len={avg_len}, persona_hits={persona_hits}, rhetoric_hits={rhetoric_hits}",
|
|
722
|
-
}
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
def _analyze_hook(fields: Dict[str, Any]) -> Dict[str, Any]:
|
|
726
|
-
title = normalize_text(fields.get("title") or "")
|
|
727
|
-
asr = normalize_text(fields.get("asr_clean") or "")
|
|
728
|
-
first = _first_sentence(asr) or title
|
|
729
|
-
middle = _sentence_units(asr)[len(_sentence_units(asr)) // 2] if _sentence_units(asr) else ""
|
|
730
|
-
|
|
731
|
-
if not first:
|
|
732
|
-
return {
|
|
733
|
-
"score": 2,
|
|
734
|
-
"lines": ["- 开头钩子:数据不足。", "- 中段钩子:数据不足。", "- 结尾钩子:数据不足。"],
|
|
735
|
-
"gaps": ["缺少标题与ASR,无法提取钩子原话"],
|
|
736
|
-
"evidence": "开头句缺失",
|
|
737
|
-
}
|
|
738
|
-
|
|
739
|
-
hook_type = "陈述式"
|
|
740
|
-
if any(k in first for k in ["?", "?", "为什么", "怎么"]):
|
|
741
|
-
hook_type = "疑问式"
|
|
742
|
-
elif any(k in first for k in ["别再", "误区", "真相", "不是"]):
|
|
743
|
-
hook_type = "反常识式"
|
|
744
|
-
elif any(k in first for k in ["当你", "如果", "今天"]):
|
|
745
|
-
hook_type = "场景代入式"
|
|
746
|
-
|
|
747
|
-
end_candidates = [u for u in _sentence_units(asr) if _hit_count(u, ["关注", "评论", "私信", "收藏", "转发", "下次见", "领取"]) > 0]
|
|
748
|
-
end = end_candidates[-1] if end_candidates else "未检测到明确结尾钩子"
|
|
749
|
-
|
|
750
|
-
lines = [
|
|
751
|
-
f"- 开头钩子({hook_type}):{first}",
|
|
752
|
-
f"- 中段钩子:{middle or '数据不足(中段文本不足)'}",
|
|
753
|
-
f"- 结尾钩子:{end}",
|
|
754
|
-
]
|
|
755
|
-
|
|
756
|
-
hook_hits = int(first != "") + int(bool(middle)) + int(end != "未检测到明确结尾钩子")
|
|
757
|
-
return {
|
|
758
|
-
"score": _score_from_hits(hook_hits, full_score_hits=3),
|
|
759
|
-
"lines": lines,
|
|
760
|
-
"gaps": [] if hook_hits >= 2 else ["建议补充中段转折钩子与结尾动作钩子"],
|
|
761
|
-
"evidence": f"hook_type={hook_type}, hook_hits={hook_hits}",
|
|
762
|
-
}
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
def _analyze_structure(fields: Dict[str, Any]) -> Dict[str, Any]:
|
|
766
|
-
asr = normalize_text(fields.get("asr_clean") or "")
|
|
767
|
-
units = _sentence_units(asr)
|
|
768
|
-
if not units:
|
|
769
|
-
return {
|
|
770
|
-
"score": 2,
|
|
771
|
-
"lines": ["- 结构标签:数据不足。", "- 模板判定:数据不足。"],
|
|
772
|
-
"gaps": ["补充ASR后再进行结构标注"],
|
|
773
|
-
"evidence": "分句为空",
|
|
774
|
-
}
|
|
775
|
-
|
|
776
|
-
label_rules = {
|
|
777
|
-
"钩子": ["?", "?", "为什么", "怎么", "别再", "真相", "当你", "如果"],
|
|
778
|
-
"冲突": ["但是", "却", "问题", "误区", "卡住", "焦虑", "失败"],
|
|
779
|
-
"转折": ["所以", "于是", "然后", "接着", "这时候", "其实"],
|
|
780
|
-
"举证": ["数据", "案例", "比如", "步骤", "第一", "第二", "第三"],
|
|
781
|
-
"CTA": ["评论", "关注", "私信", "收藏", "转发", "点击", "领取", "报名"],
|
|
782
|
-
}
|
|
783
|
-
coverage = {k: 0 for k in label_rules}
|
|
784
|
-
for sent in units:
|
|
785
|
-
for label, kws in label_rules.items():
|
|
786
|
-
if any(kw in sent for kw in kws):
|
|
787
|
-
coverage[label] += 1
|
|
788
|
-
|
|
789
|
-
present = [k for k, v in coverage.items() if v > 0]
|
|
790
|
-
missing = [k for k, v in coverage.items() if v == 0]
|
|
791
|
-
template = "钩子→冲突→转折→举证→CTA" if len(present) >= 4 else "钩子→观点→补充说明"
|
|
792
|
-
|
|
793
|
-
lines = [
|
|
794
|
-
f"- 结构标签覆盖:{', '.join([f'{k}:{v}' for k, v in coverage.items()])}。",
|
|
795
|
-
f"- 模板判定:{template}。",
|
|
796
|
-
f"- 缺失模块:{'、'.join(missing) if missing else '无'}。",
|
|
797
|
-
]
|
|
798
|
-
|
|
799
|
-
return {
|
|
800
|
-
"score": _score_from_hits(len(present), full_score_hits=5),
|
|
801
|
-
"lines": lines,
|
|
802
|
-
"gaps": [f"优先补齐结构模块:{'、'.join(missing)}"] if missing else [],
|
|
803
|
-
"evidence": f"coverage={coverage}",
|
|
804
|
-
}
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
def _analyze_cta(fields: Dict[str, Any]) -> Dict[str, Any]:
|
|
808
|
-
asr = normalize_text(fields.get("asr_clean") or "")
|
|
809
|
-
units = _sentence_units(asr)
|
|
810
|
-
cta_tokens = ["评论", "关注", "私信", "收藏", "转发", "点击", "领取", "报名", "下单", "咨询", "试试"]
|
|
811
|
-
cta_sentences = [u for u in units if any(token in u for token in cta_tokens)]
|
|
812
|
-
|
|
813
|
-
if not units:
|
|
814
|
-
return {
|
|
815
|
-
"score": 2,
|
|
816
|
-
"lines": ["- CTA策略:数据不足。", "- 行动指令:数据不足。"],
|
|
817
|
-
"gaps": ["缺少ASR,无法识别CTA"],
|
|
818
|
-
"evidence": "分句为空",
|
|
819
|
-
}
|
|
820
|
-
|
|
821
|
-
if not cta_sentences:
|
|
822
|
-
return {
|
|
823
|
-
"score": 2,
|
|
824
|
-
"lines": ["- CTA策略:未检测到明确行动号召。", "- 行动指令:建议补一句“评论区/私信领取”。"],
|
|
825
|
-
"gaps": ["补充单一明确CTA,避免只有信息陈述"],
|
|
826
|
-
"evidence": "cta_sentences=0",
|
|
827
|
-
}
|
|
828
|
-
|
|
829
|
-
primary_cta = cta_sentences[-1]
|
|
830
|
-
cta_types = []
|
|
831
|
-
if any(k in asr for k in ["评论", "点赞", "收藏", "转发", "关注"]):
|
|
832
|
-
cta_types.append("互动型")
|
|
833
|
-
if any(k in asr for k in ["私信", "领取", "链接", "资料"]):
|
|
834
|
-
cta_types.append("线索型")
|
|
835
|
-
if any(k in asr for k in ["下单", "报名", "咨询", "购买"]):
|
|
836
|
-
cta_types.append("转化型")
|
|
837
|
-
|
|
838
|
-
lines = [
|
|
839
|
-
f"- CTA类型:{'、'.join(cta_types) if cta_types else '互动型(弱)'}。",
|
|
840
|
-
f"- 关键动作句:{primary_cta}",
|
|
841
|
-
f"- CTA密度:{len(cta_sentences)}/{len(units)} 句。",
|
|
842
|
-
]
|
|
843
|
-
|
|
844
|
-
return {
|
|
845
|
-
"score": _score_from_hits(len(cta_types) + int(len(cta_sentences) > 0), full_score_hits=3),
|
|
846
|
-
"lines": lines,
|
|
847
|
-
"gaps": [] if len(cta_types) > 0 else ["补充线索型或转化型CTA,提高商业闭环"],
|
|
848
|
-
"evidence": f"cta_types={cta_types}, cta_count={len(cta_sentences)}",
|
|
849
|
-
}
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
def _build_summary_module(results: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
|
|
853
|
-
ordered = ["选题", "文风", "Hook", "结构", "CTA"]
|
|
854
|
-
scored = [(name, results[name]["score"]) for name in ordered]
|
|
855
|
-
avg_score = round(sum(score for _, score in scored) / max(1, len(scored)), 2)
|
|
856
|
-
weakest = sorted(scored, key=lambda x: x[1])[:2]
|
|
857
|
-
|
|
858
|
-
if avg_score >= 4.2:
|
|
859
|
-
verdict = "可直接复用"
|
|
860
|
-
elif avg_score >= 3.4:
|
|
861
|
-
verdict = "可用,但需小幅优化"
|
|
862
|
-
else:
|
|
863
|
-
verdict = "需重写关键模块后再投放"
|
|
864
|
-
|
|
865
|
-
suggestions = []
|
|
866
|
-
for name, _ in weakest:
|
|
867
|
-
gaps = results[name].get("gaps") or []
|
|
868
|
-
if gaps:
|
|
869
|
-
suggestions.append(f"- [{name}] {gaps[0]}")
|
|
870
|
-
if not suggestions:
|
|
871
|
-
suggestions = ["- 保持当前结构,持续做A/B测试验证Hook与CTA。"]
|
|
872
|
-
|
|
873
|
-
return {
|
|
874
|
-
"score": int(round(avg_score)),
|
|
875
|
-
"lines": [
|
|
876
|
-
f"- 结论:综合评分 {avg_score}/5,判定为“{verdict}”。",
|
|
877
|
-
"- 建议:",
|
|
878
|
-
*suggestions[:3],
|
|
879
|
-
],
|
|
880
|
-
"gaps": [],
|
|
881
|
-
"evidence": f"scores={dict(scored)}",
|
|
882
|
-
}
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
def _insight_metric_snapshot(fields: Dict[str, Any]) -> Dict[str, Any]:
|
|
886
|
-
digg = _safe_int(fields.get("digg_count"), default=0)
|
|
887
|
-
comment = _safe_int(fields.get("comment_count"), default=0)
|
|
888
|
-
collect = _safe_int(fields.get("collect_count"), default=0)
|
|
889
|
-
share = _safe_int(fields.get("share_count"), default=0)
|
|
890
|
-
play = _safe_int(fields.get("play_count"), default=0)
|
|
891
|
-
|
|
892
|
-
interaction = digg + comment * 2 + collect * 3 + share * 4
|
|
893
|
-
interaction_rate = interaction / play if play > 0 else 0.0
|
|
894
|
-
return {
|
|
895
|
-
"interaction": interaction,
|
|
896
|
-
"interaction_rate": interaction_rate,
|
|
897
|
-
"digg": digg,
|
|
898
|
-
"comment": comment,
|
|
899
|
-
"collect": collect,
|
|
900
|
-
"share": share,
|
|
901
|
-
}
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
def _build_local_analysis_sections(fields: Dict[str, Any]) -> Dict[str, Any]:
|
|
905
|
-
topic = _analyze_topic(fields)
|
|
906
|
-
style = _analyze_style(fields)
|
|
907
|
-
hook = _analyze_hook(fields)
|
|
908
|
-
structure = _analyze_structure(fields)
|
|
909
|
-
cta = _analyze_cta(fields)
|
|
910
|
-
summary = _build_summary_module(
|
|
911
|
-
{
|
|
912
|
-
"选题": topic,
|
|
913
|
-
"文风": style,
|
|
914
|
-
"Hook": hook,
|
|
915
|
-
"结构": structure,
|
|
916
|
-
"CTA": cta,
|
|
917
|
-
}
|
|
918
|
-
)
|
|
919
|
-
metrics = _insight_metric_snapshot(fields)
|
|
920
|
-
insight_lines = list(summary.get("lines") or [])
|
|
921
|
-
insight_lines.extend(
|
|
922
|
-
[
|
|
923
|
-
f"- 互动折算值:{metrics.get('interaction', 0)}。",
|
|
924
|
-
f"- 粗略互动率:{metrics.get('interaction_rate', 0.0):.4f}。",
|
|
925
|
-
]
|
|
926
|
-
)
|
|
927
|
-
return {
|
|
928
|
-
"modules": {
|
|
929
|
-
"选题": topic.get("lines", ["数据不足"]),
|
|
930
|
-
"文风": style.get("lines", ["数据不足"]),
|
|
931
|
-
"Hook": hook.get("lines", ["数据不足"]),
|
|
932
|
-
"结构": structure.get("lines", ["数据不足"]),
|
|
933
|
-
},
|
|
934
|
-
"insight": insight_lines or ["数据不足"],
|
|
434
|
+
"analysis_sections": analysis_sections,
|
|
935
435
|
}
|
|
936
436
|
|
|
937
437
|
|
|
@@ -940,19 +440,34 @@ def build_card_analysis_artifact(
|
|
|
940
440
|
payload: Dict[str, Any],
|
|
941
441
|
platform: str,
|
|
942
442
|
card_type: str,
|
|
443
|
+
analysis_mode: str = "auto",
|
|
444
|
+
storage_config: Optional[Dict[str, Any]] = None,
|
|
445
|
+
progress: Optional[ProgressReporter] = None,
|
|
943
446
|
) -> Dict[str, Any]:
|
|
944
447
|
fields = _extract_required_fields(payload, platform=platform)
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
448
|
+
if _has_meaningful_analysis_sections(payload.get("analysis_sections")):
|
|
449
|
+
existing = payload.get("analysis_sections")
|
|
450
|
+
meta = existing.get("meta") if isinstance(existing, dict) and isinstance(existing.get("meta"), dict) else {}
|
|
451
|
+
analysis_sections = ensure_analysis_sections_schema(
|
|
452
|
+
existing,
|
|
453
|
+
provider=normalize_text(existing.get("provider")) or "local",
|
|
454
|
+
llm_used=bool(meta.get("llm_used")),
|
|
455
|
+
degraded=bool(meta.get("degraded")),
|
|
456
|
+
reason=normalize_text(meta.get("reason")),
|
|
457
|
+
duration_ms=_safe_int(meta.get("duration_ms"), default=0),
|
|
458
|
+
)
|
|
950
459
|
else:
|
|
951
|
-
analysis_sections = build_analysis_sections(
|
|
460
|
+
analysis_sections = build_analysis_sections(
|
|
461
|
+
fields,
|
|
462
|
+
analysis_mode=analysis_mode,
|
|
463
|
+
analysis_config=storage_config.get("analysis") if isinstance(storage_config, dict) else None,
|
|
464
|
+
progress=progress,
|
|
465
|
+
)
|
|
952
466
|
fields["analysis_sections"] = analysis_sections
|
|
953
467
|
return {
|
|
954
468
|
"fields": fields,
|
|
955
469
|
"analysis_sections": analysis_sections,
|
|
470
|
+
"card_type": normalize_card_type(card_type),
|
|
956
471
|
}
|
|
957
472
|
|
|
958
473
|
|
|
@@ -963,12 +478,10 @@ def _build_output_path(
|
|
|
963
478
|
card_type: str,
|
|
964
479
|
payload: Dict[str, Any],
|
|
965
480
|
now: dt.datetime,
|
|
966
|
-
sample_author: Optional[str],
|
|
967
481
|
storage_config: Optional[Dict[str, Any]],
|
|
968
482
|
) -> Dict[str, str]:
|
|
969
|
-
author_slug = _pick_author_slug(payload
|
|
483
|
+
author_slug = _pick_author_slug(payload)
|
|
970
484
|
title_slug = _pick_title_slug(payload)
|
|
971
|
-
|
|
972
485
|
path, route_parts = build_card_output_path(
|
|
973
486
|
card_root=card_root,
|
|
974
487
|
platform=platform,
|
|
@@ -983,170 +496,9 @@ def _build_output_path(
|
|
|
983
496
|
return {
|
|
984
497
|
"path": path,
|
|
985
498
|
"route_parts": route_parts,
|
|
986
|
-
"author_slug": author_slug,
|
|
987
|
-
"title_slug": title_slug,
|
|
988
|
-
"target_type": card_type,
|
|
989
499
|
}
|
|
990
500
|
|
|
991
501
|
|
|
992
|
-
def _render_author_markdown(
|
|
993
|
-
*,
|
|
994
|
-
card_id: str,
|
|
995
|
-
card_type: str,
|
|
996
|
-
fields: Dict[str, Any],
|
|
997
|
-
generated_at: str,
|
|
998
|
-
) -> str:
|
|
999
|
-
analysis_output = fields.get("analysis_output") if isinstance(fields.get("analysis_output"), dict) else {}
|
|
1000
|
-
author_analysis_v2 = fields.get("author_analysis_v2") if isinstance(fields.get("author_analysis_v2"), dict) else analysis_output.get("author_analysis_v2", {})
|
|
1001
|
-
if not isinstance(author_analysis_v2, dict):
|
|
1002
|
-
author_analysis_v2 = {}
|
|
1003
|
-
sampled_work_explanations = fields.get("sampled_work_explanations") if isinstance(fields.get("sampled_work_explanations"), dict) else analysis_output.get("sampled_work_explanations", {})
|
|
1004
|
-
if not isinstance(sampled_work_explanations, dict):
|
|
1005
|
-
sampled_work_explanations = {}
|
|
1006
|
-
author_card_highlights = fields.get("author_card_highlights") if isinstance(fields.get("author_card_highlights"), dict) else {}
|
|
1007
|
-
if not isinstance(author_card_highlights, dict):
|
|
1008
|
-
author_card_highlights = {}
|
|
1009
|
-
validation = fields.get("validation") if isinstance(fields.get("validation"), dict) else analysis_output.get("validation", {})
|
|
1010
|
-
if not isinstance(validation, dict):
|
|
1011
|
-
validation = {}
|
|
1012
|
-
|
|
1013
|
-
business_score = _safe_int(fields.get("business_score"), default=_safe_int(analysis_output.get("business_score"), default=0))
|
|
1014
|
-
benchmark_gap_score = _safe_int(fields.get("benchmark_gap_score"), default=_safe_int(analysis_output.get("benchmark_gap_score"), default=0))
|
|
1015
|
-
style_radar = fields.get("style_radar") if isinstance(fields.get("style_radar"), dict) else analysis_output.get("style_radar", {})
|
|
1016
|
-
if not isinstance(style_radar, dict):
|
|
1017
|
-
style_radar = {}
|
|
1018
|
-
|
|
1019
|
-
core_contradictions = fields.get("core_contradictions") if isinstance(fields.get("core_contradictions"), list) else analysis_output.get("core_contradictions", [])
|
|
1020
|
-
if not isinstance(core_contradictions, list):
|
|
1021
|
-
core_contradictions = []
|
|
1022
|
-
|
|
1023
|
-
recommendations = fields.get("recommendations") if isinstance(fields.get("recommendations"), list) else analysis_output.get("recommendations", [])
|
|
1024
|
-
if not isinstance(recommendations, list):
|
|
1025
|
-
recommendations = []
|
|
1026
|
-
|
|
1027
|
-
business_analysis = normalize_text(fields.get("business_analysis")) or normalize_text(analysis_output.get("business_analysis"))
|
|
1028
|
-
benchmark_analysis = normalize_text(fields.get("benchmark_analysis")) or normalize_text(analysis_output.get("benchmark_analysis"))
|
|
1029
|
-
author_portrait = normalize_text(author_card_highlights.get("one_liner")) or normalize_text(fields.get("summary")) or normalize_text(analysis_output.get("author_portrait"))
|
|
1030
|
-
|
|
1031
|
-
fm = {
|
|
1032
|
-
"card_id": card_id,
|
|
1033
|
-
"card_type": card_type,
|
|
1034
|
-
"platform": fields.get("platform"),
|
|
1035
|
-
"generated_at": generated_at,
|
|
1036
|
-
"updated_at": generated_at,
|
|
1037
|
-
"title": fields.get("title"),
|
|
1038
|
-
"platform_work_id": fields.get("platform_work_id"),
|
|
1039
|
-
"author": fields.get("author"),
|
|
1040
|
-
"author_handle": fields.get("author_handle"),
|
|
1041
|
-
"platform_author_id": fields.get("platform_author_id"),
|
|
1042
|
-
"nickname": fields.get("nickname"),
|
|
1043
|
-
"ip_location": fields.get("ip_location"),
|
|
1044
|
-
"avatar_url": fields.get("avatar_url"),
|
|
1045
|
-
"signature": fields.get("signature"),
|
|
1046
|
-
"fans_count": fields.get("fans_count"),
|
|
1047
|
-
"liked_count": fields.get("liked_count"),
|
|
1048
|
-
"collected_count": fields.get("collected_count"),
|
|
1049
|
-
"works_count": fields.get("works_count"),
|
|
1050
|
-
"verified": fields.get("verified"),
|
|
1051
|
-
"snapshot_at": fields.get("snapshot_at"),
|
|
1052
|
-
"business_score": business_score,
|
|
1053
|
-
"benchmark_gap_score": benchmark_gap_score,
|
|
1054
|
-
"request_id": fields.get("request_id"),
|
|
1055
|
-
}
|
|
1056
|
-
|
|
1057
|
-
frontmatter = ["---"]
|
|
1058
|
-
for key, value in fm.items():
|
|
1059
|
-
frontmatter.append(f"{key}: {json.dumps(value, ensure_ascii=False)}")
|
|
1060
|
-
frontmatter.append("---")
|
|
1061
|
-
|
|
1062
|
-
lines = [
|
|
1063
|
-
*frontmatter,
|
|
1064
|
-
"",
|
|
1065
|
-
"## 基础事实",
|
|
1066
|
-
f"- 平台:{fields.get('platform') or '未知'}",
|
|
1067
|
-
f"- 作者ID:{fields.get('platform_author_id') or '未知'}",
|
|
1068
|
-
f"- 账号标识:{fields.get('author_handle') or 'N/A'}",
|
|
1069
|
-
f"- 昵称:{fields.get('nickname') or fields.get('author') or '未知'}",
|
|
1070
|
-
f"- IP属地:{fields.get('ip_location') or 'N/A'}",
|
|
1071
|
-
f"- 签名:{fields.get('signature') or 'N/A'}",
|
|
1072
|
-
f"- 头像:{fields.get('avatar_url') or 'N/A'}",
|
|
1073
|
-
f"- 粉丝数:{_display_metric(fields.get('fans_count'))}",
|
|
1074
|
-
f"- 累计获赞:{_display_metric(fields.get('liked_count'))}",
|
|
1075
|
-
f"- 累计收藏:{_display_metric(fields.get('collected_count'))}",
|
|
1076
|
-
f"- 作品数:{_display_metric(fields.get('works_count'))}",
|
|
1077
|
-
f"- 认证状态:{'是' if fields.get('verified') else '否'}" if fields.get('verified') is not None else "- 认证状态:N/A",
|
|
1078
|
-
f"- 抓取时间:{fields.get('snapshot_at') or 'N/A'}",
|
|
1079
|
-
"",
|
|
1080
|
-
"## 作者画像",
|
|
1081
|
-
author_portrait or "数据不足",
|
|
1082
|
-
"",
|
|
1083
|
-
"## 主页摘要卡",
|
|
1084
|
-
f"- 核心价值:{normalize_text(author_card_highlights.get('core_value_proposition')) or '数据不足'}",
|
|
1085
|
-
f"- 主要信任源:{normalize_text(author_card_highlights.get('primary_trust_source')) or '数据不足'}",
|
|
1086
|
-
f"- 胜率结构:{('、'.join([normalize_text(x) for x in author_card_highlights.get('winning_content_structures', []) if normalize_text(x)])) or '数据不足'}",
|
|
1087
|
-
f"- 可能产品:{('、'.join([normalize_text(x) for x in author_card_highlights.get('likely_products', []) if normalize_text(x)])) or '证据不足'}",
|
|
1088
|
-
f"- 最大张力:{normalize_text(author_card_highlights.get('most_important_tension')) or '数据不足'}",
|
|
1089
|
-
f"- 只学一件事:{normalize_text(author_card_highlights.get('if_only_learn_one_thing')) or '数据不足'}",
|
|
1090
|
-
"",
|
|
1091
|
-
"## 商业分析",
|
|
1092
|
-
business_analysis or "数据不足",
|
|
1093
|
-
"",
|
|
1094
|
-
"## 对标分析",
|
|
1095
|
-
benchmark_analysis or "数据不足",
|
|
1096
|
-
"",
|
|
1097
|
-
"## 评分",
|
|
1098
|
-
f"- business_score: {business_score}",
|
|
1099
|
-
f"- benchmark_gap_score: {benchmark_gap_score}",
|
|
1100
|
-
"",
|
|
1101
|
-
"## 风格雷达",
|
|
1102
|
-
"```json",
|
|
1103
|
-
json.dumps(style_radar, ensure_ascii=False, indent=2),
|
|
1104
|
-
"```",
|
|
1105
|
-
"",
|
|
1106
|
-
"## 核心矛盾",
|
|
1107
|
-
]
|
|
1108
|
-
|
|
1109
|
-
if core_contradictions:
|
|
1110
|
-
lines.extend([f"- {normalize_text(item)}" for item in core_contradictions if normalize_text(item)])
|
|
1111
|
-
else:
|
|
1112
|
-
lines.append("- 数据不足")
|
|
1113
|
-
|
|
1114
|
-
lines.extend(["", "## 建议动作"])
|
|
1115
|
-
if recommendations:
|
|
1116
|
-
lines.extend([f"- {normalize_text(item)}" for item in recommendations if normalize_text(item)])
|
|
1117
|
-
else:
|
|
1118
|
-
lines.append("- 数据不足")
|
|
1119
|
-
|
|
1120
|
-
lines.extend(
|
|
1121
|
-
[
|
|
1122
|
-
"",
|
|
1123
|
-
"## author_analysis_v2",
|
|
1124
|
-
"```json",
|
|
1125
|
-
json.dumps(author_analysis_v2, ensure_ascii=False, indent=2),
|
|
1126
|
-
"```",
|
|
1127
|
-
"",
|
|
1128
|
-
"## sampled_work_explanations",
|
|
1129
|
-
"```json",
|
|
1130
|
-
json.dumps(sampled_work_explanations, ensure_ascii=False, indent=2),
|
|
1131
|
-
"```",
|
|
1132
|
-
"",
|
|
1133
|
-
"## 校验",
|
|
1134
|
-
f"- validation_ok: {bool(validation.get('ok'))}",
|
|
1135
|
-
f"- validation_error_count: {len(validation.get('errors') or [])}",
|
|
1136
|
-
"",
|
|
1137
|
-
"## 附录",
|
|
1138
|
-
f"- confidence: {fields.get('confidence')}",
|
|
1139
|
-
f"- error_reason: {fields.get('error_reason')}",
|
|
1140
|
-
"",
|
|
1141
|
-
"```json",
|
|
1142
|
-
json.dumps(fields.get("extract_trace", []), ensure_ascii=False, indent=2),
|
|
1143
|
-
"```",
|
|
1144
|
-
"",
|
|
1145
|
-
]
|
|
1146
|
-
)
|
|
1147
|
-
return "\n".join(lines)
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
502
|
def _render_markdown(
|
|
1151
503
|
*,
|
|
1152
504
|
card_id: str,
|
|
@@ -1154,24 +506,9 @@ def _render_markdown(
|
|
|
1154
506
|
fields: Dict[str, Any],
|
|
1155
507
|
generated_at: str,
|
|
1156
508
|
) -> str:
|
|
1157
|
-
if card_type == "author":
|
|
1158
|
-
return _render_author_markdown(
|
|
1159
|
-
card_id=card_id,
|
|
1160
|
-
card_type=card_type,
|
|
1161
|
-
fields=fields,
|
|
1162
|
-
generated_at=generated_at,
|
|
1163
|
-
)
|
|
1164
509
|
author_name = fields.get("author") or fields.get("author_handle") or fields.get("platform_author_id") or "未知作者"
|
|
1165
510
|
title = fields.get("title") or "(标题缺失)"
|
|
1166
|
-
|
|
1167
|
-
f"赞 {_display_metric(fields.get('digg_count'))} / 评 {_display_metric(fields.get('comment_count'))} / "
|
|
1168
|
-
f"藏 {_display_metric(fields.get('collect_count'))} / 转 {_display_metric(fields.get('share_count'))} / 播 {_display_metric(fields.get('play_count'))}"
|
|
1169
|
-
)
|
|
1170
|
-
precomputed_sections = fields.get("analysis_sections") if isinstance(fields.get("analysis_sections"), dict) else {}
|
|
1171
|
-
if precomputed_sections:
|
|
1172
|
-
analysis_sections = precomputed_sections
|
|
1173
|
-
else:
|
|
1174
|
-
analysis_sections = {} if card_type == "author_sample_work" else build_analysis_sections(fields)
|
|
511
|
+
analysis_sections = ensure_analysis_sections_schema(fields.get("analysis_sections"), provider="local", llm_used=False)
|
|
1175
512
|
creative_modules = analysis_sections.get("modules", {})
|
|
1176
513
|
insight_lines = analysis_sections.get("insight", ["数据不足"])
|
|
1177
514
|
extract_trace_json = json.dumps(fields.get("extract_trace", []), ensure_ascii=False, indent=2)
|
|
@@ -1188,7 +525,6 @@ def _render_markdown(
|
|
|
1188
525
|
"author_handle": fields.get("author_handle"),
|
|
1189
526
|
"platform_author_id": fields.get("platform_author_id"),
|
|
1190
527
|
"caption_raw": fields.get("caption_raw"),
|
|
1191
|
-
"primary_text": fields.get("primary_text"),
|
|
1192
528
|
"share_url": fields.get("share_url"),
|
|
1193
529
|
"source_url": fields.get("source_url"),
|
|
1194
530
|
"cover_image": fields.get("cover_image"),
|
|
@@ -1212,16 +548,6 @@ def _render_markdown(
|
|
|
1212
548
|
lines = [
|
|
1213
549
|
*frontmatter,
|
|
1214
550
|
"",
|
|
1215
|
-
"## 基础信息",
|
|
1216
|
-
f"- 作者:{author_name}",
|
|
1217
|
-
f"- 标题:{title}",
|
|
1218
|
-
f"- 原始文案:{fields.get('caption_raw') or 'N/A'}",
|
|
1219
|
-
f"- 作品模态:{fields.get('work_modality') or '未知'}",
|
|
1220
|
-
f"- 发布时间:{fields.get('published_date') or 'N/A'}",
|
|
1221
|
-
f"- {'视频时长' if fields.get('work_modality') == 'video' else '阅读载体'}:{_format_duration(fields.get('duration_ms', 0)) if fields.get('work_modality') == 'video' else '文本'}",
|
|
1222
|
-
f"- 互动:{metrics_line}",
|
|
1223
|
-
f"- 链接:{fields.get('share_url') or '(未提供)'}",
|
|
1224
|
-
f"- 下载链接:{fields.get('video_download_url') or 'N/A'}" if fields.get("work_modality") == "video" else "- 下载链接:N/A",
|
|
1225
551
|
]
|
|
1226
552
|
|
|
1227
553
|
for heading in DEFAULT_MODULE_SECTIONS:
|
|
@@ -1235,24 +561,15 @@ def _render_markdown(
|
|
|
1235
561
|
for item in insight_lines:
|
|
1236
562
|
lines.append(item)
|
|
1237
563
|
|
|
1238
|
-
transcript_heading = "## 主文本"
|
|
1239
|
-
transcript_body = fields.get("primary_text")
|
|
1240
|
-
transcript_fallback = "(无可用主文本)"
|
|
1241
|
-
|
|
1242
564
|
lines.extend(
|
|
1243
565
|
[
|
|
1244
566
|
"",
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
]
|
|
1248
|
-
)
|
|
1249
|
-
|
|
1250
|
-
lines.extend(
|
|
1251
|
-
[
|
|
567
|
+
"## 主文本",
|
|
568
|
+
fields.get("primary_text") or "(无可用主文本)",
|
|
1252
569
|
"",
|
|
1253
570
|
"## 附录",
|
|
1254
571
|
"### ASR_RAW",
|
|
1255
|
-
fields.get("
|
|
572
|
+
fields.get("asr_raw") or "(无可用 ASR 原文)",
|
|
1256
573
|
"",
|
|
1257
574
|
"### trace",
|
|
1258
575
|
f"- request_id: {fields.get('request_id')}",
|
|
@@ -1282,30 +599,39 @@ def _resolve_card_root(card_root: Optional[str]) -> str:
|
|
|
1282
599
|
raw = (card_root or "").strip()
|
|
1283
600
|
if not raw:
|
|
1284
601
|
return resolve_default_card_root()
|
|
1285
|
-
|
|
1286
602
|
candidate = Path(raw).expanduser()
|
|
1287
603
|
if not candidate.is_absolute():
|
|
1288
604
|
raise ValueError("card_root must be an absolute path")
|
|
1289
605
|
return str(candidate.resolve())
|
|
1290
606
|
|
|
1291
607
|
|
|
608
|
+
def _read_payload_from_input(input_json: str) -> Dict[str, Any]:
|
|
609
|
+
if input_json == "-":
|
|
610
|
+
raw = os.read(0, 1024 * 1024).decode("utf-8", errors="replace").strip()
|
|
611
|
+
if not raw:
|
|
612
|
+
return {}
|
|
613
|
+
return json.loads(raw)
|
|
614
|
+
return read_json_file(input_json)
|
|
615
|
+
|
|
616
|
+
|
|
1292
617
|
def write_benchmark_card(
|
|
1293
618
|
*,
|
|
1294
619
|
payload: Dict[str, Any],
|
|
1295
620
|
platform: str,
|
|
1296
621
|
card_type: str,
|
|
1297
622
|
card_root: Optional[str],
|
|
1298
|
-
sample_author: Optional[str] = None,
|
|
1299
623
|
content_kind: Optional[str] = None,
|
|
1300
624
|
storage_config: Optional[Dict[str, Any]] = None,
|
|
1301
625
|
force_card_type: bool = False,
|
|
626
|
+
analysis_mode: str = "auto",
|
|
627
|
+
progress: Optional[ProgressReporter] = None,
|
|
1302
628
|
) -> Dict[str, Any]:
|
|
629
|
+
started_at = time.perf_counter()
|
|
1303
630
|
now = dt.datetime.now()
|
|
1304
631
|
generated_at = now.isoformat(timespec="seconds")
|
|
1305
632
|
|
|
1306
633
|
payload_content_kind = normalize_text(payload.get("content_kind"))
|
|
1307
634
|
resolved_content_kind = normalize_text(content_kind) or payload_content_kind
|
|
1308
|
-
|
|
1309
635
|
normalized_card_type = normalize_card_type(card_type)
|
|
1310
636
|
effective_card_type = resolve_effective_card_type(
|
|
1311
637
|
card_type=normalized_card_type,
|
|
@@ -1313,29 +639,61 @@ def write_benchmark_card(
|
|
|
1313
639
|
storage_config=storage_config,
|
|
1314
640
|
force_card_type=force_card_type,
|
|
1315
641
|
)
|
|
642
|
+
if effective_card_type != "work":
|
|
643
|
+
effective_card_type = "work"
|
|
644
|
+
|
|
1316
645
|
fields = _extract_required_fields(payload, platform=platform)
|
|
1317
|
-
|
|
646
|
+
if progress is not None:
|
|
647
|
+
progress.progress(stage="card.analysis", message="building card analysis")
|
|
648
|
+
if _has_meaningful_analysis_sections(payload.get("analysis_sections")):
|
|
649
|
+
existing = payload.get("analysis_sections")
|
|
650
|
+
meta = existing.get("meta") if isinstance(existing, dict) and isinstance(existing.get("meta"), dict) else {}
|
|
651
|
+
analysis_sections = ensure_analysis_sections_schema(
|
|
652
|
+
existing,
|
|
653
|
+
provider=normalize_text(existing.get("provider")) or "local",
|
|
654
|
+
llm_used=bool(meta.get("llm_used")),
|
|
655
|
+
degraded=bool(meta.get("degraded")),
|
|
656
|
+
reason=normalize_text(meta.get("reason")),
|
|
657
|
+
duration_ms=_safe_int(meta.get("duration_ms"), default=0),
|
|
658
|
+
)
|
|
659
|
+
else:
|
|
660
|
+
analysis_sections = build_analysis_sections(
|
|
661
|
+
fields,
|
|
662
|
+
analysis_mode=analysis_mode,
|
|
663
|
+
analysis_config=storage_config.get("analysis") if isinstance(storage_config, dict) else None,
|
|
664
|
+
progress=progress.child(scope="card.analysis") if progress is not None else None,
|
|
665
|
+
)
|
|
666
|
+
fields["analysis_sections"] = analysis_sections
|
|
1318
667
|
|
|
668
|
+
payload["analysis_sections"] = analysis_sections
|
|
669
|
+
payload["asr_raw"] = fields.get("asr_raw")
|
|
670
|
+
payload["asr_clean"] = fields.get("asr_clean")
|
|
671
|
+
payload["primary_text"] = fields.get("primary_text")
|
|
672
|
+
payload["primary_text_source"] = fields.get("primary_text_source")
|
|
673
|
+
deep_analysis = _analysis_status_from_sections(analysis_sections)
|
|
674
|
+
payload["deep_analysis"] = deep_analysis
|
|
675
|
+
|
|
676
|
+
resolved_card_root = _resolve_card_root(card_root)
|
|
1319
677
|
primary_target = _build_output_path(
|
|
1320
678
|
card_root=resolved_card_root,
|
|
1321
679
|
platform=platform,
|
|
1322
680
|
card_type=effective_card_type,
|
|
1323
681
|
payload=payload,
|
|
1324
682
|
now=now,
|
|
1325
|
-
sample_author=sample_author,
|
|
1326
683
|
storage_config=storage_config,
|
|
1327
684
|
)
|
|
1328
685
|
primary_path = primary_target["path"]
|
|
1329
|
-
|
|
1330
686
|
primary_card_id = os.path.basename(primary_path).replace(".md", "")
|
|
1331
|
-
|
|
687
|
+
|
|
688
|
+
markdown = _render_markdown(
|
|
1332
689
|
card_id=primary_card_id,
|
|
1333
690
|
card_type=effective_card_type,
|
|
1334
691
|
fields=fields,
|
|
1335
692
|
generated_at=generated_at,
|
|
1336
693
|
)
|
|
1337
|
-
_write_file(primary_path,
|
|
694
|
+
_write_file(primary_path, markdown)
|
|
1338
695
|
|
|
696
|
+
duration_ms = int((time.perf_counter() - started_at) * 1000)
|
|
1339
697
|
return {
|
|
1340
698
|
"ok": True,
|
|
1341
699
|
"platform": platform,
|
|
@@ -1349,34 +707,25 @@ def write_benchmark_card(
|
|
|
1349
707
|
"storage_routes_configured": bool(isinstance(storage_config, dict) and isinstance(storage_config.get("storage_routes"), dict)),
|
|
1350
708
|
},
|
|
1351
709
|
"required_fields": fields,
|
|
710
|
+
"analysis_sections": analysis_sections,
|
|
711
|
+
"analysis_status": deep_analysis,
|
|
712
|
+
"duration_ms": duration_ms,
|
|
713
|
+
"llm_analysis_ms": _safe_int(analysis_sections.get("meta", {}).get("duration_ms"), default=0),
|
|
1352
714
|
}
|
|
1353
715
|
|
|
1354
716
|
|
|
1355
|
-
def _read_payload_from_input(input_json: str) -> Dict[str, Any]:
|
|
1356
|
-
if input_json == "-":
|
|
1357
|
-
raw = os.read(0, 1024 * 1024).decode("utf-8", errors="replace").strip()
|
|
1358
|
-
if not raw:
|
|
1359
|
-
return {}
|
|
1360
|
-
return json.loads(raw)
|
|
1361
|
-
return read_json_file(input_json)
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
717
|
def main() -> None:
|
|
1365
|
-
parser = argparse.ArgumentParser(description="Write benchmark card markdown to card root")
|
|
718
|
+
parser = argparse.ArgumentParser(description="Write single-work benchmark card markdown to card root")
|
|
1366
719
|
parser.add_argument("--platform", required=True, help="Platform name, e.g. douyin or xiaohongshu")
|
|
1367
720
|
parser.add_argument("--card-type", choices=CARD_TYPES, default="work", help="Primary card type")
|
|
721
|
+
parser.add_argument("--analysis-mode", choices=["auto", "local"], default="auto", help="Card analysis mode")
|
|
1368
722
|
parser.add_argument("--config", default=None, help="Runtime config YAML path")
|
|
1369
723
|
parser.add_argument("--env-file", default=None, help="Shared env file path; defaults to <skills_root>/.env")
|
|
1370
724
|
parser.add_argument("--allow-process-env", action="store_true", help="Allow process env to override .env/.env.local")
|
|
1371
|
-
parser.add_argument("--
|
|
1372
|
-
parser.add_argument("--content-kind", default=None, help="Optional workflow kind, e.g. single_video/author_home/author_analysis")
|
|
725
|
+
parser.add_argument("--content-kind", default=None, help="Optional workflow kind, e.g. single_video/note/work")
|
|
1373
726
|
parser.add_argument("--force-card-type", action="store_true", help="Force manual --card-type to override content_kind mapping")
|
|
1374
727
|
parser.add_argument("--card-root", default=None, help="Card root path (absolute); falls back to TIKOMNI_CARD_ROOT when omitted")
|
|
1375
|
-
parser.add_argument(
|
|
1376
|
-
"--input-json",
|
|
1377
|
-
default="-",
|
|
1378
|
-
help="Input JSON path or '-' to read from stdin",
|
|
1379
|
-
)
|
|
728
|
+
parser.add_argument("--input-json", default="-", help="Input JSON path or '-' to read from stdin")
|
|
1380
729
|
args = parser.parse_args()
|
|
1381
730
|
|
|
1382
731
|
config, _ = load_tikomni_config(
|
|
@@ -1390,10 +739,11 @@ def main() -> None:
|
|
|
1390
739
|
platform=args.platform,
|
|
1391
740
|
card_type=args.card_type,
|
|
1392
741
|
card_root=args.card_root,
|
|
1393
|
-
sample_author=args.sample_author,
|
|
1394
742
|
content_kind=args.content_kind,
|
|
1395
743
|
storage_config=config,
|
|
1396
744
|
force_card_type=args.force_card_type,
|
|
745
|
+
analysis_mode=args.analysis_mode,
|
|
746
|
+
progress=None,
|
|
1397
747
|
)
|
|
1398
748
|
write_json_stdout(result)
|
|
1399
749
|
|