@tikomni/skills 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. package/package.json +4 -2
  2. package/skills/single-work-analysis/env.example +3 -3
  3. package/skills/single-work-analysis/references/config-templates/defaults.yaml +8 -19
  4. package/skills/single-work-analysis/references/prompt-contracts/{insight.md → analysis-bundle.md} +43 -8
  5. package/skills/single-work-analysis/scripts/core/analysis_adapter.py +384 -0
  6. package/skills/single-work-analysis/scripts/core/analysis_pipeline.py +399 -76
  7. package/skills/single-work-analysis/scripts/core/config_loader.py +18 -42
  8. package/skills/single-work-analysis/scripts/core/progress_report.py +163 -16
  9. package/skills/single-work-analysis/scripts/core/storage_router.py +24 -57
  10. package/skills/single-work-analysis/scripts/core/tikomni_common.py +13 -3
  11. package/skills/single-work-analysis/scripts/pipeline/asr/asr_pipeline.py +154 -7
  12. package/skills/single-work-analysis/scripts/pipeline/asr/poll_u2_task.py +3 -1
  13. package/skills/single-work-analysis/scripts/platform/douyin/run_douyin_single_video.py +243 -44
  14. package/skills/single-work-analysis/scripts/platform/xiaohongshu/run_xiaohongshu_extract.py +263 -25
  15. package/skills/single-work-analysis/scripts/writers/write_benchmark_card.py +244 -894
  16. package/skills/single-work-analysis/references/prompt-contracts/asr-clean.md +0 -28
  17. package/skills/single-work-analysis/references/prompt-contracts/cta.md +0 -24
  18. package/skills/single-work-analysis/references/prompt-contracts/hook.md +0 -25
  19. package/skills/single-work-analysis/references/prompt-contracts/structure.md +0 -25
  20. package/skills/single-work-analysis/references/prompt-contracts/style.md +0 -27
  21. package/skills/single-work-analysis/references/prompt-contracts/summary.md +0 -29
  22. package/skills/single-work-analysis/references/prompt-contracts/topic.md +0 -29
@@ -10,33 +10,42 @@ if __package__ in {None, ""}:
10
10
  sys.path.insert(0, str(_parent))
11
11
  break
12
12
 
13
- """Write benchmark markdown cards into card root zones."""
13
+ """Write single-work benchmark markdown cards into card root zones."""
14
14
 
15
15
  import argparse
16
16
  import datetime as dt
17
17
  import json
18
18
  import os
19
19
  import re
20
+ import time
20
21
  import unicodedata
21
22
  from pathlib import Path
22
23
  from typing import Any, Dict, List, Optional
23
24
 
24
25
  try:
25
26
  from zoneinfo import ZoneInfo
26
- except Exception: # pragma: no cover - py<3.9 fallback
27
+ except Exception: # pragma: no cover
27
28
  ZoneInfo = None
28
29
 
29
- from scripts.core.analysis_pipeline import DEFAULT_MODULE_SECTIONS, build_analysis_sections
30
+ from scripts.core.analysis_pipeline import (
31
+ DEFAULT_MODULE_SECTIONS,
32
+ build_analysis_sections,
33
+ ensure_analysis_sections_schema,
34
+ )
30
35
  from scripts.core.config_loader import load_tikomni_config
36
+ from scripts.core.progress_report import ProgressReporter
31
37
  from scripts.core.storage_router import build_card_output_path, normalize_card_type, resolve_effective_card_type
32
38
  from scripts.core.tikomni_common import normalize_text, read_json_file, write_json_stdout
39
+ from scripts.pipeline.asr.asr_pipeline import derive_asr_clean_text
40
+
41
+
42
+ CARD_TYPES = ["work"]
43
+
33
44
 
34
45
  def resolve_default_card_root() -> str:
35
46
  raw = os.getenv("TIKOMNI_CARD_ROOT", "").strip()
36
47
  if not raw:
37
- raise ValueError(
38
- "missing_card_root: set --card-root or define TIKOMNI_CARD_ROOT in .env/.env.local"
39
- )
48
+ raise ValueError("missing_card_root: set --card-root or define TIKOMNI_CARD_ROOT in .env/.env.local")
40
49
 
41
50
  candidate = Path(raw).expanduser()
42
51
  if not candidate.is_absolute():
@@ -44,18 +53,7 @@ def resolve_default_card_root() -> str:
44
53
  return str(candidate.resolve())
45
54
 
46
55
 
47
- # Keep import-time compatibility for other scripts without crashing when env is absent.
48
56
  DEFAULT_CARD_ROOT = ""
49
- CARD_TYPES = ["work", "author", "author_sample_work"]
50
-
51
-
52
- def _normalize_lines(value: Any) -> List[str]:
53
- if isinstance(value, list):
54
- return [normalize_text(item) for item in value if normalize_text(item)]
55
- if isinstance(value, str):
56
- text = normalize_text(value)
57
- return [text] if text else []
58
- return []
59
57
 
60
58
 
61
59
  def _safe_int(value: Any, default: int = 0) -> int:
@@ -67,11 +65,13 @@ def _safe_int(value: Any, default: int = 0) -> int:
67
65
  return value
68
66
  if isinstance(value, float):
69
67
  return int(value)
70
- if isinstance(value, str):
71
- text = value.strip()
72
- if text.isdigit() or (text.startswith("-") and text[1:].isdigit()):
73
- return int(text)
74
- return default
68
+ text = str(value).strip()
69
+ if not text:
70
+ return default
71
+ try:
72
+ return int(float(text))
73
+ except Exception:
74
+ return default
75
75
 
76
76
 
77
77
  def _safe_optional_int(value: Any) -> Optional[int]:
@@ -83,18 +83,38 @@ def _safe_optional_int(value: Any) -> Optional[int]:
83
83
  return value
84
84
  if isinstance(value, float):
85
85
  return int(value)
86
- if isinstance(value, str):
87
- text = value.strip()
88
- if not text:
89
- return None
90
- if text.isdigit() or (text.startswith("-") and text[1:].isdigit()):
91
- return int(text)
92
- return None
86
+ text = str(value).strip()
87
+ if not text:
88
+ return None
89
+ try:
90
+ return int(float(text))
91
+ except Exception:
92
+ return None
93
+
94
+
95
+ def _display_metric(value: Optional[int]) -> str:
96
+ return "N/A" if value is None else str(value)
97
+
98
+
99
+ def _source_dict(payload: Dict[str, Any]) -> Dict[str, Any]:
100
+ source = payload.get("source")
101
+ return source if isinstance(source, dict) else {}
102
+
103
+
104
+ def _pick_text(payload: Dict[str, Any], keys: List[str], source_keys: Optional[List[str]] = None) -> str:
105
+ source = _source_dict(payload)
106
+ for key in keys:
107
+ text = normalize_text(payload.get(key))
108
+ if text:
109
+ return text
110
+ for key in (source_keys or keys):
111
+ text = normalize_text(source.get(key))
112
+ if text:
113
+ return text
114
+ return ""
93
115
 
94
116
 
95
117
  def _to_unix_sec(value: Any) -> int:
96
- if value is None:
97
- return 0
98
118
  parsed = _safe_int(value, default=0)
99
119
  if parsed <= 0:
100
120
  return 0
@@ -117,168 +137,47 @@ def _format_shanghai_datetime(value: Any) -> str:
117
137
  return ""
118
138
 
119
139
 
120
- def _resolve_publish_time(payload: Dict[str, Any], create_time_sec: int) -> Dict[str, str]:
121
- publish_time_text = normalize_text(payload.get("publish_time_text"))
122
- if publish_time_text:
123
- return {"publish_time_text": publish_time_text, "publish_time_source": "payload.publish_time_text"}
124
-
125
- source = _source_dict(payload)
126
- candidates = [
127
- ("payload.publish_time", payload.get("publish_time")),
128
- ("payload.create_time", payload.get("create_time")),
129
- ("source.publish_time", source.get("publish_time")),
130
- ("source.create_time", source.get("create_time")),
131
- ("source.time", source.get("time")),
132
- ]
133
- for source_key, raw in candidates:
134
- text = normalize_text(raw)
135
- if not text:
136
- continue
137
- ts_text = _format_shanghai_datetime(raw)
138
- if ts_text:
139
- return {"publish_time_text": ts_text, "publish_time_source": source_key}
140
- return {"publish_time_text": text, "publish_time_source": source_key}
141
-
142
- fallback_text = _format_shanghai_datetime(create_time_sec)
143
- if fallback_text:
144
- return {"publish_time_text": fallback_text, "publish_time_source": "create_time_sec"}
145
-
146
- return {"publish_time_text": "未知", "publish_time_source": "unknown"}
147
-
148
-
149
140
  def _resolve_published_date(payload: Dict[str, Any], create_time_sec: int) -> str:
150
141
  published_date = normalize_text(payload.get("published_date"))
151
142
  if published_date:
152
143
  return published_date
153
- publish_time_info = _resolve_publish_time(payload, create_time_sec)
154
- text = normalize_text(publish_time_info.get("publish_time_text"))
155
- if not text or text == "未知":
156
- return "N/A"
157
- return text[:10]
144
+ publish_time_text = normalize_text(payload.get("publish_time_text"))
145
+ if publish_time_text:
146
+ return publish_time_text[:10]
158
147
 
148
+ source = _source_dict(payload)
149
+ for key in ("publish_time", "create_time", "time"):
150
+ text = _format_shanghai_datetime(payload.get(key))
151
+ if text:
152
+ return text[:10]
153
+ text = _format_shanghai_datetime(source.get(key))
154
+ if text:
155
+ return text[:10]
159
156
 
160
- def _display_metric(value: Optional[int]) -> str:
161
- if value is None:
162
- return "N/A"
163
- return str(value)
164
-
165
-
166
- def _source_dict(payload: Dict[str, Any]) -> Dict[str, Any]:
167
- source = payload.get("source")
168
- return source if isinstance(source, dict) else {}
157
+ fallback = _format_shanghai_datetime(create_time_sec)
158
+ return fallback[:10] if fallback else "N/A"
169
159
 
170
160
 
171
161
  def _extract_duration_ms(payload: Dict[str, Any]) -> int:
172
162
  source = _source_dict(payload)
173
-
174
- def _pick_int(keys: List[str], from_source: bool = False) -> int:
175
- base = source if from_source else payload
176
- for key in keys:
163
+ for base in (payload, source):
164
+ for key in ("duration_ms", "duration", "duration_sec"):
177
165
  value = _safe_int(base.get(key), default=0)
178
166
  if value > 0:
179
- return value
180
- return 0
181
-
182
- duration_ms = _pick_int(["duration_ms"])
183
- if duration_ms <= 0:
184
- duration_ms = _pick_int(["duration_ms"], from_source=True)
185
-
186
- if duration_ms <= 0:
187
- raw_duration = _pick_int(["duration", "duration_sec"])
188
- if raw_duration <= 0:
189
- raw_duration = _pick_int(["duration", "duration_sec"], from_source=True)
190
- if raw_duration > 0:
191
- duration_ms = raw_duration * 1000 if raw_duration < 10000 else raw_duration
192
-
193
- return duration_ms
194
-
195
-
196
- def _ensure_sentence_end(text: str) -> str:
197
- if not text:
198
- return text
199
- if text[-1] in "。!?!?" or text.endswith("..."):
200
- return text
201
- return f"{text}。"
202
-
203
-
204
- def _clean_asr_text(raw: str, provided_clean: str) -> str:
205
- """ASR_CLEAN prompt-contracts/asr-clean.md@v1
206
-
207
- Steps:
208
- 1) base select: provided_clean > raw
209
- 2) denoise: remove filler/repetition/whitespace noise
210
- 3) sentence split + punctuation restore
211
- 4) paragraphize: one sentence per line, 2-4 sentences per paragraph
212
- """
213
- base = normalize_text(provided_clean) or normalize_text(raw)
214
- if not base:
215
- return ""
216
-
217
- # step2: 去噪(口头禅/重复)
218
- base = re.sub(r"\b(嗯|啊|呃|额|那个|这个|然后|就是)\b", " ", base)
219
- base = re.sub(r"(嗯+|啊+|呃+)", " ", base)
220
- base = re.sub(r"(就是就是|然后然后|这个这个|那个那个)", " ", base)
221
- base = re.sub(r"\s+", " ", base).strip()
222
-
223
- # step3: 断句 + 句尾标点
224
- units = [normalize_text(part) for part in re.split(r"[。!?!?;;\n]+", base)]
225
- sentences = [_ensure_sentence_end(unit) for unit in units if unit]
226
- if not sentences:
227
- fallback = _ensure_sentence_end(base)
228
- return fallback if fallback else ""
229
-
230
- # step4: 每句一行;每段 2~4 句(默认 3 句)
231
- paragraphs: List[str] = []
232
- bucket: List[str] = []
233
- for sentence in sentences:
234
- bucket.append(sentence)
235
- if len(bucket) >= 3:
236
- paragraphs.append("\n".join(bucket))
237
- bucket = []
238
-
239
- if bucket:
240
- if len(bucket) == 1 and paragraphs:
241
- paragraphs[-1] = f"{paragraphs[-1]}\n{bucket[0]}"
242
- else:
243
- paragraphs.append("\n".join(bucket))
244
-
245
- return "\n\n".join(paragraphs)
246
-
247
-
248
- def _pick_text(payload: Dict[str, Any], keys: List[str], source_keys: Optional[List[str]] = None) -> str:
249
- source = _source_dict(payload)
250
- for key in keys:
251
- text = normalize_text(payload.get(key))
252
- if text:
253
- return text
254
- for key in (source_keys or keys):
255
- text = normalize_text(source.get(key))
256
- if text:
257
- return text
258
- return ""
259
-
260
-
261
- def _extract_platform_work_id(payload: Dict[str, Any]) -> str:
262
- return _pick_text(
263
- payload,
264
- ["platform_work_id", "aweme_id", "note_id", "item_id", "id"],
265
- ["platform_work_id", "aweme_id", "note_id", "item_id", "id"],
266
- )
167
+ return value * 1000 if key != "duration_ms" and value < 10000 else value
168
+ return 0
267
169
 
268
170
 
269
171
  def _extract_author(payload: Dict[str, Any]) -> Dict[str, str]:
270
172
  author_raw = payload.get("author")
271
173
  author = author_raw if isinstance(author_raw, dict) else {}
272
-
273
174
  source = _source_dict(payload)
274
175
  source_author = source.get("author") if isinstance(source.get("author"), dict) else {}
275
176
 
276
- author_text = normalize_text(author_raw) if isinstance(author_raw, str) else ""
277
- nickname = (
278
- normalize_text(author.get("nickname"))
279
- or author_text
280
- or normalize_text(source_author.get("nickname"))
281
- )
177
+ nickname = normalize_text(author.get("nickname"))
178
+ if not nickname and isinstance(author_raw, str):
179
+ nickname = normalize_text(author_raw)
180
+ nickname = nickname or normalize_text(source_author.get("nickname"))
282
181
 
283
182
  author_handle = (
284
183
  normalize_text(payload.get("author_handle"))
@@ -294,37 +193,10 @@ def _extract_author(payload: Dict[str, Any]) -> Dict[str, str]:
294
193
  or normalize_text(source_author.get("platform_author_id"))
295
194
  or normalize_text(source_author.get("author_platform_id"))
296
195
  )
297
-
298
- xhs_user_id = (
299
- normalize_text(payload.get("xhs_user_id"))
300
- or normalize_text(author.get("xhs_user_id"))
301
- or normalize_text(source_author.get("xhs_user_id"))
302
- )
303
- xhs_sec_token = (
304
- normalize_text(payload.get("xhs_sec_token"))
305
- or normalize_text(author.get("xhs_sec_token"))
306
- or normalize_text(source_author.get("xhs_sec_token"))
307
- )
308
-
309
- douyin_sec_uid = (
310
- normalize_text(payload.get("douyin_sec_uid"))
311
- or normalize_text(author.get("douyin_sec_uid"))
312
- or normalize_text(source_author.get("douyin_sec_uid"))
313
- )
314
- douyin_aweme_author_id = (
315
- normalize_text(payload.get("douyin_aweme_author_id"))
316
- or normalize_text(author.get("douyin_aweme_author_id"))
317
- or normalize_text(source_author.get("douyin_aweme_author_id"))
318
- )
319
-
320
196
  return {
321
197
  "nickname": nickname,
322
198
  "author_handle": author_handle,
323
199
  "platform_author_id": platform_author_id,
324
- "xhs_user_id": xhs_user_id,
325
- "xhs_sec_token": xhs_sec_token,
326
- "douyin_sec_uid": douyin_sec_uid,
327
- "douyin_aweme_author_id": douyin_aweme_author_id,
328
200
  }
329
201
 
330
202
 
@@ -343,11 +215,11 @@ def _clean_for_filename(text: str) -> str:
343
215
  normalized = normalized.replace("\n", " ").replace("\r", " ")
344
216
 
345
217
  kept: List[str] = []
346
- for ch in normalized:
347
- cat = unicodedata.category(ch)
348
- if _is_cjk(ch) or ch.isalnum() or ch in {" ", "-", "_"}:
349
- kept.append(ch)
350
- elif cat.startswith("Z"):
218
+ for char in normalized:
219
+ category = unicodedata.category(char)
220
+ if _is_cjk(char) or char.isalnum() or char in {" ", "-", "_"}:
221
+ kept.append(char)
222
+ elif category.startswith("Z"):
351
223
  kept.append(" ")
352
224
 
353
225
  compact = "".join(kept)
@@ -359,25 +231,29 @@ def _clean_for_filename(text: str) -> str:
359
231
  def _clip_with_min(text: str, min_len: int, max_len: int, fallback: str) -> str:
360
232
  candidate = _clean_for_filename(text)
361
233
  fallback_clean = _clean_for_filename(fallback)
362
-
363
234
  if not candidate:
364
235
  candidate = fallback_clean
365
236
  if len(candidate) < min_len:
366
237
  candidate = (candidate + fallback_clean)[:max_len]
367
238
  if len(candidate) < min_len:
368
239
  candidate = (candidate + "内容速览")[:max_len]
369
-
370
240
  candidate = candidate[:max_len]
371
241
  if len(candidate) < min_len:
372
242
  candidate = (candidate + "作品卡")[:max_len]
373
243
  return candidate[:max_len] if candidate else fallback_clean[:max_len]
374
244
 
375
245
 
376
- def _pick_author_slug(payload: Dict[str, Any], author_hint: Optional[str] = None) -> str:
377
- base = normalize_text(author_hint)
378
- if not base:
379
- author = _extract_author(payload)
380
- base = author["nickname"] or author["author_handle"] or author["platform_author_id"] or "作者"
246
+ def _extract_platform_work_id(payload: Dict[str, Any]) -> str:
247
+ return _pick_text(
248
+ payload,
249
+ ["platform_work_id", "aweme_id", "note_id", "item_id", "id"],
250
+ ["platform_work_id", "aweme_id", "note_id", "item_id", "id"],
251
+ )
252
+
253
+
254
+ def _pick_author_slug(payload: Dict[str, Any]) -> str:
255
+ author = _extract_author(payload)
256
+ base = author["nickname"] or author["author_handle"] or author["platform_author_id"] or "作者"
381
257
  slug = _clip_with_min(base, min_len=2, max_len=18, fallback="作者")
382
258
  return slug if len(slug) >= 2 else "作者"
383
259
 
@@ -433,37 +309,62 @@ def _extract_tags(payload: Dict[str, Any]) -> List[str]:
433
309
  tags = [normalize_text(item).lstrip("#") for item in value if normalize_text(item)]
434
310
  if tags:
435
311
  return list(dict.fromkeys(tags))
436
-
437
312
  return []
438
313
 
439
314
 
315
+ def _format_duration(duration_ms: int) -> str:
316
+ if duration_ms <= 0:
317
+ return "未知"
318
+ total_sec = duration_ms // 1000
319
+ minute, second = divmod(total_sec, 60)
320
+ if minute:
321
+ return f"{minute}分{second:02d}秒"
322
+ return f"{second}秒"
323
+
324
+
325
+ def _analysis_status_from_sections(analysis_sections: Dict[str, Any]) -> Dict[str, Any]:
326
+ meta = analysis_sections.get("meta") if isinstance(analysis_sections.get("meta"), dict) else {}
327
+ reason = normalize_text(meta.get("reason"))
328
+ if meta.get("llm_used"):
329
+ status = "completed"
330
+ elif not reason or reason == "analysis_mode_local":
331
+ status = "skipped"
332
+ elif "timeout" in reason:
333
+ status = "timeout"
334
+ elif "unavailable" in reason:
335
+ status = "unavailable"
336
+ else:
337
+ status = "failed"
338
+ return {
339
+ "status": status,
340
+ "provider": normalize_text(analysis_sections.get("provider")) or "local",
341
+ "reason": reason or None,
342
+ "duration_ms": _safe_int(meta.get("duration_ms"), default=0),
343
+ "llm_used": bool(meta.get("llm_used")),
344
+ "degraded": bool(meta.get("degraded")),
345
+ }
346
+
347
+
348
+ def _has_meaningful_analysis_sections(value: Any) -> bool:
349
+ if not isinstance(value, dict):
350
+ return False
351
+ modules = value.get("modules")
352
+ if not isinstance(modules, dict):
353
+ return False
354
+ return any(bool(normalize_text(item)) for items in modules.values() if isinstance(items, list) for item in items)
355
+
356
+
440
357
  def _extract_required_fields(payload: Dict[str, Any], platform: str) -> Dict[str, Any]:
441
358
  author = _extract_author(payload)
442
-
443
359
  title = _pick_text(payload, ["title", "desc"], ["title", "desc"])
444
360
  caption_raw = normalize_text(payload.get("caption_raw") or payload.get("desc"))
445
361
  platform_work_id = _extract_platform_work_id(payload)
446
-
447
- source_url = _pick_text(
448
- payload,
449
- ["source_url", "share_url", "url"],
450
- ["source_url", "share_url", "url", "share_text"],
451
- )
452
- share_url = _pick_text(
453
- payload,
454
- ["share_url", "canonical_share_url"],
455
- ["share_url", "canonical_share_url", "url", "source_url", "share_text"],
456
- ) or source_url
457
-
458
- cover_image = _pick_text(
459
- payload,
460
- ["cover_image", "cover_url", "cover"],
461
- ["cover_image", "cover_url", "cover", "origin_cover"],
462
- )
362
+ source_url = _pick_text(payload, ["source_url", "share_url", "url"], ["source_url", "share_url", "url"])
363
+ share_url = _pick_text(payload, ["share_url", "canonical_share_url"], ["share_url", "canonical_share_url", "url"]) or source_url
364
+ cover_image = _pick_text(payload, ["cover_image", "cover_url", "cover"], ["cover_image", "cover_url", "cover"])
463
365
  selected_images = payload.get("selected_image_urls")
464
366
  if not cover_image and isinstance(selected_images, list) and selected_images:
465
367
  cover_image = normalize_text(selected_images[0])
466
-
467
368
  video_download_url = _pick_text(
468
369
  payload,
469
370
  ["video_download_url", "video_down_url", "selected_video_url", "original_video_url", "video_url", "download_url"],
@@ -476,41 +377,26 @@ def _extract_required_fields(payload: Dict[str, Any], platform: str) -> Dict[str
476
377
  if create_time_sec <= 0:
477
378
  create_time_sec = _to_unix_sec(_source_dict(payload).get("create_time"))
478
379
 
479
- digg_count = _safe_int(payload.get("digg_count"), default=0)
480
- comment_count = _safe_int(payload.get("comment_count"), default=0)
481
- collect_count = _safe_int(payload.get("collect_count"), default=0)
482
- share_count = _safe_int(payload.get("share_count"), default=0)
483
- play_count = _safe_optional_int(payload.get("play_count"))
484
-
485
- summary = normalize_text(payload.get("summary"))
486
- raw_content = normalize_text(payload.get("raw_content"))
487
- primary_text = normalize_text(payload.get("primary_text"))
380
+ raw_content = normalize_text(payload.get("asr_raw") or payload.get("raw_content"))
488
381
  provided_asr_clean = normalize_text(payload.get("asr_clean"))
489
- asr_clean = _clean_asr_text(raw_content, provided_asr_clean)
490
-
491
- duration_ms = _extract_duration_ms(payload)
492
-
493
- category = normalize_text(payload.get("category"))
494
- if not category:
495
- category = "观点"
496
-
497
- hot_score = _safe_int(payload.get("hot_score"), default=0)
498
- if hot_score <= 0:
499
- hot_score = digg_count + comment_count * 2 + collect_count * 3 + share_count * 4
382
+ asr_clean = derive_asr_clean_text(raw_content, provided_asr_clean)
500
383
 
501
384
  work_modality = normalize_text(payload.get("work_modality"))
502
385
  if not work_modality:
503
386
  work_modality = "video" if video_download_url or raw_content else "text"
504
387
 
505
- published_date = _resolve_published_date(payload, create_time_sec)
506
- primary_text_source_raw = normalize_text(payload.get("primary_text_source"))
507
- primary_text_source = (
508
- primary_text_source_raw
509
- if primary_text_source_raw in {"asr_clean", "caption_raw"}
510
- else ("asr_clean" if work_modality == "video" else "caption_raw")
511
- )
388
+ primary_text_source = normalize_text(payload.get("primary_text_source"))
389
+ if primary_text_source not in {"asr_clean", "caption_raw"}:
390
+ primary_text_source = "asr_clean" if work_modality == "video" else "caption_raw"
391
+ primary_text = normalize_text(payload.get("primary_text"))
512
392
  if not primary_text:
513
- primary_text = asr_clean if primary_text_source == "asr_clean" else normalize_text(payload.get("desc"))
393
+ primary_text = asr_clean if primary_text_source == "asr_clean" else (caption_raw or raw_content)
394
+
395
+ analysis_sections = ensure_analysis_sections_schema(
396
+ payload.get("analysis_sections") if isinstance(payload.get("analysis_sections"), dict) else {},
397
+ provider="local",
398
+ llm_used=False,
399
+ )
514
400
 
515
401
  return {
516
402
  "title": title,
@@ -524,414 +410,28 @@ def _extract_required_fields(payload: Dict[str, Any], platform: str) -> Dict[str
524
410
  "source_url": source_url,
525
411
  "cover_image": cover_image,
526
412
  "video_download_url": video_download_url,
527
- "published_date": published_date,
528
- "duration_ms": duration_ms,
529
- "digg_count": digg_count,
530
- "comment_count": comment_count,
531
- "collect_count": collect_count,
532
- "share_count": share_count,
533
- "play_count": play_count,
413
+ "published_date": _resolve_published_date(payload, create_time_sec),
414
+ "duration_ms": _extract_duration_ms(payload),
415
+ "digg_count": _safe_int(payload.get("digg_count"), default=0),
416
+ "comment_count": _safe_int(payload.get("comment_count"), default=0),
417
+ "collect_count": _safe_int(payload.get("collect_count"), default=0),
418
+ "share_count": _safe_int(payload.get("share_count"), default=0),
419
+ "play_count": _safe_optional_int(payload.get("play_count")),
534
420
  "tags": _extract_tags(payload),
535
421
  "work_modality": work_modality,
536
- "category": category,
422
+ "category": normalize_text(payload.get("category")) or "观点",
537
423
  "content_kind": normalize_text(payload.get("content_kind")),
538
- "summary": summary,
539
- "hot_score": hot_score,
424
+ "summary": normalize_text(payload.get("summary")),
540
425
  "raw_content": raw_content,
541
- "primary_text": primary_text,
426
+ "asr_raw": raw_content,
542
427
  "asr_clean": asr_clean,
543
- "platform_native_refs": payload.get("platform_native_refs") if isinstance(payload.get("platform_native_refs"), dict) else {},
428
+ "primary_text": primary_text,
429
+ "primary_text_source": primary_text_source,
544
430
  "request_id": payload.get("request_id"),
545
431
  "confidence": normalize_text(payload.get("confidence")) or "low",
546
432
  "error_reason": payload.get("error_reason"),
547
433
  "extract_trace": payload.get("extract_trace", []),
548
- "analysis_sections": payload.get("analysis_sections") if isinstance(payload.get("analysis_sections"), dict) else {},
549
- "analysis_output": payload.get("analysis_output") if isinstance(payload.get("analysis_output"), dict) else {},
550
- "author_analysis_v2": payload.get("author_analysis_v2") if isinstance(payload.get("author_analysis_v2"), dict) else {},
551
- "author_analysis_input_v1": payload.get("author_analysis_input_v1") if isinstance(payload.get("author_analysis_input_v1"), dict) else {},
552
- "sampled_work_explanations": payload.get("sampled_work_explanations") if isinstance(payload.get("sampled_work_explanations"), dict) else {},
553
- "author_card_highlights": payload.get("author_card_highlights") if isinstance(payload.get("author_card_highlights"), dict) else {},
554
- "validation": payload.get("validation") if isinstance(payload.get("validation"), dict) else {},
555
- "business_score": _safe_int(payload.get("business_score"), default=0),
556
- "benchmark_gap_score": _safe_int(payload.get("benchmark_gap_score"), default=0),
557
- "style_radar": payload.get("style_radar") if isinstance(payload.get("style_radar"), dict) else {},
558
- "core_contradictions": payload.get("core_contradictions") if isinstance(payload.get("core_contradictions"), list) else [],
559
- "recommendations": payload.get("recommendations") if isinstance(payload.get("recommendations"), list) else [],
560
- "business_analysis": normalize_text(payload.get("business_analysis")),
561
- "benchmark_analysis": normalize_text(payload.get("benchmark_analysis")),
562
- "nickname": normalize_text(payload.get("nickname")),
563
- "ip_location": normalize_text(payload.get("ip_location")),
564
- "signature": normalize_text(payload.get("signature")),
565
- "avatar_url": normalize_text(payload.get("avatar_url")),
566
- "fans_count": _safe_optional_int(payload.get("fans_count")),
567
- "liked_count": _safe_optional_int(payload.get("liked_count")),
568
- "collected_count": _safe_optional_int(payload.get("collected_count")),
569
- "works_count": _safe_optional_int(payload.get("works_count")),
570
- "verified": payload.get("verified") if isinstance(payload.get("verified"), bool) else None,
571
- "snapshot_at": normalize_text(payload.get("snapshot_at")),
572
- }
573
-
574
-
575
- def _format_create_time(create_time_sec: int) -> str:
576
- text = _format_shanghai_datetime(create_time_sec)
577
- if text:
578
- return text
579
- if create_time_sec <= 0:
580
- return "未知"
581
- return str(create_time_sec)
582
-
583
-
584
- def _format_duration(duration_ms: int) -> str:
585
- if duration_ms <= 0:
586
- return "未知"
587
- total_sec = duration_ms // 1000
588
- minute, second = divmod(total_sec, 60)
589
- if minute:
590
- return f"{minute}分{second:02d}秒"
591
- return f"{second}秒"
592
-
593
-
594
- def _sentence_units(text: str) -> List[str]:
595
- if not text:
596
- return []
597
- return [normalize_text(x) for x in re.split(r"[。!?!?;;\\n]+", text) if normalize_text(x)]
598
-
599
-
600
- def _first_sentence(text: str) -> str:
601
- units = _sentence_units(text)
602
- return units[0] if units else ""
603
-
604
-
605
- def _hit_count(text: str, keywords: List[str]) -> int:
606
- if not text:
607
- return 0
608
- return sum(1 for token in keywords if token in text)
609
-
610
-
611
- def _top_keywords(text: str, candidates: List[str], topn: int = 3) -> List[str]:
612
- if not text:
613
- return []
614
- scored = []
615
- for token in candidates:
616
- count = text.count(token)
617
- if count > 0:
618
- scored.append((count, token))
619
- scored.sort(key=lambda x: (-x[0], len(x[1])))
620
- return [token for _, token in scored[:topn]]
621
-
622
-
623
- def _score_from_hits(hits: int, full_score_hits: int = 4) -> int:
624
- if hits <= 0:
625
- return 2
626
- if hits >= full_score_hits:
627
- return 5
628
- return min(5, hits + 2)
629
-
630
-
631
-
632
- def _analyze_topic(fields: Dict[str, Any]) -> Dict[str, Any]:
633
- title = normalize_text(fields.get("title") or "")
634
- asr = normalize_text(fields.get("asr_clean") or "")
635
- category = normalize_text(fields.get("category") or "")
636
- text = f"{title} {asr}"
637
-
638
- if not text.strip():
639
- return {
640
- "score": 2,
641
- "lines": ["- 类型:数据不足。", "- 细分主题:数据不足。", "- 受众痛点:数据不足,需补充标题或ASR。"],
642
- "gaps": ["补齐标题或ASR文本,才能完成选题分类与主题归因"],
643
- "evidence": "输入文本缺失",
644
- }
645
-
646
- type_rules = {
647
- "流量型": ["热点", "挑战", "反转", "揭秘", "真相", "别再", "为什么", "踩坑", "3秒", "爆款"],
648
- "人设型": ["我是", "我们", "日常", "系列", "分享", "经历", "成长", "复盘", "带你", "我"],
649
- "营销型": ["领取", "私信", "咨询", "下单", "课程", "优惠", "链接", "报名", "合作", "购买"],
650
- }
651
- type_scores = {name: _hit_count(text, kws) for name, kws in type_rules.items()}
652
-
653
- if category in ["教程", "知识", "方法"]:
654
- type_scores["营销型"] += 1
655
- if category in ["观点", "人设", "日常"]:
656
- type_scores["人设型"] += 1
657
-
658
- main_type = max(type_scores, key=lambda k: type_scores[k])
659
- main_hits = type_scores[main_type]
660
-
661
- theme_candidates = [
662
- "AI", "智能体", "变现", "副业", "教程", "工作流", "流量", "涨粉", "投流", "口播", "脚本", "工具", "私域", "创业", "营销",
663
- ]
664
- themes = _top_keywords(text, theme_candidates, topn=3)
665
- pain_candidates = ["不会", "焦虑", "卡住", "没流量", "转化", "不会写", "不会做", "时间不够", "担心", "风险"]
666
- pains = _top_keywords(text, pain_candidates, topn=2)
667
-
668
- lines = [
669
- f"- 基础类型:{main_type}(命中信号 {main_hits} 个)。",
670
- f"- 细分主题:{'、'.join(themes) if themes else '数据不足(未检测到显著主题词)'}。",
671
- f"- 受众痛点:{'、'.join(pains) if pains else '以“快速落地/降低门槛”为主(显性痛点词不足)'}。",
672
- ]
673
-
674
- return {
675
- "score": _score_from_hits(main_hits),
676
- "lines": lines,
677
- "gaps": [] if themes else ["补充更完整ASR,提高细分主题识别稳定性"],
678
- "evidence": f"类型命中分布={type_scores}",
679
- }
680
-
681
-
682
- def _analyze_style(fields: Dict[str, Any]) -> Dict[str, Any]:
683
- asr = normalize_text(fields.get("asr_clean") or "")
684
- title = normalize_text(fields.get("title") or "")
685
- text = f"{title} {asr}".strip()
686
- units = _sentence_units(asr)
687
-
688
- if not text:
689
- return {
690
- "score": 2,
691
- "lines": ["- 人设匹配:数据不足。", "- 句式结构:数据不足。", "- 语气与情绪:数据不足。"],
692
- "gaps": ["补齐ASR文本后再做文风拆解"],
693
- "evidence": "输入文本缺失",
694
- }
695
-
696
- avg_len = int(sum(len(u) for u in units) / max(1, len(units))) if units else 0
697
- if avg_len <= 14:
698
- length_type = "短句为主"
699
- elif avg_len <= 24:
700
- length_type = "中短句混合"
701
- else:
702
- length_type = "中长句为主"
703
-
704
- q_count = text.count("?") + text.count("?")
705
- e_count = text.count("!") + text.count("!")
706
- statement_count = max(0, len(units) - q_count - e_count)
707
- persona_hits = _hit_count(text, ["我", "我们", "你", "大家", "朋友们", "聪明的你"])
708
- rhetoric_hits = _hit_count(text, ["不是", "而是", "其实", "真的", "一定", "必须", "先", "再"])
709
-
710
- lines = [
711
- f"- 句式结构:{length_type},平均句长约 {avg_len} 字。",
712
- f"- 语气分布:疑问 {q_count} / 感叹 {e_count} / 陈述 {statement_count}。",
713
- f"- 人设与修辞:人设代词命中 {persona_hits} 次,强调/转折词命中 {rhetoric_hits} 次。",
714
- ]
715
-
716
- strength_hits = int(avg_len > 0) + int(persona_hits > 0) + int(rhetoric_hits > 0)
717
- return {
718
- "score": _score_from_hits(strength_hits, full_score_hits=3),
719
- "lines": lines,
720
- "gaps": [] if units else ["ASR分句失败,建议人工复核"],
721
- "evidence": f"avg_len={avg_len}, persona_hits={persona_hits}, rhetoric_hits={rhetoric_hits}",
722
- }
723
-
724
-
725
- def _analyze_hook(fields: Dict[str, Any]) -> Dict[str, Any]:
726
- title = normalize_text(fields.get("title") or "")
727
- asr = normalize_text(fields.get("asr_clean") or "")
728
- first = _first_sentence(asr) or title
729
- middle = _sentence_units(asr)[len(_sentence_units(asr)) // 2] if _sentence_units(asr) else ""
730
-
731
- if not first:
732
- return {
733
- "score": 2,
734
- "lines": ["- 开头钩子:数据不足。", "- 中段钩子:数据不足。", "- 结尾钩子:数据不足。"],
735
- "gaps": ["缺少标题与ASR,无法提取钩子原话"],
736
- "evidence": "开头句缺失",
737
- }
738
-
739
- hook_type = "陈述式"
740
- if any(k in first for k in ["?", "?", "为什么", "怎么"]):
741
- hook_type = "疑问式"
742
- elif any(k in first for k in ["别再", "误区", "真相", "不是"]):
743
- hook_type = "反常识式"
744
- elif any(k in first for k in ["当你", "如果", "今天"]):
745
- hook_type = "场景代入式"
746
-
747
- end_candidates = [u for u in _sentence_units(asr) if _hit_count(u, ["关注", "评论", "私信", "收藏", "转发", "下次见", "领取"]) > 0]
748
- end = end_candidates[-1] if end_candidates else "未检测到明确结尾钩子"
749
-
750
- lines = [
751
- f"- 开头钩子({hook_type}):{first}",
752
- f"- 中段钩子:{middle or '数据不足(中段文本不足)'}",
753
- f"- 结尾钩子:{end}",
754
- ]
755
-
756
- hook_hits = int(first != "") + int(bool(middle)) + int(end != "未检测到明确结尾钩子")
757
- return {
758
- "score": _score_from_hits(hook_hits, full_score_hits=3),
759
- "lines": lines,
760
- "gaps": [] if hook_hits >= 2 else ["建议补充中段转折钩子与结尾动作钩子"],
761
- "evidence": f"hook_type={hook_type}, hook_hits={hook_hits}",
762
- }
763
-
764
-
765
- def _analyze_structure(fields: Dict[str, Any]) -> Dict[str, Any]:
766
- asr = normalize_text(fields.get("asr_clean") or "")
767
- units = _sentence_units(asr)
768
- if not units:
769
- return {
770
- "score": 2,
771
- "lines": ["- 结构标签:数据不足。", "- 模板判定:数据不足。"],
772
- "gaps": ["补充ASR后再进行结构标注"],
773
- "evidence": "分句为空",
774
- }
775
-
776
- label_rules = {
777
- "钩子": ["?", "?", "为什么", "怎么", "别再", "真相", "当你", "如果"],
778
- "冲突": ["但是", "却", "问题", "误区", "卡住", "焦虑", "失败"],
779
- "转折": ["所以", "于是", "然后", "接着", "这时候", "其实"],
780
- "举证": ["数据", "案例", "比如", "步骤", "第一", "第二", "第三"],
781
- "CTA": ["评论", "关注", "私信", "收藏", "转发", "点击", "领取", "报名"],
782
- }
783
- coverage = {k: 0 for k in label_rules}
784
- for sent in units:
785
- for label, kws in label_rules.items():
786
- if any(kw in sent for kw in kws):
787
- coverage[label] += 1
788
-
789
- present = [k for k, v in coverage.items() if v > 0]
790
- missing = [k for k, v in coverage.items() if v == 0]
791
- template = "钩子→冲突→转折→举证→CTA" if len(present) >= 4 else "钩子→观点→补充说明"
792
-
793
- lines = [
794
- f"- 结构标签覆盖:{', '.join([f'{k}:{v}' for k, v in coverage.items()])}。",
795
- f"- 模板判定:{template}。",
796
- f"- 缺失模块:{'、'.join(missing) if missing else '无'}。",
797
- ]
798
-
799
- return {
800
- "score": _score_from_hits(len(present), full_score_hits=5),
801
- "lines": lines,
802
- "gaps": [f"优先补齐结构模块:{'、'.join(missing)}"] if missing else [],
803
- "evidence": f"coverage={coverage}",
804
- }
805
-
806
-
807
- def _analyze_cta(fields: Dict[str, Any]) -> Dict[str, Any]:
808
- asr = normalize_text(fields.get("asr_clean") or "")
809
- units = _sentence_units(asr)
810
- cta_tokens = ["评论", "关注", "私信", "收藏", "转发", "点击", "领取", "报名", "下单", "咨询", "试试"]
811
- cta_sentences = [u for u in units if any(token in u for token in cta_tokens)]
812
-
813
- if not units:
814
- return {
815
- "score": 2,
816
- "lines": ["- CTA策略:数据不足。", "- 行动指令:数据不足。"],
817
- "gaps": ["缺少ASR,无法识别CTA"],
818
- "evidence": "分句为空",
819
- }
820
-
821
- if not cta_sentences:
822
- return {
823
- "score": 2,
824
- "lines": ["- CTA策略:未检测到明确行动号召。", "- 行动指令:建议补一句“评论区/私信领取”。"],
825
- "gaps": ["补充单一明确CTA,避免只有信息陈述"],
826
- "evidence": "cta_sentences=0",
827
- }
828
-
829
- primary_cta = cta_sentences[-1]
830
- cta_types = []
831
- if any(k in asr for k in ["评论", "点赞", "收藏", "转发", "关注"]):
832
- cta_types.append("互动型")
833
- if any(k in asr for k in ["私信", "领取", "链接", "资料"]):
834
- cta_types.append("线索型")
835
- if any(k in asr for k in ["下单", "报名", "咨询", "购买"]):
836
- cta_types.append("转化型")
837
-
838
- lines = [
839
- f"- CTA类型:{'、'.join(cta_types) if cta_types else '互动型(弱)'}。",
840
- f"- 关键动作句:{primary_cta}",
841
- f"- CTA密度:{len(cta_sentences)}/{len(units)} 句。",
842
- ]
843
-
844
- return {
845
- "score": _score_from_hits(len(cta_types) + int(len(cta_sentences) > 0), full_score_hits=3),
846
- "lines": lines,
847
- "gaps": [] if len(cta_types) > 0 else ["补充线索型或转化型CTA,提高商业闭环"],
848
- "evidence": f"cta_types={cta_types}, cta_count={len(cta_sentences)}",
849
- }
850
-
851
-
852
- def _build_summary_module(results: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
853
- ordered = ["选题", "文风", "Hook", "结构", "CTA"]
854
- scored = [(name, results[name]["score"]) for name in ordered]
855
- avg_score = round(sum(score for _, score in scored) / max(1, len(scored)), 2)
856
- weakest = sorted(scored, key=lambda x: x[1])[:2]
857
-
858
- if avg_score >= 4.2:
859
- verdict = "可直接复用"
860
- elif avg_score >= 3.4:
861
- verdict = "可用,但需小幅优化"
862
- else:
863
- verdict = "需重写关键模块后再投放"
864
-
865
- suggestions = []
866
- for name, _ in weakest:
867
- gaps = results[name].get("gaps") or []
868
- if gaps:
869
- suggestions.append(f"- [{name}] {gaps[0]}")
870
- if not suggestions:
871
- suggestions = ["- 保持当前结构,持续做A/B测试验证Hook与CTA。"]
872
-
873
- return {
874
- "score": int(round(avg_score)),
875
- "lines": [
876
- f"- 结论:综合评分 {avg_score}/5,判定为“{verdict}”。",
877
- "- 建议:",
878
- *suggestions[:3],
879
- ],
880
- "gaps": [],
881
- "evidence": f"scores={dict(scored)}",
882
- }
883
-
884
-
885
- def _insight_metric_snapshot(fields: Dict[str, Any]) -> Dict[str, Any]:
886
- digg = _safe_int(fields.get("digg_count"), default=0)
887
- comment = _safe_int(fields.get("comment_count"), default=0)
888
- collect = _safe_int(fields.get("collect_count"), default=0)
889
- share = _safe_int(fields.get("share_count"), default=0)
890
- play = _safe_int(fields.get("play_count"), default=0)
891
-
892
- interaction = digg + comment * 2 + collect * 3 + share * 4
893
- interaction_rate = interaction / play if play > 0 else 0.0
894
- return {
895
- "interaction": interaction,
896
- "interaction_rate": interaction_rate,
897
- "digg": digg,
898
- "comment": comment,
899
- "collect": collect,
900
- "share": share,
901
- }
902
-
903
-
904
- def _build_local_analysis_sections(fields: Dict[str, Any]) -> Dict[str, Any]:
905
- topic = _analyze_topic(fields)
906
- style = _analyze_style(fields)
907
- hook = _analyze_hook(fields)
908
- structure = _analyze_structure(fields)
909
- cta = _analyze_cta(fields)
910
- summary = _build_summary_module(
911
- {
912
- "选题": topic,
913
- "文风": style,
914
- "Hook": hook,
915
- "结构": structure,
916
- "CTA": cta,
917
- }
918
- )
919
- metrics = _insight_metric_snapshot(fields)
920
- insight_lines = list(summary.get("lines") or [])
921
- insight_lines.extend(
922
- [
923
- f"- 互动折算值:{metrics.get('interaction', 0)}。",
924
- f"- 粗略互动率:{metrics.get('interaction_rate', 0.0):.4f}。",
925
- ]
926
- )
927
- return {
928
- "modules": {
929
- "选题": topic.get("lines", ["数据不足"]),
930
- "文风": style.get("lines", ["数据不足"]),
931
- "Hook": hook.get("lines", ["数据不足"]),
932
- "结构": structure.get("lines", ["数据不足"]),
933
- },
934
- "insight": insight_lines or ["数据不足"],
434
+ "analysis_sections": analysis_sections,
935
435
  }
936
436
 
937
437
 
@@ -940,19 +440,34 @@ def build_card_analysis_artifact(
940
440
  payload: Dict[str, Any],
941
441
  platform: str,
942
442
  card_type: str,
443
+ analysis_mode: str = "auto",
444
+ storage_config: Optional[Dict[str, Any]] = None,
445
+ progress: Optional[ProgressReporter] = None,
943
446
  ) -> Dict[str, Any]:
944
447
  fields = _extract_required_fields(payload, platform=platform)
945
- precomputed = fields.get("analysis_sections") if isinstance(fields.get("analysis_sections"), dict) else {}
946
- if precomputed:
947
- analysis_sections = precomputed
948
- elif card_type == "author":
949
- analysis_sections = {}
448
+ if _has_meaningful_analysis_sections(payload.get("analysis_sections")):
449
+ existing = payload.get("analysis_sections")
450
+ meta = existing.get("meta") if isinstance(existing, dict) and isinstance(existing.get("meta"), dict) else {}
451
+ analysis_sections = ensure_analysis_sections_schema(
452
+ existing,
453
+ provider=normalize_text(existing.get("provider")) or "local",
454
+ llm_used=bool(meta.get("llm_used")),
455
+ degraded=bool(meta.get("degraded")),
456
+ reason=normalize_text(meta.get("reason")),
457
+ duration_ms=_safe_int(meta.get("duration_ms"), default=0),
458
+ )
950
459
  else:
951
- analysis_sections = build_analysis_sections(fields)
460
+ analysis_sections = build_analysis_sections(
461
+ fields,
462
+ analysis_mode=analysis_mode,
463
+ analysis_config=storage_config.get("analysis") if isinstance(storage_config, dict) else None,
464
+ progress=progress,
465
+ )
952
466
  fields["analysis_sections"] = analysis_sections
953
467
  return {
954
468
  "fields": fields,
955
469
  "analysis_sections": analysis_sections,
470
+ "card_type": normalize_card_type(card_type),
956
471
  }
957
472
 
958
473
 
@@ -963,12 +478,10 @@ def _build_output_path(
963
478
  card_type: str,
964
479
  payload: Dict[str, Any],
965
480
  now: dt.datetime,
966
- sample_author: Optional[str],
967
481
  storage_config: Optional[Dict[str, Any]],
968
482
  ) -> Dict[str, str]:
969
- author_slug = _pick_author_slug(payload, author_hint=sample_author)
483
+ author_slug = _pick_author_slug(payload)
970
484
  title_slug = _pick_title_slug(payload)
971
-
972
485
  path, route_parts = build_card_output_path(
973
486
  card_root=card_root,
974
487
  platform=platform,
@@ -983,170 +496,9 @@ def _build_output_path(
983
496
  return {
984
497
  "path": path,
985
498
  "route_parts": route_parts,
986
- "author_slug": author_slug,
987
- "title_slug": title_slug,
988
- "target_type": card_type,
989
499
  }
990
500
 
991
501
 
992
- def _render_author_markdown(
993
- *,
994
- card_id: str,
995
- card_type: str,
996
- fields: Dict[str, Any],
997
- generated_at: str,
998
- ) -> str:
999
- analysis_output = fields.get("analysis_output") if isinstance(fields.get("analysis_output"), dict) else {}
1000
- author_analysis_v2 = fields.get("author_analysis_v2") if isinstance(fields.get("author_analysis_v2"), dict) else analysis_output.get("author_analysis_v2", {})
1001
- if not isinstance(author_analysis_v2, dict):
1002
- author_analysis_v2 = {}
1003
- sampled_work_explanations = fields.get("sampled_work_explanations") if isinstance(fields.get("sampled_work_explanations"), dict) else analysis_output.get("sampled_work_explanations", {})
1004
- if not isinstance(sampled_work_explanations, dict):
1005
- sampled_work_explanations = {}
1006
- author_card_highlights = fields.get("author_card_highlights") if isinstance(fields.get("author_card_highlights"), dict) else {}
1007
- if not isinstance(author_card_highlights, dict):
1008
- author_card_highlights = {}
1009
- validation = fields.get("validation") if isinstance(fields.get("validation"), dict) else analysis_output.get("validation", {})
1010
- if not isinstance(validation, dict):
1011
- validation = {}
1012
-
1013
- business_score = _safe_int(fields.get("business_score"), default=_safe_int(analysis_output.get("business_score"), default=0))
1014
- benchmark_gap_score = _safe_int(fields.get("benchmark_gap_score"), default=_safe_int(analysis_output.get("benchmark_gap_score"), default=0))
1015
- style_radar = fields.get("style_radar") if isinstance(fields.get("style_radar"), dict) else analysis_output.get("style_radar", {})
1016
- if not isinstance(style_radar, dict):
1017
- style_radar = {}
1018
-
1019
- core_contradictions = fields.get("core_contradictions") if isinstance(fields.get("core_contradictions"), list) else analysis_output.get("core_contradictions", [])
1020
- if not isinstance(core_contradictions, list):
1021
- core_contradictions = []
1022
-
1023
- recommendations = fields.get("recommendations") if isinstance(fields.get("recommendations"), list) else analysis_output.get("recommendations", [])
1024
- if not isinstance(recommendations, list):
1025
- recommendations = []
1026
-
1027
- business_analysis = normalize_text(fields.get("business_analysis")) or normalize_text(analysis_output.get("business_analysis"))
1028
- benchmark_analysis = normalize_text(fields.get("benchmark_analysis")) or normalize_text(analysis_output.get("benchmark_analysis"))
1029
- author_portrait = normalize_text(author_card_highlights.get("one_liner")) or normalize_text(fields.get("summary")) or normalize_text(analysis_output.get("author_portrait"))
1030
-
1031
- fm = {
1032
- "card_id": card_id,
1033
- "card_type": card_type,
1034
- "platform": fields.get("platform"),
1035
- "generated_at": generated_at,
1036
- "updated_at": generated_at,
1037
- "title": fields.get("title"),
1038
- "platform_work_id": fields.get("platform_work_id"),
1039
- "author": fields.get("author"),
1040
- "author_handle": fields.get("author_handle"),
1041
- "platform_author_id": fields.get("platform_author_id"),
1042
- "nickname": fields.get("nickname"),
1043
- "ip_location": fields.get("ip_location"),
1044
- "avatar_url": fields.get("avatar_url"),
1045
- "signature": fields.get("signature"),
1046
- "fans_count": fields.get("fans_count"),
1047
- "liked_count": fields.get("liked_count"),
1048
- "collected_count": fields.get("collected_count"),
1049
- "works_count": fields.get("works_count"),
1050
- "verified": fields.get("verified"),
1051
- "snapshot_at": fields.get("snapshot_at"),
1052
- "business_score": business_score,
1053
- "benchmark_gap_score": benchmark_gap_score,
1054
- "request_id": fields.get("request_id"),
1055
- }
1056
-
1057
- frontmatter = ["---"]
1058
- for key, value in fm.items():
1059
- frontmatter.append(f"{key}: {json.dumps(value, ensure_ascii=False)}")
1060
- frontmatter.append("---")
1061
-
1062
- lines = [
1063
- *frontmatter,
1064
- "",
1065
- "## 基础事实",
1066
- f"- 平台:{fields.get('platform') or '未知'}",
1067
- f"- 作者ID:{fields.get('platform_author_id') or '未知'}",
1068
- f"- 账号标识:{fields.get('author_handle') or 'N/A'}",
1069
- f"- 昵称:{fields.get('nickname') or fields.get('author') or '未知'}",
1070
- f"- IP属地:{fields.get('ip_location') or 'N/A'}",
1071
- f"- 签名:{fields.get('signature') or 'N/A'}",
1072
- f"- 头像:{fields.get('avatar_url') or 'N/A'}",
1073
- f"- 粉丝数:{_display_metric(fields.get('fans_count'))}",
1074
- f"- 累计获赞:{_display_metric(fields.get('liked_count'))}",
1075
- f"- 累计收藏:{_display_metric(fields.get('collected_count'))}",
1076
- f"- 作品数:{_display_metric(fields.get('works_count'))}",
1077
- f"- 认证状态:{'是' if fields.get('verified') else '否'}" if fields.get('verified') is not None else "- 认证状态:N/A",
1078
- f"- 抓取时间:{fields.get('snapshot_at') or 'N/A'}",
1079
- "",
1080
- "## 作者画像",
1081
- author_portrait or "数据不足",
1082
- "",
1083
- "## 主页摘要卡",
1084
- f"- 核心价值:{normalize_text(author_card_highlights.get('core_value_proposition')) or '数据不足'}",
1085
- f"- 主要信任源:{normalize_text(author_card_highlights.get('primary_trust_source')) or '数据不足'}",
1086
- f"- 胜率结构:{('、'.join([normalize_text(x) for x in author_card_highlights.get('winning_content_structures', []) if normalize_text(x)])) or '数据不足'}",
1087
- f"- 可能产品:{('、'.join([normalize_text(x) for x in author_card_highlights.get('likely_products', []) if normalize_text(x)])) or '证据不足'}",
1088
- f"- 最大张力:{normalize_text(author_card_highlights.get('most_important_tension')) or '数据不足'}",
1089
- f"- 只学一件事:{normalize_text(author_card_highlights.get('if_only_learn_one_thing')) or '数据不足'}",
1090
- "",
1091
- "## 商业分析",
1092
- business_analysis or "数据不足",
1093
- "",
1094
- "## 对标分析",
1095
- benchmark_analysis or "数据不足",
1096
- "",
1097
- "## 评分",
1098
- f"- business_score: {business_score}",
1099
- f"- benchmark_gap_score: {benchmark_gap_score}",
1100
- "",
1101
- "## 风格雷达",
1102
- "```json",
1103
- json.dumps(style_radar, ensure_ascii=False, indent=2),
1104
- "```",
1105
- "",
1106
- "## 核心矛盾",
1107
- ]
1108
-
1109
- if core_contradictions:
1110
- lines.extend([f"- {normalize_text(item)}" for item in core_contradictions if normalize_text(item)])
1111
- else:
1112
- lines.append("- 数据不足")
1113
-
1114
- lines.extend(["", "## 建议动作"])
1115
- if recommendations:
1116
- lines.extend([f"- {normalize_text(item)}" for item in recommendations if normalize_text(item)])
1117
- else:
1118
- lines.append("- 数据不足")
1119
-
1120
- lines.extend(
1121
- [
1122
- "",
1123
- "## author_analysis_v2",
1124
- "```json",
1125
- json.dumps(author_analysis_v2, ensure_ascii=False, indent=2),
1126
- "```",
1127
- "",
1128
- "## sampled_work_explanations",
1129
- "```json",
1130
- json.dumps(sampled_work_explanations, ensure_ascii=False, indent=2),
1131
- "```",
1132
- "",
1133
- "## 校验",
1134
- f"- validation_ok: {bool(validation.get('ok'))}",
1135
- f"- validation_error_count: {len(validation.get('errors') or [])}",
1136
- "",
1137
- "## 附录",
1138
- f"- confidence: {fields.get('confidence')}",
1139
- f"- error_reason: {fields.get('error_reason')}",
1140
- "",
1141
- "```json",
1142
- json.dumps(fields.get("extract_trace", []), ensure_ascii=False, indent=2),
1143
- "```",
1144
- "",
1145
- ]
1146
- )
1147
- return "\n".join(lines)
1148
-
1149
-
1150
502
  def _render_markdown(
1151
503
  *,
1152
504
  card_id: str,
@@ -1154,24 +506,9 @@ def _render_markdown(
1154
506
  fields: Dict[str, Any],
1155
507
  generated_at: str,
1156
508
  ) -> str:
1157
- if card_type == "author":
1158
- return _render_author_markdown(
1159
- card_id=card_id,
1160
- card_type=card_type,
1161
- fields=fields,
1162
- generated_at=generated_at,
1163
- )
1164
509
  author_name = fields.get("author") or fields.get("author_handle") or fields.get("platform_author_id") or "未知作者"
1165
510
  title = fields.get("title") or "(标题缺失)"
1166
- metrics_line = (
1167
- f"赞 {_display_metric(fields.get('digg_count'))} / 评 {_display_metric(fields.get('comment_count'))} / "
1168
- f"藏 {_display_metric(fields.get('collect_count'))} / 转 {_display_metric(fields.get('share_count'))} / 播 {_display_metric(fields.get('play_count'))}"
1169
- )
1170
- precomputed_sections = fields.get("analysis_sections") if isinstance(fields.get("analysis_sections"), dict) else {}
1171
- if precomputed_sections:
1172
- analysis_sections = precomputed_sections
1173
- else:
1174
- analysis_sections = {} if card_type == "author_sample_work" else build_analysis_sections(fields)
511
+ analysis_sections = ensure_analysis_sections_schema(fields.get("analysis_sections"), provider="local", llm_used=False)
1175
512
  creative_modules = analysis_sections.get("modules", {})
1176
513
  insight_lines = analysis_sections.get("insight", ["数据不足"])
1177
514
  extract_trace_json = json.dumps(fields.get("extract_trace", []), ensure_ascii=False, indent=2)
@@ -1188,7 +525,6 @@ def _render_markdown(
1188
525
  "author_handle": fields.get("author_handle"),
1189
526
  "platform_author_id": fields.get("platform_author_id"),
1190
527
  "caption_raw": fields.get("caption_raw"),
1191
- "primary_text": fields.get("primary_text"),
1192
528
  "share_url": fields.get("share_url"),
1193
529
  "source_url": fields.get("source_url"),
1194
530
  "cover_image": fields.get("cover_image"),
@@ -1212,16 +548,6 @@ def _render_markdown(
1212
548
  lines = [
1213
549
  *frontmatter,
1214
550
  "",
1215
- "## 基础信息",
1216
- f"- 作者:{author_name}",
1217
- f"- 标题:{title}",
1218
- f"- 原始文案:{fields.get('caption_raw') or 'N/A'}",
1219
- f"- 作品模态:{fields.get('work_modality') or '未知'}",
1220
- f"- 发布时间:{fields.get('published_date') or 'N/A'}",
1221
- f"- {'视频时长' if fields.get('work_modality') == 'video' else '阅读载体'}:{_format_duration(fields.get('duration_ms', 0)) if fields.get('work_modality') == 'video' else '文本'}",
1222
- f"- 互动:{metrics_line}",
1223
- f"- 链接:{fields.get('share_url') or '(未提供)'}",
1224
- f"- 下载链接:{fields.get('video_download_url') or 'N/A'}" if fields.get("work_modality") == "video" else "- 下载链接:N/A",
1225
551
  ]
1226
552
 
1227
553
  for heading in DEFAULT_MODULE_SECTIONS:
@@ -1235,24 +561,15 @@ def _render_markdown(
1235
561
  for item in insight_lines:
1236
562
  lines.append(item)
1237
563
 
1238
- transcript_heading = "## 主文本"
1239
- transcript_body = fields.get("primary_text")
1240
- transcript_fallback = "(无可用主文本)"
1241
-
1242
564
  lines.extend(
1243
565
  [
1244
566
  "",
1245
- transcript_heading,
1246
- transcript_body or transcript_fallback,
1247
- ]
1248
- )
1249
-
1250
- lines.extend(
1251
- [
567
+ "## 主文本",
568
+ fields.get("primary_text") or "(无可用主文本)",
1252
569
  "",
1253
570
  "## 附录",
1254
571
  "### ASR_RAW",
1255
- fields.get("raw_content") or "(无可用 ASR 原文)",
572
+ fields.get("asr_raw") or "(无可用 ASR 原文)",
1256
573
  "",
1257
574
  "### trace",
1258
575
  f"- request_id: {fields.get('request_id')}",
@@ -1282,30 +599,39 @@ def _resolve_card_root(card_root: Optional[str]) -> str:
1282
599
  raw = (card_root or "").strip()
1283
600
  if not raw:
1284
601
  return resolve_default_card_root()
1285
-
1286
602
  candidate = Path(raw).expanduser()
1287
603
  if not candidate.is_absolute():
1288
604
  raise ValueError("card_root must be an absolute path")
1289
605
  return str(candidate.resolve())
1290
606
 
1291
607
 
608
+ def _read_payload_from_input(input_json: str) -> Dict[str, Any]:
609
+ if input_json == "-":
610
+ raw = os.read(0, 1024 * 1024).decode("utf-8", errors="replace").strip()
611
+ if not raw:
612
+ return {}
613
+ return json.loads(raw)
614
+ return read_json_file(input_json)
615
+
616
+
1292
617
  def write_benchmark_card(
1293
618
  *,
1294
619
  payload: Dict[str, Any],
1295
620
  platform: str,
1296
621
  card_type: str,
1297
622
  card_root: Optional[str],
1298
- sample_author: Optional[str] = None,
1299
623
  content_kind: Optional[str] = None,
1300
624
  storage_config: Optional[Dict[str, Any]] = None,
1301
625
  force_card_type: bool = False,
626
+ analysis_mode: str = "auto",
627
+ progress: Optional[ProgressReporter] = None,
1302
628
  ) -> Dict[str, Any]:
629
+ started_at = time.perf_counter()
1303
630
  now = dt.datetime.now()
1304
631
  generated_at = now.isoformat(timespec="seconds")
1305
632
 
1306
633
  payload_content_kind = normalize_text(payload.get("content_kind"))
1307
634
  resolved_content_kind = normalize_text(content_kind) or payload_content_kind
1308
-
1309
635
  normalized_card_type = normalize_card_type(card_type)
1310
636
  effective_card_type = resolve_effective_card_type(
1311
637
  card_type=normalized_card_type,
@@ -1313,29 +639,61 @@ def write_benchmark_card(
1313
639
  storage_config=storage_config,
1314
640
  force_card_type=force_card_type,
1315
641
  )
642
+ if effective_card_type != "work":
643
+ effective_card_type = "work"
644
+
1316
645
  fields = _extract_required_fields(payload, platform=platform)
1317
- resolved_card_root = _resolve_card_root(card_root)
646
+ if progress is not None:
647
+ progress.progress(stage="card.analysis", message="building card analysis")
648
+ if _has_meaningful_analysis_sections(payload.get("analysis_sections")):
649
+ existing = payload.get("analysis_sections")
650
+ meta = existing.get("meta") if isinstance(existing, dict) and isinstance(existing.get("meta"), dict) else {}
651
+ analysis_sections = ensure_analysis_sections_schema(
652
+ existing,
653
+ provider=normalize_text(existing.get("provider")) or "local",
654
+ llm_used=bool(meta.get("llm_used")),
655
+ degraded=bool(meta.get("degraded")),
656
+ reason=normalize_text(meta.get("reason")),
657
+ duration_ms=_safe_int(meta.get("duration_ms"), default=0),
658
+ )
659
+ else:
660
+ analysis_sections = build_analysis_sections(
661
+ fields,
662
+ analysis_mode=analysis_mode,
663
+ analysis_config=storage_config.get("analysis") if isinstance(storage_config, dict) else None,
664
+ progress=progress.child(scope="card.analysis") if progress is not None else None,
665
+ )
666
+ fields["analysis_sections"] = analysis_sections
1318
667
 
668
+ payload["analysis_sections"] = analysis_sections
669
+ payload["asr_raw"] = fields.get("asr_raw")
670
+ payload["asr_clean"] = fields.get("asr_clean")
671
+ payload["primary_text"] = fields.get("primary_text")
672
+ payload["primary_text_source"] = fields.get("primary_text_source")
673
+ deep_analysis = _analysis_status_from_sections(analysis_sections)
674
+ payload["deep_analysis"] = deep_analysis
675
+
676
+ resolved_card_root = _resolve_card_root(card_root)
1319
677
  primary_target = _build_output_path(
1320
678
  card_root=resolved_card_root,
1321
679
  platform=platform,
1322
680
  card_type=effective_card_type,
1323
681
  payload=payload,
1324
682
  now=now,
1325
- sample_author=sample_author,
1326
683
  storage_config=storage_config,
1327
684
  )
1328
685
  primary_path = primary_target["path"]
1329
-
1330
686
  primary_card_id = os.path.basename(primary_path).replace(".md", "")
1331
- primary_markdown = _render_markdown(
687
+
688
+ markdown = _render_markdown(
1332
689
  card_id=primary_card_id,
1333
690
  card_type=effective_card_type,
1334
691
  fields=fields,
1335
692
  generated_at=generated_at,
1336
693
  )
1337
- _write_file(primary_path, primary_markdown)
694
+ _write_file(primary_path, markdown)
1338
695
 
696
+ duration_ms = int((time.perf_counter() - started_at) * 1000)
1339
697
  return {
1340
698
  "ok": True,
1341
699
  "platform": platform,
@@ -1349,34 +707,25 @@ def write_benchmark_card(
1349
707
  "storage_routes_configured": bool(isinstance(storage_config, dict) and isinstance(storage_config.get("storage_routes"), dict)),
1350
708
  },
1351
709
  "required_fields": fields,
710
+ "analysis_sections": analysis_sections,
711
+ "analysis_status": deep_analysis,
712
+ "duration_ms": duration_ms,
713
+ "llm_analysis_ms": _safe_int(analysis_sections.get("meta", {}).get("duration_ms"), default=0),
1352
714
  }
1353
715
 
1354
716
 
1355
- def _read_payload_from_input(input_json: str) -> Dict[str, Any]:
1356
- if input_json == "-":
1357
- raw = os.read(0, 1024 * 1024).decode("utf-8", errors="replace").strip()
1358
- if not raw:
1359
- return {}
1360
- return json.loads(raw)
1361
- return read_json_file(input_json)
1362
-
1363
-
1364
717
  def main() -> None:
1365
- parser = argparse.ArgumentParser(description="Write benchmark card markdown to card root")
718
+ parser = argparse.ArgumentParser(description="Write single-work benchmark card markdown to card root")
1366
719
  parser.add_argument("--platform", required=True, help="Platform name, e.g. douyin or xiaohongshu")
1367
720
  parser.add_argument("--card-type", choices=CARD_TYPES, default="work", help="Primary card type")
721
+ parser.add_argument("--analysis-mode", choices=["auto", "local"], default="auto", help="Card analysis mode")
1368
722
  parser.add_argument("--config", default=None, help="Runtime config YAML path")
1369
723
  parser.add_argument("--env-file", default=None, help="Shared env file path; defaults to <skills_root>/.env")
1370
724
  parser.add_argument("--allow-process-env", action="store_true", help="Allow process env to override .env/.env.local")
1371
- parser.add_argument("--sample-author", default=None, help="Optional author slug override for author_sample_work")
1372
- parser.add_argument("--content-kind", default=None, help="Optional workflow kind, e.g. single_video/author_home/author_analysis")
725
+ parser.add_argument("--content-kind", default=None, help="Optional workflow kind, e.g. single_video/note/work")
1373
726
  parser.add_argument("--force-card-type", action="store_true", help="Force manual --card-type to override content_kind mapping")
1374
727
  parser.add_argument("--card-root", default=None, help="Card root path (absolute); falls back to TIKOMNI_CARD_ROOT when omitted")
1375
- parser.add_argument(
1376
- "--input-json",
1377
- default="-",
1378
- help="Input JSON path or '-' to read from stdin",
1379
- )
728
+ parser.add_argument("--input-json", default="-", help="Input JSON path or '-' to read from stdin")
1380
729
  args = parser.parse_args()
1381
730
 
1382
731
  config, _ = load_tikomni_config(
@@ -1390,10 +739,11 @@ def main() -> None:
1390
739
  platform=args.platform,
1391
740
  card_type=args.card_type,
1392
741
  card_root=args.card_root,
1393
- sample_author=args.sample_author,
1394
742
  content_kind=args.content_kind,
1395
743
  storage_config=config,
1396
744
  force_card_type=args.force_card_type,
745
+ analysis_mode=args.analysis_mode,
746
+ progress=None,
1397
747
  )
1398
748
  write_json_stdout(result)
1399
749