@tikomni/skills 0.1.1 → 0.1.2

This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (21)
  1. package/package.json +1 -1
  2. package/skills/creator-analysis/SKILL.md +34 -10
  3. package/skills/creator-analysis/references/contracts/creator-card-fields.md +2 -0
  4. package/skills/creator-analysis/references/contracts/work-card-fields.md +40 -4
  5. package/skills/creator-analysis/references/platform-guides/douyin.md +41 -36
  6. package/skills/creator-analysis/references/platform-guides/generic.md +11 -7
  7. package/skills/creator-analysis/references/platform-guides/xiaohongshu.md +45 -30
  8. package/skills/creator-analysis/references/schemas/author-analysis-v2.schema.json +224 -95
  9. package/skills/creator-analysis/references/workflow.md +8 -3
  10. package/skills/creator-analysis/scripts/author_home/adapters/platform_adapters.py +205 -21
  11. package/skills/creator-analysis/scripts/author_home/analyzers/author_analysis_v2_support.py +54 -11
  12. package/skills/creator-analysis/scripts/author_home/analyzers/prompt_first_analyzers.py +200 -13
  13. package/skills/creator-analysis/scripts/author_home/analyzers/sampled_work_batch_explainer.py +113 -42
  14. package/skills/creator-analysis/scripts/author_home/asr/home_asr.py +65 -7
  15. package/skills/creator-analysis/scripts/author_home/builders/home_builders.py +82 -18
  16. package/skills/creator-analysis/scripts/author_home/collectors/homepage_collectors.py +198 -32
  17. package/skills/creator-analysis/scripts/author_home/orchestrator/run_author_analysis.py +374 -31
  18. package/skills/creator-analysis/scripts/author_home/orchestrator/work_analysis_artifacts.py +68 -12
  19. package/skills/creator-analysis/scripts/core/storage_router.py +3 -0
  20. package/skills/creator-analysis/scripts/writers/write_author_homepage_samples.py +3 -2
  21. package/skills/creator-analysis/scripts/writers/write_benchmark_card.py +314 -137
package/skills/creator-analysis/scripts/author_home/analyzers/prompt_first_analyzers.py

@@ -10,9 +10,13 @@ import subprocess
 from typing import Any, Dict, List, Tuple
 
 from scripts.author_home.analyzers.author_analysis_v2_support import (
+    AnalysisResourceError,
+    OUTPUT_SCHEMA_PATH,
+    PROMPT_CONTRACT_PATH,
     build_author_analysis_input_v1,
     build_fallback_author_analysis_v2,
     derive_legacy_summary,
+    prepare_author_analysis_bundle,
     prompt_contract_text,
     validate_author_analysis_v2,
 )
@@ -106,8 +110,7 @@ def _compact_analysis_input_for_prompt(analysis_input: Dict[str, Any]) -> Dict[s
 
 
 
-def _build_prompt(analysis_input: Dict[str, Any], sampled_work_explanations: Dict[str, Any]) -> str:
-    contract_prompt = prompt_contract_text()
+def _build_prompt(analysis_input: Dict[str, Any], sampled_work_explanations: Dict[str, Any], *, contract_prompt: str) -> str:
     prompt_input = _compact_analysis_input_for_prompt(analysis_input)
     prompt_payload = {"author_analysis_input_v1": prompt_input}
     if isinstance(sampled_work_explanations, dict) and sampled_work_explanations.get("sampled_work_explanations"):
@@ -126,6 +129,17 @@ def _build_prompt(analysis_input: Dict[str, Any], sampled_work_explanations: Dic
     )
 
 
+def _stage_status(*, status: str, ok_count: int, failed_count: int, degraded_count: int, reason_codes: List[str], failure_kind: str = "") -> Dict[str, Any]:
+    return {
+        "status": status,
+        "ok_count": ok_count,
+        "failed_count": failed_count,
+        "degraded_count": degraded_count,
+        "reason_codes": list(dict.fromkeys([code for code in reason_codes if code])),
+        "failure_kind": failure_kind or None,
+    }
+
+
 def _extract_json_block(text: str) -> Dict[str, Any]:
     content = (text or "").strip()
     if not content:
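Side note on the new helper: `_stage_status` normalizes every stage report into one dict shape, deduplicating reason codes (order-preserving, empty strings dropped) and mapping an empty `failure_kind` to None. A minimal illustration with hypothetical values:

    _stage_status(
        status="degraded",
        ok_count=0,
        failed_count=0,
        degraded_count=3,
        reason_codes=["timeout", "", "timeout", "rate_limit"],
    )
    # -> {"status": "degraded", "ok_count": 0, "failed_count": 0, "degraded_count": 3,
    #     "reason_codes": ["timeout", "rate_limit"], "failure_kind": None}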
@@ -159,9 +173,29 @@ def _unwrap_author_analysis(payload: Dict[str, Any]) -> Dict[str, Any]:
     return payload
 
 
-def run_prompt_first_author_analysis(profile: Dict[str, Any], works: List[Dict[str, Any]]) -> Tuple[Dict[str, Any], List[Dict[str, str]], List[Dict[str, Any]]]:
-    analysis_input, input_errors = build_author_analysis_input_v1(profile=profile, works=works, platform=str(profile.get("platform") or "unknown"))
-    sampled_work_explanations, sampled_explanation_errors, sampled_explanation_trace = run_sampled_work_batch_explanations(analysis_input)
+def run_prompt_first_author_analysis(
+    profile: Dict[str, Any],
+    works: List[Dict[str, Any]],
+    *,
+    analysis_bundle: Dict[str, Any] | None = None,
+) -> Tuple[Dict[str, Any], List[Dict[str, str]], List[Dict[str, Any]]]:
+    prepared = analysis_bundle if isinstance(analysis_bundle, dict) else prepare_author_analysis_bundle(
+        profile=profile,
+        works=works,
+        platform=str(profile.get("platform") or "unknown"),
+    )
+    analysis_input = prepared.get("analysis_input") if isinstance(prepared.get("analysis_input"), dict) else {}
+    input_errors: List[Dict[str, str]] = []
+    input_resource_error: AnalysisResourceError | None = None
+    try:
+        input_errors = build_author_analysis_input_v1(
+            profile=profile,
+            works=works,
+            platform=str(profile.get("platform") or "unknown"),
+        )[1]
+    except AnalysisResourceError as error:
+        input_resource_error = error
+    sampled_work_explanations, sampled_explanation_errors, sampled_explanation_trace, sampled_explanations_status = run_sampled_work_batch_explanations(analysis_input)
     sampled_works_count = len(analysis_input.get("sampled_works") or [])
     total_works = ((analysis_input.get("aggregate_stats") or {}).get("total_works") if isinstance(analysis_input.get("aggregate_stats"), dict) else 0)
     llm_timeout_sec = max(int(os.getenv("TIKOMNI_AUTHOR_ANALYSIS_TIMEOUT_SEC", str(DEFAULT_ANALYSIS_TIMEOUT_SEC))), 5)
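Callers that have already run `prepare_author_analysis_bundle` can pass the result through the new keyword to avoid rebuilding the input. A hypothetical call site (the platform value is illustrative):

    bundle = prepare_author_analysis_bundle(profile=profile, works=works, platform="douyin")
    result, errors, trace = run_prompt_first_author_analysis(profile, works, analysis_bundle=bundle)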
@@ -173,20 +207,46 @@ def run_prompt_first_author_analysis(profile: Dict[str, Any], works: List[Dict[s
             "total_works": total_works,
             "sampled_works_count": sampled_works_count,
             "prompt_contract": f"prompt-contracts/{AUTHOR_ANALYSIS_PROMPT_FILE}@v1",
+            "contract_path": str(PROMPT_CONTRACT_PATH),
+            "schema_path": str(OUTPUT_SCHEMA_PATH),
             "llm_timeout_sec": llm_timeout_sec,
             "small_sample_skip_threshold": small_sample_skip_threshold,
         }
     ] + sampled_explanation_trace
     if input_errors:
         trace.append({"step": "analysis.input_validation_failed", "error_count": len(input_errors)})
+    if input_resource_error is not None:
+        trace.append({"step": "analysis.input_resource_error", "error": str(input_resource_error)})
+        result = {
+            **derive_legacy_summary({}, analysis_input=analysis_input, validation_errors=[]),
+            "author_analysis_v2": {},
+            "author_analysis_input_v1": analysis_input,
+            "sampled_work_explanations": sampled_work_explanations,
+            "sampled_explanations_status": sampled_explanations_status,
+            "author_analysis_status": _stage_status(
+                status="failed",
+                ok_count=0,
+                failed_count=1,
+                degraded_count=0,
+                reason_codes=[input_resource_error.code],
+                failure_kind="configuration",
+            ),
+            "quality_tier": "failed",
+            "validation": {
+                "ok": False,
+                "errors": [],
+            },
+        }
+        return result, [], trace
     if sampled_explanation_errors:
         trace.append({"step": "analysis.sampled_work_explanations_validation_failed", "error_count": len(sampled_explanation_errors)})
 
-    prompt = _build_prompt(analysis_input, sampled_work_explanations)
     response_text = ""
     analysis_v2: Dict[str, Any] = {}
     llm_ok = False
     skip_llm = sampled_works_count < small_sample_skip_threshold
+    author_reason_codes: List[str] = []
+    author_status = _stage_status(status="failed", ok_count=0, failed_count=1, degraded_count=0, reason_codes=["analysis_not_started"])
     if skip_llm:
         trace.append(
             {
@@ -196,7 +256,42 @@ def run_prompt_first_author_analysis(profile: Dict[str, Any], works: List[Dict[s
                 "threshold": small_sample_skip_threshold,
             }
         )
+        author_reason_codes.append("small_sample_below_threshold")
     else:
+        try:
+            contract_prompt = prompt_contract_text()
+            trace.append(
+                {
+                    "step": "analysis.resources_loaded",
+                    "contract_loaded": True,
+                    "contract_chars": len(contract_prompt),
+                }
+            )
+            prompt = _build_prompt(analysis_input, sampled_work_explanations, contract_prompt=contract_prompt)
+        except AnalysisResourceError as error:
+            trace.append({"step": "analysis.resource_error", "error": str(error)})
+            author_status = _stage_status(
+                status="failed",
+                ok_count=0,
+                failed_count=1,
+                degraded_count=0,
+                reason_codes=[error.code],
+                failure_kind="configuration",
+            )
+            result = {
+                **derive_legacy_summary({}, analysis_input=analysis_input, validation_errors=input_errors),
+                "author_analysis_v2": {},
+                "author_analysis_input_v1": analysis_input,
+                "sampled_work_explanations": sampled_work_explanations,
+                "sampled_explanations_status": sampled_explanations_status,
+                "author_analysis_status": author_status,
+                "quality_tier": "failed",
+                "validation": {
+                    "ok": False,
+                    "errors": input_errors,
+                },
+            }
+            return result, input_errors, trace
         try:
             run = subprocess.run(
                 ["openclaw", "agent", "--agent", "main", "--message", prompt, "--json"],
@@ -226,32 +321,124 @@ def run_prompt_first_author_analysis(profile: Dict[str, Any], works: List[Dict[s
             )
         except Exception as error:
             trace.append({"step": "analysis.llm_error", "error": f"{type(error).__name__}:{error}"})
+            author_reason_codes.append("author_llm_runtime_error")
+
+    validation_errors: List[Dict[str, str]] = []
+    resource_error: AnalysisResourceError | None = None
+    if analysis_v2:
+        try:
+            validation_errors = validate_author_analysis_v2(analysis_v2, analysis_input=analysis_input)
+            trace.append({"step": "analysis.output_schema_loaded", "schema_loaded": True})
+        except AnalysisResourceError as error:
+            resource_error = error
+
+    if resource_error is not None:
+        trace.append(
+            {
+                "step": "analysis.resource_error",
+                "error": str(resource_error),
+                "contract_path": str(resource_error.path) if resource_error.code == "contract_load_failed" else str(resource_error.path),
+            }
+        )
+        author_status = _stage_status(
+            status="failed",
+            ok_count=0,
+            failed_count=1,
+            degraded_count=0,
+            reason_codes=[resource_error.code],
+            failure_kind="configuration",
+        )
+        result = {
+            **derive_legacy_summary({}, analysis_input=analysis_input, validation_errors=input_errors),
+            "author_analysis_v2": {},
+            "author_analysis_input_v1": analysis_input,
+            "sampled_work_explanations": sampled_work_explanations,
+            "sampled_explanations_status": sampled_explanations_status,
+            "author_analysis_status": author_status,
+            "quality_tier": "failed",
+            "validation": {
+                "ok": False,
+                "errors": input_errors,
+            },
+        }
+        return result, input_errors, trace
+
+    if not analysis_v2 or validation_errors or skip_llm:
+        try:
+            fallback = build_fallback_author_analysis_v2(analysis_input)
+            fallback_errors = validate_author_analysis_v2(fallback, analysis_input=analysis_input)
+        except AnalysisResourceError as error:
+            trace.append({"step": "analysis.fallback_resource_error", "error": str(error)})
+            author_status = _stage_status(
+                status="failed",
+                ok_count=0,
+                failed_count=1,
+                degraded_count=0,
+                reason_codes=[error.code],
+                failure_kind="configuration",
+            )
+            result = {
+                **derive_legacy_summary({}, analysis_input=analysis_input, validation_errors=input_errors),
+                "author_analysis_v2": {},
+                "author_analysis_input_v1": analysis_input,
+                "sampled_work_explanations": sampled_work_explanations,
+                "sampled_explanations_status": sampled_explanations_status,
+                "author_analysis_status": author_status,
+                "quality_tier": "failed",
+                "validation": {
+                    "ok": False,
+                    "errors": input_errors,
+                },
+            }
+            return result, input_errors, trace
 
-    validation_errors = validate_author_analysis_v2(analysis_v2, analysis_input=analysis_input) if analysis_v2 else []
-    if not analysis_v2 or validation_errors:
-        fallback = build_fallback_author_analysis_v2(analysis_input)
-        fallback_errors = validate_author_analysis_v2(fallback, analysis_input=analysis_input)
         trace.append(
             {
                 "step": "analysis.fallback_used",
-                "reason": "llm_empty_or_validation_failed",
+                "reason": "small_sample_below_threshold" if skip_llm else "llm_empty_or_validation_failed",
                 "llm_ok": llm_ok,
                 "validation_error_count": len(validation_errors),
                 "fallback_error_count": len(fallback_errors),
             }
         )
         analysis_v2 = fallback
-        validation_errors = input_errors + sampled_explanation_errors + validation_errors + fallback_errors
+        validation_errors = input_errors + validation_errors + fallback_errors
+        author_status = _stage_status(
+            status="fallback",
+            ok_count=1 if analysis_v2 else 0,
+            failed_count=0 if analysis_v2 else 1,
+            degraded_count=0,
+            reason_codes=author_reason_codes or ["fallback_used"],
+            failure_kind="runtime",
+        )
     else:
-        validation_errors = input_errors + sampled_explanation_errors + validation_errors
+        validation_errors = input_errors + validation_errors
         trace.append({"step": "analysis.schema_validation_passed"})
+        author_status = _stage_status(
+            status="full",
+            ok_count=1,
+            failed_count=0,
+            degraded_count=0,
+            reason_codes=[],
+        )
 
     legacy = derive_legacy_summary(analysis_v2, analysis_input=analysis_input, validation_errors=validation_errors)
+    if author_status.get("status") == "failed":
+        quality_tier = "failed"
+    elif author_status.get("status") == "fallback":
+        quality_tier = "fallback"
+    elif sampled_explanations_status.get("status") != "full":
+        quality_tier = "degraded_author_only"
+    else:
+        quality_tier = "full"
     result = {
         **legacy,
         "author_analysis_v2": analysis_v2,
         "author_analysis_input_v1": analysis_input,
         "sampled_work_explanations": sampled_work_explanations,
+        "sampled_explanations_status": sampled_explanations_status,
+        "author_analysis_status": author_status,
+        "quality_tier": quality_tier,
         "validation": {
             "ok": not bool(validation_errors),
             "errors": validation_errors,
package/skills/creator-analysis/scripts/author_home/analyzers/sampled_work_batch_explainer.py

@@ -13,8 +13,9 @@ from typing import Any, Dict, List, Tuple
 import jsonschema
 
 
-PROMPT_CONTRACT_PATH = Path(__file__).resolve().parents[2] / "references" / "prompt-contracts" / "sampled-work-batch-explanations.md"
-SCHEMA_PATH = Path(__file__).resolve().parents[2] / "references" / "schemas" / "sampled-work-batch-explanations.schema.json"
+SKILL_ROOT = Path(__file__).resolve().parents[3]
+PROMPT_CONTRACT_PATH = SKILL_ROOT / "references" / "prompt-contracts" / "sampled-work-batch-explanations.md"
+SCHEMA_PATH = SKILL_ROOT / "references" / "schemas" / "sampled-work-batch-explanations.schema.json"
 DEFAULT_TIMEOUT_SEC = 45
 TEXT_LIMITS = {
     "title": 120,
@@ -24,6 +25,17 @@ TEXT_LIMITS = {
 }
 
 
+class SampledExplanationResourceError(RuntimeError):
+    def __init__(self, *, code: str, path: Path, detail: str = "") -> None:
+        self.code = code
+        self.path = path
+        self.detail = detail
+        message = f"{code}:{path}"
+        if detail:
+            message = f"{message}:{detail}"
+        super().__init__(message)
+
+
 def _safe_text(value: Any) -> str:
     if value is None:
         return ""
@@ -42,14 +54,13 @@ def _truncate_text(value: Any, limit: int) -> str:
 def _load_json(path: Path) -> Dict[str, Any]:
     try:
         return json.loads(path.read_text(encoding="utf-8"))
-    except Exception:
-        return {}
+    except Exception as error:
+        raise SampledExplanationResourceError(code="schema_load_failed", path=path, detail=f"{type(error).__name__}:{error}") from error
 
 
-def _schema_errors(payload: Any) -> List[Dict[str, str]]:
-    schema = _load_json(SCHEMA_PATH)
+def _schema_errors(payload: Any, schema: Dict[str, Any]) -> List[Dict[str, str]]:
     if not schema:
-        return []
+        raise SampledExplanationResourceError(code="schema_empty", path=SCHEMA_PATH)
     try:
         validator = jsonschema.Draft202012Validator(schema)
         errors: List[Dict[str, str]] = []
@@ -64,8 +75,8 @@ def _schema_errors(payload: Any) -> List[Dict[str, str]]:
 def _prompt_contract_text() -> str:
     try:
         return PROMPT_CONTRACT_PATH.read_text(encoding="utf-8").strip()
-    except Exception:
-        return ""
+    except Exception as error:
+        raise SampledExplanationResourceError(code="contract_load_failed", path=PROMPT_CONTRACT_PATH, detail=f"{type(error).__name__}:{error}") from error
 
 
 def _extract_json_block(text: str) -> Dict[str, Any]:
@@ -144,40 +155,20 @@ def _compact_input(analysis_input: Dict[str, Any]) -> Dict[str, Any]:
     }
 
 
-def _build_prompt(analysis_input: Dict[str, Any]) -> str:
+def _build_prompt(analysis_input: Dict[str, Any], *, contract_text: str) -> str:
     compacted = _compact_input(analysis_input)
     return (
         "请严格根据以下提示词原文输出,结果必须是 JSON 对象,且只输出 JSON。\n"
         "顶层对象必须是 sampled_work_explanations。\n"
         "不得输出 markdown,不得输出解释。\n\n"
         "=== 提示词原文开始 ===\n"
-        f"{_prompt_contract_text()}\n"
+        f"{contract_text}\n"
         "=== 提示词原文结束 ===\n\n"
         "=== 输入数据(JSON) ===\n"
         f"{json.dumps(compacted, ensure_ascii=False)}"
     )
 
 
-def _fallback_explanations(analysis_input: Dict[str, Any]) -> Dict[str, Any]:
-    sampled = analysis_input.get("sampled_works") if isinstance(analysis_input.get("sampled_works"), list) else []
-    explanations: Dict[str, Any] = {}
-    for item in sampled:
-        if not isinstance(item, dict):
-            continue
-        work_id = _safe_text(item.get("platform_work_id"))
-        if not work_id:
-            continue
-        explanations[work_id] = {
-            "why_it_worked_or_failed": f"该样本主要依赖 { _safe_text(item.get('hook_type')) or 'hook' }、{ _safe_text(item.get('structure_type')) or 'structure' } 与 { _safe_text(item.get('content_form')) or 'content_form' } 的组合。",
-            "copyable_elements": [value for value in [_safe_text(item.get("hook_type")), _safe_text(item.get("structure_type")), _safe_text(item.get("cta_type"))] if value],
-            "non_copyable_elements": ["具体个人经历或原始案例背书"],
-            "emotional_triggers": [_safe_text(item.get("hook_type")) or "结果预期"],
-            "cognitive_gap": "观众想知道为什么这个结构能成立,以及自己如何快速套用。",
-            "commercial_signal": "从 CTA 与内容结构看,具备基础商业承接意图,但证据仍有限。",
-        }
-    return {"sampled_work_explanations": explanations}
-
-
 def _coverage_errors(payload: Dict[str, Any], analysis_input: Dict[str, Any]) -> List[Dict[str, str]]:
     sampled = analysis_input.get("sampled_works") if isinstance(analysis_input.get("sampled_works"), list) else []
     explanations = payload.get("sampled_work_explanations") if isinstance(payload.get("sampled_work_explanations"), dict) else {}
@@ -193,24 +184,84 @@ def _coverage_errors(payload: Dict[str, Any], analysis_input: Dict[str, Any]) ->
     return errors
 
 
-def run_sampled_work_batch_explanations(analysis_input: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dict[str, str]], List[Dict[str, Any]]]:
+def _classify_runtime_reason(text: str) -> str:
+    lowered = (text or "").lower()
+    if "timeoutexpired" in lowered or "timeout" in lowered:
+        return "timeout"
+    if "rate limit" in lowered or "ratelimit" in lowered or "429" in lowered:
+        return "rate_limit"
+    return "transient_llm_error"
+
+
+def _stage_status(*, status: str, ok_count: int, failed_count: int, degraded_count: int, reason_codes: List[str], failure_kind: str = "") -> Dict[str, Any]:
+    return {
+        "status": status,
+        "ok_count": ok_count,
+        "failed_count": failed_count,
+        "degraded_count": degraded_count,
+        "reason_codes": list(dict.fromkeys([code for code in reason_codes if code])),
+        "failure_kind": failure_kind or None,
+    }
+
+
+def run_sampled_work_batch_explanations(analysis_input: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dict[str, str]], List[Dict[str, Any]], Dict[str, Any]]:
     sampled = analysis_input.get("sampled_works") if isinstance(analysis_input.get("sampled_works"), list) else []
     trace: List[Dict[str, Any]] = [
         {
             "step": "sampled_work_explanations.input_built",
             "sampled_works_count": len(sampled),
             "prompt_contract": "prompt-contracts/sampled-work-batch-explanations.md@v1",
+            "contract_path": str(PROMPT_CONTRACT_PATH),
+            "schema_path": str(SCHEMA_PATH),
         }
     ]
 
     if not sampled:
         trace.append({"step": "sampled_work_explanations.skipped", "reason": "empty_sampled_works"})
-        return {"sampled_work_explanations": {}}, [], trace
+        return {"sampled_work_explanations": {}}, [], trace, _stage_status(
+            status="skipped",
+            ok_count=0,
+            failed_count=0,
+            degraded_count=0,
+            reason_codes=["empty_sampled_works"],
+        )
 
     llm_timeout_sec = max(int(os.getenv("TIKOMNI_SAMPLED_EXPLANATION_TIMEOUT_SEC", str(DEFAULT_TIMEOUT_SEC))), 5)
-    prompt = _build_prompt(analysis_input)
     result: Dict[str, Any] = {}
     errors: List[Dict[str, str]] = []
+    reason_codes: List[str] = []
+
+    try:
+        contract_text = _prompt_contract_text()
+        schema = _load_json(SCHEMA_PATH)
+        trace.append(
+            {
+                "step": "sampled_work_explanations.resources_loaded",
+                "contract_loaded": True,
+                "contract_chars": len(contract_text),
+                "schema_loaded": True,
+            }
+        )
+    except SampledExplanationResourceError as error:
+        trace.append(
+            {
+                "step": "sampled_work_explanations.resource_error",
+                "contract_loaded": error.code != "contract_load_failed",
+                "contract_chars": 0,
+                "schema_loaded": error.code not in {"schema_load_failed", "schema_empty"},
+                "error": str(error),
+            }
+        )
+        return {"sampled_work_explanations": {}}, [], trace, _stage_status(
+            status="failed",
+            ok_count=0,
+            failed_count=len(sampled),
+            degraded_count=0,
+            reason_codes=[error.code],
+            failure_kind="configuration",
+        )
+
+    prompt = _build_prompt(analysis_input, contract_text=contract_text)
 
     try:
         run = subprocess.run(
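The classifier keys off substrings of the stringified exception, so a subprocess.TimeoutExpired lands in "timeout" and an HTTP 429 in "rate_limit"; everything else falls through to the generic bucket. Illustrative calls:

    _classify_runtime_reason("TimeoutExpired:Command timed out")  # -> "timeout"
    _classify_runtime_reason("HTTPError:429 Too Many Requests")   # -> "rate_limit"
    _classify_runtime_reason("JSONDecodeError:Expecting value")   # -> "transient_llm_error"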
@@ -238,23 +289,43 @@ def run_sampled_work_batch_explanations(analysis_input: Dict[str, Any]) -> Tuple
             }
         )
     except Exception as error:
-        trace.append({"step": "sampled_work_explanations.llm_error", "error": f"{type(error).__name__}:{error}"})
+        reason_code = _classify_runtime_reason(f"{type(error).__name__}:{error}")
+        trace.append({"step": "sampled_work_explanations.llm_error", "error": f"{type(error).__name__}:{error}", "reason_code": reason_code})
+        return {"sampled_work_explanations": {}}, [], trace, _stage_status(
+            status="degraded",
+            ok_count=0,
+            failed_count=0,
+            degraded_count=len(sampled),
+            reason_codes=[reason_code],
+            failure_kind="runtime",
+        )
 
-    errors = _schema_errors(result) if result else [{"field": "$", "reason": "empty_result"}]
+    errors = _schema_errors(result, schema) if result else [{"field": "$", "reason": "empty_result"}]
     if not errors:
         errors.extend(_coverage_errors(result, analysis_input))
     if errors:
-        fallback = _fallback_explanations(analysis_input)
-        fallback_errors = _schema_errors(fallback) + _coverage_errors(fallback, analysis_input)
         trace.append(
             {
-                "step": "sampled_work_explanations.fallback_used",
+                "step": "sampled_work_explanations.validation_failed",
                 "reason": "llm_empty_or_validation_failed",
                 "validation_error_count": len(errors),
-                "fallback_error_count": len(fallback_errors),
             }
         )
-        return fallback, errors + fallback_errors, trace
+        reason_codes.append("validation_failed")
+        return {"sampled_work_explanations": {}}, errors, trace, _stage_status(
+            status="degraded",
+            ok_count=0,
+            failed_count=0,
+            degraded_count=len(sampled),
+            reason_codes=reason_codes,
+            failure_kind="runtime",
+        )
 
     trace.append({"step": "sampled_work_explanations.schema_validation_passed"})
-    return result, [], trace
+    return result, [], trace, _stage_status(
+        status="full",
+        ok_count=len(sampled),
+        failed_count=0,
+        degraded_count=0,
+        reason_codes=[],
+    )
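Call sites must adjust for the widened return: the function now yields a four-tuple whose last element is the stage status dict. A hypothetical consumer (the logger is illustrative):

    payload, errors, trace, status = run_sampled_work_batch_explanations(analysis_input)
    if status["status"] != "full":
        log.warning("sampled explanations degraded: %s", status["reason_codes"])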
package/skills/creator-analysis/scripts/author_home/asr/home_asr.py

@@ -363,17 +363,68 @@ def _iter_xhs_interface_text_candidates(work: Dict[str, Any]) -> List[Tuple[str,
     return deduped
 
 
-def _resolve_xhs_subtitle(work: Dict[str, Any], timeout_ms: int) -> Tuple[str, str, List[str], str]:
+def _classify_xhs_subtitle_failure(*, work: Dict[str, Any], interface_candidates: List[Tuple[str, str]], subtitle_urls: List[str], invalid_reason: str) -> str:
+    raw_ref = work.get("raw_ref") if isinstance(work.get("raw_ref"), dict) else {}
+    has_subtitle_signal = any(
+        normalize_text(raw_ref.get(key))
+        for key in (
+            "subtitle_inline",
+            "subtitle_text",
+            "subtitle_raw",
+            "caption_text",
+            "transcript_text",
+        )
+    )
+    if interface_candidates:
+        return "subtitle_content_invalid"
+    if subtitle_urls:
+        return "subtitle_url_unavailable"
+    if has_subtitle_signal:
+        return "subtitle_structure_unrecognized"
+    if invalid_reason == "subtitle_empty":
+        return "subtitle_missing"
+    return "subtitle_content_invalid"
+
+
+def _resolve_xhs_subtitle(work: Dict[str, Any], timeout_ms: int) -> Dict[str, Any]:
     raw_ref = work.get("raw_ref") if isinstance(work.get("raw_ref"), dict) else {}
     subtitle_urls = raw_ref.get("subtitle_urls") if isinstance(raw_ref.get("subtitle_urls"), list) else []
     subtitle_urls = [normalize_text(item) for item in subtitle_urls if normalize_text(item)]
-
-    for source, candidate in _iter_xhs_interface_text_candidates(work):
-        if _invalid_subtitle_reason(candidate) is None:
-            return candidate, "interface", subtitle_urls, source
+    interface_candidates = _iter_xhs_interface_text_candidates(work)
+    invalid_reasons: List[Dict[str, str]] = []
+
+    for source, candidate in interface_candidates:
+        invalid_reason = _invalid_subtitle_reason(candidate)
+        if invalid_reason is None:
+            return {
+                "text": candidate,
+                "subtitle_source": "interface",
+                "subtitle_field": source,
+                "subtitle_urls": subtitle_urls,
+                "invalid_reasons": invalid_reasons,
+                "failure_category": "",
+            }
+        invalid_reasons.append({"field": source, "reason": invalid_reason})
 
     fetched = _fetch_subtitle_text(subtitle_urls, timeout_ms=timeout_ms)
-    return _clean_text(fetched), "url", subtitle_urls, "subtitle_url"
+    cleaned = _clean_text(fetched)
+    fetched_invalid = _invalid_subtitle_reason(cleaned)
+    if fetched_invalid is not None and subtitle_urls:
+        invalid_reasons.append({"field": "subtitle_url", "reason": fetched_invalid})
+
+    return {
+        "text": cleaned,
+        "subtitle_source": "url" if subtitle_urls else "missing",
+        "subtitle_field": "subtitle_url" if subtitle_urls else "",
+        "subtitle_urls": subtitle_urls,
+        "invalid_reasons": invalid_reasons,
+        "failure_category": _classify_xhs_subtitle_failure(
+            work=work,
+            interface_candidates=interface_candidates,
+            subtitle_urls=subtitle_urls,
+            invalid_reason=fetched_invalid or "subtitle_empty",
+        ),
+    }
 
 
 def _dedupe_works_by_platform_id(works: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], int]:
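The failure categories are checked in strict priority order, so the taxonomy can be read straight off the branches (outcomes below are illustrative):

    # interface candidates existed but none validated  -> "subtitle_content_invalid"
    # no candidates, but subtitle URLs were listed      -> "subtitle_url_unavailable"
    # no candidates/URLs, yet raw_ref carries subtitle
    # fields that could not be parsed                   -> "subtitle_structure_unrecognized"
    # nothing at all and the text was simply empty      -> "subtitle_missing"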
@@ -734,7 +785,11 @@ def enrich_author_home_asr(
             )
             continue
 
-        subtitle_text, subtitle_source, subtitle_urls, subtitle_field = _resolve_xhs_subtitle(work, timeout_ms=timeout_ms)
+        subtitle_probe = _resolve_xhs_subtitle(work, timeout_ms=timeout_ms)
+        subtitle_text = normalize_text(subtitle_probe.get("text"))
+        subtitle_source = normalize_text(subtitle_probe.get("subtitle_source"))
+        subtitle_urls = subtitle_probe.get("subtitle_urls") if isinstance(subtitle_probe.get("subtitle_urls"), list) else []
+        subtitle_field = normalize_text(subtitle_probe.get("subtitle_field"))
         subtitle_invalid = _invalid_subtitle_reason(subtitle_text)
         if subtitle_invalid is None:
             work.update(
@@ -761,6 +816,7 @@ def enrich_author_home_asr(
                     "subtitle_source": subtitle_source,
                     "subtitle_field": subtitle_field,
                     "subtitle_url_count": len(subtitle_urls),
+                    "failure_category": "",
                 }
             )
         else:
@@ -774,6 +830,8 @@ def enrich_author_home_asr(
                     "subtitle_source": subtitle_source,
                     "subtitle_field": subtitle_field,
                     "subtitle_url_count": len(subtitle_urls),
+                    "failure_category": subtitle_probe.get("failure_category"),
+                    "invalid_reasons": subtitle_probe.get("invalid_reasons"),
                 }
             )