@tikomni/skills 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tikomni/skills",
3
- "version": "1.0.0",
3
+ "version": "1.0.2",
4
4
  "description": "TikOmni skill installer CLI for structured social media crawling in Codex, Claude Code, and OpenClaw",
5
5
  "license": "MIT",
6
6
  "homepage": "https://github.com/mark-ly-wang/TikOmni-Skills#readme",
@@ -42,6 +42,7 @@
42
42
  - Fact fields for the Markdown card go into frontmatter. Do not emit a separate `## Facts` section.
43
43
  - The work-library directory writes only the Markdown card and no extra `.json` sidecar in the same directory.
44
44
  - `primary_text` is the text that is best suited for reading and indexing in the current task.
45
+ - `asr_raw` and `subtitle_raw` are internal preserved text fields. Keep them in the normalized card data, but do not render them as standalone sections in the Markdown body.
45
46
  - `play_count` may be `null`. Leave it empty when missing, and keep `0` only when the platform explicitly returns `0`.
46
47
  - Preferred order for video works:
47
48
  - `subtitle_raw`
@@ -8,7 +8,7 @@ import time
8
8
  import urllib.error
9
9
  import urllib.request
10
10
  from urllib.parse import urlparse, urlunparse
11
- from typing import Any, Callable, Dict, List, Optional
11
+ from typing import Any, Callable, Dict, List, Optional, Tuple
12
12
 
13
13
  from scripts.core.tikomni_common import (
14
14
  call_json_api,
@@ -23,6 +23,21 @@ from scripts.core.u3_fallback import run_u3_public_url_fallback
23
23
 
24
24
  U2_BATCH_SUBMIT_HARD_LIMIT = 100
25
25
  DEFAULT_U2_PENDING_TIMEOUT_SEC = 60
26
+ SUMMARY_TEXT_FIELDS = (
27
+ "full_text",
28
+ "transcript_text",
29
+ "transcription_text",
30
+ "result_text",
31
+ "summary_text",
32
+ "transcript",
33
+ "transcription",
34
+ "result",
35
+ "content",
36
+ "text",
37
+ )
38
+ SEGMENT_CONTAINER_FIELDS = ("sentences", "segments", "paragraphs")
39
+ SEGMENT_TEXT_FIELDS = ("text", "sentence", "content", "paragraph", "transcript_text")
40
+ CHAR_SPACED_RUN_RE = re.compile(r"(?:[A-Za-z0-9\u4e00-\u9fff]{1,4}\s+){5,}[A-Za-z0-9\u4e00-\u9fff]{1,4}")
26
41
 
27
42
 
28
43
  def clamp_u2_batch_submit_size(size: int, *, default: int = 50, hard_limit: int = U2_BATCH_SUBMIT_HARD_LIMIT) -> int:
@@ -251,6 +266,33 @@ def clean_transcript_text(raw_text: Any) -> str:
251
266
  return normalize_text(raw_text)
252
267
 
253
268
 
269
+ def _text_signature(text: str) -> str:
270
+ return re.sub(r"[\W_]+", "", clean_transcript_text(text)).lower()
271
+
272
+
273
+ def _is_char_spaced_noise_sequence(text: str) -> bool:
274
+ tokens = [token for token in clean_transcript_text(text).split(" ") if token]
275
+ if len(tokens) < 6:
276
+ return False
277
+ single_char_tokens = sum(1 for token in tokens if len(token) == 1)
278
+ short_tokens = sum(1 for token in tokens if len(token) <= 2)
279
+ cjk_tokens = sum(1 for token in tokens if any("\u4e00" <= char <= "\u9fff" for char in token))
280
+ return (
281
+ single_char_tokens >= 4
282
+ and short_tokens / max(len(tokens), 1) >= 0.75
283
+ and cjk_tokens / max(len(tokens), 1) >= 0.5
284
+ )
285
+
286
+
287
+ def _strip_char_spaced_noise_runs(text: str) -> str:
288
+ def _replace(match: re.Match[str]) -> str:
289
+ chunk = match.group(0)
290
+ return " " if _is_char_spaced_noise_sequence(chunk) else chunk
291
+
292
+ cleaned = CHAR_SPACED_RUN_RE.sub(_replace, text)
293
+ return re.sub(r"\s+", " ", cleaned).strip()
294
+
295
+
254
296
  def _ensure_sentence_end(text: str) -> str:
255
297
  if not text:
256
298
  return text
@@ -264,13 +306,36 @@ def derive_asr_clean_text(asr_raw: Any, legacy_clean: Any = None) -> str:
264
306
  if not base:
265
307
  return ""
266
308
 
267
- denoised = re.sub(r"\b(嗯|啊|呃|额|那个|这个|然后|就是)\b", " ", base)
309
+ denoised = _strip_char_spaced_noise_runs(base)
310
+ denoised = re.sub(r"\b(嗯|啊|呃|额|那个|这个|然后|就是)\b", " ", denoised)
268
311
  denoised = re.sub(r"(嗯+|啊+|呃+)", " ", denoised)
269
312
  denoised = re.sub(r"(就是就是|然后然后|这个这个|那个那个)", " ", denoised)
270
313
  denoised = re.sub(r"\s+", " ", denoised).strip()
271
314
 
272
315
  units = [clean_transcript_text(part) for part in re.split(r"[。!?!?;;\n]+", denoised)]
273
- sentences = [_ensure_sentence_end(unit) for unit in units if unit]
316
+ sentences: List[str] = []
317
+ signatures: List[str] = []
318
+ for unit in units:
319
+ if not unit or _is_char_spaced_noise_sequence(unit):
320
+ continue
321
+ sentence = _ensure_sentence_end(unit)
322
+ signature = _text_signature(sentence)
323
+ if not signature:
324
+ continue
325
+ duplicate = False
326
+ for existing in signatures:
327
+ if signature == existing:
328
+ duplicate = True
329
+ break
330
+ smaller = signature if len(signature) <= len(existing) else existing
331
+ larger = existing if len(signature) <= len(existing) else signature
332
+ if len(smaller) >= 12 and smaller in larger:
333
+ duplicate = True
334
+ break
335
+ if duplicate:
336
+ continue
337
+ signatures.append(signature)
338
+ sentences.append(sentence)
274
339
  if not sentences:
275
340
  fallback = _ensure_sentence_end(denoised)
276
341
  return fallback if fallback else ""
@@ -292,6 +357,94 @@ def derive_asr_clean_text(asr_raw: Any, legacy_clean: Any = None) -> str:
292
357
  return "\n\n".join(paragraphs)
293
358
 
294
359
 
360
+ def _extract_summary_text_from_node(node: Dict[str, Any]) -> Tuple[str, str]:
361
+ for key in SUMMARY_TEXT_FIELDS:
362
+ value = node.get(key)
363
+ if isinstance(value, str):
364
+ cleaned = clean_transcript_text(value)
365
+ if cleaned:
366
+ return cleaned, key
367
+ return "", ""
368
+
369
+
370
+ def _append_segment_lines(node: Any, lines: List[str]) -> None:
371
+ if isinstance(node, str):
372
+ cleaned = clean_transcript_text(node)
373
+ if cleaned:
374
+ lines.append(cleaned)
375
+ return
376
+ if isinstance(node, dict):
377
+ for key in SEGMENT_TEXT_FIELDS:
378
+ value = node.get(key)
379
+ if isinstance(value, str):
380
+ cleaned = clean_transcript_text(value)
381
+ if cleaned:
382
+ lines.append(cleaned)
383
+ break
384
+ return
385
+ if isinstance(node, list):
386
+ for item in node:
387
+ _append_segment_lines(item, lines)
388
+
389
+
390
+ def _extract_segment_text_from_node(node: Dict[str, Any]) -> str:
391
+ lines: List[str] = []
392
+ for key in SEGMENT_CONTAINER_FIELDS:
393
+ if key not in node:
394
+ continue
395
+ _append_segment_lines(node.get(key), lines)
396
+ if lines:
397
+ break
398
+ if not lines:
399
+ return ""
400
+
401
+ deduped: List[str] = []
402
+ seen = set()
403
+ for line in lines:
404
+ signature = _text_signature(line)
405
+ if not signature or signature in seen or _is_char_spaced_noise_sequence(line):
406
+ continue
407
+ seen.add(signature)
408
+ deduped.append(line)
409
+ return "\n".join(deduped).strip()
410
+
411
+
412
+ def _extract_canonical_transcript_from_node(node: Dict[str, Any]) -> Dict[str, Any]:
413
+ summary_text, summary_field = _extract_summary_text_from_node(node)
414
+ if summary_text:
415
+ return {
416
+ "transcript_text": summary_text,
417
+ "summary_field_used": summary_field,
418
+ "segment_fallback_used": False,
419
+ "canonical_text_source": f"summary:{summary_field}",
420
+ }
421
+
422
+ segment_text = _extract_segment_text_from_node(node)
423
+ if segment_text:
424
+ return {
425
+ "transcript_text": segment_text,
426
+ "summary_field_used": "",
427
+ "segment_fallback_used": True,
428
+ "canonical_text_source": "segments",
429
+ }
430
+
431
+ fallback_text = clean_transcript_text(extract_transcript_text(node))
432
+ if fallback_text:
433
+ return {
434
+ "transcript_text": fallback_text,
435
+ "summary_field_used": "",
436
+ "segment_fallback_used": True,
437
+ "canonical_text_source": "deep_search_fallback",
438
+ }
439
+
440
+ return {
441
+ "transcript_text": "",
442
+ "summary_field_used": "",
443
+ "segment_fallback_used": False,
444
+ "canonical_text_source": "missing",
445
+ }
446
+
447
+
295
448
  def extract_u2_task_metrics(payload: Any) -> Dict[str, Any]:
296
449
  metrics = deep_find_first(payload, ["task_metrics", "metrics"])
297
450
  return metrics if isinstance(metrics, dict) else {}
@@ -349,16 +502,8 @@ def extract_u2_batch_result_items(payload: Any) -> List[Dict[str, Any]]:
349
502
  )
350
503
  file_url = normalize_media_url(str(raw_file_url or ""))
351
504
  if file_url:
352
- transcript = clean_transcript_text(
353
- node.get("transcript_text")
354
- or node.get("text")
355
- or node.get("transcript")
356
- or node.get("transcription")
357
- or node.get("content")
358
- or ""
359
- )
360
- if not transcript:
361
- transcript = clean_transcript_text(extract_transcript_text(node))
505
+ canonical = _extract_canonical_transcript_from_node(node)
506
+ transcript = clean_transcript_text(canonical.get("transcript_text"))
362
507
 
363
508
  status = _status_upper(node.get("status") or node.get("task_status") or node.get("state"))
364
509
  error_reason = str(node.get("error_reason") or node.get("error") or "").strip()
@@ -372,6 +517,9 @@ def extract_u2_batch_result_items(payload: Any) -> List[Dict[str, Any]]:
372
517
  "transcription_url": transcription_url,
373
518
  "error_reason": error_reason,
374
519
  "ok": ok,
520
+ "summary_field_used": canonical.get("summary_field_used", ""),
521
+ "segment_fallback_used": bool(canonical.get("segment_fallback_used")),
522
+ "canonical_text_source": canonical.get("canonical_text_source", "missing"),
375
523
  }
376
524
 
377
525
  existing = found.get(file_url)
@@ -380,12 +528,16 @@ def extract_u2_batch_result_items(payload: Any) -> List[Dict[str, Any]]:
380
528
  else:
381
529
  old_score = (
382
530
  1 if existing.get("ok") else 0,
531
+ 1 if not existing.get("segment_fallback_used") else 0,
532
+ 1 if existing.get("summary_field_used") else 0,
383
533
  len(str(existing.get("transcript_text") or "")),
384
534
  1 if existing.get("transcription_url") else 0,
385
535
  1 if not existing.get("error_reason") else 0,
386
536
  )
387
537
  new_score = (
388
538
  1 if candidate.get("ok") else 0,
539
+ 1 if not candidate.get("segment_fallback_used") else 0,
540
+ 1 if candidate.get("summary_field_used") else 0,
389
541
  len(str(candidate.get("transcript_text") or "")),
390
542
  1 if candidate.get("transcription_url") else 0,
391
543
  1 if not candidate.get("error_reason") else 0,
@@ -441,16 +593,8 @@ def map_u2_batch_results_by_item_index(payload: Any) -> Dict[int, Dict[str, Any]
441
593
  item_index_raw = node.get("item_index")
442
594
  item_index = _parse_non_negative_item_index(item_index_raw)
443
595
  if item_index is not None:
444
- transcript = clean_transcript_text(
445
- node.get("transcript_text")
446
- or node.get("text")
447
- or node.get("transcript")
448
- or node.get("transcription")
449
- or node.get("content")
450
- or ""
451
- )
452
- if not transcript:
453
- transcript = clean_transcript_text(extract_transcript_text(node))
596
+ canonical = _extract_canonical_transcript_from_node(node)
597
+ transcript = clean_transcript_text(canonical.get("transcript_text"))
454
598
 
455
599
  status = _status_upper(node.get("task_status") or node.get("status") or node.get("state"))
456
600
  error_reason = str(node.get("error_reason") or node.get("error") or "").strip()
@@ -464,6 +608,9 @@ def map_u2_batch_results_by_item_index(payload: Any) -> Dict[int, Dict[str, Any]
464
608
  "error_reason": error_reason,
465
609
  "transcription_url": transcription_url,
466
610
  "ok": ok,
611
+ "summary_field_used": canonical.get("summary_field_used", ""),
612
+ "segment_fallback_used": bool(canonical.get("segment_fallback_used")),
613
+ "canonical_text_source": canonical.get("canonical_text_source", "missing"),
467
614
  }
468
615
 
469
616
  existing = mapped.get(item_index)
@@ -472,12 +619,16 @@ def map_u2_batch_results_by_item_index(payload: Any) -> Dict[int, Dict[str, Any]
472
619
  else:
473
620
  old_score = (
474
621
  1 if existing.get("ok") else 0,
622
+ 1 if not existing.get("segment_fallback_used") else 0,
623
+ 1 if existing.get("summary_field_used") else 0,
475
624
  len(str(existing.get("transcript_text") or "")),
476
625
  1 if existing.get("transcription_url") else 0,
477
626
  1 if not existing.get("error_reason") else 0,
478
627
  )
479
628
  new_score = (
480
629
  1 if candidate.get("ok") else 0,
630
+ 1 if not candidate.get("segment_fallback_used") else 0,
631
+ 1 if candidate.get("summary_field_used") else 0,
481
632
  len(str(candidate.get("transcript_text") or "")),
482
633
  1 if candidate.get("transcription_url") else 0,
483
634
  1 if not candidate.get("error_reason") else 0,
@@ -506,29 +657,102 @@ def _extract_transcript_from_transcription_payload(payload: Any) -> str:
506
657
  except Exception:
507
658
  return ""
508
659
 
509
- transcript = clean_transcript_text(deep_find_first(payload, ["full_text"]))
510
- if transcript:
511
- return transcript
660
+ for key in SUMMARY_TEXT_FIELDS:
661
+ transcript = clean_transcript_text(deep_find_first(payload, [key]))
662
+ if transcript:
663
+ return transcript
664
+
665
+ for key in SEGMENT_CONTAINER_FIELDS:
666
+ segments = deep_find_first(payload, [key])
667
+ if segments is None:
668
+ continue
669
+ lines: List[str] = []
670
+ _append_segment_lines(segments, lines)
671
+ deduped: List[str] = []
672
+ seen = set()
673
+ for line in lines:
674
+ signature = _text_signature(line)
675
+ if not signature or signature in seen or _is_char_spaced_noise_sequence(line):
676
+ continue
677
+ seen.add(signature)
678
+ deduped.append(line)
679
+ if deduped:
680
+ return "\n".join(deduped)
512
681
 
513
682
  transcript = clean_transcript_text(extract_transcript_text(payload))
514
683
  if transcript:
515
684
  return transcript
516
685
 
517
- sentences = deep_find_first(payload, ["sentences"])
518
- if isinstance(sentences, list):
686
+ return ""
687
+
688
+
689
+ def _extract_transcript_bundle_from_transcription_payload(payload: Any) -> Dict[str, Any]:
690
+ if isinstance(payload, str):
691
+ text = clean_transcript_text(payload)
692
+ if text:
693
+ return {
694
+ "transcript_text": text,
695
+ "summary_field_used": "raw_string",
696
+ "segment_fallback_used": False,
697
+ "canonical_text_source": "summary:raw_string",
698
+ }
699
+ try:
700
+ payload = json.loads(payload)
701
+ except Exception:
702
+ return {
703
+ "transcript_text": "",
704
+ "summary_field_used": "",
705
+ "segment_fallback_used": False,
706
+ "canonical_text_source": "missing",
707
+ }
708
+
709
+ for key in SUMMARY_TEXT_FIELDS:
710
+ transcript = clean_transcript_text(deep_find_first(payload, [key]))
711
+ if transcript:
712
+ return {
713
+ "transcript_text": transcript,
714
+ "summary_field_used": key,
715
+ "segment_fallback_used": False,
716
+ "canonical_text_source": f"summary:{key}",
717
+ }
718
+
719
+ for key in SEGMENT_CONTAINER_FIELDS:
720
+ segments = deep_find_first(payload, [key])
721
+ if segments is None:
722
+ continue
519
723
  lines: List[str] = []
520
- for sentence in sentences:
521
- if not isinstance(sentence, dict):
724
+ _append_segment_lines(segments, lines)
725
+ deduped: List[str] = []
726
+ seen = set()
727
+ for line in lines:
728
+ signature = _text_signature(line)
729
+ if not signature or signature in seen or _is_char_spaced_noise_sequence(line):
522
730
  continue
523
- line = clean_transcript_text(
524
- sentence.get("text") or sentence.get("sentence") or sentence.get("content")
525
- )
526
- if line:
527
- lines.append(line)
528
- if lines:
529
- return "\n".join(lines)
731
+ seen.add(signature)
732
+ deduped.append(line)
733
+ if deduped:
734
+ return {
735
+ "transcript_text": "\n".join(deduped),
736
+ "summary_field_used": "",
737
+ "segment_fallback_used": True,
738
+ "canonical_text_source": f"segments:{key}",
739
+ }
530
740
 
531
- return ""
741
+ transcript = clean_transcript_text(extract_transcript_text(payload))
742
+ if transcript:
743
+ return {
744
+ "transcript_text": transcript,
745
+ "summary_field_used": "",
746
+ "segment_fallback_used": True,
747
+ "canonical_text_source": "deep_search_fallback",
748
+ }
749
+
750
+ return {
751
+ "transcript_text": "",
752
+ "summary_field_used": "",
753
+ "segment_fallback_used": False,
754
+ "canonical_text_source": "missing",
755
+ }
532
756
 
533
757
 
534
758
  def fetch_transcription_text_by_url(*, transcription_url: str, timeout_ms: int) -> Dict[str, Any]:
@@ -573,13 +797,17 @@ def fetch_transcription_text_by_url(*, transcription_url: str, timeout_ms: int)
573
797
  except Exception:
574
798
  payload = raw_text
575
799
 
576
- transcript = _extract_transcript_from_transcription_payload(payload)
800
+ transcript_bundle = _extract_transcript_bundle_from_transcription_payload(payload)
801
+ transcript = transcript_bundle.get("transcript_text", "")
577
802
  if transcript:
578
803
  return {
579
804
  "ok": True,
580
805
  "transcription_url": url,
581
806
  "error_reason": "",
582
807
  "transcript_text": transcript,
808
+ "summary_field_used": transcript_bundle.get("summary_field_used", ""),
809
+ "segment_fallback_used": bool(transcript_bundle.get("segment_fallback_used")),
810
+ "canonical_text_source": transcript_bundle.get("canonical_text_source", "missing"),
583
811
  }
584
812
 
585
813
  return {
@@ -587,6 +815,9 @@ def fetch_transcription_text_by_url(*, transcription_url: str, timeout_ms: int)
587
815
  "transcription_url": url,
588
816
  "error_reason": "transcription_payload_empty",
589
817
  "transcript_text": "",
818
+ "summary_field_used": "",
819
+ "segment_fallback_used": False,
820
+ "canonical_text_source": "missing",
590
821
  }
591
822
 
592
823
 
@@ -620,6 +851,9 @@ def hydrate_u2_batch_results_from_transcription_urls(
620
851
  if fetched_text:
621
852
  transcript = fetched_text
622
853
  candidate["transcript_text"] = fetched_text
854
+ candidate["summary_field_used"] = fetch_result.get("summary_field_used", "")
855
+ candidate["segment_fallback_used"] = bool(fetch_result.get("segment_fallback_used"))
856
+ candidate["canonical_text_source"] = fetch_result.get("canonical_text_source", "missing")
623
857
  elif not candidate.get("error_reason"):
624
858
  candidate["error_reason"] = fetch_result.get("error_reason") or "transcription_payload_empty"
625
859
 
@@ -3,10 +3,12 @@
3
3
 
4
4
  from __future__ import annotations
5
5
 
6
+ from copy import deepcopy
6
7
  import json
7
8
  import re
8
9
  import urllib.request
9
- from typing import Any, Dict, List, Optional, Tuple
10
+ from datetime import datetime
11
+ from typing import Any, Callable, Dict, List, Optional, Set, Tuple
10
12
 
11
13
  from scripts.core.progress_report import ProgressReporter
12
14
  from scripts.core.tikomni_common import normalize_text
@@ -26,6 +28,20 @@ XHS_U3_U2_BATCH_SIZE = 20
26
28
  U2_GATE_MIN_DURATION_MS = 13000
27
29
  U2_GATE_MAX_DURATION_MS = 1800000
28
30
  U2_GATE_RULE = "is_video && 13000<duration_ms<=1800000 && video_download_url_present"
31
+ CHECKPOINT_WORK_FIELDS = (
32
+ "platform_work_id",
33
+ "subtitle_raw",
34
+ "subtitle_source",
35
+ "asr_raw",
36
+ "asr_clean",
37
+ "primary_text",
38
+ "primary_text_source",
39
+ "analysis_eligibility",
40
+ "analysis_exclusion_reason",
41
+ "asr_status",
42
+ "asr_error_reason",
43
+ "asr_source",
44
+ )
29
45
 
30
46
 
31
47
  def _to_int_or_none(value: Any) -> Optional[int]:
@@ -467,6 +483,108 @@ def _dedupe_works_by_platform_id(works: List[Dict[str, Any]]) -> Tuple[List[Dict
467
483
  return deduped, duplicates
468
484
 
469
485
 
486
+ def _snapshot_work_for_checkpoint(work: Dict[str, Any]) -> Dict[str, Any]:
487
+ snapshot: Dict[str, Any] = {}
488
+ for key in CHECKPOINT_WORK_FIELDS:
489
+ if key in work:
490
+ snapshot[key] = deepcopy(work.get(key))
491
+ platform_work_id = normalize_text(work.get("platform_work_id"))
492
+ if platform_work_id:
493
+ snapshot["platform_work_id"] = platform_work_id
494
+ return snapshot
495
+
496
+
497
+ def _restore_completed_work_payloads(*, works: List[Dict[str, Any]], checkpoint: Dict[str, Any]) -> int:
498
+ completed_payloads = checkpoint.get("completed_work_payloads")
499
+ if not isinstance(completed_payloads, dict):
500
+ return 0
501
+
502
+ restored = 0
503
+ for work in works:
504
+ if not isinstance(work, dict):
505
+ continue
506
+ work_id = normalize_text(work.get("platform_work_id"))
507
+ if not work_id:
508
+ continue
509
+ payload = completed_payloads.get(work_id)
510
+ if not isinstance(payload, dict):
511
+ continue
512
+ work.update(deepcopy(payload))
513
+ restored += 1
514
+ return restored
515
+
516
+
517
+ def _count_processed_results(*, works: List[Dict[str, Any]], completed_ids: Set[str]) -> Tuple[int, int, List[str]]:
518
+ success_count = 0
519
+ failed_ids: List[str] = []
520
+ completed_id_set = {normalize_text(item) for item in completed_ids if normalize_text(item)}
521
+
522
+ for work in works:
523
+ if not isinstance(work, dict):
524
+ continue
525
+ work_id = normalize_text(work.get("platform_work_id"))
526
+ if not work_id or work_id not in completed_id_set:
527
+ continue
528
+ if str(work.get("analysis_eligibility") or "") == "eligible":
529
+ success_count += 1
530
+ else:
531
+ failed_ids.append(work_id)
532
+
533
+ failed_ids = sorted(set(failed_ids))
534
+ return success_count, len(failed_ids), failed_ids
535
+
536
+
537
+ def _build_checkpoint_snapshot(
538
+ *,
539
+ platform: str,
540
+ works: List[Dict[str, Any]],
541
+ completed_ids: Set[str],
542
+ batch_size: int,
543
+ batches_total: int,
544
+ batches_submitted: int,
545
+ batches_completed: int,
546
+ batch_mapped: int,
547
+ batch_unmapped: int,
548
+ fallback_singles: int,
549
+ request_id: str,
550
+ last_completed_batch_id: str,
551
+ ) -> Dict[str, Any]:
552
+ completed_id_set = {normalize_text(item) for item in completed_ids if normalize_text(item)}
553
+ success_count, failed_count, failed_work_ids = _count_processed_results(works=works, completed_ids=completed_id_set)
554
+ completed_work_payloads: Dict[str, Any] = {}
555
+ for work in works:
556
+ if not isinstance(work, dict):
557
+ continue
558
+ work_id = normalize_text(work.get("platform_work_id"))
559
+ if not work_id or work_id not in completed_id_set:
560
+ continue
561
+ completed_work_payloads[work_id] = _snapshot_work_for_checkpoint(work)
562
+
563
+ return {
564
+ "platform": platform,
565
+ "request_id": request_id or None,
566
+ "completed_work_ids": sorted(completed_id_set),
567
+ "failed_work_ids": failed_work_ids,
568
+ "completed_work_payloads": completed_work_payloads,
569
+ "batch_size": batch_size,
570
+ "batches_total": batches_total,
571
+ "batches_submitted": batches_submitted,
572
+ "batches_completed": batches_completed,
573
+ "batch_mapped": batch_mapped,
574
+ "batch_unmapped": batch_unmapped,
575
+ "fallback_singles": fallback_singles,
576
+ "total_works": len(works),
577
+ "processed_works": len(completed_id_set),
578
+ "success_works": success_count,
579
+ "failed_works": failed_count,
580
+ "pending_works": max(0, len(works) - len(completed_id_set)),
581
+ "last_completed_batch_id": last_completed_batch_id,
582
+ "updated_at": datetime.now().isoformat(timespec="seconds"),
583
+ # backward-compatible checkpoint fields
584
+ "refill_attempted": fallback_singles,
585
+ }
586
+
587
+
470
588
  def _fallback_none_result(reason: str) -> Dict[str, Any]:
471
589
  return {
472
590
  "subtitle_raw": "",
@@ -804,12 +922,15 @@ def enrich_author_home_asr(
804
922
  timeout_retry_max_retries: int = 3,
805
923
  batch_size: int = DEFAULT_BATCH_SUBMIT_SIZE,
806
924
  checkpoint: Optional[Dict[str, Any]] = None,
925
+ request_id: str = "",
926
+ on_batch_complete: Optional[Callable[[Dict[str, Any]], None]] = None,
807
927
  progress: Optional[ProgressReporter] = None,
808
928
  ) -> Dict[str, Any]:
809
929
  trace: List[Dict[str, Any]] = []
810
930
  deduped_works, duplicate_count = _dedupe_works_by_platform_id(works)
811
931
 
812
932
  checkpoint_in = checkpoint if isinstance(checkpoint, dict) else {}
933
+ restored_payloads = _restore_completed_work_payloads(works=deduped_works, checkpoint=checkpoint_in)
813
934
  completed_ids = {
814
935
  normalize_text(item)
815
936
  for item in (checkpoint_in.get("completed_work_ids") or [])
@@ -833,6 +954,7 @@ def enrich_author_home_asr(
833
954
  "deduped_count": len(deduped_works),
834
955
  "duplicate_count": duplicate_count,
835
956
  "resume_completed": len(completed_ids),
957
+ "resume_payloads_restored": restored_payloads,
836
958
  "requested_batch_size": requested_batch,
837
959
  "batch_size": effective_batch,
838
960
  "batch_size_clamped": requested_batch != effective_batch,
@@ -847,6 +969,7 @@ def enrich_author_home_asr(
847
969
  "input_count": len(works),
848
970
  "deduped_count": len(deduped_works),
849
971
  "resume_completed": len(completed_ids),
972
+ "resume_payloads_restored": restored_payloads,
850
973
  "batch_size": effective_batch,
851
974
  },
852
975
  )
@@ -874,8 +997,9 @@ def enrich_author_home_asr(
874
997
  data={"queued_count": len(queue), "batch_total": batch_total},
875
998
  )
876
999
 
877
- success_count = 0
878
- fallback_none_count = 0
1000
+ restored_success_count, restored_failed_count, _ = _count_processed_results(works=deduped_works, completed_ids=completed_ids)
1001
+ success_count = restored_success_count
1002
+ fallback_none_count = restored_failed_count
879
1003
  submitted_batches = 0
880
1004
  completed_batches = 0
881
1005
  batch_mapped_count = 0
@@ -1124,34 +1248,66 @@ def enrich_author_home_asr(
1124
1248
  },
1125
1249
  )
1126
1250
 
1127
- failed_work_ids = sorted(
1128
- list(
1129
- {
1130
- normalize_text(work.get("platform_work_id"))
1131
- for work in deduped_works
1132
- if isinstance(work, dict)
1133
- and normalize_text(work.get("platform_work_id"))
1134
- and str(work.get("analysis_eligibility") or "") != "eligible"
1135
- }
1251
+ checkpoint_snapshot = _build_checkpoint_snapshot(
1252
+ platform=platform,
1253
+ works=deduped_works,
1254
+ completed_ids=completed_ids,
1255
+ batch_size=effective_batch,
1256
+ batches_total=batch_total,
1257
+ batches_submitted=submitted_batches,
1258
+ batches_completed=completed_batches,
1259
+ batch_mapped=batch_mapped_count,
1260
+ batch_unmapped=batch_unmapped_count,
1261
+ fallback_singles=fallback_single_count,
1262
+ request_id=request_id,
1263
+ last_completed_batch_id=batch_id,
1136
1264
  )
1265
+ if on_batch_complete is not None:
1266
+ on_batch_complete(
1267
+ {
1268
+ "platform": platform,
1269
+ "batch_id": batch_id,
1270
+ "batch_index": batch_index + 1,
1271
+ "batch_total": batch_total,
1272
+ "batch_works": batch,
1273
+ "works": deduped_works,
1274
+ "trace": list(trace),
1275
+ "checkpoint": checkpoint_snapshot,
1276
+ "stats": {
1277
+ "total": len(deduped_works),
1278
+ "success": success_count,
1279
+ "fallback_none": fallback_none_count,
1280
+ "duplicates_dropped": duplicate_count,
1281
+ "submitted_batches": submitted_batches,
1282
+ "completed_batches": completed_batches,
1283
+ "batch_mapped": batch_mapped_count,
1284
+ "batch_unmapped": batch_unmapped_count,
1285
+ "fallback_singles": fallback_single_count,
1286
+ "refill_attempted": fallback_single_count,
1287
+ "refill_failed": checkpoint_snapshot.get("failed_works", 0),
1288
+ },
1289
+ }
1290
+ )
1291
+
1292
+ success_count, fallback_none_count, failed_work_ids = _count_processed_results(
1293
+ works=deduped_works,
1294
+ completed_ids=completed_ids,
1137
1295
  )
1138
1296
 
1139
- checkpoint_out = {
1140
- "platform": platform,
1141
- "completed_work_ids": sorted(completed_ids),
1142
- "failed_work_ids": failed_work_ids,
1143
- "batch_size": effective_batch,
1144
- "batches_total": batch_total,
1145
- "batches_submitted": submitted_batches,
1146
- "batches_completed": completed_batches,
1147
- "batch_mapped": batch_mapped_count,
1148
- "batch_unmapped": batch_unmapped_count,
1149
- "fallback_singles": fallback_single_count,
1150
- "total_works": len(deduped_works),
1151
- "processed_works": len(completed_ids),
1152
- # backward-compatible checkpoint fields
1153
- "refill_attempted": fallback_single_count,
1154
- }
1297
+ checkpoint_out = _build_checkpoint_snapshot(
1298
+ platform=platform,
1299
+ works=deduped_works,
1300
+ completed_ids=completed_ids,
1301
+ batch_size=effective_batch,
1302
+ batches_total=batch_total,
1303
+ batches_submitted=submitted_batches,
1304
+ batches_completed=completed_batches,
1305
+ batch_mapped=batch_mapped_count,
1306
+ batch_unmapped=batch_unmapped_count,
1307
+ fallback_singles=fallback_single_count,
1308
+ request_id=request_id,
1309
+ last_completed_batch_id=f"batch-{batch_total:03d}" if batch_total > 0 else normalize_text(checkpoint_in.get("last_completed_batch_id")),
1310
+ )
1155
1311
 
1156
1312
  stats = {
1157
1313
  "total": len(deduped_works),
@@ -0,0 +1,173 @@
1
+ #!/usr/bin/env python3
2
+ """Shared runtime-state helpers for homepage pipelines."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import json
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+ from typing import Any, Dict, List, Optional
10
+
11
+ from scripts.core.storage_router import resolve_author_directory_name
12
+
13
+
14
+ def _safe_text(value: Any) -> str:
15
+ if value is None:
16
+ return ""
17
+ return str(value).strip()
18
+
19
+
20
+ def _now_iso() -> str:
21
+ return datetime.now().isoformat(timespec="seconds")
22
+
23
+
24
def resolve_homepage_author_dir(*, platform: str, profile: Dict[str, Any], card_root: str) -> Path:
    """Resolve the per-author work-library directory, creating it if needed.

    The directory name is derived from the author handle / platform id /
    nickname via the shared storage router, so all homepage pipelines agree
    on the same location.
    """
    dir_name = resolve_author_directory_name(
        platform,
        _safe_text(profile.get("author_handle")),
        _safe_text(profile.get("platform_author_id")),
        _safe_text(profile.get("nickname")),
    )
    target = Path(card_root) / "内容系统" / "作品库" / dir_name
    target.mkdir(parents=True, exist_ok=True)
    return target
34
+
35
+
36
def load_homepage_checkpoint(*, platform: str, profile: Dict[str, Any], card_root: str) -> Dict[str, Any]:
    """Best-effort read of the per-author ASR checkpoint; ``{}`` on any problem.

    A missing, unreadable, malformed, or non-dict checkpoint never aborts the
    run — callers simply start from scratch.
    """
    author_dir = resolve_homepage_author_dir(platform=platform, profile=profile, card_root=card_root)
    path = author_dir / "_homepage_asr_checkpoint.json"
    if not path.is_file():
        return {}
    try:
        loaded = json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        # Corrupt or unreadable checkpoint data is treated as "no checkpoint".
        return {}
    if isinstance(loaded, dict):
        return loaded
    return {}
46
+
47
+
48
def clear_homepage_checkpoint(*, platform: str, profile: Dict[str, Any], card_root: str) -> Optional[str]:
    """Delete the per-author ASR checkpoint file if present.

    Returns:
        The removed file's path as a string, or ``None`` when no checkpoint
        file existed.
    """
    author_dir = resolve_homepage_author_dir(platform=platform, profile=profile, card_root=card_root)
    checkpoint_path = author_dir / "_homepage_asr_checkpoint.json"
    try:
        # Unlink directly instead of exists()+unlink(): the original pair had a
        # TOCTOU race where another process could remove the file between the
        # existence check and the deletion, turning unlink() into a crash.
        checkpoint_path.unlink()
    except FileNotFoundError:
        return None
    return str(checkpoint_path)
55
+
56
+
57
def persist_homepage_runtime_artifacts(
    *,
    platform: str,
    profile: Dict[str, Any],
    works: List[Dict[str, Any]],
    card_root: str,
    extract_trace: List[Dict[str, Any]],
    request_id: str,
    checkpoint: Optional[Dict[str, Any]],
    run_status: str,
    last_completed_batch_id: str = "",
) -> Dict[str, str]:
    """Write the per-author runtime artifacts for a homepage crawl.

    Persists three JSON files under the author's work-library directory:
    ``_creator_profile.json``, ``_work_collection.json``, and (only when a
    non-empty checkpoint is supplied) ``_homepage_asr_checkpoint.json``.
    Returns the paths of all three plus the author directory; note the
    checkpoint path is returned even when the file was not (re)written.
    """
    author_dir = resolve_homepage_author_dir(platform=platform, profile=profile, card_root=card_root)
    # One timestamp shared by all artifacts so they can be correlated.
    updated_at = _now_iso()

    # Normalize the checkpoint's id lists: coerce to stripped strings,
    # drop empties, dedupe via set, and sort for stable JSON output.
    checkpoint_payload = checkpoint if isinstance(checkpoint, dict) else {}
    completed_work_ids = sorted({_safe_text(item) for item in (checkpoint_payload.get("completed_work_ids") or []) if _safe_text(item)})
    failed_work_ids = sorted({_safe_text(item) for item in (checkpoint_payload.get("failed_work_ids") or []) if _safe_text(item)})
    completed_id_set = set(completed_work_ids)
    failed_id_set = set(failed_work_ids)

    # Per-work status: "failed" wins over "completed" when an id appears in
    # both lists; anything in neither list is still "pending".
    collection_items: List[Dict[str, Any]] = []
    for work in works:
        if not isinstance(work, dict):
            continue
        work_id = _safe_text(work.get("platform_work_id"))
        processing_status = "pending"
        if work_id in failed_id_set:
            processing_status = "failed"
        elif work_id in completed_id_set:
            processing_status = "completed"
        collection_items.append(
            {
                "platform_work_id": work_id,
                "title": work.get("title"),
                "published_date": work.get("published_date"),
                "processing_status": processing_status,
            }
        )

    completed_count = len(completed_work_ids)
    failed_count = len(failed_work_ids)
    total_count = len(collection_items)
    # Clamped at 0: completed ids can outnumber the works passed in
    # (e.g. on a resumed run with a shorter works list).
    pending_count = max(0, total_count - completed_count)

    # The profile artifact is the caller's profile plus run bookkeeping.
    creator_profile = dict(profile)
    creator_profile.update(
        {
            "request_id": request_id,
            "extract_trace": extract_trace,
            "run_status": run_status,
            "completed_count": completed_count,
            "failed_count": failed_count,
            "pending_count": pending_count,
            "updated_at": updated_at,
        }
    )

    # The collection artifact carries both the item list and a flattened
    # copy of the checkpoint's batch counters for external inspection.
    work_collection = {
        "platform": platform,
        "platform_author_id": profile.get("platform_author_id"),
        "count": total_count,
        "items": collection_items,
        "request_id": request_id,
        "extract_trace": extract_trace,
        "run_status": run_status,
        "completed_count": completed_count,
        "failed_count": failed_count,
        "pending_count": pending_count,
        "completed_work_ids": completed_work_ids,
        "failed_work_ids": failed_work_ids,
        "batch_size": checkpoint_payload.get("batch_size"),
        "batches_total": checkpoint_payload.get("batches_total"),
        "batches_completed": checkpoint_payload.get("batches_completed"),
        "batch_mapped": checkpoint_payload.get("batch_mapped"),
        "batch_unmapped": checkpoint_payload.get("batch_unmapped"),
        "fallback_singles": checkpoint_payload.get("fallback_singles"),
        # Explicit argument wins; fall back to the checkpoint's own value.
        "last_completed_batch_id": last_completed_batch_id or _safe_text(checkpoint_payload.get("last_completed_batch_id")),
        "updated_at": updated_at,
    }

    if checkpoint_payload:
        checkpoint_to_write = dict(checkpoint_payload)
        checkpoint_to_write["request_id"] = request_id
        checkpoint_to_write["updated_at"] = updated_at
        checkpoint_to_write["last_completed_batch_id"] = last_completed_batch_id or _safe_text(checkpoint_payload.get("last_completed_batch_id"))
    else:
        checkpoint_to_write = {}

    profile_path = author_dir / "_creator_profile.json"
    collection_path = author_dir / "_work_collection.json"
    checkpoint_path = author_dir / "_homepage_asr_checkpoint.json"
    profile_path.write_text(json.dumps(creator_profile, ensure_ascii=False, indent=2), encoding="utf-8")
    collection_path.write_text(json.dumps(work_collection, ensure_ascii=False, indent=2), encoding="utf-8")
    # An empty checkpoint is deliberately NOT written: it neither creates a
    # new file nor clobbers an existing one (clearing is a separate helper).
    if checkpoint_to_write:
        checkpoint_path.write_text(json.dumps(checkpoint_to_write, ensure_ascii=False, indent=2), encoding="utf-8")

    return {
        "author_dir": str(author_dir),
        "creator_profile_path": str(profile_path),
        "work_collection_path": str(collection_path),
        "checkpoint_path": str(checkpoint_path),
    }
160
+
161
+
162
def resolve_homepage_run_status(stats: Optional[Dict[str, Any]]) -> str:
    """Map an ASR stats payload onto a coarse run status.

    Returns "complete" when there was nothing to do or everything succeeded
    with no failures, "partial" when at least one work succeeded, and
    "failed" otherwise. Non-dict input is treated as empty stats.
    """
    data = stats if isinstance(stats, dict) else {}
    total_works = int(data.get("total") or 0)
    succeeded = int(data.get("success") or 0)
    failed_works = int(data.get("fallback_none") or 0)
    if total_works <= 0 or (failed_works <= 0 and succeeded >= total_works):
        return "complete"
    return "partial" if succeeded > 0 else "failed"
@@ -14,9 +14,7 @@ if __package__ in {None, ""}:
14
14
  break
15
15
 
16
16
  import argparse
17
- import json
18
- from pathlib import Path
19
- from typing import Any, Dict, List
17
+ from typing import Any, Dict, List, Set
20
18
 
21
19
  from scripts.core.bootstrap_env import bootstrap_for_direct_run
22
20
 
@@ -25,63 +23,23 @@ bootstrap_for_direct_run(__file__, __package__)
25
23
  from scripts.core.completeness import ensure_request_id, evaluate_collection, normalize_missing_fields
26
24
  from scripts.core.config_loader import config_get, load_tikomni_config, resolve_storage_paths
27
25
  from scripts.core.progress_report import build_progress_reporter
28
- from scripts.core.storage_router import resolve_author_directory_name
29
26
  from scripts.core.tikomni_common import resolve_runtime, write_json_stdout
30
27
  from scripts.pipelines.input_contracts import normalize_douyin_creator_input
31
28
  from scripts.pipelines.schema import build_author_profile
32
29
  from scripts.pipelines.douyin_creator_home_helpers import collect_and_adapt
33
30
  from scripts.pipelines.home_asr import enrich_author_home_asr
31
+ from scripts.pipelines.homepage_runtime_state import (
32
+ clear_homepage_checkpoint,
33
+ load_homepage_checkpoint,
34
+ persist_homepage_runtime_artifacts,
35
+ resolve_homepage_run_status,
36
+ )
34
37
  from scripts.writers.write_work_fact_card import build_work_fact_card, persist_output_envelope, write_work_fact_card
35
38
 
36
39
  DEFAULT_MAX_ITEMS = 200
37
40
  MAX_ITEMS_HARD_LIMIT = 200
38
41
 
39
42
 
40
- def _write_collection_artifacts(
41
- *,
42
- profile: Dict[str, Any],
43
- works: List[Dict[str, Any]],
44
- card_root: str,
45
- extract_trace: List[Dict[str, Any]],
46
- request_id: str,
47
- ) -> Dict[str, str]:
48
- author_dir_name = resolve_author_directory_name(
49
- "douyin",
50
- str(profile.get("author_handle") or ""),
51
- str(profile.get("platform_author_id") or ""),
52
- str(profile.get("nickname") or ""),
53
- )
54
- author_dir = Path(card_root) / "内容系统" / "作品库" / author_dir_name
55
- author_dir.mkdir(parents=True, exist_ok=True)
56
-
57
- creator_profile = dict(profile)
58
- creator_profile["request_id"] = request_id
59
- creator_profile["extract_trace"] = extract_trace
60
-
61
- work_collection = {
62
- "platform": "douyin",
63
- "platform_author_id": profile.get("platform_author_id"),
64
- "count": len(works),
65
- "items": [
66
- {
67
- "platform_work_id": item.get("platform_work_id"),
68
- "title": item.get("title"),
69
- "published_date": item.get("published_date"),
70
- }
71
- for item in works
72
- if isinstance(item, dict)
73
- ],
74
- "request_id": request_id,
75
- "extract_trace": extract_trace,
76
- }
77
-
78
- profile_path = author_dir / "_creator_profile.json"
79
- collection_path = author_dir / "_work_collection.json"
80
- profile_path.write_text(json.dumps(creator_profile, ensure_ascii=False, indent=2), encoding="utf-8")
81
- collection_path.write_text(json.dumps(work_collection, ensure_ascii=False, indent=2), encoding="utf-8")
82
- return {"creator_profile_path": str(profile_path), "work_collection_path": str(collection_path)}
83
-
84
-
85
43
  def run_douyin_creator_home(
86
44
  *,
87
45
  input_value: str,
@@ -166,7 +124,76 @@ def run_douyin_creator_home(
166
124
  progress=progress.child(scope="author_home.collect"),
167
125
  )
168
126
 
127
+ card_root = resolve_storage_paths(config)["card_root"]
128
+ request_id = ensure_request_id(
129
+ raw.get("request_id") or profile.get("request_id"),
130
+ fallback_seed=normalized_input_value or input_value,
131
+ )
132
+ raw_extract_trace = list(raw.get("extract_trace") or [])
133
+ checkpoint = load_homepage_checkpoint(
134
+ platform="douyin",
135
+ profile=profile,
136
+ card_root=card_root,
137
+ )
138
+ if checkpoint:
139
+ progress.progress(
140
+ stage="author_home.workflow.resume",
141
+ message="douyin author_home checkpoint loaded",
142
+ data={
143
+ "completed_work_ids": len(checkpoint.get("completed_work_ids") or []),
144
+ "last_completed_batch_id": checkpoint.get("last_completed_batch_id"),
145
+ },
146
+ )
147
+
169
148
  asr_strategy = config_get(config, "asr_strategy", {})
149
+ card_results: List[Dict[str, Any]] = []
150
+ written_work_ids: Set[str] = set()
151
+
152
+ def _persist_batch(event: Dict[str, Any]) -> None:
153
+ batch_id = str(event.get("batch_id") or "")
154
+ batch_works = event.get("batch_works") if isinstance(event.get("batch_works"), list) else []
155
+ all_works = event.get("works") if isinstance(event.get("works"), list) else []
156
+ batch_trace = raw_extract_trace + list(event.get("trace") or [])
157
+
158
+ batch_card_count = 0
159
+ if write_card:
160
+ for work in batch_works:
161
+ if not isinstance(work, dict):
162
+ continue
163
+ result = write_work_fact_card(
164
+ payload=work,
165
+ platform="douyin",
166
+ card_root=card_root,
167
+ storage_config=config,
168
+ )
169
+ card_results.append(result)
170
+ work_id = str(work.get("platform_work_id") or "").strip()
171
+ if work_id:
172
+ written_work_ids.add(work_id)
173
+ batch_card_count += 1
174
+
175
+ persist_homepage_runtime_artifacts(
176
+ platform="douyin",
177
+ profile=profile,
178
+ works=all_works,
179
+ card_root=card_root,
180
+ extract_trace=batch_trace,
181
+ request_id=request_id,
182
+ checkpoint=event.get("checkpoint") if isinstance(event.get("checkpoint"), dict) else {},
183
+ run_status="in_progress",
184
+ last_completed_batch_id=batch_id,
185
+ )
186
+ progress.progress(
187
+ stage="author_home.persist.batch",
188
+ message="douyin author_home batch persisted",
189
+ data={
190
+ "batch_id": batch_id,
191
+ "batch_cards": batch_card_count,
192
+ "completed_count": (event.get("checkpoint") if isinstance(event.get("checkpoint"), dict) else {}).get("processed_works"),
193
+ "pending_count": (event.get("checkpoint") if isinstance(event.get("checkpoint"), dict) else {}).get("pending_works"),
194
+ },
195
+ )
196
+
170
197
  asr_bundle = enrich_author_home_asr(
171
198
  platform="douyin",
172
199
  works=works,
@@ -179,14 +206,18 @@ def run_douyin_creator_home(
179
206
  douyin_submit_backoff_ms=int(config_get(config, "asr_strategy.submit_retry.douyin_video.backoff_ms", 1500)),
180
207
  timeout_retry_enabled=bool(config_get(config, "asr_strategy.u2_timeout_retry.enabled", True)),
181
208
  timeout_retry_max_retries=int(config_get(config, "asr_strategy.u2_timeout_retry.max_retries", 0)),
209
+ checkpoint=checkpoint,
210
+ request_id=request_id,
211
+ on_batch_complete=_persist_batch,
182
212
  progress=progress.child(scope="author_home.asr"),
183
213
  )
184
214
  works = list(asr_bundle.get("works") or [])
185
215
 
186
- card_root = resolve_storage_paths(config)["card_root"]
187
- card_results: List[Dict[str, Any]] = []
188
216
  if write_card:
189
217
  for work in works:
218
+ work_id = str(work.get("platform_work_id") or "").strip()
219
+ if work_id and work_id in written_work_ids:
220
+ continue
190
221
  card_results.append(
191
222
  write_work_fact_card(
192
223
  payload=work,
@@ -196,19 +227,23 @@ def run_douyin_creator_home(
196
227
  )
197
228
  )
198
229
 
199
- request_id = ensure_request_id(
200
- raw.get("request_id") or profile.get("request_id"),
201
- fallback_seed=normalized_input_value or input_value,
202
- )
203
- extract_trace = list(raw.get("extract_trace") or []) + list(asr_bundle.get("trace") or [])
204
-
205
- collection_artifacts = _write_collection_artifacts(
230
+ extract_trace = raw_extract_trace + list(asr_bundle.get("trace") or [])
231
+ checkpoint_out = asr_bundle.get("checkpoint") if isinstance(asr_bundle.get("checkpoint"), dict) else {}
232
+ collection_artifacts = persist_homepage_runtime_artifacts(
233
+ platform="douyin",
206
234
  profile=profile,
207
235
  works=works,
208
236
  card_root=card_root,
209
237
  extract_trace=extract_trace,
210
238
  request_id=request_id,
239
+ checkpoint=checkpoint_out,
240
+ run_status=resolve_homepage_run_status(asr_bundle.get("stats")),
241
+ last_completed_batch_id=str(checkpoint_out.get("last_completed_batch_id") or ""),
211
242
  )
243
+ if int(checkpoint_out.get("pending_works") or 0) <= 0:
244
+ cleared_checkpoint_path = clear_homepage_checkpoint(platform="douyin", profile=profile, card_root=card_root)
245
+ if cleared_checkpoint_path:
246
+ collection_artifacts["checkpoint_cleared_path"] = cleared_checkpoint_path
212
247
 
213
248
  normalized_profile = dict(profile)
214
249
  normalized_profile["request_id"] = request_id
@@ -14,9 +14,7 @@ if __package__ in {None, ""}:
14
14
  break
15
15
 
16
16
  import argparse
17
- import json
18
- from pathlib import Path
19
- from typing import Any, Dict, List
17
+ from typing import Any, Dict, List, Set
20
18
 
21
19
  from scripts.core.bootstrap_env import bootstrap_for_direct_run
22
20
 
@@ -25,9 +23,14 @@ bootstrap_for_direct_run(__file__, __package__)
25
23
  from scripts.core.completeness import ensure_request_id, evaluate_collection, normalize_missing_fields
26
24
  from scripts.core.config_loader import config_get, load_tikomni_config, resolve_storage_paths
27
25
  from scripts.core.progress_report import build_progress_reporter
28
- from scripts.core.storage_router import resolve_author_directory_name
29
26
  from scripts.core.tikomni_common import resolve_runtime, write_json_stdout
30
27
  from scripts.pipelines.home_asr import enrich_author_home_asr
28
+ from scripts.pipelines.homepage_runtime_state import (
29
+ clear_homepage_checkpoint,
30
+ load_homepage_checkpoint,
31
+ persist_homepage_runtime_artifacts,
32
+ resolve_homepage_run_status,
33
+ )
31
34
  from scripts.pipelines.input_contracts import normalize_xhs_creator_input
32
35
  from scripts.pipelines.schema import build_author_profile
33
36
  from scripts.pipelines.xiaohongshu_creator_home_helpers import collect_and_adapt
@@ -37,51 +40,6 @@ DEFAULT_MAX_ITEMS = 200
37
40
  MAX_ITEMS_HARD_LIMIT = 200
38
41
 
39
42
 
40
- def _write_collection_artifacts(
41
- *,
42
- profile: Dict[str, Any],
43
- works: List[Dict[str, Any]],
44
- card_root: str,
45
- extract_trace: List[Dict[str, Any]],
46
- request_id: str,
47
- ) -> Dict[str, str]:
48
- author_dir_name = resolve_author_directory_name(
49
- "xiaohongshu",
50
- str(profile.get("author_handle") or ""),
51
- str(profile.get("platform_author_id") or ""),
52
- str(profile.get("nickname") or ""),
53
- )
54
- author_dir = Path(card_root) / "内容系统" / "作品库" / author_dir_name
55
- author_dir.mkdir(parents=True, exist_ok=True)
56
-
57
- creator_profile = dict(profile)
58
- creator_profile["request_id"] = request_id
59
- creator_profile["extract_trace"] = extract_trace
60
-
61
- work_collection = {
62
- "platform": "xiaohongshu",
63
- "platform_author_id": profile.get("platform_author_id"),
64
- "count": len(works),
65
- "items": [
66
- {
67
- "platform_work_id": item.get("platform_work_id"),
68
- "title": item.get("title"),
69
- "published_date": item.get("published_date"),
70
- }
71
- for item in works
72
- if isinstance(item, dict)
73
- ],
74
- "request_id": request_id,
75
- "extract_trace": extract_trace,
76
- }
77
-
78
- profile_path = author_dir / "_creator_profile.json"
79
- collection_path = author_dir / "_work_collection.json"
80
- profile_path.write_text(json.dumps(creator_profile, ensure_ascii=False, indent=2), encoding="utf-8")
81
- collection_path.write_text(json.dumps(work_collection, ensure_ascii=False, indent=2), encoding="utf-8")
82
- return {"creator_profile_path": str(profile_path), "work_collection_path": str(collection_path)}
83
-
84
-
85
43
  def run_xiaohongshu_creator_home(
86
44
  *,
87
45
  input_value: str,
@@ -166,7 +124,76 @@ def run_xiaohongshu_creator_home(
166
124
  progress=progress.child(scope="author_home.collect"),
167
125
  )
168
126
 
127
+ card_root = resolve_storage_paths(config)["card_root"]
128
+ request_id = ensure_request_id(
129
+ raw.get("request_id") or profile.get("request_id"),
130
+ fallback_seed=normalized_input_value or input_value,
131
+ )
132
+ raw_extract_trace = list(raw.get("extract_trace") or [])
133
+ checkpoint = load_homepage_checkpoint(
134
+ platform="xiaohongshu",
135
+ profile=profile,
136
+ card_root=card_root,
137
+ )
138
+ if checkpoint:
139
+ progress.progress(
140
+ stage="author_home.workflow.resume",
141
+ message="xiaohongshu author_home checkpoint loaded",
142
+ data={
143
+ "completed_work_ids": len(checkpoint.get("completed_work_ids") or []),
144
+ "last_completed_batch_id": checkpoint.get("last_completed_batch_id"),
145
+ },
146
+ )
147
+
169
148
  asr_strategy = config_get(config, "asr_strategy", {})
149
+ card_results: List[Dict[str, Any]] = []
150
+ written_work_ids: Set[str] = set()
151
+
152
+ def _persist_batch(event: Dict[str, Any]) -> None:
153
+ batch_id = str(event.get("batch_id") or "")
154
+ batch_works = event.get("batch_works") if isinstance(event.get("batch_works"), list) else []
155
+ all_works = event.get("works") if isinstance(event.get("works"), list) else []
156
+ batch_trace = raw_extract_trace + list(event.get("trace") or [])
157
+
158
+ batch_card_count = 0
159
+ if write_card:
160
+ for work in batch_works:
161
+ if not isinstance(work, dict):
162
+ continue
163
+ result = write_work_fact_card(
164
+ payload=work,
165
+ platform="xiaohongshu",
166
+ card_root=card_root,
167
+ storage_config=config,
168
+ )
169
+ card_results.append(result)
170
+ work_id = str(work.get("platform_work_id") or "").strip()
171
+ if work_id:
172
+ written_work_ids.add(work_id)
173
+ batch_card_count += 1
174
+
175
+ persist_homepage_runtime_artifacts(
176
+ platform="xiaohongshu",
177
+ profile=profile,
178
+ works=all_works,
179
+ card_root=card_root,
180
+ extract_trace=batch_trace,
181
+ request_id=request_id,
182
+ checkpoint=event.get("checkpoint") if isinstance(event.get("checkpoint"), dict) else {},
183
+ run_status="in_progress",
184
+ last_completed_batch_id=batch_id,
185
+ )
186
+ progress.progress(
187
+ stage="author_home.persist.batch",
188
+ message="xiaohongshu author_home batch persisted",
189
+ data={
190
+ "batch_id": batch_id,
191
+ "batch_cards": batch_card_count,
192
+ "completed_count": (event.get("checkpoint") if isinstance(event.get("checkpoint"), dict) else {}).get("processed_works"),
193
+ "pending_count": (event.get("checkpoint") if isinstance(event.get("checkpoint"), dict) else {}).get("pending_works"),
194
+ },
195
+ )
196
+
170
197
  asr_bundle = enrich_author_home_asr(
171
198
  platform="xiaohongshu",
172
199
  works=works,
@@ -179,14 +206,18 @@ def run_xiaohongshu_creator_home(
179
206
  xhs_submit_backoff_ms=int(config_get(config, "asr_strategy.submit_retry.xiaohongshu_note.backoff_ms", 0)),
180
207
  timeout_retry_enabled=bool(config_get(config, "asr_strategy.u2_timeout_retry.enabled", True)),
181
208
  timeout_retry_max_retries=int(config_get(config, "asr_strategy.u2_timeout_retry.max_retries", 0)),
209
+ checkpoint=checkpoint,
210
+ request_id=request_id,
211
+ on_batch_complete=_persist_batch,
182
212
  progress=progress.child(scope="author_home.asr"),
183
213
  )
184
214
  works = list(asr_bundle.get("works") or [])
185
215
 
186
- card_root = resolve_storage_paths(config)["card_root"]
187
- card_results: List[Dict[str, Any]] = []
188
216
  if write_card:
189
217
  for work in works:
218
+ work_id = str(work.get("platform_work_id") or "").strip()
219
+ if work_id and work_id in written_work_ids:
220
+ continue
190
221
  card_results.append(
191
222
  write_work_fact_card(
192
223
  payload=work,
@@ -196,19 +227,23 @@ def run_xiaohongshu_creator_home(
196
227
  )
197
228
  )
198
229
 
199
- request_id = ensure_request_id(
200
- raw.get("request_id") or profile.get("request_id"),
201
- fallback_seed=normalized_input_value or input_value,
202
- )
203
- extract_trace = list(raw.get("extract_trace") or []) + list(asr_bundle.get("trace") or [])
204
-
205
- collection_artifacts = _write_collection_artifacts(
230
+ extract_trace = raw_extract_trace + list(asr_bundle.get("trace") or [])
231
+ checkpoint_out = asr_bundle.get("checkpoint") if isinstance(asr_bundle.get("checkpoint"), dict) else {}
232
+ collection_artifacts = persist_homepage_runtime_artifacts(
233
+ platform="xiaohongshu",
206
234
  profile=profile,
207
235
  works=works,
208
236
  card_root=card_root,
209
237
  extract_trace=extract_trace,
210
238
  request_id=request_id,
239
+ checkpoint=checkpoint_out,
240
+ run_status=resolve_homepage_run_status(asr_bundle.get("stats")),
241
+ last_completed_batch_id=str(checkpoint_out.get("last_completed_batch_id") or ""),
211
242
  )
243
+ if int(checkpoint_out.get("pending_works") or 0) <= 0:
244
+ cleared_checkpoint_path = clear_homepage_checkpoint(platform="xiaohongshu", profile=profile, card_root=card_root)
245
+ if cleared_checkpoint_path:
246
+ collection_artifacts["checkpoint_cleared_path"] = cleared_checkpoint_path
212
247
 
213
248
  normalized_profile = dict(profile)
214
249
  normalized_profile["request_id"] = request_id
@@ -359,18 +359,10 @@ def _markdown_lines(card: Dict[str, Any]) -> List[str]:
359
359
  lines = _frontmatter_lines(card)
360
360
  primary_text = _safe_text(card.get("primary_text"))
361
361
  caption_raw = _safe_text(card.get("caption_raw"))
362
- subtitle_raw = _safe_text(card.get("subtitle_raw"))
363
- asr_raw = _safe_text(card.get("asr_raw"))
364
362
 
365
363
  lines.extend(["", "## 主文本", primary_text or ""])
366
364
  if caption_raw and caption_raw != primary_text:
367
365
  lines.extend(["", "## 原始文案", caption_raw])
368
- if asr_raw and subtitle_raw and asr_raw == subtitle_raw and asr_raw != primary_text:
369
- lines.extend(["", "## 原始转写", asr_raw])
370
- elif subtitle_raw and subtitle_raw != primary_text:
371
- lines.extend(["", "## 原始字幕", subtitle_raw])
372
- if asr_raw and asr_raw not in {primary_text, subtitle_raw}:
373
- lines.extend(["", "## 原始转写", asr_raw])
374
366
  if card.get("missing_fields"):
375
367
  lines.extend(["", "## 缺失字段"])
376
368
  for entry in card["missing_fields"]: