@tikomni/skills 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tikomni/skills",
3
- "version": "1.0.0",
3
+ "version": "1.0.2",
4
4
  "description": "TikOmni skill installer CLI for structured social media crawling in Codex, Claude Code, and OpenClaw",
5
5
  "license": "MIT",
6
6
  "homepage": "https://github.com/mark-ly-wang/TikOmni-Skills#readme",
@@ -42,6 +42,7 @@
42
42
  - Fact fields for the Markdown card go into frontmatter. Do not emit a separate `## Facts` section.
43
43
  - The work-library directory writes only the Markdown card and no extra `.json` sidecar in the same directory.
44
44
  - `primary_text` is the text that is best suited for reading and indexing in the current task.
45
+ - `asr_raw` and `subtitle_raw` are internal preserved text fields. Keep them in the normalized card data, but do not render them as standalone sections in the Markdown body.
45
46
  - `play_count` may be `null`. Leave it empty when missing, and keep `0` only when the platform explicitly returns `0`.
46
47
  - Preferred order for video works:
47
48
  - `subtitle_raw`
@@ -8,7 +8,7 @@ import time
8
8
  import urllib.error
9
9
  import urllib.request
10
10
  from urllib.parse import urlparse, urlunparse
11
- from typing import Any, Callable, Dict, List, Optional
11
+ from typing import Any, Callable, Dict, List, Optional, Tuple
12
12
 
13
13
  from scripts.core.tikomni_common import (
14
14
  call_json_api,
@@ -23,6 +23,21 @@ from scripts.core.u3_fallback import run_u3_public_url_fallback
23
23
 
24
24
  U2_BATCH_SUBMIT_HARD_LIMIT = 100
25
25
  DEFAULT_U2_PENDING_TIMEOUT_SEC = 60
26
+ SUMMARY_TEXT_FIELDS = (
27
+ "full_text",
28
+ "transcript_text",
29
+ "transcription_text",
30
+ "result_text",
31
+ "summary_text",
32
+ "transcript",
33
+ "transcription",
34
+ "result",
35
+ "content",
36
+ "text",
37
+ )
38
+ SEGMENT_CONTAINER_FIELDS = ("sentences", "segments", "paragraphs")
39
+ SEGMENT_TEXT_FIELDS = ("text", "sentence", "content", "paragraph", "transcript_text")
40
+ CHAR_SPACED_RUN_RE = re.compile(r"(?:[A-Za-z0-9\u4e00-\u9fff]{1,4}\s+){5,}[A-Za-z0-9\u4e00-\u9fff]{1,4}")
26
41
 
27
42
 
28
43
  def clamp_u2_batch_submit_size(size: int, *, default: int = 50, hard_limit: int = U2_BATCH_SUBMIT_HARD_LIMIT) -> int:
@@ -251,6 +266,33 @@ def clean_transcript_text(raw_text: Any) -> str:
251
266
  return normalize_text(raw_text)
252
267
 
253
268
 
269
+ def _text_signature(text: str) -> str:
270
+ return re.sub(r"[\W_]+", "", clean_transcript_text(text)).lower()
271
+
272
+
273
+ def _is_char_spaced_noise_sequence(text: str) -> bool:
274
+ tokens = [token for token in clean_transcript_text(text).split(" ") if token]
275
+ if len(tokens) < 6:
276
+ return False
277
+ single_char_tokens = sum(1 for token in tokens if len(token) == 1)
278
+ short_tokens = sum(1 for token in tokens if len(token) <= 2)
279
+ cjk_tokens = sum(1 for token in tokens if any("\u4e00" <= char <= "\u9fff" for char in token))
280
+ return (
281
+ single_char_tokens >= 4
282
+ and short_tokens / max(len(tokens), 1) >= 0.75
283
+ and cjk_tokens / max(len(tokens), 1) >= 0.5
284
+ )
285
+
286
+
287
+ def _strip_char_spaced_noise_runs(text: str) -> str:
288
+ def _replace(match: re.Match[str]) -> str:
289
+ chunk = match.group(0)
290
+ return " " if _is_char_spaced_noise_sequence(chunk) else chunk
291
+
292
+ cleaned = CHAR_SPACED_RUN_RE.sub(_replace, text)
293
+ return re.sub(r"\s+", " ", cleaned).strip()
294
+
295
+
254
296
  def _ensure_sentence_end(text: str) -> str:
255
297
  if not text:
256
298
  return text
@@ -264,13 +306,36 @@ def derive_asr_clean_text(asr_raw: Any, legacy_clean: Any = None) -> str:
264
306
  if not base:
265
307
  return ""
266
308
 
267
- denoised = re.sub(r"\b(嗯|啊|呃|额|那个|这个|然后|就是)\b", " ", base)
309
+ denoised = _strip_char_spaced_noise_runs(base)
310
+ denoised = re.sub(r"\b(嗯|啊|呃|额|那个|这个|然后|就是)\b", " ", denoised)
268
311
  denoised = re.sub(r"(嗯+|啊+|呃+)", " ", denoised)
269
312
  denoised = re.sub(r"(就是就是|然后然后|这个这个|那个那个)", " ", denoised)
270
313
  denoised = re.sub(r"\s+", " ", denoised).strip()
271
314
 
272
315
  units = [clean_transcript_text(part) for part in re.split(r"[。!?!?;;\n]+", denoised)]
273
- sentences = [_ensure_sentence_end(unit) for unit in units if unit]
316
+ sentences: List[str] = []
317
+ signatures: List[str] = []
318
+ for unit in units:
319
+ if not unit or _is_char_spaced_noise_sequence(unit):
320
+ continue
321
+ sentence = _ensure_sentence_end(unit)
322
+ signature = _text_signature(sentence)
323
+ if not signature:
324
+ continue
325
+ duplicate = False
326
+ for existing in signatures:
327
+ if signature == existing:
328
+ duplicate = True
329
+ break
330
+ smaller = signature if len(signature) <= len(existing) else existing
331
+ larger = existing if len(signature) <= len(existing) else signature
332
+ if len(smaller) >= 12 and smaller in larger:
333
+ duplicate = True
334
+ break
335
+ if duplicate:
336
+ continue
337
+ signatures.append(signature)
338
+ sentences.append(sentence)
274
339
  if not sentences:
275
340
  fallback = _ensure_sentence_end(denoised)
276
341
  return fallback if fallback else ""
@@ -292,6 +357,94 @@ def derive_asr_clean_text(asr_raw: Any, legacy_clean: Any = None) -> str:
292
357
  return "\n\n".join(paragraphs)
293
358
 
294
359
 
360
+ def _extract_summary_text_from_node(node: Dict[str, Any]) -> Tuple[str, str]:
361
+ for key in SUMMARY_TEXT_FIELDS:
362
+ value = node.get(key)
363
+ if isinstance(value, str):
364
+ cleaned = clean_transcript_text(value)
365
+ if cleaned:
366
+ return cleaned, key
367
+ return "", ""
368
+
369
+
370
+ def _append_segment_lines(node: Any, lines: List[str]) -> None:
371
+ if isinstance(node, str):
372
+ cleaned = clean_transcript_text(node)
373
+ if cleaned:
374
+ lines.append(cleaned)
375
+ return
376
+ if isinstance(node, dict):
377
+ for key in SEGMENT_TEXT_FIELDS:
378
+ value = node.get(key)
379
+ if isinstance(value, str):
380
+ cleaned = clean_transcript_text(value)
381
+ if cleaned:
382
+ lines.append(cleaned)
383
+ break
384
+ return
385
+ if isinstance(node, list):
386
+ for item in node:
387
+ _append_segment_lines(item, lines)
388
+
389
+
390
+ def _extract_segment_text_from_node(node: Dict[str, Any]) -> str:
391
+ lines: List[str] = []
392
+ for key in SEGMENT_CONTAINER_FIELDS:
393
+ if key not in node:
394
+ continue
395
+ _append_segment_lines(node.get(key), lines)
396
+ if lines:
397
+ break
398
+ if not lines:
399
+ return ""
400
+
401
+ deduped: List[str] = []
402
+ seen = set()
403
+ for line in lines:
404
+ signature = _text_signature(line)
405
+ if not signature or signature in seen or _is_char_spaced_noise_sequence(line):
406
+ continue
407
+ seen.add(signature)
408
+ deduped.append(line)
409
+ return "\n".join(deduped).strip()
410
+
411
+
412
+ def _extract_canonical_transcript_from_node(node: Dict[str, Any]) -> Dict[str, Any]:
413
+ summary_text, summary_field = _extract_summary_text_from_node(node)
414
+ if summary_text:
415
+ return {
416
+ "transcript_text": summary_text,
417
+ "summary_field_used": summary_field,
418
+ "segment_fallback_used": False,
419
+ "canonical_text_source": f"summary:{summary_field}",
420
+ }
421
+
422
+ segment_text = _extract_segment_text_from_node(node)
423
+ if segment_text:
424
+ return {
425
+ "transcript_text": segment_text,
426
+ "summary_field_used": "",
427
+ "segment_fallback_used": True,
428
+ "canonical_text_source": "segments",
429
+ }
430
+
431
+ fallback_text = clean_transcript_text(extract_transcript_text(node))
432
+ if fallback_text:
433
+ return {
434
+ "transcript_text": fallback_text,
435
+ "summary_field_used": "",
436
+ "segment_fallback_used": True,
437
+ "canonical_text_source": "deep_search_fallback",
438
+ }
439
+
440
+ return {
441
+ "transcript_text": "",
442
+ "summary_field_used": "",
443
+ "segment_fallback_used": False,
444
+ "canonical_text_source": "missing",
445
+ }
446
+
447
+
295
448
  def extract_u2_task_metrics(payload: Any) -> Dict[str, Any]:
296
449
  metrics = deep_find_first(payload, ["task_metrics", "metrics"])
297
450
  return metrics if isinstance(metrics, dict) else {}
@@ -349,16 +502,8 @@ def extract_u2_batch_result_items(payload: Any) -> List[Dict[str, Any]]:
349
502
  )
350
503
  file_url = normalize_media_url(str(raw_file_url or ""))
351
504
  if file_url:
352
- transcript = clean_transcript_text(
353
- node.get("transcript_text")
354
- or node.get("text")
355
- or node.get("transcript")
356
- or node.get("transcription")
357
- or node.get("content")
358
- or ""
359
- )
360
- if not transcript:
361
- transcript = clean_transcript_text(extract_transcript_text(node))
505
+ canonical = _extract_canonical_transcript_from_node(node)
506
+ transcript = clean_transcript_text(canonical.get("transcript_text"))
362
507
 
363
508
  status = _status_upper(node.get("status") or node.get("task_status") or node.get("state"))
364
509
  error_reason = str(node.get("error_reason") or node.get("error") or "").strip()
@@ -372,6 +517,9 @@ def extract_u2_batch_result_items(payload: Any) -> List[Dict[str, Any]]:
372
517
  "transcription_url": transcription_url,
373
518
  "error_reason": error_reason,
374
519
  "ok": ok,
520
+ "summary_field_used": canonical.get("summary_field_used", ""),
521
+ "segment_fallback_used": bool(canonical.get("segment_fallback_used")),
522
+ "canonical_text_source": canonical.get("canonical_text_source", "missing"),
375
523
  }
376
524
 
377
525
  existing = found.get(file_url)
@@ -380,12 +528,16 @@ def extract_u2_batch_result_items(payload: Any) -> List[Dict[str, Any]]:
380
528
  else:
381
529
  old_score = (
382
530
  1 if existing.get("ok") else 0,
531
+ 1 if not existing.get("segment_fallback_used") else 0,
532
+ 1 if existing.get("summary_field_used") else 0,
383
533
  len(str(existing.get("transcript_text") or "")),
384
534
  1 if existing.get("transcription_url") else 0,
385
535
  1 if not existing.get("error_reason") else 0,
386
536
  )
387
537
  new_score = (
388
538
  1 if candidate.get("ok") else 0,
539
+ 1 if not candidate.get("segment_fallback_used") else 0,
540
+ 1 if candidate.get("summary_field_used") else 0,
389
541
  len(str(candidate.get("transcript_text") or "")),
390
542
  1 if candidate.get("transcription_url") else 0,
391
543
  1 if not candidate.get("error_reason") else 0,
@@ -441,16 +593,8 @@ def map_u2_batch_results_by_item_index(payload: Any) -> Dict[int, Dict[str, Any]
441
593
  item_index_raw = node.get("item_index")
442
594
  item_index = _parse_non_negative_item_index(item_index_raw)
443
595
  if item_index is not None:
444
- transcript = clean_transcript_text(
445
- node.get("transcript_text")
446
- or node.get("text")
447
- or node.get("transcript")
448
- or node.get("transcription")
449
- or node.get("content")
450
- or ""
451
- )
452
- if not transcript:
453
- transcript = clean_transcript_text(extract_transcript_text(node))
596
+ canonical = _extract_canonical_transcript_from_node(node)
597
+ transcript = clean_transcript_text(canonical.get("transcript_text"))
454
598
 
455
599
  status = _status_upper(node.get("task_status") or node.get("status") or node.get("state"))
456
600
  error_reason = str(node.get("error_reason") or node.get("error") or "").strip()
@@ -464,6 +608,9 @@ def map_u2_batch_results_by_item_index(payload: Any) -> Dict[int, Dict[str, Any]
464
608
  "error_reason": error_reason,
465
609
  "transcription_url": transcription_url,
466
610
  "ok": ok,
611
+ "summary_field_used": canonical.get("summary_field_used", ""),
612
+ "segment_fallback_used": bool(canonical.get("segment_fallback_used")),
613
+ "canonical_text_source": canonical.get("canonical_text_source", "missing"),
467
614
  }
468
615
 
469
616
  existing = mapped.get(item_index)
@@ -472,12 +619,16 @@ def map_u2_batch_results_by_item_index(payload: Any) -> Dict[int, Dict[str, Any]
472
619
  else:
473
620
  old_score = (
474
621
  1 if existing.get("ok") else 0,
622
+ 1 if not existing.get("segment_fallback_used") else 0,
623
+ 1 if existing.get("summary_field_used") else 0,
475
624
  len(str(existing.get("transcript_text") or "")),
476
625
  1 if existing.get("transcription_url") else 0,
477
626
  1 if not existing.get("error_reason") else 0,
478
627
  )
479
628
  new_score = (
480
629
  1 if candidate.get("ok") else 0,
630
+ 1 if not candidate.get("segment_fallback_used") else 0,
631
+ 1 if candidate.get("summary_field_used") else 0,
481
632
  len(str(candidate.get("transcript_text") or "")),
482
633
  1 if candidate.get("transcription_url") else 0,
483
634
  1 if not candidate.get("error_reason") else 0,
@@ -506,29 +657,102 @@ def _extract_transcript_from_transcription_payload(payload: Any) -> str:
506
657
  except Exception:
507
658
  return ""
508
659
 
509
- transcript = clean_transcript_text(deep_find_first(payload, ["full_text"]))
510
- if transcript:
511
- return transcript
660
+ for key in SUMMARY_TEXT_FIELDS:
661
+ transcript = clean_transcript_text(deep_find_first(payload, [key]))
662
+ if transcript:
663
+ return transcript
664
+
665
+ for key in SEGMENT_CONTAINER_FIELDS:
666
+ segments = deep_find_first(payload, [key])
667
+ if segments is None:
668
+ continue
669
+ lines: List[str] = []
670
+ _append_segment_lines(segments, lines)
671
+ deduped: List[str] = []
672
+ seen = set()
673
+ for line in lines:
674
+ signature = _text_signature(line)
675
+ if not signature or signature in seen or _is_char_spaced_noise_sequence(line):
676
+ continue
677
+ seen.add(signature)
678
+ deduped.append(line)
679
+ if deduped:
680
+ return "\n".join(deduped)
512
681
 
513
682
  transcript = clean_transcript_text(extract_transcript_text(payload))
514
683
  if transcript:
515
684
  return transcript
516
685
 
517
- sentences = deep_find_first(payload, ["sentences"])
518
- if isinstance(sentences, list):
686
+ return ""
687
+
688
+
689
+ def _extract_transcript_bundle_from_transcription_payload(payload: Any) -> Dict[str, Any]:
690
+ if isinstance(payload, str):
691
+ text = clean_transcript_text(payload)
692
+ if text:
693
+ return {
694
+ "transcript_text": text,
695
+ "summary_field_used": "raw_string",
696
+ "segment_fallback_used": False,
697
+ "canonical_text_source": "summary:raw_string",
698
+ }
699
+ try:
700
+ payload = json.loads(payload)
701
+ except Exception:
702
+ return {
703
+ "transcript_text": "",
704
+ "summary_field_used": "",
705
+ "segment_fallback_used": False,
706
+ "canonical_text_source": "missing",
707
+ }
708
+
709
+ for key in SUMMARY_TEXT_FIELDS:
710
+ transcript = clean_transcript_text(deep_find_first(payload, [key]))
711
+ if transcript:
712
+ return {
713
+ "transcript_text": transcript,
714
+ "summary_field_used": key,
715
+ "segment_fallback_used": False,
716
+ "canonical_text_source": f"summary:{key}",
717
+ }
718
+
719
+ for key in SEGMENT_CONTAINER_FIELDS:
720
+ segments = deep_find_first(payload, [key])
721
+ if segments is None:
722
+ continue
519
723
  lines: List[str] = []
520
- for sentence in sentences:
521
- if not isinstance(sentence, dict):
724
+ _append_segment_lines(segments, lines)
725
+ deduped: List[str] = []
726
+ seen = set()
727
+ for line in lines:
728
+ signature = _text_signature(line)
729
+ if not signature or signature in seen or _is_char_spaced_noise_sequence(line):
522
730
  continue
523
- line = clean_transcript_text(
524
- sentence.get("text") or sentence.get("sentence") or sentence.get("content")
525
- )
526
- if line:
527
- lines.append(line)
528
- if lines:
529
- return "\n".join(lines)
731
+ seen.add(signature)
732
+ deduped.append(line)
733
+ if deduped:
734
+ return {
735
+ "transcript_text": "\n".join(deduped),
736
+ "summary_field_used": "",
737
+ "segment_fallback_used": True,
738
+ "canonical_text_source": f"segments:{key}",
739
+ }
530
740
 
531
- return ""
741
+ transcript = clean_transcript_text(extract_transcript_text(payload))
742
+ if transcript:
743
+ return {
744
+ "transcript_text": transcript,
745
+ "summary_field_used": "",
746
+ "segment_fallback_used": True,
747
+ "canonical_text_source": "deep_search_fallback",
748
+ }
749
+
750
+ return {
751
+ "transcript_text": "",
752
+ "summary_field_used": "",
753
+ "segment_fallback_used": False,
754
+ "canonical_text_source": "missing",
755
+ }
532
756
 
533
757
 
534
758
  def fetch_transcription_text_by_url(*, transcription_url: str, timeout_ms: int) -> Dict[str, Any]:
@@ -573,13 +797,17 @@ def fetch_transcription_text_by_url(*, transcription_url: str, timeout_ms: int)
573
797
  except Exception:
574
798
  payload = raw_text
575
799
 
576
- transcript = _extract_transcript_from_transcription_payload(payload)
800
+ transcript_bundle = _extract_transcript_bundle_from_transcription_payload(payload)
801
+ transcript = transcript_bundle.get("transcript_text", "")
577
802
  if transcript:
578
803
  return {
579
804
  "ok": True,
580
805
  "transcription_url": url,
581
806
  "error_reason": "",
582
807
  "transcript_text": transcript,
808
+ "summary_field_used": transcript_bundle.get("summary_field_used", ""),
809
+ "segment_fallback_used": bool(transcript_bundle.get("segment_fallback_used")),
810
+ "canonical_text_source": transcript_bundle.get("canonical_text_source", "missing"),
583
811
  }
584
812
 
585
813
  return {
@@ -587,6 +815,9 @@ def fetch_transcription_text_by_url(*, transcription_url: str, timeout_ms: int)
587
815
  "transcription_url": url,
588
816
  "error_reason": "transcription_payload_empty",
589
817
  "transcript_text": "",
818
+ "summary_field_used": "",
819
+ "segment_fallback_used": False,
820
+ "canonical_text_source": "missing",
590
821
  }
591
822
 
592
823
 
@@ -620,6 +851,9 @@ def hydrate_u2_batch_results_from_transcription_urls(
620
851
  if fetched_text:
621
852
  transcript = fetched_text
622
853
  candidate["transcript_text"] = fetched_text
854
+ candidate["summary_field_used"] = fetch_result.get("summary_field_used", "")
855
+ candidate["segment_fallback_used"] = bool(fetch_result.get("segment_fallback_used"))
856
+ candidate["canonical_text_source"] = fetch_result.get("canonical_text_source", "missing")
623
857
  elif not candidate.get("error_reason"):
624
858
  candidate["error_reason"] = fetch_result.get("error_reason") or "transcription_payload_empty"
625
859
 
@@ -3,10 +3,12 @@
3
3
 
4
4
  from __future__ import annotations
5
5
 
6
+ from copy import deepcopy
6
7
  import json
7
8
  import re
8
9
  import urllib.request
9
- from typing import Any, Dict, List, Optional, Tuple
10
+ from datetime import datetime
11
+ from typing import Any, Callable, Dict, List, Optional, Set, Tuple
10
12
 
11
13
  from scripts.core.progress_report import ProgressReporter
12
14
  from scripts.core.tikomni_common import normalize_text
@@ -26,6 +28,20 @@ XHS_U3_U2_BATCH_SIZE = 20
26
28
  U2_GATE_MIN_DURATION_MS = 13000
27
29
  U2_GATE_MAX_DURATION_MS = 1800000
28
30
  U2_GATE_RULE = "is_video && 13000<duration_ms<=1800000 && video_download_url_present"
31
+ CHECKPOINT_WORK_FIELDS = (
32
+ "platform_work_id",
33
+ "subtitle_raw",
34
+ "subtitle_source",
35
+ "asr_raw",
36
+ "asr_clean",
37
+ "primary_text",
38
+ "primary_text_source",
39
+ "analysis_eligibility",
40
+ "analysis_exclusion_reason",
41
+ "asr_status",
42
+ "asr_error_reason",
43
+ "asr_source",
44
+ )
29
45
 
30
46
 
31
47
  def _to_int_or_none(value: Any) -> Optional[int]:
@@ -467,6 +483,108 @@ def _dedupe_works_by_platform_id(works: List[Dict[str, Any]]) -> Tuple[List[Dict
467
483
  return deduped, duplicates
468
484
 
469
485
 
486
+ def _snapshot_work_for_checkpoint(work: Dict[str, Any]) -> Dict[str, Any]:
487
+ snapshot: Dict[str, Any] = {}
488
+ for key in CHECKPOINT_WORK_FIELDS:
489
+ if key in work:
490
+ snapshot[key] = deepcopy(work.get(key))
491
+ platform_work_id = normalize_text(work.get("platform_work_id"))
492
+ if platform_work_id:
493
+ snapshot["platform_work_id"] = platform_work_id
494
+ return snapshot
495
+
496
+
497
+ def _restore_completed_work_payloads(*, works: List[Dict[str, Any]], checkpoint: Dict[str, Any]) -> int:
498
+ completed_payloads = checkpoint.get("completed_work_payloads")
499
+ if not isinstance(completed_payloads, dict):
500
+ return 0
501
+
502
+ restored = 0
503
+ for work in works:
504
+ if not isinstance(work, dict):
505
+ continue
506
+ work_id = normalize_text(work.get("platform_work_id"))
507
+ if not work_id:
508
+ continue
509
+ payload = completed_payloads.get(work_id)
510
+ if not isinstance(payload, dict):
511
+ continue
512
+ work.update(deepcopy(payload))
513
+ restored += 1
514
+ return restored
515
+
516
+
517
+ def _count_processed_results(*, works: List[Dict[str, Any]], completed_ids: Set[str]) -> Tuple[int, int, List[str]]:
518
+ success_count = 0
519
+ failed_ids: List[str] = []
520
+ completed_id_set = {normalize_text(item) for item in completed_ids if normalize_text(item)}
521
+
522
+ for work in works:
523
+ if not isinstance(work, dict):
524
+ continue
525
+ work_id = normalize_text(work.get("platform_work_id"))
526
+ if not work_id or work_id not in completed_id_set:
527
+ continue
528
+ if str(work.get("analysis_eligibility") or "") == "eligible":
529
+ success_count += 1
530
+ else:
531
+ failed_ids.append(work_id)
532
+
533
+ failed_ids = sorted(set(failed_ids))
534
+ return success_count, len(failed_ids), failed_ids
535
+
536
+
537
+ def _build_checkpoint_snapshot(
538
+ *,
539
+ platform: str,
540
+ works: List[Dict[str, Any]],
541
+ completed_ids: Set[str],
542
+ batch_size: int,
543
+ batches_total: int,
544
+ batches_submitted: int,
545
+ batches_completed: int,
546
+ batch_mapped: int,
547
+ batch_unmapped: int,
548
+ fallback_singles: int,
549
+ request_id: str,
550
+ last_completed_batch_id: str,
551
+ ) -> Dict[str, Any]:
552
+ completed_id_set = {normalize_text(item) for item in completed_ids if normalize_text(item)}
553
+ success_count, failed_count, failed_work_ids = _count_processed_results(works=works, completed_ids=completed_id_set)
554
+ completed_work_payloads: Dict[str, Any] = {}
555
+ for work in works:
556
+ if not isinstance(work, dict):
557
+ continue
558
+ work_id = normalize_text(work.get("platform_work_id"))
559
+ if not work_id or work_id not in completed_id_set:
560
+ continue
561
+ completed_work_payloads[work_id] = _snapshot_work_for_checkpoint(work)
562
+
563
+ return {
564
+ "platform": platform,
565
+ "request_id": request_id or None,
566
+ "completed_work_ids": sorted(completed_id_set),
567
+ "failed_work_ids": failed_work_ids,
568
+ "completed_work_payloads": completed_work_payloads,
569
+ "batch_size": batch_size,
570
+ "batches_total": batches_total,
571
+ "batches_submitted": batches_submitted,
572
+ "batches_completed": batches_completed,
573
+ "batch_mapped": batch_mapped,
574
+ "batch_unmapped": batch_unmapped,
575
+ "fallback_singles": fallback_singles,
576
+ "total_works": len(works),
577
+ "processed_works": len(completed_id_set),
578
+ "success_works": success_count,
579
+ "failed_works": failed_count,
580
+ "pending_works": max(0, len(works) - len(completed_id_set)),
581
+ "last_completed_batch_id": last_completed_batch_id,
582
+ "updated_at": datetime.now().isoformat(timespec="seconds"),
583
+ # backward-compatible checkpoint fields
584
+ "refill_attempted": fallback_singles,
585
+ }
586
+
587
+
470
588
  def _fallback_none_result(reason: str) -> Dict[str, Any]:
471
589
  return {
472
590
  "subtitle_raw": "",
@@ -804,12 +922,15 @@ def enrich_author_home_asr(
804
922
  timeout_retry_max_retries: int = 3,
805
923
  batch_size: int = DEFAULT_BATCH_SUBMIT_SIZE,
806
924
  checkpoint: Optional[Dict[str, Any]] = None,
925
+ request_id: str = "",
926
+ on_batch_complete: Optional[Callable[[Dict[str, Any]], None]] = None,
807
927
  progress: Optional[ProgressReporter] = None,
808
928
  ) -> Dict[str, Any]:
809
929
  trace: List[Dict[str, Any]] = []
810
930
  deduped_works, duplicate_count = _dedupe_works_by_platform_id(works)
811
931
 
812
932
  checkpoint_in = checkpoint if isinstance(checkpoint, dict) else {}
933
+ restored_payloads = _restore_completed_work_payloads(works=deduped_works, checkpoint=checkpoint_in)
813
934
  completed_ids = {
814
935
  normalize_text(item)
815
936
  for item in (checkpoint_in.get("completed_work_ids") or [])
@@ -833,6 +954,7 @@ def enrich_author_home_asr(
833
954
  "deduped_count": len(deduped_works),
834
955
  "duplicate_count": duplicate_count,
835
956
  "resume_completed": len(completed_ids),
957
+ "resume_payloads_restored": restored_payloads,
836
958
  "requested_batch_size": requested_batch,
837
959
  "batch_size": effective_batch,
838
960
  "batch_size_clamped": requested_batch != effective_batch,
@@ -847,6 +969,7 @@ def enrich_author_home_asr(
847
969
  "input_count": len(works),
848
970
  "deduped_count": len(deduped_works),
849
971
  "resume_completed": len(completed_ids),
972
+ "resume_payloads_restored": restored_payloads,
850
973
  "batch_size": effective_batch,
851
974
  },
852
975
  )
@@ -874,8 +997,9 @@ def enrich_author_home_asr(
874
997
  data={"queued_count": len(queue), "batch_total": batch_total},
875
998
  )
876
999
 
877
- success_count = 0
878
- fallback_none_count = 0
1000
+ restored_success_count, restored_failed_count, _ = _count_processed_results(works=deduped_works, completed_ids=completed_ids)
1001
+ success_count = restored_success_count
1002
+ fallback_none_count = restored_failed_count
879
1003
  submitted_batches = 0
880
1004
  completed_batches = 0
881
1005
  batch_mapped_count = 0
@@ -1124,34 +1248,66 @@ def enrich_author_home_asr(
1124
1248
  },
1125
1249
  )
1126
1250
 
1127
- failed_work_ids = sorted(
1128
- list(
1129
- {
1130
- normalize_text(work.get("platform_work_id"))
1131
- for work in deduped_works
1132
- if isinstance(work, dict)
1133
- and normalize_text(work.get("platform_work_id"))
1134
- and str(work.get("analysis_eligibility") or "") != "eligible"
1135
- }
1251
+ checkpoint_snapshot = _build_checkpoint_snapshot(
1252
+ platform=platform,
1253
+ works=deduped_works,
1254
+ completed_ids=completed_ids,
1255
+ batch_size=effective_batch,
1256
+ batches_total=batch_total,
1257
+ batches_submitted=submitted_batches,
1258
+ batches_completed=completed_batches,
1259
+ batch_mapped=batch_mapped_count,
1260
+ batch_unmapped=batch_unmapped_count,
1261
+ fallback_singles=fallback_single_count,
1262
+ request_id=request_id,
1263
+ last_completed_batch_id=batch_id,
1136
1264
  )
1265
+ if on_batch_complete is not None:
1266
+ on_batch_complete(
1267
+ {
1268
+ "platform": platform,
1269
+ "batch_id": batch_id,
1270
+ "batch_index": batch_index + 1,
1271
+ "batch_total": batch_total,
1272
+ "batch_works": batch,
1273
+ "works": deduped_works,
1274
+ "trace": list(trace),
1275
+ "checkpoint": checkpoint_snapshot,
1276
+ "stats": {
1277
+ "total": len(deduped_works),
1278
+ "success": success_count,
1279
+ "fallback_none": fallback_none_count,
1280
+ "duplicates_dropped": duplicate_count,
1281
+ "submitted_batches": submitted_batches,
1282
+ "completed_batches": completed_batches,
1283
+ "batch_mapped": batch_mapped_count,
1284
+ "batch_unmapped": batch_unmapped_count,
1285
+ "fallback_singles": fallback_single_count,
1286
+ "refill_attempted": fallback_single_count,
1287
+ "refill_failed": checkpoint_snapshot.get("failed_works", 0),
1288
+ },
1289
+ }
1290
+ )
1291
+
1292
+ success_count, fallback_none_count, failed_work_ids = _count_processed_results(
1293
+ works=deduped_works,
1294
+ completed_ids=completed_ids,
1137
1295
  )
1138
1296
 
1139
- checkpoint_out = {
1140
- "platform": platform,
1141
- "completed_work_ids": sorted(completed_ids),
1142
- "failed_work_ids": failed_work_ids,
1143
- "batch_size": effective_batch,
1144
- "batches_total": batch_total,
1145
- "batches_submitted": submitted_batches,
1146
- "batches_completed": completed_batches,
1147
- "batch_mapped": batch_mapped_count,
1148
- "batch_unmapped": batch_unmapped_count,
1149
- "fallback_singles": fallback_single_count,
1150
- "total_works": len(deduped_works),
1151
- "processed_works": len(completed_ids),
1152
- # backward-compatible checkpoint fields
1153
- "refill_attempted": fallback_single_count,
1154
- }
1297
+ checkpoint_out = _build_checkpoint_snapshot(
1298
+ platform=platform,
1299
+ works=deduped_works,
1300
+ completed_ids=completed_ids,
1301
+ batch_size=effective_batch,
1302
+ batches_total=batch_total,
1303
+ batches_submitted=submitted_batches,
1304
+ batches_completed=completed_batches,
1305
+ batch_mapped=batch_mapped_count,
1306
+ batch_unmapped=batch_unmapped_count,
1307
+ fallback_singles=fallback_single_count,
1308
+ request_id=request_id,
1309
+ last_completed_batch_id=f"batch-{batch_total:03d}" if batch_total > 0 else normalize_text(checkpoint_in.get("last_completed_batch_id")),
1310
+ )
1155
1311
 
1156
1312
  stats = {
1157
1313
  "total": len(deduped_works),
@@ -0,0 +1,173 @@
1
+ #!/usr/bin/env python3
2
+ """Shared runtime-state helpers for homepage pipelines."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import json
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+ from typing import Any, Dict, List, Optional
10
+
11
+ from scripts.core.storage_router import resolve_author_directory_name
12
+
13
+
14
+ def _safe_text(value: Any) -> str:
15
+ if value is None:
16
+ return ""
17
+ return str(value).strip()
18
+
19
+
20
+ def _now_iso() -> str:
21
+ return datetime.now().isoformat(timespec="seconds")
22
+
23
+
24
def resolve_homepage_author_dir(*, platform: str, profile: Dict[str, Any], card_root: str) -> Path:
    """Resolve the per-author work-library directory, creating it if needed.

    The directory name is derived from the author handle / platform id /
    nickname via the shared storage router, so all homepage pipelines agree
    on the same location.
    """
    dir_name = resolve_author_directory_name(
        platform,
        _safe_text(profile.get("author_handle")),
        _safe_text(profile.get("platform_author_id")),
        _safe_text(profile.get("nickname")),
    )
    target = Path(card_root) / "内容系统" / "作品库" / dir_name
    target.mkdir(parents=True, exist_ok=True)
    return target
34
+
35
+
36
def load_homepage_checkpoint(*, platform: str, profile: Dict[str, Any], card_root: str) -> Dict[str, Any]:
    """Best-effort read of the per-author ASR checkpoint; ``{}`` on any problem.

    A missing, unreadable, malformed, or non-dict checkpoint never aborts the
    run — callers simply start from scratch.
    """
    author_dir = resolve_homepage_author_dir(platform=platform, profile=profile, card_root=card_root)
    path = author_dir / "_homepage_asr_checkpoint.json"
    if not path.is_file():
        return {}
    try:
        loaded = json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        # Corrupt or unreadable checkpoint data is treated as "no checkpoint".
        return {}
    if isinstance(loaded, dict):
        return loaded
    return {}
46
+
47
+
48
def clear_homepage_checkpoint(*, platform: str, profile: Dict[str, Any], card_root: str) -> Optional[str]:
    """Delete the per-author ASR checkpoint file if present.

    Returns:
        The removed file's path as a string, or ``None`` when no checkpoint
        file existed.
    """
    author_dir = resolve_homepage_author_dir(platform=platform, profile=profile, card_root=card_root)
    checkpoint_path = author_dir / "_homepage_asr_checkpoint.json"
    try:
        # Unlink directly instead of exists()+unlink(): the original pair had a
        # TOCTOU race where another process could remove the file between the
        # existence check and the deletion, turning unlink() into a crash.
        checkpoint_path.unlink()
    except FileNotFoundError:
        return None
    return str(checkpoint_path)
55
+
56
+
57
def persist_homepage_runtime_artifacts(
    *,
    platform: str,
    profile: Dict[str, Any],
    works: List[Dict[str, Any]],
    card_root: str,
    extract_trace: List[Dict[str, Any]],
    request_id: str,
    checkpoint: Optional[Dict[str, Any]],
    run_status: str,
    last_completed_batch_id: str = "",
) -> Dict[str, str]:
    """Write the per-author runtime artifacts for a homepage crawl.

    Persists three JSON files under the author's work-library directory:
    ``_creator_profile.json``, ``_work_collection.json``, and (only when a
    non-empty checkpoint is supplied) ``_homepage_asr_checkpoint.json``.
    Returns the paths of all three plus the author directory; note the
    checkpoint path is returned even when the file was not (re)written.
    """
    author_dir = resolve_homepage_author_dir(platform=platform, profile=profile, card_root=card_root)
    # One timestamp shared by all artifacts so they can be correlated.
    updated_at = _now_iso()

    # Normalize the checkpoint's id lists: coerce to stripped strings,
    # drop empties, dedupe via set, and sort for stable JSON output.
    checkpoint_payload = checkpoint if isinstance(checkpoint, dict) else {}
    completed_work_ids = sorted({_safe_text(item) for item in (checkpoint_payload.get("completed_work_ids") or []) if _safe_text(item)})
    failed_work_ids = sorted({_safe_text(item) for item in (checkpoint_payload.get("failed_work_ids") or []) if _safe_text(item)})
    completed_id_set = set(completed_work_ids)
    failed_id_set = set(failed_work_ids)

    # Per-work status: "failed" wins over "completed" when an id appears in
    # both lists; anything in neither list is still "pending".
    collection_items: List[Dict[str, Any]] = []
    for work in works:
        if not isinstance(work, dict):
            continue
        work_id = _safe_text(work.get("platform_work_id"))
        processing_status = "pending"
        if work_id in failed_id_set:
            processing_status = "failed"
        elif work_id in completed_id_set:
            processing_status = "completed"
        collection_items.append(
            {
                "platform_work_id": work_id,
                "title": work.get("title"),
                "published_date": work.get("published_date"),
                "processing_status": processing_status,
            }
        )

    completed_count = len(completed_work_ids)
    failed_count = len(failed_work_ids)
    total_count = len(collection_items)
    # Clamped at 0: completed ids can outnumber the works passed in
    # (e.g. on a resumed run with a shorter works list).
    pending_count = max(0, total_count - completed_count)

    # The profile artifact is the caller's profile plus run bookkeeping.
    creator_profile = dict(profile)
    creator_profile.update(
        {
            "request_id": request_id,
            "extract_trace": extract_trace,
            "run_status": run_status,
            "completed_count": completed_count,
            "failed_count": failed_count,
            "pending_count": pending_count,
            "updated_at": updated_at,
        }
    )

    # The collection artifact carries both the item list and a flattened
    # copy of the checkpoint's batch counters for external inspection.
    work_collection = {
        "platform": platform,
        "platform_author_id": profile.get("platform_author_id"),
        "count": total_count,
        "items": collection_items,
        "request_id": request_id,
        "extract_trace": extract_trace,
        "run_status": run_status,
        "completed_count": completed_count,
        "failed_count": failed_count,
        "pending_count": pending_count,
        "completed_work_ids": completed_work_ids,
        "failed_work_ids": failed_work_ids,
        "batch_size": checkpoint_payload.get("batch_size"),
        "batches_total": checkpoint_payload.get("batches_total"),
        "batches_completed": checkpoint_payload.get("batches_completed"),
        "batch_mapped": checkpoint_payload.get("batch_mapped"),
        "batch_unmapped": checkpoint_payload.get("batch_unmapped"),
        "fallback_singles": checkpoint_payload.get("fallback_singles"),
        # Explicit argument wins; fall back to the checkpoint's own value.
        "last_completed_batch_id": last_completed_batch_id or _safe_text(checkpoint_payload.get("last_completed_batch_id")),
        "updated_at": updated_at,
    }

    if checkpoint_payload:
        checkpoint_to_write = dict(checkpoint_payload)
        checkpoint_to_write["request_id"] = request_id
        checkpoint_to_write["updated_at"] = updated_at
        checkpoint_to_write["last_completed_batch_id"] = last_completed_batch_id or _safe_text(checkpoint_payload.get("last_completed_batch_id"))
    else:
        checkpoint_to_write = {}

    profile_path = author_dir / "_creator_profile.json"
    collection_path = author_dir / "_work_collection.json"
    checkpoint_path = author_dir / "_homepage_asr_checkpoint.json"
    profile_path.write_text(json.dumps(creator_profile, ensure_ascii=False, indent=2), encoding="utf-8")
    collection_path.write_text(json.dumps(work_collection, ensure_ascii=False, indent=2), encoding="utf-8")
    # An empty checkpoint is deliberately NOT written: it neither creates a
    # new file nor clobbers an existing one (clearing is a separate helper).
    if checkpoint_to_write:
        checkpoint_path.write_text(json.dumps(checkpoint_to_write, ensure_ascii=False, indent=2), encoding="utf-8")

    return {
        "author_dir": str(author_dir),
        "creator_profile_path": str(profile_path),
        "work_collection_path": str(collection_path),
        "checkpoint_path": str(checkpoint_path),
    }
160
+
161
+
162
def resolve_homepage_run_status(stats: Optional[Dict[str, Any]]) -> str:
    """Map an ASR stats payload onto a coarse run status.

    Returns "complete" when there was nothing to do or everything succeeded
    with no failures, "partial" when at least one work succeeded, and
    "failed" otherwise. Non-dict input is treated as empty stats.
    """
    data = stats if isinstance(stats, dict) else {}
    total_works = int(data.get("total") or 0)
    succeeded = int(data.get("success") or 0)
    failed_works = int(data.get("fallback_none") or 0)
    if total_works <= 0 or (failed_works <= 0 and succeeded >= total_works):
        return "complete"
    return "partial" if succeeded > 0 else "failed"
@@ -14,9 +14,7 @@ if __package__ in {None, ""}:
14
14
  break
15
15
 
16
16
  import argparse
17
- import json
18
- from pathlib import Path
19
- from typing import Any, Dict, List
17
+ from typing import Any, Dict, List, Set
20
18
 
21
19
  from scripts.core.bootstrap_env import bootstrap_for_direct_run
22
20
 
@@ -25,63 +23,23 @@ bootstrap_for_direct_run(__file__, __package__)
25
23
  from scripts.core.completeness import ensure_request_id, evaluate_collection, normalize_missing_fields
26
24
  from scripts.core.config_loader import config_get, load_tikomni_config, resolve_storage_paths
27
25
  from scripts.core.progress_report import build_progress_reporter
28
- from scripts.core.storage_router import resolve_author_directory_name
29
26
  from scripts.core.tikomni_common import resolve_runtime, write_json_stdout
30
27
  from scripts.pipelines.input_contracts import normalize_douyin_creator_input
31
28
  from scripts.pipelines.schema import build_author_profile
32
29
  from scripts.pipelines.douyin_creator_home_helpers import collect_and_adapt
33
30
  from scripts.pipelines.home_asr import enrich_author_home_asr
31
+ from scripts.pipelines.homepage_runtime_state import (
32
+ clear_homepage_checkpoint,
33
+ load_homepage_checkpoint,
34
+ persist_homepage_runtime_artifacts,
35
+ resolve_homepage_run_status,
36
+ )
34
37
  from scripts.writers.write_work_fact_card import build_work_fact_card, persist_output_envelope, write_work_fact_card
35
38
 
36
39
  DEFAULT_MAX_ITEMS = 200
37
40
  MAX_ITEMS_HARD_LIMIT = 200
38
41
 
39
42
 
40
- def _write_collection_artifacts(
41
- *,
42
- profile: Dict[str, Any],
43
- works: List[Dict[str, Any]],
44
- card_root: str,
45
- extract_trace: List[Dict[str, Any]],
46
- request_id: str,
47
- ) -> Dict[str, str]:
48
- author_dir_name = resolve_author_directory_name(
49
- "douyin",
50
- str(profile.get("author_handle") or ""),
51
- str(profile.get("platform_author_id") or ""),
52
- str(profile.get("nickname") or ""),
53
- )
54
- author_dir = Path(card_root) / "内容系统" / "作品库" / author_dir_name
55
- author_dir.mkdir(parents=True, exist_ok=True)
56
-
57
- creator_profile = dict(profile)
58
- creator_profile["request_id"] = request_id
59
- creator_profile["extract_trace"] = extract_trace
60
-
61
- work_collection = {
62
- "platform": "douyin",
63
- "platform_author_id": profile.get("platform_author_id"),
64
- "count": len(works),
65
- "items": [
66
- {
67
- "platform_work_id": item.get("platform_work_id"),
68
- "title": item.get("title"),
69
- "published_date": item.get("published_date"),
70
- }
71
- for item in works
72
- if isinstance(item, dict)
73
- ],
74
- "request_id": request_id,
75
- "extract_trace": extract_trace,
76
- }
77
-
78
- profile_path = author_dir / "_creator_profile.json"
79
- collection_path = author_dir / "_work_collection.json"
80
- profile_path.write_text(json.dumps(creator_profile, ensure_ascii=False, indent=2), encoding="utf-8")
81
- collection_path.write_text(json.dumps(work_collection, ensure_ascii=False, indent=2), encoding="utf-8")
82
- return {"creator_profile_path": str(profile_path), "work_collection_path": str(collection_path)}
83
-
84
-
85
43
  def run_douyin_creator_home(
86
44
  *,
87
45
  input_value: str,
@@ -166,7 +124,76 @@ def run_douyin_creator_home(
166
124
  progress=progress.child(scope="author_home.collect"),
167
125
  )
168
126
 
127
+ card_root = resolve_storage_paths(config)["card_root"]
128
+ request_id = ensure_request_id(
129
+ raw.get("request_id") or profile.get("request_id"),
130
+ fallback_seed=normalized_input_value or input_value,
131
+ )
132
+ raw_extract_trace = list(raw.get("extract_trace") or [])
133
+ checkpoint = load_homepage_checkpoint(
134
+ platform="douyin",
135
+ profile=profile,
136
+ card_root=card_root,
137
+ )
138
+ if checkpoint:
139
+ progress.progress(
140
+ stage="author_home.workflow.resume",
141
+ message="douyin author_home checkpoint loaded",
142
+ data={
143
+ "completed_work_ids": len(checkpoint.get("completed_work_ids") or []),
144
+ "last_completed_batch_id": checkpoint.get("last_completed_batch_id"),
145
+ },
146
+ )
147
+
169
148
  asr_strategy = config_get(config, "asr_strategy", {})
149
+ card_results: List[Dict[str, Any]] = []
150
+ written_work_ids: Set[str] = set()
151
+
152
+ def _persist_batch(event: Dict[str, Any]) -> None:
153
+ batch_id = str(event.get("batch_id") or "")
154
+ batch_works = event.get("batch_works") if isinstance(event.get("batch_works"), list) else []
155
+ all_works = event.get("works") if isinstance(event.get("works"), list) else []
156
+ batch_trace = raw_extract_trace + list(event.get("trace") or [])
157
+
158
+ batch_card_count = 0
159
+ if write_card:
160
+ for work in batch_works:
161
+ if not isinstance(work, dict):
162
+ continue
163
+ result = write_work_fact_card(
164
+ payload=work,
165
+ platform="douyin",
166
+ card_root=card_root,
167
+ storage_config=config,
168
+ )
169
+ card_results.append(result)
170
+ work_id = str(work.get("platform_work_id") or "").strip()
171
+ if work_id:
172
+ written_work_ids.add(work_id)
173
+ batch_card_count += 1
174
+
175
+ persist_homepage_runtime_artifacts(
176
+ platform="douyin",
177
+ profile=profile,
178
+ works=all_works,
179
+ card_root=card_root,
180
+ extract_trace=batch_trace,
181
+ request_id=request_id,
182
+ checkpoint=event.get("checkpoint") if isinstance(event.get("checkpoint"), dict) else {},
183
+ run_status="in_progress",
184
+ last_completed_batch_id=batch_id,
185
+ )
186
+ progress.progress(
187
+ stage="author_home.persist.batch",
188
+ message="douyin author_home batch persisted",
189
+ data={
190
+ "batch_id": batch_id,
191
+ "batch_cards": batch_card_count,
192
+ "completed_count": (event.get("checkpoint") if isinstance(event.get("checkpoint"), dict) else {}).get("processed_works"),
193
+ "pending_count": (event.get("checkpoint") if isinstance(event.get("checkpoint"), dict) else {}).get("pending_works"),
194
+ },
195
+ )
196
+
170
197
  asr_bundle = enrich_author_home_asr(
171
198
  platform="douyin",
172
199
  works=works,
@@ -179,14 +206,18 @@ def run_douyin_creator_home(
179
206
  douyin_submit_backoff_ms=int(config_get(config, "asr_strategy.submit_retry.douyin_video.backoff_ms", 1500)),
180
207
  timeout_retry_enabled=bool(config_get(config, "asr_strategy.u2_timeout_retry.enabled", True)),
181
208
  timeout_retry_max_retries=int(config_get(config, "asr_strategy.u2_timeout_retry.max_retries", 0)),
209
+ checkpoint=checkpoint,
210
+ request_id=request_id,
211
+ on_batch_complete=_persist_batch,
182
212
  progress=progress.child(scope="author_home.asr"),
183
213
  )
184
214
  works = list(asr_bundle.get("works") or [])
185
215
 
186
- card_root = resolve_storage_paths(config)["card_root"]
187
- card_results: List[Dict[str, Any]] = []
188
216
  if write_card:
189
217
  for work in works:
218
+ work_id = str(work.get("platform_work_id") or "").strip()
219
+ if work_id and work_id in written_work_ids:
220
+ continue
190
221
  card_results.append(
191
222
  write_work_fact_card(
192
223
  payload=work,
@@ -196,19 +227,23 @@ def run_douyin_creator_home(
196
227
  )
197
228
  )
198
229
 
199
- request_id = ensure_request_id(
200
- raw.get("request_id") or profile.get("request_id"),
201
- fallback_seed=normalized_input_value or input_value,
202
- )
203
- extract_trace = list(raw.get("extract_trace") or []) + list(asr_bundle.get("trace") or [])
204
-
205
- collection_artifacts = _write_collection_artifacts(
230
+ extract_trace = raw_extract_trace + list(asr_bundle.get("trace") or [])
231
+ checkpoint_out = asr_bundle.get("checkpoint") if isinstance(asr_bundle.get("checkpoint"), dict) else {}
232
+ collection_artifacts = persist_homepage_runtime_artifacts(
233
+ platform="douyin",
206
234
  profile=profile,
207
235
  works=works,
208
236
  card_root=card_root,
209
237
  extract_trace=extract_trace,
210
238
  request_id=request_id,
239
+ checkpoint=checkpoint_out,
240
+ run_status=resolve_homepage_run_status(asr_bundle.get("stats")),
241
+ last_completed_batch_id=str(checkpoint_out.get("last_completed_batch_id") or ""),
211
242
  )
243
+ if int(checkpoint_out.get("pending_works") or 0) <= 0:
244
+ cleared_checkpoint_path = clear_homepage_checkpoint(platform="douyin", profile=profile, card_root=card_root)
245
+ if cleared_checkpoint_path:
246
+ collection_artifacts["checkpoint_cleared_path"] = cleared_checkpoint_path
212
247
 
213
248
  normalized_profile = dict(profile)
214
249
  normalized_profile["request_id"] = request_id
@@ -14,9 +14,7 @@ if __package__ in {None, ""}:
14
14
  break
15
15
 
16
16
  import argparse
17
- import json
18
- from pathlib import Path
19
- from typing import Any, Dict, List
17
+ from typing import Any, Dict, List, Set
20
18
 
21
19
  from scripts.core.bootstrap_env import bootstrap_for_direct_run
22
20
 
@@ -25,9 +23,14 @@ bootstrap_for_direct_run(__file__, __package__)
25
23
  from scripts.core.completeness import ensure_request_id, evaluate_collection, normalize_missing_fields
26
24
  from scripts.core.config_loader import config_get, load_tikomni_config, resolve_storage_paths
27
25
  from scripts.core.progress_report import build_progress_reporter
28
- from scripts.core.storage_router import resolve_author_directory_name
29
26
  from scripts.core.tikomni_common import resolve_runtime, write_json_stdout
30
27
  from scripts.pipelines.home_asr import enrich_author_home_asr
28
+ from scripts.pipelines.homepage_runtime_state import (
29
+ clear_homepage_checkpoint,
30
+ load_homepage_checkpoint,
31
+ persist_homepage_runtime_artifacts,
32
+ resolve_homepage_run_status,
33
+ )
31
34
  from scripts.pipelines.input_contracts import normalize_xhs_creator_input
32
35
  from scripts.pipelines.schema import build_author_profile
33
36
  from scripts.pipelines.xiaohongshu_creator_home_helpers import collect_and_adapt
@@ -37,51 +40,6 @@ DEFAULT_MAX_ITEMS = 200
37
40
  MAX_ITEMS_HARD_LIMIT = 200
38
41
 
39
42
 
40
- def _write_collection_artifacts(
41
- *,
42
- profile: Dict[str, Any],
43
- works: List[Dict[str, Any]],
44
- card_root: str,
45
- extract_trace: List[Dict[str, Any]],
46
- request_id: str,
47
- ) -> Dict[str, str]:
48
- author_dir_name = resolve_author_directory_name(
49
- "xiaohongshu",
50
- str(profile.get("author_handle") or ""),
51
- str(profile.get("platform_author_id") or ""),
52
- str(profile.get("nickname") or ""),
53
- )
54
- author_dir = Path(card_root) / "内容系统" / "作品库" / author_dir_name
55
- author_dir.mkdir(parents=True, exist_ok=True)
56
-
57
- creator_profile = dict(profile)
58
- creator_profile["request_id"] = request_id
59
- creator_profile["extract_trace"] = extract_trace
60
-
61
- work_collection = {
62
- "platform": "xiaohongshu",
63
- "platform_author_id": profile.get("platform_author_id"),
64
- "count": len(works),
65
- "items": [
66
- {
67
- "platform_work_id": item.get("platform_work_id"),
68
- "title": item.get("title"),
69
- "published_date": item.get("published_date"),
70
- }
71
- for item in works
72
- if isinstance(item, dict)
73
- ],
74
- "request_id": request_id,
75
- "extract_trace": extract_trace,
76
- }
77
-
78
- profile_path = author_dir / "_creator_profile.json"
79
- collection_path = author_dir / "_work_collection.json"
80
- profile_path.write_text(json.dumps(creator_profile, ensure_ascii=False, indent=2), encoding="utf-8")
81
- collection_path.write_text(json.dumps(work_collection, ensure_ascii=False, indent=2), encoding="utf-8")
82
- return {"creator_profile_path": str(profile_path), "work_collection_path": str(collection_path)}
83
-
84
-
85
43
  def run_xiaohongshu_creator_home(
86
44
  *,
87
45
  input_value: str,
@@ -166,7 +124,76 @@ def run_xiaohongshu_creator_home(
166
124
  progress=progress.child(scope="author_home.collect"),
167
125
  )
168
126
 
127
+ card_root = resolve_storage_paths(config)["card_root"]
128
+ request_id = ensure_request_id(
129
+ raw.get("request_id") or profile.get("request_id"),
130
+ fallback_seed=normalized_input_value or input_value,
131
+ )
132
+ raw_extract_trace = list(raw.get("extract_trace") or [])
133
+ checkpoint = load_homepage_checkpoint(
134
+ platform="xiaohongshu",
135
+ profile=profile,
136
+ card_root=card_root,
137
+ )
138
+ if checkpoint:
139
+ progress.progress(
140
+ stage="author_home.workflow.resume",
141
+ message="xiaohongshu author_home checkpoint loaded",
142
+ data={
143
+ "completed_work_ids": len(checkpoint.get("completed_work_ids") or []),
144
+ "last_completed_batch_id": checkpoint.get("last_completed_batch_id"),
145
+ },
146
+ )
147
+
169
148
  asr_strategy = config_get(config, "asr_strategy", {})
149
+ card_results: List[Dict[str, Any]] = []
150
+ written_work_ids: Set[str] = set()
151
+
152
+ def _persist_batch(event: Dict[str, Any]) -> None:
153
+ batch_id = str(event.get("batch_id") or "")
154
+ batch_works = event.get("batch_works") if isinstance(event.get("batch_works"), list) else []
155
+ all_works = event.get("works") if isinstance(event.get("works"), list) else []
156
+ batch_trace = raw_extract_trace + list(event.get("trace") or [])
157
+
158
+ batch_card_count = 0
159
+ if write_card:
160
+ for work in batch_works:
161
+ if not isinstance(work, dict):
162
+ continue
163
+ result = write_work_fact_card(
164
+ payload=work,
165
+ platform="xiaohongshu",
166
+ card_root=card_root,
167
+ storage_config=config,
168
+ )
169
+ card_results.append(result)
170
+ work_id = str(work.get("platform_work_id") or "").strip()
171
+ if work_id:
172
+ written_work_ids.add(work_id)
173
+ batch_card_count += 1
174
+
175
+ persist_homepage_runtime_artifacts(
176
+ platform="xiaohongshu",
177
+ profile=profile,
178
+ works=all_works,
179
+ card_root=card_root,
180
+ extract_trace=batch_trace,
181
+ request_id=request_id,
182
+ checkpoint=event.get("checkpoint") if isinstance(event.get("checkpoint"), dict) else {},
183
+ run_status="in_progress",
184
+ last_completed_batch_id=batch_id,
185
+ )
186
+ progress.progress(
187
+ stage="author_home.persist.batch",
188
+ message="xiaohongshu author_home batch persisted",
189
+ data={
190
+ "batch_id": batch_id,
191
+ "batch_cards": batch_card_count,
192
+ "completed_count": (event.get("checkpoint") if isinstance(event.get("checkpoint"), dict) else {}).get("processed_works"),
193
+ "pending_count": (event.get("checkpoint") if isinstance(event.get("checkpoint"), dict) else {}).get("pending_works"),
194
+ },
195
+ )
196
+
170
197
  asr_bundle = enrich_author_home_asr(
171
198
  platform="xiaohongshu",
172
199
  works=works,
@@ -179,14 +206,18 @@ def run_xiaohongshu_creator_home(
179
206
  xhs_submit_backoff_ms=int(config_get(config, "asr_strategy.submit_retry.xiaohongshu_note.backoff_ms", 0)),
180
207
  timeout_retry_enabled=bool(config_get(config, "asr_strategy.u2_timeout_retry.enabled", True)),
181
208
  timeout_retry_max_retries=int(config_get(config, "asr_strategy.u2_timeout_retry.max_retries", 0)),
209
+ checkpoint=checkpoint,
210
+ request_id=request_id,
211
+ on_batch_complete=_persist_batch,
182
212
  progress=progress.child(scope="author_home.asr"),
183
213
  )
184
214
  works = list(asr_bundle.get("works") or [])
185
215
 
186
- card_root = resolve_storage_paths(config)["card_root"]
187
- card_results: List[Dict[str, Any]] = []
188
216
  if write_card:
189
217
  for work in works:
218
+ work_id = str(work.get("platform_work_id") or "").strip()
219
+ if work_id and work_id in written_work_ids:
220
+ continue
190
221
  card_results.append(
191
222
  write_work_fact_card(
192
223
  payload=work,
@@ -196,19 +227,23 @@ def run_xiaohongshu_creator_home(
196
227
  )
197
228
  )
198
229
 
199
- request_id = ensure_request_id(
200
- raw.get("request_id") or profile.get("request_id"),
201
- fallback_seed=normalized_input_value or input_value,
202
- )
203
- extract_trace = list(raw.get("extract_trace") or []) + list(asr_bundle.get("trace") or [])
204
-
205
- collection_artifacts = _write_collection_artifacts(
230
+ extract_trace = raw_extract_trace + list(asr_bundle.get("trace") or [])
231
+ checkpoint_out = asr_bundle.get("checkpoint") if isinstance(asr_bundle.get("checkpoint"), dict) else {}
232
+ collection_artifacts = persist_homepage_runtime_artifacts(
233
+ platform="xiaohongshu",
206
234
  profile=profile,
207
235
  works=works,
208
236
  card_root=card_root,
209
237
  extract_trace=extract_trace,
210
238
  request_id=request_id,
239
+ checkpoint=checkpoint_out,
240
+ run_status=resolve_homepage_run_status(asr_bundle.get("stats")),
241
+ last_completed_batch_id=str(checkpoint_out.get("last_completed_batch_id") or ""),
211
242
  )
243
+ if int(checkpoint_out.get("pending_works") or 0) <= 0:
244
+ cleared_checkpoint_path = clear_homepage_checkpoint(platform="xiaohongshu", profile=profile, card_root=card_root)
245
+ if cleared_checkpoint_path:
246
+ collection_artifacts["checkpoint_cleared_path"] = cleared_checkpoint_path
212
247
 
213
248
  normalized_profile = dict(profile)
214
249
  normalized_profile["request_id"] = request_id
@@ -359,18 +359,10 @@ def _markdown_lines(card: Dict[str, Any]) -> List[str]:
359
359
  lines = _frontmatter_lines(card)
360
360
  primary_text = _safe_text(card.get("primary_text"))
361
361
  caption_raw = _safe_text(card.get("caption_raw"))
362
- subtitle_raw = _safe_text(card.get("subtitle_raw"))
363
- asr_raw = _safe_text(card.get("asr_raw"))
364
362
 
365
363
  lines.extend(["", "## 主文本", primary_text or ""])
366
364
  if caption_raw and caption_raw != primary_text:
367
365
  lines.extend(["", "## 原始文案", caption_raw])
368
- if asr_raw and subtitle_raw and asr_raw == subtitle_raw and asr_raw != primary_text:
369
- lines.extend(["", "## 原始转写", asr_raw])
370
- elif subtitle_raw and subtitle_raw != primary_text:
371
- lines.extend(["", "## 原始字幕", subtitle_raw])
372
- if asr_raw and asr_raw not in {primary_text, subtitle_raw}:
373
- lines.extend(["", "## 原始转写", asr_raw])
374
366
  if card.get("missing_fields"):
375
367
  lines.extend(["", "## 缺失字段"])
376
368
  for entry in card["missing_fields"]: