npm - @tikomni/skills - Versions diffs - 0.1.11 → 1.0.0 - Mend

@tikomni/skills 0.1.11 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tikomni/skills",
-  "version": "0.1.11",
+  "version": "1.0.0",
   "description": "TikOmni skill installer CLI for structured social media crawling in Codex, Claude Code, and OpenClaw",
   "license": "MIT",
   "homepage": "https://github.com/mark-ly-wang/TikOmni-Skills#readme",

package/skills/social-media-crawl/references/contracts/work-fact-card-fields.md CHANGED Viewed

@@ -24,6 +24,7 @@
 ## Optional Fields
+- `duration_ms`
 - `digg_count`
 - `comment_count`
 - `collect_count`
@@ -37,6 +38,7 @@
 ## Field Rules
 - `author` is the display name, not an object.
+- `duration_ms` uses milliseconds. Write `null` when the duration is unavailable or not applicable.
 - Fact fields for the Markdown card go into frontmatter. Do not emit a separate `## Facts` section.
 - The work-library directory writes only the Markdown card and no extra `.json` sidecar in the same directory.
 - `primary_text` is the text that is best suited for reading and indexing in the current task.

package/skills/social-media-crawl/references/schemas/work-fact-card.schema.json CHANGED Viewed

@@ -32,6 +32,7 @@
     "subtitle_raw": { "type": "string" },
     "work_modality": { "type": "string" },
     "published_date": { "type": "string" },
+    "duration_ms": { "type": ["integer", "null"] },
     "digg_count": { "type": "integer" },
     "comment_count": { "type": "integer" },
     "collect_count": { "type": "integer" },

package/skills/social-media-crawl/scripts/pipelines/home_asr.py CHANGED Viewed

@@ -18,6 +18,7 @@ from scripts.core.asr_pipeline import (
     run_u2_asr_candidates_with_timeout_retry,
 )
 from scripts.core.u3_fallback import run_u3_public_url_fallback
+from scripts.pipelines.douyin_video_type_matrix import normalize_douyin_video_type
 DEFAULT_BATCH_SUBMIT_SIZE = 50
 MAX_BATCH_SUBMIT_SIZE = 100
@@ -76,13 +77,17 @@ def _resolve_is_video(work: Dict[str, Any], *, platform: str) -> bool:
         return False
     if platform == "douyin":
-        return True
+        raw_ref = work.get("raw_ref") if isinstance(work.get("raw_ref"), dict) else {}
+        raw_item = raw_ref.get("raw_item") if isinstance(raw_ref.get("raw_item"), dict) else {}
+        if raw_item:
+            return bool(normalize_douyin_video_type(raw_item).get("is_video"))
+        return False
     raw_ref = work.get("raw_ref") if isinstance(work.get("raw_ref"), dict) else {}
     xhs_type_hint = normalize_text(raw_ref.get("type") or raw_ref.get("note_type")).lower()
-    if xhs_type_hint in {"video", "0", "normal", "mixed", "mix"}:
+    if xhs_type_hint in {"video", "0", "mixed", "mix", "video_note", "note_video"}:
         return True
-    if xhs_type_hint in {"image", "1", "note", "photo"}:
+    if xhs_type_hint in {"normal", "image", "1", "note", "photo", "text", "album"}:
         return False
     return False
@@ -275,7 +280,8 @@ def _run_u2_for_work(
     gate = _evaluate_u2_gate(work, platform=platform)
     if not gate.get("can_u2"):
         gate_reason = normalize_text(gate.get("gate_reason")) or "skip:unknown"
-        return _fallback_none_result(gate_reason), {
+        fallback_result = _video_caption_fallback_result(work, gate_reason) if gate.get("is_video") else _fallback_none_result(gate_reason)
+        return fallback_result, {
             "step": "author_home.asr.u2_gate",
             "platform_work_id": work.get("platform_work_id"),
             "ok": False,
@@ -325,19 +331,7 @@ def _run_u2_for_work(
             asr_source="external_asr",
         ), trace
-    return {
-        "subtitle_raw": "",
-        "subtitle_source": "missing",
-        "asr_raw": "",
-        "asr_clean": "",
-        "primary_text": "",
-        "primary_text_source": "asr_clean",
-        "analysis_eligibility": "incomplete",
-        "analysis_exclusion_reason": "video_asr_unavailable",
-        "asr_status": "failed",
-        "asr_error_reason": normalize_text(poll_result.get("error_reason")) or "u2_failed",
-        "asr_source": "fallback_none",
-    }, trace
+    return _video_caption_fallback_result(work, normalize_text(poll_result.get("error_reason")) or "u2_failed"), trace
 def _iter_xhs_interface_text_candidates(work: Dict[str, Any]) -> List[Tuple[str, str]]:
@@ -489,6 +483,23 @@ def _fallback_none_result(reason: str) -> Dict[str, Any]:
     }
+def _video_caption_fallback_result(work: Dict[str, Any], reason: str) -> Dict[str, Any]:
+    caption_raw = normalize_text(work.get("caption_raw"))
+    return {
+        "subtitle_raw": "",
+        "subtitle_source": "missing",
+        "asr_raw": "",
+        "asr_clean": "",
+        "primary_text": caption_raw,
+        "primary_text_source": "caption_raw" if caption_raw else "missing",
+        "analysis_eligibility": "eligible" if caption_raw else "incomplete",
+        "analysis_exclusion_reason": "" if caption_raw else (normalize_text(reason) or "video_asr_unavailable"),
+        "asr_status": "failed",
+        "asr_error_reason": normalize_text(reason) or "asr_failed",
+        "asr_source": "fallback_none",
+    }
 def _run_xhs_u3_then_u2_batch_for_entries(
     *,
     batch_id: str,
@@ -517,7 +528,7 @@ def _run_xhs_u3_then_u2_batch_for_entries(
         subtitle_invalid = normalize_text(entry.get("subtitle_invalid")) or "subtitle_missing"
         if not source_url:
-            work.update(_fallback_none_result("skip:video_download_url_missing"))
+            work.update(_video_caption_fallback_result(work, "skip:video_download_url_missing"))
             trace.append(
                 {
                     "step": "author_home.asr.xhs_u3",
@@ -554,7 +565,7 @@ def _run_xhs_u3_then_u2_batch_for_entries(
         )
         if not u3_result.get("ok") or not public_url:
-            work.update(_fallback_none_result(normalize_text(u3_result.get("error_reason")) or "u3_bridge_failed"))
+            work.update(_video_caption_fallback_result(work, normalize_text(u3_result.get("error_reason")) or "u3_bridge_failed"))
             u3_failed_count += 1
             continue
@@ -597,7 +608,7 @@ def _run_xhs_u3_then_u2_batch_for_entries(
         work = entry.get("work")
         if not isinstance(work, dict):
             continue
-        work.update(_fallback_none_result(normalize_text(entry.get("fallback_reason")) or "xhs_u3_then_u2_failed"))
+        work.update(_video_caption_fallback_result(work, normalize_text(entry.get("fallback_reason")) or "xhs_u3_then_u2_failed"))
     return {
         "trace": trace,
@@ -905,7 +916,10 @@ def enrich_author_home_asr(
                 )
                 if not gate.get("can_u2"):
-                    work.update(_fallback_none_result(str(gate.get("gate_reason") or "skip:unknown")))
+                    if gate.get("is_video"):
+                        work.update(_video_caption_fallback_result(work, str(gate.get("gate_reason") or "skip:unknown")))
+                    else:
+                        work.update(_mark_text_work_ready(work))
                 else:
                     batch_u2_entries.append(
                         {
@@ -987,7 +1001,7 @@ def enrich_author_home_asr(
                     }
                 )
                 if not gate.get("can_u2"):
-                    work.update(_fallback_none_result(str(gate.get("gate_reason") or "skip:unknown")))
+                    work.update(_video_caption_fallback_result(work, str(gate.get("gate_reason") or "skip:unknown")))
                 else:
                     batch_xhs_u3_entries.append(
                         {

package/skills/social-media-crawl/scripts/pipelines/homepage_collectors.py CHANGED Viewed

@@ -72,6 +72,177 @@ def _pick_list(payload: Any, keys: List[str]) -> List[Any]:
     return hit if isinstance(hit, list) else []
+def _pick_raw(payload: Any, keys: List[str]) -> Any:
+    ordered_keys = [str(key) for key in keys if str(key).strip()]
+    def _walk(node: Any) -> Any:
+        if isinstance(node, dict):
+            for key in ordered_keys:
+                if key in node and node.get(key) is not None:
+                    return node.get(key)
+            for value in node.values():
+                hit = _walk(value)
+                if hit is not None:
+                    return hit
+        elif isinstance(node, list):
+            for item in node:
+                hit = _walk(item)
+                if hit is not None:
+                    return hit
+        return None
+    return _walk(payload)
+def _payload_candidates(payload: Any) -> List[Any]:
+    candidates: List[Any] = []
+    if payload is not None:
+        candidates.append(payload)
+    if isinstance(payload, dict):
+        nested = payload.get("data")
+        if nested is not None:
+            candidates.append(nested)
+    return candidates
+def _pick_raw_from_candidates(payload: Any, keys: List[str]) -> Any:
+    for candidate in _payload_candidates(payload):
+        hit = _pick_raw(candidate, keys)
+        if hit is not None:
+            return hit
+    return None
+def _pick_list_from_candidates(payload: Any, keys: List[str]) -> List[Any]:
+    for candidate in _payload_candidates(payload):
+        hit = _pick_list(candidate, keys)
+        if hit:
+            return hit
+    return []
+def _unwrap_data_layers(payload: Any, *, max_depth: int = 3) -> Any:
+    node = payload
+    depth = 0
+    while depth < max_depth and isinstance(node, dict) and isinstance(node.get("data"), dict):
+        node = node.get("data")
+        depth += 1
+    return node
+def _extract_douyin_posts_page(payload: Any) -> Dict[str, Any]:
+    node = _unwrap_data_layers(payload)
+    if not isinstance(node, dict):
+        return {}
+    return node
+def _extract_douyin_posts_items(payload: Any) -> List[Any]:
+    node = _extract_douyin_posts_page(payload)
+    items = node.get("aweme_list")
+    return items if isinstance(items, list) else []
+def _extract_douyin_posts_next_cursor(payload: Any) -> Any:
+    node = _extract_douyin_posts_page(payload)
+    if not isinstance(node, dict):
+        return None
+    for key in ("max_cursor", "cursor", "next_cursor"):
+        if key in node and node.get(key) is not None:
+            return node.get(key)
+    return None
+def _extract_douyin_posts_has_more(payload: Any) -> Any:
+    node = _extract_douyin_posts_page(payload)
+    if not isinstance(node, dict):
+        return None
+    for key in ("has_more", "hasMore"):
+        if key in node and node.get(key) is not None:
+            return node.get(key)
+    return None
+def _extract_xhs_posts_page(payload: Any) -> Dict[str, Any]:
+    node = _unwrap_data_layers(payload)
+    if not isinstance(node, dict):
+        return {}
+    return node
+def _extract_xhs_posts_items(payload: Any) -> List[Any]:
+    node = _extract_xhs_posts_page(payload)
+    for key in ("notes", "note_list", "noteList", "items", "list"):
+        value = node.get(key)
+        if isinstance(value, list):
+            return value
+    return []
+def _extract_xhs_response_cursor(payload: Any) -> str:
+    node = _extract_xhs_posts_page(payload)
+    if not isinstance(node, dict):
+        return ""
+    for key in ("cursor", "next_cursor", "last_cursor", "last_note_id"):
+        value = _to_text(node.get(key))
+        if value:
+            return value
+    return ""
+def _extract_xhs_posts_has_more(payload: Any) -> Any:
+    node = _extract_xhs_posts_page(payload)
+    if not isinstance(node, dict):
+        return None
+    for key in ("has_more", "hasMore"):
+        if key in node and node.get(key) is not None:
+            return node.get(key)
+    return None
+def _normalize_has_more(value: Any) -> Optional[bool]:
+    if value is None:
+        return None
+    if isinstance(value, bool):
+        return value
+    if isinstance(value, (int, float)):
+        return bool(int(value))
+    text = _to_text(value).lower()
+    if not text:
+        return None
+    if text in {"1", "true", "yes", "y"}:
+        return True
+    if text in {"0", "false", "no", "n"}:
+        return False
+    return None
+def _normalize_int_like(value: Any) -> Optional[int]:
+    if value is None:
+        return None
+    try:
+        if isinstance(value, bool):
+            return int(value)
+        if isinstance(value, (int, float)):
+            return int(value)
+        text = _to_text(value)
+        if not text:
+            return None
+        return int(float(text.replace(",", "")))
+    except Exception:
+        return None
+def _last_xhs_note_id(items: List[Any]) -> str:
+    for item in reversed(items):
+        if not isinstance(item, dict):
+            continue
+        note_id = _pick_text(item, ["note_id", "id", "item_id"])
+        if note_id:
+            return note_id
+    return ""
 def _looks_like_douyin_sec_user_id(value: str) -> bool:
     return value.startswith("MS4wLjA")
@@ -206,10 +377,10 @@ def _pick_first_mapping(items: List[Any]) -> Dict[str, Any]:
 def _xhs_posts_field_completeness(payload: Any) -> Dict[str, Any]:
-    page_items = _pick_list(payload, ["notes", "note_list", "noteList", "items", "list"])
+    page_items = _extract_xhs_posts_items(payload)
     first_item = _pick_first_mapping(page_items)
-    has_more_flag = _pick_int(payload, ["has_more", "hasMore"], default=-1) >= 0
-    cursor_hit = bool(_pick_text(payload, ["cursor", "next_cursor", "last_cursor", "last_note_id"]))
+    has_more_flag = _extract_xhs_posts_has_more(payload) is not None
+    cursor_hit = bool(_extract_xhs_response_cursor(payload))
     cover_hit = bool(_extract_first_url(_first_url_candidate(first_item, ["cover", "cover_url", "cover_image", "image", "image_url"])))
     share_or_source = bool(_pick_text(first_item, ["share_url", "share_link", "url", "note_url"])) or bool(_pick_text(first_item, ["note_id", "id", "item_id"]))
     interaction_values = [
@@ -466,7 +637,10 @@ def collect_douyin_author_home_raw(
     page = 0
     pagination_trace: List[Dict[str, Any]] = []
-    while has_more and page < max(pages_max, 1) and len(works) < max_items:
+    max_pages = max(pages_max, 1)
+    page_limit = min(max(page_size, 1), 20)
+    while has_more and page < max_pages and len(works) < max_items:
         page += 1
         posts_resp = call_json_api(
             base_url=base_url,
@@ -476,7 +650,7 @@ def collect_douyin_author_home_raw(
             timeout_ms=timeout_ms,
             params={
                 "sec_user_id": sec_user_id,
-                "count": min(max(page_size, 1), 20),
+                "count": page_limit,
                 "max_cursor": cursor,
                 "sort_type": 0,
             },
@@ -497,22 +671,12 @@ def collect_douyin_author_home_raw(
             )
         request_id_candidates.append(posts_resp)
         response_payload = posts_resp.get("data")
-        page_items = _pick_list(response_payload, ["aweme_list", "items", "list"])
-        if not page_items and isinstance(response_payload, dict):
-            page_items = _pick_list(response_payload.get("data"), ["aweme_list", "items", "list"])
+        page_items = _extract_douyin_posts_items(response_payload)
-        data = response_payload
-        next_cursor = _pick_int(data, ["max_cursor", "cursor", "next_cursor"], default=0)
-        has_more_flag = _pick_int(data, ["has_more", "hasMore"], default=0)
-        pagination_trace.append(
-            {
-                "page": page,
-                "cursor_in": cursor,
-                "cursor_out": next_cursor,
-                "has_more": has_more_flag,
-                "items": len(page_items),
-            }
-        )
+        next_cursor_raw = _extract_douyin_posts_next_cursor(response_payload)
+        has_more_raw = _extract_douyin_posts_has_more(response_payload)
+        next_cursor = _normalize_int_like(next_cursor_raw)
+        has_more_normalized = _normalize_has_more(has_more_raw)
         for item in page_items:
             if not isinstance(item, dict):
@@ -526,8 +690,44 @@ def collect_douyin_author_home_raw(
             if len(works) >= max_items:
                 break
-        has_more = bool(has_more_flag == 1 and next_cursor != cursor)
-        cursor = next_cursor
+        next_cursor_changed = next_cursor is not None and next_cursor != cursor
+        stop_reason = ""
+        should_continue = False
+        if len(works) >= max_items:
+            stop_reason = "max_items_reached"
+        elif not page_items:
+            stop_reason = "page_empty"
+        elif has_more_normalized is False:
+            stop_reason = "has_more_false"
+        elif next_cursor is not None and next_cursor == cursor:
+            stop_reason = "cursor_not_advanced"
+        elif has_more_normalized is True and next_cursor is None:
+            stop_reason = "pagination_contract_incomplete"
+        elif has_more_normalized is True or next_cursor_changed:
+            should_continue = True
+        else:
+            stop_reason = "pagination_contract_incomplete"
+        if should_continue and page >= max_pages:
+            should_continue = False
+            stop_reason = "pages_max_reached"
+        pagination_trace.append(
+            {
+                "page": page,
+                "cursor_in": cursor,
+                "cursor_out": next_cursor,
+                "has_more_raw": has_more_raw,
+                "has_more_normalized": has_more_normalized,
+                "items": len(page_items),
+                "stop_reason": stop_reason,
+            }
+        )
+        has_more = should_continue
+        if should_continue and next_cursor is not None:
+            cursor = next_cursor
     request_id = _pick_request_id(request_id_candidates, trace)
     if progress is not None:
@@ -661,7 +861,10 @@ def collect_xhs_author_home_raw(
     page = 0
     pagination_trace: List[Dict[str, Any]] = []
-    while has_more and page < max(pages_max, 1) and len(works) < max_items:
+    max_pages = max(pages_max, 1)
+    page_limit = min(max(page_size, 1), 20)
+    while has_more and page < max_pages and len(works) < max_items:
         page += 1
         if progress is not None:
             progress.progress(
@@ -684,7 +887,7 @@ def collect_xhs_author_home_raw(
                     "user_id": user_id,
                     "share_text": input_value,
                     "cursor": cursor or None,
-                    "num": min(max(page_size, 1), 20),
+                    "num": page_limit,
                     "xsec_token": xsec_token or None,
                 },
                 fallback_reason=posts_reason,
@@ -739,20 +942,14 @@ def collect_xhs_author_home_raw(
         )
         data = posts_resp.get("data")
-        page_items = _pick_list(data, ["notes", "note_list", "noteList", "items", "list"])
-        next_cursor = _pick_text(data, ["cursor", "next_cursor", "last_cursor", "last_note_id"])
-        has_more_flag = _pick_int(data, ["has_more", "hasMore"], default=0)
-        pagination_trace.append(
-            {
-                "page": page,
-                "cursor_in": cursor,
-                "cursor_out": next_cursor,
-                "has_more": has_more_flag,
-                "items": len(page_items),
-                "route_label": posts_resp.get("_route_label"),
-                "request_id": posts_resp.get("request_id"),
-            }
-        )
+        page_items = _extract_xhs_posts_items(data)
+        next_cursor_raw = _extract_xhs_response_cursor(data)
+        has_more_raw = _extract_xhs_posts_has_more(data)
+        has_more_normalized = _normalize_has_more(has_more_raw)
+        fallback_cursor = _last_xhs_note_id(page_items)
+        explicit_cursor = _to_text(next_cursor_raw)
+        cursor_source = "response_cursor" if explicit_cursor else ("last_note_id_fallback" if fallback_cursor else "missing")
+        next_cursor = explicit_cursor or fallback_cursor
         for item in page_items:
             if not isinstance(item, dict):
@@ -766,8 +963,47 @@ def collect_xhs_author_home_raw(
             if len(works) >= max_items:
                 break
-        has_more = bool(has_more_flag == 1 and next_cursor and str(next_cursor) != str(cursor))
-        cursor = next_cursor
+        next_cursor_changed = bool(next_cursor and str(next_cursor) != str(cursor))
+        stop_reason = ""
+        should_continue = False
+        if len(works) >= max_items:
+            stop_reason = "max_items_reached"
+        elif not page_items:
+            stop_reason = "page_empty"
+        elif has_more_normalized is False:
+            stop_reason = "has_more_false"
+        elif next_cursor and str(next_cursor) == str(cursor):
+            stop_reason = "cursor_not_advanced"
+        elif has_more_normalized is True and not next_cursor:
+            stop_reason = "pagination_contract_incomplete"
+        elif has_more_normalized is True or next_cursor_changed:
+            should_continue = True
+        else:
+            stop_reason = "pagination_contract_incomplete"
+        if should_continue and page >= max_pages:
+            should_continue = False
+            stop_reason = "pages_max_reached"
+        pagination_trace.append(
+            {
+                "page": page,
+                "cursor_in": cursor,
+                "cursor_out": next_cursor,
+                "cursor_source": cursor_source,
+                "has_more_raw": has_more_raw,
+                "has_more_normalized": has_more_normalized,
+                "items": len(page_items),
+                "route_label": posts_resp.get("_route_label"),
+                "request_id": posts_resp.get("request_id"),
+                "stop_reason": stop_reason,
+            }
+        )
+        has_more = should_continue
+        if should_continue and next_cursor:
+            cursor = next_cursor
     request_id = _pick_request_id(request_id_candidates, trace)
     if progress is not None:

package/skills/social-media-crawl/scripts/pipelines/platform_adapters.py CHANGED Viewed

@@ -20,6 +20,7 @@ from scripts.pipelines.douyin_metadata import (
     extract_douyin_title,
     normalize_douyin_author_handle,
 )
+from scripts.pipelines.douyin_video_type_matrix import normalize_douyin_video_type
 from scripts.pipelines.media_url_rules import is_probable_video_url as is_shared_probable_video_url
 from scripts.core.tikomni_common import deep_find_all, deep_find_first
 from scripts.pipelines.select_low_quality_video_url import select_low_quality_video_url
@@ -54,6 +55,15 @@ def _optional_i(value: Any) -> int | None:
         return None
+def _normalize_duration_ms(value: Any) -> int:
+    parsed = _optional_i(value)
+    if parsed is None or parsed <= 0:
+        return 0
+    if 0 < parsed < 10000:
+        return parsed * 1000
+    return parsed
 def _first(payload: Any, keys: List[str], default: Any = "") -> Any:
     hit = deep_find_first(payload, keys)
     return default if hit is None else hit
@@ -134,6 +144,16 @@ def _extract_douyin_video_down_url(item: Dict[str, Any]) -> str:
     return _t(selected.get("video_down_url"))
+def _extract_douyin_duration_ms(item: Dict[str, Any]) -> int:
+    raw = item.get("duration_ms")
+    if raw is None:
+        raw = item.get("duration")
+    video = item.get("video")
+    if raw is None and isinstance(video, dict):
+        raw = video.get("duration")
+    return _normalize_duration_ms(raw)
 def _extract_xhs_video_down_url(item: Dict[str, Any]) -> str:
     urls = _pick_http_urls(
         item,
@@ -154,6 +174,33 @@ def _extract_xhs_video_down_url(item: Dict[str, Any]) -> str:
     return ""
+def _extract_xhs_duration_ms(item: Dict[str, Any]) -> int:
+    video_info_v2 = item.get("video_info_v2") if isinstance(item.get("video_info_v2"), dict) else {}
+    media = video_info_v2.get("media") if isinstance(video_info_v2.get("media"), dict) else {}
+    media_video = media.get("video") if isinstance(media.get("video"), dict) else {}
+    capa = video_info_v2.get("capa") if isinstance(video_info_v2.get("capa"), dict) else {}
+    raw_candidates = [
+        item.get("duration_ms"),
+        item.get("duration"),
+        item.get("video_duration"),
+        item.get("duration_sec"),
+        media_video.get("duration"),
+        capa.get("duration"),
+    ]
+    video = item.get("video")
+    if isinstance(video, dict):
+        raw_candidates.append(video.get("duration"))
+    note = item.get("note")
+    if isinstance(note, dict):
+        raw_candidates.append(note.get("duration"))
+    for candidate in raw_candidates:
+        normalized = _normalize_duration_ms(candidate)
+        if normalized > 0:
+            return normalized
+    return 0
 def _normalize_text_list(value: Any) -> List[str]:
     values: List[str] = []
     if isinstance(value, list):
@@ -276,9 +323,9 @@ def _extract_xhs_subtitle_urls(item: Dict[str, Any]) -> List[str]:
 def _extract_xhs_work_modality(item: Dict[str, Any], *, video_download_url: str, subtitle_inline: str) -> str:
     content_type_raw = _t(_first(item, ["type", "note_type", "model_type"])).lower()
-    if content_type_raw in {"video", "0", "normal", "mixed", "mix", "video_note", "note_video"}:
+    if content_type_raw in {"video", "0", "mixed", "mix", "video_note", "note_video"}:
         return "video"
-    if content_type_raw in {"image", "1", "photo", "album", "note", "text"}:
+    if content_type_raw in {"normal", "image", "1", "photo", "album", "note", "text"}:
         return "text"
     if video_download_url or subtitle_inline:
         return "video"
@@ -393,6 +440,9 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
         if not isinstance(item, dict):
             continue
         aweme_id = _t(_first(item, ["aweme_id", "item_id", "id"]))
+        video_type_info = normalize_douyin_video_type(item)
+        is_video = bool(video_type_info.get("is_video"))
+        duration_ms = _extract_douyin_duration_ms(item)
         author_info = extract_douyin_author(item)
         metrics = extract_douyin_metrics(item)
         video_down_url = _extract_douyin_video_down_url(item)
@@ -417,9 +467,9 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
             subtitle_raw="",
             subtitle_source="missing",
             publish_time=_t(_first(item, ["create_time", "publish_time"])),
-            work_modality="video",
-            content_type="video",
-            duration_ms=_i(_first(item, ["duration_ms", "duration"], 0)),
+            work_modality="video" if is_video else "text",
+            content_type="video" if is_video else "text",
+            duration_ms=duration_ms,
             tags=tags,
             metrics={
                 "digg_count": int(metrics.get("digg_count") or 0),
@@ -438,10 +488,13 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
             asr_status="pending",
             asr_error_reason="",
             asr_source="fallback_none",
+            is_video=is_video,
             platform_native_refs={
                 "douyin_sec_uid": _t(author_info.get("douyin_sec_uid") or internal_author_id),
                 "douyin_aweme_author_id": _t(author_info.get("douyin_aweme_author_id") or stable_author_id or author_id),
                 "douyin_unique_id": _t(author_info.get("unique_id")),
+                "douyin_video_type_reason": _t(video_type_info.get("video_type_reason")),
+                "douyin_video_type_field": _t(video_type_info.get("matched_field")),
             },
             raw_ref={"aweme_id": aweme_id, "raw_item": item},
         )
@@ -519,6 +572,7 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
         subtitle_inline = _extract_xhs_subtitle_inline(item)
         subtitle_urls = _extract_xhs_subtitle_urls(item)
         video_down_url = _extract_xhs_video_down_url(item)
+        duration_ms = _extract_xhs_duration_ms(item)
         content_type_raw = _t(_first(item, ["type", "note_type", "model_type"]))
         work_modality = _extract_xhs_work_modality(item, video_download_url=video_down_url, subtitle_inline=subtitle_inline)
         content_type = "video" if work_modality == "video" else (content_type_raw or "text")
@@ -541,7 +595,7 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
             publish_time=_t(_first(item, ["publish_time", "time", "create_time", "publishTime", "created_at"])),
             work_modality=work_modality,
             content_type=content_type,
-            duration_ms=_i(_first(item, ["duration_ms", "duration", "video_duration"], 0)),
+            duration_ms=duration_ms,
             tags=_extract_xhs_tags(item),
             metrics=metrics,
             cover_image=cover_image,
@@ -551,6 +605,7 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
             asr_status="subtitle_ready" if subtitle_inline else "pending",
             asr_error_reason="",
             asr_source="native_subtitle" if subtitle_inline else "fallback_none",
+            is_video=work_modality == "video",
             platform_native_refs={"xhs_user_id": author_id, "xhs_red_id": author_handle},
             raw_ref={
                 "note_id": note_id,

package/skills/social-media-crawl/scripts/pipelines/schema.py CHANGED Viewed

@@ -108,9 +108,9 @@ def _infer_work_modality(*, work_modality: str, is_video: Any, content_type: str
         return "video"
     content_type_value = _to_text(content_type).lower()
-    if content_type_value in {"video", "mixed", "mix", "video_note", "note_video", "normal", "0"}:
+    if content_type_value in {"video", "mixed", "mix", "video_note", "note_video", "0"}:
         return "video"
-    if content_type_value in {"text", "note", "image", "photo", "album", "1"}:
+    if content_type_value in {"normal", "text", "note", "image", "photo", "album", "1"}:
         return "text"
     if subtitle_raw or video_download_url:

package/skills/social-media-crawl/scripts/writers/write_work_fact_card.py CHANGED Viewed

@@ -62,6 +62,13 @@ def _safe_optional_int(value: Any) -> Optional[int]:
         return None
+def _safe_optional_positive_int(value: Any) -> Optional[int]:
+    parsed = _safe_optional_int(value)
+    if parsed is None or parsed <= 0:
+        return None
+    return parsed
 def _source_dict(payload: Dict[str, Any]) -> Dict[str, Any]:
     source = payload.get("source")
     return source if isinstance(source, dict) else {}
@@ -250,6 +257,7 @@ def build_work_fact_card(payload: Dict[str, Any], platform: Optional[str] = None
         "create_time_sec": payload.get("create_time_sec"),
         "publish_time_source": _safe_text(payload.get("publish_time_source")),
         "published_date": _resolve_published_date(payload),
+        "duration_ms": _safe_optional_positive_int(payload.get("duration_ms")),
         "digg_count": _safe_int(payload.get("digg_count")),
         "comment_count": _safe_int(payload.get("comment_count")),
         "collect_count": _safe_int(payload.get("collect_count")),
@@ -324,6 +332,7 @@ def _frontmatter_lines(card: Dict[str, Any]) -> List[str]:
         ("title", card.get("title")),
         ("published_date", card.get("published_date")),
         ("work_modality", card.get("work_modality")),
+        ("duration_ms", card.get("duration_ms")),
         ("digg_count", card.get("digg_count")),
         ("comment_count", card.get("comment_count")),
         ("collect_count", card.get("collect_count")),