@tikomni/skills 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/skills/social-media-crawl/scripts/pipelines/douyin_metadata.py +151 -0
- package/skills/social-media-crawl/scripts/pipelines/home_asr.py +40 -37
- package/skills/social-media-crawl/scripts/pipelines/homepage_collectors.py +5 -11
- package/skills/social-media-crawl/scripts/pipelines/input_contracts.py +318 -0
- package/skills/social-media-crawl/scripts/pipelines/media_url_rules.py +86 -0
- package/skills/social-media-crawl/scripts/pipelines/platform_adapters.py +77 -30
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_creator_home.py +84 -6
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_single_work.py +79 -73
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_creator_home.py +84 -6
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_single_work.py +86 -60
- package/skills/social-media-crawl/scripts/writers/write_work_fact_card.py +5 -3
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tikomni/skills",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.8",
|
|
4
4
|
"description": "TikOmni skill installer CLI for structured social media crawling in Codex, Claude Code, and OpenClaw",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"homepage": "https://github.com/mark-ly-wang/TikOmni-Skills#readme",
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Shared Douyin metadata extraction helpers."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import re
|
|
7
|
+
from typing import Any, Dict, Optional
|
|
8
|
+
|
|
9
|
+
from scripts.core.tikomni_common import normalize_text
|
|
10
|
+
|
|
11
|
+
INVALID_AUTHOR_HANDLE_VALUES = {"0", "unknown", "none", "null", "nil", "na", "n/a"}
|
|
12
|
+
MUSIC_TITLE_PATTERN = re.compile(r"^@?.+?(?:创作的原声|作品使用的原声|的原声)$")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _safe_int(value: Any) -> Optional[int]:
|
|
16
|
+
if value is None:
|
|
17
|
+
return None
|
|
18
|
+
if isinstance(value, bool):
|
|
19
|
+
return int(value)
|
|
20
|
+
if isinstance(value, int):
|
|
21
|
+
return value
|
|
22
|
+
if isinstance(value, float):
|
|
23
|
+
return int(value)
|
|
24
|
+
|
|
25
|
+
text = normalize_text(value)
|
|
26
|
+
if not text:
|
|
27
|
+
return None
|
|
28
|
+
try:
|
|
29
|
+
return int(float(text.replace(",", "")))
|
|
30
|
+
except Exception:
|
|
31
|
+
return None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def normalize_douyin_author_handle(*values: Any) -> str:
    """Return the first usable author handle among *values*.

    Candidates are normalized, and placeholder values such as "0", "none",
    or "n/a" are rejected. Returns "" when no candidate survives.
    """
    for raw_candidate in values:
        candidate = normalize_text(raw_candidate)
        if candidate and candidate.lower() not in INVALID_AUTHOR_HANDLE_VALUES:
            return candidate
    return ""
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def looks_like_douyin_music_title(value: Any) -> bool:
    """Return True when *value* reads like an auto-generated Douyin audio title.

    Matches patterns such as "@某人创作的原声" / "…的原声" (see
    MUSIC_TITLE_PATTERN), which platforms attach to soundtracks rather
    than works.
    """
    cleaned = normalize_text(value)
    return bool(cleaned) and MUSIC_TITLE_PATTERN.match(cleaned) is not None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def extract_douyin_caption(item: Dict[str, Any]) -> str:
    """Return the first non-empty caption-like field on a Douyin work dict.

    Fields are tried in priority order (description first, generic title
    last); non-dict input yields "".
    """
    if not isinstance(item, dict):
        return ""
    caption_keys = ("desc", "caption", "content", "item_title", "preview_title", "title")
    return next(
        (text for key in caption_keys if (text := normalize_text(item.get(key)))),
        "",
    )
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def title_from_douyin_caption(caption: Any) -> str:
    """Derive a display title from a caption by cutting off the hashtag tail.

    Falls back to the full caption when it consists of nothing but hashtags.
    """
    cleaned = normalize_text(caption)
    if not cleaned:
        return ""
    # Keep everything before the first "#tag" run.
    before_tags = re.split(r"\s*#\S+", cleaned, maxsplit=1)[0].strip()
    return before_tags or cleaned
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def extract_douyin_title(item: Dict[str, Any]) -> str:
    """Pick the best work title from a Douyin item dict.

    Prefers explicit title fields, but skips candidates that look like
    auto-generated audio/soundtrack titles whenever a caption-derived
    title is available instead. Returns "" for non-dict input.
    """
    if not isinstance(item, dict):
        return ""

    # Only read title-like fields from the work object itself.
    # Nested `music.title` is an audio title, not the work title.
    caption_title = title_from_douyin_caption(extract_douyin_caption(item))
    for key in ("item_title", "preview_title", "title"):
        candidate = normalize_text(item.get(key))
        if not candidate:
            continue
        # A "<user>的原声"-style field is the soundtrack name; prefer the
        # caption-derived title when one exists.
        if looks_like_douyin_music_title(candidate) and caption_title:
            continue
        return candidate
    return caption_title
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def extract_douyin_author(item: Dict[str, Any]) -> Dict[str, Optional[str]]:
    """Extract normalized author identity fields from a Douyin work dict.

    Reads the nested `author` object when present (ignored unless it is a
    dict) plus the work-level `author_user_id`. Every value in the returned
    dict is a non-empty string or None.
    """
    author = item.get("author") if isinstance(item.get("author"), dict) else {}

    # Numeric platform id: prefer the author object's uid/id, then the
    # work-level author_user_id.
    author_platform_id = (
        normalize_text(author.get("uid"))
        or normalize_text(author.get("id"))
        or normalize_text(item.get("author_user_id"))
    )
    # Handle candidates in decreasing specificity; placeholder values are
    # filtered by normalize_douyin_author_handle.
    author_handle = normalize_douyin_author_handle(
        author.get("unique_id"),
        author.get("short_id"),
        author.get("douyin_id"),
        author.get("display_id"),
        author.get("nickname"),
    )
    douyin_sec_uid = normalize_text(author.get("sec_uid"))
    douyin_aweme_author_id = normalize_text(item.get("author_user_id")) or author_platform_id

    # NOTE(review): "platform_author_id" and "author_platform_id" carry the
    # same value — presumably both key spellings exist in downstream schemas;
    # confirm before collapsing them.
    return {
        "author_handle": author_handle or None,
        "platform_author_id": author_platform_id or None,
        "author_platform_id": author_platform_id or None,
        "douyin_sec_uid": douyin_sec_uid or None,
        "douyin_aweme_author_id": douyin_aweme_author_id or None,
        "unique_id": normalize_text(author.get("unique_id")) or None,
        "nickname": normalize_text(author.get("nickname")) or None,
        "signature": normalize_text(author.get("signature")) or None,
    }
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def extract_douyin_metrics(item: Dict[str, Any]) -> Dict[str, Optional[int]]:
    """Extract engagement counters from a Douyin work dict.

    Each metric is looked up first in the nested `statistics` object, then
    on the work itself, across one or more alias keys. Counters default to
    0; `play_count` defaults to None (meaning "unknown" rather than zero).
    """
    statistics = item.get("statistics") if isinstance(item.get("statistics"), dict) else {}

    def metric(*keys: str, default: Optional[int] = 0) -> Optional[int]:
        # First parseable value wins; statistics takes precedence per key.
        for key in keys:
            value = _safe_int(statistics.get(key))
            if value is not None:
                return value
            value = _safe_int(item.get(key))
            if value is not None:
                return value
        return default

    metrics = {
        "digg_count": metric("digg_count", "like_count"),
        "comment_count": metric("comment_count"),
        "collect_count": metric("collect_count"),
        "share_count": metric("share_count", "forward_count"),
        "play_count": metric("play_count", "view_count", default=None),
    }

    # Sanity check: a work with real engagement cannot have zero plays, so a
    # reported play_count <= 0 alongside nonzero engagement is treated as
    # missing data (None) rather than a true zero.
    play_count = metrics.get("play_count")
    engagement_floor = max(
        int(metrics.get("digg_count") or 0),
        int(metrics.get("comment_count") or 0),
        int(metrics.get("collect_count") or 0),
        int(metrics.get("share_count") or 0),
    )
    if play_count is not None and int(play_count) <= 0 and engagement_floor > 0:
        metrics["play_count"] = None

    return metrics
|
|
@@ -12,6 +12,7 @@ from scripts.core.progress_report import ProgressReporter
|
|
|
12
12
|
from scripts.core.tikomni_common import normalize_text
|
|
13
13
|
from scripts.core.asr_pipeline import (
|
|
14
14
|
clamp_u2_batch_submit_size,
|
|
15
|
+
derive_asr_clean_text,
|
|
15
16
|
normalize_media_url,
|
|
16
17
|
run_u2_asr_batch_with_timeout_retry,
|
|
17
18
|
run_u2_asr_candidates_with_timeout_retry,
|
|
@@ -123,6 +124,30 @@ def _clean_text(text: Any) -> str:
|
|
|
123
124
|
return "\n".join(lines).strip()
|
|
124
125
|
|
|
125
126
|
|
|
127
|
+
def _build_transcript_result(
    raw_text: Any,
    *,
    subtitle_source: str,
    asr_source: str,
) -> Dict[str, Any]:
    """Build the standard transcript-result dict from a raw transcript.

    Cleans *raw_text*, derives the ASR-clean variant, and fills the
    eligibility/status fields based on whether any transcript text survived
    cleaning. `subtitle_source`/`asr_source` label where the text came from
    (e.g. "external_asr" or "native_subtitle").
    """
    transcript = _clean_text(raw_text)
    asr_clean = derive_asr_clean_text(transcript)
    primary_text = asr_clean or transcript
    return {
        "subtitle_raw": transcript,
        "subtitle_source": subtitle_source,
        "asr_raw": transcript,
        "asr_clean": asr_clean,
        "primary_text": primary_text,
        # NOTE(review): this stays "asr_clean" even when primary_text fell
        # back to the raw transcript above — confirm that is intended.
        "primary_text_source": "asr_clean",
        "analysis_eligibility": "eligible" if transcript else "incomplete",
        "analysis_exclusion_reason": "" if transcript else "video_asr_unavailable",
        "asr_status": "success" if transcript else "failed",
        "asr_error_reason": "",
        "asr_source": asr_source,
    }
|
|
149
|
+
|
|
150
|
+
|
|
126
151
|
def _subtitle_text_from_raw(raw: str) -> str:
|
|
127
152
|
content = (raw or "").strip()
|
|
128
153
|
if not content:
|
|
@@ -294,19 +319,11 @@ def _run_u2_for_work(
|
|
|
294
319
|
}
|
|
295
320
|
|
|
296
321
|
if transcript:
|
|
297
|
-
return
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
"
|
|
301
|
-
|
|
302
|
-
"primary_text": transcript,
|
|
303
|
-
"primary_text_source": "asr_clean",
|
|
304
|
-
"analysis_eligibility": "eligible",
|
|
305
|
-
"analysis_exclusion_reason": "",
|
|
306
|
-
"asr_status": "success",
|
|
307
|
-
"asr_error_reason": "",
|
|
308
|
-
"asr_source": "external_asr",
|
|
309
|
-
}, trace
|
|
322
|
+
return _build_transcript_result(
|
|
323
|
+
transcript,
|
|
324
|
+
subtitle_source="external_asr",
|
|
325
|
+
asr_source="external_asr",
|
|
326
|
+
), trace
|
|
310
327
|
|
|
311
328
|
return {
|
|
312
329
|
"subtitle_raw": "",
|
|
@@ -715,17 +732,11 @@ def _run_u2_batch_for_entries(
|
|
|
715
732
|
if (mapped_ok or mapped_status in {"SUCCEEDED", "SUCCESS", "COMPLETED", "DONE"}) and transcript:
|
|
716
733
|
for entry in grouped_entries:
|
|
717
734
|
entry["work"].update(
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
"
|
|
721
|
-
"
|
|
722
|
-
|
|
723
|
-
"analysis_eligibility": "eligible",
|
|
724
|
-
"analysis_exclusion_reason": "",
|
|
725
|
-
"asr_status": "success",
|
|
726
|
-
"asr_error_reason": "",
|
|
727
|
-
"asr_source": "external_asr",
|
|
728
|
-
}
|
|
735
|
+
_build_transcript_result(
|
|
736
|
+
transcript,
|
|
737
|
+
subtitle_source="external_asr",
|
|
738
|
+
asr_source="external_asr",
|
|
739
|
+
)
|
|
729
740
|
)
|
|
730
741
|
mapped_count += 1
|
|
731
742
|
else:
|
|
@@ -927,19 +938,11 @@ def enrich_author_home_asr(
|
|
|
927
938
|
subtitle_invalid = _invalid_subtitle_reason(subtitle_text)
|
|
928
939
|
if subtitle_invalid is None:
|
|
929
940
|
work.update(
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
"
|
|
934
|
-
|
|
935
|
-
"primary_text": subtitle_text,
|
|
936
|
-
"primary_text_source": "asr_clean",
|
|
937
|
-
"analysis_eligibility": "eligible",
|
|
938
|
-
"analysis_exclusion_reason": "",
|
|
939
|
-
"asr_status": "success",
|
|
940
|
-
"asr_error_reason": "",
|
|
941
|
-
"asr_source": "native_subtitle",
|
|
942
|
-
}
|
|
941
|
+
_build_transcript_result(
|
|
942
|
+
subtitle_text,
|
|
943
|
+
subtitle_source="native_subtitle",
|
|
944
|
+
asr_source="native_subtitle",
|
|
945
|
+
)
|
|
943
946
|
)
|
|
944
947
|
trace.append(
|
|
945
948
|
{
|
|
@@ -9,6 +9,7 @@ from urllib.parse import parse_qs, urlparse
|
|
|
9
9
|
from scripts.core.extract_pipeline import build_api_trace
|
|
10
10
|
from scripts.core.progress_report import ProgressReporter
|
|
11
11
|
from scripts.core.tikomni_common import call_json_api, deep_find_all, deep_find_first
|
|
12
|
+
from scripts.pipelines.input_contracts import extract_douyin_sec_uid, extract_xhs_user_id, looks_like_xhs_user_id
|
|
12
13
|
|
|
13
14
|
|
|
14
15
|
def _to_text(value: Any) -> str:
|
|
@@ -359,23 +360,16 @@ def _call_xhs_route(
|
|
|
359
360
|
|
|
360
361
|
|
|
361
362
|
def _guess_douyin_sec_user_id(input_value: str) -> str:
|
|
362
|
-
|
|
363
|
-
if not value:
|
|
364
|
-
return ""
|
|
365
|
-
if "sec_uid=" in value:
|
|
366
|
-
query = parse_qs(urlparse(value).query)
|
|
367
|
-
sec = query.get("sec_uid") or query.get("sec_user_id")
|
|
368
|
-
if sec and sec[0]:
|
|
369
|
-
return sec[0]
|
|
370
|
-
if value.startswith("MS4wLjAB") or value.startswith("MS4wLjA"):
|
|
371
|
-
return value
|
|
372
|
-
return ""
|
|
363
|
+
return str(extract_douyin_sec_uid(input_value) or "")
|
|
373
364
|
|
|
374
365
|
|
|
375
366
|
def _guess_xhs_ids(input_value: str) -> Tuple[str, str]:
|
|
376
367
|
value = (input_value or "").strip()
|
|
377
368
|
if not value:
|
|
378
369
|
return "", ""
|
|
370
|
+
direct_user_id = str(extract_xhs_user_id(value) or "")
|
|
371
|
+
if direct_user_id and looks_like_xhs_user_id(direct_user_id) and not value.startswith(("http://", "https://")):
|
|
372
|
+
return direct_user_id, ""
|
|
379
373
|
parsed = urlparse(value)
|
|
380
374
|
if parsed.query:
|
|
381
375
|
query = parse_qs(parsed.query)
|
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Shared input normalization and validation for social-media pipelines."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import re
|
|
7
|
+
from typing import Dict, List, Optional
|
|
8
|
+
from urllib.parse import parse_qs, urlparse
|
|
9
|
+
|
|
10
|
+
from scripts.core.tikomni_common import normalize_text
|
|
11
|
+
|
|
12
|
+
# Greedy http(s) URL matcher for free text; the negated class stops at
# whitespace plus ASCII/CJK punctuation that commonly trails a pasted link.
_HTTP_URL_RE = re.compile(r"https?://[^\s<>'\",。!?;:)】》]+", re.IGNORECASE)
# Punctuation stripped from the tail of a captured URL (the regex above can
# still swallow a trailing bracket/comma).
_URL_TRAILING_PUNCTUATION = ".,!?;:)]}>'\",。!?;:)】》、"
# Xiaohongshu note ids: 16-32 alphanumerics; user ids: 8-32 alphanumerics.
_XHS_NOTE_ID_RE = re.compile(r"^[0-9A-Za-z]{16,32}$")
_XHS_USER_ID_RE = re.compile(r"^[0-9A-Za-z]{8,32}$")
# Douyin sec_uid values start with the "MS4wLjA" prefix followed by a
# urlsafe-base64-style tail.
_DOUYIN_SEC_UID_RE = re.compile(r"^MS4wLjA[A-Za-z0-9_-]{8,}$")

# Host fragments used to classify URLs by platform.
_DOUYIN_HOST_TOKENS = ("douyin.com", "iesdouyin.com", "v.douyin.com")
_XHS_HOST_TOKENS = ("xiaohongshu.com", "xhslink.com")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _dedupe_keep_order(items: List[str]) -> List[str]:
|
|
23
|
+
unique: List[str] = []
|
|
24
|
+
seen = set()
|
|
25
|
+
for item in items:
|
|
26
|
+
if item in seen:
|
|
27
|
+
continue
|
|
28
|
+
unique.append(item)
|
|
29
|
+
seen.add(item)
|
|
30
|
+
return unique
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _strip_url_punctuation(value: str) -> str:
    """Drop trailing punctuation that the URL regex may have captured."""
    while value and value[-1] in _URL_TRAILING_PUNCTUATION:
        value = value[:-1]
    return value
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def extract_http_urls(text: Optional[str]) -> List[str]:
    """Extract all distinct http(s) URLs from *text*, in order of appearance.

    Returns [] for empty/None input. Trailing punctuation swallowed by the
    greedy regex is stripped before deduplication.
    """
    raw = normalize_text(text)
    if not raw:
        return []
    matches = [_strip_url_punctuation(match.group(0)) for match in _HTTP_URL_RE.finditer(raw)]
    return _dedupe_keep_order([item for item in matches if item])
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _first_platform_url(text: Optional[str], host_tokens: tuple[str, ...]) -> Optional[str]:
    """Return the first URL in *text* whose host contains any of *host_tokens*."""
    for candidate in extract_http_urls(text):
        netloc = urlparse(candidate).netloc.lower()
        matched = any(token in netloc for token in host_tokens)
        if matched:
            return candidate
    return None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _text_contains_host(text: Optional[str], host_tokens: tuple[str, ...]) -> bool:
    """True when the lowercased text mentions any of the given host fragments."""
    haystack = normalize_text(text).lower()
    for token in host_tokens:
        if token in haystack:
            return True
    return False
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def looks_like_douyin_sec_uid(value: Optional[str]) -> bool:
    """True when *value* has the Douyin sec_uid shape (see _DOUYIN_SEC_UID_RE)."""
    return bool(_DOUYIN_SEC_UID_RE.fullmatch(normalize_text(value)))


def looks_like_xhs_note_id(value: Optional[str]) -> bool:
    """True when *value* has the Xiaohongshu note-id shape (16-32 alphanumerics)."""
    return bool(_XHS_NOTE_ID_RE.fullmatch(normalize_text(value)))


def looks_like_xhs_user_id(value: Optional[str]) -> bool:
    """True when *value* has the Xiaohongshu user-id shape (8-32 alphanumerics)."""
    return bool(_XHS_USER_ID_RE.fullmatch(normalize_text(value)))
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def extract_douyin_share_url(text: Optional[str]) -> Optional[str]:
    """Return the first Douyin-hosted URL found in *text*, or None."""
    return _first_platform_url(text, _DOUYIN_HOST_TOKENS)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def extract_douyin_sec_uid(text: Optional[str]) -> Optional[str]:
    """Find a Douyin sec_uid in free text.

    Checks, in order: the text itself as a bare sec_uid, the query
    parameters of any embedded URLs, and finally a raw "sec_uid=..."
    fragment outside a parseable URL. Returns None when nothing matches.
    """
    cleaned = normalize_text(text)
    if not cleaned:
        return None
    if looks_like_douyin_sec_uid(cleaned):
        return cleaned

    # Prefer well-formed URLs: parse their query strings for the id params.
    for candidate_url in extract_http_urls(cleaned):
        params = parse_qs(urlparse(candidate_url).query)
        for param_name in ("sec_uid", "sec_user_id"):
            values = params.get(param_name) or [""]
            candidate = normalize_text(values[0])
            if looks_like_douyin_sec_uid(candidate):
                return candidate

    # Last resort: a bare key=value fragment anywhere in the text.
    fragment = re.search(r"(?:sec_uid|sec_user_id)=([A-Za-z0-9._-]+)", cleaned)
    if fragment is None:
        return None
    candidate = normalize_text(fragment.group(1))
    return candidate if looks_like_douyin_sec_uid(candidate) else None
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def extract_xhs_note_id(text: Optional[str]) -> Optional[str]:
    """Find a Xiaohongshu note id in free text.

    Accepts the text itself as a bare note id, or extracts one from
    /explore/, /discovery/item/ paths or a note_id= query parameter.
    Returns None when nothing matches.
    """
    cleaned = normalize_text(text)
    if not cleaned:
        return None
    if looks_like_xhs_note_id(cleaned):
        return cleaned

    url_patterns = (
        r"/explore/([0-9A-Za-z]+)",
        r"/discovery/item/([0-9A-Za-z]+)",
        r"note_id=([0-9A-Za-z]+)",
    )
    for url_pattern in url_patterns:
        found = re.search(url_pattern, cleaned)
        if found is None:
            continue
        candidate = normalize_text(found.group(1))
        if looks_like_xhs_note_id(candidate):
            return candidate
    return None
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def extract_xhs_user_id(text: Optional[str]) -> Optional[str]:
    """Find a Xiaohongshu user id in free text.

    Accepts the text itself as a bare user id, or extracts one from a
    /user/profile/ path or a user_id=/userid= query parameter. Returns
    None when nothing matches.
    """
    cleaned = normalize_text(text)
    if not cleaned:
        return None
    if looks_like_xhs_user_id(cleaned):
        return cleaned

    url_patterns = (
        r"/user/profile/([0-9A-Za-z]+)",
        r"(?:user_id|userid)=([0-9A-Za-z]+)",
    )
    for url_pattern in url_patterns:
        found = re.search(url_pattern, cleaned)
        if found is None:
            continue
        candidate = normalize_text(found.group(1))
        if looks_like_xhs_user_id(candidate):
            return candidate
    return None
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def extract_xhs_note_share_url(text: Optional[str]) -> Optional[str]:
    """Return the first URL in *text* that plausibly points at an XHS note.

    xhslink.com short links are accepted as-is — they cannot be classified
    further without resolving them. Full xiaohongshu.com URLs must contain
    a note path or note_id parameter.
    """
    for url in extract_http_urls(text):
        parsed = urlparse(url)
        host = parsed.netloc.lower()
        path = parsed.path.lower()
        query = parsed.query.lower()
        if "xhslink.com" in host:
            return url
        if "xiaohongshu.com" not in host:
            continue
        # Note pages live under /explore/ or /discovery/item/, or carry note_id=.
        if "/explore/" in path or "/discovery/item/" in path or "note_id=" in query:
            return url
    return None
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def extract_xhs_creator_share_url(text: Optional[str]) -> Optional[str]:
    """Return the first URL in *text* that plausibly points at an XHS creator.

    xhslink.com short links are accepted as-is (unresolvable here). Full
    xiaohongshu.com URLs must contain a user path or user-id parameter.
    """
    for url in extract_http_urls(text):
        parsed = urlparse(url)
        host = parsed.netloc.lower()
        path = parsed.path.lower()
        query = parsed.query.lower()
        if "xhslink.com" in host:
            return url
        if "xiaohongshu.com" not in host:
            continue
        # NOTE(review): "/user/" also matches any path containing that
        # segment (not just profile pages) — presumably intentional breadth;
        # confirm against real share-URL shapes.
        if "/user/profile/" in path or "/user/" in path or "user_id=" in query or "userid=" in query:
            return url
    return None
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def text_has_douyin_short_link(text: Optional[str]) -> bool:
    """True when *text* mentions a v.douyin.com short link (possibly unresolved)."""
    return _text_contains_host(text, ("v.douyin.com",))


def text_has_xhs_short_link(text: Optional[str]) -> bool:
    """True when *text* mentions an xhslink.com short link (possibly unresolved)."""
    return _text_contains_host(text, ("xhslink.com",))
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def normalize_douyin_work_input(input_value: Optional[str], share_url: Optional[str]) -> Dict[str, object]:
    """Validate/normalize the inputs for a single Douyin work.

    Returns a contract dict:
      share_url      -- normalized Douyin URL, or None
      error_reason   -- machine-readable failure tag, or None
      missing_fields -- list of {field, reason} dicts for reporting

    The explicit *share_url* takes precedence over *input_value*. Empty
    input is not an error here; the caller decides how to report it.
    """
    explicit_share = normalize_text(share_url)
    raw_input = normalize_text(input_value)
    normalized_share = extract_douyin_share_url(explicit_share) or extract_douyin_share_url(raw_input)

    if normalized_share:
        return {
            "share_url": normalized_share,
            "error_reason": None,
            "missing_fields": [],
        }

    if raw_input:
        # A v.douyin.com short link that survived extraction means the link
        # needs resolving upstream before it can be used.
        if text_has_douyin_short_link(raw_input):
            return {
                "share_url": None,
                "error_reason": "short_link_detected_but_unresolved",
                "missing_fields": [{"field": "share_url", "reason": "short_link_unresolved"}],
            }
        return {
            "share_url": None,
            "error_reason": "invalid_share_url",
            "missing_fields": [{"field": "share_url", "reason": "invalid_input"}],
        }

    return {
        "share_url": None,
        "error_reason": None,
        "missing_fields": [],
    }
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def normalize_xhs_note_input(
    input_value: Optional[str],
    share_text: Optional[str],
    note_id: Optional[str],
) -> Dict[str, object]:
    """Validate/normalize the inputs for a single Xiaohongshu note.

    Returns a contract dict:
      share_text     -- normalized note URL, or None
      note_id        -- normalized note id, or None
      error_reason   -- machine-readable failure tag, or None
      missing_fields -- list of {field, reason} dicts for reporting

    An explicit *note_id* with a bad shape fails fast; otherwise the note
    id and share URL are extracted from *share_text* first, then from
    *input_value*. Empty input is not an error here.
    """
    explicit_note_id = normalize_text(note_id)
    # Fail fast on an explicitly supplied but malformed note id.
    if explicit_note_id and not looks_like_xhs_note_id(explicit_note_id):
        return {
            "share_text": None,
            "note_id": None,
            "error_reason": "invalid_note_id",
            "missing_fields": [{"field": "note_id", "reason": "invalid_format"}],
        }

    explicit_share = normalize_text(share_text)
    raw_input = normalize_text(input_value)

    normalized_note_id = explicit_note_id or extract_xhs_note_id(explicit_share) or extract_xhs_note_id(raw_input)
    normalized_share = extract_xhs_note_share_url(explicit_share) or extract_xhs_note_share_url(raw_input)

    # Either a note id or a share URL is enough to proceed.
    if normalized_note_id or normalized_share:
        return {
            "share_text": normalized_share or None,
            "note_id": normalized_note_id or None,
            "error_reason": None,
            "missing_fields": [],
        }

    candidate = explicit_share or raw_input
    if candidate:
        # An xhslink.com short link needs resolving upstream first.
        if text_has_xhs_short_link(candidate):
            return {
                "share_text": None,
                "note_id": None,
                "error_reason": "short_link_detected_but_unresolved",
                "missing_fields": [{"field": "note_id", "reason": "short_link_unresolved"}],
            }
        return {
            "share_text": None,
            "note_id": None,
            "error_reason": "invalid_note_id",
            "missing_fields": [{"field": "note_id", "reason": "invalid_format"}],
        }

    return {
        "share_text": None,
        "note_id": None,
        "error_reason": None,
        "missing_fields": [],
    }
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def normalize_douyin_creator_input(input_value: Optional[str]) -> Dict[str, object]:
    """Validate/normalize a Douyin creator input (sec_uid or profile URL).

    Returns a contract dict:
      input_value    -- normalized sec_uid or Douyin URL, or None
      error_reason   -- machine-readable failure tag, or None
      missing_fields -- list of {field, reason} dicts for reporting

    Empty input is not an error here; the caller decides how to report it.
    """
    raw_input = normalize_text(input_value)
    # Extract once — the original called extract_douyin_sec_uid and
    # extract_douyin_share_url twice each (once to build the value, once for
    # the success test), and its `or raw_input or None` fallback was dead
    # code because the success branch already required sec/share to be set.
    normalized_input = extract_douyin_sec_uid(raw_input) or extract_douyin_share_url(raw_input)

    if normalized_input:
        return {
            "input_value": normalized_input,
            "error_reason": None,
            "missing_fields": [],
        }

    if raw_input:
        # A v.douyin.com short link needs resolving upstream first.
        if text_has_douyin_short_link(raw_input):
            return {
                "input_value": None,
                "error_reason": "short_link_detected_but_unresolved",
                "missing_fields": [{"field": "platform_author_id", "reason": "short_link_unresolved"}],
            }
        return {
            "input_value": None,
            "error_reason": "invalid_creator_input",
            "missing_fields": [{"field": "platform_author_id", "reason": "invalid_format"}],
        }

    return {
        "input_value": None,
        "error_reason": None,
        "missing_fields": [],
    }
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def normalize_xhs_creator_input(input_value: Optional[str]) -> Dict[str, object]:
    """Validate/normalize a Xiaohongshu creator input (user id or profile URL).

    Returns a contract dict:
      input_value    -- normalized user id or creator URL, or None
      error_reason   -- machine-readable failure tag, or None
      missing_fields -- list of {field, reason} dicts for reporting

    Empty input is not an error here; the caller decides how to report it.
    """
    raw_input = normalize_text(input_value)
    # Prefer a bare/extractable user id, then a recognizable creator URL.
    normalized_input = extract_xhs_user_id(raw_input) or extract_xhs_creator_share_url(raw_input) or None

    if normalized_input:
        return {
            "input_value": normalized_input,
            "error_reason": None,
            "missing_fields": [],
        }

    if raw_input:
        # An xhslink.com short link needs resolving upstream first.
        if text_has_xhs_short_link(raw_input):
            return {
                "input_value": None,
                "error_reason": "short_link_detected_but_unresolved",
                "missing_fields": [{"field": "platform_author_id", "reason": "short_link_unresolved"}],
            }
        return {
            "input_value": None,
            "error_reason": "invalid_creator_input",
            "missing_fields": [{"field": "platform_author_id", "reason": "invalid_format"}],
        }

    return {
        "input_value": None,
        "error_reason": None,
        "missing_fields": [],
    }
|