npm - @tikomni/skills - Version diff - 0.1.7 → 0.1.9 - Mend

@tikomni/skills 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tikomni/skills",
-  "version": "0.1.7",
+  "version": "0.1.9",
   "description": "TikOmni skill installer CLI for structured social media crawling in Codex, Claude Code, and OpenClaw",
   "license": "MIT",
   "homepage": "https://github.com/mark-ly-wang/TikOmni-Skills#readme",

package/skills/social-media-crawl/scripts/core/tikomni_common.py CHANGED Viewed

@@ -257,6 +257,13 @@ def _resolve_timeout_retry_backoff_ms() -> int:
     return max(0, min(backoff, 5000))
+def resolve_timeout_retry_policy() -> Dict[str, int]:
+    """Return the timeout-retry settings as a plain dictionary.
+
+    Returns:
+        A mapping with ``max_retries`` (from ``_resolve_timeout_retry_max``)
+        and ``backoff_ms`` (from ``_resolve_timeout_retry_backoff_ms``).
+    """
+    max_retries = _resolve_timeout_retry_max()
+    backoff_ms = _resolve_timeout_retry_backoff_ms()
+    return {"max_retries": max_retries, "backoff_ms": backoff_ms}
 def _wait_rate_limit_slot(qps: float) -> int:
     global _NEXT_ALLOWED_TS
     interval_sec = 1.0 / max(qps, 0.1)

package/skills/social-media-crawl/scripts/core/u3_fallback.py CHANGED Viewed

@@ -5,18 +5,26 @@ from __future__ import annotations
 import mimetypes
 import os
+import socket
 import tempfile
+import time
 import urllib.error
 import urllib.parse
 import urllib.request
 from pathlib import Path
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional
-from scripts.core.tikomni_common import DEFAULT_USER_AGENT, call_json_api, normalize_text
+from scripts.core.tikomni_common import (
+    DEFAULT_USER_AGENT,
+    call_json_api,
+    normalize_text,
+    resolve_timeout_retry_policy,
+)
 DEFAULT_U3_PROVIDER = "oss"
 DEFAULT_CONTENT_TYPE = "video/mp4"
 DOWNLOAD_CHUNK_SIZE = 1024 * 1024
+TIMEOUT_LIKE_HTTP_STATUS_CODES = {408, 429, 502, 503, 504}
 def _safe_name_from_url(source_url: str) -> str:
@@ -135,6 +143,16 @@ def create_u3_upload(
     )
+def _is_timeout_like_upload_error(status_code: Optional[int], error_reason: Optional[str]) -> bool:
+    """Tell whether an upload failure looks like a timeout.
+
+    Args:
+        status_code: HTTP status of the failed attempt, or ``None``.
+        error_reason: Failure description, or ``None``.
+
+    Returns:
+        ``True`` when the status is in ``TIMEOUT_LIKE_HTTP_STATUS_CODES`` or
+        the lower-cased reason contains a timeout marker; ``False`` otherwise.
+    """
+    if isinstance(status_code, (int, float)):
+        if int(status_code) in TIMEOUT_LIKE_HTTP_STATUS_CODES:
+            return True
+    message = str(error_reason or "").strip().lower()
+    if message == "":
+        return False
+    for marker in ("timeout", "timed out", "deadline exceeded"):
+        if marker in message:
+            return True
+    return False
 def upload_file_to_presigned_url(
     *,
     upload_url: str,
@@ -147,35 +165,130 @@ def upload_file_to_presigned_url(
     try:
         with open(file_path, "rb") as handle:
             data = handle.read()
-        headers = {
-            "Content-Type": content_type or DEFAULT_CONTENT_TYPE,
-            "User-Agent": os.getenv("TIKOMNI_HTTP_USER_AGENT", DEFAULT_USER_AGENT),
+    except Exception as error:
+        return {
+            "ok": False,
+            "status_code": None,
+            "error_reason": f"u3_upload_failed:{normalize_text(error)}",
+            "retry_attempt": 0,
+            "timeout_retry_max": 0,
+            "timeout_retry_exhausted": False,
+            "retry_chain": [],
         }
-        if isinstance(upload_headers, dict):
-            for key, value in upload_headers.items():
-                header_key = str(key).strip()
-                if not header_key:
-                    continue
-                headers[header_key] = str(value)
-        request = urllib.request.Request(
-            upload_url,
-            data=data,
-            headers=headers,
-            method=(upload_method or "PUT").upper(),
+    headers = {
+        "Content-Type": content_type or DEFAULT_CONTENT_TYPE,
+        "User-Agent": os.getenv("TIKOMNI_HTTP_USER_AGENT", DEFAULT_USER_AGENT),
+    }
+    if isinstance(upload_headers, dict):
+        for key, value in upload_headers.items():
+            header_key = str(key).strip()
+            if not header_key:
+                continue
+            headers[header_key] = str(value)
+    retry_policy = resolve_timeout_retry_policy()
+    timeout_retry_max = int(retry_policy.get("max_retries", 0) or 0)
+    retry_backoff_ms = int(retry_policy.get("backoff_ms", 0) or 0)
+    max_attempts = 1 + timeout_retry_max
+    retry_chain: List[Dict[str, Any]] = []
+    last_result: Dict[str, Any] = {
+        "ok": False,
+        "status_code": None,
+        "error_reason": "u3_upload_failed:unknown",
+    }
+    for attempt in range(1, max_attempts + 1):
+        if attempt > 1 and retry_backoff_ms > 0:
+            sleep_ms = retry_backoff_ms * (2 ** (attempt - 2))
+            time.sleep(sleep_ms / 1000.0)
+        try:
+            request = urllib.request.Request(
+                upload_url,
+                data=data,
+                headers=headers,
+                method=(upload_method or "PUT").upper(),
+            )
+            with urllib.request.urlopen(request, timeout=max(timeout_ms / 1000.0, 1.0)) as response:
+                status_code = response.getcode()
+                result: Dict[str, Any] = {
+                    "ok": 200 <= int(status_code) < 300,
+                    "status_code": status_code,
+                    "error_reason": None if 200 <= int(status_code) < 300 else f"u3_upload_http_{status_code}",
+                }
+        except urllib.error.HTTPError as error:
+            result = {
+                "ok": False,
+                "status_code": error.code,
+                "error_reason": f"u3_upload_http_{error.code}",
+            }
+        except urllib.error.URLError as error:
+            reason_obj = getattr(error, "reason", error)
+            reason_text = normalize_text(reason_obj)
+            result = {
+                "ok": False,
+                "status_code": None,
+                "error_reason": f"u3_upload_failed:{reason_text or 'network_error'}",
+                "_timeout_like": isinstance(reason_obj, socket.timeout)
+                or _is_timeout_like_upload_error(status_code=None, error_reason=reason_text),
+            }
+        except (TimeoutError, socket.timeout) as error:
+            result = {
+                "ok": False,
+                "status_code": None,
+                "error_reason": f"u3_upload_failed:{normalize_text(error) or 'timeout'}",
+                "_timeout_like": True,
+            }
+        except Exception as error:
+            reason_text = normalize_text(error)
+            result = {
+                "ok": False,
+                "status_code": None,
+                "error_reason": f"u3_upload_failed:{reason_text or 'unknown'}",
+                "_timeout_like": _is_timeout_like_upload_error(status_code=None, error_reason=reason_text),
+            }
+        if result.get("ok"):
+            result["retry_attempt"] = max(0, attempt - 1)
+            result["timeout_retry_max"] = timeout_retry_max
+            result["timeout_retry_exhausted"] = False
+            result["retry_chain"] = retry_chain
+            return result
+        timeout_like = bool(
+            result.pop(
+                "_timeout_like",
+                _is_timeout_like_upload_error(
+                    status_code=result.get("status_code"),
+                    error_reason=result.get("error_reason"),
+                ),
+            )
         )
-        with urllib.request.urlopen(request, timeout=max(timeout_ms / 1000.0, 1.0)) as response:
-            status_code = response.getcode()
-            return {
-                "ok": 200 <= int(status_code) < 300,
-                "status_code": status_code,
-                "error_reason": None if 200 <= int(status_code) < 300 else f"u3_upload_http_{status_code}",
+        retry_chain.append(
+            {
+                "attempt": attempt,
+                "status_code": result.get("status_code"),
+                "error_reason": result.get("error_reason"),
+                "timeout_like": timeout_like,
             }
-    except urllib.error.HTTPError as error:
-        return {"ok": False, "status_code": error.code, "error_reason": f"u3_upload_http_{error.code}"}
-    except Exception as error:
-        return {"ok": False, "status_code": None, "error_reason": f"u3_upload_failed:{normalize_text(error)}"}
+        )
+        last_result = dict(result)
+        if timeout_like and attempt < max_attempts:
+            continue
+        last_result["retry_attempt"] = max(0, attempt - 1)
+        last_result["timeout_retry_max"] = timeout_retry_max
+        last_result["timeout_retry_exhausted"] = bool(timeout_like and attempt >= max_attempts)
+        last_result["retry_chain"] = retry_chain
+        return last_result
+    last_result["retry_attempt"] = timeout_retry_max
+    last_result["timeout_retry_max"] = timeout_retry_max
+    last_result["timeout_retry_exhausted"] = True
+    last_result["retry_chain"] = retry_chain
+    return last_result
 def complete_u3_upload(
@@ -284,6 +397,11 @@ def run_u3_public_url_fallback(
             "ok": bool(upload_response.get("ok")),
             "status_code": upload_response.get("status_code"),
             "error_reason": upload_response.get("error_reason"),
+            "retry_attempt": upload_response.get("retry_attempt", 0),
+            "retry_count": len(upload_response.get("retry_chain") or []),
+            "timeout_retry_max": upload_response.get("timeout_retry_max", 0),
+            "timeout_retry_exhausted": bool(upload_response.get("timeout_retry_exhausted")),
+            "retry_chain": upload_response.get("retry_chain") or [],
         }
     )
     if not upload_response.get("ok"):

package/skills/social-media-crawl/scripts/pipelines/douyin_metadata.py ADDED Viewed

@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+"""Shared Douyin metadata extraction helpers."""
+from __future__ import annotations
+import re
+from typing import Any, Dict, Optional
+from scripts.core.tikomni_common import normalize_text
+# Placeholder strings that are rejected as an author handle; candidates are
+# lower-cased before being compared against this set.
+INVALID_AUTHOR_HANDLE_VALUES = {"0", "unknown", "none", "null", "nil", "na", "n/a"}
+# Matches a title that optionally starts with "@" and ends with one of the
+# "original sound" suffixes.
+# NOTE(review): presumably these are Douyin's auto-generated audio-track
+# titles rather than work titles - confirm against real payloads.
+MUSIC_TITLE_PATTERN = re.compile(r"^@?.+?(?:创作的原声|作品使用的原声|的原声)$")
+def _safe_int(value: Any) -> Optional[int]:
+    """Coerce *value* to an ``int``, returning ``None`` when that is impossible.
+
+    Args:
+        value: ``None``, a bool, an int, a float, or anything that
+            ``normalize_text`` can turn into text (thousands separators
+            written as "," are ignored).
+
+    Returns:
+        The integer value (floats and decimal strings are truncated toward
+        zero), or ``None`` for ``None``, empty text, NaN, infinities and
+        text that is not numeric.
+    """
+    if value is None:
+        return None
+    if isinstance(value, bool):
+        return int(value)
+    if isinstance(value, int):
+        return value
+    if isinstance(value, float):
+        # int() raises ValueError on NaN and OverflowError on +/-inf.
+        try:
+            return int(value)
+        except (ValueError, OverflowError):
+            return None
+    text = normalize_text(value)
+    if not text:
+        return None
+    digits = text.replace(",", "")
+    # Parse integer text directly so values above 2**53 are not rounded by a
+    # detour through float.
+    try:
+        return int(digits)
+    except ValueError:
+        pass
+    # Fall back to float parsing for decimal or exponent notation.
+    try:
+        return int(float(digits))
+    except (ValueError, OverflowError):
+        return None
+def normalize_douyin_author_handle(*values: Any) -> str:
+    """Return the first usable author handle among *values*.
+
+    Each candidate is normalized with ``normalize_text``; empty results and
+    placeholders listed in ``INVALID_AUTHOR_HANDLE_VALUES`` (case-insensitive)
+    are skipped. Returns ``""`` when no candidate qualifies.
+    """
+    # The generator is lazy, so candidates after the first match are never normalized.
+    normalized = (normalize_text(candidate) for candidate in values)
+    return next(
+        (
+            handle
+            for handle in normalized
+            if handle and handle.lower() not in INVALID_AUTHOR_HANDLE_VALUES
+        ),
+        "",
+    )
+def looks_like_douyin_music_title(value: Any) -> bool:
+    """Tell whether *value*, once normalized, matches ``MUSIC_TITLE_PATTERN``.
+
+    Empty or missing values are never considered music titles.
+    """
+    candidate = normalize_text(value)
+    return bool(candidate) and MUSIC_TITLE_PATTERN.match(candidate) is not None
+def extract_douyin_caption(item: Dict[str, Any]) -> str:
+    """Return the first non-empty caption-like field of a work object.
+
+    Fields are tried in the order ``desc``, ``caption``, ``content``,
+    ``item_title``, ``preview_title``, ``title``. A non-dict *item* or a
+    work without any of these yields ``""``.
+    """
+    if isinstance(item, dict):
+        caption_fields = ("desc", "caption", "content", "item_title", "preview_title", "title")
+        for field in caption_fields:
+            cleaned = normalize_text(item.get(field))
+            if cleaned:
+                return cleaned
+    return ""
+def title_from_douyin_caption(caption: Any) -> str:
+    """Derive a title from a caption by cutting it at the first hashtag.
+
+    Returns the text before the first ``#tag`` (stripped). If nothing is
+    left, for example when the caption starts with a hashtag, the whole
+    normalized caption is returned. Empty input yields ``""``.
+    """
+    full_text = normalize_text(caption)
+    if not full_text:
+        return ""
+    before_hashtag, *_ = re.split(r"\s*#\S+", full_text, maxsplit=1)
+    return before_hashtag.strip() or full_text
+def extract_douyin_title(item: Dict[str, Any]) -> str:
+    """Pick the work title, falling back to a title derived from the caption.
+
+    ``item_title``, ``preview_title`` and ``title`` are tried in that order.
+    A candidate that looks like a music title is skipped when a
+    caption-derived title is available. A non-dict *item* yields ``""``.
+    """
+    if not isinstance(item, dict):
+        return ""
+    fallback_title = title_from_douyin_caption(extract_douyin_caption(item))
+    # Only the work object's own title-like fields are consulted: the nested
+    # `music.title` names the audio track, not the work.
+    for field in ("item_title", "preview_title", "title"):
+        value = normalize_text(item.get(field))
+        if value and not (looks_like_douyin_music_title(value) and fallback_title):
+            return value
+    return fallback_title
+def extract_douyin_author(item: Dict[str, Any]) -> Dict[str, Optional[str]]:
+    """Collect author identifiers and profile text from a work object.
+
+    Args:
+        item: The work object; its ``author`` entry is used when it is a dict.
+            A non-dict *item* is treated as an empty work.
+
+    Returns:
+        A dict with the keys ``author_handle``, ``platform_author_id``,
+        ``author_platform_id``, ``douyin_sec_uid``, ``douyin_aweme_author_id``,
+        ``unique_id``, ``nickname`` and ``signature``. Every value is
+        normalized text, or ``None`` when nothing usable was found.
+    """
+    # Tolerate a missing or malformed work object instead of raising
+    # AttributeError, matching the guards in the caption and title extractors.
+    if not isinstance(item, dict):
+        item = {}
+    raw_author = item.get("author")
+    author = raw_author if isinstance(raw_author, dict) else {}
+    author_platform_id = (
+        normalize_text(author.get("uid"))
+        or normalize_text(author.get("id"))
+        or normalize_text(item.get("author_user_id"))
+    )
+    # The nickname is the last-resort handle when no id-like field is usable.
+    author_handle = normalize_douyin_author_handle(
+        author.get("unique_id"),
+        author.get("short_id"),
+        author.get("douyin_id"),
+        author.get("display_id"),
+        author.get("nickname"),
+    )
+    douyin_sec_uid = normalize_text(author.get("sec_uid"))
+    douyin_aweme_author_id = normalize_text(item.get("author_user_id")) or author_platform_id
+    return {
+        "author_handle": author_handle or None,
+        "platform_author_id": author_platform_id or None,
+        "author_platform_id": author_platform_id or None,
+        "douyin_sec_uid": douyin_sec_uid or None,
+        "douyin_aweme_author_id": douyin_aweme_author_id or None,
+        "unique_id": normalize_text(author.get("unique_id")) or None,
+        "nickname": normalize_text(author.get("nickname")) or None,
+        "signature": normalize_text(author.get("signature")) or None,
+    }
+def extract_douyin_metrics(item: Dict[str, Any]) -> Dict[str, Optional[int]]:
+    """Read engagement counters from a work object.
+
+    Args:
+        item: The work object; counters are looked up in its ``statistics``
+            dict first and on the work itself second. A non-dict *item* is
+            treated as an empty work.
+
+    Returns:
+        A dict with ``digg_count``, ``comment_count``, ``collect_count`` and
+        ``share_count`` (each defaulting to 0) plus ``play_count``, which is
+        ``None`` when absent or when it is non-positive although another
+        counter is positive.
+    """
+    # Tolerate a missing or malformed work object instead of raising
+    # AttributeError, matching the guards in the caption and title extractors.
+    if not isinstance(item, dict):
+        item = {}
+    statistics = item.get("statistics") if isinstance(item.get("statistics"), dict) else {}
+    def metric(*keys: str, default: Optional[int] = 0) -> Optional[int]:
+        # For each alias, prefer the nested statistics value over the top-level one.
+        for key in keys:
+            value = _safe_int(statistics.get(key))
+            if value is not None:
+                return value
+            value = _safe_int(item.get(key))
+            if value is not None:
+                return value
+        return default
+    metrics = {
+        "digg_count": metric("digg_count", "like_count"),
+        "comment_count": metric("comment_count"),
+        "collect_count": metric("collect_count"),
+        "share_count": metric("share_count", "forward_count"),
+        "play_count": metric("play_count", "view_count", default=None),
+    }
+    play_count = metrics.get("play_count")
+    engagement_floor = max(
+        int(metrics.get("digg_count") or 0),
+        int(metrics.get("comment_count") or 0),
+        int(metrics.get("collect_count") or 0),
+        int(metrics.get("share_count") or 0),
+    )
+    # A non-positive play count alongside real engagement is reported as
+    # unknown rather than as a literal value.
+    if play_count is not None and int(play_count) <= 0 and engagement_floor > 0:
+        metrics["play_count"] = None
+    return metrics

package/skills/social-media-crawl/scripts/pipelines/home_asr.py CHANGED Viewed

@@ -12,6 +12,7 @@ from scripts.core.progress_report import ProgressReporter
 from scripts.core.tikomni_common import normalize_text
 from scripts.core.asr_pipeline import (
     clamp_u2_batch_submit_size,
+    derive_asr_clean_text,
     normalize_media_url,
     run_u2_asr_batch_with_timeout_retry,
     run_u2_asr_candidates_with_timeout_retry,
@@ -123,6 +124,30 @@ def _clean_text(text: Any) -> str:
     return "\n".join(lines).strip()
+def _build_transcript_result(
+    raw_text: Any,
+    *,
+    subtitle_source: str,
+    asr_source: str,
+) -> Dict[str, Any]:
+    """Build the transcript-related fields of a work from raw transcript text.
+
+    Args:
+        raw_text: Transcript or subtitle text; it is passed through ``_clean_text``.
+        subtitle_source: Value stored under ``subtitle_source``.
+        asr_source: Value stored under ``asr_source``.
+
+    Returns:
+        A dict of subtitle/ASR fields. ``primary_text`` is the output of
+        ``derive_asr_clean_text`` when non-empty, otherwise the cleaned
+        transcript. The status and eligibility fields report success only
+        when the cleaned transcript is non-empty.
+    """
+    transcript = _clean_text(raw_text)
+    asr_clean = derive_asr_clean_text(transcript)
+    has_transcript = bool(transcript)
+    # NOTE(review): the source label stays "asr_clean" even when primary_text
+    # falls back to the raw transcript, and asr_error_reason stays empty on
+    # failure - confirm this is intended.
+    return {
+        "subtitle_raw": transcript,
+        "subtitle_source": subtitle_source,
+        "asr_raw": transcript,
+        "asr_clean": asr_clean,
+        "primary_text": asr_clean or transcript,
+        "primary_text_source": "asr_clean",
+        "analysis_eligibility": "eligible" if has_transcript else "incomplete",
+        "analysis_exclusion_reason": "" if has_transcript else "video_asr_unavailable",
+        "asr_status": "success" if has_transcript else "failed",
+        "asr_error_reason": "",
+        "asr_source": asr_source,
+    }
 def _subtitle_text_from_raw(raw: str) -> str:
     content = (raw or "").strip()
     if not content:
@@ -294,19 +319,11 @@ def _run_u2_for_work(
     }
     if transcript:
-        return {
-            "subtitle_raw": transcript,
-            "subtitle_source": "external_asr",
-            "asr_raw": transcript,
-            "asr_clean": transcript,
-            "primary_text": transcript,
-            "primary_text_source": "asr_clean",
-            "analysis_eligibility": "eligible",
-            "analysis_exclusion_reason": "",
-            "asr_status": "success",
-            "asr_error_reason": "",
-            "asr_source": "external_asr",
-        }, trace
+        return _build_transcript_result(
+            transcript,
+            subtitle_source="external_asr",
+            asr_source="external_asr",
+        ), trace
     return {
         "subtitle_raw": "",
@@ -715,17 +732,11 @@ def _run_u2_batch_for_entries(
         if (mapped_ok or mapped_status in {"SUCCEEDED", "SUCCESS", "COMPLETED", "DONE"}) and transcript:
             for entry in grouped_entries:
                 entry["work"].update(
-                    {
-                        "asr_raw": transcript,
-                        "asr_clean": transcript,
-                        "primary_text": transcript,
-                        "primary_text_source": "asr_clean",
-                        "analysis_eligibility": "eligible",
-                        "analysis_exclusion_reason": "",
-                        "asr_status": "success",
-                        "asr_error_reason": "",
-                        "asr_source": "external_asr",
-                    }
+                    _build_transcript_result(
+                        transcript,
+                        subtitle_source="external_asr",
+                        asr_source="external_asr",
+                    )
                 )
                 mapped_count += 1
         else:
@@ -927,19 +938,11 @@ def enrich_author_home_asr(
             subtitle_invalid = _invalid_subtitle_reason(subtitle_text)
             if subtitle_invalid is None:
                 work.update(
-                    {
-                        "subtitle_raw": subtitle_text,
-                        "subtitle_source": "native_subtitle",
-                        "asr_raw": subtitle_text,
-                        "asr_clean": subtitle_text,
-                        "primary_text": subtitle_text,
-                        "primary_text_source": "asr_clean",
-                        "analysis_eligibility": "eligible",
-                        "analysis_exclusion_reason": "",
-                        "asr_status": "success",
-                        "asr_error_reason": "",
-                        "asr_source": "native_subtitle",
-                    }
+                    _build_transcript_result(
+                        subtitle_text,
+                        subtitle_source="native_subtitle",
+                        asr_source="native_subtitle",
+                    )
                 )
                 trace.append(
                     {

package/skills/social-media-crawl/scripts/pipelines/homepage_collectors.py CHANGED Viewed

@@ -9,6 +9,7 @@ from urllib.parse import parse_qs, urlparse
 from scripts.core.extract_pipeline import build_api_trace
 from scripts.core.progress_report import ProgressReporter
 from scripts.core.tikomni_common import call_json_api, deep_find_all, deep_find_first
+from scripts.pipelines.input_contracts import extract_douyin_sec_uid, extract_xhs_user_id, looks_like_xhs_user_id
 def _to_text(value: Any) -> str:
@@ -359,23 +360,16 @@ def _call_xhs_route(
 def _guess_douyin_sec_user_id(input_value: str) -> str:
-    value = (input_value or "").strip()
-    if not value:
-        return ""
-    if "sec_uid=" in value:
-        query = parse_qs(urlparse(value).query)
-        sec = query.get("sec_uid") or query.get("sec_user_id")
-        if sec and sec[0]:
-            return sec[0]
-    if value.startswith("MS4wLjAB") or value.startswith("MS4wLjA"):
-        return value
-    return ""
+    return str(extract_douyin_sec_uid(input_value) or "")
 def _guess_xhs_ids(input_value: str) -> Tuple[str, str]:
     value = (input_value or "").strip()
     if not value:
         return "", ""
+    direct_user_id = str(extract_xhs_user_id(value) or "")
+    if direct_user_id and looks_like_xhs_user_id(direct_user_id) and not value.startswith(("http://", "https://")):
+        return direct_user_id, ""
     parsed = urlparse(value)
     if parsed.query:
         query = parse_qs(parsed.query)