npm - @tikomni/skills - Versions diffs - 0.1.7 → 0.1.8 - Mend

@tikomni/skills 0.1.7 → 0.1.8

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (12) hide show

package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_creator_home.py CHANGED Viewed

@@ -28,9 +28,14 @@ from scripts.core.progress_report import build_progress_reporter
 from scripts.core.storage_router import resolve_author_directory_name
 from scripts.core.tikomni_common import resolve_runtime, write_json_stdout
 from scripts.pipelines.home_asr import enrich_author_home_asr
+from scripts.pipelines.input_contracts import normalize_xhs_creator_input
+from scripts.pipelines.schema import build_author_profile
 from scripts.pipelines.xiaohongshu_creator_home_helpers import collect_and_adapt
 from scripts.writers.write_work_fact_card import build_work_fact_card, persist_output_envelope, write_work_fact_card
+DEFAULT_MAX_ITEMS = 200
+MAX_ITEMS_HARD_LIMIT = 200
 def _write_collection_artifacts(
     *,
@@ -81,11 +86,12 @@ def run_xiaohongshu_creator_home(
     *,
     input_value: str,
     config: Dict[str, Any],
-    runtime: Dict[str, Any],
+    runtime: Dict[str, Any] | None,
     max_items: int,
     write_card: bool,
     persist_output: bool,
 ) -> Dict[str, Any]:
+    bounded_max_items = max(1, min(int(max_items), MAX_ITEMS_HARD_LIMIT))
     progress = build_progress_reporter(
         workflow="social-media-crawl",
         platform="xiaohongshu",
@@ -94,15 +100,69 @@ def run_xiaohongshu_creator_home(
         scope="workflow",
     )
     progress.started(stage="author_home.workflow", message="xiaohongshu author_home workflow started")
+    preflight = normalize_xhs_creator_input(input_value)
+    normalized_input_value = str(preflight.get("input_value") or "")
+    if preflight.get("error_reason"):
+        request_id = ensure_request_id(None, fallback_seed=input_value)
+        empty_profile = build_author_profile(platform="xiaohongshu", request_id=request_id)
+        extract_trace = [
+            {
+                "step": "input.preflight",
+                "ok": False,
+                "input_kind": "creator_url_or_user_id",
+                "normalized_input_value": normalized_input_value or None,
+                "error_reason": preflight.get("error_reason"),
+                "missing_fields": list(preflight.get("missing_fields") or []),
+            }
+        ]
+        envelope = {
+            "object_type": "creator",
+            "platform": "xiaohongshu",
+            "input": input_value,
+            "normalized": {
+                "creator_profile": {**empty_profile, "request_id": request_id, "extract_trace": extract_trace},
+                "work_collection": {
+                    "platform": "xiaohongshu",
+                    "platform_author_id": "",
+                    "count": 0,
+                    "items": [],
+                    "request_id": request_id,
+                    "extract_trace": extract_trace,
+                },
+            },
+            "completeness": evaluate_collection(empty_profile, []),
+            "missing_fields": normalize_missing_fields(preflight.get("missing_fields")),
+            "error_reason": str(preflight.get("error_reason") or "invalid_creator_input"),
+            "extract_trace": extract_trace,
+            "request_id": request_id,
+            "card_write": {
+                "enabled": bool(write_card),
+                "ok": False,
+                "count": 0,
+                "results": [],
+                "reason": "skipped_invalid_input",
+            },
+            "collection_artifacts": {},
+            "output_persist": {"enabled": False, "skipped": True, "reason": "invalid_input"},
+        }
+        progress.done(
+            stage="author_home.workflow",
+            message="xiaohongshu author_home workflow finished",
+            data={"request_id": request_id, "works_count": 0, "error_reason": envelope["error_reason"]},
+        )
+        return envelope
+    if runtime is None:
+        raise ValueError("runtime_required_for_valid_input")
     raw, profile, works, missing = collect_and_adapt(
-        input_value=input_value,
+        input_value=normalized_input_value or input_value,
         base_url=runtime["base_url"],
         token=runtime["token"],
         timeout_ms=runtime["timeout_ms"],
         page_size=20,
         pages_max=50,
-        max_items=max(1, int(max_items)),
+        max_items=bounded_max_items,
         progress=progress.child(scope="author_home.collect"),
     )
@@ -138,7 +198,7 @@ def run_xiaohongshu_creator_home(
     request_id = ensure_request_id(
         raw.get("request_id") or profile.get("request_id"),
-        fallback_seed=input_value,
+        fallback_seed=normalized_input_value or input_value,
     )
     extract_trace = list(raw.get("extract_trace") or []) + list(asr_bundle.get("trace") or [])
@@ -206,7 +266,12 @@ def main() -> None:
     parser.add_argument("--allow-process-env", action="store_true", help="Allow process env overrides")
     parser.add_argument("--base-url", default=None, help="Override Tikomni base URL")
     parser.add_argument("--timeout-ms", type=int, default=None, help="Override timeout in ms")
-    parser.add_argument("--max-items", type=int, default=5, help="Max works to collect from homepage")
+    parser.add_argument(
+        "--max-items",
+        type=int,
+        default=DEFAULT_MAX_ITEMS,
+        help=f"Max works to collect from homepage (default full crawl, capped at {MAX_ITEMS_HARD_LIMIT})",
+    )
     parser.set_defaults(write_card=True, persist_output=True)
     parser.add_argument("--write-card", dest="write_card", action="store_true", help="Write work fact cards")
     parser.add_argument("--no-write-card", dest="write_card", action="store_false", help="Skip card writing")
@@ -215,6 +280,19 @@ def main() -> None:
     args = parser.parse_args()
     config, _ = load_tikomni_config(args.config, env_file=args.env_file, allow_process_env=args.allow_process_env)
+    preflight = normalize_xhs_creator_input(args.input)
+    if preflight.get("error_reason"):
+        write_json_stdout(
+            run_xiaohongshu_creator_home(
+                input_value=args.input,
+                config=config,
+                runtime=None,
+                max_items=int(args.max_items),
+                write_card=bool(args.write_card),
+                persist_output=bool(args.persist_output),
+            )
+        )
+        return
     runtime = resolve_runtime(
         env_file=args.env_file,
         api_key_env=str(config_get(config, "runtime.auth_env_key", "TIKOMNI_API_KEY")),
@@ -224,7 +302,7 @@ def main() -> None:
     )
     write_json_stdout(
         run_xiaohongshu_creator_home(
-            input_value=args.input,
+            input_value=str(preflight.get("input_value") or args.input),
             config=config,
             runtime=runtime,
             max_items=int(args.max_items),

package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_single_work.py CHANGED Viewed

@@ -40,6 +40,12 @@ from scripts.core.tikomni_common import (
     summarize_content,
     write_json_stdout,
 )
+from scripts.pipelines.input_contracts import (
+    extract_xhs_note_id as extract_shared_xhs_note_id,
+    normalize_xhs_note_input,
+    text_has_xhs_short_link,
+)
+from scripts.pipelines.media_url_rules import filter_video_urls, is_probable_video_url
 from scripts.writers.write_work_fact_card import (
     build_work_output_envelope,
     persist_output_envelope,
@@ -194,36 +200,15 @@ def _finalize_result(
 def _normalize_input(input_value: Optional[str], share_text: Optional[str], note_id: Optional[str]) -> Dict[str, Optional[str]]:
-    normalized_share = normalize_text(share_text) or None
-    normalized_note_id = normalize_text(note_id) or None
-    if input_value and not normalized_share and not normalized_note_id:
-        candidate = input_value.strip()
-        if candidate.startswith("http://") or candidate.startswith("https://"):
-            normalized_share = candidate
-        else:
-            normalized_note_id = candidate
+    normalized = normalize_xhs_note_input(input_value, share_text, note_id)
     return {
-        "share_text": normalized_share,
-        "note_id": normalized_note_id,
+        "share_text": normalize_text(normalized.get("share_text")) or None,
+        "note_id": normalize_text(normalized.get("note_id")) or None,
     }
 def _extract_note_id_from_share(share_text: Optional[str]) -> Optional[str]:
-    if not share_text:
-        return None
-    text = share_text.strip()
-    patterns = [
-        r"/explore/([0-9a-zA-Z]+)",
-        r"/discovery/item/([0-9a-zA-Z]+)",
-        r"note_id=([0-9a-zA-Z]+)",
-    ]
-    for pattern in patterns:
-        match = re.search(pattern, text)
-        if match:
-            return match.group(1)
-    return None
+    return extract_shared_xhs_note_id(share_text)
 def _resolve_note_id(payload: Any, source_input: Dict[str, Optional[str]]) -> Optional[str]:
@@ -256,13 +241,7 @@ def _resolve_note_id(payload: Any, source_input: Dict[str, Optional[str]]) -> Op
 def _is_short_share_url(share_text: Optional[str]) -> bool:
-    if not share_text:
-        return False
-    try:
-        host = urllib.parse.urlparse(share_text).netloc.lower()
-    except Exception:
-        return False
-    return "xhslink.com" in host
+    return text_has_xhs_short_link(share_text)
 def _app_response_has_core_fields(response_data: Any) -> bool:
@@ -609,17 +588,19 @@ def _extract_xhs_metadata(
     if not cover_image and selected_image_urls:
         cover_image = selected_image_urls[0]
-    video_down_url = _pick_text_from_paths(
-        payload,
-        [
+    video_down_url_candidates = [
+        _pick_text_from_paths(payload, [path])
+        for path in [
             ["video_down_url"],
             ["original_video_url"],
             ["video_url"],
             ["play_url"],
             ["master_url"],
             ["selected_video_url"],
-        ],
-    )
+        ]
+    ]
+    filtered_video_down_urls = filter_video_urls(video_down_url_candidates)
+    video_down_url = filtered_video_down_urls[0] if filtered_video_down_urls else ""
     if not video_down_url:
         video_down_url = normalize_text(selected_video_url)
@@ -1091,24 +1072,9 @@ def _url_likely_image(url: str) -> bool:
 def _url_likely_video(url: str) -> bool:
-    lower = url.lower()
-    video_tokens = [
-        ".mp4",
-        ".m3u8",
-        ".m4a",
-        ".mp3",
-        "video",
-        "play",
-        "stream",
-        "master",
-        "sns-video",
-        "redvideo",
-        "vod",
-        "/audio/",
-    ]
     if _url_likely_image(url):
         return False
-    return any(token in lower for token in video_tokens)
+    return is_probable_video_url(url)
 def _video_quality_hint(url: str) -> int:
@@ -1174,7 +1140,7 @@ def _extract_video_candidates(payload: Any) -> List[str]:
             unique.append(url)
             seen.add(url)
-    video_only = [u for u in unique if _url_likely_video(u)]
+    video_only = filter_video_urls([u for u in unique if _url_likely_video(u)])
     if not video_only:
         return []
@@ -1299,10 +1265,7 @@ def _detect_note_content_type(payload: Any, video_candidates: List[str], image_c
     if "image" in note_type_value:
         return "image"
-    note_sound_url = normalize_text(deep_find_first(payload, ["note_sound_info", "url"])).lower()
-    has_note_audio = bool(note_sound_url and any(token in note_sound_url for token in [".m4a", ".mp3", "/audio/"]))
-    has_video = bool(video_candidates) or has_note_audio
+    has_video = bool(video_candidates)
     has_image = bool(image_candidates)
     if has_video and has_image:
         return "mixed"
@@ -1494,7 +1457,11 @@ def run_xiaohongshu_extract(
     workflow_started_at = time.perf_counter()
     timings = _empty_timings()
     parse_started_at = time.perf_counter()
-    source_input = _normalize_input(input_value, share_text, note_id)
+    preflight = normalize_xhs_note_input(input_value, share_text, note_id)
+    source_input = {
+        "share_text": normalize_text(preflight.get("share_text")) or None,
+        "note_id": normalize_text(preflight.get("note_id")) or None,
+    }
     timings["url_parse_ms"] = _elapsed_ms(parse_started_at)
     if progress is not None:
         progress.started(
@@ -1503,13 +1470,72 @@ def run_xiaohongshu_extract(
             data={"analysis_mode": analysis_mode, "write_card": bool(write_card), "persist_output": bool(persist_output)},
         )
     metadata_fields: Dict[str, Any] = {}
+    preflight_trace = [
+        {
+            "step": "input.preflight",
+            "ok": preflight.get("error_reason") is None,
+            "input_kind": "share_text_or_note_id",
+            "normalized_share_text": source_input.get("share_text"),
+            "normalized_note_id": source_input.get("note_id"),
+            "error_reason": preflight.get("error_reason"),
+            "missing_fields": list(preflight.get("missing_fields") or []),
+        }
+    ]
+    if preflight.get("error_reason"):
+        result = _build_result(
+            source_input=source_input,
+            raw_content="",
+            confidence="low",
+            error_reason=str(preflight.get("error_reason") or "invalid_note_id"),
+            extract_trace=preflight_trace,
+            fallback_trace=[],
+            request_id=None,
+            text_source="none",
+            note_id=None,
+            subtitle_hit=False,
+            u2_task_id=None,
+            u2_task_status="UNKNOWN",
+            note_content_type="unknown",
+            analysis_mode=analysis_mode,
+            selected_video_url=None,
+            selected_video_candidates=[],
+            selected_image_urls=[],
+            downloaded_assets=[],
+            missing_fields=list(preflight.get("missing_fields") or []),
+            metadata_fields=metadata_fields,
+            timings=timings,
+        )
+        if write_card:
+            card_started_at = time.perf_counter()
+            result["card_write"] = write_work_fact_card(
+                payload=result,
+                platform="xiaohongshu",
+                card_type=card_type,
+                card_root=card_root,
+                content_kind="note",
+                storage_config=storage_config,
+                analysis_mode=analysis_mode,
+                progress=progress.child(scope="card_write") if progress is not None else None,
+            )
+            timings["card_write_ms"] = _elapsed_ms(card_started_at)
+            timings["llm_analysis_ms"] = _to_int_or_none((result.get("card_write") or {}).get("llm_analysis_ms")) or 0
+        timings["total_ms"] = _elapsed_ms(workflow_started_at)
+        result["timings"] = dict(timings)
+        _update_pipeline_status(result)
+        return _finalize_result(
+            result=result,
+            source_input=source_input,
+            note_id=None,
+            storage_config=storage_config,
+            persist_output=persist_output,
+        )
     if not source_input["share_text"] and not source_input["note_id"]:
         result = _build_result(
             source_input=source_input,
             raw_content="",
             confidence="low",
             error_reason="missing_share_text_or_note_id",
-            extract_trace=[],
+            extract_trace=preflight_trace,
             fallback_trace=[],
             request_id=None,
             text_source="none",

package/skills/social-media-crawl/scripts/writers/write_work_fact_card.py CHANGED Viewed

@@ -215,10 +215,10 @@ def _resolve_primary_text(payload: Dict[str, Any], caption_raw: str) -> Dict[str
     subtitle_raw = _safe_text(payload.get("subtitle_raw"))
     asr_clean = _safe_text(payload.get("asr_clean"))
     asr_raw = _safe_text(payload.get("asr_raw"))
-    if subtitle_raw:
-        return {"primary_text": subtitle_raw, "primary_text_source": "subtitle_raw"}
     if asr_clean:
         return {"primary_text": asr_clean, "primary_text_source": "asr_clean"}
+    if subtitle_raw:
+        return {"primary_text": subtitle_raw, "primary_text_source": "subtitle_raw"}
     if asr_raw:
         return {"primary_text": asr_raw, "primary_text_source": "asr_raw"}
     if caption_raw:
@@ -356,7 +356,9 @@ def _markdown_lines(card: Dict[str, Any]) -> List[str]:
     lines.extend(["", "## 主文本", primary_text or ""])
     if caption_raw and caption_raw != primary_text:
         lines.extend(["", "## 原始文案", caption_raw])
-    if subtitle_raw and subtitle_raw != primary_text:
+    if asr_raw and subtitle_raw and asr_raw == subtitle_raw and asr_raw != primary_text:
+        lines.extend(["", "## 原始转写", asr_raw])
+    elif subtitle_raw and subtitle_raw != primary_text:
         lines.extend(["", "## 原始字幕", subtitle_raw])
     if asr_raw and asr_raw not in {primary_text, subtitle_raw}:
         lines.extend(["", "## 原始转写", asr_raw])