@tikomni/skills 1.0.3 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,10 +3,16 @@
3
3
 
4
4
  from __future__ import annotations
5
5
 
6
+ import os
6
7
  from typing import Any, Dict, List, Optional, Tuple
7
8
  from urllib.parse import parse_qs, urlparse
8
9
 
9
- from scripts.core.extract_pipeline import build_api_trace
10
+ from scripts.core.extract_pipeline import (
11
+ build_api_trace,
12
+ build_attempted_route,
13
+ build_route_plan_entry,
14
+ build_stage_status,
15
+ )
10
16
  from scripts.core.progress_report import ProgressReporter
11
17
  from scripts.core.tikomni_common import call_json_api, deep_find_all, deep_find_first
12
18
  from scripts.pipelines.input_contracts import extract_douyin_sec_uid, extract_xhs_user_id, looks_like_xhs_user_id
@@ -301,6 +307,34 @@ def _preview(value: Any, max_len: int = 160) -> str:
301
307
  return text[:max_len]
302
308
 
303
309
 
310
+ def _response_failure_reason(response: Dict[str, Any]) -> str:
311
+ if response.get("timeout_retry_exhausted"):
312
+ return "primary_timeout_retry_exhausted"
313
+ if response.get("error_reason"):
314
+ return "primary_non_timeout_failure"
315
+ return "primary_unknown_failure"
316
+
317
+
318
def _build_unavailable_attempt(
    *,
    route_label: str,
    endpoint: str,
    reason: str,
    extra: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Build the attempted-route record for a route that was skipped.

    Thin wrapper over ``build_attempted_route`` that fixes the fields shared
    by every "parameter unavailable" skip, so call sites only supply the
    route identity and the reason.

    Args:
        route_label: Label of the skipped route.
        endpoint: Endpoint path of the skipped route.
        reason: Why the route could not be called; reported as both the
            ``fallback_reason`` and the ``param_reason``.
        extra: Optional additional data forwarded unchanged.

    Returns:
        Whatever ``build_attempted_route`` returns for a non-accepted,
        skipped attempt.
    """
    return build_attempted_route(
        route_label=route_label,
        endpoint=endpoint,
        accepted=False,
        accept_reason="skipped_param_unavailable",
        fallback_reason=reason,
        param_readiness="unavailable",
        param_reason=reason,
        skipped=True,
        extra=extra,
    )
336
+
337
+
304
338
  def _pick_request_id(responses: List[Optional[Dict[str, Any]]], trace: Optional[List[Dict[str, Any]]] = None) -> Optional[str]:
305
339
  for response in responses:
306
340
  if isinstance(response, dict) and response.get("request_id"):
@@ -404,20 +438,12 @@ def _xhs_posts_field_completeness(payload: Any) -> Dict[str, Any]:
404
438
  return _build_field_completeness(fields, core_keys=["items", "platform_work_id", "title_or_caption", "published_date"])
405
439
 
406
440
 
407
- def _xhs_route_failure_reason(response: Dict[str, Any]) -> str:
408
- if response.get("timeout_retry_exhausted"):
409
- return "primary_timeout_retry_exhausted"
410
- if response.get("error_reason"):
411
- return "primary_non_timeout_failure"
412
- return "primary_unknown_failure"
413
-
414
-
415
441
  def _xhs_profile_accept_decision(response: Dict[str, Any], completeness: Dict[str, Any]) -> Dict[str, Any]:
416
442
  if not response.get("ok"):
417
443
  return {
418
444
  "accepted": False,
419
445
  "accept_reason": "response_not_ok",
420
- "fallback_reason": _xhs_route_failure_reason(response),
446
+ "fallback_reason": _response_failure_reason(response),
421
447
  }
422
448
 
423
449
  missing_core = list(completeness.get("missing_core") or [])
@@ -449,7 +475,7 @@ def _xhs_posts_accept_decision(response: Dict[str, Any], completeness: Dict[str,
449
475
  return {
450
476
  "accepted": False,
451
477
  "accept_reason": "response_not_ok",
452
- "fallback_reason": _xhs_route_failure_reason(response),
478
+ "fallback_reason": _response_failure_reason(response),
453
479
  }
454
480
 
455
481
  missing_core = list(completeness.get("missing_core") or [])
@@ -480,18 +506,287 @@ def _xhs_posts_accept_decision(response: Dict[str, Any], completeness: Dict[str,
480
506
  }
481
507
 
482
508
 
483
- def _xhs_route_plan(kind: str) -> List[Tuple[str, str, str]]:
509
def _douyin_profile_field_completeness(payload: Any, resolved_author_id: str) -> Dict[str, Any]:
    """Report which profile fields are present in a Douyin profile payload.

    Args:
        payload: The ``data`` portion of a profile response.
        resolved_author_id: Author id already known to the caller; when
            truthy it satisfies ``platform_author_id`` even if the payload
            carries no id.

    Returns:
        The result of ``_build_field_completeness`` over the presence flags,
        with ``platform_author_id`` and ``nickname`` as the core keys.
    """
    avatar_keys = ["avatar_larger", "avatar_thumb", "avatar_url", "avatar", "images"]
    fields = {
        "platform_author_id": bool(
            _pick_text(payload, ["sec_user_id", "sec_uid", "uid", "user_id", "id"]) or resolved_author_id
        ),
        "nickname": bool(_pick_text(payload, ["nickname", "name"])),
        "avatar_url": bool(_extract_first_url(_first_url_candidate(payload, avatar_keys))),
        # Counters count as "present" only when strictly positive; a missing
        # counter and a zero counter are both reported as absent.
        "fans_count": _pick_int(payload, ["follower_count", "fans_count", "mplatform_followers_count"], default=0) > 0,
        "works_count": _pick_int(payload, ["aweme_count", "works_count", "video_count"], default=0) > 0,
        "unique_id": bool(_pick_text(payload, ["unique_id", "short_id", "douyin_id", "display_id"])),
    }
    return _build_field_completeness(fields, core_keys=["platform_author_id", "nickname"])
519
+
520
+
521
def _douyin_posts_field_completeness(payload: Any) -> Dict[str, Any]:
    """Report which contract fields are present in a Douyin posts page.

    Item-level fields are checked on the single item returned by
    ``_pick_first_mapping`` over the extracted page items, not on every item.

    Args:
        payload: The ``data`` portion of a posts-page response.

    Returns:
        The result of ``_build_field_completeness`` over the presence flags,
        with ``items``, ``platform_work_id``, ``title_or_caption`` and
        ``published_date`` as the core keys.
    """
    page_items = _extract_douyin_posts_items(payload)
    first_item = _pick_first_mapping(page_items)
    has_more_flag = _extract_douyin_posts_has_more(payload) is not None
    cursor_hit = _extract_douyin_posts_next_cursor(payload) is not None
    cover_hit = bool(
        _extract_first_url(
            _first_url_candidate(first_item, ["cover_url", "cover", "origin_cover", "image", "images"])
        )
    )
    # A work id is accepted as a stand-in for a share link.
    share_or_source = bool(_pick_text(first_item, ["share_url", "share_link", "url"])) or bool(
        _pick_text(first_item, ["aweme_id", "item_id", "id"])
    )
    # default=-1 distinguishes "counter absent" from a legitimate zero count.
    interaction_values = [
        _pick_int(first_item, [counter_key], default=-1)
        for counter_key in ("digg_count", "comment_count", "collect_count", "share_count", "play_count")
    ]
    fields = {
        "items": len(page_items) > 0,
        "platform_work_id": bool(_pick_text(first_item, ["aweme_id", "item_id", "id"])),
        "title_or_caption": bool(_pick_text(first_item, ["title", "caption", "desc"])),
        "published_date": bool(_pick_text(first_item, ["create_time", "publish_time"])),
        "base_link_fields": cover_hit or share_or_source,
        "interaction_fields": any(value >= 0 for value in interaction_values),
        "cursor": cursor_hit,
        "has_more_flag": has_more_flag,
        "response_shape": len(page_items) > 0 or cursor_hit or has_more_flag,
    }
    return _build_field_completeness(
        fields,
        core_keys=["items", "platform_work_id", "title_or_caption", "published_date"],
    )
547
+
548
+
549
+ def _douyin_profile_accept_decision(response: Dict[str, Any], completeness: Dict[str, Any]) -> Dict[str, Any]:
550
+ if not response.get("ok"):
551
+ return {
552
+ "accepted": False,
553
+ "accept_reason": "response_not_ok",
554
+ "fallback_reason": _response_failure_reason(response),
555
+ }
556
+ missing_core = list(completeness.get("missing_core") or [])
557
+ if missing_core:
558
+ return {
559
+ "accepted": False,
560
+ "accept_reason": "profile_missing_core_fields",
561
+ "fallback_reason": f"profile_missing_core:{','.join(missing_core)}",
562
+ }
563
+ fields = completeness.get("fields") if isinstance(completeness.get("fields"), dict) else {}
564
+ optional_missing = [
565
+ field_name
566
+ for field_name in ("avatar_url", "fans_count", "works_count", "unique_id")
567
+ if not fields.get(field_name)
568
+ ]
569
+ accept_reason = "profile_core_fields_ready"
570
+ if optional_missing:
571
+ accept_reason = f"profile_core_fields_ready_optional_missing:{','.join(optional_missing)}"
572
+ return {
573
+ "accepted": True,
574
+ "accept_reason": accept_reason,
575
+ "fallback_reason": "",
576
+ }
577
+
578
+
579
+ def _douyin_posts_accept_decision(response: Dict[str, Any], completeness: Dict[str, Any]) -> Dict[str, Any]:
580
+ if not response.get("ok"):
581
+ return {
582
+ "accepted": False,
583
+ "accept_reason": "response_not_ok",
584
+ "fallback_reason": _response_failure_reason(response),
585
+ }
586
+ missing_core = list(completeness.get("missing_core") or [])
587
+ if missing_core:
588
+ return {
589
+ "accepted": False,
590
+ "accept_reason": "posts_missing_core_fields",
591
+ "fallback_reason": f"posts_missing_core:{','.join(missing_core)}",
592
+ }
593
+ fields = completeness.get("fields") if isinstance(completeness.get("fields"), dict) else {}
594
+ if not fields.get("base_link_fields"):
595
+ return {
596
+ "accepted": False,
597
+ "accept_reason": "posts_missing_base_link_fields",
598
+ "fallback_reason": "posts_missing_base_link_fields",
599
+ }
600
+ return {
601
+ "accepted": True,
602
+ "accept_reason": "posts_contract_fields_ready",
603
+ "fallback_reason": "",
604
+ }
605
+
606
+
607
+ def _douyin_profile_route_plan(*, sec_user_id: str, unique_id: str, uid: str) -> List[Dict[str, Any]]:
608
+ sec_ready = bool(sec_user_id)
609
+ unique_ready = bool(unique_id)
610
+ uid_ready = bool(uid)
611
+ return [
612
+ {
613
+ "step_name": "douyin.profile.app_v3",
614
+ "path": "/api/u1/v1/douyin/app/v3/handler_user_profile",
615
+ "route_label": "app_v3",
616
+ "params": {"sec_user_id": sec_user_id or None},
617
+ "param_readiness": "ready" if sec_ready else "unavailable",
618
+ "param_reason": "" if sec_ready else "missing_sec_user_id",
619
+ },
620
+ {
621
+ "step_name": "douyin.profile.web_v4",
622
+ "path": "/api/u1/v1/douyin/web/handler_user_profile_v4",
623
+ "route_label": "web_v4",
624
+ "params": {"sec_user_id": sec_user_id or None},
625
+ "param_readiness": "ready" if sec_ready else "unavailable",
626
+ "param_reason": "" if sec_ready else "missing_sec_user_id",
627
+ },
628
+ {
629
+ "step_name": "douyin.profile.web",
630
+ "path": "/api/u1/v1/douyin/web/handler_user_profile",
631
+ "route_label": "web",
632
+ "params": {"sec_user_id": sec_user_id or None},
633
+ "param_readiness": "ready" if sec_ready else "unavailable",
634
+ "param_reason": "" if sec_ready else "missing_sec_user_id",
635
+ },
636
+ {
637
+ "step_name": "douyin.profile.web_v2",
638
+ "path": "/api/u1/v1/douyin/web/handler_user_profile_v2",
639
+ "route_label": "web_v2",
640
+ "params": {"unique_id": unique_id or None},
641
+ "param_readiness": "ready" if unique_ready else "unavailable",
642
+ "param_reason": "" if unique_ready else "missing_unique_id",
643
+ },
644
+ {
645
+ "step_name": "douyin.profile.web_v3",
646
+ "path": "/api/u1/v1/douyin/web/handler_user_profile_v3",
647
+ "route_label": "web_v3",
648
+ "params": {"uid": uid or None},
649
+ "param_readiness": "ready" if uid_ready else "unavailable",
650
+ "param_reason": "" if uid_ready else "missing_uid",
651
+ },
652
+ ]
653
+
654
+
655
+ def _douyin_posts_route_plan(*, sec_user_id: str, cursor: int, count: int, cookie: str) -> List[Dict[str, Any]]:
656
+ sec_ready = bool(sec_user_id)
657
+ web_ready = sec_ready and bool(cookie)
658
+ web_reason = ""
659
+ if not sec_ready:
660
+ web_reason = "missing_sec_user_id"
661
+ elif not cookie:
662
+ web_reason = "fallback_requires_cookie"
663
+ return [
664
+ {
665
+ "step_name": "douyin.posts.app_v3",
666
+ "path": "/api/u1/v1/douyin/app/v3/fetch_user_post_videos",
667
+ "route_label": "app_v3",
668
+ "params": {
669
+ "sec_user_id": sec_user_id or None,
670
+ "count": count,
671
+ "max_cursor": cursor,
672
+ "sort_type": 0,
673
+ },
674
+ "param_readiness": "ready" if sec_ready else "unavailable",
675
+ "param_reason": "" if sec_ready else "missing_sec_user_id",
676
+ },
677
+ {
678
+ "step_name": "douyin.posts.web",
679
+ "path": "/api/u1/v1/douyin/web/fetch_user_post_videos",
680
+ "route_label": "web",
681
+ "params": {
682
+ "sec_user_id": sec_user_id or None,
683
+ "count": count,
684
+ "max_cursor": cursor,
685
+ "filter_type": 0,
686
+ "cookie": cookie or None,
687
+ },
688
+ "param_readiness": "ready" if web_ready else "unavailable",
689
+ "param_reason": web_reason,
690
+ },
691
+ ]
692
+
693
+
694
+ def _xhs_route_plan(kind: str, *, user_id: str, input_value: str, cursor: Any = "") -> List[Dict[str, Any]]:
695
+ user_ready = bool(user_id)
484
696
  if kind == "profile":
485
697
  return [
486
- ("xhs.profile.app_v2", "/api/u1/v1/xiaohongshu/app_v2/get_user_info", "app_v2"),
487
- ("xhs.profile.app", "/api/u1/v1/xiaohongshu/app/get_user_info", "app"),
488
- ("xhs.profile.web_v2", "/api/u1/v1/xiaohongshu/web_v2/fetch_user_info_app", "web_v2"),
698
+ {
699
+ "step_name": "xhs.profile.app_v2",
700
+ "path": "/api/u1/v1/xiaohongshu/app_v2/get_user_info",
701
+ "route_label": "app_v2",
702
+ "params": {"user_id": user_id or None, "share_text": input_value or None},
703
+ "param_readiness": "ready" if user_ready else "unavailable",
704
+ "param_reason": "" if user_ready else "missing_user_id",
705
+ },
706
+ {
707
+ "step_name": "xhs.profile.app",
708
+ "path": "/api/u1/v1/xiaohongshu/app/get_user_info",
709
+ "route_label": "app",
710
+ "params": {"user_id": user_id or None},
711
+ "param_readiness": "ready" if user_ready else "unavailable",
712
+ "param_reason": "" if user_ready else "missing_user_id",
713
+ },
714
+ {
715
+ "step_name": "xhs.profile.web_v2_app",
716
+ "path": "/api/u1/v1/xiaohongshu/web_v2/fetch_user_info_app",
717
+ "route_label": "web_v2_app",
718
+ "params": {"user_id": user_id or None},
719
+ "param_readiness": "ready" if user_ready else "unavailable",
720
+ "param_reason": "" if user_ready else "missing_user_id",
721
+ },
722
+ {
723
+ "step_name": "xhs.profile.web_v2",
724
+ "path": "/api/u1/v1/xiaohongshu/web_v2/fetch_user_info",
725
+ "route_label": "web_v2",
726
+ "params": {"user_id": user_id or None},
727
+ "param_readiness": "ready" if user_ready else "unavailable",
728
+ "param_reason": "" if user_ready else "missing_user_id",
729
+ },
730
+ {
731
+ "step_name": "xhs.profile.web_v1_v2",
732
+ "path": "/api/u1/v1/xiaohongshu/web/get_user_info_v2",
733
+ "route_label": "web_v1_v2",
734
+ "params": {"user_id": user_id or None, "share_text": input_value or None},
735
+ "param_readiness": "ready" if user_ready else "unavailable",
736
+ "param_reason": "" if user_ready else "missing_user_id",
737
+ },
738
+ {
739
+ "step_name": "xhs.profile.web_v1",
740
+ "path": "/api/u1/v1/xiaohongshu/web/get_user_info",
741
+ "route_label": "web_v1",
742
+ "params": {"user_id": user_id or None},
743
+ "param_readiness": "ready" if user_ready else "unavailable",
744
+ "param_reason": "" if user_ready else "missing_user_id",
745
+ },
489
746
  ]
490
747
  if kind == "posts":
748
+ last_cursor = _to_text(cursor)
491
749
  return [
492
- ("xhs.posts.app_v2", "/api/u1/v1/xiaohongshu/app_v2/get_user_posted_notes", "app_v2"),
493
- ("xhs.posts.app", "/api/u1/v1/xiaohongshu/app/get_user_notes", "app"),
494
- ("xhs.posts.web_v2", "/api/u1/v1/xiaohongshu/web_v2/fetch_home_notes_app", "web_v2"),
750
+ {
751
+ "step_name": "xhs.posts.app_v2",
752
+ "path": "/api/u1/v1/xiaohongshu/app_v2/get_user_posted_notes",
753
+ "route_label": "app_v2",
754
+ "params": {"user_id": user_id or None, "share_text": input_value or None, "cursor": last_cursor or None},
755
+ "param_readiness": "ready" if user_ready else "unavailable",
756
+ "param_reason": "" if user_ready else "missing_user_id",
757
+ },
758
+ {
759
+ "step_name": "xhs.posts.app",
760
+ "path": "/api/u1/v1/xiaohongshu/app/get_user_notes",
761
+ "route_label": "app",
762
+ "params": {"user_id": user_id or None, "cursor": last_cursor or None},
763
+ "param_readiness": "ready" if user_ready else "unavailable",
764
+ "param_reason": "" if user_ready else "missing_user_id",
765
+ },
766
+ {
767
+ "step_name": "xhs.posts.web_v2_app",
768
+ "path": "/api/u1/v1/xiaohongshu/web_v2/fetch_home_notes_app",
769
+ "route_label": "web_v2_app",
770
+ "params": {"user_id": user_id or None, "cursor": last_cursor or None},
771
+ "param_readiness": "ready" if user_ready else "unavailable",
772
+ "param_reason": "" if user_ready else "missing_user_id",
773
+ },
774
+ {
775
+ "step_name": "xhs.posts.web_v2",
776
+ "path": "/api/u1/v1/xiaohongshu/web_v2/fetch_home_notes",
777
+ "route_label": "web_v2",
778
+ "params": {"user_id": user_id or None, "cursor": last_cursor or None},
779
+ "param_readiness": "ready" if user_ready else "unavailable",
780
+ "param_reason": "" if user_ready else "missing_user_id",
781
+ },
782
+ {
783
+ "step_name": "xhs.posts.web_v1_v2",
784
+ "path": "/api/u1/v1/xiaohongshu/web/get_user_notes_v2",
785
+ "route_label": "web_v1_v2",
786
+ "params": {"user_id": user_id or None, "lastCursor": last_cursor or None},
787
+ "param_readiness": "ready" if user_ready else "unavailable",
788
+ "param_reason": "" if user_ready else "missing_user_id",
789
+ },
495
790
  ]
496
791
  raise ValueError(f"unsupported_xhs_route_kind:{kind}")
497
792
 
@@ -567,25 +862,79 @@ def collect_douyin_author_home_raw(
567
862
  progress: Optional[ProgressReporter] = None,
568
863
  ) -> Dict[str, Any]:
569
864
  trace: List[Dict[str, Any]] = []
865
+ stage_status: Dict[str, Any] = {}
570
866
  if progress is not None:
571
867
  progress.started(stage="author_home.collect", message="collecting douyin author homepage")
572
868
  sec_user_id = _guess_douyin_sec_user_id(input_value)
573
869
  resolve_resp: Optional[Dict[str, Any]] = None
574
870
  request_id_candidates: List[Optional[Dict[str, Any]]] = []
575
-
576
- if not sec_user_id:
871
+ page_limit = min(max(page_size, 1), 20)
872
+ max_pages = max(pages_max, 1)
873
+ resolver_route_plan = [
874
+ build_route_plan_entry(route_label="local_extract", endpoint=None, method="LOCAL"),
875
+ build_route_plan_entry(
876
+ route_label="web",
877
+ endpoint="/api/u1/v1/douyin/web/get_sec_user_id",
878
+ method="GET",
879
+ ),
880
+ ]
881
+ resolver_attempts: List[Dict[str, Any]] = []
882
+
883
+ if sec_user_id:
884
+ resolver_attempts.append(
885
+ build_attempted_route(
886
+ route_label="local_extract",
887
+ endpoint=None,
888
+ accepted=True,
889
+ accept_reason="author_id_ready",
890
+ param_readiness="ready",
891
+ extra={"resolved_author_id": sec_user_id},
892
+ )
893
+ )
894
+ trace.append(
895
+ {
896
+ "step": "douyin.resolve_sec_user_id.local",
897
+ "route_label": "local_extract",
898
+ "ok": True,
899
+ "resolved_author_id": sec_user_id,
900
+ "accept_reason": "author_id_ready",
901
+ }
902
+ )
903
+ stage_status["resolver"] = build_stage_status(
904
+ stage="resolver",
905
+ status="succeeded",
906
+ route_plan=resolver_route_plan,
907
+ attempted_routes=resolver_attempts,
908
+ chosen_route="local_extract",
909
+ accept_reason="author_id_ready",
910
+ fallback_reason="",
911
+ error_reason=None,
912
+ all_routes_failed=False,
913
+ )
914
+ else:
577
915
  resolve_resp = call_json_api(
578
916
  base_url=base_url,
579
917
  path="/api/u1/v1/douyin/web/get_sec_user_id",
580
918
  token=token,
581
919
  method="GET",
582
920
  timeout_ms=timeout_ms,
583
- params={"url": input_value, "share_url": input_value},
921
+ params={"url": input_value},
584
922
  )
585
923
  trace.append(build_api_trace(step="douyin.resolve_sec_user_id", endpoint="/api/u1/v1/douyin/web/get_sec_user_id", response=resolve_resp))
586
924
  request_id_candidates.append(resolve_resp)
587
925
  resolve_data = resolve_resp.get("data")
588
926
  sec_user_id = _extract_douyin_sec_user_id(resolve_data)
927
+ resolver_attempts.append(
928
+ build_attempted_route(
929
+ route_label="web",
930
+ endpoint="/api/u1/v1/douyin/web/get_sec_user_id",
931
+ response=resolve_resp,
932
+ accepted=bool(sec_user_id),
933
+ accept_reason="author_id_ready" if sec_user_id else "author_id_unresolved",
934
+ fallback_reason="" if sec_user_id else "resolver_fallback_unavailable",
935
+ extra={"resolved_author_id": sec_user_id},
936
+ )
937
+ )
589
938
  if not sec_user_id:
590
939
  trace.append(
591
940
  {
@@ -601,17 +950,226 @@ def collect_douyin_author_home_raw(
601
950
  "input_hint": _preview(input_value, max_len=120),
602
951
  }
603
952
  )
953
+ stage_status["resolver"] = build_stage_status(
954
+ stage="resolver",
955
+ status="succeeded" if sec_user_id else "failed",
956
+ route_plan=resolver_route_plan,
957
+ attempted_routes=resolver_attempts,
958
+ chosen_route="web",
959
+ accept_reason="author_id_ready" if sec_user_id else "author_id_unresolved",
960
+ fallback_reason="" if sec_user_id else "resolver_fallback_unavailable",
961
+ error_reason=None if sec_user_id else "author_id_unresolved",
962
+ all_routes_failed=not bool(sec_user_id),
963
+ )
604
964
 
605
- profile_resp = call_json_api(
606
- base_url=base_url,
607
- path="/api/u1/v1/douyin/app/v3/handler_user_profile",
608
- token=token,
609
- method="GET",
610
- timeout_ms=timeout_ms,
611
- params={"sec_user_id": sec_user_id},
965
+ trace.append(
966
+ {
967
+ "step": "douyin.resolver.route_decision",
968
+ **stage_status["resolver"],
969
+ }
612
970
  )
613
- trace.append(build_api_trace(step="douyin.profile", endpoint="/api/u1/v1/douyin/app/v3/handler_user_profile", response=profile_resp))
614
- request_id_candidates.append(profile_resp)
971
+
972
+ web_cookie = os.getenv("TIKOMNI_DOUYIN_WEB_COOKIE", "").strip()
973
+ if not sec_user_id:
974
+ stage_status["profile"] = build_stage_status(
975
+ stage="profile",
976
+ status="skipped",
977
+ route_plan=[
978
+ build_route_plan_entry(
979
+ route_label=route["route_label"],
980
+ endpoint=route["path"],
981
+ method="GET",
982
+ param_readiness=route["param_readiness"],
983
+ param_reason=route["param_reason"],
984
+ )
985
+ for route in _douyin_profile_route_plan(sec_user_id="", unique_id="", uid="")
986
+ ],
987
+ attempted_routes=[],
988
+ chosen_route="",
989
+ accept_reason="",
990
+ fallback_reason="author_id_unresolved",
991
+ error_reason="author_id_unresolved",
992
+ all_routes_failed=False,
993
+ )
994
+ stage_status["posts"] = build_stage_status(
995
+ stage="posts",
996
+ status="skipped",
997
+ route_plan=[
998
+ build_route_plan_entry(
999
+ route_label=route["route_label"],
1000
+ endpoint=route["path"],
1001
+ method="GET",
1002
+ param_readiness=route["param_readiness"],
1003
+ param_reason=route["param_reason"],
1004
+ )
1005
+ for route in _douyin_posts_route_plan(sec_user_id="", cursor=0, count=page_limit, cookie=web_cookie)
1006
+ ],
1007
+ attempted_routes=[],
1008
+ chosen_route="",
1009
+ accept_reason="",
1010
+ fallback_reason="author_id_unresolved",
1011
+ error_reason="author_id_unresolved",
1012
+ all_routes_failed=False,
1013
+ )
1014
+ trace.append({"step": "douyin.profile.route_decision", **stage_status["profile"]})
1015
+ trace.append({"step": "douyin.posts.stage_decision", **stage_status["posts"]})
1016
+ request_id = _pick_request_id(request_id_candidates, trace)
1017
+ if progress is not None:
1018
+ progress.done(
1019
+ stage="author_home.collect",
1020
+ message="douyin author homepage collected",
1021
+ data={"works_count": 0, "pages": 0, "request_id": request_id},
1022
+ )
1023
+ return {
1024
+ "platform": "douyin",
1025
+ "resolved_author_id": "",
1026
+ "profile_response": {},
1027
+ "works": [],
1028
+ "pagination": {
1029
+ "sort": "latest",
1030
+ "sort_type": 0,
1031
+ "cursor_mode": "max_cursor",
1032
+ "pages": [],
1033
+ "total_collected": 0,
1034
+ "max_items": max_items,
1035
+ },
1036
+ "extract_trace": trace,
1037
+ "request_id": request_id,
1038
+ "stage_status": stage_status,
1039
+ "error_reason": "author_id_unresolved",
1040
+ }
1041
+
1042
+ profile_resp: Dict[str, Any] = {}
1043
+ profile_attempts: List[Dict[str, Any]] = []
1044
+ profile_reason: str = ""
1045
+ profile_unique_id = ""
1046
+ profile_uid = ""
1047
+ sec_profile_routes = _douyin_profile_route_plan(sec_user_id=sec_user_id, unique_id="", uid="")[:3]
1048
+ extra_profile_routes: List[Dict[str, Any]] = []
1049
+
1050
+ def _run_douyin_profile_route(route: Dict[str, Any], *, fallback_reason: str) -> Tuple[Dict[str, Any], Dict[str, Any]]:
1051
+ response = call_json_api(
1052
+ base_url=base_url,
1053
+ path=str(route["path"]),
1054
+ token=token,
1055
+ method="GET",
1056
+ timeout_ms=timeout_ms,
1057
+ params=dict(route.get("params") or {}),
1058
+ )
1059
+ response["_endpoint"] = route["path"]
1060
+ response["_route_label"] = route["route_label"]
1061
+ if fallback_reason:
1062
+ response["fallback_trigger_reason"] = fallback_reason
1063
+ response["_field_completeness"] = _douyin_profile_field_completeness(response.get("data"), sec_user_id) if response.get("ok") else {
1064
+ "fields": {},
1065
+ "filled_count": 0,
1066
+ "total_fields": 0,
1067
+ "ratio": 0.0,
1068
+ "missing_core": [],
1069
+ "core_ready": False,
1070
+ }
1071
+ decision = _douyin_profile_accept_decision(response, response.get("_field_completeness") or {})
1072
+ return response, decision
1073
+
1074
+ for route in sec_profile_routes:
1075
+ response, decision = _run_douyin_profile_route(route, fallback_reason=profile_reason)
1076
+ profile_resp = response
1077
+ request_id_candidates.append(response)
1078
+ profile_unique_id = profile_unique_id or _pick_text(response.get("data"), ["unique_id", "short_id", "douyin_id", "display_id"])
1079
+ profile_uid = profile_uid or _pick_text(response.get("data"), ["uid", "user_id", "id"])
1080
+ profile_attempts.append(
1081
+ build_attempted_route(
1082
+ route_label=str(route["route_label"]),
1083
+ endpoint=str(route["path"]),
1084
+ response=response,
1085
+ accepted=bool(decision.get("accepted")),
1086
+ accept_reason=str(decision.get("accept_reason") or ""),
1087
+ fallback_reason=str(decision.get("fallback_reason") or ""),
1088
+ extra={"field_completeness": response.get("_field_completeness")},
1089
+ )
1090
+ )
1091
+ trace.append(
1092
+ build_api_trace(
1093
+ step=str(route["step_name"]),
1094
+ endpoint=str(route["path"]),
1095
+ response=response,
1096
+ extra={
1097
+ "route_label": route["route_label"],
1098
+ "field_completeness": response.get("_field_completeness"),
1099
+ "accept_reason": decision.get("accept_reason"),
1100
+ "route_accepted": bool(decision.get("accepted")),
1101
+ },
1102
+ )
1103
+ )
1104
+ if decision.get("accepted"):
1105
+ profile_resp["_accept_reason"] = decision.get("accept_reason")
1106
+ break
1107
+ profile_reason = str(decision.get("fallback_reason") or "field_completeness_below_threshold")
1108
+ profile_resp["fallback_trigger_reason"] = profile_reason
1109
+
1110
+ if not profile_resp.get("_accept_reason"):
1111
+ extra_profile_routes = _douyin_profile_route_plan(
1112
+ sec_user_id=sec_user_id,
1113
+ unique_id=profile_unique_id,
1114
+ uid=profile_uid,
1115
+ )[3:]
1116
+ for route in extra_profile_routes:
1117
+ if route.get("param_readiness") != "ready":
1118
+ profile_attempts.append(
1119
+ _build_unavailable_attempt(
1120
+ route_label=str(route["route_label"]),
1121
+ endpoint=str(route["path"]),
1122
+ reason=str(route.get("param_reason") or "fallback_param_unavailable"),
1123
+ )
1124
+ )
1125
+ continue
1126
+ response, decision = _run_douyin_profile_route(route, fallback_reason=profile_reason)
1127
+ profile_resp = response
1128
+ request_id_candidates.append(response)
1129
+ profile_attempts.append(
1130
+ build_attempted_route(
1131
+ route_label=str(route["route_label"]),
1132
+ endpoint=str(route["path"]),
1133
+ response=response,
1134
+ accepted=bool(decision.get("accepted")),
1135
+ accept_reason=str(decision.get("accept_reason") or ""),
1136
+ fallback_reason=str(decision.get("fallback_reason") or ""),
1137
+ extra={"field_completeness": response.get("_field_completeness")},
1138
+ )
1139
+ )
1140
+ trace.append(
1141
+ build_api_trace(
1142
+ step=str(route["step_name"]),
1143
+ endpoint=str(route["path"]),
1144
+ response=response,
1145
+ extra={
1146
+ "route_label": route["route_label"],
1147
+ "field_completeness": response.get("_field_completeness"),
1148
+ "accept_reason": decision.get("accept_reason"),
1149
+ "route_accepted": bool(decision.get("accepted")),
1150
+ },
1151
+ )
1152
+ )
1153
+ if decision.get("accepted"):
1154
+ profile_resp["_accept_reason"] = decision.get("accept_reason")
1155
+ break
1156
+ profile_reason = str(decision.get("fallback_reason") or "field_completeness_below_threshold")
1157
+ profile_resp["fallback_trigger_reason"] = profile_reason
1158
+ else:
1159
+ extra_profile_routes = _douyin_profile_route_plan(
1160
+ sec_user_id=sec_user_id,
1161
+ unique_id=profile_unique_id,
1162
+ uid=profile_uid,
1163
+ )[3:]
1164
+ for route in extra_profile_routes:
1165
+ if route.get("param_readiness") != "ready":
1166
+ profile_attempts.append(
1167
+ _build_unavailable_attempt(
1168
+ route_label=str(route["route_label"]),
1169
+ endpoint=str(route["path"]),
1170
+ reason=str(route.get("param_reason") or "fallback_param_unavailable"),
1171
+ )
1172
+ )
615
1173
 
616
1174
  profile_author_id = _pick_text(profile_resp.get("data"), ["sec_user_id", "sec_uid", "secUserId", "uid", "user_id"])
617
1175
  resolved_author_id = sec_user_id or profile_author_id
@@ -627,8 +1185,36 @@ def collect_douyin_author_home_raw(
627
1185
  "profile_data_type": type(profile_resp.get("data")).__name__,
628
1186
  "profile_data_keys": list(profile_resp.get("data").keys())[:8] if isinstance(profile_resp.get("data"), dict) else [],
629
1187
  "profile_data_preview": _preview(profile_resp.get("data")),
630
- }
1188
+ }
1189
+ )
1190
+
1191
+ profile_route_plan = [
1192
+ build_route_plan_entry(
1193
+ route_label=str(route["route_label"]),
1194
+ endpoint=str(route["path"]),
1195
+ method="GET",
1196
+ param_readiness=str(route.get("param_readiness") or "ready"),
1197
+ param_reason=str(route.get("param_reason") or ""),
631
1198
  )
1199
+ for route in _douyin_profile_route_plan(sec_user_id=sec_user_id, unique_id=profile_unique_id, uid=profile_uid)
1200
+ ]
1201
+ profile_has_accepted = any(bool(attempt.get("accepted")) for attempt in profile_attempts)
1202
+ profile_has_ok_response = any(bool(attempt.get("ok")) for attempt in profile_attempts if not attempt.get("skipped"))
1203
+ profile_error_reason = None
1204
+ if not profile_has_accepted:
1205
+ profile_error_reason = "profile_contract_incomplete" if profile_has_ok_response else "profile_all_routes_failed"
1206
+ stage_status["profile"] = build_stage_status(
1207
+ stage="profile",
1208
+ status="succeeded" if profile_has_accepted else "failed",
1209
+ route_plan=profile_route_plan,
1210
+ attempted_routes=profile_attempts,
1211
+ chosen_route=str(profile_resp.get("_route_label") or ""),
1212
+ accept_reason=str(profile_resp.get("_accept_reason") or ""),
1213
+ fallback_reason=profile_reason,
1214
+ error_reason=profile_error_reason,
1215
+ all_routes_failed=not profile_has_accepted,
1216
+ )
1217
+ trace.append({"step": "douyin.profile.route_decision", **stage_status["profile"]})
632
1218
 
633
1219
  works: List[Dict[str, Any]] = []
634
1220
  seen_ids = set()
@@ -636,40 +1222,133 @@ def collect_douyin_author_home_raw(
636
1222
  has_more = True
637
1223
  page = 0
638
1224
  pagination_trace: List[Dict[str, Any]] = []
639
-
640
- max_pages = max(pages_max, 1)
641
- page_limit = min(max(page_size, 1), 20)
1225
+ posts_attempts_all: List[Dict[str, Any]] = []
1226
+ posts_accepted_routes: List[str] = []
1227
+ posts_error_reason: Optional[str] = None
642
1228
 
643
1229
  while has_more and page < max_pages and len(works) < max_items:
644
1230
  page += 1
645
- posts_resp = call_json_api(
646
- base_url=base_url,
647
- path="/api/u1/v1/douyin/app/v3/fetch_user_post_videos",
648
- token=token,
649
- method="GET",
650
- timeout_ms=timeout_ms,
651
- params={
652
- "sec_user_id": sec_user_id,
653
- "count": page_limit,
654
- "max_cursor": cursor,
655
- "sort_type": 0,
656
- },
657
- )
658
- trace.append(
659
- build_api_trace(
660
- step="douyin.posts_page",
661
- endpoint="/api/u1/v1/douyin/app/v3/fetch_user_post_videos",
662
- response=posts_resp,
663
- extra={"page": page, "cursor": cursor, "sort_type": 0},
664
- )
665
- )
666
1231
  if progress is not None:
667
1232
  progress.progress(
668
1233
  stage="author_home.collect.pagination",
669
1234
  message="douyin pagination page requested",
670
1235
  data={"page": page, "cursor_in": cursor},
671
1236
  )
672
- request_id_candidates.append(posts_resp)
1237
+ posts_routes = _douyin_posts_route_plan(sec_user_id=sec_user_id, cursor=cursor, count=page_limit, cookie=web_cookie)
1238
+ page_attempts: List[Dict[str, Any]] = []
1239
+ posts_resp: Dict[str, Any] = {}
1240
+ page_reason = ""
1241
+ for route in posts_routes:
1242
+ if route.get("param_readiness") != "ready":
1243
+ skipped_attempt = _build_unavailable_attempt(
1244
+ route_label=str(route["route_label"]),
1245
+ endpoint=str(route["path"]),
1246
+ reason=str(route.get("param_reason") or "fallback_param_unavailable"),
1247
+ extra={"page": page, "cursor_in": cursor},
1248
+ )
1249
+ page_attempts.append(skipped_attempt)
1250
+ posts_attempts_all.append(skipped_attempt)
1251
+ continue
1252
+ posts_resp = call_json_api(
1253
+ base_url=base_url,
1254
+ path=str(route["path"]),
1255
+ token=token,
1256
+ method="GET",
1257
+ timeout_ms=timeout_ms,
1258
+ params=dict(route.get("params") or {}),
1259
+ )
1260
+ posts_resp["_endpoint"] = route["path"]
1261
+ posts_resp["_route_label"] = route["route_label"]
1262
+ if page_reason:
1263
+ posts_resp["fallback_trigger_reason"] = page_reason
1264
+ posts_resp["_field_completeness"] = _douyin_posts_field_completeness(posts_resp.get("data")) if posts_resp.get("ok") else {
1265
+ "fields": {},
1266
+ "filled_count": 0,
1267
+ "total_fields": 0,
1268
+ "ratio": 0.0,
1269
+ "missing_core": [],
1270
+ "core_ready": False,
1271
+ }
1272
+ posts_decision = _douyin_posts_accept_decision(posts_resp, posts_resp.get("_field_completeness") or {})
1273
+ page_attempt = build_attempted_route(
1274
+ route_label=str(route["route_label"]),
1275
+ endpoint=str(route["path"]),
1276
+ response=posts_resp,
1277
+ accepted=bool(posts_decision.get("accepted")),
1278
+ accept_reason=str(posts_decision.get("accept_reason") or ""),
1279
+ fallback_reason=str(posts_decision.get("fallback_reason") or ""),
1280
+ extra={
1281
+ "page": page,
1282
+ "cursor_in": cursor,
1283
+ "field_completeness": posts_resp.get("_field_completeness"),
1284
+ },
1285
+ )
1286
+ page_attempts.append(page_attempt)
1287
+ posts_attempts_all.append(page_attempt)
1288
+ trace.append(
1289
+ build_api_trace(
1290
+ step=str(route["step_name"]),
1291
+ endpoint=str(route["path"]),
1292
+ response=posts_resp,
1293
+ extra={
1294
+ "page": page,
1295
+ "cursor": cursor,
1296
+ "route_label": route["route_label"],
1297
+ "field_completeness": posts_resp.get("_field_completeness"),
1298
+ "accept_reason": posts_decision.get("accept_reason"),
1299
+ "route_accepted": bool(posts_decision.get("accepted")),
1300
+ },
1301
+ )
1302
+ )
1303
+ request_id_candidates.append(posts_resp)
1304
+ if posts_decision.get("accepted"):
1305
+ posts_resp["_accept_reason"] = posts_decision.get("accept_reason")
1306
+ posts_accepted_routes.append(str(route["route_label"]))
1307
+ break
1308
+ page_reason = str(posts_decision.get("fallback_reason") or "field_completeness_below_threshold")
1309
+ posts_resp["fallback_trigger_reason"] = page_reason
1310
+
1311
+ page_route_plan = [
1312
+ build_route_plan_entry(
1313
+ route_label=str(route["route_label"]),
1314
+ endpoint=str(route["path"]),
1315
+ method="GET",
1316
+ param_readiness=str(route.get("param_readiness") or "ready"),
1317
+ param_reason=str(route.get("param_reason") or ""),
1318
+ )
1319
+ for route in posts_routes
1320
+ ]
1321
+ trace.append(
1322
+ {
1323
+ "step": "douyin.posts.route_decision",
1324
+ "page": page,
1325
+ "cursor_in": cursor,
1326
+ "route_plan": page_route_plan,
1327
+ "chosen_route": posts_resp.get("_route_label"),
1328
+ "request_id": posts_resp.get("request_id"),
1329
+ "field_completeness": posts_resp.get("_field_completeness"),
1330
+ "accept_reason": posts_resp.get("_accept_reason"),
1331
+ "fallback_reason": page_reason,
1332
+ "attempted_routes": page_attempts,
1333
+ "all_routes_failed": not any(bool(attempt.get("accepted")) for attempt in page_attempts),
1334
+ }
1335
+ )
1336
+
1337
+ if not posts_resp.get("_accept_reason"):
1338
+ posts_error_reason = "posts_contract_incomplete" if any(bool(attempt.get("ok")) for attempt in page_attempts if not attempt.get("skipped")) else "posts_all_routes_failed"
1339
+ pagination_trace.append(
1340
+ {
1341
+ "page": page,
1342
+ "cursor_in": cursor,
1343
+ "cursor_out": None,
1344
+ "has_more_raw": None,
1345
+ "has_more_normalized": None,
1346
+ "items": 0,
1347
+ "stop_reason": posts_error_reason,
1348
+ }
1349
+ )
1350
+ break
1351
+
673
1352
  response_payload = posts_resp.get("data")
674
1353
  page_items = _extract_douyin_posts_items(response_payload)
675
1354
 
@@ -729,6 +1408,32 @@ def collect_douyin_author_home_raw(
729
1408
  if should_continue and next_cursor is not None:
730
1409
  cursor = next_cursor
731
1410
 
1411
+ posts_route_plan = [
1412
+ build_route_plan_entry(
1413
+ route_label=str(route["route_label"]),
1414
+ endpoint=str(route["path"]),
1415
+ method="GET",
1416
+ param_readiness=str(route.get("param_readiness") or "ready"),
1417
+ param_reason=str(route.get("param_reason") or ""),
1418
+ )
1419
+ for route in _douyin_posts_route_plan(sec_user_id=sec_user_id, cursor=0, count=page_limit, cookie=web_cookie)
1420
+ ]
1421
+ posts_has_accepted = bool(posts_accepted_routes)
1422
+ if not posts_has_accepted and posts_error_reason is None:
1423
+ posts_error_reason = "posts_all_routes_failed"
1424
+ stage_status["posts"] = build_stage_status(
1425
+ stage="posts",
1426
+ status="succeeded" if posts_has_accepted else "failed",
1427
+ route_plan=posts_route_plan,
1428
+ attempted_routes=posts_attempts_all,
1429
+ chosen_route=posts_accepted_routes[0] if len(set(posts_accepted_routes)) == 1 and posts_accepted_routes else ("mixed" if posts_accepted_routes else ""),
1430
+ accept_reason="posts_pages_collected" if posts_has_accepted else "",
1431
+ fallback_reason=posts_error_reason or "",
1432
+ error_reason=None if posts_has_accepted else posts_error_reason,
1433
+ all_routes_failed=not posts_has_accepted,
1434
+ )
1435
+ trace.append({"step": "douyin.posts.stage_decision", **stage_status["posts"]})
1436
+
732
1437
  request_id = _pick_request_id(request_id_candidates, trace)
733
1438
  if progress is not None:
734
1439
  progress.done(
@@ -737,6 +1442,14 @@ def collect_douyin_author_home_raw(
737
1442
  data={"works_count": len(works), "pages": len(pagination_trace), "request_id": request_id},
738
1443
  )
739
1444
 
1445
+ collect_error_reason: Optional[str] = None
1446
+ if stage_status.get("resolver", {}).get("status") == "failed":
1447
+ collect_error_reason = str(stage_status["resolver"].get("error_reason") or "author_id_unresolved")
1448
+ elif not works and stage_status.get("posts", {}).get("status") == "failed":
1449
+ collect_error_reason = stage_status["posts"].get("error_reason")
1450
+ elif stage_status.get("profile", {}).get("status") == "failed":
1451
+ collect_error_reason = stage_status["profile"].get("error_reason")
1452
+
740
1453
  return {
741
1454
  "platform": "douyin",
742
1455
  "resolved_author_id": resolved_author_id,
@@ -752,6 +1465,8 @@ def collect_douyin_author_home_raw(
752
1465
  },
753
1466
  "extract_trace": trace,
754
1467
  "request_id": request_id,
1468
+ "stage_status": stage_status,
1469
+ "error_reason": collect_error_reason,
755
1470
  }
756
1471
 
757
1472
 
@@ -767,20 +1482,63 @@ def collect_xhs_author_home_raw(
767
1482
  progress: Optional[ProgressReporter] = None,
768
1483
  ) -> Dict[str, Any]:
769
1484
  trace: List[Dict[str, Any]] = []
1485
+ stage_status: Dict[str, Any] = {}
770
1486
  if progress is not None:
771
1487
  progress.started(stage="author_home.collect", message="collecting xiaohongshu author homepage")
772
1488
  user_id, xsec_token = _guess_xhs_ids(input_value)
773
1489
  resolve_resp: Optional[Dict[str, Any]] = None
774
1490
  request_id_candidates: List[Optional[Dict[str, Any]]] = []
775
-
776
- if not user_id:
1491
+ max_pages = max(pages_max, 1)
1492
+ page_limit = min(max(page_size, 1), 20)
1493
+ resolver_route_plan = [
1494
+ build_route_plan_entry(route_label="local_extract", endpoint=None, method="LOCAL"),
1495
+ build_route_plan_entry(
1496
+ route_label="app",
1497
+ endpoint="/api/u1/v1/xiaohongshu/app/get_user_id_and_xsec_token",
1498
+ method="GET",
1499
+ ),
1500
+ ]
1501
+ resolver_attempts: List[Dict[str, Any]] = []
1502
+
1503
+ if user_id:
1504
+ resolver_attempts.append(
1505
+ build_attempted_route(
1506
+ route_label="local_extract",
1507
+ endpoint=None,
1508
+ accepted=True,
1509
+ accept_reason="author_id_ready",
1510
+ param_readiness="ready",
1511
+ extra={"resolved_author_id": user_id},
1512
+ )
1513
+ )
1514
+ trace.append(
1515
+ {
1516
+ "step": "xhs.resolve_user_id.local",
1517
+ "route_label": "local_extract",
1518
+ "ok": True,
1519
+ "resolved_author_id": user_id,
1520
+ "accept_reason": "author_id_ready",
1521
+ }
1522
+ )
1523
+ stage_status["resolver"] = build_stage_status(
1524
+ stage="resolver",
1525
+ status="succeeded",
1526
+ route_plan=resolver_route_plan,
1527
+ attempted_routes=resolver_attempts,
1528
+ chosen_route="local_extract",
1529
+ accept_reason="author_id_ready",
1530
+ fallback_reason="",
1531
+ error_reason=None,
1532
+ all_routes_failed=False,
1533
+ )
1534
+ else:
777
1535
  resolve_resp = call_json_api(
778
1536
  base_url=base_url,
779
1537
  path="/api/u1/v1/xiaohongshu/app/get_user_id_and_xsec_token",
780
1538
  token=token,
781
1539
  method="GET",
782
1540
  timeout_ms=timeout_ms,
783
- params={"share_link": input_value, "share_url": input_value, "url": input_value},
1541
+ params={"share_link": input_value},
784
1542
  )
785
1543
  trace.append(
786
1544
  build_api_trace(
@@ -794,41 +1552,154 @@ def collect_xhs_author_home_raw(
794
1552
  user_id = _pick_text(data, ["user_id", "userid", "uid"])
795
1553
  if not xsec_token:
796
1554
  xsec_token = _pick_text(data, ["xsec_token", "xsecToken"])
1555
+ resolver_attempts.append(
1556
+ build_attempted_route(
1557
+ route_label="app",
1558
+ endpoint="/api/u1/v1/xiaohongshu/app/get_user_id_and_xsec_token",
1559
+ response=resolve_resp,
1560
+ accepted=bool(user_id),
1561
+ accept_reason="author_id_ready" if user_id else "author_id_unresolved",
1562
+ fallback_reason="" if user_id else "resolver_fallback_unavailable",
1563
+ extra={"resolved_author_id": user_id},
1564
+ )
1565
+ )
1566
+ stage_status["resolver"] = build_stage_status(
1567
+ stage="resolver",
1568
+ status="succeeded" if user_id else "failed",
1569
+ route_plan=resolver_route_plan,
1570
+ attempted_routes=resolver_attempts,
1571
+ chosen_route="app",
1572
+ accept_reason="author_id_ready" if user_id else "author_id_unresolved",
1573
+ fallback_reason="" if user_id else "resolver_fallback_unavailable",
1574
+ error_reason=None if user_id else "author_id_unresolved",
1575
+ all_routes_failed=not bool(user_id),
1576
+ )
1577
+
1578
+ trace.append({"step": "xhs.resolver.route_decision", **stage_status["resolver"]})
1579
+
1580
+ if not user_id:
1581
+ stage_status["profile"] = build_stage_status(
1582
+ stage="profile",
1583
+ status="skipped",
1584
+ route_plan=[
1585
+ build_route_plan_entry(
1586
+ route_label=route["route_label"],
1587
+ endpoint=route["path"],
1588
+ method="GET",
1589
+ param_readiness=route["param_readiness"],
1590
+ param_reason=route["param_reason"],
1591
+ )
1592
+ for route in _xhs_route_plan("profile", user_id="", input_value=input_value)
1593
+ ],
1594
+ attempted_routes=[],
1595
+ chosen_route="",
1596
+ accept_reason="",
1597
+ fallback_reason="author_id_unresolved",
1598
+ error_reason="author_id_unresolved",
1599
+ all_routes_failed=False,
1600
+ )
1601
+ stage_status["posts"] = build_stage_status(
1602
+ stage="posts",
1603
+ status="skipped",
1604
+ route_plan=[
1605
+ build_route_plan_entry(
1606
+ route_label=route["route_label"],
1607
+ endpoint=route["path"],
1608
+ method="GET",
1609
+ param_readiness=route["param_readiness"],
1610
+ param_reason=route["param_reason"],
1611
+ )
1612
+ for route in _xhs_route_plan("posts", user_id="", input_value=input_value, cursor="")
1613
+ ],
1614
+ attempted_routes=[],
1615
+ chosen_route="",
1616
+ accept_reason="",
1617
+ fallback_reason="author_id_unresolved",
1618
+ error_reason="author_id_unresolved",
1619
+ all_routes_failed=False,
1620
+ )
1621
+ trace.append({"step": "xhs.profile.route_decision", **stage_status["profile"]})
1622
+ trace.append({"step": "xhs.posts.stage_decision", **stage_status["posts"]})
1623
+ request_id = _pick_request_id(request_id_candidates, trace)
1624
+ if progress is not None:
1625
+ progress.done(
1626
+ stage="author_home.collect",
1627
+ message="xiaohongshu author homepage collected",
1628
+ data={"works_count": 0, "pages": 0, "request_id": request_id},
1629
+ )
1630
+ return {
1631
+ "platform": "xiaohongshu",
1632
+ "resolved_author_id": "",
1633
+ "resolved_xsec_token": xsec_token,
1634
+ "profile_response": {},
1635
+ "works": [],
1636
+ "pagination": {
1637
+ "sort": "latest",
1638
+ "sort_type": "latest",
1639
+ "cursor_mode": "cursor",
1640
+ "pages": [],
1641
+ "total_collected": 0,
1642
+ "max_items": max_items,
1643
+ },
1644
+ "extract_trace": trace,
1645
+ "request_id": request_id,
1646
+ "stage_status": stage_status,
1647
+ "error_reason": "author_id_unresolved",
1648
+ }
797
1649
 
798
- profile_routes = _xhs_route_plan("profile")
1650
+ profile_routes = _xhs_route_plan("profile", user_id=user_id, input_value=input_value)
1651
+ profile_route_plan = [
1652
+ build_route_plan_entry(
1653
+ route_label=str(route["route_label"]),
1654
+ endpoint=str(route["path"]),
1655
+ method="GET",
1656
+ param_readiness=str(route.get("param_readiness") or "ready"),
1657
+ param_reason=str(route.get("param_reason") or ""),
1658
+ )
1659
+ for route in profile_routes
1660
+ ]
799
1661
  profile_resp: Dict[str, Any] = {}
800
1662
  profile_reason: Optional[str] = None
801
1663
  profile_attempts: List[Dict[str, Any]] = []
802
- for step_name, path, route_label in profile_routes:
1664
+ for route in profile_routes:
1665
+ if route.get("param_readiness") != "ready":
1666
+ profile_attempts.append(
1667
+ _build_unavailable_attempt(
1668
+ route_label=str(route["route_label"]),
1669
+ endpoint=str(route["path"]),
1670
+ reason=str(route.get("param_reason") or "fallback_param_unavailable"),
1671
+ )
1672
+ )
1673
+ continue
803
1674
  profile_resp = _call_xhs_route(
804
1675
  base_url=base_url,
805
1676
  token=token,
806
1677
  timeout_ms=timeout_ms,
807
- path=path,
808
- route_label=route_label,
809
- params={"user_id": user_id, "share_text": input_value, "xsec_token": xsec_token or None},
1678
+ path=str(route["path"]),
1679
+ route_label=str(route["route_label"]),
1680
+ params=dict(route.get("params") or {}),
810
1681
  fallback_reason=profile_reason,
811
1682
  completeness_builder=lambda data, resolved_author_id=user_id: _xhs_profile_field_completeness(data, resolved_author_id),
812
1683
  )
813
1684
  profile_decision = _xhs_profile_accept_decision(profile_resp, profile_resp.get("_field_completeness") or {})
814
1685
  profile_attempts.append(
815
- {
816
- "route_label": route_label,
817
- "endpoint": path,
818
- "accepted": bool(profile_decision.get("accepted")),
819
- "accept_reason": profile_decision.get("accept_reason"),
820
- "fallback_reason": profile_decision.get("fallback_reason"),
821
- "field_completeness": profile_resp.get("_field_completeness"),
822
- "request_id": profile_resp.get("request_id"),
823
- }
1686
+ build_attempted_route(
1687
+ route_label=str(route["route_label"]),
1688
+ endpoint=str(route["path"]),
1689
+ response=profile_resp,
1690
+ accepted=bool(profile_decision.get("accepted")),
1691
+ accept_reason=str(profile_decision.get("accept_reason") or ""),
1692
+ fallback_reason=str(profile_decision.get("fallback_reason") or ""),
1693
+ extra={"field_completeness": profile_resp.get("_field_completeness")},
1694
+ )
824
1695
  )
825
1696
  trace.append(
826
1697
  build_api_trace(
827
- step=step_name,
828
- endpoint=path,
1698
+ step=str(route["step_name"]),
1699
+ endpoint=str(route["path"]),
829
1700
  response=profile_resp,
830
1701
  extra={
831
- "route_label": route_label,
1702
+ "route_label": route["route_label"],
832
1703
  "field_completeness": profile_resp.get("_field_completeness"),
833
1704
  "accept_reason": profile_decision.get("accept_reason"),
834
1705
  "route_accepted": bool(profile_decision.get("accepted")),
@@ -845,14 +1716,32 @@ def collect_xhs_author_home_raw(
845
1716
  trace.append(
846
1717
  {
847
1718
  "step": "xhs.profile.route_decision",
1719
+ "route_plan": profile_route_plan,
848
1720
  "chosen_route": profile_resp.get("_route_label"),
849
1721
  "request_id": profile_resp.get("request_id"),
850
1722
  "field_completeness": profile_resp.get("_field_completeness"),
851
1723
  "accept_reason": profile_resp.get("_accept_reason"),
852
1724
  "fallback_reason": profile_reason,
853
1725
  "attempted_routes": profile_attempts,
1726
+ "all_routes_failed": not any(bool(attempt.get("accepted")) for attempt in profile_attempts),
854
1727
  }
855
1728
  )
1729
+ profile_has_accepted = any(bool(attempt.get("accepted")) for attempt in profile_attempts)
1730
+ profile_has_ok_response = any(bool(attempt.get("ok")) for attempt in profile_attempts if not attempt.get("skipped"))
1731
+ profile_error_reason = None
1732
+ if not profile_has_accepted:
1733
+ profile_error_reason = "profile_contract_incomplete" if profile_has_ok_response else "profile_all_routes_failed"
1734
+ stage_status["profile"] = build_stage_status(
1735
+ stage="profile",
1736
+ status="succeeded" if profile_has_accepted else "failed",
1737
+ route_plan=profile_route_plan,
1738
+ attempted_routes=profile_attempts,
1739
+ chosen_route=str(profile_resp.get("_route_label") or ""),
1740
+ accept_reason=str(profile_resp.get("_accept_reason") or ""),
1741
+ fallback_reason=str(profile_reason or ""),
1742
+ error_reason=profile_error_reason,
1743
+ all_routes_failed=not profile_has_accepted,
1744
+ )
856
1745
 
857
1746
  works: List[Dict[str, Any]] = []
858
1747
  seen_ids = set()
@@ -860,9 +1749,9 @@ def collect_xhs_author_home_raw(
860
1749
  has_more = True
861
1750
  page = 0
862
1751
  pagination_trace: List[Dict[str, Any]] = []
863
-
864
- max_pages = max(pages_max, 1)
865
- page_limit = min(max(page_size, 1), 20)
1752
+ posts_attempts_all: List[Dict[str, Any]] = []
1753
+ posts_accepted_routes: List[str] = []
1754
+ posts_error_reason: Optional[str] = None
866
1755
 
867
1756
  while has_more and page < max_pages and len(works) < max_items:
868
1757
  page += 1
@@ -872,48 +1761,66 @@ def collect_xhs_author_home_raw(
872
1761
  message="xiaohongshu pagination page requested",
873
1762
  data={"page": page, "cursor_in": cursor},
874
1763
  )
875
- posts_routes = _xhs_route_plan("posts")
1764
+ posts_routes = _xhs_route_plan("posts", user_id=user_id, input_value=input_value, cursor=cursor)
876
1765
  posts_resp: Dict[str, Any] = {}
877
1766
  posts_reason: Optional[str] = None
878
1767
  posts_attempts: List[Dict[str, Any]] = []
879
- for step_name, path, route_label in posts_routes:
1768
+ page_route_plan = [
1769
+ build_route_plan_entry(
1770
+ route_label=str(route["route_label"]),
1771
+ endpoint=str(route["path"]),
1772
+ method="GET",
1773
+ param_readiness=str(route.get("param_readiness") or "ready"),
1774
+ param_reason=str(route.get("param_reason") or ""),
1775
+ )
1776
+ for route in posts_routes
1777
+ ]
1778
+ for route in posts_routes:
1779
+ if route.get("param_readiness") != "ready":
1780
+ skipped_attempt = _build_unavailable_attempt(
1781
+ route_label=str(route["route_label"]),
1782
+ endpoint=str(route["path"]),
1783
+ reason=str(route.get("param_reason") or "fallback_param_unavailable"),
1784
+ extra={"page": page, "cursor_in": cursor},
1785
+ )
1786
+ posts_attempts.append(skipped_attempt)
1787
+ posts_attempts_all.append(skipped_attempt)
1788
+ continue
880
1789
  posts_resp = _call_xhs_route(
881
1790
  base_url=base_url,
882
1791
  token=token,
883
1792
  timeout_ms=timeout_ms,
884
- path=path,
885
- route_label=route_label,
886
- params={
887
- "user_id": user_id,
888
- "share_text": input_value,
889
- "cursor": cursor or None,
890
- "num": page_limit,
891
- "xsec_token": xsec_token or None,
892
- },
1793
+ path=str(route["path"]),
1794
+ route_label=str(route["route_label"]),
1795
+ params=dict(route.get("params") or {}),
893
1796
  fallback_reason=posts_reason,
894
1797
  completeness_builder=_xhs_posts_field_completeness,
895
1798
  )
896
1799
  posts_decision = _xhs_posts_accept_decision(posts_resp, posts_resp.get("_field_completeness") or {})
897
- posts_attempts.append(
898
- {
899
- "route_label": route_label,
900
- "endpoint": path,
901
- "accepted": bool(posts_decision.get("accepted")),
902
- "accept_reason": posts_decision.get("accept_reason"),
903
- "fallback_reason": posts_decision.get("fallback_reason"),
1800
+ posts_attempt = build_attempted_route(
1801
+ route_label=str(route["route_label"]),
1802
+ endpoint=str(route["path"]),
1803
+ response=posts_resp,
1804
+ accepted=bool(posts_decision.get("accepted")),
1805
+ accept_reason=str(posts_decision.get("accept_reason") or ""),
1806
+ fallback_reason=str(posts_decision.get("fallback_reason") or ""),
1807
+ extra={
1808
+ "page": page,
1809
+ "cursor_in": cursor,
904
1810
  "field_completeness": posts_resp.get("_field_completeness"),
905
- "request_id": posts_resp.get("request_id"),
906
- }
1811
+ },
907
1812
  )
1813
+ posts_attempts.append(posts_attempt)
1814
+ posts_attempts_all.append(posts_attempt)
908
1815
  trace.append(
909
1816
  build_api_trace(
910
- step=step_name,
911
- endpoint=path,
1817
+ step=str(route["step_name"]),
1818
+ endpoint=str(route["path"]),
912
1819
  response=posts_resp,
913
1820
  extra={
914
1821
  "page": page,
915
1822
  "cursor": cursor,
916
- "route_label": route_label,
1823
+ "route_label": route["route_label"],
917
1824
  "field_completeness": posts_resp.get("_field_completeness"),
918
1825
  "accept_reason": posts_decision.get("accept_reason"),
919
1826
  "route_accepted": bool(posts_decision.get("accepted")),
@@ -923,6 +1830,7 @@ def collect_xhs_author_home_raw(
923
1830
  request_id_candidates.append(posts_resp)
924
1831
  if posts_decision.get("accepted"):
925
1832
  posts_resp["_accept_reason"] = posts_decision.get("accept_reason")
1833
+ posts_accepted_routes.append(str(route["route_label"]))
926
1834
  break
927
1835
  posts_reason = str(posts_decision.get("fallback_reason") or "field_completeness_below_threshold")
928
1836
  posts_resp["fallback_trigger_reason"] = posts_reason
@@ -932,15 +1840,35 @@ def collect_xhs_author_home_raw(
932
1840
  "step": "xhs.posts.route_decision",
933
1841
  "page": page,
934
1842
  "cursor_in": cursor,
1843
+ "route_plan": page_route_plan,
935
1844
  "chosen_route": posts_resp.get("_route_label"),
936
1845
  "request_id": posts_resp.get("request_id"),
937
1846
  "field_completeness": posts_resp.get("_field_completeness"),
938
1847
  "accept_reason": posts_resp.get("_accept_reason"),
939
1848
  "fallback_reason": posts_reason,
940
1849
  "attempted_routes": posts_attempts,
1850
+ "all_routes_failed": not any(bool(attempt.get("accepted")) for attempt in posts_attempts),
941
1851
  }
942
1852
  )
943
1853
 
1854
+ if not posts_resp.get("_accept_reason"):
1855
+ posts_error_reason = "posts_contract_incomplete" if any(bool(attempt.get("ok")) for attempt in posts_attempts if not attempt.get("skipped")) else "posts_all_routes_failed"
1856
+ pagination_trace.append(
1857
+ {
1858
+ "page": page,
1859
+ "cursor_in": cursor,
1860
+ "cursor_out": "",
1861
+ "cursor_source": "missing",
1862
+ "has_more_raw": None,
1863
+ "has_more_normalized": None,
1864
+ "items": 0,
1865
+ "route_label": posts_resp.get("_route_label"),
1866
+ "request_id": posts_resp.get("request_id"),
1867
+ "stop_reason": posts_error_reason,
1868
+ }
1869
+ )
1870
+ break
1871
+
944
1872
  data = posts_resp.get("data")
945
1873
  page_items = _extract_xhs_posts_items(data)
946
1874
  next_cursor_raw = _extract_xhs_response_cursor(data)
@@ -1005,6 +1933,32 @@ def collect_xhs_author_home_raw(
1005
1933
  if should_continue and next_cursor:
1006
1934
  cursor = next_cursor
1007
1935
 
1936
+ posts_route_plan = [
1937
+ build_route_plan_entry(
1938
+ route_label=str(route["route_label"]),
1939
+ endpoint=str(route["path"]),
1940
+ method="GET",
1941
+ param_readiness=str(route.get("param_readiness") or "ready"),
1942
+ param_reason=str(route.get("param_reason") or ""),
1943
+ )
1944
+ for route in _xhs_route_plan("posts", user_id=user_id, input_value=input_value, cursor="")
1945
+ ]
1946
+ posts_has_accepted = bool(posts_accepted_routes)
1947
+ if not posts_has_accepted and posts_error_reason is None:
1948
+ posts_error_reason = "posts_all_routes_failed"
1949
+ stage_status["posts"] = build_stage_status(
1950
+ stage="posts",
1951
+ status="succeeded" if posts_has_accepted else "failed",
1952
+ route_plan=posts_route_plan,
1953
+ attempted_routes=posts_attempts_all,
1954
+ chosen_route=posts_accepted_routes[0] if len(set(posts_accepted_routes)) == 1 and posts_accepted_routes else ("mixed" if posts_accepted_routes else ""),
1955
+ accept_reason="posts_pages_collected" if posts_has_accepted else "",
1956
+ fallback_reason=str(posts_error_reason or ""),
1957
+ error_reason=None if posts_has_accepted else posts_error_reason,
1958
+ all_routes_failed=not posts_has_accepted,
1959
+ )
1960
+ trace.append({"step": "xhs.posts.stage_decision", **stage_status["posts"]})
1961
+
1008
1962
  request_id = _pick_request_id(request_id_candidates, trace)
1009
1963
  if progress is not None:
1010
1964
  progress.done(
@@ -1013,6 +1967,14 @@ def collect_xhs_author_home_raw(
1013
1967
  data={"works_count": len(works), "pages": len(pagination_trace), "request_id": request_id},
1014
1968
  )
1015
1969
 
1970
+ collect_error_reason: Optional[str] = None
1971
+ if stage_status.get("resolver", {}).get("status") == "failed":
1972
+ collect_error_reason = str(stage_status["resolver"].get("error_reason") or "author_id_unresolved")
1973
+ elif not works and stage_status.get("posts", {}).get("status") == "failed":
1974
+ collect_error_reason = stage_status["posts"].get("error_reason")
1975
+ elif stage_status.get("profile", {}).get("status") == "failed":
1976
+ collect_error_reason = stage_status["profile"].get("error_reason")
1977
+
1016
1978
  return {
1017
1979
  "platform": "xiaohongshu",
1018
1980
  "resolved_author_id": user_id,
@@ -1029,4 +1991,6 @@ def collect_xhs_author_home_raw(
1029
1991
  },
1030
1992
  "extract_trace": trace,
1031
1993
  "request_id": request_id,
1994
+ "stage_status": stage_status,
1995
+ "error_reason": collect_error_reason,
1032
1996
  }