@tikomni/skills 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/skills/social-media-crawl/scripts/core/extract_pipeline.py +93 -1
- package/skills/social-media-crawl/scripts/pipelines/homepage_collectors.py +1066 -102
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_creator_home.py +9 -4
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_single_work.py +102 -25
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_creator_home.py +9 -4
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_single_work.py +335 -78
- package/skills/social-media-crawl/scripts/writers/write_work_fact_card.py +8 -1
- package/skills/social-media-crawl/tests/test_fixed_pipeline_fallback.py +169 -0
|
@@ -3,10 +3,16 @@
|
|
|
3
3
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
|
+
import os
|
|
6
7
|
from typing import Any, Dict, List, Optional, Tuple
|
|
7
8
|
from urllib.parse import parse_qs, urlparse
|
|
8
9
|
|
|
9
|
-
from scripts.core.extract_pipeline import
|
|
10
|
+
from scripts.core.extract_pipeline import (
|
|
11
|
+
build_api_trace,
|
|
12
|
+
build_attempted_route,
|
|
13
|
+
build_route_plan_entry,
|
|
14
|
+
build_stage_status,
|
|
15
|
+
)
|
|
10
16
|
from scripts.core.progress_report import ProgressReporter
|
|
11
17
|
from scripts.core.tikomni_common import call_json_api, deep_find_all, deep_find_first
|
|
12
18
|
from scripts.pipelines.input_contracts import extract_douyin_sec_uid, extract_xhs_user_id, looks_like_xhs_user_id
|
|
@@ -301,6 +307,34 @@ def _preview(value: Any, max_len: int = 160) -> str:
|
|
|
301
307
|
return text[:max_len]
|
|
302
308
|
|
|
303
309
|
|
|
310
|
+
def _response_failure_reason(response: Dict[str, Any]) -> str:
|
|
311
|
+
if response.get("timeout_retry_exhausted"):
|
|
312
|
+
return "primary_timeout_retry_exhausted"
|
|
313
|
+
if response.get("error_reason"):
|
|
314
|
+
return "primary_non_timeout_failure"
|
|
315
|
+
return "primary_unknown_failure"
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def _build_unavailable_attempt(
    *,
    route_label: str,
    endpoint: str,
    reason: str,
    extra: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Build the attempted-route record for a route that was skipped.

    The route is reported as not accepted and skipped because its required
    parameters were unavailable; ``reason`` is used both as the fallback
    reason and as the parameter reason.

    Args:
        route_label: Label of the skipped route.
        endpoint: Endpoint path of the skipped route.
        reason: Why the route's parameters were unavailable.
        extra: Optional additional data forwarded unchanged.

    Returns:
        Whatever ``build_attempted_route`` returns for these arguments.
    """
    skipped_route_kwargs: Dict[str, Any] = {
        "route_label": route_label,
        "endpoint": endpoint,
        "accepted": False,
        "accept_reason": "skipped_param_unavailable",
        "fallback_reason": reason,
        "param_readiness": "unavailable",
        "param_reason": reason,
        "skipped": True,
        "extra": extra,
    }
    return build_attempted_route(**skipped_route_kwargs)
|
|
336
|
+
|
|
337
|
+
|
|
304
338
|
def _pick_request_id(responses: List[Optional[Dict[str, Any]]], trace: Optional[List[Dict[str, Any]]] = None) -> Optional[str]:
|
|
305
339
|
for response in responses:
|
|
306
340
|
if isinstance(response, dict) and response.get("request_id"):
|
|
@@ -404,20 +438,12 @@ def _xhs_posts_field_completeness(payload: Any) -> Dict[str, Any]:
|
|
|
404
438
|
return _build_field_completeness(fields, core_keys=["items", "platform_work_id", "title_or_caption", "published_date"])
|
|
405
439
|
|
|
406
440
|
|
|
407
|
-
def _xhs_route_failure_reason(response: Dict[str, Any]) -> str:
|
|
408
|
-
if response.get("timeout_retry_exhausted"):
|
|
409
|
-
return "primary_timeout_retry_exhausted"
|
|
410
|
-
if response.get("error_reason"):
|
|
411
|
-
return "primary_non_timeout_failure"
|
|
412
|
-
return "primary_unknown_failure"
|
|
413
|
-
|
|
414
|
-
|
|
415
441
|
def _xhs_profile_accept_decision(response: Dict[str, Any], completeness: Dict[str, Any]) -> Dict[str, Any]:
|
|
416
442
|
if not response.get("ok"):
|
|
417
443
|
return {
|
|
418
444
|
"accepted": False,
|
|
419
445
|
"accept_reason": "response_not_ok",
|
|
420
|
-
"fallback_reason": _xhs_route_failure_reason(response),
|
|
446
|
+
"fallback_reason": _response_failure_reason(response),
|
|
421
447
|
}
|
|
422
448
|
|
|
423
449
|
missing_core = list(completeness.get("missing_core") or [])
|
|
@@ -449,7 +475,7 @@ def _xhs_posts_accept_decision(response: Dict[str, Any], completeness: Dict[str,
|
|
|
449
475
|
return {
|
|
450
476
|
"accepted": False,
|
|
451
477
|
"accept_reason": "response_not_ok",
|
|
452
|
-
"fallback_reason": _xhs_route_failure_reason(response),
|
|
478
|
+
"fallback_reason": _response_failure_reason(response),
|
|
453
479
|
}
|
|
454
480
|
|
|
455
481
|
missing_core = list(completeness.get("missing_core") or [])
|
|
@@ -480,18 +506,287 @@ def _xhs_posts_accept_decision(response: Dict[str, Any], completeness: Dict[str,
|
|
|
480
506
|
}
|
|
481
507
|
|
|
482
508
|
|
|
483
|
-
def
|
|
509
|
+
def _douyin_profile_field_completeness(payload: Any, resolved_author_id: str) -> Dict[str, Any]:
    """Report which Douyin profile fields can be found in ``payload``.

    Args:
        payload: Profile response data to probe.
        resolved_author_id: Author id already resolved by the caller; it
            counts as a present ``platform_author_id`` even when the payload
            carries none.

    Returns:
        The result of ``_build_field_completeness`` for the presence flags,
        with ``platform_author_id`` and ``nickname`` as the core keys.
    """
    author_id_text = _pick_text(payload, ["sec_user_id", "sec_uid", "uid", "user_id", "id"])
    nickname_text = _pick_text(payload, ["nickname", "name"])
    avatar_candidate = _first_url_candidate(
        payload, ["avatar_larger", "avatar_thumb", "avatar_url", "avatar", "images"]
    )
    avatar_link = _extract_first_url(avatar_candidate)
    follower_total = _pick_int(payload, ["follower_count", "fans_count", "mplatform_followers_count"], default=0)
    work_total = _pick_int(payload, ["aweme_count", "works_count", "video_count"], default=0)
    handle_text = _pick_text(payload, ["unique_id", "short_id", "douyin_id", "display_id"])

    presence = {
        "platform_author_id": bool(author_id_text or resolved_author_id),
        "nickname": bool(nickname_text),
        "avatar_url": bool(avatar_link),
        # Counters only count as present when strictly positive.
        "fans_count": follower_total > 0,
        "works_count": work_total > 0,
        "unique_id": bool(handle_text),
    }
    return _build_field_completeness(presence, core_keys=["platform_author_id", "nickname"])
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
def _douyin_posts_field_completeness(payload: Any) -> Dict[str, Any]:
    """Report which Douyin posts-page fields can be found in ``payload``.

    Item-level fields are probed on the first mapping picked from the
    extracted items; paging fields are probed on the payload itself.

    Args:
        payload: Posts response data to probe.

    Returns:
        The result of ``_build_field_completeness`` for the presence flags,
        with ``items``, ``platform_work_id``, ``title_or_caption`` and
        ``published_date`` as the core keys.
    """
    posts = _extract_douyin_posts_items(payload)
    sample = _pick_first_mapping(posts)
    has_more_known = _extract_douyin_posts_has_more(payload) is not None
    cursor_known = _extract_douyin_posts_next_cursor(payload) is not None
    cover_candidate = _first_url_candidate(sample, ["cover_url", "cover", "origin_cover", "image", "images"])
    cover_known = bool(_extract_first_url(cover_candidate))

    # A share link is preferred; a bare work id is accepted as a substitute.
    if _pick_text(sample, ["share_url", "share_link", "url"]):
        link_or_id_known = True
    else:
        link_or_id_known = bool(_pick_text(sample, ["aweme_id", "item_id", "id"]))

    # default=-1 distinguishes "counter absent" from a genuine zero.
    counters = [
        _pick_int(sample, [counter_key], default=-1)
        for counter_key in ("digg_count", "comment_count", "collect_count", "share_count", "play_count")
    ]

    presence = {
        "items": len(posts) > 0,
        "platform_work_id": bool(_pick_text(sample, ["aweme_id", "item_id", "id"])),
        "title_or_caption": bool(_pick_text(sample, ["title", "caption", "desc"])),
        "published_date": bool(_pick_text(sample, ["create_time", "publish_time"])),
        "base_link_fields": cover_known or link_or_id_known,
        "interaction_fields": any(counter >= 0 for counter in counters),
        "cursor": cursor_known,
        "has_more_flag": has_more_known,
        "response_shape": len(posts) > 0 or cursor_known or has_more_known,
    }
    return _build_field_completeness(
        presence,
        core_keys=["items", "platform_work_id", "title_or_caption", "published_date"],
    )
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
def _douyin_profile_accept_decision(response: Dict[str, Any], completeness: Dict[str, Any]) -> Dict[str, Any]:
|
|
550
|
+
if not response.get("ok"):
|
|
551
|
+
return {
|
|
552
|
+
"accepted": False,
|
|
553
|
+
"accept_reason": "response_not_ok",
|
|
554
|
+
"fallback_reason": _response_failure_reason(response),
|
|
555
|
+
}
|
|
556
|
+
missing_core = list(completeness.get("missing_core") or [])
|
|
557
|
+
if missing_core:
|
|
558
|
+
return {
|
|
559
|
+
"accepted": False,
|
|
560
|
+
"accept_reason": "profile_missing_core_fields",
|
|
561
|
+
"fallback_reason": f"profile_missing_core:{','.join(missing_core)}",
|
|
562
|
+
}
|
|
563
|
+
fields = completeness.get("fields") if isinstance(completeness.get("fields"), dict) else {}
|
|
564
|
+
optional_missing = [
|
|
565
|
+
field_name
|
|
566
|
+
for field_name in ("avatar_url", "fans_count", "works_count", "unique_id")
|
|
567
|
+
if not fields.get(field_name)
|
|
568
|
+
]
|
|
569
|
+
accept_reason = "profile_core_fields_ready"
|
|
570
|
+
if optional_missing:
|
|
571
|
+
accept_reason = f"profile_core_fields_ready_optional_missing:{','.join(optional_missing)}"
|
|
572
|
+
return {
|
|
573
|
+
"accepted": True,
|
|
574
|
+
"accept_reason": accept_reason,
|
|
575
|
+
"fallback_reason": "",
|
|
576
|
+
}
|
|
577
|
+
|
|
578
|
+
|
|
579
|
+
def _douyin_posts_accept_decision(response: Dict[str, Any], completeness: Dict[str, Any]) -> Dict[str, Any]:
|
|
580
|
+
if not response.get("ok"):
|
|
581
|
+
return {
|
|
582
|
+
"accepted": False,
|
|
583
|
+
"accept_reason": "response_not_ok",
|
|
584
|
+
"fallback_reason": _response_failure_reason(response),
|
|
585
|
+
}
|
|
586
|
+
missing_core = list(completeness.get("missing_core") or [])
|
|
587
|
+
if missing_core:
|
|
588
|
+
return {
|
|
589
|
+
"accepted": False,
|
|
590
|
+
"accept_reason": "posts_missing_core_fields",
|
|
591
|
+
"fallback_reason": f"posts_missing_core:{','.join(missing_core)}",
|
|
592
|
+
}
|
|
593
|
+
fields = completeness.get("fields") if isinstance(completeness.get("fields"), dict) else {}
|
|
594
|
+
if not fields.get("base_link_fields"):
|
|
595
|
+
return {
|
|
596
|
+
"accepted": False,
|
|
597
|
+
"accept_reason": "posts_missing_base_link_fields",
|
|
598
|
+
"fallback_reason": "posts_missing_base_link_fields",
|
|
599
|
+
}
|
|
600
|
+
return {
|
|
601
|
+
"accepted": True,
|
|
602
|
+
"accept_reason": "posts_contract_fields_ready",
|
|
603
|
+
"fallback_reason": "",
|
|
604
|
+
}
|
|
605
|
+
|
|
606
|
+
|
|
607
|
+
def _douyin_profile_route_plan(*, sec_user_id: str, unique_id: str, uid: str) -> List[Dict[str, Any]]:
|
|
608
|
+
sec_ready = bool(sec_user_id)
|
|
609
|
+
unique_ready = bool(unique_id)
|
|
610
|
+
uid_ready = bool(uid)
|
|
611
|
+
return [
|
|
612
|
+
{
|
|
613
|
+
"step_name": "douyin.profile.app_v3",
|
|
614
|
+
"path": "/api/u1/v1/douyin/app/v3/handler_user_profile",
|
|
615
|
+
"route_label": "app_v3",
|
|
616
|
+
"params": {"sec_user_id": sec_user_id or None},
|
|
617
|
+
"param_readiness": "ready" if sec_ready else "unavailable",
|
|
618
|
+
"param_reason": "" if sec_ready else "missing_sec_user_id",
|
|
619
|
+
},
|
|
620
|
+
{
|
|
621
|
+
"step_name": "douyin.profile.web_v4",
|
|
622
|
+
"path": "/api/u1/v1/douyin/web/handler_user_profile_v4",
|
|
623
|
+
"route_label": "web_v4",
|
|
624
|
+
"params": {"sec_user_id": sec_user_id or None},
|
|
625
|
+
"param_readiness": "ready" if sec_ready else "unavailable",
|
|
626
|
+
"param_reason": "" if sec_ready else "missing_sec_user_id",
|
|
627
|
+
},
|
|
628
|
+
{
|
|
629
|
+
"step_name": "douyin.profile.web",
|
|
630
|
+
"path": "/api/u1/v1/douyin/web/handler_user_profile",
|
|
631
|
+
"route_label": "web",
|
|
632
|
+
"params": {"sec_user_id": sec_user_id or None},
|
|
633
|
+
"param_readiness": "ready" if sec_ready else "unavailable",
|
|
634
|
+
"param_reason": "" if sec_ready else "missing_sec_user_id",
|
|
635
|
+
},
|
|
636
|
+
{
|
|
637
|
+
"step_name": "douyin.profile.web_v2",
|
|
638
|
+
"path": "/api/u1/v1/douyin/web/handler_user_profile_v2",
|
|
639
|
+
"route_label": "web_v2",
|
|
640
|
+
"params": {"unique_id": unique_id or None},
|
|
641
|
+
"param_readiness": "ready" if unique_ready else "unavailable",
|
|
642
|
+
"param_reason": "" if unique_ready else "missing_unique_id",
|
|
643
|
+
},
|
|
644
|
+
{
|
|
645
|
+
"step_name": "douyin.profile.web_v3",
|
|
646
|
+
"path": "/api/u1/v1/douyin/web/handler_user_profile_v3",
|
|
647
|
+
"route_label": "web_v3",
|
|
648
|
+
"params": {"uid": uid or None},
|
|
649
|
+
"param_readiness": "ready" if uid_ready else "unavailable",
|
|
650
|
+
"param_reason": "" if uid_ready else "missing_uid",
|
|
651
|
+
},
|
|
652
|
+
]
|
|
653
|
+
|
|
654
|
+
|
|
655
|
+
def _douyin_posts_route_plan(*, sec_user_id: str, cursor: int, count: int, cookie: str) -> List[Dict[str, Any]]:
|
|
656
|
+
sec_ready = bool(sec_user_id)
|
|
657
|
+
web_ready = sec_ready and bool(cookie)
|
|
658
|
+
web_reason = ""
|
|
659
|
+
if not sec_ready:
|
|
660
|
+
web_reason = "missing_sec_user_id"
|
|
661
|
+
elif not cookie:
|
|
662
|
+
web_reason = "fallback_requires_cookie"
|
|
663
|
+
return [
|
|
664
|
+
{
|
|
665
|
+
"step_name": "douyin.posts.app_v3",
|
|
666
|
+
"path": "/api/u1/v1/douyin/app/v3/fetch_user_post_videos",
|
|
667
|
+
"route_label": "app_v3",
|
|
668
|
+
"params": {
|
|
669
|
+
"sec_user_id": sec_user_id or None,
|
|
670
|
+
"count": count,
|
|
671
|
+
"max_cursor": cursor,
|
|
672
|
+
"sort_type": 0,
|
|
673
|
+
},
|
|
674
|
+
"param_readiness": "ready" if sec_ready else "unavailable",
|
|
675
|
+
"param_reason": "" if sec_ready else "missing_sec_user_id",
|
|
676
|
+
},
|
|
677
|
+
{
|
|
678
|
+
"step_name": "douyin.posts.web",
|
|
679
|
+
"path": "/api/u1/v1/douyin/web/fetch_user_post_videos",
|
|
680
|
+
"route_label": "web",
|
|
681
|
+
"params": {
|
|
682
|
+
"sec_user_id": sec_user_id or None,
|
|
683
|
+
"count": count,
|
|
684
|
+
"max_cursor": cursor,
|
|
685
|
+
"filter_type": 0,
|
|
686
|
+
"cookie": cookie or None,
|
|
687
|
+
},
|
|
688
|
+
"param_readiness": "ready" if web_ready else "unavailable",
|
|
689
|
+
"param_reason": web_reason,
|
|
690
|
+
},
|
|
691
|
+
]
|
|
692
|
+
|
|
693
|
+
|
|
694
|
+
def _xhs_route_plan(kind: str, *, user_id: str, input_value: str, cursor: Any = "") -> List[Dict[str, Any]]:
|
|
695
|
+
user_ready = bool(user_id)
|
|
484
696
|
if kind == "profile":
|
|
485
697
|
return [
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
698
|
+
{
|
|
699
|
+
"step_name": "xhs.profile.app_v2",
|
|
700
|
+
"path": "/api/u1/v1/xiaohongshu/app_v2/get_user_info",
|
|
701
|
+
"route_label": "app_v2",
|
|
702
|
+
"params": {"user_id": user_id or None, "share_text": input_value or None},
|
|
703
|
+
"param_readiness": "ready" if user_ready else "unavailable",
|
|
704
|
+
"param_reason": "" if user_ready else "missing_user_id",
|
|
705
|
+
},
|
|
706
|
+
{
|
|
707
|
+
"step_name": "xhs.profile.app",
|
|
708
|
+
"path": "/api/u1/v1/xiaohongshu/app/get_user_info",
|
|
709
|
+
"route_label": "app",
|
|
710
|
+
"params": {"user_id": user_id or None},
|
|
711
|
+
"param_readiness": "ready" if user_ready else "unavailable",
|
|
712
|
+
"param_reason": "" if user_ready else "missing_user_id",
|
|
713
|
+
},
|
|
714
|
+
{
|
|
715
|
+
"step_name": "xhs.profile.web_v2_app",
|
|
716
|
+
"path": "/api/u1/v1/xiaohongshu/web_v2/fetch_user_info_app",
|
|
717
|
+
"route_label": "web_v2_app",
|
|
718
|
+
"params": {"user_id": user_id or None},
|
|
719
|
+
"param_readiness": "ready" if user_ready else "unavailable",
|
|
720
|
+
"param_reason": "" if user_ready else "missing_user_id",
|
|
721
|
+
},
|
|
722
|
+
{
|
|
723
|
+
"step_name": "xhs.profile.web_v2",
|
|
724
|
+
"path": "/api/u1/v1/xiaohongshu/web_v2/fetch_user_info",
|
|
725
|
+
"route_label": "web_v2",
|
|
726
|
+
"params": {"user_id": user_id or None},
|
|
727
|
+
"param_readiness": "ready" if user_ready else "unavailable",
|
|
728
|
+
"param_reason": "" if user_ready else "missing_user_id",
|
|
729
|
+
},
|
|
730
|
+
{
|
|
731
|
+
"step_name": "xhs.profile.web_v1_v2",
|
|
732
|
+
"path": "/api/u1/v1/xiaohongshu/web/get_user_info_v2",
|
|
733
|
+
"route_label": "web_v1_v2",
|
|
734
|
+
"params": {"user_id": user_id or None, "share_text": input_value or None},
|
|
735
|
+
"param_readiness": "ready" if user_ready else "unavailable",
|
|
736
|
+
"param_reason": "" if user_ready else "missing_user_id",
|
|
737
|
+
},
|
|
738
|
+
{
|
|
739
|
+
"step_name": "xhs.profile.web_v1",
|
|
740
|
+
"path": "/api/u1/v1/xiaohongshu/web/get_user_info",
|
|
741
|
+
"route_label": "web_v1",
|
|
742
|
+
"params": {"user_id": user_id or None},
|
|
743
|
+
"param_readiness": "ready" if user_ready else "unavailable",
|
|
744
|
+
"param_reason": "" if user_ready else "missing_user_id",
|
|
745
|
+
},
|
|
489
746
|
]
|
|
490
747
|
if kind == "posts":
|
|
748
|
+
last_cursor = _to_text(cursor)
|
|
491
749
|
return [
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
750
|
+
{
|
|
751
|
+
"step_name": "xhs.posts.app_v2",
|
|
752
|
+
"path": "/api/u1/v1/xiaohongshu/app_v2/get_user_posted_notes",
|
|
753
|
+
"route_label": "app_v2",
|
|
754
|
+
"params": {"user_id": user_id or None, "share_text": input_value or None, "cursor": last_cursor or None},
|
|
755
|
+
"param_readiness": "ready" if user_ready else "unavailable",
|
|
756
|
+
"param_reason": "" if user_ready else "missing_user_id",
|
|
757
|
+
},
|
|
758
|
+
{
|
|
759
|
+
"step_name": "xhs.posts.app",
|
|
760
|
+
"path": "/api/u1/v1/xiaohongshu/app/get_user_notes",
|
|
761
|
+
"route_label": "app",
|
|
762
|
+
"params": {"user_id": user_id or None, "cursor": last_cursor or None},
|
|
763
|
+
"param_readiness": "ready" if user_ready else "unavailable",
|
|
764
|
+
"param_reason": "" if user_ready else "missing_user_id",
|
|
765
|
+
},
|
|
766
|
+
{
|
|
767
|
+
"step_name": "xhs.posts.web_v2_app",
|
|
768
|
+
"path": "/api/u1/v1/xiaohongshu/web_v2/fetch_home_notes_app",
|
|
769
|
+
"route_label": "web_v2_app",
|
|
770
|
+
"params": {"user_id": user_id or None, "cursor": last_cursor or None},
|
|
771
|
+
"param_readiness": "ready" if user_ready else "unavailable",
|
|
772
|
+
"param_reason": "" if user_ready else "missing_user_id",
|
|
773
|
+
},
|
|
774
|
+
{
|
|
775
|
+
"step_name": "xhs.posts.web_v2",
|
|
776
|
+
"path": "/api/u1/v1/xiaohongshu/web_v2/fetch_home_notes",
|
|
777
|
+
"route_label": "web_v2",
|
|
778
|
+
"params": {"user_id": user_id or None, "cursor": last_cursor or None},
|
|
779
|
+
"param_readiness": "ready" if user_ready else "unavailable",
|
|
780
|
+
"param_reason": "" if user_ready else "missing_user_id",
|
|
781
|
+
},
|
|
782
|
+
{
|
|
783
|
+
"step_name": "xhs.posts.web_v1_v2",
|
|
784
|
+
"path": "/api/u1/v1/xiaohongshu/web/get_user_notes_v2",
|
|
785
|
+
"route_label": "web_v1_v2",
|
|
786
|
+
"params": {"user_id": user_id or None, "lastCursor": last_cursor or None},
|
|
787
|
+
"param_readiness": "ready" if user_ready else "unavailable",
|
|
788
|
+
"param_reason": "" if user_ready else "missing_user_id",
|
|
789
|
+
},
|
|
495
790
|
]
|
|
496
791
|
raise ValueError(f"unsupported_xhs_route_kind:{kind}")
|
|
497
792
|
|
|
@@ -567,25 +862,79 @@ def collect_douyin_author_home_raw(
|
|
|
567
862
|
progress: Optional[ProgressReporter] = None,
|
|
568
863
|
) -> Dict[str, Any]:
|
|
569
864
|
trace: List[Dict[str, Any]] = []
|
|
865
|
+
stage_status: Dict[str, Any] = {}
|
|
570
866
|
if progress is not None:
|
|
571
867
|
progress.started(stage="author_home.collect", message="collecting douyin author homepage")
|
|
572
868
|
sec_user_id = _guess_douyin_sec_user_id(input_value)
|
|
573
869
|
resolve_resp: Optional[Dict[str, Any]] = None
|
|
574
870
|
request_id_candidates: List[Optional[Dict[str, Any]]] = []
|
|
575
|
-
|
|
576
|
-
|
|
871
|
+
page_limit = min(max(page_size, 1), 20)
|
|
872
|
+
max_pages = max(pages_max, 1)
|
|
873
|
+
resolver_route_plan = [
|
|
874
|
+
build_route_plan_entry(route_label="local_extract", endpoint=None, method="LOCAL"),
|
|
875
|
+
build_route_plan_entry(
|
|
876
|
+
route_label="web",
|
|
877
|
+
endpoint="/api/u1/v1/douyin/web/get_sec_user_id",
|
|
878
|
+
method="GET",
|
|
879
|
+
),
|
|
880
|
+
]
|
|
881
|
+
resolver_attempts: List[Dict[str, Any]] = []
|
|
882
|
+
|
|
883
|
+
if sec_user_id:
|
|
884
|
+
resolver_attempts.append(
|
|
885
|
+
build_attempted_route(
|
|
886
|
+
route_label="local_extract",
|
|
887
|
+
endpoint=None,
|
|
888
|
+
accepted=True,
|
|
889
|
+
accept_reason="author_id_ready",
|
|
890
|
+
param_readiness="ready",
|
|
891
|
+
extra={"resolved_author_id": sec_user_id},
|
|
892
|
+
)
|
|
893
|
+
)
|
|
894
|
+
trace.append(
|
|
895
|
+
{
|
|
896
|
+
"step": "douyin.resolve_sec_user_id.local",
|
|
897
|
+
"route_label": "local_extract",
|
|
898
|
+
"ok": True,
|
|
899
|
+
"resolved_author_id": sec_user_id,
|
|
900
|
+
"accept_reason": "author_id_ready",
|
|
901
|
+
}
|
|
902
|
+
)
|
|
903
|
+
stage_status["resolver"] = build_stage_status(
|
|
904
|
+
stage="resolver",
|
|
905
|
+
status="succeeded",
|
|
906
|
+
route_plan=resolver_route_plan,
|
|
907
|
+
attempted_routes=resolver_attempts,
|
|
908
|
+
chosen_route="local_extract",
|
|
909
|
+
accept_reason="author_id_ready",
|
|
910
|
+
fallback_reason="",
|
|
911
|
+
error_reason=None,
|
|
912
|
+
all_routes_failed=False,
|
|
913
|
+
)
|
|
914
|
+
else:
|
|
577
915
|
resolve_resp = call_json_api(
|
|
578
916
|
base_url=base_url,
|
|
579
917
|
path="/api/u1/v1/douyin/web/get_sec_user_id",
|
|
580
918
|
token=token,
|
|
581
919
|
method="GET",
|
|
582
920
|
timeout_ms=timeout_ms,
|
|
583
|
-
params={"url": input_value
|
|
921
|
+
params={"url": input_value},
|
|
584
922
|
)
|
|
585
923
|
trace.append(build_api_trace(step="douyin.resolve_sec_user_id", endpoint="/api/u1/v1/douyin/web/get_sec_user_id", response=resolve_resp))
|
|
586
924
|
request_id_candidates.append(resolve_resp)
|
|
587
925
|
resolve_data = resolve_resp.get("data")
|
|
588
926
|
sec_user_id = _extract_douyin_sec_user_id(resolve_data)
|
|
927
|
+
resolver_attempts.append(
|
|
928
|
+
build_attempted_route(
|
|
929
|
+
route_label="web",
|
|
930
|
+
endpoint="/api/u1/v1/douyin/web/get_sec_user_id",
|
|
931
|
+
response=resolve_resp,
|
|
932
|
+
accepted=bool(sec_user_id),
|
|
933
|
+
accept_reason="author_id_ready" if sec_user_id else "author_id_unresolved",
|
|
934
|
+
fallback_reason="" if sec_user_id else "resolver_fallback_unavailable",
|
|
935
|
+
extra={"resolved_author_id": sec_user_id},
|
|
936
|
+
)
|
|
937
|
+
)
|
|
589
938
|
if not sec_user_id:
|
|
590
939
|
trace.append(
|
|
591
940
|
{
|
|
@@ -601,17 +950,226 @@ def collect_douyin_author_home_raw(
|
|
|
601
950
|
"input_hint": _preview(input_value, max_len=120),
|
|
602
951
|
}
|
|
603
952
|
)
|
|
953
|
+
stage_status["resolver"] = build_stage_status(
|
|
954
|
+
stage="resolver",
|
|
955
|
+
status="succeeded" if sec_user_id else "failed",
|
|
956
|
+
route_plan=resolver_route_plan,
|
|
957
|
+
attempted_routes=resolver_attempts,
|
|
958
|
+
chosen_route="web",
|
|
959
|
+
accept_reason="author_id_ready" if sec_user_id else "author_id_unresolved",
|
|
960
|
+
fallback_reason="" if sec_user_id else "resolver_fallback_unavailable",
|
|
961
|
+
error_reason=None if sec_user_id else "author_id_unresolved",
|
|
962
|
+
all_routes_failed=not bool(sec_user_id),
|
|
963
|
+
)
|
|
604
964
|
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
timeout_ms=timeout_ms,
|
|
611
|
-
params={"sec_user_id": sec_user_id},
|
|
965
|
+
trace.append(
|
|
966
|
+
{
|
|
967
|
+
"step": "douyin.resolver.route_decision",
|
|
968
|
+
**stage_status["resolver"],
|
|
969
|
+
}
|
|
612
970
|
)
|
|
613
|
-
|
|
614
|
-
|
|
971
|
+
|
|
972
|
+
web_cookie = os.getenv("TIKOMNI_DOUYIN_WEB_COOKIE", "").strip()
|
|
973
|
+
if not sec_user_id:
|
|
974
|
+
stage_status["profile"] = build_stage_status(
|
|
975
|
+
stage="profile",
|
|
976
|
+
status="skipped",
|
|
977
|
+
route_plan=[
|
|
978
|
+
build_route_plan_entry(
|
|
979
|
+
route_label=route["route_label"],
|
|
980
|
+
endpoint=route["path"],
|
|
981
|
+
method="GET",
|
|
982
|
+
param_readiness=route["param_readiness"],
|
|
983
|
+
param_reason=route["param_reason"],
|
|
984
|
+
)
|
|
985
|
+
for route in _douyin_profile_route_plan(sec_user_id="", unique_id="", uid="")
|
|
986
|
+
],
|
|
987
|
+
attempted_routes=[],
|
|
988
|
+
chosen_route="",
|
|
989
|
+
accept_reason="",
|
|
990
|
+
fallback_reason="author_id_unresolved",
|
|
991
|
+
error_reason="author_id_unresolved",
|
|
992
|
+
all_routes_failed=False,
|
|
993
|
+
)
|
|
994
|
+
stage_status["posts"] = build_stage_status(
|
|
995
|
+
stage="posts",
|
|
996
|
+
status="skipped",
|
|
997
|
+
route_plan=[
|
|
998
|
+
build_route_plan_entry(
|
|
999
|
+
route_label=route["route_label"],
|
|
1000
|
+
endpoint=route["path"],
|
|
1001
|
+
method="GET",
|
|
1002
|
+
param_readiness=route["param_readiness"],
|
|
1003
|
+
param_reason=route["param_reason"],
|
|
1004
|
+
)
|
|
1005
|
+
for route in _douyin_posts_route_plan(sec_user_id="", cursor=0, count=page_limit, cookie=web_cookie)
|
|
1006
|
+
],
|
|
1007
|
+
attempted_routes=[],
|
|
1008
|
+
chosen_route="",
|
|
1009
|
+
accept_reason="",
|
|
1010
|
+
fallback_reason="author_id_unresolved",
|
|
1011
|
+
error_reason="author_id_unresolved",
|
|
1012
|
+
all_routes_failed=False,
|
|
1013
|
+
)
|
|
1014
|
+
trace.append({"step": "douyin.profile.route_decision", **stage_status["profile"]})
|
|
1015
|
+
trace.append({"step": "douyin.posts.stage_decision", **stage_status["posts"]})
|
|
1016
|
+
request_id = _pick_request_id(request_id_candidates, trace)
|
|
1017
|
+
if progress is not None:
|
|
1018
|
+
progress.done(
|
|
1019
|
+
stage="author_home.collect",
|
|
1020
|
+
message="douyin author homepage collected",
|
|
1021
|
+
data={"works_count": 0, "pages": 0, "request_id": request_id},
|
|
1022
|
+
)
|
|
1023
|
+
return {
|
|
1024
|
+
"platform": "douyin",
|
|
1025
|
+
"resolved_author_id": "",
|
|
1026
|
+
"profile_response": {},
|
|
1027
|
+
"works": [],
|
|
1028
|
+
"pagination": {
|
|
1029
|
+
"sort": "latest",
|
|
1030
|
+
"sort_type": 0,
|
|
1031
|
+
"cursor_mode": "max_cursor",
|
|
1032
|
+
"pages": [],
|
|
1033
|
+
"total_collected": 0,
|
|
1034
|
+
"max_items": max_items,
|
|
1035
|
+
},
|
|
1036
|
+
"extract_trace": trace,
|
|
1037
|
+
"request_id": request_id,
|
|
1038
|
+
"stage_status": stage_status,
|
|
1039
|
+
"error_reason": "author_id_unresolved",
|
|
1040
|
+
}
|
|
1041
|
+
|
|
1042
|
+
profile_resp: Dict[str, Any] = {}
|
|
1043
|
+
profile_attempts: List[Dict[str, Any]] = []
|
|
1044
|
+
profile_reason: str = ""
|
|
1045
|
+
profile_unique_id = ""
|
|
1046
|
+
profile_uid = ""
|
|
1047
|
+
sec_profile_routes = _douyin_profile_route_plan(sec_user_id=sec_user_id, unique_id="", uid="")[:3]
|
|
1048
|
+
extra_profile_routes: List[Dict[str, Any]] = []
|
|
1049
|
+
|
|
1050
|
+
def _run_douyin_profile_route(route: Dict[str, Any], *, fallback_reason: str) -> Tuple[Dict[str, Any], Dict[str, Any]]:
|
|
1051
|
+
response = call_json_api(
|
|
1052
|
+
base_url=base_url,
|
|
1053
|
+
path=str(route["path"]),
|
|
1054
|
+
token=token,
|
|
1055
|
+
method="GET",
|
|
1056
|
+
timeout_ms=timeout_ms,
|
|
1057
|
+
params=dict(route.get("params") or {}),
|
|
1058
|
+
)
|
|
1059
|
+
response["_endpoint"] = route["path"]
|
|
1060
|
+
response["_route_label"] = route["route_label"]
|
|
1061
|
+
if fallback_reason:
|
|
1062
|
+
response["fallback_trigger_reason"] = fallback_reason
|
|
1063
|
+
response["_field_completeness"] = _douyin_profile_field_completeness(response.get("data"), sec_user_id) if response.get("ok") else {
|
|
1064
|
+
"fields": {},
|
|
1065
|
+
"filled_count": 0,
|
|
1066
|
+
"total_fields": 0,
|
|
1067
|
+
"ratio": 0.0,
|
|
1068
|
+
"missing_core": [],
|
|
1069
|
+
"core_ready": False,
|
|
1070
|
+
}
|
|
1071
|
+
decision = _douyin_profile_accept_decision(response, response.get("_field_completeness") or {})
|
|
1072
|
+
return response, decision
|
|
1073
|
+
|
|
1074
|
+
for route in sec_profile_routes:
|
|
1075
|
+
response, decision = _run_douyin_profile_route(route, fallback_reason=profile_reason)
|
|
1076
|
+
profile_resp = response
|
|
1077
|
+
request_id_candidates.append(response)
|
|
1078
|
+
profile_unique_id = profile_unique_id or _pick_text(response.get("data"), ["unique_id", "short_id", "douyin_id", "display_id"])
|
|
1079
|
+
profile_uid = profile_uid or _pick_text(response.get("data"), ["uid", "user_id", "id"])
|
|
1080
|
+
profile_attempts.append(
|
|
1081
|
+
build_attempted_route(
|
|
1082
|
+
route_label=str(route["route_label"]),
|
|
1083
|
+
endpoint=str(route["path"]),
|
|
1084
|
+
response=response,
|
|
1085
|
+
accepted=bool(decision.get("accepted")),
|
|
1086
|
+
accept_reason=str(decision.get("accept_reason") or ""),
|
|
1087
|
+
fallback_reason=str(decision.get("fallback_reason") or ""),
|
|
1088
|
+
extra={"field_completeness": response.get("_field_completeness")},
|
|
1089
|
+
)
|
|
1090
|
+
)
|
|
1091
|
+
trace.append(
|
|
1092
|
+
build_api_trace(
|
|
1093
|
+
step=str(route["step_name"]),
|
|
1094
|
+
endpoint=str(route["path"]),
|
|
1095
|
+
response=response,
|
|
1096
|
+
extra={
|
|
1097
|
+
"route_label": route["route_label"],
|
|
1098
|
+
"field_completeness": response.get("_field_completeness"),
|
|
1099
|
+
"accept_reason": decision.get("accept_reason"),
|
|
1100
|
+
"route_accepted": bool(decision.get("accepted")),
|
|
1101
|
+
},
|
|
1102
|
+
)
|
|
1103
|
+
)
|
|
1104
|
+
if decision.get("accepted"):
|
|
1105
|
+
profile_resp["_accept_reason"] = decision.get("accept_reason")
|
|
1106
|
+
break
|
|
1107
|
+
profile_reason = str(decision.get("fallback_reason") or "field_completeness_below_threshold")
|
|
1108
|
+
profile_resp["fallback_trigger_reason"] = profile_reason
|
|
1109
|
+
|
|
1110
|
+
if not profile_resp.get("_accept_reason"):
|
|
1111
|
+
extra_profile_routes = _douyin_profile_route_plan(
|
|
1112
|
+
sec_user_id=sec_user_id,
|
|
1113
|
+
unique_id=profile_unique_id,
|
|
1114
|
+
uid=profile_uid,
|
|
1115
|
+
)[3:]
|
|
1116
|
+
for route in extra_profile_routes:
|
|
1117
|
+
if route.get("param_readiness") != "ready":
|
|
1118
|
+
profile_attempts.append(
|
|
1119
|
+
_build_unavailable_attempt(
|
|
1120
|
+
route_label=str(route["route_label"]),
|
|
1121
|
+
endpoint=str(route["path"]),
|
|
1122
|
+
reason=str(route.get("param_reason") or "fallback_param_unavailable"),
|
|
1123
|
+
)
|
|
1124
|
+
)
|
|
1125
|
+
continue
|
|
1126
|
+
response, decision = _run_douyin_profile_route(route, fallback_reason=profile_reason)
|
|
1127
|
+
profile_resp = response
|
|
1128
|
+
request_id_candidates.append(response)
|
|
1129
|
+
profile_attempts.append(
|
|
1130
|
+
build_attempted_route(
|
|
1131
|
+
route_label=str(route["route_label"]),
|
|
1132
|
+
endpoint=str(route["path"]),
|
|
1133
|
+
response=response,
|
|
1134
|
+
accepted=bool(decision.get("accepted")),
|
|
1135
|
+
accept_reason=str(decision.get("accept_reason") or ""),
|
|
1136
|
+
fallback_reason=str(decision.get("fallback_reason") or ""),
|
|
1137
|
+
extra={"field_completeness": response.get("_field_completeness")},
|
|
1138
|
+
)
|
|
1139
|
+
)
|
|
1140
|
+
trace.append(
|
|
1141
|
+
build_api_trace(
|
|
1142
|
+
step=str(route["step_name"]),
|
|
1143
|
+
endpoint=str(route["path"]),
|
|
1144
|
+
response=response,
|
|
1145
|
+
extra={
|
|
1146
|
+
"route_label": route["route_label"],
|
|
1147
|
+
"field_completeness": response.get("_field_completeness"),
|
|
1148
|
+
"accept_reason": decision.get("accept_reason"),
|
|
1149
|
+
"route_accepted": bool(decision.get("accepted")),
|
|
1150
|
+
},
|
|
1151
|
+
)
|
|
1152
|
+
)
|
|
1153
|
+
if decision.get("accepted"):
|
|
1154
|
+
profile_resp["_accept_reason"] = decision.get("accept_reason")
|
|
1155
|
+
break
|
|
1156
|
+
profile_reason = str(decision.get("fallback_reason") or "field_completeness_below_threshold")
|
|
1157
|
+
profile_resp["fallback_trigger_reason"] = profile_reason
|
|
1158
|
+
else:
|
|
1159
|
+
extra_profile_routes = _douyin_profile_route_plan(
|
|
1160
|
+
sec_user_id=sec_user_id,
|
|
1161
|
+
unique_id=profile_unique_id,
|
|
1162
|
+
uid=profile_uid,
|
|
1163
|
+
)[3:]
|
|
1164
|
+
for route in extra_profile_routes:
|
|
1165
|
+
if route.get("param_readiness") != "ready":
|
|
1166
|
+
profile_attempts.append(
|
|
1167
|
+
_build_unavailable_attempt(
|
|
1168
|
+
route_label=str(route["route_label"]),
|
|
1169
|
+
endpoint=str(route["path"]),
|
|
1170
|
+
reason=str(route.get("param_reason") or "fallback_param_unavailable"),
|
|
1171
|
+
)
|
|
1172
|
+
)
|
|
615
1173
|
|
|
616
1174
|
profile_author_id = _pick_text(profile_resp.get("data"), ["sec_user_id", "sec_uid", "secUserId", "uid", "user_id"])
|
|
617
1175
|
resolved_author_id = sec_user_id or profile_author_id
|
|
@@ -627,8 +1185,36 @@ def collect_douyin_author_home_raw(
|
|
|
627
1185
|
"profile_data_type": type(profile_resp.get("data")).__name__,
|
|
628
1186
|
"profile_data_keys": list(profile_resp.get("data").keys())[:8] if isinstance(profile_resp.get("data"), dict) else [],
|
|
629
1187
|
"profile_data_preview": _preview(profile_resp.get("data")),
|
|
630
|
-
|
|
1188
|
+
}
|
|
1189
|
+
)
|
|
1190
|
+
|
|
1191
|
+
profile_route_plan = [
|
|
1192
|
+
build_route_plan_entry(
|
|
1193
|
+
route_label=str(route["route_label"]),
|
|
1194
|
+
endpoint=str(route["path"]),
|
|
1195
|
+
method="GET",
|
|
1196
|
+
param_readiness=str(route.get("param_readiness") or "ready"),
|
|
1197
|
+
param_reason=str(route.get("param_reason") or ""),
|
|
631
1198
|
)
|
|
1199
|
+
for route in _douyin_profile_route_plan(sec_user_id=sec_user_id, unique_id=profile_unique_id, uid=profile_uid)
|
|
1200
|
+
]
|
|
1201
|
+
profile_has_accepted = any(bool(attempt.get("accepted")) for attempt in profile_attempts)
|
|
1202
|
+
profile_has_ok_response = any(bool(attempt.get("ok")) for attempt in profile_attempts if not attempt.get("skipped"))
|
|
1203
|
+
profile_error_reason = None
|
|
1204
|
+
if not profile_has_accepted:
|
|
1205
|
+
profile_error_reason = "profile_contract_incomplete" if profile_has_ok_response else "profile_all_routes_failed"
|
|
1206
|
+
stage_status["profile"] = build_stage_status(
|
|
1207
|
+
stage="profile",
|
|
1208
|
+
status="succeeded" if profile_has_accepted else "failed",
|
|
1209
|
+
route_plan=profile_route_plan,
|
|
1210
|
+
attempted_routes=profile_attempts,
|
|
1211
|
+
chosen_route=str(profile_resp.get("_route_label") or ""),
|
|
1212
|
+
accept_reason=str(profile_resp.get("_accept_reason") or ""),
|
|
1213
|
+
fallback_reason=profile_reason,
|
|
1214
|
+
error_reason=profile_error_reason,
|
|
1215
|
+
all_routes_failed=not profile_has_accepted,
|
|
1216
|
+
)
|
|
1217
|
+
trace.append({"step": "douyin.profile.route_decision", **stage_status["profile"]})
|
|
632
1218
|
|
|
633
1219
|
works: List[Dict[str, Any]] = []
|
|
634
1220
|
seen_ids = set()
|
|
@@ -636,40 +1222,133 @@ def collect_douyin_author_home_raw(
|
|
|
636
1222
|
has_more = True
|
|
637
1223
|
page = 0
|
|
638
1224
|
pagination_trace: List[Dict[str, Any]] = []
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
1225
|
+
posts_attempts_all: List[Dict[str, Any]] = []
|
|
1226
|
+
posts_accepted_routes: List[str] = []
|
|
1227
|
+
posts_error_reason: Optional[str] = None
|
|
642
1228
|
|
|
643
1229
|
while has_more and page < max_pages and len(works) < max_items:
|
|
644
1230
|
page += 1
|
|
645
|
-
posts_resp = call_json_api(
|
|
646
|
-
base_url=base_url,
|
|
647
|
-
path="/api/u1/v1/douyin/app/v3/fetch_user_post_videos",
|
|
648
|
-
token=token,
|
|
649
|
-
method="GET",
|
|
650
|
-
timeout_ms=timeout_ms,
|
|
651
|
-
params={
|
|
652
|
-
"sec_user_id": sec_user_id,
|
|
653
|
-
"count": page_limit,
|
|
654
|
-
"max_cursor": cursor,
|
|
655
|
-
"sort_type": 0,
|
|
656
|
-
},
|
|
657
|
-
)
|
|
658
|
-
trace.append(
|
|
659
|
-
build_api_trace(
|
|
660
|
-
step="douyin.posts_page",
|
|
661
|
-
endpoint="/api/u1/v1/douyin/app/v3/fetch_user_post_videos",
|
|
662
|
-
response=posts_resp,
|
|
663
|
-
extra={"page": page, "cursor": cursor, "sort_type": 0},
|
|
664
|
-
)
|
|
665
|
-
)
|
|
666
1231
|
if progress is not None:
|
|
667
1232
|
progress.progress(
|
|
668
1233
|
stage="author_home.collect.pagination",
|
|
669
1234
|
message="douyin pagination page requested",
|
|
670
1235
|
data={"page": page, "cursor_in": cursor},
|
|
671
1236
|
)
|
|
672
|
-
|
|
1237
|
+
posts_routes = _douyin_posts_route_plan(sec_user_id=sec_user_id, cursor=cursor, count=page_limit, cookie=web_cookie)
|
|
1238
|
+
page_attempts: List[Dict[str, Any]] = []
|
|
1239
|
+
posts_resp: Dict[str, Any] = {}
|
|
1240
|
+
page_reason = ""
|
|
1241
|
+
for route in posts_routes:
|
|
1242
|
+
if route.get("param_readiness") != "ready":
|
|
1243
|
+
skipped_attempt = _build_unavailable_attempt(
|
|
1244
|
+
route_label=str(route["route_label"]),
|
|
1245
|
+
endpoint=str(route["path"]),
|
|
1246
|
+
reason=str(route.get("param_reason") or "fallback_param_unavailable"),
|
|
1247
|
+
extra={"page": page, "cursor_in": cursor},
|
|
1248
|
+
)
|
|
1249
|
+
page_attempts.append(skipped_attempt)
|
|
1250
|
+
posts_attempts_all.append(skipped_attempt)
|
|
1251
|
+
continue
|
|
1252
|
+
posts_resp = call_json_api(
|
|
1253
|
+
base_url=base_url,
|
|
1254
|
+
path=str(route["path"]),
|
|
1255
|
+
token=token,
|
|
1256
|
+
method="GET",
|
|
1257
|
+
timeout_ms=timeout_ms,
|
|
1258
|
+
params=dict(route.get("params") or {}),
|
|
1259
|
+
)
|
|
1260
|
+
posts_resp["_endpoint"] = route["path"]
|
|
1261
|
+
posts_resp["_route_label"] = route["route_label"]
|
|
1262
|
+
if page_reason:
|
|
1263
|
+
posts_resp["fallback_trigger_reason"] = page_reason
|
|
1264
|
+
posts_resp["_field_completeness"] = _douyin_posts_field_completeness(posts_resp.get("data")) if posts_resp.get("ok") else {
|
|
1265
|
+
"fields": {},
|
|
1266
|
+
"filled_count": 0,
|
|
1267
|
+
"total_fields": 0,
|
|
1268
|
+
"ratio": 0.0,
|
|
1269
|
+
"missing_core": [],
|
|
1270
|
+
"core_ready": False,
|
|
1271
|
+
}
|
|
1272
|
+
posts_decision = _douyin_posts_accept_decision(posts_resp, posts_resp.get("_field_completeness") or {})
|
|
1273
|
+
page_attempt = build_attempted_route(
|
|
1274
|
+
route_label=str(route["route_label"]),
|
|
1275
|
+
endpoint=str(route["path"]),
|
|
1276
|
+
response=posts_resp,
|
|
1277
|
+
accepted=bool(posts_decision.get("accepted")),
|
|
1278
|
+
accept_reason=str(posts_decision.get("accept_reason") or ""),
|
|
1279
|
+
fallback_reason=str(posts_decision.get("fallback_reason") or ""),
|
|
1280
|
+
extra={
|
|
1281
|
+
"page": page,
|
|
1282
|
+
"cursor_in": cursor,
|
|
1283
|
+
"field_completeness": posts_resp.get("_field_completeness"),
|
|
1284
|
+
},
|
|
1285
|
+
)
|
|
1286
|
+
page_attempts.append(page_attempt)
|
|
1287
|
+
posts_attempts_all.append(page_attempt)
|
|
1288
|
+
trace.append(
|
|
1289
|
+
build_api_trace(
|
|
1290
|
+
step=str(route["step_name"]),
|
|
1291
|
+
endpoint=str(route["path"]),
|
|
1292
|
+
response=posts_resp,
|
|
1293
|
+
extra={
|
|
1294
|
+
"page": page,
|
|
1295
|
+
"cursor": cursor,
|
|
1296
|
+
"route_label": route["route_label"],
|
|
1297
|
+
"field_completeness": posts_resp.get("_field_completeness"),
|
|
1298
|
+
"accept_reason": posts_decision.get("accept_reason"),
|
|
1299
|
+
"route_accepted": bool(posts_decision.get("accepted")),
|
|
1300
|
+
},
|
|
1301
|
+
)
|
|
1302
|
+
)
|
|
1303
|
+
request_id_candidates.append(posts_resp)
|
|
1304
|
+
if posts_decision.get("accepted"):
|
|
1305
|
+
posts_resp["_accept_reason"] = posts_decision.get("accept_reason")
|
|
1306
|
+
posts_accepted_routes.append(str(route["route_label"]))
|
|
1307
|
+
break
|
|
1308
|
+
page_reason = str(posts_decision.get("fallback_reason") or "field_completeness_below_threshold")
|
|
1309
|
+
posts_resp["fallback_trigger_reason"] = page_reason
|
|
1310
|
+
|
|
1311
|
+
page_route_plan = [
|
|
1312
|
+
build_route_plan_entry(
|
|
1313
|
+
route_label=str(route["route_label"]),
|
|
1314
|
+
endpoint=str(route["path"]),
|
|
1315
|
+
method="GET",
|
|
1316
|
+
param_readiness=str(route.get("param_readiness") or "ready"),
|
|
1317
|
+
param_reason=str(route.get("param_reason") or ""),
|
|
1318
|
+
)
|
|
1319
|
+
for route in posts_routes
|
|
1320
|
+
]
|
|
1321
|
+
trace.append(
|
|
1322
|
+
{
|
|
1323
|
+
"step": "douyin.posts.route_decision",
|
|
1324
|
+
"page": page,
|
|
1325
|
+
"cursor_in": cursor,
|
|
1326
|
+
"route_plan": page_route_plan,
|
|
1327
|
+
"chosen_route": posts_resp.get("_route_label"),
|
|
1328
|
+
"request_id": posts_resp.get("request_id"),
|
|
1329
|
+
"field_completeness": posts_resp.get("_field_completeness"),
|
|
1330
|
+
"accept_reason": posts_resp.get("_accept_reason"),
|
|
1331
|
+
"fallback_reason": page_reason,
|
|
1332
|
+
"attempted_routes": page_attempts,
|
|
1333
|
+
"all_routes_failed": not any(bool(attempt.get("accepted")) for attempt in page_attempts),
|
|
1334
|
+
}
|
|
1335
|
+
)
|
|
1336
|
+
|
|
1337
|
+
if not posts_resp.get("_accept_reason"):
|
|
1338
|
+
posts_error_reason = "posts_contract_incomplete" if any(bool(attempt.get("ok")) for attempt in page_attempts if not attempt.get("skipped")) else "posts_all_routes_failed"
|
|
1339
|
+
pagination_trace.append(
|
|
1340
|
+
{
|
|
1341
|
+
"page": page,
|
|
1342
|
+
"cursor_in": cursor,
|
|
1343
|
+
"cursor_out": None,
|
|
1344
|
+
"has_more_raw": None,
|
|
1345
|
+
"has_more_normalized": None,
|
|
1346
|
+
"items": 0,
|
|
1347
|
+
"stop_reason": posts_error_reason,
|
|
1348
|
+
}
|
|
1349
|
+
)
|
|
1350
|
+
break
|
|
1351
|
+
|
|
673
1352
|
response_payload = posts_resp.get("data")
|
|
674
1353
|
page_items = _extract_douyin_posts_items(response_payload)
|
|
675
1354
|
|
|
@@ -729,6 +1408,32 @@ def collect_douyin_author_home_raw(
|
|
|
729
1408
|
if should_continue and next_cursor is not None:
|
|
730
1409
|
cursor = next_cursor
|
|
731
1410
|
|
|
1411
|
+
posts_route_plan = [
|
|
1412
|
+
build_route_plan_entry(
|
|
1413
|
+
route_label=str(route["route_label"]),
|
|
1414
|
+
endpoint=str(route["path"]),
|
|
1415
|
+
method="GET",
|
|
1416
|
+
param_readiness=str(route.get("param_readiness") or "ready"),
|
|
1417
|
+
param_reason=str(route.get("param_reason") or ""),
|
|
1418
|
+
)
|
|
1419
|
+
for route in _douyin_posts_route_plan(sec_user_id=sec_user_id, cursor=0, count=page_limit, cookie=web_cookie)
|
|
1420
|
+
]
|
|
1421
|
+
posts_has_accepted = bool(posts_accepted_routes)
|
|
1422
|
+
if not posts_has_accepted and posts_error_reason is None:
|
|
1423
|
+
posts_error_reason = "posts_all_routes_failed"
|
|
1424
|
+
stage_status["posts"] = build_stage_status(
|
|
1425
|
+
stage="posts",
|
|
1426
|
+
status="succeeded" if posts_has_accepted else "failed",
|
|
1427
|
+
route_plan=posts_route_plan,
|
|
1428
|
+
attempted_routes=posts_attempts_all,
|
|
1429
|
+
chosen_route=posts_accepted_routes[0] if len(set(posts_accepted_routes)) == 1 and posts_accepted_routes else ("mixed" if posts_accepted_routes else ""),
|
|
1430
|
+
accept_reason="posts_pages_collected" if posts_has_accepted else "",
|
|
1431
|
+
fallback_reason=posts_error_reason or "",
|
|
1432
|
+
error_reason=None if posts_has_accepted else posts_error_reason,
|
|
1433
|
+
all_routes_failed=not posts_has_accepted,
|
|
1434
|
+
)
|
|
1435
|
+
trace.append({"step": "douyin.posts.stage_decision", **stage_status["posts"]})
|
|
1436
|
+
|
|
732
1437
|
request_id = _pick_request_id(request_id_candidates, trace)
|
|
733
1438
|
if progress is not None:
|
|
734
1439
|
progress.done(
|
|
@@ -737,6 +1442,14 @@ def collect_douyin_author_home_raw(
|
|
|
737
1442
|
data={"works_count": len(works), "pages": len(pagination_trace), "request_id": request_id},
|
|
738
1443
|
)
|
|
739
1444
|
|
|
1445
|
+
collect_error_reason: Optional[str] = None
|
|
1446
|
+
if stage_status.get("resolver", {}).get("status") == "failed":
|
|
1447
|
+
collect_error_reason = str(stage_status["resolver"].get("error_reason") or "author_id_unresolved")
|
|
1448
|
+
elif not works and stage_status.get("posts", {}).get("status") == "failed":
|
|
1449
|
+
collect_error_reason = stage_status["posts"].get("error_reason")
|
|
1450
|
+
elif stage_status.get("profile", {}).get("status") == "failed":
|
|
1451
|
+
collect_error_reason = stage_status["profile"].get("error_reason")
|
|
1452
|
+
|
|
740
1453
|
return {
|
|
741
1454
|
"platform": "douyin",
|
|
742
1455
|
"resolved_author_id": resolved_author_id,
|
|
@@ -752,6 +1465,8 @@ def collect_douyin_author_home_raw(
|
|
|
752
1465
|
},
|
|
753
1466
|
"extract_trace": trace,
|
|
754
1467
|
"request_id": request_id,
|
|
1468
|
+
"stage_status": stage_status,
|
|
1469
|
+
"error_reason": collect_error_reason,
|
|
755
1470
|
}
|
|
756
1471
|
|
|
757
1472
|
|
|
@@ -767,20 +1482,63 @@ def collect_xhs_author_home_raw(
|
|
|
767
1482
|
progress: Optional[ProgressReporter] = None,
|
|
768
1483
|
) -> Dict[str, Any]:
|
|
769
1484
|
trace: List[Dict[str, Any]] = []
|
|
1485
|
+
stage_status: Dict[str, Any] = {}
|
|
770
1486
|
if progress is not None:
|
|
771
1487
|
progress.started(stage="author_home.collect", message="collecting xiaohongshu author homepage")
|
|
772
1488
|
user_id, xsec_token = _guess_xhs_ids(input_value)
|
|
773
1489
|
resolve_resp: Optional[Dict[str, Any]] = None
|
|
774
1490
|
request_id_candidates: List[Optional[Dict[str, Any]]] = []
|
|
775
|
-
|
|
776
|
-
|
|
1491
|
+
max_pages = max(pages_max, 1)
|
|
1492
|
+
page_limit = min(max(page_size, 1), 20)
|
|
1493
|
+
resolver_route_plan = [
|
|
1494
|
+
build_route_plan_entry(route_label="local_extract", endpoint=None, method="LOCAL"),
|
|
1495
|
+
build_route_plan_entry(
|
|
1496
|
+
route_label="app",
|
|
1497
|
+
endpoint="/api/u1/v1/xiaohongshu/app/get_user_id_and_xsec_token",
|
|
1498
|
+
method="GET",
|
|
1499
|
+
),
|
|
1500
|
+
]
|
|
1501
|
+
resolver_attempts: List[Dict[str, Any]] = []
|
|
1502
|
+
|
|
1503
|
+
if user_id:
|
|
1504
|
+
resolver_attempts.append(
|
|
1505
|
+
build_attempted_route(
|
|
1506
|
+
route_label="local_extract",
|
|
1507
|
+
endpoint=None,
|
|
1508
|
+
accepted=True,
|
|
1509
|
+
accept_reason="author_id_ready",
|
|
1510
|
+
param_readiness="ready",
|
|
1511
|
+
extra={"resolved_author_id": user_id},
|
|
1512
|
+
)
|
|
1513
|
+
)
|
|
1514
|
+
trace.append(
|
|
1515
|
+
{
|
|
1516
|
+
"step": "xhs.resolve_user_id.local",
|
|
1517
|
+
"route_label": "local_extract",
|
|
1518
|
+
"ok": True,
|
|
1519
|
+
"resolved_author_id": user_id,
|
|
1520
|
+
"accept_reason": "author_id_ready",
|
|
1521
|
+
}
|
|
1522
|
+
)
|
|
1523
|
+
stage_status["resolver"] = build_stage_status(
|
|
1524
|
+
stage="resolver",
|
|
1525
|
+
status="succeeded",
|
|
1526
|
+
route_plan=resolver_route_plan,
|
|
1527
|
+
attempted_routes=resolver_attempts,
|
|
1528
|
+
chosen_route="local_extract",
|
|
1529
|
+
accept_reason="author_id_ready",
|
|
1530
|
+
fallback_reason="",
|
|
1531
|
+
error_reason=None,
|
|
1532
|
+
all_routes_failed=False,
|
|
1533
|
+
)
|
|
1534
|
+
else:
|
|
777
1535
|
resolve_resp = call_json_api(
|
|
778
1536
|
base_url=base_url,
|
|
779
1537
|
path="/api/u1/v1/xiaohongshu/app/get_user_id_and_xsec_token",
|
|
780
1538
|
token=token,
|
|
781
1539
|
method="GET",
|
|
782
1540
|
timeout_ms=timeout_ms,
|
|
783
|
-
params={"share_link": input_value
|
|
1541
|
+
params={"share_link": input_value},
|
|
784
1542
|
)
|
|
785
1543
|
trace.append(
|
|
786
1544
|
build_api_trace(
|
|
@@ -794,41 +1552,154 @@ def collect_xhs_author_home_raw(
|
|
|
794
1552
|
user_id = _pick_text(data, ["user_id", "userid", "uid"])
|
|
795
1553
|
if not xsec_token:
|
|
796
1554
|
xsec_token = _pick_text(data, ["xsec_token", "xsecToken"])
|
|
1555
|
+
resolver_attempts.append(
|
|
1556
|
+
build_attempted_route(
|
|
1557
|
+
route_label="app",
|
|
1558
|
+
endpoint="/api/u1/v1/xiaohongshu/app/get_user_id_and_xsec_token",
|
|
1559
|
+
response=resolve_resp,
|
|
1560
|
+
accepted=bool(user_id),
|
|
1561
|
+
accept_reason="author_id_ready" if user_id else "author_id_unresolved",
|
|
1562
|
+
fallback_reason="" if user_id else "resolver_fallback_unavailable",
|
|
1563
|
+
extra={"resolved_author_id": user_id},
|
|
1564
|
+
)
|
|
1565
|
+
)
|
|
1566
|
+
stage_status["resolver"] = build_stage_status(
|
|
1567
|
+
stage="resolver",
|
|
1568
|
+
status="succeeded" if user_id else "failed",
|
|
1569
|
+
route_plan=resolver_route_plan,
|
|
1570
|
+
attempted_routes=resolver_attempts,
|
|
1571
|
+
chosen_route="app",
|
|
1572
|
+
accept_reason="author_id_ready" if user_id else "author_id_unresolved",
|
|
1573
|
+
fallback_reason="" if user_id else "resolver_fallback_unavailable",
|
|
1574
|
+
error_reason=None if user_id else "author_id_unresolved",
|
|
1575
|
+
all_routes_failed=not bool(user_id),
|
|
1576
|
+
)
|
|
1577
|
+
|
|
1578
|
+
trace.append({"step": "xhs.resolver.route_decision", **stage_status["resolver"]})
|
|
1579
|
+
|
|
1580
|
+
if not user_id:
|
|
1581
|
+
stage_status["profile"] = build_stage_status(
|
|
1582
|
+
stage="profile",
|
|
1583
|
+
status="skipped",
|
|
1584
|
+
route_plan=[
|
|
1585
|
+
build_route_plan_entry(
|
|
1586
|
+
route_label=route["route_label"],
|
|
1587
|
+
endpoint=route["path"],
|
|
1588
|
+
method="GET",
|
|
1589
|
+
param_readiness=route["param_readiness"],
|
|
1590
|
+
param_reason=route["param_reason"],
|
|
1591
|
+
)
|
|
1592
|
+
for route in _xhs_route_plan("profile", user_id="", input_value=input_value)
|
|
1593
|
+
],
|
|
1594
|
+
attempted_routes=[],
|
|
1595
|
+
chosen_route="",
|
|
1596
|
+
accept_reason="",
|
|
1597
|
+
fallback_reason="author_id_unresolved",
|
|
1598
|
+
error_reason="author_id_unresolved",
|
|
1599
|
+
all_routes_failed=False,
|
|
1600
|
+
)
|
|
1601
|
+
stage_status["posts"] = build_stage_status(
|
|
1602
|
+
stage="posts",
|
|
1603
|
+
status="skipped",
|
|
1604
|
+
route_plan=[
|
|
1605
|
+
build_route_plan_entry(
|
|
1606
|
+
route_label=route["route_label"],
|
|
1607
|
+
endpoint=route["path"],
|
|
1608
|
+
method="GET",
|
|
1609
|
+
param_readiness=route["param_readiness"],
|
|
1610
|
+
param_reason=route["param_reason"],
|
|
1611
|
+
)
|
|
1612
|
+
for route in _xhs_route_plan("posts", user_id="", input_value=input_value, cursor="")
|
|
1613
|
+
],
|
|
1614
|
+
attempted_routes=[],
|
|
1615
|
+
chosen_route="",
|
|
1616
|
+
accept_reason="",
|
|
1617
|
+
fallback_reason="author_id_unresolved",
|
|
1618
|
+
error_reason="author_id_unresolved",
|
|
1619
|
+
all_routes_failed=False,
|
|
1620
|
+
)
|
|
1621
|
+
trace.append({"step": "xhs.profile.route_decision", **stage_status["profile"]})
|
|
1622
|
+
trace.append({"step": "xhs.posts.stage_decision", **stage_status["posts"]})
|
|
1623
|
+
request_id = _pick_request_id(request_id_candidates, trace)
|
|
1624
|
+
if progress is not None:
|
|
1625
|
+
progress.done(
|
|
1626
|
+
stage="author_home.collect",
|
|
1627
|
+
message="xiaohongshu author homepage collected",
|
|
1628
|
+
data={"works_count": 0, "pages": 0, "request_id": request_id},
|
|
1629
|
+
)
|
|
1630
|
+
return {
|
|
1631
|
+
"platform": "xiaohongshu",
|
|
1632
|
+
"resolved_author_id": "",
|
|
1633
|
+
"resolved_xsec_token": xsec_token,
|
|
1634
|
+
"profile_response": {},
|
|
1635
|
+
"works": [],
|
|
1636
|
+
"pagination": {
|
|
1637
|
+
"sort": "latest",
|
|
1638
|
+
"sort_type": "latest",
|
|
1639
|
+
"cursor_mode": "cursor",
|
|
1640
|
+
"pages": [],
|
|
1641
|
+
"total_collected": 0,
|
|
1642
|
+
"max_items": max_items,
|
|
1643
|
+
},
|
|
1644
|
+
"extract_trace": trace,
|
|
1645
|
+
"request_id": request_id,
|
|
1646
|
+
"stage_status": stage_status,
|
|
1647
|
+
"error_reason": "author_id_unresolved",
|
|
1648
|
+
}
|
|
797
1649
|
|
|
798
|
-
profile_routes = _xhs_route_plan("profile")
|
|
1650
|
+
profile_routes = _xhs_route_plan("profile", user_id=user_id, input_value=input_value)
|
|
1651
|
+
profile_route_plan = [
|
|
1652
|
+
build_route_plan_entry(
|
|
1653
|
+
route_label=str(route["route_label"]),
|
|
1654
|
+
endpoint=str(route["path"]),
|
|
1655
|
+
method="GET",
|
|
1656
|
+
param_readiness=str(route.get("param_readiness") or "ready"),
|
|
1657
|
+
param_reason=str(route.get("param_reason") or ""),
|
|
1658
|
+
)
|
|
1659
|
+
for route in profile_routes
|
|
1660
|
+
]
|
|
799
1661
|
profile_resp: Dict[str, Any] = {}
|
|
800
1662
|
profile_reason: Optional[str] = None
|
|
801
1663
|
profile_attempts: List[Dict[str, Any]] = []
|
|
802
|
-
for
|
|
1664
|
+
for route in profile_routes:
|
|
1665
|
+
if route.get("param_readiness") != "ready":
|
|
1666
|
+
profile_attempts.append(
|
|
1667
|
+
_build_unavailable_attempt(
|
|
1668
|
+
route_label=str(route["route_label"]),
|
|
1669
|
+
endpoint=str(route["path"]),
|
|
1670
|
+
reason=str(route.get("param_reason") or "fallback_param_unavailable"),
|
|
1671
|
+
)
|
|
1672
|
+
)
|
|
1673
|
+
continue
|
|
803
1674
|
profile_resp = _call_xhs_route(
|
|
804
1675
|
base_url=base_url,
|
|
805
1676
|
token=token,
|
|
806
1677
|
timeout_ms=timeout_ms,
|
|
807
|
-
path=path,
|
|
808
|
-
route_label=route_label,
|
|
809
|
-
params=
|
|
1678
|
+
path=str(route["path"]),
|
|
1679
|
+
route_label=str(route["route_label"]),
|
|
1680
|
+
params=dict(route.get("params") or {}),
|
|
810
1681
|
fallback_reason=profile_reason,
|
|
811
1682
|
completeness_builder=lambda data, resolved_author_id=user_id: _xhs_profile_field_completeness(data, resolved_author_id),
|
|
812
1683
|
)
|
|
813
1684
|
profile_decision = _xhs_profile_accept_decision(profile_resp, profile_resp.get("_field_completeness") or {})
|
|
814
1685
|
profile_attempts.append(
|
|
815
|
-
|
|
816
|
-
"route_label"
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
"
|
|
823
|
-
|
|
1686
|
+
build_attempted_route(
|
|
1687
|
+
route_label=str(route["route_label"]),
|
|
1688
|
+
endpoint=str(route["path"]),
|
|
1689
|
+
response=profile_resp,
|
|
1690
|
+
accepted=bool(profile_decision.get("accepted")),
|
|
1691
|
+
accept_reason=str(profile_decision.get("accept_reason") or ""),
|
|
1692
|
+
fallback_reason=str(profile_decision.get("fallback_reason") or ""),
|
|
1693
|
+
extra={"field_completeness": profile_resp.get("_field_completeness")},
|
|
1694
|
+
)
|
|
824
1695
|
)
|
|
825
1696
|
trace.append(
|
|
826
1697
|
build_api_trace(
|
|
827
|
-
step=step_name,
|
|
828
|
-
endpoint=path,
|
|
1698
|
+
step=str(route["step_name"]),
|
|
1699
|
+
endpoint=str(route["path"]),
|
|
829
1700
|
response=profile_resp,
|
|
830
1701
|
extra={
|
|
831
|
-
"route_label": route_label,
|
|
1702
|
+
"route_label": route["route_label"],
|
|
832
1703
|
"field_completeness": profile_resp.get("_field_completeness"),
|
|
833
1704
|
"accept_reason": profile_decision.get("accept_reason"),
|
|
834
1705
|
"route_accepted": bool(profile_decision.get("accepted")),
|
|
@@ -845,14 +1716,32 @@ def collect_xhs_author_home_raw(
|
|
|
845
1716
|
trace.append(
|
|
846
1717
|
{
|
|
847
1718
|
"step": "xhs.profile.route_decision",
|
|
1719
|
+
"route_plan": profile_route_plan,
|
|
848
1720
|
"chosen_route": profile_resp.get("_route_label"),
|
|
849
1721
|
"request_id": profile_resp.get("request_id"),
|
|
850
1722
|
"field_completeness": profile_resp.get("_field_completeness"),
|
|
851
1723
|
"accept_reason": profile_resp.get("_accept_reason"),
|
|
852
1724
|
"fallback_reason": profile_reason,
|
|
853
1725
|
"attempted_routes": profile_attempts,
|
|
1726
|
+
"all_routes_failed": not any(bool(attempt.get("accepted")) for attempt in profile_attempts),
|
|
854
1727
|
}
|
|
855
1728
|
)
|
|
1729
|
+
profile_has_accepted = any(bool(attempt.get("accepted")) for attempt in profile_attempts)
|
|
1730
|
+
profile_has_ok_response = any(bool(attempt.get("ok")) for attempt in profile_attempts if not attempt.get("skipped"))
|
|
1731
|
+
profile_error_reason = None
|
|
1732
|
+
if not profile_has_accepted:
|
|
1733
|
+
profile_error_reason = "profile_contract_incomplete" if profile_has_ok_response else "profile_all_routes_failed"
|
|
1734
|
+
stage_status["profile"] = build_stage_status(
|
|
1735
|
+
stage="profile",
|
|
1736
|
+
status="succeeded" if profile_has_accepted else "failed",
|
|
1737
|
+
route_plan=profile_route_plan,
|
|
1738
|
+
attempted_routes=profile_attempts,
|
|
1739
|
+
chosen_route=str(profile_resp.get("_route_label") or ""),
|
|
1740
|
+
accept_reason=str(profile_resp.get("_accept_reason") or ""),
|
|
1741
|
+
fallback_reason=str(profile_reason or ""),
|
|
1742
|
+
error_reason=profile_error_reason,
|
|
1743
|
+
all_routes_failed=not profile_has_accepted,
|
|
1744
|
+
)
|
|
856
1745
|
|
|
857
1746
|
works: List[Dict[str, Any]] = []
|
|
858
1747
|
seen_ids = set()
|
|
@@ -860,9 +1749,9 @@ def collect_xhs_author_home_raw(
|
|
|
860
1749
|
has_more = True
|
|
861
1750
|
page = 0
|
|
862
1751
|
pagination_trace: List[Dict[str, Any]] = []
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
1752
|
+
posts_attempts_all: List[Dict[str, Any]] = []
|
|
1753
|
+
posts_accepted_routes: List[str] = []
|
|
1754
|
+
posts_error_reason: Optional[str] = None
|
|
866
1755
|
|
|
867
1756
|
while has_more and page < max_pages and len(works) < max_items:
|
|
868
1757
|
page += 1
|
|
@@ -872,48 +1761,66 @@ def collect_xhs_author_home_raw(
|
|
|
872
1761
|
message="xiaohongshu pagination page requested",
|
|
873
1762
|
data={"page": page, "cursor_in": cursor},
|
|
874
1763
|
)
|
|
875
|
-
posts_routes = _xhs_route_plan("posts")
|
|
1764
|
+
posts_routes = _xhs_route_plan("posts", user_id=user_id, input_value=input_value, cursor=cursor)
|
|
876
1765
|
posts_resp: Dict[str, Any] = {}
|
|
877
1766
|
posts_reason: Optional[str] = None
|
|
878
1767
|
posts_attempts: List[Dict[str, Any]] = []
|
|
879
|
-
|
|
1768
|
+
page_route_plan = [
|
|
1769
|
+
build_route_plan_entry(
|
|
1770
|
+
route_label=str(route["route_label"]),
|
|
1771
|
+
endpoint=str(route["path"]),
|
|
1772
|
+
method="GET",
|
|
1773
|
+
param_readiness=str(route.get("param_readiness") or "ready"),
|
|
1774
|
+
param_reason=str(route.get("param_reason") or ""),
|
|
1775
|
+
)
|
|
1776
|
+
for route in posts_routes
|
|
1777
|
+
]
|
|
1778
|
+
for route in posts_routes:
|
|
1779
|
+
if route.get("param_readiness") != "ready":
|
|
1780
|
+
skipped_attempt = _build_unavailable_attempt(
|
|
1781
|
+
route_label=str(route["route_label"]),
|
|
1782
|
+
endpoint=str(route["path"]),
|
|
1783
|
+
reason=str(route.get("param_reason") or "fallback_param_unavailable"),
|
|
1784
|
+
extra={"page": page, "cursor_in": cursor},
|
|
1785
|
+
)
|
|
1786
|
+
posts_attempts.append(skipped_attempt)
|
|
1787
|
+
posts_attempts_all.append(skipped_attempt)
|
|
1788
|
+
continue
|
|
880
1789
|
posts_resp = _call_xhs_route(
|
|
881
1790
|
base_url=base_url,
|
|
882
1791
|
token=token,
|
|
883
1792
|
timeout_ms=timeout_ms,
|
|
884
|
-
path=path,
|
|
885
|
-
route_label=route_label,
|
|
886
|
-
params={
|
|
887
|
-
"user_id": user_id,
|
|
888
|
-
"share_text": input_value,
|
|
889
|
-
"cursor": cursor or None,
|
|
890
|
-
"num": page_limit,
|
|
891
|
-
"xsec_token": xsec_token or None,
|
|
892
|
-
},
|
|
1793
|
+
path=str(route["path"]),
|
|
1794
|
+
route_label=str(route["route_label"]),
|
|
1795
|
+
params=dict(route.get("params") or {}),
|
|
893
1796
|
fallback_reason=posts_reason,
|
|
894
1797
|
completeness_builder=_xhs_posts_field_completeness,
|
|
895
1798
|
)
|
|
896
1799
|
posts_decision = _xhs_posts_accept_decision(posts_resp, posts_resp.get("_field_completeness") or {})
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
1800
|
+
posts_attempt = build_attempted_route(
|
|
1801
|
+
route_label=str(route["route_label"]),
|
|
1802
|
+
endpoint=str(route["path"]),
|
|
1803
|
+
response=posts_resp,
|
|
1804
|
+
accepted=bool(posts_decision.get("accepted")),
|
|
1805
|
+
accept_reason=str(posts_decision.get("accept_reason") or ""),
|
|
1806
|
+
fallback_reason=str(posts_decision.get("fallback_reason") or ""),
|
|
1807
|
+
extra={
|
|
1808
|
+
"page": page,
|
|
1809
|
+
"cursor_in": cursor,
|
|
904
1810
|
"field_completeness": posts_resp.get("_field_completeness"),
|
|
905
|
-
|
|
906
|
-
}
|
|
1811
|
+
},
|
|
907
1812
|
)
|
|
1813
|
+
posts_attempts.append(posts_attempt)
|
|
1814
|
+
posts_attempts_all.append(posts_attempt)
|
|
908
1815
|
trace.append(
|
|
909
1816
|
build_api_trace(
|
|
910
|
-
step=step_name,
|
|
911
|
-
endpoint=path,
|
|
1817
|
+
step=str(route["step_name"]),
|
|
1818
|
+
endpoint=str(route["path"]),
|
|
912
1819
|
response=posts_resp,
|
|
913
1820
|
extra={
|
|
914
1821
|
"page": page,
|
|
915
1822
|
"cursor": cursor,
|
|
916
|
-
"route_label": route_label,
|
|
1823
|
+
"route_label": route["route_label"],
|
|
917
1824
|
"field_completeness": posts_resp.get("_field_completeness"),
|
|
918
1825
|
"accept_reason": posts_decision.get("accept_reason"),
|
|
919
1826
|
"route_accepted": bool(posts_decision.get("accepted")),
|
|
@@ -923,6 +1830,7 @@ def collect_xhs_author_home_raw(
|
|
|
923
1830
|
request_id_candidates.append(posts_resp)
|
|
924
1831
|
if posts_decision.get("accepted"):
|
|
925
1832
|
posts_resp["_accept_reason"] = posts_decision.get("accept_reason")
|
|
1833
|
+
posts_accepted_routes.append(str(route["route_label"]))
|
|
926
1834
|
break
|
|
927
1835
|
posts_reason = str(posts_decision.get("fallback_reason") or "field_completeness_below_threshold")
|
|
928
1836
|
posts_resp["fallback_trigger_reason"] = posts_reason
|
|
@@ -932,15 +1840,35 @@ def collect_xhs_author_home_raw(
|
|
|
932
1840
|
"step": "xhs.posts.route_decision",
|
|
933
1841
|
"page": page,
|
|
934
1842
|
"cursor_in": cursor,
|
|
1843
|
+
"route_plan": page_route_plan,
|
|
935
1844
|
"chosen_route": posts_resp.get("_route_label"),
|
|
936
1845
|
"request_id": posts_resp.get("request_id"),
|
|
937
1846
|
"field_completeness": posts_resp.get("_field_completeness"),
|
|
938
1847
|
"accept_reason": posts_resp.get("_accept_reason"),
|
|
939
1848
|
"fallback_reason": posts_reason,
|
|
940
1849
|
"attempted_routes": posts_attempts,
|
|
1850
|
+
"all_routes_failed": not any(bool(attempt.get("accepted")) for attempt in posts_attempts),
|
|
941
1851
|
}
|
|
942
1852
|
)
|
|
943
1853
|
|
|
1854
|
+
if not posts_resp.get("_accept_reason"):
|
|
1855
|
+
posts_error_reason = "posts_contract_incomplete" if any(bool(attempt.get("ok")) for attempt in posts_attempts if not attempt.get("skipped")) else "posts_all_routes_failed"
|
|
1856
|
+
pagination_trace.append(
|
|
1857
|
+
{
|
|
1858
|
+
"page": page,
|
|
1859
|
+
"cursor_in": cursor,
|
|
1860
|
+
"cursor_out": "",
|
|
1861
|
+
"cursor_source": "missing",
|
|
1862
|
+
"has_more_raw": None,
|
|
1863
|
+
"has_more_normalized": None,
|
|
1864
|
+
"items": 0,
|
|
1865
|
+
"route_label": posts_resp.get("_route_label"),
|
|
1866
|
+
"request_id": posts_resp.get("request_id"),
|
|
1867
|
+
"stop_reason": posts_error_reason,
|
|
1868
|
+
}
|
|
1869
|
+
)
|
|
1870
|
+
break
|
|
1871
|
+
|
|
944
1872
|
data = posts_resp.get("data")
|
|
945
1873
|
page_items = _extract_xhs_posts_items(data)
|
|
946
1874
|
next_cursor_raw = _extract_xhs_response_cursor(data)
|
|
@@ -1005,6 +1933,32 @@ def collect_xhs_author_home_raw(
|
|
|
1005
1933
|
if should_continue and next_cursor:
|
|
1006
1934
|
cursor = next_cursor
|
|
1007
1935
|
|
|
1936
|
+
posts_route_plan = [
|
|
1937
|
+
build_route_plan_entry(
|
|
1938
|
+
route_label=str(route["route_label"]),
|
|
1939
|
+
endpoint=str(route["path"]),
|
|
1940
|
+
method="GET",
|
|
1941
|
+
param_readiness=str(route.get("param_readiness") or "ready"),
|
|
1942
|
+
param_reason=str(route.get("param_reason") or ""),
|
|
1943
|
+
)
|
|
1944
|
+
for route in _xhs_route_plan("posts", user_id=user_id, input_value=input_value, cursor="")
|
|
1945
|
+
]
|
|
1946
|
+
posts_has_accepted = bool(posts_accepted_routes)
|
|
1947
|
+
if not posts_has_accepted and posts_error_reason is None:
|
|
1948
|
+
posts_error_reason = "posts_all_routes_failed"
|
|
1949
|
+
stage_status["posts"] = build_stage_status(
|
|
1950
|
+
stage="posts",
|
|
1951
|
+
status="succeeded" if posts_has_accepted else "failed",
|
|
1952
|
+
route_plan=posts_route_plan,
|
|
1953
|
+
attempted_routes=posts_attempts_all,
|
|
1954
|
+
chosen_route=posts_accepted_routes[0] if len(set(posts_accepted_routes)) == 1 and posts_accepted_routes else ("mixed" if posts_accepted_routes else ""),
|
|
1955
|
+
accept_reason="posts_pages_collected" if posts_has_accepted else "",
|
|
1956
|
+
fallback_reason=str(posts_error_reason or ""),
|
|
1957
|
+
error_reason=None if posts_has_accepted else posts_error_reason,
|
|
1958
|
+
all_routes_failed=not posts_has_accepted,
|
|
1959
|
+
)
|
|
1960
|
+
trace.append({"step": "xhs.posts.stage_decision", **stage_status["posts"]})
|
|
1961
|
+
|
|
1008
1962
|
request_id = _pick_request_id(request_id_candidates, trace)
|
|
1009
1963
|
if progress is not None:
|
|
1010
1964
|
progress.done(
|
|
@@ -1013,6 +1967,14 @@ def collect_xhs_author_home_raw(
|
|
|
1013
1967
|
data={"works_count": len(works), "pages": len(pagination_trace), "request_id": request_id},
|
|
1014
1968
|
)
|
|
1015
1969
|
|
|
1970
|
+
collect_error_reason: Optional[str] = None
|
|
1971
|
+
if stage_status.get("resolver", {}).get("status") == "failed":
|
|
1972
|
+
collect_error_reason = str(stage_status["resolver"].get("error_reason") or "author_id_unresolved")
|
|
1973
|
+
elif not works and stage_status.get("posts", {}).get("status") == "failed":
|
|
1974
|
+
collect_error_reason = stage_status["posts"].get("error_reason")
|
|
1975
|
+
elif stage_status.get("profile", {}).get("status") == "failed":
|
|
1976
|
+
collect_error_reason = stage_status["profile"].get("error_reason")
|
|
1977
|
+
|
|
1016
1978
|
return {
|
|
1017
1979
|
"platform": "xiaohongshu",
|
|
1018
1980
|
"resolved_author_id": user_id,
|
|
@@ -1029,4 +1991,6 @@ def collect_xhs_author_home_raw(
|
|
|
1029
1991
|
},
|
|
1030
1992
|
"extract_trace": trace,
|
|
1031
1993
|
"request_id": request_id,
|
|
1994
|
+
"stage_status": stage_status,
|
|
1995
|
+
"error_reason": collect_error_reason,
|
|
1032
1996
|
}
|