@tikomni/skills 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/skills/social-media-crawl/scripts/core/extract_pipeline.py +93 -1
- package/skills/social-media-crawl/scripts/pipelines/homepage_collectors.py +1066 -102
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_creator_home.py +9 -4
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_single_work.py +102 -25
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_creator_home.py +9 -4
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_single_work.py +335 -78
- package/skills/social-media-crawl/scripts/writers/write_work_fact_card.py +8 -1
- package/skills/social-media-crawl/tests/test_fixed_pipeline_fallback.py +169 -0
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
import unittest
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from unittest.mock import patch
|
|
10
|
+
|
|
11
|
+
# Put the skill root (the directory one level above this file's directory) at
# the front of sys.path so the `scripts.*` imports that follow can resolve
# when this module is executed directly.
SKILL_ROOT = Path(__file__).resolve().parents[1]
if all(entry != str(SKILL_ROOT) for entry in sys.path):
    sys.path.insert(0, str(SKILL_ROOT))
|
|
14
|
+
|
|
15
|
+
from scripts.pipelines import homepage_collectors
|
|
16
|
+
from scripts.pipelines import run_xiaohongshu_single_work
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class FixedPipelineFallbackTest(unittest.TestCase):
    """Regression tests for route planning and fail-fast fallback handling.

    The creator-home tests replace ``homepage_collectors.call_json_api`` with
    a local fake, so the responses seen by the collectors are fully scripted
    by each test.
    """

    # Collector keyword arguments that are identical in every creator-home test.
    _COLLECT_KWARGS = {
        "base_url": "https://api.tikomni.com",
        "token": "test-token",
        "timeout_ms": 1000,
        "page_size": 20,
        "pages_max": 1,
        "max_items": 5,
        "progress": None,
    }

    def _make_failing_resolver(self, expected_path: str, request_id: str):
        """Build a ``call_json_api`` fake that only accepts the resolver path.

        The returned callable asserts it was called with ``expected_path`` and
        answers with a failed (502, ``resolver_failed``) response carrying
        ``request_id``.
        """

        def fake_call_json_api(*, path: str, **_: object) -> dict:
            # Any call to a different endpoint means the collector did not
            # stop after the resolver failure.
            self.assertEqual(path, expected_path)
            return {
                "ok": False,
                "status_code": 502,
                "request_id": request_id,
                "error_reason": "resolver_failed",
                "data": {},
            }

        return fake_call_json_api

    def _assert_resolver_fail_fast(self, raw: dict) -> None:
        """Assert that ``raw`` reports a failed resolver and skipped later stages."""
        self.assertEqual(raw.get("error_reason"), "author_id_unresolved")
        self.assertEqual(raw["stage_status"]["resolver"]["status"], "failed")
        self.assertEqual(raw["stage_status"]["profile"]["status"], "skipped")
        self.assertEqual(raw["stage_status"]["posts"]["status"], "skipped")

    def test_xhs_single_route_plan_respects_version_priority_and_cookie_gate(self) -> None:
        """Pin the ordered route plan built for a long-form XHS note URL.

        Checks the route label order, two endpoint constants, and the
        readiness/reason flags of the routes expected to be unavailable when
        no cookie and no short share URL are supplied.
        """
        source_input = {
            "share_text": (
                "https://www.xiaohongshu.com/discovery/item/"
                "6995700a000000000a02ef2d?xsec_token=token123"
            ),
            "note_id": "6995700a000000000a02ef2d",
        }
        # The cookie-gate assertions below only hold when no cookie is
        # configured, so build the plan with cookie variables removed instead
        # of inheriting whatever the developer's shell exports.
        # NOTE(review): assumes the XHS cookie is read from an environment
        # variable whose name contains "COOKIE" (as TIKOMNI_DOUYIN_WEB_COOKIE
        # does for Douyin) - confirm against run_xiaohongshu_single_work.
        cookie_free_env = {
            key: value
            for key, value in os.environ.items()
            if "COOKIE" not in key.upper()
        }
        with patch.dict(os.environ, cookie_free_env, clear=True):
            routes = run_xiaohongshu_single_work._build_note_fetch_routes(source_input)

        self.assertEqual(
            [route["route_label"] for route in routes],
            [
                "app_v2_video",
                "app_v2_image",
                "app_v2_mixed",
                "app_v1_v2",
                "app_v1",
                "web_v2_v3",
                "web_v2_v2",
                "web_v1_v7",
                "web_v1_v5",
                "web_v1_v4",
                "web_v1_v2",
            ],
        )
        self.assertEqual(routes[3]["endpoint"], run_xiaohongshu_single_work.APP_V1_V2_ENDPOINT)
        self.assertEqual(routes[7]["endpoint"], run_xiaohongshu_single_work.WEB_V1_V7_ENDPOINT)
        self.assertEqual(routes[8]["method"], "POST")
        self.assertEqual(routes[8]["param_readiness"], "unavailable")
        self.assertEqual(routes[8]["param_reason"], "fallback_requires_cookie")
        self.assertEqual(routes[5]["param_readiness"], "unavailable")
        self.assertEqual(routes[5]["param_reason"], "missing_short_share_url")

    def test_douyin_creator_home_fails_fast_when_author_id_unresolved(self) -> None:
        """A failed Douyin resolver call must skip the profile and posts stages."""
        fake_call_json_api = self._make_failing_resolver(
            "/api/u1/v1/douyin/web/get_sec_user_id",
            "req-dy-resolver",
        )

        with patch.object(homepage_collectors, "call_json_api", side_effect=fake_call_json_api):
            raw = homepage_collectors.collect_douyin_author_home_raw(
                input_value="https://v.douyin.com/test-resolver/",
                **self._COLLECT_KWARGS,
            )

        self._assert_resolver_fail_fast(raw)

    def test_xhs_creator_home_fails_fast_when_author_id_unresolved(self) -> None:
        """A failed XHS resolver call must skip the profile and posts stages."""
        fake_call_json_api = self._make_failing_resolver(
            "/api/u1/v1/xiaohongshu/app/get_user_id_and_xsec_token",
            "req-xhs-resolver",
        )

        with patch.object(homepage_collectors, "call_json_api", side_effect=fake_call_json_api):
            raw = homepage_collectors.collect_xhs_author_home_raw(
                input_value="https://xhslink.com/m/test-resolver",
                **self._COLLECT_KWARGS,
            )

        self._assert_resolver_fail_fast(raw)

    def test_douyin_posts_records_cookie_required_skip(self) -> None:
        """The Douyin ``web`` posts route is recorded as skipped without a cookie.

        The resolver and profile calls succeed and the app posts endpoint
        fails with 502 while ``TIKOMNI_DOUYIN_WEB_COOKIE`` is empty. The test
        expects a ``web`` attempt flagged as skipped with reason
        ``fallback_requires_cookie`` and an overall ``posts_all_routes_failed``.
        """

        def fake_call_json_api(*, path: str, params: dict | None = None, **_: object) -> dict:
            if path == "/api/u1/v1/douyin/web/get_sec_user_id":
                return {
                    "ok": True,
                    "status_code": 200,
                    "request_id": "req-resolve",
                    "error_reason": None,
                    "data": {"sec_user_id": "MS4wLjABAAAA-test-sec"},
                }
            if path == "/api/u1/v1/douyin/app/v3/handler_user_profile":
                return {
                    "ok": True,
                    "status_code": 200,
                    "request_id": "req-profile",
                    "error_reason": None,
                    "data": {
                        "sec_user_id": "MS4wLjABAAAA-test-sec",
                        "nickname": "tester",
                    },
                }
            if path == "/api/u1/v1/douyin/app/v3/fetch_user_post_videos":
                # Pin the exact query sent to the app posts endpoint.
                self.assertEqual(params or {}, {
                    "sec_user_id": "MS4wLjABAAAA-test-sec",
                    "count": 20,
                    "max_cursor": 0,
                    "sort_type": 0,
                })
                return {
                    "ok": False,
                    "status_code": 502,
                    "request_id": "req-posts",
                    "error_reason": "posts_failed",
                    "data": {},
                }
            raise AssertionError(f"unexpected path: {path}")

        with patch.dict(os.environ, {"TIKOMNI_DOUYIN_WEB_COOKIE": ""}, clear=False), \
                patch.object(homepage_collectors, "call_json_api", side_effect=fake_call_json_api):
            raw = homepage_collectors.collect_douyin_author_home_raw(
                input_value="https://v.douyin.com/test-posts/",
                **self._COLLECT_KWARGS,
            )

        attempted_routes = raw["stage_status"]["posts"]["attempted_routes"]
        # Use a default so a missing "web" entry is reported as a readable
        # assertion failure rather than a bare StopIteration error.
        web_attempt = next(
            (attempt for attempt in attempted_routes if attempt.get("route_label") == "web"),
            None,
        )
        self.assertIsNotNone(
            web_attempt,
            "no 'web' route attempt recorded in stage_status['posts']['attempted_routes']",
        )
        self.assertTrue(web_attempt.get("skipped"))
        self.assertEqual(web_attempt.get("param_reason"), "fallback_requires_cookie")
        self.assertEqual(raw.get("error_reason"), "posts_all_routes_failed")
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
# Run the suite with the standard unittest runner when executed as a script.
if __name__ == "__main__":
    unittest.main()
|