@tikomni/skills 1.0.3 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/skills/social-media-crawl/scripts/core/asr_pipeline.py +2 -16
- package/skills/social-media-crawl/scripts/core/extract_pipeline.py +93 -1
- package/skills/social-media-crawl/scripts/pipelines/homepage_collectors.py +1066 -102
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_creator_home.py +9 -4
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_single_work.py +102 -25
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_creator_home.py +9 -4
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_single_work.py +335 -78
- package/skills/social-media-crawl/scripts/writers/write_work_fact_card.py +8 -1
- package/skills/social-media-crawl/tests/test_fixed_pipeline_fallback.py +235 -0
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
import unittest
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from unittest.mock import patch
|
|
10
|
+
|
|
11
|
+
# Put the directory one level above this file's folder at the front of the
# import path (unless it is already listed) so the ``scripts`` package imported
# below can be resolved.
SKILL_ROOT = Path(__file__).resolve().parents[1]
if sys.path.count(str(SKILL_ROOT)) == 0:
    sys.path.insert(0, str(SKILL_ROOT))
|
|
14
|
+
|
|
15
|
+
from scripts.core.asr_pipeline import run_u2_asr_batch_with_timeout_retry
|
|
16
|
+
from scripts.pipelines import homepage_collectors
|
|
17
|
+
from scripts.pipelines import run_xiaohongshu_single_work
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class FixedPipelineFallbackTest(unittest.TestCase):
    """Tests for route planning and fallback handling in the crawl pipelines.

    The API layer is replaced with stubs in every test that would otherwise
    reach it: ``homepage_collectors.call_json_api`` for the creator-home
    collectors, and the submit/poll helpers of ``scripts.core.asr_pipeline``
    for the ASR batch test.
    """

    # Keyword arguments shared by every creator-home collector call below.
    _COLLECTOR_KWARGS = {
        "base_url": "https://api.tikomni.com",
        "token": "test-token",
        "timeout_ms": 1000,
        "page_size": 20,
        "pages_max": 1,
        "max_items": 5,
        "progress": None,
    }

    def _failing_resolver(self, expected_path: str, request_id: str):
        """Build a ``call_json_api`` stub whose resolver call always fails.

        Args:
            expected_path: The only API path the stub accepts; any other path
                fails the running test via ``assertEqual``.
            request_id: Value placed in the ``request_id`` field of the reply.

        Returns:
            A keyword-only callable that returns a 502 ``resolver_failed``
            response dict with empty ``data``.
        """

        def fake_call_json_api(*, path: str, **_: object) -> dict:
            self.assertEqual(path, expected_path)
            return {
                "ok": False,
                "status_code": 502,
                "request_id": request_id,
                "error_reason": "resolver_failed",
                "data": {},
            }

        return fake_call_json_api

    def _assert_author_unresolved_fail_fast(self, raw: dict) -> None:
        """Assert that ``raw`` reports a failed resolver and skipped later stages."""
        self.assertEqual(raw.get("error_reason"), "author_id_unresolved")
        stage_status = raw["stage_status"]
        self.assertEqual(stage_status["resolver"]["status"], "failed")
        self.assertEqual(stage_status["profile"]["status"], "skipped")
        self.assertEqual(stage_status["posts"]["status"], "skipped")

    def test_xhs_single_route_plan_respects_version_priority_and_cookie_gate(self) -> None:
        """The XHS note route plan has the expected order, endpoints and gating."""
        source_input = {
            "share_text": (
                "https://www.xiaohongshu.com/discovery/item/"
                "6995700a000000000a02ef2d?xsec_token=token123"
            ),
            "note_id": "6995700a000000000a02ef2d",
        }
        # With an empty mapping and clear=False this patch changes no variable;
        # it only restores os.environ when the block exits.
        # NOTE(review): the "fallback_requires_cookie" assertions below
        # presumably need the XHS cookie variable to be unset - confirm its
        # name and pin it here so the test does not depend on the caller's
        # environment.
        with patch.dict(os.environ, {}, clear=False):
            routes = run_xiaohongshu_single_work._build_note_fetch_routes(source_input)

        self.assertEqual(
            [route["route_label"] for route in routes],
            [
                "app_v2_video",
                "app_v2_image",
                "app_v2_mixed",
                "app_v1_v2",
                "app_v1",
                "web_v2_v3",
                "web_v2_v2",
                "web_v1_v7",
                "web_v1_v5",
                "web_v1_v4",
                "web_v1_v2",
            ],
        )
        self.assertEqual(routes[3]["endpoint"], run_xiaohongshu_single_work.APP_V1_V2_ENDPOINT)
        self.assertEqual(routes[7]["endpoint"], run_xiaohongshu_single_work.WEB_V1_V7_ENDPOINT)
        # routes[8] is the "web_v1_v5" entry of the list asserted above.
        self.assertEqual(routes[8]["method"], "POST")
        self.assertEqual(routes[8]["param_readiness"], "unavailable")
        self.assertEqual(routes[8]["param_reason"], "fallback_requires_cookie")
        # routes[5] is the "web_v2_v3" entry; the input carries no short share URL.
        self.assertEqual(routes[5]["param_readiness"], "unavailable")
        self.assertEqual(routes[5]["param_reason"], "missing_short_share_url")

    def test_douyin_creator_home_fails_fast_when_author_id_unresolved(self) -> None:
        """A failed Douyin resolver call yields ``author_id_unresolved``."""
        fake_call_json_api = self._failing_resolver(
            "/api/u1/v1/douyin/web/get_sec_user_id",
            "req-dy-resolver",
        )

        with patch.object(homepage_collectors, "call_json_api", side_effect=fake_call_json_api):
            raw = homepage_collectors.collect_douyin_author_home_raw(
                input_value="https://v.douyin.com/test-resolver/",
                **self._COLLECTOR_KWARGS,
            )

        self._assert_author_unresolved_fail_fast(raw)

    def test_xhs_creator_home_fails_fast_when_author_id_unresolved(self) -> None:
        """A failed Xiaohongshu resolver call yields ``author_id_unresolved``."""
        fake_call_json_api = self._failing_resolver(
            "/api/u1/v1/xiaohongshu/app/get_user_id_and_xsec_token",
            "req-xhs-resolver",
        )

        with patch.object(homepage_collectors, "call_json_api", side_effect=fake_call_json_api):
            raw = homepage_collectors.collect_xhs_author_home_raw(
                input_value="https://xhslink.com/m/test-resolver",
                **self._COLLECTOR_KWARGS,
            )

        self._assert_author_unresolved_fail_fast(raw)

    def test_douyin_posts_records_cookie_required_skip(self) -> None:
        """With an empty Douyin cookie, the "web" posts route is recorded as skipped."""

        def fake_call_json_api(*, path: str, params: dict | None = None, **_: object) -> dict:
            # Resolver and profile succeed; the app posts route fails.
            if path == "/api/u1/v1/douyin/web/get_sec_user_id":
                return {
                    "ok": True,
                    "status_code": 200,
                    "request_id": "req-resolve",
                    "error_reason": None,
                    "data": {"sec_user_id": "MS4wLjABAAAA-test-sec"},
                }
            if path == "/api/u1/v1/douyin/app/v3/handler_user_profile":
                return {
                    "ok": True,
                    "status_code": 200,
                    "request_id": "req-profile",
                    "error_reason": None,
                    "data": {
                        "sec_user_id": "MS4wLjABAAAA-test-sec",
                        "nickname": "tester",
                    },
                }
            if path == "/api/u1/v1/douyin/app/v3/fetch_user_post_videos":
                self.assertEqual(params or {}, {
                    "sec_user_id": "MS4wLjABAAAA-test-sec",
                    "count": 20,
                    "max_cursor": 0,
                    "sort_type": 0,
                })
                return {
                    "ok": False,
                    "status_code": 502,
                    "request_id": "req-posts",
                    "error_reason": "posts_failed",
                    "data": {},
                }
            raise AssertionError(f"unexpected path: {path}")

        with patch.dict(os.environ, {"TIKOMNI_DOUYIN_WEB_COOKIE": ""}, clear=False):
            with patch.object(homepage_collectors, "call_json_api", side_effect=fake_call_json_api):
                raw = homepage_collectors.collect_douyin_author_home_raw(
                    input_value="https://v.douyin.com/test-posts/",
                    **self._COLLECTOR_KWARGS,
                )

        attempted_routes = raw["stage_status"]["posts"]["attempted_routes"]
        # A default keeps a missing "web" entry from surfacing as a bare
        # StopIteration error; the assertion below reports it instead.
        web_attempt = next(
            (attempt for attempt in attempted_routes if attempt.get("route_label") == "web"),
            None,
        )
        self.assertIsNotNone(
            web_attempt,
            f"no 'web' entry in attempted_routes: {attempted_routes!r}",
        )
        self.assertTrue(web_attempt.get("skipped"))
        self.assertEqual(web_attempt.get("param_reason"), "fallback_requires_cookie")
        self.assertEqual(raw.get("error_reason"), "posts_all_routes_failed")

    def test_u2_batch_prefers_file_url_mapping_over_item_index_fallback(self) -> None:
        """The item carrying ``file_url`` is mapped, not the ``item_index`` one."""
        file_url = "https://example.com/video.mp4"
        # Two candidate items for one submitted URL: the first is identified
        # only by item_index 0, the second by the submitted file_url.
        raw_task = {
            "data": {
                "task_status": "SUCCEEDED",
                "task_metrics": {"TOTAL": 1, "SUCCEEDED": 1, "FAILED": 0},
                "items": [
                    {
                        "item_index": 0,
                        "task_status": "SUCCEEDED",
                        "transcript_text": "索引映射文本。",
                        "transcription_url": "https://example.com/index.json",
                    },
                    {
                        "file_url": file_url,
                        "task_status": "SUCCEEDED",
                        "transcript_text": "file_url 映射文本。",
                        "transcription_url": "https://example.com/file.json",
                    },
                ],
            }
        }

        submit_bundle = {
            "submit_response": {"ok": True, "request_id": "req-submit"},
            "task_id": "task-1",
            "final_submit_status": "success",
            "retry_chain": [],
        }
        poll_result = {
            "ok": True,
            "task_id": "task-1",
            "task_status": "SUCCEEDED",
            "request_id": "req-poll",
            "error_reason": "",
            "raw_task": raw_task,
            "task_metrics": {"TOTAL": 1, "SUCCEEDED": 1, "FAILED": 0},
            "batch_progress": {"expected_total": 1, "complete": True},
            "batch_complete": True,
            "trace": [],
        }

        with patch(
            "scripts.core.asr_pipeline.submit_u2_asr_batch_with_retry",
            return_value=submit_bundle,
        ), patch(
            "scripts.core.asr_pipeline.poll_u2_task_core",
            return_value=poll_result,
        ):
            bundle = run_u2_asr_batch_with_timeout_retry(
                base_url="https://api.tikomni.com",
                token="test-token",
                timeout_ms=1000,
                file_urls=[file_url],
                submit_max_retries=0,
                submit_backoff_ms=0,
                poll_interval_sec=0.01,
                max_polls=1,
                timeout_retry_enabled=False,
                timeout_retry_max_retries=0,
            )

        mapped_item = bundle["mapped_results"][file_url]
        self.assertEqual(mapped_item["transcript_text"], "file_url 映射文本。")
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|