rednote-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. rednote_cli/__init__.py +5 -0
  2. rednote_cli/_runtime/__init__.py +0 -0
  3. rednote_cli/_runtime/common/__init__.py +0 -0
  4. rednote_cli/_runtime/common/app_utils.py +77 -0
  5. rednote_cli/_runtime/common/config.py +83 -0
  6. rednote_cli/_runtime/common/enums.py +17 -0
  7. rednote_cli/_runtime/common/errors.py +22 -0
  8. rednote_cli/_runtime/core/__init__.py +0 -0
  9. rednote_cli/_runtime/core/account_manager.py +349 -0
  10. rednote_cli/_runtime/core/browser/__init__.py +0 -0
  11. rednote_cli/_runtime/core/browser/manager.py +247 -0
  12. rednote_cli/_runtime/core/database/__init__.py +0 -0
  13. rednote_cli/_runtime/core/database/manager.py +334 -0
  14. rednote_cli/_runtime/platforms/__init__.py +0 -0
  15. rednote_cli/_runtime/platforms/base.py +62 -0
  16. rednote_cli/_runtime/platforms/factory.py +55 -0
  17. rednote_cli/_runtime/platforms/publishing/__init__.py +12 -0
  18. rednote_cli/_runtime/platforms/publishing/media.py +275 -0
  19. rednote_cli/_runtime/platforms/publishing/models.py +59 -0
  20. rednote_cli/_runtime/platforms/publishing/validator.py +124 -0
  21. rednote_cli/_runtime/services/__init__.py +1 -0
  22. rednote_cli/_runtime/services/scraper_service.py +235 -0
  23. rednote_cli/adapters/__init__.py +1 -0
  24. rednote_cli/adapters/output/__init__.py +1 -0
  25. rednote_cli/adapters/output/event_stream.py +29 -0
  26. rednote_cli/adapters/output/formatter_json.py +23 -0
  27. rednote_cli/adapters/output/formatter_table.py +39 -0
  28. rednote_cli/adapters/output/writer.py +17 -0
  29. rednote_cli/adapters/persistence/__init__.py +1 -0
  30. rednote_cli/adapters/persistence/file_account_repo.py +51 -0
  31. rednote_cli/adapters/platform/__init__.py +1 -0
  32. rednote_cli/adapters/platform/rednote/__init__.py +1 -0
  33. rednote_cli/adapters/platform/rednote/extractor.py +65 -0
  34. rednote_cli/adapters/platform/rednote/publisher.py +26 -0
  35. rednote_cli/adapters/platform/rednote/runtime_extractor.py +818 -0
  36. rednote_cli/adapters/platform/rednote/runtime_publisher.py +373 -0
  37. rednote_cli/adapters/platform/rednote/runtime_registration.py +20 -0
  38. rednote_cli/application/__init__.py +1 -0
  39. rednote_cli/application/dto/__init__.py +1 -0
  40. rednote_cli/application/dto/input_models.py +121 -0
  41. rednote_cli/application/dto/output_models.py +78 -0
  42. rednote_cli/application/use_cases/__init__.py +1 -0
  43. rednote_cli/application/use_cases/account_list.py +9 -0
  44. rednote_cli/application/use_cases/account_mutation.py +22 -0
  45. rednote_cli/application/use_cases/auth_login.py +64 -0
  46. rednote_cli/application/use_cases/auth_status.py +96 -0
  47. rednote_cli/application/use_cases/doctor.py +49 -0
  48. rednote_cli/application/use_cases/init_runtime.py +20 -0
  49. rednote_cli/application/use_cases/note_get.py +22 -0
  50. rednote_cli/application/use_cases/note_search.py +26 -0
  51. rednote_cli/application/use_cases/publish_note.py +25 -0
  52. rednote_cli/application/use_cases/user_get.py +18 -0
  53. rednote_cli/application/use_cases/user_search.py +8 -0
  54. rednote_cli/application/use_cases/user_self.py +8 -0
  55. rednote_cli/cli/__init__.py +1 -0
  56. rednote_cli/cli/__main__.py +5 -0
  57. rednote_cli/cli/commands/__init__.py +1 -0
  58. rednote_cli/cli/commands/account.py +204 -0
  59. rednote_cli/cli/commands/doctor.py +20 -0
  60. rednote_cli/cli/commands/init.py +20 -0
  61. rednote_cli/cli/commands/note.py +101 -0
  62. rednote_cli/cli/commands/publish.py +147 -0
  63. rednote_cli/cli/commands/search.py +185 -0
  64. rednote_cli/cli/commands/user.py +113 -0
  65. rednote_cli/cli/main.py +163 -0
  66. rednote_cli/cli/options.py +13 -0
  67. rednote_cli/cli/runtime.py +142 -0
  68. rednote_cli/cli/utils.py +74 -0
  69. rednote_cli/domain/__init__.py +1 -0
  70. rednote_cli/domain/errors.py +50 -0
  71. rednote_cli/domain/note_search_filters.py +155 -0
  72. rednote_cli/infra/__init__.py +1 -0
  73. rednote_cli/infra/exit_codes.py +30 -0
  74. rednote_cli/infra/logger.py +11 -0
  75. rednote_cli/infra/paths.py +31 -0
  76. rednote_cli/infra/platforms.py +4 -0
  77. rednote_cli-0.1.0.dist-info/METADATA +81 -0
  78. rednote_cli-0.1.0.dist-info/RECORD +81 -0
  79. rednote_cli-0.1.0.dist-info/WHEEL +5 -0
  80. rednote_cli-0.1.0.dist-info/entry_points.txt +2 -0
  81. rednote_cli-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,818 @@
1
import asyncio
import os
import random
import re
import shutil
import tempfile
import traceback
import urllib.parse
from contextlib import contextmanager
from datetime import datetime
from pathlib import Path
from urllib.parse import quote
from urllib.parse import urlparse

import requests
from loguru import logger
from yt_dlp.utils import *

from rednote_cli._runtime.common.enums import XsecSource
from rednote_cli._runtime.common.errors import (
    InvalidPublishParameterError,
    PublishMediaPreparationError,
    PublishWorkflowNotReadyError,
    UnsupportedPublishTargetError,
)
from rednote_cli._runtime.common.errors import (
    PublishExecutionError,
    PublishNoteException,
)
from rednote_cli._runtime.platforms.base import BaseExtractor, RiskControlException
from rednote_cli._runtime.platforms.publishing.validator import parse_rfc3339
from rednote_cli.adapters.platform.rednote.runtime_publisher import RednotePublisher
from rednote_cli.domain.note_search_filters import build_search_filter_selections
29
+
30
+
31
+ def _is_url(path: str) -> bool:
32
+ try:
33
+ parsed = urlparse(path)
34
+ return bool(parsed.scheme and parsed.netloc)
35
+ except Exception:
36
+ return False
37
+
38
+
39
+ def _validate_extension(file_identify: str, allowed_extensions) -> str:
40
+ path_part = urlparse(file_identify).path
41
+ suffix = Path(path_part).suffix.lower()
42
+ if suffix not in allowed_extensions:
43
+ supported = ", ".join(ext.lstrip(".") for ext in sorted(allowed_extensions))
44
+ raise InvalidPublishParameterError(
45
+ f"不支持的文件格式: {file_identify} (仅支持 {supported})"
46
+ )
47
+ return suffix
48
+
49
+
50
+ def _normalize_media_list(media_list: list, field_name: str) -> list[str]:
51
+ if media_list is None:
52
+ raise InvalidPublishParameterError(f"`{field_name}` 不能为空")
53
+ if not isinstance(media_list, (list, tuple)):
54
+ raise InvalidPublishParameterError(f"`{field_name}` 必须是 list 或 tuple")
55
+ if len(media_list) == 0:
56
+ raise InvalidPublishParameterError(f"`{field_name}` 至少包含 1 个文件")
57
+
58
+ normalized = []
59
+ for index, item in enumerate(media_list):
60
+ text = "" if item is None else str(item).strip()
61
+ if not text:
62
+ raise InvalidPublishParameterError(f"第 {index + 1} 个素材项为空,请检查输入")
63
+ normalized.append(text)
64
+ return normalized
65
+
66
+
67
@contextmanager
def prepare_image_paths(image_list: list, allowed_extensions):
    """Validate/download image assets and clean up temporary files automatically.

    Each entry of ``image_list`` is either a URL (scheme + host) or a local
    file path. URLs are downloaded into a fresh temporary directory; local
    paths must exist and be regular files. Every entry must have an
    extension contained in ``allowed_extensions``.

    Args:
        image_list: list/tuple of image URLs or local paths.
        allowed_extensions: container of accepted suffixes (with leading dot).

    Yields:
        list[str]: absolute file paths, in input order. Downloaded files live
        in the temporary directory, which is removed when the ``with`` block
        exits, so the paths of downloaded files are only valid inside it.

    Raises:
        InvalidPublishParameterError: invalid list, missing local file,
            non-file path, or unsupported extension.
        PublishMediaPreparationError: a download failed.
    """
    normalized_images = _normalize_media_list(image_list, field_name="image_list")
    temp_dir = tempfile.mkdtemp(prefix="publish_upload_")
    final_paths = []

    logger.info(f"开始校验发布素材, count: {len(normalized_images)}")

    try:
        for index, item in enumerate(normalized_images):
            if _is_url(item):
                suffix = _validate_extension(item, allowed_extensions=allowed_extensions)
                target_path = Path(temp_dir) / f"download_{index}{suffix}"
                try:
                    headers = {
                        "User-Agent": (
                            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/110.0.0.0 Safari/537.36"
                        )
                    }
                    # A streamed response keeps its connection open until it is
                    # closed; the context manager releases it even on errors.
                    with requests.get(item, headers=headers, timeout=15, stream=True) as response:
                        response.raise_for_status()
                        with open(target_path, "wb") as output:
                            for chunk in response.iter_content(chunk_size=8192):
                                output.write(chunk)
                except requests.exceptions.RequestException as e:
                    raise PublishMediaPreparationError(
                        f"图片 {index + 1} 下载失败: {item}。具体原因: {e}"
                    ) from e

                final_paths.append(str(target_path.absolute()))
                logger.info(f"图片 {index + 1} 下载成功, source: {item}, target_path: {str(target_path.absolute())}")
                continue

            local_path = Path(item)
            if not local_path.exists():
                raise InvalidPublishParameterError(f"本地图片不存在: {item}")
            if not local_path.is_file():
                raise InvalidPublishParameterError(f"路径不是一个有效文件: {item}")

            _validate_extension(item, allowed_extensions=allowed_extensions)
            final_paths.append(str(local_path.absolute()))
            logger.info(f"图片 {index + 1} 校验通过, path: {str(local_path.absolute())}")

        yield final_paths
    finally:
        # Runs on normal exit, on validation/download errors and when the
        # caller's with-body raises.
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
            # f-prefix added: the message previously logged a literal "{temp_dir}".
            logger.info(f"发布素材临时目录已清理, temp_dir: {temp_dir}")
118
+
119
+
120
+ class RednoteExtractor(BaseExtractor):
121
+ """Rednote-specific data extraction using yt_dlp InfoExtractor pattern."""
122
+ # 采用拦截后端访问直接获取接口返回数据
123
+ IE_NAME = 'rednote:all'
124
+
125
def __init__(self, page):
    """Create the extractor by forwarding *page* to the base class initializer."""
    super().__init__(page)
127
+
128
+ def _to_snake_case(self, name):
129
+ """将 camelCase 转换为 snake_case"""
130
+ import re
131
+ s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
132
+ return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
133
+
134
+ def _format_dict(self, data):
135
+ """递归统一所有键名为小写下划线,不漏掉任何数据"""
136
+ if not isinstance(data, (dict, list)):
137
+ return data
138
+ if isinstance(data, list):
139
+ return [self._format_dict(i) for i in data]
140
+
141
+ new_data = {}
142
+ for k, v in data.items():
143
+ new_key = self._to_snake_case(k)
144
+ new_data[new_key] = self._format_dict(v)
145
+ return new_data
146
+
147
async def _apply_note_search_filters(
    self,
    *,
    sort_by: str | None = None,
    note_type: str | None = None,
    publish_time: str | None = None,
    search_scope: str | None = None,
    location: str | None = None,
) -> bool:
    """Click the search-page filter options matching the given criteria.

    The criteria are translated by ``build_search_filter_selections`` into
    selections carrying ``filters_index`` / ``tags_index`` positions inside
    the filter panel; each one is clicked in turn.

    Returns:
        False (page untouched) when no selection results from the criteria;
        True after all selections were clicked and the page reached the
        "networkidle" load state.
    """
    chosen = build_search_filter_selections(
        sort_by=sort_by,
        note_type=note_type,
        publish_time=publish_time,
        search_scope=search_scope,
        location=location,
    )
    if not chosen:
        return False

    trigger = self.page.locator("div.filter").first
    panel = self.page.locator("div.filter-panel").first

    for choice in chosen:
        # The panel is opened by hovering the filter button before every click.
        await trigger.hover()
        await panel.wait_for(state="visible", timeout=10_000)
        option_css = (
            f"div.filter-panel div.filters:nth-child({choice.filters_index}) "
            f"div.tags:nth-child({choice.tags_index})"
        )
        option = self.page.locator(option_css).first
        await option.click(timeout=10_000)
        logger.info(f"DEBUG: [search_notes] 应用筛选 {choice.field}={choice.value}({choice.label})")
        await asyncio.sleep(random.uniform(0.5, 1.0))

    await self.page.wait_for_load_state("networkidle")
    await asyncio.sleep(random.uniform(1.0, 1.8))
    return True
180
+
181
+ async def _extract_search_feeds_from_initial_state(self) -> list[dict]:
182
+ feeds = await self.page.evaluate(
183
+ """
184
+ () => {
185
+ const state = window.__INITIAL_STATE__;
186
+ if (!state || !state.search || !state.search.feeds) return [];
187
+ const feeds = state.search.feeds;
188
+ const data = feeds.value !== undefined ? feeds.value : feeds._value;
189
+ return Array.isArray(data) ? data : [];
190
+ }
191
+ """
192
+ )
193
+ if not isinstance(feeds, list):
194
+ return []
195
+ return [self._format_dict(item) for item in feeds if isinstance(item, dict)]
196
+
197
async def get_self_info(self):
    """Extract personal information of the logged-in user with robust measures.

    Opens the explore page, triggers a fetch of ``/api/sns/web/v2/user/me``
    and collects profile fields from intercepted responses of that endpoint
    and of ``/api/sns/web/v1/user/selfinfo``.

    Returns:
        dict: the collected fields with snake_case keys. May be empty when no
        response carrying a ``red_id`` arrived and no risk control was seen.

    Raises:
        RiskControlException: when the page shows the risk-control text or a
            watched API response signalled risk control and no profile data
            was obtained.
    """
    logger.info("DEBUG: [get_self_info] 开始执行")
    # Intercept backend responses and read the API payload directly.
    result = {}
    # Shared with the response handler below; "self_info_obtained" stops later
    # failing responses from flagging risk control once data has been captured.
    state = {"risk_detected": False, "risk_msg": "", "self_info_obtained": False}
    # Set by the handler as soon as a response yielded a red_id.
    event = asyncio.Event()

    async def handle_response(response):
        # Only the two self-profile endpoints are of interest.
        if "/api/sns/web/v2/user/me" in response.url or "/api/sns/web/v1/user/selfinfo" in response.url:
            if response.status != 200:
                if not state["self_info_obtained"]:
                    state["risk_detected"] = True
                    state["risk_msg"] = f"Self info API status {response.status}"
                logger.warning(f"Self info API returned non-200: {response.status}")
                return
            else:
                try:
                    data = await response.json()
                    if not data.get("success"):
                        msg = data.get("msg", "")
                        # Only these two message fragments are treated as risk control.
                        if ("薯队长遇到了点小麻烦" in msg or "风控" in msg) and not state["self_info_obtained"]:
                            state["risk_detected"] = True
                            state["risk_msg"] = msg
                        logger.error(f"Rednote API error: {data}")
                        return

                    r = data.get('data', {})
                    if "/api/sns/web/v2/user/me" in response.url:
                        # v2 "me" payload: fields are read from the top level of data.
                        result["red_id"] = r.get("red_id")
                        result["user_id"] = r.get("user_id")
                        result["nickname"] = r.get("nickname")
                        result["desc"] = r.get("desc")
                        result["gender"] = r.get("gender")
                        result["guest"] = r.get("guest")
                        result["images"] = r.get("images")
                        result["imageb"] = r.get("imageb")
                    else:
                        # v1 "selfinfo" payload: profile fields are read from "basic_info".
                        basic_info = r.get("basic_info", {})
                        result["red_id"] = basic_info.get("red_id")
                        result["nickname"] = basic_info.get("nickname")
                        result["desc"] = basic_info.get("desc")
                        result["gender"] = basic_info.get("gender")
                        result["images"] = basic_info.get("images")
                        result["imageb"] = basic_info.get("imageb")
                        result["ip_location"] = basic_info.get("ip_location")
                        result["interactions"] = r.get("interactions")
                        result["tags"] = r.get("tags")
                        result["tab_public"] = r.get("tab_public")

                    if result.get("red_id"):
                        # Success clears any risk flag raised by an earlier response.
                        state["self_info_obtained"] = True
                        state["risk_detected"] = False
                        state["risk_msg"] = ""
                        logger.info(f"DEBUG: [get_self_info] 成功拦截到接口数据: {result.get('nickname')}")
                        event.set()
                except Exception as e:
                    # This specific error text is ignored silently; anything else is logged.
                    if "No resource with given identifier found" in str(e):
                        return
                    logger.error(f"JSON parse error: {e}")

    from rednote_cli._runtime.core.browser.manager import browser_manager

    async with browser_manager.observe_responses(self.page, handle_response):
        try:
            await self.page.goto("https://www.xiaohongshu.com/explore", wait_until="domcontentloaded")
            await self.page.wait_for_selector(".main-container .user .link-wrapper .channel", timeout=30000)
            logger.info("DEBUG: [get_self_info] 正在触发用户信息请求...")
            # Issue the request from inside the page; fetch errors are swallowed in JS.
            await self.page.evaluate(
                "() => fetch('/api/sns/web/v2/user/me', { credentials: 'include' }).catch(() => null)"
            )
            await self.page.wait_for_timeout(500)

            try:
                # Wait for the API response, at most 10 seconds.
                await asyncio.wait_for(event.wait(), timeout=10.0)
            except asyncio.TimeoutError:
                logger.warning("DEBUG: [get_self_info] 等待接口返回超时")

            if result.get("red_id"):
                return self._format_dict(result)

            # No data captured: inspect the page content and the handler state.
            # A captcha marker is only logged; the risk-control text raises.
            if "验证码" in await self.page.content():
                logger.error("Detected Captcha on self page.")
            if "薯队长遇到了点小麻烦" in await self.page.content():
                raise RiskControlException("Detected 'Captain Potato' risk control on self page")
            if state["risk_detected"]:
                raise RiskControlException(f"Risk control detected during self info extraction: {state['risk_msg']}")
            return self._format_dict(result)
        except Exception as e:
            logger.error(f"Extract self info failed: {e}")
            raise
289
+
290
+ # initial_state = self._search_json(
291
+ # r'window\s*\.\s*__INITIAL_STATE__\s*=',
292
+ # content, 'initial state', 'self',
293
+ # end_pattern=r'</script>', transform_source=js_to_json, default={}
294
+ # )
295
+ #
296
+ # return traverse_obj(initial_state, {
297
+ # "user_id": ("user", "userInfo", ("user_id", "userId"), {str}, any),
298
+ # "red_id": ("user", "userInfo", ("red_id", "redId"), {str}, any),
299
+ # "nickname": ("user", "userInfo", "nickname", {str}),
300
+ # "desc": ("user", "userInfo", "desc", {str}),
301
+ # "gender": ("user", "userInfo", "gender", {int}),
302
+ # "images": ("user", "userInfo", "images", {str}),
303
+ # "imageb": ("user", "userInfo", "imageb", {str}),
304
+ # "guest": ("user", "userInfo", "guest", {bool}),
305
+ # "logged_in": ("user", "loggedIn", {bool}),
306
+ # "ip_location": ("user", "userPageData", "basicInfo", "ipLocation", {str}),
307
+ # "fans": ("user", "userPageData", "interactions", lambda _, v: v.get("type") == "fans", "count", {str_to_int}, any),
308
+ # "follows": ("user", "userPageData", "interactions", lambda _, v: v.get("type") == "follows", "count", {str_to_int}, any),
309
+ # "interaction": ("user", "userPageData", "interactions", lambda _, v: v.get("type") == "interaction", "count", {str_to_int}, any),
310
+ # "tags": (
311
+ # 'user', 'userPageData', 'tags', ...,
312
+ # {
313
+ # "name": ("name", {str}),
314
+ # "icon": ("icon", {str}),
315
+ # "type": ("tagType", {str}),
316
+ # }
317
+ # ),
318
+ # })
319
+
320
async def get_user_info(self, user_id: str, xsec_token: str = None, xsec_source: XsecSource = XsecSource.PC_FEED):
    """Extract information about a specific user with robust anti-bot measures.

    The profile page is opened after first visiting the home page, and the
    data is parsed from the ``window.__INITIAL_STATE__`` JSON embedded in the
    page HTML.

    Args:
        user_id: id used in the ``/user/profile/<id>`` URL.
        xsec_token: optional token appended as the ``xsec_token`` query parameter.
        xsec_source: value sent as the ``xsec_source`` query parameter.

    Returns:
        dict: profile fields (basic info, interactions, tags, tab_public,
        notes) plus ``user_id``, with snake_case keys.

    Raises:
        RiskControlException: when the page shows the risk-control text.
    """
    logger.info(f"DEBUG: [get_user_info] 目标 ID: {user_id}?xsec_source={xsec_source.value}")
    # The original author notes that opening the profile page does not trigger
    # `/api/sns/web/v1/user/otherinfo` or `/api/sns/web/v1/user_posted`, so the
    # HTML is parsed instead of intercepting API responses.
    try:
        # The query string must start with "?": previously "&xsec_token=..." was
        # appended straight to the path and xsec_source was never sent. Built
        # the same way as the note URL in get_note_info.
        url = f"https://www.xiaohongshu.com/user/profile/{user_id}?xsec_source={xsec_source.value}"
        if xsec_token:
            url += f"&xsec_token={xsec_token}"

        await self.page.set_extra_http_headers({"Referer": "https://www.xiaohongshu.com/"})
        await self.page.goto("https://www.xiaohongshu.com", wait_until="domcontentloaded")
        await self.page.wait_for_selector(".side-bar .user a", timeout=30000)
        await asyncio.sleep(random.uniform(1.0, 2.0))

        logger.info(f"DEBUG: [get_user_info] 正在跳转: {url}")
        await self.page.goto(url, wait_until="load")
        await asyncio.sleep(random.uniform(2.0, 4.0))

        # Error-page detection: a missing/captcha/offline page is only logged,
        # the risk-control text aborts the extraction.
        content = await self.page.content()
        if "你访问的页面不见了" in content or "验证码" in content or "未连接到服务器" in content:
            logger.error(f"要访问该用户({user_id})信息, 必须传入 xsec_token 参数")
        if "薯队长遇到了点小麻烦" in content:
            raise RiskControlException(f"Detected 'Captain Potato' risk control on user profile: {user_id}")

        initial_state = self._search_json(
            r'window\s*\.\s*__INITIAL_STATE__\s*=',
            content, 'initial state', user_id,
            end_pattern=r'</script>', transform_source=js_to_json, default={}
        )
        result = traverse_obj(initial_state, {
            "red_id": ("user", "userPageData", "basicInfo", "redId", {str}),
            "nickname": ("user", "userPageData", "basicInfo", "nickname", {str}),
            "desc": ("user", "userPageData", "basicInfo", "desc", {str}),
            "gender": ("user", "userPageData", "basicInfo", "gender", {int}),
            "images": ("user", "userPageData", "basicInfo", "images", {str}),
            "imageb": ("user", "userPageData", "basicInfo", "imageb", {str}),
            "ip_location": ("user", "userPageData", "basicInfo", "ipLocation", {str}),
            "interactions": ("user", "userPageData", "interactions", {list}),
            "tags": ("user", "userPageData", "tags", {list}),
            "tab_public": ("user", "userPageData", "tabPublic", {list}),
            "notes": ('user', 'notes', ..., ...),
        })

        result["user_id"] = user_id
        logger.info(f"DEBUG: [get_user_info] 提取完成,昵称: {result.get('nickname')}")
        return self._format_dict(result)
    except Exception as e:
        logger.error(f"Failed to extract user info for {user_id}: {e}")
        raise
370
+
371
async def get_note_info(self, note_id: str, xsec_token: str = None, xsec_source: XsecSource = XsecSource.PC_FEED, comment_size: int = 10, sub_comment_size: int = 5):
    """Fetch a note's details and its comments with robust anti-bot measures.

    The note itself is parsed from ``window.__INITIAL_STATE__`` in the page
    HTML. Comments and replies are collected from intercepted responses of
    the comment APIs while the comment area is scrolled and "expand replies"
    buttons are clicked.

    Args:
        note_id: id used in the ``/explore/<id>`` URL.
        xsec_token: optional token appended as the ``xsec_token`` query parameter.
        xsec_source: value sent as the ``xsec_source`` query parameter.
        comment_size: scrolling stops once this many top-level comments were
            captured; the returned list is cut to this length.
        sub_comment_size: expansion of a comment's replies stops once this
            many replies were captured (the reply list is not truncated).

    Returns:
        dict: the note with snake_case keys plus a ``comments`` list.

    Raises:
        RiskControlException: when an error/risk-control page is shown or a
            comment API response signalled risk control.
    """
    logger.info(f"DEBUG: [get_note_info] 目标笔记: {note_id}")
    # The original author notes that opening the note page does not trigger
    # `/api/sns/web/v1/feed`; the note is therefore parsed from HTML while the
    # comments come from intercepted API responses.
    comments = []
    state = {"has_more": True, "risk_detected": False, "risk_msg": ""}

    async def handle_response(response):
        if "/api/sns/web/v2/comment/sub/page" in response.url:
            # Replies (second-level comments) of one root comment.
            if response.status != 200:
                state["risk_detected"] = True
                state["risk_msg"] = f"Sub-comment API status {response.status}"
                logger.warning(f"Search blocked by risk control: {response.status}")
                return

            try:
                parsed = urllib.parse.urlparse(str(response.url))
                query_params = urllib.parse.parse_qs(parsed.query)
                root_comment_id = query_params.get('root_comment_id', [None])[0]
                data = await response.json()
                if not data.get("success"):
                    msg = data.get("msg", "")
                    if "薯队长遇到了点小麻烦" in msg or "风控" in msg:
                        state["risk_detected"] = True
                        state["risk_msg"] = msg
                    logger.error(f"Rednote API error: {data}")
                    return
                r = data.get('data', {})
                cs = r.get("comments", [])
                sub_comment_count = ""
                now_sub_comment_count = -1
                for c in comments:
                    if c.get("id") == root_comment_id:
                        sub_comment_count = c.get("sub_comment_count", "")
                        # setdefault keeps the replies even when the root comment
                        # has no "sub_comments" key yet; .get(..., []) would extend
                        # a throw-away list.
                        sub_comments = c.setdefault("sub_comments", [])
                        sub_comments.extend(cs)
                        now_sub_comment_count = len(sub_comments)
                        break

                logger.info(f"DEBUG: [get_note_info] 拦截到 {root_comment_id} 二级评论数据: {now_sub_comment_count}/{sub_comment_count} 条")
            except Exception as e:
                logger.error(f"JSON parse error: {e}")
                traceback.print_exc()
                # NOTE(review): the response is presumably a Playwright Response,
                # whose text() is a coroutine method like json() above — the old
                # code logged the bound method itself. Reading the body may fail
                # again, so it is best-effort.
                try:
                    body_text = await response.text()
                except Exception as text_error:
                    body_text = f"<unavailable: {text_error}>"
                logger.error(f"response.text: {body_text}")

        elif "/api/sns/web/v2/comment/page" in response.url:
            # One page of top-level comments.
            if response.status != 200:
                state["risk_detected"] = True
                state["risk_msg"] = f"Comment API status {response.status}"
                logger.warning(f"Search blocked by risk control: {response.status}")
                return

            try:
                data = await response.json()
                if not data.get("success"):
                    msg = data.get("msg", "")
                    if "薯队长遇到了点小麻烦" in msg or "风控" in msg:
                        state["risk_detected"] = True
                        state["risk_msg"] = msg
                    logger.error(f"Rednote API error: {data}")
                    return
                r = data.get('data', {})
                state["has_more"] = r.get('has_more', False)
                comments.extend(r.get('comments', []))
                logger.info(f"DEBUG: [get_note_info] 拦截到评论数据: {len(comments)} 条")
            except Exception as e:
                if "No resource with given identifier found" in str(e):
                    return
                logger.error(f"JSON parse error: {e}")

    from rednote_cli._runtime.core.browser.manager import browser_manager

    # In-page script: find the element whose text matches the comment-count
    # marker and return its nearest scrollable ancestor, falling back to
    # ".note-scroller". Loop-invariant, so defined once.
    find_scroll_container_js = r'''() => {
        // 1. 寻找包含“条评论”的业务逻辑锚点
        const divs = Array.from(document.querySelectorAll('div'));
        const marker = divs.find(el => el.innerText && /共\s*\d+\s*条评论/.test(el.innerText));
        if (!marker) return document.querySelector('.note-scroller'); // 兜底方案

        // 2. 向上递归寻找最近的滚动容器
        let parent = marker;
        while (parent) {
            const style = window.getComputedStyle(parent);
            if (style.overflowY === 'scroll' || style.overflowY === 'auto') {
                return parent;
            }
            parent = parent.parentElement;
        }
        return document.querySelector('.note-scroller');
    }'''
    expand_replies_pattern = re.compile(r"展开\s*\d+\s*条回复|展开更多回复")

    async with browser_manager.observe_responses(self.page, handle_response):
        try:
            url = f"https://www.xiaohongshu.com/explore/{note_id}?xsec_source={xsec_source.value}"
            if xsec_token:
                url += f"&xsec_token={xsec_token}"

            await self.page.set_extra_http_headers({"Referer": "https://www.xiaohongshu.com/"})
            await self.page.goto("https://www.xiaohongshu.com", wait_until="domcontentloaded")
            await self.page.wait_for_selector(".side-bar .user a", timeout=30000)
            await asyncio.sleep(random.uniform(1.0, 2.0))

            logger.info(f"DEBUG: [get_note_info] 正在跳转: {url}")
            await self.page.goto(url, wait_until="load")
            await asyncio.sleep(random.uniform(2.0, 4.0))

            content = await self.page.content()
            if "你访问的页面不见了" in content or "当前笔记暂时无法浏览" in content:
                raise RiskControlException(f"Detected 'Captain Potato' risk control on page: {note_id}")
            if "薯队长遇到了点小麻烦" in content:
                raise RiskControlException(f"Detected 'Captain Potato' risk control on page: {note_id}")

            initial_state = self._search_json(
                r'window\s*\.\s*__INITIAL_STATE__\s*=',
                content, 'initial state', note_id,
                end_pattern=r'</script>', transform_source=js_to_json, default={}
            )

            # NOTE(review): traverse_obj yields None when the path is missing, in
            # which case the .get() below raises AttributeError — confirm whether
            # a dedicated error would be preferable.
            result = traverse_obj(initial_state, ("note", "noteDetailMap", note_id, "note"))
            logger.info(f"DEBUG: [get_note_info] 笔记解析完成: {result.get('title')}")

            comment_count = result.get("interactInfo", {}).get("commentCount", 0)
            logger.info(f"DEBUG: [get_note_info] 评论数量: {comment_count}")
            # Compare as text: the count is compared against the string "0" while
            # the fallback default is the int 0, which never equalled "0".
            if str(comment_count) != "0":
                # Page through the comment area by scrolling (at most 25 rounds).
                for scroll_round in range(25):
                    if state.get("risk_detected"):
                        raise RiskControlException(f"Risk control detected during scroll: {state.get('risk_msg')}")
                    if len(comments) >= comment_size or not state["has_more"]:
                        break
                    container = await self.page.evaluate_handle(find_scroll_container_js)
                    await self.human_scroll(container_locator=container.as_element())
                    # Log tag corrected: it used to read "[search_notes]".
                    logger.info(f"DEBUG: [get_note_info] 第 {scroll_round + 1} 次滚动, 当前评论数: {len(comments)}/{comment_count}")
                    if not comments:
                        logger.warning(
                            f"[get_note_info] 第 {scroll_round + 1} 次滚动仍未抓取到评论,页面可能尚未加载完成"
                        )

                # Drill down into replies of the first `comment_size` comments.
                for comment_index, comment in enumerate(comments):
                    if comment_index >= comment_size:
                        break
                    comment_id = comment.get("id")

                    comment_item = self.page.locator(f"[id='comment-{comment_id}']")
                    # NOTE(review): a comment missing from the DOM ends the whole
                    # drill-down rather than skipping that comment — confirm intent.
                    if await comment_item.count() == 0:
                        break

                    container = self.page.locator("div").filter(has=comment_item).last

                    for _expand_round in range(25):
                        if state.get("risk_detected"):
                            raise RiskControlException(f"Risk control detected during sub-comment expansion: {state.get('risk_msg')}")
                        # Only target expand buttons that are currently visible.
                        btns = container.get_by_text(expand_replies_pattern).filter(visible=True)
                        count = await btns.count()
                        if count == 0:
                            break

                        current_sub_comment_count = len(comment.get("sub_comments", []))
                        if current_sub_comment_count >= sub_comment_size:
                            break

                        for btn_index in range(count):
                            try:
                                btn = btns.nth(btn_index)
                                # Bring the button into view before clicking.
                                await btn.scroll_into_view_if_needed()
                                await asyncio.sleep(random.uniform(0.5, 1.2))
                                # Click with a randomized press delay.
                                await btn.click(timeout=3000, delay=random.randint(50, 150))
                                # Give the reply API time to respond after the click.
                                await asyncio.sleep(random.uniform(1.2, 2.5))
                            except Exception:
                                # Best-effort: a button that vanished or cannot be
                                # clicked is skipped. `except Exception` (not a bare
                                # except) lets task cancellation propagate.
                                continue

                        # Pause before checking whether new buttons appeared.
                        await asyncio.sleep(1.5)

            result = self._format_dict(result)
            result["comments"] = [self._format_dict(n) for n in comments][:comment_size]
            return result
        except Exception as e:
            logger.error(f"Search note failed: {e}")
            traceback.print_exc()
            raise
554
+
555
async def search_notes(
    self,
    keyword: str,
    size: int = 20,
    sort_by: str | None = None,
    note_type: str | None = None,
    publish_time: str | None = None,
    search_scope: str | None = None,
    location: str | None = None,
):
    """Search for notes by keyword with robust anti-bot measures.

    Opens the search result page, optionally applies filters, then scrolls
    (at most 25 rounds) while collecting notes from intercepted
    ``/api/sns/web/v1/search/notes`` responses. If nothing was intercepted,
    the feeds stored in the page's initial state are used instead.

    Args:
        keyword: search keyword.
        size: maximum number of notes to return.
        sort_by, note_type, publish_time, search_scope, location: optional
            filter criteria forwarded to ``_apply_note_search_filters``.

    Returns:
        list[dict]: at most ``size`` notes with snake_case keys.

    Raises:
        RiskControlException: when an error/risk-control page is shown or the
            search API signalled risk control.
    """
    logger.info(f"DEBUG: [search_notes] 关键词: {keyword}")
    result = {}  # note id -> raw item; keyed by id so repeated items collapse
    state = {"has_more": True, "risk_detected": False, "risk_msg": ""}

    async def handle_response(response):
        if "/api/sns/web/v1/search/notes" in response.url:
            if response.status != 200:
                state["risk_detected"] = True
                state["risk_msg"] = f"Search API status {response.status}"
                # bind() only returns a logger carrying the extra fields; a
                # logging method must be called on it or nothing is emitted.
                logger.bind(sys={
                    "module": "SEARCH_NOTES",
                    "action": "HANDLE_RESPONSE",
                    "status": "WARN",
                    "raw_data": {"url": str(response.url), "status": response.status, "message": response.status_text},
                    "message": "拦截到 response 事件。response 响应失败 → 触发重试机制。"
                }).warning(f"Search blocked by risk control: {response.status}")
                # The failure is reported via `state`; an exception instance
                # used to be returned here, which is now a plain return.
                return
            else:
                try:
                    try:
                        data = await response.json()
                    except Exception as e:
                        # The response body could not be parsed as JSON.
                        logger.error(f"Search note failed: {e}")
                        return
                    if not data.get("success"):
                        # The API reported a business-level failure.
                        msg = data.get("msg", "")
                        if "薯队长遇到了点小麻烦" in msg or "风控" in msg:
                            state["risk_detected"] = True
                            state["risk_msg"] = msg
                        logger.error(f"Rednote API error: {data}")
                        return

                    payload = data.get('data', {})
                    items = payload.get('items', [])
                    state["has_more"] = payload.get('has_more', False)
                    for item in items:
                        # Only entries typed "note" are kept.
                        if item.get("model_type") == "note":
                            note_id = item.get('id')
                            if note_id:
                                result[note_id] = item
                except Exception as e:
                    if "No resource with given identifier found" in str(e):
                        return
                    logger.error(f"JSON parse error: {e}")

    from rednote_cli._runtime.core.browser.manager import browser_manager

    # NOTE(review): the previous comment here claimed double encoding is
    # required, but the keyword is percent-encoded only once — confirm which
    # of the two is intended.
    encoded_keyword = quote(keyword)
    search_url = f"https://www.xiaohongshu.com/search_result?keyword={encoded_keyword}&source=web_search_result_notes"

    async with browser_manager.observe_responses(self.page, handle_response):
        try:
            # Add Referer to look more like a real search from home page
            await self.page.set_extra_http_headers({"Referer": "https://www.xiaohongshu.com/"})

            logger.info(f"DEBUG: [search_notes] 正在打开搜索页: {search_url}")
            # Navigate and wait for a bit more than just domcontentloaded
            await self.page.goto(search_url, wait_until="load")
            await asyncio.sleep(random.uniform(2.0, 4.0))

            user_tab_selector = "#channel-container"
            await self.page.wait_for_selector(user_tab_selector, timeout=30000)
            await self.human_click(f"{user_tab_selector} >> text=全部")
            await asyncio.sleep(random.uniform(1.5, 3.0))

            filters_applied = await self._apply_note_search_filters(
                sort_by=sort_by,
                note_type=note_type,
                publish_time=publish_time,
                search_scope=search_scope,
                location=location,
            )
            if filters_applied:
                # Drop pre-filter responses captured during initial page load.
                # NOTE(review): _apply_note_search_filters waits for network
                # idle before returning, so responses triggered by the filter
                # clicks may already be in `result` and get cleared here too —
                # confirm the first filtered page is not lost.
                result.clear()
                state["has_more"] = True
                state["risk_detected"] = False
                state["risk_msg"] = ""

            # Check if we hit the "Page not found" or "Risk control" page
            content = await self.page.content()
            if "你访问的页面不见了" in content or "验证码" in content:
                raise RiskControlException(f"Detected 'Captain Potato' risk control on page: {keyword}")
            if "薯队长遇到了点小麻烦" in content:
                raise RiskControlException("Detected 'Captain Potato' risk control on search page")

            for i in range(25):
                if state.get("risk_detected"):
                    raise RiskControlException(f"Risk control detected during search scroll: {state.get('risk_msg')}")
                if len(result) >= size or not state["has_more"]:
                    break
                logger.info(f"DEBUG: [search_notes] 第 {i + 1} 次滚动, 当前结果数: {len(result)}")
                if not result:
                    # Nothing captured yet: click the "全部" tab again before scrolling.
                    await self.page.wait_for_selector(user_tab_selector, timeout=30000)
                    await self.human_click(f"{user_tab_selector} >> text=全部")
                    await asyncio.sleep(random.uniform(1.5, 3.0))
                await self.human_scroll()

            formatted = [self._format_dict(n) for n in result.values()][:size]
            if formatted:
                return formatted

            # No intercepted data: fall back to the feeds in the page state.
            fallback = await self._extract_search_feeds_from_initial_state()
            return fallback[:size]
        except Exception as e:
            logger.error(f"Search note failed: {e}")
            raise
676
+
677
async def search_users(self, keyword: str, size: int = 20):
    """Search for users by keyword and return up to ``size`` formatted records.

    Navigates to the search-result page for ``keyword``, switches to the
    "用户" (users) tab and scrolls, collecting the payloads of every
    ``/api/sns/web/v1/search/usersearch`` response observed on the page.
    Users are de-duplicated by their ``red_id``.

    :param keyword: Search keyword; URL-encoded before being placed in the URL.
    :param size: Maximum number of users to return.
    :return: List of collected users, each passed through ``self._format_dict``.
    :raises RiskControlException: If the page content or a user-search API
        response indicates that risk control was triggered.
    """
    logger.info(f"DEBUG: [search_users] 关键词: {keyword}")
    result = {}
    state = {"has_more": True, "risk_detected": False, "risk_msg": ""}

    async def handle_response(response):
        # Only the user-search endpoint is of interest.
        if "/api/sns/web/v1/search/usersearch" not in response.url:
            return

        # Any non-200 status on this endpoint is treated as risk control.
        if response.status != 200:
            state["risk_detected"] = True
            state["risk_msg"] = f"User search API status {response.status}"
            logger.warning(f"User search blocked by risk control: {response.status}")
            return

        try:
            data = await response.json()

            # Business-level error check.
            if not data.get("success"):
                # `msg` may be present but null; normalise so the
                # substring checks below cannot raise.
                msg = data.get("msg") or ""
                if "薯队长遇到了点小麻烦" in msg or "风控" in msg:
                    state["risk_detected"] = True
                    state["risk_msg"] = msg
                logger.error(f"Rednote API error: {data}")
                return

            # `data` / `users` may be null in the payload; fall back to
            # empty containers instead of failing with an AttributeError
            # that would be misreported as a JSON parse error.
            payload = data.get("data") or {}
            state["has_more"] = payload.get("has_more", False)
            for item in payload.get("users") or []:
                user_red_id = item.get("red_id")
                if user_red_id:
                    result[user_red_id] = item
        except Exception as e:
            # Raised when the response body is no longer available
            # (e.g. after a navigation); nothing useful to log.
            if "No resource with given identifier found" in str(e):
                return
            logger.error(f"JSON parse error: {e}")

    # Imported inside the function, as in the original code.
    from rednote_cli._runtime.core.browser.manager import browser_manager

    encoded_keyword = quote(keyword)
    search_url = f"https://www.xiaohongshu.com/search_result?keyword={encoded_keyword}&source=web_search_result_notes"
    tab_container = "#channel-container"

    async def open_user_tab():
        # Wait for the tab bar, click the "用户" (users) tab, then pause.
        await self.page.wait_for_selector(tab_container, timeout=30000)
        await self.human_click(f"{tab_container} >> text=用户")
        await asyncio.sleep(random.uniform(1.5, 3.0))

    async with browser_manager.observe_responses(self.page, handle_response):
        try:
            # Send a Referer header pointing at the site root.
            await self.page.set_extra_http_headers({"Referer": "https://www.xiaohongshu.com/"})

            logger.info("DEBUG: [search_users] 正在进入搜索结果页...")
            await self.page.goto(search_url, wait_until="load")
            await asyncio.sleep(random.uniform(2.0, 4.0))

            # Fail fast if the landing page is an error / risk-control page.
            content = await self.page.content()
            if "你访问的页面不见了" in content or "验证码" in content:
                raise RiskControlException(f"Detected 'Captain Potato' risk control on page: {keyword}")
            if "薯队长遇到了点小麻烦" in content:
                raise RiskControlException("Detected 'Captain Potato' risk control on search page")

            await open_user_tab()

            # Scroll loop, bounded to 25 iterations.
            for i in range(25):
                if state.get("risk_detected"):
                    raise RiskControlException(f"Risk control detected during user search scroll: {state.get('risk_msg')}")
                if len(result) >= size or not state["has_more"]:
                    break
                logger.info(f"DEBUG: [search_users] 第 {i + 1} 次滚动, 当前用户数: {len(result)}")
                if not result:
                    # Nothing captured yet: click the users tab again
                    # before scrolling.
                    await open_user_tab()
                await self.human_scroll()

            return [self._format_dict(u) for u in result.values()][:size]
        except Exception as e:
            logger.error(f"Search user failed: {e}")
            raise
756
+
757
async def publish_note(
    self,
    target: str,
    image_list: list | None = None,
    title: str = "",
    content: str = "",
    tags: list | None = None,
    schedule_at: datetime | str | None = None,
):
    """Publish a note of the given type.

    :param target: Note type: ``video``, ``image`` or ``article``
        (case-insensitive, surrounding whitespace ignored).
    :param image_list: Media list. Required and non-empty for
        ``target="image"`` (local paths, image URLs, or a mix); for
        ``target="video"`` it must contain exactly one video asset.
    :param title: Note title; ``None`` is treated as an empty string.
    :param content: Note body; ``None`` is treated as an empty string.
    :param tags: Optional list of tags; ``None`` is treated as no tags.
    :param schedule_at: Optional scheduled publish time. A non-blank string
        is parsed with ``parse_rfc3339``; a blank string means "no schedule".
    :return: Whatever the underlying ``RednotePublisher`` method returns.
    :raises RiskControlException: Re-raised unchanged so the caller's retry
        mechanism can handle it.
    :raises PublishNoteException: Re-raised unchanged after being logged.
    :raises PublishExecutionError: Wraps any other unexpected exception.
    """

    def resolve_schedule():
        # Strings are parsed as RFC 3339. A blank string is normalised to
        # None so the publisher never receives a raw ``str`` where it
        # expects a datetime or nothing.
        if isinstance(schedule_at, str):
            return parse_rfc3339(schedule_at) if schedule_at.strip() else None
        return schedule_at

    try:
        target_text = (target or "").strip().lower()
        if not target_text:
            raise InvalidPublishParameterError("`target` 必须是非空字符串")

        publisher = RednotePublisher(self.page)

        if target_text == "image":
            if not image_list:
                raise InvalidPublishParameterError("`image_list` 在图文发布时不能为空")
            return await publisher.publish_image_note(
                image_list=image_list,
                title=title or "",
                content=content or "",
                tags=tags or [],
                schedule_at=resolve_schedule(),
            )

        if target_text == "video":
            if not image_list:
                raise InvalidPublishParameterError("`image_list` 在视频发布时不能为空")
            if len(image_list) != 1:
                raise InvalidPublishParameterError("`target=video` 时仅支持 1 个视频素材")
            return await publisher.publish_video_note(
                video_path=image_list[0],
                title=title or "",
                content=content or "",
                tags=tags or [],
                schedule_at=resolve_schedule(),
            )

        if target_text == "article":
            # The article workflow is recognised but not implemented yet.
            raise PublishWorkflowNotReadyError("article 发布流程尚未实现")

        raise UnsupportedPublishTargetError(f"不支持的 target: {target},当前支持: video, image, article")

    except RiskControlException:
        # Propagate untouched so the retry mechanism can react to it.
        logger.warning("发布过程中触发风控,交由重试机制处理")
        raise
    except PublishNoteException as e:
        logger.error(str(e))
        raise
    except Exception as e:
        # Anything else is unexpected: log with traceback and wrap it.
        logger.exception(f"[publish_note] Unexpected error: {e}")
        raise PublishExecutionError(f"发布笔记失败: {e}") from e