rednote-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rednote_cli/__init__.py +5 -0
- rednote_cli/_runtime/__init__.py +0 -0
- rednote_cli/_runtime/common/__init__.py +0 -0
- rednote_cli/_runtime/common/app_utils.py +77 -0
- rednote_cli/_runtime/common/config.py +83 -0
- rednote_cli/_runtime/common/enums.py +17 -0
- rednote_cli/_runtime/common/errors.py +22 -0
- rednote_cli/_runtime/core/__init__.py +0 -0
- rednote_cli/_runtime/core/account_manager.py +349 -0
- rednote_cli/_runtime/core/browser/__init__.py +0 -0
- rednote_cli/_runtime/core/browser/manager.py +247 -0
- rednote_cli/_runtime/core/database/__init__.py +0 -0
- rednote_cli/_runtime/core/database/manager.py +334 -0
- rednote_cli/_runtime/platforms/__init__.py +0 -0
- rednote_cli/_runtime/platforms/base.py +62 -0
- rednote_cli/_runtime/platforms/factory.py +55 -0
- rednote_cli/_runtime/platforms/publishing/__init__.py +12 -0
- rednote_cli/_runtime/platforms/publishing/media.py +275 -0
- rednote_cli/_runtime/platforms/publishing/models.py +59 -0
- rednote_cli/_runtime/platforms/publishing/validator.py +124 -0
- rednote_cli/_runtime/services/__init__.py +1 -0
- rednote_cli/_runtime/services/scraper_service.py +235 -0
- rednote_cli/adapters/__init__.py +1 -0
- rednote_cli/adapters/output/__init__.py +1 -0
- rednote_cli/adapters/output/event_stream.py +29 -0
- rednote_cli/adapters/output/formatter_json.py +23 -0
- rednote_cli/adapters/output/formatter_table.py +39 -0
- rednote_cli/adapters/output/writer.py +17 -0
- rednote_cli/adapters/persistence/__init__.py +1 -0
- rednote_cli/adapters/persistence/file_account_repo.py +51 -0
- rednote_cli/adapters/platform/__init__.py +1 -0
- rednote_cli/adapters/platform/rednote/__init__.py +1 -0
- rednote_cli/adapters/platform/rednote/extractor.py +65 -0
- rednote_cli/adapters/platform/rednote/publisher.py +26 -0
- rednote_cli/adapters/platform/rednote/runtime_extractor.py +818 -0
- rednote_cli/adapters/platform/rednote/runtime_publisher.py +373 -0
- rednote_cli/adapters/platform/rednote/runtime_registration.py +20 -0
- rednote_cli/application/__init__.py +1 -0
- rednote_cli/application/dto/__init__.py +1 -0
- rednote_cli/application/dto/input_models.py +121 -0
- rednote_cli/application/dto/output_models.py +78 -0
- rednote_cli/application/use_cases/__init__.py +1 -0
- rednote_cli/application/use_cases/account_list.py +9 -0
- rednote_cli/application/use_cases/account_mutation.py +22 -0
- rednote_cli/application/use_cases/auth_login.py +64 -0
- rednote_cli/application/use_cases/auth_status.py +96 -0
- rednote_cli/application/use_cases/doctor.py +49 -0
- rednote_cli/application/use_cases/init_runtime.py +20 -0
- rednote_cli/application/use_cases/note_get.py +22 -0
- rednote_cli/application/use_cases/note_search.py +26 -0
- rednote_cli/application/use_cases/publish_note.py +25 -0
- rednote_cli/application/use_cases/user_get.py +18 -0
- rednote_cli/application/use_cases/user_search.py +8 -0
- rednote_cli/application/use_cases/user_self.py +8 -0
- rednote_cli/cli/__init__.py +1 -0
- rednote_cli/cli/__main__.py +5 -0
- rednote_cli/cli/commands/__init__.py +1 -0
- rednote_cli/cli/commands/account.py +204 -0
- rednote_cli/cli/commands/doctor.py +20 -0
- rednote_cli/cli/commands/init.py +20 -0
- rednote_cli/cli/commands/note.py +101 -0
- rednote_cli/cli/commands/publish.py +147 -0
- rednote_cli/cli/commands/search.py +185 -0
- rednote_cli/cli/commands/user.py +113 -0
- rednote_cli/cli/main.py +163 -0
- rednote_cli/cli/options.py +13 -0
- rednote_cli/cli/runtime.py +142 -0
- rednote_cli/cli/utils.py +74 -0
- rednote_cli/domain/__init__.py +1 -0
- rednote_cli/domain/errors.py +50 -0
- rednote_cli/domain/note_search_filters.py +155 -0
- rednote_cli/infra/__init__.py +1 -0
- rednote_cli/infra/exit_codes.py +30 -0
- rednote_cli/infra/logger.py +11 -0
- rednote_cli/infra/paths.py +31 -0
- rednote_cli/infra/platforms.py +4 -0
- rednote_cli-0.1.0.dist-info/METADATA +81 -0
- rednote_cli-0.1.0.dist-info/RECORD +81 -0
- rednote_cli-0.1.0.dist-info/WHEEL +5 -0
- rednote_cli-0.1.0.dist-info/entry_points.txt +2 -0
- rednote_cli-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,818 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import shutil
|
|
3
|
+
import urllib.parse
|
|
4
|
+
from contextlib import contextmanager
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from urllib.parse import quote
|
|
8
|
+
from urllib.parse import urlparse
|
|
9
|
+
|
|
10
|
+
import requests
|
|
11
|
+
from loguru import logger
|
|
12
|
+
from yt_dlp.utils import *
|
|
13
|
+
|
|
14
|
+
from rednote_cli._runtime.common.enums import XsecSource
|
|
15
|
+
from rednote_cli._runtime.common.errors import (
|
|
16
|
+
InvalidPublishParameterError,
|
|
17
|
+
PublishMediaPreparationError,
|
|
18
|
+
PublishWorkflowNotReadyError,
|
|
19
|
+
UnsupportedPublishTargetError,
|
|
20
|
+
)
|
|
21
|
+
from rednote_cli._runtime.common.errors import (
|
|
22
|
+
PublishExecutionError,
|
|
23
|
+
PublishNoteException,
|
|
24
|
+
)
|
|
25
|
+
from rednote_cli._runtime.platforms.base import BaseExtractor, RiskControlException
|
|
26
|
+
from rednote_cli._runtime.platforms.publishing.validator import parse_rfc3339
|
|
27
|
+
from rednote_cli.adapters.platform.rednote.runtime_publisher import RednotePublisher
|
|
28
|
+
from rednote_cli.domain.note_search_filters import build_search_filter_selections
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _is_url(path: str) -> bool:
|
|
32
|
+
try:
|
|
33
|
+
parsed = urlparse(path)
|
|
34
|
+
return bool(parsed.scheme and parsed.netloc)
|
|
35
|
+
except Exception:
|
|
36
|
+
return False
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _validate_extension(file_identify: str, allowed_extensions) -> str:
|
|
40
|
+
path_part = urlparse(file_identify).path
|
|
41
|
+
suffix = Path(path_part).suffix.lower()
|
|
42
|
+
if suffix not in allowed_extensions:
|
|
43
|
+
supported = ", ".join(ext.lstrip(".") for ext in sorted(allowed_extensions))
|
|
44
|
+
raise InvalidPublishParameterError(
|
|
45
|
+
f"不支持的文件格式: {file_identify} (仅支持 {supported})"
|
|
46
|
+
)
|
|
47
|
+
return suffix
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _normalize_media_list(media_list: list, field_name: str) -> list[str]:
|
|
51
|
+
if media_list is None:
|
|
52
|
+
raise InvalidPublishParameterError(f"`{field_name}` 不能为空")
|
|
53
|
+
if not isinstance(media_list, (list, tuple)):
|
|
54
|
+
raise InvalidPublishParameterError(f"`{field_name}` 必须是 list 或 tuple")
|
|
55
|
+
if len(media_list) == 0:
|
|
56
|
+
raise InvalidPublishParameterError(f"`{field_name}` 至少包含 1 个文件")
|
|
57
|
+
|
|
58
|
+
normalized = []
|
|
59
|
+
for index, item in enumerate(media_list):
|
|
60
|
+
text = "" if item is None else str(item).strip()
|
|
61
|
+
if not text:
|
|
62
|
+
raise InvalidPublishParameterError(f"第 {index + 1} 个素材项为空,请检查输入")
|
|
63
|
+
normalized.append(text)
|
|
64
|
+
return normalized
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@contextmanager
def prepare_image_paths(image_list: list, allowed_extensions):
    """Validate/download image assets and clean up temporary files automatically.

    URL entries are downloaded into a fresh temporary directory; local entries
    must exist, be regular files and carry an allowed extension.

    Args:
        image_list: Non-empty list/tuple of image URLs or local paths.
        allowed_extensions: Accepted suffixes including the leading dot.

    Yields:
        A list of absolute file paths, in the same order as *image_list*.

    Raises:
        InvalidPublishParameterError: Invalid list, missing local file, or
            unsupported extension.
        PublishMediaPreparationError: A download failed.
    """
    # Imported locally so the function does not rely on `tempfile` leaking in
    # through the module's wildcard import.
    import tempfile

    normalized_images = _normalize_media_list(image_list, field_name="image_list")
    temp_dir = tempfile.mkdtemp(prefix="publish_upload_")
    final_paths = []
    # Loop-invariant request headers.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/110.0.0.0 Safari/537.36"
        )
    }

    logger.info(f"开始校验发布素材, count: {len(normalized_images)}")

    try:
        for index, item in enumerate(normalized_images):
            if _is_url(item):
                suffix = _validate_extension(item, allowed_extensions=allowed_extensions)
                target_path = Path(temp_dir) / f"download_{index}{suffix}"
                try:
                    # `with` releases the streamed connection even when the
                    # download is interrupted half-way.
                    with requests.get(item, headers=headers, timeout=15, stream=True) as response:
                        response.raise_for_status()
                        with open(target_path, "wb") as output:
                            for chunk in response.iter_content(chunk_size=8192):
                                output.write(chunk)
                except requests.exceptions.RequestException as e:
                    raise PublishMediaPreparationError(
                        f"图片 {index + 1} 下载失败: {item}。具体原因: {e}"
                    ) from e

                final_paths.append(str(target_path.absolute()))
                logger.info(f"图片 {index + 1} 下载成功, source: {item}, target_path: {str(target_path.absolute())}")
                continue

            local_path = Path(item)
            if not local_path.exists():
                raise InvalidPublishParameterError(f"本地图片不存在: {item}")
            if not local_path.is_file():
                raise InvalidPublishParameterError(f"路径不是一个有效文件: {item}")

            _validate_extension(item, allowed_extensions=allowed_extensions)
            final_paths.append(str(local_path.absolute()))
            logger.info(f"图片 {index + 1} 校验通过, path: {str(local_path.absolute())}")

        yield final_paths
    finally:
        # Runs on success, on validation/download errors and when the caller's
        # `with` body raises.
        if Path(temp_dir).exists():
            shutil.rmtree(temp_dir)
            logger.info(f"发布素材临时目录已清理, temp_dir: {temp_dir}")
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class RednoteExtractor(BaseExtractor):
|
|
121
|
+
"""Rednote-specific data extraction using yt_dlp InfoExtractor pattern."""
|
|
122
|
+
# 采用拦截后端访问直接获取接口返回数据
|
|
123
|
+
IE_NAME = 'rednote:all'
|
|
124
|
+
|
|
125
|
+
def __init__(self, page):
    """Create the extractor for *page*; all setup is delegated to the base class."""
    super().__init__(page)
|
|
127
|
+
|
|
128
|
+
def _to_snake_case(self, name):
|
|
129
|
+
"""将 camelCase 转换为 snake_case"""
|
|
130
|
+
import re
|
|
131
|
+
s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
|
|
132
|
+
return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
|
|
133
|
+
|
|
134
|
+
def _format_dict(self, data):
|
|
135
|
+
"""递归统一所有键名为小写下划线,不漏掉任何数据"""
|
|
136
|
+
if not isinstance(data, (dict, list)):
|
|
137
|
+
return data
|
|
138
|
+
if isinstance(data, list):
|
|
139
|
+
return [self._format_dict(i) for i in data]
|
|
140
|
+
|
|
141
|
+
new_data = {}
|
|
142
|
+
for k, v in data.items():
|
|
143
|
+
new_key = self._to_snake_case(k)
|
|
144
|
+
new_data[new_key] = self._format_dict(v)
|
|
145
|
+
return new_data
|
|
146
|
+
|
|
147
|
+
async def _apply_note_search_filters(
    self,
    *,
    sort_by: str | None = None,
    note_type: str | None = None,
    publish_time: str | None = None,
    search_scope: str | None = None,
    location: str | None = None,
) -> bool:
    """Click the requested options in the search page's filter panel.

    Returns:
        ``False`` when the arguments produce no filter selection (the page is
        left untouched), ``True`` after every selection has been clicked and
        the page has reached network idle.
    """
    chosen = build_search_filter_selections(
        sort_by=sort_by,
        note_type=note_type,
        publish_time=publish_time,
        search_scope=search_scope,
        location=location,
    )
    if not chosen:
        return False

    page = self.page
    trigger = page.locator("div.filter").first
    panel = page.locator("div.filter-panel").first

    for item in chosen:
        # Hover the trigger before every click and wait until the panel shows.
        await trigger.hover()
        await panel.wait_for(state="visible", timeout=10_000)
        selector = f"div.filter-panel div.filters:nth-child({item.filters_index}) div.tags:nth-child({item.tags_index})"
        option = page.locator(selector).first
        await option.click(timeout=10_000)
        logger.info(f"DEBUG: [search_notes] 应用筛选 {item.field}={item.value}({item.label})")
        await asyncio.sleep(random.uniform(0.5, 1.0))

    await page.wait_for_load_state("networkidle")
    await asyncio.sleep(random.uniform(1.0, 1.8))
    return True
|
|
180
|
+
|
|
181
|
+
async def _extract_search_feeds_from_initial_state(self) -> list[dict]:
|
|
182
|
+
feeds = await self.page.evaluate(
|
|
183
|
+
"""
|
|
184
|
+
() => {
|
|
185
|
+
const state = window.__INITIAL_STATE__;
|
|
186
|
+
if (!state || !state.search || !state.search.feeds) return [];
|
|
187
|
+
const feeds = state.search.feeds;
|
|
188
|
+
const data = feeds.value !== undefined ? feeds.value : feeds._value;
|
|
189
|
+
return Array.isArray(data) ? data : [];
|
|
190
|
+
}
|
|
191
|
+
"""
|
|
192
|
+
)
|
|
193
|
+
if not isinstance(feeds, list):
|
|
194
|
+
return []
|
|
195
|
+
return [self._format_dict(item) for item in feeds if isinstance(item, dict)]
|
|
196
|
+
|
|
197
|
+
async def get_self_info(self):
    """Return the logged-in user's profile, captured from the self-info APIs.

    The data is taken from observed responses of ``/api/sns/web/v2/user/me``
    and ``/api/sns/web/v1/user/selfinfo`` while the explore page is open; a
    request to the former is also triggered explicitly from the page.

    Returns:
        The collected fields with snake_case keys. The dict may be empty when
        no usable response arrived and no risk control was detected.

    Raises:
        RiskControlException: The page or an API response shows risk control.
    """
    logger.info("DEBUG: [get_self_info] 开始执行")
    # Data comes from intercepted backend responses, not from the DOM.
    result = {}
    state = {"risk_detected": False, "risk_msg": "", "self_info_obtained": False}
    event = asyncio.Event()

    me_api = "/api/sns/web/v2/user/me"
    selfinfo_api = "/api/sns/web/v1/user/selfinfo"

    async def handle_response(response):
        is_me = me_api in response.url
        if not is_me and selfinfo_api not in response.url:
            return

        if response.status != 200:
            # A failure only counts as risk control while nothing was obtained.
            if not state["self_info_obtained"]:
                state["risk_detected"] = True
                state["risk_msg"] = f"Self info API status {response.status}"
            logger.warning(f"Self info API returned non-200: {response.status}")
            return

        try:
            data = await response.json()
            if not data.get("success"):
                msg = data.get("msg", "")
                if ("薯队长遇到了点小麻烦" in msg or "风控" in msg) and not state["self_info_obtained"]:
                    state["risk_detected"] = True
                    state["risk_msg"] = msg
                logger.error(f"Rednote API error: {data}")
                return

            # `or {}` also covers an explicit `"data": null`, which would
            # otherwise surface below as a misleading "JSON parse error".
            r = data.get('data') or {}
            if is_me:
                for field in ("red_id", "user_id", "nickname", "desc", "gender", "guest", "images", "imageb"):
                    result[field] = r.get(field)
            else:
                basic_info = r.get("basic_info") or {}
                for field in ("red_id", "nickname", "desc", "gender", "images", "imageb", "ip_location"):
                    result[field] = basic_info.get(field)
                for field in ("interactions", "tags", "tab_public"):
                    result[field] = r.get(field)

            if result.get("red_id"):
                state["self_info_obtained"] = True
                state["risk_detected"] = False
                state["risk_msg"] = ""
                logger.info(f"DEBUG: [get_self_info] 成功拦截到接口数据: {result.get('nickname')}")
                event.set()
        except Exception as e:
            # This specific failure message is ignored on purpose, not logged.
            if "No resource with given identifier found" in str(e):
                return
            logger.error(f"JSON parse error: {e}")

    from rednote_cli._runtime.core.browser.manager import browser_manager

    async with browser_manager.observe_responses(self.page, handle_response):
        try:
            await self.page.goto("https://www.xiaohongshu.com/explore", wait_until="domcontentloaded")
            await self.page.wait_for_selector(".main-container .user .link-wrapper .channel", timeout=30000)
            logger.info("DEBUG: [get_self_info] 正在触发用户信息请求...")
            await self.page.evaluate(
                "() => fetch('/api/sns/web/v2/user/me', { credentials: 'include' }).catch(() => null)"
            )
            await self.page.wait_for_timeout(500)

            try:
                # Wait up to 10 seconds for a usable API response.
                await asyncio.wait_for(event.wait(), timeout=10.0)
            except asyncio.TimeoutError:
                logger.warning("DEBUG: [get_self_info] 等待接口返回超时")

            if result.get("red_id"):
                return self._format_dict(result)

            # Fetch the HTML once and reuse it for both checks.
            content = await self.page.content()
            if "验证码" in content:
                # A captcha is only logged; it does not abort the call.
                logger.error("Detected Captcha on self page.")
            if "薯队长遇到了点小麻烦" in content:
                raise RiskControlException("Detected 'Captain Potato' risk control on self page")
            if state["risk_detected"]:
                raise RiskControlException(f"Risk control detected during self info extraction: {state['risk_msg']}")
            return self._format_dict(result)
        except Exception as e:
            logger.error(f"Extract self info failed: {e}")
            raise
|
|
289
|
+
|
|
290
|
+
# initial_state = self._search_json(
|
|
291
|
+
# r'window\s*\.\s*__INITIAL_STATE__\s*=',
|
|
292
|
+
# content, 'initial state', 'self',
|
|
293
|
+
# end_pattern=r'</script>', transform_source=js_to_json, default={}
|
|
294
|
+
# )
|
|
295
|
+
#
|
|
296
|
+
# return traverse_obj(initial_state, {
|
|
297
|
+
# "user_id": ("user", "userInfo", ("user_id", "userId"), {str}, any),
|
|
298
|
+
# "red_id": ("user", "userInfo", ("red_id", "redId"), {str}, any),
|
|
299
|
+
# "nickname": ("user", "userInfo", "nickname", {str}),
|
|
300
|
+
# "desc": ("user", "userInfo", "desc", {str}),
|
|
301
|
+
# "gender": ("user", "userInfo", "gender", {int}),
|
|
302
|
+
# "images": ("user", "userInfo", "images", {str}),
|
|
303
|
+
# "imageb": ("user", "userInfo", "imageb", {str}),
|
|
304
|
+
# "guest": ("user", "userInfo", "guest", {bool}),
|
|
305
|
+
# "logged_in": ("user", "loggedIn", {bool}),
|
|
306
|
+
# "ip_location": ("user", "userPageData", "basicInfo", "ipLocation", {str}),
|
|
307
|
+
# "fans": ("user", "userPageData", "interactions", lambda _, v: v.get("type") == "fans", "count", {str_to_int}, any),
|
|
308
|
+
# "follows": ("user", "userPageData", "interactions", lambda _, v: v.get("type") == "follows", "count", {str_to_int}, any),
|
|
309
|
+
# "interaction": ("user", "userPageData", "interactions", lambda _, v: v.get("type") == "interaction", "count", {str_to_int}, any),
|
|
310
|
+
# "tags": (
|
|
311
|
+
# 'user', 'userPageData', 'tags', ...,
|
|
312
|
+
# {
|
|
313
|
+
# "name": ("name", {str}),
|
|
314
|
+
# "icon": ("icon", {str}),
|
|
315
|
+
# "type": ("tagType", {str}),
|
|
316
|
+
# }
|
|
317
|
+
# ),
|
|
318
|
+
# })
|
|
319
|
+
|
|
320
|
+
async def get_user_info(self, user_id: str, xsec_token: str = None, xsec_source: XsecSource = XsecSource.PC_FEED):
    """Extract a user's profile from the profile page's embedded state.

    The home page is opened first, then the profile page; the profile is
    parsed from ``window.__INITIAL_STATE__`` in the returned HTML.

    Args:
        user_id: Target user id (used in the profile URL and in the result).
        xsec_token: Optional token appended to the profile URL.
        xsec_source: Value sent as the ``xsec_source`` query parameter.

    Returns:
        The extracted profile fields with snake_case keys, plus ``user_id``.

    Raises:
        RiskControlException: The profile page shows the risk-control notice.
    """
    logger.info(f"DEBUG: [get_user_info] 目标 ID: {user_id}?xsec_source={xsec_source.value}")
    # Opening the profile page does not trigger `/api/sns/web/v1/user/otherinfo`
    # or `/api/sns/web/v1/user_posted`, so the HTML is parsed instead.
    try:
        # The query string has to start with `?`; previously the token was
        # appended with `&` to a URL without any query, and `xsec_source`
        # was never sent.
        url = f"https://www.xiaohongshu.com/user/profile/{user_id}?xsec_source={xsec_source.value}"
        if xsec_token:
            url += f"&xsec_token={xsec_token}"

        await self.page.set_extra_http_headers({"Referer": "https://www.xiaohongshu.com/"})
        await self.page.goto("https://www.xiaohongshu.com", wait_until="domcontentloaded")
        await self.page.wait_for_selector(".side-bar .user a", timeout=30000)
        await asyncio.sleep(random.uniform(1.0, 2.0))

        logger.info(f"DEBUG: [get_user_info] 正在跳转: {url}")
        await self.page.goto(url, wait_until="load")
        await asyncio.sleep(random.uniform(2.0, 4.0))

        # Blocked-page detection. The first case is only logged; extraction
        # still proceeds on whatever state is present.
        content = await self.page.content()
        if "你访问的页面不见了" in content or "验证码" in content or "未连接到服务器" in content:
            logger.error(f"要访问该用户({user_id})信息, 必须传入 xsec_token 参数")
        if "薯队长遇到了点小麻烦" in content:
            raise RiskControlException(f"Detected 'Captain Potato' risk control on user profile: {user_id}")

        initial_state = self._search_json(
            r'window\s*\.\s*__INITIAL_STATE__\s*=',
            content, 'initial state', user_id,
            end_pattern=r'</script>', transform_source=js_to_json, default={}
        )
        basic_info = ("user", "userPageData", "basicInfo")
        page_data = ("user", "userPageData")
        result = traverse_obj(initial_state, {
            "red_id": (*basic_info, "redId", {str}),
            "nickname": (*basic_info, "nickname", {str}),
            "desc": (*basic_info, "desc", {str}),
            "gender": (*basic_info, "gender", {int}),
            "images": (*basic_info, "images", {str}),
            "imageb": (*basic_info, "imageb", {str}),
            "ip_location": (*basic_info, "ipLocation", {str}),
            "interactions": (*page_data, "interactions", {list}),
            "tags": (*page_data, "tags", {list}),
            "tab_public": (*page_data, "tabPublic", {list}),
            "notes": ('user', 'notes', ..., ...),
        })

        result["user_id"] = user_id
        logger.info(f"DEBUG: [get_user_info] 提取完成,昵称: {result.get('nickname')}")
        return self._format_dict(result)
    except Exception as e:
        logger.error(f"Failed to extract user info for {user_id}: {e}")
        raise
|
|
370
|
+
|
|
371
|
+
async def get_note_info(self, note_id: str, xsec_token: str = None, xsec_source: XsecSource = XsecSource.PC_FEED, comment_size: int = 10, sub_comment_size: int = 5):
    """Fetch one note's detail together with its comments and replies.

    The note is parsed from ``window.__INITIAL_STATE__`` in the page HTML.
    Comments and replies are collected from observed comment API responses
    while the comment area is scrolled and "expand replies" buttons are
    clicked.

    Args:
        note_id: Target note id.
        xsec_token: Optional token appended to the note URL.
        xsec_source: Value sent as the ``xsec_source`` query parameter.
        comment_size: Scrolling stops once this many top-level comments were
            collected; the returned list is cut to this length.
        sub_comment_size: Replies of a comment are no longer expanded once
            this many were collected for it.

    Returns:
        The note with snake_case keys and a ``comments`` list.

    Raises:
        RiskControlException: The page or a comment API shows risk control.
    """
    logger.info(f"DEBUG: [get_note_info] 目标笔记: {note_id}")
    # Opening the note page does not trigger `/api/sns/web/v1/feed`, so the
    # note comes from the HTML while comments come from intercepted responses.
    comments = []
    state = {"has_more": True, "risk_detected": False, "risk_msg": ""}
    # Compiled once; reused for every expansion round.
    expand_reply_pattern = re.compile(r"展开\s*\d+\s*条回复|展开更多回复")

    async def handle_response(response):
        is_sub_page = "/api/sns/web/v2/comment/sub/page" in response.url
        is_comment_page = not is_sub_page and "/api/sns/web/v2/comment/page" in response.url
        if not (is_sub_page or is_comment_page):
            return

        api_name = "Sub-comment" if is_sub_page else "Comment"
        if response.status != 200:
            state["risk_detected"] = True
            state["risk_msg"] = f"{api_name} API status {response.status}"
            logger.warning(f"{api_name} API blocked by risk control: {response.status}")
            return

        try:
            data = await response.json()
            if not data.get("success"):
                msg = data.get("msg", "")
                if "薯队长遇到了点小麻烦" in msg or "风控" in msg:
                    state["risk_detected"] = True
                    state["risk_msg"] = msg
                logger.error(f"Rednote API error: {data}")
                return

            # `or` also covers explicit nulls in the payload.
            payload = data.get('data') or {}
            page_comments = payload.get("comments") or []

            if not is_sub_page:
                state["has_more"] = payload.get('has_more', False)
                comments.extend(page_comments)
                logger.info(f"DEBUG: [get_note_info] 拦截到评论数据: {len(comments)} 条")
                return

            # Replies are attached to their parent, found via the
            # `root_comment_id` query parameter of the request URL.
            query_params = urllib.parse.parse_qs(urllib.parse.urlparse(str(response.url)).query)
            root_comment_id = query_params.get('root_comment_id', [None])[0]
            sub_comment_count = ""
            now_sub_comment_count = -1
            for c in comments:
                if c.get("id") != root_comment_id:
                    continue
                sub_comment_count = c.get("sub_comment_count", "")
                replies = c.get("sub_comments")
                if replies is None:
                    # Store the list on the comment so replies are not lost
                    # when the key was missing.
                    replies = c["sub_comments"] = []
                replies.extend(page_comments)
                now_sub_comment_count = len(replies)
                break

            logger.info(f"DEBUG: [get_note_info] 拦截到 {root_comment_id} 二级评论数据: {now_sub_comment_count}/{sub_comment_count} 条")
        except Exception as e:
            # This specific failure message is ignored on purpose, not logged.
            if "No resource with given identifier found" in str(e):
                return
            # `logger.exception` records the traceback through the logger.
            logger.exception(f"JSON parse error: {e}")

    from rednote_cli._runtime.core.browser.manager import browser_manager

    async with browser_manager.observe_responses(self.page, handle_response):
        try:
            url = f"https://www.xiaohongshu.com/explore/{note_id}?xsec_source={xsec_source.value}"
            if xsec_token:
                url += f"&xsec_token={xsec_token}"

            await self.page.set_extra_http_headers({"Referer": "https://www.xiaohongshu.com/"})
            await self.page.goto("https://www.xiaohongshu.com", wait_until="domcontentloaded")
            await self.page.wait_for_selector(".side-bar .user a", timeout=30000)
            await asyncio.sleep(random.uniform(1.0, 2.0))

            logger.info(f"DEBUG: [get_note_info] 正在跳转: {url}")
            await self.page.goto(url, wait_until="load")
            await asyncio.sleep(random.uniform(2.0, 4.0))

            content = await self.page.content()
            if (
                "你访问的页面不见了" in content
                or "当前笔记暂时无法浏览" in content
                or "薯队长遇到了点小麻烦" in content
            ):
                raise RiskControlException(f"Detected 'Captain Potato' risk control on page: {note_id}")

            initial_state = self._search_json(
                r'window\s*\.\s*__INITIAL_STATE__\s*=',
                content, 'initial state', note_id,
                end_pattern=r'</script>', transform_source=js_to_json, default={}
            )

            # NOTE(review): if the note is absent from the state, `result` is
            # presumably None and the next line raises AttributeError — confirm
            # whether a dedicated error is wanted here.
            result = traverse_obj(initial_state, ("note", "noteDetailMap", note_id, "note"))
            logger.info(f"DEBUG: [get_note_info] 笔记解析完成: {result.get('title')}")

            comment_count = result.get("interactInfo", {}).get("commentCount", 0)
            logger.info(f"DEBUG: [get_note_info] 评论数量: {comment_count}")
            # Compare as text so the int default 0 is also treated as zero.
            if str(comment_count) != "0":
                # Page through the comment area by scrolling.
                for scroll_round in range(25):
                    if state.get("risk_detected"):
                        raise RiskControlException(f"Risk control detected during scroll: {state.get('risk_msg')}")
                    if len(comments) >= comment_size or not state["has_more"]:
                        break
                    container = await self.page.evaluate_handle(r'''() => {
                        // 1. 寻找包含“条评论”的业务逻辑锚点
                        const divs = Array.from(document.querySelectorAll('div'));
                        const marker = divs.find(el => el.innerText && /共\s*\d+\s*条评论/.test(el.innerText));
                        if (!marker) return document.querySelector('.note-scroller'); // 兜底方案

                        // 2. 向上递归寻找最近的滚动容器
                        let parent = marker;
                        while (parent) {
                            const style = window.getComputedStyle(parent);
                            if (style.overflowY === 'scroll' || style.overflowY === 'auto') {
                                return parent;
                            }
                            parent = parent.parentElement;
                        }
                        return document.querySelector('.note-scroller');
                    }''')
                    await self.human_scroll(container_locator=container.as_element())
                    logger.info(f"DEBUG: [get_note_info] 第 {scroll_round + 1} 次滚动, 当前评论数: {len(comments)}/{comment_count}")
                    if len(comments) == 0:
                        logger.warning(
                            f"[get_note_info] 第 {scroll_round + 1} 次滚动仍未抓取到评论,页面可能尚未加载完成"
                        )

                # Drill into replies of the collected top-level comments.
                for comment_index, comment in enumerate(comments):
                    if comment_index >= comment_size:
                        break
                    comment_id = comment.get("id")

                    comment_item = self.page.locator(f"[id='comment-{comment_id}']")
                    if await comment_item.count() == 0:
                        break

                    container = self.page.locator("div").filter(has=comment_item).last

                    for _ in range(25):
                        if state.get("risk_detected"):
                            raise RiskControlException(f"Risk control detected during sub-comment expansion: {state.get('risk_msg')}")
                        # Only the currently visible expand buttons.
                        btns = container.get_by_text(expand_reply_pattern).filter(visible=True)
                        count = await btns.count()
                        if count == 0:
                            break

                        current_sub_comment_count = len(comment.get("sub_comments") or [])
                        if current_sub_comment_count >= sub_comment_size:
                            break

                        for btn_index in range(count):
                            try:
                                btn = btns.nth(btn_index)
                                await btn.scroll_into_view_if_needed()
                                await asyncio.sleep(random.uniform(0.5, 1.2))
                                await btn.click(timeout=3000, delay=random.randint(50, 150))
                                # Leave time for the reply API to respond.
                                await asyncio.sleep(random.uniform(1.2, 2.5))
                            except Exception as click_error:
                                # Best effort: one failed button must not stop
                                # the others. `Exception` (not bare `except`)
                                # lets task cancellation propagate.
                                logger.debug(f"[get_note_info] expand button click skipped: {click_error}")
                                continue

                        # Give newly revealed buttons time to appear.
                        await asyncio.sleep(1.5)

            result = self._format_dict(result)
            result["comments"] = [self._format_dict(n) for n in comments[:comment_size]]
            return result
        except Exception as e:
            logger.exception(f"Get note info failed: {e}")
            raise
|
|
554
|
+
|
|
555
|
+
async def search_notes(
|
|
556
|
+
self,
|
|
557
|
+
keyword: str,
|
|
558
|
+
size: int = 20,
|
|
559
|
+
sort_by: str | None = None,
|
|
560
|
+
note_type: str | None = None,
|
|
561
|
+
publish_time: str | None = None,
|
|
562
|
+
search_scope: str | None = None,
|
|
563
|
+
location: str | None = None,
|
|
564
|
+
):
|
|
565
|
+
"""Search for notes by keyword with robust anti-bot measures."""
|
|
566
|
+
logger.info(f"DEBUG: [search_notes] 关键词: {keyword}")
|
|
567
|
+
result = {}
|
|
568
|
+
state = {"has_more": True, "risk_detected": False, "risk_msg": ""}
|
|
569
|
+
|
|
570
|
+
async def handle_response(response):
|
|
571
|
+
if "/api/sns/web/v1/search/notes" in response.url:
|
|
572
|
+
if response.status != 200:
|
|
573
|
+
state["risk_detected"] = True
|
|
574
|
+
state["risk_msg"] = f"Search API status {response.status}"
|
|
575
|
+
logger.bind(sys={
|
|
576
|
+
"module": "SEARCH_NOTES",
|
|
577
|
+
"action": "HANDLE_RESPONSE",
|
|
578
|
+
"status": "WARN",
|
|
579
|
+
"raw_data": {"url": str(response.url), "status": response.status, "message": response.status_text},
|
|
580
|
+
"message": "拦截到 response 事件。response 响应失败 → 触发重试机制。"
|
|
581
|
+
})
|
|
582
|
+
return RiskControlException()
|
|
583
|
+
else:
|
|
584
|
+
try:
|
|
585
|
+
try:
|
|
586
|
+
data = await response.json()
|
|
587
|
+
except Exception as e:
|
|
588
|
+
# response 解析 json 失败
|
|
589
|
+
logger.error(f"Search note failed: {e}")
|
|
590
|
+
return
|
|
591
|
+
if not data.get("success"):
|
|
592
|
+
# response 响应失败
|
|
593
|
+
msg = data.get("msg", "")
|
|
594
|
+
if "薯队长遇到了点小麻烦" in msg or "风控" in msg:
|
|
595
|
+
state["risk_detected"] = True
|
|
596
|
+
state["risk_msg"] = msg
|
|
597
|
+
logger.error(f"Rednote API error: {data}")
|
|
598
|
+
return
|
|
599
|
+
|
|
600
|
+
items = data.get('data', {}).get('items', [])
|
|
601
|
+
state["has_more"] = data.get('data', {}).get('has_more', False)
|
|
602
|
+
for item in items:
|
|
603
|
+
if item.get("model_type") == "note":
|
|
604
|
+
note_id = item.get('id')
|
|
605
|
+
if note_id:
|
|
606
|
+
result[note_id] = item
|
|
607
|
+
except Exception as e:
|
|
608
|
+
if "No resource with given identifier found" in str(e):
|
|
609
|
+
return
|
|
610
|
+
logger.error(f"JSON parse error: {e}")
|
|
611
|
+
|
|
612
|
+
from rednote_cli._runtime.core.browser.manager import browser_manager
|
|
613
|
+
|
|
614
|
+
# Double encoding is required for Rednote search
|
|
615
|
+
encoded_keyword = quote(keyword)
|
|
616
|
+
search_url = f"https://www.xiaohongshu.com/search_result?keyword={encoded_keyword}&source=web_search_result_notes"
|
|
617
|
+
|
|
618
|
+
async with browser_manager.observe_responses(self.page, handle_response):
|
|
619
|
+
try:
|
|
620
|
+
# Add Referer to look more like a real search from home page
|
|
621
|
+
await self.page.set_extra_http_headers({"Referer": "https://www.xiaohongshu.com/"})
|
|
622
|
+
|
|
623
|
+
logger.info(f"DEBUG: [search_notes] 正在打开搜索页: {search_url}")
|
|
624
|
+
# Navigate and wait for a bit more than just domcontentloaded
|
|
625
|
+
await self.page.goto(search_url, wait_until="load")
|
|
626
|
+
await asyncio.sleep(random.uniform(2.0, 4.0))
|
|
627
|
+
|
|
628
|
+
user_tab_selector = "#channel-container"
|
|
629
|
+
await self.page.wait_for_selector(user_tab_selector, timeout=30000)
|
|
630
|
+
await self.human_click(f"{user_tab_selector} >> text=全部")
|
|
631
|
+
await asyncio.sleep(random.uniform(1.5, 3.0))
|
|
632
|
+
|
|
633
|
+
filters_applied = await self._apply_note_search_filters(
|
|
634
|
+
sort_by=sort_by,
|
|
635
|
+
note_type=note_type,
|
|
636
|
+
publish_time=publish_time,
|
|
637
|
+
search_scope=search_scope,
|
|
638
|
+
location=location,
|
|
639
|
+
)
|
|
640
|
+
if filters_applied:
|
|
641
|
+
# Drop pre-filter responses captured during initial page load.
|
|
642
|
+
result.clear()
|
|
643
|
+
state["has_more"] = True
|
|
644
|
+
state["risk_detected"] = False
|
|
645
|
+
state["risk_msg"] = ""
|
|
646
|
+
|
|
647
|
+
# Check if we hit the "Page not found" or "Risk control" page
|
|
648
|
+
content = await self.page.content()
|
|
649
|
+
if "你访问的页面不见了" in content or "验证码" in content:
|
|
650
|
+
raise RiskControlException(f"Detected 'Captain Potato' risk control on page: {keyword}")
|
|
651
|
+
if "薯队长遇到了点小麻烦" in content:
|
|
652
|
+
raise RiskControlException(f"Detected 'Captain Potato' risk control on search page")
|
|
653
|
+
|
|
654
|
+
for i in range(25):
|
|
655
|
+
if state.get("risk_detected"):
|
|
656
|
+
raise RiskControlException(f"Risk control detected during search scroll: {state.get('risk_msg')}")
|
|
657
|
+
if len(result) >= size or not state["has_more"]:
|
|
658
|
+
break
|
|
659
|
+
logger.info(f"DEBUG: [search_notes] 第 {i + 1} 次滚动, 当前结果数: {len(result)}")
|
|
660
|
+
if len(result) == 0:
|
|
661
|
+
user_tab_selector = "#channel-container"
|
|
662
|
+
await self.page.wait_for_selector(user_tab_selector, timeout=30000)
|
|
663
|
+
await self.human_click(f"{user_tab_selector} >> text=全部")
|
|
664
|
+
await asyncio.sleep(random.uniform(1.5, 3.0))
|
|
665
|
+
await self.human_scroll()
|
|
666
|
+
|
|
667
|
+
formatted = [self._format_dict(n) for n in result.values()][:size]
|
|
668
|
+
if formatted:
|
|
669
|
+
return formatted
|
|
670
|
+
|
|
671
|
+
fallback = await self._extract_search_feeds_from_initial_state()
|
|
672
|
+
return fallback[:size]
|
|
673
|
+
except Exception as e:
|
|
674
|
+
logger.error(f"Search note failed: {e}")
|
|
675
|
+
raise
|
|
676
|
+
|
|
677
|
+
async def search_users(self, keyword: str, size: int = 20):
    """Collect user search results for ``keyword`` by driving the search page.

    The method opens the search result page, switches to the user tab and
    scrolls, while ``handle_response`` (registered through
    ``browser_manager.observe_responses``) accumulates entries from the
    user-search API. Entries are keyed by ``red_id``, so a later entry with
    the same id replaces the earlier one.

    :param keyword: Search keyword; URL-quoted before it is put in the URL.
    :param size: Maximum number of users to return.
    :return: At most ``size`` users, each passed through ``self._format_dict``.
    :raises RiskControlException: When the page content or an observed API
        response indicates risk control.
    """
    logger.info(f"DEBUG: [search_users] 关键词: {keyword}")
    users_by_red_id = {}
    progress = {"has_more": True, "risk_detected": False, "risk_msg": ""}

    async def handle_response(response):
        # Only responses from the user-search endpoint are inspected.
        if "/api/sns/web/v1/search/usersearch" not in response.url:
            return

        if response.status != 200:
            # A non-200 status is recorded as risk control; the scroll loop
            # below turns the flag into an exception.
            progress["risk_detected"] = True
            progress["risk_msg"] = f"User search API status {response.status}"
            logger.warning(f"User search blocked by risk control: {response.status}")
            return

        try:
            payload = await response.json()
            # Business-level failure check.
            if not payload.get("success"):
                message = payload.get("msg", "")
                if "薯队长遇到了点小麻烦" in message or "风控" in message:
                    progress["risk_detected"] = True
                    progress["risk_msg"] = message
                logger.error(f"Rednote API error: {payload}")
                return

            body = payload.get('data', {})
            entries = body.get('users', [])
            progress["has_more"] = body.get('has_more', False)
            for entry in entries:
                red_id = entry.get('red_id')
                if red_id:
                    users_by_red_id[red_id] = entry
        except Exception as exc:
            # This specific message is ignored without logging; every other
            # failure in the block above is logged and swallowed.
            if "No resource with given identifier found" not in str(exc):
                logger.error(f"JSON parse error: {exc}")

    async def open_user_tab():
        # Wait for the channel bar, click its "用户" (users) tab, then pause
        # for a randomized interval.
        tab_bar = "#channel-container"
        await self.page.wait_for_selector(tab_bar, timeout=30000)
        await self.human_click(f"{tab_bar} >> text=用户")
        await asyncio.sleep(random.uniform(1.5, 3.0))

    from rednote_cli._runtime.core.browser.manager import browser_manager

    quoted_keyword = quote(keyword)
    search_url = f"https://www.xiaohongshu.com/search_result?keyword={quoted_keyword}&source=web_search_result_notes"

    async with browser_manager.observe_responses(self.page, handle_response):
        try:
            # Send a home-page Referer with the navigation.
            await self.page.set_extra_http_headers({"Referer": "https://www.xiaohongshu.com/"})

            logger.info("DEBUG: [search_users] 正在进入搜索结果页...")
            await self.page.goto(search_url, wait_until="load")
            await asyncio.sleep(random.uniform(2.0, 4.0))

            # Abort early if the rendered page itself is a risk-control page.
            html = await self.page.content()
            if "你访问的页面不见了" in html or "验证码" in html:
                raise RiskControlException(f"Detected 'Captain Potato' risk control on page: {keyword}")
            if "薯队长遇到了点小麻烦" in html:
                raise RiskControlException(f"Detected 'Captain Potato' risk control on search page")

            await open_user_tab()

            # Scroll at most 25 times, stopping once enough users were
            # captured or the API reports that nothing more is available.
            for attempt in range(1, 26):
                if progress.get("risk_detected"):
                    raise RiskControlException(f"Risk control detected during user search scroll: {progress.get('risk_msg')}")
                if len(users_by_red_id) >= size or not progress["has_more"]:
                    break
                logger.info(f"DEBUG: [search_users] 第 {attempt} 次滚动, 当前用户数: {len(users_by_red_id)}")
                if len(users_by_red_id) == 0:
                    # Nothing captured yet: click the user tab again before
                    # scrolling.
                    await open_user_tab()
                await self.human_scroll()

            formatted_users = [self._format_dict(entry) for entry in users_by_red_id.values()]
            return formatted_users[:size]
        except Exception as exc:
            logger.error(f"Search user failed: {exc}")
            raise
|
|
756
|
+
|
|
757
|
+
async def publish_note(
    self,
    target: str,
    image_list: list | None = None,
    title: str = "",
    content: str = "",
    tags: list | None = None,
    schedule_at: datetime | str | None = None,
):
    """
    Publish a note of the requested type.

    ``image`` and ``video`` targets are dispatched to ``RednotePublisher``;
    ``article`` is recognised but its workflow is not implemented yet.

    :param target: Note type: ``video``, ``image`` or ``article``. Surrounding
        whitespace and letter case are ignored.
    :param image_list: Media entries. Required and non-empty for ``image``
        (local paths, image URLs, or a mix of both). For ``video`` it must
        contain exactly one entry, which is passed on as the video path.
    :param title: Note title; a falsy value is sent as ``""``.
    :param content: Note body; a falsy value is sent as ``""``.
    :param tags: Tags; a falsy value is sent as ``[]``.
    :param schedule_at: Optional schedule time. A non-blank string is parsed
        with ``parse_rfc3339``; any other value is forwarded unchanged.
    :return: Whatever the selected ``RednotePublisher`` method returns.
    :raises InvalidPublishParameterError: ``target`` is empty, or
        ``image_list`` is missing / has the wrong length for the target.
    :raises PublishWorkflowNotReadyError: ``target`` is ``article``.
    :raises UnsupportedPublishTargetError: ``target`` is any other value.
    :raises RiskControlException: Re-raised unchanged after a warning is logged.
    :raises PublishExecutionError: Wraps any exception that is neither a
        ``RiskControlException`` nor a ``PublishNoteException``.

    NOTE(review): the three parameter/target errors above reach the caller
    unchanged only if they derive from ``PublishNoteException``; that
    hierarchy is not visible here - confirm.
    """

    def resolve_schedule():
        # Only non-blank strings are parsed; datetimes, None and blank
        # strings are handed to the publisher exactly as received.
        if isinstance(schedule_at, str) and schedule_at.strip():
            return parse_rfc3339(schedule_at)
        return schedule_at

    try:
        target_text = (target or "").strip().lower()
        if not target_text:
            raise InvalidPublishParameterError("`target` 必须是非空字符串")

        publisher = RednotePublisher(self.page)

        if target_text == "image":
            if not image_list:
                raise InvalidPublishParameterError("`image_list` 在图文发布时不能为空")
            # The schedule is resolved only after the media check, so an
            # invalid image_list is reported before a bad schedule string.
            return await publisher.publish_image_note(
                image_list=image_list,
                title=title or "",
                content=content or "",
                tags=tags or [],
                schedule_at=resolve_schedule(),
            )

        if target_text == "video":
            if not image_list:
                raise InvalidPublishParameterError("`image_list` 在视频发布时不能为空")
            if len(image_list) != 1:
                raise InvalidPublishParameterError("`target=video` 时仅支持 1 个视频素材")
            return await publisher.publish_video_note(
                video_path=image_list[0],
                title=title or "",
                content=content or "",
                tags=tags or [],
                schedule_at=resolve_schedule(),
            )

        if target_text == "article":
            raise PublishWorkflowNotReadyError("article 发布流程尚未实现")

        raise UnsupportedPublishTargetError(f"不支持的 target: {target},当前支持: video, image, article")

    except RiskControlException:
        # Logged and re-raised unchanged; per the log text, a retry
        # mechanism outside this method is expected to handle it.
        logger.warning("发布过程中触发风控,交由重试机制处理")
        raise
    except PublishNoteException as e:
        logger.error(str(e))
        raise
    except Exception as e:
        logger.exception(f"[publish_note] Unexpected error: {e}")
        raise PublishExecutionError(f"发布笔记失败: {e}") from e
|