xhs-note-extractor 0.1.dev2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xhs_note_extractor/DEVICE_RETRY_GUIDE.md +98 -0
- xhs_note_extractor/agent_login.py +264 -0
- xhs_note_extractor/date_desc_utils.py +80 -0
- xhs_note_extractor/extractor.py +644 -108
- xhs_note_extractor/login_propmt.py +145 -0
- xhs_note_extractor/number_utils.py +44 -0
- xhs_note_extractor/sms_verification.py +307 -0
- xhs_note_extractor/test_device_retry.py +100 -0
- xhs_note_extractor/test_initialization_fix.py +46 -0
- {xhs_note_extractor-0.1.dev2.dist-info → xhs_note_extractor-0.1.4.dist-info}/METADATA +4 -1
- xhs_note_extractor-0.1.4.dist-info/RECORD +19 -0
- xhs_note_extractor-0.1.dev2.dist-info/RECORD +0 -11
- {xhs_note_extractor-0.1.dev2.dist-info → xhs_note_extractor-0.1.4.dist-info}/WHEEL +0 -0
- {xhs_note_extractor-0.1.dev2.dist-info → xhs_note_extractor-0.1.4.dist-info}/entry_points.txt +0 -0
- {xhs_note_extractor-0.1.dev2.dist-info → xhs_note_extractor-0.1.4.dist-info}/licenses/LICENSE +0 -0
- {xhs_note_extractor-0.1.dev2.dist-info → xhs_note_extractor-0.1.4.dist-info}/top_level.txt +0 -0
xhs_note_extractor/extractor.py
CHANGED
|
@@ -18,6 +18,11 @@ import requests
|
|
|
18
18
|
import logging
|
|
19
19
|
from typing import Dict, List, Optional, Union
|
|
20
20
|
from urllib.parse import urlparse, parse_qs
|
|
21
|
+
import xml.etree.ElementTree as ET
|
|
22
|
+
|
|
23
|
+
# 延迟加载agent_login模块以避免不必要的依赖
|
|
24
|
+
from .date_desc_utils import parse_time_to_timestamp_ms
|
|
25
|
+
from .number_utils import parse_count_to_int
|
|
21
26
|
|
|
22
27
|
# 配置日志
|
|
23
28
|
logging.basicConfig(
|
|
@@ -35,39 +40,240 @@ class XHSNoteExtractor:
|
|
|
35
40
|
包括URL解析、设备连接、页面跳转和笔记内容提取。
|
|
36
41
|
"""
|
|
37
42
|
|
|
38
|
-
def __init__(self,
|
|
43
|
+
def __init__(self, devices:dict = None):
|
|
39
44
|
"""
|
|
40
45
|
初始化小红书笔记提取器
|
|
41
46
|
|
|
42
47
|
Args:
|
|
43
|
-
|
|
44
|
-
|
|
48
|
+
devices (dict, optional): 设备信息字典,包含设备序列号和对应小红书账号可选手机号
|
|
49
|
+
{
|
|
50
|
+
"b520805": ["13800000000"]
|
|
51
|
+
}
|
|
52
|
+
|
|
45
53
|
Raises:
|
|
46
|
-
|
|
54
|
+
ValueError: 当设备信息为空或无效时抛出异常
|
|
55
|
+
"""
|
|
56
|
+
if not devices:
|
|
57
|
+
raise ValueError("设备信息必须从外部传入")
|
|
58
|
+
|
|
59
|
+
self.device = None # 当前设备
|
|
60
|
+
self.next_phone = None # 下一个手机号
|
|
61
|
+
self.devices_info = devices # 存储设备信息字典
|
|
62
|
+
self.problematic_devices = [] # 存储无法获取笔记的设备信息
|
|
63
|
+
self.enable_time_logging = True # 默认启用耗时打印
|
|
64
|
+
|
|
65
|
+
# 日志记录设备信息
|
|
66
|
+
logger.info(f"已配置设备信息: {self.devices_info}")
|
|
67
|
+
logger.info("设备将在需要时连接")
|
|
68
|
+
|
|
69
|
+
def _get_next_phone_number(self, device_serial: str) -> Optional[str]:
|
|
70
|
+
"""
|
|
71
|
+
获取指定设备的下一个手机号(循环)
|
|
72
|
+
|
|
73
|
+
Args:
|
|
74
|
+
device_serial (str): 设备序列号
|
|
75
|
+
|
|
76
|
+
Returns:
|
|
77
|
+
str: 下一个手机号,如果没有则返回None
|
|
78
|
+
"""
|
|
79
|
+
if device_serial not in self.devices_info:
|
|
80
|
+
return None
|
|
81
|
+
|
|
82
|
+
phone_list = self.devices_info[device_serial]
|
|
83
|
+
if not phone_list:
|
|
84
|
+
return None
|
|
85
|
+
|
|
86
|
+
# 如果当前没有设置下一个手机号,返回第一个
|
|
87
|
+
if not self.next_phone:
|
|
88
|
+
self.next_phone = phone_list[0]
|
|
89
|
+
return self.next_phone
|
|
90
|
+
|
|
91
|
+
# 找到当前手机号在列表中的索引
|
|
92
|
+
try:
|
|
93
|
+
current_index = phone_list.index(self.next_phone)
|
|
94
|
+
# 循环到下一个
|
|
95
|
+
next_index = (current_index + 1) % len(phone_list)
|
|
96
|
+
self.next_phone = phone_list[next_index]
|
|
97
|
+
except ValueError:
|
|
98
|
+
# 如果当前手机号不在列表中,返回第一个
|
|
99
|
+
self.next_phone = phone_list[0]
|
|
100
|
+
return self.next_phone
|
|
101
|
+
def _time_method(self, method_name, start_time):
|
|
102
|
+
"""
|
|
103
|
+
记录方法执行时间
|
|
104
|
+
|
|
105
|
+
Args:
|
|
106
|
+
method_name (str): 方法名称
|
|
107
|
+
start_time (float): 开始时间
|
|
47
108
|
"""
|
|
48
|
-
self.
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
109
|
+
if self.enable_time_logging:
|
|
110
|
+
elapsed_time = time.time() - start_time
|
|
111
|
+
if elapsed_time < 1:
|
|
112
|
+
logger.info(f"⏱️ [{method_name}] 耗时: {elapsed_time*1000:.0f}ms")
|
|
113
|
+
else:
|
|
114
|
+
logger.info(f"⏱️ [{method_name}] 耗时: {elapsed_time:.2f}s")
|
|
115
|
+
|
|
116
|
+
|
|
52
117
|
|
|
53
|
-
def connect_device(self) -> bool:
|
|
118
|
+
def connect_device(self, device_serial: Optional[str] = None) -> bool:
|
|
54
119
|
"""
|
|
55
120
|
连接设备
|
|
56
121
|
|
|
122
|
+
Args:
|
|
123
|
+
device_serial (str, optional): 指定设备序列号,如果为None则使用devices_info中的第一个设备
|
|
124
|
+
|
|
57
125
|
Returns:
|
|
58
126
|
bool: 是否成功连接设备
|
|
59
127
|
"""
|
|
128
|
+
start_time = time.time()
|
|
129
|
+
|
|
130
|
+
# 如果指定了设备序列号,则使用指定的设备
|
|
131
|
+
target_device = device_serial
|
|
132
|
+
|
|
133
|
+
# 如果没有指定设备序列号,尝试使用devices_info中的第一个设备
|
|
134
|
+
if not target_device and self.devices_info:
|
|
135
|
+
target_device = next(iter(self.devices_info.keys()))
|
|
136
|
+
|
|
60
137
|
try:
|
|
61
|
-
if
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
138
|
+
if not target_device:
|
|
139
|
+
logger.error("✗ 设备连接失败: 无法确定设备序列号")
|
|
140
|
+
self._time_method("connect_device", start_time)
|
|
141
|
+
return False
|
|
142
|
+
|
|
143
|
+
self.device = u2.connect(target_device)
|
|
65
144
|
logger.info(f"✓ 已连接设备: {self.device.serial}")
|
|
145
|
+
self._time_method("connect_device", start_time)
|
|
146
|
+
# 重启小红书应用以确保登录状态
|
|
147
|
+
logger.info("🔄 重启小红书应用...")
|
|
148
|
+
self.device.app_stop("com.xingin.xhs")
|
|
149
|
+
time.sleep(1)
|
|
150
|
+
self.device.app_start("com.xingin.xhs")
|
|
151
|
+
time.sleep(3)
|
|
152
|
+
# 获取下一个手机号
|
|
153
|
+
self.next_phone = self._get_next_phone_number(target_device)
|
|
154
|
+
logger.warning(f'next_phone:{self.next_phone}')
|
|
66
155
|
return True
|
|
67
156
|
except Exception as e:
|
|
68
157
|
logger.error(f"✗ 设备连接失败: {e}")
|
|
158
|
+
self._time_method("connect_device", start_time)
|
|
159
|
+
return False
|
|
160
|
+
|
|
161
|
+
def switch_to_next_device(self) -> bool:
|
|
162
|
+
"""
|
|
163
|
+
切换到下一个可用设备
|
|
164
|
+
|
|
165
|
+
Returns:
|
|
166
|
+
bool: 是否成功切换到下一个设备
|
|
167
|
+
"""
|
|
168
|
+
self.next_phone = None # 重置下一个手机号为None
|
|
169
|
+
if not self.devices_info or len(self.devices_info) <= 1:
|
|
170
|
+
logger.warning("没有更多可用设备可以切换")
|
|
171
|
+
return False
|
|
172
|
+
|
|
173
|
+
# 获取当前设备的序列号
|
|
174
|
+
current_serial = self.device.serial if self.device else None
|
|
175
|
+
logger.info(f"当前设备: {current_serial}")
|
|
176
|
+
# 转换为列表以便切换
|
|
177
|
+
device_serials = list(self.devices_info.keys())
|
|
178
|
+
logger.info(f"device_serials: {device_serials}")
|
|
179
|
+
# 找到当前设备的索引
|
|
180
|
+
current_index = device_serials.index(current_serial) if current_serial in device_serials else -1
|
|
181
|
+
logger.info(f"current_index: {current_index}")
|
|
182
|
+
|
|
183
|
+
# 如果当前设备不在列表中,并且有尝试过的设备记录,则从尝试过的设备之后开始
|
|
184
|
+
attempted_serials = [d['serial'] for d in self.problematic_devices]
|
|
185
|
+
if current_index == -1 and attempted_serials:
|
|
186
|
+
# 找到最后一个尝试过的设备的索引
|
|
187
|
+
last_attempted = attempted_serials[-1]
|
|
188
|
+
if last_attempted in device_serials:
|
|
189
|
+
current_index = device_serials.index(last_attempted)
|
|
190
|
+
|
|
191
|
+
# 移动到下一个设备
|
|
192
|
+
next_index = (current_index + 1) % len(device_serials)
|
|
193
|
+
next_device_serial = device_serials[next_index]
|
|
194
|
+
logger.info(f"next_device_serial: {next_device_serial}")
|
|
195
|
+
logger.info(f"尝试切换到设备: {next_device_serial}")
|
|
196
|
+
return self.connect_device(next_device_serial)
|
|
197
|
+
def is_device_connected(self) -> bool:
|
|
198
|
+
"""
|
|
199
|
+
检查设备是否仍然连接
|
|
200
|
+
|
|
201
|
+
Returns:
|
|
202
|
+
bool: 设备是否连接
|
|
203
|
+
"""
|
|
204
|
+
if not self.device:
|
|
205
|
+
return False
|
|
206
|
+
try:
|
|
207
|
+
# 通过获取设备信息来验证连接
|
|
208
|
+
self.device.info
|
|
209
|
+
return True
|
|
210
|
+
except:
|
|
69
211
|
return False
|
|
70
212
|
|
|
213
|
+
def get_problematic_devices(self) -> List[Dict[str, Union[str, float]]]:
|
|
214
|
+
"""
|
|
215
|
+
获取无法获取笔记的设备列表
|
|
216
|
+
|
|
217
|
+
Returns:
|
|
218
|
+
List[Dict[str, Union[str, float]]]: 包含有问题设备信息的列表,每个设备信息包括:
|
|
219
|
+
- serial: 设备序列号
|
|
220
|
+
- reason: 问题原因
|
|
221
|
+
- note_id: 尝试提取的笔记ID
|
|
222
|
+
- timestamp: 记录时间戳
|
|
223
|
+
"""
|
|
224
|
+
return self.problematic_devices
|
|
225
|
+
|
|
226
|
+
def clear_problematic_devices(self) -> None:
|
|
227
|
+
"""
|
|
228
|
+
清空有问题的设备列表
|
|
229
|
+
"""
|
|
230
|
+
self.problematic_devices.clear()
|
|
231
|
+
# 清除缓存并重启APP
|
|
232
|
+
def clear_login_state(self, device_serial=None):
|
|
233
|
+
import uiautomator2 as u2
|
|
234
|
+
import time
|
|
235
|
+
|
|
236
|
+
# 连接设备
|
|
237
|
+
d = u2.connect(device_serial)
|
|
238
|
+
|
|
239
|
+
# 启动APP
|
|
240
|
+
d.app_stop('com.xingin.xhs')
|
|
241
|
+
time.sleep(1) # 等待APP启动
|
|
242
|
+
d.app_start('com.xingin.xhs')
|
|
243
|
+
time.sleep(2) # 等待APP启动
|
|
244
|
+
try:
|
|
245
|
+
if not d(text='我').exists():
|
|
246
|
+
print("已退出登录,无需退出登录")
|
|
247
|
+
return
|
|
248
|
+
|
|
249
|
+
# 点击我的/个人中心按钮
|
|
250
|
+
d(description='我').click()
|
|
251
|
+
time.sleep(2)
|
|
252
|
+
|
|
253
|
+
if d(text='微信登录').exists() or d(text='手机号登录').exists():
|
|
254
|
+
print("已登录,无需退出登录")
|
|
255
|
+
return
|
|
256
|
+
|
|
257
|
+
# 点击设置按钮
|
|
258
|
+
d(description='设置').click()
|
|
259
|
+
time.sleep(2)
|
|
260
|
+
|
|
261
|
+
# 滚动到退出登录选项
|
|
262
|
+
d.swipe_ext('up', scale=0.5)
|
|
263
|
+
time.sleep(1)
|
|
264
|
+
|
|
265
|
+
# 点击退出登录
|
|
266
|
+
d(text='退出登录').click()
|
|
267
|
+
time.sleep(1)
|
|
268
|
+
|
|
269
|
+
# 确认退出
|
|
270
|
+
d(text='退出登录').click()
|
|
271
|
+
time.sleep(2)
|
|
272
|
+
|
|
273
|
+
print("退出登录成功")
|
|
274
|
+
except Exception as e:
|
|
275
|
+
print(f"退出登录失败: {e}")
|
|
276
|
+
|
|
71
277
|
@staticmethod
|
|
72
278
|
def parse_xhs_url(url: str) -> Dict[str, str]:
|
|
73
279
|
"""
|
|
@@ -82,6 +288,7 @@ class XHSNoteExtractor:
|
|
|
82
288
|
Raises:
|
|
83
289
|
ValueError: 当URL格式不正确时抛出异常
|
|
84
290
|
"""
|
|
291
|
+
start_time = time.time()
|
|
85
292
|
# 处理xhsdiscover协议格式
|
|
86
293
|
if url.startswith("xhsdiscover://"):
|
|
87
294
|
# 提取note_id
|
|
@@ -122,13 +329,22 @@ class XHSNoteExtractor:
|
|
|
122
329
|
note_id = path_parts[explore_index + 1]
|
|
123
330
|
else:
|
|
124
331
|
raise ValueError("URL中缺少note_id")
|
|
332
|
+
# 兼容 /discovery/item/ 格式
|
|
333
|
+
elif 'discovery' in path_parts and 'item' in path_parts:
|
|
334
|
+
item_index = path_parts.index('item')
|
|
335
|
+
if item_index + 1 < len(path_parts):
|
|
336
|
+
note_id = path_parts[item_index + 1]
|
|
337
|
+
else:
|
|
338
|
+
raise ValueError("URL中缺少note_id")
|
|
125
339
|
else:
|
|
126
|
-
raise ValueError("URL格式不正确,缺少/explore/路径")
|
|
340
|
+
raise ValueError("URL格式不正确,缺少/explore/或/discovery/item/路径")
|
|
127
341
|
|
|
128
342
|
# 提取查询参数中的xsec_token
|
|
129
343
|
query_params = parse_qs(parsed_url.query)
|
|
130
344
|
xsec_token = query_params.get('xsec_token', [''])[0]
|
|
131
345
|
|
|
346
|
+
elapsed_time = time.time() - start_time
|
|
347
|
+
logger.info(f"[parse_xhs_url] 耗时: {elapsed_time:.3f}秒")
|
|
132
348
|
return {
|
|
133
349
|
"note_id": note_id,
|
|
134
350
|
"xsec_token": xsec_token,
|
|
@@ -136,6 +352,8 @@ class XHSNoteExtractor:
|
|
|
136
352
|
}
|
|
137
353
|
|
|
138
354
|
else:
|
|
355
|
+
elapsed_time = time.time() - start_time
|
|
356
|
+
logger.info(f"[parse_xhs_url] 耗时: {elapsed_time:.3f}秒")
|
|
139
357
|
raise ValueError("不支持的URL格式")
|
|
140
358
|
|
|
141
359
|
@staticmethod
|
|
@@ -167,17 +385,23 @@ class XHSNoteExtractor:
|
|
|
167
385
|
Returns:
|
|
168
386
|
str: xhsdiscover协议格式的URL
|
|
169
387
|
"""
|
|
388
|
+
start_time = time.time()
|
|
389
|
+
result = ""
|
|
170
390
|
if xsec_token:
|
|
171
391
|
original_url = f"http://www.xiaohongshu.com/explore/{note_id}?xsec_token={xsec_token}&xsec_source=pc_feed"
|
|
172
392
|
encoded_url = requests.utils.quote(original_url)
|
|
173
|
-
|
|
393
|
+
result = f"xhsdiscover://item/{note_id}?open_url={encoded_url}"
|
|
174
394
|
else:
|
|
175
|
-
|
|
395
|
+
result = f"xhsdiscover://item/{note_id}"
|
|
396
|
+
|
|
397
|
+
elapsed_time = time.time() - start_time
|
|
398
|
+
logger.info(f"[convert_to_xhsdiscover_format] 耗时: {elapsed_time:.3f}秒")
|
|
399
|
+
return result
|
|
176
400
|
|
|
177
|
-
def extract_note_data(self, url: Optional[str] = None, note_id: Optional[str] = None,
|
|
178
|
-
xsec_token: Optional[str] = None) -> Dict[str, Union[str, List[str]]]:
|
|
401
|
+
def extract_note_data(self, url: Optional[str] = None, note_id: Optional[str] = None,
|
|
402
|
+
xsec_token: Optional[str] = None) -> Optional[Dict[str, Union[str, List[str]]]]:
|
|
179
403
|
"""
|
|
180
|
-
|
|
404
|
+
从小红书笔记中提取数据,支持设备重试机制
|
|
181
405
|
|
|
182
406
|
Args:
|
|
183
407
|
url (str, optional): 小红书URL,如果提供则会解析其中的note_id和xsec_token
|
|
@@ -185,133 +409,418 @@ class XHSNoteExtractor:
|
|
|
185
409
|
xsec_token (str, optional): xsec_token参数
|
|
186
410
|
|
|
187
411
|
Returns:
|
|
188
|
-
Dict[str, Union[str, List[str]]]:
|
|
412
|
+
Optional[Dict[str, Union[str, List[str]]]]: 包含笔记数据的字典,如果没有成功则返回None
|
|
189
413
|
|
|
190
414
|
Raises:
|
|
191
|
-
RuntimeError: 当设备未连接时抛出异常
|
|
192
415
|
Exception: 当提取过程中出现错误时抛出异常
|
|
193
416
|
"""
|
|
417
|
+
start_time = time.time()
|
|
194
418
|
# 如果提供了URL,则先解析它(验证URL有效性)
|
|
195
419
|
if url:
|
|
196
420
|
parsed_data = self.parse_xhs_url(url)
|
|
197
421
|
note_id = parsed_data["note_id"]
|
|
198
422
|
xsec_token = parsed_data["xsec_token"]
|
|
199
|
-
|
|
200
|
-
# 检查设备是否连接
|
|
201
|
-
if self.device is None:
|
|
202
|
-
raise RuntimeError("设备未连接,请先连接设备")
|
|
203
|
-
|
|
204
|
-
# 构建跳转URL
|
|
205
|
-
jump_url = self.convert_to_xhsdiscover_format(note_id, xsec_token)
|
|
206
423
|
|
|
207
|
-
|
|
424
|
+
max_retries = len(self.devices_info) if self.devices_info else 1
|
|
425
|
+
attempted_devices = []
|
|
208
426
|
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
self.device.open_url(jump_url)
|
|
212
|
-
logger.info("✓ 已发送跳转指令,等待页面加载...")
|
|
427
|
+
for attempt in range(max_retries):
|
|
428
|
+
logger.info(f"尝试第 {attempt + 1}/{max_retries} 次提取笔记: {note_id}")
|
|
213
429
|
|
|
214
|
-
#
|
|
215
|
-
|
|
430
|
+
# 检查设备是否连接,如果没有则尝试连接
|
|
431
|
+
if self.device is None:
|
|
432
|
+
if not self.connect_device():
|
|
433
|
+
logger.warning("设备连接失败,尝试下一个设备")
|
|
434
|
+
# 记录连接失败的设备
|
|
435
|
+
device_serials = list(self.devices_info.keys())
|
|
436
|
+
if device_serials and attempt < len(device_serials):
|
|
437
|
+
failed_device = device_serials[attempt]
|
|
438
|
+
if failed_device not in [d['serial'] for d in self.problematic_devices]:
|
|
439
|
+
self.problematic_devices.append({
|
|
440
|
+
'serial': failed_device,
|
|
441
|
+
'reason': '设备连接失败',
|
|
442
|
+
'note_id': note_id,
|
|
443
|
+
'timestamp': time.time()
|
|
444
|
+
})
|
|
445
|
+
if self.switch_to_next_device():
|
|
446
|
+
continue
|
|
447
|
+
else:
|
|
448
|
+
break
|
|
216
449
|
|
|
217
|
-
|
|
450
|
+
# 构建跳转URL
|
|
451
|
+
jump_url = self.convert_to_xhsdiscover_format(note_id, xsec_token)
|
|
218
452
|
|
|
219
|
-
|
|
453
|
+
logger.info(f"正在尝试跳转至笔记: {note_id} (设备: {self.device.serial if self.device else '未知'})")
|
|
220
454
|
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
455
|
+
try:
|
|
456
|
+
# # 在跳转链接前重启APP
|
|
457
|
+
# logger.info(f"🔄 准备跳转至笔记 {note_id},正在重启APP...")
|
|
458
|
+
# self.restart_xhs_app()
|
|
459
|
+
|
|
460
|
+
# 发起跳转
|
|
461
|
+
self.device.open_url(jump_url)
|
|
462
|
+
logger.info("✓ 已发送跳转指令,等待页面加载...")
|
|
463
|
+
|
|
464
|
+
# 使用现有的xhs_utils功能提取数据
|
|
465
|
+
data = self._get_detail_data(jump_url)
|
|
466
|
+
|
|
467
|
+
# 如果返回None,说明需要登录,尝试下一个设备
|
|
468
|
+
if data is None:
|
|
469
|
+
logger.warning(f"当前设备{self.device.serial}需要登录,尝试切换到下一个设备")
|
|
470
|
+
attempted_devices.append(self.device.serial if self.device else "未知设备")
|
|
471
|
+
# 尝试重新登录
|
|
472
|
+
# 触发退出登录
|
|
473
|
+
self.clear_login_state(self.device.serial)
|
|
474
|
+
# 触发登录
|
|
475
|
+
# 确认当前设备对应的手机号
|
|
476
|
+
self.next_phone = self._get_next_phone_number(self.device.serial)
|
|
477
|
+
logger.warning(f'next_phone:{self.next_phone}')
|
|
478
|
+
try:
|
|
479
|
+
# 延迟加载agent_login模块以避免不必要的依赖
|
|
480
|
+
from .agent_login import do_login
|
|
481
|
+
if do_login(phone_number=self.next_phone, device_id=self.device.serial):
|
|
482
|
+
logger.info(f"✓ 设备{self.device.serial}登录成功")
|
|
483
|
+
continue
|
|
484
|
+
else:
|
|
485
|
+
logger.warning(f"✓ 设备{self.device.serial}登录失败")
|
|
486
|
+
attempted_devices.append(self.device.serial) # 记录尝试过的设备
|
|
487
|
+
# 保存当前设备序列号,以便switch_to_next_device知道从哪里开始
|
|
488
|
+
failed_device_serial = self.device.serial
|
|
489
|
+
self.device = None
|
|
490
|
+
|
|
491
|
+
# 手动记录失败的设备信息
|
|
492
|
+
if failed_device_serial not in [d['serial'] for d in self.problematic_devices]:
|
|
493
|
+
self.problematic_devices.append({
|
|
494
|
+
'serial': failed_device_serial,
|
|
495
|
+
'reason': '设备登录失败',
|
|
496
|
+
'note_id': note_id,
|
|
497
|
+
'timestamp': time.time()
|
|
498
|
+
})
|
|
499
|
+
|
|
500
|
+
# 尝试切换到下一个设备
|
|
501
|
+
if not self.switch_to_next_device():
|
|
502
|
+
logger.error("没有更多可用设备,提取失败")
|
|
503
|
+
self._time_method("extract_note_data", start_time)
|
|
504
|
+
return {}
|
|
505
|
+
continue
|
|
506
|
+
except ImportError as e:
|
|
507
|
+
logger.warning(f"无法导入登录模块: {e}")
|
|
508
|
+
logger.warning("将尝试跳过登录步骤,继续使用当前设备")
|
|
509
|
+
continue
|
|
510
|
+
|
|
511
|
+
logger.info(f"✓ 成功提取笔记数据,点赞数: {data['likes']}, 图片数: {len(data['image_urls'])}")
|
|
512
|
+
self._time_method("extract_note_data", start_time)
|
|
513
|
+
return data
|
|
514
|
+
|
|
515
|
+
except Exception as e:
|
|
516
|
+
logger.error(f"✗ 提取笔记数据失败: {e}")
|
|
517
|
+
attempted_devices.append(self.device.serial if self.device else "未知设备")
|
|
518
|
+
|
|
519
|
+
# 记录有问题的设备
|
|
520
|
+
if self.device and self.device.serial not in [d['serial'] for d in self.problematic_devices]:
|
|
521
|
+
self.problematic_devices.append({
|
|
522
|
+
'serial': self.device.serial,
|
|
523
|
+
'reason': f'提取异常: {str(e)}',
|
|
524
|
+
'note_id': note_id,
|
|
525
|
+
'timestamp': time.time()
|
|
526
|
+
})
|
|
527
|
+
|
|
528
|
+
# 如果还有设备可用,尝试下一个
|
|
529
|
+
if attempt < max_retries - 1 and self.switch_to_next_device():
|
|
530
|
+
continue
|
|
531
|
+
else:
|
|
532
|
+
logger.error("所有设备尝试完毕,提取失败")
|
|
533
|
+
self._time_method("extract_note_data", start_time)
|
|
534
|
+
logger.error(f"所有设备尝试完毕,提取失败。尝试过的设备: {attempted_devices}")
|
|
535
|
+
self._time_method("extract_note_data", start_time)
|
|
536
|
+
return {}
|
|
224
537
|
|
|
225
|
-
def _get_detail_data(self) -> Dict[str, Union[str, List[str]]]:
|
|
538
|
+
def _get_detail_data(self, jump_url: str) -> Dict[str, Union[str, List[str]]]:
|
|
226
539
|
"""
|
|
227
540
|
从当前已经打开的小红书详情页提取完整正文、图片和点赞数。
|
|
228
|
-
|
|
541
|
+
优化版本: 使用 dump_hierarchy 替代遍历,大幅提升速度。
|
|
229
542
|
|
|
543
|
+
Args:
|
|
544
|
+
jump_url (str): 笔记的跳转URL,用于白屏时重新加载
|
|
545
|
+
|
|
230
546
|
Returns:
|
|
231
547
|
Dict[str, Union[str, List[str]]]: 包含笔记数据的字典
|
|
232
548
|
"""
|
|
233
|
-
|
|
549
|
+
start_time = time.time()
|
|
550
|
+
logger.info("🔍 进入深度提取模式 (XML优化版)...")
|
|
234
551
|
|
|
235
|
-
# 1. 验证是否进入详情页
|
|
552
|
+
# 1. 验证是否进入详情页 & 展开全文
|
|
236
553
|
detail_loaded = False
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
554
|
+
try:
|
|
555
|
+
if self.device(text="展开").exists:
|
|
556
|
+
self.device(text="展开").click()
|
|
557
|
+
except: pass
|
|
558
|
+
|
|
559
|
+
# 超快速检查 - 只等0.2秒
|
|
560
|
+
time.sleep(0.2)
|
|
561
|
+
|
|
562
|
+
# 快速检查登录状态
|
|
563
|
+
if self.device(textContains="其他登录方式").exists or self.device(textContains="微信登录").exists or self.device(textContains="登录发现更多精彩").exists:
|
|
564
|
+
logger.error("✗ 需要登录才能查看详情页内容,提取终止")
|
|
565
|
+
return None
|
|
566
|
+
|
|
567
|
+
# 极简检查 - 只检查一次
|
|
568
|
+
time.sleep(0.3)
|
|
569
|
+
detail_count = 5
|
|
570
|
+
detail_loaded = False
|
|
571
|
+
while(detail_count > 0):
|
|
572
|
+
if not self.device(textContains="关注").exists:
|
|
573
|
+
detail_count -= 1
|
|
574
|
+
time.sleep(0.1)
|
|
575
|
+
continue
|
|
576
|
+
detail_loaded = True
|
|
577
|
+
break
|
|
246
578
|
|
|
247
579
|
if not detail_loaded:
|
|
248
|
-
logger.warning("⚠
|
|
580
|
+
logger.warning("⚠ 警告:详情页特征未发现,提取可能不完整")
|
|
249
581
|
|
|
250
|
-
#
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
582
|
+
# 智能滚动 - 确保看到发布时间和评论区 (优化速度版)
|
|
583
|
+
scroll_phase_start = time.time()
|
|
584
|
+
try:
|
|
585
|
+
# 定义需要查找的目标元素 (正则匹配)
|
|
586
|
+
target_pattern = re.compile(r"条评论|发布于|小时前|天前|月前|年前|昨天|今天")
|
|
587
|
+
|
|
588
|
+
# 最多滚动6次,单次距离加大
|
|
589
|
+
for i in range(6):
|
|
590
|
+
# 向下滚动
|
|
591
|
+
swipe_start = time.time()
|
|
592
|
+
self.device.swipe(540, 1600, 540, 600, 0.1)
|
|
593
|
+
self._time_method(f"scroll_swipe_{i+1}", swipe_start)
|
|
594
|
+
|
|
595
|
+
# 核心优化:只 dump 一次,在字符串中搜索,避免多次 exists() 调用的开销
|
|
596
|
+
dump_start = time.time()
|
|
597
|
+
xml_temp = self.device.dump_hierarchy()
|
|
598
|
+
self._time_method(f"scroll_dump_{i+1}", dump_start)
|
|
599
|
+
|
|
600
|
+
if target_pattern.search(xml_temp):
|
|
601
|
+
logger.info(f"✓ 已检测到目标元素 (第 {i+1} 次滚动)")
|
|
602
|
+
break
|
|
603
|
+
|
|
604
|
+
# 极短间隔
|
|
605
|
+
time.sleep(0.1)
|
|
606
|
+
|
|
607
|
+
time.sleep(0.3) # 稳定时间
|
|
608
|
+
self._time_method("intelligent_scroll_total", scroll_phase_start)
|
|
609
|
+
logger.info("✓ 滚动完成")
|
|
610
|
+
except Exception as e:
|
|
611
|
+
logger.warning(f"滚动失败: {e}")
|
|
257
612
|
|
|
258
|
-
#
|
|
613
|
+
# 初始化提取变量
|
|
259
614
|
content = ""
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
615
|
+
likes = 0
|
|
616
|
+
collects = 0
|
|
617
|
+
comments = 0
|
|
618
|
+
author_name = "Unknown"
|
|
619
|
+
publish_time = 0
|
|
620
|
+
date_desc = ""
|
|
621
|
+
image_urls = []
|
|
622
|
+
|
|
623
|
+
# 2. 获取 UI层级 (核心优化)
|
|
624
|
+
# 增加一次重试逻辑,如果第一次没抓到日期
|
|
625
|
+
text_nodes = []
|
|
626
|
+
limit_y = 2500
|
|
627
|
+
|
|
628
|
+
for attempt in range(2):
|
|
629
|
+
xml_dump_start = time.time()
|
|
630
|
+
xml_content = self.device.dump_hierarchy()
|
|
631
|
+
self._time_method("dump_hierarchy", xml_dump_start)
|
|
632
|
+
|
|
633
|
+
# 检测白屏状态 - 检查文本节点数量
|
|
634
|
+
current_text_nodes = []
|
|
635
|
+
root = ET.fromstring(xml_content)
|
|
636
|
+
|
|
637
|
+
def parse_nodes(node):
|
|
638
|
+
text = node.attrib.get('text', '') or node.attrib.get('content-desc', '')
|
|
639
|
+
bounds_str = node.attrib.get('bounds', '[0,0][0,0]')
|
|
269
640
|
try:
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
641
|
+
coords = bounds_str.replace('][', ',').replace('[', '').replace(']', '').split(',')
|
|
642
|
+
x1, y1, x2, y2 = map(int, coords)
|
|
643
|
+
if text:
|
|
644
|
+
current_text_nodes.append({
|
|
645
|
+
'text': text,
|
|
646
|
+
'l': x1, 't': y1, 'r': x2, 'b': y2,
|
|
647
|
+
'cx': (x1 + x2) / 2, 'cy': (y1 + y2) / 2
|
|
648
|
+
})
|
|
649
|
+
except: pass
|
|
650
|
+
for child in node: parse_nodes(child)
|
|
651
|
+
|
|
652
|
+
parse_nodes(root)
|
|
653
|
+
|
|
654
|
+
# 白屏检测:如果文本节点太少,可能是白屏
|
|
655
|
+
print(f'当前文本节点数量: {len(current_text_nodes)}')
|
|
656
|
+
if len(current_text_nodes) < 11:
|
|
657
|
+
logger.error(f"✗ 检测到白屏状态 - 文本节点数量异常少 ({len(current_text_nodes)}个节点)")
|
|
658
|
+
logger.info("--- 调试: 捕获的文本节点 ---")
|
|
659
|
+
for i, n in enumerate(current_text_nodes):
|
|
660
|
+
logger.info(f"[{i}] {n['text']} (t={n['t']}, b={n['b']}, l={n['l']}, r={n['r']})")
|
|
661
|
+
logger.info("--- 调试结束 ---")
|
|
662
|
+
|
|
663
|
+
# 如果是第一次尝试,重新加载页面
|
|
664
|
+
if attempt == 0:
|
|
665
|
+
logger.info("🔄 尝试重新加载页面...")
|
|
666
|
+
# 重新发送跳转指令
|
|
667
|
+
self.device.open_url(jump_url)
|
|
668
|
+
time.sleep(2) # 等待页面重新加载
|
|
669
|
+
continue
|
|
670
|
+
else:
|
|
671
|
+
# 第二次尝试仍白屏,直接返回None
|
|
672
|
+
logger.error("✗ 页面加载失败 - 白屏状态")
|
|
673
|
+
return None
|
|
674
|
+
|
|
675
|
+
# 检查是否存在加载指示器
|
|
676
|
+
loading_found = False
|
|
677
|
+
for node in current_text_nodes:
|
|
678
|
+
if re.search(r'(加载|loading|等待|waiting|\.\.\.|\\u231a|\\u25ba)', node['text'], re.IGNORECASE):
|
|
679
|
+
loading_found = True
|
|
680
|
+
break
|
|
681
|
+
|
|
682
|
+
if loading_found:
|
|
683
|
+
logger.warning("⚠ 检测到页面正在加载中")
|
|
684
|
+
if attempt == 0:
|
|
685
|
+
logger.info("🔄 等待页面加载完成...")
|
|
686
|
+
time.sleep(2)
|
|
687
|
+
continue
|
|
688
|
+
|
|
689
|
+
text_nodes = current_text_nodes # 保留最新的节点供后续提取使用
|
|
690
|
+
|
|
691
|
+
# 4. 分析节点数据 (简化版日期快速检查)
|
|
692
|
+
found_date_in_this_xml = False
|
|
693
|
+
follow_node = None
|
|
694
|
+
for n in text_nodes:
|
|
695
|
+
if n['text'] in ["关注", "已关注"]:
|
|
696
|
+
follow_node = n
|
|
697
|
+
break
|
|
698
|
+
|
|
699
|
+
if follow_node:
|
|
700
|
+
# 寻找作者名
|
|
701
|
+
best_dist = 999
|
|
702
|
+
for n in text_nodes:
|
|
703
|
+
if n == follow_node: continue
|
|
704
|
+
if abs(n['cy'] - follow_node['cy']) < 100 and n['r'] <= follow_node['l'] + 50:
|
|
705
|
+
dist = follow_node['l'] - n['r']
|
|
706
|
+
if dist < best_dist:
|
|
707
|
+
best_dist = dist
|
|
708
|
+
author_name = n['text']
|
|
709
|
+
|
|
710
|
+
# 寻找日期
|
|
711
|
+
min_y = follow_node['b'] if follow_node else 150
|
|
712
|
+
# 提前寻找 limit_y
|
|
713
|
+
current_limit_y = 2500
|
|
714
|
+
for n in text_nodes:
|
|
715
|
+
if re.match(r"^共\s*\d+\s*条评论$", n['text']) or n['text'] in ["说点什么", "写评论", "写点什么", "这里是评论区"]:
|
|
716
|
+
current_limit_y = min(current_limit_y, n['t'])
|
|
717
|
+
limit_y = current_limit_y
|
|
279
718
|
|
|
280
|
-
|
|
281
|
-
|
|
719
|
+
for n in text_nodes:
|
|
720
|
+
if n['t'] > min_y - 200 and n['b'] < limit_y + 150:
|
|
721
|
+
txt = n['text'].strip()
|
|
722
|
+
if 2 <= len(txt) <= 50 and txt not in ["点赞", "收藏", "评论", "关注", "分享", "回复", "不喜欢"]:
|
|
723
|
+
try:
|
|
724
|
+
ts = parse_time_to_timestamp_ms(txt)
|
|
725
|
+
publish_time = ts
|
|
726
|
+
date_desc = txt
|
|
727
|
+
found_date_in_this_xml = True
|
|
728
|
+
# 不要 break,因为日期通常在最后
|
|
729
|
+
except: continue
|
|
730
|
+
|
|
731
|
+
if found_date_in_this_xml:
|
|
732
|
+
break
|
|
733
|
+
|
|
734
|
+
if attempt == 0:
|
|
735
|
+
logger.warning("⚠ 未识别到发布时间,尝试额外滚动并重试...")
|
|
736
|
+
self.device.swipe(540, 1500, 540, 1000, 0.2)
|
|
737
|
+
time.sleep(0.5)
|
|
738
|
+
|
|
739
|
+
if not date_desc:
|
|
740
|
+
logger.warning("未识别到发布时间")
|
|
741
|
+
# 埋点调试: 打印出识别到的所有节点及其坐标
|
|
742
|
+
logger.info("--- 调试: 所有捕获的文本节点 ---")
|
|
743
|
+
for i, n in enumerate(text_nodes):
|
|
744
|
+
logger.info(f"[{i}] {n['text']} (t={n['t']}, b={n['b']}, l={n['l']}, r={n['r']})")
|
|
745
|
+
logger.info("--- 调试结束 ---")
|
|
746
|
+
else:
|
|
747
|
+
logger.info(f"✓ 识别到发布时间: {date_desc} -> {publish_time}")
|
|
748
|
+
|
|
749
|
+
logger.info(f"text_nodes: {text_nodes}")
|
|
750
|
+
|
|
751
|
+
|
|
752
|
+
# B. 互动数据提取 (底部区域)
|
|
753
|
+
# 使用 limit_y 作为分割线大概率更准确
|
|
754
|
+
bottom_nodes = [n for n in text_nodes if n['t'] >= limit_y - 300] # 互动栏通常在 limit_y 上方一点点 或者 就在 mask 区域
|
|
755
|
+
bottom_nodes.sort(key=lambda x: x['l']) # 从左到右
|
|
756
|
+
|
|
757
|
+
for n in bottom_nodes:
|
|
758
|
+
txt = n['text']
|
|
759
|
+
# 保留数字、小数点、w/W 和 "万" 字
|
|
760
|
+
num_txt = ''.join(c for c in txt if c.isdigit() or c in ['.', 'w', 'W', '万'])
|
|
761
|
+
if not num_txt: continue
|
|
762
|
+
|
|
763
|
+
cx = n['cx']
|
|
764
|
+
if 500 < cx < 750:
|
|
765
|
+
likes = parse_count_to_int(num_txt)
|
|
766
|
+
elif 750 < cx < 900:
|
|
767
|
+
collects = parse_count_to_int(num_txt)
|
|
768
|
+
elif cx >= 900:
|
|
769
|
+
comments = parse_count_to_int(num_txt)
|
|
770
|
+
|
|
771
|
+
# C. 正文提取
|
|
772
|
+
# 过滤掉非正文内容
|
|
773
|
+
content_lines = []
|
|
774
|
+
# exclude_keywords = ['收藏', '点赞', '评论', '分享', '发布于', '说点什么', '条评论', '关注', author_name]
|
|
775
|
+
# if date_desc:
|
|
776
|
+
# exclude_keywords.append(date_desc)
|
|
777
|
+
|
|
778
|
+
# 按照垂直位置排序 (使用 min_y 和 limit_y 约束)
|
|
779
|
+
content_nodes = [n for n in text_nodes if min_y < n['t'] < limit_y]
|
|
780
|
+
content_nodes.sort(key=lambda x: x['t'])
|
|
781
|
+
|
|
782
|
+
for n in content_nodes:
|
|
783
|
+
t = n['text']
|
|
784
|
+
if len(t) < 2: continue
|
|
785
|
+
# if any(k in t for k in exclude_keywords): continue
|
|
786
|
+
|
|
787
|
+
# 简单的去重策略
|
|
788
|
+
if content_lines and t in content_lines[-1]: continue
|
|
789
|
+
content_lines.append(t)
|
|
790
|
+
|
|
791
|
+
content = "\n".join(content_lines)
|
|
792
|
+
logger.info(f"提取正文: {content}")
|
|
793
|
+
# 5. 图片提取 (保持原有逻辑但优化等待)
|
|
282
794
|
try:
|
|
795
|
+
# 这里还是需要交互,无法纯靠XML
|
|
283
796
|
share_btn = self.device(description="分享")
|
|
284
797
|
if share_btn.exists:
|
|
285
798
|
share_btn.click()
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
if
|
|
289
|
-
|
|
799
|
+
# 显式等待 "复制链接"
|
|
800
|
+
copy_link = self.device(text="复制链接")
|
|
801
|
+
if copy_link.wait(timeout=2.0):
|
|
802
|
+
copy_link.click()
|
|
803
|
+
# 等待剪贴板更新? 稍微缓一下
|
|
290
804
|
time.sleep(0.5)
|
|
291
805
|
share_link = self.device.clipboard
|
|
292
806
|
if "http" in str(share_link):
|
|
293
807
|
image_urls = self._fetch_web_images(share_link)
|
|
294
808
|
else:
|
|
809
|
+
logger.warning("未找到复制链接按钮")
|
|
295
810
|
self.device.press("back")
|
|
296
811
|
except Exception as e:
|
|
297
812
|
logger.warning(f"⚠ 图片提取异常: {e}")
|
|
298
813
|
|
|
299
|
-
|
|
300
|
-
likes = "0"
|
|
301
|
-
try:
|
|
302
|
-
for el in self.device(className="android.widget.TextView"):
|
|
303
|
-
txt = el.get_text() or ""
|
|
304
|
-
if any(c.isdigit() for c in txt):
|
|
305
|
-
b = el.info.get('bounds', {})
|
|
306
|
-
if b.get('top', 0) > 2000 and b.get('left', 0) > 500:
|
|
307
|
-
likes = ''.join(c for c in txt if c.isdigit() or c in ['.', 'w', 'W'])
|
|
308
|
-
if likes: break
|
|
309
|
-
except: pass
|
|
310
|
-
|
|
814
|
+
self._time_method("_get_detail_data", start_time)
|
|
311
815
|
return {
|
|
312
816
|
"content": content,
|
|
313
817
|
"image_urls": image_urls,
|
|
314
|
-
"likes": likes
|
|
818
|
+
"likes": likes,
|
|
819
|
+
"collects": collects,
|
|
820
|
+
"comments": comments,
|
|
821
|
+
"author_name": author_name,
|
|
822
|
+
"publish_time": publish_time,
|
|
823
|
+
"date_desc": date_desc
|
|
315
824
|
}
|
|
316
825
|
|
|
317
826
|
def _fetch_web_images(self, url: str) -> List[str]:
|
|
@@ -324,6 +833,7 @@ class XHSNoteExtractor:
|
|
|
324
833
|
Returns:
|
|
325
834
|
List[str]: 图片URL列表
|
|
326
835
|
"""
|
|
836
|
+
start_time = time.time()
|
|
327
837
|
try:
|
|
328
838
|
headers = {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_8 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1"}
|
|
329
839
|
res = requests.get(url, headers=headers, timeout=10)
|
|
@@ -339,8 +849,10 @@ class XHSNoteExtractor:
|
|
|
339
849
|
for m in matches:
|
|
340
850
|
clean_url = m.replace('\\u002F', '/')
|
|
341
851
|
if clean_url not in found: found.append(clean_url)
|
|
852
|
+
self._time_method("_fetch_web_images", start_time)
|
|
342
853
|
return found
|
|
343
854
|
except:
|
|
855
|
+
self._time_method("_fetch_web_images", start_time)
|
|
344
856
|
return []
|
|
345
857
|
|
|
346
858
|
def save_note_data(self, data: Dict[str, Union[str, List[str]]],
|
|
@@ -354,6 +866,7 @@ class XHSNoteExtractor:
|
|
|
354
866
|
filename (str): 保存文件名
|
|
355
867
|
note_url (str): 笔记URL
|
|
356
868
|
"""
|
|
869
|
+
start_time = time.time()
|
|
357
870
|
try:
|
|
358
871
|
with open(filename, "w", encoding="utf-8") as f:
|
|
359
872
|
f.write("=" * 50 + "\n")
|
|
@@ -362,8 +875,12 @@ class XHSNoteExtractor:
|
|
|
362
875
|
if note_url:
|
|
363
876
|
f.write(f"笔记URL: {note_url}\n")
|
|
364
877
|
f.write("=" * 50 + "\n")
|
|
365
|
-
f.write(f"
|
|
366
|
-
f.write(f"
|
|
878
|
+
f.write(f"作者: {data.get('author_name', 'Unknown')}\n")
|
|
879
|
+
f.write(f"点赞数: {data.get('likes', '0')}\n")
|
|
880
|
+
f.write(f"收藏数: {data.get('collects', '0')}\n")
|
|
881
|
+
f.write(f"评论数: {data.get('comments', '0')}\n")
|
|
882
|
+
f.write(f"图片数: {len(data.get('image_urls', []))}\n")
|
|
883
|
+
f.write(f"发布时间: {data.get('date_desc', '')} ({data.get('publish_time', 0)})\n")
|
|
367
884
|
f.write("=" * 50 + "\n")
|
|
368
885
|
f.write("【正文内容】\n")
|
|
369
886
|
f.write(data['content'])
|
|
@@ -375,24 +892,38 @@ class XHSNoteExtractor:
|
|
|
375
892
|
f.write("=" * 50 + "\n")
|
|
376
893
|
|
|
377
894
|
logger.info(f"✓ 笔记数据已保存到: {filename}")
|
|
895
|
+
self._time_method("save_note_data", start_time)
|
|
378
896
|
except Exception as e:
|
|
379
897
|
logger.error(f"✗ 保存笔记数据失败: {e}")
|
|
898
|
+
self._time_method("save_note_data", start_time)
|
|
380
899
|
raise
|
|
381
900
|
|
|
382
901
|
|
|
383
|
-
def extract_note_from_url(url: str, device_serial: Optional[str] = None) -> Dict[str, Union[str, List[str]]]:
|
|
902
|
+
def extract_note_from_url(url: str, device_serial: Optional[str] = None, enable_time_logging: bool = True) -> Optional[Dict[str, Union[str, List[str]]]]:
|
|
384
903
|
"""
|
|
385
|
-
便捷函数:直接从URL
|
|
904
|
+
便捷函数:直接从URL提取笔记数据,支持设备重试机制
|
|
386
905
|
|
|
387
906
|
Args:
|
|
388
907
|
url (str): 小红书笔记URL
|
|
389
908
|
device_serial (str, optional): 设备序列号
|
|
909
|
+
enable_time_logging (bool, optional): 是否启用耗时打印,默认为True
|
|
390
910
|
|
|
391
911
|
Returns:
|
|
392
|
-
Dict[str, Union[str, List[str]]]:
|
|
912
|
+
Optional[Dict[str, Union[str, List[str]]]]: 笔记数据,如果没有成功则返回None
|
|
393
913
|
"""
|
|
394
|
-
|
|
395
|
-
|
|
914
|
+
start_time = time.time()
|
|
915
|
+
logger.info(f"[extract_note_from_url] 开始处理URL: {url}")
|
|
916
|
+
try:
|
|
917
|
+
extractor = XHSNoteExtractor(device_serial=device_serial, enable_time_logging=enable_time_logging)
|
|
918
|
+
result = extractor.extract_note_data(url=url)
|
|
919
|
+
elapsed_time = time.time() - start_time
|
|
920
|
+
logger.info(f"[extract_note_from_url] 总耗时: {elapsed_time:.3f}秒")
|
|
921
|
+
return result
|
|
922
|
+
except Exception as e:
|
|
923
|
+
logger.error(f"[extract_note_from_url] 提取失败: {e}")
|
|
924
|
+
elapsed_time = time.time() - start_time
|
|
925
|
+
logger.info(f"[extract_note_from_url] 总耗时: {elapsed_time:.3f}秒")
|
|
926
|
+
return None
|
|
396
927
|
|
|
397
928
|
|
|
398
929
|
def convert_url_format(url: str) -> str:
|
|
@@ -405,8 +936,13 @@ def convert_url_format(url: str) -> str:
|
|
|
405
936
|
Returns:
|
|
406
937
|
str: 转换后的xhsdiscover协议格式URL
|
|
407
938
|
"""
|
|
939
|
+
start_time = time.time()
|
|
940
|
+
logger.info(f"[convert_url_format] 开始转换URL: {url}")
|
|
408
941
|
parsed_data = XHSNoteExtractor.parse_xhs_url(url)
|
|
409
|
-
|
|
942
|
+
result = XHSNoteExtractor.convert_to_xhsdiscover_format(
|
|
410
943
|
parsed_data["note_id"],
|
|
411
944
|
parsed_data["xsec_token"]
|
|
412
|
-
)
|
|
945
|
+
)
|
|
946
|
+
elapsed_time = time.time() - start_time
|
|
947
|
+
logger.info(f"[convert_url_format] 耗时: {elapsed_time:.3f}秒,结果: {result}")
|
|
948
|
+
return result
|