xhs-note-extractor 0.1.dev2__py3-none-any.whl → 0.1.dev6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xhs_note_extractor/extractor.py +216 -64
- {xhs_note_extractor-0.1.dev2.dist-info → xhs_note_extractor-0.1.dev6.dist-info}/METADATA +1 -1
- {xhs_note_extractor-0.1.dev2.dist-info → xhs_note_extractor-0.1.dev6.dist-info}/RECORD +7 -7
- {xhs_note_extractor-0.1.dev2.dist-info → xhs_note_extractor-0.1.dev6.dist-info}/WHEEL +0 -0
- {xhs_note_extractor-0.1.dev2.dist-info → xhs_note_extractor-0.1.dev6.dist-info}/entry_points.txt +0 -0
- {xhs_note_extractor-0.1.dev2.dist-info → xhs_note_extractor-0.1.dev6.dist-info}/licenses/LICENSE +0 -0
- {xhs_note_extractor-0.1.dev2.dist-info → xhs_note_extractor-0.1.dev6.dist-info}/top_level.txt +0 -0
xhs_note_extractor/extractor.py
CHANGED
|
@@ -18,6 +18,7 @@ import requests
|
|
|
18
18
|
import logging
|
|
19
19
|
from typing import Dict, List, Optional, Union
|
|
20
20
|
from urllib.parse import urlparse, parse_qs
|
|
21
|
+
import xml.etree.ElementTree as ET
|
|
21
22
|
|
|
22
23
|
# 配置日志
|
|
23
24
|
logging.basicConfig(
|
|
@@ -35,21 +36,35 @@ class XHSNoteExtractor:
|
|
|
35
36
|
包括URL解析、设备连接、页面跳转和笔记内容提取。
|
|
36
37
|
"""
|
|
37
38
|
|
|
38
|
-
def __init__(self, device_serial: Optional[str] = None):
|
|
39
|
+
def __init__(self, device_serial: Optional[str] = None, enable_time_logging: bool = True):
|
|
39
40
|
"""
|
|
40
41
|
初始化小红书笔记提取器
|
|
41
42
|
|
|
42
43
|
Args:
|
|
43
44
|
device_serial (str, optional): 设备序列号,如果为None则自动连接可用设备
|
|
45
|
+
enable_time_logging (bool, optional): 是否启用耗时打印,默认为True
|
|
44
46
|
|
|
45
47
|
Raises:
|
|
46
48
|
RuntimeError: 当没有可用设备时抛出异常
|
|
47
49
|
"""
|
|
48
50
|
self.device = None
|
|
49
51
|
self.device_serial = device_serial
|
|
52
|
+
self.enable_time_logging = enable_time_logging
|
|
50
53
|
if not self.connect_device():
|
|
51
54
|
raise RuntimeError("未找到可用的Android设备,请连接设备后再试")
|
|
52
55
|
|
|
56
|
+
def _time_method(self, method_name, start_time):
|
|
57
|
+
"""
|
|
58
|
+
记录方法执行时间
|
|
59
|
+
|
|
60
|
+
Args:
|
|
61
|
+
method_name (str): 方法名称
|
|
62
|
+
start_time (float): 开始时间
|
|
63
|
+
"""
|
|
64
|
+
if self.enable_time_logging:
|
|
65
|
+
elapsed_time = time.time() - start_time
|
|
66
|
+
logger.info(f"[{method_name}] 耗时: {elapsed_time:.3f}秒")
|
|
67
|
+
|
|
53
68
|
def connect_device(self) -> bool:
|
|
54
69
|
"""
|
|
55
70
|
连接设备
|
|
@@ -57,17 +72,35 @@ class XHSNoteExtractor:
|
|
|
57
72
|
Returns:
|
|
58
73
|
bool: 是否成功连接设备
|
|
59
74
|
"""
|
|
75
|
+
start_time = time.time()
|
|
60
76
|
try:
|
|
61
77
|
if self.device_serial:
|
|
62
78
|
self.device = u2.connect(self.device_serial)
|
|
63
79
|
else:
|
|
64
80
|
self.device = u2.connect()
|
|
65
81
|
logger.info(f"✓ 已连接设备: {self.device.serial}")
|
|
82
|
+
self._time_method("connect_device", start_time)
|
|
66
83
|
return True
|
|
67
84
|
except Exception as e:
|
|
68
85
|
logger.error(f"✗ 设备连接失败: {e}")
|
|
86
|
+
self._time_method("connect_device", start_time)
|
|
69
87
|
return False
|
|
70
|
-
|
|
88
|
+
def is_device_connected(self) -> bool:
|
|
89
|
+
"""
|
|
90
|
+
检查设备是否仍然连接
|
|
91
|
+
|
|
92
|
+
Returns:
|
|
93
|
+
bool: 设备是否连接
|
|
94
|
+
"""
|
|
95
|
+
if not self.device:
|
|
96
|
+
return False
|
|
97
|
+
try:
|
|
98
|
+
# 通过获取设备信息来验证连接
|
|
99
|
+
self.device.info
|
|
100
|
+
return True
|
|
101
|
+
except:
|
|
102
|
+
return False
|
|
103
|
+
|
|
71
104
|
@staticmethod
|
|
72
105
|
def parse_xhs_url(url: str) -> Dict[str, str]:
|
|
73
106
|
"""
|
|
@@ -82,6 +115,7 @@ class XHSNoteExtractor:
|
|
|
82
115
|
Raises:
|
|
83
116
|
ValueError: 当URL格式不正确时抛出异常
|
|
84
117
|
"""
|
|
118
|
+
start_time = time.time()
|
|
85
119
|
# 处理xhsdiscover协议格式
|
|
86
120
|
if url.startswith("xhsdiscover://"):
|
|
87
121
|
# 提取note_id
|
|
@@ -122,13 +156,22 @@ class XHSNoteExtractor:
|
|
|
122
156
|
note_id = path_parts[explore_index + 1]
|
|
123
157
|
else:
|
|
124
158
|
raise ValueError("URL中缺少note_id")
|
|
159
|
+
# 兼容 /discovery/item/ 格式
|
|
160
|
+
elif 'discovery' in path_parts and 'item' in path_parts:
|
|
161
|
+
item_index = path_parts.index('item')
|
|
162
|
+
if item_index + 1 < len(path_parts):
|
|
163
|
+
note_id = path_parts[item_index + 1]
|
|
164
|
+
else:
|
|
165
|
+
raise ValueError("URL中缺少note_id")
|
|
125
166
|
else:
|
|
126
|
-
raise ValueError("URL格式不正确,缺少/explore/路径")
|
|
167
|
+
raise ValueError("URL格式不正确,缺少/explore/或/discovery/item/路径")
|
|
127
168
|
|
|
128
169
|
# 提取查询参数中的xsec_token
|
|
129
170
|
query_params = parse_qs(parsed_url.query)
|
|
130
171
|
xsec_token = query_params.get('xsec_token', [''])[0]
|
|
131
172
|
|
|
173
|
+
elapsed_time = time.time() - start_time
|
|
174
|
+
logger.info(f"[parse_xhs_url] 耗时: {elapsed_time:.3f}秒")
|
|
132
175
|
return {
|
|
133
176
|
"note_id": note_id,
|
|
134
177
|
"xsec_token": xsec_token,
|
|
@@ -136,6 +179,8 @@ class XHSNoteExtractor:
|
|
|
136
179
|
}
|
|
137
180
|
|
|
138
181
|
else:
|
|
182
|
+
elapsed_time = time.time() - start_time
|
|
183
|
+
logger.info(f"[parse_xhs_url] 耗时: {elapsed_time:.3f}秒")
|
|
139
184
|
raise ValueError("不支持的URL格式")
|
|
140
185
|
|
|
141
186
|
@staticmethod
|
|
@@ -167,12 +212,18 @@ class XHSNoteExtractor:
|
|
|
167
212
|
Returns:
|
|
168
213
|
str: xhsdiscover协议格式的URL
|
|
169
214
|
"""
|
|
215
|
+
start_time = time.time()
|
|
216
|
+
result = ""
|
|
170
217
|
if xsec_token:
|
|
171
218
|
original_url = f"http://www.xiaohongshu.com/explore/{note_id}?xsec_token={xsec_token}&xsec_source=pc_feed"
|
|
172
219
|
encoded_url = requests.utils.quote(original_url)
|
|
173
|
-
|
|
220
|
+
result = f"xhsdiscover://item/{note_id}?open_url={encoded_url}"
|
|
174
221
|
else:
|
|
175
|
-
|
|
222
|
+
result = f"xhsdiscover://item/{note_id}"
|
|
223
|
+
|
|
224
|
+
elapsed_time = time.time() - start_time
|
|
225
|
+
logger.info(f"[convert_to_xhsdiscover_format] 耗时: {elapsed_time:.3f}秒")
|
|
226
|
+
return result
|
|
176
227
|
|
|
177
228
|
def extract_note_data(self, url: Optional[str] = None, note_id: Optional[str] = None,
|
|
178
229
|
xsec_token: Optional[str] = None) -> Dict[str, Union[str, List[str]]]:
|
|
@@ -191,6 +242,7 @@ class XHSNoteExtractor:
|
|
|
191
242
|
RuntimeError: 当设备未连接时抛出异常
|
|
192
243
|
Exception: 当提取过程中出现错误时抛出异常
|
|
193
244
|
"""
|
|
245
|
+
start_time = time.time()
|
|
194
246
|
# 如果提供了URL,则先解析它(验证URL有效性)
|
|
195
247
|
if url:
|
|
196
248
|
parsed_data = self.parse_xhs_url(url)
|
|
@@ -199,6 +251,7 @@ class XHSNoteExtractor:
|
|
|
199
251
|
|
|
200
252
|
# 检查设备是否连接
|
|
201
253
|
if self.device is None:
|
|
254
|
+
self._time_method("extract_note_data", start_time)
|
|
202
255
|
raise RuntimeError("设备未连接,请先连接设备")
|
|
203
256
|
|
|
204
257
|
# 构建跳转URL
|
|
@@ -216,102 +269,181 @@ class XHSNoteExtractor:
|
|
|
216
269
|
|
|
217
270
|
logger.info(f"✓ 成功提取笔记数据,点赞数: {data['likes']}, 图片数: {len(data['image_urls'])}")
|
|
218
271
|
|
|
272
|
+
self._time_method("extract_note_data", start_time)
|
|
219
273
|
return data
|
|
220
274
|
|
|
221
275
|
except Exception as e:
|
|
222
276
|
logger.error(f"✗ 提取笔记数据失败: {e}")
|
|
277
|
+
self._time_method("extract_note_data", start_time)
|
|
223
278
|
raise
|
|
224
279
|
|
|
225
280
|
def _get_detail_data(self) -> Dict[str, Union[str, List[str]]]:
    """
    Extract the note shown on the currently open detail page.

    Takes one ``dump_hierarchy()`` XML snapshot and derives the author, the
    like/collect/comment counters and the body text from its TextView nodes.
    Image URLs are obtained separately: the share link is copied in the app
    and passed to ``_fetch_web_images``.

    Returns:
        Dict[str, Union[str, List[str]]]: Keys ``content``, ``image_urls``,
        ``likes``, ``collects``, ``comments`` and ``author_name``. Counters
        stay "0" and the author stays "Unknown" when nothing is recognised.

    Raises:
        xml.etree.ElementTree.ParseError: If the hierarchy dump is not
            well-formed XML.
    """
    start_time = time.time()
    logger.info("🔍 进入深度提取模式 (XML优化版)...")

    # NOTE(review): all pixel thresholds are hard-coded. They presumably target
    # a 1080-px-wide screen taller than 2000 px -- confirm before using other
    # devices.
    content_top = 200        # text above this line is not treated as body text
    bottom_bar_top = 2000    # text below this line is treated as the counter bar
    likes_left, collects_left, comments_left = 500, 750, 900

    # 1. Expand the full text and wait for the detail page.
    detail_keywords = ["说点什么", "写评论", "写点什么", "收藏", "点赞", "评论", "分享", "发弹幕"]

    # Best effort: tap the first "expand" style button that is present.
    try:
        for btn_text in ["展开", "查看全部", "全文"]:
            if self.device(text=btn_text).exists:
                self.device(text=btn_text).click()
                break
    except Exception as e:
        # A failed tap only means the text may stay collapsed; keep going.
        logger.debug(f"展开全文按钮点击失败: {e}")

    detail_loaded = False
    for i in range(5):
        if any(
            self.device(textContains=kw).exists or self.device(descriptionContains=kw).exists
            for kw in detail_keywords
        ):
            detail_loaded = True
            break
        if i == 2:
            # Possibly a video note: tap the screen centre to bring the UI up.
            self.device.click(540, 900)
        time.sleep(0.5)

    if not detail_loaded:
        logger.warning("⚠ 警告:详情页特征未发现,提取可能不完整")

    # 2. Take a single snapshot of the UI hierarchy.
    xml_dump_start = time.time()
    xml_content = self.device.dump_hierarchy()
    self._time_method("dump_hierarchy", xml_dump_start)

    # 3. Collect every non-empty TextView together with its bounds.
    root = ET.fromstring(xml_content)
    text_nodes = []
    # Element.iter() walks the tree in document order without recursion.
    for node in root.iter():
        if node.attrib.get('class') != 'android.widget.TextView':
            continue
        text = node.attrib.get('text', '')
        if not text:
            continue
        bounds_str = node.attrib.get('bounds', '[0,0][0,0]')
        # bounds look like "[x1,y1][x2,y2]"
        coords = bounds_str.replace('][', ',').replace('[', '').replace(']', '').split(',')
        try:
            x1, y1, x2, y2 = map(int, coords)
        except ValueError:
            # Wrong number of fields or a non-integer field: skip this node.
            continue
        text_nodes.append({
            'text': text,
            'l': x1, 't': y1, 'r': x2, 'b': y2,
            'cx': (x1 + x2) / 2, 'cy': (y1 + y2) / 2
        })

    # 4A. Author: the text node closest to the left of the follow button,
    # on roughly the same row.
    follow_labels = ("关注", "已关注")
    follow_node = next((n for n in text_nodes if n['text'] in follow_labels), None)
    author = None
    if follow_node is not None:
        best_dist = float('inf')
        for n in text_nodes:
            if n is follow_node or n['text'] in follow_labels or len(n['text']) > 30:
                continue
            same_row = abs(n['cy'] - follow_node['cy']) < 100
            left_of_button = n['r'] <= follow_node['l'] + 50
            if not (same_row and left_of_button):
                continue
            dist = follow_node['l'] - n['r']
            if dist < best_dist:
                best_dist = dist
                author = n['text']
        if author is not None:
            logger.info(f"✓ 识别到作者: {author}")
    author_name = author if author is not None else "Unknown"

    # 4B. Counters: numeric texts in the bottom bar, assigned by x position.
    likes = "0"
    collects = "0"
    comments = "0"
    bottom_nodes = sorted(
        (n for n in text_nodes if n['t'] > bottom_bar_top),
        key=lambda x: x['l'],
    )
    for n in bottom_nodes:
        num_txt = ''.join(c for c in n['text'] if c.isdigit() or c in '.wW')
        if not num_txt:
            continue
        cx = n['cx']
        if likes_left < cx < collects_left:
            likes = num_txt
        elif collects_left <= cx < comments_left:
            # Lower bound is inclusive so a node centred exactly on the seam
            # between the two bands is not silently dropped.
            collects = num_txt
        elif cx >= comments_left:
            comments = num_txt

    # 4C. Body text: TextViews between the header and the bottom bar, top to
    # bottom, minus UI labels.
    exclude_keywords = ['收藏', '点赞', '评论', '分享', '发布于', '说点什么', '条评论', '关注']
    if author is not None:
        # Only filter on a real author name; the "Unknown" placeholder must
        # not remove body lines that happen to contain that word.
        exclude_keywords.append(author)

    content_nodes = sorted(
        (n for n in text_nodes if content_top < n['t'] < bottom_bar_top),
        key=lambda x: x['t'],
    )
    content_lines = []
    for n in content_nodes:
        t = n['text']
        if len(t) < 2:
            continue
        if any(k in t for k in exclude_keywords):
            continue
        # Simple de-duplication: skip text already contained in the previous line.
        if content_lines and t in content_lines[-1]:
            continue
        content_lines.append(t)
    content = "\n".join(content_lines)

    # 5. Images: needs real interaction (share -> copy link), not just XML.
    image_urls = []
    try:
        share_btn = self.device(description="分享")
        if share_btn.exists:
            share_btn.click()
            copy_link = self.device(text="复制链接")
            if copy_link.wait(timeout=2.0):
                copy_link.click()
                # Short pause before reading the clipboard.
                time.sleep(0.5)
                share_link = self.device.clipboard
                if "http" in str(share_link):
                    image_urls = self._fetch_web_images(share_link)
            else:
                logger.warning("未找到复制链接按钮")
            self.device.press("back")
    except Exception as e:
        logger.warning(f"⚠ 图片提取异常: {e}")

    self._time_method("_get_detail_data", start_time)
    return {
        "content": content,
        "image_urls": image_urls,
        "likes": likes,
        "collects": collects,
        "comments": comments,
        "author_name": author_name
    }
|
|
316
448
|
|
|
317
449
|
def _fetch_web_images(self, url: str) -> List[str]:
|
|
@@ -324,6 +456,7 @@ class XHSNoteExtractor:
|
|
|
324
456
|
Returns:
|
|
325
457
|
List[str]: 图片URL列表
|
|
326
458
|
"""
|
|
459
|
+
start_time = time.time()
|
|
327
460
|
try:
|
|
328
461
|
headers = {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_8 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1"}
|
|
329
462
|
res = requests.get(url, headers=headers, timeout=10)
|
|
@@ -339,8 +472,10 @@ class XHSNoteExtractor:
|
|
|
339
472
|
for m in matches:
|
|
340
473
|
clean_url = m.replace('\\u002F', '/')
|
|
341
474
|
if clean_url not in found: found.append(clean_url)
|
|
475
|
+
self._time_method("_fetch_web_images", start_time)
|
|
342
476
|
return found
|
|
343
477
|
except:
|
|
478
|
+
self._time_method("_fetch_web_images", start_time)
|
|
344
479
|
return []
|
|
345
480
|
|
|
346
481
|
def save_note_data(self, data: Dict[str, Union[str, List[str]]],
|
|
@@ -354,6 +489,7 @@ class XHSNoteExtractor:
|
|
|
354
489
|
filename (str): 保存文件名
|
|
355
490
|
note_url (str): 笔记URL
|
|
356
491
|
"""
|
|
492
|
+
start_time = time.time()
|
|
357
493
|
try:
|
|
358
494
|
with open(filename, "w", encoding="utf-8") as f:
|
|
359
495
|
f.write("=" * 50 + "\n")
|
|
@@ -362,8 +498,11 @@ class XHSNoteExtractor:
|
|
|
362
498
|
if note_url:
|
|
363
499
|
f.write(f"笔记URL: {note_url}\n")
|
|
364
500
|
f.write("=" * 50 + "\n")
|
|
365
|
-
f.write(f"
|
|
366
|
-
f.write(f"
|
|
501
|
+
f.write(f"作者: {data.get('author_name', 'Unknown')}\n")
|
|
502
|
+
f.write(f"点赞数: {data.get('likes', '0')}\n")
|
|
503
|
+
f.write(f"收藏数: {data.get('collects', '0')}\n")
|
|
504
|
+
f.write(f"评论数: {data.get('comments', '0')}\n")
|
|
505
|
+
f.write(f"图片数: {len(data.get('image_urls', []))}\n")
|
|
367
506
|
f.write("=" * 50 + "\n")
|
|
368
507
|
f.write("【正文内容】\n")
|
|
369
508
|
f.write(data['content'])
|
|
@@ -375,24 +514,32 @@ class XHSNoteExtractor:
|
|
|
375
514
|
f.write("=" * 50 + "\n")
|
|
376
515
|
|
|
377
516
|
logger.info(f"✓ 笔记数据已保存到: {filename}")
|
|
517
|
+
self._time_method("save_note_data", start_time)
|
|
378
518
|
except Exception as e:
|
|
379
519
|
logger.error(f"✗ 保存笔记数据失败: {e}")
|
|
520
|
+
self._time_method("save_note_data", start_time)
|
|
380
521
|
raise
|
|
381
522
|
|
|
382
523
|
|
|
383
|
-
def extract_note_from_url(url: str, device_serial: Optional[str] = None) -> Dict[str, Union[str, List[str]]]:
|
|
524
|
+
def extract_note_from_url(url: str, device_serial: Optional[str] = None, enable_time_logging: bool = True) -> Dict[str, Union[str, List[str]]]:
    """
    Convenience function: extract note data straight from a URL.

    Builds an XHSNoteExtractor, which connects to a device on construction,
    and returns the result of its extract_note_data().

    Args:
        url (str): Xiaohongshu note URL.
        device_serial (str, optional): Serial of the device to use.
        enable_time_logging (bool, optional): Whether elapsed-time log lines
            are written, both by the extractor and by this function.
            Defaults to True.

    Returns:
        Dict[str, Union[str, List[str]]]: The note data.

    Raises:
        RuntimeError: If the extractor cannot connect to a device.
    """
    start_time = time.time()
    logger.info(f"[extract_note_from_url] 开始处理URL: {url}")
    try:
        extractor = XHSNoteExtractor(device_serial=device_serial, enable_time_logging=enable_time_logging)
        return extractor.extract_note_data(url=url)
    finally:
        # Honour the flag here too, and report the total even when extraction
        # raises, so a failed run still shows how long it took.
        if enable_time_logging:
            elapsed_time = time.time() - start_time
            logger.info(f"[extract_note_from_url] 总耗时: {elapsed_time:.3f}秒")
|
|
396
543
|
|
|
397
544
|
|
|
398
545
|
def convert_url_format(url: str) -> str:
|
|
@@ -405,8 +552,13 @@ def convert_url_format(url: str) -> str:
|
|
|
405
552
|
Returns:
|
|
406
553
|
str: 转换后的xhsdiscover协议格式URL
|
|
407
554
|
"""
|
|
555
|
+
start_time = time.time()
|
|
556
|
+
logger.info(f"[convert_url_format] 开始转换URL: {url}")
|
|
408
557
|
parsed_data = XHSNoteExtractor.parse_xhs_url(url)
|
|
409
|
-
|
|
558
|
+
result = XHSNoteExtractor.convert_to_xhsdiscover_format(
|
|
410
559
|
parsed_data["note_id"],
|
|
411
560
|
parsed_data["xsec_token"]
|
|
412
|
-
)
|
|
561
|
+
)
|
|
562
|
+
elapsed_time = time.time() - start_time
|
|
563
|
+
logger.info(f"[convert_url_format] 耗时: {elapsed_time:.3f}秒,结果: {result}")
|
|
564
|
+
return result
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
xhs_note_extractor/__init__.py,sha256=CjHdqO4W5sj6zbeE7xYkR0_WRfc99G56nR6k2Kmji44,1207
|
|
2
2
|
xhs_note_extractor/_version.py,sha256=59jjKBtTUi_9u6FVZcIpQEDYjyAaqdxzqXyuRuFYKPE,720
|
|
3
3
|
xhs_note_extractor/cli.py,sha256=F5phl4HqnzEe_vTS8vpio_KcZNx4cxmXJnYcQ1FgMbA,2693
|
|
4
|
-
xhs_note_extractor/extractor.py,sha256=
|
|
4
|
+
xhs_note_extractor/extractor.py,sha256=NwYWtgU8lX53OVmfHzpLeSWEpZv0mH3UcLcw4K-1Oj4,21136
|
|
5
5
|
xhs_note_extractor/utils.py,sha256=mOVoLknlflzv7aCjXdmeNniQ7P6WNUcjSKjCm8uwFNk,14364
|
|
6
|
-
xhs_note_extractor-0.1.
|
|
7
|
-
xhs_note_extractor-0.1.
|
|
8
|
-
xhs_note_extractor-0.1.
|
|
9
|
-
xhs_note_extractor-0.1.
|
|
10
|
-
xhs_note_extractor-0.1.
|
|
11
|
-
xhs_note_extractor-0.1.
|
|
6
|
+
xhs_note_extractor-0.1.dev6.dist-info/licenses/LICENSE,sha256=VFtWajKKKkgOoX3cMb2upEjsQmDpU85ymhom2bYY_oI,1069
|
|
7
|
+
xhs_note_extractor-0.1.dev6.dist-info/METADATA,sha256=6XtFTabTgXrg7np6dVLtWAn0u5C_rmC10eJ9WYfukyA,5525
|
|
8
|
+
xhs_note_extractor-0.1.dev6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
+
xhs_note_extractor-0.1.dev6.dist-info/entry_points.txt,sha256=1IG34snKfX2pzpLSeDXHqlSVSH8p7bf3eaKQfcwGDk4,60
|
|
10
|
+
xhs_note_extractor-0.1.dev6.dist-info/top_level.txt,sha256=at3SqTdQr3DWMFCL5KM0Ofo_LE88WqADjh8MeFLwwO0,19
|
|
11
|
+
xhs_note_extractor-0.1.dev6.dist-info/RECORD,,
|
|
File without changes
|
{xhs_note_extractor-0.1.dev2.dist-info → xhs_note_extractor-0.1.dev6.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{xhs_note_extractor-0.1.dev2.dist-info → xhs_note_extractor-0.1.dev6.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
{xhs_note_extractor-0.1.dev2.dist-info → xhs_note_extractor-0.1.dev6.dist-info}/top_level.txt
RENAMED
|
File without changes
|