xhs-note-extractor 0.1.dev4__tar.gz → 0.1.dev6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/PKG-INFO +1 -1
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/examples/basic_usage.py +1 -1
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/pyproject.toml +1 -1
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/uv.lock +1 -1
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor/extractor.py +190 -130
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor.egg-info/PKG-INFO +1 -1
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/.gitignore +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/.joycode/prompt.json +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/LICENSE +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/MANIFEST.in +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/QUICK_START.md +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/README.md +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/examples/advanced_usage.py +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/scripts/build.sh +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/scripts/publish.sh +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/setup.cfg +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/test_cli.py +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/tests/simple_test.py +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/tests/test_extractor.py +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor/__init__.py +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor/_version.py +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor/cli.py +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor/utils.py +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor.egg-info/SOURCES.txt +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor.egg-info/dependency_links.txt +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor.egg-info/entry_points.txt +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor.egg-info/requires.txt +0 -0
- {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor.egg-info/top_level.txt +0 -0
|
@@ -66,7 +66,7 @@ def main():
|
|
|
66
66
|
print(f" 评论数: {note_data.get('comments', 0)}")
|
|
67
67
|
print(f" 图片数: {len(note_data.get('image_urls', []))}")
|
|
68
68
|
print(f" 笔记内容: {note_data.get('content', '')[:100]}...")
|
|
69
|
-
print(f" 作者: {note_data.get('
|
|
69
|
+
print(f" 作者: {note_data.get('author_name', '未知')}")
|
|
70
70
|
success_count += 1
|
|
71
71
|
except Exception as e:
|
|
72
72
|
print(f" ❌ 笔记 {i+1} 提取失败: {e}")
|
|
@@ -18,6 +18,7 @@ import requests
|
|
|
18
18
|
import logging
|
|
19
19
|
from typing import Dict, List, Optional, Union
|
|
20
20
|
from urllib.parse import urlparse, parse_qs
|
|
21
|
+
import xml.etree.ElementTree as ET
|
|
21
22
|
|
|
22
23
|
# 配置日志
|
|
23
24
|
logging.basicConfig(
|
|
@@ -35,21 +36,35 @@ class XHSNoteExtractor:
|
|
|
35
36
|
包括URL解析、设备连接、页面跳转和笔记内容提取。
|
|
36
37
|
"""
|
|
37
38
|
|
|
38
|
-
def __init__(self, device_serial: Optional[str] = None):
|
|
39
|
+
def __init__(self, device_serial: Optional[str] = None, enable_time_logging: bool = True):
|
|
39
40
|
"""
|
|
40
41
|
初始化小红书笔记提取器
|
|
41
42
|
|
|
42
43
|
Args:
|
|
43
44
|
device_serial (str, optional): 设备序列号,如果为None则自动连接可用设备
|
|
45
|
+
enable_time_logging (bool, optional): 是否启用耗时打印,默认为True
|
|
44
46
|
|
|
45
47
|
Raises:
|
|
46
48
|
RuntimeError: 当没有可用设备时抛出异常
|
|
47
49
|
"""
|
|
48
50
|
self.device = None
|
|
49
51
|
self.device_serial = device_serial
|
|
52
|
+
self.enable_time_logging = enable_time_logging
|
|
50
53
|
if not self.connect_device():
|
|
51
54
|
raise RuntimeError("未找到可用的Android设备,请连接设备后再试")
|
|
52
55
|
|
|
56
|
+
def _time_method(self, method_name, start_time):
|
|
57
|
+
"""
|
|
58
|
+
记录方法执行时间
|
|
59
|
+
|
|
60
|
+
Args:
|
|
61
|
+
method_name (str): 方法名称
|
|
62
|
+
start_time (float): 开始时间
|
|
63
|
+
"""
|
|
64
|
+
if self.enable_time_logging:
|
|
65
|
+
elapsed_time = time.time() - start_time
|
|
66
|
+
logger.info(f"[{method_name}] 耗时: {elapsed_time:.3f}秒")
|
|
67
|
+
|
|
53
68
|
def connect_device(self) -> bool:
|
|
54
69
|
"""
|
|
55
70
|
连接设备
|
|
@@ -57,15 +72,18 @@ class XHSNoteExtractor:
|
|
|
57
72
|
Returns:
|
|
58
73
|
bool: 是否成功连接设备
|
|
59
74
|
"""
|
|
75
|
+
start_time = time.time()
|
|
60
76
|
try:
|
|
61
77
|
if self.device_serial:
|
|
62
78
|
self.device = u2.connect(self.device_serial)
|
|
63
79
|
else:
|
|
64
80
|
self.device = u2.connect()
|
|
65
81
|
logger.info(f"✓ 已连接设备: {self.device.serial}")
|
|
82
|
+
self._time_method("connect_device", start_time)
|
|
66
83
|
return True
|
|
67
84
|
except Exception as e:
|
|
68
85
|
logger.error(f"✗ 设备连接失败: {e}")
|
|
86
|
+
self._time_method("connect_device", start_time)
|
|
69
87
|
return False
|
|
70
88
|
def is_device_connected(self) -> bool:
|
|
71
89
|
"""
|
|
@@ -85,7 +103,6 @@ class XHSNoteExtractor:
|
|
|
85
103
|
|
|
86
104
|
@staticmethod
|
|
87
105
|
def parse_xhs_url(url: str) -> Dict[str, str]:
|
|
88
|
-
|
|
89
106
|
"""
|
|
90
107
|
解析小红书URL,提取note_id和xsec_token
|
|
91
108
|
|
|
@@ -98,6 +115,7 @@ class XHSNoteExtractor:
|
|
|
98
115
|
Raises:
|
|
99
116
|
ValueError: 当URL格式不正确时抛出异常
|
|
100
117
|
"""
|
|
118
|
+
start_time = time.time()
|
|
101
119
|
# 处理xhsdiscover协议格式
|
|
102
120
|
if url.startswith("xhsdiscover://"):
|
|
103
121
|
# 提取note_id
|
|
@@ -138,13 +156,22 @@ class XHSNoteExtractor:
|
|
|
138
156
|
note_id = path_parts[explore_index + 1]
|
|
139
157
|
else:
|
|
140
158
|
raise ValueError("URL中缺少note_id")
|
|
159
|
+
# 兼容 /discovery/item/ 格式
|
|
160
|
+
elif 'discovery' in path_parts and 'item' in path_parts:
|
|
161
|
+
item_index = path_parts.index('item')
|
|
162
|
+
if item_index + 1 < len(path_parts):
|
|
163
|
+
note_id = path_parts[item_index + 1]
|
|
164
|
+
else:
|
|
165
|
+
raise ValueError("URL中缺少note_id")
|
|
141
166
|
else:
|
|
142
|
-
raise ValueError("URL格式不正确,缺少/explore/路径")
|
|
167
|
+
raise ValueError("URL格式不正确,缺少/explore/或/discovery/item/路径")
|
|
143
168
|
|
|
144
169
|
# 提取查询参数中的xsec_token
|
|
145
170
|
query_params = parse_qs(parsed_url.query)
|
|
146
171
|
xsec_token = query_params.get('xsec_token', [''])[0]
|
|
147
172
|
|
|
173
|
+
elapsed_time = time.time() - start_time
|
|
174
|
+
logger.info(f"[parse_xhs_url] 耗时: {elapsed_time:.3f}秒")
|
|
148
175
|
return {
|
|
149
176
|
"note_id": note_id,
|
|
150
177
|
"xsec_token": xsec_token,
|
|
@@ -152,6 +179,8 @@ class XHSNoteExtractor:
|
|
|
152
179
|
}
|
|
153
180
|
|
|
154
181
|
else:
|
|
182
|
+
elapsed_time = time.time() - start_time
|
|
183
|
+
logger.info(f"[parse_xhs_url] 耗时: {elapsed_time:.3f}秒")
|
|
155
184
|
raise ValueError("不支持的URL格式")
|
|
156
185
|
|
|
157
186
|
@staticmethod
|
|
@@ -183,12 +212,18 @@ class XHSNoteExtractor:
|
|
|
183
212
|
Returns:
|
|
184
213
|
str: xhsdiscover协议格式的URL
|
|
185
214
|
"""
|
|
215
|
+
start_time = time.time()
|
|
216
|
+
result = ""
|
|
186
217
|
if xsec_token:
|
|
187
218
|
original_url = f"http://www.xiaohongshu.com/explore/{note_id}?xsec_token={xsec_token}&xsec_source=pc_feed"
|
|
188
219
|
encoded_url = requests.utils.quote(original_url)
|
|
189
|
-
|
|
220
|
+
result = f"xhsdiscover://item/{note_id}?open_url={encoded_url}"
|
|
190
221
|
else:
|
|
191
|
-
|
|
222
|
+
result = f"xhsdiscover://item/{note_id}"
|
|
223
|
+
|
|
224
|
+
elapsed_time = time.time() - start_time
|
|
225
|
+
logger.info(f"[convert_to_xhsdiscover_format] 耗时: {elapsed_time:.3f}秒")
|
|
226
|
+
return result
|
|
192
227
|
|
|
193
228
|
def extract_note_data(self, url: Optional[str] = None, note_id: Optional[str] = None,
|
|
194
229
|
xsec_token: Optional[str] = None) -> Dict[str, Union[str, List[str]]]:
|
|
@@ -207,6 +242,7 @@ class XHSNoteExtractor:
|
|
|
207
242
|
RuntimeError: 当设备未连接时抛出异常
|
|
208
243
|
Exception: 当提取过程中出现错误时抛出异常
|
|
209
244
|
"""
|
|
245
|
+
start_time = time.time()
|
|
210
246
|
# 如果提供了URL,则先解析它(验证URL有效性)
|
|
211
247
|
if url:
|
|
212
248
|
parsed_data = self.parse_xhs_url(url)
|
|
@@ -215,6 +251,7 @@ class XHSNoteExtractor:
|
|
|
215
251
|
|
|
216
252
|
# 检查设备是否连接
|
|
217
253
|
if self.device is None:
|
|
254
|
+
self._time_method("extract_note_data", start_time)
|
|
218
255
|
raise RuntimeError("设备未连接,请先连接设备")
|
|
219
256
|
|
|
220
257
|
# 构建跳转URL
|
|
@@ -232,168 +269,174 @@ class XHSNoteExtractor:
|
|
|
232
269
|
|
|
233
270
|
logger.info(f"✓ 成功提取笔记数据,点赞数: {data['likes']}, 图片数: {len(data['image_urls'])}")
|
|
234
271
|
|
|
272
|
+
self._time_method("extract_note_data", start_time)
|
|
235
273
|
return data
|
|
236
274
|
|
|
237
275
|
except Exception as e:
|
|
238
276
|
logger.error(f"✗ 提取笔记数据失败: {e}")
|
|
277
|
+
self._time_method("extract_note_data", start_time)
|
|
239
278
|
raise
|
|
240
279
|
|
|
241
280
|
def _get_detail_data(self) -> Dict[str, Union[str, List[str]]]:
|
|
242
281
|
"""
|
|
243
282
|
从当前已经打开的小红书详情页提取完整正文、图片和点赞数。
|
|
244
|
-
|
|
283
|
+
优化版本: 使用 dump_hierarchy 替代遍历,大幅提升速度。
|
|
245
284
|
|
|
246
285
|
Returns:
|
|
247
286
|
Dict[str, Union[str, List[str]]]: 包含笔记数据的字典
|
|
248
287
|
"""
|
|
249
|
-
|
|
288
|
+
start_time = time.time()
|
|
289
|
+
logger.info("🔍 进入深度提取模式 (XML优化版)...")
|
|
250
290
|
|
|
251
|
-
# 1. 验证是否进入详情页
|
|
291
|
+
# 1. 验证是否进入详情页 & 展开全文
|
|
252
292
|
detail_loaded = False
|
|
253
293
|
detail_keywords = ["说点什么", "写评论", "写点什么", "收藏", "点赞", "评论", "分享", "发弹幕"]
|
|
254
|
-
|
|
294
|
+
|
|
295
|
+
# 尝试点击展开 (预先动作)
|
|
296
|
+
try:
|
|
297
|
+
# 快速检查是否有展开按钮
|
|
298
|
+
for btn_text in ["展开", "查看全部", "全文"]:
|
|
299
|
+
if self.device(text=btn_text).exists:
|
|
300
|
+
self.device(text=btn_text).click()
|
|
301
|
+
break
|
|
302
|
+
except: pass
|
|
303
|
+
|
|
304
|
+
# 等待加载完整
|
|
305
|
+
for i in range(5):
|
|
255
306
|
if any(self.device(textContains=kw).exists or self.device(descriptionContains=kw).exists for kw in detail_keywords):
|
|
256
307
|
detail_loaded = True
|
|
257
308
|
break
|
|
258
|
-
if i ==
|
|
309
|
+
if i == 2:
|
|
259
310
|
# 可能是视频,点击屏幕中心尝试激活 UI
|
|
260
311
|
self.device.click(540, 900)
|
|
261
|
-
time.sleep(
|
|
312
|
+
time.sleep(0.5)
|
|
262
313
|
|
|
263
314
|
if not detail_loaded:
|
|
264
315
|
logger.warning("⚠ 警告:详情页特征未发现,提取可能不完整")
|
|
265
316
|
|
|
266
|
-
#
|
|
317
|
+
# 2. 获取 UI层级 (核心优化)
|
|
318
|
+
xml_dump_start = time.time()
|
|
319
|
+
xml_content = self.device.dump_hierarchy()
|
|
320
|
+
self._time_method("dump_hierarchy", xml_dump_start)
|
|
321
|
+
|
|
322
|
+
# 3. 解析 XML
|
|
323
|
+
root = ET.fromstring(xml_content)
|
|
324
|
+
|
|
325
|
+
content = ""
|
|
326
|
+
likes = "0"
|
|
327
|
+
collects = "0"
|
|
328
|
+
comments = "0"
|
|
267
329
|
author_name = "Unknown"
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
#
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
330
|
+
image_urls = []
|
|
331
|
+
|
|
332
|
+
# 收集所有 TextView 节点信息
|
|
333
|
+
text_nodes = []
|
|
334
|
+
|
|
335
|
+
def parse_nodes(node):
|
|
336
|
+
if node.attrib.get('class') == 'android.widget.TextView':
|
|
337
|
+
text = node.attrib.get('text', '')
|
|
338
|
+
bounds_str = node.attrib.get('bounds', '[0,0][0,0]')
|
|
339
|
+
# 解析 bounds: [x1,y1][x2,y2]
|
|
340
|
+
try:
|
|
341
|
+
coords = bounds_str.replace('][', ',').replace('[', '').replace(']', '').split(',')
|
|
342
|
+
x1, y1, x2, y2 = map(int, coords)
|
|
343
|
+
if text:
|
|
344
|
+
text_nodes.append({
|
|
345
|
+
'text': text,
|
|
346
|
+
'l': x1, 't': y1, 'r': x2, 'b': y2,
|
|
347
|
+
'cx': (x1 + x2) / 2, 'cy': (y1 + y2) / 2
|
|
348
|
+
})
|
|
349
|
+
except: pass
|
|
350
|
+
for child in node:
|
|
351
|
+
parse_nodes(child)
|
|
352
|
+
|
|
353
|
+
parse_nodes(root)
|
|
354
|
+
|
|
355
|
+
# 4. 分析节点数据
|
|
356
|
+
|
|
357
|
+
# A. 作者提取 (寻找 "关注" 附近的文本)
|
|
358
|
+
# 策略: 找到包含 "关注" 的节点,取其左侧最近的节点
|
|
359
|
+
follow_node = None
|
|
360
|
+
for n in text_nodes:
|
|
361
|
+
if n['text'] in ["关注", "已关注"]:
|
|
362
|
+
follow_node = n
|
|
363
|
+
break
|
|
364
|
+
|
|
365
|
+
if follow_node:
|
|
366
|
+
best_dist = 9999
|
|
367
|
+
for n in text_nodes:
|
|
368
|
+
if n == follow_node: continue
|
|
369
|
+
if n['text'] in ["关注", "已关注"] or len(n['text']) > 30: continue
|
|
370
|
+
|
|
371
|
+
# 垂直接近
|
|
372
|
+
if abs(n['cy'] - follow_node['cy']) < 100:
|
|
373
|
+
# 在左侧
|
|
374
|
+
if n['r'] <= follow_node['l'] + 50:
|
|
375
|
+
dist = follow_node['l'] - n['r']
|
|
376
|
+
if dist < best_dist:
|
|
377
|
+
best_dist = dist
|
|
378
|
+
author_name = n['text']
|
|
379
|
+
logger.info(f"✓ 识别到作者: {author_name}")
|
|
307
380
|
|
|
308
|
-
#
|
|
309
|
-
for
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
381
|
+
# B. 互动数据提取 (底部区域)
|
|
382
|
+
bottom_nodes = [n for n in text_nodes if n['t'] > 2000] # 假设屏幕高度足够
|
|
383
|
+
bottom_nodes.sort(key=lambda x: x['l']) # 从左到右
|
|
384
|
+
|
|
385
|
+
for n in bottom_nodes:
|
|
386
|
+
txt = n['text']
|
|
387
|
+
num_txt = ''.join(c for c in txt if c.isdigit() or c in ['.', 'w', 'W'])
|
|
388
|
+
if not num_txt: continue
|
|
389
|
+
|
|
390
|
+
cx = n['cx']
|
|
391
|
+
if 500 < cx < 750:
|
|
392
|
+
likes = num_txt
|
|
393
|
+
elif 750 < cx < 900:
|
|
394
|
+
collects = num_txt
|
|
395
|
+
elif cx >= 900:
|
|
396
|
+
comments = num_txt
|
|
315
397
|
|
|
316
|
-
#
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
except: continue
|
|
336
|
-
content = "\n".join(texts)
|
|
398
|
+
# C. 正文提取
|
|
399
|
+
# 过滤掉非正文内容
|
|
400
|
+
content_lines = []
|
|
401
|
+
exclude_keywords = ['收藏', '点赞', '评论', '分享', '发布于', '说点什么', '条评论', '关注', author_name]
|
|
402
|
+
|
|
403
|
+
# 按照垂直位置排序
|
|
404
|
+
content_nodes = [n for n in text_nodes if 200 < n['t'] < 2000]
|
|
405
|
+
content_nodes.sort(key=lambda x: x['t'])
|
|
406
|
+
|
|
407
|
+
for n in content_nodes:
|
|
408
|
+
t = n['text']
|
|
409
|
+
if len(t) < 2: continue
|
|
410
|
+
if any(k in t for k in exclude_keywords): continue
|
|
411
|
+
|
|
412
|
+
# 简单的去重策略
|
|
413
|
+
if content_lines and t in content_lines[-1]: continue
|
|
414
|
+
content_lines.append(t)
|
|
415
|
+
|
|
416
|
+
content = "\n".join(content_lines)
|
|
337
417
|
|
|
338
|
-
#
|
|
339
|
-
image_urls = []
|
|
418
|
+
# 5. 图片提取 (保持原有逻辑但优化等待)
|
|
340
419
|
try:
|
|
420
|
+
# 这里还是需要交互,无法纯靠XML
|
|
341
421
|
share_btn = self.device(description="分享")
|
|
342
422
|
if share_btn.exists:
|
|
343
423
|
share_btn.click()
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
if
|
|
347
|
-
|
|
424
|
+
# 显式等待 "复制链接"
|
|
425
|
+
copy_link = self.device(text="复制链接")
|
|
426
|
+
if copy_link.wait(timeout=2.0):
|
|
427
|
+
copy_link.click()
|
|
428
|
+
# 等待剪贴板更新? 稍微缓一下
|
|
348
429
|
time.sleep(0.5)
|
|
349
430
|
share_link = self.device.clipboard
|
|
350
431
|
if "http" in str(share_link):
|
|
351
432
|
image_urls = self._fetch_web_images(share_link)
|
|
352
433
|
else:
|
|
434
|
+
logger.warning("未找到复制链接按钮")
|
|
353
435
|
self.device.press("back")
|
|
354
436
|
except Exception as e:
|
|
355
437
|
logger.warning(f"⚠ 图片提取异常: {e}")
|
|
356
438
|
|
|
357
|
-
|
|
358
|
-
likes = "0"
|
|
359
|
-
collects = "0"
|
|
360
|
-
comments = "0"
|
|
361
|
-
|
|
362
|
-
try:
|
|
363
|
-
# 在底部区域查找互动数据
|
|
364
|
-
# 通常顺序是:左边评论(或直接显示写评论),右边依次是点赞、收藏、评论数
|
|
365
|
-
# 策略:遍历底部 TextView,根据位置和内容识别
|
|
366
|
-
bottom_elements = []
|
|
367
|
-
for el in self.device(className="android.widget.TextView"):
|
|
368
|
-
b = el.info.get('bounds', {})
|
|
369
|
-
if b.get('top', 0) > 2000: # 屏幕底部区域
|
|
370
|
-
bottom_elements.append(el)
|
|
371
|
-
|
|
372
|
-
# 排序:按从左到右
|
|
373
|
-
bottom_elements.sort(key=lambda x: x.info.get('bounds', {}).get('left', 0))
|
|
374
|
-
|
|
375
|
-
for el in bottom_elements:
|
|
376
|
-
txt = el.get_text() or ""
|
|
377
|
-
# 提取数字部分
|
|
378
|
-
num_txt = ''.join(c for c in txt if c.isdigit() or c in ['.', 'w', 'W'])
|
|
379
|
-
if not num_txt: continue
|
|
380
|
-
|
|
381
|
-
b = el.info.get('bounds', {})
|
|
382
|
-
left = b.get('left', 0)
|
|
383
|
-
|
|
384
|
-
# 根据位置初步判断 (这类位置可能随机型变化,但相对顺序通常一致)
|
|
385
|
-
# 点赞通常在 500-750 左右
|
|
386
|
-
# 收藏通常在 750-900 左右
|
|
387
|
-
# 评论通常在 900+ 左右
|
|
388
|
-
if 500 < left < 750:
|
|
389
|
-
likes = num_txt
|
|
390
|
-
elif 750 < left < 900:
|
|
391
|
-
collects = num_txt
|
|
392
|
-
elif left >= 900:
|
|
393
|
-
comments = num_txt
|
|
394
|
-
except Exception as e:
|
|
395
|
-
logger.warning(f"⚠ 互动数据提取异常: {e}")
|
|
396
|
-
|
|
439
|
+
self._time_method("_get_detail_data", start_time)
|
|
397
440
|
return {
|
|
398
441
|
"content": content,
|
|
399
442
|
"image_urls": image_urls,
|
|
@@ -413,6 +456,7 @@ class XHSNoteExtractor:
|
|
|
413
456
|
Returns:
|
|
414
457
|
List[str]: 图片URL列表
|
|
415
458
|
"""
|
|
459
|
+
start_time = time.time()
|
|
416
460
|
try:
|
|
417
461
|
headers = {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_8 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1"}
|
|
418
462
|
res = requests.get(url, headers=headers, timeout=10)
|
|
@@ -428,8 +472,10 @@ class XHSNoteExtractor:
|
|
|
428
472
|
for m in matches:
|
|
429
473
|
clean_url = m.replace('\\u002F', '/')
|
|
430
474
|
if clean_url not in found: found.append(clean_url)
|
|
475
|
+
self._time_method("_fetch_web_images", start_time)
|
|
431
476
|
return found
|
|
432
477
|
except:
|
|
478
|
+
self._time_method("_fetch_web_images", start_time)
|
|
433
479
|
return []
|
|
434
480
|
|
|
435
481
|
def save_note_data(self, data: Dict[str, Union[str, List[str]]],
|
|
@@ -443,6 +489,7 @@ class XHSNoteExtractor:
|
|
|
443
489
|
filename (str): 保存文件名
|
|
444
490
|
note_url (str): 笔记URL
|
|
445
491
|
"""
|
|
492
|
+
start_time = time.time()
|
|
446
493
|
try:
|
|
447
494
|
with open(filename, "w", encoding="utf-8") as f:
|
|
448
495
|
f.write("=" * 50 + "\n")
|
|
@@ -467,24 +514,32 @@ class XHSNoteExtractor:
|
|
|
467
514
|
f.write("=" * 50 + "\n")
|
|
468
515
|
|
|
469
516
|
logger.info(f"✓ 笔记数据已保存到: {filename}")
|
|
517
|
+
self._time_method("save_note_data", start_time)
|
|
470
518
|
except Exception as e:
|
|
471
519
|
logger.error(f"✗ 保存笔记数据失败: {e}")
|
|
520
|
+
self._time_method("save_note_data", start_time)
|
|
472
521
|
raise
|
|
473
522
|
|
|
474
523
|
|
|
475
|
-
def extract_note_from_url(url: str, device_serial: Optional[str] = None) -> Dict[str, Union[str, List[str]]]:
|
|
524
|
+
def extract_note_from_url(url: str, device_serial: Optional[str] = None, enable_time_logging: bool = True) -> Dict[str, Union[str, List[str]]]:
|
|
476
525
|
"""
|
|
477
526
|
便捷函数:直接从URL提取笔记数据
|
|
478
527
|
|
|
479
528
|
Args:
|
|
480
529
|
url (str): 小红书笔记URL
|
|
481
530
|
device_serial (str, optional): 设备序列号
|
|
531
|
+
enable_time_logging (bool, optional): 是否启用耗时打印,默认为True
|
|
482
532
|
|
|
483
533
|
Returns:
|
|
484
534
|
Dict[str, Union[str, List[str]]]: 笔记数据
|
|
485
535
|
"""
|
|
486
|
-
|
|
487
|
-
|
|
536
|
+
start_time = time.time()
|
|
537
|
+
logger.info(f"[extract_note_from_url] 开始处理URL: {url}")
|
|
538
|
+
extractor = XHSNoteExtractor(device_serial=device_serial, enable_time_logging=enable_time_logging)
|
|
539
|
+
result = extractor.extract_note_data(url=url)
|
|
540
|
+
elapsed_time = time.time() - start_time
|
|
541
|
+
logger.info(f"[extract_note_from_url] 总耗时: {elapsed_time:.3f}秒")
|
|
542
|
+
return result
|
|
488
543
|
|
|
489
544
|
|
|
490
545
|
def convert_url_format(url: str) -> str:
|
|
@@ -497,8 +552,13 @@ def convert_url_format(url: str) -> str:
|
|
|
497
552
|
Returns:
|
|
498
553
|
str: 转换后的xhsdiscover协议格式URL
|
|
499
554
|
"""
|
|
555
|
+
start_time = time.time()
|
|
556
|
+
logger.info(f"[convert_url_format] 开始转换URL: {url}")
|
|
500
557
|
parsed_data = XHSNoteExtractor.parse_xhs_url(url)
|
|
501
|
-
|
|
558
|
+
result = XHSNoteExtractor.convert_to_xhsdiscover_format(
|
|
502
559
|
parsed_data["note_id"],
|
|
503
560
|
parsed_data["xsec_token"]
|
|
504
|
-
)
|
|
561
|
+
)
|
|
562
|
+
elapsed_time = time.time() - start_time
|
|
563
|
+
logger.info(f"[convert_url_format] 耗时: {elapsed_time:.3f}秒,结果: {result}")
|
|
564
|
+
return result
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor.egg-info/requires.txt
RENAMED
|
File without changes
|
|
File without changes
|