xhs-note-extractor 0.1.dev4__tar.gz → 0.1.dev6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/PKG-INFO +1 -1
  2. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/examples/basic_usage.py +1 -1
  3. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/pyproject.toml +1 -1
  4. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/uv.lock +1 -1
  5. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor/extractor.py +190 -130
  6. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor.egg-info/PKG-INFO +1 -1
  7. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/.gitignore +0 -0
  8. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/.joycode/prompt.json +0 -0
  9. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/LICENSE +0 -0
  10. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/MANIFEST.in +0 -0
  11. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/QUICK_START.md +0 -0
  12. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/README.md +0 -0
  13. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/examples/advanced_usage.py +0 -0
  14. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/scripts/build.sh +0 -0
  15. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/scripts/publish.sh +0 -0
  16. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/setup.cfg +0 -0
  17. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/test_cli.py +0 -0
  18. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/tests/simple_test.py +0 -0
  19. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/tests/test_extractor.py +0 -0
  20. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor/__init__.py +0 -0
  21. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor/_version.py +0 -0
  22. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor/cli.py +0 -0
  23. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor/utils.py +0 -0
  24. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor.egg-info/SOURCES.txt +0 -0
  25. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor.egg-info/dependency_links.txt +0 -0
  26. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor.egg-info/entry_points.txt +0 -0
  27. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor.egg-info/requires.txt +0 -0
  28. {xhs_note_extractor-0.1.dev4 → xhs_note_extractor-0.1.dev6}/xhs_note_extractor.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xhs-note-extractor
3
- Version: 0.1.dev4
3
+ Version: 0.1.dev6
4
4
  Summary: A Python package for extracting Xiaohongshu (Little Red Book) note data from URLs
5
5
  Author-email: JoyCode Agent <agent@joycode.com>
6
6
  License: MIT
@@ -66,7 +66,7 @@ def main():
66
66
  print(f" 评论数: {note_data.get('comments', 0)}")
67
67
  print(f" 图片数: {len(note_data.get('image_urls', []))}")
68
68
  print(f" 笔记内容: {note_data.get('content', '')[:100]}...")
69
- print(f" 作者: {note_data.get('author', {}).get('nickname', '未知')}")
69
+ print(f" 作者: {note_data.get('author_name', '未知')}")
70
70
  success_count += 1
71
71
  except Exception as e:
72
72
  print(f" ❌ 笔记 {i+1} 提取失败: {e}")
@@ -31,7 +31,7 @@ dependencies = [
31
31
  "uiautomator2>=2.16.17",
32
32
  "requests>=2.25.0",
33
33
  ]
34
- version = "0.1.dev4"
34
+ version = "0.1.dev6"
35
35
 
36
36
  [project.optional-dependencies]
37
37
  dev = [
@@ -816,7 +816,7 @@ wheels = [
816
816
 
817
817
  [[package]]
818
818
  name = "xhs-note-extractor"
819
- version = "0.1.dev3"
819
+ version = "0.1.dev5"
820
820
  source = { editable = "." }
821
821
  dependencies = [
822
822
  { name = "requests" },
@@ -18,6 +18,7 @@ import requests
18
18
  import logging
19
19
  from typing import Dict, List, Optional, Union
20
20
  from urllib.parse import urlparse, parse_qs
21
+ import xml.etree.ElementTree as ET
21
22
 
22
23
  # 配置日志
23
24
  logging.basicConfig(
@@ -35,21 +36,35 @@ class XHSNoteExtractor:
35
36
  包括URL解析、设备连接、页面跳转和笔记内容提取。
36
37
  """
37
38
 
38
- def __init__(self, device_serial: Optional[str] = None):
39
+ def __init__(self, device_serial: Optional[str] = None, enable_time_logging: bool = True):
39
40
  """
40
41
  初始化小红书笔记提取器
41
42
 
42
43
  Args:
43
44
  device_serial (str, optional): 设备序列号,如果为None则自动连接可用设备
45
+ enable_time_logging (bool, optional): 是否启用耗时打印,默认为True
44
46
 
45
47
  Raises:
46
48
  RuntimeError: 当没有可用设备时抛出异常
47
49
  """
48
50
  self.device = None
49
51
  self.device_serial = device_serial
52
+ self.enable_time_logging = enable_time_logging
50
53
  if not self.connect_device():
51
54
  raise RuntimeError("未找到可用的Android设备,请连接设备后再试")
52
55
 
56
+ def _time_method(self, method_name, start_time):
57
+ """
58
+ 记录方法执行时间
59
+
60
+ Args:
61
+ method_name (str): 方法名称
62
+ start_time (float): 开始时间
63
+ """
64
+ if self.enable_time_logging:
65
+ elapsed_time = time.time() - start_time
66
+ logger.info(f"[{method_name}] 耗时: {elapsed_time:.3f}秒")
67
+
53
68
  def connect_device(self) -> bool:
54
69
  """
55
70
  连接设备
@@ -57,15 +72,18 @@ class XHSNoteExtractor:
57
72
  Returns:
58
73
  bool: 是否成功连接设备
59
74
  """
75
+ start_time = time.time()
60
76
  try:
61
77
  if self.device_serial:
62
78
  self.device = u2.connect(self.device_serial)
63
79
  else:
64
80
  self.device = u2.connect()
65
81
  logger.info(f"✓ 已连接设备: {self.device.serial}")
82
+ self._time_method("connect_device", start_time)
66
83
  return True
67
84
  except Exception as e:
68
85
  logger.error(f"✗ 设备连接失败: {e}")
86
+ self._time_method("connect_device", start_time)
69
87
  return False
70
88
  def is_device_connected(self) -> bool:
71
89
  """
@@ -85,7 +103,6 @@ class XHSNoteExtractor:
85
103
 
86
104
  @staticmethod
87
105
  def parse_xhs_url(url: str) -> Dict[str, str]:
88
-
89
106
  """
90
107
  解析小红书URL,提取note_id和xsec_token
91
108
 
@@ -98,6 +115,7 @@ class XHSNoteExtractor:
98
115
  Raises:
99
116
  ValueError: 当URL格式不正确时抛出异常
100
117
  """
118
+ start_time = time.time()
101
119
  # 处理xhsdiscover协议格式
102
120
  if url.startswith("xhsdiscover://"):
103
121
  # 提取note_id
@@ -138,13 +156,22 @@ class XHSNoteExtractor:
138
156
  note_id = path_parts[explore_index + 1]
139
157
  else:
140
158
  raise ValueError("URL中缺少note_id")
159
+ # 兼容 /discovery/item/ 格式
160
+ elif 'discovery' in path_parts and 'item' in path_parts:
161
+ item_index = path_parts.index('item')
162
+ if item_index + 1 < len(path_parts):
163
+ note_id = path_parts[item_index + 1]
164
+ else:
165
+ raise ValueError("URL中缺少note_id")
141
166
  else:
142
- raise ValueError("URL格式不正确,缺少/explore/路径")
167
+ raise ValueError("URL格式不正确,缺少/explore/或/discovery/item/路径")
143
168
 
144
169
  # 提取查询参数中的xsec_token
145
170
  query_params = parse_qs(parsed_url.query)
146
171
  xsec_token = query_params.get('xsec_token', [''])[0]
147
172
 
173
+ elapsed_time = time.time() - start_time
174
+ logger.info(f"[parse_xhs_url] 耗时: {elapsed_time:.3f}秒")
148
175
  return {
149
176
  "note_id": note_id,
150
177
  "xsec_token": xsec_token,
@@ -152,6 +179,8 @@ class XHSNoteExtractor:
152
179
  }
153
180
 
154
181
  else:
182
+ elapsed_time = time.time() - start_time
183
+ logger.info(f"[parse_xhs_url] 耗时: {elapsed_time:.3f}秒")
155
184
  raise ValueError("不支持的URL格式")
156
185
 
157
186
  @staticmethod
@@ -183,12 +212,18 @@ class XHSNoteExtractor:
183
212
  Returns:
184
213
  str: xhsdiscover协议格式的URL
185
214
  """
215
+ start_time = time.time()
216
+ result = ""
186
217
  if xsec_token:
187
218
  original_url = f"http://www.xiaohongshu.com/explore/{note_id}?xsec_token={xsec_token}&xsec_source=pc_feed"
188
219
  encoded_url = requests.utils.quote(original_url)
189
- return f"xhsdiscover://item/{note_id}?open_url={encoded_url}"
220
+ result = f"xhsdiscover://item/{note_id}?open_url={encoded_url}"
190
221
  else:
191
- return f"xhsdiscover://item/{note_id}"
222
+ result = f"xhsdiscover://item/{note_id}"
223
+
224
+ elapsed_time = time.time() - start_time
225
+ logger.info(f"[convert_to_xhsdiscover_format] 耗时: {elapsed_time:.3f}秒")
226
+ return result
192
227
 
193
228
  def extract_note_data(self, url: Optional[str] = None, note_id: Optional[str] = None,
194
229
  xsec_token: Optional[str] = None) -> Dict[str, Union[str, List[str]]]:
@@ -207,6 +242,7 @@ class XHSNoteExtractor:
207
242
  RuntimeError: 当设备未连接时抛出异常
208
243
  Exception: 当提取过程中出现错误时抛出异常
209
244
  """
245
+ start_time = time.time()
210
246
  # 如果提供了URL,则先解析它(验证URL有效性)
211
247
  if url:
212
248
  parsed_data = self.parse_xhs_url(url)
@@ -215,6 +251,7 @@ class XHSNoteExtractor:
215
251
 
216
252
  # 检查设备是否连接
217
253
  if self.device is None:
254
+ self._time_method("extract_note_data", start_time)
218
255
  raise RuntimeError("设备未连接,请先连接设备")
219
256
 
220
257
  # 构建跳转URL
@@ -232,168 +269,174 @@ class XHSNoteExtractor:
232
269
 
233
270
  logger.info(f"✓ 成功提取笔记数据,点赞数: {data['likes']}, 图片数: {len(data['image_urls'])}")
234
271
 
272
+ self._time_method("extract_note_data", start_time)
235
273
  return data
236
274
 
237
275
  except Exception as e:
238
276
  logger.error(f"✗ 提取笔记数据失败: {e}")
277
+ self._time_method("extract_note_data", start_time)
239
278
  raise
240
279
 
241
280
  def _get_detail_data(self) -> Dict[str, Union[str, List[str]]]:
242
281
  """
243
282
  从当前已经打开的小红书详情页提取完整正文、图片和点赞数。
244
- 这是xhs_utils.get_detail_data的封装版本,保持相同功能。
283
+ 优化版本: 使用 dump_hierarchy 替代遍历,大幅提升速度。
245
284
 
246
285
  Returns:
247
286
  Dict[str, Union[str, List[str]]]: 包含笔记数据的字典
248
287
  """
249
- logger.info("🔍 进入深度提取模式...")
288
+ start_time = time.time()
289
+ logger.info("🔍 进入深度提取模式 (XML优化版)...")
250
290
 
251
- # 1. 验证是否进入详情页 (增加重试和多关键词检测)
291
+ # 1. 验证是否进入详情页 & 展开全文
252
292
  detail_loaded = False
253
293
  detail_keywords = ["说点什么", "写评论", "写点什么", "收藏", "点赞", "评论", "分享", "发弹幕"]
254
- for i in range(8):
294
+
295
+ # 尝试点击展开 (预先动作)
296
+ try:
297
+ # 快速检查是否有展开按钮
298
+ for btn_text in ["展开", "查看全部", "全文"]:
299
+ if self.device(text=btn_text).exists:
300
+ self.device(text=btn_text).click()
301
+ break
302
+ except: pass
303
+
304
+ # 等待加载完整
305
+ for i in range(5):
255
306
  if any(self.device(textContains=kw).exists or self.device(descriptionContains=kw).exists for kw in detail_keywords):
256
307
  detail_loaded = True
257
308
  break
258
- if i == 4:
309
+ if i == 2:
259
310
  # 可能是视频,点击屏幕中心尝试激活 UI
260
311
  self.device.click(540, 900)
261
- time.sleep(1)
312
+ time.sleep(0.5)
262
313
 
263
314
  if not detail_loaded:
264
315
  logger.warning("⚠ 警告:详情页特征未发现,提取可能不完整")
265
316
 
266
- # 1.5 提取作者信息 (优先尝试)
317
+ # 2. 获取 UI层级 (核心优化)
318
+ xml_dump_start = time.time()
319
+ xml_content = self.device.dump_hierarchy()
320
+ self._time_method("dump_hierarchy", xml_dump_start)
321
+
322
+ # 3. 解析 XML
323
+ root = ET.fromstring(xml_content)
324
+
325
+ content = ""
326
+ likes = "0"
327
+ collects = "0"
328
+ comments = "0"
267
329
  author_name = "Unknown"
268
- try:
269
- # 策略: 寻找 "关注" 或 "已关注" 按钮,作者名通常在它左边
270
- follow_keyword = "关注"
271
- follow_btn = self.device(text=follow_keyword)
272
- if not follow_btn.exists:
273
- follow_keyword = "已关注"
274
- follow_btn = self.device(text=follow_keyword)
275
-
276
- if follow_btn.exists:
277
- # 获取关注按钮位置
278
- btn_info = follow_btn.info
279
- if btn_info and 'bounds' in btn_info:
280
- btn_cnter_y = (btn_info['bounds']['top'] + btn_info['bounds']['bottom']) / 2
281
- btn_left = btn_info['bounds']['left']
282
-
283
- # 寻找左侧最近的文本
284
- candidates = []
285
- for el in self.device(className="android.widget.TextView"):
286
- txt = el.get_text()
287
- # 名字太长通常不是,但放宽限制到 30
288
- if not txt or txt == follow_keyword or len(txt) > 30: continue
289
-
290
- b = el.info.get('bounds')
291
- if not b: continue
292
-
293
- el_center_y = (b['top'] + b['bottom']) / 2
294
- # 垂直对齐判断 (容差 50px)
295
- if abs(el_center_y - btn_cnter_y) < 50:
296
- # 必须在按钮左侧
297
- if b['right'] < btn_left + 50: # 允许少量重叠或紧贴
298
- candidates.append((txt, b['right']))
299
-
300
- # 选最靠右的那个 (离按钮最近)
301
- if candidates:
302
- candidates.sort(key=lambda x: x[1], reverse=True)
303
- author_name = candidates[0][0]
304
- logger.info(f"✓ 识别到作者: {author_name}")
305
- except Exception as e:
306
- logger.warning(f"⚠ 作者提取异常: {e}")
330
+ image_urls = []
331
+
332
+ # 收集所有 TextView 节点信息
333
+ text_nodes = []
334
+
335
+ def parse_nodes(node):
336
+ if node.attrib.get('class') == 'android.widget.TextView':
337
+ text = node.attrib.get('text', '')
338
+ bounds_str = node.attrib.get('bounds', '[0,0][0,0]')
339
+ # 解析 bounds: [x1,y1][x2,y2]
340
+ try:
341
+ coords = bounds_str.replace('][', ',').replace('[', '').replace(']', '').split(',')
342
+ x1, y1, x2, y2 = map(int, coords)
343
+ if text:
344
+ text_nodes.append({
345
+ 'text': text,
346
+ 'l': x1, 't': y1, 'r': x2, 'b': y2,
347
+ 'cx': (x1 + x2) / 2, 'cy': (y1 + y2) / 2
348
+ })
349
+ except: pass
350
+ for child in node:
351
+ parse_nodes(child)
352
+
353
+ parse_nodes(root)
354
+
355
+ # 4. 分析节点数据
356
+
357
+ # A. 作者提取 (寻找 "关注" 附近的文本)
358
+ # 策略: 找到包含 "关注" 的节点,取其左侧最近的节点
359
+ follow_node = None
360
+ for n in text_nodes:
361
+ if n['text'] in ["关注", "已关注"]:
362
+ follow_node = n
363
+ break
364
+
365
+ if follow_node:
366
+ best_dist = 9999
367
+ for n in text_nodes:
368
+ if n == follow_node: continue
369
+ if n['text'] in ["关注", "已关注"] or len(n['text']) > 30: continue
370
+
371
+ # 垂直接近
372
+ if abs(n['cy'] - follow_node['cy']) < 100:
373
+ # 在左侧
374
+ if n['r'] <= follow_node['l'] + 50:
375
+ dist = follow_node['l'] - n['r']
376
+ if dist < best_dist:
377
+ best_dist = dist
378
+ author_name = n['text']
379
+ logger.info(f"✓ 识别到作者: {author_name}")
307
380
 
308
- # 2. 处理"展开"按钮以获取完整长文
309
- for btn_text in ["展开", "查看全部", "全文"]:
310
- btn = self.device(text=btn_text)
311
- if btn.exists:
312
- logger.info(f"[Action] 点击'{btn_text}'")
313
- btn.click()
314
- time.sleep(1)
381
+ # B. 互动数据提取 (底部区域)
382
+ bottom_nodes = [n for n in text_nodes if n['t'] > 2000] # 假设屏幕高度足够
383
+ bottom_nodes.sort(key=lambda x: x['l']) # 从左到右
384
+
385
+ for n in bottom_nodes:
386
+ txt = n['text']
387
+ num_txt = ''.join(c for c in txt if c.isdigit() or c in ['.', 'w', 'W'])
388
+ if not num_txt: continue
389
+
390
+ cx = n['cx']
391
+ if 500 < cx < 750:
392
+ likes = num_txt
393
+ elif 750 < cx < 900:
394
+ collects = num_txt
395
+ elif cx >= 900:
396
+ comments = num_txt
315
397
 
316
- # 3. 提取正文 (多策略拼接)
317
- content = ""
318
- # 策略 A: 尝试常见 ID
319
- desc_el = self.device(resourceIdMatches=".*desc.*|.*content.*")
320
- if desc_el.exists:
321
- content = desc_el.get_text()
322
-
323
- # 策略 B: 文本容器遍历 (更稳健)
324
- if not content or len(content) < 20:
325
- texts = []
326
- for el in self.device(className="android.widget.TextView"):
327
- try:
328
- t = el.get_text()
329
- if not t or len(t) < 3: continue
330
- # 过滤坐标:只取屏幕中间内容区
331
- b = el.info.get('bounds', {})
332
- if 200 < b.get('top', 0) < 2100:
333
- if not any(k in t for k in ['收藏', '点赞', '评论', '分享', '发布于', '说点什么', '条评论']):
334
- texts.append(t)
335
- except: continue
336
- content = "\n".join(texts)
398
+ # C. 正文提取
399
+ # 过滤掉非正文内容
400
+ content_lines = []
401
+ exclude_keywords = ['收藏', '点赞', '评论', '分享', '发布于', '说点什么', '条评论', '关注', author_name]
402
+
403
+ # 按照垂直位置排序
404
+ content_nodes = [n for n in text_nodes if 200 < n['t'] < 2000]
405
+ content_nodes.sort(key=lambda x: x['t'])
406
+
407
+ for n in content_nodes:
408
+ t = n['text']
409
+ if len(t) < 2: continue
410
+ if any(k in t for k in exclude_keywords): continue
411
+
412
+ # 简单的去重策略
413
+ if content_lines and t in content_lines[-1]: continue
414
+ content_lines.append(t)
415
+
416
+ content = "\n".join(content_lines)
337
417
 
338
- # 4. 提取图片 (通过分享链接解析高清图)
339
- image_urls = []
418
+ # 5. 图片提取 (保持原有逻辑但优化等待)
340
419
  try:
420
+ # 这里还是需要交互,无法纯靠XML
341
421
  share_btn = self.device(description="分享")
342
422
  if share_btn.exists:
343
423
  share_btn.click()
344
- time.sleep(1.5)
345
- copy_link_btn = self.device(text="复制链接")
346
- if copy_link_btn.exists:
347
- copy_link_btn.click()
424
+ # 显式等待 "复制链接"
425
+ copy_link = self.device(text="复制链接")
426
+ if copy_link.wait(timeout=2.0):
427
+ copy_link.click()
428
+ # 等待剪贴板更新? 稍微缓一下
348
429
  time.sleep(0.5)
349
430
  share_link = self.device.clipboard
350
431
  if "http" in str(share_link):
351
432
  image_urls = self._fetch_web_images(share_link)
352
433
  else:
434
+ logger.warning("未找到复制链接按钮")
353
435
  self.device.press("back")
354
436
  except Exception as e:
355
437
  logger.warning(f"⚠ 图片提取异常: {e}")
356
438
 
357
- # 5. 提取互动数据 (点赞、收藏、评论)
358
- likes = "0"
359
- collects = "0"
360
- comments = "0"
361
-
362
- try:
363
- # 在底部区域查找互动数据
364
- # 通常顺序是:左边评论(或直接显示写评论),右边依次是点赞、收藏、评论数
365
- # 策略:遍历底部 TextView,根据位置和内容识别
366
- bottom_elements = []
367
- for el in self.device(className="android.widget.TextView"):
368
- b = el.info.get('bounds', {})
369
- if b.get('top', 0) > 2000: # 屏幕底部区域
370
- bottom_elements.append(el)
371
-
372
- # 排序:按从左到右
373
- bottom_elements.sort(key=lambda x: x.info.get('bounds', {}).get('left', 0))
374
-
375
- for el in bottom_elements:
376
- txt = el.get_text() or ""
377
- # 提取数字部分
378
- num_txt = ''.join(c for c in txt if c.isdigit() or c in ['.', 'w', 'W'])
379
- if not num_txt: continue
380
-
381
- b = el.info.get('bounds', {})
382
- left = b.get('left', 0)
383
-
384
- # 根据位置初步判断 (这类位置可能随机型变化,但相对顺序通常一致)
385
- # 点赞通常在 500-750 左右
386
- # 收藏通常在 750-900 左右
387
- # 评论通常在 900+ 左右
388
- if 500 < left < 750:
389
- likes = num_txt
390
- elif 750 < left < 900:
391
- collects = num_txt
392
- elif left >= 900:
393
- comments = num_txt
394
- except Exception as e:
395
- logger.warning(f"⚠ 互动数据提取异常: {e}")
396
-
439
+ self._time_method("_get_detail_data", start_time)
397
440
  return {
398
441
  "content": content,
399
442
  "image_urls": image_urls,
@@ -413,6 +456,7 @@ class XHSNoteExtractor:
413
456
  Returns:
414
457
  List[str]: 图片URL列表
415
458
  """
459
+ start_time = time.time()
416
460
  try:
417
461
  headers = {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_8 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1"}
418
462
  res = requests.get(url, headers=headers, timeout=10)
@@ -428,8 +472,10 @@ class XHSNoteExtractor:
428
472
  for m in matches:
429
473
  clean_url = m.replace('\\u002F', '/')
430
474
  if clean_url not in found: found.append(clean_url)
475
+ self._time_method("_fetch_web_images", start_time)
431
476
  return found
432
477
  except:
478
+ self._time_method("_fetch_web_images", start_time)
433
479
  return []
434
480
 
435
481
  def save_note_data(self, data: Dict[str, Union[str, List[str]]],
@@ -443,6 +489,7 @@ class XHSNoteExtractor:
443
489
  filename (str): 保存文件名
444
490
  note_url (str): 笔记URL
445
491
  """
492
+ start_time = time.time()
446
493
  try:
447
494
  with open(filename, "w", encoding="utf-8") as f:
448
495
  f.write("=" * 50 + "\n")
@@ -467,24 +514,32 @@ class XHSNoteExtractor:
467
514
  f.write("=" * 50 + "\n")
468
515
 
469
516
  logger.info(f"✓ 笔记数据已保存到: {filename}")
517
+ self._time_method("save_note_data", start_time)
470
518
  except Exception as e:
471
519
  logger.error(f"✗ 保存笔记数据失败: {e}")
520
+ self._time_method("save_note_data", start_time)
472
521
  raise
473
522
 
474
523
 
475
- def extract_note_from_url(url: str, device_serial: Optional[str] = None) -> Dict[str, Union[str, List[str]]]:
524
+ def extract_note_from_url(url: str, device_serial: Optional[str] = None, enable_time_logging: bool = True) -> Dict[str, Union[str, List[str]]]:
476
525
  """
477
526
  便捷函数:直接从URL提取笔记数据
478
527
 
479
528
  Args:
480
529
  url (str): 小红书笔记URL
481
530
  device_serial (str, optional): 设备序列号
531
+ enable_time_logging (bool, optional): 是否启用耗时打印,默认为True
482
532
 
483
533
  Returns:
484
534
  Dict[str, Union[str, List[str]]]: 笔记数据
485
535
  """
486
- extractor = XHSNoteExtractor(device_serial=device_serial)
487
- return extractor.extract_note_data(url=url)
536
+ start_time = time.time()
537
+ logger.info(f"[extract_note_from_url] 开始处理URL: {url}")
538
+ extractor = XHSNoteExtractor(device_serial=device_serial, enable_time_logging=enable_time_logging)
539
+ result = extractor.extract_note_data(url=url)
540
+ elapsed_time = time.time() - start_time
541
+ logger.info(f"[extract_note_from_url] 总耗时: {elapsed_time:.3f}秒")
542
+ return result
488
543
 
489
544
 
490
545
  def convert_url_format(url: str) -> str:
@@ -497,8 +552,13 @@ def convert_url_format(url: str) -> str:
497
552
  Returns:
498
553
  str: 转换后的xhsdiscover协议格式URL
499
554
  """
555
+ start_time = time.time()
556
+ logger.info(f"[convert_url_format] 开始转换URL: {url}")
500
557
  parsed_data = XHSNoteExtractor.parse_xhs_url(url)
501
- return XHSNoteExtractor.convert_to_xhsdiscover_format(
558
+ result = XHSNoteExtractor.convert_to_xhsdiscover_format(
502
559
  parsed_data["note_id"],
503
560
  parsed_data["xsec_token"]
504
- )
561
+ )
562
+ elapsed_time = time.time() - start_time
563
+ logger.info(f"[convert_url_format] 耗时: {elapsed_time:.3f}秒,结果: {result}")
564
+ return result
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xhs-note-extractor
3
- Version: 0.1.dev4
3
+ Version: 0.1.dev6
4
4
  Summary: A Python package for extracting Xiaohongshu (Little Red Book) note data from URLs
5
5
  Author-email: JoyCode Agent <agent@joycode.com>
6
6
  License: MIT