xhs-note-extractor 0.1.dev6__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,6 +20,10 @@ from typing import Dict, List, Optional, Union
20
20
  from urllib.parse import urlparse, parse_qs
21
21
  import xml.etree.ElementTree as ET
22
22
 
23
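+ # Helpers for parsing publish-time text and like/collect/comment counts (agent_login is imported lazily inside extract_note_data)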
24
+ from .date_desc_utils import parse_time_to_timestamp_ms
25
+ from .number_utils import parse_count_to_int
26
+
23
27
  # 配置日志
24
28
  logging.basicConfig(
25
29
  level=logging.INFO,
@@ -36,23 +40,64 @@ class XHSNoteExtractor:
36
40
  包括URL解析、设备连接、页面跳转和笔记内容提取。
37
41
  """
38
42
 
39
def __init__(self, devices: Optional[dict] = None):
    """
    Initialise the Xiaohongshu note extractor.

    Only stores the configuration; no device connection is opened here.
    The device is connected later, when it is first needed.

    Args:
        devices (dict, optional): Device information keyed by device
            serial. Each value is the list of phone numbers that may be
            used for the Xiaohongshu account on that device, e.g.
            {
                "b520805": ["13800000000"]
            }

    Raises:
        ValueError: If ``devices`` is missing, empty, or not a dict.
    """
    if not devices:
        raise ValueError("设备信息必须从外部传入")
    # Other methods call .keys() and subscript by serial, so reject
    # non-dict input (e.g. a bare list of serials) up front instead of
    # failing later with an unrelated AttributeError/TypeError.
    if not isinstance(devices, dict):
        raise ValueError("设备信息必须是字典: {设备序列号: [手机号, ...]}")

    self.device = None  # currently connected device
    self.next_phone = None  # phone number selected for the next login
    self.devices_info = devices  # serial -> list of phone numbers
    self.problematic_devices = []  # records of devices that failed to fetch a note
    self.enable_time_logging = True  # timing logs are enabled by default

    # Log the configured devices
    logger.info(f"已配置设备信息: {self.devices_info}")
    logger.info("设备将在需要时连接")
55
68
 
69
def _get_next_phone_number(self, device_serial: str) -> Optional[str]:
    """
    Pick the next phone number for a device, cycling through its list.

    The selection is remembered in ``self.next_phone``. When nothing is
    remembered yet, or the remembered number is not in this device's
    list, the first number is chosen; otherwise the number following the
    remembered one is chosen, wrapping around at the end of the list.

    Args:
        device_serial (str): Serial of the device whose list is used.

    Returns:
        Optional[str]: The chosen phone number, or None when the device
        is unknown or has no phone numbers (``self.next_phone`` is left
        untouched in that case).
    """
    candidates = self.devices_info.get(device_serial)
    if not candidates:
        return None

    remembered = self.next_phone
    if remembered and remembered in candidates:
        # Advance one slot, wrapping back to the start of the list.
        slot = (candidates.index(remembered) + 1) % len(candidates)
    else:
        slot = 0

    self.next_phone = candidates[slot]
    return self.next_phone
56
101
  def _time_method(self, method_name, start_time):
57
102
  """
58
103
  记录方法执行时间
@@ -63,28 +108,92 @@ class XHSNoteExtractor:
63
108
  """
64
109
  if self.enable_time_logging:
65
110
  elapsed_time = time.time() - start_time
66
- logger.info(f"[{method_name}] 耗时: {elapsed_time:.3f}秒")
111
+ if elapsed_time < 1:
112
+ logger.info(f"⏱️ [{method_name}] 耗时: {elapsed_time*1000:.0f}ms")
113
+ else:
114
+ logger.info(f"⏱️ [{method_name}] 耗时: {elapsed_time:.2f}s")
115
+
116
+
67
117
 
68
def connect_device(self, device_serial: Optional[str] = None) -> bool:
    """
    Connect to a device and restart the Xiaohongshu app on it.

    Args:
        device_serial (str, optional): Serial of the device to connect
            to. When omitted, the first serial in ``self.devices_info``
            is used.

    Returns:
        bool: True when the device was connected and the app restarted;
        False when no serial could be determined or any step raised.
    """
    began = time.time()
    xhs_package = "com.xingin.xhs"

    # Fall back to the first configured device when none was requested.
    serial = device_serial
    if not serial and self.devices_info:
        serial = next(iter(self.devices_info.keys()))

    try:
        if serial:
            self.device = u2.connect(serial)
            logger.info(f"✓ 已连接设备: {self.device.serial}")
            # Timing is reported for the connection only, before the restart.
            self._time_method("connect_device", began)

            # Restart the app so it starts from a known state.
            logger.info("🔄 重启小红书应用...")
            self.device.app_stop(xhs_package)
            time.sleep(1)
            self.device.app_start(xhs_package)
            time.sleep(3)

            # Select the phone number to use for this device.
            self.next_phone = self._get_next_phone_number(serial)
            logger.warning(f'next_phone:{self.next_phone}')
            return True

        logger.error("✗ 设备连接失败: 无法确定设备序列号")
        self._time_method("connect_device", began)
        return False
    except Exception as exc:
        logger.error(f"✗ 设备连接失败: {exc}")
        self._time_method("connect_device", began)
        return False
160
+
161
def switch_to_next_device(self) -> bool:
    """
    Connect to the device that follows the current one in ``devices_info``.

    The remembered phone number is reset first. The starting position is
    the current device's serial; if that serial is not configured, the
    last entry of ``self.problematic_devices`` is used instead. The
    successor wraps around to the first device after the last one.

    Returns:
        bool: False when fewer than two devices are configured,
        otherwise the result of ``connect_device`` for the next device.
    """
    self.next_phone = None  # forget the phone chosen for the previous device
    if not self.devices_info or len(self.devices_info) <= 1:
        logger.warning("没有更多可用设备可以切换")
        return False

    active_serial = None if not self.device else self.device.serial
    logger.info(f"当前设备: {active_serial}")
    serials = list(self.devices_info.keys())
    logger.info(f"device_serials: {serials}")

    # Position of the active device, or -1 when it is not configured.
    try:
        position = serials.index(active_serial)
    except ValueError:
        position = -1
    logger.info(f"current_index: {position}")

    # Without a usable current device, resume after the most recently
    # recorded problematic device (if that one is configured).
    tried = [entry['serial'] for entry in self.problematic_devices]
    if position == -1 and tried and tried[-1] in serials:
        position = serials.index(tried[-1])

    # Step to the successor, wrapping around the list.
    candidate = serials[(position + 1) % len(serials)]
    logger.info(f"next_device_serial: {candidate}")
    logger.info(f"尝试切换到设备: {candidate}")
    return self.connect_device(candidate)
88
197
  def is_device_connected(self) -> bool:
89
198
  """
90
199
  检查设备是否仍然连接
@@ -100,7 +209,71 @@ class XHSNoteExtractor:
100
209
  return True
101
210
  except:
102
211
  return False
212
+
213
def get_problematic_devices(self) -> List[Dict[str, Union[str, float]]]:
    """
    Return the records of devices that failed to fetch a note.

    The internal list itself is returned, not a copy.

    Returns:
        List[Dict[str, Union[str, float]]]: One record per device, with
        the keys:
            - serial: device serial number
            - reason: why the device was recorded
            - note_id: ID of the note being extracted
            - timestamp: time the record was made
    """
    failed_records = self.problematic_devices
    return failed_records
225
+
226
def clear_problematic_devices(self) -> None:
    """
    Remove every record from the problematic-device list.

    The list is emptied in place, so existing references to it see the
    change as well.
    """
    del self.problematic_devices[:]
231
# Restart the Xiaohongshu app and log the current account out through the UI.
def clear_login_state(self, device_serial=None):
    """
    Log out of the Xiaohongshu app on a device by driving its UI.

    A fresh connection to ``device_serial`` is opened, the app is
    restarted, and the flow profile tab -> settings -> logout -> confirm
    is clicked through. The method returns early when the profile tab is
    not found or when the login buttons are already visible.

    Failures inside the UI flow are logged and swallowed; failures while
    connecting or restarting the app propagate to the caller.

    Args:
        device_serial (str, optional): Serial passed to ``u2.connect``.
    """
    # Connect to the device
    d = u2.connect(device_serial)

    # Restart the app
    d.app_stop('com.xingin.xhs')
    time.sleep(1)  # give the app time to stop
    d.app_start('com.xingin.xhs')
    time.sleep(2)  # wait for the app to launch
    try:
        if not d(text='我').exists():
            logger.info("已退出登录,无需退出登录")
            return

        # Open the "me" / profile tab
        d(description='我').click()
        time.sleep(2)

        # Visible login buttons mean no account is logged in, so there
        # is nothing to log out of.
        if d(text='微信登录').exists() or d(text='手机号登录').exists():
            logger.info("未登录,无需退出登录")
            return

        # Open settings
        d(description='设置').click()
        time.sleep(2)

        # Scroll towards the logout entry
        d.swipe_ext('up', scale=0.5)
        time.sleep(1)

        # Tap "log out"
        d(text='退出登录').click()
        time.sleep(1)

        # Confirm in the dialog (same label)
        d(text='退出登录').click()
        time.sleep(2)

        logger.info("退出登录成功")
    except Exception as e:
        logger.error(f"退出登录失败: {e}")
276
+
104
277
  @staticmethod
105
278
  def parse_xhs_url(url: str) -> Dict[str, str]:
106
279
  """
@@ -225,10 +398,10 @@ class XHSNoteExtractor:
225
398
  logger.info(f"[convert_to_xhsdiscover_format] 耗时: {elapsed_time:.3f}秒")
226
399
  return result
227
400
 
228
- def extract_note_data(self, url: Optional[str] = None, note_id: Optional[str] = None,
229
- xsec_token: Optional[str] = None) -> Dict[str, Union[str, List[str]]]:
401
+ def extract_note_data(self, url: Optional[str] = None, note_id: Optional[str] = None,
402
+ xsec_token: Optional[str] = None) -> Optional[Dict[str, Union[str, List[str]]]]:
230
403
  """
231
- 从小红书笔记中提取数据
404
+ 从小红书笔记中提取数据,支持设备重试机制
232
405
 
233
406
  Args:
234
407
  url (str, optional): 小红书URL,如果提供则会解析其中的note_id和xsec_token
@@ -236,10 +409,9 @@ class XHSNoteExtractor:
236
409
  xsec_token (str, optional): xsec_token参数
237
410
 
238
411
  Returns:
239
- Dict[str, Union[str, List[str]]]: 包含笔记数据的字典,格式与xhs_utils.get_detail_data()一致
412
+ Optional[Dict[str, Union[str, List[str]]]]: 包含笔记数据的字典,如果没有成功则返回None
240
413
 
241
414
  Raises:
242
- RuntimeError: 当设备未连接时抛出异常
243
415
  Exception: 当提取过程中出现错误时抛出异常
244
416
  """
245
417
  start_time = time.time()
@@ -248,40 +420,129 @@ class XHSNoteExtractor:
248
420
  parsed_data = self.parse_xhs_url(url)
249
421
  note_id = parsed_data["note_id"]
250
422
  xsec_token = parsed_data["xsec_token"]
251
-
252
- # 检查设备是否连接
253
- if self.device is None:
254
- self._time_method("extract_note_data", start_time)
255
- raise RuntimeError("设备未连接,请先连接设备")
256
423
 
257
- # 构建跳转URL
258
- jump_url = self.convert_to_xhsdiscover_format(note_id, xsec_token)
424
+ max_retries = len(self.devices_info) if self.devices_info else 1
425
+ attempted_devices = []
259
426
 
260
- logger.info(f"正在尝试跳转至笔记: {note_id}")
261
-
262
- try:
263
- # 发起跳转
264
- self.device.open_url(jump_url)
265
- logger.info("✓ 已发送跳转指令,等待页面加载...")
427
+ for attempt in range(max_retries):
428
+ logger.info(f"尝试第 {attempt + 1}/{max_retries} 次提取笔记: {note_id}")
266
429
 
267
- # 使用现有的xhs_utils功能提取数据
268
- data = self._get_detail_data()
430
+ # 检查设备是否连接,如果没有则尝试连接
431
+ if self.device is None:
432
+ if not self.connect_device():
433
+ logger.warning("设备连接失败,尝试下一个设备")
434
+ # 记录连接失败的设备
435
+ device_serials = list(self.devices_info.keys())
436
+ if device_serials and attempt < len(device_serials):
437
+ failed_device = device_serials[attempt]
438
+ if failed_device not in [d['serial'] for d in self.problematic_devices]:
439
+ self.problematic_devices.append({
440
+ 'serial': failed_device,
441
+ 'reason': '设备连接失败',
442
+ 'note_id': note_id,
443
+ 'timestamp': time.time()
444
+ })
445
+ if self.switch_to_next_device():
446
+ continue
447
+ else:
448
+ break
269
449
 
270
- logger.info(f"✓ 成功提取笔记数据,点赞数: {data['likes']}, 图片数: {len(data['image_urls'])}")
450
+ # 构建跳转URL
451
+ jump_url = self.convert_to_xhsdiscover_format(note_id, xsec_token)
271
452
 
272
- self._time_method("extract_note_data", start_time)
273
- return data
453
+ logger.info(f"正在尝试跳转至笔记: {note_id} (设备: {self.device.serial if self.device else '未知'})")
274
454
 
275
- except Exception as e:
276
- logger.error(f"✗ 提取笔记数据失败: {e}")
277
- self._time_method("extract_note_data", start_time)
278
- raise
455
+ try:
456
+ # # 在跳转链接前重启APP
457
+ # logger.info(f"🔄 准备跳转至笔记 {note_id},正在重启APP...")
458
+ # self.restart_xhs_app()
459
+
460
+ # 发起跳转
461
+ self.device.open_url(jump_url)
462
+ logger.info("✓ 已发送跳转指令,等待页面加载...")
463
+
464
+ # 使用现有的xhs_utils功能提取数据
465
+ data = self._get_detail_data(jump_url)
466
+
467
+ # 如果返回None,说明需要登录,尝试下一个设备
468
+ if data is None:
469
+ logger.warning(f"当前设备{self.device.serial}需要登录,尝试切换到下一个设备")
470
+ attempted_devices.append(self.device.serial if self.device else "未知设备")
471
+ # 尝试重新登录
472
+ # 触发退出登录
473
+ self.clear_login_state(self.device.serial)
474
+ # 触发登录
475
+ # 确认当前设备对应的手机号
476
+ self.next_phone = self._get_next_phone_number(self.device.serial)
477
+ logger.warning(f'next_phone:{self.next_phone}')
478
+ try:
479
+ # 延迟加载agent_login模块以避免不必要的依赖
480
+ from .agent_login import do_login
481
+ if do_login(phone_number=self.next_phone, device_id=self.device.serial):
482
+ logger.info(f"✓ 设备{self.device.serial}登录成功")
483
+ continue
484
+ else:
485
+ logger.warning(f"✓ 设备{self.device.serial}登录失败")
486
+ attempted_devices.append(self.device.serial) # 记录尝试过的设备
487
+ # 保存当前设备序列号,以便switch_to_next_device知道从哪里开始
488
+ failed_device_serial = self.device.serial
489
+ self.device = None
490
+
491
+ # 手动记录失败的设备信息
492
+ if failed_device_serial not in [d['serial'] for d in self.problematic_devices]:
493
+ self.problematic_devices.append({
494
+ 'serial': failed_device_serial,
495
+ 'reason': '设备登录失败',
496
+ 'note_id': note_id,
497
+ 'timestamp': time.time()
498
+ })
499
+
500
+ # 尝试切换到下一个设备
501
+ if not self.switch_to_next_device():
502
+ logger.error("没有更多可用设备,提取失败")
503
+ self._time_method("extract_note_data", start_time)
504
+ return {}
505
+ continue
506
+ except ImportError as e:
507
+ logger.warning(f"无法导入登录模块: {e}")
508
+ logger.warning("将尝试跳过登录步骤,继续使用当前设备")
509
+ continue
510
+
511
+ logger.info(f"✓ 成功提取笔记数据,点赞数: {data['likes']}, 图片数: {len(data['image_urls'])}")
512
+ self._time_method("extract_note_data", start_time)
513
+ return data
514
+
515
+ except Exception as e:
516
+ logger.error(f"✗ 提取笔记数据失败: {e}")
517
+ attempted_devices.append(self.device.serial if self.device else "未知设备")
518
+
519
+ # 记录有问题的设备
520
+ if self.device and self.device.serial not in [d['serial'] for d in self.problematic_devices]:
521
+ self.problematic_devices.append({
522
+ 'serial': self.device.serial,
523
+ 'reason': f'提取异常: {str(e)}',
524
+ 'note_id': note_id,
525
+ 'timestamp': time.time()
526
+ })
527
+
528
+ # 如果还有设备可用,尝试下一个
529
+ if attempt < max_retries - 1 and self.switch_to_next_device():
530
+ continue
531
+ else:
532
+ logger.error("所有设备尝试完毕,提取失败")
533
+ self._time_method("extract_note_data", start_time)
534
+ logger.error(f"所有设备尝试完毕,提取失败。尝试过的设备: {attempted_devices}")
535
+ self._time_method("extract_note_data", start_time)
536
+ return {}
279
537
 
280
- def _get_detail_data(self) -> Dict[str, Union[str, List[str]]]:
538
+ def _get_detail_data(self, jump_url: str) -> Dict[str, Union[str, List[str]]]:
281
539
  """
282
540
  从当前已经打开的小红书详情页提取完整正文、图片和点赞数。
283
541
  优化版本: 使用 dump_hierarchy 替代遍历,大幅提升速度。
284
542
 
543
+ Args:
544
+ jump_url (str): 笔记的跳转URL,用于白屏时重新加载
545
+
285
546
  Returns:
286
547
  Dict[str, Union[str, List[str]]]: 包含笔记数据的字典
287
548
  """
@@ -290,131 +551,245 @@ class XHSNoteExtractor:
290
551
 
291
552
  # 1. 验证是否进入详情页 & 展开全文
292
553
  detail_loaded = False
293
- detail_keywords = ["说点什么", "写评论", "写点什么", "收藏", "点赞", "评论", "分享", "发弹幕"]
294
-
295
- # 尝试点击展开 (预先动作)
296
554
  try:
297
- # 快速检查是否有展开按钮
298
- for btn_text in ["展开", "查看全部", "全文"]:
299
- if self.device(text=btn_text).exists:
300
- self.device(text=btn_text).click()
301
- break
555
+ if self.device(text="展开").exists:
556
+ self.device(text="展开").click()
302
557
  except: pass
303
558
 
304
- # 等待加载完整
305
- for i in range(5):
306
- if any(self.device(textContains=kw).exists or self.device(descriptionContains=kw).exists for kw in detail_keywords):
307
- detail_loaded = True
308
- break
309
- if i == 2:
310
- # 可能是视频,点击屏幕中心尝试激活 UI
311
- self.device.click(540, 900)
312
- time.sleep(0.5)
559
+ # 超快速检查 - 只等0.2秒
560
+ time.sleep(0.2)
313
561
 
314
- if not detail_loaded:
315
- logger.warning(" 警告:详情页特征未发现,提取可能不完整")
316
-
317
- # 2. 获取 UI层级 (核心优化)
318
- xml_dump_start = time.time()
319
- xml_content = self.device.dump_hierarchy()
320
- self._time_method("dump_hierarchy", xml_dump_start)
562
+ # 快速检查登录状态
563
+ if self.device(textContains="其他登录方式").exists or self.device(textContains="微信登录").exists or self.device(textContains="登录发现更多精彩").exists:
564
+ logger.error("✗ 需要登录才能查看详情页内容,提取终止")
565
+ return None
321
566
 
322
- # 3. 解析 XML
323
- root = ET.fromstring(xml_content)
567
+ # 极简检查 - 只检查一次
568
+ time.sleep(0.3)
569
+ detail_count = 5
570
+ detail_loaded = False
571
+ while(detail_count > 0):
572
+ if not self.device(textContains="关注").exists:
573
+ detail_count -= 1
574
+ time.sleep(0.1)
575
+ continue
576
+ detail_loaded = True
577
+ break
324
578
 
579
+ if not detail_loaded:
580
+ logger.warning("⚠ 警告:详情页特征未发现,提取可能不完整")
581
+
582
+ # 智能滚动 - 确保看到发布时间和评论区 (优化速度版)
583
+ scroll_phase_start = time.time()
584
+ try:
585
+ # 定义需要查找的目标元素 (正则匹配)
586
+ target_pattern = re.compile(r"条评论|发布于|小时前|天前|月前|年前|昨天|今天")
587
+
588
+ # 最多滚动6次,单次距离加大
589
+ for i in range(6):
590
+ # 向下滚动
591
+ swipe_start = time.time()
592
+ self.device.swipe(540, 1600, 540, 600, 0.1)
593
+ self._time_method(f"scroll_swipe_{i+1}", swipe_start)
594
+
595
+ # 核心优化:只 dump 一次,在字符串中搜索,避免多次 exists() 调用的开销
596
+ dump_start = time.time()
597
+ xml_temp = self.device.dump_hierarchy()
598
+ self._time_method(f"scroll_dump_{i+1}", dump_start)
599
+
600
+ if target_pattern.search(xml_temp):
601
+ logger.info(f"✓ 已检测到目标元素 (第 {i+1} 次滚动)")
602
+ break
603
+
604
+ # 极短间隔
605
+ time.sleep(0.1)
606
+
607
+ time.sleep(0.3) # 稳定时间
608
+ self._time_method("intelligent_scroll_total", scroll_phase_start)
609
+ logger.info("✓ 滚动完成")
610
+ except Exception as e:
611
+ logger.warning(f"滚动失败: {e}")
612
+
613
+ # 初始化提取变量
325
614
  content = ""
326
- likes = "0"
327
- collects = "0"
328
- comments = "0"
615
+ likes = 0
616
+ collects = 0
617
+ comments = 0
329
618
  author_name = "Unknown"
619
+ publish_time = 0
620
+ date_desc = ""
330
621
  image_urls = []
331
622
 
332
- # 收集所有 TextView 节点信息
623
+ # 2. 获取 UI层级 (核心优化)
624
+ # 增加一次重试逻辑,如果第一次没抓到日期
333
625
  text_nodes = []
626
+ limit_y = 2500
334
627
 
335
- def parse_nodes(node):
336
- if node.attrib.get('class') == 'android.widget.TextView':
337
- text = node.attrib.get('text', '')
628
+ for attempt in range(2):
629
+ xml_dump_start = time.time()
630
+ xml_content = self.device.dump_hierarchy()
631
+ self._time_method("dump_hierarchy", xml_dump_start)
632
+
633
+ # 检测白屏状态 - 检查文本节点数量
634
+ current_text_nodes = []
635
+ root = ET.fromstring(xml_content)
636
+
637
+ def parse_nodes(node):
638
+ text = node.attrib.get('text', '') or node.attrib.get('content-desc', '')
338
639
  bounds_str = node.attrib.get('bounds', '[0,0][0,0]')
339
- # 解析 bounds: [x1,y1][x2,y2]
340
640
  try:
341
641
  coords = bounds_str.replace('][', ',').replace('[', '').replace(']', '').split(',')
342
642
  x1, y1, x2, y2 = map(int, coords)
343
643
  if text:
344
- text_nodes.append({
644
+ current_text_nodes.append({
345
645
  'text': text,
346
646
  'l': x1, 't': y1, 'r': x2, 'b': y2,
347
647
  'cx': (x1 + x2) / 2, 'cy': (y1 + y2) / 2
348
648
  })
349
649
  except: pass
350
- for child in node:
351
- parse_nodes(child)
650
+ for child in node: parse_nodes(child)
651
+
652
+ parse_nodes(root)
653
+
654
+ # 白屏检测:如果文本节点太少,可能是白屏
655
+ print(f'当前文本节点数量: {len(current_text_nodes)}')
656
+ if len(current_text_nodes) < 11:
657
+ logger.error(f"✗ 检测到白屏状态 - 文本节点数量异常少 ({len(current_text_nodes)}个节点)")
658
+ logger.info("--- 调试: 捕获的文本节点 ---")
659
+ for i, n in enumerate(current_text_nodes):
660
+ logger.info(f"[{i}] {n['text']} (t={n['t']}, b={n['b']}, l={n['l']}, r={n['r']})")
661
+ logger.info("--- 调试结束 ---")
352
662
 
353
- parse_nodes(root)
354
-
355
- # 4. 分析节点数据
356
-
357
- # A. 作者提取 (寻找 "关注" 附近的文本)
358
- # 策略: 找到包含 "关注" 的节点,取其左侧最近的节点
359
- follow_node = None
360
- for n in text_nodes:
361
- if n['text'] in ["关注", "已关注"]:
362
- follow_node = n
363
- break
364
-
365
- if follow_node:
366
- best_dist = 9999
663
+ # 如果是第一次尝试,重新加载页面
664
+ if attempt == 0:
665
+ logger.info("🔄 尝试重新加载页面...")
666
+ # 重新发送跳转指令
667
+ self.device.open_url(jump_url)
668
+ time.sleep(2) # 等待页面重新加载
669
+ continue
670
+ else:
671
+ # 第二次尝试仍白屏,直接返回None
672
+ logger.error("✗ 页面加载失败 - 白屏状态")
673
+ return None
674
+
675
+ # 检查是否存在加载指示器
676
+ loading_found = False
677
+ for node in current_text_nodes:
678
+ if re.search(r'(加载|loading|等待|waiting|\.\.\.|\\u231a|\\u25ba)', node['text'], re.IGNORECASE):
679
+ loading_found = True
680
+ break
681
+
682
+ if loading_found:
683
+ logger.warning("⚠ 检测到页面正在加载中")
684
+ if attempt == 0:
685
+ logger.info("🔄 等待页面加载完成...")
686
+ time.sleep(2)
687
+ continue
688
+
689
+ text_nodes = current_text_nodes # 保留最新的节点供后续提取使用
690
+
691
+ # 4. 分析节点数据 (简化版日期快速检查)
692
+ found_date_in_this_xml = False
693
+ follow_node = None
367
694
  for n in text_nodes:
368
- if n == follow_node: continue
369
- if n['text'] in ["关注", "已关注"] or len(n['text']) > 30: continue
370
-
371
- # 垂直接近
372
- if abs(n['cy'] - follow_node['cy']) < 100:
373
- # 在左侧
374
- if n['r'] <= follow_node['l'] + 50:
695
+ if n['text'] in ["关注", "已关注"]:
696
+ follow_node = n
697
+ break
698
+
699
+ if follow_node:
700
+ # 寻找作者名
701
+ best_dist = 999
702
+ for n in text_nodes:
703
+ if n == follow_node: continue
704
+ if abs(n['cy'] - follow_node['cy']) < 100 and n['r'] <= follow_node['l'] + 50:
375
705
  dist = follow_node['l'] - n['r']
376
706
  if dist < best_dist:
377
707
  best_dist = dist
378
708
  author_name = n['text']
379
- logger.info(f"✓ 识别到作者: {author_name}")
709
+
710
+ # 寻找日期
711
+ min_y = follow_node['b'] if follow_node else 150
712
+ # 提前寻找 limit_y
713
+ current_limit_y = 2500
714
+ for n in text_nodes:
715
+ if re.match(r"^共\s*\d+\s*条评论$", n['text']) or n['text'] in ["说点什么", "写评论", "写点什么", "这里是评论区"]:
716
+ current_limit_y = min(current_limit_y, n['t'])
717
+ limit_y = current_limit_y
380
718
 
719
+ for n in text_nodes:
720
+ if n['t'] > min_y - 200 and n['b'] < limit_y + 150:
721
+ txt = n['text'].strip()
722
+ if 2 <= len(txt) <= 50 and txt not in ["点赞", "收藏", "评论", "关注", "分享", "回复", "不喜欢"]:
723
+ try:
724
+ ts = parse_time_to_timestamp_ms(txt)
725
+ publish_time = ts
726
+ date_desc = txt
727
+ found_date_in_this_xml = True
728
+ # 不要 break,因为日期通常在最后
729
+ except: continue
730
+
731
+ if found_date_in_this_xml:
732
+ break
733
+
734
+ if attempt == 0:
735
+ logger.warning("⚠ 未识别到发布时间,尝试额外滚动并重试...")
736
+ self.device.swipe(540, 1500, 540, 1000, 0.2)
737
+ time.sleep(0.5)
738
+
739
+ if not date_desc:
740
+ logger.warning("未识别到发布时间")
741
+ # 埋点调试: 打印出识别到的所有节点及其坐标
742
+ logger.info("--- 调试: 所有捕获的文本节点 ---")
743
+ for i, n in enumerate(text_nodes):
744
+ logger.info(f"[{i}] {n['text']} (t={n['t']}, b={n['b']}, l={n['l']}, r={n['r']})")
745
+ logger.info("--- 调试结束 ---")
746
+ else:
747
+ logger.info(f"✓ 识别到发布时间: {date_desc} -> {publish_time}")
748
+
749
+ logger.info(f"text_nodes: {text_nodes}")
750
+
751
+
381
752
  # B. 互动数据提取 (底部区域)
382
- bottom_nodes = [n for n in text_nodes if n['t'] > 2000] # 假设屏幕高度足够
753
+ # 使用 limit_y 作为分割线大概率更准确
754
+ bottom_nodes = [n for n in text_nodes if n['t'] >= limit_y - 300] # 互动栏通常在 limit_y 上方一点点 或者 就在 mask 区域
383
755
  bottom_nodes.sort(key=lambda x: x['l']) # 从左到右
384
756
 
385
757
  for n in bottom_nodes:
386
758
  txt = n['text']
387
- num_txt = ''.join(c for c in txt if c.isdigit() or c in ['.', 'w', 'W'])
759
+ # 保留数字、小数点、w/W "万"
760
+ num_txt = ''.join(c for c in txt if c.isdigit() or c in ['.', 'w', 'W', '万'])
388
761
  if not num_txt: continue
389
762
 
390
763
  cx = n['cx']
391
764
  if 500 < cx < 750:
392
- likes = num_txt
765
+ likes = parse_count_to_int(num_txt)
393
766
  elif 750 < cx < 900:
394
- collects = num_txt
767
+ collects = parse_count_to_int(num_txt)
395
768
  elif cx >= 900:
396
- comments = num_txt
769
+ comments = parse_count_to_int(num_txt)
397
770
 
398
771
  # C. 正文提取
399
772
  # 过滤掉非正文内容
400
773
  content_lines = []
401
- exclude_keywords = ['收藏', '点赞', '评论', '分享', '发布于', '说点什么', '条评论', '关注', author_name]
774
+ # exclude_keywords = ['收藏', '点赞', '评论', '分享', '发布于', '说点什么', '条评论', '关注', author_name]
775
+ # if date_desc:
776
+ # exclude_keywords.append(date_desc)
402
777
 
403
- # 按照垂直位置排序
404
- content_nodes = [n for n in text_nodes if 200 < n['t'] < 2000]
778
+ # 按照垂直位置排序 (使用 min_y 和 limit_y 约束)
779
+ content_nodes = [n for n in text_nodes if min_y < n['t'] < limit_y]
405
780
  content_nodes.sort(key=lambda x: x['t'])
406
781
 
407
782
  for n in content_nodes:
408
783
  t = n['text']
409
784
  if len(t) < 2: continue
410
- if any(k in t for k in exclude_keywords): continue
785
+ # if any(k in t for k in exclude_keywords): continue
411
786
 
412
787
  # 简单的去重策略
413
788
  if content_lines and t in content_lines[-1]: continue
414
789
  content_lines.append(t)
415
790
 
416
791
  content = "\n".join(content_lines)
417
-
792
+ logger.info(f"提取正文: {content}")
418
793
  # 5. 图片提取 (保持原有逻辑但优化等待)
419
794
  try:
420
795
  # 这里还是需要交互,无法纯靠XML
@@ -443,7 +818,9 @@ class XHSNoteExtractor:
443
818
  "likes": likes,
444
819
  "collects": collects,
445
820
  "comments": comments,
446
- "author_name": author_name
821
+ "author_name": author_name,
822
+ "publish_time": publish_time,
823
+ "date_desc": date_desc
447
824
  }
448
825
 
449
826
  def _fetch_web_images(self, url: str) -> List[str]:
@@ -503,6 +880,7 @@ class XHSNoteExtractor:
503
880
  f.write(f"收藏数: {data.get('collects', '0')}\n")
504
881
  f.write(f"评论数: {data.get('comments', '0')}\n")
505
882
  f.write(f"图片数: {len(data.get('image_urls', []))}\n")
883
+ f.write(f"发布时间: {data.get('date_desc', '')} ({data.get('publish_time', 0)})\n")
506
884
  f.write("=" * 50 + "\n")
507
885
  f.write("【正文内容】\n")
508
886
  f.write(data['content'])
@@ -521,9 +899,9 @@ class XHSNoteExtractor:
521
899
  raise
522
900
 
523
901
 
524
def extract_note_from_url(url: str, device_serial: Optional[str] = None, enable_time_logging: bool = True) -> Optional[Dict[str, Union[str, List[str]]]]:
    """
    Convenience function: extract note data directly from a URL.

    Builds an ``XHSNoteExtractor`` for a single device and runs
    ``extract_note_data`` on it. Any exception is logged and turned into
    a ``None`` result; the total elapsed time is always logged.

    Args:
        url (str): Xiaohongshu note URL.
        device_serial (str, optional): Serial of the device to use. The
            extractor requires explicit device information, so without
            a serial the extraction fails and None is returned.
        enable_time_logging (bool, optional): Whether the extractor
            prints per-method timing. Defaults to True.

    Returns:
        Optional[Dict[str, Union[str, List[str]]]]: The note data, or
        None when extraction did not succeed.
    """
    start_time = time.time()
    logger.info(f"[extract_note_from_url] 开始处理URL: {url}")
    try:
        # The constructor takes a {serial: [phone numbers]} mapping;
        # this entry point has no phone numbers, so the list is empty.
        # A missing serial yields None, which the constructor rejects
        # with ValueError (handled below).
        devices = {device_serial: []} if device_serial else None
        extractor = XHSNoteExtractor(devices=devices)
        extractor.enable_time_logging = enable_time_logging
        result = extractor.extract_note_data(url=url)
        # extract_note_data signals failure with an empty dict;
        # normalise that to the documented None.
        return result or None
    except Exception as e:
        logger.error(f"[extract_note_from_url] 提取失败: {e}")
        return None
    finally:
        elapsed_time = time.time() - start_time
        logger.info(f"[extract_note_from_url] 总耗时: {elapsed_time:.3f}秒")
543
927
 
544
928
 
545
929
  def convert_url_format(url: str) -> str: