PyPI - xhs-note-extractor - Versions diffs - 0.1.dev2__py3-none-any.whl → 0.1.dev6__py3-none-any.whl - Mend

xhs-note-extractor 0.1.dev2__py3-none-any.whl → 0.1.dev6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

xhs_note_extractor/extractor.py CHANGED Viewed

@@ -18,6 +18,7 @@ import requests
 import logging
 from typing import Dict, List, Optional, Union
 from urllib.parse import urlparse, parse_qs
+import xml.etree.ElementTree as ET
 # 配置日志
 logging.basicConfig(
@@ -35,21 +36,35 @@ class XHSNoteExtractor:
     包括URL解析、设备连接、页面跳转和笔记内容提取。
     """
-    def __init__(self, device_serial: Optional[str] = None):
+    def __init__(self, device_serial: Optional[str] = None, enable_time_logging: bool = True):
         """
         初始化小红书笔记提取器
         Args:
             device_serial (str, optional): 设备序列号，如果为None则自动连接可用设备
+            enable_time_logging (bool, optional): 是否启用耗时打印，默认为True
         Raises:
             RuntimeError: 当没有可用设备时抛出异常
         """
         self.device = None
         self.device_serial = device_serial
+        self.enable_time_logging = enable_time_logging
         if not self.connect_device():
             raise RuntimeError("未找到可用的Android设备，请连接设备后再试")
+    def _time_method(self, method_name, start_time):
+        """
+        记录方法执行时间
+        Args:
+            method_name (str): 方法名称
+            start_time (float): 开始时间
+        """
+        if self.enable_time_logging:
+            elapsed_time = time.time() - start_time
+            logger.info(f"[{method_name}] 耗时: {elapsed_time:.3f}秒")
     def connect_device(self) -> bool:
         """
         连接设备
@@ -57,17 +72,35 @@ class XHSNoteExtractor:
         Returns:
             bool: 是否成功连接设备
         """
+        start_time = time.time()
         try:
             if self.device_serial:
                 self.device = u2.connect(self.device_serial)
             else:
                 self.device = u2.connect()
             logger.info(f"✓ 已连接设备: {self.device.serial}")
+            self._time_method("connect_device", start_time)
             return True
         except Exception as e:
             logger.error(f"✗ 设备连接失败: {e}")
+            self._time_method("connect_device", start_time)
             return False
+    def is_device_connected(self) -> bool:
+        """
+        检查设备是否仍然连接
+        Returns:
+            bool: 设备是否连接
+        """
+        if not self.device:
+            return False
+        try:
+            # 通过获取设备信息来验证连接
+            self.device.info
+            return True
+        except:
+            return False
     @staticmethod
     def parse_xhs_url(url: str) -> Dict[str, str]:
         """
@@ -82,6 +115,7 @@ class XHSNoteExtractor:
         Raises:
             ValueError: 当URL格式不正确时抛出异常
         """
+        start_time = time.time()
         # 处理xhsdiscover协议格式
         if url.startswith("xhsdiscover://"):
             # 提取note_id
@@ -122,13 +156,22 @@ class XHSNoteExtractor:
                     note_id = path_parts[explore_index + 1]
                 else:
                     raise ValueError("URL中缺少note_id")
+            # 兼容 /discovery/item/ 格式
+            elif 'discovery' in path_parts and 'item' in path_parts:
+                item_index = path_parts.index('item')
+                if item_index + 1 < len(path_parts):
+                    note_id = path_parts[item_index + 1]
+                else:
+                    raise ValueError("URL中缺少note_id")
             else:
-                raise ValueError("URL格式不正确，缺少/explore/路径")
+                raise ValueError("URL格式不正确，缺少/explore/或/discovery/item/路径")
             # 提取查询参数中的xsec_token
             query_params = parse_qs(parsed_url.query)
             xsec_token = query_params.get('xsec_token', [''])[0]
+            elapsed_time = time.time() - start_time
+            logger.info(f"[parse_xhs_url] 耗时: {elapsed_time:.3f}秒")
             return {
                 "note_id": note_id,
                 "xsec_token": xsec_token,
@@ -136,6 +179,8 @@ class XHSNoteExtractor:
             }
         else:
+            elapsed_time = time.time() - start_time
+            logger.info(f"[parse_xhs_url] 耗时: {elapsed_time:.3f}秒")
             raise ValueError("不支持的URL格式")
     @staticmethod
@@ -167,12 +212,18 @@ class XHSNoteExtractor:
         Returns:
             str: xhsdiscover协议格式的URL
         """
+        start_time = time.time()
+        result = ""
         if xsec_token:
             original_url = f"http://www.xiaohongshu.com/explore/{note_id}?xsec_token={xsec_token}&xsec_source=pc_feed"
             encoded_url = requests.utils.quote(original_url)
-            return f"xhsdiscover://item/{note_id}?open_url={encoded_url}"
+            result = f"xhsdiscover://item/{note_id}?open_url={encoded_url}"
         else:
-            return f"xhsdiscover://item/{note_id}"
+            result = f"xhsdiscover://item/{note_id}"
+        elapsed_time = time.time() - start_time
+        logger.info(f"[convert_to_xhsdiscover_format] 耗时: {elapsed_time:.3f}秒")
+        return result
     def extract_note_data(self, url: Optional[str] = None, note_id: Optional[str] = None,
                          xsec_token: Optional[str] = None) -> Dict[str, Union[str, List[str]]]:
@@ -191,6 +242,7 @@ class XHSNoteExtractor:
             RuntimeError: 当设备未连接时抛出异常
             Exception: 当提取过程中出现错误时抛出异常
         """
+        start_time = time.time()
         # 如果提供了URL，则先解析它（验证URL有效性）
         if url:
             parsed_data = self.parse_xhs_url(url)
@@ -199,6 +251,7 @@ class XHSNoteExtractor:
         # 检查设备是否连接
         if self.device is None:
+            self._time_method("extract_note_data", start_time)
             raise RuntimeError("设备未连接，请先连接设备")
         # 构建跳转URL
@@ -216,102 +269,181 @@ class XHSNoteExtractor:
             logger.info(f"✓ 成功提取笔记数据，点赞数: {data['likes']}, 图片数: {len(data['image_urls'])}")
+            self._time_method("extract_note_data", start_time)
             return data
         except Exception as e:
             logger.error(f"✗ 提取笔记数据失败: {e}")
+            self._time_method("extract_note_data", start_time)
             raise
     def _get_detail_data(self) -> Dict[str, Union[str, List[str]]]:
         """
         从当前已经打开的小红书详情页提取完整正文、图片和点赞数。
-        这是xhs_utils.get_detail_data的封装版本，保持相同功能。
+        优化版本: 使用 dump_hierarchy 替代遍历，大幅提升速度。
         Returns:
             Dict[str, Union[str, List[str]]]: 包含笔记数据的字典
         """
-        logger.info("🔍 进入深度提取模式...")
+        start_time = time.time()
+        logger.info("🔍 进入深度提取模式 (XML优化版)...")
-        # 1. 验证是否进入详情页 (增加重试和多关键词检测)
+        # 1. 验证是否进入详情页 & 展开全文
         detail_loaded = False
         detail_keywords = ["说点什么", "写评论", "写点什么", "收藏", "点赞", "评论", "分享", "发弹幕"]
-        for i in range(8):
+        # 尝试点击展开 (预先动作)
+        try:
+            # 快速检查是否有展开按钮
+            for btn_text in ["展开", "查看全部", "全文"]:
+                if self.device(text=btn_text).exists:
+                    self.device(text=btn_text).click()
+                    break
+        except: pass
+        # 等待加载完整
+        for i in range(5):
             if any(self.device(textContains=kw).exists or self.device(descriptionContains=kw).exists for kw in detail_keywords):
                 detail_loaded = True
                 break
-            if i == 4:
+            if i == 2:
                 # 可能是视频，点击屏幕中心尝试激活 UI
                 self.device.click(540, 900)
-            time.sleep(1)
+            time.sleep(0.5)
         if not detail_loaded:
             logger.warning("⚠ 警告：详情页特征未发现，提取可能不完整")
-        # 2. 处理"展开"按钮以获取完整长文
-        for btn_text in ["展开", "查看全部", "全文"]:
-            btn = self.device(text=btn_text)
-            if btn.exists:
-                logger.info(f"[Action] 点击'{btn_text}'")
-                btn.click()
-                time.sleep(1)
-        # 3. 提取正文 (多策略拼接)
+        # 2. 获取 UI层级 (核心优化)
+        xml_dump_start = time.time()
+        xml_content = self.device.dump_hierarchy()
+        self._time_method("dump_hierarchy", xml_dump_start)
+        # 3. 解析 XML
+        root = ET.fromstring(xml_content)
         content = ""
-        # 策略 A: 尝试常见 ID
-        desc_el = self.device(resourceIdMatches=".*desc.*|.*content.*")
-        if desc_el.exists:
-            content = desc_el.get_text()
-        # 策略 B: 文本容器遍历 (更稳健)
-        if not content or len(content) < 20:
-            texts = []
-            for el in self.device(className="android.widget.TextView"):
+        likes = "0"
+        collects = "0"
+        comments = "0"
+        author_name = "Unknown"
+        image_urls = []
+        # 收集所有 TextView 节点信息
+        text_nodes = []
+        def parse_nodes(node):
+            if node.attrib.get('class') == 'android.widget.TextView':
+                text = node.attrib.get('text', '')
+                bounds_str = node.attrib.get('bounds', '[0,0][0,0]')
+                # 解析 bounds: [x1,y1][x2,y2]
                 try:
-                    t = el.get_text()
-                    if not t or len(t) < 3: continue
-                    # 过滤坐标：只取屏幕中间内容区
-                    b = el.info.get('bounds', {})
-                    if 200 < b.get('top', 0) < 2100:
-                        if not any(k in t for k in ['收藏', '点赞', '评论', '分享', '发布于', '说点什么', '条评论']):
-                            texts.append(t)
-                except: continue
-            content = "\n".join(texts)
+                    coords = bounds_str.replace('][', ',').replace('[', '').replace(']', '').split(',')
+                    x1, y1, x2, y2 = map(int, coords)
+                    if text:
+                        text_nodes.append({
+                            'text': text,
+                            'l': x1, 't': y1, 'r': x2, 'b': y2,
+                            'cx': (x1 + x2) / 2, 'cy': (y1 + y2) / 2
+                        })
+                except: pass
+            for child in node:
+                parse_nodes(child)
+        parse_nodes(root)
+        # 4. 分析节点数据
+        # A. 作者提取 (寻找 "关注" 附近的文本)
+        # 策略: 找到包含 "关注" 的节点，取其左侧最近的节点
+        follow_node = None
+        for n in text_nodes:
+            if n['text'] in ["关注", "已关注"]:
+                follow_node = n
+                break
+        if follow_node:
+            best_dist = 9999
+            for n in text_nodes:
+                if n == follow_node: continue
+                if n['text'] in ["关注", "已关注"] or len(n['text']) > 30: continue
+                # 垂直接近
+                if abs(n['cy'] - follow_node['cy']) < 100:
+                    # 在左侧
+                    if n['r'] <= follow_node['l'] + 50:
+                        dist = follow_node['l'] - n['r']
+                        if dist < best_dist:
+                            best_dist = dist
+                            author_name = n['text']
+            logger.info(f"✓ 识别到作者: {author_name}")
-        # 4. 提取图片 (通过分享链接解析高清图)
-        image_urls = []
+        # B. 互动数据提取 (底部区域)
+        bottom_nodes = [n for n in text_nodes if n['t'] > 2000] # 假设屏幕高度足够
+        bottom_nodes.sort(key=lambda x: x['l']) # 从左到右
+        for n in bottom_nodes:
+            txt = n['text']
+            num_txt = ''.join(c for c in txt if c.isdigit() or c in ['.', 'w', 'W'])
+            if not num_txt: continue
+            cx = n['cx']
+            if 500 < cx < 750:
+                likes = num_txt
+            elif 750 < cx < 900:
+                collects = num_txt
+            elif cx >= 900:
+                comments = num_txt
+        # C. 正文提取
+        # 过滤掉非正文内容
+        content_lines = []
+        exclude_keywords = ['收藏', '点赞', '评论', '分享', '发布于', '说点什么', '条评论', '关注', author_name]
+        # 按照垂直位置排序
+        content_nodes = [n for n in text_nodes if 200 < n['t'] < 2000]
+        content_nodes.sort(key=lambda x: x['t'])
+        for n in content_nodes:
+            t = n['text']
+            if len(t) < 2: continue
+            if any(k in t for k in exclude_keywords): continue
+            # 简单的去重策略
+            if content_lines and t in content_lines[-1]: continue
+            content_lines.append(t)
+        content = "\n".join(content_lines)
+        # 5. 图片提取 (保持原有逻辑但优化等待)
         try:
+             # 这里还是需要交互，无法纯靠XML
             share_btn = self.device(description="分享")
             if share_btn.exists:
                 share_btn.click()
-                time.sleep(1.5)
-                copy_link_btn = self.device(text="复制链接")
-                if copy_link_btn.exists:
-                    copy_link_btn.click()
+                # 显式等待 "复制链接"
+                copy_link = self.device(text="复制链接")
+                if copy_link.wait(timeout=2.0):
+                    copy_link.click()
+                    # 等待剪贴板更新? 稍微缓一下
                     time.sleep(0.5)
                     share_link = self.device.clipboard
                     if "http" in str(share_link):
                         image_urls = self._fetch_web_images(share_link)
                 else:
+                    logger.warning("未找到复制链接按钮")
                     self.device.press("back")
         except Exception as e:
             logger.warning(f"⚠ 图片提取异常: {e}")
-        # 5. 提取点赞数
-        likes = "0"
-        try:
-            for el in self.device(className="android.widget.TextView"):
-                txt = el.get_text() or ""
-                if any(c.isdigit() for c in txt):
-                    b = el.info.get('bounds', {})
-                    if b.get('top', 0) > 2000 and b.get('left', 0) > 500:
-                        likes = ''.join(c for c in txt if c.isdigit() or c in ['.', 'w', 'W'])
-                        if likes: break
-        except: pass
+        self._time_method("_get_detail_data", start_time)
         return {
             "content": content,
             "image_urls": image_urls,
-            "likes": likes
+            "likes": likes,
+            "collects": collects,
+            "comments": comments,
+            "author_name": author_name
         }
     def _fetch_web_images(self, url: str) -> List[str]:
@@ -324,6 +456,7 @@ class XHSNoteExtractor:
         Returns:
             List[str]: 图片URL列表
         """
+        start_time = time.time()
         try:
             headers = {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_8 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1"}
             res = requests.get(url, headers=headers, timeout=10)
@@ -339,8 +472,10 @@ class XHSNoteExtractor:
                 for m in matches:
                     clean_url = m.replace('\\u002F', '/')
                     if clean_url not in found: found.append(clean_url)
+            self._time_method("_fetch_web_images", start_time)
             return found
         except:
+            self._time_method("_fetch_web_images", start_time)
             return []
     def save_note_data(self, data: Dict[str, Union[str, List[str]]],
@@ -354,6 +489,7 @@ class XHSNoteExtractor:
             filename (str): 保存文件名
             note_url (str): 笔记URL
         """
+        start_time = time.time()
         try:
             with open(filename, "w", encoding="utf-8") as f:
                 f.write("=" * 50 + "\n")
@@ -362,8 +498,11 @@ class XHSNoteExtractor:
                 if note_url:
                     f.write(f"笔记URL: {note_url}\n")
                     f.write("=" * 50 + "\n")
-                f.write(f"点赞数: {data['likes']}\n")
-                f.write(f"图片数: {len(data['image_urls'])}\n")
+                f.write(f"作者: {data.get('author_name', 'Unknown')}\n")
+                f.write(f"点赞数: {data.get('likes', '0')}\n")
+                f.write(f"收藏数: {data.get('collects', '0')}\n")
+                f.write(f"评论数: {data.get('comments', '0')}\n")
+                f.write(f"图片数: {len(data.get('image_urls', []))}\n")
                 f.write("=" * 50 + "\n")
                 f.write("【正文内容】\n")
                 f.write(data['content'])
@@ -375,24 +514,32 @@ class XHSNoteExtractor:
                     f.write("=" * 50 + "\n")
             logger.info(f"✓ 笔记数据已保存到: {filename}")
+            self._time_method("save_note_data", start_time)
         except Exception as e:
             logger.error(f"✗ 保存笔记数据失败: {e}")
+            self._time_method("save_note_data", start_time)
             raise
-def extract_note_from_url(url: str, device_serial: Optional[str] = None) -> Dict[str, Union[str, List[str]]]:
+def extract_note_from_url(url: str, device_serial: Optional[str] = None, enable_time_logging: bool = True) -> Dict[str, Union[str, List[str]]]:
     """
     便捷函数：直接从URL提取笔记数据
     Args:
         url (str): 小红书笔记URL
         device_serial (str, optional): 设备序列号
+        enable_time_logging (bool, optional): 是否启用耗时打印，默认为True
     Returns:
         Dict[str, Union[str, List[str]]]: 笔记数据
     """
-    extractor = XHSNoteExtractor(device_serial=device_serial)
-    return extractor.extract_note_data(url=url)
+    start_time = time.time()
+    logger.info(f"[extract_note_from_url] 开始处理URL: {url}")
+    extractor = XHSNoteExtractor(device_serial=device_serial, enable_time_logging=enable_time_logging)
+    result = extractor.extract_note_data(url=url)
+    elapsed_time = time.time() - start_time
+    logger.info(f"[extract_note_from_url] 总耗时: {elapsed_time:.3f}秒")
+    return result
 def convert_url_format(url: str) -> str:
@@ -405,8 +552,13 @@ def convert_url_format(url: str) -> str:
     Returns:
         str: 转换后的xhsdiscover协议格式URL
     """
+    start_time = time.time()
+    logger.info(f"[convert_url_format] 开始转换URL: {url}")
     parsed_data = XHSNoteExtractor.parse_xhs_url(url)
-    return XHSNoteExtractor.convert_to_xhsdiscover_format(
+    result = XHSNoteExtractor.convert_to_xhsdiscover_format(
         parsed_data["note_id"],
         parsed_data["xsec_token"]
-    )
+    )
+    elapsed_time = time.time() - start_time
+    logger.info(f"[convert_url_format] 耗时: {elapsed_time:.3f}秒，结果: {result}")
+    return result

{xhs_note_extractor-0.1.dev2.dist-info → xhs_note_extractor-0.1.dev6.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xhs-note-extractor
-Version: 0.1.dev2
+Version: 0.1.dev6
 Summary: A Python package for extracting Xiaohongshu (Little Red Book) note data from URLs
 Author-email: JoyCode Agent <agent@joycode.com>
 License: MIT

{xhs_note_extractor-0.1.dev2.dist-info → xhs_note_extractor-0.1.dev6.dist-info}/RECORD RENAMED Viewed

@@ -1,11 +1,11 @@
 xhs_note_extractor/__init__.py,sha256=CjHdqO4W5sj6zbeE7xYkR0_WRfc99G56nR6k2Kmji44,1207
 xhs_note_extractor/_version.py,sha256=59jjKBtTUi_9u6FVZcIpQEDYjyAaqdxzqXyuRuFYKPE,720
 xhs_note_extractor/cli.py,sha256=F5phl4HqnzEe_vTS8vpio_KcZNx4cxmXJnYcQ1FgMbA,2693
-xhs_note_extractor/extractor.py,sha256=Afl-VzMmuRSk82BbAROpIAB6g1BSpC3yRbDJTrO0NCM,14964
+xhs_note_extractor/extractor.py,sha256=NwYWtgU8lX53OVmfHzpLeSWEpZv0mH3UcLcw4K-1Oj4,21136
 xhs_note_extractor/utils.py,sha256=mOVoLknlflzv7aCjXdmeNniQ7P6WNUcjSKjCm8uwFNk,14364
-xhs_note_extractor-0.1.dev2.dist-info/licenses/LICENSE,sha256=VFtWajKKKkgOoX3cMb2upEjsQmDpU85ymhom2bYY_oI,1069
-xhs_note_extractor-0.1.dev2.dist-info/METADATA,sha256=ohcbdSTJ-ms66W5xBEqXIhczNN_7lpk4mqsvpL1fF0g,5525
-xhs_note_extractor-0.1.dev2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-xhs_note_extractor-0.1.dev2.dist-info/entry_points.txt,sha256=1IG34snKfX2pzpLSeDXHqlSVSH8p7bf3eaKQfcwGDk4,60
-xhs_note_extractor-0.1.dev2.dist-info/top_level.txt,sha256=at3SqTdQr3DWMFCL5KM0Ofo_LE88WqADjh8MeFLwwO0,19
-xhs_note_extractor-0.1.dev2.dist-info/RECORD,,
+xhs_note_extractor-0.1.dev6.dist-info/licenses/LICENSE,sha256=VFtWajKKKkgOoX3cMb2upEjsQmDpU85ymhom2bYY_oI,1069
+xhs_note_extractor-0.1.dev6.dist-info/METADATA,sha256=6XtFTabTgXrg7np6dVLtWAn0u5C_rmC10eJ9WYfukyA,5525
+xhs_note_extractor-0.1.dev6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+xhs_note_extractor-0.1.dev6.dist-info/entry_points.txt,sha256=1IG34snKfX2pzpLSeDXHqlSVSH8p7bf3eaKQfcwGDk4,60
+xhs_note_extractor-0.1.dev6.dist-info/top_level.txt,sha256=at3SqTdQr3DWMFCL5KM0Ofo_LE88WqADjh8MeFLwwO0,19
+xhs_note_extractor-0.1.dev6.dist-info/RECORD,,

{xhs_note_extractor-0.1.dev2.dist-info → xhs_note_extractor-0.1.dev6.dist-info}/WHEEL RENAMED Viewed

File without changes

{xhs_note_extractor-0.1.dev2.dist-info → xhs_note_extractor-0.1.dev6.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{xhs_note_extractor-0.1.dev2.dist-info → xhs_note_extractor-0.1.dev6.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{xhs_note_extractor-0.1.dev2.dist-info → xhs_note_extractor-0.1.dev6.dist-info}/top_level.txt RENAMED Viewed

File without changes

xhs-note-extractor 0.1.dev2__py3-none-any.whl → 0.1.dev6__py3-none-any.whl

xhs-note-extractor 0.1.dev2__py3-none-any.whl → 0.1.dev6__py3-none-any.whl