xhs-note-extractor 0.1.5.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,98 @@
1
+ # 小红书笔记提取器 - 设备重试机制使用指南
2
+
3
+ ## 功能概述
4
+
5
+ 小红书笔记提取器现在支持设备重试机制。当某个设备需要登录时,系统会自动尝试连接其他可用设备,直到找到无需登录的设备或所有设备都尝试过为止。
6
+
7
+ ## 使用方式
8
+
9
+ ### 1. 使用便捷函数(推荐)
10
+
11
+ ```python
12
+ from xhs_note_extractor import extract_note_from_url
13
+
14
+ # 直接提取笔记数据
15
+ result = extract_note_from_url("https://www.xiaohongshu.com/explore/你的笔记ID")
16
+
17
+ if result:
18
+ print("成功提取笔记数据")
19
+ print(f"作者: {result['author_name']}")
20
+ print(f"点赞数: {result['likes']}")
21
+ print(f"图片数: {len(result['image_urls'])}")
22
+ else:
23
+ print("所有设备都需要登录,提取失败")
24
+ ```
25
+
26
+ ### 2. 使用类实例
27
+
28
+ ```python
29
+ from xhs_note_extractor import XHSNoteExtractor
30
+
31
+ # 创建提取器实例
32
+ extractor = XHSNoteExtractor()
33
+
34
+ # 显示可用设备
35
+ print(f"可用设备: {extractor.available_devices}")
36
+
37
+ # 提取笔记数据
38
+ result = extractor.extract_note_data(url="https://www.xiaohongshu.com/explore/你的笔记ID")
39
+
40
+ if result:
41
+ print("成功提取笔记数据")
42
+ else:
43
+ print("所有设备都需要登录,提取失败")
44
+ ```
45
+
46
+ ### 3. 手动切换设备
47
+
48
+ ```python
49
+ from xhs_note_extractor import XHSNoteExtractor
50
+
51
+ extractor = XHSNoteExtractor()
52
+
53
+ # 查看当前设备
54
+ print(f"当前设备: {extractor.device.serial}")
55
+
56
+ # 手动切换到下一个设备
57
+ success = extractor.switch_to_next_device()
58
+ if success:
59
+ print(f"已切换到设备: {extractor.device.serial}")
60
+ ```
61
+
62
+ ## 工作原理
63
+
64
+ 1. **设备发现**: 初始化时自动检测所有通过ADB连接的Android设备
65
+ 2. **登录检测**: 在提取笔记时检测是否需要登录
66
+ 3. **自动重试**: 如果需要登录,自动尝试下一个可用设备
67
+ 4. **循环尝试**: 依次尝试所有可用设备
68
+ 5. **结果返回**: 成功则返回数据,失败则返回None
69
+
70
+ ## 注意事项
71
+
72
+ - 确保所有设备都已通过USB调试连接并授权
73
+ - 设备需要安装小红书APP
74
+ - 如果所有设备都需要登录,则返回None而不是抛出异常
75
+ - 设备切换时会自动重启小红书APP
76
+
77
+ ## 测试脚本
78
+
79
+ 运行测试脚本验证设备重试功能:
80
+
81
+ ```bash
82
+ python xhs_note_extractor/test_device_retry.py
83
+ ```
84
+
85
+ ## 故障排除
86
+
87
+ 1. **无法发现设备**:
88
+ - 检查USB连接
89
+ - 确保设备已在开发者选项中开启USB调试
90
+ - 运行 `adb devices` 验证设备连接
91
+
92
+ 2. **设备切换失败**:
93
+ - 检查设备是否仍然连接
94
+ - 确保小红书APP在设备上已安装
95
+
96
+ 3. **返回None**:
97
+ - 所有设备都需要登录
98
+ - 尝试手动登录某个设备后再试
@@ -0,0 +1,50 @@
1
"""
Xiaohongshu (XHS) note extractor package.

A Python package for extracting note information from Xiaohongshu URLs.
It supports URL parsing, device connection, page navigation and note
content extraction.

Main features:
- URL parsing and conversion (standard format and the xhsdiscover
  protocol format)
- Device connection and automation
- Note content extraction (body text, images, like count, ...)
- Structured data output

Example:
    >>> from xhs_note_extractor import XHSNoteExtractor
    >>> extractor = XHSNoteExtractor()
    >>> data = extractor.extract_note_data(url="https://www.xiaohongshu.com/explore/...")
    >>> print(data['content'])
"""

from importlib import metadata as _metadata

# Report the version of the installed distribution instead of a literal that
# drifts from the released wheel. The previous hard-coded value is kept only
# as a fallback for source checkouts that are not installed.
try:
    __version__ = _metadata.version("xhs-note-extractor")
except _metadata.PackageNotFoundError:
    __version__ = "1.0.0"
__author__ = "JoyCode Agent"
__email__ = "agent@joycode.com"

from .extractor import XHSNoteExtractor
from .utils import (
    DeviceManager,
    ElementFinder,
    DataFormatter,
    NetworkUtils,
    FileManager,
    XHSUtils,
    connect_device,
    format_like_count,
    extract_image_urls_from_html,
    fetch_html
)

# Public API of the package.
__all__ = [
    "XHSNoteExtractor",
    "DeviceManager",
    "ElementFinder",
    "DataFormatter",
    "NetworkUtils",
    "FileManager",
    "XHSUtils",
    "connect_device",
    "format_like_count",
    "extract_image_urls_from_html",
    "fetch_html",
]
@@ -0,0 +1,34 @@
1
# file generated by setuptools-scm
# don't change, don't track in version control

# Names exported by a star-import of this module.
__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

# Always False at runtime, so the `typing` imports below are never executed;
# the first branch is presumably there for static type checkers only.
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple
    from typing import Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
    COMMIT_ID = Union[str, None]
else:
    # Runtime placeholders so the annotations below still resolve.
    VERSION_TUPLE = object
    COMMIT_ID = object

# Annotation-only declarations; the values are assigned further down.
version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID

# Each value is bound under both a dunder and a plain name.
__version__ = version = '0.1.dev2'
__version_tuple__ = version_tuple = (0, 1, 'dev2')

__commit_id__ = commit_id = 'g1aa72014c'
@@ -0,0 +1,264 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Phone Agent Usage Examples / Phone Agent 使用示例
4
+
5
+ Demonstrates how to use Phone Agent for phone automation tasks via Python API.
6
+ 演示如何通过 Python API 使用 Phone Agent 进行手机自动化任务。
7
+ """
8
+ import json
9
+ from datetime import datetime
10
+
11
+ from phone_agent import PhoneAgent
12
+ from phone_agent.agent import AgentConfig
13
+ from phone_agent.config import get_messages
14
+ from phone_agent.model import ModelConfig
15
+
16
+ from xhs_note_extractor.login_propmt import phone_agent_protocol_v3_t1, phone_agent_protocol_v3_t3
17
+ from xhs_note_extractor.sms_verification import get_verification_code_sync
18
+
19
def do_login(device_id: str, lang: str = "cn", phone_number: str = "19163152334"):
    """Log in on one device using an SMS verification code.

    The flow runs two agent tasks. The first prompt
    (``phone_agent_protocol_v3_t1``) is formatted with ``phone_number`` and
    asks the agent to trigger the SMS code. The code is then fetched with
    ``get_verification_code_sync`` and formatted, together with the phone
    number, into the second prompt (``phone_agent_protocol_v3_t3``).

    Args:
        device_id: Device identifier passed to ``AgentConfig``.
        lang: Language passed to ``AgentConfig``.
        phone_number: Phone number used in both prompts and for the
            verification-code lookup.

    Returns:
        bool: ``True`` when the second agent reply is JSON with
        ``status == "success"`` or, for non-JSON replies, contains
        "登录成功"; ``False`` when the first task reports a failure, an image
        captcha is detected, or no verification code could be obtained.
    """
    import os

    # NOTE(review): a credential literal is shipped in the published package.
    # It stays here only as a backward-compatible fallback; set
    # MODELSCOPE_API_KEY to override it, and rotate/remove the embedded key.
    api_key = os.environ.get(
        "MODELSCOPE_API_KEY", "ms-ed9ed848-d630-4192-a688-37ebbf985246"
    )

    # Configure model endpoint
    model_config = ModelConfig(
        model_name="ZhipuAI/AutoGLM-Phone-9B",
        temperature=0.1,
        api_key=api_key,
        base_url="https://api-inference.modelscope.cn/v1"
    )

    # Configure Agent behavior
    agent_config = AgentConfig(
        max_steps=50,
        verbose=True,
        lang=lang,
        device_id=device_id,
    )

    # Create Agent
    agent = PhoneAgent(
        model_config=model_config,
        agent_config=agent_config,
    )
    # Captured before the SMS is triggered; passed on as `send_time` below.
    cur_date_time = datetime.now()
    print(f'phone_number:{phone_number}')

    # Task 1: trigger the verification code.
    prompt_task1 = phone_agent_protocol_v3_t1.format(phone_number)
    result = agent.run(prompt_task1)
    print(f"prompt_task1: {result}")

    # Parse the JSON reply of task 1.
    try:
        result_json = json.loads(result)
        print(f"result_json: {result_json}")
        if result_json.get("status") != "success":
            print(f"验证码触发失败: {result_json.get('message')}")
            return False
    except json.JSONDecodeError:
        print(f"result error: {result}")
        # Not JSON: fall back to the legacy free-text reply format and bail
        # out when the text mentions any kind of image captcha.
        image_captcha_keywords = ["图片验证码", "点击文字", "旋转图片", "滑块", "拼图", "拖拽", "文字验证"]
        if any(keyword in result for keyword in image_captcha_keywords):
            print("检测到图片验证码,验证码触发失败")
            return False
        # The former second legacy check additionally required
        # "图片验证码界面" in the reply. That text always contains the keyword
        # "图片验证码" handled above, so the check could never fire and was
        # dropped without changing behavior.

    # Fetch the code with custom retry parameters.
    print(f"\n获取手机号 {phone_number} 的验证码(3次尝试,每次间隔3秒)...")
    code = get_verification_code_sync(
        phone_number,
        send_time=cur_date_time,
        max_retries=3,
        retry_interval=3
    )
    print(f"手机号: {phone_number}, 验证码: {code}")
    if not code:
        # Presumably a falsy value means no SMS arrived (verify against
        # get_verification_code_sync); never type "None" into the login form.
        print("未获取到验证码,登录失败")
        return False

    # Task 3: submit the verification code.
    prompt_task3 = phone_agent_protocol_v3_t3.format(phone_number, code)
    result = agent.run(prompt_task3)
    print(f"prompt_task3: {result}")

    try:
        result_json = json.loads(result)
        return result_json.get("status") == "success"
    except json.JSONDecodeError:
        # Legacy free-text reply format.
        return "登录成功" in result
94
+
95
def example_with_callbacks(lang: str = "cn"):
    """Run a sample task with confirmation and manual-takeover callbacks.

    Args:
        lang: Language code used for the UI messages and the agent config.
    """
    texts = get_messages(lang)
    accepted_answers = ("yes", "y", "是")

    def ask_confirmation(message: str) -> bool:
        """Ask on the console whether a sensitive operation may proceed."""
        print(f"\n[{texts['confirmation_required']}] {message}")
        answer = input(f"{texts['continue_prompt']}: ")
        return answer.lower() in accepted_answers

    def wait_for_operator(message: str) -> None:
        """Pause until the operator has finished the manual step."""
        print(f"\n[{texts['manual_operation_required']}] {message}")
        print(texts["manual_operation_hint"])
        input(f"{texts['press_enter_when_done']}: ")

    # Build the agent with both custom callbacks installed.
    agent = PhoneAgent(
        agent_config=AgentConfig(lang=lang),
        confirmation_callback=ask_confirmation,
        takeover_callback=wait_for_operator,
    )

    # This task may trigger the confirmation callback.
    outcome = agent.run("打开淘宝搜索无线耳机并加入购物车")
    print(f"{texts['task_result']}: {outcome}")
122
+
123
+
124
def example_step_by_step(lang: str = "cn"):
    """Drive the agent one step at a time (useful for debugging).

    Args:
        lang: Language code used for the UI messages and the agent config.
    """
    texts = get_messages(lang)
    agent = PhoneAgent(agent_config=AgentConfig(lang=lang))

    # The first step carries the task description.
    outcome = agent.step("打开美团搜索附近的火锅店")
    print(f"{texts['step']} 1: {outcome.action}")

    # Keep stepping until the agent reports completion or 10 steps were used.
    while not (outcome.finished or agent.step_count >= 10):
        outcome = agent.step()
        print(f"{texts['step']} {agent.step_count}: {outcome.action}")
        print(f" {texts['thinking']}: {outcome.thinking[:100]}...")

    print(f"\n{texts['final_result']}: {outcome.message}")
142
+
143
+
144
def example_multiple_tasks(lang: str = "cn"):
    """Run several independent tasks back to back on one agent.

    Args:
        lang: Language code used for the UI messages and the agent config.
    """
    texts = get_messages(lang)
    agent = PhoneAgent(agent_config=AgentConfig(lang=lang))
    rule = "=" * 50

    queue = (
        "打开高德地图查看实时路况",
        "打开大众点评搜索附近的咖啡店",
        "打开bilibili搜索Python教程",
    )

    for job in queue:
        print("\n" + rule)
        print(f"{texts['task']}: {job}")
        print(rule)

        outcome = agent.run(job)
        print(f"{texts['result']}: {outcome}")

        # Clear the agent state before the next task.
        agent.reset()
167
+
168
+
169
def example_remote_device(lang: str = "cn"):
    """Connect to a remote ADB device, run one task on it, then disconnect.

    The connection is released in a ``finally`` block so that a failure while
    creating or running the agent does not leave it open.

    Args:
        lang: Language code used for the UI messages and the agent config.
    """
    from phone_agent.adb import ADBConnection

    msgs = get_messages(lang)
    address = "192.168.1.100:5555"

    # Create connection manager and connect to the remote device.
    conn = ADBConnection()
    success, message = conn.connect(address)
    if not success:
        print(f"{msgs['connection_failed']}: {message}")
        return

    print(f"{msgs['connection_successful']}: {message}")

    try:
        # Create Agent with the device specified.
        agent_config = AgentConfig(
            device_id=address,
            verbose=True,
            lang=lang,
        )
        agent = PhoneAgent(agent_config=agent_config)

        # Execute task
        result = agent.run("打开微信查看消息")
        print(f"{msgs['task_result']}: {result}")
    finally:
        # Always disconnect, even when the agent raised.
        conn.disconnect(address)
201
+
202
+
203
+
204
def check_adb_devices():
    """Check whether at least one usable ADB device is connected.

    Runs ``adb devices`` and counts only rows of the form
    ``<serial> device``. The header, blank lines, any daemon start-up lines
    and devices in another state (for example ``unauthorized`` or
    ``offline``) are not counted.

    Returns:
        bool: ``True`` if at least one device is in the ``device`` state,
        ``False`` if none is or if the ``adb`` executable cannot be found.
    """
    import subprocess

    try:
        result = subprocess.run(["adb", "devices"], capture_output=True, text=True)
    except FileNotFoundError:
        print("\nError: 'adb' command not found. Please install Android Platform Tools.")
        print("错误: 未找到 'adb' 命令。请安装 Android Platform Tools。")
        return False

    devices = []
    for line in result.stdout.splitlines():
        fields = line.split()
        # Exactly two fields with state "device" identifies a ready device;
        # this rejects the header and any other non-device line regardless of
        # where it appears in the output.
        if len(fields) == 2 and fields[1] == "device":
            devices.append(fields[0])

    if not devices:
        print("\nError: No Android devices connected via ADB.")
        print("错误: 未通过 ADB 连接任何 Android 设备。")
        print("Please connect a device or start an emulator.")
        print("请连接设备或启动模拟器。")
        return False
    return True
222
+ #
223
+ # if __name__ == "__main__":
224
+ # if not check_adb_devices():
225
+ # exit(1)
226
+ #
227
+ # import argparse
228
+ #
229
+ # parser = argparse.ArgumentParser(description="Phone Agent Usage Examples")
230
+ # parser.add_argument(
231
+ # "--lang",
232
+ # type=str,
233
+ # default="cn",
234
+ # choices=["cn", "en"],
235
+ # help="Language for UI messages (cn=Chinese, en=English)",
236
+ # )
237
+ # args = parser.parse_args()
238
+ #
239
+ # msgs = get_messages(args.lang)
240
+ #
241
+ # print("Phone Agent Usage Examples")
242
+ # print("=" * 50)
243
+ #
244
+ # # Run basic example
245
+ # print(f"\n1. Basic Task Example")
246
+ # print("-" * 30)
247
+ # do_login(device_id="<adb-serial>", lang=args.lang)
248
+ #
249
+ # # Uncomment to run other examples
250
+ # # print(f"\n2. Task Example with Callbacks")
251
+ # # print("-" * 30)
252
+ # # example_with_callbacks(args.lang)
253
+ #
254
+ # # print(f"\n3. Step-by-step Example")
255
+ # # print("-" * 30)
256
+ # # example_step_by_step(args.lang)
257
+ #
258
+ # # print(f"\n4. Batch Task Example")
259
+ # # print("-" * 30)
260
+ # # example_multiple_tasks(args.lang)
261
+ #
262
+ # # print(f"\n5. Remote Device Example")
263
+ # # print("-" * 30)
264
+ # # example_remote_device(args.lang)
@@ -0,0 +1,98 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Command Line Interface for XHS Note Extractor
4
+ """
5
+
6
+ import argparse
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ from .extractor import XHSNoteExtractor
11
+ from .utils import NetworkUtils
12
+
13
+
14
def main():
    """Parse the command line, extract one note and emit it as JSON or CSV.

    The process exits with status 1 when the URL is rejected by
    ``NetworkUtils.is_valid_xhs_url``, when the extractor returns nothing, or
    when any exception is raised during extraction or output.
    """
    cli = argparse.ArgumentParser(
        description="Extract Xiaohongshu (Little Red Book) note data from URLs"
    )
    cli.add_argument("url", help="Xiaohongshu note URL to extract data from")
    cli.add_argument("-o", "--output", help="Output file path (default: stdout)")
    cli.add_argument(
        "-f", "--format",
        choices=["json", "csv"],
        default="json",
        help="Output format (default: json)",
    )
    cli.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Enable verbose output",
    )
    opts = cli.parse_args()

    # Reject anything that is not a Xiaohongshu URL before touching a device.
    if not NetworkUtils.is_valid_xhs_url(opts.url):
        print(f"Error: Invalid Xiaohongshu URL: {opts.url}", file=sys.stderr)
        sys.exit(1)

    try:
        if opts.verbose:
            print(f"Extracting data from: {opts.url}")

        # NOTE(review): the package README calls `extract_note_data(url=...)`;
        # confirm that `extract_note` exists on XHSNoteExtractor.
        note = XHSNoteExtractor().extract_note(opts.url)
        if not note:
            print("Error: Failed to extract note data", file=sys.stderr)
            sys.exit(1)

        if opts.format == "json":
            import json

            rendered = json.dumps(note, ensure_ascii=False, indent=2)
        else:
            import csv
            from io import StringIO

            # Simplified CSV: one "Field,Value" row per top-level key, with
            # lists and dicts flattened through str().
            rows = [["Field", "Value"]]
            for field, raw in note.items():
                cell = str(raw) if isinstance(raw, (list, dict)) else raw
                rows.append([field, cell])

            sink = StringIO()
            csv.writer(sink).writerows(rows)
            rendered = sink.getvalue()

        if not opts.output:
            print(rendered)
        else:
            target = Path(opts.output)
            target.write_text(rendered, encoding='utf-8')
            if opts.verbose:
                print(f"Output saved to: {target}")

    except Exception as e:
        # SystemExit is not an Exception subclass, so the exits above pass
        # through this handler untouched.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,80 @@
1
+ import re
2
+ from datetime import datetime, timedelta
3
+
4
+ def parse_time_to_timestamp_ms(time_str: str, now: datetime | None = None) -> int:
5
+ if now is None:
6
+ now = datetime.now()
7
+
8
+ time_str = time_str.strip()
9
+ # Remove common prefixes
10
+ for prefix in ["编辑于", "发布于"]:
11
+ if time_str.startswith(prefix):
12
+ time_str = time_str[len(prefix):].strip()
13
+
14
+ # Remove location suffix (e.g., "昨天 15:09重庆" -> "昨天 15:09")
15
+ # Match common Chinese city/province names at the end
16
+ import re as re_module
17
+ time_str = re_module.sub(r'[\u4e00-\u9fa5]{2,4}$', '', time_str).strip()
18
+
19
+ # 刚刚
20
+ if time_str == "刚刚":
21
+ dt = now
22
+
23
+ # X分钟前
24
+ elif match := re.match(r"(\d+)分钟前", time_str):
25
+ dt = now - timedelta(minutes=int(match.group(1)))
26
+
27
+ # X小时前
28
+ elif match := re.match(r"(\d+)小时前", time_str):
29
+ dt = now - timedelta(hours=int(match.group(1)))
30
+
31
+ # X天前
32
+ elif match := re.match(r"(\d+)天前", time_str):
33
+ dt = now - timedelta(days=int(match.group(1)))
34
+
35
+ # 今天 HH:mm
36
+ elif match := re.match(r"今天\s*(\d{1,2}:\d{2})", time_str):
37
+ dt = datetime.strptime(
38
+ f"{now.date()} {match.group(1)}",
39
+ "%Y-%m-%d %H:%M"
40
+ )
41
+
42
+ # 昨天 HH:mm
43
+ elif match := re.match(r"昨天\s*(\d{1,2}:\d{2})", time_str):
44
+ dt = datetime.strptime(
45
+ f"{(now - timedelta(days=1)).date()} {match.group(1)}",
46
+ "%Y-%m-%d %H:%M"
47
+ )
48
+
49
+ # 前天 HH:mm
50
+ elif match := re.match(r"前天\s*(\d{1,2}:\d{2})", time_str):
51
+ dt = datetime.strptime(
52
+ f"{(now - timedelta(days=2)).date()} {match.group(1)}",
53
+ "%Y-%m-%d %H:%M"
54
+ )
55
+
56
+ # YYYY-MM-DD HH:mm
57
+ elif re.match(r"\d{4}-\d{2}-\d{2}\s+\d{1,2}:\d{2}", time_str):
58
+ dt = datetime.strptime(time_str, "%Y-%m-%d %H:%M")
59
+
60
+ # YYYY-MM-DD
61
+ elif re.match(r"\d{4}-\d{2}-\d{2}", time_str):
62
+ dt = datetime.strptime(time_str, "%Y-%m-%d")
63
+
64
+ # ✅ 新增:MM-DD(默认当前年份)
65
+ elif match := re.match(r"(\d{2})-(\d{2})", time_str):
66
+ year = now.year
67
+ month, day = map(int, match.groups())
68
+ dt = datetime(year, month, day)
69
+
70
+ # HH:mm(默认当天)
71
+ elif re.match(r"\d{1,2}:\d{2}", time_str):
72
+ dt = datetime.strptime(
73
+ f"{now.date()} {time_str}",
74
+ "%Y-%m-%d %H:%M"
75
+ )
76
+
77
+ else:
78
+ raise ValueError(f"无法解析的时间格式: {time_str}")
79
+
80
+ return int(dt.timestamp() * 1000)