crawl4weibo 0.1.0__tar.gz

@@ -0,0 +1,134 @@
1
+ Metadata-Version: 2.4
2
+ Name: crawl4weibo
3
+ Version: 0.1.0
4
+ Summary: A professional Weibo crawler library
5
+ Home-page: https://github.com/yourusername/crawl4weibo
6
+ Author: Kritoooo
7
+ Author-email: Kritoooo <krito2023@gmail.com>
8
+ License: MIT
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.7
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Requires-Python: >=3.7
20
+ Description-Content-Type: text/markdown
21
+ Requires-Dist: requests>=2.25.0
22
+ Requires-Dist: lxml>=4.6.0
23
+ Requires-Dist: tqdm>=4.60.0
24
+ Requires-Dist: python-dateutil>=2.8.0
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=6.0; extra == "dev"
27
+ Requires-Dist: pytest-cov; extra == "dev"
28
+ Requires-Dist: black; extra == "dev"
29
+ Requires-Dist: isort; extra == "dev"
30
+ Requires-Dist: flake8; extra == "dev"
31
+ Dynamic: author
32
+ Dynamic: home-page
33
+ Dynamic: requires-python
34
+
35
+ # Crawl4Weibo
36
+
37
+ A professional Weibo crawler library for Python, built on an approach verified in real-world testing. It works without cookies.
38
+
39
+ ## Features
40
+
41
+ - 🚀 **Works out of the box**: no cookies needed, one-line initialization
42
+ - 🛡️ **Anti-scraping aware**: automatically handles HTTP 432 errors and rate limits
43
+ - 📱 **Realistic requests**: uses a genuine mobile-browser User-Agent
44
+ - 🔄 **Smart retries**: automatic retry mechanism
45
+ - 📊 **Structured data**: clean data models
46
+
47
+ ## Installation
48
+
49
+ ```bash
50
+ pip install crawl4weibo
51
+ ```
52
+
53
+ ## Quick Start
54
+
55
+ ```python
56
+ from crawl4weibo import WeiboClient
57
+
58
+ # Initialize the client (no cookies required)
59
+ client = WeiboClient()
60
+
61
+ # Fetch a user profile
62
+ user = client.get_user_by_uid("1195230310")
63
+ print(f"User: {user.screen_name}")
64
+ print(f"Followers: {user.followers_count:,}")
65
+
66
+ # Fetch the user's posts
67
+ posts = client.get_user_posts("1195230310")
68
+ for post in posts:
69
+     print(f"Post: {post.text[:50]}...")
70
+     print(f"Likes: {post.attitudes_count}")
71
+
72
+ # Search for users
73
+ users = client.search_users("技术博主")
74
+ for user in users:
75
+     print(f"User: {user.screen_name}")
76
+
77
+ # Search for posts
78
+ posts = client.search_posts("人工智能")
79
+ for post in posts:
80
+     print(f"Content: {post.text[:50]}...")
81
+ ```
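+
+ All failures are raised as subclasses of `CrawlError`, so calls can be wrapped
+ defensively. A minimal sketch using the exceptions the package exports:
+
+ ```python
+ from crawl4weibo import WeiboClient, UserNotFoundError, NetworkError
+
+ client = WeiboClient()
+
+ try:
+     user = client.get_user_by_uid("1195230310")
+     print(f"User: {user.screen_name}")
+ except UserNotFoundError:
+     print("No such user")
+ except NetworkError as exc:
+     print(f"Network trouble, try again later: {exc}")
+ ```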
82
+
83
+ ## API Reference
84
+
85
+ ### WeiboClient
86
+
87
+ #### Initialization
88
+ ```python
89
+ WeiboClient(cookies=None, log_level="INFO", log_file=None, user_agent=None)
90
+ ```
91
+
92
+ #### Main methods
93
+
94
+ - `get_user_by_uid(uid)` - fetch a user's profile by UID
95
+ - `get_user_posts(uid, page=1)` - fetch one page of a user's posts (see the pagination sketch below)
96
+ - `search_users(query, page=1, count=10)` - search for users
97
+ - `search_posts(query, page=1)` - search for posts
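+
+ `get_user_posts` returns one page at a time, so a common pattern is to walk the
+ pages until an empty page comes back. A minimal sketch (it assumes an empty list
+ marks the last page, which matches the client's behavior):
+
+ ```python
+ from crawl4weibo import WeiboClient
+
+ client = WeiboClient()
+
+ all_posts = []
+ page = 1
+ while True:
+     posts = client.get_user_posts("1195230310", page=page)
+     if not posts:  # an empty page means we have reached the end
+         break
+     all_posts.extend(posts)
+     page += 1
+
+ print(f"Collected {len(all_posts)} posts")
+ ```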
98
+
99
+ ### Data Models
100
+
101
+ **User**:
102
+ - `screen_name` - display name
103
+ - `followers_count` - follower count
104
+ - `posts_count` - number of posts
105
+ - `verified` - whether the account is verified
106
+
107
+ **Post**:
108
+ - `text` - post text
109
+ - `attitudes_count` - like count
110
+ - `comments_count` - comment count
111
+ - `created_at` - publication time
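+
+ Both models provide `from_dict`/`to_dict` helpers, which makes it easy to dump
+ results, for example to JSON (the `posts.json` filename below is just an example):
+
+ ```python
+ import json
+
+ from crawl4weibo import WeiboClient
+
+ client = WeiboClient()
+ posts = client.get_user_posts("1195230310")
+
+ # created_at is a datetime, so let json stringify it via default=str
+ with open("posts.json", "w", encoding="utf-8") as f:
+     json.dump([p.to_dict() for p in posts], f, ensure_ascii=False, default=str)
+ ```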
112
+
113
+ ## Running the Example
114
+
115
+ ```bash
116
+ python examples/simple_example.py
117
+ ```
118
+
119
+ ## Technical Details
120
+
121
+ The implementation relies on the following techniques (see the configuration sketch after the list):
122
+
123
+ - Android Chrome User-Agent emulation
124
+ - Mobile (m.weibo.cn) API endpoints
125
+ - Automatic session management
126
+ - Smart retries on HTTP 432 responses
127
+ - Randomized request intervals
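+
+ If the anonymous session starts getting blocked, the constructor also accepts
+ your own cookies and User-Agent (the cookie and UA values below are placeholders):
+
+ ```python
+ from crawl4weibo import WeiboClient
+
+ client = WeiboClient(
+     cookies="SUB=...; SUBP=...",  # placeholder cookie string from a logged-in browser
+     user_agent="Mozilla/5.0 (Linux; Android 13; Pixel 7) placeholder UA",
+     log_level="DEBUG",
+     log_file="crawl.log",
+ )
+ ```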
131
+
132
+ ## License
133
+
134
+ MIT License
@@ -0,0 +1,100 @@
1
+ # Crawl4Weibo
2
+
3
+ A professional Weibo crawler library for Python, built on an approach verified in real-world testing. It works without cookies.
4
+
5
+ ## Features
6
+
7
+ - 🚀 **Works out of the box**: no cookies needed, one-line initialization
8
+ - 🛡️ **Anti-scraping aware**: automatically handles HTTP 432 errors and rate limits
9
+ - 📱 **Realistic requests**: uses a genuine mobile-browser User-Agent
10
+ - 🔄 **Smart retries**: automatic retry mechanism
11
+ - 📊 **Structured data**: clean data models
12
+
13
+ ## Installation
14
+
15
+ ```bash
16
+ pip install crawl4weibo
17
+ ```
18
+
19
+ ## Quick Start
20
+
21
+ ```python
22
+ from crawl4weibo import WeiboClient
23
+
24
+ # Initialize the client (no cookies required)
25
+ client = WeiboClient()
26
+
27
+ # Fetch a user profile
28
+ user = client.get_user_by_uid("1195230310")
29
+ print(f"User: {user.screen_name}")
30
+ print(f"Followers: {user.followers_count:,}")
31
+
32
+ # Fetch the user's posts
33
+ posts = client.get_user_posts("1195230310")
34
+ for post in posts:
35
+     print(f"Post: {post.text[:50]}...")
36
+     print(f"Likes: {post.attitudes_count}")
37
+
38
+ # Search for users
39
+ users = client.search_users("技术博主")
40
+ for user in users:
41
+     print(f"User: {user.screen_name}")
42
+
43
+ # Search for posts
44
+ posts = client.search_posts("人工智能")
45
+ for post in posts:
46
+     print(f"Content: {post.text[:50]}...")
47
+ ```
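+
+ All failures are raised as subclasses of `CrawlError`, so calls can be wrapped
+ defensively. A minimal sketch using the exceptions the package exports:
+
+ ```python
+ from crawl4weibo import WeiboClient, UserNotFoundError, NetworkError
+
+ client = WeiboClient()
+
+ try:
+     user = client.get_user_by_uid("1195230310")
+     print(f"User: {user.screen_name}")
+ except UserNotFoundError:
+     print("No such user")
+ except NetworkError as exc:
+     print(f"Network trouble, try again later: {exc}")
+ ```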
48
+
49
+ ## API Reference
50
+
51
+ ### WeiboClient
52
+
53
+ #### Initialization
54
+ ```python
55
+ WeiboClient(cookies=None, log_level="INFO", log_file=None, user_agent=None)
56
+ ```
57
+
58
+ #### Main methods
59
+
60
+ - `get_user_by_uid(uid)` - fetch a user's profile by UID
61
+ - `get_user_posts(uid, page=1)` - fetch one page of a user's posts (see the pagination sketch below)
62
+ - `search_users(query, page=1, count=10)` - search for users
63
+ - `search_posts(query, page=1)` - search for posts
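+
+ `get_user_posts` returns one page at a time, so a common pattern is to walk the
+ pages until an empty page comes back. A minimal sketch (it assumes an empty list
+ marks the last page, which matches the client's behavior):
+
+ ```python
+ from crawl4weibo import WeiboClient
+
+ client = WeiboClient()
+
+ all_posts = []
+ page = 1
+ while True:
+     posts = client.get_user_posts("1195230310", page=page)
+     if not posts:  # an empty page means we have reached the end
+         break
+     all_posts.extend(posts)
+     page += 1
+
+ print(f"Collected {len(all_posts)} posts")
+ ```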
64
+
65
+ ### Data Models
66
+
67
+ **User**:
68
+ - `screen_name` - display name
69
+ - `followers_count` - follower count
70
+ - `posts_count` - number of posts
71
+ - `verified` - whether the account is verified
72
+
73
+ **Post**:
74
+ - `text` - post text
75
+ - `attitudes_count` - like count
76
+ - `comments_count` - comment count
77
+ - `created_at` - publication time
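+
+ Both models provide `from_dict`/`to_dict` helpers, which makes it easy to dump
+ results, for example to JSON (the `posts.json` filename below is just an example):
+
+ ```python
+ import json
+
+ from crawl4weibo import WeiboClient
+
+ client = WeiboClient()
+ posts = client.get_user_posts("1195230310")
+
+ # created_at is a datetime, so let json stringify it via default=str
+ with open("posts.json", "w", encoding="utf-8") as f:
+     json.dump([p.to_dict() for p in posts], f, ensure_ascii=False, default=str)
+ ```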
78
+
79
+ ## Running the Example
80
+
81
+ ```bash
82
+ python examples/simple_example.py
83
+ ```
84
+
85
+ ## Technical Details
86
+
87
+ The implementation relies on the following techniques (see the configuration sketch after the list):
88
+
89
+ - Android Chrome User-Agent emulation
90
+ - Mobile (m.weibo.cn) API endpoints
91
+ - Automatic session management
92
+ - Smart retries on HTTP 432 responses
93
+ - Randomized request intervals
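+
+ If the anonymous session starts getting blocked, the constructor also accepts
+ your own cookies and User-Agent (the cookie and UA values below are placeholders):
+
+ ```python
+ from crawl4weibo import WeiboClient
+
+ client = WeiboClient(
+     cookies="SUB=...; SUBP=...",  # placeholder cookie string from a logged-in browser
+     user_agent="Mozilla/5.0 (Linux; Android 13; Pixel 7) placeholder UA",
+     log_level="DEBUG",
+     log_file="crawl.log",
+ )
+ ```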
97
+
98
+ ## License
99
+
100
+ MIT License
@@ -0,0 +1,27 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ crawl4weibo - A professional Weibo crawler library
6
+ """
7
+
8
+ __version__ = "0.1.0"
9
+ __author__ = "Your Name"
10
+ __email__ = "your.email@example.com"
11
+
12
+ from .core.client import WeiboClient
13
+ from .models.user import User
14
+ from .models.post import Post
15
+ from .exceptions.base import CrawlError, AuthenticationError, RateLimitError, UserNotFoundError, NetworkError, ParseError
16
+
17
+ __all__ = [
18
+ "WeiboClient",
19
+ "User",
20
+ "Post",
21
+ "CrawlError",
22
+ "AuthenticationError",
23
+ "RateLimitError",
24
+ "UserNotFoundError",
25
+ "NetworkError",
26
+ "ParseError",
27
+ ]
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ Core module for crawl4weibo
6
+ """
@@ -0,0 +1,195 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ Weibo crawler client, based on an approach verified in real-world testing
6
+ """
7
+
8
+ import logging
+ import requests
9
+ import time
10
+ import random
11
+ from typing import List, Dict, Any, Optional, Union
12
+
13
+ from ..utils.parser import WeiboParser
14
+ from ..utils.logger import setup_logger
15
+ from ..models.user import User
16
+ from ..models.post import Post
17
+ from ..exceptions.base import CrawlError, UserNotFoundError, ParseError, NetworkError
18
+
19
+
20
+ class WeiboClient:
21
+ """微博爬虫客户端"""
22
+
23
+ def __init__(self, cookies: Optional[Union[str, Dict[str, str]]] = None,
24
+ log_level: str = "INFO", log_file: Optional[str] = None,
25
+ user_agent: Optional[str] = None):
26
+ """
27
+ Initialize the Weibo client
28
+
29
+ Args:
30
+     cookies: optional cookie string or dict
31
+     log_level: logging level name, e.g. "INFO" or "DEBUG"
32
+     log_file: optional path to a log file
33
+     user_agent: optional User-Agent string override
34
+ """
35
+ self.logger = setup_logger(
36
+ level=getattr(logging, log_level.upper()),
37
+ log_file=log_file
38
+ )
39
+
40
+ # Create the HTTP session
41
+ self.session = requests.Session()
42
+
43
+ # Headers verified to work against the m.weibo.cn API
44
+ default_user_agent = (
45
+ "Mozilla/5.0 (Linux; Android 13; SM-G9980) "
46
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
47
+ "Chrome/112.0.5615.135 Mobile Safari/537.36"
48
+ )
49
+ self.session.headers.update({
50
+ "User-Agent": user_agent or default_user_agent,
51
+ "Referer": "https://m.weibo.cn/",
52
+ "Accept": "application/json, text/plain, */*",
53
+ "X-Requested-With": "XMLHttpRequest"
54
+ })
55
+
56
+ # Apply cookies if provided
57
+ if cookies:
58
+ self._set_cookies(cookies)
59
+
60
+ # Warm up the session
61
+ self._init_session()
62
+
63
+ # Response parser
64
+ self.parser = WeiboParser()
65
+
66
+ self.logger.info("WeiboClient initialized successfully")
67
+
68
+ def _set_cookies(self, cookies: Union[str, Dict[str, str]]):
69
+ """设置cookies"""
70
+ if isinstance(cookies, str):
71
+ cookie_dict = {}
72
+ for pair in cookies.split(';'):
73
+ if '=' in pair:
74
+ key, value = pair.split('=', 1)
75
+ cookie_dict[key.strip()] = value.strip()
76
+ self.session.cookies.update(cookie_dict)
77
+ elif isinstance(cookies, dict):
78
+ self.session.cookies.update(cookies)
79
+
80
+ def _init_session(self):
81
+ """初始化session,获取首页cookie"""
82
+ try:
83
+ self.logger.debug("Initializing session...")
84
+ self.session.get("https://m.weibo.cn/", timeout=5)
85
+ time.sleep(random.uniform(2, 4))
86
+ except Exception as e:
87
+ self.logger.warning(f"Session初始化失败: {e}")
88
+
89
+ def _request(self, url: str, params: Dict[str, Any], max_retries: int = 3) -> Dict[str, Any]:
90
+ """发送请求并处理重试"""
91
+ for attempt in range(1, max_retries + 1):
92
+ try:
93
+ response = self.session.get(url, params=params, timeout=5)
94
+
95
+ if response.status_code == 200:
96
+ return response.json()
97
+ elif response.status_code == 432:
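+ # m.weibo.cn answers 432 when it suspects automated traffic; back off and retry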
98
+ if attempt < max_retries:
99
+ sleep_time = random.uniform(4, 7)
100
+ self.logger.warning(f"遇到432错误,等待 {sleep_time:.1f} 秒后重试...")
101
+ time.sleep(sleep_time)
102
+ continue
103
+ else:
104
+ raise NetworkError("遇到432反爬虫拦截")
105
+ else:
106
+ response.raise_for_status()
107
+
108
+ except requests.exceptions.RequestException as e:
109
+ if attempt < max_retries:
110
+ sleep_time = random.uniform(2, 5)
111
+ self.logger.warning(f"请求失败,等待 {sleep_time:.1f} 秒后重试: {e}")
112
+ time.sleep(sleep_time)
113
+ continue
114
+ else:
115
+ raise NetworkError(f"请求失败: {e}")
116
+
117
+ raise CrawlError("达到最大重试次数")
118
+
119
+ def get_user_by_uid(self, uid: str) -> User:
120
+ """根据UID获取用户信息"""
121
+ url = "https://m.weibo.cn/api/container/getIndex"
122
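+ # containerid 100505<uid> selects the user's profile container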
+ params = {"containerid": f"100505{uid}"}
123
+
124
+ data = self._request(url, params)
125
+
126
+ if not data.get("data") or not data["data"].get("userInfo"):
127
+ raise UserNotFoundError(f"用户 {uid} 不存在")
128
+
129
+ user_info = self.parser.parse_user_info(data)
130
+ user = User.from_dict(user_info)
131
+
132
+ self.logger.info(f"获取用户: {user.screen_name}")
133
+ return user
134
+
135
+ def get_user_posts(self, uid: str, page: int = 1) -> List[Post]:
136
+ """获取用户微博"""
137
+ time.sleep(random.uniform(1, 3))  # pause between requests
138
+
139
+ url = "https://m.weibo.cn/api/container/getIndex"
140
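+ # containerid 107603<uid> selects the user's post feed (paginated)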
+ params = {"containerid": f"107603{uid}", "page": page}
141
+
142
+ data = self._request(url, params)
143
+
144
+ if not data.get("data"):
145
+ return []
146
+
147
+ posts_data = self.parser.parse_posts(data)
148
+ posts = [Post.from_dict(post_data) for post_data in posts_data]
149
+
150
+ self.logger.info(f"获取到 {len(posts)} 条微博")
151
+ return posts
152
+
153
+ def search_users(self, query: str, page: int = 1, count: int = 10) -> List[User]:
154
+ """搜索用户"""
155
+ time.sleep(random.uniform(1, 3))
156
+
157
+ url = "https://m.weibo.cn/api/container/getIndex"
158
+ params = {
159
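+ # 100103 is the search container; type=3 scopes the query to users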
+ "containerid": f"100103type=3&q={query}",
160
+ "page": page,
161
+ "count": count
162
+ }
163
+
164
+ data = self._request(url, params)
165
+ users = []
166
+ cards = data.get("data", {}).get("cards", [])
167
+
168
+ for card in cards:
169
+ if card.get("card_type") == 11:
170
+ card_group = card.get("card_group", [])
171
+ for group_card in card_group:
172
+ if group_card.get("card_type") == 10:
173
+ user_data = group_card.get("user", {})
174
+ if user_data:
175
+ users.append(User.from_dict(user_data))
176
+
177
+ self.logger.info(f"搜索到 {len(users)} 个用户")
178
+ return users
179
+
180
+ def search_posts(self, query: str, page: int = 1) -> List[Post]:
181
+ """搜索微博"""
182
+ time.sleep(random.uniform(1, 3))
183
+
184
+ url = "https://m.weibo.cn/api/container/getIndex"
185
+ params = {
186
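+ # 100103 with type=1 scopes the query to posts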
+ "containerid": f"100103type=1&q={query}",
187
+ "page": page
188
+ }
189
+
190
+ data = self._request(url, params)
191
+ posts_data = self.parser.parse_posts(data)
192
+ posts = [Post.from_dict(post_data) for post_data in posts_data]
193
+
194
+ self.logger.info(f"搜索到 {len(posts)} 条微博")
195
+ return posts
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ Exceptions for crawl4weibo
6
+ """
@@ -0,0 +1,58 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ Base exceptions for crawl4weibo
6
+ """
7
+
8
+
9
+ class CrawlError(Exception):
10
+ """Base exception for crawl4weibo"""
11
+
12
+ def __init__(self, message="An error occurred during crawling", code=None):
13
+ self.message = message
14
+ self.code = code
15
+ super().__init__(self.message)
16
+
17
+
18
+ class AuthenticationError(CrawlError):
19
+ """Raised when authentication fails"""
20
+
21
+ def __init__(self, message="Authentication failed", code="AUTH_ERROR"):
22
+ super().__init__(message, code)
23
+
24
+
25
+ class RateLimitError(CrawlError):
26
+ """Raised when rate limit is exceeded"""
27
+
28
+ def __init__(self, message="Rate limit exceeded", code="RATE_LIMIT", retry_after=None):
29
+ self.retry_after = retry_after
30
+ super().__init__(message, code)
31
+
32
+
33
+ class NetworkError(CrawlError):
34
+ """Raised when network request fails"""
35
+
36
+ def __init__(self, message="Network request failed", code="NETWORK_ERROR"):
37
+ super().__init__(message, code)
38
+
39
+
40
+ class ParseError(CrawlError):
41
+ """Raised when response parsing fails"""
42
+
43
+ def __init__(self, message="Failed to parse response", code="PARSE_ERROR"):
44
+ super().__init__(message, code)
45
+
46
+
47
+ class UserNotFoundError(CrawlError):
48
+ """Raised when user is not found"""
49
+
50
+ def __init__(self, message="User not found", code="USER_NOT_FOUND"):
51
+ super().__init__(message, code)
52
+
53
+
54
+ class InvalidConfigError(CrawlError):
55
+ """Raised when configuration is invalid"""
56
+
57
+ def __init__(self, message="Invalid configuration", code="INVALID_CONFIG"):
58
+ super().__init__(message, code)
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ Data models for crawl4weibo
6
+ """
@@ -0,0 +1,83 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ Post model for crawl4weibo
6
+ """
7
+
8
+ from dataclasses import dataclass, field
9
+ from typing import Optional, List, Dict, Any
10
+ from datetime import datetime
11
+
12
+
13
+ @dataclass
14
+ class Post:
15
+ """Weibo post model"""
16
+
17
+ id: str
18
+ user_id: str
19
+ text: str = ""
20
+ created_at: Optional[datetime] = None
21
+ source: str = ""
22
+ reposts_count: int = 0
23
+ comments_count: int = 0
24
+ attitudes_count: int = 0
25
+ pic_urls: List[str] = field(default_factory=list)
26
+ video_url: str = ""
27
+ is_original: bool = True
28
+ retweeted_status: Optional["Post"] = None
29
+ location: str = ""
30
+ topic_ids: List[str] = field(default_factory=list)
31
+ at_users: List[str] = field(default_factory=list)
32
+ raw_data: Dict[str, Any] = field(default_factory=dict)
33
+
34
+ @classmethod
35
+ def from_dict(cls, data: Dict[str, Any]) -> "Post":
36
+ """Create Post instance from dictionary"""
37
+ retweeted_status = None
38
+ if data.get("retweeted_status"):
39
+ retweeted_status = cls.from_dict(data["retweeted_status"])
40
+
41
+ post_data = {
42
+ "id": str(data.get("id", "")),
43
+ "user_id": str(data.get("user_id", "")),
44
+ "text": data.get("text", ""),
45
+ "created_at": data.get("created_at"),
46
+ "source": data.get("source", ""),
47
+ "reposts_count": data.get("reposts_count", 0),
48
+ "comments_count": data.get("comments_count", 0),
49
+ "attitudes_count": data.get("attitudes_count", 0),
50
+ "pic_urls": data.get("pic_urls", []),
51
+ "video_url": data.get("video_url", ""),
52
+ "is_original": data.get("is_original", True),
53
+ "retweeted_status": retweeted_status,
54
+ "location": data.get("location", ""),
55
+ "topic_ids": data.get("topic_ids", []),
56
+ "at_users": data.get("at_users", []),
57
+ "raw_data": data,
58
+ }
59
+ return cls(**post_data)
60
+
61
+ def to_dict(self) -> Dict[str, Any]:
62
+ """Convert Post instance to dictionary"""
63
+ result = {
64
+ "id": self.id,
65
+ "user_id": self.user_id,
66
+ "text": self.text,
67
+ "created_at": self.created_at,
68
+ "source": self.source,
69
+ "reposts_count": self.reposts_count,
70
+ "comments_count": self.comments_count,
71
+ "attitudes_count": self.attitudes_count,
72
+ "pic_urls": self.pic_urls,
73
+ "video_url": self.video_url,
74
+ "is_original": self.is_original,
75
+ "location": self.location,
76
+ "topic_ids": self.topic_ids,
77
+ "at_users": self.at_users,
78
+ }
79
+
80
+ if self.retweeted_status:
81
+ result["retweeted_status"] = self.retweeted_status.to_dict()
82
+
83
+ return result
@@ -0,0 +1,81 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ User model for crawl4weibo
6
+ """
7
+
8
+ from dataclasses import dataclass, field
9
+ from typing import Optional, Dict, Any
10
+ from datetime import datetime
11
+
12
+
13
+ @dataclass
14
+ class User:
15
+ """Weibo user model"""
16
+
17
+ id: str
18
+ screen_name: str = ""
19
+ gender: str = ""
20
+ location: str = ""
21
+ description: str = ""
22
+ followers_count: int = 0
23
+ following_count: int = 0
24
+ posts_count: int = 0
25
+ verified: bool = False
26
+ verified_reason: str = ""
27
+ avatar_url: str = ""
28
+ cover_image_url: str = ""
29
+ birthday: Optional[str] = None
30
+ education: str = ""
31
+ company: str = ""
32
+ registration_time: Optional[datetime] = None
33
+ sunshine_credit: str = ""
34
+ raw_data: Dict[str, Any] = field(default_factory=dict)
35
+
36
+ @classmethod
37
+ def from_dict(cls, data: Dict[str, Any]) -> "User":
38
+ """Create User instance from dictionary"""
39
+ user_data = {
40
+ "id": str(data.get("id", "")),
41
+ "screen_name": data.get("screen_name", ""),
42
+ "gender": data.get("gender", ""),
43
+ "location": data.get("location", ""),
44
+ "description": data.get("description", ""),
45
+ "followers_count": data.get("followers_count", 0),
46
+ "following_count": data.get("following_count", 0),
47
+ "posts_count": data.get("posts_count", 0),
48
+ "verified": data.get("verified", False),
49
+ "verified_reason": data.get("verified_reason", ""),
50
+ "avatar_url": data.get("avatar_url", ""),
51
+ "cover_image_url": data.get("cover_image_url", ""),
52
+ "birthday": data.get("birthday"),
53
+ "education": data.get("education", ""),
54
+ "company": data.get("company", ""),
55
+ "registration_time": data.get("registration_time"),
56
+ "sunshine_credit": data.get("sunshine_credit", ""),
57
+ "raw_data": data,
58
+ }
59
+ return cls(**user_data)
60
+
61
+ def to_dict(self) -> Dict[str, Any]:
62
+ """Convert User instance to dictionary"""
63
+ return {
64
+ "id": self.id,
65
+ "screen_name": self.screen_name,
66
+ "gender": self.gender,
67
+ "location": self.location,
68
+ "description": self.description,
69
+ "followers_count": self.followers_count,
70
+ "following_count": self.following_count,
71
+ "posts_count": self.posts_count,
72
+ "verified": self.verified,
73
+ "verified_reason": self.verified_reason,
74
+ "avatar_url": self.avatar_url,
75
+ "cover_image_url": self.cover_image_url,
76
+ "birthday": self.birthday,
77
+ "education": self.education,
78
+ "company": self.company,
79
+ "registration_time": self.registration_time,
80
+ "sunshine_credit": self.sunshine_credit,
81
+ }
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ Utility functions for crawl4weibo
6
+ """
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ Logger utilities for crawl4weibo
6
+ """
7
+
8
+ import logging
9
+ import sys
10
+ from pathlib import Path
11
+
12
+
13
+ def setup_logger(name="crawl4weibo", level=logging.INFO, log_file=None):
14
+ """
15
+ Setup logger with console and optional file output
16
+
17
+ Args:
18
+ name (str): Logger name
19
+ level (int): Logging level
20
+ log_file (str, optional): Log file path
21
+
22
+ Returns:
23
+ logging.Logger: Configured logger
24
+ """
25
+ logger = logging.getLogger(name)
26
+
27
+ if logger.handlers:
28
+ return logger
29
+
30
+ logger.setLevel(level)
31
+
32
+ formatter = logging.Formatter(
33
+ "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
34
+ )
35
+
36
+ console_handler = logging.StreamHandler(sys.stdout)
37
+ console_handler.setLevel(level)
38
+ console_handler.setFormatter(formatter)
39
+ logger.addHandler(console_handler)
40
+
41
+ if log_file:
42
+ log_path = Path(log_file)
43
+ log_path.parent.mkdir(parents=True, exist_ok=True)
44
+
45
+ file_handler = logging.FileHandler(log_file, encoding='utf-8')
46
+ file_handler.setLevel(level)
47
+ file_handler.setFormatter(formatter)
48
+ logger.addHandler(file_handler)
49
+
50
+ return logger
51
+
52
+
53
+ def get_logger(name="crawl4weibo"):
54
+ """Get existing logger or create new one"""
55
+ return logging.getLogger(name)
@@ -0,0 +1,168 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ HTML/JSON parsing utilities for crawl4weibo
6
+ """
7
+
8
+ import re
9
+ from typing import List, Dict, Any, Optional
10
+ from datetime import datetime
11
+ from lxml import etree
12
+
13
+ from ..exceptions.base import ParseError
14
+ from .logger import get_logger
15
+
16
+
17
+ class WeiboParser:
18
+ """Parser for Weibo API responses and HTML content"""
19
+
20
+ def __init__(self):
21
+ self.logger = get_logger()
22
+
23
+ def parse_user_info(self, response_data: Dict[str, Any]) -> Dict[str, Any]:
24
+ """
25
+ Parse user information from API response
26
+
27
+ Args:
28
+ response_data: Raw API response data
29
+
30
+ Returns:
31
+ Dict containing parsed user information
32
+ """
33
+ try:
34
+ if "data" not in response_data or "userInfo" not in response_data["data"]:
35
+ raise ParseError("Invalid user info response format")
36
+
37
+ user_info = response_data["data"]["userInfo"]
38
+
39
+ return {
40
+ "id": str(user_info.get("id", "")),
41
+ "screen_name": user_info.get("screen_name", ""),
42
+ "gender": user_info.get("gender", ""),
43
+ "description": user_info.get("description", ""),
44
+ "followers_count": user_info.get("followers_count", 0),
45
+ "following_count": user_info.get("follow_count", 0),
46
+ "posts_count": user_info.get("statuses_count", 0),
47
+ "verified": user_info.get("verified", False),
48
+ "verified_reason": user_info.get("verified_reason", ""),
49
+ "avatar_url": user_info.get("profile_image_url", ""),
50
+ "cover_image_url": user_info.get("cover_image_phone", ""),
51
+ }
52
+ except Exception as e:
53
+ self.logger.error(f"Failed to parse user info: {e}")
54
+ raise ParseError(f"Failed to parse user info: {e}")
55
+
56
+ def parse_posts(self, response_data: Dict[str, Any]) -> List[Dict[str, Any]]:
57
+ """
58
+ Parse posts from API response
59
+
60
+ Args:
61
+ response_data: Raw API response data
62
+
63
+ Returns:
64
+ List of parsed post dictionaries
65
+ """
66
+ try:
67
+ if "data" not in response_data or "cards" not in response_data["data"]:
68
+ return []
69
+
70
+ posts = []
71
+ cards = response_data["data"]["cards"]
72
+
73
+ for card in cards:
74
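+ # card_type 9 marks an ordinary post card carrying the "mblog" payload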
+ if card.get("card_type") == 9 and "mblog" in card:
75
+ post_data = self._parse_single_post(card["mblog"])
76
+ if post_data:
77
+ posts.append(post_data)
78
+
79
+ return posts
80
+ except Exception as e:
81
+ self.logger.error(f"Failed to parse posts: {e}")
82
+ raise ParseError(f"Failed to parse posts: {e}")
83
+
84
+ def _parse_single_post(self, mblog: Dict[str, Any]) -> Optional[Dict[str, Any]]:
85
+ """Parse a single post from mblog data"""
86
+ try:
87
+ post = {
88
+ "id": str(mblog.get("id", "")),
89
+ "user_id": str(mblog.get("user", {}).get("id", "")),
90
+ "text": self._clean_text(mblog.get("text", "")),
91
+ "created_at": self._parse_time(mblog.get("created_at", "")),
92
+ "source": mblog.get("source", ""),
93
+ "reposts_count": mblog.get("reposts_count", 0),
94
+ "comments_count": mblog.get("comments_count", 0),
95
+ "attitudes_count": mblog.get("attitudes_count", 0),
96
+ "pic_urls": self._extract_pic_urls(mblog),
97
+ "video_url": self._extract_video_url(mblog),
98
+ "is_original": not mblog.get("retweeted_status"),
99
+ "location": mblog.get("geo", {}).get("name", ""),
100
+ "topic_ids": self._extract_topics(mblog.get("text", "")),
101
+ "at_users": self._extract_at_users(mblog.get("text", "")),
102
+ }
103
+
104
+ if mblog.get("retweeted_status"):
105
+ post["retweeted_status"] = self._parse_single_post(mblog["retweeted_status"])
106
+
107
+ return post
108
+ except Exception as e:
109
+ self.logger.error(f"Failed to parse single post: {e}")
110
+ return None
111
+
112
+ def _clean_text(self, text: str) -> str:
113
+ """Remove HTML tags and clean text"""
114
+ if not text:
115
+ return ""
116
+
117
+ text = re.sub(r'<[^>]+>', '', text)
118
+ text = re.sub(r'\s+', ' ', text).strip()
119
+ return text
120
+
121
+ def _parse_time(self, time_str: str) -> Optional[datetime]:
122
+ """Parse time string to datetime"""
123
+ if not time_str:
124
+ return None
125
+
126
+ try:
127
+ return datetime.strptime(time_str, "%a %b %d %H:%M:%S %z %Y")
128
+ except ValueError:
129
+ try:
130
+ return datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S")
131
+ except ValueError:
132
+ self.logger.warning(f"Failed to parse time: {time_str}")
133
+ return None
134
+
135
+ def _extract_pic_urls(self, mblog: Dict[str, Any]) -> List[str]:
136
+ """Extract picture URLs from post"""
137
+ pic_urls = []
138
+
139
+ if "pics" in mblog:
140
+ for pic in mblog["pics"]:
141
+ if "large" in pic and "url" in pic["large"]:
142
+ pic_urls.append(pic["large"]["url"])
143
+
144
+ return pic_urls
145
+
146
+ def _extract_video_url(self, mblog: Dict[str, Any]) -> str:
147
+ """Extract video URL from post"""
148
+ if "page_info" in mblog and mblog["page_info"].get("type") == "video":
149
+ media_info = mblog["page_info"].get("media_info", {})
150
+ return media_info.get("stream_url", "")
151
+
152
+ return ""
153
+
154
+ def _extract_topics(self, text: str) -> List[str]:
155
+ """Extract topic hashtags from text"""
156
+ if not text:
157
+ return []
158
+
159
+ topics = re.findall(r'#([^#]+)#', text)
160
+ return [topic.strip() for topic in topics if topic.strip()]
161
+
162
+ def _extract_at_users(self, text: str) -> List[str]:
163
+ """Extract @mentioned users from text"""
164
+ if not text:
165
+ return []
166
+
167
+ mentions = re.findall(r'@([^\s@]+)', text)
168
+ return [mention.strip() for mention in mentions if mention.strip()]
@@ -0,0 +1,134 @@
1
+ Metadata-Version: 2.4
2
+ Name: crawl4weibo
3
+ Version: 0.1.0
4
+ Summary: A professional Weibo crawler library
5
+ Home-page: https://github.com/yourusername/crawl4weibo
6
+ Author: Kritoooo
7
+ Author-email: Kritoooo <krito2023@gmail.com>
8
+ License: MIT
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.7
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Requires-Python: >=3.7
20
+ Description-Content-Type: text/markdown
21
+ Requires-Dist: requests>=2.25.0
22
+ Requires-Dist: lxml>=4.6.0
23
+ Requires-Dist: tqdm>=4.60.0
24
+ Requires-Dist: python-dateutil>=2.8.0
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=6.0; extra == "dev"
27
+ Requires-Dist: pytest-cov; extra == "dev"
28
+ Requires-Dist: black; extra == "dev"
29
+ Requires-Dist: isort; extra == "dev"
30
+ Requires-Dist: flake8; extra == "dev"
31
+ Dynamic: author
32
+ Dynamic: home-page
33
+ Dynamic: requires-python
34
+
35
+ # Crawl4Weibo
36
+
37
+ A professional Weibo crawler library for Python, built on an approach verified in real-world testing. It works without cookies.
38
+
39
+ ## Features
40
+
41
+ - 🚀 **Works out of the box**: no cookies needed, one-line initialization
42
+ - 🛡️ **Anti-scraping aware**: automatically handles HTTP 432 errors and rate limits
43
+ - 📱 **Realistic requests**: uses a genuine mobile-browser User-Agent
44
+ - 🔄 **Smart retries**: automatic retry mechanism
45
+ - 📊 **Structured data**: clean data models
46
+
47
+ ## Installation
48
+
49
+ ```bash
50
+ pip install crawl4weibo
51
+ ```
52
+
53
+ ## Quick Start
54
+
55
+ ```python
56
+ from crawl4weibo import WeiboClient
57
+
58
+ # Initialize the client (no cookies required)
59
+ client = WeiboClient()
60
+
61
+ # Fetch a user profile
62
+ user = client.get_user_by_uid("1195230310")
63
+ print(f"User: {user.screen_name}")
64
+ print(f"Followers: {user.followers_count:,}")
65
+
66
+ # Fetch the user's posts
67
+ posts = client.get_user_posts("1195230310")
68
+ for post in posts:
69
+     print(f"Post: {post.text[:50]}...")
70
+     print(f"Likes: {post.attitudes_count}")
71
+
72
+ # Search for users
73
+ users = client.search_users("技术博主")
74
+ for user in users:
75
+     print(f"User: {user.screen_name}")
76
+
77
+ # Search for posts
78
+ posts = client.search_posts("人工智能")
79
+ for post in posts:
80
+     print(f"Content: {post.text[:50]}...")
81
+ ```
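+
+ All failures are raised as subclasses of `CrawlError`, so calls can be wrapped
+ defensively. A minimal sketch using the exceptions the package exports:
+
+ ```python
+ from crawl4weibo import WeiboClient, UserNotFoundError, NetworkError
+
+ client = WeiboClient()
+
+ try:
+     user = client.get_user_by_uid("1195230310")
+     print(f"User: {user.screen_name}")
+ except UserNotFoundError:
+     print("No such user")
+ except NetworkError as exc:
+     print(f"Network trouble, try again later: {exc}")
+ ```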
82
+
83
+ ## API Reference
84
+
85
+ ### WeiboClient
86
+
87
+ #### Initialization
88
+ ```python
89
+ WeiboClient(cookies=None, log_level="INFO", log_file=None, user_agent=None)
90
+ ```
91
+
92
+ #### Main methods
93
+
94
+ - `get_user_by_uid(uid)` - fetch a user's profile by UID
95
+ - `get_user_posts(uid, page=1)` - fetch one page of a user's posts (see the pagination sketch below)
96
+ - `search_users(query, page=1, count=10)` - search for users
97
+ - `search_posts(query, page=1)` - search for posts
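+
+ `get_user_posts` returns one page at a time, so a common pattern is to walk the
+ pages until an empty page comes back. A minimal sketch (it assumes an empty list
+ marks the last page, which matches the client's behavior):
+
+ ```python
+ from crawl4weibo import WeiboClient
+
+ client = WeiboClient()
+
+ all_posts = []
+ page = 1
+ while True:
+     posts = client.get_user_posts("1195230310", page=page)
+     if not posts:  # an empty page means we have reached the end
+         break
+     all_posts.extend(posts)
+     page += 1
+
+ print(f"Collected {len(all_posts)} posts")
+ ```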
98
+
99
+ ### Data Models
100
+
101
+ **User**:
102
+ - `screen_name` - display name
103
+ - `followers_count` - follower count
104
+ - `posts_count` - number of posts
105
+ - `verified` - whether the account is verified
106
+
107
+ **Post**:
108
+ - `text` - post text
109
+ - `attitudes_count` - like count
110
+ - `comments_count` - comment count
111
+ - `created_at` - publication time
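+
+ Both models provide `from_dict`/`to_dict` helpers, which makes it easy to dump
+ results, for example to JSON (the `posts.json` filename below is just an example):
+
+ ```python
+ import json
+
+ from crawl4weibo import WeiboClient
+
+ client = WeiboClient()
+ posts = client.get_user_posts("1195230310")
+
+ # created_at is a datetime, so let json stringify it via default=str
+ with open("posts.json", "w", encoding="utf-8") as f:
+     json.dump([p.to_dict() for p in posts], f, ensure_ascii=False, default=str)
+ ```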
112
+
113
+ ## Running the Example
114
+
115
+ ```bash
116
+ python examples/simple_example.py
117
+ ```
118
+
119
+ ## Technical Details
120
+
121
+ The implementation relies on the following techniques (see the configuration sketch after the list):
122
+
123
+ - Android Chrome User-Agent emulation
124
+ - Mobile (m.weibo.cn) API endpoints
125
+ - Automatic session management
126
+ - Smart retries on HTTP 432 responses
127
+ - Randomized request intervals
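+
+ If the anonymous session starts getting blocked, the constructor also accepts
+ your own cookies and User-Agent (the cookie and UA values below are placeholders):
+
+ ```python
+ from crawl4weibo import WeiboClient
+
+ client = WeiboClient(
+     cookies="SUB=...; SUBP=...",  # placeholder cookie string from a logged-in browser
+     user_agent="Mozilla/5.0 (Linux; Android 13; Pixel 7) placeholder UA",
+     log_level="DEBUG",
+     log_file="crawl.log",
+ )
+ ```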
131
+
132
+ ## License
133
+
134
+ MIT License
@@ -0,0 +1,19 @@
1
+ README.md
2
+ pyproject.toml
3
+ setup.py
4
+ crawl4weibo/__init__.py
5
+ crawl4weibo.egg-info/PKG-INFO
6
+ crawl4weibo.egg-info/SOURCES.txt
7
+ crawl4weibo.egg-info/dependency_links.txt
8
+ crawl4weibo.egg-info/requires.txt
9
+ crawl4weibo.egg-info/top_level.txt
10
+ crawl4weibo/core/__init__.py
11
+ crawl4weibo/core/client.py
12
+ crawl4weibo/exceptions/__init__.py
13
+ crawl4weibo/exceptions/base.py
14
+ crawl4weibo/models/__init__.py
15
+ crawl4weibo/models/post.py
16
+ crawl4weibo/models/user.py
17
+ crawl4weibo/utils/__init__.py
18
+ crawl4weibo/utils/logger.py
19
+ crawl4weibo/utils/parser.py
@@ -0,0 +1,11 @@
1
+ requests>=2.25.0
2
+ lxml>=4.6.0
3
+ tqdm>=4.60.0
4
+ python-dateutil>=2.8.0
5
+
6
+ [dev]
7
+ pytest>=6.0
8
+ pytest-cov
9
+ black
10
+ isort
11
+ flake8
@@ -0,0 +1 @@
1
+ crawl4weibo
@@ -0,0 +1,53 @@
1
+ [build-system]
2
+ requires = ["setuptools>=45", "wheel", "setuptools_scm[toml]>=6.2"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "crawl4weibo"
7
+ description = "A professional Weibo crawler library"
8
+ readme = "README.md"
9
+ requires-python = ">=3.7"
10
+ license = {text = "MIT"}
11
+ authors = [
12
+ {name = "Kritoooo", email = "krito2023@gmail.com"},
13
+ ]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Operating System :: OS Independent",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.7",
21
+ "Programming Language :: Python :: 3.8",
22
+ "Programming Language :: Python :: 3.9",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ ]
26
+ dependencies = [
27
+ "requests>=2.25.0",
28
+ "lxml>=4.6.0",
29
+ "tqdm>=4.60.0",
30
+ "python-dateutil>=2.8.0",
31
+ ]
32
+ dynamic = ["version"]
33
+
34
+ [project.optional-dependencies]
35
+ dev = [
36
+ "pytest>=6.0",
37
+ "pytest-cov",
38
+ "black",
39
+ "isort",
40
+ "flake8",
41
+ ]
42
+
43
+ [tool.setuptools.packages.find]
44
+ where = ["."]
45
+ include = ["crawl4weibo*"]
46
+
47
+ [tool.black]
48
+ line-length = 88
49
+ target-version = ['py37']
50
+
51
+ [tool.isort]
52
+ profile = "black"
53
+ line_length = 88
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,47 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ from setuptools import setup, find_packages
5
+
6
+ with open("README.md", "r", encoding="utf-8") as fh:
7
+ long_description = fh.read()
8
+
9
+ setup(
10
+ name="crawl4weibo",
11
+ version="0.1.0",
12
+ author="Your Name",
13
+ author_email="your.email@example.com",
14
+ description="A professional Weibo crawler library",
15
+ long_description=long_description,
16
+ long_description_content_type="text/markdown",
17
+ url="https://github.com/yourusername/crawl4weibo",
18
+ packages=find_packages(),
19
+ classifiers=[
20
+ "Development Status :: 3 - Alpha",
21
+ "Intended Audience :: Developers",
22
+ "License :: OSI Approved :: MIT License",
23
+ "Operating System :: OS Independent",
24
+ "Programming Language :: Python :: 3",
25
+ "Programming Language :: Python :: 3.7",
26
+ "Programming Language :: Python :: 3.8",
27
+ "Programming Language :: Python :: 3.9",
28
+ "Programming Language :: Python :: 3.10",
29
+ "Programming Language :: Python :: 3.11",
30
+ ],
31
+ python_requires=">=3.7",
32
+ install_requires=[
33
+ "requests>=2.25.0",
34
+ "lxml>=4.6.0",
35
+ "tqdm>=4.60.0",
36
+ "python-dateutil>=2.8.0",
37
+ ],
38
+ extras_require={
39
+ "dev": [
40
+ "pytest>=6.0",
41
+ "pytest-cov",
42
+ "black",
43
+ "isort",
44
+ "flake8",
45
+ ],
46
+ },
47
+ )