crawl4weibo-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawl4weibo-0.1.0/PKG-INFO +134 -0
- crawl4weibo-0.1.0/README.md +100 -0
- crawl4weibo-0.1.0/crawl4weibo/__init__.py +27 -0
- crawl4weibo-0.1.0/crawl4weibo/core/__init__.py +6 -0
- crawl4weibo-0.1.0/crawl4weibo/core/client.py +195 -0
- crawl4weibo-0.1.0/crawl4weibo/exceptions/__init__.py +6 -0
- crawl4weibo-0.1.0/crawl4weibo/exceptions/base.py +58 -0
- crawl4weibo-0.1.0/crawl4weibo/models/__init__.py +6 -0
- crawl4weibo-0.1.0/crawl4weibo/models/post.py +83 -0
- crawl4weibo-0.1.0/crawl4weibo/models/user.py +81 -0
- crawl4weibo-0.1.0/crawl4weibo/utils/__init__.py +6 -0
- crawl4weibo-0.1.0/crawl4weibo/utils/logger.py +55 -0
- crawl4weibo-0.1.0/crawl4weibo/utils/parser.py +168 -0
- crawl4weibo-0.1.0/crawl4weibo.egg-info/PKG-INFO +134 -0
- crawl4weibo-0.1.0/crawl4weibo.egg-info/SOURCES.txt +19 -0
- crawl4weibo-0.1.0/crawl4weibo.egg-info/dependency_links.txt +1 -0
- crawl4weibo-0.1.0/crawl4weibo.egg-info/requires.txt +11 -0
- crawl4weibo-0.1.0/crawl4weibo.egg-info/top_level.txt +1 -0
- crawl4weibo-0.1.0/pyproject.toml +53 -0
- crawl4weibo-0.1.0/setup.cfg +4 -0
- crawl4weibo-0.1.0/setup.py +47 -0
@@ -0,0 +1,134 @@ crawl4weibo-0.1.0/PKG-INFO
+Metadata-Version: 2.4
+Name: crawl4weibo
+Version: 0.1.0
+Summary: A professional Weibo crawler library
+Home-page: https://github.com/yourusername/crawl4weibo
+Author: Your Name
+Author-email: Kritoooo <krito2023@gmail.com>
+License: MIT
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Requires-Python: >=3.7
+Description-Content-Type: text/markdown
+Requires-Dist: requests>=2.25.0
+Requires-Dist: lxml>=4.6.0
+Requires-Dist: tqdm>=4.60.0
+Requires-Dist: python-dateutil>=2.8.0
+Provides-Extra: dev
+Requires-Dist: pytest>=6.0; extra == "dev"
+Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: black; extra == "dev"
+Requires-Dist: isort; extra == "dev"
+Requires-Dist: flake8; extra == "dev"
+Dynamic: author
+Dynamic: home-page
+Dynamic: requires-python
+
+# Crawl4Weibo
+
+A professional Weibo crawler library for Python, built on an approach validated against the live site; it works without cookies.
+
+## Features
+
+- 🚀 **Works out of the box**: no cookies needed, one-line initialization
+- 🛡️ **Anti-blocking**: handles 432 errors and request throttling automatically
+- 📱 **Realistic emulation**: uses a real mobile-browser User-Agent
+- 🔄 **Smart retries**: automatic retry mechanism
+- 📊 **Structured data**: clean data models
+
+## Installation
+
+```bash
+pip install -e .
+```
+
+## Quick Start
+
+```python
+from crawl4weibo import WeiboClient
+
+# Initialize (no cookies required)
+client = WeiboClient()
+
+# Fetch user info
+user = client.get_user_by_uid("1195230310")
+print(f"User: {user.screen_name}")
+print(f"Followers: {user.followers_count:,}")
+
+# Fetch a user's posts
+posts = client.get_user_posts("1195230310")
+for post in posts:
+    print(f"Post: {post.text[:50]}...")
+    print(f"Likes: {post.attitudes_count}")
+
+# Search users
+users = client.search_users("技术博主")  # "tech bloggers"
+for user in users:
+    print(f"User: {user.screen_name}")
+
+# Search posts
+posts = client.search_posts("人工智能")  # "artificial intelligence"
+for post in posts:
+    print(f"Text: {post.text[:50]}...")
+```
+
+## API Reference
+
+### WeiboClient
+
+#### Initialization
+```python
+WeiboClient(cookies=None, log_level="INFO", log_file=None)
+```
+
+#### Main methods
+
+- `get_user_by_uid(uid)` - fetch user info
+- `get_user_posts(uid, page=1)` - fetch a user's posts
+- `search_users(query, page=1, count=10)` - search for users
+- `search_posts(query, page=1)` - search for posts
+
+### Data models
+
+**User**:
+- `screen_name` - display name
+- `followers_count` - follower count
+- `posts_count` - number of posts
+- `verified` - whether the account is verified
+
+**Post**:
+- `text` - post text
+- `attitudes_count` - like count
+- `comments_count` - comment count
+- `created_at` - publication time
+
+## Running the example
+
+```bash
+python examples/simple_example.py
+```
+
+## Technical approach
+
+The crawler combines a few techniques that have held up in practice:
+
+- Android Chrome User-Agent emulation
+- Mobile (m.weibo.cn) API endpoints
+- Automatic session management
+- Smart retries on 432 errors
+- Randomized request intervals
+
+## License
+
+MIT License
@@ -0,0 +1,100 @@ crawl4weibo-0.1.0/README.md
+# Crawl4Weibo
+
+A professional Weibo crawler library for Python, built on an approach validated against the live site; it works without cookies.
+
+## Features
+
+- 🚀 **Works out of the box**: no cookies needed, one-line initialization
+- 🛡️ **Anti-blocking**: handles 432 errors and request throttling automatically
+- 📱 **Realistic emulation**: uses a real mobile-browser User-Agent
+- 🔄 **Smart retries**: automatic retry mechanism
+- 📊 **Structured data**: clean data models
+
+## Installation
+
+```bash
+pip install -e .
+```
+
+## Quick Start
+
+```python
+from crawl4weibo import WeiboClient
+
+# Initialize (no cookies required)
+client = WeiboClient()
+
+# Fetch user info
+user = client.get_user_by_uid("1195230310")
+print(f"User: {user.screen_name}")
+print(f"Followers: {user.followers_count:,}")
+
+# Fetch a user's posts
+posts = client.get_user_posts("1195230310")
+for post in posts:
+    print(f"Post: {post.text[:50]}...")
+    print(f"Likes: {post.attitudes_count}")
+
+# Search users
+users = client.search_users("技术博主")  # "tech bloggers"
+for user in users:
+    print(f"User: {user.screen_name}")
+
+# Search posts
+posts = client.search_posts("人工智能")  # "artificial intelligence"
+for post in posts:
+    print(f"Text: {post.text[:50]}...")
+```
+
+## API Reference
+
+### WeiboClient
+
+#### Initialization
+```python
+WeiboClient(cookies=None, log_level="INFO", log_file=None)
+```
+
+#### Main methods
+
+- `get_user_by_uid(uid)` - fetch user info
+- `get_user_posts(uid, page=1)` - fetch a user's posts
+- `search_users(query, page=1, count=10)` - search for users
+- `search_posts(query, page=1)` - search for posts
+
+### Data models
+
+**User**:
+- `screen_name` - display name
+- `followers_count` - follower count
+- `posts_count` - number of posts
+- `verified` - whether the account is verified
+
+**Post**:
+- `text` - post text
+- `attitudes_count` - like count
+- `comments_count` - comment count
+- `created_at` - publication time
+
+## Running the example
+
+```bash
+python examples/simple_example.py
+```
+
+## Technical approach
+
+The crawler combines a few techniques that have held up in practice:
+
+- Android Chrome User-Agent emulation
+- Mobile (m.weibo.cn) API endpoints
+- Automatic session management
+- Smart retries on 432 errors
+- Randomized request intervals
+
+## License
+
+MIT License
@@ -0,0 +1,27 @@ crawl4weibo-0.1.0/crawl4weibo/__init__.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+crawl4weibo - A professional Weibo crawler library
+"""
+
+__version__ = "0.1.0"
+__author__ = "Your Name"
+__email__ = "your.email@example.com"
+
+from .core.client import WeiboClient
+from .models.user import User
+from .models.post import Post
+from .exceptions.base import CrawlError, AuthenticationError, RateLimitError, UserNotFoundError, NetworkError, ParseError
+
+__all__ = [
+    "WeiboClient",
+    "User",
+    "Post",
+    "CrawlError",
+    "AuthenticationError",
+    "RateLimitError",
+    "UserNotFoundError",
+    "NetworkError",
+    "ParseError",
+]
@@ -0,0 +1,195 @@ crawl4weibo-0.1.0/crawl4weibo/core/client.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Weibo crawler client, based on an approach validated against the live site
+"""
+
+import logging
+import requests
+import time
+import random
+from typing import List, Dict, Any, Optional, Union
+
+from ..utils.parser import WeiboParser
+from ..utils.logger import setup_logger
+from ..models.user import User
+from ..models.post import Post
+from ..exceptions.base import CrawlError, UserNotFoundError, ParseError, NetworkError
+
+
+class WeiboClient:
+    """Weibo crawler client"""
+
+    def __init__(self, cookies: Optional[Union[str, Dict[str, str]]] = None,
+                 log_level: str = "INFO", log_file: Optional[str] = None,
+                 user_agent: Optional[str] = None):
+        """
+        Initialize the Weibo client
+
+        Args:
+            cookies: optional cookie string or dict
+            log_level: logging level
+            log_file: log file path
+            user_agent: optional User-Agent string
+        """
+        self.logger = setup_logger(
+            level=getattr(logging, log_level.upper()),
+            log_file=log_file
+        )
+
+        # Create the HTTP session
+        self.session = requests.Session()
+
+        # Headers that have been verified to work
+        default_user_agent = (
+            "Mozilla/5.0 (Linux; Android 13; SM-G9980) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/112.0.5615.135 Mobile Safari/537.36"
+        )
+        self.session.headers.update({
+            "User-Agent": user_agent or default_user_agent,
+            "Referer": "https://m.weibo.cn/",
+            "Accept": "application/json, text/plain, */*",
+            "X-Requested-With": "XMLHttpRequest"
+        })
+
+        # Apply cookies if provided
+        if cookies:
+            self._set_cookies(cookies)
+
+        # Warm up the session
+        self._init_session()
+
+        # Response parser
+        self.parser = WeiboParser()
+
+        self.logger.info("WeiboClient initialized successfully")
+
+    def _set_cookies(self, cookies: Union[str, Dict[str, str]]):
+        """Set cookies on the session"""
+        if isinstance(cookies, str):
+            cookie_dict = {}
+            for pair in cookies.split(';'):
+                if '=' in pair:
+                    key, value = pair.split('=', 1)
+                    cookie_dict[key.strip()] = value.strip()
+            self.session.cookies.update(cookie_dict)
+        elif isinstance(cookies, dict):
+            self.session.cookies.update(cookies)
+
+    def _init_session(self):
+        """Warm up the session by fetching homepage cookies"""
+        try:
+            self.logger.debug("Initializing session...")
+            self.session.get("https://m.weibo.cn/", timeout=5)
+            time.sleep(random.uniform(2, 4))
+        except Exception as e:
+            self.logger.warning(f"Session initialization failed: {e}")
+
+    def _request(self, url: str, params: Dict[str, Any], max_retries: int = 3) -> Dict[str, Any]:
+        """Send a request with retry handling"""
+        for attempt in range(1, max_retries + 1):
+            try:
+                response = self.session.get(url, params=params, timeout=5)
+
+                if response.status_code == 200:
+                    return response.json()
+                elif response.status_code == 432:
+                    if attempt < max_retries:
+                        sleep_time = random.uniform(4, 7)
+                        self.logger.warning(f"Got HTTP 432, retrying in {sleep_time:.1f}s...")
+                        time.sleep(sleep_time)
+                        continue
+                    else:
+                        raise NetworkError("Blocked by HTTP 432 anti-crawling response")
+                else:
+                    response.raise_for_status()
+
+            except requests.exceptions.RequestException as e:
+                if attempt < max_retries:
+                    sleep_time = random.uniform(2, 5)
+                    self.logger.warning(f"Request failed, retrying in {sleep_time:.1f}s: {e}")
+                    time.sleep(sleep_time)
+                    continue
+                else:
+                    raise NetworkError(f"Request failed: {e}")
+
+        raise CrawlError("Maximum retries exceeded")
+
+    def get_user_by_uid(self, uid: str) -> User:
+        """Fetch user info by UID"""
+        url = "https://m.weibo.cn/api/container/getIndex"
+        params = {"containerid": f"100505{uid}"}
+
+        data = self._request(url, params)
+
+        if not data.get("data") or not data["data"].get("userInfo"):
+            raise UserNotFoundError(f"User {uid} does not exist")
+
+        user_info = self.parser.parse_user_info(data)
+        user = User.from_dict(user_info)
+
+        self.logger.info(f"Fetched user: {user.screen_name}")
+        return user
+
+    def get_user_posts(self, uid: str, page: int = 1) -> List[Post]:
+        """Fetch a user's posts"""
+        time.sleep(random.uniform(1, 3))  # request spacing
+
+        url = "https://m.weibo.cn/api/container/getIndex"
+        params = {"containerid": f"107603{uid}", "page": page}
+
+        data = self._request(url, params)
+
+        if not data.get("data"):
+            return []
+
+        posts_data = self.parser.parse_posts(data)
+        posts = [Post.from_dict(post_data) for post_data in posts_data]
+
+        self.logger.info(f"Fetched {len(posts)} posts")
+        return posts
+
+    def search_users(self, query: str, page: int = 1, count: int = 10) -> List[User]:
+        """Search for users"""
+        time.sleep(random.uniform(1, 3))
+
+        url = "https://m.weibo.cn/api/container/getIndex"
+        params = {
+            "containerid": f"100103type=3&q={query}",
+            "page": page,
+            "count": count
+        }
+
+        data = self._request(url, params)
+        users = []
+        cards = data.get("data", {}).get("cards", [])
+
+        for card in cards:
+            if card.get("card_type") == 11:
+                card_group = card.get("card_group", [])
+                for group_card in card_group:
+                    if group_card.get("card_type") == 10:
+                        user_data = group_card.get("user", {})
+                        if user_data:
+                            users.append(User.from_dict(user_data))
+
+        self.logger.info(f"Found {len(users)} users")
+        return users
+
+    def search_posts(self, query: str, page: int = 1) -> List[Post]:
+        """Search for posts"""
+        time.sleep(random.uniform(1, 3))
+
+        url = "https://m.weibo.cn/api/container/getIndex"
+        params = {
+            "containerid": f"100103type=1&q={query}",
+            "page": page
+        }
+
+        data = self._request(url, params)
+        posts_data = self.parser.parse_posts(data)
+        posts = [Post.from_dict(post_data) for post_data in posts_data]
+
+        self.logger.info(f"Found {len(posts)} posts")
+        return posts
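
The retry policy above means a caller only ever sees two failure modes from these methods: `NetworkError` once `max_retries` is exhausted (including repeated 432 blocks) and `UserNotFoundError` for a missing profile. A minimal sketch of the calling pattern, using only names the package exports; the cookie value is a placeholder:

```python
from crawl4weibo import WeiboClient, NetworkError, UserNotFoundError

client = WeiboClient(cookies="SUB=...; SUBP=...")  # placeholder; cookies are optional

try:
    user = client.get_user_by_uid("1195230310")
    posts = client.get_user_posts(user.id, page=1)
except UserNotFoundError as e:
    print(f"no such user: {e.message}")
except NetworkError as e:
    print(f"giving up after retries [{e.code}]: {e.message}")
```
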
@@ -0,0 +1,58 @@ crawl4weibo-0.1.0/crawl4weibo/exceptions/base.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Base exceptions for crawl4weibo
+"""
+
+
+class CrawlError(Exception):
+    """Base exception for crawl4weibo"""
+
+    def __init__(self, message="An error occurred during crawling", code=None):
+        self.message = message
+        self.code = code
+        super().__init__(self.message)
+
+
+class AuthenticationError(CrawlError):
+    """Raised when authentication fails"""
+
+    def __init__(self, message="Authentication failed", code="AUTH_ERROR"):
+        super().__init__(message, code)
+
+
+class RateLimitError(CrawlError):
+    """Raised when rate limit is exceeded"""
+
+    def __init__(self, message="Rate limit exceeded", code="RATE_LIMIT", retry_after=None):
+        self.retry_after = retry_after
+        super().__init__(message, code)
+
+
+class NetworkError(CrawlError):
+    """Raised when network request fails"""
+
+    def __init__(self, message="Network request failed", code="NETWORK_ERROR"):
+        super().__init__(message, code)
+
+
+class ParseError(CrawlError):
+    """Raised when response parsing fails"""
+
+    def __init__(self, message="Failed to parse response", code="PARSE_ERROR"):
+        super().__init__(message, code)
+
+
+class UserNotFoundError(CrawlError):
+    """Raised when user is not found"""
+
+    def __init__(self, message="User not found", code="USER_NOT_FOUND"):
+        super().__init__(message, code)
+
+
+class InvalidConfigError(CrawlError):
+    """Raised when configuration is invalid"""
+
+    def __init__(self, message="Invalid configuration", code="INVALID_CONFIG"):
+        super().__init__(message, code)
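
Since every class above derives from `CrawlError` and carries a machine-readable `code`, callers that do not need per-error handling can use a single base-class handler; an illustrative sketch:

```python
from crawl4weibo import CrawlError, RateLimitError

try:
    ...  # any client call
except RateLimitError as e:
    # the only exception with extra state: an optional retry-after hint
    wait = e.retry_after or 60
except CrawlError as e:
    # catches NetworkError, ParseError, UserNotFoundError, AuthenticationError
    print(f"[{e.code}] {e.message}")
```
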
@@ -0,0 +1,83 @@ crawl4weibo-0.1.0/crawl4weibo/models/post.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Post model for crawl4weibo
+"""
+
+from dataclasses import dataclass, field
+from typing import Optional, List, Dict, Any
+from datetime import datetime
+
+
+@dataclass
+class Post:
+    """Weibo post model"""
+
+    id: str
+    user_id: str
+    text: str = ""
+    created_at: Optional[datetime] = None
+    source: str = ""
+    reposts_count: int = 0
+    comments_count: int = 0
+    attitudes_count: int = 0
+    pic_urls: List[str] = field(default_factory=list)
+    video_url: str = ""
+    is_original: bool = True
+    retweeted_status: Optional["Post"] = None
+    location: str = ""
+    topic_ids: List[str] = field(default_factory=list)
+    at_users: List[str] = field(default_factory=list)
+    raw_data: Dict[str, Any] = field(default_factory=dict)
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "Post":
+        """Create Post instance from dictionary"""
+        retweeted_status = None
+        if data.get("retweeted_status"):
+            retweeted_status = cls.from_dict(data["retweeted_status"])
+
+        post_data = {
+            "id": str(data.get("id", "")),
+            "user_id": str(data.get("user_id", "")),
+            "text": data.get("text", ""),
+            "created_at": data.get("created_at"),
+            "source": data.get("source", ""),
+            "reposts_count": data.get("reposts_count", 0),
+            "comments_count": data.get("comments_count", 0),
+            "attitudes_count": data.get("attitudes_count", 0),
+            "pic_urls": data.get("pic_urls", []),
+            "video_url": data.get("video_url", ""),
+            "is_original": data.get("is_original", True),
+            "retweeted_status": retweeted_status,
+            "location": data.get("location", ""),
+            "topic_ids": data.get("topic_ids", []),
+            "at_users": data.get("at_users", []),
+            "raw_data": data,
+        }
+        return cls(**post_data)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert Post instance to dictionary"""
+        result = {
+            "id": self.id,
+            "user_id": self.user_id,
+            "text": self.text,
+            "created_at": self.created_at,
+            "source": self.source,
+            "reposts_count": self.reposts_count,
+            "comments_count": self.comments_count,
+            "attitudes_count": self.attitudes_count,
+            "pic_urls": self.pic_urls,
+            "video_url": self.video_url,
+            "is_original": self.is_original,
+            "location": self.location,
+            "topic_ids": self.topic_ids,
+            "at_users": self.at_users,
+        }
+
+        if self.retweeted_status:
+            result["retweeted_status"] = self.retweeted_status.to_dict()
+
+        return result
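
`from_dict` and `to_dict` are nearly inverses: IDs are coerced to `str`, a nested repost is rebuilt recursively, and the raw payload kept in `raw_data` is deliberately dropped on the way out. `User` below follows the same pattern. A quick illustration with hand-made data:

```python
from crawl4weibo.models.post import Post

post = Post.from_dict({
    "id": 4901234567890123,  # integer on input, coerced to str
    "user_id": "1195230310",
    "text": "hello",
    "retweeted_status": {"id": "1", "user_id": "2", "text": "original"},
})
d = post.to_dict()
assert d["id"] == "4901234567890123"
assert d["retweeted_status"]["text"] == "original"
assert "raw_data" not in d  # raw payload stays on the instance only
```
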
@@ -0,0 +1,81 @@ crawl4weibo-0.1.0/crawl4weibo/models/user.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+User model for crawl4weibo
+"""
+
+from dataclasses import dataclass, field
+from typing import Optional, Dict, Any
+from datetime import datetime
+
+
+@dataclass
+class User:
+    """Weibo user model"""
+
+    id: str
+    screen_name: str = ""
+    gender: str = ""
+    location: str = ""
+    description: str = ""
+    followers_count: int = 0
+    following_count: int = 0
+    posts_count: int = 0
+    verified: bool = False
+    verified_reason: str = ""
+    avatar_url: str = ""
+    cover_image_url: str = ""
+    birthday: Optional[str] = None
+    education: str = ""
+    company: str = ""
+    registration_time: Optional[datetime] = None
+    sunshine_credit: str = ""
+    raw_data: Dict[str, Any] = field(default_factory=dict)
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "User":
+        """Create User instance from dictionary"""
+        user_data = {
+            "id": str(data.get("id", "")),
+            "screen_name": data.get("screen_name", ""),
+            "gender": data.get("gender", ""),
+            "location": data.get("location", ""),
+            "description": data.get("description", ""),
+            "followers_count": data.get("followers_count", 0),
+            "following_count": data.get("following_count", 0),
+            "posts_count": data.get("posts_count", 0),
+            "verified": data.get("verified", False),
+            "verified_reason": data.get("verified_reason", ""),
+            "avatar_url": data.get("avatar_url", ""),
+            "cover_image_url": data.get("cover_image_url", ""),
+            "birthday": data.get("birthday"),
+            "education": data.get("education", ""),
+            "company": data.get("company", ""),
+            "registration_time": data.get("registration_time"),
+            "sunshine_credit": data.get("sunshine_credit", ""),
+            "raw_data": data,
+        }
+        return cls(**user_data)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert User instance to dictionary"""
+        return {
+            "id": self.id,
+            "screen_name": self.screen_name,
+            "gender": self.gender,
+            "location": self.location,
+            "description": self.description,
+            "followers_count": self.followers_count,
+            "following_count": self.following_count,
+            "posts_count": self.posts_count,
+            "verified": self.verified,
+            "verified_reason": self.verified_reason,
+            "avatar_url": self.avatar_url,
+            "cover_image_url": self.cover_image_url,
+            "birthday": self.birthday,
+            "education": self.education,
+            "company": self.company,
+            "registration_time": self.registration_time,
+            "sunshine_credit": self.sunshine_credit,
+        }
@@ -0,0 +1,55 @@ crawl4weibo-0.1.0/crawl4weibo/utils/logger.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Logger utilities for crawl4weibo
+"""
+
+import logging
+import sys
+from pathlib import Path
+
+
+def setup_logger(name="crawl4weibo", level=logging.INFO, log_file=None):
+    """
+    Setup logger with console and optional file output
+
+    Args:
+        name (str): Logger name
+        level (int): Logging level
+        log_file (str, optional): Log file path
+
+    Returns:
+        logging.Logger: Configured logger
+    """
+    logger = logging.getLogger(name)
+
+    if logger.handlers:
+        return logger
+
+    logger.setLevel(level)
+
+    formatter = logging.Formatter(
+        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    )
+
+    console_handler = logging.StreamHandler(sys.stdout)
+    console_handler.setLevel(level)
+    console_handler.setFormatter(formatter)
+    logger.addHandler(console_handler)
+
+    if log_file:
+        log_path = Path(log_file)
+        log_path.parent.mkdir(parents=True, exist_ok=True)
+
+        file_handler = logging.FileHandler(log_file, encoding='utf-8')
+        file_handler.setLevel(level)
+        file_handler.setFormatter(formatter)
+        logger.addHandler(file_handler)
+
+    return logger
+
+
+def get_logger(name="crawl4weibo"):
+    """Get existing logger or create new one"""
+    return logging.getLogger(name)
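
Note the early return when the logger already has handlers: the first `setup_logger` call wins and later calls, including the one inside `WeiboClient.__init__`, are effectively no-ops. An application that wants file logging should therefore configure it before creating a client; for example:

```python
import logging
from crawl4weibo.utils.logger import setup_logger, get_logger

setup_logger(level=logging.DEBUG, log_file="logs/crawl.log")  # first call wins
log = get_logger()  # same "crawl4weibo" logger from anywhere
log.debug("logging configured")
```
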
@@ -0,0 +1,168 @@ crawl4weibo-0.1.0/crawl4weibo/utils/parser.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+HTML/JSON parsing utilities for crawl4weibo
+"""
+
+import re
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+from lxml import etree
+
+from ..exceptions.base import ParseError
+from .logger import get_logger
+
+
+class WeiboParser:
+    """Parser for Weibo API responses and HTML content"""
+
+    def __init__(self):
+        self.logger = get_logger()
+
+    def parse_user_info(self, response_data: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Parse user information from API response
+
+        Args:
+            response_data: Raw API response data
+
+        Returns:
+            Dict containing parsed user information
+        """
+        try:
+            if "data" not in response_data or "userInfo" not in response_data["data"]:
+                raise ParseError("Invalid user info response format")
+
+            user_info = response_data["data"]["userInfo"]
+
+            return {
+                "id": str(user_info.get("id", "")),
+                "screen_name": user_info.get("screen_name", ""),
+                "gender": user_info.get("gender", ""),
+                "description": user_info.get("description", ""),
+                "followers_count": user_info.get("followers_count", 0),
+                "following_count": user_info.get("follow_count", 0),
+                "posts_count": user_info.get("statuses_count", 0),
+                "verified": user_info.get("verified", False),
+                "verified_reason": user_info.get("verified_reason", ""),
+                "avatar_url": user_info.get("profile_image_url", ""),
+                "cover_image_url": user_info.get("cover_image_phone", ""),
+            }
+        except Exception as e:
+            self.logger.error(f"Failed to parse user info: {e}")
+            raise ParseError(f"Failed to parse user info: {e}")
+
+    def parse_posts(self, response_data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """
+        Parse posts from API response
+
+        Args:
+            response_data: Raw API response data
+
+        Returns:
+            List of parsed post dictionaries
+        """
+        try:
+            if "data" not in response_data or "cards" not in response_data["data"]:
+                return []
+
+            posts = []
+            cards = response_data["data"]["cards"]
+
+            for card in cards:
+                if card.get("card_type") == 9 and "mblog" in card:
+                    post_data = self._parse_single_post(card["mblog"])
+                    if post_data:
+                        posts.append(post_data)
+
+            return posts
+        except Exception as e:
+            self.logger.error(f"Failed to parse posts: {e}")
+            raise ParseError(f"Failed to parse posts: {e}")
+
+    def _parse_single_post(self, mblog: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+        """Parse a single post from mblog data"""
+        try:
+            post = {
+                "id": str(mblog.get("id", "")),
+                "user_id": str(mblog.get("user", {}).get("id", "")),
+                "text": self._clean_text(mblog.get("text", "")),
+                "created_at": self._parse_time(mblog.get("created_at", "")),
+                "source": mblog.get("source", ""),
+                "reposts_count": mblog.get("reposts_count", 0),
+                "comments_count": mblog.get("comments_count", 0),
+                "attitudes_count": mblog.get("attitudes_count", 0),
+                "pic_urls": self._extract_pic_urls(mblog),
+                "video_url": self._extract_video_url(mblog),
+                "is_original": not mblog.get("retweeted_status"),
+                "location": mblog.get("geo", {}).get("name", ""),
+                "topic_ids": self._extract_topics(mblog.get("text", "")),
+                "at_users": self._extract_at_users(mblog.get("text", "")),
+            }
+
+            if mblog.get("retweeted_status"):
+                post["retweeted_status"] = self._parse_single_post(mblog["retweeted_status"])
+
+            return post
+        except Exception as e:
+            self.logger.error(f"Failed to parse single post: {e}")
+            return None
+
+    def _clean_text(self, text: str) -> str:
+        """Remove HTML tags and clean text"""
+        if not text:
+            return ""
+
+        text = re.sub(r'<[^>]+>', '', text)
+        text = re.sub(r'\s+', ' ', text).strip()
+        return text
+
+    def _parse_time(self, time_str: str) -> Optional[datetime]:
+        """Parse time string to datetime"""
+        if not time_str:
+            return None
+
+        try:
+            return datetime.strptime(time_str, "%a %b %d %H:%M:%S %z %Y")
+        except ValueError:
+            try:
+                return datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S")
+            except ValueError:
+                self.logger.warning(f"Failed to parse time: {time_str}")
+                return None
+
+    def _extract_pic_urls(self, mblog: Dict[str, Any]) -> List[str]:
+        """Extract picture URLs from post"""
+        pic_urls = []
+
+        if "pics" in mblog:
+            for pic in mblog["pics"]:
+                if "large" in pic and "url" in pic["large"]:
+                    pic_urls.append(pic["large"]["url"])
+
+        return pic_urls
+
+    def _extract_video_url(self, mblog: Dict[str, Any]) -> str:
+        """Extract video URL from post"""
+        if "page_info" in mblog and mblog["page_info"].get("type") == "video":
+            media_info = mblog["page_info"].get("media_info", {})
+            return media_info.get("stream_url", "")
+
+        return ""
+
+    def _extract_topics(self, text: str) -> List[str]:
+        """Extract topic hashtags from text"""
+        if not text:
+            return []
+
+        topics = re.findall(r'#([^#]+)#', text)
+        return [topic.strip() for topic in topics if topic.strip()]
+
+    def _extract_at_users(self, text: str) -> List[str]:
+        """Extract @mentioned users from text"""
+        if not text:
+            return []
+
+        mentions = re.findall(r'@([^\s@]+)', text)
+        return [mention.strip() for mention in mentions if mention.strip()]
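
The text helpers are plain regexes applied to the HTML-bearing `text` field, so their behavior is easy to check directly; a small demonstration on a made-up snippet:

```python
from crawl4weibo.utils.parser import WeiboParser

p = WeiboParser()
html = 'Check out #AI# <a href="/n/Alice">@Alice</a> <br/>nice post'
print(p._clean_text(html))                  # Check out #AI# @Alice nice post
print(p._extract_topics(html))              # ['AI']
print(p._extract_at_users('@Alice hello'))  # ['Alice']
```
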
@@ -0,0 +1,134 @@ crawl4weibo-0.1.0/crawl4weibo.egg-info/PKG-INFO
+Metadata-Version: 2.4
+Name: crawl4weibo
+Version: 0.1.0
+Summary: A professional Weibo crawler library
+Home-page: https://github.com/yourusername/crawl4weibo
+Author: Your Name
+Author-email: Kritoooo <krito2023@gmail.com>
+License: MIT
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Requires-Python: >=3.7
+Description-Content-Type: text/markdown
+Requires-Dist: requests>=2.25.0
+Requires-Dist: lxml>=4.6.0
+Requires-Dist: tqdm>=4.60.0
+Requires-Dist: python-dateutil>=2.8.0
+Provides-Extra: dev
+Requires-Dist: pytest>=6.0; extra == "dev"
+Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: black; extra == "dev"
+Requires-Dist: isort; extra == "dev"
+Requires-Dist: flake8; extra == "dev"
+Dynamic: author
+Dynamic: home-page
+Dynamic: requires-python
+
+# Crawl4Weibo
+
+A professional Weibo crawler library for Python, built on an approach validated against the live site; it works without cookies.
+
+## Features
+
+- 🚀 **Works out of the box**: no cookies needed, one-line initialization
+- 🛡️ **Anti-blocking**: handles 432 errors and request throttling automatically
+- 📱 **Realistic emulation**: uses a real mobile-browser User-Agent
+- 🔄 **Smart retries**: automatic retry mechanism
+- 📊 **Structured data**: clean data models
+
+## Installation
+
+```bash
+pip install -e .
+```
+
+## Quick Start
+
+```python
+from crawl4weibo import WeiboClient
+
+# Initialize (no cookies required)
+client = WeiboClient()
+
+# Fetch user info
+user = client.get_user_by_uid("1195230310")
+print(f"User: {user.screen_name}")
+print(f"Followers: {user.followers_count:,}")
+
+# Fetch a user's posts
+posts = client.get_user_posts("1195230310")
+for post in posts:
+    print(f"Post: {post.text[:50]}...")
+    print(f"Likes: {post.attitudes_count}")
+
+# Search users
+users = client.search_users("技术博主")  # "tech bloggers"
+for user in users:
+    print(f"User: {user.screen_name}")
+
+# Search posts
+posts = client.search_posts("人工智能")  # "artificial intelligence"
+for post in posts:
+    print(f"Text: {post.text[:50]}...")
+```
+
+## API Reference
+
+### WeiboClient
+
+#### Initialization
+```python
+WeiboClient(cookies=None, log_level="INFO", log_file=None)
+```
+
+#### Main methods
+
+- `get_user_by_uid(uid)` - fetch user info
+- `get_user_posts(uid, page=1)` - fetch a user's posts
+- `search_users(query, page=1, count=10)` - search for users
+- `search_posts(query, page=1)` - search for posts
+
+### Data models
+
+**User**:
+- `screen_name` - display name
+- `followers_count` - follower count
+- `posts_count` - number of posts
+- `verified` - whether the account is verified
+
+**Post**:
+- `text` - post text
+- `attitudes_count` - like count
+- `comments_count` - comment count
+- `created_at` - publication time
+
+## Running the example
+
+```bash
+python examples/simple_example.py
+```
+
+## Technical approach
+
+The crawler combines a few techniques that have held up in practice:
+
+- Android Chrome User-Agent emulation
+- Mobile (m.weibo.cn) API endpoints
+- Automatic session management
+- Smart retries on 432 errors
+- Randomized request intervals
+
+## License
+
+MIT License
@@ -0,0 +1,19 @@ crawl4weibo-0.1.0/crawl4weibo.egg-info/SOURCES.txt
+README.md
+pyproject.toml
+setup.py
+crawl4weibo/__init__.py
+crawl4weibo.egg-info/PKG-INFO
+crawl4weibo.egg-info/SOURCES.txt
+crawl4weibo.egg-info/dependency_links.txt
+crawl4weibo.egg-info/requires.txt
+crawl4weibo.egg-info/top_level.txt
+crawl4weibo/core/__init__.py
+crawl4weibo/core/client.py
+crawl4weibo/exceptions/__init__.py
+crawl4weibo/exceptions/base.py
+crawl4weibo/models/__init__.py
+crawl4weibo/models/post.py
+crawl4weibo/models/user.py
+crawl4weibo/utils/__init__.py
+crawl4weibo/utils/logger.py
+crawl4weibo/utils/parser.py
@@ -0,0 +1 @@ crawl4weibo-0.1.0/crawl4weibo.egg-info/dependency_links.txt
+
@@ -0,0 +1 @@ crawl4weibo-0.1.0/crawl4weibo.egg-info/top_level.txt
+crawl4weibo
@@ -0,0 +1,53 @@ crawl4weibo-0.1.0/pyproject.toml
+[build-system]
+requires = ["setuptools>=45", "wheel", "setuptools_scm[toml]>=6.2"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "crawl4weibo"
+description = "A professional Weibo crawler library"
+readme = "README.md"
+requires-python = ">=3.7"
+license = {text = "MIT"}
+authors = [
+    {name = "Kritoooo", email = "krito2023@gmail.com"},
+]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.7",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+]
+dependencies = [
+    "requests>=2.25.0",
+    "lxml>=4.6.0",
+    "tqdm>=4.60.0",
+    "python-dateutil>=2.8.0",
+]
+dynamic = ["version"]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=6.0",
+    "pytest-cov",
+    "black",
+    "isort",
+    "flake8",
+]
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["crawl4weibo*"]
+
+[tool.black]
+line-length = 88
+target-version = ['py37']
+
+[tool.isort]
+profile = "black"
+line_length = 88
@@ -0,0 +1,47 @@ crawl4weibo-0.1.0/setup.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from setuptools import setup, find_packages
+
+with open("README.md", "r", encoding="utf-8") as fh:
+    long_description = fh.read()
+
+setup(
+    name="crawl4weibo",
+    version="0.1.0",
+    author="Your Name",
+    author_email="your.email@example.com",
+    description="A professional Weibo crawler library",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/yourusername/crawl4weibo",
+    packages=find_packages(),
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "Intended Audience :: Developers",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+    ],
+    python_requires=">=3.7",
+    install_requires=[
+        "requests>=2.25.0",
+        "lxml>=4.6.0",
+        "tqdm>=4.60.0",
+        "python-dateutil>=2.8.0",
+    ],
+    extras_require={
+        "dev": [
+            "pytest>=6.0",
+            "pytest-cov",
+            "black",
+            "isort",
+            "flake8",
+        ],
+    },
+)
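
With the `dev` extra declared in both `pyproject.toml` and `setup.py`, a contributor setup is one editable install; the `pytest` invocation assumes a test suite exists in the checkout:

```bash
pip install -e ".[dev]"
pytest --cov=crawl4weibo   # pytest-cov comes from the dev extra
```
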