dykit 4.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dykit-4.0.0/LICENSE +21 -0
- dykit-4.0.0/PKG-INFO +325 -0
- dykit-4.0.0/README.md +293 -0
- dykit-4.0.0/dykit/__init__.py +61 -0
- dykit-4.0.0/dykit/__main__.py +26 -0
- dykit-4.0.0/dykit/buffer.py +193 -0
- dykit-4.0.0/dykit/cli/__init__.py +5 -0
- dykit-4.0.0/dykit/cli/app.py +21 -0
- dykit-4.0.0/dykit/cli/commands/__init__.py +1 -0
- dykit-4.0.0/dykit/cli/commands/analysis_cmd.py +274 -0
- dykit-4.0.0/dykit/cli/commands/collect_cmd.py +88 -0
- dykit-4.0.0/dykit/cli/commands/initdb_cmd.py +27 -0
- dykit-4.0.0/dykit/cli/commands/io_cmd.py +51 -0
- dykit-4.0.0/dykit/cli/commands/service_cmd.py +177 -0
- dykit-4.0.0/dykit/cli/common.py +62 -0
- dykit-4.0.0/dykit/cli/formatters.py +88 -0
- dykit-4.0.0/dykit/cli/options.py +111 -0
- dykit-4.0.0/dykit/cli/rich_output.py +26 -0
- dykit-4.0.0/dykit/cli/services/__init__.py +1 -0
- dykit-4.0.0/dykit/cli/services/analysis_flow.py +132 -0
- dykit-4.0.0/dykit/cli/services/dbio.py +205 -0
- dykit-4.0.0/dykit/collectors/__init__.py +26 -0
- dykit-4.0.0/dykit/collectors/async_.py +478 -0
- dykit-4.0.0/dykit/collectors/base.py +136 -0
- dykit-4.0.0/dykit/constants.py +47 -0
- dykit-4.0.0/dykit/log.py +37 -0
- dykit-4.0.0/dykit/protocol.py +470 -0
- dykit-4.0.0/dykit/storage/__init__.py +35 -0
- dykit-4.0.0/dykit/storage/base.py +160 -0
- dykit-4.0.0/dykit/storage/console.py +85 -0
- dykit-4.0.0/dykit/storage/csv.py +228 -0
- dykit-4.0.0/dykit/storage/postgres.py +306 -0
- dykit-4.0.0/dykit/tools/__init__.py +1 -0
- dykit-4.0.0/dykit/tools/cluster.py +180 -0
- dykit-4.0.0/dykit/tools/prune.py +67 -0
- dykit-4.0.0/dykit/tools/rank.py +206 -0
- dykit-4.0.0/dykit/tools/search.py +130 -0
- dykit-4.0.0/dykit/types.py +128 -0
- dykit-4.0.0/dykit.egg-info/PKG-INFO +325 -0
- dykit-4.0.0/dykit.egg-info/SOURCES.txt +47 -0
- dykit-4.0.0/dykit.egg-info/dependency_links.txt +1 -0
- dykit-4.0.0/dykit.egg-info/entry_points.txt +2 -0
- dykit-4.0.0/dykit.egg-info/requires.txt +17 -0
- dykit-4.0.0/dykit.egg-info/top_level.txt +1 -0
- dykit-4.0.0/pyproject.toml +81 -0
- dykit-4.0.0/setup.cfg +4 -0
- dykit-4.0.0/tests/test_cli.py +696 -0
- dykit-4.0.0/tests/test_collector_keepalive_contract.py +76 -0
- dykit-4.0.0/tests/test_smoke_6657.py +262 -0
dykit-4.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Joxos
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
dykit-4.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dykit
|
|
3
|
+
Version: 4.0.0
|
|
4
|
+
Summary: Douyu Live Stream Danmu (弹幕) Collector - A modular, async-capable library for collecting chat messages from Douyu live streams.
|
|
5
|
+
Author-email: Douyu Danmu Crawler Project <contact@example.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Joxos/dykit
|
|
8
|
+
Project-URL: Repository, https://github.com/Joxos/dykit
|
|
9
|
+
Project-URL: Documentation, https://github.com/Joxos/dykit/blob/main/README.md
|
|
10
|
+
Project-URL: Issues, https://github.com/Joxos/dykit/issues
|
|
11
|
+
Keywords: douyu,danmu,chat,websocket,async
|
|
12
|
+
Requires-Python: >=3.12
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Requires-Dist: websockets>=12.0
|
|
16
|
+
Requires-Dist: psycopg[binary]>=3.0.0
|
|
17
|
+
Requires-Dist: click>=8
|
|
18
|
+
Requires-Dist: rich>=14.0.0
|
|
19
|
+
Requires-Dist: loguru>=0.7.0
|
|
20
|
+
Requires-Dist: httpx>=0.28.1
|
|
21
|
+
Requires-Dist: beautifulsoup4>=4.14.3
|
|
22
|
+
Requires-Dist: rapidfuzz>=3.14.1
|
|
23
|
+
Requires-Dist: construct>=2.10.70
|
|
24
|
+
Requires-Dist: tenacity>=9.1.2
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
27
|
+
Requires-Dist: basedpyright>=1.31.2; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
|
|
30
|
+
Requires-Dist: vulture>=2.14; extra == "dev"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# dykit - 斗鱼弹幕采集与分析工具
|
|
34
|
+
|
|
35
|
+
PostgreSQL 架构,支持实时采集、数据分析和 CSV 导入导出。
|
|
36
|
+
|
|
37
|
+
v4.0.0 (2026-03-04)
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## 功能特性
|
|
42
|
+
|
|
43
|
+
- **PostgreSQL 存储**:采用 PostgreSQL 作为主要存储后端,支持高并发写入和高性能查询。
|
|
44
|
+
- **工具链**:提供 7 个核心子命令(collect, rank, prune, cluster, import, export, init-db)。
|
|
45
|
+
- **数据结构**:14 列扁平化数据结构,移除了复杂的 JSONB 字段。
|
|
46
|
+
- **CLI 接口**:基于 Click 框架,支持环境变量配置与 DSN 连接。
|
|
47
|
+
- **技术栈**:使用 psycopg3 驱动和异步 WebSocket 采集。
|
|
48
|
+
- **消息处理**:增强的 UTF-8 缓冲区处理,解决断包导致的乱码问题。
|
|
49
|
+
|
|
50
|
+
## 系统要求
|
|
51
|
+
|
|
52
|
+
- Python 3.12+
|
|
53
|
+
- PostgreSQL 12+
|
|
54
|
+
- [uv](https://github.com/astral-sh/uv) (推荐) 或 pip
|
|
55
|
+
|
|
56
|
+
## 安装
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
# 使用 uv (推荐)
|
|
60
|
+
uv venv
|
|
61
|
+
source .venv/bin/activate
|
|
62
|
+
uv pip install .
|
|
63
|
+
|
|
64
|
+
# 或使用 pip
|
|
65
|
+
pip install .
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## 快速开始
|
|
69
|
+
|
|
70
|
+
### 1. 设置数据库连接 (DSN)
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
export DYTOOLS_DSN="postgresql://user:pass@localhost:5432/douyu"
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### 2. 初始化数据库
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
dykit init-db
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### 3. 开始采集
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
dykit collect -r 6657
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### 4. 查看排行
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
dykit rank -r 6657 --top 20
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Service Management
|
|
95
|
+
|
|
96
|
+
### Managing Long-Running Collectors
|
|
97
|
+
`dykit` supports managing long-running collectors as `systemd --user` services. This allows background collection that persists across sessions and restarts automatically.
|
|
98
|
+
|
|
99
|
+
### Basic Workflow
|
|
100
|
+
```bash
|
|
101
|
+
# Set your database DSN (required for the service to connect)
|
|
102
|
+
export DYTOOLS_DSN="postgresql://user:pass@localhost:5432/douyu"
|
|
103
|
+
|
|
104
|
+
# Create a service for a specific room (Format: NAME:ROOM_ID)
|
|
105
|
+
dykit service create test-room:9999
|
|
106
|
+
|
|
107
|
+
# List all managed services
|
|
108
|
+
dykit service list
|
|
109
|
+
|
|
110
|
+
# Check status of a specific service
|
|
111
|
+
dykit service status test-room-9999
|
|
112
|
+
|
|
113
|
+
# View recent logs
|
|
114
|
+
dykit service logs test-room-9999 --lines 10
|
|
115
|
+
|
|
116
|
+
# Stop a running service
|
|
117
|
+
dykit service stop test-room-9999
|
|
118
|
+
|
|
119
|
+
# Get the path to the unit file
|
|
120
|
+
dykit service where test-room-9999
|
|
121
|
+
|
|
122
|
+
# Remove the service completely
|
|
123
|
+
dykit service remove test-room-9999
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### Important Notes
|
|
127
|
+
- **Persistence**: To ensure services keep running after you log out, run `loginctl enable-linger $USER`.
|
|
128
|
+
- **Storage**: Service unit files are stored in `~/.config/systemd/user/`.
|
|
129
|
+
- **Naming**: When creating a service with `NAME:ROOM_ID`, the resulting systemd unit is named `dykit-NAME-ROOM_ID.service`. Use the `NAME-ROOM_ID` part with `dykit service` commands.
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
## 命令行参考
|
|
135
|
+
|
|
136
|
+
### 数据库管理
|
|
137
|
+
|
|
138
|
+
#### init-db
|
|
139
|
+
初始化数据库表结构和索引。
|
|
140
|
+
```bash
|
|
141
|
+
dykit init-db
|
|
142
|
+
```
|
|
143
|
+
输出示例:
|
|
144
|
+
```
|
|
145
|
+
Database schema initialized successfully
|
|
146
|
+
Table: danmaku
|
|
147
|
+
Indexes: idx_danmaku_room_time, idx_danmaku_user_id, idx_danmaku_msg_type
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
#### collect
|
|
151
|
+
实时采集直播间弹幕。
|
|
152
|
+
- `-r, --room`: 直播间 ID
|
|
153
|
+
- `-v, --verbose`: 打印调试日志
|
|
154
|
+
```bash
|
|
155
|
+
dykit collect -r 6657 -v
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### 数据分析
|
|
159
|
+
|
|
160
|
+
#### rank
|
|
161
|
+
统计发送消息最多的用户或高频出现的重复弹幕。
|
|
162
|
+
- `-r, --room`: 直播间 ID
|
|
163
|
+
- `--by user|content`: 统计维度(默认 user)
|
|
164
|
+
- `--top N`: 显示前 N 名 (默认 10)
|
|
165
|
+
- `--type TYPE`: 过滤消息类型 (默认 chatmsg, 可选 dgb 等)
|
|
166
|
+
- `--user USERNAME`: 按用户名过滤数据集
|
|
167
|
+
- `--user-id USER_ID`: 按 user_id 过滤数据集
|
|
168
|
+
- `--from YYYY-MM-DD`: 起始日期
|
|
169
|
+
- `--to YYYY-MM-DD`: 结束日期(含当天)
|
|
170
|
+
- `--last N`: 仅基于最近 N 条消息进行统计
|
|
171
|
+
- `--first N`: 仅基于最早 N 条消息进行统计
|
|
172
|
+
- `-o, --output FILE`: 导出排名结果 CSV
|
|
173
|
+
- `--days N`: 统计最近 N 天的数据
|
|
174
|
+
```bash
|
|
175
|
+
# 查看最活跃的用户 (默认)
|
|
176
|
+
dykit rank -r 6657 --top 10
|
|
177
|
+
|
|
178
|
+
# 按用户统计送礼榜
|
|
179
|
+
dykit rank -r 6657 --by user --type dgb --top 5
|
|
180
|
+
|
|
181
|
+
# 查看重复弹幕
|
|
182
|
+
dykit rank -r 6657 --by content --top 10
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
#### cluster
|
|
187
|
+
使用文本相似度算法对弹幕进行聚类,识别重复模式。
|
|
188
|
+
- `--type TYPE`: 过滤消息类型 (默认 chatmsg)
|
|
189
|
+
- `--user USERNAME`: 按用户名过滤数据集
|
|
190
|
+
- `--user-id USER_ID`: 按 user_id 过滤数据集
|
|
191
|
+
- `--from YYYY-MM-DD`: 起始日期
|
|
192
|
+
- `--to YYYY-MM-DD`: 结束日期(含当天)
|
|
193
|
+
- `--last N`: 仅基于最近 N 条消息进行聚类
|
|
194
|
+
- `--first N`: 仅基于最早 N 条消息进行聚类
|
|
195
|
+
- `--days N`: 仅基于最近 N 天消息进行聚类
|
|
196
|
+
- `--threshold FLOAT`: 相似度阈值 (默认 0.6)
|
|
197
|
+
- `-o, --output FILE`: 将结果保存到 CSV 文件
|
|
198
|
+
```bash
|
|
199
|
+
dykit cluster -r 6657 --threshold 0.5 --limit 50
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
#### prune
|
|
203
|
+
清理数据库中的重复记录。
|
|
204
|
+
```bash
|
|
205
|
+
dykit prune -r 6657
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
### 导入与导出
|
|
209
|
+
|
|
210
|
+
#### import
|
|
211
|
+
将 CSV 采集文件导入到 PostgreSQL。
|
|
212
|
+
```bash
|
|
213
|
+
dykit import data.csv -r 6657
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
#### export
|
|
217
|
+
将数据库数据导出为 CSV 文件。
|
|
218
|
+
```bash
|
|
219
|
+
dykit export -r 6657 -o backup.csv
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
---
|
|
223
|
+
|
|
224
|
+
## 数据库字段
|
|
225
|
+
|
|
226
|
+
`dykit` 将所有消息存储在 `danmaku` 表中:
|
|
227
|
+
|
|
228
|
+
| 列名 | 类型 | 说明 |
|
|
229
|
+
| :--- | :--- | :--- |
|
|
230
|
+
| timestamp | TIMESTAMP | 接收时间 |
|
|
231
|
+
| room_id | TEXT | 直播间 ID |
|
|
232
|
+
| msg_type | TEXT | 消息类型 (chatmsg, dgb, uenter 等) |
|
|
233
|
+
| user_id | TEXT | 用户 UID |
|
|
234
|
+
| username | TEXT | 用户昵称 |
|
|
235
|
+
| content | TEXT | 消息内容 |
|
|
236
|
+
| user_level | INTEGER | 用户等级 |
|
|
237
|
+
| gift_id | TEXT | 礼物 ID (可选) |
|
|
238
|
+
| gift_count | INTEGER | 礼物数量 (可选) |
|
|
239
|
+
| gift_name | TEXT | 礼物名称 (可选) |
|
|
240
|
+
| badge_level| INTEGER | 粉丝牌等级 (可选) |
|
|
241
|
+
| badge_name | TEXT | 粉丝牌名称 (可选) |
|
|
242
|
+
| noble_level| INTEGER | 贵族等级 (可选) |
|
|
243
|
+
| avatar_url | TEXT | 头像 URL (可选) |
|
|
244
|
+
|
|
245
|
+
---
|
|
246
|
+
|
|
247
|
+
## Python API
|
|
248
|
+
|
|
249
|
+
```python
|
|
250
|
+
import asyncio
|
|
251
|
+
from dykit.storage import PostgreSQLStorage
|
|
252
|
+
from dykit.collectors import AsyncCollector
|
|
253
|
+
|
|
254
|
+
async def main():
|
|
255
|
+
storage = PostgreSQLStorage(
|
|
256
|
+
room_id=6657,
|
|
257
|
+
host='localhost',
|
|
258
|
+
port=5432,
|
|
259
|
+
database='douyu',
|
|
260
|
+
user='douyu',
|
|
261
|
+
password='pass'
|
|
262
|
+
)
|
|
263
|
+
|
|
264
|
+
with storage:
|
|
265
|
+
collector = AsyncCollector(6657, storage)
|
|
266
|
+
try:
|
|
267
|
+
await collector.connect()
|
|
268
|
+
except KeyboardInterrupt:
|
|
269
|
+
await collector.stop()
|
|
270
|
+
|
|
271
|
+
if __name__ == "__main__":
|
|
272
|
+
asyncio.run(main())
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
---
|
|
276
|
+
|
|
277
|
+
## 项目结构
|
|
278
|
+
|
|
279
|
+
```
|
|
280
|
+
dykit/
|
|
281
|
+
├── __main__.py # CLI 入口
|
|
282
|
+
├── types.py # 数据类定义
|
|
283
|
+
├── protocol.py # 协议解析
|
|
284
|
+
├── collectors/
|
|
285
|
+
│ └── async_.py # 异步采集器
|
|
286
|
+
├── storage/
|
|
287
|
+
│ ├── postgres.py # PostgreSQL 实现
|
|
288
|
+
│ └── csv.py # CSV 导入导出
|
|
289
|
+
└── tools/ # 分析工具
|
|
290
|
+
├── rank.py # 排行榜 (支持用户和内容双模式)
|
|
291
|
+
├── prune.py # 去重
|
|
292
|
+
└── cluster.py # 相似度聚类
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
## 常见问题
|
|
296
|
+
|
|
297
|
+
**Q: 如何配置数据库?**
|
|
298
|
+
A: 使用环境变量 `DYTOOLS_DSN` 或参数 `--dsn` 指定 PostgreSQL 连接字符串。
|
|
299
|
+
|
|
300
|
+
**Q: CSV 文件去哪了?**
|
|
301
|
+
A: v4.0.0 默认使用数据库。如果需要 CSV,请在采集后运行 `export` 命令。
|
|
302
|
+
|
|
303
|
+
**Q: 兼容旧版 CSV 吗?**
|
|
304
|
+
A: 兼容。使用 `import` 命令即可将旧版 8 列格式的数据导入数据库。
|
|
305
|
+
|
|
306
|
+
---
|
|
307
|
+
|
|
308
|
+
## TODO
|
|
309
|
+
|
|
310
|
+
- [ ] 保存更多字段 — 利用 raw_data JSONB 字段提取额外信息(如弹幕颜色、特殊标识等)
|
|
311
|
+
- [x] systemd 服务管理 — 已通过 `dykit service` 子命令支持以 systemd user service 方式后台采集(见上文 "Service Management" 一节)
|
|
312
|
+
- [x] 历史数据迁移 — 已完成 room_id 统一迁移,迁移脚本已从仓库移除
|
|
313
|
+
- [ ] construct typing 跟踪 — 关注上游 issue https://github.com/construct/construct/issues/1125 ,上游提供官方 typing/stub 后评估移除本地 `typings/construct` 临时桩
|
|
314
|
+
|
|
315
|
+
## Collector Keepalive Contract
|
|
316
|
+
|
|
317
|
+
- Do **NOT** enable `websockets` built-in keepalive (`ping_interval` / `ping_timeout`) for Douyu collection.
|
|
318
|
+
- Collector liveness policy is:
|
|
319
|
+
- protocol heartbeat: send `mrkl` every `WS_DOUYU_HEARTBEAT_SECONDS`
|
|
320
|
+
- idle detection: reconnect when no messages within `WS_READ_IDLE_TIMEOUT_SECONDS`
|
|
321
|
+
- Regression guard:
|
|
322
|
+
- `tests/test_collector_keepalive_contract.py` asserts connect kwargs keep `ping_interval=None` and `ping_timeout=None`, and asserts heartbeat loop sends `mrkl`.
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
仅供学习研究使用。
|
dykit-4.0.0/README.md
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
# dykit - 斗鱼弹幕采集与分析工具
|
|
2
|
+
|
|
3
|
+
PostgreSQL 架构,支持实时采集、数据分析和 CSV 导入导出。
|
|
4
|
+
|
|
5
|
+
v4.0.0 (2026-03-04)
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## 功能特性
|
|
10
|
+
|
|
11
|
+
- **PostgreSQL 存储**:采用 PostgreSQL 作为主要存储后端,支持高并发写入和高性能查询。
|
|
12
|
+
- **工具链**:提供 7 个核心子命令(collect, rank, prune, cluster, import, export, init-db)。
|
|
13
|
+
- **数据结构**:14 列扁平化数据结构,移除了复杂的 JSONB 字段。
|
|
14
|
+
- **CLI 接口**:基于 Click 框架,支持环境变量配置与 DSN 连接。
|
|
15
|
+
- **技术栈**:使用 psycopg3 驱动和异步 WebSocket 采集。
|
|
16
|
+
- **消息处理**:增强的 UTF-8 缓冲区处理,解决断包导致的乱码问题。
|
|
17
|
+
|
|
18
|
+
## 系统要求
|
|
19
|
+
|
|
20
|
+
- Python 3.12+
|
|
21
|
+
- PostgreSQL 12+
|
|
22
|
+
- [uv](https://github.com/astral-sh/uv) (推荐) 或 pip
|
|
23
|
+
|
|
24
|
+
## 安装
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
# 使用 uv (推荐)
|
|
28
|
+
uv venv
|
|
29
|
+
source .venv/bin/activate
|
|
30
|
+
uv pip install .
|
|
31
|
+
|
|
32
|
+
# 或使用 pip
|
|
33
|
+
pip install .
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## 快速开始
|
|
37
|
+
|
|
38
|
+
### 1. 设置数据库连接 (DSN)
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
export DYTOOLS_DSN="postgresql://user:pass@localhost:5432/douyu"
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### 2. 初始化数据库
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
dykit init-db
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### 3. 开始采集
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
dykit collect -r 6657
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### 4. 查看排行
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
dykit rank -r 6657 --top 20
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Service Management
|
|
63
|
+
|
|
64
|
+
### Managing Long-Running Collectors
|
|
65
|
+
`dykit` supports managing long-running collectors as `systemd --user` services. This allows background collection that persists across sessions and restarts automatically.
|
|
66
|
+
|
|
67
|
+
### Basic Workflow
|
|
68
|
+
```bash
|
|
69
|
+
# Set your database DSN (required for the service to connect)
|
|
70
|
+
export DYTOOLS_DSN="postgresql://user:pass@localhost:5432/douyu"
|
|
71
|
+
|
|
72
|
+
# Create a service for a specific room (Format: NAME:ROOM_ID)
|
|
73
|
+
dykit service create test-room:9999
|
|
74
|
+
|
|
75
|
+
# List all managed services
|
|
76
|
+
dykit service list
|
|
77
|
+
|
|
78
|
+
# Check status of a specific service
|
|
79
|
+
dykit service status test-room-9999
|
|
80
|
+
|
|
81
|
+
# View recent logs
|
|
82
|
+
dykit service logs test-room-9999 --lines 10
|
|
83
|
+
|
|
84
|
+
# Stop a running service
|
|
85
|
+
dykit service stop test-room-9999
|
|
86
|
+
|
|
87
|
+
# Get the path to the unit file
|
|
88
|
+
dykit service where test-room-9999
|
|
89
|
+
|
|
90
|
+
# Remove the service completely
|
|
91
|
+
dykit service remove test-room-9999
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Important Notes
|
|
95
|
+
- **Persistence**: To ensure services keep running after you log out, run `loginctl enable-linger $USER`.
|
|
96
|
+
- **Storage**: Service unit files are stored in `~/.config/systemd/user/`.
|
|
97
|
+
- **Naming**: When creating a service with `NAME:ROOM_ID`, the resulting systemd unit is named `dykit-NAME-ROOM_ID.service`. Use the `NAME-ROOM_ID` part with `dykit service` commands.
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## 命令行参考
|
|
103
|
+
|
|
104
|
+
### 数据库管理
|
|
105
|
+
|
|
106
|
+
#### init-db
|
|
107
|
+
初始化数据库表结构和索引。
|
|
108
|
+
```bash
|
|
109
|
+
dykit init-db
|
|
110
|
+
```
|
|
111
|
+
输出示例:
|
|
112
|
+
```
|
|
113
|
+
Database schema initialized successfully
|
|
114
|
+
Table: danmaku
|
|
115
|
+
Indexes: idx_danmaku_room_time, idx_danmaku_user_id, idx_danmaku_msg_type
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
#### collect
|
|
119
|
+
实时采集直播间弹幕。
|
|
120
|
+
- `-r, --room`: 直播间 ID
|
|
121
|
+
- `-v, --verbose`: 打印调试日志
|
|
122
|
+
```bash
|
|
123
|
+
dykit collect -r 6657 -v
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### 数据分析
|
|
127
|
+
|
|
128
|
+
#### rank
|
|
129
|
+
统计发送消息最多的用户或高频出现的重复弹幕。
|
|
130
|
+
- `-r, --room`: 直播间 ID
|
|
131
|
+
- `--by user|content`: 统计维度(默认 user)
|
|
132
|
+
- `--top N`: 显示前 N 名 (默认 10)
|
|
133
|
+
- `--type TYPE`: 过滤消息类型 (默认 chatmsg, 可选 dgb 等)
|
|
134
|
+
- `--user USERNAME`: 按用户名过滤数据集
|
|
135
|
+
- `--user-id USER_ID`: 按 user_id 过滤数据集
|
|
136
|
+
- `--from YYYY-MM-DD`: 起始日期
|
|
137
|
+
- `--to YYYY-MM-DD`: 结束日期(含当天)
|
|
138
|
+
- `--last N`: 仅基于最近 N 条消息进行统计
|
|
139
|
+
- `--first N`: 仅基于最早 N 条消息进行统计
|
|
140
|
+
- `-o, --output FILE`: 导出排名结果 CSV
|
|
141
|
+
- `--days N`: 统计最近 N 天的数据
|
|
142
|
+
```bash
|
|
143
|
+
# 查看最活跃的用户 (默认)
|
|
144
|
+
dykit rank -r 6657 --top 10
|
|
145
|
+
|
|
146
|
+
# 按用户统计送礼榜
|
|
147
|
+
dykit rank -r 6657 --by user --type dgb --top 5
|
|
148
|
+
|
|
149
|
+
# 查看重复弹幕
|
|
150
|
+
dykit rank -r 6657 --by content --top 10
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
#### cluster
|
|
155
|
+
使用文本相似度算法对弹幕进行聚类,识别重复模式。
|
|
156
|
+
- `--type TYPE`: 过滤消息类型 (默认 chatmsg)
|
|
157
|
+
- `--user USERNAME`: 按用户名过滤数据集
|
|
158
|
+
- `--user-id USER_ID`: 按 user_id 过滤数据集
|
|
159
|
+
- `--from YYYY-MM-DD`: 起始日期
|
|
160
|
+
- `--to YYYY-MM-DD`: 结束日期(含当天)
|
|
161
|
+
- `--last N`: 仅基于最近 N 条消息进行聚类
|
|
162
|
+
- `--first N`: 仅基于最早 N 条消息进行聚类
|
|
163
|
+
- `--days N`: 仅基于最近 N 天消息进行聚类
|
|
164
|
+
- `--threshold FLOAT`: 相似度阈值 (默认 0.6)
|
|
165
|
+
- `-o, --output FILE`: 将结果保存到 CSV 文件
|
|
166
|
+
```bash
|
|
167
|
+
dykit cluster -r 6657 --threshold 0.5 --limit 50
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
#### prune
|
|
171
|
+
清理数据库中的重复记录。
|
|
172
|
+
```bash
|
|
173
|
+
dykit prune -r 6657
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
### 导入与导出
|
|
177
|
+
|
|
178
|
+
#### import
|
|
179
|
+
将 CSV 采集文件导入到 PostgreSQL。
|
|
180
|
+
```bash
|
|
181
|
+
dykit import data.csv -r 6657
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
#### export
|
|
185
|
+
将数据库数据导出为 CSV 文件。
|
|
186
|
+
```bash
|
|
187
|
+
dykit export -r 6657 -o backup.csv
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
---
|
|
191
|
+
|
|
192
|
+
## 数据库字段
|
|
193
|
+
|
|
194
|
+
`dykit` 将所有消息存储在 `danmaku` 表中:
|
|
195
|
+
|
|
196
|
+
| 列名 | 类型 | 说明 |
|
|
197
|
+
| :--- | :--- | :--- |
|
|
198
|
+
| timestamp | TIMESTAMP | 接收时间 |
|
|
199
|
+
| room_id | TEXT | 直播间 ID |
|
|
200
|
+
| msg_type | TEXT | 消息类型 (chatmsg, dgb, uenter 等) |
|
|
201
|
+
| user_id | TEXT | 用户 UID |
|
|
202
|
+
| username | TEXT | 用户昵称 |
|
|
203
|
+
| content | TEXT | 消息内容 |
|
|
204
|
+
| user_level | INTEGER | 用户等级 |
|
|
205
|
+
| gift_id | TEXT | 礼物 ID (可选) |
|
|
206
|
+
| gift_count | INTEGER | 礼物数量 (可选) |
|
|
207
|
+
| gift_name | TEXT | 礼物名称 (可选) |
|
|
208
|
+
| badge_level| INTEGER | 粉丝牌等级 (可选) |
|
|
209
|
+
| badge_name | TEXT | 粉丝牌名称 (可选) |
|
|
210
|
+
| noble_level| INTEGER | 贵族等级 (可选) |
|
|
211
|
+
| avatar_url | TEXT | 头像 URL (可选) |
|
|
212
|
+
|
|
213
|
+
---
|
|
214
|
+
|
|
215
|
+
## Python API
|
|
216
|
+
|
|
217
|
+
```python
|
|
218
|
+
import asyncio
|
|
219
|
+
from dykit.storage import PostgreSQLStorage
|
|
220
|
+
from dykit.collectors import AsyncCollector
|
|
221
|
+
|
|
222
|
+
async def main():
|
|
223
|
+
storage = PostgreSQLStorage(
|
|
224
|
+
room_id=6657,
|
|
225
|
+
host='localhost',
|
|
226
|
+
port=5432,
|
|
227
|
+
database='douyu',
|
|
228
|
+
user='douyu',
|
|
229
|
+
password='pass'
|
|
230
|
+
)
|
|
231
|
+
|
|
232
|
+
with storage:
|
|
233
|
+
collector = AsyncCollector(6657, storage)
|
|
234
|
+
try:
|
|
235
|
+
await collector.connect()
|
|
236
|
+
except KeyboardInterrupt:
|
|
237
|
+
await collector.stop()
|
|
238
|
+
|
|
239
|
+
if __name__ == "__main__":
|
|
240
|
+
asyncio.run(main())
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
---
|
|
244
|
+
|
|
245
|
+
## 项目结构
|
|
246
|
+
|
|
247
|
+
```
|
|
248
|
+
dykit/
|
|
249
|
+
├── __main__.py # CLI 入口
|
|
250
|
+
├── types.py # 数据类定义
|
|
251
|
+
├── protocol.py # 协议解析
|
|
252
|
+
├── collectors/
|
|
253
|
+
│ └── async_.py # 异步采集器
|
|
254
|
+
├── storage/
|
|
255
|
+
│ ├── postgres.py # PostgreSQL 实现
|
|
256
|
+
│ └── csv.py # CSV 导入导出
|
|
257
|
+
└── tools/ # 分析工具
|
|
258
|
+
├── rank.py # 排行榜 (支持用户和内容双模式)
|
|
259
|
+
├── prune.py # 去重
|
|
260
|
+
└── cluster.py # 相似度聚类
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
## 常见问题
|
|
264
|
+
|
|
265
|
+
**Q: 如何配置数据库?**
|
|
266
|
+
A: 使用环境变量 `DYTOOLS_DSN` 或参数 `--dsn` 指定 PostgreSQL 连接字符串。
|
|
267
|
+
|
|
268
|
+
**Q: CSV 文件去哪了?**
|
|
269
|
+
A: v4.0.0 默认使用数据库。如果需要 CSV,请在采集后运行 `export` 命令。
|
|
270
|
+
|
|
271
|
+
**Q: 兼容旧版 CSV 吗?**
|
|
272
|
+
A: 兼容。使用 `import` 命令即可将旧版 8 列格式的数据导入数据库。
|
|
273
|
+
|
|
274
|
+
---
|
|
275
|
+
|
|
276
|
+
## TODO
|
|
277
|
+
|
|
278
|
+
- [ ] 保存更多字段 — 利用 raw_data JSONB 字段提取额外信息(如弹幕颜色、特殊标识等)
|
|
279
|
+
- [x] systemd 服务管理 — 已通过 `dykit service` 子命令支持以 systemd user service 方式后台采集(见上文 "Service Management" 一节)
|
|
280
|
+
- [x] 历史数据迁移 — 已完成 room_id 统一迁移,迁移脚本已从仓库移除
|
|
281
|
+
- [ ] construct typing 跟踪 — 关注上游 issue https://github.com/construct/construct/issues/1125 ,上游提供官方 typing/stub 后评估移除本地 `typings/construct` 临时桩
|
|
282
|
+
|
|
283
|
+
## Collector Keepalive Contract
|
|
284
|
+
|
|
285
|
+
- Do **NOT** enable `websockets` built-in keepalive (`ping_interval` / `ping_timeout`) for Douyu collection.
|
|
286
|
+
- Collector liveness policy is:
|
|
287
|
+
- protocol heartbeat: send `mrkl` every `WS_DOUYU_HEARTBEAT_SECONDS`
|
|
288
|
+
- idle detection: reconnect when no messages within `WS_READ_IDLE_TIMEOUT_SECONDS`
|
|
289
|
+
- Regression guard:
|
|
290
|
+
- `tests/test_collector_keepalive_contract.py` asserts connect kwargs keep `ping_interval=None` and `ping_timeout=None`, and asserts heartbeat loop sends `mrkl`.
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
仅供学习研究使用。
|
dykit-4.0.0/dykit/__init__.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Douyu Live Stream Danmu (弹幕) Collector.
|
|
2
|
+
|
|
3
|
+
A modular, async library for collecting chat messages from Douyu live streams.
|
|
4
|
+
|
|
5
|
+
Features:
|
|
6
|
+
- Message buffering to prevent UTF-8 truncation
|
|
7
|
+
- Async collectors
|
|
8
|
+
- Pluggable storage handlers
|
|
9
|
+
- Type-safe dataclasses
|
|
10
|
+
Basic usage:
|
|
11
|
+
from dykit import DanmuMessage, encode_message, serialize_message
|
|
12
|
+
|
|
13
|
+
# Serialize a message to Douyu key-value format
|
|
14
|
+
msg = {"type": "chatmsg", "content": "Hello"}
|
|
15
|
+
serialized = serialize_message(msg)
|
|
16
|
+
|
|
17
|
+
# Encode to binary protocol
|
|
18
|
+
encoded = encode_message(serialized)
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from .collectors import AsyncCollector
|
|
24
|
+
|
|
25
|
+
# Import public API from submodules
|
|
26
|
+
from .protocol import (
|
|
27
|
+
CLIENT_MSG_TYPE,
|
|
28
|
+
DOUYU_WS_URL,
|
|
29
|
+
SERVER_MSG_TYPE,
|
|
30
|
+
decode_message,
|
|
31
|
+
deserialize_message,
|
|
32
|
+
encode_message,
|
|
33
|
+
serialize_message,
|
|
34
|
+
)
|
|
35
|
+
from .storage import ConsoleStorage, CSVStorage, StorageHandler
|
|
36
|
+
from .types import DanmuMessage, MessageType
|
|
37
|
+
|
|
38
|
+
__version__ = "4.0.0"
|
|
39
|
+
|
|
40
|
+
__all__ = [
|
|
41
|
+
# Version
|
|
42
|
+
"__version__",
|
|
43
|
+
# Protocol functions
|
|
44
|
+
"serialize_message",
|
|
45
|
+
"deserialize_message",
|
|
46
|
+
"encode_message",
|
|
47
|
+
"decode_message",
|
|
48
|
+
# Protocol constants
|
|
49
|
+
"DOUYU_WS_URL",
|
|
50
|
+
"CLIENT_MSG_TYPE",
|
|
51
|
+
"SERVER_MSG_TYPE",
|
|
52
|
+
# Type definitions
|
|
53
|
+
"DanmuMessage",
|
|
54
|
+
"MessageType",
|
|
55
|
+
# Collectors
|
|
56
|
+
"AsyncCollector",
|
|
57
|
+
# Storage
|
|
58
|
+
"StorageHandler",
|
|
59
|
+
"CSVStorage",
|
|
60
|
+
"ConsoleStorage",
|
|
61
|
+
]
|