knowlyr-datacheck 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- knowlyr_datacheck-0.2.0/.claude/settings.local.json +23 -0
- knowlyr_datacheck-0.2.0/.github/workflows/ci.yml +33 -0
- knowlyr_datacheck-0.2.0/.gitignore +52 -0
- knowlyr_datacheck-0.2.0/LICENSE +21 -0
- knowlyr_datacheck-0.2.0/PKG-INFO +527 -0
- knowlyr_datacheck-0.2.0/README.md +478 -0
- knowlyr_datacheck-0.2.0/pyproject.toml +59 -0
- knowlyr_datacheck-0.2.0/src/datacheck/__init__.py +19 -0
- knowlyr_datacheck-0.2.0/src/datacheck/checker.py +494 -0
- knowlyr_datacheck-0.2.0/src/datacheck/cli.py +274 -0
- knowlyr_datacheck-0.2.0/src/datacheck/mcp_server.py +306 -0
- knowlyr_datacheck-0.2.0/src/datacheck/report.py +300 -0
- knowlyr_datacheck-0.2.0/src/datacheck/rules.py +457 -0
- knowlyr_datacheck-0.2.0/src/datacheck/text_rules.py +104 -0
- knowlyr_datacheck-0.2.0/tests/__init__.py +1 -0
- knowlyr_datacheck-0.2.0/tests/test_checker.py +252 -0
- knowlyr_datacheck-0.2.0/tests/test_cli.py +95 -0
- knowlyr_datacheck-0.2.0/tests/test_report.py +118 -0
- knowlyr_datacheck-0.2.0/tests/test_text_rules.py +155 -0
- knowlyr_datacheck-0.2.0/uv.lock +1378 -0
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"Bash(python -m pytest:*)",
|
|
5
|
+
"Bash(wc:*)",
|
|
6
|
+
"Bash(.venv/bin/python -m pytest:*)",
|
|
7
|
+
"Bash(.venv/bin/pip install PyYAML)",
|
|
8
|
+
"Bash(.venv/bin/python -m pip install PyYAML)",
|
|
9
|
+
"Bash(uv pip install:*)",
|
|
10
|
+
"Bash(.venv/bin/ruff check:*)",
|
|
11
|
+
"Bash(git -C /Users/liukai/data-check status)",
|
|
12
|
+
"Bash(git -C /Users/liukai/data-check diff)",
|
|
13
|
+
"Bash(git -C /Users/liukai/data-check log --oneline -5)",
|
|
14
|
+
"Bash(git -C /Users/liukai/data-check add pyproject.toml src/datacheck/checker.py src/datacheck/cli.py src/datacheck/mcp_server.py src/datacheck/report.py src/datacheck/rules.py src/datacheck/text_rules.py tests/test_checker.py tests/test_cli.py tests/test_text_rules.py)",
|
|
15
|
+
"Bash(git -C /Users/liukai/data-check commit -m \"$\\(cat <<''EOF''\nfeat: 添加 JSONL/CSV 支持、采样模式、YAML 规则配置、文本质量检测、CI 阈值\n\n- JSONL/CSV 格式支持: 按文件扩展名自动检测并加载 .jsonl/.csv/.json\n- 采样检查模式: --sample N / --sample-rate 0.1 随机抽样大数据集\n- 自定义规则配置: --rules-file 加载 YAML 规则文件 \\(支持 required/min_length/max_length/regex/enum\\)\n- 文本质量检测: PII 隐私信息、乱码、重复文本检测 + n-gram 近似重复检测\n- CI 集成: --threshold / --strict 可配置退出码阈值\n\nGenerated with [Claude Code]\\(https://claude.ai/code\\)\nvia [Happy]\\(https://happy.engineering\\)\nEOF\n\\)\")",
|
|
16
|
+
"Bash(git -C /Users/liukai/data-check push)",
|
|
17
|
+
"Bash(git -C /Users/liukai/data-check add:*)",
|
|
18
|
+
"Bash(git -C /Users/liukai/data-check commit -m \"$\\(cat <<''EOF''\ndocs: 更新 README 添加新功能说明\n\n- 检查项目表格新增 PII、乱码、重复文本检测\n- 安装说明新增 yaml 可选依赖\n- CLI 示例新增 JSONL/CSV、采样、阈值用法\n- 内置规则表格补充 3 条文本质量规则\n- 自定义规则章节改为 YAML 配置方式\n- 命令参考新增 --rules-file/--sample/--threshold/--strict\n- API 示例更新为正确的 SDK 用法\n- 项目架构新增 text_rules.py\n\nGenerated with [Claude Code]\\(https://claude.ai/code\\)\nvia [Happy]\\(https://happy.engineering\\)\nEOF\n\\)\")",
|
|
19
|
+
"Bash(uv pip list:*)",
|
|
20
|
+
"Bash(.venv/bin/python:*)"
|
|
21
|
+
]
|
|
22
|
+
}
|
|
23
|
+
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: ${{ matrix.python-version }}
|
|
23
|
+
|
|
24
|
+
- name: Install dependencies
|
|
25
|
+
run: |
|
|
26
|
+
python -m pip install --upgrade pip
|
|
27
|
+
pip install -e ".[dev]"
|
|
28
|
+
|
|
29
|
+
- name: Lint with ruff
|
|
30
|
+
run: ruff check .
|
|
31
|
+
|
|
32
|
+
- name: Run tests
|
|
33
|
+
run: pytest
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# Distribution / packaging
|
|
7
|
+
build/
|
|
8
|
+
dist/
|
|
9
|
+
*.egg-info/
|
|
10
|
+
*.egg
|
|
11
|
+
|
|
12
|
+
# Virtual environments
|
|
13
|
+
.venv/
|
|
14
|
+
venv/
|
|
15
|
+
ENV/
|
|
16
|
+
|
|
17
|
+
# IDE
|
|
18
|
+
.idea/
|
|
19
|
+
.vscode/
|
|
20
|
+
*.swp
|
|
21
|
+
*.swo
|
|
22
|
+
|
|
23
|
+
# Testing
|
|
24
|
+
.pytest_cache/
|
|
25
|
+
.coverage
|
|
26
|
+
htmlcov/
|
|
27
|
+
|
|
28
|
+
# Reports
|
|
29
|
+
*_report.md
|
|
30
|
+
*_report.json
|
|
31
|
+
|
|
32
|
+
# Environment
|
|
33
|
+
.env
|
|
34
|
+
.env.local
|
|
35
|
+
|
|
36
|
+
# OS
|
|
37
|
+
.DS_Store
|
|
38
|
+
Thumbs.db
|
|
39
|
+
|
|
40
|
+
# ===================
|
|
41
|
+
# Supplemented common patterns
|
|
42
|
+
# ===================
|
|
43
|
+
|
|
44
|
+
# Python wheels
|
|
45
|
+
*.whl
|
|
46
|
+
|
|
47
|
+
# Virtual environments
|
|
48
|
+
env/
|
|
49
|
+
|
|
50
|
+
# Linting / type checking
|
|
51
|
+
.mypy_cache/
|
|
52
|
+
.ruff_cache/
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Liu Kai
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,527 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: knowlyr-datacheck
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Data quality inspection toolkit - automated validation, anomaly detection, and distribution analysis
|
|
5
|
+
Project-URL: Homepage, https://github.com/liuxiaotong/data-check
|
|
6
|
+
Project-URL: Documentation, https://github.com/liuxiaotong/data-check#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/liuxiaotong/data-check
|
|
8
|
+
Project-URL: Issues, https://github.com/liuxiaotong/data-check/issues
|
|
9
|
+
Author-email: Liu Kai <mrliukai@gmail.com>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: ai,anomaly-detection,data-inspection,data-quality,machine-learning,training-data,validation
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
|
+
Requires-Python: >=3.10
|
|
24
|
+
Requires-Dist: click>=8.0
|
|
25
|
+
Requires-Dist: pydantic>=2.0
|
|
26
|
+
Provides-Extra: all
|
|
27
|
+
Requires-Dist: anthropic>=0.18; extra == 'all'
|
|
28
|
+
Requires-Dist: mcp>=1.0; extra == 'all'
|
|
29
|
+
Requires-Dist: numpy>=1.20; extra == 'all'
|
|
30
|
+
Requires-Dist: openai>=1.0; extra == 'all'
|
|
31
|
+
Requires-Dist: pytest; extra == 'all'
|
|
32
|
+
Requires-Dist: pyyaml>=6.0; extra == 'all'
|
|
33
|
+
Requires-Dist: ruff; extra == 'all'
|
|
34
|
+
Requires-Dist: scipy>=1.7; extra == 'all'
|
|
35
|
+
Provides-Extra: dev
|
|
36
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
37
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
38
|
+
Provides-Extra: llm
|
|
39
|
+
Requires-Dist: anthropic>=0.18; extra == 'llm'
|
|
40
|
+
Requires-Dist: openai>=1.0; extra == 'llm'
|
|
41
|
+
Provides-Extra: mcp
|
|
42
|
+
Requires-Dist: mcp>=1.0; extra == 'mcp'
|
|
43
|
+
Provides-Extra: stats
|
|
44
|
+
Requires-Dist: numpy>=1.20; extra == 'stats'
|
|
45
|
+
Requires-Dist: scipy>=1.7; extra == 'stats'
|
|
46
|
+
Provides-Extra: yaml
|
|
47
|
+
Requires-Dist: pyyaml>=6.0; extra == 'yaml'
|
|
48
|
+
Description-Content-Type: text/markdown
|
|
49
|
+
|
|
50
|
+
<div align="center">
|
|
51
|
+
|
|
52
|
+
# DataCheck
|
|
53
|
+
|
|
54
|
+
**数据质检工具 — 自动化质量检查、异常检测、分布分析**
|
|
55
|
+
**Automated quality checks, anomaly detection & distribution analysis for LLM datasets**
|
|
56
|
+
|
|
57
|
+
[![PyPI](https://img.shields.io/pypi/v/knowlyr-datacheck?color=blue)](https://pypi.org/project/knowlyr-datacheck/)
|
|
58
|
+
[![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
|
|
59
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
|
|
60
|
+
[![MCP](https://img.shields.io/badge/MCP-4_Tools-purple.svg)](#mcp-server)
|
|
61
|
+
|
|
62
|
+
[快速开始](#快速开始--quick-start) · [质量规则](#质量规则--quality-rules) · [分布分析](#分布分析--distribution-analysis) · [MCP Server](#mcp-server) · [Data Pipeline 生态](#data-pipeline-生态)
|
|
63
|
+
|
|
64
|
+
</div>
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
**GitHub Topics**: `data-quality`, `anomaly-detection`, `data-validation`, `mcp`, `ai-data-pipeline`
|
|
69
|
+
|
|
70
|
+
自动化数据质量检查,支持规则验证、重复检测、分布分析,生成可读的质量报告。
|
|
71
|
+
|
|
72
|
+
## 核心能力 / Core Capabilities
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
数据文件 + Schema → 规则检查 → 异常检测 → 分布分析 → 质量报告
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### 质量仪表盘预览 / Sample Dashboard
|
|
79
|
+
|
|
80
|
+
```
|
|
81
|
+
┌───────────────┬──────────────┬────────────┐
|
|
82
|
+
│ 通过率 92% │ 评级 🟢 优秀 │ 错误 8 条 │
|
|
83
|
+
├───────────────┴──────────────┴────────────┤
|
|
84
|
+
│ ⚠ required_fields : 3 │ ⚠ duplicate_rows : 2│
|
|
85
|
+
│ 🔍 语言分布: zh 68% / en 32% │
|
|
86
|
+
└────────────────────────────────────────────┘
|
|
87
|
+
|
|
88
|
+
```
|
|
89
|
+
完整示例: `examples/reports/demo_quality_report.md`
|
|
90
|
+
|
|
91
|
+
### 检查项目 / Checks
|
|
92
|
+
|
|
93
|
+
| 检查类型 | 说明 |
|
|
94
|
+
|----------|------|
|
|
95
|
+
| 🔴 **必填字段** | 检查是否包含所有必填字段 |
|
|
96
|
+
| 🔴 **非空检查** | 检查关键字段是否为空 |
|
|
97
|
+
| 🔴 **格式检查** | 检查数据类型是否正确 |
|
|
98
|
+
| 🟡 **长度边界** | 检查文本长度是否合理 |
|
|
99
|
+
| 🟡 **重复检测** | 检测完全重复 + 近似重复 (n-gram Jaccard) |
|
|
100
|
+
| 🟡 **隐私信息 (PII)** | 检测邮箱、手机号、身份证号 |
|
|
101
|
+
| 🟡 **乱码检测** | 检测乱码、异常字符、编码错误 |
|
|
102
|
+
| 🟡 **重复文本** | 检测文本内过度重复内容 |
|
|
103
|
+
| 🔵 **语言一致性** | 检查文本语言是否一致 |
|
|
104
|
+
|
|
105
|
+
### 质量评级 / Rating
|
|
106
|
+
|
|
107
|
+
| 通过率 | 评级 | 建议 |
|
|
108
|
+
|--------|------|------|
|
|
109
|
+
| ≥90% | 🟢 优秀 | 可直接使用 |
|
|
110
|
+
| ≥70% | 🟡 良好 | 建议修复警告 |
|
|
111
|
+
| ≥50% | 🟠 一般 | 需要处理错误 |
|
|
112
|
+
| <50% | 🔴 需改进 | 严重质量问题 |
|
|
113
|
+
|
|
114
|
+
## 安装 / Installation
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
pip install knowlyr-datacheck
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
可选依赖:
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
pip install "knowlyr-datacheck[stats]" # 统计分析 (numpy, scipy)
|
|
124
|
+
pip install "knowlyr-datacheck[mcp]" # MCP 服务器
|
|
125
|
+
pip install "knowlyr-datacheck[yaml]" # YAML 规则配置
|
|
126
|
+
pip install "knowlyr-datacheck[all]" # 全部功能
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## 快速开始 / Quick Start
|
|
130
|
+
|
|
131
|
+
### 检查数据文件 / CLI
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
# 基础检查 (支持 JSON / JSONL / CSV)
|
|
135
|
+
knowlyr-datacheck check data.json
|
|
136
|
+
knowlyr-datacheck check data.jsonl
|
|
137
|
+
knowlyr-datacheck check data.csv
|
|
138
|
+
|
|
139
|
+
# 指定 Schema
|
|
140
|
+
knowlyr-datacheck check data.json -s schema.json
|
|
141
|
+
|
|
142
|
+
# 输出报告
|
|
143
|
+
knowlyr-datacheck check data.json -o report.md
|
|
144
|
+
|
|
145
|
+
# 采样检查 (大数据集)
|
|
146
|
+
knowlyr-datacheck check data.jsonl --sample 1000
|
|
147
|
+
knowlyr-datacheck check data.jsonl --sample-rate 0.1
|
|
148
|
+
|
|
149
|
+
# CI 集成: 自定义阈值
|
|
150
|
+
knowlyr-datacheck check data.json --threshold 0.9
|
|
151
|
+
knowlyr-datacheck check data.json --strict
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### 在 Python 中接入 / Python SDK
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
from datacheck import DataChecker, QualityReport
|
|
158
|
+
|
|
159
|
+
checker = DataChecker()
|
|
160
|
+
result = checker.check_file("data.json", schema_path="schema.json")
|
|
161
|
+
|
|
162
|
+
report = QualityReport(result)
|
|
163
|
+
report.print_summary()
|
|
164
|
+
report.save("./report.md")
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
<details>
|
|
168
|
+
<summary>输出示例</summary>
|
|
169
|
+
|
|
170
|
+
```
|
|
171
|
+
正在检查 data.json...
|
|
172
|
+
|
|
173
|
+
==================================================
|
|
174
|
+
数据质量检查结果
|
|
175
|
+
==================================================
|
|
176
|
+
总样本: 100
|
|
177
|
+
通过: 92
|
|
178
|
+
失败: 8
|
|
179
|
+
通过率: 92.0%
|
|
180
|
+
评级: 🟢 优秀
|
|
181
|
+
==================================================
|
|
182
|
+
|
|
183
|
+
🟡 警告: 3
|
|
184
|
+
⚠️ 重复: 2 组
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
</details>
|
|
188
|
+
|
|
189
|
+
### 使用 DataRecipe 分析结果验证 / Validate DataRecipe Outputs
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
# 验证合成数据
|
|
193
|
+
knowlyr-datacheck validate ./analysis_output/my_dataset/
|
|
194
|
+
|
|
195
|
+
# 验证指定文件
|
|
196
|
+
knowlyr-datacheck validate ./analysis_output/my_dataset/ -d custom_data.json
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
<details>
|
|
200
|
+
<summary>输出示例</summary>
|
|
201
|
+
|
|
202
|
+
```
|
|
203
|
+
正在验证 ./analysis_output/my_dataset/...
|
|
204
|
+
✓ 报告已保存: ./analysis_output/my_dataset/12_质检报告/quality_report.md
|
|
205
|
+
|
|
206
|
+
==================================================
|
|
207
|
+
数据质量检查结果
|
|
208
|
+
==================================================
|
|
209
|
+
总样本: 1000
|
|
210
|
+
通过: 956
|
|
211
|
+
失败: 44
|
|
212
|
+
通过率: 95.6%
|
|
213
|
+
评级: 🟢 优秀
|
|
214
|
+
==================================================
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
</details>
|
|
218
|
+
|
|
219
|
+
---
|
|
220
|
+
|
|
221
|
+
## 质量规则 / Quality Rules
|
|
222
|
+
|
|
223
|
+
### 内置规则 / Built-in Rules
|
|
224
|
+
|
|
225
|
+
```bash
|
|
226
|
+
# 查看所有规则
|
|
227
|
+
knowlyr-datacheck rules
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
| 规则 ID | 名称 | 级别 | 说明 |
|
|
231
|
+
|---------|------|------|------|
|
|
232
|
+
| `required_fields` | 必填字段检查 | 🔴 错误 | 检查必填字段是否存在 |
|
|
233
|
+
| `non_empty` | 非空检查 | 🔴 错误 | 检查关键字段是否为空 |
|
|
234
|
+
| `format_valid` | 格式检查 | 🔴 错误 | 检查数据类型是否正确 |
|
|
235
|
+
| `score_valid` | 评分有效性 | 🔴 错误 | 检查评分是否在有效范围 |
|
|
236
|
+
| `length_bounds` | 长度边界检查 | 🟡 警告 | 检查文本长度范围 |
|
|
237
|
+
| `pii_detection` | 隐私信息检测 | 🟡 警告 | 检测邮箱、手机号、身份证号 |
|
|
238
|
+
| `garbled_text` | 乱码检测 | 🟡 警告 | 检测乱码、异常字符 |
|
|
239
|
+
| `repetitive_text` | 重复文本检测 | 🟡 警告 | 检测文本内过度重复 |
|
|
240
|
+
| `language_consistency` | 语言一致性 | 🔵 提示 | 检查语言是否一致 |
|
|
241
|
+
|
|
242
|
+
### 预设规则集 / Rule Packs
|
|
243
|
+
|
|
244
|
+
```bash
|
|
245
|
+
# 使用 SFT 数据规则集
|
|
246
|
+
knowlyr-datacheck check data.json --ruleset sft
|
|
247
|
+
|
|
248
|
+
# 使用偏好数据规则集
|
|
249
|
+
knowlyr-datacheck check data.json --ruleset preference
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
| 规则集 | 说明 |
|
|
253
|
+
|--------|------|
|
|
254
|
+
| `default` | 通用规则 |
|
|
255
|
+
| `sft` | SFT 数据专用规则 (指令质量、回复质量) |
|
|
256
|
+
| `preference` | 偏好数据专用规则 (chosen/rejected 差异) |
|
|
257
|
+
|
|
258
|
+
### 自定义规则配置 / Custom Rules (YAML)
|
|
259
|
+
|
|
260
|
+
通过 YAML 配置文件定义自定义规则,无需写 Python 代码:
|
|
261
|
+
|
|
262
|
+
```yaml
|
|
263
|
+
# rules.yaml
|
|
264
|
+
rules:
|
|
265
|
+
- field: instruction
|
|
266
|
+
check: min_length
|
|
267
|
+
value: 10
|
|
268
|
+
severity: error
|
|
269
|
+
|
|
270
|
+
- field: response
|
|
271
|
+
check: max_length
|
|
272
|
+
value: 10000
|
|
273
|
+
severity: warning
|
|
274
|
+
|
|
275
|
+
- field: category
|
|
276
|
+
check: enum
|
|
277
|
+
values: ["qa", "chat", "code", "math"]
|
|
278
|
+
severity: error
|
|
279
|
+
|
|
280
|
+
- field: instruction
|
|
281
|
+
check: regex
|
|
282
|
+
pattern: "^[A-Z\u4e00-\u9fff]"
|
|
283
|
+
severity: info
|
|
284
|
+
message: "指令应以大写字母或中文开头"
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
```bash
|
|
288
|
+
# 使用自定义规则
|
|
289
|
+
knowlyr-datacheck check data.json --rules-file rules.yaml
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
支持的检查类型:`required`、`non_empty`、`min_length`、`max_length`、`regex`、`enum`
|
|
293
|
+
|
|
294
|
+
> 需要安装 YAML 支持:`pip install "knowlyr-datacheck[yaml]"`
|
|
295
|
+
|
|
296
|
+
---
|
|
297
|
+
|
|
298
|
+
## 分布分析 / Distribution Analysis
|
|
299
|
+
|
|
300
|
+
### 对比多个数据文件
|
|
301
|
+
|
|
302
|
+
```bash
|
|
303
|
+
knowlyr-datacheck compare seed.json synthetic.json -o comparison.md
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
<details>
|
|
307
|
+
<summary>输出示例</summary>
|
|
308
|
+
|
|
309
|
+
```markdown
|
|
310
|
+
# 数据分布对比报告
|
|
311
|
+
|
|
312
|
+
## 文件概要
|
|
313
|
+
|
|
314
|
+
| 文件 | 样本数 |
|
|
315
|
+
|------|--------|
|
|
316
|
+
| seed.json | 50 |
|
|
317
|
+
| synthetic.json | 1000 |
|
|
318
|
+
|
|
319
|
+
## 字段对比
|
|
320
|
+
|
|
321
|
+
### instruction
|
|
322
|
+
- **seed.json**: 长度 15-200 (平均 68)
|
|
323
|
+
- **synthetic.json**: 长度 12-198 (平均 72)
|
|
324
|
+
|
|
325
|
+
### response
|
|
326
|
+
- **seed.json**: 长度 50-800 (平均 245)
|
|
327
|
+
- **synthetic.json**: 长度 45-820 (平均 251)
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
</details>
|
|
331
|
+
|
|
332
|
+
### 分析内容
|
|
333
|
+
|
|
334
|
+
- **长度统计**: 最小值、最大值、平均值
|
|
335
|
+
- **唯一值比例**: 检测多样性
|
|
336
|
+
- **值分布**: 数值型字段的分布情况
|
|
337
|
+
- **参考对比**: 与种子数据的分布差异
|
|
338
|
+
|
|
339
|
+
---
|
|
340
|
+
|
|
341
|
+
## MCP Server
|
|
342
|
+
|
|
343
|
+
在 Claude Desktop / Claude Code 中直接使用。
|
|
344
|
+
|
|
345
|
+
### 配置
|
|
346
|
+
|
|
347
|
+
添加到 `~/Library/Application Support/Claude/claude_desktop_config.json`:
|
|
348
|
+
|
|
349
|
+
```json
|
|
350
|
+
{
|
|
351
|
+
"mcpServers": {
|
|
352
|
+
"knowlyr-datacheck": {
|
|
353
|
+
"command": "uv",
|
|
354
|
+
"args": ["--directory", "/path/to/data-check", "run", "python", "-m", "datacheck.mcp_server"]
|
|
355
|
+
}
|
|
356
|
+
}
|
|
357
|
+
}
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
### 可用工具
|
|
361
|
+
|
|
362
|
+
| 工具 | 功能 |
|
|
363
|
+
|------|------|
|
|
364
|
+
| `check_data_quality` | 检查数据文件质量 |
|
|
365
|
+
| `validate_from_datarecipe` | 使用 DataRecipe 分析结果验证 |
|
|
366
|
+
| `compare_distributions` | 对比多个数据文件分布 |
|
|
367
|
+
| `list_quality_rules` | 列出所有质量检查规则 |
|
|
368
|
+
|
|
369
|
+
### 使用示例
|
|
370
|
+
|
|
371
|
+
```
|
|
372
|
+
用户: 帮我检查 ./output/synthetic.json 的质量
|
|
373
|
+
|
|
374
|
+
Claude: [调用 check_data_quality]
|
|
375
|
+
|
|
376
|
+
## 数据质量检查结果
|
|
377
|
+
|
|
378
|
+
- 通过率: **95.6%**
|
|
379
|
+
- 评级: **🟢 优秀**
|
|
380
|
+
- 错误: 0, 警告: 44
|
|
381
|
+
|
|
382
|
+
发现 2 组重复数据
|
|
383
|
+
```
|
|
384
|
+
|
|
385
|
+
---
|
|
386
|
+
|
|
387
|
+
## Data Pipeline 生态
|
|
388
|
+
|
|
389
|
+
DataCheck 是 Data Pipeline 生态的质检组件:
|
|
390
|
+
|
|
391
|
+
```mermaid
|
|
392
|
+
graph LR
|
|
393
|
+
Radar["🔍 Radar<br/>情报发现"] --> Recipe["📋 Recipe<br/>逆向分析"]
|
|
394
|
+
Recipe --> Synth["🔄 Synth<br/>数据合成"]
|
|
395
|
+
Recipe --> Label["🏷️ Label<br/>数据标注"]
|
|
396
|
+
Synth --> Check["✅ Check<br/>数据质检"]
|
|
397
|
+
Label --> Check
|
|
398
|
+
Check --> Audit["🔬 Audit<br/>模型审计"]
|
|
399
|
+
Audit --> Hub["🎯 Hub<br/>编排层"]
|
|
400
|
+
Hub --> Sandbox["📦 Sandbox<br/>执行沙箱"]
|
|
401
|
+
Sandbox --> Recorder["📹 Recorder<br/>轨迹录制"]
|
|
402
|
+
Recorder --> Reward["⭐ Reward<br/>过程打分"]
|
|
403
|
+
style Check fill:#0969da,color:#fff,stroke:#0969da
|
|
404
|
+
```
|
|
405
|
+
|
|
406
|
+
### 生态项目
|
|
407
|
+
|
|
408
|
+
| 层 | 项目 | PyPI 包 | 说明 | 仓库 |
|
|
409
|
+
|---|---|---|---|---|
|
|
410
|
+
| 情报 | **AI Dataset Radar** | knowlyr-radar | 数据集竞争情报、趋势分析 | [GitHub](https://github.com/liuxiaotong/ai-dataset-radar) |
|
|
411
|
+
| 分析 | **DataRecipe** | knowlyr-datarecipe | 逆向分析、Schema 提取、成本估算 | [GitHub](https://github.com/liuxiaotong/data-recipe) |
|
|
412
|
+
| 生产 | **DataSynth** | knowlyr-datasynth | LLM 批量合成、种子数据扩充 | [GitHub](https://github.com/liuxiaotong/data-synth) |
|
|
413
|
+
| 生产 | **DataLabel** | knowlyr-datalabel | 轻量标注工具、多标注员合并 | [GitHub](https://github.com/liuxiaotong/data-label) |
|
|
414
|
+
| 质检 | **DataCheck** | knowlyr-datacheck | 规则验证、重复检测、分布分析 | You are here |
|
|
415
|
+
| 质检 | **ModelAudit** | knowlyr-modelaudit | 蒸馏检测、模型指纹、身份验证 | [GitHub](https://github.com/liuxiaotong/model-audit) |
|
|
416
|
+
| Agent | **knowlyr-agent** | knowlyr-sandbox / recorder / reward / hub | 沙箱 + 轨迹录制 + Reward + 编排 | [GitHub](https://github.com/liuxiaotong/knowlyr-agent) |
|
|
417
|
+
|
|
418
|
+
### 端到端工作流
|
|
419
|
+
|
|
420
|
+
```bash
|
|
421
|
+
# 1. DataRecipe: 分析数据集,生成 Schema 和样例
|
|
422
|
+
knowlyr-datarecipe deep-analyze tencent/CL-bench -o ./output
|
|
423
|
+
|
|
424
|
+
# 2. DataLabel: 生成标注界面,人工标注/校准种子数据
|
|
425
|
+
knowlyr-datalabel generate ./output/tencent_CL-bench/
|
|
426
|
+
|
|
427
|
+
# 3. DataSynth: 基于种子数据批量合成
|
|
428
|
+
knowlyr-datasynth generate ./output/tencent_CL-bench/ -n 1000
|
|
429
|
+
|
|
430
|
+
# 4. DataCheck: 质量检查
|
|
431
|
+
knowlyr-datacheck validate ./output/tencent_CL-bench/
|
|
432
|
+
```
|
|
433
|
+
|
|
434
|
+
### 四合一 MCP 配置
|
|
435
|
+
|
|
436
|
+
```json
|
|
437
|
+
{
|
|
438
|
+
"mcpServers": {
|
|
439
|
+
"knowlyr-datarecipe": {
|
|
440
|
+
"command": "uv",
|
|
441
|
+
"args": ["--directory", "/path/to/data-recipe", "run", "knowlyr-datarecipe-mcp"]
|
|
442
|
+
},
|
|
443
|
+
"knowlyr-datalabel": {
|
|
444
|
+
"command": "uv",
|
|
445
|
+
"args": ["--directory", "/path/to/data-label", "run", "python", "-m", "datalabel.mcp_server"]
|
|
446
|
+
},
|
|
447
|
+
"knowlyr-datasynth": {
|
|
448
|
+
"command": "uv",
|
|
449
|
+
"args": ["--directory", "/path/to/data-synth", "run", "python", "-m", "datasynth.mcp_server"]
|
|
450
|
+
},
|
|
451
|
+
"knowlyr-datacheck": {
|
|
452
|
+
"command": "uv",
|
|
453
|
+
"args": ["--directory", "/path/to/data-check", "run", "python", "-m", "datacheck.mcp_server"]
|
|
454
|
+
}
|
|
455
|
+
}
|
|
456
|
+
}
|
|
457
|
+
```
|
|
458
|
+
|
|
459
|
+
---
|
|
460
|
+
|
|
461
|
+
## 命令参考
|
|
462
|
+
|
|
463
|
+
| 命令 | 功能 |
|
|
464
|
+
|------|------|
|
|
465
|
+
| `knowlyr-datacheck check <file>` | 检查数据文件 (JSON/JSONL/CSV) |
|
|
466
|
+
| `knowlyr-datacheck check <file> -s <schema>` | 使用 Schema 检查 |
|
|
467
|
+
| `knowlyr-datacheck check <file> --ruleset sft` | 使用指定规则集 |
|
|
468
|
+
| `knowlyr-datacheck check <file> --rules-file rules.yaml` | 使用自定义 YAML 规则 |
|
|
469
|
+
| `knowlyr-datacheck check <file> --sample 1000` | 随机抽样 1000 条检查 |
|
|
470
|
+
| `knowlyr-datacheck check <file> --sample-rate 0.1` | 随机抽样 10% 检查 |
|
|
471
|
+
| `knowlyr-datacheck check <file> --threshold 0.9` | 通过率低于 90% 时退出码 1 |
|
|
472
|
+
| `knowlyr-datacheck check <file> --strict` | 任何错误/警告都退出码 1 |
|
|
473
|
+
| `knowlyr-datacheck validate <dir>` | 验证 DataRecipe 输出 |
|
|
474
|
+
| `knowlyr-datacheck compare <files...>` | 对比多个文件分布 |
|
|
475
|
+
| `knowlyr-datacheck rules` | 列出所有规则 |
|
|
476
|
+
|
|
477
|
+
---
|
|
478
|
+
|
|
479
|
+
## API 使用
|
|
480
|
+
|
|
481
|
+
```python
|
|
482
|
+
from datacheck import DataChecker, QualityReport, RuleSet
|
|
483
|
+
|
|
484
|
+
# 创建检查器
|
|
485
|
+
checker = DataChecker()
|
|
486
|
+
|
|
487
|
+
# 检查文件 (支持 JSON/JSONL/CSV + 采样)
|
|
488
|
+
result = checker.check_file("data.jsonl", sample_count=1000)
|
|
489
|
+
|
|
490
|
+
print(f"通过率: {result.pass_rate:.1%}")
|
|
491
|
+
print(f"错误: {result.error_count}")
|
|
492
|
+
print(f"重复: {len(result.duplicates)} 组")
|
|
493
|
+
print(f"近似重复: {len(result.near_duplicates)} 组")
|
|
494
|
+
|
|
495
|
+
# 使用 YAML 自定义规则
|
|
496
|
+
rules = RuleSet.from_config("rules.yaml")
|
|
497
|
+
checker = DataChecker(rules)
|
|
498
|
+
result = checker.check_file("data.json")
|
|
499
|
+
|
|
500
|
+
# 生成报告
|
|
501
|
+
report = QualityReport(result)
|
|
502
|
+
report.save("report.md")
|
|
503
|
+
```
|
|
504
|
+
|
|
505
|
+
---
|
|
506
|
+
|
|
507
|
+
## 项目架构
|
|
508
|
+
|
|
509
|
+
```
|
|
510
|
+
src/datacheck/
|
|
511
|
+
├── checker.py # 核心检查器 (JSON/JSONL/CSV 加载、采样、近似重复检测)
|
|
512
|
+
├── rules.py # 规则定义、预设规则集、YAML 配置加载
|
|
513
|
+
├── text_rules.py # 文本质量规则 (PII、乱码、重复文本、n-gram)
|
|
514
|
+
├── report.py # 报告生成 (Markdown / JSON)
|
|
515
|
+
├── cli.py # CLI 命令行
|
|
516
|
+
└── mcp_server.py # MCP Server (4 工具)
|
|
517
|
+
```
|
|
518
|
+
|
|
519
|
+
---
|
|
520
|
+
|
|
521
|
+
## License
|
|
522
|
+
|
|
523
|
+
[MIT](LICENSE)
|
|
524
|
+
|
|
525
|
+
<div align="center">
|
|
526
|
+
<sub><a href="https://github.com/liuxiaotong">knowlyr</a> 数据工程生态 · 自动化数据质检</sub>
|
|
527
|
+
</div>
|