knowlyr-datacheck 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(python -m pytest:*)",
5
+ "Bash(wc:*)",
6
+ "Bash(.venv/bin/python -m pytest:*)",
7
+ "Bash(.venv/bin/pip install PyYAML)",
8
+ "Bash(.venv/bin/python -m pip install PyYAML)",
9
+ "Bash(uv pip install:*)",
10
+ "Bash(.venv/bin/ruff check:*)",
11
+ "Bash(git -C /Users/liukai/data-check status)",
12
+ "Bash(git -C /Users/liukai/data-check diff)",
13
+ "Bash(git -C /Users/liukai/data-check log --oneline -5)",
14
+ "Bash(git -C /Users/liukai/data-check add pyproject.toml src/datacheck/checker.py src/datacheck/cli.py src/datacheck/mcp_server.py src/datacheck/report.py src/datacheck/rules.py src/datacheck/text_rules.py tests/test_checker.py tests/test_cli.py tests/test_text_rules.py)",
15
+ "Bash(git -C /Users/liukai/data-check commit -m \"$\\(cat <<''EOF''\nfeat: 添加 JSONL/CSV 支持、采样模式、YAML 规则配置、文本质量检测、CI 阈值\n\n- JSONL/CSV 格式支持: 按文件扩展名自动检测并加载 .jsonl/.csv/.json\n- 采样检查模式: --sample N / --sample-rate 0.1 随机抽样大数据集\n- 自定义规则配置: --rules-file 加载 YAML 规则文件 \\(支持 required/min_length/max_length/regex/enum\\)\n- 文本质量检测: PII 隐私信息、乱码、重复文本检测 + n-gram 近似重复检测\n- CI 集成: --threshold / --strict 可配置退出码阈值\n\nGenerated with [Claude Code]\\(https://claude.ai/code\\)\nvia [Happy]\\(https://happy.engineering\\)\nEOF\n\\)\")",
16
+ "Bash(git -C /Users/liukai/data-check push)",
17
+ "Bash(git -C /Users/liukai/data-check add:*)",
18
+ "Bash(git -C /Users/liukai/data-check commit -m \"$\\(cat <<''EOF''\ndocs: 更新 README 添加新功能说明\n\n- 检查项目表格新增 PII、乱码、重复文本检测\n- 安装说明新增 yaml 可选依赖\n- CLI 示例新增 JSONL/CSV、采样、阈值用法\n- 内置规则表格补充 3 条文本质量规则\n- 自定义规则章节改为 YAML 配置方式\n- 命令参考新增 --rules-file/--sample/--threshold/--strict\n- API 示例更新为正确的 SDK 用法\n- 项目架构新增 text_rules.py\n\nGenerated with [Claude Code]\\(https://claude.ai/code\\)\nvia [Happy]\\(https://happy.engineering\\)\nEOF\n\\)\")",
19
+ "Bash(uv pip list:*)",
20
+ "Bash(.venv/bin/python:*)"
21
+ ]
22
+ }
23
+ }
@@ -0,0 +1,33 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.10", "3.11", "3.12"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Install dependencies
25
+ run: |
26
+ python -m pip install --upgrade pip
27
+ pip install -e ".[dev]"
28
+
29
+ - name: Lint with ruff
30
+ run: ruff check .
31
+
32
+ - name: Run tests
33
+ run: pytest
@@ -0,0 +1,52 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Distribution / packaging
7
+ build/
8
+ dist/
9
+ *.egg-info/
10
+ *.egg
11
+
12
+ # Virtual environments
13
+ .venv/
14
+ venv/
15
+ ENV/
16
+
17
+ # IDE
18
+ .idea/
19
+ .vscode/
20
+ *.swp
21
+ *.swo
22
+
23
+ # Testing
24
+ .pytest_cache/
25
+ .coverage
26
+ htmlcov/
27
+
28
+ # Reports
29
+ *_report.md
30
+ *_report.json
31
+
32
+ # Environment
33
+ .env
34
+ .env.local
35
+
36
+ # OS
37
+ .DS_Store
38
+ Thumbs.db
39
+
40
+ # ===================
41
+ # Supplemented common patterns
42
+ # ===================
43
+
44
+ # Python wheels
45
+ *.whl
46
+
47
+ # Virtual environments
48
+ env/
49
+
50
+ # Linting / type checking
51
+ .mypy_cache/
52
+ .ruff_cache/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Liu Kai
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,527 @@
1
+ Metadata-Version: 2.4
2
+ Name: knowlyr-datacheck
3
+ Version: 0.2.0
4
+ Summary: Data quality inspection toolkit - automated validation, anomaly detection, and distribution analysis
5
+ Project-URL: Homepage, https://github.com/liuxiaotong/data-check
6
+ Project-URL: Documentation, https://github.com/liuxiaotong/data-check#readme
7
+ Project-URL: Repository, https://github.com/liuxiaotong/data-check
8
+ Project-URL: Issues, https://github.com/liuxiaotong/data-check/issues
9
+ Author-email: Liu Kai <mrliukai@gmail.com>
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: ai,anomaly-detection,data-inspection,data-quality,machine-learning,training-data,validation
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Requires-Python: >=3.10
24
+ Requires-Dist: click>=8.0
25
+ Requires-Dist: pydantic>=2.0
26
+ Provides-Extra: all
27
+ Requires-Dist: anthropic>=0.18; extra == 'all'
28
+ Requires-Dist: mcp>=1.0; extra == 'all'
29
+ Requires-Dist: numpy>=1.20; extra == 'all'
30
+ Requires-Dist: openai>=1.0; extra == 'all'
31
+ Requires-Dist: pytest; extra == 'all'
32
+ Requires-Dist: pyyaml>=6.0; extra == 'all'
33
+ Requires-Dist: ruff; extra == 'all'
34
+ Requires-Dist: scipy>=1.7; extra == 'all'
35
+ Provides-Extra: dev
36
+ Requires-Dist: pytest; extra == 'dev'
37
+ Requires-Dist: ruff; extra == 'dev'
38
+ Provides-Extra: llm
39
+ Requires-Dist: anthropic>=0.18; extra == 'llm'
40
+ Requires-Dist: openai>=1.0; extra == 'llm'
41
+ Provides-Extra: mcp
42
+ Requires-Dist: mcp>=1.0; extra == 'mcp'
43
+ Provides-Extra: stats
44
+ Requires-Dist: numpy>=1.20; extra == 'stats'
45
+ Requires-Dist: scipy>=1.7; extra == 'stats'
46
+ Provides-Extra: yaml
47
+ Requires-Dist: pyyaml>=6.0; extra == 'yaml'
48
+ Description-Content-Type: text/markdown
49
+
50
+ <div align="center">
51
+
52
+ # DataCheck
53
+
54
+ **数据质检工具 — 自动化质量检查、异常检测、分布分析**
55
+ **Automated quality checks, anomaly detection & distribution analysis for LLM datasets**
56
+
57
+ [![PyPI](https://img.shields.io/pypi/v/knowlyr-datacheck?color=blue)](https://pypi.org/project/knowlyr-datacheck/)
58
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
59
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
60
+ [![MCP](https://img.shields.io/badge/MCP-4_Tools-purple.svg)](#mcp-server)
61
+
62
+ [快速开始](#快速开始) · [质量规则](#质量规则) · [分布分析](#分布分析) · [MCP Server](#mcp-server) · [Data Pipeline 生态](#data-pipeline-生态)
63
+
64
+ </div>
65
+
66
+ ---
67
+
68
+ **GitHub Topics**: `data-quality`, `anomaly-detection`, `data-validation`, `mcp`, `ai-data-pipeline`
69
+
70
+ 自动化数据质量检查,支持规则验证、重复检测、分布分析,生成可读的质量报告。
71
+
72
+ ## 核心能力 / Core Capabilities
73
+
74
+ ```
75
+ 数据文件 + Schema → 规则检查 → 异常检测 → 分布分析 → 质量报告
76
+ ```
77
+
78
+ ### 质量仪表盘预览 / Sample Dashboard
79
+
80
+ ```
81
+ ┌───────────────┬──────────────┬────────────┐
82
+ │ 通过率 92% │ 评级 🟢 优秀 │ 错误 8 条 │
83
+ ├───────────────┴──────────────┴────────────┤
84
+ │ ⚠ required_fields : 3 │ ⚠ duplicate_rows : 2│
85
+ │ 🔍 语言分布: zh 68% / en 32% │
86
+ └────────────────────────────────────────────┘
87
+ ```
88
+
89
+ 完整示例: `examples/reports/demo_quality_report.md`
90
+
91
+ ### 检查项目 / Checks
92
+
93
+ | 检查类型 | 说明 |
94
+ |----------|------|
95
+ | 🔴 **必填字段** | 检查是否包含所有必填字段 |
96
+ | 🔴 **非空检查** | 检查关键字段是否为空 |
97
+ | 🔴 **格式检查** | 检查数据类型是否正确 |
98
+ | 🟡 **长度边界** | 检查文本长度是否合理 |
99
+ | 🟡 **重复检测** | 检测完全重复 + 近似重复 (n-gram Jaccard) |
100
+ | 🟡 **隐私信息 (PII)** | 检测邮箱、手机号、身份证号 |
101
+ | 🟡 **乱码检测** | 检测乱码、异常字符、编码错误 |
102
+ | 🟡 **重复文本** | 检测文本内过度重复内容 |
103
+ | 🔵 **语言一致性** | 检查文本语言是否一致 |
104
+
105
+ ### 质量评级 / Rating
106
+
107
+ | 通过率 | 评级 | 建议 |
108
+ |--------|------|------|
109
+ | ≥90% | 🟢 优秀 | 可直接使用 |
110
+ | ≥70% | 🟡 良好 | 建议修复警告 |
111
+ | ≥50% | 🟠 一般 | 需要处理错误 |
112
+ | <50% | 🔴 需改进 | 严重质量问题 |
113
+
114
+ ## 安装 / Installation
115
+
116
+ ```bash
117
+ pip install knowlyr-datacheck
118
+ ```
119
+
120
+ 可选依赖:
121
+
122
+ ```bash
123
+ pip install "knowlyr-datacheck[stats]" # 统计分析 (numpy, scipy)
123
+ pip install "knowlyr-datacheck[mcp]" # MCP 服务器
124
+ pip install "knowlyr-datacheck[yaml]" # YAML 规则配置
125
+ pip install "knowlyr-datacheck[all]" # 全部功能
127
+ ```
128
+
129
+ ## 快速开始 / Quick Start
130
+
131
+ ### 检查数据文件 / CLI
132
+
133
+ ```bash
134
+ # 基础检查 (支持 JSON / JSONL / CSV)
135
+ knowlyr-datacheck check data.json
136
+ knowlyr-datacheck check data.jsonl
137
+ knowlyr-datacheck check data.csv
138
+
139
+ # 指定 Schema
140
+ knowlyr-datacheck check data.json -s schema.json
141
+
142
+ # 输出报告
143
+ knowlyr-datacheck check data.json -o report.md
144
+
145
+ # 采样检查 (大数据集)
146
+ knowlyr-datacheck check data.jsonl --sample 1000
147
+ knowlyr-datacheck check data.jsonl --sample-rate 0.1
148
+
149
+ # CI 集成: 自定义阈值
150
+ knowlyr-datacheck check data.json --threshold 0.9
151
+ knowlyr-datacheck check data.json --strict
152
+ ```
153
+
154
+ ### 在 Python 中接入 / Python SDK
155
+
156
+ ```python
157
+ from datacheck import DataChecker, QualityReport
158
+
159
+ checker = DataChecker()
160
+ result = checker.check_file("data.json", schema_path="schema.json")
161
+
162
+ report = QualityReport(result)
163
+ report.print_summary()
164
+ report.save("./report.md")
165
+ ```
166
+
167
+ <details>
168
+ <summary>输出示例</summary>
169
+
170
+ ```
171
+ 正在检查 data.json...
172
+
173
+ ==================================================
174
+ 数据质量检查结果
175
+ ==================================================
176
+ 总样本: 100
177
+ 通过: 92
178
+ 失败: 8
179
+ 通过率: 92.0%
180
+ 评级: 🟢 优秀
181
+ ==================================================
182
+
183
+ 🟡 警告: 3
184
+ ⚠️ 重复: 2 组
185
+ ```
186
+
187
+ </details>
188
+
189
+ ### 使用 DataRecipe 分析结果验证 / Validate DataRecipe Outputs
190
+
191
+ ```bash
192
+ # 验证合成数据
193
+ knowlyr-datacheck validate ./analysis_output/my_dataset/
194
+
195
+ # 验证指定文件
196
+ knowlyr-datacheck validate ./analysis_output/my_dataset/ -d custom_data.json
197
+ ```
198
+
199
+ <details>
200
+ <summary>输出示例</summary>
201
+
202
+ ```
203
+ 正在验证 ./analysis_output/my_dataset/...
204
+ ✓ 报告已保存: ./analysis_output/my_dataset/12_质检报告/quality_report.md
205
+
206
+ ==================================================
207
+ 数据质量检查结果
208
+ ==================================================
209
+ 总样本: 1000
210
+ 通过: 956
211
+ 失败: 44
212
+ 通过率: 95.6%
213
+ 评级: 🟢 优秀
214
+ ==================================================
215
+ ```
216
+
217
+ </details>
218
+
219
+ ---
220
+
221
+ ## 质量规则 / Quality Rules
222
+
223
+ ### 内置规则 / Built-in Rules
224
+
225
+ ```bash
226
+ # 查看所有规则
227
+ knowlyr-datacheck rules
228
+ ```
229
+
230
+ | 规则 ID | 名称 | 级别 | 说明 |
231
+ |---------|------|------|------|
232
+ | `required_fields` | 必填字段检查 | 🔴 错误 | 检查必填字段是否存在 |
233
+ | `non_empty` | 非空检查 | 🔴 错误 | 检查关键字段是否为空 |
234
+ | `format_valid` | 格式检查 | 🔴 错误 | 检查数据类型是否正确 |
235
+ | `score_valid` | 评分有效性 | 🔴 错误 | 检查评分是否在有效范围 |
236
+ | `length_bounds` | 长度边界检查 | 🟡 警告 | 检查文本长度范围 |
237
+ | `pii_detection` | 隐私信息检测 | 🟡 警告 | 检测邮箱、手机号、身份证号 |
238
+ | `garbled_text` | 乱码检测 | 🟡 警告 | 检测乱码、异常字符 |
239
+ | `repetitive_text` | 重复文本检测 | 🟡 警告 | 检测文本内过度重复 |
240
+ | `language_consistency` | 语言一致性 | 🔵 提示 | 检查语言是否一致 |
241
+
242
+ ### 预设规则集 / Rule Packs
243
+
244
+ ```bash
245
+ # 使用 SFT 数据规则集
246
+ knowlyr-datacheck check data.json --ruleset sft
247
+
248
+ # 使用偏好数据规则集
249
+ knowlyr-datacheck check data.json --ruleset preference
250
+ ```
251
+
252
+ | 规则集 | 说明 |
253
+ |--------|------|
254
+ | `default` | 通用规则 |
255
+ | `sft` | SFT 数据专用规则 (指令质量、回复质量) |
256
+ | `preference` | 偏好数据专用规则 (chosen/rejected 差异) |
257
+
258
+ ### 自定义规则配置 / Custom Rules (YAML)
259
+
260
+ 通过 YAML 配置文件定义自定义规则,无需写 Python 代码:
261
+
262
+ ```yaml
263
+ # rules.yaml
264
+ rules:
265
+ - field: instruction
266
+ check: min_length
267
+ value: 10
268
+ severity: error
269
+
270
+ - field: response
271
+ check: max_length
272
+ value: 10000
273
+ severity: warning
274
+
275
+ - field: category
276
+ check: enum
277
+ values: ["qa", "chat", "code", "math"]
278
+ severity: error
279
+
280
+ - field: instruction
281
+ check: regex
282
+ pattern: "^[A-Z\u4e00-\u9fff]"
283
+ severity: info
284
+ message: "指令应以大写字母或中文开头"
285
+ ```
286
+
287
+ ```bash
288
+ # 使用自定义规则
289
+ knowlyr-datacheck check data.json --rules-file rules.yaml
290
+ ```
291
+
292
+ 支持的检查类型:`required`、`non_empty`、`min_length`、`max_length`、`regex`、`enum`
293
+
294
+ > 需要安装 YAML 支持:`pip install "knowlyr-datacheck[yaml]"`
295
+
296
+ ---
297
+
298
+ ## 分布分析 / Distribution Analysis
299
+
300
+ ### 对比多个数据文件
301
+
302
+ ```bash
303
+ knowlyr-datacheck compare seed.json synthetic.json -o comparison.md
304
+ ```
305
+
306
+ <details>
307
+ <summary>输出示例</summary>
308
+
309
+ ```markdown
310
+ # 数据分布对比报告
311
+
312
+ ## 文件概要
313
+
314
+ | 文件 | 样本数 |
315
+ |------|--------|
316
+ | seed.json | 50 |
317
+ | synthetic.json | 1000 |
318
+
319
+ ## 字段对比
320
+
321
+ ### instruction
322
+ - **seed.json**: 长度 15-200 (平均 68)
323
+ - **synthetic.json**: 长度 12-198 (平均 72)
324
+
325
+ ### response
326
+ - **seed.json**: 长度 50-800 (平均 245)
327
+ - **synthetic.json**: 长度 45-820 (平均 251)
328
+ ```
329
+
330
+ </details>
331
+
332
+ ### 分析内容
333
+
334
+ - **长度统计**: 最小值、最大值、平均值
335
+ - **唯一值比例**: 检测多样性
336
+ - **值分布**: 数值型字段的分布情况
337
+ - **参考对比**: 与种子数据的分布差异
338
+
339
+ ---
340
+
341
+ ## MCP Server
342
+
343
+ 在 Claude Desktop / Claude Code 中直接使用。
344
+
345
+ ### 配置
346
+
347
+ 添加到 Claude Desktop 配置文件 (macOS 路径: `~/Library/Application Support/Claude/claude_desktop_config.json`):
348
+
349
+ ```json
350
+ {
351
+ "mcpServers": {
352
+ "knowlyr-datacheck": {
353
+ "command": "uv",
354
+ "args": ["--directory", "/path/to/data-check", "run", "python", "-m", "datacheck.mcp_server"]
355
+ }
356
+ }
357
+ }
358
+ ```
359
+
360
+ ### 可用工具
361
+
362
+ | 工具 | 功能 |
363
+ |------|------|
364
+ | `check_data_quality` | 检查数据文件质量 |
365
+ | `validate_from_datarecipe` | 使用 DataRecipe 分析结果验证 |
366
+ | `compare_distributions` | 对比多个数据文件分布 |
367
+ | `list_quality_rules` | 列出所有质量检查规则 |
368
+
369
+ ### 使用示例
370
+
371
+ ```
372
+ 用户: 帮我检查 ./output/synthetic.json 的质量
373
+
374
+ Claude: [调用 check_data_quality]
375
+
376
+ ## 数据质量检查结果
377
+
378
+ - 通过率: **95.6%**
379
+ - 评级: **🟢 优秀**
380
+ - 错误: 0, 警告: 44
381
+
382
+ 发现 2 组重复数据
383
+ ```
384
+
385
+ ---
386
+
387
+ ## Data Pipeline 生态
388
+
389
+ DataCheck 是 Data Pipeline 生态的质检组件:
390
+
391
+ ```mermaid
392
+ graph LR
393
+ Radar["🔍 Radar<br/>情报发现"] --> Recipe["📋 Recipe<br/>逆向分析"]
394
+ Recipe --> Synth["🔄 Synth<br/>数据合成"]
395
+ Recipe --> Label["🏷️ Label<br/>数据标注"]
396
+ Synth --> Check["✅ Check<br/>数据质检"]
397
+ Label --> Check
398
+ Check --> Audit["🔬 Audit<br/>模型审计"]
399
+ Audit --> Hub["🎯 Hub<br/>编排层"]
400
+ Hub --> Sandbox["📦 Sandbox<br/>执行沙箱"]
401
+ Sandbox --> Recorder["📹 Recorder<br/>轨迹录制"]
402
+ Recorder --> Reward["⭐ Reward<br/>过程打分"]
403
+ style Check fill:#0969da,color:#fff,stroke:#0969da
404
+ ```
405
+
406
+ ### 生态项目
407
+
408
+ | 层 | 项目 | PyPI 包 | 说明 | 仓库 |
409
+ |---|---|---|---|---|
410
+ | 情报 | **AI Dataset Radar** | knowlyr-radar | 数据集竞争情报、趋势分析 | [GitHub](https://github.com/liuxiaotong/ai-dataset-radar) |
411
+ | 分析 | **DataRecipe** | knowlyr-datarecipe | 逆向分析、Schema 提取、成本估算 | [GitHub](https://github.com/liuxiaotong/data-recipe) |
412
+ | 生产 | **DataSynth** | knowlyr-datasynth | LLM 批量合成、种子数据扩充 | [GitHub](https://github.com/liuxiaotong/data-synth) |
413
+ | 生产 | **DataLabel** | knowlyr-datalabel | 轻量标注工具、多标注员合并 | [GitHub](https://github.com/liuxiaotong/data-label) |
414
+ | 质检 | **DataCheck** | knowlyr-datacheck | 规则验证、重复检测、分布分析 | You are here |
415
+ | 质检 | **ModelAudit** | knowlyr-modelaudit | 蒸馏检测、模型指纹、身份验证 | [GitHub](https://github.com/liuxiaotong/model-audit) |
416
+ | Agent | **knowlyr-agent** | knowlyr-sandbox / recorder / reward / hub | 沙箱 + 轨迹录制 + Reward + 编排 | [GitHub](https://github.com/liuxiaotong/knowlyr-agent) |
417
+
418
+ ### 端到端工作流
419
+
420
+ ```bash
421
+ # 1. DataRecipe: 分析数据集,生成 Schema 和样例
422
+ knowlyr-datarecipe deep-analyze tencent/CL-bench -o ./output
423
+
424
+ # 2. DataLabel: 生成标注界面,人工标注/校准种子数据
425
+ knowlyr-datalabel generate ./output/tencent_CL-bench/
426
+
427
+ # 3. DataSynth: 基于种子数据批量合成
428
+ knowlyr-datasynth generate ./output/tencent_CL-bench/ -n 1000
429
+
430
+ # 4. DataCheck: 质量检查
431
+ knowlyr-datacheck validate ./output/tencent_CL-bench/
432
+ ```
433
+
434
+ ### 四合一 MCP 配置
435
+
436
+ ```json
437
+ {
438
+ "mcpServers": {
439
+ "knowlyr-datarecipe": {
440
+ "command": "uv",
441
+ "args": ["--directory", "/path/to/data-recipe", "run", "knowlyr-datarecipe-mcp"]
442
+ },
443
+ "knowlyr-datalabel": {
444
+ "command": "uv",
445
+ "args": ["--directory", "/path/to/data-label", "run", "python", "-m", "datalabel.mcp_server"]
446
+ },
447
+ "knowlyr-datasynth": {
448
+ "command": "uv",
449
+ "args": ["--directory", "/path/to/data-synth", "run", "python", "-m", "datasynth.mcp_server"]
450
+ },
451
+ "knowlyr-datacheck": {
452
+ "command": "uv",
453
+ "args": ["--directory", "/path/to/data-check", "run", "python", "-m", "datacheck.mcp_server"]
454
+ }
455
+ }
456
+ }
457
+ ```
458
+
459
+ ---
460
+
461
+ ## 命令参考
462
+
463
+ | 命令 | 功能 |
464
+ |------|------|
465
+ | `knowlyr-datacheck check <file>` | 检查数据文件 (JSON/JSONL/CSV) |
466
+ | `knowlyr-datacheck check <file> -s <schema>` | 使用 Schema 检查 |
467
+ | `knowlyr-datacheck check <file> --ruleset sft` | 使用指定规则集 |
468
+ | `knowlyr-datacheck check <file> --rules-file rules.yaml` | 使用自定义 YAML 规则 |
469
+ | `knowlyr-datacheck check <file> --sample 1000` | 随机抽样 1000 条检查 |
470
+ | `knowlyr-datacheck check <file> --sample-rate 0.1` | 随机抽样 10% 检查 |
471
+ | `knowlyr-datacheck check <file> --threshold 0.9` | 通过率低于 90% 时退出码为 1 |
472
+ | `knowlyr-datacheck check <file> --strict` | 存在任何错误或警告时退出码为 1 |
473
+ | `knowlyr-datacheck validate <dir>` | 验证 DataRecipe 输出 |
474
+ | `knowlyr-datacheck compare <files...>` | 对比多个文件分布 |
475
+ | `knowlyr-datacheck rules` | 列出所有规则 |
476
+
477
+ ---
478
+
479
+ ## API 使用
480
+
481
+ ```python
482
+ from datacheck import DataChecker, QualityReport, RuleSet
483
+
484
+ # 创建检查器
485
+ checker = DataChecker()
486
+
487
+ # 检查文件 (支持 JSON/JSONL/CSV + 采样)
488
+ result = checker.check_file("data.jsonl", sample_count=1000)
489
+
490
+ print(f"通过率: {result.pass_rate:.1%}")
491
+ print(f"错误: {result.error_count}")
492
+ print(f"重复: {len(result.duplicates)} 组")
493
+ print(f"近似重复: {len(result.near_duplicates)} 组")
494
+
495
+ # 使用 YAML 自定义规则
496
+ rules = RuleSet.from_config("rules.yaml")
497
+ checker = DataChecker(rules)
498
+ result = checker.check_file("data.json")
499
+
500
+ # 生成报告
501
+ report = QualityReport(result)
502
+ report.save("report.md")
503
+ ```
504
+
505
+ ---
506
+
507
+ ## 项目架构
508
+
509
+ ```
510
+ src/datacheck/
511
+ ├── checker.py # 核心检查器 (JSON/JSONL/CSV 加载、采样、近似重复检测)
512
+ ├── rules.py # 规则定义、预设规则集、YAML 配置加载
513
+ ├── text_rules.py # 文本质量规则 (PII、乱码、重复文本、n-gram)
514
+ ├── report.py # 报告生成 (Markdown / JSON)
515
+ ├── cli.py # CLI 命令行
516
+ └── mcp_server.py # MCP Server (4 工具)
517
+ ```
518
+
519
+ ---
520
+
521
+ ## License
522
+
523
+ [MIT](LICENSE)
524
+
525
+ <div align="center">
526
+ <sub><a href="https://github.com/liuxiaotong">knowlyr</a> 数据工程生态 · 自动化数据质检</sub>
527
+ </div>