dtflow 0.4.2__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dtflow-0.4.2 → dtflow-0.5.0}/PKG-INFO +117 -1
- {dtflow-0.4.2 → dtflow-0.5.0}/README.md +116 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/dtflow/__init__.py +34 -1
- {dtflow-0.4.2 → dtflow-0.5.0}/dtflow/__main__.py +28 -3
- dtflow-0.5.0/dtflow/cli/clean.py +486 -0
- dtflow-0.5.0/dtflow/cli/commands.py +61 -0
- dtflow-0.5.0/dtflow/cli/common.py +384 -0
- dtflow-0.5.0/dtflow/cli/io_ops.py +385 -0
- dtflow-0.5.0/dtflow/cli/lineage.py +49 -0
- dtflow-0.5.0/dtflow/cli/pipeline.py +54 -0
- dtflow-0.5.0/dtflow/cli/sample.py +294 -0
- dtflow-0.5.0/dtflow/cli/stats.py +589 -0
- dtflow-0.5.0/dtflow/cli/transform.py +486 -0
- dtflow-0.5.0/dtflow/cli/validate.py +152 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/dtflow/core.py +189 -0
- dtflow-0.5.0/dtflow/framework.py +610 -0
- dtflow-0.5.0/dtflow/schema.py +508 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/dtflow/storage/io.py +49 -6
- {dtflow-0.4.2 → dtflow-0.5.0}/dtflow/streaming.py +25 -4
- dtflow-0.5.0/tests/test_framework.py +204 -0
- dtflow-0.5.0/tests/test_schema.py +547 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/tests/test_transformer.py +33 -4
- dtflow-0.4.2/dtflow/cli/commands.py +0 -2640
- {dtflow-0.4.2 → dtflow-0.5.0}/.gitignore +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/dtflow/cli/__init__.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/dtflow/converters.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/dtflow/lineage.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/dtflow/mcp/__init__.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/dtflow/mcp/__main__.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/dtflow/mcp/cli.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/dtflow/mcp/docs.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/dtflow/mcp/server.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/dtflow/pipeline.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/dtflow/presets.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/dtflow/storage/__init__.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/dtflow/tokenizers.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/dtflow/utils/__init__.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/dtflow/utils/display.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/dtflow/utils/field_path.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/pyproject.toml +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/tests/benchmark_io.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/tests/test_converters.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/tests/test_field_path.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/tests/test_io.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/tests/test_lineage.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/tests/test_pipeline.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/tests/test_streaming.py +0 -0
- {dtflow-0.4.2 → dtflow-0.5.0}/tests/test_tokenizers.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dtflow
|
|
3
|
-
Version: 0.4.2
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
|
|
5
5
|
Project-URL: Homepage, https://github.com/yourusername/DataTransformer
|
|
6
6
|
Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
|
|
@@ -126,6 +126,64 @@ dt.filter(lambda x: x.score > 0.8)
|
|
|
126
126
|
dt.filter(lambda x: x.language == "zh")
|
|
127
127
|
```
|
|
128
128
|
|
|
129
|
+
### 数据验证
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
# 简单验证,返回不通过的记录列表
|
|
133
|
+
errors = dt.validate(lambda x: len(x.messages) >= 2)
|
|
134
|
+
|
|
135
|
+
if errors:
|
|
136
|
+
for e in errors[:5]:
|
|
137
|
+
print(f"第 {e.index} 行: {e.error}")
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### Schema 验证
|
|
141
|
+
|
|
142
|
+
使用 Schema 进行结构化数据验证:
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
from dtflow import Schema, Field, openai_chat_schema
|
|
146
|
+
|
|
147
|
+
# 使用预设 Schema
|
|
148
|
+
result = dt.validate_schema(openai_chat_schema)
|
|
149
|
+
print(result) # ValidationResult(valid=950, invalid=50, errors=[...])
|
|
150
|
+
|
|
151
|
+
# 自定义 Schema
|
|
152
|
+
schema = Schema({
|
|
153
|
+
"messages": Field(type="list", required=True, min_length=1),
|
|
154
|
+
"messages[*].role": Field(type="str", choices=["user", "assistant", "system"]),
|
|
155
|
+
"messages[*].content": Field(type="str", min_length=1),
|
|
156
|
+
"score": Field(type="float", min=0, max=1),
|
|
157
|
+
})
|
|
158
|
+
|
|
159
|
+
result = dt.validate_schema(schema)
|
|
160
|
+
|
|
161
|
+
# 过滤出有效数据
|
|
162
|
+
valid_dt = dt.validate_schema(schema, filter_invalid=True)
|
|
163
|
+
valid_dt.save("valid.jsonl")
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
**预设 Schema**:
|
|
167
|
+
|
|
168
|
+
| Schema 名称 | 用途 |
|
|
169
|
+
|------------|------|
|
|
170
|
+
| `openai_chat_schema` | OpenAI messages 格式验证 |
|
|
171
|
+
| `alpaca_schema` | Alpaca instruction/output 格式 |
|
|
172
|
+
| `sharegpt_schema` | ShareGPT conversations 格式 |
|
|
173
|
+
| `dpo_schema` | DPO prompt/chosen/rejected 格式 |
|
|
174
|
+
|
|
175
|
+
**Field 参数**:
|
|
176
|
+
|
|
177
|
+
| 参数 | 说明 | 示例 |
|
|
178
|
+
|------|------|------|
|
|
179
|
+
| `type` | 类型验证 | `"str"`, `"int"`, `"float"`, `"bool"`, `"list"`, `"dict"` |
|
|
180
|
+
| `required` | 是否必填 | `True` / `False` |
|
|
181
|
+
| `min` / `max` | 数值范围 | `min=0, max=1` |
|
|
182
|
+
| `min_length` / `max_length` | 长度范围 | `min_length=1` |
|
|
183
|
+
| `choices` | 枚举值 | `choices=["user", "assistant"]` |
|
|
184
|
+
| `pattern` | 正则匹配 | `pattern=r"^\d{4}-\d{2}-\d{2}$"` |
|
|
185
|
+
| `custom` | 自定义验证 | `custom=lambda x: x > 0` |
|
|
186
|
+
|
|
129
187
|
### 数据转换
|
|
130
188
|
|
|
131
189
|
```python
|
|
@@ -275,6 +333,58 @@ dt.transform(to_swift_vlm(images_field="images")).save("swift_vlm.jsonl")
|
|
|
275
333
|
# 输出: {"messages": [...], "images": ["/path/to/img.jpg"]}
|
|
276
334
|
```
|
|
277
335
|
|
|
336
|
+
### 训练框架一键导出
|
|
337
|
+
|
|
338
|
+
将数据导出为目标训练框架可直接使用的格式,自动生成配置文件:
|
|
339
|
+
|
|
340
|
+
```python
|
|
341
|
+
from dtflow import DataTransformer
|
|
342
|
+
|
|
343
|
+
dt = DataTransformer.load("data.jsonl")
|
|
344
|
+
|
|
345
|
+
# 1. 检查框架兼容性
|
|
346
|
+
result = dt.check_compatibility("llama-factory")
|
|
347
|
+
print(result)
|
|
348
|
+
# ✅ 兼容 - LLaMA-Factory (openai_chat)
|
|
349
|
+
# 或
|
|
350
|
+
# ❌ 不兼容 - 错误: xxx
|
|
351
|
+
|
|
352
|
+
# 2. 一键导出到 LLaMA-Factory
|
|
353
|
+
files = dt.export_for("llama-factory", "./llama_ready/")
|
|
354
|
+
# 生成文件:
|
|
355
|
+
# - ./llama_ready/custom_dataset.json # 数据文件
|
|
356
|
+
# - ./llama_ready/dataset_info.json # 数据集配置
|
|
357
|
+
# - ./llama_ready/train_args.yaml # 训练参数模板
|
|
358
|
+
|
|
359
|
+
# 3. 导出到 ms-swift
|
|
360
|
+
files = dt.export_for("swift", "./swift_ready/")
|
|
361
|
+
# 生成: data.jsonl + train_swift.sh
|
|
362
|
+
|
|
363
|
+
# 4. 导出到 Axolotl
|
|
364
|
+
files = dt.export_for("axolotl", "./axolotl_ready/")
|
|
365
|
+
# 生成: data.jsonl + config.yaml
|
|
366
|
+
|
|
367
|
+
# 指定数据集名称
|
|
368
|
+
dt.export_for("llama-factory", "./output/", dataset_name="my_sft_data")
|
|
369
|
+
```
|
|
370
|
+
|
|
371
|
+
**支持的框架**:
|
|
372
|
+
|
|
373
|
+
| 框架 | 导出内容 | 使用方式 |
|
|
374
|
+
|------|---------|---------|
|
|
375
|
+
| `llama-factory` | data.json + dataset_info.json + train_args.yaml | `llamafactory-cli train train_args.yaml` |
|
|
376
|
+
| `swift` | data.jsonl + train_swift.sh | `bash train_swift.sh` |
|
|
377
|
+
| `axolotl` | data.jsonl + config.yaml | `accelerate launch -m axolotl.cli.train config.yaml` |
|
|
378
|
+
|
|
379
|
+
**自动格式检测**:
|
|
380
|
+
|
|
381
|
+
| 检测到的格式 | 数据结构 |
|
|
382
|
+
|------------|---------|
|
|
383
|
+
| `openai_chat` | `{"messages": [{"role": "user", ...}]}` |
|
|
384
|
+
| `alpaca` | `{"instruction": ..., "output": ...}` |
|
|
385
|
+
| `sharegpt` | `{"conversations": [{"from": "human", ...}]}` |
|
|
386
|
+
| `dpo` | `{"prompt": ..., "chosen": ..., "rejected": ...}` |
|
|
387
|
+
|
|
278
388
|
### 其他操作
|
|
279
389
|
|
|
280
390
|
```python
|
|
@@ -350,6 +460,12 @@ dt concat a.jsonl b.jsonl -o merged.jsonl
|
|
|
350
460
|
|
|
351
461
|
# 数据统计
|
|
352
462
|
dt stats data.jsonl
|
|
463
|
+
|
|
464
|
+
# 数据验证
|
|
465
|
+
dt validate data.jsonl --preset=openai_chat # 使用预设 schema 验证
|
|
466
|
+
dt validate data.jsonl --preset=alpaca --verbose # 详细输出
|
|
467
|
+
dt validate data.jsonl --preset=sharegpt --filter -o valid.jsonl # 过滤出有效数据
|
|
468
|
+
dt validate data.jsonl --preset=dpo --max-errors=100 # 限制错误输出数量
|
|
353
469
|
```
|
|
354
470
|
|
|
355
471
|
### 字段路径语法
|
|
@@ -50,6 +50,64 @@ dt.filter(lambda x: x.score > 0.8)
|
|
|
50
50
|
dt.filter(lambda x: x.language == "zh")
|
|
51
51
|
```
|
|
52
52
|
|
|
53
|
+
### 数据验证
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
# 简单验证,返回不通过的记录列表
|
|
57
|
+
errors = dt.validate(lambda x: len(x.messages) >= 2)
|
|
58
|
+
|
|
59
|
+
if errors:
|
|
60
|
+
for e in errors[:5]:
|
|
61
|
+
print(f"第 {e.index} 行: {e.error}")
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Schema 验证
|
|
65
|
+
|
|
66
|
+
使用 Schema 进行结构化数据验证:
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from dtflow import Schema, Field, openai_chat_schema
|
|
70
|
+
|
|
71
|
+
# 使用预设 Schema
|
|
72
|
+
result = dt.validate_schema(openai_chat_schema)
|
|
73
|
+
print(result) # ValidationResult(valid=950, invalid=50, errors=[...])
|
|
74
|
+
|
|
75
|
+
# 自定义 Schema
|
|
76
|
+
schema = Schema({
|
|
77
|
+
"messages": Field(type="list", required=True, min_length=1),
|
|
78
|
+
"messages[*].role": Field(type="str", choices=["user", "assistant", "system"]),
|
|
79
|
+
"messages[*].content": Field(type="str", min_length=1),
|
|
80
|
+
"score": Field(type="float", min=0, max=1),
|
|
81
|
+
})
|
|
82
|
+
|
|
83
|
+
result = dt.validate_schema(schema)
|
|
84
|
+
|
|
85
|
+
# 过滤出有效数据
|
|
86
|
+
valid_dt = dt.validate_schema(schema, filter_invalid=True)
|
|
87
|
+
valid_dt.save("valid.jsonl")
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
**预设 Schema**:
|
|
91
|
+
|
|
92
|
+
| Schema 名称 | 用途 |
|
|
93
|
+
|------------|------|
|
|
94
|
+
| `openai_chat_schema` | OpenAI messages 格式验证 |
|
|
95
|
+
| `alpaca_schema` | Alpaca instruction/output 格式 |
|
|
96
|
+
| `sharegpt_schema` | ShareGPT conversations 格式 |
|
|
97
|
+
| `dpo_schema` | DPO prompt/chosen/rejected 格式 |
|
|
98
|
+
|
|
99
|
+
**Field 参数**:
|
|
100
|
+
|
|
101
|
+
| 参数 | 说明 | 示例 |
|
|
102
|
+
|------|------|------|
|
|
103
|
+
| `type` | 类型验证 | `"str"`, `"int"`, `"float"`, `"bool"`, `"list"`, `"dict"` |
|
|
104
|
+
| `required` | 是否必填 | `True` / `False` |
|
|
105
|
+
| `min` / `max` | 数值范围 | `min=0, max=1` |
|
|
106
|
+
| `min_length` / `max_length` | 长度范围 | `min_length=1` |
|
|
107
|
+
| `choices` | 枚举值 | `choices=["user", "assistant"]` |
|
|
108
|
+
| `pattern` | 正则匹配 | `pattern=r"^\d{4}-\d{2}-\d{2}$"` |
|
|
109
|
+
| `custom` | 自定义验证 | `custom=lambda x: x > 0` |
|
|
110
|
+
|
|
53
111
|
### 数据转换
|
|
54
112
|
|
|
55
113
|
```python
|
|
@@ -199,6 +257,58 @@ dt.transform(to_swift_vlm(images_field="images")).save("swift_vlm.jsonl")
|
|
|
199
257
|
# 输出: {"messages": [...], "images": ["/path/to/img.jpg"]}
|
|
200
258
|
```
|
|
201
259
|
|
|
260
|
+
### 训练框架一键导出
|
|
261
|
+
|
|
262
|
+
将数据导出为目标训练框架可直接使用的格式,自动生成配置文件:
|
|
263
|
+
|
|
264
|
+
```python
|
|
265
|
+
from dtflow import DataTransformer
|
|
266
|
+
|
|
267
|
+
dt = DataTransformer.load("data.jsonl")
|
|
268
|
+
|
|
269
|
+
# 1. 检查框架兼容性
|
|
270
|
+
result = dt.check_compatibility("llama-factory")
|
|
271
|
+
print(result)
|
|
272
|
+
# ✅ 兼容 - LLaMA-Factory (openai_chat)
|
|
273
|
+
# 或
|
|
274
|
+
# ❌ 不兼容 - 错误: xxx
|
|
275
|
+
|
|
276
|
+
# 2. 一键导出到 LLaMA-Factory
|
|
277
|
+
files = dt.export_for("llama-factory", "./llama_ready/")
|
|
278
|
+
# 生成文件:
|
|
279
|
+
# - ./llama_ready/custom_dataset.json # 数据文件
|
|
280
|
+
# - ./llama_ready/dataset_info.json # 数据集配置
|
|
281
|
+
# - ./llama_ready/train_args.yaml # 训练参数模板
|
|
282
|
+
|
|
283
|
+
# 3. 导出到 ms-swift
|
|
284
|
+
files = dt.export_for("swift", "./swift_ready/")
|
|
285
|
+
# 生成: data.jsonl + train_swift.sh
|
|
286
|
+
|
|
287
|
+
# 4. 导出到 Axolotl
|
|
288
|
+
files = dt.export_for("axolotl", "./axolotl_ready/")
|
|
289
|
+
# 生成: data.jsonl + config.yaml
|
|
290
|
+
|
|
291
|
+
# 指定数据集名称
|
|
292
|
+
dt.export_for("llama-factory", "./output/", dataset_name="my_sft_data")
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
**支持的框架**:
|
|
296
|
+
|
|
297
|
+
| 框架 | 导出内容 | 使用方式 |
|
|
298
|
+
|------|---------|---------|
|
|
299
|
+
| `llama-factory` | data.json + dataset_info.json + train_args.yaml | `llamafactory-cli train train_args.yaml` |
|
|
300
|
+
| `swift` | data.jsonl + train_swift.sh | `bash train_swift.sh` |
|
|
301
|
+
| `axolotl` | data.jsonl + config.yaml | `accelerate launch -m axolotl.cli.train config.yaml` |
|
|
302
|
+
|
|
303
|
+
**自动格式检测**:
|
|
304
|
+
|
|
305
|
+
| 检测到的格式 | 数据结构 |
|
|
306
|
+
|------------|---------|
|
|
307
|
+
| `openai_chat` | `{"messages": [{"role": "user", ...}]}` |
|
|
308
|
+
| `alpaca` | `{"instruction": ..., "output": ...}` |
|
|
309
|
+
| `sharegpt` | `{"conversations": [{"from": "human", ...}]}` |
|
|
310
|
+
| `dpo` | `{"prompt": ..., "chosen": ..., "rejected": ...}` |
|
|
311
|
+
|
|
202
312
|
### 其他操作
|
|
203
313
|
|
|
204
314
|
```python
|
|
@@ -274,6 +384,12 @@ dt concat a.jsonl b.jsonl -o merged.jsonl
|
|
|
274
384
|
|
|
275
385
|
# 数据统计
|
|
276
386
|
dt stats data.jsonl
|
|
387
|
+
|
|
388
|
+
# 数据验证
|
|
389
|
+
dt validate data.jsonl --preset=openai_chat # 使用预设 schema 验证
|
|
390
|
+
dt validate data.jsonl --preset=alpaca --verbose # 详细输出
|
|
391
|
+
dt validate data.jsonl --preset=sharegpt --filter -o valid.jsonl # 过滤出有效数据
|
|
392
|
+
dt validate data.jsonl --preset=dpo --max-errors=100 # 限制错误输出数量
|
|
277
393
|
```
|
|
278
394
|
|
|
279
395
|
### 字段路径语法
|
|
@@ -4,6 +4,7 @@ DataTransformer: 简洁的数据格式转换工具
|
|
|
4
4
|
核心功能:
|
|
5
5
|
- DataTransformer: 数据加载、转换、保存
|
|
6
6
|
- presets: 预设转换模板 (openai_chat, alpaca, sharegpt, dpo_pair, simple_qa)
|
|
7
|
+
- schema: 数据结构验证 (Schema, Field)
|
|
7
8
|
- tokenizers: Token 统计和过滤
|
|
8
9
|
- converters: HuggingFace/OpenAI 等格式转换
|
|
9
10
|
"""
|
|
@@ -26,6 +27,23 @@ from .converters import ( # LLaMA-Factory 扩展; ms-swift
|
|
|
26
27
|
)
|
|
27
28
|
from .core import DataTransformer, DictWrapper, TransformError, TransformErrors
|
|
28
29
|
from .presets import get_preset, list_presets
|
|
30
|
+
from .schema import (
|
|
31
|
+
Field,
|
|
32
|
+
Schema,
|
|
33
|
+
ValidationError,
|
|
34
|
+
ValidationResult,
|
|
35
|
+
alpaca_schema,
|
|
36
|
+
dpo_schema,
|
|
37
|
+
openai_chat_schema,
|
|
38
|
+
sharegpt_schema,
|
|
39
|
+
validate_data,
|
|
40
|
+
)
|
|
41
|
+
from .framework import (
|
|
42
|
+
CompatibilityResult,
|
|
43
|
+
check_compatibility,
|
|
44
|
+
detect_format,
|
|
45
|
+
export_for,
|
|
46
|
+
)
|
|
29
47
|
from .storage import load_data, sample_file, save_data
|
|
30
48
|
from .streaming import StreamingTransformer, load_sharded, load_stream, process_shards
|
|
31
49
|
from .tokenizers import (
|
|
@@ -42,7 +60,7 @@ from .tokenizers import (
|
|
|
42
60
|
token_stats,
|
|
43
61
|
)
|
|
44
62
|
|
|
45
|
-
__version__ = "0.4.2"
|
|
63
|
+
__version__ = "0.5.0"
|
|
46
64
|
|
|
47
65
|
__all__ = [
|
|
48
66
|
# core
|
|
@@ -53,6 +71,21 @@ __all__ = [
|
|
|
53
71
|
# presets
|
|
54
72
|
"get_preset",
|
|
55
73
|
"list_presets",
|
|
74
|
+
# schema
|
|
75
|
+
"Schema",
|
|
76
|
+
"Field",
|
|
77
|
+
"ValidationResult",
|
|
78
|
+
"ValidationError",
|
|
79
|
+
"validate_data",
|
|
80
|
+
"openai_chat_schema",
|
|
81
|
+
"alpaca_schema",
|
|
82
|
+
"dpo_schema",
|
|
83
|
+
"sharegpt_schema",
|
|
84
|
+
# framework
|
|
85
|
+
"CompatibilityResult",
|
|
86
|
+
"check_compatibility",
|
|
87
|
+
"detect_format",
|
|
88
|
+
"export_for",
|
|
56
89
|
# storage
|
|
57
90
|
"save_data",
|
|
58
91
|
"load_data",
|
|
@@ -18,6 +18,7 @@ Commands:
|
|
|
18
18
|
clean 数据清洗
|
|
19
19
|
run 执行 Pipeline 配置文件
|
|
20
20
|
history 显示数据血缘历史
|
|
21
|
+
validate 使用 Schema 验证数据格式
|
|
21
22
|
mcp MCP 服务管理(install/uninstall/status)
|
|
22
23
|
logs 日志查看工具使用说明
|
|
23
24
|
"""
|
|
@@ -40,6 +41,7 @@ from .cli.commands import stats as _stats
|
|
|
40
41
|
from .cli.commands import tail as _tail
|
|
41
42
|
from .cli.commands import token_stats as _token_stats
|
|
42
43
|
from .cli.commands import transform as _transform
|
|
44
|
+
from .cli.commands import validate as _validate
|
|
43
45
|
|
|
44
46
|
# 创建主应用
|
|
45
47
|
app = typer.Typer(
|
|
@@ -64,10 +66,11 @@ def sample(
|
|
|
64
66
|
by: Optional[str] = typer.Option(None, "--by", help="分层采样字段"),
|
|
65
67
|
uniform: bool = typer.Option(False, "--uniform", help="均匀采样模式"),
|
|
66
68
|
fields: Optional[str] = typer.Option(None, "--fields", "-f", help="只显示指定字段(逗号分隔)"),
|
|
69
|
+
raw: bool = typer.Option(False, "--raw", "-r", help="输出原始 JSON(不截断)"),
|
|
67
70
|
):
|
|
68
71
|
"""从数据文件中采样指定数量的数据"""
|
|
69
72
|
actual_num = num_arg if num_arg is not None else num
|
|
70
|
-
_sample(filename, actual_num, type, output, seed, by, uniform, fields)
|
|
73
|
+
_sample(filename, actual_num, type, output, seed, by, uniform, fields, raw)
|
|
71
74
|
|
|
72
75
|
|
|
73
76
|
@app.command()
|
|
@@ -77,11 +80,12 @@ def head(
|
|
|
77
80
|
num: int = typer.Option(10, "--num", "-n", help="显示数量", show_default=True),
|
|
78
81
|
output: Optional[str] = typer.Option(None, "--output", "-o", help="输出文件路径"),
|
|
79
82
|
fields: Optional[str] = typer.Option(None, "--fields", "-f", help="只显示指定字段"),
|
|
83
|
+
raw: bool = typer.Option(False, "--raw", "-r", help="输出原始 JSON(不截断)"),
|
|
80
84
|
):
|
|
81
85
|
"""显示文件的前 N 条数据"""
|
|
82
86
|
# 位置参数优先于选项参数
|
|
83
87
|
actual_num = num_arg if num_arg is not None else num
|
|
84
|
-
_head(filename, actual_num, output, fields)
|
|
88
|
+
_head(filename, actual_num, output, fields, raw)
|
|
85
89
|
|
|
86
90
|
|
|
87
91
|
@app.command()
|
|
@@ -91,11 +95,12 @@ def tail(
|
|
|
91
95
|
num: int = typer.Option(10, "--num", "-n", help="显示数量", show_default=True),
|
|
92
96
|
output: Optional[str] = typer.Option(None, "--output", "-o", help="输出文件路径"),
|
|
93
97
|
fields: Optional[str] = typer.Option(None, "--fields", "-f", help="只显示指定字段"),
|
|
98
|
+
raw: bool = typer.Option(False, "--raw", "-r", help="输出原始 JSON(不截断)"),
|
|
94
99
|
):
|
|
95
100
|
"""显示文件的后 N 条数据"""
|
|
96
101
|
# 位置参数优先于选项参数
|
|
97
102
|
actual_num = num_arg if num_arg is not None else num
|
|
98
|
-
_tail(filename, actual_num, output, fields)
|
|
103
|
+
_tail(filename, actual_num, output, fields, raw)
|
|
99
104
|
|
|
100
105
|
|
|
101
106
|
# ============ 数据转换命令 ============
|
|
@@ -208,6 +213,26 @@ def history(
|
|
|
208
213
|
_history(filename, json)
|
|
209
214
|
|
|
210
215
|
|
|
216
|
+
# ============ 验证命令 ============
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
@app.command()
|
|
220
|
+
def validate(
|
|
221
|
+
filename: str = typer.Argument(..., help="输入文件路径"),
|
|
222
|
+
preset: Optional[str] = typer.Option(
|
|
223
|
+
None, "--preset", "-p", help="预设 Schema: openai_chat, alpaca, dpo, sharegpt"
|
|
224
|
+
),
|
|
225
|
+
output: Optional[str] = typer.Option(None, "--output", "-o", help="输出有效数据的文件路径"),
|
|
226
|
+
filter: bool = typer.Option(
|
|
227
|
+
False, "--filter", "-f", help="过滤无效数据并保存"
|
|
228
|
+
),
|
|
229
|
+
max_errors: int = typer.Option(20, "--max-errors", help="最多显示的错误数量"),
|
|
230
|
+
verbose: bool = typer.Option(False, "--verbose", "-v", help="显示详细信息"),
|
|
231
|
+
):
|
|
232
|
+
"""使用预设 Schema 验证数据格式"""
|
|
233
|
+
_validate(filename, preset, output, filter, max_errors, verbose)
|
|
234
|
+
|
|
235
|
+
|
|
211
236
|
# ============ 工具命令 ============
|
|
212
237
|
|
|
213
238
|
|