dtflow 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtflow-0.4.0/.gitignore +61 -0
- dtflow-0.4.0/PKG-INFO +580 -0
- dtflow-0.4.0/README.md +504 -0
- dtflow-0.4.0/dtflow/__init__.py +94 -0
- dtflow-0.4.0/dtflow/__main__.py +316 -0
- dtflow-0.4.0/dtflow/cli/__init__.py +33 -0
- dtflow-0.4.0/dtflow/cli/commands.py +2469 -0
- dtflow-0.4.0/dtflow/converters.py +780 -0
- dtflow-0.4.0/dtflow/core.py +850 -0
- dtflow-0.4.0/dtflow/lineage.py +410 -0
- dtflow-0.4.0/dtflow/mcp/__init__.py +29 -0
- dtflow-0.4.0/dtflow/mcp/__main__.py +18 -0
- dtflow-0.4.0/dtflow/mcp/cli.py +388 -0
- dtflow-0.4.0/dtflow/mcp/docs.py +416 -0
- dtflow-0.4.0/dtflow/mcp/server.py +153 -0
- dtflow-0.4.0/dtflow/pipeline.py +460 -0
- dtflow-0.4.0/dtflow/presets.py +188 -0
- dtflow-0.4.0/dtflow/storage/__init__.py +21 -0
- dtflow-0.4.0/dtflow/storage/io.py +710 -0
- dtflow-0.4.0/dtflow/streaming.py +656 -0
- dtflow-0.4.0/dtflow/tokenizers.py +562 -0
- dtflow-0.4.0/dtflow/utils/__init__.py +23 -0
- dtflow-0.4.0/dtflow/utils/display.py +197 -0
- dtflow-0.4.0/dtflow/utils/field_path.py +274 -0
- dtflow-0.4.0/pyproject.toml +312 -0
- dtflow-0.4.0/tests/benchmark_io.py +229 -0
- dtflow-0.4.0/tests/test_converters.py +929 -0
- dtflow-0.4.0/tests/test_field_path.py +189 -0
- dtflow-0.4.0/tests/test_io.py +572 -0
- dtflow-0.4.0/tests/test_lineage.py +460 -0
- dtflow-0.4.0/tests/test_pipeline.py +658 -0
- dtflow-0.4.0/tests/test_streaming.py +281 -0
- dtflow-0.4.0/tests/test_tokenizers.py +623 -0
- dtflow-0.4.0/tests/test_transformer.py +614 -0
dtflow-0.4.0/.gitignore
ADDED
@@ -0,0 +1,61 @@
```
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
/data
/analysis*


eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual environments
venv/
env/
ENV/
.venv

# IDEs
.vscode/
.idea/
*.swp
*.swo
*~

# Testing
.pytest_cache/
.coverage
htmlcov/
.tox/

# Data files
data/
*.jsonl
*.csv
*.parquet

# OS
.DS_Store
Thumbs.db

# Jupyter
.ipynb_checkpoints/
*.ipynb

# Logs
*.log
```

dtflow-0.4.0/PKG-INFO
ADDED
@@ -0,0 +1,580 @@
```
Metadata-Version: 2.4
Name: dtflow
Version: 0.4.0
Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
Project-URL: Homepage, https://github.com/yourusername/DataTransformer
Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
Project-URL: Repository, https://github.com/yourusername/DataTransformer
Project-URL: Issues, https://github.com/yourusername/DataTransformer/issues
Project-URL: Changelog, https://github.com/yourusername/DataTransformer/blob/main/CHANGELOG.md
Author-email: Your Name <your.email@example.com>
Maintainer-email: Your Name <your.email@example.com>
License-Expression: MIT
Keywords: ai,data-processing,data-transformation,machine-learning,nlp,pretrain,rlhf,sft
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Text Processing
Requires-Python: >=3.8
Requires-Dist: numpy>=1.20.0
Requires-Dist: orjson>=3.9.0
Requires-Dist: polars>=0.20.0
Requires-Dist: pyyaml>=5.4.0
Requires-Dist: rich>=10.0.0
Requires-Dist: typer>=0.9.0
Provides-Extra: converters
Requires-Dist: datasets>=2.0.0; extra == 'converters'
Provides-Extra: dev
Requires-Dist: black>=21.0; extra == 'dev'
Requires-Dist: flake8>=3.9.0; extra == 'dev'
Requires-Dist: isort>=5.9.0; extra == 'dev'
Requires-Dist: mypy>=0.910; extra == 'dev'
Requires-Dist: pytest-cov>=2.12.0; extra == 'dev'
Requires-Dist: pytest>=6.0.0; extra == 'dev'
Provides-Extra: display
Provides-Extra: docs
Requires-Dist: myst-parser>=0.15.0; extra == 'docs'
Requires-Dist: sphinx-rtd-theme>=0.5.0; extra == 'docs'
Requires-Dist: sphinx>=4.0.0; extra == 'docs'
Provides-Extra: full
Requires-Dist: datasets>=2.0.0; extra == 'full'
Requires-Dist: datasketch>=1.5.0; extra == 'full'
Requires-Dist: huggingface-hub>=0.20.0; extra == 'full'
Requires-Dist: pyarrow; extra == 'full'
Requires-Dist: rich>=10.0.0; extra == 'full'
Requires-Dist: scikit-learn>=0.24.0; extra == 'full'
Requires-Dist: tiktoken>=0.5.0; extra == 'full'
Requires-Dist: tokenizers>=0.15.0; extra == 'full'
Requires-Dist: toolong>=1.5.0; extra == 'full'
Provides-Extra: logs
Requires-Dist: toolong>=1.5.0; extra == 'logs'
Provides-Extra: mcp
Requires-Dist: mcp>=1.0.0; extra == 'mcp'
Provides-Extra: similarity
Requires-Dist: datasketch>=1.5.0; extra == 'similarity'
Requires-Dist: scikit-learn>=0.24.0; extra == 'similarity'
Provides-Extra: storage
Requires-Dist: pyarrow; extra == 'storage'
Provides-Extra: tokenizers
Requires-Dist: tiktoken>=0.5.0; extra == 'tokenizers'
Provides-Extra: tokenizers-hf
Requires-Dist: huggingface-hub>=0.20.0; extra == 'tokenizers-hf'
Requires-Dist: tiktoken>=0.5.0; extra == 'tokenizers-hf'
Requires-Dist: tokenizers>=0.15.0; extra == 'tokenizers-hf'
Description-Content-Type: text/markdown
```

# dtflow

A concise data-format transformation tool, designed for machine-learning training data.

## Installation

```bash
pip install dtflow

# Optional dependencies
pip install tiktoken       # Token counting (OpenAI models)
pip install transformers   # Token counting (HuggingFace models)
pip install datasets       # HuggingFace Dataset conversion
```

## Quick Start

```python
from dtflow import DataTransformer

# Load data
dt = DataTransformer.load("data.jsonl")

# Chained operations: filter -> transform -> save
(dt.filter(lambda x: x.score > 0.8)
   .to(lambda x: {"q": x.question, "a": x.answer})
   .save("output.jsonl"))
```

## Core Features

### Loading and Saving Data

```python
# Supports JSONL, JSON, CSV, Parquet, and Arrow (Polars engine, 3x faster than Pandas)
dt = DataTransformer.load("data.jsonl")
dt.save("output.jsonl")

# Create from a list
dt = DataTransformer([{"q": "question", "a": "answer"}])
```

### Filtering

```python
# Lambda-based filtering
dt.filter(lambda x: x.score > 0.8)

# Attribute access is supported
dt.filter(lambda x: x.language == "zh")
```

### Transformation

```python
# Custom transformation
dt.to(lambda x: {"question": x.q, "answer": x.a})

# Use a preset template
dt.to(preset="openai_chat", user_field="q", assistant_field="a")
```

### Preset Templates

| Preset | Output format |
|--------|---------------|
| `openai_chat` | `{"messages": [{"role": "user", ...}, {"role": "assistant", ...}]}` |
| `alpaca` | `{"instruction": ..., "input": ..., "output": ...}` |
| `sharegpt` | `{"conversations": [{"from": "human", ...}, {"from": "gpt", ...}]}` |
| `dpo_pair` | `{"prompt": ..., "chosen": ..., "rejected": ...}` |
| `simple_qa` | `{"question": ..., "answer": ...}` |

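For instance, combining a preset with the field-mapping keywords shown under Transformation turns a plain Q&A record into the `openai_chat` shape from the table above:

```python
from dtflow import DataTransformer

# {"q": "What is 2+2?", "a": "4"}  ->  {"messages": [...]}
dt = DataTransformer([{"q": "What is 2+2?", "a": "4"}])
dt.to(preset="openai_chat", user_field="q", assistant_field="a").save("chat.jsonl")
# Expected shape, per the table above:
# {"messages": [{"role": "user", "content": "What is 2+2?"},
#               {"role": "assistant", "content": "4"}]}
```
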
### Token Counting

```python
from dtflow import count_tokens, token_counter, token_filter, token_stats

# Count tokens
count = count_tokens("Hello world", model="gpt-4")

# Add a token_count field
dt.transform(token_counter("text")).save("with_tokens.jsonl")

# Filter by token length
dt.filter(token_filter("text", max_tokens=2048))
dt.filter(token_filter(["question", "answer"], min_tokens=10, max_tokens=4096))

# Token distribution statistics
stats = token_stats(dt.data, "text")
# {"total_tokens": 12345, "avg_tokens": 123, "min_tokens": 5, "max_tokens": 500, ...}
```

Both the `tiktoken` (OpenAI, default) and `transformers` backends are supported, with **automatic detection**:

```python
# OpenAI model -> uses tiktoken automatically
count_tokens("Hello", model="gpt-4")

# HuggingFace/local model -> uses transformers automatically
count_tokens("Hello", model="Qwen/Qwen2-7B")
count_tokens("Hello", model="/home/models/qwen")
```

### Messages Token Counting

Token counting designed for multi-turn conversations:

```python
from dtflow import messages_token_counter, messages_token_filter, messages_token_stats

# Add token statistics to each record
dt.transform(messages_token_counter(model="gpt-4"))                 # simple mode: total count
dt.transform(messages_token_counter(model="gpt-4", detailed=True))  # detailed mode
# Detailed-mode output: {"total": 500, "user": 200, "assistant": 280, "system": 20, "turns": 5, ...}

# Filter by token count and turn count
dt.filter(messages_token_filter(min_tokens=100, max_tokens=4096))
dt.filter(messages_token_filter(min_turns=2, max_turns=10))

# Statistics over the whole dataset
stats = messages_token_stats(dt.data, model="gpt-4")
# {"count": 1000, "total_tokens": 500000, "user_tokens": 200000, "assistant_tokens": 290000, ...}
```

### Format Converters

```python
from dtflow import (
    to_hf_dataset, from_hf_dataset,      # HuggingFace Dataset
    to_openai_batch, from_openai_batch,  # OpenAI Batch API
    to_llama_factory,                    # LLaMA-Factory Alpaca format
    to_axolotl,                          # Axolotl format
    messages_to_text,                    # messages -> plain text
)

# HuggingFace Dataset round-trip
ds = to_hf_dataset(dt.data)
ds.push_to_hub("my-dataset")

data = from_hf_dataset("tatsu-lab/alpaca", split="train")

# OpenAI Batch API
batch_input = dt.to(to_openai_batch(model="gpt-4o"))
results = from_openai_batch(batch_output)

# messages -> plain text (chatml/llama2/simple templates)
dt.transform(messages_to_text(template="chatml"))
```

### LLaMA-Factory Format

Full support for LLaMA-Factory's SFT training formats:

```python
from dtflow import (
    to_llama_factory,               # Alpaca format (single turn)
    to_llama_factory_sharegpt,      # ShareGPT format (multi-turn)
    to_llama_factory_vlm,           # VLM Alpaca format
    to_llama_factory_vlm_sharegpt,  # VLM ShareGPT format
)

# Alpaca format
dt.transform(to_llama_factory()).save("alpaca.jsonl")
# Output: {"instruction": "...", "input": "", "output": "..."}

# ShareGPT format (multi-turn conversations)
dt.transform(to_llama_factory_sharegpt()).save("sharegpt.jsonl")
# Output: {"conversations": [{"from": "human", "value": "..."}, {"from": "gpt", "value": "..."}], "system": "..."}

# VLM formats (images/video)
dt.transform(to_llama_factory_vlm(images_field="images")).save("vlm.jsonl")
# Output: {"instruction": "...", "output": "...", "images": ["/path/to/img.jpg"]}

dt.transform(to_llama_factory_vlm_sharegpt(images_field="images", videos_field="videos"))
# Output: {"conversations": [...], "images": [...], "videos": [...]}
```

### ms-swift Format

Supports ModelScope ms-swift training formats:

```python
from dtflow import (
    to_swift_messages,        # standard messages format
    to_swift_query_response,  # query-response format
    to_swift_vlm,             # VLM format
)

# messages format
dt.transform(to_swift_messages()).save("swift_messages.jsonl")
# Output: {"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}

# query-response format (history is extracted automatically)
dt.transform(to_swift_query_response(query_field="messages")).save("swift_qr.jsonl")
# Output: {"query": "...", "response": "...", "system": "...", "history": [["q1", "a1"], ...]}

# VLM format
dt.transform(to_swift_vlm(images_field="images")).save("swift_vlm.jsonl")
# Output: {"messages": [...], "images": ["/path/to/img.jpg"]}
```

### Other Operations

```python
# Sampling
dt.sample(100)  # randomly sample 100 records
dt.head(10)     # first 10 records
dt.tail(10)     # last 10 records

# Splitting
train, test = dt.split(ratio=0.8, shuffle=True, seed=42)

# Statistics
stats = dt.stats()  # record count, field info
count = dt.count(lambda x: x.score > 0.9)

# Shuffling
dt.shuffle(seed=42)
```

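Since `split` returns two transformer instances, a train/test export is just two more calls (a usage sketch built on the API above):

```python
train, test = dt.split(ratio=0.8, shuffle=True, seed=42)
train.save("train.jsonl")
test.save("test.jsonl")
```
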
## CLI Commands

```bash
# Sample data
dt sample data.jsonl --num=10
dt sample data.csv --num=100 --sample_type=head
dt sample data.jsonl 1000 --by=category      # stratified sampling
dt sample data.jsonl 1000 --by=meta.source   # stratified by a nested field
dt sample data.jsonl 1000 --by=messages.#    # stratified by message count

# Transform - preset mode
dt transform data.jsonl --preset=openai_chat
dt transform data.jsonl --preset=alpaca

# Transform - config-file mode
dt transform data.jsonl            # first run generates a config file
# Edit .dt/data.py, then run again:
dt transform data.jsonl --num=100  # run the transformation

# Run a pipeline (a reproducible processing flow)
dt run pipeline.yaml
dt run pipeline.yaml --input=new_data.jsonl --output=result.jsonl

# Token statistics
dt token-stats data.jsonl --field=messages --model=gpt-4
dt token-stats data.jsonl --field=messages[-1].content  # count only the last message
dt token-stats data.jsonl --field=text --detailed

# Compare datasets
dt diff v1/train.jsonl v2/train.jsonl
dt diff a.jsonl b.jsonl --key=id
dt diff a.jsonl b.jsonl --key=meta.uuid  # match records by a nested field

# Clean data
dt clean data.jsonl --drop-empty                        # drop records with any empty value
dt clean data.jsonl --drop-empty=text,answer            # drop records where these fields are empty
dt clean data.jsonl --drop-empty=meta.source            # drop records where a nested field is empty
dt clean data.jsonl --min-len=text:10                   # text must be at least 10 characters
dt clean data.jsonl --min-len=messages.#:2              # at least 2 messages
dt clean data.jsonl --max-len=messages[-1].content:500  # last message at most 500 characters
dt clean data.jsonl --keep=question,answer              # keep only these fields
dt clean data.jsonl --drop=metadata                     # drop the given fields
dt clean data.jsonl --strip                             # strip leading/trailing whitespace from strings

# Deduplicate
dt dedupe data.jsonl                            # exact dedupe over whole records
dt dedupe data.jsonl --key=text                 # exact dedupe by field
dt dedupe data.jsonl --key=meta.id              # dedupe by a nested field
dt dedupe data.jsonl --key=messages[0].content  # dedupe by first-message content
dt dedupe data.jsonl --key=text --similar=0.8   # similarity-based dedupe

# Concatenate files
dt concat a.jsonl b.jsonl -o merged.jsonl

# Dataset statistics
dt stats data.jsonl
```

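The `--similar=0.8` mode implies near-duplicate detection; `datasketch` is a declared optional dependency (the `similarity` extra), and a minimal standalone sketch of MinHash-based near-duplicate filtering looks like this. The shingling scheme and threshold here are illustrative choices, not dtflow's actual implementation:

```python
from datasketch import MinHash, MinHashLSH

def minhash(text: str, num_perm: int = 128) -> MinHash:
    """Hash a text into a MinHash signature over word 3-grams."""
    m = MinHash(num_perm=num_perm)
    words = text.split()
    for i in range(max(len(words) - 2, 1)):
        m.update(" ".join(words[i:i + 3]).encode("utf-8"))
    return m

def drop_near_duplicates(records, key="text", threshold=0.8):
    """Keep the first record of each near-duplicate cluster."""
    lsh = MinHashLSH(threshold=threshold, num_perm=128)
    kept = []
    for i, rec in enumerate(records):
        sig = minhash(rec[key])
        if not lsh.query(sig):  # no sufficiently similar record seen yet
            lsh.insert(str(i), sig)
            kept.append(rec)
    return kept
```
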
### Field Path Syntax

Field arguments in CLI commands support a nested path syntax for reaching deeply nested data:

| Syntax | Meaning | Example |
|--------|---------|---------|
| `a.b.c` | Nested field | `meta.source` |
| `a[0].b` | Array index | `messages[0].role` |
| `a[-1].b` | Negative index | `messages[-1].content` |
| `a.#` | Array length | `messages.#` |
| `a[*].b` | Expand all elements | `messages[*].role` |
| `a[*].b:join` | Expand, then join with `\|` | `messages[*].role:join` |
| `a[*].b:unique` | Expand, dedupe, then join | `messages[*].role:unique` |

Command arguments that accept field paths:

| Command | Argument | Example |
|---------|----------|---------|
| `sample` | `--by=` | `--by=meta.source`, `--by=messages.#` |
| `dedupe` | `--key=` | `--key=meta.id`, `--key=messages[0].content` |
| `clean` | `--drop-empty=` | `--drop-empty=meta.source` |
| `clean` | `--min-len=` | `--min-len=messages.#:2` |
| `clean` | `--max-len=` | `--max-len=messages[-1].content:500` |
| `token-stats` | `--field=` | `--field=messages[-1].content` |
| `diff` | `--key=` | `--key=meta.uuid` |

Example data:
```json
{"meta": {"source": "wiki"}, "messages": [{"role": "user", "content": "hi"}, {"role": "assistant", "content": "hello"}]}
```

- `meta.source` → `"wiki"`
- `messages[0].role` → `"user"`
- `messages[-1].content` → `"hello"`
- `messages.#` → `2`
- `messages[*].role` → `"user"` (takes the first element by default)
- `messages[*].role:join` → `"user|assistant"`

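For intuition, here is a toy resolver for the dotted/indexed part of this grammar. dtflow's real implementation lives in `dtflow/utils/field_path.py`; this simplified sketch skips `[*]`, `:join`, and `:unique`:

```python
import re

def resolve(record, path):
    """Resolve paths like 'meta.source', 'messages[0].role', 'messages.#'."""
    value = record
    for part in path.split("."):
        if part == "#":  # array length
            return len(value)
        m = re.fullmatch(r"(\w+)\[(-?\d+)\]", part)
        if m:            # indexed access, e.g. messages[-1]
            value = value[m.group(1)][int(m.group(2))]
        else:            # plain key
            value = value[part]
    return value

rec = {"meta": {"source": "wiki"},
       "messages": [{"role": "user", "content": "hi"},
                    {"role": "assistant", "content": "hello"}]}
assert resolve(rec, "meta.source") == "wiki"
assert resolve(rec, "messages[-1].content") == "hello"
assert resolve(rec, "messages.#") == 2
```
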
### Pipeline Configuration

Define reproducible data-processing flows in a YAML config file:

```yaml
# pipeline.yaml
version: "1.0"
seed: 42
input: raw_data.jsonl
output: processed.jsonl

steps:
  - type: filter
    condition: "score > 0.5"

  - type: filter
    condition: "len(text) > 10"

  - type: transform
    preset: openai_chat
    params:
      user_field: q
      assistant_field: a

  - type: dedupe
    key: text
```

Supported step types:

| Step | Parameters | Description |
|------|------------|-------------|
| `filter` | `condition` | Conditional filtering: `score > 0.5`, `len(text) > 10`, `field is not empty` |
| `transform` | `preset`, `params` | Format conversion using a preset template |
| `dedupe` | `key`, `similar` | Deduplication, exact or similarity-based |
| `sample` | `num`, `seed` | Random sampling |
| `head` | `num` | First N records |
| `tail` | `num` | Last N records |
| `shuffle` | `seed` | Shuffle order |
| `split` | `ratio`, `seed` | Dataset splitting |

Run the pipeline:

```bash
dt run pipeline.yaml
dt run pipeline.yaml --input=new_data.jsonl  # override the input file
```

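Conceptually, `dt run` maps each step onto the corresponding data operation in order. A stripped-down sketch of that dispatch, covering only `filter`, `dedupe`, `sample`, and `shuffle` and standing in `eval()` for dtflow's condition mini-language (this is not dtflow's source):

```python
import json
import random

import yaml

def run_pipeline(config_path):
    with open(config_path, encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    with open(cfg["input"], encoding="utf-8") as f:
        records = [json.loads(line) for line in f]

    for step in cfg["steps"]:
        kind = step["type"]
        if kind == "filter":
            # dtflow parses condition strings like "score > 0.5" itself;
            # eval() over the record dict stands in for that parser here.
            records = [r for r in records
                       if eval(step["condition"], {"len": len}, dict(r))]
        elif kind == "dedupe":
            seen, deduped = set(), []
            for r in records:
                if r[step["key"]] not in seen:
                    seen.add(r[step["key"]])
                    deduped.append(r)
            records = deduped
        elif kind == "sample":
            random.seed(step.get("seed", cfg.get("seed")))
            records = random.sample(records, step["num"])
        elif kind == "shuffle":
            random.seed(step.get("seed", cfg.get("seed")))
            random.shuffle(records)

    with open(cfg["output"], "w", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
```
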
### Data Lineage Tracking

Records the complete history of data processing, for reproducibility and tracing problems back to their source:

```python
# Enable lineage tracking
dt = DataTransformer.load("raw.jsonl", track_lineage=True)

# Process data as usual
result = (dt
    .filter(lambda x: x.score > 0.5)
    .transform(lambda x: {"q": x.q, "a": x.a})
    .dedupe("q")
)

# Record lineage on save
result.save("processed.jsonl", lineage=True)
# Automatically generates processed.jsonl.lineage.json
```

View the lineage history:

```bash
dt history processed.jsonl
# Output:
# 📊 Data lineage report: processed.jsonl
# └─ Version 1
#    Source: raw.jsonl
#    Operation chain:
#    ├─ filter: 1000 → 800
#    ├─ transform: 800 → 800
#    └─ dedupe: 800 → 750
#    Output count: 750

dt history processed.jsonl --json  # JSON-formatted output
```

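The `.lineage.json` sidecar is plain JSON, so it can also be inspected programmatically. A small sketch; the field names here are assumptions inferred from the report above, not a documented schema:

```python
import json

# Hypothetical schema inferred from the `dt history` report above.
with open("processed.jsonl.lineage.json", encoding="utf-8") as f:
    lineage = json.load(f)

for op in lineage.get("operations", []):
    print(f'{op["type"]}: {op["input_count"]} -> {op["output_count"]}')
```
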
### Streaming for Large Files

A streaming interface designed for very large files, with O(1) memory usage; supports the JSONL, CSV, Parquet, and Arrow formats:

```python
from dtflow import load_stream, load_sharded

# Stream-load and process (even a 100 GB file uses constant memory)
(load_stream("huge_100gb.jsonl")
    .filter(lambda x: x["score"] > 0.5)
    .transform(lambda x: {"text": x["content"]})
    .save("output.jsonl"))

# Cross-format conversion (CSV → Parquet)
(load_stream("data.csv")
    .filter(lambda x: x["score"] > 0.5)
    .save("output.parquet"))

# Load sharded files (multiple formats supported)
(load_sharded("data/train_*.parquet")
    .filter(lambda x: len(x["text"]) > 10)
    .save("merged.jsonl"))

# Sharded saving
(load_stream("huge.jsonl")
    .transform(lambda x: {"q": x["question"], "a": x["answer"]})
    .save_sharded("output/", shard_size=100000))
# Generates: output/part-00000.jsonl, output/part-00001.jsonl, ...

# Batch processing (for workloads that call an API in batches)
for batch in load_stream("data.jsonl").batch(1000):
    results = call_api(batch)  # process one batch at a time
```

Highlights:
- **Lazy execution**: `filter`/`transform` do not run immediately; they only execute on `save`/`collect`
- **O(1) memory**: memory usage stays constant no matter how large the file is (on the read side)
- **Multi-format**: JSONL, CSV, Parquet, and Arrow all stream
- **Cross-format conversion**: read from CSV and save straight to Parquet, and so on
- **Shard support**: glob patterns load multiple shards and merge them transparently

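This kind of lazy, constant-memory pipeline is essentially chained generators. A self-contained sketch of the underlying pattern (not dtflow's actual internals):

```python
import json

def read_jsonl(path):
    """Yield one parsed record at a time; nothing is held in memory."""
    with open(path, encoding="utf-8") as f:
        for line in f:
            yield json.loads(line)

class Stream:
    def __init__(self, it):
        self._it = it

    def filter(self, pred):
        return Stream(x for x in self._it if pred(x))

    def transform(self, fn):
        return Stream(fn(x) for x in self._it)

    def save(self, path):
        # Only here does the chain actually execute, record by record.
        with open(path, "w", encoding="utf-8") as f:
            for x in self._it:
                f.write(json.dumps(x, ensure_ascii=False) + "\n")

# Stream(read_jsonl("huge.jsonl")).filter(...).transform(...).save("out.jsonl")
```
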
## Error Handling

```python
# Skip failing items (the default)
dt.to(transform_func, on_error="skip")

# Raise the exception
dt.to(transform_func, on_error="raise")

# Keep the original record
dt.to(transform_func, on_error="keep")

# Also return error details
result, errors = dt.to(transform_func, return_errors=True)
```

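The three `on_error` policies boil down to a per-record try/except. A minimal sketch of that dispatch (illustrative, not dtflow's source):

```python
def apply_with_policy(records, fn, on_error="skip"):
    """Apply fn to each record under a skip/raise/keep error policy."""
    out, errors = [], []
    for i, rec in enumerate(records):
        try:
            out.append(fn(rec))
        except Exception as e:
            if on_error == "raise":
                raise
            if on_error == "keep":
                out.append(rec)  # fall back to the untransformed original
            # under "skip", the record is simply dropped
            errors.append({"index": i, "error": str(e)})  # for return_errors=True
    return out, errors
```
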
## Design Philosophy

### Functions over Class Hierarchies

No need for complex OOP abstractions; solve the problem directly with functions:

```python
# ✅ Simple and direct
dt.to(lambda x: {"q": x.question, "a": x.answer})

# ❌ No need for this kind of design
class MyFormatter(BaseFormatter):
    def format(self, item): ...
```

### Presets Are a Convenience Layer, Not a Core Abstraction

90% of needs are covered by `transform(lambda x: ...)`. Presets are just shortcuts for common cases:

```python
# Preset: a convenience function for common cases
dt.to(preset="openai_chat")

# Custom: full control over the transformation logic
dt.to(lambda x: {
    "messages": [
        {"role": "user", "content": x.q},
        {"role": "assistant", "content": x.a}
    ]
})
```

### The KISS Principle

- One core class, `DataTransformer`, covers every operation
- A chained API that reads like natural language
- Attribute access `x.field` instead of `x["field"]`
- No over-engineering, no chasing an "extensible framework"

### Pragmatism

No pursuit of academically perfect abstractions; just **tools that are good enough to use**.

## License

MIT