dtflow 0.4.3__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtflow/__init__.py +34 -1
- dtflow/__main__.py +22 -0
- dtflow/cli/commands.py +5 -0
- dtflow/cli/common.py +13 -9
- dtflow/cli/stats.py +114 -36
- dtflow/cli/validate.py +152 -0
- dtflow/core.py +220 -10
- dtflow/framework.py +610 -0
- dtflow/lineage.py +17 -0
- dtflow/schema.py +508 -0
- dtflow/streaming.py +93 -35
- dtflow/tokenizers.py +84 -29
- dtflow/utils/field_path.py +6 -2
- {dtflow-0.4.3.dist-info → dtflow-0.5.2.dist-info}/METADATA +117 -2
- {dtflow-0.4.3.dist-info → dtflow-0.5.2.dist-info}/RECORD +17 -14
- {dtflow-0.4.3.dist-info → dtflow-0.5.2.dist-info}/WHEEL +0 -0
- {dtflow-0.4.3.dist-info → dtflow-0.5.2.dist-info}/entry_points.txt +0 -0
dtflow/tokenizers.py
CHANGED
|
@@ -210,7 +210,10 @@ def token_counter(
|
|
|
210
210
|
创建 token 计数转换函数。
|
|
211
211
|
|
|
212
212
|
Args:
|
|
213
|
-
fields:
|
|
213
|
+
fields: 要统计的字段(单个或多个),支持嵌套路径语法
|
|
214
|
+
- 简单字段: "text"
|
|
215
|
+
- 嵌套字段: "meta.content", "data.text"
|
|
216
|
+
- 索引: "messages[0].content", "messages[-1].content"
|
|
214
217
|
model: 模型名称或别名,如 "qwen2.5", "gpt-4", "llama3" 等
|
|
215
218
|
backend: 后端选择,None 则自动检测
|
|
216
219
|
output_field: 输出字段名
|
|
@@ -221,6 +224,7 @@ def token_counter(
|
|
|
221
224
|
Examples:
|
|
222
225
|
>>> dt.transform(token_counter("text"))
|
|
223
226
|
>>> dt.transform(token_counter(["question", "answer"], model="qwen3"))
|
|
227
|
+
>>> dt.transform(token_counter("messages[-1].content")) # 最后一条消息
|
|
224
228
|
"""
|
|
225
229
|
if isinstance(fields, str):
|
|
226
230
|
fields = [fields]
|
|
@@ -229,7 +233,7 @@ def token_counter(
|
|
|
229
233
|
result = item.to_dict() if hasattr(item, "to_dict") else dict(item)
|
|
230
234
|
total = 0
|
|
231
235
|
for field in fields:
|
|
232
|
-
value = item
|
|
236
|
+
value = get_field_with_spec(item, field, default="")
|
|
233
237
|
if value:
|
|
234
238
|
total += count_tokens(str(value), model=model, backend=backend)
|
|
235
239
|
result[output_field] = total
|
|
@@ -249,7 +253,10 @@ def token_filter(
|
|
|
249
253
|
创建基于 token 长度的过滤函数。
|
|
250
254
|
|
|
251
255
|
Args:
|
|
252
|
-
fields:
|
|
256
|
+
fields: 要统计的字段(单个或多个),支持嵌套路径语法
|
|
257
|
+
- 简单字段: "text"
|
|
258
|
+
- 嵌套字段: "meta.content", "data.text"
|
|
259
|
+
- 索引: "messages[0].content", "messages[-1].content"
|
|
253
260
|
min_tokens: 最小 token 数(包含)
|
|
254
261
|
max_tokens: 最大 token 数(包含)
|
|
255
262
|
model: 模型名称
|
|
@@ -261,6 +268,7 @@ def token_filter(
|
|
|
261
268
|
Examples:
|
|
262
269
|
>>> dt.filter(token_filter("text", min_tokens=10, max_tokens=512))
|
|
263
270
|
>>> dt.filter(token_filter(["q", "a"], max_tokens=2048))
|
|
271
|
+
>>> dt.filter(token_filter("messages[-1].content", max_tokens=1024))
|
|
264
272
|
"""
|
|
265
273
|
if isinstance(fields, str):
|
|
266
274
|
fields = [fields]
|
|
@@ -268,7 +276,7 @@ def token_filter(
|
|
|
268
276
|
def filter_func(item) -> bool:
|
|
269
277
|
total = 0
|
|
270
278
|
for field in fields:
|
|
271
|
-
value = item
|
|
279
|
+
value = get_field_with_spec(item, field, default="")
|
|
272
280
|
if value:
|
|
273
281
|
total += count_tokens(str(value), model=model, backend=backend)
|
|
274
282
|
|
|
@@ -281,11 +289,32 @@ def token_filter(
|
|
|
281
289
|
return filter_func
|
|
282
290
|
|
|
283
291
|
|
|
292
|
+
def _percentile(sorted_data: List[int], p: float) -> int:
|
|
293
|
+
"""计算百分位数"""
|
|
294
|
+
n = len(sorted_data)
|
|
295
|
+
if n == 0:
|
|
296
|
+
return 0
|
|
297
|
+
idx = (n - 1) * p / 100
|
|
298
|
+
lower = int(idx)
|
|
299
|
+
upper = min(lower + 1, n - 1)
|
|
300
|
+
weight = idx - lower
|
|
301
|
+
return int(sorted_data[lower] * (1 - weight) + sorted_data[upper] * weight)
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def _std(counts: List[int], avg: float) -> float:
|
|
305
|
+
"""计算标准差"""
|
|
306
|
+
if len(counts) < 2:
|
|
307
|
+
return 0.0
|
|
308
|
+
variance = sum((x - avg) ** 2 for x in counts) / len(counts)
|
|
309
|
+
return variance**0.5
|
|
310
|
+
|
|
311
|
+
|
|
284
312
|
def token_stats(
|
|
285
313
|
data: List[Dict[str, Any]],
|
|
286
314
|
fields: Union[str, List[str]],
|
|
287
315
|
model: str = DEFAULT_MODEL,
|
|
288
316
|
backend: Optional[str] = None,
|
|
317
|
+
progress_callback: Optional[Callable[[int, int], None]] = None,
|
|
289
318
|
) -> Dict[str, Any]:
|
|
290
319
|
"""
|
|
291
320
|
统计数据集的 token 信息。
|
|
@@ -295,9 +324,17 @@ def token_stats(
|
|
|
295
324
|
fields: 要统计的字段,支持嵌套路径语法(如 meta.text, messages[-1].content)
|
|
296
325
|
model: 模型名称或别名,如 "qwen2.5", "gpt-4" 等
|
|
297
326
|
backend: 后端选择,None 则自动检测
|
|
327
|
+
progress_callback: 进度回调函数,接收 (current, total) 两个参数
|
|
298
328
|
|
|
299
329
|
Returns:
|
|
300
|
-
|
|
330
|
+
统计信息字典,包含:
|
|
331
|
+
- total_tokens: 总 token 数
|
|
332
|
+
- count: 样本数
|
|
333
|
+
- avg_tokens: 平均 token 数
|
|
334
|
+
- std_tokens: 标准差
|
|
335
|
+
- min_tokens, max_tokens: 最小/最大值
|
|
336
|
+
- median_tokens: 中位数 (p50)
|
|
337
|
+
- p25, p75, p90, p95, p99: 百分位数
|
|
301
338
|
"""
|
|
302
339
|
if isinstance(fields, str):
|
|
303
340
|
fields = [fields]
|
|
@@ -306,21 +343,33 @@ def token_stats(
|
|
|
306
343
|
return {"total_tokens": 0, "count": 0}
|
|
307
344
|
|
|
308
345
|
counts = []
|
|
309
|
-
|
|
346
|
+
total_items = len(data)
|
|
347
|
+
for i, item in enumerate(data):
|
|
310
348
|
total = 0
|
|
311
349
|
for field in fields:
|
|
312
350
|
value = get_field_with_spec(item, field, default="")
|
|
313
351
|
if value:
|
|
314
352
|
total += count_tokens(str(value), model=model, backend=backend)
|
|
315
353
|
counts.append(total)
|
|
354
|
+
if progress_callback:
|
|
355
|
+
progress_callback(i + 1, total_items)
|
|
356
|
+
|
|
357
|
+
sorted_counts = sorted(counts)
|
|
358
|
+
avg = sum(counts) / len(counts)
|
|
316
359
|
|
|
317
360
|
return {
|
|
318
361
|
"total_tokens": sum(counts),
|
|
319
362
|
"count": len(counts),
|
|
320
|
-
"avg_tokens":
|
|
363
|
+
"avg_tokens": avg,
|
|
364
|
+
"std_tokens": _std(counts, avg),
|
|
321
365
|
"min_tokens": min(counts),
|
|
322
366
|
"max_tokens": max(counts),
|
|
323
|
-
"median_tokens":
|
|
367
|
+
"median_tokens": _percentile(sorted_counts, 50),
|
|
368
|
+
"p25": _percentile(sorted_counts, 25),
|
|
369
|
+
"p75": _percentile(sorted_counts, 75),
|
|
370
|
+
"p90": _percentile(sorted_counts, 90),
|
|
371
|
+
"p95": _percentile(sorted_counts, 95),
|
|
372
|
+
"p99": _percentile(sorted_counts, 99),
|
|
324
373
|
}
|
|
325
374
|
|
|
326
375
|
|
|
@@ -504,6 +553,7 @@ def messages_token_stats(
|
|
|
504
553
|
messages_field: str = "messages",
|
|
505
554
|
model: str = DEFAULT_MODEL,
|
|
506
555
|
backend: Optional[str] = None,
|
|
556
|
+
progress_callback: Optional[Callable[[int, int], None]] = None,
|
|
507
557
|
) -> Dict[str, Any]:
|
|
508
558
|
"""
|
|
509
559
|
统计数据集中 messages 的 token 信息。
|
|
@@ -513,25 +563,18 @@ def messages_token_stats(
|
|
|
513
563
|
messages_field: messages 字段名,支持嵌套路径语法(如 conversation.messages)
|
|
514
564
|
model: 模型名称或别名
|
|
515
565
|
backend: 后端,None 则自动检测
|
|
566
|
+
progress_callback: 进度回调函数,接收 (current, total) 两个参数
|
|
516
567
|
|
|
517
568
|
Returns:
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
"user_tokens": 200000,
|
|
528
|
-
"assistant_tokens": 290000,
|
|
529
|
-
"system_tokens": 10000,
|
|
530
|
-
"avg_tokens": 500,
|
|
531
|
-
"max_tokens": 2048,
|
|
532
|
-
"min_tokens": 50,
|
|
533
|
-
"avg_turns": 4,
|
|
534
|
-
}
|
|
569
|
+
统计信息字典,包含:
|
|
570
|
+
- count: 样本数
|
|
571
|
+
- total_tokens: 总 token 数
|
|
572
|
+
- user_tokens, assistant_tokens, system_tokens: 各角色 token 数
|
|
573
|
+
- avg_tokens, std_tokens: 平均值和标准差
|
|
574
|
+
- min_tokens, max_tokens: 最小/最大值
|
|
575
|
+
- median_tokens: 中位数
|
|
576
|
+
- p25, p75, p90, p95, p99: 百分位数
|
|
577
|
+
- avg_turns: 平均对话轮数
|
|
535
578
|
"""
|
|
536
579
|
_backend = backend or _auto_backend(model)
|
|
537
580
|
|
|
@@ -539,24 +582,36 @@ def messages_token_stats(
|
|
|
539
582
|
return {"count": 0, "total_tokens": 0}
|
|
540
583
|
|
|
541
584
|
all_stats = []
|
|
542
|
-
|
|
585
|
+
total_items = len(data)
|
|
586
|
+
for i, item in enumerate(data):
|
|
543
587
|
messages = get_field_with_spec(item, messages_field, default=[])
|
|
544
588
|
if messages:
|
|
545
589
|
all_stats.append(_count_messages_tokens(messages, model=model, backend=_backend))
|
|
590
|
+
if progress_callback:
|
|
591
|
+
progress_callback(i + 1, total_items)
|
|
546
592
|
|
|
547
593
|
if not all_stats:
|
|
548
594
|
return {"count": 0, "total_tokens": 0}
|
|
549
595
|
|
|
550
596
|
totals = [s["total"] for s in all_stats]
|
|
597
|
+
sorted_totals = sorted(totals)
|
|
598
|
+
avg = sum(totals) / len(totals)
|
|
599
|
+
|
|
551
600
|
return {
|
|
552
601
|
"count": len(all_stats),
|
|
553
602
|
"total_tokens": sum(totals),
|
|
554
603
|
"user_tokens": sum(s["user"] for s in all_stats),
|
|
555
604
|
"assistant_tokens": sum(s["assistant"] for s in all_stats),
|
|
556
605
|
"system_tokens": sum(s["system"] for s in all_stats),
|
|
557
|
-
"avg_tokens":
|
|
558
|
-
"
|
|
606
|
+
"avg_tokens": int(avg),
|
|
607
|
+
"std_tokens": _std(totals, avg),
|
|
559
608
|
"min_tokens": min(totals),
|
|
560
|
-
"
|
|
609
|
+
"max_tokens": max(totals),
|
|
610
|
+
"median_tokens": _percentile(sorted_totals, 50),
|
|
611
|
+
"p25": _percentile(sorted_totals, 25),
|
|
612
|
+
"p75": _percentile(sorted_totals, 75),
|
|
613
|
+
"p90": _percentile(sorted_totals, 90),
|
|
614
|
+
"p95": _percentile(sorted_totals, 95),
|
|
615
|
+
"p99": _percentile(sorted_totals, 99),
|
|
561
616
|
"avg_turns": sum(s["turns"] for s in all_stats) // len(all_stats),
|
|
562
617
|
}
|
dtflow/utils/field_path.py
CHANGED
|
@@ -96,7 +96,9 @@ def _parse_path(path: str) -> List[Union[str, int, Literal["*", "#"]]]:
|
|
|
96
96
|
continue
|
|
97
97
|
|
|
98
98
|
# 解析 field[index] 格式
|
|
99
|
-
match = re.match(
|
|
99
|
+
match = re.match(
|
|
100
|
+
r"([a-zA-Z_\u4e00-\u9fff][a-zA-Z0-9_\u4e00-\u9fff]*)?(?:\[(-?\d+|\*)\])?", part
|
|
101
|
+
)
|
|
100
102
|
if match:
|
|
101
103
|
field_name, index = match.groups()
|
|
102
104
|
|
|
@@ -175,10 +177,12 @@ def _get_value_by_segments(
|
|
|
175
177
|
|
|
176
178
|
return values
|
|
177
179
|
|
|
178
|
-
#
|
|
180
|
+
# 字典字段访问(支持 dict 和类 dict 对象如 DictWrapper)
|
|
179
181
|
if isinstance(seg, str):
|
|
180
182
|
if isinstance(current, dict):
|
|
181
183
|
current = current.get(seg)
|
|
184
|
+
elif hasattr(current, "get"):
|
|
185
|
+
current = current.get(seg)
|
|
182
186
|
else:
|
|
183
187
|
return None
|
|
184
188
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dtflow
|
|
3
|
-
Version: 0.4.3
|
|
3
|
+
Version: 0.5.2
|
|
4
4
|
Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
|
|
5
5
|
Project-URL: Homepage, https://github.com/yourusername/DataTransformer
|
|
6
6
|
Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
|
|
@@ -32,16 +32,26 @@ Requires-Dist: orjson>=3.9.0
|
|
|
32
32
|
Requires-Dist: polars>=0.20.0
|
|
33
33
|
Requires-Dist: pyyaml>=5.4.0
|
|
34
34
|
Requires-Dist: rich>=10.0.0
|
|
35
|
+
Requires-Dist: tiktoken>=0.5.0
|
|
35
36
|
Requires-Dist: typer>=0.9.0
|
|
36
37
|
Provides-Extra: converters
|
|
37
38
|
Requires-Dist: datasets>=2.0.0; extra == 'converters'
|
|
38
39
|
Provides-Extra: dev
|
|
39
40
|
Requires-Dist: black>=21.0; extra == 'dev'
|
|
41
|
+
Requires-Dist: datasets>=2.0.0; extra == 'dev'
|
|
42
|
+
Requires-Dist: datasketch>=1.5.0; extra == 'dev'
|
|
40
43
|
Requires-Dist: flake8>=3.9.0; extra == 'dev'
|
|
44
|
+
Requires-Dist: huggingface-hub>=0.20.0; extra == 'dev'
|
|
41
45
|
Requires-Dist: isort>=5.9.0; extra == 'dev'
|
|
42
46
|
Requires-Dist: mypy>=0.910; extra == 'dev'
|
|
47
|
+
Requires-Dist: pyarrow; extra == 'dev'
|
|
43
48
|
Requires-Dist: pytest-cov>=2.12.0; extra == 'dev'
|
|
44
49
|
Requires-Dist: pytest>=6.0.0; extra == 'dev'
|
|
50
|
+
Requires-Dist: rich>=10.0.0; extra == 'dev'
|
|
51
|
+
Requires-Dist: scikit-learn>=0.24.0; extra == 'dev'
|
|
52
|
+
Requires-Dist: tiktoken>=0.5.0; extra == 'dev'
|
|
53
|
+
Requires-Dist: tokenizers>=0.15.0; extra == 'dev'
|
|
54
|
+
Requires-Dist: toolong>=1.5.0; extra == 'dev'
|
|
45
55
|
Provides-Extra: display
|
|
46
56
|
Provides-Extra: docs
|
|
47
57
|
Requires-Dist: myst-parser>=0.15.0; extra == 'docs'
|
|
@@ -129,7 +139,7 @@ dt.filter(lambda x: x.language == "zh")
|
|
|
129
139
|
### 数据验证
|
|
130
140
|
|
|
131
141
|
```python
|
|
132
|
-
#
|
|
142
|
+
# 简单验证,返回不通过的记录列表
|
|
133
143
|
errors = dt.validate(lambda x: len(x.messages) >= 2)
|
|
134
144
|
|
|
135
145
|
if errors:
|
|
@@ -137,6 +147,53 @@ if errors:
|
|
|
137
147
|
print(f"第 {e.index} 行: {e.error}")
|
|
138
148
|
```
|
|
139
149
|
|
|
150
|
+
### Schema 验证
|
|
151
|
+
|
|
152
|
+
使用 Schema 进行结构化数据验证:
|
|
153
|
+
|
|
154
|
+
```python
|
|
155
|
+
from dtflow import Schema, Field, openai_chat_schema
|
|
156
|
+
|
|
157
|
+
# 使用预设 Schema
|
|
158
|
+
result = dt.validate_schema(openai_chat_schema)
|
|
159
|
+
print(result) # ValidationResult(valid=950, invalid=50, errors=[...])
|
|
160
|
+
|
|
161
|
+
# 自定义 Schema
|
|
162
|
+
schema = Schema({
|
|
163
|
+
"messages": Field(type="list", required=True, min_length=1),
|
|
164
|
+
"messages[*].role": Field(type="str", choices=["user", "assistant", "system"]),
|
|
165
|
+
"messages[*].content": Field(type="str", min_length=1),
|
|
166
|
+
"score": Field(type="float", min=0, max=1),
|
|
167
|
+
})
|
|
168
|
+
|
|
169
|
+
result = dt.validate_schema(schema)
|
|
170
|
+
|
|
171
|
+
# 过滤出有效数据
|
|
172
|
+
valid_dt = dt.validate_schema(schema, filter_invalid=True)
|
|
173
|
+
valid_dt.save("valid.jsonl")
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
**预设 Schema**:
|
|
177
|
+
|
|
178
|
+
| Schema 名称 | 用途 |
|
|
179
|
+
|------------|------|
|
|
180
|
+
| `openai_chat_schema` | OpenAI messages 格式验证 |
|
|
181
|
+
| `alpaca_schema` | Alpaca instruction/output 格式 |
|
|
182
|
+
| `sharegpt_schema` | ShareGPT conversations 格式 |
|
|
183
|
+
| `dpo_schema` | DPO prompt/chosen/rejected 格式 |
|
|
184
|
+
|
|
185
|
+
**Field 参数**:
|
|
186
|
+
|
|
187
|
+
| 参数 | 说明 | 示例 |
|
|
188
|
+
|------|------|------|
|
|
189
|
+
| `type` | 类型验证 | `"str"`, `"int"`, `"float"`, `"bool"`, `"list"`, `"dict"` |
|
|
190
|
+
| `required` | 是否必填 | `True` / `False` |
|
|
191
|
+
| `min` / `max` | 数值范围 | `min=0, max=1` |
|
|
192
|
+
| `min_length` / `max_length` | 长度范围 | `min_length=1` |
|
|
193
|
+
| `choices` | 枚举值 | `choices=["user", "assistant"]` |
|
|
194
|
+
| `pattern` | 正则匹配 | `pattern=r"^\d{4}-\d{2}-\d{2}$"` |
|
|
195
|
+
| `custom` | 自定义验证 | `custom=lambda x: x > 0` |
|
|
196
|
+
|
|
140
197
|
### 数据转换
|
|
141
198
|
|
|
142
199
|
```python
|
|
@@ -286,6 +343,58 @@ dt.transform(to_swift_vlm(images_field="images")).save("swift_vlm.jsonl")
|
|
|
286
343
|
# 输出: {"messages": [...], "images": ["/path/to/img.jpg"]}
|
|
287
344
|
```
|
|
288
345
|
|
|
346
|
+
### 训练框架一键导出
|
|
347
|
+
|
|
348
|
+
将数据导出为目标训练框架可直接使用的格式,自动生成配置文件:
|
|
349
|
+
|
|
350
|
+
```python
|
|
351
|
+
from dtflow import DataTransformer
|
|
352
|
+
|
|
353
|
+
dt = DataTransformer.load("data.jsonl")
|
|
354
|
+
|
|
355
|
+
# 1. 检查框架兼容性
|
|
356
|
+
result = dt.check_compatibility("llama-factory")
|
|
357
|
+
print(result)
|
|
358
|
+
# ✅ 兼容 - LLaMA-Factory (openai_chat)
|
|
359
|
+
# 或
|
|
360
|
+
# ❌ 不兼容 - 错误: xxx
|
|
361
|
+
|
|
362
|
+
# 2. 一键导出到 LLaMA-Factory
|
|
363
|
+
files = dt.export_for("llama-factory", "./llama_ready/")
|
|
364
|
+
# 生成文件:
|
|
365
|
+
# - ./llama_ready/custom_dataset.json # 数据文件
|
|
366
|
+
# - ./llama_ready/dataset_info.json # 数据集配置
|
|
367
|
+
# - ./llama_ready/train_args.yaml # 训练参数模板
|
|
368
|
+
|
|
369
|
+
# 3. 导出到 ms-swift
|
|
370
|
+
files = dt.export_for("swift", "./swift_ready/")
|
|
371
|
+
# 生成: data.jsonl + train_swift.sh
|
|
372
|
+
|
|
373
|
+
# 4. 导出到 Axolotl
|
|
374
|
+
files = dt.export_for("axolotl", "./axolotl_ready/")
|
|
375
|
+
# 生成: data.jsonl + config.yaml
|
|
376
|
+
|
|
377
|
+
# 指定数据集名称
|
|
378
|
+
dt.export_for("llama-factory", "./output/", dataset_name="my_sft_data")
|
|
379
|
+
```
|
|
380
|
+
|
|
381
|
+
**支持的框架**:
|
|
382
|
+
|
|
383
|
+
| 框架 | 导出内容 | 使用方式 |
|
|
384
|
+
|------|---------|---------|
|
|
385
|
+
| `llama-factory` | data.json + dataset_info.json + train_args.yaml | `llamafactory-cli train train_args.yaml` |
|
|
386
|
+
| `swift` | data.jsonl + train_swift.sh | `bash train_swift.sh` |
|
|
387
|
+
| `axolotl` | data.jsonl + config.yaml | `accelerate launch -m axolotl.cli.train config.yaml` |
|
|
388
|
+
|
|
389
|
+
**自动格式检测**:
|
|
390
|
+
|
|
391
|
+
| 检测到的格式 | 数据结构 |
|
|
392
|
+
|------------|---------|
|
|
393
|
+
| `openai_chat` | `{"messages": [{"role": "user", ...}]}` |
|
|
394
|
+
| `alpaca` | `{"instruction": ..., "output": ...}` |
|
|
395
|
+
| `sharegpt` | `{"conversations": [{"from": "human", ...}]}` |
|
|
396
|
+
| `dpo` | `{"prompt": ..., "chosen": ..., "rejected": ...}` |
|
|
397
|
+
|
|
289
398
|
### 其他操作
|
|
290
399
|
|
|
291
400
|
```python
|
|
@@ -361,6 +470,12 @@ dt concat a.jsonl b.jsonl -o merged.jsonl
|
|
|
361
470
|
|
|
362
471
|
# 数据统计
|
|
363
472
|
dt stats data.jsonl
|
|
473
|
+
|
|
474
|
+
# 数据验证
|
|
475
|
+
dt validate data.jsonl --preset=openai_chat # 使用预设 schema 验证
|
|
476
|
+
dt validate data.jsonl --preset=alpaca --verbose # 详细输出
|
|
477
|
+
dt validate data.jsonl --preset=sharegpt --filter-invalid -o valid.jsonl # 过滤出有效数据
|
|
478
|
+
dt validate data.jsonl --preset=dpo --max-errors=100 # 限制错误输出数量
|
|
364
479
|
```
|
|
365
480
|
|
|
366
481
|
### 字段路径语法
|
|
@@ -1,22 +1,25 @@
|
|
|
1
|
-
dtflow/__init__.py,sha256=
|
|
2
|
-
dtflow/__main__.py,sha256=
|
|
1
|
+
dtflow/__init__.py,sha256=PTqh_6-F6eEwg1RxQ0ueP6CYnZauMuqYhlZe2BJphr0,3031
|
|
2
|
+
dtflow/__main__.py,sha256=ySpqvEn7k-vsrYFPx-8O6p-yx_24KccgnOSPd2XybhM,12572
|
|
3
3
|
dtflow/converters.py,sha256=gyy-K15zjzGBawFnZa8D9JX37JZ47rey2GhjKa2pxFo,22081
|
|
4
|
-
dtflow/core.py,sha256=
|
|
5
|
-
dtflow/
|
|
4
|
+
dtflow/core.py,sha256=qMo6B3LK--TWRK7ZBKObGcs3pKFnd0NPoaM0T8JC7Jw,38135
|
|
5
|
+
dtflow/framework.py,sha256=jyICi_RWHjX7WfsXdSbWmP1SL7y1OWSPyd5G5Y-lvg4,17578
|
|
6
|
+
dtflow/lineage.py,sha256=jie3OL1qK90-_cOOqqLbhSJ1oGUktDM1x5HRpQ5Qiyc,12800
|
|
6
7
|
dtflow/pipeline.py,sha256=zZaC4fg5vsp_30Fhbg75vu0yggsdvf28bWBiVDWzZ6Y,13901
|
|
7
8
|
dtflow/presets.py,sha256=OP1nnM5NFk5Kli9FsXK0xAot48E5OQ6-VOIJT9ffXPg,5023
|
|
8
|
-
dtflow/
|
|
9
|
-
dtflow/
|
|
9
|
+
dtflow/schema.py,sha256=IFcij22_UFKcgKT1YWwRg2QJO0vcAvCb1arZmsGByts,16824
|
|
10
|
+
dtflow/streaming.py,sha256=dxpNd1-Wz_PTLTdvM5qn06_2TJr5NRlIIuw0LOSS2Iw,24755
|
|
11
|
+
dtflow/tokenizers.py,sha256=7ZAelSmcDxLWH5kICgH9Q1ULH3_BfDZb9suHMjJJRZU,20589
|
|
10
12
|
dtflow/cli/__init__.py,sha256=QhZ-thgx9IBTFII7T_hdoWFUl0CCsdGQHN5ZEZw2XB0,423
|
|
11
13
|
dtflow/cli/clean.py,sha256=y9VCRibgK1j8WIY3h0XZX0m93EdELQC7TdnseMWwS-0,17799
|
|
12
|
-
dtflow/cli/commands.py,sha256=
|
|
13
|
-
dtflow/cli/common.py,sha256=
|
|
14
|
+
dtflow/cli/commands.py,sha256=ST65Ox_MKu-CKAtPVaxECAPXYOJiF7BhL32A4nsZZl0,1175
|
|
15
|
+
dtflow/cli/common.py,sha256=nIPc9GBK61r6kmaI9OS3IyhcfPqShpDEHx1ddjFPnlM,13131
|
|
14
16
|
dtflow/cli/io_ops.py,sha256=BMDisP6dxzzmSjYwmeFwaHmpHHPqirmXAWeNTD-9MQM,13254
|
|
15
17
|
dtflow/cli/lineage.py,sha256=_lNh35nF9AA0Zy6FyZ4g8IzrXH2ZQnp3inF-o2Hs1pw,1383
|
|
16
18
|
dtflow/cli/pipeline.py,sha256=QNEo-BJlaC1CVnVeRZr7TwfuZYloJ4TebIzJ5ALzry0,1426
|
|
17
19
|
dtflow/cli/sample.py,sha256=vPTQlF0OXEry4QjO8uaD9vOae4AQbX9zDwVYOxg59ZI,10339
|
|
18
|
-
dtflow/cli/stats.py,sha256=
|
|
20
|
+
dtflow/cli/stats.py,sha256=u4ehCfgw1X8WuOyAjrApMRgcIO3BVmINbsTjxEscQro,24086
|
|
19
21
|
dtflow/cli/transform.py,sha256=w6xqMOxPxQvL2u_BPCfpDHuPSC9gmcqMPVN8s-B6bbY,15052
|
|
22
|
+
dtflow/cli/validate.py,sha256=65aGVlMS_Rq0Ch0YQ-TclVJ03RQP4CnG137wthzb8Ao,4384
|
|
20
23
|
dtflow/mcp/__init__.py,sha256=huEJ3rXDbxDRjsLPEvjNT2u3tWs6Poiv6fokPIrByjw,897
|
|
21
24
|
dtflow/mcp/__main__.py,sha256=PoT2ZZmJq9xDZxDACJfqDW9Ld_ukHrGNK-0XUd7WGnY,448
|
|
22
25
|
dtflow/mcp/cli.py,sha256=ck0oOS_642cNktxULaMRE7BJfMxsBCwotmCj3PSPwVk,13110
|
|
@@ -26,8 +29,8 @@ dtflow/storage/__init__.py,sha256=C0jpWNQU808Ezz7lWneddABal3wILy8ijFUNiSKbHV4,36
|
|
|
26
29
|
dtflow/storage/io.py,sha256=ZH2aSE-S89gpy3z4oTqhcqWf4u10OdkDoyul7o_YBDI,23374
|
|
27
30
|
dtflow/utils/__init__.py,sha256=f8v9HJZMWRI5AL64Vjr76Pf2Na_whOF9nJBKgPbXXYg,429
|
|
28
31
|
dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
|
|
29
|
-
dtflow/utils/field_path.py,sha256=
|
|
30
|
-
dtflow-0.
|
|
31
|
-
dtflow-0.
|
|
32
|
-
dtflow-0.
|
|
33
|
-
dtflow-0.
|
|
32
|
+
dtflow/utils/field_path.py,sha256=K8nU196RxTSJ1OoieTWGcYOWl9KjGq2iSxCAkfjECuM,7621
|
|
33
|
+
dtflow-0.5.2.dist-info/METADATA,sha256=RlpGaySrAIgTviom_Wyn6o2LWzQQVihff12Jpazy10o,22544
|
|
34
|
+
dtflow-0.5.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
35
|
+
dtflow-0.5.2.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
|
|
36
|
+
dtflow-0.5.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|