dtflow-0.5.8-py3-none-any.whl → dtflow-0.5.10-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- dtflow/SKILL.md +22 -6
- dtflow/__init__.py +1 -1
- dtflow/__main__.py +106 -6
- dtflow/cli/clean.py +90 -1
- dtflow/cli/commands.py +17 -1
- dtflow/cli/eval.py +288 -0
- dtflow/cli/export.py +81 -0
- dtflow/cli/sample.py +90 -3
- dtflow/cli/split.py +138 -0
- dtflow/eval.py +276 -0
- dtflow/utils/text_parser.py +124 -0
- {dtflow-0.5.8.dist-info → dtflow-0.5.10.dist-info}/METADATA +29 -1
- {dtflow-0.5.8.dist-info → dtflow-0.5.10.dist-info}/RECORD +15 -10
- {dtflow-0.5.8.dist-info → dtflow-0.5.10.dist-info}/WHEEL +0 -0
- {dtflow-0.5.8.dist-info → dtflow-0.5.10.dist-info}/entry_points.txt +0 -0
dtflow/SKILL.md
CHANGED
@@ -3,13 +3,14 @@ name: dtflow
 description: >
   Use this skill when the user needs to work with JSONL/CSV/Parquet/JSON/Arrow data files.
   It provides the CLI tool `dt` and the Python API `DataTransformer`.
-  Use cases: (1) Inspect data: dt sample/head/tail for sampled previews, dt stats for field distributions;
+  Use cases: (1) Inspect data: dt sample/head/tail for sampled previews, dt slice to view a row range, dt stats for field distributions;
   (2) Cleaning: dt clean supports --drop-empty/--min-len/--max-len to filter rows and --keep/--drop/--rename/--promote/--add-field/--fill/--reorder to manipulate fields;
   (3) Deduplication: dt dedupe for exact dedup, or --similar for similarity dedup;
   (4) Format conversion: dt transform with preset templates (openai_chat/alpaca/sharegpt/dpo) or a custom config;
   (5) Schema validation: dt validate --preset to validate the data format;
-  (6)
-  (7)
+  (6) Dataset splitting: dt split to split into train/test/val by ratio;
+  (7) Training-framework export: dt export / export_for() for one-step export to llama-factory/swift/axolotl;
+  (8) Streaming for large files: load_stream() handles 100GB+ files in O(1) memory.
   Note: this tool focuses on structured processing of data files and does not call LLMs (use flexllm for LLM calls).
 ---

@@ -162,7 +163,7 @@ dt sample data.jsonl 1000 --by=category  # stratified sampling
 dt sample data.jsonl 1000 --by=category --uniform  # uniform stratified sampling
 dt sample data.jsonl --where="messages.#>=2"       # conditional filtering
 dt sample data.jsonl 10 -f input,output            # show only the listed fields
-dt sample data.jsonl 10 --
+dt sample data.jsonl 10 --pretty                   # table preview mode (raw JSON by default)
 dt sample data.jsonl 100 --seed=42 -o out.jsonl    # fix the random seed and save

 # Deduplication

@@ -185,8 +186,20 @@ dt clean data.jsonl --add-field=source:web  # add a constant field
 dt clean data.jsonl --fill=label:unknown    # fill null/missing fields
 dt clean data.jsonl --reorder=id,text,label # control output field order
 dt clean data.jsonl --strip                 # strip leading/trailing whitespace
+dt clean data.jsonl --min-tokens=content:10             # at least 10 tokens
+dt clean data.jsonl --max-tokens=content:1000 -m gpt-4  # at most 1000 tokens (choose the tokenizer)
 dt clean data.jsonl --promote=meta.label --drop=meta --fill=label:unknown  # combined usage

+# Dataset splitting
+dt split data.jsonl --ratio=0.8 --seed=42       # two-way: train/test
+dt split data.jsonl --ratio=0.7,0.15,0.15       # three-way: train/val/test
+dt split data.jsonl --ratio=0.8 -o /tmp/output  # choose the output directory
+
+# Training-framework export
+dt export data.jsonl --framework=llama-factory  # export to LLaMA-Factory
+dt export data.jsonl -f swift -o ./swift_out    # export to ms-swift
+dt export data.jsonl -f llama-factory --check   # compatibility check only
+
 # Validation
 dt validate data.jsonl --preset=openai_chat  # presets: openai_chat/alpaca/dpo/sharegpt
 dt validate data.jsonl -p alpaca -f -o valid.jsonl  # filter out invalid rows and save

@@ -205,10 +218,13 @@ dt diff a.jsonl b.jsonl --key=id  # compare differences
 dt diff a.jsonl b.jsonl --key=id -o report.md  # write a comparison report

 # Inspect data
-dt head data.jsonl 10                  # first 10
+dt head data.jsonl 10                  # first 10 rows (raw JSON by default)
 dt head data.jsonl 10 -f input,output  # show only the listed fields
-dt head data.jsonl 10 --
+dt head data.jsonl 10 --pretty         # table preview mode
 dt tail data.jsonl 10                  # last 10 rows
+dt slice data.jsonl 10:20              # rows 10-19 (Python slice syntax)
+dt slice data.jsonl :100               # first 100 rows
+dt slice data.jsonl 100:               # row 100 to the end

 # Misc
 dt run pipeline.yaml                   # run a pipeline config
dtflow/__init__.py
CHANGED
dtflow/__main__.py
CHANGED
@@ -18,6 +18,8 @@ Commands:
   clean          data cleaning
   run            execute a pipeline config file
   history        show data-lineage history
+  split          split a dataset
+  export         export to a training framework
   validate       validate the data format against a schema
   logs           usage notes for log inspection
   install-skill  install the dtflow skill into Claude Code

@@ -33,12 +35,16 @@ from .cli.commands import clean as _clean
 from .cli.commands import concat as _concat
 from .cli.commands import dedupe as _dedupe
 from .cli.commands import diff as _diff
+from .cli.commands import eval as _eval
+from .cli.commands import export as _export
 from .cli.commands import head as _head
 from .cli.commands import history as _history
 from .cli.commands import install_skill as _install_skill
 from .cli.commands import run as _run
 from .cli.commands import sample as _sample
 from .cli.commands import skill_status as _skill_status
+from .cli.commands import slice_data as _slice_data
+from .cli.commands import split as _split
 from .cli.commands import stats as _stats
 from .cli.commands import tail as _tail
 from .cli.commands import token_stats as _token_stats
@@ -69,12 +75,12 @@ def sample(
     by: Optional[str] = typer.Option(None, "--by", help="stratified-sampling field"),
     uniform: bool = typer.Option(False, "--uniform", help="uniform sampling mode"),
     fields: Optional[str] = typer.Option(None, "--fields", "-f", help="show only the listed fields (comma-separated)"),
-
+    pretty: bool = typer.Option(False, "--pretty", "-R", help="table preview (raw JSON by default)"),
     where: Optional[List[str]] = typer.Option(None, "--where", "-w", help="filter condition (repeatable)"),
 ):
     """Sample the given number of records from a data file."""
     actual_num = num_arg if num_arg is not None else num
-    _sample(filename, actual_num, type, output, seed, by, uniform, fields,
+    _sample(filename, actual_num, type, output, seed, by, uniform, fields, not pretty, where)


 @app.command()

@@ -84,12 +90,12 @@ def head(
     num: int = typer.Option(10, "--num", "-n", help="number of records", show_default=True),
     output: Optional[str] = typer.Option(None, "--output", "-o", help="output file path"),
     fields: Optional[str] = typer.Option(None, "--fields", "-f", help="show only the listed fields"),
-
+    pretty: bool = typer.Option(False, "--pretty", "-R", help="table preview (raw JSON by default)"),
 ):
     """Show the first N records of a file."""
     # the positional argument takes precedence over the option
     actual_num = num_arg if num_arg is not None else num
-    _head(filename, actual_num, output, fields,
+    _head(filename, actual_num, output, fields, not pretty)


 @app.command()

@@ -99,12 +105,31 @@ def tail(
     num: int = typer.Option(10, "--num", "-n", help="number of records", show_default=True),
     output: Optional[str] = typer.Option(None, "--output", "-o", help="output file path"),
     fields: Optional[str] = typer.Option(None, "--fields", "-f", help="show only the listed fields"),
-
+    pretty: bool = typer.Option(False, "--pretty", "-R", help="table preview (raw JSON by default)"),
 ):
     """Show the last N records of a file."""
     # the positional argument takes precedence over the option
     actual_num = num_arg if num_arg is not None else num
-    _tail(filename, actual_num, output, fields,
+    _tail(filename, actual_num, output, fields, not pretty)
+
+
+@app.command("slice")
+def slice_cmd(
+    filename: str = typer.Argument(..., help="input file path"),
+    range_str: str = typer.Argument(..., help="row range (start:end), e.g. 10:20, :100, 100:, -10:"),
+    output: Optional[str] = typer.Option(None, "--output", "-o", help="output file path"),
+    fields: Optional[str] = typer.Option(None, "--fields", "-f", help="show only the listed fields"),
+    pretty: bool = typer.Option(False, "--pretty", "-R", help="table preview (raw JSON by default)"),
+):
+    """View data by row range (Python slice syntax).
+
+    Examples:
+        dt slice data.jsonl 10:20   rows 10-19
+        dt slice data.jsonl :100    first 100 rows
+        dt slice data.jsonl 100:    row 100 to the end
+        dt slice data.jsonl -10:    last 10 rows
+    """
+    _slice_data(filename, range_str, output, fields, not pretty)


 # ============ data-transform commands ============
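
`slice_cmd` forwards `range_str` to `slice_data` in `dtflow/cli/sample.py`, whose body lies outside this hunk. A sketch of the range parsing the documented syntax implies, mirroring Python slice semantics; `parse_range` is a hypothetical helper name, not one taken from the package:

```python
def parse_range(range_str: str, total: int) -> tuple:
    """Hypothetical parser for '10:20', ':100', '100:', '-10:';
    mirrors Python slice semantics over `total` rows."""
    start_s, _, end_s = range_str.partition(":")
    start = int(start_s) if start_s else 0
    end = int(end_s) if end_s else total
    if start < 0:              # negative indices count from the end
        start += total
    if end < 0:
        end += total
    return max(start, 0), min(end, total)

# parse_range("10:20", 1000) -> (10, 20)    rows 10-19
# parse_range("-10:", 1000)  -> (990, 1000) the last 10 rows
```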
@@ -174,6 +199,13 @@ def clean(
         None, "--reorder", help="control field order (field1,field2,...)"
     ),
     strip: bool = typer.Option(False, "--strip", help="strip leading/trailing whitespace"),
+    min_tokens: Optional[str] = typer.Option(
+        None, "--min-tokens", help="minimum token-count filter (field:count)"
+    ),
+    max_tokens: Optional[str] = typer.Option(
+        None, "--max-tokens", help="maximum token-count filter (field:count)"
+    ),
+    model: str = typer.Option("cl100k_base", "--model", "-m", help="tokenizer model (default cl100k_base)"),
     output: Optional[str] = typer.Option(None, "--output", "-o", help="output file path"),
 ):
     """Data cleaning."""

@@ -190,6 +222,9 @@ def clean(
         fill,
         reorder,
         strip,
+        min_tokens,
+        max_tokens,
+        model,
         output,
     )
@@ -249,6 +284,71 @@ def history(
     _history(filename, json)


+# ============ split & export commands ============
+
+
+@app.command()
+def split(
+    filename: str = typer.Argument(..., help="input file path"),
+    ratio: str = typer.Option("0.8", "--ratio", "-r", help="split ratio, e.g. 0.8 or 0.7,0.15,0.15"),
+    seed: Optional[int] = typer.Option(None, "--seed", help="random seed"),
+    output: Optional[str] = typer.Option(None, "--output", "-o", help="output directory (defaults to the input's directory)"),
+):
+    """Split a dataset into train/test (or train/val/test)."""
+    _split(filename, ratio, seed, output)
+
+
+@app.command()
+def export(
+    filename: str = typer.Argument(..., help="input file path"),
+    framework: str = typer.Option(
+        ..., "--framework", "-f", help="target framework: llama-factory, swift, axolotl"
+    ),
+    output: Optional[str] = typer.Option(None, "--output", "-o", help="output directory"),
+    name: Optional[str] = typer.Option(None, "--name", "-n", help="dataset name"),
+    check: bool = typer.Option(False, "--check", help="only check compatibility, do not export"),
+):
+    """Export data to a training framework (LLaMA-Factory, ms-swift, Axolotl)."""
+    _export(filename, framework, output, name, check)
+
+
+# ============ eval command ============
+
+
+@app.command()
+def eval(
+    result_file: str = typer.Argument(..., help="path to the model-output .jsonl file"),
+    source: Optional[str] = typer.Option(
+        None, "--source", "-s", help="original input file, merged by row-number alignment"
+    ),
+    response_col: str = typer.Option("content", "--response-col", "-r", help="model-response field name"),
+    label_col: Optional[str] = typer.Option(
+        None, "--label-col", "-l", help="label field name (auto-detected when omitted)"
+    ),
+    extract: str = typer.Option(
+        "direct",
+        "--extract",
+        "-e",
+        help="pipelined extraction rule; operators: direct/tag:X/json_key:X/index:N/line:N/lines/regex:X",
+    ),
+    sep: Optional[str] = typer.Option(None, "--sep", help="separator used with the index operator"),
+    mapping: Optional[str] = typer.Option(None, "--mapping", "-m", help="value mapping (k1:v1,k2:v2)"),
+    output_dir: str = typer.Option("record", "--output-dir", "-o", help="output directory for the metrics report"),
+):
+    """Parse model output and compute evaluation metrics.
+
+    Two-stage parsing: automatic cleanup (strip think tags, extract code blocks) plus pipelined extraction.
+
+    Examples:
+        dt eval result.jsonl --label-col=label
+        dt eval result.jsonl --extract="tag:标签" --mapping="是:1,否:0"
+        dt eval result.jsonl --source=input.jsonl --response-col=api_output.content
+        dt eval result.jsonl --extract="json_key:result | index:0" --sep=","
+        dt eval result.jsonl --extract="lines | index:1" --sep="|"
+    """
+    _eval(result_file, source, response_col, label_col, extract, sep, mapping, output_dir)
+
+
 # ============ validation commands ============
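
The `--extract` help text defines a small operator language chained with `|`; the implementations live in the new `dtflow/eval.py` and `dtflow/utils/text_parser.py`, which this diff lists but does not display. A rough sketch of how such a pipeline could evaluate, under the assumption that every operator maps a string to a string:

```python
import json
import re
from typing import Optional

def run_extract(text: str, spec: str, sep: Optional[str] = None) -> str:
    """Hypothetical sketch of the extractor behind `dt eval --extract`;
    the real operators live in dtflow/utils/text_parser.py (not shown here)."""
    for step in (s.strip() for s in spec.split("|")):
        op, _, arg = step.partition(":")
        if op == "direct":                  # keep the text unchanged
            continue
        if op == "tag":                     # content of <X>...</X>
            m = re.search(rf"<{re.escape(arg)}>(.*?)</{re.escape(arg)}>", text, re.S)
            text = m.group(1).strip() if m else ""
        elif op == "json_key":              # parse JSON and take one key
            text = str(json.loads(text).get(arg, ""))
        elif op == "index":                 # split on --sep and take item N
            text = text.split(sep or ",")[int(arg)].strip()
        elif op == "line":                  # take line N
            text = text.splitlines()[int(arg)]
        elif op == "lines":                 # drop empty lines
            text = "\n".join(l for l in text.splitlines() if l.strip())
        elif op == "regex":                 # first capture group, or whole match
            m = re.search(arg, text)
            text = (m.group(1) if m.groups() else m.group(0)) if m else ""
    return text

# run_extract('<标签>是</标签>', 'tag:标签')                      -> '是'
# run_extract('{"result": "a,b"}', 'json_key:result | index:0')  -> 'a'
```

Each stage narrows the raw model response toward a comparable label, which is why the operators compose left to right.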
dtflow/cli/clean.py
CHANGED
@@ -138,6 +138,9 @@ def clean(
     fill: Optional[str] = None,
     reorder: Optional[str] = None,
     strip: bool = False,
+    min_tokens: Optional[str] = None,
+    max_tokens: Optional[str] = None,
+    model: str = "cl100k_base",
     output: Optional[str] = None,
 ) -> None:
     """

@@ -173,6 +176,9 @@ def clean(
         dt clean data.jsonl --fill=label:unknown    # fill null values
         dt clean data.jsonl --reorder=id,text,label # control field order
         dt clean data.jsonl --strip                 # strip leading/trailing whitespace
+        dt clean data.jsonl --min-tokens=content:10   # content field: at least 10 tokens
+        dt clean data.jsonl --max-tokens=content:1000 # content field: at most 1000 tokens
+        dt clean data.jsonl --min-tokens=text:50 --model=gpt-4  # use the gpt-4 tokenizer
     """
     filepath = Path(filename)

@@ -186,6 +192,13 @@ def clean(
     # parse parameters
     min_len_field, min_len_value = _parse_len_param(min_len) if min_len else (None, None)
     max_len_field, max_len_value = _parse_len_param(max_len) if max_len else (None, None)
+    min_tokens_field, min_tokens_value = (
+        _parse_len_param(min_tokens) if min_tokens else (None, None)
+    )
+    max_tokens_field, max_tokens_value = (
+        _parse_len_param(max_tokens) if max_tokens else (None, None)
+    )
+    token_model = model
     keep_fields = _parse_field_list(keep) if keep else None
     drop_fields_set = set(_parse_field_list(drop)) if drop else None
     rename_map = _parse_rename_param(rename) if rename else None

@@ -229,6 +242,14 @@ def clean(
         print(f"🔄 filling null values: {fill_desc}")
     if reorder_fields:
         print(f"🔄 field order: {', '.join(reorder_fields)}")
+    if min_tokens_field:
+        print(
+            f"🔄 filtering records where {min_tokens_field} has < {min_tokens_value} tokens (model={token_model})..."
+        )
+    if max_tokens_field:
+        print(
+            f"🔄 filtering records where {max_tokens_field} has > {max_tokens_value} tokens (model={token_model})..."
+        )

     output_path = output or str(filepath)

@@ -271,6 +292,11 @@ def clean(
         add_field_map=add_field_map,
         fill_map=fill_map,
         reorder_fields=reorder_fields,
+        min_tokens_field=min_tokens_field,
+        min_tokens_value=min_tokens_value,
+        max_tokens_field=max_tokens_field,
+        max_tokens_value=max_tokens_value,
+        token_model=token_model,
     )

     # if a temp file was used, move it into place

@@ -316,6 +342,11 @@ def clean(
         add_field_map=add_field_map,
         fill_map=fill_map,
         reorder_fields=reorder_fields,
+        min_tokens_field=min_tokens_field,
+        min_tokens_value=min_tokens_value,
+        max_tokens_field=max_tokens_field,
+        max_tokens_value=max_tokens_value,
+        token_model=token_model,
     )

     # save the result

@@ -458,6 +489,11 @@ def _clean_data_single_pass(
     add_field_map: Optional[Dict[str, str]] = None,
     fill_map: Optional[Dict[str, str]] = None,
     reorder_fields: Optional[List[str]] = None,
+    min_tokens_field: Optional[str] = None,
+    min_tokens_value: Optional[int] = None,
+    max_tokens_field: Optional[str] = None,
+    max_tokens_value: Optional[int] = None,
+    token_model: str = "cl100k_base",
 ) -> tuple:
     """
     Run all cleaning operations in a single pass.

@@ -476,11 +512,18 @@ def _clean_data_single_pass(
     Returns:
         (cleaned data, list of per-step statistics)
     """
+    # lazy import of count_tokens (only when needed)
+    _count_tokens = None
+    if min_tokens_field is not None or max_tokens_field is not None:
+        from ..tokenizers import count_tokens as _count_tokens
+
     result = []
     stats = {
         "drop_empty": 0,
         "min_len": 0,
         "max_len": 0,
+        "min_tokens": 0,
+        "max_tokens": 0,
     }

     # precompute the keep_fields set (if any)

@@ -516,6 +559,20 @@ def _clean_data_single_pass(
             stats["max_len"] += 1
             continue

+        # 4.5 minimum token-count filter
+        if min_tokens_field is not None:
+            value = get_field_with_spec(item, min_tokens_field, default="")
+            if _count_tokens(str(value), model=token_model) < min_tokens_value:
+                stats["min_tokens"] += 1
+                continue
+
+        # 4.6 maximum token-count filter
+        if max_tokens_field is not None:
+            value = get_field_with_spec(item, max_tokens_field, default="")
+            if _count_tokens(str(value), model=token_model) > max_tokens_value:
+                stats["max_tokens"] += 1
+                continue
+
         # 5. promote nested fields (before drop, otherwise the value is gone once the parent is dropped)
         if promote_list is not None:
             item = _promote_fields(item, promote_list)
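
Both cleaning paths defer to `count_tokens` from `dtflow.tokenizers`, a module this diff does not show. Given the `cl100k_base` default and the `gpt-4` example in the docstring, it plausibly wraps `tiktoken`; a minimal sketch under that assumption:

```python
import tiktoken

def count_tokens(text: str, model: str = "cl100k_base") -> int:
    """Sketch of what dtflow.tokenizers.count_tokens might do (assumption:
    it wraps tiktoken); accepts either an encoding name or a model name."""
    try:
        enc = tiktoken.get_encoding(model)        # e.g. "cl100k_base"
    except ValueError:
        enc = tiktoken.encoding_for_model(model)  # e.g. "gpt-4"
    return len(enc.encode(text))
```

The lazy import in both code paths keeps `tiktoken` an optional cost: it is only loaded when a token filter is actually requested.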
@@ -554,6 +611,10 @@ def _clean_data_single_pass(
         step_stats.append(f"min-len: -{stats['min_len']}")
     if stats["max_len"] > 0:
         step_stats.append(f"max-len: -{stats['max_len']}")
+    if stats["min_tokens"] > 0:
+        step_stats.append(f"min-tokens: -{stats['min_tokens']}")
+    if stats["max_tokens"] > 0:
+        step_stats.append(f"max-tokens: -{stats['max_tokens']}")
     if keep_fields:
         step_stats.append(f"keep: {len(keep_fields)} fields")
     if drop_fields:

@@ -588,6 +649,11 @@ def _clean_streaming(
     add_field_map: Optional[Dict[str, str]] = None,
     fill_map: Optional[Dict[str, str]] = None,
     reorder_fields: Optional[List[str]] = None,
+    min_tokens_field: Optional[str] = None,
+    min_tokens_value: Optional[int] = None,
+    max_tokens_field: Optional[str] = None,
+    max_tokens_value: Optional[int] = None,
+    token_model: str = "cl100k_base",
 ) -> int:
     """
     Clean data in streaming mode.

@@ -596,6 +662,11 @@ def _clean_streaming(
         the number of processed records
     """

+    # lazy import of count_tokens (only when needed)
+    _count_tokens = None
+    if min_tokens_field is not None or max_tokens_field is not None:
+        from ..tokenizers import count_tokens as _count_tokens
+
     def clean_filter(item: Dict) -> bool:
         """Filter function: True keeps the record, False drops it (supports nested paths)."""
         # null-value filtering

@@ -618,6 +689,18 @@ def _clean_streaming(
         if _get_value_len(get_field_with_spec(item, max_len_field, default="")) > max_len_value:
             return False

+        # minimum token-count filter
+        if min_tokens_field is not None:
+            value = get_field_with_spec(item, min_tokens_field, default="")
+            if _count_tokens(str(value), model=token_model) < min_tokens_value:
+                return False
+
+        # maximum token-count filter
+        if max_tokens_field is not None:
+            value = get_field_with_spec(item, max_tokens_field, default="")
+            if _count_tokens(str(value), model=token_model) > max_tokens_value:
+                return False
+
         return True

     def clean_transform(item: Dict) -> Dict:

@@ -644,7 +727,13 @@ def _clean_streaming(
     )

     # run the filters
-    if
+    if (
+        empty_fields is not None
+        or min_len_field is not None
+        or max_len_field is not None
+        or min_tokens_field is not None
+        or max_tokens_field is not None
+    ):
         st = st.filter(clean_filter)

     # promote nested fields (before drop, otherwise the value is gone once the parent is dropped)
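
Note how the streaming path fuses all row-level checks (empties, lengths, and now token counts) into the single `clean_filter` predicate and applies it with one `st.filter(...)` call, so each record is visited once and memory stays O(1). The same pattern outside dtflow, as a self-contained sketch over plain JSONL:

```python
import json
from typing import Callable, Dict

def stream_filter(in_path: str, out_path: str,
                  keep: Callable[[Dict], bool]) -> int:
    """Sketch of the O(1)-memory pattern used by _clean_streaming:
    one record in memory at a time, one fused predicate per record."""
    kept = 0
    with open(in_path, encoding="utf-8") as fin, \
         open(out_path, "w", encoding="utf-8") as fout:
        for line in fin:
            item = json.loads(line)
            if keep(item):
                fout.write(json.dumps(item, ensure_ascii=False) + "\n")
                kept += 1
    return kept

# keep rows whose content field has at least 10 characters:
# stream_filter("data.jsonl", "out.jsonl", lambda r: len(r.get("content", "")) >= 10)
```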
dtflow/cli/commands.py
CHANGED
@@ -16,6 +16,12 @@ Unified export entry point for CLI commands
 # cleaning commands
 from .clean import clean, dedupe

+# eval commands
+from .eval import eval
+
+# export commands
+from .export import export
+
 # IO commands
 from .io_ops import concat, diff

@@ -24,11 +30,14 @@ from .lineage import history

 # pipeline commands
 from .pipeline import run
-from .sample import head, sample, tail
+from .sample import head, sample, slice_data, tail

 # skill commands
 from .skill import install_skill, skill_status, uninstall_skill

+# split commands
+from .split import split
+
 # stats commands
 from .stats import stats, token_stats

@@ -43,6 +52,7 @@ __all__ = [
     "sample",
     "head",
     "tail",
+    "slice_data",
     # transform
     "transform",
     # stats

@@ -60,6 +70,12 @@ __all__ = [
     "history",
     # validation
     "validate",
+    # split
+    "split",
+    # export
+    "export",
+    # eval
+    "eval",
     # skill
     "install_skill",
     "uninstall_skill",