dtflow 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtflow/SKILL.md +39 -5
- dtflow/__init__.py +1 -1
- dtflow/__main__.py +137 -8
- dtflow/cli/clean.py +294 -9
- dtflow/cli/commands.py +17 -1
- dtflow/cli/eval.py +288 -0
- dtflow/cli/export.py +81 -0
- dtflow/cli/sample.py +90 -3
- dtflow/cli/split.py +138 -0
- dtflow/cli/stats.py +224 -30
- dtflow/eval.py +276 -0
- dtflow/utils/text_parser.py +124 -0
- {dtflow-0.5.7.dist-info → dtflow-0.5.9.dist-info}/METADATA +34 -2
- {dtflow-0.5.7.dist-info → dtflow-0.5.9.dist-info}/RECORD +16 -11
- {dtflow-0.5.7.dist-info → dtflow-0.5.9.dist-info}/WHEEL +0 -0
- {dtflow-0.5.7.dist-info → dtflow-0.5.9.dist-info}/entry_points.txt +0 -0
dtflow/cli/clean.py
CHANGED
|
@@ -132,7 +132,15 @@ def clean(
|
|
|
132
132
|
max_len: Optional[str] = None,
|
|
133
133
|
keep: Optional[str] = None,
|
|
134
134
|
drop: Optional[str] = None,
|
|
135
|
+
rename: Optional[str] = None,
|
|
136
|
+
promote: Optional[str] = None,
|
|
137
|
+
add_field: Optional[str] = None,
|
|
138
|
+
fill: Optional[str] = None,
|
|
139
|
+
reorder: Optional[str] = None,
|
|
135
140
|
strip: bool = False,
|
|
141
|
+
min_tokens: Optional[str] = None,
|
|
142
|
+
max_tokens: Optional[str] = None,
|
|
143
|
+
model: str = "cl100k_base",
|
|
136
144
|
output: Optional[str] = None,
|
|
137
145
|
) -> None:
|
|
138
146
|
"""
|
|
@@ -147,19 +155,30 @@ def clean(
|
|
|
147
155
|
max_len: 最大长度过滤,格式 "字段:长度",字段支持嵌套路径
|
|
148
156
|
keep: 只保留指定字段(逗号分隔,仅支持顶层字段)
|
|
149
157
|
drop: 删除指定字段(逗号分隔,仅支持顶层字段)
|
|
158
|
+
rename: 重命名字段,格式 "old:new" 或 "old1:new1,old2:new2"
|
|
159
|
+
promote: 提升嵌套字段到顶层,格式 "path" 或 "path:name"(逗号分隔多个)
|
|
160
|
+
add_field: 添加常量字段,格式 "key:value"(逗号分隔多个)
|
|
161
|
+
fill: 填充空值,格式 "field:default_value"(逗号分隔多个)
|
|
162
|
+
reorder: 控制字段顺序(逗号分隔),未列出的字段追加在后面
|
|
150
163
|
strip: 去除所有字符串字段的首尾空白
|
|
151
164
|
output: 输出文件路径,不指定则覆盖原文件
|
|
152
165
|
|
|
153
166
|
Examples:
|
|
154
167
|
dt clean data.jsonl --drop-empty # 删除任意空值记录
|
|
155
168
|
dt clean data.jsonl --drop-empty=text,answer # 删除指定字段为空的记录
|
|
156
|
-
dt clean data.jsonl --drop-empty=meta.source # 删除嵌套字段为空的记录
|
|
157
169
|
dt clean data.jsonl --min-len=text:10 # text 字段最少 10 字符
|
|
158
|
-
dt clean data.jsonl --min-len=messages.#:2 # 至少 2 条消息
|
|
159
|
-
dt clean data.jsonl --max-len=messages[-1].content:500 # 最后一条消息最多 500 字符
|
|
160
170
|
dt clean data.jsonl --keep=question,answer # 只保留这些字段
|
|
161
171
|
dt clean data.jsonl --drop=metadata,timestamp # 删除这些字段
|
|
172
|
+
dt clean data.jsonl --rename=question:instruction # 重命名字段
|
|
173
|
+
dt clean data.jsonl --promote=meta.label # 提升嵌套字段到顶层
|
|
174
|
+
dt clean data.jsonl --promote=meta.label:tag # 提升并自定义名称
|
|
175
|
+
dt clean data.jsonl --add-field=source:web # 添加常量字段
|
|
176
|
+
dt clean data.jsonl --fill=label:unknown # 填充空值
|
|
177
|
+
dt clean data.jsonl --reorder=id,text,label # 控制字段顺序
|
|
162
178
|
dt clean data.jsonl --strip # 去除字符串首尾空白
|
|
179
|
+
dt clean data.jsonl --min-tokens=content:10 # content 字段最少 10 tokens
|
|
180
|
+
dt clean data.jsonl --max-tokens=content:1000 # content 字段最多 1000 tokens
|
|
181
|
+
dt clean data.jsonl --min-tokens=text:50 --model=gpt-4 # 使用 gpt-4 分词器
|
|
163
182
|
"""
|
|
164
183
|
filepath = Path(filename)
|
|
165
184
|
|
|
@@ -173,8 +192,20 @@ def clean(
|
|
|
173
192
|
# 解析参数
|
|
174
193
|
min_len_field, min_len_value = _parse_len_param(min_len) if min_len else (None, None)
|
|
175
194
|
max_len_field, max_len_value = _parse_len_param(max_len) if max_len else (None, None)
|
|
195
|
+
min_tokens_field, min_tokens_value = (
|
|
196
|
+
_parse_len_param(min_tokens) if min_tokens else (None, None)
|
|
197
|
+
)
|
|
198
|
+
max_tokens_field, max_tokens_value = (
|
|
199
|
+
_parse_len_param(max_tokens) if max_tokens else (None, None)
|
|
200
|
+
)
|
|
201
|
+
token_model = model
|
|
176
202
|
keep_fields = _parse_field_list(keep) if keep else None
|
|
177
203
|
drop_fields_set = set(_parse_field_list(drop)) if drop else None
|
|
204
|
+
rename_map = _parse_rename_param(rename) if rename else None
|
|
205
|
+
promote_list = _parse_promote_param(promote) if promote else None
|
|
206
|
+
add_field_map = _parse_kv_param(add_field, "add-field") if add_field else None
|
|
207
|
+
fill_map = _parse_kv_param(fill, "fill") if fill else None
|
|
208
|
+
reorder_fields = _parse_field_list(reorder) if reorder else None
|
|
178
209
|
keep_set = set(keep_fields) if keep_fields else None
|
|
179
210
|
|
|
180
211
|
# 构建清洗配置
|
|
@@ -197,6 +228,28 @@ def clean(
|
|
|
197
228
|
print(f"🔄 只保留字段: {', '.join(keep_fields)}")
|
|
198
229
|
if drop_fields_set:
|
|
199
230
|
print(f"🔄 删除字段: {', '.join(drop_fields_set)}")
|
|
231
|
+
if rename_map:
|
|
232
|
+
rename_desc = ", ".join(f"{k} → {v}" for k, v in rename_map.items())
|
|
233
|
+
print(f"🔄 重命名字段: {rename_desc}")
|
|
234
|
+
if promote_list:
|
|
235
|
+
promote_desc = ", ".join(f"{src} → {dst}" for src, dst in promote_list)
|
|
236
|
+
print(f"🔄 提升字段: {promote_desc}")
|
|
237
|
+
if add_field_map:
|
|
238
|
+
add_desc = ", ".join(f"{k}={v}" for k, v in add_field_map.items())
|
|
239
|
+
print(f"🔄 添加字段: {add_desc}")
|
|
240
|
+
if fill_map:
|
|
241
|
+
fill_desc = ", ".join(f"{k}={v}" for k, v in fill_map.items())
|
|
242
|
+
print(f"🔄 填充空值: {fill_desc}")
|
|
243
|
+
if reorder_fields:
|
|
244
|
+
print(f"🔄 字段排序: {', '.join(reorder_fields)}")
|
|
245
|
+
if min_tokens_field:
|
|
246
|
+
print(
|
|
247
|
+
f"🔄 过滤 {min_tokens_field} tokens < {min_tokens_value} 的记录 (model={token_model})..."
|
|
248
|
+
)
|
|
249
|
+
if max_tokens_field:
|
|
250
|
+
print(
|
|
251
|
+
f"🔄 过滤 {max_tokens_field} tokens > {max_tokens_value} 的记录 (model={token_model})..."
|
|
252
|
+
)
|
|
200
253
|
|
|
201
254
|
output_path = output or str(filepath)
|
|
202
255
|
|
|
@@ -234,6 +287,16 @@ def clean(
|
|
|
234
287
|
max_len_value=max_len_value,
|
|
235
288
|
keep_set=keep_set,
|
|
236
289
|
drop_fields_set=drop_fields_set,
|
|
290
|
+
rename_map=rename_map,
|
|
291
|
+
promote_list=promote_list,
|
|
292
|
+
add_field_map=add_field_map,
|
|
293
|
+
fill_map=fill_map,
|
|
294
|
+
reorder_fields=reorder_fields,
|
|
295
|
+
min_tokens_field=min_tokens_field,
|
|
296
|
+
min_tokens_value=min_tokens_value,
|
|
297
|
+
max_tokens_field=max_tokens_field,
|
|
298
|
+
max_tokens_value=max_tokens_value,
|
|
299
|
+
token_model=token_model,
|
|
237
300
|
)
|
|
238
301
|
|
|
239
302
|
# 如果使用了临时文件,移动到目标位置
|
|
@@ -274,6 +337,16 @@ def clean(
|
|
|
274
337
|
max_len_value=max_len_value,
|
|
275
338
|
keep_fields=keep_fields,
|
|
276
339
|
drop_fields=drop_fields_set,
|
|
340
|
+
rename_map=rename_map,
|
|
341
|
+
promote_list=promote_list,
|
|
342
|
+
add_field_map=add_field_map,
|
|
343
|
+
fill_map=fill_map,
|
|
344
|
+
reorder_fields=reorder_fields,
|
|
345
|
+
min_tokens_field=min_tokens_field,
|
|
346
|
+
min_tokens_value=min_tokens_value,
|
|
347
|
+
max_tokens_field=max_tokens_field,
|
|
348
|
+
max_tokens_value=max_tokens_value,
|
|
349
|
+
token_model=token_model,
|
|
277
350
|
)
|
|
278
351
|
|
|
279
352
|
# 保存结果
|
|
@@ -288,12 +361,106 @@ def clean(
|
|
|
288
361
|
|
|
289
362
|
# 打印统计
|
|
290
363
|
removed_count = original_count - final_count
|
|
291
|
-
print(
|
|
364
|
+
print("\n✅ 完成!")
|
|
292
365
|
print(f" 原始: {original_count} 条 -> 清洗后: {final_count} 条 (删除 {removed_count} 条)")
|
|
293
366
|
if step_stats:
|
|
294
367
|
print(f" 步骤: {' | '.join(step_stats)}")
|
|
295
368
|
|
|
296
369
|
|
|
370
|
+
def _parse_rename_param(param: str) -> Dict[str, str]:
|
|
371
|
+
"""解析重命名参数,格式 'old:new' 或 'old1:new1,old2:new2'"""
|
|
372
|
+
rename_map = {}
|
|
373
|
+
for pair in param.split(","):
|
|
374
|
+
pair = pair.strip()
|
|
375
|
+
if ":" not in pair:
|
|
376
|
+
raise ValueError(f"重命名参数格式错误: {pair},应为 'old:new'")
|
|
377
|
+
old, new = pair.split(":", 1)
|
|
378
|
+
old, new = old.strip(), new.strip()
|
|
379
|
+
if not old or not new:
|
|
380
|
+
raise ValueError(f"重命名参数格式错误: {pair},字段名不能为空")
|
|
381
|
+
rename_map[old] = new
|
|
382
|
+
return rename_map
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def _parse_promote_param(param: str) -> List[tuple]:
|
|
386
|
+
"""
|
|
387
|
+
解析提升参数,格式 'path' 或 'path:name'(逗号分隔多个)。
|
|
388
|
+
|
|
389
|
+
Returns:
|
|
390
|
+
[(source_path, target_name), ...]
|
|
391
|
+
"""
|
|
392
|
+
result = []
|
|
393
|
+
for item in param.split(","):
|
|
394
|
+
item = item.strip()
|
|
395
|
+
if ":" in item:
|
|
396
|
+
src, dst = item.split(":", 1)
|
|
397
|
+
src, dst = src.strip(), dst.strip()
|
|
398
|
+
else:
|
|
399
|
+
src = item
|
|
400
|
+
# 默认用路径最后一段作为目标名
|
|
401
|
+
dst = src.rsplit(".", 1)[-1] if "." in src else src
|
|
402
|
+
if not src or not dst:
|
|
403
|
+
raise ValueError(f"promote 参数格式错误: {item}")
|
|
404
|
+
result.append((src, dst))
|
|
405
|
+
return result
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
def _parse_kv_param(param: str, param_name: str) -> Dict[str, str]:
|
|
409
|
+
"""解析 key:value 格式参数(通用),用于 --add-field 和 --fill"""
|
|
410
|
+
kv_map = {}
|
|
411
|
+
for pair in param.split(","):
|
|
412
|
+
pair = pair.strip()
|
|
413
|
+
if ":" not in pair:
|
|
414
|
+
raise ValueError(f"{param_name} 参数格式错误: {pair},应为 'key:value'")
|
|
415
|
+
key, value = pair.split(":", 1)
|
|
416
|
+
key, value = key.strip(), value.strip()
|
|
417
|
+
if not key:
|
|
418
|
+
raise ValueError(f"{param_name} 参数格式错误: {pair},key 不能为空")
|
|
419
|
+
kv_map[key] = value
|
|
420
|
+
return kv_map
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
def _rename_item(item: Dict, rename_map: Dict[str, str]) -> Dict:
|
|
424
|
+
"""重命名字段,保持字段顺序"""
|
|
425
|
+
return {rename_map.get(k, k): v for k, v in item.items()}
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def _promote_fields(item: Dict, promote_list: List[tuple]) -> Dict:
    """Copy nested values to top-level keys and return the updated copy.

    For every ``(source_path, target_name)`` pair, the value obtained from
    ``get_field_with_spec`` is stored under ``target_name``. The target key
    is always written, whatever the lookup returns (the original note says
    this includes ``None`` — presumably for paths that do not resolve;
    confirm against ``get_field_with_spec``).

    Lookups run against the working copy, so a later pair can see keys
    written by an earlier pair. ``item`` itself is not modified.

    Args:
        item: The record to read from.
        promote_list: ``(source_path, target_name)`` pairs, applied in order.

    Returns:
        A shallow copy of ``item`` with the promoted keys set.
    """
    promoted = dict(item)
    for source_path, target_name in promote_list:
        extracted = get_field_with_spec(promoted, source_path)
        promoted[target_name] = extracted
    return promoted
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
def _add_fields(item: Dict, add_field_map: Dict[str, str]) -> Dict:
|
|
437
|
+
"""添加常量字段"""
|
|
438
|
+
item = dict(item)
|
|
439
|
+
item.update(add_field_map)
|
|
440
|
+
return item
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def _fill_empty(item: Dict, fill_map: Dict[str, str]) -> Dict:
    """Return a copy of ``item`` with missing or empty fields given defaults.

    A field receives its default when it is absent from the record, or when
    ``_is_empty_value`` reports its current value as empty. Absent fields
    are therefore added. ``item`` itself is not modified.

    Args:
        item: The record to fill.
        fill_map: Mapping from field name to the default value to use.

    Returns:
        A shallow copy of ``item`` with defaults applied.
    """
    filled = dict(item)
    for name, fallback in fill_map.items():
        absent = name not in filled
        # Short-circuit: only inspect the value when the key exists.
        if absent or _is_empty_value(filled[name]):
            filled[name] = fallback
    return filled
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
def _reorder_item(item: Dict, reorder_fields: List[str]) -> Dict:
|
|
453
|
+
"""按指定顺序重排字段,未列出的字段追加在后面"""
|
|
454
|
+
ordered = {}
|
|
455
|
+
for f in reorder_fields:
|
|
456
|
+
if f in item:
|
|
457
|
+
ordered[f] = item[f]
|
|
458
|
+
for k, v in item.items():
|
|
459
|
+
if k not in ordered:
|
|
460
|
+
ordered[k] = v
|
|
461
|
+
return ordered
|
|
462
|
+
|
|
463
|
+
|
|
297
464
|
def _parse_len_param(param: str) -> tuple:
|
|
298
465
|
"""解析长度参数,格式 'field:length'"""
|
|
299
466
|
if ":" not in param:
|
|
@@ -302,8 +469,8 @@ def _parse_len_param(param: str) -> tuple:
|
|
|
302
469
|
field = parts[0].strip()
|
|
303
470
|
try:
|
|
304
471
|
length = int(parts[1].strip())
|
|
305
|
-
except ValueError:
|
|
306
|
-
raise ValueError(f"长度必须是整数: {parts[1]}")
|
|
472
|
+
except ValueError as e:
|
|
473
|
+
raise ValueError(f"长度必须是整数: {parts[1]}") from e
|
|
307
474
|
return field, length
|
|
308
475
|
|
|
309
476
|
|
|
@@ -317,6 +484,16 @@ def _clean_data_single_pass(
|
|
|
317
484
|
max_len_value: Optional[int] = None,
|
|
318
485
|
keep_fields: Optional[List[str]] = None,
|
|
319
486
|
drop_fields: Optional[set] = None,
|
|
487
|
+
rename_map: Optional[Dict[str, str]] = None,
|
|
488
|
+
promote_list: Optional[List[tuple]] = None,
|
|
489
|
+
add_field_map: Optional[Dict[str, str]] = None,
|
|
490
|
+
fill_map: Optional[Dict[str, str]] = None,
|
|
491
|
+
reorder_fields: Optional[List[str]] = None,
|
|
492
|
+
min_tokens_field: Optional[str] = None,
|
|
493
|
+
min_tokens_value: Optional[int] = None,
|
|
494
|
+
max_tokens_field: Optional[str] = None,
|
|
495
|
+
max_tokens_value: Optional[int] = None,
|
|
496
|
+
token_model: str = "cl100k_base",
|
|
320
497
|
) -> tuple:
|
|
321
498
|
"""
|
|
322
499
|
单次遍历执行所有清洗操作。
|
|
@@ -335,11 +512,18 @@ def _clean_data_single_pass(
|
|
|
335
512
|
Returns:
|
|
336
513
|
(清洗后的数据, 统计信息列表)
|
|
337
514
|
"""
|
|
515
|
+
# 延迟导入 count_tokens(仅在需要时)
|
|
516
|
+
_count_tokens = None
|
|
517
|
+
if min_tokens_field is not None or max_tokens_field is not None:
|
|
518
|
+
from ..tokenizers import count_tokens as _count_tokens
|
|
519
|
+
|
|
338
520
|
result = []
|
|
339
521
|
stats = {
|
|
340
522
|
"drop_empty": 0,
|
|
341
523
|
"min_len": 0,
|
|
342
524
|
"max_len": 0,
|
|
525
|
+
"min_tokens": 0,
|
|
526
|
+
"max_tokens": 0,
|
|
343
527
|
}
|
|
344
528
|
|
|
345
529
|
# 预先计算 keep_fields 集合(如果有的话)
|
|
@@ -375,12 +559,46 @@ def _clean_data_single_pass(
|
|
|
375
559
|
stats["max_len"] += 1
|
|
376
560
|
continue
|
|
377
561
|
|
|
378
|
-
# 5
|
|
562
|
+
# 4.5 最小 token 数过滤
|
|
563
|
+
if min_tokens_field is not None:
|
|
564
|
+
value = get_field_with_spec(item, min_tokens_field, default="")
|
|
565
|
+
if _count_tokens(str(value), model=token_model) < min_tokens_value:
|
|
566
|
+
stats["min_tokens"] += 1
|
|
567
|
+
continue
|
|
568
|
+
|
|
569
|
+
# 4.6 最大 token 数过滤
|
|
570
|
+
if max_tokens_field is not None:
|
|
571
|
+
value = get_field_with_spec(item, max_tokens_field, default="")
|
|
572
|
+
if _count_tokens(str(value), model=token_model) > max_tokens_value:
|
|
573
|
+
stats["max_tokens"] += 1
|
|
574
|
+
continue
|
|
575
|
+
|
|
576
|
+
# 5. 提升嵌套字段(在 drop 之前,否则父字段被删后无法提取)
|
|
577
|
+
if promote_list is not None:
|
|
578
|
+
item = _promote_fields(item, promote_list)
|
|
579
|
+
|
|
580
|
+
# 6. 字段管理(keep/drop)
|
|
379
581
|
if keep_set is not None:
|
|
380
582
|
item = {k: v for k, v in item.items() if k in keep_set}
|
|
381
583
|
elif drop_fields is not None:
|
|
382
584
|
item = {k: v for k, v in item.items() if k not in drop_fields}
|
|
383
585
|
|
|
586
|
+
# 7. 字段重命名
|
|
587
|
+
if rename_map is not None:
|
|
588
|
+
item = _rename_item(item, rename_map)
|
|
589
|
+
|
|
590
|
+
# 8. 添加常量字段
|
|
591
|
+
if add_field_map is not None:
|
|
592
|
+
item = _add_fields(item, add_field_map)
|
|
593
|
+
|
|
594
|
+
# 9. 填充空值
|
|
595
|
+
if fill_map is not None:
|
|
596
|
+
item = _fill_empty(item, fill_map)
|
|
597
|
+
|
|
598
|
+
# 10. 字段排序(最后执行)
|
|
599
|
+
if reorder_fields is not None:
|
|
600
|
+
item = _reorder_item(item, reorder_fields)
|
|
601
|
+
|
|
384
602
|
result.append(item)
|
|
385
603
|
|
|
386
604
|
# 构建统计信息字符串列表
|
|
@@ -393,10 +611,24 @@ def _clean_data_single_pass(
|
|
|
393
611
|
step_stats.append(f"min-len: -{stats['min_len']}")
|
|
394
612
|
if stats["max_len"] > 0:
|
|
395
613
|
step_stats.append(f"max-len: -{stats['max_len']}")
|
|
614
|
+
if stats["min_tokens"] > 0:
|
|
615
|
+
step_stats.append(f"min-tokens: -{stats['min_tokens']}")
|
|
616
|
+
if stats["max_tokens"] > 0:
|
|
617
|
+
step_stats.append(f"max-tokens: -{stats['max_tokens']}")
|
|
396
618
|
if keep_fields:
|
|
397
619
|
step_stats.append(f"keep: {len(keep_fields)} 字段")
|
|
398
620
|
if drop_fields:
|
|
399
621
|
step_stats.append(f"drop: {len(drop_fields)} 字段")
|
|
622
|
+
if rename_map:
|
|
623
|
+
step_stats.append(f"rename: {len(rename_map)} 字段")
|
|
624
|
+
if promote_list:
|
|
625
|
+
step_stats.append(f"promote: {len(promote_list)} 字段")
|
|
626
|
+
if add_field_map:
|
|
627
|
+
step_stats.append(f"add-field: {len(add_field_map)} 字段")
|
|
628
|
+
if fill_map:
|
|
629
|
+
step_stats.append(f"fill: {len(fill_map)} 字段")
|
|
630
|
+
if reorder_fields:
|
|
631
|
+
step_stats.append("reorder")
|
|
400
632
|
|
|
401
633
|
return result, step_stats
|
|
402
634
|
|
|
@@ -412,6 +644,16 @@ def _clean_streaming(
|
|
|
412
644
|
max_len_value: Optional[int] = None,
|
|
413
645
|
keep_set: Optional[set] = None,
|
|
414
646
|
drop_fields_set: Optional[set] = None,
|
|
647
|
+
rename_map: Optional[Dict[str, str]] = None,
|
|
648
|
+
promote_list: Optional[List[tuple]] = None,
|
|
649
|
+
add_field_map: Optional[Dict[str, str]] = None,
|
|
650
|
+
fill_map: Optional[Dict[str, str]] = None,
|
|
651
|
+
reorder_fields: Optional[List[str]] = None,
|
|
652
|
+
min_tokens_field: Optional[str] = None,
|
|
653
|
+
min_tokens_value: Optional[int] = None,
|
|
654
|
+
max_tokens_field: Optional[str] = None,
|
|
655
|
+
max_tokens_value: Optional[int] = None,
|
|
656
|
+
token_model: str = "cl100k_base",
|
|
415
657
|
) -> int:
|
|
416
658
|
"""
|
|
417
659
|
流式清洗数据。
|
|
@@ -420,6 +662,11 @@ def _clean_streaming(
|
|
|
420
662
|
处理后的数据条数
|
|
421
663
|
"""
|
|
422
664
|
|
|
665
|
+
# 延迟导入 count_tokens(仅在需要时)
|
|
666
|
+
_count_tokens = None
|
|
667
|
+
if min_tokens_field is not None or max_tokens_field is not None:
|
|
668
|
+
from ..tokenizers import count_tokens as _count_tokens
|
|
669
|
+
|
|
423
670
|
def clean_filter(item: Dict) -> bool:
|
|
424
671
|
"""过滤函数:返回 True 保留,False 过滤(支持嵌套路径)"""
|
|
425
672
|
# 空值过滤
|
|
@@ -442,6 +689,18 @@ def _clean_streaming(
|
|
|
442
689
|
if _get_value_len(get_field_with_spec(item, max_len_field, default="")) > max_len_value:
|
|
443
690
|
return False
|
|
444
691
|
|
|
692
|
+
# 最小 token 数过滤
|
|
693
|
+
if min_tokens_field is not None:
|
|
694
|
+
value = get_field_with_spec(item, min_tokens_field, default="")
|
|
695
|
+
if _count_tokens(str(value), model=token_model) < min_tokens_value:
|
|
696
|
+
return False
|
|
697
|
+
|
|
698
|
+
# 最大 token 数过滤
|
|
699
|
+
if max_tokens_field is not None:
|
|
700
|
+
value = get_field_with_spec(item, max_tokens_field, default="")
|
|
701
|
+
if _count_tokens(str(value), model=token_model) > max_tokens_value:
|
|
702
|
+
return False
|
|
703
|
+
|
|
445
704
|
return True
|
|
446
705
|
|
|
447
706
|
def clean_transform(item: Dict) -> Dict:
|
|
@@ -468,10 +727,20 @@ def _clean_streaming(
|
|
|
468
727
|
)
|
|
469
728
|
|
|
470
729
|
# 执行过滤
|
|
471
|
-
if
|
|
730
|
+
if (
|
|
731
|
+
empty_fields is not None
|
|
732
|
+
or min_len_field is not None
|
|
733
|
+
or max_len_field is not None
|
|
734
|
+
or min_tokens_field is not None
|
|
735
|
+
or max_tokens_field is not None
|
|
736
|
+
):
|
|
472
737
|
st = st.filter(clean_filter)
|
|
473
738
|
|
|
474
|
-
#
|
|
739
|
+
# 提升嵌套字段(在 drop 之前,否则父字段被删后无法提取)
|
|
740
|
+
if promote_list is not None:
|
|
741
|
+
st = st.transform(lambda item: _promote_fields(item, promote_list))
|
|
742
|
+
|
|
743
|
+
# 执行字段管理(keep/drop)
|
|
475
744
|
if keep_set is not None or drop_fields_set is not None:
|
|
476
745
|
|
|
477
746
|
def field_transform(item):
|
|
@@ -483,4 +752,20 @@ def _clean_streaming(
|
|
|
483
752
|
|
|
484
753
|
st = st.transform(field_transform)
|
|
485
754
|
|
|
755
|
+
# 执行字段重命名
|
|
756
|
+
if rename_map is not None:
|
|
757
|
+
st = st.transform(lambda item: _rename_item(item, rename_map))
|
|
758
|
+
|
|
759
|
+
# 添加常量字段
|
|
760
|
+
if add_field_map is not None:
|
|
761
|
+
st = st.transform(lambda item: _add_fields(item, add_field_map))
|
|
762
|
+
|
|
763
|
+
# 填充空值
|
|
764
|
+
if fill_map is not None:
|
|
765
|
+
st = st.transform(lambda item: _fill_empty(item, fill_map))
|
|
766
|
+
|
|
767
|
+
# 字段排序(最后执行)
|
|
768
|
+
if reorder_fields is not None:
|
|
769
|
+
st = st.transform(lambda item: _reorder_item(item, reorder_fields))
|
|
770
|
+
|
|
486
771
|
return st.save(output_path)
|
dtflow/cli/commands.py
CHANGED
|
@@ -16,6 +16,12 @@ CLI 命令统一导出入口
|
|
|
16
16
|
# 清洗命令
|
|
17
17
|
from .clean import clean, dedupe
|
|
18
18
|
|
|
19
|
+
# 评估命令
|
|
20
|
+
from .eval import eval
|
|
21
|
+
|
|
22
|
+
# 导出命令
|
|
23
|
+
from .export import export
|
|
24
|
+
|
|
19
25
|
# IO 操作命令
|
|
20
26
|
from .io_ops import concat, diff
|
|
21
27
|
|
|
@@ -24,11 +30,14 @@ from .lineage import history
|
|
|
24
30
|
|
|
25
31
|
# Pipeline 命令
|
|
26
32
|
from .pipeline import run
|
|
27
|
-
from .sample import head, sample, tail
|
|
33
|
+
from .sample import head, sample, slice_data, tail
|
|
28
34
|
|
|
29
35
|
# Skill 命令
|
|
30
36
|
from .skill import install_skill, skill_status, uninstall_skill
|
|
31
37
|
|
|
38
|
+
# 切分命令
|
|
39
|
+
from .split import split
|
|
40
|
+
|
|
32
41
|
# 统计命令
|
|
33
42
|
from .stats import stats, token_stats
|
|
34
43
|
|
|
@@ -43,6 +52,7 @@ __all__ = [
|
|
|
43
52
|
"sample",
|
|
44
53
|
"head",
|
|
45
54
|
"tail",
|
|
55
|
+
"slice_data",
|
|
46
56
|
# 转换
|
|
47
57
|
"transform",
|
|
48
58
|
# 统计
|
|
@@ -60,6 +70,12 @@ __all__ = [
|
|
|
60
70
|
"history",
|
|
61
71
|
# 验证
|
|
62
72
|
"validate",
|
|
73
|
+
# 切分
|
|
74
|
+
"split",
|
|
75
|
+
# 导出
|
|
76
|
+
"export",
|
|
77
|
+
# 评估
|
|
78
|
+
"eval",
|
|
63
79
|
# Skill
|
|
64
80
|
"install_skill",
|
|
65
81
|
"uninstall_skill",
|