dtflow 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dtflow/cli/clean.py CHANGED
@@ -132,7 +132,15 @@ def clean(
132
132
  max_len: Optional[str] = None,
133
133
  keep: Optional[str] = None,
134
134
  drop: Optional[str] = None,
135
+ rename: Optional[str] = None,
136
+ promote: Optional[str] = None,
137
+ add_field: Optional[str] = None,
138
+ fill: Optional[str] = None,
139
+ reorder: Optional[str] = None,
135
140
  strip: bool = False,
141
+ min_tokens: Optional[str] = None,
142
+ max_tokens: Optional[str] = None,
143
+ model: str = "cl100k_base",
136
144
  output: Optional[str] = None,
137
145
  ) -> None:
138
146
  """
@@ -147,19 +155,30 @@ def clean(
147
155
  max_len: 最大长度过滤,格式 "字段:长度",字段支持嵌套路径
148
156
  keep: 只保留指定字段(逗号分隔,仅支持顶层字段)
149
157
  drop: 删除指定字段(逗号分隔,仅支持顶层字段)
158
+ rename: 重命名字段,格式 "old:new" 或 "old1:new1,old2:new2"
159
+ promote: 提升嵌套字段到顶层,格式 "path" 或 "path:name"(逗号分隔多个)
160
+ add_field: 添加常量字段,格式 "key:value"(逗号分隔多个)
161
+ fill: 填充空值,格式 "field:default_value"(逗号分隔多个)
162
+ reorder: 控制字段顺序(逗号分隔),未列出的字段追加在后面
150
163
  strip: 去除所有字符串字段的首尾空白
151
164
  output: 输出文件路径,不指定则覆盖原文件
152
165
 
153
166
  Examples:
154
167
  dt clean data.jsonl --drop-empty # 删除任意空值记录
155
168
  dt clean data.jsonl --drop-empty=text,answer # 删除指定字段为空的记录
156
- dt clean data.jsonl --drop-empty=meta.source # 删除嵌套字段为空的记录
157
169
  dt clean data.jsonl --min-len=text:10 # text 字段最少 10 字符
158
- dt clean data.jsonl --min-len=messages.#:2 # 至少 2 条消息
159
- dt clean data.jsonl --max-len=messages[-1].content:500 # 最后一条消息最多 500 字符
160
170
  dt clean data.jsonl --keep=question,answer # 只保留这些字段
161
171
  dt clean data.jsonl --drop=metadata,timestamp # 删除这些字段
172
+ dt clean data.jsonl --rename=question:instruction # 重命名字段
173
+ dt clean data.jsonl --promote=meta.label # 提升嵌套字段到顶层
174
+ dt clean data.jsonl --promote=meta.label:tag # 提升并自定义名称
175
+ dt clean data.jsonl --add-field=source:web # 添加常量字段
176
+ dt clean data.jsonl --fill=label:unknown # 填充空值
177
+ dt clean data.jsonl --reorder=id,text,label # 控制字段顺序
162
178
  dt clean data.jsonl --strip # 去除字符串首尾空白
179
+ dt clean data.jsonl --min-tokens=content:10 # content 字段最少 10 tokens
180
+ dt clean data.jsonl --max-tokens=content:1000 # content 字段最多 1000 tokens
181
+ dt clean data.jsonl --min-tokens=text:50 --model=gpt-4 # 使用 gpt-4 分词器
163
182
  """
164
183
  filepath = Path(filename)
165
184
 
@@ -173,8 +192,20 @@ def clean(
173
192
  # 解析参数
174
193
  min_len_field, min_len_value = _parse_len_param(min_len) if min_len else (None, None)
175
194
  max_len_field, max_len_value = _parse_len_param(max_len) if max_len else (None, None)
195
+ min_tokens_field, min_tokens_value = (
196
+ _parse_len_param(min_tokens) if min_tokens else (None, None)
197
+ )
198
+ max_tokens_field, max_tokens_value = (
199
+ _parse_len_param(max_tokens) if max_tokens else (None, None)
200
+ )
201
+ token_model = model
176
202
  keep_fields = _parse_field_list(keep) if keep else None
177
203
  drop_fields_set = set(_parse_field_list(drop)) if drop else None
204
+ rename_map = _parse_rename_param(rename) if rename else None
205
+ promote_list = _parse_promote_param(promote) if promote else None
206
+ add_field_map = _parse_kv_param(add_field, "add-field") if add_field else None
207
+ fill_map = _parse_kv_param(fill, "fill") if fill else None
208
+ reorder_fields = _parse_field_list(reorder) if reorder else None
178
209
  keep_set = set(keep_fields) if keep_fields else None
179
210
 
180
211
  # 构建清洗配置
@@ -197,6 +228,28 @@ def clean(
197
228
  print(f"🔄 只保留字段: {', '.join(keep_fields)}")
198
229
  if drop_fields_set:
199
230
  print(f"🔄 删除字段: {', '.join(drop_fields_set)}")
231
+ if rename_map:
232
+ rename_desc = ", ".join(f"{k} → {v}" for k, v in rename_map.items())
233
+ print(f"🔄 重命名字段: {rename_desc}")
234
+ if promote_list:
235
+ promote_desc = ", ".join(f"{src} → {dst}" for src, dst in promote_list)
236
+ print(f"🔄 提升字段: {promote_desc}")
237
+ if add_field_map:
238
+ add_desc = ", ".join(f"{k}={v}" for k, v in add_field_map.items())
239
+ print(f"🔄 添加字段: {add_desc}")
240
+ if fill_map:
241
+ fill_desc = ", ".join(f"{k}={v}" for k, v in fill_map.items())
242
+ print(f"🔄 填充空值: {fill_desc}")
243
+ if reorder_fields:
244
+ print(f"🔄 字段排序: {', '.join(reorder_fields)}")
245
+ if min_tokens_field:
246
+ print(
247
+ f"🔄 过滤 {min_tokens_field} tokens < {min_tokens_value} 的记录 (model={token_model})..."
248
+ )
249
+ if max_tokens_field:
250
+ print(
251
+ f"🔄 过滤 {max_tokens_field} tokens > {max_tokens_value} 的记录 (model={token_model})..."
252
+ )
200
253
 
201
254
  output_path = output or str(filepath)
202
255
 
@@ -234,6 +287,16 @@ def clean(
234
287
  max_len_value=max_len_value,
235
288
  keep_set=keep_set,
236
289
  drop_fields_set=drop_fields_set,
290
+ rename_map=rename_map,
291
+ promote_list=promote_list,
292
+ add_field_map=add_field_map,
293
+ fill_map=fill_map,
294
+ reorder_fields=reorder_fields,
295
+ min_tokens_field=min_tokens_field,
296
+ min_tokens_value=min_tokens_value,
297
+ max_tokens_field=max_tokens_field,
298
+ max_tokens_value=max_tokens_value,
299
+ token_model=token_model,
237
300
  )
238
301
 
239
302
  # 如果使用了临时文件,移动到目标位置
@@ -274,6 +337,16 @@ def clean(
274
337
  max_len_value=max_len_value,
275
338
  keep_fields=keep_fields,
276
339
  drop_fields=drop_fields_set,
340
+ rename_map=rename_map,
341
+ promote_list=promote_list,
342
+ add_field_map=add_field_map,
343
+ fill_map=fill_map,
344
+ reorder_fields=reorder_fields,
345
+ min_tokens_field=min_tokens_field,
346
+ min_tokens_value=min_tokens_value,
347
+ max_tokens_field=max_tokens_field,
348
+ max_tokens_value=max_tokens_value,
349
+ token_model=token_model,
277
350
  )
278
351
 
279
352
  # 保存结果
@@ -288,12 +361,106 @@ def clean(
288
361
 
289
362
  # 打印统计
290
363
  removed_count = original_count - final_count
291
- print(f"\n✅ 完成!")
364
+ print("\n✅ 完成!")
292
365
  print(f" 原始: {original_count} 条 -> 清洗后: {final_count} 条 (删除 {removed_count} 条)")
293
366
  if step_stats:
294
367
  print(f" 步骤: {' | '.join(step_stats)}")
295
368
 
296
369
 
370
+ def _parse_rename_param(param: str) -> Dict[str, str]:
371
+ """解析重命名参数,格式 'old:new' 或 'old1:new1,old2:new2'"""
372
+ rename_map = {}
373
+ for pair in param.split(","):
374
+ pair = pair.strip()
375
+ if ":" not in pair:
376
+ raise ValueError(f"重命名参数格式错误: {pair},应为 'old:new'")
377
+ old, new = pair.split(":", 1)
378
+ old, new = old.strip(), new.strip()
379
+ if not old or not new:
380
+ raise ValueError(f"重命名参数格式错误: {pair},字段名不能为空")
381
+ rename_map[old] = new
382
+ return rename_map
383
+
384
+
385
+ def _parse_promote_param(param: str) -> List[tuple]:
386
+ """
387
+ 解析提升参数,格式 'path' 或 'path:name'(逗号分隔多个)。
388
+
389
+ Returns:
390
+ [(source_path, target_name), ...]
391
+ """
392
+ result = []
393
+ for item in param.split(","):
394
+ item = item.strip()
395
+ if ":" in item:
396
+ src, dst = item.split(":", 1)
397
+ src, dst = src.strip(), dst.strip()
398
+ else:
399
+ src = item
400
+ # 默认用路径最后一段作为目标名
401
+ dst = src.rsplit(".", 1)[-1] if "." in src else src
402
+ if not src or not dst:
403
+ raise ValueError(f"promote 参数格式错误: {item}")
404
+ result.append((src, dst))
405
+ return result
406
+
407
+
408
+ def _parse_kv_param(param: str, param_name: str) -> Dict[str, str]:
409
+ """解析 key:value 格式参数(通用),用于 --add-field 和 --fill"""
410
+ kv_map = {}
411
+ for pair in param.split(","):
412
+ pair = pair.strip()
413
+ if ":" not in pair:
414
+ raise ValueError(f"{param_name} 参数格式错误: {pair},应为 'key:value'")
415
+ key, value = pair.split(":", 1)
416
+ key, value = key.strip(), value.strip()
417
+ if not key:
418
+ raise ValueError(f"{param_name} 参数格式错误: {pair},key 不能为空")
419
+ kv_map[key] = value
420
+ return kv_map
421
+
422
+
423
+ def _rename_item(item: Dict, rename_map: Dict[str, str]) -> Dict:
424
+ """重命名字段,保持字段顺序"""
425
+ return {rename_map.get(k, k): v for k, v in item.items()}
426
+
427
+
428
+ def _promote_fields(item: Dict, promote_list: List[tuple]) -> Dict:
429
+ """提升嵌套字段到顶层(始终添加字段,即使值为 None)"""
430
+ item = dict(item)
431
+ for src_path, dst_name in promote_list:
432
+ item[dst_name] = get_field_with_spec(item, src_path)
433
+ return item
434
+
435
+
436
+ def _add_fields(item: Dict, add_field_map: Dict[str, str]) -> Dict:
437
+ """添加常量字段"""
438
+ item = dict(item)
439
+ item.update(add_field_map)
440
+ return item
441
+
442
+
443
+ def _fill_empty(item: Dict, fill_map: Dict[str, str]) -> Dict:
444
+ """填充空值(字段不存在时也会添加)"""
445
+ item = dict(item)
446
+ for field, default in fill_map.items():
447
+ if field not in item or _is_empty_value(item[field]):
448
+ item[field] = default
449
+ return item
450
+
451
+
452
+ def _reorder_item(item: Dict, reorder_fields: List[str]) -> Dict:
453
+ """按指定顺序重排字段,未列出的字段追加在后面"""
454
+ ordered = {}
455
+ for f in reorder_fields:
456
+ if f in item:
457
+ ordered[f] = item[f]
458
+ for k, v in item.items():
459
+ if k not in ordered:
460
+ ordered[k] = v
461
+ return ordered
462
+
463
+
297
464
  def _parse_len_param(param: str) -> tuple:
298
465
  """解析长度参数,格式 'field:length'"""
299
466
  if ":" not in param:
@@ -302,8 +469,8 @@ def _parse_len_param(param: str) -> tuple:
302
469
  field = parts[0].strip()
303
470
  try:
304
471
  length = int(parts[1].strip())
305
- except ValueError:
306
- raise ValueError(f"长度必须是整数: {parts[1]}")
472
+ except ValueError as e:
473
+ raise ValueError(f"长度必须是整数: {parts[1]}") from e
307
474
  return field, length
308
475
 
309
476
 
@@ -317,6 +484,16 @@ def _clean_data_single_pass(
317
484
  max_len_value: Optional[int] = None,
318
485
  keep_fields: Optional[List[str]] = None,
319
486
  drop_fields: Optional[set] = None,
487
+ rename_map: Optional[Dict[str, str]] = None,
488
+ promote_list: Optional[List[tuple]] = None,
489
+ add_field_map: Optional[Dict[str, str]] = None,
490
+ fill_map: Optional[Dict[str, str]] = None,
491
+ reorder_fields: Optional[List[str]] = None,
492
+ min_tokens_field: Optional[str] = None,
493
+ min_tokens_value: Optional[int] = None,
494
+ max_tokens_field: Optional[str] = None,
495
+ max_tokens_value: Optional[int] = None,
496
+ token_model: str = "cl100k_base",
320
497
  ) -> tuple:
321
498
  """
322
499
  单次遍历执行所有清洗操作。
@@ -335,11 +512,18 @@ def _clean_data_single_pass(
335
512
  Returns:
336
513
  (清洗后的数据, 统计信息列表)
337
514
  """
515
+ # 延迟导入 count_tokens(仅在需要时)
516
+ _count_tokens = None
517
+ if min_tokens_field is not None or max_tokens_field is not None:
518
+ from ..tokenizers import count_tokens as _count_tokens
519
+
338
520
  result = []
339
521
  stats = {
340
522
  "drop_empty": 0,
341
523
  "min_len": 0,
342
524
  "max_len": 0,
525
+ "min_tokens": 0,
526
+ "max_tokens": 0,
343
527
  }
344
528
 
345
529
  # 预先计算 keep_fields 集合(如果有的话)
@@ -375,12 +559,46 @@ def _clean_data_single_pass(
375
559
  stats["max_len"] += 1
376
560
  continue
377
561
 
378
- # 5. 字段管理(keep/drop)
562
+ # 4.5 最小 token 数过滤
563
+ if min_tokens_field is not None:
564
+ value = get_field_with_spec(item, min_tokens_field, default="")
565
+ if _count_tokens(str(value), model=token_model) < min_tokens_value:
566
+ stats["min_tokens"] += 1
567
+ continue
568
+
569
+ # 4.6 最大 token 数过滤
570
+ if max_tokens_field is not None:
571
+ value = get_field_with_spec(item, max_tokens_field, default="")
572
+ if _count_tokens(str(value), model=token_model) > max_tokens_value:
573
+ stats["max_tokens"] += 1
574
+ continue
575
+
576
+ # 5. 提升嵌套字段(在 drop 之前,否则父字段被删后无法提取)
577
+ if promote_list is not None:
578
+ item = _promote_fields(item, promote_list)
579
+
580
+ # 6. 字段管理(keep/drop)
379
581
  if keep_set is not None:
380
582
  item = {k: v for k, v in item.items() if k in keep_set}
381
583
  elif drop_fields is not None:
382
584
  item = {k: v for k, v in item.items() if k not in drop_fields}
383
585
 
586
+ # 7. 字段重命名
587
+ if rename_map is not None:
588
+ item = _rename_item(item, rename_map)
589
+
590
+ # 8. 添加常量字段
591
+ if add_field_map is not None:
592
+ item = _add_fields(item, add_field_map)
593
+
594
+ # 9. 填充空值
595
+ if fill_map is not None:
596
+ item = _fill_empty(item, fill_map)
597
+
598
+ # 10. 字段排序(最后执行)
599
+ if reorder_fields is not None:
600
+ item = _reorder_item(item, reorder_fields)
601
+
384
602
  result.append(item)
385
603
 
386
604
  # 构建统计信息字符串列表
@@ -393,10 +611,24 @@ def _clean_data_single_pass(
393
611
  step_stats.append(f"min-len: -{stats['min_len']}")
394
612
  if stats["max_len"] > 0:
395
613
  step_stats.append(f"max-len: -{stats['max_len']}")
614
+ if stats["min_tokens"] > 0:
615
+ step_stats.append(f"min-tokens: -{stats['min_tokens']}")
616
+ if stats["max_tokens"] > 0:
617
+ step_stats.append(f"max-tokens: -{stats['max_tokens']}")
396
618
  if keep_fields:
397
619
  step_stats.append(f"keep: {len(keep_fields)} 字段")
398
620
  if drop_fields:
399
621
  step_stats.append(f"drop: {len(drop_fields)} 字段")
622
+ if rename_map:
623
+ step_stats.append(f"rename: {len(rename_map)} 字段")
624
+ if promote_list:
625
+ step_stats.append(f"promote: {len(promote_list)} 字段")
626
+ if add_field_map:
627
+ step_stats.append(f"add-field: {len(add_field_map)} 字段")
628
+ if fill_map:
629
+ step_stats.append(f"fill: {len(fill_map)} 字段")
630
+ if reorder_fields:
631
+ step_stats.append("reorder")
400
632
 
401
633
  return result, step_stats
402
634
 
@@ -412,6 +644,16 @@ def _clean_streaming(
412
644
  max_len_value: Optional[int] = None,
413
645
  keep_set: Optional[set] = None,
414
646
  drop_fields_set: Optional[set] = None,
647
+ rename_map: Optional[Dict[str, str]] = None,
648
+ promote_list: Optional[List[tuple]] = None,
649
+ add_field_map: Optional[Dict[str, str]] = None,
650
+ fill_map: Optional[Dict[str, str]] = None,
651
+ reorder_fields: Optional[List[str]] = None,
652
+ min_tokens_field: Optional[str] = None,
653
+ min_tokens_value: Optional[int] = None,
654
+ max_tokens_field: Optional[str] = None,
655
+ max_tokens_value: Optional[int] = None,
656
+ token_model: str = "cl100k_base",
415
657
  ) -> int:
416
658
  """
417
659
  流式清洗数据。
@@ -420,6 +662,11 @@ def _clean_streaming(
420
662
  处理后的数据条数
421
663
  """
422
664
 
665
+ # 延迟导入 count_tokens(仅在需要时)
666
+ _count_tokens = None
667
+ if min_tokens_field is not None or max_tokens_field is not None:
668
+ from ..tokenizers import count_tokens as _count_tokens
669
+
423
670
  def clean_filter(item: Dict) -> bool:
424
671
  """过滤函数:返回 True 保留,False 过滤(支持嵌套路径)"""
425
672
  # 空值过滤
@@ -442,6 +689,18 @@ def _clean_streaming(
442
689
  if _get_value_len(get_field_with_spec(item, max_len_field, default="")) > max_len_value:
443
690
  return False
444
691
 
692
+ # 最小 token 数过滤
693
+ if min_tokens_field is not None:
694
+ value = get_field_with_spec(item, min_tokens_field, default="")
695
+ if _count_tokens(str(value), model=token_model) < min_tokens_value:
696
+ return False
697
+
698
+ # 最大 token 数过滤
699
+ if max_tokens_field is not None:
700
+ value = get_field_with_spec(item, max_tokens_field, default="")
701
+ if _count_tokens(str(value), model=token_model) > max_tokens_value:
702
+ return False
703
+
445
704
  return True
446
705
 
447
706
  def clean_transform(item: Dict) -> Dict:
@@ -468,10 +727,20 @@ def _clean_streaming(
468
727
  )
469
728
 
470
729
  # 执行过滤
471
- if empty_fields is not None or min_len_field is not None or max_len_field is not None:
730
+ if (
731
+ empty_fields is not None
732
+ or min_len_field is not None
733
+ or max_len_field is not None
734
+ or min_tokens_field is not None
735
+ or max_tokens_field is not None
736
+ ):
472
737
  st = st.filter(clean_filter)
473
738
 
474
- # 执行字段管理(如果没有 strip,也需要在这里处理)
739
+ # 提升嵌套字段(在 drop 之前,否则父字段被删后无法提取)
740
+ if promote_list is not None:
741
+ st = st.transform(lambda item: _promote_fields(item, promote_list))
742
+
743
+ # 执行字段管理(keep/drop)
475
744
  if keep_set is not None or drop_fields_set is not None:
476
745
 
477
746
  def field_transform(item):
@@ -483,4 +752,20 @@ def _clean_streaming(
483
752
 
484
753
  st = st.transform(field_transform)
485
754
 
755
+ # 执行字段重命名
756
+ if rename_map is not None:
757
+ st = st.transform(lambda item: _rename_item(item, rename_map))
758
+
759
+ # 添加常量字段
760
+ if add_field_map is not None:
761
+ st = st.transform(lambda item: _add_fields(item, add_field_map))
762
+
763
+ # 填充空值
764
+ if fill_map is not None:
765
+ st = st.transform(lambda item: _fill_empty(item, fill_map))
766
+
767
+ # 字段排序(最后执行)
768
+ if reorder_fields is not None:
769
+ st = st.transform(lambda item: _reorder_item(item, reorder_fields))
770
+
486
771
  return st.save(output_path)
dtflow/cli/commands.py CHANGED
@@ -16,6 +16,12 @@ CLI 命令统一导出入口
16
16
  # 清洗命令
17
17
  from .clean import clean, dedupe
18
18
 
19
+ # 评估命令
20
+ from .eval import eval
21
+
22
+ # 导出命令
23
+ from .export import export
24
+
19
25
  # IO 操作命令
20
26
  from .io_ops import concat, diff
21
27
 
@@ -24,11 +30,14 @@ from .lineage import history
24
30
 
25
31
  # Pipeline 命令
26
32
  from .pipeline import run
27
- from .sample import head, sample, tail
33
+ from .sample import head, sample, slice_data, tail
28
34
 
29
35
  # Skill 命令
30
36
  from .skill import install_skill, skill_status, uninstall_skill
31
37
 
38
+ # 切分命令
39
+ from .split import split
40
+
32
41
  # 统计命令
33
42
  from .stats import stats, token_stats
34
43
 
@@ -43,6 +52,7 @@ __all__ = [
43
52
  "sample",
44
53
  "head",
45
54
  "tail",
55
+ "slice_data",
46
56
  # 转换
47
57
  "transform",
48
58
  # 统计
@@ -60,6 +70,12 @@ __all__ = [
60
70
  "history",
61
71
  # 验证
62
72
  "validate",
73
+ # 切分
74
+ "split",
75
+ # 导出
76
+ "export",
77
+ # 评估
78
+ "eval",
63
79
  # Skill
64
80
  "install_skill",
65
81
  "uninstall_skill",