dtflow 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dtflow/cli/clean.py CHANGED
@@ -132,6 +132,11 @@ def clean(
132
132
  max_len: Optional[str] = None,
133
133
  keep: Optional[str] = None,
134
134
  drop: Optional[str] = None,
135
+ rename: Optional[str] = None,
136
+ promote: Optional[str] = None,
137
+ add_field: Optional[str] = None,
138
+ fill: Optional[str] = None,
139
+ reorder: Optional[str] = None,
135
140
  strip: bool = False,
136
141
  output: Optional[str] = None,
137
142
  ) -> None:
@@ -147,18 +152,26 @@ def clean(
147
152
  max_len: 最大长度过滤,格式 "字段:长度",字段支持嵌套路径
148
153
  keep: 只保留指定字段(逗号分隔,仅支持顶层字段)
149
154
  drop: 删除指定字段(逗号分隔,仅支持顶层字段)
155
+ rename: 重命名字段,格式 "old:new" 或 "old1:new1,old2:new2"
156
+ promote: 提升嵌套字段到顶层,格式 "path" 或 "path:name"(逗号分隔多个)
157
+ add_field: 添加常量字段,格式 "key:value"(逗号分隔多个)
158
+ fill: 填充空值,格式 "field:default_value"(逗号分隔多个)
159
+ reorder: 控制字段顺序(逗号分隔),未列出的字段追加在后面
150
160
  strip: 去除所有字符串字段的首尾空白
151
161
  output: 输出文件路径,不指定则覆盖原文件
152
162
 
153
163
  Examples:
154
164
  dt clean data.jsonl --drop-empty # 删除任意空值记录
155
165
  dt clean data.jsonl --drop-empty=text,answer # 删除指定字段为空的记录
156
- dt clean data.jsonl --drop-empty=meta.source # 删除嵌套字段为空的记录
157
166
  dt clean data.jsonl --min-len=text:10 # text 字段最少 10 字符
158
- dt clean data.jsonl --min-len=messages.#:2 # 至少 2 条消息
159
- dt clean data.jsonl --max-len=messages[-1].content:500 # 最后一条消息最多 500 字符
160
167
  dt clean data.jsonl --keep=question,answer # 只保留这些字段
161
168
  dt clean data.jsonl --drop=metadata,timestamp # 删除这些字段
169
+ dt clean data.jsonl --rename=question:instruction # 重命名字段
170
+ dt clean data.jsonl --promote=meta.label # 提升嵌套字段到顶层
171
+ dt clean data.jsonl --promote=meta.label:tag # 提升并自定义名称
172
+ dt clean data.jsonl --add-field=source:web # 添加常量字段
173
+ dt clean data.jsonl --fill=label:unknown # 填充空值
174
+ dt clean data.jsonl --reorder=id,text,label # 控制字段顺序
162
175
  dt clean data.jsonl --strip # 去除字符串首尾空白
163
176
  """
164
177
  filepath = Path(filename)
@@ -175,6 +188,11 @@ def clean(
175
188
  max_len_field, max_len_value = _parse_len_param(max_len) if max_len else (None, None)
176
189
  keep_fields = _parse_field_list(keep) if keep else None
177
190
  drop_fields_set = set(_parse_field_list(drop)) if drop else None
191
+ rename_map = _parse_rename_param(rename) if rename else None
192
+ promote_list = _parse_promote_param(promote) if promote else None
193
+ add_field_map = _parse_kv_param(add_field, "add-field") if add_field else None
194
+ fill_map = _parse_kv_param(fill, "fill") if fill else None
195
+ reorder_fields = _parse_field_list(reorder) if reorder else None
178
196
  keep_set = set(keep_fields) if keep_fields else None
179
197
 
180
198
  # 构建清洗配置
@@ -197,6 +215,20 @@ def clean(
197
215
  print(f"🔄 只保留字段: {', '.join(keep_fields)}")
198
216
  if drop_fields_set:
199
217
  print(f"🔄 删除字段: {', '.join(drop_fields_set)}")
218
+ if rename_map:
219
+ rename_desc = ", ".join(f"{k} → {v}" for k, v in rename_map.items())
220
+ print(f"🔄 重命名字段: {rename_desc}")
221
+ if promote_list:
222
+ promote_desc = ", ".join(f"{src} → {dst}" for src, dst in promote_list)
223
+ print(f"🔄 提升字段: {promote_desc}")
224
+ if add_field_map:
225
+ add_desc = ", ".join(f"{k}={v}" for k, v in add_field_map.items())
226
+ print(f"🔄 添加字段: {add_desc}")
227
+ if fill_map:
228
+ fill_desc = ", ".join(f"{k}={v}" for k, v in fill_map.items())
229
+ print(f"🔄 填充空值: {fill_desc}")
230
+ if reorder_fields:
231
+ print(f"🔄 字段排序: {', '.join(reorder_fields)}")
200
232
 
201
233
  output_path = output or str(filepath)
202
234
 
@@ -234,6 +266,11 @@ def clean(
234
266
  max_len_value=max_len_value,
235
267
  keep_set=keep_set,
236
268
  drop_fields_set=drop_fields_set,
269
+ rename_map=rename_map,
270
+ promote_list=promote_list,
271
+ add_field_map=add_field_map,
272
+ fill_map=fill_map,
273
+ reorder_fields=reorder_fields,
237
274
  )
238
275
 
239
276
  # 如果使用了临时文件,移动到目标位置
@@ -274,6 +311,11 @@ def clean(
274
311
  max_len_value=max_len_value,
275
312
  keep_fields=keep_fields,
276
313
  drop_fields=drop_fields_set,
314
+ rename_map=rename_map,
315
+ promote_list=promote_list,
316
+ add_field_map=add_field_map,
317
+ fill_map=fill_map,
318
+ reorder_fields=reorder_fields,
277
319
  )
278
320
 
279
321
  # 保存结果
@@ -288,12 +330,106 @@ def clean(
288
330
 
289
331
  # 打印统计
290
332
  removed_count = original_count - final_count
291
- print(f"\n✅ 完成!")
333
+ print("\n✅ 完成!")
292
334
  print(f" 原始: {original_count} 条 -> 清洗后: {final_count} 条 (删除 {removed_count} 条)")
293
335
  if step_stats:
294
336
  print(f" 步骤: {' | '.join(step_stats)}")
295
337
 
296
338
 
339
+ def _parse_rename_param(param: str) -> Dict[str, str]:
340
+ """解析重命名参数,格式 'old:new' 或 'old1:new1,old2:new2'"""
341
+ rename_map = {}
342
+ for pair in param.split(","):
343
+ pair = pair.strip()
344
+ if ":" not in pair:
345
+ raise ValueError(f"重命名参数格式错误: {pair},应为 'old:new'")
346
+ old, new = pair.split(":", 1)
347
+ old, new = old.strip(), new.strip()
348
+ if not old or not new:
349
+ raise ValueError(f"重命名参数格式错误: {pair},字段名不能为空")
350
+ rename_map[old] = new
351
+ return rename_map
352
+
353
+
354
+ def _parse_promote_param(param: str) -> List[tuple]:
355
+ """
356
+ 解析提升参数,格式 'path' 或 'path:name'(逗号分隔多个)。
357
+
358
+ Returns:
359
+ [(source_path, target_name), ...]
360
+ """
361
+ result = []
362
+ for item in param.split(","):
363
+ item = item.strip()
364
+ if ":" in item:
365
+ src, dst = item.split(":", 1)
366
+ src, dst = src.strip(), dst.strip()
367
+ else:
368
+ src = item
369
+ # 默认用路径最后一段作为目标名
370
+ dst = src.rsplit(".", 1)[-1] if "." in src else src
371
+ if not src or not dst:
372
+ raise ValueError(f"promote 参数格式错误: {item}")
373
+ result.append((src, dst))
374
+ return result
375
+
376
+
377
+ def _parse_kv_param(param: str, param_name: str) -> Dict[str, str]:
378
+ """解析 key:value 格式参数(通用),用于 --add-field 和 --fill"""
379
+ kv_map = {}
380
+ for pair in param.split(","):
381
+ pair = pair.strip()
382
+ if ":" not in pair:
383
+ raise ValueError(f"{param_name} 参数格式错误: {pair},应为 'key:value'")
384
+ key, value = pair.split(":", 1)
385
+ key, value = key.strip(), value.strip()
386
+ if not key:
387
+ raise ValueError(f"{param_name} 参数格式错误: {pair},key 不能为空")
388
+ kv_map[key] = value
389
+ return kv_map
390
+
391
+
392
+ def _rename_item(item: Dict, rename_map: Dict[str, str]) -> Dict:
393
+ """重命名字段,保持字段顺序"""
394
+ return {rename_map.get(k, k): v for k, v in item.items()}
395
+
396
+
397
+ def _promote_fields(item: Dict, promote_list: List[tuple]) -> Dict:
398
+ """提升嵌套字段到顶层(始终添加字段,即使值为 None)"""
399
+ item = dict(item)
400
+ for src_path, dst_name in promote_list:
401
+ item[dst_name] = get_field_with_spec(item, src_path)
402
+ return item
403
+
404
+
405
+ def _add_fields(item: Dict, add_field_map: Dict[str, str]) -> Dict:
406
+ """添加常量字段"""
407
+ item = dict(item)
408
+ item.update(add_field_map)
409
+ return item
410
+
411
+
412
+ def _fill_empty(item: Dict, fill_map: Dict[str, str]) -> Dict:
413
+ """填充空值(字段不存在时也会添加)"""
414
+ item = dict(item)
415
+ for field, default in fill_map.items():
416
+ if field not in item or _is_empty_value(item[field]):
417
+ item[field] = default
418
+ return item
419
+
420
+
421
+ def _reorder_item(item: Dict, reorder_fields: List[str]) -> Dict:
422
+ """按指定顺序重排字段,未列出的字段追加在后面"""
423
+ ordered = {}
424
+ for f in reorder_fields:
425
+ if f in item:
426
+ ordered[f] = item[f]
427
+ for k, v in item.items():
428
+ if k not in ordered:
429
+ ordered[k] = v
430
+ return ordered
431
+
432
+
297
433
  def _parse_len_param(param: str) -> tuple:
298
434
  """解析长度参数,格式 'field:length'"""
299
435
  if ":" not in param:
@@ -302,8 +438,8 @@ def _parse_len_param(param: str) -> tuple:
302
438
  field = parts[0].strip()
303
439
  try:
304
440
  length = int(parts[1].strip())
305
- except ValueError:
306
- raise ValueError(f"长度必须是整数: {parts[1]}")
441
+ except ValueError as e:
442
+ raise ValueError(f"长度必须是整数: {parts[1]}") from e
307
443
  return field, length
308
444
 
309
445
 
@@ -317,6 +453,11 @@ def _clean_data_single_pass(
317
453
  max_len_value: Optional[int] = None,
318
454
  keep_fields: Optional[List[str]] = None,
319
455
  drop_fields: Optional[set] = None,
456
+ rename_map: Optional[Dict[str, str]] = None,
457
+ promote_list: Optional[List[tuple]] = None,
458
+ add_field_map: Optional[Dict[str, str]] = None,
459
+ fill_map: Optional[Dict[str, str]] = None,
460
+ reorder_fields: Optional[List[str]] = None,
320
461
  ) -> tuple:
321
462
  """
322
463
  单次遍历执行所有清洗操作。
@@ -375,12 +516,32 @@ def _clean_data_single_pass(
375
516
  stats["max_len"] += 1
376
517
  continue
377
518
 
378
- # 5. 字段管理(keep/drop
519
+ # 5. 提升嵌套字段(在 drop 之前,否则父字段被删后无法提取)
520
+ if promote_list is not None:
521
+ item = _promote_fields(item, promote_list)
522
+
523
+ # 6. 字段管理(keep/drop)
379
524
  if keep_set is not None:
380
525
  item = {k: v for k, v in item.items() if k in keep_set}
381
526
  elif drop_fields is not None:
382
527
  item = {k: v for k, v in item.items() if k not in drop_fields}
383
528
 
529
+ # 7. 字段重命名
530
+ if rename_map is not None:
531
+ item = _rename_item(item, rename_map)
532
+
533
+ # 8. 添加常量字段
534
+ if add_field_map is not None:
535
+ item = _add_fields(item, add_field_map)
536
+
537
+ # 9. 填充空值
538
+ if fill_map is not None:
539
+ item = _fill_empty(item, fill_map)
540
+
541
+ # 10. 字段排序(最后执行)
542
+ if reorder_fields is not None:
543
+ item = _reorder_item(item, reorder_fields)
544
+
384
545
  result.append(item)
385
546
 
386
547
  # 构建统计信息字符串列表
@@ -397,6 +558,16 @@ def _clean_data_single_pass(
397
558
  step_stats.append(f"keep: {len(keep_fields)} 字段")
398
559
  if drop_fields:
399
560
  step_stats.append(f"drop: {len(drop_fields)} 字段")
561
+ if rename_map:
562
+ step_stats.append(f"rename: {len(rename_map)} 字段")
563
+ if promote_list:
564
+ step_stats.append(f"promote: {len(promote_list)} 字段")
565
+ if add_field_map:
566
+ step_stats.append(f"add-field: {len(add_field_map)} 字段")
567
+ if fill_map:
568
+ step_stats.append(f"fill: {len(fill_map)} 字段")
569
+ if reorder_fields:
570
+ step_stats.append("reorder")
400
571
 
401
572
  return result, step_stats
402
573
 
@@ -412,6 +583,11 @@ def _clean_streaming(
412
583
  max_len_value: Optional[int] = None,
413
584
  keep_set: Optional[set] = None,
414
585
  drop_fields_set: Optional[set] = None,
586
+ rename_map: Optional[Dict[str, str]] = None,
587
+ promote_list: Optional[List[tuple]] = None,
588
+ add_field_map: Optional[Dict[str, str]] = None,
589
+ fill_map: Optional[Dict[str, str]] = None,
590
+ reorder_fields: Optional[List[str]] = None,
415
591
  ) -> int:
416
592
  """
417
593
  流式清洗数据。
@@ -471,7 +647,11 @@ def _clean_streaming(
471
647
  if empty_fields is not None or min_len_field is not None or max_len_field is not None:
472
648
  st = st.filter(clean_filter)
473
649
 
474
- # 执行字段管理(如果没有 strip,也需要在这里处理)
650
+ # 提升嵌套字段(在 drop 之前,否则父字段被删后无法提取)
651
+ if promote_list is not None:
652
+ st = st.transform(lambda item: _promote_fields(item, promote_list))
653
+
654
+ # 执行字段管理(keep/drop)
475
655
  if keep_set is not None or drop_fields_set is not None:
476
656
 
477
657
  def field_transform(item):
@@ -483,4 +663,20 @@ def _clean_streaming(
483
663
 
484
664
  st = st.transform(field_transform)
485
665
 
666
+ # 执行字段重命名
667
+ if rename_map is not None:
668
+ st = st.transform(lambda item: _rename_item(item, rename_map))
669
+
670
+ # 添加常量字段
671
+ if add_field_map is not None:
672
+ st = st.transform(lambda item: _add_fields(item, add_field_map))
673
+
674
+ # 填充空值
675
+ if fill_map is not None:
676
+ st = st.transform(lambda item: _fill_empty(item, fill_map))
677
+
678
+ # 字段排序(最后执行)
679
+ if reorder_fields is not None:
680
+ st = st.transform(lambda item: _reorder_item(item, reorder_fields))
681
+
486
682
  return st.save(output_path)
dtflow/cli/commands.py CHANGED
@@ -13,25 +13,27 @@ CLI 命令统一导出入口
13
13
  """
14
14
 
15
15
  # 采样命令
16
- from .sample import head, sample, tail
17
-
18
- # 转换命令
19
- from .transform import transform
20
-
21
- # 统计命令
22
- from .stats import stats, token_stats
23
-
24
16
  # 清洗命令
25
17
  from .clean import clean, dedupe
26
18
 
27
19
  # IO 操作命令
28
20
  from .io_ops import concat, diff
29
21
 
22
+ # 血缘追踪命令
23
+ from .lineage import history
24
+
30
25
  # Pipeline 命令
31
26
  from .pipeline import run
27
+ from .sample import head, sample, tail
32
28
 
33
- # 血缘追踪命令
34
- from .lineage import history
29
+ # Skill 命令
30
+ from .skill import install_skill, skill_status, uninstall_skill
31
+
32
+ # 统计命令
33
+ from .stats import stats, token_stats
34
+
35
+ # 转换命令
36
+ from .transform import transform
35
37
 
36
38
  # 验证命令
37
39
  from .validate import validate
@@ -58,4 +60,8 @@ __all__ = [
58
60
  "history",
59
61
  # 验证
60
62
  "validate",
63
+ # Skill
64
+ "install_skill",
65
+ "uninstall_skill",
66
+ "skill_status",
61
67
  ]
dtflow/cli/skill.py ADDED
@@ -0,0 +1,72 @@
1
+ """
2
+ Claude Code Skill 安装命令
3
+ """
4
+
5
+ import shutil
6
+ from pathlib import Path
7
+
8
+ from rich.console import Console
9
+
10
+ console = Console()
11
+
12
+
13
def get_skill_source_path() -> Path:
    """Return the location of the SKILL.md file.

    The file is looked up two directory levels above this module file, i.e.
    in the parent of the directory that contains this module.
    """
    package_root = Path(__file__).parent.parent
    return package_root.joinpath("SKILL.md")
16
+
17
+
18
def get_skill_target_dir() -> Path:
    """Return the directory the skill is installed into.

    This is ``.claude/skills/dtflow`` under the current user's home directory.
    """
    return Path.home().joinpath(".claude", "skills", "dtflow")
21
+
22
+
23
def install_skill() -> None:
    """Install the dtflow skill for Claude Code.

    Copies the SKILL.md source file into the skill target directory (creating
    the directory if needed) and prints a confirmation with the destination.

    Raises:
        SystemExit: with code 1 when the SKILL.md source file does not exist.
    """
    source = get_skill_source_path()
    install_dir = get_skill_target_dir()
    target = install_dir.joinpath("SKILL.md")

    # Bail out early when the source file is missing.
    if not source.exists():
        console.print("[red]错误: SKILL.md 源文件不存在[/red]")
        raise SystemExit(1)

    # Make sure the destination directory exists, then copy the file with its
    # metadata.
    install_dir.mkdir(parents=True, exist_ok=True)
    shutil.copy2(source, target)

    for message in (
        "[green]✓[/green] 已安装 dtflow skill 到 Claude Code",
        f" [dim]{target}[/dim]",
    ):
        console.print(message)
    console.print()
    console.print("[dim]在 Claude Code 中使用 /dtflow 调用此 skill[/dim]")
43
+
44
+
45
def uninstall_skill() -> None:
    """Remove the installed dtflow skill.

    Deletes SKILL.md from the skill target directory and, when that leaves the
    directory empty, removes the directory as well. Prints a notice and does
    nothing else when the skill file is not present.
    """
    install_dir = get_skill_target_dir()
    target = install_dir.joinpath("SKILL.md")

    if not target.exists():
        console.print("[yellow]dtflow skill 未安装[/yellow]")
        return

    target.unlink()

    # Clean up the directory too, but only if nothing else lives in it.
    if install_dir.exists():
        first_entry = next(install_dir.iterdir(), None)
        if first_entry is None:
            install_dir.rmdir()

    console.print("[green]✓[/green] 已卸载 dtflow skill")
61
+
62
+
63
def skill_status() -> None:
    """Print whether the dtflow skill file is currently installed.

    When SKILL.md exists in the skill target directory its path is shown;
    otherwise a hint on how to install it is printed.
    """
    target = get_skill_target_dir().joinpath("SKILL.md")

    if not target.exists():
        console.print("[yellow]✗[/yellow] dtflow skill 未安装")
        console.print(" [dim]运行 dt install-skill 安装[/dim]")
        return

    console.print("[green]✓[/green] dtflow skill 已安装")
    console.print(f" [dim]{target}[/dim]")