dtflow 0.5.8.tar.gz → 0.5.10.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. {dtflow-0.5.8 → dtflow-0.5.10}/PKG-INFO +29 -1
  2. {dtflow-0.5.8 → dtflow-0.5.10}/README.md +23 -0
  3. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/SKILL.md +22 -6
  4. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/__init__.py +1 -1
  5. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/__main__.py +106 -6
  6. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/cli/clean.py +90 -1
  7. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/cli/commands.py +17 -1
  8. dtflow-0.5.10/dtflow/cli/eval.py +288 -0
  9. dtflow-0.5.10/dtflow/cli/export.py +81 -0
  10. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/cli/sample.py +90 -3
  11. dtflow-0.5.10/dtflow/cli/split.py +138 -0
  12. dtflow-0.5.10/dtflow/eval.py +276 -0
  13. dtflow-0.5.10/dtflow/utils/text_parser.py +124 -0
  14. {dtflow-0.5.8 → dtflow-0.5.10}/pyproject.toml +7 -0
  15. dtflow-0.5.10/tests/test_eval.py +445 -0
  16. {dtflow-0.5.8 → dtflow-0.5.10}/.gitignore +0 -0
  17. {dtflow-0.5.8 → dtflow-0.5.10}/CHANGELOG.md +0 -0
  18. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/cli/__init__.py +0 -0
  19. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/cli/common.py +0 -0
  20. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/cli/io_ops.py +0 -0
  21. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/cli/lineage.py +0 -0
  22. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/cli/pipeline.py +0 -0
  23. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/cli/skill.py +0 -0
  24. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/cli/stats.py +0 -0
  25. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/cli/transform.py +0 -0
  26. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/cli/validate.py +0 -0
  27. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/converters.py +0 -0
  28. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/core.py +0 -0
  29. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/framework.py +0 -0
  30. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/lineage.py +0 -0
  31. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/parallel.py +0 -0
  32. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/pipeline.py +0 -0
  33. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/presets.py +0 -0
  34. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/schema.py +0 -0
  35. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/storage/__init__.py +0 -0
  36. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/storage/io.py +0 -0
  37. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/streaming.py +0 -0
  38. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/tokenizers.py +0 -0
  39. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/utils/__init__.py +0 -0
  40. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/utils/display.py +0 -0
  41. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/utils/field_path.py +0 -0
  42. {dtflow-0.5.8 → dtflow-0.5.10}/dtflow/utils/helpers.py +0 -0
  43. {dtflow-0.5.8 → dtflow-0.5.10}/tests/README.md +0 -0
  44. {dtflow-0.5.8 → dtflow-0.5.10}/tests/benchmark_io.py +0 -0
  45. {dtflow-0.5.8 → dtflow-0.5.10}/tests/benchmark_sharegpt.py +0 -0
  46. {dtflow-0.5.8 → dtflow-0.5.10}/tests/test_cli_benchmark.py +0 -0
  47. {dtflow-0.5.8 → dtflow-0.5.10}/tests/test_cli_clean.py +0 -0
  48. {dtflow-0.5.8 → dtflow-0.5.10}/tests/test_cli_sample.py +0 -0
  49. {dtflow-0.5.8 → dtflow-0.5.10}/tests/test_cli_stats.py +0 -0
  50. {dtflow-0.5.8 → dtflow-0.5.10}/tests/test_cli_transform.py +0 -0
  51. {dtflow-0.5.8 → dtflow-0.5.10}/tests/test_converters.py +0 -0
  52. {dtflow-0.5.8 → dtflow-0.5.10}/tests/test_field_path.py +0 -0
  53. {dtflow-0.5.8 → dtflow-0.5.10}/tests/test_framework.py +0 -0
  54. {dtflow-0.5.8 → dtflow-0.5.10}/tests/test_io.py +0 -0
  55. {dtflow-0.5.8 → dtflow-0.5.10}/tests/test_lineage.py +0 -0
  56. {dtflow-0.5.8 → dtflow-0.5.10}/tests/test_parallel.py +0 -0
  57. {dtflow-0.5.8 → dtflow-0.5.10}/tests/test_pipeline.py +0 -0
  58. {dtflow-0.5.8 → dtflow-0.5.10}/tests/test_schema.py +0 -0
  59. {dtflow-0.5.8 → dtflow-0.5.10}/tests/test_streaming.py +0 -0
  60. {dtflow-0.5.8 → dtflow-0.5.10}/tests/test_tokenizers.py +0 -0
  61. {dtflow-0.5.8 → dtflow-0.5.10}/tests/test_transformer.py +0 -0
{dtflow-0.5.8 → dtflow-0.5.10}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dtflow
- Version: 0.5.8
+ Version: 0.5.10
  Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
  Project-URL: Homepage, https://github.com/yourusername/DataTransformer
  Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
@@ -44,6 +44,7 @@ Requires-Dist: flake8>=3.9.0; extra == 'dev'
  Requires-Dist: huggingface-hub>=0.20.0; extra == 'dev'
  Requires-Dist: isort>=5.9.0; extra == 'dev'
  Requires-Dist: mypy>=0.910; extra == 'dev'
+ Requires-Dist: pandas>=1.3.0; extra == 'dev'
  Requires-Dist: pyarrow; extra == 'dev'
  Requires-Dist: pytest-cov>=2.12.0; extra == 'dev'
  Requires-Dist: pytest>=6.0.0; extra == 'dev'
@@ -57,10 +58,14 @@ Provides-Extra: docs
  Requires-Dist: myst-parser>=0.15.0; extra == 'docs'
  Requires-Dist: sphinx-rtd-theme>=0.5.0; extra == 'docs'
  Requires-Dist: sphinx>=4.0.0; extra == 'docs'
+ Provides-Extra: eval
+ Requires-Dist: pandas>=1.3.0; extra == 'eval'
+ Requires-Dist: scikit-learn>=0.24.0; extra == 'eval'
  Provides-Extra: full
  Requires-Dist: datasets>=2.0.0; extra == 'full'
  Requires-Dist: datasketch>=1.5.0; extra == 'full'
  Requires-Dist: huggingface-hub>=0.20.0; extra == 'full'
+ Requires-Dist: pandas>=1.3.0; extra == 'full'
  Requires-Dist: pyarrow; extra == 'full'
  Requires-Dist: rich>=10.0.0; extra == 'full'
  Requires-Dist: scikit-learn>=0.24.0; extra == 'full'
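Packaging note: the new `Provides-Extra: eval` block above means the evaluation dependencies (pandas, scikit-learn) are an optional extra and install with the standard extras syntax, e.g. `pip install "dtflow[eval]"`; the `full` extra now pulls in pandas as well.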
@@ -435,6 +440,13 @@ dt sample data.jsonl 1000 --by=messages.# # 按消息数量分层采样
  dt sample data.jsonl --where="category=tech" # 筛选后采样
  dt sample data.jsonl --where="messages.#>=2" # 多条件筛选

+ # 按行范围查看(Python 切片语法)
+ dt slice data.jsonl 10:20 # 第 10-19 行(0-based,左闭右开)
+ dt slice data.jsonl :100 # 前 100 行
+ dt slice data.jsonl 100: # 第 100 行到末尾
+ dt slice data.jsonl 10:20 -o sliced.jsonl # 保存到文件
+ dt slice data.jsonl 10:20 -f question,answer # 只显示指定字段
+
  # 数据转换 - 预设模式
  dt transform data.jsonl --preset=openai_chat
  dt transform data.jsonl --preset=alpaca
@@ -469,6 +481,9 @@ dt clean data.jsonl --max-len=messages[-1].content:500 # 最后一条消息最
  dt clean data.jsonl --keep=question,answer # 只保留这些字段
  dt clean data.jsonl --drop=metadata # 删除指定字段
  dt clean data.jsonl --strip # 去除字符串首尾空白
+ dt clean data.jsonl --min-tokens=content:10 # 最少 10 tokens
+ dt clean data.jsonl --max-tokens=content:1000 # 最多 1000 tokens
+ dt clean data.jsonl --min-tokens=text:50 -m gpt-4 # 指定分词器

  # 数据去重
  dt dedupe data.jsonl # 全量精确去重
@@ -477,6 +492,17 @@ dt dedupe data.jsonl --key=meta.id # 按嵌套字段去重
  dt dedupe data.jsonl --key=messages[0].content # 按第一条消息内容去重
  dt dedupe data.jsonl --key=text --similar=0.8 # 相似度去重

+ # 数据集切分
+ dt split data.jsonl --ratio=0.8 --seed=42 # 二分: train/test
+ dt split data.jsonl --ratio=0.7,0.15,0.15 # 三分: train/val/test
+ dt split data.jsonl --ratio=0.8 -o /tmp/output # 指定输出目录
+
+ # 训练框架导出
+ dt export data.jsonl --framework=llama-factory # 导出到 LLaMA-Factory
+ dt export data.jsonl -f swift -o ./swift_out # 导出到 ms-swift
+ dt export data.jsonl -f axolotl # 导出到 Axolotl
+ dt export data.jsonl -f llama-factory --check # 仅检查兼容性
+
  # 文件拼接
  dt concat a.jsonl b.jsonl -o merged.jsonl

@@ -522,6 +548,8 @@ CLI 命令中的字段参数支持嵌套路径语法,可访问深层嵌套的
  | `clean` | `--drop-empty=` | `--drop-empty=meta.source` |
  | `clean` | `--min-len=` | `--min-len=messages.#:2` |
  | `clean` | `--max-len=` | `--max-len=messages[-1].content:500` |
+ | `clean` | `--min-tokens=` | `--min-tokens=content:10` |
+ | `clean` | `--max-tokens=` | `--max-tokens=content:1000` |
  | `token-stats` | `--field=` | `--field=messages[-1].content` |
  | `diff` | `--key=` | `--key=meta.uuid` |
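For context on the nested-path syntax used throughout the table above (`messages[-1].content`, `messages.#`, `meta.uuid`): the resolver lives in `dtflow/utils/field_path.py`, which is unchanged in this release and not shown in the diff. The sketch below is a stand-alone illustration of how such paths can be resolved, not dtflow's actual implementation:

```python
import re
from typing import Any

def resolve_path(obj: Any, path: str, default: Any = "") -> Any:
    """Resolve a nested path like 'messages[-1].content' or 'messages.#'.

    Illustrative assumptions: '.' is dict access, '[n]' is list indexing,
    and a trailing '#' means "length of the value".
    """
    # "messages[-1].content" -> ["messages", "[-1]", "content"]
    tokens = re.findall(r"[^.\[\]]+|\[-?\d+\]", path)
    cur = obj
    for tok in tokens:
        if tok == "#":                                  # messages.# -> len(messages)
            return len(cur) if hasattr(cur, "__len__") else default
        if tok.startswith("["):                         # [-1] -> list index
            try:
                cur = cur[int(tok[1:-1])]
            except (IndexError, TypeError, ValueError):
                return default
        else:                                           # plain dict key
            if not isinstance(cur, dict) or tok not in cur:
                return default
            cur = cur[tok]
    return cur

row = {"messages": [{"role": "user", "content": "hi"}, {"role": "assistant", "content": "hello"}]}
assert resolve_path(row, "messages[-1].content") == "hello"
assert resolve_path(row, "messages.#") == 2
```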

{dtflow-0.5.8 → dtflow-0.5.10}/README.md
@@ -351,6 +351,13 @@ dt sample data.jsonl 1000 --by=messages.# # 按消息数量分层采样
  dt sample data.jsonl --where="category=tech" # 筛选后采样
  dt sample data.jsonl --where="messages.#>=2" # 多条件筛选

+ # 按行范围查看(Python 切片语法)
+ dt slice data.jsonl 10:20 # 第 10-19 行(0-based,左闭右开)
+ dt slice data.jsonl :100 # 前 100 行
+ dt slice data.jsonl 100: # 第 100 行到末尾
+ dt slice data.jsonl 10:20 -o sliced.jsonl # 保存到文件
+ dt slice data.jsonl 10:20 -f question,answer # 只显示指定字段
+
  # 数据转换 - 预设模式
  dt transform data.jsonl --preset=openai_chat
  dt transform data.jsonl --preset=alpaca
@@ -385,6 +392,9 @@ dt clean data.jsonl --max-len=messages[-1].content:500 # 最后一条消息最
  dt clean data.jsonl --keep=question,answer # 只保留这些字段
  dt clean data.jsonl --drop=metadata # 删除指定字段
  dt clean data.jsonl --strip # 去除字符串首尾空白
+ dt clean data.jsonl --min-tokens=content:10 # 最少 10 tokens
+ dt clean data.jsonl --max-tokens=content:1000 # 最多 1000 tokens
+ dt clean data.jsonl --min-tokens=text:50 -m gpt-4 # 指定分词器

  # 数据去重
  dt dedupe data.jsonl # 全量精确去重
@@ -393,6 +403,17 @@ dt dedupe data.jsonl --key=meta.id # 按嵌套字段去重
  dt dedupe data.jsonl --key=messages[0].content # 按第一条消息内容去重
  dt dedupe data.jsonl --key=text --similar=0.8 # 相似度去重

+ # 数据集切分
+ dt split data.jsonl --ratio=0.8 --seed=42 # 二分: train/test
+ dt split data.jsonl --ratio=0.7,0.15,0.15 # 三分: train/val/test
+ dt split data.jsonl --ratio=0.8 -o /tmp/output # 指定输出目录
+
+ # 训练框架导出
+ dt export data.jsonl --framework=llama-factory # 导出到 LLaMA-Factory
+ dt export data.jsonl -f swift -o ./swift_out # 导出到 ms-swift
+ dt export data.jsonl -f axolotl # 导出到 Axolotl
+ dt export data.jsonl -f llama-factory --check # 仅检查兼容性
+
  # 文件拼接
  dt concat a.jsonl b.jsonl -o merged.jsonl
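The `dt split` examples above show only the CLI surface; the implementation is in the new `dtflow/cli/split.py` (138 added lines, collapsed in this diff view). As a rough sketch of what seeded, ratio-based splitting usually involves — output file names and rounding here are assumptions, not dtflow's documented behavior:

```python
import json
import random
from pathlib import Path
from typing import List, Optional

def split_by_ratio(path: str, ratios: List[float], seed: Optional[int] = None,
                   out_dir: Optional[str] = None) -> None:
    """Shuffle records (reproducibly if seeded) and write one file per split.

    A single CLI ratio like 0.8 corresponds to [0.8, 0.2] here.
    """
    lines = Path(path).read_text(encoding="utf-8").splitlines()
    records = [json.loads(line) for line in lines if line.strip()]
    if seed is not None:
        random.Random(seed).shuffle(records)
    names = ["train", "test"] if len(ratios) == 2 else ["train", "val", "test"]
    out = Path(out_dir) if out_dir else Path(path).parent
    out.mkdir(parents=True, exist_ok=True)
    start = 0
    for i, (name, ratio) in enumerate(zip(names, ratios)):
        # The last split takes the remainder so rounding never drops records.
        end = len(records) if i == len(ratios) - 1 else start + round(len(records) * ratio)
        with open(out / f"{name}.jsonl", "w", encoding="utf-8") as f:
            for rec in records[start:end]:
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")
        start = end

# Roughly what `dt split data.jsonl --ratio=0.7,0.15,0.15 --seed=42` does:
# split_by_ratio("data.jsonl", [0.7, 0.15, 0.15], seed=42)
```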

@@ -438,6 +459,8 @@ CLI 命令中的字段参数支持嵌套路径语法,可访问深层嵌套的
  | `clean` | `--drop-empty=` | `--drop-empty=meta.source` |
  | `clean` | `--min-len=` | `--min-len=messages.#:2` |
  | `clean` | `--max-len=` | `--max-len=messages[-1].content:500` |
+ | `clean` | `--min-tokens=` | `--min-tokens=content:10` |
+ | `clean` | `--max-tokens=` | `--max-tokens=content:1000` |
  | `token-stats` | `--field=` | `--field=messages[-1].content` |
  | `diff` | `--key=` | `--key=meta.uuid` |
{dtflow-0.5.8 → dtflow-0.5.10}/dtflow/SKILL.md
@@ -3,13 +3,14 @@ name: dtflow
  description: >
  当用户需要处理 JSONL/CSV/Parquet/JSON/Arrow 数据文件时使用此 skill。
  提供 CLI 工具 `dt` 和 Python API `DataTransformer`。
- 适用场景:(1) 查看数据:dt sample/head/tail 采样预览,dt stats 统计字段分布;
+ 适用场景:(1) 查看数据:dt sample/head/tail 采样预览,dt slice 按行范围查看,dt stats 统计字段分布;
  (2) 数据清洗:dt clean 支持 --drop-empty/--min-len/--max-len 过滤行,--keep/--drop/--rename/--promote/--add-field/--fill/--reorder 操作字段;
  (3) 去重:dt dedupe 精确去重或 --similar 相似度去重;
  (4) 格式转换:dt transform 预设模板(openai_chat/alpaca/sharegpt/dpo)或自定义配置;
  (5) Schema 验证:dt validate --preset 验证数据格式;
- (6) ML 训练框架导出:export_for("llama-factory"/"swift"/"axolotl") 一键生成训练配置;
- (7) 大文件流式处理:load_stream() O(1) 内存处理 100GB+ 文件。
+ (6) 数据集切分:dt split 按比例切分 train/test/val;
+ (7) 训练框架导出:dt export / export_for() 一键导出到 llama-factory/swift/axolotl;
+ (8) 大文件流式处理:load_stream() O(1) 内存处理 100GB+ 文件。
  注意:此工具专注数据文件的结构化处理,不涉及 LLM 调用(LLM 调用请用 flexllm)。
  ---

@@ -162,7 +163,7 @@ dt sample data.jsonl 1000 --by=category # 分层采样
  dt sample data.jsonl 1000 --by=category --uniform # 均匀分层采样
  dt sample data.jsonl --where="messages.#>=2" # 条件筛选
  dt sample data.jsonl 10 -f input,output # 只显示指定字段
- dt sample data.jsonl 10 --raw # 输出原始 JSON(不截断)
+ dt sample data.jsonl 10 --pretty # 表格预览模式(默认原始 JSON)
  dt sample data.jsonl 100 --seed=42 -o out.jsonl # 固定随机种子并保存

  # 去重
@@ -185,8 +186,20 @@ dt clean data.jsonl --add-field=source:web # 添加常量字段
  dt clean data.jsonl --fill=label:unknown # 填充空值/缺失字段
  dt clean data.jsonl --reorder=id,text,label # 控制字段输出顺序
  dt clean data.jsonl --strip # 去除字符串首尾空白
+ dt clean data.jsonl --min-tokens=content:10 # 最少 10 tokens
+ dt clean data.jsonl --max-tokens=content:1000 -m gpt-4 # 最多 1000 tokens(指定分词器)
  dt clean data.jsonl --promote=meta.label --drop=meta --fill=label:unknown # 组合使用

+ # 数据集切分
+ dt split data.jsonl --ratio=0.8 --seed=42 # 二分: train/test
+ dt split data.jsonl --ratio=0.7,0.15,0.15 # 三分: train/val/test
+ dt split data.jsonl --ratio=0.8 -o /tmp/output # 指定输出目录
+
+ # 训练框架导出
+ dt export data.jsonl --framework=llama-factory # 导出到 LLaMA-Factory
+ dt export data.jsonl -f swift -o ./swift_out # 导出到 ms-swift
+ dt export data.jsonl -f llama-factory --check # 仅检查兼容性
+
  # 验证
  dt validate data.jsonl --preset=openai_chat # 预设: openai_chat/alpaca/dpo/sharegpt
  dt validate data.jsonl -p alpaca -f -o valid.jsonl # 过滤无效数据并保存
@@ -205,10 +218,13 @@ dt diff a.jsonl b.jsonl --key=id # 对比差异
  dt diff a.jsonl b.jsonl --key=id -o report.md # 输出对比报告

  # 查看数据
- dt head data.jsonl 10 # 前 10
+ dt head data.jsonl 10 # 前 10 条(默认原始 JSON)
  dt head data.jsonl 10 -f input,output # 只显示指定字段
- dt head data.jsonl 10 --raw # 输出完整 JSON(不截断)
+ dt head data.jsonl 10 --pretty # 表格预览模式
  dt tail data.jsonl 10 # 后 10 条
+ dt slice data.jsonl 10:20 # 第 10-19 行(Python 切片语法)
+ dt slice data.jsonl :100 # 前 100 行
+ dt slice data.jsonl 100: # 第 100 行到末尾

  # 其他
  dt run pipeline.yaml # Pipeline 执行
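`dt slice` reuses Python's slice syntax for its range argument. The parsing itself lives in the new `slice_data` helper added to `dtflow/cli/sample.py` in this release (not expanded here); a minimal illustration of how a `start:end` string maps onto Python list slicing:

```python
from typing import Optional, Tuple

def parse_range(range_str: str) -> Tuple[Optional[int], Optional[int]]:
    """Turn '10:20', ':100', '100:' or '-10:' into (start, end) for slicing."""
    start_s, _, end_s = range_str.partition(":")
    return (int(start_s) if start_s else None, int(end_s) if end_s else None)

rows = list(range(200))                                   # stand-in for loaded records
assert rows[slice(*parse_range("10:20"))] == rows[10:20]  # lines 10-19, half-open
assert rows[slice(*parse_range(":100"))] == rows[:100]    # first 100 lines
assert rows[slice(*parse_range("-10:"))] == rows[-10:]    # last 10 lines
```

The 0-based, half-open semantics match the `第 10-19 行(0-based,左闭右开)` comment in the README examples above.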
{dtflow-0.5.8 → dtflow-0.5.10}/dtflow/__init__.py
@@ -60,7 +60,7 @@ from .tokenizers import (
  token_stats,
  )

- __version__ = "0.5.8"
+ __version__ = "0.5.10"

  __all__ = [
  # core
{dtflow-0.5.8 → dtflow-0.5.10}/dtflow/__main__.py
@@ -18,6 +18,8 @@ Commands:
  clean 数据清洗
  run 执行 Pipeline 配置文件
  history 显示数据血缘历史
+ split 分割数据集
+ export 导出到训练框架
  validate 使用 Schema 验证数据格式
  logs 日志查看工具使用说明
  install-skill 安装 dtflow skill 到 Claude Code
@@ -33,12 +35,16 @@ from .cli.commands import clean as _clean
  from .cli.commands import concat as _concat
  from .cli.commands import dedupe as _dedupe
  from .cli.commands import diff as _diff
+ from .cli.commands import eval as _eval
+ from .cli.commands import export as _export
  from .cli.commands import head as _head
  from .cli.commands import history as _history
  from .cli.commands import install_skill as _install_skill
  from .cli.commands import run as _run
  from .cli.commands import sample as _sample
  from .cli.commands import skill_status as _skill_status
+ from .cli.commands import slice_data as _slice_data
+ from .cli.commands import split as _split
  from .cli.commands import stats as _stats
  from .cli.commands import tail as _tail
  from .cli.commands import token_stats as _token_stats
@@ -69,12 +75,12 @@ def sample(
  by: Optional[str] = typer.Option(None, "--by", help="分层采样字段"),
  uniform: bool = typer.Option(False, "--uniform", help="均匀采样模式"),
  fields: Optional[str] = typer.Option(None, "--fields", "-f", help="只显示指定字段(逗号分隔)"),
- raw: bool = typer.Option(False, "--raw", "-r", help="输出原始 JSON(不截断)"),
+ pretty: bool = typer.Option(False, "--pretty", "-R", help="使用表格预览(默认原始 JSON)"),
  where: Optional[List[str]] = typer.Option(None, "--where", "-w", help="筛选条件 (可多次使用)"),
  ):
  """从数据文件中采样指定数量的数据"""
  actual_num = num_arg if num_arg is not None else num
- _sample(filename, actual_num, type, output, seed, by, uniform, fields, raw, where)
+ _sample(filename, actual_num, type, output, seed, by, uniform, fields, not pretty, where)


  @app.command()
@@ -84,12 +90,12 @@ def head(
  num: int = typer.Option(10, "--num", "-n", help="显示数量", show_default=True),
  output: Optional[str] = typer.Option(None, "--output", "-o", help="输出文件路径"),
  fields: Optional[str] = typer.Option(None, "--fields", "-f", help="只显示指定字段"),
- raw: bool = typer.Option(False, "--raw", "-r", help="输出原始 JSON(不截断)"),
+ pretty: bool = typer.Option(False, "--pretty", "-R", help="使用表格预览(默认原始 JSON)"),
  ):
  """显示文件的前 N 条数据"""
  # 位置参数优先于选项参数
  actual_num = num_arg if num_arg is not None else num
- _head(filename, actual_num, output, fields, raw)
+ _head(filename, actual_num, output, fields, not pretty)


  @app.command()
@@ -99,12 +105,31 @@ def tail(
  num: int = typer.Option(10, "--num", "-n", help="显示数量", show_default=True),
  output: Optional[str] = typer.Option(None, "--output", "-o", help="输出文件路径"),
  fields: Optional[str] = typer.Option(None, "--fields", "-f", help="只显示指定字段"),
- raw: bool = typer.Option(False, "--raw", "-r", help="输出原始 JSON(不截断)"),
+ pretty: bool = typer.Option(False, "--pretty", "-R", help="使用表格预览(默认原始 JSON)"),
  ):
  """显示文件的后 N 条数据"""
  # 位置参数优先于选项参数
  actual_num = num_arg if num_arg is not None else num
- _tail(filename, actual_num, output, fields, raw)
+ _tail(filename, actual_num, output, fields, not pretty)
+
+
+ @app.command("slice")
+ def slice_cmd(
+ filename: str = typer.Argument(..., help="输入文件路径"),
+ range_str: str = typer.Argument(..., help="行号范围 (start:end),如 10:20、:100、100:、-10:"),
+ output: Optional[str] = typer.Option(None, "--output", "-o", help="输出文件路径"),
+ fields: Optional[str] = typer.Option(None, "--fields", "-f", help="只显示指定字段"),
+ pretty: bool = typer.Option(False, "--pretty", "-R", help="使用表格预览(默认原始 JSON)"),
+ ):
+ """按行号范围查看数据(Python 切片语法)
+
+ 示例:
+ dt slice data.jsonl 10:20 第 10-19 行
+ dt slice data.jsonl :100 前 100 行
+ dt slice data.jsonl 100: 第 100 行到末尾
+ dt slice data.jsonl -10: 最后 10 行
+ """
+ _slice_data(filename, range_str, output, fields, not pretty)


  # ============ 数据转换命令 ============
@@ -174,6 +199,13 @@ def clean(
  None, "--reorder", help="控制字段顺序 (field1,field2,...)"
  ),
  strip: bool = typer.Option(False, "--strip", help="去除字符串首尾空白"),
+ min_tokens: Optional[str] = typer.Option(
+ None, "--min-tokens", help="最小 token 数过滤 (字段:数量)"
+ ),
+ max_tokens: Optional[str] = typer.Option(
+ None, "--max-tokens", help="最大 token 数过滤 (字段:数量)"
+ ),
+ model: str = typer.Option("cl100k_base", "--model", "-m", help="分词器模型 (默认 cl100k_base)"),
  output: Optional[str] = typer.Option(None, "--output", "-o", help="输出文件路径"),
  ):
  """数据清洗"""
@@ -190,6 +222,9 @@ def clean(
  fill,
  reorder,
  strip,
+ min_tokens,
+ max_tokens,
+ model,
  output,
  )

@@ -249,6 +284,71 @@ def history(
  _history(filename, json)


+ # ============ 切分与导出命令 ============
+
+
+ @app.command()
+ def split(
+ filename: str = typer.Argument(..., help="输入文件路径"),
+ ratio: str = typer.Option("0.8", "--ratio", "-r", help="分割比例,如 0.8 或 0.7,0.15,0.15"),
+ seed: Optional[int] = typer.Option(None, "--seed", help="随机种子"),
+ output: Optional[str] = typer.Option(None, "--output", "-o", help="输出目录(默认同目录)"),
+ ):
+ """分割数据集为 train/test (或 train/val/test)"""
+ _split(filename, ratio, seed, output)
+
+
+ @app.command()
+ def export(
+ filename: str = typer.Argument(..., help="输入文件路径"),
+ framework: str = typer.Option(
+ ..., "--framework", "-f", help="目标框架: llama-factory, swift, axolotl"
+ ),
+ output: Optional[str] = typer.Option(None, "--output", "-o", help="输出目录"),
+ name: Optional[str] = typer.Option(None, "--name", "-n", help="数据集名称"),
+ check: bool = typer.Option(False, "--check", help="仅检查兼容性,不导出"),
+ ):
+ """导出数据到训练框架 (LLaMA-Factory, ms-swift, Axolotl)"""
+ _export(filename, framework, output, name, check)
+
+
+ # ============ 评估命令 ============
+
+
+ @app.command()
+ def eval(
+ result_file: str = typer.Argument(..., help="模型输出的 .jsonl 文件路径"),
+ source: Optional[str] = typer.Option(
+ None, "--source", "-s", help="原始输入文件,按行号对齐合并"
+ ),
+ response_col: str = typer.Option("content", "--response-col", "-r", help="模型响应字段名"),
+ label_col: Optional[str] = typer.Option(
+ None, "--label-col", "-l", help="标签字段名(不指定时自动检测)"
+ ),
+ extract: str = typer.Option(
+ "direct",
+ "--extract",
+ "-e",
+ help="管道式提取规则,算子: direct/tag:X/json_key:X/index:N/line:N/lines/regex:X",
+ ),
+ sep: Optional[str] = typer.Option(None, "--sep", help="配合 index 算子使用的分隔符"),
+ mapping: Optional[str] = typer.Option(None, "--mapping", "-m", help="值映射 (k1:v1,k2:v2)"),
+ output_dir: str = typer.Option("record", "--output-dir", "-o", help="指标报告输出目录"),
+ ):
+ """对模型输出进行解析和指标评估
+
+ 两阶段解析:自动清洗(去 think 标签、提取代码块)+ 管道式提取。
+
+ 示例:
+ dt eval result.jsonl --label-col=label
+ dt eval result.jsonl --extract="tag:标签" --mapping="是:1,否:0"
+ dt eval result.jsonl --source=input.jsonl --response-col=api_output.content
+ dt eval result.jsonl --extract="json_key:result | index:0" --sep=","
+ dt eval result.jsonl --extract="lines | index:1" --sep="|"
+ """
+ _eval(result_file, source, response_col, label_col, extract, sep, mapping, output_dir)
+
+

  # ============ 验证命令 ============

{dtflow-0.5.8 → dtflow-0.5.10}/dtflow/cli/clean.py
@@ -138,6 +138,9 @@ def clean(
  fill: Optional[str] = None,
  reorder: Optional[str] = None,
  strip: bool = False,
+ min_tokens: Optional[str] = None,
+ max_tokens: Optional[str] = None,
+ model: str = "cl100k_base",
  output: Optional[str] = None,
  ) -> None:
  """
@@ -173,6 +176,9 @@ def clean(
  dt clean data.jsonl --fill=label:unknown # 填充空值
  dt clean data.jsonl --reorder=id,text,label # 控制字段顺序
  dt clean data.jsonl --strip # 去除字符串首尾空白
+ dt clean data.jsonl --min-tokens=content:10 # content 字段最少 10 tokens
+ dt clean data.jsonl --max-tokens=content:1000 # content 字段最多 1000 tokens
+ dt clean data.jsonl --min-tokens=text:50 --model=gpt-4 # 使用 gpt-4 分词器
  """
  filepath = Path(filename)

@@ -186,6 +192,13 @@
  # 解析参数
  min_len_field, min_len_value = _parse_len_param(min_len) if min_len else (None, None)
  max_len_field, max_len_value = _parse_len_param(max_len) if max_len else (None, None)
+ min_tokens_field, min_tokens_value = (
+ _parse_len_param(min_tokens) if min_tokens else (None, None)
+ )
+ max_tokens_field, max_tokens_value = (
+ _parse_len_param(max_tokens) if max_tokens else (None, None)
+ )
+ token_model = model
  keep_fields = _parse_field_list(keep) if keep else None
  drop_fields_set = set(_parse_field_list(drop)) if drop else None
  rename_map = _parse_rename_param(rename) if rename else None
@@ -229,6 +242,14 @@
  print(f"🔄 填充空值: {fill_desc}")
  if reorder_fields:
  print(f"🔄 字段排序: {', '.join(reorder_fields)}")
+ if min_tokens_field:
+ print(
+ f"🔄 过滤 {min_tokens_field} tokens < {min_tokens_value} 的记录 (model={token_model})..."
+ )
+ if max_tokens_field:
+ print(
+ f"🔄 过滤 {max_tokens_field} tokens > {max_tokens_value} 的记录 (model={token_model})..."
+ )

  output_path = output or str(filepath)

@@ -271,6 +292,11 @@
  add_field_map=add_field_map,
  fill_map=fill_map,
  reorder_fields=reorder_fields,
+ min_tokens_field=min_tokens_field,
+ min_tokens_value=min_tokens_value,
+ max_tokens_field=max_tokens_field,
+ max_tokens_value=max_tokens_value,
+ token_model=token_model,
  )

  # 如果使用了临时文件,移动到目标位置
@@ -316,6 +342,11 @@
  add_field_map=add_field_map,
  fill_map=fill_map,
  reorder_fields=reorder_fields,
+ min_tokens_field=min_tokens_field,
+ min_tokens_value=min_tokens_value,
+ max_tokens_field=max_tokens_field,
+ max_tokens_value=max_tokens_value,
+ token_model=token_model,
  )

  # 保存结果
@@ -458,6 +489,11 @@ def _clean_data_single_pass(
  add_field_map: Optional[Dict[str, str]] = None,
  fill_map: Optional[Dict[str, str]] = None,
  reorder_fields: Optional[List[str]] = None,
+ min_tokens_field: Optional[str] = None,
+ min_tokens_value: Optional[int] = None,
+ max_tokens_field: Optional[str] = None,
+ max_tokens_value: Optional[int] = None,
+ token_model: str = "cl100k_base",
  ) -> tuple:
  """
  单次遍历执行所有清洗操作。
@@ -476,11 +512,18 @@
  Returns:
  (清洗后的数据, 统计信息列表)
  """
+ # 延迟导入 count_tokens(仅在需要时)
+ _count_tokens = None
+ if min_tokens_field is not None or max_tokens_field is not None:
+ from ..tokenizers import count_tokens as _count_tokens
+
  result = []
  stats = {
  "drop_empty": 0,
  "min_len": 0,
  "max_len": 0,
+ "min_tokens": 0,
+ "max_tokens": 0,
  }

  # 预先计算 keep_fields 集合(如果有的话)
@@ -516,6 +559,20 @@
  stats["max_len"] += 1
  continue

+ # 4.5 最小 token 数过滤
+ if min_tokens_field is not None:
+ value = get_field_with_spec(item, min_tokens_field, default="")
+ if _count_tokens(str(value), model=token_model) < min_tokens_value:
+ stats["min_tokens"] += 1
+ continue
+
+ # 4.6 最大 token 数过滤
+ if max_tokens_field is not None:
+ value = get_field_with_spec(item, max_tokens_field, default="")
+ if _count_tokens(str(value), model=token_model) > max_tokens_value:
+ stats["max_tokens"] += 1
+ continue
+
  # 5. 提升嵌套字段(在 drop 之前,否则父字段被删后无法提取)
  if promote_list is not None:
  item = _promote_fields(item, promote_list)
@@ -554,6 +611,10 @@
  step_stats.append(f"min-len: -{stats['min_len']}")
  if stats["max_len"] > 0:
  step_stats.append(f"max-len: -{stats['max_len']}")
+ if stats["min_tokens"] > 0:
+ step_stats.append(f"min-tokens: -{stats['min_tokens']}")
+ if stats["max_tokens"] > 0:
+ step_stats.append(f"max-tokens: -{stats['max_tokens']}")
  if keep_fields:
  step_stats.append(f"keep: {len(keep_fields)} 字段")
  if drop_fields:
@@ -588,6 +649,11 @@ def _clean_streaming(
  add_field_map: Optional[Dict[str, str]] = None,
  fill_map: Optional[Dict[str, str]] = None,
  reorder_fields: Optional[List[str]] = None,
+ min_tokens_field: Optional[str] = None,
+ min_tokens_value: Optional[int] = None,
+ max_tokens_field: Optional[str] = None,
+ max_tokens_value: Optional[int] = None,
+ token_model: str = "cl100k_base",
  ) -> int:
  """
  流式清洗数据。
@@ -596,6 +662,11 @@
  处理后的数据条数
  """

+ # 延迟导入 count_tokens(仅在需要时)
+ _count_tokens = None
+ if min_tokens_field is not None or max_tokens_field is not None:
+ from ..tokenizers import count_tokens as _count_tokens
+
  def clean_filter(item: Dict) -> bool:
  """过滤函数:返回 True 保留,False 过滤(支持嵌套路径)"""
  # 空值过滤
@@ -618,6 +689,18 @@
  if _get_value_len(get_field_with_spec(item, max_len_field, default="")) > max_len_value:
  return False

+ # 最小 token 数过滤
+ if min_tokens_field is not None:
+ value = get_field_with_spec(item, min_tokens_field, default="")
+ if _count_tokens(str(value), model=token_model) < min_tokens_value:
+ return False
+
+ # 最大 token 数过滤
+ if max_tokens_field is not None:
+ value = get_field_with_spec(item, max_tokens_field, default="")
+ if _count_tokens(str(value), model=token_model) > max_tokens_value:
+ return False
+
  return True

  def clean_transform(item: Dict) -> Dict:
@@ -644,7 +727,13 @@
  )

  # 执行过滤
- if empty_fields is not None or min_len_field is not None or max_len_field is not None:
+ if (
+ empty_fields is not None
+ or min_len_field is not None
+ or max_len_field is not None
+ or min_tokens_field is not None
+ or max_tokens_field is not None
+ ):
  st = st.filter(clean_filter)

  # 提升嵌套字段(在 drop 之前,否则父字段被删后无法提取)
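The new `--min-tokens` / `--max-tokens` filters delegate counting to `dtflow.tokenizers.count_tokens`, which predates this release and is not part of the diff. The default model name `cl100k_base` matches a tiktoken encoding name, so the sketch below assumes a tiktoken-based counter; it illustrates the filtering logic and is not dtflow's actual code:

```python
import tiktoken

def count_tokens(text: str, model: str = "cl100k_base") -> int:
    """Count tokens, accepting either an encoding name or a model name."""
    try:
        enc = tiktoken.get_encoding(model)            # e.g. "cl100k_base"
    except (KeyError, ValueError):
        enc = tiktoken.encoding_for_model(model)      # e.g. "gpt-4"
    return len(enc.encode(text))

def within_token_bounds(item: dict, field: str, min_tokens: int = 0,
                        max_tokens: int = 10**9, model: str = "cl100k_base") -> bool:
    """Keep a record only if the field's token count is within [min_tokens, max_tokens]."""
    n = count_tokens(str(item.get(field, "")), model=model)
    return min_tokens <= n <= max_tokens

rows = [{"content": "hi"}, {"content": "a much longer answer " * 50}]
kept = [r for r in rows if within_token_bounds(r, "content", min_tokens=10)]  # drops the short row
```

Unlike this simplified sketch, the real filter resolves nested field paths via `get_field_with_spec`, as the hunks above show.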
{dtflow-0.5.8 → dtflow-0.5.10}/dtflow/cli/commands.py
@@ -16,6 +16,12 @@ CLI 命令统一导出入口
  # 清洗命令
  from .clean import clean, dedupe

+ # 评估命令
+ from .eval import eval
+
+ # 导出命令
+ from .export import export
+
  # IO 操作命令
  from .io_ops import concat, diff

@@ -24,11 +30,14 @@ from .lineage import history

  # Pipeline 命令
  from .pipeline import run
- from .sample import head, sample, tail
+ from .sample import head, sample, slice_data, tail

  # Skill 命令
  from .skill import install_skill, skill_status, uninstall_skill

+ # 切分命令
+ from .split import split
+
  # 统计命令
  from .stats import stats, token_stats

@@ -43,6 +52,7 @@ __all__ = [
  "sample",
  "head",
  "tail",
+ "slice_data",
  # 转换
  "transform",
  # 统计
@@ -60,6 +70,12 @@ __all__ = [
  "history",
  # 验证
  "validate",
+ # 切分
+ "split",
+ # 导出
+ "export",
+ # 评估
+ "eval",
  # Skill
  "install_skill",
  "uninstall_skill",