dtflow 0.3.0__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {dtflow-0.3.0 → dtflow-0.3.2}/PKG-INFO +153 -7
  2. {dtflow-0.3.0 → dtflow-0.3.2}/README.md +140 -1
  3. dtflow-0.3.2/dtflow/__init__.py +94 -0
  4. dtflow-0.3.2/dtflow/__main__.py +316 -0
  5. dtflow-0.3.2/dtflow/cli/__init__.py +33 -0
  6. {dtflow-0.3.0 → dtflow-0.3.2}/dtflow/cli/commands.py +1112 -113
  7. {dtflow-0.3.0 → dtflow-0.3.2}/dtflow/converters.py +39 -23
  8. {dtflow-0.3.0 → dtflow-0.3.2}/dtflow/core.py +140 -72
  9. dtflow-0.3.2/dtflow/lineage.py +410 -0
  10. {dtflow-0.3.0 → dtflow-0.3.2}/dtflow/mcp/__init__.py +1 -0
  11. {dtflow-0.3.0 → dtflow-0.3.2}/dtflow/mcp/__main__.py +2 -0
  12. {dtflow-0.3.0 → dtflow-0.3.2}/dtflow/mcp/cli.py +35 -17
  13. {dtflow-0.3.0 → dtflow-0.3.2}/dtflow/mcp/docs.py +0 -5
  14. dtflow-0.3.2/dtflow/pipeline.py +460 -0
  15. {dtflow-0.3.0 → dtflow-0.3.2}/dtflow/presets.py +24 -22
  16. {dtflow-0.3.0 → dtflow-0.3.2}/dtflow/storage/__init__.py +11 -10
  17. dtflow-0.3.2/dtflow/storage/io.py +710 -0
  18. dtflow-0.3.2/dtflow/streaming.py +656 -0
  19. {dtflow-0.3.0 → dtflow-0.3.2}/dtflow/tokenizers.py +212 -57
  20. {dtflow-0.3.0 → dtflow-0.3.2}/dtflow/utils/__init__.py +2 -1
  21. {dtflow-0.3.0 → dtflow-0.3.2}/dtflow/utils/display.py +28 -27
  22. {dtflow-0.3.0 → dtflow-0.3.2}/pyproject.toml +17 -5
  23. dtflow-0.3.2/tests/benchmark_io.py +229 -0
  24. dtflow-0.3.2/tests/test_streaming.py +281 -0
  25. dtflow-0.3.0/dtflow/__init__.py +0 -67
  26. dtflow-0.3.0/dtflow/__main__.py +0 -254
  27. dtflow-0.3.0/dtflow/cli/__init__.py +0 -6
  28. dtflow-0.3.0/dtflow/storage/io.py +0 -695
  29. {dtflow-0.3.0 → dtflow-0.3.2}/.gitignore +0 -0
  30. {dtflow-0.3.0 → dtflow-0.3.2}/dtflow/mcp/server.py +0 -0
  31. {dtflow-0.3.0 → dtflow-0.3.2}/tests/test_converters.py +0 -0
  32. {dtflow-0.3.0 → dtflow-0.3.2}/tests/test_tokenizers.py +0 -0
  33. {dtflow-0.3.0 → dtflow-0.3.2}/tests/test_transformer.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dtflow
- Version: 0.3.0
+ Version: 0.3.2
  Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
  Project-URL: Homepage, https://github.com/yourusername/DataTransformer
  Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
@@ -27,11 +27,12 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Classifier: Topic :: Text Processing
  Requires-Python: >=3.8
- Requires-Dist: fire>=0.4.0
  Requires-Dist: numpy>=1.20.0
  Requires-Dist: orjson>=3.9.0
+ Requires-Dist: polars>=0.20.0
  Requires-Dist: pyyaml>=5.4.0
- Requires-Dist: tqdm>=4.60.0
+ Requires-Dist: rich>=10.0.0
+ Requires-Dist: typer>=0.9.0
  Provides-Extra: converters
  Requires-Dist: datasets>=2.0.0; extra == 'converters'
  Provides-Extra: dev
@@ -42,7 +43,6 @@ Requires-Dist: mypy>=0.910; extra == 'dev'
  Requires-Dist: pytest-cov>=2.12.0; extra == 'dev'
  Requires-Dist: pytest>=6.0.0; extra == 'dev'
  Provides-Extra: display
- Requires-Dist: rich>=10.0.0; extra == 'display'
  Provides-Extra: docs
  Requires-Dist: myst-parser>=0.15.0; extra == 'docs'
  Requires-Dist: sphinx-rtd-theme>=0.5.0; extra == 'docs'
@@ -50,21 +50,28 @@ Requires-Dist: sphinx>=4.0.0; extra == 'docs'
  Provides-Extra: full
  Requires-Dist: datasets>=2.0.0; extra == 'full'
  Requires-Dist: datasketch>=1.5.0; extra == 'full'
- Requires-Dist: pandas>=1.3.0; extra == 'full'
+ Requires-Dist: huggingface-hub>=0.20.0; extra == 'full'
  Requires-Dist: pyarrow; extra == 'full'
  Requires-Dist: rich>=10.0.0; extra == 'full'
  Requires-Dist: scikit-learn>=0.24.0; extra == 'full'
  Requires-Dist: tiktoken>=0.5.0; extra == 'full'
+ Requires-Dist: tokenizers>=0.15.0; extra == 'full'
+ Requires-Dist: toolong>=1.5.0; extra == 'full'
+ Provides-Extra: logs
+ Requires-Dist: toolong>=1.5.0; extra == 'logs'
  Provides-Extra: mcp
  Requires-Dist: mcp>=1.0.0; extra == 'mcp'
  Provides-Extra: similarity
  Requires-Dist: datasketch>=1.5.0; extra == 'similarity'
  Requires-Dist: scikit-learn>=0.24.0; extra == 'similarity'
  Provides-Extra: storage
- Requires-Dist: pandas>=1.3.0; extra == 'storage'
  Requires-Dist: pyarrow; extra == 'storage'
  Provides-Extra: tokenizers
  Requires-Dist: tiktoken>=0.5.0; extra == 'tokenizers'
+ Provides-Extra: tokenizers-hf
+ Requires-Dist: huggingface-hub>=0.20.0; extra == 'tokenizers-hf'
+ Requires-Dist: tiktoken>=0.5.0; extra == 'tokenizers-hf'
+ Requires-Dist: tokenizers>=0.15.0; extra == 'tokenizers-hf'
  Description-Content-Type: text/markdown

  # dtflow
@@ -101,7 +108,7 @@ dt = DataTransformer.load("data.jsonl")
  ### Loading and saving data

  ```python
- # Supports JSONL, JSON, CSV, Parquet
+ # Supports JSONL, JSON, CSV, Parquet, Arrow (uses the Polars engine, 3x faster than Pandas)
  dt = DataTransformer.load("data.jsonl")
  dt.save("output.jsonl")

@@ -293,6 +300,7 @@ dt.shuffle(seed=42)
  # Sample data
  dt sample data.jsonl --num=10
  dt sample data.csv --num=100 --sample_type=head
+ dt sample data.jsonl 1000 --by=category   # stratified sampling

  # Transform data - preset mode
  dt transform data.jsonl --preset=openai_chat
@@ -303,6 +311,18 @@ dt transform data.jsonl  # first run generates the config file
  # edit .dt/data.py, then run again
  dt transform data.jsonl --num=100   # run the transformation

+ # Run a pipeline (reproducible processing workflow)
+ dt run pipeline.yaml
+ dt run pipeline.yaml --input=new_data.jsonl --output=result.jsonl
+
+ # Token statistics
+ dt token-stats data.jsonl --field=messages --model=gpt-4
+ dt token-stats data.jsonl --field=text --detailed
+
+ # Compare datasets
+ dt diff v1/train.jsonl v2/train.jsonl
+ dt diff a.jsonl b.jsonl --key=id
+
  # Data cleaning
  dt clean data.jsonl --drop-empty             # drop records with any empty value
  dt clean data.jsonl --drop-empty=text,answer # drop records whose specified fields are empty
@@ -325,6 +345,132 @@ dt concat a.jsonl b.jsonl -o merged.jsonl
  dt stats data.jsonl
  ```

+ ### Pipeline configuration
+
+ Define reproducible data-processing workflows with a YAML config file:
+
+ ```yaml
+ # pipeline.yaml
+ version: "1.0"
+ seed: 42
+ input: raw_data.jsonl
+ output: processed.jsonl
+
+ steps:
+   - type: filter
+     condition: "score > 0.5"
+
+   - type: filter
+     condition: "len(text) > 10"
+
+   - type: transform
+     preset: openai_chat
+     params:
+       user_field: q
+       assistant_field: a
+
+   - type: dedupe
+     key: text
+ ```
+
+ Supported step types:
+
+ | Step | Parameters | Description |
+ |------|------------|-------------|
+ | `filter` | `condition` | Conditional filtering: `score > 0.5`, `len(text) > 10`, `field is not empty` |
+ | `transform` | `preset`, `params` | Format conversion using a preset template |
+ | `dedupe` | `key`, `similar` | Deduplication, exact or similarity-based |
+ | `sample` | `num`, `seed` | Random sampling |
+ | `head` | `num` | Keep the first N records |
+ | `tail` | `num` | Keep the last N records |
+ | `shuffle` | `seed` | Shuffle order |
+ | `split` | `ratio`, `seed` | Dataset splitting |
+
+ Run a pipeline:
+
+ ```bash
+ dt run pipeline.yaml
+ dt run pipeline.yaml --input=new_data.jsonl   # override the input file
+ ```
+
+ ### Data lineage tracking
+
+ Record the full history of data processing, enabling reproducibility and issue tracing:
+
+ ```python
+ # Enable lineage tracking
+ dt = DataTransformer.load("raw.jsonl", track_lineage=True)
+
+ # Process the data as usual
+ result = (dt
+     .filter(lambda x: x.score > 0.5)
+     .transform(lambda x: {"q": x.q, "a": x.a})
+     .dedupe("q")
+ )
+
+ # Record lineage when saving
+ result.save("processed.jsonl", lineage=True)
+ # Automatically writes processed.jsonl.lineage.json
+ ```
+
+ View the lineage history:
+
+ ```bash
+ dt history processed.jsonl
+ # Output:
+ # 📊 Lineage report: processed.jsonl
+ # └─ Version 1
+ #    Source: raw.jsonl
+ #    Operation chain:
+ #    ├─ filter: 1000 → 800
+ #    ├─ transform: 800 → 800
+ #    └─ dedupe: 800 → 750
+ #    Output count: 750
+
+ dt history processed.jsonl --json   # JSON output
+ ```
+
+ ### Streaming for large files
+
+ A streaming interface designed for very large files, with O(1) memory usage and support for JSONL, CSV, Parquet, and Arrow:
+
+ ```python
+ from dtflow import load_stream, load_sharded
+
+ # Stream-load and process (even a 100GB file uses only constant memory)
+ (load_stream("huge_100gb.jsonl")
+     .filter(lambda x: x["score"] > 0.5)
+     .transform(lambda x: {"text": x["content"]})
+     .save("output.jsonl"))
+
+ # Cross-format conversion (CSV → Parquet)
+ (load_stream("data.csv")
+     .filter(lambda x: x["score"] > 0.5)
+     .save("output.parquet"))
+
+ # Load sharded files (multiple formats supported)
+ (load_sharded("data/train_*.parquet")
+     .filter(lambda x: len(x["text"]) > 10)
+     .save("merged.jsonl"))
+
+ # Sharded saving
+ (load_stream("huge.jsonl")
+     .transform(lambda x: {"q": x["question"], "a": x["answer"]})
+     .save_sharded("output/", shard_size=100000))
+ # Produces: output/part-00000.jsonl, output/part-00001.jsonl, ...
+
+ # Batch processing (useful when calling an API in batches)
+ for batch in load_stream("data.jsonl").batch(1000):
+     results = call_api(batch)  # process one batch at a time
+ ```
+
+ Highlights:
+ - **Lazy execution**: filter/transform do not run immediately; they are only triggered by save/collect
+ - **O(1) memory**: memory usage stays constant regardless of file size (on the read side)
+ - **Multi-format**: JSONL, CSV, Parquet, and Arrow can all be streamed
+ - **Cross-format conversion**: read CSV and save directly to Parquet, etc.
+ - **Sharding**: load multiple shards via glob patterns and process them as one stream
+
  ## Error handling

  ```python
@@ -32,7 +32,7 @@ dt = DataTransformer.load("data.jsonl")
  ### Loading and saving data

  ```python
- # Supports JSONL, JSON, CSV, Parquet
+ # Supports JSONL, JSON, CSV, Parquet, Arrow (uses the Polars engine, 3x faster than Pandas)
  dt = DataTransformer.load("data.jsonl")
  dt.save("output.jsonl")

@@ -224,6 +224,7 @@ dt.shuffle(seed=42)
  # Sample data
  dt sample data.jsonl --num=10
  dt sample data.csv --num=100 --sample_type=head
+ dt sample data.jsonl 1000 --by=category   # stratified sampling

  # Transform data - preset mode
  dt transform data.jsonl --preset=openai_chat
@@ -234,6 +235,18 @@ dt transform data.jsonl  # first run generates the config file
  # edit .dt/data.py, then run again
  dt transform data.jsonl --num=100   # run the transformation

+ # Run a pipeline (reproducible processing workflow)
+ dt run pipeline.yaml
+ dt run pipeline.yaml --input=new_data.jsonl --output=result.jsonl
+
+ # Token statistics
+ dt token-stats data.jsonl --field=messages --model=gpt-4
+ dt token-stats data.jsonl --field=text --detailed
+
+ # Compare datasets
+ dt diff v1/train.jsonl v2/train.jsonl
+ dt diff a.jsonl b.jsonl --key=id
+
  # Data cleaning
  dt clean data.jsonl --drop-empty             # drop records with any empty value
  dt clean data.jsonl --drop-empty=text,answer # drop records whose specified fields are empty
@@ -256,6 +269,132 @@ dt concat a.jsonl b.jsonl -o merged.jsonl
  dt stats data.jsonl
  ```

+ ### Pipeline configuration
+
+ Define reproducible data-processing workflows with a YAML config file:
+
+ ```yaml
+ # pipeline.yaml
+ version: "1.0"
+ seed: 42
+ input: raw_data.jsonl
+ output: processed.jsonl
+
+ steps:
+   - type: filter
+     condition: "score > 0.5"
+
+   - type: filter
+     condition: "len(text) > 10"
+
+   - type: transform
+     preset: openai_chat
+     params:
+       user_field: q
+       assistant_field: a
+
+   - type: dedupe
+     key: text
+ ```
+
+ Supported step types:
+
+ | Step | Parameters | Description |
+ |------|------------|-------------|
+ | `filter` | `condition` | Conditional filtering: `score > 0.5`, `len(text) > 10`, `field is not empty` |
+ | `transform` | `preset`, `params` | Format conversion using a preset template |
+ | `dedupe` | `key`, `similar` | Deduplication, exact or similarity-based |
+ | `sample` | `num`, `seed` | Random sampling |
+ | `head` | `num` | Keep the first N records |
+ | `tail` | `num` | Keep the last N records |
+ | `shuffle` | `seed` | Shuffle order |
+ | `split` | `ratio`, `seed` | Dataset splitting |
+
+ Run a pipeline:
+
+ ```bash
+ dt run pipeline.yaml
+ dt run pipeline.yaml --input=new_data.jsonl   # override the input file
+ ```
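For orientation, the YAML above maps roughly onto the chained in-memory API shown later in this README; the following is only a sketch (the preset-based `transform` step is applied by `dt run` and its programmatic form is not shown here, and the field names are taken from the example config):

```python
from dtflow import DataTransformer

# Rough in-memory equivalent of the pipeline steps above (sketch only)
dt = DataTransformer.load("raw_data.jsonl")
result = (dt
    .filter(lambda x: x.score > 0.5)       # step 1: condition "score > 0.5"
    .filter(lambda x: len(x.text) > 10)    # step 2: condition "len(text) > 10"
    # step 3 (transform with preset=openai_chat) is handled by `dt run`
    .dedupe("text")                        # step 4: dedupe by key "text"
)
result.save("processed.jsonl")
```

The YAML form adds value over the chained calls mainly because the same steps can be re-run with `dt run` and versioned alongside the data.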
+
+ ### Data lineage tracking
+
+ Record the full history of data processing, enabling reproducibility and issue tracing:
+
+ ```python
+ # Enable lineage tracking
+ dt = DataTransformer.load("raw.jsonl", track_lineage=True)
+
+ # Process the data as usual
+ result = (dt
+     .filter(lambda x: x.score > 0.5)
+     .transform(lambda x: {"q": x.q, "a": x.a})
+     .dedupe("q")
+ )
+
+ # Record lineage when saving
+ result.save("processed.jsonl", lineage=True)
+ # Automatically writes processed.jsonl.lineage.json
+ ```
+
+ View the lineage history:
+
+ ```bash
+ dt history processed.jsonl
+ # Output:
+ # 📊 Lineage report: processed.jsonl
+ # └─ Version 1
+ #    Source: raw.jsonl
+ #    Operation chain:
+ #    ├─ filter: 1000 → 800
+ #    ├─ transform: 800 → 800
+ #    └─ dedupe: 800 → 750
+ #    Output count: 750
+
+ dt history processed.jsonl --json   # JSON output
+ ```
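Because the lineage record is a plain JSON sidecar file, it can also be inspected programmatically; a minimal sketch (the exact schema of the `.lineage.json` file is not documented here, so this simply pretty-prints whatever was recorded):

```python
import json
from pathlib import Path

# Sidecar file written next to the saved dataset by save(..., lineage=True)
lineage_path = Path("processed.jsonl.lineage.json")
record = json.loads(lineage_path.read_text(encoding="utf-8"))

# Pretty-print the recorded history
print(json.dumps(record, indent=2, ensure_ascii=False))
```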
+
+ ### Streaming for large files
+
+ A streaming interface designed for very large files, with O(1) memory usage and support for JSONL, CSV, Parquet, and Arrow:
+
+ ```python
+ from dtflow import load_stream, load_sharded
+
+ # Stream-load and process (even a 100GB file uses only constant memory)
+ (load_stream("huge_100gb.jsonl")
+     .filter(lambda x: x["score"] > 0.5)
+     .transform(lambda x: {"text": x["content"]})
+     .save("output.jsonl"))
+
+ # Cross-format conversion (CSV → Parquet)
+ (load_stream("data.csv")
+     .filter(lambda x: x["score"] > 0.5)
+     .save("output.parquet"))
+
+ # Load sharded files (multiple formats supported)
+ (load_sharded("data/train_*.parquet")
+     .filter(lambda x: len(x["text"]) > 10)
+     .save("merged.jsonl"))
+
+ # Sharded saving
+ (load_stream("huge.jsonl")
+     .transform(lambda x: {"q": x["question"], "a": x["answer"]})
+     .save_sharded("output/", shard_size=100000))
+ # Produces: output/part-00000.jsonl, output/part-00001.jsonl, ...
+
+ # Batch processing (useful when calling an API in batches)
+ for batch in load_stream("data.jsonl").batch(1000):
+     results = call_api(batch)  # process one batch at a time
+ ```
+
+ Highlights:
+ - **Lazy execution**: filter/transform do not run immediately; they are only triggered by save/collect (see the sketch below)
+ - **O(1) memory**: memory usage stays constant regardless of file size (on the read side)
+ - **Multi-format**: JSONL, CSV, Parquet, and Arrow can all be streamed
+ - **Cross-format conversion**: read CSV and save directly to Parquet, etc.
+ - **Sharding**: load multiple shards via glob patterns and process them as one stream
+
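A small illustration of the lazy-execution point above, assuming `collect()` materializes the remaining records into a Python list (as implied by the save/collect note; field names are placeholders):

```python
from dtflow import load_stream

# Building the stream is cheap: nothing is read or transformed yet
stream = (load_stream("data.jsonl")
    .filter(lambda x: x["score"] > 0.5)
    .transform(lambda x: {"text": x["content"]}))

# Work happens only when results are materialized or written out
rows = stream.collect()
print(f"kept {len(rows)} records")
```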
  ## Error handling

  ```python
@@ -0,0 +1,94 @@
+ """
+ DataTransformer: a concise data-format conversion tool.
+
+ Core features:
+ - DataTransformer: data loading, transformation, and saving
+ - presets: preset transformation templates (openai_chat, alpaca, sharegpt, dpo_pair, simple_qa)
+ - tokenizers: token counting and filtering
+ - converters: conversions for HuggingFace/OpenAI and other formats
+ """
+
+ from .converters import (  # LLaMA-Factory extensions; ms-swift
+     from_hf_dataset,
+     from_openai_batch,
+     messages_to_text,
+     to_axolotl,
+     to_hf_chat_format,
+     to_hf_dataset,
+     to_llama_factory,
+     to_llama_factory_sharegpt,
+     to_llama_factory_vlm,
+     to_llama_factory_vlm_sharegpt,
+     to_openai_batch,
+     to_swift_messages,
+     to_swift_query_response,
+     to_swift_vlm,
+ )
+ from .core import DataTransformer, DictWrapper, TransformError, TransformErrors
+ from .presets import get_preset, list_presets
+ from .storage import load_data, sample_file, save_data
+ from .streaming import StreamingTransformer, load_sharded, load_stream, process_shards
+ from .tokenizers import (
+     DEFAULT_MODEL,
+     MODEL_ALIASES,
+     OPENAI_MODELS,
+     count_tokens,
+     messages_token_counter,
+     messages_token_filter,
+     messages_token_stats,
+     resolve_model,
+     token_counter,
+     token_filter,
+     token_stats,
+ )
+
+ __version__ = "0.3.2"
+
+ __all__ = [
+     # core
+     "DataTransformer",
+     "DictWrapper",
+     "TransformError",
+     "TransformErrors",
+     # presets
+     "get_preset",
+     "list_presets",
+     # storage
+     "save_data",
+     "load_data",
+     "sample_file",
+     # tokenizers
+     "count_tokens",
+     "token_counter",
+     "token_filter",
+     "token_stats",
+     "messages_token_counter",
+     "messages_token_filter",
+     "messages_token_stats",
+     "DEFAULT_MODEL",
+     "MODEL_ALIASES",
+     "OPENAI_MODELS",
+     "resolve_model",
+     # converters
+     "to_hf_dataset",
+     "from_hf_dataset",
+     "to_hf_chat_format",
+     "from_openai_batch",
+     "to_openai_batch",
+     "to_llama_factory",
+     "to_axolotl",
+     "messages_to_text",
+     # LLaMA-Factory extensions
+     "to_llama_factory_sharegpt",
+     "to_llama_factory_vlm",
+     "to_llama_factory_vlm_sharegpt",
+     # ms-swift
+     "to_swift_messages",
+     "to_swift_query_response",
+     "to_swift_vlm",
+     # streaming
+     "StreamingTransformer",
+     "load_stream",
+     "load_sharded",
+     "process_shards",
+ ]
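For context, a quick sketch of how the re-exported names above fit together (only calls already shown in the README are exercised; `list_presets()` is assumed to take no arguments, and the file paths are placeholders):

```python
from dtflow import DataTransformer, __version__, list_presets, load_stream

print(__version__)      # "0.3.2"
print(list_presets())   # preset names such as openai_chat, alpaca, sharegpt, ...

# In-memory API
dt = DataTransformer.load("data.jsonl")
dt.filter(lambda x: x.score > 0.5).save("filtered.jsonl")

# Streaming API for files too large to hold in memory
(load_stream("big.jsonl")
    .filter(lambda x: x["score"] > 0.5)
    .save("filtered_stream.jsonl"))
```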