dtflow 0.2.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {dtflow-0.2.0 → dtflow-0.3.1}/PKG-INFO +234 -15
  2. dtflow-0.3.1/README.md +459 -0
  3. {dtflow-0.2.0 → dtflow-0.3.1}/dtflow/__init__.py +36 -2
  4. dtflow-0.3.1/dtflow/__main__.py +307 -0
  5. dtflow-0.3.1/dtflow/cli/__init__.py +12 -0
  6. {dtflow-0.2.0 → dtflow-0.3.1}/dtflow/cli/commands.py +1030 -92
  7. dtflow-0.3.1/dtflow/converters.py +764 -0
  8. {dtflow-0.2.0 → dtflow-0.3.1}/dtflow/core.py +96 -31
  9. dtflow-0.3.1/dtflow/lineage.py +407 -0
  10. {dtflow-0.2.0 → dtflow-0.3.1}/dtflow/mcp/cli.py +14 -14
  11. dtflow-0.3.1/dtflow/pipeline.py +450 -0
  12. dtflow-0.3.1/dtflow/storage/io.py +701 -0
  13. dtflow-0.3.1/dtflow/streaming.py +661 -0
  14. dtflow-0.3.1/dtflow/tokenizers.py +542 -0
  15. {dtflow-0.2.0 → dtflow-0.3.1}/dtflow/utils/display.py +5 -4
  16. {dtflow-0.2.0 → dtflow-0.3.1}/pyproject.toml +17 -5
  17. dtflow-0.3.1/tests/benchmark_io.py +229 -0
  18. dtflow-0.3.1/tests/test_converters.py +929 -0
  19. dtflow-0.3.1/tests/test_streaming.py +281 -0
  20. dtflow-0.3.1/tests/test_tokenizers.py +623 -0
  21. dtflow-0.2.0/README.md +0 -247
  22. dtflow-0.2.0/dtflow/__main__.py +0 -254
  23. dtflow-0.2.0/dtflow/cli/__init__.py +0 -6
  24. dtflow-0.2.0/dtflow/converters.py +0 -308
  25. dtflow-0.2.0/dtflow/storage/io.py +0 -695
  26. dtflow-0.2.0/dtflow/tokenizers.py +0 -186
  27. dtflow-0.2.0/tests/test_converters.py +0 -478
  28. dtflow-0.2.0/tests/test_tokenizers.py +0 -270
  29. {dtflow-0.2.0 → dtflow-0.3.1}/.gitignore +0 -0
  30. {dtflow-0.2.0 → dtflow-0.3.1}/dtflow/mcp/__init__.py +0 -0
  31. {dtflow-0.2.0 → dtflow-0.3.1}/dtflow/mcp/__main__.py +0 -0
  32. {dtflow-0.2.0 → dtflow-0.3.1}/dtflow/mcp/docs.py +0 -0
  33. {dtflow-0.2.0 → dtflow-0.3.1}/dtflow/mcp/server.py +0 -0
  34. {dtflow-0.2.0 → dtflow-0.3.1}/dtflow/presets.py +0 -0
  35. {dtflow-0.2.0 → dtflow-0.3.1}/dtflow/storage/__init__.py +0 -0
  36. {dtflow-0.2.0 → dtflow-0.3.1}/dtflow/utils/__init__.py +0 -0
  37. {dtflow-0.2.0 → dtflow-0.3.1}/tests/test_transformer.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dtflow
- Version: 0.2.0
+ Version: 0.3.1
  Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
  Project-URL: Homepage, https://github.com/yourusername/DataTransformer
  Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
@@ -27,11 +27,12 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Classifier: Topic :: Text Processing
  Requires-Python: >=3.8
- Requires-Dist: fire>=0.4.0
  Requires-Dist: numpy>=1.20.0
  Requires-Dist: orjson>=3.9.0
+ Requires-Dist: polars>=0.20.0
  Requires-Dist: pyyaml>=5.4.0
- Requires-Dist: tqdm>=4.60.0
+ Requires-Dist: rich>=10.0.0
+ Requires-Dist: typer>=0.9.0
  Provides-Extra: converters
  Requires-Dist: datasets>=2.0.0; extra == 'converters'
  Provides-Extra: dev
@@ -42,7 +43,6 @@ Requires-Dist: mypy>=0.910; extra == 'dev'
  Requires-Dist: pytest-cov>=2.12.0; extra == 'dev'
  Requires-Dist: pytest>=6.0.0; extra == 'dev'
  Provides-Extra: display
- Requires-Dist: rich>=10.0.0; extra == 'display'
  Provides-Extra: docs
  Requires-Dist: myst-parser>=0.15.0; extra == 'docs'
  Requires-Dist: sphinx-rtd-theme>=0.5.0; extra == 'docs'
@@ -50,21 +50,28 @@ Requires-Dist: sphinx>=4.0.0; extra == 'docs'
  Provides-Extra: full
  Requires-Dist: datasets>=2.0.0; extra == 'full'
  Requires-Dist: datasketch>=1.5.0; extra == 'full'
- Requires-Dist: pandas>=1.3.0; extra == 'full'
+ Requires-Dist: huggingface-hub>=0.20.0; extra == 'full'
  Requires-Dist: pyarrow; extra == 'full'
  Requires-Dist: rich>=10.0.0; extra == 'full'
  Requires-Dist: scikit-learn>=0.24.0; extra == 'full'
  Requires-Dist: tiktoken>=0.5.0; extra == 'full'
+ Requires-Dist: tokenizers>=0.15.0; extra == 'full'
+ Requires-Dist: toolong>=1.5.0; extra == 'full'
+ Provides-Extra: logs
+ Requires-Dist: toolong>=1.5.0; extra == 'logs'
  Provides-Extra: mcp
  Requires-Dist: mcp>=1.0.0; extra == 'mcp'
  Provides-Extra: similarity
  Requires-Dist: datasketch>=1.5.0; extra == 'similarity'
  Requires-Dist: scikit-learn>=0.24.0; extra == 'similarity'
  Provides-Extra: storage
- Requires-Dist: pandas>=1.3.0; extra == 'storage'
  Requires-Dist: pyarrow; extra == 'storage'
  Provides-Extra: tokenizers
  Requires-Dist: tiktoken>=0.5.0; extra == 'tokenizers'
+ Provides-Extra: tokenizers-hf
+ Requires-Dist: huggingface-hub>=0.20.0; extra == 'tokenizers-hf'
+ Requires-Dist: tiktoken>=0.5.0; extra == 'tokenizers-hf'
+ Requires-Dist: tokenizers>=0.15.0; extra == 'tokenizers-hf'
  Description-Content-Type: text/markdown

  # dtflow
@@ -101,7 +108,7 @@ dt = DataTransformer.load("data.jsonl")
  ### Loading and saving data

  ```python
- # Supports JSONL, JSON, CSV, Parquet
+ # Supports JSONL, JSON, CSV, Parquet, Arrow (Polars engine, 3x faster than Pandas)
  dt = DataTransformer.load("data.jsonl")
  dt.save("output.jsonl")

@@ -159,11 +166,36 @@ stats = token_stats(dt.data, "text")
  # {"total_tokens": 12345, "avg_tokens": 123, "min_tokens": 5, "max_tokens": 500, ...}
  ```

- Supports the `tiktoken` (OpenAI, default) and `transformers` backends:
+ Supports the `tiktoken` (OpenAI, default) and `transformers` backends, with **automatic detection**:

  ```python
- # Use a transformers tokenizer
- count_tokens("Hello", model="Qwen/Qwen2-7B", backend="transformers")
+ # OpenAI models -> tiktoken is used automatically
+ count_tokens("Hello", model="gpt-4")
+
+ # HuggingFace / local models -> transformers is used automatically
+ count_tokens("Hello", model="Qwen/Qwen2-7B")
+ count_tokens("Hello", model="/home/models/qwen")
+ ```
+
+ ### Message token statistics
+
+ Token statistics designed for multi-turn conversations:
+
+ ```python
+ from dtflow import messages_token_counter, messages_token_filter, messages_token_stats
+
+ # Add token counts to each record
+ dt.transform(messages_token_counter(model="gpt-4"))                 # simple mode: total count only
+ dt.transform(messages_token_counter(model="gpt-4", detailed=True))  # detailed mode
+ # Detailed output: {"total": 500, "user": 200, "assistant": 280, "system": 20, "turns": 5, ...}
+
+ # Filter by token count and number of turns
+ dt.filter(messages_token_filter(min_tokens=100, max_tokens=4096))
+ dt.filter(messages_token_filter(min_turns=2, max_turns=10))
+
+ # Statistics for the whole dataset
+ stats = messages_token_stats(dt.data, model="gpt-4")
+ # {"count": 1000, "total_tokens": 500000, "user_tokens": 200000, "assistant_tokens": 290000, ...}
  ```

  ### Format converters
@@ -172,7 +204,7 @@ count_tokens("Hello", model="Qwen/Qwen2-7B", backend="transformers")
  from dtflow import (
      to_hf_dataset, from_hf_dataset,      # HuggingFace Dataset
      to_openai_batch, from_openai_batch,  # OpenAI Batch API
-     to_llama_factory,                    # LLaMA-Factory format
+     to_llama_factory,                    # LLaMA-Factory Alpaca format
      to_axolotl,                          # Axolotl format
      messages_to_text,                    # messages to plain text
  )
@@ -187,14 +219,62 @@ data = from_hf_dataset("tatsu-lab/alpaca", split="train")
  batch_input = dt.to(to_openai_batch(model="gpt-4o"))
  results = from_openai_batch(batch_output)

- # Training framework formats
- dt.transform(to_llama_factory()).save("llama_factory.jsonl")
- dt.transform(to_axolotl()).save("axolotl.jsonl")
-
  # messages to plain text (chatml/llama2/simple templates supported)
  dt.transform(messages_to_text(template="chatml"))
  ```

+ ### LLaMA-Factory format
+
+ Full support for LLaMA-Factory's SFT training formats:
+
+ ```python
+ from dtflow import (
+     to_llama_factory,               # Alpaca format (single-turn)
+     to_llama_factory_sharegpt,      # ShareGPT format (multi-turn conversations)
+     to_llama_factory_vlm,           # VLM Alpaca format
+     to_llama_factory_vlm_sharegpt,  # VLM ShareGPT format
+ )
+
+ # Alpaca format
+ dt.transform(to_llama_factory()).save("alpaca.jsonl")
+ # Output: {"instruction": "...", "input": "", "output": "..."}
+
+ # ShareGPT format (multi-turn conversations)
+ dt.transform(to_llama_factory_sharegpt()).save("sharegpt.jsonl")
+ # Output: {"conversations": [{"from": "human", "value": "..."}, {"from": "gpt", "value": "..."}], "system": "..."}
+
+ # VLM format (images/videos)
+ dt.transform(to_llama_factory_vlm(images_field="images")).save("vlm.jsonl")
+ # Output: {"instruction": "...", "output": "...", "images": ["/path/to/img.jpg"]}
+
+ dt.transform(to_llama_factory_vlm_sharegpt(images_field="images", videos_field="videos"))
+ # Output: {"conversations": [...], "images": [...], "videos": [...]}
+ ```
+
+ ### ms-swift format
+
+ Supports ModelScope ms-swift training formats:
+
+ ```python
+ from dtflow import (
+     to_swift_messages,        # standard messages format
+     to_swift_query_response,  # query-response format
+     to_swift_vlm,             # VLM format
+ )
+
+ # messages format
+ dt.transform(to_swift_messages()).save("swift_messages.jsonl")
+ # Output: {"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
+
+ # query-response format (history extracted automatically)
+ dt.transform(to_swift_query_response(query_field="messages")).save("swift_qr.jsonl")
+ # Output: {"query": "...", "response": "...", "system": "...", "history": [["q1", "a1"], ...]}
+
+ # VLM format
+ dt.transform(to_swift_vlm(images_field="images")).save("swift_vlm.jsonl")
+ # Output: {"messages": [...], "images": ["/path/to/img.jpg"]}
+ ```
+
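To make the "history extracted automatically" note above concrete, here is a worked data example. It is a sketch built only from the output layout shown in the comments above; the assumption that the input record carries an OpenAI-style `messages` list (as `query_field="messages"` suggests) is illustrative, not taken from dtflow's source.

```python
# Hypothetical input record, using the OpenAI-style messages layout shown elsewhere in this README.
record = {
    "messages": [
        {"role": "system", "content": "You are helpful."},
        {"role": "user", "content": "q1"},
        {"role": "assistant", "content": "a1"},
        {"role": "user", "content": "q2"},
        {"role": "assistant", "content": "a2"},
    ]
}

# Expected shape after to_swift_query_response, per the output comment above:
# the final user/assistant pair becomes query/response, earlier pairs become history.
expected = {
    "query": "q2",
    "response": "a2",
    "system": "You are helpful.",
    "history": [["q1", "a1"]],
}
```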
  ### Other operations

  ```python
@@ -220,6 +300,7 @@ dt.shuffle(seed=42)
  # Sampling
  dt sample data.jsonl --num=10
  dt sample data.csv --num=100 --sample_type=head
+ dt sample data.jsonl 1000 --by=category   # stratified sampling

  # Transform with a preset
  dt transform data.jsonl --preset=openai_chat
@@ -230,6 +311,18 @@ dt transform data.jsonl  # first run generates the config file
  # Edit .dt/data.py, then run again
  dt transform data.jsonl --num=100   # run the transform

+ # Run a pipeline (reproducible processing flow)
+ dt run pipeline.yaml
+ dt run pipeline.yaml --input=new_data.jsonl --output=result.jsonl
+
+ # Token statistics
+ dt token-stats data.jsonl --field=messages --model=gpt-4
+ dt token-stats data.jsonl --field=text --detailed
+
+ # Compare datasets
+ dt diff v1/train.jsonl v2/train.jsonl
+ dt diff a.jsonl b.jsonl --key=id
+
  # Cleaning
  dt clean data.jsonl --drop-empty              # drop records with any empty value
  dt clean data.jsonl --drop-empty=text,answer  # drop records where the given fields are empty
@@ -252,6 +345,132 @@ dt concat a.jsonl b.jsonl -o merged.jsonl
  dt stats data.jsonl
  ```

+ ### Pipeline configuration
+
+ Define a reproducible processing flow in a YAML config file:
+
+ ```yaml
+ # pipeline.yaml
+ version: "1.0"
+ seed: 42
+ input: raw_data.jsonl
+ output: processed.jsonl
+
+ steps:
+   - type: filter
+     condition: "score > 0.5"
+
+   - type: filter
+     condition: "len(text) > 10"
+
+   - type: transform
+     preset: openai_chat
+     params:
+       user_field: q
+       assistant_field: a
+
+   - type: dedupe
+     key: text
+ ```
+
+ Supported step types:
+
+ | Step | Parameters | Description |
+ |------|------------|-------------|
+ | `filter` | `condition` | Conditional filtering: `score > 0.5`, `len(text) > 10`, `field is not empty` |
+ | `transform` | `preset`, `params` | Format conversion using a preset template |
+ | `dedupe` | `key`, `similar` | Deduplication, exact or similarity-based |
+ | `sample` | `num`, `seed` | Random sampling |
+ | `head` | `num` | Keep the first N records |
+ | `tail` | `num` | Keep the last N records |
+ | `shuffle` | `seed` | Shuffle the records |
+ | `split` | `ratio`, `seed` | Split the dataset |
+
+ Run the pipeline:
+
+ ```bash
+ dt run pipeline.yaml
+ dt run pipeline.yaml --input=new_data.jsonl  # override the input file
+ ```
+
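For orientation, the pipeline above corresponds roughly to the eager chain below, written only with `DataTransformer` calls that appear elsewhere in this README (`load`, `filter`, `transform`, `dedupe`, `save`). This is a sketch, not dtflow's pipeline implementation: the field names come from the YAML example, the dedupe step is moved ahead of the transform so the `text` field still exists, and the assumption that the `openai_chat` preset emits an OpenAI-style `messages` list is illustrative.

```python
from dtflow import DataTransformer

# Rough eager-mode sketch of pipeline.yaml above (field names assumed from the YAML).
dt = DataTransformer.load("raw_data.jsonl")

result = (dt
    .filter(lambda x: x.score > 0.5)     # - type: filter, condition: "score > 0.5"
    .filter(lambda x: len(x.text) > 10)  # - type: filter, condition: "len(text) > 10"
    .dedupe("text")                      # - type: dedupe, key: text (done here so `text` still exists)
    # - type: transform, preset: openai_chat (assumed to map q/a into OpenAI-style messages)
    .transform(lambda x: {"messages": [
        {"role": "user", "content": x.q},
        {"role": "assistant", "content": x.a},
    ]})
)

result.save("processed.jsonl")
```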
+ ### Data lineage tracking
+
+ Record the full processing history of a dataset, for reproducibility and for tracing problems back to their source:
+
+ ```python
+ # Enable lineage tracking
+ dt = DataTransformer.load("raw.jsonl", track_lineage=True)
+
+ # Process data as usual
+ result = (dt
+     .filter(lambda x: x.score > 0.5)
+     .transform(lambda x: {"q": x.q, "a": x.a})
+     .dedupe("q")
+ )
+
+ # Record lineage when saving
+ result.save("processed.jsonl", lineage=True)
+ # processed.jsonl.lineage.json is generated automatically
+ ```
+
+ Inspect the lineage history:
+
+ ```bash
+ dt history processed.jsonl
+ # Output:
+ # 📊 Lineage report: processed.jsonl
+ # └─ Version 1
+ #    Source: raw.jsonl
+ #    Operation chain:
+ #    ├─ filter: 1000 → 800
+ #    ├─ transform: 800 → 800
+ #    └─ dedupe: 800 → 750
+ #    Output records: 750
+
+ dt history processed.jsonl --json   # JSON output
+ ```
+
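Because the lineage record is a plain JSON file written next to the output (`processed.jsonl.lineage.json`, per the comment above), it can also be inspected without the `dt history` command. A minimal sketch using only the standard library; the file's exact schema is not documented here, so it is simply pretty-printed:

```python
import json
from pathlib import Path

# Path produced by result.save("processed.jsonl", lineage=True) in the example above.
lineage_path = Path("processed.jsonl.lineage.json")

with lineage_path.open("r", encoding="utf-8") as f:
    lineage = json.load(f)

# The schema is not specified in this README, so just pretty-print whatever was recorded.
print(json.dumps(lineage, indent=2, ensure_ascii=False))
```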
+ ### Streaming for large files
+
+ A streaming interface designed for very large files, with O(1) memory usage and support for JSONL, CSV, Parquet, and Arrow:
+
+ ```python
+ from dtflow import load_stream, load_sharded
+
+ # Stream-load and process (constant memory, even for a 100 GB file)
+ (load_stream("huge_100gb.jsonl")
+     .filter(lambda x: x["score"] > 0.5)
+     .transform(lambda x: {"text": x["content"]})
+     .save("output.jsonl"))
+
+ # Cross-format conversion (CSV → Parquet)
+ (load_stream("data.csv")
+     .filter(lambda x: x["score"] > 0.5)
+     .save("output.parquet"))
+
+ # Load sharded files (multiple formats supported)
+ (load_sharded("data/train_*.parquet")
+     .filter(lambda x: len(x["text"]) > 10)
+     .save("merged.jsonl"))
+
+ # Sharded save
+ (load_stream("huge.jsonl")
+     .transform(lambda x: {"q": x["question"], "a": x["answer"]})
+     .save_sharded("output/", shard_size=100000))
+ # Produces: output/part-00000.jsonl, output/part-00001.jsonl, ...
+
+ # Batch processing (useful when calling an API in batches)
+ for batch in load_stream("data.jsonl").batch(1000):
+     results = call_api(batch)  # process one batch
+ ```
+
+ Highlights:
+ - **Lazy execution**: filter/transform do not run immediately; work is triggered only by save/collect
+ - **O(1) memory**: memory usage stays constant regardless of file size (on the read side)
+ - **Multiple formats**: JSONL, CSV, Parquet, and Arrow can all be streamed
+ - **Cross-format conversion**: read from CSV and save directly to Parquet, and so on
+ - **Shard support**: load multiple shards via glob patterns; they are merged and processed automatically
+
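The lazy-execution point above can be made visible by separating chain construction from the terminal call. A small sketch reusing only the streaming calls shown above; `collect()` is named in the list above as the other trigger but is not called here:

```python
from dtflow import load_stream

# Building the chain is lazy: nothing is read or transformed yet.
stream = (load_stream("huge_100gb.jsonl")
    .filter(lambda x: x["score"] > 0.5)
    .transform(lambda x: {"text": x["content"]}))

# Work happens only at a terminal operation such as save() (or collect()).
stream.save("output.jsonl")

# Batches are also produced lazily, one at a time, as the loop asks for them.
for batch in load_stream("data.jsonl").batch(1000):
    print(len(batch))
```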
  ## Error handling

  ```python