dtflow-0.5.8-py3-none-any.whl → dtflow-0.5.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtflow/SKILL.md +22 -8
- dtflow/__init__.py +1 -1
- dtflow/__main__.py +108 -14
- dtflow/cli/clean.py +90 -1
- dtflow/cli/commands.py +17 -1
- dtflow/cli/eval.py +288 -0
- dtflow/cli/export.py +81 -0
- dtflow/cli/sample.py +90 -3
- dtflow/cli/split.py +138 -0
- dtflow/cli/stats.py +10 -23
- dtflow/cli/validate.py +19 -52
- dtflow/eval.py +276 -0
- dtflow/schema.py +13 -99
- dtflow/tokenizers.py +21 -104
- dtflow/utils/text_parser.py +124 -0
- {dtflow-0.5.8.dist-info → dtflow-0.5.9.dist-info}/METADATA +29 -3
- {dtflow-0.5.8.dist-info → dtflow-0.5.9.dist-info}/RECORD +19 -15
- dtflow/parallel.py +0 -115
- {dtflow-0.5.8.dist-info → dtflow-0.5.9.dist-info}/WHEEL +0 -0
- {dtflow-0.5.8.dist-info → dtflow-0.5.9.dist-info}/entry_points.txt +0 -0
dtflow/tokenizers.py
CHANGED
@@ -122,8 +122,8 @@ def _get_tiktoken_encoder(model: str):
             _tokenizer_cache[model] = tiktoken.get_encoding(model)
         else:
             _tokenizer_cache[model] = tiktoken.encoding_for_model(model)
-    except ImportError
-        raise ImportError("需要安装 tiktoken: pip install tiktoken")
+    except ImportError:
+        raise ImportError("需要安装 tiktoken: pip install tiktoken")
     return _tokenizer_cache[model]
 
 
@@ -149,12 +149,12 @@ def _get_hf_tokenizer(model: str):
 
         tokenizer = AutoTokenizer.from_pretrained(resolved, trust_remote_code=True)
         _tokenizer_cache[resolved] = ("transformers", tokenizer)
-    except ImportError
+    except ImportError:
         raise ImportError(
             "需要安装 tokenizers 或 transformers:\n"
             "  pip install tokenizers huggingface_hub (推荐,更轻量)\n"
             "  pip install transformers"
-            )
+        )
     return _tokenizer_cache[resolved]
 
 
@@ -309,29 +309,12 @@ def _std(counts: List[int], avg: float) -> float:
     return variance**0.5
 
 
-def _count_item_tokens(args: tuple) -> int:
-    """
-    计算单条数据的 token 数(用于多进程)。
-
-    Args:
-        args: (item, fields, model, backend) 元组
-    """
-    item, fields, model, backend = args
-    total = 0
-    for field in fields:
-        value = get_field_with_spec(item, field, default="")
-        if value:
-            total += count_tokens(str(value), model=model, backend=backend)
-    return total
-
-
 def token_stats(
     data: List[Dict[str, Any]],
     fields: Union[str, List[str]],
     model: str = DEFAULT_MODEL,
     backend: Optional[str] = None,
     progress_callback: Optional[Callable[[int, int], None]] = None,
-    workers: Optional[int] = None,
 ) -> Dict[str, Any]:
     """
     统计数据集的 token 信息。
@@ -342,7 +325,6 @@ def token_stats(
         model: 模型名称或别名,如 "qwen2.5", "gpt-4" 等
         backend: 后端选择,None 则自动检测
         progress_callback: 进度回调函数,接收 (current, total) 两个参数
-        workers: 进程数,None 自动检测,1 表示禁用并行
 
     Returns:
         统计信息字典,包含:
@@ -360,42 +342,17 @@ def token_stats(
     if not data:
         return {"total_tokens": 0, "count": 0}
 
+    counts = []
     total_items = len(data)
-
-
-
-
-
-
-
-
-
-        # 准备参数
-        args_list = [(item, fields, model, _backend) for item in data]
-        counts = []
-        for i, result in enumerate(
-            parallel_imap(
-                _count_item_tokens,
-                args_list,
-                workers=actual_workers,
-                threshold=1000,
-            )
-        ):
-            counts.append(result)
-            if progress_callback:
-                progress_callback(i + 1, total_items)
-    else:
-        # 串行处理
-        counts = []
-        for i, item in enumerate(data):
-            total = 0
-            for field in fields:
-                value = get_field_with_spec(item, field, default="")
-                if value:
-                    total += count_tokens(str(value), model=model, backend=_backend)
-            counts.append(total)
-            if progress_callback:
-                progress_callback(i + 1, total_items)
+    for i, item in enumerate(data):
+        total = 0
+        for field in fields:
+            value = get_field_with_spec(item, field, default="")
+            if value:
+                total += count_tokens(str(value), model=model, backend=backend)
+        counts.append(total)
+        if progress_callback:
+            progress_callback(i + 1, total_items)
 
     sorted_counts = sorted(counts)
     avg = sum(counts) / len(counts)
@@ -591,27 +548,12 @@ def messages_token_filter(
     return filter_func
 
 
-def _count_messages_tokens_wrapper(args: tuple) -> Optional[Dict[str, int]]:
-    """
-    计算单条 messages 的 token 数(用于多进程)。
-
-    Args:
-        args: (item, messages_field, model, backend) 元组
-    """
-    item, messages_field, model, backend = args
-    messages = get_field_with_spec(item, messages_field, default=[])
-    if messages:
-        return _count_messages_tokens(messages, model=model, backend=backend)
-    return None
-
-
 def messages_token_stats(
     data: List[Dict[str, Any]],
     messages_field: str = "messages",
     model: str = DEFAULT_MODEL,
     backend: Optional[str] = None,
     progress_callback: Optional[Callable[[int, int], None]] = None,
-    workers: Optional[int] = None,
 ) -> Dict[str, Any]:
     """
     统计数据集中 messages 的 token 信息。
@@ -622,7 +564,6 @@ def messages_token_stats(
         model: 模型名称或别名
        backend: 后端,None 则自动检测
         progress_callback: 进度回调函数,接收 (current, total) 两个参数
-        workers: 进程数,None 自动检测,1 表示禁用并行
 
     Returns:
         统计信息字典,包含:
@@ -640,38 +581,14 @@ def messages_token_stats(
     if not data:
         return {"count": 0, "total_tokens": 0}
 
-    total_items = len(data)
-
-    # 判断是否使用多进程
-    use_parallel = workers != 1 and total_items >= 1000
-
     all_stats = []
-
-
-
-
-
-
-
-            parallel_imap(
-                _count_messages_tokens_wrapper,
-                args_list,
-                workers=actual_workers,
-                threshold=1000,
-            )
-        ):
-            if result is not None:
-                all_stats.append(result)
-            if progress_callback:
-                progress_callback(i + 1, total_items)
-    else:
-        # 串行处理
-        for i, item in enumerate(data):
-            messages = get_field_with_spec(item, messages_field, default=[])
-            if messages:
-                all_stats.append(_count_messages_tokens(messages, model=model, backend=_backend))
-            if progress_callback:
-                progress_callback(i + 1, total_items)
+    total_items = len(data)
+    for i, item in enumerate(data):
+        messages = get_field_with_spec(item, messages_field, default=[])
+        if messages:
+            all_stats.append(_count_messages_tokens(messages, model=model, backend=_backend))
+        if progress_callback:
+            progress_callback(i + 1, total_items)
 
     if not all_stats:
         return {"count": 0, "total_tokens": 0}
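The practical effect of these hunks: `token_stats()` and `messages_token_stats()` lose their `workers` parameter and always count serially. A minimal sketch of calling the 0.5.9 API, assuming `token_stats` is imported from `dtflow.tokenizers` as defined in this diff and a tiktoken/transformers backend is installed; the sample records and progress printer are illustrative.

```python
# Sketch: the 0.5.9 call path -- no workers argument, serial counting.
from dtflow.tokenizers import token_stats

data = [{"text": "Hello world"}, {"text": "你好,世界"}]

def show_progress(current: int, total: int) -> None:
    # progress_callback receives (current, total), invoked once per item
    print(f"counted {current}/{total}")

stats = token_stats(data, fields="text", model="gpt-4",
                    progress_callback=show_progress)
# "count" and "total_tokens" keys are shown in the empty-input return above;
# assumed present for non-empty input as well.
print(stats["count"], stats["total_tokens"])
```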
dtflow/utils/text_parser.py
ADDED
@@ -0,0 +1,124 @@
+"""
+文本清洗工具
+
+提供 LLM 输出的常见清洗函数:
+- strip_think_tags: 去除 <think>...</think> 思考链内容
+- extract_code_snippets: 提取 ``` 代码块
+- parse_generic_tags: 解析 <tag>content</tag> 格式标签
+"""
+
+import re
+from typing import Dict, List
+
+
+def strip_think_tags(text: str) -> str:
+    """去除 <think>...</think> 包裹的内容
+
+    Args:
+        text: 输入文本
+
+    Returns:
+        去除思考链后的文本
+
+    Examples:
+        >>> strip_think_tags("<think>让我想想...</think>答案是42")
+        '答案是42'
+    """
+    if not text:
+        return text
+    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
+
+
+def extract_code_snippets(text: str, strict: bool = True) -> List[Dict[str, str]]:
+    """提取 ``` 代码块
+
+    Args:
+        text: 输入文本
+        strict: True 仅匹配 ```lang...``` 格式,False 额外匹配 {...} 格式
+
+    Returns:
+        代码片段列表,每项为 {"language": ..., "code": ...}
+
+    Examples:
+        >>> extract_code_snippets("```json\\n{\"a\": 1}\\n```")
+        [{'language': 'json', 'code': '{"a": 1}'}]
+    """
+    pattern = r"```(\w+)?\s*([\s\S]*?)```"
+    matches = re.findall(pattern, text)
+
+    code_snippets = []
+    for lang, code in matches:
+        code_snippets.append(
+            {
+                "language": lang.strip() if lang else "unknown",
+                "code": code.strip(),
+            }
+        )
+
+    if not strict:
+        # 移除已匹配的 ``` 块,在剩余文本中匹配 { ... }
+        text = re.sub(pattern, "", text)
+        brace_matches = re.findall(r"\{[\s\S]*?\}", text)
+        for code in brace_matches:
+            code_snippets.append(
+                {
+                    "language": "unknown",
+                    "code": code.strip(),
+                }
+            )
+
+    return code_snippets
+
+
+def parse_generic_tags(text: str, strict: bool = False) -> Dict[str, str]:
+    """解析 XML 风格标签
+
+    支持两种模式:
+    - strict=True: 仅匹配闭合标签 <label>content</label>
+    - strict=False: 同时匹配开放式标签 <label>content,闭合标签优先
+
+    Args:
+        text: 输入文本
+        strict: 是否严格模式
+
+    Returns:
+        {标签名: 内容} 字典
+
+    Examples:
+        >>> parse_generic_tags("<标签>内容</标签>")
+        {'标签': '内容'}
+        >>> parse_generic_tags("<a>hello<b>world", strict=False)
+        {'a': 'hello', 'b': 'world'}
+    """
+    if not text:
+        return {}
+
+    result = {}
+
+    if strict:
+        pattern_closed = r"<([^>]+)>\s*(.*?)\s*</\1>"
+        matches = re.findall(pattern_closed, text, re.DOTALL)
+        for label, content in matches:
+            result[label.strip()] = content.strip()
+    else:
+        remaining_text = str(text)
+
+        # 1. 优先处理闭合标签
+        def process_closed_tag(match_obj):
+            label = match_obj.group(1).strip()
+            content = match_obj.group(2).strip()
+            result[label] = content
+            return ""
+
+        pattern_closed = r"<([^>]+)>\s*(.*?)\s*</\1>"
+        remaining_text = re.sub(pattern_closed, process_closed_tag, remaining_text, flags=re.DOTALL)
+
+        # 2. 在剩余文本中处理开放式标签
+        pattern_open = r"<([^>]+)>\s*(.*?)(?=<[^>]+>|$)"
+        matches_open = re.findall(pattern_open, remaining_text, re.DOTALL)
+        for label, content in matches_open:
+            label_stripped = label.strip()
+            if label_stripped not in result:
+                result[label_stripped] = content.strip()
+
+    return result
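Taken together, the doctests above compose into a small smoke test of the new module; inputs and expected values come straight from the docstring examples in the diff.

```python
# Exercise the three new helpers; expected values follow the docstrings above.
from dtflow.utils.text_parser import (
    extract_code_snippets,
    parse_generic_tags,
    strip_think_tags,
)

assert strip_think_tags("<think>让我想想...</think>答案是42") == "答案是42"
assert extract_code_snippets('```json\n{"a": 1}\n```') == [
    {"language": "json", "code": '{"a": 1}'}
]
# Open tags are picked up when strict=False (the default); closed tags win.
assert parse_generic_tags("<a>hello<b>world") == {"a": "hello", "b": "world"}
```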
{dtflow-0.5.8.dist-info → dtflow-0.5.9.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dtflow
-Version: 0.5.8
+Version: 0.5.9
 Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
 Project-URL: Homepage, https://github.com/yourusername/DataTransformer
 Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
@@ -44,6 +44,7 @@ Requires-Dist: flake8>=3.9.0; extra == 'dev'
 Requires-Dist: huggingface-hub>=0.20.0; extra == 'dev'
 Requires-Dist: isort>=5.9.0; extra == 'dev'
 Requires-Dist: mypy>=0.910; extra == 'dev'
+Requires-Dist: pandas>=1.3.0; extra == 'dev'
 Requires-Dist: pyarrow; extra == 'dev'
 Requires-Dist: pytest-cov>=2.12.0; extra == 'dev'
 Requires-Dist: pytest>=6.0.0; extra == 'dev'
@@ -57,10 +58,14 @@ Provides-Extra: docs
 Requires-Dist: myst-parser>=0.15.0; extra == 'docs'
 Requires-Dist: sphinx-rtd-theme>=0.5.0; extra == 'docs'
 Requires-Dist: sphinx>=4.0.0; extra == 'docs'
+Provides-Extra: eval
+Requires-Dist: pandas>=1.3.0; extra == 'eval'
+Requires-Dist: scikit-learn>=0.24.0; extra == 'eval'
 Provides-Extra: full
 Requires-Dist: datasets>=2.0.0; extra == 'full'
 Requires-Dist: datasketch>=1.5.0; extra == 'full'
 Requires-Dist: huggingface-hub>=0.20.0; extra == 'full'
+Requires-Dist: pandas>=1.3.0; extra == 'full'
 Requires-Dist: pyarrow; extra == 'full'
 Requires-Dist: rich>=10.0.0; extra == 'full'
 Requires-Dist: scikit-learn>=0.24.0; extra == 'full'
@@ -435,6 +440,13 @@ dt sample data.jsonl 1000 --by=messages.#  # 按消息数量分层采样
 dt sample data.jsonl --where="category=tech"   # 筛选后采样
 dt sample data.jsonl --where="messages.#>=2"   # 多条件筛选
 
+# 按行范围查看(Python 切片语法)
+dt slice data.jsonl 10:20                     # 第 10-19 行(0-based,左闭右开)
+dt slice data.jsonl :100                      # 前 100 行
+dt slice data.jsonl 100:                      # 第 100 行到末尾
+dt slice data.jsonl 10:20 -o sliced.jsonl     # 保存到文件
+dt slice data.jsonl 10:20 -f question,answer  # 只显示指定字段
+
 # 数据转换 - 预设模式
 dt transform data.jsonl --preset=openai_chat
 dt transform data.jsonl --preset=alpaca
@@ -452,7 +464,6 @@ dt run pipeline.yaml --input=new_data.jsonl --output=result.jsonl
 dt token-stats data.jsonl --field=messages --model=gpt-4
 dt token-stats data.jsonl --field=messages[-1].content  # 统计最后一条消息
 dt token-stats data.jsonl --field=text --detailed
-dt token-stats data.jsonl --workers=4  # 多进程加速(数据量大时自动启用)
 
 # 数据对比
 dt diff v1/train.jsonl v2/train.jsonl
@@ -469,6 +480,9 @@ dt clean data.jsonl --max-len=messages[-1].content:500  # 最后一条消息最
 dt clean data.jsonl --keep=question,answer   # 只保留这些字段
 dt clean data.jsonl --drop=metadata          # 删除指定字段
 dt clean data.jsonl --strip                  # 去除字符串首尾空白
+dt clean data.jsonl --min-tokens=content:10        # 最少 10 tokens
+dt clean data.jsonl --max-tokens=content:1000      # 最多 1000 tokens
+dt clean data.jsonl --min-tokens=text:50 -m gpt-4  # 指定分词器
 
 # 数据去重
 dt dedupe data.jsonl                  # 全量精确去重
@@ -477,6 +491,17 @@ dt dedupe data.jsonl --key=meta.id    # 按嵌套字段去重
 dt dedupe data.jsonl --key=messages[0].content  # 按第一条消息内容去重
 dt dedupe data.jsonl --key=text --similar=0.8   # 相似度去重
 
+# 数据集切分
+dt split data.jsonl --ratio=0.8 --seed=42       # 二分: train/test
+dt split data.jsonl --ratio=0.7,0.15,0.15       # 三分: train/val/test
+dt split data.jsonl --ratio=0.8 -o /tmp/output  # 指定输出目录
+
+# 训练框架导出
+dt export data.jsonl --framework=llama-factory  # 导出到 LLaMA-Factory
+dt export data.jsonl -f swift -o ./swift_out    # 导出到 ms-swift
+dt export data.jsonl -f axolotl                 # 导出到 Axolotl
+dt export data.jsonl -f llama-factory --check   # 仅检查兼容性
+
 # 文件拼接
 dt concat a.jsonl b.jsonl -o merged.jsonl
@@ -496,7 +521,6 @@ dt validate data.jsonl --preset=openai_chat  # 使用预设 schema 验
 dt validate data.jsonl --preset=alpaca --verbose  # 详细输出
 dt validate data.jsonl --preset=sharegpt --filter-invalid -o valid.jsonl  # 过滤出有效数据
 dt validate data.jsonl --preset=dpo --max-errors=100  # 限制错误输出数量
-dt validate data.jsonl --preset=openai_chat --workers=4  # 多进程加速
 ```
 
 ### 字段路径语法
@@ -522,6 +546,8 @@ CLI 命令中的字段参数支持嵌套路径语法,可访问深层嵌套的
 | `clean` | `--drop-empty=` | `--drop-empty=meta.source` |
 | `clean` | `--min-len=` | `--min-len=messages.#:2` |
 | `clean` | `--max-len=` | `--max-len=messages[-1].content:500` |
+| `clean` | `--min-tokens=` | `--min-tokens=content:10` |
+| `clean` | `--max-tokens=` | `--max-tokens=content:1000` |
 | `token-stats` | `--field=` | `--field=messages[-1].content` |
 | `diff` | `--key=` | `--key=meta.uuid` |
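The new `dt split` command divides a dataset by ratios with an optional seed; `dtflow/cli/split.py` itself is not shown in this diff, so as a rough illustration of the technique here is a shuffle-then-partition sketch. All names in it are hypothetical, not dtflow's implementation.

```python
# Illustrative only: ratio split as deterministic shuffle + contiguous cuts.
import random
from typing import Any, Dict, List

def ratio_split(
    rows: List[Dict[str, Any]], ratios: List[float], seed: int = 42
) -> List[List[Dict[str, Any]]]:
    """Shuffle deterministically, then cut into len(ratios) parts."""
    assert abs(sum(ratios) - 1.0) < 1e-9, "ratios must sum to 1"
    shuffled = rows[:]
    random.Random(seed).shuffle(shuffled)  # same seed -> same split
    parts, start = [], 0
    for r in ratios[:-1]:
        end = start + int(len(shuffled) * r)
        parts.append(shuffled[start:end])
        start = end
    parts.append(shuffled[start:])  # last part absorbs rounding remainder
    return parts

data = [{"text": f"example {i}"} for i in range(100)]
train, val, test = ratio_split(data, [0.7, 0.15, 0.15])  # cf. --ratio=0.7,0.15,0.15
print(len(train), len(val), len(test))  # 70 15 15
```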
{dtflow-0.5.8.dist-info → dtflow-0.5.9.dist-info}/RECORD
CHANGED
@@ -1,35 +1,39 @@
-dtflow/SKILL.md,sha256=
-dtflow/__init__.py,sha256=
-dtflow/__main__.py,sha256=
+dtflow/SKILL.md,sha256=hPxJhroGmNbBv8MLZUkOA2yW1TDdUKEUYYlz9tW2mao,10393
+dtflow/__init__.py,sha256=9ZqhqD8qQM9w2dfHKyUWIaqSX-X4elWtbaQN4CNBhgg,3031
+dtflow/__main__.py,sha256=gg3v7u-Ot7AicgKrP1fuyKtMJXVduNuLmhy7L1LUPDg,17710
 dtflow/converters.py,sha256=X3qeFD7FCOMnfiP3MicL5MXimOm4XUYBs5pczIkudU0,22331
 dtflow/core.py,sha256=qMo6B3LK--TWRK7ZBKObGcs3pKFnd0NPoaM0T8JC7Jw,38135
+dtflow/eval.py,sha256=_c-XP2zsOBznYltSyKEScOqvmPVX2orqepg5cNhXXB0,9836
 dtflow/framework.py,sha256=jyICi_RWHjX7WfsXdSbWmP1SL7y1OWSPyd5G5Y-lvg4,17578
 dtflow/lineage.py,sha256=jie3OL1qK90-_cOOqqLbhSJ1oGUktDM1x5HRpQ5Qiyc,12800
-dtflow/parallel.py,sha256=EnIdGEGMrZUNT2-CBIV93UFfpqr_jU_heqqvdGXcP-Y,3046
 dtflow/pipeline.py,sha256=zZaC4fg5vsp_30Fhbg75vu0yggsdvf28bWBiVDWzZ6Y,13901
 dtflow/presets.py,sha256=qa8WQJhbNMuGxqqgA9BFadEBwDB9s0zWNxxhzF3q1K8,4701
-dtflow/schema.py,sha256=
+dtflow/schema.py,sha256=IFcij22_UFKcgKT1YWwRg2QJO0vcAvCb1arZmsGByts,16824
 dtflow/streaming.py,sha256=dxpNd1-Wz_PTLTdvM5qn06_2TJr5NRlIIuw0LOSS2Iw,24755
-dtflow/tokenizers.py,sha256=
+dtflow/tokenizers.py,sha256=7ZAelSmcDxLWH5kICgH9Q1ULH3_BfDZb9suHMjJJRZU,20589
 dtflow/cli/__init__.py,sha256=QhZ-thgx9IBTFII7T_hdoWFUl0CCsdGQHN5ZEZw2XB0,423
-dtflow/cli/clean.py,sha256=
-dtflow/cli/commands.py,sha256=
+dtflow/cli/clean.py,sha256=BEQQlH2q6luCbx51M3oxxOwcnwlOA8vo9WX3Fp7I6AY,29498
+dtflow/cli/commands.py,sha256=LvyDQ_nWUM7UlPDEFQadRdw5O2ZKDLgF41_xAJRhYxI,1583
 dtflow/cli/common.py,sha256=gCwnF5Sw2ploqfZJO_z3Ms9mR1HNT7Lj6ydHn0uVaIw,13817
+dtflow/cli/eval.py,sha256=c53kCRH86k2Q_6vESKFlcepcNnTpO9O68agWK4_oJj8,9582
+dtflow/cli/export.py,sha256=loRfVPwEVsDw3ZMKEYGp0Hy38kYZG2QT8JCMbz1dRzU,2156
 dtflow/cli/io_ops.py,sha256=BMDisP6dxzzmSjYwmeFwaHmpHHPqirmXAWeNTD-9MQM,13254
 dtflow/cli/lineage.py,sha256=_lNh35nF9AA0Zy6FyZ4g8IzrXH2ZQnp3inF-o2Hs1pw,1383
 dtflow/cli/pipeline.py,sha256=QNEo-BJlaC1CVnVeRZr7TwfuZYloJ4TebIzJ5ALzry0,1426
-dtflow/cli/sample.py,sha256=
+dtflow/cli/sample.py,sha256=etbro5I0pyNgn0Qfhp1M6Bh-95JN-AntDa5AwVe_oKY,18269
 dtflow/cli/skill.py,sha256=opiTEBejA7JHKrEMftMOPDQlOgZ4n59rwaHXGU1Nukk,2022
-dtflow/cli/
+dtflow/cli/split.py,sha256=96bhWnxHnjIqifoliLgciApkLbwQU8bWHovK8bcMk9g,3667
+dtflow/cli/stats.py,sha256=Jx3d4X0ftgpzU5q5RAWZEVJWwXviQTF4EAwBmz1IliA,31366
 dtflow/cli/transform.py,sha256=w6xqMOxPxQvL2u_BPCfpDHuPSC9gmcqMPVN8s-B6bbY,15052
-dtflow/cli/validate.py,sha256=
+dtflow/cli/validate.py,sha256=65aGVlMS_Rq0Ch0YQ-TclVJ03RQP4CnG137wthzb8Ao,4384
 dtflow/storage/__init__.py,sha256=C0jpWNQU808Ezz7lWneddABal3wILy8ijFUNiSKbHV4,362
 dtflow/storage/io.py,sha256=ZH2aSE-S89gpy3z4oTqhcqWf4u10OdkDoyul7o_YBDI,23374
 dtflow/utils/__init__.py,sha256=Pn-ltwV04fBQmeZG7FxInDQmzH29LYOi90LgeLMEuQk,506
 dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
 dtflow/utils/field_path.py,sha256=K8nU196RxTSJ1OoieTWGcYOWl9KjGq2iSxCAkfjECuM,7621
 dtflow/utils/helpers.py,sha256=JXN176_B2pm53GLVyZ1wj3wrmBJG52Tkw6AMQSdj7M8,791
-dtflow
-dtflow-0.5.
-dtflow-0.5.
-dtflow-0.5.
+dtflow/utils/text_parser.py,sha256=0t2TMOSha4dTiDu9H4ygdb67cI20zhtBH1XavDspL_g,3727
+dtflow-0.5.9.dist-info/METADATA,sha256=Pu92Dz2vj7U_dki4A0e5xgka36BTT9K2PnN1LIeEhN0,25839
+dtflow-0.5.9.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+dtflow-0.5.9.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
+dtflow-0.5.9.dist-info/RECORD,,
dtflow/parallel.py
DELETED
@@ -1,115 +0,0 @@
-"""
-并行处理模块
-
-提供多进程并行处理工具,用于加速大数据集的 token 统计和 schema 验证。
-"""
-
-from multiprocessing import Pool, cpu_count
-from typing import Callable, List, Optional, TypeVar
-
-T = TypeVar("T")
-R = TypeVar("R")
-
-
-def parallel_map(
-    func: Callable[[T], R],
-    data: List[T],
-    workers: Optional[int] = None,
-    threshold: int = 1000,
-    chunksize: Optional[int] = None,
-) -> List[R]:
-    """
-    并行 map 操作。
-
-    Args:
-        func: 处理函数(必须可 pickle,不能是 lambda 或闭包)
-        data: 数据列表
-        workers: 进程数,None 则使用 CPU 核数
-        threshold: 数据量阈值,低于此值使用串行
-        chunksize: 每个进程的任务块大小,None 则自动计算
-
-    Returns:
-        处理结果列表(保持顺序)
-    """
-    n = len(data)
-
-    # 数据量小或指定单进程,使用串行
-    if n < threshold or workers == 1:
-        return [func(item) for item in data]
-
-    workers = workers or cpu_count()
-    workers = min(workers, n)  # 进程数不超过数据量
-
-    # 自动计算 chunksize
-    if chunksize is None:
-        chunksize = max(1, n // (workers * 4))
-
-    with Pool(processes=workers) as pool:
-        return pool.map(func, data, chunksize=chunksize)
-
-
-def parallel_imap(
-    func: Callable[[T], R],
-    data: List[T],
-    workers: Optional[int] = None,
-    threshold: int = 1000,
-    chunksize: Optional[int] = None,
-):
-    """
-    并行 imap 操作(惰性迭代器版本,支持进度回调)。
-
-    Args:
-        func: 处理函数(必须可 pickle)
-        data: 数据列表
-        workers: 进程数,None 则使用 CPU 核数
-        threshold: 数据量阈值,低于此值使用串行
-        chunksize: 每个进程的任务块大小
-
-    Yields:
-        处理结果(按顺序)
-    """
-    n = len(data)
-
-    # 数据量小或指定单进程,使用串行
-    if n < threshold or workers == 1:
-        for item in data:
-            yield func(item)
-        return
-
-    workers = workers or cpu_count()
-    workers = min(workers, n)
-
-    if chunksize is None:
-        chunksize = max(1, n // (workers * 4))
-
-    with Pool(processes=workers) as pool:
-        for result in pool.imap(func, data, chunksize=chunksize):
-            yield result
-
-
-def get_optimal_workers(data_size: int, default: Optional[int] = None) -> int:
-    """
-    根据数据量计算最优进程数。
-
-    Args:
-        data_size: 数据量
-        default: 用户指定的进程数,None 则自动计算
-
-    Returns:
-        最优进程数
-    """
-    if default is not None:
-        return default
-
-    cpu_cores = cpu_count()
-
-    # 数据量小于阈值,单进程
-    if data_size < 1000:
-        return 1
-
-    # 数据量适中,使用一半 CPU
-    if data_size < 10000:
-        return max(1, cpu_cores // 2)
-
-    # 大数据量,使用全部 CPU
-    return cpu_cores
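The deleted module's docstrings flag the usual `multiprocessing.Pool` constraint: the mapped function must be picklable (a module-level `def`, not a lambda or closure), which is part of why the serial rewrite above is simpler. For reference, a standalone sketch of the same threshold-gated `Pool.map` pattern, unrelated to dtflow's current code:

```python
# Standalone sketch of the pattern parallel.py implemented: serial below a
# size threshold, Pool.map above it, with a picklable top-level function.
from multiprocessing import Pool, cpu_count

def square(x: int) -> int:  # must be top-level so workers can unpickle it
    return x * x

if __name__ == "__main__":  # guard required on spawn-based platforms
    data = list(range(10_000))
    if len(data) < 1000:  # small inputs: skip process startup overhead
        results = [square(x) for x in data]
    else:
        with Pool(processes=cpu_count()) as pool:
            results = pool.map(square, data, chunksize=256)
    print(sum(results))
```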
{dtflow-0.5.8.dist-info → dtflow-0.5.9.dist-info}/WHEEL
File without changes
{dtflow-0.5.8.dist-info → dtflow-0.5.9.dist-info}/entry_points.txt
File without changes