dtflow 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtflow/SKILL.md +39 -5
- dtflow/__init__.py +1 -1
- dtflow/__main__.py +137 -8
- dtflow/cli/clean.py +294 -9
- dtflow/cli/commands.py +17 -1
- dtflow/cli/eval.py +288 -0
- dtflow/cli/export.py +81 -0
- dtflow/cli/sample.py +90 -3
- dtflow/cli/split.py +138 -0
- dtflow/cli/stats.py +224 -30
- dtflow/eval.py +276 -0
- dtflow/utils/text_parser.py +124 -0
- {dtflow-0.5.7.dist-info → dtflow-0.5.9.dist-info}/METADATA +34 -2
- {dtflow-0.5.7.dist-info → dtflow-0.5.9.dist-info}/RECORD +16 -11
- {dtflow-0.5.7.dist-info → dtflow-0.5.9.dist-info}/WHEEL +0 -0
- {dtflow-0.5.7.dist-info → dtflow-0.5.9.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""
|
|
2
|
+
文本清洗工具
|
|
3
|
+
|
|
4
|
+
提供 LLM 输出的常见清洗函数:
|
|
5
|
+
- strip_think_tags: 去除 <think>...</think> 思考链内容
|
|
6
|
+
- extract_code_snippets: 提取 ``` 代码块
|
|
7
|
+
- parse_generic_tags: 解析 <tag>content</tag> 格式标签
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from typing import Dict, List
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def strip_think_tags(text: str) -> str:
    """Remove every ``<think>...</think>`` span from the text.

    Args:
        text: Input text, typically raw LLM output.

    Returns:
        The text with all closed think spans removed and surrounding
        whitespace stripped. Falsy input (empty string, ``None``) is
        returned unchanged.

    Examples:
        >>> strip_think_tags("<think>Let me think...</think>The answer is 42")
        'The answer is 42'
    """
    if text:
        # DOTALL lets a single think span cover multiple lines; the lazy
        # quantifier keeps separate spans from being merged into one.
        think_span = re.compile(r"<think>.*?</think>", re.DOTALL)
        without_thoughts = think_span.sub("", text)
        return without_thoughts.strip()
    return text
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def extract_code_snippets(text: str, strict: bool = True) -> List[Dict[str, str]]:
    r"""Extract fenced (```) code blocks from the text.

    Args:
        text: Input text, typically raw LLM output.
        strict: When True, only fenced blocks are returned. When False,
            brace-delimited ``{...}`` spans found outside the fenced
            blocks are returned as well.

    Returns:
        A list of ``{"language": ..., "code": ...}`` dicts. Fenced blocks
        come first in document order, followed by brace spans. The
        language is ``"unknown"`` for untagged fences and for brace spans.
        Empty or ``None`` input yields an empty list.

    Examples:
        >>> extract_code_snippets("```json\n{\"a\": 1}\n```")
        [{'language': 'json', 'code': '{"a": 1}'}]
    """
    if not text:
        return []

    # The tag may contain "+", "#" and "-" so that fences such as ```c++,
    # ```c# or ```objective-c keep their full language name instead of
    # leaking the tail of the tag into the code.
    fence_pattern = r"```(\w[\w+#-]*)?\s*([\s\S]*?)```"

    snippets = [
        {"language": lang if lang else "unknown", "code": code.strip()}
        for lang, code in re.findall(fence_pattern, text)
    ]
    if strict:
        return snippets

    # Drop the fenced blocks first so their contents are not reported twice.
    remaining = re.sub(fence_pattern, "", text)
    snippets.extend(
        {"language": "unknown", "code": span.strip()}
        for span in _iter_brace_spans(remaining)
    )
    return snippets


def _iter_brace_spans(text: str) -> List[str]:
    """Return the top-level ``{...}`` spans of the text in document order.

    Braces are depth-matched so nested objects stay in one piece. Braces
    inside string literals are not treated specially. An opening brace
    that is never balanced falls back to ending at the nearest ``}``.
    """
    spans = []
    pos = 0
    while True:
        start = text.find("{", pos)
        if start == -1:
            break

        depth = 0
        end = -1
        for idx in range(start, len(text)):
            ch = text[idx]
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    end = idx
                    break

        if end == -1:
            # Unbalanced input: keep the shortest "{...}" span instead.
            end = text.find("}", start)
            if end == -1:
                break  # no closing brace anywhere after this point

        spans.append(text[start:end + 1])
        pos = end + 1
    return spans
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def parse_generic_tags(text: str, strict: bool = False) -> Dict[str, str]:
    """Parse XML-style tags into a ``{label: content}`` dict.

    Two modes are supported:
    - strict=True: only closed tags ``<label>content</label>`` are matched.
    - strict=False: open tags ``<label>content`` are matched as well, with
      closed tags taking precedence over open ones of the same label.

    Args:
        text: Input text.
        strict: Whether to accept closed tags only.

    Returns:
        A dict mapping each stripped tag label to its stripped content.
        Empty input yields an empty dict. When a closed tag repeats, the
        last occurrence wins; an open tag never overrides an existing label.

    Examples:
        >>> parse_generic_tags("<tag>content</tag>")
        {'tag': 'content'}
        >>> parse_generic_tags("<a>hello<b>world", strict=False)
        {'a': 'hello', 'b': 'world'}
    """
    if not text:
        return {}

    result: Dict[str, str] = {}

    # 1. Closed tags first. Each match is recorded and removed from the
    #    text so the open-tag pass below cannot see it again.
    def process_closed_tag(match_obj):
        result[match_obj.group(1).strip()] = match_obj.group(2).strip()
        return ""

    pattern_closed = r"<([^>]+)>\s*(.*?)\s*</\1>"
    remaining_text = re.sub(pattern_closed, process_closed_tag, str(text), flags=re.DOTALL)
    if strict:
        return result

    # 2. Open tags in whatever text is left; content runs to the next tag
    #    or to the end of the text.
    pattern_open = r"<([^>]+)>\s*(.*?)(?=<[^>]+>|$)"
    for label, content in re.findall(pattern_open, remaining_text, re.DOTALL):
        label_stripped = label.strip()
        if label_stripped.startswith("/"):
            # An orphan closing tag (e.g. a stray "</think>") is not a label.
            continue
        result.setdefault(label_stripped, content.strip())

    return result
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dtflow
|
|
3
|
-
Version: 0.5.7
|
|
3
|
+
Version: 0.5.9
|
|
4
4
|
Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
|
|
5
5
|
Project-URL: Homepage, https://github.com/yourusername/DataTransformer
|
|
6
6
|
Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
|
|
@@ -44,6 +44,7 @@ Requires-Dist: flake8>=3.9.0; extra == 'dev'
|
|
|
44
44
|
Requires-Dist: huggingface-hub>=0.20.0; extra == 'dev'
|
|
45
45
|
Requires-Dist: isort>=5.9.0; extra == 'dev'
|
|
46
46
|
Requires-Dist: mypy>=0.910; extra == 'dev'
|
|
47
|
+
Requires-Dist: pandas>=1.3.0; extra == 'dev'
|
|
47
48
|
Requires-Dist: pyarrow; extra == 'dev'
|
|
48
49
|
Requires-Dist: pytest-cov>=2.12.0; extra == 'dev'
|
|
49
50
|
Requires-Dist: pytest>=6.0.0; extra == 'dev'
|
|
@@ -57,10 +58,14 @@ Provides-Extra: docs
|
|
|
57
58
|
Requires-Dist: myst-parser>=0.15.0; extra == 'docs'
|
|
58
59
|
Requires-Dist: sphinx-rtd-theme>=0.5.0; extra == 'docs'
|
|
59
60
|
Requires-Dist: sphinx>=4.0.0; extra == 'docs'
|
|
61
|
+
Provides-Extra: eval
|
|
62
|
+
Requires-Dist: pandas>=1.3.0; extra == 'eval'
|
|
63
|
+
Requires-Dist: scikit-learn>=0.24.0; extra == 'eval'
|
|
60
64
|
Provides-Extra: full
|
|
61
65
|
Requires-Dist: datasets>=2.0.0; extra == 'full'
|
|
62
66
|
Requires-Dist: datasketch>=1.5.0; extra == 'full'
|
|
63
67
|
Requires-Dist: huggingface-hub>=0.20.0; extra == 'full'
|
|
68
|
+
Requires-Dist: pandas>=1.3.0; extra == 'full'
|
|
64
69
|
Requires-Dist: pyarrow; extra == 'full'
|
|
65
70
|
Requires-Dist: rich>=10.0.0; extra == 'full'
|
|
66
71
|
Requires-Dist: scikit-learn>=0.24.0; extra == 'full'
|
|
@@ -435,6 +440,13 @@ dt sample data.jsonl 1000 --by=messages.# # 按消息数量分层采样
|
|
|
435
440
|
dt sample data.jsonl --where="category=tech" # 筛选后采样
|
|
436
441
|
dt sample data.jsonl --where="messages.#>=2" # 多条件筛选
|
|
437
442
|
|
|
443
|
+
# 按行范围查看(Python 切片语法)
|
|
444
|
+
dt slice data.jsonl 10:20 # 第 10-19 行(0-based,左闭右开)
|
|
445
|
+
dt slice data.jsonl :100 # 前 100 行
|
|
446
|
+
dt slice data.jsonl 100: # 第 100 行到末尾
|
|
447
|
+
dt slice data.jsonl 10:20 -o sliced.jsonl # 保存到文件
|
|
448
|
+
dt slice data.jsonl 10:20 -f question,answer # 只显示指定字段
|
|
449
|
+
|
|
438
450
|
# 数据转换 - 预设模式
|
|
439
451
|
dt transform data.jsonl --preset=openai_chat
|
|
440
452
|
dt transform data.jsonl --preset=alpaca
|
|
@@ -468,6 +480,9 @@ dt clean data.jsonl --max-len=messages[-1].content:500 # 最后一条消息最
|
|
|
468
480
|
dt clean data.jsonl --keep=question,answer # 只保留这些字段
|
|
469
481
|
dt clean data.jsonl --drop=metadata # 删除指定字段
|
|
470
482
|
dt clean data.jsonl --strip # 去除字符串首尾空白
|
|
483
|
+
dt clean data.jsonl --min-tokens=content:10 # 最少 10 tokens
|
|
484
|
+
dt clean data.jsonl --max-tokens=content:1000 # 最多 1000 tokens
|
|
485
|
+
dt clean data.jsonl --min-tokens=text:50 -m gpt-4 # 指定分词器
|
|
471
486
|
|
|
472
487
|
# 数据去重
|
|
473
488
|
dt dedupe data.jsonl # 全量精确去重
|
|
@@ -476,11 +491,26 @@ dt dedupe data.jsonl --key=meta.id # 按嵌套字段去重
|
|
|
476
491
|
dt dedupe data.jsonl --key=messages[0].content # 按第一条消息内容去重
|
|
477
492
|
dt dedupe data.jsonl --key=text --similar=0.8 # 相似度去重
|
|
478
493
|
|
|
494
|
+
# 数据集切分
|
|
495
|
+
dt split data.jsonl --ratio=0.8 --seed=42 # 二分: train/test
|
|
496
|
+
dt split data.jsonl --ratio=0.7,0.15,0.15 # 三分: train/val/test
|
|
497
|
+
dt split data.jsonl --ratio=0.8 -o /tmp/output # 指定输出目录
|
|
498
|
+
|
|
499
|
+
# 训练框架导出
|
|
500
|
+
dt export data.jsonl --framework=llama-factory # 导出到 LLaMA-Factory
|
|
501
|
+
dt export data.jsonl -f swift -o ./swift_out # 导出到 ms-swift
|
|
502
|
+
dt export data.jsonl -f axolotl # 导出到 Axolotl
|
|
503
|
+
dt export data.jsonl -f llama-factory --check # 仅检查兼容性
|
|
504
|
+
|
|
479
505
|
# 文件拼接
|
|
480
506
|
dt concat a.jsonl b.jsonl -o merged.jsonl
|
|
481
507
|
|
|
482
508
|
# 数据统计
|
|
483
|
-
dt stats data.jsonl
|
|
509
|
+
dt stats data.jsonl # 快速模式
|
|
510
|
+
dt stats data.jsonl --full # 完整模式(含值分布)
|
|
511
|
+
dt stats data.jsonl --full --field=category # 指定字段统计
|
|
512
|
+
dt stats data.jsonl --full --expand=tags # 展开 list 字段统计元素分布
|
|
513
|
+
dt stats data.jsonl --full --expand='messages[*].role' # 展开嵌套 list 字段
|
|
484
514
|
|
|
485
515
|
# Claude Code Skill 安装
|
|
486
516
|
dt install-skill # 安装到 ~/.claude/skills/
|
|
@@ -516,6 +546,8 @@ CLI 命令中的字段参数支持嵌套路径语法,可访问深层嵌套的
|
|
|
516
546
|
| `clean` | `--drop-empty=` | `--drop-empty=meta.source` |
|
|
517
547
|
| `clean` | `--min-len=` | `--min-len=messages.#:2` |
|
|
518
548
|
| `clean` | `--max-len=` | `--max-len=messages[-1].content:500` |
|
|
549
|
+
| `clean` | `--min-tokens=` | `--min-tokens=content:10` |
|
|
550
|
+
| `clean` | `--max-tokens=` | `--max-tokens=content:1000` |
|
|
519
551
|
| `token-stats` | `--field=` | `--field=messages[-1].content` |
|
|
520
552
|
| `diff` | `--key=` | `--key=meta.uuid` |
|
|
521
553
|
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
dtflow/SKILL.md,sha256=
|
|
2
|
-
dtflow/__init__.py,sha256=
|
|
3
|
-
dtflow/__main__.py,sha256=
|
|
1
|
+
dtflow/SKILL.md,sha256=hPxJhroGmNbBv8MLZUkOA2yW1TDdUKEUYYlz9tW2mao,10393
|
|
2
|
+
dtflow/__init__.py,sha256=9ZqhqD8qQM9w2dfHKyUWIaqSX-X4elWtbaQN4CNBhgg,3031
|
|
3
|
+
dtflow/__main__.py,sha256=gg3v7u-Ot7AicgKrP1fuyKtMJXVduNuLmhy7L1LUPDg,17710
|
|
4
4
|
dtflow/converters.py,sha256=X3qeFD7FCOMnfiP3MicL5MXimOm4XUYBs5pczIkudU0,22331
|
|
5
5
|
dtflow/core.py,sha256=qMo6B3LK--TWRK7ZBKObGcs3pKFnd0NPoaM0T8JC7Jw,38135
|
|
6
|
+
dtflow/eval.py,sha256=_c-XP2zsOBznYltSyKEScOqvmPVX2orqepg5cNhXXB0,9836
|
|
6
7
|
dtflow/framework.py,sha256=jyICi_RWHjX7WfsXdSbWmP1SL7y1OWSPyd5G5Y-lvg4,17578
|
|
7
8
|
dtflow/lineage.py,sha256=jie3OL1qK90-_cOOqqLbhSJ1oGUktDM1x5HRpQ5Qiyc,12800
|
|
8
9
|
dtflow/pipeline.py,sha256=zZaC4fg5vsp_30Fhbg75vu0yggsdvf28bWBiVDWzZ6Y,13901
|
|
@@ -11,15 +12,18 @@ dtflow/schema.py,sha256=IFcij22_UFKcgKT1YWwRg2QJO0vcAvCb1arZmsGByts,16824
|
|
|
11
12
|
dtflow/streaming.py,sha256=dxpNd1-Wz_PTLTdvM5qn06_2TJr5NRlIIuw0LOSS2Iw,24755
|
|
12
13
|
dtflow/tokenizers.py,sha256=7ZAelSmcDxLWH5kICgH9Q1ULH3_BfDZb9suHMjJJRZU,20589
|
|
13
14
|
dtflow/cli/__init__.py,sha256=QhZ-thgx9IBTFII7T_hdoWFUl0CCsdGQHN5ZEZw2XB0,423
|
|
14
|
-
dtflow/cli/clean.py,sha256=
|
|
15
|
-
dtflow/cli/commands.py,sha256=
|
|
15
|
+
dtflow/cli/clean.py,sha256=BEQQlH2q6luCbx51M3oxxOwcnwlOA8vo9WX3Fp7I6AY,29498
|
|
16
|
+
dtflow/cli/commands.py,sha256=LvyDQ_nWUM7UlPDEFQadRdw5O2ZKDLgF41_xAJRhYxI,1583
|
|
16
17
|
dtflow/cli/common.py,sha256=gCwnF5Sw2ploqfZJO_z3Ms9mR1HNT7Lj6ydHn0uVaIw,13817
|
|
18
|
+
dtflow/cli/eval.py,sha256=c53kCRH86k2Q_6vESKFlcepcNnTpO9O68agWK4_oJj8,9582
|
|
19
|
+
dtflow/cli/export.py,sha256=loRfVPwEVsDw3ZMKEYGp0Hy38kYZG2QT8JCMbz1dRzU,2156
|
|
17
20
|
dtflow/cli/io_ops.py,sha256=BMDisP6dxzzmSjYwmeFwaHmpHHPqirmXAWeNTD-9MQM,13254
|
|
18
21
|
dtflow/cli/lineage.py,sha256=_lNh35nF9AA0Zy6FyZ4g8IzrXH2ZQnp3inF-o2Hs1pw,1383
|
|
19
22
|
dtflow/cli/pipeline.py,sha256=QNEo-BJlaC1CVnVeRZr7TwfuZYloJ4TebIzJ5ALzry0,1426
|
|
20
|
-
dtflow/cli/sample.py,sha256=
|
|
23
|
+
dtflow/cli/sample.py,sha256=etbro5I0pyNgn0Qfhp1M6Bh-95JN-AntDa5AwVe_oKY,18269
|
|
21
24
|
dtflow/cli/skill.py,sha256=opiTEBejA7JHKrEMftMOPDQlOgZ4n59rwaHXGU1Nukk,2022
|
|
22
|
-
dtflow/cli/
|
|
25
|
+
dtflow/cli/split.py,sha256=96bhWnxHnjIqifoliLgciApkLbwQU8bWHovK8bcMk9g,3667
|
|
26
|
+
dtflow/cli/stats.py,sha256=Jx3d4X0ftgpzU5q5RAWZEVJWwXviQTF4EAwBmz1IliA,31366
|
|
23
27
|
dtflow/cli/transform.py,sha256=w6xqMOxPxQvL2u_BPCfpDHuPSC9gmcqMPVN8s-B6bbY,15052
|
|
24
28
|
dtflow/cli/validate.py,sha256=65aGVlMS_Rq0Ch0YQ-TclVJ03RQP4CnG137wthzb8Ao,4384
|
|
25
29
|
dtflow/storage/__init__.py,sha256=C0jpWNQU808Ezz7lWneddABal3wILy8ijFUNiSKbHV4,362
|
|
@@ -28,7 +32,8 @@ dtflow/utils/__init__.py,sha256=Pn-ltwV04fBQmeZG7FxInDQmzH29LYOi90LgeLMEuQk,506
|
|
|
28
32
|
dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
|
|
29
33
|
dtflow/utils/field_path.py,sha256=K8nU196RxTSJ1OoieTWGcYOWl9KjGq2iSxCAkfjECuM,7621
|
|
30
34
|
dtflow/utils/helpers.py,sha256=JXN176_B2pm53GLVyZ1wj3wrmBJG52Tkw6AMQSdj7M8,791
|
|
31
|
-
dtflow
|
|
32
|
-
dtflow-0.5.
|
|
33
|
-
dtflow-0.5.
|
|
34
|
-
dtflow-0.5.
|
|
35
|
+
dtflow/utils/text_parser.py,sha256=0t2TMOSha4dTiDu9H4ygdb67cI20zhtBH1XavDspL_g,3727
|
|
36
|
+
dtflow-0.5.9.dist-info/METADATA,sha256=Pu92Dz2vj7U_dki4A0e5xgka36BTT9K2PnN1LIeEhN0,25839
|
|
37
|
+
dtflow-0.5.9.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
38
|
+
dtflow-0.5.9.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
|
|
39
|
+
dtflow-0.5.9.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|