dtflow 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,124 @@
1
+ """
2
+ 文本清洗工具
3
+
4
+ 提供 LLM 输出的常见清洗函数:
5
+ - strip_think_tags: 去除 <think>...</think> 思考链内容
6
+ - extract_code_snippets: 提取 ``` 代码块
7
+ - parse_generic_tags: 解析 <tag>content</tag> 格式标签
8
+ """
9
+
10
+ import re
11
+ from typing import Dict, List
12
+
13
+
14
def strip_think_tags(text: str) -> str:
    """Remove every ``<think>...</think>`` span from the text.

    Args:
        text: Input text, typically raw LLM output.

    Returns:
        The text with all closed think blocks removed and the surrounding
        whitespace stripped. Falsy input (such as an empty string) is
        returned unchanged.

    Examples:
        >>> strip_think_tags("<think>let me think...</think>The answer is 42")
        'The answer is 42'
    """
    if text:
        # DOTALL lets a think block span several lines; the lazy quantifier
        # stops at the first closing tag so separate blocks stay separate.
        without_thoughts = re.sub(
            r"<think>.*?</think>", "", text, flags=re.DOTALL
        )
        return without_thoughts.strip()
    return text
30
+
31
+
32
def _balanced_brace_spans(text: str) -> List[str]:
    """Return the outermost balanced ``{...}`` spans of the text, in order.

    Braces are paired with a stack, so nested objects are kept whole.
    Unmatched braces are ignored. Braces inside string literals are not
    treated specially.
    """
    open_positions: List[int] = []
    pairs = []
    for index, char in enumerate(text):
        if char == "{":
            open_positions.append(index)
        elif char == "}" and open_positions:
            pairs.append((open_positions.pop(), index))

    # Keep only pairs that are not nested inside an already emitted pair.
    pairs.sort()
    spans: List[str] = []
    covered_until = -1
    for start, end in pairs:
        if start > covered_until:
            spans.append(text[start:end + 1])
            covered_until = end
    return spans


def extract_code_snippets(text: str, strict: bool = True) -> List[Dict[str, str]]:
    r"""Extract fenced (```) code blocks from the text.

    Args:
        text: Input text, typically raw LLM output.
        strict: When True, only ```lang ... ``` fences are matched. When
            False, balanced ``{...}`` spans found outside the fences are
            also returned (with nested braces kept intact).

    Returns:
        A list of ``{"language": ..., "code": ...}`` dicts. Fenced blocks
        come first, then brace spans. ``language`` is ``"unknown"`` when the
        fence carries no language tag and for every brace span. Falsy input
        yields an empty list.

    Examples:
        >>> extract_code_snippets('```json\n{"a": 1}\n```')
        [{'language': 'json', 'code': '{"a": 1}'}]
        >>> extract_code_snippets('{"a": {"b": 1}}', strict=False)
        [{'language': 'unknown', 'code': '{"a": {"b": 1}}'}]
    """
    if not text:
        return []

    # The language tag may contain "+", "#", "." and "-" so that fences such
    # as ```c++, ```c# or ```objective-c are not split after the first word.
    fence_pattern = r"```([\w+#.-]+)?\s*([\s\S]*?)```"

    snippets = [
        {"language": lang if lang else "unknown", "code": code.strip()}
        for lang, code in re.findall(fence_pattern, text)
    ]

    if not strict:
        # Drop the fenced blocks first so their contents are not reported a
        # second time as brace spans.
        outside_fences = re.sub(fence_pattern, "", text)
        snippets.extend(
            {"language": "unknown", "code": span}
            for span in _balanced_brace_spans(outside_fences)
        )

    return snippets
71
+
72
+
73
def parse_generic_tags(text: str, strict: bool = False) -> Dict[str, str]:
    """Parse XML-style tags into a ``{label: content}`` dict.

    Two modes are supported:
    - ``strict=True``: only closed tags ``<label>content</label>`` match.
    - ``strict=False``: open tags ``<label>content`` are matched as well;
      closed tags take priority over open ones with the same label.

    Labels may be any text without ``>``, including non-ASCII names.

    Args:
        text: Input text.
        strict: Whether to accept closed tags only.

    Returns:
        Mapping from stripped label to stripped content. For repeated closed
        tags the last occurrence wins; an open tag never overrides a label
        that is already present. Falsy input yields an empty dict.

    Examples:
        >>> parse_generic_tags("<tag>content</tag>")
        {'tag': 'content'}
        >>> parse_generic_tags("<a>hello<b>world", strict=False)
        {'a': 'hello', 'b': 'world'}
    """
    if not text:
        return {}

    result: Dict[str, str] = {}

    def record_closed_tag(match_obj):
        # Store the pair and erase it from the text, so that the open-tag
        # pass below only sees what the closed tags did not consume.
        result[match_obj.group(1).strip()] = match_obj.group(2).strip()
        return ""

    # 1. Closed tags first. The backreference forces a matching closing tag.
    pattern_closed = r"<([^>]+)>\s*(.*?)\s*</\1>"
    remaining_text = re.sub(
        pattern_closed, record_closed_tag, text, flags=re.DOTALL
    )
    if strict:
        return result

    # 2. Open tags in the leftover text. Content runs up to the next tag or
    #    the end of the text.
    pattern_open = r"<([^>]+)>\s*(.*?)(?=<[^>]+>|$)"
    for label, content in re.findall(pattern_open, remaining_text, re.DOTALL):
        label_stripped = label.strip()
        if label_stripped.startswith("/"):
            # A stray closing tag such as </b> is not a label of its own.
            continue
        result.setdefault(label_stripped, content.strip())

    return result
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dtflow
3
- Version: 0.5.7
3
+ Version: 0.5.9
4
4
  Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
5
5
  Project-URL: Homepage, https://github.com/yourusername/DataTransformer
6
6
  Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
@@ -44,6 +44,7 @@ Requires-Dist: flake8>=3.9.0; extra == 'dev'
44
44
  Requires-Dist: huggingface-hub>=0.20.0; extra == 'dev'
45
45
  Requires-Dist: isort>=5.9.0; extra == 'dev'
46
46
  Requires-Dist: mypy>=0.910; extra == 'dev'
47
+ Requires-Dist: pandas>=1.3.0; extra == 'dev'
47
48
  Requires-Dist: pyarrow; extra == 'dev'
48
49
  Requires-Dist: pytest-cov>=2.12.0; extra == 'dev'
49
50
  Requires-Dist: pytest>=6.0.0; extra == 'dev'
@@ -57,10 +58,14 @@ Provides-Extra: docs
57
58
  Requires-Dist: myst-parser>=0.15.0; extra == 'docs'
58
59
  Requires-Dist: sphinx-rtd-theme>=0.5.0; extra == 'docs'
59
60
  Requires-Dist: sphinx>=4.0.0; extra == 'docs'
61
+ Provides-Extra: eval
62
+ Requires-Dist: pandas>=1.3.0; extra == 'eval'
63
+ Requires-Dist: scikit-learn>=0.24.0; extra == 'eval'
60
64
  Provides-Extra: full
61
65
  Requires-Dist: datasets>=2.0.0; extra == 'full'
62
66
  Requires-Dist: datasketch>=1.5.0; extra == 'full'
63
67
  Requires-Dist: huggingface-hub>=0.20.0; extra == 'full'
68
+ Requires-Dist: pandas>=1.3.0; extra == 'full'
64
69
  Requires-Dist: pyarrow; extra == 'full'
65
70
  Requires-Dist: rich>=10.0.0; extra == 'full'
66
71
  Requires-Dist: scikit-learn>=0.24.0; extra == 'full'
@@ -435,6 +440,13 @@ dt sample data.jsonl 1000 --by=messages.# # 按消息数量分层采样
435
440
  dt sample data.jsonl --where="category=tech" # 筛选后采样
436
441
  dt sample data.jsonl --where="messages.#>=2" # 多条件筛选
437
442
 
443
+ # 按行范围查看(Python 切片语法)
444
+ dt slice data.jsonl 10:20 # 第 10-19 行(0-based,左闭右开)
445
+ dt slice data.jsonl :100 # 前 100 行
446
+ dt slice data.jsonl 100: # 第 100 行到末尾
447
+ dt slice data.jsonl 10:20 -o sliced.jsonl # 保存到文件
448
+ dt slice data.jsonl 10:20 -f question,answer # 只显示指定字段
449
+
438
450
  # 数据转换 - 预设模式
439
451
  dt transform data.jsonl --preset=openai_chat
440
452
  dt transform data.jsonl --preset=alpaca
@@ -468,6 +480,9 @@ dt clean data.jsonl --max-len=messages[-1].content:500 # 最后一条消息最
468
480
  dt clean data.jsonl --keep=question,answer # 只保留这些字段
469
481
  dt clean data.jsonl --drop=metadata # 删除指定字段
470
482
  dt clean data.jsonl --strip # 去除字符串首尾空白
483
+ dt clean data.jsonl --min-tokens=content:10 # 最少 10 tokens
484
+ dt clean data.jsonl --max-tokens=content:1000 # 最多 1000 tokens
485
+ dt clean data.jsonl --min-tokens=text:50 -m gpt-4 # 指定分词器
471
486
 
472
487
  # 数据去重
473
488
  dt dedupe data.jsonl # 全量精确去重
@@ -476,11 +491,26 @@ dt dedupe data.jsonl --key=meta.id # 按嵌套字段去重
476
491
  dt dedupe data.jsonl --key=messages[0].content # 按第一条消息内容去重
477
492
  dt dedupe data.jsonl --key=text --similar=0.8 # 相似度去重
478
493
 
494
+ # 数据集切分
495
+ dt split data.jsonl --ratio=0.8 --seed=42 # 二分: train/test
496
+ dt split data.jsonl --ratio=0.7,0.15,0.15 # 三分: train/val/test
497
+ dt split data.jsonl --ratio=0.8 -o /tmp/output # 指定输出目录
498
+
499
+ # 训练框架导出
500
+ dt export data.jsonl --framework=llama-factory # 导出到 LLaMA-Factory
501
+ dt export data.jsonl -f swift -o ./swift_out # 导出到 ms-swift
502
+ dt export data.jsonl -f axolotl # 导出到 Axolotl
503
+ dt export data.jsonl -f llama-factory --check # 仅检查兼容性
504
+
479
505
  # 文件拼接
480
506
  dt concat a.jsonl b.jsonl -o merged.jsonl
481
507
 
482
508
  # 数据统计
483
- dt stats data.jsonl
509
+ dt stats data.jsonl # 快速模式
510
+ dt stats data.jsonl --full # 完整模式(含值分布)
511
+ dt stats data.jsonl --full --field=category # 指定字段统计
512
+ dt stats data.jsonl --full --expand=tags # 展开 list 字段统计元素分布
513
+ dt stats data.jsonl --full --expand='messages[*].role' # 展开嵌套 list 字段
484
514
 
485
515
  # Claude Code Skill 安装
486
516
  dt install-skill # 安装到 ~/.claude/skills/
@@ -516,6 +546,8 @@ CLI 命令中的字段参数支持嵌套路径语法,可访问深层嵌套的
516
546
  | `clean` | `--drop-empty=` | `--drop-empty=meta.source` |
517
547
  | `clean` | `--min-len=` | `--min-len=messages.#:2` |
518
548
  | `clean` | `--max-len=` | `--max-len=messages[-1].content:500` |
549
+ | `clean` | `--min-tokens=` | `--min-tokens=content:10` |
550
+ | `clean` | `--max-tokens=` | `--max-tokens=content:1000` |
519
551
  | `token-stats` | `--field=` | `--field=messages[-1].content` |
520
552
  | `diff` | `--key=` | `--key=meta.uuid` |
521
553
 
@@ -1,8 +1,9 @@
1
- dtflow/SKILL.md,sha256=sHf6i6DKUCca5zvSJ67VHu05tFlST4mYgnoURXVe1g0,7836
2
- dtflow/__init__.py,sha256=Ee7CDDxzki69MEGeXB5bczuMts5OwZZ-jVsKjH_rD_0,3031
3
- dtflow/__main__.py,sha256=3LXTku09Fw1dsgTUtX1UJCmE20qKeZpNga3UqmI3UiY,12145
1
+ dtflow/SKILL.md,sha256=hPxJhroGmNbBv8MLZUkOA2yW1TDdUKEUYYlz9tW2mao,10393
2
+ dtflow/__init__.py,sha256=9ZqhqD8qQM9w2dfHKyUWIaqSX-X4elWtbaQN4CNBhgg,3031
3
+ dtflow/__main__.py,sha256=gg3v7u-Ot7AicgKrP1fuyKtMJXVduNuLmhy7L1LUPDg,17710
4
4
  dtflow/converters.py,sha256=X3qeFD7FCOMnfiP3MicL5MXimOm4XUYBs5pczIkudU0,22331
5
5
  dtflow/core.py,sha256=qMo6B3LK--TWRK7ZBKObGcs3pKFnd0NPoaM0T8JC7Jw,38135
6
+ dtflow/eval.py,sha256=_c-XP2zsOBznYltSyKEScOqvmPVX2orqepg5cNhXXB0,9836
6
7
  dtflow/framework.py,sha256=jyICi_RWHjX7WfsXdSbWmP1SL7y1OWSPyd5G5Y-lvg4,17578
7
8
  dtflow/lineage.py,sha256=jie3OL1qK90-_cOOqqLbhSJ1oGUktDM1x5HRpQ5Qiyc,12800
8
9
  dtflow/pipeline.py,sha256=zZaC4fg5vsp_30Fhbg75vu0yggsdvf28bWBiVDWzZ6Y,13901
@@ -11,15 +12,18 @@ dtflow/schema.py,sha256=IFcij22_UFKcgKT1YWwRg2QJO0vcAvCb1arZmsGByts,16824
11
12
  dtflow/streaming.py,sha256=dxpNd1-Wz_PTLTdvM5qn06_2TJr5NRlIIuw0LOSS2Iw,24755
12
13
  dtflow/tokenizers.py,sha256=7ZAelSmcDxLWH5kICgH9Q1ULH3_BfDZb9suHMjJJRZU,20589
13
14
  dtflow/cli/__init__.py,sha256=QhZ-thgx9IBTFII7T_hdoWFUl0CCsdGQHN5ZEZw2XB0,423
14
- dtflow/cli/clean.py,sha256=y9VCRibgK1j8WIY3h0XZX0m93EdELQC7TdnseMWwS-0,17799
15
- dtflow/cli/commands.py,sha256=zKUG-B9Az-spqyqM00cR8Sgc2UgeOPQDThJFHWDNO_w,1336
15
+ dtflow/cli/clean.py,sha256=BEQQlH2q6luCbx51M3oxxOwcnwlOA8vo9WX3Fp7I6AY,29498
16
+ dtflow/cli/commands.py,sha256=LvyDQ_nWUM7UlPDEFQadRdw5O2ZKDLgF41_xAJRhYxI,1583
16
17
  dtflow/cli/common.py,sha256=gCwnF5Sw2ploqfZJO_z3Ms9mR1HNT7Lj6ydHn0uVaIw,13817
18
+ dtflow/cli/eval.py,sha256=c53kCRH86k2Q_6vESKFlcepcNnTpO9O68agWK4_oJj8,9582
19
+ dtflow/cli/export.py,sha256=loRfVPwEVsDw3ZMKEYGp0Hy38kYZG2QT8JCMbz1dRzU,2156
17
20
  dtflow/cli/io_ops.py,sha256=BMDisP6dxzzmSjYwmeFwaHmpHHPqirmXAWeNTD-9MQM,13254
18
21
  dtflow/cli/lineage.py,sha256=_lNh35nF9AA0Zy6FyZ4g8IzrXH2ZQnp3inF-o2Hs1pw,1383
19
22
  dtflow/cli/pipeline.py,sha256=QNEo-BJlaC1CVnVeRZr7TwfuZYloJ4TebIzJ5ALzry0,1426
20
- dtflow/cli/sample.py,sha256=pubpx4AIzsarBEalD150MC2apYQSt4bal70IZkTfFO0,15475
23
+ dtflow/cli/sample.py,sha256=etbro5I0pyNgn0Qfhp1M6Bh-95JN-AntDa5AwVe_oKY,18269
21
24
  dtflow/cli/skill.py,sha256=opiTEBejA7JHKrEMftMOPDQlOgZ4n59rwaHXGU1Nukk,2022
22
- dtflow/cli/stats.py,sha256=u4ehCfgw1X8WuOyAjrApMRgcIO3BVmINbsTjxEscQro,24086
25
+ dtflow/cli/split.py,sha256=96bhWnxHnjIqifoliLgciApkLbwQU8bWHovK8bcMk9g,3667
26
+ dtflow/cli/stats.py,sha256=Jx3d4X0ftgpzU5q5RAWZEVJWwXviQTF4EAwBmz1IliA,31366
23
27
  dtflow/cli/transform.py,sha256=w6xqMOxPxQvL2u_BPCfpDHuPSC9gmcqMPVN8s-B6bbY,15052
24
28
  dtflow/cli/validate.py,sha256=65aGVlMS_Rq0Ch0YQ-TclVJ03RQP4CnG137wthzb8Ao,4384
25
29
  dtflow/storage/__init__.py,sha256=C0jpWNQU808Ezz7lWneddABal3wILy8ijFUNiSKbHV4,362
@@ -28,7 +32,8 @@ dtflow/utils/__init__.py,sha256=Pn-ltwV04fBQmeZG7FxInDQmzH29LYOi90LgeLMEuQk,506
28
32
  dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
29
33
  dtflow/utils/field_path.py,sha256=K8nU196RxTSJ1OoieTWGcYOWl9KjGq2iSxCAkfjECuM,7621
30
34
  dtflow/utils/helpers.py,sha256=JXN176_B2pm53GLVyZ1wj3wrmBJG52Tkw6AMQSdj7M8,791
31
- dtflow-0.5.7.dist-info/METADATA,sha256=mlWaRHSM1ZucQrAa8PGcHzjHj2RQPBynnmdA_JoNSNI,23899
32
- dtflow-0.5.7.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
33
- dtflow-0.5.7.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
34
- dtflow-0.5.7.dist-info/RECORD,,
35
+ dtflow/utils/text_parser.py,sha256=0t2TMOSha4dTiDu9H4ygdb67cI20zhtBH1XavDspL_g,3727
36
+ dtflow-0.5.9.dist-info/METADATA,sha256=Pu92Dz2vj7U_dki4A0e5xgka36BTT9K2PnN1LIeEhN0,25839
37
+ dtflow-0.5.9.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
38
+ dtflow-0.5.9.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
39
+ dtflow-0.5.9.dist-info/RECORD,,
File without changes