dtflow 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- dtflow/__init__.py +69 -58
- dtflow/__main__.py +29 -20
- dtflow/cli/__init__.py +25 -4
- dtflow/cli/commands.py +184 -93
- dtflow/converters.py +39 -23
- dtflow/core.py +79 -51
- dtflow/lineage.py +6 -3
- dtflow/mcp/__init__.py +1 -0
- dtflow/mcp/__main__.py +2 -0
- dtflow/mcp/cli.py +22 -4
- dtflow/mcp/docs.py +0 -5
- dtflow/pipeline.py +33 -23
- dtflow/presets.py +24 -22
- dtflow/storage/__init__.py +11 -10
- dtflow/storage/io.py +19 -10
- dtflow/streaming.py +13 -18
- dtflow/tokenizers.py +32 -12
- dtflow/utils/__init__.py +20 -1
- dtflow/utils/display.py +23 -23
- dtflow/utils/field_path.py +274 -0
- {dtflow-0.3.1.dist-info → dtflow-0.4.0.dist-info}/METADATA +48 -3
- dtflow-0.4.0.dist-info/RECORD +25 -0
- dtflow-0.3.1.dist-info/RECORD +0 -24
- {dtflow-0.3.1.dist-info → dtflow-0.4.0.dist-info}/WHEEL +0 -0
- {dtflow-0.3.1.dist-info → dtflow-0.4.0.dist-info}/entry_points.txt +0 -0
dtflow/__init__.py
CHANGED
|
@@ -7,77 +7,88 @@ DataTransformer: 简洁的数据格式转换工具
|
|
|
7
7
|
- tokenizers: Token 统计和过滤
|
|
8
8
|
- converters: HuggingFace/OpenAI 等格式转换
|
|
9
9
|
"""
|
|
10
|
+
|
|
11
|
+
from .converters import ( # LLaMA-Factory 扩展; ms-swift
|
|
12
|
+
from_hf_dataset,
|
|
13
|
+
from_openai_batch,
|
|
14
|
+
messages_to_text,
|
|
15
|
+
to_axolotl,
|
|
16
|
+
to_hf_chat_format,
|
|
17
|
+
to_hf_dataset,
|
|
18
|
+
to_llama_factory,
|
|
19
|
+
to_llama_factory_sharegpt,
|
|
20
|
+
to_llama_factory_vlm,
|
|
21
|
+
to_llama_factory_vlm_sharegpt,
|
|
22
|
+
to_openai_batch,
|
|
23
|
+
to_swift_messages,
|
|
24
|
+
to_swift_query_response,
|
|
25
|
+
to_swift_vlm,
|
|
26
|
+
)
|
|
10
27
|
from .core import DataTransformer, DictWrapper, TransformError, TransformErrors
|
|
11
28
|
from .presets import get_preset, list_presets
|
|
12
|
-
from .storage import
|
|
29
|
+
from .storage import load_data, sample_file, save_data
|
|
30
|
+
from .streaming import StreamingTransformer, load_sharded, load_stream, process_shards
|
|
13
31
|
from .tokenizers import (
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
to_swift_messages, to_swift_query_response, to_swift_vlm,
|
|
26
|
-
)
|
|
27
|
-
from .streaming import (
|
|
28
|
-
StreamingTransformer,
|
|
29
|
-
load_stream,
|
|
30
|
-
load_sharded,
|
|
31
|
-
process_shards,
|
|
32
|
+
DEFAULT_MODEL,
|
|
33
|
+
MODEL_ALIASES,
|
|
34
|
+
OPENAI_MODELS,
|
|
35
|
+
count_tokens,
|
|
36
|
+
messages_token_counter,
|
|
37
|
+
messages_token_filter,
|
|
38
|
+
messages_token_stats,
|
|
39
|
+
resolve_model,
|
|
40
|
+
token_counter,
|
|
41
|
+
token_filter,
|
|
42
|
+
token_stats,
|
|
32
43
|
)
|
|
33
44
|
|
|
34
|
-
__version__ =
|
|
45
|
+
__version__ = "0.4.0"
|
|
35
46
|
|
|
36
47
|
__all__ = [
|
|
37
48
|
# core
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
49
|
+
"DataTransformer",
|
|
50
|
+
"DictWrapper",
|
|
51
|
+
"TransformError",
|
|
52
|
+
"TransformErrors",
|
|
42
53
|
# presets
|
|
43
|
-
|
|
44
|
-
|
|
54
|
+
"get_preset",
|
|
55
|
+
"list_presets",
|
|
45
56
|
# storage
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
57
|
+
"save_data",
|
|
58
|
+
"load_data",
|
|
59
|
+
"sample_file",
|
|
49
60
|
# tokenizers
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
+
"count_tokens",
|
|
62
|
+
"token_counter",
|
|
63
|
+
"token_filter",
|
|
64
|
+
"token_stats",
|
|
65
|
+
"messages_token_counter",
|
|
66
|
+
"messages_token_filter",
|
|
67
|
+
"messages_token_stats",
|
|
68
|
+
"DEFAULT_MODEL",
|
|
69
|
+
"MODEL_ALIASES",
|
|
70
|
+
"OPENAI_MODELS",
|
|
71
|
+
"resolve_model",
|
|
61
72
|
# converters
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
73
|
+
"to_hf_dataset",
|
|
74
|
+
"from_hf_dataset",
|
|
75
|
+
"to_hf_chat_format",
|
|
76
|
+
"from_openai_batch",
|
|
77
|
+
"to_openai_batch",
|
|
78
|
+
"to_llama_factory",
|
|
79
|
+
"to_axolotl",
|
|
80
|
+
"messages_to_text",
|
|
70
81
|
# LLaMA-Factory 扩展
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
82
|
+
"to_llama_factory_sharegpt",
|
|
83
|
+
"to_llama_factory_vlm",
|
|
84
|
+
"to_llama_factory_vlm_sharegpt",
|
|
74
85
|
# ms-swift
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
86
|
+
"to_swift_messages",
|
|
87
|
+
"to_swift_query_response",
|
|
88
|
+
"to_swift_vlm",
|
|
78
89
|
# streaming
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
90
|
+
"StreamingTransformer",
|
|
91
|
+
"load_stream",
|
|
92
|
+
"load_sharded",
|
|
93
|
+
"process_shards",
|
|
83
94
|
]
|
dtflow/__main__.py
CHANGED
|
@@ -21,26 +21,25 @@ Commands:
|
|
|
21
21
|
mcp MCP 服务管理(install/uninstall/status)
|
|
22
22
|
logs 日志查看工具使用说明
|
|
23
23
|
"""
|
|
24
|
+
|
|
24
25
|
import os
|
|
25
26
|
import sys
|
|
26
27
|
from typing import List, Optional
|
|
27
28
|
|
|
28
29
|
import typer
|
|
29
30
|
|
|
30
|
-
from .cli.commands import
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
history as _history,
|
|
43
|
-
)
|
|
31
|
+
from .cli.commands import clean as _clean
|
|
32
|
+
from .cli.commands import concat as _concat
|
|
33
|
+
from .cli.commands import dedupe as _dedupe
|
|
34
|
+
from .cli.commands import diff as _diff
|
|
35
|
+
from .cli.commands import head as _head
|
|
36
|
+
from .cli.commands import history as _history
|
|
37
|
+
from .cli.commands import run as _run
|
|
38
|
+
from .cli.commands import sample as _sample
|
|
39
|
+
from .cli.commands import stats as _stats
|
|
40
|
+
from .cli.commands import tail as _tail
|
|
41
|
+
from .cli.commands import token_stats as _token_stats
|
|
42
|
+
from .cli.commands import transform as _transform
|
|
44
43
|
|
|
45
44
|
# 创建主应用
|
|
46
45
|
app = typer.Typer(
|
|
@@ -53,6 +52,7 @@ app = typer.Typer(
|
|
|
53
52
|
|
|
54
53
|
# ============ 数据预览命令 ============
|
|
55
54
|
|
|
55
|
+
|
|
56
56
|
@app.command()
|
|
57
57
|
def sample(
|
|
58
58
|
filename: str = typer.Argument(..., help="输入文件路径"),
|
|
@@ -92,6 +92,7 @@ def tail(
|
|
|
92
92
|
|
|
93
93
|
# ============ 数据转换命令 ============
|
|
94
94
|
|
|
95
|
+
|
|
95
96
|
@app.command()
|
|
96
97
|
def transform(
|
|
97
98
|
filename: str = typer.Argument(..., help="输入文件路径"),
|
|
@@ -116,6 +117,7 @@ def run(
|
|
|
116
117
|
|
|
117
118
|
# ============ 数据处理命令 ============
|
|
118
119
|
|
|
120
|
+
|
|
119
121
|
@app.command()
|
|
120
122
|
def dedupe(
|
|
121
123
|
filename: str = typer.Argument(..., help="输入文件路径"),
|
|
@@ -154,6 +156,7 @@ def clean(
|
|
|
154
156
|
|
|
155
157
|
# ============ 数据统计命令 ============
|
|
156
158
|
|
|
159
|
+
|
|
157
160
|
@app.command()
|
|
158
161
|
def stats(
|
|
159
162
|
filename: str = typer.Argument(..., help="输入文件路径"),
|
|
@@ -167,7 +170,9 @@ def stats(
|
|
|
167
170
|
def token_stats(
|
|
168
171
|
filename: str = typer.Argument(..., help="输入文件路径"),
|
|
169
172
|
field: str = typer.Option("messages", "--field", "-f", help="统计字段"),
|
|
170
|
-
model: str = typer.Option(
|
|
173
|
+
model: str = typer.Option(
|
|
174
|
+
"cl100k_base", "--model", "-m", help="分词器: cl100k_base (默认), qwen2.5, llama3, gpt-4 等"
|
|
175
|
+
),
|
|
171
176
|
detailed: bool = typer.Option(False, "--detailed", "-d", help="显示详细统计"),
|
|
172
177
|
):
|
|
173
178
|
"""统计数据集的 Token 信息"""
|
|
@@ -196,6 +201,7 @@ def history(
|
|
|
196
201
|
|
|
197
202
|
# ============ 工具命令 ============
|
|
198
203
|
|
|
204
|
+
|
|
199
205
|
@app.command()
|
|
200
206
|
def logs():
|
|
201
207
|
"""日志查看工具使用说明"""
|
|
@@ -237,6 +243,7 @@ def install(
|
|
|
237
243
|
):
|
|
238
244
|
"""安装 Datatron MCP 服务"""
|
|
239
245
|
from .mcp.cli import MCPCommands
|
|
246
|
+
|
|
240
247
|
MCPCommands().install(name, target)
|
|
241
248
|
|
|
242
249
|
|
|
@@ -247,6 +254,7 @@ def uninstall(
|
|
|
247
254
|
):
|
|
248
255
|
"""移除 Datatron MCP 服务"""
|
|
249
256
|
from .mcp.cli import MCPCommands
|
|
257
|
+
|
|
250
258
|
MCPCommands().uninstall(name, target)
|
|
251
259
|
|
|
252
260
|
|
|
@@ -254,6 +262,7 @@ def uninstall(
|
|
|
254
262
|
def status():
|
|
255
263
|
"""查看 MCP 服务安装状态"""
|
|
256
264
|
from .mcp.cli import MCPCommands
|
|
265
|
+
|
|
257
266
|
MCPCommands().status()
|
|
258
267
|
|
|
259
268
|
|
|
@@ -261,6 +270,7 @@ def status():
|
|
|
261
270
|
def test():
|
|
262
271
|
"""测试 MCP 服务是否正常"""
|
|
263
272
|
from .mcp.cli import MCPCommands
|
|
273
|
+
|
|
264
274
|
MCPCommands().test()
|
|
265
275
|
|
|
266
276
|
|
|
@@ -281,10 +291,9 @@ def _show_completion_hint():
|
|
|
281
291
|
|
|
282
292
|
# 显示提示(使用 stderr 避免干扰管道输出)
|
|
283
293
|
from rich.console import Console
|
|
294
|
+
|
|
284
295
|
console = Console(stderr=True)
|
|
285
|
-
console.print(
|
|
286
|
-
"[dim]💡 提示: 运行 [green]dt --install-completion[/green] 启用命令补全[/dim]"
|
|
287
|
-
)
|
|
296
|
+
console.print("[dim]💡 提示: 运行 [green]dt --install-completion[/green] 启用命令补全[/dim]")
|
|
288
297
|
|
|
289
298
|
# 记录已提示
|
|
290
299
|
try:
|
|
@@ -296,8 +305,8 @@ def _show_completion_hint():
|
|
|
296
305
|
|
|
297
306
|
def main():
|
|
298
307
|
# less 分页器配置(仅 Unix-like 系统)
|
|
299
|
-
if sys.platform !=
|
|
300
|
-
os.environ[
|
|
308
|
+
if sys.platform != "win32":
|
|
309
|
+
os.environ["PAGER"] = "less -RXF"
|
|
301
310
|
|
|
302
311
|
# _show_completion_hint()
|
|
303
312
|
app()
|
dtflow/cli/__init__.py
CHANGED
|
@@ -1,12 +1,33 @@
|
|
|
1
1
|
"""
|
|
2
2
|
CLI module for DataTransformer.
|
|
3
3
|
"""
|
|
4
|
+
|
|
4
5
|
from .commands import (
|
|
5
|
-
clean,
|
|
6
|
-
|
|
6
|
+
clean,
|
|
7
|
+
concat,
|
|
8
|
+
dedupe,
|
|
9
|
+
diff,
|
|
10
|
+
head,
|
|
11
|
+
history,
|
|
12
|
+
run,
|
|
13
|
+
sample,
|
|
14
|
+
stats,
|
|
15
|
+
tail,
|
|
16
|
+
token_stats,
|
|
17
|
+
transform,
|
|
7
18
|
)
|
|
8
19
|
|
|
9
20
|
__all__ = [
|
|
10
|
-
"sample",
|
|
11
|
-
"
|
|
21
|
+
"sample",
|
|
22
|
+
"head",
|
|
23
|
+
"tail",
|
|
24
|
+
"transform",
|
|
25
|
+
"dedupe",
|
|
26
|
+
"concat",
|
|
27
|
+
"stats",
|
|
28
|
+
"clean",
|
|
29
|
+
"run",
|
|
30
|
+
"token_stats",
|
|
31
|
+
"diff",
|
|
32
|
+
"history",
|
|
12
33
|
]
|