dtflow 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dtflow/__init__.py CHANGED
@@ -7,77 +7,88 @@ DataTransformer: 简洁的数据格式转换工具
 - tokenizers: Token 统计和过滤
 - converters: HuggingFace/OpenAI 等格式转换
 """
+
+from .converters import (  # LLaMA-Factory 扩展; ms-swift
+    from_hf_dataset,
+    from_openai_batch,
+    messages_to_text,
+    to_axolotl,
+    to_hf_chat_format,
+    to_hf_dataset,
+    to_llama_factory,
+    to_llama_factory_sharegpt,
+    to_llama_factory_vlm,
+    to_llama_factory_vlm_sharegpt,
+    to_openai_batch,
+    to_swift_messages,
+    to_swift_query_response,
+    to_swift_vlm,
+)
 from .core import DataTransformer, DictWrapper, TransformError, TransformErrors
 from .presets import get_preset, list_presets
-from .storage import save_data, load_data, sample_file
+from .storage import load_data, sample_file, save_data
+from .streaming import StreamingTransformer, load_sharded, load_stream, process_shards
 from .tokenizers import (
-    count_tokens, token_counter, token_filter, token_stats,
-    messages_token_counter, messages_token_filter, messages_token_stats,
-    DEFAULT_MODEL, MODEL_ALIASES, OPENAI_MODELS, resolve_model,
-)
-from .converters import (
-    to_hf_dataset, from_hf_dataset, to_hf_chat_format,
-    from_openai_batch, to_openai_batch,
-    to_llama_factory, to_axolotl, messages_to_text,
-    # LLaMA-Factory 扩展
-    to_llama_factory_sharegpt, to_llama_factory_vlm, to_llama_factory_vlm_sharegpt,
-    # ms-swift
-    to_swift_messages, to_swift_query_response, to_swift_vlm,
-)
-from .streaming import (
-    StreamingTransformer,
-    load_stream,
-    load_sharded,
-    process_shards,
+    DEFAULT_MODEL,
+    MODEL_ALIASES,
+    OPENAI_MODELS,
+    count_tokens,
+    messages_token_counter,
+    messages_token_filter,
+    messages_token_stats,
+    resolve_model,
+    token_counter,
+    token_filter,
+    token_stats,
 )
 
-__version__ = '0.3.1'
+__version__ = "0.4.0"
 
 __all__ = [
     # core
-    'DataTransformer',
-    'DictWrapper',
-    'TransformError',
-    'TransformErrors',
+    "DataTransformer",
+    "DictWrapper",
+    "TransformError",
+    "TransformErrors",
     # presets
-    'get_preset',
-    'list_presets',
+    "get_preset",
+    "list_presets",
     # storage
-    'save_data',
-    'load_data',
-    'sample_file',
+    "save_data",
+    "load_data",
+    "sample_file",
     # tokenizers
-    'count_tokens',
-    'token_counter',
-    'token_filter',
-    'token_stats',
-    'messages_token_counter',
-    'messages_token_filter',
-    'messages_token_stats',
-    'DEFAULT_MODEL',
-    'MODEL_ALIASES',
-    'OPENAI_MODELS',
-    'resolve_model',
+    "count_tokens",
+    "token_counter",
+    "token_filter",
+    "token_stats",
+    "messages_token_counter",
+    "messages_token_filter",
+    "messages_token_stats",
+    "DEFAULT_MODEL",
+    "MODEL_ALIASES",
+    "OPENAI_MODELS",
+    "resolve_model",
     # converters
-    'to_hf_dataset',
-    'from_hf_dataset',
-    'to_hf_chat_format',
-    'from_openai_batch',
-    'to_openai_batch',
-    'to_llama_factory',
-    'to_axolotl',
-    'messages_to_text',
+    "to_hf_dataset",
+    "from_hf_dataset",
+    "to_hf_chat_format",
+    "from_openai_batch",
+    "to_openai_batch",
+    "to_llama_factory",
+    "to_axolotl",
+    "messages_to_text",
     # LLaMA-Factory 扩展
-    'to_llama_factory_sharegpt',
-    'to_llama_factory_vlm',
-    'to_llama_factory_vlm_sharegpt',
+    "to_llama_factory_sharegpt",
+    "to_llama_factory_vlm",
+    "to_llama_factory_vlm_sharegpt",
     # ms-swift
-    'to_swift_messages',
-    'to_swift_query_response',
-    'to_swift_vlm',
+    "to_swift_messages",
+    "to_swift_query_response",
+    "to_swift_vlm",
     # streaming
-    'StreamingTransformer',
-    'load_stream',
-    'load_sharded',
-    'process_shards',
+    "StreamingTransformer",
+    "load_stream",
+    "load_sharded",
+    "process_shards",
 ]
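Everything in this hunk is mechanical: imports are re-sorted and split, string literals switch from single to double quotes, and `__version__` is bumped to 0.4.0. The set of names exported through `__all__` is identical in 0.3.1 and 0.4.0 (the new layout is consistent with an isort/Black-style formatter pass, though no tooling config appears in this diff). A quick way to confirm the re-exports still resolve after upgrading is an import smoke test like the sketch below; it only assumes dtflow 0.4.0 is installed and uses the names from the `__all__` shown above.

# Import smoke test for the 0.4.0 public surface (assumes dtflow 0.4.0 is installed).
import dtflow

assert dtflow.__version__ == "0.4.0"

# Every name listed in __all__ above should still resolve from the package root,
# exactly as it did in 0.3.1.
missing = [name for name in dtflow.__all__ if not hasattr(dtflow, name)]
assert not missing, f"missing re-exports: {missing}"
print(f"dtflow {dtflow.__version__}: {len(dtflow.__all__)} public names OK")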
dtflow/__main__.py CHANGED
@@ -21,26 +21,25 @@ Commands:
     mcp        MCP 服务管理(install/uninstall/status)
     logs       日志查看工具使用说明
 """
+
 import os
 import sys
 from typing import List, Optional
 
 import typer
 
-from .cli.commands import (
-    sample as _sample,
-    head as _head,
-    tail as _tail,
-    transform as _transform,
-    dedupe as _dedupe,
-    concat as _concat,
-    stats as _stats,
-    clean as _clean,
-    run as _run,
-    token_stats as _token_stats,
-    diff as _diff,
-    history as _history,
-)
+from .cli.commands import clean as _clean
+from .cli.commands import concat as _concat
+from .cli.commands import dedupe as _dedupe
+from .cli.commands import diff as _diff
+from .cli.commands import head as _head
+from .cli.commands import history as _history
+from .cli.commands import run as _run
+from .cli.commands import sample as _sample
+from .cli.commands import stats as _stats
+from .cli.commands import tail as _tail
+from .cli.commands import token_stats as _token_stats
+from .cli.commands import transform as _transform
 
 # 创建主应用
 app = typer.Typer(
@@ -53,6 +52,7 @@ app = typer.Typer(
 
 # ============ 数据预览命令 ============
 
+
 @app.command()
 def sample(
     filename: str = typer.Argument(..., help="输入文件路径"),
@@ -92,6 +92,7 @@ def tail(
 
 # ============ 数据转换命令 ============
 
+
 @app.command()
 def transform(
     filename: str = typer.Argument(..., help="输入文件路径"),
@@ -116,6 +117,7 @@ def run(
 
 # ============ 数据处理命令 ============
 
+
 @app.command()
 def dedupe(
     filename: str = typer.Argument(..., help="输入文件路径"),
@@ -154,6 +156,7 @@ def clean(
 
 # ============ 数据统计命令 ============
 
+
 @app.command()
 def stats(
     filename: str = typer.Argument(..., help="输入文件路径"),
@@ -167,7 +170,9 @@ def stats(
 def token_stats(
     filename: str = typer.Argument(..., help="输入文件路径"),
     field: str = typer.Option("messages", "--field", "-f", help="统计字段"),
-    model: str = typer.Option("cl100k_base", "--model", "-m", help="分词器: cl100k_base (默认), qwen2.5, llama3, gpt-4 等"),
+    model: str = typer.Option(
+        "cl100k_base", "--model", "-m", help="分词器: cl100k_base (默认), qwen2.5, llama3, gpt-4 等"
+    ),
     detailed: bool = typer.Option(False, "--detailed", "-d", help="显示详细统计"),
 ):
     """统计数据集的 Token 信息"""
@@ -196,6 +201,7 @@ def history(
 
 # ============ 工具命令 ============
 
+
 @app.command()
 def logs():
     """日志查看工具使用说明"""
@@ -237,6 +243,7 @@ def install(
 ):
     """安装 Datatron MCP 服务"""
     from .mcp.cli import MCPCommands
+
     MCPCommands().install(name, target)
 
 
@@ -247,6 +254,7 @@ def uninstall(
 ):
     """移除 Datatron MCP 服务"""
     from .mcp.cli import MCPCommands
+
     MCPCommands().uninstall(name, target)
 
 
@@ -254,6 +262,7 @@ def uninstall(
 def status():
     """查看 MCP 服务安装状态"""
     from .mcp.cli import MCPCommands
+
     MCPCommands().status()
 
 
@@ -261,6 +270,7 @@ def status():
 def test():
     """测试 MCP 服务是否正常"""
     from .mcp.cli import MCPCommands
+
     MCPCommands().test()
 
 
@@ -281,10 +291,9 @@ def _show_completion_hint():
 
     # 显示提示(使用 stderr 避免干扰管道输出)
     from rich.console import Console
+
     console = Console(stderr=True)
-    console.print(
-        "[dim]💡 提示: 运行 [green]dt --install-completion[/green] 启用命令补全[/dim]"
-    )
+    console.print("[dim]💡 提示: 运行 [green]dt --install-completion[/green] 启用命令补全[/dim]")
 
     # 记录已提示
     try:
@@ -296,8 +305,8 @@ def _show_completion_hint():
 
 def main():
     # less 分页器配置(仅 Unix-like 系统)
-    if sys.platform != 'win32':
-        os.environ['PAGER'] = 'less -RXF'
+    if sys.platform != "win32":
+        os.environ["PAGER"] = "less -RXF"
 
     # _show_completion_hint()
     app()
dtflow/cli/__init__.py CHANGED
@@ -1,12 +1,33 @@
 """
 CLI module for DataTransformer.
 """
+
 from .commands import (
-    clean, concat, dedupe, diff, head, history, run,
-    sample, stats, tail, token_stats, transform
+    clean,
+    concat,
+    dedupe,
+    diff,
+    head,
+    history,
+    run,
+    sample,
+    stats,
+    tail,
+    token_stats,
+    transform,
 )
 
 __all__ = [
-    "sample", "head", "tail", "transform", "dedupe", "concat",
-    "stats", "clean", "run", "token_stats", "diff", "history"
+    "sample",
+    "head",
+    "tail",
+    "transform",
+    "dedupe",
+    "concat",
+    "stats",
+    "clean",
+    "run",
+    "token_stats",
+    "diff",
+    "history",
 ]
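As with the package root, `dtflow.cli` exports the same twelve command callables before and after this release; only the import and `__all__` layout changed (one name per line, trailing commas), so downstream code that imports from `dtflow.cli` is unaffected. The short check below illustrates this; it assumes only that dtflow 0.4.0 is installed and uses the `__all__` list shown above.

import dtflow.cli as cli

# The twelve command callables listed in __all__ above, unchanged from 0.3.1.
expected = {
    "sample", "head", "tail", "transform", "dedupe", "concat",
    "stats", "clean", "run", "token_stats", "diff", "history",
}
assert set(cli.__all__) == expected

from dtflow.cli import sample, token_stats  # still resolves after the re-layout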