PyPI - dtflow - Versions diffs - 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

dtflow 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

dtflow/__init__.py +69 -58
dtflow/__main__.py +29 -20
dtflow/cli/__init__.py +25 -4
dtflow/cli/commands.py +184 -93
dtflow/converters.py +39 -23
dtflow/core.py +79 -51
dtflow/lineage.py +6 -3
dtflow/mcp/__init__.py +1 -0
dtflow/mcp/__main__.py +2 -0
dtflow/mcp/cli.py +22 -4
dtflow/mcp/docs.py +0 -5
dtflow/pipeline.py +33 -23
dtflow/presets.py +24 -22
dtflow/storage/__init__.py +11 -10
dtflow/storage/io.py +19 -10
dtflow/streaming.py +13 -18
dtflow/tokenizers.py +32 -12
dtflow/utils/__init__.py +20 -1
dtflow/utils/display.py +23 -23
dtflow/utils/field_path.py +274 -0
{dtflow-0.3.1.dist-info → dtflow-0.4.0.dist-info}/METADATA +48 -3
dtflow-0.4.0.dist-info/RECORD +25 -0
dtflow-0.3.1.dist-info/RECORD +0 -24
{dtflow-0.3.1.dist-info → dtflow-0.4.0.dist-info}/WHEEL +0 -0
{dtflow-0.3.1.dist-info → dtflow-0.4.0.dist-info}/entry_points.txt +0 -0

dtflow/__init__.py CHANGED Viewed

@@ -7,77 +7,88 @@ DataTransformer: 简洁的数据格式转换工具
 - tokenizers: Token 统计和过滤
 - converters: HuggingFace/OpenAI 等格式转换
 """
+from .converters import (  # LLaMA-Factory 扩展; ms-swift
+    from_hf_dataset,
+    from_openai_batch,
+    messages_to_text,
+    to_axolotl,
+    to_hf_chat_format,
+    to_hf_dataset,
+    to_llama_factory,
+    to_llama_factory_sharegpt,
+    to_llama_factory_vlm,
+    to_llama_factory_vlm_sharegpt,
+    to_openai_batch,
+    to_swift_messages,
+    to_swift_query_response,
+    to_swift_vlm,
+)
 from .core import DataTransformer, DictWrapper, TransformError, TransformErrors
 from .presets import get_preset, list_presets
-from .storage import save_data, load_data, sample_file
+from .storage import load_data, sample_file, save_data
+from .streaming import StreamingTransformer, load_sharded, load_stream, process_shards
 from .tokenizers import (
-    count_tokens, token_counter, token_filter, token_stats,
-    messages_token_counter, messages_token_filter, messages_token_stats,
-    DEFAULT_MODEL, MODEL_ALIASES, OPENAI_MODELS, resolve_model,
-)
-from .converters import (
-    to_hf_dataset, from_hf_dataset, to_hf_chat_format,
-    from_openai_batch, to_openai_batch,
-    to_llama_factory, to_axolotl, messages_to_text,
-    # LLaMA-Factory 扩展
-    to_llama_factory_sharegpt, to_llama_factory_vlm, to_llama_factory_vlm_sharegpt,
-    # ms-swift
-    to_swift_messages, to_swift_query_response, to_swift_vlm,
-)
-from .streaming import (
-    StreamingTransformer,
-    load_stream,
-    load_sharded,
-    process_shards,
+    DEFAULT_MODEL,
+    MODEL_ALIASES,
+    OPENAI_MODELS,
+    count_tokens,
+    messages_token_counter,
+    messages_token_filter,
+    messages_token_stats,
+    resolve_model,
+    token_counter,
+    token_filter,
+    token_stats,
 )
-__version__ = '0.3.1'
+__version__ = "0.4.0"
 __all__ = [
     # core
-    'DataTransformer',
-    'DictWrapper',
-    'TransformError',
-    'TransformErrors',
+    "DataTransformer",
+    "DictWrapper",
+    "TransformError",
+    "TransformErrors",
     # presets
-    'get_preset',
-    'list_presets',
+    "get_preset",
+    "list_presets",
     # storage
-    'save_data',
-    'load_data',
-    'sample_file',
+    "save_data",
+    "load_data",
+    "sample_file",
     # tokenizers
-    'count_tokens',
-    'token_counter',
-    'token_filter',
-    'token_stats',
-    'messages_token_counter',
-    'messages_token_filter',
-    'messages_token_stats',
-    'DEFAULT_MODEL',
-    'MODEL_ALIASES',
-    'OPENAI_MODELS',
-    'resolve_model',
+    "count_tokens",
+    "token_counter",
+    "token_filter",
+    "token_stats",
+    "messages_token_counter",
+    "messages_token_filter",
+    "messages_token_stats",
+    "DEFAULT_MODEL",
+    "MODEL_ALIASES",
+    "OPENAI_MODELS",
+    "resolve_model",
     # converters
-    'to_hf_dataset',
-    'from_hf_dataset',
-    'to_hf_chat_format',
-    'from_openai_batch',
-    'to_openai_batch',
-    'to_llama_factory',
-    'to_axolotl',
-    'messages_to_text',
+    "to_hf_dataset",
+    "from_hf_dataset",
+    "to_hf_chat_format",
+    "from_openai_batch",
+    "to_openai_batch",
+    "to_llama_factory",
+    "to_axolotl",
+    "messages_to_text",
     # LLaMA-Factory 扩展
-    'to_llama_factory_sharegpt',
-    'to_llama_factory_vlm',
-    'to_llama_factory_vlm_sharegpt',
+    "to_llama_factory_sharegpt",
+    "to_llama_factory_vlm",
+    "to_llama_factory_vlm_sharegpt",
     # ms-swift
-    'to_swift_messages',
-    'to_swift_query_response',
-    'to_swift_vlm',
+    "to_swift_messages",
+    "to_swift_query_response",
+    "to_swift_vlm",
     # streaming
-    'StreamingTransformer',
-    'load_stream',
-    'load_sharded',
-    'process_shards',
+    "StreamingTransformer",
+    "load_stream",
+    "load_sharded",
+    "process_shards",
 ]

dtflow/__main__.py CHANGED Viewed

@@ -21,26 +21,25 @@ Commands:
     mcp          MCP 服务管理（install/uninstall/status）
     logs         日志查看工具使用说明
 """
 import os
 import sys
 from typing import List, Optional
 import typer
-from .cli.commands import (
-    sample as _sample,
-    head as _head,
-    tail as _tail,
-    transform as _transform,
-    dedupe as _dedupe,
-    concat as _concat,
-    stats as _stats,
-    clean as _clean,
-    run as _run,
-    token_stats as _token_stats,
-    diff as _diff,
-    history as _history,
-)
+from .cli.commands import clean as _clean
+from .cli.commands import concat as _concat
+from .cli.commands import dedupe as _dedupe
+from .cli.commands import diff as _diff
+from .cli.commands import head as _head
+from .cli.commands import history as _history
+from .cli.commands import run as _run
+from .cli.commands import sample as _sample
+from .cli.commands import stats as _stats
+from .cli.commands import tail as _tail
+from .cli.commands import token_stats as _token_stats
+from .cli.commands import transform as _transform
 # 创建主应用
 app = typer.Typer(
@@ -53,6 +52,7 @@ app = typer.Typer(
 # ============ 数据预览命令 ============
 @app.command()
 def sample(
     filename: str = typer.Argument(..., help="输入文件路径"),
@@ -92,6 +92,7 @@ def tail(
 # ============ 数据转换命令 ============
 @app.command()
 def transform(
     filename: str = typer.Argument(..., help="输入文件路径"),
@@ -116,6 +117,7 @@ def run(
 # ============ 数据处理命令 ============
 @app.command()
 def dedupe(
     filename: str = typer.Argument(..., help="输入文件路径"),
@@ -154,6 +156,7 @@ def clean(
 # ============ 数据统计命令 ============
 @app.command()
 def stats(
     filename: str = typer.Argument(..., help="输入文件路径"),
@@ -167,7 +170,9 @@ def stats(
 def token_stats(
     filename: str = typer.Argument(..., help="输入文件路径"),
     field: str = typer.Option("messages", "--field", "-f", help="统计字段"),
-    model: str = typer.Option("cl100k_base", "--model", "-m", help="分词器: cl100k_base (默认), qwen2.5, llama3, gpt-4 等"),
+    model: str = typer.Option(
+        "cl100k_base", "--model", "-m", help="分词器: cl100k_base (默认), qwen2.5, llama3, gpt-4 等"
+    ),
     detailed: bool = typer.Option(False, "--detailed", "-d", help="显示详细统计"),
 ):
     """统计数据集的 Token 信息"""
@@ -196,6 +201,7 @@ def history(
 # ============ 工具命令 ============
 @app.command()
 def logs():
     """日志查看工具使用说明"""
@@ -237,6 +243,7 @@ def install(
 ):
     """安装 Datatron MCP 服务"""
     from .mcp.cli import MCPCommands
     MCPCommands().install(name, target)
@@ -247,6 +254,7 @@ def uninstall(
 ):
     """移除 Datatron MCP 服务"""
     from .mcp.cli import MCPCommands
     MCPCommands().uninstall(name, target)
@@ -254,6 +262,7 @@ def uninstall(
 def status():
     """查看 MCP 服务安装状态"""
     from .mcp.cli import MCPCommands
     MCPCommands().status()
@@ -261,6 +270,7 @@ def status():
 def test():
     """测试 MCP 服务是否正常"""
     from .mcp.cli import MCPCommands
     MCPCommands().test()
@@ -281,10 +291,9 @@ def _show_completion_hint():
     # 显示提示（使用 stderr 避免干扰管道输出）
     from rich.console import Console
     console = Console(stderr=True)
-    console.print(
-        "[dim]💡 提示: 运行 [green]dt --install-completion[/green] 启用命令补全[/dim]"
-    )
+    console.print("[dim]💡 提示: 运行 [green]dt --install-completion[/green] 启用命令补全[/dim]")
     # 记录已提示
     try:
@@ -296,8 +305,8 @@ def _show_completion_hint():
 def main():
     # less 分页器配置（仅 Unix-like 系统）
-    if sys.platform != 'win32':
-        os.environ['PAGER'] = 'less -RXF'
+    if sys.platform != "win32":
+        os.environ["PAGER"] = "less -RXF"
     # _show_completion_hint()
     app()

dtflow/cli/__init__.py CHANGED Viewed

@@ -1,12 +1,33 @@
 """
 CLI module for DataTransformer.
 """
 from .commands import (
-    clean, concat, dedupe, diff, head, history, run,
-    sample, stats, tail, token_stats, transform
+    clean,
+    concat,
+    dedupe,
+    diff,
+    head,
+    history,
+    run,
+    sample,
+    stats,
+    tail,
+    token_stats,
+    transform,
 )
 __all__ = [
-    "sample", "head", "tail", "transform", "dedupe", "concat",
-    "stats", "clean", "run", "token_stats", "diff", "history"
+    "sample",
+    "head",
+    "tail",
+    "transform",
+    "dedupe",
+    "concat",
+    "stats",
+    "clean",
+    "run",
+    "token_stats",
+    "diff",
+    "history",
 ]

dtflow 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

dtflow 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl