dtflow 0.5.2__tar.gz → 0.5.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtflow-0.5.3/CHANGELOG.md +19 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/PKG-INFO +1 -1
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/__init__.py +7 -7
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/converters.py +17 -13
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/presets.py +14 -15
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/utils/__init__.py +3 -0
- dtflow-0.5.3/dtflow/utils/helpers.py +30 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/pyproject.toml +6 -4
- dtflow-0.5.3/tests/test_cli_clean.py +314 -0
- dtflow-0.5.3/tests/test_cli_sample.py +242 -0
- dtflow-0.5.3/tests/test_cli_stats.py +213 -0
- dtflow-0.5.3/tests/test_cli_transform.py +304 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/.gitignore +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/README.md +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/__main__.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/cli/__init__.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/cli/clean.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/cli/commands.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/cli/common.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/cli/io_ops.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/cli/lineage.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/cli/pipeline.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/cli/sample.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/cli/stats.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/cli/transform.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/cli/validate.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/core.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/framework.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/lineage.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/mcp/__init__.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/mcp/__main__.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/mcp/cli.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/mcp/docs.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/mcp/server.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/pipeline.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/schema.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/storage/__init__.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/storage/io.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/streaming.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/tokenizers.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/utils/display.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/dtflow/utils/field_path.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/tests/README.md +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/tests/benchmark_io.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/tests/benchmark_sharegpt.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/tests/test_cli_benchmark.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/tests/test_converters.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/tests/test_field_path.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/tests/test_framework.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/tests/test_io.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/tests/test_lineage.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/tests/test_pipeline.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/tests/test_schema.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/tests/test_streaming.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/tests/test_tokenizers.py +0 -0
- {dtflow-0.5.2 → dtflow-0.5.3}/tests/test_transformer.py +0 -0

dtflow-0.5.3/CHANGELOG.md

@@ -0,0 +1,19 @@
+# Changelog
+
+## [0.5.2] - 2026-01-18
+
+### Miscellaneous
+
+- Bump version to 0.5.2
+- Add pre-commit configuration and release script
+
+## [0.5.1] - 2026-01-18
+
+### Features
+
+- Improve text preview display for the sample command
+
+### Testing
+
+- Add notes on running the tests
+- Add performance tests for tail/token-stats/validate

{dtflow-0.5.2 → dtflow-0.5.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dtflow
-Version: 0.5.2
+Version: 0.5.3
 Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
 Project-URL: Homepage, https://github.com/yourusername/DataTransformer
 Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme

{dtflow-0.5.2 → dtflow-0.5.3}/dtflow/__init__.py

@@ -26,6 +26,12 @@ from .converters import (  # LLaMA-Factory extensions; ms-swift
     to_swift_vlm,
 )
 from .core import DataTransformer, DictWrapper, TransformError, TransformErrors
+from .framework import (
+    CompatibilityResult,
+    check_compatibility,
+    detect_format,
+    export_for,
+)
 from .presets import get_preset, list_presets
 from .schema import (
     Field,

@@ -38,12 +44,6 @@ from .schema import (
     sharegpt_schema,
     validate_data,
 )
-from .framework import (
-    CompatibilityResult,
-    check_compatibility,
-    detect_format,
-    export_for,
-)
 from .storage import load_data, sample_file, save_data
 from .streaming import StreamingTransformer, load_sharded, load_stream, process_shards
 from .tokenizers import (

@@ -60,7 +60,7 @@ from .tokenizers import (
     token_stats,
 )
 
-__version__ = "0.5.2"
+__version__ = "0.5.3"
 
 __all__ = [
     # core
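
Note: the .framework import block is not new code; it moved so that first-party imports stay alphabetized (.framework now sits between .core and .presets instead of after .schema, matching the isort settings in pyproject.toml below). A minimal smoke check of the re-exported surface; the diff does not show these functions' signatures, so only the imports and the version constant are exercised here:

    # Sketch: names re-exported at the dtflow package root after this change.
    import dtflow
    from dtflow import (
        CompatibilityResult,
        check_compatibility,
        detect_format,
        export_for,
    )

    assert dtflow.__version__ == "0.5.3"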

{dtflow-0.5.2 → dtflow-0.5.3}/dtflow/converters.py

@@ -4,7 +4,7 @@
 Provides conversions to and from common formats such as HuggingFace datasets.
 """
 
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional
 
 
 def to_hf_dataset(data: List[Dict[str, Any]]):

@@ -143,14 +143,16 @@ def to_openai_batch(
     >>> batch_input = dt.to(to_openai_batch(model="gpt-4o"))
     """
 
-    def transform(item) -> dict:
+    counter = {"idx": 0}
+
+    def transform(item) -> dict:
         messages = item.get(messages_field, []) if hasattr(item, "get") else item[messages_field]
 
         if custom_id_field:
             custom_id = item.get(custom_id_field) if hasattr(item, "get") else item[custom_id_field]
         else:
-            custom_id = f"request-{idx}"
-            idx += 1
+            custom_id = f"request-{counter['idx']}"
+            counter["idx"] += 1
 
         return {
             "custom_id": str(custom_id),
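
Note: this hunk fixes a closure bug. The old transform incremented a bare idx that was never bound in the enclosing scope, so the auto-generated custom_id path would raise UnboundLocalError at runtime. Storing the counter in a mutable dict lets the nested function update shared state without a nonlocal declaration. A standalone sketch of the same pattern (not dtflow code):

    def make_transform():
        counter = {"idx": 0}  # mutable container shared with the closure

        def transform(item):
            custom_id = f"request-{counter['idx']}"
            counter["idx"] += 1  # rebinding a plain int here would need nonlocal
            return {"custom_id": custom_id, **item}

        return transform

    t = make_transform()
    print(t({"a": 1})["custom_id"])  # request-0
    print(t({"a": 2})["custom_id"])  # request-1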

@@ -196,7 +198,7 @@ def to_llama_factory(
     """
 
     def transform(item) -> dict:
-        get = lambda f: item.get(f, "") if hasattr(item, "get") else item
+        get = lambda f: item.get(f, "") if hasattr(item, "get") else getattr(item, f, "")
 
         result = {
             "instruction": get(instruction_field),

@@ -248,7 +250,7 @@ def to_axolotl(
         conversations = (
            item.get(conversations_field, [])
            if hasattr(item, "get")
-           else item
+           else getattr(item, conversations_field, [])
        )
 
        # If it is already in the correct format, return it as-is

@@ -257,7 +259,9 @@ def to_axolotl(
            return {"conversations": conversations}
 
        # Try converting from the messages format
-       messages = item.get("messages", []) if hasattr(item, "get") else item
+       messages = (
+           item.get("messages", []) if hasattr(item, "get") else getattr(item, "messages", [])
+       )
        if messages:
            role_map = {"user": "human", "assistant": "gpt", "system": "system"}
            conversations = [

@@ -312,7 +316,7 @@ def to_llama_factory_sharegpt(
     }
 
     def transform(item) -> dict:
-        get = lambda f: item.get(f, "") if hasattr(item, "get") else item
+        get = lambda f: item.get(f, "") if hasattr(item, "get") else getattr(item, f, "")
         messages = get(messages_field) or []
 
         conversations = []

@@ -385,7 +389,7 @@ def to_llama_factory_vlm(
     """
 
     def transform(item) -> dict:
-        get = lambda f: item.get(f) if hasattr(item, "get") else item
+        get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
         messages = get(messages_field) or []
 
         instruction = ""

@@ -467,7 +471,7 @@ def to_llama_factory_vlm_sharegpt(
     role_map = {"user": "human", "assistant": "gpt", "system": "system"}
 
     def transform(item) -> dict:
-        get = lambda f: item.get(f) if hasattr(item, "get") else item
+        get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
         messages = get(messages_field) or []
 
         conversations = []

@@ -541,7 +545,7 @@ def to_swift_messages(
     """
 
     def transform(item) -> dict:
-        get = lambda f: item.get(f) if hasattr(item, "get") else item
+        get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
         messages = get(messages_field) or []
 
         # Copy messages to avoid mutating the original data

@@ -600,7 +604,7 @@ def to_swift_query_response(
     """
 
     def transform(item) -> dict:
-        get = lambda f: item.get(f) if hasattr(item, "get") else item
+        get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
 
         query = get(query_field)
         response = get(response_field)

@@ -693,7 +697,7 @@ def to_swift_vlm(
     """
 
     def transform(item) -> dict:
-        get = lambda f: item.get(f) if hasattr(item, "get") else item
+        get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
         messages = get(messages_field) or []
 
         result_messages = []
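
Note: every converter's get helper previously fell back to returning the item object itself when it had no get method, which silently produced garbage for attribute-style records; the new fallback reads the named attribute. A sketch of the before/after behavior (SimpleNamespace stands in for any attribute-style record; dtflow's own DictWrapper exposes .get and is unaffected):

    from types import SimpleNamespace

    item = SimpleNamespace(messages=[{"role": "user", "content": "hi"}])

    old_get = lambda f: item.get(f) if hasattr(item, "get") else item
    new_get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)

    print(old_get("messages"))  # namespace(...) -- the whole object, not the field
    print(new_get("messages"))  # [{'role': 'user', 'content': 'hi'}]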

{dtflow-0.5.2 → dtflow-0.5.3}/dtflow/presets.py

@@ -6,6 +6,8 @@
 
 from typing import Any, Callable
 
+from dtflow.utils.helpers import get_field_value
+
 
 def openai_chat(
     user_field: str = "q", assistant_field: str = "a", system_prompt: str = None

@@ -33,8 +35,8 @@ def openai_chat(
         if system_prompt:
             messages.append({"role": "system", "content": system_prompt})
 
-        user_content = getattr(item, user_field, None) or item.get(user_field, "")
-        assistant_content = getattr(item, assistant_field, None) or item.get(assistant_field, "")
+        user_content = get_field_value(item, user_field)
+        assistant_content = get_field_value(item, assistant_field)
 
         messages.append({"role": "user", "content": user_content})
         messages.append({"role": "assistant", "content": assistant_content})

@@ -60,10 +62,9 @@ def alpaca(
 
     def transform(item: Any) -> dict:
         return {
-            "instruction": getattr(item, instruction_field, None)
-            or item.get(instruction_field, ""),
-            "input": getattr(item, input_field, None) or item.get(input_field, ""),
-            "output": getattr(item, output_field, None) or item.get(output_field, ""),
+            "instruction": get_field_value(item, instruction_field),
+            "input": get_field_value(item, input_field),
+            "output": get_field_value(item, output_field),
         }
 
     return transform

@@ -84,9 +85,7 @@ def sharegpt(conversations_field: str = "conversations", role_mapping: dict = None)
    role_mapping = role_mapping or {"user": "human", "assistant": "gpt"}
 
    def transform(item: Any) -> dict:
-       conversations = getattr(item, conversations_field, None) or item.get(
-           conversations_field, []
-       )
+       conversations = get_field_value(item, conversations_field, [])
 
        # If it is already in conversation format, return it as-is
        if conversations:

@@ -102,7 +101,7 @@ def sharegpt(conversations_field: str = "conversations", role_mapping: dict = None)
            ("answer", "gpt"),
            ("output", "gpt"),
        ]:
-           value = getattr(item, field, None) or item.get(field, None)
+           value = get_field_value(item, field, None)
            if value:
                result.append({"from": role, "value": value})

@@ -127,9 +126,9 @@ def dpo_pair(
 
     def transform(item: Any) -> dict:
         return {
-            "prompt": getattr(item, prompt_field, None) or item.get(prompt_field, ""),
-            "chosen": getattr(item, chosen_field, None) or item.get(chosen_field, ""),
-            "rejected": getattr(item, rejected_field, None) or item.get(rejected_field, ""),
+            "prompt": get_field_value(item, prompt_field),
+            "chosen": get_field_value(item, chosen_field),
+            "rejected": get_field_value(item, rejected_field),
         }
 
     return transform

@@ -148,8 +147,8 @@ def simple_qa(question_field: str = "q", answer_field: str = "a") -> Callable:
 
     def transform(item: Any) -> dict:
         return {
-            "question": getattr(item, question_field, None) or item.get(question_field, ""),
-            "answer": getattr(item, answer_field, None) or item.get(answer_field, ""),
+            "question": get_field_value(item, question_field),
+            "answer": get_field_value(item, answer_field),
         }
 
     return transform
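
Note: all five presets now route field access through get_field_value, so they behave identically for plain dicts and attribute-style records. A usage sketch of the alpaca preset; the keyword names are taken from the fields used in the body above, but the default values are not shown in this diff, so everything is passed explicitly:

    # Hypothetical usage of the alpaca preset; field names are illustrative.
    from dtflow.presets import alpaca

    transform = alpaca(
        instruction_field="prompt", input_field="context", output_field="answer"
    )
    row = {"prompt": "Summarize:", "context": "Some long text.", "answer": "A summary."}
    print(transform(row))
    # {'instruction': 'Summarize:', 'input': 'Some long text.', 'output': 'A summary.'}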

{dtflow-0.5.2 → dtflow-0.5.3}/dtflow/utils/__init__.py

@@ -9,6 +9,7 @@ from .field_path import (
     get_field_with_spec,
     parse_field_spec,
 )
+from .helpers import get_field_value
 
 __all__ = [
     "display_data",

@@ -20,4 +21,6 @@ __all__ = [
     "extract",
     "extract_with_spec",
     "ExpandMode",
+    # helpers
+    "get_field_value",
 ]

dtflow-0.5.3/dtflow/utils/helpers.py

@@ -0,0 +1,30 @@
+"""Shared helper functions."""
+
+from typing import Any
+
+
+def get_field_value(item: Any, field: str, default: Any = "") -> Any:
+    """
+    Get a field value, supporting both DictWrapper and plain dict.
+
+    Tries dict.get() first and falls back to getattr() when the object has no get method.
+
+    Args:
+        item: Data object (dict or DictWrapper)
+        field: Field name
+        default: Default value
+
+    Returns:
+        The field value, or the default.
+
+    Examples:
+        >>> get_field_value({"name": "test"}, "name")
+        'test'
+        >>> get_field_value({"name": ""}, "name", "default")
+        'default'
+    """
+    if hasattr(item, "get"):
+        value = item.get(field, default)
+    else:
+        value = getattr(item, field, default)
+    return value if value else default
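
Note the final line of the helper: because of the truthiness check, falsy field values (empty string, 0, empty list) are replaced by the default, which is exactly what the second doctest demonstrates. A quick behavior check:

    from dtflow.utils import get_field_value

    print(get_field_value({"name": "test"}, "name"))         # test
    print(get_field_value({"name": ""}, "name", "default"))  # default
    print(get_field_value({"count": 0}, "count", -1))        # -1, since 0 is falsy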

{dtflow-0.5.2 → dtflow-0.5.3}/pyproject.toml

@@ -300,8 +300,10 @@ ignore_missing_imports = true
 
 # Ruff configuration (optional alternative to flake8)
 [tool.ruff]
-target-version = "
+target-version = "py38"
 line-length = 100
+
+[tool.ruff.lint]
 select = [
     "E",  # pycodestyle errors
     "W",  # pycodestyle warnings

@@ -311,13 +313,13 @@ select = [
     "B",  # flake8-bugbear
 ]
 ignore = [
-    "E501",  # line too long, handled by
+    "E501",  # line too long, handled by ruff-format
     "B008",  # do not perform function calls in argument defaults
     "C901",  # too complex
 ]
 
-[tool.ruff.per-file-ignores]
+[tool.ruff.lint.per-file-ignores]
 "__init__.py" = ["F401"]
 
-[tool.ruff.isort]
+[tool.ruff.lint.isort]
 known-first-party = ["dtflow"]

dtflow-0.5.3/tests/test_cli_clean.py

@@ -0,0 +1,314 @@
+"""
+Tests for CLI clean and dedupe commands.
+"""
+
+import pytest
+
+from dtflow.cli.clean import _clean_data_single_pass, _parse_len_param, clean, dedupe
+from dtflow.storage.io import load_data, save_data
+
+# ============== Fixtures ==============
+
+
+@pytest.fixture
+def sample_data_file(tmp_path):
+    """Create a sample dataset file."""
+    data = [
+        {"text": "Hello world", "score": 0.8, "category": "greeting"},
+        {"text": "How are you?", "score": 0.9, "category": "question"},
+        {"text": " Needs trimming ", "score": 0.7, "category": "test"},
+        {"text": "", "score": 0.6, "category": "empty_text"},
+        {"text": "Short", "score": None, "category": None},
+        {"text": "Hello world", "score": 0.85, "category": "duplicate"},  # duplicate text
+    ]
+    filepath = tmp_path / "test_data.jsonl"
+    save_data(data, str(filepath))
+    return filepath, data
+
+
+@pytest.fixture
+def sample_nested_file(tmp_path):
+    """Create a sample dataset with nested fields."""
+    data = [
+        {
+            "id": 1,
+            "meta": {"source": "web", "score": 0.9},
+            "messages": [
+                {"role": "user", "content": "Hello"},
+                {"role": "assistant", "content": "Hi!"},
+            ],
+        },
+        {
+            "id": 2,
+            "meta": {"source": "api", "score": 0.8},
+            "messages": [{"role": "user", "content": "Hi"}],
+        },
+        {
+            "id": 3,
+            "meta": {"source": None, "score": 0.5},
+            "messages": [
+                {"role": "user", "content": "A"},
+                {"role": "assistant", "content": "B"},
+                {"role": "user", "content": "C"},
+            ],
+        },
+    ]
+    filepath = tmp_path / "test_nested.jsonl"
+    save_data(data, str(filepath))
+    return filepath, data
+
+
+# ============== Clean Command Tests ==============
+
+
+class TestCleanBasic:
+    """Test basic clean functionality."""
+
+    def test_clean_drop_empty(self, sample_data_file, tmp_path):
+        """Test dropping empty records."""
+        filepath, _ = sample_data_file
+        output = tmp_path / "output.jsonl"
+
+        clean(str(filepath), drop_empty="text", output=str(output))
+
+        result = load_data(str(output))
+        # Should remove the record with empty text
+        for item in result:
+            assert item["text"] != ""
+
+    def test_clean_drop_empty_all_fields(self, sample_data_file, tmp_path):
+        """Test dropping records with any empty field."""
+        filepath, _ = sample_data_file
+        output = tmp_path / "output.jsonl"
+
+        # drop_empty="" means check all fields
+        clean(str(filepath), drop_empty="", output=str(output))
+
+        result = load_data(str(output))
+        # Should remove records with any None or empty value
+        for item in result:
+            assert all(v is not None and v != "" for v in item.values())
+
+    def test_clean_strip(self, sample_data_file, tmp_path):
+        """Test stripping whitespace."""
+        filepath, _ = sample_data_file
+        output = tmp_path / "output.jsonl"
+
+        clean(str(filepath), strip=True, output=str(output))
+
+        result = load_data(str(output))
+        # Find the item that had extra whitespace
+        trimmed = [item for item in result if "Needs trimming" in item.get("text", "")]
+        if trimmed:
+            assert trimmed[0]["text"] == "Needs trimming"
+
+    def test_clean_min_len(self, sample_data_file, tmp_path):
+        """Test minimum length filtering."""
+        filepath, _ = sample_data_file
+        output = tmp_path / "output.jsonl"
+
+        clean(str(filepath), min_len="text:10", output=str(output))
+
+        result = load_data(str(output))
+        for item in result:
+            assert len(item["text"]) >= 10
+
+    def test_clean_max_len(self, sample_data_file, tmp_path):
+        """Test maximum length filtering."""
+        filepath, _ = sample_data_file
+        output = tmp_path / "output.jsonl"
+
+        clean(str(filepath), max_len="text:10", output=str(output))
+
+        result = load_data(str(output))
+        for item in result:
+            assert len(item["text"]) <= 10
+
+    def test_clean_keep_fields(self, sample_data_file, tmp_path):
+        """Test keeping only specified fields."""
+        filepath, _ = sample_data_file
+        output = tmp_path / "output.jsonl"
+
+        clean(str(filepath), keep="text,category", output=str(output))
+
+        result = load_data(str(output))
+        for item in result:
+            assert set(item.keys()) == {"text", "category"}
+
+    def test_clean_drop_fields(self, sample_data_file, tmp_path):
+        """Test dropping specified fields."""
+        filepath, _ = sample_data_file
+        output = tmp_path / "output.jsonl"
+
+        clean(str(filepath), drop="score", output=str(output))
+
+        result = load_data(str(output))
+        for item in result:
+            assert "score" not in item
+
+
+# ============== Clean with Nested Fields Tests ==============
+
+
+class TestCleanNested:
+    """Test clean with nested field paths."""
+
+    def test_clean_drop_empty_nested(self, sample_nested_file, tmp_path):
+        """Test dropping records with empty nested field."""
+        filepath, _ = sample_nested_file
+        output = tmp_path / "output.jsonl"
+
+        clean(str(filepath), drop_empty="meta.source", output=str(output))
+
+        result = load_data(str(output))
+        assert len(result) == 2  # Should remove the one with None source
+        for item in result:
+            assert item["meta"]["source"] is not None
+
+    def test_clean_min_len_messages(self, sample_nested_file, tmp_path):
+        """Test filtering by message count using .# syntax."""
+        filepath, _ = sample_nested_file
+        output = tmp_path / "output.jsonl"
+
+        clean(str(filepath), min_len="messages.#:2", output=str(output))
+
+        result = load_data(str(output))
+        for item in result:
+            assert len(item["messages"]) >= 2
+
+
+# ============== Dedupe Command Tests ==============
+
+
+class TestDedupeBasic:
+    """Test basic dedupe functionality."""
+
+    def test_dedupe_by_field(self, sample_data_file, tmp_path):
+        """Test deduplication by specific field."""
+        filepath, _ = sample_data_file
+        output = tmp_path / "output.jsonl"
+
+        dedupe(str(filepath), key="text", output=str(output))
+
+        result = load_data(str(output))
+        # Should have removed the duplicate "Hello world"
+        texts = [item["text"] for item in result]
+        assert len(texts) == len(set(texts))
+
+    def test_dedupe_full(self, sample_data_file, tmp_path):
+        """Test full record deduplication."""
+        filepath, _ = sample_data_file
+        output = tmp_path / "output.jsonl"
+
+        dedupe(str(filepath), output=str(output))
+
+        result = load_data(str(output))
+        # All records are unique, so should have same count
+        # (unless there are exact duplicates)
+        assert len(result) >= 1
+
+    def test_dedupe_overwrite(self, sample_data_file):
+        """Test deduplication with overwrite (no output specified)."""
+        filepath, original_data = sample_data_file
+
+        dedupe(str(filepath), key="text")
+
+        result = load_data(str(filepath))
+        texts = [item["text"] for item in result]
+        assert len(texts) == len(set(texts))
+
+
+# ============== Parameter Parsing Tests ==============
+
+
+class TestParamParsing:
+    """Test parameter parsing functions."""
+
+    def test_parse_len_param_valid(self):
+        """Test valid length parameter parsing."""
+        field, value = _parse_len_param("text:100")
+        assert field == "text"
+        assert value == 100
+
+    def test_parse_len_param_nested(self):
+        """Test nested field length parameter."""
+        field, value = _parse_len_param("messages.#:5")
+        assert field == "messages.#"
+        assert value == 5
+
+    def test_parse_len_param_invalid_no_colon(self):
+        """Test invalid parameter without colon."""
+        with pytest.raises(ValueError):
+            _parse_len_param("text100")
+
+    def test_parse_len_param_invalid_non_numeric(self):
+        """Test invalid parameter with non-numeric value."""
+        with pytest.raises(ValueError):
+            _parse_len_param("text:abc")
+
+
+# ============== Clean Single Pass Tests ==============
+
+
+class TestCleanSinglePass:
+    """Test _clean_data_single_pass function."""
+
+    def test_single_pass_strip(self):
+        """Test strip in single pass."""
+        data = [{"text": " hello ", "value": " world "}]
+        result, _ = _clean_data_single_pass(data, strip=True)
+
+        assert result[0]["text"] == "hello"
+        assert result[0]["value"] == "world"
+
+    def test_single_pass_drop_empty(self):
+        """Test drop empty in single pass."""
+        data = [
+            {"text": "hello", "value": "world"},
+            {"text": "", "value": "test"},
+            {"text": "hi", "value": None},
+        ]
+        result, stats = _clean_data_single_pass(data, empty_fields=["text", "value"])
+
+        assert len(result) == 1
+        assert result[0]["text"] == "hello"
+
+    def test_single_pass_combined(self):
+        """Test combined operations in single pass."""
+        data = [
+            {"text": " long text here ", "score": 0.9},
+            {"text": " hi ", "score": 0.8},
+        ]
+        result, stats = _clean_data_single_pass(
+            data, strip=True, min_len_field="text", min_len_value=5
+        )
+
+        assert len(result) == 1
+        assert result[0]["text"] == "long text here"
+
+
+# ============== Error Handling Tests ==============
+
+
+class TestCleanErrors:
+    """Test error handling in clean commands."""
+
+    def test_clean_file_not_exists(self, tmp_path, capsys):
+        """Test error when file doesn't exist."""
+        clean(str(tmp_path / "nonexistent.jsonl"))
+        captured = capsys.readouterr()
+        assert "文件不存在" in captured.out
+
+    def test_dedupe_similar_without_key(self, sample_data_file, capsys):
+        """Test error when using similar without key."""
+        filepath, _ = sample_data_file
+        dedupe(str(filepath), similar=0.8)
+        captured = capsys.readouterr()
+        assert "需要指定 --key" in captured.out
+
+    def test_dedupe_invalid_similar_range(self, sample_data_file, capsys):
+        """Test error when similar value is out of range."""
+        filepath, _ = sample_data_file
+        dedupe(str(filepath), key="text", similar=1.5)
+        captured = capsys.readouterr()
+        assert "0-1 之间" in captured.out
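
Note: these tests double as usage documentation for the Python-level clean/dedupe API. A condensed end-to-end sketch using the same functions and keyword arguments; the file paths are hypothetical, and combining several cleaning options in one call is assumed from the single-pass implementation exercised above:

    from dtflow.cli.clean import clean, dedupe

    # Strip whitespace, drop records whose text is empty or under 10 chars,
    # and keep only the text/category fields.
    clean(
        "data.jsonl",
        strip=True,
        drop_empty="text",
        min_len="text:10",
        keep="text,category",
        output="cleaned.jsonl",
    )

    # Then drop rows whose text duplicates an earlier row.
    dedupe("cleaned.jsonl", key="text", output="unique.jsonl")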