PyPI - dtflow - Versions diffs - 0.5.5__tar.gz → 0.5.6__tar.gz - Mend

dtflow 0.5.5.tar.gz → 0.5.6.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

{dtflow-0.5.5 → dtflow-0.5.6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dtflow
-Version: 0.5.5
+Version: 0.5.6
 Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
 Project-URL: Homepage, https://github.com/yourusername/DataTransformer
 Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
@@ -423,6 +423,8 @@ dt sample data.csv --num=100 --sample_type=head
 dt sample data.jsonl 1000 --by=category           # 分层采样
 dt sample data.jsonl 1000 --by=meta.source        # 按嵌套字段分层采样
 dt sample data.jsonl 1000 --by=messages.#         # 按消息数量分层采样
+dt sample data.jsonl --where="category=tech"      # 筛选后采样
+dt sample data.jsonl --where="messages.#>=2"      # 按消息数量筛选
 # 数据转换 - 预设模式
 dt transform data.jsonl --preset=openai_chat
@@ -496,7 +498,7 @@ CLI 命令中的字段参数支持嵌套路径语法，可访问深层嵌套的
 | 命令 | 参数 | 示例 |
 |------|------|------|
-| `sample` | `--by=` | `--by=meta.source`、`--by=messages.#` |
+| `sample` | `--by=`, `--where=` | `--by=meta.source`、`--where=messages.#>=2` |
 | `dedupe` | `--key=` | `--key=meta.id`、`--key=messages[0].content` |
 | `clean` | `--drop-empty=` | `--drop-empty=meta.source` |
 | `clean` | `--min-len=` | `--min-len=messages.#:2` |

{dtflow-0.5.5 → dtflow-0.5.6}/README.md RENAMED Viewed

@@ -337,6 +337,8 @@ dt sample data.csv --num=100 --sample_type=head
 dt sample data.jsonl 1000 --by=category           # 分层采样
 dt sample data.jsonl 1000 --by=meta.source        # 按嵌套字段分层采样
 dt sample data.jsonl 1000 --by=messages.#         # 按消息数量分层采样
+dt sample data.jsonl --where="category=tech"      # 筛选后采样
+dt sample data.jsonl --where="messages.#>=2"      # 按消息数量筛选
 # 数据转换 - 预设模式
 dt transform data.jsonl --preset=openai_chat
@@ -410,7 +412,7 @@ CLI 命令中的字段参数支持嵌套路径语法，可访问深层嵌套的
 | 命令 | 参数 | 示例 |
 |------|------|------|
-| `sample` | `--by=` | `--by=meta.source`、`--by=messages.#` |
+| `sample` | `--by=`, `--where=` | `--by=meta.source`、`--where=messages.#>=2` |
 | `dedupe` | `--key=` | `--key=meta.id`、`--key=messages[0].content` |
 | `clean` | `--drop-empty=` | `--drop-empty=meta.source` |
 | `clean` | `--min-len=` | `--min-len=messages.#:2` |

{dtflow-0.5.5 → dtflow-0.5.6}/dtflow/__init__.py RENAMED Viewed

@@ -60,7 +60,7 @@ from .tokenizers import (
     token_stats,
 )
-__version__ = "0.5.5"
+__version__ = "0.5.6"
 __all__ = [
     # core

{dtflow-0.5.5 → dtflow-0.5.6}/dtflow/__main__.py RENAMED Viewed

@@ -67,10 +67,11 @@ def sample(
     uniform: bool = typer.Option(False, "--uniform", help="均匀采样模式"),
     fields: Optional[str] = typer.Option(None, "--fields", "-f", help="只显示指定字段（逗号分隔）"),
     raw: bool = typer.Option(False, "--raw", "-r", help="输出原始 JSON（不截断）"),
+    where: Optional[List[str]] = typer.Option(None, "--where", "-w", help="筛选条件 (可多次使用)"),
 ):
     """从数据文件中采样指定数量的数据"""
     actual_num = num_arg if num_arg is not None else num
-    _sample(filename, actual_num, type, output, seed, by, uniform, fields, raw)
+    _sample(filename, actual_num, type, output, seed, by, uniform, fields, raw, where)
 @app.command()

{dtflow-0.5.5 → dtflow-0.5.6}/dtflow/cli/sample.py RENAMED Viewed

@@ -2,8 +2,9 @@
 CLI 采样相关命令
 """
+import re
 from pathlib import Path
-from typing import Any, Dict, List, Literal, Optional
+from typing import Any, Callable, Dict, List, Literal, Optional
 import orjson
@@ -16,6 +17,122 @@ from .common import (
     _print_samples,
 )
+# Splits a where condition into (field, operator, value).
+# Two-character operators are listed before their one-character prefixes so
+# that "a>=2" is parsed as (a, >=, 2) rather than (a, >, "=2").
+_WHERE_PATTERN = re.compile(r"^(.+?)(!=|~=|>=|<=|>|<|=)(.*)$")
+def _parse_where(condition: str) -> Callable[[dict], bool]:
+    """
+    Parse one where-condition string and return a predicate over a record.
+
+    Supported operators:
+        =   equal
+        !=  not equal
+        ~=  contains (substring of the field's string form)
+        >   greater than
+        >=  greater than or equal
+        <   less than
+        <=  less than or equal
+
+    The field part is resolved for each record with ``get_field_with_spec``
+    (defined outside this block; the examples below suggest it understands
+    nested paths such as ``meta.source`` and ``messages.#``).
+
+    Semantics:
+        * ``=`` / ``!=`` match on either the field's string form or the value
+          parsed as bool / int / float. A field that resolves to ``None``
+          equals only an empty value or the literal ``none`` (any case).
+        * ``~=`` never matches a field that resolves to ``None``.
+        * The ordering operators compare as floats; a record whose field is
+          ``None`` or not convertible to float does not match.
+
+    Args:
+        condition: Text of the form ``<field><op><value>``. Whitespace around
+            the field name is ignored; the value is taken verbatim.
+
+    Returns:
+        A function mapping a record (dict) to ``True`` when it satisfies the
+        condition.
+
+    Raises:
+        ValueError: If the condition has no operator or no field name, or if
+            an ordering operator is given a value that is not a number.
+
+    Examples:
+        _parse_where("category=tech")
+        _parse_where("meta.source!=wiki")
+        _parse_where("content~=机器学习")
+        _parse_where("messages.#>=2")
+    """
+    import operator
+    match = _WHERE_PATTERN.match(condition)
+    if not match:
+        raise ValueError(f"无效的 where 条件: {condition}")
+    field, op, value = match.groups()
+    # Tolerate 'field = value' style spacing on the field side; a field name
+    # made only of whitespace cannot address anything.
+    field = field.strip()
+    if not field:
+        raise ValueError(f"无效的 where 条件: {condition}")
+    def parse_value(v: str) -> Any:
+        """Interpret *v* as bool, int or float when possible, else keep the text."""
+        lowered = v.lower()
+        if lowered == "true":
+            return True
+        if lowered == "false":
+            return False
+        for convert in (int, float):
+            try:
+                return convert(v)
+            except ValueError:
+                continue
+        return v
+    parsed_value = parse_value(value)
+    ordering_ops = {
+        ">": operator.gt,
+        ">=": operator.ge,
+        "<": operator.lt,
+        "<=": operator.le,
+    }
+    compare = ordering_ops.get(op)
+    threshold = None
+    if compare is not None:
+        # Parse the threshold once instead of once per record, and reject a
+        # non-numeric threshold up front: such a condition could never match.
+        try:
+            threshold = float(value)
+        except ValueError:
+            raise ValueError(
+                f"无效的 where 条件: {condition} (操作符 {op} 需要数值)"
+            ) from None
+    def filter_fn(item: dict) -> bool:
+        field_value = get_field_with_spec(item, field)
+        if compare is not None:
+            # Numeric comparison; non-numeric field values simply do not match.
+            if field_value is None:
+                return False
+            try:
+                return compare(float(field_value), threshold)
+            except (ValueError, TypeError):
+                return False
+        if op == "~=":
+            # Substring test against the field's string form.
+            return field_value is not None and value in str(field_value)
+        # Only "=" and "!=" remain (the regex admits no other operator);
+        # "!=" is the exact negation of "=".
+        if field_value is None:
+            is_equal = value == "" or value.lower() == "none"
+        else:
+            is_equal = str(field_value) == value or field_value == parsed_value
+        return is_equal if op == "=" else not is_equal
+    return filter_fn
+def _apply_where_filters(data: List[Dict], where_conditions: List[str]) -> List[Dict]:
+    """Keep the records that satisfy every where condition (AND semantics).
+
+    With no conditions the input list itself is returned. All conditions are
+    parsed before any record is examined, so a malformed condition raises
+    from ``_parse_where`` without any filtering having been done.
+    """
+    if not where_conditions:
+        return data
+    predicates = list(map(_parse_where, where_conditions))
+    kept = []
+    for record in data:
+        for predicate in predicates:
+            if not predicate(record):
+                break
+        else:
+            # No predicate rejected the record.
+            kept.append(record)
+    return kept
+def _sample_from_list(
+    data: List[Dict],
+    num: int,
+    sample_type: str,
+    seed: Optional[int] = None,
+) -> List[Dict]:
+    """Sample up to *num* records from an in-memory list.
+
+    Args:
+        data: Records to sample from.
+        num: Number of records wanted. A value that is <= 0 or larger than
+            ``len(data)`` selects every record.
+        sample_type: ``"random"`` draws without replacement, ``"head"`` takes
+            the first records; any other value takes the last records.
+        seed: Seed for random sampling. When given, a private generator is
+            used, so the draw is reproducible and the process-wide ``random``
+            state is left untouched.
+
+    Returns:
+        A new list holding the sampled records.
+    """
+    import random
+    total = len(data)
+    if num <= 0 or num > total:
+        num = total
+    if sample_type == "random":
+        # random.Random(seed).sample yields the same draw as seeding the
+        # module-level generator, without the global side effect.
+        rng = random.Random(seed) if seed is not None else random
+        return rng.sample(data, num)
+    if sample_type == "head":
+        return data[:num]
+    # tail; num is 0 only for empty data, where data[-0:] is the empty list.
+    return data[-num:]
 def sample(
     filename: str,
@@ -27,6 +144,7 @@ def sample(
     uniform: bool = False,
     fields: Optional[str] = None,
     raw: bool = False,
+    where: Optional[List[str]] = None,
 ) -> None:
     """
     从数据文件中采样指定数量的数据。
@@ -44,6 +162,7 @@ def sample(
         uniform: 均匀采样模式（需配合 --by 使用），各组采样相同数量
         fields: 只显示指定字段（逗号分隔），仅在预览模式下有效
         raw: 输出原始 JSON 格式（不截断，完整显示所有内容）
+        where: 筛选条件列表，支持 =, !=, ~=, >, >=, <, <= 操作符
     Examples:
         dt sample data.jsonl 5
@@ -54,6 +173,9 @@ def sample(
         dt sample data.jsonl 1000 --by=category           # 按比例分层采样
         dt sample data.jsonl 1000 --by=category --uniform # 均匀分层采样
         dt sample data.jsonl --fields=question,answer     # 只显示指定字段
+        dt sample data.jsonl --where="category=tech"      # 筛选 category 为 tech 的数据
+        dt sample data.jsonl --where="meta.source~=wiki"  # 筛选 meta.source 包含 wiki
+        dt sample data.jsonl --where="messages.#>=2"      # 筛选消息数量 >= 2
     """
     filepath = Path(filename)
@@ -69,23 +191,46 @@ def sample(
         print("错误: --uniform 必须配合 --by 使用")
         return
+    # 处理 where 筛选
+    where_conditions = where or []
+    filtered_data = None
+    original_count = None
+    if where_conditions:
+        # 有 where 条件时，先加载全部数据再筛选
+        try:
+            all_data = load_data(str(filepath))
+            original_count = len(all_data)
+            filtered_data = _apply_where_filters(all_data, where_conditions)
+            print(f"🔍 筛选: {original_count} → {len(filtered_data)} 条")
+            if not filtered_data:
+                print("⚠️  筛选后无数据")
+                return
+        except ValueError as e:
+            print(f"错误: {e}")
+            return
     # 分层采样模式
     if by:
         try:
-            sampled = _stratified_sample(filepath, num, by, uniform, seed, type)
+            sampled = _stratified_sample(filepath, num, by, uniform, seed, type, data=filtered_data)
         except Exception as e:
             print(f"错误: {e}")
             return
     else:
         # 普通采样
         try:
-            sampled = sample_file(
-                str(filepath),
-                num=num,
-                sample_type=type,
-                seed=seed,
-                output=None,  # 先不保存，统一在最后处理
-            )
+            if filtered_data is not None:
+                # 已筛选的数据，直接采样
+                sampled = _sample_from_list(filtered_data, num, type, seed)
+            else:
+                sampled = sample_file(
+                    str(filepath),
+                    num=num,
+                    sample_type=type,
+                    seed=seed,
+                    output=None,  # 先不保存，统一在最后处理
+                )
         except Exception as e:
             print(f"错误: {e}")
             return
@@ -117,6 +262,7 @@ def _stratified_sample(
     uniform: bool,
     seed: Optional[int],
     sample_type: str,
+    data: Optional[List[Dict]] = None,
 ) -> List[Dict]:
     """
     分层采样实现。
@@ -133,6 +279,7 @@ def _stratified_sample(
         uniform: 是否均匀采样（各组相同数量）
         seed: 随机种子
         sample_type: 采样方式（用于组内采样）
+        data: 预筛选的数据（可选，如果提供则不从文件加载）
     Returns:
         采样后的数据列表
@@ -143,8 +290,9 @@ def _stratified_sample(
     if seed is not None:
         random.seed(seed)
-    # 加载数据
-    data = load_data(str(filepath))
+    # 加载数据（如果没有预筛选数据）
+    if data is None:
+        data = load_data(str(filepath))
     total = len(data)
     if num <= 0 or num > total:

{dtflow-0.5.5 → dtflow-0.5.6}/tests/test_cli_sample.py RENAMED Viewed

@@ -240,3 +240,102 @@ class TestRawOutput:
         # Raw mode outputs JSON with indentation
         assert "question" in captured.out
         assert "Question 0" in captured.out
+# ============== Where Filter Tests ==============
+class TestWhereFilter:
+    """End-to-end checks of the ``where`` filtering performed by ``sample``."""
+    @staticmethod
+    def _sample_with(filepath, tmp_path, conditions):
+        """Run ``sample`` with *conditions* and load the records it wrote."""
+        destination = tmp_path / "filtered.jsonl"
+        sample(str(filepath), num=100, output=str(destination), where=conditions)
+        return load_data(str(destination))
+    def test_where_equal(self, sample_qa_file, tmp_path, capsys):
+        """The = operator keeps only records whose field equals the value."""
+        source, _ = sample_qa_file
+        rows = self._sample_with(source, tmp_path, ["category=cat0"])
+        assert len(rows) > 0
+        for row in rows:
+            assert row["category"] == "cat0"
+    def test_where_not_equal(self, sample_qa_file, tmp_path, capsys):
+        """The != operator drops records whose field equals the value."""
+        source, _ = sample_qa_file
+        rows = self._sample_with(source, tmp_path, ["category!=cat0"])
+        assert len(rows) > 0
+        for row in rows:
+            assert row["category"] != "cat0"
+    def test_where_contains(self, sample_qa_file, tmp_path, capsys):
+        """The ~= operator keeps records whose field contains the value."""
+        source, _ = sample_qa_file
+        rows = self._sample_with(source, tmp_path, ["question~=Question 1"])
+        assert len(rows) > 0
+        for row in rows:
+            assert "Question 1" in row["question"]
+    def test_where_nested_field(self, sample_nested_file, tmp_path, capsys):
+        """A dotted path addresses a nested field."""
+        source, _ = sample_nested_file
+        rows = self._sample_with(source, tmp_path, ["meta.source=source0"])
+        assert len(rows) > 0
+        for row in rows:
+            assert row["meta"]["source"] == "source0"
+    def test_where_numeric_comparison(self, sample_nested_file, tmp_path, capsys):
+        """The >= operator compares numerically."""
+        source, _ = sample_nested_file
+        rows = self._sample_with(source, tmp_path, ["id>=10"])
+        assert len(rows) > 0
+        for row in rows:
+            assert row["id"] >= 10
+    def test_where_multiple_conditions(self, sample_qa_file, tmp_path, capsys):
+        """Several conditions are combined with AND."""
+        source, _ = sample_qa_file
+        conditions = ["category=cat0", "question~=Question 0"]
+        rows = self._sample_with(source, tmp_path, conditions)
+        # Per the fixture (not shown here): category=cat0 covers ids
+        # 0, 3, 6, 9, 12, 15, 18, while question~=Question 0 matches only
+        # "Question 0" -- so exactly one record satisfies both.
+        assert len(rows) == 1
+        only = rows[0]
+        assert only["category"] == "cat0"
+        assert "Question 0" in only["question"]
+    def test_where_no_match(self, sample_qa_file, capsys):
+        """A filter that matches nothing prints the no-data notice."""
+        source, _ = sample_qa_file
+        sample(str(source), num=10, where=["category=nonexistent"])
+        printed = capsys.readouterr().out
+        assert "筛选后无数据" in printed
+    def test_where_invalid_condition(self, sample_qa_file, capsys):
+        """A condition without an operator is reported as invalid."""
+        source, _ = sample_qa_file
+        sample(str(source), num=10, where=["invalid_condition"])
+        printed = capsys.readouterr().out
+        assert "无效的 where 条件" in printed