PyPI - dtflow - Versions diffs - 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl - Mend

dtflow 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

dtflow/__init__.py +1 -1
dtflow/__main__.py +4 -5
dtflow/cli/sample.py +161 -13
{dtflow-0.5.4.dist-info → dtflow-0.5.6.dist-info}/METADATA +4 -2
{dtflow-0.5.4.dist-info → dtflow-0.5.6.dist-info}/RECORD +7 -7
{dtflow-0.5.4.dist-info → dtflow-0.5.6.dist-info}/WHEEL +0 -0
{dtflow-0.5.4.dist-info → dtflow-0.5.6.dist-info}/entry_points.txt +0 -0

dtflow/__init__.py CHANGED Viewed

@@ -60,7 +60,7 @@ from .tokenizers import (
     token_stats,
 )
-__version__ = "0.5.4"
+__version__ = "0.5.6"
 __all__ = [
     # core

dtflow/__main__.py CHANGED Viewed

@@ -60,17 +60,18 @@ def sample(
     filename: str = typer.Argument(..., help="输入文件路径"),
     num_arg: Optional[int] = typer.Argument(None, help="采样数量", metavar="NUM"),
     num: int = typer.Option(10, "--num", "-n", help="采样数量", show_default=True),
-    type: str = typer.Option("head", "--type", "-t", help="采样方式: random/head/tail"),
+    type: str = typer.Option("random", "--type", "-t", help="采样方式: random/head/tail"),
     output: Optional[str] = typer.Option(None, "--output", "-o", help="输出文件路径"),
     seed: Optional[int] = typer.Option(None, "--seed", help="随机种子"),
     by: Optional[str] = typer.Option(None, "--by", help="分层采样字段"),
     uniform: bool = typer.Option(False, "--uniform", help="均匀采样模式"),
     fields: Optional[str] = typer.Option(None, "--fields", "-f", help="只显示指定字段（逗号分隔）"),
     raw: bool = typer.Option(False, "--raw", "-r", help="输出原始 JSON（不截断）"),
+    where: Optional[List[str]] = typer.Option(None, "--where", "-w", help="筛选条件 (可多次使用)"),
 ):
     """从数据文件中采样指定数量的数据"""
     actual_num = num_arg if num_arg is not None else num
-    _sample(filename, actual_num, type, output, seed, by, uniform, fields, raw)
+    _sample(filename, actual_num, type, output, seed, by, uniform, fields, raw, where)
 @app.command()
@@ -223,9 +224,7 @@ def validate(
         None, "--preset", "-p", help="预设 Schema: openai_chat, alpaca, dpo, sharegpt"
     ),
     output: Optional[str] = typer.Option(None, "--output", "-o", help="输出有效数据的文件路径"),
-    filter: bool = typer.Option(
-        False, "--filter", "-f", help="过滤无效数据并保存"
-    ),
+    filter: bool = typer.Option(False, "--filter", "-f", help="过滤无效数据并保存"),
     max_errors: int = typer.Option(20, "--max-errors", help="最多显示的错误数量"),
     verbose: bool = typer.Option(False, "--verbose", "-v", help="显示详细信息"),
 ):

dtflow/cli/sample.py CHANGED Viewed

@@ -2,8 +2,9 @@
 CLI 采样相关命令
 """
+import re
 from pathlib import Path
-from typing import Any, Dict, List, Literal, Optional
+from typing import Any, Callable, Dict, List, Literal, Optional
 import orjson
@@ -16,17 +17,134 @@ from .common import (
     _print_samples,
 )
+# where 条件解析正则：field op value
+_WHERE_PATTERN = re.compile(r"^(.+?)(!=|~=|>=|<=|>|<|=)(.*)$")
+def _parse_where(condition: str) -> Callable[[dict], bool]:
+    """
+    解析 where 条件字符串，返回筛选函数。
+    支持的操作符:
+        =   等于
+        !=  不等于
+        ~=  包含（字符串）
+        >   大于
+        >=  大于等于
+        <   小于
+        <=  小于等于
+    Examples:
+        _parse_where("category=tech")
+        _parse_where("meta.source!=wiki")
+        _parse_where("content~=机器学习")
+        _parse_where("messages.#>=2")
+    """
+    match = _WHERE_PATTERN.match(condition)
+    if not match:
+        raise ValueError(f"无效的 where 条件: {condition}")
+    field, op, value = match.groups()
+    # 尝试转换 value 为数值
+    def parse_value(v: str) -> Any:
+        if v.lower() == "true":
+            return True
+        if v.lower() == "false":
+            return False
+        try:
+            return int(v)
+        except ValueError:
+            try:
+                return float(v)
+            except ValueError:
+                return v
+    parsed_value = parse_value(value)
+    def filter_fn(item: dict) -> bool:
+        field_value = get_field_with_spec(item, field)
+        if op == "=":
+            # 字符串比较或数值比较
+            if field_value is None:
+                return value == "" or value.lower() == "none"
+            return str(field_value) == value or field_value == parsed_value
+        elif op == "!=":
+            if field_value is None:
+                return value != "" and value.lower() != "none"
+            return str(field_value) != value and field_value != parsed_value
+        elif op == "~=":
+            # 包含
+            if field_value is None:
+                return False
+            return value in str(field_value)
+        elif op in (">", ">=", "<", "<="):
+            # 数值比较
+            if field_value is None:
+                return False
+            try:
+                num_field = float(field_value)
+                num_value = float(value)
+                if op == ">":
+                    return num_field > num_value
+                elif op == ">=":
+                    return num_field >= num_value
+                elif op == "<":
+                    return num_field < num_value
+                else:  # <=
+                    return num_field <= num_value
+            except (ValueError, TypeError):
+                return False
+        return False
+    return filter_fn
+def _apply_where_filters(data: List[Dict], where_conditions: List[str]) -> List[Dict]:
+    """应用多个 where 条件（AND 关系）"""
+    if not where_conditions:
+        return data
+    filters = [_parse_where(cond) for cond in where_conditions]
+    return [item for item in data if all(f(item) for f in filters)]
+def _sample_from_list(
+    data: List[Dict],
+    num: int,
+    sample_type: str,
+    seed: Optional[int] = None,
+) -> List[Dict]:
+    """从列表中采样"""
+    import random
+    if seed is not None:
+        random.seed(seed)
+    total = len(data)
+    if num <= 0 or num > total:
+        num = total
+    if sample_type == "random":
+        return random.sample(data, num)
+    elif sample_type == "head":
+        return data[:num]
+    else:  # tail
+        return data[-num:]
 def sample(
     filename: str,
     num: int = 10,
-    type: Literal["random", "head", "tail"] = "head",
+    type: Literal["random", "head", "tail"] = "random",
     output: Optional[str] = None,
     seed: Optional[int] = None,
     by: Optional[str] = None,
     uniform: bool = False,
     fields: Optional[str] = None,
     raw: bool = False,
+    where: Optional[List[str]] = None,
 ) -> None:
     """
     从数据文件中采样指定数量的数据。
@@ -37,13 +155,14 @@ def sample(
             - num > 0: 采样指定数量
             - num = 0: 采样所有数据
             - num < 0: Python 切片风格（如 -1 表示最后 1 条，-10 表示最后 10 条）
-        type: 采样方式，可选 random/head/tail，默认 head
+        type: 采样方式，可选 random/head/tail，默认 random
         output: 输出文件路径，不指定则打印到控制台
         seed: 随机种子（仅在 type=random 时有效）
         by: 分层采样字段名，按该字段的值分组采样
         uniform: 均匀采样模式（需配合 --by 使用），各组采样相同数量
         fields: 只显示指定字段（逗号分隔），仅在预览模式下有效
         raw: 输出原始 JSON 格式（不截断，完整显示所有内容）
+        where: 筛选条件列表，支持 =, !=, ~=, >, >=, <, <= 操作符
     Examples:
         dt sample data.jsonl 5
@@ -54,6 +173,9 @@ def sample(
         dt sample data.jsonl 1000 --by=category           # 按比例分层采样
         dt sample data.jsonl 1000 --by=category --uniform # 均匀分层采样
         dt sample data.jsonl --fields=question,answer     # 只显示指定字段
+        dt sample data.jsonl --where="category=tech"      # 筛选 category 为 tech 的数据
+        dt sample data.jsonl --where="meta.source~=wiki"  # 筛选 meta.source 包含 wiki
+        dt sample data.jsonl --where="messages.#>=2"      # 筛选消息数量 >= 2
     """
     filepath = Path(filename)
@@ -69,23 +191,46 @@ def sample(
         print("错误: --uniform 必须配合 --by 使用")
         return
+    # 处理 where 筛选
+    where_conditions = where or []
+    filtered_data = None
+    original_count = None
+    if where_conditions:
+        # 有 where 条件时，先加载全部数据再筛选
+        try:
+            all_data = load_data(str(filepath))
+            original_count = len(all_data)
+            filtered_data = _apply_where_filters(all_data, where_conditions)
+            print(f"🔍 筛选: {original_count} → {len(filtered_data)} 条")
+            if not filtered_data:
+                print("⚠️  筛选后无数据")
+                return
+        except ValueError as e:
+            print(f"错误: {e}")
+            return
     # 分层采样模式
     if by:
         try:
-            sampled = _stratified_sample(filepath, num, by, uniform, seed, type)
+            sampled = _stratified_sample(filepath, num, by, uniform, seed, type, data=filtered_data)
         except Exception as e:
             print(f"错误: {e}")
             return
     else:
         # 普通采样
         try:
-            sampled = sample_file(
-                str(filepath),
-                num=num,
-                sample_type=type,
-                seed=seed,
-                output=None,  # 先不保存，统一在最后处理
-            )
+            if filtered_data is not None:
+                # 已筛选的数据，直接采样
+                sampled = _sample_from_list(filtered_data, num, type, seed)
+            else:
+                sampled = sample_file(
+                    str(filepath),
+                    num=num,
+                    sample_type=type,
+                    seed=seed,
+                    output=None,  # 先不保存，统一在最后处理
+                )
         except Exception as e:
             print(f"错误: {e}")
             return
@@ -117,6 +262,7 @@ def _stratified_sample(
     uniform: bool,
     seed: Optional[int],
     sample_type: str,
+    data: Optional[List[Dict]] = None,
 ) -> List[Dict]:
     """
     分层采样实现。
@@ -133,6 +279,7 @@ def _stratified_sample(
         uniform: 是否均匀采样（各组相同数量）
         seed: 随机种子
         sample_type: 采样方式（用于组内采样）
+        data: 预筛选的数据（可选，如果提供则不从文件加载）
     Returns:
         采样后的数据列表
@@ -143,8 +290,9 @@ def _stratified_sample(
     if seed is not None:
         random.seed(seed)
-    # 加载数据
-    data = load_data(str(filepath))
+    # 加载数据（如果没有预筛选数据）
+    if data is None:
+        data = load_data(str(filepath))
     total = len(data)
     if num <= 0 or num > total:

{dtflow-0.5.4.dist-info → dtflow-0.5.6.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dtflow
-Version: 0.5.4
+Version: 0.5.6
 Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
 Project-URL: Homepage, https://github.com/yourusername/DataTransformer
 Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
@@ -423,6 +423,8 @@ dt sample data.csv --num=100 --sample_type=head
 dt sample data.jsonl 1000 --by=category           # 分层采样
 dt sample data.jsonl 1000 --by=meta.source        # 按嵌套字段分层采样
 dt sample data.jsonl 1000 --by=messages.#         # 按消息数量分层采样
+dt sample data.jsonl --where="category=tech"      # 筛选后采样
+dt sample data.jsonl --where="messages.#>=2"      # 多条件筛选
 # 数据转换 - 预设模式
 dt transform data.jsonl --preset=openai_chat
@@ -496,7 +498,7 @@ CLI 命令中的字段参数支持嵌套路径语法，可访问深层嵌套的
 | 命令 | 参数 | 示例 |
 |------|------|------|
-| `sample` | `--by=` | `--by=meta.source`、`--by=messages.#` |
+| `sample` | `--by=`, `--where=` | `--by=meta.source`、`--where=messages.#>=2` |
 | `dedupe` | `--key=` | `--key=meta.id`、`--key=messages[0].content` |
 | `clean` | `--drop-empty=` | `--drop-empty=meta.source` |
 | `clean` | `--min-len=` | `--min-len=messages.#:2` |

{dtflow-0.5.4.dist-info → dtflow-0.5.6.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
-dtflow/__init__.py,sha256=yUwvKuVAmhDnp-1tYhZGlZcTdiEnZ3Jh-IJymgMIUhA,3031
-dtflow/__main__.py,sha256=ySpqvEn7k-vsrYFPx-8O6p-yx_24KccgnOSPd2XybhM,12572
+dtflow/__init__.py,sha256=_KUxZUD08hQhhLugGbjo_jlP5JuMCFAcCs0o0SCCoVM,3031
+dtflow/__main__.py,sha256=OJ60M0PbA0PcsQfA7FP9k9CflJgzexKhIl-yc-CPXkw,12675
 dtflow/converters.py,sha256=X3qeFD7FCOMnfiP3MicL5MXimOm4XUYBs5pczIkudU0,22331
 dtflow/core.py,sha256=qMo6B3LK--TWRK7ZBKObGcs3pKFnd0NPoaM0T8JC7Jw,38135
 dtflow/framework.py,sha256=jyICi_RWHjX7WfsXdSbWmP1SL7y1OWSPyd5G5Y-lvg4,17578
@@ -16,7 +16,7 @@ dtflow/cli/common.py,sha256=gCwnF5Sw2ploqfZJO_z3Ms9mR1HNT7Lj6ydHn0uVaIw,13817
 dtflow/cli/io_ops.py,sha256=BMDisP6dxzzmSjYwmeFwaHmpHHPqirmXAWeNTD-9MQM,13254
 dtflow/cli/lineage.py,sha256=_lNh35nF9AA0Zy6FyZ4g8IzrXH2ZQnp3inF-o2Hs1pw,1383
 dtflow/cli/pipeline.py,sha256=QNEo-BJlaC1CVnVeRZr7TwfuZYloJ4TebIzJ5ALzry0,1426
-dtflow/cli/sample.py,sha256=LRCkpFi9t0CI2QjRKADmvwWMdGfLriqdNkoFG6_wQkY,10497
+dtflow/cli/sample.py,sha256=pubpx4AIzsarBEalD150MC2apYQSt4bal70IZkTfFO0,15475
 dtflow/cli/stats.py,sha256=u4ehCfgw1X8WuOyAjrApMRgcIO3BVmINbsTjxEscQro,24086
 dtflow/cli/transform.py,sha256=w6xqMOxPxQvL2u_BPCfpDHuPSC9gmcqMPVN8s-B6bbY,15052
 dtflow/cli/validate.py,sha256=65aGVlMS_Rq0Ch0YQ-TclVJ03RQP4CnG137wthzb8Ao,4384
@@ -31,7 +31,7 @@ dtflow/utils/__init__.py,sha256=Pn-ltwV04fBQmeZG7FxInDQmzH29LYOi90LgeLMEuQk,506
 dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
 dtflow/utils/field_path.py,sha256=K8nU196RxTSJ1OoieTWGcYOWl9KjGq2iSxCAkfjECuM,7621
 dtflow/utils/helpers.py,sha256=JXN176_B2pm53GLVyZ1wj3wrmBJG52Tkw6AMQSdj7M8,791
-dtflow-0.5.4.dist-info/METADATA,sha256=mQIIV3B-6VBOuNSRiPQjqOwdLTs6Nir6to1_FIER3d0,22544
-dtflow-0.5.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-dtflow-0.5.4.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
-dtflow-0.5.4.dist-info/RECORD,,
+dtflow-0.5.6.dist-info/METADATA,sha256=TPSDq-fQDini8uKERCdm_4cZYw-b9t6V8UQ1MlTJ7iA,22698
+dtflow-0.5.6.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+dtflow-0.5.6.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
+dtflow-0.5.6.dist-info/RECORD,,

{dtflow-0.5.4.dist-info → dtflow-0.5.6.dist-info}/WHEEL RENAMED Viewed

File without changes

{dtflow-0.5.4.dist-info → dtflow-0.5.6.dist-info}/entry_points.txt RENAMED Viewed

File without changes

dtflow 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

dtflow 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl