dtflow 0.5.8__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtflow/SKILL.md +22 -8
- dtflow/__init__.py +1 -1
- dtflow/__main__.py +108 -14
- dtflow/cli/clean.py +90 -1
- dtflow/cli/commands.py +17 -1
- dtflow/cli/eval.py +288 -0
- dtflow/cli/export.py +81 -0
- dtflow/cli/sample.py +90 -3
- dtflow/cli/split.py +138 -0
- dtflow/cli/stats.py +10 -23
- dtflow/cli/validate.py +19 -52
- dtflow/eval.py +276 -0
- dtflow/schema.py +13 -99
- dtflow/tokenizers.py +21 -104
- dtflow/utils/text_parser.py +124 -0
- {dtflow-0.5.8.dist-info → dtflow-0.5.9.dist-info}/METADATA +29 -3
- {dtflow-0.5.8.dist-info → dtflow-0.5.9.dist-info}/RECORD +19 -15
- dtflow/parallel.py +0 -115
- {dtflow-0.5.8.dist-info → dtflow-0.5.9.dist-info}/WHEEL +0 -0
- {dtflow-0.5.8.dist-info → dtflow-0.5.9.dist-info}/entry_points.txt +0 -0
dtflow/cli/eval.py
ADDED
@@ -0,0 +1,288 @@
+"""
+CLI eval command implementation
+
+Parses model output and computes metrics; supports two-stage parsing and pipelined extraction.
+"""
+
+import json
+import re
+from pathlib import Path
+from typing import Optional
+
+from rich.console import Console
+
+from ..storage.io import load_data
+from ..utils.field_path import get_field
+from ..utils.text_parser import extract_code_snippets, parse_generic_tags, strip_think_tags
+
+console = Console()
+
+# Candidate field names for label auto-detection
+LABEL_CANDIDATES = ["label", "labels", "content_label", "target", "ground_truth", "answer"]
+
+
+def eval(
+    result_file: str,
+    source: Optional[str] = None,
+    response_col: str = "content",
+    label_col: Optional[str] = None,
+    extract: str = "direct",
+    sep: Optional[str] = None,
+    mapping: Optional[str] = None,
+    output_dir: str = "record",
+):
+    """Parse a model-output .jsonl file and compute metrics.
+
+    Two-stage parsing flow:
+        Stage 1 (automatic): strip <think>...</think>, extract ```...``` code blocks
+        Stage 2 (set via --extract): pipelined extraction
+
+    Args:
+        result_file: path to the model-output .jsonl file
+        source: original input file, merged by row alignment (used when result_file has no label)
+        response_col: field holding the model response (nested paths supported, e.g. api_output.content)
+        label_col: label field name (auto-detected when omitted; nested paths supported)
+        extract: pipelined extraction rules, operators separated by " | "
+        sep: separator used together with the index operator
+        mapping: value mapping in "k1:v1,k2:v2" form
+        output_dir: output directory for the metrics report
+    """
+    import pandas as pd
+
+    from ..eval import export_eval_report
+
+    # --- Load data ---
+    data = load_data(result_file)
+    df = pd.DataFrame(data)
+    console.print(f"[cyan]Loaded {result_file}: {len(df)} rows[/cyan]")
+
+    # Merge the source file
+    if source:
+        source_data = load_data(source)
+        source_df = pd.DataFrame(source_data)
+        if len(source_df) != len(df):
+            console.print(f"[red]Row count mismatch: result={len(df)}, source={len(source_df)}[/red]")
+            return
+        for col in source_df.columns:
+            if col not in df.columns:
+                df[col] = source_df[col].values
+        console.print(f"[dim]Merged source file: {source}[/dim]")
+
+    # --- Resolve response_col (nested paths supported) ---
+    response_col_resolved = _resolve_nested_col(df, response_col)
+    if response_col_resolved is None:
+        console.print(f"[red]Response column '{response_col}' does not exist. Available columns: {list(df.columns)}[/red]")
+        return
+
+    # --- Auto-detect label_col ---
+    if label_col is None:
+        label_col = _auto_detect_label_col(df)
+        if label_col is None:
+            console.print(
+                f"[red]No label column found; specify one with --label-col. Available columns: {list(df.columns)}[/red]"
+            )
+            return
+
+    # Resolve label_col (nested paths supported)
+    label_col_resolved = _resolve_nested_col(df, label_col)
+    if label_col_resolved is None:
+        console.print(f"[red]Label column '{label_col}' does not exist. Available columns: {list(df.columns)}[/red]")
+        return
+
+    console.print(
+        f"[dim]response_col={response_col_resolved}, "
+        f"label_col={label_col_resolved}, extract={extract}[/dim]"
+    )
+
+    # --- Stages 1+2: parse ---
+    ops = _parse_pipeline(extract)
+    pred_col = "__pred__"
+    df[pred_col] = df[response_col_resolved].apply(
+        lambda x: _run_pipeline(_stage1_clean(x), ops, sep)
+    )
+
+    # --- Mapping stage ---
+    if mapping:
+        m = _parse_mapping(mapping)
+        priority = {}
+        for i, v in enumerate(m.values()):
+            priority[v] = i
+
+        def map_value(x):
+            if isinstance(x, list):
+                mapped = [m.get(v, v) for v in x]
+                return max(mapped, key=lambda v: priority.get(v, -1))
+            return m.get(x, x) if isinstance(x, str) else m.get(str(x), x)
+
+        df[pred_col] = df[pred_col].apply(map_value)
+        df[label_col_resolved] = df[label_col_resolved].apply(map_value)
+
+    # Normalize values to stripped strings
+    df[pred_col] = df[pred_col].apply(
+        lambda x: str(x).strip() if not isinstance(x, str) else x.strip()
+    )
+    df[label_col_resolved] = df[label_col_resolved].apply(
+        lambda x: str(x).strip() if not isinstance(x, str) else x.strip()
+    )
+
+    # --- Call export_eval_report ---
+    console.print("\n[bold green]Evaluation results[/bold green]")
+    input_name = Path(result_file).stem
+    export_eval_report(
+        df,
+        pred_col=pred_col,
+        label_col=label_col_resolved,
+        record_folder=output_dir,
+        input_name=input_name,
+    )
+
+
+# ============ Internal helpers ============
+
+
+def _resolve_nested_col(df, col_name: str) -> Optional[str]:
+    """Resolve a nested field path and expand it into a new DataFrame column.
+
+    Uses dtflow's get_field() to support the full nested-path syntax.
+
+    Returns:
+        The resolved column name, or None if the field does not exist.
+    """
+    # Simple case: the column name exists directly
+    if col_name in df.columns:
+        return col_name
+
+    # Try a nested path
+    if "." not in col_name and "[" not in col_name:
+        return None
+
+    # Probe the first non-empty row with get_field
+    sample_row = None
+    for _, row in df.iterrows():
+        row_dict = row.to_dict()
+        val = get_field(row_dict, col_name)
+        if val is not None:
+            sample_row = row_dict
+            break
+
+    if sample_row is None:
+        return None
+
+    # Expand the nested field into a new column
+    resolved_name = col_name.replace(".", "__").replace("[", "_").replace("]", "")
+    df[resolved_name] = df.apply(lambda row: get_field(row.to_dict(), col_name), axis=1)
+    return resolved_name
+
+
+def _auto_detect_label_col(df) -> Optional[str]:
+    """Auto-detect the label column."""
+    # Prefer top-level columns
+    for c in LABEL_CANDIDATES:
+        if c in df.columns:
+            return c
+
+    # Search nested keys of dict-typed columns
+    for col in df.columns:
+        non_null = df[col].dropna()
+        if non_null.empty:
+            continue
+        sample = non_null.iloc[0]
+        if isinstance(sample, dict):
+            for c in LABEL_CANDIDATES:
+                if c in sample:
+                    return f"{col}.{c}"
+
+    return None
+
+
+def _stage1_clean(text) -> str:
+    """Stage 1: automatic cleanup (strip chain-of-thought, extract code blocks)."""
+    if not isinstance(text, str):
+        return str(text) if text is not None else ""
+    text = strip_think_tags(text)
+    snippets = extract_code_snippets(text)
+    if snippets:
+        return snippets[-1]["code"]
+    return text.strip()
+
+
+def _parse_pipeline(extract_str: str) -> list:
+    """Parse a pipeline expression, splitting on ' | '."""
+    return [op.strip() for op in extract_str.split(" | ") if op.strip()]
+
+
+def _apply_op(text: str, op: str, sep: Optional[str] = None) -> str:
+    """Apply a single operator to a single string."""
+    if op == "direct":
+        return text
+    elif op.startswith("tag:"):
+        tag_name = op[4:]
+        tags = parse_generic_tags(text)
+        return tags.get(tag_name, text)
+    elif op.startswith("json_key:"):
+        key = op[9:]
+        try:
+            obj = json.loads(text)
+        except Exception:
+            return text
+        if isinstance(obj, dict):
+            return str(obj.get(key, text))
+        return text
+    elif op.startswith("index:"):
+        idx = int(op[6:])
+        delimiter = sep if sep else ","
+        parts = text.split(delimiter)
+        if 0 <= idx < len(parts):
+            return parts[idx].strip()
+        return text
+    elif op.startswith("line:"):
+        n = int(op[5:])
+        text_lines = [line.strip() for line in text.splitlines() if line.strip()]
+        if text_lines and -len(text_lines) <= n < len(text_lines):
+            return text_lines[n]
+        return text
+    elif op.startswith("regex:"):
+        pattern = op[6:]
+        m = re.search(pattern, text)
+        if m:
+            return m.group(1) if m.lastindex else m.group(0)
+        return text
+    else:
+        console.print(f"[yellow]Unknown operator: {op}, skipping[/yellow]")
+        return text
+
+
+def _run_pipeline(text: str, ops: list, sep: Optional[str] = None):
+    """Run the pipeline, handling 'lines' fan-out."""
+    if "lines" in ops:
+        pos = ops.index("lines")
+        # Operators before 'lines' run first
+        for op in ops[:pos]:
+            text = _apply_op(text, op, sep)
+        # Fan out into lines
+        items = [line.strip() for line in text.splitlines() if line.strip()]
+        # Each line runs through the remaining pipeline independently
+        rest_ops = ops[pos + 1 :]
+        results = []
+        for item in items:
+            for op in rest_ops:
+                item = _apply_op(item, op, sep)
+            results.append(item)
+        if not results:
+            return text
+        return results if len(results) > 1 else results[0]
+    else:
+        for op in ops:
+            text = _apply_op(text, op, sep)
+        return text
+
+
+def _parse_mapping(mapping_str: str) -> dict:
+    """Parse a mapping in 'k1:v1,k2:v2' form."""
+    m = {}
+    for pair in mapping_str.split(","):
+        pair = pair.strip()
+        if ":" in pair:
+            k, v = pair.split(":", 1)
+            m[k.strip()] = v.strip()
+    return m
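
The pipeline semantics are easiest to see by driving the helpers above directly. A minimal sketch, assuming the stage-1 cleaners behave as their docstrings describe; the helpers are underscore-private, so this is for illustration, not a supported API:

# Sketch: exercising the private helpers added in dtflow/cli/eval.py above.
from dtflow.cli.eval import _parse_mapping, _parse_pipeline, _run_pipeline, _stage1_clean

raw = "<think>reasoning...</think>\n```\nverdict: pos, 0.92\n```"

# Stage 1 strips the <think> block and keeps the last fenced code block.
cleaned = _stage1_clean(raw)  # expected: "verdict: pos, 0.92"

# Stage 2 runs operators left to right; " | " is the separator.
ops = _parse_pipeline("regex:verdict: (.+) | index:0")
pred = _run_pipeline(cleaned, ops)  # expected: "pos"

# --mapping then canonicalizes values; for list predictions (after a
# "lines" fan-out) the value whose mapped form appears latest in the
# mapping string wins, per the priority dict built in eval().
m = _parse_mapping("pos:positive,neg:negative")  # {"pos": "positive", "neg": "negative"}
priority = {v: i for i, v in enumerate(m.values())}
picked = max([m.get(v, v) for v in ["pos", "neg"]],
             key=lambda v: priority.get(v, -1))  # "negative"
print(cleaned, pred, picked)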
dtflow/cli/export.py
ADDED
@@ -0,0 +1,81 @@
+"""
+CLI training-framework export command
+"""
+
+from pathlib import Path
+from typing import Optional
+
+from ..core import DataTransformer
+from ..framework import check_compatibility, detect_format, export_for
+from .common import _check_file_format
+
+
+def export(
+    filename: str,
+    framework: str,
+    output: Optional[str] = None,
+    name: Optional[str] = None,
+    check: bool = False,
+) -> None:
+    """
+    Export data to a training framework (LLaMA-Factory, ms-swift, Axolotl).
+
+    Args:
+        filename: input file path
+        framework: target framework (llama-factory, swift, axolotl)
+        output: output directory (defaults to {stem}_{framework}/)
+        name: dataset name (defaults to custom_dataset)
+        check: only run the compatibility check, do not export
+    """
+    filepath = Path(filename)
+
+    if not filepath.exists():
+        print(f"Error: file not found - {filename}")
+        return
+
+    if not _check_file_format(filepath):
+        return
+
+    # Load data
+    print(f"📊 Loading data: {filepath}")
+    try:
+        dt = DataTransformer.load(str(filepath))
+    except Exception as e:
+        print(f"Error: failed to read file - {e}")
+        return
+
+    data = dt.data
+    total = len(data)
+    print(f"  {total} records")
+
+    # Detect the format
+    fmt = detect_format(data)
+    print(f"📋 Detected format: {fmt}")
+
+    # Compatibility check
+    result = check_compatibility(data, framework)
+    print(f"\n{result}")
+
+    if check:
+        return
+
+    if not result.valid:
+        print("\n❌ Compatibility check failed, skipping export")
+        return
+
+    # Determine the output directory
+    if output is None:
+        fw_short = framework.lower().replace("-", "_")
+        output = str(filepath.parent / f"{filepath.stem}_{fw_short}")
+
+    dataset_name = name or "custom_dataset"
+
+    # Run the export
+    print(f"\n📦 Exporting to {framework}...")
+    try:
+        export_for(data, framework, output, dataset_name=dataset_name)
+    except Exception as e:
+        print(f"Error: export failed - {e}")
+        return
+
+    print(f"\n✅ Export complete! Files saved to: {output}")
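
The new command is also callable as a plain function. A hedged usage sketch, assuming dtflow 0.5.9 is installed; the file and dataset names here are hypothetical:

# Hypothetical usage of the new export command (file/dataset names made up).
from dtflow.cli.export import export

# check=True prints the detected format and the compatibility report, then stops.
export("train.jsonl", "llama-factory", check=True)

# A full export writes to train_llama_factory/ by default; pass `name`
# to override the default dataset name "custom_dataset".
export("train.jsonl", "llama-factory", name="my_sft_data")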
dtflow/cli/sample.py
CHANGED
@@ -143,7 +143,7 @@ def sample(
     by: Optional[str] = None,
     uniform: bool = False,
     fields: Optional[str] = None,
-    raw: bool =
+    raw: bool = True,
     where: Optional[List[str]] = None,
 ) -> None:
     """
@@ -389,7 +389,7 @@ def head(
     num: int = 10,
     output: Optional[str] = None,
     fields: Optional[str] = None,
-    raw: bool =
+    raw: bool = True,
 ) -> None:
     """
     Show the first N records of a file (shortcut for dt sample --type=head).
@@ -415,12 +415,99 @@
     sample(filename, num=num, type="head", output=output, fields=fields, raw=raw)
 
 
+def slice_data(
+    filename: str,
+    range_str: str,
+    output: Optional[str] = None,
+    fields: Optional[str] = None,
+    raw: bool = True,
+) -> None:
+    """
+    View data by row-number range (Python slice syntax).
+
+    Args:
+        filename: input file path
+        range_str: row range in start:end form (0-based, half-open)
+            - 10:20   rows 10-19 (10 records)
+            - :100    first 100 rows
+            - 100:    row 100 through the end
+            - -10:    last 10 rows
+        output: output file path
+        fields: only show the given fields (comma-separated)
+        raw: print raw JSON
+
+    Examples:
+        dt slice data.jsonl 10:20
+        dt slice data.jsonl :100
+        dt slice data.jsonl 100:
+        dt slice data.jsonl -10:
+        dt slice data.jsonl 10:20 --output=sliced.jsonl
+        dt slice data.jsonl 10:20 --fields=question,answer
+    """
+    filepath = Path(filename)
+
+    if not filepath.exists():
+        print(f"Error: file not found - {filename}")
+        return
+
+    if not _check_file_format(filepath):
+        return
+
+    # Parse the range
+    if ":" not in range_str:
+        print(f"Error: invalid range format '{range_str}'; expected start:end (e.g. 10:20)")
+        return
+
+    parts = range_str.split(":", 1)
+    start_str, end_str = parts[0].strip(), parts[1].strip()
+
+    try:
+        start = int(start_str) if start_str else None
+        end = int(end_str) if end_str else None
+    except ValueError:
+        print(f"Error: invalid range format '{range_str}'; start and end must be integers")
+        return
+
+    # Load the data and slice it
+    try:
+        data = load_data(str(filepath))
+    except Exception as e:
+        print(f"Error: {e}")
+        return
+
+    sliced = data[start:end]
+
+    if not sliced:
+        total = len(data)
+        print(f"⚠️ Range [{range_str}] matched no data (file has {total} rows)")
+        return
+
+    # Show range info
+    total = len(data)
+    actual_start = start if start is not None else 0
+    if actual_start < 0:
+        actual_start = max(0, total + actual_start)
+    actual_end = min(end, total) if end is not None else total
+    print(f"📍 Rows {actual_start}-{actual_end - 1} ({len(sliced)} records, file has {total} rows)")
+
+    # Emit the result
+    if output:
+        save_data(sliced, output)
+        print(f"Saved {len(sliced)} records to {output}")
+    elif raw:
+        for item in sliced:
+            print(orjson.dumps(item, option=orjson.OPT_INDENT_2).decode("utf-8"))
+    else:
+        field_list = _parse_field_list(fields) if fields else None
+        _print_samples(sliced, filepath.name, total, field_list, filepath.stat().st_size)
+
+
 def tail(
     filename: str,
     num: int = 10,
     output: Optional[str] = None,
     fields: Optional[str] = None,
-    raw: bool =
+    raw: bool = True,
 ) -> None:
     """
     Show the last N records of a file (shortcut for dt sample --type=tail).
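
Because empty segments parse to None, `sliced = data[start:end]` inherits Python's slice semantics unchanged. A self-contained sketch of just the range parsing (re-implemented here for illustration, not imported from dtflow):

# Illustration: how slice_data's range strings map onto Python slices.
def parse_range(range_str: str):
    start_str, end_str = (p.strip() for p in range_str.split(":", 1))
    return (int(start_str) if start_str else None,
            int(end_str) if end_str else None)

data = list(range(120))  # stand-in for 120 loaded rows
for spec in ("10:20", ":100", "100:", "-10:"):
    start, end = parse_range(spec)
    print(f"{spec!r:8} -> {len(data[start:end])} rows")
# '10:20' -> 10 rows, ':100' -> 100 rows, '100:' -> 20 rows, '-10:' -> 10 rows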
dtflow/cli/split.py
ADDED
@@ -0,0 +1,138 @@
+"""
+CLI dataset split command
+"""
+
+from pathlib import Path
+from typing import List, Optional
+
+from ..core import DataTransformer
+from ..storage.io import save_data
+from .common import _check_file_format
+
+
+def _parse_ratio(ratio_str: str) -> List[float]:
+    """
+    Parse the ratio argument.
+
+    - "0.8" -> [0.8, 0.2] (two-way)
+    - "0.8,0.1,0.1" -> [0.8, 0.1, 0.1] (three-way)
+    """
+    parts = [float(x.strip()) for x in ratio_str.split(",")]
+
+    if len(parts) == 1:
+        if not (0 < parts[0] < 1):
+            raise ValueError(f"Ratio must be between 0 and 1: {parts[0]}")
+        parts.append(round(1 - parts[0], 10))
+
+    total = sum(parts)
+    if abs(total - 1.0) > 1e-6:
+        raise ValueError(f"Ratios must sum to 1.0, got {total}")
+
+    if any(p <= 0 for p in parts):
+        raise ValueError("Every ratio must be greater than 0")
+
+    return parts
+
+
+# Split names: two-way uses train/test; three-way and above use train/val/test/part4/part5...
+_SPLIT_NAMES_2 = ["train", "test"]
+_SPLIT_NAMES_3 = ["train", "val", "test"]
+
+
+def _get_split_names(count: int) -> List[str]:
+    """Return split names for the given part count."""
+    if count == 2:
+        return _SPLIT_NAMES_2
+    elif count == 3:
+        return _SPLIT_NAMES_3
+    else:
+        names = ["train", "val", "test"]
+        for i in range(3, count):
+            names.append(f"part{i + 1}")
+        return names
+
+
+def split(
+    filename: str,
+    ratio: str = "0.8",
+    seed: Optional[int] = None,
+    output: Optional[str] = None,
+) -> None:
+    """
+    Split a dataset into train/test (or train/val/test).
+
+    Args:
+        filename: input file path
+        ratio: split ratio, e.g. "0.8" or "0.7,0.15,0.15"
+        seed: random seed
+        output: output directory (defaults to the input file's directory)
+    """
+    filepath = Path(filename)
+
+    if not filepath.exists():
+        print(f"Error: file not found - {filename}")
+        return
+
+    if not _check_file_format(filepath):
+        return
+
+    # Parse the ratios
+    try:
+        ratios = _parse_ratio(ratio)
+    except ValueError as e:
+        print(f"Error: {e}")
+        return
+
+    split_names = _get_split_names(len(ratios))
+
+    # Load data
+    print(f"📊 Loading data: {filepath}")
+    try:
+        dt = DataTransformer.load(str(filepath))
+    except Exception as e:
+        print(f"Error: failed to read file - {e}")
+        return
+
+    total = len(dt)
+    print(f"  {total} records")
+
+    # Shuffle
+    shuffled = dt.shuffle(seed)
+    if seed is not None:
+        print(f"🎲 Random seed: {seed}")
+
+    # Compute cut points
+    data = shuffled.data
+    split_indices = []
+    acc = 0
+    for r in ratios[:-1]:
+        acc += int(total * r)
+        split_indices.append(acc)
+
+    # Slice the data
+    parts = []
+    prev = 0
+    for idx in split_indices:
+        parts.append(data[prev:idx])
+        prev = idx
+    parts.append(data[prev:])
+
+    # Determine the output directory
+    if output:
+        output_dir = Path(output)
+        output_dir.mkdir(parents=True, exist_ok=True)
+    else:
+        output_dir = filepath.parent
+
+    # Save each part
+    stem = filepath.stem
+    ext = filepath.suffix
+
+    print(f"\n🔀 Split ratios: {' / '.join(f'{r:.0%}' for r in ratios)}")
+    for i, (name, part) in enumerate(zip(split_names, parts)):
+        output_path = output_dir / f"{stem}_{name}{ext}"
+        save_data(part, str(output_path))
+        pct = ratios[i] * 100
+        print(f"  {name}: {len(part)} records ({pct:.1f}%) -> {output_path}")
+
+    print(f"\n✅ Done! Split into {len(ratios)} parts")
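
One consequence of computing cut points with int() truncation is that rounding loss accumulates in the last split rather than being redistributed. A small sketch of the arithmetic (the row count is an arbitrary example):

# Illustration: split's cut-point arithmetic for 1003 rows at 0.7,0.15,0.15.
total = 1003
ratios = [0.7, 0.15, 0.15]

split_indices, acc = [], 0
for r in ratios[:-1]:
    acc += int(total * r)  # int() truncates, never rounds up
    split_indices.append(acc)

sizes, prev = [], 0
for idx in split_indices + [total]:
    sizes.append(idx - prev)
    prev = idx
print(sizes)  # [702, 150, 151]: the final part absorbs the remainder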