dtflow 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl

dtflow/cli/stats.py CHANGED
@@ -3,7 +3,7 @@ CLI data statistics commands
 """
 
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 import orjson
 
@@ -22,6 +22,8 @@ def stats(
     filename: str,
     top: int = 10,
     full: bool = False,
+    fields: Optional[List[str]] = None,
+    expand_fields: Optional[List[str]] = None,
 ) -> None:
     """
     Display statistics for a data file.
@@ -33,11 +35,15 @@ def stats(
         filename: Input file path; supports csv/excel/jsonl/json/parquet/arrow/feather
         top: Show the N most frequent values, default 10 (full mode only)
         full: Full mode; report value distributions, unique counts, and other details
+        fields: Fields to compute statistics for (nested paths supported)
+        expand_fields: List-type fields whose elements are expanded before counting
 
     Examples:
         dt stats data.jsonl                          # quick mode (default)
         dt stats data.jsonl --full                   # full mode
         dt stats data.csv -f --top=5                 # full mode, show Top 5
+        dt stats data.jsonl --full --field=category  # a specific field
+        dt stats data.jsonl --full --expand=tags     # expand a list field
     """
     filepath = Path(filename)
 
@@ -48,7 +54,10 @@ def stats(
     if not _check_file_format(filepath):
         return
 
+    # Quick mode: ignore --field and --expand
     if not full:
+        if fields or expand_fields:
+            print("⚠️ Warning: --field and --expand only take effect in full mode (--full)")
         _quick_stats(filepath)
         return
 
@@ -65,7 +74,7 @@ def stats(
     # Compute the statistics
     total = len(data)
-    field_stats = _compute_field_stats(data, top)
+    field_stats = _compute_field_stats(data, top, fields, expand_fields)
 
     # Print the statistics
     _print_stats(filepath.name, total, field_stats)
@@ -205,11 +214,99 @@ def _quick_stats(filepath: Path) -> None:
         print(f" {i}. {f['field']} ({f['type']})")
 
 
-def _compute_field_stats(data: List[Dict], top: int) -> List[Dict[str, Any]]:
+def _extract_with_wildcard(item: dict, field_spec: str) -> List[Any]:
+    """Handle a field path containing [*]; return all matched values."""
+    if "[*]" not in field_spec:
+        # No [*]: return the single value wrapped in a list
+        value = get_field_with_spec(item, field_spec)
+        return [value] if value is not None else []
+
+    # Split the path: messages[*].role -> ("messages", ".role")
+    before, after = field_spec.split("[*]", 1)
+    after = after.lstrip(".")  # strip the leading dot
+
+    # Fetch the array
+    array = get_field_with_spec(item, before) if before else item
+    if not isinstance(array, list):
+        return []
+
+    # Apply the remaining path to each element
+    results = []
+    for elem in array:
+        if after:
+            val = get_field_with_spec(elem, after)
+        else:
+            val = elem
+        if val is not None:
+            results.append(val)
+
+    return results
+
+
+def _extract_field_values(
+    data: List[Dict],
+    field_spec: str,
+    expand: bool = False,
+) -> List[Any]:
+    """
+    Extract field values from the data.
+
+    Args:
+        data: List of records
+        field_spec: Field path spec (e.g. "messages[*].role")
+        expand: Whether to expand lists
+
+    Returns:
+        List of values (expanded or not)
+    """
+    all_values = []
+
+    for item in data:
+        if "[*]" in field_spec or expand:
+            # Use the wildcard helper to extract all values
+            values = _extract_with_wildcard(item, field_spec)
+
+            if expand and len(values) == 1 and isinstance(values[0], list):
+                # Expand mode: a single returned list is flattened
+                all_values.extend(values[0])
+            elif expand and values and isinstance(values[0], list):
+                # Several lists: flatten them all
+                for v in values:
+                    if isinstance(v, list):
+                        all_values.extend(v)
+                    else:
+                        all_values.append(v)
+            else:
+                # No expansion, or non-list values
+                all_values.extend(values)
+        else:
+            # Plain field path
+            value = get_field_with_spec(item, field_spec)
+            if expand and isinstance(value, list):
+                # Expand the list
+                all_values.extend(value)
+            else:
+                all_values.append(value)
+
+    return all_values
+
+
+def _compute_field_stats(
+    data: List[Dict],
+    top: int,
+    fields: Optional[List[str]] = None,
+    expand_fields: Optional[List[str]] = None,
+) -> List[Dict[str, Any]]:
     """
     Compute each field's statistics in a single pass.
 
     Optimization: several passes are merged into one, collecting all statistics while iterating.
+
+    Args:
+        data: List of records
+        top: Number of top values to keep
+        fields: Fields to compute statistics for
+        expand_fields: List-type fields whose elements are expanded before counting
     """
     from collections import Counter, defaultdict
 
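Note: get_field_with_spec above is a dtflow-internal helper. As a self-contained illustration of how a [*] spec resolves against a record, here is a minimal sketch with a simplified dot-path getter standing in for it (get_by_path and the sample record are hypothetical):

from typing import Any, List

def get_by_path(obj: Any, path: str) -> Any:
    """Simplified stand-in for dtflow's internal get_field_with_spec (dot paths only)."""
    for key in path.split("."):
        if not isinstance(obj, dict) or key not in obj:
            return None
        obj = obj[key]
    return obj

def extract_with_wildcard(item: dict, field_spec: str) -> List[Any]:
    """Same splitting logic as _extract_with_wildcard above: one [*] over a list field."""
    if "[*]" not in field_spec:
        value = get_by_path(item, field_spec)
        return [value] if value is not None else []
    before, after = field_spec.split("[*]", 1)
    after = after.lstrip(".")
    array = get_by_path(item, before) if before else item
    if not isinstance(array, list):
        return []
    picked = [get_by_path(elem, after) if after else elem for elem in array]
    return [v for v in picked if v is not None]

record = {"messages": [{"role": "user", "content": "hi"},
                       {"role": "assistant", "content": "hello"}]}
print(extract_with_wildcard(record, "messages[*].role"))  # -> ['user', 'assistant']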
@@ -218,38 +315,115 @@ def _compute_field_stats(data: List[Dict], top: int) -> List[Dict[str, Any]]:
 
     total = len(data)
 
-    # Collect every field's values and statistics in a single pass
-    field_values = defaultdict(list)  # all values per field
-    field_counters = defaultdict(Counter)  # value frequencies per field (for top N)
+    # No fields specified: profile every top-level field (keeps backward compatibility)
+    if not fields and not expand_fields:
+        # Collect every field's values and statistics in a single pass
+        field_values = defaultdict(list)  # all values per field
+        field_counters = defaultdict(Counter)  # value frequencies per field (for top N)
+
+        for item in data:
+            for k, v in item.items():
+                field_values[k].append(v)
+                # Count truncated values (for the top N display)
+                displayable = _truncate(v if v is not None else "", 30)
+                field_counters[k][displayable] += 1
+
+        # Compute the statistics from the collected data
+        stats_list = []
+        for field in sorted(field_values.keys()):
+            values = field_values[field]
+            non_null = [v for v in values if v is not None and v != ""]
+            non_null_count = len(non_null)
+
+            # Infer the type (from the first non-null value)
+            field_type = _infer_type(non_null)
+
+            # Basic statistics
+            stat = {
+                "field": field,
+                "non_null": non_null_count,
+                "null_rate": f"{non_null_count / total * 100:.1f}%",
+                "type": field_type,
+            }
+
+            # Type-specific statistics
+            if non_null:
+                # Unique-value count (complex types are hashed to save memory)
+                stat["unique"] = _count_unique(non_null, field_type)
+
+                # Strings: length statistics
+                if field_type == "str":
+                    lengths = [len(str(v)) for v in non_null]
+                    stat["len_min"] = min(lengths)
+                    stat["len_max"] = max(lengths)
+                    stat["len_avg"] = sum(lengths) / len(lengths)
+
+                # Numbers: numeric statistics
+                elif field_type in ("int", "float"):
+                    nums = [float(v) for v in non_null if _is_numeric(v)]
+                    if nums:
+                        stat["min"] = min(nums)
+                        stat["max"] = max(nums)
+                        stat["avg"] = sum(nums) / len(nums)
+
+                # Lists: length statistics
+                elif field_type == "list":
+                    lengths = [len(v) if isinstance(v, list) else 0 for v in non_null]
+                    stat["len_min"] = min(lengths)
+                    stat["len_max"] = max(lengths)
+                    stat["len_avg"] = sum(lengths) / len(lengths)
+
+                # Top N values (already collected during the pass)
+                stat["top_values"] = field_counters[field].most_common(top)
+
+            stats_list.append(stat)
+
+        return stats_list
+
+    # Fields were specified: collect statistics for just those fields
+    stats_list = []
+    expand_set = set(expand_fields) if expand_fields else set()
 
-    for item in data:
-        for k, v in item.items():
-            field_values[k].append(v)
-            # Count truncated values (for the top N display)
-            displayable = _truncate(v if v is not None else "", 30)
-            field_counters[k][displayable] += 1
+    # Merge the two field lists
+    all_fields = set(fields) if fields else set()
+    all_fields.update(expand_set)
 
-    # Compute the statistics from the collected data
-    stats_list = []
-    for field in sorted(field_values.keys()):
-        values = field_values[field]
+    for field_spec in sorted(all_fields):
+        is_expanded = field_spec in expand_set
+
+        # Extract the field's values
+        values = _extract_field_values(data, field_spec, expand=is_expanded)
+
+        # Filter out None and empty values
         non_null = [v for v in values if v is not None and v != ""]
         non_null_count = len(non_null)
 
-        # Infer the type (from the first non-null value)
+        # Infer the type
        field_type = _infer_type(non_null)
 
         # Basic statistics
-        stat = {
-            "field": field,
-            "non_null": non_null_count,
-            "null_rate": f"{(total - non_null_count) / total * 100:.1f}%",
-            "type": field_type,
-        }
+        if is_expanded:
+            # Expand mode: report the total element count instead of a non-null rate
+            stat = {
+                "field": field_spec,
+                "non_null": non_null_count,
+                "null_rate": f"total elements: {len(values)}",
+                "type": field_type,
+                "is_expanded": is_expanded,
+            }
+        else:
+            # Normal mode: report the non-null rate
+            stat = {
+                "field": field_spec,
+                "non_null": non_null_count,
+                "null_rate": f"{non_null_count / total * 100:.1f}%",
+                "type": field_type,
+                "is_expanded": is_expanded,
+            }
 
         # Type-specific statistics
         if non_null:
-            # Unique-value count (complex types are hashed to save memory)
+            # Unique-value count
             stat["unique"] = _count_unique(non_null, field_type)
 
             # Strings: length statistics
@@ -274,8 +448,12 @@ def _compute_field_stats(data: List[Dict], top: int) -> List[Dict[str, Any]]:
                 stat["len_max"] = max(lengths)
                 stat["len_avg"] = sum(lengths) / len(lengths)
 
-            # Top N values (already collected during the pass)
-            stat["top_values"] = field_counters[field].most_common(top)
+            # Top N values (recounted here)
+            counter = Counter()
+            for v in non_null:
+                displayable = _truncate(v if v is not None else "", 30)
+                counter[displayable] += 1
+            stat["top_values"] = counter.most_common(top)
 
         stats_list.append(stat)
 
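Note: the two stat variants built above overload the null_rate key: in normal mode it carries a non-null percentage, while in expand mode it carries a total-element count. A sketch of the two dict shapes (field names match the diff; the counts are invented):

# Normal mode: null_rate holds the percentage of non-empty records (hypothetical values)
stat_plain = {
    "field": "category",
    "non_null": 980,
    "null_rate": "98.0%",
    "type": "str",
    "is_expanded": False,
}

# Expand mode: null_rate is reused to carry the element count (hypothetical values)
stat_expanded = {
    "field": "tags",
    "non_null": 3214,
    "null_rate": "total elements: 3250",
    "type": "str",
    "is_expanded": True,
}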
@@ -343,9 +521,18 @@ def _print_stats(filename: str, total: int, field_stats: List[Dict[str, Any]]) -
     table.add_column("Stats", style="dim")
 
     for stat in field_stats:
-        non_null_rate = f"{stat['non_null'] / total * 100:.0f}%"
+        # Use null_rate from the stat dict (supports the special expand-mode display)
+        if "null_rate" in stat:
+            non_null_rate = stat["null_rate"]
+        else:
+            non_null_rate = f"{stat['non_null'] / total * 100:.0f}%"
         unique = str(stat.get("unique", "-"))
 
+        # Field name (with an expand marker)
+        field_name = stat["field"]
+        if stat.get("is_expanded"):
+            field_name += " (expanded)"
+
         # Build the stats summary string
         extra = []
         if "len_avg" in stat:
@@ -363,7 +550,7 @@ def _print_stats(filename: str, total: int, field_stats: List[Dict[str, Any]]) -
         )
 
         table.add_row(
-            stat["field"],
+            field_name,
             stat["type"],
             non_null_rate,
             unique,
@@ -387,12 +574,19 @@ def _print_stats(filename: str, total: int, field_stats: List[Dict[str, Any]]) -
         if unique_ratio > 0.9 and stat.get("unique", 0) > 100:
             continue
 
+        # Field name (with an expand marker)
+        field_display = stat["field"]
+        if stat.get("is_expanded"):
+            field_display += " (expanded)"
+
         console.print(
-            f"\n[bold cyan]{stat['field']}[/bold cyan] value distribution (Top {len(top_values)}):"
+            f"\n[bold cyan]{field_display}[/bold cyan] value distribution (Top {len(top_values)}):"
         )
         max_count = max(c for _, c in top_values) if top_values else 1
+        # In expand mode the base is non_null (element count); otherwise total (record count)
+        base_count = stat["non_null"] if stat.get("is_expanded") else total
         for value, count in top_values:
-            pct = count / total * 100
+            pct = count / base_count * 100 if base_count > 0 else 0
             bar_len = int(count / max_count * 20)  # relative scale, 20 chars max
             bar = "█" * bar_len
             display_value = value if value else "[empty]"
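Note: the base_count switch above changes what the distribution percentages mean for list fields, where one record can contribute several elements. A small arithmetic sketch with invented numbers:

# 1,000 records; "tags" expands to 3,250 elements, 900 of them "python"
total, elements, count = 1000, 3250, 900

pct_vs_records  = count / total * 100     # 90.0; for list fields this can even exceed 100
pct_vs_elements = count / elements * 100  # about 27.7: share of all tag elements (expand mode)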
dtflow/eval.py ADDED
@@ -0,0 +1,276 @@
+"""
+Evaluation metrics module.
+
+Metric computation and report export for classification tasks:
+- MetricsCalculator: computes accuracy/precision/recall/F1/confusion matrix
+- export_eval_report: writes metrics.md + result.jsonl + bad_case.jsonl
+
+Dependencies: scikit-learn, pandas
+Install: pip install dtflow[eval]
+"""
+
+import os
+from datetime import datetime
+from pathlib import Path
+from typing import TYPE_CHECKING, Optional
+
+if TYPE_CHECKING:
+    from pandas import DataFrame
+
+
+def _check_eval_deps():
+    """Check that the eval dependencies are installed."""
+    try:
+        import pandas  # noqa: F401
+        import sklearn  # noqa: F401
+    except ImportError as e:
+        missing = str(e).split("'")[1] if "'" in str(e) else str(e)
+        raise ImportError(
+            f"The eval feature needs extra dependencies: {missing}\n" f"Please run: pip install dtflow[eval]"
+        ) from e
+
+
+class MetricsCalculator:
+    """Classification metrics calculator.
+
+    Uses sklearn to compute accuracy/precision/recall/F1, the confusion matrix, and the classification report.
+
+    Args:
+        df: DataFrame containing the prediction and label columns
+        pred_col: Name of the prediction column
+        label_col: Name of the label column
+        include_macro_micro_avg: Whether to include macro/micro averages in the report
+        remove_matrix_zero_row: Whether to drop confusion-matrix rows whose support is 0
+    """
+
+    def __init__(
+        self,
+        df: "DataFrame",
+        pred_col: str = "predict",
+        label_col: str = "label",
+        include_macro_micro_avg: bool = False,
+        remove_matrix_zero_row: bool = False,
+    ):
+        _check_eval_deps()
+        self.df = df
+        self.y_pred = df[pred_col]
+        self.y_true = df[label_col]
+        self.all_labels = sorted(set(self.y_true.unique()).union(set(self.y_pred.unique())))
+        self.needed_labels = None
+        self.remove_matrix_zero_row = remove_matrix_zero_row
+        self.include_macro_micro_avg = include_macro_micro_avg
+        self.metrics = self._calculate_metrics()
+
+    def _calculate_metrics(self):
+        from sklearn.metrics import (
+            accuracy_score,
+            classification_report,
+            confusion_matrix,
+            precision_score,
+            recall_score,
+        )
+
+        accuracy = accuracy_score(self.y_true, self.y_pred)
+        precision = precision_score(
+            self.y_true, self.y_pred, labels=self.all_labels, average="weighted", zero_division=0
+        )
+        recall = recall_score(
+            self.y_true, self.y_pred, labels=self.all_labels, average="weighted", zero_division=0
+        )
+        conf_matrix = confusion_matrix(self.y_true, self.y_pred, labels=self.all_labels)
+        report = classification_report(
+            self.y_true, self.y_pred, labels=self.all_labels, output_dict=True, zero_division=0
+        )
+
+        # By default keep only the weighted average
+        if not self.include_macro_micro_avg:
+            report = {
+                label: metrics
+                for label, metrics in report.items()
+                if label in self.all_labels or label == "weighted avg"
+            }
+
+        # Drop classes with support=0 (note: accuracy is a float, not a dict)
+        report = {
+            label: metrics
+            for label, metrics in report.items()
+            if isinstance(metrics, dict) and metrics.get("support", 0) > 0
+        }
+
+        self.needed_labels = [label for label in report.keys() if label in self.all_labels]
+
+        # Optionally drop unneeded rows from the confusion matrix
+        needed_idx_list = [self.all_labels.index(label) for label in self.needed_labels]
+        if self.remove_matrix_zero_row:
+            conf_matrix = conf_matrix[needed_idx_list]
+
+        return {
+            "accuracy": accuracy,
+            "precision": precision,
+            "recall": recall,
+            "confusion_matrix": conf_matrix,
+            "classification_report": report,
+        }
+
+    def get_metrics(self):
+        return self.metrics
+
+    def format_classification_report_as_markdown(self):
+        """Format the classification report as a Markdown table."""
+        report = self.metrics["classification_report"]
+        header = "| Label | Precision | Recall | F1-score | Support |\n"
+        separator = "|-------|-----------|--------|----------|---------|\n"
+        rows = []
+        for label, metrics in report.items():
+            if isinstance(metrics, dict):
+                rows.append(
+                    f"| {label} | {metrics['precision']:.2f} | {metrics['recall']:.2f} "
+                    f"| {metrics['f1-score']:.2f} | {metrics['support']:.0f} |"
+                )
+        return header + separator + "\n".join(rows)
+
+    def _clean_label_for_markdown(self, label, max_length=20):
+        """Sanitize a label so it renders safely inside a Markdown table."""
+        label = str(label).replace("\n", " ")
+        label = label.replace("|", "\\|")
+        label = label.replace("-", "\\-")
+        label = label.replace("<", "&lt;")
+        label = label.replace(">", "&gt;")
+        if len(label) > max_length:
+            label = label[:max_length] + "..."
+        label = label.strip()
+        if not label:
+            label = "(empty)"
+        return label
+
+    def format_confusion_matrix_as_markdown(self, max_label_length=20):
+        """Format the confusion matrix as a Markdown table."""
+        matrix = self.metrics["confusion_matrix"]
+
+        if self.remove_matrix_zero_row:
+            labels = self.needed_labels
+        else:
+            labels = self.all_labels
+
+        processed_labels = [self._clean_label_for_markdown(lb, max_label_length) for lb in labels]
+
+        header = "| True / Predicted | " + " | ".join(processed_labels) + " |\n"
+        separator_parts = [":---:"] * (len(processed_labels) + 1)
+        separator = "| " + " | ".join(separator_parts) + " |\n"
+
+        rows = []
+        for i, row in enumerate(matrix):
+            row_label = self._clean_label_for_markdown(labels[i], max_label_length)
+            formatted_row = [f"{num:,}" for num in row]
+            rows.append(f"| {row_label} | " + " | ".join(formatted_row) + " |")
+
+        return header + separator + "\n".join(rows)
+
+
+def export_eval_report(
+    df: "DataFrame",
+    pred_col: str,
+    label_col: str,
+    record_folder: str = "record",
+    input_name: Optional[str] = None,
+):
+    """Generate an evaluation report and save it under the given directory.
+
+    Output files:
+    - metrics.md: metrics overview + classification report + confusion matrix
+    - result.jsonl: full prediction results
+    - bad_case.jsonl: misclassified samples
+
+    Args:
+        df: DataFrame containing predictions and labels
+        pred_col: Name of the prediction column
+        label_col: Name of the label column
+        record_folder: Output root directory
+        input_name: Input file name (used to name the subdirectory)
+    """
+    from rich.console import Console
+    from rich.markdown import Markdown
+
+    calculator = MetricsCalculator(df, pred_col=pred_col, label_col=label_col)
+    metrics = calculator.get_metrics()
+
+    # Build the metrics overview with a Rich Table (replaces tabulate)
+    from rich.table import Table
+
+    overview_table = Table(title="Metrics Overview", show_header=True)
+    overview_table.add_column("Accuracy", justify="center")
+    overview_table.add_column("Precision", justify="center")
+    overview_table.add_column("Recall", justify="center")
+    overview_table.add_row(
+        f"{metrics['accuracy']:.4f}",
+        f"{metrics['precision']:.4f}",
+        f"{metrics['recall']:.4f}",
+    )
+
+    # Build the Markdown report body
+    md = (
+        f"\n\n### Metrics Overview\n\n"
+        f"| Accuracy | Precision | Recall |\n"
+        f"|----------|-----------|--------|\n"
+        f"| {metrics['accuracy']:.4f} | {metrics['precision']:.4f} | {metrics['recall']:.4f} |"
+    )
+    metrics_md = calculator.format_classification_report_as_markdown()
+    confusion_md = calculator.format_confusion_matrix_as_markdown()
+    md += f"\n\n### Classification Report\n{metrics_md}\n" f"\n### Confusion Matrix\n{confusion_md}"
+
+    # Create the output directory (indexed and timestamped)
+    now = datetime.now().strftime("%Y%m%d-%H-%M-%S")
+    record_path = Path(record_folder)
+    if input_name:
+        record_path = record_path / input_name
+
+    if record_path.exists():
+        existing = [d.name for d in record_path.iterdir() if d.is_dir()]
+        max_idx = 0
+        for name in existing:
+            parts = name.split("-", 1)
+            if parts[0].isdigit():
+                max_idx = max(max_idx, int(parts[0]))
+        idx = max_idx + 1
+    else:
+        idx = 1
+
+    record_path = record_path / f"{idx}-{now}"
+    record_path.mkdir(parents=True, exist_ok=True)
+
+    # Terminal output
+    console = Console()
+    console.print(overview_table)
+    console.print(Markdown(md))
+
+    # Save the report file
+    with open(os.path.join(record_path, "metrics.md"), "w", encoding="utf-8") as f:
+        f.write(md)
+
+    bad_case_df = df[df[pred_col] != df[label_col]]
+
+    # Save JSONL
+    df.to_json(
+        os.path.join(record_path, "result.jsonl"),
+        orient="records",
+        lines=True,
+        force_ascii=False,
+    )
+    bad_case_df.to_json(
+        os.path.join(record_path, "bad_case.jsonl"),
+        orient="records",
+        lines=True,
+        force_ascii=False,
+    )
+
+    # Try to also save CSV
+    try:
+        df.to_csv(os.path.join(record_path, "result.csv"), index=False)
+        bad_case_df.to_csv(os.path.join(record_path, "bad_case.csv"), index=False)
+    except Exception:
+        pass
+
+    console.print(f"\n[green]Report saved to: {record_path}[/green]")
+    console.print(f"[dim]  - metrics.md ({len(df)} records, {len(bad_case_df)} errors)[/dim]")
+
+    return record_path
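Note: a hedged end-to-end usage sketch for the new module (the dtflow.eval import path follows the file location above; the column names match the defaults, and the data is invented). Assumes pip install dtflow[eval]:

import pandas as pd

from dtflow.eval import MetricsCalculator, export_eval_report

# Toy data: four predictions, one of them wrong (invented for illustration)
df = pd.DataFrame({
    "label":   ["pos", "neg", "pos", "neu"],
    "predict": ["pos", "pos", "pos", "neu"],
})

# Metrics only
calc = MetricsCalculator(df, pred_col="predict", label_col="label")
print(calc.get_metrics()["accuracy"])  # 0.75

# Full report: writes metrics.md / result.jsonl / bad_case.jsonl
# under record/demo/<idx>-<timestamp>/
out_dir = export_eval_report(df, pred_col="predict", label_col="label", input_name="demo")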