dtflow 0.4.3__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtflow/__init__.py +34 -1
- dtflow/__main__.py +22 -0
- dtflow/cli/commands.py +5 -0
- dtflow/cli/common.py +13 -9
- dtflow/cli/stats.py +114 -36
- dtflow/cli/validate.py +152 -0
- dtflow/core.py +220 -10
- dtflow/framework.py +610 -0
- dtflow/lineage.py +17 -0
- dtflow/schema.py +508 -0
- dtflow/streaming.py +93 -35
- dtflow/tokenizers.py +84 -29
- dtflow/utils/field_path.py +6 -2
- {dtflow-0.4.3.dist-info → dtflow-0.5.2.dist-info}/METADATA +117 -2
- {dtflow-0.4.3.dist-info → dtflow-0.5.2.dist-info}/RECORD +17 -14
- {dtflow-0.4.3.dist-info → dtflow-0.5.2.dist-info}/WHEEL +0 -0
- {dtflow-0.4.3.dist-info → dtflow-0.5.2.dist-info}/entry_points.txt +0 -0
dtflow/core.py
CHANGED
```diff
@@ -386,6 +386,88 @@ class DataTransformer:
 
         return errors
 
+    def validate_schema(
+        self,
+        schema: "Schema",
+        on_error: Literal["skip", "raise", "filter"] = "skip",
+        max_errors: int = 100,
+    ) -> Union["DataTransformer", List[tuple]]:
+        """
+        Validate the data structure against a Schema.
+
+        Args:
+            schema: Schema object defining the structural validation rules
+            on_error: error-handling strategy
+                - "skip": print a warning and return the list of records that failed
+                - "raise": raise an exception at the first error
+                - "filter": drop records that fail validation and return a new DataTransformer
+            max_errors: maximum number of errors (takes effect when on_error="skip")
+
+        Returns:
+            - on_error="skip": list of failed records [(index, ValidationResult), ...]
+            - on_error="raise": no return value (success) or raises ValueError
+            - on_error="filter": a new, filtered DataTransformer
+
+        Examples:
+            >>> from dtflow import Schema, Field
+            >>> schema = Schema({
+            ...     "messages": Field(type="list", required=True, min_length=1),
+            ...     "messages[*].role": Field(type="str", choices=["user", "assistant"]),
+            ... })
+
+            >>> # Collect the records that failed validation
+            >>> errors = dt.validate_schema(schema)
+            >>> for idx, result in errors:
+            ...     print(f"Row {idx} failed validation: {result.errors}")
+
+            >>> # Filter out invalid records
+            >>> valid_dt = dt.validate_schema(schema, on_error="filter")
+
+            >>> # Stop immediately on error
+            >>> dt.validate_schema(schema, on_error="raise")
+        """
+        from .schema import Schema, ValidationResult
+
+        failed: List[tuple] = []
+        valid_data: List[dict] = []
+        error_count = 0
+
+        for i, item in enumerate(self._data):
+            result = schema.validate(item)
+            if result.valid:
+                valid_data.append(item)
+            else:
+                failed.append((i, result))
+                error_count += len(result.errors)
+
+                if on_error == "raise":
+                    error_msgs = [str(e) for e in result.errors[:3]]
+                    raise ValueError(
+                        f"Row {i} failed validation:\n  " + "\n  ".join(error_msgs)
+                    )
+
+                if on_error == "skip" and error_count >= max_errors:
+                    print(f"⚠️ Reached the maximum error count {max_errors}; stopping validation")
+                    break
+
+        if on_error == "skip":
+            if failed:
+                print(f"⚠️ Validation failed for {len(failed)} records ({error_count} errors in total)")
+            return failed
+
+        if on_error == "filter":
+            tracker = self._lineage_tracker
+            if tracker:
+                tracker.record(
+                    "validate_schema",
+                    {"schema": repr(schema), "on_error": on_error},
+                    len(self._data),
+                    len(valid_data),
+                )
+            return DataTransformer(valid_data, _lineage_tracker=tracker)
+
+        return failed
+
     def dedupe(
         self,
         key: Union[None, str, List[str], Callable[[Any], Any]] = None,
```
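The three `on_error` modes return different types, which is easy to trip over. A minimal sketch of the differences, assuming `DataTransformer` can be constructed directly from a list of dicts (as the `filter` branch above does) and using the `Schema`/`Field` API from the docstring:

```python
from dtflow import DataTransformer, Schema, Field

schema = Schema({
    "messages": Field(type="list", required=True, min_length=1),
    "messages[*].role": Field(type="str", choices=["user", "assistant"]),
})

dt = DataTransformer([
    {"messages": [{"role": "user", "content": "hi"}]},  # valid
    {"messages": []},                                    # violates min_length=1
])

# "skip" (default): returns [(index, ValidationResult), ...] for failed rows
for idx, result in dt.validate_schema(schema):
    print(idx, result.errors)

# "filter": returns a new DataTransformer containing only the valid rows
valid_dt = dt.validate_schema(schema, on_error="filter")

# "raise": raises ValueError on the first invalid row
# dt.validate_schema(schema, on_error="raise")
```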
```diff
@@ -711,19 +793,29 @@
             seed: random seed
 
         Returns:
-            (train, test) as two DataTransformers
+            (train, test) as two DataTransformers, each with its own independent lineage tracker
         """
         data = self.shuffle(seed).data
         split_idx = int(len(data) * ratio)
 
-        #
+        # After the split the lineage trackers are independent (deep copies avoid mutual interference)
         tracker = self._lineage_tracker
+        train_tracker = None
+        test_tracker = None
+
         if tracker:
             tracker.record("split", {"ratio": ratio, "seed": seed}, len(self._data), len(data))
+            # Create an independent tracker copy for each sub-dataset
+            train_tracker = tracker.copy()
+            train_tracker.record("split_part", {"part": "train", "ratio": ratio}, len(data), split_idx)
+            test_tracker = tracker.copy()
+            test_tracker.record(
+                "split_part", {"part": "test", "ratio": 1 - ratio}, len(data), len(data) - split_idx
+            )
 
         return (
-            DataTransformer(data[:split_idx], _lineage_tracker=tracker),
-            DataTransformer(data[split_idx:], _lineage_tracker=tracker),
+            DataTransformer(data[:split_idx], _lineage_tracker=train_tracker),
+            DataTransformer(data[split_idx:], _lineage_tracker=test_tracker),
         )
 
     # ============ Parallel processing ============
```
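This hunk only shows the body of the split method, so the exact signature is an inference; assuming it is `split(ratio, seed)` as the recorded lineage parameters suggest, the behavioral change looks like this:

```python
# Hypothetical call; parameter names inferred from tracker.record("split", {...}) above.
train, test = dt.split(ratio=0.9, seed=42)

# Before 0.5.2 both halves shared one tracker, so an operation on one half
# (e.g. train.dedupe()) also appeared in the other's lineage. Each half now
# gets its own copy, with a "split_part" entry marking which side it is.
```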
```diff
@@ -733,6 +825,7 @@
         func: Callable[[Dict], Any],
         workers: Optional[int] = None,
         chunksize: int = 1000,
+        timeout: Optional[float] = None,
     ) -> List[Any]:
         """
         Apply a transform function in parallel (multiprocessing).
```
```diff
@@ -743,24 +836,46 @@
             func: transform function; receives the raw dict and returns the transformed result
             workers: number of processes; defaults to the CPU core count
             chunksize: size of the data chunk each process handles
+            timeout: timeout in seconds; None means no timeout
 
         Returns:
             List of transformed results
 
+        Raises:
+            TypeError: if func cannot be pickled (e.g. a lambda)
+            RuntimeError: if a worker process fails or times out
+
         Examples:
             >>> def transform(item):
             ...     return {"id": item["id"], "text": item["text"].upper()}
             >>> results = dt.map_parallel(transform)
         """
-        from multiprocessing import Pool, cpu_count
+        from multiprocessing import Pool, TimeoutError, cpu_count
+        import pickle
 
         if not self._data:
             return []
 
+        # Check whether the function can be pickled
+        try:
+            pickle.dumps(func)
+        except (pickle.PicklingError, AttributeError, TypeError) as e:
+            func_name = getattr(func, "__name__", str(func))
+            raise TypeError(
+                f"Function '{func_name}' cannot be pickled and cannot be used for parallel processing. "
+                f"Please use a module-level function instead of a lambda or closure. Error: {e}"
+            ) from e
+
         workers = workers or cpu_count()
 
-        with Pool(workers) as pool:
-            results = pool.map(func, self._data, chunksize=chunksize)
+        try:
+            with Pool(workers) as pool:
+                async_result = pool.map_async(func, self._data, chunksize=chunksize)
+                results = async_result.get(timeout=timeout)
+        except TimeoutError:
+            raise RuntimeError(f"Parallel processing timed out ({timeout}s)")
+        except Exception as e:
+            raise RuntimeError(f"Parallel processing failed: {type(e).__name__}: {e}") from e
 
         return results
 
```
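Because `Pool` pickles the function to ship it to worker processes, only module-level functions work; the new up-front `pickle.dumps(func)` check turns the previously obscure pool crash into an immediate `TypeError`. Usage stays as in the docstring, with the new `timeout` knob:

```python
# Module-level function: picklable, so it passes the new pre-flight check.
def transform(item):
    return {"id": item["id"], "text": item["text"].upper()}

results = dt.map_parallel(transform, workers=4, timeout=60.0)

# A lambda now fails fast with TypeError before any worker starts:
# dt.map_parallel(lambda x: x)   # TypeError: ... cannot be pickled ...
```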
```diff
@@ -769,6 +884,7 @@
         func: Callable[[Dict], bool],
         workers: Optional[int] = None,
         chunksize: int = 1000,
+        timeout: Optional[float] = None,
     ) -> "DataTransformer":
         """
         Apply a filter function in parallel (multiprocessing).
```
```diff
@@ -779,28 +895,122 @@
             func: filter function; receives the raw dict, returns True to keep the item
             workers: number of processes; defaults to the CPU core count
             chunksize: size of the data chunk each process handles
+            timeout: timeout in seconds; None means no timeout
 
         Returns:
             A new, filtered DataTransformer
 
+        Raises:
+            TypeError: if func cannot be pickled (e.g. a lambda)
+            RuntimeError: if a worker process fails or times out
+
         Examples:
             >>> def is_valid(item):
             ...     return len(item["text"]) > 10
             >>> filtered = dt.filter_parallel(is_valid)
         """
-        from multiprocessing import Pool, cpu_count
+        from multiprocessing import Pool, TimeoutError, cpu_count
+        import pickle
 
         if not self._data:
             return DataTransformer([])
 
+        # Check whether the function can be pickled
+        try:
+            pickle.dumps(func)
+        except (pickle.PicklingError, AttributeError, TypeError) as e:
+            func_name = getattr(func, "__name__", str(func))
+            raise TypeError(
+                f"Function '{func_name}' cannot be pickled and cannot be used for parallel processing. "
+                f"Please use a module-level function instead of a lambda or closure. Error: {e}"
+            ) from e
+
         workers = workers or cpu_count()
 
-        with Pool(workers) as pool:
-            mask = pool.map(func, self._data, chunksize=chunksize)
+        try:
+            with Pool(workers) as pool:
+                async_result = pool.map_async(func, self._data, chunksize=chunksize)
+                mask = async_result.get(timeout=timeout)
+        except TimeoutError:
+            raise RuntimeError(f"Parallel processing timed out ({timeout}s)")
+        except Exception as e:
+            raise RuntimeError(f"Parallel processing failed: {type(e).__name__}: {e}") from e
 
         filtered = [item for item, keep in zip(self._data, mask) if keep]
         return DataTransformer(filtered)
 
+    # ============ Training framework integration ============
+
+    def check_compatibility(
+        self,
+        framework: Literal["llama-factory", "swift", "axolotl"],
+    ) -> "CompatibilityResult":
+        """
+        Check the data's compatibility with the target training framework.
+
+        Args:
+            framework: target framework name
+                - "llama-factory": LLaMA-Factory
+                - "swift": ms-swift (ModelScope)
+                - "axolotl": Axolotl
+
+        Returns:
+            CompatibilityResult object containing valid, errors, warnings, suggestions
+
+        Examples:
+            >>> result = dt.check_compatibility("llama-factory")
+            >>> if result.valid:
+            ...     print("Compatible!")
+            ... else:
+            ...     print(result.errors)
+        """
+        from .framework import check_compatibility
+
+        return check_compatibility(self._data, framework)
+
+    def export_for(
+        self,
+        framework: Literal["llama-factory", "swift", "axolotl"],
+        output_dir: str,
+        dataset_name: str = "custom_dataset",
+        **kwargs,
+    ) -> Dict[str, str]:
+        """
+        One-step export of data and config files for the target training framework.
+
+        Args:
+            framework: target framework name
+            output_dir: output directory
+            dataset_name: dataset name
+            **kwargs: framework-specific arguments (e.g. model_name)
+
+        Returns:
+            Dict of generated file paths {"data": "...", "config": "...", ...}
+
+        Examples:
+            >>> # Export for LLaMA-Factory
+            >>> dt.export_for("llama-factory", "./llama_ready")
+            # Generates:
+            # - ./llama_ready/custom_dataset.json
+            # - ./llama_ready/dataset_info.json
+            # - ./llama_ready/train_args.yaml
+
+            >>> # Export for ms-swift
+            >>> dt.export_for("swift", "./swift_ready", dataset_name="my_data")
+
+            >>> # Export for Axolotl
+            >>> dt.export_for("axolotl", "./axolotl_ready")
+        """
+        from .framework import export_for
+
+        return export_for(
+            self._data,
+            framework,
+            output_dir,
+            dataset_name=dataset_name,
+            **kwargs,
+        )
+
 
 def _sanitize_key(name: str) -> str:
     """Normalize a field name into a valid Python identifier"""
```