PyPI - dataforge-ml - Versions diffs - 1.0.0__tar.gz → 2.0.0__tar.gz - Mend

dataforge-ml 1.0.0__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

dataforge_ml-2.0.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,101 @@
+Metadata-Version: 2.4
+Name: dataforge-ml
+Version: 2.0.0
+Summary: An automated feature engineering and designing pipeline library
+License: MIT
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Python: >3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: polars>=1.0.0
+Requires-Dist: scikit-learn>=1.0.0
+Requires-Dist: scipy>=1.10.0
+Requires-Dist: numpy>=2.0.0
+Requires-Dist: pandas>=2.0.0
+Requires-Dist: chardet>=5.0.0
+Requires-Dist: iterative-stratification>=0.1.9
+Requires-Dist: diptest
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0; extra == "dev"
+Requires-Dist: sphinx>=8.0; extra == "dev"
+Requires-Dist: pydata-sphinx-theme>=0.16; extra == "dev"
+Requires-Dist: myst-parser>=4.0; extra == "dev"
+Requires-Dist: numpydoc>=1.8; extra == "dev"
+Requires-Dist: sphinx-autobuild>=2024.0; extra == "dev"
+Dynamic: license-file
+# DataForgeML
+[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/DEVunderdog/DataForgeML)
+Automated data profiling and splitting pipeline for ML datasets.
+DataForgeML inspects your dataset, detects each column's semantic type (numeric, categorical, boolean, text, datetime, or identifier), computes per-column statistics and missingness, and produces a structured result ready for downstream feature engineering — no manual schema wrangling required.
+## Installation
+```bash
+pip install dataforge-ml
+```
+## Quick Start
+```python
+from dataforge_ml import DataLoader, PipelineConfig, StructuralProfiler
+df = DataLoader().load("titanic.csv")
+config = PipelineConfig()
+result = StructuralProfiler(config).profile(df)
+print(result.columns["Age"].semantic_type)  # SemanticType.Numeric
+print(result.dataset.row_count)             # total rows
+```
+`DataLoader` auto-detects encoding and delimiter. Supported formats: CSV, TSV, Parquet, JSON, NDJSON, JSONL, XLSX, XLS, Arrow, Feather.
+## Column Type Overrides
+Override the auto-detected type for any column before profiling:
+```python
+config = PipelineConfig()
+config.set_column_type("PassengerId", "identifier")           # skip stats entirely
+config.set_columns_type(["Survived", "Pclass"], "categorical")
+result = StructuralProfiler(config).profile(df)
+```
+To drop a column from all processing entirely, use `exclude_columns`:
+```python
+config = PipelineConfig(exclude_columns=["PassengerId", "Name"])
+```
+## Splitting
+```python
+from dataforge_ml import DataLoader, DataSplitter
+df = DataLoader().load("titanic.csv")
+splitter = DataSplitter(df, target="Survived", random_seed=42)
+# Random train/test split (stratified by default when target is set)
+split = splitter.random_split(test_size=0.2)
+print(split.train.shape, split.test.shape)
+# Chronological split (no temporal leakage)
+split = splitter.time_split(time_column="date", test_size=0.2)
+# K-fold cross-validation
+for fold in splitter.kfold(k=5):
+    print(f"Fold {fold.fold_index}: train={fold.train_size}, val={fold.val_size}")
+```
+## License
+MIT

dataforge_ml-2.0.0/README.md ADDED Viewed

@@ -0,0 +1,71 @@
+# DataForgeML
+[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/DEVunderdog/DataForgeML)
+Automated data profiling and splitting pipeline for ML datasets.
+DataForgeML inspects your dataset, detects each column's semantic type (numeric, categorical, boolean, text, datetime, or identifier), computes per-column statistics and missingness, and produces a structured result ready for downstream feature engineering — no manual schema wrangling required.
+## Installation
+```bash
+pip install dataforge-ml
+```
+## Quick Start
+```python
+from dataforge_ml import DataLoader, PipelineConfig, StructuralProfiler
+df = DataLoader().load("titanic.csv")
+config = PipelineConfig()
+result = StructuralProfiler(config).profile(df)
+print(result.columns["Age"].semantic_type)  # SemanticType.Numeric
+print(result.dataset.row_count)             # total rows
+```
+`DataLoader` auto-detects encoding and delimiter. Supported formats: CSV, TSV, Parquet, JSON, NDJSON, JSONL, XLSX, XLS, Arrow, Feather.
+## Column Type Overrides
+Override the auto-detected type for any column before profiling:
+```python
+config = PipelineConfig()
+config.set_column_type("PassengerId", "identifier")           # skip stats entirely
+config.set_columns_type(["Survived", "Pclass"], "categorical")
+result = StructuralProfiler(config).profile(df)
+```
+To drop a column from all processing entirely, use `exclude_columns`:
+```python
+config = PipelineConfig(exclude_columns=["PassengerId", "Name"])
+```
+## Splitting
+```python
+from dataforge_ml import DataLoader, DataSplitter
+df = DataLoader().load("titanic.csv")
+splitter = DataSplitter(df, target="Survived", random_seed=42)
+# Random train/test split (stratified by default when target is set)
+split = splitter.random_split(test_size=0.2)
+print(split.train.shape, split.test.shape)
+# Chronological split (no temporal leakage)
+split = splitter.time_split(time_column="date", test_size=0.2)
+# K-fold cross-validation
+for fold in splitter.kfold(k=5):
+    print(f"Fold {fold.fold_index}: train={fold.train_size}, val={fold.val_size}")
+```
+## License
+MIT

{dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/pyproject.toml RENAMED Viewed

@@ -4,10 +4,10 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "dataforge-ml"
-version = "1.0.0"
+version = "2.0.0"
 description = "An automated feature engineering and designing pipeline library"
 readme = "README.md"
-requires-python = ">=3.10"
+requires-python = ">3.10"
 license = {text = "MIT"}
 classifiers = [
     "License :: OSI Approved :: MIT License",
@@ -23,11 +23,18 @@ dependencies = [
     "numpy>=2.0.0",
     "pandas>=2.0.0",
     "chardet>=5.0.0",
+    "iterative-stratification>=0.1.9",
+    "diptest",
 ]
 [project.optional-dependencies]
 dev = [
     "pytest>=8.0",
+    "sphinx>=8.0",
+    "pydata-sphinx-theme>=0.16",
+    "myst-parser>=4.0",
+    "numpydoc>=1.8",
+    "sphinx-autobuild>=2024.0",
 ]
 [tool.pytest.ini_options]

{dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/__init__.py RENAMED Viewed

@@ -1,5 +1,5 @@
 from .config import PipelineConfig, PipelinePhase, SemanticType, Modality
-from .profiling.structural import StructuralProfiler
+from .profiling.orchestrator import StructuralProfiler
 from .profiling._config import (
     ProfileConfig,
     StructuralProfileResult,
@@ -8,6 +8,7 @@ from .profiling._config import (
 )
 from .splitting import DataSplitter, SplitResult, FoldResult
 from .utils.data_loader import DataLoader
+from .imputation._config import ImputationFitDiagnostic
 __all__ = [
     "PipelineConfig",
@@ -23,4 +24,5 @@ __all__ = [
     "SplitResult",
     "FoldResult",
     "DataLoader",
+    "ImputationFitDiagnostic",
 ]

dataforge_ml-2.0.0/src/dataforge_ml/config.py ADDED Viewed

@@ -0,0 +1,343 @@
+from __future__ import annotations
+import json
+from dataclasses import dataclass, field
+from enum import StrEnum
+from typing import TYPE_CHECKING, Union, Optional
+if TYPE_CHECKING:
+    from dataforge_ml.profiling._config import ProfileConfig, NumericKind
+    from dataforge_ml.imputation._config import ImputationConfig
+    from dataforge_ml.splitting._config import SplitConfig
+class SemanticType(StrEnum):
+    """The ML-level interpretation assigned to a column by the type detector.
+    Used throughout the pipeline to route columns to the correct sub-processors
+    and to determine which statistical operations apply. See CONTEXT.md §SemanticType
+    for the full type taxonomy and the Text vs Categorical distinction.
+    """
+    Numeric = "numeric"
+    Categorical = "categorical"
+    Datetime = "datetime"
+    Boolean = "boolean"
+    Text = "text"
+    Identifier = "identifier"
+class Modality(StrEnum):
+    """The data modality the pipeline operates on.
+    Currently only ``Tabular`` is supported. Reserved for future expansion to
+    additional modalities (time-series, image, etc.).
+    """
+    Tabular = "tabular"
+class PipelinePhase(StrEnum):
+    """The six sequential phases of the DataForgeML feature engineering pipeline.
+    Phase Orchestrators call ``PipelineConfig.resolve_active_columns`` with one
+    of these values to obtain the column set for that phase after Hard and Soft
+    Exclusions are applied.
+    """
+    Profiling = "profiling"
+    Imputation = "imputation"
+    OutlierDetection = "outlier_detection"
+    Normalization = "normalization"
+    Encoding = "encoding"
+    Scaling = "scaling"
+def _default_profile_config() -> ProfileConfig:
+    from dataforge_ml.profiling._config import ProfileConfig
+    return ProfileConfig()
+def _default_imputation_config() -> ImputationConfig:
+    from dataforge_ml.imputation._config import ImputationConfig
+    return ImputationConfig()
+def _default_split_config() -> SplitConfig:
+    from dataforge_ml.splitting._config import SplitConfig
+    return SplitConfig()
+@dataclass
+class PipelineConfig:
+    """
+    Master configuration for the full 6-phase feature engineering pipeline.
+    Parameters
+    ----------
+    exclude_columns : list[str]
+        Hard exclusions — columns dropped globally from every phase.
+    phase_exclusions : dict[PipelinePhase, list[str]]
+        Soft exclusions — columns bypassed for a specific phase but retained
+        in the dataset.
+    column_overrides : dict[str, SemanticType]
+        Explicit semantic type assignments respected by all downstream phases.
+    numeric_kind_overrides : dict[str, NumericKind]
+        Explicit ``NumericKind`` assignments for individual columns, applied
+        after auto-detection in Phase 1. Only valid for columns whose final
+        ``SemanticType`` is ``Numeric``; raises at orchestrator time otherwise.
+    profiling : ProfileConfig
+        Phase 1-specific parameters (correlation, chunking, memory threshold).
+    imputation : ImputationConfig
+        Phase 2-specific parameters (strategy thresholds, size guards).
+    split : SplitConfig
+        Splitting thresholds (stratification signal cap, boolean minority bar).
+    random_seed : int, optional
+        Single seed for all stochastic pipeline operations, including GMM
+        Sampling during bimodal imputation. None produces non-deterministic
+        output.
+    """
+    exclude_columns: list[str] = field(default_factory=list)
+    phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict)
+    column_overrides: dict[str, SemanticType] = field(default_factory=dict)
+    numeric_kind_overrides: dict[str, NumericKind] = field(default_factory=dict)
+    profiling: ProfileConfig = field(default_factory=_default_profile_config)
+    imputation: ImputationConfig = field(default_factory=_default_imputation_config)
+    split: SplitConfig = field(default_factory=_default_split_config)
+    random_seed: Optional[int] = None
+    def resolve_active_columns(
+        self, phase: PipelinePhase, available_columns: list[str]
+    ) -> list[str]:
+        """Return the columns the given phase should operate on.
+        Hard Exclusions are applied first, then phase-specific Soft Exclusions.
+        Names in either exclusion list that do not appear in
+        ``available_columns`` are silently ignored.
+        Parameters
+        ----------
+        phase : PipelinePhase
+            The pipeline phase requesting the active column set.
+        available_columns : list[str]
+            The full list of columns currently present in the DataFrame.
+        Returns
+        -------
+        list[str]
+            Columns from ``available_columns`` that are not excluded by either
+            Hard or Soft Exclusion rules for the given phase, preserving the
+            original order.
+        """
+        hard_set = set(self.exclude_columns)
+        soft_set = set(self.phase_exclusions.get(phase, []))
+        excluded = hard_set | soft_set
+        return [c for c in available_columns if c not in excluded]
+    def add_exclusions(self, cols: list[str]) -> None:
+        """Add columns to the hard exclusion set, deduplicating automatically.
+        Columns already present in ``exclude_columns`` and duplicate entries
+        within ``cols`` are silently ignored. Calling with an empty list is a
+        no-op.
+        Parameters
+        ----------
+        cols : list[str]
+            Column names to register as hard exclusions. Deduplication is
+            handled here; callers do not need to pre-deduplicate.
+        """
+        existing = set(self.exclude_columns)
+        for col in cols:
+            if col not in existing:
+                self.exclude_columns.append(col)
+                existing.add(col)
+    def set_column_type(
+        self, column: str, semantic_type: Union[str, SemanticType]
+    ) -> None:
+        """Explicitly set the semantic type for a column, overriding auto-detection.
+        Parameters
+        ----------
+        column : str
+            Name of the column to override.
+        semantic_type : str or SemanticType
+            The desired semantic type. Accepts enum values or their string
+            equivalents (e.g. ``"numeric"``, ``"categorical"``).
+        Raises
+        ------
+        ValueError
+            When ``semantic_type`` is a string that does not match any
+            ``SemanticType`` value.
+        """
+        if isinstance(semantic_type, str):
+            try:
+                semantic_type = SemanticType(semantic_type)
+            except ValueError:
+                valid = [e.value for e in SemanticType]
+                raise ValueError(
+                    f"Unknown semantic type {semantic_type!r}. "
+                    f"Valid values: {valid}"
+                )
+        self.column_overrides[column] = semantic_type
+    def set_columns_type(
+        self, columns: list[str], semantic_type: Union[str, SemanticType]
+    ) -> None:
+        """Assign the same semantic type to every column in the list.
+        Parameters
+        ----------
+        columns : list[str]
+            Column names to override.
+        semantic_type : str or SemanticType
+            The desired semantic type applied to every column in the list.
+        """
+        for column in columns:
+            self.set_column_type(column, semantic_type)
+    def set_numeric_kind(
+        self, column: str, kind: Union[str, NumericKind]
+    ) -> None:
+        """Explicitly set the ``NumericKind`` for a single column.
+        Parameters
+        ----------
+        column : str
+            Name of the column to override.
+        kind : str or NumericKind
+            The desired numeric kind. Accepts enum values or their string
+            equivalents (``"continuous"``, ``"bounded_discrete"``).
+        Raises
+        ------
+        ValueError
+            When ``kind`` is a string that does not match any ``NumericKind``
+            value.
+        """
+        from dataforge_ml.profiling._config import NumericKind as _NumericKind
+        if isinstance(kind, str):
+            try:
+                kind = _NumericKind(kind)
+            except ValueError:
+                valid = [e.value for e in _NumericKind]
+                raise ValueError(
+                    f"Unknown NumericKind {kind!r}. Valid values: {valid}"
+                )
+        self.numeric_kind_overrides[column] = kind
+    def set_columns_numeric_kind(
+        self, columns: list[str], kind: Union[str, NumericKind]
+    ) -> None:
+        """Assign the same ``NumericKind`` to every column in the list.
+        Parameters
+        ----------
+        columns : list[str]
+            Column names to override.
+        kind : str or NumericKind
+            The desired numeric kind applied to every column in the list.
+        """
+        for column in columns:
+            self.set_numeric_kind(column, kind)
+    def to_dict(self) -> dict:
+        """Serialise the pipeline configuration to a plain dictionary.
+        Returns
+        -------
+        dict
+            All fields serialised to JSON-compatible types; nested configs are
+            recursively serialised via their own ``to_dict`` methods.
+        """
+        return {
+            "exclude_columns": list(self.exclude_columns),
+            "phase_exclusions": {
+                str(phase): list(cols)
+                for phase, cols in self.phase_exclusions.items()
+            },
+            "column_overrides": {
+                col: str(sem_type)
+                for col, sem_type in self.column_overrides.items()
+            },
+            "numeric_kind_overrides": {
+                col: str(kind)
+                for col, kind in self.numeric_kind_overrides.items()
+            },
+            "profiling": self.profiling.to_dict(),
+            "imputation": self.imputation.to_dict(),
+            "split": self.split.to_dict(),
+            "random_seed": self.random_seed,
+        }
+    @classmethod
+    def from_dict(cls, data: dict) -> PipelineConfig:
+        """Reconstruct a ``PipelineConfig`` from a plain dictionary.
+        Parameters
+        ----------
+        data : dict
+            Dictionary as produced by ``to_dict()``.
+        Returns
+        -------
+        PipelineConfig
+            Fully populated configuration instance with all nested sub-configs
+            restored.
+        """
+        from dataforge_ml.profiling._config import ProfileConfig, NumericKind as _NumericKind
+        from dataforge_ml.imputation._config import ImputationConfig
+        from dataforge_ml.splitting._config import SplitConfig
+        return cls(
+            exclude_columns=list(data.get("exclude_columns", [])),
+            phase_exclusions={
+                PipelinePhase(phase_str): list(cols)
+                for phase_str, cols in data.get("phase_exclusions", {}).items()
+            },
+            column_overrides={
+                col: SemanticType(sem_str)
+                for col, sem_str in data.get("column_overrides", {}).items()
+            },
+            numeric_kind_overrides={
+                col: _NumericKind(kind_str)
+                for col, kind_str in data.get("numeric_kind_overrides", {}).items()
+            },
+            profiling=ProfileConfig.from_dict(data.get("profiling", {})),
+            imputation=ImputationConfig.from_dict(data.get("imputation", {})),
+            split=SplitConfig.from_dict(data.get("split", {})),
+            random_seed=data.get("random_seed"),
+        )
+    def to_json(self, indent: int = 2) -> str:
+        """Serialise the pipeline configuration to a JSON string.
+        Parameters
+        ----------
+        indent : int
+            Number of spaces used for JSON indentation.
+        Returns
+        -------
+        str
+            JSON representation of ``to_dict()``.
+        """
+        return json.dumps(self.to_dict(), indent=indent)
+    @classmethod
+    def from_json(cls, json_str: str) -> PipelineConfig:
+        """Reconstruct a ``PipelineConfig`` from a JSON string.
+        Parameters
+        ----------
+        json_str : str
+            JSON string as produced by ``to_json()``.
+        Returns
+        -------
+        PipelineConfig
+            Fully populated configuration instance.
+        """
+        return cls.from_dict(json.loads(json_str))

dataforge_ml-2.0.0/src/dataforge_ml/imputation/__init__.py ADDED Viewed

@@ -0,0 +1,29 @@
+from ._config import (
+    ColumnImputationRecord,
+    ImputationConfig,
+    ImputationFitDiagnostic,
+    ImputationResult,
+    ImputationStrategy,
+    NumericImputationConfig,
+)
+from ._fitted_imputer import (
+    FittedColumnAbsentError,
+    FittedImputer,
+    UnfittedColumnError,
+    UnseenColumnError,
+)
+from .orchestrator import ImputationOrchestrator
+__all__ = [
+    "ImputationStrategy",
+    "NumericImputationConfig",
+    "ImputationConfig",
+    "ImputationFitDiagnostic",
+    "ColumnImputationRecord",
+    "ImputationResult",
+    "FittedImputer",
+    "UnfittedColumnError",
+    "UnseenColumnError",
+    "FittedColumnAbsentError",
+    "ImputationOrchestrator",
+]

dataforge-ml 1.0.0__tar.gz → 2.0.0__tar.gz

dataforge-ml 1.0.0__tar.gz → 2.0.0__tar.gz