PyPI - dataforge-ml - Version diff - 0.9.0.tar.gz → 0.11.0.tar.gz - Mend

dataforge-ml 0.9.0.tar.gz → 0.11.0.tar.gz

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (43)

{dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataforge-ml
-Version: 0.9.0
+Version: 0.11.0
 Summary: A automated feature engineering and designing pipeline library
 License: MIT
 Classifier: License :: OSI Approved :: MIT License
@@ -21,9 +21,9 @@ Provides-Extra: dev
 Requires-Dist: pytest>=8.0; extra == "dev"
 Dynamic: license-file
-# FeatureForge
+# DataForgeML
-Automated feature engineering and data profiling pipeline library for tabular datasets.
+Automated feature engineering and data profiling pipeline library for datasets.
 ## Installation

{dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/README.md RENAMED Viewed

@@ -1,6 +1,6 @@
-# FeatureForge
+# DataForgeML
-Automated feature engineering and data profiling pipeline library for tabular datasets.
+Automated feature engineering and data profiling pipeline library for datasets.
 ## Installation

{dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "dataforge-ml"
-version = "0.9.0"
+version = "0.11.0"
 description = "A automated feature engineering and designing pipeline library"
 readme = "README.md"
 requires-python = ">=3.10"

{dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/__init__.py RENAMED Viewed

@@ -1,21 +1,26 @@
+from .config import PipelineConfig, PipelinePhase, SemanticType, Modality
 from .profiling.structural import StructuralProfiler
 from .profiling.config import (
     ProfileConfig,
-    SemanticType,
-    Modality,
     StructuralProfileResult,
+    ColumnProfile,
+    DatasetStats,
 )
 from .splitting import DataSplitter, SplitResult, FoldResult
 from .utils.data_loader import DataLoader
 __all__ = [
-    "StructuralProfiler",
-    "StructuralProfileResult",
+    "PipelineConfig",
+    "PipelinePhase",
     "ProfileConfig",
     "SemanticType",
     "Modality",
+    "StructuralProfiler",
+    "StructuralProfileResult",
+    "ColumnProfile",
+    "DatasetStats",
     "DataSplitter",
     "SplitResult",
     "FoldResult",
-    "DataLoader"
+    "DataLoader",
 ]

dataforge_ml-0.11.0/src/dataforge_ml/config.py ADDED Viewed

@@ -0,0 +1,133 @@
+from __future__ import annotations
+import json
+from dataclasses import dataclass, field
+from enum import StrEnum
+from typing import TYPE_CHECKING, Union
+if TYPE_CHECKING:
+    from dataforge_ml.profiling.config import ProfileConfig
+class SemanticType(StrEnum):
+    Numeric = "numeric"
+    Categorical = "categorical"
+    Datetime = "datetime"
+    Boolean = "boolean"
+    Text = "text"
+    Identifier = "identifier"
+class Modality(StrEnum):
+    Tabular = "tabular"
+class PipelinePhase(StrEnum):
+    Profiling = "profiling"
+    Imputation = "imputation"
+    OutlierDetection = "outlier_detection"
+    Normalization = "normalization"
+    Encoding = "encoding"
+    Scaling = "scaling"
+def _default_profile_config() -> ProfileConfig:
+    from dataforge_ml.profiling.config import ProfileConfig
+    return ProfileConfig()
+@dataclass
+class PipelineConfig:
+    """
+    Master configuration for the full 6-phase feature engineering pipeline.
+    Parameters
+    ----------
+    exclude_columns : list[str]
+        Hard exclusions — columns dropped globally from every phase.
+    phase_exclusions : dict[PipelinePhase, list[str]]
+        Soft exclusions — columns bypassed for a specific phase but retained
+        in the dataset.
+    column_overrides : dict[str, SemanticType]
+        Explicit semantic type assignments respected by all downstream phases.
+    profiling : ProfileConfig
+        Phase 1-specific parameters (correlation, chunking, memory threshold).
+    """
+    exclude_columns: list[str] = field(default_factory=list)
+    phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict)
+    column_overrides: dict[str, SemanticType] = field(default_factory=dict)
+    profiling: ProfileConfig = field(default_factory=_default_profile_config)
+    def resolve_active_columns(
+        self, phase: PipelinePhase, available_columns: list[str]
+    ) -> list[str]:
+        """
+        Return the columns the given phase should operate on.
+        Hard exclusions are applied first, then phase-specific soft exclusions.
+        Columns absent from available_columns are silently ignored in both lists.
+        """
+        hard_set = set(self.exclude_columns)
+        soft_set = set(self.phase_exclusions.get(phase, []))
+        excluded = hard_set | soft_set
+        return [c for c in available_columns if c not in excluded]
+    def set_column_type(
+        self, column: str, semantic_type: Union[str, SemanticType]
+    ) -> None:
+        """Explicitly set the semantic type for a column, overriding auto-detection."""
+        if isinstance(semantic_type, str):
+            try:
+                semantic_type = SemanticType(semantic_type)
+            except ValueError:
+                valid = [e.value for e in SemanticType]
+                raise ValueError(
+                    f"Unknown semantic type {semantic_type!r}. "
+                    f"Valid values: {valid}"
+                )
+        self.column_overrides[column] = semantic_type
+    def set_columns_type(
+        self, columns: list[str], semantic_type: Union[str, SemanticType]
+    ) -> None:
+        """Assign the same semantic type to every column in the list."""
+        for column in columns:
+            self.set_column_type(column, semantic_type)
+    def to_dict(self) -> dict:
+        return {
+            "exclude_columns": list(self.exclude_columns),
+            "phase_exclusions": {
+                str(phase): list(cols)
+                for phase, cols in self.phase_exclusions.items()
+            },
+            "column_overrides": {
+                col: str(sem_type)
+                for col, sem_type in self.column_overrides.items()
+            },
+            "profiling": self.profiling.to_dict(),
+        }
+    @classmethod
+    def from_dict(cls, data: dict) -> PipelineConfig:
+        from dataforge_ml.profiling.config import ProfileConfig
+        return cls(
+            exclude_columns=list(data.get("exclude_columns", [])),
+            phase_exclusions={
+                PipelinePhase(phase_str): list(cols)
+                for phase_str, cols in data.get("phase_exclusions", {}).items()
+            },
+            column_overrides={
+                col: SemanticType(sem_str)
+                for col, sem_str in data.get("column_overrides", {}).items()
+            },
+            profiling=ProfileConfig.from_dict(data.get("profiling", {})),
+        )
+    def to_json(self, indent: int = 2) -> str:
+        return json.dumps(self.to_dict(), indent=indent)
+    @classmethod
+    def from_json(cls, json_str: str) -> PipelineConfig:
+        return cls.from_dict(json.loads(json_str))

{dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/__init__.py RENAMED Viewed

@@ -1,10 +1,7 @@
 from .structural import StructuralProfiler
+from ..config import PipelineConfig, PipelinePhase, SemanticType, Modality
 from .config import (
     ProfileConfig,
-    PipelineConfig,
-    PipelinePhase,
-    SemanticType,
-    Modality,
     TypeFlag,
     NumericKind,
     NumericStats,

{dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/config.py RENAMED Viewed

@@ -12,6 +12,7 @@ from dataclasses import dataclass, field
 from enum import StrEnum
 from typing import Optional, Union
+from ..config import SemanticType, Modality
 from ._missingness_config import (
     ColumnMissingnessProfile,
 )
@@ -31,36 +32,6 @@ from ._boolean_config import BooleanStats
 from ._text_config import TextStats
 from ._target_config import TargetProfileResult
-# ---------------------------------------------------------------------------
-# Core enums
-# ---------------------------------------------------------------------------
-class SemanticType(StrEnum):
-    Numeric = "numeric"
-    Categorical = "categorical"
-    Datetime = "datetime"
-    Boolean = "boolean"
-    Text = "text"
-    Identifier = "identifier"
-class Modality(StrEnum):
-    Tabular = "tabular"
-    # Placeholder slots for future modalities — no implementation yet.
-    # Image = "image"
-    # TimeSeries = "time_series"
-class PipelinePhase(StrEnum):
-    Profiling = "profiling"
-    Imputation = "imputation"
-    OutlierDetection = "outlier_detection"
-    Normalization = "normalization"
-    Encoding = "encoding"
-    Scaling = "scaling"
 # ---------------------------------------------------------------------------
 # Type-detection enums — kept for TypeDetector compatibility
 # ---------------------------------------------------------------------------
@@ -227,10 +198,6 @@ class ProfileConfig:
         Data modality. Currently only Tabular is implemented.
     target_column : Optional[str]
         Name of the label/target column, if any.
-    column_overrides : dict[str, SemanticType]
-        Explicit semantic type assignments that override auto-detection.
-    exclude_columns : list[str]
-        Columns to skip entirely during profiling.
     compute_correlation : bool
         Whether to compute the feature-feature correlation matrix.
     correlation_target_column : Optional[str]
@@ -243,47 +210,16 @@ class ProfileConfig:
     modality: Modality = Modality.Tabular
     target_columns: list[str] = field(default_factory=list)
-    column_overrides: dict[str, SemanticType] = field(default_factory=dict)
-    exclude_columns: list[str] = field(default_factory=list)
     compute_correlation: bool = False
     correlation_target_column: Optional[str] = None
     memory_threshold_mb: float = 500.0
     chunk_size: int = 100_000
-    def set_column_type(self, column: str, semantic_type: Union[str, "SemanticType"]) -> None:
-        """
-        Explicitly set the semantic type for a column, overriding auto-detection.
-        The override is the sole source of truth for that column's type — the
-        type detector's verdict is ignored during profiling.  Calling this method
-        multiple times on the same column is valid; the last call wins.
-        Parameters
-        ----------
-        column : str
-            Name of the column to override.
-        semantic_type : str | SemanticType
-            Target semantic type.  Accepts a plain string (e.g. ``"numeric"``,
-            ``"categorical"``) or a ``SemanticType`` enum value.  Invalid strings
-            raise ``ValueError``.
-        """
-        if isinstance(semantic_type, str):
-            try:
-                semantic_type = SemanticType(semantic_type)
-            except ValueError:
-                valid = [e.value for e in SemanticType]
-                raise ValueError(
-                    f"Unknown semantic type {semantic_type!r}. "
-                    f"Valid values: {valid}"
-                )
-        self.column_overrides[column] = semantic_type
     def to_dict(self) -> dict:
         return {
             "modality": str(self.modality),
             "target_columns": list(self.target_columns),
-            "column_overrides": {k: str(v) for k, v in self.column_overrides.items()},
-            "exclude_columns": list(self.exclude_columns),
             "compute_correlation": self.compute_correlation,
             "correlation_target_column": self.correlation_target_column,
             "memory_threshold_mb": self.memory_threshold_mb,
@@ -295,10 +231,6 @@ class ProfileConfig:
         return cls(
             modality=Modality(data.get("modality", Modality.Tabular)),
             target_columns=list(data.get("target_columns", [])),
-            column_overrides={
-                k: SemanticType(v) for k, v in data.get("column_overrides", {}).items()
-            },
-            exclude_columns=list(data.get("exclude_columns", [])),
             compute_correlation=bool(data.get("compute_correlation", False)),
             correlation_target_column=data.get("correlation_target_column"),
             memory_threshold_mb=float(data.get("memory_threshold_mb", 500.0)),
@@ -313,98 +245,6 @@ class ProfileConfig:
         return cls.from_dict(json.loads(json_str))
-@dataclass
-class PipelineConfig:
-    """
-    Master configuration for the full 6-phase feature engineering pipeline.
-    Parameters
-    ----------
-    exclude_columns : list[str]
-        Hard exclusions — columns dropped globally from every phase.
-    phase_exclusions : dict[PipelinePhase, list[str]]
-        Soft exclusions — columns bypassed for a specific phase but retained
-        in the dataset.
-    column_overrides : dict[str, SemanticType]
-        Explicit semantic type assignments respected by all downstream phases.
-    profiling : ProfileConfig
-        Phase 1-specific parameters (correlation, chunking, memory threshold).
-    """
-    exclude_columns: list[str] = field(default_factory=list)
-    phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict)
-    column_overrides: dict[str, SemanticType] = field(default_factory=dict)
-    profiling: ProfileConfig = field(default_factory=ProfileConfig)
-    def resolve_active_columns(
-        self, phase: PipelinePhase, available_columns: list[str]
-    ) -> list[str]:
-        """
-        Return the columns the given phase should operate on.
-        Hard exclusions are applied first, then phase-specific soft exclusions.
-        Columns absent from available_columns are silently ignored in both lists.
-        """
-        hard_set = set(self.exclude_columns)
-        soft_set = set(self.phase_exclusions.get(phase, []))
-        excluded = hard_set | soft_set
-        return [c for c in available_columns if c not in excluded]
-    def set_column_type(
-        self, column: str, semantic_type: Union[str, "SemanticType"]
-    ) -> None:
-        """
-        Explicitly set the semantic type for a column, overriding auto-detection.
-        This override is respected by all downstream phases.
-        """
-        if isinstance(semantic_type, str):
-            try:
-                semantic_type = SemanticType(semantic_type)
-            except ValueError:
-                valid = [e.value for e in SemanticType]
-                raise ValueError(
-                    f"Unknown semantic type {semantic_type!r}. "
-                    f"Valid values: {valid}"
-                )
-        self.column_overrides[column] = semantic_type
-    def to_dict(self) -> dict:
-        return {
-            "exclude_columns": list(self.exclude_columns),
-            "phase_exclusions": {
-                str(phase): list(cols)
-                for phase, cols in self.phase_exclusions.items()
-            },
-            "column_overrides": {
-                col: str(sem_type)
-                for col, sem_type in self.column_overrides.items()
-            },
-            "profiling": self.profiling.to_dict(),
-        }
-    @classmethod
-    def from_dict(cls, data: dict) -> "PipelineConfig":
-        return cls(
-            exclude_columns=list(data.get("exclude_columns", [])),
-            phase_exclusions={
-                PipelinePhase(phase_str): list(cols)
-                for phase_str, cols in data.get("phase_exclusions", {}).items()
-            },
-            column_overrides={
-                col: SemanticType(sem_str)
-                for col, sem_str in data.get("column_overrides", {}).items()
-            },
-            profiling=ProfileConfig.from_dict(data.get("profiling", {})),
-        )
-    def to_json(self, indent: int = 2) -> str:
-        return json.dumps(self.to_dict(), indent=indent)
-    @classmethod
-    def from_json(cls, json_str: str) -> "PipelineConfig":
-        return cls.from_dict(json.loads(json_str))
 @dataclass
 class ColumnTypeInfo:
     column: str

{dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/structural.py RENAMED Viewed

@@ -34,15 +34,12 @@ from ._missingness_profiler import MissingnessProfiler
 from ._target_profiler import TargetProfiler
 from ._correlation_profiler import CorrelationProfiler
 from ._type_detector import TypeDetector
+from ..config import PipelineConfig, PipelinePhase, SemanticType, Modality
 from .config import (
-    PipelineConfig,
-    PipelinePhase,
     ColumnProfile,
     StructuralProfileResult,
     RowMissingnessDistribution,
-    SemanticType,
     TypeFlag,
-    Modality,
 )
 _ROW_DROP_THRESHOLD = 0.50
@@ -67,8 +64,6 @@ class StructuralProfiler:
     def __init__(self, config: PipelineConfig | None = None) -> None:
         self.config: PipelineConfig = config or PipelineConfig()
-        # Keep sub-profilers aligned with the master column_overrides.
-        self.config.profiling.column_overrides = self.config.column_overrides
         if self.config.profiling.modality == Modality.Tabular:
             self.modality_profiler: ModalityProfiler = TabularProfiler()

{dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataforge-ml
-Version: 0.9.0
+Version: 0.11.0
 Summary: A automated feature engineering and designing pipeline library
 License: MIT
 Classifier: License :: OSI Approved :: MIT License
@@ -21,9 +21,9 @@ Provides-Extra: dev
 Requires-Dist: pytest>=8.0; extra == "dev"
 Dynamic: license-file
-# FeatureForge
+# DataForgeML
-Automated feature engineering and data profiling pipeline library for tabular datasets.
+Automated feature engineering and data profiling pipeline library for datasets.
 ## Installation

{dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml.egg-info/SOURCES.txt RENAMED Viewed

@@ -2,6 +2,7 @@ LICENSE
 README.md
 pyproject.toml
 src/dataforge_ml/__init__.py
+src/dataforge_ml/config.py
 src/dataforge_ml.egg-info/PKG-INFO
 src/dataforge_ml.egg-info/SOURCES.txt
 src/dataforge_ml.egg-info/dependency_links.txt