PyPI - dataforge-ml - Versions diffs - 0.10.0__tar.gz → 0.11.0__tar.gz - Mend

dataforge-ml 0.10.0 (tar.gz) → 0.11.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

{dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataforge-ml
-Version: 0.10.0
+Version: 0.11.0
 Summary: A automated feature engineering and designing pipeline library
 License: MIT
 Classifier: License :: OSI Approved :: MIT License

{dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "dataforge-ml"
-version = "0.10.0"
+version = "0.11.0"
 description = "A automated feature engineering and designing pipeline library"
 readme = "README.md"
 requires-python = ">=3.10"

{dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/__init__.py RENAMED Viewed

@@ -1,21 +1,26 @@
+from .config import PipelineConfig, PipelinePhase, SemanticType, Modality
 from .profiling.structural import StructuralProfiler
 from .profiling.config import (
     ProfileConfig,
-    SemanticType,
-    Modality,
     StructuralProfileResult,
+    ColumnProfile,
+    DatasetStats,
 )
 from .splitting import DataSplitter, SplitResult, FoldResult
 from .utils.data_loader import DataLoader
 __all__ = [
-    "StructuralProfiler",
-    "StructuralProfileResult",
+    "PipelineConfig",
+    "PipelinePhase",
     "ProfileConfig",
     "SemanticType",
     "Modality",
+    "StructuralProfiler",
+    "StructuralProfileResult",
+    "ColumnProfile",
+    "DatasetStats",
     "DataSplitter",
     "SplitResult",
     "FoldResult",
-    "DataLoader"
+    "DataLoader",
 ]

dataforge_ml-0.11.0/src/dataforge_ml/config.py ADDED Viewed

@@ -0,0 +1,133 @@
+from __future__ import annotations
+import json
+from dataclasses import dataclass, field
+from enum import StrEnum
+from typing import TYPE_CHECKING, Union
+if TYPE_CHECKING:
+    from dataforge_ml.profiling.config import ProfileConfig
+class SemanticType(StrEnum):
+    Numeric = "numeric"
+    Categorical = "categorical"
+    Datetime = "datetime"
+    Boolean = "boolean"
+    Text = "text"
+    Identifier = "identifier"
+class Modality(StrEnum):
+    Tabular = "tabular"
+class PipelinePhase(StrEnum):
+    Profiling = "profiling"
+    Imputation = "imputation"
+    OutlierDetection = "outlier_detection"
+    Normalization = "normalization"
+    Encoding = "encoding"
+    Scaling = "scaling"
+def _default_profile_config() -> ProfileConfig:
+    from dataforge_ml.profiling.config import ProfileConfig
+    return ProfileConfig()
+@dataclass
+class PipelineConfig:
+    """
+    Master configuration for the full 6-phase feature engineering pipeline.
+    Parameters
+    ----------
+    exclude_columns : list[str]
+        Hard exclusions — columns dropped globally from every phase.
+    phase_exclusions : dict[PipelinePhase, list[str]]
+        Soft exclusions — columns bypassed for a specific phase but retained
+        in the dataset.
+    column_overrides : dict[str, SemanticType]
+        Explicit semantic type assignments respected by all downstream phases.
+    profiling : ProfileConfig
+        Phase 1-specific parameters (correlation, chunking, memory threshold).
+    """
+    exclude_columns: list[str] = field(default_factory=list)
+    phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict)
+    column_overrides: dict[str, SemanticType] = field(default_factory=dict)
+    profiling: ProfileConfig = field(default_factory=_default_profile_config)
+    def resolve_active_columns(
+        self, phase: PipelinePhase, available_columns: list[str]
+    ) -> list[str]:
+        """
+        Return the columns the given phase should operate on.
+        Hard exclusions are applied first, then phase-specific soft exclusions.
+        Columns absent from available_columns are silently ignored in both lists.
+        """
+        hard_set = set(self.exclude_columns)
+        soft_set = set(self.phase_exclusions.get(phase, []))
+        excluded = hard_set | soft_set
+        return [c for c in available_columns if c not in excluded]
+    def set_column_type(
+        self, column: str, semantic_type: Union[str, SemanticType]
+    ) -> None:
+        """Explicitly set the semantic type for a column, overriding auto-detection."""
+        if isinstance(semantic_type, str):
+            try:
+                semantic_type = SemanticType(semantic_type)
+            except ValueError:
+                valid = [e.value for e in SemanticType]
+                raise ValueError(
+                    f"Unknown semantic type {semantic_type!r}. "
+                    f"Valid values: {valid}"
+                )
+        self.column_overrides[column] = semantic_type
+    def set_columns_type(
+        self, columns: list[str], semantic_type: Union[str, SemanticType]
+    ) -> None:
+        """Assign the same semantic type to every column in the list."""
+        for column in columns:
+            self.set_column_type(column, semantic_type)
+    def to_dict(self) -> dict:
+        return {
+            "exclude_columns": list(self.exclude_columns),
+            "phase_exclusions": {
+                str(phase): list(cols)
+                for phase, cols in self.phase_exclusions.items()
+            },
+            "column_overrides": {
+                col: str(sem_type)
+                for col, sem_type in self.column_overrides.items()
+            },
+            "profiling": self.profiling.to_dict(),
+        }
+    @classmethod
+    def from_dict(cls, data: dict) -> PipelineConfig:
+        from dataforge_ml.profiling.config import ProfileConfig
+        return cls(
+            exclude_columns=list(data.get("exclude_columns", [])),
+            phase_exclusions={
+                PipelinePhase(phase_str): list(cols)
+                for phase_str, cols in data.get("phase_exclusions", {}).items()
+            },
+            column_overrides={
+                col: SemanticType(sem_str)
+                for col, sem_str in data.get("column_overrides", {}).items()
+            },
+            profiling=ProfileConfig.from_dict(data.get("profiling", {})),
+        )
+    def to_json(self, indent: int = 2) -> str:
+        return json.dumps(self.to_dict(), indent=indent)
+    @classmethod
+    def from_json(cls, json_str: str) -> PipelineConfig:
+        return cls.from_dict(json.loads(json_str))

{dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/__init__.py RENAMED Viewed

@@ -1,10 +1,7 @@
 from .structural import StructuralProfiler
+from ..config import PipelineConfig, PipelinePhase, SemanticType, Modality
 from .config import (
     ProfileConfig,
-    PipelineConfig,
-    PipelinePhase,
-    SemanticType,
-    Modality,
     TypeFlag,
     NumericKind,
     NumericStats,

{dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/config.py RENAMED Viewed

@@ -12,6 +12,7 @@ from dataclasses import dataclass, field
 from enum import StrEnum
 from typing import Optional, Union
+from ..config import SemanticType, Modality
 from ._missingness_config import (
     ColumnMissingnessProfile,
 )
@@ -31,36 +32,6 @@ from ._boolean_config import BooleanStats
 from ._text_config import TextStats
 from ._target_config import TargetProfileResult
-# ---------------------------------------------------------------------------
-# Core enums
-# ---------------------------------------------------------------------------
-class SemanticType(StrEnum):
-    Numeric = "numeric"
-    Categorical = "categorical"
-    Datetime = "datetime"
-    Boolean = "boolean"
-    Text = "text"
-    Identifier = "identifier"
-class Modality(StrEnum):
-    Tabular = "tabular"
-    # Placeholder slots for future modalities — no implementation yet.
-    # Image = "image"
-    # TimeSeries = "time_series"
-class PipelinePhase(StrEnum):
-    Profiling = "profiling"
-    Imputation = "imputation"
-    OutlierDetection = "outlier_detection"
-    Normalization = "normalization"
-    Encoding = "encoding"
-    Scaling = "scaling"
 # ---------------------------------------------------------------------------
 # Type-detection enums — kept for TypeDetector compatibility
 # ---------------------------------------------------------------------------
@@ -274,98 +245,6 @@ class ProfileConfig:
         return cls.from_dict(json.loads(json_str))
-@dataclass
-class PipelineConfig:
-    """
-    Master configuration for the full 6-phase feature engineering pipeline.
-    Parameters
-    ----------
-    exclude_columns : list[str]
-        Hard exclusions — columns dropped globally from every phase.
-    phase_exclusions : dict[PipelinePhase, list[str]]
-        Soft exclusions — columns bypassed for a specific phase but retained
-        in the dataset.
-    column_overrides : dict[str, SemanticType]
-        Explicit semantic type assignments respected by all downstream phases.
-    profiling : ProfileConfig
-        Phase 1-specific parameters (correlation, chunking, memory threshold).
-    """
-    exclude_columns: list[str] = field(default_factory=list)
-    phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict)
-    column_overrides: dict[str, SemanticType] = field(default_factory=dict)
-    profiling: ProfileConfig = field(default_factory=ProfileConfig)
-    def resolve_active_columns(
-        self, phase: PipelinePhase, available_columns: list[str]
-    ) -> list[str]:
-        """
-        Return the columns the given phase should operate on.
-        Hard exclusions are applied first, then phase-specific soft exclusions.
-        Columns absent from available_columns are silently ignored in both lists.
-        """
-        hard_set = set(self.exclude_columns)
-        soft_set = set(self.phase_exclusions.get(phase, []))
-        excluded = hard_set | soft_set
-        return [c for c in available_columns if c not in excluded]
-    def set_column_type(
-        self, column: str, semantic_type: Union[str, "SemanticType"]
-    ) -> None:
-        """
-        Explicitly set the semantic type for a column, overriding auto-detection.
-        This override is respected by all downstream phases.
-        """
-        if isinstance(semantic_type, str):
-            try:
-                semantic_type = SemanticType(semantic_type)
-            except ValueError:
-                valid = [e.value for e in SemanticType]
-                raise ValueError(
-                    f"Unknown semantic type {semantic_type!r}. "
-                    f"Valid values: {valid}"
-                )
-        self.column_overrides[column] = semantic_type
-    def to_dict(self) -> dict:
-        return {
-            "exclude_columns": list(self.exclude_columns),
-            "phase_exclusions": {
-                str(phase): list(cols)
-                for phase, cols in self.phase_exclusions.items()
-            },
-            "column_overrides": {
-                col: str(sem_type)
-                for col, sem_type in self.column_overrides.items()
-            },
-            "profiling": self.profiling.to_dict(),
-        }
-    @classmethod
-    def from_dict(cls, data: dict) -> "PipelineConfig":
-        return cls(
-            exclude_columns=list(data.get("exclude_columns", [])),
-            phase_exclusions={
-                PipelinePhase(phase_str): list(cols)
-                for phase_str, cols in data.get("phase_exclusions", {}).items()
-            },
-            column_overrides={
-                col: SemanticType(sem_str)
-                for col, sem_str in data.get("column_overrides", {}).items()
-            },
-            profiling=ProfileConfig.from_dict(data.get("profiling", {})),
-        )
-    def to_json(self, indent: int = 2) -> str:
-        return json.dumps(self.to_dict(), indent=indent)
-    @classmethod
-    def from_json(cls, json_str: str) -> "PipelineConfig":
-        return cls.from_dict(json.loads(json_str))
 @dataclass
 class ColumnTypeInfo:
     column: str

{dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/structural.py RENAMED Viewed

@@ -34,15 +34,12 @@ from ._missingness_profiler import MissingnessProfiler
 from ._target_profiler import TargetProfiler
 from ._correlation_profiler import CorrelationProfiler
 from ._type_detector import TypeDetector
+from ..config import PipelineConfig, PipelinePhase, SemanticType, Modality
 from .config import (
-    PipelineConfig,
-    PipelinePhase,
     ColumnProfile,
     StructuralProfileResult,
     RowMissingnessDistribution,
-    SemanticType,
     TypeFlag,
-    Modality,
 )
 _ROW_DROP_THRESHOLD = 0.50

{dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataforge-ml
-Version: 0.10.0
+Version: 0.11.0
 Summary: A automated feature engineering and designing pipeline library
 License: MIT
 Classifier: License :: OSI Approved :: MIT License

{dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml.egg-info/SOURCES.txt RENAMED Viewed

@@ -2,6 +2,7 @@ LICENSE
 README.md
 pyproject.toml
 src/dataforge_ml/__init__.py
+src/dataforge_ml/config.py
 src/dataforge_ml.egg-info/PKG-INFO
 src/dataforge_ml.egg-info/SOURCES.txt
 src/dataforge_ml.egg-info/dependency_links.txt