PyPI - dataforge-ml - Versions diffs - 0.7.0__tar.gz → 0.9.0__tar.gz - Mend

dataforge-ml 0.7.0.tar.gz → 0.9.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

{dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataforge-ml
-Version: 0.7.0
+Version: 0.9.0
 Summary: A automated feature engineering and designing pipeline library
 License: MIT
 Classifier: License :: OSI Approved :: MIT License

{dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "dataforge-ml"
-version = "0.7.0"
+version = "0.9.0"
 description = "A automated feature engineering and designing pipeline library"
 readme = "README.md"
 requires-python = ">=3.10"

{dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/__init__.py RENAMED Viewed

@@ -1,6 +1,8 @@
 from .structural import StructuralProfiler
 from .config import (
     ProfileConfig,
+    PipelineConfig,
+    PipelinePhase,
     SemanticType,
     Modality,
     TypeFlag,
@@ -19,6 +21,8 @@ from ._base import ModalityProfiler
 __all__ = [
     "StructuralProfiler",
     "ProfileConfig",
+    "PipelineConfig",
+    "PipelinePhase",
     "SemanticType",
     "Modality",
     "TypeFlag",

{dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_base.py RENAMED Viewed

@@ -3,9 +3,9 @@ Abstract base classes for all structural profilers.
 Hierarchy
 ---------
-Profiling[R]                    — root: stores config, provides _resolve_columns
-├── ColumnBatchProfiler[R]      — registry tier: __init__(config=None) only;
-│   │                             profile(df, columns) processes a typed column batch
+Profiling[R]                    — root: thin ABC, provides _resolve_columns
+├── ColumnBatchProfiler[R]      — registry tier: profile(df, columns) processes a
+│   │                             typed column batch; no config, no eligibility gates
 │   ├── NumericProfiler
 │   ├── CategoricalProfiler
 │   ├── DatetimeProfiler
@@ -26,22 +26,19 @@ import polars as pl
 from abc import abstractmethod, ABC
 from typing import Generic, TypeVar
-from .config import DatasetStats, ProfileConfig
+from .config import DatasetStats
 R = TypeVar("R")
 class Profiling(ABC, Generic[R]):
     """
-    Root base for all profilers.
+    Root base for all profilers. Thin ABC — no config state.
-    Stores config and provides _resolve_columns. Not instantiated directly —
-    use one of the three concrete tier bases below.
+    Sub-processors are pure batch processors: given a DataFrame and a column
+    list, return a result. No routing, no scoping, no config.
     """
-    def __init__(self, config: ProfileConfig | None = None):
-        self.config = config or ProfileConfig()
     @abstractmethod
     def profile(self, data: pl.DataFrame, **kwargs) -> R: ...
@@ -62,11 +59,11 @@ class ColumnBatchProfiler(Profiling[R]):
     Contract
     --------
-    - __init__ must accept ONLY config (no extra required params). This allows
-      StructuralProfiler to instantiate any registered profiler uniformly via
-          profiler_cls(config=self.config)
+    - __init__ takes no arguments (instantiated as profiler_cls()).
     - profile(df, columns) receives the full DataFrame and the list of same-type
-      column names to process. Returns a result with:
+      column names to process. Profiles every column in the list without any
+      internal eligibility gate or config consultation.
+    - Returns a result with:
           .columns: dict[str, <Stats>]        — per-column stats
           .analysed_columns: list[str]        — columns actually profiled
     """

{dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_boolean_profiler.py RENAMED Viewed

@@ -22,11 +22,7 @@ from __future__ import annotations
 import polars as pl
 from ._base import ColumnBatchProfiler
-from .config import (
-    ProfileConfig,
-    BooleanStats,
-    SemanticType,
-)
+from .config import BooleanStats
 from ._boolean_config import BooleanProfileResult
 from ..models._data_types import _INT_DTYPES
@@ -42,22 +38,10 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
     """
     Boolean column profiler for Polars DataFrames.
-    A column is eligible when:
-      - Its Polars dtype is pl.Boolean, OR
-      - Its dtype is an integer with values exclusively in {0, 1}, OR
-      - It has a SemanticType.Boolean override in ProfileConfig.column_overrides
-    Non-eligible columns in the provided list are silently skipped.
-    Parameters
-    ----------
-    config : ProfileConfig | None
-        Shared profiling configuration.
+    Profiles every column passed to profile(df, columns) — no config,
+    no internal eligibility gate.
     """
-    def __init__(self, config: ProfileConfig | None = None) -> None:
-        super().__init__(config)
     # ------------------------------------------------------------------
     # Public API
     # ------------------------------------------------------------------
@@ -69,23 +53,6 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
     ) -> BooleanProfileResult:
         return self._run(data, columns)
-    # ------------------------------------------------------------------
-    # Eligibility
-    # ------------------------------------------------------------------
-    def _eligible(self, series: pl.Series) -> bool:
-        override = self.config.column_overrides.get(series.name)
-        # Explicit override — trust it
-        if override == SemanticType.Boolean:
-            return True
-        # Another override takes precedence over auto-detection
-        if override is not None:
-            return False
-        return True
     # ------------------------------------------------------------------
     # Orchestration
     # ------------------------------------------------------------------
@@ -97,11 +64,7 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
     ) -> BooleanProfileResult:
         result = BooleanProfileResult()
-        available = [
-            c
-            for c in self._resolve_columns(df.columns, columns)
-            if self._eligible(df[c])
-        ]
+        available = self._resolve_columns(df.columns, columns)
         result.analysed_columns = available
         for col_name in available:

{dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_categorical.py RENAMED Viewed

@@ -45,10 +45,6 @@ from ._categorical_config import (
     RareCategoryStats,
     ImbalanceMetrics,
 )
-from .config import (
-    ProfileConfig,
-    SemanticType,
-)
 # ---------------------------------------------------------------------------
 # Module-level thresholds (documented so callers can see what drives flags)
@@ -65,29 +61,10 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
     """
     Categorical profiler for Polars DataFrames.
-    Parameters
-    ----------
-    columns : list[str]
-        Columns to profile. The profiler intersects this list with
-        the DataFrame's actual columns at runtime.
-    config : ProfileConfig | None
-        Shared profiling configuration (used for chunk_size, etc.).
-    Usage
-    -----
-    >>> profiler = CategoricalProfiler(
-    ...     columns=["status", "country", "product_type"],
-    ... )
-    >>> result = profiler.profile(df)
-    >>> print(result)
+    Profiles every column passed to profile(df, columns) — no config,
+    no internal eligibility gate.
     """
-    def __init__(
-        self,
-        config: ProfileConfig | None = None,
-    ) -> None:
-        super().__init__(config)
     # ------------------------------------------------------------------
     # Public API
     # ------------------------------------------------------------------
@@ -103,19 +80,6 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
     # Orchestration
     # ------------------------------------------------------------------
-    def _eligible(
-        self,
-        series: pl.Series,
-    ) -> bool:
-        override = self.config.column_overrides.get(series.name)
-        if override == SemanticType.Categorical:
-            return True
-        if override is not None:
-            return False
-        return True
     def _run(
         self,
         df: pl.DataFrame,
@@ -123,12 +87,7 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
     ) -> CategoricalProfileResult:
         result = CategoricalProfileResult()
-        # Resolve columns against actual schema
-        available = [
-            c
-            for c in self._resolve_columns(df.columns, columns)
-            if self._eligible(df[c])
-        ]
+        available = self._resolve_columns(df.columns, columns)
         result.analysed_columns = available
         n_rows = df.height

{dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_correlation_profiler.py RENAMED Viewed

@@ -125,7 +125,7 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
         near_redundant_threshold: float = _NEAR_REDUNDANT_THRESHOLD,
         top_n_feature_target: int = _TOP_N_FEATURE_TARGET,
     ) -> None:
-        super().__init__(config)
+        super().__init__()
         self._numeric_columns = numeric_columns
         self._categorical_columns = categorical_columns or []
         self._threshold = near_redundant_threshold

{dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_datetime_profiler.py RENAMED Viewed

@@ -43,10 +43,6 @@ from datetime import datetime, timezone
 import polars as pl
 from ._base import ColumnBatchProfiler
-from .config import (
-    ProfileConfig,
-    SemanticType,
-)
 from ._datetime_config import (
     DatetimeProfileResult,
     DatetimeStats,
@@ -90,20 +86,11 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
     """
     Datetime distribution profiler for Polars DataFrames.
-    Parameters
-    ----------
-    columns : list[str]
-        Columns to profile.  Non-datetime columns are skipped with a warning.
-    config : ProfileConfig | None
-        Shared profiling configuration.
+    Profiles every column passed to profile(df, columns) — no config,
+    no internal eligibility gate. String columns are coerced to Datetime;
+    columns that cannot be coerced are silently skipped.
     """
-    def __init__(
-        self,
-        config: ProfileConfig | None = None,
-    ) -> None:
-        super().__init__(config)
     # ------------------------------------------------------------------
     # Public API
     # ------------------------------------------------------------------
@@ -119,35 +106,21 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
     # Orchestration
     # ------------------------------------------------------------------
-    def _eligible(self, series: pl.Series) -> bool:
-        override = self.config.column_overrides.get(series.name)
-        if override == SemanticType.Datetime:
-            return True
-        if override is not None:
-            return False
-        return _is_datetime_dtype(series.dtype) or series.dtype in (pl.Utf8, pl.String)
     def _coerce_to_datetime(self, series: pl.Series) -> pl.Series | None:
         if series.dtype in (pl.Utf8, pl.String):
             coerced = series.str.to_datetime(strict=False)
             return coerced if coerced.drop_nulls().len() > 0 else None
-        return series
+        if _is_datetime_dtype(series.dtype):
+            return series
+        return None
     def _run(self, df: pl.DataFrame, columns: list[str]) -> DatetimeProfileResult:
         result = DatetimeProfileResult()
         now = datetime.now(tz=timezone.utc)
-        candidates = [
-            c
-            for c in self._resolve_columns(df.columns, columns)
-            if self._eligible(df[c])
-        ]
         available = []
         coerced_cache = {}
-        for col_name in candidates:
+        for col_name in self._resolve_columns(df.columns, columns):
             series = self._coerce_to_datetime(df[col_name])
             if series is not None:
                 available.append(col_name)

{dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_missingness_profiler.py RENAMED Viewed

@@ -3,19 +3,10 @@ MissingnessProfiler  –  Phase 1 extension: Missingness Profiling.
 Eligibility model
 -----------------
-Effective-null detection is based on **dtype first**, with SemanticType
-overrides acting only as suppressors, not as enablers:
+Effective-null detection is purely dtype-driven — no SemanticType overrides:
-sentinel-string detection  →  runs when dtype is Utf8/String
-                                suppressed if override is Numeric / Datetime / Boolean
-                                (those types cannot have meaningful sentinel strings)
-Inf / NaN expansion        →  runs when dtype is Float32/Float64
-                                never suppressed (Inf in a float column is always
-                                effectively missing regardless of semantic label)
-column_overrides is SPARSE — most columns will have no entry.
-Absence of an override is not a signal; it means "trust the dtype".
+sentinel-string detection  →  runs for every String/Utf8 column unconditionally
+Inf / NaN expansion        →  runs for every Float32/Float64 column unconditionally
 """
 from __future__ import annotations
@@ -24,13 +15,13 @@ from __future__ import annotations
 import polars as pl
 from ._base import DatasetLevelProfiler
-from .config import ProfileConfig, SemanticType
 from ._missingness_config import (
     ColumnMissingnessProfile,
     MissingnessFlag,
     MissingnessProfileResult,
     MissingSeverity,
 )
+from ._null_detection import _SENTINEL_STRINGS, _inf_eligible, _sentinel_eligible
 # ---------------------------------------------------------------------------
 # Thresholds
@@ -43,52 +34,12 @@ _SEVERITY_HIGH = 0.20
 _MAR_CORRELATION_THRESHOLD = 0.60
 _COL_DROP_THRESHOLD = 0.50
-_SENTINEL_STRINGS = frozenset({"NA", "NAN", "NULL", "NONE", "?"})
-# Overrides that suppress sentinel-string detection on a String column.
-# If a column is String but the user says "this is Numeric", treating
-# "NA" as a sentinel is correct — but if they say Categorical or Text,
-# sentinel detection still makes sense and should run.
-_SENTINEL_SUPPRESSING_SEMANTICS = frozenset(
-    {
-        SemanticType.Numeric,
-        SemanticType.Datetime,
-        SemanticType.Boolean,
-        SemanticType.Identifier,
-    }
-)
-def _sentinel_eligible(dtype: pl.DataType, override: SemanticType | None) -> bool:
-    """True when sentinel-string detection should run for this column."""
-    if dtype not in (pl.Utf8, pl.String):
-        return False
-    # Override present and it's a non-text semantic → suppress
-    if override is not None and override in _SENTINEL_SUPPRESSING_SEMANTICS:
-        return False
-    return True
-def _inf_eligible(dtype: pl.DataType) -> bool:
-    """True when Inf/NaN expansion should run. Always dtype-driven, never suppressed."""
-    return dtype in (pl.Float32, pl.Float64)
 class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
-    """
-    Missingness profiler for Polars DataFrames.
-    Column scoping
-    --------------
-    Resolution priority (high → low):
-      1. Explicit ``columns`` argument to ``profile()``.
-      2. ``config.exclude_columns`` — always removed.
-      3. All remaining DataFrame columns.
-    """
+    """Missingness profiler for Polars DataFrames."""
-    def __init__(self, config: ProfileConfig | None = None) -> None:
-        super().__init__(config)
-        self._config: ProfileConfig = config or ProfileConfig()
+    def __init__(self) -> None:
+        super().__init__()
     # ------------------------------------------------------------------
     # Public API
@@ -117,16 +68,13 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
         if n_rows == 0 or not cols:
             return result
-        overrides = self._config.column_overrides  # sparse — most keys absent
         indicator_cols: list[pl.Series] = []
         for col_name in cols:
-            override = overrides.get(col_name)  # None for most columns
             col_profile, indicator = self._profile_column(
                 series=df[col_name],
                 col_name=col_name,
                 n_rows=n_rows,
-                override=override,
             )
             result.columns[col_name] = col_profile
             indicator_cols.append(indicator)
@@ -173,21 +121,12 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
         series: pl.Series,
         col_name: str,
         n_rows: int,
-        override: SemanticType | None = None,  # sparse — None is the common case
     ) -> tuple[ColumnMissingnessProfile, pl.Series]:
-        """
-        Compute standard + effective null counts for one column.
-        Eligibility is dtype-first:
-        - sentinel strings  → String dtype, unless override suppresses it
-        - Inf/NaN           → Float dtype, always (never suppressed)
-        - everything else   → standard Polars null only
-        """
         profile = ColumnMissingnessProfile(column=col_name, total_rows=n_rows)
         dtype = series.dtype
         std_null = series.is_null()
-        if _sentinel_eligible(dtype, override):
+        if _sentinel_eligible(dtype):
             eff_null = (
                 std_null
                 | (series.str.strip_chars() == "")
@@ -208,7 +147,9 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
         r = profile.effective_null_ratio
-        if r < _SEVERITY_MINOR and r != 0:
+        if r == 0.0:
+            profile.severity = None
+        elif r < _SEVERITY_MINOR:
             profile.severity = MissingSeverity.Minor
         elif r < _SEVERITY_MODERATE:
             profile.severity = MissingSeverity.Moderate

dataforge_ml-0.9.0/src/dataforge_ml/profiling/_null_detection.py ADDED Viewed

@@ -0,0 +1,22 @@
+"""
+_null_detection  –  shared dtype-driven null primitives for Phase 1.
+Single authority for what counts as "effectively null" across the entire
+Phase 1 implementation. No config, no SemanticType overrides, no state.
+"""
+from __future__ import annotations
+import polars as pl
+_SENTINEL_STRINGS: frozenset[str] = frozenset({"NA", "NAN", "NULL", "NONE", "?"})
+def _sentinel_eligible(dtype: pl.DataType) -> bool:
+    """True when sentinel-string detection should run for this column (String/Utf8 only)."""
+    return dtype in (pl.Utf8, pl.String)
+def _inf_eligible(dtype: pl.DataType) -> bool:
+    """True when Inf/NaN expansion should run (Float32/Float64 only)."""
+    return dtype in (pl.Float32, pl.Float64)

{dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_numeric_profiler.py RENAMED Viewed

@@ -35,10 +35,6 @@ from __future__ import annotations
 import polars as pl
 from ._base import ColumnBatchProfiler
-from .config import (
-    ProfileConfig,
-    SemanticType,
-)
 from ._correlation_profiler import _INT_DTYPES
 from ._numeric_config import (
     NumericProfileResult,
@@ -80,21 +76,10 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
     """
     Numeric distribution profiler for Polars DataFrames.
-    Parameters
-    ----------
-    columns : list[str]
-        Columns to profile.  Non-numeric or absent columns are skipped
-        with a warning; they do not raise.
-    config : ProfileConfig | None
-        Shared profiling configuration.
+    Profiles every column passed to profile(df, columns) — no config,
+    no internal eligibility gate.
     """
-    def __init__(
-        self,
-        config: ProfileConfig | None = None,
-    ) -> None:
-        super().__init__(config)
     # ------------------------------------------------------------------
     # Public API
     # ------------------------------------------------------------------
@@ -110,16 +95,6 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
     # Orchestration
     # ------------------------------------------------------------------
-    def _eligible(self, series: pl.Series) -> bool:
-        override = self.config.column_overrides.get(series.name)
-        if override == SemanticType.Numeric:
-            return True
-        if override is not None:
-            return False
-        return True
     def _run(
         self,
         df: pl.DataFrame,
@@ -128,11 +103,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
         result = NumericProfileResult()
         n_rows = df.height
-        available = [
-            c
-            for c in self._resolve_columns(df.columns, columns)
-            if self._eligible(df[c])
-        ]
+        available = self._resolve_columns(df.columns, columns)
         result.analysed_columns = available
         if not available:
@@ -254,16 +225,17 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
                 for i in range(top_rows)
             ]
         else:
-            # --- 20-Bin Histogram Distribution (Continuous) ---
+            # --- Histogram Distribution (Continuous) ---
             import numpy as np
             counts, bin_edges = np.histogram(clean_f64.to_numpy(), bins="auto")
+            n_clean = clean_f64.len()
             profile.histogram = [
                 HistogramBin(
                     lower_bound=float(bin_edges[i]),
                     upper_bound=float(bin_edges[i + 1]),
                     count=int(counts[i]),
-                    percentage=int(counts[i]) / n_rows if n_rows > 0 else 0.0,
+                    percentage=int(counts[i]) / n_clean if n_clean > 0 else 0.0,
                 )
                 for i in range(len(counts))
             ]

{dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_tabular.py RENAMED Viewed

@@ -3,16 +3,18 @@ TabularProfiler  –  Phase 1: Structural Profiling for tabular datasets.
 All DataFrame operations use Polars (no pandas dependency).
+A pipeline-agnostic data-catalog tool: receives the full raw DataFrame and
+computes dataset-level stats over every column — no exclusion logic, no
+config dependency.
 Computes:
-  • row / column count                (always full dataset)
+  • row / column count                (full dataset)
   • memory usage + per-column breakdown when threshold exceeded
-  • duplicate row count & ratio       (scoped to config.duplicate_columns)
-  • overall sparsity                  (scoped to config.sparsity_columns)
-  • data-type detection               (scoped to config.type_detection_columns;
-                                       skipped entirely when None)
+  • duplicate row count & ratio       (all columns)
+  • overall sparsity                  (all columns)
 Chunked processing is activated automatically when the DataFrame's
-estimated memory exceeds config.memory_threshold_mb.
+estimated memory exceeds _MEMORY_THRESHOLD_MB.
 """
 from __future__ import annotations
@@ -24,31 +26,32 @@ import polars as pl
 from ._base import ModalityProfiler
 from .config import (
     MemoryBreakdown,
-    ProfileConfig,
     DatasetStats,
 )
+# ---------------------------------------------------------------------------
+# Module-level constants (previously sourced from ProfileConfig)
+# ---------------------------------------------------------------------------
+_MEMORY_THRESHOLD_MB: float = 500.0
+_CHUNK_SIZE: int = 100_000
 class TabularProfiler(ModalityProfiler):
     """
     Structural profiler for Polars DataFrames.
+    Pipeline-agnostic: accepts no constructor arguments and applies no column
+    filtering. Computes dataset-level stats (row count, column count, memory,
+    duplicate ratio, overall sparsity) over the complete DataFrame it receives.
     Usage
     -----
-    >>> cfg = ProfileConfig(
-    ...     duplicate_columns=["user_id", "event_time"],
-    ...     sparsity_columns=["age", "income", "postcode"],
-    ...     type_detection_columns=["age", "income", "postcode", "created_at"],
-    ...     memory_threshold_mb=200,
-    ... )
-    >>> profiler = TabularProfiler(config=cfg)
+    >>> profiler = TabularProfiler()
     >>> result = profiler.profile(df)
     >>> print(result)
     """
-    def __init__(self, config: ProfileConfig | None = None):
-        super().__init__(config)
     # ------------------------------------------------------------------
     # Public API
     # ------------------------------------------------------------------
@@ -77,17 +80,13 @@ class TabularProfiler(ModalityProfiler):
         if result.row_count == 0:
             return result
-        # 3. Resolve column scopes
+        # 3. Operate on all columns — no exclusion logic
         all_cols: list[str] = df.columns
-        analysed_cols = [c for c in all_cols if c not in self.config.exclude_columns]
-        dup_cols = analysed_cols
-        missingness_cols = analysed_cols
         if use_chunks:
-            self._chunked_metrics(df, dup_cols, missingness_cols, result)
+            self._chunked_metrics(df, all_cols, all_cols, result)
         else:
-            self._full_metrics(df, dup_cols, missingness_cols, result)
+            self._full_metrics(df, all_cols, all_cols, result)
         return result
@@ -136,7 +135,7 @@ class TabularProfiler(ModalityProfiler):
         total_bytes = sum(col_bytes.values())
         result.memory_bytes = total_bytes
-        threshold_bytes = self.config.memory_threshold_mb * 1024 * 1024
+        threshold_bytes = _MEMORY_THRESHOLD_MB * 1024 * 1024
         if total_bytes > threshold_bytes:
             result.memory_breakdown = MemoryBreakdown(column_bytes=col_bytes)
@@ -189,7 +188,7 @@ class TabularProfiler(ModalityProfiler):
         seen hashes — semantics match keep='first'.
         Sparsity is accumulated as (missing_cells, total_cells).
         """
-        chunk_size = self.config.chunk_size
+        chunk_size = _CHUNK_SIZE
         n_chunks = math.ceil(result.row_count / chunk_size)
         seen_hashes: set[int] = set()

{dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_target_profiler.py RENAMED Viewed

@@ -36,7 +36,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
     """
     def __init__(self, target_column: str, config: ProfileConfig | None = None) -> None:
-        super().__init__(config)
+        super().__init__()
         self.target_column = target_column
     def profile(self, data: pl.DataFrame, **kwargs) -> TargetProfileResult:
@@ -129,7 +129,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
         self, series: pl.Series, n_rows: int, result: TargetProfileResult
     ) -> None:
         """Generates categorical metrics and checks for class imbalance."""
-        cat_profiler = CategoricalProfiler(config=self.config)
+        cat_profiler = CategoricalProfiler()
         # Internally compute cardinality, top values, and imbalance metrics
         cat_profile = cat_profiler._profile_column(series, self.target_column, n_rows)
@@ -146,7 +146,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
         self, series: pl.Series, n_rows: int, result: TargetProfileResult
     ) -> None:
         """Generates numeric metrics and checks for target skewness."""
-        num_profiler = NumericProfiler(config=self.config)
+        num_profiler = NumericProfiler()
         col_name = series.name
         num_result = num_profiler.profile(series.to_frame(), [col_name])

{dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_text_profiler.py RENAMED Viewed

@@ -54,11 +54,7 @@ from __future__ import annotations
 import polars as pl
 from ._base import ColumnBatchProfiler
-from .config import (
-    ProfileConfig,
-    TextStats,
-    SemanticType,
-)
+from .config import TextStats
 from ._text_config import TextProfileResult
 # Regex that counts non-whitespace token runs — used with str.count_matches.
@@ -69,22 +65,10 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
     """
     Free-text column profiler for Polars DataFrames.
-    A column is eligible when:
-      - It has a ``SemanticType.Text`` override in
-        ``ProfileConfig.column_overrides``, OR
-      - Its Polars dtype is ``pl.Utf8`` / ``pl.String`` and no override is set.
-    Non-eligible columns are silently skipped.
-    Parameters
-    ----------
-    config : ProfileConfig | None
-        Shared profiling configuration.
+    Profiles every column passed to profile(df, columns) — no config,
+    no internal eligibility gate.
     """
-    def __init__(self, config: ProfileConfig | None = None) -> None:
-        super().__init__(config)
     # ------------------------------------------------------------------
     # Public API
     # ------------------------------------------------------------------
@@ -96,24 +80,6 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
     ) -> TextProfileResult:
         return self._run(data, columns)
-    # ------------------------------------------------------------------
-    # Eligibility
-    # ------------------------------------------------------------------
-    def _eligible(self, series: pl.Series) -> bool:
-        override = self.config.column_overrides.get(series.name)
-        if override == SemanticType.Text:
-            return True
-        # Any other explicit override takes precedence
-        if override is not None:
-            return False
-        # Native string dtype (pl.Utf8 is the canonical name; pl.String is
-        # an alias in newer Polars — check both for cross-version safety)
-        return series.dtype in (pl.Utf8, pl.String)
     # ------------------------------------------------------------------
     # Orchestration
     # ------------------------------------------------------------------
@@ -125,11 +91,7 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
     ) -> TextProfileResult:
         result = TextProfileResult()
-        available = [
-            c
-            for c in self._resolve_columns(df.columns, columns)
-            if self._eligible(df[c])
-        ]
+        available = self._resolve_columns(df.columns, columns)
         result.analysed_columns = available
         for col_name in available:

{dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/config.py RENAMED Viewed

@@ -52,6 +52,15 @@ class Modality(StrEnum):
     # TimeSeries = "time_series"
+class PipelinePhase(StrEnum):
+    Profiling = "profiling"
+    Imputation = "imputation"
+    OutlierDetection = "outlier_detection"
+    Normalization = "normalization"
+    Encoding = "encoding"
+    Scaling = "scaling"
 # ---------------------------------------------------------------------------
 # Type-detection enums — kept for TypeDetector compatibility
 # ---------------------------------------------------------------------------
@@ -71,6 +80,7 @@ class TypeFlag(StrEnum):
     SequentialIndex = "sequential_index"
     FloatSequentialIndex = "float_sequential_index"
     FreeTextCandidate = "free_text_candidate"
+    UserOverride = "user_override"
 # ---------------------------------------------------------------------------
@@ -240,6 +250,34 @@ class ProfileConfig:
     memory_threshold_mb: float = 500.0
     chunk_size: int = 100_000
+    def set_column_type(
+        self, column: str, semantic_type: Union[str, "SemanticType"]
+    ) -> None:
+        """
+        Explicitly set the semantic type for a column, overriding auto-detection.
+        The override is the sole source of truth for that column's type — the
+        type detector's verdict is ignored during profiling.  Calling this method
+        multiple times on the same column is valid; the last call wins.
+        Parameters
+        ----------
+        column : str
+            Name of the column to override.
+        semantic_type : str | SemanticType
+            Target semantic type.  Accepts a plain string (e.g. ``"numeric"``,
+            ``"categorical"``) or a ``SemanticType`` enum value.  Invalid strings
+            raise ``ValueError``.
+        Raises
+        ------
+        ValueError
+            If ``semantic_type`` is a string that is not a valid
+            ``SemanticType`` value.  The message lists the accepted values.
+        """
+        if isinstance(semantic_type, str):
+            try:
+                semantic_type = SemanticType(semantic_type)
+            except ValueError:
+                valid = [e.value for e in SemanticType]
+                # ``from None`` suppresses the enum's own lookup error in the
+                # traceback; the message below already says everything it did.
+                raise ValueError(
+                    f"Unknown semantic type {semantic_type!r}. "
+                    f"Valid values: {valid}"
+                ) from None
+        self.column_overrides[column] = semantic_type
     def to_dict(self) -> dict:
         return {
             "modality": str(self.modality),
@@ -256,7 +294,7 @@ class ProfileConfig:
     def from_dict(cls, data: dict) -> ProfileConfig:
         return cls(
             modality=Modality(data.get("modality", Modality.Tabular)),
-            target_column=data.get("target_column"),
+            target_columns=list(data.get("target_columns", [])),
             column_overrides={
                 k: SemanticType(v) for k, v in data.get("column_overrides", {}).items()
             },
@@ -275,6 +313,98 @@ class ProfileConfig:
         return cls.from_dict(json.loads(json_str))
+@dataclass
+class PipelineConfig:
+    """
+    Master configuration for the full 6-phase feature engineering pipeline.
+    Parameters
+    ----------
+    exclude_columns : list[str]
+        Hard exclusions — columns dropped globally from every phase.
+    phase_exclusions : dict[PipelinePhase, list[str]]
+        Soft exclusions — columns bypassed for a specific phase but retained
+        in the dataset.
+    column_overrides : dict[str, SemanticType]
+        Explicit semantic type assignments respected by all downstream phases.
+    profiling : ProfileConfig
+        Phase 1-specific parameters (correlation, chunking, memory threshold).
+    """
+    exclude_columns: list[str] = field(default_factory=list)
+    phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict)
+    column_overrides: dict[str, SemanticType] = field(default_factory=dict)
+    profiling: ProfileConfig = field(default_factory=ProfileConfig)
+    def resolve_active_columns(
+        self, phase: PipelinePhase, available_columns: list[str]
+    ) -> list[str]:
+        """
+        Return the columns the given phase should operate on.
+        Hard exclusions are applied first, then phase-specific soft exclusions.
+        Columns absent from available_columns are silently ignored in both lists.
+        The order of ``available_columns`` is preserved in the result.
+        """
+        # One combined lookup set: hard exclusions plus this phase's soft ones.
+        excluded = set(self.exclude_columns).union(
+            self.phase_exclusions.get(phase, [])
+        )
+        return [c for c in available_columns if c not in excluded]
+    def set_column_type(
+        self, column: str, semantic_type: Union[str, "SemanticType"]
+    ) -> None:
+        """
+        Explicitly set the semantic type for a column, overriding auto-detection.
+        This override is respected by all downstream phases.
+        Raises ``ValueError`` when ``semantic_type`` is a string that is not a
+        valid ``SemanticType`` value; the message lists the accepted values.
+        """
+        if isinstance(semantic_type, str):
+            try:
+                semantic_type = SemanticType(semantic_type)
+            except ValueError:
+                valid = [e.value for e in SemanticType]
+                # ``from None`` suppresses the enum's own lookup error in the
+                # traceback; the message below already says everything it did.
+                raise ValueError(
+                    f"Unknown semantic type {semantic_type!r}. "
+                    f"Valid values: {valid}"
+                ) from None
+        self.column_overrides[column] = semantic_type
+    def to_dict(self) -> dict:
+        """
+        Return a plain-dict form of this config (the input to ``to_json``).
+        Enum keys and values are converted with ``str``; lists are copied so
+        the returned structure does not alias this instance's lists.
+        """
+        return {
+            "exclude_columns": list(self.exclude_columns),
+            "phase_exclusions": {
+                str(phase): list(cols)
+                for phase, cols in self.phase_exclusions.items()
+            },
+            "column_overrides": {
+                col: str(sem_type)
+                for col, sem_type in self.column_overrides.items()
+            },
+            "profiling": self.profiling.to_dict(),
+        }
+    @classmethod
+    def from_dict(cls, data: dict) -> "PipelineConfig":
+        """
+        Build a config from the mapping produced by ``to_dict``.
+        Missing keys fall back to empty collections (and to
+        ``ProfileConfig.from_dict({})`` for ``"profiling"``).  Phase and
+        semantic-type strings are converted back through their enum
+        constructors.
+        """
+        return cls(
+            exclude_columns=list(data.get("exclude_columns", [])),
+            phase_exclusions={
+                PipelinePhase(phase_str): list(cols)
+                for phase_str, cols in data.get("phase_exclusions", {}).items()
+            },
+            column_overrides={
+                col: SemanticType(sem_str)
+                for col, sem_str in data.get("column_overrides", {}).items()
+            },
+            profiling=ProfileConfig.from_dict(data.get("profiling", {})),
+        )
+    def to_json(self, indent: int = 2) -> str:
+        """Serialise ``to_dict()`` to a JSON string with the given indent."""
+        return json.dumps(self.to_dict(), indent=indent)
+    @classmethod
+    def from_json(cls, json_str: str) -> "PipelineConfig":
+        """Parse a JSON string and build a config from it via ``from_dict``."""
+        return cls.from_dict(json.loads(json_str))
 @dataclass
 class ColumnTypeInfo:
     column: str

{dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/structural.py RENAMED Viewed

@@ -35,11 +35,13 @@ from ._target_profiler import TargetProfiler
 from ._correlation_profiler import CorrelationProfiler
 from ._type_detector import TypeDetector
 from .config import (
-    ProfileConfig,
+    PipelineConfig,
+    PipelinePhase,
     ColumnProfile,
     StructuralProfileResult,
     RowMissingnessDistribution,
     SemanticType,
+    TypeFlag,
     Modality,
 )
@@ -63,14 +65,16 @@ _COLUMN_PROFILER_REGISTRY: dict[SemanticType, type[ColumnBatchProfiler]] = {  #
 class StructuralProfiler:
-    def __init__(self, config: ProfileConfig | None = None) -> None:
-        self.config = config or ProfileConfig()
+    def __init__(self, config: PipelineConfig | None = None) -> None:
+        self.config: PipelineConfig = config or PipelineConfig()
+        # Keep sub-profilers aligned with the master column_overrides.
+        self.config.profiling.column_overrides = self.config.column_overrides
-        if self.config.modality == Modality.Tabular:
-            self.modality_profiler: ModalityProfiler = TabularProfiler(self.config)
+        if self.config.profiling.modality == Modality.Tabular:
+            self.modality_profiler: ModalityProfiler = TabularProfiler()
         else:
             raise NotImplementedError(
-                f"modality {self.config.modality} not supported yet"
+                f"modality {self.config.profiling.modality} not supported yet"
             )
     # ------------------------------------------------------------------
@@ -86,7 +90,17 @@ class StructuralProfiler:
         result = StructuralProfileResult()
-        active_cols = [c for c in data.columns if c not in self.config.exclude_columns]
+        active_cols = self.config.resolve_active_columns(
+            PipelinePhase.Profiling, list(data.columns)
+        )
+        # Columns soft-excluded for Profiling: skipped but retained in the result.
+        # The lookup set is built once, outside the comprehension; a column
+        # that is also hard-excluded is dropped from it so hard exclusions win.
+        hard_set = set(self.config.exclude_columns)
+        soft_only = (
+            set(self.config.phase_exclusions.get(PipelinePhase.Profiling, []))
+            - hard_set
+        )
+        soft_retained = [c for c in data.columns if c in soft_only]
         # ── 1. Modality profiler ─────────────────────────────────────────
         # Replaces default DatasetStats with the real one (row_count, memory,
@@ -96,7 +110,7 @@ class StructuralProfiler:
         # ── 2. Missingness pre-pass ──────────────────────────────────────
         # setdefault creates ColumnProfile entries; subsequent steps mutate
         # the same objects via the same setdefault pattern.
-        missingness_result = MissingnessProfiler(config=self.config).profile(
+        missingness_result = MissingnessProfiler().profile(
             data, columns=active_cols
         )
         for col_name in missingness_result.analysed_columns:
@@ -111,7 +125,6 @@ class StructuralProfiler:
             df=data,
             cols=active_cols,
             n_rows=data.height,
-            overrides=self.config.column_overrides,
         )
         # ── 4. Type detection ────────────────────────────────────────────
@@ -130,7 +143,10 @@ class StructuralProfiler:
         # Overrides for excluded / non-existent columns are silently ignored.
         for col_name, override_type in self.config.column_overrides.items():
             if col_name in result.columns:
-                result.columns[col_name].semantic_type = override_type
+                cp = result.columns[col_name]
+                cp.semantic_type = override_type
+                if TypeFlag.UserOverride not in cp.type_flags:
+                    cp.type_flags.append(TypeFlag.UserOverride)
         # ── 6. Per-column profiling routed by SemanticType ───────────────
         # Batch all columns of the same SemanticType together and call each
@@ -149,7 +165,7 @@ class StructuralProfiler:
             profiler_cls = _COLUMN_PROFILER_REGISTRY.get(sem_type)  # type: ignore[arg-type]
             if profiler_cls is None:
                 continue
-            profiler = profiler_cls(config=self.config)
+            profiler = profiler_cls()
             try:
                 batch = profiler.profile(data, columns=cols)
                 for col_name in batch.analysed_columns:
@@ -161,13 +177,13 @@ class StructuralProfiler:
         # ── 7. Target columns ────────────────────────────────────────────
         # TargetProfiler produces target-specific analysis stored in
         # result.targets.  cp.stats is NOT overwritten — step 6 already set it.
-        if self.config.target_columns:
-            for target in self.config.target_columns:
+        if self.config.profiling.target_columns:
+            for target in self.config.profiling.target_columns:
                 if target not in data.columns:
                     continue
                 target_result = TargetProfiler(
                     target_column=target,
-                    config=self.config,
+                    config=self.config.profiling,
                 ).profile(data)
                 result.targets[target] = target_result
@@ -176,7 +192,7 @@ class StructuralProfiler:
                 cp.is_target = True
         # ── 8. Correlation ───────────────────────────────────────────────
-        if self.config.compute_correlation:
+        if self.config.profiling.compute_correlation:
             # Resolve column lists by detected SemanticType (post-override).
             numeric_cols = [
                 c
@@ -194,7 +210,7 @@ class StructuralProfiler:
             corr_profiler = CorrelationProfiler(
                 numeric_columns=numeric_cols,
                 categorical_columns=categorical_cols,
-                config=self.config,
+                config=self.config.profiling,
             )
             # 8a. Feature-feature matrices — computed ONCE, target-independent.
@@ -205,7 +221,7 @@ class StructuralProfiler:
             # 8b. Per-target analysis — matrices are NOT recomputed; each call
             #     shallow-copies feature_corr and appends target-specific fields.
-            for target in self.config.target_columns:
+            for target in self.config.profiling.target_columns:
                 if target not in data.columns:
                     continue
                 result.dataset.target_correlations[target] = (
@@ -214,6 +230,12 @@ class StructuralProfiler:
                     )
                 )
+        # ── Soft-excluded placeholders ───────────────────────────────────────
+        # Columns soft-excluded for Profiling are not profiled but must still
+        # appear in the result so downstream phases can reference them.
+        for col in soft_retained:
+            result.columns.setdefault(col, ColumnProfile(name=col))
         return result
     # ------------------------------------------------------------------
@@ -225,9 +247,8 @@ class StructuralProfiler:
         df: pl.DataFrame,
         cols: list[str],
         n_rows: int,
-        overrides: dict[str, SemanticType],
     ) -> RowMissingnessDistribution:
-        from ._missingness_profiler import (
+        from ._null_detection import (
             _sentinel_eligible,
             _inf_eligible,
             _SENTINEL_STRINGS,
@@ -242,10 +263,9 @@ class StructuralProfiler:
         for col_name in cols:
             dtype = df[col_name].dtype
-            override = overrides.get(col_name)
             null_e = pl.col(col_name).is_null()
-            if _sentinel_eligible(dtype, override):
+            if _sentinel_eligible(dtype):
                 eff = (
                     null_e
                     | (pl.col(col_name).str.strip_chars() == "")

{dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/utils/data_loader.py RENAMED Viewed

@@ -82,8 +82,6 @@ _EXT_LOADERS: dict[str, callable] = {
 class DataLoader:
-    def __init__(self, fmt: str | None = None) -> None:
-        self._fmt_override = fmt.lower() if fmt else None
     def load(
         self,
@@ -92,7 +90,7 @@ class DataLoader:
     ) -> pl.DataFrame:
         raw, ext_from_path = _read_raw(source)
-        resolved_fmt = (fmt or self._fmt_override or ext_from_path or "").lower()
+        resolved_fmt = (ext_from_path or "").lower()
         if resolved_fmt not in _EXT_LOADERS:
             label = resolved_fmt if resolved_fmt else "<unknown>"

{dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataforge-ml
-Version: 0.7.0
+Version: 0.9.0
 Summary: A automated feature engineering and designing pipeline library
 License: MIT
 Classifier: License :: OSI Approved :: MIT License

{dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/SOURCES.txt RENAMED Viewed

@@ -22,6 +22,7 @@ src/dataforge_ml/profiling/_datetime_config.py
 src/dataforge_ml/profiling/_datetime_profiler.py
 src/dataforge_ml/profiling/_missingness_config.py
 src/dataforge_ml/profiling/_missingness_profiler.py
+src/dataforge_ml/profiling/_null_detection.py
 src/dataforge_ml/profiling/_numeric_config.py
 src/dataforge_ml/profiling/_numeric_profiler.py
 src/dataforge_ml/profiling/_tabular.py