dataforge-ml 0.8.0__tar.gz → 0.10.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/PKG-INFO +3 -3
  2. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/README.md +2 -2
  3. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/pyproject.toml +1 -1
  4. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/__init__.py +4 -0
  5. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_base.py +11 -14
  6. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_boolean_profiler.py +4 -41
  7. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_categorical.py +3 -44
  8. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_correlation_profiler.py +1 -1
  9. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_datetime_profiler.py +7 -34
  10. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_missingness_profiler.py +8 -69
  11. dataforge_ml-0.10.0/src/dataforge_ml/profiling/_null_detection.py +22 -0
  12. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_numeric_profiler.py +3 -32
  13. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_tabular.py +25 -26
  14. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_target_profiler.py +3 -3
  15. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_text_profiler.py +4 -42
  16. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/config.py +101 -39
  17. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/structural.py +34 -20
  18. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml.egg-info/PKG-INFO +3 -3
  19. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml.egg-info/SOURCES.txt +1 -0
  20. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/LICENSE +0 -0
  21. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/setup.cfg +0 -0
  22. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/__init__.py +0 -0
  23. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/models/__init__.py +0 -0
  24. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/models/_data_structure.py +0 -0
  25. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/models/_data_types.py +0 -0
  26. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
  27. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
  28. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
  29. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
  30. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
  31. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
  32. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_target_config.py +0 -0
  33. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_text_config.py +0 -0
  34. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_type_detector.py +0 -0
  35. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/splitting/__init__.py +0 -0
  36. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/splitting/_config.py +0 -0
  37. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/splitting/_splitter.py +0 -0
  38. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/utils/__init__.py +0 -0
  39. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/utils/data_loader.py +0 -0
  40. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
  41. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml.egg-info/requires.txt +0 -0
  42. {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 0.8.0
3
+ Version: 0.10.0
4
4
  Summary: An automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -21,9 +21,9 @@ Provides-Extra: dev
21
21
  Requires-Dist: pytest>=8.0; extra == "dev"
22
22
  Dynamic: license-file
23
23
 
24
- # FeatureForge
24
+ # DataForgeML
25
25
 
26
- Automated feature engineering and data profiling pipeline library for tabular datasets.
26
+ Automated feature engineering and data profiling pipeline library for datasets.
27
27
 
28
28
  ## Installation
29
29
 
@@ -1,6 +1,6 @@
1
- # FeatureForge
1
+ # DataForgeML
2
2
 
3
- Automated feature engineering and data profiling pipeline library for tabular datasets.
3
+ Automated feature engineering and data profiling pipeline library for datasets.
4
4
 
5
5
  ## Installation
6
6
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dataforge-ml"
7
- version = "0.8.0"
7
+ version = "0.10.0"
8
8
  description = "An automated feature engineering and designing pipeline library"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -1,6 +1,8 @@
1
1
  from .structural import StructuralProfiler
2
2
  from .config import (
3
3
  ProfileConfig,
4
+ PipelineConfig,
5
+ PipelinePhase,
4
6
  SemanticType,
5
7
  Modality,
6
8
  TypeFlag,
@@ -19,6 +21,8 @@ from ._base import ModalityProfiler
19
21
  __all__ = [
20
22
  "StructuralProfiler",
21
23
  "ProfileConfig",
24
+ "PipelineConfig",
25
+ "PipelinePhase",
22
26
  "SemanticType",
23
27
  "Modality",
24
28
  "TypeFlag",
@@ -3,9 +3,9 @@ Abstract base classes for all structural profilers.
3
3
 
4
4
  Hierarchy
5
5
  ---------
6
- Profiling[R] — root: stores config, provides _resolve_columns
7
- ├── ColumnBatchProfiler[R] — registry tier: __init__(config=None) only;
8
- │ │ profile(df, columns) processes a typed column batch
6
+ Profiling[R] — root: thin ABC, provides _resolve_columns
7
+ ├── ColumnBatchProfiler[R] — registry tier: profile(df, columns) processes a
8
+ │ │ typed column batch; no config, no eligibility gates
9
9
  │ ├── NumericProfiler
10
10
  │ ├── CategoricalProfiler
11
11
  │ ├── DatetimeProfiler
@@ -26,22 +26,19 @@ import polars as pl
26
26
  from abc import abstractmethod, ABC
27
27
  from typing import Generic, TypeVar
28
28
 
29
- from .config import DatasetStats, ProfileConfig
29
+ from .config import DatasetStats
30
30
 
31
31
  R = TypeVar("R")
32
32
 
33
33
 
34
34
  class Profiling(ABC, Generic[R]):
35
35
  """
36
- Root base for all profilers.
36
+ Root base for all profilers. Thin ABC — no config state.
37
37
 
38
- Stores config and provides _resolve_columns. Not instantiated directly
39
- use one of the three concrete tier bases below.
38
+ Sub-processors are pure batch processors: given a DataFrame and a column
39
+ list, return a result. No routing, no scoping, no config.
40
40
  """
41
41
 
42
- def __init__(self, config: ProfileConfig | None = None):
43
- self.config = config or ProfileConfig()
44
-
45
42
  @abstractmethod
46
43
  def profile(self, data: pl.DataFrame, **kwargs) -> R: ...
47
44
 
@@ -62,11 +59,11 @@ class ColumnBatchProfiler(Profiling[R]):
62
59
 
63
60
  Contract
64
61
  --------
65
- - __init__ must accept ONLY config (no extra required params). This allows
66
- StructuralProfiler to instantiate any registered profiler uniformly via
67
- profiler_cls(config=self.config)
62
+ - __init__ takes no arguments (instantiated as profiler_cls()).
68
63
  - profile(df, columns) receives the full DataFrame and the list of same-type
69
- column names to process. Returns a result with:
64
+ column names to process. Profiles every column in the list without any
65
+ internal eligibility gate or config consultation.
66
+ - Returns a result with:
70
67
  .columns: dict[str, <Stats>] — per-column stats
71
68
  .analysed_columns: list[str] — columns actually profiled
72
69
  """
@@ -22,11 +22,7 @@ from __future__ import annotations
22
22
  import polars as pl
23
23
 
24
24
  from ._base import ColumnBatchProfiler
25
- from .config import (
26
- ProfileConfig,
27
- BooleanStats,
28
- SemanticType,
29
- )
25
+ from .config import BooleanStats
30
26
  from ._boolean_config import BooleanProfileResult
31
27
  from ..models._data_types import _INT_DTYPES
32
28
 
@@ -42,22 +38,10 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
42
38
  """
43
39
  Boolean column profiler for Polars DataFrames.
44
40
 
45
- A column is eligible when:
46
- - Its Polars dtype is pl.Boolean, OR
47
- - Its dtype is an integer with values exclusively in {0, 1}, OR
48
- - It has a SemanticType.Boolean override in ProfileConfig.column_overrides
49
-
50
- Non-eligible columns in the provided list are silently skipped.
51
-
52
- Parameters
53
- ----------
54
- config : ProfileConfig | None
55
- Shared profiling configuration.
41
+ Profiles every column passed to profile(df, columns) — no config,
42
+ no internal eligibility gate.
56
43
  """
57
44
 
58
- def __init__(self, config: ProfileConfig | None = None) -> None:
59
- super().__init__(config)
60
-
61
45
  # ------------------------------------------------------------------
62
46
  # Public API
63
47
  # ------------------------------------------------------------------
@@ -69,23 +53,6 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
69
53
  ) -> BooleanProfileResult:
70
54
  return self._run(data, columns)
71
55
 
72
- # ------------------------------------------------------------------
73
- # Eligibility
74
- # ------------------------------------------------------------------
75
-
76
- def _eligible(self, series: pl.Series) -> bool:
77
- override = self.config.column_overrides.get(series.name)
78
-
79
- # Explicit override — trust it
80
- if override == SemanticType.Boolean:
81
- return True
82
-
83
- # Another override takes precedence over auto-detection
84
- if override is not None:
85
- return False
86
-
87
- return True
88
-
89
56
  # ------------------------------------------------------------------
90
57
  # Orchestration
91
58
  # ------------------------------------------------------------------
@@ -97,11 +64,7 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
97
64
  ) -> BooleanProfileResult:
98
65
  result = BooleanProfileResult()
99
66
 
100
- available = [
101
- c
102
- for c in self._resolve_columns(df.columns, columns)
103
- if self._eligible(df[c])
104
- ]
67
+ available = self._resolve_columns(df.columns, columns)
105
68
  result.analysed_columns = available
106
69
 
107
70
  for col_name in available:
@@ -45,10 +45,6 @@ from ._categorical_config import (
45
45
  RareCategoryStats,
46
46
  ImbalanceMetrics,
47
47
  )
48
- from .config import (
49
- ProfileConfig,
50
- SemanticType,
51
- )
52
48
 
53
49
  # ---------------------------------------------------------------------------
54
50
  # Module-level thresholds (documented so callers can see what drives flags)
@@ -65,29 +61,10 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
65
61
  """
66
62
  Categorical profiler for Polars DataFrames.
67
63
 
68
- Parameters
69
- ----------
70
- columns : list[str]
71
- Columns to profile. The profiler intersects this list with
72
- the DataFrame's actual columns at runtime.
73
- config : ProfileConfig | None
74
- Shared profiling configuration (used for chunk_size, etc.).
75
-
76
- Usage
77
- -----
78
- >>> profiler = CategoricalProfiler(
79
- ... columns=["status", "country", "product_type"],
80
- ... )
81
- >>> result = profiler.profile(df)
82
- >>> print(result)
64
+ Profiles every column passed to profile(df, columns) — no config,
65
+ no internal eligibility gate.
83
66
  """
84
67
 
85
- def __init__(
86
- self,
87
- config: ProfileConfig | None = None,
88
- ) -> None:
89
- super().__init__(config)
90
-
91
68
  # ------------------------------------------------------------------
92
69
  # Public API
93
70
  # ------------------------------------------------------------------
@@ -103,19 +80,6 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
103
80
  # Orchestration
104
81
  # ------------------------------------------------------------------
105
82
 
106
- def _eligible(
107
- self,
108
- series: pl.Series,
109
- ) -> bool:
110
- override = self.config.column_overrides.get(series.name)
111
- if override == SemanticType.Categorical:
112
- return True
113
-
114
- if override is not None:
115
- return False
116
-
117
- return True
118
-
119
83
  def _run(
120
84
  self,
121
85
  df: pl.DataFrame,
@@ -123,12 +87,7 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
123
87
  ) -> CategoricalProfileResult:
124
88
  result = CategoricalProfileResult()
125
89
 
126
- # Resolve columns against actual schema
127
- available = [
128
- c
129
- for c in self._resolve_columns(df.columns, columns)
130
- if self._eligible(df[c])
131
- ]
90
+ available = self._resolve_columns(df.columns, columns)
132
91
  result.analysed_columns = available
133
92
 
134
93
  n_rows = df.height
@@ -125,7 +125,7 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
125
125
  near_redundant_threshold: float = _NEAR_REDUNDANT_THRESHOLD,
126
126
  top_n_feature_target: int = _TOP_N_FEATURE_TARGET,
127
127
  ) -> None:
128
- super().__init__(config)
128
+ super().__init__()
129
129
  self._numeric_columns = numeric_columns
130
130
  self._categorical_columns = categorical_columns or []
131
131
  self._threshold = near_redundant_threshold
@@ -43,10 +43,6 @@ from datetime import datetime, timezone
43
43
  import polars as pl
44
44
 
45
45
  from ._base import ColumnBatchProfiler
46
- from .config import (
47
- ProfileConfig,
48
- SemanticType,
49
- )
50
46
  from ._datetime_config import (
51
47
  DatetimeProfileResult,
52
48
  DatetimeStats,
@@ -90,20 +86,11 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
90
86
  """
91
87
  Datetime distribution profiler for Polars DataFrames.
92
88
 
93
- Parameters
94
- ----------
95
- columns : list[str]
96
- Columns to profile. Non-datetime columns are skipped with a warning.
97
- config : ProfileConfig | None
98
- Shared profiling configuration.
89
+ Profiles every column passed to profile(df, columns) — no config,
90
+ no internal eligibility gate. String columns are coerced to Datetime;
91
+ columns that cannot be coerced are silently skipped.
99
92
  """
100
93
 
101
- def __init__(
102
- self,
103
- config: ProfileConfig | None = None,
104
- ) -> None:
105
- super().__init__(config)
106
-
107
94
  # ------------------------------------------------------------------
108
95
  # Public API
109
96
  # ------------------------------------------------------------------
@@ -119,35 +106,21 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
119
106
  # Orchestration
120
107
  # ------------------------------------------------------------------
121
108
 
122
- def _eligible(self, series: pl.Series) -> bool:
123
- override = self.config.column_overrides.get(series.name)
124
-
125
- if override == SemanticType.Datetime:
126
- return True
127
- if override is not None:
128
- return False
129
-
130
- return _is_datetime_dtype(series.dtype) or series.dtype in (pl.Utf8, pl.String)
131
-
132
109
  def _coerce_to_datetime(self, series: pl.Series) -> pl.Series | None:
133
110
  if series.dtype in (pl.Utf8, pl.String):
134
111
  coerced = series.str.to_datetime(strict=False)
135
112
  return coerced if coerced.drop_nulls().len() > 0 else None
136
- return series
113
+ if _is_datetime_dtype(series.dtype):
114
+ return series
115
+ return None
137
116
 
138
117
  def _run(self, df: pl.DataFrame, columns: list[str]) -> DatetimeProfileResult:
139
118
  result = DatetimeProfileResult()
140
119
  now = datetime.now(tz=timezone.utc)
141
120
 
142
- candidates = [
143
- c
144
- for c in self._resolve_columns(df.columns, columns)
145
- if self._eligible(df[c])
146
- ]
147
-
148
121
  available = []
149
122
  coerced_cache = {}
150
- for col_name in candidates:
123
+ for col_name in self._resolve_columns(df.columns, columns):
151
124
  series = self._coerce_to_datetime(df[col_name])
152
125
  if series is not None:
153
126
  available.append(col_name)
@@ -3,19 +3,10 @@ MissingnessProfiler – Phase 1 extension: Missingness Profiling.
3
3
 
4
4
  Eligibility model
5
5
  -----------------
6
- Effective-null detection is based on **dtype first**, with SemanticType
7
- overrides acting only as suppressors, not as enablers:
6
+ Effective-null detection is purely dtype-driven — no SemanticType overrides:
8
7
 
9
- sentinel-string detection → runs when dtype is Utf8/String
10
- suppressed if override is Numeric / Datetime / Boolean
11
- (those types cannot have meaningful sentinel strings)
12
-
13
- Inf / NaN expansion → runs when dtype is Float32/Float64
14
- never suppressed (Inf in a float column is always
15
- effectively missing regardless of semantic label)
16
-
17
- column_overrides is SPARSE — most columns will have no entry.
18
- Absence of an override is not a signal; it means "trust the dtype".
8
+ sentinel-string detection → runs for every String/Utf8 column unconditionally
9
+ Inf / NaN expansion → runs for every Float32/Float64 column unconditionally
19
10
  """
20
11
 
21
12
  from __future__ import annotations
@@ -24,13 +15,13 @@ from __future__ import annotations
24
15
  import polars as pl
25
16
 
26
17
  from ._base import DatasetLevelProfiler
27
- from .config import ProfileConfig, SemanticType
28
18
  from ._missingness_config import (
29
19
  ColumnMissingnessProfile,
30
20
  MissingnessFlag,
31
21
  MissingnessProfileResult,
32
22
  MissingSeverity,
33
23
  )
24
+ from ._null_detection import _SENTINEL_STRINGS, _inf_eligible, _sentinel_eligible
34
25
 
35
26
  # ---------------------------------------------------------------------------
36
27
  # Thresholds
@@ -43,52 +34,12 @@ _SEVERITY_HIGH = 0.20
43
34
  _MAR_CORRELATION_THRESHOLD = 0.60
44
35
  _COL_DROP_THRESHOLD = 0.50
45
36
 
46
- _SENTINEL_STRINGS = frozenset({"NA", "NAN", "NULL", "NONE", "?"})
47
-
48
- # Overrides that suppress sentinel-string detection on a String column.
49
- # If a column is String but the user says "this is Numeric", treating
50
- # "NA" as a sentinel is correct — but if they say Categorical or Text,
51
- # sentinel detection still makes sense and should run.
52
- _SENTINEL_SUPPRESSING_SEMANTICS = frozenset(
53
- {
54
- SemanticType.Numeric,
55
- SemanticType.Datetime,
56
- SemanticType.Boolean,
57
- SemanticType.Identifier,
58
- }
59
- )
60
-
61
-
62
- def _sentinel_eligible(dtype: pl.DataType, override: SemanticType | None) -> bool:
63
- """True when sentinel-string detection should run for this column."""
64
- if dtype not in (pl.Utf8, pl.String):
65
- return False
66
- # Override present and it's a non-text semantic → suppress
67
- if override is not None and override in _SENTINEL_SUPPRESSING_SEMANTICS:
68
- return False
69
- return True
70
-
71
-
72
- def _inf_eligible(dtype: pl.DataType) -> bool:
73
- """True when Inf/NaN expansion should run. Always dtype-driven, never suppressed."""
74
- return dtype in (pl.Float32, pl.Float64)
75
-
76
37
 
77
38
  class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
78
- """
79
- Missingness profiler for Polars DataFrames.
80
-
81
- Column scoping
82
- --------------
83
- Resolution priority (high → low):
84
- 1. Explicit ``columns`` argument to ``profile()``.
85
- 2. ``config.exclude_columns`` — always removed.
86
- 3. All remaining DataFrame columns.
87
- """
39
+ """Missingness profiler for Polars DataFrames."""
88
40
 
89
- def __init__(self, config: ProfileConfig | None = None) -> None:
90
- super().__init__(config)
91
- self._config: ProfileConfig = config or ProfileConfig()
41
+ def __init__(self) -> None:
42
+ super().__init__()
92
43
 
93
44
  # ------------------------------------------------------------------
94
45
  # Public API
@@ -117,16 +68,13 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
117
68
  if n_rows == 0 or not cols:
118
69
  return result
119
70
 
120
- overrides = self._config.column_overrides # sparse — most keys absent
121
71
  indicator_cols: list[pl.Series] = []
122
72
 
123
73
  for col_name in cols:
124
- override = overrides.get(col_name) # None for most columns
125
74
  col_profile, indicator = self._profile_column(
126
75
  series=df[col_name],
127
76
  col_name=col_name,
128
77
  n_rows=n_rows,
129
- override=override,
130
78
  )
131
79
  result.columns[col_name] = col_profile
132
80
  indicator_cols.append(indicator)
@@ -173,21 +121,12 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
173
121
  series: pl.Series,
174
122
  col_name: str,
175
123
  n_rows: int,
176
- override: SemanticType | None = None, # sparse — None is the common case
177
124
  ) -> tuple[ColumnMissingnessProfile, pl.Series]:
178
- """
179
- Compute standard + effective null counts for one column.
180
-
181
- Eligibility is dtype-first:
182
- - sentinel strings → String dtype, unless override suppresses it
183
- - Inf/NaN → Float dtype, always (never suppressed)
184
- - everything else → standard Polars null only
185
- """
186
125
  profile = ColumnMissingnessProfile(column=col_name, total_rows=n_rows)
187
126
  dtype = series.dtype
188
127
  std_null = series.is_null()
189
128
 
190
- if _sentinel_eligible(dtype, override):
129
+ if _sentinel_eligible(dtype):
191
130
  eff_null = (
192
131
  std_null
193
132
  | (series.str.strip_chars() == "")
@@ -0,0 +1,22 @@
1
+ """
2
+ _null_detection – shared dtype-driven null primitives for Phase 1.
3
+
4
+ Single authority for what counts as "effectively null" across the entire
5
+ Phase 1 implementation. No config, no SemanticType overrides, no state.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import polars as pl
11
+
12
+ _SENTINEL_STRINGS: frozenset[str] = frozenset({"NA", "NAN", "NULL", "NONE", "?"})
13
+
14
+
15
+ def _sentinel_eligible(dtype: pl.DataType) -> bool:
16
+ """True when sentinel-string detection should run for this column (String/Utf8 only)."""
17
+ return dtype in (pl.Utf8, pl.String)
18
+
19
+
20
+ def _inf_eligible(dtype: pl.DataType) -> bool:
21
+ """True when Inf/NaN expansion should run (Float32/Float64 only)."""
22
+ return dtype in (pl.Float32, pl.Float64)
@@ -35,10 +35,6 @@ from __future__ import annotations
35
35
  import polars as pl
36
36
 
37
37
  from ._base import ColumnBatchProfiler
38
- from .config import (
39
- ProfileConfig,
40
- SemanticType,
41
- )
42
38
  from ._correlation_profiler import _INT_DTYPES
43
39
  from ._numeric_config import (
44
40
  NumericProfileResult,
@@ -80,21 +76,10 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
80
76
  """
81
77
  Numeric distribution profiler for Polars DataFrames.
82
78
 
83
- Parameters
84
- ----------
85
- columns : list[str]
86
- Columns to profile. Non-numeric or absent columns are skipped
87
- with a warning; they do not raise.
88
- config : ProfileConfig | None
89
- Shared profiling configuration.
79
+ Profiles every column passed to profile(df, columns) — no config,
80
+ no internal eligibility gate.
90
81
  """
91
82
 
92
- def __init__(
93
- self,
94
- config: ProfileConfig | None = None,
95
- ) -> None:
96
- super().__init__(config)
97
-
98
83
  # ------------------------------------------------------------------
99
84
  # Public API
100
85
  # ------------------------------------------------------------------
@@ -110,16 +95,6 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
110
95
  # Orchestration
111
96
  # ------------------------------------------------------------------
112
97
 
113
- def _eligible(self, series: pl.Series) -> bool:
114
- override = self.config.column_overrides.get(series.name)
115
- if override == SemanticType.Numeric:
116
- return True
117
-
118
- if override is not None:
119
- return False
120
-
121
- return True
122
-
123
98
  def _run(
124
99
  self,
125
100
  df: pl.DataFrame,
@@ -128,11 +103,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
128
103
  result = NumericProfileResult()
129
104
  n_rows = df.height
130
105
 
131
- available = [
132
- c
133
- for c in self._resolve_columns(df.columns, columns)
134
- if self._eligible(df[c])
135
- ]
106
+ available = self._resolve_columns(df.columns, columns)
136
107
  result.analysed_columns = available
137
108
 
138
109
  if not available:
@@ -3,16 +3,18 @@ TabularProfiler – Phase 1: Structural Profiling for tabular datasets.
3
3
 
4
4
  All DataFrame operations use Polars (no pandas dependency).
5
5
 
6
+ A pipeline-agnostic data-catalog tool: receives the full raw DataFrame and
7
+ computes dataset-level stats over every column — no exclusion logic, no
8
+ config dependency.
9
+
6
10
  Computes:
7
- • row / column count (always full dataset)
11
+ • row / column count (full dataset)
8
12
  • memory usage + per-column breakdown when threshold exceeded
9
- • duplicate row count & ratio (scoped to config.duplicate_columns)
10
- • overall sparsity (scoped to config.sparsity_columns)
11
- • data-type detection (scoped to config.type_detection_columns;
12
- skipped entirely when None)
13
+ • duplicate row count & ratio (all columns)
14
+ • overall sparsity (all columns)
13
15
 
14
16
  Chunked processing is activated automatically when the DataFrame's
15
- estimated memory exceeds config.memory_threshold_mb.
17
+ estimated memory exceeds _MEMORY_THRESHOLD_MB.
16
18
  """
17
19
 
18
20
  from __future__ import annotations
@@ -24,31 +26,32 @@ import polars as pl
24
26
  from ._base import ModalityProfiler
25
27
  from .config import (
26
28
  MemoryBreakdown,
27
- ProfileConfig,
28
29
  DatasetStats,
29
30
  )
30
31
 
32
+ # ---------------------------------------------------------------------------
33
+ # Module-level constants (previously sourced from ProfileConfig)
34
+ # ---------------------------------------------------------------------------
35
+
36
+ _MEMORY_THRESHOLD_MB: float = 500.0
37
+ _CHUNK_SIZE: int = 100_000
38
+
31
39
 
32
40
  class TabularProfiler(ModalityProfiler):
33
41
  """
34
42
  Structural profiler for Polars DataFrames.
35
43
 
44
+ Pipeline-agnostic: accepts no constructor arguments and applies no column
45
+ filtering. Computes dataset-level stats (row count, column count, memory,
46
+ duplicate ratio, overall sparsity) over the complete DataFrame it receives.
47
+
36
48
  Usage
37
49
  -----
38
- >>> cfg = ProfileConfig(
39
- ... duplicate_columns=["user_id", "event_time"],
40
- ... sparsity_columns=["age", "income", "postcode"],
41
- ... type_detection_columns=["age", "income", "postcode", "created_at"],
42
- ... memory_threshold_mb=200,
43
- ... )
44
- >>> profiler = TabularProfiler(config=cfg)
50
+ >>> profiler = TabularProfiler()
45
51
  >>> result = profiler.profile(df)
46
52
  >>> print(result)
47
53
  """
48
54
 
49
- def __init__(self, config: ProfileConfig | None = None):
50
- super().__init__(config)
51
-
52
55
  # ------------------------------------------------------------------
53
56
  # Public API
54
57
  # ------------------------------------------------------------------
@@ -77,17 +80,13 @@ class TabularProfiler(ModalityProfiler):
77
80
  if result.row_count == 0:
78
81
  return result
79
82
 
80
- # 3. Resolve column scopes
83
+ # 3. Operate on all columns — no exclusion logic
81
84
  all_cols: list[str] = df.columns
82
- analysed_cols = [c for c in all_cols if c not in self.config.exclude_columns]
83
-
84
- dup_cols = analysed_cols
85
- missingness_cols = analysed_cols
86
85
 
87
86
  if use_chunks:
88
- self._chunked_metrics(df, dup_cols, missingness_cols, result)
87
+ self._chunked_metrics(df, all_cols, all_cols, result)
89
88
  else:
90
- self._full_metrics(df, dup_cols, missingness_cols, result)
89
+ self._full_metrics(df, all_cols, all_cols, result)
91
90
 
92
91
  return result
93
92
 
@@ -136,7 +135,7 @@ class TabularProfiler(ModalityProfiler):
136
135
  total_bytes = sum(col_bytes.values())
137
136
 
138
137
  result.memory_bytes = total_bytes
139
- threshold_bytes = self.config.memory_threshold_mb * 1024 * 1024
138
+ threshold_bytes = _MEMORY_THRESHOLD_MB * 1024 * 1024
140
139
 
141
140
  if total_bytes > threshold_bytes:
142
141
  result.memory_breakdown = MemoryBreakdown(column_bytes=col_bytes)
@@ -189,7 +188,7 @@ class TabularProfiler(ModalityProfiler):
189
188
  seen hashes — semantics match keep='first'.
190
189
  Sparsity is accumulated as (missing_cells, total_cells).
191
190
  """
192
- chunk_size = self.config.chunk_size
191
+ chunk_size = _CHUNK_SIZE
193
192
  n_chunks = math.ceil(result.row_count / chunk_size)
194
193
 
195
194
  seen_hashes: set[int] = set()
@@ -36,7 +36,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
36
36
  """
37
37
 
38
38
  def __init__(self, target_column: str, config: ProfileConfig | None = None) -> None:
39
- super().__init__(config)
39
+ super().__init__()
40
40
  self.target_column = target_column
41
41
 
42
42
  def profile(self, data: pl.DataFrame, **kwargs) -> TargetProfileResult:
@@ -129,7 +129,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
129
129
  self, series: pl.Series, n_rows: int, result: TargetProfileResult
130
130
  ) -> None:
131
131
  """Generates categorical metrics and checks for class imbalance."""
132
- cat_profiler = CategoricalProfiler(config=self.config)
132
+ cat_profiler = CategoricalProfiler()
133
133
 
134
134
  # Internally compute cardinality, top values, and imbalance metrics
135
135
  cat_profile = cat_profiler._profile_column(series, self.target_column, n_rows)
@@ -146,7 +146,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
146
146
  self, series: pl.Series, n_rows: int, result: TargetProfileResult
147
147
  ) -> None:
148
148
  """Generates numeric metrics and checks for target skewness."""
149
- num_profiler = NumericProfiler(config=self.config)
149
+ num_profiler = NumericProfiler()
150
150
 
151
151
  col_name = series.name
152
152
  num_result = num_profiler.profile(series.to_frame(), [col_name])
@@ -54,11 +54,7 @@ from __future__ import annotations
54
54
  import polars as pl
55
55
 
56
56
  from ._base import ColumnBatchProfiler
57
- from .config import (
58
- ProfileConfig,
59
- TextStats,
60
- SemanticType,
61
- )
57
+ from .config import TextStats
62
58
  from ._text_config import TextProfileResult
63
59
 
64
60
  # Regex that counts non-whitespace token runs — used with str.count_matches.
@@ -69,22 +65,10 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
69
65
  """
70
66
  Free-text column profiler for Polars DataFrames.
71
67
 
72
- A column is eligible when:
73
- - It has a ``SemanticType.Text`` override in
74
- ``ProfileConfig.column_overrides``, OR
75
- - Its Polars dtype is ``pl.Utf8`` / ``pl.String`` and no override is set.
76
-
77
- Non-eligible columns are silently skipped.
78
-
79
- Parameters
80
- ----------
81
- config : ProfileConfig | None
82
- Shared profiling configuration.
68
+ Profiles every column passed to profile(df, columns) — no config,
69
+ no internal eligibility gate.
83
70
  """
84
71
 
85
- def __init__(self, config: ProfileConfig | None = None) -> None:
86
- super().__init__(config)
87
-
88
72
  # ------------------------------------------------------------------
89
73
  # Public API
90
74
  # ------------------------------------------------------------------
@@ -96,24 +80,6 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
96
80
  ) -> TextProfileResult:
97
81
  return self._run(data, columns)
98
82
 
99
- # ------------------------------------------------------------------
100
- # Eligibility
101
- # ------------------------------------------------------------------
102
-
103
- def _eligible(self, series: pl.Series) -> bool:
104
- override = self.config.column_overrides.get(series.name)
105
-
106
- if override == SemanticType.Text:
107
- return True
108
-
109
- # Any other explicit override takes precedence
110
- if override is not None:
111
- return False
112
-
113
- # Native string dtype (pl.Utf8 is the canonical name; pl.String is
114
- # an alias in newer Polars — check both for cross-version safety)
115
- return series.dtype in (pl.Utf8, pl.String)
116
-
117
83
  # ------------------------------------------------------------------
118
84
  # Orchestration
119
85
  # ------------------------------------------------------------------
@@ -125,11 +91,7 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
125
91
  ) -> TextProfileResult:
126
92
  result = TextProfileResult()
127
93
 
128
- available = [
129
- c
130
- for c in self._resolve_columns(df.columns, columns)
131
- if self._eligible(df[c])
132
- ]
94
+ available = self._resolve_columns(df.columns, columns)
133
95
  result.analysed_columns = available
134
96
 
135
97
  for col_name in available:
@@ -52,6 +52,15 @@ class Modality(StrEnum):
52
52
  # TimeSeries = "time_series"
53
53
 
54
54
 
55
+ class PipelinePhase(StrEnum):
56
+ Profiling = "profiling"
57
+ Imputation = "imputation"
58
+ OutlierDetection = "outlier_detection"
59
+ Normalization = "normalization"
60
+ Encoding = "encoding"
61
+ Scaling = "scaling"
62
+
63
+
55
64
  # ---------------------------------------------------------------------------
56
65
  # Type-detection enums — kept for TypeDetector compatibility
57
66
  # ---------------------------------------------------------------------------
@@ -218,10 +227,6 @@ class ProfileConfig:
218
227
  Data modality. Currently only Tabular is implemented.
219
228
  target_column : Optional[str]
220
229
  Name of the label/target column, if any.
221
- column_overrides : dict[str, SemanticType]
222
- Explicit semantic type assignments that override auto-detection.
223
- exclude_columns : list[str]
224
- Columns to skip entirely during profiling.
225
230
  compute_correlation : bool
226
231
  Whether to compute the feature-feature correlation matrix.
227
232
  correlation_target_column : Optional[str]
@@ -234,29 +239,84 @@ class ProfileConfig:
234
239
 
235
240
  modality: Modality = Modality.Tabular
236
241
  target_columns: list[str] = field(default_factory=list)
237
- column_overrides: dict[str, SemanticType] = field(default_factory=dict)
238
- exclude_columns: list[str] = field(default_factory=list)
239
242
  compute_correlation: bool = False
240
243
  correlation_target_column: Optional[str] = None
241
244
  memory_threshold_mb: float = 500.0
242
245
  chunk_size: int = 100_000
243
246
 
244
- def set_column_type(self, column: str, semantic_type: Union[str, "SemanticType"]) -> None:
247
+
248
+ def to_dict(self) -> dict:
249
+ return {
250
+ "modality": str(self.modality),
251
+ "target_columns": list(self.target_columns),
252
+ "compute_correlation": self.compute_correlation,
253
+ "correlation_target_column": self.correlation_target_column,
254
+ "memory_threshold_mb": self.memory_threshold_mb,
255
+ "chunk_size": self.chunk_size,
256
+ }
257
+
258
+ @classmethod
259
+ def from_dict(cls, data: dict) -> ProfileConfig:
260
+ return cls(
261
+ modality=Modality(data.get("modality", Modality.Tabular)),
262
+ target_columns=list(data.get("target_columns", [])),
263
+ compute_correlation=bool(data.get("compute_correlation", False)),
264
+ correlation_target_column=data.get("correlation_target_column"),
265
+ memory_threshold_mb=float(data.get("memory_threshold_mb", 500.0)),
266
+ chunk_size=int(data.get("chunk_size", 100_000)),
267
+ )
268
+
269
+ def to_json(self) -> str:
270
+ return json.dumps(self.to_dict())
271
+
272
+ @classmethod
273
+ def from_json(cls, json_str: str) -> ProfileConfig:
274
+ return cls.from_dict(json.loads(json_str))
275
+
276
+
277
+ @dataclass
278
+ class PipelineConfig:
279
+ """
280
+ Master configuration for the full 6-phase feature engineering pipeline.
281
+
282
+ Parameters
283
+ ----------
284
+ exclude_columns : list[str]
285
+ Hard exclusions — columns dropped globally from every phase.
286
+ phase_exclusions : dict[PipelinePhase, list[str]]
287
+ Soft exclusions — columns bypassed for a specific phase but retained
288
+ in the dataset.
289
+ column_overrides : dict[str, SemanticType]
290
+ Explicit semantic type assignments respected by all downstream phases.
291
+ profiling : ProfileConfig
292
+ Phase 1-specific parameters (correlation, chunking, memory threshold).
293
+ """
294
+
295
+ exclude_columns: list[str] = field(default_factory=list)
296
+ phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict)
297
+ column_overrides: dict[str, SemanticType] = field(default_factory=dict)
298
+ profiling: ProfileConfig = field(default_factory=ProfileConfig)
299
+
300
+ def resolve_active_columns(
301
+ self, phase: PipelinePhase, available_columns: list[str]
302
+ ) -> list[str]:
245
303
  """
246
- Explicitly set the semantic type for a column, overriding auto-detection.
304
+ Return the columns the given phase should operate on.
247
305
 
248
- The override is the sole source of truth for that column's type — the
249
- type detector's verdict is ignored during profiling. Calling this method
250
- multiple times on the same column is valid; the last call wins.
251
-
252
- Parameters
253
- ----------
254
- column : str
255
- Name of the column to override.
256
- semantic_type : str | SemanticType
257
- Target semantic type. Accepts a plain string (e.g. ``"numeric"``,
258
- ``"categorical"``) or a ``SemanticType`` enum value. Invalid strings
259
- raise ``ValueError``.
306
+ Hard exclusions are applied first, then phase-specific soft exclusions.
307
+ Columns absent from available_columns are silently ignored in both lists.
308
+ """
309
+ hard_set = set(self.exclude_columns)
310
+ soft_set = set(self.phase_exclusions.get(phase, []))
311
+ excluded = hard_set | soft_set
312
+ return [c for c in available_columns if c not in excluded]
313
+
314
+ def set_column_type(
315
+ self, column: str, semantic_type: Union[str, "SemanticType"]
316
+ ) -> None:
317
+ """
318
+ Explicitly set the semantic type for a column, overriding auto-detection.
319
+ This override is respected by all downstream phases.
260
320
  """
261
321
  if isinstance(semantic_type, str):
262
322
  try:
@@ -271,36 +331,38 @@ class ProfileConfig:
271
331
 
272
332
  def to_dict(self) -> dict:
273
333
  return {
274
- "modality": str(self.modality),
275
- "target_columns": list(self.target_columns),
276
- "column_overrides": {k: str(v) for k, v in self.column_overrides.items()},
277
334
  "exclude_columns": list(self.exclude_columns),
278
- "compute_correlation": self.compute_correlation,
279
- "correlation_target_column": self.correlation_target_column,
280
- "memory_threshold_mb": self.memory_threshold_mb,
281
- "chunk_size": self.chunk_size,
335
+ "phase_exclusions": {
336
+ str(phase): list(cols)
337
+ for phase, cols in self.phase_exclusions.items()
338
+ },
339
+ "column_overrides": {
340
+ col: str(sem_type)
341
+ for col, sem_type in self.column_overrides.items()
342
+ },
343
+ "profiling": self.profiling.to_dict(),
282
344
  }
283
345
 
284
346
  @classmethod
285
- def from_dict(cls, data: dict) -> ProfileConfig:
347
+ def from_dict(cls, data: dict) -> "PipelineConfig":
286
348
  return cls(
287
- modality=Modality(data.get("modality", Modality.Tabular)),
288
- target_column=data.get("target_column"),
349
+ exclude_columns=list(data.get("exclude_columns", [])),
350
+ phase_exclusions={
351
+ PipelinePhase(phase_str): list(cols)
352
+ for phase_str, cols in data.get("phase_exclusions", {}).items()
353
+ },
289
354
  column_overrides={
290
- k: SemanticType(v) for k, v in data.get("column_overrides", {}).items()
355
+ col: SemanticType(sem_str)
356
+ for col, sem_str in data.get("column_overrides", {}).items()
291
357
  },
292
- exclude_columns=list(data.get("exclude_columns", [])),
293
- compute_correlation=bool(data.get("compute_correlation", False)),
294
- correlation_target_column=data.get("correlation_target_column"),
295
- memory_threshold_mb=float(data.get("memory_threshold_mb", 500.0)),
296
- chunk_size=int(data.get("chunk_size", 100_000)),
358
+ profiling=ProfileConfig.from_dict(data.get("profiling", {})),
297
359
  )
298
360
 
299
- def to_json(self) -> str:
300
- return json.dumps(self.to_dict())
361
+ def to_json(self, indent: int = 2) -> str:
362
+ return json.dumps(self.to_dict(), indent=indent)
301
363
 
302
364
  @classmethod
303
- def from_json(cls, json_str: str) -> ProfileConfig:
365
+ def from_json(cls, json_str: str) -> "PipelineConfig":
304
366
  return cls.from_dict(json.loads(json_str))
305
367
 
306
368
 
@@ -35,7 +35,8 @@ from ._target_profiler import TargetProfiler
35
35
  from ._correlation_profiler import CorrelationProfiler
36
36
  from ._type_detector import TypeDetector
37
37
  from .config import (
38
- ProfileConfig,
38
+ PipelineConfig,
39
+ PipelinePhase,
39
40
  ColumnProfile,
40
41
  StructuralProfileResult,
41
42
  RowMissingnessDistribution,
@@ -64,14 +65,14 @@ _COLUMN_PROFILER_REGISTRY: dict[SemanticType, type[ColumnBatchProfiler]] = { #
64
65
 
65
66
  class StructuralProfiler:
66
67
 
67
- def __init__(self, config: ProfileConfig | None = None) -> None:
68
- self.config = config or ProfileConfig()
68
+ def __init__(self, config: PipelineConfig | None = None) -> None:
69
+ self.config: PipelineConfig = config or PipelineConfig()
69
70
 
70
- if self.config.modality == Modality.Tabular:
71
- self.modality_profiler: ModalityProfiler = TabularProfiler(self.config)
71
+ if self.config.profiling.modality == Modality.Tabular:
72
+ self.modality_profiler: ModalityProfiler = TabularProfiler()
72
73
  else:
73
74
  raise NotImplementedError(
74
- f"modality {self.config.modality} not supported yet"
75
+ f"modality {self.config.profiling.modality} not supported yet"
75
76
  )
76
77
 
77
78
  # ------------------------------------------------------------------
@@ -87,7 +88,17 @@ class StructuralProfiler:
87
88
 
88
89
  result = StructuralProfileResult()
89
90
 
90
- active_cols = [c for c in data.columns if c not in self.config.exclude_columns]
91
+ active_cols = self.config.resolve_active_columns(
92
+ PipelinePhase.Profiling, list(data.columns)
93
+ )
94
+
95
+ # Columns soft-excluded for Profiling: skipped but retained in the result.
96
+ hard_set = set(self.config.exclude_columns)
97
+ soft_retained = [
98
+ c for c in data.columns
99
+ if c in set(self.config.phase_exclusions.get(PipelinePhase.Profiling, []))
100
+ and c not in hard_set
101
+ ]
91
102
 
92
103
  # ── 1. Modality profiler ─────────────────────────────────────────
93
104
  # Replaces default DatasetStats with the real one (row_count, memory,
@@ -97,7 +108,7 @@ class StructuralProfiler:
97
108
  # ── 2. Missingness pre-pass ──────────────────────────────────────
98
109
  # setdefault creates ColumnProfile entries; subsequent steps mutate
99
110
  # the same objects via the same setdefault pattern.
100
- missingness_result = MissingnessProfiler(config=self.config).profile(
111
+ missingness_result = MissingnessProfiler().profile(
101
112
  data, columns=active_cols
102
113
  )
103
114
  for col_name in missingness_result.analysed_columns:
@@ -112,7 +123,6 @@ class StructuralProfiler:
112
123
  df=data,
113
124
  cols=active_cols,
114
125
  n_rows=data.height,
115
- overrides=self.config.column_overrides,
116
126
  )
117
127
 
118
128
  # ── 4. Type detection ────────────────────────────────────────────
@@ -153,7 +163,7 @@ class StructuralProfiler:
153
163
  profiler_cls = _COLUMN_PROFILER_REGISTRY.get(sem_type) # type: ignore[arg-type]
154
164
  if profiler_cls is None:
155
165
  continue
156
- profiler = profiler_cls(config=self.config)
166
+ profiler = profiler_cls()
157
167
  try:
158
168
  batch = profiler.profile(data, columns=cols)
159
169
  for col_name in batch.analysed_columns:
@@ -165,13 +175,13 @@ class StructuralProfiler:
165
175
  # ── 7. Target columns ────────────────────────────────────────────
166
176
  # TargetProfiler produces target-specific analysis stored in
167
177
  # result.targets. cp.stats is NOT overwritten — step 6 already set it.
168
- if self.config.target_columns:
169
- for target in self.config.target_columns:
178
+ if self.config.profiling.target_columns:
179
+ for target in self.config.profiling.target_columns:
170
180
  if target not in data.columns:
171
181
  continue
172
182
  target_result = TargetProfiler(
173
183
  target_column=target,
174
- config=self.config,
184
+ config=self.config.profiling,
175
185
  ).profile(data)
176
186
  result.targets[target] = target_result
177
187
 
@@ -180,7 +190,7 @@ class StructuralProfiler:
180
190
  cp.is_target = True
181
191
 
182
192
  # ── 8. Correlation ───────────────────────────────────────────────
183
- if self.config.compute_correlation:
193
+ if self.config.profiling.compute_correlation:
184
194
  # Resolve column lists by detected SemanticType (post-override).
185
195
  numeric_cols = [
186
196
  c
@@ -198,7 +208,7 @@ class StructuralProfiler:
198
208
  corr_profiler = CorrelationProfiler(
199
209
  numeric_columns=numeric_cols,
200
210
  categorical_columns=categorical_cols,
201
- config=self.config,
211
+ config=self.config.profiling,
202
212
  )
203
213
 
204
214
  # 8a. Feature-feature matrices — computed ONCE, target-independent.
@@ -209,7 +219,7 @@ class StructuralProfiler:
209
219
 
210
220
  # 8b. Per-target analysis — matrices are NOT recomputed; each call
211
221
  # shallow-copies feature_corr and appends target-specific fields.
212
- for target in self.config.target_columns:
222
+ for target in self.config.profiling.target_columns:
213
223
  if target not in data.columns:
214
224
  continue
215
225
  result.dataset.target_correlations[target] = (
@@ -218,6 +228,12 @@ class StructuralProfiler:
218
228
  )
219
229
  )
220
230
 
231
+ # ── Soft-excluded placeholders ───────────────────────────────────────
232
+ # Columns soft-excluded for Profiling are not profiled but must still
233
+ # appear in the result so downstream phases can reference them.
234
+ for col in soft_retained:
235
+ result.columns.setdefault(col, ColumnProfile(name=col))
236
+
221
237
  return result
222
238
 
223
239
  # ------------------------------------------------------------------
@@ -229,9 +245,8 @@ class StructuralProfiler:
229
245
  df: pl.DataFrame,
230
246
  cols: list[str],
231
247
  n_rows: int,
232
- overrides: dict[str, SemanticType],
233
248
  ) -> RowMissingnessDistribution:
234
- from ._missingness_profiler import (
249
+ from ._null_detection import (
235
250
  _sentinel_eligible,
236
251
  _inf_eligible,
237
252
  _SENTINEL_STRINGS,
@@ -246,10 +261,9 @@ class StructuralProfiler:
246
261
 
247
262
  for col_name in cols:
248
263
  dtype = df[col_name].dtype
249
- override = overrides.get(col_name)
250
264
  null_e = pl.col(col_name).is_null()
251
265
 
252
- if _sentinel_eligible(dtype, override):
266
+ if _sentinel_eligible(dtype):
253
267
  eff = (
254
268
  null_e
255
269
  | (pl.col(col_name).str.strip_chars() == "")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 0.8.0
3
+ Version: 0.10.0
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -21,9 +21,9 @@ Provides-Extra: dev
21
21
  Requires-Dist: pytest>=8.0; extra == "dev"
22
22
  Dynamic: license-file
23
23
 
24
- # FeatureForge
24
+ # DataForgeML
25
25
 
26
- Automated feature engineering and data profiling pipeline library for tabular datasets.
26
+ Automated feature engineering and data profiling pipeline library for datasets.
27
27
 
28
28
  ## Installation
29
29
 
@@ -22,6 +22,7 @@ src/dataforge_ml/profiling/_datetime_config.py
22
22
  src/dataforge_ml/profiling/_datetime_profiler.py
23
23
  src/dataforge_ml/profiling/_missingness_config.py
24
24
  src/dataforge_ml/profiling/_missingness_profiler.py
25
+ src/dataforge_ml/profiling/_null_detection.py
25
26
  src/dataforge_ml/profiling/_numeric_config.py
26
27
  src/dataforge_ml/profiling/_numeric_profiler.py
27
28
  src/dataforge_ml/profiling/_tabular.py
File without changes
File without changes