dataforge-ml 0.7.0__tar.gz → 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/PKG-INFO +1 -1
  2. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/pyproject.toml +1 -1
  3. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/__init__.py +4 -0
  4. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_base.py +11 -14
  5. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_boolean_profiler.py +4 -41
  6. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_categorical.py +3 -44
  7. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_correlation_profiler.py +1 -1
  8. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_datetime_profiler.py +7 -34
  9. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_missingness_profiler.py +11 -70
  10. dataforge_ml-0.9.0/src/dataforge_ml/profiling/_null_detection.py +22 -0
  11. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_numeric_profiler.py +6 -34
  12. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_tabular.py +25 -26
  13. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_target_profiler.py +3 -3
  14. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_text_profiler.py +4 -42
  15. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/config.py +131 -1
  16. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/structural.py +41 -21
  17. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/utils/data_loader.py +1 -3
  18. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/PKG-INFO +1 -1
  19. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/SOURCES.txt +1 -0
  20. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/LICENSE +0 -0
  21. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/README.md +0 -0
  22. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/setup.cfg +0 -0
  23. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/__init__.py +0 -0
  24. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/models/__init__.py +0 -0
  25. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/models/_data_structure.py +0 -0
  26. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/models/_data_types.py +0 -0
  27. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
  28. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
  29. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
  30. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
  31. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
  32. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
  33. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_target_config.py +0 -0
  34. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_text_config.py +0 -0
  35. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_type_detector.py +0 -0
  36. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/splitting/__init__.py +0 -0
  37. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/splitting/_config.py +0 -0
  38. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/splitting/_splitter.py +0 -0
  39. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/utils/__init__.py +0 -0
  40. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
  41. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/requires.txt +0 -0
  42. {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 0.7.0
3
+ Version: 0.9.0
4
4
  Summary: An automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dataforge-ml"
7
- version = "0.7.0"
7
+ version = "0.9.0"
8
8
  description = "An automated feature engineering and designing pipeline library"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -1,6 +1,8 @@
1
1
  from .structural import StructuralProfiler
2
2
  from .config import (
3
3
  ProfileConfig,
4
+ PipelineConfig,
5
+ PipelinePhase,
4
6
  SemanticType,
5
7
  Modality,
6
8
  TypeFlag,
@@ -19,6 +21,8 @@ from ._base import ModalityProfiler
19
21
  __all__ = [
20
22
  "StructuralProfiler",
21
23
  "ProfileConfig",
24
+ "PipelineConfig",
25
+ "PipelinePhase",
22
26
  "SemanticType",
23
27
  "Modality",
24
28
  "TypeFlag",
@@ -3,9 +3,9 @@ Abstract base classes for all structural profilers.
3
3
 
4
4
  Hierarchy
5
5
  ---------
6
- Profiling[R] — root: stores config, provides _resolve_columns
7
- ├── ColumnBatchProfiler[R] — registry tier: __init__(config=None) only;
8
- │ │ profile(df, columns) processes a typed column batch
6
+ Profiling[R] — root: thin ABC, provides _resolve_columns
7
+ ├── ColumnBatchProfiler[R] — registry tier: profile(df, columns) processes a
8
+ │ │ typed column batch; no config, no eligibility gates
9
9
  │ ├── NumericProfiler
10
10
  │ ├── CategoricalProfiler
11
11
  │ ├── DatetimeProfiler
@@ -26,22 +26,19 @@ import polars as pl
26
26
  from abc import abstractmethod, ABC
27
27
  from typing import Generic, TypeVar
28
28
 
29
- from .config import DatasetStats, ProfileConfig
29
+ from .config import DatasetStats
30
30
 
31
31
  R = TypeVar("R")
32
32
 
33
33
 
34
34
  class Profiling(ABC, Generic[R]):
35
35
  """
36
- Root base for all profilers.
36
+ Root base for all profilers. Thin ABC — no config state.
37
37
 
38
- Stores config and provides _resolve_columns. Not instantiated directly
39
- use one of the three concrete tier bases below.
38
+ Sub-processors are pure batch processors: given a DataFrame and a column
39
+ list, return a result. No routing, no scoping, no config.
40
40
  """
41
41
 
42
- def __init__(self, config: ProfileConfig | None = None):
43
- self.config = config or ProfileConfig()
44
-
45
42
  @abstractmethod
46
43
  def profile(self, data: pl.DataFrame, **kwargs) -> R: ...
47
44
 
@@ -62,11 +59,11 @@ class ColumnBatchProfiler(Profiling[R]):
62
59
 
63
60
  Contract
64
61
  --------
65
- - __init__ must accept ONLY config (no extra required params). This allows
66
- StructuralProfiler to instantiate any registered profiler uniformly via
67
- profiler_cls(config=self.config)
62
+ - __init__ takes no arguments (instantiated as profiler_cls()).
68
63
  - profile(df, columns) receives the full DataFrame and the list of same-type
69
- column names to process. Returns a result with:
64
+ column names to process. Profiles every column in the list without any
65
+ internal eligibility gate or config consultation.
66
+ - Returns a result with:
70
67
  .columns: dict[str, <Stats>] — per-column stats
71
68
  .analysed_columns: list[str] — columns actually profiled
72
69
  """
@@ -22,11 +22,7 @@ from __future__ import annotations
22
22
  import polars as pl
23
23
 
24
24
  from ._base import ColumnBatchProfiler
25
- from .config import (
26
- ProfileConfig,
27
- BooleanStats,
28
- SemanticType,
29
- )
25
+ from .config import BooleanStats
30
26
  from ._boolean_config import BooleanProfileResult
31
27
  from ..models._data_types import _INT_DTYPES
32
28
 
@@ -42,22 +38,10 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
42
38
  """
43
39
  Boolean column profiler for Polars DataFrames.
44
40
 
45
- A column is eligible when:
46
- - Its Polars dtype is pl.Boolean, OR
47
- - Its dtype is an integer with values exclusively in {0, 1}, OR
48
- - It has a SemanticType.Boolean override in ProfileConfig.column_overrides
49
-
50
- Non-eligible columns in the provided list are silently skipped.
51
-
52
- Parameters
53
- ----------
54
- config : ProfileConfig | None
55
- Shared profiling configuration.
41
+ Profiles every column passed to profile(df, columns) — no config,
42
+ no internal eligibility gate.
56
43
  """
57
44
 
58
- def __init__(self, config: ProfileConfig | None = None) -> None:
59
- super().__init__(config)
60
-
61
45
  # ------------------------------------------------------------------
62
46
  # Public API
63
47
  # ------------------------------------------------------------------
@@ -69,23 +53,6 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
69
53
  ) -> BooleanProfileResult:
70
54
  return self._run(data, columns)
71
55
 
72
- # ------------------------------------------------------------------
73
- # Eligibility
74
- # ------------------------------------------------------------------
75
-
76
- def _eligible(self, series: pl.Series) -> bool:
77
- override = self.config.column_overrides.get(series.name)
78
-
79
- # Explicit override — trust it
80
- if override == SemanticType.Boolean:
81
- return True
82
-
83
- # Another override takes precedence over auto-detection
84
- if override is not None:
85
- return False
86
-
87
- return True
88
-
89
56
  # ------------------------------------------------------------------
90
57
  # Orchestration
91
58
  # ------------------------------------------------------------------
@@ -97,11 +64,7 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
97
64
  ) -> BooleanProfileResult:
98
65
  result = BooleanProfileResult()
99
66
 
100
- available = [
101
- c
102
- for c in self._resolve_columns(df.columns, columns)
103
- if self._eligible(df[c])
104
- ]
67
+ available = self._resolve_columns(df.columns, columns)
105
68
  result.analysed_columns = available
106
69
 
107
70
  for col_name in available:
@@ -45,10 +45,6 @@ from ._categorical_config import (
45
45
  RareCategoryStats,
46
46
  ImbalanceMetrics,
47
47
  )
48
- from .config import (
49
- ProfileConfig,
50
- SemanticType,
51
- )
52
48
 
53
49
  # ---------------------------------------------------------------------------
54
50
  # Module-level thresholds (documented so callers can see what drives flags)
@@ -65,29 +61,10 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
65
61
  """
66
62
  Categorical profiler for Polars DataFrames.
67
63
 
68
- Parameters
69
- ----------
70
- columns : list[str]
71
- Columns to profile. The profiler intersects this list with
72
- the DataFrame's actual columns at runtime.
73
- config : ProfileConfig | None
74
- Shared profiling configuration (used for chunk_size, etc.).
75
-
76
- Usage
77
- -----
78
- >>> profiler = CategoricalProfiler(
79
- ... columns=["status", "country", "product_type"],
80
- ... )
81
- >>> result = profiler.profile(df)
82
- >>> print(result)
64
+ Profiles every column passed to profile(df, columns) — no config,
65
+ no internal eligibility gate.
83
66
  """
84
67
 
85
- def __init__(
86
- self,
87
- config: ProfileConfig | None = None,
88
- ) -> None:
89
- super().__init__(config)
90
-
91
68
  # ------------------------------------------------------------------
92
69
  # Public API
93
70
  # ------------------------------------------------------------------
@@ -103,19 +80,6 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
103
80
  # Orchestration
104
81
  # ------------------------------------------------------------------
105
82
 
106
- def _eligible(
107
- self,
108
- series: pl.Series,
109
- ) -> bool:
110
- override = self.config.column_overrides.get(series.name)
111
- if override == SemanticType.Categorical:
112
- return True
113
-
114
- if override is not None:
115
- return False
116
-
117
- return True
118
-
119
83
  def _run(
120
84
  self,
121
85
  df: pl.DataFrame,
@@ -123,12 +87,7 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
123
87
  ) -> CategoricalProfileResult:
124
88
  result = CategoricalProfileResult()
125
89
 
126
- # Resolve columns against actual schema
127
- available = [
128
- c
129
- for c in self._resolve_columns(df.columns, columns)
130
- if self._eligible(df[c])
131
- ]
90
+ available = self._resolve_columns(df.columns, columns)
132
91
  result.analysed_columns = available
133
92
 
134
93
  n_rows = df.height
@@ -125,7 +125,7 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
125
125
  near_redundant_threshold: float = _NEAR_REDUNDANT_THRESHOLD,
126
126
  top_n_feature_target: int = _TOP_N_FEATURE_TARGET,
127
127
  ) -> None:
128
- super().__init__(config)
128
+ super().__init__()
129
129
  self._numeric_columns = numeric_columns
130
130
  self._categorical_columns = categorical_columns or []
131
131
  self._threshold = near_redundant_threshold
@@ -43,10 +43,6 @@ from datetime import datetime, timezone
43
43
  import polars as pl
44
44
 
45
45
  from ._base import ColumnBatchProfiler
46
- from .config import (
47
- ProfileConfig,
48
- SemanticType,
49
- )
50
46
  from ._datetime_config import (
51
47
  DatetimeProfileResult,
52
48
  DatetimeStats,
@@ -90,20 +86,11 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
90
86
  """
91
87
  Datetime distribution profiler for Polars DataFrames.
92
88
 
93
- Parameters
94
- ----------
95
- columns : list[str]
96
- Columns to profile. Non-datetime columns are skipped with a warning.
97
- config : ProfileConfig | None
98
- Shared profiling configuration.
89
+ Profiles every column passed to profile(df, columns) — no config,
90
+ no internal eligibility gate. String columns are coerced to Datetime;
91
+ columns that cannot be coerced are silently skipped.
99
92
  """
100
93
 
101
- def __init__(
102
- self,
103
- config: ProfileConfig | None = None,
104
- ) -> None:
105
- super().__init__(config)
106
-
107
94
  # ------------------------------------------------------------------
108
95
  # Public API
109
96
  # ------------------------------------------------------------------
@@ -119,35 +106,21 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
119
106
  # Orchestration
120
107
  # ------------------------------------------------------------------
121
108
 
122
- def _eligible(self, series: pl.Series) -> bool:
123
- override = self.config.column_overrides.get(series.name)
124
-
125
- if override == SemanticType.Datetime:
126
- return True
127
- if override is not None:
128
- return False
129
-
130
- return _is_datetime_dtype(series.dtype) or series.dtype in (pl.Utf8, pl.String)
131
-
132
109
  def _coerce_to_datetime(self, series: pl.Series) -> pl.Series | None:
133
110
  if series.dtype in (pl.Utf8, pl.String):
134
111
  coerced = series.str.to_datetime(strict=False)
135
112
  return coerced if coerced.drop_nulls().len() > 0 else None
136
- return series
113
+ if _is_datetime_dtype(series.dtype):
114
+ return series
115
+ return None
137
116
 
138
117
  def _run(self, df: pl.DataFrame, columns: list[str]) -> DatetimeProfileResult:
139
118
  result = DatetimeProfileResult()
140
119
  now = datetime.now(tz=timezone.utc)
141
120
 
142
- candidates = [
143
- c
144
- for c in self._resolve_columns(df.columns, columns)
145
- if self._eligible(df[c])
146
- ]
147
-
148
121
  available = []
149
122
  coerced_cache = {}
150
- for col_name in candidates:
123
+ for col_name in self._resolve_columns(df.columns, columns):
151
124
  series = self._coerce_to_datetime(df[col_name])
152
125
  if series is not None:
153
126
  available.append(col_name)
@@ -3,19 +3,10 @@ MissingnessProfiler – Phase 1 extension: Missingness Profiling.
3
3
 
4
4
  Eligibility model
5
5
  -----------------
6
- Effective-null detection is based on **dtype first**, with SemanticType
7
- overrides acting only as suppressors, not as enablers:
6
+ Effective-null detection is purely dtype-driven — no SemanticType overrides:
8
7
 
9
- sentinel-string detection → runs when dtype is Utf8/String
10
- suppressed if override is Numeric / Datetime / Boolean
11
- (those types cannot have meaningful sentinel strings)
12
-
13
- Inf / NaN expansion → runs when dtype is Float32/Float64
14
- never suppressed (Inf in a float column is always
15
- effectively missing regardless of semantic label)
16
-
17
- column_overrides is SPARSE — most columns will have no entry.
18
- Absence of an override is not a signal; it means "trust the dtype".
8
+ sentinel-string detection → runs for every String/Utf8 column unconditionally
9
+ Inf / NaN expansion → runs for every Float32/Float64 column unconditionally
19
10
  """
20
11
 
21
12
  from __future__ import annotations
@@ -24,13 +15,13 @@ from __future__ import annotations
24
15
  import polars as pl
25
16
 
26
17
  from ._base import DatasetLevelProfiler
27
- from .config import ProfileConfig, SemanticType
28
18
  from ._missingness_config import (
29
19
  ColumnMissingnessProfile,
30
20
  MissingnessFlag,
31
21
  MissingnessProfileResult,
32
22
  MissingSeverity,
33
23
  )
24
+ from ._null_detection import _SENTINEL_STRINGS, _inf_eligible, _sentinel_eligible
34
25
 
35
26
  # ---------------------------------------------------------------------------
36
27
  # Thresholds
@@ -43,52 +34,12 @@ _SEVERITY_HIGH = 0.20
43
34
  _MAR_CORRELATION_THRESHOLD = 0.60
44
35
  _COL_DROP_THRESHOLD = 0.50
45
36
 
46
- _SENTINEL_STRINGS = frozenset({"NA", "NAN", "NULL", "NONE", "?"})
47
-
48
- # Overrides that suppress sentinel-string detection on a String column.
49
- # If a column is String but the user says "this is Numeric", treating
50
- # "NA" as a sentinel is correct — but if they say Categorical or Text,
51
- # sentinel detection still makes sense and should run.
52
- _SENTINEL_SUPPRESSING_SEMANTICS = frozenset(
53
- {
54
- SemanticType.Numeric,
55
- SemanticType.Datetime,
56
- SemanticType.Boolean,
57
- SemanticType.Identifier,
58
- }
59
- )
60
-
61
-
62
- def _sentinel_eligible(dtype: pl.DataType, override: SemanticType | None) -> bool:
63
- """True when sentinel-string detection should run for this column."""
64
- if dtype not in (pl.Utf8, pl.String):
65
- return False
66
- # Override present and it's a non-text semantic → suppress
67
- if override is not None and override in _SENTINEL_SUPPRESSING_SEMANTICS:
68
- return False
69
- return True
70
-
71
-
72
- def _inf_eligible(dtype: pl.DataType) -> bool:
73
- """True when Inf/NaN expansion should run. Always dtype-driven, never suppressed."""
74
- return dtype in (pl.Float32, pl.Float64)
75
-
76
37
 
77
38
  class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
78
- """
79
- Missingness profiler for Polars DataFrames.
80
-
81
- Column scoping
82
- --------------
83
- Resolution priority (high → low):
84
- 1. Explicit ``columns`` argument to ``profile()``.
85
- 2. ``config.exclude_columns`` — always removed.
86
- 3. All remaining DataFrame columns.
87
- """
39
+ """Missingness profiler for Polars DataFrames."""
88
40
 
89
- def __init__(self, config: ProfileConfig | None = None) -> None:
90
- super().__init__(config)
91
- self._config: ProfileConfig = config or ProfileConfig()
41
+ def __init__(self) -> None:
42
+ super().__init__()
92
43
 
93
44
  # ------------------------------------------------------------------
94
45
  # Public API
@@ -117,16 +68,13 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
117
68
  if n_rows == 0 or not cols:
118
69
  return result
119
70
 
120
- overrides = self._config.column_overrides # sparse — most keys absent
121
71
  indicator_cols: list[pl.Series] = []
122
72
 
123
73
  for col_name in cols:
124
- override = overrides.get(col_name) # None for most columns
125
74
  col_profile, indicator = self._profile_column(
126
75
  series=df[col_name],
127
76
  col_name=col_name,
128
77
  n_rows=n_rows,
129
- override=override,
130
78
  )
131
79
  result.columns[col_name] = col_profile
132
80
  indicator_cols.append(indicator)
@@ -173,21 +121,12 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
173
121
  series: pl.Series,
174
122
  col_name: str,
175
123
  n_rows: int,
176
- override: SemanticType | None = None, # sparse — None is the common case
177
124
  ) -> tuple[ColumnMissingnessProfile, pl.Series]:
178
- """
179
- Compute standard + effective null counts for one column.
180
-
181
- Eligibility is dtype-first:
182
- - sentinel strings → String dtype, unless override suppresses it
183
- - Inf/NaN → Float dtype, always (never suppressed)
184
- - everything else → standard Polars null only
185
- """
186
125
  profile = ColumnMissingnessProfile(column=col_name, total_rows=n_rows)
187
126
  dtype = series.dtype
188
127
  std_null = series.is_null()
189
128
 
190
- if _sentinel_eligible(dtype, override):
129
+ if _sentinel_eligible(dtype):
191
130
  eff_null = (
192
131
  std_null
193
132
  | (series.str.strip_chars() == "")
@@ -208,7 +147,9 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
208
147
 
209
148
  r = profile.effective_null_ratio
210
149
 
211
- if r < _SEVERITY_MINOR and r != 0:
150
+ if r == 0.0:
151
+ profile.severity = None
152
+ elif r < _SEVERITY_MINOR:
212
153
  profile.severity = MissingSeverity.Minor
213
154
  elif r < _SEVERITY_MODERATE:
214
155
  profile.severity = MissingSeverity.Moderate
@@ -0,0 +1,22 @@
1
+ """
2
+ _null_detection – shared dtype-driven null primitives for Phase 1.
3
+
4
+ Single authority for what counts as "effectively null" across the entire
5
+ Phase 1 implementation. No config, no SemanticType overrides, no state.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import polars as pl
11
+
12
+ _SENTINEL_STRINGS: frozenset[str] = frozenset({"NA", "NAN", "NULL", "NONE", "?"})
13
+
14
+
15
+ def _sentinel_eligible(dtype: pl.DataType) -> bool:
16
+ """True when sentinel-string detection should run for this column (String/Utf8 only)."""
17
+ return dtype in (pl.Utf8, pl.String)
18
+
19
+
20
+ def _inf_eligible(dtype: pl.DataType) -> bool:
21
+ """True when Inf/NaN expansion should run (Float32/Float64 only)."""
22
+ return dtype in (pl.Float32, pl.Float64)
@@ -35,10 +35,6 @@ from __future__ import annotations
35
35
  import polars as pl
36
36
 
37
37
  from ._base import ColumnBatchProfiler
38
- from .config import (
39
- ProfileConfig,
40
- SemanticType,
41
- )
42
38
  from ._correlation_profiler import _INT_DTYPES
43
39
  from ._numeric_config import (
44
40
  NumericProfileResult,
@@ -80,21 +76,10 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
80
76
  """
81
77
  Numeric distribution profiler for Polars DataFrames.
82
78
 
83
- Parameters
84
- ----------
85
- columns : list[str]
86
- Columns to profile. Non-numeric or absent columns are skipped
87
- with a warning; they do not raise.
88
- config : ProfileConfig | None
89
- Shared profiling configuration.
79
+ Profiles every column passed to profile(df, columns) — no config,
80
+ no internal eligibility gate.
90
81
  """
91
82
 
92
- def __init__(
93
- self,
94
- config: ProfileConfig | None = None,
95
- ) -> None:
96
- super().__init__(config)
97
-
98
83
  # ------------------------------------------------------------------
99
84
  # Public API
100
85
  # ------------------------------------------------------------------
@@ -110,16 +95,6 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
110
95
  # Orchestration
111
96
  # ------------------------------------------------------------------
112
97
 
113
- def _eligible(self, series: pl.Series) -> bool:
114
- override = self.config.column_overrides.get(series.name)
115
- if override == SemanticType.Numeric:
116
- return True
117
-
118
- if override is not None:
119
- return False
120
-
121
- return True
122
-
123
98
  def _run(
124
99
  self,
125
100
  df: pl.DataFrame,
@@ -128,11 +103,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
128
103
  result = NumericProfileResult()
129
104
  n_rows = df.height
130
105
 
131
- available = [
132
- c
133
- for c in self._resolve_columns(df.columns, columns)
134
- if self._eligible(df[c])
135
- ]
106
+ available = self._resolve_columns(df.columns, columns)
136
107
  result.analysed_columns = available
137
108
 
138
109
  if not available:
@@ -254,16 +225,17 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
254
225
  for i in range(top_rows)
255
226
  ]
256
227
  else:
257
- # --- 20-Bin Histogram Distribution (Continuous) ---
228
+ # --- Histogram Distribution (Continuous) ---
258
229
  import numpy as np
259
230
 
260
231
  counts, bin_edges = np.histogram(clean_f64.to_numpy(), bins="auto")
232
+ n_clean = clean_f64.len()
261
233
  profile.histogram = [
262
234
  HistogramBin(
263
235
  lower_bound=float(bin_edges[i]),
264
236
  upper_bound=float(bin_edges[i + 1]),
265
237
  count=int(counts[i]),
266
- percentage=int(counts[i]) / n_rows if n_rows > 0 else 0.0,
238
+ percentage=int(counts[i]) / n_clean if n_clean > 0 else 0.0,
267
239
  )
268
240
  for i in range(len(counts))
269
241
  ]
@@ -3,16 +3,18 @@ TabularProfiler – Phase 1: Structural Profiling for tabular datasets.
3
3
 
4
4
  All DataFrame operations use Polars (no pandas dependency).
5
5
 
6
+ A pipeline-agnostic data-catalog tool: receives the full raw DataFrame and
7
+ computes dataset-level stats over every column — no exclusion logic, no
8
+ config dependency.
9
+
6
10
  Computes:
7
- • row / column count (always full dataset)
11
+ • row / column count (full dataset)
8
12
  • memory usage + per-column breakdown when threshold exceeded
9
- • duplicate row count & ratio (scoped to config.duplicate_columns)
10
- • overall sparsity (scoped to config.sparsity_columns)
11
- • data-type detection (scoped to config.type_detection_columns;
12
- skipped entirely when None)
13
+ • duplicate row count & ratio (all columns)
14
+ • overall sparsity (all columns)
13
15
 
14
16
  Chunked processing is activated automatically when the DataFrame's
15
- estimated memory exceeds config.memory_threshold_mb.
17
+ estimated memory exceeds _MEMORY_THRESHOLD_MB.
16
18
  """
17
19
 
18
20
  from __future__ import annotations
@@ -24,31 +26,32 @@ import polars as pl
24
26
  from ._base import ModalityProfiler
25
27
  from .config import (
26
28
  MemoryBreakdown,
27
- ProfileConfig,
28
29
  DatasetStats,
29
30
  )
30
31
 
32
+ # ---------------------------------------------------------------------------
33
+ # Module-level constants (previously sourced from ProfileConfig)
34
+ # ---------------------------------------------------------------------------
35
+
36
+ _MEMORY_THRESHOLD_MB: float = 500.0
37
+ _CHUNK_SIZE: int = 100_000
38
+
31
39
 
32
40
  class TabularProfiler(ModalityProfiler):
33
41
  """
34
42
  Structural profiler for Polars DataFrames.
35
43
 
44
+ Pipeline-agnostic: accepts no constructor arguments and applies no column
45
+ filtering. Computes dataset-level stats (row count, column count, memory,
46
+ duplicate ratio, overall sparsity) over the complete DataFrame it receives.
47
+
36
48
  Usage
37
49
  -----
38
- >>> cfg = ProfileConfig(
39
- ... duplicate_columns=["user_id", "event_time"],
40
- ... sparsity_columns=["age", "income", "postcode"],
41
- ... type_detection_columns=["age", "income", "postcode", "created_at"],
42
- ... memory_threshold_mb=200,
43
- ... )
44
- >>> profiler = TabularProfiler(config=cfg)
50
+ >>> profiler = TabularProfiler()
45
51
  >>> result = profiler.profile(df)
46
52
  >>> print(result)
47
53
  """
48
54
 
49
- def __init__(self, config: ProfileConfig | None = None):
50
- super().__init__(config)
51
-
52
55
  # ------------------------------------------------------------------
53
56
  # Public API
54
57
  # ------------------------------------------------------------------
@@ -77,17 +80,13 @@ class TabularProfiler(ModalityProfiler):
77
80
  if result.row_count == 0:
78
81
  return result
79
82
 
80
- # 3. Resolve column scopes
83
+ # 3. Operate on all columns — no exclusion logic
81
84
  all_cols: list[str] = df.columns
82
- analysed_cols = [c for c in all_cols if c not in self.config.exclude_columns]
83
-
84
- dup_cols = analysed_cols
85
- missingness_cols = analysed_cols
86
85
 
87
86
  if use_chunks:
88
- self._chunked_metrics(df, dup_cols, missingness_cols, result)
87
+ self._chunked_metrics(df, all_cols, all_cols, result)
89
88
  else:
90
- self._full_metrics(df, dup_cols, missingness_cols, result)
89
+ self._full_metrics(df, all_cols, all_cols, result)
91
90
 
92
91
  return result
93
92
 
@@ -136,7 +135,7 @@ class TabularProfiler(ModalityProfiler):
136
135
  total_bytes = sum(col_bytes.values())
137
136
 
138
137
  result.memory_bytes = total_bytes
139
- threshold_bytes = self.config.memory_threshold_mb * 1024 * 1024
138
+ threshold_bytes = _MEMORY_THRESHOLD_MB * 1024 * 1024
140
139
 
141
140
  if total_bytes > threshold_bytes:
142
141
  result.memory_breakdown = MemoryBreakdown(column_bytes=col_bytes)
@@ -189,7 +188,7 @@ class TabularProfiler(ModalityProfiler):
189
188
  seen hashes — semantics match keep='first'.
190
189
  Sparsity is accumulated as (missing_cells, total_cells).
191
190
  """
192
- chunk_size = self.config.chunk_size
191
+ chunk_size = _CHUNK_SIZE
193
192
  n_chunks = math.ceil(result.row_count / chunk_size)
194
193
 
195
194
  seen_hashes: set[int] = set()
@@ -36,7 +36,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
36
36
  """
37
37
 
38
38
  def __init__(self, target_column: str, config: ProfileConfig | None = None) -> None:
39
- super().__init__(config)
39
+ super().__init__()
40
40
  self.target_column = target_column
41
41
 
42
42
  def profile(self, data: pl.DataFrame, **kwargs) -> TargetProfileResult:
@@ -129,7 +129,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
129
129
  self, series: pl.Series, n_rows: int, result: TargetProfileResult
130
130
  ) -> None:
131
131
  """Generates categorical metrics and checks for class imbalance."""
132
- cat_profiler = CategoricalProfiler(config=self.config)
132
+ cat_profiler = CategoricalProfiler()
133
133
 
134
134
  # Internally compute cardinality, top values, and imbalance metrics
135
135
  cat_profile = cat_profiler._profile_column(series, self.target_column, n_rows)
@@ -146,7 +146,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
146
146
  self, series: pl.Series, n_rows: int, result: TargetProfileResult
147
147
  ) -> None:
148
148
  """Generates numeric metrics and checks for target skewness."""
149
- num_profiler = NumericProfiler(config=self.config)
149
+ num_profiler = NumericProfiler()
150
150
 
151
151
  col_name = series.name
152
152
  num_result = num_profiler.profile(series.to_frame(), [col_name])
@@ -54,11 +54,7 @@ from __future__ import annotations
54
54
  import polars as pl
55
55
 
56
56
  from ._base import ColumnBatchProfiler
57
- from .config import (
58
- ProfileConfig,
59
- TextStats,
60
- SemanticType,
61
- )
57
+ from .config import TextStats
62
58
  from ._text_config import TextProfileResult
63
59
 
64
60
  # Regex that counts non-whitespace token runs — used with str.count_matches.
@@ -69,22 +65,10 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
69
65
  """
70
66
  Free-text column profiler for Polars DataFrames.
71
67
 
72
- A column is eligible when:
73
- - It has a ``SemanticType.Text`` override in
74
- ``ProfileConfig.column_overrides``, OR
75
- - Its Polars dtype is ``pl.Utf8`` / ``pl.String`` and no override is set.
76
-
77
- Non-eligible columns are silently skipped.
78
-
79
- Parameters
80
- ----------
81
- config : ProfileConfig | None
82
- Shared profiling configuration.
68
+ Profiles every column passed to profile(df, columns) — no config,
69
+ no internal eligibility gate.
83
70
  """
84
71
 
85
- def __init__(self, config: ProfileConfig | None = None) -> None:
86
- super().__init__(config)
87
-
88
72
  # ------------------------------------------------------------------
89
73
  # Public API
90
74
  # ------------------------------------------------------------------
@@ -96,24 +80,6 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
96
80
  ) -> TextProfileResult:
97
81
  return self._run(data, columns)
98
82
 
99
- # ------------------------------------------------------------------
100
- # Eligibility
101
- # ------------------------------------------------------------------
102
-
103
- def _eligible(self, series: pl.Series) -> bool:
104
- override = self.config.column_overrides.get(series.name)
105
-
106
- if override == SemanticType.Text:
107
- return True
108
-
109
- # Any other explicit override takes precedence
110
- if override is not None:
111
- return False
112
-
113
- # Native string dtype (pl.Utf8 is the canonical name; pl.String is
114
- # an alias in newer Polars — check both for cross-version safety)
115
- return series.dtype in (pl.Utf8, pl.String)
116
-
117
83
  # ------------------------------------------------------------------
118
84
  # Orchestration
119
85
  # ------------------------------------------------------------------
@@ -125,11 +91,7 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
125
91
  ) -> TextProfileResult:
126
92
  result = TextProfileResult()
127
93
 
128
- available = [
129
- c
130
- for c in self._resolve_columns(df.columns, columns)
131
- if self._eligible(df[c])
132
- ]
94
+ available = self._resolve_columns(df.columns, columns)
133
95
  result.analysed_columns = available
134
96
 
135
97
  for col_name in available:
@@ -52,6 +52,15 @@ class Modality(StrEnum):
52
52
  # TimeSeries = "time_series"
53
53
 
54
54
 
55
+ class PipelinePhase(StrEnum):
56
+ Profiling = "profiling"
57
+ Imputation = "imputation"
58
+ OutlierDetection = "outlier_detection"
59
+ Normalization = "normalization"
60
+ Encoding = "encoding"
61
+ Scaling = "scaling"
62
+
63
+
55
64
  # ---------------------------------------------------------------------------
56
65
  # Type-detection enums — kept for TypeDetector compatibility
57
66
  # ---------------------------------------------------------------------------
@@ -71,6 +80,7 @@ class TypeFlag(StrEnum):
71
80
  SequentialIndex = "sequential_index"
72
81
  FloatSequentialIndex = "float_sequential_index"
73
82
  FreeTextCandidate = "free_text_candidate"
83
+ UserOverride = "user_override"
74
84
 
75
85
 
76
86
  # ---------------------------------------------------------------------------
@@ -240,6 +250,34 @@ class ProfileConfig:
240
250
  memory_threshold_mb: float = 500.0
241
251
  chunk_size: int = 100_000
242
252
 
253
+ def set_column_type(self, column: str, semantic_type: Union[str, "SemanticType"]) -> None:
254
+ """
255
+ Explicitly set the semantic type for a column, overriding auto-detection.
256
+
257
+ The override is the sole source of truth for that column's type — the
258
+ type detector's verdict is ignored during profiling. Calling this method
259
+ multiple times on the same column is valid; the last call wins.
260
+
261
+ Parameters
262
+ ----------
263
+ column : str
264
+ Name of the column to override.
265
+ semantic_type : str | SemanticType
266
+ Target semantic type. Accepts a plain string (e.g. ``"numeric"``,
267
+ ``"categorical"``) or a ``SemanticType`` enum value. Invalid strings
268
+ raise ``ValueError``.
269
+ """
270
+ if isinstance(semantic_type, str):
271
+ try:
272
+ semantic_type = SemanticType(semantic_type)
273
+ except ValueError:
274
+ valid = [e.value for e in SemanticType]
275
+ raise ValueError(
276
+ f"Unknown semantic type {semantic_type!r}. "
277
+ f"Valid values: {valid}"
278
+ )
279
+ self.column_overrides[column] = semantic_type
280
+
243
281
  def to_dict(self) -> dict:
244
282
  return {
245
283
  "modality": str(self.modality),
@@ -256,7 +294,7 @@ class ProfileConfig:
256
294
  def from_dict(cls, data: dict) -> ProfileConfig:
257
295
  return cls(
258
296
  modality=Modality(data.get("modality", Modality.Tabular)),
259
- target_column=data.get("target_column"),
297
+ target_columns=list(data.get("target_columns", [])),
260
298
  column_overrides={
261
299
  k: SemanticType(v) for k, v in data.get("column_overrides", {}).items()
262
300
  },
@@ -275,6 +313,98 @@ class ProfileConfig:
275
313
  return cls.from_dict(json.loads(json_str))
276
314
 
277
315
 
316
+ @dataclass
317
+ class PipelineConfig:
318
+ """
319
+ Master configuration for the full 6-phase feature engineering pipeline.
320
+
321
+ Parameters
322
+ ----------
323
+ exclude_columns : list[str]
324
+ Hard exclusions — columns dropped globally from every phase.
325
+ phase_exclusions : dict[PipelinePhase, list[str]]
326
+ Soft exclusions — columns bypassed for a specific phase but retained
327
+ in the dataset.
328
+ column_overrides : dict[str, SemanticType]
329
+ Explicit semantic type assignments respected by all downstream phases.
330
+ profiling : ProfileConfig
331
+ Phase 1-specific parameters (correlation, chunking, memory threshold).
332
+ """
333
+
334
+ exclude_columns: list[str] = field(default_factory=list)
335
+ phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict)
336
+ column_overrides: dict[str, SemanticType] = field(default_factory=dict)
337
+ profiling: ProfileConfig = field(default_factory=ProfileConfig)
338
+
339
+ def resolve_active_columns(
340
+ self, phase: PipelinePhase, available_columns: list[str]
341
+ ) -> list[str]:
342
+ """
343
+ Return the columns the given phase should operate on.
344
+
345
+ Hard exclusions are applied first, then phase-specific soft exclusions.
346
+ Columns absent from available_columns are silently ignored in both lists.
347
+ """
348
+ hard_set = set(self.exclude_columns)
349
+ soft_set = set(self.phase_exclusions.get(phase, []))
350
+ excluded = hard_set | soft_set
351
+ return [c for c in available_columns if c not in excluded]
352
+
353
+ def set_column_type(
354
+ self, column: str, semantic_type: Union[str, "SemanticType"]
355
+ ) -> None:
356
+ """
357
+ Explicitly set the semantic type for a column, overriding auto-detection.
358
+ This override is respected by all downstream phases.
359
+ """
360
+ if isinstance(semantic_type, str):
361
+ try:
362
+ semantic_type = SemanticType(semantic_type)
363
+ except ValueError:
364
+ valid = [e.value for e in SemanticType]
365
+ raise ValueError(
366
+ f"Unknown semantic type {semantic_type!r}. "
367
+ f"Valid values: {valid}"
368
+ )
369
+ self.column_overrides[column] = semantic_type
370
+
371
+ def to_dict(self) -> dict:
372
+ return {
373
+ "exclude_columns": list(self.exclude_columns),
374
+ "phase_exclusions": {
375
+ str(phase): list(cols)
376
+ for phase, cols in self.phase_exclusions.items()
377
+ },
378
+ "column_overrides": {
379
+ col: str(sem_type)
380
+ for col, sem_type in self.column_overrides.items()
381
+ },
382
+ "profiling": self.profiling.to_dict(),
383
+ }
384
+
385
+ @classmethod
386
+ def from_dict(cls, data: dict) -> "PipelineConfig":
387
+ return cls(
388
+ exclude_columns=list(data.get("exclude_columns", [])),
389
+ phase_exclusions={
390
+ PipelinePhase(phase_str): list(cols)
391
+ for phase_str, cols in data.get("phase_exclusions", {}).items()
392
+ },
393
+ column_overrides={
394
+ col: SemanticType(sem_str)
395
+ for col, sem_str in data.get("column_overrides", {}).items()
396
+ },
397
+ profiling=ProfileConfig.from_dict(data.get("profiling", {})),
398
+ )
399
+
400
+ def to_json(self, indent: int = 2) -> str:
401
+ return json.dumps(self.to_dict(), indent=indent)
402
+
403
+ @classmethod
404
+ def from_json(cls, json_str: str) -> "PipelineConfig":
405
+ return cls.from_dict(json.loads(json_str))
406
+
407
+
278
408
  @dataclass
279
409
  class ColumnTypeInfo:
280
410
  column: str
@@ -35,11 +35,13 @@ from ._target_profiler import TargetProfiler
35
35
  from ._correlation_profiler import CorrelationProfiler
36
36
  from ._type_detector import TypeDetector
37
37
  from .config import (
38
- ProfileConfig,
38
+ PipelineConfig,
39
+ PipelinePhase,
39
40
  ColumnProfile,
40
41
  StructuralProfileResult,
41
42
  RowMissingnessDistribution,
42
43
  SemanticType,
44
+ TypeFlag,
43
45
  Modality,
44
46
  )
45
47
 
@@ -63,14 +65,16 @@ _COLUMN_PROFILER_REGISTRY: dict[SemanticType, type[ColumnBatchProfiler]] = { #
63
65
 
64
66
  class StructuralProfiler:
65
67
 
66
- def __init__(self, config: ProfileConfig | None = None) -> None:
67
- self.config = config or ProfileConfig()
68
+ def __init__(self, config: PipelineConfig | None = None) -> None:
69
+ self.config: PipelineConfig = config or PipelineConfig()
70
+ # Keep sub-profilers aligned with the master column_overrides.
71
+ self.config.profiling.column_overrides = self.config.column_overrides
68
72
 
69
- if self.config.modality == Modality.Tabular:
70
- self.modality_profiler: ModalityProfiler = TabularProfiler(self.config)
73
+ if self.config.profiling.modality == Modality.Tabular:
74
+ self.modality_profiler: ModalityProfiler = TabularProfiler()
71
75
  else:
72
76
  raise NotImplementedError(
73
- f"modality {self.config.modality} not supported yet"
77
+ f"modality {self.config.profiling.modality} not supported yet"
74
78
  )
75
79
 
76
80
  # ------------------------------------------------------------------
@@ -86,7 +90,17 @@ class StructuralProfiler:
86
90
 
87
91
  result = StructuralProfileResult()
88
92
 
89
- active_cols = [c for c in data.columns if c not in self.config.exclude_columns]
93
+ active_cols = self.config.resolve_active_columns(
94
+ PipelinePhase.Profiling, list(data.columns)
95
+ )
96
+
97
+ # Columns soft-excluded for Profiling: skipped but retained in the result.
98
+ hard_set = set(self.config.exclude_columns)
99
+ soft_retained = [
100
+ c for c in data.columns
101
+ if c in set(self.config.phase_exclusions.get(PipelinePhase.Profiling, []))
102
+ and c not in hard_set
103
+ ]
90
104
 
91
105
  # ── 1. Modality profiler ─────────────────────────────────────────
92
106
  # Replaces default DatasetStats with the real one (row_count, memory,
@@ -96,7 +110,7 @@ class StructuralProfiler:
96
110
  # ── 2. Missingness pre-pass ──────────────────────────────────────
97
111
  # setdefault creates ColumnProfile entries; subsequent steps mutate
98
112
  # the same objects via the same setdefault pattern.
99
- missingness_result = MissingnessProfiler(config=self.config).profile(
113
+ missingness_result = MissingnessProfiler().profile(
100
114
  data, columns=active_cols
101
115
  )
102
116
  for col_name in missingness_result.analysed_columns:
@@ -111,7 +125,6 @@ class StructuralProfiler:
111
125
  df=data,
112
126
  cols=active_cols,
113
127
  n_rows=data.height,
114
- overrides=self.config.column_overrides,
115
128
  )
116
129
 
117
130
  # ── 4. Type detection ────────────────────────────────────────────
@@ -130,7 +143,10 @@ class StructuralProfiler:
130
143
  # Overrides for excluded / non-existent columns are silently ignored.
131
144
  for col_name, override_type in self.config.column_overrides.items():
132
145
  if col_name in result.columns:
133
- result.columns[col_name].semantic_type = override_type
146
+ cp = result.columns[col_name]
147
+ cp.semantic_type = override_type
148
+ if TypeFlag.UserOverride not in cp.type_flags:
149
+ cp.type_flags.append(TypeFlag.UserOverride)
134
150
 
135
151
  # ── 6. Per-column profiling routed by SemanticType ───────────────
136
152
  # Batch all columns of the same SemanticType together and call each
@@ -149,7 +165,7 @@ class StructuralProfiler:
149
165
  profiler_cls = _COLUMN_PROFILER_REGISTRY.get(sem_type) # type: ignore[arg-type]
150
166
  if profiler_cls is None:
151
167
  continue
152
- profiler = profiler_cls(config=self.config)
168
+ profiler = profiler_cls()
153
169
  try:
154
170
  batch = profiler.profile(data, columns=cols)
155
171
  for col_name in batch.analysed_columns:
@@ -161,13 +177,13 @@ class StructuralProfiler:
161
177
  # ── 7. Target columns ────────────────────────────────────────────
162
178
  # TargetProfiler produces target-specific analysis stored in
163
179
  # result.targets. cp.stats is NOT overwritten — step 6 already set it.
164
- if self.config.target_columns:
165
- for target in self.config.target_columns:
180
+ if self.config.profiling.target_columns:
181
+ for target in self.config.profiling.target_columns:
166
182
  if target not in data.columns:
167
183
  continue
168
184
  target_result = TargetProfiler(
169
185
  target_column=target,
170
- config=self.config,
186
+ config=self.config.profiling,
171
187
  ).profile(data)
172
188
  result.targets[target] = target_result
173
189
 
@@ -176,7 +192,7 @@ class StructuralProfiler:
176
192
  cp.is_target = True
177
193
 
178
194
  # ── 8. Correlation ───────────────────────────────────────────────
179
- if self.config.compute_correlation:
195
+ if self.config.profiling.compute_correlation:
180
196
  # Resolve column lists by detected SemanticType (post-override).
181
197
  numeric_cols = [
182
198
  c
@@ -194,7 +210,7 @@ class StructuralProfiler:
194
210
  corr_profiler = CorrelationProfiler(
195
211
  numeric_columns=numeric_cols,
196
212
  categorical_columns=categorical_cols,
197
- config=self.config,
213
+ config=self.config.profiling,
198
214
  )
199
215
 
200
216
  # 8a. Feature-feature matrices — computed ONCE, target-independent.
@@ -205,7 +221,7 @@ class StructuralProfiler:
205
221
 
206
222
  # 8b. Per-target analysis — matrices are NOT recomputed; each call
207
223
  # shallow-copies feature_corr and appends target-specific fields.
208
- for target in self.config.target_columns:
224
+ for target in self.config.profiling.target_columns:
209
225
  if target not in data.columns:
210
226
  continue
211
227
  result.dataset.target_correlations[target] = (
@@ -214,6 +230,12 @@ class StructuralProfiler:
214
230
  )
215
231
  )
216
232
 
233
+ # ── Soft-excluded placeholders ───────────────────────────────────────
234
+ # Columns soft-excluded for Profiling are not profiled but must still
235
+ # appear in the result so downstream phases can reference them.
236
+ for col in soft_retained:
237
+ result.columns.setdefault(col, ColumnProfile(name=col))
238
+
217
239
  return result
218
240
 
219
241
  # ------------------------------------------------------------------
@@ -225,9 +247,8 @@ class StructuralProfiler:
225
247
  df: pl.DataFrame,
226
248
  cols: list[str],
227
249
  n_rows: int,
228
- overrides: dict[str, SemanticType],
229
250
  ) -> RowMissingnessDistribution:
230
- from ._missingness_profiler import (
251
+ from ._null_detection import (
231
252
  _sentinel_eligible,
232
253
  _inf_eligible,
233
254
  _SENTINEL_STRINGS,
@@ -242,10 +263,9 @@ class StructuralProfiler:
242
263
 
243
264
  for col_name in cols:
244
265
  dtype = df[col_name].dtype
245
- override = overrides.get(col_name)
246
266
  null_e = pl.col(col_name).is_null()
247
267
 
248
- if _sentinel_eligible(dtype, override):
268
+ if _sentinel_eligible(dtype):
249
269
  eff = (
250
270
  null_e
251
271
  | (pl.col(col_name).str.strip_chars() == "")
@@ -82,8 +82,6 @@ _EXT_LOADERS: dict[str, callable] = {
82
82
 
83
83
 
84
84
  class DataLoader:
85
- def __init__(self, fmt: str | None = None) -> None:
86
- self._fmt_override = fmt.lower() if fmt else None
87
85
 
88
86
  def load(
89
87
  self,
@@ -92,7 +90,7 @@ class DataLoader:
92
90
  ) -> pl.DataFrame:
93
91
  raw, ext_from_path = _read_raw(source)
94
92
 
95
- resolved_fmt = (fmt or self._fmt_override or ext_from_path or "").lower()
93
+ resolved_fmt = (ext_from_path or "").lower()
96
94
 
97
95
  if resolved_fmt not in _EXT_LOADERS:
98
96
  label = resolved_fmt if resolved_fmt else "<unknown>"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 0.7.0
3
+ Version: 0.9.0
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -22,6 +22,7 @@ src/dataforge_ml/profiling/_datetime_config.py
22
22
  src/dataforge_ml/profiling/_datetime_profiler.py
23
23
  src/dataforge_ml/profiling/_missingness_config.py
24
24
  src/dataforge_ml/profiling/_missingness_profiler.py
25
+ src/dataforge_ml/profiling/_null_detection.py
25
26
  src/dataforge_ml/profiling/_numeric_config.py
26
27
  src/dataforge_ml/profiling/_numeric_profiler.py
27
28
  src/dataforge_ml/profiling/_tabular.py
File without changes
File without changes
File without changes