dataforge-ml 0.8.0__tar.gz → 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/PKG-INFO +1 -1
  2. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/pyproject.toml +1 -1
  3. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/__init__.py +4 -0
  4. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_base.py +11 -14
  5. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_boolean_profiler.py +4 -41
  6. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_categorical.py +3 -44
  7. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_correlation_profiler.py +1 -1
  8. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_datetime_profiler.py +7 -34
  9. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_missingness_profiler.py +8 -69
  10. dataforge_ml-0.9.0/src/dataforge_ml/profiling/_null_detection.py +22 -0
  11. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_numeric_profiler.py +3 -32
  12. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_tabular.py +25 -26
  13. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_target_profiler.py +3 -3
  14. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_text_profiler.py +4 -42
  15. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/config.py +102 -1
  16. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/structural.py +36 -20
  17. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/PKG-INFO +1 -1
  18. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/SOURCES.txt +1 -0
  19. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/LICENSE +0 -0
  20. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/README.md +0 -0
  21. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/setup.cfg +0 -0
  22. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/__init__.py +0 -0
  23. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/models/__init__.py +0 -0
  24. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/models/_data_structure.py +0 -0
  25. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/models/_data_types.py +0 -0
  26. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
  27. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
  28. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
  29. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
  30. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
  31. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
  32. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_target_config.py +0 -0
  33. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_text_config.py +0 -0
  34. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_type_detector.py +0 -0
  35. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/splitting/__init__.py +0 -0
  36. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/splitting/_config.py +0 -0
  37. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/splitting/_splitter.py +0 -0
  38. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/utils/__init__.py +0 -0
  39. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/utils/data_loader.py +0 -0
  40. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
  41. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/requires.txt +0 -0
  42. {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 0.8.0
3
+ Version: 0.9.0
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dataforge-ml"
7
- version = "0.8.0"
7
+ version = "0.9.0"
8
8
  description = "A automated feature engineering and designing pipeline library"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -1,6 +1,8 @@
1
1
  from .structural import StructuralProfiler
2
2
  from .config import (
3
3
  ProfileConfig,
4
+ PipelineConfig,
5
+ PipelinePhase,
4
6
  SemanticType,
5
7
  Modality,
6
8
  TypeFlag,
@@ -19,6 +21,8 @@ from ._base import ModalityProfiler
19
21
  __all__ = [
20
22
  "StructuralProfiler",
21
23
  "ProfileConfig",
24
+ "PipelineConfig",
25
+ "PipelinePhase",
22
26
  "SemanticType",
23
27
  "Modality",
24
28
  "TypeFlag",
@@ -3,9 +3,9 @@ Abstract base classes for all structural profilers.
3
3
 
4
4
  Hierarchy
5
5
  ---------
6
- Profiling[R] — root: stores config, provides _resolve_columns
7
- ├── ColumnBatchProfiler[R] — registry tier: __init__(config=None) only;
8
- │ │ profile(df, columns) processes a typed column batch
6
+ Profiling[R] — root: thin ABC, provides _resolve_columns
7
+ ├── ColumnBatchProfiler[R] — registry tier: profile(df, columns) processes a
8
+ │ │ typed column batch; no config, no eligibility gates
9
9
  │ ├── NumericProfiler
10
10
  │ ├── CategoricalProfiler
11
11
  │ ├── DatetimeProfiler
@@ -26,22 +26,19 @@ import polars as pl
26
26
  from abc import abstractmethod, ABC
27
27
  from typing import Generic, TypeVar
28
28
 
29
- from .config import DatasetStats, ProfileConfig
29
+ from .config import DatasetStats
30
30
 
31
31
  R = TypeVar("R")
32
32
 
33
33
 
34
34
  class Profiling(ABC, Generic[R]):
35
35
  """
36
- Root base for all profilers.
36
+ Root base for all profilers. Thin ABC — no config state.
37
37
 
38
- Stores config and provides _resolve_columns. Not instantiated directly
39
- use one of the three concrete tier bases below.
38
+ Sub-processors are pure batch processors: given a DataFrame and a column
39
+ list, return a result. No routing, no scoping, no config.
40
40
  """
41
41
 
42
- def __init__(self, config: ProfileConfig | None = None):
43
- self.config = config or ProfileConfig()
44
-
45
42
  @abstractmethod
46
43
  def profile(self, data: pl.DataFrame, **kwargs) -> R: ...
47
44
 
@@ -62,11 +59,11 @@ class ColumnBatchProfiler(Profiling[R]):
62
59
 
63
60
  Contract
64
61
  --------
65
- - __init__ must accept ONLY config (no extra required params). This allows
66
- StructuralProfiler to instantiate any registered profiler uniformly via
67
- profiler_cls(config=self.config)
62
+ - __init__ takes no arguments (instantiated as profiler_cls()).
68
63
  - profile(df, columns) receives the full DataFrame and the list of same-type
69
- column names to process. Returns a result with:
64
+ column names to process. Profiles every column in the list without any
65
+ internal eligibility gate or config consultation.
66
+ - Returns a result with:
70
67
  .columns: dict[str, <Stats>] — per-column stats
71
68
  .analysed_columns: list[str] — columns actually profiled
72
69
  """
@@ -22,11 +22,7 @@ from __future__ import annotations
22
22
  import polars as pl
23
23
 
24
24
  from ._base import ColumnBatchProfiler
25
- from .config import (
26
- ProfileConfig,
27
- BooleanStats,
28
- SemanticType,
29
- )
25
+ from .config import BooleanStats
30
26
  from ._boolean_config import BooleanProfileResult
31
27
  from ..models._data_types import _INT_DTYPES
32
28
 
@@ -42,22 +38,10 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
42
38
  """
43
39
  Boolean column profiler for Polars DataFrames.
44
40
 
45
- A column is eligible when:
46
- - Its Polars dtype is pl.Boolean, OR
47
- - Its dtype is an integer with values exclusively in {0, 1}, OR
48
- - It has a SemanticType.Boolean override in ProfileConfig.column_overrides
49
-
50
- Non-eligible columns in the provided list are silently skipped.
51
-
52
- Parameters
53
- ----------
54
- config : ProfileConfig | None
55
- Shared profiling configuration.
41
+ Profiles every column passed to profile(df, columns) — no config,
42
+ no internal eligibility gate.
56
43
  """
57
44
 
58
- def __init__(self, config: ProfileConfig | None = None) -> None:
59
- super().__init__(config)
60
-
61
45
  # ------------------------------------------------------------------
62
46
  # Public API
63
47
  # ------------------------------------------------------------------
@@ -69,23 +53,6 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
69
53
  ) -> BooleanProfileResult:
70
54
  return self._run(data, columns)
71
55
 
72
- # ------------------------------------------------------------------
73
- # Eligibility
74
- # ------------------------------------------------------------------
75
-
76
- def _eligible(self, series: pl.Series) -> bool:
77
- override = self.config.column_overrides.get(series.name)
78
-
79
- # Explicit override — trust it
80
- if override == SemanticType.Boolean:
81
- return True
82
-
83
- # Another override takes precedence over auto-detection
84
- if override is not None:
85
- return False
86
-
87
- return True
88
-
89
56
  # ------------------------------------------------------------------
90
57
  # Orchestration
91
58
  # ------------------------------------------------------------------
@@ -97,11 +64,7 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
97
64
  ) -> BooleanProfileResult:
98
65
  result = BooleanProfileResult()
99
66
 
100
- available = [
101
- c
102
- for c in self._resolve_columns(df.columns, columns)
103
- if self._eligible(df[c])
104
- ]
67
+ available = self._resolve_columns(df.columns, columns)
105
68
  result.analysed_columns = available
106
69
 
107
70
  for col_name in available:
@@ -45,10 +45,6 @@ from ._categorical_config import (
45
45
  RareCategoryStats,
46
46
  ImbalanceMetrics,
47
47
  )
48
- from .config import (
49
- ProfileConfig,
50
- SemanticType,
51
- )
52
48
 
53
49
  # ---------------------------------------------------------------------------
54
50
  # Module-level thresholds (documented so callers can see what drives flags)
@@ -65,29 +61,10 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
65
61
  """
66
62
  Categorical profiler for Polars DataFrames.
67
63
 
68
- Parameters
69
- ----------
70
- columns : list[str]
71
- Columns to profile. The profiler intersects this list with
72
- the DataFrame's actual columns at runtime.
73
- config : ProfileConfig | None
74
- Shared profiling configuration (used for chunk_size, etc.).
75
-
76
- Usage
77
- -----
78
- >>> profiler = CategoricalProfiler(
79
- ... columns=["status", "country", "product_type"],
80
- ... )
81
- >>> result = profiler.profile(df)
82
- >>> print(result)
64
+ Profiles every column passed to profile(df, columns) — no config,
65
+ no internal eligibility gate.
83
66
  """
84
67
 
85
- def __init__(
86
- self,
87
- config: ProfileConfig | None = None,
88
- ) -> None:
89
- super().__init__(config)
90
-
91
68
  # ------------------------------------------------------------------
92
69
  # Public API
93
70
  # ------------------------------------------------------------------
@@ -103,19 +80,6 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
103
80
  # Orchestration
104
81
  # ------------------------------------------------------------------
105
82
 
106
- def _eligible(
107
- self,
108
- series: pl.Series,
109
- ) -> bool:
110
- override = self.config.column_overrides.get(series.name)
111
- if override == SemanticType.Categorical:
112
- return True
113
-
114
- if override is not None:
115
- return False
116
-
117
- return True
118
-
119
83
  def _run(
120
84
  self,
121
85
  df: pl.DataFrame,
@@ -123,12 +87,7 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
123
87
  ) -> CategoricalProfileResult:
124
88
  result = CategoricalProfileResult()
125
89
 
126
- # Resolve columns against actual schema
127
- available = [
128
- c
129
- for c in self._resolve_columns(df.columns, columns)
130
- if self._eligible(df[c])
131
- ]
90
+ available = self._resolve_columns(df.columns, columns)
132
91
  result.analysed_columns = available
133
92
 
134
93
  n_rows = df.height
@@ -125,7 +125,7 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
125
125
  near_redundant_threshold: float = _NEAR_REDUNDANT_THRESHOLD,
126
126
  top_n_feature_target: int = _TOP_N_FEATURE_TARGET,
127
127
  ) -> None:
128
- super().__init__(config)
128
+ super().__init__()
129
129
  self._numeric_columns = numeric_columns
130
130
  self._categorical_columns = categorical_columns or []
131
131
  self._threshold = near_redundant_threshold
@@ -43,10 +43,6 @@ from datetime import datetime, timezone
43
43
  import polars as pl
44
44
 
45
45
  from ._base import ColumnBatchProfiler
46
- from .config import (
47
- ProfileConfig,
48
- SemanticType,
49
- )
50
46
  from ._datetime_config import (
51
47
  DatetimeProfileResult,
52
48
  DatetimeStats,
@@ -90,20 +86,11 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
90
86
  """
91
87
  Datetime distribution profiler for Polars DataFrames.
92
88
 
93
- Parameters
94
- ----------
95
- columns : list[str]
96
- Columns to profile. Non-datetime columns are skipped with a warning.
97
- config : ProfileConfig | None
98
- Shared profiling configuration.
89
+ Profiles every column passed to profile(df, columns) — no config,
90
+ no override-based eligibility gate. String columns are coerced to Datetime;
91
+ non-datetime columns and strings that cannot be coerced are silently skipped.
99
92
  """
100
93
 
101
- def __init__(
102
- self,
103
- config: ProfileConfig | None = None,
104
- ) -> None:
105
- super().__init__(config)
106
-
107
94
  # ------------------------------------------------------------------
108
95
  # Public API
109
96
  # ------------------------------------------------------------------
@@ -119,35 +106,21 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
119
106
  # Orchestration
120
107
  # ------------------------------------------------------------------
121
108
 
122
- def _eligible(self, series: pl.Series) -> bool:
123
- override = self.config.column_overrides.get(series.name)
124
-
125
- if override == SemanticType.Datetime:
126
- return True
127
- if override is not None:
128
- return False
129
-
130
- return _is_datetime_dtype(series.dtype) or series.dtype in (pl.Utf8, pl.String)
131
-
132
109
  def _coerce_to_datetime(self, series: pl.Series) -> pl.Series | None:
133
110
  if series.dtype in (pl.Utf8, pl.String):
134
111
  coerced = series.str.to_datetime(strict=False)
135
112
  return coerced if coerced.drop_nulls().len() > 0 else None
136
- return series
113
+ if _is_datetime_dtype(series.dtype):
114
+ return series
115
+ return None
137
116
 
138
117
  def _run(self, df: pl.DataFrame, columns: list[str]) -> DatetimeProfileResult:
139
118
  result = DatetimeProfileResult()
140
119
  now = datetime.now(tz=timezone.utc)
141
120
 
142
- candidates = [
143
- c
144
- for c in self._resolve_columns(df.columns, columns)
145
- if self._eligible(df[c])
146
- ]
147
-
148
121
  available = []
149
122
  coerced_cache = {}
150
- for col_name in candidates:
123
+ for col_name in self._resolve_columns(df.columns, columns):
151
124
  series = self._coerce_to_datetime(df[col_name])
152
125
  if series is not None:
153
126
  available.append(col_name)
@@ -3,19 +3,10 @@ MissingnessProfiler – Phase 1 extension: Missingness Profiling.
3
3
 
4
4
  Eligibility model
5
5
  -----------------
6
- Effective-null detection is based on **dtype first**, with SemanticType
7
- overrides acting only as suppressors, not as enablers:
6
+ Effective-null detection is purely dtype-driven — no SemanticType overrides:
8
7
 
9
- sentinel-string detection → runs when dtype is Utf8/String
10
- suppressed if override is Numeric / Datetime / Boolean
11
- (those types cannot have meaningful sentinel strings)
12
-
13
- Inf / NaN expansion → runs when dtype is Float32/Float64
14
- never suppressed (Inf in a float column is always
15
- effectively missing regardless of semantic label)
16
-
17
- column_overrides is SPARSE — most columns will have no entry.
18
- Absence of an override is not a signal; it means "trust the dtype".
8
+ sentinel-string detection → runs for every String/Utf8 column unconditionally
9
+ Inf / NaN expansion → runs for every Float32/Float64 column unconditionally
19
10
  """
20
11
 
21
12
  from __future__ import annotations
@@ -24,13 +15,13 @@ from __future__ import annotations
24
15
  import polars as pl
25
16
 
26
17
  from ._base import DatasetLevelProfiler
27
- from .config import ProfileConfig, SemanticType
28
18
  from ._missingness_config import (
29
19
  ColumnMissingnessProfile,
30
20
  MissingnessFlag,
31
21
  MissingnessProfileResult,
32
22
  MissingSeverity,
33
23
  )
24
+ from ._null_detection import _SENTINEL_STRINGS, _inf_eligible, _sentinel_eligible
34
25
 
35
26
  # ---------------------------------------------------------------------------
36
27
  # Thresholds
@@ -43,52 +34,12 @@ _SEVERITY_HIGH = 0.20
43
34
  _MAR_CORRELATION_THRESHOLD = 0.60
44
35
  _COL_DROP_THRESHOLD = 0.50
45
36
 
46
- _SENTINEL_STRINGS = frozenset({"NA", "NAN", "NULL", "NONE", "?"})
47
-
48
- # Overrides that suppress sentinel-string detection on a String column.
49
- # If a column is String but the user says "this is Numeric", treating
50
- # "NA" as a sentinel is correct — but if they say Categorical or Text,
51
- # sentinel detection still makes sense and should run.
52
- _SENTINEL_SUPPRESSING_SEMANTICS = frozenset(
53
- {
54
- SemanticType.Numeric,
55
- SemanticType.Datetime,
56
- SemanticType.Boolean,
57
- SemanticType.Identifier,
58
- }
59
- )
60
-
61
-
62
- def _sentinel_eligible(dtype: pl.DataType, override: SemanticType | None) -> bool:
63
- """True when sentinel-string detection should run for this column."""
64
- if dtype not in (pl.Utf8, pl.String):
65
- return False
66
- # Override present and it's a non-text semantic → suppress
67
- if override is not None and override in _SENTINEL_SUPPRESSING_SEMANTICS:
68
- return False
69
- return True
70
-
71
-
72
- def _inf_eligible(dtype: pl.DataType) -> bool:
73
- """True when Inf/NaN expansion should run. Always dtype-driven, never suppressed."""
74
- return dtype in (pl.Float32, pl.Float64)
75
-
76
37
 
77
38
  class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
78
- """
79
- Missingness profiler for Polars DataFrames.
80
-
81
- Column scoping
82
- --------------
83
- Resolution priority (high → low):
84
- 1. Explicit ``columns`` argument to ``profile()``.
85
- 2. ``config.exclude_columns`` — always removed.
86
- 3. All remaining DataFrame columns.
87
- """
39
+ """Missingness profiler for Polars DataFrames."""
88
40
 
89
- def __init__(self, config: ProfileConfig | None = None) -> None:
90
- super().__init__(config)
91
- self._config: ProfileConfig = config or ProfileConfig()
41
+ def __init__(self) -> None:
42
+ super().__init__()
92
43
 
93
44
  # ------------------------------------------------------------------
94
45
  # Public API
@@ -117,16 +68,13 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
117
68
  if n_rows == 0 or not cols:
118
69
  return result
119
70
 
120
- overrides = self._config.column_overrides # sparse — most keys absent
121
71
  indicator_cols: list[pl.Series] = []
122
72
 
123
73
  for col_name in cols:
124
- override = overrides.get(col_name) # None for most columns
125
74
  col_profile, indicator = self._profile_column(
126
75
  series=df[col_name],
127
76
  col_name=col_name,
128
77
  n_rows=n_rows,
129
- override=override,
130
78
  )
131
79
  result.columns[col_name] = col_profile
132
80
  indicator_cols.append(indicator)
@@ -173,21 +121,12 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
173
121
  series: pl.Series,
174
122
  col_name: str,
175
123
  n_rows: int,
176
- override: SemanticType | None = None, # sparse — None is the common case
177
124
  ) -> tuple[ColumnMissingnessProfile, pl.Series]:
178
- """
179
- Compute standard + effective null counts for one column.
180
-
181
- Eligibility is dtype-first:
182
- - sentinel strings → String dtype, unless override suppresses it
183
- - Inf/NaN → Float dtype, always (never suppressed)
184
- - everything else → standard Polars null only
185
- """
186
125
  profile = ColumnMissingnessProfile(column=col_name, total_rows=n_rows)
187
126
  dtype = series.dtype
188
127
  std_null = series.is_null()
189
128
 
190
- if _sentinel_eligible(dtype, override):
129
+ if _sentinel_eligible(dtype):
191
130
  eff_null = (
192
131
  std_null
193
132
  | (series.str.strip_chars() == "")
@@ -0,0 +1,22 @@
1
+ """
2
+ _null_detection – shared dtype-driven null primitives for Phase 1.
3
+
4
+ Single authority for what counts as "effectively null" across the entire
5
+ Phase 1 implementation. No config, no SemanticType overrides, no state.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import polars as pl
11
+
12
+ _SENTINEL_STRINGS: frozenset[str] = frozenset({"NA", "NAN", "NULL", "NONE", "?"})
13
+
14
+
15
+ def _sentinel_eligible(dtype: pl.DataType) -> bool:
16
+ """True when sentinel-string detection should run for this column (String/Utf8 only)."""
17
+ return dtype in (pl.Utf8, pl.String)
18
+
19
+
20
+ def _inf_eligible(dtype: pl.DataType) -> bool:
21
+ """True when Inf/NaN expansion should run (Float32/Float64 only)."""
22
+ return dtype in (pl.Float32, pl.Float64)
@@ -35,10 +35,6 @@ from __future__ import annotations
35
35
  import polars as pl
36
36
 
37
37
  from ._base import ColumnBatchProfiler
38
- from .config import (
39
- ProfileConfig,
40
- SemanticType,
41
- )
42
38
  from ._correlation_profiler import _INT_DTYPES
43
39
  from ._numeric_config import (
44
40
  NumericProfileResult,
@@ -80,21 +76,10 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
80
76
  """
81
77
  Numeric distribution profiler for Polars DataFrames.
82
78
 
83
- Parameters
84
- ----------
85
- columns : list[str]
86
- Columns to profile. Non-numeric or absent columns are skipped
87
- with a warning; they do not raise.
88
- config : ProfileConfig | None
89
- Shared profiling configuration.
79
+ Profiles every column passed to profile(df, columns) — no config,
80
+ no internal eligibility gate.
90
81
  """
91
82
 
92
- def __init__(
93
- self,
94
- config: ProfileConfig | None = None,
95
- ) -> None:
96
- super().__init__(config)
97
-
98
83
  # ------------------------------------------------------------------
99
84
  # Public API
100
85
  # ------------------------------------------------------------------
@@ -110,16 +95,6 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
110
95
  # Orchestration
111
96
  # ------------------------------------------------------------------
112
97
 
113
- def _eligible(self, series: pl.Series) -> bool:
114
- override = self.config.column_overrides.get(series.name)
115
- if override == SemanticType.Numeric:
116
- return True
117
-
118
- if override is not None:
119
- return False
120
-
121
- return True
122
-
123
98
  def _run(
124
99
  self,
125
100
  df: pl.DataFrame,
@@ -128,11 +103,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
128
103
  result = NumericProfileResult()
129
104
  n_rows = df.height
130
105
 
131
- available = [
132
- c
133
- for c in self._resolve_columns(df.columns, columns)
134
- if self._eligible(df[c])
135
- ]
106
+ available = self._resolve_columns(df.columns, columns)
136
107
  result.analysed_columns = available
137
108
 
138
109
  if not available:
@@ -3,16 +3,18 @@ TabularProfiler – Phase 1: Structural Profiling for tabular datasets.
3
3
 
4
4
  All DataFrame operations use Polars (no pandas dependency).
5
5
 
6
+ A pipeline-agnostic data-catalog tool: receives the full raw DataFrame and
7
+ computes dataset-level stats over every column — no exclusion logic, no
8
+ config dependency.
9
+
6
10
  Computes:
7
- • row / column count (always full dataset)
11
+ • row / column count (full dataset)
8
12
  • memory usage + per-column breakdown when threshold exceeded
9
- • duplicate row count & ratio (scoped to config.duplicate_columns)
10
- • overall sparsity (scoped to config.sparsity_columns)
11
- • data-type detection (scoped to config.type_detection_columns;
12
- skipped entirely when None)
13
+ • duplicate row count & ratio (all columns)
14
+ • overall sparsity (all columns)
13
15
 
14
16
  Chunked processing is activated automatically when the DataFrame's
15
- estimated memory exceeds config.memory_threshold_mb.
17
+ estimated memory exceeds _MEMORY_THRESHOLD_MB.
16
18
  """
17
19
 
18
20
  from __future__ import annotations
@@ -24,31 +26,32 @@ import polars as pl
24
26
  from ._base import ModalityProfiler
25
27
  from .config import (
26
28
  MemoryBreakdown,
27
- ProfileConfig,
28
29
  DatasetStats,
29
30
  )
30
31
 
32
+ # ---------------------------------------------------------------------------
33
+ # Module-level constants (previously sourced from ProfileConfig)
34
+ # ---------------------------------------------------------------------------
35
+
36
+ _MEMORY_THRESHOLD_MB: float = 500.0
37
+ _CHUNK_SIZE: int = 100_000
38
+
31
39
 
32
40
  class TabularProfiler(ModalityProfiler):
33
41
  """
34
42
  Structural profiler for Polars DataFrames.
35
43
 
44
+ Pipeline-agnostic: accepts no constructor arguments and applies no column
45
+ filtering. Computes dataset-level stats (row count, column count, memory,
46
+ duplicate ratio, overall sparsity) over the complete DataFrame it receives.
47
+
36
48
  Usage
37
49
  -----
38
- >>> cfg = ProfileConfig(
39
- ... duplicate_columns=["user_id", "event_time"],
40
- ... sparsity_columns=["age", "income", "postcode"],
41
- ... type_detection_columns=["age", "income", "postcode", "created_at"],
42
- ... memory_threshold_mb=200,
43
- ... )
44
- >>> profiler = TabularProfiler(config=cfg)
50
+ >>> profiler = TabularProfiler()
45
51
  >>> result = profiler.profile(df)
46
52
  >>> print(result)
47
53
  """
48
54
 
49
- def __init__(self, config: ProfileConfig | None = None):
50
- super().__init__(config)
51
-
52
55
  # ------------------------------------------------------------------
53
56
  # Public API
54
57
  # ------------------------------------------------------------------
@@ -77,17 +80,13 @@ class TabularProfiler(ModalityProfiler):
77
80
  if result.row_count == 0:
78
81
  return result
79
82
 
80
- # 3. Resolve column scopes
83
+ # 3. Operate on all columns — no exclusion logic
81
84
  all_cols: list[str] = df.columns
82
- analysed_cols = [c for c in all_cols if c not in self.config.exclude_columns]
83
-
84
- dup_cols = analysed_cols
85
- missingness_cols = analysed_cols
86
85
 
87
86
  if use_chunks:
88
- self._chunked_metrics(df, dup_cols, missingness_cols, result)
87
+ self._chunked_metrics(df, all_cols, all_cols, result)
89
88
  else:
90
- self._full_metrics(df, dup_cols, missingness_cols, result)
89
+ self._full_metrics(df, all_cols, all_cols, result)
91
90
 
92
91
  return result
93
92
 
@@ -136,7 +135,7 @@ class TabularProfiler(ModalityProfiler):
136
135
  total_bytes = sum(col_bytes.values())
137
136
 
138
137
  result.memory_bytes = total_bytes
139
- threshold_bytes = self.config.memory_threshold_mb * 1024 * 1024
138
+ threshold_bytes = _MEMORY_THRESHOLD_MB * 1024 * 1024
140
139
 
141
140
  if total_bytes > threshold_bytes:
142
141
  result.memory_breakdown = MemoryBreakdown(column_bytes=col_bytes)
@@ -189,7 +188,7 @@ class TabularProfiler(ModalityProfiler):
189
188
  seen hashes — semantics match keep='first'.
190
189
  Sparsity is accumulated as (missing_cells, total_cells).
191
190
  """
192
- chunk_size = self.config.chunk_size
191
+ chunk_size = _CHUNK_SIZE
193
192
  n_chunks = math.ceil(result.row_count / chunk_size)
194
193
 
195
194
  seen_hashes: set[int] = set()
@@ -36,7 +36,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
36
36
  """
37
37
 
38
38
  def __init__(self, target_column: str, config: ProfileConfig | None = None) -> None:
39
- super().__init__(config)
39
+ super().__init__()
40
40
  self.target_column = target_column
41
41
 
42
42
  def profile(self, data: pl.DataFrame, **kwargs) -> TargetProfileResult:
@@ -129,7 +129,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
129
129
  self, series: pl.Series, n_rows: int, result: TargetProfileResult
130
130
  ) -> None:
131
131
  """Generates categorical metrics and checks for class imbalance."""
132
- cat_profiler = CategoricalProfiler(config=self.config)
132
+ cat_profiler = CategoricalProfiler()
133
133
 
134
134
  # Internally compute cardinality, top values, and imbalance metrics
135
135
  cat_profile = cat_profiler._profile_column(series, self.target_column, n_rows)
@@ -146,7 +146,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
146
146
  self, series: pl.Series, n_rows: int, result: TargetProfileResult
147
147
  ) -> None:
148
148
  """Generates numeric metrics and checks for target skewness."""
149
- num_profiler = NumericProfiler(config=self.config)
149
+ num_profiler = NumericProfiler()
150
150
 
151
151
  col_name = series.name
152
152
  num_result = num_profiler.profile(series.to_frame(), [col_name])
@@ -54,11 +54,7 @@ from __future__ import annotations
54
54
  import polars as pl
55
55
 
56
56
  from ._base import ColumnBatchProfiler
57
- from .config import (
58
- ProfileConfig,
59
- TextStats,
60
- SemanticType,
61
- )
57
+ from .config import TextStats
62
58
  from ._text_config import TextProfileResult
63
59
 
64
60
  # Regex that counts non-whitespace token runs — used with str.count_matches.
@@ -69,22 +65,10 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
69
65
  """
70
66
  Free-text column profiler for Polars DataFrames.
71
67
 
72
- A column is eligible when:
73
- - It has a ``SemanticType.Text`` override in
74
- ``ProfileConfig.column_overrides``, OR
75
- - Its Polars dtype is ``pl.Utf8`` / ``pl.String`` and no override is set.
76
-
77
- Non-eligible columns are silently skipped.
78
-
79
- Parameters
80
- ----------
81
- config : ProfileConfig | None
82
- Shared profiling configuration.
68
+ Profiles every column passed to profile(df, columns) — no config,
69
+ no internal eligibility gate.
83
70
  """
84
71
 
85
- def __init__(self, config: ProfileConfig | None = None) -> None:
86
- super().__init__(config)
87
-
88
72
  # ------------------------------------------------------------------
89
73
  # Public API
90
74
  # ------------------------------------------------------------------
@@ -96,24 +80,6 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
96
80
  ) -> TextProfileResult:
97
81
  return self._run(data, columns)
98
82
 
99
- # ------------------------------------------------------------------
100
- # Eligibility
101
- # ------------------------------------------------------------------
102
-
103
- def _eligible(self, series: pl.Series) -> bool:
104
- override = self.config.column_overrides.get(series.name)
105
-
106
- if override == SemanticType.Text:
107
- return True
108
-
109
- # Any other explicit override takes precedence
110
- if override is not None:
111
- return False
112
-
113
- # Native string dtype (pl.Utf8 is the canonical name; pl.String is
114
- # an alias in newer Polars — check both for cross-version safety)
115
- return series.dtype in (pl.Utf8, pl.String)
116
-
117
83
  # ------------------------------------------------------------------
118
84
  # Orchestration
119
85
  # ------------------------------------------------------------------
@@ -125,11 +91,7 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
125
91
  ) -> TextProfileResult:
126
92
  result = TextProfileResult()
127
93
 
128
- available = [
129
- c
130
- for c in self._resolve_columns(df.columns, columns)
131
- if self._eligible(df[c])
132
- ]
94
+ available = self._resolve_columns(df.columns, columns)
133
95
  result.analysed_columns = available
134
96
 
135
97
  for col_name in available:
@@ -52,6 +52,15 @@ class Modality(StrEnum):
52
52
  # TimeSeries = "time_series"
53
53
 
54
54
 
55
+ class PipelinePhase(StrEnum):
56
+ Profiling = "profiling"
57
+ Imputation = "imputation"
58
+ OutlierDetection = "outlier_detection"
59
+ Normalization = "normalization"
60
+ Encoding = "encoding"
61
+ Scaling = "scaling"
62
+
63
+
55
64
  # ---------------------------------------------------------------------------
56
65
  # Type-detection enums — kept for TypeDetector compatibility
57
66
  # ---------------------------------------------------------------------------
@@ -285,7 +294,7 @@ class ProfileConfig:
285
294
  def from_dict(cls, data: dict) -> ProfileConfig:
286
295
  return cls(
287
296
  modality=Modality(data.get("modality", Modality.Tabular)),
288
- target_column=data.get("target_column"),
297
+ target_columns=list(data.get("target_columns", [])),
289
298
  column_overrides={
290
299
  k: SemanticType(v) for k, v in data.get("column_overrides", {}).items()
291
300
  },
@@ -304,6 +313,98 @@ class ProfileConfig:
304
313
  return cls.from_dict(json.loads(json_str))
305
314
 
306
315
 
316
+ @dataclass
317
+ class PipelineConfig:
318
+ """
319
+ Master configuration for the full 6-phase feature engineering pipeline.
320
+
321
+ Parameters
322
+ ----------
323
+ exclude_columns : list[str]
324
+ Hard exclusions — columns dropped globally from every phase.
325
+ phase_exclusions : dict[PipelinePhase, list[str]]
326
+ Soft exclusions — columns bypassed for a specific phase but retained
327
+ in the dataset.
328
+ column_overrides : dict[str, SemanticType]
329
+ Explicit semantic type assignments respected by all downstream phases.
330
+ profiling : ProfileConfig
331
+ Phase 1-specific parameters (correlation, chunking, memory threshold).
332
+ """
333
+
334
+ exclude_columns: list[str] = field(default_factory=list)
335
+ phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict)
336
+ column_overrides: dict[str, SemanticType] = field(default_factory=dict)
337
+ profiling: ProfileConfig = field(default_factory=ProfileConfig)
338
+
339
+ def resolve_active_columns(
340
+ self, phase: PipelinePhase, available_columns: list[str]
341
+ ) -> list[str]:
342
+ """
343
+ Return the columns the given phase should operate on.
344
+
345
+ Hard exclusions are applied first, then phase-specific soft exclusions.
346
+ Columns absent from available_columns are silently ignored in both lists.
347
+ """
348
+ hard_set = set(self.exclude_columns)
349
+ soft_set = set(self.phase_exclusions.get(phase, []))
350
+ excluded = hard_set | soft_set
351
+ return [c for c in available_columns if c not in excluded]
352
+
353
+ def set_column_type(
354
+ self, column: str, semantic_type: Union[str, "SemanticType"]
355
+ ) -> None:
356
+ """
357
+ Explicitly set the semantic type for a column, overriding auto-detection.
358
+ This override is respected by all downstream phases.
359
+ """
360
+ if isinstance(semantic_type, str):
361
+ try:
362
+ semantic_type = SemanticType(semantic_type)
363
+ except ValueError:
364
+ valid = [e.value for e in SemanticType]
365
+ raise ValueError(
366
+ f"Unknown semantic type {semantic_type!r}. "
367
+ f"Valid values: {valid}"
368
+ )
369
+ self.column_overrides[column] = semantic_type
370
+
371
+ def to_dict(self) -> dict:
372
+ return {
373
+ "exclude_columns": list(self.exclude_columns),
374
+ "phase_exclusions": {
375
+ str(phase): list(cols)
376
+ for phase, cols in self.phase_exclusions.items()
377
+ },
378
+ "column_overrides": {
379
+ col: str(sem_type)
380
+ for col, sem_type in self.column_overrides.items()
381
+ },
382
+ "profiling": self.profiling.to_dict(),
383
+ }
384
+
385
+ @classmethod
386
+ def from_dict(cls, data: dict) -> "PipelineConfig":
387
+ return cls(
388
+ exclude_columns=list(data.get("exclude_columns", [])),
389
+ phase_exclusions={
390
+ PipelinePhase(phase_str): list(cols)
391
+ for phase_str, cols in data.get("phase_exclusions", {}).items()
392
+ },
393
+ column_overrides={
394
+ col: SemanticType(sem_str)
395
+ for col, sem_str in data.get("column_overrides", {}).items()
396
+ },
397
+ profiling=ProfileConfig.from_dict(data.get("profiling", {})),
398
+ )
399
+
400
+ def to_json(self, indent: int = 2) -> str:
401
+ return json.dumps(self.to_dict(), indent=indent)
402
+
403
+ @classmethod
404
+ def from_json(cls, json_str: str) -> "PipelineConfig":
405
+ return cls.from_dict(json.loads(json_str))
406
+
407
+
307
408
  @dataclass
308
409
  class ColumnTypeInfo:
309
410
  column: str
@@ -35,7 +35,8 @@ from ._target_profiler import TargetProfiler
35
35
  from ._correlation_profiler import CorrelationProfiler
36
36
  from ._type_detector import TypeDetector
37
37
  from .config import (
38
- ProfileConfig,
38
+ PipelineConfig,
39
+ PipelinePhase,
39
40
  ColumnProfile,
40
41
  StructuralProfileResult,
41
42
  RowMissingnessDistribution,
@@ -64,14 +65,16 @@ _COLUMN_PROFILER_REGISTRY: dict[SemanticType, type[ColumnBatchProfiler]] = { #
64
65
 
65
66
  class StructuralProfiler:
66
67
 
67
- def __init__(self, config: ProfileConfig | None = None) -> None:
68
- self.config = config or ProfileConfig()
68
+ def __init__(self, config: PipelineConfig | None = None) -> None:
69
+ self.config: PipelineConfig = config or PipelineConfig()
70
+ # Keep sub-profilers aligned with the master column_overrides.
71
+ self.config.profiling.column_overrides = self.config.column_overrides
69
72
 
70
- if self.config.modality == Modality.Tabular:
71
- self.modality_profiler: ModalityProfiler = TabularProfiler(self.config)
73
+ if self.config.profiling.modality == Modality.Tabular:
74
+ self.modality_profiler: ModalityProfiler = TabularProfiler()
72
75
  else:
73
76
  raise NotImplementedError(
74
- f"modality {self.config.modality} not supported yet"
77
+ f"modality {self.config.profiling.modality} not supported yet"
75
78
  )
76
79
 
77
80
  # ------------------------------------------------------------------
@@ -87,7 +90,17 @@ class StructuralProfiler:
87
90
 
88
91
  result = StructuralProfileResult()
89
92
 
90
- active_cols = [c for c in data.columns if c not in self.config.exclude_columns]
93
+ active_cols = self.config.resolve_active_columns(
94
+ PipelinePhase.Profiling, list(data.columns)
95
+ )
96
+
97
+ # Columns soft-excluded for Profiling: skipped but retained in the result.
98
+ hard_set = set(self.config.exclude_columns)
99
+ soft_retained = [
100
+ c for c in data.columns
101
+ if c in set(self.config.phase_exclusions.get(PipelinePhase.Profiling, []))
102
+ and c not in hard_set
103
+ ]
91
104
 
92
105
  # ── 1. Modality profiler ─────────────────────────────────────────
93
106
  # Replaces default DatasetStats with the real one (row_count, memory,
@@ -97,7 +110,7 @@ class StructuralProfiler:
97
110
  # ── 2. Missingness pre-pass ──────────────────────────────────────
98
111
  # setdefault creates ColumnProfile entries; subsequent steps mutate
99
112
  # the same objects via the same setdefault pattern.
100
- missingness_result = MissingnessProfiler(config=self.config).profile(
113
+ missingness_result = MissingnessProfiler().profile(
101
114
  data, columns=active_cols
102
115
  )
103
116
  for col_name in missingness_result.analysed_columns:
@@ -112,7 +125,6 @@ class StructuralProfiler:
112
125
  df=data,
113
126
  cols=active_cols,
114
127
  n_rows=data.height,
115
- overrides=self.config.column_overrides,
116
128
  )
117
129
 
118
130
  # ── 4. Type detection ────────────────────────────────────────────
@@ -153,7 +165,7 @@ class StructuralProfiler:
153
165
  profiler_cls = _COLUMN_PROFILER_REGISTRY.get(sem_type) # type: ignore[arg-type]
154
166
  if profiler_cls is None:
155
167
  continue
156
- profiler = profiler_cls(config=self.config)
168
+ profiler = profiler_cls()
157
169
  try:
158
170
  batch = profiler.profile(data, columns=cols)
159
171
  for col_name in batch.analysed_columns:
@@ -165,13 +177,13 @@ class StructuralProfiler:
165
177
  # ── 7. Target columns ────────────────────────────────────────────
166
178
  # TargetProfiler produces target-specific analysis stored in
167
179
  # result.targets. cp.stats is NOT overwritten — step 6 already set it.
168
- if self.config.target_columns:
169
- for target in self.config.target_columns:
180
+ if self.config.profiling.target_columns:
181
+ for target in self.config.profiling.target_columns:
170
182
  if target not in data.columns:
171
183
  continue
172
184
  target_result = TargetProfiler(
173
185
  target_column=target,
174
- config=self.config,
186
+ config=self.config.profiling,
175
187
  ).profile(data)
176
188
  result.targets[target] = target_result
177
189
 
@@ -180,7 +192,7 @@ class StructuralProfiler:
180
192
  cp.is_target = True
181
193
 
182
194
  # ── 8. Correlation ───────────────────────────────────────────────
183
- if self.config.compute_correlation:
195
+ if self.config.profiling.compute_correlation:
184
196
  # Resolve column lists by detected SemanticType (post-override).
185
197
  numeric_cols = [
186
198
  c
@@ -198,7 +210,7 @@ class StructuralProfiler:
198
210
  corr_profiler = CorrelationProfiler(
199
211
  numeric_columns=numeric_cols,
200
212
  categorical_columns=categorical_cols,
201
- config=self.config,
213
+ config=self.config.profiling,
202
214
  )
203
215
 
204
216
  # 8a. Feature-feature matrices — computed ONCE, target-independent.
@@ -209,7 +221,7 @@ class StructuralProfiler:
209
221
 
210
222
  # 8b. Per-target analysis — matrices are NOT recomputed; each call
211
223
  # shallow-copies feature_corr and appends target-specific fields.
212
- for target in self.config.target_columns:
224
+ for target in self.config.profiling.target_columns:
213
225
  if target not in data.columns:
214
226
  continue
215
227
  result.dataset.target_correlations[target] = (
@@ -218,6 +230,12 @@ class StructuralProfiler:
218
230
  )
219
231
  )
220
232
 
233
+ # ── Soft-excluded placeholders ───────────────────────────────────────
234
+ # Columns soft-excluded for Profiling are not profiled but must still
235
+ # appear in the result so downstream phases can reference them.
236
+ for col in soft_retained:
237
+ result.columns.setdefault(col, ColumnProfile(name=col))
238
+
221
239
  return result
222
240
 
223
241
  # ------------------------------------------------------------------
@@ -229,9 +247,8 @@ class StructuralProfiler:
229
247
  df: pl.DataFrame,
230
248
  cols: list[str],
231
249
  n_rows: int,
232
- overrides: dict[str, SemanticType],
233
250
  ) -> RowMissingnessDistribution:
234
- from ._missingness_profiler import (
251
+ from ._null_detection import (
235
252
  _sentinel_eligible,
236
253
  _inf_eligible,
237
254
  _SENTINEL_STRINGS,
@@ -246,10 +263,9 @@ class StructuralProfiler:
246
263
 
247
264
  for col_name in cols:
248
265
  dtype = df[col_name].dtype
249
- override = overrides.get(col_name)
250
266
  null_e = pl.col(col_name).is_null()
251
267
 
252
- if _sentinel_eligible(dtype, override):
268
+ if _sentinel_eligible(dtype):
253
269
  eff = (
254
270
  null_e
255
271
  | (pl.col(col_name).str.strip_chars() == "")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 0.8.0
3
+ Version: 0.9.0
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -22,6 +22,7 @@ src/dataforge_ml/profiling/_datetime_config.py
22
22
  src/dataforge_ml/profiling/_datetime_profiler.py
23
23
  src/dataforge_ml/profiling/_missingness_config.py
24
24
  src/dataforge_ml/profiling/_missingness_profiler.py
25
+ src/dataforge_ml/profiling/_null_detection.py
25
26
  src/dataforge_ml/profiling/_numeric_config.py
26
27
  src/dataforge_ml/profiling/_numeric_profiler.py
27
28
  src/dataforge_ml/profiling/_tabular.py
File without changes
File without changes
File without changes