dataforge-ml 2.0.2__tar.gz → 2.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataforge_ml-2.0.2/src/dataforge_ml.egg-info → dataforge_ml-2.0.4}/PKG-INFO +5 -4
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/README.md +4 -3
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/pyproject.toml +1 -1
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/config.py +126 -78
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/imputation/_config.py +308 -37
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/imputation/_fitted_imputer.py +3 -3
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/imputation/orchestrator.py +2 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/__init__.py +2 -1
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_base.py +85 -5
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_boolean_profiler.py +34 -2
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_categorical.py +20 -1
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_config.py +158 -14
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_datetime_profiler.py +56 -7
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_numeric_config.py +35 -15
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_numeric_profiler.py +57 -17
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_text_profiler.py +20 -1
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/orchestrator.py +6 -2
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4/src/dataforge_ml.egg-info}/PKG-INFO +5 -4
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/LICENSE +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/setup.cfg +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/__init__.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/imputation/__init__.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/imputation/_numeric_imputer.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/imputation/_regression_estimator_factory.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/imputation/_strategy_router.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/imputation/_utils.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/models/__init__.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/models/_data_structure.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/models/_data_types.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_correlation_profiler.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_missingness_profiler.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_nonlinearity_profiler.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_tabular.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_target_config.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_target_profiler.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_text_config.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_type_detection_config.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/profiling/_type_detector.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/splitting/__init__.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/splitting/_config.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/splitting/_profile_signals.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/splitting/_splitter.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/utils/__init__.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/utils/_null_detection.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/utils/_null_normalization.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml/utils/data_loader.py +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml.egg-info/requires.txt +0 -0
- {dataforge_ml-2.0.2 → dataforge_ml-2.0.4}/src/dataforge_ml.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataforge-ml
|
|
3
|
-
Version: 2.0.2
|
|
3
|
+
Version: 2.0.4
|
|
4
4
|
Summary: An automated feature engineering and designing pipeline library
|
|
5
5
|
License: MIT
|
|
6
6
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -65,15 +65,16 @@ Override the auto-detected type for any column before profiling:
|
|
|
65
65
|
```python
|
|
66
66
|
config = PipelineConfig()
|
|
67
67
|
config.set_column_type("PassengerId", "identifier") # skip stats entirely
|
|
68
|
-
config.
|
|
68
|
+
config.set_column_type(["Survived", "Pclass"], "categorical")
|
|
69
69
|
|
|
70
70
|
result = StructuralProfiler(config).profile(df)
|
|
71
71
|
```
|
|
72
72
|
|
|
73
|
-
To drop a column from all processing entirely, use `
|
|
73
|
+
To drop a column from all processing entirely, use `add_exclusion`:
|
|
74
74
|
|
|
75
75
|
```python
|
|
76
|
-
config = PipelineConfig(
|
|
76
|
+
config = PipelineConfig()
|
|
77
|
+
config.add_exclusion(["PassengerId", "Name"])
|
|
77
78
|
```
|
|
78
79
|
|
|
79
80
|
## Splitting
|
|
@@ -35,15 +35,16 @@ Override the auto-detected type for any column before profiling:
|
|
|
35
35
|
```python
|
|
36
36
|
config = PipelineConfig()
|
|
37
37
|
config.set_column_type("PassengerId", "identifier") # skip stats entirely
|
|
38
|
-
config.
|
|
38
|
+
config.set_column_type(["Survived", "Pclass"], "categorical")
|
|
39
39
|
|
|
40
40
|
result = StructuralProfiler(config).profile(df)
|
|
41
41
|
```
|
|
42
42
|
|
|
43
|
-
To drop a column from all processing entirely, use `
|
|
43
|
+
To drop a column from all processing entirely, use `add_exclusion`:
|
|
44
44
|
|
|
45
45
|
```python
|
|
46
|
-
config = PipelineConfig(
|
|
46
|
+
config = PipelineConfig()
|
|
47
|
+
config.add_exclusion(["PassengerId", "Name"])
|
|
47
48
|
```
|
|
48
49
|
|
|
49
50
|
## Splitting
|
|
@@ -4,6 +4,7 @@ import json
|
|
|
4
4
|
from dataclasses import dataclass, field
|
|
5
5
|
from enum import StrEnum
|
|
6
6
|
from typing import TYPE_CHECKING, Union, Optional
|
|
7
|
+
from types import MappingProxyType
|
|
7
8
|
|
|
8
9
|
if TYPE_CHECKING:
|
|
9
10
|
from dataforge_ml.profiling._config import ProfileConfig, NumericKind
|
|
@@ -75,17 +76,6 @@ class PipelineConfig:
|
|
|
75
76
|
|
|
76
77
|
Parameters
|
|
77
78
|
----------
|
|
78
|
-
exclude_columns : list[str]
|
|
79
|
-
Hard exclusions — columns dropped globally from every phase.
|
|
80
|
-
phase_exclusions : dict[PipelinePhase, list[str]]
|
|
81
|
-
Soft exclusions — columns bypassed for a specific phase but retained
|
|
82
|
-
in the dataset.
|
|
83
|
-
column_overrides : dict[str, SemanticType]
|
|
84
|
-
Explicit semantic type assignments respected by all downstream phases.
|
|
85
|
-
numeric_kind_overrides : dict[str, NumericKind]
|
|
86
|
-
Explicit ``NumericKind`` assignments for individual columns, applied
|
|
87
|
-
after auto-detection in Phase 1. Only valid for columns whose final
|
|
88
|
-
``SemanticType`` is ``Numeric``; raises at orchestrator time otherwise.
|
|
89
79
|
profiling : ProfileConfig
|
|
90
80
|
Phase 1-specific parameters (correlation, chunking, memory threshold).
|
|
91
81
|
imputation : ImputationConfig
|
|
@@ -96,17 +86,78 @@ class PipelineConfig:
|
|
|
96
86
|
Single seed for all stochastic pipeline operations, including GMM
|
|
97
87
|
Sampling during bimodal imputation. None produces non-deterministic
|
|
98
88
|
output.
|
|
89
|
+
|
|
90
|
+
Attributes
|
|
91
|
+
----------
|
|
92
|
+
exclude_columns : tuple[str, ...]
|
|
93
|
+
Hard exclusions — columns dropped globally from every phase.
|
|
94
|
+
phase_exclusions : MappingProxyType[PipelinePhase, tuple[str, ...]]
|
|
95
|
+
Soft exclusions — columns bypassed for a specific phase but retained
|
|
96
|
+
in the dataset.
|
|
97
|
+
column_overrides : MappingProxyType[str, SemanticType]
|
|
98
|
+
Explicit semantic type assignments respected by all downstream phases.
|
|
99
|
+
numeric_kind_overrides : MappingProxyType[str, NumericKind]
|
|
100
|
+
Explicit ``NumericKind`` assignments for individual columns, applied
|
|
101
|
+
after auto-detection in Phase 1. Only valid for columns whose final
|
|
102
|
+
``SemanticType`` is ``Numeric``; raises at orchestrator time otherwise.
|
|
99
103
|
"""
|
|
100
104
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
+
_exclude_columns: list[str] = field(default_factory=list, init=False)
|
|
106
|
+
_phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict, init=False)
|
|
107
|
+
_column_overrides: dict[str, SemanticType] = field(default_factory=dict, init=False)
|
|
108
|
+
_numeric_kind_overrides: dict[str, NumericKind] = field(default_factory=dict, init=False)
|
|
105
109
|
profiling: ProfileConfig = field(default_factory=_default_profile_config)
|
|
106
110
|
imputation: ImputationConfig = field(default_factory=_default_imputation_config)
|
|
107
111
|
split: SplitConfig = field(default_factory=_default_split_config)
|
|
108
112
|
random_seed: Optional[int] = None
|
|
109
113
|
|
|
114
|
+
@property
|
|
115
|
+
def exclude_columns(self) -> tuple[str, ...]:
|
|
116
|
+
"""Hard exclusions — columns dropped globally from every phase.
|
|
117
|
+
|
|
118
|
+
Returns
|
|
119
|
+
-------
|
|
120
|
+
tuple[str, ...]
|
|
121
|
+
A snapshot tuple of columns registered as hard exclusions.
|
|
122
|
+
"""
|
|
123
|
+
return tuple(self._exclude_columns)
|
|
124
|
+
|
|
125
|
+
@property
|
|
126
|
+
def phase_exclusions(self) -> "MappingProxyType[PipelinePhase, tuple[str, ...]]":
|
|
127
|
+
"""Soft exclusions — columns bypassed for a specific phase but retained in the dataset.
|
|
128
|
+
|
|
129
|
+
Returns
|
|
130
|
+
-------
|
|
131
|
+
MappingProxyType[PipelinePhase, tuple[str, ...]]
|
|
132
|
+
A read-only view mapping each phase to a tuple of excluded columns.
|
|
133
|
+
"""
|
|
134
|
+
from types import MappingProxyType
|
|
135
|
+
return MappingProxyType({k: tuple(v) for k, v in self._phase_exclusions.items()})
|
|
136
|
+
|
|
137
|
+
@property
|
|
138
|
+
def column_overrides(self) -> "MappingProxyType[str, SemanticType]":
|
|
139
|
+
"""Explicit semantic type assignments respected by all downstream phases.
|
|
140
|
+
|
|
141
|
+
Returns
|
|
142
|
+
-------
|
|
143
|
+
MappingProxyType[str, SemanticType]
|
|
144
|
+
A read-only view mapping columns to their explicitly assigned SemanticType.
|
|
145
|
+
"""
|
|
146
|
+
from types import MappingProxyType
|
|
147
|
+
return MappingProxyType(self._column_overrides)
|
|
148
|
+
|
|
149
|
+
@property
|
|
150
|
+
def numeric_kind_overrides(self) -> "MappingProxyType[str, NumericKind]":
|
|
151
|
+
"""Explicit NumericKind assignments for individual columns, applied after auto-detection in Phase 1.
|
|
152
|
+
|
|
153
|
+
Returns
|
|
154
|
+
-------
|
|
155
|
+
MappingProxyType[str, NumericKind]
|
|
156
|
+
A read-only view mapping columns to their explicitly assigned NumericKind.
|
|
157
|
+
"""
|
|
158
|
+
from types import MappingProxyType
|
|
159
|
+
return MappingProxyType(self._numeric_kind_overrides)
|
|
160
|
+
|
|
110
161
|
def resolve_active_columns(
|
|
111
162
|
self, phase: PipelinePhase, available_columns: list[str]
|
|
112
163
|
) -> list[str]:
|
|
@@ -131,38 +182,61 @@ class PipelineConfig:
|
|
|
131
182
|
original order.
|
|
132
183
|
"""
|
|
133
184
|
hard_set = set(self.exclude_columns)
|
|
134
|
-
soft_set = set(self.phase_exclusions.get(phase,
|
|
185
|
+
soft_set = set(self.phase_exclusions.get(phase, ()))
|
|
135
186
|
excluded = hard_set | soft_set
|
|
136
187
|
return [c for c in available_columns if c not in excluded]
|
|
137
188
|
|
|
138
|
-
def
|
|
189
|
+
def add_exclusion(self, column: Union[str, list[str]]) -> None:
|
|
139
190
|
"""Add columns to the hard exclusion set, deduplicating automatically.
|
|
140
191
|
|
|
141
|
-
Columns already present in
|
|
142
|
-
within
|
|
192
|
+
Columns already present in the exclusion list and duplicate entries
|
|
193
|
+
within the input are silently ignored. Calling with an empty list is a
|
|
143
194
|
no-op.
|
|
144
195
|
|
|
145
196
|
Parameters
|
|
146
197
|
----------
|
|
147
|
-
|
|
148
|
-
Column
|
|
198
|
+
column : str or list[str]
|
|
199
|
+
Column name(s) to register as hard exclusions. Deduplication is
|
|
149
200
|
handled here; callers do not need to pre-deduplicate.
|
|
150
201
|
"""
|
|
151
|
-
|
|
202
|
+
cols = [column] if isinstance(column, str) else column
|
|
203
|
+
existing = set(self._exclude_columns)
|
|
204
|
+
for col in cols:
|
|
205
|
+
if col not in existing:
|
|
206
|
+
self._exclude_columns.append(col)
|
|
207
|
+
existing.add(col)
|
|
208
|
+
|
|
209
|
+
def add_phase_exclusion(self, phase: Union[PipelinePhase, str], column: Union[str, list[str]]) -> None:
|
|
210
|
+
"""Add columns to the soft exclusion set for a specific phase.
|
|
211
|
+
|
|
212
|
+
Parameters
|
|
213
|
+
----------
|
|
214
|
+
phase : PipelinePhase or str
|
|
215
|
+
The phase for which to exclude the column(s).
|
|
216
|
+
column : str or list[str]
|
|
217
|
+
Column name(s) to register as soft exclusions for this phase.
|
|
218
|
+
Deduplication is handled automatically.
|
|
219
|
+
"""
|
|
220
|
+
if isinstance(phase, str):
|
|
221
|
+
phase = PipelinePhase(phase)
|
|
222
|
+
|
|
223
|
+
cols = [column] if isinstance(column, str) else column
|
|
224
|
+
phase_list = self._phase_exclusions.setdefault(phase, [])
|
|
225
|
+
existing = set(phase_list)
|
|
152
226
|
for col in cols:
|
|
153
227
|
if col not in existing:
|
|
154
|
-
|
|
228
|
+
phase_list.append(col)
|
|
155
229
|
existing.add(col)
|
|
156
230
|
|
|
157
231
|
def set_column_type(
|
|
158
|
-
self, column: str, semantic_type: Union[str, SemanticType]
|
|
232
|
+
self, column: Union[str, list[str]], semantic_type: Union[str, SemanticType]
|
|
159
233
|
) -> None:
|
|
160
|
-
"""Explicitly set the semantic type for
|
|
234
|
+
"""Explicitly set the semantic type for one or more columns, overriding auto-detection.
|
|
161
235
|
|
|
162
236
|
Parameters
|
|
163
237
|
----------
|
|
164
|
-
column : str
|
|
165
|
-
Name of the column to override.
|
|
238
|
+
column : str or list[str]
|
|
239
|
+
Name of the column(s) to override.
|
|
166
240
|
semantic_type : str or SemanticType
|
|
167
241
|
The desired semantic type. Accepts enum values or their string
|
|
168
242
|
equivalents (e.g. ``"numeric"``, ``"categorical"``).
|
|
@@ -182,32 +256,19 @@ class PipelineConfig:
|
|
|
182
256
|
f"Unknown semantic type {semantic_type!r}. "
|
|
183
257
|
f"Valid values: {valid}"
|
|
184
258
|
)
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
self, columns: list[str], semantic_type: Union[str, SemanticType]
|
|
189
|
-
) -> None:
|
|
190
|
-
"""Assign the same semantic type to every column in the list.
|
|
191
|
-
|
|
192
|
-
Parameters
|
|
193
|
-
----------
|
|
194
|
-
columns : list[str]
|
|
195
|
-
Column names to override.
|
|
196
|
-
semantic_type : str or SemanticType
|
|
197
|
-
The desired semantic type applied to every column in the list.
|
|
198
|
-
"""
|
|
199
|
-
for column in columns:
|
|
200
|
-
self.set_column_type(column, semantic_type)
|
|
259
|
+
cols = [column] if isinstance(column, str) else column
|
|
260
|
+
for col in cols:
|
|
261
|
+
self._column_overrides[col] = semantic_type
|
|
201
262
|
|
|
202
263
|
def set_numeric_kind(
|
|
203
|
-
self, column: str, kind: Union[str, NumericKind]
|
|
264
|
+
self, column: Union[str, list[str]], kind: Union[str, NumericKind]
|
|
204
265
|
) -> None:
|
|
205
|
-
"""Explicitly set the ``NumericKind`` for
|
|
266
|
+
"""Explicitly set the ``NumericKind`` for one or more columns.
|
|
206
267
|
|
|
207
268
|
Parameters
|
|
208
269
|
----------
|
|
209
|
-
column : str
|
|
210
|
-
Name of the column to override.
|
|
270
|
+
column : str or list[str]
|
|
271
|
+
Name of the column(s) to override.
|
|
211
272
|
kind : str or NumericKind
|
|
212
273
|
The desired numeric kind. Accepts enum values or their string
|
|
213
274
|
equivalents (``"continuous"``, ``"bounded_discrete"``).
|
|
@@ -227,22 +288,9 @@ class PipelineConfig:
|
|
|
227
288
|
raise ValueError(
|
|
228
289
|
f"Unknown NumericKind {kind!r}. Valid values: {valid}"
|
|
229
290
|
)
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
self, columns: list[str], kind: Union[str, NumericKind]
|
|
234
|
-
) -> None:
|
|
235
|
-
"""Assign the same ``NumericKind`` to every column in the list.
|
|
236
|
-
|
|
237
|
-
Parameters
|
|
238
|
-
----------
|
|
239
|
-
columns : list[str]
|
|
240
|
-
Column names to override.
|
|
241
|
-
kind : str or NumericKind
|
|
242
|
-
The desired numeric kind applied to every column in the list.
|
|
243
|
-
"""
|
|
244
|
-
for column in columns:
|
|
245
|
-
self.set_numeric_kind(column, kind)
|
|
291
|
+
cols = [column] if isinstance(column, str) else column
|
|
292
|
+
for col in cols:
|
|
293
|
+
self._numeric_kind_overrides[col] = kind
|
|
246
294
|
|
|
247
295
|
def to_dict(self) -> dict:
|
|
248
296
|
"""Serialise the pipeline configuration to a plain dictionary.
|
|
@@ -288,28 +336,28 @@ class PipelineConfig:
|
|
|
288
336
|
Fully populated configuration instance with all nested sub-configs
|
|
289
337
|
restored.
|
|
290
338
|
"""
|
|
291
|
-
from dataforge_ml.profiling._config import ProfileConfig
|
|
339
|
+
from dataforge_ml.profiling._config import ProfileConfig
|
|
292
340
|
from dataforge_ml.imputation._config import ImputationConfig
|
|
293
341
|
from dataforge_ml.splitting._config import SplitConfig
|
|
294
|
-
|
|
295
|
-
exclude_columns=list(data.get("exclude_columns", [])),
|
|
296
|
-
phase_exclusions={
|
|
297
|
-
PipelinePhase(phase_str): list(cols)
|
|
298
|
-
for phase_str, cols in data.get("phase_exclusions", {}).items()
|
|
299
|
-
},
|
|
300
|
-
column_overrides={
|
|
301
|
-
col: SemanticType(sem_str)
|
|
302
|
-
for col, sem_str in data.get("column_overrides", {}).items()
|
|
303
|
-
},
|
|
304
|
-
numeric_kind_overrides={
|
|
305
|
-
col: _NumericKind(kind_str)
|
|
306
|
-
for col, kind_str in data.get("numeric_kind_overrides", {}).items()
|
|
307
|
-
},
|
|
342
|
+
cfg = cls(
|
|
308
343
|
profiling=ProfileConfig.from_dict(data.get("profiling", {})),
|
|
309
344
|
imputation=ImputationConfig.from_dict(data.get("imputation", {})),
|
|
310
345
|
split=SplitConfig.from_dict(data.get("split", {})),
|
|
311
346
|
random_seed=data.get("random_seed"),
|
|
312
347
|
)
|
|
348
|
+
|
|
349
|
+
cfg.add_exclusion(data.get("exclude_columns", []))
|
|
350
|
+
|
|
351
|
+
for phase_str, cols in data.get("phase_exclusions", {}).items():
|
|
352
|
+
cfg.add_phase_exclusion(phase_str, cols)
|
|
353
|
+
|
|
354
|
+
for col, sem_str in data.get("column_overrides", {}).items():
|
|
355
|
+
cfg.set_column_type(col, sem_str)
|
|
356
|
+
|
|
357
|
+
for col, kind_str in data.get("numeric_kind_overrides", {}).items():
|
|
358
|
+
cfg.set_numeric_kind(col, kind_str)
|
|
359
|
+
|
|
360
|
+
return cfg
|
|
313
361
|
|
|
314
362
|
def to_json(self, indent: int = 2) -> str:
|
|
315
363
|
"""Serialise the pipeline configuration to a JSON string.
|