segmentae 1.5.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,263 @@
1
+ import warnings
2
+ from typing import Any, List, Optional, Union
3
+
4
+ import pandas as pd
5
+ from pydantic import BaseModel, ConfigDict, field_validator
6
+
7
+ from segmentae.core.constants import EncoderType, ImputerType, ScalerType
8
+ from segmentae.core.exceptions import ModelNotFittedError, ValidationError
9
+ from segmentae.processing.simplifier import ComponentFactory
10
+
11
+ warnings.filterwarnings("ignore", category=Warning)
12
+
13
def _coerce_to_enum(value: Any, enum_cls: Any, label: str) -> Any:
    """
    Coerce a raw configuration value into a member of ``enum_cls``.

    Args:
        value: ``None``, an existing member of ``enum_cls``, or a raw value
            (typically a string) to look up in ``enum_cls``.
        enum_cls: Enum class the value must belong to.
        label: Human-readable field name used in the error message
            ("encoder", "scaler" or "imputer").

    Returns:
        ``value`` unchanged when it is ``None`` or already a member,
        otherwise the matching member of ``enum_cls``.

    Raises:
        ValueError: If ``value`` does not match any member; the message
            lists every valid option.
    """
    if value is None or isinstance(value, enum_cls):
        return value
    try:
        return enum_cls(value)
    except ValueError:
        valid_options = [member.value for member in enum_cls]
        # "from None" drops the enum's own lookup error from the traceback;
        # the message below already names the bad value and the valid ones.
        raise ValueError(
            f"Invalid {label} type: '{value}'. "
            f"Valid options: {valid_options}"
        ) from None


class PreprocessingConfig(BaseModel):
    """
    Configuration for preprocessing pipeline.

    Each field accepts ``None`` (step disabled), a member of the matching
    enum, or a value that the enum constructor can resolve to a member.
    The "before" validators below perform that resolution, so after
    validation every non-``None`` field holds an enum member.
    """

    encoder: Optional[Union[EncoderType, str]] = None  # Default: No encoding || Options: "IFrequencyEncoder", "LabelEncoder", "OneHotEncoder"
    scaler: Optional[Union[ScalerType, str]] = "MinMaxScaler"  # Default: MinMaxScaler || Options: "MinMaxScaler", "StandardScaler", "RobustScaler"
    imputer: Optional[Union[ImputerType, str]] = "Simple"  # Default: Simple Imputer || Options: "Simple"

    @field_validator('encoder', mode='before')
    @classmethod
    def convert_encoder_to_enum(cls, v):
        """Convert string encoder to enum."""
        return _coerce_to_enum(v, EncoderType, "encoder")

    @field_validator('scaler', mode='before')
    @classmethod
    def convert_scaler_to_enum(cls, v):
        """Convert string scaler to enum."""
        return _coerce_to_enum(v, ScalerType, "scaler")

    @field_validator('imputer', mode='before')
    @classmethod
    def convert_imputer_to_enum(cls, v):
        """Convert string imputer to enum."""
        return _coerce_to_enum(v, ImputerType, "imputer")

    model_config = ConfigDict(use_enum_values=False)
65
+
66
class Preprocessing:
    """
    Main preprocessing class for data transformation.

    This class orchestrates the preprocessing pipeline including categorical
    encoding, numerical scaling, and missing value imputation.
    It follows the scikit-learn fit/transform pattern: ``fit`` learns the
    components from a DataFrame, ``transform`` applies them in the order
    encoder -> scaler -> imputer.
    """

    def __init__(self,
                 encoder: Optional[Union[EncoderType, str]] = None,
                 scaler: Optional[Union[ScalerType, str]] = "MinMaxScaler",
                 imputer: Optional[Union[ImputerType, str]] = "Simple"):
        """
        Initialize preprocessing pipeline.

        Args:
            encoder: Encoder type (enum member or its string value), or
                ``None`` to skip categorical encoding.
            scaler: Scaler type (enum member or its string value), or
                ``None`` to skip scaling.
            imputer: Imputer type (enum member or its string value), or
                ``None`` to skip imputation.

        Raises:
            Whatever ``PreprocessingConfig`` raises for an option it does
            not accept.
        """
        # Validate and store configuration
        self.config = PreprocessingConfig(
            encoder=encoder,
            scaler=scaler,
            imputer=imputer
        )

        # Internal component storage
        self._encoder: Optional[Any] = None
        self._scaler: Optional[Any] = None
        self._imputer: Optional[Any] = None

        # State tracking
        self._X: Optional[pd.DataFrame] = None
        self._cat_cols: List[str] = []
        self._num_cols: List[str] = []
        self._is_fitted: bool = False

    def fit(self, X: pd.DataFrame) -> 'Preprocessing':
        """
        Fit preprocessing components to data.

        Any state left over from a previous ``fit`` call is discarded first,
        so the fitted components always describe ``X`` and nothing else.

        Args:
            X: Non-empty training DataFrame.

        Returns:
            This instance, to allow call chaining.

        Raises:
            ValidationError: If ``X`` is not a DataFrame or is empty. In
                that case the previously fitted state is left untouched.
        """
        self._validate_input(X, "Input for fitting")

        # Without this reset a refit on data that has no categorical columns
        # or no missing values would keep applying the encoder/imputer that
        # was learned from the earlier dataset.
        self._reset_state()

        # Setup components in order
        self._setup_encoder(X)
        self._setup_scaler()
        self._setup_imputer()

        self._is_fitted = True
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transform data using fitted preprocessing components.

        Args:
            X: Non-empty DataFrame to transform.

        Returns:
            A transformed copy of ``X``; the input is not modified here.

        Raises:
            ModelNotFittedError: If ``fit`` has not completed successfully.
            ValidationError: If ``X`` is not a DataFrame or is empty.
        """
        self._validate_fitted()
        self._validate_input(X, "Input for transformation")

        return self._apply_transformations(X)

    def fit_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Fit and transform data in one step.

        Equivalent to ``fit(X)`` followed by ``transform(X)``.
        """
        return self.fit(X).transform(X)

    def _reset_state(self) -> None:
        """Drop every fitted component and bookkeeping value from a prior fit."""
        self._encoder = None
        self._scaler = None
        self._imputer = None
        self._X = None
        self._cat_cols = []
        self._num_cols = []
        # Stays False until fit() finishes, so a fit that fails midway does
        # not leave the instance claiming to be fitted.
        self._is_fitted = False

    def _setup_encoder(self, X: pd.DataFrame) -> None:
        """
        Setup encoder based on categorical columns.

        Records the ``object``/``category`` columns of ``X``. When an encoder
        is configured and such columns exist, the encoder is fitted on them
        and the encoded frame is stored in ``self._X``; otherwise ``self._X``
        is a plain copy of ``X``.
        """
        self._cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

        if self.config.encoder is not None and self._cat_cols:
            self._encoder = ComponentFactory.create_encoder(self.config.encoder)
            self._encoder.fit(X[self._cat_cols])
            # The encoder is fitted on the categorical subset but transforms
            # the whole frame -- assumes it passes other columns through.
            self._X = self._encoder.transform(X).copy()
        else:
            self._X = X.copy()

    def _setup_scaler(self) -> None:
        """
        Setup scaler based on numerical columns.

        Numerical columns are detected on ``self._X``, i.e. after encoding,
        so numeric columns produced by the encoder are included.
        """
        self._num_cols = self._X.select_dtypes(include=['int', 'float']).columns.tolist()

        if self.config.scaler is not None and self._num_cols:
            self._scaler = ComponentFactory.create_scaler(self.config.scaler)
            self._scaler.fit(self._X[self._num_cols])

    def _setup_imputer(self) -> None:
        """
        Setup imputer if missing values exist.

        No imputer is created when imputation is disabled or when the
        (encoded) training frame contains no missing value at all.
        """
        if self.config.imputer is None or self._X.isnull().sum().sum() == 0:
            return

        self._imputer = ComponentFactory.create_imputer(self.config.imputer)

        # Prepare data for imputer
        X_for_imputer = self._X.copy()

        # Scale numerical columns before imputation, mirroring the
        # scaler -> imputer order used at transform time.
        if self._scaler is not None and self._num_cols:
            X_for_imputer[self._num_cols] = self._scaler.transform(
                X_for_imputer[self._num_cols].copy()
            )

        # Fit imputer
        # NOTE(review): fitted on the full frame, while transform passes only
        # the numerical columns -- confirm the imputer tolerates that.
        self._imputer.fit(X=X_for_imputer)

    def _apply_transformations(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Apply all fitted transformations in correct order.

        Works on a copy of ``X``; steps whose component was not created
        during ``fit`` are skipped.
        """
        X_ = X.copy()

        # Apply encoder
        if self._encoder is not None:
            X_ = self._encoder.transform(X_)

        # Apply scaler
        if self._scaler is not None and self._num_cols:
            X_[self._num_cols] = self._scaler.transform(X_[self._num_cols].copy())

        # Apply imputer
        if self._imputer is not None:
            X_[self._num_cols] = self._imputer.transform(X=X_[self._num_cols].copy())

        return X_

    def _validate_input(self, X: pd.DataFrame, context: str = "Input") -> None:
        """
        Validate input DataFrame.

        Args:
            X: Object to check.
            context: Prefix used in the error message.

        Raises:
            ValidationError: If ``X`` is not a DataFrame or is empty.
        """
        if not isinstance(X, pd.DataFrame):
            raise ValidationError(
                f"{context} must be a pandas DataFrame, got {type(X).__name__}",
                suggestion="Convert your data to DataFrame using pd.DataFrame()"
            )

        if X.empty:
            raise ValidationError(
                f"{context} DataFrame is empty",
                suggestion="Ensure your dataset contains data"
            )

    def _validate_fitted(self) -> None:
        """
        Check if preprocessing is fitted.

        Raises:
            ModelNotFittedError: If ``fit`` has not completed successfully.
        """
        if not self._is_fitted:
            raise ModelNotFittedError(
                component="Preprocessing",
                message="Preprocessing must be fitted before transform. "
                "Call fit(X) method first."
            )

    @property
    def encoder(self) -> Optional[Any]:
        """Get fitted encoder component."""
        return self._encoder

    @property
    def scaler(self) -> Optional[Any]:
        """Get fitted scaler component."""
        return self._scaler

    @property
    def imputer(self) -> Optional[Any]:
        """Get fitted imputer component."""
        return self._imputer

    @property
    def cat_cols(self) -> List[str]:
        """Get list of categorical columns."""
        return self._cat_cols

    @property
    def num_cols(self) -> List[str]:
        """Get list of numerical columns."""
        return self._num_cols

    def __repr__(self) -> str:
        """String representation of Preprocessing."""
        return (
            f"Preprocessing("
            f"encoder={self.config.encoder.value if self.config.encoder else None}, "
            f"scaler={self.config.scaler.value if self.config.scaler else None}, "
            f"imputer={self.config.imputer.value if self.config.imputer else None})"
        )
252
+
253
+
254
+
255
+
256
+
257
+
258
+
259
+
260
+
261
+
262
+
263
+
@@ -0,0 +1,74 @@
1
+ from typing import Any, Optional, Union
2
+
3
+ from atlantic.imputers.imputation import (
4
+ AutoSimpleImputer, # || #AutoKNNImputer, AutoIterativeImputer
5
+ )
6
+ from atlantic.processing.encoders import AutoIFrequencyEncoder, AutoLabelEncoder, AutoOneHotEncoder
7
+ from atlantic.processing.scalers import AutoMinMaxScaler, AutoRobustScaler, AutoStandardScaler
8
+ from segmentae.core.constants import EncoderType, ImputerType, ScalerType
9
+ from segmentae.core.exceptions import ConfigurationError
10
+
11
+
12
class ComponentFactory:
    """
    Factory class for creating preprocessing components.

    Every ``create_*`` method follows the same contract: ``None`` yields
    ``None``, a string is first resolved through the matching enum, and a
    value that matches no known member raises ``ConfigurationError``.
    """

    @staticmethod
    def create_imputer(imputer_type: Optional[Union[ImputerType, str]]) -> Optional[Any]:
        """Build the imputer for ``imputer_type``, or return ``None`` when disabled."""
        if imputer_type is None:
            return None

        # Strings are resolved to enum members before dispatching.
        if isinstance(imputer_type, str):
            imputer_type = ImputerType(imputer_type)

        if imputer_type != ImputerType.SIMPLE:
            raise ConfigurationError(
                f"Unknown imputer type: {imputer_type}",
                valid_options=list(ImputerType)
            )

        return AutoSimpleImputer(strategy="mean")

    @staticmethod
    def create_encoder(encoder_type: Optional[Union[EncoderType, str]]) -> Optional[Any]:
        """Build the encoder for ``encoder_type``, or return ``None`` when disabled."""
        if encoder_type is None:
            return None

        # Strings are resolved to enum members before dispatching.
        if isinstance(encoder_type, str):
            encoder_type = EncoderType(encoder_type)

        if encoder_type == EncoderType.IFREQUENCY:
            return AutoIFrequencyEncoder()
        if encoder_type == EncoderType.LABEL:
            return AutoLabelEncoder()
        if encoder_type == EncoderType.ONEHOT:
            return AutoOneHotEncoder()

        raise ConfigurationError(
            f"Unknown encoder type: {encoder_type}",
            valid_options=list(EncoderType)
        )

    @staticmethod
    def create_scaler(scaler_type: Optional[Union[ScalerType, str]]) -> Optional[Any]:
        """Build the scaler for ``scaler_type``, or return ``None`` when disabled."""
        if scaler_type is None:
            return None

        # Strings are resolved to enum members before dispatching.
        if isinstance(scaler_type, str):
            scaler_type = ScalerType(scaler_type)

        if scaler_type == ScalerType.MINMAX:
            return AutoMinMaxScaler()
        if scaler_type == ScalerType.STANDARD:
            return AutoStandardScaler()
        if scaler_type == ScalerType.ROBUST:
            return AutoRobustScaler()

        raise ConfigurationError(
            f"Unknown scaler type: {scaler_type}",
            valid_options=list(ScalerType)
        )
@@ -0,0 +1,17 @@
1
+ from segmentae.utils.validation import (
2
+ validate_dataframe,
3
+ validate_fitted,
4
+ validate_lengths_match,
5
+ validate_positive_integer,
6
+ validate_series,
7
+ validate_threshold_ratio,
8
+ )
9
+
10
# Public names of this package; each one is imported above from
# segmentae.utils.validation.
__all__ = [
    'validate_dataframe',
    'validate_series',
    'validate_fitted',
    'validate_threshold_ratio',
    'validate_lengths_match',
    'validate_positive_integer'
]
@@ -0,0 +1,94 @@
1
+ from typing import Any
2
+
3
+ import pandas as pd
4
+
5
+ from segmentae.core.exceptions import ModelNotFittedError, ValidationError
6
+
7
+
8
def validate_dataframe(df: Any, name: str = "DataFrame") -> None:
    """
    Ensure ``df`` is a pandas DataFrame that is not empty.

    Args:
        df: Object to check.
        name: Label used for the object in error messages.

    Raises:
        ValidationError: If ``df`` is not a DataFrame, or is an empty one.
    """
    is_frame = isinstance(df, pd.DataFrame)

    # Happy path first: a real, non-empty frame needs no further work.
    if is_frame and not df.empty:
        return

    if not is_frame:
        raise ValidationError(
            f"{name} must be a pandas DataFrame, got {type(df).__name__}",
            suggestion="Convert to DataFrame using pd.DataFrame()"
        )

    raise ValidationError(
        f"{name} cannot be empty",
        suggestion="Ensure your dataset contains data"
    )
23
+
24
+
25
def validate_series(series: Any, name: str = "Series") -> None:
    """
    Ensure ``series`` is a pandas Series holding at least one element.

    Args:
        series: Object to check.
        name: Label used for the object in error messages.

    Raises:
        ValidationError: If ``series`` is not a Series, or has length zero.
    """
    is_series = isinstance(series, pd.Series)

    # Happy path first: a real, non-empty Series needs no further work.
    if is_series and len(series) != 0:
        return

    if not is_series:
        raise ValidationError(
            f"{name} must be a pandas Series, got {type(series).__name__}",
            suggestion="Convert to Series using pd.Series() or extract DataFrame column"
        )

    raise ValidationError(
        f"{name} cannot be empty",
        suggestion="Ensure your data contains values"
    )
40
+
41
+
42
def validate_fitted(is_fitted: bool, component: str = "Model") -> None:
    """
    Guard against using a component before it has been fitted.

    Args:
        is_fitted: Fitted flag of the component; any truthy value passes.
        component: Component name, used in the error message.

    Raises:
        ModelNotFittedError: If ``is_fitted`` is falsy.
    """
    if is_fitted:
        return

    raise ModelNotFittedError(
        component=component,
        message=f"{component} must be fitted before use. Call fit() method first."
    )
51
+
52
+
53
def validate_threshold_ratio(ratio: float) -> None:
    """
    Validate threshold ratio is positive.

    Args:
        ratio: Threshold ratio to check.

    Raises:
        ValidationError: If ``ratio`` is zero, negative or NaN.
    """
    # Written as "not ratio > 0" rather than "ratio <= 0": every comparison
    # with NaN is False, so the latter would let NaN through as "positive".
    if not ratio > 0:
        raise ValidationError(
            f"threshold_ratio must be positive, got {ratio}",
            suggestion="Use a positive value like 1.0, 2.0, etc."
        )
62
+
63
+
64
def validate_lengths_match(
    a: Any,
    b: Any,
    name_a: str = "First array",
    name_b: str = "Second array"
) -> None:
    """
    Ensure two sized objects contain the same number of elements.

    Args:
        a: First object; must support ``len()``.
        b: Second object; must support ``len()``.
        name_a: Label for ``a`` in the error message.
        name_b: Label for ``b`` in the error message.

    Raises:
        ValidationError: If ``len(a)`` differs from ``len(b)``.
    """
    size_a, size_b = len(a), len(b)
    if size_a == size_b:
        return

    raise ValidationError(
        f"{name_a} and {name_b} must have same length. "
        f"Got {size_a} and {size_b}",
        suggestion="Ensure both arrays/dataframes have the same number of samples"
    )
79
+
80
+
81
def validate_positive_integer(value: int, name: str = "Value") -> None:
    """
    Validate that a value is a positive integer.

    Args:
        value: Value to check.
        name: Label used for the value in error messages.

    Raises:
        ValidationError: If ``value`` is not an ``int`` (booleans are
            rejected too), or if it is smaller than 1.
    """
    # bool is a subclass of int, so True/False would otherwise slip through
    # the isinstance check and True would be accepted as the integer 1.
    if isinstance(value, bool) or not isinstance(value, int):
        raise ValidationError(
            f"{name} must be an integer, got {type(value).__name__}"
        )

    if value < 1:
        raise ValidationError(
            f"{name} must be positive, got {value}",
            suggestion="Use a value >= 1"
        )