likelihood 2.2.0.dev1__cp310-cp310-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. likelihood/VERSION +1 -0
  2. likelihood/__init__.py +20 -0
  3. likelihood/graph/__init__.py +9 -0
  4. likelihood/graph/_nn.py +283 -0
  5. likelihood/graph/graph.py +86 -0
  6. likelihood/graph/nn.py +329 -0
  7. likelihood/main.py +273 -0
  8. likelihood/models/__init__.py +3 -0
  9. likelihood/models/deep/__init__.py +13 -0
  10. likelihood/models/deep/_autoencoders.py +896 -0
  11. likelihood/models/deep/_predictor.py +809 -0
  12. likelihood/models/deep/autoencoders.py +903 -0
  13. likelihood/models/deep/bandit.py +97 -0
  14. likelihood/models/deep/gan.py +313 -0
  15. likelihood/models/deep/predictor.py +805 -0
  16. likelihood/models/deep/rl.py +345 -0
  17. likelihood/models/environments.py +202 -0
  18. likelihood/models/hmm.py +163 -0
  19. likelihood/models/regression.py +451 -0
  20. likelihood/models/simulation.py +213 -0
  21. likelihood/models/utils.py +87 -0
  22. likelihood/pipes.py +382 -0
  23. likelihood/rust_py_integration.cpython-310-x86_64-linux-gnu.so +0 -0
  24. likelihood/tools/__init__.py +4 -0
  25. likelihood/tools/cat_embed.py +212 -0
  26. likelihood/tools/figures.py +348 -0
  27. likelihood/tools/impute.py +278 -0
  28. likelihood/tools/models_tools.py +866 -0
  29. likelihood/tools/numeric_tools.py +390 -0
  30. likelihood/tools/reports.py +375 -0
  31. likelihood/tools/tools.py +1336 -0
  32. likelihood-2.2.0.dev1.dist-info/METADATA +68 -0
  33. likelihood-2.2.0.dev1.dist-info/RECORD +39 -0
  34. likelihood-2.2.0.dev1.dist-info/WHEEL +5 -0
  35. likelihood-2.2.0.dev1.dist-info/licenses/LICENSE +21 -0
  36. likelihood-2.2.0.dev1.dist-info/sboms/auditwheel.cdx.json +1 -0
  37. likelihood-2.2.0.dev1.dist-info/top_level.txt +5 -0
  38. likelihood.libs/libgcc_s-0cd532bd.so.1 +0 -0
  39. src/lib.rs +12 -0
likelihood/models/simulation.py ADDED
@@ -0,0 +1,213 @@
+ import pickle
+ from typing import Dict, List, Tuple, Union
+
+ import numpy as np
+ import pandas as pd
+ from packaging import version
+ from pandas.core.frame import DataFrame
+
+ from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, cdf, check_nan_inf
+
+ if version.parse(np.__version__) < version.parse("2.0.0"):
+     filter = np.RankWarning
+ else:
+     filter = np.exceptions.RankWarning
+
+
+ # --------------------------------------------------------------------------------------------------------------------------------------
+ def categories_by_quartile(df: DataFrame, column: str) -> Tuple[str, str]:
+     freq = df[column].value_counts()
+
+     q1 = freq.quantile(0.25)
+     q3 = freq.quantile(0.75)
+
+     least_frequent = freq[freq <= q1]
+     most_frequent = freq[freq >= q3]
+
+     least_frequent_category = least_frequent.idxmin() if not least_frequent.empty else None
+     most_frequent_category = most_frequent.idxmax() if not most_frequent.empty else None
+
+     return least_frequent_category, most_frequent_category
+
+
+ class SimulationEngine(FeatureSelection):
+     """
+     This class implements a predictive model that utilizes multiple linear regression for numerical target variables
+     and multiple logistic regression for categorical target variables.
+
+     The class provides methods for training the model on a given dataset, making predictions,
+     and evaluating the model's performance.
+
+     Key features:
+     - Supports both numerical and categorical target variables, automatically selecting the appropriate regression method.
+     - Includes methods for data preprocessing, model fitting, prediction, and evaluation metrics.
+     - Designed to be flexible and user-friendly, allowing for easy integration with various datasets.
+
+     Usage:
+     - Instantiate the class with the training data and target variable.
+     - Call the fit method to train the model.
+     - Use the predict method to generate predictions on new data.
+     - Evaluate the model using built-in metrics for accuracy and error.
+
+     This class is suitable for applications in data analysis and machine learning, enabling users to leverage regression techniques
+     for both numerical and categorical outcomes efficiently.
+     """
+
+     def __init__(self, use_scaler: bool = False, **kwargs):
+         self.df = pd.DataFrame()
+         self.n_importances = None
+         self.use_scaler = use_scaler
+         self.proba_dict = {}
+
+         super().__init__(**kwargs)
+
+     def predict(self, df: DataFrame, column: str) -> np.ndarray | list:
+         w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]
+
+         df = df[names_cols].copy()
+         dataset = self.df.copy()
+         dataset.drop(columns=column, inplace=True)
+         numeric_df = dataset.select_dtypes(include="number")
+         if self.use_scaler:
+             scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
+             _ = scaler.rescale()
+             dataset_ = df.copy()
+             numeric_df = dataset_.select_dtypes(include="number")
+             numeric_scaled = scaler.rescale(dataset_=numeric_df.to_numpy())
+             numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
+             for col in numeric_df.columns:
+                 df[col] = numeric_df[col].values
+
+         for num, colname in enumerate(dfe._encode_columns):
+             if df[colname].dtype == "object":
+                 encode_dict = dfe.encoding_list[num]
+                 df[colname] = df[colname].apply(
+                     dfe._code_transformation_to, dictionary_list=encode_dict
+                 )
+
+         y = df.to_numpy() @ w
+
+         if quick_encoder is not None:
+             one_hot = OneHotEncoder()
+             y = one_hot.decode(y)
+             encoding_dic = quick_encoder.decoding_list[0]
+             y = [encoding_dic[item] for item in y]
+         else:
+             if self.use_scaler:
+                 y += 1
+                 y /= 2
+                 y = y * (self.df[column].max() - self.df[column].min())
+
+         return y[:]
+
+     def _encode(self, df: DataFrame) -> Dict[str, float]:
+         df = df.copy()
+         column = df.columns[0]
+         frec = df[column].value_counts() / len(df)
+         df.loc[:, "frec"] = df[column].map(frec)
+         df.sort_values("frec", inplace=True)
+         keys = df[column].to_list()
+         values = df["frec"].to_list()
+         return dict(zip(keys, values))
+
+     def fit(self, df: DataFrame, n_importances: int, **kwargs) -> None:
+         self.df = df
+         self.n_importances = n_importances
+         self.get_digraph(self.df, self.n_importances, self.use_scaler)
+         proba_dict_keys = list(self.w_dict.keys())
+         self.proba_dict = dict(zip(proba_dict_keys, [i for i in range(len(proba_dict_keys))]))
+         for key in proba_dict_keys:
+             x = (
+                 self.df[key].values,
+                 None if self.df[key].dtype != "object" else self._encode(self.df[[key]]),
+             )
+             poly = kwargs.get("poly", 9)
+             plot = kwargs.get("plot", False)
+             bandwidth = kwargs.get("bandwidth", 1.5)
+             if not x[1]:
+                 media = self.df[key].mean()
+                 standard_deviation = self.df[key].std()
+                 lower_limit = media - bandwidth * standard_deviation
+                 upper_limit = media + bandwidth * standard_deviation
+                 f, _, ox = cdf(x[0].flatten(), poly=poly, plot=plot, key=key)
+             else:
+                 f, ox = None, None
+                 least_frequent_category, most_frequent_category = categories_by_quartile(
+                     self.df[[key]], key
+                 )
+                 lower_limit = x[1].get(least_frequent_category, 0)
+                 upper_limit = x[1].get(most_frequent_category, 0)
+             self.proba_dict[key] = (
+                 f if f else None,
+                 x[1],
+                 (np.mean(np.abs(np.diff(ox))) / 2.0 if isinstance(ox, np.ndarray) else None),
+                 f(lower_limit) if f else lower_limit,
+                 f(upper_limit) if f else upper_limit,
+             )
+
+     def get_proba(self, value: Union[Union[float, int], str] | list, colname: str) -> List[float]:
+         value = (
+             value
+             if isinstance(value, list)
+             else value.flatten().tolist() if isinstance(value, np.ndarray) else [value]
+         )
+         return [
+             (
+                 self.proba_dict[colname][0](val)
+                 - self.proba_dict[colname][0](val - self.proba_dict[colname][2])
+                 if (isinstance(val, float) or isinstance(val, int))
+                 else self.proba_dict[colname][1].get(val, 0)
+             )
+             for val in value
+         ]
+
+     def pred_outliers(self, value: Union[Union[float, int], str] | list, colname: str) -> List[str]:
+         return [
+             (
+                 "inlier"
+                 if (self.proba_dict[colname][3] < val < self.proba_dict[colname][4])
+                 else "outlier"
+             )
+             for val in self.get_proba(value, colname)
+         ]
+
+     def _clean_data(self, df: DataFrame) -> DataFrame:
+         df.replace([np.inf, -np.inf], np.nan, inplace=True)
+         df.replace(" ", np.nan, inplace=True)
+         df = check_nan_inf(df)
+         df = df.reset_index()
+         df = df.drop(columns=["index"])
+
+         return df
+
+     def save(self, filename: str = "./simulation_model") -> None:
+         """
+         Save the state of the SimulationEngine to a file.
+
+         Parameters
+         ----------
+         filename : str
+             The name of the file where the object will be saved.
+         """
+         filename = filename if filename.endswith(".pkl") else filename + ".pkl"
+         with open(filename, "wb") as f:
+             pickle.dump(self, f)
+
+     @staticmethod
+     def load(filename: str = "./simulation_model"):
+         """
+         Load the state of a SimulationEngine from a file.
+
+         Parameters
+         ----------
+         filename : str
+             The name of the file containing the saved object.
+
+         Returns
+         -------
+         SimulationEngine : Any
+             A new instance of SimulationEngine with the loaded state.
+         """
+         filename = filename + ".pkl" if not filename.endswith(".pkl") else filename
+         with open(filename, "rb") as f:
+             return pickle.load(f)
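The docstring above describes an instantiate / fit / predict / evaluate workflow. The sketch below is not part of the wheel contents; it shows how that workflow might look, assuming this hunk is likelihood/models/simulation.py and assuming a pandas DataFrame with a numeric target column named "price". The dataset, column names, and file paths are illustrative.

import pandas as pd

from likelihood.models.simulation import SimulationEngine  # module path inferred from this diff

data = pd.read_csv("housing.csv")  # hypothetical training data with a numeric "price" column
engine = SimulationEngine(use_scaler=True)
engine.fit(data, n_importances=3)  # builds one regression per selected column
predictions = engine.predict(data.drop(columns=["price"]), column="price")
probabilities = engine.get_proba(predictions, colname="price")  # per-value probability estimates
labels = engine.pred_outliers(predictions, colname="price")  # "inlier" / "outlier" per value
engine.save("./simulation_model")  # pickled to ./simulation_model.pkl
restored = SimulationEngine.load("./simulation_model")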
likelihood/models/utils.py ADDED
@@ -0,0 +1,87 @@
+ import numpy as np
+
+ from likelihood.tools import cal_average
+
+
+ class FeaturesArima:
+     """Base class for ARIMA feature computations."""
+
+     def forward(self, y_sum: np.ndarray, theta: list, mode: bool, noise: float) -> np.ndarray:
+         """Compute the forward pass of the model.
+
+         Parameters
+         ----------
+         y_sum : `np.ndarray`
+             Summed data points.
+         theta : `list`
+             Model parameters.
+         mode : `bool`
+             Forward pass mode.
+         noise : `float`
+             Noise level.
+
+         Returns
+         -------
+         `np.ndarray`
+             Computed values.
+         """
+         if mode:
+             y_vec = []
+
+             y_t = np.dot(theta, y_sum)
+
+             n = y_sum.shape[0]
+
+             for i in range(n):
+                 try:
+                     n_int = np.where(y_sum != y_sum[i])[0]
+                     y_i = (y_t - np.dot(theta[n_int], y_sum[n_int])) / theta[i]
+                     y_i += np.random.rand() * noise
+                 except Exception:
+                     y_i = (y_t - np.dot(theta[0:i], y_sum[0:i])) / theta[i]
+                 y_vec.append(y_i)
+         else:
+             y_t = np.dot(theta, y_sum) + y_sum[0]
+             n_int = np.where(y_sum != y_sum[0])[0]
+             y_i = (y_t - np.dot(theta[n_int], y_sum[n_int])) / theta[0]
+             y_i += np.random.rand() * noise
+             return y_i
+
+         return np.array(y_vec)
+
+     def integrated(self, datapoints: np.ndarray) -> np.ndarray:
+         """Compute integrated values.
+
+         Parameters
+         ----------
+         datapoints : `np.ndarray`
+             Input data points.
+
+         Returns
+         -------
+         `np.ndarray`
+             Integrated values.
+         """
+         datapoints = self.datapoints
+         y_sum = list(np.diff(datapoints, self.d))
+         y_sum.insert(0, datapoints[0])
+
+         return np.array(y_sum)
+
+     def average(self, datapoints: np.ndarray) -> np.ndarray:
+         """Compute average values.
+
+         Parameters
+         ----------
+         datapoints : `np.ndarray`
+             Input data points.
+
+         Returns
+         -------
+         `np.ndarray`
+             Average values.
+         """
+         y_sum_average = cal_average(datapoints)
+         y_sum_eps = datapoints - y_sum_average
+
+         return y_sum_eps
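FeaturesArima is used as a base/mixin class: integrated() reads self.datapoints and self.d, which the inheriting class is expected to provide. The sketch below is not part of the wheel contents; it is a hypothetical subclass showing the minimal attributes these methods appear to rely on.

import numpy as np

from likelihood.models.utils import FeaturesArima  # module path inferred from this diff

class ToyArima(FeaturesArima):  # hypothetical subclass for illustration
    def __init__(self, datapoints: np.ndarray, d: int = 1):
        self.datapoints = np.asarray(datapoints, dtype=float)
        self.d = d  # differencing order used by integrated()

series = np.array([1.0, 2.0, 4.0, 7.0, 11.0])
model = ToyArima(series, d=1)
y_sum = model.integrated(series)  # first value followed by the d-th order differences
residuals = model.average(series)  # series minus cal_average(series)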
likelihood/pipes.py ADDED
@@ -0,0 +1,382 @@
+ import json
+ import pickle
+ import re
+ import uuid
+ from typing import Dict, List, Optional, Tuple
+
+ import numpy as np
+ import pandas as pd
+
+ from likelihood.tools import generate_html_pipeline
+ from likelihood.tools.impute import SimpleImputer
+ from likelihood.tools.models_tools import TransformRange, remove_collinearity
+ from likelihood.tools.tools import DataFrameEncoder, DataScaler, LinearRegression, OneHotEncoder
+
+
+ class Pipeline:
+     def __init__(self, config_path: str):
+         """
+         Initialize the pipeline with a JSON configuration file.
+
+         Parameters
+         ----------
+         config_path : str
+             Path to the JSON config defining target column and preprocessing steps.
+         """
+         self.config = self._load_config(config_path)
+         self.target_col = self.config["target_column"]
+         self.steps = self.config["preprocessing_steps"]
+         self.compute_importance = self.config.get("compute_feature_importance", False)
+         self.fitted_components: Dict[str, object] = {}
+         self.fitted_idx: List[str] = []
+         self.columns_bin_sizes: Dict[str, int] | None = None
+
+     def _load_config(self, config_path: str) -> Dict:
+         """Load and validate the JSON configuration."""
+         with open(config_path, "r") as f:
+             config = json.load(f)
+
+         assert "target_column" in config, "Config must specify 'target_column'"
+         assert "preprocessing_steps" in config, "Config must specify 'preprocessing_steps'"
+         return config
+
+     def fit(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, np.ndarray, Optional[np.ndarray]]:
+         """
+         Fit preprocessing components on the input DataFrame and return cleaned X/y.
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             Input data with features + target column.
+
+         Returns
+         -------
+         X : pd.DataFrame
+             Cleaned feature matrix.
+         y : np.ndarray
+             Target vector (from self.target_col).
+         importances : Optional[np.ndarray]
+             Feature importance scores (if compute_feature_importance=`True`).
+         """
+         y = df[self.target_col].values
+         X = df.drop(columns=[self.target_col]).copy()
+
+         initial_info = {
+             "shape": X.shape,
+             "columns": list(X.columns),
+             "dtypes": X.dtypes.apply(lambda x: x.name).to_dict(),
+             "missing_values": X.isnull().sum().to_dict(),
+         }
+
+         steps_info = []
+         for step in self.steps:
+             step_name = step["name"]
+             params = step.get("params", {})
+             uuid_idx = uuid.uuid4()
+             step_info = {
+                 "step_name": step_name,
+                 "parameters": params,
+                 "description": self._get_step_description(step_name),
+                 "id": uuid_idx,
+             }
+             step_info["input_columns"] = list(X.columns)
+             self.fitted_idx.append(uuid_idx)
+
+             X = self._apply_step(step_name, uuid_idx, X, fit=True, **params)
+
+             step_info["output_shape"] = X.shape
+             step_info["output_columns"] = list(X.columns)
+             step_info["output_dtypes"] = X.dtypes.apply(lambda x: x.name).to_dict()
+             categorical_columns = X.select_dtypes(include=["category"]).columns
+             unique_categories = {col: X[col].unique().tolist() for col in categorical_columns}
+             step_info["unique_categories"] = unique_categories
+
+             steps_info.append(step_info)
+
+         final_info = {
+             "shape": X.shape,
+             "columns": list(X.columns),
+             "dtypes": X.dtypes.apply(lambda x: x.name).to_dict(),
+             "missing_values": X.isnull().sum().to_dict(),
+         }
+
+         self.documentation = {
+             "initial_dataset": initial_info,
+             "processing_steps": steps_info,
+             "final_dataset": final_info,
+         }
+
+         importances = None
+         df_scores_norm = None  # returned unchanged when feature importance is not computed
+         if self.compute_importance:
+             numeric_X = X.select_dtypes(include=["float"])
+             numeric_columns = numeric_X.columns.tolist()
+             model = LinearRegression()
+             model.fit(numeric_X.T.values, y)
+             importances = model.get_importances()
+             df_scores = pd.DataFrame([importances], columns=numeric_columns)
+             df_scores_abs = df_scores.abs()
+             df_scores_norm = (
+                 df_scores_abs / df_scores_abs.to_numpy().sum()
+                 if isinstance(importances, np.ndarray)
+                 else pd.DataFrame()
+             )
+         return X, y, df_scores_norm
+
+     def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Apply fitted preprocessing steps to new data (no target column needed).
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             New data to transform.
+
+         Returns
+         -------
+         X_transformed : pd.DataFrame
+             Cleaned feature matrix.
+         """
+         X = df.copy()
+         for index, (step_name, _) in enumerate(self.fitted_components.items()):
+             step_name = re.sub(r"_[a-f0-9\-]{36}", "", step_name)
+             X = self._apply_step(step_name, self.fitted_idx[index], X, fit=False)
+
+         return X
+
+     def get_doc(
+         self, save_to_file: bool = True, file_name: str = "data_processing_report.html"
+     ) -> None:
+         """
+         Generate an HTML report from `self.documentation` for pipeline documentation.
+
+         Parameters
+         ----------
+         save_to_file : bool, optional
+             Whether to save generated HTML content to a file. Default is True.
+         file_name : str, optional
+             Filename for output when `save_to_file` is True. Default is "data_processing_report.html".
+         """
+
+         generate_html_pipeline(self.documentation, save_to_file=save_to_file, file_name=file_name)
+
+     def _apply_step(
+         self, step_name: str, idx: str, X: pd.DataFrame, fit: bool, **params
+     ) -> pd.DataFrame:
+         """Dispatch to the correct handler for a preprocessing step."""
+         handlers = {
+             "DataScaler": self._handle_datascaler,
+             "DataFrameEncoder": self._handle_dataframeencoder,
+             "remove_collinearity": self._handle_remove_collinearity,
+             "TransformRange": self._handle_transformrange,
+             "OneHotEncoder": self._handle_onehotencoder,
+             "SimpleImputer": self._handle_simpleimputer,
+         }
+
+         if step_name not in handlers:
+             raise ValueError(
+                 f"Step '{step_name}' not supported. Supported steps: {list(handlers.keys())}"
+             )
+
+         return handlers[step_name](X, idx=idx, fit=fit, **params)
+
+     def _get_step_description(self, step_name: str) -> str:
+         """Return a description of what each preprocessing step does."""
+         descriptions = {
+             "DataScaler": "Scales numerical features using normalization",
+             "DataFrameEncoder": "Encodes categorical variables and normalizes to numerical features",
+             "remove_collinearity": "Removes highly correlated features to reduce multicollinearity",
+             "TransformRange": "Bins continuous features into discrete ranges",
+             "OneHotEncoder": "Converts categorical variables into binary variables",
+             "SimpleImputer": "Handles missing values by imputing with multiple linear regression strategies",
+         }
+
+         return descriptions.get(step_name, f"Unknown preprocessing step: {step_name}")
+
+     # ------------------------------ Step Handlers ------------------------------
+     def _handle_datascaler(self, X: pd.DataFrame, idx: str, fit: bool, n: int = 1) -> pd.DataFrame:
+         """Handle DataScaler (fits on training data, applies to all)."""
+         numeric_X = X.select_dtypes(include=["float"])
+         numeric_columns = numeric_X.columns.tolist()
+         n = None if n == 0 else n
+         if fit:
+             scaler = DataScaler(numeric_X.values.T, n=n)
+             self.fitted_components[f"DataScaler_{idx}"] = scaler
+             numeric_X = pd.DataFrame(scaler.rescale().T, columns=numeric_X.columns)
+         else:
+             scaler = self.fitted_components[f"DataScaler_{idx}"]
+             numeric_X = pd.DataFrame(
+                 scaler.rescale(numeric_X.values.T).T, columns=numeric_X.columns
+             )
+         for col in numeric_columns:
+             X[col] = numeric_X[col]
+         return X
+
+     def _handle_dataframeencoder(
+         self, X: pd.DataFrame, idx: str, fit: bool, norm_method: str = "mean"
+     ) -> pd.DataFrame:
+         """Handle DataFrameEncoder (fits encoders/normalizers)."""
+         if fit:
+             encoder = DataFrameEncoder(X)
+             encoded_X = encoder.encode(norm_method=norm_method)
+             self.fitted_components[f"DataFrameEncoder_{idx}"] = encoder
+             return encoded_X
+         else:
+             encoder = self.fitted_components[f"DataFrameEncoder_{idx}"]
+             encoder._df = X
+             return encoder.encode()
+
+     def _handle_remove_collinearity(
+         self, X: pd.DataFrame, idx: str, fit: bool, threshold: float = 0.9
+     ) -> pd.DataFrame:
+         """Handle collinearity removal (fits by selecting columns to drop)."""
+         numeric_X = X.select_dtypes(include=["float"])
+         numeric_columns = numeric_X.columns.tolist()
+         categorical_columns = set(X.columns) - set(numeric_columns)
+         if fit:
+             cleaned_X = remove_collinearity(numeric_X, threshold=threshold)
+             dropped_cols = set(X.columns) - set(cleaned_X.columns) - categorical_columns
+             self.fitted_components[f"remove_collinearity_{idx}"] = dropped_cols
+             return X.drop(columns=dropped_cols)
+         else:
+             dropped_cols = self.fitted_components[f"remove_collinearity_{idx}"]
+             return X.drop(columns=dropped_cols)
+
+     def _handle_transformrange(
+         self, X: pd.DataFrame, idx: str, fit: bool, columns_bin_sizes: Dict[str, int] | None = None
+     ) -> pd.DataFrame:
+         """Handle TransformRange (bin numerical features into ranges)."""
+         if fit:
+             transformer = TransformRange(columns_bin_sizes)
+             cleaned_X = transformer.transform(X)
+             self.fitted_components[f"TransformRange_{idx}"] = transformer
+             self.columns_bin_sizes = columns_bin_sizes
+             return cleaned_X
+         else:
+             transformer = self.fitted_components[f"TransformRange_{idx}"]
+             return transformer.transform(X, fit=False)
+
+     def _handle_onehotencoder(
+         self, X: pd.DataFrame, idx: str, fit: bool, columns: List[str] | None = None
+     ) -> pd.DataFrame:
+         """Handle OneHotEncoder (fits on categorical columns)."""
+         if fit:
+             tmp_df = X.drop(columns=columns)
+             encoder = OneHotEncoder()
+             category_to_indices = {}
+             for col in columns:
+                 unique_values = X[col].unique()
+                 category_to_indices[col] = {
+                     value: i
+                     for i, value in enumerate(
+                         X[col].cat.codes.unique()
+                         if pd.api.types.is_categorical_dtype(X[col])
+                         else X[col].unique()
+                     )
+                 }
+                 encoded_X = encoder.encode(
+                     X[col].values
+                     if isinstance(unique_values[0], int)
+                     else X[col].cat.codes.map(category_to_indices[col])
+                 )
+                 tmp_df = pd.concat([tmp_df, pd.DataFrame(encoded_X, columns=unique_values)], axis=1)
+             self.fitted_components[f"OneHotEncoder_{idx}"] = (
+                 encoder,
+                 columns,
+                 category_to_indices,
+                 unique_values,
+             )
+         else:
+             encoder, columns, category_to_indices, unique_values = self.fitted_components[
+                 f"OneHotEncoder_{idx}"
+             ]
+             tmp_df = X.drop(columns=columns)
+             for col in columns:
+                 encoded_X = encoder.encode(
+                     (
+                         X[col].values
+                         if isinstance(unique_values[0], int)
+                         else X[col].cat.codes.map(category_to_indices[col])
+                     ),
+                     fit=False,
+                 )
+                 tmp_df = pd.concat([tmp_df, pd.DataFrame(encoded_X, columns=unique_values)], axis=1)
+         return tmp_df
+
+     def _handle_simpleimputer(
+         self,
+         X: pd.DataFrame,
+         idx: str,
+         fit: bool,
+         use_scaler: bool = False,
+         boundary: bool = True,
+     ) -> pd.DataFrame:
+         "Handle SimpleImputer (fit on numerical and categorical columns)."
+         if fit:
+             use_scaler = True if use_scaler == 1 else False
+             imputer = SimpleImputer(use_scaler=use_scaler)
+             tmp_df = imputer.fit_transform(X, boundary=boundary)
+             self.fitted_components[f"SimpleImputer_{idx}"] = imputer
+             return tmp_df
+         else:
+             imputer = self.fitted_components[f"SimpleImputer_{idx}"]
+             return imputer.transform(X, boundary=boundary)
+
+     def save(self, filepath: str) -> None:
+         """
+         Save the fitted pipeline state to a file using pickle.
+
+         Parameters
+         ----------
+         filepath : str
+             Path where the serialized pipeline will be saved.
+         """
+
+         save_dict = {
+             "config": self.config,
+             "fitted_components": self.fitted_components,
+             "fitted_idx": self.fitted_idx,
+             "target_col": self.target_col,
+             "steps": self.steps,
+             "compute_importance": self.compute_importance,
+             "columns_bin_sizes": self.columns_bin_sizes,
+             "documentation": self.documentation,
+         }
+
+         filepath = filepath + ".pkl" if not filepath.endswith(".pkl") else filepath
+
+         with open(filepath, "wb") as f:
+             pickle.dump(save_dict, f)
+
+     @classmethod
+     def load(cls, filepath: str) -> "Pipeline":
+         """
+         Load a fitted pipeline from a file.
+
+         Parameters
+         ----------
+         filepath : str
+             Path to the serialized pipeline file.
+
+         Returns
+         -------
+         pipeline : Pipeline
+             Reconstructed pipeline instance with fitted components.
+         """
+
+         filepath = filepath + ".pkl" if not filepath.endswith(".pkl") else filepath
+
+         with open(filepath, "rb") as f:
+             save_dict = pickle.load(f)
+
+         pipeline = cls.__new__(cls)
+
+         pipeline.config = save_dict["config"]
+         pipeline.fitted_components = save_dict["fitted_components"]
+         pipeline.fitted_idx = save_dict["fitted_idx"]
+         pipeline.target_col = save_dict["target_col"]
+         pipeline.steps = save_dict["steps"]
+         pipeline.compute_importance = save_dict["compute_importance"]
+         pipeline.columns_bin_sizes = save_dict["columns_bin_sizes"]
+         pipeline.documentation = save_dict["documentation"]
+
+         return pipeline
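Pipeline is driven entirely by its JSON config: "target_column", "preprocessing_steps" (each entry with a "name" from the handler table above and optional "params"), and an optional "compute_feature_importance" flag. The sketch below is not part of the wheel contents; it shows one hypothetical config and the fit / transform / report / save round trip. File names, column names, and parameter values are illustrative.

import json

import pandas as pd

from likelihood.pipes import Pipeline

config = {
    "target_column": "price",  # hypothetical target column
    "compute_feature_importance": True,
    "preprocessing_steps": [
        {"name": "SimpleImputer", "params": {"use_scaler": 0}},
        {"name": "remove_collinearity", "params": {"threshold": 0.9}},
        {"name": "DataScaler", "params": {"n": 1}},
    ],
}
with open("pipeline.json", "w") as f:
    json.dump(config, f)

df = pd.read_csv("housing.csv")  # hypothetical training data
pipe = Pipeline("pipeline.json")
X, y, importances = pipe.fit(df)  # fits each step in order, optionally scoring features
X_new = pipe.transform(df.drop(columns=["price"]))  # re-applies the fitted steps to new data
pipe.get_doc(file_name="data_processing_report.html")  # HTML report built from self.documentation
pipe.save("fitted_pipeline")  # serialized to fitted_pipeline.pkl
pipe = Pipeline.load("fitted_pipeline")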
likelihood/tools/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .models_tools import *
+ from .numeric_tools import *
+ from .reports import generate_html_pipeline
+ from .tools import *