likelihood-2.2.0.dev1-cp310-cp310-musllinux_1_2_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- likelihood/VERSION +1 -0
- likelihood/__init__.py +20 -0
- likelihood/graph/__init__.py +9 -0
- likelihood/graph/_nn.py +283 -0
- likelihood/graph/graph.py +86 -0
- likelihood/graph/nn.py +329 -0
- likelihood/main.py +273 -0
- likelihood/models/__init__.py +3 -0
- likelihood/models/deep/__init__.py +13 -0
- likelihood/models/deep/_autoencoders.py +896 -0
- likelihood/models/deep/_predictor.py +809 -0
- likelihood/models/deep/autoencoders.py +903 -0
- likelihood/models/deep/bandit.py +97 -0
- likelihood/models/deep/gan.py +313 -0
- likelihood/models/deep/predictor.py +805 -0
- likelihood/models/deep/rl.py +345 -0
- likelihood/models/environments.py +202 -0
- likelihood/models/hmm.py +163 -0
- likelihood/models/regression.py +451 -0
- likelihood/models/simulation.py +213 -0
- likelihood/models/utils.py +87 -0
- likelihood/pipes.py +382 -0
- likelihood/rust_py_integration.cpython-310-x86_64-linux-gnu.so +0 -0
- likelihood/tools/__init__.py +4 -0
- likelihood/tools/cat_embed.py +212 -0
- likelihood/tools/figures.py +348 -0
- likelihood/tools/impute.py +278 -0
- likelihood/tools/models_tools.py +866 -0
- likelihood/tools/numeric_tools.py +390 -0
- likelihood/tools/reports.py +375 -0
- likelihood/tools/tools.py +1336 -0
- likelihood-2.2.0.dev1.dist-info/METADATA +68 -0
- likelihood-2.2.0.dev1.dist-info/RECORD +39 -0
- likelihood-2.2.0.dev1.dist-info/WHEEL +5 -0
- likelihood-2.2.0.dev1.dist-info/licenses/LICENSE +21 -0
- likelihood-2.2.0.dev1.dist-info/sboms/auditwheel.cdx.json +1 -0
- likelihood-2.2.0.dev1.dist-info/top_level.txt +5 -0
- likelihood.libs/libgcc_s-0cd532bd.so.1 +0 -0
- src/lib.rs +12 -0
likelihood/models/simulation.py
ADDED
@@ -0,0 +1,213 @@
import pickle
from typing import Dict, List, Tuple, Union

import numpy as np
import pandas as pd
from packaging import version
from pandas.core.frame import DataFrame

from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, cdf, check_nan_inf

if version.parse(np.__version__) < version.parse("2.0.0"):
    filter = np.RankWarning
else:
    filter = np.exceptions.RankWarning


# --------------------------------------------------------------------------------------------------------------------------------------
def categories_by_quartile(df: DataFrame, column: str) -> Tuple[str, str]:
    freq = df[column].value_counts()

    q1 = freq.quantile(0.25)
    q3 = freq.quantile(0.75)

    least_frequent = freq[freq <= q1]
    most_frequent = freq[freq >= q3]

    least_frequent_category = least_frequent.idxmin() if not least_frequent.empty else None
    most_frequent_category = most_frequent.idxmax() if not most_frequent.empty else None

    return least_frequent_category, most_frequent_category


class SimulationEngine(FeatureSelection):
    """
    This class implements a predictive model that utilizes multiple linear regression for numerical target variables
    and multiple logistic regression for categorical target variables.

    The class provides methods for training the model on a given dataset, making predictions,
    and evaluating the model's performance.

    Key features:
    - Supports both numerical and categorical target variables, automatically selecting the appropriate regression method.
    - Includes methods for data preprocessing, model fitting, prediction, and evaluation metrics.
    - Designed to be flexible and user-friendly, allowing for easy integration with various datasets.

    Usage:
    - Instantiate the class with the training data and target variable.
    - Call the fit method to train the model.
    - Use the predict method to generate predictions on new data.
    - Evaluate the model using built-in metrics for accuracy and error.

    This class is suitable for applications in data analysis and machine learning, enabling users to leverage regression techniques
    for both numerical and categorical outcomes efficiently.
    """

    def __init__(self, use_scaler: bool = False, **kwargs):
        self.df = pd.DataFrame()
        self.n_importances = None
        self.use_scaler = use_scaler
        self.proba_dict = {}

        super().__init__(**kwargs)

    def predict(self, df: DataFrame, column: str) -> np.ndarray | list:
        w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]

        df = df[names_cols].copy()
        dataset = self.df.copy()
        dataset.drop(columns=column, inplace=True)
        numeric_df = dataset.select_dtypes(include="number")
        if self.use_scaler:
            scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
            _ = scaler.rescale()
            dataset_ = df.copy()
            numeric_df = dataset_.select_dtypes(include="number")
            numeric_scaled = scaler.rescale(dataset_=numeric_df.to_numpy())
            numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
            for col in numeric_df.columns:
                df[col] = numeric_df[col].values

        for num, colname in enumerate(dfe._encode_columns):
            if df[colname].dtype == "object":
                encode_dict = dfe.encoding_list[num]
                df[colname] = df[colname].apply(
                    dfe._code_transformation_to, dictionary_list=encode_dict
                )

        y = df.to_numpy() @ w

        if quick_encoder != None:
            one_hot = OneHotEncoder()
            y = one_hot.decode(y)
            encoding_dic = quick_encoder.decoding_list[0]
            y = [encoding_dic[item] for item in y]
        else:
            if self.use_scaler:
                y += 1
                y /= 2
                y = y * (self.df[column].max() - self.df[column].min())

        return y[:]

    def _encode(self, df: DataFrame) -> Dict[str, float]:
        df = df.copy()
        column = df.columns[0]
        frec = df[column].value_counts() / len(df)
        df.loc[:, "frec"] = df[column].map(frec)
        df.sort_values("frec", inplace=True)
        keys = df[column].to_list()
        values = df["frec"].to_list()
        return dict(zip(keys, values))

    def fit(self, df: DataFrame, n_importances: int, **kwargs) -> None:
        self.df = df
        self.n_importances = n_importances
        self.get_digraph(self.df, self.n_importances, self.use_scaler)
        proba_dict_keys = list(self.w_dict.keys())
        self.proba_dict = dict(zip(proba_dict_keys, [i for i in range(len(proba_dict_keys))]))
        for key in proba_dict_keys:
            x = (
                self.df[key].values,
                None if self.df[key].dtype != "object" else self._encode(self.df[[key]]),
            )
            poly = kwargs.get("poly", 9)
            plot = kwargs.get("plot", False)
            bandwidth = kwargs.get("bandwidth", 1.5)
            if not x[1]:
                media = self.df[key].mean()
                standard_deviation = self.df[key].std()
                lower_limit = media - bandwidth * standard_deviation
                upper_limit = media + bandwidth * standard_deviation
                f, _, ox = cdf(x[0].flatten(), poly=poly, plot=plot, key=key)
            else:
                f, ox = None, None
                least_frequent_category, most_frequent_category = categories_by_quartile(
                    self.df[[key]], key
                )
                lower_limit = x[1].get(least_frequent_category, 0)
                upper_limit = x[1].get(most_frequent_category, 0)
            self.proba_dict[key] = (
                f if f else None,
                x[1],
                (np.mean(np.abs(np.diff(ox))) / 2.0 if isinstance(ox, np.ndarray) else None),
                f(lower_limit) if f else lower_limit,
                f(upper_limit) if f else upper_limit,
            )

    def get_proba(self, value: Union[Union[float, int], str] | list, colname: str) -> List[float]:
        value = (
            value
            if isinstance(value, list)
            else value.flatten().tolist() if isinstance(value, np.ndarray) else [value]
        )
        return [
            (
                self.proba_dict[colname][0](val)
                - self.proba_dict[colname][0](val - self.proba_dict[colname][2])
                if (isinstance(val, float) or isinstance(val, int))
                else self.proba_dict[colname][1].get(val, 0)
            )
            for val in value
        ]

    def pred_outliers(self, value: Union[Union[float, int], str] | list, colname: str) -> List[str]:
        return [
            (
                "inlier"
                if (self.proba_dict[colname][3] < val < self.proba_dict[colname][4])
                else "outlier"
            )
            for val in self.get_proba(value, colname)
        ]

    def _clean_data(self, df: DataFrame) -> DataFrame:
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df.replace(" ", np.nan, inplace=True)
        df = check_nan_inf(df)
        df = df.reset_index()
        df = df.drop(columns=["index"])

        return df

    def save(self, filename: str = "./simulation_model") -> None:
        """
        Save the state of the SimulationEngine to a file.

        Parameters
        ----------
        filename : str
            The name of the file where the object will be saved.
        """
        filename = filename if filename.endswith(".pkl") else filename + ".pkl"
        with open(filename, "wb") as f:
            pickle.dump(self, f)

    @staticmethod
    def load(filename: str = "./simulation_model"):
        """
        Load the state of a SimulationEngine from a file.

        Parameters
        ----------
        filename : str
            The name of the file containing the saved object.

        Returns
        -------
        SimulationEngine : Any
            A new instance of SimulationEngine with the loaded state.
        """
        filename = filename + ".pkl" if not filename.endswith(".pkl") else filename
        with open(filename, "rb") as f:
            return pickle.load(f)
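
The class docstring above describes the intended workflow (instantiate, fit, predict, save/load). A minimal usage sketch, not part of the wheel's contents: the import path is assumed, and the DataFrame, column names, and values are hypothetical.

import pandas as pd

from likelihood.models import SimulationEngine  # assumed export path

# Hypothetical training data with numeric and categorical columns.
df = pd.DataFrame(
    {
        "age": [23, 45, 31, 52, 38, 27],
        "income": [28_000, 61_000, 42_000, 75_000, 50_000, 33_000],
        "segment": ["a", "b", "a", "b", "b", "a"],
    }
)

engine = SimulationEngine(use_scaler=False)
engine.fit(df, n_importances=2)                   # fits one regression per column via get_digraph

y_hat = engine.predict(df, column="income")       # numeric target -> regression prediction
flags = engine.pred_outliers(df["age"].values, colname="age")  # "inlier"/"outlier" per value

engine.save("./simulation_model")                 # pickles the fitted engine (".pkl" appended)
engine = SimulationEngine.load("./simulation_model")
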
likelihood/models/utils.py
ADDED
@@ -0,0 +1,87 @@
import numpy as np

from likelihood.tools import cal_average


class FeaturesArima:
    """Base class for ARIMA feature computations."""

    def forward(self, y_sum: np.ndarray, theta: list, mode: bool, noise: float) -> np.ndarray:
        """Compute the forward pass of the model.

        Parameters
        ----------
        y_sum : `np.ndarray`
            Summed data points.
        theta : `list`
            Model parameters.
        mode : `bool`
            Forward pass mode.
        noise : `float`
            Noise level.

        Returns
        -------
        `np.ndarray`
            Computed values.
        """
        if mode:
            y_vec = []

            y_t = np.dot(theta, y_sum)

            n = y_sum.shape[0]

            for i in range(n):
                try:
                    n_int = np.where(y_sum != y_sum[i])[0]
                    y_i = (y_t - np.dot(theta[n_int], y_sum[n_int])) / theta[i]
                    y_i += np.random.rand() * noise
                except:
                    y_i = (y_t - np.dot(theta[0:i], y_sum[0:i])) / theta[i]
                y_vec.append(y_i)
        else:
            y_t = np.dot(theta, y_sum) + y_sum[0]
            n_int = np.where(y_sum != y_sum[0])[0]
            y_i = (y_t - np.dot(theta[n_int], y_sum[n_int])) / theta[0]
            y_i += np.random.rand() * noise
            return y_i

        return np.array(y_vec)

    def integrated(self, datapoints: np.ndarray) -> np.ndarray:
        """Compute integrated values.

        Parameters
        ----------
        datapoints : `np.ndarray`
            Input data points.

        Returns
        -------
        `np.ndarray`
            Integrated values.
        """
        datapoints = self.datapoints
        y_sum = list(np.diff(datapoints, self.d))
        y_sum.insert(0, datapoints[0])

        return np.array(y_sum)

    def average(self, datapoints: np.ndarray) -> np.ndarray:
        """Compute average values.

        Parameters
        ----------
        datapoints : `np.ndarray`
            Input data points.

        Returns
        -------
        `np.ndarray`
            Average values.
        """
        y_sum_average = cal_average(datapoints)
        y_sum_eps = datapoints - y_sum_average

        return y_sum_eps
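
FeaturesArima is written as a base class: integrated() ignores its argument and reads self.datapoints and self.d from the inheriting object. A short sketch, not part of the wheel's contents, with a hypothetical subclass supplying those attributes (import path assumed):

import numpy as np

from likelihood.models.utils import FeaturesArima  # assumed module path


class ArimaLike(FeaturesArima):
    """Hypothetical subclass providing the attributes integrated() expects."""

    def __init__(self, datapoints: np.ndarray, d: int = 1):
        self.datapoints = datapoints  # raw series read by integrated()
        self.d = d                    # differencing order


series = np.array([10.0, 12.0, 15.0, 19.0, 24.0])
model = ArimaLike(series, d=1)

y_sum = model.integrated(series)   # [10., 2., 3., 4., 5.]: first value followed by d-th differences
eps = model.average(series)        # residuals of the series around cal_average(series)
theta = np.ones(y_sum.shape[0]) / y_sum.shape[0]
one_step = model.forward(y_sum, theta, mode=False, noise=0.0)  # single value, mode=False branch
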
likelihood/pipes.py
ADDED
@@ -0,0 +1,382 @@
import json
import pickle
import re
import uuid
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

from likelihood.tools import generate_html_pipeline
from likelihood.tools.impute import SimpleImputer
from likelihood.tools.models_tools import TransformRange, remove_collinearity
from likelihood.tools.tools import DataFrameEncoder, DataScaler, LinearRegression, OneHotEncoder


class Pipeline:
    def __init__(self, config_path: str):
        """
        Initialize the pipeline with a JSON configuration file.

        Parameters
        ----------
        config_path : str
            Path to the JSON config defining target column and preprocessing steps.
        """
        self.config = self._load_config(config_path)
        self.target_col = self.config["target_column"]
        self.steps = self.config["preprocessing_steps"]
        self.compute_importance = self.config.get("compute_feature_importance", False)
        self.fitted_components: Dict[str, object] = {}
        self.fitted_idx: List[str] = []
        self.columns_bin_sizes: Dict[str, int] | None = None

    def _load_config(self, config_path: str) -> Dict:
        """Load and validate the JSON configuration."""
        with open(config_path, "r") as f:
            config = json.load(f)

        assert "target_column" in config, "Config must specify 'target_column'"
        assert "preprocessing_steps" in config, "Config must specify 'preprocessing_steps'"
        return config

    def fit(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, np.ndarray, Optional[np.ndarray]]:
        """
        Fit preprocessing components on the input DataFrame and return cleaned X/y.

        Parameters
        ----------
        df : pd.DataFrame
            Input data with features + target column.

        Returns
        -------
        X : pd.DataFrame
            Cleaned feature matrix.
        y : np.ndarray
            Target vector (from self.target_col).
        importances : Optional[np.ndarray]
            Feature importance scores (if compute_feature_importance=`True`).
        """
        y = df[self.target_col].values
        X = df.drop(columns=[self.target_col]).copy()

        initial_info = {
            "shape": X.shape,
            "columns": list(X.columns),
            "dtypes": X.dtypes.apply(lambda x: x.name).to_dict(),
            "missing_values": X.isnull().sum().to_dict(),
        }

        steps_info = []
        for step in self.steps:
            step_name = step["name"]
            params = step.get("params", {})
            uuid_idx = uuid.uuid4()
            step_info = {
                "step_name": step_name,
                "parameters": params,
                "description": self._get_step_description(step_name),
                "id": uuid_idx,
            }
            step_info["input_columns"] = list(X.columns)
            self.fitted_idx.append(uuid_idx)

            X = self._apply_step(step_name, uuid_idx, X, fit=True, **params)

            step_info["output_shape"] = X.shape
            step_info["output_columns"] = list(X.columns)
            step_info["output_dtypes"] = X.dtypes.apply(lambda x: x.name).to_dict()
            categorical_columns = X.select_dtypes(include=["category"]).columns
            unique_categories = {col: X[col].unique().tolist() for col in categorical_columns}
            step_info["unique_categories"] = unique_categories

            steps_info.append(step_info)

        final_info = {
            "shape": X.shape,
            "columns": list(X.columns),
            "dtypes": X.dtypes.apply(lambda x: x.name).to_dict(),
            "missing_values": X.isnull().sum().to_dict(),
        }

        self.documentation = {
            "initial_dataset": initial_info,
            "processing_steps": steps_info,
            "final_dataset": final_info,
        }

        importances = None
        if self.compute_importance:
            numeric_X = X.select_dtypes(include=["float"])
            numeric_columns = numeric_X.columns.tolist()
            model = LinearRegression()
            model.fit(numeric_X.T.values, y)
            importances = model.get_importances()
            df_scores = pd.DataFrame([importances], columns=numeric_columns)
            df_scores_abs = df_scores.abs()
            df_scores_norm = (
                df_scores_abs / df_scores_abs.to_numpy().sum()
                if isinstance(importances, np.ndarray)
                else pd.DataFrame()
            )
        return X, y, df_scores_norm

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Apply fitted preprocessing steps to new data (no target column needed).

        Parameters
        ----------
        df : pd.DataFrame
            New data to transform.

        Returns
        -------
        X_transformed : pd.DataFrame
            Cleaned feature matrix.
        """
        X = df.copy()
        for index, (step_name, _) in enumerate(self.fitted_components.items()):
            step_name = re.sub(r"_[a-f0-9\-]{36}", "", step_name)
            X = self._apply_step(step_name, self.fitted_idx[index], X, fit=False)

        return X

    def get_doc(
        self, save_to_file: bool = True, file_name: str = "data_processing_report.html"
    ) -> None:
        """
        Generate an HTML report from `self.documentation` for pipeline documentation.

        Parameters
        ----------
        save_to_file : bool, optional
            Whether to save generated HTML content to a file. Default is True.
        file_name : str, optional
            Filename for output when `save_to_file` is True. Default is "data_processing_report.html".
        """

        generate_html_pipeline(self.documentation, save_to_file=save_to_file, file_name=file_name)

    def _apply_step(
        self, step_name: str, idx: str, X: pd.DataFrame, fit: bool, **params
    ) -> pd.DataFrame:
        """Dispatch to the correct handler for a preprocessing step."""
        handlers = {
            "DataScaler": self._handle_datascaler,
            "DataFrameEncoder": self._handle_dataframeencoder,
            "remove_collinearity": self._handle_remove_collinearity,
            "TransformRange": self._handle_transformrange,
            "OneHotEncoder": self._handle_onehotencoder,
            "SimpleImputer": self._handle_simpleimputer,
        }

        if step_name not in handlers:
            raise ValueError(
                f"Step '{step_name}' not supported. Supported steps: {list(handlers.keys())}"
            )

        return handlers[step_name](X, idx=idx, fit=fit, **params)

    def _get_step_description(self, step_name: str) -> str:
        """Return a description of what each preprocessing step does."""
        descriptions = {
            "DataScaler": "Scales numerical features using normalization",
            "DataFrameEncoder": "Encodes categorical variables and normalizes to numerical features",
            "remove_collinearity": "Removes highly correlated features to reduce multicollinearity",
            "TransformRange": "Bins continuous features into discrete ranges",
            "OneHotEncoder": "Converts categorical variables into binary variables",
            "SimpleImputer": "Handles missing values by imputing with multiple linear regression strategies",
        }

        return descriptions.get(step_name, f"Unknown preprocessing step: {step_name}")

    # ------------------------------ Step Handlers ------------------------------
    def _handle_datascaler(self, X: pd.DataFrame, idx: str, fit: bool, n: int = 1) -> pd.DataFrame:
        """Handle DataScaler (fits on training data, applies to all)."""
        numeric_X = X.select_dtypes(include=["float"])
        numeric_columns = numeric_X.columns.tolist()
        n = None if n == 0 else n
        if fit:
            scaler = DataScaler(numeric_X.values.T, n=n)
            self.fitted_components[f"DataScaler_{idx}"] = scaler
            numeric_X = pd.DataFrame(scaler.rescale().T, columns=numeric_X.columns)
        else:
            scaler = self.fitted_components[f"DataScaler_{idx}"]
            numeric_X = pd.DataFrame(
                scaler.rescale(numeric_X.values.T).T, columns=numeric_X.columns
            )
        for col in numeric_columns:
            X[col] = numeric_X[col]
        return X

    def _handle_dataframeencoder(
        self, X: pd.DataFrame, idx: str, fit: bool, norm_method: str = "mean"
    ) -> pd.DataFrame:
        """Handle DataFrameEncoder (fits encoders/normalizers)."""
        if fit:
            encoder = DataFrameEncoder(X)
            encoded_X = encoder.encode(norm_method=norm_method)
            self.fitted_components[f"DataFrameEncoder_{idx}"] = encoder
            return encoded_X
        else:
            encoder = self.fitted_components[f"DataFrameEncoder_{idx}"]
            encoder._df = X
            return encoder.encode()

    def _handle_remove_collinearity(
        self, X: pd.DataFrame, idx: str, fit: bool, threshold: float = 0.9
    ) -> pd.DataFrame:
        """Handle collinearity removal (fits by selecting columns to drop)."""
        numeric_X = X.select_dtypes(include=["float"])
        numeric_columns = numeric_X.columns.tolist()
        categorical_columns = set(X.columns) - set(numeric_columns)
        if fit:
            cleaned_X = remove_collinearity(numeric_X, threshold=threshold)
            dropped_cols = set(X.columns) - set(cleaned_X.columns) - categorical_columns
            self.fitted_components[f"remove_collinearity_{idx}"] = dropped_cols
            return X.drop(columns=dropped_cols)
        else:
            dropped_cols = self.fitted_components[f"remove_collinearity_{idx}"]
            return X.drop(columns=dropped_cols)

    def _handle_transformrange(
        self, X: pd.DataFrame, idx: str, fit: bool, columns_bin_sizes: Dict[str, int] | None = None
    ) -> pd.DataFrame:
        """Handle TransformRange (bin numerical features into ranges)."""
        if fit:
            transformer = TransformRange(columns_bin_sizes)
            cleaned_X = transformer.transform(X)
            self.fitted_components[f"TransformRange_{idx}"] = transformer
            self.columns_bin_sizes = columns_bin_sizes
            return cleaned_X
        else:
            transformer = self.fitted_components[f"TransformRange_{idx}"]
            return transformer.transform(X, fit=False)

    def _handle_onehotencoder(
        self, X: pd.DataFrame, idx: str, fit: bool, columns: List[str] | None = None
    ) -> pd.DataFrame:
        """Handle OneHotEncoder (fits on categorical columns)."""
        if fit:
            tmp_df = X.drop(columns=columns)
            encoder = OneHotEncoder()
            category_to_indices = {}
            for col in columns:
                unique_values = X[col].unique()
                category_to_indices[col] = {
                    value: i
                    for i, value in enumerate(
                        X[col].cat.codes.unique()
                        if pd.api.types.is_categorical_dtype(X[col])
                        else X[col].unique()
                    )
                }
                encoded_X = encoder.encode(
                    X[col].values
                    if isinstance(unique_values[0], int)
                    else X[col].cat.codes.map(category_to_indices[col])
                )
                tmp_df = pd.concat([tmp_df, pd.DataFrame(encoded_X, columns=unique_values)], axis=1)
            self.fitted_components[f"OneHotEncoder_{idx}"] = (
                encoder,
                columns,
                category_to_indices,
                unique_values,
            )
        else:
            encoder, columns, category_to_indices, unique_values = self.fitted_components[
                f"OneHotEncoder_{idx}"
            ]
            tmp_df = X.drop(columns=columns)
            for col in columns:
                encoded_X = encoder.encode(
                    (
                        X[col].values
                        if isinstance(unique_values[0], int)
                        else X[col].cat.codes.map(category_to_indices[col])
                    ),
                    fit=False,
                )
                tmp_df = pd.concat([tmp_df, pd.DataFrame(encoded_X, columns=unique_values)], axis=1)
        return tmp_df

    def _handle_simpleimputer(
        self,
        X: pd.DataFrame,
        idx: str,
        fit: bool,
        use_scaler: bool = False,
        boundary: bool = True,
    ) -> pd.DataFrame:
        "Handle SimpleImputer (fit on numerical and categorical columns)."
        if fit:
            use_scaler = True if use_scaler == 1 else False
            imputer = SimpleImputer(use_scaler=use_scaler)
            tmp_df = imputer.fit_transform(X, boundary=boundary)
            self.fitted_components[f"SimpleImputer_{idx}"] = imputer
            return tmp_df
        else:
            imputer = self.fitted_components[f"SimpleImputer_{idx}"]
            return imputer.transform(X, boundary=boundary)

    def save(self, filepath: str) -> None:
        """
        Save the fitted pipeline state to a file using pickle.

        Parameters
        ----------
        filepath : str
            Path where the serialized pipeline will be saved.
        """

        save_dict = {
            "config": self.config,
            "fitted_components": self.fitted_components,
            "fitted_idx": self.fitted_idx,
            "target_col": self.target_col,
            "steps": self.steps,
            "compute_importance": self.compute_importance,
            "columns_bin_sizes": self.columns_bin_sizes,
            "documentation": self.documentation,
        }

        filepath = filepath + ".pkl" if not filepath.endswith(".pkl") else filepath

        with open(filepath, "wb") as f:
            pickle.dump(save_dict, f)

    @classmethod
    def load(cls, filepath: str) -> "Pipeline":
        """
        Load a fitted pipeline from a file.

        Parameters
        ----------
        filepath : str
            Path to the serialized pipeline file.

        Returns
        -------
        pipeline : Pipeline
            Reconstructed pipeline instance with fitted components.
        """

        filepath = filepath + ".pkl" if not filepath.endswith(".pkl") else filepath

        with open(filepath, "rb") as f:
            save_dict = pickle.load(f)

        pipeline = cls.__new__(cls)

        pipeline.config = save_dict["config"]
        pipeline.fitted_components = save_dict["fitted_components"]
        pipeline.fitted_idx = save_dict["fitted_idx"]
        pipeline.target_col = save_dict["target_col"]
        pipeline.steps = save_dict["steps"]
        pipeline.compute_importance = save_dict["compute_importance"]
        pipeline.columns_bin_sizes = save_dict["columns_bin_sizes"]
        pipeline.documentation = save_dict["documentation"]

        return pipeline
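
_load_config() requires "target_column" and "preprocessing_steps", and fit() reads each step as a dict with a "name" (one of the keys in _apply_step's handlers) plus optional "params". A hedged end-to-end sketch, not part of the wheel's contents: file names, column names, and the CSV data are hypothetical, and compute_feature_importance is enabled because fit() only defines the returned scores frame on that branch.

import json

import pandas as pd

from likelihood.pipes import Pipeline

# Hypothetical config; keys follow _load_config()/fit(), step names match _apply_step().
config = {
    "target_column": "price",                     # hypothetical target column
    "compute_feature_importance": True,
    "preprocessing_steps": [
        {"name": "SimpleImputer", "params": {"use_scaler": False}},
        {"name": "DataFrameEncoder", "params": {"norm_method": "mean"}},
        {"name": "remove_collinearity", "params": {"threshold": 0.9}},
    ],
}
with open("pipeline_config.json", "w") as f:
    json.dump(config, f)

train_df = pd.read_csv("train.csv")               # hypothetical dataset containing "price"
pipe = Pipeline("pipeline_config.json")
X, y, scores = pipe.fit(train_df)                 # fits each step in order, fills self.documentation
pipe.get_doc(file_name="data_processing_report.html")    # HTML report of the recorded steps

X_new = pipe.transform(pd.read_csv("new.csv"))    # re-applies the fitted components to new data
pipe.save("fitted_pipeline")                      # pickles config + fitted components (".pkl" appended)
pipe = Pipeline.load("fitted_pipeline")
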
Binary file