likelihood 2.0.0__tar.gz → 2.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {likelihood-2.0.0 → likelihood-2.0.2}/PKG-INFO +1 -1
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/__init__.py +1 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/_autoencoders.py +2 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/rl.py +36 -36
- likelihood-2.0.2/likelihood/pipes.py +355 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/__init__.py +1 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/models_tools.py +219 -7
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/numeric_tools.py +4 -4
- likelihood-2.0.2/likelihood/tools/reports.py +195 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/tools.py +19 -17
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood.egg-info/PKG-INFO +1 -1
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood.egg-info/SOURCES.txt +2 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/LICENSE +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/README.md +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/graph/__init__.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/graph/_nn.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/graph/graph.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/graph/nn.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/main.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/__init__.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/__init__.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/_predictor.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/autoencoders.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/gan.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/predictor.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/hmm.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/regression.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/simulation.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/utils.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/cat_embed.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/figures.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/impute.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood.egg-info/dependency_links.txt +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood.egg-info/requires.txt +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood.egg-info/top_level.txt +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/setup.cfg +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/setup.py +0 -0
likelihood/models/deep/rl.py
@@ -27,12 +27,12 @@ class Env:

         Parameters
         ----------
-
-
-
-
-
-
+        model : Any
+            Model with `.predict()` method (e.g., Keras model).
+        maxlen : int
+            Maximum length of deque. By default it is set to `100`.
+        name : str
+            The name of the environment. By default it is set to `likenasium`.
         """
         self.model = model
         self.maxlen = maxlen
@@ -49,14 +49,14 @@ class Env:

         Parameters
         ----------
-
-
-
-
+        state : `np.ndarray`
+            Current state to process (input to the model).
+        action : `int`
+            Expected action to process.

         Returns
         -------
-
+        tuple : (current_state, action_pred, reward, next_action, done)
         """
         if self.done:
             return None, None, 0, None, True
@@ -120,9 +120,9 @@ class AutoQL:

         Parameters
         ----------
-        env : Any
+        env : `Any`
             The environment to interact with
-        model : tf.keras.Model
+        model : `tf.keras.Model`
             The Q-network model
         """

@@ -137,16 +137,16 @@ class AutoQL:

         Parameters
         ----------
-
-
-
-
-
-
+        state : `np.ndarray`
+            Current state.
+        action : `int`
+            Expected action to process.
+        epsilon : `float`
+            Exploration probability. By default it is set to `0`

         Returns
         -------
-
+        tuple : (state, action, reward, next_action, done)
         """
         current_state, value, reward, next_action, done = self.env.step(state, action)

@@ -164,17 +164,17 @@ class AutoQL:

         Parameters
         ----------
-
-
-
-
+        state : `np.ndarray`
+            Current state
+        action : `int`
+            Expected action to process.

-
-
+        epsilon : `float`
+            Exploration probability.

         Returns
         -------
-
+        tuple : (state, action, reward, next_action, done)
         """
         current_state, greedy_action, reward, next_action, done = self.epsilon_greedy_policy(
             state, action, epsilon
@@ -202,7 +202,7 @@ class AutoQL:

         Returns
         -------
-
+        float : Training loss
         """

         batch_ = random.sample(self.replay_buffer, self.batch_size)
@@ -250,21 +250,21 @@ class AutoQL:

         Parameters
         ----------
-        optimizer : str
+        optimizer : `str`
             The optimizer for training (e.g., `sgd`). By default it is set to `adam`.
-        loss_fn : str
+        loss_fn : `str`
             The loss function. By default it is set to `mse`.
-        num_episodes : int
+        num_episodes : `int`
             Total number of episodes to train. By default it is set to `50`.
-        num_steps : int
+        num_steps : `int`
             Steps per episode. By default it is set to `100`. If `num_steps` is less than `self.env.maxlen`, then the second will be chosen.
-        gamma : float
+        gamma : `float`
             Discount factor. By default it is set to `0.7`.
-        batch_size : int
+        batch_size : `int`
             Size of training batches. By default it is set to `32`.
-        patience : int
+        patience : `int`
             How many episodes to wait for improvement.
-        alpha : float
+        alpha : `float`
             Trade-off factor between loss and reward.
         """
         rewards = []
likelihood/pipes.py (new file)
@@ -0,0 +1,355 @@
+import json
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+
+from likelihood.tools import generate_html_pipeline
+from likelihood.tools.impute import SimpleImputer
+from likelihood.tools.models_tools import TransformRange, remove_collinearity
+from likelihood.tools.tools import DataFrameEncoder, DataScaler, LinearRegression, OneHotEncoder
+
+
+class Pipeline:
+    def __init__(self, config_path: str):
+        """
+        Initialize the pipeline with a JSON configuration file.
+
+        Parameters
+        ----------
+        config_path : str
+            Path to the JSON config defining target column and preprocessing steps.
+        """
+        self.config = self._load_config(config_path)
+        self.target_col = self.config["target_column"]
+        self.steps = self.config["preprocessing_steps"]
+        self.compute_importance = self.config.get("compute_feature_importance", False)
+        self.fitted_components: Dict[str, object] = {}
+        self.columns_bin_sizes: Dict[str, int] | None = None
+
+    def _load_config(self, config_path: str) -> Dict:
+        """Load and validate the JSON configuration."""
+        with open(config_path, "r") as f:
+            config = json.load(f)
+
+        assert "target_column" in config, "Config must specify 'target_column'"
+        assert "preprocessing_steps" in config, "Config must specify 'preprocessing_steps'"
+        return config
+
+    def fit(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, np.ndarray, Optional[np.ndarray]]:
+        """
+        Fit preprocessing components on the input DataFrame and return cleaned X/y.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            Input data with features + target column.
+
+        Returns
+        -------
+        X : pd.DataFrame
+            Cleaned feature matrix.
+        y : np.ndarray
+            Target vector (from self.target_col).
+        importances : Optional[np.ndarray]
+            Feature importance scores (if compute_feature_importance=True).
+        """
+        y = df[self.target_col].values
+        X = df.drop(columns=[self.target_col]).copy()
+
+        initial_info = {
+            "shape": X.shape,
+            "columns": list(X.columns),
+            "dtypes": X.dtypes.apply(lambda x: x.name).to_dict(),
+            "missing_values": X.isnull().sum().to_dict(),
+        }
+
+        steps_info = []
+        for step in self.steps:
+            step_name = step["name"]
+            params = step.get("params", {})
+            step_info = {
+                "step_name": step_name,
+                "parameters": params,
+                "description": self._get_step_description(step_name),
+            }
+            step_info["input_columns"] = list(X.columns)
+
+            X = self._apply_step(step_name, X, fit=True, **params)
+
+            step_info["output_shape"] = X.shape
+            step_info["output_columns"] = list(X.columns)
+            step_info["output_dtypes"] = X.dtypes.apply(lambda x: x.name).to_dict()
+
+            steps_info.append(step_info)
+
+        final_info = {
+            "shape": X.shape,
+            "columns": list(X.columns),
+            "dtypes": X.dtypes.apply(lambda x: x.name).to_dict(),
+            "missing_values": X.isnull().sum().to_dict(),
+        }
+
+        self.documentation = {
+            "initial_dataset": initial_info,
+            "processing_steps": steps_info,
+            "final_dataset": final_info,
+        }
+
+        importances = None
+        if self.compute_importance:
+            numeric_X = X.select_dtypes(include=["float"])
+            numeric_columns = numeric_X.columns.tolist()
+            model = LinearRegression()
+            model.fit(numeric_X.T.values, y)
+            importances = model.get_importances()
+            df_scores = pd.DataFrame([importances], columns=numeric_columns)
+            df_scores_abs = df_scores.abs()
+            df_scores_norm = (
+                df_scores_abs / df_scores_abs.to_numpy().sum()
+                if isinstance(importances, np.ndarray)
+                else pd.DataFrame()
+            )
+        return X, y, df_scores_norm
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Apply fitted preprocessing steps to new data (no target column needed).
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            New data to transform.
+
+        Returns
+        -------
+        X_transformed : pd.DataFrame
+            Cleaned feature matrix.
+        """
+        X = df.copy()
+        for step_name, _ in self.fitted_components.items():
+            X = self._apply_step(step_name, X, fit=False)
+
+        return X
+
+    def get_doc(
+        self, save_to_file: bool = True, file_name: str = "data_processing_report.html"
+    ) -> None:
+        """
+        Generate an HTML report from `self.documentation` for pipeline documentation.
+
+        Parameters
+        ----------
+        save_to_file : bool, optional
+            Whether to save generated HTML content to a file. Default is True.
+        file_name : str, optional
+            Filename for output when `save_to_file` is True. Default is "data_processing_report.html".
+        """
+
+        generate_html_pipeline(self.documentation, save_to_file=save_to_file, file_name=file_name)
+
+    def _apply_step(self, step_name: str, X: pd.DataFrame, fit: bool, **params) -> pd.DataFrame:
+        """Dispatch to the correct handler for a preprocessing step."""
+        handlers = {
+            "DataScaler": self._handle_datascaler,
+            "DataFrameEncoder": self._handle_dataframeencoder,
+            "remove_collinearity": self._handle_remove_collinearity,
+            "TransformRange": self._handle_transformrange,
+            "OneHotEncoder": self._handle_onehotencoder,
+            "SimpleImputer": self._handle_simpleimputer,
+        }
+
+        if step_name not in handlers:
+            raise ValueError(
+                f"Step '{step_name}' not supported. Supported steps: {list(handlers.keys())}"
+            )
+
+        return handlers[step_name](X, fit=fit, **params)
+
+    def _get_step_description(self, step_name: str) -> str:
+        """Return a description of what each preprocessing step does."""
+        descriptions = {
+            "DataScaler": "Scales numerical features using normalization",
+            "DataFrameEncoder": "Encodes categorical variables and normalizes to numerical features",
+            "remove_collinearity": "Removes highly correlated features to reduce multicollinearity",
+            "TransformRange": "Bins continuous features into discrete ranges",
+            "OneHotEncoder": "Converts categorical variables into binary variables",
+            "SimpleImputer": "Handles missing values by imputing with multiple linear regression strategies",
+        }
+
+        return descriptions.get(step_name, f"Unknown preprocessing step: {step_name}")
+
+    # ------------------------------ Step Handlers ------------------------------
+    def _handle_datascaler(self, X: pd.DataFrame, fit: bool, n: int = 1) -> pd.DataFrame:
+        """Handle DataScaler (fits on training data, applies to all)."""
+        numeric_X = X.select_dtypes(include=["float"])
+        numeric_columns = numeric_X.columns.tolist()
+        n = None if n == 0 else n
+        if fit:
+            scaler = DataScaler(numeric_X.values.T, n=n)
+            self.fitted_components["DataScaler"] = scaler
+            numeric_X = pd.DataFrame(scaler.rescale().T, columns=numeric_X.columns)
+        else:
+            scaler = self.fitted_components["DataScaler"]
+            numeric_X = pd.DataFrame(
+                scaler.rescale(numeric_X.values.T).T, columns=numeric_X.columns
+            )
+        for col in numeric_columns:
+            X[col] = numeric_X[col]
+        return X
+
+    def _handle_dataframeencoder(
+        self, X: pd.DataFrame, fit: bool, norm_method: str = "mean"
+    ) -> pd.DataFrame:
+        """Handle DataFrameEncoder (fits encoders/normalizers)."""
+        if fit:
+            encoder = DataFrameEncoder(X)
+            encoded_X = encoder.encode(norm_method=norm_method)
+            self.fitted_components["DataFrameEncoder"] = encoder
+            return encoded_X
+        else:
+            encoder = self.fitted_components["DataFrameEncoder"]
+            encoder._df = X
+            return encoder.encode()
+
+    def _handle_remove_collinearity(
+        self, X: pd.DataFrame, fit: bool, threshold: float = 0.9
+    ) -> pd.DataFrame:
+        """Handle collinearity removal (fits by selecting columns to drop)."""
+        numeric_X = X.select_dtypes(include=["float"])
+        numeric_columns = numeric_X.columns.tolist()
+        categorical_columns = set(X.columns) - set(numeric_columns)
+        if fit:
+            cleaned_X = remove_collinearity(numeric_X, threshold=threshold)
+            dropped_cols = set(X.columns) - set(cleaned_X.columns) - categorical_columns
+            self.fitted_components["remove_collinearity"] = dropped_cols
+            return X.drop(columns=dropped_cols)
+        else:
+            dropped_cols = self.fitted_components["remove_collinearity"]
+            return X.drop(columns=dropped_cols)
+
+    def _handle_transformrange(
+        self, X: pd.DataFrame, fit: bool, columns_bin_sizes: Dict[str, int] | None = None
+    ) -> pd.DataFrame:
+        """Handle TransformRange (bin numerical features into ranges)."""
+        if fit:
+            transformer = TransformRange(columns_bin_sizes)
+            cleaned_X = transformer.transform(X)
+            self.fitted_components["TransformRange"] = transformer
+            self.columns_bin_sizes = columns_bin_sizes
+            return cleaned_X
+        else:
+            transformer = self.fitted_components["TransformRange"]
+            return transformer.transform(X, fit=False)
+
+    def _handle_onehotencoder(
+        self, X: pd.DataFrame, fit: bool, columns: List[str] | None = None
+    ) -> pd.DataFrame:
+        """Handle OneHotEncoder (fits on categorical columns)."""
+        if fit:
+            tmp_df = X.drop(columns=columns)
+            encoder = OneHotEncoder()
+            category_to_indices = {}
+            for col in columns:
+                unique_values = X[col].unique()
+                category_to_indices[col] = {value: i for i, value in enumerate(unique_values)}
+                encoded_X = encoder.encode(
+                    X[col].values
+                    if isinstance(unique_values[0], int)
+                    else X[col].map(category_to_indices[col])
+                )
+                tmp_df = pd.concat([tmp_df, pd.DataFrame(encoded_X, columns=unique_values)], axis=1)
+            self.fitted_components["OneHotEncoder"] = (encoder, columns, category_to_indices)
+        else:
+            encoder, columns, category_to_indices = self.fitted_components["OneHotEncoder"]
+            tmp_df = X.drop(columns=columns)
+            for col in columns:
+                unique_values = list(category_to_indices[col].keys())
+                encoded_X = encoder.encode(
+                    (
+                        X[col].values
+                        if isinstance(unique_values[0], int)
+                        else X[col].map(category_to_indices[col])
+                    ),
+                    fit=False,
+                )
+                tmp_df = pd.concat([tmp_df, pd.DataFrame(encoded_X, columns=unique_values)], axis=1)
+        return tmp_df
+
+    def _handle_simpleimputer(
+        self,
+        X: pd.DataFrame,
+        fit: bool,
+        use_scaler: bool = False,
+        boundary: bool = True,
+    ) -> pd.DataFrame:
+        "Handle SimpleImputer (fit on numerical and categorical columns)."
+        if fit:
+            use_scaler = True if use_scaler == 1 else False
+            imputer = SimpleImputer(use_scaler=use_scaler)
+            tmp_df = imputer.fit_transform(X, boundary=boundary)
+            self.fitted_components["SimpleImputer"] = imputer
+            return tmp_df
+        else:
+            imputer = self.fitted_components["SimpleImputer"]
+            return imputer.transform(X, boundary=boundary)
+
+    def save(self, filepath: str) -> None:
+        """
+        Save the fitted pipeline state to a file using pickle.
+
+        Parameters
+        ----------
+        filepath : str
+            Path where the serialized pipeline will be saved.
+        """
+        import pickle
+
+        save_dict = {
+            "config": self.config,
+            "fitted_components": self.fitted_components,
+            "target_col": self.target_col,
+            "steps": self.steps,
+            "compute_importance": self.compute_importance,
+            "columns_bin_sizes": self.columns_bin_sizes,
+            "documentation": self.documentation,
+        }
+
+        filepath = filepath + ".pkl" if not filepath.endswith(".pkl") else filepath
+
+        with open(filepath, "wb") as f:
+            pickle.dump(save_dict, f)
+
+    @classmethod
+    def load(cls, filepath: str) -> "Pipeline":
+        """
+        Load a fitted pipeline from a file.
+
+        Parameters
+        ----------
+        filepath : str
+            Path to the serialized pipeline file.
+
+        Returns
+        -------
+        pipeline : Pipeline
+            Reconstructed pipeline instance with fitted components.
+        """
+        import pickle
+
+        filepath = filepath + ".pkl" if not filepath.endswith(".pkl") else filepath
+
+        with open(filepath, "rb") as f:
+            save_dict = pickle.load(f)
+
+        pipeline = cls.__new__(cls)
+
+        pipeline.config = save_dict["config"]
+        pipeline.fitted_components = save_dict["fitted_components"]
+        pipeline.target_col = save_dict["target_col"]
+        pipeline.steps = save_dict["steps"]
+        pipeline.compute_importance = save_dict["compute_importance"]
+        pipeline.columns_bin_sizes = save_dict["columns_bin_sizes"]
+        pipeline.documentation = save_dict["documentation"]
+
+        return pipeline
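For orientation, a minimal usage sketch of the new Pipeline class follows. The config keys and step names come straight from _load_config and _apply_step above; the column names, file path, and step parameters are hypothetical. Note that, as published, fit() returns df_scores_norm, which is only assigned when compute_feature_importance is enabled, so the sketch turns that flag on.

import json

import numpy as np
import pandas as pd

from likelihood.pipes import Pipeline

# Hypothetical config; every "name" must be a handler registered in _apply_step.
config = {
    "target_column": "price",
    "preprocessing_steps": [
        {"name": "SimpleImputer", "params": {"use_scaler": False}},
        {"name": "remove_collinearity", "params": {"threshold": 0.9}},
        {"name": "DataScaler", "params": {"n": 1}},
    ],
    "compute_feature_importance": True,
}
with open("pipeline_config.json", "w") as f:
    json.dump(config, f)

df = pd.DataFrame(
    {
        "rooms": [2.0, 3.0, np.nan, 4.0],
        "area": [50.0, 75.0, 80.0, 100.0],
        "price": [120.0, 180.0, 190.0, 250.0],
    }
)

pipe = Pipeline("pipeline_config.json")
X, y, scores = pipe.fit(df)                 # fits each step, documents it, returns cleaned X/y
pipe.get_doc(save_to_file=True)             # writes data_processing_report.html
X_new = pipe.transform(df.drop(columns=["price"]))  # replays fitted steps on new data
pipe.save("my_pipeline")                    # pickles config + fitted components
restored = Pipeline.load("my_pipeline")     # rebuilds the pipeline without re-fitting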
likelihood/tools/models_tools.py
@@ -11,7 +11,7 @@ logging.getLogger("tensorflow").setLevel(logging.ERROR)
 import sys
 import warnings
 from functools import wraps
-from typing import Dict
+from typing import Dict, List, Optional, Tuple, Union

 import numpy as np
 import tensorflow as tf
@@ -40,6 +40,214 @@ def suppress_warnings(func):
     return wrapper


+class TransformRange:
+    """
+    Generates a new DataFrame with ranges represented as strings.
+
+    Transforms numerical columns into categorical range bins with descriptive labels.
+    """
+
+    def __init__(self, columns_bin_sizes: Dict[str, int]) -> None:
+        """Initializes the class with the original DataFrame.
+
+        Parameters
+        ----------
+        columns_bin_sizes : `dict`
+            A dictionary where the keys are column names and the values are the bin sizes.
+
+        Raises
+        ------
+        TypeError
+            If df is not a pandas DataFrame.
+        """
+        self.info = {}
+        self.columns_bin_sizes = columns_bin_sizes
+
+    def _create_bins_and_labels(
+        self, min_val: Union[int, float], max_val: Union[int, float], bin_size: int
+    ) -> Tuple[np.ndarray, List[str]]:
+        """
+        Creates the bin edges and their labels.
+
+        Parameters
+        ----------
+        min_val : `int` or `float`
+            The minimum value for the range.
+        max_val : `int` or `float`
+            The maximum value for the range.
+        bin_size : `int`
+            The size of each bin.
+
+        Returns
+        -------
+        bins : `np.ndarray`
+            The bin edges.
+        labels : `list`
+            The labels for the bins.
+
+        Raises
+        ------
+        ValueError
+            If bin_size is not positive or if min_val >= max_val.
+        """
+        if bin_size <= 0:
+            raise ValueError("bin_size must be positive")
+        if min_val >= max_val:
+            raise ValueError("min_val must be less than max_val")
+
+        start = int(min_val)
+        end = int(max_val) + bin_size
+
+        bins = np.arange(start, end + 1, bin_size)
+
+        if bins[-1] <= max_val:
+            bins = np.append(bins, max_val + 1)
+
+        lower_bin_edge = -np.inf
+        upper_bin_edge = np.inf
+
+        labels = [f"{int(bins[i])}-{int(bins[i+1] - 1)}" for i in range(len(bins) - 1)]
+        end = int(bins[-1] - 1)
+        bins = bins.tolist()
+        bins.insert(0, lower_bin_edge)
+        bins.append(upper_bin_edge)
+        labels.insert(0, f"< {start}")
+        labels.append(f"> {end}")
+        return bins, labels
+
+    def _transform_column_to_ranges(
+        self, df: pd.DataFrame, column: str, bin_size: int, fit: bool = True
+    ) -> pd.Series:
+        """
+        Transforms a column in the DataFrame into range bins.
+
+        Parameters
+        ----------
+        df : `pd.DataFrame`
+            The original DataFrame to transform.
+        column : `str`
+            The name of the column to transform.
+        bin_size : `int`
+            The size of each bin.
+
+        Returns
+        -------
+        `pd.Series`
+            A Series with the range labels.
+
+        Raises
+        ------
+        KeyError
+            If column is not found in the DataFrame.
+        ValueError
+            If bin_size is not positive or if column contains non-numeric data.
+        """
+        if not isinstance(df, pd.DataFrame):
+            raise TypeError("df must be a pandas DataFrame")
+        df_ = df.copy()  # Create a copy to avoid modifying the original
+        numeric_series = pd.to_numeric(df_[column], errors="coerce")
+        if fit:
+            self.df = df_.copy()
+            if column not in df_.columns:
+                raise KeyError(f"Column '{column}' not found in DataFrame")
+
+            if bin_size <= 0:
+                raise ValueError("bin_size must be positive")
+
+            if numeric_series.isna().all():
+                raise ValueError(f"Column '{column}' contains no valid numeric data")
+
+            min_val = numeric_series.min()
+            max_val = numeric_series.max()
+
+            if min_val == max_val:
+                return pd.Series(
+                    [f"{int(min_val)}-{int(max_val)}"] * len(df_), name=f"{column}_range"
+                )
+            self.info[column] = {"min_value": min_val, "max_value": max_val, "range": bin_size}
+        else:
+            min_val = self.info[column]["min_value"]
+            max_val = self.info[column]["max_value"]
+            bin_size = self.info[column]["range"]
+
+        bins, labels = self._create_bins_and_labels(min_val, max_val, bin_size)
+        return pd.cut(numeric_series, bins=bins, labels=labels, right=False, include_lowest=True)
+
+    def transform(
+        self, df: pd.DataFrame, drop_original: bool = False, fit: bool = True
+    ) -> pd.DataFrame:
+        """
+        Creates a new DataFrame with range columns.
+
+        Parameters
+        ----------
+        df : `pd.DataFrame`
+            The original DataFrame to transform.
+        drop_original : `bool`, optional
+            If True, drops original columns from the result, by default False
+        fit : `bool`, default=True
+            Whether to compute bin edges based on the data (True) or use predefined binning (False).
+
+        Returns
+        -------
+        `pd.DataFrame`
+            A DataFrame with the transformed range columns.
+
+        Raises
+        ------
+        TypeError
+            If columns_bin_sizes is not a dictionary.
+        """
+        if not isinstance(self.columns_bin_sizes, dict):
+            raise TypeError("columns_bin_sizes must be a dictionary")
+
+        if not self.columns_bin_sizes:
+            return pd.DataFrame()
+
+        range_columns = {}
+        for column, bin_size in self.columns_bin_sizes.items():
+            range_columns[f"{column}_range"] = self._transform_column_to_ranges(
+                df, column, bin_size, fit
+            )
+
+        result_df = pd.DataFrame(range_columns)
+
+        if not drop_original:
+            original_cols = [col for col in df.columns if col not in self.columns_bin_sizes]
+            if original_cols:
+                result_df = pd.concat([df[original_cols], result_df], axis=1)
+
+        return result_df
+
+    def get_range_info(self, column: str) -> Dict[str, Union[int, float, List[str]]]:
+        """
+        Get information about the range transformation for a specific column.
+
+        Parameters
+        ----------
+        column : `str`
+            The name of the column to analyze.
+
+        Returns
+        -------
+        `dict`
+            Dictionary containing min_val, max_val, bin_size, and labels.
+        """
+        if column not in self.df.columns:
+            raise KeyError(f"Column '{column}' not found in DataFrame")
+
+        numeric_series = pd.to_numeric(self.df[column], errors="coerce")
+        min_val = numeric_series.min()
+        max_val = numeric_series.max()
+
+        return {
+            "min_value": min_val,
+            "max_value": max_val,
+            "range": max_val - min_val,
+            "column": column,
+        }
+
+
 def remove_collinearity(df: DataFrame, threshold: float = 0.9):
     """
     Removes highly collinear features from the DataFrame based on a correlation threshold.
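To make the new TransformRange concrete, a short sketch follows; the DataFrame and bin sizes are hypothetical. transform() adds a "<column>_range" column of string bins, with open-ended "< start" and "> end" buckets produced by _create_bins_and_labels, and fit=False replays the edges stored in self.info.

import pandas as pd

from likelihood.tools.models_tools import TransformRange

df = pd.DataFrame({"age": [5, 23, 41, 67], "city": ["A", "B", "A", "C"]})

tr = TransformRange({"age": 20})   # bin the "age" column in steps of 20
binned = tr.transform(df)          # adds "age_range" (e.g. "5-24"), keeps "city"
print(binned["age_range"].tolist())

# Replay the fitted bin edges on unseen data without re-computing min/max.
new = tr.transform(pd.DataFrame({"age": [30]}), fit=False)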
@@ -56,8 +264,8 @@ def remove_collinearity(df: DataFrame, threshold: float = 0.9):
         The correlation threshold above which features will be removed. Default is `0.9`.

     Returns
-
-        DataFrame: A DataFrame with highly collinear features removed.
+    -------
+    DataFrame : A DataFrame with highly collinear features removed.
     """
     corr_matrix = df.corr().abs()
     upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
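remove_collinearity itself is unchanged apart from the docstring; for reference, a small sketch with a hypothetical frame in which two features are almost perfectly correlated:

import numpy as np
import pandas as pd

from likelihood.tools.models_tools import remove_collinearity

rng = np.random.default_rng(0)
x = np.linspace(0.0, 1.0, 100)
df = pd.DataFrame({"a": x, "b": 2.0 * x + 0.01, "c": rng.random(100)})

cleaned = remove_collinearity(df, threshold=0.9)
print(cleaned.columns.tolist())  # one of the correlated pair "a"/"b" is dropped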
@@ -97,11 +305,11 @@ def train_and_insights(
         Fraction of data to use (default is 1.0).

     Keyword Arguments:
-
+    ------------------
     Additional keyword arguments passed to the `model.fit` function, such as validation split and callbacks.

     Returns
-
+    -------
     `tf.keras.Model`
         The trained model after fitting.
     """
@@ -207,7 +415,7 @@ def graph_metrics(adj_matrix: np.ndarray, eigenvector_threshold: float = 1e-6) -
         A threshold for the eigenvector centrality calculation, used to determine the cutoff for small eigenvalues. Default is `1e-6`.

     Returns
-
+    -------
     DataFrame : A DataFrame containing the following graph metrics as columns.
         - `Degree Centrality`: Degree centrality values for each node, indicating the number of direct connections each node has.
        - `Clustering Coefficient`: Clustering coefficient values for each node, representing the degree to which nodes cluster together.
@@ -218,7 +426,7 @@ def graph_metrics(adj_matrix: np.ndarray, eigenvector_threshold: float = 1e-6) -
         - `Assortativity`: The assortativity coefficient of the graph, measuring the tendency of nodes to connect to similar nodes.

     Notes
-
+    -----
     The returned DataFrame will have one row for each node and one column for each of the computed metrics.
     """
     adj_matrix = adj_matrix.astype(int)
@@ -251,3 +459,7 @@ def graph_metrics(adj_matrix: np.ndarray, eigenvector_threshold: float = 1e-6) -
     metrics_df["Assortativity"] = assortativity

     return metrics_df
+
+
+if __name__ == "__main__":
+    pass
likelihood/tools/numeric_tools.py
@@ -154,7 +154,7 @@ def xicor(X: np.ndarray, Y: np.ndarray, ties: bool = True, random_seed: int = None
         The first variable to be correlated. Must have at least one dimension.
     Y : `np.ndarray`
         The second variable to be correlated. Must have at least one dimension.
-    ties : bool
+    ties : `bool`
         Whether to handle ties using randomization.
     random_seed : int, optional
         Seed for the random number generator for reproducibility.
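A usage sketch for xicor with synthetic arrays; the dependence below is monotone, so the coefficient should come out high, and random_seed pins down the tie-breaking randomization documented above. The arrays here are hypothetical.

import numpy as np

from likelihood.tools.numeric_tools import xicor

rng = np.random.default_rng(0)
X = rng.normal(size=200)
Y = X**3 + 0.1 * rng.normal(size=200)  # noisy but monotone relationship

xi = xicor(X, Y, ties=True, random_seed=42)
print(xi)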
@@ -356,9 +356,9 @@ def find_multiples(target: int) -> tuple[int, int] | None:
     Returns
     -------
     tuple[int, int] | None
-        If i and i+1 both divide target, returns (i, i+1).
-        Otherwise, returns (i, target // i)
-        Returns None if no factors are found.
+        If `i` and `i+1` both divide target, returns (i, i+1).
+        Otherwise, returns `(i, target // i)`.
+        Returns `None` if no factors are found.
     """
     for i in range(2, target + 1):
         if target % i == 0:
likelihood/tools/reports.py (new file)
@@ -0,0 +1,195 @@
+from html import escape
+from IPython.display import HTML, display
+
+
+def generate_html_pipeline(data_dict, save_to_file=False, file_name="data_processing_report.html"):
+    css_js = """
+    <style>
+    :root {
+        --primary: #0d9488;
+        --primary-dark: #0f766e;
+        --success: #10b981;
+        --accent: #3b82f6;
+        --card-bg: #ffffff;
+        --shadow-sm: 0 2px 6px rgba(0, 0, 0, 0.03);
+        --border-radius-md: 6px;
+    }
+
+    * {
+        box-sizing: border-box;
+    }
+
+    body {
+        font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
+        background: #f8fafc;
+        color: #1e293b;
+        margin: 0;
+        padding: 1rem;
+        font-size: 14px;
+    }
+
+    h2 {
+        background: linear-gradient(135deg, var(--primary), var(--primary-dark));
+        color: white;
+        text-align: center;
+        padding: 1rem;
+        border-radius: var(--border-radius-md);
+        font-weight: 600;
+        font-size: 1.5rem;
+        margin-bottom: 1.5rem;
+    }
+
+    section {
+        background: var(--card-bg);
+        border-radius: var(--border-radius-md);
+        padding: 1rem;
+        box-shadow: var(--shadow-sm);
+        margin-bottom: 1.2rem;
+    }
+
+    h3 {
+        color: var(--primary-dark);
+        font-weight: 600;
+        font-size: 1.2rem;
+        border-left: 4px solid var(--success);
+        padding-left: 0.8rem;
+        margin: 1rem 0 0.8rem;
+    }
+
+    table {
+        width: 100%;
+        border-collapse: collapse;
+        font-size: 13px;
+        margin: 0.5rem 0 1rem;
+    }
+
+    th, td {
+        padding: 0.5rem 0.75rem;
+        text-align: left;
+        border-bottom: 1px solid #e2e8f0;
+        vertical-align: top;
+    }
+
+    thead {
+        background-color: #f0fdf4;
+    }
+
+    tbody tr:nth-child(odd) {
+        background-color: #f9fafb;
+    }
+
+    tbody tr:hover {
+        background-color: #e0f2fe;
+    }
+
+    .nested-table {
+        font-size: 12px;
+        margin-top: 0.5rem;
+    }
+
+    details {
+        margin-bottom: 0.8rem;
+        padding: 0.5rem 0.8rem;
+        background: #f9f9f9;
+        border-radius: var(--border-radius-md);
+    }
+
+    summary {
+        font-weight: 600;
+        font-size: 1rem;
+        color: var(--primary-dark);
+        cursor: pointer;
+    }
+
+    summary::before {
+        content: "▶";
+        margin-right: 6px;
+        color: var(--success);
+        font-size: 0.9rem;
+    }
+
+    @media (max-width: 768px) {
+        body {
+            font-size: 13px;
+        }
+
+        h2 {
+            font-size: 1.3rem;
+            padding: 0.8rem;
+        }
+
+        h3 {
+            font-size: 1.1rem;
+        }
+
+        table, .nested-table {
+            font-size: 12px;
+        }
+    }
+    </style>
+    """
+
+    def render_value(val):
+        if isinstance(val, dict):
+            return dict_to_table(val, nested=True)
+        elif isinstance(val, list):
+            if all(isinstance(item, (str, int, float)) for item in val):
+                return ", ".join(escape(str(x)) for x in val)
+            else:
+                return "<ul>" + "".join(f"<li>{render_value(v)}</li>" for v in val) + "</ul>"
+        else:
+            return escape(str(val))
+
+    def dict_to_table(d, title=None, nested=False):
+        html = ""
+        if title and not nested:
+            html += f"<h4>{escape(title)}</h4>"
+        table_class = "nested-table" if nested else "table"
+        html += f"<table class='{table_class}'>"
+        html += "<thead><tr><th>Key</th><th>Value</th></tr></thead><tbody>"
+        for key, val in d.items():
+            key_html = escape(str(key))
+            val_html = render_value(val)
+            html += f"<tr><td>{key_html}</td><td>{val_html}</td></tr>"
+        html += "</tbody></table>"
+        return html
+
+    html_content = css_js
+    html_content += "<h2>📈 Data Processing Report</h2>"
+
+    html_content += "<section>"
+    html_content += "<h3>📁 Initial Dataset</h3>"
+    html_content += dict_to_table(data_dict["initial_dataset"])
+    html_content += "</section>"
+
+    html_content += "<section>"
+    html_content += "<h3>🔧 Processing Steps</h3>"
+    for i, step in enumerate(data_dict["processing_steps"]):
+        html_content += "<details open>"
+        html_content += f"<summary>Step {i + 1}: {escape(step['step_name'])}</summary>"
+        html_content += f"<p><strong>Description:</strong> {escape(step['description'])}</p>"
+        html_content += dict_to_table(step["parameters"], title="Parameters", nested=True)
+        html_content += dict_to_table(
+            {
+                "Output Shape": step["output_shape"],
+                "Input Columns": step["input_columns"],
+                "Output Columns": step["output_columns"],
+                "Output Dtypes": step["output_dtypes"],
+            },
+            title="Output Info",
+            nested=True,
+        )
+        html_content += "</details>"
+    html_content += "</section>"
+
+    html_content += "<section>"
+    html_content += "<h3>✅ Final Dataset</h3>"
+    html_content += dict_to_table(data_dict["final_dataset"])
+    html_content += "</section>"
+
+    if save_to_file:
+        with open(file_name, "w", encoding="utf-8") as f:
+            f.write(html_content)
+        print(f"✅ Report saved to '{file_name}'")
+    else:
+        display(HTML(html_content))
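A minimal sketch of calling generate_html_pipeline directly. The dictionary shape mirrors the keys the function reads ("initial_dataset", "processing_steps", "final_dataset", plus the per-step fields); all values below are hypothetical.

from likelihood.tools.reports import generate_html_pipeline

doc = {
    "initial_dataset": {"shape": (4, 2), "columns": ["rooms", "area"]},
    "processing_steps": [
        {
            "step_name": "DataScaler",
            "description": "Scales numerical features using normalization",
            "parameters": {"n": 1},
            "output_shape": (4, 2),
            "input_columns": ["rooms", "area"],
            "output_columns": ["rooms", "area"],
            "output_dtypes": {"rooms": "float64", "area": "float64"},
        }
    ],
    "final_dataset": {"shape": (4, 2), "columns": ["rooms", "area"]},
}

# save_to_file=True writes the HTML report; False displays it in a notebook.
generate_html_pipeline(doc, save_to_file=True, file_name="report.html")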
likelihood/tools/tools.py
@@ -2,7 +2,7 @@ import math
 import os
 import pickle
 import warnings
-from typing import Callable, Dict, List, Tuple, Union
+from typing import Callable, Dict, Generator, List, Tuple, Union

 import matplotlib.pyplot as plt
 import numpy as np
@@ -25,7 +25,7 @@ Data Science from Scratch, Second Edition, by Joel Grus (O'Reilly).Copyright 201
 """


-def minibatches(dataset: List, batch_size: int, shuffle: bool = True) ->
+def minibatches(dataset: List, batch_size: int, shuffle: bool = True) -> Generator:
     """Generates 'batch_size'-sized minibatches from the dataset

     Parameters
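The corrected annotation makes the generator contract explicit; a small sketch, assuming the generator yields batches of at most batch_size items from a hypothetical dataset:

from likelihood.tools.tools import minibatches

dataset = list(range(10))
for batch in minibatches(dataset, batch_size=4, shuffle=False):
    print(batch)  # 'batch_size'-sized minibatches drawn from the dataset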
@@ -660,7 +660,7 @@ class DataScaler:

     __slots__ = ["dataset_", "_n", "data_scaled", "values", "inv_fitting"]

-    def __init__(self, dataset: np.ndarray, n: int = 1) -> None:
+    def __init__(self, dataset: np.ndarray, n: int | None = 1) -> None:
         """Initializes the parameters required for scaling the data"""
         self.dataset_ = dataset.copy()
         self._n = n
@@ -861,7 +861,7 @@ class DataFrameEncoder:
         """Encodes the `object` type columns of the dataframe

         Keyword Arguments:
-
+        ------------------
         - save_mode (`bool`): An optional integer parameter. By default it is set to `True`
         - dictionary_name (`str`): An optional string parameter. By default it is set to `labelencoder_dictionary`
         - norm_method (`str`): An optional string parameter to perform normalization. By default it is set to `None`
@@ -1024,20 +1024,21 @@ class OneHotEncoder:
     It receives an array of integers and returns a binary array using the one-hot encoding method.
     """

-    __slots__ = ["
+    __slots__ = ["num_categories"]

     def __init__(self) -> None:
         pass

-    def encode(self, x: np.ndarray | list):
-
-
-
-
+    def encode(self, x: np.ndarray | list, fit: bool = True):
+        if not isinstance(x, np.ndarray):
+            x = np.array(x)
+        x = x.astype(int)
+        if fit:
+            self.num_categories = x.max() + 1

-        y = np.zeros((
+        y = np.zeros((x.size, self.num_categories))

-        y[np.arange(
+        y[np.arange(x.size), x] = 1

         return y

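The rewritten encode is now fully visible in the diff: with fit=True it learns num_categories from x.max() + 1, and with fit=False it reuses the stored width. A brief sketch with hypothetical integer labels:

from likelihood.tools.tools import OneHotEncoder

enc = OneHotEncoder()
train = enc.encode([0, 2, 1])          # learns num_categories = 3
test = enc.encode([1, 0], fit=False)   # encodes with the already-fitted width
print(train)                           # 3x3 binary matrix with a single 1 per row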
@@ -1189,7 +1190,9 @@ def check_nan_inf(df: DataFrame, verbose: bool = False) -> DataFrame:
     if nan_values:
         (
             print(
-                "UserWarning: Some rows may have been deleted due to the existence of NaN values."
+                "UserWarning: Some rows may have been deleted due to the existence of NaN values.",
+                f"NaN values removed: ",
+                "{:,}".format(nan_count),
             )
             if verbose
             else None
@@ -1199,7 +1202,9 @@ def check_nan_inf(df: DataFrame, verbose: bool = False) -> DataFrame:
     if inf_values:
         (
             print(
-                "UserWarning: Some rows may have been deleted due to the existence of Inf values."
+                "UserWarning: Some rows may have been deleted due to the existence of Inf values.",
+                f"Infinite values removed: ",
+                "{:,}".format(inf_count),
             )
             if verbose
             else None
@@ -1207,9 +1212,6 @@ def check_nan_inf(df: DataFrame, verbose: bool = False) -> DataFrame:
     df.replace([np.inf, -np.inf], np.nan, inplace=True)
     df.dropna(inplace=True)

-    print(f"NaN values removed: ", "{:,}".format(nan_count))
-    print(f"Infinite values removed: ", "{:,}".format(inf_count))
-
     return df

likelihood.egg-info/SOURCES.txt
@@ -3,6 +3,7 @@ README.md
 setup.py
 likelihood/__init__.py
 likelihood/main.py
+likelihood/pipes.py
 likelihood.egg-info/PKG-INFO
 likelihood.egg-info/SOURCES.txt
 likelihood.egg-info/dependency_links.txt
@@ -30,4 +31,5 @@ likelihood/tools/figures.py
 likelihood/tools/impute.py
 likelihood/tools/models_tools.py
 likelihood/tools/numeric_tools.py
+likelihood/tools/reports.py
 likelihood/tools/tools.py