likelihood-1.5.8-py3-none-any.whl → likelihood-2.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- likelihood/graph/__init__.py +8 -0
- likelihood/graph/_nn.py +421 -0
- likelihood/models/deep/__init__.py +11 -2
- likelihood/models/deep/_autoencoders.py +895 -0
- likelihood/models/deep/_predictor.py +810 -0
- likelihood/models/deep/autoencoders.py +2 -2
- likelihood/models/deep/gan.py +4 -4
- likelihood/models/deep/predictor.py +1 -0
- likelihood/models/deep/rl.py +350 -0
- likelihood/models/simulation.py +9 -4
- likelihood/tools/models_tools.py +194 -7
- likelihood/tools/numeric_tools.py +4 -4
- likelihood/tools/tools.py +8 -3
- {likelihood-1.5.8.dist-info → likelihood-2.0.1.dist-info}/METADATA +4 -3
- likelihood-2.0.1.dist-info/RECORD +30 -0
- likelihood-1.5.8.dist-info/RECORD +0 -26
- {likelihood-1.5.8.dist-info → likelihood-2.0.1.dist-info}/WHEEL +0 -0
- {likelihood-1.5.8.dist-info → likelihood-2.0.1.dist-info}/licenses/LICENSE +0 -0
- {likelihood-1.5.8.dist-info → likelihood-2.0.1.dist-info}/top_level.txt +0 -0

likelihood/models/deep/autoencoders.py
CHANGED

@@ -484,12 +484,12 @@ class AutoClassifier(tf.keras.Model):
         Sets the encoder and decoder layers from another AutoClassifier instance,
         ensuring compatibility in dimensions. Only works if vae_mode is False.

-        Parameters
+        Parameters
         ----------
         source_model : AutoClassifier
             The source model to copy the encoder and decoder layers from.

-        Raises
+        Raises
         -------
         ValueError
             If the input shape or units of the source model do not match.
likelihood/models/deep/gan.py
CHANGED

@@ -102,7 +102,7 @@ class GANRegressor(tf.keras.Model):
         Train the GAN model.

         Parameters
-
+        ----------
         X : array-like
             Input data.
         y : array-like

@@ -117,7 +117,7 @@ class GANRegressor(tf.keras.Model):
             Verbosity level. Default is 1.

         Returns
-
+        -------
         history : pd.DataFrame
             Training history.
         """

@@ -234,7 +234,7 @@ class GANRegressor(tf.keras.Model):
         Train the generator model.

         Parameters
-
+        ----------
         X_train : array-like
             Training data.
         y_train : array-like

@@ -249,7 +249,7 @@ class GANRegressor(tf.keras.Model):
             Number of epochs to wait before early stopping. Default is 3.

         Returns
-
+        -------
         history : pd.DataFrame
             Training history.
         """
likelihood/models/deep/predictor.py
CHANGED

@@ -674,6 +674,7 @@ class GetInsights:
             / (data_normalized.iloc[:, :-1].max() - data_normalized.iloc[:, :-1].min())
             - 1
         )
+        data_normalized.dropna(axis=1, inplace=True)
         radviz(data_normalized, color_column, color=self.colors)
         plt.title(title)
         plt.show()
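
Note on the added dropna call: the min-max scaling above maps each feature to [-1, 1] by dividing by (max - min), so any constant feature produces an all-NaN column, which would break radviz. A minimal standalone sketch of the effect (the toy DataFrame is invented, not package code):

import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [5.0, 5.0, 5.0]})  # "b" is constant

# The same [-1, 1] min-max scaling as in the hunk above
normalized = 2 * (df - df.min()) / (df.max() - df.min()) - 1

print(normalized["b"].isna().all())  # True: 0/0 from (max - min) == 0
normalized.dropna(axis=1, inplace=True)  # only usable columns reach radviz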

likelihood/models/deep/rl.py
ADDED

@@ -0,0 +1,350 @@
+import random
+from collections import deque
+
+import numpy as np
+import tensorflow as tf
+from packaging import version
+
+if version.parse(tf.__version__) > version.parse("2.15.0"):
+    from ._autoencoders import AutoClassifier
+else:
+    from .autoencoders import AutoClassifier
+
+
+def print_progress_bar(iteration, total, length=30):
+    percent = f"{100 * (iteration / float(total)):.1f}"
+    filled_length = int(length * iteration // total)
+    bar = "█" * filled_length + "-" * (length - filled_length)
+    print(f"\rProgress: |{bar}| {percent}% Complete", end="\r")
+    if iteration == total:
+        print()
+
+
+class Env:
+    def __init__(self, model, maxlen=100, name="likenasium"):
+        """
+        Initialize the environment with a model.
+
+        Parameters
+        ----------
+        model : Any
+            Model with `.predict()` method (e.g., Keras model).
+        maxlen : int
+            Maximum length of deque. By default it is set to `100`.
+        name : str
+            The name of the environment. By default it is set to `likenasium`.
+        """
+        self.model = model
+        self.maxlen = maxlen
+        self.transitions = deque(
+            maxlen=self.maxlen
+        )  # Stores (state, action, reward, next_action, done)
+        self.current_state = None
+        self.current_step = 0
+        self.done = False
+
+    def step(self, state, action, verbose=0):
+        """
+        Perform an environment step with the given action.
+
+        Parameters
+        ----------
+        state : `np.ndarray`
+            Current state to process (input to the model).
+        action : `int`
+            Expected action to process.
+
+        Returns
+        -------
+        `tuple` : (current_state, action_pred, reward, next_action, done)
+        """
+        if self.done:
+            return None, None, 0, None, True
+
+        # Process action through model
+        model_output = self.model.predict(state.reshape((1, -1)), verbose=verbose)
+        action_pred = np.argmax(model_output, axis=1)[0]
+        model_output[:, action_pred] = 0.0
+        next_action = np.max(model_output, axis=1)[0]  # Second most probable action
+
+        # Calculate reward (1 if correct prediction, 0 otherwise)
+        reward = 1 if action_pred == action else 0
+
+        # Update current state
+        self.current_state = state
+        self.current_step += 1
+
+        # Add transition to history
+        if self.current_step <= self.maxlen:
+            self.transitions.append(
+                (
+                    self.current_state,  # Previous state
+                    action_pred,  # Current action
+                    reward,  # Reward
+                    next_action,  # Next action
+                    self.done,  # Done flag
+                )
+            )
+        return self.current_state, action_pred, reward, next_action, self.done
+
+    def reset(self):
+        """Reset the environment to initial state."""
+        self.current_state = None
+        self.current_step = 0
+        self.done = False
+        self.transitions = deque(maxlen=self.maxlen)
+        return self.current_state
+
+    def get_transitions(self):
+        """Get all stored transitions."""
+        return self.transitions
+
+
+class AutoQL:
+    """
+    AutoQL: A reinforcement learning agent using Q-learning with Epsilon-greedy policy.
+
+    This class implements a Q-learning agent with:
+    - Epsilon-greedy policy for exploration
+    - Replay buffer for experience replay
+    - Automatic model version handling for TensorFlow
+    """
+
+    def __init__(
+        self,
+        env,
+        model,
+        maxlen=2000,
+    ):
+        """Initialize AutoQL agent
+
+        Parameters
+        ----------
+        env : `Any`
+            The environment to interact with
+        model : `tf.keras.Model`
+            The Q-network model
+        """
+
+        self.env = env
+        self.model = model
+        self.maxlen = maxlen
+        self.replay_buffer = deque(maxlen=self.maxlen)
+
+    def epsilon_greedy_policy(self, state, action, epsilon=0):
+        """
+        Epsilon-greedy policy for action selection
+
+        Parameters
+        ----------
+        state : `np.ndarray`
+            Current state.
+        action : `int`
+            Expected action to process.
+        epsilon : `float`
+            Exploration probability. By default it is set to `0`
+
+        Returns
+        -------
+        `tuple` : (state, action, reward, next_action, done)
+        """
+        current_state, value, reward, next_action, done = self.env.step(state, action)
+
+        if np.random.rand() > epsilon:
+            state = np.asarray(state).astype(np.float32)
+            return current_state, value, reward, next_action, done
+        step_ = random.sample(self.env.get_transitions(), 1)
+        _state, greedy_action, _reward, _next_action, _done = zip(*step_)
+
+        return _state[0], greedy_action[0], _reward[0], _next_action[0], _done[0]
+
+    def play_one_step(self, state, action, epsilon):
+        """
+        Perform one step in the environment and add experience to buffer
+
+        Parameters
+        ----------
+        state : `np.ndarray`
+            Current state
+        action : `int`
+            Expected action to process.
+
+        epsilon : `float`
+            Exploration probability.
+
+        Returns
+        -------
+        `tuple` : (state, action, reward, next_action, done)
+        """
+        current_state, greedy_action, reward, next_action, done = self.epsilon_greedy_policy(
+            state, action, epsilon
+        )
+
+        done = 1 if done else 0
+
+        # Add experience to replay buffer
+        self.replay_buffer.append(
+            (
+                current_state,  # Previous state
+                greedy_action,  # Current action
+                reward,  # Reward
+                next_action,  # Next action
+                done,  # Done flag
+            )
+        )
+
+        return current_state, greedy_action, reward, next_action, done
+
+    @tf.function
+    def _training_step(self):
+        """
+        Perform one training step using experience replay
+
+        Returns
+        -------
+        `float` : Training loss
+        """
+
+        batch_ = random.sample(self.replay_buffer, self.batch_size)
+        states, actions, rewards, next_actions, dones = zip(*batch_)
+        states = np.array(states).reshape(self.batch_size, -1)
+        actions = np.array(actions).reshape(
+            self.batch_size,
+        )
+        rewards = np.array(rewards).reshape(
+            self.batch_size,
+        )
+        max_next_Q_values = np.array(next_actions).reshape(self.batch_size, -1)
+        dones = np.array(dones).reshape(
+            self.batch_size,
+        )
+        target_Q_values = rewards + (1 - dones) * self.gamma * max_next_Q_values
+
+        actions = tf.convert_to_tensor(actions, dtype=tf.int32)
+        states = tf.convert_to_tensor(states, dtype=tf.float32)
+        target_Q_values = tf.convert_to_tensor(target_Q_values, dtype=tf.float32)
+
+        with tf.GradientTape() as tape:
+            all_Q_values = self.model(states)
+            indices = tf.stack([tf.range(tf.shape(actions)[0]), actions], axis=1)
+            Q_values = tf.gather_nd(all_Q_values, indices)
+            loss = tf.reduce_mean(self.loss_fn(target_Q_values, Q_values))
+        grads = tape.gradient(loss, self.model.trainable_variables)
+        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
+        return loss
+
+    def train(
+        self,
+        x_data,
+        y_data,
+        optimizer="adam",
+        loss_fn="mse",
+        num_episodes=50,
+        num_steps=100,
+        gamma=0.7,
+        batch_size=32,
+        patience=10,
+        alpha=0.01,
+    ):
+        """Train the agent for a fixed number of episodes
+
+        Parameters
+        ----------
+        optimizer : `str`
+            The optimizer for training (e.g., `sgd`). By default it is set to `adam`.
+        loss_fn : `str`
+            The loss function. By default it is set to `mse`.
+        num_episodes : `int`
+            Total number of episodes to train. By default it is set to `50`.
+        num_steps : `int`
+            Steps per episode. By default it is set to `100`. If `num_steps` is less than `self.env.maxlen`, then the second will be chosen.
+        gamma : `float`
+            Discount factor. By default it is set to `0.7`.
+        batch_size : `int`
+            Size of training batches. By default it is set to `32`.
+        patience : `int`
+            How many episodes to wait for improvement.
+        alpha : `float`
+            Trade-off factor between loss and reward.
+        """
+        rewards = []
+        self.best_weights = None
+        self.best_loss = float("inf")
+
+        optimizers = {
+            "sgd": tf.keras.optimizers.SGD(),
+            "adam": tf.keras.optimizers.Adam(),
+            "adamw": tf.keras.optimizers.AdamW(),
+            "adadelta": tf.keras.optimizers.Adadelta(),
+            "rmsprop": tf.keras.optimizers.RMSprop(),
+        }
+        self.optimizer = optimizers[optimizer]
+        losses = {
+            "mse": tf.keras.losses.MeanSquaredError(),
+            "mae": tf.keras.losses.MeanAbsoluteError(),
+            "mape": tf.keras.losses.MeanAbsolutePercentageError(),
+        }
+        self.loss_fn = losses[loss_fn]
+        self.num_episodes = num_episodes
+        self.num_steps = num_steps if num_steps >= self.env.maxlen else self.env.maxlen
+        self.gamma = gamma
+        self.batch_size = batch_size
+        loss = float("inf")
+        no_improve_count = 0
+        best_combined_metric = float("inf")
+
+        for episode in range(self.num_episodes):
+            print_progress_bar(episode + 1, self.num_episodes)
+            self.env.reset()
+            sum_rewards = 0
+            epsilon = max(1 - episode / (self.num_episodes * 0.8), 0.01)
+
+            for step in range(self.num_steps):
+                state, action, reward, next_action, done = self.play_one_step(
+                    x_data[step], y_data[step], epsilon
+                )
+                sum_rewards += reward if isinstance(reward, int) else reward[0]
+
+                # Train if buffer has enough samples
+                if len(self.replay_buffer) > self.batch_size:
+                    loss = self._training_step()
+
+                if done:
+                    break
+
+            combined_metric = loss - alpha * sum_rewards
+
+            if combined_metric < best_combined_metric:
+                best_combined_metric = combined_metric
+                self.best_weights = self.model.get_weights()
+                self.best_loss = loss
+                no_improve_count = 0  # Reset counter on improvement
+            else:
+                no_improve_count += 1
+
+            rewards.append(sum_rewards)
+
+            # Logging
+            if episode % (self.num_episodes // 10) == 0:
+                print(
+                    f"Episode: {episode}, Steps: {step+1}, Epsilon: {epsilon:.3f}, Loss: {loss:.2e}, Reward: {sum_rewards}, No Improve Count: {no_improve_count}"
+                )
+
+            # Early stopping condition
+            if no_improve_count >= patience:
+                print(
+                    f"Early stopping at episode {episode} due to no improvement in {patience} episodes."
+                )
+                break
+
+        # Save best model
+        self.model.set_weights(self.best_weights)
+
+    def __str__(self):
+        return (
+            f"AutoQL (Env: {self.env.name}, Episodes: {self.num_episodes}, Steps: {self.num_steps})"
+        )
+
+
+if __name__ == "__main__":
+    pass
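
A brief usage sketch for the new rl.py module, assuming a small Keras classifier as the Q-network; the network, data, and hyperparameters below are invented for illustration and are not part of the package:

import numpy as np
import tensorflow as tf

from likelihood.models.deep.rl import AutoQL, Env

# Hypothetical Q-network: 4 input features, 3 discrete actions
model = tf.keras.Sequential(
    [
        tf.keras.layers.Input(shape=(4,)),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(3, activation="softmax"),
    ]
)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

x_data = np.random.rand(200, 4).astype(np.float32)  # states
y_data = np.random.randint(0, 3, size=200)  # expected actions

env = Env(model, maxlen=100)
agent = AutoQL(env, model, maxlen=2000)

# Each step rewards the model's argmax action when it matches y_data and
# replays experience against targets r + (1 - done) * gamma * max_next_Q.
agent.train(x_data, y_data, optimizer="adam", loss_fn="mse", num_episodes=20)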
likelihood/models/simulation.py
CHANGED

@@ -4,11 +4,15 @@ from typing import Dict, List, Tuple, Union

 import numpy as np
 import pandas as pd
+from packaging import version
 from pandas.core.frame import DataFrame

 from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, cdf, check_nan_inf

-
+if version.parse(np.__version__) < version.parse("2.0.0"):
+    filter = np.RankWarning
+else:
+    filter = np.exceptions.RankWarning


 # --------------------------------------------------------------------------------------------------------------------------------------
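
The version gate exists because NumPy 2.0 moved RankWarning into np.exceptions; the alias is presumably used to silence rank warnings from high-degree polynomial fits elsewhere in the module. A standalone sketch of the same guard (the polyfit call is illustrative only, not package code):

import warnings

import numpy as np
from packaging import version

# RankWarning lives in np.exceptions from NumPy 2.0 onward
if version.parse(np.__version__) < version.parse("2.0.0"):
    rank_warning = np.RankWarning
else:
    rank_warning = np.exceptions.RankWarning

with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=rank_warning)
    np.polyfit([0.0, 1.0, 2.0], [1.0, 2.0, 3.0], 9)  # degree 9 on 3 points warns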
@@ -128,14 +132,15 @@ class SimulationEngine(FeatureSelection):
         )
         poly = kwargs.get("poly", 9)
         plot = kwargs.get("plot", False)
+        bandwidth = kwargs.get("bandwidth", 1.5)
         if not x[1]:
             media = self.df[key].mean()
             standard_deviation = self.df[key].std()
-            lower_limit = media -
-            upper_limit = media +
+            lower_limit = media - bandwidth * standard_deviation
+            upper_limit = media + bandwidth * standard_deviation
             if plot:
                 print(f"Cumulative Distribution Function ({key})")
-                f,
+                f, _, ox = cdf(x[0].flatten(), poly=poly, plot=plot)
             else:
                 f, ox = None, None
             least_frequent_category, most_frequent_category = categories_by_quartile(
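
The new bandwidth keyword parameterizes the band the simulation draws around the mean; with the default of 1.5 the limits are mean ± 1.5 standard deviations. A quick standalone check of that band (values invented):

import pandas as pd

values = pd.Series([10.0, 12.0, 11.0, 14.0, 13.0])
bandwidth = 1.5  # same default as SimulationEngine

media = values.mean()  # 12.0
standard_deviation = values.std()  # ~1.58 (sample std)
lower_limit = media - bandwidth * standard_deviation  # ~9.63
upper_limit = media + bandwidth * standard_deviation  # ~14.37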
likelihood/tools/models_tools.py
CHANGED

@@ -11,7 +11,7 @@ logging.getLogger("tensorflow").setLevel(logging.ERROR)
 import sys
 import warnings
 from functools import wraps
-from typing import Dict
+from typing import Dict, List, Optional, Tuple, Union

 import numpy as np
 import tensorflow as tf
@@ -40,6 +40,189 @@ def suppress_warnings(func):
     return wrapper


+class TransformRange:
+    """
+    Generates a new DataFrame with ranges represented as strings.
+
+    Transforms numerical columns into categorical range bins with descriptive labels.
+    """
+
+    def __init__(self, df: pd.DataFrame) -> None:
+        """Initializes the class with the original DataFrame.
+
+        Parameters
+        ----------
+        df : `pd.DataFrame`
+            The original DataFrame to transform.
+
+        Raises
+        ------
+        TypeError
+            If df is not a pandas DataFrame.
+        """
+        if not isinstance(df, pd.DataFrame):
+            raise TypeError("df must be a pandas DataFrame")
+        self.df = df.copy()  # Create a copy to avoid modifying the original
+
+    def _create_bins_and_labels(
+        self, min_val: Union[int, float], max_val: Union[int, float], bin_size: int
+    ) -> Tuple[np.ndarray, List[str]]:
+        """
+        Creates the bin edges and their labels.
+
+        Parameters
+        ----------
+        min_val : `int` or `float`
+            The minimum value for the range.
+        max_val : `int` or `float`
+            The maximum value for the range.
+        bin_size : `int`
+            The size of each bin.
+
+        Returns
+        -------
+        bins : `np.ndarray`
+            The bin edges.
+        labels : `list`
+            The labels for the bins.
+
+        Raises
+        ------
+        ValueError
+            If bin_size is not positive or if min_val >= max_val.
+        """
+        if bin_size <= 0:
+            raise ValueError("bin_size must be positive")
+        if min_val >= max_val:
+            raise ValueError("min_val must be less than max_val")
+
+        start = int(min_val)
+        end = int(max_val) + bin_size
+
+        bins = np.arange(start, end + 1, bin_size)
+
+        if bins[-1] <= max_val:
+            bins = np.append(bins, max_val + 1)
+
+        labels = [f"{int(bins[i])}-{int(bins[i+1] - 1)}" for i in range(len(bins) - 1)]
+        return bins, labels
+
+    def _transform_column_to_ranges(self, column: str, bin_size: int) -> pd.Series:
+        """
+        Transforms a column in the DataFrame into range bins.
+
+        Parameters
+        ----------
+        column : `str`
+            The name of the column to transform.
+        bin_size : `int`
+            The size of each bin.
+
+        Returns
+        -------
+        `pd.Series`
+            A Series with the range labels.
+
+        Raises
+        ------
+        KeyError
+            If column is not found in the DataFrame.
+        ValueError
+            If bin_size is not positive or if column contains non-numeric data.
+        """
+        if column not in self.df.columns:
+            raise KeyError(f"Column '{column}' not found in DataFrame")
+
+        if bin_size <= 0:
+            raise ValueError("bin_size must be positive")
+
+        numeric_series = pd.to_numeric(self.df[column], errors="coerce")
+        if numeric_series.isna().all():
+            raise ValueError(f"Column '{column}' contains no valid numeric data")
+
+        min_val = numeric_series.min()
+        max_val = numeric_series.max()
+
+        if min_val == max_val:
+            return pd.Series(
+                [f"{int(min_val)}-{int(max_val)}"] * len(self.df), name=f"{column}_range"
+            )
+
+        bins, labels = self._create_bins_and_labels(min_val, max_val, bin_size)
+
+        return pd.cut(numeric_series, bins=bins, labels=labels, right=False, include_lowest=True)
+
+    def transform_dataframe(
+        self, columns_bin_sizes: Dict[str, int], drop_original: bool = False
+    ) -> pd.DataFrame:
+        """
+        Creates a new DataFrame with range columns.
+
+        Parameters
+        ----------
+        columns_bin_sizes : `dict`
+            A dictionary where the keys are column names and the values are the bin sizes.
+        drop_original : `bool`, optional
+            If True, drops original columns from the result, by default False
+
+        Returns
+        -------
+        `pd.DataFrame`
+            A DataFrame with the transformed range columns.
+
+        Raises
+        ------
+        TypeError
+            If columns_bin_sizes is not a dictionary.
+        """
+        if not isinstance(columns_bin_sizes, dict):
+            raise TypeError("columns_bin_sizes must be a dictionary")
+
+        if not columns_bin_sizes:
+            return pd.DataFrame()
+
+        range_columns = {}
+        for column, bin_size in columns_bin_sizes.items():
+            range_columns[f"{column}_range"] = self._transform_column_to_ranges(column, bin_size)
+
+        result_df = pd.DataFrame(range_columns)
+
+        if not drop_original:
+            original_cols = [col for col in self.df.columns if col not in columns_bin_sizes]
+            if original_cols:
+                result_df = pd.concat([self.df[original_cols], result_df], axis=1)
+
+        return result_df
+
+    def get_range_info(self, column: str) -> Dict[str, Union[int, float, List[str]]]:
+        """
+        Get information about the range transformation for a specific column.
+
+        Parameters
+        ----------
+        column : `str`
+            The name of the column to analyze.
+
+        Returns
+        -------
+        `dict`
+            Dictionary containing min_val, max_val, bin_size, and labels.
+        """
+        if column not in self.df.columns:
+            raise KeyError(f"Column '{column}' not found in DataFrame")
+
+        numeric_series = pd.to_numeric(self.df[column], errors="coerce")
+        min_val = numeric_series.min()
+        max_val = numeric_series.max()
+
+        return {
+            "min_value": min_val,
+            "max_value": max_val,
+            "range": max_val - min_val,
+            "column": column,
+        }
+
+
 def remove_collinearity(df: DataFrame, threshold: float = 0.9):
     """
     Removes highly collinear features from the DataFrame based on a correlation threshold.
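
A short usage sketch for the new TransformRange class, grounded in the code above; the toy DataFrame is invented for illustration:

import pandas as pd

from likelihood.tools.models_tools import TransformRange

df = pd.DataFrame({"age": [23, 35, 47, 52, 61], "city": ["A", "B", "A", "C", "B"]})

transformer = TransformRange(df)
result = transformer.transform_dataframe({"age": 10})

# "age" is binned into width-10 ranges labeled "23-32", "33-42", ...
print(result[["city", "age_range"]])
print(transformer.get_range_info("age"))  # {'min_value': 23, 'max_value': 61, 'range': 38, 'column': 'age'}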
@@ -56,8 +239,8 @@ def remove_collinearity(df: DataFrame, threshold: float = 0.9):
         The correlation threshold above which features will be removed. Default is `0.9`.

     Returns
-
-    DataFrame: A DataFrame with highly collinear features removed.
+    -------
+    DataFrame : A DataFrame with highly collinear features removed.
     """
     corr_matrix = df.corr().abs()
     upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
@@ -97,11 +280,11 @@ def train_and_insights(
         Fraction of data to use (default is 1.0).

     Keyword Arguments:
-
+    ------------------
         Additional keyword arguments passed to the `model.fit` function, such as validation split and callbacks.

     Returns
-
+    -------
     `tf.keras.Model`
         The trained model after fitting.
     """
@@ -207,7 +390,7 @@ def graph_metrics(adj_matrix: np.ndarray, eigenvector_threshold: float = 1e-6) -
         A threshold for the eigenvector centrality calculation, used to determine the cutoff for small eigenvalues. Default is `1e-6`.

     Returns
-
+    -------
     DataFrame : A DataFrame containing the following graph metrics as columns.
         - `Degree Centrality`: Degree centrality values for each node, indicating the number of direct connections each node has.
         - `Clustering Coefficient`: Clustering coefficient values for each node, representing the degree to which nodes cluster together.

@@ -218,7 +401,7 @@ def graph_metrics(adj_matrix: np.ndarray, eigenvector_threshold: float = 1e-6) -
         - `Assortativity`: The assortativity coefficient of the graph, measuring the tendency of nodes to connect to similar nodes.

     Notes
-
+    -----
     The returned DataFrame will have one row for each node and one column for each of the computed metrics.
     """
     adj_matrix = adj_matrix.astype(int)

@@ -251,3 +434,7 @@ def graph_metrics(adj_matrix: np.ndarray, eigenvector_threshold: float = 1e-6) -
     metrics_df["Assortativity"] = assortativity

     return metrics_df
+
+
+if __name__ == "__main__":
+    pass
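
Closing with a hedged usage sketch for graph_metrics, based only on the signature and docstring shown in the hunks above (the adjacency matrix is invented; the import path assumes the module shown here):

import numpy as np

from likelihood.tools.models_tools import graph_metrics

# Undirected 4-node graph: a triangle (0-1-2) plus a pendant node 3
adj_matrix = np.array(
    [
        [0, 1, 1, 0],
        [1, 0, 1, 0],
        [1, 1, 0, 1],
        [0, 0, 1, 0],
    ]
)

metrics_df = graph_metrics(adj_matrix)
# One row per node; columns include Degree Centrality, Clustering
# Coefficient, and Assortativity, per the docstring above.
print(metrics_df)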