likelihood-1.4.0-py3-none-any.whl → likelihood-1.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- likelihood/graph/nn.py +72 -113
- likelihood/models/deep/autoencoders.py +352 -116
- likelihood/tools/__init__.py +1 -0
- likelihood/tools/figures.py +348 -0
- likelihood/tools/models_tools.py +253 -0
- likelihood/tools/tools.py +26 -84
- {likelihood-1.4.0.dist-info → likelihood-1.5.0.dist-info}/METADATA +1 -1
- {likelihood-1.4.0.dist-info → likelihood-1.5.0.dist-info}/RECORD +11 -9
- {likelihood-1.4.0.dist-info → likelihood-1.5.0.dist-info}/WHEEL +1 -1
- {likelihood-1.4.0.dist-info → likelihood-1.5.0.dist-info}/LICENSE +0 -0
- {likelihood-1.4.0.dist-info → likelihood-1.5.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,7 @@
 import logging
 import os
 import random
+import warnings
 from functools import partial
 from shutil import rmtree
 
@@ -14,8 +15,8 @@ from pandas.plotting import radviz
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
 logging.getLogger("tensorflow").setLevel(logging.ERROR)
 
-
-from
+
+from typing import List
 
 import keras_tuner
 import tensorflow as tf
@@ -24,21 +25,11 @@ from sklearn.manifold import TSNE
 from tensorflow.keras.layers import InputLayer
 from tensorflow.keras.regularizers import l2
 
-from likelihood.tools import OneHotEncoder
+from likelihood.tools import LoRALayer, OneHotEncoder, suppress_warnings
 
 tf.get_logger().setLevel("ERROR")
 
 
-def suppress_warnings(func):
-    @wraps(func)
-    def wrapper(*args, **kwargs):
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            return func(*args, **kwargs)
-
-    return wrapper
-
-
 class EarlyStopping:
     def __init__(self, patience=10, min_delta=0.001):
         self.patience = patience
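The `suppress_warnings` decorator removed above now ships in `likelihood.tools` (its implementation moved out of this module), and the updated import pulls it from there alongside the new `LoRALayer`. A minimal usage sketch, assuming the relocated helper keeps the behavior shown in the removed lines:

```python
import warnings

from likelihood.tools import suppress_warnings  # relocated in 1.5.0

@suppress_warnings
def noisy_function():
    # Any warning raised inside the decorated call is silenced.
    warnings.warn("this will not surface", UserWarning)
    return 42

assert noisy_function() == 42
```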
@@ -246,7 +237,7 @@ class AutoClassifier(tf.keras.Model):
         Additional keyword arguments to pass to the model.
 
     classifier_activation : `str`
-        The activation function to use for the classifier layer. Default is
+        The activation function to use for the classifier layer. Default is `softmax`. If the activation function is not a classification function, the model can be used in regression problems.
     num_layers : `int`
         The number of hidden layers in the classifier. Default is 1.
     dropout : `float`
@@ -257,6 +248,10 @@ class AutoClassifier(tf.keras.Model):
         Whether to use variational autoencoder mode. Default is False.
     vae_units : `int`
         The number of units in the variational autoencoder. Default is 2.
+    lora_mode : `bool`
+        Whether to use LoRA layers. Default is False.
+    lora_rank : `int`
+        The rank of the LoRA layer. Default is 4.
     """
 
     def __init__(self, input_shape_parm, num_classes, units, activation, **kwargs):
@@ -275,6 +270,8 @@ class AutoClassifier(tf.keras.Model):
         self.l2_reg = kwargs.get("l2_reg", 0.0)
         self.vae_mode = kwargs.get("vae_mode", False)
         self.vae_units = kwargs.get("vae_units", 2)
+        self.lora_mode = kwargs.get("lora_mode", False)
+        self.lora_rank = kwargs.get("lora_rank", 4)
 
     def build_encoder_decoder(self, input_shape):
         self.encoder = (
@@ -367,9 +364,8 @@ class AutoClassifier(tf.keras.Model):
         else:
             self.build_encoder_decoder(input_shape)
 
-        # Classifier with L2 regularization
         self.classifier = tf.keras.Sequential()
-        if self.num_layers > 1:
+        if self.num_layers > 1 and not self.lora_mode:
             for _ in range(self.num_layers - 1):
                 self.classifier.add(
                     tf.keras.layers.Dense(
@@ -380,13 +376,36 @@ class AutoClassifier(tf.keras.Model):
                 )
                 if self.dropout:
                     self.classifier.add(tf.keras.layers.Dropout(self.dropout))
-        self.classifier.add(
-            tf.keras.layers.Dense(
-                units=self.num_classes,
-                activation=self.classifier_activation,
-                kernel_regularizer=l2(self.l2_reg),
+            self.classifier.add(
+                tf.keras.layers.Dense(
+                    units=self.num_classes,
+                    activation=self.classifier_activation,
+                    kernel_regularizer=l2(self.l2_reg),
+                )
+            )
+        elif self.lora_mode:
+            for _ in range(self.num_layers - 1):
+                self.classifier.add(
+                    LoRALayer(units=self.units, rank=self.lora_rank, name=f"LoRA_{_}")
+                )
+                self.classifier.add(tf.keras.layers.Activation(self.activation))
+                if self.dropout:
+                    self.classifier.add(tf.keras.layers.Dropout(self.dropout))
+            self.classifier.add(
+                tf.keras.layers.Dense(
+                    units=self.num_classes,
+                    activation=self.classifier_activation,
+                    kernel_regularizer=l2(self.l2_reg),
+                )
+            )
+        else:
+            self.classifier.add(
+                tf.keras.layers.Dense(
+                    units=self.num_classes,
+                    activation=self.classifier_activation,
+                    kernel_regularizer=l2(self.l2_reg),
+                )
             )
-        )
 
     def train_encoder_decoder(
         self, data, epochs, batch_size, validation_split=0.2, patience=10, **kwargs
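The `LoRALayer` used in the new `elif self.lora_mode:` branch is implemented in the new `likelihood/tools/models_tools.py`, which this diff does not show. Below is a minimal sketch of a LoRA-style dense layer with the same constructor shape (`units`, `rank`, `name`); the package's actual class may differ:

```python
import tensorflow as tf

class LoRADenseSketch(tf.keras.layers.Layer):
    """Illustrative low-rank adapter layer; not the package's LoRALayer."""

    def __init__(self, units: int, rank: int = 4, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.rank = rank

    def build(self, input_shape):
        in_dim = int(input_shape[-1])
        # Frozen base weight; only the low-rank factors A and B train.
        self.w = self.add_weight(name="w", shape=(in_dim, self.units), trainable=False)
        self.a = self.add_weight(name="a", shape=(in_dim, self.rank), initializer="random_normal")
        self.b = self.add_weight(name="b", shape=(self.rank, self.units), initializer="zeros")

    def call(self, x):
        # y = x W + x A B: the adapter starts as a no-op because B is zero-initialized.
        return tf.matmul(x, self.w) + tf.matmul(tf.matmul(x, self.a), self.b)
```

Under this reading, `lora_rank` is the main capacity knob for the branch above: the trainable update is restricted to rank-`rank` matrices while the base weight stays fixed.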
@@ -498,7 +517,6 @@ class AutoClassifier(tf.keras.Model):
         if not isinstance(source_model, AutoClassifier):
             raise ValueError("Source model must be an instance of AutoClassifier.")
 
-        # Check compatibility in input shape and units
         if self.input_shape_parm != source_model.input_shape_parm:
             raise ValueError(
                 f"Incompatible input shape. Expected {self.input_shape_parm}, got {source_model.input_shape_parm}."
@@ -508,9 +526,8 @@ class AutoClassifier(tf.keras.Model):
                 f"Incompatible number of units. Expected {self.units}, got {source_model.units}."
             )
         self.encoder, self.decoder = tf.keras.Sequential(), tf.keras.Sequential()
-        # Copy the encoder layers
         for i, layer in enumerate(source_model.encoder.layers):
-            if isinstance(layer, tf.keras.layers.Dense):
+            if isinstance(layer, tf.keras.layers.Dense):
                 dummy_input = tf.convert_to_tensor(tf.random.normal([1, layer.input_shape[1]]))
                 dense_layer = tf.keras.layers.Dense(
                     units=layer.units,
@@ -519,14 +536,12 @@ class AutoClassifier(tf.keras.Model):
                 )
                 dense_layer.build(dummy_input.shape)
                 self.encoder.add(dense_layer)
-                # Set the weights correctly
                 self.encoder.layers[i].set_weights(layer.get_weights())
             elif not isinstance(layer, InputLayer):
                 raise ValueError(f"Layer type {type(layer)} not supported for copying.")
 
-        # Copy the decoder layers
         for i, layer in enumerate(source_model.decoder.layers):
-            if isinstance(layer, tf.keras.layers.Dense):
+            if isinstance(layer, tf.keras.layers.Dense):
                 dummy_input = tf.convert_to_tensor(tf.random.normal([1, layer.input_shape[1]]))
                 dense_layer = tf.keras.layers.Dense(
                     units=layer.units,
@@ -535,7 +550,6 @@ class AutoClassifier(tf.keras.Model):
                 )
                 dense_layer.build(dummy_input.shape)
                 self.decoder.add(dense_layer)
-                # Set the weights correctly
                 self.decoder.layers[i].set_weights(layer.get_weights())
             elif not isinstance(layer, InputLayer):
                 raise ValueError(f"Layer type {type(layer)} not supported for copying.")
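The loops above rebuild each `Dense` layer, `build()` it against a dummy input, and then transplant the source weights. The same pattern in isolation (shapes are illustrative):

```python
import tensorflow as tf

src = tf.keras.layers.Dense(units=4, activation="relu")
src.build((None, 8))  # materialize kernel and bias

dst = tf.keras.layers.Dense(units=src.units, activation=src.activation)
dst.build((None, 8))  # a layer must be built before set_weights
dst.set_weights(src.get_weights())  # copies kernel and bias in place
```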
@@ -552,6 +566,8 @@ class AutoClassifier(tf.keras.Model):
             "l2_reg": self.l2_reg,
             "vae_mode": self.vae_mode,
             "vae_units": self.vae_units,
+            "lora_mode": self.lora_mode,
+            "lora_rank": self.lora_rank,
         }
         base_config = super(AutoClassifier, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
@@ -569,6 +585,8 @@ class AutoClassifier(tf.keras.Model):
             l2_reg=config["l2_reg"],
             vae_mode=config["vae_mode"],
             vae_units=config["vae_units"],
+            lora_mode=config["lora_mode"],
+            lora_rank=config["lora_rank"],
         )
 
 
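Taken together, these hunks make the LoRA options first-class constructor kwargs that round-trip through `get_config`/`from_config`. A hedged usage sketch (argument values are illustrative; the import path follows this file's location in the wheel):

```python
from likelihood.models.deep.autoencoders import AutoClassifier

model = AutoClassifier(
    input_shape_parm=20,      # number of input features
    num_classes=3,
    units=16,
    activation="relu",
    lora_mode=True,           # new in 1.5.0: LoRA classifier head
    lora_rank=8,              # rank of the low-rank update
)

config = model.get_config()   # now includes lora_mode and lora_rank
clone = AutoClassifier.from_config(config)  # restores the LoRA settings
```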
@@ -874,62 +892,220 @@ def setup_model(
 
 
 class GetInsights:
+    """
+    A class to analyze the output of a neural network model, including visualizations
+    of the weights, t-SNE representation, and feature statistics.
+
+    Parameters
+    ----------
+    model : `AutoClassifier`
+        The trained model to analyze.
+    inputs : `np.ndarray`
+        The input data for analysis.
+    """
+
     def __init__(self, model: AutoClassifier, inputs: np.ndarray) -> None:
+        """
+        Initializes the GetInsights class.
+
+        Parameters
+        ----------
+        model : `AutoClassifier`
+            The trained model to analyze.
+        inputs : `np.ndarray`
+            The input data for analysis.
+        """
         self.inputs = inputs
         self.model = model
-
-
-
-
+
+        self.encoder_layer = (
+            self.model.encoder.layers[1]
+            if isinstance(self.model.encoder.layers[0], InputLayer)
+            else self.model.encoder.layers[0]
+        )
         self.decoder_layer = self.model.decoder.layers[0]
+
         self.encoder_weights = self.encoder_layer.get_weights()[0]
         self.decoder_weights = self.decoder_layer.get_weights()[0]
-        colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)
 
+        self.sorted_names = self._generate_sorted_color_names()
+
+    def _generate_sorted_color_names(self) -> list:
+        """
+        Generate sorted color names based on their HSV values.
+
+        Parameters
+        ----------
+        `None`
+
+        Returns
+        -------
+        `list` : Sorted color names.
+        """
+        colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)
         by_hsv = sorted(
             (tuple(mcolors.rgb_to_hsv(mcolors.to_rgba(color)[:3])), name)
             for name, color in colors.items()
         )
-
-        random.shuffle(
+        sorted_names = [name for hsv, name in by_hsv if hsv[1] > 0.4 and hsv[2] >= 0.4]
+        random.shuffle(sorted_names)
+        return sorted_names
 
     def predictor_analyzer(
         self,
-        frac=None,
+        frac: float = None,
         cmap: str = "viridis",
         aspect: str = "auto",
         highlight: bool = True,
         **kwargs,
     ) -> None:
+        """
+        Analyze the model's predictions and visualize data.
+
+        Parameters
+        ----------
+        frac : `float`, optional
+            Fraction of data to use for analysis (default is `None`).
+        cmap : `str`, optional
+            The colormap for visualization (default is `"viridis"`).
+        aspect : `str`, optional
+            Aspect ratio for the visualization (default is `"auto"`).
+        highlight : `bool`, optional
+            Whether to highlight the maximum weights (default is `True`).
+        **kwargs : `dict`, optional
+            Additional keyword arguments for customization.
+
+        Returns
+        -------
+        `DataFrame` : The statistical summary of the input data.
+        """
         self._viz_weights(cmap=cmap, aspect=aspect, highlight=highlight, **kwargs)
         inputs = self.inputs.copy()
+        inputs = self._prepare_inputs(inputs, frac)
         y_labels = kwargs.get("y_labels", None)
+        encoded, reconstructed = self._encode_decode(inputs)
+        self._visualize_data(inputs, reconstructed, cmap, aspect)
+        self._prepare_data_for_analysis(inputs, reconstructed, encoded, y_labels)
+
+        try:
+            self._get_tsne_repr(inputs, frac)
+            self._viz_tsne_repr(c=self.classification)
+
+            self._viz_radviz(self.data, "class", "Radviz Visualization of Latent Space")
+            self._viz_radviz(self.data_input, "class", "Radviz Visualization of Input Data")
+        except ValueError:
+            warnings.warn(
+                "Some functions or processes will not be executed for regression problems.",
+                UserWarning,
+            )
+
+        return self._statistics(self.data_input)
+
+    def _prepare_inputs(self, inputs: np.ndarray, frac: float) -> np.ndarray:
+        """
+        Prepare the input data, possibly selecting a fraction of it.
+
+        Parameters
+        ----------
+        inputs : `np.ndarray`
+            The input data.
+        frac : `float`
+            Fraction of data to use.
+
+        Returns
+        -------
+        `np.ndarray` : The prepared input data.
+        """
         if frac:
             n = int(frac * self.inputs.shape[0])
             indexes = np.random.choice(np.arange(inputs.shape[0]), n, replace=False)
             inputs = inputs[indexes]
         inputs[np.isnan(inputs)] = 0.0
-
+        return inputs
+
+    def _encode_decode(self, inputs: np.ndarray) -> tuple:
+        """
+        Perform encoding and decoding on the input data.
+
+        Parameters
+        ----------
+        inputs : `np.ndarray`
+            The input data.
+
+        Returns
+        -------
+        `tuple` : The encoded and reconstructed data.
+        """
         try:
             mean, log_var = self.model.encoder(inputs)
             encoded = sampling(mean, log_var)
         except:
             encoded = self.model.encoder(inputs)
         reconstructed = self.model.decoder(encoded)
-
-
+        return encoded, reconstructed
+
+    def _visualize_data(
+        self, inputs: np.ndarray, reconstructed: np.ndarray, cmap: str, aspect: str
+    ) -> None:
+        """
+        Visualize the original data and the reconstructed data.
+
+        Parameters
+        ----------
+        inputs : `np.ndarray`
+            The input data.
+        reconstructed : `np.ndarray`
+            The reconstructed data.
+        cmap : `str`
+            The colormap for visualization.
+        aspect : `str`
+            Aspect ratio for the visualization.
+
+        Returns
+        -------
+        `None`
+        """
         ax = plt.subplot(1, 2, 1)
-        plt.imshow(
+        plt.imshow(inputs, cmap=cmap, aspect=aspect)
         plt.colorbar()
         plt.title("Original Data")
+
         plt.subplot(1, 2, 2, sharex=ax, sharey=ax)
         plt.imshow(reconstructed, cmap=cmap, aspect=aspect)
         plt.colorbar()
         plt.title("Decoder Layer Reconstruction")
         plt.show()
 
-
-        self
+    def _prepare_data_for_analysis(
+        self,
+        inputs: np.ndarray,
+        reconstructed: np.ndarray,
+        encoded: np.ndarray,
+        y_labels: List[str],
+    ) -> None:
+        """
+        Prepare data for statistical analysis.
+
+        Parameters
+        ----------
+        inputs : `np.ndarray`
+            The input data.
+        reconstructed : `np.ndarray`
+            The reconstructed data.
+        encoded : `np.ndarray`
+            The encoded data.
+        y_labels : `List[str]`
+            The labels of features.
+
+        Returns
+        -------
+        `None`
+        """
+        self.classification = (
+            self.model.classifier(tf.concat([reconstructed, encoded], axis=1))
+            .numpy()
+            .argmax(axis=1)
+        )
 
         self.data = pd.DataFrame(encoded, columns=[f"Feature {i}" for i in range(encoded.shape[1])])
         self.data_input = pd.DataFrame(
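The refactor above splits the old monolithic `predictor_analyzer` into focused helpers (`_prepare_inputs`, `_encode_decode`, `_visualize_data`, `_prepare_data_for_analysis`) and degrades gracefully to a warning for regression outputs. A hedged driving sketch; `trained_model` stands in for a fitted `AutoClassifier` and is an assumption of this example:

```python
import numpy as np

# trained_model: a fitted AutoClassifier (assumed); X: (samples, features) matrix.
X = np.random.rand(200, 20).astype("float32")
insights = GetInsights(trained_model, X)

# Runs the weight, reconstruction, t-SNE, and RadViz views, then returns
# the per-class summary produced by _statistics.
stats = insights.predictor_analyzer(frac=0.5)
print(stats)
```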
@@ -938,84 +1114,25 @@ class GetInsights:
                 [f"Feature {i}" for i in range(inputs.shape[1])] if y_labels is None else y_labels
             ),
         )
+
         self.data["class"] = self.classification
         self.data_input["class"] = self.classification
 
-        self.data_normalized = self.data.copy(deep=True)
-        self.data_normalized.iloc[:, :-1] = (
-            2.0
-            * (self.data_normalized.iloc[:, :-1] - self.data_normalized.iloc[:, :-1].min())
-            / (self.data_normalized.iloc[:, :-1].max() - self.data_normalized.iloc[:, :-1].min())
-            - 1
-        )
-        radviz(self.data_normalized, "class", color=self.colors)
-        plt.title("Radviz Visualization of Latent Space")
-        plt.show()
-        self.data_input_normalized = self.data_input.copy(deep=True)
-        self.data_input_normalized.iloc[:, :-1] = (
-            2.0
-            * (
-                self.data_input_normalized.iloc[:, :-1]
-                - self.data_input_normalized.iloc[:, :-1].min()
-            )
-            / (
-                self.data_input_normalized.iloc[:, :-1].max()
-                - self.data_input_normalized.iloc[:, :-1].min()
-            )
-            - 1
-        )
-        radviz(self.data_input_normalized, "class", color=self.colors)
-        plt.title("Radviz Visualization of Input Data")
-        plt.show()
-        return self._statistics(self.data_input)
-
-    def _statistics(self, data_input: DataFrame, **kwargs) -> DataFrame:
-        data = data_input.copy(deep=True)
-
-        if not pd.api.types.is_string_dtype(data["class"]):
-            data["class"] = data["class"].astype(str)
-
-        data.ffill(inplace=True)
-        grouped_data = data.groupby("class")
-
-        numerical_stats = grouped_data.agg(["mean", "min", "max", "std", "median"])
-        numerical_stats.columns = ["_".join(col).strip() for col in numerical_stats.columns.values]
-
-        def get_mode(x):
-            mode_series = x.mode()
-            return mode_series.iloc[0] if not mode_series.empty else None
-
-        mode_stats = grouped_data.apply(get_mode, include_groups=False)
-        mode_stats.columns = [f"{col}_mode" for col in mode_stats.columns]
-        combined_stats = pd.concat([numerical_stats, mode_stats], axis=1)
-
-        return combined_stats.T
-
-    def _viz_weights(
-        self, cmap: str = "viridis", aspect: str = "auto", highlight: bool = True, **kwargs
-    ) -> None:
-        title = kwargs.get("title", "Encoder Layer Weights (Dense Layer)")
-        y_labels = kwargs.get("y_labels", None)
-        cmap_highlight = kwargs.get("cmap_highlight", "Pastel1")
-        highlight_mask = np.zeros_like(self.encoder_weights, dtype=bool)
+    def _get_tsne_repr(self, inputs: np.ndarray = None, frac: float = None) -> None:
+        """
+        Perform t-SNE dimensionality reduction on the input data.
 
-        plt.imshow(self.encoder_weights, cmap=cmap, aspect=aspect)
-        plt.colorbar()
-        plt.title(title)
-        if y_labels is not None:
-            plt.yticks(ticks=np.arange(self.encoder_weights.shape[0]), labels=y_labels)
-        if highlight:
-            for i, j in enumerate(self.encoder_weights.argmax(axis=1)):
-                highlight_mask[i, j] = True
-            plt.imshow(
-                np.ma.masked_where(~highlight_mask, self.encoder_weights),
-                cmap=cmap_highlight,
-                alpha=0.5,
-                aspect=aspect,
-            )
-        plt.show()
+        Parameters
+        ----------
+        inputs : `np.ndarray`
+            The input data.
+        frac : `float`
+            Fraction of data to use.
 
-
+        Returns
+        -------
+        `None`
+        """
         if inputs is None:
             inputs = self.inputs.copy()
         if frac:
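The RadViz code on both sides of this refactor (the inlined block removed above, and the `_viz_radviz` helper added below) rescales every feature column to [-1, 1] with x' = 2(x - min)/(max - min) - 1 before plotting. A tiny standalone check of that transform:

```python
import pandas as pd

df = pd.DataFrame({"f0": [0.0, 5.0, 10.0], "f1": [1.0, 2.0, 3.0], "class": [0, 1, 0]})
scaled = df.copy(deep=True)
# Same column-wise min-max scaling to [-1, 1] as the RadViz helper.
scaled.iloc[:, :-1] = (
    2.0
    * (scaled.iloc[:, :-1] - scaled.iloc[:, :-1].min())
    / (scaled.iloc[:, :-1].max() - scaled.iloc[:, :-1].min())
    - 1
)
print(scaled["f0"].tolist())  # [-1.0, 0.0, 1.0]
```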
@@ -1029,26 +1146,145 @@ class GetInsights:
         self.reduced_data_tsne = tsne.fit_transform(self.latent_representations)
 
     def _viz_tsne_repr(self, **kwargs) -> None:
+        """
+        Visualize the t-SNE representation of the latent space.
+
+        Parameters
+        ----------
+        **kwargs : `dict`
+            Additional keyword arguments for customization.
+
+        Returns
+        -------
+        `None`
+        """
         c = kwargs.get("c", None)
         self.colors = (
             kwargs.get("colors", self.sorted_names[: len(np.unique(c))]) if c is not None else None
         )
+
         plt.scatter(
             self.reduced_data_tsne[:, 0],
             self.reduced_data_tsne[:, 1],
             cmap=matplotlib.colors.ListedColormap(self.colors) if c is not None else None,
             c=c,
         )
+
         if c is not None:
             cb = plt.colorbar()
             loc = np.arange(0, max(c), max(c) / float(len(self.colors)))
             cb.set_ticks(loc)
             cb.set_ticklabels(np.unique(c))
+
         plt.title("t-SNE Visualization of Latent Space")
         plt.xlabel("t-SNE 1")
         plt.ylabel("t-SNE 2")
         plt.show()
 
+    def _viz_radviz(self, data: pd.DataFrame, color_column: str, title: str) -> None:
+        """
+        Visualize the data using RadViz.
+
+        Parameters
+        ----------
+        data : `pd.DataFrame`
+            The data to visualize.
+        color_column : `str`
+            The column to use for coloring.
+        title : `str`
+            The title of the plot.
+
+        Returns
+        -------
+        `None`
+        """
+        data_normalized = data.copy(deep=True)
+        data_normalized.iloc[:, :-1] = (
+            2.0
+            * (data_normalized.iloc[:, :-1] - data_normalized.iloc[:, :-1].min())
+            / (data_normalized.iloc[:, :-1].max() - data_normalized.iloc[:, :-1].min())
+            - 1
+        )
+        radviz(data_normalized, color_column, color=self.colors)
+        plt.title(title)
+        plt.show()
+
+    def _viz_weights(
+        self, cmap: str = "viridis", aspect: str = "auto", highlight: bool = True, **kwargs
+    ) -> None:
+        """
+        Visualize the encoder layer weights of the model.
+
+        Parameters
+        ----------
+        cmap : `str`, optional
+            The colormap for visualization (default is `"viridis"`).
+        aspect : `str`, optional
+            Aspect ratio for the visualization (default is `"auto"`).
+        highlight : `bool`, optional
+            Whether to highlight the maximum weights (default is `True`).
+        **kwargs : `dict`, optional
+            Additional keyword arguments for customization.
+
+        Returns
+        -------
+        `None`
+        """
+        title = kwargs.get("title", "Encoder Layer Weights (Dense Layer)")
+        y_labels = kwargs.get("y_labels", None)
+        cmap_highlight = kwargs.get("cmap_highlight", "Pastel1")
+        highlight_mask = np.zeros_like(self.encoder_weights, dtype=bool)
+
+        plt.imshow(self.encoder_weights, cmap=cmap, aspect=aspect)
+        plt.colorbar()
+        plt.title(title)
+        if y_labels is not None:
+            plt.yticks(ticks=np.arange(self.encoder_weights.shape[0]), labels=y_labels)
+        if highlight:
+            for i, j in enumerate(self.encoder_weights.argmax(axis=1)):
+                highlight_mask[i, j] = True
+            plt.imshow(
+                np.ma.masked_where(~highlight_mask, self.encoder_weights),
+                cmap=cmap_highlight,
+                alpha=0.5,
+                aspect=aspect,
+            )
+        plt.show()
+
+    def _statistics(self, data_input: DataFrame) -> DataFrame:
+        """
+        Compute statistical summaries of the input data.
+
+        Parameters
+        ----------
+        data_input : `DataFrame`
+            The data to compute statistics for.
+
+        Returns
+        -------
+        `DataFrame` : The statistical summary of the input data.
+        """
+        data = data_input.copy(deep=True)
+
+        if not pd.api.types.is_string_dtype(data["class"]):
+            data["class"] = data["class"].astype(str)
+
+        data.ffill(inplace=True)
+        grouped_data = data.groupby("class")
+
+        numerical_stats = grouped_data.agg(["mean", "min", "max", "std", "median"])
+        numerical_stats.columns = ["_".join(col).strip() for col in numerical_stats.columns.values]
+
+        def get_mode(x):
+            mode_series = x.mode()
+            return mode_series.iloc[0] if not mode_series.empty else None
+
+        mode_stats = grouped_data.apply(get_mode, include_groups=False)
+        mode_stats.columns = [f"{col}_mode" for col in mode_stats.columns]
+        combined_stats = pd.concat([numerical_stats, mode_stats], axis=1)
+
+        return combined_stats.T
+
 
 ########################################################################################
 
likelihood/tools/__init__.py CHANGED