likelihood-1.5.4-py3-none-any.whl → likelihood-1.5.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- likelihood/graph/nn.py +74 -48
- likelihood/models/deep/__init__.py +1 -0
- likelihood/models/deep/autoencoders.py +0 -455
- likelihood/models/deep/predictor.py +804 -0
- likelihood/models/simulation.py +0 -3
- likelihood/tools/impute.py +0 -1
- likelihood/tools/numeric_tools.py +11 -3
- likelihood/tools/tools.py +0 -1
- {likelihood-1.5.4.dist-info → likelihood-1.5.6.dist-info}/METADATA +2 -2
- {likelihood-1.5.4.dist-info → likelihood-1.5.6.dist-info}/RECORD +13 -12
- {likelihood-1.5.4.dist-info → likelihood-1.5.6.dist-info}/WHEEL +1 -1
- {likelihood-1.5.4.dist-info → likelihood-1.5.6.dist-info}/licenses/LICENSE +0 -0
- {likelihood-1.5.4.dist-info → likelihood-1.5.6.dist-info}/top_level.txt +0 -0
likelihood/graph/nn.py
CHANGED
(Deleted lines rendered as `…` in the hunks below were truncated in the source diff viewer; their original content is not recoverable from this page.)

```diff
@@ -5,6 +5,7 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
 logging.getLogger("tensorflow").setLevel(logging.ERROR)
 
 import warnings
+from multiprocessing import Pool, cpu_count
 from typing import Any, List, Tuple
 
 import numpy as np
```
```diff
@@ -13,65 +14,79 @@ import tensorflow as tf
 from IPython.display import clear_output
 from pandas.core.frame import DataFrame
 from sklearn.metrics import f1_score
-from sklearn.model_selection import train_test_split
 
 tf.get_logger().setLevel("ERROR")
 
 from likelihood.tools import LoRALayer
 
 
-def …
-    """…
+def compare_similarity_np(arr1: np.ndarray, arr2: np.ndarray, threshold: float = 0.05) -> int:
+    """Vectorized similarity comparison between two numeric/categorical arrays."""
+    arr1 = np.asarray(arr1)
+    arr2 = np.asarray(arr2)
 
-    … (11 lines truncated)
+    is_numeric = np.vectorize(
+        lambda a, b: isinstance(a, (int, float)) and isinstance(b, (int, float))
+    )(arr1, arr2)
+
+    similarity = np.zeros_like(arr1, dtype=bool)
+
+    if np.any(is_numeric):
+        a_num = arr1[is_numeric].astype(float)
+        b_num = arr2[is_numeric].astype(float)
+
+        both_zero = (a_num == 0) & (b_num == 0)
+        nonzero = ~both_zero & (a_num != 0) & (b_num != 0)
+        ratio = np.zeros_like(a_num)
+        ratio[nonzero] = np.maximum(a_num[nonzero], b_num[nonzero]) / np.minimum(
+            a_num[nonzero], b_num[nonzero]
+        )
+        numeric_similar = both_zero | ((1 - threshold <= ratio) & (ratio <= 1 + threshold))
+
+        similarity[is_numeric] = numeric_similar
+
+    similarity[~is_numeric] = arr1[~is_numeric] == arr2[~is_numeric]
+
+    return np.count_nonzero(similarity)
 
-
+
+def compare_pair(pair, data, similarity, threshold):
+    i, j = pair
+    sim = compare_similarity_np(data[i], data[j], threshold=threshold)
+    return (i, j, 1 if sim >= similarity else 0)
 
 
 def cal_adjacency_matrix(
-    df: DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
+    df: pd.DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
 ) -> Tuple[dict, np.ndarray]:
-    """
-
-    The similarity is calculated using the `compare_similarity` function.
-    The resulting matrix is a square matrix with the same number of rows and columns as the rows of the input DataFrame.
+    """
+    Calculates the adjacency matrix for a given DataFrame using parallel processing.
 
     Parameters
     ----------
     df : `DataFrame`
         The input DataFrame containing the features.
-    exclude_subset : `List[str]`, optional
+    exclude_subset : `List[str]`, `optional`
         A list of features to exclude from the calculation of the adjacency matrix.
-    sparse : `bool`, optional
+    sparse : `bool`, `optional`
         Whether to return a sparse matrix or a dense matrix.
     **kwargs : `dict`
         Additional keyword arguments to pass to the `compare_similarity` function.
 
-    Keyword Arguments:
-    ----------
-    similarity: `int`
-        The minimum number of features that must be the same in both arrays to be considered similar.
-    threshold : `float`
-        The threshold value used in the `compare_similarity` function. Default is 0.05.
-
     Returns
     -------
     adj_dict : `dict`
         A dictionary containing the features.
     adjacency_matrix : `ndarray`
         The adjacency matrix.
-    """
 
+    Keyword Arguments:
+    ----------
+    similarity: `int`
+        The minimum number of features that must be the same in both arrays to be considered similar.
+    threshold : `float`
+        The threshold value used in the `compare_similarity` function. Default is 0.05.
+    """
     if len(exclude_subset) > 0:
         columns = [col for col in df.columns if col not in exclude_subset]
         df_ = df[columns].copy()
```
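The new `compare_similarity_np` counts matching features pairwise: numeric entries match when both are zero or when their max/min ratio falls within `1 ± threshold`; everything else falls back to exact equality. A minimal sketch of the rule, assuming the function is importable from `likelihood.graph.nn` as added above:

```python
import numpy as np

from likelihood.graph.nn import compare_similarity_np  # added in this release

# Mixed numeric/categorical rows, stored as object arrays.
row_a = np.array([100.0, 0.0, "red"], dtype=object)
row_b = np.array([103.0, 0.0, "blue"], dtype=object)

# 103 / 100 = 1.03 <= 1.05 -> match; both zeros -> match; "red" != "blue" -> no match.
print(compare_similarity_np(row_a, row_b, threshold=0.05))  # 2
```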
```diff
@@ -84,14 +99,26 @@ def cal_adjacency_matrix(
     threshold = kwargs.get("threshold", 0.05)
     assert similarity <= df_.shape[1]
 
-    …
+    data = df_.to_numpy()
+    n = len(data)
 
-    …
+    adj_dict = {i: data[i].tolist() for i in range(n)}
 
-
-    for …
-    … (2 more lines truncated)
+    def pair_generator():
+        for i in range(n):
+            for j in range(i, n):
+                yield (i, j)
+
+    with Pool(cpu_count()) as pool:
+        results = pool.starmap(
+            compare_pair, ((pair, data, similarity, threshold) for pair in pair_generator())
+        )
+
+    adjacency_matrix = np.zeros((n, n), dtype=np.uint8)
+    for i, j, val in results:
+        if val:
+            adjacency_matrix[i, j] = 1
+            adjacency_matrix[j, i] = 1
 
     if sparse:
         num_nodes = adjacency_matrix.shape[0]
```
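Because the rewritten `cal_adjacency_matrix` now fans the row pairs out over a `multiprocessing.Pool`, callers on spawn-based platforms (Windows, macOS) should invoke it from under an `if __name__ == "__main__":` guard, since the pool's workers re-import the calling module. A hedged usage sketch with a toy DataFrame (column names are illustrative):

```python
import pandas as pd

from likelihood.graph.nn import cal_adjacency_matrix

if __name__ == "__main__":
    df = pd.DataFrame({"height": [1.00, 1.02, 5.00], "color": ["x", "x", "y"]})
    # Rows become adjacent when at least `similarity` features match.
    adj_dict, adjacency = cal_adjacency_matrix(df, sparse=False, similarity=2, threshold=0.05)
    print(adjacency)  # rows 0 and 1 are linked; row 2 only to itself
```

Note that the pair generator still enumerates all n(n+1)/2 row pairs, so the parallelism reduces wall-clock time, not total work.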
```diff
@@ -103,9 +130,7 @@ def cal_adjacency_matrix(
             indices=indices, values=values, dense_shape=(num_nodes, num_nodes)
         )
 
-        …
-    else:
-        return adj_dict, adjacency_matrix
+    return adj_dict, adjacency_matrix
 
 
 class Data:
```
```diff
@@ -260,12 +285,17 @@ class VanillaGNN(tf.keras.Model):
         val_losses = []
         val_f1_scores = []
 
-        … (4 lines truncated)
+        num_nodes = len(data.x)
+        split_index = int((1 - test_size) * num_nodes)
+
+        X_train, X_test = data.x[:split_index], data.x[split_index:]
+        y_train, y_test = data.y[:split_index], data.y[split_index:]
+
+        adjacency_train = tf.sparse.slice(data.adjacency, [0, 0], [split_index, split_index])
         adjacency_test = tf.sparse.slice(
-            data.adjacency,
+            data.adjacency,
+            [split_index, split_index],
+            [num_nodes - split_index, num_nodes - split_index],
         )
 
         batch_starts = np.arange(0, len(X_train), batch_size)
```
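With the `train_test_split` import gone, validation now uses an ordinal split: the first `(1 - test_size)` fraction of nodes trains and the remainder validates, which lets `tf.sparse.slice` carve adjacency blocks that stay aligned with the node split (and implicitly assumes node order carries no signal). A small sketch of the slicing semantics:

```python
import tensorflow as tf

# 4-node graph; edge (1, 2) lands in the training block, (3, 3) in the test block.
adj = tf.sparse.SparseTensor(
    indices=[[0, 0], [1, 2], [3, 3]], values=[1.0, 1.0, 1.0], dense_shape=[4, 4]
)
split = 3
train_block = tf.sparse.slice(adj, start=[0, 0], size=[split, split])
test_block = tf.sparse.slice(adj, start=[split, split], size=[4 - split, 4 - split])
print(tf.sparse.to_dense(train_block).numpy())  # 3x3 top-left corner
```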
```diff
@@ -286,10 +316,6 @@ class VanillaGNN(tf.keras.Model):
 
             if epoch % 5 == 0:
                 clear_output(wait=True)
-                warnings.warn(
-                    "It is normal for validation metrics to underperform during training. Use the test method to validate after training.",
-                    UserWarning,
-                )
                 val_loss, val_f1 = self.evaluate(X_test, adjacency_test, y_test)
                 val_losses.append(val_loss)
                 val_f1_scores.append(val_f1)
```
likelihood/models/deep/autoencoders.py
CHANGED

The dropped imports (random, warnings, matplotlib, radviz, TSNE, List) are exactly the dependencies of the GetInsights class removed further below.

```diff
@@ -1,27 +1,17 @@
 import logging
 import os
-import random
-import warnings
 from functools import partial
 from shutil import rmtree
 
-import matplotlib
-import matplotlib.colors as mcolors
-import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-from pandas.plotting import radviz
 
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
 logging.getLogger("tensorflow").setLevel(logging.ERROR)
 
-
-from typing import List
-
 import keras_tuner
 import tensorflow as tf
 from pandas.core.frame import DataFrame
-from sklearn.manifold import TSNE
 from tensorflow.keras.layers import InputLayer
 from tensorflow.keras.regularizers import l2
 
```
```diff
@@ -889,448 +879,3 @@ def setup_model(
 
     best_hps = tuner.get_best_hyperparameters(1)[0].values
     return best_model, pd.DataFrame(best_hps, index=["Value"])
-
-
-class GetInsights:
-    """
-    A class to analyze the output of a neural network model, including visualizations
-    of the weights, t-SNE representation, and feature statistics.
-
-    Parameters
-    ----------
-    model : `AutoClassifier`
-        The trained model to analyze.
-    inputs : `np.ndarray`
-        The input data for analysis.
-    """
-
-    def __init__(self, model: AutoClassifier, inputs: np.ndarray) -> None:
-        """
-        Initializes the GetInsights class.
-
-        Parameters
-        ----------
-        model : `AutoClassifier`
-            The trained model to analyze.
-        inputs : `np.ndarray`
-            The input data for analysis.
-        """
-        self.inputs = inputs
-        self.model = model
-
-        self.encoder_layer = (
-            self.model.encoder.layers[1]
-            if isinstance(self.model.encoder.layers[0], InputLayer)
-            else self.model.encoder.layers[0]
-        )
-        self.decoder_layer = self.model.decoder.layers[0]
-
-        self.encoder_weights = self.encoder_layer.get_weights()[0]
-        self.decoder_weights = self.decoder_layer.get_weights()[0]
-
-        self.sorted_names = self._generate_sorted_color_names()
-
-    def _generate_sorted_color_names(self) -> list:
-        """
-        Generate sorted color names based on their HSV values.
-
-        Parameters
-        ----------
-        `None`
-
-        Returns
-        -------
-        `list` : Sorted color names.
-        """
-        colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)
-        by_hsv = sorted(
-            (tuple(mcolors.rgb_to_hsv(mcolors.to_rgba(color)[:3])), name)
-            for name, color in colors.items()
-        )
-        sorted_names = [name for hsv, name in by_hsv if hsv[1] > 0.4 and hsv[2] >= 0.4]
-        random.shuffle(sorted_names)
-        return sorted_names
-
-    def predictor_analyzer(
-        self,
-        frac: float = None,
-        cmap: str = "viridis",
-        aspect: str = "auto",
-        highlight: bool = True,
-        **kwargs,
-    ) -> None:
-        """
-        Analyze the model's predictions and visualize data.
-
-        Parameters
-        ----------
-        frac : `float`, optional
-            Fraction of data to use for analysis (default is `None`).
-        cmap : `str`, optional
-            The colormap for visualization (default is `"viridis"`).
-        aspect : `str`, optional
-            Aspect ratio for the visualization (default is `"auto"`).
-        highlight : `bool`, optional
-            Whether to highlight the maximum weights (default is `True`).
-        **kwargs : `dict`, optional
-            Additional keyword arguments for customization.
-
-        Returns
-        -------
-        `DataFrame` : The statistical summary of the input data.
-        """
-        self._viz_weights(cmap=cmap, aspect=aspect, highlight=highlight, **kwargs)
-        inputs = self.inputs.copy()
-        inputs = self._prepare_inputs(inputs, frac)
-        y_labels = kwargs.get("y_labels", None)
-        encoded, reconstructed = self._encode_decode(inputs)
-        self._visualize_data(inputs, reconstructed, cmap, aspect)
-        self._prepare_data_for_analysis(inputs, reconstructed, encoded, y_labels)
-
-        try:
-            self._get_tsne_repr(inputs, frac)
-            self._viz_tsne_repr(c=self.classification)
-
-            self._viz_radviz(self.data, "class", "Radviz Visualization of Latent Space")
-            self._viz_radviz(self.data_input, "class", "Radviz Visualization of Input Data")
-        except ValueError:
-            warnings.warn(
-                "Some functions or processes will not be executed for regression problems.",
-                UserWarning,
-            )
-
-        return self._statistics(self.data_input)
-
-    def _prepare_inputs(self, inputs: np.ndarray, frac: float) -> np.ndarray:
-        """
-        Prepare the input data, possibly selecting a fraction of it.
-
-        Parameters
-        ----------
-        inputs : `np.ndarray`
-            The input data.
-        frac : `float`
-            Fraction of data to use.
-
-        Returns
-        -------
-        `np.ndarray` : The prepared input data.
-        """
-        if frac:
-            n = int(frac * self.inputs.shape[0])
-            indexes = np.random.choice(np.arange(inputs.shape[0]), n, replace=False)
-            inputs = inputs[indexes]
-        inputs[np.isnan(inputs)] = 0.0
-        return inputs
-
-    def _encode_decode(self, inputs: np.ndarray) -> tuple:
-        """
-        Perform encoding and decoding on the input data.
-
-        Parameters
-        ----------
-        inputs : `np.ndarray`
-            The input data.
-
-        Returns
-        -------
-        `tuple` : The encoded and reconstructed data.
-        """
-        try:
-            mean, log_var = self.model.encoder(inputs)
-            encoded = sampling(mean, log_var)
-        except:
-            encoded = self.model.encoder(inputs)
-        reconstructed = self.model.decoder(encoded)
-        return encoded, reconstructed
-
-    def _visualize_data(
-        self, inputs: np.ndarray, reconstructed: np.ndarray, cmap: str, aspect: str
-    ) -> None:
-        """
-        Visualize the original data and the reconstructed data.
-
-        Parameters
-        ----------
-        inputs : `np.ndarray`
-            The input data.
-        reconstructed : `np.ndarray`
-            The reconstructed data.
-        cmap : `str`
-            The colormap for visualization.
-        aspect : `str`
-            Aspect ratio for the visualization.
-
-        Returns
-        -------
-        `None`
-        """
-        ax = plt.subplot(1, 2, 1)
-        plt.imshow(inputs, cmap=cmap, aspect=aspect)
-        plt.colorbar()
-        plt.title("Original Data")
-
-        plt.subplot(1, 2, 2, sharex=ax, sharey=ax)
-        plt.imshow(reconstructed, cmap=cmap, aspect=aspect)
-        plt.colorbar()
-        plt.title("Decoder Layer Reconstruction")
-        plt.show()
-
-    def _prepare_data_for_analysis(
-        self,
-        inputs: np.ndarray,
-        reconstructed: np.ndarray,
-        encoded: np.ndarray,
-        y_labels: List[str],
-    ) -> None:
-        """
-        Prepare data for statistical analysis.
-
-        Parameters
-        ----------
-        inputs : `np.ndarray`
-            The input data.
-        reconstructed : `np.ndarray`
-            The reconstructed data.
-        encoded : `np.ndarray`
-            The encoded data.
-        y_labels : `List[str]`
-            The labels of features.
-
-        Returns
-        -------
-        `None`
-        """
-        self.classification = (
-            self.model.classifier(tf.concat([reconstructed, encoded], axis=1))
-            .numpy()
-            .argmax(axis=1)
-        )
-
-        self.data = pd.DataFrame(encoded, columns=[f"Feature {i}" for i in range(encoded.shape[1])])
-        self.data_input = pd.DataFrame(
-            inputs,
-            columns=(
-                [f"Feature {i}" for i in range(inputs.shape[1])] if y_labels is None else y_labels
-            ),
-        )
-
-        self.data["class"] = self.classification
-        self.data_input["class"] = self.classification
-
-    def _get_tsne_repr(self, inputs: np.ndarray = None, frac: float = None) -> None:
-        """
-        Perform t-SNE dimensionality reduction on the input data.
-
-        Parameters
-        ----------
-        inputs : `np.ndarray`
-            The input data.
-        frac : `float`
-            Fraction of data to use.
-
-        Returns
-        -------
-        `None`
-        """
-        if inputs is None:
-            inputs = self.inputs.copy()
-        if frac:
-            n = int(frac * self.inputs.shape[0])
-            indexes = np.random.choice(np.arange(inputs.shape[0]), n, replace=False)
-            inputs = inputs[indexes]
-        inputs[np.isnan(inputs)] = 0.0
-        self.latent_representations = inputs @ self.encoder_weights
-
-        tsne = TSNE(n_components=2)
-        self.reduced_data_tsne = tsne.fit_transform(self.latent_representations)
-
-    def _viz_tsne_repr(self, **kwargs) -> None:
-        """
-        Visualize the t-SNE representation of the latent space.
-
-        Parameters
-        ----------
-        **kwargs : `dict`
-            Additional keyword arguments for customization.
-
-        Returns
-        -------
-        `None`
-        """
-        c = kwargs.get("c", None)
-        self.colors = (
-            kwargs.get("colors", self.sorted_names[: len(np.unique(c))]) if c is not None else None
-        )
-
-        plt.scatter(
-            self.reduced_data_tsne[:, 0],
-            self.reduced_data_tsne[:, 1],
-            cmap=matplotlib.colors.ListedColormap(self.colors) if c is not None else None,
-            c=c,
-        )
-
-        if c is not None:
-            cb = plt.colorbar()
-            loc = np.arange(0, max(c), max(c) / float(len(self.colors)))
-            cb.set_ticks(loc)
-            cb.set_ticklabels(np.unique(c))
-
-        plt.title("t-SNE Visualization of Latent Space")
-        plt.xlabel("t-SNE 1")
-        plt.ylabel("t-SNE 2")
-        plt.show()
-
-    def _viz_radviz(self, data: pd.DataFrame, color_column: str, title: str) -> None:
-        """
-        Visualize the data using RadViz.
-
-        Parameters
-        ----------
-        data : `pd.DataFrame`
-            The data to visualize.
-        color_column : `str`
-            The column to use for coloring.
-        title : `str`
-            The title of the plot.
-
-        Returns
-        -------
-        `None`
-        """
-        data_normalized = data.copy(deep=True)
-        data_normalized.iloc[:, :-1] = (
-            2.0
-            * (data_normalized.iloc[:, :-1] - data_normalized.iloc[:, :-1].min())
-            / (data_normalized.iloc[:, :-1].max() - data_normalized.iloc[:, :-1].min())
-            - 1
-        )
-        radviz(data_normalized, color_column, color=self.colors)
-        plt.title(title)
-        plt.show()
-
-    def _viz_weights(
-        self, cmap: str = "viridis", aspect: str = "auto", highlight: bool = True, **kwargs
-    ) -> None:
-        """
-        Visualize the encoder layer weights of the model.
-
-        Parameters
-        ----------
-        cmap : `str`, optional
-            The colormap for visualization (default is `"viridis"`).
-        aspect : `str`, optional
-            Aspect ratio for the visualization (default is `"auto"`).
-        highlight : `bool`, optional
-            Whether to highlight the maximum weights (default is `True`).
-        **kwargs : `dict`, optional
-            Additional keyword arguments for customization.
-
-        Returns
-        -------
-        `None`
-        """
-        title = kwargs.get("title", "Encoder Layer Weights (Dense Layer)")
-        y_labels = kwargs.get("y_labels", None)
-        cmap_highlight = kwargs.get("cmap_highlight", "Pastel1")
-        highlight_mask = np.zeros_like(self.encoder_weights, dtype=bool)
-
-        plt.imshow(self.encoder_weights, cmap=cmap, aspect=aspect)
-        plt.colorbar()
-        plt.title(title)
-        if y_labels is not None:
-            plt.yticks(ticks=np.arange(self.encoder_weights.shape[0]), labels=y_labels)
-        if highlight:
-            for i, j in enumerate(self.encoder_weights.argmax(axis=1)):
-                highlight_mask[i, j] = True
-            plt.imshow(
-                np.ma.masked_where(~highlight_mask, self.encoder_weights),
-                cmap=cmap_highlight,
-                alpha=0.5,
-                aspect=aspect,
-            )
-        plt.show()
-
-    def _statistics(self, data_input: DataFrame) -> DataFrame:
-        """
-        Compute statistical summaries of the input data.
-
-        Parameters
-        ----------
-        data_input : `DataFrame`
-            The data to compute statistics for.
-
-        Returns
-        -------
-        `DataFrame` : The statistical summary of the input data.
-        """
-        data = data_input.copy(deep=True)
-
-        if not pd.api.types.is_string_dtype(data["class"]):
-            data["class"] = data["class"].astype(str)
-
-        data.ffill(inplace=True)
-        grouped_data = data.groupby("class")
-
-        numerical_stats = grouped_data.agg(["mean", "min", "max", "std", "median"])
-        numerical_stats.columns = ["_".join(col).strip() for col in numerical_stats.columns.values]
-
-        def get_mode(x):
-            mode_series = x.mode()
-            return mode_series.iloc[0] if not mode_series.empty else None
-
-        mode_stats = grouped_data.apply(get_mode, include_groups=False)
-        mode_stats.columns = [f"{col}_mode" for col in mode_stats.columns]
-        combined_stats = pd.concat([numerical_stats, mode_stats], axis=1)
-
-        return combined_stats.T
-
-
-########################################################################################
-
-if __name__ == "__main__":
-    # Example usage
-    import pandas as pd
-    from sklearn.datasets import load_iris
-    from sklearn.preprocessing import OneHotEncoder
-
-    # Load the dataset
-    iris = load_iris()
-
-    # Convert to a DataFrame for easy exploration
-    iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
-    iris_df["species"] = iris.target
-
-    X = iris_df.drop(columns="species")
-    y_labels = X.columns
-    X = X.values
-    y = iris_df["species"].values
-
-    X = np.asarray(X).astype(np.float32)
-
-    encoder = OneHotEncoder()
-    y = encoder.fit_transform(y.reshape(-1, 1)).toarray()
-    y = np.asarray(y).astype(np.float32)
-
-    model = AutoClassifier(
-        input_shape_parm=X.shape[1],
-        num_classes=3,
-        units=27,
-        activation="tanh",
-        num_layers=2,
-        dropout=0.2,
-    )
-    model.compile(
-        optimizer="adam",
-        loss=tf.keras.losses.CategoricalCrossentropy(),
-        metrics=[tf.keras.metrics.F1Score(threshold=0.5)],
-    )
-    model.fit(X, y, epochs=50, validation_split=0.2)
-
-    insights = GetInsights(model, X)
-    summary = insights.predictor_analyzer(frac=1.0, y_labels=y_labels)
-    insights._get_tsne_repr()
-    insights._viz_tsne_repr()
-    insights._viz_tsne_repr(c=iris_df["species"])
-    insights._viz_weights()
-    print(summary)
```
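The entire GetInsights class, its matplotlib/t-SNE/RadViz tooling, and the iris demo under `__main__` leave autoencoders.py in the same release that adds likelihood/models/deep/predictor.py (+804 lines), so the functionality has presumably been relocated rather than dropped; this diff alone does not show the new location. Code pinned to the old import path can probe both, treating the new path as an unverified assumption:

```python
try:
    # Import path that worked through likelihood 1.5.4.
    from likelihood.models.deep.autoencoders import GetInsights
except ImportError:
    # Hypothetical new home inferred from the file summary; verify against 1.5.6.
    from likelihood.models.deep.predictor import GetInsights
```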