likelihood 1.5.4-py3-none-any.whl → 1.5.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
likelihood/graph/nn.py CHANGED
@@ -5,6 +5,7 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
  logging.getLogger("tensorflow").setLevel(logging.ERROR)

  import warnings
+ from multiprocessing import Pool, cpu_count
  from typing import Any, List, Tuple

  import numpy as np
@@ -13,65 +14,79 @@ import tensorflow as tf
  from IPython.display import clear_output
  from pandas.core.frame import DataFrame
  from sklearn.metrics import f1_score
- from sklearn.model_selection import train_test_split

  tf.get_logger().setLevel("ERROR")

  from likelihood.tools import LoRALayer


- def compare_similarity(arr1: List[Any], arr2: List[Any], threshold: float = 0.05) -> int:
-     """Calculate the similarity between two arrays considering numeric values near to 1 in ratio."""
+ def compare_similarity_np(arr1: np.ndarray, arr2: np.ndarray, threshold: float = 0.05) -> int:
+     """Vectorized similarity comparison between two numeric/categorical arrays."""
+     arr1 = np.asarray(arr1)
+     arr2 = np.asarray(arr2)

-     def is_similar(a: Any, b: Any) -> bool:
-         if isinstance(a, (int, float)) and isinstance(b, (int, float)):
-             if a == 0 and b == 0:
-                 return True
-             if a == 0 or b == 0:
-                 return False
-             # For numeric values, check if their ratio is within the threshold range
-             ratio = max(a, b) / min(a, b)
-             return 1 - threshold <= ratio <= 1 + threshold
-         else:
-             return a == b
+     is_numeric = np.vectorize(
+         lambda a, b: isinstance(a, (int, float)) and isinstance(b, (int, float))
+     )(arr1, arr2)
+
+     similarity = np.zeros_like(arr1, dtype=bool)
+
+     if np.any(is_numeric):
+         a_num = arr1[is_numeric].astype(float)
+         b_num = arr2[is_numeric].astype(float)
+
+         both_zero = (a_num == 0) & (b_num == 0)
+         nonzero = ~both_zero & (a_num != 0) & (b_num != 0)
+         ratio = np.zeros_like(a_num)
+         ratio[nonzero] = np.maximum(a_num[nonzero], b_num[nonzero]) / np.minimum(
+             a_num[nonzero], b_num[nonzero]
+         )
+         numeric_similar = both_zero | ((1 - threshold <= ratio) & (ratio <= 1 + threshold))
+
+         similarity[is_numeric] = numeric_similar
+
+     similarity[~is_numeric] = arr1[~is_numeric] == arr2[~is_numeric]
+
+     return np.count_nonzero(similarity)

-     return sum(is_similar(a, b) for a, b in zip(arr1, arr2))
+
+ def compare_pair(pair, data, similarity, threshold):
+     i, j = pair
+     sim = compare_similarity_np(data[i], data[j], threshold=threshold)
+     return (i, j, 1 if sim >= similarity else 0)


  def cal_adjacency_matrix(
-     df: DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
+     df: pd.DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
  ) -> Tuple[dict, np.ndarray]:
-     """Calculates the adjacency matrix for a given DataFrame.
-     The adjacency matrix is a matrix that represents the similarity between each pair of features.
-     The similarity is calculated using the `compare_similarity` function.
-     The resulting matrix is a square matrix with the same number of rows and columns as the rows of the input DataFrame.
+     """
+     Calculates the adjacency matrix for a given DataFrame using parallel processing.

      Parameters
      ----------
      df : `DataFrame`
          The input DataFrame containing the features.
-     exclude_subset : `List[str]`, optional
+     exclude_subset : `List[str]`, `optional`
          A list of features to exclude from the calculation of the adjacency matrix.
-     sparse : `bool`, optional
+     sparse : `bool`, `optional`
          Whether to return a sparse matrix or a dense matrix.
      **kwargs : `dict`
          Additional keyword arguments to pass to the `compare_similarity` function.

-     Keyword Arguments:
-     ----------
-     similarity: `int`
-         The minimum number of features that must be the same in both arrays to be considered similar.
-     threshold : `float`
-         The threshold value used in the `compare_similarity` function. Default is 0.05.
-
      Returns
      -------
      adj_dict : `dict`
          A dictionary containing the features.
      adjacency_matrix : `ndarray`
          The adjacency matrix.
-     """

+     Keyword Arguments:
+     ----------
+     similarity: `int`
+         The minimum number of features that must be the same in both arrays to be considered similar.
+     threshold : `float`
+         The threshold value used in the `compare_similarity` function. Default is 0.0
+     """
      if len(exclude_subset) > 0:
          columns = [col for col in df.columns if col not in exclude_subset]
          df_ = df[columns].copy()
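
Note: `compare_similarity_np` keeps the old per-element rule (two numeric values are similar when max(a, b)/min(a, b) lies within 1 ± threshold, a pair of zeros always matches, and everything else is compared by equality) but applies it to whole rows at once. One wording regression worth flagging: the relocated docstring now ends "Default is 0.0", while the code default shown in the next hunk is still 0.05. A minimal sketch of the new function's behavior, assuming object-dtype rows like those `df.to_numpy()` yields on mixed data:

    import numpy as np
    from likelihood.graph.nn import compare_similarity_np

    # Two mixed-type rows, as produced by df.to_numpy() on a mixed DataFrame
    row_a = np.array([100.0, 0.0, "red"], dtype=object)
    row_b = np.array([103.0, 0.0, "red"], dtype=object)

    # 103/100 = 1.03 lies within 1 +/- 0.05; 0 vs 0 matches; "red" == "red" matches
    compare_similarity_np(row_a, row_b, threshold=0.05)  # -> 3
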
@@ -84,14 +99,26 @@ def cal_adjacency_matrix(
      threshold = kwargs.get("threshold", 0.05)
      assert similarity <= df_.shape[1]

-     adj_dict = {index: row.tolist() for index, row in df_.iterrows()}
+     data = df_.to_numpy()
+     n = len(data)

-     adjacency_matrix = np.zeros((len(df_), len(df_)))
+     adj_dict = {i: data[i].tolist() for i in range(n)}

-     for i in range(len(df_)):
-         for j in range(len(df_)):
-             if compare_similarity(adj_dict[i], adj_dict[j], threshold=threshold) >= similarity:
-                 adjacency_matrix[i][j] = 1
+     def pair_generator():
+         for i in range(n):
+             for j in range(i, n):
+                 yield (i, j)
+
+     with Pool(cpu_count()) as pool:
+         results = pool.starmap(
+             compare_pair, ((pair, data, similarity, threshold) for pair in pair_generator())
+         )
+
+     adjacency_matrix = np.zeros((n, n), dtype=np.uint8)
+     for i, j, val in results:
+         if val:
+             adjacency_matrix[i, j] = 1
+             adjacency_matrix[j, i] = 1

      if sparse:
          num_nodes = adjacency_matrix.shape[0]
  num_nodes = adjacency_matrix.shape[0]
@@ -103,9 +130,7 @@ def cal_adjacency_matrix(
              indices=indices, values=values, dense_shape=(num_nodes, num_nodes)
          )

-         return adj_dict, adjacency_matrix
-     else:
-         return adj_dict, adjacency_matrix
+     return adj_dict, adjacency_matrix


  class Data:
@@ -260,12 +285,17 @@ class VanillaGNN(tf.keras.Model):
          val_losses = []
          val_f1_scores = []

-         X_train, X_test, y_train, y_test = train_test_split(
-             data.x, data.y, test_size=test_size, shuffle=False
-         )
-         adjacency_train = tf.sparse.slice(data.adjacency, [0, 0], [len(X_train), len(X_train)])
+         num_nodes = len(data.x)
+         split_index = int((1 - test_size) * num_nodes)
+
+         X_train, X_test = data.x[:split_index], data.x[split_index:]
+         y_train, y_test = data.y[:split_index], data.y[split_index:]
+
+         adjacency_train = tf.sparse.slice(data.adjacency, [0, 0], [split_index, split_index])
          adjacency_test = tf.sparse.slice(
-             data.adjacency, [len(X_train), 0], [len(X_test), len(X_test)]
+             data.adjacency,
+             [split_index, split_index],
+             [num_nodes - split_index, num_nodes - split_index],
          )

          batch_starts = np.arange(0, len(X_train), batch_size)
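
Note: the manual chronological split replaces `train_test_split(..., shuffle=False)` and also fixes the validation adjacency slice: the old offset `[len(X_train), 0]` took test rows against the first `len(X_test)` *training* columns, while the new `[split_index, split_index]` offset takes the test-by-test block. The slice arithmetic, illustrated for 10 nodes and test_size=0.2:

    num_nodes = 10
    test_size = 0.2
    split_index = int((1 - test_size) * num_nodes)  # 8

    # train block: rows/cols [0, 8)  -> tf.sparse.slice(adj, [0, 0], [8, 8])
    # test block:  rows/cols [8, 10) -> tf.sparse.slice(adj, [8, 8], [2, 2])
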
@@ -286,10 +316,6 @@ class VanillaGNN(tf.keras.Model):

              if epoch % 5 == 0:
                  clear_output(wait=True)
-                 warnings.warn(
-                     "It is normal for validation metrics to underperform during training. Use the test method to validate after training.",
-                     UserWarning,
-                 )
                  val_loss, val_f1 = self.evaluate(X_test, adjacency_test, y_test)
                  val_losses.append(val_loss)
                  val_f1_scores.append(val_f1)
@@ -1,2 +1,3 @@
  from .autoencoders import *
  from .gan import *
+ from .predictor import GetInsights
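
Note: the file header for this hunk is not preserved in this view; from its contents it is a package `__init__.py` that now re-exports the relocated class from a new `predictor` module, so downstream code can keep a flat import (exact package path assumed):

    from likelihood.models import GetInsights  # path assumed from this __init__ hunk
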
@@ -1,27 +1,17 @@
  import logging
  import os
- import random
- import warnings
  from functools import partial
  from shutil import rmtree

- import matplotlib
- import matplotlib.colors as mcolors
- import matplotlib.pyplot as plt
  import numpy as np
  import pandas as pd
- from pandas.plotting import radviz

  os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
  logging.getLogger("tensorflow").setLevel(logging.ERROR)

-
- from typing import List
-
  import keras_tuner
  import tensorflow as tf
  from pandas.core.frame import DataFrame
- from sklearn.manifold import TSNE
  from tensorflow.keras.layers import InputLayer
  from tensorflow.keras.regularizers import l2

@@ -889,448 +879,3 @@ def setup_model(

      best_hps = tuner.get_best_hyperparameters(1)[0].values
      return best_model, pd.DataFrame(best_hps, index=["Value"])
-
-
- class GetInsights:
-     """
-     A class to analyze the output of a neural network model, including visualizations
-     of the weights, t-SNE representation, and feature statistics.
-
-     Parameters
-     ----------
-     model : `AutoClassifier`
-         The trained model to analyze.
-     inputs : `np.ndarray`
-         The input data for analysis.
-     """
-
-     def __init__(self, model: AutoClassifier, inputs: np.ndarray) -> None:
-         """
-         Initializes the GetInsights class.
-
-         Parameters
-         ----------
-         model : `AutoClassifier`
-             The trained model to analyze.
-         inputs : `np.ndarray`
-             The input data for analysis.
-         """
-         self.inputs = inputs
-         self.model = model
-
-         self.encoder_layer = (
-             self.model.encoder.layers[1]
-             if isinstance(self.model.encoder.layers[0], InputLayer)
-             else self.model.encoder.layers[0]
-         )
-         self.decoder_layer = self.model.decoder.layers[0]
-
-         self.encoder_weights = self.encoder_layer.get_weights()[0]
-         self.decoder_weights = self.decoder_layer.get_weights()[0]
-
-         self.sorted_names = self._generate_sorted_color_names()
-
-     def _generate_sorted_color_names(self) -> list:
-         """
-         Generate sorted color names based on their HSV values.
-
-         Parameters
-         ----------
-         `None`
-
-         Returns
-         -------
-         `list` : Sorted color names.
-         """
-         colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)
-         by_hsv = sorted(
-             (tuple(mcolors.rgb_to_hsv(mcolors.to_rgba(color)[:3])), name)
-             for name, color in colors.items()
-         )
-         sorted_names = [name for hsv, name in by_hsv if hsv[1] > 0.4 and hsv[2] >= 0.4]
-         random.shuffle(sorted_names)
-         return sorted_names
-
-     def predictor_analyzer(
-         self,
-         frac: float = None,
-         cmap: str = "viridis",
-         aspect: str = "auto",
-         highlight: bool = True,
-         **kwargs,
-     ) -> None:
-         """
-         Analyze the model's predictions and visualize data.
-
-         Parameters
-         ----------
-         frac : `float`, optional
-             Fraction of data to use for analysis (default is `None`).
-         cmap : `str`, optional
-             The colormap for visualization (default is `"viridis"`).
-         aspect : `str`, optional
-             Aspect ratio for the visualization (default is `"auto"`).
-         highlight : `bool`, optional
-             Whether to highlight the maximum weights (default is `True`).
-         **kwargs : `dict`, optional
-             Additional keyword arguments for customization.
-
-         Returns
-         -------
-         `DataFrame` : The statistical summary of the input data.
-         """
-         self._viz_weights(cmap=cmap, aspect=aspect, highlight=highlight, **kwargs)
-         inputs = self.inputs.copy()
-         inputs = self._prepare_inputs(inputs, frac)
-         y_labels = kwargs.get("y_labels", None)
-         encoded, reconstructed = self._encode_decode(inputs)
-         self._visualize_data(inputs, reconstructed, cmap, aspect)
-         self._prepare_data_for_analysis(inputs, reconstructed, encoded, y_labels)
-
-         try:
-             self._get_tsne_repr(inputs, frac)
-             self._viz_tsne_repr(c=self.classification)
-
-             self._viz_radviz(self.data, "class", "Radviz Visualization of Latent Space")
-             self._viz_radviz(self.data_input, "class", "Radviz Visualization of Input Data")
-         except ValueError:
-             warnings.warn(
-                 "Some functions or processes will not be executed for regression problems.",
-                 UserWarning,
-             )
-
-         return self._statistics(self.data_input)
-
-     def _prepare_inputs(self, inputs: np.ndarray, frac: float) -> np.ndarray:
-         """
-         Prepare the input data, possibly selecting a fraction of it.
-
-         Parameters
-         ----------
-         inputs : `np.ndarray`
-             The input data.
-         frac : `float`
-             Fraction of data to use.
-
-         Returns
-         -------
-         `np.ndarray` : The prepared input data.
-         """
-         if frac:
-             n = int(frac * self.inputs.shape[0])
-             indexes = np.random.choice(np.arange(inputs.shape[0]), n, replace=False)
-             inputs = inputs[indexes]
-         inputs[np.isnan(inputs)] = 0.0
-         return inputs
-
-     def _encode_decode(self, inputs: np.ndarray) -> tuple:
-         """
-         Perform encoding and decoding on the input data.
-
-         Parameters
-         ----------
-         inputs : `np.ndarray`
-             The input data.
-
-         Returns
-         -------
-         `tuple` : The encoded and reconstructed data.
-         """
-         try:
-             mean, log_var = self.model.encoder(inputs)
-             encoded = sampling(mean, log_var)
-         except:
-             encoded = self.model.encoder(inputs)
-         reconstructed = self.model.decoder(encoded)
-         return encoded, reconstructed
-
-     def _visualize_data(
-         self, inputs: np.ndarray, reconstructed: np.ndarray, cmap: str, aspect: str
-     ) -> None:
-         """
-         Visualize the original data and the reconstructed data.
-
-         Parameters
-         ----------
-         inputs : `np.ndarray`
-             The input data.
-         reconstructed : `np.ndarray`
-             The reconstructed data.
-         cmap : `str`
-             The colormap for visualization.
-         aspect : `str`
-             Aspect ratio for the visualization.
-
-         Returns
-         -------
-         `None`
-         """
-         ax = plt.subplot(1, 2, 1)
-         plt.imshow(inputs, cmap=cmap, aspect=aspect)
-         plt.colorbar()
-         plt.title("Original Data")
-
-         plt.subplot(1, 2, 2, sharex=ax, sharey=ax)
-         plt.imshow(reconstructed, cmap=cmap, aspect=aspect)
-         plt.colorbar()
-         plt.title("Decoder Layer Reconstruction")
-         plt.show()
-
-     def _prepare_data_for_analysis(
-         self,
-         inputs: np.ndarray,
-         reconstructed: np.ndarray,
-         encoded: np.ndarray,
-         y_labels: List[str],
-     ) -> None:
-         """
-         Prepare data for statistical analysis.
-
-         Parameters
-         ----------
-         inputs : `np.ndarray`
-             The input data.
-         reconstructed : `np.ndarray`
-             The reconstructed data.
-         encoded : `np.ndarray`
-             The encoded data.
-         y_labels : `List[str]`
-             The labels of features.
-
-         Returns
-         -------
-         `None`
-         """
-         self.classification = (
-             self.model.classifier(tf.concat([reconstructed, encoded], axis=1))
-             .numpy()
-             .argmax(axis=1)
-         )
-
-         self.data = pd.DataFrame(encoded, columns=[f"Feature {i}" for i in range(encoded.shape[1])])
-         self.data_input = pd.DataFrame(
-             inputs,
-             columns=(
-                 [f"Feature {i}" for i in range(inputs.shape[1])] if y_labels is None else y_labels
-             ),
-         )
-
-         self.data["class"] = self.classification
-         self.data_input["class"] = self.classification
-
-     def _get_tsne_repr(self, inputs: np.ndarray = None, frac: float = None) -> None:
-         """
-         Perform t-SNE dimensionality reduction on the input data.
-
-         Parameters
-         ----------
-         inputs : `np.ndarray`
-             The input data.
-         frac : `float`
-             Fraction of data to use.
-
-         Returns
-         -------
-         `None`
-         """
-         if inputs is None:
-             inputs = self.inputs.copy()
-         if frac:
-             n = int(frac * self.inputs.shape[0])
-             indexes = np.random.choice(np.arange(inputs.shape[0]), n, replace=False)
-             inputs = inputs[indexes]
-         inputs[np.isnan(inputs)] = 0.0
-         self.latent_representations = inputs @ self.encoder_weights
-
-         tsne = TSNE(n_components=2)
-         self.reduced_data_tsne = tsne.fit_transform(self.latent_representations)
-
-     def _viz_tsne_repr(self, **kwargs) -> None:
-         """
-         Visualize the t-SNE representation of the latent space.
-
-         Parameters
-         ----------
-         **kwargs : `dict`
-             Additional keyword arguments for customization.
-
-         Returns
-         -------
-         `None`
-         """
-         c = kwargs.get("c", None)
-         self.colors = (
-             kwargs.get("colors", self.sorted_names[: len(np.unique(c))]) if c is not None else None
-         )
-
-         plt.scatter(
-             self.reduced_data_tsne[:, 0],
-             self.reduced_data_tsne[:, 1],
-             cmap=matplotlib.colors.ListedColormap(self.colors) if c is not None else None,
-             c=c,
-         )
-
-         if c is not None:
-             cb = plt.colorbar()
-             loc = np.arange(0, max(c), max(c) / float(len(self.colors)))
-             cb.set_ticks(loc)
-             cb.set_ticklabels(np.unique(c))
-
-         plt.title("t-SNE Visualization of Latent Space")
-         plt.xlabel("t-SNE 1")
-         plt.ylabel("t-SNE 2")
-         plt.show()
-
-     def _viz_radviz(self, data: pd.DataFrame, color_column: str, title: str) -> None:
-         """
-         Visualize the data using RadViz.
-
-         Parameters
-         ----------
-         data : `pd.DataFrame`
-             The data to visualize.
-         color_column : `str`
-             The column to use for coloring.
-         title : `str`
-             The title of the plot.
-
-         Returns
-         -------
-         `None`
-         """
-         data_normalized = data.copy(deep=True)
-         data_normalized.iloc[:, :-1] = (
-             2.0
-             * (data_normalized.iloc[:, :-1] - data_normalized.iloc[:, :-1].min())
-             / (data_normalized.iloc[:, :-1].max() - data_normalized.iloc[:, :-1].min())
-             - 1
-         )
-         radviz(data_normalized, color_column, color=self.colors)
-         plt.title(title)
-         plt.show()
-
-     def _viz_weights(
-         self, cmap: str = "viridis", aspect: str = "auto", highlight: bool = True, **kwargs
-     ) -> None:
-         """
-         Visualize the encoder layer weights of the model.
-
-         Parameters
-         ----------
-         cmap : `str`, optional
-             The colormap for visualization (default is `"viridis"`).
-         aspect : `str`, optional
-             Aspect ratio for the visualization (default is `"auto"`).
-         highlight : `bool`, optional
-             Whether to highlight the maximum weights (default is `True`).
-         **kwargs : `dict`, optional
-             Additional keyword arguments for customization.
-
-         Returns
-         -------
-         `None`
-         """
-         title = kwargs.get("title", "Encoder Layer Weights (Dense Layer)")
-         y_labels = kwargs.get("y_labels", None)
-         cmap_highlight = kwargs.get("cmap_highlight", "Pastel1")
-         highlight_mask = np.zeros_like(self.encoder_weights, dtype=bool)
-
-         plt.imshow(self.encoder_weights, cmap=cmap, aspect=aspect)
-         plt.colorbar()
-         plt.title(title)
-         if y_labels is not None:
-             plt.yticks(ticks=np.arange(self.encoder_weights.shape[0]), labels=y_labels)
-         if highlight:
-             for i, j in enumerate(self.encoder_weights.argmax(axis=1)):
-                 highlight_mask[i, j] = True
-             plt.imshow(
-                 np.ma.masked_where(~highlight_mask, self.encoder_weights),
-                 cmap=cmap_highlight,
-                 alpha=0.5,
-                 aspect=aspect,
-             )
-         plt.show()
-
-     def _statistics(self, data_input: DataFrame) -> DataFrame:
-         """
-         Compute statistical summaries of the input data.
-
-         Parameters
-         ----------
-         data_input : `DataFrame`
-             The data to compute statistics for.
-
-         Returns
-         -------
-         `DataFrame` : The statistical summary of the input data.
-         """
-         data = data_input.copy(deep=True)
-
-         if not pd.api.types.is_string_dtype(data["class"]):
-             data["class"] = data["class"].astype(str)
-
-         data.ffill(inplace=True)
-         grouped_data = data.groupby("class")
-
-         numerical_stats = grouped_data.agg(["mean", "min", "max", "std", "median"])
-         numerical_stats.columns = ["_".join(col).strip() for col in numerical_stats.columns.values]
-
-         def get_mode(x):
-             mode_series = x.mode()
-             return mode_series.iloc[0] if not mode_series.empty else None
-
-         mode_stats = grouped_data.apply(get_mode, include_groups=False)
-         mode_stats.columns = [f"{col}_mode" for col in mode_stats.columns]
-         combined_stats = pd.concat([numerical_stats, mode_stats], axis=1)
-
-         return combined_stats.T
-
-
- ########################################################################################
-
- if __name__ == "__main__":
-     # Example usage
-     import pandas as pd
-     from sklearn.datasets import load_iris
-     from sklearn.preprocessing import OneHotEncoder
-
-     # Load the dataset
-     iris = load_iris()
-
-     # Convert to a DataFrame for easy exploration
-     iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
-     iris_df["species"] = iris.target
-
-     X = iris_df.drop(columns="species")
-     y_labels = X.columns
-     X = X.values
-     y = iris_df["species"].values
-
-     X = np.asarray(X).astype(np.float32)
-
-     encoder = OneHotEncoder()
-     y = encoder.fit_transform(y.reshape(-1, 1)).toarray()
-     y = np.asarray(y).astype(np.float32)
-
-     model = AutoClassifier(
-         input_shape_parm=X.shape[1],
-         num_classes=3,
-         units=27,
-         activation="tanh",
-         num_layers=2,
-         dropout=0.2,
-     )
-     model.compile(
-         optimizer="adam",
-         loss=tf.keras.losses.CategoricalCrossentropy(),
-         metrics=[tf.keras.metrics.F1Score(threshold=0.5)],
-     )
-     model.fit(X, y, epochs=50, validation_split=0.2)
-
-     insights = GetInsights(model, X)
-     summary = insights.predictor_analyzer(frac=1.0, y_labels=y_labels)
-     insights._get_tsne_repr()
-     insights._viz_tsne_repr()
-     insights._viz_tsne_repr(c=iris_df["species"])
-     insights._viz_weights()
-     print(summary)
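
Note: the removed block is the entire `GetInsights` class plus its `__main__` demo. Read together with the `from .predictor import GetInsights` hunk above, the class appears to have been moved to a new `predictor` module rather than dropped, which also lets this module shed its matplotlib, radviz, and scikit-learn t-SNE imports. The removed demo, condensed to its essential calls (sketch only; `model` is a trained `AutoClassifier`, `X` its float32 inputs, and the import path is assumed):

    from likelihood.models import GetInsights  # assumed new home of the class

    insights = GetInsights(model, X)  # model, X as in the removed iris demo
    summary = insights.predictor_analyzer(frac=1.0, y_labels=y_labels)
    print(summary)
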