likelihood 2.2.0.dev1__cp311-cp311-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- likelihood/VERSION +1 -0
- likelihood/__init__.py +20 -0
- likelihood/graph/__init__.py +9 -0
- likelihood/graph/_nn.py +283 -0
- likelihood/graph/graph.py +86 -0
- likelihood/graph/nn.py +329 -0
- likelihood/main.py +273 -0
- likelihood/models/__init__.py +3 -0
- likelihood/models/deep/__init__.py +13 -0
- likelihood/models/deep/_autoencoders.py +896 -0
- likelihood/models/deep/_predictor.py +809 -0
- likelihood/models/deep/autoencoders.py +903 -0
- likelihood/models/deep/bandit.py +97 -0
- likelihood/models/deep/gan.py +313 -0
- likelihood/models/deep/predictor.py +805 -0
- likelihood/models/deep/rl.py +345 -0
- likelihood/models/environments.py +202 -0
- likelihood/models/hmm.py +163 -0
- likelihood/models/regression.py +451 -0
- likelihood/models/simulation.py +213 -0
- likelihood/models/utils.py +87 -0
- likelihood/pipes.py +382 -0
- likelihood/rust_py_integration.cpython-311-x86_64-linux-gnu.so +0 -0
- likelihood/tools/__init__.py +4 -0
- likelihood/tools/cat_embed.py +212 -0
- likelihood/tools/figures.py +348 -0
- likelihood/tools/impute.py +278 -0
- likelihood/tools/models_tools.py +866 -0
- likelihood/tools/numeric_tools.py +390 -0
- likelihood/tools/reports.py +375 -0
- likelihood/tools/tools.py +1336 -0
- likelihood-2.2.0.dev1.dist-info/METADATA +68 -0
- likelihood-2.2.0.dev1.dist-info/RECORD +37 -0
- likelihood-2.2.0.dev1.dist-info/WHEEL +5 -0
- likelihood-2.2.0.dev1.dist-info/licenses/LICENSE +21 -0
- likelihood-2.2.0.dev1.dist-info/top_level.txt +7 -0
- src/lib.rs +12 -0
|
@@ -0,0 +1,805 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import warnings
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
import matplotlib
|
|
6
|
+
import matplotlib.colors as mcolors
|
|
7
|
+
import matplotlib.pyplot as plt
|
|
8
|
+
import networkx as nx
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
import tensorflow as tf
|
|
12
|
+
from IPython.display import HTML, display
|
|
13
|
+
from matplotlib import cm
|
|
14
|
+
from matplotlib.colors import Normalize
|
|
15
|
+
from pandas.plotting import radviz
|
|
16
|
+
from sklearn.manifold import TSNE
|
|
17
|
+
from tensorflow.keras.layers import InputLayer
|
|
18
|
+
|
|
19
|
+
from likelihood.models.deep.autoencoders import AutoClassifier, sampling
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class GetInsights:
|
|
23
|
+
"""
|
|
24
|
+
A class to analyze the output of a neural network model, including visualizations
|
|
25
|
+
of the weights, t-SNE representation, and feature statistics.
|
|
26
|
+
|
|
27
|
+
Parameters
|
|
28
|
+
----------
|
|
29
|
+
model : `AutoClassifier`
|
|
30
|
+
The trained model to analyze.
|
|
31
|
+
inputs : `np.ndarray`
|
|
32
|
+
The input data for analysis.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
def __init__(self, model: AutoClassifier, inputs: np.ndarray) -> None:
    """
    Initializes the GetInsights class.

    Parameters
    ----------
    model : `AutoClassifier`
        The trained model to analyze. Its `encoder` and `decoder` sub-models
        are inspected, and the first weight array of their first non-input
        layer is cached.
    inputs : `np.ndarray`
        The input data for analysis.
    """
    self.inputs = inputs
    self.model = model

    # Defaults for attributes that other methods read; they are filled in later
    # by `predictor_analyzer` (y_labels) and `_viz_tsne_repr` (colors).
    self.y_labels = None
    self.colors = None

    def first_weighted_layer(submodel):
        # An `InputLayer` carries no weights, so skip it when it comes first.
        layers = submodel.layers
        return layers[1] if isinstance(layers[0], InputLayer) else layers[0]

    self.encoder_layer = first_weighted_layer(self.model.encoder)
    # The decoder gets the same treatment as the encoder; indexing layer 0
    # unconditionally fails when the decoder starts with an `InputLayer`.
    self.decoder_layer = first_weighted_layer(self.model.decoder)

    self.encoder_weights = self.encoder_layer.get_weights()[0]
    self.decoder_weights = self.decoder_layer.get_weights()[0]

    self.sorted_names = self._generate_sorted_color_names()
|
|
60
|
+
|
|
61
|
+
def _generate_sorted_color_names(self) -> list:
    """
    Build a shuffled list of vivid matplotlib color names.

    The base and CSS4 color tables are merged, ordered by their HSV
    components, filtered to colors with saturation above 0.4 and value of at
    least 0.4, and finally shuffled in place with `random.shuffle`.

    Parameters
    ----------
    `None`

    Returns
    -------
    `list` : Color names that passed the saturation/value filter, in random order.
    """
    palette = {**mcolors.BASE_COLORS, **mcolors.CSS4_COLORS}

    hsv_name_pairs = []
    for color_name, spec in palette.items():
        rgb = mcolors.to_rgba(spec)[:3]
        hsv_name_pairs.append((tuple(mcolors.rgb_to_hsv(rgb)), color_name))
    hsv_name_pairs.sort()

    vivid_names = []
    for hsv, color_name in hsv_name_pairs:
        # hsv[1] is saturation and hsv[2] is value (brightness).
        if hsv[1] > 0.4 and hsv[2] >= 0.4:
            vivid_names.append(color_name)

    random.shuffle(vivid_names)
    return vivid_names
|
|
81
|
+
|
|
82
|
+
def render_html_report(
    self,
    frac: float = 0.2,
    top_k: int = 5,
    threshold_factor: float = 1.0,
    max_rows: int = 5,
    **kwargs,
) -> None:
    """
    Generate and display an embedded HTML report in a Jupyter Notebook cell.

    Parameters
    ----------
    frac : `float`, optional
        Fraction of the data forwarded to `predictor_analyzer` (default is `0.2`).
    top_k : `int`, optional
        Forwarded to the graph visualizations (default is `5`).
    threshold_factor : `float`, optional
        Forwarded to the graph visualizations (default is `1.0`).
    max_rows : `int`, optional
        Number of rows of the statistics table to show; `None` or a
        non-positive value shows the whole table (default is `5`).
    **kwargs : `dict`, optional
        Extra keyword arguments forwarded to `predictor_analyzer`.

    Returns
    -------
    `None`
    """

    def section(heading_html: str, body_html: str) -> None:
        # Each report section is a heading followed by one explanatory paragraph.
        display(HTML(heading_html))
        display(HTML(body_html))

    section(
        "<h2 style='margin-top:20px;'>📊 Predictor Analysis</h2>",
        "<p>This section visualizes how the model predicts the data. "
        "You will see original inputs, reconstructed outputs, and analyses such as t-SNE "
        "that reduce dimensionality to visualize latent space clustering.</p>",
    )
    stats_df = self.predictor_analyzer(frac=frac, **kwargs)

    section(
        "<h2 style='margin-top:30px;'>🔁 Encoder-Decoder Graph</h2>",
        "<p>This visualization displays the connections between layers in the encoder and decoder. "
        "Edges with the strongest weights are highlighted to emphasize influential features "
        "in the model's transformation.</p>",
    )
    # The encoder/decoder graph is skipped when the encoder name starts with "vae".
    encoder_is_vae = self.model.encoder.name.startswith("vae")
    if not encoder_is_vae:
        self.viz_encoder_decoder_graphs(threshold_factor=threshold_factor, top_k=top_k)

    section(
        "<h2 style='margin-top:30px;'>🧠 Classifier Layer Graphs</h2>",
        "<p>This visualization shows how features propagate through each dense layer in the classifier. "
        "Only the strongest weighted connections are shown to highlight influential paths through the network.</p>",
    )
    self.viz_classifier_graphs(threshold_factor=threshold_factor, top_k=top_k)

    section(
        "<h2 style='margin-top:30px;'>📈 Statistical Summary</h2>",
        "<p>This table summarizes feature statistics grouped by predicted classes, "
        "including means, standard deviations, and modes, providing insight into "
        "feature distributions across different classes.</p>",
    )

    show_everything = max_rows is None or not max_rows > 0
    stats_to_display = stats_df if show_everything else stats_df.head(max_rows)

    cell_style = {
        "border": "1px solid #ddd",
        "padding": "8px",
        "text-align": "center",
    }
    styled = stats_to_display.style.set_table_attributes(
        "style='display:inline;border-collapse:collapse;'"
    )
    styled = styled.set_caption("Feature Summary per Class")
    styled = styled.set_properties(**cell_style)
    display(styled)

    display(
        HTML(
            "<p style='color: gray; margin-top:30px;'>Report generated with "
            "<code>GetInsights</code> class. For detailed customization, extend "
            "<code>render_html_report</code>.</p>"
        )
    )
|
|
158
|
+
|
|
159
|
+
def viz_classifier_graphs(self, threshold_factor=1.0, top_k=5, save_path=None):
    """
    Visualize all Dense layers in self.model.classifier as a single directed graph,
    connecting each Dense layer to the next.

    Parameters
    ----------
    threshold_factor : `float`, optional
        An edge is drawn only when its absolute weight exceeds
        `threshold_factor` times the mean absolute weight of its layer
        (default is `1.0`).
    top_k : `int`, optional
        Number of largest-magnitude weights per layer that are flagged through
        the `highlight` edge attribute (default is `5`). Values larger than the
        number of weights in a layer are clamped; non-positive values flag nothing.
    save_path : `str`, optional
        If given, the figure is saved to this path before it is shown.

    Returns
    -------
    `None`
    """

    def get_top_k_edges(weights, src_prefix, dst_prefix, k):
        # Return the k largest-magnitude weights as (src, dst, weight) triples,
        # ordered from largest to smallest magnitude.
        flat_weights = np.abs(weights.flatten())
        # `np.argpartition` raises when k exceeds the array size, and `-0`
        # would select every element, so clamp and short-circuit.
        k = min(k, flat_weights.size)
        if k <= 0:
            return []
        indices = np.argpartition(flat_weights, -k)[-k:]
        top_k_flat_indices = indices[np.argsort(-flat_weights[indices])]
        top_k_edges = []

        for flat_index in top_k_flat_indices:
            i, j = np.unravel_index(flat_index, weights.shape)
            top_k_edges.append((f"{src_prefix}_{i}", f"{dst_prefix}_{j}", weights[i, j]))
        return top_k_edges

    def add_dense_layer_edges(G, weights, layer_idx, threshold_factor, top_k):
        # Nodes are named "L<layer>_<unit>"; layer k's outputs are layer k+1's inputs.
        src_prefix = f"L{layer_idx}"
        dst_prefix = f"L{layer_idx + 1}"
        input_nodes = [f"{src_prefix}_{i}" for i in range(weights.shape[0])]
        output_nodes = [f"{dst_prefix}_{j}" for j in range(weights.shape[1])]

        G.add_nodes_from(input_nodes + output_nodes)

        abs_weights = np.abs(weights)
        threshold = threshold_factor * np.mean(abs_weights)
        top_k_edges = get_top_k_edges(weights, src_prefix, dst_prefix, top_k)
        top_k_set = {(u, v) for u, v, _ in top_k_edges}

        for i, src in enumerate(input_nodes):
            for j, dst in enumerate(output_nodes):
                w = weights[i, j]
                if abs(w) > threshold:
                    G.add_edge(src, dst, weight=w, highlight=(src, dst) in top_k_set)

    def compute_layout(G):
        # Place every layer in its own column, nodes spread evenly from y=1 to y=-1.
        pos = {}
        layer_nodes = {}

        for node in G.nodes():
            layer_idx = int(node.split("_")[0][1:])
            layer_nodes.setdefault(layer_idx, []).append(node)

        for layer_idx, nodes in sorted(layer_nodes.items()):
            y_positions = np.linspace(1, -1, len(nodes))
            for y, node in zip(y_positions, nodes):
                pos[node] = (layer_idx * 2, y)

        return pos

    def draw_graph(G, pos, title, save_path=None):
        weights = [abs(G[u][v]["weight"]) for u, v in G.edges()]
        if not weights:
            print("No edges to draw.")
            return

        norm = Normalize(vmin=min(weights), vmax=max(weights))
        # `matplotlib.cm.get_cmap` was removed in Matplotlib 3.9;
        # `pyplot.get_cmap` is available across versions.
        cmap = plt.get_cmap("coolwarm")

        # NOTE(review): the norm is built from absolute weights while the colors
        # use the signed weight, so negative weights fall below vmin — confirm intended.
        edge_colors = [cmap(norm(G[u][v]["weight"])) for u, v in G.edges()]
        edge_widths = [1.0 + 2.0 * norm(abs(G[u][v]["weight"])) for u, v in G.edges()]

        fig, ax = plt.subplots(figsize=(12, 8))

        nx.draw(
            G,
            pos,
            ax=ax,
            with_labels=True,
            node_color="lightgray",
            node_size=1000,
            font_size=8,
            edge_color=edge_colors,
            width=edge_widths,
            arrows=True,
        )

        ax.set_title(title, fontsize=14)

        sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
        sm.set_array([])
        plt.colorbar(sm, ax=ax, orientation="vertical", label="Edge Weight")

        plt.tight_layout()
        if save_path:
            plt.savefig(save_path)
        plt.show()

    dense_layers = [
        layer
        for layer in self.model.classifier.layers
        if isinstance(layer, tf.keras.layers.Dense)
    ]

    if not dense_layers:
        print("No Dense layers found in classifier.")
        return

    G = nx.DiGraph()
    for idx, layer in enumerate(dense_layers):
        weights = layer.get_weights()[0]
        add_dense_layer_edges(G, weights, idx, threshold_factor, top_k)

    pos = compute_layout(G)
    draw_graph(G, pos, "Classifier Dense Layers Graph", save_path)
|
|
265
|
+
|
|
266
|
+
def viz_encoder_decoder_graphs(self, threshold_factor=1.0, top_k=5, save_path=None):
    """
    Visualize Dense layers in self.model.encoder and self.model.decoder as directed graphs.

    After the graphs, a bar chart of the mean absolute weight per input of the
    first encoder Dense layer is shown.

    Parameters
    ----------
    threshold_factor : `float`, optional
        An edge is drawn only when its absolute weight exceeds
        `threshold_factor` times the mean absolute weight of its layer
        (default is `1.0`).
    top_k : `int`, optional
        Number of largest-magnitude weights per layer that are flagged through
        the `highlight` edge attribute (default is `5`). Values larger than the
        number of weights in a layer are clamped; non-positive values flag nothing.
    save_path : `str`, optional
        If given, the encoder/decoder figure is saved to this path before it is shown.

    Returns
    -------
    `None`
    """

    def get_top_k_edges(weights, labels_src, labels_dst_prefix, k):
        # Return the k largest-magnitude weights as (src, dst, weight) triples.
        flat_weights = np.abs(weights.flatten())
        # `np.argpartition` raises when k exceeds the array size, and `-0`
        # would select every element, so clamp and short-circuit.
        k = min(k, flat_weights.size)
        if k <= 0:
            return []
        indices = np.argpartition(flat_weights, -k)[-k:]
        top_k_flat_indices = indices[np.argsort(-flat_weights[indices])]
        top_k_edges = []
        for flat_index in top_k_flat_indices:
            i, j = np.unravel_index(flat_index, weights.shape)
            src_label = labels_src[i] if isinstance(labels_src, list) else f"{labels_src}_{i}"
            dst_label = f"{labels_dst_prefix}_{j}"
            top_k_edges.append((src_label, dst_label, weights[i, j]))
        return top_k_edges

    def add_layer_to_graph(
        G, weights, labels_src, labels_dst_prefix, x_offset, top_k_set, threshold
    ):
        output_nodes = [f"{labels_dst_prefix}_{j}" for j in range(weights.shape[1])]

        # Source nodes sit in column `x_offset`, destination nodes one column to the right.
        for node in labels_src + output_nodes:
            if node not in G:
                G.add_node(node, x=x_offset if node in labels_src else x_offset + 1)

        for i, src in enumerate(labels_src):
            for j, dst in enumerate(output_nodes):
                w = weights[i, j]
                if abs(w) > threshold:
                    G.add_edge(src, dst, weight=w, highlight=(src, dst) in top_k_set)
        return output_nodes

    def layout_graph(G):
        # Group nodes by their stored column and spread each column from y=1 to y=-1.
        pos = {}
        layers = {}
        for node, data in G.nodes(data=True):
            x = data["x"]
            layers.setdefault(x, []).append(node)

        for x in sorted(layers):
            nodes = layers[x]
            y_positions = np.linspace(1, -1, len(nodes))
            for y, node in zip(y_positions, nodes):
                pos[node] = (x, y)
        return pos

    def draw_graph(G, title, ax):
        weights = [abs(G[u][v]["weight"]) for u, v in G.edges()]
        if not weights:
            return

        norm = Normalize(vmin=min(weights), vmax=max(weights))
        # `matplotlib.cm.get_cmap` was removed in Matplotlib 3.9;
        # `pyplot.get_cmap` is available across versions.
        cmap = plt.get_cmap("coolwarm")

        # NOTE(review): the norm is built from absolute weights while the colors
        # use the signed weight, so negative weights fall below vmin — confirm intended.
        edge_colors = [cmap(norm(G[u][v]["weight"])) for u, v in G.edges()]
        edge_widths = [1.0 + 2.0 * norm(abs(G[u][v]["weight"])) for u, v in G.edges()]

        pos = layout_graph(G)
        nx.draw(
            G,
            pos,
            ax=ax,
            with_labels=True,
            node_color="lightgray",
            node_size=1000,
            font_size=8,
            edge_color=edge_colors,
            width=edge_widths,
            arrows=True,
        )

        ax.set_title(title, fontsize=12)
        sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
        sm.set_array([])
        plt.colorbar(sm, ax=ax, orientation="vertical", label="Edge Weight")

    def build_graph(layers, label_prefix, input_labels=None):
        G = nx.DiGraph()
        x_offset = 0
        prev_labels = input_labels or [
            f"{label_prefix}0_{i}" for i in range(layers[0].get_weights()[0].shape[0])
        ]

        for idx, layer in enumerate(layers):
            weights = layer.get_weights()[0]
            label = f"{label_prefix}{idx+1}"
            threshold = threshold_factor * np.mean(np.abs(weights))
            top_k_edges = get_top_k_edges(weights, prev_labels, label, top_k)
            top_k_set = {(src, dst) for src, dst, _ in top_k_edges}

            prev_labels = add_layer_to_graph(
                G, weights, prev_labels, label, x_offset, top_k_set, threshold
            )
            x_offset += 2

        return G

    encoder_layers = [
        l for l in self.model.encoder.layers if isinstance(l, tf.keras.layers.Dense)
    ]
    decoder_layers = [
        l for l in self.model.decoder.layers if isinstance(l, tf.keras.layers.Dense)
    ]

    if not encoder_layers and not decoder_layers:
        print("No Dense layers found in encoder or decoder.")
        return

    # `y_labels` may not exist yet and may be a pandas Index or an array, whose
    # truth value is ambiguous; normalize it to a plain list once.
    raw_labels = getattr(self, "y_labels", None)
    feature_labels = list(raw_labels) if raw_labels is not None else []

    n_graphs = int(bool(encoder_layers)) + int(bool(decoder_layers))
    fig, axes = plt.subplots(1, n_graphs, figsize=(7 * n_graphs, 6), squeeze=False)

    col = 0
    if encoder_layers:
        n_inputs = encoder_layers[0].get_weights()[0].shape[0]
        # Feature names are used only when there is exactly one per encoder input.
        input_labels = (
            feature_labels if feature_labels and len(feature_labels) == n_inputs else None
        )
        encoder_graph = build_graph(encoder_layers, "E", input_labels)
        draw_graph(encoder_graph, "Encoder", axes[0][col])
        col += 1

    if decoder_layers:
        decoder_graph = build_graph(decoder_layers, "D")
        draw_graph(decoder_graph, "Decoder", axes[0][col])

    fig.suptitle("Encoder & Decoder Dense Layer Graphs", fontsize=15)
    plt.tight_layout(rect=[0, 0, 1, 0.95])

    if save_path:
        plt.savefig(save_path)
    plt.show()

    if encoder_layers:
        weights = encoder_layers[0].get_weights()[0]
        # Importance of an input = mean absolute weight over its outgoing connections.
        importances = np.abs(weights).mean(axis=1)
        sorted_idx = np.argsort(-importances)
        use_feature_labels = bool(feature_labels) and len(feature_labels) == weights.shape[0]
        xticks = [feature_labels[i] if use_feature_labels else f"Input_{i}" for i in sorted_idx]

        plt.figure(figsize=(10, 4))
        plt.bar(range(len(importances)), importances[sorted_idx], color="skyblue")
        plt.xticks(range(len(importances)), xticks, rotation=45, ha="right")
        plt.title("Feature Importances (Encoder Input Layer)", fontsize=13)
        plt.ylabel("Mean |Weight|")
        plt.tight_layout()
        plt.show()
|
|
421
|
+
|
|
422
|
+
def predictor_analyzer(
    self,
    frac: float = None,
    cmap: str = "viridis",
    aspect: str = "auto",
    highlight: bool = True,
    **kwargs,
) -> pd.DataFrame:
    """
    Analyze the model's predictions and visualize data.

    The method plots the encoder weights, the (optionally subsampled) inputs
    next to their reconstruction, a t-SNE projection and two RadViz plots, and
    stores `y_labels`, `classification`, `data` and `data_input` on the instance.

    Parameters
    ----------
    frac : `float`, optional
        Fraction of data to use for analysis (default is `None`).
    cmap : `str`, optional
        The colormap for visualization (default is `"viridis"`).
    aspect : `str`, optional
        Aspect ratio for the visualization (default is `"auto"`).
    highlight : `bool`, optional
        Whether to highlight the maximum weights (default is `True`).
    **kwargs : `dict`, optional
        Additional keyword arguments for customization. `y_labels` is read
        here; all of them are forwarded to `_viz_weights`.

    Returns
    -------
    `pd.DataFrame` : The statistical summary of the input data.
    """
    self._viz_weights(cmap=cmap, aspect=aspect, highlight=highlight, **kwargs)
    inputs = self.inputs.copy()
    inputs = self._prepare_inputs(inputs, frac)
    self.y_labels = kwargs.get("y_labels", None)
    encoded, reconstructed = self._encode_decode(inputs)
    self._visualize_data(inputs, reconstructed, cmap, aspect)
    self._prepare_data_for_analysis(inputs, reconstructed, encoded, self.y_labels)

    try:
        # `inputs` is already subsampled. Passing `frac` again would draw a new
        # random permutation of the rows, so the t-SNE points would no longer
        # line up with `self.classification` used to color them.
        self._get_tsne_repr(inputs)
        self._viz_tsne_repr(c=self.classification)

        self._viz_radviz(self.data, "class", "Radviz Visualization of Latent Space")
        self._viz_radviz(self.data_input, "class", "Radviz Visualization of Input Data")
    except ValueError:
        warnings.warn(
            "Some functions or processes will not be executed for regression problems.",
            UserWarning,
        )

    return self._statistics(self.data_input)
|
|
471
|
+
|
|
472
|
+
def _prepare_inputs(self, inputs: np.ndarray, frac: float) -> np.ndarray:
|
|
473
|
+
"""
|
|
474
|
+
Prepare the input data, possibly selecting a fraction of it.
|
|
475
|
+
|
|
476
|
+
Parameters
|
|
477
|
+
----------
|
|
478
|
+
inputs : `np.ndarray`
|
|
479
|
+
The input data.
|
|
480
|
+
frac : `float`
|
|
481
|
+
Fraction of data to use.
|
|
482
|
+
|
|
483
|
+
Returns
|
|
484
|
+
-------
|
|
485
|
+
`np.ndarray` : The prepared input data.
|
|
486
|
+
"""
|
|
487
|
+
if frac:
|
|
488
|
+
n = int(frac * self.inputs.shape[0])
|
|
489
|
+
indexes = np.random.choice(np.arange(inputs.shape[0]), n, replace=False)
|
|
490
|
+
inputs = inputs[indexes]
|
|
491
|
+
inputs[np.isnan(inputs)] = 0.0
|
|
492
|
+
return inputs
|
|
493
|
+
|
|
494
|
+
def _encode_decode(self, inputs: np.ndarray) -> tuple:
|
|
495
|
+
"""
|
|
496
|
+
Perform encoding and decoding on the input data.
|
|
497
|
+
|
|
498
|
+
Parameters
|
|
499
|
+
----------
|
|
500
|
+
inputs : `np.ndarray`
|
|
501
|
+
The input data.
|
|
502
|
+
|
|
503
|
+
Returns
|
|
504
|
+
-------
|
|
505
|
+
`tuple` : The encoded and reconstructed data.
|
|
506
|
+
"""
|
|
507
|
+
try:
|
|
508
|
+
mean, log_var = self.model.encoder(inputs)
|
|
509
|
+
encoded = sampling(mean, log_var)
|
|
510
|
+
except:
|
|
511
|
+
encoded = self.model.encoder(inputs)
|
|
512
|
+
reconstructed = self.model.decoder(encoded)
|
|
513
|
+
return encoded, reconstructed
|
|
514
|
+
|
|
515
|
+
def _visualize_data(
    self, inputs: np.ndarray, reconstructed: np.ndarray, cmap: str, aspect: str
) -> None:
    """
    Show the original data and its reconstruction side by side as images.

    Parameters
    ----------
    inputs : `np.ndarray`
        The input data, drawn in the left panel.
    reconstructed : `np.ndarray`
        The reconstructed data, drawn in the right panel.
    cmap : `str`
        The colormap for visualization.
    aspect : `str`
        Aspect ratio for the visualization.

    Returns
    -------
    `None`
    """
    panels = (
        (inputs, "Original Data"),
        (reconstructed, "Decoder Layer Reconstruction"),
    )

    first_axis = None
    for position, (matrix, heading) in enumerate(panels, start=1):
        if first_axis is None:
            first_axis = plt.subplot(1, 2, position)
        else:
            # Later panels share both axes with the first one.
            plt.subplot(1, 2, position, sharex=first_axis, sharey=first_axis)
        plt.imshow(matrix, cmap=cmap, aspect=aspect)
        plt.colorbar()
        plt.title(heading)

    plt.show()
|
|
546
|
+
|
|
547
|
+
def _prepare_data_for_analysis(
    self,
    inputs: np.ndarray,
    reconstructed: np.ndarray,
    encoded: np.ndarray,
    y_labels: List[str],
) -> None:
    """
    Classify the samples and build the data frames used by later analyses.

    The classifier is fed the reconstruction concatenated with the encoding
    along axis 1; the arg-max of its output becomes `self.classification`.
    `self.data` holds the encoding and `self.data_input` the inputs, each with
    an extra `"class"` column.

    Parameters
    ----------
    inputs : `np.ndarray`
        The input data.
    reconstructed : `np.ndarray`
        The reconstructed data.
    encoded : `np.ndarray`
        The encoded data.
    y_labels : `List[str]`
        Column names for the input features; `None` falls back to
        `"Feature <i>"` names.

    Returns
    -------
    `None`
    """
    classifier_input = tf.concat([reconstructed, encoded], axis=1)
    scores = self.model.classifier(classifier_input).numpy()
    self.classification = scores.argmax(axis=1)

    latent_columns = [f"Feature {i}" for i in range(encoded.shape[1])]
    self.data = pd.DataFrame(encoded, columns=latent_columns)

    if y_labels is None:
        input_columns = [f"Feature {i}" for i in range(inputs.shape[1])]
    else:
        input_columns = y_labels
    self.data_input = pd.DataFrame(inputs, columns=input_columns)

    for frame in (self.data, self.data_input):
        frame["class"] = self.classification
|
|
588
|
+
|
|
589
|
+
def _get_tsne_repr(self, inputs: np.ndarray = None, frac: float = None) -> None:
    """
    Perform t-SNE dimensionality reduction on the input data.

    The inputs are multiplied by the cached encoder weight matrix
    (`inputs @ self.encoder_weights`) and the result is embedded in two
    dimensions. Results are stored in `self.latent_representations` and
    `self.reduced_data_tsne`.

    Parameters
    ----------
    inputs : `np.ndarray`
        The input data. Defaults to `self.inputs`. The array is copied, so the
        caller's data is never modified.
    frac : `float`
        Fraction of `self.inputs` rows to use. The random subset keeps the
        original row order and never exceeds the number of rows available.

    Returns
    -------
    `None`
    """
    source = self.inputs if inputs is None else inputs
    # Work on a copy so the NaN replacement below does not leak to the caller.
    inputs = source.copy()

    if frac:
        available = inputs.shape[0]
        n = min(int(frac * self.inputs.shape[0]), available)
        indexes = np.random.choice(np.arange(available), n, replace=False)
        # Keep rows in their original order: labels computed elsewhere (for
        # example `self.classification`) stay aligned when every row is kept.
        indexes = np.sort(indexes)
        inputs = inputs[indexes]

    inputs[np.isnan(inputs)] = 0.0
    self.latent_representations = inputs @ self.encoder_weights

    tsne = TSNE(n_components=2)
    self.reduced_data_tsne = tsne.fit_transform(self.latent_representations)
|
|
615
|
+
|
|
616
|
+
def _viz_tsne_repr(self, **kwargs) -> None:
    """
    Visualize the t-SNE representation of the latent space.

    Requires `self.reduced_data_tsne`, which `_get_tsne_repr` computes.

    Parameters
    ----------
    **kwargs : `dict`
        Additional keyword arguments for customization.
        `c` holds one class value per point; when given, points are colored by
        class and a colorbar is added. `colors` overrides the color names
        (default: one name from `self.sorted_names` per unique class).
        `self.colors` is set to the colors used, or `None` when `c` is absent.

    Returns
    -------
    `None`
    """
    c = kwargs.get("c", None)
    self.colors = (
        kwargs.get("colors", self.sorted_names[: len(np.unique(c))]) if c is not None else None
    )

    plt.scatter(
        self.reduced_data_tsne[:, 0],
        self.reduced_data_tsne[:, 1],
        cmap=matplotlib.colors.ListedColormap(self.colors) if c is not None else None,
        c=c,
    )

    if c is not None:
        cb = plt.colorbar()
        # One tick per color at i * max(c) / n. `linspace` always yields exactly
        # n ticks; `arange` with a float step fails when max(c) is 0 and can
        # produce an extra element through rounding.
        loc = np.linspace(0, max(c), len(self.colors), endpoint=False)
        cb.set_ticks(loc)
        cb.set_ticklabels(np.unique(c))

    plt.title("t-SNE Visualization of Latent Space")
    plt.xlabel("t-SNE 1")
    plt.ylabel("t-SNE 2")
    plt.show()
|
|
651
|
+
|
|
652
|
+
def _viz_radviz(self, data: pd.DataFrame, color_column: str, title: str) -> None:
    """
    Draw a RadViz plot of the data.

    Every column except the last is rescaled column-wise to the range
    [-1, 1]; columns that contain NaN after rescaling are dropped before
    plotting. The input frame itself is not modified.

    Parameters
    ----------
    data : `pd.DataFrame`
        The data to visualize.
    color_column : `str`
        The column to use for coloring.
    title : `str`
        The title of the plot.

    Returns
    -------
    `None`
    """
    scaled = data.copy(deep=True)

    features = scaled.iloc[:, :-1]
    lowest = features.min()
    span = features.max() - lowest
    scaled.iloc[:, :-1] = 2.0 * (features - lowest) / span - 1

    scaled.dropna(axis=1, inplace=True)

    radviz(scaled, color_column, color=self.colors)
    plt.title(title)
    plt.show()
|
|
680
|
+
|
|
681
|
+
def _viz_weights(
    self, cmap: str = "viridis", aspect: str = "auto", highlight: bool = True, **kwargs
) -> None:
    """
    Show the cached encoder weight matrix as an image.

    Parameters
    ----------
    cmap : `str`, optional
        The colormap for visualization (default is `"viridis"`).
    aspect : `str`, optional
        Aspect ratio for the visualization (default is `"auto"`).
    highlight : `bool`, optional
        Whether to overlay the largest weight of every row (default is `True`).
    **kwargs : `dict`, optional
        Additional keyword arguments for customization: `title` (plot title),
        `y_labels` (tick labels for the rows) and `cmap_highlight` (colormap
        of the overlay, default `"Pastel1"`).

    Returns
    -------
    `None`
    """
    weights = self.encoder_weights
    heading = kwargs.get("title", "Encoder Layer Weights (Dense Layer)")
    row_labels = kwargs.get("y_labels", None)
    overlay_cmap = kwargs.get("cmap_highlight", "Pastel1")

    plt.imshow(weights, cmap=cmap, aspect=aspect)
    plt.colorbar()
    plt.title(heading)
    if row_labels is not None:
        plt.yticks(ticks=np.arange(weights.shape[0]), labels=row_labels)

    if highlight:
        # Mark, for every row, the column holding that row's maximum weight.
        row_maxima = np.zeros_like(weights, dtype=bool)
        row_maxima[np.arange(weights.shape[0]), weights.argmax(axis=1)] = True
        overlay = np.ma.masked_where(~row_maxima, weights)
        plt.imshow(overlay, cmap=overlay_cmap, alpha=0.5, aspect=aspect)

    plt.show()
|
|
722
|
+
|
|
723
|
+
def _statistics(self, data_input: pd.DataFrame) -> pd.DataFrame:
|
|
724
|
+
"""
|
|
725
|
+
Compute statistical summaries of the input data.
|
|
726
|
+
|
|
727
|
+
Parameters
|
|
728
|
+
----------
|
|
729
|
+
data_input : `pd.DataFrame`
|
|
730
|
+
The data to compute statistics for.
|
|
731
|
+
|
|
732
|
+
Returns
|
|
733
|
+
-------
|
|
734
|
+
`pd.DataFrame` : The statistical summary of the input data.
|
|
735
|
+
"""
|
|
736
|
+
data = data_input.copy(deep=True)
|
|
737
|
+
|
|
738
|
+
if not pd.api.types.is_string_dtype(data["class"]):
|
|
739
|
+
data["class"] = data["class"].astype(str)
|
|
740
|
+
|
|
741
|
+
data.ffill(inplace=True)
|
|
742
|
+
grouped_data = data.groupby("class")
|
|
743
|
+
|
|
744
|
+
numerical_stats = grouped_data.agg(["mean", "min", "max", "std", "median"])
|
|
745
|
+
numerical_stats.columns = ["_".join(col).strip() for col in numerical_stats.columns.values]
|
|
746
|
+
|
|
747
|
+
def get_mode(x):
|
|
748
|
+
mode_series = x.mode()
|
|
749
|
+
return mode_series.iloc[0] if not mode_series.empty else None
|
|
750
|
+
|
|
751
|
+
mode_stats = grouped_data.apply(get_mode, include_groups=False)
|
|
752
|
+
mode_stats.columns = [f"{col}_mode" for col in mode_stats.columns]
|
|
753
|
+
combined_stats = pd.concat([numerical_stats, mode_stats], axis=1)
|
|
754
|
+
|
|
755
|
+
return combined_stats.T
|
|
756
|
+
|
|
757
|
+
|
|
758
|
+
########################################################################################
|
|
759
|
+
|
|
760
|
+
if __name__ == "__main__":
    # Example usage: train an AutoClassifier on the iris data set and inspect it.
    from sklearn.datasets import load_iris
    from sklearn.preprocessing import OneHotEncoder

    # Load the dataset
    iris = load_iris()

    # Convert to a DataFrame for easy exploration
    iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
    iris_df["species"] = iris.target

    X = iris_df.drop(columns="species")
    # Pass a plain list: the class documents `List[str]`, and a pandas Index
    # has no unambiguous truth value.
    y_labels = list(X.columns)
    X = X.values
    y = iris_df["species"].values

    X = np.asarray(X).astype(np.float32)

    encoder = OneHotEncoder()
    y = encoder.fit_transform(y.reshape(-1, 1)).toarray()
    y = np.asarray(y).astype(np.float32)

    model = AutoClassifier(
        input_shape_parm=X.shape[1],
        num_classes=3,
        units=27,
        activation="tanh",
        num_layers=2,
        dropout=0.2,
    )
    model.compile(
        optimizer="adam",
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=[tf.keras.metrics.F1Score(threshold=0.5)],
    )
    model.fit(X, y, epochs=50, validation_split=0.2)

    insights = GetInsights(model, X)
    summary = insights.predictor_analyzer(frac=1.0, y_labels=y_labels)
    insights._get_tsne_repr()
    insights._viz_tsne_repr()
    insights._viz_tsne_repr(c=iris_df["species"])
    insights._viz_weights()
    print(summary)
|