likelihood 1.2.23__py3-none-any.whl → 1.2.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
likelihood/graph/nn.py CHANGED
@@ -1,7 +1,9 @@
1
+ import logging
1
2
  import os
2
3
 
3
- os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
4
- import logging
4
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
5
+ logging.getLogger("tensorflow").setLevel(logging.ERROR)
6
+
5
7
  import warnings
6
8
  from typing import List, Tuple
7
9
 
@@ -9,19 +11,16 @@ import numpy as np
9
11
  import pandas as pd
10
12
  import tensorflow as tf
11
13
  from IPython.display import clear_output
12
- from numpy import ndarray
13
14
  from pandas.core.frame import DataFrame
14
15
  from sklearn.metrics import f1_score
15
16
  from sklearn.model_selection import train_test_split
16
17
 
17
18
  from likelihood.tools import generate_feature_yaml
18
19
 
19
- logging.getLogger("tensorflow").setLevel(logging.ERROR)
20
-
21
- tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
20
+ tf.get_logger().setLevel("ERROR")
22
21
 
23
22
 
24
- def compare_similarity(arr1: ndarray, arr2: ndarray) -> int:
23
+ def compare_similarity(arr1: np.ndarray, arr2: np.ndarray) -> int:
25
24
  """Compares the similarity between two arrays of categories.
26
25
 
27
26
  Parameters
@@ -44,9 +43,9 @@ def compare_similarity(arr1: ndarray, arr2: ndarray) -> int:
44
43
  return count
45
44
 
46
45
 
47
- def cal_adjency_matrix(
46
+ def cal_adjacency_matrix(
48
47
  df: DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
49
- ) -> Tuple[dict, ndarray]:
48
+ ) -> Tuple[dict, np.ndarray]:
50
49
  """Calculates the adjacency matrix for a given DataFrame.
51
50
  The adjacency matrix is a matrix that represents the similarity between each pair of categories.
52
51
  The similarity is calculated using the `compare_similarity` function.
@@ -133,7 +132,7 @@ class Data:
133
132
  target: str | None = None,
134
133
  exclude_subset: List[str] = [],
135
134
  ):
136
- _, adjacency = cal_adjency_matrix(df, exclude_subset=exclude_subset, sparse=True)
135
+ _, adjacency = cal_adjacency_matrix(df, exclude_subset=exclude_subset, sparse=True)
137
136
  if target is not None:
138
137
  X = df.drop(columns=[target] + exclude_subset)
139
138
  else:
@@ -1,19 +1,40 @@
1
1
  import logging
2
2
  import os
3
+ import random
3
4
  from functools import partial
4
5
  from shutil import rmtree
5
6
 
6
- import keras_tuner
7
+ import matplotlib
8
+ import matplotlib.colors as mcolors
9
+ import matplotlib.pyplot as plt
7
10
  import numpy as np
8
11
  import pandas as pd
12
+ from pandas.plotting import radviz
13
+
14
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
15
+ logging.getLogger("tensorflow").setLevel(logging.ERROR)
16
+
17
+ import warnings
18
+ from functools import wraps
19
+
20
+ import keras_tuner
9
21
  import tensorflow as tf
10
22
  from pandas.core.frame import DataFrame
23
+ from sklearn.manifold import TSNE
11
24
 
12
25
  from likelihood.tools import OneHotEncoder
13
26
 
14
- logging.getLogger("tensorflow").setLevel(logging.ERROR)
27
+ tf.get_logger().setLevel("ERROR")
28
+
29
+
30
+ def suppress_warnings(func):
31
+ @wraps(func)
32
+ def wrapper(*args, **kwargs):
33
+ with warnings.catch_warnings():
34
+ warnings.simplefilter("ignore")
35
+ return func(*args, **kwargs)
15
36
 
16
- tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
37
+ return wrapper
17
38
 
18
39
 
19
40
  @tf.keras.utils.register_keras_serializable(package="Custom", name="AutoClassifier")
@@ -35,7 +56,7 @@ class AutoClassifier(tf.keras.Model):
35
56
  from_config(cls, config): Recreates an instance of AutoClassifier from its configuration.
36
57
  """
37
58
 
38
- def __init__(self, input_shape_parm, num_classes, units, activation):
59
+ def __init__(self, input_shape_parm, num_classes, units, activation, **kwargs):
39
60
  """
40
61
  Initializes an AutoClassifier instance with the given parameters.
41
62
 
@@ -49,6 +70,15 @@ class AutoClassifier(tf.keras.Model):
49
70
  The number of neurons in each hidden layer.
50
71
  activation : `str`
51
72
  The type of activation function to use for the neural network layers.
73
+
74
+ Keyword Arguments:
75
+ ----------
76
+ Additional keyword arguments to pass to the model.
77
+
78
+ classifier_activation : `str`
79
+ The activation function to use for the classifier layer. Default is "softmax". If the activation function is not a classification function, the model can be used in regression problems.
80
+ num_layers : `int`
81
+ The number of hidden layers in the classifier. Default is 1.
52
82
  """
53
83
  super(AutoClassifier, self).__init__()
54
84
  self.input_shape_parm = input_shape_parm
@@ -59,6 +89,8 @@ class AutoClassifier(tf.keras.Model):
59
89
  self.encoder = None
60
90
  self.decoder = None
61
91
  self.classifier = None
92
+ self.classifier_activation = kwargs.get("classifier_activation", "softmax")
93
+ self.num_layers = kwargs.get("num_layers", 1)
62
94
 
63
95
  def build(self, input_shape):
64
96
  self.encoder = tf.keras.Sequential(
@@ -75,8 +107,14 @@ class AutoClassifier(tf.keras.Model):
75
107
  ]
76
108
  )
77
109
 
78
- self.classifier = tf.keras.Sequential(
79
- [tf.keras.layers.Dense(self.num_classes, activation="softmax")]
110
+ self.classifier = tf.keras.Sequential()
111
+ if self.num_layers > 1:
112
+ for _ in range(self.num_layers - 1):
113
+ self.classifier.add(
114
+ tf.keras.layers.Dense(units=self.units, activation=self.activation)
115
+ )
116
+ self.classifier.add(
117
+ tf.keras.layers.Dense(units=self.num_classes, activation=self.classifier_activation)
80
118
  )
81
119
 
82
120
  def call(self, x):
@@ -92,6 +130,8 @@ class AutoClassifier(tf.keras.Model):
92
130
  "num_classes": self.num_classes,
93
131
  "units": self.units,
94
132
  "activation": self.activation,
133
+ "classifier_activation": self.classifier_activation,
134
+ "num_layers": self.num_layers,
95
135
  }
96
136
  base_config = super(AutoClassifier, self).get_config()
97
137
  return dict(list(base_config.items()) + list(config.items()))
@@ -103,6 +143,8 @@ class AutoClassifier(tf.keras.Model):
103
143
  num_classes=config["num_classes"],
104
144
  units=config["units"],
105
145
  activation=config["activation"],
146
+ classifier_activation=config["classifier_activation"],
147
+ num_layers=config["num_layers"],
106
148
  )
107
149
 
108
150
 
@@ -113,6 +155,7 @@ def call_existing_code(
113
155
  optimizer: str,
114
156
  input_shape_parm: None | int = None,
115
157
  num_classes: None | int = None,
158
+ num_layers: int = 1,
116
159
  ) -> AutoClassifier:
117
160
  """
118
161
  Calls an existing AutoClassifier instance.
@@ -142,6 +185,7 @@ def call_existing_code(
142
185
  num_classes=num_classes,
143
186
  units=units,
144
187
  activation=activation,
188
+ num_layers=num_layers,
145
189
  )
146
190
  model.compile(
147
191
  optimizer=optimizer,
@@ -151,7 +195,9 @@ def call_existing_code(
151
195
  return model
152
196
 
153
197
 
154
- def build_model(hp, input_shape_parm: None | int, num_classes: None | int) -> AutoClassifier:
198
+ def build_model(
199
+ hp, input_shape_parm: None | int, num_classes: None | int, **kwargs
200
+ ) -> AutoClassifier:
155
201
  """Builds a neural network model using Keras Tuner's search algorithm.
156
202
 
157
203
  Parameters
@@ -163,17 +209,51 @@ def build_model(hp, input_shape_parm: None | int, num_classes: None | int) -> Au
163
209
  num_classes : `int`
164
210
  The number of classes in the dataset.
165
211
 
212
+ Keyword Arguments:
213
+ ----------
214
+ Additional keyword arguments to pass to the model.
215
+
216
+ hyperparameters : `dict`
217
+ The hyperparameters to set.
218
+
166
219
  Returns
167
220
  -------
168
221
  `keras.Model`
169
222
  The neural network model.
170
223
  """
171
- units = hp.Int(
172
- "units", min_value=int(input_shape_parm * 0.2), max_value=input_shape_parm, step=2
224
+ hyperparameters = kwargs.get("hyperparameters", None)
225
+ hyperparameters_keys = hyperparameters.keys() if hyperparameters is not None else []
226
+
227
+ units = (
228
+ hp.Int(
229
+ "units",
230
+ min_value=int(input_shape_parm * 0.2),
231
+ max_value=int(input_shape_parm * 1.5),
232
+ step=2,
233
+ )
234
+ if "units" not in hyperparameters_keys
235
+ else hyperparameters["units"]
236
+ )
237
+ activation = (
238
+ hp.Choice("activation", ["sigmoid", "relu", "tanh", "selu", "softplus", "softsign"])
239
+ if "activation" not in hyperparameters_keys
240
+ else hyperparameters["activation"]
241
+ )
242
+ optimizer = (
243
+ hp.Choice("optimizer", ["sgd", "adam", "adadelta", "rmsprop", "adamax", "adagrad"])
244
+ if "optimizer" not in hyperparameters_keys
245
+ else hyperparameters["optimizer"]
246
+ )
247
+ threshold = (
248
+ hp.Float("threshold", min_value=0.1, max_value=0.9, sampling="log")
249
+ if "threshold" not in hyperparameters_keys
250
+ else hyperparameters["threshold"]
251
+ )
252
+ num_layers = (
253
+ hp.Int("num_layers", min_value=1, max_value=10, step=1)
254
+ if "num_layers" not in hyperparameters_keys
255
+ else hyperparameters["num_layers"]
173
256
  )
174
- activation = hp.Choice("activation", ["sigmoid", "relu", "tanh", "selu", "softplus"])
175
- optimizer = hp.Choice("optimizer", ["sgd", "adam", "adadelta"])
176
- threshold = hp.Float("threshold", min_value=0.1, max_value=0.9, sampling="log")
177
257
 
178
258
  model = call_existing_code(
179
259
  units=units,
@@ -182,10 +262,12 @@ def build_model(hp, input_shape_parm: None | int, num_classes: None | int) -> Au
182
262
  optimizer=optimizer,
183
263
  input_shape_parm=input_shape_parm,
184
264
  num_classes=num_classes,
265
+ num_layers=num_layers,
185
266
  )
186
267
  return model
187
268
 
188
269
 
270
+ @suppress_warnings
189
271
  def setup_model(
190
272
  data: DataFrame,
191
273
  target: str,
@@ -194,6 +276,7 @@ def setup_model(
194
276
  seed=None,
195
277
  train_mode: bool = True,
196
278
  filepath: str = "./my_dir/best_model",
279
+ method: str = "Hyperband",
197
280
  **kwargs,
198
281
  ) -> AutoClassifier:
199
282
  """Setup model for training and tuning.
@@ -214,6 +297,8 @@ def setup_model(
214
297
  Whether to train the model or not.
215
298
  filepath : `str`
216
299
  The path to save the best model to.
300
+ method : `str`
301
+ The method to use for hyperparameter tuning. Options are "Hyperband" and "RandomSearch".
217
302
 
218
303
  Keyword Arguments:
219
304
  ----------
@@ -229,30 +314,30 @@ def setup_model(
229
314
  The objective to optimize.
230
315
  verbose : `bool`
231
316
  Whether to print verbose output.
317
+ hyperparameters : `dict`
318
+ The hyperparameters to set.
232
319
 
233
320
  Returns
234
321
  -------
235
322
  model : `AutoClassifier`
236
323
  The trained model.
237
324
  """
238
- max_trials = kwargs["max_trials"] if "max_trials" in kwargs else 10
239
- directory = kwargs["directory"] if "directory" in kwargs else "./my_dir"
240
- project_name = kwargs["project_name"] if "project_name" in kwargs else "get_best"
241
- objective = kwargs["objective"] if "objective" in kwargs else "val_loss"
242
- verbose = kwargs["verbose"] if "verbose" in kwargs else True
325
+ max_trials = kwargs.get("max_trials", 10)
326
+ directory = kwargs.get("directory", "./my_dir")
327
+ project_name = kwargs.get("project_name", "get_best")
328
+ objective = kwargs.get("objective", "val_loss")
329
+ verbose = kwargs.get("verbose", True)
330
+ hyperparameters = kwargs.get("hyperparameters", None)
243
331
 
244
332
  X = data.drop(columns=target)
245
333
  input_sample = X.sample(1)
246
334
  y = data[target]
247
- # Verify if there are categorical columns in the dataframe
248
335
  assert (
249
336
  X.select_dtypes(include=["object"]).empty == True
250
337
  ), "Categorical variables within the DataFrame must be encoded, this is done by using the DataFrameEncoder from likelihood."
251
338
  validation_split = 1.0 - train_size
252
- # Create my_dir path if it does not exist
253
339
 
254
340
  if train_mode:
255
- # Create a new directory if it does not exist
256
341
  try:
257
342
  if (not os.path.exists(directory)) and directory != "./":
258
343
  os.makedirs(directory)
@@ -263,7 +348,6 @@ def setup_model(
263
348
  except:
264
349
  print("Warning: unable to create directory")
265
350
 
266
- # Create a Classifier instance
267
351
  y_encoder = OneHotEncoder()
268
352
  y = y_encoder.encode(y.to_list())
269
353
  X = X.to_numpy()
@@ -276,34 +360,239 @@ def setup_model(
276
360
  num_classes = y.shape[1]
277
361
  global build_model
278
362
  build_model = partial(
279
- build_model, input_shape_parm=input_shape_parm, num_classes=num_classes
363
+ build_model,
364
+ input_shape_parm=input_shape_parm,
365
+ num_classes=num_classes,
366
+ hyperparameters=hyperparameters,
280
367
  )
281
368
 
282
- # Create the AutoKeras model
283
- tuner = keras_tuner.RandomSearch(
284
- hypermodel=build_model,
285
- objective=objective,
286
- max_trials=max_trials,
287
- directory=directory,
288
- project_name=project_name,
289
- seed=seed,
290
- )
291
-
292
- tuner.search(X, y, epochs=epochs, validation_split=validation_split)
369
+ if method == "Hyperband":
370
+ tuner = keras_tuner.Hyperband(
371
+ hypermodel=build_model,
372
+ objective=objective,
373
+ max_epochs=epochs,
374
+ factor=3,
375
+ directory=directory,
376
+ project_name=project_name,
377
+ seed=seed,
378
+ )
379
+ elif method == "RandomSearch":
380
+ tuner = keras_tuner.RandomSearch(
381
+ hypermodel=build_model,
382
+ objective=objective,
383
+ max_trials=max_trials,
384
+ directory=directory,
385
+ project_name=project_name,
386
+ seed=seed,
387
+ )
388
+
389
+ tuner.search(X, y, epochs=epochs, validation_split=validation_split, verbose=verbose)
293
390
  models = tuner.get_best_models(num_models=2)
294
391
  best_model = models[0]
295
392
  best_model(input_sample)
296
393
 
297
- # save model
298
394
  best_model.save(filepath, save_format="tf")
299
395
 
300
396
  if verbose:
301
397
  tuner.results_summary()
302
398
  else:
303
- # Load the best model from the directory
304
399
  best_model = tf.keras.models.load_model(filepath)
305
400
 
306
- return best_model
401
+ best_hps = tuner.get_best_hyperparameters(1)[0].values
402
+ return best_model, pd.DataFrame(best_hps, index=["Value"])
403
+
404
+
405
+ class GetInsights:
406
+ def __init__(self, model: AutoClassifier, inputs: np.ndarray) -> None:
407
+ self.inputs = inputs
408
+ self.model = model
409
+ self.encoder_layer = self.model.encoder.layers[0]
410
+ self.decoder_layer = self.model.decoder.layers[0]
411
+ self.classifier_layer = self.model.classifier.layers[-2]
412
+ self.encoder_weights = self.encoder_layer.get_weights()[0]
413
+ self.decoder_weights = self.decoder_layer.get_weights()[0]
414
+ self.classifier_weights = self.classifier_layer.get_weights()[0]
415
+ colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)
416
+
417
+ by_hsv = sorted(
418
+ (tuple(mcolors.rgb_to_hsv(mcolors.to_rgba(color)[:3])), name)
419
+ for name, color in colors.items()
420
+ )
421
+ self.sorted_names = [name for hsv, name in by_hsv if hsv[1] > 0.4 and hsv[2] >= 0.4]
422
+ random.shuffle(self.sorted_names)
423
+
424
+ def predictor_analyzer(
425
+ self,
426
+ frac=None,
427
+ cmap: str = "viridis",
428
+ aspect: str = "auto",
429
+ highlight: bool = True,
430
+ **kwargs,
431
+ ) -> None:
432
+ self._viz_weights(cmap=cmap, aspect=aspect, highlight=highlight, **kwargs)
433
+ inputs = self.inputs.copy()
434
+ y_labels = kwargs.get("y_labels", None)
435
+ if frac:
436
+ n = int(frac * self.inputs.shape[0])
437
+ indexes = np.random.choice(np.arange(inputs.shape[0]), n, replace=False)
438
+ inputs = inputs[indexes]
439
+ inputs[np.isnan(inputs)] = 0.0
440
+ encoded = self.model.encoder(inputs)
441
+ reconstructed = self.model.decoder(encoded)
442
+ combined = tf.concat([reconstructed, encoded], axis=1)
443
+ self.classification = self.model.classifier(combined).numpy().argmax(axis=1)
444
+ ax = plt.subplot(1, 2, 1)
445
+ plt.imshow(self.inputs, cmap=cmap, aspect=aspect)
446
+ plt.colorbar()
447
+ plt.title("Original Data")
448
+ plt.subplot(1, 2, 2, sharex=ax, sharey=ax)
449
+ plt.imshow(reconstructed, cmap=cmap, aspect=aspect)
450
+ plt.colorbar()
451
+ plt.title("Decoder Layer Reconstruction")
452
+ plt.show()
453
+
454
+ self._get_tsne_repr(inputs=inputs, frac=frac)
455
+ self._viz_tsne_repr(c=self.classification)
456
+
457
+ self.data = pd.DataFrame(encoded, columns=[f"Feature {i}" for i in range(encoded.shape[1])])
458
+ self.data_input = pd.DataFrame(
459
+ inputs,
460
+ columns=(
461
+ [f"Feature {i}" for i in range(inputs.shape[1])] if y_labels is None else y_labels
462
+ ),
463
+ )
464
+ self.data["class"] = self.classification
465
+ self.data_input["class"] = self.classification
466
+ radviz(self.data, "class", color=self.colors)
467
+ plt.title("Radviz Visualization of Latent Space")
468
+ plt.show()
469
+
470
+ radviz(self.data_input, "class", color=self.colors)
471
+ plt.title("Radviz Visualization of Input Data")
472
+ plt.show()
473
+ return self._statistics(self.data_input)
474
+
475
+ def _statistics(self, data_input: DataFrame, **kwargs) -> DataFrame:
476
+ data = data_input.copy(deep=True)
477
+
478
+ if not pd.api.types.is_string_dtype(data["class"]):
479
+ data["class"] = data["class"].astype(str)
480
+
481
+ data.ffill(inplace=True)
482
+ grouped_data = data.groupby("class")
483
+
484
+ numerical_stats = grouped_data.agg(["mean", "min", "max", "std", "median"])
485
+ numerical_stats.columns = ["_".join(col).strip() for col in numerical_stats.columns.values]
486
+
487
+ def get_mode(x):
488
+ mode_series = x.mode()
489
+ return mode_series.iloc[0] if not mode_series.empty else None
490
+
491
+ mode_stats = grouped_data.apply(get_mode, include_groups=False)
492
+ mode_stats.columns = [f"{col}_mode" for col in mode_stats.columns]
493
+ combined_stats = pd.concat([numerical_stats, mode_stats], axis=1)
494
+
495
+ return combined_stats.T
496
+
497
+ def _viz_weights(
498
+ self, cmap: str = "viridis", aspect: str = "auto", highlight: bool = True, **kwargs
499
+ ) -> None:
500
+ title = kwargs.get("title", "Encoder Layer Weights (Dense Layer)")
501
+ y_labels = kwargs.get("y_labels", None)
502
+ cmap_highlight = kwargs.get("cmap_highlight", "Pastel1")
503
+ highlight_mask = np.zeros_like(self.encoder_weights, dtype=bool)
504
+
505
+ plt.imshow(self.encoder_weights, cmap=cmap, aspect=aspect)
506
+ plt.colorbar()
507
+ plt.title(title)
508
+ if y_labels is not None:
509
+ plt.yticks(ticks=np.arange(self.encoder_weights.shape[0]), labels=y_labels)
510
+ if highlight:
511
+ for i, j in enumerate(self.encoder_weights.argmax(axis=1)):
512
+ highlight_mask[i, j] = True
513
+ plt.imshow(
514
+ np.ma.masked_where(~highlight_mask, self.encoder_weights),
515
+ cmap=cmap_highlight,
516
+ alpha=0.5,
517
+ aspect=aspect,
518
+ )
519
+ plt.show()
520
+
521
+ def _get_tsne_repr(self, inputs=None, frac=None) -> None:
522
+ if inputs is None:
523
+ inputs = self.inputs.copy()
524
+ if frac:
525
+ n = int(frac * self.inputs.shape[0])
526
+ indexes = np.random.choice(np.arange(inputs.shape[0]), n, replace=False)
527
+ inputs = inputs[indexes]
528
+ inputs[np.isnan(inputs)] = 0.0
529
+ self.latent_representations = inputs @ self.encoder_weights
530
+
531
+ tsne = TSNE(n_components=2)
532
+ self.reduced_data_tsne = tsne.fit_transform(self.latent_representations)
533
+
534
+ def _viz_tsne_repr(self, **kwargs) -> None:
535
+ c = kwargs.get("c", None)
536
+ self.colors = (
537
+ kwargs.get("colors", self.sorted_names[: len(np.unique(c))]) if c is not None else None
538
+ )
539
+ plt.scatter(
540
+ self.reduced_data_tsne[:, 0],
541
+ self.reduced_data_tsne[:, 1],
542
+ cmap=matplotlib.colors.ListedColormap(self.colors) if c is not None else None,
543
+ c=c,
544
+ )
545
+ if c is not None:
546
+ cb = plt.colorbar()
547
+ loc = np.arange(0, max(c), max(c) / float(len(self.colors)))
548
+ cb.set_ticks(loc)
549
+ cb.set_ticklabels(np.unique(c))
550
+ plt.title("t-SNE Visualization of Latent Space")
551
+ plt.xlabel("t-SNE 1")
552
+ plt.ylabel("t-SNE 2")
553
+ plt.show()
307
554
 
308
555
 
309
556
  ########################################################################################
557
+
558
+ if __name__ == "__main__":
559
+ # Example usage
560
+ import pandas as pd
561
+ from sklearn.datasets import load_iris
562
+ from sklearn.preprocessing import OneHotEncoder
563
+
564
+ # Load the dataset
565
+ iris = load_iris()
566
+
567
+ # Convert to a DataFrame for easy exploration
568
+ iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
569
+ iris_df["species"] = iris.target
570
+
571
+ X = iris_df.drop(columns="species")
572
+ y_labels = X.columns
573
+ X = X.values
574
+ y = iris_df["species"].values
575
+
576
+ X = np.asarray(X).astype(np.float32)
577
+
578
+ encoder = OneHotEncoder()
579
+ y = encoder.fit_transform(y.reshape(-1, 1)).toarray()
580
+ y = np.asarray(y).astype(np.float32)
581
+
582
+ model = AutoClassifier(
583
+ input_shape_parm=X.shape[1], num_classes=3, units=27, activation="selu", num_layers=2
584
+ )
585
+ model.compile(
586
+ optimizer="adam",
587
+ loss=tf.keras.losses.CategoricalCrossentropy(),
588
+ metrics=[tf.keras.metrics.F1Score(threshold=0.5)],
589
+ )
590
+ model.fit(X, y, epochs=50, validation_split=0.2)
591
+
592
+ insights = GetInsights(model, X)
593
+ summary = insights.predictor_analyzer(frac=1.0, y_labels=y_labels)
594
+ insights._get_tsne_repr()
595
+ insights._viz_tsne_repr()
596
+ insights._viz_tsne_repr(c=iris_df["species"])
597
+ insights._viz_weights()
598
+ print(summary)
@@ -0,0 +1,163 @@
1
+ import logging
2
+ import os
3
+ import pickle
4
+ from typing import List, Tuple
5
+
6
+ import numpy as np
7
+ from IPython.display import clear_output
8
+
9
+
10
+ class HMM:
11
+ def __init__(self, n_states: int, n_observations: int):
12
+ self.n_states = n_states
13
+ self.n_observations = n_observations
14
+
15
+ # Initialize parameters with random values
16
+ self.pi = np.random.dirichlet(np.ones(n_states), size=1)[0]
17
+ self.A = np.random.dirichlet(np.ones(n_states), size=n_states)
18
+ self.B = np.random.dirichlet(np.ones(n_observations), size=n_states)
19
+
20
+ def save_model(self, filename: str = "./hmm") -> None:
21
+ filename = filename if filename.endswith(".pkl") else filename + ".pkl"
22
+ with open(filename, "wb") as f:
23
+ pickle.dump(self, f)
24
+
25
+ @staticmethod
26
+ def load_model(filename: str = "./hmm") -> "HMM":
27
+ filename = filename + ".pkl" if not filename.endswith(".pkl") else filename
28
+ with open(filename, "rb") as f:
29
+ return pickle.load(f)
30
+
31
+ def forward(self, sequence: List[int]) -> np.ndarray:
32
+ T = len(sequence)
33
+ alpha = np.zeros((T, self.n_states))
34
+
35
+ # Add a small constant (smoothing) to avoid log(0)
36
+ epsilon = 1e-10 # Small value to avoid taking log(0)
37
+
38
+ # Initialization (log-space)
39
+ alpha[0] = np.log(self.pi + epsilon) + np.log(self.B[:, sequence[0]] + epsilon)
40
+ alpha[0] -= np.log(np.sum(np.exp(alpha[0]))) # Normalization (log-space)
41
+
42
+ # Recursion (log-space)
43
+ for t in range(1, T):
44
+ for i in range(self.n_states):
45
+ alpha[t, i] = np.log(
46
+ np.sum(np.exp(alpha[t - 1] + np.log(self.A[:, i] + epsilon)))
47
+ ) + np.log(self.B[i, sequence[t]] + epsilon)
48
+ alpha[t] -= np.log(np.sum(np.exp(alpha[t]))) # Normalization
49
+
50
+ return alpha
51
+
52
+ def backward(self, sequence: List[int]) -> np.ndarray:
53
+ T = len(sequence)
54
+ beta = np.ones((T, self.n_states))
55
+
56
+ # Backward recursion
57
+ for t in range(T - 2, -1, -1):
58
+ for i in range(self.n_states):
59
+ beta[t, i] = np.sum(self.A[i] * self.B[:, sequence[t + 1]] * beta[t + 1])
60
+
61
+ return beta
62
+
63
+ def viterbi(self, sequence: List[int]) -> np.ndarray:
64
+ T = len(sequence)
65
+ delta = np.zeros((T, self.n_states))
66
+ psi = np.zeros((T, self.n_states), dtype=int)
67
+
68
+ # Initialization
69
+ delta[0] = self.pi * self.B[:, sequence[0]]
70
+
71
+ # Recursion
72
+ for t in range(1, T):
73
+ for i in range(self.n_states):
74
+ delta[t, i] = np.max(delta[t - 1] * self.A[:, i]) * self.B[i, sequence[t]]
75
+ psi[t, i] = np.argmax(delta[t - 1] * self.A[:, i])
76
+
77
+ # Reconstruct the most probable path
78
+ state_sequence = np.zeros(T, dtype=int)
79
+ state_sequence[T - 1] = np.argmax(delta[T - 1])
80
+ for t in range(T - 2, -1, -1):
81
+ state_sequence[t] = psi[t + 1, state_sequence[t + 1]]
82
+
83
+ return state_sequence
84
+
85
+ def baum_welch(
86
+ self, sequences: List[List[int]], n_iterations: int, verbose: bool = False
87
+ ) -> None:
88
+ for iteration in range(n_iterations):
89
+ # Initialize accumulators
90
+ A_num = np.zeros((self.n_states, self.n_states))
91
+ B_num = np.zeros((self.n_states, self.n_observations))
92
+ pi_num = np.zeros(self.n_states)
93
+
94
+ for sequence in sequences:
95
+ T = len(sequence)
96
+ alpha = self.forward(sequence)
97
+ beta = self.backward(sequence)
98
+
99
+ # Update pi
100
+ gamma = (alpha * beta) / np.sum(alpha * beta, axis=1, keepdims=True)
101
+ pi_num += gamma[0]
102
+
103
+ # Update A and B
104
+ for t in range(T - 1):
105
+ xi = np.zeros((self.n_states, self.n_states))
106
+ denom = np.sum(alpha[t] * self.A * self.B[:, sequence[t + 1]] * beta[t + 1])
107
+
108
+ for i in range(self.n_states):
109
+ for j in range(self.n_states):
110
+ xi[i, j] = (
111
+ alpha[t, i]
112
+ * self.A[i, j]
113
+ * self.B[j, sequence[t + 1]]
114
+ * beta[t + 1, j]
115
+ ) / denom
116
+ A_num[i] += xi[i]
117
+
118
+ B_num[:, sequence[t]] += gamma[t]
119
+
120
+ # For the last step of the sequence
121
+ B_num[:, sequence[-1]] += gamma[-1]
122
+
123
+ # Normalize and update parameters
124
+ self.pi = pi_num / len(sequences)
125
+ self.A = A_num / np.sum(A_num, axis=1, keepdims=True)
126
+ self.B = B_num / np.sum(B_num, axis=1, keepdims=True)
127
+
128
+ # Logging parameters every 10 iterations
129
+ if iteration % 10 == 0 and verbose:
130
+ os.system("cls" if os.name == "nt" else "clear")
131
+ clear_output(wait=True)
132
+ logging.info(f"Iteration {iteration}:")
133
+ logging.info("Pi: %s", self.pi)
134
+ logging.info("A:\n%s", self.A)
135
+ logging.info("B:\n%s", self.B)
136
+
137
+ def decoding_accuracy(self, sequences: List[List[int]], true_states: List[List[int]]) -> float:
138
+ correct_predictions = 0
139
+ total_predictions = 0
140
+
141
+ for sequence, true_state in zip(sequences, true_states):
142
+ predicted_states = self.viterbi(sequence)
143
+ correct_predictions += np.sum(predicted_states == true_state)
144
+ total_predictions += len(sequence)
145
+
146
+ accuracy = (correct_predictions / total_predictions) * 100
147
+ return accuracy
148
+
149
+ def state_probabilities(self, sequence: List[int]) -> np.ndarray:
150
+ """
151
+ Returns the smoothed probabilities of the hidden states at each time step.
152
+ This is done by using both forward and backward probabilities.
153
+ """
154
+ alpha = self.forward(sequence)
155
+ beta = self.backward(sequence)
156
+
157
+ # Compute smoothed probabilities (gamma)
158
+ smoothed_probs = (alpha * beta) / np.sum(alpha * beta, axis=1, keepdims=True)
159
+
160
+ return smoothed_probs
161
+
162
+ def sequence_probability(self, sequence: List[int]) -> np.ndarray:
163
+ return self.state_probabilities(sequence)[-1]
@@ -5,7 +5,6 @@ from typing import List, Tuple, Union
5
5
  import matplotlib.pyplot as plt
6
6
  import numpy as np
7
7
  import pandas as pd
8
- from numpy import ndarray
9
8
  from pandas.core.frame import DataFrame
10
9
 
11
10
  from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, cdf, check_nan_inf
@@ -66,12 +65,12 @@ class SimulationEngine(FeatureSelection):
66
65
 
67
66
  super().__init__(**kwargs)
68
67
 
69
- def predict(self, df: DataFrame, column: str) -> ndarray | list:
68
+ def predict(self, df: DataFrame, column: str) -> np.ndarray | list:
70
69
  # Let us assign the dictionary entries corresponding to the column
71
70
  w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]
72
71
 
73
72
  df = df[names_cols].copy()
74
- # Change the scale of the dataframe
73
+ # Change the scale of the DataFrame
75
74
  dataset = self.df.copy()
76
75
  dataset.drop(columns=column, inplace=True)
77
76
  numeric_df = dataset.select_dtypes(include="number")
@@ -85,7 +84,7 @@ class SimulationEngine(FeatureSelection):
85
84
  for col in numeric_df.columns:
86
85
  df[col] = numeric_df[col].values
87
86
 
88
- # Encoding the datadrame
87
+ # Encoding the DataFrame
89
88
  for num, colname in enumerate(dfe._encode_columns):
90
89
  if df[colname].dtype == "object":
91
90
  encode_dict = dfe.encoding_list[num]
@@ -93,7 +92,7 @@ class SimulationEngine(FeatureSelection):
93
92
  dfe._code_transformation_to, dictionary_list=encode_dict
94
93
  )
95
94
 
96
- # PREDICTION
95
+ # Prediction
97
96
  y = df.to_numpy() @ w
98
97
 
99
98
  # Categorical column
@@ -113,7 +112,7 @@ class SimulationEngine(FeatureSelection):
113
112
 
114
113
  return y[:]
115
114
 
116
- def _encode(self, df: DataFrame) -> ndarray | list:
115
+ def _encode(self, df: DataFrame) -> np.ndarray | list:
117
116
  df = df.copy()
118
117
  column = df.columns[0]
119
118
  frec = df[column].value_counts() / len(df)
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: likelihood
3
- Version: 1.2.23
3
+ Version: 1.2.25
4
4
  Summary: A package that performs the maximum likelihood algorithm.
5
5
  Home-page: https://github.com/jzsmoreno/likelihood/
6
6
  Author: J. A. Moreno-Guerra
@@ -13,7 +13,7 @@ Classifier: Operating System :: OS Independent
13
13
  Requires-Python: >=3.10
14
14
  Description-Content-Type: text/markdown
15
15
  License-File: LICENSE
16
- Requires-Dist: black[jupyter]==24.1.1
16
+ Requires-Dist: black[jupyter]>=24.3.0
17
17
  Requires-Dist: mypy-extensions==1.0.0
18
18
  Requires-Dist: types-openpyxl==3.1.0.15
19
19
  Requires-Dist: pydocstyle==6.3.0
@@ -31,6 +31,18 @@ Requires-Dist: pyvis; extra == "full"
31
31
  Requires-Dist: tensorflow==2.15.0; extra == "full"
32
32
  Requires-Dist: keras-tuner; extra == "full"
33
33
  Requires-Dist: scikit-learn; extra == "full"
34
+ Dynamic: author
35
+ Dynamic: author-email
36
+ Dynamic: classifier
37
+ Dynamic: description
38
+ Dynamic: description-content-type
39
+ Dynamic: home-page
40
+ Dynamic: maintainer
41
+ Dynamic: maintainer-email
42
+ Dynamic: provides-extra
43
+ Dynamic: requires-dist
44
+ Dynamic: requires-python
45
+ Dynamic: summary
34
46
 
35
47
  ![likelihood](https://raw.githubusercontent.com/RodolfoFerro/likelihood/main/likelihood.png)
36
48
 
@@ -2,18 +2,19 @@ likelihood/__init__.py,sha256=5C0hapdsk85XZhN_rssRAEFpkRRuKNtj6cyRbqD2_gM,994
2
2
  likelihood/main.py,sha256=fcCkGOOWKjfvw2tLVqjuKPV8t0rVCIT9FlbYcOv4EYo,7974
3
3
  likelihood/graph/__init__.py,sha256=6TuFDfmXTwpLyHl7_KqBfdzW6zqHjGzIFvymjFPlvjI,21
4
4
  likelihood/graph/graph.py,sha256=hGWCznxaRQ8BfY2aLjrvwriZkAIsz5ydKXF4x_7b0EQ,3359
5
- likelihood/graph/nn.py,sha256=jBgb2SMUwM5OBatkIxH2I-_hH1ok5aw2fwXq5a1VAEg,12306
5
+ likelihood/graph/nn.py,sha256=WuK66hRTN5hdVIArgfSweqtE098tb6QFd2ZMFaHvnZA,12263
6
6
  likelihood/models/__init__.py,sha256=e6nB4w47w0Q9DrAFeP3OcUgcoHOtf7Il4mBhgf4AARg,52
7
+ likelihood/models/hmm.py,sha256=0s0gFySH1u4NjRaZDxiZ8oeTaFhFrw1x0GJxwy3dFrA,6253
7
8
  likelihood/models/regression.py,sha256=9cakyGlJCEO6WfpoKLh3GxdXQeQp7cUvJIkQ5odT0TA,9404
8
- likelihood/models/simulation.py,sha256=Y4RXkeYHmQCve-EpEYVmzh6tm5pkJa_Pbx0iYJmptU8,8852
9
+ likelihood/models/simulation.py,sha256=L_9Mihcca7i_AnvWWrZilFV8VEhz_Z8fDLepmwBGSi8,8832
9
10
  likelihood/models/utils.py,sha256=VtEj07lV-GRoWraQgpfjU0jTt1Ntf9MXgYwe6XYQh20,1552
10
11
  likelihood/models/deep/__init__.py,sha256=-KIPippVaMqgG8mEgYjNxYQdqOUcFhUuKhbVe8TTCfo,28
11
- likelihood/models/deep/autoencoders.py,sha256=2P--nS96XwMi44q0OIxvIp6Mdbt-B4LqwCSXTn2jYrY,10070
12
+ likelihood/models/deep/autoencoders.py,sha256=seE1rb1t1gbbKRyEzfi01BqMsV4MU6yakVTLcukAMkg,20591
12
13
  likelihood/tools/__init__.py,sha256=MCjsCWfBNKE2uMN0VizDN1uFzZ_md0X2WZeBdWhrCR8,50
13
14
  likelihood/tools/numeric_tools.py,sha256=FA44kbiAcxcquz1el_g3Pqsp5ii8XFkAIrsMs5bGkj0,11445
14
15
  likelihood/tools/tools.py,sha256=iZBC7IHTFpAyxooyel7ZFi-5-G0nCotNLLtxenPw9T8,44303
15
- likelihood-1.2.23.dist-info/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
16
- likelihood-1.2.23.dist-info/METADATA,sha256=sdJRNVLSm5SNwfQkolcusGvkFnlf_dNcMzeRmb4JUyQ,2504
17
- likelihood-1.2.23.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
18
- likelihood-1.2.23.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
19
- likelihood-1.2.23.dist-info/RECORD,,
16
+ likelihood-1.2.25.dist-info/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
17
+ likelihood-1.2.25.dist-info/METADATA,sha256=hUsmkghXP8m4z3FtWcM64gwBEW74HIOTNJifK26OOkw,2771
18
+ likelihood-1.2.25.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
19
+ likelihood-1.2.25.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
20
+ likelihood-1.2.25.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.6.0)
2
+ Generator: setuptools (75.8.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5