likelihood 1.2.23__tar.gz → 1.2.25__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {likelihood-1.2.23 → likelihood-1.2.25}/PKG-INFO +15 -3
- {likelihood-1.2.23 → likelihood-1.2.25}/likelihood/graph/nn.py +9 -10
- likelihood-1.2.25/likelihood/models/deep/autoencoders.py +598 -0
- likelihood-1.2.25/likelihood/models/hmm.py +163 -0
- {likelihood-1.2.23 → likelihood-1.2.25}/likelihood/models/simulation.py +5 -6
- {likelihood-1.2.23 → likelihood-1.2.25}/likelihood.egg-info/PKG-INFO +15 -3
- {likelihood-1.2.23 → likelihood-1.2.25}/likelihood.egg-info/SOURCES.txt +1 -0
- {likelihood-1.2.23 → likelihood-1.2.25}/likelihood.egg-info/requires.txt +1 -1
- likelihood-1.2.23/likelihood/models/deep/autoencoders.py +0 -309
- {likelihood-1.2.23 → likelihood-1.2.25}/LICENSE +0 -0
- {likelihood-1.2.23 → likelihood-1.2.25}/README.md +0 -0
- {likelihood-1.2.23 → likelihood-1.2.25}/likelihood/__init__.py +0 -0
- {likelihood-1.2.23 → likelihood-1.2.25}/likelihood/graph/__init__.py +0 -0
- {likelihood-1.2.23 → likelihood-1.2.25}/likelihood/graph/graph.py +0 -0
- {likelihood-1.2.23 → likelihood-1.2.25}/likelihood/main.py +0 -0
- {likelihood-1.2.23 → likelihood-1.2.25}/likelihood/models/__init__.py +0 -0
- {likelihood-1.2.23 → likelihood-1.2.25}/likelihood/models/deep/__init__.py +0 -0
- {likelihood-1.2.23 → likelihood-1.2.25}/likelihood/models/regression.py +0 -0
- {likelihood-1.2.23 → likelihood-1.2.25}/likelihood/models/utils.py +0 -0
- {likelihood-1.2.23 → likelihood-1.2.25}/likelihood/tools/__init__.py +0 -0
- {likelihood-1.2.23 → likelihood-1.2.25}/likelihood/tools/numeric_tools.py +0 -0
- {likelihood-1.2.23 → likelihood-1.2.25}/likelihood/tools/tools.py +0 -0
- {likelihood-1.2.23 → likelihood-1.2.25}/likelihood.egg-info/dependency_links.txt +0 -0
- {likelihood-1.2.23 → likelihood-1.2.25}/likelihood.egg-info/top_level.txt +0 -0
- {likelihood-1.2.23 → likelihood-1.2.25}/setup.cfg +0 -0
- {likelihood-1.2.23 → likelihood-1.2.25}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: likelihood
|
|
3
|
-
Version: 1.2.23
|
|
3
|
+
Version: 1.2.25
|
|
4
4
|
Summary: A package that performs the maximum likelihood algorithm.
|
|
5
5
|
Home-page: https://github.com/jzsmoreno/likelihood/
|
|
6
6
|
Author: J. A. Moreno-Guerra
|
|
@@ -13,7 +13,7 @@ Classifier: Operating System :: OS Independent
|
|
|
13
13
|
Requires-Python: >=3.10
|
|
14
14
|
Description-Content-Type: text/markdown
|
|
15
15
|
License-File: LICENSE
|
|
16
|
-
Requires-Dist: black[jupyter]
|
|
16
|
+
Requires-Dist: black[jupyter]>=24.3.0
|
|
17
17
|
Requires-Dist: mypy-extensions==1.0.0
|
|
18
18
|
Requires-Dist: types-openpyxl==3.1.0.15
|
|
19
19
|
Requires-Dist: pydocstyle==6.3.0
|
|
@@ -31,6 +31,18 @@ Requires-Dist: pyvis; extra == "full"
|
|
|
31
31
|
Requires-Dist: tensorflow==2.15.0; extra == "full"
|
|
32
32
|
Requires-Dist: keras-tuner; extra == "full"
|
|
33
33
|
Requires-Dist: scikit-learn; extra == "full"
|
|
34
|
+
Dynamic: author
|
|
35
|
+
Dynamic: author-email
|
|
36
|
+
Dynamic: classifier
|
|
37
|
+
Dynamic: description
|
|
38
|
+
Dynamic: description-content-type
|
|
39
|
+
Dynamic: home-page
|
|
40
|
+
Dynamic: maintainer
|
|
41
|
+
Dynamic: maintainer-email
|
|
42
|
+
Dynamic: provides-extra
|
|
43
|
+
Dynamic: requires-dist
|
|
44
|
+
Dynamic: requires-python
|
|
45
|
+
Dynamic: summary
|
|
34
46
|
|
|
35
47
|

|
|
36
48
|
|
|
@@ -1,7 +1,9 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
import os
|
|
2
3
|
|
|
3
|
-
os.environ["
|
|
4
|
-
|
|
4
|
+
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
|
|
5
|
+
logging.getLogger("tensorflow").setLevel(logging.ERROR)
|
|
6
|
+
|
|
5
7
|
import warnings
|
|
6
8
|
from typing import List, Tuple
|
|
7
9
|
|
|
@@ -9,19 +11,16 @@ import numpy as np
|
|
|
9
11
|
import pandas as pd
|
|
10
12
|
import tensorflow as tf
|
|
11
13
|
from IPython.display import clear_output
|
|
12
|
-
from numpy import ndarray
|
|
13
14
|
from pandas.core.frame import DataFrame
|
|
14
15
|
from sklearn.metrics import f1_score
|
|
15
16
|
from sklearn.model_selection import train_test_split
|
|
16
17
|
|
|
17
18
|
from likelihood.tools import generate_feature_yaml
|
|
18
19
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
|
|
20
|
+
tf.get_logger().setLevel("ERROR")
|
|
22
21
|
|
|
23
22
|
|
|
24
|
-
def compare_similarity(arr1: ndarray, arr2: ndarray) -> int:
|
|
23
|
+
def compare_similarity(arr1: np.ndarray, arr2: np.ndarray) -> int:
|
|
25
24
|
"""Compares the similarity between two arrays of categories.
|
|
26
25
|
|
|
27
26
|
Parameters
|
|
@@ -44,9 +43,9 @@ def compare_similarity(arr1: ndarray, arr2: ndarray) -> int:
|
|
|
44
43
|
return count
|
|
45
44
|
|
|
46
45
|
|
|
47
|
-
def
|
|
46
|
+
def cal_adjacency_matrix(
|
|
48
47
|
df: DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
|
|
49
|
-
) -> Tuple[dict, ndarray]:
|
|
48
|
+
) -> Tuple[dict, np.ndarray]:
|
|
50
49
|
"""Calculates the adjacency matrix for a given DataFrame.
|
|
51
50
|
The adjacency matrix is a matrix that represents the similarity between each pair of categories.
|
|
52
51
|
The similarity is calculated using the `compare_similarity` function.
|
|
@@ -133,7 +132,7 @@ class Data:
|
|
|
133
132
|
target: str | None = None,
|
|
134
133
|
exclude_subset: List[str] = [],
|
|
135
134
|
):
|
|
136
|
-
_, adjacency =
|
|
135
|
+
_, adjacency = cal_adjacency_matrix(df, exclude_subset=exclude_subset, sparse=True)
|
|
137
136
|
if target is not None:
|
|
138
137
|
X = df.drop(columns=[target] + exclude_subset)
|
|
139
138
|
else:
|
|
@@ -0,0 +1,598 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import random
|
|
4
|
+
from functools import partial
|
|
5
|
+
from shutil import rmtree
|
|
6
|
+
|
|
7
|
+
import matplotlib
|
|
8
|
+
import matplotlib.colors as mcolors
|
|
9
|
+
import matplotlib.pyplot as plt
|
|
10
|
+
import numpy as np
|
|
11
|
+
import pandas as pd
|
|
12
|
+
from pandas.plotting import radviz
|
|
13
|
+
|
|
14
|
+
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
|
|
15
|
+
logging.getLogger("tensorflow").setLevel(logging.ERROR)
|
|
16
|
+
|
|
17
|
+
import warnings
|
|
18
|
+
from functools import wraps
|
|
19
|
+
|
|
20
|
+
import keras_tuner
|
|
21
|
+
import tensorflow as tf
|
|
22
|
+
from pandas.core.frame import DataFrame
|
|
23
|
+
from sklearn.manifold import TSNE
|
|
24
|
+
|
|
25
|
+
from likelihood.tools import OneHotEncoder
|
|
26
|
+
|
|
27
|
+
tf.get_logger().setLevel("ERROR")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def suppress_warnings(func):
    """Decorator that silences every Python warning raised while ``func`` runs.

    Parameters
    ----------
    func : `callable`
        The function to wrap.

    Returns
    -------
    `callable`
        A wrapper with the same metadata as ``func`` (via `functools.wraps`) that
        calls ``func`` inside a `warnings.catch_warnings()` context whose filter is
        set to ``"ignore"``. The previous warning filters are restored afterwards.
    """

    def _quiet_call(*args, **kwargs):
        with warnings.catch_warnings():
            # The filter change is local to this context manager.
            warnings.simplefilter("ignore")
            outcome = func(*args, **kwargs)
        return outcome

    return wraps(func)(_quiet_call)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@tf.keras.utils.register_keras_serializable(package="Custom", name="AutoClassifier")
class AutoClassifier(tf.keras.Model):
    """
    Classifier built on top of a small dense autoencoder.

    The input is encoded, the encoding is decoded again, and the decoder output
    concatenated with the encoder output is fed to a dense classifier head.

    Attributes:
        - input_shape_parm: The shape of the input data.
        - num_classes: The number of classes in the dataset.
        - units: The number of neurons in each hidden layer.
        - activation: The type of activation function to use for the neural network layers.
        - classifier_activation: The activation of the last classifier layer.
        - num_layers: The number of layers in the classifier head.

    Methods:
        __init__(self, input_shape_parm, num_classes, units, activation, **kwargs): Initializes an AutoClassifier instance with the given parameters.
        build(self, input_shape): Creates the encoder, decoder and classifier sub-models.
        call(self, x): Defines the forward pass of the model.
        get_config(self): Returns the configuration of the model.
        from_config(cls, config): Recreates an instance of AutoClassifier from its configuration.
    """

    def __init__(self, input_shape_parm, num_classes, units, activation, **kwargs):
        """
        Initializes an AutoClassifier instance with the given parameters.

        Parameters
        ----------
        input_shape_parm : `int`
            The shape of the input data.
        num_classes : `int`
            The number of classes in the dataset.
        units : `int`
            The number of neurons in each hidden layer.
        activation : `str`
            The type of activation function to use for the neural network layers.

        Keyword Arguments:
        ----------
        Additional keyword arguments to pass to the model.

        classifier_activation : `str`
            The activation function to use for the classifier layer. Default is "softmax". If the activation function is not a classification function, the model can be used in regression problems.
        num_layers : `int`
            The number of hidden layers in the classifier. Default is 1.
        """
        super().__init__()
        self.input_shape_parm = input_shape_parm
        self.num_classes = num_classes
        self.units = units
        self.activation = activation

        # The sub-models are created lazily in `build`.
        self.encoder = None
        self.decoder = None
        self.classifier = None
        self.classifier_activation = kwargs.get("classifier_activation", "softmax")
        self.num_layers = kwargs.get("num_layers", 1)

    def build(self, input_shape):
        """Create the encoder, decoder and classifier sub-models.

        Parameters
        ----------
        input_shape : `Any`
            Shape passed by Keras; it is not used, the layer widths come from the
            constructor arguments.
        """
        # Half-width bottleneck. Clamp to 1 so that `units == 1` does not ask for
        # a zero-width Dense layer.
        bottleneck_units = max(1, int(self.units / 2))

        self.encoder = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(units=self.units, activation=self.activation),
                tf.keras.layers.Dense(units=bottleneck_units, activation=self.activation),
            ]
        )

        self.decoder = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(units=self.units, activation=self.activation),
                tf.keras.layers.Dense(units=self.input_shape_parm, activation=self.activation),
            ]
        )

        # `num_layers - 1` hidden layers followed by the output layer.
        self.classifier = tf.keras.Sequential()
        for _ in range(self.num_layers - 1):
            self.classifier.add(
                tf.keras.layers.Dense(units=self.units, activation=self.activation)
            )
        self.classifier.add(
            tf.keras.layers.Dense(units=self.num_classes, activation=self.classifier_activation)
        )

    def call(self, x):
        """Run the forward pass and return the classifier output for ``x``."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        # The classifier sees the reconstruction and the latent code side by side.
        combined = tf.concat([decoded, encoded], axis=1)
        return self.classifier(combined)

    def get_config(self):
        """Return the base Keras config extended with the constructor arguments."""
        config = {
            "input_shape_parm": self.input_shape_parm,
            "num_classes": self.num_classes,
            "units": self.units,
            "activation": self.activation,
            "classifier_activation": self.classifier_activation,
            "num_layers": self.num_layers,
        }
        base_config = super().get_config()
        return {**base_config, **config}

    @classmethod
    def from_config(cls, config):
        """Recreate an AutoClassifier from a dictionary produced by `get_config`.

        The optional entries fall back to the constructor defaults, so a config
        that lacks ``classifier_activation`` or ``num_layers`` can still be loaded.
        """
        return cls(
            input_shape_parm=config["input_shape_parm"],
            num_classes=config["num_classes"],
            units=config["units"],
            activation=config["activation"],
            classifier_activation=config.get("classifier_activation", "softmax"),
            num_layers=config.get("num_layers", 1),
        )
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def call_existing_code(
    units: int,
    activation: str,
    threshold: float,
    optimizer: str,
    input_shape_parm: None | int = None,
    num_classes: None | int = None,
    num_layers: int = 1,
) -> AutoClassifier:
    """
    Creates and compiles an AutoClassifier.

    Parameters
    ----------
    units : `int`
        The number of neurons in each hidden layer.
    activation : `str`
        The type of activation function to use for the neural network layers.
    threshold : `float`
        The threshold passed to the F1 score metric.
    optimizer : `str`
        The optimizer the model is compiled with.
    input_shape_parm : `None` | `int`
        The shape of the input data.
    num_classes : `int`
        The number of classes in the dataset.
    num_layers : `int`
        The number of layers in the classifier head.

    Returns
    -------
    `AutoClassifier`
        The compiled model, using categorical cross-entropy as loss and an
        F1 score as metric.
    """
    classifier = AutoClassifier(
        input_shape_parm,
        num_classes,
        units,
        activation,
        num_layers=num_layers,
    )
    loss_fn = tf.keras.losses.CategoricalCrossentropy()
    f1_metric = tf.keras.metrics.F1Score(threshold=threshold)
    classifier.compile(optimizer=optimizer, loss=loss_fn, metrics=[f1_metric])
    return classifier
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def build_model(
    hp, input_shape_parm: None | int, num_classes: None | int, **kwargs
) -> AutoClassifier:
    """Builds a neural network model using Keras Tuner's search algorithm.

    Every hyperparameter that is present in ``hyperparameters`` is taken from
    there; the remaining ones are registered on ``hp`` so the tuner searches them.

    Parameters
    ----------
    hp : `keras_tuner.HyperParameters`
        The hyperparameters to tune.
    input_shape_parm : `None` | `int`
        The shape of the input data. It must be an integer whenever ``units`` is
        not fixed, because the search range of ``units`` is derived from it.
    num_classes : `int`
        The number of classes in the dataset.

    Keyword Arguments:
    ----------
    Additional keyword arguments to pass to the model.

    hyperparameters : `dict`
        The hyperparameters to set. Recognised keys are "units", "activation",
        "optimizer", "threshold" and "num_layers".

    Returns
    -------
    `keras.Model`
        The neural network model.
    """
    fixed = kwargs.get("hyperparameters", None)
    if fixed is None:
        fixed = {}

    if "units" in fixed:
        units = fixed["units"]
    else:
        # int(input_shape_parm * 0.2) is 0 for fewer than five features, which would
        # let the tuner propose zero-width layers; keep the range at 1 or above.
        min_units = max(1, int(input_shape_parm * 0.2))
        max_units = max(min_units, int(input_shape_parm * 1.5))
        units = hp.Int("units", min_value=min_units, max_value=max_units, step=2)

    if "activation" in fixed:
        activation = fixed["activation"]
    else:
        activation = hp.Choice(
            "activation", ["sigmoid", "relu", "tanh", "selu", "softplus", "softsign"]
        )

    if "optimizer" in fixed:
        optimizer = fixed["optimizer"]
    else:
        optimizer = hp.Choice(
            "optimizer", ["sgd", "adam", "adadelta", "rmsprop", "adamax", "adagrad"]
        )

    if "threshold" in fixed:
        threshold = fixed["threshold"]
    else:
        threshold = hp.Float("threshold", min_value=0.1, max_value=0.9, sampling="log")

    if "num_layers" in fixed:
        num_layers = fixed["num_layers"]
    else:
        num_layers = hp.Int("num_layers", min_value=1, max_value=10, step=1)

    return call_existing_code(
        units=units,
        activation=activation,
        threshold=threshold,
        optimizer=optimizer,
        input_shape_parm=input_shape_parm,
        num_classes=num_classes,
        num_layers=num_layers,
    )
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
@suppress_warnings
def setup_model(
    data: DataFrame,
    target: str,
    epochs: int,
    train_size: float = 0.7,
    seed=None,
    train_mode: bool = True,
    filepath: str = "./my_dir/best_model",
    method: str = "Hyperband",
    **kwargs,
) -> tuple[AutoClassifier, DataFrame]:
    """Setup model for training and tuning.

    Parameters
    ----------
    data : `DataFrame`
        The dataset to train the model on.
    target : `str`
        The name of the target column.
    epochs : `int`
        The number of epochs to train the model for.
    train_size : `float`
        The proportion of the dataset to use for training.
    seed : `Any` | `int`
        The random seed to use for reproducibility.
    train_mode : `bool`
        Whether to train the model or not. When `False` the model stored at
        `filepath` is loaded instead.
    filepath : `str`
        The path to save the best model to.
    method : `str`
        The method to use for hyperparameter tuning. Options are "Hyperband" and "RandomSearch".

    Keyword Arguments:
    ----------
    Additional keyword arguments to pass to the model.

    max_trials : `int`
        The maximum number of trials to perform.
    directory : `str`
        The directory to save the model to. If it already exists (and is not
        "./") it is deleted before the search starts.
    project_name : `str`
        The name of the project.
    objective : `str`
        The objective to optimize.
    verbose : `bool`
        Whether to print verbose output.
    hyperparameters : `dict`
        The hyperparameters to set.

    Returns
    -------
    model : `AutoClassifier`
        The trained (or loaded) model.
    best_hps : `DataFrame`
        One-row frame (index "Value") with the best hyperparameters found by the
        tuner. When `train_mode` is `False` no tuner exists, so the frame holds
        the hyperparameters that can be read from the loaded model itself.

    Raises
    ------
    ValueError
        If `train_mode` is `True` and `method` is not a supported tuner name.
    """
    max_trials = kwargs.get("max_trials", 10)
    directory = kwargs.get("directory", "./my_dir")
    project_name = kwargs.get("project_name", "get_best")
    objective = kwargs.get("objective", "val_loss")
    verbose = kwargs.get("verbose", True)
    hyperparameters = kwargs.get("hyperparameters", None)

    X = data.drop(columns=target)
    y = data[target]
    assert X.select_dtypes(
        include=["object"]
    ).empty, "Categorical variables within the DataFrame must be encoded, this is done by using the DataFrameEncoder from likelihood."

    if not train_mode:
        best_model = tf.keras.models.load_model(filepath)
        # There is no tuner in this mode; report what the model itself exposes.
        reported = ("units", "activation", "num_layers", "classifier_activation")
        best_hps = {
            name: getattr(best_model, name) for name in reported if hasattr(best_model, name)
        }
        return best_model, pd.DataFrame(best_hps, index=["Value"])

    # Validate before touching the file system so a typo does not wipe `directory`.
    if method not in ("Hyperband", "RandomSearch"):
        raise ValueError(
            f'Unknown tuning method {method!r}; options are "Hyperband" and "RandomSearch".'
        )

    try:
        if directory != "./":
            if os.path.exists(directory):
                print(f"Directory {directory} already exists, it will be deleted.")
                rmtree(directory)
            os.makedirs(directory)
    except OSError:
        # Best effort: the tuner may still be able to create what it needs.
        print("Warning: unable to create directory")

    validation_split = 1.0 - train_size
    # A single row is enough to build the weights of the restored model below.
    input_sample = np.asarray(X.sample(1)).astype(np.float32)

    y_encoder = OneHotEncoder()
    y = np.asarray(y_encoder.encode(y.to_list())).astype(np.float32)
    X = np.asarray(X.to_numpy()).astype(np.float32)

    input_shape_parm = X.shape[1]
    num_classes = y.shape[1]

    # Bind the data-dependent arguments locally instead of rebinding the
    # module-level `build_model`, so repeated calls do not stack partials.
    hypermodel = partial(
        build_model,
        input_shape_parm=input_shape_parm,
        num_classes=num_classes,
        hyperparameters=hyperparameters,
    )

    if method == "Hyperband":
        tuner = keras_tuner.Hyperband(
            hypermodel=hypermodel,
            objective=objective,
            max_epochs=epochs,
            factor=3,
            directory=directory,
            project_name=project_name,
            seed=seed,
        )
    else:
        tuner = keras_tuner.RandomSearch(
            hypermodel=hypermodel,
            objective=objective,
            max_trials=max_trials,
            directory=directory,
            project_name=project_name,
            seed=seed,
        )

    tuner.search(X, y, epochs=epochs, validation_split=validation_split, verbose=verbose)
    best_model = tuner.get_best_models(num_models=1)[0]
    # Call the model once so it is built before being saved.
    best_model(input_sample)

    best_model.save(filepath, save_format="tf")

    if verbose:
        tuner.results_summary()

    best_hps = tuner.get_best_hyperparameters(1)[0].values
    return best_model, pd.DataFrame(best_hps, index=["Value"])
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
class GetInsights:
    """Visual and statistical diagnostics for a built `AutoClassifier`.

    The instance keeps the kernel of the first encoder layer, the first decoder
    layer and the penultimate classifier layer, and offers plots of the encoder
    weights, the reconstruction, a t-SNE projection and radviz charts.
    """

    def __init__(self, model: "AutoClassifier", inputs: np.ndarray) -> None:
        """
        Parameters
        ----------
        model : `AutoClassifier`
            A model whose `encoder`, `decoder` and `classifier` are already built.
        inputs : `np.ndarray`
            The data to analyse, one sample per row.
        """
        self.inputs = inputs
        self.model = model
        self.encoder_layer = self.model.encoder.layers[0]
        self.decoder_layer = self.model.decoder.layers[0]
        self.classifier_layer = self._penultimate_layer(self.model.classifier.layers)
        # get_weights()[0] is the kernel; the bias (index 1) is not used here.
        self.encoder_weights = self.encoder_layer.get_weights()[0]
        self.decoder_weights = self.decoder_layer.get_weights()[0]
        self.classifier_weights = self.classifier_layer.get_weights()[0]
        # Set by `_viz_tsne_repr`; defined here so the attribute always exists.
        self.colors = None

        colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)
        by_hsv = sorted(
            (tuple(mcolors.rgb_to_hsv(mcolors.to_rgba(color)[:3])), name)
            for name, color in colors.items()
        )
        # Keep only reasonably saturated and bright colours, in random order.
        self.sorted_names = [name for hsv, name in by_hsv if hsv[1] > 0.4 and hsv[2] >= 0.4]
        random.shuffle(self.sorted_names)

    @staticmethod
    def _penultimate_layer(layers):
        """Return the second-to-last layer, or the only layer when there is just one."""
        return layers[-2] if len(layers) > 1 else layers[-1]

    @staticmethod
    def _colorbar_ticks(c, n_ticks: int) -> np.ndarray:
        """Return `n_ticks` evenly spaced tick positions in ``[0, max(c))``."""
        return np.linspace(0, np.max(c), num=n_ticks, endpoint=False)

    def _prepare_inputs(self, inputs=None, frac=None) -> np.ndarray:
        """Return a cleaned copy of the data to analyse.

        Uses `self.inputs` when `inputs` is `None`, optionally keeps a random
        fraction `frac` of the rows (without replacement) and replaces NaN by 0.
        Neither `self.inputs` nor the array passed in is modified.
        """
        data = np.array(self.inputs if inputs is None else inputs, copy=True)
        if frac:
            n = int(frac * data.shape[0])
            rows = np.random.choice(np.arange(data.shape[0]), n, replace=False)
            data = data[rows]
        data[np.isnan(data)] = 0.0
        return data

    def predictor_analyzer(
        self,
        frac=None,
        cmap: str = "viridis",
        aspect: str = "auto",
        highlight: bool = True,
        **kwargs,
    ) -> DataFrame:
        """Run the full set of plots and return per-class statistics.

        Parameters
        ----------
        frac : `float` | `None`
            Fraction of the rows of `self.inputs` to analyse; all rows when falsy.
        cmap : `str`
            Colormap of the image plots.
        aspect : `str`
            Aspect passed to `plt.imshow`.
        highlight : `bool`
            Whether to highlight the largest weight of each row in the weight plot.

        Keyword Arguments:
        ----------
        y_labels : `list`
            Names of the input features, used as tick labels and column names.
        title, cmap_highlight
            Forwarded to `_viz_weights`.

        Returns
        -------
        `DataFrame`
            The output of `_statistics` for the analysed inputs.
        """
        self._viz_weights(cmap=cmap, aspect=aspect, highlight=highlight, **kwargs)
        y_labels = kwargs.get("y_labels", None)

        inputs = self._prepare_inputs(frac=frac)
        encoded = self.model.encoder(inputs)
        reconstructed = self.model.decoder(encoded)
        combined = tf.concat([reconstructed, encoded], axis=1)
        self.classification = self.model.classifier(combined).numpy().argmax(axis=1)

        # Both panels show the same rows: the ones that were fed to the model.
        ax = plt.subplot(1, 2, 1)
        plt.imshow(inputs, cmap=cmap, aspect=aspect)
        plt.colorbar()
        plt.title("Original Data")
        plt.subplot(1, 2, 2, sharex=ax, sharey=ax)
        plt.imshow(np.asarray(reconstructed), cmap=cmap, aspect=aspect)
        plt.colorbar()
        plt.title("Decoder Layer Reconstruction")
        plt.show()

        # `inputs` is already sampled; sampling again would reorder the rows and
        # break the correspondence with `self.classification`.
        self._get_tsne_repr(inputs=inputs)
        self._viz_tsne_repr(c=self.classification)

        self.data = pd.DataFrame(
            np.asarray(encoded), columns=[f"Feature {i}" for i in range(encoded.shape[1])]
        )
        self.data_input = pd.DataFrame(
            inputs,
            columns=(
                [f"Feature {i}" for i in range(inputs.shape[1])] if y_labels is None else y_labels
            ),
        )
        self.data["class"] = self.classification
        self.data_input["class"] = self.classification
        radviz(self.data, "class", color=self.colors)
        plt.title("Radviz Visualization of Latent Space")
        plt.show()

        radviz(self.data_input, "class", color=self.colors)
        plt.title("Radviz Visualization of Input Data")
        plt.show()
        return self._statistics(self.data_input)

    def _statistics(self, data_input: DataFrame, **kwargs) -> DataFrame:
        """Summarise every column of `data_input` per value of its "class" column.

        Returns a transposed frame whose rows are ``<column>_<stat>`` for the
        statistics mean, min, max, std, median and mode, and whose columns are
        the classes (as strings).
        """
        data = data_input.copy(deep=True)

        if not pd.api.types.is_string_dtype(data["class"]):
            data["class"] = data["class"].astype(str)

        data.ffill(inplace=True)
        grouped_data = data.groupby("class")

        numerical_stats = grouped_data.agg(["mean", "min", "max", "std", "median"])
        numerical_stats.columns = ["_".join(col).strip() for col in numerical_stats.columns.values]

        def get_mode(x):
            # `mode()` may return several rows; keep the first one.
            mode_series = x.mode()
            return mode_series.iloc[0] if not mode_series.empty else None

        mode_stats = grouped_data.apply(get_mode, include_groups=False)
        mode_stats.columns = [f"{col}_mode" for col in mode_stats.columns]
        combined_stats = pd.concat([numerical_stats, mode_stats], axis=1)

        return combined_stats.T

    def _viz_weights(
        self, cmap: str = "viridis", aspect: str = "auto", highlight: bool = True, **kwargs
    ) -> None:
        """Plot the kernel of the first encoder layer as an image.

        Keyword arguments: `title` (plot title), `y_labels` (row tick labels) and
        `cmap_highlight` (colormap of the highlight overlay).
        """
        title = kwargs.get("title", "Encoder Layer Weights (Dense Layer)")
        y_labels = kwargs.get("y_labels", None)
        cmap_highlight = kwargs.get("cmap_highlight", "Pastel1")

        plt.imshow(self.encoder_weights, cmap=cmap, aspect=aspect)
        plt.colorbar()
        plt.title(title)
        if y_labels is not None:
            plt.yticks(ticks=np.arange(self.encoder_weights.shape[0]), labels=y_labels)
        if highlight:
            # Mark the largest weight of every row and overlay only those cells.
            highlight_mask = np.zeros_like(self.encoder_weights, dtype=bool)
            rows = np.arange(self.encoder_weights.shape[0])
            highlight_mask[rows, self.encoder_weights.argmax(axis=1)] = True
            plt.imshow(
                np.ma.masked_where(~highlight_mask, self.encoder_weights),
                cmap=cmap_highlight,
                alpha=0.5,
                aspect=aspect,
            )
        plt.show()

    def _get_tsne_repr(self, inputs=None, frac=None) -> None:
        """Project the inputs with the encoder kernel and embed them with t-SNE.

        The latent representation is ``inputs @ encoder_weights`` (kernel only).
        Results are stored in `self.latent_representations` and
        `self.reduced_data_tsne`.

        Parameters
        ----------
        inputs : `np.ndarray` | `None`
            Data to embed; `self.inputs` when `None`.
        frac : `float` | `None`
            Fraction of the rows of the chosen data to keep (random sample).
        """
        data = self._prepare_inputs(inputs, frac)
        self.latent_representations = data @ self.encoder_weights

        tsne = TSNE(n_components=2)
        self.reduced_data_tsne = tsne.fit_transform(self.latent_representations)

    def _viz_tsne_repr(self, **kwargs) -> None:
        """Scatter-plot the t-SNE embedding computed by `_get_tsne_repr`.

        Keyword arguments: `c` (one class label per point; enables colouring and
        a colorbar) and `colors` (list of colour names, one per class).
        """
        c = kwargs.get("c", None)
        if c is None:
            self.colors = None
            classes = None
        else:
            classes = np.unique(c)
            self.colors = kwargs.get("colors", self.sorted_names[: len(classes)])
        plt.scatter(
            self.reduced_data_tsne[:, 0],
            self.reduced_data_tsne[:, 1],
            cmap=matplotlib.colors.ListedColormap(self.colors) if c is not None else None,
            c=c,
        )
        if c is not None:
            cb = plt.colorbar()
            # One tick per class label, so ticks and labels always have equal length
            # (also when every point belongs to class 0).
            cb.set_ticks(self._colorbar_ticks(c, len(classes)))
            cb.set_ticklabels(classes)
        plt.title("t-SNE Visualization of Latent Space")
        plt.xlabel("t-SNE 1")
        plt.ylabel("t-SNE 2")
        plt.show()
|
|
554
|
+
|
|
555
|
+
|
|
556
|
+
########################################################################################
|
|
557
|
+
|
|
558
|
+
if __name__ == "__main__":
    # Demo: fit an AutoClassifier on the Iris data and inspect it with GetInsights.
    import pandas as pd
    from sklearn.datasets import load_iris
    from sklearn.preprocessing import OneHotEncoder

    # Put the dataset in a DataFrame with the label in a "species" column.
    iris = load_iris()
    iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
    iris_df["species"] = iris.target

    # Feature matrix as float32; the column names are kept for the plots.
    features = iris_df.drop(columns="species")
    y_labels = features.columns
    X = np.asarray(features.values).astype(np.float32)

    # One-hot encode the integer labels, also as float32.
    raw_labels = iris_df["species"].values
    encoder = OneHotEncoder()
    one_hot = encoder.fit_transform(raw_labels.reshape(-1, 1)).toarray()
    y = np.asarray(one_hot).astype(np.float32)

    model = AutoClassifier(
        input_shape_parm=X.shape[1],
        num_classes=3,
        units=27,
        activation="selu",
        num_layers=2,
    )
    model.compile(
        optimizer="adam",
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=[tf.keras.metrics.F1Score(threshold=0.5)],
    )
    model.fit(X, y, epochs=50, validation_split=0.2)

    # Full report first, then the individual plots.
    insights = GetInsights(model, X)
    summary = insights.predictor_analyzer(frac=1.0, y_labels=y_labels)
    insights._get_tsne_repr()
    insights._viz_tsne_repr()
    insights._viz_tsne_repr(c=iris_df["species"])
    insights._viz_weights()
    print(summary)
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import pickle
|
|
4
|
+
from typing import List, Tuple
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
from IPython.display import clear_output
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class HMM:
    """
    Hidden Markov model over a finite alphabet of integer-coded observations.

    Attributes
    ----------
    n_states : `int`
        Number of hidden states.
    n_observations : `int`
        Number of distinct observation symbols (symbols are ``0 .. n_observations - 1``).
    pi : `np.ndarray`
        Initial state distribution, shape ``(n_states,)``.
    A : `np.ndarray`
        Transition matrix, ``A[i, j]`` is the probability of moving from state ``i`` to ``j``.
    B : `np.ndarray`
        Emission matrix, ``B[i, k]`` is the probability of state ``i`` emitting symbol ``k``.
    """

    # Smoothing constant added to probabilities before logarithms / normalization
    # so that zeros never produce log(0) or 0/0.
    _EPSILON = 1e-10

    def __init__(self, n_states: int, n_observations: int):
        """
        Create a model with randomly initialized parameters.

        Parameters
        ----------
        n_states : `int`
            Number of hidden states.
        n_observations : `int`
            Number of distinct observation symbols.
        """
        self.n_states = n_states
        self.n_observations = n_observations

        # Every row is an independent draw from a flat Dirichlet, so each row sums to one.
        self.pi = np.random.dirichlet(np.ones(n_states), size=1)[0]
        self.A = np.random.dirichlet(np.ones(n_states), size=n_states)
        self.B = np.random.dirichlet(np.ones(n_observations), size=n_states)

    def save_model(self, filename: str = "./hmm") -> None:
        """
        Pickle the whole model to disk.

        Parameters
        ----------
        filename : `str`
            Destination path; ``.pkl`` is appended when it is missing.
        """
        filename = filename if filename.endswith(".pkl") else filename + ".pkl"
        with open(filename, "wb") as f:
            pickle.dump(self, f)

    @staticmethod
    def load_model(filename: str = "./hmm") -> "HMM":
        """
        Load a model previously written by `save_model`.

        Parameters
        ----------
        filename : `str`
            Source path; ``.pkl`` is appended when it is missing.

        Returns
        -------
        `HMM`
            The unpickled object.
        """
        filename = filename + ".pkl" if not filename.endswith(".pkl") else filename
        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # Only load files that come from a trusted source.
        with open(filename, "rb") as f:
            return pickle.load(f)

    @staticmethod
    def _log_normalize(log_weights: np.ndarray) -> np.ndarray:
        """Shift log-weights so that their exponentials sum to one (stable log-sum-exp)."""
        shifted = log_weights - np.max(log_weights)
        return shifted - np.log(np.sum(np.exp(shifted)))

    @staticmethod
    def _normalize_rows(counts: np.ndarray, fallback: np.ndarray) -> np.ndarray:
        """Normalize each row of `counts` to sum to one; rows without mass keep `fallback`."""
        totals = np.sum(counts, axis=1, keepdims=True)
        has_mass = totals > 0
        return np.where(has_mass, counts / np.where(has_mass, totals, 1.0), fallback)

    def forward(self, sequence: List[int]) -> np.ndarray:
        """
        Run the normalized forward pass in log-space.

        Parameters
        ----------
        sequence : `List[int]`
            Observation symbols.

        Returns
        -------
        `np.ndarray`
            Array of shape ``(T, n_states)``. Row ``t`` holds the LOGARITHM of the
            filtered state distribution at step ``t`` (``np.exp`` of a row sums to one).
            An empty sequence yields an empty ``(0, n_states)`` array.
        """
        T = len(sequence)
        alpha = np.zeros((T, self.n_states))
        if T == 0:
            return alpha

        # Smoothing avoids log(0) for impossible transitions / emissions.
        log_A = np.log(self.A + self._EPSILON)
        log_B = np.log(self.B + self._EPSILON)

        # Initialization (log-space)
        alpha[0] = self._log_normalize(np.log(self.pi + self._EPSILON) + log_B[:, sequence[0]])

        # Recursion (log-space), vectorized over the target state
        for t in range(1, T):
            scores = alpha[t - 1][:, None] + log_A  # scores[j, i]: come from j, land in i
            peak = np.max(scores, axis=0)
            predicted = peak + np.log(np.sum(np.exp(scores - peak), axis=0))
            alpha[t] = self._log_normalize(predicted + log_B[:, sequence[t]])

        return alpha

    def backward(self, sequence: List[int]) -> np.ndarray:
        """
        Run the unscaled backward pass in linear space.

        Parameters
        ----------
        sequence : `List[int]`
            Observation symbols.

        Returns
        -------
        `np.ndarray`
            Array of shape ``(T, n_states)`` whose last row is all ones. The values are
            raw probabilities and shrink towards zero as the sequence gets longer.
        """
        T = len(sequence)
        beta = np.ones((T, self.n_states))

        # Backward recursion
        for t in range(T - 2, -1, -1):
            beta[t] = self.A @ (self.B[:, sequence[t + 1]] * beta[t + 1])

        return beta

    def _scaled_backward(self, sequence: List[int]) -> np.ndarray:
        """
        Backward pass with every row rescaled to sum to one.

        Uses the same smoothed parameters as `forward`; the per-row rescaling keeps the
        values away from floating-point underflow and cancels out once posteriors are
        normalized.
        """
        T = len(sequence)
        beta = np.ones((T, self.n_states))
        smooth_A = self.A + self._EPSILON
        smooth_B = self.B + self._EPSILON

        for t in range(T - 2, -1, -1):
            beta[t] = smooth_A @ (smooth_B[:, sequence[t + 1]] * beta[t + 1])
            beta[t] /= np.sum(beta[t])

        return beta

    def _posteriors(self, sequence: List[int]) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Return (filtered alpha, scaled beta, smoothed gamma), all in linear space."""
        # forward() works in log-space, so it must be exponentiated before it is
        # combined with the linear-space backward messages.
        alpha = np.exp(self.forward(sequence))
        beta = self._scaled_backward(sequence)
        gamma = alpha * beta
        gamma = gamma / np.sum(gamma, axis=1, keepdims=True)
        return alpha, beta, gamma

    def viterbi(self, sequence: List[int]) -> np.ndarray:
        """
        Decode the most probable hidden state path.

        The recursion runs in log-space so that long sequences do not underflow.

        Parameters
        ----------
        sequence : `List[int]`
            Observation symbols.

        Returns
        -------
        `np.ndarray`
            Integer array of length ``T`` with the decoded state at each step.
        """
        T = len(sequence)
        state_sequence = np.zeros(T, dtype=int)
        if T == 0:
            return state_sequence

        # log(0) = -inf is the correct score for an impossible event; silence the warning.
        with np.errstate(divide="ignore"):
            log_pi = np.log(self.pi)
            log_A = np.log(self.A)
            log_B = np.log(self.B)

        delta = np.zeros((T, self.n_states))
        psi = np.zeros((T, self.n_states), dtype=int)

        # Initialization
        delta[0] = log_pi + log_B[:, sequence[0]]

        # Recursion
        for t in range(1, T):
            scores = delta[t - 1][:, None] + log_A  # scores[j, i]: best path ending j -> i
            psi[t] = np.argmax(scores, axis=0)
            delta[t] = np.max(scores, axis=0) + log_B[:, sequence[t]]

        # Reconstruct the most probable path by following the back-pointers
        state_sequence[T - 1] = np.argmax(delta[T - 1])
        for t in range(T - 2, -1, -1):
            state_sequence[t] = psi[t + 1, state_sequence[t + 1]]

        return state_sequence

    def baum_welch(
        self, sequences: List[List[int]], n_iterations: int, verbose: bool = False
    ) -> None:
        """
        Re-estimate ``pi``, ``A`` and ``B`` in place with the Baum-Welch (EM) algorithm.

        Parameters
        ----------
        sequences : `List[List[int]]`
            Training observation sequences. Empty sequences are ignored.
        n_iterations : `int`
            Number of EM iterations.
        verbose : `bool`
            When true, clears the screen and logs the parameters every 10 iterations.
        """
        usable = [sequence for sequence in sequences if len(sequence) > 0]
        if not usable:
            # Nothing to learn from; leave the parameters untouched instead of writing NaN.
            return

        for iteration in range(n_iterations):
            # Initialize accumulators
            A_num = np.zeros((self.n_states, self.n_states))
            B_num = np.zeros((self.n_states, self.n_observations))
            pi_num = np.zeros(self.n_states)

            smooth_A = self.A + self._EPSILON
            smooth_B = self.B + self._EPSILON

            for sequence in usable:
                T = len(sequence)
                alpha, beta, gamma = self._posteriors(sequence)

                # Expected initial-state counts
                pi_num += gamma[0]

                # Expected transition counts: xi[i, j] is the posterior of i -> j at step t
                for t in range(T - 1):
                    xi = (
                        alpha[t][:, None]
                        * smooth_A
                        * (smooth_B[:, sequence[t + 1]] * beta[t + 1])[None, :]
                    )
                    A_num += xi / np.sum(xi)

                # Expected emission counts (every step, including the last one)
                for t in range(T):
                    B_num[:, sequence[t]] += gamma[t]

            # Normalize and update parameters
            self.pi = pi_num / len(usable)
            self.A = self._normalize_rows(A_num, self.A)
            self.B = self._normalize_rows(B_num, self.B)

            # Logging parameters every 10 iterations
            if iteration % 10 == 0 and verbose:
                os.system("cls" if os.name == "nt" else "clear")
                clear_output(wait=True)
                logging.info(f"Iteration {iteration}:")
                logging.info("Pi: %s", self.pi)
                logging.info("A:\n%s", self.A)
                logging.info("B:\n%s", self.B)

    def decoding_accuracy(self, sequences: List[List[int]], true_states: List[List[int]]) -> float:
        """
        Percentage of time steps whose Viterbi-decoded state equals the reference state.

        Parameters
        ----------
        sequences : `List[List[int]]`
            Observation sequences.
        true_states : `List[List[int]]`
            Reference hidden states, aligned with `sequences`.

        Returns
        -------
        `float`
            Accuracy in the range 0-100; ``0.0`` when there is nothing to score.
        """
        correct_predictions = 0
        total_predictions = 0

        for sequence, true_state in zip(sequences, true_states):
            predicted_states = self.viterbi(sequence)
            correct_predictions += int(np.sum(predicted_states == np.asarray(true_state)))
            total_predictions += len(sequence)

        if total_predictions == 0:
            return 0.0

        return float(correct_predictions / total_predictions * 100)

    def state_probabilities(self, sequence: List[int]) -> np.ndarray:
        """
        Returns the smoothed probabilities of the hidden states at each time step.
        This is done by using both forward and backward probabilities.

        Parameters
        ----------
        sequence : `List[int]`
            Observation symbols.

        Returns
        -------
        `np.ndarray`
            Array of shape ``(T, n_states)``; every row sums to one.
        """
        return self._posteriors(sequence)[2]

    def sequence_probability(self, sequence: List[int]) -> np.ndarray:
        """
        Return the posterior state distribution at the final time step.

        Despite its name, the result is a vector over states (the last row of
        `state_probabilities`), not the scalar likelihood of the sequence.
        """
        return self.state_probabilities(sequence)[-1]
|
|
@@ -5,7 +5,6 @@ from typing import List, Tuple, Union
|
|
|
5
5
|
import matplotlib.pyplot as plt
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
8
|
-
from numpy import ndarray
|
|
9
8
|
from pandas.core.frame import DataFrame
|
|
10
9
|
|
|
11
10
|
from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, cdf, check_nan_inf
|
|
@@ -66,12 +65,12 @@ class SimulationEngine(FeatureSelection):
|
|
|
66
65
|
|
|
67
66
|
super().__init__(**kwargs)
|
|
68
67
|
|
|
69
|
-
def predict(self, df: DataFrame, column: str) -> ndarray | list:
|
|
68
|
+
def predict(self, df: DataFrame, column: str) -> np.ndarray | list:
|
|
70
69
|
# Let us assign the dictionary entries corresponding to the column
|
|
71
70
|
w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]
|
|
72
71
|
|
|
73
72
|
df = df[names_cols].copy()
|
|
74
|
-
# Change the scale of the
|
|
73
|
+
# Change the scale of the DataFrame
|
|
75
74
|
dataset = self.df.copy()
|
|
76
75
|
dataset.drop(columns=column, inplace=True)
|
|
77
76
|
numeric_df = dataset.select_dtypes(include="number")
|
|
@@ -85,7 +84,7 @@ class SimulationEngine(FeatureSelection):
|
|
|
85
84
|
for col in numeric_df.columns:
|
|
86
85
|
df[col] = numeric_df[col].values
|
|
87
86
|
|
|
88
|
-
# Encoding the
|
|
87
|
+
# Encoding the DataFrame
|
|
89
88
|
for num, colname in enumerate(dfe._encode_columns):
|
|
90
89
|
if df[colname].dtype == "object":
|
|
91
90
|
encode_dict = dfe.encoding_list[num]
|
|
@@ -93,7 +92,7 @@ class SimulationEngine(FeatureSelection):
|
|
|
93
92
|
dfe._code_transformation_to, dictionary_list=encode_dict
|
|
94
93
|
)
|
|
95
94
|
|
|
96
|
-
#
|
|
95
|
+
# Prediction
|
|
97
96
|
y = df.to_numpy() @ w
|
|
98
97
|
|
|
99
98
|
# Categorical column
|
|
@@ -113,7 +112,7 @@ class SimulationEngine(FeatureSelection):
|
|
|
113
112
|
|
|
114
113
|
return y[:]
|
|
115
114
|
|
|
116
|
-
def _encode(self, df: DataFrame) -> ndarray | list:
|
|
115
|
+
def _encode(self, df: DataFrame) -> np.ndarray | list:
|
|
117
116
|
df = df.copy()
|
|
118
117
|
column = df.columns[0]
|
|
119
118
|
frec = df[column].value_counts() / len(df)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: likelihood
|
|
3
|
-
Version: 1.2.
|
|
3
|
+
Version: 1.2.25
|
|
4
4
|
Summary: A package that performs the maximum likelihood algorithm.
|
|
5
5
|
Home-page: https://github.com/jzsmoreno/likelihood/
|
|
6
6
|
Author: J. A. Moreno-Guerra
|
|
@@ -13,7 +13,7 @@ Classifier: Operating System :: OS Independent
|
|
|
13
13
|
Requires-Python: >=3.10
|
|
14
14
|
Description-Content-Type: text/markdown
|
|
15
15
|
License-File: LICENSE
|
|
16
|
-
Requires-Dist: black[jupyter]
|
|
16
|
+
Requires-Dist: black[jupyter]>=24.3.0
|
|
17
17
|
Requires-Dist: mypy-extensions==1.0.0
|
|
18
18
|
Requires-Dist: types-openpyxl==3.1.0.15
|
|
19
19
|
Requires-Dist: pydocstyle==6.3.0
|
|
@@ -31,6 +31,18 @@ Requires-Dist: pyvis; extra == "full"
|
|
|
31
31
|
Requires-Dist: tensorflow==2.15.0; extra == "full"
|
|
32
32
|
Requires-Dist: keras-tuner; extra == "full"
|
|
33
33
|
Requires-Dist: scikit-learn; extra == "full"
|
|
34
|
+
Dynamic: author
|
|
35
|
+
Dynamic: author-email
|
|
36
|
+
Dynamic: classifier
|
|
37
|
+
Dynamic: description
|
|
38
|
+
Dynamic: description-content-type
|
|
39
|
+
Dynamic: home-page
|
|
40
|
+
Dynamic: maintainer
|
|
41
|
+
Dynamic: maintainer-email
|
|
42
|
+
Dynamic: provides-extra
|
|
43
|
+
Dynamic: requires-dist
|
|
44
|
+
Dynamic: requires-python
|
|
45
|
+
Dynamic: summary
|
|
34
46
|
|
|
35
47
|

|
|
36
48
|
|
|
@@ -1,309 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
import os
|
|
3
|
-
from functools import partial
|
|
4
|
-
from shutil import rmtree
|
|
5
|
-
|
|
6
|
-
import keras_tuner
|
|
7
|
-
import numpy as np
|
|
8
|
-
import pandas as pd
|
|
9
|
-
import tensorflow as tf
|
|
10
|
-
from pandas.core.frame import DataFrame
|
|
11
|
-
|
|
12
|
-
from likelihood.tools import OneHotEncoder
|
|
13
|
-
|
|
14
|
-
logging.getLogger("tensorflow").setLevel(logging.ERROR)
|
|
15
|
-
|
|
16
|
-
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
@tf.keras.utils.register_keras_serializable(package="Custom", name="AutoClassifier")
class AutoClassifier(tf.keras.Model):
    """
    Keras model that chains an encoder, a decoder and a softmax classification head.

    The classifier receives the decoder output concatenated with the encoder output.

    Attributes:
        - input_shape_parm: Width of the decoder's last layer.
        - num_classes: Width of the softmax output layer.
        - units: Width of the first encoder layer and of the first decoder layer.
        - activation: Activation used by every encoder and decoder layer.
        - encoder, decoder, classifier: Sub-models, `None` until `build` runs.
    """

    _CONFIG_KEYS = ("input_shape_parm", "num_classes", "units", "activation")

    def __init__(self, input_shape_parm, num_classes, units, activation):
        """
        Store the hyperparameters; the layers themselves are created in `build`.

        Parameters
        ----------
        input_shape_parm : `int`
            Width of the decoder's last layer.
        num_classes : `int`
            Number of output classes.
        units : `int`
            Width of the first encoder / decoder layer.
        activation : `str`
            Activation function for the encoder and decoder layers.
        """
        super().__init__()
        self.input_shape_parm = input_shape_parm
        self.num_classes = num_classes
        self.units = units
        self.activation = activation

        # Populated by build().
        self.encoder = None
        self.decoder = None
        self.classifier = None

    def build(self, input_shape):
        """Create the encoder, decoder and classifier sub-models."""
        dense = tf.keras.layers.Dense
        half_width = int(self.units / 2)

        encoder_layers = [
            dense(units=self.units, activation=self.activation),
            dense(units=half_width, activation=self.activation),
        ]
        self.encoder = tf.keras.Sequential(encoder_layers)

        decoder_layers = [
            dense(units=self.units, activation=self.activation),
            dense(units=self.input_shape_parm, activation=self.activation),
        ]
        self.decoder = tf.keras.Sequential(decoder_layers)

        head = dense(self.num_classes, activation="softmax")
        self.classifier = tf.keras.Sequential([head])

    def call(self, x):
        """Forward pass: encode, decode, then classify the concatenation of both."""
        latent = self.encoder(x)
        reconstruction = self.decoder(latent)
        features = tf.concat([reconstruction, latent], axis=1)
        return self.classifier(features)

    def get_config(self):
        """Return the base Keras config extended with this model's hyperparameters."""
        merged = dict(super().get_config())
        merged.update({key: getattr(self, key) for key in self._CONFIG_KEYS})
        return merged

    @classmethod
    def from_config(cls, config):
        """Rebuild a model from the hyperparameters stored by `get_config`."""
        hyperparameters = {key: config[key] for key in cls._CONFIG_KEYS}
        return cls(**hyperparameters)
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
def call_existing_code(
    units: int,
    activation: str,
    threshold: float,
    optimizer: str,
    input_shape_parm: None | int = None,
    num_classes: None | int = None,
) -> AutoClassifier:
    """
    Create an `AutoClassifier` and compile it.

    The model is compiled with categorical cross-entropy loss and an F1-score metric.

    Parameters
    ----------
    units : `int`
        The number of neurons in each hidden layer.
    activation : `str`
        The activation function for the hidden layers.
    threshold : `float`
        Threshold passed to the F1-score metric.
    optimizer : `str`
        Optimizer passed to `compile`.
    input_shape_parm : `None` | `int`
        The shape of the input data.
    num_classes : `None` | `int`
        The number of classes in the dataset.

    Returns
    -------
    `AutoClassifier`
        The compiled model.
    """
    classifier = AutoClassifier(
        input_shape_parm=input_shape_parm,
        num_classes=num_classes,
        units=units,
        activation=activation,
    )
    cross_entropy = tf.keras.losses.CategoricalCrossentropy()
    f1_score = tf.keras.metrics.F1Score(threshold=threshold)
    classifier.compile(optimizer=optimizer, loss=cross_entropy, metrics=[f1_score])
    return classifier
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
def build_model(hp, input_shape_parm: None | int, num_classes: None | int) -> AutoClassifier:
    """Builds a neural network model using Keras Tuner's search algorithm.

    Parameters
    ----------
    hp : `keras_tuner.HyperParameters`
        The hyperparameters to tune.
    input_shape_parm : `None` | `int`
        The shape of the input data. It is multiplied by ``0.2`` to derive the
        search range, so passing `None` raises a `TypeError`.
    num_classes : `None` | `int`
        The number of classes in the dataset.

    Returns
    -------
    `keras.Model`
        The neural network model.
    """
    # AutoClassifier's encoder uses a second layer of width int(units / 2), so fewer
    # than two units would give a zero-width layer. For fewer than 10 input features
    # int(input_shape_parm * 0.2) is 0 or 1, hence the floor of 2.
    min_units = max(2, int(input_shape_parm * 0.2))
    # Keep the range non-empty when the input itself is narrower than the floor.
    max_units = max(min_units, input_shape_parm)

    units = hp.Int("units", min_value=min_units, max_value=max_units, step=2)
    activation = hp.Choice("activation", ["sigmoid", "relu", "tanh", "selu", "softplus"])
    optimizer = hp.Choice("optimizer", ["sgd", "adam", "adadelta"])
    threshold = hp.Float("threshold", min_value=0.1, max_value=0.9, sampling="log")

    model = call_existing_code(
        units=units,
        activation=activation,
        threshold=threshold,
        optimizer=optimizer,
        input_shape_parm=input_shape_parm,
        num_classes=num_classes,
    )
    return model
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
def setup_model(
    data: DataFrame,
    target: str,
    epochs: int,
    train_size: float = 0.7,
    seed=None,
    train_mode: bool = True,
    filepath: str = "./my_dir/best_model",
    **kwargs,
) -> AutoClassifier:
    """Setup model for training and tuning.

    In training mode a Keras Tuner random search is run, the best model is saved to
    `filepath` and returned. Otherwise the model stored at `filepath` is loaded.

    Parameters
    ----------
    data : `DataFrame`
        The dataset to train the model on.
    target : `str`
        The name of the target column.
    epochs : `int`
        The number of epochs to train the model for.
    train_size : `float`
        The proportion of the dataset to use for training.
    seed : `Any` | `int`
        The random seed to use for reproducibility.
    train_mode : `bool`
        Whether to train the model or not.
    filepath : `str`
        The path to save the best model to.

    Keyword Arguments:
    ----------
    Additional keyword arguments to pass to the model.

    max_trials : `int`
        The maximum number of trials to perform (default 10).
    directory : `str`
        The directory to save the model to (default "./my_dir").
        In training mode an existing directory is deleted and recreated.
    project_name : `str`
        The name of the project (default "get_best").
    objective : `str`
        The objective to optimize (default "val_loss").
    verbose : `bool`
        Whether to print verbose output (default True).

    Returns
    -------
    model : `AutoClassifier`
        The trained model.
    """
    max_trials = kwargs.get("max_trials", 10)
    directory = kwargs.get("directory", "./my_dir")
    project_name = kwargs.get("project_name", "get_best")
    objective = kwargs.get("objective", "val_loss")
    verbose = kwargs.get("verbose", True)

    X = data.drop(columns=target)
    input_sample = X.sample(1)
    y = data[target]
    # Verify that there are no categorical columns left in the dataframe
    assert X.select_dtypes(
        include=["object"]
    ).empty, "Categorical variables within the DataFrame must be encoded, this is done by using the DataFrameEncoder from likelihood."
    validation_split = 1.0 - train_size

    if train_mode:
        # Start from a clean tuner directory (the current directory is never touched)
        try:
            if (not os.path.exists(directory)) and directory != "./":
                os.makedirs(directory)
            elif directory != "./":
                print(f"Directory {directory} already exists, it will be deleted.")
                rmtree(directory)
                os.makedirs(directory)
        except OSError:
            # Best effort: only filesystem errors are tolerated here.
            print("Warning: unable to create directory")

        # One-hot encode the target and convert everything to float32 arrays
        y_encoder = OneHotEncoder()
        y = y_encoder.encode(y.to_list())
        X = np.asarray(X.to_numpy()).astype(np.float32)
        input_sample = np.asarray(input_sample).astype(np.float32)
        y = np.asarray(y).astype(np.float32)

        input_shape_parm = X.shape[1]
        num_classes = y.shape[1]
        # NOTE: this rebinds the module-level build_model to a partial with the data
        # dimensions pre-filled; the rebinding persists after this call returns.
        global build_model
        build_model = partial(
            build_model, input_shape_parm=input_shape_parm, num_classes=num_classes
        )

        # Random-search tuner over the build_model search space
        tuner = keras_tuner.RandomSearch(
            hypermodel=build_model,
            objective=objective,
            max_trials=max_trials,
            directory=directory,
            project_name=project_name,
            seed=seed,
        )

        tuner.search(X, y, epochs=epochs, validation_split=validation_split)
        models = tuner.get_best_models(num_models=2)
        best_model = models[0]
        # Call the model once on a sample before saving it
        best_model(input_sample)

        # save model
        best_model.save(filepath, save_format="tf")

        if verbose:
            tuner.results_summary()
    else:
        # Load the best model from the directory
        best_model = tf.keras.models.load_model(filepath)

    return best_model
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
########################################################################################
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|