likelihood 1.2.16__py3-none-any.whl → 1.2.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- likelihood/graph/nn.py +344 -0
- likelihood/models/deep/autoencoders.py +53 -24
- likelihood/models/simulation.py +51 -39
- likelihood/tools/numeric_tools.py +57 -30
- likelihood/tools/tools.py +28 -10
- {likelihood-1.2.16.dist-info → likelihood-1.2.18.dist-info}/METADATA +3 -2
- {likelihood-1.2.16.dist-info → likelihood-1.2.18.dist-info}/RECORD +10 -9
- {likelihood-1.2.16.dist-info → likelihood-1.2.18.dist-info}/WHEEL +1 -1
- {likelihood-1.2.16.dist-info → likelihood-1.2.18.dist-info}/LICENSE +0 -0
- {likelihood-1.2.16.dist-info → likelihood-1.2.18.dist-info}/top_level.txt +0 -0
likelihood/graph/nn.py
ADDED
@@ -0,0 +1,344 @@
+import warnings
+from typing import List, Tuple
+
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+from numpy import ndarray
+from pandas.core.frame import DataFrame
+from sklearn.metrics import f1_score
+from sklearn.model_selection import train_test_split
+
+from likelihood.tools import generate_feature_yaml
+
+
+def compare_similarity(arr1: ndarray, arr2: ndarray) -> int:
+    """Compares the similarity between two arrays of categories.
+
+    Parameters
+    ----------
+    arr1 : `ndarray`
+        The first array of categories.
+    arr2 : `ndarray`
+        The second array of categories.
+
+    Returns
+    -------
+    count: `int`
+        The number of categories that are the same in both arrays.
+    """
+
+    count = 0
+    for i in range(len(arr1)):
+        if arr1[i] == arr2[i]:
+            count += 1
+    return count
+
+
+def cal_adjency_matrix(
+    df: DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
+) -> Tuple[dict, ndarray]:
+    """Calculates the adjacency matrix for a given DataFrame.
+    The adjacency matrix is a matrix that represents the similarity between each pair of categories.
+    The similarity is calculated using the `compare_similarity` function.
+    The resulting matrix is a square matrix with the same number of rows and columns as the input DataFrame.
+
+    Parameters
+    ----------
+    df : `DataFrame`
+        The input DataFrame containing the categories.
+    exclude_subset : `List[str]`, optional
+        A list of categories to exclude from the calculation of the adjacency matrix.
+    sparse : `bool`, optional
+        Whether to return a sparse matrix or a dense matrix.
+    **kwargs : `dict`
+        Additional keyword arguments to pass to the `compare_similarity` function.
+
+    Keyword Arguments:
+    ----------
+    similarity: `int`
+        The minimum number of categories that must be the same in both arrays to be considered similar.
+
+    Returns
+    -------
+    adj_dict : `dict`
+        A dictionary containing the categories.
+    adjacency_matrix : `ndarray`
+        The adjacency matrix.
+    """
+
+    yaml_ = generate_feature_yaml(df)
+    categorical_columns = yaml_["categorical_features"]
+    if len(exclude_subset) > 0:
+        categorical_columns = [col for col in categorical_columns if col not in exclude_subset]
+
+    if len(categorical_columns) > 1:
+        df_categorical = df[categorical_columns].copy()
+    else:
+        categorical_columns = [
+            col
+            for col in df.columns
+            if (
+                col not in exclude_subset
+                and pd.api.types.is_integer_dtype(df[col])
+                and len(df[col].unique()) > 2
+            )
+        ]
+        df_categorical = df[categorical_columns].copy()
+
+    assert len(df_categorical) > 0
+
+    similarity = kwargs["similarity"] if "similarity" in kwargs else len(df_categorical.columns) - 1
+    assert similarity <= df_categorical.shape[1]
+
+    adj_dict = {}
+    for index, row in df_categorical.iterrows():
+        adj_dict[index] = row.to_list()
+
+    adjacency_matrix = np.zeros((len(df_categorical), len(df_categorical)))
+
+    for i in range(len(df_categorical)):
+        for j in range(len(df_categorical)):
+            if compare_similarity(adj_dict[i], adj_dict[j]) >= similarity:
+                adjacency_matrix[i][j] = 1
+
+    if sparse:
+        num_nodes = adjacency_matrix.shape[0]
+
+        indices = np.argwhere(adjacency_matrix != 0.0)
+        indices = tf.constant(indices, dtype=tf.int64)
+        values = tf.constant(adjacency_matrix[indices[:, 0], indices[:, 1]], dtype=tf.float32)
+        adjacency_matrix = tf.sparse.SparseTensor(
+            indices=indices, values=values, dense_shape=(num_nodes, num_nodes)
+        )
+
+        return adj_dict, adjacency_matrix
+    else:
+        return adj_dict, adjacency_matrix
+
+
+class Data:
+    def __init__(
+        self,
+        df: DataFrame,
+        target: str | None = None,
+        exclude_subset: List[str] = [],
+    ):
+        _, adjacency = cal_adjency_matrix(df, exclude_subset=exclude_subset, sparse=True)
+        if target is not None:
+            X = df.drop(columns=[target] + exclude_subset)
+        else:
+            X = df.drop(columns=exclude_subset)
+        self.columns = X.columns
+        X = X.to_numpy()
+        self.x = np.asarray(X).astype(np.float32)
+        self.adjacency = adjacency
+        if target is not None:
+            self.y = np.asarray(df[target].values).astype(np.int32)
+
+
+@tf.keras.utils.register_keras_serializable(package="Custom", name="VanillaGNNLayer")
+class VanillaGNNLayer(tf.keras.layers.Layer):
+    def __init__(self, dim_in, dim_out, kernel_initializer="glorot_uniform", **kwargs):
+        super(VanillaGNNLayer, self).__init__(**kwargs)
+        self.dim_out = dim_out
+        self.kernel_initializer = kernel_initializer
+        self.linear = None
+
+    def build(self, input_shape):
+        self.linear = tf.keras.layers.Dense(
+            self.dim_out, use_bias=False, kernel_initializer=self.kernel_initializer
+        )
+        super(VanillaGNNLayer, self).build(input_shape)
+
+    def call(self, x, adjacency):
+        x = self.linear(x)
+        x = tf.sparse.sparse_dense_matmul(adjacency, x)
+        return x
+
+    def get_config(self):
+        config = super(VanillaGNNLayer, self).get_config()
+        config.update(
+            {
+                "dim_out": self.dim_out,
+                "kernel_initializer": tf.keras.initializers.serialize(
+                    self.linear.kernel_initializer
+                ),
+            }
+        )
+        return config
+
+
+@tf.keras.utils.register_keras_serializable(package="Custom", name="VanillaGNN")
+class VanillaGNN(tf.keras.Model):
+    def __init__(self, dim_in, dim_h, dim_out, **kwargs):
+        super(VanillaGNN, self).__init__(**kwargs)
+        self.dim_in = dim_in
+        self.dim_h = dim_h
+        self.dim_out = dim_out
+        self.gnn1 = VanillaGNNLayer(self.dim_in, self.dim_h)
+        self.gnn2 = VanillaGNNLayer(self.dim_h, self.dim_h)
+        self.gnn3 = VanillaGNNLayer(self.dim_h, self.dim_out)
+
+    def build(self, input_shape):
+        super(VanillaGNN, self).build(input_shape)
+        dummy_input = tf.keras.Input(shape=input_shape[1:])
+        dummy_adjacency = tf.sparse.SparseTensor(
+            indices=[[0, 0]], values=[1.0], dense_shape=[input_shape[0], input_shape[0]]
+        )
+        _ = self(dummy_input, dummy_adjacency)
+
+    def call(self, x, adjacency):
+        h = self.gnn1(x, adjacency)
+        h = tf.nn.tanh(h)
+        h = self.gnn2(h, adjacency)
+        h = self.gnn3(h, adjacency)
+        return tf.nn.softmax(h, axis=1)
+
+    def f1_macro(self, y_true, y_pred):
+        return f1_score(y_true, y_pred, average="macro")
+
+    def compute_f1_score(self, logits, labels):
+        predictions = tf.argmax(logits, axis=1, output_type=tf.int32)
+        true_labels = tf.cast(labels, tf.int32)
+        return self.f1_macro(true_labels.numpy(), predictions.numpy())
+
+    def evaluate(self, x, adjacency, y):
+        y = tf.cast(y, tf.int32)
+        out = self(x, adjacency)
+        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=out)
+        loss = tf.reduce_mean(loss)
+        f1 = self.compute_f1_score(out, y)
+        return loss.numpy(), f1
+
+    def test(self, data):
+        out = self(data.x, data.adjacency)
+        test_f1 = self.compute_f1_score(out, data.y)
+        return test_f1
+
+    def predict(self, data):
+        out = self(data.x, data.adjacency)
+        return tf.argmax(out, axis=1, output_type=tf.int32).numpy()
+
+    def get_config(self):
+        config = {
+            "dim_in": self.dim_in,
+            "dim_h": self.dim_h,
+            "dim_out": self.dim_out,
+        }
+        base_config = super(VanillaGNN, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    @classmethod
+    def from_config(cls, config):
+        return cls(
+            dim_in=config["dim_in"],
+            dim_h=config["dim_h"],
+            dim_out=config["dim_out"],
+        )
+
+    @tf.function
+    def train_step(self, batch_x, batch_adjacency, batch_y, optimizer):
+        with tf.GradientTape() as tape:
+            out = self(batch_x, batch_adjacency)
+            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=batch_y, logits=out)
+            loss = tf.reduce_mean(loss)
+        gradients = tape.gradient(loss, self.trainable_variables)
+        optimizer.apply_gradients(zip(gradients, self.trainable_variables))
+        return loss
+
+    def fit(self, data, epochs, batch_size, test_size=0.2, optimizer="adam"):
+        warnings.warn(
+            "It is normal for validation metrics to underperform. Use the test method to validate after training.",
+            UserWarning,
+        )
+        optimizers = {
+            "sgd": tf.keras.optimizers.SGD(),
+            "adam": tf.keras.optimizers.Adam(),
+            "adamw": tf.keras.optimizers.AdamW(),
+            "adadelta": tf.keras.optimizers.Adadelta(),
+            "rmsprop": tf.keras.optimizers.RMSprop(),
+        }
+        optimizer = optimizers[optimizer]
+        train_losses = []
+        train_f1_scores = []
+        val_losses = []
+        val_f1_scores = []
+
+        X_train, X_test, y_train, y_test = train_test_split(
+            data.x, data.y, test_size=test_size, shuffle=False
+        )
+        adjacency_train = tf.sparse.slice(data.adjacency, [0, 0], [len(X_train), len(X_train)])
+        adjacency_test = tf.sparse.slice(
+            data.adjacency, [len(X_train), 0], [len(X_test), len(X_test)]
+        )
+
+        batch_starts = np.arange(0, len(X_train), batch_size)
+        for epoch in range(epochs):
+            np.random.shuffle(batch_starts)
+            for start in batch_starts:
+                end = start + batch_size
+                batch_x = X_train[start:end, :]
+                batch_adjacency = tf.sparse.slice(
+                    adjacency_train, [start, start], [batch_size, batch_size]
+                )
+                batch_y = y_train[start:end]
+                train_loss = self.train_step(batch_x, batch_adjacency, batch_y, optimizer)
+
+            train_loss, train_f1 = self.evaluate(X_train, adjacency_train, y_train)
+            train_losses.append(train_loss)
+            train_f1_scores.append(train_f1)
+
+            if epoch % 2 == 0:
+                val_loss, val_f1 = self.evaluate(X_test, adjacency_test, y_test)
+                val_losses.append(val_loss)
+                val_f1_scores.append(val_f1)
+                print(
+                    f"Epoch {epoch:>3} | Train Loss: {train_loss:.3f} | Train F1: {train_f1:.3f} | Val Loss: {val_loss:.3f} | Val F1: {val_f1:.3f}"
+                )
+
+        return train_losses, train_f1_scores, val_losses, val_f1_scores
+
+
+if __name__ == "__main__":
+    # Example usage
+    import pandas as pd
+    from sklearn.datasets import load_iris
+
+    # Load the dataset
+    iris = load_iris()
+
+    # Convert to a DataFrame for easy exploration
+    iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
+    iris_df["species"] = iris.target
+
+    iris_df["sepal length (cm)"] = iris_df["sepal length (cm)"].astype("category")
+    iris_df["sepal width (cm)"] = iris_df["sepal width (cm)"].astype("category")
+    iris_df["petal length (cm)"] = iris_df["petal length (cm)"].astype("category")
+    iris_df["petal width (cm)"] = iris_df["petal width (cm)"].astype("category")
+
+    # Display the first few rows of the dataset
+    print(iris_df.head())
+
+    iris_df = iris_df.sample(frac=1, replace=False).reset_index(drop=True)
+
+    data = Data(iris_df, "species")
+
+    model = VanillaGNN(dim_in=data.x.shape[1], dim_h=8, dim_out=len(iris_df["species"].unique()))
+    print("Before training F1:", model.test(data))
+    model.fit(data, epochs=200, batch_size=32, test_size=0.5)
+    model.save("./best_model.keras")
+    print("After training F1:", model.test(data))
+    best_model = tf.keras.models.load_model("./best_model.keras")
+
+    print("After loading F1:", best_model.test(data))
+    df_results = pd.DataFrame()
+
+    # Suppose we have a new dataset without the target variable
+    iris_df = iris_df.drop(columns=["species"])
+    data_new = Data(iris_df)
+    print("Predictions:", best_model.predict(data_new))
+    df_results["predicted"] = list(model.predict(data))
+    df_results["actual"] = list(data.y)
+    # df_results.to_csv("results.csv", index=False)
+    breakpoint()
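Note on the adjacency construction above: rows become graph nodes and an edge is added whenever two rows agree on at least `similarity` categorical values. A minimal sketch of that behaviour on hypothetical toy data (it assumes `generate_feature_yaml` reports the two string columns under "categorical_features"):

import pandas as pd

from likelihood.graph.nn import cal_adjency_matrix

# Rows 0 and 1 share both category values; row 2 shares none of them.
toy = pd.DataFrame({"color": ["red", "red", "blue"], "size": ["S", "S", "L"]})

# similarity=2 requires both values to match before an edge is drawn.
adj_dict, adjacency = cal_adjency_matrix(toy, sparse=False, similarity=2)
print(adj_dict)    # {0: ['red', 'S'], 1: ['red', 'S'], 2: ['blue', 'L']}
print(adjacency)   # 3x3 dense matrix: ones on the diagonal and between rows 0 and 1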
likelihood/models/deep/autoencoders.py
CHANGED
@@ -1,5 +1,6 @@
 import os
 from functools import partial
+from shutil import rmtree
 
 import keras_tuner
 import numpy as np
@@ -15,26 +16,26 @@ class AutoClassifier(tf.keras.Model):
     An auto-classifier model that automatically determines the best classification strategy based on the input data.
 
     Attributes:
-    -
+    - input_shape_parm: The shape of the input data.
     - num_classes: The number of classes in the dataset.
     - units: The number of neurons in each hidden layer.
    - activation: The type of activation function to use for the neural network layers.
 
     Methods:
-    __init__(self,
-    build(self,
+    __init__(self, input_shape_parm, num_classes, units, activation): Initializes an AutoClassifier instance with the given parameters.
+    build(self, input_shape_parm): Builds the model architecture based on input_shape_parm.
     call(self, x): Defines the forward pass of the model.
     get_config(self): Returns the configuration of the model.
     from_config(cls, config): Recreates an instance of AutoClassifier from its configuration.
     """
 
-    def __init__(self,
+    def __init__(self, input_shape_parm, num_classes, units, activation):
         """
         Initializes an AutoClassifier instance with the given parameters.
 
         Parameters
         ----------
-
+        input_shape_parm : `int`
            The shape of the input data.
         num_classes : `int`
             The number of classes in the dataset.
@@ -44,7 +45,7 @@ class AutoClassifier(tf.keras.Model):
            The type of activation function to use for the neural network layers.
         """
         super(AutoClassifier, self).__init__()
-        self.
+        self.input_shape_parm = input_shape_parm
         self.num_classes = num_classes
         self.units = units
         self.activation = activation
@@ -53,7 +54,7 @@ class AutoClassifier(tf.keras.Model):
         self.decoder = None
         self.classifier = None
 
-    def build(self,
+    def build(self, input_shape_parm):
         self.encoder = tf.keras.Sequential(
             [
                 tf.keras.layers.Dense(units=self.units, activation=self.activation),
@@ -64,7 +65,7 @@ class AutoClassifier(tf.keras.Model):
         self.decoder = tf.keras.Sequential(
             [
                 tf.keras.layers.Dense(units=self.units, activation=self.activation),
-                tf.keras.layers.Dense(units=self.
+                tf.keras.layers.Dense(units=self.input_shape_parm, activation=self.activation),
             ]
         )
 
@@ -81,7 +82,7 @@ class AutoClassifier(tf.keras.Model):
 
     def get_config(self):
         config = {
-            "
+            "input_shape_parm": self.input_shape_parm,
             "num_classes": self.num_classes,
             "units": self.units,
             "activation": self.activation,
@@ -92,7 +93,7 @@ class AutoClassifier(tf.keras.Model):
     @classmethod
     def from_config(cls, config):
         return cls(
-
+            input_shape_parm=config["input_shape_parm"],
             num_classes=config["num_classes"],
             units=config["units"],
             activation=config["activation"],
@@ -104,7 +105,7 @@ def call_existing_code(
     activation: str,
     threshold: float,
     optimizer: str,
-
+    input_shape_parm: None | int = None,
     num_classes: None | int = None,
 ) -> AutoClassifier:
     """
@@ -120,7 +121,7 @@ def call_existing_code(
         The threshold for the classifier.
     optimizer : `str`
         The type of optimizer to use for the neural network layers.
-
+    input_shape_parm : `None` | `int`
         The shape of the input data.
     num_classes : `int`
         The number of classes in the dataset.
@@ -131,7 +132,10 @@ def call_existing_code(
         The AutoClassifier instance.
     """
     model = AutoClassifier(
-
+        input_shape_parm=input_shape_parm,
+        num_classes=num_classes,
+        units=units,
+        activation=activation,
     )
     model.compile(
         optimizer=optimizer,
@@ -141,14 +145,14 @@ def call_existing_code(
     return model
 
 
-def build_model(hp,
+def build_model(hp, input_shape_parm: None | int, num_classes: None | int) -> AutoClassifier:
     """Builds a neural network model using Keras Tuner's search algorithm.
 
     Parameters
     ----------
     hp : `keras_tuner.HyperParameters`
         The hyperparameters to tune.
-
+    input_shape_parm : `None` | `int`
         The shape of the input data.
     num_classes : `int`
         The number of classes in the dataset.
@@ -158,7 +162,9 @@ def build_model(hp, input_shape: None | int, num_classes: None | int) -> AutoCla
     `keras.Model`
         The neural network model.
     """
-    units = hp.Int(
+    units = hp.Int(
+        "units", min_value=int(input_shape_parm * 0.2), max_value=input_shape_parm, step=2
+    )
     activation = hp.Choice("activation", ["sigmoid", "relu", "tanh", "selu", "softplus"])
     optimizer = hp.Choice("optimizer", ["sgd", "adam", "adadelta"])
     threshold = hp.Float("threshold", min_value=0.1, max_value=0.9, sampling="log")
@@ -168,14 +174,21 @@ def build_model(hp, input_shape: None | int, num_classes: None | int) -> AutoCla
        activation=activation,
        threshold=threshold,
        optimizer=optimizer,
-
+        input_shape_parm=input_shape_parm,
        num_classes=num_classes,
    )
    return model
 
 
 def setup_model(
-    data: DataFrame,
+    data: DataFrame,
+    target: str,
+    epochs: int,
+    train_size: float = 0.7,
+    seed=None,
+    train_mode: bool = True,
+    filepath: str = "./my_dir/best_model.keras",
+    **kwargs,
 ) -> AutoClassifier:
     """Setup model for training and tuning.
 
@@ -191,6 +204,10 @@ def setup_model(
         The proportion of the dataset to use for training.
     seed : `Any` | `int`
         The random seed to use for reproducibility.
+    train_mode : `bool`
+        Whether to train the model or not.
+    filepath : `str`
+        The path to save the best model to.
 
     Keyword Arguments:
     ----------
@@ -226,8 +243,18 @@ def setup_model(
     ), "Categorical variables within the DataFrame must be encoded, this is done by using the DataFrameEncoder from likelihood."
     validation_split = 1.0 - train_size
     # Create my_dir path if it does not exist
-
-
+
+    if train_mode:
+        # Create a new directory if it does not exist
+        try:
+            if not os.path.exists(directory):
+                os.makedirs(directory)
+            else:
+                print(f"Directory {directory} already exists, it will be deleted.")
+                rmtree(directory)
+                os.makedirs(directory)
+        except:
+            print("Warning: unable to create directory")
 
     # Create a Classifier instance
     y_encoder = OneHotEncoder()
@@ -237,10 +264,12 @@ def setup_model(
 
     y = np.asarray(y).astype(np.float32)
 
-
+    input_shape_parm = X.shape[1]
     num_classes = y.shape[1]
     global build_model
-    build_model = partial(
+    build_model = partial(
+        build_model, input_shape_parm=input_shape_parm, num_classes=num_classes
+    )
 
     # Create the AutoKeras model
     tuner = keras_tuner.RandomSearch(
@@ -257,13 +286,13 @@ def setup_model(
     best_model = models[0]
 
     # save model
-    best_model.save(
+    best_model.save(filepath)
 
     if verbose:
         tuner.results_summary()
    else:
        # Load the best model from the directory
-        best_model = tf.keras.models.load_model(
+        best_model = tf.keras.models.load_model(filepath)
 
     return best_model
 
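The reworked `setup_model` signature above separates a tuning run from a reload-only run. A rough sketch of both modes, assuming `df` is an already encoded DataFrame with a hypothetical "label" column and that the remaining keyword arguments keep their defaults:

from likelihood.models.deep.autoencoders import setup_model

# First run: tune hyperparameters, train and persist the best AutoClassifier.
best_model = setup_model(
    data=df, target="label", epochs=20,
    train_size=0.7, train_mode=True,
    filepath="./my_dir/best_model.keras",
)

# Later runs: skip tuning and reload the saved model from `filepath`.
best_model = setup_model(
    data=df, target="label", epochs=20,
    train_mode=False, filepath="./my_dir/best_model.keras",
)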
likelihood/models/simulation.py
CHANGED
@@ -10,53 +10,65 @@ from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, check_
 
 
 class SimulationEngine(FeatureSelection):
+    """
+    This class implements a predictive model that utilizes multiple linear regression for numerical target variables
+    and multiple logistic regression for categorical target variables.
 
-
+    The class provides methods for training the model on a given dataset, making predictions,
+    and evaluating the model's performance.
+
+    Key features:
+    - Supports both numerical and categorical target variables, automatically selecting the appropriate regression method.
+    - Includes methods for data preprocessing, model fitting, prediction, and evaluation metrics.
+    - Designed to be flexible and user-friendly, allowing for easy integration with various datasets.
+
+    Usage:
+    - Instantiate the class with the training data and target variable.
+    - Call the fit method to train the model.
+    - Use the predict method to generate predictions on new data.
+    - Evaluate the model using built-in metrics for accuracy and error.
+
+    This class is suitable for applications in data analysis and machine learning, enabling users to leverage regression techniques
+    for both numerical and categorical outcomes efficiently.
+    """
+
+    def __init__(self, df: DataFrame, n_importances: int, use_scaler: bool = False, **kwargs):
 
         self.df = df
         self.n_importances = n_importances
+        self.use_scaler = use_scaler
 
         super().__init__(**kwargs)
 
-    def predict(self, df: DataFrame, column: str
-
-        # We clean the data set
-        df = self._clean_data(df)
-
+    def predict(self, df: DataFrame, column: str) -> ndarray | list:
        # Let us assign the dictionary entries corresponding to the column
        w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]
 
-
-
-
-
+        df = df[names_cols].copy()
+        # Change the scale of the dataframe
+        dataset = self.df.copy()
+        dataset.drop(columns=column, inplace=True)
+        numeric_df = dataset.select_dtypes(include="number")
+        if self.use_scaler:
            scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
-
+            _ = scaler.rescale()
+            dataset_ = df.copy()
+            numeric_df = dataset_.select_dtypes(include="number")
+            numeric_scaled = scaler.rescale(dataset_=numeric_df.to_numpy())
            numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
-
-
-        # Encoding the datadrame
-        for num, colname in enumerate(dfe._encode_columns):
-            if df[colname].dtype == "object":
-                encode_dict = dfe.encoding_list[num]
-                df[colname] = df[colname].apply(
-                    dfe._code_transformation_to, dictionary_list=encode_dict
-                )
-
-        except:
-            print("The dataframe provided does not have the same columns as in the fit method.")
-
-        # Assign value to n if n is None
-        n = n if n != None else len(df)
-
-        # Generation of assertion
-        assert n > 0 and n <= len(df), '"n" must be interger or "<= len(df)".'
+            for col in numeric_df.columns:
+                df[col] = numeric_df[col].values
 
-        #
-
+        # Encoding the datadrame
+        for num, colname in enumerate(dfe._encode_columns):
+            if df[colname].dtype == "object":
+                encode_dict = dfe.encoding_list[num]
+                df[colname] = df[colname].apply(
+                    dfe._code_transformation_to, dictionary_list=encode_dict
+                )
 
         # PREDICTION
-        y =
+        y = df.to_numpy() @ w
 
         # Categorical column
         if quick_encoder != None:
@@ -67,18 +79,18 @@ class SimulationEngine(FeatureSelection):
            y = [encoding_dic[item] for item in y]
        # Numeric column
        else:
-
-
-
-
-
+            if self.use_scaler:
+                # scale output
+                y += 1
+                y /= 2
+                y = y * (self.df[column].max() - self.df[column].min())
 
-        return y
+        return y[:]
 
     def fit(self, **kwargs) -> None:
 
         # We run the feature selection algorithm
-        self.get_digraph(self.df, self.n_importances)
+        self.get_digraph(self.df, self.n_importances, self.use_scaler)
 
     def _clean_data(self, df: DataFrame) -> DataFrame:
 
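A condensed version of the usage pattern described in the new `SimulationEngine` docstring, with hypothetical names (`df` as the training DataFrame and "price" as a numeric target column):

from likelihood.models.simulation import SimulationEngine

engine = SimulationEngine(df, n_importances=3, use_scaler=True)
engine.fit()                                    # runs feature selection and stores per-column weights in w_dict
predictions = engine.predict(new_df, "price")   # new_df must provide the columns selected during fit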
likelihood/tools/numeric_tools.py
CHANGED
@@ -1,14 +1,14 @@
 from typing import Dict
 
 import numpy as np
+import pandas as pd
 from numpy import arange, array, ndarray, random
 from numpy.linalg import solve
 from pandas.core.frame import DataFrame
 
-# -------------------------------------------------------------------------
-
 
-
+# -------------------------------------------------------------------------
+def xi_corr(df: pd.DataFrame) -> pd.DataFrame:
     """Calculate new coefficient of correlation for all pairs of columns in a `DataFrame`.
 
     Parameters
@@ -19,11 +19,15 @@ def xi_corr(df: DataFrame) -> DataFrame:
     Returns
     -------
     `DataFrame`
-        A dataframe with variable names as
-        correlation coefficients
+        A square dataframe with variable names as both index and columns,
+        containing their corresponding correlation coefficients.
     """
-
-    columns = df.columns
+
+    columns = df.select_dtypes(include="number").columns
+    n = len(columns)
+
+    # Initialize a square matrix for the correlations
+    correlations = pd.DataFrame(1.0, index=columns, columns=columns)
 
     for i, col1 in enumerate(columns):
         for j, col2 in enumerate(columns):
@@ -32,9 +36,9 @@ def xi_corr(df: DataFrame) -> DataFrame:
             y = df[col2].values
 
             correlation = xicor(x, y)
-            correlations[
-
-
+            correlations.loc[col1, col2] = round(correlation, 8)
+            correlations.loc[col2, col1] = round(correlation, 8)  # Mirror the correlation
+
     return correlations
 
 
@@ -51,10 +55,11 @@ def xi_corr(df: DataFrame) -> DataFrame:
     """
 
 
-def xicor(X: ndarray, Y: ndarray, ties: bool = True) -> float:
-    """
+def xicor(X: np.ndarray, Y: np.ndarray, ties: bool = True, random_seed: int = None) -> float:
+    """
+    Calculate a generalized coefficient of correlation between two variables.
 
-
+    This coefficient is an extension of Pearson's correlation, accounting for ties with optional randomization.
 
     Parameters
     ----------
@@ -62,30 +67,52 @@ def xicor(X: ndarray, Y: ndarray, ties: bool = True) -> float:
         The first variable to be correlated. Must have at least one dimension.
     Y : `np.ndarray`
         The second variable to be correlated. Must have at least one dimension.
+    ties : bool
+        Whether to handle ties using randomization.
+    random_seed : int, optional
+        Seed for the random number generator for reproducibility.
 
     Returns
     -------
     xi : `float`
         The estimated value of the new coefficient of correlation.
     """
-
+
+    # Early return for identical arrays
+    if np.array_equal(X, Y):
+        return 1.0
+
     n = len(X)
-
+
+    # Early return for cases with less than 2 elements
+    if n < 2:
+        return 0.0
+
+    # Flatten the input arrays if they are multidimensional
+    X = X.flatten()
+    Y = Y.flatten()
+
+    # Get the sorted order of X
+    order = np.argsort(X)
+
     if ties:
-
-
-
-
-
-
-
-
-
-
-
+        np.random.seed(random_seed)  # Set seed for reproducibility if needed
+        ranks = np.argsort(np.argsort(Y[order]))  # Get ranks
+        unique_ranks, counts = np.unique(ranks, return_counts=True)
+
+        # Adjust ranks for ties by shuffling
+        for rank, count in zip(unique_ranks, counts):
+            if count > 1:
+                tie_indices = np.where(ranks == rank)[0]
+                np.random.shuffle(ranks[tie_indices])  # Randomize ties
+
+        cumulative_counts = np.array([np.sum(y >= Y[order]) for y in Y[order]])
+        return 1 - n * np.sum(np.abs(ranks[1:] - ranks[: n - 1])) / (
+            2 * np.sum(cumulative_counts * (n - cumulative_counts))
+        )
     else:
-
-        return 1 - 3 * sum(abs(
+        ranks = np.argsort(np.argsort(Y[order]))  # Get ranks without randomization
+        return 1 - 3 * np.sum(np.abs(ranks[1:] - ranks[: n - 1])) / (n**2 - 1)
 
 
 # -------------------------------------------------------------------------
@@ -257,8 +284,8 @@ if __name__ == "__main__":
     print("New correlation coefficient test")
     X = np.random.rand(100, 1)
     Y = X * X
-    print("coefficient for Y = X * X : ", xicor(X, Y))
-
+    print("coefficient for Y = X * X : ", xicor(X, Y, False))
+    df["index"] = ["A", "B", "C", "D"]
     print("New correlation coefficient test for pandas DataFrame")
     values_df = xi_corr(df)
     breakpoint()
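As a quick sanity check of the rewritten correlation functions, a monotonic relationship should give a coefficient close to 1 (a small sketch with synthetic data):

import numpy as np
import pandas as pd

from likelihood.tools.numeric_tools import xi_corr, xicor

x = np.random.rand(200)
y = x**2                                  # monotonic in x, so xi should approach 1
print(xicor(x, y, ties=False))

df = pd.DataFrame({"x": x, "y": y, "noise": np.random.rand(200)})
print(xi_corr(df))                        # square matrix of pairwise coefficients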
likelihood/tools/tools.py
CHANGED
@@ -640,14 +640,14 @@ def cal_average(y: ndarray, alpha: float = 1):
 class DataScaler:
     """numpy array `scaler` and `rescaler`"""
 
-    __slots__ = ["dataset_", "_n", "data_scaled", "values", "transpose"]
+    __slots__ = ["dataset_", "_n", "data_scaled", "values", "transpose", "inv_fitting"]
 
     def __init__(self, dataset: ndarray, n: int = 1) -> None:
         """Initializes the parameters required for scaling the data"""
         self.dataset_ = dataset.copy()
         self._n = n
 
-    def rescale(self) -> ndarray:
+    def rescale(self, dataset_: ndarray | None = None) -> ndarray:
         """Perform a standard rescaling of the data
 
         Returns
@@ -655,11 +655,26 @@ class DataScaler:
         data_scaled : `np.array`
             An array containing the scaled data.
         """
+        if isinstance(dataset_, ndarray):
+            data_scaled = np.copy(dataset_)
+            mu = self.values[0]
+            sigma = self.values[1]
+            f = self.values[2]
+            data_scaled = data_scaled.reshape((self.dataset_.shape[0], -1))
+            for i in range(self.dataset_.shape[0]):
+                if self._n != None:
+                    poly = f[i](self.inv_fitting[i](data_scaled[i]))
+                    data_scaled[i] += -poly
+                data_scaled[i] = 2 * ((data_scaled[i] - mu[i]) / sigma[i]) - 1
+            return data_scaled
+        else:
+            self.data_scaled = np.copy(self.dataset_.copy())
 
         mu = []
         sigma = []
         fitting = []
-        self.
+        self.inv_fitting = []
+
         try:
             xaxis = range(self.dataset_.shape[1])
         except:
@@ -675,12 +690,15 @@ class DataScaler:
         for i in range(self.dataset_.shape[0]):
             if self._n != None:
                 fit = np.polyfit(xaxis, self.dataset_[i, :], self._n)
+                inv_fit = np.polyfit(self.dataset_[i, :], xaxis, self._n)
                 f = np.poly1d(fit)
                 poly = f(xaxis)
                 fitting.append(f)
+                self.inv_fitting.append(inv_fit)
                 self.data_scaled[i, :] += -poly
             else:
                 fitting.append(0.0)
+                self.inv_fitting.append(0.0)
             mu.append(np.min(self.data_scaled[i, :]))
             if np.max(self.data_scaled[i, :]) != 0:
                 sigma.append(np.max(self.data_scaled[i, :]) - mu[i])
@@ -1064,7 +1082,7 @@ class FeatureSelection:
         self.all_features_imp_graph: List[Tuple] = []
         self.w_dict = dict()
 
-    def get_digraph(self, dataset: DataFrame, n_importances: int) -> str:
+    def get_digraph(self, dataset: DataFrame, n_importances: int, use_scaler: bool = False) -> str:
         """
         Get directed graph showing importance of features.
 
@@ -1092,10 +1110,11 @@ class FeatureSelection:
            feature_string += column + "; "
 
            numeric_df = curr_dataset.select_dtypes(include="number")
-
-
-
-
+            if use_scaler:
+                self.scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
+                numeric_scaled = self.scaler.rescale()
+                numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
+                curr_dataset[numeric_df.columns] = numeric_df
 
            # We construct dictionary to save index for scaling
            numeric_dict = dict(zip(list(numeric_df.columns), range(len(list(numeric_df.columns)))))
@@ -1119,7 +1138,6 @@ class FeatureSelection:
            dfe = DataFrameEncoder(X_aux)
            encoded_df = dfe.encode(save_mode=False)
            # We train
-
            Model.fit(encoded_df.to_numpy().T, Y.to_numpy().T)
            # We obtain importance
            importance = Model.get_importances()
@@ -1202,7 +1220,7 @@ class FeatureSelection:
 
 
 def check_nan_inf(df: DataFrame) -> DataFrame:
-    """
+    """Checks for `NaN` and `Inf` values in the `DataFrame`. If any are found they will be removed."""
     nan_values = df.isnull().values.any()
     count = np.isinf(df.select_dtypes(include="number")).values.sum()
     print("There are null values : ", nan_values)
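The extended `DataScaler.rescale` above now has two modes: called with no argument it fits and scales the stored dataset, and called with `dataset_=` it reuses the stored statistics (and the new inverse fits) to scale unseen samples. A small sketch, assuming the scaler is built with `n=None` as `FeatureSelection.get_digraph` and `SimulationEngine` do:

import numpy as np

from likelihood.tools import DataScaler

train = np.random.rand(3, 50)               # 3 features x 50 samples
scaler = DataScaler(train.copy(), n=None)   # n=None skips the polynomial detrending
train_scaled = scaler.rescale()             # fit and scale the stored data

new = np.random.rand(3, 10)                 # new samples with the same 3 features
new_scaled = scaler.rescale(dataset_=new)   # reuse the stored mu/sigma on new data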
{likelihood-1.2.16.dist-info → likelihood-1.2.18.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: likelihood
-Version: 1.2.16
+Version: 1.2.18
 Summary: A package that performs the maximum likelihood algorithm.
 Home-page: https://github.com/jzsmoreno/likelihood/
 Author: J. A. Moreno-Guerra
@@ -28,8 +28,9 @@ Requires-Dist: corner
 Provides-Extra: full
 Requires-Dist: networkx ; extra == 'full'
 Requires-Dist: pyvis ; extra == 'full'
-Requires-Dist: tensorflow ; extra == 'full'
+Requires-Dist: tensorflow ==2.15.0 ; extra == 'full'
 Requires-Dist: keras-tuner ; extra == 'full'
+Requires-Dist: scikit-learn ; extra == 'full'
 
 
 
{likelihood-1.2.16.dist-info → likelihood-1.2.18.dist-info}/RECORD
CHANGED
@@ -2,17 +2,18 @@ likelihood/__init__.py,sha256=5C0hapdsk85XZhN_rssRAEFpkRRuKNtj6cyRbqD2_gM,994
 likelihood/main.py,sha256=prqT9egu3B2rcbsVMqYxuosNbe7NhDBCmmZtQ21aSlQ,8591
 likelihood/graph/__init__.py,sha256=6TuFDfmXTwpLyHl7_KqBfdzW6zqHjGzIFvymjFPlvjI,21
 likelihood/graph/graph.py,sha256=wKJqgxXiSbnvzyW3SjhQVrqp00yKMHf3ph6CIDNVhNM,2891
+likelihood/graph/nn.py,sha256=XqTnAHzXP0jSdLd0IOFjVZUZTcQU-XppsZLmJrG2GMo,12372
 likelihood/models/__init__.py,sha256=e6nB4w47w0Q9DrAFeP3OcUgcoHOtf7Il4mBhgf4AARg,52
 likelihood/models/regression.py,sha256=9cakyGlJCEO6WfpoKLh3GxdXQeQp7cUvJIkQ5odT0TA,9404
-likelihood/models/simulation.py,sha256=
+likelihood/models/simulation.py,sha256=mdgQPg_LEY5svPaF4TFv-DoQRE2oP2ig_uXnwINtewM,4039
 likelihood/models/utils.py,sha256=VtEj07lV-GRoWraQgpfjU0jTt1Ntf9MXgYwe6XYQh20,1552
 likelihood/models/deep/__init__.py,sha256=-KIPippVaMqgG8mEgYjNxYQdqOUcFhUuKhbVe8TTCfo,28
-likelihood/models/deep/autoencoders.py,sha256=
+likelihood/models/deep/autoencoders.py,sha256=lUvFQ7lbjvIPR_IKFnK5VCrSa419P5dOaTL3qSHntJk,9623
 likelihood/tools/__init__.py,sha256=MCjsCWfBNKE2uMN0VizDN1uFzZ_md0X2WZeBdWhrCR8,50
-likelihood/tools/numeric_tools.py,sha256=
-likelihood/tools/tools.py,sha256=
-likelihood-1.2.
-likelihood-1.2.
-likelihood-1.2.
-likelihood-1.2.
-likelihood-1.2.
+likelihood/tools/numeric_tools.py,sha256=cPTPgdww2ofxfyhJDomqvtXDgsSDs9iRQ7GHLt5Vl6M,8457
+likelihood/tools/tools.py,sha256=O39aPxTNsaBVSJFIkNsUESNSkfG4C7GG77wcR51a8IQ,42543
+likelihood-1.2.18.dist-info/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
+likelihood-1.2.18.dist-info/METADATA,sha256=8nAjAwwqCDw8K9IBzKG2cgBU5DOLAA-N-RIlr02eyjU,2518
+likelihood-1.2.18.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+likelihood-1.2.18.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
+likelihood-1.2.18.dist-info/RECORD,,

{likelihood-1.2.16.dist-info → likelihood-1.2.18.dist-info}/LICENSE
File without changes

{likelihood-1.2.16.dist-info → likelihood-1.2.18.dist-info}/top_level.txt
File without changes