likelihood-1.4.0-py3-none-any.whl → likelihood-1.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- likelihood/graph/nn.py +72 -113
- likelihood/models/deep/autoencoders.py +352 -116
- likelihood/tools/__init__.py +1 -0
- likelihood/tools/figures.py +348 -0
- likelihood/tools/models_tools.py +253 -0
- likelihood/tools/tools.py +26 -84
- {likelihood-1.4.0.dist-info → likelihood-1.5.0.dist-info}/METADATA +1 -1
- {likelihood-1.4.0.dist-info → likelihood-1.5.0.dist-info}/RECORD +11 -9
- {likelihood-1.4.0.dist-info → likelihood-1.5.0.dist-info}/WHEEL +1 -1
- {likelihood-1.4.0.dist-info → likelihood-1.5.0.dist-info}/LICENSE +0 -0
- {likelihood-1.4.0.dist-info → likelihood-1.5.0.dist-info}/top_level.txt +0 -0
likelihood/graph/nn.py
CHANGED
```diff
@@ -5,7 +5,7 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
 logging.getLogger("tensorflow").setLevel(logging.ERROR)
 
 import warnings
-from typing import List, Tuple
+from typing import Any, List, Tuple
 
 import numpy as np
 import pandas as pd
```
```diff
@@ -15,48 +15,43 @@ from pandas.core.frame import DataFrame
 from sklearn.metrics import f1_score
 from sklearn.model_selection import train_test_split
 
-from likelihood.tools import generate_feature_yaml
-
 tf.get_logger().setLevel("ERROR")
 
+from likelihood.tools import LoRALayer
 
-def compare_similarity(arr1: np.ndarray, arr2: np.ndarray) -> int:
-    """Compares the similarity between two arrays of categories.
 
-    Parameters
-    ----------
-    arr1 : `ndarray`
-        The first array of categories.
-    arr2 : `ndarray`
-        The second array of categories.
+def compare_similarity(arr1: List[Any], arr2: List[Any], threshold: float = 0.05) -> int:
+    """Calculate the similarity between two arrays considering numeric values near to 1 in ratio."""
 
-    Returns
-    -------
-    count : `int`
-        The number of elements that are the same in both arrays.
-    """
+    def is_similar(a: Any, b: Any) -> bool:
+        if isinstance(a, (int, float)) and isinstance(b, (int, float)):
+            if a == 0 and b == 0:
+                return True
+            if a == 0 or b == 0:
+                return False
+            # For numeric values, check if their ratio is within the threshold range
+            ratio = max(a, b) / min(a, b)
+            return 1 - threshold <= ratio <= 1 + threshold
+        else:
+            return a == b
 
-    count = 0
-    for i in range(len(arr1)):
-        if arr1[i] == arr2[i]:
-            count += 1
-    return count
+    return sum(is_similar(a, b) for a, b in zip(arr1, arr2))
 
 
 def cal_adjacency_matrix(
     df: DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
 ) -> Tuple[dict, np.ndarray]:
     """Calculates the adjacency matrix for a given DataFrame.
-    The adjacency matrix is a matrix that represents the similarity between each pair of
+    The adjacency matrix is a matrix that represents the similarity between each pair of features.
     The similarity is calculated using the `compare_similarity` function.
-    The resulting matrix is a square matrix with the same number of rows and columns as the input DataFrame.
+    The resulting matrix is a square matrix with the same number of rows and columns as the rows of the input DataFrame.
 
     Parameters
     ----------
     df : `DataFrame`
-        The input DataFrame containing the
+        The input DataFrame containing the features.
     exclude_subset : `List[str]`, optional
-        A list of
+        A list of features to exclude from the calculation of the adjacency matrix.
     sparse : `bool`, optional
         Whether to return a sparse matrix or a dense matrix.
     **kwargs : `dict`
```
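The rewritten `compare_similarity` swaps strict element equality for a ratio test on numbers: two numeric values count as similar when `max/min` stays within `1 ± threshold`, zeros only match each other, and everything else still falls back to `==`. A quick standalone check of that behavior; the function body below is copied from the new version shown above:

```python
from typing import Any, List


def compare_similarity(arr1: List[Any], arr2: List[Any], threshold: float = 0.05) -> int:
    """Count positions at which the two arrays hold similar values."""

    def is_similar(a: Any, b: Any) -> bool:
        if isinstance(a, (int, float)) and isinstance(b, (int, float)):
            if a == 0 and b == 0:
                return True
            if a == 0 or b == 0:
                return False
            # Numeric values match when their ratio lies within 1 +/- threshold
            ratio = max(a, b) / min(a, b)
            return 1 - threshold <= ratio <= 1 + threshold
        else:
            return a == b

    return sum(is_similar(a, b) for a, b in zip(arr1, arr2))


# 103/100 = 1.03 is within 5%, "a" == "a", but 1.2/1.0 = 1.2 is not
print(compare_similarity([100, "a", 1.0], [103, "a", 1.2]))  # 2
```

Because the count runs over `zip(arr1, arr2)`, trailing elements of a longer array are silently ignored, and mixed pairs (number vs. string) simply compare unequal.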
```diff
@@ -65,49 +60,37 @@ def cal_adjacency_matrix(
     Keyword Arguments:
     ----------
     similarity: `int`
-        The minimum number of
+        The minimum number of features that must be the same in both arrays to be considered similar.
+    threshold : `float`
+        The threshold value used in the `compare_similarity` function. Default is 0.05.
 
     Returns
     -------
     adj_dict : `dict`
-        A dictionary containing the
+        A dictionary containing the features.
     adjacency_matrix : `ndarray`
         The adjacency matrix.
     """
 
-    yaml_ = generate_feature_yaml(df)
-    categorical_columns = yaml_["categorical_features"]
     if len(exclude_subset) > 0:
-
-
-        if len(categorical_columns) > 1:
-            df_categorical = df[categorical_columns].copy()
+        columns = [col for col in df.columns if col not in exclude_subset]
+        df_ = df[columns].copy()
     else:
-        categorical_columns = [
-            col
-            for col in df.columns
-            if (
-                col not in exclude_subset
-                and pd.api.types.is_integer_dtype(df[col])
-                and len(df[col].unique()) > 2
-            )
-        ]
-        df_categorical = df[categorical_columns].copy()
+        df_ = df.copy()
 
-    assert len(df_categorical) > 0
+    assert len(df_) > 0
 
-    similarity = kwargs.get("similarity", len(df_categorical.columns) - 1)
-
+    similarity = kwargs.get("similarity", len(df_.columns) - 1)
+    threshold = kwargs.get("threshold", 0.05)
+    assert similarity <= df_.shape[1]
 
-    adj_dict = {}
-    for index, row in df_categorical.iterrows():
-        adj_dict[index] = row.to_list()
+    adj_dict = {index: row.tolist() for index, row in df_.iterrows()}
 
-    adjacency_matrix = np.zeros((len(df_categorical), len(df_categorical)))
+    adjacency_matrix = np.zeros((len(df_), len(df_)))
 
-    for i in range(len(df_categorical)):
-        for j in range(len(df_categorical)):
-            if compare_similarity(adj_dict[i], adj_dict[j]) >= similarity:
+    for i in range(len(df_)):
+        for j in range(len(df_)):
+            if compare_similarity(adj_dict[i], adj_dict[j], threshold=threshold) >= similarity:
                 adjacency_matrix[i][j] = 1
 
     if sparse:
```
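With the `generate_feature_yaml` preprocessing gone, `cal_adjacency_matrix` now compares whole rows across every non-excluded column instead of a detected categorical subset. A hedged usage sketch (the toy DataFrame is illustrative, and the import path simply mirrors this file's location in the wheel):

```python
import pandas as pd

from likelihood.graph.nn import cal_adjacency_matrix

df = pd.DataFrame(
    {
        "age": [30, 31, 60],
        "income": [100.0, 103.0, 250.0],
        "city": ["NY", "NY", "LA"],
    }
)

# With three columns the default similarity is len(columns) - 1 = 2.
# Rows 0 and 1 agree on all three features under the 5% ratio rule,
# while row 2 only matches itself.
adj_dict, adjacency = cal_adjacency_matrix(df, sparse=False, threshold=0.05)
print(adjacency)
```

The new `assert similarity <= df_.shape[1]` guards against requesting more matching features than the frame has columns.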
```diff
@@ -131,8 +114,13 @@ class Data:
         df: DataFrame,
         target: str | None = None,
         exclude_subset: List[str] = [],
+        **kwargs,
     ):
-
+        sparse = kwargs.get("sparse", True)
+        threshold = kwargs.get("threshold", 0.05)
+        _, adjacency = cal_adjacency_matrix(
+            df, exclude_subset=exclude_subset, sparse=sparse, threshold=threshold
+        )
         if target is not None:
             X = df.drop(columns=[target] + exclude_subset)
         else:
```
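`Data.__init__` now accepts `**kwargs` and forwards `sparse` and `threshold` to `cal_adjacency_matrix`, so graph construction is tunable at the point where the dataset wrapper is built. A brief hedged sketch (`df` and its column names are illustrative):

```python
# Loosen the numeric-ratio match to 10% when wiring up the adjacency matrix;
# "id" is excluded from both the features and the similarity comparison.
data = Data(df, target="species", exclude_subset=["id"], threshold=0.10)
print(data.x.shape, data.adjacency.shape)
```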
```diff
@@ -147,16 +135,20 @@ class Data:
 
 @tf.keras.utils.register_keras_serializable(package="Custom", name="VanillaGNNLayer")
 class VanillaGNNLayer(tf.keras.layers.Layer):
-    def __init__(self, dim_in, dim_out, kernel_initializer="glorot_uniform", **kwargs):
+    def __init__(self, dim_in, dim_out, rank=None, kernel_initializer="glorot_uniform", **kwargs):
         super(VanillaGNNLayer, self).__init__(**kwargs)
         self.dim_out = dim_out
+        self.rank = rank
         self.kernel_initializer = kernel_initializer
         self.linear = None
 
     def build(self, input_shape):
-        self.linear = tf.keras.layers.Dense(
-            self.dim_out, use_bias=False, kernel_initializer=self.kernel_initializer
-        )
+        if self.rank:
+            self.linear = LoRALayer(self.dim_out, rank=self.rank)
+        else:
+            self.linear = tf.keras.layers.Dense(
+                self.dim_out, use_bias=False, kernel_initializer=self.kernel_initializer
+            )
         super(VanillaGNNLayer, self).build(input_shape)
 
     def call(self, x, adjacency):
```
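`VanillaGNNLayer.build` now chooses between a `LoRALayer` (imported from `likelihood.tools`, new in this release) and a bias-free `Dense` kernel. The `LoRALayer` implementation is not part of this diff; as a mental model only, a LoRA-style layer factors the `dim_in × dim_out` kernel into two matrices of rank `r`, cutting trainable parameters from `dim_in · dim_out` to `r · (dim_in + dim_out)`. A generic illustrative sketch of the technique, not the package's actual `LoRALayer`:

```python
import tensorflow as tf


class LowRankDense(tf.keras.layers.Layer):
    """Illustrative only: a dense layer whose kernel is factored as A @ B."""

    def __init__(self, units, rank=2, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.rank = rank

    def build(self, input_shape):
        dim_in = int(input_shape[-1])
        # Two small factors stand in for one full dim_in x units kernel
        self.a = self.add_weight(name="a", shape=(dim_in, self.rank), initializer="glorot_uniform")
        self.b = self.add_weight(name="b", shape=(self.rank, self.units), initializer="glorot_uniform")
        super().build(input_shape)

    def call(self, inputs):
        return tf.matmul(tf.matmul(inputs, self.a), self.b)
```

With `dim_in = dim_h = 8` and `rank = 2`, that is 32 trainable weights instead of 64, which is the point of threading `rank` through the hidden layers below.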
```diff
@@ -169,8 +161,11 @@ class VanillaGNNLayer(tf.keras.layers.Layer):
         config.update(
             {
                 "dim_out": self.dim_out,
-                "kernel_initializer": tf.keras.initializers.serialize(
-                    self.linear.kernel_initializer
+                "rank": self.rank,
+                "kernel_initializer": (
+                    None
+                    if self.rank
+                    else tf.keras.initializers.serialize(self.linear.kernel_initializer)
                 ),
             }
         )
```
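Note the serialization choice: when a LoRA kernel is active, `kernel_initializer` is stored as `None` rather than probing `self.linear` for an attribute a `LoRALayer` may not expose. Assuming the classes above are importable, the config can be inspected directly:

```python
layer = VanillaGNNLayer(4, 8, rank=2)
config = layer.get_config()
print(config["rank"])                # 2
print(config["kernel_initializer"])  # None, because the LoRA branch skips it
```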
```diff
@@ -179,14 +174,16 @@ class VanillaGNNLayer(tf.keras.layers.Layer):
 
 @tf.keras.utils.register_keras_serializable(package="Custom", name="VanillaGNN")
 class VanillaGNN(tf.keras.Model):
-    def __init__(self, dim_in, dim_h, dim_out, **kwargs):
+    def __init__(self, dim_in, dim_h, dim_out, rank=2, **kwargs):
         super(VanillaGNN, self).__init__(**kwargs)
         self.dim_in = dim_in
         self.dim_h = dim_h
         self.dim_out = dim_out
-        self.
-
-        self.
+        self.rank = rank
+
+        self.gnn1 = VanillaGNNLayer(self.dim_in, self.dim_h, self.rank)
+        self.gnn2 = VanillaGNNLayer(self.dim_h, self.dim_h, self.rank)
+        self.gnn3 = VanillaGNNLayer(self.dim_h, self.dim_out, None)
 
     def call(self, x, adjacency):
         h = self.gnn1(x, adjacency)
```
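The stack now applies low-rank kernels to the two hidden transforms while the output projection stays full-rank (`rank=None` for `gnn3`). A hedged construction-and-training sketch, reusing the `data` wrapper from the `Data` example above (dimensions are illustrative):

```python
model = VanillaGNN(dim_in=data.x.shape[1], dim_h=8, dim_out=3, rank=2)
train_losses, train_f1, val_losses, val_f1 = model.fit(
    data, epochs=50, batch_size=32, test_size=0.2
)
print("Held-out F1:", model.test(data))  # rounded to 4 decimals as of 1.5.0
```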
```diff
@@ -208,13 +205,13 @@ class VanillaGNN(tf.keras.Model):
         out = self(x, adjacency)
         loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=out)
         loss = tf.reduce_mean(loss)
-        f1 = self.compute_f1_score(out, y)
+        f1 = round(self.compute_f1_score(out, y), 4)
         return loss.numpy(), f1
 
     def test(self, data):
         out = self(data.x, data.adjacency)
         test_f1 = self.compute_f1_score(out, data.y)
-        return test_f1
+        return round(test_f1, 4)
 
     def predict(self, data):
         out = self(data.x, data.adjacency)
```
```diff
@@ -225,6 +222,7 @@ class VanillaGNN(tf.keras.Model):
             "dim_in": self.dim_in,
             "dim_h": self.dim_h,
             "dim_out": self.dim_out,
+            "rank": self.rank,
         }
         base_config = super(VanillaGNN, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
```
```diff
@@ -235,6 +233,7 @@ class VanillaGNN(tf.keras.Model):
             dim_in=config["dim_in"],
             dim_h=config["dim_h"],
             dim_out=config["dim_out"],
+            rank=config["rank"],
         )
 
     @tf.function
```
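Because `rank` now round-trips through `get_config`/`from_config`, a saved model should restore with its LoRA configuration intact. A sketch in the spirit of the demo that 1.4.0 shipped under `__main__` (the path is illustrative):

```python
model.save("./best_model", save_format="tf")
restored = tf.keras.models.load_model("./best_model")
assert restored.get_config()["rank"] == model.get_config()["rank"]
```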
```diff
@@ -248,10 +247,6 @@ class VanillaGNN(tf.keras.Model):
         return loss
 
     def fit(self, data, epochs, batch_size, test_size=0.2, optimizer="adam"):
-        warnings.warn(
-            "It is normal for validation metrics to underperform. Use the test method to validate after training.",
-            UserWarning,
-        )
         optimizers = {
             "sgd": tf.keras.optimizers.SGD(),
             "adam": tf.keras.optimizers.Adam(),
```
```diff
@@ -290,56 +285,20 @@ class VanillaGNN(tf.keras.Model):
             train_f1_scores.append(train_f1)
 
             if epoch % 5 == 0:
+                clear_output(wait=True)
+                warnings.warn(
+                    "It is normal for validation metrics to underperform during training. Use the test method to validate after training.",
+                    UserWarning,
+                )
                 val_loss, val_f1 = self.evaluate(X_test, adjacency_test, y_test)
                 val_losses.append(val_loss)
                 val_f1_scores.append(val_f1)
-                clear_output(wait=True)
                 print(
-                    f"Epoch {epoch:>3} | Train Loss: {train_loss:.
+                    f"Epoch {epoch:>3} | Train Loss: {train_loss:.4f} | Train F1: {train_f1:.4f} | Val Loss: {val_loss:.4f} | Val F1: {val_f1:.4f}"
                 )
 
         return train_losses, train_f1_scores, val_losses, val_f1_scores
 
 
 if __name__ == "__main__":
-
-    import pandas as pd
-    from sklearn.datasets import load_iris
-
-    # Load the dataset
-    iris = load_iris()
-
-    # Convert to a DataFrame for easy exploration
-    iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
-    iris_df["species"] = iris.target
-
-    iris_df["sepal length (cm)"] = iris_df["sepal length (cm)"].astype("category")
-    iris_df["sepal width (cm)"] = iris_df["sepal width (cm)"].astype("category")
-    iris_df["petal length (cm)"] = iris_df["petal length (cm)"].astype("category")
-    iris_df["petal width (cm)"] = iris_df["petal width (cm)"].astype("category")
-
-    # Display the first few rows of the dataset
-    print(iris_df.head())
-
-    iris_df = iris_df.sample(frac=1, replace=False).reset_index(drop=True)
-
-    data = Data(iris_df, "species")
-
-    model = VanillaGNN(dim_in=data.x.shape[1], dim_h=8, dim_out=len(iris_df["species"].unique()))
-    print("Before training F1:", model.test(data))
-    model.fit(data, epochs=200, batch_size=32, test_size=0.5)
-    model.save("./best_model", save_format="tf")
-    print("After training F1:", model.test(data))
-    best_model = tf.keras.models.load_model("./best_model")
-
-    print("After loading F1:", best_model.test(data))
-    df_results = pd.DataFrame()
-
-    # Suppose we have a new dataset without the target variable
-    iris_df = iris_df.drop(columns=["species"])
-    data_new = Data(iris_df)
-    print("Predictions:", best_model.predict(data_new))
-    df_results["predicted"] = list(model.predict(data))
-    df_results["actual"] = list(data.y)
-    # df_results.to_csv("results.csv", index=False)
-    breakpoint()
+    print("Examples will be running below")
```