likelihood 1.2.16.tar.gz → 1.2.17.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {likelihood-1.2.16 → likelihood-1.2.17}/PKG-INFO +2 -1
  2. likelihood-1.2.17/likelihood/graph/nn.py +344 -0
  3. {likelihood-1.2.16 → likelihood-1.2.17}/likelihood/models/deep/autoencoders.py +11 -3
  4. {likelihood-1.2.16 → likelihood-1.2.17}/likelihood.egg-info/PKG-INFO +2 -1
  5. {likelihood-1.2.16 → likelihood-1.2.17}/likelihood.egg-info/SOURCES.txt +1 -0
  6. {likelihood-1.2.16 → likelihood-1.2.17}/likelihood.egg-info/requires.txt +1 -0
  7. {likelihood-1.2.16 → likelihood-1.2.17}/setup.py +1 -1
  8. {likelihood-1.2.16 → likelihood-1.2.17}/LICENSE +0 -0
  9. {likelihood-1.2.16 → likelihood-1.2.17}/README.md +0 -0
  10. {likelihood-1.2.16 → likelihood-1.2.17}/likelihood/__init__.py +0 -0
  11. {likelihood-1.2.16 → likelihood-1.2.17}/likelihood/graph/__init__.py +0 -0
  12. {likelihood-1.2.16 → likelihood-1.2.17}/likelihood/graph/graph.py +0 -0
  13. {likelihood-1.2.16 → likelihood-1.2.17}/likelihood/main.py +0 -0
  14. {likelihood-1.2.16 → likelihood-1.2.17}/likelihood/models/__init__.py +0 -0
  15. {likelihood-1.2.16 → likelihood-1.2.17}/likelihood/models/deep/__init__.py +0 -0
  16. {likelihood-1.2.16 → likelihood-1.2.17}/likelihood/models/regression.py +0 -0
  17. {likelihood-1.2.16 → likelihood-1.2.17}/likelihood/models/simulation.py +0 -0
  18. {likelihood-1.2.16 → likelihood-1.2.17}/likelihood/models/utils.py +0 -0
  19. {likelihood-1.2.16 → likelihood-1.2.17}/likelihood/tools/__init__.py +0 -0
  20. {likelihood-1.2.16 → likelihood-1.2.17}/likelihood/tools/numeric_tools.py +0 -0
  21. {likelihood-1.2.16 → likelihood-1.2.17}/likelihood/tools/tools.py +0 -0
  22. {likelihood-1.2.16 → likelihood-1.2.17}/likelihood.egg-info/dependency_links.txt +0 -0
  23. {likelihood-1.2.16 → likelihood-1.2.17}/likelihood.egg-info/top_level.txt +0 -0
  24. {likelihood-1.2.16 → likelihood-1.2.17}/setup.cfg +0 -0
{likelihood-1.2.16 → likelihood-1.2.17}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: likelihood
- Version: 1.2.16
+ Version: 1.2.17
  Summary: A package that performs the maximum likelihood algorithm.
  Home-page: https://github.com/jzsmoreno/likelihood/
  Author: J. A. Moreno-Guerra
@@ -30,6 +30,7 @@ Requires-Dist: networkx; extra == "full"
  Requires-Dist: pyvis; extra == "full"
  Requires-Dist: tensorflow; extra == "full"
  Requires-Dist: keras-tuner; extra == "full"
+ Requires-Dist: scikit-learn; extra == "full"

  ![likelihood](https://raw.githubusercontent.com/RodolfoFerro/likelihood/main/likelihood.png)

likelihood-1.2.17/likelihood/graph/nn.py
@@ -0,0 +1,344 @@
+ import warnings
+ from typing import List, Tuple
+
+ import numpy as np
+ import pandas as pd
+ import tensorflow as tf
+ from numpy import ndarray
+ from pandas.core.frame import DataFrame
+ from sklearn.metrics import f1_score
+ from sklearn.model_selection import train_test_split
+
+ from likelihood.tools import generate_feature_yaml
+
+
+ def compare_similarity(arr1: ndarray, arr2: ndarray) -> int:
+     """Compares the similarity between two arrays of categories.
+
+     Parameters
+     ----------
+     arr1 : `ndarray`
+         The first array of categories.
+     arr2 : `ndarray`
+         The second array of categories.
+
+     Returns
+     -------
+     count: `int`
+         The number of categories that are the same in both arrays.
+     """
+
+     count = 0
+     for i in range(len(arr1)):
+         if arr1[i] == arr2[i]:
+             count += 1
+     return count
+
+
+ def cal_adjency_matrix(
+     df: DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
+ ) -> Tuple[dict, ndarray]:
+     """Calculates the adjacency matrix for a given DataFrame.
+     The adjacency matrix is a matrix that represents the similarity between each pair of categories.
+     The similarity is calculated using the `compare_similarity` function.
+     The resulting matrix is a square matrix with the same number of rows and columns as the input DataFrame.
+
+     Parameters
+     ----------
+     df : `DataFrame`
+         The input DataFrame containing the categories.
+     exclude_subset : `List[str]`, optional
+         A list of categories to exclude from the calculation of the adjacency matrix.
+     sparse : `bool`, optional
+         Whether to return a sparse matrix or a dense matrix.
+     **kwargs : `dict`
+         Additional keyword arguments to pass to the `compare_similarity` function.
+
+     Keyword Arguments:
+     ----------
+     similarity: `int`
+         The minimum number of categories that must be the same in both arrays to be considered similar.
+
+     Returns
+     -------
+     adj_dict : `dict`
+         A dictionary containing the categories.
+     adjacency_matrix : `ndarray`
+         The adjacency matrix.
+     """
+
+     yaml_ = generate_feature_yaml(df)
+     categorical_columns = yaml_["categorical_features"]
+     if len(exclude_subset) > 0:
+         categorical_columns = [col for col in categorical_columns if col not in exclude_subset]
+
+     if len(categorical_columns) > 1:
+         df_categorical = df[categorical_columns].copy()
+     else:
+         categorical_columns = [
+             col
+             for col in df.columns
+             if (
+                 col not in exclude_subset
+                 and pd.api.types.is_integer_dtype(df[col])
+                 and len(df[col].unique()) > 2
+             )
+         ]
+         df_categorical = df[categorical_columns].copy()
+
+     assert len(df_categorical) > 0
+
+     similarity = kwargs["similarity"] if "similarity" in kwargs else len(df_categorical.columns) - 1
+     assert similarity <= df_categorical.shape[1]
+
+     adj_dict = {}
+     for index, row in df_categorical.iterrows():
+         adj_dict[index] = row.to_list()
+
+     adjacency_matrix = np.zeros((len(df_categorical), len(df_categorical)))
+
+     for i in range(len(df_categorical)):
+         for j in range(len(df_categorical)):
+             if compare_similarity(adj_dict[i], adj_dict[j]) >= similarity:
+                 adjacency_matrix[i][j] = 1
+
+     if sparse:
+         num_nodes = adjacency_matrix.shape[0]
+
+         indices = np.argwhere(adjacency_matrix != 0.0)
+         indices = tf.constant(indices, dtype=tf.int64)
+         values = tf.constant(adjacency_matrix[indices[:, 0], indices[:, 1]], dtype=tf.float32)
+         adjacency_matrix = tf.sparse.SparseTensor(
+             indices=indices, values=values, dense_shape=(num_nodes, num_nodes)
+         )
+
+         return adj_dict, adjacency_matrix
+     else:
+         return adj_dict, adjacency_matrix
+
+
+ class Data:
+     def __init__(
+         self,
+         df: DataFrame,
+         target: str | None = None,
+         exclude_subset: List[str] = [],
+     ):
+         _, adjacency = cal_adjency_matrix(df, exclude_subset=exclude_subset, sparse=True)
+         if target is not None:
+             X = df.drop(columns=[target] + exclude_subset)
+         else:
+             X = df.drop(columns=exclude_subset)
+         self.columns = X.columns
+         X = X.to_numpy()
+         self.x = np.asarray(X).astype(np.float32)
+         self.adjacency = adjacency
+         if target is not None:
+             self.y = np.asarray(df[target].values).astype(np.int32)
+
+
+ @tf.keras.utils.register_keras_serializable(package="Custom", name="VanillaGNNLayer")
+ class VanillaGNNLayer(tf.keras.layers.Layer):
+     def __init__(self, dim_in, dim_out, kernel_initializer="glorot_uniform", **kwargs):
+         super(VanillaGNNLayer, self).__init__(**kwargs)
+         self.dim_out = dim_out
+         self.kernel_initializer = kernel_initializer
+         self.linear = None
+
+     def build(self, input_shape):
+         self.linear = tf.keras.layers.Dense(
+             self.dim_out, use_bias=False, kernel_initializer=self.kernel_initializer
+         )
+         super(VanillaGNNLayer, self).build(input_shape)
+
+     def call(self, x, adjacency):
+         x = self.linear(x)
+         x = tf.sparse.sparse_dense_matmul(adjacency, x)
+         return x
+
+     def get_config(self):
+         config = super(VanillaGNNLayer, self).get_config()
+         config.update(
+             {
+                 "dim_out": self.dim_out,
+                 "kernel_initializer": tf.keras.initializers.serialize(
+                     self.linear.kernel_initializer
+                 ),
+             }
+         )
+         return config
+
+
+ @tf.keras.utils.register_keras_serializable(package="Custom", name="VanillaGNN")
+ class VanillaGNN(tf.keras.Model):
+     def __init__(self, dim_in, dim_h, dim_out, **kwargs):
+         super(VanillaGNN, self).__init__(**kwargs)
+         self.dim_in = dim_in
+         self.dim_h = dim_h
+         self.dim_out = dim_out
+         self.gnn1 = VanillaGNNLayer(self.dim_in, self.dim_h)
+         self.gnn2 = VanillaGNNLayer(self.dim_h, self.dim_h)
+         self.gnn3 = VanillaGNNLayer(self.dim_h, self.dim_out)
+
+     def build(self, input_shape):
+         super(VanillaGNN, self).build(input_shape)
+         dummy_input = tf.keras.Input(shape=input_shape[1:])
+         dummy_adjacency = tf.sparse.SparseTensor(
+             indices=[[0, 0]], values=[1.0], dense_shape=[input_shape[0], input_shape[0]]
+         )
+         _ = self(dummy_input, dummy_adjacency)
+
+     def call(self, x, adjacency):
+         h = self.gnn1(x, adjacency)
+         h = tf.nn.tanh(h)
+         h = self.gnn2(h, adjacency)
+         h = self.gnn3(h, adjacency)
+         return tf.nn.softmax(h, axis=1)
+
+     def f1_macro(self, y_true, y_pred):
+         return f1_score(y_true, y_pred, average="macro")
+
+     def compute_f1_score(self, logits, labels):
+         predictions = tf.argmax(logits, axis=1, output_type=tf.int32)
+         true_labels = tf.cast(labels, tf.int32)
+         return self.f1_macro(true_labels.numpy(), predictions.numpy())
+
+     def evaluate(self, x, adjacency, y):
+         y = tf.cast(y, tf.int32)
+         out = self(x, adjacency)
+         loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=out)
+         loss = tf.reduce_mean(loss)
+         f1 = self.compute_f1_score(out, y)
+         return loss.numpy(), f1
+
+     def test(self, data):
+         out = self(data.x, data.adjacency)
+         test_f1 = self.compute_f1_score(out, data.y)
+         return test_f1
+
+     def predict(self, data):
+         out = self(data.x, data.adjacency)
+         return tf.argmax(out, axis=1, output_type=tf.int32).numpy()
+
+     def get_config(self):
+         config = {
+             "dim_in": self.dim_in,
+             "dim_h": self.dim_h,
+             "dim_out": self.dim_out,
+         }
+         base_config = super(VanillaGNN, self).get_config()
+         return dict(list(base_config.items()) + list(config.items()))
+
+     @classmethod
+     def from_config(cls, config):
+         return cls(
+             dim_in=config["dim_in"],
+             dim_h=config["dim_h"],
+             dim_out=config["dim_out"],
+         )
+
+     @tf.function
+     def train_step(self, batch_x, batch_adjacency, batch_y, optimizer):
+         with tf.GradientTape() as tape:
+             out = self(batch_x, batch_adjacency)
+             loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=batch_y, logits=out)
+             loss = tf.reduce_mean(loss)
+         gradients = tape.gradient(loss, self.trainable_variables)
+         optimizer.apply_gradients(zip(gradients, self.trainable_variables))
+         return loss
+
+     def fit(self, data, epochs, batch_size, test_size=0.2, optimizer="adam"):
+         warnings.warn(
+             "It is normal for validation metrics to underperform. Use the test method to validate after training.",
+             UserWarning,
+         )
+         optimizers = {
+             "sgd": tf.keras.optimizers.SGD(),
+             "adam": tf.keras.optimizers.Adam(),
+             "adamw": tf.keras.optimizers.AdamW(),
+             "adadelta": tf.keras.optimizers.Adadelta(),
+             "rmsprop": tf.keras.optimizers.RMSprop(),
+         }
+         optimizer = optimizers[optimizer]
+         train_losses = []
+         train_f1_scores = []
+         val_losses = []
+         val_f1_scores = []
+
+         X_train, X_test, y_train, y_test = train_test_split(
+             data.x, data.y, test_size=test_size, shuffle=False
+         )
+         adjacency_train = tf.sparse.slice(data.adjacency, [0, 0], [len(X_train), len(X_train)])
+         adjacency_test = tf.sparse.slice(
+             data.adjacency, [len(X_train), 0], [len(X_test), len(X_test)]
+         )
+
+         batch_starts = np.arange(0, len(X_train), batch_size)
+         for epoch in range(epochs):
+             np.random.shuffle(batch_starts)
+             for start in batch_starts:
+                 end = start + batch_size
+                 batch_x = X_train[start:end, :]
+                 batch_adjacency = tf.sparse.slice(
+                     adjacency_train, [start, start], [batch_size, batch_size]
+                 )
+                 batch_y = y_train[start:end]
+                 train_loss = self.train_step(batch_x, batch_adjacency, batch_y, optimizer)
+
+             train_loss, train_f1 = self.evaluate(X_train, adjacency_train, y_train)
+             train_losses.append(train_loss)
+             train_f1_scores.append(train_f1)
+
+             if epoch % 2 == 0:
+                 val_loss, val_f1 = self.evaluate(X_test, adjacency_test, y_test)
+                 val_losses.append(val_loss)
+                 val_f1_scores.append(val_f1)
+                 print(
+                     f"Epoch {epoch:>3} | Train Loss: {train_loss:.3f} | Train F1: {train_f1:.3f} | Val Loss: {val_loss:.3f} | Val F1: {val_f1:.3f}"
+                 )
+
+         return train_losses, train_f1_scores, val_losses, val_f1_scores
+
+
+ if __name__ == "__main__":
+     # Example usage
+     import pandas as pd
+     from sklearn.datasets import load_iris
+
+     # Load the dataset
+     iris = load_iris()
+
+     # Convert to a DataFrame for easy exploration
+     iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
+     iris_df["species"] = iris.target
+
+     iris_df["sepal length (cm)"] = iris_df["sepal length (cm)"].astype("category")
+     iris_df["sepal width (cm)"] = iris_df["sepal width (cm)"].astype("category")
+     iris_df["petal length (cm)"] = iris_df["petal length (cm)"].astype("category")
+     iris_df["petal width (cm)"] = iris_df["petal width (cm)"].astype("category")
+
+     # Display the first few rows of the dataset
+     print(iris_df.head())
+
+     iris_df = iris_df.sample(frac=1, replace=False).reset_index(drop=True)
+
+     data = Data(iris_df, "species")
+
+     model = VanillaGNN(dim_in=data.x.shape[1], dim_h=8, dim_out=len(iris_df["species"].unique()))
+     print("Before training F1:", model.test(data))
+     model.fit(data, epochs=200, batch_size=32, test_size=0.5)
+     model.save("./best_model.keras")
+     print("After training F1:", model.test(data))
+     best_model = tf.keras.models.load_model("./best_model.keras")
+
+     print("After loading F1:", best_model.test(data))
+     df_results = pd.DataFrame()
+
+     # Suppose we have a new dataset without the target variable
+     iris_df = iris_df.drop(columns=["species"])
+     data_new = Data(iris_df)
+     print("Predictions:", best_model.predict(data_new))
+     df_results["predicted"] = list(model.predict(data))
+     df_results["actual"] = list(data.y)
+     # df_results.to_csv("results.csv", index=False)
+     breakpoint()
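
Note on the new module above: cal_adjency_matrix links two rows whenever at least `similarity` of their categorical values coincide, with the threshold defaulting to one less than the number of categorical columns. The following standalone sketch illustrates only that rule on made-up rows and a made-up threshold; it does not import or modify the package.

import numpy as np

# Toy illustration of the row-similarity rule used by cal_adjency_matrix:
# rows i and j are connected when at least `similarity` of their values match.
rows = {
    0: ["red", "small", "round"],
    1: ["red", "small", "square"],
    2: ["blue", "large", "square"],
}
similarity = 2  # nn.py defaults to (number of categorical columns - 1)

n = len(rows)
adjacency = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        matches = sum(a == b for a, b in zip(rows[i], rows[j]))
        if matches >= similarity:
            adjacency[i][j] = 1

print(adjacency)
# [[1. 1. 0.]
#  [1. 1. 0.]
#  [0. 0. 1.]]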

{likelihood-1.2.16 → likelihood-1.2.17}/likelihood/models/deep/autoencoders.py
@@ -175,7 +175,13 @@ def build_model(hp, input_shape: None | int, num_classes: None | int) -> AutoCla


  def setup_model(
-     data: DataFrame, target: str, epochs: int, train_size: float = 0.7, seed=None, **kwargs
+     data: DataFrame,
+     target: str,
+     epochs: int,
+     train_size: float = 0.7,
+     seed=None,
+     filepath: str = "./my_dir/best_model.keras",
+     **kwargs
  ) -> AutoClassifier:
      """Setup model for training and tuning.

@@ -191,6 +197,8 @@ def setup_model(
          The proportion of the dataset to use for training.
      seed : `Any` | `int`
          The random seed to use for reproducibility.
+     filepath : `str`
+         The path to save the best model to.

      Keyword Arguments:
      ----------
@@ -257,13 +265,13 @@ def setup_model(
      best_model = models[0]

      # save model
-     best_model.save("./my_dir/best_model.keras")
+     best_model.save(filepath)

      if verbose:
          tuner.results_summary()
      else:
          # Load the best model from the directory
-         best_model = tf.keras.models.load_model("./my_dir/best_model.keras")
+         best_model = tf.keras.models.load_model(filepath)

      return best_model

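Note on the signature change above: setup_model gains a filepath argument in 1.2.17, replacing the previously hard-coded "./my_dir/best_model.keras" save and load path. Below is a minimal, hypothetical call site; the import path is inferred from the file location in this diff, and the CSV file, target column name, and output path are invented for illustration. Only arguments visible in this diff are used.

import pandas as pd

from likelihood.models.deep.autoencoders import setup_model  # path inferred from this diff

# Hypothetical training data; any DataFrame containing the named target column would do.
df = pd.read_csv("training_data.csv")

best_model = setup_model(
    data=df,
    target="target",
    epochs=10,
    train_size=0.7,
    seed=42,
    filepath="./artifacts/best_model.keras",  # new in 1.2.17; defaults to ./my_dir/best_model.keras
)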

{likelihood-1.2.16 → likelihood-1.2.17}/likelihood.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: likelihood
- Version: 1.2.16
+ Version: 1.2.17
  Summary: A package that performs the maximum likelihood algorithm.
  Home-page: https://github.com/jzsmoreno/likelihood/
  Author: J. A. Moreno-Guerra
@@ -30,6 +30,7 @@ Requires-Dist: networkx; extra == "full"
  Requires-Dist: pyvis; extra == "full"
  Requires-Dist: tensorflow; extra == "full"
  Requires-Dist: keras-tuner; extra == "full"
+ Requires-Dist: scikit-learn; extra == "full"

  ![likelihood](https://raw.githubusercontent.com/RodolfoFerro/likelihood/main/likelihood.png)


{likelihood-1.2.16 → likelihood-1.2.17}/likelihood.egg-info/SOURCES.txt
@@ -10,6 +10,7 @@ likelihood.egg-info/requires.txt
  likelihood.egg-info/top_level.txt
  likelihood/graph/__init__.py
  likelihood/graph/graph.py
+ likelihood/graph/nn.py
  likelihood/models/__init__.py
  likelihood/models/regression.py
  likelihood/models/simulation.py

{likelihood-1.2.16 → likelihood-1.2.17}/likelihood.egg-info/requires.txt
@@ -16,3 +16,4 @@ networkx
  pyvis
  tensorflow
  keras-tuner
+ scikit-learn

{likelihood-1.2.16 → likelihood-1.2.17}/setup.py
@@ -31,7 +31,7 @@ setuptools.setup(
      packages=setuptools.find_packages(),
      install_requires=install_requires,
      extras_require={
-         "full": ["networkx", "pyvis", "tensorflow", "keras-tuner"],
+         "full": ["networkx", "pyvis", "tensorflow", "keras-tuner", "scikit-learn"],
      },
      classifiers=[
          "Programming Language :: Python :: 3",