likelihood 1.4.0.tar.gz → 1.5.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {likelihood-1.4.0 → likelihood-1.5.0}/PKG-INFO +1 -1
  2. {likelihood-1.4.0 → likelihood-1.5.0}/likelihood/graph/nn.py +72 -113
  3. {likelihood-1.4.0 → likelihood-1.5.0}/likelihood/models/deep/autoencoders.py +352 -116
  4. {likelihood-1.4.0 → likelihood-1.5.0}/likelihood/tools/__init__.py +1 -0
  5. likelihood-1.5.0/likelihood/tools/figures.py +348 -0
  6. likelihood-1.5.0/likelihood/tools/models_tools.py +253 -0
  7. {likelihood-1.4.0 → likelihood-1.5.0}/likelihood/tools/tools.py +26 -84
  8. {likelihood-1.4.0 → likelihood-1.5.0}/likelihood.egg-info/PKG-INFO +1 -1
  9. {likelihood-1.4.0 → likelihood-1.5.0}/likelihood.egg-info/SOURCES.txt +2 -0
  10. {likelihood-1.4.0 → likelihood-1.5.0}/LICENSE +0 -0
  11. {likelihood-1.4.0 → likelihood-1.5.0}/README.md +0 -0
  12. {likelihood-1.4.0 → likelihood-1.5.0}/likelihood/__init__.py +0 -0
  13. {likelihood-1.4.0 → likelihood-1.5.0}/likelihood/graph/__init__.py +0 -0
  14. {likelihood-1.4.0 → likelihood-1.5.0}/likelihood/graph/graph.py +0 -0
  15. {likelihood-1.4.0 → likelihood-1.5.0}/likelihood/main.py +0 -0
  16. {likelihood-1.4.0 → likelihood-1.5.0}/likelihood/models/__init__.py +0 -0
  17. {likelihood-1.4.0 → likelihood-1.5.0}/likelihood/models/deep/__init__.py +0 -0
  18. {likelihood-1.4.0 → likelihood-1.5.0}/likelihood/models/hmm.py +0 -0
  19. {likelihood-1.4.0 → likelihood-1.5.0}/likelihood/models/regression.py +0 -0
  20. {likelihood-1.4.0 → likelihood-1.5.0}/likelihood/models/simulation.py +0 -0
  21. {likelihood-1.4.0 → likelihood-1.5.0}/likelihood/models/utils.py +0 -0
  22. {likelihood-1.4.0 → likelihood-1.5.0}/likelihood/tools/numeric_tools.py +0 -0
  23. {likelihood-1.4.0 → likelihood-1.5.0}/likelihood.egg-info/dependency_links.txt +0 -0
  24. {likelihood-1.4.0 → likelihood-1.5.0}/likelihood.egg-info/requires.txt +0 -0
  25. {likelihood-1.4.0 → likelihood-1.5.0}/likelihood.egg-info/top_level.txt +0 -0
  26. {likelihood-1.4.0 → likelihood-1.5.0}/setup.cfg +0 -0
  27. {likelihood-1.4.0 → likelihood-1.5.0}/setup.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: likelihood
- Version: 1.4.0
+ Version: 1.5.0
  Summary: A package that performs the maximum likelihood algorithm.
  Home-page: https://github.com/jzsmoreno/likelihood/
  Author: J. A. Moreno-Guerra
@@ -5,7 +5,7 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
  logging.getLogger("tensorflow").setLevel(logging.ERROR)

  import warnings
- from typing import List, Tuple
+ from typing import Any, List, Tuple

  import numpy as np
  import pandas as pd
@@ -15,48 +15,43 @@ from pandas.core.frame import DataFrame
  from sklearn.metrics import f1_score
  from sklearn.model_selection import train_test_split

- from likelihood.tools import generate_feature_yaml
-
  tf.get_logger().setLevel("ERROR")

+ from likelihood.tools import LoRALayer

- def compare_similarity(arr1: np.ndarray, arr2: np.ndarray) -> int:
-     """Compares the similarity between two arrays of categories.
-
-     Parameters
-     ----------
-     arr1 : `ndarray`
-         The first array of categories.
-     arr2 : `ndarray`
-         The second array of categories.
+ def compare_similarity(arr1: List[Any], arr2: List[Any], threshold: float = 0.05) -> int:
+     """Calculate the similarity between two arrays, treating numeric values whose ratio is close to 1 as equal."""

-     Returns
-     -------
-     count: `int`
-         The number of categories that are the same in both arrays.
-     """
+     def is_similar(a: Any, b: Any) -> bool:
+         if isinstance(a, (int, float)) and isinstance(b, (int, float)):
+             if a == 0 and b == 0:
+                 return True
+             if a == 0 or b == 0:
+                 return False
+             # For numeric values, check whether their ratio falls within the threshold range
+             ratio = max(a, b) / min(a, b)
+             return 1 - threshold <= ratio <= 1 + threshold
+         else:
+             return a == b

-     count = 0
-     for i in range(len(arr1)):
-         if arr1[i] == arr2[i]:
-             count += 1
-     return count
+     return sum(is_similar(a, b) for a, b in zip(arr1, arr2))


  def cal_adjacency_matrix(
      df: DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
  ) -> Tuple[dict, np.ndarray]:
      """Calculates the adjacency matrix for a given DataFrame.
-     The adjacency matrix is a matrix that represents the similarity between each pair of categories.
+     The adjacency matrix represents the similarity between each pair of rows, based on their feature values.
      The similarity is calculated using the `compare_similarity` function.
-     The resulting matrix is a square matrix with the same number of rows and columns as the input DataFrame.
+     The resulting matrix is square, with as many rows and columns as the input DataFrame has rows.

      Parameters
      ----------
      df : `DataFrame`
-         The input DataFrame containing the categories.
+         The input DataFrame containing the features.
      exclude_subset : `List[str]`, optional
-         A list of categories to exclude from the calculation of the adjacency matrix.
+         A list of features to exclude from the calculation of the adjacency matrix.
      sparse : `bool`, optional
          Whether to return a sparse matrix or a dense matrix.
      **kwargs : `dict`
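For reference, the reworked compare_similarity above now counts two numeric values as matching when their ratio stays within threshold of 1, and falls back to exact equality for everything else. A minimal usage sketch (the sample values below are illustrative, not taken from the package's tests):

from likelihood.graph.nn import compare_similarity  # module path as shown in this diff

# 103/100 = 1.03 lies inside [0.95, 1.05] -> similar; "red" == "red" -> similar;
# 9/5 = 1.8 lies outside the default 5% band -> not similar.
print(compare_similarity([100.0, "red", 5.0], [103.0, "red", 9.0]))                 # 2
print(compare_similarity([100.0, "red", 5.0], [103.0, "red", 9.0], threshold=0.9))  # 3 (looser band)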
@@ -65,49 +60,37 @@ def cal_adjacency_matrix(
      Keyword Arguments:
      ----------
      similarity: `int`
-         The minimum number of categories that must be the same in both arrays to be considered similar.
+         The minimum number of features that must match for two rows to be considered similar.
+     threshold : `float`
+         The threshold value used in the `compare_similarity` function. Default is 0.05.

      Returns
      -------
      adj_dict : `dict`
-         A dictionary containing the categories.
+         A dictionary mapping each row index to its list of feature values.
      adjacency_matrix : `ndarray`
          The adjacency matrix.
      """

-     yaml_ = generate_feature_yaml(df)
-     categorical_columns = yaml_["categorical_features"]
      if len(exclude_subset) > 0:
-         categorical_columns = [col for col in categorical_columns if col not in exclude_subset]
-
-     if len(categorical_columns) > 1:
-         df_categorical = df[categorical_columns].copy()
+         columns = [col for col in df.columns if col not in exclude_subset]
+         df_ = df[columns].copy()
      else:
-         categorical_columns = [
-             col
-             for col in df.columns
-             if (
-                 col not in exclude_subset
-                 and pd.api.types.is_integer_dtype(df[col])
-                 and len(df[col].unique()) > 2
-             )
-         ]
-         df_categorical = df[categorical_columns].copy()
+         df_ = df.copy()

-     assert len(df_categorical) > 0
+     assert len(df_) > 0

-     similarity = kwargs.get("similarity", len(df_categorical.columns) - 1)
-     assert similarity <= df_categorical.shape[1]
+     similarity = kwargs.get("similarity", len(df_.columns) - 1)
+     threshold = kwargs.get("threshold", 0.05)
+     assert similarity <= df_.shape[1]

-     adj_dict = {}
-     for index, row in df_categorical.iterrows():
-         adj_dict[index] = row.to_list()
+     adj_dict = {index: row.tolist() for index, row in df_.iterrows()}

-     adjacency_matrix = np.zeros((len(df_categorical), len(df_categorical)))
+     adjacency_matrix = np.zeros((len(df_), len(df_)))

-     for i in range(len(df_categorical)):
-         for j in range(len(df_categorical)):
-             if compare_similarity(adj_dict[i], adj_dict[j]) >= similarity:
+     for i in range(len(df_)):
+         for j in range(len(df_)):
+             if compare_similarity(adj_dict[i], adj_dict[j], threshold=threshold) >= similarity:
                  adjacency_matrix[i][j] = 1

      if sparse:
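A short sketch of how the updated cal_adjacency_matrix can be called now that it compares whole rows and forwards threshold to compare_similarity. The toy DataFrame is made up, and returning the dense ndarray when sparse=False is an assumption based on the code in this hunk, since the sparse branch is not shown:

import pandas as pd

from likelihood.graph.nn import cal_adjacency_matrix

toy = pd.DataFrame({"height": [1.70, 1.72, 1.10], "team": ["a", "a", "b"]})

# With two columns, the default similarity is len(columns) - 1 = 1, so rows 0 and 1
# (heights within ~1% of each other, same team) get connected while row 2 stays isolated.
adj_dict, adjacency = cal_adjacency_matrix(toy, sparse=False, threshold=0.05)
print(adjacency)  # expected: [[1. 1. 0.], [1. 1. 0.], [0. 0. 1.]]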
@@ -131,8 +114,13 @@ class Data:
          df: DataFrame,
          target: str | None = None,
          exclude_subset: List[str] = [],
+         **kwargs,
      ):
-         _, adjacency = cal_adjacency_matrix(df, exclude_subset=exclude_subset, sparse=True)
+         sparse = kwargs.get("sparse", True)
+         threshold = kwargs.get("threshold", 0.05)
+         _, adjacency = cal_adjacency_matrix(
+             df, exclude_subset=exclude_subset, sparse=sparse, threshold=threshold
+         )
          if target is not None:
              X = df.drop(columns=[target] + exclude_subset)
          else:
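The Data wrapper now accepts keyword arguments and simply forwards sparse and threshold to cal_adjacency_matrix, defaulting to sparse=True and threshold=0.05. A hypothetical call (the DataFrame and column names are placeholders):

import pandas as pd

from likelihood.graph.nn import Data

df = pd.DataFrame({"height": [1.70, 1.72, 1.10], "team": ["a", "a", "b"], "label": [0, 0, 1]})
# threshold is passed through to the row-similarity comparison; sparse keeps its default of True.
data = Data(df, target="label", threshold=0.05)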
@@ -147,16 +135,20 @@ class Data:

  @tf.keras.utils.register_keras_serializable(package="Custom", name="VanillaGNNLayer")
  class VanillaGNNLayer(tf.keras.layers.Layer):
-     def __init__(self, dim_in, dim_out, kernel_initializer="glorot_uniform", **kwargs):
+     def __init__(self, dim_in, dim_out, rank=None, kernel_initializer="glorot_uniform", **kwargs):
          super(VanillaGNNLayer, self).__init__(**kwargs)
          self.dim_out = dim_out
+         self.rank = rank
          self.kernel_initializer = kernel_initializer
          self.linear = None

      def build(self, input_shape):
-         self.linear = tf.keras.layers.Dense(
-             self.dim_out, use_bias=False, kernel_initializer=self.kernel_initializer
-         )
+         if self.rank:
+             self.linear = LoRALayer(self.dim_out, rank=self.rank)
+         else:
+             self.linear = tf.keras.layers.Dense(
+                 self.dim_out, use_bias=False, kernel_initializer=self.kernel_initializer
+             )
          super(VanillaGNNLayer, self).build(input_shape)

      def call(self, x, adjacency):
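When rank is set, build swaps the full-rank Dense projection for the LoRALayer imported from likelihood.tools (added in the new models_tools.py, whose code is not part of this diff hunk). As a generic illustration only, not the package's actual implementation, a low-rank projection of this kind can be sketched as two small factors standing in for one dense kernel:

import tensorflow as tf


class LowRankDense(tf.keras.layers.Layer):
    """Toy stand-in for a LoRA-style layer: the kernel is factored as A (dim_in x r) @ B (r x units)."""

    def __init__(self, units, rank=2, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.rank = rank

    def build(self, input_shape):
        dim_in = int(input_shape[-1])
        # Two small factors replace one dim_in x units kernel, cutting the parameter
        # count from dim_in * units down to rank * (dim_in + units).
        self.a = self.add_weight(shape=(dim_in, self.rank), initializer="glorot_uniform", name="a")
        self.b = self.add_weight(shape=(self.rank, self.units), initializer="zeros", name="b")

    def call(self, inputs):
        return tf.matmul(tf.matmul(inputs, self.a), self.b)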
@@ -169,8 +161,11 @@ class VanillaGNNLayer(tf.keras.layers.Layer):
          config.update(
              {
                  "dim_out": self.dim_out,
-                 "kernel_initializer": tf.keras.initializers.serialize(
-                     self.linear.kernel_initializer
+                 "rank": self.rank,
+                 "kernel_initializer": (
+                     None
+                     if self.rank
+                     else tf.keras.initializers.serialize(self.linear.kernel_initializer)
                  ),
              }
          )
@@ -179,14 +174,16 @@ class VanillaGNNLayer(tf.keras.layers.Layer):

  @tf.keras.utils.register_keras_serializable(package="Custom", name="VanillaGNN")
  class VanillaGNN(tf.keras.Model):
-     def __init__(self, dim_in, dim_h, dim_out, **kwargs):
+     def __init__(self, dim_in, dim_h, dim_out, rank=2, **kwargs):
          super(VanillaGNN, self).__init__(**kwargs)
          self.dim_in = dim_in
          self.dim_h = dim_h
          self.dim_out = dim_out
-         self.gnn1 = VanillaGNNLayer(self.dim_in, self.dim_h)
-         self.gnn2 = VanillaGNNLayer(self.dim_h, self.dim_h)
-         self.gnn3 = VanillaGNNLayer(self.dim_h, self.dim_out)
+         self.rank = rank
+
+         self.gnn1 = VanillaGNNLayer(self.dim_in, self.dim_h, self.rank)
+         self.gnn2 = VanillaGNNLayer(self.dim_h, self.dim_h, self.rank)
+         self.gnn3 = VanillaGNNLayer(self.dim_h, self.dim_out, None)

      def call(self, x, adjacency):
          h = self.gnn1(x, adjacency)
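With this constructor, the two hidden layers share the configured rank (default 2) and therefore use the low-rank projection, while the output layer is built with rank None and keeps a plain Dense kernel. A hypothetical instantiation with made-up dimensions:

# 4 input features, 8 hidden units, 3 output classes; rank=4 applies to gnn1 and gnn2 only.
model = VanillaGNN(dim_in=4, dim_h=8, dim_out=3, rank=4)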
@@ -208,13 +205,13 @@ class VanillaGNN(tf.keras.Model):
          out = self(x, adjacency)
          loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=out)
          loss = tf.reduce_mean(loss)
-         f1 = self.compute_f1_score(out, y)
+         f1 = round(self.compute_f1_score(out, y), 4)
          return loss.numpy(), f1

      def test(self, data):
          out = self(data.x, data.adjacency)
          test_f1 = self.compute_f1_score(out, data.y)
-         return test_f1
+         return round(test_f1, 4)

      def predict(self, data):
          out = self(data.x, data.adjacency)
@@ -225,6 +222,7 @@ class VanillaGNN(tf.keras.Model):
              "dim_in": self.dim_in,
              "dim_h": self.dim_h,
              "dim_out": self.dim_out,
+             "rank": self.rank,
          }
          base_config = super(VanillaGNN, self).get_config()
          return dict(list(base_config.items()) + list(config.items()))
@@ -235,6 +233,7 @@ class VanillaGNN(tf.keras.Model):
              dim_in=config["dim_in"],
              dim_h=config["dim_h"],
              dim_out=config["dim_out"],
+             rank=config["rank"],
          )

      @tf.function
@@ -248,10 +247,6 @@ class VanillaGNN(tf.keras.Model):
          return loss

      def fit(self, data, epochs, batch_size, test_size=0.2, optimizer="adam"):
-         warnings.warn(
-             "It is normal for validation metrics to underperform. Use the test method to validate after training.",
-             UserWarning,
-         )
          optimizers = {
              "sgd": tf.keras.optimizers.SGD(),
              "adam": tf.keras.optimizers.Adam(),
@@ -290,56 +285,20 @@ class VanillaGNN(tf.keras.Model):
              train_f1_scores.append(train_f1)

              if epoch % 5 == 0:
+                 clear_output(wait=True)
+                 warnings.warn(
+                     "It is normal for validation metrics to underperform during training. Use the test method to validate after training.",
+                     UserWarning,
+                 )
                  val_loss, val_f1 = self.evaluate(X_test, adjacency_test, y_test)
                  val_losses.append(val_loss)
                  val_f1_scores.append(val_f1)
-                 clear_output(wait=True)
                  print(
-                     f"Epoch {epoch:>3} | Train Loss: {train_loss:.3f} | Train F1: {train_f1:.3f} | Val Loss: {val_loss:.3f} | Val F1: {val_f1:.3f}"
+                     f"Epoch {epoch:>3} | Train Loss: {train_loss:.4f} | Train F1: {train_f1:.4f} | Val Loss: {val_loss:.4f} | Val F1: {val_f1:.4f}"
                  )

          return train_losses, train_f1_scores, val_losses, val_f1_scores


  if __name__ == "__main__":
-     # Example usage
-     import pandas as pd
-     from sklearn.datasets import load_iris
-
-     # Load the dataset
-     iris = load_iris()
-
-     # Convert to a DataFrame for easy exploration
-     iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
-     iris_df["species"] = iris.target
-
-     iris_df["sepal length (cm)"] = iris_df["sepal length (cm)"].astype("category")
-     iris_df["sepal width (cm)"] = iris_df["sepal width (cm)"].astype("category")
-     iris_df["petal length (cm)"] = iris_df["petal length (cm)"].astype("category")
-     iris_df["petal width (cm)"] = iris_df["petal width (cm)"].astype("category")
-
-     # Display the first few rows of the dataset
-     print(iris_df.head())
-
-     iris_df = iris_df.sample(frac=1, replace=False).reset_index(drop=True)
-
-     data = Data(iris_df, "species")
-
-     model = VanillaGNN(dim_in=data.x.shape[1], dim_h=8, dim_out=len(iris_df["species"].unique()))
-     print("Before training F1:", model.test(data))
-     model.fit(data, epochs=200, batch_size=32, test_size=0.5)
-     model.save("./best_model", save_format="tf")
-     print("After training F1:", model.test(data))
-     best_model = tf.keras.models.load_model("./best_model")
-
-     print("After loading F1:", best_model.test(data))
-     df_results = pd.DataFrame()
-
-     # Suppose we have a new dataset without the target variable
-     iris_df = iris_df.drop(columns=["species"])
-     data_new = Data(iris_df)
-     print("Predictions:", best_model.predict(data_new))
-     df_results["predicted"] = list(model.predict(data))
-     df_results["actual"] = list(data.y)
-     # df_results.to_csv("results.csv", index=False)
-     breakpoint()
+     print("Examples will be running below")
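The 1.4.0 __main__ block removed above also served as the package's end-to-end example. A condensed version of it is kept here as a usage sketch; the hyperparameters and the ./best_model path are carried over from the removed code, and the astype("category") casts are dropped because the adjacency matrix now uses every column:

import pandas as pd
import tensorflow as tf
from sklearn.datasets import load_iris

from likelihood.graph.nn import Data, VanillaGNN

# Build a labelled DataFrame and shuffle it, as in the removed example.
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df["species"] = iris.target
iris_df = iris_df.sample(frac=1, replace=False).reset_index(drop=True)

data = Data(iris_df, "species")
model = VanillaGNN(dim_in=data.x.shape[1], dim_h=8, dim_out=len(iris_df["species"].unique()))

model.fit(data, epochs=200, batch_size=32, test_size=0.5)
print("After training F1:", model.test(data))

# Round-trip through Keras saving and reload, then predict on unlabelled data.
model.save("./best_model", save_format="tf")
best_model = tf.keras.models.load_model("./best_model")
print("Predictions:", best_model.predict(Data(iris_df.drop(columns=["species"]))))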