likelihood-1.2.16-py3-none-any.whl → likelihood-1.2.18-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
likelihood/graph/nn.py ADDED
@@ -0,0 +1,344 @@
+ import warnings
+ from typing import List, Tuple
+
+ import numpy as np
+ import pandas as pd
+ import tensorflow as tf
+ from numpy import ndarray
+ from pandas.core.frame import DataFrame
+ from sklearn.metrics import f1_score
+ from sklearn.model_selection import train_test_split
+
+ from likelihood.tools import generate_feature_yaml
+
+
+ def compare_similarity(arr1: ndarray, arr2: ndarray) -> int:
+     """Compares the similarity between two arrays of categories.
+
+     Parameters
+     ----------
+     arr1 : `ndarray`
+         The first array of categories.
+     arr2 : `ndarray`
+         The second array of categories.
+
+     Returns
+     -------
+     count: `int`
+         The number of categories that are the same in both arrays.
+     """
+
+     count = 0
+     for i in range(len(arr1)):
+         if arr1[i] == arr2[i]:
+             count += 1
+     return count
+
+
+ def cal_adjency_matrix(
+     df: DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
+ ) -> Tuple[dict, ndarray]:
+     """Calculates the adjacency matrix for a given DataFrame.
+     The adjacency matrix is a matrix that represents the similarity between each pair of categories.
+     The similarity is calculated using the `compare_similarity` function.
+     The resulting matrix is a square matrix with the same number of rows and columns as the input DataFrame.
+
+     Parameters
+     ----------
+     df : `DataFrame`
+         The input DataFrame containing the categories.
+     exclude_subset : `List[str]`, optional
+         A list of categories to exclude from the calculation of the adjacency matrix.
+     sparse : `bool`, optional
+         Whether to return a sparse matrix or a dense matrix.
+     **kwargs : `dict`
+         Additional keyword arguments to pass to the `compare_similarity` function.
+
+     Keyword Arguments:
+     ----------
+     similarity: `int`
+         The minimum number of categories that must be the same in both arrays to be considered similar.
+
+     Returns
+     -------
+     adj_dict : `dict`
+         A dictionary containing the categories.
+     adjacency_matrix : `ndarray`
+         The adjacency matrix.
+     """
+
+     yaml_ = generate_feature_yaml(df)
+     categorical_columns = yaml_["categorical_features"]
+     if len(exclude_subset) > 0:
+         categorical_columns = [col for col in categorical_columns if col not in exclude_subset]
+
+     if len(categorical_columns) > 1:
+         df_categorical = df[categorical_columns].copy()
+     else:
+         categorical_columns = [
+             col
+             for col in df.columns
+             if (
+                 col not in exclude_subset
+                 and pd.api.types.is_integer_dtype(df[col])
+                 and len(df[col].unique()) > 2
+             )
+         ]
+         df_categorical = df[categorical_columns].copy()
+
+     assert len(df_categorical) > 0
+
+     similarity = kwargs["similarity"] if "similarity" in kwargs else len(df_categorical.columns) - 1
+     assert similarity <= df_categorical.shape[1]
+
+     adj_dict = {}
+     for index, row in df_categorical.iterrows():
+         adj_dict[index] = row.to_list()
+
+     adjacency_matrix = np.zeros((len(df_categorical), len(df_categorical)))
+
+     for i in range(len(df_categorical)):
+         for j in range(len(df_categorical)):
+             if compare_similarity(adj_dict[i], adj_dict[j]) >= similarity:
+                 adjacency_matrix[i][j] = 1
+
+     if sparse:
+         num_nodes = adjacency_matrix.shape[0]
+
+         indices = np.argwhere(adjacency_matrix != 0.0)
+         indices = tf.constant(indices, dtype=tf.int64)
+         values = tf.constant(adjacency_matrix[indices[:, 0], indices[:, 1]], dtype=tf.float32)
+         adjacency_matrix = tf.sparse.SparseTensor(
+             indices=indices, values=values, dense_shape=(num_nodes, num_nodes)
+         )
+
+         return adj_dict, adjacency_matrix
+     else:
+         return adj_dict, adjacency_matrix
+
+
+ class Data:
+     def __init__(
+         self,
+         df: DataFrame,
+         target: str | None = None,
+         exclude_subset: List[str] = [],
+     ):
+         _, adjacency = cal_adjency_matrix(df, exclude_subset=exclude_subset, sparse=True)
+         if target is not None:
+             X = df.drop(columns=[target] + exclude_subset)
+         else:
+             X = df.drop(columns=exclude_subset)
+         self.columns = X.columns
+         X = X.to_numpy()
+         self.x = np.asarray(X).astype(np.float32)
+         self.adjacency = adjacency
+         if target is not None:
+             self.y = np.asarray(df[target].values).astype(np.int32)
+
+
+ @tf.keras.utils.register_keras_serializable(package="Custom", name="VanillaGNNLayer")
+ class VanillaGNNLayer(tf.keras.layers.Layer):
+     def __init__(self, dim_in, dim_out, kernel_initializer="glorot_uniform", **kwargs):
+         super(VanillaGNNLayer, self).__init__(**kwargs)
+         self.dim_out = dim_out
+         self.kernel_initializer = kernel_initializer
+         self.linear = None
+
+     def build(self, input_shape):
+         self.linear = tf.keras.layers.Dense(
+             self.dim_out, use_bias=False, kernel_initializer=self.kernel_initializer
+         )
+         super(VanillaGNNLayer, self).build(input_shape)
+
+     def call(self, x, adjacency):
+         x = self.linear(x)
+         x = tf.sparse.sparse_dense_matmul(adjacency, x)
+         return x
+
+     def get_config(self):
+         config = super(VanillaGNNLayer, self).get_config()
+         config.update(
+             {
+                 "dim_out": self.dim_out,
+                 "kernel_initializer": tf.keras.initializers.serialize(
+                     self.linear.kernel_initializer
+                 ),
+             }
+         )
+         return config
+
+
+ @tf.keras.utils.register_keras_serializable(package="Custom", name="VanillaGNN")
+ class VanillaGNN(tf.keras.Model):
+     def __init__(self, dim_in, dim_h, dim_out, **kwargs):
+         super(VanillaGNN, self).__init__(**kwargs)
+         self.dim_in = dim_in
+         self.dim_h = dim_h
+         self.dim_out = dim_out
+         self.gnn1 = VanillaGNNLayer(self.dim_in, self.dim_h)
+         self.gnn2 = VanillaGNNLayer(self.dim_h, self.dim_h)
+         self.gnn3 = VanillaGNNLayer(self.dim_h, self.dim_out)
+
+     def build(self, input_shape):
+         super(VanillaGNN, self).build(input_shape)
+         dummy_input = tf.keras.Input(shape=input_shape[1:])
+         dummy_adjacency = tf.sparse.SparseTensor(
+             indices=[[0, 0]], values=[1.0], dense_shape=[input_shape[0], input_shape[0]]
+         )
+         _ = self(dummy_input, dummy_adjacency)
+
+     def call(self, x, adjacency):
+         h = self.gnn1(x, adjacency)
+         h = tf.nn.tanh(h)
+         h = self.gnn2(h, adjacency)
+         h = self.gnn3(h, adjacency)
+         return tf.nn.softmax(h, axis=1)
+
+     def f1_macro(self, y_true, y_pred):
+         return f1_score(y_true, y_pred, average="macro")
+
+     def compute_f1_score(self, logits, labels):
+         predictions = tf.argmax(logits, axis=1, output_type=tf.int32)
+         true_labels = tf.cast(labels, tf.int32)
+         return self.f1_macro(true_labels.numpy(), predictions.numpy())
+
+     def evaluate(self, x, adjacency, y):
+         y = tf.cast(y, tf.int32)
+         out = self(x, adjacency)
+         loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=out)
+         loss = tf.reduce_mean(loss)
+         f1 = self.compute_f1_score(out, y)
+         return loss.numpy(), f1
+
+     def test(self, data):
+         out = self(data.x, data.adjacency)
+         test_f1 = self.compute_f1_score(out, data.y)
+         return test_f1
+
+     def predict(self, data):
+         out = self(data.x, data.adjacency)
+         return tf.argmax(out, axis=1, output_type=tf.int32).numpy()
+
+     def get_config(self):
+         config = {
+             "dim_in": self.dim_in,
+             "dim_h": self.dim_h,
+             "dim_out": self.dim_out,
+         }
+         base_config = super(VanillaGNN, self).get_config()
+         return dict(list(base_config.items()) + list(config.items()))
+
+     @classmethod
+     def from_config(cls, config):
+         return cls(
+             dim_in=config["dim_in"],
+             dim_h=config["dim_h"],
+             dim_out=config["dim_out"],
+         )
+
+     @tf.function
+     def train_step(self, batch_x, batch_adjacency, batch_y, optimizer):
+         with tf.GradientTape() as tape:
+             out = self(batch_x, batch_adjacency)
+             loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=batch_y, logits=out)
+             loss = tf.reduce_mean(loss)
+         gradients = tape.gradient(loss, self.trainable_variables)
+         optimizer.apply_gradients(zip(gradients, self.trainable_variables))
+         return loss
+
+     def fit(self, data, epochs, batch_size, test_size=0.2, optimizer="adam"):
+         warnings.warn(
+             "It is normal for validation metrics to underperform. Use the test method to validate after training.",
+             UserWarning,
+         )
+         optimizers = {
+             "sgd": tf.keras.optimizers.SGD(),
+             "adam": tf.keras.optimizers.Adam(),
+             "adamw": tf.keras.optimizers.AdamW(),
+             "adadelta": tf.keras.optimizers.Adadelta(),
+             "rmsprop": tf.keras.optimizers.RMSprop(),
+         }
+         optimizer = optimizers[optimizer]
+         train_losses = []
+         train_f1_scores = []
+         val_losses = []
+         val_f1_scores = []
+
+         X_train, X_test, y_train, y_test = train_test_split(
+             data.x, data.y, test_size=test_size, shuffle=False
+         )
+         adjacency_train = tf.sparse.slice(data.adjacency, [0, 0], [len(X_train), len(X_train)])
+         adjacency_test = tf.sparse.slice(
+             data.adjacency, [len(X_train), 0], [len(X_test), len(X_test)]
+         )
+
+         batch_starts = np.arange(0, len(X_train), batch_size)
+         for epoch in range(epochs):
+             np.random.shuffle(batch_starts)
+             for start in batch_starts:
+                 end = start + batch_size
+                 batch_x = X_train[start:end, :]
+                 batch_adjacency = tf.sparse.slice(
+                     adjacency_train, [start, start], [batch_size, batch_size]
+                 )
+                 batch_y = y_train[start:end]
+                 train_loss = self.train_step(batch_x, batch_adjacency, batch_y, optimizer)
+
+             train_loss, train_f1 = self.evaluate(X_train, adjacency_train, y_train)
+             train_losses.append(train_loss)
+             train_f1_scores.append(train_f1)
+
+             if epoch % 2 == 0:
+                 val_loss, val_f1 = self.evaluate(X_test, adjacency_test, y_test)
+                 val_losses.append(val_loss)
+                 val_f1_scores.append(val_f1)
+                 print(
+                     f"Epoch {epoch:>3} | Train Loss: {train_loss:.3f} | Train F1: {train_f1:.3f} | Val Loss: {val_loss:.3f} | Val F1: {val_f1:.3f}"
+                 )
+
+         return train_losses, train_f1_scores, val_losses, val_f1_scores
+
+
+ if __name__ == "__main__":
+     # Example usage
+     import pandas as pd
+     from sklearn.datasets import load_iris
+
+     # Load the dataset
+     iris = load_iris()
+
+     # Convert to a DataFrame for easy exploration
+     iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
+     iris_df["species"] = iris.target
+
+     iris_df["sepal length (cm)"] = iris_df["sepal length (cm)"].astype("category")
+     iris_df["sepal width (cm)"] = iris_df["sepal width (cm)"].astype("category")
+     iris_df["petal length (cm)"] = iris_df["petal length (cm)"].astype("category")
+     iris_df["petal width (cm)"] = iris_df["petal width (cm)"].astype("category")
+
+     # Display the first few rows of the dataset
+     print(iris_df.head())
+
+     iris_df = iris_df.sample(frac=1, replace=False).reset_index(drop=True)
+
+     data = Data(iris_df, "species")
+
+     model = VanillaGNN(dim_in=data.x.shape[1], dim_h=8, dim_out=len(iris_df["species"].unique()))
+     print("Before training F1:", model.test(data))
+     model.fit(data, epochs=200, batch_size=32, test_size=0.5)
+     model.save("./best_model.keras")
+     print("After training F1:", model.test(data))
+     best_model = tf.keras.models.load_model("./best_model.keras")
+
+     print("After loading F1:", best_model.test(data))
+     df_results = pd.DataFrame()
+
+     # Suppose we have a new dataset without the target variable
+     iris_df = iris_df.drop(columns=["species"])
+     data_new = Data(iris_df)
+     print("Predictions:", best_model.predict(data_new))
+     df_results["predicted"] = list(model.predict(data))
+     df_results["actual"] = list(data.y)
+     # df_results.to_csv("results.csv", index=False)
+     breakpoint()
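
Note on the new module: edges in the graph come from row-level category matches — two rows are connected when `compare_similarity` counts at least `similarity` equal entries (by default one less than the number of categorical columns). A minimal, self-contained sketch of that thresholding rule on toy data (it re-implements the counting loop rather than calling the package, so the names below are illustrative only):

import numpy as np

# Three toy rows of already-encoded categories (hypothetical values).
rows = [np.array([1, 0, 2]), np.array([1, 0, 3]), np.array([0, 1, 3])]

def count_matches(a, b):
    # Same counting rule as compare_similarity above.
    return sum(int(x == y) for x, y in zip(a, b))

similarity = 2  # minimum number of matching categories needed for an edge
n = len(rows)
adjacency = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        if count_matches(rows[i], rows[j]) >= similarity:
            adjacency[i][j] = 1

print(adjacency)  # rows 0 and 1 are linked; row 2 only connects to itself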
likelihood/models/deep/autoencoders.py CHANGED
@@ -1,5 +1,6 @@
  import os
  from functools import partial
+ from shutil import rmtree

  import keras_tuner
  import numpy as np
@@ -15,26 +16,26 @@ class AutoClassifier(tf.keras.Model):
      An auto-classifier model that automatically determines the best classification strategy based on the input data.

      Attributes:
-     - input_shape: The shape of the input data.
+     - input_shape_parm: The shape of the input data.
      - num_classes: The number of classes in the dataset.
      - units: The number of neurons in each hidden layer.
      - activation: The type of activation function to use for the neural network layers.

      Methods:
-     __init__(self, input_shape, num_classes, units, activation): Initializes an AutoClassifier instance with the given parameters.
-     build(self, input_shape): Builds the model architecture based on input_shape.
+     __init__(self, input_shape_parm, num_classes, units, activation): Initializes an AutoClassifier instance with the given parameters.
+     build(self, input_shape_parm): Builds the model architecture based on input_shape_parm.
      call(self, x): Defines the forward pass of the model.
      get_config(self): Returns the configuration of the model.
      from_config(cls, config): Recreates an instance of AutoClassifier from its configuration.
      """

-     def __init__(self, input_shape, num_classes, units, activation):
+     def __init__(self, input_shape_parm, num_classes, units, activation):
          """
          Initializes an AutoClassifier instance with the given parameters.

          Parameters
          ----------
-         input_shape : `int`
+         input_shape_parm : `int`
              The shape of the input data.
          num_classes : `int`
              The number of classes in the dataset.
@@ -44,7 +45,7 @@ class AutoClassifier(tf.keras.Model):
              The type of activation function to use for the neural network layers.
          """
          super(AutoClassifier, self).__init__()
-         self.input_shape = input_shape
+         self.input_shape_parm = input_shape_parm
          self.num_classes = num_classes
          self.units = units
          self.activation = activation
@@ -53,7 +54,7 @@ class AutoClassifier(tf.keras.Model):
          self.decoder = None
          self.classifier = None

-     def build(self, input_shape):
+     def build(self, input_shape_parm):
          self.encoder = tf.keras.Sequential(
              [
                  tf.keras.layers.Dense(units=self.units, activation=self.activation),
@@ -64,7 +65,7 @@ class AutoClassifier(tf.keras.Model):
          self.decoder = tf.keras.Sequential(
              [
                  tf.keras.layers.Dense(units=self.units, activation=self.activation),
-                 tf.keras.layers.Dense(units=self.input_shape, activation=self.activation),
+                 tf.keras.layers.Dense(units=self.input_shape_parm, activation=self.activation),
              ]
          )

@@ -81,7 +82,7 @@ class AutoClassifier(tf.keras.Model):

      def get_config(self):
          config = {
-             "input_shape": self.input_shape,
+             "input_shape_parm": self.input_shape_parm,
              "num_classes": self.num_classes,
              "units": self.units,
              "activation": self.activation,
@@ -92,7 +93,7 @@ class AutoClassifier(tf.keras.Model):
      @classmethod
      def from_config(cls, config):
          return cls(
-             input_shape=config["input_shape"],
+             input_shape_parm=config["input_shape_parm"],
              num_classes=config["num_classes"],
              units=config["units"],
              activation=config["activation"],
@@ -104,7 +105,7 @@ def call_existing_code(
      activation: str,
      threshold: float,
      optimizer: str,
-     input_shape: None | int = None,
+     input_shape_parm: None | int = None,
      num_classes: None | int = None,
  ) -> AutoClassifier:
      """
@@ -120,7 +121,7 @@ def call_existing_code(
          The threshold for the classifier.
      optimizer : `str`
          The type of optimizer to use for the neural network layers.
-     input_shape : `None` | `int`
+     input_shape_parm : `None` | `int`
          The shape of the input data.
      num_classes : `int`
          The number of classes in the dataset.
@@ -131,7 +132,10 @@ def call_existing_code(
          The AutoClassifier instance.
      """
      model = AutoClassifier(
-         input_shape=input_shape, num_classes=num_classes, units=units, activation=activation
+         input_shape_parm=input_shape_parm,
+         num_classes=num_classes,
+         units=units,
+         activation=activation,
      )
      model.compile(
          optimizer=optimizer,
@@ -141,14 +145,14 @@ def call_existing_code(
      return model


- def build_model(hp, input_shape: None | int, num_classes: None | int) -> AutoClassifier:
+ def build_model(hp, input_shape_parm: None | int, num_classes: None | int) -> AutoClassifier:
      """Builds a neural network model using Keras Tuner's search algorithm.

      Parameters
      ----------
      hp : `keras_tuner.HyperParameters`
          The hyperparameters to tune.
-     input_shape : `None` | `int`
+     input_shape_parm : `None` | `int`
          The shape of the input data.
      num_classes : `int`
          The number of classes in the dataset.
@@ -158,7 +162,9 @@ def build_model(hp, input_shape: None | int, num_classes: None | int) -> AutoCla
      `keras.Model`
          The neural network model.
      """
-     units = hp.Int("units", min_value=int(input_shape * 0.2), max_value=input_shape, step=2)
+     units = hp.Int(
+         "units", min_value=int(input_shape_parm * 0.2), max_value=input_shape_parm, step=2
+     )
      activation = hp.Choice("activation", ["sigmoid", "relu", "tanh", "selu", "softplus"])
      optimizer = hp.Choice("optimizer", ["sgd", "adam", "adadelta"])
      threshold = hp.Float("threshold", min_value=0.1, max_value=0.9, sampling="log")
@@ -168,14 +174,21 @@ def build_model(hp, input_shape: None | int, num_classes: None | int) -> AutoCla
          activation=activation,
          threshold=threshold,
          optimizer=optimizer,
-         input_shape=input_shape,
+         input_shape_parm=input_shape_parm,
          num_classes=num_classes,
      )
      return model


  def setup_model(
-     data: DataFrame, target: str, epochs: int, train_size: float = 0.7, seed=None, **kwargs
+     data: DataFrame,
+     target: str,
+     epochs: int,
+     train_size: float = 0.7,
+     seed=None,
+     train_mode: bool = True,
+     filepath: str = "./my_dir/best_model.keras",
+     **kwargs,
  ) -> AutoClassifier:
      """Setup model for training and tuning.

@@ -191,6 +204,10 @@ def setup_model(
          The proportion of the dataset to use for training.
      seed : `Any` | `int`
          The random seed to use for reproducibility.
+     train_mode : `bool`
+         Whether to train the model or not.
+     filepath : `str`
+         The path to save the best model to.

      Keyword Arguments:
      ----------
@@ -226,8 +243,18 @@ def setup_model(
      ), "Categorical variables within the DataFrame must be encoded, this is done by using the DataFrameEncoder from likelihood."
      validation_split = 1.0 - train_size
      # Create my_dir path if it does not exist
-     if not os.path.exists(directory):
-         os.makedirs(directory)
+
+     if train_mode:
+         # Create a new directory if it does not exist
+         try:
+             if not os.path.exists(directory):
+                 os.makedirs(directory)
+             else:
+                 print(f"Directory {directory} already exists, it will be deleted.")
+                 rmtree(directory)
+                 os.makedirs(directory)
+         except:
+             print("Warning: unable to create directory")

      # Create a Classifier instance
      y_encoder = OneHotEncoder()
@@ -237,10 +264,12 @@ def setup_model(

      y = np.asarray(y).astype(np.float32)

-     input_shape = X.shape[1]
+     input_shape_parm = X.shape[1]
      num_classes = y.shape[1]
      global build_model
-     build_model = partial(build_model, input_shape=input_shape, num_classes=num_classes)
+     build_model = partial(
+         build_model, input_shape_parm=input_shape_parm, num_classes=num_classes
+     )

      # Create the AutoKeras model
      tuner = keras_tuner.RandomSearch(
@@ -257,13 +286,13 @@ def setup_model(
      best_model = models[0]

      # save model
-     best_model.save("./my_dir/best_model.keras")
+     best_model.save(filepath)

      if verbose:
          tuner.results_summary()
      else:
          # Load the best model from the directory
-         best_model = tf.keras.models.load_model("./my_dir/best_model.keras")
+         best_model = tf.keras.models.load_model(filepath)

      return best_model

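For orientation, a hedged sketch of how the reworked `setup_model` signature would be called; the DataFrame `df` and target name "label" are assumptions, and the data must already be numerically encoded as the function's assertion requires:

# First run: tune, train and save the best AutoClassifier to `filepath`.
best_model = setup_model(
    df, target="label", epochs=20, train_size=0.8,
    train_mode=True, filepath="./my_dir/best_model.keras",
)

# Later runs: skip the search and reload the saved model from `filepath`.
best_model = setup_model(
    df, target="label", epochs=20, train_size=0.8,
    train_mode=False, filepath="./my_dir/best_model.keras",
)
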
likelihood/models/simulation.py CHANGED
@@ -10,53 +10,65 @@ from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, check_


  class SimulationEngine(FeatureSelection):
+     """
+     This class implements a predictive model that utilizes multiple linear regression for numerical target variables
+     and multiple logistic regression for categorical target variables.

-     def __init__(self, df: DataFrame, n_importances: int, **kwargs):
+     The class provides methods for training the model on a given dataset, making predictions,
+     and evaluating the model's performance.
+
+     Key features:
+     - Supports both numerical and categorical target variables, automatically selecting the appropriate regression method.
+     - Includes methods for data preprocessing, model fitting, prediction, and evaluation metrics.
+     - Designed to be flexible and user-friendly, allowing for easy integration with various datasets.
+
+     Usage:
+     - Instantiate the class with the training data and target variable.
+     - Call the fit method to train the model.
+     - Use the predict method to generate predictions on new data.
+     - Evaluate the model using built-in metrics for accuracy and error.
+
+     This class is suitable for applications in data analysis and machine learning, enabling users to leverage regression techniques
+     for both numerical and categorical outcomes efficiently.
+     """
+
+     def __init__(self, df: DataFrame, n_importances: int, use_scaler: bool = False, **kwargs):

          self.df = df
          self.n_importances = n_importances
+         self.use_scaler = use_scaler

          super().__init__(**kwargs)

-     def predict(self, df: DataFrame, column: str, n: int = None) -> ndarray | list:
-
-         # We clean the data set
-         df = self._clean_data(df)
-
+     def predict(self, df: DataFrame, column: str) -> ndarray | list:
          # Let us assign the dictionary entries corresponding to the column
          w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]

-         try:
-             df = df[names_cols].copy()
-             # Change the scale of the dataframe
-             numeric_df = df.select_dtypes(include="number")
+         df = df[names_cols].copy()
+         # Change the scale of the dataframe
+         dataset = self.df.copy()
+         dataset.drop(columns=column, inplace=True)
+         numeric_df = dataset.select_dtypes(include="number")
+         if self.use_scaler:
              scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
-             numeric_scaled = scaler.rescale()
+             _ = scaler.rescale()
+             dataset_ = df.copy()
+             numeric_df = dataset_.select_dtypes(include="number")
+             numeric_scaled = scaler.rescale(dataset_=numeric_df.to_numpy())
              numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
-             df[numeric_df.columns] = numeric_df
-
-             # Encoding the datadrame
-             for num, colname in enumerate(dfe._encode_columns):
-                 if df[colname].dtype == "object":
-                     encode_dict = dfe.encoding_list[num]
-                     df[colname] = df[colname].apply(
-                         dfe._code_transformation_to, dictionary_list=encode_dict
-                     )
-
-         except:
-             print("The dataframe provided does not have the same columns as in the fit method.")
-
-         # Assign value to n if n is None
-         n = n if n != None else len(df)
-
-         # Generation of assertion
-         assert n > 0 and n <= len(df), '"n" must be interger or "<= len(df)".'
+             for col in numeric_df.columns:
+                 df[col] = numeric_df[col].values

-         # Sample dataframe
-         df_aux = df.sample(n)
+         # Encoding the datadrame
+         for num, colname in enumerate(dfe._encode_columns):
+             if df[colname].dtype == "object":
+                 encode_dict = dfe.encoding_list[num]
+                 df[colname] = df[colname].apply(
+                     dfe._code_transformation_to, dictionary_list=encode_dict
+                 )

          # PREDICTION
-         y = df_aux.to_numpy() @ w
+         y = df.to_numpy() @ w

          # Categorical column
          if quick_encoder != None:
@@ -67,18 +79,18 @@ class SimulationEngine(FeatureSelection):
              y = [encoding_dic[item] for item in y]
          # Numeric column
          else:
-             # scale output
-             i = numeric_dict[column]
-             y += 1
-             y /= 2
-             y = y * self.scaler.values[1][i]
+             if self.use_scaler:
+                 # scale output
+                 y += 1
+                 y /= 2
+                 y = y * (self.df[column].max() - self.df[column].min())

-         return y
+         return y[:]

      def fit(self, **kwargs) -> None:

          # We run the feature selection algorithm
-         self.get_digraph(self.df, self.n_importances)
+         self.get_digraph(self.df, self.n_importances, self.use_scaler)

      def _clean_data(self, df: DataFrame) -> DataFrame:

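A short sketch of the revised `SimulationEngine` interface, assuming an already-encoded DataFrame `df` with a target column "y"; `predict` no longer takes a sample-size argument `n`, and rescaling is now opt-in through `use_scaler`:

engine = SimulationEngine(df, n_importances=3, use_scaler=True)
engine.fit()                              # runs the feature-selection pass (get_digraph)
y_hat = engine.predict(df, column="y")    # predicts for every row of the supplied frame
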
likelihood/tools/numeric_tools.py CHANGED
@@ -1,14 +1,14 @@
  from typing import Dict

  import numpy as np
+ import pandas as pd
  from numpy import arange, array, ndarray, random
  from numpy.linalg import solve
  from pandas.core.frame import DataFrame

- # -------------------------------------------------------------------------
-

- def xi_corr(df: DataFrame) -> DataFrame:
+ # -------------------------------------------------------------------------
+ def xi_corr(df: pd.DataFrame) -> pd.DataFrame:
      """Calculate new coefficient of correlation for all pairs of columns in a `DataFrame`.

      Parameters
@@ -19,11 +19,15 @@ def xi_corr(df: DataFrame) -> DataFrame:
      Returns
      -------
      `DataFrame`
-         A dataframe with variable names as keys and their corresponding
-         correlation coefficients as values.
+         A square dataframe with variable names as both index and columns,
+         containing their corresponding correlation coefficients.
      """
-     correlations = {}
-     columns = df.columns
+
+     columns = df.select_dtypes(include="number").columns
+     n = len(columns)
+
+     # Initialize a square matrix for the correlations
+     correlations = pd.DataFrame(1.0, index=columns, columns=columns)

      for i, col1 in enumerate(columns):
          for j, col2 in enumerate(columns):
@@ -32,9 +36,9 @@ def xi_corr(df: DataFrame) -> DataFrame:
              y = df[col2].values

              correlation = xicor(x, y)
-             correlations[(col1, col2)] = round(correlation, 8)
-     # dictionary to dataframe
-     correlations = DataFrame(list(correlations.items()), columns=["Variables", "Xi Correlation"])
+             correlations.loc[col1, col2] = round(correlation, 8)
+             correlations.loc[col2, col1] = round(correlation, 8)  # Mirror the correlation
+
      return correlations


@@ -51,10 +55,11 @@ def xi_corr(df: DataFrame) -> DataFrame:
      """


- def xicor(X: ndarray, Y: ndarray, ties: bool = True) -> float:
-     """Calculate a new coefficient of correlation between two variables.
+ def xicor(X: np.ndarray, Y: np.ndarray, ties: bool = True, random_seed: int = None) -> float:
+     """
+     Calculate a generalized coefficient of correlation between two variables.

-     The new coefficient of correlation is a generalization of Pearson's correlation.
+     This coefficient is an extension of Pearson's correlation, accounting for ties with optional randomization.

      Parameters
      ----------
@@ -62,30 +67,52 @@ def xicor(X: ndarray, Y: ndarray, ties: bool = True) -> float:
          The first variable to be correlated. Must have at least one dimension.
      Y : `np.ndarray`
          The second variable to be correlated. Must have at least one dimension.
+     ties : bool
+         Whether to handle ties using randomization.
+     random_seed : int, optional
+         Seed for the random number generator for reproducibility.

      Returns
      -------
      xi : `float`
          The estimated value of the new coefficient of correlation.
      """
-     random.seed(42)
+
+     # Early return for identical arrays
+     if np.array_equal(X, Y):
+         return 1.0
+
      n = len(X)
-     order = array([i[0] for i in sorted(enumerate(X), key=lambda x: x[1])])
+
+     # Early return for cases with less than 2 elements
+     if n < 2:
+         return 0.0
+
+     # Flatten the input arrays if they are multidimensional
+     X = X.flatten()
+     Y = Y.flatten()
+
+     # Get the sorted order of X
+     order = np.argsort(X)
+
      if ties:
-         l = array([sum(y >= Y[order]) for y in Y[order]])
-         r = l.copy()
-         for j in range(n):
-             if sum([r[j] == r[i] for i in range(n)]) > 1:
-                 tie_index = array([r[j] == r[i] for i in range(n)])
-                 r[tie_index] = random.choice(
-                     r[tie_index] - arange(0, sum([r[j] == r[i] for i in range(n)])),
-                     sum(tie_index),
-                     replace=False,
-                 )
-         return 1 - n * sum(abs(r[1:] - r[: n - 1])) / (2 * sum(l * (n - l)))
+         np.random.seed(random_seed)  # Set seed for reproducibility if needed
+         ranks = np.argsort(np.argsort(Y[order]))  # Get ranks
+         unique_ranks, counts = np.unique(ranks, return_counts=True)
+
+         # Adjust ranks for ties by shuffling
+         for rank, count in zip(unique_ranks, counts):
+             if count > 1:
+                 tie_indices = np.where(ranks == rank)[0]
+                 np.random.shuffle(ranks[tie_indices])  # Randomize ties
+
+         cumulative_counts = np.array([np.sum(y >= Y[order]) for y in Y[order]])
+         return 1 - n * np.sum(np.abs(ranks[1:] - ranks[: n - 1])) / (
+             2 * np.sum(cumulative_counts * (n - cumulative_counts))
+         )
      else:
-         r = array([sum(y >= Y[order]) for y in Y[order]])
-         return 1 - 3 * sum(abs(r[1:] - r[: n - 1])) / (n**2 - 1)
+         ranks = np.argsort(np.argsort(Y[order]))  # Get ranks without randomization
+         return 1 - 3 * np.sum(np.abs(ranks[1:] - ranks[: n - 1])) / (n**2 - 1)


  # -------------------------------------------------------------------------
@@ -257,8 +284,8 @@ if __name__ == "__main__":
      print("New correlation coefficient test")
      X = np.random.rand(100, 1)
      Y = X * X
-     print("coefficient for Y = X * X : ", xicor(X, Y))
-
+     print("coefficient for Y = X * X : ", xicor(X, Y, False))
+     df["index"] = ["A", "B", "C", "D"]
      print("New correlation coefficient test for pandas DataFrame")
      values_df = xi_corr(df)
      breakpoint()
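
As a sanity check on the reworked `xicor`, the coefficient should sit near 1 for a noiseless monotone relationship and near 0 for independent noise; `random_seed` only matters when `ties=True`. A small sketch, assuming the functions are importable from `likelihood.tools.numeric_tools`:

import numpy as np
from likelihood.tools.numeric_tools import xicor

x = np.random.rand(200)
print(xicor(x, x**2, ties=False))                 # close to 1: y is a monotone function of x
print(xicor(x, np.random.rand(200), ties=False))  # close to 0: unrelated noise
print(xicor(x, x**2, ties=True, random_seed=0))   # reproducible handling of any ties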
likelihood/tools/tools.py CHANGED
@@ -640,14 +640,14 @@ def cal_average(y: ndarray, alpha: float = 1):
  class DataScaler:
      """numpy array `scaler` and `rescaler`"""

-     __slots__ = ["dataset_", "_n", "data_scaled", "values", "transpose"]
+     __slots__ = ["dataset_", "_n", "data_scaled", "values", "transpose", "inv_fitting"]

      def __init__(self, dataset: ndarray, n: int = 1) -> None:
          """Initializes the parameters required for scaling the data"""
          self.dataset_ = dataset.copy()
          self._n = n

-     def rescale(self) -> ndarray:
+     def rescale(self, dataset_: ndarray | None = None) -> ndarray:
          """Perform a standard rescaling of the data

          Returns
@@ -655,11 +655,26 @@ class DataScaler:
          data_scaled : `np.array`
              An array containing the scaled data.
          """
+         if isinstance(dataset_, ndarray):
+             data_scaled = np.copy(dataset_)
+             mu = self.values[0]
+             sigma = self.values[1]
+             f = self.values[2]
+             data_scaled = data_scaled.reshape((self.dataset_.shape[0], -1))
+             for i in range(self.dataset_.shape[0]):
+                 if self._n != None:
+                     poly = f[i](self.inv_fitting[i](data_scaled[i]))
+                     data_scaled[i] += -poly
+                 data_scaled[i] = 2 * ((data_scaled[i] - mu[i]) / sigma[i]) - 1
+             return data_scaled
+         else:
+             self.data_scaled = np.copy(self.dataset_.copy())

          mu = []
          sigma = []
          fitting = []
-         self.data_scaled = np.copy(self.dataset_)
+         self.inv_fitting = []
+
          try:
              xaxis = range(self.dataset_.shape[1])
          except:
@@ -675,12 +690,15 @@ class DataScaler:
          for i in range(self.dataset_.shape[0]):
              if self._n != None:
                  fit = np.polyfit(xaxis, self.dataset_[i, :], self._n)
+                 inv_fit = np.polyfit(self.dataset_[i, :], xaxis, self._n)
                  f = np.poly1d(fit)
                  poly = f(xaxis)
                  fitting.append(f)
+                 self.inv_fitting.append(inv_fit)
                  self.data_scaled[i, :] += -poly
              else:
                  fitting.append(0.0)
+                 self.inv_fitting.append(0.0)
              mu.append(np.min(self.data_scaled[i, :]))
              if np.max(self.data_scaled[i, :]) != 0:
                  sigma.append(np.max(self.data_scaled[i, :]) - mu[i])
@@ -1064,7 +1082,7 @@ class FeatureSelection:
          self.all_features_imp_graph: List[Tuple] = []
          self.w_dict = dict()

-     def get_digraph(self, dataset: DataFrame, n_importances: int) -> str:
+     def get_digraph(self, dataset: DataFrame, n_importances: int, use_scaler: bool = False) -> str:
          """
          Get directed graph showing importance of features.

@@ -1092,10 +1110,11 @@ class FeatureSelection:
          feature_string += column + "; "

          numeric_df = curr_dataset.select_dtypes(include="number")
-         self.scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
-         numeric_scaled = self.scaler.rescale()
-         numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
-         curr_dataset[numeric_df.columns] = numeric_df
+         if use_scaler:
+             self.scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
+             numeric_scaled = self.scaler.rescale()
+             numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
+             curr_dataset[numeric_df.columns] = numeric_df

          # We construct dictionary to save index for scaling
          numeric_dict = dict(zip(list(numeric_df.columns), range(len(list(numeric_df.columns)))))
@@ -1119,7 +1138,6 @@ class FeatureSelection:
          dfe = DataFrameEncoder(X_aux)
          encoded_df = dfe.encode(save_mode=False)
          # We train
-
          Model.fit(encoded_df.to_numpy().T, Y.to_numpy().T)
          # We obtain importance
          importance = Model.get_importances()
@@ -1202,7 +1220,7 @@ class FeatureSelection:


  def check_nan_inf(df: DataFrame) -> DataFrame:
-     """Check for `NaN` and `Inf` values in the `DataFrame`. If any are found removes them."""
+     """Checks for `NaN` and `Inf` values in the `DataFrame`. If any are found they will be removed."""
      nan_values = df.isnull().values.any()
      count = np.isinf(df.select_dtypes(include="number")).values.sum()
      print("There are null values : ", nan_values)
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: likelihood
- Version: 1.2.16
+ Version: 1.2.18
  Summary: A package that performs the maximum likelihood algorithm.
  Home-page: https://github.com/jzsmoreno/likelihood/
  Author: J. A. Moreno-Guerra
@@ -28,8 +28,9 @@ Requires-Dist: corner
  Provides-Extra: full
  Requires-Dist: networkx ; extra == 'full'
  Requires-Dist: pyvis ; extra == 'full'
- Requires-Dist: tensorflow ; extra == 'full'
+ Requires-Dist: tensorflow ==2.15.0 ; extra == 'full'
  Requires-Dist: keras-tuner ; extra == 'full'
+ Requires-Dist: scikit-learn ; extra == 'full'

  ![likelihood](https://raw.githubusercontent.com/RodolfoFerro/likelihood/main/likelihood.png)

@@ -2,17 +2,18 @@ likelihood/__init__.py,sha256=5C0hapdsk85XZhN_rssRAEFpkRRuKNtj6cyRbqD2_gM,994
  likelihood/main.py,sha256=prqT9egu3B2rcbsVMqYxuosNbe7NhDBCmmZtQ21aSlQ,8591
  likelihood/graph/__init__.py,sha256=6TuFDfmXTwpLyHl7_KqBfdzW6zqHjGzIFvymjFPlvjI,21
  likelihood/graph/graph.py,sha256=wKJqgxXiSbnvzyW3SjhQVrqp00yKMHf3ph6CIDNVhNM,2891
+ likelihood/graph/nn.py,sha256=XqTnAHzXP0jSdLd0IOFjVZUZTcQU-XppsZLmJrG2GMo,12372
  likelihood/models/__init__.py,sha256=e6nB4w47w0Q9DrAFeP3OcUgcoHOtf7Il4mBhgf4AARg,52
  likelihood/models/regression.py,sha256=9cakyGlJCEO6WfpoKLh3GxdXQeQp7cUvJIkQ5odT0TA,9404
- likelihood/models/simulation.py,sha256=KYdVjt2PaLo04g8kBsRGQJ5AKMBaQVUH3orZE_TXTy8,2960
+ likelihood/models/simulation.py,sha256=mdgQPg_LEY5svPaF4TFv-DoQRE2oP2ig_uXnwINtewM,4039
  likelihood/models/utils.py,sha256=VtEj07lV-GRoWraQgpfjU0jTt1Ntf9MXgYwe6XYQh20,1552
  likelihood/models/deep/__init__.py,sha256=-KIPippVaMqgG8mEgYjNxYQdqOUcFhUuKhbVe8TTCfo,28
- likelihood/models/deep/autoencoders.py,sha256=wgra29Wjyh4KOMdOVEhWLtfqTFvjKeOVf1GthomB7PE,8857
+ likelihood/models/deep/autoencoders.py,sha256=lUvFQ7lbjvIPR_IKFnK5VCrSa419P5dOaTL3qSHntJk,9623
  likelihood/tools/__init__.py,sha256=MCjsCWfBNKE2uMN0VizDN1uFzZ_md0X2WZeBdWhrCR8,50
- likelihood/tools/numeric_tools.py,sha256=EQD959b56aovi4PI_og0BITgyUONgDUU9LG9YqNgX70,7554
- likelihood/tools/tools.py,sha256=B1_xRZeO2fUSCVUvdkhlB6zO9dGzIglSknydLv7VCEc,41627
- likelihood-1.2.16.dist-info/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
- likelihood-1.2.16.dist-info/METADATA,sha256=5htpwpnzwy5Y0sU103sm_K8Yt5xhMjjmLf1a3rx_40s,2463
- likelihood-1.2.16.dist-info/WHEEL,sha256=mguMlWGMX-VHnMpKOjjQidIo1ssRlCFu4a4mBpz1s2M,91
- likelihood-1.2.16.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
- likelihood-1.2.16.dist-info/RECORD,,
+ likelihood/tools/numeric_tools.py,sha256=cPTPgdww2ofxfyhJDomqvtXDgsSDs9iRQ7GHLt5Vl6M,8457
+ likelihood/tools/tools.py,sha256=O39aPxTNsaBVSJFIkNsUESNSkfG4C7GG77wcR51a8IQ,42543
+ likelihood-1.2.18.dist-info/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
+ likelihood-1.2.18.dist-info/METADATA,sha256=8nAjAwwqCDw8K9IBzKG2cgBU5DOLAA-N-RIlr02eyjU,2518
+ likelihood-1.2.18.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+ likelihood-1.2.18.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
+ likelihood-1.2.18.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (70.1.1)
+ Generator: setuptools (75.3.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
