likelihood 1.2.21.tar.gz → 1.2.23.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {likelihood-1.2.21 → likelihood-1.2.23}/PKG-INFO +1 -1
  2. {likelihood-1.2.21 → likelihood-1.2.23}/likelihood/graph/graph.py +17 -0
  3. {likelihood-1.2.21 → likelihood-1.2.23}/likelihood/graph/nn.py +13 -11
  4. likelihood-1.2.23/likelihood/models/simulation.py +223 -0
  5. {likelihood-1.2.21 → likelihood-1.2.23}/likelihood/tools/tools.py +307 -261
  6. {likelihood-1.2.21 → likelihood-1.2.23}/likelihood.egg-info/PKG-INFO +1 -1
  7. likelihood-1.2.21/likelihood/models/simulation.py +0 -103
  8. {likelihood-1.2.21 → likelihood-1.2.23}/LICENSE +0 -0
  9. {likelihood-1.2.21 → likelihood-1.2.23}/README.md +0 -0
  10. {likelihood-1.2.21 → likelihood-1.2.23}/likelihood/__init__.py +0 -0
  11. {likelihood-1.2.21 → likelihood-1.2.23}/likelihood/graph/__init__.py +0 -0
  12. {likelihood-1.2.21 → likelihood-1.2.23}/likelihood/main.py +0 -0
  13. {likelihood-1.2.21 → likelihood-1.2.23}/likelihood/models/__init__.py +0 -0
  14. {likelihood-1.2.21 → likelihood-1.2.23}/likelihood/models/deep/__init__.py +0 -0
  15. {likelihood-1.2.21 → likelihood-1.2.23}/likelihood/models/deep/autoencoders.py +0 -0
  16. {likelihood-1.2.21 → likelihood-1.2.23}/likelihood/models/regression.py +0 -0
  17. {likelihood-1.2.21 → likelihood-1.2.23}/likelihood/models/utils.py +0 -0
  18. {likelihood-1.2.21 → likelihood-1.2.23}/likelihood/tools/__init__.py +0 -0
  19. {likelihood-1.2.21 → likelihood-1.2.23}/likelihood/tools/numeric_tools.py +0 -0
  20. {likelihood-1.2.21 → likelihood-1.2.23}/likelihood.egg-info/SOURCES.txt +0 -0
  21. {likelihood-1.2.21 → likelihood-1.2.23}/likelihood.egg-info/dependency_links.txt +0 -0
  22. {likelihood-1.2.21 → likelihood-1.2.23}/likelihood.egg-info/requires.txt +0 -0
  23. {likelihood-1.2.21 → likelihood-1.2.23}/likelihood.egg-info/top_level.txt +0 -0
  24. {likelihood-1.2.21 → likelihood-1.2.23}/setup.cfg +0 -0
  25. {likelihood-1.2.21 → likelihood-1.2.23}/setup.py +0 -0
{likelihood-1.2.21 → likelihood-1.2.23}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: likelihood
-Version: 1.2.21
+Version: 1.2.23
 Summary: A package that performs the maximum likelihood algorithm.
 Home-page: https://github.com/jzsmoreno/likelihood/
 Author: J. A. Moreno-Guerra
{likelihood-1.2.21 → likelihood-1.2.23}/likelihood/graph/graph.py
@@ -74,3 +74,20 @@ class DynamicGraph(FeatureSelection):
         nx_graph.add_edges_from([(source, target, edge)])

         return nx_graph
+
+
+# -------------------------------------------------------------------------
+if __name__ == "__main__":
+    import numpy as np
+    import pandas as pd
+
+    # Generate data
+    x = np.random.rand(3, 100)
+    y = 0.1 * x[0, :] + 0.4 * x[1, :] + 0.5 * x[2, :] + 0.1
+    # Create a DataFrame
+    df = pd.DataFrame(x.T, columns=["x1", "x2", "x3"])
+    df["y"] = y
+    # Instantiate DynamicGraph
+    fs = DynamicGraph(df, n_importances=2)
+    print(fs.fit())
+    fs.draw()
{likelihood-1.2.21 → likelihood-1.2.23}/likelihood/graph/nn.py
@@ -1,9 +1,14 @@
+import os
+
+os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
+import logging
 import warnings
 from typing import List, Tuple

 import numpy as np
 import pandas as pd
 import tensorflow as tf
+from IPython.display import clear_output
 from numpy import ndarray
 from pandas.core.frame import DataFrame
 from sklearn.metrics import f1_score
@@ -11,6 +16,10 @@ from sklearn.model_selection import train_test_split

 from likelihood.tools import generate_feature_yaml

+logging.getLogger("tensorflow").setLevel(logging.ERROR)
+
+tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
+

 def compare_similarity(arr1: ndarray, arr2: ndarray) -> int:
     """Compares the similarity between two arrays of categories.
@@ -180,14 +189,6 @@ class VanillaGNN(tf.keras.Model):
         self.gnn2 = VanillaGNNLayer(self.dim_h, self.dim_h)
         self.gnn3 = VanillaGNNLayer(self.dim_h, self.dim_out)

-    def build(self, input_shape):
-        super(VanillaGNN, self).build(input_shape)
-        dummy_input = tf.keras.Input(shape=input_shape[1:])
-        dummy_adjacency = tf.sparse.SparseTensor(
-            indices=[[0, 0]], values=[1.0], dense_shape=[input_shape[0], input_shape[0]]
-        )
-        _ = self(dummy_input, dummy_adjacency)
-
     def call(self, x, adjacency):
         h = self.gnn1(x, adjacency)
         h = tf.nn.tanh(h)
@@ -289,10 +290,11 @@ class VanillaGNN(tf.keras.Model):
             train_losses.append(train_loss)
             train_f1_scores.append(train_f1)

-            if epoch % 2 == 0:
+            if epoch % 5 == 0:
                 val_loss, val_f1 = self.evaluate(X_test, adjacency_test, y_test)
                 val_losses.append(val_loss)
                 val_f1_scores.append(val_f1)
+                clear_output(wait=True)
                 print(
                     f"Epoch {epoch:>3} | Train Loss: {train_loss:.3f} | Train F1: {train_f1:.3f} | Val Loss: {val_loss:.3f} | Val F1: {val_f1:.3f}"
                 )
@@ -327,9 +329,9 @@ if __name__ == "__main__":
     model = VanillaGNN(dim_in=data.x.shape[1], dim_h=8, dim_out=len(iris_df["species"].unique()))
     print("Before training F1:", model.test(data))
     model.fit(data, epochs=200, batch_size=32, test_size=0.5)
-    model.save("./best_model.keras")
+    model.save("./best_model", save_format="tf")
     print("After training F1:", model.test(data))
-    best_model = tf.keras.models.load_model("./best_model.keras")
+    best_model = tf.keras.models.load_model("./best_model")

     print("After loading F1:", best_model.test(data))
     df_results = pd.DataFrame()
likelihood-1.2.23/likelihood/models/simulation.py (new file)
@@ -0,0 +1,223 @@
+import pickle
+import warnings
+from typing import List, Tuple, Union
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from numpy import ndarray
+from pandas.core.frame import DataFrame
+
+from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, cdf, check_nan_inf
+
+# Suppress RankWarning
+warnings.simplefilter("ignore", np.RankWarning)
+
+
+# --------------------------------------------------------------------------------------------------------------------------------------
+def categories_by_quartile(df: DataFrame, column: str) -> Tuple[str, str]:
+    # Count the frequency of each category in the column
+    freq = df[column].value_counts()
+
+    # Calculate the 25th percentile (Q1) and 75th percentile (Q3)
+    q1 = freq.quantile(0.25)
+    q3 = freq.quantile(0.75)
+
+    # Filter categories that are below the 25th percentile and above the 75th percentile
+    least_frequent = freq[freq <= q1]
+    most_frequent = freq[freq >= q3]
+
+    # Get the least frequent category (25th percentile) and the most frequent category (75th percentile)
+    least_frequent_category = least_frequent.idxmin() if not least_frequent.empty else None
+    most_frequent_category = most_frequent.idxmax() if not most_frequent.empty else None
+
+    return least_frequent_category, most_frequent_category
+
+
+class SimulationEngine(FeatureSelection):
+    """
+    This class implements a predictive model that utilizes multiple linear regression for numerical target variables
+    and multiple logistic regression for categorical target variables.
+
+    The class provides methods for training the model on a given dataset, making predictions,
+    and evaluating the model's performance.
+
+    Key features:
+    - Supports both numerical and categorical target variables, automatically selecting the appropriate regression method.
+    - Includes methods for data preprocessing, model fitting, prediction, and evaluation metrics.
+    - Designed to be flexible and user-friendly, allowing for easy integration with various datasets.
+
+    Usage:
+    - Instantiate the class with the training data and target variable.
+    - Call the fit method to train the model.
+    - Use the predict method to generate predictions on new data.
+    - Evaluate the model using built-in metrics for accuracy and error.
+
+    This class is suitable for applications in data analysis and machine learning, enabling users to leverage regression techniques
+    for both numerical and categorical outcomes efficiently.
+    """
+
+    def __init__(self, use_scaler: bool = False, **kwargs):
+
+        self.df = pd.DataFrame()
+        self.n_importances = None
+        self.use_scaler = use_scaler
+        self.proba_dict = {}
+
+        super().__init__(**kwargs)
+
+    def predict(self, df: DataFrame, column: str) -> ndarray | list:
+        # Let us assign the dictionary entries corresponding to the column
+        w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]
+
+        df = df[names_cols].copy()
+        # Change the scale of the dataframe
+        dataset = self.df.copy()
+        dataset.drop(columns=column, inplace=True)
+        numeric_df = dataset.select_dtypes(include="number")
+        if self.use_scaler:
+            scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
+            _ = scaler.rescale()
+            dataset_ = df.copy()
+            numeric_df = dataset_.select_dtypes(include="number")
+            numeric_scaled = scaler.rescale(dataset_=numeric_df.to_numpy())
+            numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
+            for col in numeric_df.columns:
+                df[col] = numeric_df[col].values
+
+        # Encoding the datadrame
+        for num, colname in enumerate(dfe._encode_columns):
+            if df[colname].dtype == "object":
+                encode_dict = dfe.encoding_list[num]
+                df[colname] = df[colname].apply(
+                    dfe._code_transformation_to, dictionary_list=encode_dict
+                )
+
+        # PREDICTION
+        y = df.to_numpy() @ w
+
+        # Categorical column
+        if quick_encoder != None:
+
+            one_hot = OneHotEncoder()
+            y = one_hot.decode(y)
+            encoding_dic = quick_encoder.decoding_list[0]
+            y = [encoding_dic[item] for item in y]
+        # Numeric column
+        else:
+            if self.use_scaler:
+                # scale output
+                y += 1
+                y /= 2
+                y = y * (self.df[column].max() - self.df[column].min())
+
+        return y[:]
+
+    def _encode(self, df: DataFrame) -> ndarray | list:
+        df = df.copy()
+        column = df.columns[0]
+        frec = df[column].value_counts() / len(df)
+        df.loc[:, "frec"] = df[column].map(frec)
+        df.sort_values("frec", inplace=True)
+        keys = df[column].to_list()
+        values = df["frec"].to_list()
+        return dict(zip(keys, values))
+
+    def fit(self, df: DataFrame, n_importances: int, **kwargs) -> None:
+        self.df = df
+        self.n_importances = n_importances
+        # We run the feature selection algorithm
+        self.get_digraph(self.df, self.n_importances, self.use_scaler)
+        proba_dict_keys = list(self.w_dict.keys())
+        self.proba_dict = dict(zip(proba_dict_keys, [i for i in range(len(proba_dict_keys))]))
+        for key in proba_dict_keys:
+            x = (
+                self.df[key].values,
+                None if self.df[key].dtype != "object" else self._encode(self.df[[key]]),
+            )
+            poly = kwargs.get("poly", 9)
+            plot = kwargs.get("plot", False)
+            if not x[1]:
+                media = self.df[key].mean()
+                desviacion_estandar = self.df[key].std()
+                cota_inferior = media - 1.5 * desviacion_estandar
+                cota_superior = media + 1.5 * desviacion_estandar
+                if plot:
+                    print(f"Cumulative Distribution Function ({key})")
+                f, cdf_, ox = cdf(x[0].flatten(), poly=poly, plot=plot)
+            else:
+                f, ox = None, None
+                least_frequent_category, most_frequent_category = categories_by_quartile(
+                    self.df[[key]], key
+                )
+                cota_inferior = x[1].get(least_frequent_category, 0)
+                cota_superior = x[1].get(most_frequent_category, 0)
+            self.proba_dict[key] = (
+                f if f else None,
+                x[1],
+                (np.mean(np.abs(np.diff(ox))) / 2.0 if isinstance(ox, np.ndarray) else None),
+                f(cota_inferior) if f else cota_inferior,
+                f(cota_superior) if f else cota_superior,
+            )
+
+    def get_proba(self, value: Union[Union[float, int], str] | list, colname: str) -> List[float]:
+        value = (
+            value
+            if isinstance(value, list)
+            else value.flatten().tolist() if isinstance(value, np.ndarray) else [value]
+        )
+        return [
+            (
+                self.proba_dict[colname][0](val)
+                - self.proba_dict[colname][0](val - self.proba_dict[colname][2])
+                if (isinstance(val, float) or isinstance(val, int))
+                else self.proba_dict[colname][1].get(val, 0)
+            )
+            for val in value
+        ]
+
+    def pred_outliers(self, value: Union[Union[float, int], str] | list, colname: str) -> List[str]:
+        return [
+            (
+                "inlier"
+                if (self.proba_dict[colname][3] < val < self.proba_dict[colname][4])
+                else "outlier"
+            )
+            for val in self.get_proba(value, colname)
+        ]
+
+    def _clean_data(self, df: DataFrame) -> DataFrame:
+
+        df.replace([np.inf, -np.inf], np.nan, inplace=True)
+        df.replace(" ", np.nan, inplace=True)
+        df = check_nan_inf(df)
+        df = df.reset_index()
+        df = df.drop(columns=["index"])
+
+        return df
+
+    def save(self, filename: str = "./simulation_model") -> None:
+        """
+        Save the state of the SimulationEngine to a file.
+
+        Parameters:
+        filename (str): The name of the file where the object will be saved.
+        """
+        filename = filename if filename.endswith(".pkl") else filename + ".pkl"
+        with open(filename, "wb") as f:
+            pickle.dump(self, f)
+
+    @staticmethod
+    def load(filename: str = "./simulation_model"):
+        """
+        Load the state of a SimulationEngine from a file.
+
+        Parameters:
+        filename (str): The name of the file containing the saved object.
+
+        Returns:
+        SimulationEngine: A new instance of SimulationEngine with the loaded state.
+        """
+        filename = filename + ".pkl" if not filename.endswith(".pkl") else filename
+        with open(filename, "rb") as f:
+            return pickle.load(f)
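
For context on the new SimulationEngine module, a minimal usage sketch follows. It is not taken from the package documentation: the import path (likelihood.models.simulation), the synthetic data, and the column names are assumptions; only the method signatures (fit, predict, get_proba, pred_outliers, save, load) come from the diff above.

# Hypothetical usage sketch for SimulationEngine; import path and data are assumed.
import numpy as np
import pandas as pd

from likelihood.models.simulation import SimulationEngine  # assumed import path

# Small synthetic dataset with numeric features and a numeric target.
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.random((100, 3)), columns=["x1", "x2", "x3"])
df["y"] = 0.1 * df["x1"] + 0.4 * df["x2"] + 0.5 * df["x3"]

engine = SimulationEngine(use_scaler=True)
engine.fit(df, n_importances=2)                      # feature selection plus per-column weights

y_pred = engine.predict(df, column="y")              # predictions for the chosen target column
proba = engine.get_proba([0.5], colname="x1")        # approximate probability around a value
labels = engine.pred_outliers([0.5], colname="x1")   # "inlier" / "outlier" per value

engine.save("./simulation_model")                    # pickled to ./simulation_model.pkl
restored = SimulationEngine.load("./simulation_model")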