likelihood 1.2.22.tar.gz → 1.2.24.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {likelihood-1.2.22 → likelihood-1.2.24}/PKG-INFO +1 -1
  2. {likelihood-1.2.22 → likelihood-1.2.24}/likelihood/graph/graph.py +17 -0
  3. {likelihood-1.2.22 → likelihood-1.2.24}/likelihood/graph/nn.py +6 -5
  4. likelihood-1.2.24/likelihood/models/hmm.py +163 -0
  5. likelihood-1.2.24/likelihood/models/simulation.py +222 -0
  6. {likelihood-1.2.22 → likelihood-1.2.24}/likelihood/tools/tools.py +307 -261
  7. {likelihood-1.2.22 → likelihood-1.2.24}/likelihood.egg-info/PKG-INFO +1 -1
  8. {likelihood-1.2.22 → likelihood-1.2.24}/likelihood.egg-info/SOURCES.txt +1 -0
  9. likelihood-1.2.22/likelihood/models/simulation.py +0 -103
  10. {likelihood-1.2.22 → likelihood-1.2.24}/LICENSE +0 -0
  11. {likelihood-1.2.22 → likelihood-1.2.24}/README.md +0 -0
  12. {likelihood-1.2.22 → likelihood-1.2.24}/likelihood/__init__.py +0 -0
  13. {likelihood-1.2.22 → likelihood-1.2.24}/likelihood/graph/__init__.py +0 -0
  14. {likelihood-1.2.22 → likelihood-1.2.24}/likelihood/main.py +0 -0
  15. {likelihood-1.2.22 → likelihood-1.2.24}/likelihood/models/__init__.py +0 -0
  16. {likelihood-1.2.22 → likelihood-1.2.24}/likelihood/models/deep/__init__.py +0 -0
  17. {likelihood-1.2.22 → likelihood-1.2.24}/likelihood/models/deep/autoencoders.py +0 -0
  18. {likelihood-1.2.22 → likelihood-1.2.24}/likelihood/models/regression.py +0 -0
  19. {likelihood-1.2.22 → likelihood-1.2.24}/likelihood/models/utils.py +0 -0
  20. {likelihood-1.2.22 → likelihood-1.2.24}/likelihood/tools/__init__.py +0 -0
  21. {likelihood-1.2.22 → likelihood-1.2.24}/likelihood/tools/numeric_tools.py +0 -0
  22. {likelihood-1.2.22 → likelihood-1.2.24}/likelihood.egg-info/dependency_links.txt +0 -0
  23. {likelihood-1.2.22 → likelihood-1.2.24}/likelihood.egg-info/requires.txt +0 -0
  24. {likelihood-1.2.22 → likelihood-1.2.24}/likelihood.egg-info/top_level.txt +0 -0
  25. {likelihood-1.2.22 → likelihood-1.2.24}/setup.cfg +0 -0
  26. {likelihood-1.2.22 → likelihood-1.2.24}/setup.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: likelihood
-Version: 1.2.22
+Version: 1.2.24
 Summary: A package that performs the maximum likelihood algorithm.
 Home-page: https://github.com/jzsmoreno/likelihood/
 Author: J. A. Moreno-Guerra
@@ -74,3 +74,20 @@ class DynamicGraph(FeatureSelection):
             nx_graph.add_edges_from([(source, target, edge)])

         return nx_graph
+
+
+# -------------------------------------------------------------------------
+if __name__ == "__main__":
+    import numpy as np
+    import pandas as pd
+
+    # Generate data
+    x = np.random.rand(3, 100)
+    y = 0.1 * x[0, :] + 0.4 * x[1, :] + 0.5 * x[2, :] + 0.1
+    # Create a DataFrame
+    df = pd.DataFrame(x.T, columns=["x1", "x2", "x3"])
+    df["y"] = y
+    # Instantiate DynamicGraph
+    fs = DynamicGraph(df, n_importances=2)
+    print(fs.fit())
+    fs.draw()
@@ -1,6 +1,8 @@
 import os

 os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
+# Suppress TensorFlow INFO logs
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
 import logging
 import warnings
 from typing import List, Tuple
@@ -9,7 +11,6 @@ import numpy as np
 import pandas as pd
 import tensorflow as tf
 from IPython.display import clear_output
-from numpy import ndarray
 from pandas.core.frame import DataFrame
 from sklearn.metrics import f1_score
 from sklearn.model_selection import train_test_split
@@ -21,7 +22,7 @@ logging.getLogger("tensorflow").setLevel(logging.ERROR)
 tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)


-def compare_similarity(arr1: ndarray, arr2: ndarray) -> int:
+def compare_similarity(arr1: np.ndarray, arr2: np.ndarray) -> int:
     """Compares the similarity between two arrays of categories.

     Parameters
@@ -44,9 +45,9 @@ def compare_similarity(arr1: ndarray, arr2: ndarray) -> int:
     return count


-def cal_adjency_matrix(
+def cal_adjacency_matrix(
     df: DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
-) -> Tuple[dict, ndarray]:
+) -> Tuple[dict, np.ndarray]:
     """Calculates the adjacency matrix for a given DataFrame.
     The adjacency matrix is a matrix that represents the similarity between each pair of categories.
     The similarity is calculated using the `compare_similarity` function.
@@ -133,7 +134,7 @@ class Data:
         target: str | None = None,
         exclude_subset: List[str] = [],
     ):
-        _, adjacency = cal_adjency_matrix(df, exclude_subset=exclude_subset, sparse=True)
+        _, adjacency = cal_adjacency_matrix(df, exclude_subset=exclude_subset, sparse=True)
         if target is not None:
             X = df.drop(columns=[target] + exclude_subset)
         else:
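Note that `likelihood/graph/nn.py` renames `cal_adjency_matrix` to `cal_adjacency_matrix`, so external callers of the old name break on upgrade. A minimal sketch of a call against the new name, using only the signature shown above; the DataFrame and its columns are hypothetical:

import pandas as pd
from likelihood.graph.nn import cal_adjacency_matrix

# Hypothetical categorical data; only the function name and the
# (df, exclude_subset, sparse) parameters come from the diff above.
df = pd.DataFrame({"color": ["red", "blue", "red"], "shape": ["circle", "square", "circle"]})
mapping, adjacency = cal_adjacency_matrix(df, exclude_subset=[], sparse=True)
print(adjacency.shape)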
@@ -0,0 +1,163 @@
+import logging
+import os
+import pickle
+from typing import List, Tuple
+
+import numpy as np
+from IPython.display import clear_output
+
+
+class HMM:
+    def __init__(self, n_states: int, n_observations: int):
+        self.n_states = n_states
+        self.n_observations = n_observations
+
+        # Initialize parameters with random values
+        self.pi = np.random.dirichlet(np.ones(n_states), size=1)[0]
+        self.A = np.random.dirichlet(np.ones(n_states), size=n_states)
+        self.B = np.random.dirichlet(np.ones(n_observations), size=n_states)
+
+    def save_model(self, filename: str = "./hmm") -> None:
+        filename = filename if filename.endswith(".pkl") else filename + ".pkl"
+        with open(filename, "wb") as f:
+            pickle.dump(self, f)
+
+    @staticmethod
+    def load_model(filename: str = "./hmm") -> "HMM":
+        filename = filename + ".pkl" if not filename.endswith(".pkl") else filename
+        with open(filename, "rb") as f:
+            return pickle.load(f)
+
+    def forward(self, sequence: List[int]) -> np.ndarray:
+        T = len(sequence)
+        alpha = np.zeros((T, self.n_states))
+
+        # Add a small constant (smoothing) to avoid log(0)
+        epsilon = 1e-10  # Small value to avoid taking log(0)
+
+        # Initialization (log-space)
+        alpha[0] = np.log(self.pi + epsilon) + np.log(self.B[:, sequence[0]] + epsilon)
+        alpha[0] -= np.log(np.sum(np.exp(alpha[0])))  # Normalization (log-space)
+
+        # Recursion (log-space)
+        for t in range(1, T):
+            for i in range(self.n_states):
+                alpha[t, i] = np.log(
+                    np.sum(np.exp(alpha[t - 1] + np.log(self.A[:, i] + epsilon)))
+                ) + np.log(self.B[i, sequence[t]] + epsilon)
+            alpha[t] -= np.log(np.sum(np.exp(alpha[t])))  # Normalization
+
+        return alpha
+
+    def backward(self, sequence: List[int]) -> np.ndarray:
+        T = len(sequence)
+        beta = np.ones((T, self.n_states))
+
+        # Backward recursion
+        for t in range(T - 2, -1, -1):
+            for i in range(self.n_states):
+                beta[t, i] = np.sum(self.A[i] * self.B[:, sequence[t + 1]] * beta[t + 1])
+
+        return beta
+
+    def viterbi(self, sequence: List[int]) -> np.ndarray:
+        T = len(sequence)
+        delta = np.zeros((T, self.n_states))
+        psi = np.zeros((T, self.n_states), dtype=int)
+
+        # Initialization
+        delta[0] = self.pi * self.B[:, sequence[0]]
+
+        # Recursion
+        for t in range(1, T):
+            for i in range(self.n_states):
+                delta[t, i] = np.max(delta[t - 1] * self.A[:, i]) * self.B[i, sequence[t]]
+                psi[t, i] = np.argmax(delta[t - 1] * self.A[:, i])
+
+        # Reconstruct the most probable path
+        state_sequence = np.zeros(T, dtype=int)
+        state_sequence[T - 1] = np.argmax(delta[T - 1])
+        for t in range(T - 2, -1, -1):
+            state_sequence[t] = psi[t + 1, state_sequence[t + 1]]
+
+        return state_sequence
+
+    def baum_welch(
+        self, sequences: List[List[int]], n_iterations: int, verbose: bool = False
+    ) -> None:
+        for iteration in range(n_iterations):
+            # Initialize accumulators
+            A_num = np.zeros((self.n_states, self.n_states))
+            B_num = np.zeros((self.n_states, self.n_observations))
+            pi_num = np.zeros(self.n_states)
+
+            for sequence in sequences:
+                T = len(sequence)
+                alpha = self.forward(sequence)
+                beta = self.backward(sequence)
+
+                # Update pi
+                gamma = (alpha * beta) / np.sum(alpha * beta, axis=1, keepdims=True)
+                pi_num += gamma[0]
+
+                # Update A and B
+                for t in range(T - 1):
+                    xi = np.zeros((self.n_states, self.n_states))
+                    denom = np.sum(alpha[t] * self.A * self.B[:, sequence[t + 1]] * beta[t + 1])
+
+                    for i in range(self.n_states):
+                        for j in range(self.n_states):
+                            xi[i, j] = (
+                                alpha[t, i]
+                                * self.A[i, j]
+                                * self.B[j, sequence[t + 1]]
+                                * beta[t + 1, j]
+                            ) / denom
+                        A_num[i] += xi[i]
+
+                    B_num[:, sequence[t]] += gamma[t]
+
+                # For the last step of the sequence
+                B_num[:, sequence[-1]] += gamma[-1]
+
+            # Normalize and update parameters
+            self.pi = pi_num / len(sequences)
+            self.A = A_num / np.sum(A_num, axis=1, keepdims=True)
+            self.B = B_num / np.sum(B_num, axis=1, keepdims=True)
+
+            # Logging parameters every 10 iterations
+            if iteration % 10 == 0 and verbose:
+                os.system("cls" if os.name == "nt" else "clear")
+                clear_output(wait=True)
+                logging.info(f"Iteration {iteration}:")
+                logging.info("Pi: %s", self.pi)
+                logging.info("A:\n%s", self.A)
+                logging.info("B:\n%s", self.B)
+
+    def decoding_accuracy(self, sequences: List[List[int]], true_states: List[List[int]]) -> float:
+        correct_predictions = 0
+        total_predictions = 0
+
+        for sequence, true_state in zip(sequences, true_states):
+            predicted_states = self.viterbi(sequence)
+            correct_predictions += np.sum(predicted_states == true_state)
+            total_predictions += len(sequence)
+
+        accuracy = (correct_predictions / total_predictions) * 100
+        return accuracy
+
+    def state_probabilities(self, sequence: List[int]) -> np.ndarray:
+        """
+        Returns the smoothed probabilities of the hidden states at each time step.
+        This is done by using both forward and backward probabilities.
+        """
+        alpha = self.forward(sequence)
+        beta = self.backward(sequence)
+
+        # Compute smoothed probabilities (gamma)
+        smoothed_probs = (alpha * beta) / np.sum(alpha * beta, axis=1, keepdims=True)
+
+        return smoothed_probs
+
+    def sequence_probability(self, sequence: List[int]) -> np.ndarray:
+        return self.state_probabilities(sequence)[-1]
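The new `likelihood/models/hmm.py` adds a self-contained discrete hidden Markov model: random Dirichlet initialization, forward/backward passes, Viterbi decoding, Baum-Welch re-estimation, and pickle-based persistence. A minimal usage sketch based only on the signatures above; the state/observation counts and the toy sequences are made up for illustration:

from likelihood.models.hmm import HMM

# Two hidden states, three observation symbols (hypothetical sizes)
hmm = HMM(n_states=2, n_observations=3)

# Toy observation sequences encoded as integer symbol indices
sequences = [[0, 1, 2, 1, 0], [2, 2, 1, 0, 0], [0, 0, 1, 2, 2]]

# Re-estimate pi, A and B with Baum-Welch
hmm.baum_welch(sequences, n_iterations=20)

# Decode the most probable hidden-state path for one sequence
print(hmm.viterbi(sequences[0]))

# Smoothed state probabilities at each time step
print(hmm.state_probabilities(sequences[0]))

# Persist and reload the trained model
hmm.save_model("./hmm")
hmm = HMM.load_model("./hmm")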
@@ -0,0 +1,222 @@
+import pickle
+import warnings
+from typing import List, Tuple, Union
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from pandas.core.frame import DataFrame
+
+from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, cdf, check_nan_inf
+
+# Suppress RankWarning
+warnings.simplefilter("ignore", np.RankWarning)
+
+
+# --------------------------------------------------------------------------------------------------------------------------------------
+def categories_by_quartile(df: DataFrame, column: str) -> Tuple[str, str]:
+    # Count the frequency of each category in the column
+    freq = df[column].value_counts()
+
+    # Calculate the 25th percentile (Q1) and 75th percentile (Q3)
+    q1 = freq.quantile(0.25)
+    q3 = freq.quantile(0.75)
+
+    # Filter categories that are below the 25th percentile and above the 75th percentile
+    least_frequent = freq[freq <= q1]
+    most_frequent = freq[freq >= q3]
+
+    # Get the least frequent category (25th percentile) and the most frequent category (75th percentile)
+    least_frequent_category = least_frequent.idxmin() if not least_frequent.empty else None
+    most_frequent_category = most_frequent.idxmax() if not most_frequent.empty else None
+
+    return least_frequent_category, most_frequent_category
+
+
+class SimulationEngine(FeatureSelection):
+    """
+    This class implements a predictive model that utilizes multiple linear regression for numerical target variables
+    and multiple logistic regression for categorical target variables.
+
+    The class provides methods for training the model on a given dataset, making predictions,
+    and evaluating the model's performance.
+
+    Key features:
+    - Supports both numerical and categorical target variables, automatically selecting the appropriate regression method.
+    - Includes methods for data preprocessing, model fitting, prediction, and evaluation metrics.
+    - Designed to be flexible and user-friendly, allowing for easy integration with various datasets.
+
+    Usage:
+    - Instantiate the class with the training data and target variable.
+    - Call the fit method to train the model.
+    - Use the predict method to generate predictions on new data.
+    - Evaluate the model using built-in metrics for accuracy and error.
+
+    This class is suitable for applications in data analysis and machine learning, enabling users to leverage regression techniques
+    for both numerical and categorical outcomes efficiently.
+    """
+
+    def __init__(self, use_scaler: bool = False, **kwargs):
+
+        self.df = pd.DataFrame()
+        self.n_importances = None
+        self.use_scaler = use_scaler
+        self.proba_dict = {}
+
+        super().__init__(**kwargs)
+
+    def predict(self, df: DataFrame, column: str) -> np.ndarray | list:
+        # Let us assign the dictionary entries corresponding to the column
+        w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]
+
+        df = df[names_cols].copy()
+        # Change the scale of the DataFrame
+        dataset = self.df.copy()
+        dataset.drop(columns=column, inplace=True)
+        numeric_df = dataset.select_dtypes(include="number")
+        if self.use_scaler:
+            scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
+            _ = scaler.rescale()
+            dataset_ = df.copy()
+            numeric_df = dataset_.select_dtypes(include="number")
+            numeric_scaled = scaler.rescale(dataset_=numeric_df.to_numpy())
+            numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
+            for col in numeric_df.columns:
+                df[col] = numeric_df[col].values
+
+        # Encoding the DataFrame
+        for num, colname in enumerate(dfe._encode_columns):
+            if df[colname].dtype == "object":
+                encode_dict = dfe.encoding_list[num]
+                df[colname] = df[colname].apply(
+                    dfe._code_transformation_to, dictionary_list=encode_dict
+                )
+
+        # Prediction
+        y = df.to_numpy() @ w
+
+        # Categorical column
+        if quick_encoder != None:
+
+            one_hot = OneHotEncoder()
+            y = one_hot.decode(y)
+            encoding_dic = quick_encoder.decoding_list[0]
+            y = [encoding_dic[item] for item in y]
+        # Numeric column
+        else:
+            if self.use_scaler:
+                # scale output
+                y += 1
+                y /= 2
+                y = y * (self.df[column].max() - self.df[column].min())
+
+        return y[:]
+
+    def _encode(self, df: DataFrame) -> np.ndarray | list:
+        df = df.copy()
+        column = df.columns[0]
+        frec = df[column].value_counts() / len(df)
+        df.loc[:, "frec"] = df[column].map(frec)
+        df.sort_values("frec", inplace=True)
+        keys = df[column].to_list()
+        values = df["frec"].to_list()
+        return dict(zip(keys, values))
+
+    def fit(self, df: DataFrame, n_importances: int, **kwargs) -> None:
+        self.df = df
+        self.n_importances = n_importances
+        # We run the feature selection algorithm
+        self.get_digraph(self.df, self.n_importances, self.use_scaler)
+        proba_dict_keys = list(self.w_dict.keys())
+        self.proba_dict = dict(zip(proba_dict_keys, [i for i in range(len(proba_dict_keys))]))
+        for key in proba_dict_keys:
+            x = (
+                self.df[key].values,
+                None if self.df[key].dtype != "object" else self._encode(self.df[[key]]),
+            )
+            poly = kwargs.get("poly", 9)
+            plot = kwargs.get("plot", False)
+            if not x[1]:
+                media = self.df[key].mean()
+                desviacion_estandar = self.df[key].std()
+                cota_inferior = media - 1.5 * desviacion_estandar
+                cota_superior = media + 1.5 * desviacion_estandar
+                if plot:
+                    print(f"Cumulative Distribution Function ({key})")
+                f, cdf_, ox = cdf(x[0].flatten(), poly=poly, plot=plot)
+            else:
+                f, ox = None, None
+                least_frequent_category, most_frequent_category = categories_by_quartile(
+                    self.df[[key]], key
+                )
+                cota_inferior = x[1].get(least_frequent_category, 0)
+                cota_superior = x[1].get(most_frequent_category, 0)
+            self.proba_dict[key] = (
+                f if f else None,
+                x[1],
+                (np.mean(np.abs(np.diff(ox))) / 2.0 if isinstance(ox, np.ndarray) else None),
+                f(cota_inferior) if f else cota_inferior,
+                f(cota_superior) if f else cota_superior,
+            )
+
+    def get_proba(self, value: Union[Union[float, int], str] | list, colname: str) -> List[float]:
+        value = (
+            value
+            if isinstance(value, list)
+            else value.flatten().tolist() if isinstance(value, np.ndarray) else [value]
+        )
+        return [
+            (
+                self.proba_dict[colname][0](val)
+                - self.proba_dict[colname][0](val - self.proba_dict[colname][2])
+                if (isinstance(val, float) or isinstance(val, int))
+                else self.proba_dict[colname][1].get(val, 0)
+            )
+            for val in value
+        ]
+
+    def pred_outliers(self, value: Union[Union[float, int], str] | list, colname: str) -> List[str]:
+        return [
+            (
+                "inlier"
+                if (self.proba_dict[colname][3] < val < self.proba_dict[colname][4])
+                else "outlier"
+            )
+            for val in self.get_proba(value, colname)
+        ]
+
+    def _clean_data(self, df: DataFrame) -> DataFrame:
+
+        df.replace([np.inf, -np.inf], np.nan, inplace=True)
+        df.replace(" ", np.nan, inplace=True)
+        df = check_nan_inf(df)
+        df = df.reset_index()
+        df = df.drop(columns=["index"])
+
+        return df
+
+    def save(self, filename: str = "./simulation_model") -> None:
+        """
+        Save the state of the SimulationEngine to a file.
+
+        Parameters:
+            filename (str): The name of the file where the object will be saved.
+        """
+        filename = filename if filename.endswith(".pkl") else filename + ".pkl"
+        with open(filename, "wb") as f:
+            pickle.dump(self, f)
+
+    @staticmethod
+    def load(filename: str = "./simulation_model"):
+        """
+        Load the state of a SimulationEngine from a file.
+
+        Parameters:
+            filename (str): The name of the file containing the saved object.
+
+        Returns:
+            SimulationEngine: A new instance of SimulationEngine with the loaded state.
+        """
+        filename = filename + ".pkl" if not filename.endswith(".pkl") else filename
+        with open(filename, "rb") as f:
+            return pickle.load(f)
+
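The `SimulationEngine` docstring above describes the intended workflow (instantiate, fit, predict, evaluate). A minimal sketch of that workflow against the methods defined in this file; the DataFrame, column names, and `n_importances` value are illustrative only, and the sketch assumes the inherited `FeatureSelection.get_digraph` populates `w_dict` as the code above expects:

import numpy as np
import pandas as pd
from likelihood.models.simulation import SimulationEngine

# Hypothetical training data: three numeric features and a derived target
x = np.random.rand(3, 100)
df = pd.DataFrame(x.T, columns=["x1", "x2", "x3"])
df["y"] = 0.1 * x[0, :] + 0.4 * x[1, :] + 0.5 * x[2, :] + 0.1

engine = SimulationEngine(use_scaler=True)
engine.fit(df, n_importances=2)

# Predict the "y" column (here, for the training data itself)
y_pred = engine.predict(df, column="y")

# Probability estimates and inlier/outlier labels for candidate values
print(engine.get_proba([0.5], colname="y"))
print(engine.pred_outliers([0.5], colname="y"))

# Persist and reload the fitted engine
engine.save("./simulation_model")
engine = SimulationEngine.load("./simulation_model")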