likelihood 1.2.22__py3-none-any.whl → 1.2.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
likelihood/graph/graph.py CHANGED
@@ -74,3 +74,20 @@ class DynamicGraph(FeatureSelection):
         nx_graph.add_edges_from([(source, target, edge)])
 
         return nx_graph
+
+
+# -------------------------------------------------------------------------
+if __name__ == "__main__":
+    import numpy as np
+    import pandas as pd
+
+    # Generate data
+    x = np.random.rand(3, 100)
+    y = 0.1 * x[0, :] + 0.4 * x[1, :] + 0.5 * x[2, :] + 0.1
+    # Create a DataFrame
+    df = pd.DataFrame(x.T, columns=["x1", "x2", "x3"])
+    df["y"] = y
+    # Instantiate DynamicGraph
+    fs = DynamicGraph(df, n_importances=2)
+    print(fs.fit())
+    fs.draw()
likelihood/graph/nn.py CHANGED
@@ -1,6 +1,8 @@
 import os
 
 os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
+# Suppress TensorFlow INFO logs
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
 import logging
 import warnings
 from typing import List, Tuple
@@ -9,7 +11,6 @@ import numpy as np
 import pandas as pd
 import tensorflow as tf
 from IPython.display import clear_output
-from numpy import ndarray
 from pandas.core.frame import DataFrame
 from sklearn.metrics import f1_score
 from sklearn.model_selection import train_test_split
@@ -21,7 +22,7 @@ logging.getLogger("tensorflow").setLevel(logging.ERROR)
 tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
 
 
-def compare_similarity(arr1: ndarray, arr2: ndarray) -> int:
+def compare_similarity(arr1: np.ndarray, arr2: np.ndarray) -> int:
     """Compares the similarity between two arrays of categories.
 
     Parameters
@@ -44,9 +45,9 @@ def compare_similarity(arr1: ndarray, arr2: ndarray) -> int:
     return count
 
 
-def cal_adjency_matrix(
+def cal_adjacency_matrix(
     df: DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
-) -> Tuple[dict, ndarray]:
+) -> Tuple[dict, np.ndarray]:
     """Calculates the adjacency matrix for a given DataFrame.
     The adjacency matrix is a matrix that represents the similarity between each pair of categories.
     The similarity is calculated using the `compare_similarity` function.
@@ -133,7 +134,7 @@ class Data:
         target: str | None = None,
         exclude_subset: List[str] = [],
    ):
-        _, adjacency = cal_adjency_matrix(df, exclude_subset=exclude_subset, sparse=True)
+        _, adjacency = cal_adjacency_matrix(df, exclude_subset=exclude_subset, sparse=True)
         if target is not None:
            X = df.drop(columns=[target] + exclude_subset)
        else:
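Note that `cal_adjency_matrix` is renamed to `cal_adjacency_matrix` in this release, which breaks external code importing the old name from `likelihood.graph.nn`. A minimal compatibility shim, assuming the new function keeps the signature shown above (the alias below is purely illustrative and not part of the package):

# Hypothetical shim for code written against likelihood <= 1.2.22.
from likelihood.graph.nn import cal_adjacency_matrix

# Keep the old (misspelled) name importable until callers are migrated.
cal_adjency_matrix = cal_adjacency_matrix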
likelihood/models/hmm.py ADDED
@@ -0,0 +1,163 @@
+import logging
+import os
+import pickle
+from typing import List, Tuple
+
+import numpy as np
+from IPython.display import clear_output
+
+
+class HMM:
+    def __init__(self, n_states: int, n_observations: int):
+        self.n_states = n_states
+        self.n_observations = n_observations
+
+        # Initialize parameters with random values
+        self.pi = np.random.dirichlet(np.ones(n_states), size=1)[0]
+        self.A = np.random.dirichlet(np.ones(n_states), size=n_states)
+        self.B = np.random.dirichlet(np.ones(n_observations), size=n_states)
+
+    def save_model(self, filename: str = "./hmm") -> None:
+        filename = filename if filename.endswith(".pkl") else filename + ".pkl"
+        with open(filename, "wb") as f:
+            pickle.dump(self, f)
+
+    @staticmethod
+    def load_model(filename: str = "./hmm") -> "HMM":
+        filename = filename + ".pkl" if not filename.endswith(".pkl") else filename
+        with open(filename, "rb") as f:
+            return pickle.load(f)
+
+    def forward(self, sequence: List[int]) -> np.ndarray:
+        T = len(sequence)
+        alpha = np.zeros((T, self.n_states))
+
+        # Add a small constant (smoothing) to avoid log(0)
+        epsilon = 1e-10  # Small value to avoid taking log(0)
+
+        # Initialization (log-space)
+        alpha[0] = np.log(self.pi + epsilon) + np.log(self.B[:, sequence[0]] + epsilon)
+        alpha[0] -= np.log(np.sum(np.exp(alpha[0])))  # Normalization (log-space)
+
+        # Recursion (log-space)
+        for t in range(1, T):
+            for i in range(self.n_states):
+                alpha[t, i] = np.log(
+                    np.sum(np.exp(alpha[t - 1] + np.log(self.A[:, i] + epsilon)))
+                ) + np.log(self.B[i, sequence[t]] + epsilon)
+            alpha[t] -= np.log(np.sum(np.exp(alpha[t])))  # Normalization
+
+        return alpha
+
+    def backward(self, sequence: List[int]) -> np.ndarray:
+        T = len(sequence)
+        beta = np.ones((T, self.n_states))
+
+        # Backward recursion
+        for t in range(T - 2, -1, -1):
+            for i in range(self.n_states):
+                beta[t, i] = np.sum(self.A[i] * self.B[:, sequence[t + 1]] * beta[t + 1])
+
+        return beta
+
+    def viterbi(self, sequence: List[int]) -> np.ndarray:
+        T = len(sequence)
+        delta = np.zeros((T, self.n_states))
+        psi = np.zeros((T, self.n_states), dtype=int)
+
+        # Initialization
+        delta[0] = self.pi * self.B[:, sequence[0]]
+
+        # Recursion
+        for t in range(1, T):
+            for i in range(self.n_states):
+                delta[t, i] = np.max(delta[t - 1] * self.A[:, i]) * self.B[i, sequence[t]]
+                psi[t, i] = np.argmax(delta[t - 1] * self.A[:, i])
+
+        # Reconstruct the most probable path
+        state_sequence = np.zeros(T, dtype=int)
+        state_sequence[T - 1] = np.argmax(delta[T - 1])
+        for t in range(T - 2, -1, -1):
+            state_sequence[t] = psi[t + 1, state_sequence[t + 1]]
+
+        return state_sequence
+
+    def baum_welch(
+        self, sequences: List[List[int]], n_iterations: int, verbose: bool = False
+    ) -> None:
+        for iteration in range(n_iterations):
+            # Initialize accumulators
+            A_num = np.zeros((self.n_states, self.n_states))
+            B_num = np.zeros((self.n_states, self.n_observations))
+            pi_num = np.zeros(self.n_states)
+
+            for sequence in sequences:
+                T = len(sequence)
+                alpha = self.forward(sequence)
+                beta = self.backward(sequence)
+
+                # Update pi
+                gamma = (alpha * beta) / np.sum(alpha * beta, axis=1, keepdims=True)
+                pi_num += gamma[0]
+
+                # Update A and B
+                for t in range(T - 1):
+                    xi = np.zeros((self.n_states, self.n_states))
+                    denom = np.sum(alpha[t] * self.A * self.B[:, sequence[t + 1]] * beta[t + 1])
+
+                    for i in range(self.n_states):
+                        for j in range(self.n_states):
+                            xi[i, j] = (
+                                alpha[t, i]
+                                * self.A[i, j]
+                                * self.B[j, sequence[t + 1]]
+                                * beta[t + 1, j]
+                            ) / denom
+                        A_num[i] += xi[i]
+
+                    B_num[:, sequence[t]] += gamma[t]
+
+                # For the last step of the sequence
+                B_num[:, sequence[-1]] += gamma[-1]
+
+            # Normalize and update parameters
+            self.pi = pi_num / len(sequences)
+            self.A = A_num / np.sum(A_num, axis=1, keepdims=True)
+            self.B = B_num / np.sum(B_num, axis=1, keepdims=True)
+
+            # Logging parameters every 10 iterations
+            if iteration % 10 == 0 and verbose:
+                os.system("cls" if os.name == "nt" else "clear")
+                clear_output(wait=True)
+                logging.info(f"Iteration {iteration}:")
+                logging.info("Pi: %s", self.pi)
+                logging.info("A:\n%s", self.A)
+                logging.info("B:\n%s", self.B)
+
+    def decoding_accuracy(self, sequences: List[List[int]], true_states: List[List[int]]) -> float:
+        correct_predictions = 0
+        total_predictions = 0
+
+        for sequence, true_state in zip(sequences, true_states):
+            predicted_states = self.viterbi(sequence)
+            correct_predictions += np.sum(predicted_states == true_state)
+            total_predictions += len(sequence)
+
+        accuracy = (correct_predictions / total_predictions) * 100
+        return accuracy
+
+    def state_probabilities(self, sequence: List[int]) -> np.ndarray:
+        """
+        Returns the smoothed probabilities of the hidden states at each time step.
+        This is done by using both forward and backward probabilities.
+        """
+        alpha = self.forward(sequence)
+        beta = self.backward(sequence)
+
+        # Compute smoothed probabilities (gamma)
+        smoothed_probs = (alpha * beta) / np.sum(alpha * beta, axis=1, keepdims=True)
+
+        return smoothed_probs
+
+    def sequence_probability(self, sequence: List[int]) -> np.ndarray:
+        return self.state_probabilities(sequence)[-1]
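The new `likelihood/models/hmm.py` module adds a discrete hidden Markov model with random Dirichlet initialization, forward/backward passes, Viterbi decoding, Baum-Welch training, and pickle-based persistence. A hedged usage sketch, assuming the module is importable as `likelihood.models.hmm` (per the RECORD entry below) and that observations are encoded as integer indices in `[0, n_observations)`; the toy sequences are illustrative only and numerical behaviour depends on the implementation above:

import numpy as np
from likelihood.models.hmm import HMM  # module path taken from the RECORD entry

# Two hidden states, three observation symbols (0, 1, 2)
model = HMM(n_states=2, n_observations=3)

# Each training sequence is a list of integer observation indices
sequences = [[0, 1, 2, 1, 0], [2, 2, 1, 0, 0], [0, 0, 1, 2, 2]]
model.baum_welch(sequences, n_iterations=20)

# Most likely hidden-state path for a new sequence (Viterbi decoding)
print(model.viterbi([0, 1, 2, 2]))

# Smoothed state probabilities and persistence to disk
probs = model.state_probabilities([0, 1, 2, 2])
model.save_model("./hmm")            # writes ./hmm.pkl
restored = HMM.load_model("./hmm")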
likelihood/models/simulation.py CHANGED
@@ -1,12 +1,36 @@
+import pickle
+import warnings
+from typing import List, Tuple, Union
+
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-from numpy import ndarray
 from pandas.core.frame import DataFrame
 
-from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, check_nan_inf
+from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, cdf, check_nan_inf
+
+# Suppress RankWarning
+warnings.simplefilter("ignore", np.RankWarning)
+
 
 # --------------------------------------------------------------------------------------------------------------------------------------
+def categories_by_quartile(df: DataFrame, column: str) -> Tuple[str, str]:
+    # Count the frequency of each category in the column
+    freq = df[column].value_counts()
+
+    # Calculate the 25th percentile (Q1) and 75th percentile (Q3)
+    q1 = freq.quantile(0.25)
+    q3 = freq.quantile(0.75)
+
+    # Filter categories that are below the 25th percentile and above the 75th percentile
+    least_frequent = freq[freq <= q1]
+    most_frequent = freq[freq >= q3]
+
+    # Get the least frequent category (25th percentile) and the most frequent category (75th percentile)
+    least_frequent_category = least_frequent.idxmin() if not least_frequent.empty else None
+    most_frequent_category = most_frequent.idxmax() if not most_frequent.empty else None
+
+    return least_frequent_category, most_frequent_category
 
 
 class SimulationEngine(FeatureSelection):
@@ -32,20 +56,21 @@ class SimulationEngine(FeatureSelection):
    for both numerical and categorical outcomes efficiently.
    """
 
-    def __init__(self, df: DataFrame, n_importances: int, use_scaler: bool = False, **kwargs):
+    def __init__(self, use_scaler: bool = False, **kwargs):
 
-        self.df = df
-        self.n_importances = n_importances
+        self.df = pd.DataFrame()
+        self.n_importances = None
         self.use_scaler = use_scaler
+        self.proba_dict = {}
 
         super().__init__(**kwargs)
 
-    def predict(self, df: DataFrame, column: str) -> ndarray | list:
+    def predict(self, df: DataFrame, column: str) -> np.ndarray | list:
         # Let us assign the dictionary entries corresponding to the column
         w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]
 
         df = df[names_cols].copy()
-        # Change the scale of the dataframe
+        # Change the scale of the DataFrame
         dataset = self.df.copy()
         dataset.drop(columns=column, inplace=True)
         numeric_df = dataset.select_dtypes(include="number")
@@ -59,7 +84,7 @@ class SimulationEngine(FeatureSelection):
         for col in numeric_df.columns:
             df[col] = numeric_df[col].values
 
-        # Encoding the datadrame
+        # Encoding the DataFrame
         for num, colname in enumerate(dfe._encode_columns):
             if df[colname].dtype == "object":
                 encode_dict = dfe.encoding_list[num]
@@ -67,7 +92,7 @@ class SimulationEngine(FeatureSelection):
                     dfe._code_transformation_to, dictionary_list=encode_dict
                 )
 
-        # PREDICTION
+        # Prediction
         y = df.to_numpy() @ w
 
         # Categorical column
@@ -87,10 +112,78 @@ class SimulationEngine(FeatureSelection):
 
         return y[:]
 
-    def fit(self, **kwargs) -> None:
-
+    def _encode(self, df: DataFrame) -> np.ndarray | list:
+        df = df.copy()
+        column = df.columns[0]
+        frec = df[column].value_counts() / len(df)
+        df.loc[:, "frec"] = df[column].map(frec)
+        df.sort_values("frec", inplace=True)
+        keys = df[column].to_list()
+        values = df["frec"].to_list()
+        return dict(zip(keys, values))
+
+    def fit(self, df: DataFrame, n_importances: int, **kwargs) -> None:
+        self.df = df
+        self.n_importances = n_importances
         # We run the feature selection algorithm
         self.get_digraph(self.df, self.n_importances, self.use_scaler)
+        proba_dict_keys = list(self.w_dict.keys())
+        self.proba_dict = dict(zip(proba_dict_keys, [i for i in range(len(proba_dict_keys))]))
+        for key in proba_dict_keys:
+            x = (
+                self.df[key].values,
+                None if self.df[key].dtype != "object" else self._encode(self.df[[key]]),
+            )
+            poly = kwargs.get("poly", 9)
+            plot = kwargs.get("plot", False)
+            if not x[1]:
+                media = self.df[key].mean()
+                desviacion_estandar = self.df[key].std()
+                cota_inferior = media - 1.5 * desviacion_estandar
+                cota_superior = media + 1.5 * desviacion_estandar
+                if plot:
+                    print(f"Cumulative Distribution Function ({key})")
+                f, cdf_, ox = cdf(x[0].flatten(), poly=poly, plot=plot)
+            else:
+                f, ox = None, None
+                least_frequent_category, most_frequent_category = categories_by_quartile(
+                    self.df[[key]], key
+                )
+                cota_inferior = x[1].get(least_frequent_category, 0)
+                cota_superior = x[1].get(most_frequent_category, 0)
+            self.proba_dict[key] = (
+                f if f else None,
+                x[1],
+                (np.mean(np.abs(np.diff(ox))) / 2.0 if isinstance(ox, np.ndarray) else None),
+                f(cota_inferior) if f else cota_inferior,
+                f(cota_superior) if f else cota_superior,
+            )
+
+    def get_proba(self, value: Union[Union[float, int], str] | list, colname: str) -> List[float]:
+        value = (
+            value
+            if isinstance(value, list)
+            else value.flatten().tolist() if isinstance(value, np.ndarray) else [value]
+        )
+        return [
+            (
+                self.proba_dict[colname][0](val)
+                - self.proba_dict[colname][0](val - self.proba_dict[colname][2])
+                if (isinstance(val, float) or isinstance(val, int))
+                else self.proba_dict[colname][1].get(val, 0)
+            )
+            for val in value
+        ]
+
+    def pred_outliers(self, value: Union[Union[float, int], str] | list, colname: str) -> List[str]:
+        return [
+            (
+                "inlier"
+                if (self.proba_dict[colname][3] < val < self.proba_dict[colname][4])
+                else "outlier"
+            )
+            for val in self.get_proba(value, colname)
+        ]
 
     def _clean_data(self, df: DataFrame) -> DataFrame:
 
@@ -101,3 +194,29 @@ class SimulationEngine(FeatureSelection):
         df = df.drop(columns=["index"])
 
         return df
+
+    def save(self, filename: str = "./simulation_model") -> None:
+        """
+        Save the state of the SimulationEngine to a file.
+
+        Parameters:
+            filename (str): The name of the file where the object will be saved.
+        """
+        filename = filename if filename.endswith(".pkl") else filename + ".pkl"
+        with open(filename, "wb") as f:
+            pickle.dump(self, f)
+
+    @staticmethod
+    def load(filename: str = "./simulation_model"):
+        """
+        Load the state of a SimulationEngine from a file.
+
+        Parameters:
+            filename (str): The name of the file containing the saved object.
+
+        Returns:
+            SimulationEngine: A new instance of SimulationEngine with the loaded state.
+        """
+        filename = filename + ".pkl" if not filename.endswith(".pkl") else filename
+        with open(filename, "rb") as f:
+            return pickle.load(f)
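`SimulationEngine` changes its public API in this release: the constructor no longer takes `df` and `n_importances` (they move to `fit`), and `get_proba`, `pred_outliers`, `save`, and `load` are new. A hedged sketch of the new workflow on an illustrative mixed numeric/categorical DataFrame; the `poly`/`plot` keywords mirror what `fit` reads from `**kwargs` above, and the actual results depend on the `FeatureSelection` internals not shown in this diff:

import numpy as np
import pandas as pd
from likelihood.models.simulation import SimulationEngine  # module path taken from the RECORD entry

rng = np.random.default_rng(0)
df = pd.DataFrame(
    {
        "x1": rng.normal(size=200),
        "x2": rng.normal(size=200),
        "color": rng.choice(["red", "green", "blue"], size=200),
    }
)

engine = SimulationEngine(use_scaler=False)
engine.fit(df, n_importances=2, poly=9, plot=False)   # df and n_importances now go to fit()

# Per-column probabilities and a rough inlier/outlier call based on the fitted bounds
print(engine.get_proba([0.5, 3.0], colname="x1"))
print(engine.pred_outliers([0.5, 3.0], colname="x1"))

engine.save("./simulation_model")                      # writes ./simulation_model.pkl
restored = SimulationEngine.load("./simulation_model")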
likelihood/tools/tools.py CHANGED
@@ -1,15 +1,18 @@
1
1
  import math
2
2
  import os
3
3
  import pickle
4
- from typing import Callable, Dict, List, Tuple
4
+ import warnings
5
+ from typing import Callable, Dict, List, Tuple, Union
5
6
 
6
7
  import matplotlib.pyplot as plt
7
8
  import numpy as np
8
9
  import pandas as pd
9
10
  import yaml
10
- from numpy import ndarray
11
11
  from pandas.core.frame import DataFrame
12
12
 
13
+ # Suppress RankWarning
14
+ warnings.simplefilter("ignore", np.RankWarning)
15
+
13
16
  # -------------------------------------------------------------------------
14
17
 
15
18
  """
@@ -68,7 +71,7 @@ def difference_quotient(f: Callable, x: float, h: float) -> Callable:
68
71
  return (f(x + h) - f(x)) / h
69
72
 
70
73
 
71
- def partial_difference_quotient(f: Callable, v: ndarray, i: int, h: float) -> ndarray:
74
+ def partial_difference_quotient(f: Callable, v: np.ndarray, i: int, h: float) -> np.ndarray:
72
75
  """Calculates the partial difference quotient of `f`
73
76
 
74
77
  Parameters
@@ -93,7 +96,7 @@ def partial_difference_quotient(f: Callable, v: ndarray, i: int, h: float) -> nd
93
96
  return (f(w) - f(v)) / h
94
97
 
95
98
 
96
- def estimate_gradient(f: Callable, v: ndarray, h: float = 1e-4) -> List[ndarray]:
99
+ def estimate_gradient(f: Callable, v: np.ndarray, h: float = 1e-4) -> List[np.ndarray]:
97
100
  """Calculates the gradient of `f` at `v`
98
101
 
99
102
  Parameters
@@ -138,35 +141,32 @@ def generate_feature_yaml(
138
141
  A dictionary with four keys ('ordinal_features', 'numeric_features', 'categorical_features', 'ignore_features')
139
142
  mapping to lists of feature names. Or a YAML formatted string if `yaml_string` is `True`.
140
143
  """
144
+ ignore_features = ignore_features or []
141
145
  feature_info = {
142
146
  "ordinal_features": [],
143
147
  "numeric_features": [],
144
148
  "categorical_features": [],
145
- "ignore_features": [],
149
+ "ignore_features": ignore_features,
146
150
  }
147
151
 
148
152
  for col in df.columns:
149
- if ignore_features and col in ignore_features:
153
+ if col in ignore_features:
150
154
  continue
151
155
 
152
156
  if pd.api.types.is_numeric_dtype(df[col]):
153
- feature_info["numeric_features"].append(col)
157
+ if pd.api.types.is_integer_dtype(df[col]) or pd.api.types.is_float_dtype(df[col]):
158
+ feature_info["numeric_features"].append(col)
159
+ elif pd.api.types.is_bool_dtype(df[col]):
160
+ feature_info["ordinal_features"].append(col) # Assuming bool can be ordinal
154
161
  elif pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_categorical_dtype(df[col]):
155
162
  feature_info["categorical_features"].append(col)
156
- elif pd.api.types.is_integer_dtype(df[col]):
157
- feature_info["ordinal_features"].append(col)
158
- elif pd.api.types.is_float_dtype(df[col]):
159
- feature_info["ordinal_features"].append(col)
160
- elif pd.api.types.is_bool_dtype(df[col]):
161
- feature_info["ordinal_features"].append(col)
162
163
  else:
163
164
  print(f"Unknown type for feature {col}")
164
- feature_info["ignore_features"] = ignore_features
165
165
 
166
166
  if yaml_string:
167
167
  return yaml.dump(feature_info, default_flow_style=False)
168
- else:
169
- return feature_info
168
+
169
+ return feature_info
170
170
 
171
171
 
172
172
  # a function that calculates the percentage of missing values per column is defined
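The reworked `generate_feature_yaml` now routes boolean columns to `ordinal_features` and carries `ignore_features` straight into the returned dictionary. A small sketch of the expected behaviour, assuming the function is importable from `likelihood.tools` and accepts `ignore_features` as a keyword (the sample frame is illustrative):

import pandas as pd
from likelihood.tools import generate_feature_yaml  # assumed export; the function lives in likelihood/tools/tools.py

df = pd.DataFrame(
    {
        "age": [21, 34, 45],                # integer -> numeric_features
        "height": [1.7, 1.8, 1.6],          # float -> numeric_features
        "is_member": [True, False, True],   # bool -> ordinal_features per the new branch
        "city": ["NY", "LA", "SF"],         # object -> categorical_features
        "row_id": [1, 2, 3],
    }
)

info = generate_feature_yaml(df, ignore_features=["row_id"])
print(info["numeric_features"])      # ['age', 'height']
print(info["ordinal_features"])      # ['is_member']
print(info["categorical_features"])  # ['city']
print(info["ignore_features"])       # ['row_id']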
@@ -192,61 +192,9 @@ def cal_missing_values(df: DataFrame) -> None:
192
192
  )
193
193
 
194
194
 
195
- def calculate_probability(x: ndarray, points: int = 1, cond: bool = True) -> ndarray:
196
- """Calculates the probability of the data.
197
-
198
- Parameters
199
- ----------
200
- x : `np.array`
201
- An array containing the data.
202
- points : `int`
203
- An integer value. By default it is set to `1`.
204
- cond : `bool`
205
- A boolean value. By default it is set to `True`.
206
-
207
- Returns
208
- -------
209
- p : `np.array`
210
- An array containing the probability of the data.
211
-
212
- """
213
-
214
- p = []
215
-
216
- f = cdf(x)[0]
217
- for i in range(len(x)):
218
- p.append(f(x[i]))
219
- p = np.array(p)
220
- if cond:
221
- if np.prod(p[-points]) > 1:
222
- print("\nThe probability of the data cannot be calculated.\n")
223
- else:
224
- if np.prod(p[-points]) < 0:
225
- print("\nThe probability of the data cannot be calculated.\n")
226
- else:
227
- print(
228
- "The model has a probability of {:.2f}% of being correct".format(
229
- np.prod(p[-points]) * 100
230
- )
231
- )
232
- else:
233
- if np.sum(p[-points]) < 0:
234
- print("\nThe probability of the data cannot be calculated.\n")
235
- else:
236
- if np.sum(p[-points]) > 1:
237
- print("\nThe probability of the data cannot be calculated.\n")
238
- else:
239
- print(
240
- "The model has a probability of {:.2f}% of being correct".format(
241
- np.sum(p[-points]) * 100
242
- )
243
- )
244
- return p
245
-
246
-
247
195
  def cdf(
248
- x: ndarray, poly: int = 9, inv: bool = False, plot: bool = False, savename: str = None
249
- ) -> ndarray:
196
+ x: np.ndarray, poly: int = 9, inv: bool = False, plot: bool = False, savename: str = None
197
+ ) -> tuple:
250
198
  """Calculates the cumulative distribution function of the data.
251
199
 
252
200
  Parameters
@@ -254,165 +202,229 @@ def cdf(
254
202
  x : `np.array`
255
203
  An array containing the data.
256
204
  poly : `int`
257
- An integer value. By default it is set to `9`.
205
+ Degree of the polynomial fit. By default it is set to `9`.
258
206
  inv : `bool`
259
- A boolean value. By default it is set to `False`.
207
+ If True, calculate the inverse CDF (quantile function).
208
+ plot : `bool`
209
+ If True, plot the results.
210
+ savename : `str`, optional
211
+ Filename to save the plot.
260
212
 
261
213
  Returns
262
214
  -------
263
- cdf_ : `np.array`
264
- An array containing the cumulative distribution function.
265
-
215
+ fit : `np.poly1d`
216
+ Polynomial fit of the CDF or quantile function.
217
+ cdf_values : `np.array`
218
+ Cumulative distribution values.
219
+ sorted_x : `np.array`
220
+ Sorted input data.
266
221
  """
267
222
 
268
- cdf_ = np.cumsum(x) / np.sum(x)
223
+ if len(x) == 0:
224
+ raise ValueError("Input array 'x' must not be empty.")
225
+
226
+ cdf_values = np.cumsum(x) / np.sum(x)
227
+ sorted_x = np.sort(x)
269
228
 
270
- ox = np.sort(x)
271
- I = np.ones(len(ox))
272
- M = np.triu(I)
273
- df = np.dot(ox, M)
274
- df_ = df / np.max(df)
229
+ # Calculate the CDF or inverse CDF (quantile function)
230
+ probabilities = np.linspace(0, 1, len(sorted_x))
275
231
 
276
232
  if inv:
277
- fit = np.polyfit(df_, ox, poly)
233
+ fit = np.polyfit(probabilities, sorted_x, poly)
278
234
  f = np.poly1d(fit)
235
+ plot_label = "Quantile Function"
236
+ x_values = probabilities
237
+ y_values = sorted_x
279
238
  else:
280
- fit = np.polyfit(ox, df_, poly)
239
+ fit = np.polyfit(sorted_x, probabilities, poly)
281
240
  f = np.poly1d(fit)
241
+ plot_label = "Cumulative Distribution Function"
242
+ x_values = sorted_x
243
+ y_values = cdf_values
282
244
 
283
245
  if plot:
284
- if inv:
285
- plt.plot(df_, ox, "o", label="inv cdf")
286
- plt.plot(df_, f(df_), "r--", label="fit")
287
- plt.title("Quantile Function")
288
- plt.xlabel("Probability")
289
- plt.ylabel("Value")
290
- plt.legend()
291
- if savename != None:
292
- plt.savefig(savename, dpi=300)
293
- plt.show()
294
- else:
295
- plt.plot(ox, cdf_, "o", label="cdf")
296
- plt.plot(ox, f(ox), "r--", label="fit")
297
- plt.title("Cumulative Distribution Function")
298
- plt.xlabel("Value")
299
- plt.ylabel("Probability")
300
- plt.legend()
301
- if savename != None:
302
- plt.savefig(savename, dpi=300)
303
- plt.show()
246
+ plt.figure()
247
+ plt.plot(x_values, y_values, "o", label="data")
248
+ plt.plot(x_values, f(x_values), "r--", label="fit")
249
+ plt.title(plot_label)
250
+ plt.xlabel("Probability" if inv else "Value")
251
+ plt.ylabel("Value" if inv else "Probability")
252
+ plt.legend()
253
+ if savename:
254
+ plt.savefig(savename, dpi=300)
255
+ plt.show()
304
256
 
305
- return f, cdf_, ox
257
+ return f, cdf_values, sorted_x
306
258
 
307
259
 
308
- class corr:
309
- """Calculates the correlation of the data.
260
+ def calculate_probability(x: np.ndarray, points: int = 1, cond: bool = True) -> np.ndarray:
261
+ """Calculates the probability of the data based on the CDF fit.
310
262
 
311
263
  Parameters
312
264
  ----------
313
265
  x : `np.array`
314
266
  An array containing the data.
315
- y : `np.array`
316
- An array containing the data.
267
+ points : `int`
268
+ Number of points to consider for the final probability calculation.
269
+ cond : `bool`
270
+ Condition to use product (True) or sum (False) for the final probability check.
317
271
 
318
272
  Returns
319
273
  -------
320
- z : `np.array`
321
- An array containing the correlation of `x` and `y`.
322
-
274
+ p : `np.array`
275
+ Array containing the probabilities of the data.
323
276
  """
324
277
 
278
+ if len(x) == 0:
279
+ raise ValueError("Input array 'x' must not be empty.")
280
+
281
+ fit, _, sorted_x = cdf(x)
282
+ p = fit(x)
283
+
284
+ # Validate probability values
285
+ if cond:
286
+ prob_value = np.prod(p[-points])
287
+ message = "product"
288
+ else:
289
+ prob_value = np.sum(p[-points])
290
+ message = "sum"
291
+
292
+ if 0 <= prob_value <= 1:
293
+ print(f"The model has a probability of {prob_value * 100:.2f}% based on the {message}.")
294
+ else:
295
+ print("\nThe probability of the data cannot be calculated.\n")
296
+
297
+ return p
298
+
299
+
300
+ class CorrelationBase:
301
+ """Base class for correlation calculations."""
302
+
325
303
  __slots__ = ["x", "y", "result", "z"]
326
304
 
327
- def __init__(self, x: ndarray, y: ndarray):
305
+ def __init__(self, x: np.ndarray, y: Union[np.ndarray, None] = None):
328
306
  self.x = x
329
- self.y = y
330
- self.result = np.correlate(x, y, mode="full")
307
+ self.y = y if y is not None else x # Default to autocorrelation if y is not provided
308
+ self._compute_correlation()
331
309
  self.z = self.result[self.result.size // 2 :]
332
- self.z = self.z / float(np.abs(self.z).max())
310
+ self.z /= np.abs(self.z).max()
311
+
312
+ def _compute_correlation(self):
313
+ """Compute the correlation between x and y (or x with itself for autocorrelation)."""
314
+ self.result = np.correlate(self.x, self.y, mode="full")
333
315
 
334
316
  def plot(self):
335
- plt.plot(range(len(self.z)), self.z, label="Correlation")
317
+ """Plot the correlation or autocorrelation."""
318
+ plt.plot(range(len(self.z)), self.z, label=self._get_label())
336
319
  plt.legend()
337
320
  plt.show()
338
321
 
322
+ def _get_label(self) -> str:
323
+ return "Autocorrelation" if np.array_equal(self.x, self.y) else "Correlation"
324
+
339
325
  def __call__(self):
326
+ """Return the computed correlation or autocorrelation."""
340
327
  return self.z
341
328
 
342
329
 
343
- class autocorr:
344
- """Calculates the autocorrelation of the data.
330
+ class Correlation(CorrelationBase):
331
+ """Calculates the cross-correlation of two datasets.
345
332
 
346
333
  Parameters
347
334
  ----------
348
- x : `np.array`
349
- An array containing the data.
335
+ x : `np.ndarray`
336
+ An array containing the first dataset.
337
+ y : `np.ndarray`
338
+ An array containing the second dataset.
350
339
 
351
340
  Returns
352
341
  -------
353
- z : `np.array`
354
- An array containing the autocorrelation of the data.
342
+ z : `np.ndarray`
343
+ An array containing the correlation of `x` and `y`.
355
344
 
356
345
  """
357
346
 
358
- __slots__ = ["x", "result", "z"]
347
+ def __init__(self, x: np.ndarray, y: np.ndarray):
348
+ super().__init__(x, y)
359
349
 
360
- def __init__(self, x: ndarray):
361
- self.x = x
362
- self.result = np.correlate(x, x, mode="full")
363
- self.z = self.result[self.result.size // 2 :]
364
- self.z = self.z / float(np.abs(self.z).max())
365
350
 
366
- def plot(self):
367
- plt.plot(range(len(self.z)), self.z, label="Autocorrelation")
368
- plt.legend()
369
- plt.show()
351
+ class AutoCorrelation(CorrelationBase):
352
+ """Calculates the autocorrelation of a dataset.
370
353
 
371
- def __call__(self):
372
- return self.z
354
+ Parameters
355
+ ----------
356
+ x : `np.ndarray`
357
+ An array containing the data.
373
358
 
359
+ Returns
360
+ -------
361
+ z : `np.ndarray`
362
+ An array containing the autocorrelation of the data.
363
+ """
364
+
365
+ def __init__(self, x: np.ndarray):
366
+ super().__init__(x)
374
367
 
375
- def fft_denoise(dataset: ndarray, sigma: float = 0, mode: bool = True) -> Tuple[ndarray, float]:
376
- """Performs the noise removal using the Fast Fourier Transform.
368
+
369
+ def fft_denoise(
370
+ dataset: np.ndarray, sigma: float = 0, mode: bool = True
371
+ ) -> Tuple[np.ndarray, np.ndarray]:
372
+ """Performs noise removal using the Fast Fourier Transform.
377
373
 
378
374
  Parameters
379
375
  ----------
380
- dataset : `np.array`
381
- An array containing the noised data.
382
- sigma : `float`
383
- A `float` between `0` and `1`. By default it is set to `0`.
384
- mode : `bool`
385
- A boolean value. By default it is set to `True`.
376
+ dataset : `np.ndarray`
377
+ An array containing the noised data. Expected shape (num_samples, num_points).
378
+ sigma : `float`, default=0
379
+ A float between 0 and 1 representing the threshold for noise filtering.
380
+ mode : `bool`, default=True
381
+ If True, print progress messages.
386
382
 
387
383
  Returns
388
384
  -------
389
- dataset : `np.array`
390
- An array containing the denoised data.
391
- period : `float`
392
- period of the function described by the dataset
393
-
385
+ denoised_dataset : `np.ndarray`
386
+ An array containing the denoised data with the same shape as `dataset`.
387
+ periods : `np.ndarray`
388
+ Array of estimated periods for each sample in `dataset`.
394
389
  """
395
- dataset_ = dataset.copy()
396
- for i in range(dataset.shape[0]):
397
- n = dataset.shape[1]
398
- fhat = np.fft.fft(dataset[i, :], n)
399
- freq = (1 / n) * np.arange(n)
400
- L = np.arange(1, np.floor(n / 2), dtype="int")
401
- PSD = fhat * np.conj(fhat) / n
402
- indices = PSD > np.mean(PSD) + sigma * np.std(PSD)
403
- PSDclean = PSD * indices # Zero out all others
404
- fhat = indices * fhat
405
- ffilt = np.fft.ifft(fhat) # Inverse FFT for filtered time signal
406
- dataset_[i, :] = ffilt.real
390
+
391
+ if not (0 <= sigma <= 1):
392
+ raise ValueError("sigma must be between 0 and 1")
393
+
394
+ num_samples, n_points = dataset.shape
395
+ denoised_dataset = np.zeros_like(dataset)
396
+ periods = np.zeros(num_samples)
397
+
398
+ # Precompute values that do not change within the loop
399
+ freq = (1 / n_points) * np.arange(n_points)
400
+ L = np.arange(1, np.floor(n_points / 2), dtype=int)
401
+
402
+ for i in range(num_samples):
403
+ fhat = np.fft.fft(dataset[i, :], n_points)
404
+ PSD = fhat * np.conj(fhat) / n_points
405
+ threshold = np.mean(PSD) + sigma * np.std(PSD)
406
+ indices = PSD > threshold
407
+
408
+ # Zero out all others in frequency domain
409
+ PSDclean = PSD * indices
410
+ fhat_cleaned = fhat * indices
411
+
412
+ # Inverse FFT for filtered time signal
413
+ denoised_signal = np.fft.ifft(fhat_cleaned).real
414
+ denoised_dataset[i, :] = denoised_signal
415
+
407
416
  # Calculate the period of the signal
408
- period = 1 / (2 * freq[L][np.argmax(fhat[L])])
417
+ peak_index = L[np.argmax(np.abs(fhat[L]))]
418
+ periods[i] = 1 / (2 * freq[peak_index])
419
+
409
420
  if mode:
410
421
  print(f"The {i+1}-th row of the dataset has been denoised.")
411
- print(f"The period is {round(period, 4)}")
412
- return dataset_, period
422
+ print(f"The estimated period is {round(periods[i], 4)}")
423
+
424
+ return denoised_dataset, periods
413
425
 
414
426
 
415
- def get_period(dataset: ndarray) -> float:
427
+ def get_period(dataset: np.ndarray) -> float:
416
428
  """Calculates the periodicity of a `dataset`.
417
429
 
418
430
  Parameters
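Several helpers change shape in the hunk above: `cdf` keeps returning a `(fit, cdf_values, sorted_x)` triple but now validates its input and fits against evenly spaced probabilities, `calculate_probability` is rebuilt on top of that fit, the `corr`/`autocorr` classes are replaced by `Correlation`/`AutoCorrelation`, and `fft_denoise` returns one estimated period per row instead of a single float. A hedged sketch of the updated calls on synthetic data; only the `cdf` re-export via `likelihood.tools` is confirmed elsewhere in this diff, the other imports assume the `likelihood.tools.tools` module path from the RECORD:

import numpy as np
from likelihood.tools.tools import AutoCorrelation, Correlation, cdf, fft_denoise  # module path per the RECORD

x = np.random.normal(0, 1, 500)
f, cdf_values, sorted_x = cdf(x, poly=9)        # polynomial fit, CDF values, sorted data
print(f(0.0))                                   # fitted CDF evaluated at 0

z = AutoCorrelation(np.random.rand(100))        # replaces the old `autocorr` class
print(z()[:5])
c = Correlation(np.random.rand(100), np.random.rand(100))  # replaces `corr`

signals = np.sin(np.linspace(0, 8 * np.pi, 256))[None, :] + 0.1 * np.random.rand(1, 256)
denoised, periods = fft_denoise(signals, sigma=0.5, mode=False)
print(denoised.shape, periods.shape)            # (1, 256) (1,) -- one estimated period per row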
@@ -426,13 +438,31 @@ def get_period(dataset: ndarray) -> float:
426
438
  period of the function described by the `dataset`
427
439
  """
428
440
  n = dataset.size
429
- fhat = np.fft.fft(dataset, n)
430
- freq = (1 / n) * np.arange(n)
431
- L = np.arange(1, np.floor(n / 2), dtype="int")
432
- PSD = fhat * np.conj(fhat) / n
433
- indices = PSD > np.mean(PSD) + np.std(PSD)
434
- fhat = indices * fhat
435
- period = 1 / (2 * freq[L][np.argmax(fhat[L])])
441
+
442
+ # Ensure there are enough points for FFT analysis
443
+ if n < 2:
444
+ raise ValueError("Dataset must contain at least two points.")
445
+
446
+ # Compute the FFT and PSD
447
+ fhat = np.fft.rfft(dataset) # Use rfft for real-valued input to save computation
448
+ freqs = np.fft.rfftfreq(n) # Get only positive frequencies
449
+
450
+ # Calculate the Power Spectral Density (PSD)
451
+ PSD = np.abs(fhat) ** 2 / n
452
+
453
+ # Remove the first frequency component (DC component)
454
+ PSD[0] = 0
455
+
456
+ # Find the index of the maximum PSD value, excluding the DC component
457
+ max_psd_index = np.argmax(PSD)
458
+
459
+ # Calculate the period based on the corresponding frequency
460
+ dominant_freq = freqs[max_psd_index]
461
+ if dominant_freq == 0:
462
+ raise ValueError("No significant periodic component found in the dataset.")
463
+
464
+ period = 1 / dominant_freq
465
+
436
466
  return period
437
467
 
438
468
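The rewritten `get_period` above works on the positive-frequency half-spectrum via `rfft`, zeroes the DC bin, and converts the dominant frequency back into a period. A small sanity check, assuming the function is reachable from `likelihood.tools.tools` (synthetic sine with a period of 64 samples):

import numpy as np
from likelihood.tools.tools import get_period  # module path per the RECORD

n, true_period = 1024, 64.0
signal = np.sin(2 * np.pi * np.arange(n) / true_period)
print(get_period(signal))  # expected to be close to 64, up to the FFT frequency resolution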
 
@@ -468,7 +498,7 @@ class LogisticRegression:
468
498
 
469
499
  self.importance = []
470
500
 
471
- def fit(self, dataset: ndarray, values: ndarray) -> None:
501
+ def fit(self, dataset: np.ndarray, values: np.ndarray) -> None:
472
502
  """Performs linear multiple model training
473
503
 
474
504
  Parameters
@@ -501,7 +531,7 @@ class LogisticRegression:
501
531
  a = np.around(self.w[i], decimals=8)
502
532
  self.importance.append(a)
503
533
 
504
- def predict(self, datapoints: ndarray) -> ndarray:
534
+ def predict(self, datapoints: np.ndarray) -> np.ndarray:
505
535
  """
506
536
  Performs predictions for a set of points
507
537
 
@@ -515,7 +545,7 @@ class LogisticRegression:
515
545
 
516
546
  return sig(np.array(self.importance) @ datapoints)
517
547
 
518
- def get_importances(self, print_important_features: bool = False) -> ndarray:
548
+ def get_importances(self, print_important_features: bool = False) -> np.ndarray:
519
549
  """
520
550
  Returns the important features
521
551
 
@@ -547,7 +577,7 @@ class LinearRegression:
547
577
 
548
578
  self.importance = []
549
579
 
550
- def fit(self, dataset: ndarray, values: ndarray, verbose: bool = False) -> None:
580
+ def fit(self, dataset: np.ndarray, values: np.ndarray, verbose: bool = False) -> None:
551
581
  """Performs linear multiple model training
552
582
 
553
583
  Parameters
@@ -580,7 +610,7 @@ class LinearRegression:
580
610
  print("\nParameters:", np.array(self.importance).shape)
581
611
  print("RMSE: {:.4f}".format(mean_square_error(self.y, self.predict(self.X))))
582
612
 
583
- def predict(self, datapoints: ndarray) -> ndarray:
613
+ def predict(self, datapoints: np.ndarray) -> np.ndarray:
584
614
  """
585
615
  Performs predictions for a set of points
586
616
 
@@ -592,7 +622,7 @@ class LinearRegression:
592
622
  """
593
623
  return np.array(self.importance) @ datapoints
594
624
 
595
- def get_importances(self, print_important_features: bool = False) -> ndarray:
625
+ def get_importances(self, print_important_features: bool = False) -> np.ndarray:
596
626
  """
597
627
  Returns the important features
598
628
 
@@ -614,7 +644,7 @@ class LinearRegression:
614
644
  return np.array(self.importance)
615
645
 
616
646
 
617
- def cal_average(y: ndarray, alpha: float = 1):
647
+ def cal_average(y: np.ndarray, alpha: float = 1):
618
648
  """Calculates the moving average of the data
619
649
 
620
650
  Parameters
@@ -642,12 +672,12 @@ class DataScaler:
642
672
 
643
673
  __slots__ = ["dataset_", "_n", "data_scaled", "values", "transpose", "inv_fitting"]
644
674
 
645
- def __init__(self, dataset: ndarray, n: int = 1) -> None:
675
+ def __init__(self, dataset: np.ndarray, n: int = 1) -> None:
646
676
  """Initializes the parameters required for scaling the data"""
647
677
  self.dataset_ = dataset.copy()
648
678
  self._n = n
649
679
 
650
- def rescale(self, dataset_: ndarray | None = None) -> ndarray:
680
+ def rescale(self, dataset_: np.ndarray | None = None) -> np.ndarray:
651
681
  """Perform a standard rescaling of the data
652
682
 
653
683
  Returns
@@ -655,7 +685,7 @@ class DataScaler:
655
685
  data_scaled : `np.array`
656
686
  An array containing the scaled data.
657
687
  """
658
- if isinstance(dataset_, ndarray):
688
+ if isinstance(dataset_, np.ndarray):
659
689
  data_scaled = np.copy(dataset_)
660
690
  mu = self.values[0]
661
691
  sigma = self.values[1]
@@ -711,7 +741,7 @@ class DataScaler:
711
741
 
712
742
  return self.data_scaled
713
743
 
714
- def scale(self, dataset_: ndarray) -> ndarray:
744
+ def scale(self, dataset_: np.ndarray) -> np.ndarray:
715
745
  """Performs the inverse operation to the rescale function
716
746
 
717
747
  Parameters
@@ -755,7 +785,7 @@ def generate_series(n: int, n_steps: int, incline: bool = True):
755
785
  return series.astype(np.float32)
756
786
 
757
787
 
758
- def mean_square_error(y_true: ndarray, y_pred: ndarray, print_error: bool = False):
788
+ def mean_square_error(y_true: np.ndarray, y_pred: np.ndarray, print_error: bool = False):
759
789
  """Calculates the Root Mean Squared Error
760
790
 
761
791
  Parameters
@@ -946,88 +976,65 @@ class PerformanceMeasures:
946
976
  pass
947
977
 
948
978
  # Performance measure Res_T
949
- def f_mean(self, y_true: ndarray, y_pred: ndarray, labels: list) -> None:
950
- n = len(labels)
979
+ def f_mean(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> float:
980
+ F_vec = self._f1_score(y_true, y_pred, labels)
981
+ mean_f_measure = np.mean(F_vec)
951
982
 
952
- F_vec = self._f1_score(y_true, y_pred, labels=labels)
953
- a = np.sum(F_vec)
983
+ for label, f_measure in zip(labels, F_vec):
984
+ print(f"F-measure of label {label} -> {f_measure}")
954
985
 
955
- for i in range(len(F_vec)):
956
- print("F-measure of label ", labels[i], " -> ", F_vec[i])
986
+ print(f"Mean of F-measure -> {mean_f_measure}")
957
987
 
958
- print("Mean of F-measure -> ", a / n)
988
+ return mean_f_measure
959
989
 
960
990
  # Performance measure Res_P
961
- def resp(self, y_true: ndarray, y_pred: ndarray, labels: list) -> None:
962
- # We initialize sum counters
963
- sum1 = 0
964
- sum2 = 0
965
-
966
- # Calculamos T_C
991
+ def resp(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> float:
967
992
  T_C = len(y_true)
968
- for i in range(len(labels)):
969
- # We calculate instances of the classes and their F-measures
970
- sum1 += (1 - ((y_true == labels[i]).sum() / T_C)) * self._fi_measure(
971
- y_true, y_pred, labels, i
972
- )
973
- sum2 += 1 - ((y_true == labels[i]).sum()) / T_C
993
+ sum1, sum2 = 0.0, 0.0
994
+ F_vec = self._f1_score(y_true, y_pred, labels)
974
995
 
975
- # Print the metric corresponding to the prediction vector
976
- print("Metric Res_p ->", sum1 / sum2)
996
+ for label_idx, label in enumerate(labels):
997
+ class_instances = np.sum(y_true == label) / T_C
998
+ sum1 += (1 - class_instances) * F_vec[label_idx]
999
+ sum2 += 1 - class_instances
977
1000
 
978
- def _fi_measure(self, y_true: ndarray, y_pred: ndarray, labels: list, i: int) -> int:
979
- F_vec = self._f1_score(y_true, y_pred, labels=labels)
1001
+ res_p = sum1 / sum2 if sum2 != 0 else 0.0 # Avoid division by zero
1002
+ print(f"Metric Res_p -> {res_p}")
980
1003
 
981
- return F_vec[i] # We return the position of the f1-score corresponding to the label
1004
+ return res_p
982
1005
 
983
- # Summary of the labels predicted
984
- def _summary_pred(self, y_true: ndarray, y_pred: ndarray, labels: list) -> None:
985
- count_mat = self._confu_mat(y_true, y_pred, labels)
986
- print(" ", end="")
987
- for i in range(len(labels)):
988
- print("|--", labels[i], "--", end="")
989
- if i + 1 == len(labels):
990
- print("|", end="")
991
- for i in range(len(labels)):
992
- print("")
993
- print("|--", labels[i], "--|", end="")
994
- for j in range(len(labels)):
995
- if j != 0:
996
- print(" ", end="")
997
- print(" ", int(count_mat[i, j]), " ", end="")
998
-
999
- def _f1_score(self, y_true: ndarray, y_pred: ndarray, labels: list) -> ndarray:
1000
- f1_vec = np.zeros(len(labels))
1001
-
1002
- # Calculate confusion mat
1006
+ def _summary_pred(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> None:
1003
1007
  count_mat = self._confu_mat(y_true, y_pred, labels)
1008
+ print(" ", " | ".join(f"--{label}--" for label in labels))
1009
+ for i, label_i in enumerate(labels):
1010
+ row = [f" {int(count_mat[i, j])} " for j in range(len(labels))]
1011
+ print(f"--{label_i}--|", " | ".join(row))
1004
1012
 
1005
- # sums over columns
1006
- sum1 = np.sum(count_mat, axis=0)
1007
- # sums over rows
1008
- sum2 = np.sum(count_mat, axis=1)
1009
- # Iterate over labels to calculate f1 scores of each one
1010
- for i in range(len(labels)):
1011
- precision = count_mat[i, i] / (sum1[i])
1012
- recall = count_mat[i, i] / (sum2[i])
1013
+ def _f1_score(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> np.ndarray:
1014
+ count_mat = self._confu_mat(y_true, y_pred, labels)
1015
+ sum_cols = np.sum(count_mat, axis=0)
1016
+ sum_rows = np.sum(count_mat, axis=1)
1013
1017
 
1014
- f1_vec[i] = 2 * ((precision * recall) / (precision + recall))
1018
+ # Avoid division by zero
1019
+ precision = np.divide(
1020
+ count_mat.diagonal(), sum_cols, out=np.zeros_like(sum_cols), where=sum_cols != 0
1021
+ )
1022
+ recall = np.divide(
1023
+ count_mat.diagonal(), sum_rows, out=np.zeros_like(sum_rows), where=sum_rows != 0
1024
+ )
1025
+ f1_vec = 2 * ((precision * recall) / (precision + recall))
1015
1026
 
1016
1027
  return f1_vec
1017
1028
 
1018
1029
  # Returns confusion matrix of predictions
1019
- def _confu_mat(self, y_true: ndarray, y_pred: ndarray, labels: list) -> ndarray:
1020
- labels = np.array(labels)
1021
- count_mat = np.zeros((len(labels), len(labels)))
1022
-
1023
- for i in range(len(labels)):
1024
- for j in range(len(y_pred)):
1025
- if y_pred[j] == labels[i]:
1026
- if y_pred[j] == y_true[j]:
1027
- count_mat[i, i] += 1
1028
- else:
1029
- x = np.where(labels == y_true[j])
1030
- count_mat[i, x[0]] += 1
1030
+ def _confu_mat(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> np.ndarray:
1031
+ num_classes = len(labels)
1032
+ label_mapping = {label: idx for idx, label in enumerate(labels)}
1033
+ count_mat = np.zeros((num_classes, num_classes))
1034
+
1035
+ for pred_label, true_label in zip(y_pred, y_true):
1036
+ if pred_label in label_mapping and true_label in label_mapping:
1037
+ count_mat[label_mapping[pred_label], label_mapping[true_label]] += 1
1031
1038
 
1032
1039
  return count_mat
1033
1040
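`PerformanceMeasures.f_mean` and `resp` now return their scores instead of only printing them, `_f1_score` guards against division by zero, and `_confu_mat` is rebuilt around a label-to-index mapping (rows indexed by predicted label, columns by true label). A short sketch, assuming integer labels and that the class is reachable from `likelihood.tools.tools`:

import numpy as np
from likelihood.tools.tools import PerformanceMeasures  # module path per the RECORD

y_true = np.array([0, 1, 1, 2, 0, 2])
y_pred = np.array([0, 1, 2, 2, 0, 1])
labels = [0, 1, 2]

pm = PerformanceMeasures()
mean_f = pm.f_mean(y_true, y_pred, labels)   # now returns the mean F-measure
res_p = pm.resp(y_true, y_pred, labels)      # now returns the Res_p metric
print(mean_f, res_p)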
 
@@ -1043,10 +1050,10 @@ class OneHotEncoder:
1043
1050
  def __init__(self) -> None:
1044
1051
  pass
1045
1052
 
1046
- def encode(self, x: ndarray | list):
1053
+ def encode(self, x: np.ndarray | list):
1047
1054
  self.x = x
1048
1055
 
1049
- if not isinstance(self.x, ndarray):
1056
+ if not isinstance(self.x, np.ndarray):
1050
1057
  self.x = np.array(self.x) # If not numpy array then convert it
1051
1058
 
1052
1059
  y = np.zeros(
@@ -1057,8 +1064,8 @@ class OneHotEncoder:
1057
1064
 
1058
1065
  return y
1059
1066
 
1060
- def decode(self, x: ndarray | list) -> ndarray:
1061
- if not isinstance(x, ndarray):
1067
+ def decode(self, x: np.ndarray | list) -> np.ndarray:
1068
+ if not isinstance(x, np.ndarray):
1062
1069
  x = np.array(x) # If not numpy array then convert it
1063
1070
 
1064
1071
  # We return the max values of each row
@@ -1220,17 +1227,33 @@ class FeatureSelection:
1220
1227
 
1221
1228
 
1222
1229
  def check_nan_inf(df: DataFrame) -> DataFrame:
1223
- """Checks for `NaN` and `Inf` values in the `DataFrame`. If any are found they will be removed."""
1230
+ """
1231
+ Checks for NaN and Inf values in the DataFrame. If any are found, they will be removed.
1232
+
1233
+ Parameters:
1234
+ df (DataFrame): The input DataFrame to be checked.
1235
+
1236
+ Returns:
1237
+ DataFrame: A new DataFrame with NaN and Inf values removed.
1238
+ """
1239
+
1224
1240
  nan_values = df.isnull().values.any()
1225
- count = np.isinf(df.select_dtypes(include="number")).values.sum()
1226
- print("There are null values : ", nan_values)
1227
- print("It contains " + str(count) + " infinite values")
1241
+ inf_values = np.isinf(df.select_dtypes(include="number")).values.any()
1242
+
1228
1243
  if nan_values:
1229
- warning_type = "UserWarning"
1230
- msg = "Some rows may have been deleted due to the existence of nan values."
1231
- print(f"{warning_type}: {msg}")
1232
- print("Missing values correctly removed : ", "{:,}".format(df.isnull().values.sum()))
1233
- df = df.dropna()
1244
+ print("UserWarning: Some rows may have been deleted due to the existence of NaN values.")
1245
+ df.dropna(inplace=True)
1246
+
1247
+ if inf_values:
1248
+ print("UserWarning: Some rows may have been deleted due to the existence of Inf values.")
1249
+ df.replace([np.inf, -np.inf], np.nan, inplace=True)
1250
+ df.dropna(inplace=True)
1251
+
1252
+ nan_count = df.isnull().values.sum()
1253
+ inf_count = np.isinf(df.select_dtypes(include="number")).values.sum()
1254
+
1255
+ print(f"NaN values removed: {nan_count}")
1256
+ print(f"Infinite values removed: {inf_count}")
1234
1257
 
1235
1258
  return df
1236
1259
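`check_nan_inf` now drops rows containing infinities as well as NaNs and calls `dropna(inplace=True)`, so the frame passed in is modified in place before being returned. A quick sketch of the expected effect:

import numpy as np
import pandas as pd
from likelihood.tools import check_nan_inf  # re-export confirmed by the simulation.py imports above

df = pd.DataFrame({"a": [1.0, np.nan, 3.0, np.inf], "b": [1, 2, 3, 4]})
clean = check_nan_inf(df)
print(len(clean))  # 2 -- the NaN row and the Inf row are both gone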
 
@@ -1244,6 +1267,7 @@ if __name__ == "__main__":
1244
1267
  helper = PerformanceMeasures()
1245
1268
  helper._summary_pred(y_true, y_pred, labels)
1246
1269
  print(helper._f1_score(y_true, y_pred, labels))
1270
+ print(helper.f_mean(y_true, y_pred, labels))
1247
1271
 
1248
1272
  # Use DataFrameEncoder
1249
1273
  # Create a DataFrame
@@ -1273,6 +1297,13 @@ if __name__ == "__main__":
1273
1297
  # Generate data
1274
1298
  x = np.random.rand(3, 100)
1275
1299
  y = 0.1 * x[0, :] + 0.4 * x[1, :] + 0.5 * x[2, :] + 0.1
1300
+ # Create a DataFrame
1301
+ df = pd.DataFrame(x.T, columns=["x1", "x2", "x3"])
1302
+ df["y"] = y
1303
+ # Instantiate FeatureSelection
1304
+ fs = FeatureSelection()
1305
+ print(fs.get_digraph(df, n_importances=1))
1306
+
1276
1307
  linear_model = LinearRegression()
1277
1308
  linear_model.fit(x, y)
1278
1309
  importance = linear_model.get_importances()
@@ -1303,7 +1334,7 @@ if __name__ == "__main__":
1303
1334
  plt.show()
1304
1335
 
1305
1336
  # Calculate the autocorrelation of the data
1306
- z = autocorr(a[0, :])
1337
+ z = AutoCorrelation(a[0, :])
1307
1338
  z.plot()
1308
1339
  # print(z())
1309
1340
 
@@ -1313,3 +1344,18 @@ if __name__ == "__main__":
1313
1344
  x = np.random.normal(mu, sigma, N)
1314
1345
  f, cdf_, ox = cdf(x, plot=True)
1315
1346
  invf, cdf_, ox = cdf(x, plot=True, inv=True)
1347
+
1348
+ encoder = OneHotEncoder()
1349
+ encoding = encoder.encode([1, 2, 3, 4, 5])
1350
+ assert np.array_equal(
1351
+ encoding,
1352
+ np.array(
1353
+ [
1354
+ [0, 1, 0, 0, 0, 0],
1355
+ [0, 0, 1, 0, 0, 0],
1356
+ [0, 0, 0, 1, 0, 0],
1357
+ [0, 0, 0, 0, 1, 0],
1358
+ [0, 0, 0, 0, 0, 1],
1359
+ ]
1360
+ ),
1361
+ )
likelihood-1.2.24.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: likelihood
-Version: 1.2.22
+Version: 1.2.24
 Summary: A package that performs the maximum likelihood algorithm.
 Home-page: https://github.com/jzsmoreno/likelihood/
 Author: J. A. Moreno-Guerra
likelihood-1.2.24.dist-info/RECORD CHANGED
@@ -1,19 +1,20 @@
 likelihood/__init__.py,sha256=5C0hapdsk85XZhN_rssRAEFpkRRuKNtj6cyRbqD2_gM,994
 likelihood/main.py,sha256=fcCkGOOWKjfvw2tLVqjuKPV8t0rVCIT9FlbYcOv4EYo,7974
 likelihood/graph/__init__.py,sha256=6TuFDfmXTwpLyHl7_KqBfdzW6zqHjGzIFvymjFPlvjI,21
-likelihood/graph/graph.py,sha256=wKJqgxXiSbnvzyW3SjhQVrqp00yKMHf3ph6CIDNVhNM,2891
-likelihood/graph/nn.py,sha256=jBgb2SMUwM5OBatkIxH2I-_hH1ok5aw2fwXq5a1VAEg,12306
+likelihood/graph/graph.py,sha256=hGWCznxaRQ8BfY2aLjrvwriZkAIsz5ydKXF4x_7b0EQ,3359
+likelihood/graph/nn.py,sha256=3HihXchK4FQcp0j-pzTO36RSNg7EjzhuXYY_8M3C2G0,12366
 likelihood/models/__init__.py,sha256=e6nB4w47w0Q9DrAFeP3OcUgcoHOtf7Il4mBhgf4AARg,52
+likelihood/models/hmm.py,sha256=0s0gFySH1u4NjRaZDxiZ8oeTaFhFrw1x0GJxwy3dFrA,6253
 likelihood/models/regression.py,sha256=9cakyGlJCEO6WfpoKLh3GxdXQeQp7cUvJIkQ5odT0TA,9404
-likelihood/models/simulation.py,sha256=mdgQPg_LEY5svPaF4TFv-DoQRE2oP2ig_uXnwINtewM,4039
+likelihood/models/simulation.py,sha256=L_9Mihcca7i_AnvWWrZilFV8VEhz_Z8fDLepmwBGSi8,8832
 likelihood/models/utils.py,sha256=VtEj07lV-GRoWraQgpfjU0jTt1Ntf9MXgYwe6XYQh20,1552
 likelihood/models/deep/__init__.py,sha256=-KIPippVaMqgG8mEgYjNxYQdqOUcFhUuKhbVe8TTCfo,28
 likelihood/models/deep/autoencoders.py,sha256=2P--nS96XwMi44q0OIxvIp6Mdbt-B4LqwCSXTn2jYrY,10070
 likelihood/tools/__init__.py,sha256=MCjsCWfBNKE2uMN0VizDN1uFzZ_md0X2WZeBdWhrCR8,50
 likelihood/tools/numeric_tools.py,sha256=FA44kbiAcxcquz1el_g3Pqsp5ii8XFkAIrsMs5bGkj0,11445
-likelihood/tools/tools.py,sha256=O39aPxTNsaBVSJFIkNsUESNSkfG4C7GG77wcR51a8IQ,42543
-likelihood-1.2.22.dist-info/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
-likelihood-1.2.22.dist-info/METADATA,sha256=_7sP0uqn2Qt-HCsWwVtqVML8K9Mite9BDktrT1BX_6I,2504
-likelihood-1.2.22.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-likelihood-1.2.22.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
-likelihood-1.2.22.dist-info/RECORD,,
+likelihood/tools/tools.py,sha256=iZBC7IHTFpAyxooyel7ZFi-5-G0nCotNLLtxenPw9T8,44303
+likelihood-1.2.24.dist-info/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
+likelihood-1.2.24.dist-info/METADATA,sha256=Z6fUcQ3cU1oL8_o6px8uidolXPhlnivmztoZQpvlx8o,2504
+likelihood-1.2.24.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+likelihood-1.2.24.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
+likelihood-1.2.24.dist-info/RECORD,,