likelihood 1.2.22.tar.gz → 1.2.23.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {likelihood-1.2.22 → likelihood-1.2.23}/PKG-INFO +1 -1
  2. {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/graph/graph.py +17 -0
  3. likelihood-1.2.23/likelihood/models/simulation.py +223 -0
  4. {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/tools/tools.py +307 -261
  5. {likelihood-1.2.22 → likelihood-1.2.23}/likelihood.egg-info/PKG-INFO +1 -1
  6. likelihood-1.2.22/likelihood/models/simulation.py +0 -103
  7. {likelihood-1.2.22 → likelihood-1.2.23}/LICENSE +0 -0
  8. {likelihood-1.2.22 → likelihood-1.2.23}/README.md +0 -0
  9. {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/__init__.py +0 -0
  10. {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/graph/__init__.py +0 -0
  11. {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/graph/nn.py +0 -0
  12. {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/main.py +0 -0
  13. {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/models/__init__.py +0 -0
  14. {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/models/deep/__init__.py +0 -0
  15. {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/models/deep/autoencoders.py +0 -0
  16. {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/models/regression.py +0 -0
  17. {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/models/utils.py +0 -0
  18. {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/tools/__init__.py +0 -0
  19. {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/tools/numeric_tools.py +0 -0
  20. {likelihood-1.2.22 → likelihood-1.2.23}/likelihood.egg-info/SOURCES.txt +0 -0
  21. {likelihood-1.2.22 → likelihood-1.2.23}/likelihood.egg-info/dependency_links.txt +0 -0
  22. {likelihood-1.2.22 → likelihood-1.2.23}/likelihood.egg-info/requires.txt +0 -0
  23. {likelihood-1.2.22 → likelihood-1.2.23}/likelihood.egg-info/top_level.txt +0 -0
  24. {likelihood-1.2.22 → likelihood-1.2.23}/setup.cfg +0 -0
  25. {likelihood-1.2.22 → likelihood-1.2.23}/setup.py +0 -0
{likelihood-1.2.22 → likelihood-1.2.23}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: likelihood
- Version: 1.2.22
+ Version: 1.2.23
  Summary: A package that performs the maximum likelihood algorithm.
  Home-page: https://github.com/jzsmoreno/likelihood/
  Author: J. A. Moreno-Guerra
{likelihood-1.2.22 → likelihood-1.2.23}/likelihood/graph/graph.py
@@ -74,3 +74,20 @@ class DynamicGraph(FeatureSelection):
  nx_graph.add_edges_from([(source, target, edge)])

  return nx_graph
+
+
+ # -------------------------------------------------------------------------
+ if __name__ == "__main__":
+ import numpy as np
+ import pandas as pd
+
+ # Generate data
+ x = np.random.rand(3, 100)
+ y = 0.1 * x[0, :] + 0.4 * x[1, :] + 0.5 * x[2, :] + 0.1
+ # Create a DataFrame
+ df = pd.DataFrame(x.T, columns=["x1", "x2", "x3"])
+ df["y"] = y
+ # Instantiate DynamicGraph
+ fs = DynamicGraph(df, n_importances=2)
+ print(fs.fit())
+ fs.draw()
likelihood-1.2.23/likelihood/models/simulation.py
@@ -0,0 +1,223 @@
+ import pickle
+ import warnings
+ from typing import List, Tuple, Union
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ from numpy import ndarray
+ from pandas.core.frame import DataFrame
+
+ from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, cdf, check_nan_inf
+
+ # Suppress RankWarning
+ warnings.simplefilter("ignore", np.RankWarning)
+
+
+ # --------------------------------------------------------------------------------------------------------------------------------------
+ def categories_by_quartile(df: DataFrame, column: str) -> Tuple[str, str]:
+ # Count the frequency of each category in the column
+ freq = df[column].value_counts()
+
+ # Calculate the 25th percentile (Q1) and 75th percentile (Q3)
+ q1 = freq.quantile(0.25)
+ q3 = freq.quantile(0.75)
+
+ # Filter categories that are below the 25th percentile and above the 75th percentile
+ least_frequent = freq[freq <= q1]
+ most_frequent = freq[freq >= q3]
+
+ # Get the least frequent category (25th percentile) and the most frequent category (75th percentile)
+ least_frequent_category = least_frequent.idxmin() if not least_frequent.empty else None
+ most_frequent_category = most_frequent.idxmax() if not most_frequent.empty else None
+
+ return least_frequent_category, most_frequent_category
+
+
+ class SimulationEngine(FeatureSelection):
+ """
+ This class implements a predictive model that utilizes multiple linear regression for numerical target variables
+ and multiple logistic regression for categorical target variables.
+
+ The class provides methods for training the model on a given dataset, making predictions,
+ and evaluating the model's performance.
+
+ Key features:
+ - Supports both numerical and categorical target variables, automatically selecting the appropriate regression method.
+ - Includes methods for data preprocessing, model fitting, prediction, and evaluation metrics.
+ - Designed to be flexible and user-friendly, allowing for easy integration with various datasets.
+
+ Usage:
+ - Instantiate the class with the training data and target variable.
+ - Call the fit method to train the model.
+ - Use the predict method to generate predictions on new data.
+ - Evaluate the model using built-in metrics for accuracy and error.
+
+ This class is suitable for applications in data analysis and machine learning, enabling users to leverage regression techniques
+ for both numerical and categorical outcomes efficiently.
+ """
+
+ def __init__(self, use_scaler: bool = False, **kwargs):
+
+ self.df = pd.DataFrame()
+ self.n_importances = None
+ self.use_scaler = use_scaler
+ self.proba_dict = {}
+
+ super().__init__(**kwargs)
+
+ def predict(self, df: DataFrame, column: str) -> ndarray | list:
+ # Let us assign the dictionary entries corresponding to the column
+ w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]
+
+ df = df[names_cols].copy()
+ # Change the scale of the dataframe
+ dataset = self.df.copy()
+ dataset.drop(columns=column, inplace=True)
+ numeric_df = dataset.select_dtypes(include="number")
+ if self.use_scaler:
+ scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
+ _ = scaler.rescale()
+ dataset_ = df.copy()
+ numeric_df = dataset_.select_dtypes(include="number")
+ numeric_scaled = scaler.rescale(dataset_=numeric_df.to_numpy())
+ numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
+ for col in numeric_df.columns:
+ df[col] = numeric_df[col].values
+
+ # Encoding the datadrame
+ for num, colname in enumerate(dfe._encode_columns):
+ if df[colname].dtype == "object":
+ encode_dict = dfe.encoding_list[num]
+ df[colname] = df[colname].apply(
+ dfe._code_transformation_to, dictionary_list=encode_dict
+ )
+
+ # PREDICTION
+ y = df.to_numpy() @ w
+
+ # Categorical column
+ if quick_encoder != None:
+
+ one_hot = OneHotEncoder()
+ y = one_hot.decode(y)
+ encoding_dic = quick_encoder.decoding_list[0]
+ y = [encoding_dic[item] for item in y]
+ # Numeric column
+ else:
+ if self.use_scaler:
+ # scale output
+ y += 1
+ y /= 2
+ y = y * (self.df[column].max() - self.df[column].min())
+
+ return y[:]
+
+ def _encode(self, df: DataFrame) -> ndarray | list:
+ df = df.copy()
+ column = df.columns[0]
+ frec = df[column].value_counts() / len(df)
+ df.loc[:, "frec"] = df[column].map(frec)
+ df.sort_values("frec", inplace=True)
+ keys = df[column].to_list()
+ values = df["frec"].to_list()
+ return dict(zip(keys, values))
+
+ def fit(self, df: DataFrame, n_importances: int, **kwargs) -> None:
+ self.df = df
+ self.n_importances = n_importances
+ # We run the feature selection algorithm
+ self.get_digraph(self.df, self.n_importances, self.use_scaler)
+ proba_dict_keys = list(self.w_dict.keys())
+ self.proba_dict = dict(zip(proba_dict_keys, [i for i in range(len(proba_dict_keys))]))
+ for key in proba_dict_keys:
+ x = (
+ self.df[key].values,
+ None if self.df[key].dtype != "object" else self._encode(self.df[[key]]),
+ )
+ poly = kwargs.get("poly", 9)
+ plot = kwargs.get("plot", False)
+ if not x[1]:
+ media = self.df[key].mean()
+ desviacion_estandar = self.df[key].std()
+ cota_inferior = media - 1.5 * desviacion_estandar
+ cota_superior = media + 1.5 * desviacion_estandar
+ if plot:
+ print(f"Cumulative Distribution Function ({key})")
+ f, cdf_, ox = cdf(x[0].flatten(), poly=poly, plot=plot)
+ else:
+ f, ox = None, None
+ least_frequent_category, most_frequent_category = categories_by_quartile(
+ self.df[[key]], key
+ )
+ cota_inferior = x[1].get(least_frequent_category, 0)
+ cota_superior = x[1].get(most_frequent_category, 0)
+ self.proba_dict[key] = (
+ f if f else None,
+ x[1],
+ (np.mean(np.abs(np.diff(ox))) / 2.0 if isinstance(ox, np.ndarray) else None),
+ f(cota_inferior) if f else cota_inferior,
+ f(cota_superior) if f else cota_superior,
+ )
+
+ def get_proba(self, value: Union[Union[float, int], str] | list, colname: str) -> List[float]:
+ value = (
+ value
+ if isinstance(value, list)
+ else value.flatten().tolist() if isinstance(value, np.ndarray) else [value]
+ )
+ return [
+ (
+ self.proba_dict[colname][0](val)
+ - self.proba_dict[colname][0](val - self.proba_dict[colname][2])
+ if (isinstance(val, float) or isinstance(val, int))
+ else self.proba_dict[colname][1].get(val, 0)
+ )
+ for val in value
+ ]
+
+ def pred_outliers(self, value: Union[Union[float, int], str] | list, colname: str) -> List[str]:
+ return [
+ (
+ "inlier"
+ if (self.proba_dict[colname][3] < val < self.proba_dict[colname][4])
+ else "outlier"
+ )
+ for val in self.get_proba(value, colname)
+ ]
+
+ def _clean_data(self, df: DataFrame) -> DataFrame:
+
+ df.replace([np.inf, -np.inf], np.nan, inplace=True)
+ df.replace(" ", np.nan, inplace=True)
+ df = check_nan_inf(df)
+ df = df.reset_index()
+ df = df.drop(columns=["index"])
+
+ return df
+
+ def save(self, filename: str = "./simulation_model") -> None:
+ """
+ Save the state of the SimulationEngine to a file.
+
+ Parameters:
+ filename (str): The name of the file where the object will be saved.
+ """
+ filename = filename if filename.endswith(".pkl") else filename + ".pkl"
+ with open(filename, "wb") as f:
+ pickle.dump(self, f)
+
+ @staticmethod
+ def load(filename: str = "./simulation_model"):
+ """
+ Load the state of a SimulationEngine from a file.
+
+ Parameters:
+ filename (str): The name of the file containing the saved object.
+
+ Returns:
+ SimulationEngine: A new instance of SimulationEngine with the loaded state.
+ """
+ filename = filename + ".pkl" if not filename.endswith(".pkl") else filename
+ with open(filename, "rb") as f:
+ return pickle.load(f)
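Note: the rewritten SimulationEngine moves the training data and `n_importances` out of the constructor and into `fit`, and adds probability and outlier helpers plus pickle-based persistence. A minimal usage sketch, assuming the class is importable as `likelihood.models.simulation.SimulationEngine` (path inferred from the file layout) and using a made-up numeric DataFrame:

    import numpy as np
    import pandas as pd

    from likelihood.models.simulation import SimulationEngine  # import path assumed from the file layout

    # Illustrative data; column names are made up for this sketch
    df = pd.DataFrame(np.random.rand(100, 3), columns=["x1", "x2", "x3"])
    df["y"] = 0.1 * df["x1"] + 0.4 * df["x2"] + 0.5 * df["x3"] + 0.1

    engine = SimulationEngine(use_scaler=False)
    engine.fit(df, n_importances=2)            # runs feature selection and fits the per-column CDFs

    y_hat = engine.predict(df, column="y")     # predictions for the "y" column
    probs = engine.get_proba(y_hat, "y")       # probabilities from the fitted CDF polynomial
    flags = engine.pred_outliers(y_hat, "y")   # "inlier" / "outlier" per value

    engine.save("./simulation_model")          # written as ./simulation_model.pkl
    restored = SimulationEngine.load("./simulation_model")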
{likelihood-1.2.22 → likelihood-1.2.23}/likelihood/tools/tools.py
@@ -1,15 +1,18 @@
  import math
  import os
  import pickle
- from typing import Callable, Dict, List, Tuple
+ import warnings
+ from typing import Callable, Dict, List, Tuple, Union

  import matplotlib.pyplot as plt
  import numpy as np
  import pandas as pd
  import yaml
- from numpy import ndarray
  from pandas.core.frame import DataFrame

+ # Suppress RankWarning
+ warnings.simplefilter("ignore", np.RankWarning)
+

  # -------------------------------------------------------------------------

  """
@@ -68,7 +71,7 @@ def difference_quotient(f: Callable, x: float, h: float) -> Callable:
  return (f(x + h) - f(x)) / h


- def partial_difference_quotient(f: Callable, v: ndarray, i: int, h: float) -> ndarray:
+ def partial_difference_quotient(f: Callable, v: np.ndarray, i: int, h: float) -> np.ndarray:
  """Calculates the partial difference quotient of `f`

  Parameters
@@ -93,7 +96,7 @@ def partial_difference_quotient(f: Callable, v: ndarray, i: int, h: float) -> nd
  return (f(w) - f(v)) / h


- def estimate_gradient(f: Callable, v: ndarray, h: float = 1e-4) -> List[ndarray]:
+ def estimate_gradient(f: Callable, v: np.ndarray, h: float = 1e-4) -> List[np.ndarray]:
  """Calculates the gradient of `f` at `v`

  Parameters
@@ -138,35 +141,32 @@ def generate_feature_yaml(
  A dictionary with four keys ('ordinal_features', 'numeric_features', 'categorical_features', 'ignore_features')
  mapping to lists of feature names. Or a YAML formatted string if `yaml_string` is `True`.
  """
+ ignore_features = ignore_features or []
  feature_info = {
  "ordinal_features": [],
  "numeric_features": [],
  "categorical_features": [],
- "ignore_features": [],
+ "ignore_features": ignore_features,
  }

  for col in df.columns:
- if ignore_features and col in ignore_features:
+ if col in ignore_features:
  continue

  if pd.api.types.is_numeric_dtype(df[col]):
- feature_info["numeric_features"].append(col)
+ if pd.api.types.is_integer_dtype(df[col]) or pd.api.types.is_float_dtype(df[col]):
+ feature_info["numeric_features"].append(col)
+ elif pd.api.types.is_bool_dtype(df[col]):
+ feature_info["ordinal_features"].append(col) # Assuming bool can be ordinal
  elif pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_categorical_dtype(df[col]):
  feature_info["categorical_features"].append(col)
- elif pd.api.types.is_integer_dtype(df[col]):
- feature_info["ordinal_features"].append(col)
- elif pd.api.types.is_float_dtype(df[col]):
- feature_info["ordinal_features"].append(col)
- elif pd.api.types.is_bool_dtype(df[col]):
- feature_info["ordinal_features"].append(col)
  else:
  print(f"Unknown type for feature {col}")
- feature_info["ignore_features"] = ignore_features

  if yaml_string:
  return yaml.dump(feature_info, default_flow_style=False)
- else:
- return feature_info
+
+ return feature_info


  # a function that calculates the percentage of missing values per column is defined
@@ -192,61 +192,9 @@ def cal_missing_values(df: DataFrame) -> None:
  )


- def calculate_probability(x: ndarray, points: int = 1, cond: bool = True) -> ndarray:
- """Calculates the probability of the data.
-
- Parameters
- ----------
- x : `np.array`
- An array containing the data.
- points : `int`
- An integer value. By default it is set to `1`.
- cond : `bool`
- A boolean value. By default it is set to `True`.
-
- Returns
- -------
- p : `np.array`
- An array containing the probability of the data.
-
- """
-
- p = []
-
- f = cdf(x)[0]
- for i in range(len(x)):
- p.append(f(x[i]))
- p = np.array(p)
- if cond:
- if np.prod(p[-points]) > 1:
- print("\nThe probability of the data cannot be calculated.\n")
- else:
- if np.prod(p[-points]) < 0:
- print("\nThe probability of the data cannot be calculated.\n")
- else:
- print(
- "The model has a probability of {:.2f}% of being correct".format(
- np.prod(p[-points]) * 100
- )
- )
- else:
- if np.sum(p[-points]) < 0:
- print("\nThe probability of the data cannot be calculated.\n")
- else:
- if np.sum(p[-points]) > 1:
- print("\nThe probability of the data cannot be calculated.\n")
- else:
- print(
- "The model has a probability of {:.2f}% of being correct".format(
- np.sum(p[-points]) * 100
- )
- )
- return p
-
-
  def cdf(
- x: ndarray, poly: int = 9, inv: bool = False, plot: bool = False, savename: str = None
- ) -> ndarray:
+ x: np.ndarray, poly: int = 9, inv: bool = False, plot: bool = False, savename: str = None
+ ) -> tuple:
  """Calculates the cumulative distribution function of the data.

  Parameters
@@ -254,165 +202,229 @@ def cdf(
  x : `np.array`
  An array containing the data.
  poly : `int`
- An integer value. By default it is set to `9`.
+ Degree of the polynomial fit. By default it is set to `9`.
  inv : `bool`
- A boolean value. By default it is set to `False`.
+ If True, calculate the inverse CDF (quantile function).
+ plot : `bool`
+ If True, plot the results.
+ savename : `str`, optional
+ Filename to save the plot.

  Returns
  -------
- cdf_ : `np.array`
- An array containing the cumulative distribution function.
-
+ fit : `np.poly1d`
+ Polynomial fit of the CDF or quantile function.
+ cdf_values : `np.array`
+ Cumulative distribution values.
+ sorted_x : `np.array`
+ Sorted input data.
  """

- cdf_ = np.cumsum(x) / np.sum(x)
+ if len(x) == 0:
+ raise ValueError("Input array 'x' must not be empty.")
+
+ cdf_values = np.cumsum(x) / np.sum(x)
+ sorted_x = np.sort(x)

- ox = np.sort(x)
- I = np.ones(len(ox))
- M = np.triu(I)
- df = np.dot(ox, M)
- df_ = df / np.max(df)
+ # Calculate the CDF or inverse CDF (quantile function)
+ probabilities = np.linspace(0, 1, len(sorted_x))

  if inv:
- fit = np.polyfit(df_, ox, poly)
+ fit = np.polyfit(probabilities, sorted_x, poly)
  f = np.poly1d(fit)
+ plot_label = "Quantile Function"
+ x_values = probabilities
+ y_values = sorted_x
  else:
- fit = np.polyfit(ox, df_, poly)
+ fit = np.polyfit(sorted_x, probabilities, poly)
  f = np.poly1d(fit)
+ plot_label = "Cumulative Distribution Function"
+ x_values = sorted_x
+ y_values = cdf_values

  if plot:
- if inv:
- plt.plot(df_, ox, "o", label="inv cdf")
- plt.plot(df_, f(df_), "r--", label="fit")
- plt.title("Quantile Function")
- plt.xlabel("Probability")
- plt.ylabel("Value")
- plt.legend()
- if savename != None:
- plt.savefig(savename, dpi=300)
- plt.show()
- else:
- plt.plot(ox, cdf_, "o", label="cdf")
- plt.plot(ox, f(ox), "r--", label="fit")
- plt.title("Cumulative Distribution Function")
- plt.xlabel("Value")
- plt.ylabel("Probability")
- plt.legend()
- if savename != None:
- plt.savefig(savename, dpi=300)
- plt.show()
+ plt.figure()
+ plt.plot(x_values, y_values, "o", label="data")
+ plt.plot(x_values, f(x_values), "r--", label="fit")
+ plt.title(plot_label)
+ plt.xlabel("Probability" if inv else "Value")
+ plt.ylabel("Value" if inv else "Probability")
+ plt.legend()
+ if savename:
+ plt.savefig(savename, dpi=300)
+ plt.show()

- return f, cdf_, ox
+ return f, cdf_values, sorted_x


- class corr:
- """Calculates the correlation of the data.
+ def calculate_probability(x: np.ndarray, points: int = 1, cond: bool = True) -> np.ndarray:
+ """Calculates the probability of the data based on the CDF fit.

  Parameters
  ----------
  x : `np.array`
  An array containing the data.
- y : `np.array`
- An array containing the data.
+ points : `int`
+ Number of points to consider for the final probability calculation.
+ cond : `bool`
+ Condition to use product (True) or sum (False) for the final probability check.

  Returns
  -------
- z : `np.array`
- An array containing the correlation of `x` and `y`.
-
+ p : `np.array`
+ Array containing the probabilities of the data.
  """

+ if len(x) == 0:
+ raise ValueError("Input array 'x' must not be empty.")
+
+ fit, _, sorted_x = cdf(x)
+ p = fit(x)
+
+ # Validate probability values
+ if cond:
+ prob_value = np.prod(p[-points])
+ message = "product"
+ else:
+ prob_value = np.sum(p[-points])
+ message = "sum"
+
+ if 0 <= prob_value <= 1:
+ print(f"The model has a probability of {prob_value * 100:.2f}% based on the {message}.")
+ else:
+ print("\nThe probability of the data cannot be calculated.\n")
+
+ return p
+
+
+ class CorrelationBase:
+ """Base class for correlation calculations."""
+
  __slots__ = ["x", "y", "result", "z"]

- def __init__(self, x: ndarray, y: ndarray):
+ def __init__(self, x: np.ndarray, y: Union[np.ndarray, None] = None):
  self.x = x
- self.y = y
- self.result = np.correlate(x, y, mode="full")
+ self.y = y if y is not None else x # Default to autocorrelation if y is not provided
+ self._compute_correlation()
  self.z = self.result[self.result.size // 2 :]
- self.z = self.z / float(np.abs(self.z).max())
+ self.z /= np.abs(self.z).max()
+
+ def _compute_correlation(self):
+ """Compute the correlation between x and y (or x with itself for autocorrelation)."""
+ self.result = np.correlate(self.x, self.y, mode="full")

  def plot(self):
- plt.plot(range(len(self.z)), self.z, label="Correlation")
+ """Plot the correlation or autocorrelation."""
+ plt.plot(range(len(self.z)), self.z, label=self._get_label())
  plt.legend()
  plt.show()

+ def _get_label(self) -> str:
+ return "Autocorrelation" if np.array_equal(self.x, self.y) else "Correlation"
+
  def __call__(self):
+ """Return the computed correlation or autocorrelation."""
  return self.z


- class autocorr:
- """Calculates the autocorrelation of the data.
+ class Correlation(CorrelationBase):
+ """Calculates the cross-correlation of two datasets.

  Parameters
  ----------
- x : `np.array`
- An array containing the data.
+ x : `np.ndarray`
+ An array containing the first dataset.
+ y : `np.ndarray`
+ An array containing the second dataset.

  Returns
  -------
- z : `np.array`
- An array containing the autocorrelation of the data.
+ z : `np.ndarray`
+ An array containing the correlation of `x` and `y`.

  """

- __slots__ = ["x", "result", "z"]
+ def __init__(self, x: np.ndarray, y: np.ndarray):
+ super().__init__(x, y)

- def __init__(self, x: ndarray):
- self.x = x
- self.result = np.correlate(x, x, mode="full")
- self.z = self.result[self.result.size // 2 :]
- self.z = self.z / float(np.abs(self.z).max())

- def plot(self):
- plt.plot(range(len(self.z)), self.z, label="Autocorrelation")
- plt.legend()
- plt.show()
+ class AutoCorrelation(CorrelationBase):
+ """Calculates the autocorrelation of a dataset.

- def __call__(self):
- return self.z
+ Parameters
+ ----------
+ x : `np.ndarray`
+ An array containing the data.

+ Returns
+ -------
+ z : `np.ndarray`
+ An array containing the autocorrelation of the data.
+ """
+
+ def __init__(self, x: np.ndarray):
+ super().__init__(x)

- def fft_denoise(dataset: ndarray, sigma: float = 0, mode: bool = True) -> Tuple[ndarray, float]:
- """Performs the noise removal using the Fast Fourier Transform.
+
+ def fft_denoise(
+ dataset: np.ndarray, sigma: float = 0, mode: bool = True
+ ) -> Tuple[np.ndarray, np.ndarray]:
+ """Performs noise removal using the Fast Fourier Transform.

  Parameters
  ----------
- dataset : `np.array`
- An array containing the noised data.
- sigma : `float`
- A `float` between `0` and `1`. By default it is set to `0`.
- mode : `bool`
- A boolean value. By default it is set to `True`.
+ dataset : `np.ndarray`
+ An array containing the noised data. Expected shape (num_samples, num_points).
+ sigma : `float`, default=0
+ A float between 0 and 1 representing the threshold for noise filtering.
+ mode : `bool`, default=True
+ If True, print progress messages.

  Returns
  -------
- dataset : `np.array`
- An array containing the denoised data.
- period : `float`
- period of the function described by the dataset
-
+ denoised_dataset : `np.ndarray`
+ An array containing the denoised data with the same shape as `dataset`.
+ periods : `np.ndarray`
+ Array of estimated periods for each sample in `dataset`.
  """
- dataset_ = dataset.copy()
- for i in range(dataset.shape[0]):
- n = dataset.shape[1]
- fhat = np.fft.fft(dataset[i, :], n)
- freq = (1 / n) * np.arange(n)
- L = np.arange(1, np.floor(n / 2), dtype="int")
- PSD = fhat * np.conj(fhat) / n
- indices = PSD > np.mean(PSD) + sigma * np.std(PSD)
- PSDclean = PSD * indices # Zero out all others
- fhat = indices * fhat
- ffilt = np.fft.ifft(fhat) # Inverse FFT for filtered time signal
- dataset_[i, :] = ffilt.real
+
+ if not (0 <= sigma <= 1):
+ raise ValueError("sigma must be between 0 and 1")
+
+ num_samples, n_points = dataset.shape
+ denoised_dataset = np.zeros_like(dataset)
+ periods = np.zeros(num_samples)
+
+ # Precompute values that do not change within the loop
+ freq = (1 / n_points) * np.arange(n_points)
+ L = np.arange(1, np.floor(n_points / 2), dtype=int)
+
+ for i in range(num_samples):
+ fhat = np.fft.fft(dataset[i, :], n_points)
+ PSD = fhat * np.conj(fhat) / n_points
+ threshold = np.mean(PSD) + sigma * np.std(PSD)
+ indices = PSD > threshold
+
+ # Zero out all others in frequency domain
+ PSDclean = PSD * indices
+ fhat_cleaned = fhat * indices
+
+ # Inverse FFT for filtered time signal
+ denoised_signal = np.fft.ifft(fhat_cleaned).real
+ denoised_dataset[i, :] = denoised_signal
+
  # Calculate the period of the signal
- period = 1 / (2 * freq[L][np.argmax(fhat[L])])
+ peak_index = L[np.argmax(np.abs(fhat[L]))]
+ periods[i] = 1 / (2 * freq[peak_index])
+
  if mode:
  print(f"The {i+1}-th row of the dataset has been denoised.")
- print(f"The period is {round(period, 4)}")
- return dataset_, period
+ print(f"The estimated period is {round(periods[i], 4)}")
+
+ return denoised_dataset, periods


- def get_period(dataset: ndarray) -> float:
+ def get_period(dataset: np.ndarray) -> float:
  """Calculates the periodicity of a `dataset`.

  Parameters
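Note: as the updated docstrings in this hunk describe, `cdf` now returns the polynomial fit together with the cumulative values and the sorted data, and `fft_denoise` returns one estimated period per row instead of a single float. A short sketch of the new signatures (importing `cdf` from `likelihood.tools` is confirmed by the new simulation.py; the `fft_denoise` re-export is an assumption):

    import numpy as np

    from likelihood.tools import cdf, fft_denoise  # fft_denoise re-export assumed

    x = np.random.normal(0, 1, 500)
    fit, cdf_values, sorted_x = cdf(x, poly=9)   # fit is an np.poly1d approximating the CDF
    print(fit(0.0))                              # roughly P(X <= 0) for this sample

    signals = np.sin(np.linspace(0, 8 * np.pi, 256)).reshape(1, -1)
    denoised, periods = fft_denoise(signals, sigma=0.5, mode=False)
    print(denoised.shape, periods.shape)         # (1, 256) and (1,)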
@@ -426,13 +438,31 @@ def get_period(dataset: ndarray) -> float:
  period of the function described by the `dataset`
  """
  n = dataset.size
- fhat = np.fft.fft(dataset, n)
- freq = (1 / n) * np.arange(n)
- L = np.arange(1, np.floor(n / 2), dtype="int")
- PSD = fhat * np.conj(fhat) / n
- indices = PSD > np.mean(PSD) + np.std(PSD)
- fhat = indices * fhat
- period = 1 / (2 * freq[L][np.argmax(fhat[L])])
+
+ # Ensure there are enough points for FFT analysis
+ if n < 2:
+ raise ValueError("Dataset must contain at least two points.")
+
+ # Compute the FFT and PSD
+ fhat = np.fft.rfft(dataset) # Use rfft for real-valued input to save computation
+ freqs = np.fft.rfftfreq(n) # Get only positive frequencies
+
+ # Calculate the Power Spectral Density (PSD)
+ PSD = np.abs(fhat) ** 2 / n
+
+ # Remove the first frequency component (DC component)
+ PSD[0] = 0
+
+ # Find the index of the maximum PSD value, excluding the DC component
+ max_psd_index = np.argmax(PSD)
+
+ # Calculate the period based on the corresponding frequency
+ dominant_freq = freqs[max_psd_index]
+ if dominant_freq == 0:
+ raise ValueError("No significant periodic component found in the dataset.")
+
+ period = 1 / dominant_freq
+
  return period


@@ -468,7 +498,7 @@ class LogisticRegression:

  self.importance = []

- def fit(self, dataset: ndarray, values: ndarray) -> None:
+ def fit(self, dataset: np.ndarray, values: np.ndarray) -> None:
  """Performs linear multiple model training

  Parameters
@@ -501,7 +531,7 @@
  a = np.around(self.w[i], decimals=8)
  self.importance.append(a)

- def predict(self, datapoints: ndarray) -> ndarray:
+ def predict(self, datapoints: np.ndarray) -> np.ndarray:
  """
  Performs predictions for a set of points

@@ -515,7 +545,7 @@

  return sig(np.array(self.importance) @ datapoints)

- def get_importances(self, print_important_features: bool = False) -> ndarray:
+ def get_importances(self, print_important_features: bool = False) -> np.ndarray:
  """
  Returns the important features

@@ -547,7 +577,7 @@ class LinearRegression:

  self.importance = []

- def fit(self, dataset: ndarray, values: ndarray, verbose: bool = False) -> None:
+ def fit(self, dataset: np.ndarray, values: np.ndarray, verbose: bool = False) -> None:
  """Performs linear multiple model training

  Parameters
@@ -580,7 +610,7 @@
  print("\nParameters:", np.array(self.importance).shape)
  print("RMSE: {:.4f}".format(mean_square_error(self.y, self.predict(self.X))))

- def predict(self, datapoints: ndarray) -> ndarray:
+ def predict(self, datapoints: np.ndarray) -> np.ndarray:
  """
  Performs predictions for a set of points

@@ -592,7 +622,7 @@
  """
  return np.array(self.importance) @ datapoints

- def get_importances(self, print_important_features: bool = False) -> ndarray:
+ def get_importances(self, print_important_features: bool = False) -> np.ndarray:
  """
  Returns the important features

@@ -614,7 +644,7 @@ class LinearRegression:
  return np.array(self.importance)


- def cal_average(y: ndarray, alpha: float = 1):
+ def cal_average(y: np.ndarray, alpha: float = 1):
  """Calculates the moving average of the data

  Parameters
@@ -642,12 +672,12 @@ class DataScaler:

  __slots__ = ["dataset_", "_n", "data_scaled", "values", "transpose", "inv_fitting"]

- def __init__(self, dataset: ndarray, n: int = 1) -> None:
+ def __init__(self, dataset: np.ndarray, n: int = 1) -> None:
  """Initializes the parameters required for scaling the data"""
  self.dataset_ = dataset.copy()
  self._n = n

- def rescale(self, dataset_: ndarray | None = None) -> ndarray:
+ def rescale(self, dataset_: np.ndarray | None = None) -> np.ndarray:
  """Perform a standard rescaling of the data

  Returns
@@ -655,7 +685,7 @@ class DataScaler:
  data_scaled : `np.array`
  An array containing the scaled data.
  """
- if isinstance(dataset_, ndarray):
+ if isinstance(dataset_, np.ndarray):
  data_scaled = np.copy(dataset_)
  mu = self.values[0]
  sigma = self.values[1]
@@ -711,7 +741,7 @@ class DataScaler:

  return self.data_scaled

- def scale(self, dataset_: ndarray) -> ndarray:
+ def scale(self, dataset_: np.ndarray) -> np.ndarray:
  """Performs the inverse operation to the rescale function

  Parameters
@@ -755,7 +785,7 @@ def generate_series(n: int, n_steps: int, incline: bool = True):
  return series.astype(np.float32)


- def mean_square_error(y_true: ndarray, y_pred: ndarray, print_error: bool = False):
+ def mean_square_error(y_true: np.ndarray, y_pred: np.ndarray, print_error: bool = False):
  """Calculates the Root Mean Squared Error

  Parameters
@@ -946,88 +976,65 @@ class PerformanceMeasures:
  pass

  # Performance measure Res_T
- def f_mean(self, y_true: ndarray, y_pred: ndarray, labels: list) -> None:
- n = len(labels)
+ def f_mean(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> float:
+ F_vec = self._f1_score(y_true, y_pred, labels)
+ mean_f_measure = np.mean(F_vec)

- F_vec = self._f1_score(y_true, y_pred, labels=labels)
- a = np.sum(F_vec)
+ for label, f_measure in zip(labels, F_vec):
+ print(f"F-measure of label {label} -> {f_measure}")

- for i in range(len(F_vec)):
- print("F-measure of label ", labels[i], " -> ", F_vec[i])
+ print(f"Mean of F-measure -> {mean_f_measure}")

- print("Mean of F-measure -> ", a / n)
+ return mean_f_measure

  # Performance measure Res_P
- def resp(self, y_true: ndarray, y_pred: ndarray, labels: list) -> None:
- # We initialize sum counters
- sum1 = 0
- sum2 = 0
-
- # Calculamos T_C
+ def resp(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> float:
  T_C = len(y_true)
- for i in range(len(labels)):
- # We calculate instances of the classes and their F-measures
- sum1 += (1 - ((y_true == labels[i]).sum() / T_C)) * self._fi_measure(
- y_true, y_pred, labels, i
- )
- sum2 += 1 - ((y_true == labels[i]).sum()) / T_C
+ sum1, sum2 = 0.0, 0.0
+ F_vec = self._f1_score(y_true, y_pred, labels)

- # Print the metric corresponding to the prediction vector
- print("Metric Res_p ->", sum1 / sum2)
+ for label_idx, label in enumerate(labels):
+ class_instances = np.sum(y_true == label) / T_C
+ sum1 += (1 - class_instances) * F_vec[label_idx]
+ sum2 += 1 - class_instances

- def _fi_measure(self, y_true: ndarray, y_pred: ndarray, labels: list, i: int) -> int:
- F_vec = self._f1_score(y_true, y_pred, labels=labels)
+ res_p = sum1 / sum2 if sum2 != 0 else 0.0 # Avoid division by zero
+ print(f"Metric Res_p -> {res_p}")

- return F_vec[i] # We return the position of the f1-score corresponding to the label
+ return res_p

- # Summary of the labels predicted
- def _summary_pred(self, y_true: ndarray, y_pred: ndarray, labels: list) -> None:
- count_mat = self._confu_mat(y_true, y_pred, labels)
- print(" ", end="")
- for i in range(len(labels)):
- print("|--", labels[i], "--", end="")
- if i + 1 == len(labels):
- print("|", end="")
- for i in range(len(labels)):
- print("")
- print("|--", labels[i], "--|", end="")
- for j in range(len(labels)):
- if j != 0:
- print(" ", end="")
- print(" ", int(count_mat[i, j]), " ", end="")
-
- def _f1_score(self, y_true: ndarray, y_pred: ndarray, labels: list) -> ndarray:
- f1_vec = np.zeros(len(labels))
-
- # Calculate confusion mat
+ def _summary_pred(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> None:
  count_mat = self._confu_mat(y_true, y_pred, labels)
+ print(" ", " | ".join(f"--{label}--" for label in labels))
+ for i, label_i in enumerate(labels):
+ row = [f" {int(count_mat[i, j])} " for j in range(len(labels))]
+ print(f"--{label_i}--|", " | ".join(row))

- # sums over columns
- sum1 = np.sum(count_mat, axis=0)
- # sums over rows
- sum2 = np.sum(count_mat, axis=1)
- # Iterate over labels to calculate f1 scores of each one
- for i in range(len(labels)):
- precision = count_mat[i, i] / (sum1[i])
- recall = count_mat[i, i] / (sum2[i])
+ def _f1_score(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> np.ndarray:
+ count_mat = self._confu_mat(y_true, y_pred, labels)
+ sum_cols = np.sum(count_mat, axis=0)
+ sum_rows = np.sum(count_mat, axis=1)

- f1_vec[i] = 2 * ((precision * recall) / (precision + recall))
+ # Avoid division by zero
+ precision = np.divide(
+ count_mat.diagonal(), sum_cols, out=np.zeros_like(sum_cols), where=sum_cols != 0
+ )
+ recall = np.divide(
+ count_mat.diagonal(), sum_rows, out=np.zeros_like(sum_rows), where=sum_rows != 0
+ )
+ f1_vec = 2 * ((precision * recall) / (precision + recall))

  return f1_vec

  # Returns confusion matrix of predictions
- def _confu_mat(self, y_true: ndarray, y_pred: ndarray, labels: list) -> ndarray:
- labels = np.array(labels)
- count_mat = np.zeros((len(labels), len(labels)))
-
- for i in range(len(labels)):
- for j in range(len(y_pred)):
- if y_pred[j] == labels[i]:
- if y_pred[j] == y_true[j]:
- count_mat[i, i] += 1
- else:
- x = np.where(labels == y_true[j])
- count_mat[i, x[0]] += 1
+ def _confu_mat(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> np.ndarray:
+ num_classes = len(labels)
+ label_mapping = {label: idx for idx, label in enumerate(labels)}
+ count_mat = np.zeros((num_classes, num_classes))
+
+ for pred_label, true_label in zip(y_pred, y_true):
+ if pred_label in label_mapping and true_label in label_mapping:
+ count_mat[label_mapping[pred_label], label_mapping[true_label]] += 1

  return count_mat

@@ -1043,10 +1050,10 @@ class OneHotEncoder:
  def __init__(self) -> None:
  pass

- def encode(self, x: ndarray | list):
+ def encode(self, x: np.ndarray | list):
  self.x = x

- if not isinstance(self.x, ndarray):
+ if not isinstance(self.x, np.ndarray):
  self.x = np.array(self.x) # If not numpy array then convert it

  y = np.zeros(
@@ -1057,8 +1064,8 @@ class OneHotEncoder:

  return y

- def decode(self, x: ndarray | list) -> ndarray:
- if not isinstance(x, ndarray):
+ def decode(self, x: np.ndarray | list) -> np.ndarray:
+ if not isinstance(x, np.ndarray):
  x = np.array(x) # If not numpy array then convert it

  # We return the max values of each row
@@ -1220,17 +1227,33 @@ class FeatureSelection:


  def check_nan_inf(df: DataFrame) -> DataFrame:
- """Checks for `NaN` and `Inf` values in the `DataFrame`. If any are found they will be removed."""
+ """
+ Checks for NaN and Inf values in the DataFrame. If any are found, they will be removed.
+
+ Parameters:
+ df (DataFrame): The input DataFrame to be checked.
+
+ Returns:
+ DataFrame: A new DataFrame with NaN and Inf values removed.
+ """
+
  nan_values = df.isnull().values.any()
- count = np.isinf(df.select_dtypes(include="number")).values.sum()
- print("There are null values : ", nan_values)
- print("It contains " + str(count) + " infinite values")
+ inf_values = np.isinf(df.select_dtypes(include="number")).values.any()
+
  if nan_values:
- warning_type = "UserWarning"
- msg = "Some rows may have been deleted due to the existence of nan values."
- print(f"{warning_type}: {msg}")
- print("Missing values correctly removed : ", "{:,}".format(df.isnull().values.sum()))
- df = df.dropna()
+ print("UserWarning: Some rows may have been deleted due to the existence of NaN values.")
+ df.dropna(inplace=True)
+
+ if inf_values:
+ print("UserWarning: Some rows may have been deleted due to the existence of Inf values.")
+ df.replace([np.inf, -np.inf], np.nan, inplace=True)
+ df.dropna(inplace=True)
+
+ nan_count = df.isnull().values.sum()
+ inf_count = np.isinf(df.select_dtypes(include="number")).values.sum()
+
+ print(f"NaN values removed: {nan_count}")
+ print(f"Infinite values removed: {inf_count}")

  return df

@@ -1244,6 +1267,7 @@ if __name__ == "__main__":
  helper = PerformanceMeasures()
  helper._summary_pred(y_true, y_pred, labels)
  print(helper._f1_score(y_true, y_pred, labels))
+ print(helper.f_mean(y_true, y_pred, labels))

  # Use DataFrameEncoder
  # Create a DataFrame
@@ -1273,6 +1297,13 @@ if __name__ == "__main__":
  # Generate data
  x = np.random.rand(3, 100)
  y = 0.1 * x[0, :] + 0.4 * x[1, :] + 0.5 * x[2, :] + 0.1
+ # Create a DataFrame
+ df = pd.DataFrame(x.T, columns=["x1", "x2", "x3"])
+ df["y"] = y
+ # Instantiate FeatureSelection
+ fs = FeatureSelection()
+ print(fs.get_digraph(df, n_importances=1))
+
  linear_model = LinearRegression()
  linear_model.fit(x, y)
  importance = linear_model.get_importances()
@@ -1303,7 +1334,7 @@ if __name__ == "__main__":
  plt.show()

  # Calculate the autocorrelation of the data
- z = autocorr(a[0, :])
+ z = AutoCorrelation(a[0, :])
  z.plot()
  # print(z())

@@ -1313,3 +1344,18 @@
  x = np.random.normal(mu, sigma, N)
  f, cdf_, ox = cdf(x, plot=True)
  invf, cdf_, ox = cdf(x, plot=True, inv=True)
+
+ encoder = OneHotEncoder()
+ encoding = encoder.encode([1, 2, 3, 4, 5])
+ assert np.array_equal(
+ encoding,
+ np.array(
+ [
+ [0, 1, 0, 0, 0, 0],
+ [0, 0, 1, 0, 0, 0],
+ [0, 0, 0, 1, 0, 0],
+ [0, 0, 0, 0, 1, 0],
+ [0, 0, 0, 0, 0, 1],
+ ]
+ ),
+ )
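Note: two of the behavioural changes in tools.py are easy to miss in the hunks above: `f_mean` and `resp` now return their metrics instead of only printing them, and `check_nan_inf` now drops rows containing Inf as well as NaN, modifying the frame in place. A small sketch under those assumptions (`check_nan_inf` is a confirmed export of `likelihood.tools`; the `PerformanceMeasures` export is assumed):

    import numpy as np
    import pandas as pd

    from likelihood.tools import PerformanceMeasures, check_nan_inf  # PerformanceMeasures export assumed

    y_true = np.array([0, 1, 1, 0, 1])
    y_pred = np.array([0, 1, 0, 0, 1])
    helper = PerformanceMeasures()
    mean_f1 = helper.f_mean(y_true, y_pred, labels=[0, 1])   # now returns the mean F-measure
    res_p = helper.resp(y_true, y_pred, labels=[0, 1])       # now returns the Res_P metric

    df = pd.DataFrame({"a": [1.0, np.nan, 3.0, np.inf], "b": [1, 2, 3, 4]})
    clean = check_nan_inf(df)   # also drops the Inf row and mutates df in place
    print(len(clean))           # 2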
{likelihood-1.2.22 → likelihood-1.2.23}/likelihood.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: likelihood
- Version: 1.2.22
+ Version: 1.2.23
  Summary: A package that performs the maximum likelihood algorithm.
  Home-page: https://github.com/jzsmoreno/likelihood/
  Author: J. A. Moreno-Guerra
likelihood-1.2.22/likelihood/models/simulation.py
@@ -1,103 +0,0 @@
- import matplotlib.pyplot as plt
- import numpy as np
- import pandas as pd
- from numpy import ndarray
- from pandas.core.frame import DataFrame
-
- from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, check_nan_inf
-
- # --------------------------------------------------------------------------------------------------------------------------------------
-
-
- class SimulationEngine(FeatureSelection):
- """
- This class implements a predictive model that utilizes multiple linear regression for numerical target variables
- and multiple logistic regression for categorical target variables.
-
- The class provides methods for training the model on a given dataset, making predictions,
- and evaluating the model's performance.
-
- Key features:
- - Supports both numerical and categorical target variables, automatically selecting the appropriate regression method.
- - Includes methods for data preprocessing, model fitting, prediction, and evaluation metrics.
- - Designed to be flexible and user-friendly, allowing for easy integration with various datasets.
-
- Usage:
- - Instantiate the class with the training data and target variable.
- - Call the fit method to train the model.
- - Use the predict method to generate predictions on new data.
- - Evaluate the model using built-in metrics for accuracy and error.
-
- This class is suitable for applications in data analysis and machine learning, enabling users to leverage regression techniques
- for both numerical and categorical outcomes efficiently.
- """
-
- def __init__(self, df: DataFrame, n_importances: int, use_scaler: bool = False, **kwargs):
-
- self.df = df
- self.n_importances = n_importances
- self.use_scaler = use_scaler
-
- super().__init__(**kwargs)
-
- def predict(self, df: DataFrame, column: str) -> ndarray | list:
- # Let us assign the dictionary entries corresponding to the column
- w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]
-
- df = df[names_cols].copy()
- # Change the scale of the dataframe
- dataset = self.df.copy()
- dataset.drop(columns=column, inplace=True)
- numeric_df = dataset.select_dtypes(include="number")
- if self.use_scaler:
- scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
- _ = scaler.rescale()
- dataset_ = df.copy()
- numeric_df = dataset_.select_dtypes(include="number")
- numeric_scaled = scaler.rescale(dataset_=numeric_df.to_numpy())
- numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
- for col in numeric_df.columns:
- df[col] = numeric_df[col].values
-
- # Encoding the datadrame
- for num, colname in enumerate(dfe._encode_columns):
- if df[colname].dtype == "object":
- encode_dict = dfe.encoding_list[num]
- df[colname] = df[colname].apply(
- dfe._code_transformation_to, dictionary_list=encode_dict
- )
-
- # PREDICTION
- y = df.to_numpy() @ w
-
- # Categorical column
- if quick_encoder != None:
-
- one_hot = OneHotEncoder()
- y = one_hot.decode(y)
- encoding_dic = quick_encoder.decoding_list[0]
- y = [encoding_dic[item] for item in y]
- # Numeric column
- else:
- if self.use_scaler:
- # scale output
- y += 1
- y /= 2
- y = y * (self.df[column].max() - self.df[column].min())
-
- return y[:]
-
- def fit(self, **kwargs) -> None:
-
- # We run the feature selection algorithm
- self.get_digraph(self.df, self.n_importances, self.use_scaler)
-
- def _clean_data(self, df: DataFrame) -> DataFrame:
-
- df.replace([np.inf, -np.inf], np.nan, inplace=True)
- df.replace(" ", np.nan, inplace=True)
- df = check_nan_inf(df)
- df = df.reset_index()
- df = df.drop(columns=["index"])
-
- return df
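Note: the deleted module above is the 1.2.22 version of SimulationEngine. Comparing it with the new file earlier in this diff, the training data moved from the constructor into `fit`, which now also builds the probability dictionary used by `get_proba` and `pred_outliers`. Roughly (illustrative calls only):

    # likelihood 1.2.22
    engine = SimulationEngine(df, n_importances=2, use_scaler=False)
    engine.fit()

    # likelihood 1.2.23
    engine = SimulationEngine(use_scaler=False)
    engine.fit(df, n_importances=2)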