likelihood 1.5.0__py3-none-any.whl → 1.5.2__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
likelihood/models/simulation.py CHANGED
@@ -1,6 +1,6 @@
  import pickle
  import warnings
- from typing import List, Tuple, Union
+ from typing import Dict, List, Tuple, Union

  import numpy as np
  import pandas as pd
@@ -106,7 +106,7 @@ class SimulationEngine(FeatureSelection):

          return y[:]

-     def _encode(self, df: DataFrame) -> np.ndarray | list:
+     def _encode(self, df: DataFrame) -> Dict[str, float]:
          df = df.copy()
          column = df.columns[0]
          frec = df[column].value_counts() / len(df)
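
The tightened return annotation reflects what `_encode` builds: a frequency encoding that maps each category to its relative frequency in the column. A minimal standalone sketch of that mapping, using a hypothetical `color` column and assuming the counts end up converted with `to_dict()` (the conversion step is not shown in the hunk):

import pandas as pd

# Hypothetical one-column frame; `frec` mirrors the line shown in the hunk above.
df = pd.DataFrame({"color": ["red", "red", "blue", "green"]})
column = df.columns[0]
frec = df[column].value_counts() / len(df)
encoding = frec.to_dict()  # assumed final dict form
print(encoding)  # {'red': 0.5, 'blue': 0.25, 'green': 0.25}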
@@ -132,9 +132,9 @@ class SimulationEngine(FeatureSelection):
          plot = kwargs.get("plot", False)
          if not x[1]:
              media = self.df[key].mean()
-             desviacion_estandar = self.df[key].std()
-             cota_inferior = media - 1.5 * desviacion_estandar
-             cota_superior = media + 1.5 * desviacion_estandar
+             standard_deviation = self.df[key].std()
+             lower_limit = media - 1.5 * standard_deviation
+             upper_limit = media + 1.5 * standard_deviation
              if plot:
                  print(f"Cumulative Distribution Function ({key})")
              f, cdf_, ox = cdf(x[0].flatten(), poly=poly, plot=plot)
@@ -143,14 +143,14 @@ class SimulationEngine(FeatureSelection):
              least_frequent_category, most_frequent_category = categories_by_quartile(
                  self.df[[key]], key
              )
-             cota_inferior = x[1].get(least_frequent_category, 0)
-             cota_superior = x[1].get(most_frequent_category, 0)
+             lower_limit = x[1].get(least_frequent_category, 0)
+             upper_limit = x[1].get(most_frequent_category, 0)
          self.proba_dict[key] = (
              f if f else None,
              x[1],
              (np.mean(np.abs(np.diff(ox))) / 2.0 if isinstance(ox, np.ndarray) else None),
-             f(cota_inferior) if f else cota_inferior,
-             f(cota_superior) if f else cota_superior,
+             f(lower_limit) if f else lower_limit,
+             f(upper_limit) if f else upper_limit,
          )

      def get_proba(self, value: Union[Union[float, int], str] | list, colname: str) -> List[float]:
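
These two hunks are pure Spanish-to-English renames (`desviacion_estandar` → `standard_deviation`, `cota_inferior`/`cota_superior` → `lower_limit`/`upper_limit`; `media`, the mean, keeps its original name). The logic is untouched: numeric columns get bounds at the mean ± 1.5 standard deviations, categorical columns at the relative frequencies of their least and most frequent categories. A standalone sketch of the numeric branch, with made-up data:

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
media = s.mean()  # identifier kept from the source
standard_deviation = s.std()
lower_limit = media - 1.5 * standard_deviation
upper_limit = media + 1.5 * standard_deviation
print(lower_limit, upper_limit)  # ~0.63, ~5.37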
likelihood/tools/impute.py ADDED
@@ -0,0 +1,279 @@
+ import pickle
+ import warnings
+ from typing import Union
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import seaborn as sns
+
+ from likelihood.models import SimulationEngine
+ from likelihood.tools.numeric_tools import find_multiples
+
+ warnings.simplefilter(action="ignore", category=FutureWarning)
+
+
+ class SimpleImputer:
+     """Multiple imputation using simulation engine."""
+
+     def __init__(self, n_features: int | None = None, use_scaler: bool = False):
+         """
+         Initialize the imputer.
+
+         Parameters
+         ----------
+         n_features: int | None
+             Number of features to be used in the imputer. Default is None.
+         use_scaler: bool
+             Whether to use a scaler. Default is False.
+         """
+         self.n_features = n_features
+         self.sim = SimulationEngine(use_scaler=use_scaler)
+         self.params = {}
+         self.cols_transf = pd.Series([])
+
+     def fit(self, X: pd.DataFrame) -> None:
+         """
+         Fit the imputer to the data.
+
+         Parameters
+         ----------
+         X: pd.DataFrame
+             Dataframe to fit the imputer to.
+         """
+         X_impute = X.copy()
+         self.params = self._get_dict_params(X_impute)
+         X_impute = self.sim._clean_data(X_impute)
+
+         if X_impute.empty:
+             raise ValueError(
+                 "The dataframe is empty after cleaning, it is not possible to train the imputer."
+             )
+         self.n_features = self.n_features or X_impute.shape[1] - 1
+         self.sim.fit(X_impute, self.n_features)
+
+     def transform(
+         self, X: pd.DataFrame, boundary: bool = True, inplace: bool = True
+     ) -> pd.DataFrame:
+         """
+         Impute missing values in the data.
+
+         Parameters
+         -----------
+         X: pd.DataFrame
+             Dataframe to impute missing values.
+         boundary: bool
+             Whether to use the boundaries of the data to impute missing values. Default is True.
+         inplace: bool
+             Whether to modify the columns of the original dataframe or return new ones. Default is True.
+         """
+         X_impute = X.copy()
+         self.cols_transf = X_impute.columns
+         for column in X_impute.columns:
+             if X_impute[column].isnull().sum() > 0:
+
+                 if not X_impute[column].dtype == "object":
+                     min_value = self.params[column]["min"]
+                     max_value = self.params[column]["max"]
+                     to_compare = self.params[column]["to_compare"]
+                 for row in X_impute.index:
+                     if pd.isnull(X_impute.loc[row, column]):
+                         value_impute = self._check_dtype_convert(
+                             self.sim.predict(
+                                 self._set_zero(X_impute.loc[row, :], column),
+                                 column,
+                             )[0],
+                             to_compare,
+                         )
+                         if not X_impute[column].dtype == "object" and boundary:
+                             if value_impute < min_value:
+                                 value_impute = min_value
+                             if value_impute > max_value:
+                                 value_impute = max_value
+                         X_impute.loc[row, column] = value_impute
+             else:
+                 self.cols_transf = self.cols_transf.drop(column)
+         if not inplace:
+             X_impute = X_impute[self.cols_transf].copy()
+             X_impute = X_impute.rename(
+                 columns={column: column + "_imputed" for column in self.cols_transf}
+             )
+             X_impute = X.join(X_impute, rsuffix="_imputed")
+             order_cols = []
+             for column in X.columns:
+                 if column + "_imputed" in X_impute.columns:
+                     order_cols.append(column)
+                     order_cols.append(column + "_imputed")
+                 else:
+                     order_cols.append(column)
+             X_impute = X_impute[order_cols]
+         return X_impute
+
+     def fit_transform(
+         self, X: pd.DataFrame, boundary: bool = True, inplace: bool = True
+     ) -> pd.DataFrame:
+         """
+         Fit and transform the data.
+
+         Parameters
+         -----------
+         X: pd.DataFrame
+             Dataframe to fit and transform.
+         boundary: bool
+             Whether to use the boundaries of the data to impute missing values. Default is True.
+         inplace: bool
+             Whether to modify the columns of the original dataframe or return new ones. Default is True.
+         """
+         X_train = X.copy()
+         self.fit(X_train)
+         return self.transform(X, boundary, inplace)
+
+     def _set_zero(self, X: pd.Series, column_exception) -> pd.DataFrame:
+         """
+         Set missing values to zero, except for `column_exception`.
+
+         Parameters
+         -----------
+         X: pd.Series
+             Series to set missing values to zero.
+         """
+         X = X.copy()
+         for column in X.index:
+             if pd.isnull(X[column]) and column != column_exception:
+                 X[column] = 0
+         data = X.to_frame().T
+         return data
+
+     def _check_dtype_convert(self, value: Union[int, float], to_compare: Union[int, float]) -> Union[int, float]:
+         """
+         Cast or round `value` so that its type matches that of `to_compare`.
+
+         Parameters
+         -----------
+         value: Union[int, float]
+             Value to check and convert.
+         to_compare: Union[int, float]
+             Value to compare to.
+         """
+         if isinstance(to_compare, int) and isinstance(value, float):
+             value = int(round(value, 0))
+
+         if isinstance(to_compare, float) and isinstance(value, float):
+             value = round(value, len(str(to_compare).split(".")[1]))
+         return value
+
+     def _get_dict_params(self, df: pd.DataFrame) -> dict:
+         """
+         Get the parameters for the imputer.
+
+         Parameters
+         -----------
+         df: pd.DataFrame
+             Dataframe to get the parameters from.
+         """
+         params = {}
+         for column in df.columns:
+             if df[column].isnull().sum() > 0:
+                 if not df[column].dtype == "object":
+                     to_compare = df[column].dropna().sample().values[0]
+                     params[column] = {
+                         "min": df[column].min(),
+                         "to_compare": to_compare,
+                         "max": df[column].max(),
+                     }
+         return params
+
+     def eval(self, X: pd.DataFrame) -> None:
+         """
+         Create a histogram of the imputed values.
+
+         Parameters
+         -----------
+         X: pd.DataFrame
+             Dataframe to create the histogram from.
+         """
+
+         if not isinstance(X, pd.DataFrame):
+             raise ValueError("Input X must be a pandas DataFrame.")
+
+         df = X.copy()
+
+         imputed_cols = [col for col in df.columns if col.endswith("_imputed")]
+         num_impute = len(imputed_cols)
+
+         if num_impute == 0:
+             print("No imputed columns found in the DataFrame.")
+             return
+
+         try:
+             ncols, nrows = find_multiples(num_impute)
+         except ValueError as e:
+             print(f"Error finding multiples for {num_impute}: {e}")
+             ncols = 1
+             nrows = num_impute
+
+         _, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12, 5 * nrows))
+         axes = axes.flatten() if isinstance(axes, np.ndarray) else [axes]
+
+         for i, col in enumerate(imputed_cols):
+             original_col = col.replace("_imputed", "")
+
+             if original_col in df.columns:
+                 original_col_data = df[original_col].dropna()
+                 ax = axes[i]
+
+                 # Plot the original data
+                 sns.histplot(
+                     original_col_data,
+                     kde=True,
+                     color="blue",
+                     label="Original",
+                     bins=10,
+                     ax=ax,
+                 )
+
+                 # Plot the imputed data
+                 sns.histplot(
+                     df[col],
+                     kde=True,
+                     color="red",
+                     label="Imputed",
+                     bins=10,
+                     ax=ax,
+                 )
+
+                 ax.set_xlabel(original_col)
+                 ax.set_ylabel("Frequency" if i % ncols == 0 else "")
+                 ax.legend(loc="upper right")
+
+         plt.suptitle("Histogram Comparison", fontsize=16, fontweight="bold")
+         plt.tight_layout()
+         plt.subplots_adjust(top=0.9)
+         plt.show()
+
+     def save(self, filename: str = "./imputer") -> None:
+         """
+         Save the state of the SimpleImputer to a file.
+
+         Parameters
+         -----------
+         filename: str
+             Name of the file to save the imputer to. Default is "./imputer".
+         """
+         filename = filename if filename.endswith(".pkl") else filename + ".pkl"
+         with open(filename, "wb") as f:
+             pickle.dump(self, f)
+
+     @staticmethod
+     def load(filename: str = "./imputer"):
+         """
+         Load the state of a SimpleImputer from a file.
+
+         Parameters
+         -----------
+         filename: str
+             Name of the file to load the imputer from. Default is "./imputer".
+         """
+         filename = filename + ".pkl" if not filename.endswith(".pkl") else filename
+         with open(filename, "rb") as f:
+             return pickle.load(f)
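
A minimal usage sketch for the new class. It assumes only what the file itself shows: `SimpleImputer` lives at the module path listed in RECORD, and the wrapped `SimulationEngine` can be fitted on a frame this small. Column names and values are hypothetical:

import numpy as np
import pandas as pd

from likelihood.tools.impute import SimpleImputer

# Hypothetical toy frame with one missing value per column.
df = pd.DataFrame(
    {
        "a": [1.0, 2.0, np.nan, 4.0, 5.0, 6.0],
        "b": [10.0, 20.0, 30.0, np.nan, 50.0, 60.0],
    }
)

imputer = SimpleImputer()
df_imputed = imputer.fit_transform(df, boundary=True, inplace=False)

imputer.eval(df_imputed)   # side-by-side histograms, original vs. imputed
imputer.save("./imputer")  # writes ./imputer.pkl
restored = SimpleImputer.load("./imputer")

With `inplace=False`, the original columns are preserved and imputed copies are appended with an `_imputed` suffix, which is also the marker `eval` keys on when selecting columns to plot.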
likelihood/tools/numeric_tools.py CHANGED
@@ -345,6 +345,27 @@ def gauss_elimination(A: ndarray | list, pr: int = 2) -> ndarray:
      return X


+ def find_multiples(target: int) -> tuple[int, int] | None:
+     """Find two factors of a given target number.
+
+     Parameters
+     ----------
+     target : int
+         The target number to find factors for.
+
+     Returns
+     -------
+     tuple[int, int] | None
+         A tuple containing two factors of the target number.
+         Returns None if no factors are found.
+     """
+     for i in range(2, target + 1):
+         if target % i == 0:
+             factor = target // i
+             return i, factor
+     return None
+
+
  # Example usage:
  if __name__ == "__main__":
      import pandas as pd
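
One behavioral note on `find_multiples`: the docstring's "Returns None if no factors are found" only applies to `target < 2`. For any `target >= 2` the loop eventually reaches `i == target`, so a prime `p` comes back as `(p, 1)`. (`SimpleImputer.eval` unpacks the result inside a `try/except ValueError`, so the `None` case for a single imputed column would actually surface as a `TypeError`.) A quick check, with the body copied from the hunk above:

def find_multiples(target: int) -> tuple[int, int] | None:
    # Smallest factor >= 2, paired with its cofactor.
    for i in range(2, target + 1):
        if target % i == 0:
            return i, target // i
    return None

print(find_multiples(12))  # (2, 6)
print(find_multiples(7))   # (7, 1) -- primes fall through to i == target
print(find_multiples(1))   # None  -- the only way to get None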
likelihood/tools/tools.py CHANGED
@@ -1167,7 +1167,7 @@ class FeatureSelection:
          self.X = self.X.drop(columns=["index"])


- def check_nan_inf(df: DataFrame) -> DataFrame:
+ def check_nan_inf(df: DataFrame, verbose: bool = False) -> DataFrame:
      """
      Checks for NaN and Inf values in the DataFrame. If any are found, they will be removed.

@@ -1185,20 +1185,32 @@ def check_nan_inf(df: DataFrame) -> DataFrame:
      nan_values = df.isnull().values.any()
      inf_values = np.isinf(df.select_dtypes(include="number")).values.any()

+     nan_count = df.isnull().values.sum()
+     inf_count = np.isinf(df.select_dtypes(include="number")).values.sum()
+
      if nan_values:
-         print("UserWarning: Some rows may have been deleted due to the existence of NaN values.")
+         (
+             print(
+                 "UserWarning: Some rows may have been deleted due to the existence of NaN values."
+             )
+             if verbose
+             else None
+         )
          df.dropna(inplace=True)

      if inf_values:
-         print("UserWarning: Some rows may have been deleted due to the existence of Inf values.")
+         (
+             print(
+                 "UserWarning: Some rows may have been deleted due to the existence of Inf values."
+             )
+             if verbose
+             else None
+         )
          df.replace([np.inf, -np.inf], np.nan, inplace=True)
          df.dropna(inplace=True)

-     nan_count = df.isnull().values.sum()
-     inf_count = np.isinf(df.select_dtypes(include="number")).values.sum()
-
-     print(f"NaN values removed: {nan_count}")
-     print(f"Infinite values removed: {inf_count}")
+     print(f"NaN values removed: ", "{:,}".format(nan_count))
+     print(f"Infinite values removed: ", "{:,}".format(inf_count))

      return df

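Two things change here: the warnings now print only when the new `verbose` flag is set, and the NaN/Inf counts are computed before rows are dropped, so they report what was actually removed (previously they were taken after cleanup and were therefore always zero). A minimal call, assuming `check_nan_inf` is imported from its defining module:

import numpy as np
import pandas as pd

from likelihood.tools.tools import check_nan_inf

df = pd.DataFrame({"x": [1.0, np.nan, np.inf, 4.0]})
clean = check_nan_inf(df, verbose=True)  # verbose=True also prints the UserWarning lines
print(len(clean))  # 2 -- the NaN row and the Inf row are gone
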
likelihood-1.5.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: likelihood
- Version: 1.5.0
+ Version: 1.5.2
  Summary: A package that performs the maximum likelihood algorithm.
  Home-page: https://github.com/jzsmoreno/likelihood/
  Author: J. A. Moreno-Guerra
@@ -24,6 +24,7 @@ Requires-Dist: numpy<2.0.0
  Requires-Dist: pydot==2.0.0
  Requires-Dist: matplotlib
  Requires-Dist: graphviz
+ Requires-Dist: seaborn
  Requires-Dist: pyyaml
  Requires-Dist: pandas
  Requires-Dist: corner
@@ -39,6 +40,7 @@ Dynamic: classifier
  Dynamic: description
  Dynamic: description-content-type
  Dynamic: home-page
+ Dynamic: license-file
  Dynamic: maintainer
  Dynamic: maintainer-email
  Dynamic: provides-extra
likelihood-1.5.2.dist-info/RECORD CHANGED
@@ -6,17 +6,18 @@ likelihood/graph/nn.py,sha256=EaMmboKriCFnkP48_HLGRAsOZSWxwUlMG0WDGZ4ey1o,11035
  likelihood/models/__init__.py,sha256=e6nB4w47w0Q9DrAFeP3OcUgcoHOtf7Il4mBhgf4AARg,52
  likelihood/models/hmm.py,sha256=0s0gFySH1u4NjRaZDxiZ8oeTaFhFrw1x0GJxwy3dFrA,6253
  likelihood/models/regression.py,sha256=9cakyGlJCEO6WfpoKLh3GxdXQeQp7cUvJIkQ5odT0TA,9404
- likelihood/models/simulation.py,sha256=LFyE_szo7sDukviMLeg_6RoyAaI7yMXUy8f4mDOrGoc,8460
+ likelihood/models/simulation.py,sha256=IkYGA6-L1LvSnIlyrVWTzQQu-JnfXml5Tewt-GC05PY,8446
  likelihood/models/utils.py,sha256=dvigPi_hxcs5ntfHr7Y1JvP5ULtMW3kkN0nJpS4orE8,1319
  likelihood/models/deep/__init__.py,sha256=-KIPippVaMqgG8mEgYjNxYQdqOUcFhUuKhbVe8TTCfo,28
  likelihood/models/deep/autoencoders.py,sha256=0EIZwDNlZ9NCfQbhQ_KdXkkRwIjUEU-jk0l0u-J1wmA,44212
  likelihood/tools/__init__.py,sha256=N1IhMDzacsGQT2MIYBMBC0zTxes78vC_0gGrwkuPgmg,78
  likelihood/tools/figures.py,sha256=waF0NHIMrctCmaLhcuz5DMcXyRKynmn6aG0XITYCTLc,10940
+ likelihood/tools/impute.py,sha256=BwBVFSQkG3uWsZEk1THTmqZc3YhHlDhMXgKIV3sx5Lg,9486
  likelihood/tools/models_tools.py,sha256=c3-vac-1MYSarYDtfR6XfVC7X_WY9auS7y2_3Z973IQ,8875
- likelihood/tools/numeric_tools.py,sha256=FA44kbiAcxcquz1el_g3Pqsp5ii8XFkAIrsMs5bGkj0,11445
- likelihood/tools/tools.py,sha256=SePaBg-gP29rt5SR2xhqNNQLu7_m0Wner5y_XzdSdpc,42031
- likelihood-1.5.0.dist-info/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
- likelihood-1.5.0.dist-info/METADATA,sha256=zTpqZ3w7y_vWY2dqQH7JSfROIkC8dbRcLn2LSCAQGc4,2822
- likelihood-1.5.0.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
- likelihood-1.5.0.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
- likelihood-1.5.0.dist-info/RECORD,,
+ likelihood/tools/numeric_tools.py,sha256=OelCF45QO-zhanX3GmfcdYMfUZxYt353oJ8_gPEdWss,11959
+ likelihood/tools/tools.py,sha256=vlQ-peK_z5-MLVnStxlBdl-NfmF6ILxZ6LhBd4K77JI,42282
+ likelihood-1.5.2.dist-info/licenses/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
+ likelihood-1.5.2.dist-info/METADATA,sha256=ioc6f7SQTASnslCzc4N-dJ4xvnGZTn3llC0Q0OX7nP8,2867
+ likelihood-1.5.2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ likelihood-1.5.2.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
+ likelihood-1.5.2.dist-info/RECORD,,
likelihood-1.5.2.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (76.0.0)
+ Generator: setuptools (78.1.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

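The remaining metadata churn is mechanical and appears to follow from the newer setuptools generator recorded in WHEEL (78.1.0, up from 76.0.0): it emits Metadata-Version 2.4, declares `license-file` as a Dynamic metadata field, and relocates LICENSE under the dist-info `licenses/` directory, which is why those RECORD paths changed alongside the version bump.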