likelihood 2.2.0.dev1__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,212 @@
1
+ import logging
2
+ import os
3
+ from typing import List
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
9
+ logging.getLogger("tensorflow").setLevel(logging.ERROR)
10
+ import tensorflow as tf
11
+ from sklearn.preprocessing import LabelEncoder
12
+
13
+ tf.get_logger().setLevel("ERROR")
14
+
15
+
16
class CategoricalEmbedder:
    """Replace categorical DataFrame columns with dense embedding vectors.

    Each categorical column is label-encoded with a scikit-learn
    ``LabelEncoder`` and mapped through a randomly initialised embedding
    matrix (one row per category, ``embedding_dim`` columns).  The matrices
    are held as ``tf.Variable`` objects and can be saved to / loaded from
    ``.npy`` files.  NOTE(review): the embeddings are random-initialised and
    never trained here — presumably training happens externally.
    """

    def __init__(self, embedding_dim=32):
        # Width of every per-column embedding vector.
        self.embedding_dim = embedding_dim
        # column name -> fitted LabelEncoder
        self.label_encoders = {}
        # column name -> tf.Variable of shape (vocab_size, embedding_dim)
        self.embeddings = {}

    def fit(self, df: pd.DataFrame, categorical_cols: List):
        """
        Fit the embeddings on the given data.

        Parameters
        ----------
        df : `pd.DataFrame`
            Pandas DataFrame containing the tabular data.
        categorical_cols : `List`
            List of column names representing categorical features.

        Returns
        -------
        `None`
        """
        frame = df.copy()

        # Validate every requested column up front, before mutating state.
        missing = [c for c in categorical_cols if c not in frame.columns]
        if missing:
            raise ValueError(f"Column {missing[0]} not found in DataFrame")

        for col in categorical_cols:
            # Impute missing entries with the column mode (if one exists).
            modes = frame[col].mode()
            if not modes.empty:
                frame[col] = frame[col].fillna(modes[0])

            encoder = LabelEncoder()
            frame[col] = encoder.fit_transform(frame[col])
            self.label_encoders[col] = encoder

            # Random (untrained) initialisation: one row per category.
            matrix = np.random.rand(len(encoder.classes_), self.embedding_dim)
            self.embeddings[col] = tf.Variable(matrix, dtype=tf.float32)

    def transform(self, df: pd.DataFrame, categorical_cols: List[str]):
        """
        Transform the data using the fitted embeddings.

        Parameters
        ----------
        df : `pd.DataFrame`
            Pandas DataFrame containing the tabular data.
        categorical_cols : `List[str]`
            List of column names representing categorical features.

        Returns
        -------
        Transformed Pandas DataFrame with original columns except
        `categorical_cols` replaced by their embedding representations
        (one ``{col}_embed_{i}`` column per embedding dimension).
        """
        frame = df.copy()

        for col in categorical_cols:
            if col not in self.label_encoders:
                raise ValueError(
                    f"Column {col} has not been fitted. Please call fit() on this column first."
                )
            # NOTE(review): imputes with the mode of *this* frame, which may
            # differ from the mode seen during fit().
            modes = frame[col].mode()
            if not modes.empty:
                frame[col] = frame[col].fillna(modes[0])
            # Raises if the column contains categories unseen during fit().
            frame[col] = self.label_encoders[col].transform(frame[col])

        for col in categorical_cols:
            ids = tf.constant(frame[col], dtype=tf.int32)
            vectors = tf.nn.embedding_lookup(params=self.embeddings[col], ids=ids)
            # Defensive: promote a 1-D lookup result to 2-D.
            if len(vectors.shape) == 1:
                vectors = tf.expand_dims(vectors, axis=0)

            for dim in range(self.embedding_dim):
                frame[f"{col}_embed_{dim}"] = vectors[:, dim]
            frame.drop(columns=[col], inplace=True)

        return frame

    def inverse_transform(self, df: pd.DataFrame, categorical_cols: List[str]):
        """
        Inverse transform the data using the fitted embeddings.

        Parameters
        ----------
        df : `pd.DataFrame`
            Pandas DataFrame containing the tabular data with embedded
            representations.
        categorical_cols : `List[str]`
            List of column names representing categorical features.

        Returns
        -------
        Transformed Pandas DataFrame with original columns replaced by their
        categorical labels (nearest embedding row by Euclidean distance).
        """
        frame = df.copy()

        for col in categorical_cols:
            if col not in self.label_encoders:
                raise ValueError(
                    f"Column {col} has not been fitted. Please call fit() on this column first."
                )

            matrix = self.embeddings[col].numpy()
            encoder = self.label_encoders[col]

            embed_cols = [f"{col}_embed_{i}" for i in range(self.embedding_dim)]
            vectors = frame[embed_cols].values

            # (n_rows, vocab) distance matrix; pick the nearest vocabulary row.
            distances = np.linalg.norm(matrix - vectors[:, np.newaxis], axis=2)
            nearest = np.argmin(distances, axis=1)

            frame[col] = encoder.inverse_transform(nearest)
            frame.drop(columns=embed_cols, inplace=True)

        return frame

    def save_embeddings(self, path: str):
        """
        Save the embeddings to a directory.

        Parameters
        ----------
        path : `str`
            Path to the directory where embeddings will be saved.
        """
        os.makedirs(path, exist_ok=True)
        for col, variable in self.embeddings.items():
            np.save(os.path.join(path, f"{col}_embedding.npy"), variable.numpy())

    def load_embeddings(self, path: str):
        """
        Load the embeddings from a directory.

        Only columns already present in ``self.label_encoders`` are loaded;
        the label encoders themselves are not persisted by save_embeddings().

        Parameters
        ----------
        path : `str`
            Path to the directory where embeddings are saved.
        """
        for col in self.label_encoders.keys():
            embedding_path = os.path.join(path, f"{col}_embedding.npy")
            if not os.path.exists(embedding_path):
                raise FileNotFoundError(f"Embedding file {embedding_path} not found.")
            self.embeddings[col] = tf.Variable(np.load(embedding_path), dtype=tf.float32)
168
+
169
+
170
if __name__ == "__main__":
    # Small demo dataset with a missing value in each categorical column.
    frame = pd.DataFrame(
        {
            "color": ["red", "blue", None, "green", "blue"],
            "size": ["S", "M", "XL", "XS", None],
            "price": [10.99, 25.50, 30.00, 8.75, 12.25],
        }
    )

    # Fit a 3-dimensional embedding for both categorical columns.
    embedder = CategoricalEmbedder(embedding_dim=3)
    embedder.fit(frame, categorical_cols=["color", "size"])

    embedded = embedder.transform(frame, categorical_cols=["color", "size"])
    print("Processed DataFrame:")
    print(embedded.head())

    # Round-trip the embedding matrices through disk.
    embedder.save_embeddings("./embeddings")

    restored = CategoricalEmbedder(embedding_dim=3)
    # Label encoders are not persisted, so reuse the fitted ones directly.
    restored.label_encoders = embedder.label_encoders
    restored.load_embeddings("./embeddings")

    embedded_again = restored.transform(frame, categorical_cols=["color", "size"])
    print("\nProcessed DataFrame with Loaded Embeddings:")
    print(embedded_again.head())

    # Map the embedding columns back to their categorical labels.
    recovered = restored.inverse_transform(
        embedded_again, categorical_cols=["color", "size"]
    )
    print("\nOriginal DataFrame:")
    print(frame.head())
    print("\nProcessed DataFrame with Inverse Transform:")
    print(recovered.head())
@@ -0,0 +1,348 @@
1
+ import os
2
+ import warnings
3
+ from typing import Optional
4
+
5
+ import matplotlib.pyplot as plt
6
+ import numpy as np
7
+ from matplotlib.ticker import AutoMinorLocator
8
+ from scipy import stats
9
+
10
+ plt.rcParams.update({"font.size": 14})
11
+
12
+
13
def act_pred(
    y_act: np.ndarray,
    y_pred: np.ndarray,
    name: str = "example",
    x_hist: bool = True,
    y_hist: bool = True,
    reg_line: bool = True,
    save_dir: Optional[str] = None,
) -> None:
    """
    Creates a scatter plot of actual vs predicted values along with optional
    marginal histograms and a regression line.

    Parameters
    ----------
    y_act : `np.ndarray`
        The actual values (ground truth); flattened to 1D internally.
    y_pred : `np.ndarray`
        The predicted values; flattened to 1D internally.
    name : `str`, optional
        The name for saving the plot. Default is "example".
    x_hist : `bool`, optional
        Whether to display the histogram for the actual values (y_act). Default is True.
    y_hist : `bool`, optional
        Whether to display the histogram for the predicted values (y_pred). Default is True.
    reg_line : `bool`, optional
        Whether to plot a regression line (best-fit line) in the scatter plot. Default is True.
    save_dir : `Optional[str]`, optional
        The directory to save the figure. If None, the figure will not be saved. Default is None.

    Returns
    -------
    `None` : The function doesn't return anything. It generates and optionally saves a plot.

    Raises
    ------
    ValueError
        If either input is not a numpy array, or the shapes differ.
    """
    # Validate types BEFORE touching the arrays: previously a list input
    # crashed with AttributeError on .flatten() instead of this ValueError.
    if not isinstance(y_act, np.ndarray) or not isinstance(y_pred, np.ndarray):
        raise ValueError("y_act and y_pred must be numpy arrays.")

    y_pred, y_act = y_pred.flatten(), y_act.flatten()

    if y_act.shape != y_pred.shape:
        raise ValueError("y_act and y_pred must have the same shape.")

    mec = "#2F4F4F"
    mfc = "#C0C0C0"

    fig = plt.figure(figsize=(6, 6))

    # Manually placed axes: main scatter plus two marginal histogram strips.
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left + width
    left_h = left + width + 0.05

    ax2 = fig.add_axes([left, bottom, width, height])
    ax2.tick_params(direction="in", length=7, top=True, right=True)
    ax2.xaxis.set_minor_locator(AutoMinorLocator(2))
    ax2.yaxis.set_minor_locator(AutoMinorLocator(2))

    ax2.scatter(y_act, y_pred, color=mfc, edgecolor=mec, alpha=0.5, s=35, lw=1.2)
    # y = x reference line for a perfect predictor.
    ax2.plot(
        [y_act.min(), y_act.max()], [y_act.min(), y_act.max()], "k--", alpha=0.8, label="Ideal"
    )

    ax2.set_xlabel("Actual value")
    ax2.set_ylabel("Predicted value")
    ax2.set_xlim([y_act.min() * 1.05, y_act.max() * 1.05])
    ax2.set_ylim([y_act.min() * 1.05, y_act.max() * 1.05])

    # Marginal histograms are only drawn when requested. Previously both
    # were drawn unconditionally and the x_hist/y_hist flags had no effect
    # (the set_alpha() calls changed nothing visible).
    if x_hist:
        ax1 = fig.add_axes([left, bottom_h, width, 0.15])
        ax1.hist(y_act, bins=31, density=True, color=mfc, edgecolor=mec, alpha=0.6)
        ax1.set_xticks([])
        ax1.set_yticks([])
        ax1.set_xlim(ax2.get_xlim())

    if y_hist:
        ax3 = fig.add_axes([left_h, bottom, 0.15, height])
        ax3.hist(
            y_pred,
            bins=31,
            density=True,
            color=mfc,
            edgecolor=mec,
            orientation="horizontal",
            alpha=0.6,
        )
        ax3.set_xticks([])
        ax3.set_yticks([])
        ax3.set_ylim(ax2.get_ylim())

    if reg_line:
        # Least-squares linear fit drawn over the sorted unique x values.
        polyfit = np.polyfit(y_act, y_pred, deg=1)
        reg_line_vals = np.poly1d(polyfit)(np.unique(y_act))
        ax2.plot(np.unique(y_act), reg_line_vals, "r-", label="Regression Line", alpha=0.8)

    ax2.legend(loc="upper left", framealpha=0.35, handlelength=1.5)

    # tight_layout() is intentionally not called: it is incompatible with
    # manually placed axes and only emitted a warning.

    if save_dir is not None:
        os.makedirs(save_dir, exist_ok=True)
        fig_name = os.path.join(save_dir, f"{name}_act_pred.png")
        plt.savefig(fig_name, bbox_inches="tight", dpi=300)

    plt.show()
    plt.close(fig)
115
+
116
+
117
def residual(
    y_act: np.ndarray, y_pred: np.ndarray, name: str = "example", save_dir: Optional[str] = None
) -> None:
    """
    Plots the residual errors between the actual and predicted values.

    This function generates a residual plot by calculating the difference between the
    actual values (y_act) and predicted values (y_pred). The plot shows the residuals
    (y_pred - y_act) against the actual values. Optionally, the plot can be saved to a file.

    Parameters
    ----------
    y_act : `np.ndarray`
        The actual values, typically the ground truth values.
    y_pred : `np.ndarray`
        The predicted values that are compared against the actual values.
    name : `str`, optional
        The name of the plot file (without extension) used when saving the plot. Default is "example".
    save_dir : `Optional[str]`, optional
        The directory where the plot will be saved. If None, the plot is not saved. Default is None.

    Returns
    -------
    `None` : This function does not return any value. It generates and optionally saves a plot.

    Notes
    -----
    - The plot is shown with the residuals (y_pred - y_act) on the y-axis and the actual
      values (y_act) on the x-axis, plus a horizontal zero line marking perfect predictions.
    - The plot will be saved as a PNG image if a valid `save_dir` is provided.
    - The axis limits (min * 0.9, max / 0.9) assume positive data; for negative
      values the padding direction inverts — kept as-is for compatibility.
    """
    mec = "#2F4F4F"
    mfc = "#C0C0C0"

    # Accept array-likes (lists, Series) by coercing to ndarray up front.
    y_act = np.array(y_act)
    y_pred = np.array(y_pred)

    xmin = np.min([y_act]) * 0.9
    xmax = np.max([y_act]) / 0.9
    y_err = y_pred - y_act
    ymin = np.min([y_err]) * 0.9
    ymax = np.max([y_err]) / 0.9

    fig, ax = plt.subplots(figsize=(4, 4))

    ax.plot(y_act, y_err, "o", mec=mec, mfc=mfc, alpha=0.5, label=None, mew=1.2, ms=5.2)
    # Zero-residual reference line.
    ax.plot([xmin, xmax], [0, 0], "k--", alpha=0.8, label="ideal")

    ax.set_ylabel("Residual error")
    ax.set_xlabel("Actual value")
    ax.legend(loc="lower right")

    minor_locator_x = AutoMinorLocator(2)
    minor_locator_y = AutoMinorLocator(2)
    ax.get_xaxis().set_minor_locator(minor_locator_x)
    ax.get_yaxis().set_minor_locator(minor_locator_y)

    ax.tick_params(right=True, top=True, direction="in", length=7)
    ax.tick_params(which="minor", right=True, top=True, direction="in", length=4)

    ax.set_xlim(xmin, xmax)
    ax.set_ylim(ymin, ymax)

    if save_dir is not None:
        fig_name = f"{save_dir}/{name}_residual.png"
        os.makedirs(save_dir, exist_ok=True)
        plt.savefig(fig_name, bbox_inches="tight", dpi=300)

    # Non-blocking display so batch runs are not interrupted.
    plt.draw()
    plt.pause(0.001)

    # Close this figure explicitly rather than whatever is "current".
    plt.close(fig)
191
+
192
+
193
def residual_hist(
    y_act: np.ndarray, y_pred: np.ndarray, name: str = "example", save_dir: Optional[str] = None
) -> None:
    """
    Generates a residual error histogram with kernel density estimate (KDE) for the given true and predicted values.
    Optionally saves the plot to a specified directory.

    Parameters
    ----------
    y_act : `np.ndarray`
        Array of true (actual) values.
    y_pred : `np.ndarray`
        Array of predicted values.
    name : `str`, optional, default="example"
        The name used for the saved plot filename.
    save_dir : `Optional[str]`, optional, default=None
        Directory path to save the generated plot. If None, the plot is not saved.

    Returns
    --------
    `None` : This function generates and optionally saves a plot but does not return any value.

    Raises
    -------
    `UserWarning` : If the KDE cannot be computed (singular data covariance),
        suggesting dimensionality reduction. The histogram is still drawn.
    """
    mec = "#2F4F4F"
    mfc = "#C0C0C0"
    y_pred, y_act = y_pred.flatten(), y_act.flatten()

    fig, ax = plt.subplots(figsize=(4, 4))
    y_err = y_pred - y_act
    x_range = np.linspace(min(y_err), max(y_err), 1000)

    # The KDE fails with LinAlgError on degenerate (e.g. constant) residuals;
    # in that case we warn and fall back to a plain histogram.
    kde_plotted = False
    try:
        kde_act = stats.gaussian_kde(y_err)
        ax.plot(x_range, kde_act(x_range), "-", lw=1.2, color="k", label="kde")
        kde_plotted = True
    except np.linalg.LinAlgError:
        warnings.warn(
            "The data has very high correlation among variables. Consider dimensionality reduction.",
            UserWarning,
        )

    ax.hist(y_err, color=mfc, bins=35, alpha=1, edgecolor=mec, density=True)

    ax.set_xlabel("Residual error")
    ax.set_ylabel("Relative frequency")

    # Only show a legend when the labelled KDE line was actually drawn;
    # otherwise matplotlib warns about a legend with no artists.
    if kde_plotted:
        plt.legend(loc=2, framealpha=0.35, handlelength=1.5)

    ax.tick_params(direction="in", length=7, top=True, right=True)

    minor_locator_x = AutoMinorLocator(2)
    minor_locator_y = AutoMinorLocator(2)
    ax.get_xaxis().set_minor_locator(minor_locator_x)
    ax.get_yaxis().set_minor_locator(minor_locator_y)
    plt.tick_params(which="minor", direction="in", length=4, right=True, top=True)

    if save_dir is not None:
        fig_name = f"{save_dir}/{name}_residual_hist.png"
        os.makedirs(save_dir, exist_ok=True)
        plt.savefig(fig_name, bbox_inches="tight", dpi=300)

    # Non-blocking display, then release the figure.
    plt.draw()
    plt.pause(0.001)
    plt.close(fig)
261
+
262
+
263
def loss_curve(
    x_data: np.ndarray,
    train_err: np.ndarray,
    val_err: np.ndarray,
    name: str = "example",
    save_dir: Optional[str] = None,
) -> None:
    """
    Plot training and validation loss against training epochs, and
    optionally save the figure as a PNG.

    Parameters
    ----------
    x_data : `np.ndarray`
        Array of x-values (usually epochs) for the plot.
    train_err : `np.ndarray`
        Array of training error values.
    val_err : `np.ndarray`
        Array of validation error values.
    name : `str`, optional
        The name to use when saving the plot. Default is "example".
    save_dir : `Optional[str]`, optional
        Directory where the plot should be saved. If None, the plot is not saved. Default is None.

    Returns
    -------
    `None` : This function does not return any value. It generates and optionally saves a plot.
    """
    fig, ax = plt.subplots(figsize=(4, 4))

    # (values, line style, marker, edge colour, face colour, legend label)
    curve_specs = [
        (train_err, "-", "o", "#2F4F4F", "#C0C0C0", "train"),
        (val_err, "--", "s", "maroon", "pink", "validation"),
    ]
    for values, line_style, marker, edge, face, label in curve_specs:
        ax.plot(
            x_data,
            values,
            line_style,
            color=edge,
            marker=marker,
            mec=edge,
            mfc=face,
            ms=4,
            alpha=0.5,
            label=label,
        )

    # Faint reference line at the worst validation loss seen.
    ax.axhline(max(val_err), color="b", linestyle="--", alpha=0.3)

    ax.set_xlabel("Number of training epochs")
    ax.set_ylabel("Loss (Units)")
    # Fixed vertical window around the mean validation loss.
    ax.set_ylim(0, 2 * np.mean(val_err))

    ax.legend(loc=1, framealpha=0.35, handlelength=1.5)

    ax.xaxis.set_minor_locator(AutoMinorLocator(2))
    ax.yaxis.set_minor_locator(AutoMinorLocator(2))

    ax.tick_params(right=True, top=True, direction="in", length=7)
    ax.tick_params(which="minor", right=True, top=True, direction="in", length=4)

    if save_dir is not None:
        os.makedirs(save_dir, exist_ok=True)
        plt.savefig(f"{save_dir}/{name}_loss_curve.png", bbox_inches="tight", dpi=300)

    # Non-blocking display, then release the figure.
    plt.draw()
    plt.pause(0.001)
    plt.close()