likelihood 2.0.1__py3-none-any.whl → 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
likelihood/__init__.py CHANGED
@@ -16,4 +16,5 @@ To get started with Likelihood, simply import the desired modules and start expl
 
 from likelihood.main import *
 from likelihood.models import *
+from likelihood.pipes import Pipeline
 from likelihood.tools import *
likelihood/models/deep/_autoencoders.py CHANGED
@@ -7,8 +7,10 @@ from .autoencoders import (
     keras_tuner,
     l2,
     np,
+    os,
     partial,
     pd,
+    rmtree,
     sampling,
     suppress_warnings,
     tf,
likelihood/models/deep/rl.py CHANGED
@@ -56,7 +56,7 @@ class Env:
 
         Returns
         -------
-        `tuple` : (current_state, action_pred, reward, next_action, done)
+        tuple : (current_state, action_pred, reward, next_action, done)
         """
         if self.done:
             return None, None, 0, None, True
@@ -146,7 +146,7 @@ class AutoQL:
 
         Returns
        -------
-        `tuple` : (state, action, reward, next_action, done)
+        tuple : (state, action, reward, next_action, done)
         """
         current_state, value, reward, next_action, done = self.env.step(state, action)
 
@@ -174,7 +174,7 @@ class AutoQL:
 
         Returns
         -------
-        `tuple` : (state, action, reward, next_action, done)
+        tuple : (state, action, reward, next_action, done)
         """
         current_state, greedy_action, reward, next_action, done = self.epsilon_greedy_policy(
             state, action, epsilon
@@ -202,7 +202,7 @@ class AutoQL:
 
         Returns
        -------
-        `float` : Training loss
+        float : Training loss
         """
 
         batch_ = random.sample(self.replay_buffer, self.batch_size)
likelihood/pipes.py ADDED
@@ -0,0 +1,355 @@
+import json
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+
+from likelihood.tools import generate_html_pipeline
+from likelihood.tools.impute import SimpleImputer
+from likelihood.tools.models_tools import TransformRange, remove_collinearity
+from likelihood.tools.tools import DataFrameEncoder, DataScaler, LinearRegression, OneHotEncoder
+
+
+class Pipeline:
+    def __init__(self, config_path: str):
+        """
+        Initialize the pipeline with a JSON configuration file.
+
+        Parameters
+        ----------
+        config_path : str
+            Path to the JSON config defining target column and preprocessing steps.
+        """
+        self.config = self._load_config(config_path)
+        self.target_col = self.config["target_column"]
+        self.steps = self.config["preprocessing_steps"]
+        self.compute_importance = self.config.get("compute_feature_importance", False)
+        self.fitted_components: Dict[str, object] = {}
+        self.columns_bin_sizes: Dict[str, int] | None = None
+
+    def _load_config(self, config_path: str) -> Dict:
+        """Load and validate the JSON configuration."""
+        with open(config_path, "r") as f:
+            config = json.load(f)
+
+        assert "target_column" in config, "Config must specify 'target_column'"
+        assert "preprocessing_steps" in config, "Config must specify 'preprocessing_steps'"
+        return config
+
+    def fit(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, np.ndarray, Optional[pd.DataFrame]]:
+        """
+        Fit preprocessing components on the input DataFrame and return cleaned X/y.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            Input data with features plus the target column.
+
+        Returns
+        -------
+        X : pd.DataFrame
+            Cleaned feature matrix.
+        y : np.ndarray
+            Target vector (from self.target_col).
+        importances : Optional[pd.DataFrame]
+            Normalized feature-importance scores (if compute_feature_importance=True).
+        """
+        y = df[self.target_col].values
+        X = df.drop(columns=[self.target_col]).copy()
+
+        initial_info = {
+            "shape": X.shape,
+            "columns": list(X.columns),
+            "dtypes": X.dtypes.apply(lambda x: x.name).to_dict(),
+            "missing_values": X.isnull().sum().to_dict(),
+        }
+
+        steps_info = []
+        for step in self.steps:
+            step_name = step["name"]
+            params = step.get("params", {})
+            step_info = {
+                "step_name": step_name,
+                "parameters": params,
+                "description": self._get_step_description(step_name),
+            }
+            step_info["input_columns"] = list(X.columns)
+
+            X = self._apply_step(step_name, X, fit=True, **params)
+
+            step_info["output_shape"] = X.shape
+            step_info["output_columns"] = list(X.columns)
+            step_info["output_dtypes"] = X.dtypes.apply(lambda x: x.name).to_dict()
+
+            steps_info.append(step_info)
+
+        final_info = {
+            "shape": X.shape,
+            "columns": list(X.columns),
+            "dtypes": X.dtypes.apply(lambda x: x.name).to_dict(),
+            "missing_values": X.isnull().sum().to_dict(),
+        }
+
+        self.documentation = {
+            "initial_dataset": initial_info,
+            "processing_steps": steps_info,
+            "final_dataset": final_info,
+        }
+
+        df_scores_norm = None  # stays None unless feature importance is requested
+        if self.compute_importance:
+            numeric_X = X.select_dtypes(include=["float"])
+            numeric_columns = numeric_X.columns.tolist()
+            model = LinearRegression()
+            model.fit(numeric_X.T.values, y)
+            importances = model.get_importances()
+            df_scores = pd.DataFrame([importances], columns=numeric_columns)
+            df_scores_abs = df_scores.abs()
+            df_scores_norm = (
+                df_scores_abs / df_scores_abs.to_numpy().sum()
+                if isinstance(importances, np.ndarray)
+                else pd.DataFrame()
+            )
+        return X, y, df_scores_norm
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Apply fitted preprocessing steps to new data (no target column needed).
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            New data to transform.
+
+        Returns
+        -------
+        X_transformed : pd.DataFrame
+            Cleaned feature matrix.
+        """
+        X = df.copy()
+        for step_name, _ in self.fitted_components.items():
+            X = self._apply_step(step_name, X, fit=False)
+
+        return X
+
+    def get_doc(
+        self, save_to_file: bool = True, file_name: str = "data_processing_report.html"
+    ) -> None:
+        """
+        Generate an HTML report from `self.documentation` for pipeline documentation.
+
+        Parameters
+        ----------
+        save_to_file : bool, optional
+            Whether to save the generated HTML content to a file. Default is True.
+        file_name : str, optional
+            Filename for the output when `save_to_file` is True. Default is "data_processing_report.html".
+        """
+
+        generate_html_pipeline(self.documentation, save_to_file=save_to_file, file_name=file_name)
+
+    def _apply_step(self, step_name: str, X: pd.DataFrame, fit: bool, **params) -> pd.DataFrame:
+        """Dispatch to the correct handler for a preprocessing step."""
+        handlers = {
+            "DataScaler": self._handle_datascaler,
+            "DataFrameEncoder": self._handle_dataframeencoder,
+            "remove_collinearity": self._handle_remove_collinearity,
+            "TransformRange": self._handle_transformrange,
+            "OneHotEncoder": self._handle_onehotencoder,
+            "SimpleImputer": self._handle_simpleimputer,
+        }
+
+        if step_name not in handlers:
+            raise ValueError(
+                f"Step '{step_name}' not supported. Supported steps: {list(handlers.keys())}"
+            )
+
+        return handlers[step_name](X, fit=fit, **params)
+
+    def _get_step_description(self, step_name: str) -> str:
+        """Return a description of what each preprocessing step does."""
+        descriptions = {
+            "DataScaler": "Scales numerical features using normalization",
+            "DataFrameEncoder": "Encodes categorical variables and normalizes numerical features",
+            "remove_collinearity": "Removes highly correlated features to reduce multicollinearity",
+            "TransformRange": "Bins continuous features into discrete ranges",
+            "OneHotEncoder": "Converts categorical variables into binary variables",
+            "SimpleImputer": "Handles missing values by imputing with multiple linear regression strategies",
+        }
+
+        return descriptions.get(step_name, f"Unknown preprocessing step: {step_name}")
+
+    # ------------------------------ Step Handlers ------------------------------
+    def _handle_datascaler(self, X: pd.DataFrame, fit: bool, n: int = 1) -> pd.DataFrame:
+        """Handle DataScaler (fits on training data, applies to all)."""
+        numeric_X = X.select_dtypes(include=["float"])
+        numeric_columns = numeric_X.columns.tolist()
+        n = None if n == 0 else n
+        if fit:
+            scaler = DataScaler(numeric_X.values.T, n=n)
+            self.fitted_components["DataScaler"] = scaler
+            numeric_X = pd.DataFrame(scaler.rescale().T, columns=numeric_X.columns)
+        else:
+            scaler = self.fitted_components["DataScaler"]
+            numeric_X = pd.DataFrame(
+                scaler.rescale(numeric_X.values.T).T, columns=numeric_X.columns
+            )
+        for col in numeric_columns:
+            X[col] = numeric_X[col]
+        return X
+
+    def _handle_dataframeencoder(
+        self, X: pd.DataFrame, fit: bool, norm_method: str = "mean"
+    ) -> pd.DataFrame:
+        """Handle DataFrameEncoder (fits encoders/normalizers)."""
+        if fit:
+            encoder = DataFrameEncoder(X)
+            encoded_X = encoder.encode(norm_method=norm_method)
+            self.fitted_components["DataFrameEncoder"] = encoder
+            return encoded_X
+        else:
+            encoder = self.fitted_components["DataFrameEncoder"]
+            encoder._df = X
+            return encoder.encode()
+
+    def _handle_remove_collinearity(
+        self, X: pd.DataFrame, fit: bool, threshold: float = 0.9
+    ) -> pd.DataFrame:
+        """Handle collinearity removal (fits by selecting columns to drop)."""
+        numeric_X = X.select_dtypes(include=["float"])
+        numeric_columns = numeric_X.columns.tolist()
+        categorical_columns = set(X.columns) - set(numeric_columns)
+        if fit:
+            cleaned_X = remove_collinearity(numeric_X, threshold=threshold)
+            dropped_cols = set(X.columns) - set(cleaned_X.columns) - categorical_columns
+            self.fitted_components["remove_collinearity"] = dropped_cols
+            return X.drop(columns=dropped_cols)
+        else:
+            dropped_cols = self.fitted_components["remove_collinearity"]
+            return X.drop(columns=dropped_cols)
+
+    def _handle_transformrange(
+        self, X: pd.DataFrame, fit: bool, columns_bin_sizes: Dict[str, int] | None = None
+    ) -> pd.DataFrame:
+        """Handle TransformRange (bins numerical features into ranges)."""
+        if fit:
+            transformer = TransformRange(columns_bin_sizes)
+            cleaned_X = transformer.transform(X)
+            self.fitted_components["TransformRange"] = transformer
+            self.columns_bin_sizes = columns_bin_sizes
+            return cleaned_X
+        else:
+            transformer = self.fitted_components["TransformRange"]
+            return transformer.transform(X, fit=False)
+
+    def _handle_onehotencoder(
+        self, X: pd.DataFrame, fit: bool, columns: List[str] | None = None
+    ) -> pd.DataFrame:
+        """Handle OneHotEncoder (fits on categorical columns)."""
+        if fit:
+            tmp_df = X.drop(columns=columns)
+            encoder = OneHotEncoder()
+            category_to_indices = {}
+            for col in columns:
+                unique_values = X[col].unique()
+                category_to_indices[col] = {value: i for i, value in enumerate(unique_values)}
+                encoded_X = encoder.encode(
+                    X[col].values
+                    if isinstance(unique_values[0], int)
+                    else X[col].map(category_to_indices[col])
+                )
+                tmp_df = pd.concat([tmp_df, pd.DataFrame(encoded_X, columns=unique_values)], axis=1)
+            self.fitted_components["OneHotEncoder"] = (encoder, columns, category_to_indices)
+        else:
+            encoder, columns, category_to_indices = self.fitted_components["OneHotEncoder"]
+            tmp_df = X.drop(columns=columns)
+            for col in columns:
+                unique_values = list(category_to_indices[col].keys())
+                encoded_X = encoder.encode(
+                    (
+                        X[col].values
+                        if isinstance(unique_values[0], int)
+                        else X[col].map(category_to_indices[col])
+                    ),
+                    fit=False,
+                )
+                tmp_df = pd.concat([tmp_df, pd.DataFrame(encoded_X, columns=unique_values)], axis=1)
+        return tmp_df
+
+    def _handle_simpleimputer(
+        self,
+        X: pd.DataFrame,
+        fit: bool,
+        use_scaler: bool = False,
+        boundary: bool = True,
+    ) -> pd.DataFrame:
+        """Handle SimpleImputer (fits on numerical and categorical columns)."""
+        if fit:
+            use_scaler = True if use_scaler == 1 else False
+            imputer = SimpleImputer(use_scaler=use_scaler)
+            tmp_df = imputer.fit_transform(X, boundary=boundary)
+            self.fitted_components["SimpleImputer"] = imputer
+            return tmp_df
+        else:
+            imputer = self.fitted_components["SimpleImputer"]
+            return imputer.transform(X, boundary=boundary)
+
+    def save(self, filepath: str) -> None:
+        """
+        Save the fitted pipeline state to a file using pickle.
+
+        Parameters
+        ----------
+        filepath : str
+            Path where the serialized pipeline will be saved.
+        """
+        import pickle
+
+        save_dict = {
+            "config": self.config,
+            "fitted_components": self.fitted_components,
+            "target_col": self.target_col,
+            "steps": self.steps,
+            "compute_importance": self.compute_importance,
+            "columns_bin_sizes": self.columns_bin_sizes,
+            "documentation": self.documentation,
+        }
+
+        filepath = filepath + ".pkl" if not filepath.endswith(".pkl") else filepath
+
+        with open(filepath, "wb") as f:
+            pickle.dump(save_dict, f)
+
+    @classmethod
+    def load(cls, filepath: str) -> "Pipeline":
+        """
+        Load a fitted pipeline from a file.
+
+        Parameters
+        ----------
+        filepath : str
+            Path to the serialized pipeline file.
+
+        Returns
+        -------
+        pipeline : Pipeline
+            Reconstructed pipeline instance with fitted components.
+        """
+        import pickle
+
+        filepath = filepath + ".pkl" if not filepath.endswith(".pkl") else filepath
+
+        with open(filepath, "rb") as f:
+            save_dict = pickle.load(f)
+
+        pipeline = cls.__new__(cls)
+
+        pipeline.config = save_dict["config"]
+        pipeline.fitted_components = save_dict["fitted_components"]
+        pipeline.target_col = save_dict["target_col"]
+        pipeline.steps = save_dict["steps"]
+        pipeline.compute_importance = save_dict["compute_importance"]
+        pipeline.columns_bin_sizes = save_dict["columns_bin_sizes"]
+        pipeline.documentation = save_dict["documentation"]
+
+        return pipeline
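
A minimal sketch of how the new `Pipeline` might be driven end to end (it is also re-exported from the package root via the `likelihood/__init__.py` change above). The config keys mirror what `_load_config` validates and `__init__` reads; the column names, step parameters, and file names below are hypothetical, and the behavior of the underlying `DataScaler`/`remove_collinearity` helpers is assumed, not verified here.

    import json

    import pandas as pd

    from likelihood.pipes import Pipeline

    # Hypothetical config: only "target_column" and "preprocessing_steps" are required.
    config = {
        "target_column": "price",
        "preprocessing_steps": [
            {"name": "DataScaler", "params": {"n": 1}},
            {"name": "remove_collinearity", "params": {"threshold": 0.9}},
        ],
    }
    with open("pipeline_config.json", "w") as f:
        json.dump(config, f)

    df = pd.DataFrame(
        {
            "area": [50.0, 75.0, 120.0, 95.0],
            "rooms": [2.0, 3.0, 4.0, 3.0],
            "price": [1.0, 1.5, 2.4, 1.9],
        }
    )

    pipe = Pipeline("pipeline_config.json")
    X, y, scores = pipe.fit(df)  # fits each step and builds pipe.documentation
    X_new = pipe.transform(df.drop(columns=["price"]))  # replays fitted steps on new data
    pipe.get_doc(save_to_file=True)  # writes data_processing_report.html
    pipe.save("house_pipeline")  # pickled to house_pipeline.pkl
    restored = Pipeline.load("house_pipeline")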
likelihood/tools/__init__.py CHANGED
@@ -1,3 +1,4 @@
 from .models_tools import *
 from .numeric_tools import *
+from .reports import generate_html_pipeline
 from .tools import *
likelihood/tools/models_tools.py CHANGED
@@ -47,22 +47,21 @@ class TransformRange:
     Transforms numerical columns into categorical range bins with descriptive labels.
     """
 
-    def __init__(self, df: pd.DataFrame) -> None:
+    def __init__(self, columns_bin_sizes: Dict[str, int]) -> None:
         """Initializes the class with the original DataFrame.
 
         Parameters
         ----------
-        df : `pd.DataFrame`
-            The original DataFrame to transform.
+        columns_bin_sizes : `dict`
+            A dictionary where the keys are column names and the values are the bin sizes.
 
         Raises
        ------
         TypeError
             If df is not a pandas DataFrame.
         """
-        if not isinstance(df, pd.DataFrame):
-            raise TypeError("df must be a pandas DataFrame")
-        self.df = df.copy()  # Create a copy to avoid modifying the original
+        self.info = {}
+        self.columns_bin_sizes = columns_bin_sizes
 
     def _create_bins_and_labels(
         self, min_val: Union[int, float], max_val: Union[int, float], bin_size: int
@@ -104,15 +103,28 @@ class TransformRange:
         if bins[-1] <= max_val:
             bins = np.append(bins, max_val + 1)
 
+        lower_bin_edge = -np.inf
+        upper_bin_edge = np.inf
+
         labels = [f"{int(bins[i])}-{int(bins[i+1] - 1)}" for i in range(len(bins) - 1)]
+        end = int(bins[-1] - 1)
+        bins = bins.tolist()
+        bins.insert(0, lower_bin_edge)
+        bins.append(upper_bin_edge)
+        labels.insert(0, f"< {start}")
+        labels.append(f"> {end}")
         return bins, labels
 
-    def _transform_column_to_ranges(self, column: str, bin_size: int) -> pd.Series:
+    def _transform_column_to_ranges(
+        self, df: pd.DataFrame, column: str, bin_size: int, fit: bool = True
+    ) -> pd.Series:
         """
         Transforms a column in the DataFrame into range bins.
 
         Parameters
         ----------
+        df : `pd.DataFrame`
+            The original DataFrame to transform.
         column : `str`
             The name of the column to transform.
         bin_size : `int`
@@ -130,40 +142,51 @@ class TransformRange:
         ValueError
             If bin_size is not positive or if column contains non-numeric data.
         """
-        if column not in self.df.columns:
-            raise KeyError(f"Column '{column}' not found in DataFrame")
-
-        if bin_size <= 0:
-            raise ValueError("bin_size must be positive")
-
-        numeric_series = pd.to_numeric(self.df[column], errors="coerce")
-        if numeric_series.isna().all():
-            raise ValueError(f"Column '{column}' contains no valid numeric data")
-
-        min_val = numeric_series.min()
-        max_val = numeric_series.max()
-
-        if min_val == max_val:
-            return pd.Series(
-                [f"{int(min_val)}-{int(max_val)}"] * len(self.df), name=f"{column}_range"
-            )
+        if not isinstance(df, pd.DataFrame):
+            raise TypeError("df must be a pandas DataFrame")
+        df_ = df.copy()  # Create a copy to avoid modifying the original
+        numeric_series = pd.to_numeric(df_[column], errors="coerce")
+        if fit:
+            self.df = df_.copy()
+            if column not in df_.columns:
+                raise KeyError(f"Column '{column}' not found in DataFrame")
+
+            if bin_size <= 0:
+                raise ValueError("bin_size must be positive")
+
+            if numeric_series.isna().all():
+                raise ValueError(f"Column '{column}' contains no valid numeric data")
+
+            min_val = numeric_series.min()
+            max_val = numeric_series.max()
+
+            if min_val == max_val:
+                return pd.Series(
+                    [f"{int(min_val)}-{int(max_val)}"] * len(df_), name=f"{column}_range"
+                )
+            self.info[column] = {"min_value": min_val, "max_value": max_val, "range": bin_size}
+        else:
+            min_val = self.info[column]["min_value"]
+            max_val = self.info[column]["max_value"]
+            bin_size = self.info[column]["range"]
 
         bins, labels = self._create_bins_and_labels(min_val, max_val, bin_size)
-
         return pd.cut(numeric_series, bins=bins, labels=labels, right=False, include_lowest=True)
 
-    def transform_dataframe(
-        self, columns_bin_sizes: Dict[str, int], drop_original: bool = False
+    def transform(
+        self, df: pd.DataFrame, drop_original: bool = False, fit: bool = True
     ) -> pd.DataFrame:
         """
         Creates a new DataFrame with range columns.
 
         Parameters
         ----------
-        columns_bin_sizes : `dict`
-            A dictionary where the keys are column names and the values are the bin sizes.
+        df : `pd.DataFrame`
+            The original DataFrame to transform.
         drop_original : `bool`, optional
             If True, drops original columns from the result, by default False
+        fit : `bool`, default=True
+            Whether to compute bin edges based on the data (True) or use predefined binning (False).
 
         Returns
         -------
@@ -175,22 +198,24 @@ class TransformRange:
         TypeError
             If columns_bin_sizes is not a dictionary.
         """
-        if not isinstance(columns_bin_sizes, dict):
+        if not isinstance(self.columns_bin_sizes, dict):
             raise TypeError("columns_bin_sizes must be a dictionary")
 
-        if not columns_bin_sizes:
+        if not self.columns_bin_sizes:
            return pd.DataFrame()
 
         range_columns = {}
-        for column, bin_size in columns_bin_sizes.items():
-            range_columns[f"{column}_range"] = self._transform_column_to_ranges(column, bin_size)
+        for column, bin_size in self.columns_bin_sizes.items():
+            range_columns[f"{column}_range"] = self._transform_column_to_ranges(
+                df, column, bin_size, fit
+            )
 
         result_df = pd.DataFrame(range_columns)
 
         if not drop_original:
-            original_cols = [col for col in self.df.columns if col not in columns_bin_sizes]
+            original_cols = [col for col in df.columns if col not in self.columns_bin_sizes]
             if original_cols:
-                result_df = pd.concat([self.df[original_cols], result_df], axis=1)
+                result_df = pd.concat([df[original_cols], result_df], axis=1)
 
         return result_df
 
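
A short sketch of the refactored `TransformRange` API under the hunks above: bin sizes now go to the constructor, and `transform` takes the DataFrame plus a `fit` flag so learned bin edges can be replayed on new data. The column name and values are made up, and this assumes the unshown parts of `_create_bins_and_labels` (e.g. the `start` label value) behave as the diff suggests.

    import pandas as pd

    from likelihood.tools.models_tools import TransformRange

    train = pd.DataFrame({"age": [12, 25, 37, 49]})
    tr = TransformRange({"age": 10})  # bin sizes are passed at construction now
    binned = tr.transform(train)  # fit=True: records min/max/bin size per column in tr.info
    new = pd.DataFrame({"age": [5, 60]})
    binned_new = tr.transform(new, fit=False)  # reuses stored edges; out-of-range values fall into the "< ..." / "> ..." bins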
likelihood/tools/reports.py ADDED
@@ -0,0 +1,195 @@
+from html import escape
+from IPython.display import HTML, display
+
+
+def generate_html_pipeline(data_dict, save_to_file=False, file_name="data_processing_report.html"):
+    css_js = """
+    <style>
+    :root {
+        --primary: #0d9488;
+        --primary-dark: #0f766e;
+        --success: #10b981;
+        --accent: #3b82f6;
+        --card-bg: #ffffff;
+        --shadow-sm: 0 2px 6px rgba(0, 0, 0, 0.03);
+        --border-radius-md: 6px;
+    }
+
+    * {
+        box-sizing: border-box;
+    }
+
+    body {
+        font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
+        background: #f8fafc;
+        color: #1e293b;
+        margin: 0;
+        padding: 1rem;
+        font-size: 14px;
+    }
+
+    h2 {
+        background: linear-gradient(135deg, var(--primary), var(--primary-dark));
+        color: white;
+        text-align: center;
+        padding: 1rem;
+        border-radius: var(--border-radius-md);
+        font-weight: 600;
+        font-size: 1.5rem;
+        margin-bottom: 1.5rem;
+    }
+
+    section {
+        background: var(--card-bg);
+        border-radius: var(--border-radius-md);
+        padding: 1rem;
+        box-shadow: var(--shadow-sm);
+        margin-bottom: 1.2rem;
+    }
+
+    h3 {
+        color: var(--primary-dark);
+        font-weight: 600;
+        font-size: 1.2rem;
+        border-left: 4px solid var(--success);
+        padding-left: 0.8rem;
+        margin: 1rem 0 0.8rem;
+    }
+
+    table {
+        width: 100%;
+        border-collapse: collapse;
+        font-size: 13px;
+        margin: 0.5rem 0 1rem;
+    }
+
+    th, td {
+        padding: 0.5rem 0.75rem;
+        text-align: left;
+        border-bottom: 1px solid #e2e8f0;
+        vertical-align: top;
+    }
+
+    thead {
+        background-color: #f0fdf4;
+    }
+
+    tbody tr:nth-child(odd) {
+        background-color: #f9fafb;
+    }
+
+    tbody tr:hover {
+        background-color: #e0f2fe;
+    }
+
+    .nested-table {
+        font-size: 12px;
+        margin-top: 0.5rem;
+    }
+
+    details {
+        margin-bottom: 0.8rem;
+        padding: 0.5rem 0.8rem;
+        background: #f9f9f9;
+        border-radius: var(--border-radius-md);
+    }
+
+    summary {
+        font-weight: 600;
+        font-size: 1rem;
+        color: var(--primary-dark);
+        cursor: pointer;
+    }
+
+    summary::before {
+        content: "▶";
+        margin-right: 6px;
+        color: var(--success);
+        font-size: 0.9rem;
+    }
+
+    @media (max-width: 768px) {
+        body {
+            font-size: 13px;
+        }
+
+        h2 {
+            font-size: 1.3rem;
+            padding: 0.8rem;
+        }
+
+        h3 {
+            font-size: 1.1rem;
+        }
+
+        table, .nested-table {
+            font-size: 12px;
+        }
+    }
+    </style>
+    """
+
+    def render_value(val):
+        if isinstance(val, dict):
+            return dict_to_table(val, nested=True)
+        elif isinstance(val, list):
+            if all(isinstance(item, (str, int, float)) for item in val):
+                return ", ".join(escape(str(x)) for x in val)
+            else:
+                return "<ul>" + "".join(f"<li>{render_value(v)}</li>" for v in val) + "</ul>"
+        else:
+            return escape(str(val))
+
+    def dict_to_table(d, title=None, nested=False):
+        html = ""
+        if title and not nested:
+            html += f"<h4>{escape(title)}</h4>"
+        table_class = "nested-table" if nested else "table"
+        html += f"<table class='{table_class}'>"
+        html += "<thead><tr><th>Key</th><th>Value</th></tr></thead><tbody>"
+        for key, val in d.items():
+            key_html = escape(str(key))
+            val_html = render_value(val)
+            html += f"<tr><td>{key_html}</td><td>{val_html}</td></tr>"
+        html += "</tbody></table>"
+        return html
+
+    html_content = css_js
+    html_content += "<h2>📈 Data Processing Report</h2>"
+
+    html_content += "<section>"
+    html_content += "<h3>📁 Initial Dataset</h3>"
+    html_content += dict_to_table(data_dict["initial_dataset"])
+    html_content += "</section>"
+
+    html_content += "<section>"
+    html_content += "<h3>🔧 Processing Steps</h3>"
+    for i, step in enumerate(data_dict["processing_steps"]):
+        html_content += "<details open>"
+        html_content += f"<summary>Step {i + 1}: {escape(step['step_name'])}</summary>"
+        html_content += f"<p><strong>Description:</strong> {escape(step['description'])}</p>"
+        html_content += dict_to_table(step["parameters"], title="Parameters", nested=True)
+        html_content += dict_to_table(
+            {
+                "Output Shape": step["output_shape"],
+                "Input Columns": step["input_columns"],
+                "Output Columns": step["output_columns"],
+                "Output Dtypes": step["output_dtypes"],
+            },
+            title="Output Info",
+            nested=True,
+        )
+        html_content += "</details>"
+    html_content += "</section>"
+
+    html_content += "<section>"
+    html_content += "<h3>✅ Final Dataset</h3>"
+    html_content += dict_to_table(data_dict["final_dataset"])
+    html_content += "</section>"
+
+    if save_to_file:
+        with open(file_name, "w", encoding="utf-8") as f:
+            f.write(html_content)
+        print(f"✅ Report saved to '{file_name}'")
+    else:
+        display(HTML(html_content))
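
The report generator consumes the same dictionary shape that `Pipeline.fit` stores in `self.documentation`. A minimal standalone sketch with illustrative field values; `save_to_file=True` writes the file instead of calling IPython's display (though the module still imports IPython at load time):

    from likelihood.tools import generate_html_pipeline

    doc = {
        "initial_dataset": {"shape": (4, 2), "columns": ["area", "rooms"]},
        "processing_steps": [
            {
                "step_name": "DataScaler",
                "description": "Scales numerical features using normalization",
                "parameters": {"n": 1},
                "input_columns": ["area", "rooms"],
                "output_shape": (4, 2),
                "output_columns": ["area", "rooms"],
                "output_dtypes": {"area": "float64", "rooms": "float64"},
            }
        ],
        "final_dataset": {"shape": (4, 2), "columns": ["area", "rooms"]},
    }
    generate_html_pipeline(doc, save_to_file=True, file_name="report.html")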
likelihood/tools/tools.py CHANGED
@@ -2,7 +2,7 @@ import math
 import os
 import pickle
 import warnings
-from typing import Callable, Dict, List, Tuple, Union
+from typing import Callable, Dict, Generator, List, Tuple, Union
 
 import matplotlib.pyplot as plt
 import numpy as np
@@ -25,7 +25,7 @@ Data Science from Scratch, Second Edition, by Joel Grus (O'Reilly).Copyright 201
 """
 
 
-def minibatches(dataset: List, batch_size: int, shuffle: bool = True) -> List:
+def minibatches(dataset: List, batch_size: int, shuffle: bool = True) -> Generator:
     """Generates 'batch_size'-sized minibatches from the dataset
 
     Parameters
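
The corrected annotation reflects that `minibatches` yields batches lazily rather than returning a list. A tiny usage sketch, assuming the Grus-style slicing behavior the docstring describes:

    from likelihood.tools.tools import minibatches

    data = list(range(10))
    for batch in minibatches(data, batch_size=4, shuffle=False):
        print(batch)  # e.g. [0, 1, 2, 3], then [4, 5, 6, 7], then [8, 9]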
@@ -660,7 +660,7 @@ class DataScaler:
 
     __slots__ = ["dataset_", "_n", "data_scaled", "values", "inv_fitting"]
 
-    def __init__(self, dataset: np.ndarray, n: int = 1) -> None:
+    def __init__(self, dataset: np.ndarray, n: int | None = 1) -> None:
         """Initializes the parameters required for scaling the data"""
         self.dataset_ = dataset.copy()
         self._n = n
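
The widened `n: int | None` hint matches how `Pipeline._handle_datascaler` maps a configured `n == 0` to `None` before constructing the scaler. A sketch, assuming `rescale()` keeps its existing behavior; the data here is synthetic:

    import numpy as np

    from likelihood.tools.tools import DataScaler

    data = np.random.rand(3, 100)  # rows as features, as in Pipeline._handle_datascaler
    scaler = DataScaler(data, n=None)  # n=None is now an explicitly allowed value
    scaled = scaler.rescale()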
@@ -1024,20 +1024,21 @@ class OneHotEncoder:
     It receives an array of integers and returns a binary array using the one-hot encoding method.
     """
 
-    __slots__ = ["x"]
+    __slots__ = ["num_categories"]
 
     def __init__(self) -> None:
         pass
 
-    def encode(self, x: np.ndarray | list):
-        self.x = x
-
-        if not isinstance(self.x, np.ndarray):
-            self.x = np.array(self.x)
+    def encode(self, x: np.ndarray | list, fit: bool = True):
+        if not isinstance(x, np.ndarray):
+            x = np.array(x)
+        x = x.astype(int)
+        if fit:
+            self.num_categories = x.max() + 1
 
-        y = np.zeros((self.x.size, self.x.max() + 1))
+        y = np.zeros((x.size, self.num_categories))
 
-        y[np.arange(self.x.size), self.x] = 1
+        y[np.arange(x.size), x] = 1
 
         return y
 
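
The encoder now remembers `num_categories` from the fitting call, so later calls with `fit=False` produce arrays with a consistent column count. A quick sketch based on the diffed code:

    from likelihood.tools.tools import OneHotEncoder

    enc = OneHotEncoder()
    Y_train = enc.encode([0, 2, 1, 2])  # fit=True: num_categories = 3, shape (4, 3)
    Y_new = enc.encode([1, 0], fit=False)  # reuses num_categories, shape (2, 3)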
@@ -1189,7 +1190,9 @@ def check_nan_inf(df: DataFrame, verbose: bool = False) -> DataFrame:
     if nan_values:
         (
             print(
-                "UserWarning: Some rows may have been deleted due to the existence of NaN values."
+                "UserWarning: Some rows may have been deleted due to the existence of NaN values.",
+                f"NaN values removed: ",
+                "{:,}".format(nan_count),
             )
             if verbose
             else None
@@ -1199,7 +1202,9 @@ def check_nan_inf(df: DataFrame, verbose: bool = False) -> DataFrame:
     if inf_values:
         (
             print(
-                "UserWarning: Some rows may have been deleted due to the existence of Inf values."
+                "UserWarning: Some rows may have been deleted due to the existence of Inf values.",
+                f"Infinite values removed: ",
+                "{:,}".format(inf_count),
             )
             if verbose
             else None
@@ -1207,9 +1212,6 @@ def check_nan_inf(df: DataFrame, verbose: bool = False) -> DataFrame:
     df.replace([np.inf, -np.inf], np.nan, inplace=True)
     df.dropna(inplace=True)
 
-    print(f"NaN values removed: ", "{:,}".format(nan_count))
-    print(f"Infinite values removed: ", "{:,}".format(inf_count))
-
     return df
 
 
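
With this change the removal counts are printed only when `verbose=True` instead of unconditionally. A small sketch (synthetic data):

    import numpy as np
    import pandas as pd

    from likelihood.tools.tools import check_nan_inf

    df = pd.DataFrame({"a": [1.0, np.nan, np.inf, 4.0], "b": [1.0, 2.0, 3.0, 4.0]})
    clean = check_nan_inf(df, verbose=True)  # prints the warnings with counts; silent when verbose=False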
likelihood-2.0.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: likelihood
-Version: 2.0.1
+Version: 2.0.2
 Summary: A package that performs the maximum likelihood algorithm.
 Home-page: https://github.com/jzsmoreno/likelihood/
 Author: J. A. Moreno-Guerra
likelihood-2.0.2.dist-info/RECORD CHANGED
@@ -1,5 +1,6 @@
-likelihood/__init__.py,sha256=5C0hapdsk85XZhN_rssRAEFpkRRuKNtj6cyRbqD2_gM,994
+likelihood/__init__.py,sha256=e2AiFru2wEpWnK6frQlzEI-4r8UyU59ltxtkvOs-nEI,1032
 likelihood/main.py,sha256=fcCkGOOWKjfvw2tLVqjuKPV8t0rVCIT9FlbYcOv4EYo,7974
+likelihood/pipes.py,sha256=9M94YGppg-Q1yNpSIjAzy4MSNSCGobr10LrRRMpEmdA,13853
 likelihood/graph/__init__.py,sha256=vUY4pKlnm3eSVTXd2d-5JDPawhqGNRIKRhaHIobsNws,188
 likelihood/graph/_nn.py,sha256=Sh7dRz8QSI08Ydfw9e--uCxc4KMtHUsCz_-C-loXklQ,13883
 likelihood/graph/graph.py,sha256=bLrNMvIh7GOTdPTwnNss8oPZ7cbSHQScAsH_ttmVUK0,3294
@@ -10,21 +11,22 @@ likelihood/models/regression.py,sha256=9cakyGlJCEO6WfpoKLh3GxdXQeQp7cUvJIkQ5odT0
 likelihood/models/simulation.py,sha256=xsl4mJ2qFCuZR_B9LfQcLjV6OtONU1zyESX3CCUfOiw,8619
 likelihood/models/utils.py,sha256=dvigPi_hxcs5ntfHr7Y1JvP5ULtMW3kkN0nJpS4orE8,1319
 likelihood/models/deep/__init__.py,sha256=I55FciI0BfljYdhW2OGNqcpYV57FhPZETZX7Y1y9GVQ,303
-likelihood/models/deep/_autoencoders.py,sha256=CeD79YzU7DdPd92wUNG_EtPVQOBgsgYoC4uS2JF3b6o,30939
+likelihood/models/deep/_autoencoders.py,sha256=VG6nL5Zov1a31TnzbFNALiyBOAvup3s86ShD64JxmLA,30959
 likelihood/models/deep/_predictor.py,sha256=XI4QfVM7PS_60zYtmi-V8UzNDrASFiDMVPmV17BB8lM,27984
 likelihood/models/deep/autoencoders.py,sha256=muUBH9BclOK8ViI7PijyMOBBLVox6uwuIabyJvpU5qw,30729
 likelihood/models/deep/gan.py,sha256=rTnaLmIPjsKg6_0B8JZOVwPxdx59rHmqvzDitdJMCQ4,10924
 likelihood/models/deep/predictor.py,sha256=q5tPaAbF7s5XIcxVr6fyHTQdZa9tlixO9vb9a9Cw0wM,27831
-likelihood/models/deep/rl.py,sha256=VVuwHwK24d2fe3uNHliE1QJsKGZAPhx_pdgj3jqN5rQ,11565
-likelihood/tools/__init__.py,sha256=N1IhMDzacsGQT2MIYBMBC0zTxes78vC_0gGrwkuPgmg,78
+likelihood/models/deep/rl.py,sha256=WuRP_1RmTIkzbirjb7geUxFsqCAwt7CdtXWKHrmIgo4,11541
+likelihood/tools/__init__.py,sha256=C5r18DQdyBVePyxtlfdVLr9SMFnXeVuvgcZgmN5-3dY,122
 likelihood/tools/cat_embed.py,sha256=SJ7o1vbrNYp21fLLcjRnWpUDcz1nVSe8TmMvsLIz5CI,7346
 likelihood/tools/figures.py,sha256=waF0NHIMrctCmaLhcuz5DMcXyRKynmn6aG0XITYCTLc,10940
 likelihood/tools/impute.py,sha256=n87Tv-xLUAdPl7BQLFcLWSsXBZbXksahyCayJWMydXc,9485
-likelihood/tools/models_tools.py,sha256=-QAfvCy9mw-ZyeJHzJJ7O6eDfUXghtA7KfFtTc-Tp0A,14607
+likelihood/tools/models_tools.py,sha256=2e0uzpjgCMn5cB0McN6a_Vff_QHdMhjUbgLHzMferEA,15622
 likelihood/tools/numeric_tools.py,sha256=JeLECoVS3ayFH53kUYkAMs0fzALZV1M22-tBLM-Q34g,12264
-likelihood/tools/tools.py,sha256=5vPUHrm8D4ODsg-MP4uZ3NgXV9fNbs0Olx7RWtUdVDU,42196
-likelihood-2.0.1.dist-info/licenses/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
-likelihood-2.0.1.dist-info/METADATA,sha256=3mLJAcVO4jzu4IoCVVaSBPMxBWV-xnHs_f_DvvN9G0c,2917
-likelihood-2.0.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-likelihood-2.0.1.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
-likelihood-2.0.1.dist-info/RECORD,,
+likelihood/tools/reports.py,sha256=InhmpBQ014oCMLIwz_hkhkxj8LcVbMBy_dl8NPViSdI,5710
+likelihood/tools/tools.py,sha256=5teN35Dt6z4j6GgMiizf9txbD9uVirCJVLsGPTE7pZY,42336
+likelihood-2.0.2.dist-info/licenses/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
+likelihood-2.0.2.dist-info/METADATA,sha256=qdLRVowEQRvzD1UeTT2QYT-wrAtHtJEiM28y_Jv69iA,2917
+likelihood-2.0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+likelihood-2.0.2.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
+likelihood-2.0.2.dist-info/RECORD,,