likelihood 2.0.0__tar.gz → 2.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {likelihood-2.0.0 → likelihood-2.0.2}/PKG-INFO +1 -1
  2. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/__init__.py +1 -0
  3. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/_autoencoders.py +2 -0
  4. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/rl.py +36 -36
  5. likelihood-2.0.2/likelihood/pipes.py +355 -0
  6. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/__init__.py +1 -0
  7. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/models_tools.py +219 -7
  8. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/numeric_tools.py +4 -4
  9. likelihood-2.0.2/likelihood/tools/reports.py +195 -0
  10. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/tools.py +19 -17
  11. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood.egg-info/PKG-INFO +1 -1
  12. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood.egg-info/SOURCES.txt +2 -0
  13. {likelihood-2.0.0 → likelihood-2.0.2}/LICENSE +0 -0
  14. {likelihood-2.0.0 → likelihood-2.0.2}/README.md +0 -0
  15. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/graph/__init__.py +0 -0
  16. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/graph/_nn.py +0 -0
  17. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/graph/graph.py +0 -0
  18. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/graph/nn.py +0 -0
  19. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/main.py +0 -0
  20. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/__init__.py +0 -0
  21. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/__init__.py +0 -0
  22. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/_predictor.py +0 -0
  23. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/autoencoders.py +0 -0
  24. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/gan.py +0 -0
  25. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/predictor.py +0 -0
  26. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/hmm.py +0 -0
  27. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/regression.py +0 -0
  28. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/simulation.py +0 -0
  29. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/utils.py +0 -0
  30. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/cat_embed.py +0 -0
  31. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/figures.py +0 -0
  32. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/impute.py +0 -0
  33. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood.egg-info/dependency_links.txt +0 -0
  34. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood.egg-info/requires.txt +0 -0
  35. {likelihood-2.0.0 → likelihood-2.0.2}/likelihood.egg-info/top_level.txt +0 -0
  36. {likelihood-2.0.0 → likelihood-2.0.2}/setup.cfg +0 -0
  37. {likelihood-2.0.0 → likelihood-2.0.2}/setup.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: likelihood
- Version: 2.0.0
+ Version: 2.0.2
  Summary: A package that performs the maximum likelihood algorithm.
  Home-page: https://github.com/jzsmoreno/likelihood/
  Author: J. A. Moreno-Guerra
@@ -16,4 +16,5 @@ To get started with Likelihood, simply import the desired modules and start expl
 
  from likelihood.main import *
  from likelihood.models import *
+ from likelihood.pipes import Pipeline
  from likelihood.tools import *
@@ -7,8 +7,10 @@ from .autoencoders import (
  keras_tuner,
  l2,
  np,
+ os,
  partial,
  pd,
+ rmtree,
  sampling,
  suppress_warnings,
  tf,
@@ -27,12 +27,12 @@ class Env:
 
  Parameters
  ----------
- model : Any
- Model with `.predict()` method (e.g., Keras model).
- maxlen : int
- Maximum length of deque. By default it is set to `100`.
- name : str
- The name of the environment. By default it is set to `likenasium`.
+ model : Any
+ Model with `.predict()` method (e.g., Keras model).
+ maxlen : int
+ Maximum length of deque. By default it is set to `100`.
+ name : str
+ The name of the environment. By default it is set to `likenasium`.
  """
  self.model = model
  self.maxlen = maxlen
@@ -49,14 +49,14 @@ class Env:
 
  Parameters
  ----------
- state : `np.ndarray`
- Current state to process (input to the model).
- action : int
- Expected action to process.
+ state : `np.ndarray`
+ Current state to process (input to the model).
+ action : `int`
+ Expected action to process.
 
  Returns
  -------
- tuple: (current_state, action_pred, reward, next_action, done)
+ tuple : (current_state, action_pred, reward, next_action, done)
  """
  if self.done:
  return None, None, 0, None, True
@@ -120,9 +120,9 @@ class AutoQL:
 
  Parameters
  ----------
- env : Any
+ env : `Any`
  The environment to interact with
- model : tf.keras.Model
+ model : `tf.keras.Model`
  The Q-network model
  """
 
@@ -137,16 +137,16 @@ class AutoQL:
 
  Parameters
  ----------
- state : `np.ndarray`
- Current state.
- action : int
- Expected action to process.
- epsilon : float
- Exploration probability. By default it is set to `0`
+ state : `np.ndarray`
+ Current state.
+ action : `int`
+ Expected action to process.
+ epsilon : `float`
+ Exploration probability. By default it is set to `0`
 
  Returns
  -------
- tuple: (state, action, reward, next_action, done)
+ tuple : (state, action, reward, next_action, done)
  """
  current_state, value, reward, next_action, done = self.env.step(state, action)
 
@@ -164,17 +164,17 @@ class AutoQL:
 
  Parameters
  ----------
- state : `np.ndarray`
- Current state
- action : int
- Expected action to process.
+ state : `np.ndarray`
+ Current state
+ action : `int`
+ Expected action to process.
 
- epsilon : float
- Exploration probability.
+ epsilon : `float`
+ Exploration probability.
 
  Returns
  -------
- tuple: (state, action, reward, next_action, done)
+ tuple : (state, action, reward, next_action, done)
  """
  current_state, greedy_action, reward, next_action, done = self.epsilon_greedy_policy(
  state, action, epsilon
@@ -202,7 +202,7 @@ class AutoQL:
 
  Returns
  -------
- float: Training loss
+ float : Training loss
  """
 
  batch_ = random.sample(self.replay_buffer, self.batch_size)
@@ -250,21 +250,21 @@ class AutoQL:
 
  Parameters
  ----------
- optimizer : str
+ optimizer : `str`
  The optimizer for training (e.g., `sgd`). By default it is set to `adam`.
- loss_fn : str
+ loss_fn : `str`
  The loss function. By default it is set to `mse`.
- num_episodes : int
+ num_episodes : `int`
  Total number of episodes to train. By default it is set to `50`.
- num_steps : int
+ num_steps : `int`
  Steps per episode. By default it is set to `100`. If `num_steps` is less than `self.env.maxlen`, then the second will be chosen.
- gamma : float
+ gamma : `float`
  Discount factor. By default it is set to `0.7`.
- batch_size : int
+ batch_size : `int`
  Size of training batches. By default it is set to `32`.
- patience : int
+ patience : `int`
  How many episodes to wait for improvement.
- alpha : float
+ alpha : `float`
  Trade-off factor between loss and reward.
  """
  rewards = []
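
For orientation, here is a minimal sketch of how the Env and AutoQL pieces documented above fit together. It relies only on the signatures visible in this diff (Env(model, maxlen, name), AutoQL(env, model), and env.step returning the 5-tuple shown); the toy Keras model and state are hypothetical, and this is not a verified end-to-end recipe:

    import numpy as np
    import tensorflow as tf

    from likelihood.models.deep.rl import AutoQL, Env  # module path as shown in this diff

    # Hypothetical Q-network: 4-dimensional state in, 2 action values out.
    model = tf.keras.Sequential(
        [tf.keras.layers.Dense(16, activation="relu", input_shape=(4,)),
         tf.keras.layers.Dense(2)]
    )

    env = Env(model, maxlen=100, name="likenasium")  # parameters per the Env docstring
    agent = AutoQL(env, model)                       # env + Q-network, per the AutoQL docstring

    # One environment step; returns (current_state, action_pred, reward, next_action, done).
    state = np.random.rand(1, 4)
    current_state, action_pred, reward, next_action, done = env.step(state, action=0)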
@@ -0,0 +1,355 @@
+ import json
+ from typing import Dict, List, Optional, Tuple
+
+ import numpy as np
+ import pandas as pd
+
+ from likelihood.tools import generate_html_pipeline
+ from likelihood.tools.impute import SimpleImputer
+ from likelihood.tools.models_tools import TransformRange, remove_collinearity
+ from likelihood.tools.tools import DataFrameEncoder, DataScaler, LinearRegression, OneHotEncoder
+
+
+ class Pipeline:
+     def __init__(self, config_path: str):
+         """
+         Initialize the pipeline with a JSON configuration file.
+
+         Parameters
+         ----------
+         config_path : str
+             Path to the JSON config defining target column and preprocessing steps.
+         """
+         self.config = self._load_config(config_path)
+         self.target_col = self.config["target_column"]
+         self.steps = self.config["preprocessing_steps"]
+         self.compute_importance = self.config.get("compute_feature_importance", False)
+         self.fitted_components: Dict[str, object] = {}
+         self.columns_bin_sizes: Dict[str, int] | None = None
+
+     def _load_config(self, config_path: str) -> Dict:
+         """Load and validate the JSON configuration."""
+         with open(config_path, "r") as f:
+             config = json.load(f)
+
+         assert "target_column" in config, "Config must specify 'target_column'"
+         assert "preprocessing_steps" in config, "Config must specify 'preprocessing_steps'"
+         return config
+
+     def fit(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, np.ndarray, Optional[np.ndarray]]:
+         """
+         Fit preprocessing components on the input DataFrame and return cleaned X/y.
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             Input data with features + target column.
+
+         Returns
+         -------
+         X : pd.DataFrame
+             Cleaned feature matrix.
+         y : np.ndarray
+             Target vector (from self.target_col).
+         importances : Optional[np.ndarray]
+             Feature importance scores (if compute_feature_importance=True).
+         """
+         y = df[self.target_col].values
+         X = df.drop(columns=[self.target_col]).copy()
+
+         initial_info = {
+             "shape": X.shape,
+             "columns": list(X.columns),
+             "dtypes": X.dtypes.apply(lambda x: x.name).to_dict(),
+             "missing_values": X.isnull().sum().to_dict(),
+         }
+
+         steps_info = []
+         for step in self.steps:
+             step_name = step["name"]
+             params = step.get("params", {})
+             step_info = {
+                 "step_name": step_name,
+                 "parameters": params,
+                 "description": self._get_step_description(step_name),
+             }
+             step_info["input_columns"] = list(X.columns)
+
+             X = self._apply_step(step_name, X, fit=True, **params)
+
+             step_info["output_shape"] = X.shape
+             step_info["output_columns"] = list(X.columns)
+             step_info["output_dtypes"] = X.dtypes.apply(lambda x: x.name).to_dict()
+
+             steps_info.append(step_info)
+
+         final_info = {
+             "shape": X.shape,
+             "columns": list(X.columns),
+             "dtypes": X.dtypes.apply(lambda x: x.name).to_dict(),
+             "missing_values": X.isnull().sum().to_dict(),
+         }
+
+         self.documentation = {
+             "initial_dataset": initial_info,
+             "processing_steps": steps_info,
+             "final_dataset": final_info,
+         }
+
+         importances = None
+         if self.compute_importance:
+             numeric_X = X.select_dtypes(include=["float"])
+             numeric_columns = numeric_X.columns.tolist()
+             model = LinearRegression()
+             model.fit(numeric_X.T.values, y)
+             importances = model.get_importances()
+             df_scores = pd.DataFrame([importances], columns=numeric_columns)
+             df_scores_abs = df_scores.abs()
+             df_scores_norm = (
+                 df_scores_abs / df_scores_abs.to_numpy().sum()
+                 if isinstance(importances, np.ndarray)
+                 else pd.DataFrame()
+             )
+         return X, y, df_scores_norm
+
+     def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Apply fitted preprocessing steps to new data (no target column needed).
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             New data to transform.
+
+         Returns
+         -------
+         X_transformed : pd.DataFrame
+             Cleaned feature matrix.
+         """
+         X = df.copy()
+         for step_name, _ in self.fitted_components.items():
+             X = self._apply_step(step_name, X, fit=False)
+
+         return X
+
+     def get_doc(
+         self, save_to_file: bool = True, file_name: str = "data_processing_report.html"
+     ) -> None:
+         """
+         Generate an HTML report from `self.documentation` for pipeline documentation.
+
+         Parameters
+         ----------
+         save_to_file : bool, optional
+             Whether to save generated HTML content to a file. Default is True.
+         file_name : str, optional
+             Filename for output when `save_to_file` is True. Default is "data_processing_report.html".
+         """
+
+         generate_html_pipeline(self.documentation, save_to_file=save_to_file, file_name=file_name)
+
+     def _apply_step(self, step_name: str, X: pd.DataFrame, fit: bool, **params) -> pd.DataFrame:
+         """Dispatch to the correct handler for a preprocessing step."""
+         handlers = {
+             "DataScaler": self._handle_datascaler,
+             "DataFrameEncoder": self._handle_dataframeencoder,
+             "remove_collinearity": self._handle_remove_collinearity,
+             "TransformRange": self._handle_transformrange,
+             "OneHotEncoder": self._handle_onehotencoder,
+             "SimpleImputer": self._handle_simpleimputer,
+         }
+
+         if step_name not in handlers:
+             raise ValueError(
+                 f"Step '{step_name}' not supported. Supported steps: {list(handlers.keys())}"
+             )
+
+         return handlers[step_name](X, fit=fit, **params)
+
+     def _get_step_description(self, step_name: str) -> str:
+         """Return a description of what each preprocessing step does."""
+         descriptions = {
+             "DataScaler": "Scales numerical features using normalization",
+             "DataFrameEncoder": "Encodes categorical variables and normalizes to numerical features",
+             "remove_collinearity": "Removes highly correlated features to reduce multicollinearity",
+             "TransformRange": "Bins continuous features into discrete ranges",
+             "OneHotEncoder": "Converts categorical variables into binary variables",
+             "SimpleImputer": "Handles missing values by imputing with multiple linear regression strategies",
+         }
+
+         return descriptions.get(step_name, f"Unknown preprocessing step: {step_name}")
+
+     # ------------------------------ Step Handlers ------------------------------
+     def _handle_datascaler(self, X: pd.DataFrame, fit: bool, n: int = 1) -> pd.DataFrame:
+         """Handle DataScaler (fits on training data, applies to all)."""
+         numeric_X = X.select_dtypes(include=["float"])
+         numeric_columns = numeric_X.columns.tolist()
+         n = None if n == 0 else n
+         if fit:
+             scaler = DataScaler(numeric_X.values.T, n=n)
+             self.fitted_components["DataScaler"] = scaler
+             numeric_X = pd.DataFrame(scaler.rescale().T, columns=numeric_X.columns)
+         else:
+             scaler = self.fitted_components["DataScaler"]
+             numeric_X = pd.DataFrame(
+                 scaler.rescale(numeric_X.values.T).T, columns=numeric_X.columns
+             )
+         for col in numeric_columns:
+             X[col] = numeric_X[col]
+         return X
+
+     def _handle_dataframeencoder(
+         self, X: pd.DataFrame, fit: bool, norm_method: str = "mean"
+     ) -> pd.DataFrame:
+         """Handle DataFrameEncoder (fits encoders/normalizers)."""
+         if fit:
+             encoder = DataFrameEncoder(X)
+             encoded_X = encoder.encode(norm_method=norm_method)
+             self.fitted_components["DataFrameEncoder"] = encoder
+             return encoded_X
+         else:
+             encoder = self.fitted_components["DataFrameEncoder"]
+             encoder._df = X
+             return encoder.encode()
+
+     def _handle_remove_collinearity(
+         self, X: pd.DataFrame, fit: bool, threshold: float = 0.9
+     ) -> pd.DataFrame:
+         """Handle collinearity removal (fits by selecting columns to drop)."""
+         numeric_X = X.select_dtypes(include=["float"])
+         numeric_columns = numeric_X.columns.tolist()
+         categorical_columns = set(X.columns) - set(numeric_columns)
+         if fit:
+             cleaned_X = remove_collinearity(numeric_X, threshold=threshold)
+             dropped_cols = set(X.columns) - set(cleaned_X.columns) - categorical_columns
+             self.fitted_components["remove_collinearity"] = dropped_cols
+             return X.drop(columns=dropped_cols)
+         else:
+             dropped_cols = self.fitted_components["remove_collinearity"]
+             return X.drop(columns=dropped_cols)
+
+     def _handle_transformrange(
+         self, X: pd.DataFrame, fit: bool, columns_bin_sizes: Dict[str, int] | None = None
+     ) -> pd.DataFrame:
+         """Handle TransformRange (bin numerical features into ranges)."""
+         if fit:
+             transformer = TransformRange(columns_bin_sizes)
+             cleaned_X = transformer.transform(X)
+             self.fitted_components["TransformRange"] = transformer
+             self.columns_bin_sizes = columns_bin_sizes
+             return cleaned_X
+         else:
+             transformer = self.fitted_components["TransformRange"]
+             return transformer.transform(X, fit=False)
+
+     def _handle_onehotencoder(
+         self, X: pd.DataFrame, fit: bool, columns: List[str] | None = None
+     ) -> pd.DataFrame:
+         """Handle OneHotEncoder (fits on categorical columns)."""
+         if fit:
+             tmp_df = X.drop(columns=columns)
+             encoder = OneHotEncoder()
+             category_to_indices = {}
+             for col in columns:
+                 unique_values = X[col].unique()
+                 category_to_indices[col] = {value: i for i, value in enumerate(unique_values)}
+                 encoded_X = encoder.encode(
+                     X[col].values
+                     if isinstance(unique_values[0], int)
+                     else X[col].map(category_to_indices[col])
+                 )
+                 tmp_df = pd.concat([tmp_df, pd.DataFrame(encoded_X, columns=unique_values)], axis=1)
+             self.fitted_components["OneHotEncoder"] = (encoder, columns, category_to_indices)
+         else:
+             encoder, columns, category_to_indices = self.fitted_components["OneHotEncoder"]
+             tmp_df = X.drop(columns=columns)
+             for col in columns:
+                 unique_values = list(category_to_indices[col].keys())
+                 encoded_X = encoder.encode(
+                     (
+                         X[col].values
+                         if isinstance(unique_values[0], int)
+                         else X[col].map(category_to_indices[col])
+                     ),
+                     fit=False,
+                 )
+                 tmp_df = pd.concat([tmp_df, pd.DataFrame(encoded_X, columns=unique_values)], axis=1)
+         return tmp_df
+
+     def _handle_simpleimputer(
+         self,
+         X: pd.DataFrame,
+         fit: bool,
+         use_scaler: bool = False,
+         boundary: bool = True,
+     ) -> pd.DataFrame:
+         "Handle SimpleImputer (fit on numerical and categorical columns)."
+         if fit:
+             use_scaler = True if use_scaler == 1 else False
+             imputer = SimpleImputer(use_scaler=use_scaler)
+             tmp_df = imputer.fit_transform(X, boundary=boundary)
+             self.fitted_components["SimpleImputer"] = imputer
+             return tmp_df
+         else:
+             imputer = self.fitted_components["SimpleImputer"]
+             return imputer.transform(X, boundary=boundary)
+
+     def save(self, filepath: str) -> None:
+         """
+         Save the fitted pipeline state to a file using pickle.
+
+         Parameters
+         ----------
+         filepath : str
+             Path where the serialized pipeline will be saved.
+         """
+         import pickle
+
+         save_dict = {
+             "config": self.config,
+             "fitted_components": self.fitted_components,
+             "target_col": self.target_col,
+             "steps": self.steps,
+             "compute_importance": self.compute_importance,
+             "columns_bin_sizes": self.columns_bin_sizes,
+             "documentation": self.documentation,
+         }
+
+         filepath = filepath + ".pkl" if not filepath.endswith(".pkl") else filepath
+
+         with open(filepath, "wb") as f:
+             pickle.dump(save_dict, f)
+
+     @classmethod
+     def load(cls, filepath: str) -> "Pipeline":
+         """
+         Load a fitted pipeline from a file.
+
+         Parameters
+         ----------
+         filepath : str
+             Path to the serialized pipeline file.
+
+         Returns
+         -------
+         pipeline : Pipeline
+             Reconstructed pipeline instance with fitted components.
+         """
+         import pickle
+
+         filepath = filepath + ".pkl" if not filepath.endswith(".pkl") else filepath
+
+         with open(filepath, "rb") as f:
+             save_dict = pickle.load(f)
+
+         pipeline = cls.__new__(cls)
+
+         pipeline.config = save_dict["config"]
+         pipeline.fitted_components = save_dict["fitted_components"]
+         pipeline.target_col = save_dict["target_col"]
+         pipeline.steps = save_dict["steps"]
+         pipeline.compute_importance = save_dict["compute_importance"]
+         pipeline.columns_bin_sizes = save_dict["columns_bin_sizes"]
+         pipeline.documentation = save_dict["documentation"]
+
+         return pipeline
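
The new likelihood/pipes.py is the centerpiece of this release: a JSON-configured preprocessing pipeline with fit/transform, an HTML documentation report, and pickle round-tripping. A usage sketch, assuming a toy DataFrame and config file (the column names and file paths are hypothetical; the config keys and step names come straight from the code above):

    import json

    import pandas as pd

    from likelihood import Pipeline  # re-exported by likelihood/__init__.py, per the hunk above

    # Hypothetical config; "target_column" and "preprocessing_steps" are required by _load_config.
    config = {
        "target_column": "price",
        "preprocessing_steps": [
            {"name": "remove_collinearity", "params": {"threshold": 0.9}},
            {"name": "DataScaler", "params": {"n": 1}},
        ],
        "compute_feature_importance": True,
    }
    with open("config.json", "w") as f:
        json.dump(config, f)

    df = pd.DataFrame(
        {"area": [50.0, 80.0, 120.0, 60.0],
         "rooms": [2.0, 3.0, 4.0, 2.0],
         "price": [100.0, 160.0, 240.0, 130.0]}
    )

    pipe = Pipeline("config.json")
    X, y, importances = pipe.fit(df)       # fits each step and records self.documentation
    pipe.get_doc(file_name="report.html")  # renders the HTML processing report
    pipe.save("my_pipeline")               # ".pkl" suffix is appended automatically

    restored = Pipeline.load("my_pipeline")
    X_new = restored.transform(df.drop(columns=["price"]))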
@@ -1,3 +1,4 @@
  from .models_tools import *
  from .numeric_tools import *
+ from .reports import generate_html_pipeline
  from .tools import *
@@ -11,7 +11,7 @@ logging.getLogger("tensorflow").setLevel(logging.ERROR)
  import sys
  import warnings
  from functools import wraps
- from typing import Dict
+ from typing import Dict, List, Optional, Tuple, Union
 
  import numpy as np
  import tensorflow as tf
@@ -40,6 +40,214 @@ def suppress_warnings(func):
  return wrapper
 
 
+ class TransformRange:
+     """
+     Generates a new DataFrame with ranges represented as strings.
+
+     Transforms numerical columns into categorical range bins with descriptive labels.
+     """
+
+     def __init__(self, columns_bin_sizes: Dict[str, int]) -> None:
+         """Initializes the class with the original DataFrame.
+
+         Parameters
+         ----------
+         columns_bin_sizes : `dict`
+             A dictionary where the keys are column names and the values are the bin sizes.
+
+         Raises
+         ------
+         TypeError
+             If df is not a pandas DataFrame.
+         """
+         self.info = {}
+         self.columns_bin_sizes = columns_bin_sizes
+
+     def _create_bins_and_labels(
+         self, min_val: Union[int, float], max_val: Union[int, float], bin_size: int
+     ) -> Tuple[np.ndarray, List[str]]:
+         """
+         Creates the bin edges and their labels.
+
+         Parameters
+         ----------
+         min_val : `int` or `float`
+             The minimum value for the range.
+         max_val : `int` or `float`
+             The maximum value for the range.
+         bin_size : `int`
+             The size of each bin.
+
+         Returns
+         -------
+         bins : `np.ndarray`
+             The bin edges.
+         labels : `list`
+             The labels for the bins.
+
+         Raises
+         ------
+         ValueError
+             If bin_size is not positive or if min_val >= max_val.
+         """
+         if bin_size <= 0:
+             raise ValueError("bin_size must be positive")
+         if min_val >= max_val:
+             raise ValueError("min_val must be less than max_val")
+
+         start = int(min_val)
+         end = int(max_val) + bin_size
+
+         bins = np.arange(start, end + 1, bin_size)
+
+         if bins[-1] <= max_val:
+             bins = np.append(bins, max_val + 1)
+
+         lower_bin_edge = -np.inf
+         upper_bin_edge = np.inf
+
+         labels = [f"{int(bins[i])}-{int(bins[i+1] - 1)}" for i in range(len(bins) - 1)]
+         end = int(bins[-1] - 1)
+         bins = bins.tolist()
+         bins.insert(0, lower_bin_edge)
+         bins.append(upper_bin_edge)
+         labels.insert(0, f"< {start}")
+         labels.append(f"> {end}")
+         return bins, labels
+
+     def _transform_column_to_ranges(
+         self, df: pd.DataFrame, column: str, bin_size: int, fit: bool = True
+     ) -> pd.Series:
+         """
+         Transforms a column in the DataFrame into range bins.
+
+         Parameters
+         ----------
+         df : `pd.DataFrame`
+             The original DataFrame to transform.
+         column : `str`
+             The name of the column to transform.
+         bin_size : `int`
+             The size of each bin.
+
+         Returns
+         -------
+         `pd.Series`
+             A Series with the range labels.
+
+         Raises
+         ------
+         KeyError
+             If column is not found in the DataFrame.
+         ValueError
+             If bin_size is not positive or if column contains non-numeric data.
+         """
+         if not isinstance(df, pd.DataFrame):
+             raise TypeError("df must be a pandas DataFrame")
+         df_ = df.copy()  # Create a copy to avoid modifying the original
+         numeric_series = pd.to_numeric(df_[column], errors="coerce")
+         if fit:
+             self.df = df_.copy()
+             if column not in df_.columns:
+                 raise KeyError(f"Column '{column}' not found in DataFrame")
+
+             if bin_size <= 0:
+                 raise ValueError("bin_size must be positive")
+
+             if numeric_series.isna().all():
+                 raise ValueError(f"Column '{column}' contains no valid numeric data")
+
+             min_val = numeric_series.min()
+             max_val = numeric_series.max()
+
+             if min_val == max_val:
+                 return pd.Series(
+                     [f"{int(min_val)}-{int(max_val)}"] * len(df_), name=f"{column}_range"
+                 )
+             self.info[column] = {"min_value": min_val, "max_value": max_val, "range": bin_size}
+         else:
+             min_val = self.info[column]["min_value"]
+             max_val = self.info[column]["max_value"]
+             bin_size = self.info[column]["range"]
+
+         bins, labels = self._create_bins_and_labels(min_val, max_val, bin_size)
+         return pd.cut(numeric_series, bins=bins, labels=labels, right=False, include_lowest=True)
+
+     def transform(
+         self, df: pd.DataFrame, drop_original: bool = False, fit: bool = True
+     ) -> pd.DataFrame:
+         """
+         Creates a new DataFrame with range columns.
+
+         Parameters
+         ----------
+         df : `pd.DataFrame`
+             The original DataFrame to transform.
+         drop_original : `bool`, optional
+             If True, drops original columns from the result, by default False
+         fit : `bool`, default=True
+             Whether to compute bin edges based on the data (True) or use predefined binning (False).
+
+         Returns
+         -------
+         `pd.DataFrame`
+             A DataFrame with the transformed range columns.
+
+         Raises
+         ------
+         TypeError
+             If columns_bin_sizes is not a dictionary.
+         """
+         if not isinstance(self.columns_bin_sizes, dict):
+             raise TypeError("columns_bin_sizes must be a dictionary")
+
+         if not self.columns_bin_sizes:
+             return pd.DataFrame()
+
+         range_columns = {}
+         for column, bin_size in self.columns_bin_sizes.items():
+             range_columns[f"{column}_range"] = self._transform_column_to_ranges(
+                 df, column, bin_size, fit
+             )
+
+         result_df = pd.DataFrame(range_columns)
+
+         if not drop_original:
+             original_cols = [col for col in df.columns if col not in self.columns_bin_sizes]
+             if original_cols:
+                 result_df = pd.concat([df[original_cols], result_df], axis=1)
+
+         return result_df
+
+     def get_range_info(self, column: str) -> Dict[str, Union[int, float, List[str]]]:
+         """
+         Get information about the range transformation for a specific column.
+
+         Parameters
+         ----------
+         column : `str`
+             The name of the column to analyze.
+
+         Returns
+         -------
+         `dict`
+             Dictionary containing min_val, max_val, bin_size, and labels.
+         """
+         if column not in self.df.columns:
+             raise KeyError(f"Column '{column}' not found in DataFrame")
+
+         numeric_series = pd.to_numeric(self.df[column], errors="coerce")
+         min_val = numeric_series.min()
+         max_val = numeric_series.max()
+
+         return {
+             "min_value": min_val,
+             "max_value": max_val,
+             "range": max_val - min_val,
+             "column": column,
+         }
+
+
  def remove_collinearity(df: DataFrame, threshold: float = 0.9):
  """
  Removes highly collinear features from the DataFrame based on a correlation threshold.
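
In isolation, TransformRange can be exercised like this (the sample column and bin size are hypothetical; the `_range` column suffix, the fit flag, and the open-ended "< / >" edge labels come from the code above):

    import pandas as pd

    from likelihood.tools.models_tools import TransformRange

    df = pd.DataFrame({"age": [3, 17, 25, 42, 66]})

    tr = TransformRange({"age": 10})  # bin the "age" column in steps of 10
    binned = tr.transform(df)         # yields an "age_range" column with labels like "3-12"

    # At inference time, reuse the bin edges learned during fit:
    new_df = pd.DataFrame({"age": [8, 51]})
    binned_new = tr.transform(new_df, fit=False)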
@@ -56,8 +264,8 @@ def remove_collinearity(df: DataFrame, threshold: float = 0.9):
  The correlation threshold above which features will be removed. Default is `0.9`.
 
  Returns
- ----------
- DataFrame: A DataFrame with highly collinear features removed.
+ -------
+ DataFrame : A DataFrame with highly collinear features removed.
  """
  corr_matrix = df.corr().abs()
  upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
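
remove_collinearity itself stays a plain function; a quick sketch with hypothetical data (here "b" is perfectly correlated with "a", so it should fall above the 0.9 threshold and be dropped):

    import pandas as pd

    from likelihood.tools.models_tools import remove_collinearity

    df = pd.DataFrame(
        {"a": [1.0, 2.0, 3.0, 4.0], "b": [2.0, 4.0, 6.0, 8.0], "c": [4.0, 1.0, 3.0, 2.0]}
    )
    cleaned = remove_collinearity(df, threshold=0.9)  # expect columns "a" and "c" to survive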
@@ -97,11 +305,11 @@ def train_and_insights(
  Fraction of data to use (default is 1.0).
 
  Keyword Arguments:
- ----------
+ ------------------
  Additional keyword arguments passed to the `model.fit` function, such as validation split and callbacks.
 
  Returns
- ----------
+ -------
  `tf.keras.Model`
  The trained model after fitting.
  """
@@ -207,7 +415,7 @@ def graph_metrics(adj_matrix: np.ndarray, eigenvector_threshold: float = 1e-6) -
  A threshold for the eigenvector centrality calculation, used to determine the cutoff for small eigenvalues. Default is `1e-6`.
 
  Returns
- ----------
+ -------
  DataFrame : A DataFrame containing the following graph metrics as columns.
  - `Degree Centrality`: Degree centrality values for each node, indicating the number of direct connections each node has.
  - `Clustering Coefficient`: Clustering coefficient values for each node, representing the degree to which nodes cluster together.
@@ -218,7 +426,7 @@ def graph_metrics(adj_matrix: np.ndarray, eigenvector_threshold: float = 1e-6) -
  - `Assortativity`: The assortativity coefficient of the graph, measuring the tendency of nodes to connect to similar nodes.
 
  Notes
- ----------
+ -----
  The returned DataFrame will have one row for each node and one column for each of the computed metrics.
  """
  adj_matrix = adj_matrix.astype(int)
@@ -251,3 +459,7 @@ def graph_metrics(adj_matrix: np.ndarray, eigenvector_threshold: float = 1e-6) -
  metrics_df["Assortativity"] = assortativity
 
  return metrics_df
+
+
+ if __name__ == "__main__":
+     pass
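
Closing out models_tools.py: graph_metrics, whose docstring the two hunks above touch, can be smoke-tested as follows (the adjacency matrix is hypothetical; the signature and column names are taken from the docstring):

    import numpy as np

    from likelihood.tools.models_tools import graph_metrics

    adj = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 0]])  # a small undirected star graph
    metrics = graph_metrics(adj, eigenvector_threshold=1e-6)
    print(metrics[["Degree Centrality", "Clustering Coefficient"]])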
@@ -154,7 +154,7 @@ def xicor(X: np.ndarray, Y: np.ndarray, ties: bool = True, random_seed: int = No
  The first variable to be correlated. Must have at least one dimension.
  Y : `np.ndarray`
  The second variable to be correlated. Must have at least one dimension.
- ties : bool
+ ties : `bool`
  Whether to handle ties using randomization.
  random_seed : int, optional
  Seed for the random number generator for reproducibility.
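
xicor appears to implement Chatterjee's rank correlation, where a noiseless functional relationship scores near 1 regardless of monotonicity; a sketch with hypothetical sine data:

    import numpy as np

    from likelihood.tools.numeric_tools import xicor

    x = np.linspace(0, 2 * np.pi, 200)
    y = np.sin(x)
    print(xicor(x, y, random_seed=0))  # expected close to 1; near 0 for independent noise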
@@ -356,9 +356,9 @@ def find_multiples(target: int) -> tuple[int, int] | None:
  Returns
  -------
  tuple[int, int] | None
- If i and i+1 both divide target, returns (i, i+1).
- Otherwise, returns (i, target // i).
- Returns None if no factors are found.
+ If `i` and `i+1` both divide target, returns (i, i+1).
+ Otherwise, returns `(i, target // i)`.
+ Returns `None` if no factors are found.
  """
  for i in range(2, target + 1):
  if target % i == 0:
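
Per the docstring just fixed, the first divisor found drives the result; for example (a sketch):

    from likelihood.tools.numeric_tools import find_multiples

    print(find_multiples(12))  # 2 and 3 both divide 12, so (2, 3) per the docstring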
@@ -0,0 +1,195 @@
+ from html import escape
+ from IPython.display import HTML, display
+
+
+ def generate_html_pipeline(data_dict, save_to_file=False, file_name="data_processing_report.html"):
+     css_js = """
+     <style>
+     :root {
+         --primary: #0d9488;
+         --primary-dark: #0f766e;
+         --success: #10b981;
+         --accent: #3b82f6;
+         --card-bg: #ffffff;
+         --shadow-sm: 0 2px 6px rgba(0, 0, 0, 0.03);
+         --border-radius-md: 6px;
+     }
+
+     * {
+         box-sizing: border-box;
+     }
+
+     body {
+         font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
+         background: #f8fafc;
+         color: #1e293b;
+         margin: 0;
+         padding: 1rem;
+         font-size: 14px;
+     }
+
+     h2 {
+         background: linear-gradient(135deg, var(--primary), var(--primary-dark));
+         color: white;
+         text-align: center;
+         padding: 1rem;
+         border-radius: var(--border-radius-md);
+         font-weight: 600;
+         font-size: 1.5rem;
+         margin-bottom: 1.5rem;
+     }
+
+     section {
+         background: var(--card-bg);
+         border-radius: var(--border-radius-md);
+         padding: 1rem;
+         box-shadow: var(--shadow-sm);
+         margin-bottom: 1.2rem;
+     }
+
+     h3 {
+         color: var(--primary-dark);
+         font-weight: 600;
+         font-size: 1.2rem;
+         border-left: 4px solid var(--success);
+         padding-left: 0.8rem;
+         margin: 1rem 0 0.8rem;
+     }
+
+     table {
+         width: 100%;
+         border-collapse: collapse;
+         font-size: 13px;
+         margin: 0.5rem 0 1rem;
+     }
+
+     th, td {
+         padding: 0.5rem 0.75rem;
+         text-align: left;
+         border-bottom: 1px solid #e2e8f0;
+         vertical-align: top;
+     }
+
+     thead {
+         background-color: #f0fdf4;
+     }
+
+     tbody tr:nth-child(odd) {
+         background-color: #f9fafb;
+     }
+
+     tbody tr:hover {
+         background-color: #e0f2fe;
+     }
+
+     .nested-table {
+         font-size: 12px;
+         margin-top: 0.5rem;
+     }
+
+     details {
+         margin-bottom: 0.8rem;
+         padding: 0.5rem 0.8rem;
+         background: #f9f9f9;
+         border-radius: var(--border-radius-md);
+     }
+
+     summary {
+         font-weight: 600;
+         font-size: 1rem;
+         color: var(--primary-dark);
+         cursor: pointer;
+     }
+
+     summary::before {
+         content: "▶";
+         margin-right: 6px;
+         color: var(--success);
+         font-size: 0.9rem;
+     }
+
+     @media (max-width: 768px) {
+         body {
+             font-size: 13px;
+         }
+
+         h2 {
+             font-size: 1.3rem;
+             padding: 0.8rem;
+         }
+
+         h3 {
+             font-size: 1.1rem;
+         }
+
+         table, .nested-table {
+             font-size: 12px;
+         }
+     }
+     </style>
+     """
+
+     def render_value(val):
+         if isinstance(val, dict):
+             return dict_to_table(val, nested=True)
+         elif isinstance(val, list):
+             if all(isinstance(item, (str, int, float)) for item in val):
+                 return ", ".join(escape(str(x)) for x in val)
+             else:
+                 return "<ul>" + "".join(f"<li>{render_value(v)}</li>" for v in val) + "</ul>"
+         else:
+             return escape(str(val))
+
+     def dict_to_table(d, title=None, nested=False):
+         html = ""
+         if title and not nested:
+             html += f"<h4>{escape(title)}</h4>"
+         table_class = "nested-table" if nested else "table"
+         html += f"<table class='{table_class}'>"
+         html += "<thead><tr><th>Key</th><th>Value</th></tr></thead><tbody>"
+         for key, val in d.items():
+             key_html = escape(str(key))
+             val_html = render_value(val)
+             html += f"<tr><td>{key_html}</td><td>{val_html}</td></tr>"
+         html += "</tbody></table>"
+         return html
+
+     html_content = css_js
+     html_content += "<h2>📈 Data Processing Report</h2>"
+
+     html_content += "<section>"
+     html_content += "<h3>📁 Initial Dataset</h3>"
+     html_content += dict_to_table(data_dict["initial_dataset"])
+     html_content += "</section>"
+
+     html_content += "<section>"
+     html_content += "<h3>🔧 Processing Steps</h3>"
+     for i, step in enumerate(data_dict["processing_steps"]):
+         html_content += "<details open>"
+         html_content += f"<summary>Step {i + 1}: {escape(step['step_name'])}</summary>"
+         html_content += f"<p><strong>Description:</strong> {escape(step['description'])}</p>"
+         html_content += dict_to_table(step["parameters"], title="Parameters", nested=True)
+         html_content += dict_to_table(
+             {
+                 "Output Shape": step["output_shape"],
+                 "Input Columns": step["input_columns"],
+                 "Output Columns": step["output_columns"],
+                 "Output Dtypes": step["output_dtypes"],
+             },
+             title="Output Info",
+             nested=True,
+         )
+         html_content += "</details>"
+     html_content += "</section>"
+
+     html_content += "<section>"
+     html_content += "<h3>✅ Final Dataset</h3>"
+     html_content += dict_to_table(data_dict["final_dataset"])
+     html_content += "</section>"
+
+     if save_to_file:
+         with open(file_name, "w", encoding="utf-8") as f:
+             f.write(html_content)
+         print(f"✅ Report saved to '{file_name}'")
+     else:
+         display(HTML(html_content))
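
generate_html_pipeline consumes a plain dict, so it can also be driven by hand. A minimal sketch with the three top-level keys the function reads (all values hypothetical, but shaped the way Pipeline.fit records them):

    from likelihood.tools.reports import generate_html_pipeline

    doc = {
        "initial_dataset": {
            "shape": (3, 2),
            "columns": ["area", "rooms"],
            "dtypes": {"area": "float64", "rooms": "float64"},
            "missing_values": {"area": 0, "rooms": 1},
        },
        "processing_steps": [
            {
                "step_name": "SimpleImputer",
                "description": "Handles missing values by imputing with multiple linear regression strategies",
                "parameters": {"use_scaler": False},
                "input_columns": ["area", "rooms"],
                "output_shape": (3, 2),
                "output_columns": ["area", "rooms"],
                "output_dtypes": {"area": "float64", "rooms": "float64"},
            }
        ],
        "final_dataset": {
            "shape": (3, 2),
            "columns": ["area", "rooms"],
            "dtypes": {"area": "float64", "rooms": "float64"},
            "missing_values": {"area": 0, "rooms": 0},
        },
    }

    generate_html_pipeline(doc, save_to_file=True, file_name="report.html")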
@@ -2,7 +2,7 @@ import math
  import os
  import pickle
  import warnings
- from typing import Callable, Dict, List, Tuple, Union
+ from typing import Callable, Dict, Generator, List, Tuple, Union
 
  import matplotlib.pyplot as plt
  import numpy as np
@@ -25,7 +25,7 @@ Data Science from Scratch, Second Edition, by Joel Grus (O'Reilly).Copyright 201
  """
 
 
- def minibatches(dataset: List, batch_size: int, shuffle: bool = True) -> List:
+ def minibatches(dataset: List, batch_size: int, shuffle: bool = True) -> Generator:
  """Generates 'batch_size'-sized minibatches from the dataset
 
  Parameters
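
The annotation fix reflects that minibatches yields batches lazily rather than returning a list; a sketch (assuming the Grus-style implementation the file's attribution describes):

    from likelihood.tools.tools import minibatches

    data = list(range(10))
    for batch in minibatches(data, batch_size=4, shuffle=False):
        print(batch)  # consecutive 4-element slices of data (the last one shorter)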
@@ -660,7 +660,7 @@ class DataScaler:
 
  __slots__ = ["dataset_", "_n", "data_scaled", "values", "inv_fitting"]
 
- def __init__(self, dataset: np.ndarray, n: int = 1) -> None:
+ def __init__(self, dataset: np.ndarray, n: int | None = 1) -> None:
  """Initializes the parameters required for scaling the data"""
  self.dataset_ = dataset.copy()
  self._n = n
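
The widened n annotation matches how the new Pipeline drives DataScaler (its handler maps n=0 to None before constructing the scaler). A sketch based solely on that usage, with hypothetical data; what n=None means internally is not visible in this diff:

    import numpy as np

    from likelihood.tools.tools import DataScaler

    data = np.array([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]])  # one row per feature, as pipes.py passes it
    scaler = DataScaler(data.copy(), n=None)
    scaled = scaler.rescale()          # scales the stored dataset
    scaled_new = scaler.rescale(data)  # reuses the fit on new data, per _handle_datascaler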
@@ -861,7 +861,7 @@ class DataFrameEncoder:
  """Encodes the `object` type columns of the dataframe
 
  Keyword Arguments:
- ----------
+ ------------------
  - save_mode (`bool`): An optional integer parameter. By default it is set to `True`
  - dictionary_name (`str`): An optional string parameter. By default it is set to `labelencoder_dictionary`
  - norm_method (`str`): An optional string parameter to perform normalization. By default it is set to `None`
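
For reference, the keyword arguments documented here line up with how pipes.py calls the encoder; a sketch with hypothetical data:

    import pandas as pd

    from likelihood.tools.tools import DataFrameEncoder

    df = pd.DataFrame({"city": ["A", "B", "A"], "size": [1.0, 2.0, 3.0]})
    enc = DataFrameEncoder(df)
    encoded = enc.encode(norm_method="mean", save_mode=False)  # label-encode objects, normalize numerics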
@@ -1024,20 +1024,21 @@ class OneHotEncoder:
  It receives an array of integers and returns a binary array using the one-hot encoding method.
  """
 
- __slots__ = ["x"]
+ __slots__ = ["num_categories"]
 
  def __init__(self) -> None:
  pass
 
- def encode(self, x: np.ndarray | list):
- self.x = x
-
- if not isinstance(self.x, np.ndarray):
- self.x = np.array(self.x)
+ def encode(self, x: np.ndarray | list, fit: bool = True):
+ if not isinstance(x, np.ndarray):
+ x = np.array(x)
+ x = x.astype(int)
+ if fit:
+ self.num_categories = x.max() + 1
 
- y = np.zeros((self.x.size, self.x.max() + 1))
+ y = np.zeros((x.size, self.num_categories))
 
- y[np.arange(self.x.size), self.x] = 1
+ y[np.arange(x.size), x] = 1
 
  return y
 
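The reworked encoder stores the category count at fit time, so transform-time inputs that happen to contain fewer distinct values keep the training width. A sketch:

    from likelihood.tools.tools import OneHotEncoder

    enc = OneHotEncoder()
    train = enc.encode([0, 1, 2])        # fit=True by default; num_categories becomes 3
    print(train.shape)                   # (3, 3)

    test = enc.encode([1, 1], fit=False) # reuses num_categories=3 instead of inferring 2
    print(test.shape)                    # (2, 3)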
@@ -1189,7 +1190,9 @@ def check_nan_inf(df: DataFrame, verbose: bool = False) -> DataFrame:
  if nan_values:
  (
  print(
- "UserWarning: Some rows may have been deleted due to the existence of NaN values."
+ "UserWarning: Some rows may have been deleted due to the existence of NaN values.",
+ f"NaN values removed: ",
+ "{:,}".format(nan_count),
  )
  if verbose
  else None
@@ -1199,7 +1202,9 @@ def check_nan_inf(df: DataFrame, verbose: bool = False) -> DataFrame:
  if inf_values:
  (
  print(
- "UserWarning: Some rows may have been deleted due to the existence of Inf values."
+ "UserWarning: Some rows may have been deleted due to the existence of Inf values.",
+ f"Infinite values removed: ",
+ "{:,}".format(inf_count),
  )
  if verbose
  else None
@@ -1207,9 +1212,6 @@ def check_nan_inf(df: DataFrame, verbose: bool = False) -> DataFrame:
  df.replace([np.inf, -np.inf], np.nan, inplace=True)
  df.dropna(inplace=True)
 
- print(f"NaN values removed: ", "{:,}".format(nan_count))
- print(f"Infinite values removed: ", "{:,}".format(inf_count))
-
  return df
 
 
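With this change the removal counts are only printed when verbose=True, instead of unconditionally; a quick sketch (DataFrame values hypothetical):

    import numpy as np
    import pandas as pd

    from likelihood.tools.tools import check_nan_inf

    df = pd.DataFrame({"a": [1.0, np.nan, np.inf], "b": [4.0, 5.0, 6.0]})
    clean = check_nan_inf(df, verbose=True)  # prints the NaN/Inf warnings with counts
    print(len(clean))                        # only the fully finite row survives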
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: likelihood
- Version: 2.0.0
+ Version: 2.0.2
  Summary: A package that performs the maximum likelihood algorithm.
  Home-page: https://github.com/jzsmoreno/likelihood/
  Author: J. A. Moreno-Guerra
@@ -3,6 +3,7 @@ README.md
  setup.py
  likelihood/__init__.py
  likelihood/main.py
+ likelihood/pipes.py
  likelihood.egg-info/PKG-INFO
  likelihood.egg-info/SOURCES.txt
  likelihood.egg-info/dependency_links.txt
@@ -30,4 +31,5 @@ likelihood/tools/figures.py
  likelihood/tools/impute.py
  likelihood/tools/models_tools.py
  likelihood/tools/numeric_tools.py
+ likelihood/tools/reports.py
  likelihood/tools/tools.py