likelihood 1.2.17__tar.gz → 1.2.19__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {likelihood-1.2.17 → likelihood-1.2.19}/PKG-INFO +2 -2
  2. {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/models/deep/autoencoders.py +51 -25
  3. likelihood-1.2.19/likelihood/models/simulation.py +103 -0
  4. {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/tools/numeric_tools.py +57 -30
  5. {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/tools/tools.py +28 -10
  6. {likelihood-1.2.17 → likelihood-1.2.19}/likelihood.egg-info/PKG-INFO +2 -2
  7. {likelihood-1.2.17 → likelihood-1.2.19}/likelihood.egg-info/requires.txt +1 -1
  8. {likelihood-1.2.17 → likelihood-1.2.19}/setup.py +1 -1
  9. likelihood-1.2.17/likelihood/models/simulation.py +0 -91
  10. {likelihood-1.2.17 → likelihood-1.2.19}/LICENSE +0 -0
  11. {likelihood-1.2.17 → likelihood-1.2.19}/README.md +0 -0
  12. {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/__init__.py +0 -0
  13. {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/graph/__init__.py +0 -0
  14. {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/graph/graph.py +0 -0
  15. {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/graph/nn.py +0 -0
  16. {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/main.py +0 -0
  17. {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/models/__init__.py +0 -0
  18. {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/models/deep/__init__.py +0 -0
  19. {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/models/regression.py +0 -0
  20. {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/models/utils.py +0 -0
  21. {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/tools/__init__.py +0 -0
  22. {likelihood-1.2.17 → likelihood-1.2.19}/likelihood.egg-info/SOURCES.txt +0 -0
  23. {likelihood-1.2.17 → likelihood-1.2.19}/likelihood.egg-info/dependency_links.txt +0 -0
  24. {likelihood-1.2.17 → likelihood-1.2.19}/likelihood.egg-info/top_level.txt +0 -0
  25. {likelihood-1.2.17 → likelihood-1.2.19}/setup.cfg +0 -0
{likelihood-1.2.17 → likelihood-1.2.19}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: likelihood
- Version: 1.2.17
+ Version: 1.2.19
  Summary: A package that performs the maximum likelihood algorithm.
  Home-page: https://github.com/jzsmoreno/likelihood/
  Author: J. A. Moreno-Guerra
@@ -28,7 +28,7 @@ Requires-Dist: corner
  Provides-Extra: full
  Requires-Dist: networkx; extra == "full"
  Requires-Dist: pyvis; extra == "full"
- Requires-Dist: tensorflow; extra == "full"
+ Requires-Dist: tensorflow==2.15.0; extra == "full"
  Requires-Dist: keras-tuner; extra == "full"
  Requires-Dist: scikit-learn; extra == "full"
{likelihood-1.2.17 → likelihood-1.2.19}/likelihood/models/deep/autoencoders.py
@@ -1,40 +1,43 @@
  import os
  from functools import partial
+ from shutil import rmtree

  import keras_tuner
  import numpy as np
  import pandas as pd
  import tensorflow as tf
+ from likelihood.tools import OneHotEncoder
  from pandas.core.frame import DataFrame

- from likelihood.tools import OneHotEncoder
+ tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)


+ @tf.keras.saving.register_keras_serializable(package="Custom", name="AutoClassifier")
  class AutoClassifier(tf.keras.Model):
      """
      An auto-classifier model that automatically determines the best classification strategy based on the input data.

      Attributes:
-     - input_shape: The shape of the input data.
+     - input_shape_parm: The shape of the input data.
      - num_classes: The number of classes in the dataset.
      - units: The number of neurons in each hidden layer.
      - activation: The type of activation function to use for the neural network layers.

      Methods:
-     __init__(self, input_shape, num_classes, units, activation): Initializes an AutoClassifier instance with the given parameters.
-     build(self, input_shape): Builds the model architecture based on input_shape.
+     __init__(self, input_shape_parm, num_classes, units, activation): Initializes an AutoClassifier instance with the given parameters.
+     build(self, input_shape_parm): Builds the model architecture based on input_shape_parm.
      call(self, x): Defines the forward pass of the model.
      get_config(self): Returns the configuration of the model.
      from_config(cls, config): Recreates an instance of AutoClassifier from its configuration.
      """

-     def __init__(self, input_shape, num_classes, units, activation):
+     def __init__(self, input_shape_parm, num_classes, units, activation):
          """
          Initializes an AutoClassifier instance with the given parameters.

          Parameters
          ----------
-         input_shape : `int`
+         input_shape_parm : `int`
              The shape of the input data.
          num_classes : `int`
              The number of classes in the dataset.
@@ -44,7 +47,7 @@ class AutoClassifier(tf.keras.Model):
              The type of activation function to use for the neural network layers.
          """
          super(AutoClassifier, self).__init__()
-         self.input_shape = input_shape
+         self.input_shape_parm = input_shape_parm
          self.num_classes = num_classes
          self.units = units
          self.activation = activation
@@ -64,7 +67,7 @@ class AutoClassifier(tf.keras.Model):
          self.decoder = tf.keras.Sequential(
              [
                  tf.keras.layers.Dense(units=self.units, activation=self.activation),
-                 tf.keras.layers.Dense(units=self.input_shape, activation=self.activation),
+                 tf.keras.layers.Dense(units=self.input_shape_parm, activation=self.activation),
              ]
          )

@@ -81,7 +84,7 @@ class AutoClassifier(tf.keras.Model):

      def get_config(self):
          config = {
-             "input_shape": self.input_shape,
+             "input_shape_parm": self.input_shape_parm,
              "num_classes": self.num_classes,
              "units": self.units,
              "activation": self.activation,
@@ -92,7 +95,7 @@ class AutoClassifier(tf.keras.Model):
      @classmethod
      def from_config(cls, config):
          return cls(
-             input_shape=config["input_shape"],
+             input_shape_parm=config["input_shape_parm"],
              num_classes=config["num_classes"],
              units=config["units"],
              activation=config["activation"],
@@ -104,7 +107,7 @@ def call_existing_code(
      activation: str,
      threshold: float,
      optimizer: str,
-     input_shape: None | int = None,
+     input_shape_parm: None | int = None,
      num_classes: None | int = None,
  ) -> AutoClassifier:
      """
@@ -120,7 +123,7 @@ def call_existing_code(
          The threshold for the classifier.
      optimizer : `str`
          The type of optimizer to use for the neural network layers.
-     input_shape : `None` | `int`
+     input_shape_parm : `None` | `int`
          The shape of the input data.
      num_classes : `int`
          The number of classes in the dataset.
@@ -131,7 +134,10 @@ def call_existing_code(
          The AutoClassifier instance.
      """
      model = AutoClassifier(
-         input_shape=input_shape, num_classes=num_classes, units=units, activation=activation
+         input_shape_parm=input_shape_parm,
+         num_classes=num_classes,
+         units=units,
+         activation=activation,
      )
      model.compile(
          optimizer=optimizer,
@@ -141,14 +147,14 @@
      return model


- def build_model(hp, input_shape: None | int, num_classes: None | int) -> AutoClassifier:
+ def build_model(hp, input_shape_parm: None | int, num_classes: None | int) -> AutoClassifier:
      """Builds a neural network model using Keras Tuner's search algorithm.

      Parameters
      ----------
      hp : `keras_tuner.HyperParameters`
          The hyperparameters to tune.
-     input_shape : `None` | `int`
+     input_shape_parm : `None` | `int`
          The shape of the input data.
      num_classes : `int`
          The number of classes in the dataset.
@@ -158,7 +164,9 @@ def build_model(hp, input_shape: None | int, num_classes: None | int) -> AutoClassifier:
      `keras.Model`
          The neural network model.
      """
-     units = hp.Int("units", min_value=int(input_shape * 0.2), max_value=input_shape, step=2)
+     units = hp.Int(
+         "units", min_value=int(input_shape_parm * 0.2), max_value=input_shape_parm, step=2
+     )
      activation = hp.Choice("activation", ["sigmoid", "relu", "tanh", "selu", "softplus"])
      optimizer = hp.Choice("optimizer", ["sgd", "adam", "adadelta"])
      threshold = hp.Float("threshold", min_value=0.1, max_value=0.9, sampling="log")
@@ -168,7 +176,7 @@ def build_model(hp, input_shape: None | int, num_classes: None | int) -> AutoClassifier:
          activation=activation,
          threshold=threshold,
          optimizer=optimizer,
-         input_shape=input_shape,
+         input_shape_parm=input_shape_parm,
          num_classes=num_classes,
      )
      return model
@@ -180,8 +188,9 @@ def setup_model(
      epochs: int,
      train_size: float = 0.7,
      seed=None,
-     filepath: str = "./my_dir/best_model.keras",
-     **kwargs
+     train_mode: bool = True,
+     filepath: str = "./my_dir/best_model",
+     **kwargs,
  ) -> AutoClassifier:
      """Setup model for training and tuning.
@@ -197,6 +206,8 @@ def setup_model(
          The proportion of the dataset to use for training.
      seed : `Any` | `int`
          The random seed to use for reproducibility.
+     train_mode : `bool`
+         Whether to train the model or not.
      filepath : `str`
          The path to save the best model to.

@@ -227,6 +238,7 @@ def setup_model(
      verbose = kwargs["verbose"] if "verbose" in kwargs else True

      X = data.drop(columns=target)
+     input_sample = X.sample(1)
      y = data[target]
      # Verify if there are categorical columns in the dataframe
      assert (
@@ -234,21 +246,34 @@ def setup_model(
      ), "Categorical variables within the DataFrame must be encoded, this is done by using the DataFrameEncoder from likelihood."
      validation_split = 1.0 - train_size
      # Create my_dir path if it does not exist
-     if not os.path.exists(directory):
-         os.makedirs(directory)
+
+     if train_mode:
+         # Create a new directory if it does not exist
+         try:
+             if not os.path.exists(directory):
+                 os.makedirs(directory)
+             else:
+                 print(f"Directory {directory} already exists, it will be deleted.")
+                 rmtree(directory)
+                 os.makedirs(directory)
+         except:
+             print("Warning: unable to create directory")

      # Create a Classifier instance
      y_encoder = OneHotEncoder()
      y = y_encoder.encode(y.to_list())
      X = X.to_numpy()
+     input_sample.to_numpy()
      X = np.asarray(X).astype(np.float32)
-
+     input_sample = np.asarray(input_sample).astype(np.float32)
      y = np.asarray(y).astype(np.float32)

-     input_shape = X.shape[1]
+     input_shape_parm = X.shape[1]
      num_classes = y.shape[1]
      global build_model
-     build_model = partial(build_model, input_shape=input_shape, num_classes=num_classes)
+     build_model = partial(
+         build_model, input_shape_parm=input_shape_parm, num_classes=num_classes
+     )

      # Create the AutoKeras model
      tuner = keras_tuner.RandomSearch(
@@ -263,9 +288,10 @@ def setup_model(
      tuner.search(X, y, epochs=epochs, validation_split=validation_split)
      models = tuner.get_best_models(num_models=2)
      best_model = models[0]
+     best_model(input_sample)

      # save model
-     best_model.save(filepath)
+     best_model.save(filepath, save_format="tf")

      if verbose:
          tuner.results_summary()
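Taken together, the autoencoders.py changes rename input_shape to input_shape_parm throughout (the old name collided with tf.keras.Model's built-in input_shape property), register AutoClassifier as a serializable Keras object, and call the best tuned model once on a one-row sample so its weights are built before it is saved in TensorFlow SavedModel format. A minimal usage sketch of the new setup_model entry point follows; the DataFrame and the first two arguments (training data and target column) are assumptions, since only the tail of the signature appears in this diff.

    import numpy as np
    import pandas as pd

    from likelihood.models.deep.autoencoders import setup_model

    # Illustrative, fully numeric training data with an already-encoded target column.
    rng = np.random.default_rng(0)
    df = pd.DataFrame(rng.random((200, 10)), columns=[f"x{i}" for i in range(10)])
    df["label"] = (df["x0"] + df["x1"] > 1.0).astype(int)

    best_model = setup_model(
        df,                               # assumed: the training DataFrame
        "label",                          # assumed: the target column name
        epochs=5,
        train_size=0.8,
        train_mode=True,                  # new in 1.2.19: rebuilds the tuner directory
        filepath="./my_dir/best_model",   # now saved with save_format="tf"
    )

Because AutoClassifier is now registered with tf.keras.saving, the saved directory should be reloadable with tf.keras.models.load_model("./my_dir/best_model").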
likelihood-1.2.19/likelihood/models/simulation.py (new file)
@@ -0,0 +1,103 @@
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ from numpy import ndarray
+ from pandas.core.frame import DataFrame
+
+ from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, check_nan_inf
+
+ # --------------------------------------------------------------------------------------------------------------------------------------
+
+
+ class SimulationEngine(FeatureSelection):
+     """
+     This class implements a predictive model that utilizes multiple linear regression for numerical target variables
+     and multiple logistic regression for categorical target variables.
+
+     The class provides methods for training the model on a given dataset, making predictions,
+     and evaluating the model's performance.
+
+     Key features:
+     - Supports both numerical and categorical target variables, automatically selecting the appropriate regression method.
+     - Includes methods for data preprocessing, model fitting, prediction, and evaluation metrics.
+     - Designed to be flexible and user-friendly, allowing for easy integration with various datasets.
+
+     Usage:
+     - Instantiate the class with the training data and target variable.
+     - Call the fit method to train the model.
+     - Use the predict method to generate predictions on new data.
+     - Evaluate the model using built-in metrics for accuracy and error.
+
+     This class is suitable for applications in data analysis and machine learning, enabling users to leverage regression techniques
+     for both numerical and categorical outcomes efficiently.
+     """
+
+     def __init__(self, df: DataFrame, n_importances: int, use_scaler: bool = False, **kwargs):
+
+         self.df = df
+         self.n_importances = n_importances
+         self.use_scaler = use_scaler
+
+         super().__init__(**kwargs)
+
+     def predict(self, df: DataFrame, column: str) -> ndarray | list:
+         # Let us assign the dictionary entries corresponding to the column
+         w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]
+
+         df = df[names_cols].copy()
+         # Change the scale of the dataframe
+         dataset = self.df.copy()
+         dataset.drop(columns=column, inplace=True)
+         numeric_df = dataset.select_dtypes(include="number")
+         if self.use_scaler:
+             scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
+             _ = scaler.rescale()
+             dataset_ = df.copy()
+             numeric_df = dataset_.select_dtypes(include="number")
+             numeric_scaled = scaler.rescale(dataset_=numeric_df.to_numpy())
+             numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
+             for col in numeric_df.columns:
+                 df[col] = numeric_df[col].values
+
+         # Encoding the dataframe
+         for num, colname in enumerate(dfe._encode_columns):
+             if df[colname].dtype == "object":
+                 encode_dict = dfe.encoding_list[num]
+                 df[colname] = df[colname].apply(
+                     dfe._code_transformation_to, dictionary_list=encode_dict
+                 )
+
+         # PREDICTION
+         y = df.to_numpy() @ w
+
+         # Categorical column
+         if quick_encoder != None:
+
+             one_hot = OneHotEncoder()
+             y = one_hot.decode(y)
+             encoding_dic = quick_encoder.decoding_list[0]
+             y = [encoding_dic[item] for item in y]
+         # Numeric column
+         else:
+             if self.use_scaler:
+                 # scale output
+                 y += 1
+                 y /= 2
+                 y = y * (self.df[column].max() - self.df[column].min())
+
+         return y[:]
+
+     def fit(self, **kwargs) -> None:
+
+         # We run the feature selection algorithm
+         self.get_digraph(self.df, self.n_importances, self.use_scaler)
+
+     def _clean_data(self, df: DataFrame) -> DataFrame:
+
+         df.replace([np.inf, -np.inf], np.nan, inplace=True)
+         df.replace(" ", np.nan, inplace=True)
+         df = check_nan_inf(df)
+         df = df.reset_index()
+         df = df.drop(columns=["index"])
+
+         return df
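Compared with the deleted 1.2.17 version (shown at the end of this diff), the rewritten SimulationEngine makes scaling opt-in via use_scaler, drops the n subsampling argument from predict, and rescales numeric outputs with the target column's min/max range instead of the stored scaler values. A rough fit/predict sketch, assuming FeatureSelection's constructor needs no additional arguments here; the column names and data are illustrative:

    import numpy as np
    import pandas as pd

    from likelihood.models.simulation import SimulationEngine

    # Illustrative mixed-type data; "segment" exercises the categorical encoding path.
    rng = np.random.default_rng(1)
    df = pd.DataFrame(
        {
            "age": rng.integers(18, 65, size=200).astype(float),
            "income": rng.normal(50_000.0, 10_000.0, size=200),
            "segment": rng.choice(["A", "B"], size=200),
        }
    )

    engine = SimulationEngine(df, n_importances=2, use_scaler=True)
    engine.fit()  # delegates to FeatureSelection.get_digraph(df, n_importances, use_scaler)

    # Predict a target column for some rows; predict() selects the feature columns it needs.
    predicted_income = engine.predict(df.head(5), column="income")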
{likelihood-1.2.17 → likelihood-1.2.19}/likelihood/tools/numeric_tools.py
@@ -1,14 +1,14 @@
  from typing import Dict

  import numpy as np
+ import pandas as pd
  from numpy import arange, array, ndarray, random
  from numpy.linalg import solve
  from pandas.core.frame import DataFrame

- # -------------------------------------------------------------------------
-

- def xi_corr(df: DataFrame) -> DataFrame:
+ # -------------------------------------------------------------------------
+ def xi_corr(df: pd.DataFrame) -> pd.DataFrame:
      """Calculate new coefficient of correlation for all pairs of columns in a `DataFrame`.

      Parameters
@@ -19,11 +19,15 @@ def xi_corr(df: DataFrame) -> DataFrame:
      Returns
      -------
      `DataFrame`
-         A dataframe with variable names as keys and their corresponding
-         correlation coefficients as values.
+         A square dataframe with variable names as both index and columns,
+         containing their corresponding correlation coefficients.
      """
-     correlations = {}
-     columns = df.columns
+
+     columns = df.select_dtypes(include="number").columns
+     n = len(columns)
+
+     # Initialize a square matrix for the correlations
+     correlations = pd.DataFrame(1.0, index=columns, columns=columns)

      for i, col1 in enumerate(columns):
          for j, col2 in enumerate(columns):
@@ -32,9 +36,9 @@ def xi_corr(df: DataFrame) -> DataFrame:
              y = df[col2].values

              correlation = xicor(x, y)
-             correlations[(col1, col2)] = round(correlation, 8)
-     # dictionary to dataframe
-     correlations = DataFrame(list(correlations.items()), columns=["Variables", "Xi Correlation"])
+             correlations.loc[col1, col2] = round(correlation, 8)
+             correlations.loc[col2, col1] = round(correlation, 8)  # Mirror the correlation
+
      return correlations

@@ -51,10 +55,11 @@ def xi_corr(df: DataFrame) -> DataFrame:
      """


- def xicor(X: ndarray, Y: ndarray, ties: bool = True) -> float:
-     """Calculate a new coefficient of correlation between two variables.
+ def xicor(X: np.ndarray, Y: np.ndarray, ties: bool = True, random_seed: int = None) -> float:
+     """
+     Calculate a generalized coefficient of correlation between two variables.

-     The new coefficient of correlation is a generalization of Pearson's correlation.
+     This coefficient is an extension of Pearson's correlation, accounting for ties with optional randomization.

      Parameters
      ----------
@@ -62,30 +67,52 @@ def xicor(X: ndarray, Y: ndarray, ties: bool = True) -> float:
          The first variable to be correlated. Must have at least one dimension.
      Y : `np.ndarray`
          The second variable to be correlated. Must have at least one dimension.
+     ties : bool
+         Whether to handle ties using randomization.
+     random_seed : int, optional
+         Seed for the random number generator for reproducibility.

      Returns
      -------
      xi : `float`
          The estimated value of the new coefficient of correlation.
      """
-     random.seed(42)
+
+     # Early return for identical arrays
+     if np.array_equal(X, Y):
+         return 1.0
+
      n = len(X)
-     order = array([i[0] for i in sorted(enumerate(X), key=lambda x: x[1])])
+
+     # Early return for cases with less than 2 elements
+     if n < 2:
+         return 0.0
+
+     # Flatten the input arrays if they are multidimensional
+     X = X.flatten()
+     Y = Y.flatten()
+
+     # Get the sorted order of X
+     order = np.argsort(X)
+
      if ties:
-         l = array([sum(y >= Y[order]) for y in Y[order]])
-         r = l.copy()
-         for j in range(n):
-             if sum([r[j] == r[i] for i in range(n)]) > 1:
-                 tie_index = array([r[j] == r[i] for i in range(n)])
-                 r[tie_index] = random.choice(
-                     r[tie_index] - arange(0, sum([r[j] == r[i] for i in range(n)])),
-                     sum(tie_index),
-                     replace=False,
-                 )
-         return 1 - n * sum(abs(r[1:] - r[: n - 1])) / (2 * sum(l * (n - l)))
+         np.random.seed(random_seed)  # Set seed for reproducibility if needed
+         ranks = np.argsort(np.argsort(Y[order]))  # Get ranks
+         unique_ranks, counts = np.unique(ranks, return_counts=True)
+
+         # Adjust ranks for ties by shuffling
+         for rank, count in zip(unique_ranks, counts):
+             if count > 1:
+                 tie_indices = np.where(ranks == rank)[0]
+                 np.random.shuffle(ranks[tie_indices])  # Randomize ties
+
+         cumulative_counts = np.array([np.sum(y >= Y[order]) for y in Y[order]])
+         return 1 - n * np.sum(np.abs(ranks[1:] - ranks[: n - 1])) / (
+             2 * np.sum(cumulative_counts * (n - cumulative_counts))
+         )
      else:
-         r = array([sum(y >= Y[order]) for y in Y[order]])
-         return 1 - 3 * sum(abs(r[1:] - r[: n - 1])) / (n**2 - 1)
+         ranks = np.argsort(np.argsort(Y[order]))  # Get ranks without randomization
+         return 1 - 3 * np.sum(np.abs(ranks[1:] - ranks[: n - 1])) / (n**2 - 1)


  # -------------------------------------------------------------------------
@@ -257,8 +284,8 @@ if __name__ == "__main__":
      print("New correlation coefficient test")
      X = np.random.rand(100, 1)
      Y = X * X
-     print("coefficient for Y = X * X : ", xicor(X, Y))
-
+     print("coefficient for Y = X * X : ", xicor(X, Y, False))
+     df["index"] = ["A", "B", "C", "D"]
      print("New correlation coefficient test for pandas DataFrame")
      values_df = xi_corr(df)
      breakpoint()
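The rewritten xicor computes the rank-based coefficient with NumPy (argsort-derived ranks, optional tie shuffling seeded by random_seed, early exits for identical or length-below-2 inputs), and xi_corr now returns a square, numeric-columns-only correlation matrix instead of a two-column pair/value table. A short sketch of the new interface, assuming both functions are imported directly from their module path:

    import numpy as np
    import pandas as pd

    # assumed import path: the functions live in likelihood/tools/numeric_tools.py
    from likelihood.tools.numeric_tools import xi_corr, xicor

    x = np.random.rand(100)
    y = x**2

    # Pairwise coefficient; the third argument disables tie randomization, as in the __main__ test above.
    print(xicor(x, y, False))

    df = pd.DataFrame({"a": x, "b": y, "c": np.random.rand(100)})
    matrix = xi_corr(df)         # square DataFrame indexed and labelled by the numeric columns
    print(matrix.loc["a", "b"])  # value of xicor(df["a"], df["b"]), mirrored to ["b", "a"]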
{likelihood-1.2.17 → likelihood-1.2.19}/likelihood/tools/tools.py
@@ -640,14 +640,14 @@ def cal_average(y: ndarray, alpha: float = 1):
  class DataScaler:
      """numpy array `scaler` and `rescaler`"""

-     __slots__ = ["dataset_", "_n", "data_scaled", "values", "transpose"]
+     __slots__ = ["dataset_", "_n", "data_scaled", "values", "transpose", "inv_fitting"]

      def __init__(self, dataset: ndarray, n: int = 1) -> None:
          """Initializes the parameters required for scaling the data"""
          self.dataset_ = dataset.copy()
          self._n = n

-     def rescale(self) -> ndarray:
+     def rescale(self, dataset_: ndarray | None = None) -> ndarray:
          """Perform a standard rescaling of the data

          Returns
@@ -655,11 +655,26 @@
          data_scaled : `np.array`
              An array containing the scaled data.
          """
+         if isinstance(dataset_, ndarray):
+             data_scaled = np.copy(dataset_)
+             mu = self.values[0]
+             sigma = self.values[1]
+             f = self.values[2]
+             data_scaled = data_scaled.reshape((self.dataset_.shape[0], -1))
+             for i in range(self.dataset_.shape[0]):
+                 if self._n != None:
+                     poly = f[i](self.inv_fitting[i](data_scaled[i]))
+                     data_scaled[i] += -poly
+                 data_scaled[i] = 2 * ((data_scaled[i] - mu[i]) / sigma[i]) - 1
+             return data_scaled
+         else:
+             self.data_scaled = np.copy(self.dataset_.copy())

          mu = []
          sigma = []
          fitting = []
-         self.data_scaled = np.copy(self.dataset_)
+         self.inv_fitting = []
+
          try:
              xaxis = range(self.dataset_.shape[1])
          except:
@@ -675,12 +690,15 @@
          for i in range(self.dataset_.shape[0]):
              if self._n != None:
                  fit = np.polyfit(xaxis, self.dataset_[i, :], self._n)
+                 inv_fit = np.polyfit(self.dataset_[i, :], xaxis, self._n)
                  f = np.poly1d(fit)
                  poly = f(xaxis)
                  fitting.append(f)
+                 self.inv_fitting.append(inv_fit)
                  self.data_scaled[i, :] += -poly
              else:
                  fitting.append(0.0)
+                 self.inv_fitting.append(0.0)
              mu.append(np.min(self.data_scaled[i, :]))
              if np.max(self.data_scaled[i, :]) != 0:
                  sigma.append(np.max(self.data_scaled[i, :]) - mu[i])
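DataScaler.rescale now has two modes: called with no argument it fits and scales the stored dataset as before (additionally recording inverse polynomial fits in the new inv_fitting slot), and called with dataset_= it applies the already-fitted parameters to new data, which is how the new SimulationEngine.predict reuses the training-time scaling. A rough sketch of the round trip, assuming row-wise series as elsewhere in the package, n=None (the mode the package itself uses), and that the no-argument call populates the fitted parameters that the second call reads:

    import numpy as np

    from likelihood.tools import DataScaler

    rng = np.random.default_rng(2)
    train = rng.random((3, 50))  # 3 series (rows) x 50 observations
    new = rng.random((3, 10))    # new observations for the same 3 series

    scaler = DataScaler(train, n=None)         # n=None skips the polynomial detrending step
    train_scaled = scaler.rescale()            # fits min/range per row and scales the stored data
    new_scaled = scaler.rescale(dataset_=new)  # reuses the fitted parameters on the new data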
@@ -1064,7 +1082,7 @@ class FeatureSelection:
          self.all_features_imp_graph: List[Tuple] = []
          self.w_dict = dict()

-     def get_digraph(self, dataset: DataFrame, n_importances: int) -> str:
+     def get_digraph(self, dataset: DataFrame, n_importances: int, use_scaler: bool = False) -> str:
          """
          Get directed graph showing importance of features.

@@ -1092,10 +1110,11 @@
              feature_string += column + "; "

          numeric_df = curr_dataset.select_dtypes(include="number")
-         self.scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
-         numeric_scaled = self.scaler.rescale()
-         numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
-         curr_dataset[numeric_df.columns] = numeric_df
+         if use_scaler:
+             self.scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
+             numeric_scaled = self.scaler.rescale()
+             numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
+             curr_dataset[numeric_df.columns] = numeric_df

          # We construct dictionary to save index for scaling
          numeric_dict = dict(zip(list(numeric_df.columns), range(len(list(numeric_df.columns)))))
@@ -1119,7 +1138,6 @@
          dfe = DataFrameEncoder(X_aux)
          encoded_df = dfe.encode(save_mode=False)
          # We train
-
          Model.fit(encoded_df.to_numpy().T, Y.to_numpy().T)
          # We obtain importance
          importance = Model.get_importances()
@@ -1202,7 +1220,7 @@


  def check_nan_inf(df: DataFrame) -> DataFrame:
-     """Check for `NaN` and `Inf` values in the `DataFrame`. If any are found removes them."""
+     """Checks for `NaN` and `Inf` values in the `DataFrame`. If any are found they will be removed."""
      nan_values = df.isnull().values.any()
      count = np.isinf(df.select_dtypes(include="number")).values.sum()
      print("There are null values : ", nan_values)
{likelihood-1.2.17 → likelihood-1.2.19}/likelihood.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: likelihood
- Version: 1.2.17
+ Version: 1.2.19
  Summary: A package that performs the maximum likelihood algorithm.
  Home-page: https://github.com/jzsmoreno/likelihood/
  Author: J. A. Moreno-Guerra
@@ -28,7 +28,7 @@ Requires-Dist: corner
  Provides-Extra: full
  Requires-Dist: networkx; extra == "full"
  Requires-Dist: pyvis; extra == "full"
- Requires-Dist: tensorflow; extra == "full"
+ Requires-Dist: tensorflow==2.15.0; extra == "full"
  Requires-Dist: keras-tuner; extra == "full"
  Requires-Dist: scikit-learn; extra == "full"
{likelihood-1.2.17 → likelihood-1.2.19}/likelihood.egg-info/requires.txt
@@ -14,6 +14,6 @@ corner
  [full]
  networkx
  pyvis
- tensorflow
+ tensorflow==2.15.0
  keras-tuner
  scikit-learn
{likelihood-1.2.17 → likelihood-1.2.19}/setup.py
@@ -31,7 +31,7 @@ setuptools.setup(
      packages=setuptools.find_packages(),
      install_requires=install_requires,
      extras_require={
-         "full": ["networkx", "pyvis", "tensorflow", "keras-tuner", "scikit-learn"],
+         "full": ["networkx", "pyvis", "tensorflow==2.15.0", "keras-tuner", "scikit-learn"],
      },
      classifiers=[
          "Programming Language :: Python :: 3",
likelihood-1.2.17/likelihood/models/simulation.py (removed)
@@ -1,91 +0,0 @@
- import matplotlib.pyplot as plt
- import numpy as np
- import pandas as pd
- from numpy import ndarray
- from pandas.core.frame import DataFrame
-
- from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, check_nan_inf
-
- # --------------------------------------------------------------------------------------------------------------------------------------
-
-
- class SimulationEngine(FeatureSelection):
-
-     def __init__(self, df: DataFrame, n_importances: int, **kwargs):
-
-         self.df = df
-         self.n_importances = n_importances
-
-         super().__init__(**kwargs)
-
-     def predict(self, df: DataFrame, column: str, n: int = None) -> ndarray | list:
-
-         # We clean the data set
-         df = self._clean_data(df)
-
-         # Let us assign the dictionary entries corresponding to the column
-         w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]
-
-         try:
-             df = df[names_cols].copy()
-             # Change the scale of the dataframe
-             numeric_df = df.select_dtypes(include="number")
-             scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
-             numeric_scaled = scaler.rescale()
-             numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
-             df[numeric_df.columns] = numeric_df
-
-             # Encoding the datadrame
-             for num, colname in enumerate(dfe._encode_columns):
-                 if df[colname].dtype == "object":
-                     encode_dict = dfe.encoding_list[num]
-                     df[colname] = df[colname].apply(
-                         dfe._code_transformation_to, dictionary_list=encode_dict
-                     )
-
-         except:
-             print("The dataframe provided does not have the same columns as in the fit method.")
-
-         # Assign value to n if n is None
-         n = n if n != None else len(df)
-
-         # Generation of assertion
-         assert n > 0 and n <= len(df), '"n" must be interger or "<= len(df)".'
-
-         # Sample dataframe
-         df_aux = df.sample(n)
-
-         # PREDICTION
-         y = df_aux.to_numpy() @ w
-
-         # Categorical column
-         if quick_encoder != None:
-
-             one_hot = OneHotEncoder()
-             y = one_hot.decode(y)
-             encoding_dic = quick_encoder.decoding_list[0]
-             y = [encoding_dic[item] for item in y]
-         # Numeric column
-         else:
-             # scale output
-             i = numeric_dict[column]
-             y += 1
-             y /= 2
-             y = y * self.scaler.values[1][i]
-
-         return y
-
-     def fit(self, **kwargs) -> None:
-
-         # We run the feature selection algorithm
-         self.get_digraph(self.df, self.n_importances)
-
-     def _clean_data(self, df: DataFrame) -> DataFrame:
-
-         df.replace([np.inf, -np.inf], np.nan, inplace=True)
-         df.replace(" ", np.nan, inplace=True)
-         df = check_nan_inf(df)
-         df = df.reset_index()
-         df = df.drop(columns=["index"])
-
-         return df