likelihood 1.2.17.tar.gz → 1.2.19.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {likelihood-1.2.17 → likelihood-1.2.19}/PKG-INFO +2 -2
- {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/models/deep/autoencoders.py +51 -25
- likelihood-1.2.19/likelihood/models/simulation.py +103 -0
- {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/tools/numeric_tools.py +57 -30
- {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/tools/tools.py +28 -10
- {likelihood-1.2.17 → likelihood-1.2.19}/likelihood.egg-info/PKG-INFO +2 -2
- {likelihood-1.2.17 → likelihood-1.2.19}/likelihood.egg-info/requires.txt +1 -1
- {likelihood-1.2.17 → likelihood-1.2.19}/setup.py +1 -1
- likelihood-1.2.17/likelihood/models/simulation.py +0 -91
- {likelihood-1.2.17 → likelihood-1.2.19}/LICENSE +0 -0
- {likelihood-1.2.17 → likelihood-1.2.19}/README.md +0 -0
- {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/__init__.py +0 -0
- {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/graph/__init__.py +0 -0
- {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/graph/graph.py +0 -0
- {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/graph/nn.py +0 -0
- {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/main.py +0 -0
- {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/models/__init__.py +0 -0
- {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/models/deep/__init__.py +0 -0
- {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/models/regression.py +0 -0
- {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/models/utils.py +0 -0
- {likelihood-1.2.17 → likelihood-1.2.19}/likelihood/tools/__init__.py +0 -0
- {likelihood-1.2.17 → likelihood-1.2.19}/likelihood.egg-info/SOURCES.txt +0 -0
- {likelihood-1.2.17 → likelihood-1.2.19}/likelihood.egg-info/dependency_links.txt +0 -0
- {likelihood-1.2.17 → likelihood-1.2.19}/likelihood.egg-info/top_level.txt +0 -0
- {likelihood-1.2.17 → likelihood-1.2.19}/setup.cfg +0 -0
{likelihood-1.2.17 → likelihood-1.2.19}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: likelihood
-Version: 1.2.17
+Version: 1.2.19
 Summary: A package that performs the maximum likelihood algorithm.
 Home-page: https://github.com/jzsmoreno/likelihood/
 Author: J. A. Moreno-Guerra
@@ -28,7 +28,7 @@ Requires-Dist: corner
 Provides-Extra: full
 Requires-Dist: networkx; extra == "full"
 Requires-Dist: pyvis; extra == "full"
-Requires-Dist: tensorflow; extra == "full"
+Requires-Dist: tensorflow==2.15.0; extra == "full"
 Requires-Dist: keras-tuner; extra == "full"
 Requires-Dist: scikit-learn; extra == "full"
{likelihood-1.2.17 → likelihood-1.2.19}/likelihood/models/deep/autoencoders.py

@@ -1,40 +1,43 @@
 import os
 from functools import partial
+from shutil import rmtree

 import keras_tuner
 import numpy as np
 import pandas as pd
 import tensorflow as tf
+from likelihood.tools import OneHotEncoder
 from pandas.core.frame import DataFrame

-
+tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)


+@tf.keras.saving.register_keras_serializable(package="Custom", name="AutoClassifier")
 class AutoClassifier(tf.keras.Model):
     """
     An auto-classifier model that automatically determines the best classification strategy based on the input data.

     Attributes:
-        -
+        - input_shape_parm: The shape of the input data.
         - num_classes: The number of classes in the dataset.
         - units: The number of neurons in each hidden layer.
         - activation: The type of activation function to use for the neural network layers.

     Methods:
-        __init__(self,
-        build(self,
+        __init__(self, input_shape_parm, num_classes, units, activation): Initializes an AutoClassifier instance with the given parameters.
+        build(self, input_shape_parm): Builds the model architecture based on input_shape_parm.
         call(self, x): Defines the forward pass of the model.
         get_config(self): Returns the configuration of the model.
         from_config(cls, config): Recreates an instance of AutoClassifier from its configuration.
     """

-    def __init__(self,
+    def __init__(self, input_shape_parm, num_classes, units, activation):
         """
         Initializes an AutoClassifier instance with the given parameters.

         Parameters
         ----------
-
+        input_shape_parm : `int`
             The shape of the input data.
         num_classes : `int`
             The number of classes in the dataset.
@@ -44,7 +47,7 @@ class AutoClassifier(tf.keras.Model):
             The type of activation function to use for the neural network layers.
         """
         super(AutoClassifier, self).__init__()
-        self.
+        self.input_shape_parm = input_shape_parm
         self.num_classes = num_classes
         self.units = units
         self.activation = activation
@@ -64,7 +67,7 @@ class AutoClassifier(tf.keras.Model):
         self.decoder = tf.keras.Sequential(
             [
                 tf.keras.layers.Dense(units=self.units, activation=self.activation),
-                tf.keras.layers.Dense(units=self.
+                tf.keras.layers.Dense(units=self.input_shape_parm, activation=self.activation),
             ]
         )

@@ -81,7 +84,7 @@ class AutoClassifier(tf.keras.Model):

     def get_config(self):
         config = {
-            "
+            "input_shape_parm": self.input_shape_parm,
             "num_classes": self.num_classes,
             "units": self.units,
             "activation": self.activation,
@@ -92,7 +95,7 @@ class AutoClassifier(tf.keras.Model):
     @classmethod
     def from_config(cls, config):
         return cls(
-
+            input_shape_parm=config["input_shape_parm"],
             num_classes=config["num_classes"],
             units=config["units"],
             activation=config["activation"],
@@ -104,7 +107,7 @@ def call_existing_code(
     activation: str,
     threshold: float,
     optimizer: str,
-
+    input_shape_parm: None | int = None,
     num_classes: None | int = None,
 ) -> AutoClassifier:
     """
@@ -120,7 +123,7 @@ def call_existing_code(
         The threshold for the classifier.
     optimizer : `str`
         The type of optimizer to use for the neural network layers.
-
+    input_shape_parm : `None` | `int`
         The shape of the input data.
     num_classes : `int`
         The number of classes in the dataset.
@@ -131,7 +134,10 @@ def call_existing_code(
         The AutoClassifier instance.
     """
     model = AutoClassifier(
-
+        input_shape_parm=input_shape_parm,
+        num_classes=num_classes,
+        units=units,
+        activation=activation,
     )
     model.compile(
         optimizer=optimizer,
@@ -141,14 +147,14 @@ def call_existing_code(
     return model


-def build_model(hp,
+def build_model(hp, input_shape_parm: None | int, num_classes: None | int) -> AutoClassifier:
     """Builds a neural network model using Keras Tuner's search algorithm.

     Parameters
     ----------
     hp : `keras_tuner.HyperParameters`
         The hyperparameters to tune.
-
+    input_shape_parm : `None` | `int`
         The shape of the input data.
     num_classes : `int`
         The number of classes in the dataset.
@@ -158,7 +164,9 @@ def build_model(hp, input_shape: None | int, num_classes: None | int) -> AutoCla
     `keras.Model`
         The neural network model.
     """
-    units = hp.Int(
+    units = hp.Int(
+        "units", min_value=int(input_shape_parm * 0.2), max_value=input_shape_parm, step=2
+    )
     activation = hp.Choice("activation", ["sigmoid", "relu", "tanh", "selu", "softplus"])
     optimizer = hp.Choice("optimizer", ["sgd", "adam", "adadelta"])
     threshold = hp.Float("threshold", min_value=0.1, max_value=0.9, sampling="log")
@@ -168,7 +176,7 @@ def build_model(hp, input_shape: None | int, num_classes: None | int) -> AutoCla
         activation=activation,
         threshold=threshold,
         optimizer=optimizer,
-
+        input_shape_parm=input_shape_parm,
         num_classes=num_classes,
     )
     return model
@@ -180,8 +188,9 @@ def setup_model(
     epochs: int,
     train_size: float = 0.7,
     seed=None,
-
-
+    train_mode: bool = True,
+    filepath: str = "./my_dir/best_model",
+    **kwargs,
 ) -> AutoClassifier:
     """Setup model for training and tuning.

@@ -197,6 +206,8 @@ def setup_model(
         The proportion of the dataset to use for training.
     seed : `Any` | `int`
         The random seed to use for reproducibility.
+    train_mode : `bool`
+        Whether to train the model or not.
     filepath : `str`
         The path to save the best model to.

@@ -227,6 +238,7 @@ def setup_model(
     verbose = kwargs["verbose"] if "verbose" in kwargs else True

     X = data.drop(columns=target)
+    input_sample = X.sample(1)
     y = data[target]
     # Verify if there are categorical columns in the dataframe
     assert (
@@ -234,21 +246,34 @@ def setup_model(
     ), "Categorical variables within the DataFrame must be encoded, this is done by using the DataFrameEncoder from likelihood."
     validation_split = 1.0 - train_size
     # Create my_dir path if it does not exist
-
-
+
+    if train_mode:
+        # Create a new directory if it does not exist
+        try:
+            if not os.path.exists(directory):
+                os.makedirs(directory)
+            else:
+                print(f"Directory {directory} already exists, it will be deleted.")
+                rmtree(directory)
+                os.makedirs(directory)
+        except:
+            print("Warning: unable to create directory")

     # Create a Classifier instance
     y_encoder = OneHotEncoder()
     y = y_encoder.encode(y.to_list())
     X = X.to_numpy()
+    input_sample.to_numpy()
     X = np.asarray(X).astype(np.float32)
-
+    input_sample = np.asarray(input_sample).astype(np.float32)
     y = np.asarray(y).astype(np.float32)

-
+    input_shape_parm = X.shape[1]
     num_classes = y.shape[1]
     global build_model
-    build_model = partial(
+    build_model = partial(
+        build_model, input_shape_parm=input_shape_parm, num_classes=num_classes
+    )

     # Create the AutoKeras model
     tuner = keras_tuner.RandomSearch(
@@ -263,9 +288,10 @@ def setup_model(
     tuner.search(X, y, epochs=epochs, validation_split=validation_split)
     models = tuner.get_best_models(num_models=2)
     best_model = models[0]
+    best_model(input_sample)

     # save model
-    best_model.save(filepath)
+    best_model.save(filepath, save_format="tf")

     if verbose:
         tuner.results_summary()
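
The hunks above register AutoClassifier as a serializable Keras object and save the tuner's best model in SavedModel format (save_format="tf"). Below is a hedged sketch, not taken from the package, of how such a saved model could be reloaded and queried; it assumes TensorFlow 2.15 (the version pinned in this release), reuses the diff's default filepath, and X_new is a made-up feature matrix.

import numpy as np
import tensorflow as tf

from likelihood.models.deep.autoencoders import AutoClassifier

# Default save location used by setup_model in the diff; adjust to your own run.
filepath = "./my_dir/best_model"

# custom_objects is passed defensively; the @register_keras_serializable decorator
# added in this release should already let Keras resolve the class by name.
model = tf.keras.models.load_model(filepath, custom_objects={"AutoClassifier": AutoClassifier})

# X_new is a placeholder feature matrix; it must have input_shape_parm columns,
# the attribute the model stores in its config (see the get_config hunk above).
X_new = np.random.rand(5, model.input_shape_parm).astype(np.float32)
probabilities = model(X_new)
print(np.argmax(probabilities, axis=1))
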
likelihood-1.2.19/likelihood/models/simulation.py (new file)

@@ -0,0 +1,103 @@
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from numpy import ndarray
+from pandas.core.frame import DataFrame
+
+from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, check_nan_inf
+
+# --------------------------------------------------------------------------------------------------------------------------------------
+
+
+class SimulationEngine(FeatureSelection):
+    """
+    This class implements a predictive model that utilizes multiple linear regression for numerical target variables
+    and multiple logistic regression for categorical target variables.
+
+    The class provides methods for training the model on a given dataset, making predictions,
+    and evaluating the model's performance.
+
+    Key features:
+    - Supports both numerical and categorical target variables, automatically selecting the appropriate regression method.
+    - Includes methods for data preprocessing, model fitting, prediction, and evaluation metrics.
+    - Designed to be flexible and user-friendly, allowing for easy integration with various datasets.
+
+    Usage:
+    - Instantiate the class with the training data and target variable.
+    - Call the fit method to train the model.
+    - Use the predict method to generate predictions on new data.
+    - Evaluate the model using built-in metrics for accuracy and error.
+
+    This class is suitable for applications in data analysis and machine learning, enabling users to leverage regression techniques
+    for both numerical and categorical outcomes efficiently.
+    """
+
+    def __init__(self, df: DataFrame, n_importances: int, use_scaler: bool = False, **kwargs):
+
+        self.df = df
+        self.n_importances = n_importances
+        self.use_scaler = use_scaler
+
+        super().__init__(**kwargs)
+
+    def predict(self, df: DataFrame, column: str) -> ndarray | list:
+        # Let us assign the dictionary entries corresponding to the column
+        w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]
+
+        df = df[names_cols].copy()
+        # Change the scale of the dataframe
+        dataset = self.df.copy()
+        dataset.drop(columns=column, inplace=True)
+        numeric_df = dataset.select_dtypes(include="number")
+        if self.use_scaler:
+            scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
+            _ = scaler.rescale()
+            dataset_ = df.copy()
+            numeric_df = dataset_.select_dtypes(include="number")
+            numeric_scaled = scaler.rescale(dataset_=numeric_df.to_numpy())
+            numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
+            for col in numeric_df.columns:
+                df[col] = numeric_df[col].values
+
+        # Encoding the datadrame
+        for num, colname in enumerate(dfe._encode_columns):
+            if df[colname].dtype == "object":
+                encode_dict = dfe.encoding_list[num]
+                df[colname] = df[colname].apply(
+                    dfe._code_transformation_to, dictionary_list=encode_dict
+                )
+
+        # PREDICTION
+        y = df.to_numpy() @ w
+
+        # Categorical column
+        if quick_encoder != None:
+
+            one_hot = OneHotEncoder()
+            y = one_hot.decode(y)
+            encoding_dic = quick_encoder.decoding_list[0]
+            y = [encoding_dic[item] for item in y]
+        # Numeric column
+        else:
+            if self.use_scaler:
+                # scale output
+                y += 1
+                y /= 2
+                y = y * (self.df[column].max() - self.df[column].min())
+
+        return y[:]
+
+    def fit(self, **kwargs) -> None:
+
+        # We run the feature selection algorithm
+        self.get_digraph(self.df, self.n_importances, self.use_scaler)
+
+    def _clean_data(self, df: DataFrame) -> DataFrame:
+
+        df.replace([np.inf, -np.inf], np.nan, inplace=True)
+        df.replace(" ", np.nan, inplace=True)
+        df = check_nan_inf(df)
+        df = df.reset_index()
+        df = df.drop(columns=["index"])
+
+        return df
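
The new module's docstring describes the intended call pattern (instantiate, fit, predict). The following is a rough, untested sketch of that pattern under stated assumptions: the toy DataFrame, column names, and n_importances value are invented, and it assumes fit() populates w_dict for the requested target column.

import pandas as pd

from likelihood.models.simulation import SimulationEngine

# Invented toy data: two numeric features and one categorical target.
df = pd.DataFrame(
    {
        "feature_a": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
        "feature_b": [10.0, 18.0, 33.0, 41.0, 52.0, 60.0],
        "target": ["low", "low", "high", "high", "high", "high"],
    }
)

engine = SimulationEngine(df, n_importances=2, use_scaler=True)
engine.fit()  # runs the feature-selection step (get_digraph) over df

# Predict the "target" column for new rows carrying the same feature columns.
new_rows = df.drop(columns="target")
predictions = engine.predict(new_rows, column="target")
print(predictions)
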
{likelihood-1.2.17 → likelihood-1.2.19}/likelihood/tools/numeric_tools.py

@@ -1,14 +1,14 @@
 from typing import Dict

 import numpy as np
+import pandas as pd
 from numpy import arange, array, ndarray, random
 from numpy.linalg import solve
 from pandas.core.frame import DataFrame

-# -------------------------------------------------------------------------
-

-def xi_corr(df: DataFrame) -> DataFrame:
+# -------------------------------------------------------------------------
+def xi_corr(df: pd.DataFrame) -> pd.DataFrame:
     """Calculate new coefficient of correlation for all pairs of columns in a `DataFrame`.

     Parameters
@@ -19,11 +19,15 @@ def xi_corr(df: DataFrame) -> DataFrame:
     Returns
     -------
     `DataFrame`
-        A dataframe with variable names as
-        correlation coefficients
+        A square dataframe with variable names as both index and columns,
+        containing their corresponding correlation coefficients.
     """
-
-    columns = df.columns
+
+    columns = df.select_dtypes(include="number").columns
+    n = len(columns)
+
+    # Initialize a square matrix for the correlations
+    correlations = pd.DataFrame(1.0, index=columns, columns=columns)

     for i, col1 in enumerate(columns):
         for j, col2 in enumerate(columns):
@@ -32,9 +36,9 @@ def xi_corr(df: DataFrame) -> DataFrame:
             y = df[col2].values

             correlation = xicor(x, y)
-            correlations[
-
-
+            correlations.loc[col1, col2] = round(correlation, 8)
+            correlations.loc[col2, col1] = round(correlation, 8)  # Mirror the correlation
+
     return correlations


@@ -51,10 +55,11 @@ def xi_corr(df: DataFrame) -> DataFrame:
     """


-def xicor(X: ndarray, Y: ndarray, ties: bool = True) -> float:
-    """
+def xicor(X: np.ndarray, Y: np.ndarray, ties: bool = True, random_seed: int = None) -> float:
+    """
+    Calculate a generalized coefficient of correlation between two variables.

-
+    This coefficient is an extension of Pearson's correlation, accounting for ties with optional randomization.

     Parameters
     ----------
@@ -62,30 +67,52 @@ def xicor(X: ndarray, Y: ndarray, ties: bool = True) -> float:
         The first variable to be correlated. Must have at least one dimension.
     Y : `np.ndarray`
         The second variable to be correlated. Must have at least one dimension.
+    ties : bool
+        Whether to handle ties using randomization.
+    random_seed : int, optional
+        Seed for the random number generator for reproducibility.

     Returns
     -------
     xi : `float`
         The estimated value of the new coefficient of correlation.
     """
-
+
+    # Early return for identical arrays
+    if np.array_equal(X, Y):
+        return 1.0
+
     n = len(X)
-
+
+    # Early return for cases with less than 2 elements
+    if n < 2:
+        return 0.0
+
+    # Flatten the input arrays if they are multidimensional
+    X = X.flatten()
+    Y = Y.flatten()
+
+    # Get the sorted order of X
+    order = np.argsort(X)
+
     if ties:
-
-
-
-
-
-
-
-
-
-
-
+        np.random.seed(random_seed)  # Set seed for reproducibility if needed
+        ranks = np.argsort(np.argsort(Y[order]))  # Get ranks
+        unique_ranks, counts = np.unique(ranks, return_counts=True)
+
+        # Adjust ranks for ties by shuffling
+        for rank, count in zip(unique_ranks, counts):
+            if count > 1:
+                tie_indices = np.where(ranks == rank)[0]
+                np.random.shuffle(ranks[tie_indices])  # Randomize ties
+
+        cumulative_counts = np.array([np.sum(y >= Y[order]) for y in Y[order]])
+        return 1 - n * np.sum(np.abs(ranks[1:] - ranks[: n - 1])) / (
+            2 * np.sum(cumulative_counts * (n - cumulative_counts))
+        )
     else:
-
-        return 1 - 3 * sum(abs(
+        ranks = np.argsort(np.argsort(Y[order]))  # Get ranks without randomization
+        return 1 - 3 * np.sum(np.abs(ranks[1:] - ranks[: n - 1])) / (n**2 - 1)


 # -------------------------------------------------------------------------
@@ -257,8 +284,8 @@ if __name__ == "__main__":
     print("New correlation coefficient test")
     X = np.random.rand(100, 1)
     Y = X * X
-    print("coefficient for Y = X * X : ", xicor(X, Y))
-
+    print("coefficient for Y = X * X : ", xicor(X, Y, False))
+    df["index"] = ["A", "B", "C", "D"]
     print("New correlation coefficient test for pandas DataFrame")
     values_df = xi_corr(df)
     breakpoint()
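
For reference, the ties=False branch above evaluates the rank-based coefficient xi = 1 - 3 * sum(|r[i+1] - r[i]|) / (n**2 - 1), where the r[i] are the ranks of Y taken in the order that sorts X. The snippet below is a self-contained illustration of that formula only; it does not import the package, and it is assumed (not confirmed by the diff) to match the package's intent for tie-free data.

import numpy as np


def xi_no_ties(x: np.ndarray, y: np.ndarray) -> float:
    """Rank-based xi coefficient for tie-free data: 1 - 3 * sum(|diff(ranks)|) / (n**2 - 1)."""
    x = np.asarray(x).flatten()
    y = np.asarray(y).flatten()
    n = len(x)
    order = np.argsort(x)                     # order that sorts x
    ranks = np.argsort(np.argsort(y[order]))  # ranks of y taken in that order
    return 1 - 3 * np.sum(np.abs(np.diff(ranks))) / (n**2 - 1)


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    x = rng.random(1000)
    print(round(xi_no_ties(x, x**2), 3))              # functional dependence -> close to 1
    print(round(xi_no_ties(x, rng.random(1000)), 3))  # independent noise -> close to 0
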
{likelihood-1.2.17 → likelihood-1.2.19}/likelihood/tools/tools.py

@@ -640,14 +640,14 @@ def cal_average(y: ndarray, alpha: float = 1):
 class DataScaler:
     """numpy array `scaler` and `rescaler`"""

-    __slots__ = ["dataset_", "_n", "data_scaled", "values", "transpose"]
+    __slots__ = ["dataset_", "_n", "data_scaled", "values", "transpose", "inv_fitting"]

     def __init__(self, dataset: ndarray, n: int = 1) -> None:
         """Initializes the parameters required for scaling the data"""
         self.dataset_ = dataset.copy()
         self._n = n

-    def rescale(self) -> ndarray:
+    def rescale(self, dataset_: ndarray | None = None) -> ndarray:
         """Perform a standard rescaling of the data

         Returns
@@ -655,11 +655,26 @@ class DataScaler:
         data_scaled : `np.array`
             An array containing the scaled data.
         """
+        if isinstance(dataset_, ndarray):
+            data_scaled = np.copy(dataset_)
+            mu = self.values[0]
+            sigma = self.values[1]
+            f = self.values[2]
+            data_scaled = data_scaled.reshape((self.dataset_.shape[0], -1))
+            for i in range(self.dataset_.shape[0]):
+                if self._n != None:
+                    poly = f[i](self.inv_fitting[i](data_scaled[i]))
+                    data_scaled[i] += -poly
+                data_scaled[i] = 2 * ((data_scaled[i] - mu[i]) / sigma[i]) - 1
+            return data_scaled
+        else:
+            self.data_scaled = np.copy(self.dataset_.copy())

         mu = []
         sigma = []
         fitting = []
-        self.
+        self.inv_fitting = []
+
         try:
             xaxis = range(self.dataset_.shape[1])
         except:
@@ -675,12 +690,15 @@ class DataScaler:
         for i in range(self.dataset_.shape[0]):
             if self._n != None:
                 fit = np.polyfit(xaxis, self.dataset_[i, :], self._n)
+                inv_fit = np.polyfit(self.dataset_[i, :], xaxis, self._n)
                 f = np.poly1d(fit)
                 poly = f(xaxis)
                 fitting.append(f)
+                self.inv_fitting.append(inv_fit)
                 self.data_scaled[i, :] += -poly
             else:
                 fitting.append(0.0)
+                self.inv_fitting.append(0.0)
             mu.append(np.min(self.data_scaled[i, :]))
             if np.max(self.data_scaled[i, :]) != 0:
                 sigma.append(np.max(self.data_scaled[i, :]) - mu[i])
@@ -1064,7 +1082,7 @@ class FeatureSelection:
         self.all_features_imp_graph: List[Tuple] = []
         self.w_dict = dict()

-    def get_digraph(self, dataset: DataFrame, n_importances: int) -> str:
+    def get_digraph(self, dataset: DataFrame, n_importances: int, use_scaler: bool = False) -> str:
         """
         Get directed graph showing importance of features.

@@ -1092,10 +1110,11 @@ class FeatureSelection:
             feature_string += column + "; "

         numeric_df = curr_dataset.select_dtypes(include="number")
-
-
-
-
+        if use_scaler:
+            self.scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
+            numeric_scaled = self.scaler.rescale()
+            numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
+            curr_dataset[numeric_df.columns] = numeric_df

         # We construct dictionary to save index for scaling
         numeric_dict = dict(zip(list(numeric_df.columns), range(len(list(numeric_df.columns)))))
@@ -1119,7 +1138,6 @@ class FeatureSelection:
             dfe = DataFrameEncoder(X_aux)
             encoded_df = dfe.encode(save_mode=False)
             # We train
-
             Model.fit(encoded_df.to_numpy().T, Y.to_numpy().T)
             # We obtain importance
             importance = Model.get_importances()
@@ -1202,7 +1220,7 @@ class FeatureSelection:


 def check_nan_inf(df: DataFrame) -> DataFrame:
-    """
+    """Checks for `NaN` and `Inf` values in the `DataFrame`. If any are found they will be removed."""
     nan_values = df.isnull().values.any()
     count = np.isinf(df.select_dtypes(include="number")).values.sum()
     print("There are null values : ", nan_values)
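
DataScaler.rescale now accepts an optional dataset_ argument so that statistics fitted on one array can be reapplied to another. A hedged sketch of that call pattern follows; the arrays are made up, features are passed as rows (matching the constructor calls in the diff), and n=None (no polynomial detrending) is the only configuration exercised by the new code above.

import numpy as np

from likelihood.tools import DataScaler

train = np.random.rand(100, 3)        # invented training data: 100 rows, 3 numeric features
scaler = DataScaler(train.T, n=None)  # features as rows, matching the constructor call above
_ = scaler.rescale()                  # first call fits the min/range statistics in place

new_rows = np.random.rand(5, 3)               # new data with the same 3 features
scaled = scaler.rescale(dataset_=new_rows.T)  # second call reuses the stored statistics
print(scaled.shape)                           # (3, 5): one row per feature
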
{likelihood-1.2.17 → likelihood-1.2.19}/likelihood.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: likelihood
-Version: 1.2.17
+Version: 1.2.19
 Summary: A package that performs the maximum likelihood algorithm.
 Home-page: https://github.com/jzsmoreno/likelihood/
 Author: J. A. Moreno-Guerra
@@ -28,7 +28,7 @@ Requires-Dist: corner
 Provides-Extra: full
 Requires-Dist: networkx; extra == "full"
 Requires-Dist: pyvis; extra == "full"
-Requires-Dist: tensorflow; extra == "full"
+Requires-Dist: tensorflow==2.15.0; extra == "full"
 Requires-Dist: keras-tuner; extra == "full"
 Requires-Dist: scikit-learn; extra == "full"
{likelihood-1.2.17 → likelihood-1.2.19}/setup.py

@@ -31,7 +31,7 @@ setuptools.setup(
     packages=setuptools.find_packages(),
     install_requires=install_requires,
     extras_require={
-        "full": ["networkx", "pyvis", "tensorflow", "keras-tuner", "scikit-learn"],
+        "full": ["networkx", "pyvis", "tensorflow==2.15.0", "keras-tuner", "scikit-learn"],
     },
     classifiers=[
         "Programming Language :: Python :: 3",
likelihood-1.2.17/likelihood/models/simulation.py (deleted file)

@@ -1,91 +0,0 @@
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-from numpy import ndarray
-from pandas.core.frame import DataFrame
-
-from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, check_nan_inf
-
-# --------------------------------------------------------------------------------------------------------------------------------------
-
-
-class SimulationEngine(FeatureSelection):
-
-    def __init__(self, df: DataFrame, n_importances: int, **kwargs):
-
-        self.df = df
-        self.n_importances = n_importances
-
-        super().__init__(**kwargs)
-
-    def predict(self, df: DataFrame, column: str, n: int = None) -> ndarray | list:
-
-        # We clean the data set
-        df = self._clean_data(df)
-
-        # Let us assign the dictionary entries corresponding to the column
-        w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]
-
-        try:
-            df = df[names_cols].copy()
-            # Change the scale of the dataframe
-            numeric_df = df.select_dtypes(include="number")
-            scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
-            numeric_scaled = scaler.rescale()
-            numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
-            df[numeric_df.columns] = numeric_df
-
-            # Encoding the datadrame
-            for num, colname in enumerate(dfe._encode_columns):
-                if df[colname].dtype == "object":
-                    encode_dict = dfe.encoding_list[num]
-                    df[colname] = df[colname].apply(
-                        dfe._code_transformation_to, dictionary_list=encode_dict
-                    )
-
-        except:
-            print("The dataframe provided does not have the same columns as in the fit method.")
-
-        # Assign value to n if n is None
-        n = n if n != None else len(df)
-
-        # Generation of assertion
-        assert n > 0 and n <= len(df), '"n" must be interger or "<= len(df)".'
-
-        # Sample dataframe
-        df_aux = df.sample(n)
-
-        # PREDICTION
-        y = df_aux.to_numpy() @ w
-
-        # Categorical column
-        if quick_encoder != None:
-
-            one_hot = OneHotEncoder()
-            y = one_hot.decode(y)
-            encoding_dic = quick_encoder.decoding_list[0]
-            y = [encoding_dic[item] for item in y]
-        # Numeric column
-        else:
-            # scale output
-            i = numeric_dict[column]
-            y += 1
-            y /= 2
-            y = y * self.scaler.values[1][i]
-
-        return y
-
-    def fit(self, **kwargs) -> None:
-
-        # We run the feature selection algorithm
-        self.get_digraph(self.df, self.n_importances)
-
-    def _clean_data(self, df: DataFrame) -> DataFrame:
-
-        df.replace([np.inf, -np.inf], np.nan, inplace=True)
-        df.replace(" ", np.nan, inplace=True)
-        df = check_nan_inf(df)
-        df = df.reset_index()
-        df = df.drop(columns=["index"])
-
-        return df