likelihood 1.2.17.tar.gz → 1.2.18.tar.gz
This diff shows the content of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only. Removed lines whose content was truncated in the upstream diff view are kept as fragments marked with `…`.
- {likelihood-1.2.17 → likelihood-1.2.18}/PKG-INFO +2 -2
- {likelihood-1.2.17 → likelihood-1.2.18}/likelihood/models/deep/autoencoders.py +43 -22
- likelihood-1.2.18/likelihood/models/simulation.py +103 -0
- {likelihood-1.2.17 → likelihood-1.2.18}/likelihood/tools/numeric_tools.py +57 -30
- {likelihood-1.2.17 → likelihood-1.2.18}/likelihood/tools/tools.py +28 -10
- {likelihood-1.2.17 → likelihood-1.2.18}/likelihood.egg-info/PKG-INFO +2 -2
- {likelihood-1.2.17 → likelihood-1.2.18}/likelihood.egg-info/requires.txt +1 -1
- {likelihood-1.2.17 → likelihood-1.2.18}/setup.py +1 -1
- likelihood-1.2.17/likelihood/models/simulation.py +0 -91
- {likelihood-1.2.17 → likelihood-1.2.18}/LICENSE +0 -0
- {likelihood-1.2.17 → likelihood-1.2.18}/README.md +0 -0
- {likelihood-1.2.17 → likelihood-1.2.18}/likelihood/__init__.py +0 -0
- {likelihood-1.2.17 → likelihood-1.2.18}/likelihood/graph/__init__.py +0 -0
- {likelihood-1.2.17 → likelihood-1.2.18}/likelihood/graph/graph.py +0 -0
- {likelihood-1.2.17 → likelihood-1.2.18}/likelihood/graph/nn.py +0 -0
- {likelihood-1.2.17 → likelihood-1.2.18}/likelihood/main.py +0 -0
- {likelihood-1.2.17 → likelihood-1.2.18}/likelihood/models/__init__.py +0 -0
- {likelihood-1.2.17 → likelihood-1.2.18}/likelihood/models/deep/__init__.py +0 -0
- {likelihood-1.2.17 → likelihood-1.2.18}/likelihood/models/regression.py +0 -0
- {likelihood-1.2.17 → likelihood-1.2.18}/likelihood/models/utils.py +0 -0
- {likelihood-1.2.17 → likelihood-1.2.18}/likelihood/tools/__init__.py +0 -0
- {likelihood-1.2.17 → likelihood-1.2.18}/likelihood.egg-info/SOURCES.txt +0 -0
- {likelihood-1.2.17 → likelihood-1.2.18}/likelihood.egg-info/dependency_links.txt +0 -0
- {likelihood-1.2.17 → likelihood-1.2.18}/likelihood.egg-info/top_level.txt +0 -0
- {likelihood-1.2.17 → likelihood-1.2.18}/setup.cfg +0 -0
{likelihood-1.2.17 → likelihood-1.2.18}/PKG-INFO:

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: likelihood
-Version: 1.2.17
+Version: 1.2.18
 Summary: A package that performs the maximum likelihood algorithm.
 Home-page: https://github.com/jzsmoreno/likelihood/
 Author: J. A. Moreno-Guerra
@@ -28,7 +28,7 @@ Requires-Dist: corner
 Provides-Extra: full
 Requires-Dist: networkx; extra == "full"
 Requires-Dist: pyvis; extra == "full"
-Requires-Dist: tensorflow; extra == "full"
+Requires-Dist: tensorflow==2.15.0; extra == "full"
 Requires-Dist: keras-tuner; extra == "full"
 Requires-Dist: scikit-learn; extra == "full"
 
```
{likelihood-1.2.17 → likelihood-1.2.18}/likelihood/models/deep/autoencoders.py:

```diff
@@ -1,5 +1,6 @@
 import os
 from functools import partial
+from shutil import rmtree
 
 import keras_tuner
 import numpy as np
@@ -15,26 +16,26 @@ class AutoClassifier(tf.keras.Model):
     An auto-classifier model that automatically determines the best classification strategy based on the input data.
 
     Attributes:
-        - input_shape: The shape of the input data.
+        - input_shape_parm: The shape of the input data.
         - num_classes: The number of classes in the dataset.
         - units: The number of neurons in each hidden layer.
         - activation: The type of activation function to use for the neural network layers.
 
     Methods:
-        __init__(self, input_shape, num_classes, units, activation): Initializes an AutoClassifier instance with the given parameters.
-        build(self, input_shape): Builds the model architecture based on input_shape.
+        __init__(self, input_shape_parm, num_classes, units, activation): Initializes an AutoClassifier instance with the given parameters.
+        build(self, input_shape_parm): Builds the model architecture based on input_shape_parm.
         call(self, x): Defines the forward pass of the model.
         get_config(self): Returns the configuration of the model.
         from_config(cls, config): Recreates an instance of AutoClassifier from its configuration.
     """
 
-    def __init__(self, input_shape, num_classes, units, activation):
+    def __init__(self, input_shape_parm, num_classes, units, activation):
         """
         Initializes an AutoClassifier instance with the given parameters.
 
         Parameters
         ----------
-        input_shape : `int`
+        input_shape_parm : `int`
             The shape of the input data.
         num_classes : `int`
             The number of classes in the dataset.
@@ -44,7 +45,7 @@ class AutoClassifier(tf.keras.Model):
             The type of activation function to use for the neural network layers.
         """
         super(AutoClassifier, self).__init__()
-        self.input_shape = input_shape
+        self.input_shape_parm = input_shape_parm
         self.num_classes = num_classes
         self.units = units
         self.activation = activation
@@ -53,7 +54,7 @@ class AutoClassifier(tf.keras.Model):
         self.decoder = None
         self.classifier = None
 
-    def build(self, input_shape):
+    def build(self, input_shape_parm):
         self.encoder = tf.keras.Sequential(
             [
                 tf.keras.layers.Dense(units=self.units, activation=self.activation),
@@ -64,7 +65,7 @@ class AutoClassifier(tf.keras.Model):
         self.decoder = tf.keras.Sequential(
             [
                 tf.keras.layers.Dense(units=self.units, activation=self.activation),
-                tf.keras.layers.Dense(units=self.input_shape, activation=self.activation),
+                tf.keras.layers.Dense(units=self.input_shape_parm, activation=self.activation),
             ]
         )
 
@@ -81,7 +82,7 @@ class AutoClassifier(tf.keras.Model):
 
     def get_config(self):
         config = {
-            "input_shape": self.input_shape,
+            "input_shape_parm": self.input_shape_parm,
             "num_classes": self.num_classes,
             "units": self.units,
             "activation": self.activation,
@@ -92,7 +93,7 @@ class AutoClassifier(tf.keras.Model):
     @classmethod
     def from_config(cls, config):
         return cls(
-            input_shape=config["input_shape"],
+            input_shape_parm=config["input_shape_parm"],
             num_classes=config["num_classes"],
             units=config["units"],
             activation=config["activation"],
@@ -104,7 +105,7 @@ def call_existing_code(
     activation: str,
     threshold: float,
     optimizer: str,
-    input_shape: None | int = None,
+    input_shape_parm: None | int = None,
     num_classes: None | int = None,
 ) -> AutoClassifier:
     """
@@ -120,7 +121,7 @@ def call_existing_code(
         The threshold for the classifier.
     optimizer : `str`
         The type of optimizer to use for the neural network layers.
-    input_shape : `None` | `int`
+    input_shape_parm : `None` | `int`
         The shape of the input data.
     num_classes : `int`
         The number of classes in the dataset.
@@ -131,7 +132,10 @@ def call_existing_code(
         The AutoClassifier instance.
     """
     model = AutoClassifier(
-        input_shape=input_shape, num_classes=num_classes, units=units, activation=activation
+        input_shape_parm=input_shape_parm,
+        num_classes=num_classes,
+        units=units,
+        activation=activation,
     )
     model.compile(
         optimizer=optimizer,
@@ -141,14 +145,14 @@ def call_existing_code(
     return model
 
 
-def build_model(hp, input_shape: None | int, num_classes: None | int) -> AutoClassifier:
+def build_model(hp, input_shape_parm: None | int, num_classes: None | int) -> AutoClassifier:
     """Builds a neural network model using Keras Tuner's search algorithm.
 
     Parameters
     ----------
     hp : `keras_tuner.HyperParameters`
         The hyperparameters to tune.
-    input_shape : `None` | `int`
+    input_shape_parm : `None` | `int`
         The shape of the input data.
     num_classes : `int`
         The number of classes in the dataset.
@@ -158,7 +162,9 @@ def build_model(hp, input_shape: None | int, num_classes: None | int) -> AutoClassifier:
     `keras.Model`
         The neural network model.
     """
-    units = hp.Int("units", min_value=int(input_shape * 0.2), max_value=input_shape, step=2)
+    units = hp.Int(
+        "units", min_value=int(input_shape_parm * 0.2), max_value=input_shape_parm, step=2
+    )
     activation = hp.Choice("activation", ["sigmoid", "relu", "tanh", "selu", "softplus"])
     optimizer = hp.Choice("optimizer", ["sgd", "adam", "adadelta"])
     threshold = hp.Float("threshold", min_value=0.1, max_value=0.9, sampling="log")
@@ -168,7 +174,7 @@ def build_model(hp, input_shape: None | int, num_classes: None | int) -> AutoClassifier:
         activation=activation,
         threshold=threshold,
         optimizer=optimizer,
-        input_shape=input_shape,
+        input_shape_parm=input_shape_parm,
         num_classes=num_classes,
     )
     return model
@@ -180,8 +186,9 @@ def setup_model(
     epochs: int,
     train_size: float = 0.7,
     seed=None,
+    train_mode: bool = True,
     filepath: str = "./my_dir/best_model.keras",
-    **kwargs
+    **kwargs,
 ) -> AutoClassifier:
     """Setup model for training and tuning.
 
@@ -197,6 +204,8 @@ def setup_model(
         The proportion of the dataset to use for training.
     seed : `Any` | `int`
         The random seed to use for reproducibility.
+    train_mode : `bool`
+        Whether to train the model or not.
     filepath : `str`
         The path to save the best model to.
 
@@ -234,8 +243,18 @@ def setup_model(
     ), "Categorical variables within the DataFrame must be encoded, this is done by using the DataFrameEncoder from likelihood."
     validation_split = 1.0 - train_size
     # Create my_dir path if it does not exist
-    if not os.path.exists(directory):
-        os.makedirs(directory)
+
+    if train_mode:
+        # Create a new directory if it does not exist
+        try:
+            if not os.path.exists(directory):
+                os.makedirs(directory)
+            else:
+                print(f"Directory {directory} already exists, it will be deleted.")
+                rmtree(directory)
+                os.makedirs(directory)
+        except:
+            print("Warning: unable to create directory")
 
     # Create a Classifier instance
     y_encoder = OneHotEncoder()
@@ -245,10 +264,12 @@ def setup_model(
 
     y = np.asarray(y).astype(np.float32)
 
-    input_shape = X.shape[1]
+    input_shape_parm = X.shape[1]
     num_classes = y.shape[1]
     global build_model
-    build_model = partial(build_model, input_shape=input_shape, num_classes=num_classes)
+    build_model = partial(
+        build_model, input_shape_parm=input_shape_parm, num_classes=num_classes
+    )
 
     # Create the AutoKeras model
     tuner = keras_tuner.RandomSearch(
```
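Why the rename matters: `tf.keras.Model` already exposes `input_shape` as a read-only property, so a subclassed model that assigns `self.input_shape` collides with the base class, and `get_config()`/`from_config()` must agree on one key for a saved model to load back. A minimal round-trip sketch; the import path follows the file list above, the data is made up, and the loss choice plus the encoder/decoder/classifier forward pass are assumed from the hunks shown:

```python
import numpy as np

from likelihood.models.deep.autoencoders import AutoClassifier

# Small model; input_shape_parm is the flattened feature count.
model = AutoClassifier(input_shape_parm=10, num_classes=3, units=8, activation="relu")
model.compile(optimizer="adam", loss="categorical_crossentropy")  # illustrative loss

x = np.random.rand(4, 10).astype("float32")
_ = model(x)  # first call triggers build()

# The config round-trip now works because both methods use the same
# "input_shape_parm" key.
clone = AutoClassifier.from_config(model.get_config())
```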
likelihood-1.2.18/likelihood/models/simulation.py (new file):

```diff
@@ -0,0 +1,103 @@
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from numpy import ndarray
+from pandas.core.frame import DataFrame
+
+from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, check_nan_inf
+
+# --------------------------------------------------------------------------------------------------------------------------------------
+
+
+class SimulationEngine(FeatureSelection):
+    """
+    This class implements a predictive model that utilizes multiple linear regression for numerical target variables
+    and multiple logistic regression for categorical target variables.
+
+    The class provides methods for training the model on a given dataset, making predictions,
+    and evaluating the model's performance.
+
+    Key features:
+    - Supports both numerical and categorical target variables, automatically selecting the appropriate regression method.
+    - Includes methods for data preprocessing, model fitting, prediction, and evaluation metrics.
+    - Designed to be flexible and user-friendly, allowing for easy integration with various datasets.
+
+    Usage:
+    - Instantiate the class with the training data and target variable.
+    - Call the fit method to train the model.
+    - Use the predict method to generate predictions on new data.
+    - Evaluate the model using built-in metrics for accuracy and error.
+
+    This class is suitable for applications in data analysis and machine learning, enabling users to leverage regression techniques
+    for both numerical and categorical outcomes efficiently.
+    """
+
+    def __init__(self, df: DataFrame, n_importances: int, use_scaler: bool = False, **kwargs):
+
+        self.df = df
+        self.n_importances = n_importances
+        self.use_scaler = use_scaler
+
+        super().__init__(**kwargs)
+
+    def predict(self, df: DataFrame, column: str) -> ndarray | list:
+        # Let us assign the dictionary entries corresponding to the column
+        w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]
+
+        df = df[names_cols].copy()
+        # Change the scale of the dataframe
+        dataset = self.df.copy()
+        dataset.drop(columns=column, inplace=True)
+        numeric_df = dataset.select_dtypes(include="number")
+        if self.use_scaler:
+            scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
+            _ = scaler.rescale()
+            dataset_ = df.copy()
+            numeric_df = dataset_.select_dtypes(include="number")
+            numeric_scaled = scaler.rescale(dataset_=numeric_df.to_numpy())
+            numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
+            for col in numeric_df.columns:
+                df[col] = numeric_df[col].values
+
+        # Encoding the datadrame
+        for num, colname in enumerate(dfe._encode_columns):
+            if df[colname].dtype == "object":
+                encode_dict = dfe.encoding_list[num]
+                df[colname] = df[colname].apply(
+                    dfe._code_transformation_to, dictionary_list=encode_dict
+                )
+
+        # PREDICTION
+        y = df.to_numpy() @ w
+
+        # Categorical column
+        if quick_encoder != None:
+
+            one_hot = OneHotEncoder()
+            y = one_hot.decode(y)
+            encoding_dic = quick_encoder.decoding_list[0]
+            y = [encoding_dic[item] for item in y]
+        # Numeric column
+        else:
+            if self.use_scaler:
+                # scale output
+                y += 1
+                y /= 2
+                y = y * (self.df[column].max() - self.df[column].min())
+
+        return y[:]
+
+    def fit(self, **kwargs) -> None:
+
+        # We run the feature selection algorithm
+        self.get_digraph(self.df, self.n_importances, self.use_scaler)
+
+    def _clean_data(self, df: DataFrame) -> DataFrame:
+
+        df.replace([np.inf, -np.inf], np.nan, inplace=True)
+        df.replace(" ", np.nan, inplace=True)
+        df = check_nan_inf(df)
+        df = df.reset_index()
+        df = df.drop(columns=["index"])
+
+        return df
```
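Compared with the 1.2.17 version (removed at the end of this diff), the new `SimulationEngine` gains a `use_scaler` flag, drops the sampling parameter `n` from `predict`, and rescales numeric predictions by the target column's range instead of the scaler's internal `sigma`. Note that `predict` no longer calls `_clean_data` or samples the input, so callers pass exactly the frame they want scored. A usage sketch under assumed data; the column names and values are illustrative only, and actual behavior depends on the inherited `FeatureSelection` internals:

```python
import pandas as pd

from likelihood.models.simulation import SimulationEngine

df = pd.DataFrame(
    {
        "age": [25, 32, 47, 51, 38, 29],
        "income": [30_000, 45_000, 80_000, 90_000, 52_000, 41_000],
        "segment": ["A", "B", "B", "A", "B", "A"],
    }
)

engine = SimulationEngine(df, n_importances=2, use_scaler=True)
engine.fit()  # delegates to FeatureSelection.get_digraph(df, n_importances, use_scaler)

# Predict one column from the others; categorical targets come back decoded.
predictions = engine.predict(df, column="segment")
```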
{likelihood-1.2.17 → likelihood-1.2.18}/likelihood/tools/numeric_tools.py:

```diff
@@ -1,14 +1,14 @@
 from typing import Dict
 
 import numpy as np
+import pandas as pd
 from numpy import arange, array, ndarray, random
 from numpy.linalg import solve
 from pandas.core.frame import DataFrame
 
-# -------------------------------------------------------------------------
-
 
-def xi_corr(df: DataFrame) -> DataFrame:
+# -------------------------------------------------------------------------
+def xi_corr(df: pd.DataFrame) -> pd.DataFrame:
     """Calculate new coefficient of correlation for all pairs of columns in a `DataFrame`.
 
     Parameters
@@ -19,11 +19,15 @@ def xi_corr(df: DataFrame) -> DataFrame:
     Returns
     -------
     `DataFrame`
-        A dataframe with variable names as …
-        correlation coefficients …
+        A square dataframe with variable names as both index and columns,
+        containing their corresponding correlation coefficients.
     """
-
-    columns = df.columns
+
+    columns = df.select_dtypes(include="number").columns
+    n = len(columns)
+
+    # Initialize a square matrix for the correlations
+    correlations = pd.DataFrame(1.0, index=columns, columns=columns)
 
     for i, col1 in enumerate(columns):
         for j, col2 in enumerate(columns):
@@ -32,9 +36,9 @@ def xi_corr(df: DataFrame) -> DataFrame:
             y = df[col2].values
 
             correlation = xicor(x, y)
-            correlations[…
-            …
-            …
+            correlations.loc[col1, col2] = round(correlation, 8)
+            correlations.loc[col2, col1] = round(correlation, 8)  # Mirror the correlation
+
     return correlations
 
 
@@ -51,10 +55,11 @@ def xi_corr(df: DataFrame) -> DataFrame:
     """
 
 
-def xicor(X: ndarray, Y: ndarray, ties: bool = True) -> float:
-    """
+def xicor(X: np.ndarray, Y: np.ndarray, ties: bool = True, random_seed: int = None) -> float:
+    """
+    Calculate a generalized coefficient of correlation between two variables.
 
-    …
+    This coefficient is an extension of Pearson's correlation, accounting for ties with optional randomization.
 
     Parameters
     ----------
@@ -62,30 +67,52 @@ def xicor(X: ndarray, Y: ndarray, ties: bool = True) -> float:
         The first variable to be correlated. Must have at least one dimension.
     Y : `np.ndarray`
         The second variable to be correlated. Must have at least one dimension.
+    ties : bool
+        Whether to handle ties using randomization.
+    random_seed : int, optional
+        Seed for the random number generator for reproducibility.
 
     Returns
     -------
     xi : `float`
         The estimated value of the new coefficient of correlation.
     """
-
+
+    # Early return for identical arrays
+    if np.array_equal(X, Y):
+        return 1.0
+
     n = len(X)
-
+
+    # Early return for cases with less than 2 elements
+    if n < 2:
+        return 0.0
+
+    # Flatten the input arrays if they are multidimensional
+    X = X.flatten()
+    Y = Y.flatten()
+
+    # Get the sorted order of X
+    order = np.argsort(X)
+
     if ties:
-        … (11 removed lines of the previous implementation, truncated in the upstream diff view)
+        np.random.seed(random_seed)  # Set seed for reproducibility if needed
+        ranks = np.argsort(np.argsort(Y[order]))  # Get ranks
+        unique_ranks, counts = np.unique(ranks, return_counts=True)
+
+        # Adjust ranks for ties by shuffling
+        for rank, count in zip(unique_ranks, counts):
+            if count > 1:
+                tie_indices = np.where(ranks == rank)[0]
+                np.random.shuffle(ranks[tie_indices])  # Randomize ties
+
+        cumulative_counts = np.array([np.sum(y >= Y[order]) for y in Y[order]])
+        return 1 - n * np.sum(np.abs(ranks[1:] - ranks[: n - 1])) / (
+            2 * np.sum(cumulative_counts * (n - cumulative_counts))
+        )
     else:
-        …
-        return 1 - 3 * sum(abs( …
+        ranks = np.argsort(np.argsort(Y[order]))  # Get ranks without randomization
+        return 1 - 3 * np.sum(np.abs(ranks[1:] - ranks[: n - 1])) / (n**2 - 1)
 
 
 # -------------------------------------------------------------------------
@@ -257,8 +284,8 @@ if __name__ == "__main__":
     print("New correlation coefficient test")
     X = np.random.rand(100, 1)
     Y = X * X
-    print("coefficient for Y = X * X : ", xicor(X, Y))
-
+    print("coefficient for Y = X * X : ", xicor(X, Y, False))
+    df["index"] = ["A", "B", "C", "D"]
     print("New correlation coefficient test for pandas DataFrame")
     values_df = xi_corr(df)
     breakpoint()
```
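For reference, the `ties=False` branch computes the rank statistic `xi = 1 - 3 * sum(|r(i+1) - r(i)|) / (n**2 - 1)`, where `r(i)` is the rank of Y among the sample ordered by X (this matches Chatterjee's "new coefficient of correlation", which the docstring echoes). It approaches 1 when Y is a possibly nonlinear function of X, approaches 0 under independence, and is deliberately asymmetric in its arguments. A small sketch, assuming the functions are importable from `likelihood.tools.numeric_tools` per the file path above:

```python
import numpy as np

from likelihood.tools.numeric_tools import xicor

rng = np.random.default_rng(0)
x = rng.random(1000)

# y is a deterministic function of x: xi should be close to 1
# (about 1 - 3/(n + 1) for a monotone relationship).
print(xicor(x, x**2, ties=False))

# Independent noise: xi should be close to 0.
print(xicor(x, rng.random(1000), ties=False))

# Asymmetry: xicor(y, x) generally differs from xicor(x, y),
# and identical arrays short-circuit to 1.0.
```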
{likelihood-1.2.17 → likelihood-1.2.18}/likelihood/tools/tools.py:

```diff
@@ -640,14 +640,14 @@ def cal_average(y: ndarray, alpha: float = 1):
 class DataScaler:
     """numpy array `scaler` and `rescaler`"""
 
-    __slots__ = ["dataset_", "_n", "data_scaled", "values", "transpose"]
+    __slots__ = ["dataset_", "_n", "data_scaled", "values", "transpose", "inv_fitting"]
 
     def __init__(self, dataset: ndarray, n: int = 1) -> None:
         """Initializes the parameters required for scaling the data"""
         self.dataset_ = dataset.copy()
         self._n = n
 
-    def rescale(self) -> ndarray:
+    def rescale(self, dataset_: ndarray | None = None) -> ndarray:
         """Perform a standard rescaling of the data
 
         Returns
@@ -655,11 +655,26 @@ class DataScaler:
         data_scaled : `np.array`
             An array containing the scaled data.
         """
+        if isinstance(dataset_, ndarray):
+            data_scaled = np.copy(dataset_)
+            mu = self.values[0]
+            sigma = self.values[1]
+            f = self.values[2]
+            data_scaled = data_scaled.reshape((self.dataset_.shape[0], -1))
+            for i in range(self.dataset_.shape[0]):
+                if self._n != None:
+                    poly = f[i](self.inv_fitting[i](data_scaled[i]))
+                    data_scaled[i] += -poly
+                data_scaled[i] = 2 * ((data_scaled[i] - mu[i]) / sigma[i]) - 1
+            return data_scaled
+        else:
+            self.data_scaled = np.copy(self.dataset_.copy())
 
         mu = []
         sigma = []
         fitting = []
-        self.data_scaled = np.copy(self.dataset_.copy())
+        self.inv_fitting = []
+
         try:
             xaxis = range(self.dataset_.shape[1])
         except:
@@ -675,12 +690,15 @@ class DataScaler:
         for i in range(self.dataset_.shape[0]):
             if self._n != None:
                 fit = np.polyfit(xaxis, self.dataset_[i, :], self._n)
+                inv_fit = np.polyfit(self.dataset_[i, :], xaxis, self._n)
                 f = np.poly1d(fit)
                 poly = f(xaxis)
                 fitting.append(f)
+                self.inv_fitting.append(inv_fit)
                 self.data_scaled[i, :] += -poly
             else:
                 fitting.append(0.0)
+                self.inv_fitting.append(0.0)
             mu.append(np.min(self.data_scaled[i, :]))
             if np.max(self.data_scaled[i, :]) != 0:
                 sigma.append(np.max(self.data_scaled[i, :]) - mu[i])
@@ -1064,7 +1082,7 @@ class FeatureSelection:
         self.all_features_imp_graph: List[Tuple] = []
         self.w_dict = dict()
 
-    def get_digraph(self, dataset: DataFrame, n_importances: int) -> str:
+    def get_digraph(self, dataset: DataFrame, n_importances: int, use_scaler: bool = False) -> str:
         """
         Get directed graph showing importance of features.
 
@@ -1092,10 +1110,11 @@ class FeatureSelection:
             feature_string += column + "; "
 
             numeric_df = curr_dataset.select_dtypes(include="number")
-            self.scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
-            numeric_scaled = self.scaler.rescale()
-            numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
-            curr_dataset[numeric_df.columns] = numeric_df
+            if use_scaler:
+                self.scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
+                numeric_scaled = self.scaler.rescale()
+                numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
+                curr_dataset[numeric_df.columns] = numeric_df
 
             # We construct dictionary to save index for scaling
             numeric_dict = dict(zip(list(numeric_df.columns), range(len(list(numeric_df.columns)))))
@@ -1119,7 +1138,6 @@ class FeatureSelection:
             dfe = DataFrameEncoder(X_aux)
             encoded_df = dfe.encode(save_mode=False)
             # We train
-
             Model.fit(encoded_df.to_numpy().T, Y.to_numpy().T)
             # We obtain importance
             importance = Model.get_importances()
@@ -1202,7 +1220,7 @@
 
 
 def check_nan_inf(df: DataFrame) -> DataFrame:
-    """…
+    """Checks for `NaN` and `Inf` values in the `DataFrame`. If any are found they will be removed."""
     nan_values = df.isnull().values.any()
     count = np.isinf(df.select_dtypes(include="number")).values.sum()
     print("There are null values : ", nan_values)
```
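The `DataScaler` change is what enables `SimulationEngine.predict` above: calling `rescale()` with no argument fits and scales the stored dataset, while `rescale(dataset_=...)` reapplies the stored `mu`/`sigma` (and, when a polynomial degree was set, the fitted detrending) to new data. A sketch, assuming `DataScaler` is importable from `likelihood.tools` as in the new simulation.py's imports, that `self.values` holds `(mu, sigma, fitting)` as the new branch implies, and following the row-per-variable convention used throughout the diff:

```python
import numpy as np

from likelihood.tools import DataScaler

rng = np.random.default_rng(1)
train = rng.random((3, 50))  # 3 variables, 50 observations each

scaler = DataScaler(train, n=None)  # n=None skips the polynomial detrending branch
train_scaled = scaler.rescale()     # fits per-row mu/sigma, scales toward [-1, 1]

new = rng.random((3, 5))
new_scaled = scaler.rescale(dataset_=new)  # reuses the stored mu/sigma on new data
```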
{likelihood-1.2.17 → likelihood-1.2.18}/likelihood.egg-info/PKG-INFO (same change as the top-level PKG-INFO):

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: likelihood
-Version: 1.2.17
+Version: 1.2.18
 Summary: A package that performs the maximum likelihood algorithm.
 Home-page: https://github.com/jzsmoreno/likelihood/
 Author: J. A. Moreno-Guerra
@@ -28,7 +28,7 @@ Requires-Dist: corner
 Provides-Extra: full
 Requires-Dist: networkx; extra == "full"
 Requires-Dist: pyvis; extra == "full"
-Requires-Dist: tensorflow; extra == "full"
+Requires-Dist: tensorflow==2.15.0; extra == "full"
 Requires-Dist: keras-tuner; extra == "full"
 Requires-Dist: scikit-learn; extra == "full"
 
```
{likelihood-1.2.17 → likelihood-1.2.18}/setup.py:

```diff
@@ -31,7 +31,7 @@ setuptools.setup(
     packages=setuptools.find_packages(),
     install_requires=install_requires,
     extras_require={
-        "full": ["networkx", "pyvis", "tensorflow", "keras-tuner", "scikit-learn"],
+        "full": ["networkx", "pyvis", "tensorflow==2.15.0", "keras-tuner", "scikit-learn"],
     },
     classifiers=[
         "Programming Language :: Python :: 3",
```
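Both copies of the package metadata and `setup.py` now pin the `full` extra to `tensorflow==2.15.0`; the `likelihood.egg-info/requires.txt` change listed above (+1 −1) is presumably the same pin, though its hunk is not shown in this view. The pin is likely motivated by API stability: TensorFlow 2.16+ ships Keras 3 by default, which changes how subclassed models such as `AutoClassifier` are saved and loaded. An illustrative check:

```python
# Illustrative sanity check: after `pip install "likelihood[full]"`,
# the pinned resolution should be TensorFlow 2.15.0 exactly.
import tensorflow as tf

assert tf.__version__ == "2.15.0", f"unexpected TensorFlow version: {tf.__version__}"
```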
likelihood-1.2.17/likelihood/models/simulation.py (removed):

```diff
@@ -1,91 +0,0 @@
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-from numpy import ndarray
-from pandas.core.frame import DataFrame
-
-from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, check_nan_inf
-
-# --------------------------------------------------------------------------------------------------------------------------------------
-
-
-class SimulationEngine(FeatureSelection):
-
-    def __init__(self, df: DataFrame, n_importances: int, **kwargs):
-
-        self.df = df
-        self.n_importances = n_importances
-
-        super().__init__(**kwargs)
-
-    def predict(self, df: DataFrame, column: str, n: int = None) -> ndarray | list:
-
-        # We clean the data set
-        df = self._clean_data(df)
-
-        # Let us assign the dictionary entries corresponding to the column
-        w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]
-
-        try:
-            df = df[names_cols].copy()
-            # Change the scale of the dataframe
-            numeric_df = df.select_dtypes(include="number")
-            scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
-            numeric_scaled = scaler.rescale()
-            numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
-            df[numeric_df.columns] = numeric_df
-
-            # Encoding the datadrame
-            for num, colname in enumerate(dfe._encode_columns):
-                if df[colname].dtype == "object":
-                    encode_dict = dfe.encoding_list[num]
-                    df[colname] = df[colname].apply(
-                        dfe._code_transformation_to, dictionary_list=encode_dict
-                    )
-
-        except:
-            print("The dataframe provided does not have the same columns as in the fit method.")
-
-        # Assign value to n if n is None
-        n = n if n != None else len(df)
-
-        # Generation of assertion
-        assert n > 0 and n <= len(df), '"n" must be interger or "<= len(df)".'
-
-        # Sample dataframe
-        df_aux = df.sample(n)
-
-        # PREDICTION
-        y = df_aux.to_numpy() @ w
-
-        # Categorical column
-        if quick_encoder != None:
-
-            one_hot = OneHotEncoder()
-            y = one_hot.decode(y)
-            encoding_dic = quick_encoder.decoding_list[0]
-            y = [encoding_dic[item] for item in y]
-        # Numeric column
-        else:
-            # scale output
-            i = numeric_dict[column]
-            y += 1
-            y /= 2
-            y = y * self.scaler.values[1][i]
-
-        return y
-
-    def fit(self, **kwargs) -> None:
-
-        # We run the feature selection algorithm
-        self.get_digraph(self.df, self.n_importances)
-
-    def _clean_data(self, df: DataFrame) -> DataFrame:
-
-        df.replace([np.inf, -np.inf], np.nan, inplace=True)
-        df.replace(" ", np.nan, inplace=True)
-        df = check_nan_inf(df)
-        df = df.reset_index()
-        df = df.drop(columns=["index"])
-
-        return df
```
All remaining files listed above (+0 -0) are unchanged between 1.2.17 and 1.2.18.