likelihood 1.2.22__tar.gz → 1.2.23__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {likelihood-1.2.22 → likelihood-1.2.23}/PKG-INFO +1 -1
- {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/graph/graph.py +17 -0
- likelihood-1.2.23/likelihood/models/simulation.py +223 -0
- {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/tools/tools.py +307 -261
- {likelihood-1.2.22 → likelihood-1.2.23}/likelihood.egg-info/PKG-INFO +1 -1
- likelihood-1.2.22/likelihood/models/simulation.py +0 -103
- {likelihood-1.2.22 → likelihood-1.2.23}/LICENSE +0 -0
- {likelihood-1.2.22 → likelihood-1.2.23}/README.md +0 -0
- {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/__init__.py +0 -0
- {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/graph/__init__.py +0 -0
- {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/graph/nn.py +0 -0
- {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/main.py +0 -0
- {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/models/__init__.py +0 -0
- {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/models/deep/__init__.py +0 -0
- {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/models/deep/autoencoders.py +0 -0
- {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/models/regression.py +0 -0
- {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/models/utils.py +0 -0
- {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/tools/__init__.py +0 -0
- {likelihood-1.2.22 → likelihood-1.2.23}/likelihood/tools/numeric_tools.py +0 -0
- {likelihood-1.2.22 → likelihood-1.2.23}/likelihood.egg-info/SOURCES.txt +0 -0
- {likelihood-1.2.22 → likelihood-1.2.23}/likelihood.egg-info/dependency_links.txt +0 -0
- {likelihood-1.2.22 → likelihood-1.2.23}/likelihood.egg-info/requires.txt +0 -0
- {likelihood-1.2.22 → likelihood-1.2.23}/likelihood.egg-info/top_level.txt +0 -0
- {likelihood-1.2.22 → likelihood-1.2.23}/setup.cfg +0 -0
- {likelihood-1.2.22 → likelihood-1.2.23}/setup.py +0 -0
likelihood/graph/graph.py

@@ -74,3 +74,20 @@ class DynamicGraph(FeatureSelection):
         nx_graph.add_edges_from([(source, target, edge)])

         return nx_graph
+
+
+# -------------------------------------------------------------------------
+if __name__ == "__main__":
+    import numpy as np
+    import pandas as pd
+
+    # Generate data
+    x = np.random.rand(3, 100)
+    y = 0.1 * x[0, :] + 0.4 * x[1, :] + 0.5 * x[2, :] + 0.1
+    # Create a DataFrame
+    df = pd.DataFrame(x.T, columns=["x1", "x2", "x3"])
+    df["y"] = y
+    # Instantiate DynamicGraph
+    fs = DynamicGraph(df, n_importances=2)
+    print(fs.fit())
+    fs.draw()
likelihood-1.2.23/likelihood/models/simulation.py (new file)

@@ -0,0 +1,223 @@
+import pickle
+import warnings
+from typing import List, Tuple, Union
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from numpy import ndarray
+from pandas.core.frame import DataFrame
+
+from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, cdf, check_nan_inf
+
+# Suppress RankWarning
+warnings.simplefilter("ignore", np.RankWarning)
+
+
+# --------------------------------------------------------------------------------------------------------------------------------------
+def categories_by_quartile(df: DataFrame, column: str) -> Tuple[str, str]:
+    # Count the frequency of each category in the column
+    freq = df[column].value_counts()
+
+    # Calculate the 25th percentile (Q1) and 75th percentile (Q3)
+    q1 = freq.quantile(0.25)
+    q3 = freq.quantile(0.75)
+
+    # Filter categories that are below the 25th percentile and above the 75th percentile
+    least_frequent = freq[freq <= q1]
+    most_frequent = freq[freq >= q3]
+
+    # Get the least frequent category (25th percentile) and the most frequent category (75th percentile)
+    least_frequent_category = least_frequent.idxmin() if not least_frequent.empty else None
+    most_frequent_category = most_frequent.idxmax() if not most_frequent.empty else None
+
+    return least_frequent_category, most_frequent_category
+
+
+class SimulationEngine(FeatureSelection):
+    """
+    This class implements a predictive model that utilizes multiple linear regression for numerical target variables
+    and multiple logistic regression for categorical target variables.
+
+    The class provides methods for training the model on a given dataset, making predictions,
+    and evaluating the model's performance.
+
+    Key features:
+    - Supports both numerical and categorical target variables, automatically selecting the appropriate regression method.
+    - Includes methods for data preprocessing, model fitting, prediction, and evaluation metrics.
+    - Designed to be flexible and user-friendly, allowing for easy integration with various datasets.
+
+    Usage:
+    - Instantiate the class with the training data and target variable.
+    - Call the fit method to train the model.
+    - Use the predict method to generate predictions on new data.
+    - Evaluate the model using built-in metrics for accuracy and error.
+
+    This class is suitable for applications in data analysis and machine learning, enabling users to leverage regression techniques
+    for both numerical and categorical outcomes efficiently.
+    """
+
+    def __init__(self, use_scaler: bool = False, **kwargs):
+
+        self.df = pd.DataFrame()
+        self.n_importances = None
+        self.use_scaler = use_scaler
+        self.proba_dict = {}
+
+        super().__init__(**kwargs)
+
+    def predict(self, df: DataFrame, column: str) -> ndarray | list:
+        # Let us assign the dictionary entries corresponding to the column
+        w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]
+
+        df = df[names_cols].copy()
+        # Change the scale of the dataframe
+        dataset = self.df.copy()
+        dataset.drop(columns=column, inplace=True)
+        numeric_df = dataset.select_dtypes(include="number")
+        if self.use_scaler:
+            scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
+            _ = scaler.rescale()
+            dataset_ = df.copy()
+            numeric_df = dataset_.select_dtypes(include="number")
+            numeric_scaled = scaler.rescale(dataset_=numeric_df.to_numpy())
+            numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
+            for col in numeric_df.columns:
+                df[col] = numeric_df[col].values
+
+        # Encoding the datadrame
+        for num, colname in enumerate(dfe._encode_columns):
+            if df[colname].dtype == "object":
+                encode_dict = dfe.encoding_list[num]
+                df[colname] = df[colname].apply(
+                    dfe._code_transformation_to, dictionary_list=encode_dict
+                )
+
+        # PREDICTION
+        y = df.to_numpy() @ w
+
+        # Categorical column
+        if quick_encoder != None:
+
+            one_hot = OneHotEncoder()
+            y = one_hot.decode(y)
+            encoding_dic = quick_encoder.decoding_list[0]
+            y = [encoding_dic[item] for item in y]
+        # Numeric column
+        else:
+            if self.use_scaler:
+                # scale output
+                y += 1
+                y /= 2
+                y = y * (self.df[column].max() - self.df[column].min())
+
+        return y[:]
+
+    def _encode(self, df: DataFrame) -> ndarray | list:
+        df = df.copy()
+        column = df.columns[0]
+        frec = df[column].value_counts() / len(df)
+        df.loc[:, "frec"] = df[column].map(frec)
+        df.sort_values("frec", inplace=True)
+        keys = df[column].to_list()
+        values = df["frec"].to_list()
+        return dict(zip(keys, values))
+
+    def fit(self, df: DataFrame, n_importances: int, **kwargs) -> None:
+        self.df = df
+        self.n_importances = n_importances
+        # We run the feature selection algorithm
+        self.get_digraph(self.df, self.n_importances, self.use_scaler)
+        proba_dict_keys = list(self.w_dict.keys())
+        self.proba_dict = dict(zip(proba_dict_keys, [i for i in range(len(proba_dict_keys))]))
+        for key in proba_dict_keys:
+            x = (
+                self.df[key].values,
+                None if self.df[key].dtype != "object" else self._encode(self.df[[key]]),
+            )
+            poly = kwargs.get("poly", 9)
+            plot = kwargs.get("plot", False)
+            if not x[1]:
+                media = self.df[key].mean()
+                desviacion_estandar = self.df[key].std()
+                cota_inferior = media - 1.5 * desviacion_estandar
+                cota_superior = media + 1.5 * desviacion_estandar
+                if plot:
+                    print(f"Cumulative Distribution Function ({key})")
+                f, cdf_, ox = cdf(x[0].flatten(), poly=poly, plot=plot)
+            else:
+                f, ox = None, None
+                least_frequent_category, most_frequent_category = categories_by_quartile(
+                    self.df[[key]], key
+                )
+                cota_inferior = x[1].get(least_frequent_category, 0)
+                cota_superior = x[1].get(most_frequent_category, 0)
+            self.proba_dict[key] = (
+                f if f else None,
+                x[1],
+                (np.mean(np.abs(np.diff(ox))) / 2.0 if isinstance(ox, np.ndarray) else None),
+                f(cota_inferior) if f else cota_inferior,
+                f(cota_superior) if f else cota_superior,
+            )
+
+    def get_proba(self, value: Union[Union[float, int], str] | list, colname: str) -> List[float]:
+        value = (
+            value
+            if isinstance(value, list)
+            else value.flatten().tolist() if isinstance(value, np.ndarray) else [value]
+        )
+        return [
+            (
+                self.proba_dict[colname][0](val)
+                - self.proba_dict[colname][0](val - self.proba_dict[colname][2])
+                if (isinstance(val, float) or isinstance(val, int))
+                else self.proba_dict[colname][1].get(val, 0)
+            )
+            for val in value
+        ]
+
+    def pred_outliers(self, value: Union[Union[float, int], str] | list, colname: str) -> List[str]:
+        return [
+            (
+                "inlier"
+                if (self.proba_dict[colname][3] < val < self.proba_dict[colname][4])
+                else "outlier"
+            )
+            for val in self.get_proba(value, colname)
+        ]
+
+    def _clean_data(self, df: DataFrame) -> DataFrame:
+
+        df.replace([np.inf, -np.inf], np.nan, inplace=True)
+        df.replace(" ", np.nan, inplace=True)
+        df = check_nan_inf(df)
+        df = df.reset_index()
+        df = df.drop(columns=["index"])
+
+        return df
+
+    def save(self, filename: str = "./simulation_model") -> None:
+        """
+        Save the state of the SimulationEngine to a file.
+
+        Parameters:
+            filename (str): The name of the file where the object will be saved.
+        """
+        filename = filename if filename.endswith(".pkl") else filename + ".pkl"
+        with open(filename, "wb") as f:
+            pickle.dump(self, f)
+
+    @staticmethod
+    def load(filename: str = "./simulation_model"):
+        """
+        Load the state of a SimulationEngine from a file.
+
+        Parameters:
+            filename (str): The name of the file containing the saved object.
+
+        Returns:
+            SimulationEngine: A new instance of SimulationEngine with the loaded state.
+        """
+        filename = filename + ".pkl" if not filename.endswith(".pkl") else filename
+        with open(filename, "rb") as f:
+            return pickle.load(f)
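The rewritten `SimulationEngine` now receives its data through `fit` instead of the constructor and gains probability/outlier helpers plus pickle-based persistence. Below is a minimal usage sketch, assuming the import path `likelihood.models.simulation`; the DataFrame, column names, and the choice of target column are illustrative, not taken from the package's own examples.

```python
import numpy as np
import pandas as pd

from likelihood.models.simulation import SimulationEngine  # assumed import path

# Hypothetical data: two numeric features, one categorical feature, one numeric target
rng = np.random.default_rng(0)
df = pd.DataFrame({"x1": rng.random(200), "x2": rng.random(200)})
df["group"] = rng.choice(["a", "b"], size=200)
df["y"] = 0.3 * df["x1"] + 0.7 * df["x2"]

engine = SimulationEngine(use_scaler=False)
engine.fit(df, n_importances=2)  # runs feature selection and builds per-column models

# Predictions for a column discovered by the feature-selection step; whether a given
# key is present in engine.w_dict depends on that step.
preds = engine.predict(df.head(5), column="y")

# Per-value probabilities and inlier/outlier labels for a column
probas = engine.get_proba(df["x1"].head(5).tolist(), colname="x1")
flags = engine.pred_outliers(df["x1"].head(5).tolist(), colname="x1")

engine.save("./simulation_model")                 # written as ./simulation_model.pkl
restored = SimulationEngine.load("./simulation_model")
```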
likelihood/tools/tools.py

@@ -1,15 +1,18 @@
 import math
 import os
 import pickle
-
+import warnings
+from typing import Callable, Dict, List, Tuple, Union
 
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import yaml
-from numpy import ndarray
 from pandas.core.frame import DataFrame
 
+# Suppress RankWarning
+warnings.simplefilter("ignore", np.RankWarning)
+
 # -------------------------------------------------------------------------
 
 """

@@ -68,7 +71,7 @@ def difference_quotient(f: Callable, x: float, h: float) -> Callable:
     return (f(x + h) - f(x)) / h
 
 
-def partial_difference_quotient(f: Callable, v: ndarray, i: int, h: float) -> ndarray:
+def partial_difference_quotient(f: Callable, v: np.ndarray, i: int, h: float) -> np.ndarray:
     """Calculates the partial difference quotient of `f`
 
     Parameters

@@ -93,7 +96,7 @@ def partial_difference_quotient(f: Callable, v: ndarray, i: int, h: float) -> nd
     return (f(w) - f(v)) / h
 
 
-def estimate_gradient(f: Callable, v: ndarray, h: float = 1e-4) -> List[ndarray]:
+def estimate_gradient(f: Callable, v: np.ndarray, h: float = 1e-4) -> List[np.ndarray]:
     """Calculates the gradient of `f` at `v`
 
     Parameters

@@ -138,35 +141,32 @@ def generate_feature_yaml(
     A dictionary with four keys ('ordinal_features', 'numeric_features', 'categorical_features', 'ignore_features')
     mapping to lists of feature names. Or a YAML formatted string if `yaml_string` is `True`.
     """
+    ignore_features = ignore_features or []
     feature_info = {
         "ordinal_features": [],
         "numeric_features": [],
         "categorical_features": [],
-        "ignore_features":
+        "ignore_features": ignore_features,
     }
 
     for col in df.columns:
-        if
+        if col in ignore_features:
             continue
 
         if pd.api.types.is_numeric_dtype(df[col]):
-
+            if pd.api.types.is_integer_dtype(df[col]) or pd.api.types.is_float_dtype(df[col]):
+                feature_info["numeric_features"].append(col)
+            elif pd.api.types.is_bool_dtype(df[col]):
+                feature_info["ordinal_features"].append(col)  # Assuming bool can be ordinal
         elif pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_categorical_dtype(df[col]):
            feature_info["categorical_features"].append(col)
-        elif pd.api.types.is_integer_dtype(df[col]):
-            feature_info["ordinal_features"].append(col)
-        elif pd.api.types.is_float_dtype(df[col]):
-            feature_info["ordinal_features"].append(col)
-        elif pd.api.types.is_bool_dtype(df[col]):
-            feature_info["ordinal_features"].append(col)
         else:
             print(f"Unknown type for feature {col}")
-    feature_info["ignore_features"] = ignore_features
 
     if yaml_string:
         return yaml.dump(feature_info, default_flow_style=False)
-
-
+
+    return feature_info
 
 
 # a function that calculates the percentage of missing values per column is defined
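After this rewrite, `generate_feature_yaml` classifies integer and float columns as numeric, booleans as ordinal, and object/categorical columns as categorical, with ignored columns handled up front. A small behavior sketch follows; the DataFrame is illustrative and the keyword names are taken from the identifiers visible in the hunk, so treat them as assumptions.

```python
import pandas as pd

from likelihood.tools import generate_feature_yaml  # assumed import path

df = pd.DataFrame(
    {
        "age": [23, 31, 45],            # int/float -> numeric_features
        "active": [True, False, True],  # bool -> ordinal_features
        "city": ["MX", "US", "CA"],     # object -> categorical_features
        "row_id": [1, 2, 3],
    }
)

info = generate_feature_yaml(df, ignore_features=["row_id"])
# info["numeric_features"] == ["age"], info["ignore_features"] == ["row_id"]

# The same information as a YAML string
print(generate_feature_yaml(df, ignore_features=["row_id"], yaml_string=True))
```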
@@ -192,61 +192,9 @@ def cal_missing_values(df: DataFrame) -> None:
     )
 
 
-def calculate_probability(x: ndarray, points: int = 1, cond: bool = True) -> ndarray:
-    """Calculates the probability of the data.
-
-    Parameters
-    ----------
-    x : `np.array`
-        An array containing the data.
-    points : `int`
-        An integer value. By default it is set to `1`.
-    cond : `bool`
-        A boolean value. By default it is set to `True`.
-
-    Returns
-    -------
-    p : `np.array`
-        An array containing the probability of the data.
-
-    """
-
-    p = []
-
-    f = cdf(x)[0]
-    for i in range(len(x)):
-        p.append(f(x[i]))
-    p = np.array(p)
-    if cond:
-        if np.prod(p[-points]) > 1:
-            print("\nThe probability of the data cannot be calculated.\n")
-        else:
-            if np.prod(p[-points]) < 0:
-                print("\nThe probability of the data cannot be calculated.\n")
-            else:
-                print(
-                    "The model has a probability of {:.2f}% of being correct".format(
-                        np.prod(p[-points]) * 100
-                    )
-                )
-    else:
-        if np.sum(p[-points]) < 0:
-            print("\nThe probability of the data cannot be calculated.\n")
-        else:
-            if np.sum(p[-points]) > 1:
-                print("\nThe probability of the data cannot be calculated.\n")
-            else:
-                print(
-                    "The model has a probability of {:.2f}% of being correct".format(
-                        np.sum(p[-points]) * 100
-                    )
-                )
-    return p
-
-
 def cdf(
-    x: ndarray, poly: int = 9, inv: bool = False, plot: bool = False, savename: str = None
-) ->
+    x: np.ndarray, poly: int = 9, inv: bool = False, plot: bool = False, savename: str = None
+) -> tuple:
     """Calculates the cumulative distribution function of the data.
 
     Parameters

@@ -254,165 +202,229 @@ def cdf(
     x : `np.array`
         An array containing the data.
     poly : `int`
-
+        Degree of the polynomial fit. By default it is set to `9`.
     inv : `bool`
-
+        If True, calculate the inverse CDF (quantile function).
+    plot : `bool`
+        If True, plot the results.
+    savename : `str`, optional
+        Filename to save the plot.
 
     Returns
     -------
-
-
-
+    fit : `np.poly1d`
+        Polynomial fit of the CDF or quantile function.
+    cdf_values : `np.array`
+        Cumulative distribution values.
+    sorted_x : `np.array`
+        Sorted input data.
     """
 
-
+    if len(x) == 0:
+        raise ValueError("Input array 'x' must not be empty.")
+
+    cdf_values = np.cumsum(x) / np.sum(x)
+    sorted_x = np.sort(x)
 
-
-
-    M = np.triu(I)
-    df = np.dot(ox, M)
-    df_ = df / np.max(df)
+    # Calculate the CDF or inverse CDF (quantile function)
+    probabilities = np.linspace(0, 1, len(sorted_x))
 
     if inv:
-        fit = np.polyfit(
+        fit = np.polyfit(probabilities, sorted_x, poly)
         f = np.poly1d(fit)
+        plot_label = "Quantile Function"
+        x_values = probabilities
+        y_values = sorted_x
     else:
-        fit = np.polyfit(
+        fit = np.polyfit(sorted_x, probabilities, poly)
         f = np.poly1d(fit)
+        plot_label = "Cumulative Distribution Function"
+        x_values = sorted_x
+        y_values = cdf_values
 
     if plot:
-
-
-
-
-
-
-
-
-
-        else:
-            plt.plot(ox, cdf_, "o", label="cdf")
-            plt.plot(ox, f(ox), "r--", label="fit")
-            plt.title("Cumulative Distribution Function")
-        plt.xlabel("Value")
-        plt.ylabel("Probability")
-        plt.legend()
-        if savename != None:
-            plt.savefig(savename, dpi=300)
-        plt.show()
+        plt.figure()
+        plt.plot(x_values, y_values, "o", label="data")
+        plt.plot(x_values, f(x_values), "r--", label="fit")
+        plt.title(plot_label)
+        plt.xlabel("Probability" if inv else "Value")
+        plt.ylabel("Value" if inv else "Probability")
+        plt.legend()
+        if savename:
+            plt.savefig(savename, dpi=300)
+        plt.show()
 
-    return f,
+    return f, cdf_values, sorted_x
 
 
-
-    """Calculates the
+def calculate_probability(x: np.ndarray, points: int = 1, cond: bool = True) -> np.ndarray:
+    """Calculates the probability of the data based on the CDF fit.
 
     Parameters
     ----------
     x : `np.array`
         An array containing the data.
-
-
+    points : `int`
+        Number of points to consider for the final probability calculation.
+    cond : `bool`
+        Condition to use product (True) or sum (False) for the final probability check.
 
     Returns
     -------
-
-
-
+    p : `np.array`
+        Array containing the probabilities of the data.
     """
 
+    if len(x) == 0:
+        raise ValueError("Input array 'x' must not be empty.")
+
+    fit, _, sorted_x = cdf(x)
+    p = fit(x)
+
+    # Validate probability values
+    if cond:
+        prob_value = np.prod(p[-points])
+        message = "product"
+    else:
+        prob_value = np.sum(p[-points])
+        message = "sum"
+
+    if 0 <= prob_value <= 1:
+        print(f"The model has a probability of {prob_value * 100:.2f}% based on the {message}.")
+    else:
+        print("\nThe probability of the data cannot be calculated.\n")
+
+    return p
+
+
+class CorrelationBase:
+    """Base class for correlation calculations."""
+
     __slots__ = ["x", "y", "result", "z"]
 
-    def __init__(self, x: ndarray, y: ndarray):
+    def __init__(self, x: np.ndarray, y: Union[np.ndarray, None] = None):
         self.x = x
-        self.y = y
-        self.
+        self.y = y if y is not None else x  # Default to autocorrelation if y is not provided
+        self._compute_correlation()
         self.z = self.result[self.result.size // 2 :]
-        self.z
+        self.z /= np.abs(self.z).max()
+
+    def _compute_correlation(self):
+        """Compute the correlation between x and y (or x with itself for autocorrelation)."""
+        self.result = np.correlate(self.x, self.y, mode="full")
 
     def plot(self):
-
+        """Plot the correlation or autocorrelation."""
+        plt.plot(range(len(self.z)), self.z, label=self._get_label())
         plt.legend()
         plt.show()
 
+    def _get_label(self) -> str:
+        return "Autocorrelation" if np.array_equal(self.x, self.y) else "Correlation"
+
     def __call__(self):
+        """Return the computed correlation or autocorrelation."""
         return self.z
 
 
-class
-    """Calculates the
+class Correlation(CorrelationBase):
+    """Calculates the cross-correlation of two datasets.
 
     Parameters
     ----------
-    x : `np.
-        An array containing the
+    x : `np.ndarray`
+        An array containing the first dataset.
+    y : `np.ndarray`
+        An array containing the second dataset.
 
     Returns
     -------
-    z : `np.
-        An array containing the
+    z : `np.ndarray`
+        An array containing the correlation of `x` and `y`.
 
     """
 
-
+    def __init__(self, x: np.ndarray, y: np.ndarray):
+        super().__init__(x, y)
 
-    def __init__(self, x: ndarray):
-        self.x = x
-        self.result = np.correlate(x, x, mode="full")
-        self.z = self.result[self.result.size // 2 :]
-        self.z = self.z / float(np.abs(self.z).max())
 
-
-
-        plt.legend()
-        plt.show()
+class AutoCorrelation(CorrelationBase):
+    """Calculates the autocorrelation of a dataset.
 
-
-
+    Parameters
+    ----------
+    x : `np.ndarray`
+        An array containing the data.
 
+    Returns
+    -------
+    z : `np.ndarray`
+        An array containing the autocorrelation of the data.
+    """
+
+    def __init__(self, x: np.ndarray):
+        super().__init__(x)
 
-
-
+
+def fft_denoise(
+    dataset: np.ndarray, sigma: float = 0, mode: bool = True
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Performs noise removal using the Fast Fourier Transform.
 
     Parameters
     ----------
-    dataset : `np.
-        An array containing the noised data.
-    sigma : `float
-        A
-    mode : `bool
-
+    dataset : `np.ndarray`
+        An array containing the noised data. Expected shape (num_samples, num_points).
+    sigma : `float`, default=0
+        A float between 0 and 1 representing the threshold for noise filtering.
+    mode : `bool`, default=True
+        If True, print progress messages.
 
     Returns
     -------
-
-        An array containing the denoised data
-
-
-
+    denoised_dataset : `np.ndarray`
+        An array containing the denoised data with the same shape as `dataset`.
+    periods : `np.ndarray`
+        Array of estimated periods for each sample in `dataset`.
     """
-
-
-
-
-
-
-
-
-
-
-
-
+
+    if not (0 <= sigma <= 1):
+        raise ValueError("sigma must be between 0 and 1")
+
+    num_samples, n_points = dataset.shape
+    denoised_dataset = np.zeros_like(dataset)
+    periods = np.zeros(num_samples)
+
+    # Precompute values that do not change within the loop
+    freq = (1 / n_points) * np.arange(n_points)
+    L = np.arange(1, np.floor(n_points / 2), dtype=int)
+
+    for i in range(num_samples):
+        fhat = np.fft.fft(dataset[i, :], n_points)
+        PSD = fhat * np.conj(fhat) / n_points
+        threshold = np.mean(PSD) + sigma * np.std(PSD)
+        indices = PSD > threshold
+
+        # Zero out all others in frequency domain
+        PSDclean = PSD * indices
+        fhat_cleaned = fhat * indices
+
+        # Inverse FFT for filtered time signal
+        denoised_signal = np.fft.ifft(fhat_cleaned).real
+        denoised_dataset[i, :] = denoised_signal
+
         # Calculate the period of the signal
-
+        peak_index = L[np.argmax(np.abs(fhat[L]))]
+        periods[i] = 1 / (2 * freq[peak_index])
+
         if mode:
             print(f"The {i+1}-th row of the dataset has been denoised.")
-            print(f"The period is {round(
-
+            print(f"The estimated period is {round(periods[i], 4)}")
+
+    return denoised_dataset, periods
 
 
-def get_period(dataset: ndarray) -> float:
+def get_period(dataset: np.ndarray) -> float:
     """Calculates the periodicity of a `dataset`.
 
     Parameters
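These rewrites change `cdf` to return a `(fit, cdf_values, sorted_x)` tuple, re-add `calculate_probability` on top of that fit, fold `Correlation`/`AutoCorrelation` into a shared `CorrelationBase`, and vectorize `fft_denoise` so it also returns per-sample periods. A short sketch of how the pieces fit together, using synthetic data and the assumed `likelihood.tools` import path:

```python
import numpy as np

from likelihood.tools import AutoCorrelation, calculate_probability, cdf, fft_denoise

x = np.random.normal(0.0, 1.0, 500)

f, cdf_values, sorted_x = cdf(x, poly=9)   # polynomial fit of the empirical CDF
quantile_fit, _, _ = cdf(x, inv=True)      # inverse CDF (quantile function) fit
p = calculate_probability(x, points=1)     # probabilities evaluated through the CDF fit

ac = AutoCorrelation(x)                     # same machinery as Correlation(x, y)
z = ac()                                    # normalized (auto)correlation values

# fft_denoise expects a 2-D array of shape (num_samples, num_points)
signal = np.sin(np.linspace(0, 8 * np.pi, 256)) + 0.3 * np.random.randn(256)
denoised, periods = fft_denoise(signal.reshape(1, -1), sigma=0.5, mode=False)
```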
@@ -426,13 +438,31 @@ def get_period(dataset: ndarray) -> float:
         period of the function described by the `dataset`
     """
     n = dataset.size
-
-
-
-
-
-
-
+
+    # Ensure there are enough points for FFT analysis
+    if n < 2:
+        raise ValueError("Dataset must contain at least two points.")
+
+    # Compute the FFT and PSD
+    fhat = np.fft.rfft(dataset)  # Use rfft for real-valued input to save computation
+    freqs = np.fft.rfftfreq(n)  # Get only positive frequencies
+
+    # Calculate the Power Spectral Density (PSD)
+    PSD = np.abs(fhat) ** 2 / n
+
+    # Remove the first frequency component (DC component)
+    PSD[0] = 0
+
+    # Find the index of the maximum PSD value, excluding the DC component
+    max_psd_index = np.argmax(PSD)
+
+    # Calculate the period based on the corresponding frequency
+    dominant_freq = freqs[max_psd_index]
+    if dominant_freq == 0:
+        raise ValueError("No significant periodic component found in the dataset.")
+
+    period = 1 / dominant_freq
+
     return period
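The new `get_period` picks the dominant bin of the real FFT's power spectrum (ignoring the DC component) and returns its reciprocal. A quick sanity-check sketch on a synthetic sine wave; the result is approximate because the dominant bin is discrete:

```python
import numpy as np

from likelihood.tools import get_period

n = 1000
t = np.arange(n)
signal = np.sin(2 * np.pi * t / 50)  # true period: 50 samples

print(get_period(signal))  # expected to be close to 50
```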
@@ -468,7 +498,7 @@ class LogisticRegression:
 
         self.importance = []
 
-    def fit(self, dataset: ndarray, values: ndarray) -> None:
+    def fit(self, dataset: np.ndarray, values: np.ndarray) -> None:
         """Performs linear multiple model training
 
         Parameters

@@ -501,7 +531,7 @@ class LogisticRegression:
             a = np.around(self.w[i], decimals=8)
             self.importance.append(a)
 
-    def predict(self, datapoints: ndarray) -> ndarray:
+    def predict(self, datapoints: np.ndarray) -> np.ndarray:
         """
         Performs predictions for a set of points
 

@@ -515,7 +545,7 @@ class LogisticRegression:
 
         return sig(np.array(self.importance) @ datapoints)
 
-    def get_importances(self, print_important_features: bool = False) -> ndarray:
+    def get_importances(self, print_important_features: bool = False) -> np.ndarray:
         """
         Returns the important features
 

@@ -547,7 +577,7 @@ class LinearRegression:
 
         self.importance = []
 
-    def fit(self, dataset: ndarray, values: ndarray, verbose: bool = False) -> None:
+    def fit(self, dataset: np.ndarray, values: np.ndarray, verbose: bool = False) -> None:
         """Performs linear multiple model training
 
         Parameters

@@ -580,7 +610,7 @@ class LinearRegression:
             print("\nParameters:", np.array(self.importance).shape)
             print("RMSE: {:.4f}".format(mean_square_error(self.y, self.predict(self.X))))
 
-    def predict(self, datapoints: ndarray) -> ndarray:
+    def predict(self, datapoints: np.ndarray) -> np.ndarray:
         """
         Performs predictions for a set of points
 

@@ -592,7 +622,7 @@ class LinearRegression:
         """
         return np.array(self.importance) @ datapoints
 
-    def get_importances(self, print_important_features: bool = False) -> ndarray:
+    def get_importances(self, print_important_features: bool = False) -> np.ndarray:
         """
         Returns the important features
 

@@ -614,7 +644,7 @@ class LinearRegression:
         return np.array(self.importance)
 
 
-def cal_average(y: ndarray, alpha: float = 1):
+def cal_average(y: np.ndarray, alpha: float = 1):
     """Calculates the moving average of the data
 
     Parameters

@@ -642,12 +672,12 @@ class DataScaler:
 
     __slots__ = ["dataset_", "_n", "data_scaled", "values", "transpose", "inv_fitting"]
 
-    def __init__(self, dataset: ndarray, n: int = 1) -> None:
+    def __init__(self, dataset: np.ndarray, n: int = 1) -> None:
         """Initializes the parameters required for scaling the data"""
         self.dataset_ = dataset.copy()
         self._n = n
 
-    def rescale(self, dataset_: ndarray | None = None) -> ndarray:
+    def rescale(self, dataset_: np.ndarray | None = None) -> np.ndarray:
         """Perform a standard rescaling of the data
 
         Returns

@@ -655,7 +685,7 @@ class DataScaler:
         data_scaled : `np.array`
             An array containing the scaled data.
         """
-        if isinstance(dataset_, ndarray):
+        if isinstance(dataset_, np.ndarray):
             data_scaled = np.copy(dataset_)
             mu = self.values[0]
             sigma = self.values[1]

@@ -711,7 +741,7 @@ class DataScaler:
 
         return self.data_scaled
 
-    def scale(self, dataset_: ndarray) -> ndarray:
+    def scale(self, dataset_: np.ndarray) -> np.ndarray:
         """Performs the inverse operation to the rescale function
 
         Parameters

@@ -755,7 +785,7 @@ def generate_series(n: int, n_steps: int, incline: bool = True):
     return series.astype(np.float32)
 
 
-def mean_square_error(y_true: ndarray, y_pred: ndarray, print_error: bool = False):
+def mean_square_error(y_true: np.ndarray, y_pred: np.ndarray, print_error: bool = False):
     """Calculates the Root Mean Squared Error
 
     Parameters
@@ -946,88 +976,65 @@ class PerformanceMeasures:
         pass
 
     # Performance measure Res_T
-    def f_mean(self, y_true: ndarray, y_pred: ndarray, labels:
-
+    def f_mean(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> float:
+        F_vec = self._f1_score(y_true, y_pred, labels)
+        mean_f_measure = np.mean(F_vec)
 
-
-
+        for label, f_measure in zip(labels, F_vec):
+            print(f"F-measure of label {label} -> {f_measure}")
 
-
-            print("F-measure of label ", labels[i], " -> ", F_vec[i])
+        print(f"Mean of F-measure -> {mean_f_measure}")
 
-
+        return mean_f_measure
 
     # Performance measure Res_P
-    def resp(self, y_true: ndarray, y_pred: ndarray, labels:
-        # We initialize sum counters
-        sum1 = 0
-        sum2 = 0
-
-        # Calculamos T_C
+    def resp(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> float:
         T_C = len(y_true)
-
-
-            sum1 += (1 - ((y_true == labels[i]).sum() / T_C)) * self._fi_measure(
-                y_true, y_pred, labels, i
-            )
-            sum2 += 1 - ((y_true == labels[i]).sum()) / T_C
+        sum1, sum2 = 0.0, 0.0
+        F_vec = self._f1_score(y_true, y_pred, labels)
 
-
-
+        for label_idx, label in enumerate(labels):
+            class_instances = np.sum(y_true == label) / T_C
+            sum1 += (1 - class_instances) * F_vec[label_idx]
+            sum2 += 1 - class_instances
 
-
-
+        res_p = sum1 / sum2 if sum2 != 0 else 0.0  # Avoid division by zero
+        print(f"Metric Res_p -> {res_p}")
 
-        return
+        return res_p
 
-
-    def _summary_pred(self, y_true: ndarray, y_pred: ndarray, labels: list) -> None:
-        count_mat = self._confu_mat(y_true, y_pred, labels)
-        print("     ", end="")
-        for i in range(len(labels)):
-            print("|--", labels[i], "--", end="")
-            if i + 1 == len(labels):
-                print("|", end="")
-        for i in range(len(labels)):
-            print("")
-            print("|--", labels[i], "--|", end="")
-            for j in range(len(labels)):
-                if j != 0:
-                    print("   ", end="")
-                print("  ", int(count_mat[i, j]), "  ", end="")
-
-    def _f1_score(self, y_true: ndarray, y_pred: ndarray, labels: list) -> ndarray:
-        f1_vec = np.zeros(len(labels))
-
-        # Calculate confusion mat
+    def _summary_pred(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> None:
         count_mat = self._confu_mat(y_true, y_pred, labels)
+        print("      ", " | ".join(f"--{label}--" for label in labels))
+        for i, label_i in enumerate(labels):
+            row = [f"  {int(count_mat[i, j])}  " for j in range(len(labels))]
+            print(f"--{label_i}--|", " | ".join(row))
 
-
-
-
-
-        # Iterate over labels to calculate f1 scores of each one
-        for i in range(len(labels)):
-            precision = count_mat[i, i] / (sum1[i])
-            recall = count_mat[i, i] / (sum2[i])
+    def _f1_score(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> np.ndarray:
+        count_mat = self._confu_mat(y_true, y_pred, labels)
+        sum_cols = np.sum(count_mat, axis=0)
+        sum_rows = np.sum(count_mat, axis=1)
 
-
+        # Avoid division by zero
+        precision = np.divide(
+            count_mat.diagonal(), sum_cols, out=np.zeros_like(sum_cols), where=sum_cols != 0
+        )
+        recall = np.divide(
+            count_mat.diagonal(), sum_rows, out=np.zeros_like(sum_rows), where=sum_rows != 0
+        )
+        f1_vec = 2 * ((precision * recall) / (precision + recall))
 
         return f1_vec
 
     # Returns confusion matrix of predictions
-    def _confu_mat(self, y_true: ndarray, y_pred: ndarray, labels:
-
-
-
-
-
-
-
-                count_mat[i, i] += 1
-            else:
-                x = np.where(labels == y_true[j])
-                count_mat[i, x[0]] += 1
+    def _confu_mat(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> np.ndarray:
+        num_classes = len(labels)
+        label_mapping = {label: idx for idx, label in enumerate(labels)}
+        count_mat = np.zeros((num_classes, num_classes))
+
+        for pred_label, true_label in zip(y_pred, y_true):
+            if pred_label in label_mapping and true_label in label_mapping:
+                count_mat[label_mapping[pred_label], label_mapping[true_label]] += 1
 
         return count_mat
 
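The refactor makes `f_mean` and `resp` return their metrics instead of only printing them, and rebuilds `_confu_mat`/`_f1_score` around a label-to-index mapping with zero-division guards. An illustrative call pattern, with made-up arrays and labels and the assumed `likelihood.tools` import path:

```python
import numpy as np

from likelihood.tools import PerformanceMeasures  # assumed import path

y_true = np.array([0, 0, 1, 1, 2, 2, 2, 1])
y_pred = np.array([0, 1, 1, 1, 2, 0, 2, 1])
labels = [0, 1, 2]

pm = PerformanceMeasures()
pm._summary_pred(y_true, y_pred, labels)          # prints the confusion matrix
f1_per_label = pm._f1_score(y_true, y_pred, labels)
mean_f1 = pm.f_mean(y_true, y_pred, labels)       # now returns the mean F-measure
res_p = pm.resp(y_true, y_pred, labels)           # now returns Res_p as a float
```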
@@ -1043,10 +1050,10 @@ class OneHotEncoder:
     def __init__(self) -> None:
         pass
 
-    def encode(self, x: ndarray | list):
+    def encode(self, x: np.ndarray | list):
         self.x = x
 
-        if not isinstance(self.x, ndarray):
+        if not isinstance(self.x, np.ndarray):
             self.x = np.array(self.x)  # If not numpy array then convert it
 
         y = np.zeros(

@@ -1057,8 +1064,8 @@ class OneHotEncoder:
 
         return y
 
-    def decode(self, x: ndarray | list) -> ndarray:
-        if not isinstance(x, ndarray):
+    def decode(self, x: np.ndarray | list) -> np.ndarray:
+        if not isinstance(x, np.ndarray):
             x = np.array(x)  # If not numpy array then convert it
 
         # We return the max values of each row
@@ -1220,17 +1227,33 @@ class FeatureSelection:
 
 
 def check_nan_inf(df: DataFrame) -> DataFrame:
-    """
+    """
+    Checks for NaN and Inf values in the DataFrame. If any are found, they will be removed.
+
+    Parameters:
+        df (DataFrame): The input DataFrame to be checked.
+
+    Returns:
+        DataFrame: A new DataFrame with NaN and Inf values removed.
+    """
+
     nan_values = df.isnull().values.any()
-
-
-        print("It contains " + str(count) + " infinite values")
+    inf_values = np.isinf(df.select_dtypes(include="number")).values.any()
+
     if nan_values:
-
-
-
-
-
+        print("UserWarning: Some rows may have been deleted due to the existence of NaN values.")
+        df.dropna(inplace=True)
+
+    if inf_values:
+        print("UserWarning: Some rows may have been deleted due to the existence of Inf values.")
+        df.replace([np.inf, -np.inf], np.nan, inplace=True)
+        df.dropna(inplace=True)
+
+    nan_count = df.isnull().values.sum()
+    inf_count = np.isinf(df.select_dtypes(include="number")).values.sum()
+
+    print(f"NaN values removed: {nan_count}")
+    print(f"Infinite values removed: {inf_count}")
 
     return df
 
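The expanded `check_nan_inf` now documents itself, drops rows containing NaN or Inf, and reports what it removed. A behavior sketch on a tiny illustrative frame:

```python
import numpy as np
import pandas as pd

from likelihood.tools import check_nan_inf

df = pd.DataFrame({"a": [1.0, np.nan, 3.0, np.inf], "b": [10.0, 20.0, 30.0, 40.0]})
clean = check_nan_inf(df)
print(len(clean))  # 2 -- the NaN row and the Inf row are dropped
```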
@@ -1244,6 +1267,7 @@ if __name__ == "__main__":
     helper = PerformanceMeasures()
     helper._summary_pred(y_true, y_pred, labels)
     print(helper._f1_score(y_true, y_pred, labels))
+    print(helper.f_mean(y_true, y_pred, labels))
 
     # Use DataFrameEncoder
     # Create a DataFrame

@@ -1273,6 +1297,13 @@
     # Generate data
     x = np.random.rand(3, 100)
     y = 0.1 * x[0, :] + 0.4 * x[1, :] + 0.5 * x[2, :] + 0.1
+    # Create a DataFrame
+    df = pd.DataFrame(x.T, columns=["x1", "x2", "x3"])
+    df["y"] = y
+    # Instantiate FeatureSelection
+    fs = FeatureSelection()
+    print(fs.get_digraph(df, n_importances=1))
+
     linear_model = LinearRegression()
     linear_model.fit(x, y)
     importance = linear_model.get_importances()

@@ -1303,7 +1334,7 @@
     plt.show()
 
     # Calculate the autocorrelation of the data
-    z =
+    z = AutoCorrelation(a[0, :])
     z.plot()
     # print(z())
 

@@ -1313,3 +1344,18 @@
     x = np.random.normal(mu, sigma, N)
     f, cdf_, ox = cdf(x, plot=True)
     invf, cdf_, ox = cdf(x, plot=True, inv=True)
+
+    encoder = OneHotEncoder()
+    encoding = encoder.encode([1, 2, 3, 4, 5])
+    assert np.array_equal(
+        encoding,
+        np.array(
+            [
+                [0, 1, 0, 0, 0, 0],
+                [0, 0, 1, 0, 0, 0],
+                [0, 0, 0, 1, 0, 0],
+                [0, 0, 0, 0, 1, 0],
+                [0, 0, 0, 0, 0, 1],
+            ]
+        ),
+    )
likelihood-1.2.22/likelihood/models/simulation.py (removed)

@@ -1,103 +0,0 @@
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-from numpy import ndarray
-from pandas.core.frame import DataFrame
-
-from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, check_nan_inf
-
-# --------------------------------------------------------------------------------------------------------------------------------------
-
-
-class SimulationEngine(FeatureSelection):
-    """
-    This class implements a predictive model that utilizes multiple linear regression for numerical target variables
-    and multiple logistic regression for categorical target variables.
-
-    The class provides methods for training the model on a given dataset, making predictions,
-    and evaluating the model's performance.
-
-    Key features:
-    - Supports both numerical and categorical target variables, automatically selecting the appropriate regression method.
-    - Includes methods for data preprocessing, model fitting, prediction, and evaluation metrics.
-    - Designed to be flexible and user-friendly, allowing for easy integration with various datasets.
-
-    Usage:
-    - Instantiate the class with the training data and target variable.
-    - Call the fit method to train the model.
-    - Use the predict method to generate predictions on new data.
-    - Evaluate the model using built-in metrics for accuracy and error.
-
-    This class is suitable for applications in data analysis and machine learning, enabling users to leverage regression techniques
-    for both numerical and categorical outcomes efficiently.
-    """
-
-    def __init__(self, df: DataFrame, n_importances: int, use_scaler: bool = False, **kwargs):
-
-        self.df = df
-        self.n_importances = n_importances
-        self.use_scaler = use_scaler
-
-        super().__init__(**kwargs)
-
-    def predict(self, df: DataFrame, column: str) -> ndarray | list:
-        # Let us assign the dictionary entries corresponding to the column
-        w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]
-
-        df = df[names_cols].copy()
-        # Change the scale of the dataframe
-        dataset = self.df.copy()
-        dataset.drop(columns=column, inplace=True)
-        numeric_df = dataset.select_dtypes(include="number")
-        if self.use_scaler:
-            scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
-            _ = scaler.rescale()
-            dataset_ = df.copy()
-            numeric_df = dataset_.select_dtypes(include="number")
-            numeric_scaled = scaler.rescale(dataset_=numeric_df.to_numpy())
-            numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
-            for col in numeric_df.columns:
-                df[col] = numeric_df[col].values
-
-        # Encoding the datadrame
-        for num, colname in enumerate(dfe._encode_columns):
-            if df[colname].dtype == "object":
-                encode_dict = dfe.encoding_list[num]
-                df[colname] = df[colname].apply(
-                    dfe._code_transformation_to, dictionary_list=encode_dict
-                )
-
-        # PREDICTION
-        y = df.to_numpy() @ w
-
-        # Categorical column
-        if quick_encoder != None:
-
-            one_hot = OneHotEncoder()
-            y = one_hot.decode(y)
-            encoding_dic = quick_encoder.decoding_list[0]
-            y = [encoding_dic[item] for item in y]
-        # Numeric column
-        else:
-            if self.use_scaler:
-                # scale output
-                y += 1
-                y /= 2
-                y = y * (self.df[column].max() - self.df[column].min())
-
-        return y[:]
-
-    def fit(self, **kwargs) -> None:
-
-        # We run the feature selection algorithm
-        self.get_digraph(self.df, self.n_importances, self.use_scaler)
-
-    def _clean_data(self, df: DataFrame) -> DataFrame:
-
-        df.replace([np.inf, -np.inf], np.nan, inplace=True)
-        df.replace(" ", np.nan, inplace=True)
-        df = check_nan_inf(df)
-        df = df.reset_index()
-        df = df.drop(columns=["index"])
-
-        return df
All remaining files listed above (+0 -0) are unchanged between 1.2.22 and 1.2.23.