likelihood 2.2.0.dev1__cp310-cp310-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- likelihood/VERSION +1 -0
- likelihood/__init__.py +20 -0
- likelihood/graph/__init__.py +9 -0
- likelihood/graph/_nn.py +283 -0
- likelihood/graph/graph.py +86 -0
- likelihood/graph/nn.py +329 -0
- likelihood/main.py +273 -0
- likelihood/models/__init__.py +3 -0
- likelihood/models/deep/__init__.py +13 -0
- likelihood/models/deep/_autoencoders.py +896 -0
- likelihood/models/deep/_predictor.py +809 -0
- likelihood/models/deep/autoencoders.py +903 -0
- likelihood/models/deep/bandit.py +97 -0
- likelihood/models/deep/gan.py +313 -0
- likelihood/models/deep/predictor.py +805 -0
- likelihood/models/deep/rl.py +345 -0
- likelihood/models/environments.py +202 -0
- likelihood/models/hmm.py +163 -0
- likelihood/models/regression.py +451 -0
- likelihood/models/simulation.py +213 -0
- likelihood/models/utils.py +87 -0
- likelihood/pipes.py +382 -0
- likelihood/rust_py_integration.cpython-310-x86_64-linux-gnu.so +0 -0
- likelihood/tools/__init__.py +4 -0
- likelihood/tools/cat_embed.py +212 -0
- likelihood/tools/figures.py +348 -0
- likelihood/tools/impute.py +278 -0
- likelihood/tools/models_tools.py +866 -0
- likelihood/tools/numeric_tools.py +390 -0
- likelihood/tools/reports.py +375 -0
- likelihood/tools/tools.py +1336 -0
- likelihood-2.2.0.dev1.dist-info/METADATA +68 -0
- likelihood-2.2.0.dev1.dist-info/RECORD +37 -0
- likelihood-2.2.0.dev1.dist-info/WHEEL +5 -0
- likelihood-2.2.0.dev1.dist-info/licenses/LICENSE +21 -0
- likelihood-2.2.0.dev1.dist-info/top_level.txt +5 -0
- src/lib.rs +12 -0
likelihood/tools/tools.py
@@ -0,0 +1,1336 @@
import math
import os
import pickle
import warnings
from typing import Callable, Dict, Generator, List, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from packaging import version

if version.parse(np.__version__) < version.parse("2.0.0"):
    filter = np.RankWarning
else:
    filter = np.exceptions.RankWarning

warnings.simplefilter("ignore", filter)

# -------------------------------------------------------------------------

"""
Data Science from Scratch, Second Edition, by Joel Grus (O'Reilly).Copyright 2019 Joel Grus, 978-1-492-04113-9
"""


def minibatches(dataset: List, batch_size: int, shuffle: bool = True) -> Generator:
    """Generates 'batch_size'-sized minibatches from the dataset

    Parameters
    ----------
    dataset : `List`
        The data to be divided into mini-batch.
    batch_size : `int`
        Specifies the size of each mini-batch.
    shuffle : `bool`
        If set `True`, the data will be shuffled before dividing it into mini-batches.

    Returns
    -------
    `Generator`
        A list of lists containing the mini-batches. Each sublist is a separate mini-batch with length `batch_size`.
    """

    # start indexes 0, batch_size, 2 * batch_size, ...
    batch_starts = [start for start in range(0, len(dataset), batch_size)]

    if shuffle:
        np.random.shuffle(batch_starts)  # shuffle the batches

    for start in batch_starts:
        end = start + batch_size
        yield dataset[start:end]
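

# --- Illustrative usage sketch (editorial addition, not part of the packaged file) ---
# A minimal example of consuming the `minibatches` generator defined above; the
# helper name `_example_minibatches` is hypothetical.
def _example_minibatches() -> None:
    """Sketch: split a toy dataset into batches of 4."""
    data = list(range(10))
    for batch in minibatches(data, batch_size=4, shuffle=False):
        print(batch)  # [0, 1, 2, 3], then [4, 5, 6, 7], then [8, 9]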


def difference_quotient(f: Callable, x: float, h: float) -> Callable:
    """Calculates the difference quotient of `f` evaluated at `x` and `x + h`

    Parameters
    ----------
    `f(x)` : `Callable`
        function.
    x : `float`
        Independent term.
    h : `float`
        Step size.

    Returns
    -------
    `(f(x + h) - f(x)) / h` : `float`
        Difference quotient of `f` evaluated at `x`.
    """
    return (f(x + h) - f(x)) / h


def partial_difference_quotient(f: Callable, v: np.ndarray, i: int, h: float) -> np.ndarray:
    """Calculates the partial difference quotient of `f`

    Parameters
    ----------
    `f(x0,...,xi-th)` : `Callable` function
        Function to differentiate.
    v : `Vector` | `np.array`
        1D array representing vector `v=(x0,...,xi)`.
    h : `float`
        Step size.

    Returns
    -------
    `(f(w) - f(v)) / h` : `np.array`
        the `i-th` partial difference quotient of `f` at `v`
    """
    w = [
        v_j + (h if j == i else 0) for j, v_j in enumerate(v)  # add h to just the ith element of v
    ]
    return (f(w) - f(v)) / h


def estimate_gradient(f: Callable, v: np.ndarray, h: float = 1e-4) -> List[np.ndarray]:
    """Calculates the gradient of `f` at `v`

    Parameters
    ----------
    `f(x0,...,xi-th)` : `Callable` function
        Function to differentiate.
    v : `Vector` | `np.array`
        1D array representing vector `v=(x0,...,xi)`.
    h : `float`. By default it is set to `1e-4`
        The step size used to approximate the derivative.

    Returns
    -------
    grad_f : `List[np.array]`
        A list containing the estimated gradients of each component of `f` evaluated at `v`.
    """
    return [partial_difference_quotient(f, v, i, h) for i in range(len(v))]
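

# --- Illustrative usage sketch (editorial addition, not part of the packaged file) ---
# Estimating the gradient of f(v) = v0**2 + v1**2 at v = (1, 2) with the helpers
# above; the exact gradient is (2, 4) and the forward-difference estimate is close.
# The helper name `_example_estimate_gradient` is hypothetical.
def _example_estimate_gradient() -> None:
    """Sketch: forward-difference gradient of a simple quadratic."""

    def f(v):
        return v[0] ** 2 + v[1] ** 2

    grad = estimate_gradient(f, np.array([1.0, 2.0]))
    print(grad)  # approximately [2.0001, 4.0001]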


# -------------------------------------------------------------------------


def generate_feature_yaml(
    df: pd.DataFrame, ignore_features: List[str] = None, yaml_string: bool = False
) -> Dict | str:
    """
    Generate a YAML string containing information about ordinal, numeric, and categorical features
    based on the given DataFrame.

    Parameters
    ----------
    df : `pd.DataFrame`
        The DataFrame containing the data.
    ignore_features : `List[`str`]`
        A list of features to ignore.
    yaml_string : `bool`
        If `True`, return the result as a YAML formatted string. Otherwise, return it as a dictionary. Default is `False`.

    Returns
    -------
    feature_info : `Dict` | `str`
        A dictionary with four keys ('ordinal_features', 'numeric_features', 'categorical_features', 'ignore_features')
        mapping to lists of feature names. Or a YAML formatted string if `yaml_string` is `True`.
    """
    ignore_features = ignore_features or []
    feature_info = {
        "ordinal_features": [],
        "numeric_features": [],
        "categorical_features": [],
        "ignore_features": ignore_features,
    }

    for col in df.columns:
        if col in ignore_features:
            continue

        if pd.api.types.is_numeric_dtype(df[col]):
            if pd.api.types.is_integer_dtype(df[col]) or pd.api.types.is_float_dtype(df[col]):
                feature_info["numeric_features"].append(col)
            elif pd.api.types.is_bool_dtype(df[col]):
                feature_info["ordinal_features"].append(col)  # Assuming bool can be ordinal
        elif pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_categorical_dtype(df[col]):
            feature_info["categorical_features"].append(col)
        else:
            print(f"Unknown type for feature {col}")

    if yaml_string:
        return yaml.dump(feature_info, default_flow_style=False)

    return feature_info
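

# --- Illustrative usage sketch (editorial addition, not part of the packaged file) ---
# Classifying the columns of a small DataFrame with `generate_feature_yaml`: the
# float column is reported as numeric, the string column as categorical, and `id`
# is ignored. The helper name `_example_generate_feature_yaml` is hypothetical.
def _example_generate_feature_yaml() -> None:
    """Sketch: feature-type summary for a toy DataFrame."""
    df = pd.DataFrame({"id": [1, 2, 3], "age": [25.0, 31.0, 47.0], "city": ["A", "B", "A"]})
    info = generate_feature_yaml(df, ignore_features=["id"])
    print(info["numeric_features"], info["categorical_features"])  # ['age'] ['city']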


def cal_missing_values(df: pd.DataFrame) -> None:
    """Calculate the percentage of missing (`NaN`/`NaT`) values per column in a DataFrame.

    Parameters
    ----------
    df : `pd.DataFrame`
        The input dataframe.

    Returns
    -------
    `None` : Prints out a table with columns as index and percentages of missing values as data.
    """

    col = df.columns
    print("Total size :", "{:,}".format(len(df)))
    for i in col:
        print(
            str(i) + " : " f"{(df.isnull().sum()[i]/(df.isnull().sum()[i]+df[i].count()))*100:.2f}%"
        )


def cdf(
    x: np.ndarray,
    poly: int = 9,
    inv: bool = False,
    plot: bool = False,
    savename: str | None = None,
    key: str | None = None,
) -> tuple:
    """Calculates the cumulative distribution function of the data.

    Parameters
    ----------
    x : `np.ndarray`
        An array containing the data.
    poly : `int`
        Degree of the polynomial fit. By default it is set to `9`.
    inv : `bool`
        If True, calculate the inverse CDF (quantile function).
    plot : `bool`
        If True, plot the results.
    savename : `str` or `None`, optional
        Filename to save the plot.
    key : `str` or `None`, optional
        Additional information to display with the chart title.

    Returns
    -------
    fit : `np.poly1d`
        Polynomial fit of the CDF or quantile function.
    cdf_values : `np.ndarray`
        Cumulative distribution values.
    sorted_x : `np.ndarray`
        Sorted input data.
    """

    if len(x) == 0:
        raise ValueError("Input array 'x' must not be empty.")

    cdf_values = np.cumsum(x) / np.sum(x)
    sorted_x = np.sort(x)

    probabilities = np.linspace(0, 1, len(sorted_x))

    if inv:
        fit = np.polyfit(probabilities, sorted_x, poly)
        f = np.poly1d(fit)
        plot_label = "Quantile Function"
        x_values = probabilities
        y_values = sorted_x
    else:
        fit = np.polyfit(sorted_x, probabilities, poly)
        f = np.poly1d(fit)
        plot_label = (
            "Cumulative Distribution Function"
            if key is None
            else f"Cumulative Distribution Function ({key})"
        )
        x_values = sorted_x
        y_values = cdf_values

    if plot:
        plt.figure()
        plt.plot(x_values, y_values, "o", label="data")
        plt.plot(x_values, f(x_values), "r--", label="fit")
        plt.title(plot_label)
        plt.xlabel("Probability" if inv else "Value")
        plt.ylabel("Value" if inv else "Probability")
        plt.legend()
        if savename:
            plt.savefig(savename, dpi=300)
        plt.show()

    return f, cdf_values, sorted_x


def calculate_probability(x: np.ndarray, points: int = 1, cond: bool = True) -> np.ndarray:
    """Calculates the probability of the data based on the CDF fit.

    Parameters
    ----------
    x : `np.ndarray`
        An array containing the data.
    points : `int`
        Number of points to consider for the final probability calculation.
    cond : `bool`
        Condition to use product (True) or sum (False) for the final probability check.

    Returns
    -------
    p : `np.ndarray`
        Array containing the probabilities of the data.
    """

    if len(x) == 0:
        raise ValueError("Input array 'x' must not be empty.")

    fit, _, sorted_x = cdf(x)
    p = fit(x)

    if cond:
        prob_value = np.prod(p[-points])
        message = "product"
    else:
        prob_value = np.sum(p[-points])
        message = "sum"

    if 0 <= prob_value <= 1:
        print(f"The model has a probability of {prob_value * 100:.2f}% based on the {message}.")
    else:
        print("\nThe probability of the data cannot be calculated.\n")

    return p


class CorrelationBase:
    """Base class for correlation calculations."""

    __slots__ = ["x", "y", "result", "z"]

    def __init__(self, x: np.ndarray, y: Union[np.ndarray, None] = None):
        self.x = x
        self.y = y if y is not None else x
        self._compute_correlation()
        self.z = self.result[self.result.size // 2 :]
        self.z /= np.abs(self.z).max()

    def _compute_correlation(self):
        """Compute the correlation between x and y (or x with itself for autocorrelation)."""
        self.result = np.correlate(self.x, self.y, mode="full")

    def plot(self):
        """Plot the correlation or autocorrelation."""
        plt.plot(range(len(self.z)), self.z, label=self._get_label())
        plt.legend()
        plt.show()

    def _get_label(self) -> str:
        return "Autocorrelation" if np.array_equal(self.x, self.y) else "Correlation"

    def __call__(self):
        """Return the computed correlation or autocorrelation."""
        return self.z


class Correlation(CorrelationBase):
    """Calculates the cross-correlation of two datasets.

    Parameters
    ----------
    x : `np.ndarray`
        An array containing the first dataset.
    y : `np.ndarray`
        An array containing the second dataset.

    Returns
    -------
    z : `np.ndarray`
        An array containing the correlation of `x` and `y`.
    """

    def __init__(self, x: np.ndarray, y: np.ndarray):
        super().__init__(x, y)


class AutoCorrelation(CorrelationBase):
    """Calculates the autocorrelation of a dataset.

    Parameters
    ----------
    x : `np.ndarray`
        An array containing the data.

    Returns
    -------
    z : `np.ndarray`
        An array containing the autocorrelation of the data.
    """

    def __init__(self, x: np.ndarray):
        super().__init__(x)


def fft_denoise(
    dataset: np.ndarray, sigma: float = 0, mode: bool = True
) -> Tuple[np.ndarray, np.ndarray]:
    """Performs noise removal using the Fast Fourier Transform.

    Parameters
    ----------
    dataset : `np.ndarray`
        An array containing the noised data. Expected shape (num_samples, num_points).
    sigma : `float`, default=0
        A float between 0 and 1 representing the threshold for noise filtering.
    mode : `bool`, default=True
        If True, print progress messages.

    Returns
    -------
    denoised_dataset : `np.ndarray`
        An array containing the denoised data with the same shape as `dataset`.
    periods : `np.ndarray`
        Array of estimated periods for each sample in `dataset`.
    """

    if not (0 <= sigma <= 1):
        raise ValueError("sigma must be between 0 and 1")

    num_samples, n_points = dataset.shape
    denoised_dataset = np.zeros_like(dataset)
    periods = np.zeros(num_samples)

    freq = (1 / n_points) * np.arange(n_points)
    L = np.arange(1, np.floor(n_points / 2), dtype=int)

    for i in range(num_samples):
        fhat = np.fft.fft(dataset[i, :], n_points)
        PSD = fhat * np.conj(fhat) / n_points
        threshold = np.mean(PSD) + sigma * np.std(PSD)
        indices = PSD > threshold

        PSDclean = PSD * indices
        fhat_cleaned = fhat * indices

        denoised_signal = np.fft.ifft(fhat_cleaned).real
        denoised_dataset[i, :] = denoised_signal

        peak_index = L[np.argmax(np.abs(fhat[L]))]
        periods[i] = 1 / freq[peak_index]

        if mode:
            print(f"The {i+1}-th row of the dataset has been denoised.")
            print(f"The estimated period is {round(periods[i], 4)}")

    return denoised_dataset, periods


def get_period(dataset: np.ndarray) -> float:
    """Calculates the periodicity of a `dataset`.

    Parameters
    ----------
    dataset : `np.ndarray`
        the `dataset` describing the function over which the period is calculated.

    Returns
    -------
    period : `float`
        period of the function described by the `dataset`.
    """
    n = dataset.size

    if n < 2:
        raise ValueError("Dataset must contain at least two points.")

    fhat = np.fft.rfft(dataset)
    freqs = np.fft.rfftfreq(n)

    PSD = np.abs(fhat) ** 2 / n

    PSD[0] = 0

    max_psd_index = np.argmax(PSD)

    dominant_freq = freqs[max_psd_index]
    if dominant_freq == 0:
        raise ValueError("No significant periodic component found in the dataset.")

    period = 1 / dominant_freq

    return period


def sigmoide_inv(y: float) -> float:
    """Calculates the inverse of the sigmoid function

    Parameters
    ----------
    y : `float`
        the number to evaluate the function.

    Returns
    -------
    `float`
        value of evaluated function.
    """

    return math.log(y / (1 - y))


def sigmoide(x: float) -> float:
    """The sigmoid function"""
    return 1 / (1 + math.exp(-x))


class LogisticRegression:
    """class implementing multiple logistic regression"""

    __slots__ = ["importance", "X", "y", "w"]

    def __init__(self) -> None:
        """The class initializer"""

        self.importance = []

    def fit(self, dataset: np.ndarray, values: np.ndarray) -> None:
        """Performs linear multiple model training

        Parameters
        ----------
        dataset : `np.ndarray`
            An array containing the scaled data.
        values : `np.ndarray`
            A set of values returned by the linear function.

        Returns
        -------
        importance : `np.ndarray`
            An array containing the importance of each feature.

        """

        self.X = dataset
        self.y = values

        U, S, VT = np.linalg.svd(self.X, full_matrices=False)

        inverse_sig = np.vectorize(sigmoide_inv)
        self.w = (VT.T @ np.linalg.inv(np.diag(S)) @ U.T).T @ inverse_sig(self.y)

        if self.y.shape[1] > 1:
            for row in self.w:
                self.importance.append(np.around(np.max(row), decimals=8))
        else:
            for i in range(self.X.shape[0]):
                a = np.around(self.w[i], decimals=8)
                self.importance.append(a)

    def predict(self, datapoints: np.ndarray) -> np.ndarray:
        """
        Performs predictions for a set of points

        Parameters
        ----------
        datapoints : `np.ndarray`
            An array containing the values of the independent variable.

        Returns
        -------
        `np.ndarray`

        """
        sig = np.vectorize(sigmoide)

        return sig(np.array(self.importance) @ datapoints)

    def get_importances(self, print_important_features: bool = False) -> np.ndarray:
        """
        Returns the important features

        Parameters
        ----------
        print_important_features : `bool`
            determines whether or not are printed on the screen. By default it is set to `False`.

        Returns
        -------
        importance : `np.ndarray`
            An array containing the importance of each feature.
        """
        if print_important_features:
            for i, a in enumerate(self.importance):
                print(f"The importance of the {i+1} feature is {a}")
        return np.array(self.importance)


class LinearRegression:
    """class implementing multiple linear regression"""

    __slots__ = ["importance", "X", "y", "w"]

    def __init__(self) -> None:
        """The class initializer"""

        self.importance = []

    def fit(self, dataset: np.ndarray, values: np.ndarray, verbose: bool = False) -> None:
        """Performs linear multiple model training

        Parameters
        ----------
        dataset : `np.ndarray`
            An array containing the scaled data.
        values : `np.ndarray`
            A set of values returned by the linear function.

        Returns
        -------
        `None` : The function doesn't return anything.
        """

        self.X = dataset
        self.y = values

        U, S, VT = np.linalg.svd(self.X, full_matrices=False)
        self.w = (VT.T @ np.linalg.inv(np.diag(S)) @ U.T).T @ self.y

        for i in range(self.X.shape[0]):
            a = np.around(self.w[i], decimals=8)
            self.importance.append(a)

        if verbose:
            print("\nSummary:")
            print("--------")
            print("\nParameters:", np.array(self.importance).shape)
            print("RMSE: {:.4f}".format(mean_square_error(self.y, self.predict(self.X))))

    def predict(self, datapoints: np.ndarray) -> np.ndarray:
        """
        Performs predictions for a set of points

        Parameters
        ----------
        datapoints : `np.ndarray`
            An array containing the values of the independent variable.

        """
        return np.array(self.importance) @ datapoints

    def get_importances(self, print_important_features: bool = False) -> np.ndarray:
        """
        Returns the important features

        Parameters
        ----------
        print_important_features : `bool`
            determines whether or not are printed on the screen. By default it is set to `False`.

        Returns
        -------
        importance : `np.ndarray`
            An array containing the importance of each feature.
        """
        if print_important_features:
            for i, a in enumerate(self.importance):
                print(f"The importance of the {i+1} feature is {a}")
        return np.array(self.importance)


def cal_average(y: np.ndarray, alpha: float = 1):
    """Calculates the moving average of the data

    Parameters
    ----------
    y : `np.ndarray`
        An array containing the data.
    alpha : `float`
        A `float` between `0` and `1`. By default it is set to `1`.

    Returns
    -------
    average : `float`
        The average of the data.
    """

    window_size = int(alpha * len(y))
    if len(y) < 2:
        return y

    n = min(window_size, len(y))
    if n <= 1:
        return y

    padded = np.pad(y, n // 2, mode="edge")
    return np.convolve(padded, np.ones(n) / n, mode="valid")[: len(y)]


class DataScaler:
    """numpy array `scaler` and `rescaler`"""

    __slots__ = ["dataset_", "_n", "data_scaled", "values", "inv_fitting"]

    def __init__(self, dataset: np.ndarray, n: int | None = 1) -> None:
        """Initializes the parameters required for scaling the data"""
        self.dataset_ = dataset.copy()
        self._n = n

    def rescale(self, dataset_: np.ndarray | None = None) -> np.ndarray:
        """Perform a standard rescaling of the data

        Returns
        -------
        data_scaled : `np.ndarray`
            An array containing the scaled data.
        """
        if isinstance(dataset_, np.ndarray):
            data_scaled = np.copy(dataset_)
            mu = self.values[0]
            sigma = self.values[1]
            f = self.values[2]
            data_scaled = data_scaled.reshape((self.dataset_.shape[0], -1))
            for i in range(self.dataset_.shape[0]):
                if self._n != None:
                    poly = f[i](self.inv_fitting[i](data_scaled[i]))
                    data_scaled[i] += -poly
                data_scaled[i] = 2 * ((data_scaled[i] - mu[i]) / sigma[i]) - 1
            return data_scaled
        else:
            self.data_scaled = np.copy(self.dataset_.copy())

            mu = []
            sigma = []
            fitting = []
            self.inv_fitting = []

            try:
                xaxis = range(self.dataset_.shape[1])
            except:
                error_type = "IndexError"
                msg = "Trying to access an item at an invalid index."
                print(f"{error_type}: {msg}")
                return None
            for i in range(self.dataset_.shape[0]):
                if self._n != None:
                    fit = np.polyfit(xaxis, self.dataset_[i, :], self._n)
                    inv_fit = np.polyfit(self.dataset_[i, :], xaxis, self._n)
                    f = np.poly1d(fit)
                    poly = f(xaxis)
                    fitting.append(f)
                    self.inv_fitting.append(inv_fit)
                    self.data_scaled[i, :] += -poly
                else:
                    fitting.append(0.0)
                    self.inv_fitting.append(0.0)
                mu.append(np.min(self.data_scaled[i, :]))
                if np.max(self.data_scaled[i, :]) != 0:
                    sigma.append(np.max(self.data_scaled[i, :]) - mu[i])
                else:
                    sigma.append(1)

                self.data_scaled[i, :] = 2 * ((self.data_scaled[i, :] - mu[i]) / sigma[i]) - 1

            self.values = [mu, sigma, fitting]

            return self.data_scaled

    def scale(self, dataset_: np.ndarray) -> np.ndarray:
        """Performs the inverse operation to the rescale function

        Parameters
        ----------
        dataset_ : `np.ndarray`
            An array containing the scaled values.

        Returns
        -------
        dataset_ : `np.ndarray`
            An array containing the rescaled data.
        """
        for i in range(dataset_.shape[0]):
            dataset_[i, :] += 1
            dataset_[i, :] /= 2
            dataset_[i, :] = dataset_[i, :] * self.values[1][i]
            dataset_[i, :] += self.values[0][i]
            if self._n != None:
                dataset_[i, :] += self.values[2][i](range(dataset_.shape[1]))

        return dataset_
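

# --- Illustrative usage sketch (editorial addition, not part of the packaged file) ---
# Round-tripping a (rows, time-steps) array through `DataScaler`: `rescale` detrends
# each row with a degree-1 polynomial and maps it into [-1, 1], and `scale` undoes
# both steps. The helper name `_example_data_scaler` is hypothetical.
def _example_data_scaler() -> None:
    """Sketch: scale and then recover a small random dataset."""
    data = np.random.rand(2, 50)
    scaler = DataScaler(data, n=1)
    scaled = scaler.rescale()  # operates on the internal copy made by the initializer
    restored = scaler.scale(scaled.copy())
    print(np.allclose(restored, data))  # True, up to floating-point error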


def generate_series(n: int, n_steps: int, incline: bool = True):
    """Function that generates `n` series of length `n_steps`"""
    freq1, freq2, offsets1, offsets2 = np.random.rand(4, n, 1)

    if incline:
        slope = np.random.rand(n, 1)
    else:
        slope = 0.0
        offsets2 = 1

    time = np.linspace(0, 1, n_steps)
    series = 0.5 * np.sin((time - offsets1) * (freq1 * 10 + 10))  # wave 1
    series += 0.2 * np.sin((time - offsets2) * (freq2 * 20 + 20))  # + wave 2
    series += 0.7 * (np.random.rand(n, n_steps) - 0.5)  # + noise
    series += 5 * slope * time + 2 * (offsets2 - offsets1) * time ** (1 - offsets2)
    series = series
    return series.astype(np.float32)


def mean_square_error(y_true: np.ndarray, y_pred: np.ndarray, print_error: bool = False):
    """Calculates the Root Mean Squared Error

    Parameters
    ----------
    y_true : `np.ndarray`
        An array containing the true values.
    y_pred : `np.ndarray`
        An array containing the predicted values.

    Returns
    -------
    RMSE : `float`
        The Root Mean Squared Error.
    """
    if print_error:
        print(f"The RMSE is {np.sqrt(np.mean((y_true - y_pred)**2))}")

    return np.sqrt(np.mean((y_true - y_pred) ** 2))


class DataFrameEncoder:
    """Allows encoding and decoding Dataframes"""

    __slots__ = [
        "_df",
        "_names",
        "_encode_columns",
        "encoding_list",
        "decoding_list",
        "median_list",
    ]

    def __init__(self, data: pd.DataFrame) -> None:
        """Sets the columns of the `DataFrame`"""
        self._df = data.copy()
        self._names = data.columns
        self._encode_columns = []
        self.encoding_list = []
        self.decoding_list = []
        self.median_list = []

    def load_config(self, path_to_dictionaries: str = "./", **kwargs) -> None:
        """Loads dictionaries from a given directory

        Keyword Arguments
        -----------------
        - dictionary_name (`str`): An optional string parameter. By default it is set to `labelencoder_dictionary`
        """
        dictionary_name = (
            kwargs["dictionary_name"] if "dictionary_name" in kwargs else "labelencoder_dictionary"
        )
        with open(os.path.join(path_to_dictionaries, dictionary_name + ".pkl"), "rb") as file:
            labelencoder = pickle.load(file)
            self.encoding_list = labelencoder[0]
            self.decoding_list = labelencoder[1]
            self._encode_columns = labelencoder[2]
            self.median_list = labelencoder[3]
        print("Configuration successfully uploaded")

    def train(self, path_to_save: str, **kwargs) -> None:
        """Trains the encoders and decoders using the `DataFrame`"""
        save_mode = kwargs["save_mode"] if "save_mode" in kwargs else True
        dictionary_name = (
            kwargs["dictionary_name"] if "dictionary_name" in kwargs else "labelencoder_dictionary"
        )
        norm_method = kwargs["norm_method"] if "norm_method" in kwargs else "None"
        for i in self._names:
            if self._df[i].dtype == "object":
                self._encode_columns.append(i)
                column_index = range(len(self._df[i].unique()))
                column_keys = self._df[i].unique()
                encode_dict = dict(zip(column_keys, column_index))
                decode_dict = dict(zip(column_index, column_keys))
                self._df[i] = self._df[i].apply(
                    self._code_transformation_to, dictionary_list=encode_dict
                )
                if len(self._df[i].unique()) > 1:
                    median_value = len(self._df[i].unique()) // 2
                else:
                    median_value = 1.0
                if norm_method == "median":
                    self._df[i] = self._df[i].astype("float64")
                    self._df[i] = self._df[i] / median_value
                    self.median_list.append(median_value)
                self.encoding_list.append(encode_dict)
                self.decoding_list.append(decode_dict)
        if save_mode:
            self._save_encoder(path_to_save, dictionary_name)

    def encode(self, path_to_save: str = "./", **kwargs) -> pd.DataFrame:
        """Encodes the `object` type columns of the dataframe

        Keyword Arguments
        -----------------
        - save_mode (`bool`): An optional integer parameter. By default it is set to `True`
        - dictionary_name (`str`): An optional string parameter. By default it is set to `labelencoder_dictionary`
        - norm_method (`str`): An optional string parameter to perform normalization. By default it is set to `None`
        """
        if len(self.encoding_list) == 0:
            self.train(path_to_save, **kwargs)
            return self._df

        else:
            print("Configuration detected")
            if len(self.median_list) == len(self._encode_columns):
                median_mode = True
            else:
                median_mode = False
            for num, colname in enumerate(self._encode_columns):
                if self._df[colname].dtype == "object":
                    encode_dict = self.encoding_list[num]
                    self._df[colname] = self._df[colname].apply(
                        self._code_transformation_to, dictionary_list=encode_dict
                    )
                    if median_mode:
                        self._df[colname] = self._df[colname].astype("float64")
                        self._df[colname] = self._df[colname] / self.median_list[num]
            return self._df

    def decode(self) -> pd.DataFrame:
        """Decodes the `int` type columns of the `DataFrame`"""
        j = 0
        df_decoded = self._df.copy()
        if len(self.median_list) == len(self._encode_columns):
            median_mode = True
        else:
            median_mode = False
        try:
            number_of_columns = len(self.decoding_list[j])
            for i in self._encode_columns:
                if df_decoded[i].dtype == "int64" or df_decoded[i].dtype == "float64":
                    if median_mode:
                        df_decoded[i] = df_decoded[i] * self.median_list[j]
                        df_decoded[i] = df_decoded[i].astype("int64")
                    df_decoded[i] = df_decoded[i].apply(
                        self._code_transformation_to, dictionary_list=self.decoding_list[j]
                    )
                    j += 1
            return df_decoded
        except AttributeError as e:
            warning_type = "UserWarning"
            msg = "It is not possible to decode the dataframe, since it has not been encoded"
            msg += "Error: {%s}" % e
            print(f"{warning_type}: {msg}")

    def get_dictionaries(self) -> Tuple[List[dict], List[dict]]:
        """Allows to return the `list` of dictionaries for `encoding` and `decoding`"""
        try:
            return self.encoding_list, self.decoding_list
        except ValueError as e:
            warning_type = "UserWarning"
            msg = "It is not possible to return the list of dictionaries as they have not been created."
            msg += "Error: {%s}" % e
            print(f"{warning_type}: {msg}")

    def _save_encoder(self, path_to_save: str, dictionary_name: str) -> None:
        """Method to serialize the `encoding_list`, `decoding_list` and `_encode_columns` list"""
        with open(path_to_save + dictionary_name + ".pkl", "wb") as f:
            pickle.dump(
                [self.encoding_list, self.decoding_list, self._encode_columns, self.median_list], f
            )

    def _code_transformation_to(self, character: str, dictionary_list: List[dict]) -> int:
        """Auxiliary function to perform data transformation using a dictionary

        Parameters
        ----------
        character : `str`
            A character data type.
        dictionary_list : List[`dict`]
            An object of dictionary type.

        Returns
        -------
        dict_type[`character`] or `np.nan` if dict_type[`character`] doesn't exist.
        """
        try:
            return dictionary_list[character]
        except:
            return np.nan


class PerformanceMeasures:
    """Class with methods to measure performance"""

    def __init__(self) -> None:
        pass

    def f_mean(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> float:
        F_vec = self._f1_score(y_true, y_pred, labels)
        mean_f_measure = np.mean(F_vec)
        mean_f_measure = np.around(mean_f_measure, decimals=4)

        for label, f_measure in zip(labels, F_vec):
            print(f"F-measure of label {label} -> {f_measure}")

        print(f"Mean of F-measure -> {mean_f_measure}")

        return mean_f_measure

    def resp(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> float:
        T_C = len(y_true)
        sum1, sum2 = 0.0, 0.0
        F_vec = self._f1_score(y_true, y_pred, labels)

        for label_idx, label in enumerate(labels):
            class_instances = np.sum(y_true == label) / T_C
            sum1 += (1 - class_instances) * F_vec[label_idx]
            sum2 += 1 - class_instances

        res_p = sum1 / sum2 if sum2 != 0 else 0.0
        print(f"Metric Res_p -> {res_p}")

        return res_p

    def _summary_pred(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> None:
        count_mat = self._confu_mat(y_true, y_pred, labels)
        print(" " * 6, " | ".join(f"--{label}--" for label in labels))
        for i, label_i in enumerate(labels):
            row = [f" {int(count_mat[i, j]):5d} " for j in range(len(labels))]
            print(f"--{label_i}--|", " | ".join(row))

    def _f1_score(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> np.ndarray:
        count_mat = self._confu_mat(y_true, y_pred, labels)
        sum_cols = np.sum(count_mat, axis=0)
        sum_rows = np.sum(count_mat, axis=1)

        precision = np.divide(
            count_mat.diagonal(), sum_cols, out=np.zeros_like(sum_cols), where=sum_cols != 0
        )
        recall = np.divide(
            count_mat.diagonal(), sum_rows, out=np.zeros_like(sum_rows), where=sum_rows != 0
        )
        f1_vec = 2 * ((precision * recall) / (precision + recall))
        f1_vec = np.around(f1_vec, decimals=4)

        return f1_vec

    def _confu_mat(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> np.ndarray:
        num_classes = len(labels)
        label_mapping = {label: idx for idx, label in enumerate(labels)}
        count_mat = np.zeros((num_classes, num_classes))

        for pred_label, true_label in zip(y_pred, y_true):
            if pred_label in label_mapping and true_label in label_mapping:
                count_mat[label_mapping[pred_label], label_mapping[true_label]] += 1

        return count_mat


class OneHotEncoder:
    """
    Class used to encode categorical variables.
    It receives an array of integers and returns a binary array using the one-hot encoding method.
    """

    __slots__ = ["num_categories"]

    def __init__(self) -> None:
        self.num_categories = None

    def encode(self, x: np.ndarray | list, fit: bool = True):
        if not isinstance(x, np.ndarray):
            x = np.array(x)
        valid_mask = ~np.isnan(x)
        x_int = x[valid_mask].astype(int)

        if fit:
            self.num_categories = x_int.max() + 1
        else:
            if np.any(x_int >= self.num_categories):
                self.num_categories = max(self.num_categories, x_int.max() + 1)
        y = np.zeros((x.shape[0], self.num_categories))
        y[np.where(valid_mask)[0], x_int] = 1

        return y

    def decode(self, x: np.ndarray | list) -> np.ndarray:
        if not isinstance(x, np.ndarray):
            x = np.array(x)

        # rows that have at least one 1 (i.e., were valid)
        valid_mask = x.sum(axis=1) > 0

        y = np.full(x.shape[0], np.nan)
        y[valid_mask] = np.argmax(x[valid_mask], axis=1)
        return y


class FeatureSelection:
    """
    Generate the data graph using a variation of the feature selection algorithm.

    - The method `get_digraph` returns the network based on the feature selection method.
    """

    __slots__ = ["not_features", "X", "all_features_imp_graph", "w_dict", "scaler"]

    def __init__(self, not_features: list[str] = []) -> None:
        """The initializer of the class. The initial parameter is a list of strings with variables to discard."""
        self.not_features: List[str] = not_features
        self.all_features_imp_graph: List[Tuple] = []
        self.w_dict = dict()

    def get_digraph(
        self, dataset: pd.DataFrame, n_importances: int, use_scaler: bool = False
    ) -> str:
        """
        Get directed graph showing importance of features.

        Parameters
        ----------
        dataset : `pd.DataFrame`
            Dataset to be used for generating the graph.
        n_importances : `int`
            Number of top importances to show in the graph.

        Returns
        -------
        `str`
            A string representation of the directed graph.
        """
        self._load_data(dataset)

        curr_dataset = self.X
        columns = list(curr_dataset.columns)

        feature_string = " digraph { "
        for column in columns:
            feature_string += column + "; "

        numeric_df = curr_dataset.select_dtypes(include="number")
        if use_scaler:
            self.scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
            numeric_scaled = self.scaler.rescale()
            numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
            curr_dataset[numeric_df.columns] = numeric_df

        numeric_dict = dict(zip(list(numeric_df.columns), range(len(list(numeric_df.columns)))))

        for index_column, column in enumerate(columns):
            Y = curr_dataset[column]
            column_type = Y.dtype
            if column_type != "object":
                Model = LinearRegression()
                X_aux = curr_dataset.drop([column], axis=1)
                dfe = DataFrameEncoder(X_aux)
                encoded_df = dfe.encode(save_mode=False)
                Model.fit(encoded_df.to_numpy().T, Y.to_numpy().T)
                importance = Model.get_importances()
                w = Model.w
            else:
                Model = LogisticRegression()
                num_unique_entries = curr_dataset[column].nunique()
                quick_encoder = DataFrameEncoder(Y.to_frame())
                encoded_Y = quick_encoder.encode(save_mode=False)
                one_hot = OneHotEncoder()
                train_y = one_hot.encode(encoded_Y[column])
                for i in range(len(train_y)):
                    for j in range(num_unique_entries):
                        if train_y[i][j] == 1.0:
                            train_y[i][j] = 0.73105
                        else:
                            train_y[i][j] = 0.5
                X_aux = curr_dataset.drop([column], axis=1)
                dfe = DataFrameEncoder(X_aux)
                encoded_df = dfe.encode(save_mode=False)
                Model.fit(encoded_df.to_numpy().T, train_y)
                importance = Model.get_importances()
                w = Model.w
            top_n_indexes = sorted(
                range(len(importance)), key=lambda i: importance[i], reverse=True
            )[:n_importances]

            names_cols = list(X_aux.columns)
            features_imp_node = [
                (names_cols[top_n_indexes[i]], importance[top_n_indexes[i]])
                for i in range(n_importances)
            ]

            if column_type != "object":
                self.w_dict[column] = (w, None, names_cols, dfe, numeric_dict)
            else:
                self.w_dict[column] = (w, quick_encoder, names_cols, dfe, numeric_dict)
            self.all_features_imp_graph.append((column, features_imp_node))
            for i in top_n_indexes:
                feature_string += names_cols[i] + " -> "

            feature_string += column + "; "

        return feature_string + "} "

    def _load_data(self, dataset: pd.DataFrame):
        if len(self.not_features) > 0:
            self.X = dataset.drop(columns=self.not_features)

        else:
            self.X = dataset

        self.X.replace([np.inf, -np.inf], np.nan, inplace=True)
        self.X.replace(" ", np.nan, inplace=True)
        self.X.dropna(inplace=True)
        self.X = self.X.reset_index()
        self.X = self.X.drop(columns=["index"])


def check_nan_inf(df: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
    """
    Checks for NaN and Inf values in the DataFrame. If any are found, they will be removed.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame to be checked.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with NaN and Inf values removed.
    """

    nan_values = df.isnull().values.any()
    inf_values = np.isinf(df.select_dtypes(include="number")).values.any()

    nan_count = df.isnull().values.sum()
    inf_count = np.isinf(df.select_dtypes(include="number")).values.sum()

    if nan_values:
        (
            print(
                "UserWarning: Some rows may have been deleted due to the existence of NaN values.",
                f"NaN values removed: ",
                "{:,}".format(nan_count),
            )
            if verbose
            else None
        )
        df.dropna(inplace=True)

    if inf_values:
        (
            print(
                "UserWarning: Some rows may have been deleted due to the existence of Inf values.",
                f"Infinite values removed: ",
                "{:,}".format(inf_count),
            )
            if verbose
            else None
        )
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df.dropna(inplace=True)

    return df
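

# --- Illustrative usage sketch (editorial addition, not part of the packaged file) ---
# Running `check_nan_inf` on a frame with one NaN row and one Inf row: both rows are
# dropped and the cleaned frame is returned. The helper name `_example_check_nan_inf`
# is hypothetical.
def _example_check_nan_inf() -> None:
    """Sketch: drop rows containing NaN or Inf values."""
    df = pd.DataFrame({"a": [1.0, np.nan, 3.0, np.inf], "b": [1.0, 2.0, 3.0, 4.0]})
    clean = check_nan_inf(df, verbose=True)
    print(len(clean))  # 2 -> only the fully finite rows remain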


# -------------------------------------------------------------------------
if __name__ == "__main__":
    y_true = np.array([1, 2, 2, 1, 1])
    y_pred = np.array([1, 1, 2, 2, 1])

    labels = [1, 2]
    helper = PerformanceMeasures()
    helper._summary_pred(y_true, y_pred, labels)
    print(helper._f1_score(y_true, y_pred, labels))
    print(helper.f_mean(y_true, y_pred, labels))

    # Use DataFrameEncoder
    data = {"Name": ["John", "Alice", "Bob", "Jafet", "Beto"], "Age": [25, 30, 35, 21, 28]}
    import pandas as pd

    df = pd.DataFrame(data)
    # Instantiate DataFrameEncoder
    dfe = DataFrameEncoder(df)
    # Encode the dataframe
    encoded_df = dfe.encode(norm_method="median")
    # Decode the dataframe
    decoded_df = dfe.decode()

    # Instantiate DataFrameEncoder
    # Use load_config method
    dfe2 = DataFrameEncoder(df)
    dfe2.load_config()

    encoded_df2 = dfe2.encode()
    # Decode the dataframe
    decoded_df2 = dfe2.decode()
    # Check if the loaded dictionaries match the original ones
    assert dfe.encoding_list == dfe2.encoding_list
    assert dfe.decoding_list == dfe2.decoding_list

    # Generate data
    x = np.random.rand(3, 100)
    y = 0.1 * x[0, :] + 0.4 * x[1, :] + 0.5 * x[2, :] + 0.1
    # Create a DataFrame
    df = pd.DataFrame(x.T, columns=["x1", "x2", "x3"])
    df["y"] = y
    # Instantiate FeatureSelection
    fs = FeatureSelection()
    print(fs.get_digraph(df, n_importances=1))

    linear_model = LinearRegression()
    linear_model.fit(x, y)
    importance = linear_model.get_importances()
    y_hat = linear_model.predict(x)

    # Graph the data for visualization
    plt.plot(x[0, :], y, "o", label="Original Data")
    plt.plot(x[0, :], y_hat, "x", label="$\\hat{y}$")
    plt.legend()
    plt.xlabel("$x$")
    plt.ylabel("$y, \\hat{y}$")
    plt.show()

    a = generate_series(1, 40, incline=False)
    # Graph the data for visualization
    plt.plot(range(len(a[0, :])), a[0, :], label="Original Data")
    plt.legend()
    plt.xlabel("Time periods")
    plt.ylabel("$y(t)$")
    plt.show()

    a_denoise, _ = fft_denoise(a)

    plt.plot(range(len(a_denoise[0, :])), a_denoise[0, :], label="Denoise Data")
    plt.legend()
    plt.xlabel("Time periods")
    plt.ylabel("$y(t)$")
    plt.show()

    # Calculate the autocorrelation of the data
    z = AutoCorrelation(a[0, :])
    z.plot()
    # print(z())

    N = 1000
    mu = np.random.uniform(0, 10.0)
    sigma = np.random.uniform(0.1, 1.0)
    x = np.random.normal(mu, sigma, N)
    f, cdf_, ox = cdf(x, plot=True)
    invf, cdf_, ox = cdf(x, plot=True, inv=True)

    encoder = OneHotEncoder()
    encoding = encoder.encode([1, 2, 3, 4, 5])
    assert np.array_equal(
        encoding,
        np.array(
            [
                [0, 1, 0, 0, 0, 0],
                [0, 0, 1, 0, 0, 0],
                [0, 0, 0, 1, 0, 0],
                [0, 0, 0, 0, 1, 0],
                [0, 0, 0, 0, 0, 1],
            ]
        ),
    )