likelihood 2.2.0.dev1__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1336 @@
1
+ import math
2
+ import os
3
+ import pickle
4
+ import warnings
5
+ from typing import Callable, Dict, Generator, List, Tuple, Union
6
+
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import pandas as pd
10
+ import yaml
11
+ from packaging import version
12
+
13
+ if version.parse(np.__version__) < version.parse("2.0.0"):
14
+ rank_warning = np.RankWarning  # avoid shadowing the built-in filter()
15
+ else:
16
+ rank_warning = np.exceptions.RankWarning
17
+
18
+ warnings.simplefilter("ignore", rank_warning)
19
+
20
+ # -------------------------------------------------------------------------
21
+
22
+ """
23
+ Data Science from Scratch, Second Edition, by Joel Grus (O'Reilly). Copyright 2019 Joel Grus, 978-1-492-04113-9.
24
+ """
25
+
26
+
27
+ def minibatches(dataset: List, batch_size: int, shuffle: bool = True) -> Generator:
28
+ """Generates 'batch_size'-sized minibatches from the dataset
29
+
30
+ Parameters
31
+ ----------
32
+ dataset : `List`
33
+ The data to be divided into mini-batches.
34
+ batch_size : `int`
35
+ Specifies the size of each mini-batch.
36
+ shuffle : `bool`
37
+ If set `True`, the data will be shuffled before dividing it into mini-batches.
38
+
39
+ Returns
40
+ -------
41
+ `Generator`
42
+ A generator that yields the mini-batches. Each mini-batch is a slice of the dataset with length at most `batch_size`.
43
+ """
44
+
45
+ # start indexes 0, batch_size, 2 * batch_size, ...
46
+ batch_starts = [start for start in range(0, len(dataset), batch_size)]
47
+
48
+ if shuffle:
49
+ np.random.shuffle(batch_starts) # shuffle the batches
50
+
51
+ for start in batch_starts:
52
+ end = start + batch_size
53
+ yield dataset[start:end]
54
+
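+ # Usage sketch (illustrative, not part of the packaged module): batching ten
+ # integers with batch_size=3 and shuffle=False yields the slices in order.
+ #     for batch in minibatches(list(range(10)), batch_size=3, shuffle=False):
+ #         print(batch)   # [0, 1, 2], [3, 4, 5], [6, 7, 8], [9]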
55
+
56
+ def difference_quotient(f: Callable, x: float, h: float) -> float:
57
+ """Calculates the difference quotient of `f` evaluated at `x` and `x + h`
58
+
59
+ Parameters
60
+ ----------
61
+ f : `Callable`
62
+ The function to differentiate.
63
+ x : `float`
64
+ Independent term.
65
+ h : `float`
66
+ Step size.
67
+
68
+ Returns
69
+ -------
70
+ `(f(x + h) - f(x)) / h` : `float`
71
+ Difference quotient of `f` evaluated at `x`.
72
+ """
73
+ return (f(x + h) - f(x)) / h
74
+
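+ # Usage sketch (illustrative, not part of the packaged module): the forward
+ # difference quotient of f(x) = x**2 at x = 3 with h = 1e-3 is 6.001, close to
+ # the exact derivative f'(3) = 6.
+ #     difference_quotient(lambda x: x ** 2, x=3.0, h=1e-3)   # -> 6.001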
75
+
76
+ def partial_difference_quotient(f: Callable, v: np.ndarray, i: int, h: float) -> float:
77
+ """Calculates the partial difference quotient of `f`
78
+
79
+ Parameters
80
+ ----------
81
+ f : `Callable`
82
+ Function of the vector `v = (x0, ..., xn)` to differentiate.
83
+ v : `np.ndarray`
84
+ 1D array representing the vector `v = (x0, ..., xn)`.
+ i : `int`
+ Index of the component with respect to which the quotient is taken.
85
+ h : `float`
86
+ Step size.
87
+
88
+ Returns
89
+ -------
90
+ `(f(w) - f(v)) / h` : `float`
91
+ The `i`-th partial difference quotient of `f` at `v`.
92
+ """
93
+ w = [
94
+ v_j + (h if j == i else 0) for j, v_j in enumerate(v) # add h to just the ith element of v
95
+ ]
96
+ return (f(w) - f(v)) / h
97
+
98
+
99
+ def estimate_gradient(f: Callable, v: np.ndarray, h: float = 1e-4) -> List[float]:
100
+ """Calculates the gradient of `f` at `v`
101
+
102
+ Parameters
103
+ ----------
104
+ f : `Callable`
105
+ Function to differentiate.
106
+ v : `Vector` | `np.array`
107
+ 1D array representing vector `v=(x0,...,xi)`.
108
+ h : `float`. By default it is set to `1e-4`
109
+ The step size used to approximate the derivative.
110
+
111
+ Returns
112
+ -------
113
+ grad_f : `List[float]`
114
+ A list containing the estimated partial derivatives of `f` with respect to each component of `v`.
115
+ """
116
+ return [partial_difference_quotient(f, v, i, h) for i in range(len(v))]
117
+
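+ # Usage sketch (illustrative, not part of the packaged module): for
+ # f(v) = sum(v_i ** 2) the exact gradient is 2 * v, and the forward-difference
+ # estimate is accurate to roughly h.
+ #     f = lambda v: sum(v_i ** 2 for v_i in v)
+ #     estimate_gradient(f, np.array([1.0, 2.0, 3.0]))   # ~[2.0, 4.0, 6.0]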
118
+
119
+ # -------------------------------------------------------------------------
120
+
121
+
122
+ def generate_feature_yaml(
123
+ df: pd.DataFrame, ignore_features: List[str] = None, yaml_string: bool = False
124
+ ) -> Dict | str:
125
+ """
126
+ Generate a YAML string containing information about ordinal, numeric, and categorical features
127
+ based on the given DataFrame.
128
+
129
+ Parameters
130
+ ----------
131
+ df : `pd.DataFrame`
132
+ The DataFrame containing the data.
133
+ ignore_features : `List[`str`]`
134
+ A list of features to ignore.
135
+ yaml_string : `bool`
136
+ If `True`, return the result as a YAML formatted string. Otherwise, return it as a dictionary. Default is `False`.
137
+
138
+ Returns
139
+ -------
140
+ feature_info : `Dict` | `str`
141
+ A dictionary with four keys ('ordinal_features', 'numeric_features', 'categorical_features', 'ignore_features')
142
+ mapping to lists of feature names. Or a YAML formatted string if `yaml_string` is `True`.
143
+ """
144
+ ignore_features = ignore_features or []
145
+ feature_info = {
146
+ "ordinal_features": [],
147
+ "numeric_features": [],
148
+ "categorical_features": [],
149
+ "ignore_features": ignore_features,
150
+ }
151
+
152
+ for col in df.columns:
153
+ if col in ignore_features:
154
+ continue
155
+
156
+ # Check bool before the generic numeric test: bool columns also satisfy
+ # is_numeric_dtype and would otherwise be skipped silently.
+ if pd.api.types.is_bool_dtype(df[col]):
157
+ feature_info["ordinal_features"].append(col)  # Assuming bool can be ordinal
158
+ elif pd.api.types.is_integer_dtype(df[col]) or pd.api.types.is_float_dtype(df[col]):
159
+ feature_info["numeric_features"].append(col)
160
+ elif pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_categorical_dtype(df[col]):
161
+ feature_info["categorical_features"].append(col)
162
+ else:
163
+ print(f"Unknown type for feature {col}")
165
+
166
+ if yaml_string:
167
+ return yaml.dump(feature_info, default_flow_style=False)
168
+
169
+ return feature_info
170
+
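+ # Usage sketch (illustrative, not part of the packaged module): a boolean
+ # column is reported as ordinal (see the dtype checks above).
+ #     frame = pd.DataFrame({"age": [21, 35], "city": ["NY", "LA"], "active": [True, False]})
+ #     generate_feature_yaml(frame)
+ #     # {'ordinal_features': ['active'], 'numeric_features': ['age'],
+ #     #  'categorical_features': ['city'], 'ignore_features': []}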
171
+
172
+ def cal_missing_values(df: pd.DataFrame) -> None:
173
+ """Calculate the percentage of missing (`NaN`/`NaT`) values per column in a DataFrame.
174
+
175
+ Parameters
176
+ ----------
177
+ df : `pd.DataFrame`
178
+ The input dataframe.
179
+
180
+ Returns
181
+ -------
182
+ `None` : Prints out a table with columns as index and percentages of missing values as data.
183
+ """
184
+
185
+ col = df.columns
186
+ print("Total size :", "{:,}".format(len(df)))
187
+ for i in col:
188
+ print(f"{i} : {df[i].isnull().mean() * 100:.2f}%")
191
+
192
+
193
+ def cdf(
194
+ x: np.ndarray,
195
+ poly: int = 9,
196
+ inv: bool = False,
197
+ plot: bool = False,
198
+ savename: str | None = None,
199
+ key: str | None = None,
200
+ ) -> tuple:
201
+ """Calculates the cumulative distribution function of the data.
202
+
203
+ Parameters
204
+ ----------
205
+ x : `np.ndarray`
206
+ An array containing the data.
207
+ poly : `int`
208
+ Degree of the polynomial fit. By default it is set to `9`.
209
+ inv : `bool`
210
+ If True, calculate the inverse CDF (quantile function).
211
+ plot : `bool`
212
+ If True, plot the results.
213
+ savename : `str` or `None`, optional
214
+ Filename to save the plot.
215
+ key : `str` or `None`, optional
216
+ Additional information to display with the chart title.
217
+
218
+ Returns
219
+ -------
220
+ fit : `np.poly1d`
221
+ Polynomial fit of the CDF or quantile function.
222
+ cdf_values : `np.ndarray`
223
+ Cumulative distribution values.
224
+ sorted_x : `np.ndarray`
225
+ Sorted input data.
226
+ """
227
+
228
+ if len(x) == 0:
229
+ raise ValueError("Input array 'x' must not be empty.")
230
+
231
+ sorted_x = np.sort(x)
232
+ # Empirical CDF: fraction of samples less than or equal to each sorted value.
+ cdf_values = np.arange(1, len(sorted_x) + 1) / len(sorted_x)
233
+
234
+ probabilities = np.linspace(0, 1, len(sorted_x))
235
+
236
+ if inv:
237
+ fit = np.polyfit(probabilities, sorted_x, poly)
238
+ f = np.poly1d(fit)
239
+ plot_label = "Quantile Function"
240
+ x_values = probabilities
241
+ y_values = sorted_x
242
+ else:
243
+ fit = np.polyfit(sorted_x, probabilities, poly)
244
+ f = np.poly1d(fit)
245
+ plot_label = (
246
+ "Cumulative Distribution Function"
247
+ if key is None
248
+ else f"Cumulative Distribution Function ({key})"
249
+ )
250
+ x_values = sorted_x
251
+ y_values = cdf_values
252
+
253
+ if plot:
254
+ plt.figure()
255
+ plt.plot(x_values, y_values, "o", label="data")
256
+ plt.plot(x_values, f(x_values), "r--", label="fit")
257
+ plt.title(plot_label)
258
+ plt.xlabel("Probability" if inv else "Value")
259
+ plt.ylabel("Value" if inv else "Probability")
260
+ plt.legend()
261
+ if savename:
262
+ plt.savefig(savename, dpi=300)
263
+ plt.show()
264
+
265
+ return f, cdf_values, sorted_x
266
+
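+ # Usage sketch (illustrative, not part of the packaged module): fit the
+ # empirical CDF of a normal sample; the fitted polynomial maps a value to its
+ # cumulative probability, so the sample median maps to roughly 0.5.
+ #     samples = np.random.normal(0.0, 1.0, 500)
+ #     fit, cdf_values, sorted_samples = cdf(samples, poly=9)
+ #     fit(np.median(samples))   # ~0.5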
267
+
268
+ def calculate_probability(x: np.ndarray, points: int = 1, cond: bool = True) -> np.ndarray:
269
+ """Calculates the probability of the data based on the CDF fit.
270
+
271
+ Parameters
272
+ ----------
273
+ x : `np.ndarray`
274
+ An array containing the data.
275
+ points : `int`
276
+ Number of points to consider for the final probability calculation.
277
+ cond : `bool`
278
+ Condition to use product (True) or sum (False) for the final probability check.
279
+
280
+ Returns
281
+ -------
282
+ p : `np.ndarray`
283
+ Array containing the probabilities of the data.
284
+ """
285
+
286
+ if len(x) == 0:
287
+ raise ValueError("Input array 'x' must not be empty.")
288
+
289
+ fit, _, sorted_x = cdf(x)
290
+ p = fit(x)
291
+
292
+ if cond:
293
+ prob_value = np.prod(p[-points:])  # product over the last `points` values
294
+ message = "product"
295
+ else:
296
+ prob_value = np.sum(p[-points:])
297
+ message = "sum"
298
+
299
+ if 0 <= prob_value <= 1:
300
+ print(f"The model has a probability of {prob_value * 100:.2f}% based on the {message}.")
301
+ else:
302
+ print("\nThe probability of the data cannot be calculated.\n")
303
+
304
+ return p
305
+
306
+
307
+ class CorrelationBase:
308
+ """Base class for correlation calculations."""
309
+
310
+ __slots__ = ["x", "y", "result", "z"]
311
+
312
+ def __init__(self, x: np.ndarray, y: Union[np.ndarray, None] = None):
313
+ self.x = x
314
+ self.y = y if y is not None else x
315
+ self._compute_correlation()
316
+ self.z = self.result[self.result.size // 2 :]
317
+ self.z /= np.abs(self.z).max()
318
+
319
+ def _compute_correlation(self):
320
+ """Compute the correlation between x and y (or x with itself for autocorrelation)."""
321
+ self.result = np.correlate(self.x, self.y, mode="full")
322
+
323
+ def plot(self):
324
+ """Plot the correlation or autocorrelation."""
325
+ plt.plot(range(len(self.z)), self.z, label=self._get_label())
326
+ plt.legend()
327
+ plt.show()
328
+
329
+ def _get_label(self) -> str:
330
+ return "Autocorrelation" if np.array_equal(self.x, self.y) else "Correlation"
331
+
332
+ def __call__(self):
333
+ """Return the computed correlation or autocorrelation."""
334
+ return self.z
335
+
336
+
337
+ class Correlation(CorrelationBase):
338
+ """Calculates the cross-correlation of two datasets.
339
+
340
+ Parameters
341
+ ----------
342
+ x : `np.ndarray`
343
+ An array containing the first dataset.
344
+ y : `np.ndarray`
345
+ An array containing the second dataset.
346
+
347
+ Returns
348
+ -------
349
+ z : `np.ndarray`
350
+ An array containing the correlation of `x` and `y`.
351
+ """
352
+
353
+ def __init__(self, x: np.ndarray, y: np.ndarray):
354
+ super().__init__(x, y)
355
+
356
+
357
+ class AutoCorrelation(CorrelationBase):
358
+ """Calculates the autocorrelation of a dataset.
359
+
360
+ Parameters
361
+ ----------
362
+ x : `np.ndarray`
363
+ An array containing the data.
364
+
365
+ Returns
366
+ -------
367
+ z : `np.ndarray`
368
+ An array containing the autocorrelation of the data.
369
+ """
370
+
371
+ def __init__(self, x: np.ndarray):
372
+ super().__init__(x)
373
+
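+ # Usage sketch (illustrative, not part of the packaged module): the normalized
+ # autocorrelation of a sine wave oscillates with the same period as the signal.
+ #     t = np.linspace(0, 4 * np.pi, 200)
+ #     ac = AutoCorrelation(np.sin(t))
+ #     lags = ac()    # normalized autocorrelation for lags 0 .. len(t) - 1
+ #     ac.plot()      # opens a matplotlib window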
374
+
375
+ def fft_denoise(
376
+ dataset: np.ndarray, sigma: float = 0, mode: bool = True
377
+ ) -> Tuple[np.ndarray, np.ndarray]:
378
+ """Performs noise removal using the Fast Fourier Transform.
379
+
380
+ Parameters
381
+ ----------
382
+ dataset : `np.ndarray`
383
+ An array containing the noised data. Expected shape (num_samples, num_points).
384
+ sigma : `float`, default=0
385
+ A float between 0 and 1 representing the threshold for noise filtering.
386
+ mode : `bool`, default=True
387
+ If True, print progress messages.
388
+
389
+ Returns
390
+ -------
391
+ denoised_dataset : `np.ndarray`
392
+ An array containing the denoised data with the same shape as `dataset`.
393
+ periods : `np.ndarray`
394
+ Array of estimated periods for each sample in `dataset`.
395
+ """
396
+
397
+ if not (0 <= sigma <= 1):
398
+ raise ValueError("sigma must be between 0 and 1")
399
+
400
+ num_samples, n_points = dataset.shape
401
+ denoised_dataset = np.zeros_like(dataset)
402
+ periods = np.zeros(num_samples)
403
+
404
+ freq = (1 / n_points) * np.arange(n_points)
405
+ L = np.arange(1, np.floor(n_points / 2), dtype=int)
406
+
407
+ for i in range(num_samples):
408
+ fhat = np.fft.fft(dataset[i, :], n_points)
409
+ PSD = np.abs(fhat) ** 2 / n_points  # real-valued power spectral density
410
+ threshold = np.mean(PSD) + sigma * np.std(PSD)
411
+ indices = PSD > threshold
412
+
413
+ fhat_cleaned = fhat * indices
415
+
416
+ denoised_signal = np.fft.ifft(fhat_cleaned).real
417
+ denoised_dataset[i, :] = denoised_signal
418
+
419
+ peak_index = L[np.argmax(np.abs(fhat[L]))]
420
+ periods[i] = 1 / freq[peak_index]
421
+
422
+ if mode:
423
+ print(f"The {i+1}-th row of the dataset has been denoised.")
424
+ print(f"The estimated period is {round(periods[i], 4)}")
425
+
426
+ return denoised_dataset, periods
427
+
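+ # Usage sketch (illustrative, not part of the packaged module): denoise one
+ # noisy sine sampled 200 times over 5 cycles; the estimated period is close to
+ # 40 samples (200 / 5).
+ #     t = np.linspace(0, 1, 200, endpoint=False)
+ #     noisy = np.sin(2 * np.pi * 5 * t) + 0.3 * np.random.randn(200)
+ #     clean, periods = fft_denoise(noisy.reshape(1, -1), sigma=0.5, mode=False)
+ #     periods[0]   # ~40.0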
428
+
429
+ def get_period(dataset: np.ndarray) -> float:
430
+ """Calculates the periodicity of a `dataset`.
431
+
432
+ Parameters
433
+ ----------
434
+ dataset : `np.ndarray`
435
+ the `dataset` describing the function over which the period is calculated.
436
+
437
+ Returns
438
+ -------
439
+ period : `float`
440
+ period of the function described by the `dataset`.
441
+ """
442
+ n = dataset.size
443
+
444
+ if n < 2:
445
+ raise ValueError("Dataset must contain at least two points.")
446
+
447
+ fhat = np.fft.rfft(dataset)
448
+ freqs = np.fft.rfftfreq(n)
449
+
450
+ PSD = np.abs(fhat) ** 2 / n
451
+
452
+ PSD[0] = 0
453
+
454
+ max_psd_index = np.argmax(PSD)
455
+
456
+ dominant_freq = freqs[max_psd_index]
457
+ if dominant_freq == 0:
458
+ raise ValueError("No significant periodic component found in the dataset.")
459
+
460
+ period = 1 / dominant_freq
461
+
462
+ return period
463
+
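+ # Usage sketch (illustrative, not part of the packaged module):
+ #     t = np.arange(400)
+ #     get_period(np.sin(2 * np.pi * t / 50))   # ~50.0 samples per cycle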
464
+
465
+ def sigmoide_inv(y: float) -> float:
466
+ """Calculates the inverse of the sigmoid function
467
+
468
+ Parameters
469
+ ----------
470
+ y : `float`
471
+ The value at which to evaluate the inverse sigmoid; must be strictly between `0` and `1`.
472
+
473
+ Returns
474
+ -------
475
+ `float`
476
+ value of evaluated function.
477
+ """
478
+
479
+ return math.log(y / (1 - y))
480
+
481
+
482
+ def sigmoide(x: float) -> float:
483
+ """The sigmoid function"""
484
+ return 1 / (1 + math.exp(-x))
485
+
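+ # Usage sketch (illustrative, not part of the packaged module): the two
+ # functions are inverses of each other on (0, 1).
+ #     p = sigmoide(0.8)     # ~0.6900
+ #     sigmoide_inv(p)       # ~0.8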
486
+
487
+ class LogisticRegression:
488
+ """class implementing multiple logistic regression"""
489
+
490
+ __slots__ = ["importance", "X", "y", "w"]
491
+
492
+ def __init__(self) -> None:
493
+ """The class initializer"""
494
+
495
+ self.importance = []
496
+
497
+ def fit(self, dataset: np.ndarray, values: np.ndarray) -> None:
498
+ """Performs linear multiple model training
499
+
500
+ Parameters
501
+ ----------
502
+ dataset : `np.ndarray`
503
+ An array containing the scaled data.
504
+ values : `np.ndarray`
505
+ Target values in the open interval (0, 1); they are mapped through the inverse sigmoid during fitting.
506
+
507
+ Returns
508
+ -------
509
+ `None` : The fitted weights are stored in `self.w` and the per-feature importances in `self.importance`.
511
+
512
+ """
513
+
514
+ self.X = dataset
515
+ self.y = values
516
+
517
+ U, S, VT = np.linalg.svd(self.X, full_matrices=False)
518
+
519
+ inverse_sig = np.vectorize(sigmoide_inv)
520
+ self.w = (VT.T @ np.linalg.inv(np.diag(S)) @ U.T).T @ inverse_sig(self.y)
521
+
522
+ if self.y.ndim > 1 and self.y.shape[1] > 1:
523
+ for row in self.w:
524
+ self.importance.append(np.around(np.max(row), decimals=8))
525
+ else:
526
+ for i in range(self.X.shape[0]):
527
+ a = np.around(self.w[i], decimals=8)
528
+ self.importance.append(a)
529
+
530
+ def predict(self, datapoints: np.ndarray) -> np.ndarray:
531
+ """
532
+ Performs predictions for a set of points
533
+
534
+ Parameters
535
+ ----------
536
+ datapoints : `np.ndarray`
537
+ An array containing the values of the independent variable.
538
+
539
+ Returns
540
+ -------
541
+ `np.ndarray`
542
+ The predicted probabilities, obtained by applying the sigmoid to the weighted inputs.
543
+ """
544
+ sig = np.vectorize(sigmoide)
545
+
546
+ return sig(np.array(self.importance) @ datapoints)
547
+
548
+ def get_importances(self, print_important_features: bool = False) -> np.ndarray:
549
+ """
550
+ Returns the important features
551
+
552
+ Parameters
553
+ ----------
554
+ print_important_features : `bool`
555
+ Determines whether or not the importances are printed on the screen. By default it is set to `False`.
556
+
557
+ Returns
558
+ -------
559
+ importance : `np.ndarray`
560
+ An array containing the importance of each feature.
561
+ """
562
+ if print_important_features:
563
+ for i, a in enumerate(self.importance):
564
+ print(f"The importance of the {i+1} feature is {a}")
565
+ return np.array(self.importance)
566
+
567
+
568
+ class LinearRegression:
569
+ """class implementing multiple linear regression"""
570
+
571
+ __slots__ = ["importance", "X", "y", "w"]
572
+
573
+ def __init__(self) -> None:
574
+ """The class initializer"""
575
+
576
+ self.importance = []
577
+
578
+ def fit(self, dataset: np.ndarray, values: np.ndarray, verbose: bool = False) -> None:
579
+ """Performs linear multiple model training
580
+
581
+ Parameters
582
+ ----------
583
+ dataset : `np.ndarray`
584
+ An array containing the scaled data.
585
+ values : `np.ndarray`
586
+ A set of values returned by the linear function.
+ verbose : `bool`
+ If `True`, print a short training summary. By default it is set to `False`.
587
+
588
+ Returns
589
+ -------
590
+ `None` : The function doesn't return anything.
591
+ """
592
+
593
+ self.X = dataset
594
+ self.y = values
595
+
596
+ U, S, VT = np.linalg.svd(self.X, full_matrices=False)
597
+ self.w = (VT.T @ np.linalg.inv(np.diag(S)) @ U.T).T @ self.y
598
+
599
+ for i in range(self.X.shape[0]):
600
+ a = np.around(self.w[i], decimals=8)
601
+ self.importance.append(a)
602
+
603
+ if verbose:
604
+ print("\nSummary:")
605
+ print("--------")
606
+ print("\nParameters:", np.array(self.importance).shape)
607
+ print("RMSE: {:.4f}".format(mean_square_error(self.y, self.predict(self.X))))
608
+
609
+ def predict(self, datapoints: np.ndarray) -> np.ndarray:
610
+ """
611
+ Performs predictions for a set of points
612
+
613
+ Parameters
614
+ ----------
615
+ datapoints : `np.ndarray`
616
+ An array containing the values of the independent variable.
617
+
+ Returns
+ -------
+ `np.ndarray`
+ The predicted values, the dot product of the fitted weights with `datapoints`.
618
+ """
619
+ return np.array(self.importance) @ datapoints
620
+
621
+ def get_importances(self, print_important_features: bool = False) -> np.ndarray:
622
+ """
623
+ Returns the important features
624
+
625
+ Parameters
626
+ ----------
627
+ print_important_features : `bool`
628
+ determines whether or not are printed on the screen. By default it is set to `False`.
629
+
630
+ Returns
631
+ -------
632
+ importance : `np.ndarray`
633
+ An array containing the importance of each feature.
634
+ """
635
+ if print_important_features:
636
+ for i, a in enumerate(self.importance):
637
+ print(f"The importance of the {i+1} feature is {a}")
638
+ return np.array(self.importance)
639
+
640
+
641
+ def cal_average(y: np.ndarray, alpha: float = 1):
642
+ """Calculates the moving average of the data
643
+
644
+ Parameters
645
+ ----------
646
+ y : `np.ndarray`
647
+ An array containing the data.
648
+ alpha : `float`
649
+ Fraction of the series length used as the moving-average window, between `0` and `1`. By default it is set to `1`.
650
+
651
+ Returns
652
+ -------
653
+ average : `float`
654
+ The average of the data.
655
+ """
656
+
657
+ window_size = int(alpha * len(y))
658
+ if len(y) < 2:
659
+ return y
660
+
661
+ n = min(window_size, len(y))
662
+ if n <= 1:
663
+ return y
664
+
665
+ padded = np.pad(y, n // 2, mode="edge")
666
+ return np.convolve(padded, np.ones(n) / n, mode="valid")[: len(y)]
667
+
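+ # Usage sketch (illustrative, not part of the packaged module): alpha=0.6 on a
+ # series of length 5 gives a 3-point moving average of the same length.
+ #     cal_average(np.array([1.0, 2.0, 3.0, 4.0, 5.0]), alpha=0.6)
+ #     # -> [1.33, 2.0, 3.0, 4.0, 4.67] (edge-padded)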
668
+
669
+ class DataScaler:
670
+ """numpy array `scaler` and `rescaler`"""
671
+
672
+ __slots__ = ["dataset_", "_n", "data_scaled", "values", "inv_fitting"]
673
+
674
+ def __init__(self, dataset: np.ndarray, n: int | None = 1) -> None:
675
+ """Initializes the parameters required for scaling the data"""
676
+ self.dataset_ = dataset.copy()
677
+ self._n = n
678
+
679
+ def rescale(self, dataset_: np.ndarray | None = None) -> np.ndarray:
680
+ """Perform a standard rescaling of the data
681
+
682
+ Returns
683
+ -------
684
+ data_scaled : `np.ndarray`
685
+ An array containing the scaled data.
686
+ """
687
+ if isinstance(dataset_, np.ndarray):
688
+ data_scaled = np.copy(dataset_)
689
+ mu = self.values[0]
690
+ sigma = self.values[1]
691
+ f = self.values[2]
692
+ data_scaled = data_scaled.reshape((self.dataset_.shape[0], -1))
693
+ for i in range(self.dataset_.shape[0]):
694
+ if self._n != None:
695
+ poly = f[i](self.inv_fitting[i](data_scaled[i]))
696
+ data_scaled[i] += -poly
697
+ data_scaled[i] = 2 * ((data_scaled[i] - mu[i]) / sigma[i]) - 1
698
+ return data_scaled
699
+ else:
700
+ self.data_scaled = np.copy(self.dataset_)
701
+
702
+ mu = []
703
+ sigma = []
704
+ fitting = []
705
+ self.inv_fitting = []
706
+
707
+ try:
708
+ xaxis = range(self.dataset_.shape[1])
709
+ except IndexError:
710
+ error_type = "IndexError"
711
+ msg = "Trying to access an item at an invalid index."
712
+ print(f"{error_type}: {msg}")
713
+ return None
714
+ for i in range(self.dataset_.shape[0]):
715
+ if self._n != None:
716
+ fit = np.polyfit(xaxis, self.dataset_[i, :], self._n)
717
+ inv_fit = np.polyfit(self.dataset_[i, :], xaxis, self._n)
718
+ f = np.poly1d(fit)
719
+ poly = f(xaxis)
720
+ fitting.append(f)
721
+ self.inv_fitting.append(np.poly1d(inv_fit))  # store as a callable polynomial
722
+ self.data_scaled[i, :] += -poly
723
+ else:
724
+ fitting.append(0.0)
725
+ self.inv_fitting.append(0.0)
726
+ mu.append(np.min(self.data_scaled[i, :]))
727
+ if np.max(self.data_scaled[i, :]) != 0:
728
+ sigma.append(np.max(self.data_scaled[i, :]) - mu[i])
729
+ else:
730
+ sigma.append(1)
731
+
732
+ self.data_scaled[i, :] = 2 * ((self.data_scaled[i, :] - mu[i]) / sigma[i]) - 1
733
+
734
+ self.values = [mu, sigma, fitting]
735
+
736
+ return self.data_scaled
737
+
738
+ def scale(self, dataset_: np.ndarray) -> np.ndarray:
739
+ """Performs the inverse operation to the rescale function
740
+
741
+ Parameters
742
+ ----------
743
+ dataset_ : `np.ndarray`
744
+ An array containing the scaled values.
745
+
746
+ Returns
747
+ -------
748
+ dataset_ : `np.ndarray`
749
+ An array containing the rescaled data.
750
+ """
751
+ for i in range(dataset_.shape[0]):
752
+ dataset_[i, :] += 1
753
+ dataset_[i, :] /= 2
754
+ dataset_[i, :] = dataset_[i, :] * self.values[1][i]
755
+ dataset_[i, :] += self.values[0][i]
756
+ if self._n != None:
757
+ dataset_[i, :] += self.values[2][i](range(dataset_.shape[1]))
758
+
759
+ return dataset_
760
+
761
+
762
+ def generate_series(n: int, n_steps: int, incline: bool = True):
763
+ """Function that generates `n` series of length `n_steps`"""
764
+ freq1, freq2, offsets1, offsets2 = np.random.rand(4, n, 1)
765
+
766
+ if incline:
767
+ slope = np.random.rand(n, 1)
768
+ else:
769
+ slope = 0.0
770
+ offsets2 = 1
771
+
772
+ time = np.linspace(0, 1, n_steps)
773
+ series = 0.5 * np.sin((time - offsets1) * (freq1 * 10 + 10)) # wave 1
774
+ series += 0.2 * np.sin((time - offsets2) * (freq2 * 20 + 20)) # + wave 2
775
+ series += 0.7 * (np.random.rand(n, n_steps) - 0.5) # + noise
776
+ series += 5 * slope * time + 2 * (offsets2 - offsets1) * time ** (1 - offsets2)
778
+ return series.astype(np.float32)
779
+
780
+
781
+ def mean_square_error(y_true: np.ndarray, y_pred: np.ndarray, print_error: bool = False):
782
+ """Calculates the Root Mean Squared Error
783
+
784
+ Parameters
785
+ ----------
786
+ y_true : `np.ndarray`
787
+ An array containing the true values.
788
+ y_pred : `np.ndarray`
789
+ An array containing the predicted values.
+ print_error : `bool`
+ If `True`, print the computed RMSE. By default it is set to `False`.
790
+
791
+ Returns
792
+ -------
793
+ RMSE : `float`
794
+ The Root Mean Squared Error.
795
+ """
796
+ if print_error:
797
+ print(f"The RMSE is {np.sqrt(np.mean((y_true - y_pred)**2))}")
798
+
799
+ return np.sqrt(np.mean((y_true - y_pred) ** 2))
800
+
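+ # Usage sketch (illustrative, not part of the packaged module): despite its
+ # name, the function returns the root mean squared error.
+ #     mean_square_error(np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 5.0]))
+ #     # -> sqrt(4 / 3) ~ 1.1547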
801
+
802
+ class DataFrameEncoder:
803
+ """Allows encoding and decoding Dataframes"""
804
+
805
+ __slots__ = [
806
+ "_df",
807
+ "_names",
808
+ "_encode_columns",
809
+ "encoding_list",
810
+ "decoding_list",
811
+ "median_list",
812
+ ]
813
+
814
+ def __init__(self, data: pd.DataFrame) -> None:
815
+ """Sets the columns of the `DataFrame`"""
816
+ self._df = data.copy()
817
+ self._names = data.columns
818
+ self._encode_columns = []
819
+ self.encoding_list = []
820
+ self.decoding_list = []
821
+ self.median_list = []
822
+
823
+ def load_config(self, path_to_dictionaries: str = "./", **kwargs) -> None:
824
+ """Loads dictionaries from a given directory
825
+
826
+ Keyword Arguments
827
+ -----------------
828
+ - dictionary_name (`str`): An optional string parameter. By default it is set to `labelencoder_dictionary`
829
+ """
830
+ dictionary_name = (
831
+ kwargs["dictionary_name"] if "dictionary_name" in kwargs else "labelencoder_dictionary"
832
+ )
833
+ with open(os.path.join(path_to_dictionaries, dictionary_name + ".pkl"), "rb") as file:
834
+ labelencoder = pickle.load(file)
835
+ self.encoding_list = labelencoder[0]
836
+ self.decoding_list = labelencoder[1]
837
+ self._encode_columns = labelencoder[2]
838
+ self.median_list = labelencoder[3]
839
+ print("Configuration successfully uploaded")
840
+
841
+ def train(self, path_to_save: str, **kwargs) -> None:
842
+ """Trains the encoders and decoders using the `DataFrame`"""
843
+ save_mode = kwargs["save_mode"] if "save_mode" in kwargs else True
844
+ dictionary_name = (
845
+ kwargs["dictionary_name"] if "dictionary_name" in kwargs else "labelencoder_dictionary"
846
+ )
847
+ norm_method = kwargs["norm_method"] if "norm_method" in kwargs else "None"
848
+ for i in self._names:
849
+ if self._df[i].dtype == "object":
850
+ self._encode_columns.append(i)
851
+ column_index = range(len(self._df[i].unique()))
852
+ column_keys = self._df[i].unique()
853
+ encode_dict = dict(zip(column_keys, column_index))
854
+ decode_dict = dict(zip(column_index, column_keys))
855
+ self._df[i] = self._df[i].apply(
856
+ self._code_transformation_to, dictionary_list=encode_dict
857
+ )
858
+ if len(self._df[i].unique()) > 1:
859
+ median_value = len(self._df[i].unique()) // 2
860
+ else:
861
+ median_value = 1.0
862
+ if norm_method == "median":
863
+ self._df[i] = self._df[i].astype("float64")
864
+ self._df[i] = self._df[i] / median_value
865
+ self.median_list.append(median_value)
866
+ self.encoding_list.append(encode_dict)
867
+ self.decoding_list.append(decode_dict)
868
+ if save_mode:
869
+ self._save_encoder(path_to_save, dictionary_name)
870
+
871
+ def encode(self, path_to_save: str = "./", **kwargs) -> pd.DataFrame:
872
+ """Encodes the `object` type columns of the dataframe
873
+
874
+ Keyword Arguments
875
+ -----------------
876
+ - save_mode (`bool`): An optional boolean parameter. By default it is set to `True`
877
+ - dictionary_name (`str`): An optional string parameter. By default it is set to `labelencoder_dictionary`
878
+ - norm_method (`str`): An optional string parameter to perform normalization. By default it is set to `None`
879
+ """
880
+ if len(self.encoding_list) == 0:
881
+ self.train(path_to_save, **kwargs)
882
+ return self._df
883
+
884
+ else:
885
+ print("Configuration detected")
886
+ if len(self.median_list) == len(self._encode_columns):
887
+ median_mode = True
888
+ else:
889
+ median_mode = False
890
+ for num, colname in enumerate(self._encode_columns):
891
+ if self._df[colname].dtype == "object":
892
+ encode_dict = self.encoding_list[num]
893
+ self._df[colname] = self._df[colname].apply(
894
+ self._code_transformation_to, dictionary_list=encode_dict
895
+ )
896
+ if median_mode:
897
+ self._df[colname] = self._df[colname].astype("float64")
898
+ self._df[colname] = self._df[colname] / self.median_list[num]
899
+ return self._df
900
+
901
+ def decode(self) -> pd.DataFrame:
902
+ """Decodes the `int` type columns of the `DataFrame`"""
903
+ j = 0
904
+ df_decoded = self._df.copy()
905
+ if len(self.median_list) == len(self._encode_columns):
906
+ median_mode = True
907
+ else:
908
+ median_mode = False
909
+ try:
910
+ number_of_columns = len(self.decoding_list[j])
911
+ for i in self._encode_columns:
912
+ if df_decoded[i].dtype == "int64" or df_decoded[i].dtype == "float64":
913
+ if median_mode:
914
+ df_decoded[i] = df_decoded[i] * self.median_list[j]
915
+ df_decoded[i] = df_decoded[i].astype("int64")
916
+ df_decoded[i] = df_decoded[i].apply(
917
+ self._code_transformation_to, dictionary_list=self.decoding_list[j]
918
+ )
919
+ j += 1
920
+ return df_decoded
921
+ except (AttributeError, IndexError) as e:
922
+ warning_type = "UserWarning"
923
+ msg = "It is not possible to decode the dataframe, since it has not been encoded"
924
+ msg += "Error: {%s}" % e
925
+ print(f"{warning_type}: {msg}")
926
+
927
+ def get_dictionaries(self) -> Tuple[List[dict], List[dict]]:
928
+ """Allows to return the `list` of dictionaries for `encoding` and `decoding`"""
929
+ try:
930
+ return self.encoding_list, self.decoding_list
931
+ except ValueError as e:
932
+ warning_type = "UserWarning"
933
+ msg = "It is not possible to return the list of dictionaries as they have not been created."
934
+ msg += "Error: {%s}" % e
935
+ print(f"{warning_type}: {msg}")
936
+
937
+ def _save_encoder(self, path_to_save: str, dictionary_name: str) -> None:
938
+ """Method to serialize the `encoding_list`, `decoding_list` and `_encode_columns` list"""
939
+ with open(os.path.join(path_to_save, dictionary_name + ".pkl"), "wb") as f:
940
+ pickle.dump(
941
+ [self.encoding_list, self.decoding_list, self._encode_columns, self.median_list], f
942
+ )
943
+
944
+ def _code_transformation_to(self, character: str, dictionary_list: dict) -> int:
945
+ """Auxiliary function to perform data transformation using a dictionary
946
+
947
+ Parameters
948
+ ----------
949
+ character : `str`
950
+ A character data type.
951
+ dictionary_list : `dict`
952
+ An object of dictionary type.
953
+
954
+ Returns
955
+ -------
956
+ dict_type[`character`] or `np.nan` if dict_type[`character`] doesn't exist.
957
+ """
958
+ try:
959
+ return dictionary_list[character]
960
+ except (KeyError, TypeError):
961
+ return np.nan
962
+
963
+
964
+ class PerformanceMeasures:
965
+ """Class with methods to measure performance"""
966
+
967
+ def __init__(self) -> None:
968
+ pass
969
+
970
+ def f_mean(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> float:
+ """Prints the per-label F-measures and returns their mean."""
971
+ F_vec = self._f1_score(y_true, y_pred, labels)
972
+ mean_f_measure = np.mean(F_vec)
973
+ mean_f_measure = np.around(mean_f_measure, decimals=4)
974
+
975
+ for label, f_measure in zip(labels, F_vec):
976
+ print(f"F-measure of label {label} -> {f_measure}")
977
+
978
+ print(f"Mean of F-measure -> {mean_f_measure}")
979
+
980
+ return mean_f_measure
981
+
982
+ def resp(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> float:
+ """Returns an F-measure average weighted toward the less frequent classes (Res_p)."""
983
+ T_C = len(y_true)
984
+ sum1, sum2 = 0.0, 0.0
985
+ F_vec = self._f1_score(y_true, y_pred, labels)
986
+
987
+ for label_idx, label in enumerate(labels):
988
+ class_instances = np.sum(y_true == label) / T_C
989
+ sum1 += (1 - class_instances) * F_vec[label_idx]
990
+ sum2 += 1 - class_instances
991
+
992
+ res_p = sum1 / sum2 if sum2 != 0 else 0.0
993
+ print(f"Metric Res_p -> {res_p}")
994
+
995
+ return res_p
996
+
997
+ def _summary_pred(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> None:
998
+ count_mat = self._confu_mat(y_true, y_pred, labels)
999
+ print(" " * 6, " | ".join(f"--{label}--" for label in labels))
1000
+ for i, label_i in enumerate(labels):
1001
+ row = [f" {int(count_mat[i, j]):5d} " for j in range(len(labels))]
1002
+ print(f"--{label_i}--|", " | ".join(row))
1003
+
1004
+ def _f1_score(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> np.ndarray:
1005
+ count_mat = self._confu_mat(y_true, y_pred, labels)
1006
+ sum_cols = np.sum(count_mat, axis=0)
1007
+ sum_rows = np.sum(count_mat, axis=1)
1008
+
1009
+ # count_mat rows index predicted labels and columns index true labels,
+ # so precision divides by row sums and recall by column sums.
+ precision = np.divide(
1010
+ count_mat.diagonal(), sum_rows, out=np.zeros_like(sum_rows), where=sum_rows != 0
1011
+ )
1012
+ recall = np.divide(
1013
+ count_mat.diagonal(), sum_cols, out=np.zeros_like(sum_cols), where=sum_cols != 0
1014
+ )
1015
+ denom = precision + recall
+ f1_vec = np.divide(2 * precision * recall, denom, out=np.zeros_like(denom), where=denom != 0)
1016
+ f1_vec = np.around(f1_vec, decimals=4)
1017
+
1018
+ return f1_vec
1019
+
1020
+ def _confu_mat(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> np.ndarray:
1021
+ num_classes = len(labels)
1022
+ label_mapping = {label: idx for idx, label in enumerate(labels)}
1023
+ count_mat = np.zeros((num_classes, num_classes))
1024
+
1025
+ for pred_label, true_label in zip(y_pred, y_true):
1026
+ if pred_label in label_mapping and true_label in label_mapping:
1027
+ count_mat[label_mapping[pred_label], label_mapping[true_label]] += 1
1028
+
1029
+ return count_mat
1030
+
1031
+
1032
+ class OneHotEncoder:
1033
+ """
1034
+ Class used to encode categorical variables.
1035
+ It receives an array of integers and returns a binary array using the one-hot encoding method.
1036
+ """
1037
+
1038
+ __slots__ = ["num_categories"]
1039
+
1040
+ def __init__(self) -> None:
1041
+ self.num_categories = None
1042
+
1043
+ def encode(self, x: np.ndarray | list, fit: bool = True):
1044
+ if not isinstance(x, np.ndarray):
1045
+ x = np.array(x)
1046
+ valid_mask = ~np.isnan(x)
1047
+ x_int = x[valid_mask].astype(int)
1048
+
1049
+ if fit:
1050
+ self.num_categories = x_int.max() + 1
1051
+ else:
1052
+ if np.any(x_int >= self.num_categories):
1053
+ self.num_categories = max(self.num_categories, x_int.max() + 1)
1054
+ y = np.zeros((x.shape[0], self.num_categories))
1055
+ y[np.where(valid_mask)[0], x_int] = 1
1056
+
1057
+ return y
1058
+
1059
+ def decode(self, x: np.ndarray | list) -> np.ndarray:
1060
+ if not isinstance(x, np.ndarray):
1061
+ x = np.array(x)
1062
+
1063
+ # rows that have at least one 1 (i.e., were valid)
1064
+ valid_mask = x.sum(axis=1) > 0
1065
+
1066
+ y = np.full(x.shape[0], np.nan)
1067
+ y[valid_mask] = np.argmax(x[valid_mask], axis=1)
1068
+ return y
1069
+
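+ # Usage sketch (illustrative, not part of the packaged module): NaN entries are
+ # kept as all-zero rows on encode and restored as NaN on decode.
+ #     ohe = OneHotEncoder()
+ #     onehot = ohe.encode([0, 2, 1, np.nan])   # shape (4, 3); last row all zeros
+ #     ohe.decode(onehot)                       # -> array([0., 2., 1., nan])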
1070
+
1071
+ class FeatureSelection:
1072
+ """
1073
+ Generate the data graph using a variation of the feature selection algorithm.
1074
+
1075
+ - The method `get_digraph` returns the network based on the feature selection method.
1076
+ """
1077
+
1078
+ __slots__ = ["not_features", "X", "all_features_imp_graph", "w_dict", "scaler"]
1079
+
1080
+ def __init__(self, not_features: list[str] | None = None) -> None:
1081
+ """The class initializer. `not_features` is an optional list of column names to discard."""
1082
+ self.not_features: List[str] = not_features if not_features is not None else []
1083
+ self.all_features_imp_graph: List[Tuple] = []
1084
+ self.w_dict = dict()
1085
+
1086
+ def get_digraph(
1087
+ self, dataset: pd.DataFrame, n_importances: int, use_scaler: bool = False
1088
+ ) -> str:
1089
+ """
1090
+ Get directed graph showing importance of features.
1091
+
1092
+ Parameters
1093
+ ----------
1094
+ dataset : `pd.DataFrame`
1095
+ Dataset to be used for generating the graph.
1096
+ n_importances : `int`
1097
+ Number of top importances to show in the graph.
+ use_scaler : `bool`
+ If `True`, rescale the numeric columns with `DataScaler` before fitting. By default it is set to `False`.
1098
+
1099
+ Returns
1100
+ -------
1101
+ `str`
1102
+ A string representation of the directed graph.
1103
+ """
1104
+ self._load_data(dataset)
1105
+
1106
+ curr_dataset = self.X
1107
+ columns = list(curr_dataset.columns)
1108
+
1109
+ feature_string = " digraph { "
1110
+ for column in columns:
1111
+ feature_string += column + "; "
1112
+
1113
+ numeric_df = curr_dataset.select_dtypes(include="number")
1114
+ if use_scaler:
1115
+ self.scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
1116
+ numeric_scaled = self.scaler.rescale()
1117
+ numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
1118
+ curr_dataset[numeric_df.columns] = numeric_df
1119
+
1120
+ numeric_dict = dict(zip(list(numeric_df.columns), range(len(list(numeric_df.columns)))))
1121
+
1122
+ for index_column, column in enumerate(columns):
1123
+ Y = curr_dataset[column]
1124
+ column_type = Y.dtype
1125
+ if column_type != "object":
1126
+ Model = LinearRegression()
1127
+ X_aux = curr_dataset.drop([column], axis=1)
1128
+ dfe = DataFrameEncoder(X_aux)
1129
+ encoded_df = dfe.encode(save_mode=False)
1130
+ Model.fit(encoded_df.to_numpy().T, Y.to_numpy().T)
1131
+ importance = Model.get_importances()
1132
+ w = Model.w
1133
+ else:
1134
+ Model = LogisticRegression()
1135
+ num_unique_entries = curr_dataset[column].nunique()
1136
+ quick_encoder = DataFrameEncoder(Y.to_frame())
1137
+ encoded_Y = quick_encoder.encode(save_mode=False)
1138
+ one_hot = OneHotEncoder()
1139
+ train_y = one_hot.encode(encoded_Y[column])
1140
+ for i in range(len(train_y)):
1141
+ for j in range(num_unique_entries):
1142
+ if train_y[i][j] == 1.0:
1143
+ train_y[i][j] = 0.73105
1144
+ else:
1145
+ train_y[i][j] = 0.5
1146
+ X_aux = curr_dataset.drop([column], axis=1)
1147
+ dfe = DataFrameEncoder(X_aux)
1148
+ encoded_df = dfe.encode(save_mode=False)
1149
+ Model.fit(encoded_df.to_numpy().T, train_y)
1150
+ importance = Model.get_importances()
1151
+ w = Model.w
1152
+ top_n_indexes = sorted(
1153
+ range(len(importance)), key=lambda i: importance[i], reverse=True
1154
+ )[:n_importances]
1155
+
1156
+ names_cols = list(X_aux.columns)
1157
+ features_imp_node = [
1158
+ (names_cols[top_n_indexes[i]], importance[top_n_indexes[i]])
1159
+ for i in range(n_importances)
1160
+ ]
1161
+
1162
+ if column_type != "object":
1163
+ self.w_dict[column] = (w, None, names_cols, dfe, numeric_dict)
1164
+ else:
1165
+ self.w_dict[column] = (w, quick_encoder, names_cols, dfe, numeric_dict)
1166
+ self.all_features_imp_graph.append((column, features_imp_node))
1167
+ for i in top_n_indexes:
1168
+ feature_string += names_cols[i] + " -> "
1169
+
1170
+ feature_string += column + "; "
1171
+
1172
+ return feature_string + "} "
1173
+
1174
+ def _load_data(self, dataset: pd.DataFrame):
1175
+ if len(self.not_features) > 0:
1176
+ self.X = dataset.drop(columns=self.not_features)
1177
+
1178
+ else:
1179
+ self.X = dataset
1180
+
1181
+ self.X.replace([np.inf, -np.inf], np.nan, inplace=True)
1182
+ self.X.replace(" ", np.nan, inplace=True)
1183
+ self.X.dropna(inplace=True)
1184
+ self.X = self.X.reset_index(drop=True)
1186
+
1187
+
1188
+ def check_nan_inf(df: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
1189
+ """
1190
+ Checks for NaN and Inf values in the DataFrame. If any are found, they will be removed.
1191
+
1192
+ Parameters
1193
+ ----------
1194
+ df : pd.DataFrame
1195
+ The input DataFrame to be checked.
1196
+
1197
+ Returns
1198
+ -------
1199
+ pd.DataFrame
1200
+ A new DataFrame with NaN and Inf values removed.
1201
+ """
1202
+
1203
+ nan_values = df.isnull().values.any()
1204
+ inf_values = np.isinf(df.select_dtypes(include="number")).values.any()
1205
+
1206
+ nan_count = df.isnull().values.sum()
1207
+ inf_count = np.isinf(df.select_dtypes(include="number")).values.sum()
1208
+
1209
+ if nan_values:
1210
+ if verbose:
1211
+ print(
1212
+ "UserWarning: Some rows may have been deleted due to the existence of NaN values.",
1213
+ "NaN values removed:",
1214
+ "{:,}".format(nan_count),
1215
+ )
1216
+ df.dropna(inplace=True)
1217
+
1218
+ if inf_values:
1219
+ if verbose:
1220
+ print(
1221
+ "UserWarning: Some rows may have been deleted due to the existence of Inf values.",
1222
+ "Infinite values removed:",
1223
+ "{:,}".format(inf_count),
1224
+ )
1225
+ df.replace([np.inf, -np.inf], np.nan, inplace=True)
1226
+ df.dropna(inplace=True)
1233
+
1234
+ return df
1235
+
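+ # Usage sketch (illustrative, not part of the packaged module): rows containing
+ # NaN or Inf are dropped in place and the cleaned frame is returned.
+ #     frame = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [2.0, 5.0, np.inf]})
+ #     check_nan_inf(frame, verbose=True)   # keeps only the first row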
1236
+
1237
+ # -------------------------------------------------------------------------
1238
+ if __name__ == "__main__":
1239
+ y_true = np.array([1, 2, 2, 1, 1])
1240
+ y_pred = np.array([1, 1, 2, 2, 1])
1241
+
1242
+ labels = [1, 2]
1243
+ helper = PerformanceMeasures()
1244
+ helper._summary_pred(y_true, y_pred, labels)
1245
+ print(helper._f1_score(y_true, y_pred, labels))
1246
+ print(helper.f_mean(y_true, y_pred, labels))
1247
+
1248
+ # Use DataFrameEncoder
1249
+ data = {"Name": ["John", "Alice", "Bob", "Jafet", "Beto"], "Age": [25, 30, 35, 21, 28]}
1250
+ import pandas as pd
1251
+
1252
+ df = pd.DataFrame(data)
1253
+ # Instantiate DataFrameEncoder
1254
+ dfe = DataFrameEncoder(df)
1255
+ # Encode the dataframe
1256
+ encoded_df = dfe.encode(norm_method="median")
1257
+ # Decode the dataframe
1258
+ decoded_df = dfe.decode()
1259
+
1260
+ # Instantiate DataFrameEncoder
1261
+ # Use load_config method
1262
+ dfe2 = DataFrameEncoder(df)
1263
+ dfe2.load_config()
1264
+
1265
+ encoded_df2 = dfe2.encode()
1266
+ # Decode the dataframe
1267
+ decoded_df2 = dfe2.decode()
1268
+ # Check if the loaded dictionaries match the original ones
1269
+ assert dfe.encoding_list == dfe2.encoding_list
1270
+ assert dfe.decoding_list == dfe2.decoding_list
1271
+
1272
+ # Generate data
1273
+ x = np.random.rand(3, 100)
1274
+ y = 0.1 * x[0, :] + 0.4 * x[1, :] + 0.5 * x[2, :] + 0.1
1275
+ # Create a DataFrame
1276
+ df = pd.DataFrame(x.T, columns=["x1", "x2", "x3"])
1277
+ df["y"] = y
1278
+ # Instantiate FeatureSelection
1279
+ fs = FeatureSelection()
1280
+ print(fs.get_digraph(df, n_importances=1))
1281
+
1282
+ linear_model = LinearRegression()
1283
+ linear_model.fit(x, y)
1284
+ importance = linear_model.get_importances()
1285
+ y_hat = linear_model.predict(x)
1286
+
1287
+ # Graph the data for visualization
1288
+ plt.plot(x[0, :], y, "o", label="Original Data")
1289
+ plt.plot(x[0, :], y_hat, "x", label="$\\hat{y}$")
1290
+ plt.legend()
1291
+ plt.xlabel("$x$")
1292
+ plt.ylabel("$y, \\hat{y}$")
1293
+ plt.show()
1294
+
1295
+ a = generate_series(1, 40, incline=False)
1296
+ # Graph the data for visualization
1297
+ plt.plot(range(len(a[0, :])), a[0, :], label="Original Data")
1298
+ plt.legend()
1299
+ plt.xlabel("Time periods")
1300
+ plt.ylabel("$y(t)$")
1301
+ plt.show()
1302
+
1303
+ a_denoise, _ = fft_denoise(a)
1304
+
1305
+ plt.plot(range(len(a_denoise[0, :])), a_denoise[0, :], label="Denoise Data")
1306
+ plt.legend()
1307
+ plt.xlabel("Time periods")
1308
+ plt.ylabel("$y(t)$")
1309
+ plt.show()
1310
+
1311
+ # Calculate the autocorrelation of the data
1312
+ z = AutoCorrelation(a[0, :])
1313
+ z.plot()
1314
+ # print(z())
1315
+
1316
+ N = 1000
1317
+ mu = np.random.uniform(0, 10.0)
1318
+ sigma = np.random.uniform(0.1, 1.0)
1319
+ x = np.random.normal(mu, sigma, N)
1320
+ f, cdf_, ox = cdf(x, plot=True)
1321
+ invf, cdf_, ox = cdf(x, plot=True, inv=True)
1322
+
1323
+ encoder = OneHotEncoder()
1324
+ encoding = encoder.encode([1, 2, 3, 4, 5])
1325
+ assert np.array_equal(
1326
+ encoding,
1327
+ np.array(
1328
+ [
1329
+ [0, 1, 0, 0, 0, 0],
1330
+ [0, 0, 1, 0, 0, 0],
1331
+ [0, 0, 0, 1, 0, 0],
1332
+ [0, 0, 0, 0, 1, 0],
1333
+ [0, 0, 0, 0, 0, 1],
1334
+ ]
1335
+ ),
1336
+ )