likelihood 1.4.1__py3-none-any.whl → 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- likelihood/graph/nn.py +8 -2
- likelihood/models/deep/autoencoders.py +312 -109
- likelihood/models/simulation.py +9 -9
- likelihood/tools/figures.py +348 -0
- likelihood/tools/impute.py +279 -0
- likelihood/tools/models_tools.py +161 -9
- likelihood/tools/numeric_tools.py +21 -0
- likelihood/tools/tools.py +46 -92
- {likelihood-1.4.1.dist-info → likelihood-1.5.1.dist-info}/METADATA +3 -2
- likelihood-1.5.1.dist-info/RECORD +23 -0
- {likelihood-1.4.1.dist-info → likelihood-1.5.1.dist-info}/WHEEL +1 -1
- likelihood-1.4.1.dist-info/RECORD +0 -21
- {likelihood-1.4.1.dist-info → likelihood-1.5.1.dist-info/licenses}/LICENSE +0 -0
- {likelihood-1.4.1.dist-info → likelihood-1.5.1.dist-info}/top_level.txt +0 -0
likelihood/tools/models_tools.py
CHANGED
@@ -3,11 +3,148 @@ import os

 import networkx as nx
 import pandas as pd
+from pandas.core.frame import DataFrame

 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
 logging.getLogger("tensorflow").setLevel(logging.ERROR)

+import sys
+import warnings
+from functools import wraps
+from typing import Dict
+
+import numpy as np
 import tensorflow as tf
+from pandas.core.frame import DataFrame
+
+from .figures import *
+
+
+class suppress_prints:
+    def __enter__(self):
+        self.original_stdout = sys.stdout
+        sys.stdout = open(os.devnull, "w")
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        sys.stdout.close()
+        sys.stdout = self.original_stdout
+
+
+def suppress_warnings(func):
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            return func(*args, **kwargs)
+
+    return wrapper
+
+
+def remove_collinearity(df: DataFrame, threshold: float = 0.9):
+    """
+    Removes highly collinear features from the DataFrame based on a correlation threshold.
+
+    This function calculates the correlation matrix of the DataFrame and removes columns
+    that are highly correlated with any other column in the DataFrame. It uses an absolute
+    correlation value greater than the specified threshold to identify which columns to drop.
+
+    Parameters
+    ----------
+    df : `DataFrame`
+        The input DataFrame containing numerical data.
+    threshold : `float`
+        The correlation threshold above which features will be removed. Default is `0.9`.
+
+    Returns
+    ----------
+    DataFrame: A DataFrame with highly collinear features removed.
+    """
+    corr_matrix = df.corr().abs()
+    upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
+    to_drop = [
+        column for column in upper_triangle.columns if any(upper_triangle[column] > threshold)
+    ]
+    df_reduced = df.drop(columns=to_drop)
+
+    return df_reduced
+
+
+def train_and_insights(
+    x_data: np.ndarray,
+    y_act: np.ndarray,
+    model: tf.keras.Model,
+    patience: int = 3,
+    reg: bool = False,
+    frac: float = 1.0,
+    **kwargs: Optional[Dict],
+) -> tf.keras.Model:
+    """
+    Train a Keras model and provide insights on the training and validation metrics.
+
+    Parameters
+    ----------
+    x_data : `np.ndarray`
+        Input data for training the model.
+    y_act : `np.ndarray`
+        Actual labels corresponding to x_data.
+    model : `tf.keras.Model`
+        The Keras model to train.
+    patience : `int`
+        The patience parameter for early stopping callback (default is 3).
+    reg : `bool`
+        Flag to determine if residual analysis should be performed (default is `False`).
+    frac : `float`
+        Fraction of data to use (default is 1.0).
+
+    Keyword Arguments:
+    ----------
+    Additional keyword arguments passed to the `model.fit` function, such as validation split and callbacks.
+
+    Returns
+    ----------
+    `tf.keras.Model`
+        The trained model after fitting.
+    """
+
+    validation_split = kwargs.get("validation_split", 0.2)
+    callback = kwargs.get(
+        "callback", [tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=patience)]
+    )
+
+    for key in ["validation_split", "callback"]:
+        if key in kwargs:
+            del kwargs[key]
+
+    history = model.fit(
+        x_data,
+        y_act,
+        validation_split=validation_split,
+        verbose=False,
+        callbacks=callback,
+        **kwargs,
+    )
+
+    hist = pd.DataFrame(history.history)
+    hist["epoch"] = history.epoch
+
+    columns = hist.columns
+    train_err, train_metric = columns[0], columns[1]
+    val_err, val_metric = columns[2], columns[3]
+    train_err, val_err = hist[train_err].values, hist[val_err].values
+
+    with suppress_prints():
+        n = int(len(x_data) * frac)
+        y_pred = model.predict(x_data[:n])
+        y_act = y_act[:n]
+
+    if reg:
+        residual(y_act, y_pred)
+        residual_hist(y_act, y_pred)
+        act_pred(y_act, y_pred)
+
+    loss_curve(hist["epoch"].values, train_err, val_err)
+
+    return model


 @tf.keras.utils.register_keras_serializable(package="Custom", name="LoRALayer")
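Note: the two new public helpers above, `remove_collinearity` and `train_and_insights`, are not exercised anywhere in this diff. A minimal, hedged sketch of calling `remove_collinearity` follows; the import path is assumed from the module layout in the new RECORD, and the synthetic DataFrame and expected output are illustrative only, not taken from the package's documentation.

    # Illustrative only: drop a nearly duplicated column with remove_collinearity.
    import numpy as np
    import pandas as pd

    from likelihood.tools.models_tools import remove_collinearity  # assumed import path

    rng = np.random.default_rng(0)
    a = rng.normal(size=100)
    df = pd.DataFrame(
        {
            "a": a,
            "b": 2 * a + rng.normal(scale=0.01, size=100),  # almost perfectly correlated with "a"
            "c": rng.normal(size=100),
        }
    )

    reduced = remove_collinearity(df, threshold=0.9)
    print(reduced.columns.tolist())  # "b" should be dropped, leaving ['a', 'c']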
@@ -58,16 +195,31 @@ def apply_lora(model, rank=4):
     return new_model


-def graph_metrics(adj_matrix, eigenvector_threshold=1e-6):
+def graph_metrics(adj_matrix: np.ndarray, eigenvector_threshold: float = 1e-6) -> DataFrame:
     """
-
-
-
-
-
-
-
-
+    Calculate various graph metrics based on the given adjacency matrix and return them in a single DataFrame.
+
+    Parameters
+    ----------
+    adj_matrix : `np.ndarray`
+        The adjacency matrix representing the graph, where each element denotes the presence/weight of an edge between nodes.
+    eigenvector_threshold : `float`
+        A threshold for the eigenvector centrality calculation, used to determine the cutoff for small eigenvalues. Default is `1e-6`.
+
+    Returns
+    ----------
+    DataFrame : A DataFrame containing the following graph metrics as columns.
+        - `Degree Centrality`: Degree centrality values for each node, indicating the number of direct connections each node has.
+        - `Clustering Coefficient`: Clustering coefficient values for each node, representing the degree to which nodes cluster together.
+        - `Eigenvector Centrality`: Eigenvector centrality values, indicating the influence of a node in the graph based on the eigenvectors of the adjacency matrix.
+        - `Degree`: The degree of each node, representing the number of edges connected to each node.
+        - `Betweenness Centrality`: Betweenness centrality values, representing the extent to which a node lies on the shortest paths between other nodes.
+        - `Closeness Centrality`: Closeness centrality values, indicating the inverse of the average shortest path distance from a node to all other nodes in the graph.
+        - `Assortativity`: The assortativity coefficient of the graph, measuring the tendency of nodes to connect to similar nodes.
+
+    Notes
+    ----------
+    The returned DataFrame will have one row for each node and one column for each of the computed metrics.
     """
     adj_matrix = adj_matrix.astype(int)
     G = nx.from_numpy_array(adj_matrix)
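Note: `graph_metrics` only gains type hints and a docstring here, so a hedged usage sketch may help make the contract concrete (adjacency matrix in, one row per node out). The import path is assumed from the RECORD and the toy graph is illustrative, not from the package.

    # Illustrative only: centrality metrics for a small 4-node undirected graph.
    import numpy as np

    from likelihood.tools.models_tools import graph_metrics  # assumed import path

    adj = np.array(
        [
            [0, 1, 1, 0],
            [1, 0, 1, 0],
            [1, 1, 0, 1],
            [0, 0, 1, 0],
        ]
    )

    metrics_df = graph_metrics(adj)
    print(metrics_df)  # columns such as "Degree Centrality", "Degree", "Assortativity", ...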
likelihood/tools/numeric_tools.py
CHANGED

@@ -345,6 +345,27 @@ def gauss_elimination(A: ndarray | list, pr: int = 2) -> ndarray:
     return X


+def find_multiples(target: int) -> tuple[int, int] | None:
+    """Find two factors of a given target number.
+
+    Parameters
+    ----------
+    target : int
+        The target number to find factors for.
+
+    Returns
+    -------
+    tuple[int, int] | None
+        A tuple containing two factors of the target number.
+        Returns None if no factors are found.
+    """
+    for i in range(2, target + 1):
+        if target % i == 0:
+            factor = target // i
+            return i, factor
+    return None
+
+
 # Example usage:
 if __name__ == "__main__":
     import pandas as pd
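Note: because its loop starts at 2 and runs up to `target` itself, the new `find_multiples` returns the smallest divisor greater than 1 together with the complementary factor, so a prime `p` comes back as `(p, 1)` rather than `None`; only `target < 2` yields `None`. A hedged sketch, assuming this hunk belongs to `likelihood/tools/numeric_tools.py` as indicated above:

    from likelihood.tools.numeric_tools import find_multiples  # assumed import path

    print(find_multiples(12))  # (2, 6): smallest divisor >= 2 and its cofactor
    print(find_multiples(35))  # (5, 7)
    print(find_multiples(13))  # (13, 1): a prime only factors as itself times 1
    print(find_multiples(1))   # None: the loop body never runs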
likelihood/tools/tools.py
CHANGED
@@ -169,7 +169,6 @@ def generate_feature_yaml(
     return feature_info


-# a function that calculates the percentage of missing values per column is defined
 def cal_missing_values(df: DataFrame) -> None:
     """Calculate the percentage of missing (`NaN`/`NaT`) values per column in a dataframe.

@@ -180,8 +179,7 @@ def cal_missing_values(df: DataFrame) -> None:

     Returns
     -------
-    `None`
-        Prints out a table with columns as index and percentages of missing values as data.
+    `None` : Prints out a table with columns as index and percentages of missing values as data.
     """

     col = df.columns
@@ -226,7 +224,6 @@ def cdf(
     cdf_values = np.cumsum(x) / np.sum(x)
     sorted_x = np.sort(x)

-    # Calculate the CDF or inverse CDF (quantile function)
     probabilities = np.linspace(0, 1, len(sorted_x))

     if inv:
@@ -281,7 +278,6 @@ def calculate_probability(x: np.ndarray, points: int = 1, cond: bool = True) ->
     fit, _, sorted_x = cdf(x)
     p = fit(x)

-    # Validate probability values
     if cond:
         prob_value = np.prod(p[-points])
         message = "product"
@@ -304,7 +300,7 @@ class CorrelationBase:

     def __init__(self, x: np.ndarray, y: Union[np.ndarray, None] = None):
         self.x = x
-        self.y = y if y is not None else x
+        self.y = y if y is not None else x
         self._compute_correlation()
         self.z = self.result[self.result.size // 2 :]
         self.z /= np.abs(self.z).max()
@@ -395,7 +391,6 @@ def fft_denoise(
     denoised_dataset = np.zeros_like(dataset)
     periods = np.zeros(num_samples)

-    # Precompute values that do not change within the loop
     freq = (1 / n_points) * np.arange(n_points)
     L = np.arange(1, np.floor(n_points / 2), dtype=int)

@@ -405,15 +400,12 @@
         threshold = np.mean(PSD) + sigma * np.std(PSD)
         indices = PSD > threshold

-        # Zero out all others in frequency domain
         PSDclean = PSD * indices
         fhat_cleaned = fhat * indices

-        # Inverse FFT for filtered time signal
         denoised_signal = np.fft.ifft(fhat_cleaned).real
         denoised_dataset[i, :] = denoised_signal

-        # Calculate the period of the signal
         peak_index = L[np.argmax(np.abs(fhat[L]))]
         periods[i] = 1 / (2 * freq[peak_index])

@@ -430,33 +422,27 @@ def get_period(dataset: np.ndarray) -> float:
     Parameters
     ----------
     dataset : `ndarray`
-        the `dataset` describing the function over which the period is calculated
+        the `dataset` describing the function over which the period is calculated.

     Returns
     -------
     period : `float`
-        period of the function described by the `dataset
+        period of the function described by the `dataset`.
     """
     n = dataset.size

-    # Ensure there are enough points for FFT analysis
     if n < 2:
         raise ValueError("Dataset must contain at least two points.")

-
-
-    freqs = np.fft.rfftfreq(n)  # Get only positive frequencies
+    fhat = np.fft.rfft(dataset)
+    freqs = np.fft.rfftfreq(n)

-    # Calculate the Power Spectral Density (PSD)
     PSD = np.abs(fhat) ** 2 / n

-    # Remove the first frequency component (DC component)
     PSD[0] = 0

-    # Find the index of the maximum PSD value, excluding the DC component
     max_psd_index = np.argmax(PSD)

-    # Calculate the period based on the corresponding frequency
     dominant_freq = freqs[max_psd_index]
     if dominant_freq == 0:
         raise ValueError("No significant periodic component found in the dataset.")
@@ -472,12 +458,12 @@ def sigmoide_inv(y: float) -> float:
     Parameters
     ----------
     y : `float`
-        the number to evaluate the function
+        the number to evaluate the function.

     Returns
     -------
     `float`
-        value of evaluated function
+        value of evaluated function.
     """

     return math.log(y / (1 - y))
@@ -540,6 +526,10 @@ class LogisticRegression:
         datapoints : `np.array`
             An array containing the values of the independent variable.

+        Returns
+        -------
+        `np.array`
+
         """
         sig = np.vectorize(sigmoide)

@@ -558,8 +548,6 @@ class LogisticRegression:
         -------
         importance : `np.array`
             An array containing the importance of each feature.
-
-
         """
         if print_important_features:
             for i, a in enumerate(self.importance):
@@ -589,9 +577,7 @@ class LinearRegression:

         Returns
         -------
-
-            An array containing the importance of each feature.
-
+        `None` : The function doesn't return anything.
         """

         self.X = dataset
@@ -635,8 +621,6 @@ class LinearRegression:
         -------
         importance : `np.array`
             An array containing the importance of each feature.
-
-
         """
         if print_important_features:
             for i, a in enumerate(self.importance):
@@ -658,7 +642,6 @@ def cal_average(y: np.ndarray, alpha: float = 1):
     -------
     average : `float`
         The average of the data.
-
     """

     n = int(alpha * len(y))
@@ -799,7 +782,6 @@ def mean_square_error(y_true: np.ndarray, y_pred: np.ndarray, print_error: bool
     -------
     RMSE : `float`
         The Root Mean Squared Error.
-
     """
     if print_error:
         print(f"The RMSE is {np.sqrt(np.mean((y_true - y_pred)**2))}")
@@ -975,7 +957,6 @@ class PerformanceMeasures:
     def __init__(self) -> None:
         pass

-    # Performance measure Res_T
     def f_mean(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> float:
         F_vec = self._f1_score(y_true, y_pred, labels)
         mean_f_measure = np.mean(F_vec)
@@ -988,7 +969,6 @@ class PerformanceMeasures:

         return mean_f_measure

-    # Performance measure Res_P
     def resp(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> float:
         T_C = len(y_true)
         sum1, sum2 = 0.0, 0.0
@@ -999,7 +979,7 @@
             sum1 += (1 - class_instances) * F_vec[label_idx]
             sum2 += 1 - class_instances

-        res_p = sum1 / sum2 if sum2 != 0 else 0.0
+        res_p = sum1 / sum2 if sum2 != 0 else 0.0
         print(f"Metric Res_p -> {res_p}")

         return res_p
@@ -1016,7 +996,6 @@ class PerformanceMeasures:
         sum_cols = np.sum(count_mat, axis=0)
         sum_rows = np.sum(count_mat, axis=1)

-        # Avoid division by zero
         precision = np.divide(
             count_mat.diagonal(), sum_cols, out=np.zeros_like(sum_cols), where=sum_cols != 0
         )
@@ -1028,7 +1007,6 @@ class PerformanceMeasures:

         return f1_vec

-    # Returns confusion matrix of predictions
     def _confu_mat(self, y_true: np.ndarray, y_pred: np.ndarray, labels: List[int]) -> np.ndarray:
         num_classes = len(labels)
         label_mapping = {label: idx for idx, label in enumerate(labels)}
@@ -1056,21 +1034,18 @@ class OneHotEncoder:
         self.x = x

         if not isinstance(self.x, np.ndarray):
-            self.x = np.array(self.x)
+            self.x = np.array(self.x)

-        y = np.zeros(
-            (self.x.size, self.x.max() + 1)
-        )  # Build matrix of (size num of entries) x (max value + 1)
+        y = np.zeros((self.x.size, self.x.max() + 1))

-        y[np.arange(self.x.size), self.x] = 1
+        y[np.arange(self.x.size), self.x] = 1

         return y

     def decode(self, x: np.ndarray | list) -> np.ndarray:
         if not isinstance(x, np.ndarray):
-            x = np.array(x)
+            x = np.array(x)

-        # We return the max values of each row
         y = np.argmax(x, axis=1)

         return y
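Note: the `OneHotEncoder` change above is a pure reformatting, but a round-trip sketch clarifies the contract implied by the code: `encode` builds a `(n_samples, max_label + 1)` indicator matrix and `decode` recovers the labels via `argmax`. The import path and the no-argument constructor are assumptions, not confirmed by this diff.

    import numpy as np

    from likelihood.tools.tools import OneHotEncoder  # assumed import path

    labels = np.array([0, 2, 1, 2])
    encoder = OneHotEncoder()          # assumed no-argument constructor

    one_hot = encoder.encode(labels)   # shape (4, 3): one column per label value 0..2
    recovered = encoder.decode(one_hot)

    assert np.array_equal(recovered, labels)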
@@ -1107,13 +1082,11 @@ class FeatureSelection:
         `str`
             A string representation of the directed graph.
         """
-        # Assign and clean dataset
         self._load_data(dataset)

         curr_dataset = self.X
         columns = list(curr_dataset.columns)

-        # We construct string from causal_graph
         feature_string = " digraph { "
         for column in columns:
             feature_string += column + "; "
@@ -1125,85 +1098,53 @@ class FeatureSelection:
         numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
         curr_dataset[numeric_df.columns] = numeric_df

-        # We construct dictionary to save index for scaling
         numeric_dict = dict(zip(list(numeric_df.columns), range(len(list(numeric_df.columns)))))

-        # Iterate over all the columns to obtain their importances.
         for index_column, column in enumerate(columns):
-
-            # Variable to predict
             Y = curr_dataset[column]
-
-            # We check whether it is numerical or categorical.
             column_type = Y.dtype
             if column_type != "object":
-                # Linear regression model
                 Model = LinearRegression()
-
-                # Auxiliary dataset without the column in question
                 X_aux = curr_dataset.drop([column], axis=1)
-
-                # We encode
                 dfe = DataFrameEncoder(X_aux)
                 encoded_df = dfe.encode(save_mode=False)
-                # We train
                 Model.fit(encoded_df.to_numpy().T, Y.to_numpy().T)
-                # We obtain importance
                 importance = Model.get_importances()
                 w = Model.w
             else:
                 Model = LogisticRegression()
                 num_unique_entries = curr_dataset[column].nunique()
-
                 quick_encoder = DataFrameEncoder(Y.to_frame())
                 encoded_Y = quick_encoder.encode(save_mode=False)
-
-                # Mapping to one-hot
                 one_hot = OneHotEncoder()
                 train_y = one_hot.encode(encoded_Y[column])
-                # PASSING 0 -> 0.5 and 1 -> 0.73105
                 for i in range(len(train_y)):
                     for j in range(num_unique_entries):
                         if train_y[i][j] == 1.0:
                             train_y[i][j] = 0.73105
                         else:
                             train_y[i][j] = 0.5
-
-                # Delete the column in question
                 X_aux = curr_dataset.drop([column], axis=1)
-
-                # We encode
                 dfe = DataFrameEncoder(X_aux)
                 encoded_df = dfe.encode(save_mode=False)
-
-                # We train
                 Model.fit(encoded_df.to_numpy().T, train_y)
-
-                # We obtain importance
                 importance = Model.get_importances()
                 w = Model.w
-
-            # We obtain the $n$ most important ones
             top_n_indexes = sorted(
                 range(len(importance)), key=lambda i: importance[i], reverse=True
             )[:n_importances]

-            # We build the string for the column in question
             names_cols = list(X_aux.columns)
-            # We store the indices, values and column names in a list of tuples.
             features_imp_node = [
                 (names_cols[top_n_indexes[i]], importance[top_n_indexes[i]])
                 for i in range(n_importances)
             ]
-            # We store w's for predictions

             if column_type != "object":
                 self.w_dict[column] = (w, None, names_cols, dfe, numeric_dict)
             else:
                 self.w_dict[column] = (w, quick_encoder, names_cols, dfe, numeric_dict)
-            # Add to general list
             self.all_features_imp_graph.append((column, features_imp_node))
-            # We format it
             for i in top_n_indexes:
                 feature_string += names_cols[i] + " -> "

@@ -1212,10 +1153,8 @@ class FeatureSelection:
         return feature_string + "} "

     def _load_data(self, dataset: DataFrame):
-        # Assign data and clean dataset of unneeded columns

         if len(self.not_features) > 0:
-            # We remove unnecessary columns
             self.X = dataset.drop(columns=self.not_features)

         else:
@@ -1228,34 +1167,50 @@ class FeatureSelection:
             self.X = self.X.drop(columns=["index"])


-def check_nan_inf(df: DataFrame) -> DataFrame:
+def check_nan_inf(df: DataFrame, verbose: bool = False) -> DataFrame:
     """
     Checks for NaN and Inf values in the DataFrame. If any are found, they will be removed.

-    Parameters
-
+    Parameters
+    ----------
+    df : DataFrame
+        The input DataFrame to be checked.

-    Returns
-
+    Returns
+    ----------
+    DataFrame
+        A new DataFrame with NaN and Inf values removed.
     """

     nan_values = df.isnull().values.any()
     inf_values = np.isinf(df.select_dtypes(include="number")).values.any()

+    nan_count = df.isnull().values.sum()
+    inf_count = np.isinf(df.select_dtypes(include="number")).values.sum()
+
     if nan_values:
-
+        (
+            print(
+                "UserWarning: Some rows may have been deleted due to the existence of NaN values."
+            )
+            if verbose
+            else None
+        )
         df.dropna(inplace=True)

     if inf_values:
-
+        (
+            print(
+                "UserWarning: Some rows may have been deleted due to the existence of Inf values."
+            )
+            if verbose
+            else None
+        )
         df.replace([np.inf, -np.inf], np.nan, inplace=True)
         df.dropna(inplace=True)

-
-
-
-    print(f"NaN values removed: {nan_count}")
-    print(f"Infinite values removed: {inf_count}")
+    print(f"NaN values removed: ", "{:,}".format(nan_count))
+    print(f"Infinite values removed: ", "{:,}".format(inf_count))

     return df

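Note: for the new `verbose` flag on `check_nan_inf`, a hedged sketch of the intended call pattern (import path assumed from the RECORD): with `verbose=True` the UserWarning-style messages are printed before the offending rows are dropped, and the removal counts are printed in either case.

    import numpy as np
    import pandas as pd

    from likelihood.tools.tools import check_nan_inf  # assumed import path

    df = pd.DataFrame({"x": [1.0, np.nan, 3.0, np.inf], "y": [4.0, 5.0, 6.0, 7.0]})

    clean = check_nan_inf(df, verbose=True)  # drops the NaN row and the Inf row in place
    print(clean)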
@@ -1272,7 +1227,6 @@ if __name__ == "__main__":
     print(helper.f_mean(y_true, y_pred, labels))

     # Use DataFrameEncoder
-    # Create a DataFrame
     data = {"Name": ["John", "Alice", "Bob", "Jafet", "Beto"], "Age": [25, 30, 35, 21, 28]}
     import pandas as pd

{likelihood-1.4.1.dist-info → likelihood-1.5.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: likelihood
-Version: 1.4.1
+Version: 1.5.1
 Summary: A package that performs the maximum likelihood algorithm.
 Home-page: https://github.com/jzsmoreno/likelihood/
 Author: J. A. Moreno-Guerra
@@ -39,6 +39,7 @@ Dynamic: classifier
 Dynamic: description
 Dynamic: description-content-type
 Dynamic: home-page
+Dynamic: license-file
 Dynamic: maintainer
 Dynamic: maintainer-email
 Dynamic: provides-extra
likelihood-1.5.1.dist-info/RECORD
ADDED

@@ -0,0 +1,23 @@
+likelihood/__init__.py,sha256=5C0hapdsk85XZhN_rssRAEFpkRRuKNtj6cyRbqD2_gM,994
+likelihood/main.py,sha256=fcCkGOOWKjfvw2tLVqjuKPV8t0rVCIT9FlbYcOv4EYo,7974
+likelihood/graph/__init__.py,sha256=6TuFDfmXTwpLyHl7_KqBfdzW6zqHjGzIFvymjFPlvjI,21
+likelihood/graph/graph.py,sha256=bLrNMvIh7GOTdPTwnNss8oPZ7cbSHQScAsH_ttmVUK0,3294
+likelihood/graph/nn.py,sha256=EaMmboKriCFnkP48_HLGRAsOZSWxwUlMG0WDGZ4ey1o,11035
+likelihood/models/__init__.py,sha256=e6nB4w47w0Q9DrAFeP3OcUgcoHOtf7Il4mBhgf4AARg,52
+likelihood/models/hmm.py,sha256=0s0gFySH1u4NjRaZDxiZ8oeTaFhFrw1x0GJxwy3dFrA,6253
+likelihood/models/regression.py,sha256=9cakyGlJCEO6WfpoKLh3GxdXQeQp7cUvJIkQ5odT0TA,9404
+likelihood/models/simulation.py,sha256=IkYGA6-L1LvSnIlyrVWTzQQu-JnfXml5Tewt-GC05PY,8446
+likelihood/models/utils.py,sha256=dvigPi_hxcs5ntfHr7Y1JvP5ULtMW3kkN0nJpS4orE8,1319
+likelihood/models/deep/__init__.py,sha256=-KIPippVaMqgG8mEgYjNxYQdqOUcFhUuKhbVe8TTCfo,28
+likelihood/models/deep/autoencoders.py,sha256=0EIZwDNlZ9NCfQbhQ_KdXkkRwIjUEU-jk0l0u-J1wmA,44212
+likelihood/tools/__init__.py,sha256=N1IhMDzacsGQT2MIYBMBC0zTxes78vC_0gGrwkuPgmg,78
+likelihood/tools/figures.py,sha256=waF0NHIMrctCmaLhcuz5DMcXyRKynmn6aG0XITYCTLc,10940
+likelihood/tools/impute.py,sha256=BwBVFSQkG3uWsZEk1THTmqZc3YhHlDhMXgKIV3sx5Lg,9486
+likelihood/tools/models_tools.py,sha256=c3-vac-1MYSarYDtfR6XfVC7X_WY9auS7y2_3Z973IQ,8875
+likelihood/tools/numeric_tools.py,sha256=OelCF45QO-zhanX3GmfcdYMfUZxYt353oJ8_gPEdWss,11959
+likelihood/tools/tools.py,sha256=vlQ-peK_z5-MLVnStxlBdl-NfmF6ILxZ6LhBd4K77JI,42282
+likelihood-1.5.1.dist-info/licenses/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
+likelihood-1.5.1.dist-info/METADATA,sha256=s__LhxtBZXbQHaU-WQtpRvOmfnP7zZ1nqhI6I9IRNFA,2844
+likelihood-1.5.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+likelihood-1.5.1.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
+likelihood-1.5.1.dist-info/RECORD,,