likelihood 1.5.3-py3-none-any.whl → 1.5.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- likelihood/graph/nn.py +74 -48
- likelihood/models/simulation.py +0 -3
- likelihood/tools/impute.py +0 -1
- likelihood/tools/numeric_tools.py +11 -3
- likelihood/tools/tools.py +0 -1
- {likelihood-1.5.3.dist-info → likelihood-1.5.5.dist-info}/METADATA +2 -1
- {likelihood-1.5.3.dist-info → likelihood-1.5.5.dist-info}/RECORD +10 -10
- {likelihood-1.5.3.dist-info → likelihood-1.5.5.dist-info}/WHEEL +1 -1
- {likelihood-1.5.3.dist-info → likelihood-1.5.5.dist-info}/licenses/LICENSE +0 -0
- {likelihood-1.5.3.dist-info → likelihood-1.5.5.dist-info}/top_level.txt +0 -0
likelihood/graph/nn.py CHANGED

@@ -5,6 +5,7 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
 logging.getLogger("tensorflow").setLevel(logging.ERROR)
 
 import warnings
+from multiprocessing import Pool, cpu_count
 from typing import Any, List, Tuple
 
 import numpy as np
@@ -13,65 +14,79 @@ import tensorflow as tf
 from IPython.display import clear_output
 from pandas.core.frame import DataFrame
 from sklearn.metrics import f1_score
-from sklearn.model_selection import train_test_split
 
 tf.get_logger().setLevel("ERROR")
 
 from likelihood.tools import LoRALayer
 
 
-def
-    """
+def compare_similarity_np(arr1: np.ndarray, arr2: np.ndarray, threshold: float = 0.05) -> int:
+    """Vectorized similarity comparison between two numeric/categorical arrays."""
+    arr1 = np.asarray(arr1)
+    arr2 = np.asarray(arr2)
 
-
-
-
-
-
-
-
-
-
-
-
+    is_numeric = np.vectorize(
+        lambda a, b: isinstance(a, (int, float)) and isinstance(b, (int, float))
+    )(arr1, arr2)
+
+    similarity = np.zeros_like(arr1, dtype=bool)
+
+    if np.any(is_numeric):
+        a_num = arr1[is_numeric].astype(float)
+        b_num = arr2[is_numeric].astype(float)
+
+        both_zero = (a_num == 0) & (b_num == 0)
+        nonzero = ~both_zero & (a_num != 0) & (b_num != 0)
+        ratio = np.zeros_like(a_num)
+        ratio[nonzero] = np.maximum(a_num[nonzero], b_num[nonzero]) / np.minimum(
+            a_num[nonzero], b_num[nonzero]
+        )
+        numeric_similar = both_zero | ((1 - threshold <= ratio) & (ratio <= 1 + threshold))
+
+        similarity[is_numeric] = numeric_similar
+
+    similarity[~is_numeric] = arr1[~is_numeric] == arr2[~is_numeric]
+
+    return np.count_nonzero(similarity)
 
-
+
+def compare_pair(pair, data, similarity, threshold):
+    i, j = pair
+    sim = compare_similarity_np(data[i], data[j], threshold=threshold)
+    return (i, j, 1 if sim >= similarity else 0)
 
 
 def cal_adjacency_matrix(
-    df: DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
+    df: pd.DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
 ) -> Tuple[dict, np.ndarray]:
-    """
-
-    The similarity is calculated using the `compare_similarity` function.
-    The resulting matrix is a square matrix with the same number of rows and columns as the rows of the input DataFrame.
+    """
+    Calculates the adjacency matrix for a given DataFrame using parallel processing.
 
     Parameters
     ----------
     df : `DataFrame`
         The input DataFrame containing the features.
-    exclude_subset : `List[str]`, optional
+    exclude_subset : `List[str]`, `optional`
         A list of features to exclude from the calculation of the adjacency matrix.
-    sparse : `bool`, optional
+    sparse : `bool`, `optional`
         Whether to return a sparse matrix or a dense matrix.
     **kwargs : `dict`
         Additional keyword arguments to pass to the `compare_similarity` function.
 
-    Keyword Arguments:
-    ----------
-    similarity: `int`
-        The minimum number of features that must be the same in both arrays to be considered similar.
-    threshold : `float`
-        The threshold value used in the `compare_similarity` function. Default is 0.05.
-
     Returns
     -------
     adj_dict : `dict`
         A dictionary containing the features.
     adjacency_matrix : `ndarray`
         The adjacency matrix.
-    """
 
+    Keyword Arguments:
+    ----------
+    similarity: `int`
+        The minimum number of features that must be the same in both arrays to be considered similar.
+    threshold : `float`
+        The threshold value used in the `compare_similarity` function. Default is 0.05.
+    """
     if len(exclude_subset) > 0:
         columns = [col for col in df.columns if col not in exclude_subset]
         df_ = df[columns].copy()
@@ -84,14 +99,26 @@ def cal_adjacency_matrix(
     threshold = kwargs.get("threshold", 0.05)
     assert similarity <= df_.shape[1]
 
-
+    data = df_.to_numpy()
+    n = len(data)
 
-
+    adj_dict = {i: data[i].tolist() for i in range(n)}
 
-
-    for
-
-
+    def pair_generator():
+        for i in range(n):
+            for j in range(i, n):
+                yield (i, j)
+
+    with Pool(cpu_count()) as pool:
+        results = pool.starmap(
+            compare_pair, ((pair, data, similarity, threshold) for pair in pair_generator())
+        )
+
+    adjacency_matrix = np.zeros((n, n), dtype=np.uint8)
+    for i, j, val in results:
+        if val:
+            adjacency_matrix[i, j] = 1
+            adjacency_matrix[j, i] = 1
 
     if sparse:
         num_nodes = adjacency_matrix.shape[0]
@@ -103,9 +130,7 @@ def cal_adjacency_matrix(
             indices=indices, values=values, dense_shape=(num_nodes, num_nodes)
        )
 
-
-    else:
-        return adj_dict, adjacency_matrix
+    return adj_dict, adjacency_matrix
 
 
 class Data:
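Taken together, the hunks above replace the old row-by-row comparator with a vectorized one plus a multiprocessing fan-out: the pair generator enumerates j from i, so the Pool evaluates n(n+1)/2 comparisons (including the diagonal) and the matrix is symmetrized as results are written back. To make the new comparator concrete, here is a minimal sketch that copies `compare_similarity_np` from the hunk and runs it on two invented rows; the toy values and the traced output are illustrations, not part of the release.

    import numpy as np

    def compare_similarity_np(arr1, arr2, threshold=0.05):
        # Copied from the hunk above (type annotations dropped) so the example
        # runs stand-alone.
        arr1 = np.asarray(arr1)
        arr2 = np.asarray(arr2)
        is_numeric = np.vectorize(
            lambda a, b: isinstance(a, (int, float)) and isinstance(b, (int, float))
        )(arr1, arr2)
        similarity = np.zeros_like(arr1, dtype=bool)
        if np.any(is_numeric):
            a_num = arr1[is_numeric].astype(float)
            b_num = arr2[is_numeric].astype(float)
            both_zero = (a_num == 0) & (b_num == 0)
            nonzero = ~both_zero & (a_num != 0) & (b_num != 0)
            ratio = np.zeros_like(a_num)
            ratio[nonzero] = np.maximum(a_num[nonzero], b_num[nonzero]) / np.minimum(
                a_num[nonzero], b_num[nonzero]
            )
            similarity[is_numeric] = both_zero | (
                (1 - threshold <= ratio) & (ratio <= 1 + threshold)
            )
        similarity[~is_numeric] = arr1[~is_numeric] == arr2[~is_numeric]
        return np.count_nonzero(similarity)

    # Two invented mixed-type rows; object dtype keeps ints and strings side by side.
    row_a = np.array([100, 0, "red"], dtype=object)
    row_b = np.array([103, 0, "red"], dtype=object)
    print(compare_similarity_np(row_a, row_b))
    # 3: ratio 103/100 = 1.03 is within 5%, the zeros match, the strings are equal

In `cal_adjacency_matrix`, nodes i and j are then connected whenever this count reaches the `similarity` keyword argument.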
@@ -260,12 +285,17 @@ class VanillaGNN(tf.keras.Model):
         val_losses = []
         val_f1_scores = []
 
-
-
-
-
+        num_nodes = len(data.x)
+        split_index = int((1 - test_size) * num_nodes)
+
+        X_train, X_test = data.x[:split_index], data.x[split_index:]
+        y_train, y_test = data.y[:split_index], data.y[split_index:]
+
+        adjacency_train = tf.sparse.slice(data.adjacency, [0, 0], [split_index, split_index])
         adjacency_test = tf.sparse.slice(
-            data.adjacency,
+            data.adjacency,
+            [split_index, split_index],
+            [num_nodes - split_index, num_nodes - split_index],
         )
 
         batch_starts = np.arange(0, len(X_train), batch_size)
@@ -286,10 +316,6 @@ class VanillaGNN(tf.keras.Model):
 
         if epoch % 5 == 0:
             clear_output(wait=True)
-            warnings.warn(
-                "It is normal for validation metrics to underperform during training. Use the test method to validate after training.",
-                UserWarning,
-            )
             val_loss, val_f1 = self.evaluate(X_test, adjacency_test, y_test)
             val_losses.append(val_loss)
             val_f1_scores.append(val_f1)
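The training-loop hunk replaces the removed split logic with a deterministic positional split (note the `train_test_split` import dropped in the first hunk), which keeps node order intact so the sparse adjacency can be cut into aligned train and test blocks with `tf.sparse.slice`. Below is a minimal sketch of that slicing on an invented 4-node graph; only the index arithmetic and the `tf.sparse.slice` calls mirror the hunk.

    import numpy as np
    import tensorflow as tf

    # Invented 4-node path graph standing in for data.adjacency.
    dense = np.array(
        [
            [0, 1, 0, 0],
            [1, 0, 1, 0],
            [0, 1, 0, 1],
            [0, 0, 1, 0],
        ],
        dtype=np.float32,
    )
    adjacency = tf.sparse.from_dense(dense)

    num_nodes, test_size = 4, 0.25
    split_index = int((1 - test_size) * num_nodes)  # 3

    # Leading square block for training, trailing square block for validation;
    # the blocks stay aligned with the positional X/y splits above.
    adjacency_train = tf.sparse.slice(adjacency, [0, 0], [split_index, split_index])
    adjacency_test = tf.sparse.slice(
        adjacency, [split_index, split_index], [num_nodes - split_index, num_nodes - split_index]
    )
    print(tf.sparse.to_dense(adjacency_train).numpy())  # 3x3 leading block
    print(tf.sparse.to_dense(adjacency_test).numpy())   # 1x1 trailing block

The trade-off is that edges connecting a train node to a test node fall outside both blocks; with a shuffled split those edges could not be captured by square block slices at all.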
likelihood/models/simulation.py CHANGED

@@ -51,7 +51,6 @@ class SimulationEngine(FeatureSelection):
     """
 
     def __init__(self, use_scaler: bool = False, **kwargs):
-
         self.df = pd.DataFrame()
         self.n_importances = None
         self.use_scaler = use_scaler
@@ -91,7 +90,6 @@ class SimulationEngine(FeatureSelection):
 
         # Categorical column
         if quick_encoder != None:
-
             one_hot = OneHotEncoder()
             y = one_hot.decode(y)
             encoding_dic = quick_encoder.decoding_list[0]
@@ -180,7 +178,6 @@ class SimulationEngine(FeatureSelection):
         ]
 
     def _clean_data(self, df: DataFrame) -> DataFrame:
-
         df.replace([np.inf, -np.inf], np.nan, inplace=True)
         df.replace(" ", np.nan, inplace=True)
         df = check_nan_inf(df)
likelihood/tools/impute.py CHANGED

@@ -71,7 +71,6 @@ class SimpleImputer:
         self.cols_transf = X_impute.columns
         for column in X_impute.columns:
             if X_impute[column].isnull().sum() > 0:
-
                 if not X_impute[column].dtype == "object":
                     min_value = self.params[column]["min"]
                     max_value = self.params[column]["max"]

likelihood/tools/numeric_tools.py CHANGED

@@ -356,13 +356,16 @@ def find_multiples(target: int) -> tuple[int, int] | None:
     Returns
     -------
     tuple[int, int] | None
-
+        If i and i+1 both divide target, returns (i, i+1).
+        Otherwise, returns (i, target // i).
         Returns None if no factors are found.
     """
     for i in range(2, target + 1):
         if target % i == 0:
-
-
+            if (i + 1) <= target and target % (i + 1) == 0:
+                return i + 1, target // (i + 1)
+            else:
+                return i, target // i
     return None
 
 
@@ -396,4 +399,9 @@ if __name__ == "__main__":
     df["index"] = ["A", "B", "C", "D"]
     print("New correlation coefficient test for pandas DataFrame")
     values_df = xi_corr(df)
+    print(find_multiples(30))
+    print(find_multiples(25))
+    print(find_multiples(49))
+    print(find_multiples(17))
+    print(find_multiples(24))
     breakpoint()
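To see what the new branch does, here is `find_multiples` as added above, run on the same inputs the `__main__` block now prints; the outputs in the comment are traced by hand from the code, not taken from the registry.

    def find_multiples(target: int) -> tuple[int, int] | None:
        # Copied from the hunk above: when two consecutive integers both divide
        # target, the pair built from the larger one is preferred.
        for i in range(2, target + 1):
            if target % i == 0:
                if (i + 1) <= target and target % (i + 1) == 0:
                    return i + 1, target // (i + 1)
                else:
                    return i, target // i
        return None

    for n in (30, 25, 49, 17, 24):
        print(n, find_multiples(n))
    # 30 -> (3, 10); 25 -> (5, 5); 49 -> (7, 7); 17 -> (17, 1); 24 -> (3, 8)

Because i = target always divides target, the None branch is only reachable for target < 2; a prime such as 17 falls through to (17, 1). Note also that the new docstring's (i, i+1) phrasing differs from what the code actually returns, e.g. 30 yields (3, 10) rather than (2, 3).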
likelihood/tools/tools.py CHANGED

{likelihood-1.5.3.dist-info → likelihood-1.5.5.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: likelihood
-Version: 1.5.3
+Version: 1.5.5
 Summary: A package that performs the maximum likelihood algorithm.
 Home-page: https://github.com/jzsmoreno/likelihood/
 Author: J. A. Moreno-Guerra
@@ -28,6 +28,7 @@ Requires-Dist: seaborn
 Requires-Dist: pyyaml
 Requires-Dist: pandas
 Requires-Dist: corner
+Requires-Dist: tqdm
 Provides-Extra: full
 Requires-Dist: networkx; extra == "full"
 Requires-Dist: pyvis; extra == "full"
{likelihood-1.5.3.dist-info → likelihood-1.5.5.dist-info}/RECORD CHANGED

@@ -2,23 +2,23 @@ likelihood/__init__.py,sha256=5C0hapdsk85XZhN_rssRAEFpkRRuKNtj6cyRbqD2_gM,994
 likelihood/main.py,sha256=fcCkGOOWKjfvw2tLVqjuKPV8t0rVCIT9FlbYcOv4EYo,7974
 likelihood/graph/__init__.py,sha256=6TuFDfmXTwpLyHl7_KqBfdzW6zqHjGzIFvymjFPlvjI,21
 likelihood/graph/graph.py,sha256=bLrNMvIh7GOTdPTwnNss8oPZ7cbSHQScAsH_ttmVUK0,3294
-likelihood/graph/nn.py,sha256=
+likelihood/graph/nn.py,sha256=uxCxGt1suKmThmEjFope2ew93-WlgvGhgr6RVCHwzhM,11420
 likelihood/models/__init__.py,sha256=e6nB4w47w0Q9DrAFeP3OcUgcoHOtf7Il4mBhgf4AARg,52
 likelihood/models/hmm.py,sha256=0s0gFySH1u4NjRaZDxiZ8oeTaFhFrw1x0GJxwy3dFrA,6253
 likelihood/models/regression.py,sha256=9cakyGlJCEO6WfpoKLh3GxdXQeQp7cUvJIkQ5odT0TA,9404
-likelihood/models/simulation.py,sha256=
+likelihood/models/simulation.py,sha256=6OD2IXAnbctxtOzUJ2b9vKW7_tdGs4dQYmQQShqsioA,8443
 likelihood/models/utils.py,sha256=dvigPi_hxcs5ntfHr7Y1JvP5ULtMW3kkN0nJpS4orE8,1319
 likelihood/models/deep/__init__.py,sha256=m607FtMP2gAfPtM0mssFXMKyKOqoeYskZ_xIC6dKhr4,47
 likelihood/models/deep/autoencoders.py,sha256=0EIZwDNlZ9NCfQbhQ_KdXkkRwIjUEU-jk0l0u-J1wmA,44212
 likelihood/models/deep/gan.py,sha256=aoSaNO5LvCU62cjxA0AxvnQvE7NSFtrp1Ta4EDJchpo,10874
 likelihood/tools/__init__.py,sha256=N1IhMDzacsGQT2MIYBMBC0zTxes78vC_0gGrwkuPgmg,78
 likelihood/tools/figures.py,sha256=waF0NHIMrctCmaLhcuz5DMcXyRKynmn6aG0XITYCTLc,10940
-likelihood/tools/impute.py,sha256=
+likelihood/tools/impute.py,sha256=n87Tv-xLUAdPl7BQLFcLWSsXBZbXksahyCayJWMydXc,9485
 likelihood/tools/models_tools.py,sha256=c3-vac-1MYSarYDtfR6XfVC7X_WY9auS7y2_3Z973IQ,8875
-likelihood/tools/numeric_tools.py,sha256=
-likelihood/tools/tools.py,sha256=
-likelihood-1.5.3.dist-info/licenses/LICENSE,sha256=
-likelihood-1.5.3.dist-info/METADATA,sha256=
-likelihood-1.5.3.dist-info/WHEEL,sha256=
-likelihood-1.5.3.dist-info/top_level.txt,sha256=
-likelihood-1.5.3.dist-info/RECORD,,
+likelihood/tools/numeric_tools.py,sha256=Hwf-lbqROqPPZ9N7eVzKIDyZxFGQdP53isWxPqpG0eo,12254
+likelihood/tools/tools.py,sha256=FyldbmYNgt4gK89BKgDsya2_EIENwZZwdbBx5pfNhj4,42281
+likelihood-1.5.5.dist-info/licenses/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
+likelihood-1.5.5.dist-info/METADATA,sha256=jtu0BJ0483cmd4DAKqqn_rsSru1-LVS2Wmj998jMkoA,2886
+likelihood-1.5.5.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+likelihood-1.5.5.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
+likelihood-1.5.5.dist-info/RECORD,,
{likelihood-1.5.3.dist-info → likelihood-1.5.5.dist-info}/licenses/LICENSE: file without changes

{likelihood-1.5.3.dist-info → likelihood-1.5.5.dist-info}/top_level.txt: file without changes