likelihood 2.2.0.dev1__cp310-cp310-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- likelihood/VERSION +1 -0
- likelihood/__init__.py +20 -0
- likelihood/graph/__init__.py +9 -0
- likelihood/graph/_nn.py +283 -0
- likelihood/graph/graph.py +86 -0
- likelihood/graph/nn.py +329 -0
- likelihood/main.py +273 -0
- likelihood/models/__init__.py +3 -0
- likelihood/models/deep/__init__.py +13 -0
- likelihood/models/deep/_autoencoders.py +896 -0
- likelihood/models/deep/_predictor.py +809 -0
- likelihood/models/deep/autoencoders.py +903 -0
- likelihood/models/deep/bandit.py +97 -0
- likelihood/models/deep/gan.py +313 -0
- likelihood/models/deep/predictor.py +805 -0
- likelihood/models/deep/rl.py +345 -0
- likelihood/models/environments.py +202 -0
- likelihood/models/hmm.py +163 -0
- likelihood/models/regression.py +451 -0
- likelihood/models/simulation.py +213 -0
- likelihood/models/utils.py +87 -0
- likelihood/pipes.py +382 -0
- likelihood/rust_py_integration.cpython-310-x86_64-linux-gnu.so +0 -0
- likelihood/tools/__init__.py +4 -0
- likelihood/tools/cat_embed.py +212 -0
- likelihood/tools/figures.py +348 -0
- likelihood/tools/impute.py +278 -0
- likelihood/tools/models_tools.py +866 -0
- likelihood/tools/numeric_tools.py +390 -0
- likelihood/tools/reports.py +375 -0
- likelihood/tools/tools.py +1336 -0
- likelihood-2.2.0.dev1.dist-info/METADATA +68 -0
- likelihood-2.2.0.dev1.dist-info/RECORD +37 -0
- likelihood-2.2.0.dev1.dist-info/WHEEL +5 -0
- likelihood-2.2.0.dev1.dist-info/licenses/LICENSE +21 -0
- likelihood-2.2.0.dev1.dist-info/top_level.txt +5 -0
- src/lib.rs +12 -0
likelihood/tools/numeric_tools.py
@@ -0,0 +1,390 @@
import numpy as np
import pandas as pd
from numpy.linalg import solve


# -------------------------------------------------------------------------
def get_metrics(dataset, actual_column_name, predicted_column_name, verbose=False):
    """Compute accuracy, precision, recall, F1-score and Cohen's kappa for
    binary (0/1) labels stored in two columns of a `pd.DataFrame`."""
    # Confusion-matrix counts
    true_positives = 0  # Correctly predicted positives
    true_negatives = 0  # Correctly predicted negatives
    false_positives = 0  # Negatives predicted as positives
    false_negatives = 0  # Positives predicted as negatives
    total_predictions = len(dataset)

    # Counters for actual and predicted classes
    actual_positive_count = 0
    actual_negative_count = 0
    predicted_positive_count = 0
    predicted_negative_count = 0

    for index, row in dataset.iterrows():
        actual_class = row[actual_column_name]
        predicted_class = row[predicted_column_name]

        # Update confusion matrix counts
        if actual_class == 1 and predicted_class == 1:  # True positive
            true_positives += 1
        elif actual_class == 0 and predicted_class == 0:  # True negative
            true_negatives += 1
        elif actual_class == 0 and predicted_class == 1:  # False positive
            false_positives += 1
        elif actual_class == 1 and predicted_class == 0:  # False negative
            false_negatives += 1

        # Update class counts
        if actual_class == 1:
            actual_positive_count += 1
        else:
            actual_negative_count += 1

        if predicted_class == 1:
            predicted_positive_count += 1
        else:
            predicted_negative_count += 1

    # Calculate accuracy
    accuracy = (true_positives + true_negatives) / total_predictions * 100

    # Calculate precision
    if true_positives + false_positives > 0:
        precision = true_positives / (true_positives + false_positives) * 100
    else:
        precision = 0  # Avoid division by zero

    # Calculate recall
    if true_positives + false_negatives > 0:
        recall = true_positives / (true_positives + false_negatives) * 100
    else:
        recall = 0  # Avoid division by zero

    # Calculate F1-score
    if precision + recall > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0  # Avoid division by zero

    # Calculate Cohen's kappa
    coeff_1 = (true_positives + false_positives) * (false_positives + true_negatives)
    coeff_2 = (true_positives + false_negatives) * (false_negatives + true_negatives)
    if coeff_1 + coeff_2 > 0:
        kappa = (
            2
            * (true_positives * true_negatives - false_negatives * false_positives)
            / (coeff_1 + coeff_2)
        )
    else:
        kappa = 0  # Avoid division by zero

    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "kappa": kappa,
    }

    if verbose:
        print(f"Accuracy: {accuracy:.2f}%")
        print(f"Precision: {precision:.2f}%")
        print(f"Recall: {recall:.2f}%")
        print(f"F1-Score: {f1_score:.2f}")
        print(f"Cohen's Kappa: {kappa:.4f}")

    return metrics


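# A minimal usage sketch for get_metrics (the frame and column names below are
# illustrative, not part of the package; labels are assumed to be binary 0/1):
def _get_metrics_example() -> dict:
    demo = pd.DataFrame({"actual": [1, 0, 1, 1, 0], "predicted": [1, 0, 0, 1, 1]})
    return get_metrics(demo, "actual", "predicted")

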
def xi_corr(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate the new coefficient of correlation for all pairs of numeric
    columns in a `DataFrame`.

    Parameters
    ----------
    df : `pd.DataFrame`
        Input data containing the variables to be correlated.

    Returns
    -------
    correlations : `pd.DataFrame`
        A square dataframe with variable names as both index and columns,
        containing their corresponding correlation coefficients.
    """

    columns = df.select_dtypes(include="number").columns

    # Initialize a square matrix for the correlations
    correlations = pd.DataFrame(1.0, index=columns, columns=columns)

    for i, col1 in enumerate(columns):
        for j, col2 in enumerate(columns):
            if i < j:
                x = df[col1].values
                y = df[col2].values

                correlation = xicor(x, y)
                correlations.loc[col1, col2] = round(correlation, 8)
                correlations.loc[col2, col1] = round(correlation, 8)  # Mirror the correlation

    return correlations


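# An illustrative check for xi_corr (names and data chosen here for the sketch,
# not part of the package): a strongly monotone pair such as x and x**3 should
# score close to 1, while an unrelated noise column should score near 0.
def _xi_corr_example() -> pd.DataFrame:
    rng = np.random.default_rng(0)
    x = rng.random(200)
    frame = pd.DataFrame({"x": x, "x_cubed": x**3, "noise": rng.random(200)})
    return xi_corr(frame)

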
"""
@article{Chatterjee2019ANC,
  title={A New Coefficient of Correlation},
  author={Sourav Chatterjee},
  journal={Journal of the American Statistical Association},
  year={2019},
  volume={116},
  pages={2009 - 2022},
  url={https://api.semanticscholar.org/CorpusID:202719281}
}
"""


def xicor(
    X: np.ndarray, Y: np.ndarray, ties: bool = True, random_seed: int | None = None
) -> float:
    """
    Calculate a generalized coefficient of correlation between two variables.

    This is Chatterjee's rank-based coefficient of correlation: unlike
    Pearson's coefficient, it measures how well Y can be expressed as a
    (not necessarily linear) function of X, accounting for ties with
    optional randomization.

    Parameters
    ----------
    X : `np.ndarray`
        The first variable to be correlated. Must have at least one dimension.
    Y : `np.ndarray`
        The second variable to be correlated. Must have at least one dimension.
    ties : `bool`
        Whether to handle ties using randomization.
    random_seed : `int`, optional
        Seed for the random number generator for reproducibility.

    Returns
    -------
    xi : `float`
        The estimated value of the new coefficient of correlation.
    """
    # Early return for identical arrays
    if np.array_equal(X, Y):
        return 1.0
    n = len(X)
    # Early return for cases with fewer than 2 elements
    if n < 2:
        return 0.0
    # Flatten the input arrays if they are multidimensional
    X = X.flatten()
    Y = Y.flatten()
    # Get the order that sorts X
    order = np.argsort(X)
    if ties:
        rng = np.random.default_rng(random_seed)  # Seeded for reproducibility
        y_ordered = Y[order]
        # Ordinal ranks of Y (taken in order of increasing X) with ties broken
        # uniformly at random: rank a randomly permuted copy with a stable
        # sort, then undo the permutation.
        perm = rng.permutation(n)
        ranks = np.empty(n, dtype=np.int64)
        ranks[perm] = np.argsort(np.argsort(y_ordered[perm], kind="stable"))
        # l_i = number of j with Y_j >= Y_i (denominator of the tie-aware formula)
        l_counts = np.array([np.sum(y_ordered >= y) for y in y_ordered])
        return 1 - n * np.sum(np.abs(ranks[1:] - ranks[: n - 1])) / (
            2 * np.sum(l_counts * (n - l_counts))
        )
    else:
        ranks = np.argsort(np.argsort(Y[order]))  # Get ranks without randomization
        return 1 - 3 * np.sum(np.abs(ranks[1:] - ranks[: n - 1])) / (n**2 - 1)


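# An illustrative property check for xicor (names and data chosen here for the
# sketch, not part of the package): the coefficient is asymmetric by design.
# Y = sin(X) is a function of X, so xicor(X, Y) should be high, while
# xicor(Y, X) is expected to be markedly lower.
def _xicor_asymmetry_example() -> tuple[float, float]:
    x = np.linspace(0, 4 * np.pi, 500)
    y = np.sin(x)
    return xicor(x, y, ties=False), xicor(y, x, ties=False)

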
# -------------------------------------------------------------------------


def ecprint(A: np.ndarray) -> None:
    """Print the augmented matrix.

    Parameters
    ----------
    A : `np.ndarray`
        The augmented matrix.

    Returns
    -------
    `None`
        Prints the matrix to console.
    """
    n = len(A)
    for i in range(n):
        line = ""
        for j in range(n + 1):
            line += str(round(A[i][j], 2)) + "\t"
            if j == n - 1:
                line += "| "  # Separate the coefficients from the constants column
        print(line)
    print()


def sor_elimination(
    A: np.ndarray,
    b: np.ndarray,
    n: int,
    max_iterations: int,
    w: float,
    error: float = 1e-3,
    verbose: bool = True,
) -> np.ndarray:
    """Computes the Successive Over-Relaxation algorithm.

    Parameters
    ----------
    A : `np.ndarray`
        Coefficient matrix of the system of equations.
    b : `np.ndarray`
        Right-hand side vector of the system of equations.
    n : `int`
        Dimension of the system of equations.
    max_iterations : `int`
        Maximum number of iterations allowed.
    w : `float`
        Relaxation parameter.
    error : `float`, optional
        Desired level of accuracy, default is 1e-3.
    verbose : `bool`, optional
        Whether to print intermediate results, default is True.

    Returns
    -------
    xi : `np.ndarray`
        The solution of the system of equations.
    """
    xin = np.zeros(n)
    for k in range(max_iterations):
        xi = np.zeros(n)
        for i in range(n):
            # s1 uses the values already updated in this sweep
            # (the Gauss-Seidel part of SOR); s2 uses the previous iterate.
            s1 = np.dot(A[i, :i], xi[:i])
            s2 = np.dot(A[i, i + 1 :], xin[i + 1 :])
            xi[i] = (w / A[i, i]) * (b[i] - s1 - s2) + (1.0 - w) * xin[i]

        difference = np.max(np.abs(xi - xin))
        if verbose:
            print(f"Iteration {k + 1}: xi = {xi}, error = {difference}")
        if difference <= error:
            if verbose:
                print(f"Converged after {k + 1} iterations.")
            return xi
        xin = np.copy(xi)

    raise RuntimeError("Convergence not achieved within the maximum number of iterations.")


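# An illustrative convergence sketch for sor_elimination (the system below is
# editor-chosen, not part of the package): convergence is only guaranteed for
# suitable matrices, e.g. strictly diagonally dominant ones with 0 < w <= 1;
# w = 1 reduces the iteration to Gauss-Seidel.
def _sor_example() -> np.ndarray:
    A_dd = np.array([[4.0, 1.0, 1.0], [1.0, 5.0, 2.0], [1.0, 2.0, 6.0]])
    b_dd = np.array([6.0, 8.0, 9.0])
    return sor_elimination(A_dd, b_dd, 3, 500, 1.0, error=1e-8, verbose=False)

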
def gauss_elimination(A: np.ndarray | list, pr: int = 2) -> list[float] | None:
    """Computes the Gauss elimination algorithm.

    Parameters
    ----------
    A : `np.ndarray` or `list`
        The augmented matrix of the system: the coefficients of the $n$
        equations with the equalities appended as the last column.
    pr : `int`
        Number of decimals to print in the solution.

    Returns
    -------
    X : `list[float]`
        The solution of the system of $n$ equations, or `None` if the
        system has no single solution.
    """
    # Work on float rows so that row swaps and elimination are plain list
    # operations and integer inputs are not truncated.
    A = [list(map(float, row)) for row in A]
    n = len(A)
    X = [0 for _ in range(n)]

    for i in range(n - 1):
        # Pivot search: find a row with a nonzero entry in column i
        for p in range(i, n):
            if i <= p <= (n - 1) and A[p][i] != 0:
                if p != i:
                    A[p], A[i] = A[i], A[p]
                break
            elif p == (n - 1):
                print("There is no single solution")
                return None

        # Prefer the row below with the larger entry in column i
        for j in range(i + 1, n):
            if i <= j <= n and A[j][i] != 0:
                if A[i][i] < A[j][i]:
                    A[j], A[i] = A[i], A[j]
                break

        # Eliminate column i from the rows below the pivot
        for j in range(i + 1, n):
            if A[i][i] == 0:
                print("There is no single solution")
                return None
            factor = A[j][i] / A[i][i]
            A[j] = [A[j][k] - factor * A[i][k] for k in range(n + 1)]

    if A[n - 1][n - 1] == 0:
        print("There is no single solution")
        return None

    # Back substitution
    X[n - 1] = A[n - 1][n] / A[n - 1][n - 1]
    for i in range(n - 2, -1, -1):
        s = sum(A[i][j] * X[j] for j in range(i + 1, n))
        X[i] = (A[i][n] - s) / A[i][i]

    ecprint(A)
    print("The solution is:")
    for i in range(n):
        print(f"\tX{i} = {round(X[i], pr)}")

    return X


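# An illustrative cross-check of gauss_elimination against numpy.linalg.solve
# (the system is editor-chosen, not part of the package); the function expects
# the augmented matrix [A | b].
def _gauss_example() -> bool:
    A_sys = np.array([[2.0, 1.0, -1.0], [-3.0, -1.0, 2.0], [-2.0, 1.0, 2.0]])
    b_sys = np.array([8.0, -11.0, -3.0])
    augmented = np.column_stack([A_sys, b_sys]).tolist()
    x_ref = solve(A_sys, b_sys)  # Reference solution: (2, 3, -1)
    return np.allclose(x_ref, gauss_elimination(augmented))

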
def find_multiples(target: int) -> tuple[int, int] | None:
    """
    Find two factors of a given target number that are as close to each other as possible.

    Parameters
    ----------
    target : `int`
        The target number to find factors for.

    Returns
    -------
    `tuple[int, int]` | `None`
        The pair of factors whose product is `target` and whose difference is
        minimal, largest factor first. For a positive `target` a pair always
        exists (1 divides every integer); `None` is returned only when the
        search range is empty (e.g. `target == 0`).
    """
    for i in reversed(range(1, int(target**0.5) + 1)):
        if target % i == 0:
            return max(i, target // i), min(i, target // i)
    return None


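# An illustrative use of find_multiples (values editor-chosen): the closest
# factor pair is convenient e.g. for arranging n items in a near-square grid,
# such as laying out 12 panels as 4 x 3.
def _find_multiples_example() -> tuple[int, int] | None:
    return find_multiples(12)  # Expected: (4, 3)

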
# Example usage:
if __name__ == "__main__":
    # Create a sample dataframe with some arbitrary values
    data = {"x": [3, 5, 7, 9], "y": [4, 6, 8, 2], "z": [1, 2, 1, 3]}
    df = pd.DataFrame(data)

    print("Using the SOR relaxation method : ")
    # Define the coefficient matrix A and the right-hand side vector b
    A = np.array([[1, 1, 1], [1, -1, 2], [1, -1, -3]])
    Ag = A.copy()
    b = np.array([6, 5, -10])
    print("b :", b)
    # Solve Ax=b, x = [1, 2, 3]
    x = solve(A, b)
    x_hat_sor = sor_elimination(A, b, 3, 200, 0.05)
    # assert np.allclose(x, x_hat_sor), f"Expected:\n{x}\ngot\n{x_hat_sor}"

    print("Using Gaussian elimination :")
    Ag = np.insert(Ag, len(Ag), b, axis=1)  # Append b as the last column of A
    print(Ag)
    x_hat_gaus = gauss_elimination(Ag)

    print("New correlation coefficient test")
    X = np.random.rand(100, 1)
    Y = X * X
    print("coefficient for Y = X * X :", xicor(X, Y, False))
    df["index"] = ["A", "B", "C", "D"]  # Non-numeric column, ignored by xi_corr
    print("New correlation coefficient test for pandas DataFrame")
    values_df = xi_corr(df)
    print(values_df)
    print(find_multiples(30))
    print(find_multiples(25))
    print(find_multiples(49))
    print(find_multiples(17))
    print(find_multiples(24))