likelihood 2.2.0.dev1__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,390 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from numpy.linalg import solve
4
+
5
+
6
+ # -------------------------------------------------------------------------
7
def get_metrics(dataset, actual_column_name, predicted_column_name, verbose=False):
    """Compute binary-classification metrics from labelled predictions.

    Parameters
    ----------
    dataset : pd.DataFrame
        Data holding the true and predicted labels (values 0/1).
    actual_column_name : str
        Name of the column with the true labels.
    predicted_column_name : str
        Name of the column with the predicted labels.
    verbose : bool, optional
        If True, also print the metrics to stdout.

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision``, ``recall``, ``f1_score`` (all as
        percentages except kappa) and ``kappa`` (Cohen's kappa).
        Always returned; previously the verbose branch returned None.
    """
    true_positives = 0   # actual 1, predicted 1
    true_negatives = 0   # actual 0, predicted 0
    false_positives = 0  # actual 0, predicted 1
    false_negatives = 0  # actual 1, predicted 0
    total_predictions = len(dataset)

    for _, row in dataset.iterrows():
        actual_class = row[actual_column_name]
        predicted_class = row[predicted_column_name]

        if actual_class == 1 and predicted_class == 1:
            true_positives += 1
        elif actual_class == 0 and predicted_class == 0:
            true_negatives += 1
        elif actual_class == 0 and predicted_class == 1:
            false_positives += 1
        elif actual_class == 1 and predicted_class == 0:
            false_negatives += 1

    # Every ratio is guarded against an empty denominator so degenerate
    # inputs (empty dataset, single-class predictions) return 0 instead of
    # raising ZeroDivisionError / NameError.
    if total_predictions > 0:
        accuracy = (true_positives + true_negatives) / total_predictions * 100
    else:
        accuracy = 0

    if true_positives + false_positives > 0:
        precision = true_positives / (true_positives + false_positives) * 100
    else:
        precision = 0

    if true_positives + false_negatives > 0:
        recall = true_positives / (true_positives + false_negatives) * 100
    else:
        recall = 0

    if precision + recall > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0

    # Cohen's kappa in the 2x2 closed form; 0 when the denominator vanishes
    # (the original left `kappa` unassigned and crashed with NameError).
    coeff_1 = (true_positives + false_positives) * (false_positives + true_negatives)
    coeff_2 = (true_positives + false_negatives) * (false_negatives + true_negatives)
    if coeff_1 + coeff_2 > 0:
        kappa = (
            2
            * (true_positives * true_negatives - false_negatives * false_positives)
            / (coeff_1 + coeff_2)
        )
    else:
        kappa = 0

    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "kappa": kappa,
    }

    if verbose:
        print(f"Accuracy: {accuracy:.2f}%")
        print(f"Precision: {precision:.2f}%")
        print(f"Recall: {recall:.2f}%")
        print(f"F1-Score: {f1_score:.2f}")
        print(f"Cohen's Kappa: {kappa:.4f}")

    return metrics
92
+
93
+
94
def xi_corr(df: pd.DataFrame) -> pd.DataFrame:
    """Build the pairwise matrix of Chatterjee's xi correlation coefficients.

    Parameters
    ----------
    df : `pd.DataFrame`
        Input data; only numeric columns are correlated.

    Returns
    -------
    correlations : `pd.DataFrame`
        Square dataframe indexed by the numeric column names, with the
        xi coefficient (rounded to 8 decimals) in each off-diagonal cell
        and 1.0 on the diagonal.
    """
    numeric_cols = df.select_dtypes(include="number").columns

    # Diagonal starts at 1.0; each unordered pair is computed once and
    # mirrored across the diagonal.
    correlations = pd.DataFrame(1.0, index=numeric_cols, columns=numeric_cols)

    for pos, first in enumerate(numeric_cols):
        for second in numeric_cols[pos + 1:]:
            coefficient = round(xicor(df[first].values, df[second].values), 8)
            correlations.loc[first, second] = coefficient
            correlations.loc[second, first] = coefficient

    return correlations
126
+
127
+
128
+ """
129
+ @article{Chatterjee2019ANC,
130
+ title={A New Coefficient of Correlation},
131
+ author={Sourav Chatterjee},
132
+ journal={Journal of the American Statistical Association},
133
+ year={2019},
134
+ volume={116},
135
+ pages={2009 - 2022},
136
+ url={https://api.semanticscholar.org/CorpusID:202719281}
137
+ }
138
+ """
139
+
140
+
141
def xicor(X: np.ndarray, Y: np.ndarray, ties: bool = True, random_seed: int = None) -> float:
    """
    Calculate Chatterjee's new coefficient of correlation between two variables.

    The coefficient measures how well Y can be expressed as a (not
    necessarily monotone) function of X, accounting for ties with optional
    randomization.

    Parameters
    ----------
    X : `np.ndarray`
        The first variable. Multidimensional input is flattened.
    Y : `np.ndarray`
        The second variable. Multidimensional input is flattened.
    ties : `bool`
        Whether to handle ties using randomization.
    random_seed : int, optional
        Seed for the random number generator for reproducibility.

    Returns
    -------
    xi : `float`
        The estimated value of the coefficient of correlation.
    """
    # Identical arrays are perfectly dependent.
    if np.array_equal(X, Y):
        return 1.0
    n = len(X)
    # Fewer than two observations carry no rank information.
    if n < 2:
        return 0.0
    X = X.flatten()
    Y = Y.flatten()
    # Rank Y in the order induced by sorting X.
    order = np.argsort(X)
    if ties:
        np.random.seed(random_seed)  # Reproducibility when a seed is given
        ranks = np.argsort(np.argsort(Y[order]))
        unique_ranks, counts = np.unique(ranks, return_counts=True)
        # Break ties by randomizing the tied ranks.  BUG FIX: the previous
        # `np.random.shuffle(ranks[tie_indices])` shuffled a fancy-indexing
        # *copy* and never wrote back, so tie randomization was a no-op;
        # assigning a permutation writes into `ranks` itself.
        for rank, count in zip(unique_ranks, counts):
            if count > 1:
                tie_indices = np.where(ranks == rank)[0]
                ranks[tie_indices] = np.random.permutation(ranks[tie_indices])
        # r_i = #{j : Y_j <= Y_i} over the reordered sample.
        cumulative_counts = np.array([np.sum(y >= Y[order]) for y in Y[order]])
        return 1 - n * np.sum(np.abs(ranks[1:] - ranks[: n - 1])) / (
            2 * np.sum(cumulative_counts * (n - cumulative_counts))
        )
    else:
        # No-ties formula: denominator simplifies to (n^2 - 1) / 3.
        ranks = np.argsort(np.argsort(Y[order]))
        return 1 - 3 * np.sum(np.abs(ranks[1:] - ranks[: n - 1])) / (n**2 - 1)
191
+
192
+
193
+ # -------------------------------------------------------------------------
194
+
195
+
196
+ def ecprint(A: np.ndarray) -> None:
197
+ """Function that prints the augmented matrix.
198
+
199
+ Parameters
200
+ ----------
201
+ A : `np.ndarray`
202
+ The augmented matrix.
203
+
204
+ Returns
205
+ -------
206
+ `None`
207
+ Prints the matrix to console.
208
+ """
209
+ n = len(A)
210
+ for i in range(0, n):
211
+ line = ""
212
+ for j in range(0, n + 1):
213
+ line += str(format(round(A[i][j], 2))) + "\t"
214
+ if j == n - 1:
215
+ line += "| "
216
+ print(line)
217
+ print()
218
+
219
+
220
def sor_elimination(
    A: np.ndarray,
    b: np.ndarray,
    n: int,
    max_iterations: int,
    w: float,
    error: float = 1e-3,
    verbose: bool = True,
) -> np.ndarray:
    """Solve ``A @ x = b`` with the Successive Over-Relaxation (SOR) method.

    Parameters
    ----------
    A : `np.ndarray`
        Coefficient matrix of the system of equations.
    b : `np.ndarray`
        Right-hand side vector of the system of equations.
    n : `int`
        Dimension of the system of equations.
    max_iterations : `int`
        Maximum number of iterations allowed.
    w : `float`
        Relaxation parameter; ``w = 1`` reduces to Gauss-Seidel.
    error : `float`, optional
        Desired level of accuracy, default is 1e-3.
    verbose : `bool`, optional
        Whether to print intermediate results, default is True.

    Returns
    -------
    xi : `np.ndarray`
        The approximate solution of the system of equations.

    Raises
    ------
    RuntimeError
        If convergence is not reached within `max_iterations`.
    """
    xin = np.zeros(n)
    for k in range(max_iterations):
        xi = np.zeros(n)
        for i in range(n):
            # BUG FIX: SOR must use the components already updated in this
            # sweep (xi[:i]); the previous code used the stale xin[:i],
            # which is a Jacobi-style update, not SOR as documented.
            s1 = np.dot(A[i, :i], xi[:i])
            s2 = np.dot(A[i, i + 1 :], xin[i + 1 :])
            xi[i] = (w / A[i, i]) * (b[i] - s1 - s2) + (1.0 - w) * xin[i]

        # Infinity-norm of the update as the stopping criterion.
        difference = np.max(np.abs(xi - xin))
        if verbose:
            print(f"Iteration {k + 1}: xi = {xi}, error = {difference}")
        if difference <= error:
            if verbose:
                print(f"Converged after {k + 1} iterations.")
            return xi
        xin = np.copy(xi)

    raise RuntimeError("Convergence not achieved within the maximum number of iterations.")
271
+
272
+
273
def gauss_elimination(A: np.ndarray | list, pr: int = 2) -> list | None:
    """Solve a linear system via Gaussian elimination with back substitution.

    Parameters
    ----------
    A : `np.ndarray` or `list`
        Augmented matrix of shape ``(n, n + 1)``: the coefficients of the
        $n$ equations plus the right-hand side as the last column.
    pr : `int`
        Significant number of decimals used when printing the solution.

    Returns
    -------
    X : `list` or `None`
        The solution of the system of $n$ equations, or ``None`` when the
        system has no unique solution.
    """
    # BUG FIX: work on a list-of-lists copy.  The original swapped rows
    # with `A[p], A[i] = A[i], A[p]`, which silently corrupts numpy arrays
    # (both sides of the swap alias the same buffer), and `__main__` does
    # pass a numpy array.  Copying rows fixes that and also leaves the
    # caller's matrix untouched.
    A = [list(row) for row in A]
    n = len(A)
    X = [0 for _ in range(n)]

    for i in range(n - 1):
        # Find a nonzero pivot at or below row i and move it up.
        for p in range(i, n):
            if A[p][i] != 0:
                if p != i:
                    A[p], A[i] = A[i], A[p]
                break
            elif p == (n - 1):
                print("There is no single solution")
                return None

        # Prefer a larger pivot from the rows below, when one exists.
        # NOTE(review): this compares signed values, not magnitudes —
        # classic partial pivoting would use abs(); kept as-is to preserve
        # the original pivot choice.
        for j in range(i + 1, n):
            if A[j][i] != 0:
                if A[i][i] < A[j][i]:
                    A[j], A[i] = A[i], A[j]
                break

        # Zero pivot means elimination cannot proceed (hoisted out of the
        # loop below: the check is invariant across j).
        if A[i][i] == 0:
            print("There is no single solution")
            return None
        # Eliminate column i from every row below the pivot.
        for j in range(i + 1, n):
            factor = A[j][i] / A[i][i]
            A[j] = [A[j][k] - factor * A[i][k] for k in range(n + 1)]

    if A[n - 1][n - 1] == 0:
        print("There is no single solution")
        return None

    # Back substitution from the last row upwards.
    X[n - 1] = A[n - 1][n] / A[n - 1][n - 1]
    for i in range(n - 2, -1, -1):
        s = sum(A[i][j] * X[j] for j in range(i + 1, n))
        X[i] = (A[i][n] - s) / A[i][i]

    ecprint(A)
    print("The solution is:")
    for i in range(n):
        print(f"\tX{i} = {round(X[i], pr)}")

    return X
331
+
332
+
333
+ def find_multiples(target: int) -> tuple[int, int] | None:
334
+ """
335
+ Find two factors of a given target number that are as close to each other as possible.
336
+
337
+ Parameters
338
+ ----------
339
+ target : int
340
+ The target number to find factors for.
341
+
342
+ Returns
343
+ -------
344
+ tuple[int, int] | None
345
+ If `i` and `target // i` both divide target, returns (i, target // i).
346
+ Otherwise, returns `(i, target // i)`.
347
+ Returns `None` if no factors are found.
348
+ """
349
+ for i in reversed(range(1, int(target**0.5) + 1)):
350
+ if target % i == 0:
351
+ return max(i, target // i), min(i, target // i)
352
+ return None
353
+
354
+
355
# Example usage:
if __name__ == "__main__":
    # Sample dataframe for the correlation demo (pandas is already
    # imported at module level; the redundant local import was removed).
    data = {"x": [3, 5, 7, 9], "y": [4, 6, 8, 2], "z": [1, 2, 1, 3]}
    df = pd.DataFrame(data)

    print("Using the SOR relaxation method : ")
    # Define the coefficient matrix A and the right-hand side b.
    A = np.array([[1, 1, 1], [1, -1, 2], [1, -1, -3]])
    Ag = A.copy()
    b = np.array([6, 5, -10])
    print("b :", b)
    # Solve Ax=b, x = [1, 2, 3]
    x = solve(A, b)
    x_hat_sor = sor_elimination(A, b, 3, 200, 0.05)
    # assert np.allclose(x, x_hat_sor), f"Expected:\n{x}\ngot\n{x_hat_sor}"

    print("Using Gaussian elimination :")
    Ag = np.insert(Ag, len(Ag), b, axis=1)
    print(Ag)
    x_hat_gaus = gauss_elimination(Ag)

    print("New correlation coefficient test")
    X = np.random.rand(100, 1)
    Y = X * X
    print("coefficient for Y = X * X :", xicor(X, Y, False))
    df["index"] = ["A", "B", "C", "D"]
    print("New correlation coefficient test for pandas DataFrame")
    # Show the correlation matrix instead of silently discarding it.
    values_df = xi_corr(df)
    print(values_df)
    print(find_multiples(30))
    print(find_multiples(25))
    print(find_multiples(49))
    print(find_multiples(17))
    print(find_multiples(24))
    # BUG FIX: removed a stray `breakpoint()` that dropped anyone running
    # this module into the debugger.