mlquantify 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. mlquantify/__init__.py +0 -29
  2. mlquantify/adjust_counting/__init__.py +14 -0
  3. mlquantify/adjust_counting/_adjustment.py +365 -0
  4. mlquantify/adjust_counting/_base.py +247 -0
  5. mlquantify/adjust_counting/_counting.py +145 -0
  6. mlquantify/adjust_counting/_utils.py +114 -0
  7. mlquantify/base.py +117 -519
  8. mlquantify/base_aggregative.py +209 -0
  9. mlquantify/calibration.py +1 -0
  10. mlquantify/confidence.py +335 -0
  11. mlquantify/likelihood/__init__.py +5 -0
  12. mlquantify/likelihood/_base.py +161 -0
  13. mlquantify/likelihood/_classes.py +414 -0
  14. mlquantify/meta/__init__.py +1 -0
  15. mlquantify/meta/_classes.py +761 -0
  16. mlquantify/metrics/__init__.py +21 -0
  17. mlquantify/metrics/_oq.py +109 -0
  18. mlquantify/metrics/_rq.py +98 -0
  19. mlquantify/{evaluation/measures.py → metrics/_slq.py} +43 -28
  20. mlquantify/mixture/__init__.py +7 -0
  21. mlquantify/mixture/_base.py +153 -0
  22. mlquantify/mixture/_classes.py +400 -0
  23. mlquantify/mixture/_utils.py +112 -0
  24. mlquantify/model_selection/__init__.py +9 -0
  25. mlquantify/model_selection/_protocol.py +358 -0
  26. mlquantify/model_selection/_search.py +315 -0
  27. mlquantify/model_selection/_split.py +1 -0
  28. mlquantify/multiclass.py +350 -0
  29. mlquantify/neighbors/__init__.py +9 -0
  30. mlquantify/neighbors/_base.py +198 -0
  31. mlquantify/neighbors/_classes.py +159 -0
  32. mlquantify/{classification/methods.py → neighbors/_classification.py} +48 -66
  33. mlquantify/neighbors/_kde.py +270 -0
  34. mlquantify/neighbors/_utils.py +135 -0
  35. mlquantify/neural/__init__.py +1 -0
  36. mlquantify/utils/__init__.py +47 -2
  37. mlquantify/utils/_artificial.py +27 -0
  38. mlquantify/utils/_constraints.py +219 -0
  39. mlquantify/utils/_context.py +21 -0
  40. mlquantify/utils/_decorators.py +36 -0
  41. mlquantify/utils/_exceptions.py +12 -0
  42. mlquantify/utils/_get_scores.py +159 -0
  43. mlquantify/utils/_load.py +18 -0
  44. mlquantify/utils/_parallel.py +6 -0
  45. mlquantify/utils/_random.py +36 -0
  46. mlquantify/utils/_sampling.py +273 -0
  47. mlquantify/utils/_tags.py +44 -0
  48. mlquantify/utils/_validation.py +447 -0
  49. mlquantify/utils/prevalence.py +61 -0
  50. {mlquantify-0.1.8.dist-info → mlquantify-0.1.9.dist-info}/METADATA +2 -1
  51. mlquantify-0.1.9.dist-info/RECORD +53 -0
  52. mlquantify/classification/__init__.py +0 -1
  53. mlquantify/evaluation/__init__.py +0 -14
  54. mlquantify/evaluation/protocol.py +0 -289
  55. mlquantify/methods/__init__.py +0 -37
  56. mlquantify/methods/aggregative.py +0 -1159
  57. mlquantify/methods/meta.py +0 -472
  58. mlquantify/methods/mixture_models.py +0 -1003
  59. mlquantify/methods/non_aggregative.py +0 -136
  60. mlquantify/methods/threshold_optimization.py +0 -869
  61. mlquantify/model_selection.py +0 -377
  62. mlquantify/plots.py +0 -367
  63. mlquantify/utils/general.py +0 -371
  64. mlquantify/utils/method.py +0 -449
  65. mlquantify-0.1.8.dist-info/RECORD +0 -22
  66. {mlquantify-0.1.8.dist-info → mlquantify-0.1.9.dist-info}/WHEEL +0 -0
  67. {mlquantify-0.1.8.dist-info → mlquantify-0.1.9.dist-info}/top_level.txt +0 -0
mlquantify/utils/method.py
@@ -1,449 +0,0 @@
- import pandas as pd
- import numpy as np
- from sklearn.model_selection import StratifiedKFold
-
-
-
- def sqEuclidean(dist1, dist2):
-     """
-     Compute the squared Euclidean distance between two probability distributions.
-
-     The squared Euclidean distance is a measure of dissimilarity between two probability
-     distributions. It is defined as:
-
-     D(P, Q) = Σ(Pᵢ - Qᵢ)²
-
-     Parameters
-     ----------
-     dist1 : array-like
-         The first probability distribution \( P \), where each element \( Pᵢ \) represents
-         the probability of the \( i \)-th event.
-     dist2 : array-like
-         The second probability distribution \( Q \), where each element \( Qᵢ \) represents
-         the probability of the \( i \)-th event.
-
-     Returns
-     -------
-     float
-         The squared Euclidean distance between the two distributions.
-
-     Notes
-     -----
-     - This distance is non-negative and equals zero if and only if the two distributions
-       are identical.
-     - Both input distributions must be valid probability distributions; their elements
-       should be non-negative and sum to 1.
-     """
-     P = dist1
-     Q = dist2
-     return sum((P - Q)**2)
-
-
- def probsymm(dist1, dist2):
-     """
-     Compute the probabilistic symmetric distance between two probability distributions.
-
-     The probabilistic symmetric distance is a measure of dissimilarity between two probability
-     distributions. It is defined as:
-
-     D(P, Q) = 2 * Σ((Pᵢ - Qᵢ)² / (Pᵢ + Qᵢ))
-
-     Parameters
-     ----------
-     dist1 : array-like
-         The first probability distribution \( P \), where each element \( Pᵢ \) represents
-         the probability of the \( i \)-th event.
-     dist2 : array-like
-         The second probability distribution \( Q \), where each element \( Qᵢ \) represents
-         the probability of the \( i \)-th event.
-
-     Returns
-     -------
-     float
-         The probabilistic symmetric distance between the two distributions.
-
-     Notes
-     -----
-     - This distance is non-negative and equals zero if and only if the two distributions
-       are identical.
-     - Both input distributions must be valid probability distributions; their elements
-       should be non-negative and sum to 1.
-     - Division by zero is avoided by assuming the input distributions have no zero elements.
-     """
-     P = dist1
-     Q = dist2
-     return 2 * sum((P - Q)**2 / (P + Q))
-
-
- def topsoe(dist1, dist2):
-     """
-     Compute the Topsøe distance between two probability distributions.
-
-     The Topsøe distance is a measure of divergence between two probability distributions.
-     It is defined as:
-
-     D(P, Q) = Σ(Pᵢ * log(2 * Pᵢ / (Pᵢ + Qᵢ)) + Qᵢ * log(2 * Qᵢ / (Pᵢ + Qᵢ)))
-
-     Parameters
-     ----------
-     dist1 : array-like
-         The first probability distribution \( P \), where each element \( Pᵢ \) represents
-         the probability of the \( i \)-th event.
-     dist2 : array-like
-         The second probability distribution \( Q \), where each element \( Qᵢ \) represents
-         the probability of the \( i \)-th event.
-
-     Returns
-     -------
-     float
-         The Topsøe distance between the two distributions.
-
-     Notes
-     -----
-     - This distance is non-negative and equals zero if and only if the two distributions
-       are identical.
-     - Both input distributions must be valid probability distributions; their elements
-       should be non-negative and sum to 1.
-     - Division by zero is avoided by assuming the input distributions have no zero elements.
-     - The logarithm used is the natural logarithm.
-     """
-     P = dist1
-     Q = dist2
-     return sum(P * np.log(2 * P / (P + Q)) + Q * np.log(2 * Q / (P + Q)))
-
-
- def hellinger(dist1, dist2):
-     """
-     Compute the Hellinger distance between two probability distributions.
-
-     The Hellinger distance is a measure of similarity between two probability distributions.
-     It is defined as:
-
-     H(P, Q) = 2 * sqrt(|1 - Σ√(Pᵢ * Qᵢ)|)
-
-     Parameters
-     ----------
-     dist1 : array-like
-         The first probability distribution \( P \), where each element \( Pᵢ \) represents
-         the probability of the \( i \)-th event.
-     dist2 : array-like
-         The second probability distribution \( Q \), where each element \( Qᵢ \) represents
-         the probability of the \( i \)-th event.
-
-     Returns
-     -------
-     float
-         The Hellinger distance between the two distributions.
-
-     Notes
-     -----
-     - The Hellinger distance ranges from 0 to 2, where 0 indicates that the distributions
-       are identical, and 2 indicates that they are completely different.
-     - Both input distributions must be valid probability distributions; their elements
-       should be non-negative and sum to 1.
-     - The absolute value is used to handle numerical errors that may cause the expression
-       inside the square root to become slightly negative.
-     """
-     P = dist1
-     Q = dist2
-     return 2 * np.sqrt(np.abs(1 - sum(np.sqrt(P * Q))))
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- def get_scores(X, y, learner, folds: int = 10, learner_fitted: bool = False) -> tuple:
-     """
-     Generate true labels and predicted probabilities using a machine learning model.
-
-     This function evaluates a machine learning model using cross-validation or directly
-     with a pre-fitted model, returning the true labels and predicted probabilities.
-
-     Parameters
-     ----------
-     X : Union[np.ndarray, pd.DataFrame]
-         Input features for the model.
-     y : Union[np.ndarray, pd.Series]
-         Target labels corresponding to the input features.
-     learner : object
-         A machine learning model that implements the `fit` and `predict_proba` methods.
-     folds : int, optional
-         Number of folds for stratified cross-validation. Defaults to 10.
-     learner_fitted : bool, optional
-         If `True`, assumes the learner is already fitted and directly predicts probabilities
-         without performing cross-validation. Defaults to `False`.
-
-     Returns
-     -------
-     tuple
-         - An array of true labels.
-         - An array of predicted probabilities.
-
-     Notes
-     -----
-     - When `learner_fitted` is `True`, the model is assumed to be pre-trained and no
-       cross-validation is performed.
-     - When `learner_fitted` is `False`, stratified k-fold cross-validation is used to
-       generate predictions.
-     - The input data `X` and `y` are converted to pandas objects for compatibility.
-     """
-     if isinstance(X, np.ndarray):
-         X = pd.DataFrame(X)
-     if isinstance(y, np.ndarray):
-         y = pd.Series(y)
-
-     if learner_fitted:
-         probabilities = learner.predict_proba(X)
-         y_label = y
-     else:
-         skf = StratifiedKFold(n_splits=folds)
-         probabilities = []
-         y_label = []
-
-         for train_index, valid_index in skf.split(X, y):
-             tr_data = pd.DataFrame(X.iloc[train_index])  # Train data and labels
-             tr_label = y.iloc[train_index]
-             valid_data = pd.DataFrame(X.iloc[valid_index])  # Validation data and labels
-             valid_label = y.iloc[valid_index]
-
-             learner.fit(tr_data, tr_label)
-             probabilities.extend(learner.predict_proba(valid_data))  # Evaluating scores
-             y_label.extend(valid_label)
-
-     return np.asarray(y_label), np.asarray(probabilities)
-
-
-
-
-
-
- def getHist(scores, nbins):
-     """
-     Calculate histogram-like bin probabilities for a given set of scores.
-
-     This function divides the score range into equal bins and computes the proportion
-     of scores in each bin, normalized by the total count.
-
-     Parameters
-     ----------
-     scores : np.ndarray
-         A 1-dimensional array of scores.
-     nbins : int
-         Number of bins for dividing the score range.
-
-     Returns
-     -------
-     np.ndarray
-         An array containing the normalized bin probabilities.
-
-     Notes
-     -----
-     - The bins are equally spaced between 0 and 1, with an additional upper boundary
-       to include the maximum score.
-     - The returned probabilities are normalized to account for the total number of scores.
-     """
-     breaks = np.linspace(0, 1, int(nbins) + 1)
-     breaks = np.delete(breaks, -1)
-     breaks = np.append(breaks, 1.1)
-
-     re = np.repeat(1 / (len(breaks) - 1), (len(breaks) - 1))
-     for i in range(1, len(breaks)):
-         re[i - 1] = (re[i - 1] + len(np.where((scores >= breaks[i - 1]) & (scores < breaks[i]))[0])) / (len(scores) + 1)
-
-     return re
-
-
-
-
-
-
-
- def MoSS(n:int, alpha:float, m:float):
-     """
-     Generate a synthetic dataset using the MoSS method.
-
-     Parameters
-     ----------
-     n : int
-         The number of samples to generate.
-     alpha : float
-         The proportion of positive samples in the dataset.
-     m : float
-         The shape parameter for the synthetic dataset.
-
-     Returns
-     -------
-     tuple
-         A tuple containing the synthetic positive and negative samples.
-     """
-     n_pos = int(n*alpha)
-     n_neg = int((1-alpha)*n)
-
-     x_pos = np.arange(1, n_pos, 1)
-     x_neg = np.arange(1, n_neg, 1)
-
-     syn_plus = np.power(x_pos/(n_pos+1), m)
-     syn_neg = 1 - np.power(x_neg/(n_neg+1), m)
-
-     #moss = np.union1d(syn_plus, syn_neg)
-
-     return syn_plus, syn_neg
-
-
-
-
-
- def ternary_search(left, right, f, eps=1e-4):
-     """This function applies Ternary search
-
-     Parameters
-     ----------
-     left : float
-         The left boundary of the search interval.
-     right : float
-         The right boundary of the search interval.
-     f : function
-         The function to optimize.
-     eps : float, optional
-         The desired precision of the result. Defaults to 1e-4.
-
-     Returns
-     -------
-     float
-         The value of the argument that minimizes the function.
-     """
-
-     while True:
-         if abs(left - right) < eps:
-             return (left + right) / 2
-
-         leftThird = left + (right - left) / 3
-         rightThird = right - (right - left) / 3
-
-         if f(leftThird) > f(rightThird):
-             left = leftThird
-         else:
-             right = rightThird
-
-
-
-
-
-
-
-
-
-
-
-
- def compute_table(y, y_pred, classes):
-     """
-     Compute the confusion matrix table for a binary classification task.
-
-     Parameters
-     ----------
-     y : np.ndarray
-         The true labels.
-     y_pred : np.ndarray
-         The predicted labels.
-     classes : np.ndarray
-         The unique classes in the dataset.
-
-     Returns
-     -------
-     tuple
-         A tuple containing the True Positives, False Positives, False Negatives, and True Negatives.
-     """
-     TP = np.logical_and(y == y_pred, y == classes[1]).sum()
-     FP = np.logical_and(y != y_pred, y == classes[0]).sum()
-     FN = np.logical_and(y != y_pred, y == classes[1]).sum()
-     TN = np.logical_and(y == y_pred, y == classes[0]).sum()
-     return TP, FP, FN, TN
-
-
- def compute_tpr(TP, FN):
-     """
-     Compute the True Positive Rate (Recall) for a binary classification task.
-
-     Parameters
-     ----------
-     TP : int
-         The number of True Positives.
-     FN : int
-         The number of False Negatives.
-
-     Returns
-     -------
-     float
-         The True Positive Rate (Recall).
-     """
-     if TP + FN == 0:
-         return 0
-     return TP / (TP + FN)
-
-
- def compute_fpr(FP, TN):
-     """
-     Compute the False Positive Rate for a binary classification task.
-
-     Parameters
-     ----------
-     FP : int
-         The number of False Positives.
-     TN : int
-         The number of True Negatives.
-
-     Returns
-     -------
-     float
-         The False Positive Rate.
-     """
-     if FP + TN == 0:
-         return 0
-     return FP / (FP + TN)
-
-
- def adjust_threshold(y, probabilities:np.ndarray, classes:np.ndarray) -> tuple:
-     """
-     Adjust the threshold for a binary quantification task to maximize the True Positive Rate.
-
-     Parameters
-     ----------
-     y : np.ndarray
-         The true labels.
-     probabilities : np.ndarray
-         The predicted probabilities.
-     classes : np.ndarray
-         The unique classes in the dataset.
-
-     Returns
-     -------
-     tuple
-         The best True Positive Rate and False Positive Rate.
-     """
-     unique_scores = np.linspace(0, 1, 101)
-
-     tprs = []
-     fprs = []
-
-     for threshold in unique_scores:
-         y_pred = np.where(probabilities >= threshold, classes[1], classes[0])
-
-         TP, FP, FN, TN = compute_table(y, y_pred, classes)
-
-         tpr = compute_tpr(TP, FN)
-         fpr = compute_fpr(FP, TN)
-
-         tprs.append(tpr)
-         fprs.append(fpr)
-
-     #best_tpr, best_fpr = self.adjust_threshold(np.asarray(tprs), np.asarray(fprs))
-     return (unique_scores, np.asarray(tprs), np.asarray(fprs))
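For context, the hunk above removes the distribution-distance and score-histogram helpers that lived in mlquantify/utils/method.py in 0.1.8. A minimal usage sketch against that 0.1.8 module follows; the toy arrays, seed, and printed comments are illustrative only, not taken from the package's documentation.

    import numpy as np
    # These helpers were removed in 0.1.9; in 0.1.8 they were module-level
    # functions in mlquantify/utils/method.py.
    from mlquantify.utils.method import sqEuclidean, probsymm, topsoe, hellinger, getHist

    # Two toy probability distributions (non-negative, summing to 1).
    P = np.array([0.2, 0.3, 0.5])
    Q = np.array([0.25, 0.25, 0.5])

    print(sqEuclidean(P, Q))   # sum((P - Q)^2)
    print(probsymm(P, Q))      # 2 * sum((P - Q)^2 / (P + Q))
    print(topsoe(P, Q))        # sum(P*log(2P/(P+Q)) + Q*log(2Q/(P+Q)))
    print(hellinger(P, Q))     # 2 * sqrt(|1 - sum(sqrt(P * Q))|)

    # Normalized bin proportions of classifier scores over [0, 1].
    scores = np.random.default_rng(0).uniform(size=1000)
    print(getHist(scores, nbins=10))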
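Likewise, a short sketch of the removed threshold utilities, which sweep 101 candidate thresholds and return the TPR/FPR curves, and of ternary_search, which minimizes a unimodal function on an interval. The synthetic labels, scores, and objective below are made up for illustration and assume mlquantify 0.1.8 is installed.

    import numpy as np
    from mlquantify.utils.method import adjust_threshold, ternary_search

    rng = np.random.default_rng(0)
    classes = np.array([0, 1])
    y = rng.integers(0, 2, size=500)                 # synthetic binary labels
    probabilities = np.clip(y * 0.6 + rng.normal(0.2, 0.25, size=500), 0, 1)  # noisy scores

    # TPR/FPR for each of the 101 thresholds in [0, 1].
    thresholds, tprs, fprs = adjust_threshold(y, probabilities, classes)
    print(thresholds[:5], tprs[:5], fprs[:5])

    # ternary_search minimizes a unimodal function over [left, right].
    best_x = ternary_search(0.0, 1.0, lambda x: (x - 0.3) ** 2)
    print(best_x)  # approximately 0.3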
@@ -1,22 +0,0 @@
1
- mlquantify/__init__.py,sha256=EzRAX5TpVjDLP_Z9RG98xMhMGVckcYDHxwaZLrSKZjA,979
2
- mlquantify/base.py,sha256=hJ9FYYNGeO5-WJlpJpsUiu_LQL1fimvZPPNsKptxN7w,19196
3
- mlquantify/model_selection.py,sha256=rPR4fwwxuihzx5Axq4NhMOeuMBzpoC9pKp5taYNt_LY,12678
4
- mlquantify/plots.py,sha256=9XOhx4QXkN9RkkiErLuL90FWIBUV2YTEJNT4Jwfy0ac,12380
5
- mlquantify/classification/__init__.py,sha256=3FGf-F4SOM3gByUPsWdnBzjyC_31B3MtzuolEuocPls,22
6
- mlquantify/classification/methods.py,sha256=yDSbpoqM3hfF0a9ATzKqfG9S-44x-0Rq0lkAVJKTIEs,5006
7
- mlquantify/evaluation/__init__.py,sha256=x1grng0n_QeZpVBU8-pwagYdBMkbMRILtrp1qk_bLvk,447
8
- mlquantify/evaluation/measures.py,sha256=fIKyxxlD8em3oaj4u_BeXmNyUQG_A0vXWY8APPgNoJ0,6579
9
- mlquantify/evaluation/protocol.py,sha256=WILyr6i4GZLk9DZqzhcyQ1jSCO0GhoEn_lqTwWCUf64,10000
10
- mlquantify/methods/__init__.py,sha256=ya3Mn7bcz2r3oaIT7yVR4iJkAfgEAwF4xDK54C0rZ7U,536
11
- mlquantify/methods/aggregative.py,sha256=F5Z-tGA9OcZgMBLKOeaos6wIgvvnDeriZ4y0TyMpDrc,39051
12
- mlquantify/methods/meta.py,sha256=mBunCc_PFLdmrs5sf5MDc8TbO3VFpLAmxV2y2VDNjY8,19052
13
- mlquantify/methods/mixture_models.py,sha256=si2Pzaka5Kbva4QKBzLolvb_8V0ZEjp68UBAiOwl49s,35166
14
- mlquantify/methods/non_aggregative.py,sha256=xaBu21TUtiYkOEUKO16NaNMwdNa6-SNjfBsc5PpIMyI,4815
15
- mlquantify/methods/threshold_optimization.py,sha256=NYGKbYvtfmiBeU8wpTiFCdURkijcPRZtybPOt6vtXbY,30489
16
- mlquantify/utils/__init__.py,sha256=logWrL6B6mukP8tvYm_UPEdO9eNA-J-ySILr7-syDoc,44
17
- mlquantify/utils/general.py,sha256=wKJSmwF1KfSlSrDm0KTf92FMvB62BBOxf2Se9HyeWYE,8668
18
- mlquantify/utils/method.py,sha256=RL4vBJGl5_6DZ59Bs62hdNXI_hnoDIWilMMyMPiOjBg,12631
19
- mlquantify-0.1.8.dist-info/METADATA,sha256=PQbJUuo_c3k2PApZjapoIz0Cx61ovZZxh_j291TchIs,5166
20
- mlquantify-0.1.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
21
- mlquantify-0.1.8.dist-info/top_level.txt,sha256=tGEkYkbbFElwULvqENjam3u1uXtyC1J9dRmibsq8_n0,11
22
- mlquantify-0.1.8.dist-info/RECORD,,