mlquantify 0.0.11.2__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. mlquantify/__init__.py +32 -6
  2. mlquantify/base.py +559 -257
  3. mlquantify/classification/__init__.py +1 -1
  4. mlquantify/classification/methods.py +160 -0
  5. mlquantify/evaluation/__init__.py +14 -2
  6. mlquantify/evaluation/measures.py +215 -0
  7. mlquantify/evaluation/protocol.py +647 -0
  8. mlquantify/methods/__init__.py +37 -40
  9. mlquantify/methods/aggregative.py +1030 -0
  10. mlquantify/methods/meta.py +472 -0
  11. mlquantify/methods/mixture_models.py +1003 -0
  12. mlquantify/methods/non_aggregative.py +136 -0
  13. mlquantify/methods/threshold_optimization.py +957 -0
  14. mlquantify/model_selection.py +377 -232
  15. mlquantify/plots.py +367 -0
  16. mlquantify/utils/__init__.py +2 -2
  17. mlquantify/utils/general.py +334 -0
  18. mlquantify/utils/method.py +449 -0
  19. {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/METADATA +137 -122
  20. mlquantify-0.1.1.dist-info/RECORD +22 -0
  21. {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/WHEEL +1 -1
  22. mlquantify/classification/pwkclf.py +0 -73
  23. mlquantify/evaluation/measures/__init__.py +0 -26
  24. mlquantify/evaluation/measures/ae.py +0 -11
  25. mlquantify/evaluation/measures/bias.py +0 -16
  26. mlquantify/evaluation/measures/kld.py +0 -8
  27. mlquantify/evaluation/measures/mse.py +0 -12
  28. mlquantify/evaluation/measures/nae.py +0 -16
  29. mlquantify/evaluation/measures/nkld.py +0 -13
  30. mlquantify/evaluation/measures/nrae.py +0 -16
  31. mlquantify/evaluation/measures/rae.py +0 -12
  32. mlquantify/evaluation/measures/se.py +0 -12
  33. mlquantify/evaluation/protocol/_Protocol.py +0 -202
  34. mlquantify/evaluation/protocol/__init__.py +0 -2
  35. mlquantify/evaluation/protocol/app.py +0 -146
  36. mlquantify/evaluation/protocol/npp.py +0 -34
  37. mlquantify/methods/aggregative/ThreholdOptm/_ThreholdOptimization.py +0 -62
  38. mlquantify/methods/aggregative/ThreholdOptm/__init__.py +0 -7
  39. mlquantify/methods/aggregative/ThreholdOptm/acc.py +0 -27
  40. mlquantify/methods/aggregative/ThreholdOptm/max.py +0 -23
  41. mlquantify/methods/aggregative/ThreholdOptm/ms.py +0 -21
  42. mlquantify/methods/aggregative/ThreholdOptm/ms2.py +0 -25
  43. mlquantify/methods/aggregative/ThreholdOptm/pacc.py +0 -41
  44. mlquantify/methods/aggregative/ThreholdOptm/t50.py +0 -21
  45. mlquantify/methods/aggregative/ThreholdOptm/x.py +0 -23
  46. mlquantify/methods/aggregative/__init__.py +0 -9
  47. mlquantify/methods/aggregative/cc.py +0 -32
  48. mlquantify/methods/aggregative/emq.py +0 -86
  49. mlquantify/methods/aggregative/fm.py +0 -72
  50. mlquantify/methods/aggregative/gac.py +0 -96
  51. mlquantify/methods/aggregative/gpac.py +0 -87
  52. mlquantify/methods/aggregative/mixtureModels/_MixtureModel.py +0 -81
  53. mlquantify/methods/aggregative/mixtureModels/__init__.py +0 -5
  54. mlquantify/methods/aggregative/mixtureModels/dys.py +0 -55
  55. mlquantify/methods/aggregative/mixtureModels/dys_syn.py +0 -89
  56. mlquantify/methods/aggregative/mixtureModels/hdy.py +0 -46
  57. mlquantify/methods/aggregative/mixtureModels/smm.py +0 -27
  58. mlquantify/methods/aggregative/mixtureModels/sord.py +0 -77
  59. mlquantify/methods/aggregative/pcc.py +0 -33
  60. mlquantify/methods/aggregative/pwk.py +0 -38
  61. mlquantify/methods/meta/__init__.py +0 -1
  62. mlquantify/methods/meta/ensemble.py +0 -236
  63. mlquantify/methods/non_aggregative/__init__.py +0 -1
  64. mlquantify/methods/non_aggregative/hdx.py +0 -71
  65. mlquantify/plots/__init__.py +0 -2
  66. mlquantify/plots/distribution_plot.py +0 -109
  67. mlquantify/plots/protocol_plot.py +0 -193
  68. mlquantify/utils/general_purposes/__init__.py +0 -8
  69. mlquantify/utils/general_purposes/convert_col_to_array.py +0 -13
  70. mlquantify/utils/general_purposes/generate_artificial_indexes.py +0 -29
  71. mlquantify/utils/general_purposes/get_real_prev.py +0 -9
  72. mlquantify/utils/general_purposes/load_quantifier.py +0 -4
  73. mlquantify/utils/general_purposes/make_prevs.py +0 -23
  74. mlquantify/utils/general_purposes/normalize.py +0 -20
  75. mlquantify/utils/general_purposes/parallel.py +0 -10
  76. mlquantify/utils/general_purposes/round_protocol_df.py +0 -14
  77. mlquantify/utils/method_purposes/__init__.py +0 -6
  78. mlquantify/utils/method_purposes/distances.py +0 -21
  79. mlquantify/utils/method_purposes/getHist.py +0 -13
  80. mlquantify/utils/method_purposes/get_scores.py +0 -33
  81. mlquantify/utils/method_purposes/moss.py +0 -16
  82. mlquantify/utils/method_purposes/ternary_search.py +0 -14
  83. mlquantify/utils/method_purposes/tprfpr.py +0 -42
  84. mlquantify-0.0.11.2.dist-info/RECORD +0 -73
  85. {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/top_level.txt +0 -0
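The hunk below is the new mlquantify/methods/threshold_optimization.py (item 13 above), which consolidates the quantifiers previously spread across mlquantify/methods/aggregative/ThreholdOptm/ into a single module. Every method in it corrects the raw classify-and-count output with a (TPR, FPR) pair chosen by its best_tprfpr rule. As a minimal standalone sketch of that correction only (the helper name adjusted_count is illustrative, not part of the package):

import numpy as np

def adjusted_count(cc_output, tpr, fpr):
    # Threshold-method adjustment mirrored from _predict_method below:
    # prevalence = (cc_output - fpr) / (tpr - fpr), clipped to [0, 1];
    # falls back to the raw classify-and-count output when tpr == fpr.
    if tpr - fpr == 0:
        return cc_output
    return float(np.clip((cc_output - fpr) / (tpr - fpr), 0, 1))

# Example: 60% of test scores above the threshold, tpr=0.9, fpr=0.1 -> 0.625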
@@ -0,0 +1,957 @@
+ from abc import abstractmethod
+ import numpy as np
+ import warnings
+ from sklearn.base import BaseEstimator
+ 
+ from ..base import AggregativeQuantifier
+ from ..utils.method import adjust_threshold, get_scores
+ import mlquantify as mq
+ 
+ 
+ 
+ 
+ class ThresholdOptimization(AggregativeQuantifier):
+     """
+     Generic Class for methods that adjust the decision boundary of the underlying classifier
+     to make the ACC (base method for threshold methods) estimation more numerically stable.
+     Most strategies involve altering the denominator of the ACC equation.
+ 
+     This class serves as a base for implementing threshold optimization techniques in classification
+     tasks. It is designed to adjust thresholds based on true positive and false positive rates,
+     ensuring better quantification performance.
+ 
+     Parameters
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier to be used for threshold optimization.
+     threshold : float, optional
+         The threshold value to be used for classification decisions. Default is 0.5.
+ 
+     Attributes
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier.
+     threshold : float, optional
+         The optimized threshold used for classification decisions.
+     cc_output : float, optional
+         The classification count output, representing the proportion of instances classified
+         as positive based on the threshold.
+     tpr : float, optional
+         The true positive rate corresponding to the best threshold.
+     fpr : float, optional
+         The false positive rate corresponding to the best threshold.
+ 
+     Notes
+     -----
+     All methods that inherit from this class are binary quantifiers. For multiclass problems, they are applied in a One-vs-All fashion.
+ 
+     Examples
+     --------
+     >>> from mlquantify.methods.threshold_optimization import ThresholdOptimization
+     >>> from mlquantify.utils.general import get_real_prev
+     >>> from sklearn.datasets import load_breast_cancer
+     >>> from sklearn.svm import SVC
+     >>> from sklearn.model_selection import train_test_split
+     >>>
+     >>> class MyThrMethod(ThresholdOptimization):
+     ...     def __init__(self, learner, threshold=0.5):
+     ...         super().__init__(learner)
+     ...         self.threshold = threshold
+     ...     def best_tprfpr(self, thresholds, tpr, fpr):
+     ...         return thresholds[20], tpr[20], fpr[20]
+     >>>
+     >>> features, target = load_breast_cancer(return_X_y=True)
+     >>>
+     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
+     >>>
+     >>> mtm = MyThrMethod(learner=SVC(probability=True), threshold=0.5)
+     >>> mtm.fit(X_train, y_train)
+     >>> y_pred = mtm.predict(X_test)
+     """
+ 
+     def __init__(self, learner: BaseEstimator=None):
+         self.learner = learner
+         self.threshold = None
+         self.cc_output = None
+         self.tpr = None
+         self.fpr = None
+ 
+     @property
+     def is_probabilistic(self) -> bool:
+         """
+         Returns whether the method is probabilistic.
+ 
+         This method is used to determine whether the quantification method is probabilistic,
+         meaning it uses class-conditional probabilities to estimate class prevalences.
+ 
+         Returns
+         -------
+         bool
+             True, indicating that this method is probabilistic.
+         """
+         return True
+ 
+     @property
+     def is_multiclass(self) -> bool:
+         """
+         Returns whether the method is applicable to multiclass quantification.
+ 
+         Threshold-based methods are typically binary classifiers, so this method
+         returns False.
+ 
+         Returns
+         -------
+         bool
+             False, indicating that this method does not support multiclass quantification.
+         """
+         return False
+ 
+     def _fit_method(self, X, y):
+         """
+         Fits the classifier and adjusts thresholds based on true positive rate (TPR) and false positive rate (FPR).
+ 
+         Parameters
+         ----------
+         X : pd.DataFrame or np.ndarray
+             The input features for training.
+         y : pd.Series or np.ndarray
+             The target labels for training.
+ 
+         Returns
+         -------
+         self : ThresholdOptimization
+             The fitted quantifier object with the best threshold, TPR, and FPR.
+         """
+         # Get predicted labels and probabilities
+         if mq.arguments["y_labels"] is not None and mq.arguments["posteriors_train"] is not None:
+             y_labels = mq.arguments["y_labels"]
+             probabilities = mq.arguments["posteriors_train"]
+         else:
+             y_labels, probabilities = get_scores(X, y, self.learner, self.cv_folds, self.learner_fitted)
+ 
+         # Adjust thresholds and compute true and false positive rates
+         thresholds, tprs, fprs = adjust_threshold(y_labels, probabilities[:, 1], self.classes)
+ 
+         # Find the best threshold based on TPR and FPR
+         self.threshold, self.tpr, self.fpr = self.best_tprfpr(thresholds, tprs, fprs)
+ 
+         return self
+ 
+     def _predict_method(self, X) -> np.ndarray:
+         """
+         Predicts class prevalences using the adjusted threshold.
+ 
+         Parameters
+         ----------
+         X : pd.DataFrame or np.ndarray
+             The input features for prediction.
+ 
+         Returns
+         -------
+         np.ndarray
+             An array of predicted prevalences for the classes.
+         """
+         # Get predicted probabilities for the positive class
+         probabilities = self.predict_learner(X)[:, 1]
+ 
+         # Compute the classification count output based on the threshold
+         self.cc_output = len(probabilities[probabilities >= self.threshold]) / len(probabilities)
+ 
+         # Calculate prevalence, ensuring it is within [0, 1]
+         if self.tpr - self.fpr == 0:
+             prevalence = self.cc_output
+         else:
+             # Equation of threshold methods to compute prevalence
+             prevalence = np.clip((self.cc_output - self.fpr) / (self.tpr - self.fpr), 0, 1)
+ 
+         prevalences = [1 - prevalence, prevalence]
+ 
+         return np.asarray(prevalences)
+ 
+     @abstractmethod
+     def best_tprfpr(self, thresholds: np.ndarray, tpr: np.ndarray, fpr: np.ndarray) -> tuple:
+         """
+         Abstract method for determining the best TPR (True Positive Rate) and FPR (False Positive Rate)
+         to use in the equation for threshold optimization.
+ 
+         This method needs to be implemented by subclasses to define how the best threshold
+         is chosen based on TPR and FPR.
+ 
+         Parameters
+         ----------
+         thresholds : np.ndarray
+             An array of threshold values.
+         tpr : np.ndarray
+             An array of true positive rates corresponding to the thresholds.
+         fpr : np.ndarray
+             An array of false positive rates corresponding to the thresholds.
+ 
+         Returns
+         -------
+         tuple
+             A tuple containing the selected threshold, the true positive rate (TPR), and the
+             false positive rate (FPR).
+         """
+         ...
+ 
+ 
+ 
+ 
+ 
+ 
+ class ACC(ThresholdOptimization):
+     """
+     Adjusted Classify and Count (ACC). This method is a base approach for threshold-based
+     quantification methods.
+ 
+     As described in the ThresholdOptimization base class, this method estimates the true
+     positive rate (TPR) and false positive rate (FPR) from the training data. It then uses
+     these values to adjust the output of the Classify and Count (CC) method, making the
+     quantification process more accurate and stable.
+ 
+     Parameters
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier to be used for quantification.
+     threshold : float, optional
+         The decision threshold for classifying instances. Default is 0.5.
+ 
+     Attributes
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier.
+     threshold : float
+         The decision threshold used to classify instances as positive or negative. Default is 0.5.
+ 
+     See Also
+     --------
+     ThresholdOptimization : Base class for threshold-based quantification methods.
+     CC : Classify and Count quantification method.
+ 
+     References
+     ----------
+     FORMAN, George. Quantifying counts and costs via classification. Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008. Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y
+ 
+     Examples
+     --------
+     >>> from mlquantify.methods.aggregative import ACC
+     >>> from mlquantify.utils.general import get_real_prev
+     >>> from sklearn.datasets import load_breast_cancer
+     >>> from sklearn.svm import SVC
+     >>> from sklearn.model_selection import train_test_split
+     >>>
+     >>> features, target = load_breast_cancer(return_X_y=True)
+     >>>
+     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
+     >>>
+     >>> acc = ACC(learner=SVC(probability=True), threshold=0.5)
+     >>> acc.fit(X_train, y_train)
+     >>> y_pred = acc.predict(X_test)
+     >>> y_pred
+     {0: 0.3968506555196656, 1: 0.6031493444803344}
+     >>> get_real_prev(y_test)
+     {0: 0.3991228070175439, 1: 0.6008771929824561}
+     """
+ 
+     def __init__(self, learner: BaseEstimator=None, threshold: float = 0.5):
+         super().__init__(learner)
+         self.threshold = threshold
+ 
+     def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
+         """
+         Determines the true positive rate (TPR) and false positive rate (FPR) for the specified threshold.
+ 
+         This method identifies the TPR and FPR corresponding to the threshold provided
+         during initialization. It assumes that the `thresholds`, `tprs`, and `fprs` arrays
+         are aligned, meaning the `i-th` element of each array corresponds to the same threshold.
+ 
+         Parameters
+         ----------
+         thresholds : np.ndarray
+             An array of threshold values.
+         tprs : np.ndarray
+             An array of true positive rates corresponding to the thresholds.
+         fprs : np.ndarray
+             An array of false positive rates corresponding to the thresholds.
+ 
+         Returns
+         -------
+         tuple
+             A tuple containing the threshold, the true positive rate (TPR), and the false
+             positive rate (FPR) for the specified threshold.
+ 
+         Raises
+         ------
+         IndexError
+             If the specified threshold is not found in the `thresholds` array.
+         """
+         # Get the TPR and FPR where the threshold matches the specified value
+         tpr = tprs[thresholds == self.threshold][0]
+         fpr = fprs[thresholds == self.threshold][0]
+         return (self.threshold, tpr, fpr)
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ class MAX(ThresholdOptimization):
+     """
+     Threshold MAX. This quantification method selects the threshold that maximizes
+     the absolute difference between the true positive rate (TPR) and false positive
+     rate (FPR). This threshold is then used in the denominator of the equation for
+     adjusted prevalence estimation.
+ 
+     Parameters
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier to be used for quantification.
+ 
+     Attributes
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier.
+ 
+     See Also
+     --------
+     ThresholdOptimization : Base class for threshold-based quantification methods.
+     ACC : Adjusted Classify and Count quantification method.
+     CC : Classify and Count quantification method.
+ 
+     References
+     ----------
+     FORMAN, George. Counting positives accurately despite inaccurate classification. In: European conference on machine learning. Berlin, Heidelberg: Springer Berlin Heidelberg, 2005. p. 564-575. Available at: https://link.springer.com/chapter/10.1007/11564096_56
+ 
+     Examples
+     --------
+     >>> from mlquantify.methods.aggregative import MAX
+     >>> from mlquantify.utils.general import get_real_prev
+     >>> from sklearn.datasets import load_breast_cancer
+     >>> from sklearn.svm import SVC
+     >>> from sklearn.model_selection import train_test_split
+     >>>
+     >>> features, target = load_breast_cancer(return_X_y=True)
+     >>>
+     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
+     >>>
+     >>> maxq = MAX(learner=SVC(probability=True))
+     >>> maxq.fit(X_train, y_train)
+     >>> y_pred = maxq.predict(X_test)
+     >>> y_pred
+     {0: 0.3920664352842359, 1: 0.6079335647157641}
+     >>> get_real_prev(y_test)
+     {0: 0.3991228070175439, 1: 0.6008771929824561}
+     """
+ 
+     def __init__(self, learner: BaseEstimator=None):
+         super().__init__(learner)
+ 
+     def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
+         """
+         Determines the optimal threshold by maximizing the absolute difference between
+         the true positive rate (TPR) and the false positive rate (FPR).
+ 
+         This method identifies the index where `|TPR - FPR|` is maximized and retrieves
+         the corresponding threshold, TPR, and FPR.
+ 
+         Parameters
+         ----------
+         thresholds : np.ndarray
+             An array of threshold values.
+         tprs : np.ndarray
+             An array of true positive rates corresponding to the thresholds.
+         fprs : np.ndarray
+             An array of false positive rates corresponding to the thresholds.
+ 
+         Returns
+         -------
+         tuple
+             A tuple containing:
+             - The threshold that maximizes `|TPR - FPR|`.
+             - The true positive rate (TPR) at the selected threshold.
+             - The false positive rate (FPR) at the selected threshold.
+ 
+         Raises
+         ------
+         ValueError
+             If `thresholds`, `tprs`, or `fprs` are empty or have mismatched lengths.
+         """
+         max_index = np.argmax(np.abs(tprs - fprs))
+ 
+         # Retrieve the corresponding threshold, TPR, and FPR
+         threshold = thresholds[max_index]
+         tpr = tprs[max_index]
+         fpr = fprs[max_index]
+         return (threshold, tpr, fpr)
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ class MS(ThresholdOptimization):
+     """
+     Median Sweep (MS). This quantification method uses an ensemble
+     of threshold-based methods, taking the median values of the
+     true positive rate (TPR) and false positive rate (FPR) across
+     all thresholds to compute adjusted prevalences.
+ 
+     Parameters
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier to be used for quantification.
+     threshold : float, optional
+         The default threshold value to use for the quantification method. Default is 0.5.
+ 
+     Attributes
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier.
+     threshold : float
+         The default threshold to use for the quantification method, typically 0.5.
+ 
+     See Also
+     --------
+     ThresholdOptimization : Base class for threshold-based quantification methods.
+     ACC : Adjusted Classify and Count quantification method.
+     MAX : Threshold MAX quantification method.
+     CC : Classify and Count quantification method.
+ 
+     References
+     ----------
+     FORMAN, George. Quantifying counts and costs via classification. Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008. Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y
+ 
+     Examples
+     --------
+     >>> from mlquantify.methods.aggregative import MS
+     >>> from mlquantify.utils.general import get_real_prev
+     >>> from sklearn.datasets import load_breast_cancer
+     >>> from sklearn.svm import SVC
+     >>> from sklearn.model_selection import train_test_split
+     >>>
+     >>> features, target = load_breast_cancer(return_X_y=True)
+     >>>
+     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
+     >>>
+     >>> ms = MS(learner=SVC(probability=True))
+     >>> ms.fit(X_train, y_train)
+     >>> y_pred = ms.predict(X_test)
+     >>> y_pred
+     {0: 0.41287676595138967, 1: 0.5871232340486103}
+     >>> get_real_prev(y_test)
+     {0: 0.3991228070175439, 1: 0.6008771929824561}
+     """
+ 
+     def __init__(self, learner: BaseEstimator=None, threshold: float = 0.5):
+         super().__init__(learner)
+         self.threshold = threshold
+ 
+     def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
+         """
+         Determines the optimal TPR and FPR by taking the median of
+         all TPR and FPR values across the given thresholds.
+ 
+         This method computes the median values of TPR and FPR to
+         mitigate the influence of outliers and variability in the
+         performance metrics.
+ 
+         Parameters
+         ----------
+         thresholds : np.ndarray
+             An array of threshold values.
+         tprs : np.ndarray
+             An array of true positive rates corresponding to the thresholds.
+         fprs : np.ndarray
+             An array of false positive rates corresponding to the thresholds.
+ 
+         Returns
+         -------
+         tuple
+             A tuple containing:
+             - The default threshold value (float).
+             - The median true positive rate (float).
+             - The median false positive rate (float).
+ 
+         Raises
+         ------
+         ValueError
+             If `thresholds`, `tprs`, or `fprs` are empty or have mismatched lengths.
+         """
+         # Compute median TPR and FPR
+         tpr = np.median(tprs)
+         fpr = np.median(fprs)
+ 
+         return (self.threshold, tpr, fpr)
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ class MS2(ThresholdOptimization):
+     """
+     Median Sweep 2 (MS2). This method is an extension of the
+     Median Sweep strategy, but it focuses only on cases where
+     the difference between the true positive rate (TPR) and the
+     false positive rate (FPR) exceeds a threshold (0.25). The
+     method computes the median values of TPR, FPR, and thresholds
+     for these selected cases.
+ 
+     Parameters
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier to be used for quantification.
+ 
+     Attributes
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier.
+ 
+     References
+     ----------
+     FORMAN, George. Quantifying counts and costs via classification. Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008. Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y
+ 
+     See Also
+     --------
+     ThresholdOptimization : Base class for threshold-based quantification methods.
+     ACC : Adjusted Classify and Count quantification method.
+     MS : Median Sweep quantification method.
+     CC : Classify and Count quantification method.
+ 
+     Examples
+     --------
+     >>> from mlquantify.methods.aggregative import MS2
+     >>> from mlquantify.utils.general import get_real_prev
+     >>> from sklearn.datasets import load_breast_cancer
+     >>> from sklearn.svm import SVC
+     >>> from sklearn.model_selection import train_test_split
+     >>>
+     >>> features, target = load_breast_cancer(return_X_y=True)
+     >>>
+     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
+     >>>
+     >>> ms2 = MS2(learner=SVC(probability=True))
+     >>> ms2.fit(X_train, y_train)
+     >>> y_pred = ms2.predict(X_test)
+     >>> y_pred
+     {0: 0.41287676595138967, 1: 0.5871232340486103}
+     >>> get_real_prev(y_test)
+     {0: 0.3991228070175439, 1: 0.6008771929824561}
+     """
+ 
+     def __init__(self, learner: BaseEstimator=None):
+         super().__init__(learner)
+ 
+     def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
+         """
+         Determines the optimal threshold, TPR, and FPR by focusing only on
+         cases where the absolute difference between TPR and FPR is greater
+         than 0.25. For these cases, the method computes the median values.
+ 
+         Parameters
+         ----------
+         thresholds : np.ndarray
+             An array of threshold values.
+         tprs : np.ndarray
+             An array of true positive rates corresponding to the thresholds.
+         fprs : np.ndarray
+             An array of false positive rates corresponding to the thresholds.
+ 
+         Returns
+         -------
+         tuple
+             A tuple containing:
+             - The median threshold value for cases meeting the condition (float).
+             - The median true positive rate for cases meeting the condition (float).
+             - The median false positive rate for cases meeting the condition (float).
+ 
+         Raises
+         ------
+         ValueError
+             If no cases satisfy the condition `|TPR - FPR| > 0.25`.
+         Warning
+             If all TPR or FPR values are zero.
+         """
+         # Check if all TPR or FPR values are zero
+         if np.all(tprs == 0) or np.all(fprs == 0):
+             warnings.warn("All TPR or FPR values are zero.")
+ 
+         # Identify indices where the condition is satisfied
+         indices = np.where(np.abs(tprs - fprs) > 0.25)[0]
+         if len(indices) == 0:
+             raise ValueError("No cases meet the condition |TPR - FPR| > 0.25.")
+ 
+         # Compute medians for the selected cases
+         threshold = np.median(thresholds[indices])
+         tpr = np.median(tprs[indices])
+         fpr = np.median(fprs[indices])
+ 
+         return (threshold, tpr, fpr)
+ 
+ 
+ class PACC(ThresholdOptimization):
+     """
+     Probabilistic Adjusted Classify and Count (PACC).
+     This method extends the Adjusted Classify and Count (AC) approach
+     by leveraging the average class-conditional confidences obtained
+     from a probabilistic classifier instead of relying solely on true
+     positive and false positive rates.
+ 
+     Parameters
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier to be used for quantification.
+     threshold : float, optional
+         The decision threshold for classification. Default is 0.5.
+ 
+     Attributes
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier.
+     threshold : float
+         Decision threshold for classification. Default is 0.5.
+     tpr : float
+         True positive rate computed during the fitting process.
+     fpr : float
+         False positive rate computed during the fitting process.
+ 
+     See Also
+     --------
+     ThresholdOptimization : Base class for threshold-based quantification methods.
+     ACC : Adjusted Classify and Count quantification method.
+     CC : Classify and Count quantification method.
+ 
+     References
+     ----------
+     A. Bella, C. Ferri, J. Hernández-Orallo and M. J. Ramírez-Quintana, "Quantification via Probability Estimators," 2010 IEEE International Conference on Data Mining, Sydney, NSW, Australia, 2010, pp. 737-742, doi: 10.1109/ICDM.2010.75. Available at: https://ieeexplore.ieee.org/abstract/document/5694031
+ 
+     Examples
+     --------
+     >>> from mlquantify.methods.aggregative import PACC
+     >>> from mlquantify.utils.general import get_real_prev
+     >>> from sklearn.datasets import load_breast_cancer
+     >>> from sklearn.svm import SVC
+     >>> from sklearn.model_selection import train_test_split
+     >>>
+     >>> features, target = load_breast_cancer(return_X_y=True)
+     >>>
+     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
+     >>>
+     >>> pacc = PACC(learner=SVC(probability=True))
+     >>> pacc.fit(X_train, y_train)
+     >>> y_pred = pacc.predict(X_test)
+     >>> y_pred
+     {0: 0.4664886119311328, 1: 0.5335113880688672}
+     >>> get_real_prev(y_test)
+     {0: 0.3991228070175439, 1: 0.6008771929824561}
+     """
+ 
+     def __init__(self, learner: BaseEstimator=None, threshold: float = 0.5):
+         super().__init__(learner)
+         self.threshold = threshold
+ 
+     def _predict_method(self, X):
+         """
+         Predicts the class prevalence using the mean class-conditional
+         probabilities from a probabilistic classifier.
+ 
+         Parameters
+         ----------
+         X : array-like or sparse matrix of shape (n_samples, n_features)
+             The input data for prediction.
+ 
+         Returns
+         -------
+         dict
+             A dictionary with class labels as keys and their respective
+             prevalence estimates as values.
+ 
+         Notes
+         -----
+         The prevalence is adjusted using the formula:
+         prevalence = |mean_score - FPR| / (TPR - FPR),
+         where mean_score is the average probability for the positive class.
+ 
+         Raises
+         ------
+         ZeroDivisionError
+             If `TPR - FPR` equals zero, indicating that the classifier's
+             performance does not vary across the threshold range.
+         """
+         prevalences = {}
+ 
+         # Calculate probabilities for the positive class
+         probabilities = self.predict_learner(X)[:, 1]
+ 
+         # Compute the mean score for the positive class
+         mean_scores = np.mean(probabilities)
+ 
+         # Adjust prevalence based on TPR and FPR
+         if self.tpr - self.fpr == 0:
+             prevalence = mean_scores
+         else:
+             prevalence = np.clip(abs(mean_scores - self.fpr) / (self.tpr - self.fpr), 0, 1)
+ 
+         # Map the computed prevalence to the class labels
+         prevalences[self.classes[0]] = 1 - prevalence
+         prevalences[self.classes[1]] = prevalence
+ 
+         return prevalences
+ 
+     def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
+         """
+         Finds the true positive rate (TPR) and false positive rate (FPR)
+         corresponding to the specified decision threshold.
+ 
+         Parameters
+         ----------
+         thresholds : np.ndarray
+             An array of threshold values.
+         tprs : np.ndarray
+             An array of true positive rates corresponding to the thresholds.
+         fprs : np.ndarray
+             An array of false positive rates corresponding to the thresholds.
+ 
+         Returns
+         -------
+         tuple
+             A tuple containing the specified threshold, TPR, and FPR.
+ 
+         Raises
+         ------
+         IndexError
+             If the specified threshold is not found in the `thresholds` array.
+         """
+         # Locate TPR and FPR for the specified threshold
+         tpr = tprs[thresholds == self.threshold][0]
+         fpr = fprs[thresholds == self.threshold][0]
+         return (self.threshold, tpr, fpr)
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ class T50(ThresholdOptimization):
+     """
+     Threshold 50 (T50). This method adjusts the decision threshold
+     to the point where the true positive rate (TPR) is approximately
+     equal to 0.5. This approach is particularly useful for balancing
+     sensitivity and specificity in binary classification tasks.
+ 
+     Parameters
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier to be used for quantification.
+ 
+     Attributes
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier.
+     threshold : float
+         Decision threshold determined during training.
+     tpr : float
+         True positive rate corresponding to the selected threshold.
+     fpr : float
+         False positive rate corresponding to the selected threshold.
+ 
+     See Also
+     --------
+     ThresholdOptimization : Base class for threshold-based quantification methods.
+     ACC : Adjusted Classify and Count quantification method.
+     CC : Classify and Count quantification method.
+ 
+     References
+     ----------
+     FORMAN, George. Quantifying counts and costs via classification. Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008. Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y
+ 
+     Examples
+     --------
+     >>> from mlquantify.methods.aggregative import T50
+     >>> from mlquantify.utils.general import get_real_prev
+     >>> from sklearn.datasets import load_breast_cancer
+     >>> from sklearn.svm import SVC
+     >>> from sklearn.model_selection import train_test_split
+     >>>
+     >>> features, target = load_breast_cancer(return_X_y=True)
+     >>>
+     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
+     >>>
+     >>> t50 = T50(learner=SVC(probability=True))
+     >>> t50.fit(X_train, y_train)
+     >>> y_pred = t50.predict(X_test)
+     >>> y_pred
+     {0: 0.49563196626070505, 1: 0.504368033739295}
+     >>> get_real_prev(y_test)
+     {0: 0.3991228070175439, 1: 0.6008771929824561}
+     """
+ 
+     def __init__(self, learner: BaseEstimator=None):
+         super().__init__(learner)
+ 
+     def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
+         """
+         Determines the threshold, true positive rate (TPR), and false positive
+         rate (FPR) where TPR is closest to 0.5.
+ 
+         Parameters
+         ----------
+         thresholds : np.ndarray
+             An array of threshold values.
+         tprs : np.ndarray
+             An array of true positive rates corresponding to the thresholds.
+         fprs : np.ndarray
+             An array of false positive rates corresponding to the thresholds.
+ 
+         Returns
+         -------
+         tuple
+             A tuple containing the selected threshold, TPR, and FPR.
+ 
+         Notes
+         -----
+         - The method identifies the index where the absolute difference
+           between TPR and 0.5 is minimized.
+         - This ensures that the selected threshold represents a balance
+           point in the ROC space.
+ 
+         Raises
+         ------
+         ValueError
+             If the arrays `thresholds`, `tprs`, or `fprs` are empty or
+             misaligned in length.
+         """
+         # Find the index where TPR is closest to 0.5
+         min_index = np.argmin(np.abs(tprs - 0.5))
+ 
+         # Retrieve the corresponding threshold, TPR, and FPR
+         threshold = thresholds[min_index]
+         tpr = tprs[min_index]
+         fpr = fprs[min_index]
+ 
+         return (threshold, tpr, fpr)
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ class X_method(ThresholdOptimization):
+     """
+     Threshold X. This method identifies the decision threshold where the
+     false positive rate (FPR) is approximately equal to 1 - true positive rate (TPR).
+     This criterion is useful for identifying thresholds that align with a balance
+     point on the ROC curve.
+ 
+     Parameters
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier to be used for quantification.
+ 
+     Attributes
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier.
+     threshold : float
+         Decision threshold determined during training.
+     tpr : float
+         True positive rate corresponding to the selected threshold.
+     fpr : float
+         False positive rate corresponding to the selected threshold.
+ 
+     See Also
+     --------
+     ThresholdOptimization : Base class for threshold-based quantification methods.
+     ACC : Adjusted Classify and Count quantification method.
+     CC : Classify and Count quantification method.
+ 
+     References
+     ----------
+     FORMAN, George. Quantifying counts and costs via classification. Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008. Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y
+ 
+     Examples
+     --------
+     >>> from mlquantify.methods.aggregative import X_method
+     >>> from mlquantify.utils.general import get_real_prev
+     >>> from sklearn.datasets import load_breast_cancer
+     >>> from sklearn.svm import SVC
+     >>> from sklearn.model_selection import train_test_split
+     >>>
+     >>> features, target = load_breast_cancer(return_X_y=True)
+     >>>
+     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
+     >>>
+     >>> x_method = X_method(learner=SVC(probability=True))
+     >>> x_method.fit(X_train, y_train)
+     >>> y_pred = x_method.predict(X_test)
+     >>> y_pred
+     {0: 0.40523495782808205, 1: 0.594765042171918}
+     >>> get_real_prev(y_test)
+     {0: 0.3991228070175439, 1: 0.6008771929824561}
+     """
+ 
+     def __init__(self, learner: BaseEstimator=None):
+         super().__init__(learner)
+ 
+     def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
+         """
+         Determines the threshold, true positive rate (TPR), and false positive
+         rate (FPR) where FPR is closest to 1 - TPR.
+ 
+         Parameters
+         ----------
+         thresholds : np.ndarray
+             An array of threshold values.
+         tprs : np.ndarray
+             An array of true positive rates corresponding to the thresholds.
+         fprs : np.ndarray
+             An array of false positive rates corresponding to the thresholds.
+ 
+         Returns
+         -------
+         tuple
+             A tuple containing the selected threshold, TPR, and FPR.
+ 
+         Notes
+         -----
+         - The method identifies the index where the absolute difference
+           between FPR and 1 - TPR is minimized.
+         - This ensures that the selected threshold corresponds to a balance
+           point based on the given criterion.
+ 
+         Raises
+         ------
+         ValueError
+             If the arrays `thresholds`, `tprs`, or `fprs` are empty or
+             misaligned in length.
+         """
+         # Find the index where FPR is closest to 1 - TPR
+         min_index = np.argmin(np.abs(1 - (tprs + fprs)))
+ 
+         # Retrieve the corresponding threshold, TPR, and FPR
+         threshold = thresholds[min_index]
+         tpr = tprs[min_index]
+         fpr = fprs[min_index]
+ 
+         return (threshold, tpr, fpr)
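The docstring examples in the hunk above all share one workflow: fit on labeled training data, then predict class prevalences on a test set. A compact sketch that exercises the new quantifiers side by side, assuming only the imports and fit/predict API shown in those docstrings:

from mlquantify.methods.aggregative import ACC, MAX, MS, MS2, T50, X_method
from mlquantify.utils.general import get_real_prev
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

features, target = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Fit each threshold-based quantifier and compare its estimate with the real prevalence.
for method in (ACC, MAX, MS, MS2, T50, X_method):
    quantifier = method(learner=SVC(probability=True))
    quantifier.fit(X_train, y_train)
    print(method.__name__, quantifier.predict(X_test))

print("real", get_real_prev(y_test))

Exact numbers may differ from the docstring outputs depending on library versions and classifier settings.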