mlquantify 0.0.11__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. mlquantify/__init__.py +32 -6
  2. mlquantify/base.py +559 -256
  3. mlquantify/classification/__init__.py +1 -1
  4. mlquantify/classification/methods.py +160 -0
  5. mlquantify/evaluation/__init__.py +14 -2
  6. mlquantify/evaluation/measures.py +215 -0
  7. mlquantify/evaluation/protocol.py +647 -0
  8. mlquantify/methods/__init__.py +37 -40
  9. mlquantify/methods/aggregative.py +1030 -0
  10. mlquantify/methods/meta.py +472 -0
  11. mlquantify/methods/mixture_models.py +1003 -0
  12. mlquantify/methods/non_aggregative.py +136 -0
  13. mlquantify/methods/threshold_optimization.py +959 -0
  14. mlquantify/model_selection.py +377 -232
  15. mlquantify/plots.py +367 -0
  16. mlquantify/utils/__init__.py +2 -2
  17. mlquantify/utils/general.py +334 -0
  18. mlquantify/utils/method.py +449 -0
  19. {mlquantify-0.0.11.dist-info → mlquantify-0.1.0.dist-info}/METADATA +137 -126
  20. mlquantify-0.1.0.dist-info/RECORD +22 -0
  21. {mlquantify-0.0.11.dist-info → mlquantify-0.1.0.dist-info}/WHEEL +1 -1
  22. mlquantify/classification/pwkclf.py +0 -73
  23. mlquantify/evaluation/measures/__init__.py +0 -26
  24. mlquantify/evaluation/measures/ae.py +0 -11
  25. mlquantify/evaluation/measures/bias.py +0 -16
  26. mlquantify/evaluation/measures/kld.py +0 -8
  27. mlquantify/evaluation/measures/mse.py +0 -12
  28. mlquantify/evaluation/measures/nae.py +0 -16
  29. mlquantify/evaluation/measures/nkld.py +0 -13
  30. mlquantify/evaluation/measures/nrae.py +0 -16
  31. mlquantify/evaluation/measures/rae.py +0 -12
  32. mlquantify/evaluation/measures/se.py +0 -12
  33. mlquantify/evaluation/protocol/_Protocol.py +0 -202
  34. mlquantify/evaluation/protocol/__init__.py +0 -2
  35. mlquantify/evaluation/protocol/app.py +0 -146
  36. mlquantify/evaluation/protocol/npp.py +0 -34
  37. mlquantify/methods/aggregative/ThreholdOptm/_ThreholdOptimization.py +0 -62
  38. mlquantify/methods/aggregative/ThreholdOptm/__init__.py +0 -7
  39. mlquantify/methods/aggregative/ThreholdOptm/acc.py +0 -27
  40. mlquantify/methods/aggregative/ThreholdOptm/max.py +0 -23
  41. mlquantify/methods/aggregative/ThreholdOptm/ms.py +0 -21
  42. mlquantify/methods/aggregative/ThreholdOptm/ms2.py +0 -25
  43. mlquantify/methods/aggregative/ThreholdOptm/pacc.py +0 -41
  44. mlquantify/methods/aggregative/ThreholdOptm/t50.py +0 -21
  45. mlquantify/methods/aggregative/ThreholdOptm/x.py +0 -23
  46. mlquantify/methods/aggregative/__init__.py +0 -9
  47. mlquantify/methods/aggregative/cc.py +0 -32
  48. mlquantify/methods/aggregative/emq.py +0 -86
  49. mlquantify/methods/aggregative/fm.py +0 -72
  50. mlquantify/methods/aggregative/gac.py +0 -96
  51. mlquantify/methods/aggregative/gpac.py +0 -87
  52. mlquantify/methods/aggregative/mixtureModels/_MixtureModel.py +0 -81
  53. mlquantify/methods/aggregative/mixtureModels/__init__.py +0 -5
  54. mlquantify/methods/aggregative/mixtureModels/dys.py +0 -55
  55. mlquantify/methods/aggregative/mixtureModels/dys_syn.py +0 -89
  56. mlquantify/methods/aggregative/mixtureModels/hdy.py +0 -46
  57. mlquantify/methods/aggregative/mixtureModels/smm.py +0 -27
  58. mlquantify/methods/aggregative/mixtureModels/sord.py +0 -77
  59. mlquantify/methods/aggregative/pcc.py +0 -33
  60. mlquantify/methods/aggregative/pwk.py +0 -38
  61. mlquantify/methods/meta/__init__.py +0 -1
  62. mlquantify/methods/meta/ensemble.py +0 -236
  63. mlquantify/methods/non_aggregative/__init__.py +0 -1
  64. mlquantify/methods/non_aggregative/hdx.py +0 -71
  65. mlquantify/plots/__init__.py +0 -2
  66. mlquantify/plots/distribution_plot.py +0 -109
  67. mlquantify/plots/protocol_plot.py +0 -157
  68. mlquantify/utils/general_purposes/__init__.py +0 -8
  69. mlquantify/utils/general_purposes/convert_col_to_array.py +0 -13
  70. mlquantify/utils/general_purposes/generate_artificial_indexes.py +0 -29
  71. mlquantify/utils/general_purposes/get_real_prev.py +0 -9
  72. mlquantify/utils/general_purposes/load_quantifier.py +0 -4
  73. mlquantify/utils/general_purposes/make_prevs.py +0 -23
  74. mlquantify/utils/general_purposes/normalize.py +0 -20
  75. mlquantify/utils/general_purposes/parallel.py +0 -10
  76. mlquantify/utils/general_purposes/round_protocol_df.py +0 -14
  77. mlquantify/utils/method_purposes/__init__.py +0 -6
  78. mlquantify/utils/method_purposes/distances.py +0 -21
  79. mlquantify/utils/method_purposes/getHist.py +0 -13
  80. mlquantify/utils/method_purposes/get_scores.py +0 -33
  81. mlquantify/utils/method_purposes/moss.py +0 -16
  82. mlquantify/utils/method_purposes/ternary_search.py +0 -14
  83. mlquantify/utils/method_purposes/tprfpr.py +0 -42
  84. mlquantify-0.0.11.dist-info/RECORD +0 -73
  85. {mlquantify-0.0.11.dist-info → mlquantify-0.1.0.dist-info}/top_level.txt +0 -0
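Most of the file churn comes from flattening the package layout: per-method files under nested packages (e.g. ThreholdOptm, mixtureModels, general_purposes) are removed and consolidated into single modules. A minimal sketch of the resulting import paths, inferred from the file list above and the docstring examples in the diff below (illustrative, not an official migration guide):

# Inferred 0.1.0 import paths (illustrative; based on the docstring examples below).
from mlquantify.methods.threshold_optimization import ThresholdOptimization  # base class, new flat module
from mlquantify.methods.aggregative import ACC, MAX, MS, MS2, PACC, T50, X_method
from mlquantify.utils.general import get_real_prev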
mlquantify/methods/threshold_optimization.py
@@ -0,0 +1,959 @@
+ from abc import abstractmethod
+ import numpy as np
+ from sklearn.base import BaseEstimator
+
+ from ..base import AggregativeQuantifier
+ from ..utils.method import adjust_threshold, get_scores
+ import mlquantify as mq
+
+
+
+
+ class ThresholdOptimization(AggregativeQuantifier):
+     """
+     Generic class for methods that adjust the decision boundary of the underlying classifier
+     to make the ACC (base method for threshold methods) estimation more numerically stable.
+     Most strategies involve altering the denominator of the ACC equation.
+
+     This class serves as a base for implementing threshold optimization techniques in classification
+     tasks. It is designed to adjust thresholds based on true positive and false positive rates,
+     ensuring better quantification performance.
+
+     Parameters
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier to be used for threshold optimization.
+     threshold : float, optional
+         The threshold value to be used for classification decisions. Default is 0.5.
+
+     Attributes
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier.
+     threshold : float, optional
+         The optimized threshold used for classification decisions.
+     cc_output : float, optional
+         The classification count output, representing the proportion of instances classified
+         as positive based on the threshold.
+     tpr : float, optional
+         The true positive rate corresponding to the best threshold.
+     fpr : float, optional
+         The false positive rate corresponding to the best threshold.
+
+     Notes
+     -----
+     All methods that inherit from this class are binary quantifiers. For multiclass problems, a One-vs-All strategy is applied.
+
+     Examples
+     --------
+     >>> from mlquantify.methods.threshold_optimization import ThresholdOptimization
+     >>> from mlquantify.utils.general import get_real_prev
+     >>> from sklearn.datasets import load_breast_cancer
+     >>> from sklearn.svm import SVC
+     >>> from sklearn.model_selection import train_test_split
+     >>>
+     >>> class MyThrMethod(ThresholdOptimization):
+     ...     def __init__(self, learner, threshold=0.5):
+     ...         super().__init__(learner)
+     ...         self.threshold = threshold
+     ...     def best_tprfpr(self, thresholds, tpr, fpr):
+     ...         return thresholds[20], tpr[20], fpr[20]
+     >>>
+     >>> features, target = load_breast_cancer(return_X_y=True)
+     >>>
+     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
+     >>>
+     >>> mtm = MyThrMethod(learner=SVC(probability=True), threshold=0.5)
+     >>> mtm.fit(X_train, y_train)
+     >>> y_pred = mtm.predict(X_test)
+     """
+
+     def __init__(self, learner: BaseEstimator=None):
+         self.learner = learner
+         self.threshold = None
+         self.cc_output = None
+         self.tpr = None
+         self.fpr = None
+
+     @property
+     def is_probabilistic(self) -> bool:
+         """
+         Returns whether the method is probabilistic.
+
+         This method is used to determine whether the quantification method is probabilistic,
+         meaning it uses class-conditional probabilities to estimate class prevalences.
+
+         Returns
+         -------
+         bool
+             True, indicating that this method is probabilistic.
+         """
+         return True
+
+     @property
+     def is_multiclass(self) -> bool:
+         """
+         Returns whether the method is applicable to multiclass quantification.
+
+         Threshold-based methods are typically binary classifiers, so this method
+         returns False.
+
+         Returns
+         -------
+         bool
+             False, indicating that this method does not support multiclass quantification.
+         """
+         return False
+
+     def _fit_method(self, X, y):
+         """
+         Fits the classifier and adjusts thresholds based on true positive rate (TPR) and false positive rate (FPR).
+
+         Parameters
+         ----------
+         X : pd.DataFrame or np.ndarray
+             The input features for training.
+         y : pd.Series or np.ndarray
+             The target labels for training.
+
+         Returns
+         -------
+         self : ThresholdOptimization
+             The fitted quantifier object with the best threshold, TPR, and FPR.
+         """
+         # Get predicted labels and probabilities
+         if mq.arguments["y_labels"] is not None and mq.arguments["posteriors_train"] is not None:
+             y_labels = mq.arguments["y_labels"]
+             probabilities = mq.arguments["posteriors_train"]
+         else:
+             y_labels, probabilities = get_scores(X, y, self.learner, self.cv_folds, self.learner_fitted)
+
+         # Adjust thresholds and compute true and false positive rates
+         thresholds, tprs, fprs = adjust_threshold(y_labels, probabilities[:, 1], self.classes)
+
+         # Find the best threshold based on TPR and FPR
+         self.threshold, self.tpr, self.fpr = self.best_tprfpr(thresholds, tprs, fprs)
+
+         return self
+
+     def _predict_method(self, X) -> dict:
+         """
+         Predicts class prevalences using the adjusted threshold.
+
+         Parameters
+         ----------
+         X : pd.DataFrame or np.ndarray
+             The input features for prediction.
+
+         Returns
+         -------
+         np.ndarray
+             An array of predicted prevalences for the classes.
+         """
+         # Get predicted probabilities for the positive class
+         probabilities = self.predict_learner(X)[:, 1]
+
+         # Compute the classification count output based on the threshold
+         self.cc_output = len(probabilities[probabilities >= self.threshold]) / len(probabilities)
+
+         # Calculate prevalence, ensuring it is within [0, 1]
+         if self.tpr - self.fpr == 0:
+             prevalence = self.cc_output
+         else:
+             # Equation of threshold methods to compute prevalence
+             prevalence = np.clip((self.cc_output - self.fpr) / (self.tpr - self.fpr), 0, 1)
+
+         prevalences = [1 - prevalence, prevalence]
+
+         return np.asarray(prevalences)
+
+     @abstractmethod
+     def best_tprfpr(self, thresholds: np.ndarray, tpr: np.ndarray, fpr: np.ndarray) -> float:
+         """
+         Abstract method for determining the best TPR (True Positive Rate) and FPR (False Positive Rate)
+         to use in the equation for threshold optimization.
+
+         This method needs to be implemented by subclasses to define how the best threshold
+         is chosen based on TPR and FPR.
+
+         Parameters
+         ----------
+         thresholds : np.ndarray
+             An array of threshold values.
+         tpr : np.ndarray
+             An array of true positive rates corresponding to the thresholds.
+         fpr : np.ndarray
+             An array of false positive rates corresponding to the thresholds.
+
+         Returns
+         -------
+         float
+             The best threshold value determined based on the true positive and false positive rates.
+         """
+         ...
+
+
+
+
+
+
+ class ACC(ThresholdOptimization):
+     """
+     Adjusted Classify and Count (ACC). This method is a base approach for threshold-based
+     quantification methods.
+
+     As described in the ThresholdOptimization base class, this method estimates the true
+     positive rate (TPR) and false positive rate (FPR) from the training data. It then uses
+     these values to adjust the output of the Classify and Count (CC) method, making the
+     quantification process more accurate and stable.
+
+     Parameters
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier to be used for quantification.
+     threshold : float, optional
+         The decision threshold for classifying instances. Default is 0.5.
+
+     Attributes
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier.
+     threshold : float
+         The decision threshold used to classify instances as positive or negative. Default is 0.5.
+
+     See Also
+     --------
+     ThresholdOptimization : Base class for threshold-based quantification methods.
+     CC : Classify and Count quantification method.
+
+     References
+     ----------
+     FORMAN, George. Quantifying counts and costs via classification. Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008. Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y
+
+     Examples
+     --------
+     >>> from mlquantify.methods.aggregative import ACC
+     >>> from mlquantify.utils.general import get_real_prev
+     >>> from sklearn.datasets import load_breast_cancer
+     >>> from sklearn.svm import SVC
+     >>> from sklearn.model_selection import train_test_split
+     >>>
+     >>> features, target = load_breast_cancer(return_X_y=True)
+     >>>
+     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
+     >>>
+     >>> acc = ACC(learner=SVC(probability=True), threshold=0.5)
+     >>> acc.fit(X_train, y_train)
+     >>> y_pred = acc.predict(X_test)
+     >>> y_pred
+     {0: 0.3968506555196656, 1: 0.6031493444803344}
+     >>> get_real_prev(y_test)
+     {0: 0.3991228070175439, 1: 0.6008771929824561}
+     """
+
+     def __init__(self, learner: BaseEstimator=None, threshold: float = 0.5):
+         super().__init__(learner)
+         self.threshold = threshold
+
+     def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
+         """
+         Determines the true positive rate (TPR) and false positive rate (FPR) for the specified threshold.
+
+         This method identifies the TPR and FPR corresponding to the threshold provided
+         during initialization. It assumes that the `thresholds`, `tprs`, and `fprs` arrays
+         are aligned, meaning the `i-th` element of each array corresponds to the same threshold.
+
+         Parameters
+         ----------
+         thresholds : np.ndarray
+             An array of threshold values.
+         tprs : np.ndarray
+             An array of true positive rates corresponding to the thresholds.
+         fprs : np.ndarray
+             An array of false positive rates corresponding to the thresholds.
+
+         Returns
+         -------
+         tuple
+             A tuple containing the threshold, the true positive rate (TPR), and the false
+             positive rate (FPR) for the specified threshold.
+
+         Raises
+         ------
+         IndexError
+             If the specified threshold is not found in the `thresholds` array.
+         """
+         # Get the TPR and FPR where the threshold matches the specified value
+         tpr = tprs[thresholds == self.threshold][0]
+         fpr = fprs[thresholds == self.threshold][0]
+         return (self.threshold, tpr, fpr)
+
+
+
+
+
+
+
+
+
+ class MAX(ThresholdOptimization):
+     """
+     Threshold MAX. This quantification method selects the threshold that maximizes
+     the absolute difference between the true positive rate (TPR) and false positive
+     rate (FPR). This threshold is then used in the denominator of the equation for
+     adjusted prevalence estimation.
+
+     Parameters
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier to be used for quantification.
+
+     Attributes
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier.
+
+     See Also
+     --------
+     ThresholdOptimization : Base class for threshold-based quantification methods.
+     ACC : Adjusted Classify and Count quantification method.
+     CC : Classify and Count quantification method.
+
+     References
+     ----------
+     FORMAN, George. Counting positives accurately despite inaccurate classification. In: European conference on machine learning. Berlin, Heidelberg: Springer Berlin Heidelberg, 2005. p. 564-575. Available at: https://link.springer.com/chapter/10.1007/11564096_56
+
+     Examples
+     --------
+     >>> from mlquantify.methods.aggregative import MAX
+     >>> from mlquantify.utils.general import get_real_prev
+     >>> from sklearn.datasets import load_breast_cancer
+     >>> from sklearn.svm import SVC
+     >>> from sklearn.model_selection import train_test_split
+     >>>
+     >>> features, target = load_breast_cancer(return_X_y=True)
+     >>>
+     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
+     >>>
+     >>> maxq = MAX(learner=SVC(probability=True))
+     >>> maxq.fit(X_train, y_train)
+     >>> y_pred = maxq.predict(X_test)
+     >>> y_pred
+     {0: 0.3920664352842359, 1: 0.6079335647157641}
+     >>> get_real_prev(y_test)
+     {0: 0.3991228070175439, 1: 0.6008771929824561}
+     """
+
+     def __init__(self, learner: BaseEstimator=None):
+         super().__init__(learner)
+
+     def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
+         """
+         Determines the optimal threshold by maximizing the absolute difference between
+         the true positive rate (TPR) and the false positive rate (FPR).
+
+         This method identifies the index where `|TPR - FPR|` is maximized and retrieves
+         the corresponding threshold, TPR, and FPR.
+
+         Parameters
+         ----------
+         thresholds : np.ndarray
+             An array of threshold values.
+         tprs : np.ndarray
+             An array of true positive rates corresponding to the thresholds.
+         fprs : np.ndarray
+             An array of false positive rates corresponding to the thresholds.
+
+         Returns
+         -------
+         tuple
+             A tuple containing:
+             - The threshold that maximizes `|TPR - FPR|`.
+             - The true positive rate (TPR) at the selected threshold.
+             - The false positive rate (FPR) at the selected threshold.
+
+         Raises
+         ------
+         ValueError
+             If `thresholds`, `tprs`, or `fprs` are empty or have mismatched lengths.
+         """
+         max_index = np.argmax(np.abs(tprs - fprs))
+
+         # Retrieve the corresponding threshold, TPR, and FPR
+         threshold = thresholds[max_index]
+         tpr = tprs[max_index]
+         fpr = fprs[max_index]
+         return (threshold, tpr, fpr)
+
+
+
+
+
+
+
+
+
+ class MS(ThresholdOptimization):
+     """
+     Median Sweep (MS). This quantification method uses an ensemble
+     of threshold-based methods, taking the median values of the
+     true positive rate (TPR) and false positive rate (FPR) across
+     all thresholds to compute adjusted prevalences.
+
+     Parameters
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier to be used for quantification.
+     threshold : float, optional
+         The default threshold value to use for the quantification method. Default is 0.5.
+
+     Attributes
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier.
+     threshold : float
+         The default threshold to use for the quantification method, typically 0.5.
+
+     See Also
+     --------
+     ThresholdOptimization : Base class for threshold-based quantification methods.
+     ACC : Adjusted Classify and Count quantification method.
+     MAX : Threshold MAX quantification method.
+     CC : Classify and Count quantification method.
+
+     References
+     ----------
+     FORMAN, George. Quantifying counts and costs via classification. Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008. Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y
+
+     Examples
+     --------
+     >>> from mlquantify.methods.aggregative import MS
+     >>> from mlquantify.utils.general import get_real_prev
+     >>> from sklearn.datasets import load_breast_cancer
+     >>> from sklearn.svm import SVC
+     >>> from sklearn.model_selection import train_test_split
+     >>>
+     >>> features, target = load_breast_cancer(return_X_y=True)
+     >>>
+     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
+     >>>
+     >>> ms = MS(learner=SVC(probability=True))
+     >>> ms.fit(X_train, y_train)
+     >>> y_pred = ms.predict(X_test)
+     >>> y_pred
+     {0: 0.41287676595138967, 1: 0.5871232340486103}
+     >>> get_real_prev(y_test)
+     {0: 0.3991228070175439, 1: 0.6008771929824561}
+     """
+
+     def __init__(self, learner: BaseEstimator=None, threshold: float = 0.5):
+         super().__init__(learner)
+         self.threshold = threshold
+
+     def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
+         """
+         Determines the optimal TPR and FPR by taking the median of
+         all TPR and FPR values across the given thresholds.
+
+         This method computes the median values of TPR and FPR to
+         mitigate the influence of outliers and variability in the
+         performance metrics.
+
+         Parameters
+         ----------
+         thresholds : np.ndarray
+             An array of threshold values.
+         tprs : np.ndarray
+             An array of true positive rates corresponding to the thresholds.
+         fprs : np.ndarray
+             An array of false positive rates corresponding to the thresholds.
+
+         Returns
+         -------
+         tuple
+             A tuple containing:
+             - The default threshold value (float).
+             - The median true positive rate (float).
+             - The median false positive rate (float).
+
+         Raises
+         ------
+         ValueError
+             If `thresholds`, `tprs`, or `fprs` are empty or have mismatched lengths.
+         """
+         # Compute median TPR and FPR
+         tpr = np.median(tprs)
+         fpr = np.median(fprs)
+
+         return (self.threshold, tpr, fpr)
+
+
+
+
+
+
+
+
+ class MS2(ThresholdOptimization):
+     """
+     Median Sweep 2 (MS2). This method is an extension of the
+     Median Sweep strategy, but it focuses only on cases where
+     the difference between the true positive rate (TPR) and the
+     false positive rate (FPR) exceeds a threshold (0.25). The
+     method computes the median values of TPR, FPR, and thresholds
+     for these selected cases.
+
+     Parameters
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier to be used for quantification.
+
+     Attributes
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier.
+
+     References
+     ----------
+     FORMAN, George. Quantifying counts and costs via classification. Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008. Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y
+
+     See Also
+     --------
+     ThresholdOptimization : Base class for threshold-based quantification methods.
+     ACC : Adjusted Classify and Count quantification method.
+     MS : Median Sweep quantification method.
+     CC : Classify and Count quantification method.
+
+     Examples
+     --------
+     >>> from mlquantify.methods.aggregative import MS2
+     >>> from mlquantify.utils.general import get_real_prev
+     >>> from sklearn.datasets import load_breast_cancer
+     >>> from sklearn.svm import SVC
+     >>> from sklearn.model_selection import train_test_split
+     >>>
+     >>> features, target = load_breast_cancer(return_X_y=True)
+     >>>
+     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
+     >>>
+     >>> ms2 = MS2(learner=SVC(probability=True))
+     >>> ms2.fit(X_train, y_train)
+     >>> y_pred = ms2.predict(X_test)
+     >>> y_pred
+     {0: 0.41287676595138967, 1: 0.5871232340486103}
+     >>> get_real_prev(y_test)
+     {0: 0.3991228070175439, 1: 0.6008771929824561}
+     """
+
+     def __init__(self, learner: BaseEstimator=None):
+         super().__init__(learner)
+
+     def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
+         """
+         Determines the optimal threshold, TPR, and FPR by focusing only on
+         cases where the absolute difference between TPR and FPR is greater
+         than 0.25. For these cases, the method computes the median values.
+
+         Parameters
+         ----------
+         thresholds : np.ndarray
+             An array of threshold values.
+         tprs : np.ndarray
+             An array of true positive rates corresponding to the thresholds.
+         fprs : np.ndarray
+             An array of false positive rates corresponding to the thresholds.
+
+         Returns
+         -------
+         tuple
+             A tuple containing:
+             - The median threshold value for cases meeting the condition (float).
+             - The median true positive rate for cases meeting the condition (float).
+             - The median false positive rate for cases meeting the condition (float).
+
+         Raises
+         ------
+         ValueError
+             If no cases satisfy the condition `|TPR - FPR| > 0.25` or if the
+             input arrays are empty or have mismatched lengths.
+         """
+         # Identify indices where the condition is satisfied
+         indices = np.where(np.abs(tprs - fprs) > 0.25)[0]
+         if len(indices) == 0:
+             raise ValueError("No cases meet the condition |TPR - FPR| > 0.25.")
+
+         # Compute medians for the selected cases
+         threshold = np.median(thresholds[indices])
+         tpr = np.median(tprs[indices])
+         fpr = np.median(fprs[indices])
+
+         return (threshold, tpr, fpr)
+
+
+
+
+
+
+
+
+
+
+ class PACC(ThresholdOptimization):
+     """
+     Probabilistic Adjusted Classify and Count (PACC).
+     This method extends the Adjusted Classify and Count (ACC) approach
+     by leveraging the average class-conditional confidences obtained
+     from a probabilistic classifier instead of relying solely on true
+     positive and false positive rates.
+
+     Parameters
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier to be used for quantification.
+     threshold : float, optional
+         The decision threshold for classification. Default is 0.5.
+
+     Attributes
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier.
+     threshold : float
+         Decision threshold for classification. Default is 0.5.
+     tpr : float
+         True positive rate computed during the fitting process.
+     fpr : float
+         False positive rate computed during the fitting process.
+
+     See Also
+     --------
+     ThresholdOptimization : Base class for threshold-based quantification methods.
+     ACC : Adjusted Classify and Count quantification method.
+     CC : Classify and Count quantification method.
+
+     References
+     ----------
+     A. Bella, C. Ferri, J. Hernández-Orallo and M. J. Ramírez-Quintana, "Quantification via Probability Estimators," 2010 IEEE International Conference on Data Mining, Sydney, NSW, Australia, 2010, pp. 737-742, doi: 10.1109/ICDM.2010.75. Available at: https://ieeexplore.ieee.org/abstract/document/5694031
+
+     Examples
+     --------
+     >>> from mlquantify.methods.aggregative import PACC
+     >>> from mlquantify.utils.general import get_real_prev
+     >>> from sklearn.datasets import load_breast_cancer
+     >>> from sklearn.svm import SVC
+     >>> from sklearn.model_selection import train_test_split
+     >>>
+     >>> features, target = load_breast_cancer(return_X_y=True)
+     >>>
+     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
+     >>>
+     >>> pacc = PACC(learner=SVC(probability=True))
+     >>> pacc.fit(X_train, y_train)
+     >>> y_pred = pacc.predict(X_test)
+     >>> y_pred
+     {0: 0.4664886119311328, 1: 0.5335113880688672}
+     >>> get_real_prev(y_test)
+     {0: 0.3991228070175439, 1: 0.6008771929824561}
+     """
+
+     def __init__(self, learner: BaseEstimator=None, threshold: float = 0.5):
+         super().__init__(learner)
+         self.threshold = threshold
+
+     def _predict_method(self, X):
+         """
+         Predicts the class prevalence using the mean class-conditional
+         probabilities from a probabilistic classifier.
+
+         Parameters
+         ----------
+         X : array-like or sparse matrix of shape (n_samples, n_features)
+             The input data for prediction.
+
+         Returns
+         -------
+         dict
+             A dictionary with class labels as keys and their respective
+             prevalence estimates as values.
+
+         Notes
+         -----
+         The prevalence is adjusted using the formula:
+         prevalence = |mean_score - FPR| / (TPR - FPR),
+         where mean_score is the average probability for the positive class.
+
+         Raises
+         ------
+         ZeroDivisionError
+             If `TPR - FPR` equals zero, indicating that the classifier's
+             performance does not vary across the threshold range.
+         """
+         prevalences = {}
+
+         # Calculate probabilities for the positive class
+         probabilities = self.predict_learner(X)[:, 1]
+
+         # Compute the mean score for the positive class
+         mean_scores = np.mean(probabilities)
+
+         # Adjust prevalence based on TPR and FPR
+         if self.tpr - self.fpr == 0:
+             prevalence = mean_scores
+         else:
+             prevalence = np.clip(abs(mean_scores - self.fpr) / (self.tpr - self.fpr), 0, 1)
+
+         # Map the computed prevalence to the class labels
+         prevalences[self.classes[0]] = 1 - prevalence
+         prevalences[self.classes[1]] = prevalence
+
+         return prevalences
+
+     def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
+         """
+         Finds the true positive rate (TPR) and false positive rate (FPR)
+         corresponding to the specified decision threshold.
+
+         Parameters
+         ----------
+         thresholds : np.ndarray
+             An array of threshold values.
+         tprs : np.ndarray
+             An array of true positive rates corresponding to the thresholds.
+         fprs : np.ndarray
+             An array of false positive rates corresponding to the thresholds.
+
+         Returns
+         -------
+         tuple
+             A tuple containing the specified threshold, TPR, and FPR.
+
+         Raises
+         ------
+         IndexError
+             If the specified threshold is not found in the `thresholds` array.
+         """
+         # Locate TPR and FPR for the specified threshold
+         tpr = tprs[thresholds == self.threshold][0]
+         fpr = fprs[thresholds == self.threshold][0]
+         return (self.threshold, tpr, fpr)
+
+
+
+
+
+
+
+
+
+
+
+
+ class T50(ThresholdOptimization):
+     """
+     Threshold 50 (T50). This method adjusts the decision threshold
+     to the point where the true positive rate (TPR) is approximately
+     equal to 0.5. This approach is particularly useful for balancing
+     sensitivity and specificity in binary classification tasks.
+
+     Parameters
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier to be used for quantification.
+
+     Attributes
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier.
+     threshold : float
+         Decision threshold determined during training.
+     tpr : float
+         True positive rate corresponding to the selected threshold.
+     fpr : float
+         False positive rate corresponding to the selected threshold.
+
+     See Also
+     --------
+     ThresholdOptimization : Base class for threshold-based quantification methods.
+     ACC : Adjusted Classify and Count quantification method.
+     CC : Classify and Count quantification method.
+
+     References
+     ----------
+     FORMAN, George. Quantifying counts and costs via classification. Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008. Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y
+
+     Examples
+     --------
+     >>> from mlquantify.methods.aggregative import T50
+     >>> from mlquantify.utils.general import get_real_prev
+     >>> from sklearn.datasets import load_breast_cancer
+     >>> from sklearn.svm import SVC
+     >>> from sklearn.model_selection import train_test_split
+     >>>
+     >>> features, target = load_breast_cancer(return_X_y=True)
+     >>>
+     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
+     >>>
+     >>> t50 = T50(learner=SVC(probability=True))
+     >>> t50.fit(X_train, y_train)
+     >>> y_pred = t50.predict(X_test)
+     >>> y_pred
+     {0: 0.49563196626070505, 1: 0.504368033739295}
+     >>> get_real_prev(y_test)
+     {0: 0.3991228070175439, 1: 0.6008771929824561}
+     """
+
+     def __init__(self, learner: BaseEstimator=None):
+         super().__init__(learner)
+
+     def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
+         """
+         Determines the threshold, true positive rate (TPR), and false positive
+         rate (FPR) where TPR is closest to 0.5.
+
+         Parameters
+         ----------
+         thresholds : np.ndarray
+             An array of threshold values.
+         tprs : np.ndarray
+             An array of true positive rates corresponding to the thresholds.
+         fprs : np.ndarray
+             An array of false positive rates corresponding to the thresholds.
+
+         Returns
+         -------
+         tuple
+             A tuple containing the selected threshold, TPR, and FPR.
+
+         Notes
+         -----
+         - The method identifies the index where the absolute difference
+           between TPR and 0.5 is minimized.
+         - This ensures that the selected threshold represents a balance
+           point in the ROC space.
+
+         Raises
+         ------
+         ValueError
+             If the arrays `thresholds`, `tprs`, or `fprs` are empty or
+             misaligned in length.
+         """
+         # Find the index where TPR is closest to 0.5
+         min_index = np.argmin(np.abs(tprs - 0.5))
+
+         # Retrieve the corresponding threshold, TPR, and FPR
+         threshold = thresholds[min_index]
+         tpr = tprs[min_index]
+         fpr = fprs[min_index]
+
+         return (threshold, tpr, fpr)
+
+
+
+
+
+
+
+
+
+
+ class X_method(ThresholdOptimization):
+     """
+     Threshold X. This method identifies the decision threshold where the
+     false positive rate (FPR) is approximately equal to 1 - true positive rate (TPR).
+     This criterion is useful for identifying thresholds that align with a balance
+     point on the ROC curve.
+
+     Parameters
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier to be used for quantification.
+
+     Attributes
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier.
+     threshold : float
+         Decision threshold determined during training.
+     tpr : float
+         True positive rate corresponding to the selected threshold.
+     fpr : float
+         False positive rate corresponding to the selected threshold.
+
+     See Also
+     --------
+     ThresholdOptimization : Base class for threshold-based quantification methods.
+     ACC : Adjusted Classify and Count quantification method.
+     CC : Classify and Count quantification method.
+
+     References
+     ----------
+     FORMAN, George. Quantifying counts and costs via classification. Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008. Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y
+
+     Examples
+     --------
+     >>> from mlquantify.methods.aggregative import X_method
+     >>> from mlquantify.utils.general import get_real_prev
+     >>> from sklearn.datasets import load_breast_cancer
+     >>> from sklearn.svm import SVC
+     >>> from sklearn.model_selection import train_test_split
+     >>>
+     >>> features, target = load_breast_cancer(return_X_y=True)
+     >>>
+     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
+     >>>
+     >>> x_method = X_method(learner=SVC(probability=True))
+     >>> x_method.fit(X_train, y_train)
+     >>> y_pred = x_method.predict(X_test)
+     >>> y_pred
+     {0: 0.40523495782808205, 1: 0.594765042171918}
+     >>> get_real_prev(y_test)
+     {0: 0.3991228070175439, 1: 0.6008771929824561}
+     """
+
+     def __init__(self, learner: BaseEstimator=None):
+         super().__init__(learner)
+
+     def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
+         """
+         Determines the threshold, true positive rate (TPR), and false positive
+         rate (FPR) where FPR is closest to 1 - TPR.
+
+         Parameters
+         ----------
+         thresholds : np.ndarray
+             An array of threshold values.
+         tprs : np.ndarray
+             An array of true positive rates corresponding to the thresholds.
+         fprs : np.ndarray
+             An array of false positive rates corresponding to the thresholds.
+
+         Returns
+         -------
+         tuple
+             A tuple containing the selected threshold, TPR, and FPR.
+
+         Notes
+         -----
+         - The method identifies the index where the absolute difference
+           between FPR and 1 - TPR is minimized.
+         - This ensures that the selected threshold corresponds to a balance
+           point based on the given criterion.
+
+         Raises
+         ------
+         ValueError
+             If the arrays `thresholds`, `tprs`, or `fprs` are empty or
+             misaligned in length.
+         """
+         # Find the index where FPR is closest to 1 - TPR
+         min_index = np.argmin(np.abs(1 - (tprs + fprs)))
+
+         # Retrieve the corresponding threshold, TPR, and FPR
+         threshold = thresholds[min_index]
+         tpr = tprs[min_index]
+         fpr = fprs[min_index]
+
+         return (threshold, tpr, fpr)
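
For reference, a minimal end-to-end sketch assembled from the docstring examples in this module (breast-cancer data, probabilistic SVC). It is illustrative only; the printed prevalence estimates will vary with the classifier and the train/test split.

# Illustrative usage sketch, assembled from the docstring examples above.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

from mlquantify.methods.aggregative import ACC, MAX, MS, MS2, PACC, T50, X_method
from mlquantify.utils.general import get_real_prev

features, target = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Fit each threshold-based quantifier and compare its estimate with the real test prevalence.
for method in (ACC, MAX, MS, MS2, PACC, T50, X_method):
    quantifier = method(learner=SVC(probability=True))
    quantifier.fit(X_train, y_train)
    print(method.__name__, quantifier.predict(X_test), get_real_prev(y_test))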