mlquantify 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. mlquantify/__init__.py +10 -29
  2. mlquantify/adjust_counting/__init__.py +24 -0
  3. mlquantify/adjust_counting/_adjustment.py +648 -0
  4. mlquantify/adjust_counting/_base.py +245 -0
  5. mlquantify/adjust_counting/_counting.py +153 -0
  6. mlquantify/adjust_counting/_utils.py +109 -0
  7. mlquantify/base.py +117 -519
  8. mlquantify/base_aggregative.py +209 -0
  9. mlquantify/calibration.py +1 -0
  10. mlquantify/confidence.py +329 -0
  11. mlquantify/likelihood/__init__.py +5 -0
  12. mlquantify/likelihood/_base.py +147 -0
  13. mlquantify/likelihood/_classes.py +430 -0
  14. mlquantify/meta/__init__.py +1 -0
  15. mlquantify/meta/_classes.py +785 -0
  16. mlquantify/metrics/__init__.py +21 -0
  17. mlquantify/metrics/_oq.py +109 -0
  18. mlquantify/metrics/_rq.py +98 -0
  19. mlquantify/{evaluation/measures.py → metrics/_slq.py} +51 -36
  20. mlquantify/mixture/__init__.py +7 -0
  21. mlquantify/mixture/_base.py +147 -0
  22. mlquantify/mixture/_classes.py +458 -0
  23. mlquantify/mixture/_utils.py +163 -0
  24. mlquantify/model_selection/__init__.py +9 -0
  25. mlquantify/model_selection/_protocol.py +358 -0
  26. mlquantify/model_selection/_search.py +315 -0
  27. mlquantify/model_selection/_split.py +1 -0
  28. mlquantify/multiclass.py +350 -0
  29. mlquantify/neighbors/__init__.py +9 -0
  30. mlquantify/neighbors/_base.py +168 -0
  31. mlquantify/neighbors/_classes.py +150 -0
  32. mlquantify/{classification/methods.py → neighbors/_classification.py} +37 -62
  33. mlquantify/neighbors/_kde.py +268 -0
  34. mlquantify/neighbors/_utils.py +131 -0
  35. mlquantify/neural/__init__.py +1 -0
  36. mlquantify/utils/__init__.py +47 -2
  37. mlquantify/utils/_artificial.py +27 -0
  38. mlquantify/utils/_constraints.py +219 -0
  39. mlquantify/utils/_context.py +21 -0
  40. mlquantify/utils/_decorators.py +36 -0
  41. mlquantify/utils/_exceptions.py +12 -0
  42. mlquantify/utils/_get_scores.py +159 -0
  43. mlquantify/utils/_load.py +18 -0
  44. mlquantify/utils/_parallel.py +6 -0
  45. mlquantify/utils/_random.py +36 -0
  46. mlquantify/utils/_sampling.py +273 -0
  47. mlquantify/utils/_tags.py +44 -0
  48. mlquantify/utils/_validation.py +447 -0
  49. mlquantify/utils/prevalence.py +64 -0
  50. {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/METADATA +2 -1
  51. mlquantify-0.1.10.dist-info/RECORD +53 -0
  52. mlquantify/classification/__init__.py +0 -1
  53. mlquantify/evaluation/__init__.py +0 -14
  54. mlquantify/evaluation/protocol.py +0 -289
  55. mlquantify/methods/__init__.py +0 -37
  56. mlquantify/methods/aggregative.py +0 -1159
  57. mlquantify/methods/meta.py +0 -472
  58. mlquantify/methods/mixture_models.py +0 -1003
  59. mlquantify/methods/non_aggregative.py +0 -136
  60. mlquantify/methods/threshold_optimization.py +0 -869
  61. mlquantify/model_selection.py +0 -377
  62. mlquantify/plots.py +0 -367
  63. mlquantify/utils/general.py +0 -371
  64. mlquantify/utils/method.py +0 -449
  65. mlquantify-0.1.8.dist-info/RECORD +0 -22
  66. {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/WHEEL +0 -0
  67. {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,648 @@
1
+ import numpy as np
2
+ from abc import abstractmethod
3
+ from scipy.optimize import minimize
4
+ import warnings
5
+
6
+ from mlquantify.adjust_counting._base import BaseAdjustCount
7
+ from mlquantify.adjust_counting._counting import CC, PCC
8
+ from mlquantify.base_aggregative import (
9
+ CrispLearnerQMixin,
10
+ SoftLearnerQMixin,
11
+ uses_soft_predictions,
12
+ )
13
+ from mlquantify.multiclass import define_binary
14
+ from mlquantify.adjust_counting._utils import evaluate_thresholds
15
+ from mlquantify.utils._constraints import Interval, Options
16
+
17
+
18
@define_binary
class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
    r"""Base class for threshold-based adjustment quantifiers.

    Parent of methods such as ACC, X, MAX, T50, MS, and MS2, which correct
    the bias of *Classify & Count (CC)* estimates — caused by differences in
    class distribution between training and test data — using the
    classifier's ROC curve, as proposed by [1]_.

    The adjusted prevalence is computed as:

    .. math::

        \hat{p} = \frac{p' - \text{FPR}}{\text{TPR} - \text{FPR}}

    where:
    - :math:`p'` is the observed positive proportion from CC,
    - :math:`\text{TPR} = P(\hat{y}=1|y=1)` is the True Positive Rate,
    - :math:`\text{FPR} = P(\hat{y}=1|y=0)` is the False Positive Rate.

    Parameters
    ----------
    learner : estimator, optional
        A supervised learning model with `fit` and `predict_proba` methods.
    threshold : float, default=0.5
        Classification threshold in [0, 1].
    strategy : {'ovr'}, default='ovr'
        Strategy used for multiclass adaptation.

    Attributes
    ----------
    learner : estimator
        The underlying classification model.
    classes : ndarray of shape (n_classes,)
        Unique class labels observed during training.

    Notes
    -----
    - Defined only for binary quantification tasks.
    - When applied to multiclass problems, the one-vs-rest strategy (`ovr`)
      is used automatically.

    Examples
    --------
    >>> from sklearn.linear_model import LogisticRegression
    >>> from mlquantify.adjust_counting import ThresholdAdjustment
    >>> import numpy as np
    >>> class CustomThreshold(ThresholdAdjustment):
    ...     def get_best_threshold(self, thresholds, tprs, fprs):
    ...         idx = np.argmax(tprs - fprs)
    ...         return thresholds[idx], tprs[idx], fprs[idx]
    >>> X = np.random.randn(100, 4)
    >>> y = np.random.randint(0, 2, 100)
    >>> q = CustomThreshold(learner=LogisticRegression())
    >>> q.fit(X, y)
    >>> q.predict(X)
    {0: 0.49, 1: 0.51}

    References
    ----------
    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate
        Classification", *Proceedings of ECML*, pp. 564-575.
    .. [2] Forman, G. (2008). "Quantifying Counts and Costs via Classification",
        *Data Mining and Knowledge Discovery*, 17(2), 164-206.
    """

    _parameter_constraints = {
        "threshold": [
            Interval(0.0, 1.0),
            Interval(0, 1, discrete=True),
        ],
    }

    def __init__(self, learner=None, threshold=0.5, strategy="ovr"):
        super().__init__(learner=learner)
        self.threshold = threshold
        self.strategy = strategy

    def _adjust(self, predictions, train_y_scores, train_y_values):
        """Correct the CC estimate at the ROC point chosen by the subclass."""
        scores_for_positive = train_y_scores[:, 1]

        candidates = evaluate_thresholds(train_y_values, scores_for_positive)
        chosen, tpr, fpr = self.get_best_threshold(*candidates)

        # Positive proportion observed by Classify & Count at the chosen threshold.
        observed = CC(chosen).aggregate(predictions)[1]

        denom = tpr - fpr
        # Degenerate ROC point (TPR == FPR): fall back to the raw CC estimate.
        positive = observed if denom == 0 else np.clip((observed - fpr) / denom, 0, 1)

        return np.asarray([1 - positive, positive])

    @abstractmethod
    def get_best_threshold(self, thresholds, tprs, fprs):
        """Select the best threshold according to the specific method."""
        ...
120
+
121
+
122
class MatrixAdjustment(BaseAdjustCount):
    r"""Base class for matrix-based quantification adjustments.

    This class implements the matrix correction model for quantification
    as formulated in Firat (2016) [1]_, which expresses the observed prevalences
    as a linear combination of true prevalences through the confusion matrix.

    The system is modeled as:

    .. math::

        \mathbf{y} = \mathbf{C}\hat{\pi}_F + \varepsilon

    subject to the constraints:

    .. math::

        \hat{\pi}_F \ge 0, \quad \sum_k \hat{\pi}_{F,k} = 1

    where:
    - :math:`\mathbf{y}` is the vector of predicted prevalences in test set,
    - :math:`\mathbf{C}` is the confusion matrix,
    - :math:`\hat{\pi}_F` is the true class prevalence vector (unknown),
    - :math:`\varepsilon` is the residual error.

    The model can be solved via:

    - **Linear algebraic solution**: uses matrix inversion
    - **Constrained optimization**: quadratic or least-squares approach


    Parameters
    ----------
    learner : estimator, optional
        Classifier with `fit` and `predict` methods.
    solver : {'optim', 'linear'}, optional
        Solver for the adjustment system:

        - `'linear'`: uses matrix inversion (e.g., GAC, GPAC)
        - `'optim'`: uses optimization (e.g., FM)

    Attributes
    ----------
    CM : ndarray of shape (n_classes, n_classes)
        Confusion matrix used for correction.
    classes : ndarray
        Class labels observed in training.


    Examples
    --------
    >>> from sklearn.linear_model import LogisticRegression
    >>> from mlquantify.adjust_counting import MatrixAdjustment
    >>> import numpy as np
    >>> class MyMatrix(MatrixAdjustment):
    ...     def _compute_confusion_matrix(self, preds, y):
    ...         cm = np.ones((2, 2))
    ...         return cm / cm.sum(axis=1, keepdims=True)
    >>> q = MyMatrix(learner=LogisticRegression(), solver='linear')
    >>> X = np.random.randn(50, 4)
    >>> y = np.random.randint(0, 2, 50)
    >>> q.fit(X, y)
    >>> q.predict(X)
    {0: 0.5, 1: 0.5}

    References
    ----------
    .. [1] Firat, A. (2016). "Unified Framework for Quantification",
        *Proceedings of AAAI Conference on Artificial Intelligence*,
        pp. 1-8.
    """


    _parameter_constraints = {"solver": Options(["optim", "linear"])}

    def __init__(self, learner=None, solver=None):
        super().__init__(learner=learner)
        self.solver = solver

    def _adjust(self, predictions, train_y_pred, train_y_values):
        """Dispatch the matrix correction to the configured solver.

        Builds the confusion matrix via the subclass hook
        ``_compute_confusion_matrix`` and solves the resulting system.
        """
        n_class = len(np.unique(train_y_values))
        # Zero-initialized; subclasses fill/normalize this in
        # ``_compute_confusion_matrix``.
        self.CM = np.zeros((n_class, n_class))

        if self.solver == 'optim':
            # Training prevalences act both as thresholds for the posterior
            # comparison below and as a fallback if optimization fails.
            priors = np.array(list(CC().aggregate(train_y_pred).values()))
            self.CM = self._compute_confusion_matrix(train_y_pred, train_y_values, priors)
            prevs_estim = self._get_estimations(predictions > priors)
            prevalence = self._solve_optimization(prevs_estim, priors)
        else:
            self.CM = self._compute_confusion_matrix(train_y_pred)
            prevs_estim = self._get_estimations(predictions)
            prevalence = self._solve_linear(prevs_estim)

        return prevalence

    def _solve_linear(self, prevs_estim):
        r"""Solve the adjustment system by matrix inversion.

        The solution is obtained by solving the linear system:

        .. math::

            \hat{\pi}_F = \mathbf{C}^{-1} \mathbf{p}

        where :math:`\mathbf{C}` is the confusion matrix and :math:`\mathbf{p}`
        is the observed prevalence vector.

        Parameters
        ----------
        prevs_estim : ndarray of shape (n_classes,)
            Observed prevalence vector from the test set.

        Returns
        -------
        ndarray of shape (n_classes,)
            Adjusted prevalence estimates, clipped to [0, 1] and renormalized;
            falls back to ``prevs_estim`` when the system is singular.
        """
        try:
            adjusted = np.linalg.solve(self.CM, prevs_estim)
            adjusted = np.clip(adjusted, 0, 1)
            adjusted /= adjusted.sum()
        except np.linalg.LinAlgError:
            adjusted = prevs_estim
        return adjusted

    def _solve_optimization(self, prevs_estim, priors):
        r"""Solve the adjustment system via constrained optimization.

        Minimizes the residual norm

        .. math::

            \min_{\hat{\pi}_F} \| \mathbf{C}\hat{\pi}_F - \mathbf{p} \|

        subject to :math:`\hat{\pi}_F \ge 0` and
        :math:`\sum_k \hat{\pi}_{F,k} = 1`, where :math:`\mathbf{C}` is the
        confusion matrix and :math:`\mathbf{p}` is the observed prevalence
        vector.

        Parameters
        ----------
        prevs_estim : ndarray of shape (n_classes,)
            Observed prevalence vector from the test set.
        priors : ndarray of shape (n_classes,)
            Training prevalences, returned as a fallback when the optimizer
            does not converge.

        Returns
        -------
        ndarray of shape (n_classes,)
            Adjusted prevalence estimates :math:`\hat{\pi}_F`.
        """
        def objective(prevs_pred):
            return np.linalg.norm(self.CM @ prevs_pred - prevs_estim)

        constraints = [
            {'type': 'eq', 'fun': lambda x: np.sum(x) - 1},
            {'type': 'ineq', 'fun': lambda x: x}
        ]
        bounds = [(0, 1)] * self.CM.shape[1]
        init = np.full(self.CM.shape[1], 1 / self.CM.shape[1])
        result = minimize(objective, init, constraints=constraints, bounds=bounds)
        return result.x if result.success else priors

    def _get_estimations(self, predictions):
        """Return prevalence estimates using CC (crisp) or PCC (probabilistic)."""
        if uses_soft_predictions(self):
            return np.array(list(PCC().aggregate(predictions).values()))
        return np.array(list(CC().aggregate(predictions).values()))

    @abstractmethod
    def _compute_confusion_matrix(self, predictions, *args):
        """Subclass hook: build/normalize the confusion matrix ``self.CM``."""
        ...
272
+
273
+
274
class FM(SoftLearnerQMixin, MatrixAdjustment):
    r"""Friedman Method for quantification adjustment.

    Implements the Friedman (2015) matrix-based quantification adjustment,
    which formulates quantification as a constrained optimization problem:
    estimated class prevalences are corrected by minimizing the difference
    between predicted and expected prevalences, subject to valid prevalence
    constraints.

    The confusion matrix is estimated per class :math:`k` by thresholding
    posterior probabilities against the prior prevalence, enabling a
    quadratic-optimization correction that remains accurate under concept
    drift.

    The method solves:

    .. math::

        \min_{\hat{\pi}_F} \| \mathbf{C} \hat{\pi}_F - \mathbf{p} \|^2

    subject to constraints:

    .. math::

        \hat{\pi}_F \geq 0, \quad \sum_k \hat{\pi}_{F,k} = 1

    where :math:`\mathbf{C}` is the confusion matrix and :math:`\mathbf{p}` is
    the vector of predicted prevalences.


    Parameters
    ----------
    learner : estimator, optional
        Base classifier with `fit` and `predict_proba` methods.
        If None, a default estimator will be used.

    Attributes
    ----------
    CM : ndarray of shape (n_classes, n_classes)
        Confusion matrix used for correction.


    Examples
    --------
    >>> from sklearn.linear_model import LogisticRegression
    >>> from mlquantify.adjust_counting import FM
    >>> import numpy as np
    >>> X = np.random.randn(50, 4)
    >>> y = np.random.randint(0, 2, 50)
    >>> fm = FM(learner=LogisticRegression())
    >>> fm.fit(X, y)
    >>> fm.predict(X)
    {0: 0.5, 1: 0.5}

    References
    ----------
    .. [1] Friedman, J. H., et al. (2015). "Detecting and Dealing with Concept Drift",
        *Proceedings of the IEEE*, 103(11), 1522-1541.
    """
    def __init__(self, learner=None):
        super().__init__(learner=learner, solver='optim')

    def _compute_confusion_matrix(self, posteriors, y_true, priors):
        """Fill column ``i`` of ``self.CM`` with estimations over class-``i`` samples."""
        for col, label in enumerate(self.classes_):
            mask = y_true == label
            # Threshold posteriors of this class's samples against the priors.
            self.CM[:, col] = self._get_estimations(posteriors[mask] > priors)
        return self.CM
340
+
341
+
342
class GAC(CrispLearnerQMixin, MatrixAdjustment):
    r"""Generalized Adjusted Count method.

    Implements the Generalized Adjusted Count (GAC) algorithm for
    quantification adjustment as described in Firat (2016) [1]_. The method
    corrects bias caused by distribution differences between training and
    test data by normalizing the confusion matrix with prevalence estimates.

    Each column of the confusion matrix is divided by the prevalence estimate
    of the corresponding class; for classes with zero estimated prevalence the
    diagonal element is set to 1 to avoid division by zero.

    Parameters
    ----------
    learner : estimator, optional
        Base classifier with `fit` and `predict` methods.

    Attributes
    ----------
    CM : ndarray of shape (n_classes, n_classes)
        Normalized confusion matrix used for adjusting predicted prevalences.
    classes_ : ndarray
        Array of class labels observed during training.


    Examples
    --------
    >>> from sklearn.linear_model import LogisticRegression
    >>> from mlquantify.adjust_counting import GAC
    >>> import numpy as np
    >>> gac = GAC(learner=LogisticRegression())
    >>> X = np.random.randn(50, 4)
    >>> y = np.random.randint(0, 2, 50)
    >>> gac.fit(X, y)
    >>> gac.predict(X)
    {0: 0.5, 1: 0.5}

    References
    ----------
    .. [1] Firat, A. (2016). "Unified Framework for Quantification",
        *Proceedings of AAAI Conference on Artificial Intelligence*, pp. 1-8.
    """
    def __init__(self, learner=None):
        super().__init__(learner=learner, solver='linear')

    def _compute_confusion_matrix(self, predictions):
        """Normalize ``self.CM`` columns by the estimated class prevalences."""
        # NOTE(review): ``MatrixAdjustment._adjust`` hands over a
        # zero-initialized ``self.CM``; this method only rescales its columns
        # and never fills in confusion counts, which leaves the matrix
        # singular and makes ``_solve_linear`` fall back to the raw
        # estimates — confirm where the counts are meant to be populated.
        estimates = self._get_estimations(predictions)
        for col in range(len(self.classes_)):
            prevalence = estimates[col]
            if prevalence == 0:
                # Avoid division by zero for absent classes.
                self.CM[col, col] = 1
            else:
                self.CM[:, col] /= prevalence
        return self.CM
400
+
401
+
402
class GPAC(SoftLearnerQMixin, MatrixAdjustment):
    r"""Probabilistic Generalized Adjusted Count (GPAC) method.

    Probabilistic extension of the Generalized Adjusted Count method as
    presented in Firat (2016) [1]_. GPAC normalizes the confusion matrix by
    prevalences estimated from posterior probabilities, enabling a
    probabilistic correction of class prevalences.

    Each column of the confusion matrix is divided by the estimated
    prevalence of the corresponding class; for classes with zero estimated
    prevalence the diagonal element is set to 1 to keep the matrix valid.

    Compared with GAC, GPAC relies on soft probabilistic predictions rather
    than crisp labels, which can improve accuracy when the posteriors are
    well calibrated.

    Parameters
    ----------
    learner : estimator, optional
        Base classifier with `fit` and `predict_proba` methods.

    Attributes
    ----------
    CM : ndarray of shape (n_classes, n_classes)
        Normalized confusion matrix used for adjustment.
    classes_ : ndarray
        Array of class labels observed during training.


    Examples
    --------
    >>> from sklearn.linear_model import LogisticRegression
    >>> from mlquantify.adjust_counting import GPAC
    >>> import numpy as np
    >>> gpac = GPAC(learner=LogisticRegression())
    >>> X = np.random.randn(50, 4)
    >>> y = np.random.randint(0, 2, 50)
    >>> gpac.fit(X, y)
    >>> gpac.predict(X)
    {0: 0.5, 1: 0.5}

    References
    ----------
    .. [1] Firat, A. (2016). "Unified Framework for Quantification",
        *Proceedings of AAAI Conference on Artificial Intelligence*, pp. 1-8.
    """
    def __init__(self, learner=None):
        super().__init__(learner=learner, solver='linear')

    def _compute_confusion_matrix(self, posteriors):
        """Normalize ``self.CM`` columns by prevalences estimated from posteriors."""
        # NOTE(review): as with GAC, ``self.CM`` arrives zero-initialized from
        # ``_adjust`` and only its columns are rescaled here; confirm where
        # the confusion counts are meant to be filled in.
        estimated = self._get_estimations(posteriors)
        for idx, _label in enumerate(self.classes_):
            if estimated[idx] == 0:
                # Absent class: keep the matrix well-defined.
                self.CM[idx, idx] = 1
            else:
                self.CM[:, idx] /= estimated[idx]
        return self.CM
459
+
460
+
461
class ACC(ThresholdAdjustment):
    r"""Adjusted Count (ACC) — baseline threshold correction.

    This method corrects the bias in class prevalence estimates caused by imperfect
    classification accuracy, by adjusting the observed positive count using estimates
    of the classifier's true positive rate (TPR) and false positive rate (FPR).

    It uses a fixed classification threshold and applies the formula:

    .. math::

        p = \frac{p' - \text{FPR}}{\text{TPR} - \text{FPR}}

    where :math:`p'` is the observed positive proportion from :class:`CC`,


    Parameters
    ----------
    learner : estimator, optional
        A supervised learning model with `fit` and `predict_proba` methods.
    threshold : float, default=0.5
        Classification threshold in [0, 1] for applying in the :class:`CC` output.

    References
    ----------
    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate Classification",
        *ECML*, pp. 564-575.
    """

    def get_best_threshold(self, thresholds, tprs, fprs):
        """Return the ROC operating point at the configured threshold.

        Picks the entry of ``thresholds`` closest to ``self.threshold``
        (the exact entry when present). The previous exact-equality lookup
        (``tprs[thresholds == self.threshold][0]``) raised ``IndexError``
        whenever the evaluated threshold grid did not contain
        ``self.threshold`` itself; nearest-match selection is robust to that
        while returning the identical point on an exact match.

        Parameters
        ----------
        thresholds, tprs, fprs : ndarray of shape (n_thresholds,)
            Candidate thresholds with their TPR/FPR values.

        Returns
        -------
        tuple of (threshold, tpr, fpr)
            The selected operating point.
        """
        idx = int(np.argmin(np.abs(np.asarray(thresholds) - self.threshold)))
        return thresholds[idx], tprs[idx], fprs[idx]
494
+
495
+
496
class X_method(ThresholdAdjustment):
    r"""X method — threshold where :math:`\text{TPR} + \text{FPR} = 1`.

    Selects the classification threshold at which the sum of the true
    positive rate (TPR) and false positive rate (FPR) equals one. This
    threshold choice balances errors in a specific way improving
    quantification.


    Parameters
    ----------
    learner : estimator, optional
        A supervised learning model with `fit` and `predict_proba` methods.
    threshold : float, default=0.5
        Classification threshold in [0, 1] for applying in the :class:`CC` output.

    References
    ----------
    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate Classification",
        *ECML*, pp. 564-575.
    """
    def get_best_threshold(self, thresholds, tprs, fprs):
        """Pick the candidate whose TPR + FPR is closest to 1."""
        distance_from_one = np.abs(1 - (tprs + fprs))
        best = np.argmin(distance_from_one)
        return thresholds[best], tprs[best], fprs[best]
519
+
520
+
521
class MAX(ThresholdAdjustment):
    r"""MAX method — threshold maximizing :math:`\text{TPR} - \text{FPR}`.

    This method selects the threshold that maximizes the difference between the true positive
    rate (TPR) and the false positive rate (FPR), effectively optimizing classification
    performance for quantification.


    Parameters
    ----------
    learner : estimator, optional
        A supervised learning model with `fit` and `predict_proba` methods.
    threshold : float, default=0.5
        Classification threshold in [0, 1] for applying in the :class:`CC` output.


    References
    ----------
    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate Classification",
        *ECML*, pp. 564-575.
    """
    def get_best_threshold(self, thresholds, tprs, fprs):
        """Select the threshold maximizing the signed margin TPR - FPR.

        Forman's MAX policy maximizes the *signed* difference TPR - FPR,
        i.e. the denominator of the adjustment formula. The previous
        ``np.argmax(np.abs(tprs - fprs))`` could select a point below the
        ROC diagonal (FPR > TPR), where the adjustment denominator is
        negative and the corrected prevalence becomes meaningless; the
        signed form — also used in the base-class docstring example —
        avoids that.
        """
        idx = np.argmax(tprs - fprs)
        return thresholds[idx], tprs[idx], fprs[idx]
545
+
546
+
547
class T50(ThresholdAdjustment):
    r"""T50 — selects threshold where :math:`\text{TPR} = 0.5`.

    Chooses the classification threshold such that the true positive rate
    (TPR) equals 0.5, avoiding regions with unreliable estimates at extreme
    thresholds.


    Parameters
    ----------
    learner : estimator, optional
        A supervised learning model with `fit` and `predict_proba` methods.
    threshold : float, default=0.5
        Classification threshold in [0, 1] for applying in the :class:`CC` output.


    References
    ----------
    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate Classification",
        *ECML*, pp. 564-575.
    """
    def get_best_threshold(self, thresholds, tprs, fprs):
        """Pick the candidate whose TPR is closest to 0.5."""
        gap_to_half = np.abs(tprs - 0.5)
        pick = np.argmin(gap_to_half)
        return thresholds[pick], tprs[pick], fprs[pick]
570
+
571
+
572
class MS(ThresholdAdjustment):
    r"""Median Sweep (MS) — median prevalence estimate across all thresholds.

    This method computes class prevalence estimates at multiple classification thresholds,
    using the adjusted count formula for each, then returns the median of these estimates,
    reducing variance caused by any single threshold selection.

    It thus leverages the strengths of bootstrap-like variance reduction without heavy
    computation.


    Parameters
    ----------
    learner : estimator, optional
        A supervised learning model with `fit` and `predict_proba` methods.
    threshold : float, default=0.5
        Classification threshold in [0, 1] for applying in the :class:`CC` output.


    References
    ----------
    .. [1] Forman, G. (2008). "Quantifying Counts and Costs via Classification",
        *Data Mining and Knowledge Discovery*, 17(2), 164-206.
    """
    def _adjust(self, predictions, train_y_scores, train_y_values):
        """Median of adjusted-count estimates over the retained thresholds."""
        positive_scores = train_y_scores[:, 1]

        thresholds, tprs, fprs = evaluate_thresholds(train_y_values, positive_scores)
        thresholds, tprs, fprs = self.get_best_threshold(thresholds, tprs, fprs)

        prevs = []
        for thr, tpr, fpr in zip(thresholds, tprs, fprs):
            cc_predictions = CC(thr).aggregate(predictions)[1]
            if tpr - fpr == 0:
                # Degenerate ROC point: keep the raw CC estimate.
                estimate = cc_predictions
            else:
                # Clip each per-threshold estimate to [0, 1], matching
                # ThresholdAdjustment._adjust; without clipping, extreme
                # thresholds can push the median outside the valid
                # prevalence range.
                estimate = np.clip((cc_predictions - fpr) / (tpr - fpr), 0, 1)
            prevs.append(estimate)
        prevalence = np.median(prevs)
        return np.asarray([1 - prevalence, prevalence])

    def get_best_threshold(self, thresholds, tprs, fprs):
        """Keep every threshold; subclasses may filter (see :class:`MS2`)."""
        return thresholds, tprs, fprs
613
+
614
+
615
class MS2(MS):
    r"""MS2 — Median Sweep variant constraining :math:`|\text{TPR} - \text{FPR}| > 0.25`.

    Variant of Median Sweep that excludes thresholds where the absolute
    difference between true positive rate (TPR) and false positive rate
    (FPR) is below 0.25, improving stability by avoiding ambiguous
    threshold regions.


    Parameters
    ----------
    learner : estimator, optional
        A supervised learning model with `fit` and `predict_proba` methods.
    threshold : float, default=0.5
        Classification threshold in [0, 1] for applying in the :class:`CC` output.


    Warnings
    --------
    - Warns if all TPR or FPR values are zero.
    - Warns if no thresholds satisfy the constraint.

    References
    ----------
    .. [1] Forman, G. (2008). "Quantifying Counts and Costs via Classification",
        *Data Mining and Knowledge Discovery*, 17(2), 164-206.
    """
    def get_best_threshold(self, thresholds, tprs, fprs):
        """Keep thresholds with |TPR - FPR| > 0.25, falling back to all."""
        if np.all(tprs == 0) or np.all(fprs == 0):
            warnings.warn("All TPR or FPR values are zero.")
        margin = np.abs(tprs - fprs)
        keep = np.flatnonzero(margin > 0.25)
        if keep.size == 0:
            warnings.warn("No cases satisfy |TPR - FPR| > 0.25.")
            # Fallback: retain every threshold (margin >= 0 always holds).
            keep = np.flatnonzero(margin >= 0)
        return thresholds[keep], tprs[keep], fprs[keep]