mlquantify 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mlquantify/__init__.py CHANGED
@@ -1,3 +1,13 @@
1
1
  "mlquantify, a Python package for quantification"
2
2
 
3
-
3
+ from . import neighbors
4
+ from . import likelihood
5
+ from . import mixture
6
+ from . import meta
7
+ from . import adjust_counting
8
+ from . import model_selection
9
+ from . import base_aggregative
10
+ from . import base
11
+ from . import calibration
12
+ from . import confidence
13
+ from . import multiclass
@@ -1,4 +1,7 @@
1
- from ._counting import CC, PCC
1
+ from ._counting import (
2
+ CC,
3
+ PCC
4
+ )
2
5
  from ._adjustment import (
3
6
  ThresholdAdjustment,
4
7
  MatrixAdjustment,
@@ -11,4 +14,11 @@ from ._adjustment import (
11
14
  T50,
12
15
  MS,
13
16
  MS2,
17
+ )
18
+
19
+ from ._utils import (
20
+ compute_table,
21
+ compute_fpr,
22
+ compute_tpr,
23
+ evaluate_thresholds,
14
24
  )
@@ -17,38 +17,26 @@ from mlquantify.utils._constraints import Interval, Options
17
17
 
18
18
  @define_binary
19
19
  class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
20
- r"""
21
- Applies threshold-based adjustment methods for quantification.
20
+ r"""Base Class for Threshold-based adjustment methods for quantification.
22
21
 
23
22
  This is the base class for methods such as ACC, X, MAX, T50, MS, and MS2,
24
- which adjust prevalence estimates based on the classifiers ROC curve, as proposed by
25
- Forman (2005, 2008).
23
+ which adjust prevalence estimates based on the classifier's ROC curve,
24
+ as proposed by [1]_.
26
25
 
27
- These methods correct the bias in *Classify & Count (CC)* estimates caused by differences
28
- in class distributions between the training and test datasets.
29
-
30
- Mathematical formulation
31
-
32
- Given:
33
- - \( p' \): observed positive proportion from CC,
34
- - \( \text{TPR} = P(\hat{y}=1|y=1) \),
35
- - \( \text{FPR} = P(\hat{y}=1|y=0) \),
36
-
37
- the adjusted prevalence is given by:
38
-
39
- \[
40
- \hat{p} = \frac{p' - \text{FPR}}{\text{TPR} - \text{FPR}}
41
- \]
26
+ These methods correct the bias in *Classify & Count (CC)* estimates caused
27
+ by differences in class distributions between the training and test datasets.
28
+
29
+ The adjusted prevalence is calculated using the following formula:
42
30
 
43
- (Forman, *Counting Positives Accurately Despite Inaccurate Classification*, ECML 2005;
44
- *Quantifying Counts and Costs via Classification*, DMKD 2008).
31
+ .. math::
45
32
 
33
+ \hat{p} = \frac{p' - \text{FPR}}{\text{TPR} - \text{FPR}}
46
34
 
47
- Notes
48
- -----
49
- - Defined only for binary quantification tasks.
50
- - When applied to multiclass problems, the one-vs-rest strategy (`ovr`) is used automatically.
51
-
35
+ where:
36
+ - :math:`p'` is the observed positive proportion from CC,
37
+ - :math:`\text{TPR} = P(\hat{y}=1|y=1)` is the True Positive Rate,
38
+ - :math:`\text{FPR} = P(\hat{y}=1|y=0)` is the False Positive Rate.
39
+
52
40
 
53
41
  Parameters
54
42
  ----------
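As a quick numeric illustration of the adjustment formula above (an editorial sketch, not code from the package), the correction is a single expression; the clip to [0, 1] mirrors the clipping used by the adjust-count classes later in this diff:

    import numpy as np

    def adjusted_count(p_cc, tpr, fpr):
        """Forman-style adjusted count: correct the raw CC proportion p_cc
        using TPR and FPR estimated on training data."""
        denom = tpr - fpr
        if np.isclose(denom, 0.0):
            return p_cc  # no reliable correction possible
        return float(np.clip((p_cc - fpr) / denom, 0.0, 1.0))

    # e.g. p' = 0.40 observed, TPR = 0.80, FPR = 0.10 -> (0.40 - 0.10) / 0.70 ~= 0.43
    print(adjusted_count(0.40, 0.80, 0.10))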
@@ -59,7 +47,6 @@ class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
59
47
  strategy : {'ovr'}, default='ovr'
60
48
  Strategy used for multiclass adaptation.
61
49
 
62
-
63
50
  Attributes
64
51
  ----------
65
52
  learner : estimator
@@ -67,6 +54,12 @@ class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
67
54
  classes : ndarray of shape (n_classes,)
68
55
  Unique class labels observed during training.
69
56
 
57
+ Notes
58
+ -----
59
+ - Defined only for binary quantification tasks.
60
+ - When applied to multiclass problems, the one-vs-rest strategy (`ovr`)
61
+ is used automatically.
62
+
70
63
 
71
64
  Examples
72
65
  --------
@@ -74,7 +67,7 @@ class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
74
67
  >>> from mlquantify.adjust_counting import ThresholdAdjustment
75
68
  >>> import numpy as np
76
69
  >>> class CustomThreshold(ThresholdAdjustment):
77
- ... def _get_best_threshold(self, thresholds, tprs, fprs):
70
+ ... def get_best_threshold(self, thresholds, tprs, fprs):
78
71
  ... idx = np.argmax(tprs - fprs)
79
72
  ... return thresholds[idx], tprs[idx], fprs[idx]
80
73
  >>> X = np.random.randn(100, 4)
@@ -83,6 +76,13 @@ class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
83
76
  >>> q.fit(X, y)
84
77
  >>> q.predict(X)
85
78
  {0: 0.49, 1: 0.51}
79
+
80
+ References
81
+ ----------
82
+ .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate
83
+ Classification", *Proceedings of ECML*, pp. 564-575.
84
+ .. [2] Forman, G. (2008). "Quantifying Counts and Costs via Classification",
85
+ *Data Mining and Knowledge Discovery*, 17(2), 164-206.
86
86
  """
87
87
 
88
88
  _parameter_constraints = {
@@ -101,8 +101,8 @@ class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
101
101
  """Internal adjustment computation based on selected ROC threshold."""
102
102
  positive_scores = train_y_scores[:, 1]
103
103
 
104
- thresholds, tprs, fprs = evaluate_thresholds(train_y_values, positive_scores, self.classes_)
105
- threshold, tpr, fpr = self._get_best_threshold(thresholds, tprs, fprs)
104
+ thresholds, tprs, fprs = evaluate_thresholds(train_y_values, positive_scores)
105
+ threshold, tpr, fpr = self.get_best_threshold(thresholds, tprs, fprs)
106
106
 
107
107
  cc_predictions = CC(threshold).aggregate(predictions)[1]
108
108
 
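For readers unfamiliar with the helper, the following sketch shows what a threshold sweep of this kind typically computes: TPR and FPR of `scores >= t` over a grid of thresholds. It assumes posterior scores in [0, 1] and a positive class labelled 1, and is illustrative only; it is not mlquantify's `evaluate_thresholds` implementation:

    import numpy as np

    def evaluate_thresholds_sketch(y_true, pos_scores, n_points=101):
        """Illustrative only: TPR/FPR of `pos_scores >= t` for a grid of
        thresholds t, with the positive class taken to be label 1."""
        thresholds = np.linspace(0.0, 1.0, n_points)
        pos, neg = (y_true == 1), (y_true == 0)
        tprs = np.array([(pos_scores[pos] >= t).mean() for t in thresholds])
        fprs = np.array([(pos_scores[neg] >= t).mean() for t in thresholds])
        return thresholds, tprs, fprs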
@@ -114,42 +114,40 @@ class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
114
114
  return np.asarray([1 - prevalence, prevalence])
115
115
 
116
116
  @abstractmethod
117
- def _get_best_threshold(self, thresholds, tprs, fprs):
117
+ def get_best_threshold(self, thresholds, tprs, fprs):
118
118
  """Select the best threshold according to the specific method."""
119
119
  ...
120
120
 
121
121
 
122
122
  class MatrixAdjustment(BaseAdjustCount):
123
- r"""
124
- Base class for matrix-based quantification adjustments (FM, GAC, GPAC).
123
+ r"""Base class for matrix-based quantification adjustments.
125
124
 
126
125
  This class implements the matrix correction model for quantification
127
- as formulated in Firat (2016), which expresses the observed prevalences as
128
- a linear combination of true prevalences through the confusion matrix.
126
+ as formulated in Firat (2016) [1]_, which expresses the observed prevalences
127
+ as a linear combination of true prevalences through the confusion matrix.
129
128
 
130
- Mathematical model
129
+ The system is modeled as:
131
130
 
132
- The system is given by:
131
+ .. math::
133
132
 
134
- \[
135
- \mathbf{y} = \mathbf{C}\hat{\pi}_F + \varepsilon
136
- \]
137
-
138
- subject to:
139
-
140
- \[
141
- \hat{\pi}_F \ge 0, \quad \sum_k \hat{\pi}_{F,k} = 1
142
- \]
133
+ \mathbf{y} = \mathbf{C}\hat{\pi}_F + \varepsilon
134
+
135
+ subject to the constraints:
136
+
137
+ .. math::
138
+
139
+ \hat{\pi}_F \ge 0, \quad \sum_k \hat{\pi}_{F,k} = 1
143
140
 
144
141
  where:
145
- - \( \mathbf{y} \): vector of predicted prevalences in test set,
146
- - \( \mathbf{C} \): confusion matrix,
147
- - \( \hat{\pi}_F \): true class prevalence vector (unknown),
148
- - \( \varepsilon \): residual error.
142
+ - :math:`\mathbf{y}` is the vector of predicted prevalences in test set,
143
+ - :math:`\mathbf{C}` is the confusion matrix,
144
+ - :math:`\hat{\pi}_F` is the true class prevalence vector (unknown),
145
+ - :math:`\varepsilon` is the residual error.
146
+
147
+ The model can be solved via:
149
148
 
150
- The model can be solved either via:
151
- - Linear algebraic solution, or
152
- - Constrained optimization (quadratic or least-squares).
149
+ - **Linear algebraic solution**: uses matrix inversion
150
+ - **Constrained optimization**: quadratic or least-squares approach
153
151
 
154
152
 
155
153
  Parameters
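A small worked example of the system above (illustrative numbers, not from the package): a classifier with TPR = 0.9 and FPR = 0.2 distorts the true prevalences [0.3, 0.7] into the observed [0.31, 0.69], and solving the linear system recovers them:

    import numpy as np

    # Columns of C are P(prediction | true class); class 1 is the positive class.
    C = np.array([[0.8, 0.1],    # P(pred=0 | true=0)=1-FPR, P(pred=0 | true=1)=1-TPR
                  [0.2, 0.9]])   # P(pred=1 | true=0)=FPR,   P(pred=1 | true=1)=TPR
    pi_true = np.array([0.3, 0.7])   # true test prevalences (unknown in practice)
    y = C @ pi_true                  # what Classify & Count would observe: [0.31, 0.69]
    pi_hat = np.linalg.solve(C, y)   # recovers [0.3, 0.7]
    print(y, pi_hat)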
@@ -158,10 +156,10 @@ class MatrixAdjustment(BaseAdjustCount):
158
156
  Classifier with `fit` and `predict` methods.
159
157
  solver : {'optim', 'linear'}, optional
160
158
  Solver for the adjustment system:
159
+
161
160
  - `'linear'`: uses matrix inversion (e.g., GAC, GPAC)
162
161
  - `'optim'`: uses optimization (e.g., FM)
163
162
 
164
-
165
163
  Attributes
166
164
  ----------
167
165
  CM : ndarray of shape (n_classes, n_classes)
@@ -170,15 +168,11 @@ class MatrixAdjustment(BaseAdjustCount):
170
168
  Class labels observed in training.
171
169
 
172
170
 
173
- References
174
- ----------
175
- - Firat, A. (2016). *Unified Framework for Quantification.* AAAI, pp. 1-8.
176
-
177
-
178
171
  Examples
179
172
  --------
180
173
  >>> from sklearn.linear_model import LogisticRegression
181
174
  >>> from mlquantify.adjust_counting import MatrixAdjustment
175
+ >>> import numpy as np
182
176
  >>> class MyMatrix(MatrixAdjustment):
183
177
  ... def _compute_confusion_matrix(self, preds, y):
184
178
  ... cm = np.ones((2, 2))
@@ -189,8 +183,15 @@ class MatrixAdjustment(BaseAdjustCount):
189
183
  >>> q.fit(X, y)
190
184
  >>> q.predict(X)
191
185
  {0: 0.5, 1: 0.5}
186
+
187
+ References
188
+ ----------
189
+ .. [1] Firat, A. (2016). "Unified Framework for Quantification",
190
+ *Proceedings of AAAI Conference on Artificial Intelligence*,
191
+ pp. 1-8.
192
192
  """
193
193
 
194
+
194
195
  _parameter_constraints = {"solver": Options(["optim", "linear"])}
195
196
 
196
197
  def __init__(self, learner=None, solver=None):
@@ -215,11 +216,7 @@ class MatrixAdjustment(BaseAdjustCount):
215
216
 
216
217
  def _solve_linear(self, prevs_estim):
217
218
  r"""
218
- Solve the system linearly:
219
-
220
- \[
221
- \hat{\pi}_F = \mathbf{C}^{-1} \mathbf{p}
222
- \]
219
+ Solve the system using matrix inversion.
223
220
  """
224
221
  try:
225
222
  adjusted = np.linalg.solve(self.CM, prevs_estim)
@@ -230,13 +227,26 @@ class MatrixAdjustment(BaseAdjustCount):
230
227
  return adjusted
231
228
 
232
229
  def _solve_optimization(self, prevs_estim, priors):
233
- r"""
234
- Solve via constrained least squares:
230
+ r"""Solve the system linearly.
231
+
232
+ The solution is obtained by matrix inversion:
235
233
 
236
- \[
237
- \min_{\hat{\pi}_F} \| \mathbf{C}\hat{\pi}_F - \mathbf{p} \|_2^2
238
- \quad \text{s.t. } \hat{\pi}_F \ge 0, \ \sum_k \hat{\pi}_{F,k} = 1
239
- \]
234
+ .. math::
235
+
236
+ \min_{\hat{\pi}_F} \| \mathbf{C}\hat{\pi}_F - \mathbf{p} \|_2^2
+ \quad \text{s.t. } \hat{\pi}_F \ge 0, \ \sum_k \hat{\pi}_{F,k} = 1
237
+
238
+ where :math:`\mathbf{C}` is the confusion matrix and :math:`\mathbf{p}`
239
+ is the observed prevalence vector.
240
+
241
+ Parameters
242
+ ----------
243
+ prevs_estim : ndarray of shape (n_classes,)
244
+ Observed prevalence vector from test set.
245
+
246
+ Returns
247
+ -------
248
+ ndarray of shape (n_classes,)
249
+ Adjusted prevalence estimates :math:`\hat{\pi}_F`.
240
250
  """
241
251
  def objective(prevs_pred):
242
252
  return np.linalg.norm(self.CM @ prevs_pred - prevs_estim)
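The hunk above shows only the objective; one plausible way to finish the constrained minimization, using SciPy's SLSQP with the simplex constraints from the docstring, is sketched below. The helper name and the use of `priors` as the starting point are assumptions for illustration, not the package's actual code:

    import numpy as np
    from scipy.optimize import minimize

    def solve_constrained(CM, prevs_estim, priors):
        """Sketch: minimize ||CM @ p - prevs_estim|| over the probability simplex,
        starting (for illustration) from the training priors."""
        objective = lambda p: np.linalg.norm(CM @ p - prevs_estim)
        n = len(prevs_estim)
        result = minimize(
            objective,
            x0=np.asarray(priors, dtype=float),
            bounds=[(0.0, 1.0)] * n,
            constraints=[{"type": "eq", "fun": lambda p: p.sum() - 1.0}],
        )
        return result.x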
@@ -262,7 +272,63 @@ class MatrixAdjustment(BaseAdjustCount):
262
272
 
263
273
 
264
274
  class FM(SoftLearnerQMixin, MatrixAdjustment):
265
- """Forman's Matrix Adjustment (FM) — solved via optimization."""
275
+ r"""Friedman Method for quantification adjustment.
276
+
277
+ This class implements the Friedman (2015) matrix-based quantification adjustment,
+ which formulates quantification as a constrained optimization problem: the estimated
+ class prevalences are adjusted by minimizing the difference between predicted and
+ expected prevalences, subject to valid prevalence constraints.
278
+
279
+ The confusion matrix is computed by applying estimated posterior probabilities
280
+ over true labels, enabling accurate correction of prevalence estimates under
281
+ concept drift.
282
+
283
+ Concretely, each entry of the confusion matrix is estimated by thresholding the
+ posterior probability of a class against its prior (training) prevalence, as
+ described in the FM algorithm, and the resulting system is solved by quadratic
+ optimization.
287
+
288
+ The method solves:
289
+
290
+ .. math::
291
+
292
+ \min_{\hat{\pi}_F} \| \mathbf{C} \hat{\pi}_F - \mathbf{p} \|^2
293
+
294
+ subject to constraints:
295
+
296
+ .. math::
297
+
298
+ \hat{\pi}_F \geq 0, \quad \sum_k \hat{\pi}_{F,k} = 1
299
+
300
+ where :math:`\mathbf{C}` is the confusion matrix and :math:`\mathbf{p}` is the
301
+ vector of predicted prevalences.
302
+
303
+
304
+ Parameters
305
+ ----------
306
+ learner : estimator, optional
307
+ Base classifier with `fit` and `predict_proba` methods.
308
+ If None, a default estimator will be used.
309
+
310
+ Attributes
311
+ ----------
312
+ CM : ndarray of shape (n_classes, n_classes)
313
+ Confusion matrix used for correction.
314
+
315
+
316
+ Examples
317
+ --------
318
+ >>> from mlquantify.adjust_counting import FM
319
+ >>> import numpy as np
+ >>> from sklearn.linear_model import LogisticRegression
320
+ >>> X = np.random.randn(50, 4)
321
+ >>> y = np.random.randint(0, 2, 50)
322
+ >>> fm = FM(learner=LogisticRegression())
323
+ >>> fm.fit(X, y)
324
+ >>> fm.predict(X)
325
+ {0: 0.5, 1: 0.5}
326
+
327
+ References
328
+ ----------
329
+ .. [1] Friedman, J. H., et al. (2015). "Detecting and Dealing with Concept Drift",
330
+ *Proceedings of the IEEE*, 103(11), 1522-1541.
331
+ """
266
332
  def __init__(self, learner=None):
267
333
  super().__init__(learner=learner, solver='optim')
268
334
 
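An illustrative reading of the confusion-matrix construction described above (function name and layout are assumptions; this is not the package's implementation): entry (i, j) is the fraction of training samples of true class j whose posterior for class i exceeds that class's training prevalence:

    import numpy as np

    def fm_confusion_matrix_sketch(posteriors, y_true, classes, priors):
        """Illustrative only: CM[i, j] = mean over training samples of true class
        classes[j] of the indicator (posterior for class i > prior of class i)."""
        n = len(classes)
        CM = np.zeros((n, n))
        for j, cj in enumerate(classes):
            mask = (y_true == cj)
            for i in range(n):
                CM[i, j] = np.mean(posteriors[mask, i] > priors[i])
        return CM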
@@ -274,7 +340,52 @@ class FM(SoftLearnerQMixin, MatrixAdjustment):
274
340
 
275
341
 
276
342
  class GAC(CrispLearnerQMixin, MatrixAdjustment):
277
- """Gonzalez-Castro’s Generalized Adjusted Count (GAC) method."""
343
+ r"""Generalized Adjusted Count method.
344
+
345
+ This class implements the Generalized Adjusted Count (GAC) algorithm for
346
+ quantification adjustment as described in Firat (2016) [1]_. The method
347
+ adjusts the estimated class prevalences by normalizing the confusion matrix
348
+ based on prevalence estimates, providing a correction for bias caused by
349
+ distribution differences between training and test data.
350
+
351
+ The confusion matrix is normalized by dividing each column by the prevalence
352
+ estimate of the corresponding class. For classes with zero estimated prevalence,
353
+ the diagonal element is set to 1 to avoid division by zero.
354
+
355
+ This normalization ensures that the matrix best reflects the classifier's
356
+ behavior relative to the estimated class distributions, improving quantification
357
+ accuracy.
358
+
359
+ Parameters
360
+ ----------
361
+ learner : estimator, optional
362
+ Base classifier with `fit` and `predict` methods.
363
+
364
+ Attributes
365
+ ----------
366
+ CM : ndarray of shape (n_classes, n_classes)
367
+ Normalized confusion matrix used for adjusting predicted prevalences.
368
+ classes_ : ndarray
369
+ Array of class labels observed during training.
370
+
371
+
372
+ Examples
373
+ --------
374
+ >>> from sklearn.linear_model import LogisticRegression
375
+ >>> from mlquantify.adjust_counting import GAC
376
+ >>> import numpy as np
377
+ >>> gac = GAC(learner=LogisticRegression())
378
+ >>> X = np.random.randn(50, 4)
379
+ >>> y = np.random.randint(0, 2, 50)
380
+ >>> gac.fit(X, y)
381
+ >>> gac.predict(X)
382
+ {0: 0.5, 1: 0.5}
383
+
384
+ References
385
+ ----------
386
+ .. [1] Firat, A. (2016). "Unified Framework for Quantification",
387
+ *Proceedings of AAAI Conference on Artificial Intelligence*, pp. 1-8.
388
+ """
278
389
  def __init__(self, learner=None):
279
390
  super().__init__(learner=learner, solver='linear')
280
391
 
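A minimal sketch of the column normalization described above, assuming a raw confusion-count matrix and per-class prevalence estimates (illustrative, not the package's code):

    import numpy as np

    def normalize_columns_sketch(counts, class_prevalences):
        """Illustrative only: divide each column of a raw confusion-count matrix by
        the prevalence estimate of its class; if a class has zero estimated
        prevalence, set its diagonal entry to 1 to keep the system well-defined."""
        CM = counts.astype(float)
        for k, p in enumerate(class_prevalences):
            if p == 0:
                CM[k, k] = 1.0
            else:
                CM[:, k] /= p
        return CM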
@@ -289,7 +400,51 @@ class GAC(CrispLearnerQMixin, MatrixAdjustment):
289
400
 
290
401
 
291
402
  class GPAC(SoftLearnerQMixin, MatrixAdjustment):
292
- """Probabilistic GAC (GPAC) — soft version using posterior probabilities."""
403
+ r"""Probabilistic Generalized Adjusted Count (GPAC) method.
404
+
405
+ This class implements the probabilistic extension of the Generalized Adjusted Count method
406
+ as presented in Firat (2016) [1]_. The GPAC method normalizes the confusion matrix by
407
+ the estimated prevalences from posterior probabilities, enabling a probabilistic correction
408
+ of class prevalences.
409
+
410
+ The normalization divides each column of the confusion matrix by the estimated prevalence
411
+ of the corresponding class. If a class has zero estimated prevalence, the diagonal element
412
+ for that class is set to 1 to maintain matrix validity.
413
+
414
+ GPAC extends the GAC approach by using soft probabilistic predictions (posterior probabilities)
415
+ rather than crisp class labels, potentially improving quantification accuracy when
416
+ posterior probabilities are well calibrated.
417
+
418
+ Parameters
419
+ ----------
420
+ learner : estimator, optional
421
+ Base classifier with `fit` and `predict_proba` methods.
422
+
423
+ Attributes
424
+ ----------
425
+ CM : ndarray of shape (n_classes, n_classes)
426
+ Normalized confusion matrix used for adjustment.
427
+ classes_ : ndarray
428
+ Array of class labels observed during training.
429
+
430
+
431
+ Examples
432
+ --------
433
+ >>> from sklearn.linear_model import LogisticRegression
434
+ >>> from mlquantify.adjust_counting import GPAC
435
+ >>> import numpy as np
436
+ >>> gpac = GPAC(learner=LogisticRegression())
437
+ >>> X = np.random.randn(50, 4)
438
+ >>> y = np.random.randint(0, 2, 50)
439
+ >>> gpac.fit(X, y)
440
+ >>> gpac.predict(X)
441
+ {0: 0.5, 1: 0.5}
442
+
443
+ References
444
+ ----------
445
+ .. [1] Firat, A. (2016). "Unified Framework for Quantification",
446
+ *Proceedings of AAAI Conference on Artificial Intelligence*, pp. 1-8.
447
+ """
293
448
  def __init__(self, learner=None):
294
449
  super().__init__(learner=learner, solver='linear')
295
450
 
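For contrast with GAC, a sketch of the soft aggregation GPAC relies on (illustrative only): each column collects the average posterior vector of the training samples of that true class, before the same prevalence normalization is applied:

    import numpy as np

    def soft_confusion_matrix_sketch(posteriors, y_true, classes):
        """Illustrative only: column j is the average posterior vector of the
        training samples whose true class is classes[j]."""
        n = len(classes)
        CM = np.zeros((n, n))
        for j, cj in enumerate(classes):
            CM[:, j] = posteriors[y_true == cj].mean(axis=0)
        return CM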
@@ -304,41 +459,145 @@ class GPAC(SoftLearnerQMixin, MatrixAdjustment):
304
459
 
305
460
 
306
461
  class ACC(ThresholdAdjustment):
307
- """Adjusted Count (ACC) — baseline threshold correction."""
308
- def _get_best_threshold(self, thresholds, tprs, fprs):
462
+ r"""Adjusted Count (ACC) — baseline threshold correction.
463
+
464
+ This method corrects the bias in class prevalence estimates caused by imperfect
465
+ classification accuracy, by adjusting the observed positive count using estimates
466
+ of the classifier's true positive rate (TPR) and false positive rate (FPR).
467
+
468
+ It uses a fixed classification threshold and applies the formula:
469
+
470
+ .. math::
471
+
472
+ p = \frac{p' - \text{FPR}}{\text{TPR} - \text{FPR}}
473
+
474
+ where :math:`p'` is the observed positive proportion from :class:`CC`, and TPR and
+ FPR are estimated on the training data at the chosen threshold.
475
+
476
+
477
+ Parameters
478
+ ----------
479
+ learner : estimator, optional
480
+ A supervised learning model with `fit` and `predict_proba` methods.
481
+ threshold : float, default=0.5
482
+ Classification threshold in [0, 1] applied to the :class:`CC` output.
483
+
484
+ References
485
+ ----------
486
+ .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate Classification",
487
+ *ECML*, pp. 564-575.
488
+ """
489
+
490
+ def get_best_threshold(self, thresholds, tprs, fprs):
309
491
  tpr = tprs[thresholds == self.threshold][0]
310
492
  fpr = fprs[thresholds == self.threshold][0]
311
493
  return (self.threshold, tpr, fpr)
312
494
 
313
495
 
314
496
  class X_method(ThresholdAdjustment):
315
- """X method — threshold where \( \text{TPR} + \text{FPR} = 1 \)."""
316
- def _get_best_threshold(self, thresholds, tprs, fprs):
497
+ r"""X method — threshold where :math:`\text{TPR} + \text{FPR} = 1`.
498
+
499
+ This method selects the classification threshold at which the sum of the true positive
500
+ rate (TPR) and false positive rate (FPR) equals one, i.e. where the false positive
+ rate equals the false negative rate. This choice avoids the extremes of the score
+ range, where estimates of TPR and FPR tend to be unreliable.
502
+
503
+
504
+ Parameters
505
+ ----------
506
+ learner : estimator, optional
507
+ A supervised learning model with `fit` and `predict_proba` methods.
508
+ threshold : float, default=0.5
509
+ Classification threshold in [0, 1] applied to the :class:`CC` output.
510
+
511
+ References
512
+ ----------
513
+ .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate Classification",
514
+ *ECML*, pp. 564-575.
515
+ """
516
+ def get_best_threshold(self, thresholds, tprs, fprs):
317
517
  idx = np.argmin(np.abs(1 - (tprs + fprs)))
318
518
  return thresholds[idx], tprs[idx], fprs[idx]
319
519
 
320
520
 
321
521
  class MAX(ThresholdAdjustment):
322
- r"""MAX method — threshold maximizing \( \text{TPR} - \text{FPR} \)."""
323
- def _get_best_threshold(self, thresholds, tprs, fprs):
522
+ r"""MAX method — threshold maximizing :math:`\text{TPR} - \text{FPR}`.
523
+
524
+ This method selects the threshold that maximizes the difference between the true positive
525
+ rate (TPR) and the false positive rate (FPR). This maximizes the denominator
+ TPR - FPR of the adjusted-count formula, making the corrected estimate less
+ sensitive to errors in the estimated rates.
527
+
528
+
529
+ Parameters
530
+ ----------
531
+ learner : estimator, optional
532
+ A supervised learning model with `fit` and `predict_proba` methods.
533
+ threshold : float, default=0.5
534
+ Classification threshold in [0, 1] applied to the :class:`CC` output.
535
+
536
+
537
+ References
538
+ ----------
539
+ .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate Classification",
540
+ *ECML*, pp. 564-575.
541
+ """
542
+ def get_best_threshold(self, thresholds, tprs, fprs):
324
543
  idx = np.argmax(np.abs(tprs - fprs))
325
544
  return thresholds[idx], tprs[idx], fprs[idx]
326
545
 
327
546
 
328
547
  class T50(ThresholdAdjustment):
329
- r"""T50 — selects threshold where \( \text{TPR} = 0.5 \)."""
330
- def _get_best_threshold(self, thresholds, tprs, fprs):
548
+ r"""T50 — selects threshold where :math:`\text{TPR} = 0.5`.
549
+
550
+ This method chooses the classification threshold such that the true positive rate (TPR)
551
+ equals 0.5, avoiding regions with unreliable estimates at extreme thresholds.
552
+
553
+
554
+ Parameters
555
+ ----------
556
+ learner : estimator, optional
557
+ A supervised learning model with `fit` and `predict_proba` methods.
558
+ threshold : float, default=0.5
559
+ Classification threshold in [0, 1] applied to the :class:`CC` output.
560
+
561
+
562
+ References
563
+ ----------
564
+ .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate Classification",
565
+ *ECML*, pp. 564-575.
566
+ """
567
+ def get_best_threshold(self, thresholds, tprs, fprs):
331
568
  idx = np.argmin(np.abs(tprs - 0.5))
332
569
  return thresholds[idx], tprs[idx], fprs[idx]
333
570
 
334
571
 
335
572
  class MS(ThresholdAdjustment):
336
- r"""Median Sweep (MS) — median prevalence across all thresholds."""
573
+ r"""Median Sweep (MS) — median prevalence estimate across all thresholds.
574
+
575
+ This method computes class prevalence estimates at multiple classification thresholds,
576
+ using the adjusted count formula for each, then returns the median of these estimates,
577
+ reducing variance caused by any single threshold selection.
578
+
579
+ Aggregating with the median makes the estimate robust to any single poorly chosen
+ threshold, at little additional computational cost.
581
+
582
+
583
+ Parameters
584
+ ----------
585
+ learner : estimator, optional
586
+ A supervised learning model with `fit` and `predict_proba` methods.
587
+ threshold : float, default=0.5
588
+ Classification threshold in [0, 1] applied to the :class:`CC` output.
589
+
590
+
591
+ References
592
+ ----------
593
+ .. [1] Forman, G. (2008). "Quantifying Counts and Costs via Classification",
594
+ *Data Mining and Knowledge Discovery*, 17(2), 164-206.
595
+ """
337
596
  def _adjust(self, predictions, train_y_scores, train_y_values):
338
597
  positive_scores = train_y_scores[:, 1]
339
598
 
340
- thresholds, tprs, fprs = evaluate_thresholds(train_y_values, positive_scores, self.classes_)
341
- thresholds, tprs, fprs = self._get_best_threshold(thresholds, tprs, fprs)
599
+ thresholds, tprs, fprs = evaluate_thresholds(train_y_values, positive_scores)
600
+ thresholds, tprs, fprs = self.get_best_threshold(thresholds, tprs, fprs)
342
601
 
343
602
  prevs = []
344
603
  for thr, tpr, fpr in zip(thresholds, tprs, fprs):
@@ -349,13 +608,37 @@ class MS(ThresholdAdjustment):
349
608
  prevalence = np.median(prevs)
350
609
  return np.asarray([1 - prevalence, prevalence])
351
610
 
352
- def _get_best_threshold(self, thresholds, tprs, fprs):
611
+ def get_best_threshold(self, thresholds, tprs, fprs):
353
612
  return thresholds, tprs, fprs
354
613
 
355
614
 
356
615
  class MS2(MS):
357
- r"""MS2 — Median Sweep variant with constraint \( |\text{TPR} - \text{FPR}| > 0.25 \)."""
358
- def _get_best_threshold(self, thresholds, tprs, fprs):
616
+ r"""MS2 — Median Sweep variant constraining :math:`|\text{TPR} - \text{FPR}| > 0.25`.
617
+
618
+ This variant of Median Sweep excludes thresholds where the absolute difference
619
+ between true positive rate (TPR) and false positive rate (FPR) is below 0.25,
620
+ improving stability by avoiding ambiguous threshold regions.
621
+
622
+
623
+ Parameters
624
+ ----------
625
+ learner : estimator, optional
626
+ A supervised learning model with `fit` and `predict_proba` methods.
627
+ threshold : float, default=0.5
628
+ Classification threshold in [0, 1] applied to the :class:`CC` output.
629
+
630
+
631
+ Warnings
632
+ --------
633
+ - Warns if all TPR or FPR values are zero.
634
+ - Warns if no thresholds satisfy the constraint.
635
+
636
+ References
637
+ ----------
638
+ .. [1] Forman, G. (2008). "Quantifying Counts and Costs via Classification",
639
+ *Data Mining and Knowledge Discovery*, 17(2), 164-206.
640
+ """
641
+ def get_best_threshold(self, thresholds, tprs, fprs):
359
642
  if np.all(tprs == 0) or np.all(fprs == 0):
360
643
  warnings.warn("All TPR or FPR values are zero.")
361
644
  indices = np.where(np.abs(tprs - fprs) > 0.25)[0]
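The threshold-selection rules introduced in the hunks above, collected side by side for reference (an editorial summary that mirrors the code shown in this diff; MS itself keeps every threshold and takes the median of the per-threshold adjusted estimates):

    import numpy as np

    # `thresholds`, `tprs`, `fprs` come from the ROC sweep on training data.
    def pick_x(thresholds, tprs, fprs):        # X: TPR + FPR closest to 1
        i = np.argmin(np.abs(1 - (tprs + fprs)))
        return thresholds[i], tprs[i], fprs[i]

    def pick_max(thresholds, tprs, fprs):      # MAX: largest |TPR - FPR|
        i = np.argmax(np.abs(tprs - fprs))
        return thresholds[i], tprs[i], fprs[i]

    def pick_t50(thresholds, tprs, fprs):      # T50: TPR closest to 0.5
        i = np.argmin(np.abs(tprs - 0.5))
        return thresholds[i], tprs[i], fprs[i]

    def keep_ms2(thresholds, tprs, fprs):      # MS2: keep only |TPR - FPR| > 0.25
        keep = np.abs(tprs - fprs) > 0.25
        return thresholds[keep], tprs[keep], fprs[keep]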
@@ -174,7 +174,7 @@ class BaseAdjustCount(AggregationMixin, BaseQuantifier):
174
174
  --------
175
175
  >>> from mlquantify.base_count import BaseAdjustCount
176
176
  >>> import numpy as np
177
-
177
+ >>> from sklearn.linear_model import LogisticRegression
178
178
  >>> class ACC(CrispLearnerQMixin, BaseAdjustCount):
179
179
  ... def _adjust(self, preds, train_preds, y_train):
180
180
  ... tpr = np.mean(train_preds[y_train == 1])
@@ -182,8 +182,6 @@ class BaseAdjustCount(AggregationMixin, BaseQuantifier):
182
182
  ... p_obs = np.mean(preds)
183
183
  ... p_adj = (p_obs - fpr) / (tpr - fpr)
184
184
  ... return np.clip([1 - p_adj, p_adj], 0, 1)
185
-
186
- >>> from sklearn.linear_model import LogisticRegression
187
185
  >>> X = np.random.randn(100, 5)
188
186
  >>> y = np.random.randint(0, 2, 100)
189
187
  >>> q = ACC(learner=LogisticRegression())