mlquantify 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,7 +30,7 @@ from mlquantify.utils.prevalence import get_prev_from_labels
30
30
 
31
31
 
32
32
  def get_protocol_sampler(protocol_name, batch_size, n_prevalences, min_prev, max_prev, n_classes):
33
- """ Returns a prevalence sampler function based on the specified protocol name.
33
+ r""" Returns a prevalence sampler function based on the specified protocol name.
34
34
 
35
35
  Parameters
36
36
  ----------
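For orientation, a minimal sketch of what a 'kraemer'-style sampler could return, drawing prevalence vectors uniformly from the probability simplex; the helper name, signature, and the omission of min_prev/max_prev handling are illustrative assumptions, not mlquantify's actual implementation.

import numpy as np

def make_kraemer_sampler(n_prevalences, n_classes, rng=None):
    # Illustrative sketch: draw prevalence vectors uniformly from the simplex
    # (Kraemer-style: sort uniform cut points and take successive differences).
    rng = np.random.default_rng(rng)
    def sampler():
        cuts = np.sort(rng.uniform(size=(n_prevalences, n_classes - 1)), axis=1)
        bounded = np.hstack([np.zeros((n_prevalences, 1)), cuts, np.ones((n_prevalences, 1))])
        return np.diff(bounded, axis=1)  # each row sums to 1
    return sampler

sample_prevalences = make_kraemer_sampler(n_prevalences=5, n_classes=3, rng=0)
print(sample_prevalences())  # 5 rows, each a prevalence vector over 3 classes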
@@ -80,8 +80,7 @@ def get_protocol_sampler(protocol_name, batch_size, n_prevalences, min_prev, max
80
80
  return protocol
81
81
 
82
82
  class EnsembleQ(MetaquantifierMixin, BaseQuantifier):
83
- """
84
- Ensemble-based Quantifier combining multiple models trained on varied data samples
83
+ r"""Ensemble-based Quantifier combining multiple models trained on varied data samples
85
84
  with controlled prevalence distributions to improve robustness and accuracy.
86
85
 
87
86
  This quantifier constructs an ensemble of quantification models using batches of training
@@ -128,18 +127,6 @@ class EnsembleQ(MetaquantifierMixin, BaseQuantifier):
128
127
  posteriors_generator : callable or None
129
128
  Function to generate posterior probabilities for new samples.
130
129
 
131
- Methods
132
- -------
133
- fit(X, y)
134
- Fits all ensemble member quantifiers on sampled training batches.
135
- predict(X)
136
- Aggregates ensemble member predictions into final prevalence estimates.
137
- ptr_selection_metric(prevalences, train_prevalences)
138
- Implements PTR-based selection metric on prevalence estimates.
139
- ds_get_posteriors(X, y)
140
- Computes posterior probabilities for training data with cross-validated logistic regression.
141
- ds_selection_metric(X, prevalences, train_distributions, posteriors_generator)
142
- Implements DS-based selection metric comparing posterior distributions.
143
130
 
144
131
  Notes
145
132
  -----
@@ -149,9 +136,25 @@ class EnsembleQ(MetaquantifierMixin, BaseQuantifier):
149
136
 
150
137
  Examples
151
138
  --------
152
- >>> ensemble = EnsembleQ(quantifier=SomeQuantifier(), size=30, protocol='kraemer', selection_metric='ptr')
139
+ >>> from mlquantify.ensemble import EnsembleQ
140
+ >>> from mlquantify.mixture import DyS
141
+ >>> from sklearn.ensemble import RandomForestClassifier
142
+ >>>
143
+ >>> ensemble = EnsembleQ(
144
+ ... quantifier=DyS(RandomForestClassifier()),
145
+ ... size=30,
146
+ ... protocol='artificial', # APP protocol
147
+ ... selection_metric='ptr'
148
+ ... )
153
149
  >>> ensemble.fit(X_train, y_train)
154
150
  >>> prevalence_estimates = ensemble.predict(X_test)
151
+
152
+ References
153
+ ----------
154
+ .. [1] Pérez-Gállego, P., Castaño, A., Quevedo, J. R., & del Coz, J. J. (2019). Dynamic ensemble selection for quantification tasks. Information Fusion, 45, 1-15. https://doi.org/10.1016/j.inffus.2018.01.001
155
+
156
+ .. [2] Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017). Using ensembles for problems with characterizable changes in data distribution: A case study on quantification. Information Fusion, 34, 87-100. https://doi.org/10.1016/j.inffus.2016.07.001
157
+
155
158
  """
156
159
 
157
160
  _parameter_constraints = {
@@ -306,7 +309,7 @@ class EnsembleQ(MetaquantifierMixin, BaseQuantifier):
306
309
 
307
310
 
308
311
  def ptr_selection_metric(self, prevalences, train_prevalences):
309
- """
312
+ r"""
310
313
  Selects the prevalence estimates from models trained on samples whose prevalence is most similar
311
314
  to an initial approximation of the test prevalence as estimated by all models in the ensemble.
312
315
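The PTR criterion described above reduces to a nearest-prevalence selection; a minimal sketch under the assumption that an L1 distance to the ensemble's mean estimate is used (function name and distance choice are illustrative, not the class's exact code):

import numpy as np

def ptr_select(prevalences, train_prevalences, k):
    # prevalences: (n_members, n_classes) estimates on the test set
    # train_prevalences: (n_members, n_classes) prevalence of each member's training sample
    approx = np.mean(prevalences, axis=0)                  # initial approximation of the test prevalence
    dist = np.abs(train_prevalences - approx).sum(axis=1)  # L1 distance to that approximation
    order = np.argsort(dist)                               # members trained on the most similar samples first
    return prevalences[order[:k]]                          # keep the k closest members' estimates

estimates = np.array([[0.6, 0.4], [0.3, 0.7], [0.5, 0.5]])
train_prevs = np.array([[0.5, 0.5], [0.2, 0.8], [0.45, 0.55]])
print(ptr_select(estimates, train_prevs, k=2).mean(axis=0))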
 
@@ -326,7 +329,7 @@ class EnsembleQ(MetaquantifierMixin, BaseQuantifier):
326
329
  return _select_k(prevalences, order, k=self.p_metric)
327
330
 
328
331
  def ds_get_posteriors(self, X, y):
329
- """
332
+ r"""
330
333
  Generate posterior probabilities using cross-validated logistic regression.
331
334
  This method computes posterior probabilities for the training data via cross-validation,
332
335
  using a logistic regression classifier with hyperparameters optimized through grid search.
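A minimal sketch of such cross-validated posterior generation with scikit-learn; the hyperparameter grid and fold count below are placeholders rather than the library's actual settings:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_predict

def cv_posteriors(X, y, n_folds=3):
    # Tune logistic regression via grid search, then produce out-of-fold posteriors.
    grid = GridSearchCV(LogisticRegression(max_iter=1000),
                        param_grid={"C": [0.1, 1.0, 10.0]}, cv=n_folds)
    grid.fit(X, y)
    best = grid.best_estimator_
    posteriors = cross_val_predict(best, X, y, cv=n_folds, method="predict_proba")
    return posteriors, best  # out-of-fold P(y|x) for the training data, plus the tuned model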
@@ -370,7 +373,7 @@ class EnsembleQ(MetaquantifierMixin, BaseQuantifier):
370
373
 
371
374
 
372
375
  def ds_selection_metric(self, X, prevalences, train_distributions, posteriors_generator):
373
- """
376
+ r"""
374
377
  Selects the prevalence estimates from models trained on samples whose distribution of posterior
375
378
  probabilities is most similar to the distribution of posterior probabilities for the test data.
376
379
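A sketch of the DS criterion in the same spirit, assuming posteriors are compared through binned histograms and a Hellinger distance (the library's exact binning and metric may differ):

import numpy as np

def ds_select(prevalences, train_posterior_hists, test_posteriors, k, bins=8):
    # Compare the histogram of test posteriors against each member's training histogram.
    test_hist, _ = np.histogram(test_posteriors, bins=bins, range=(0, 1))
    def hellinger(p, q):
        p, q = p / (p.sum() + 1e-20), q / (q.sum() + 1e-20)
        return np.sqrt(0.5 * np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))
    dists = [hellinger(test_hist, h) for h in train_posterior_hists]
    order = np.argsort(dists)          # members whose training scores look most like the test scores
    return prevalences[order[:k]]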
 
@@ -393,7 +396,7 @@ class EnsembleQ(MetaquantifierMixin, BaseQuantifier):
393
396
  return _select_k(prevalences, order, k=self.p_metric)
394
397
 
395
398
  def _select_k(elements, order, k):
396
- """
399
+ r"""
397
400
  Selects the k elements from the list of elements based on the order.
398
401
  If the list is empty, it returns the original list.
399
402
 
@@ -422,7 +425,7 @@ def _select_k(elements, order, k):
422
425
 
423
426
 
424
427
  class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
425
- """
428
+ r"""
426
429
  Aggregative Bootstrap Quantifier to compute prevalence confidence regions.
427
430
 
428
431
  This metaquantifier applies bootstrapping to both training and test data predictions
@@ -445,18 +448,17 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
445
448
  confidence_level : float between 0 and 1, default=0.95
446
449
  Confidence level for intervals or regions.
447
450
 
448
- Methods
449
- -------
450
- fit(X, y, val_split=None)
451
- Fits base quantifier and generates training predictions (optionally splitting data).
452
- predict(X)
453
- Returns prevalence estimates and confidence regions aggregated from bootstrap samples.
454
- aggregate(predictions, train_predictions, train_y_values)
455
- Performs bootstrap resampling aggregation to obtain prevalence confidence regions.
456
451
 
457
452
  Examples
458
453
  --------
459
- >>> agg_boot = AggregativeBootstrap(quantifier=SomeQuantifier, n_train_bootstraps=100, n_test_bootstraps=100)
454
+ >>> from mlquantify.ensemble import AggregativeBootstrap
455
+ >>> from mlquantify.neighbors import EMQ
456
+ >>> from sklearn.ensemble import RandomForestClassifier
457
+ >>> agg_boot = AggregativeBootstrap(
458
+ ... quantifier=EMQ(RandomForestClassifier()),
459
+ ... n_train_bootstraps=100,
460
+ ... n_test_bootstraps=100
461
+ ... )
460
462
  >>> agg_boot.fit(X_train, y_train)
461
463
  >>> prevalence, conf_region = agg_boot.predict(X_test)
462
464
  """
@@ -485,7 +487,7 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
485
487
  self.confidence_level = confidence_level
486
488
 
487
489
  def fit(self, X, y, val_split=None):
488
- """ Fits the aggregative bootstrap model to the given training data.
490
+ r""" Fits the aggregative bootstrap model to the given training data.
489
491
 
490
492
  Parameters
491
493
  ----------
@@ -498,6 +500,11 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
498
500
  -------
499
501
  self : AggregativeBootstrap
500
502
  The fitted aggregative bootstrap model.
503
+
504
+ Raises
505
+ ------
506
+ ValueError
507
+ If the provided quantifier is not an aggregative quantifier.
501
508
  """
502
509
  X, y = validate_data(self, X, y)
503
510
  self.classes = np.unique(y)
@@ -524,7 +531,7 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
524
531
  return self
525
532
 
526
533
  def predict(self, X):
527
- """ Predicts the class prevalences for the given test data.
534
+ r""" Predicts the class prevalences for the given test data.
528
535
 
529
536
  Parameters
530
537
  ----------
@@ -546,7 +553,7 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
546
553
 
547
554
 
548
555
  def aggregate(self, predictions, train_predictions, train_y_values):
549
- """ Aggregates the predictions using bootstrap resampling.
556
+ r""" Aggregates the predictions using bootstrap resampling.
550
557
 
551
558
  Parameters
552
559
  ----------
@@ -612,10 +619,10 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
612
619
 
613
620
 
614
621
  class QuaDapt(MetaquantifierMixin, BaseQuantifier):
615
- r"""QuaDapt Metaquantifier: Adaptive quantification using score merging and distance measures.
622
+ r"""QuaDapt Metaquantifier: Adaptive quantification using synthetic scores.
616
623
 
617
624
  This metaquantifier improves prevalence estimation by merging training samples
618
- with different score distributions using a merging factor \( m \). It evaluates
625
+ with different score distributions using a merging factor :math:`m`. It evaluates
619
626
  candidate merging factors, chooses the best by minimizing a distribution distance
620
627
  metric (Hellinger, Topsoe, ProbSymm, or SORD), and aggregates quantification accordingly.
621
628
 
@@ -625,38 +632,28 @@ class QuaDapt(MetaquantifierMixin, BaseQuantifier):
625
632
  The base quantifier model to adapt.
626
633
  measure : {'hellinger', 'topsoe', 'probsymm', 'sord'}, default='topsoe'
627
634
  The distribution distance metric used to select the best merging factor.
628
- merging_factor : array-like
635
+ merging_factors : array-like
629
636
  Candidate merging factor values to evaluate.
630
637
 
631
- Methods
632
- -------
633
- fit(X, y)
634
- Fits the base learner on training data.
635
- predict(X)
636
- Predicts prevalence aggregating via the best merging factor.
637
- aggregate(predictions, train_y_values)
638
- Performs adaptation and aggregation based on merged score distributions.
639
- _get_best_merging_factor(predictions)
640
- Evaluates merging factors and selects the best based on minimum distance.
641
- _get_best_distance(predictions, pos_scores, neg_scores)
642
- Computes the distance metric between predicted and class score distributions.
643
-
644
- Class Methods
645
- -------------
646
- MoSS(n, alpha, m)
647
- Generates merged score samples modeling class conditional distributions
648
- parameterized by mixing proportion alpha and merging factor m.
649
-
650
638
  Examples
651
639
  --------
652
- >>> quadapt = QuaDapt(quantifier=SomeQuantifier, merging_factor=[0.1, 0.5, 1.0], measure='sord')
653
- >>> quadapt.fit(X_train, y_train)
654
- >>> prevalence = quadapt.predict(X_test)
640
+ >>> from mlquantify.meta import QuaDapt
641
+ >>> from mlquantify.adjust_counting import ACC
642
+ >>> from sklearn.ensemble import RandomForestClassifier
643
+ >>> quadapt_acc = QuaDapt(
644
+ ... quantifier=ACC(RandomForestClassifier()),
645
+ ... merging_factors=[0.1, 0.5, 1.0],
646
+ ... measure='sord'
647
+ ... )
648
+ >>> quadapt_acc.fit(X_train, y_train)
649
+ >>> prevalence = quadapt_acc.predict(X_test)
650
+
651
+
655
652
  """
656
653
 
657
654
  _parameter_constraints = {
658
655
  "quantifier": [BaseQuantifier],
659
- "merging_factor": "array-like",
656
+ "merging_factors": "array-like",
660
657
  "measure": [Options(["hellinger", "topsoe", "probsymm", "sord"])],
661
658
  "random_state": [Options([None, int])],
662
659
  }
@@ -664,10 +661,10 @@ class QuaDapt(MetaquantifierMixin, BaseQuantifier):
664
661
  def __init__(self,
665
662
  quantifier,
666
663
  measure="topsoe",
667
- merging_factor=(0.1, 1.0, 0.2)):
664
+ merging_factors=(0.1, 1.0, 0.2)):
668
665
  self.quantifier = quantifier
669
666
  self.measure = measure
670
- self.merging_factor = merging_factor
667
+ self.merging_factors = merging_factors
671
668
 
672
669
 
673
670
  def fit(self, X, y):
@@ -719,7 +716,7 @@ class QuaDapt(MetaquantifierMixin, BaseQuantifier):
719
716
 
720
717
  def _get_best_merging_factor(self, predictions):
721
718
 
722
- MF = np.atleast_1d(np.round(self.merging_factor, 2)).astype(float)
719
+ MF = np.atleast_1d(np.round(self.merging_factors, 2)).astype(float)
723
720
 
724
721
  distances = []
725
722
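A rough sketch of this selection loop, under the assumption that each candidate factor is scored by the distance between the observed test-score histogram and MoSS-generated synthetic scores; names, defaults, and the fixed alpha below are illustrative only:

import numpy as np

def best_merging_factor(test_scores, merging_factors, distance, alpha=0.5, n=2000, bins=10):
    # Score each candidate m by how well MoSS-style synthetic scores match the test scores.
    test_hist, edges = np.histogram(test_scores, bins=bins, range=(0, 1), density=True)
    best_m, best_d = None, np.inf
    for m in np.round(np.atleast_1d(merging_factors), 2):
        pos = np.random.uniform(size=int(n * alpha)) ** m          # synthetic positive scores
        neg = 1 - np.random.uniform(size=n - int(n * alpha)) ** m  # synthetic negative scores
        synth_hist, _ = np.histogram(np.concatenate([pos, neg]), bins=edges, density=True)
        d = distance(test_hist, synth_hist)
        if d < best_d:
            best_m, best_d = m, d
    return best_m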
 
@@ -748,6 +745,33 @@ class QuaDapt(MetaquantifierMixin, BaseQuantifier):
748
745
 
749
746
  @classmethod
750
747
  def MoSS(cls, n, alpha, m):
748
+ r"""Model for Score Simulation
749
+
750
+ MoSS has three key parameters:
751
+ (I) the number of observations `n`;
752
+ (II) the class proportion :math:`\alpha`, which defines the prevalence of the positive class;
753
+ (III) the merging factor :math:`m`, which controls the overlap between positive and negative score distributions
754
+ (where :math:`m=0` represents easily separable classes and :math:`m=1` represents highly overlapping ones).
755
+
756
+ .. math::
757
+
758
+ \mathrm{moss}(n, \alpha, \mathfrak{m}) = \mathrm{syn}(\oplus, \lfloor \alpha n \rfloor, \mathfrak{m}) \cup \mathrm{syn}(\ominus , \lfloor (1 - \alpha) n \rfloor, \mathfrak{m})
759
+
760
+ Notes
761
+ -----
762
+ The MoSS generates only binary scores, simulating positive and negative class scores.
763
+
764
+ Examples
765
+ --------
766
+ >>> scores = QuaDapt.MoSS(n=1000, alpha=0.3, m=0.5)
767
+ >>> print(scores.shape)
768
+ (1000, 3)
769
+
770
+ References
771
+ ----------
772
+ .. [1] Maletzke, A., Reis, D. dos, Hassan, W., & Batista, G. (2021).
773
+ Accurately Quantifying under Score Variability. 2021 IEEE International Conference on Data Mining (ICDM), 1228-1233. https://doi.org/10.1109/ICDM51629.2021.00149
774
+ """
751
775
  p_score = np.random.uniform(size=int(n * alpha)) ** m
752
776
  n_score = 1 - (np.random.uniform(size=int(round(n * (1 - alpha), 0))) ** m)
753
777
  scores = np.column_stack(
mlquantify/metrics/_oq.py CHANGED
@@ -28,7 +28,7 @@ def process_inputs(prev_pred, prev_real):
28
28
 
29
29
 
30
30
  def NMD(prev_pred, prev_real, distances=None):
31
- """
31
+ r"""
32
32
  Compute the Normalized Match Distance (NMD), also known as Earth Mover’s Distance (EMD),
33
33
  for ordinal quantification evaluation.
34
34
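Assuming unit distances between adjacent ordinal classes and normalization by n_classes - 1, NMD reduces to a short computation on cumulative prevalences; a sketch:

import numpy as np

def nmd(prev_pred, prev_real):
    # Match Distance with unit spacing between adjacent ordinal classes,
    # normalized by (n_classes - 1).
    prev_pred, prev_real = np.asarray(prev_pred), np.asarray(prev_real)
    cdf_diff = np.cumsum(prev_pred) - np.cumsum(prev_real)
    return np.abs(cdf_diff[:-1]).sum() / (len(prev_pred) - 1)

print(nmd([0.2, 0.5, 0.3], [0.3, 0.4, 0.3]))  # 0.1 / 2 = 0.05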
 
@@ -66,7 +66,7 @@ def NMD(prev_pred, prev_real, distances=None):
66
66
 
67
67
 
68
68
  def RNOD(prev_pred, prev_real, distances=None):
69
- """
69
+ r"""
70
70
  Compute the Root Normalised Order-aware Divergence (RNOD) for ordinal quantification evaluation.
71
71
 
72
72
  Parameters
mlquantify/metrics/_rq.py CHANGED
@@ -31,7 +31,7 @@ def process_inputs(prev_pred, prev_real):
31
31
 
32
32
 
33
33
  def VSE(prev_pred, prev_real, train_values):
34
- """
34
+ r"""
35
35
  Compute the Variance-normalised Squared Error (VSE).
36
36
 
37
37
  Parameters
@@ -60,7 +60,7 @@ def VSE(prev_pred, prev_real, train_values):
60
60
 
61
61
 
62
62
  def CvM_L1(prev_pred, prev_real, n_bins=100):
63
- """
63
+ r"""
64
64
  Compute the L1 version of the Cramér–von Mises statistic (Xiao et al., 2006)
65
65
  between two cumulative distributions, as suggested by Bella et al. (2014).
66
66
 
@@ -30,7 +30,7 @@ def process_inputs(prev_pred, prev_real):
30
30
 
31
31
 
32
32
  def AE(prev_pred, prev_real):
33
- """
33
+ r"""
34
34
  Compute the absolute error for each class or a dictionary of errors if input is a dictionary.
35
35
 
36
36
  Parameters
@@ -57,7 +57,7 @@ def AE(prev_pred, prev_real):
57
57
 
58
58
 
59
59
  def MAE(prev_pred, prev_real):
60
- """
60
+ r"""
61
61
  Compute the mean absolute error between the real and predicted prevalences.
62
62
 
63
63
  Parameters
@@ -78,7 +78,7 @@ def MAE(prev_pred, prev_real):
78
78
 
79
79
 
80
80
  def KLD(prev_pred, prev_real):
81
- """
81
+ r"""
82
82
  Compute the Kullback-Leibler divergence between the real and predicted prevalences.
83
83
 
84
84
  Parameters
@@ -99,7 +99,7 @@ def KLD(prev_pred, prev_real):
99
99
 
100
100
 
101
101
  def SE(prev_pred, prev_real):
102
- """
102
+ r"""
103
103
  Compute the mean squared error between the real and predicted prevalences.
104
104
 
105
105
  Parameters
@@ -120,7 +120,7 @@ def SE(prev_pred, prev_real):
120
120
 
121
121
 
122
122
  def MSE(prev_pred, prev_real):
123
- """ Mean Squared Error
123
+ r""" Mean Squared Error
124
124
 
125
125
  Parameters
126
126
  ----------
@@ -140,7 +140,7 @@ def MSE(prev_pred, prev_real):
140
140
 
141
141
 
142
142
  def NAE(prev_pred, prev_real):
143
- """
143
+ r"""
144
144
  Compute the normalized absolute error between the real and predicted prevalences.
145
145
 
146
146
  Parameters
@@ -163,7 +163,7 @@ def NAE(prev_pred, prev_real):
163
163
 
164
164
 
165
165
  def NKLD(prev_pred, prev_real):
166
- """
166
+ r"""
167
167
  Compute the normalized Kullback-Leibler divergence between the real and predicted prevalences.
168
168
 
169
169
  Parameters
@@ -186,7 +186,7 @@ def NKLD(prev_pred, prev_real):
186
186
 
187
187
 
188
188
  def RAE(prev_pred, prev_real):
189
- """
189
+ r"""
190
190
  Compute the relative absolute error between the real and predicted prevalences.
191
191
 
192
192
  Parameters
@@ -207,7 +207,7 @@ def RAE(prev_pred, prev_real):
207
207
 
208
208
 
209
209
  def NRAE(prev_pred, prev_real):
210
- """
210
+ r"""
211
211
  Compute the normalized relative absolute error between the real and predicted prevalences.
212
212
 
213
213
  Parameters
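Taken together, the simpler error measures above amount to a few lines of NumPy; a sketch with illustrative smoothing constants (the library's defaults may differ):

import numpy as np

def ae(p_pred, p_real):
    return np.abs(np.asarray(p_pred) - np.asarray(p_real))          # per-class absolute error

def mae(p_pred, p_real):
    return ae(p_pred, p_real).mean()                                # mean absolute error

def kld(p_pred, p_real, eps=1e-12):
    p, q = np.asarray(p_real) + eps, np.asarray(p_pred) + eps       # real vs. predicted, smoothed
    return np.sum(p * np.log(p / q))

def rae(p_pred, p_real, eps=1e-12):
    p, q = np.asarray(p_real), np.asarray(p_pred)
    return np.mean(np.abs(q - p) / (p + eps))                       # relative absolute error

print(mae([0.6, 0.4], [0.5, 0.5]), rae([0.6, 0.4], [0.5, 0.5]))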
@@ -15,8 +15,7 @@ from mlquantify.mixture._utils import (
15
15
  )
16
16
 
17
17
  class BaseMixture(BaseQuantifier):
18
- """
19
- Base class for mixture-model quantifiers.
18
+ r"""Base class for mixture-model quantifiers.
20
19
 
21
20
  Mixture Models (MM) for quantification estimate class prevalences by modeling
22
21
  the test set score distribution as a mixture of the individual class score
@@ -39,7 +38,7 @@ class BaseMixture(BaseQuantifier):
39
38
  scores or histograms, and the choice of distance can affect quantification accuracy
40
39
  and robustness.
41
40
 
42
- The DyS framework (Maletzke et al. 2019) generalizes mixture models by introducing
41
+ The DyS framework [3]_ generalizes mixture models by introducing
43
42
  a variety of distribution dissimilarity measures, enabling flexible and effective
44
43
  quantification methods.
45
44
 
@@ -49,11 +48,13 @@ class BaseMixture(BaseQuantifier):
49
48
  Mixture models are defined for only binary quantification problems. For multi-class
50
49
  problems, a one-vs-rest strategy is applied, training a binary mixture model for
51
50
  each class against the rest.
51
+
52
52
 
53
53
  Parameters
54
54
  ----------
55
55
  None directly; subclasses implement fitting and prediction logic.
56
56
 
57
+
57
58
  Attributes
58
59
  ----------
59
60
  _precomputed : bool
@@ -63,19 +64,6 @@ class BaseMixture(BaseQuantifier):
63
64
  classes : ndarray of shape (n_classes,)
64
65
  Unique class labels seen during training.
65
66
 
66
- Methods
67
- -------
68
- fit(X, y, *args, **kwargs):
69
- Fit the mixture quantifier with training data. Validates input and
70
- calls internal fitting procedure.
71
- predict(X, *args, **kwargs):
72
- Predict class prevalences for input data by leveraging best mixture parameters.
73
- get_best_distance(*args, **kwargs):
74
- Return the best distance measure and associated mixture parameters found.
75
- best_mixture(X):
76
- Abstract method to determine optimal mixture parameters on input data.
77
- get_distance(dist_train, dist_test, measure="hellinger"):
78
- Compute a specified distance between two distributions.
79
67
 
80
68
  References
81
69
  ----------
@@ -118,6 +106,14 @@ class BaseMixture(BaseQuantifier):
118
106
  return self._predict(X, *args, **kwargs)
119
107
 
120
108
  def get_best_distance(self, *args, **kwargs):
109
+ r""" Get the best distance value from the mixture fitting process.
110
+
111
+ Notes
112
+ -----
113
+ If the quantifier has not been fitted yet, it will fit the model for getting the
114
+ best distance.
115
+
116
+ """
121
117
  _, best_distance = self.best_mixture(*args, **kwargs)
122
118
  return best_distance
123
119
 
@@ -128,9 +124,7 @@ class BaseMixture(BaseQuantifier):
128
124
 
129
125
  @classmethod
130
126
  def get_distance(cls, dist_train, dist_test, measure="hellinger"):
131
- """
132
- Compute distance between two distributions.
133
- """
127
+ r"""Compute distance between two distributions."""
134
128
 
135
129
  if np.sum(dist_train) < 1e-20 or np.sum(dist_test) < 1e-20:
136
130
  raise ValueError("One or both vectors are zero (empty)...")
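Two of the supported measures, sketched for binned score distributions; normalization and smoothing details are assumptions rather than the exact get_distance implementation:

import numpy as np

def hellinger(p, q):
    p, q = np.asarray(p, float), np.asarray(q, float)
    p, q = p / p.sum(), q / q.sum()
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)

def topsoe(p, q, eps=1e-20):
    p, q = np.asarray(p, float) + eps, np.asarray(q, float) + eps
    p, q = p / p.sum(), q / q.sum()
    m = (p + q) / 2
    return np.sum(p * np.log(p / m) + q * np.log(q / m))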
@@ -21,8 +21,7 @@ from mlquantify.mixture._utils import (
21
21
  # =====================================================
22
22
  @define_binary
23
23
  class AggregativeMixture(SoftLearnerQMixin, AggregationMixin, BaseMixture):
24
- """
25
- Base class for Mixture-based Quantification Methods.
24
+ r"""Base class for Mixture-based Quantification Methods.
26
25
 
27
26
  These methods assume that the test score distribution is a mixture
28
27
  of the positive and negative score distributions from the training data.
@@ -105,7 +104,7 @@ class AggregativeMixture(SoftLearnerQMixin, AggregationMixin, BaseMixture):
105
104
  # =====================================================
106
105
 
107
106
  class DyS(AggregativeMixture):
108
- """Distribution y-Similarity (DyS) quantification method.
107
+ r"""Distribution y-Similarity (DyS) quantification method.
109
108
 
110
109
  Uses mixture modeling with a dissimilarity measure between distributions
111
110
  computed on histograms of classifier scores. This method optimizes mixture
@@ -128,7 +127,9 @@ class DyS(AggregativeMixture):
128
127
 
129
128
  Examples
130
129
  --------
131
- >>> q = DyS(learner=my_learner, measure="hellinger")
130
+ >>> from mlquantify.mixture import DyS
131
+ >>> from sklearn.linear_model import LogisticRegression
132
+ >>> q = DyS(learner=LogisticRegression(), measure="hellinger")
132
133
  >>> q.fit(X_train, y_train)
133
134
  >>> prevalences = q.predict(X_test)
134
135
  """
@@ -147,6 +148,35 @@ class DyS(AggregativeMixture):
147
148
  self.bins_size = np.asarray(bins_size, dtype=int)
148
149
 
149
150
  def best_mixture(self, predictions, pos_scores, neg_scores):
151
+ r"""Determine the best mixture parameters for the given data.
152
+
153
+ Applies ternary search to find the mixture weight minimizing the distance
154
+ between the test score histogram and the mixture of positive and negative
155
+
156
+ The mixture weight :math:`\alpha` is estimated as:
157
+ .. math::
158
+ \alpha = \arg \min_{\alpha \in [0, 1]} D \left( H_{test}, \alpha H_{pos} + (1 - \alpha) H_{neg} \right)
159
+
160
+ where :math:`D` is the selected distance measure and :math:`H` denotes histograms.
161
+
162
+
163
+ Parameters
164
+ ----------
165
+ predictions : ndarray
166
+ Classifier scores for the test data.
167
+ pos_scores : ndarray
168
+ Classifier scores for the positive class from training data.
169
+ neg_scores : ndarray
170
+ Classifier scores for the negative class from training data.
171
+
172
+
173
+ Returns
174
+ -------
175
+ alpha : float
176
+ Estimated mixture weight.
177
+ best_distance : float
178
+ Distance corresponding to the best mixture weight.
179
+ """
150
180
 
151
181
  prevs = []
152
182
  self.distances = []
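The ternary search itself can be sketched as follows for a single histogram size, assuming the distance is unimodal in the mixture weight; the method additionally iterates over the configured bins_size values:

import numpy as np

def ternary_search_alpha(test_hist, pos_hist, neg_hist, distance, tol=1e-4):
    # Find alpha in [0, 1] minimizing D(test_hist, alpha*pos_hist + (1-alpha)*neg_hist).
    def objective(a):
        return distance(test_hist, a * pos_hist + (1 - a) * neg_hist)
    lo, hi = 0.0, 1.0
    while hi - lo > tol:
        m1, m2 = lo + (hi - lo) / 3, hi - (hi - lo) / 3
        if objective(m1) < objective(m2):
            hi = m2
        else:
            lo = m1
    alpha = (lo + hi) / 2
    return alpha, objective(alpha)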
@@ -175,7 +205,7 @@ class DyS(AggregativeMixture):
175
205
  # =====================================================
176
206
 
177
207
  class HDy(AggregativeMixture):
178
- """Hellinger Distance Minimization (HDy) quantification method.
208
+ r"""Hellinger Distance Minimization (HDy) quantification method.
179
209
 
180
210
  Estimates class prevalences by finding mixture weights that minimize
181
211
  the Hellinger distance between the histogram of test scores and the mixture
@@ -193,6 +223,35 @@ class HDy(AggregativeMixture):
193
223
  """
194
224
 
195
225
  def best_mixture(self, predictions, pos_scores, neg_scores):
226
+ r"""Determine the best mixture parameters for the given data.
227
+
228
+ Compute the mixture weight :math:`\alpha` that minimizes the Hellinger distance between the test score histogram and the mixture of positive and negative class score histograms.
229
+
230
+ The mixture weight :math:`\alpha` is estimated as:
231
+ .. math::
232
+ \alpha = \arg \min_{\alpha \in [0, 1]} Hellinger \left( H_{test}, \alpha H_{pos} + (1 - \alpha) H_{neg} \right)
233
+
234
+ where :math:`H` denotes histograms.
235
+
236
+
237
+ Parameters
238
+ ----------
239
+ predictions : ndarray
240
+ Classifier scores for the test data.
241
+ pos_scores : ndarray
242
+ Classifier scores for the positive class from training data.
243
+ neg_scores : ndarray
244
+ Classifier scores for the negative class from training data.
245
+
246
+
247
+ Returns
248
+ -------
249
+ alpha : float
250
+ Estimated mixture weight.
251
+ best_distance : float
252
+ Distance corresponding to the best mixture weight.
253
+ """
254
+
196
255
  bins_size = np.arange(10, 110, 11)
197
256
  alpha_values = np.round(np.linspace(0, 1, 101), 2)
198
257
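A sketch of that grid search, assuming the per-bin-size estimates are summarized by their median (one common choice for HDy); the hellinger argument is any callable distance between histograms:

import numpy as np

def hdy_alpha(test_scores, pos_scores, neg_scores, hellinger):
    # For each histogram size, pick the alpha minimizing the Hellinger distance,
    # then summarize the estimates across bin sizes.
    alphas = np.round(np.linspace(0, 1, 101), 2)
    estimates = []
    for bins in np.arange(10, 110, 11):
        edges = np.linspace(0, 1, bins + 1)
        h_test, _ = np.histogram(test_scores, bins=edges, density=True)
        h_pos, _ = np.histogram(pos_scores, bins=edges, density=True)
        h_neg, _ = np.histogram(neg_scores, bins=edges, density=True)
        dists = [hellinger(h_test, a * h_pos + (1 - a) * h_neg) for a in alphas]
        estimates.append(alphas[int(np.argmin(dists))])
    return float(np.median(estimates))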
 
@@ -228,13 +287,12 @@ class SMM(AggregativeMixture):
228
287
 
229
288
  Estimates class prevalence by matching the mean score of the test samples
230
289
  to a convex combination of positive and negative training scores. The mixture
231
- weight \( \alpha \) is computed as:
290
+ weight :math:`\alpha` is computed as:
232
291
 
233
- \[
234
- \alpha = \frac{\bar{s}_{test} - \bar{s}_{neg}}{\bar{s}_{pos} - \bar{s}_{neg}}
235
- \]
292
+ .. math::
293
+ \alpha = \frac{\bar{s}_{test} - \bar{s}_{neg}}{\bar{s}_{pos} - \bar{s}_{neg}}
236
294
 
237
- where \( \bar{s} \) denotes the sample mean.
295
+ where :math:`\bar{s}` denotes the sample mean.
238
296
 
239
297
  Parameters
240
298
  ----------
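The SMM estimate follows directly from the formula above; a sketch, with the result clipped to [0, 1] as a safeguard (the clipping is an assumption, not necessarily the library's behavior):

import numpy as np

def smm_alpha(test_scores, pos_scores, neg_scores):
    # alpha = (mean(test) - mean(neg)) / (mean(pos) - mean(neg)), clipped to [0, 1]
    s_test, s_pos, s_neg = map(np.mean, (test_scores, pos_scores, neg_scores))
    return float(np.clip((s_test - s_neg) / (s_pos - s_neg), 0.0, 1.0))

# e.g. positive scores near 0.8, negative near 0.2, test mean 0.5 -> alpha = 0.5
print(smm_alpha([0.5, 0.45, 0.55], [0.8, 0.82, 0.78], [0.2, 0.18, 0.22]))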