mlquantify 0.1.9__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify/__init__.py +11 -1
- mlquantify/adjust_counting/__init__.py +11 -1
- mlquantify/adjust_counting/_adjustment.py +370 -87
- mlquantify/adjust_counting/_base.py +1 -3
- mlquantify/adjust_counting/_counting.py +27 -19
- mlquantify/adjust_counting/_utils.py +23 -28
- mlquantify/confidence.py +16 -22
- mlquantify/likelihood/_base.py +38 -52
- mlquantify/likelihood/_classes.py +88 -72
- mlquantify/meta/_classes.py +86 -62
- mlquantify/metrics/_oq.py +2 -2
- mlquantify/metrics/_rq.py +2 -2
- mlquantify/metrics/_slq.py +9 -9
- mlquantify/mixture/_base.py +13 -19
- mlquantify/mixture/_classes.py +68 -10
- mlquantify/mixture/_utils.py +62 -11
- mlquantify/model_selection/_protocol.py +6 -6
- mlquantify/model_selection/_search.py +1 -1
- mlquantify/neighbors/_base.py +35 -65
- mlquantify/neighbors/_classes.py +1 -10
- mlquantify/neighbors/_classification.py +5 -12
- mlquantify/neighbors/_kde.py +7 -9
- mlquantify/neighbors/_utils.py +17 -21
- mlquantify/utils/prevalence.py +4 -1
- {mlquantify-0.1.9.dist-info → mlquantify-0.1.10.dist-info}/METADATA +1 -1
- mlquantify-0.1.10.dist-info/RECORD +53 -0
- mlquantify-0.1.9.dist-info/RECORD +0 -53
- {mlquantify-0.1.9.dist-info → mlquantify-0.1.10.dist-info}/WHEEL +0 -0
- {mlquantify-0.1.9.dist-info → mlquantify-0.1.10.dist-info}/top_level.txt +0 -0
mlquantify/meta/_classes.py
CHANGED
@@ -30,7 +30,7 @@ from mlquantify.utils.prevalence import get_prev_from_labels
 
 
 def get_protocol_sampler(protocol_name, batch_size, n_prevalences, min_prev, max_prev, n_classes):
-    """ Returns a prevalence sampler function based on the specified protocol name.
+    r""" Returns a prevalence sampler function based on the specified protocol name.
 
     Parameters
     ----------
@@ -80,8 +80,7 @@ def get_protocol_sampler(protocol_name, batch_size, n_prevalences, min_prev, max_prev, n_classes):
     return protocol
 
 class EnsembleQ(MetaquantifierMixin, BaseQuantifier):
-    """
-    Ensemble-based Quantifier combining multiple models trained on varied data samples
+    r"""Ensemble-based Quantifier combining multiple models trained on varied data samples
     with controlled prevalence distributions to improve robustness and accuracy.
 
     This quantifier constructs an ensemble of quantification models using batches of training
@@ -128,18 +127,6 @@ class EnsembleQ(MetaquantifierMixin, BaseQuantifier):
     posteriors_generator : callable or None
         Function to generate posterior probabilities for new samples.
 
-    Methods
-    -------
-    fit(X, y)
-        Fits all ensemble member quantifiers on sampled training batches.
-    predict(X)
-        Aggregates ensemble member predictions into final prevalence estimates.
-    ptr_selection_metric(prevalences, train_prevalences)
-        Implements PTR-based selection metric on prevalence estimates.
-    ds_get_posteriors(X, y)
-        Computes posterior probabilities for training data with cross-validated logistic regression.
-    ds_selection_metric(X, prevalences, train_distributions, posteriors_generator)
-        Implements DS-based selection metric comparing posterior distributions.
 
     Notes
     -----
@@ -149,9 +136,25 @@ class EnsembleQ(MetaquantifierMixin, BaseQuantifier):
 
     Examples
     --------
-    >>> ensemble
+    >>> from mlquantify.ensemble import EnsembleQ
+    >>> from mlquantify.mixture import DyS
+    >>> from sklearn.ensemble import RandomForestClassifier
+    >>>
+    >>> ensemble = EnsembleQ(
+    ...     quantifier=DyS(RandomForestClassifier()),
+    ...     size=30,
+    ...     protocol='artificial', # APP protocol
+    ...     selection_metric='ptr'
+    ... )
     >>> ensemble.fit(X_train, y_train)
     >>> prevalence_estimates = ensemble.predict(X_test)
+
+    References
+    ----------
+    .. [1] Pérez-Gállego, P., Castaño, A., Ramón Quevedo, J., & José del Coz, J. (2019). Dynamic ensemble selection for quantification tasks. Information Fusion, 45, 1-15. https://doi.org/10.1016/j.inffus.2018.01.001
+
+    .. [2] Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017). Using ensembles for problems with characterizable changes in data distribution: A case study on quantification. Information Fusion, 34, 87-100. https://doi.org/10.1016/j.inffus.2016.07.001
+
     """
 
     _parameter_constraints = {
@@ -306,7 +309,7 @@ class EnsembleQ(MetaquantifierMixin, BaseQuantifier):
 
 
     def ptr_selection_metric(self, prevalences, train_prevalences):
-        """
+        r"""
         Selects the prevalence estimates from models trained on samples whose prevalence is most similar
         to an initial approximation of the test prevalence as estimated by all models in the ensemble.
 
@@ -326,7 +329,7 @@ class EnsembleQ(MetaquantifierMixin, BaseQuantifier):
         return _select_k(prevalences, order, k=self.p_metric)
 
     def ds_get_posteriors(self, X, y):
-        """
+        r"""
         Generate posterior probabilities using cross-validated logistic regression.
         This method computes posterior probabilities for the training data via cross-validation,
         using a logistic regression classifier with hyperparameters optimized through grid search.
@@ -370,7 +373,7 @@ class EnsembleQ(MetaquantifierMixin, BaseQuantifier):
 
 
     def ds_selection_metric(self, X, prevalences, train_distributions, posteriors_generator):
-        """
+        r"""
         Selects the prevalence estimates from models trained on samples whose distribution of posterior
         probabilities is most similar to the distribution of posterior probabilities for the test data.
 
@@ -393,7 +396,7 @@ class EnsembleQ(MetaquantifierMixin, BaseQuantifier):
         return _select_k(prevalences, order, k=self.p_metric)
 
 def _select_k(elements, order, k):
-    """
+    r"""
     Selects the k elements from the list of elements based on the order.
     If the list is empty, it returns the original list.
 
@@ -422,7 +425,7 @@ def _select_k(elements, order, k):
 
 
 class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
-    """
+    r"""
     Aggregative Bootstrap Quantifier to compute prevalence confidence regions.
 
     This metaquantifier applies bootstrapping to both training and test data predictions
@@ -445,18 +448,17 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
     confidence_level : float between 0 and 1, default=0.95
         Confidence level for intervals or regions.
 
-    Methods
-    -------
-    fit(X, y, val_split=None)
-        Fits base quantifier and generates training predictions (optionally splitting data).
-    predict(X)
-        Returns prevalence estimates and confidence regions aggregated from bootstrap samples.
-    aggregate(predictions, train_predictions, train_y_values)
-        Performs bootstrap resampling aggregation to obtain prevalence confidence regions.
 
     Examples
     --------
-    >>>
+    >>> from mlquantify.ensemble import AggregativeBootstrap
+    >>> from mlquantify.neighbors import EMQ
+    >>> from sklearn.ensemble import RandomForestClassifier
+    >>> agg_boot = AggregativeBootstrap(
+    ...     quantifier=EMQ(RandomForestClassifier()),
+    ...     n_train_bootstraps=100,
+    ...     n_test_bootstraps=100
+    ... )
     >>> agg_boot.fit(X_train, y_train)
     >>> prevalence, conf_region = agg_boot.predict(X_test)
     """
@@ -485,7 +487,7 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
         self.confidence_level = confidence_level
 
     def fit(self, X, y, val_split=None):
-        """ Fits the aggregative bootstrap model to the given training data.
+        r""" Fits the aggregative bootstrap model to the given training data.
 
         Parameters
         ----------
@@ -498,6 +500,11 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
         -------
         self : AggregativeBootstrap
             The fitted aggregative bootstrap model.
+
+        Raises
+        ------
+        ValueError
+            If the provided quantifier is not an aggregative quantifier.
         """
         X, y = validate_data(self, X, y)
         self.classes = np.unique(y)
@@ -524,7 +531,7 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
         return self
 
     def predict(self, X):
-        """ Predicts the class prevalences for the given test data.
+        r""" Predicts the class prevalences for the given test data.
 
         Parameters
         ----------
@@ -546,7 +553,7 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
 
 
     def aggregate(self, predictions, train_predictions, train_y_values):
-        """ Aggregates the predictions using bootstrap resampling.
+        r""" Aggregates the predictions using bootstrap resampling.
 
         Parameters
         ----------
@@ -612,10 +619,10 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
 
 
 class QuaDapt(MetaquantifierMixin, BaseQuantifier):
-    r"""QuaDapt Metaquantifier: Adaptive quantification using
+    r"""QuaDapt Metaquantifier: Adaptive quantification using synthetic scores.
 
     This metaquantifier improves prevalence estimation by merging training samples
-    with different score distributions using a merging factor \( m \). It evaluates
+    with different score distributions using a merging factor :math: \( m \). It evaluates
     candidate merging factors, chooses the best by minimizing a distribution distance
     metric (Hellinger, Topsoe, ProbSymm, or SORD), and aggregates quantification accordingly.
 
@@ -625,38 +632,28 @@ class QuaDapt(MetaquantifierMixin, BaseQuantifier):
         The base quantifier model to adapt.
     measure : {'hellinger', 'topsoe', 'probsymm', 'sord'}, default='topsoe'
         The distribution distance metric used to select the best merging factor.
-
+    merging_factors : array-like
         Candidate merging factor values to evaluate.
 
-    Methods
-    -------
-    fit(X, y)
-        Fits the base learner on training data.
-    predict(X)
-        Predicts prevalence aggregating via the best merging factor.
-    aggregate(predictions, train_y_values)
-        Performs adaptation and aggregation based on merged score distributions.
-    _get_best_merging_factor(predictions)
-        Evaluates merging factors and selects the best based on minimum distance.
-    _get_best_distance(predictions, pos_scores, neg_scores)
-        Computes the distance metric between predicted and class score distributions.
-
-    Class Methods
-    -------------
-    MoSS(n, alpha, m)
-        Generates merged score samples modeling class conditional distributions
-        parameterized by mixing proportion alpha and merging factor m.
-
     Examples
     --------
-    >>>
-    >>>
-    >>>
+    >>> from mlquantify.meta import QuaDapt
+    >>> from mlquantify.adjust_counting import ACC
+    >>> from sklearn.ensemble import RandomForestClassifier
+    >>> quadapt_acc = QuaDapt(
+    ...     quantifier=ACC(RandomForestClassifier()),
+    ...     merging_factor=[0.1, 0.5, 1.0],
+    ...     measure='sord'
+    ... )
+    >>> quadapt_acc.fit(X_train, y_train)
+    >>> prevalence = quadapt_acc.predict(X_test)
+
+
     """
 
     _parameter_constraints = {
         "quantifier": [BaseQuantifier],
-        "
+        "merging_factors": "array-like",
         "measure": [Options(["hellinger", "topsoe", "probsymm", "sord"])],
         "random_state": [Options([None, int])],
     }
@@ -664,10 +661,10 @@ class QuaDapt(MetaquantifierMixin, BaseQuantifier):
     def __init__(self,
                  quantifier,
                  measure="topsoe",
-
+                 merging_factors=(0.1, 1.0, 0.2)):
         self.quantifier = quantifier
         self.measure = measure
-        self.
+        self.merging_factors = merging_factors
 
 
     def fit(self, X, y):
@@ -719,7 +716,7 @@ class QuaDapt(MetaquantifierMixin, BaseQuantifier):
 
     def _get_best_merging_factor(self, predictions):
 
-        MF = np.atleast_1d(np.round(self.
+        MF = np.atleast_1d(np.round(self.merging_factors, 2)).astype(float)
 
         distances = []
 
@@ -748,6 +745,33 @@ class QuaDapt(MetaquantifierMixin, BaseQuantifier):
 
     @classmethod
     def MoSS(cls, n, alpha, m):
+        r"""Model for Score Simulation
+
+        MoSS has three key parameters:
+        (I) the number of observations `n`;
+        (II) the class proportion `\alpha`, which defines the prevalence of the positive class;
+        (III) the merging factor :math:`m`, which controls the overlap between positive and negative score distributions
+        (where :math:`m=0` represents easily separable classes and :math:`m=1` represents highly overlapping ones).
+
+        .. math::
+
+            \mathrm{moss}(n, \alpha, \mathfrak{m}) = \mathrm{syn}(\oplus, \lfloor \alpha n \rfloor, \mathfrak{m}) \cup \mathrm{syn}(\ominus , \lfloor (1 - \alpha) n \rfloor, \mathfrak{m})
+
+        Notes
+        -----
+        The MoSS generates only binary scores, simulating positive and negative class scores.
+
+        Examples
+        --------
+        >>> scores = QuaDapt.MoSS(n=1000, alpha=0.3, m=0.5)
+        >>> print(scores.shape)
+        (1000, 3)
+
+        References
+        ----------
+        .. [1] Maletzke, A., Reis, D. dos, Hassan, W., & Batista, G. (2021).
+            Accurately Quantifying under Score Variability. 2021 IEEE International Conference on Data Mining (ICDM), 1228-1233. https://doi.org/10.1109/ICDM51629.2021.00149
+        """
         p_score = np.random.uniform(size=int(n * alpha)) ** m
         n_score = 1 - (np.random.uniform(size=int(round(n * (1 - alpha), 0))) ** m)
         scores = np.column_stack(
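The MoSS docstring added above spells out the score-simulation model: positive-class scores drawn as U^m and negative-class scores as 1 - U^m, with prevalence alpha. As a companion, here is a minimal, self-contained Python sketch of that idea. The function name, the (scores, labels) return layout, and the random-generator handling are illustrative assumptions, not the released QuaDapt.MoSS, which stacks its output into an (n, 3) array.

import numpy as np

def moss_scores(n, alpha, m, seed=None):
    """Simulate binary classifier scores with positive prevalence `alpha` and overlap `m`."""
    rng = np.random.default_rng(seed)
    n_pos = int(n * alpha)
    n_neg = int(round(n * (1 - alpha)))
    # Positive-class scores concentrate near 1 for small m and spread out as m grows.
    pos = rng.uniform(size=n_pos) ** m
    # Negative-class scores mirror the positives toward 0.
    neg = 1 - rng.uniform(size=n_neg) ** m
    scores = np.concatenate([pos, neg])
    labels = np.concatenate([np.ones(n_pos, dtype=int), np.zeros(n_neg, dtype=int)])
    return scores, labels

scores, labels = moss_scores(n=1000, alpha=0.3, m=0.5)
print(scores.shape, labels.mean())  # roughly 30% positives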
mlquantify/metrics/_oq.py
CHANGED
@@ -28,7 +28,7 @@ def process_inputs(prev_pred, prev_real):
 
 
 def NMD(prev_pred, prev_real, distances=None):
-    """
+    r"""
     Compute the Normalized Match Distance (NMD), also known as Earth Mover’s Distance (EMD),
     for ordinal quantification evaluation.
 
@@ -66,7 +66,7 @@ def NMD(prev_pred, prev_real, distances=None):
 
 
 def RNOD(prev_pred, prev_real, distances=None):
-    """
+    r"""
     Compute the Root Normalised Order-aware Divergence (RNOD) for ordinal quantification evaluation.
 
     Parameters
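The hunks above only switch the NMD and RNOD docstrings to raw strings, but for context, a common formulation of the Normalized Match Distance compares the cumulative prevalence distributions of the two vectors. The sketch below assumes unit distances between adjacent (ordered) classes and ignores the `distances` argument of the released function, so treat it as an illustration rather than mlquantify's exact code.

import numpy as np

def nmd_sketch(prev_pred, prev_real):
    prev_pred = np.asarray(prev_pred, dtype=float)
    prev_real = np.asarray(prev_real, dtype=float)
    # Match distance: L1 difference between the two cumulative distributions.
    match_distance = np.abs(np.cumsum(prev_pred) - np.cumsum(prev_real))[:-1].sum()
    # Normalize by the maximum possible distance under unit spacing (n_classes - 1).
    return match_distance / (len(prev_real) - 1)

print(nmd_sketch([0.2, 0.5, 0.3], [0.3, 0.4, 0.3]))  # 0.05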
mlquantify/metrics/_rq.py
CHANGED
@@ -31,7 +31,7 @@ def process_inputs(prev_pred, prev_real):
 
 
 def VSE(prev_pred, prev_real, train_values):
-    """
+    r"""
     Compute the Variance-normalised Squared Error (VSE).
 
     Parameters
@@ -60,7 +60,7 @@ def VSE(prev_pred, prev_real, train_values):
 
 
 def CvM_L1(prev_pred, prev_real, n_bins=100):
-    """
+    r"""
     Compute the L1 version of the Cramér–von Mises statistic (Xiao et al., 2006)
     between two cumulative distributions, as suggested by Bella et al. (2014).
 
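CvM_L1's description (an L1 Cramér–von Mises statistic between two cumulative distributions) admits a simple reading: evaluate the two empirical CDFs on a shared grid and average the absolute differences. The sketch below is one plausible interpretation of the signature CvM_L1(prev_pred, prev_real, n_bins=100); the released implementation may bin or normalize differently.

import numpy as np

def cvm_l1_sketch(y_pred, y_real, n_bins=100):
    y_pred = np.asarray(y_pred, dtype=float)
    y_real = np.asarray(y_real, dtype=float)
    # Common grid spanning the pooled range of both samples.
    grid = np.linspace(min(y_pred.min(), y_real.min()),
                       max(y_pred.max(), y_real.max()), n_bins)
    cdf_pred = np.searchsorted(np.sort(y_pred), grid, side="right") / len(y_pred)
    cdf_real = np.searchsorted(np.sort(y_real), grid, side="right") / len(y_real)
    # L1 (mean absolute) difference between the two empirical CDFs.
    return np.mean(np.abs(cdf_pred - cdf_real))

rng = np.random.default_rng(0)
print(cvm_l1_sketch(rng.normal(0.1, 1, 500), rng.normal(0.0, 1, 500)))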
mlquantify/metrics/_slq.py
CHANGED
@@ -30,7 +30,7 @@ def process_inputs(prev_pred, prev_real):
 
 
 def AE(prev_pred, prev_real):
-    """
+    r"""
     Compute the absolute error for each class or a dictionary of errors if input is a dictionary.
 
     Parameters
@@ -57,7 +57,7 @@ def AE(prev_pred, prev_real):
 
 
 def MAE(prev_pred, prev_real):
-    """
+    r"""
     Compute the mean absolute error between the real and predicted prevalences.
 
     Parameters
@@ -78,7 +78,7 @@ def MAE(prev_pred, prev_real):
 
 
 def KLD(prev_pred, prev_real):
-    """
+    r"""
     Compute the Kullback-Leibler divergence between the real and predicted prevalences.
 
     Parameters
@@ -99,7 +99,7 @@ def KLD(prev_pred, prev_real):
 
 
 def SE(prev_pred, prev_real):
-    """
+    r"""
     Compute the mean squared error between the real and predicted prevalences.
 
     Parameters
@@ -120,7 +120,7 @@ def SE(prev_pred, prev_real):
 
 
 def MSE(prev_pred, prev_real):
-    """ Mean Squared Error
+    r""" Mean Squared Error
 
     Parameters
     ----------
@@ -140,7 +140,7 @@ def MSE(prev_pred, prev_real):
 
 
 def NAE(prev_pred, prev_real):
-    """
+    r"""
     Compute the normalized absolute error between the real and predicted prevalences.
 
     Parameters
@@ -163,7 +163,7 @@ def NAE(prev_pred, prev_real):
 
 
 def NKLD(prev_pred, prev_real):
-    """
+    r"""
     Compute the normalized Kullback-Leibler divergence between the real and predicted prevalences.
 
     Parameters
@@ -186,7 +186,7 @@ def NKLD(prev_pred, prev_real):
 
 
 def RAE(prev_pred, prev_real):
-    """
+    r"""
     Compute the relative absolute error between the real and predicted prevalences.
 
     Parameters
@@ -207,7 +207,7 @@ def RAE(prev_pred, prev_real):
 
 
 def NRAE(prev_pred, prev_real):
-    """
+    r"""
     Compute the normalized relative absolute error between the real and predicted prevalences.
 
     Parameters
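The single-label metrics touched above follow their textbook definitions. Below is a minimal sketch of three of them (per-class AE, MAE, and KLD) written directly against prevalence vectors; the released functions additionally accept dictionary inputs and may smooth prevalences before the KL divergence, which this sketch omits.

import numpy as np

def ae(prev_pred, prev_real):
    # Per-class absolute error between predicted and true prevalences.
    return np.abs(np.asarray(prev_pred, float) - np.asarray(prev_real, float))

def mae(prev_pred, prev_real):
    return ae(prev_pred, prev_real).mean()

def kld(prev_pred, prev_real, eps=1e-12):
    # KL divergence of the predicted prevalences from the true ones.
    p = np.asarray(prev_real, float) + eps
    q = np.asarray(prev_pred, float) + eps
    return float(np.sum(p * np.log(p / q)))

true_prev, pred_prev = [0.7, 0.3], [0.6, 0.4]
print(ae(pred_prev, true_prev), mae(pred_prev, true_prev), kld(pred_prev, true_prev))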
mlquantify/mixture/_base.py
CHANGED
@@ -15,8 +15,7 @@ from mlquantify.mixture._utils import (
 )
 
 class BaseMixture(BaseQuantifier):
-    """
-    Base class for mixture-model quantifiers.
+    r"""Base class for mixture-model quantifiers.
 
     Mixture Models (MM) for quantification estimate class prevalences by modeling
     the test set score distribution as a mixture of the individual class score
@@ -39,7 +38,7 @@ class BaseMixture(BaseQuantifier):
     scores or histograms, and the choice of distance can affect quantification accuracy
     and robustness.
 
-    The DyS framework
+    The DyS framework [3]_ generalizes mixture models by introducing
     a variety of distribution dissimilarity measures, enabling flexible and effective
     quantification methods.
 
@@ -49,11 +48,13 @@ class BaseMixture(BaseQuantifier):
     Mixture models are defined for only binary quantification problems. For multi-class
     problems, a one-vs-rest strategy is applied, training a binary mixture model for
     each class against the rest.
+
 
     Parameters
     ----------
     None directly; subclasses implement fitting and prediction logic.
 
+
     Attributes
     ----------
     _precomputed : bool
@@ -63,19 +64,6 @@ class BaseMixture(BaseQuantifier):
     classes : ndarray of shape (n_classes,)
         Unique class labels seen during training.
 
-    Methods
-    -------
-    fit(X, y, *args, **kwargs):
-        Fit the mixture quantifier with training data. Validates input and
-        calls internal fitting procedure.
-    predict(X, *args, **kwargs):
-        Predict class prevalences for input data by leveraging best mixture parameters.
-    get_best_distance(*args, **kwargs):
-        Return the best distance measure and associated mixture parameters found.
-    best_mixture(X):
-        Abstract method to determine optimal mixture parameters on input data.
-    get_distance(dist_train, dist_test, measure="hellinger"):
-        Compute a specified distance between two distributions.
 
     References
     ----------
@@ -118,6 +106,14 @@ class BaseMixture(BaseQuantifier):
         return self._predict(X, *args, **kwargs)
 
     def get_best_distance(self, *args, **kwargs):
+        r""" Get the best distance value from the mixture fitting process.
+
+        Notes
+        -----
+        If the quantifier has not been fitted yet, it will fit the model for getting the
+        best distance.
+
+        """
         _, best_distance = self.best_mixture(*args, **kwargs)
         return best_distance
 
@@ -128,9 +124,7 @@ class BaseMixture(BaseQuantifier):
 
     @classmethod
     def get_distance(cls, dist_train, dist_test, measure="hellinger"):
-        """
-        Compute distance between two distributions.
-        """
+        r"""Compute distance between two distributions."""
 
         if np.sum(dist_train) < 1e-20 or np.sum(dist_test) < 1e-20:
             raise ValueError("One or both vectors are zero (empty)...")
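BaseMixture.get_distance dispatches on a measure name ('hellinger', 'topsoe', 'probsymm', 'sord'). As an illustration of the kind of computation involved, here is a hedged sketch of the Hellinger distance between two binned score distributions; the exact normalization used by the released code is not visible in this diff, so treat it as a sketch rather than the library's formula.

import numpy as np

def hellinger(dist_train, dist_test):
    # Normalize both histograms to probability vectors.
    p = np.asarray(dist_train, float) / np.sum(dist_train)
    q = np.asarray(dist_test, float) / np.sum(dist_test)
    # Hellinger distance: (1/sqrt(2)) * L2 norm of the difference of sqrt-probabilities.
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)

print(hellinger([10, 30, 60], [20, 30, 50]))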
mlquantify/mixture/_classes.py
CHANGED
@@ -21,8 +21,7 @@ from mlquantify.mixture._utils import (
 # =====================================================
 @define_binary
 class AggregativeMixture(SoftLearnerQMixin, AggregationMixin, BaseMixture):
-    """
-    Base class for Mixture-based Quantification Methods.
+    r"""Base class for Mixture-based Quantification Methods.
 
     These methods assume that the test score distribution is a mixture
     of the positive and negative score distributions from the training data.
@@ -105,7 +104,7 @@ class AggregativeMixture(SoftLearnerQMixin, AggregationMixin, BaseMixture):
 # =====================================================
 
 class DyS(AggregativeMixture):
-    """Distribution y-Similarity (DyS) quantification method.
+    r"""Distribution y-Similarity (DyS) quantification method.
 
     Uses mixture modeling with a dissimilarity measure between distributions
     computed on histograms of classifier scores. This method optimizes mixture
@@ -128,7 +127,9 @@ class DyS(AggregativeMixture):
 
     Examples
     --------
-    >>>
+    >>> from mlquantify.mixture import DyS
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> q = DyS(learner=LogisticRegression(), measure="hellinger")
     >>> q.fit(X_train, y_train)
     >>> prevalences = q.predict(X_test)
     """
@@ -147,6 +148,35 @@ class DyS(AggregativeMixture):
         self.bins_size = np.asarray(bins_size, dtype=int)
 
     def best_mixture(self, predictions, pos_scores, neg_scores):
+        r"""Determine the best mixture parameters for the given data.
+
+        Applies ternary search to find the mixture weight minimizing the distance
+        between the test score histogram and the mixture of positive and negative
+
+        The mixture weight :math:`\alpha` is estimated as:
+        .. math::
+            \alpha = \arg \min_{\alpha \in [0, 1]} D \left( H_{test}, \alpha H_{pos} + (1 - \alpha) H_{neg} \right)
+
+        where :math:`D` is the selected distance measure and :math:`H` denotes histograms.
+
+
+        Parameters
+        ----------
+        predictions : ndarray
+            Classifier scores for the test data.
+        pos_scores : ndarray
+            Classifier scores for the positive class from training data.
+        neg_scores : ndarray
+            Classifier scores for the negative class from training data.
+
+
+        Returns
+        -------
+        alpha : float
+            Estimated mixture weight.
+        best_distance : float
+            Distance corresponding to the best mixture weight.
+        """
 
         prevs = []
         self.distances = []
@@ -175,7 +205,7 @@ class DyS(AggregativeMixture):
 # =====================================================
 
 class HDy(AggregativeMixture):
-    """Hellinger Distance Minimization (HDy) quantification method.
+    r"""Hellinger Distance Minimization (HDy) quantification method.
 
     Estimates class prevalences by finding mixture weights that minimize
     the Hellinger distance between the histogram of test scores and the mixture
@@ -193,6 +223,35 @@ class HDy(AggregativeMixture):
     """
 
     def best_mixture(self, predictions, pos_scores, neg_scores):
+        r"""Determine the best mixture parameters for the given data.
+
+        Compute the mixture weight :math:`\alpha` that minimizes the Hellinger distance between the test score histogram and the mixture of positive and negative class score histograms.
+
+        The mixture weight :math:`\alpha` is estimated as:
+        .. math::
+            \alpha = \arg \min_{\alpha \in [0, 1]} Hellinger \left( H_{test}, \alpha H_{pos} + (1 - \alpha) H_{neg} \right)
+
+        where :math:`H` denotes histograms.
+
+
+        Parameters
+        ----------
+        predictions : ndarray
+            Classifier scores for the test data.
+        pos_scores : ndarray
+            Classifier scores for the positive class from training data.
+        neg_scores : ndarray
+            Classifier scores for the negative class from training data.
+
+
+        Returns
+        -------
+        alpha : float
+            Estimated mixture weight.
+        best_distance : float
+            Distance corresponding to the best mixture weight.
+        """
+
         bins_size = np.arange(10, 110, 11)
         alpha_values = np.round(np.linspace(0, 1, 101), 2)
 
@@ -228,13 +287,12 @@ class SMM(AggregativeMixture):
 
     Estimates class prevalence by matching the mean score of the test samples
     to a convex combination of positive and negative training scores. The mixture
-    weight
+    weight :math:`\alpha` is computed as:
 
-
-
-    \]
+    .. math::
+        \alpha = \frac{\bar{s}_{test} - \bar{s}_{neg}}{\bar{s}_{pos} - \bar{s}_{neg}}
 
-    where
+    where :math:`\bar{s}` denotes the sample mean.
 
     Parameters
     ----------
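The new docstrings above state the two mixture estimators explicitly: SMM's closed-form weight and the HDy/DyS-style minimization of a distance between the test histogram and a mixture of class histograms. The sketch below reproduces both ideas in plain NumPy; helper names, the binning, and the grid search are illustrative assumptions rather than the released implementations (DyS, for instance, uses ternary search rather than a grid).

import numpy as np

def smm_alpha(test_scores, pos_scores, neg_scores):
    # alpha = (mean_test - mean_neg) / (mean_pos - mean_neg), clipped to a valid prevalence.
    alpha = (np.mean(test_scores) - np.mean(neg_scores)) / (np.mean(pos_scores) - np.mean(neg_scores))
    return float(np.clip(alpha, 0.0, 1.0))

def hdy_style_alpha(test_scores, pos_scores, neg_scores, n_bins=10):
    bins = np.linspace(0, 1, n_bins + 1)
    h_test = np.histogram(test_scores, bins=bins, density=True)[0]
    h_pos = np.histogram(pos_scores, bins=bins, density=True)[0]
    h_neg = np.histogram(neg_scores, bins=bins, density=True)[0]
    def hellinger(p, q):
        p, q = p / p.sum(), q / q.sum()
        return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)
    # Evaluate the mixture alpha*H_pos + (1-alpha)*H_neg over a grid of candidate weights.
    alphas = np.linspace(0, 1, 101)
    dists = [hellinger(h_test, a * h_pos + (1 - a) * h_neg) for a in alphas]
    return float(alphas[int(np.argmin(dists))])

rng = np.random.default_rng(0)
pos, neg = rng.beta(4, 2, 500), rng.beta(2, 4, 500)                # synthetic class-conditional scores
test = np.concatenate([rng.beta(4, 2, 300), rng.beta(2, 4, 700)])  # true positive prevalence 0.3
print(smm_alpha(test, pos, neg), hdy_style_alpha(test, pos, neg))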
|