mlquantify 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify/__init__.py +0 -29
- mlquantify/adjust_counting/__init__.py +14 -0
- mlquantify/adjust_counting/_adjustment.py +365 -0
- mlquantify/adjust_counting/_base.py +247 -0
- mlquantify/adjust_counting/_counting.py +145 -0
- mlquantify/adjust_counting/_utils.py +114 -0
- mlquantify/base.py +117 -519
- mlquantify/base_aggregative.py +209 -0
- mlquantify/calibration.py +1 -0
- mlquantify/confidence.py +335 -0
- mlquantify/likelihood/__init__.py +5 -0
- mlquantify/likelihood/_base.py +161 -0
- mlquantify/likelihood/_classes.py +414 -0
- mlquantify/meta/__init__.py +1 -0
- mlquantify/meta/_classes.py +761 -0
- mlquantify/metrics/__init__.py +21 -0
- mlquantify/metrics/_oq.py +109 -0
- mlquantify/metrics/_rq.py +98 -0
- mlquantify/{evaluation/measures.py → metrics/_slq.py} +43 -28
- mlquantify/mixture/__init__.py +7 -0
- mlquantify/mixture/_base.py +153 -0
- mlquantify/mixture/_classes.py +400 -0
- mlquantify/mixture/_utils.py +112 -0
- mlquantify/model_selection/__init__.py +9 -0
- mlquantify/model_selection/_protocol.py +358 -0
- mlquantify/model_selection/_search.py +315 -0
- mlquantify/model_selection/_split.py +1 -0
- mlquantify/multiclass.py +350 -0
- mlquantify/neighbors/__init__.py +9 -0
- mlquantify/neighbors/_base.py +198 -0
- mlquantify/neighbors/_classes.py +159 -0
- mlquantify/{classification/methods.py → neighbors/_classification.py} +48 -66
- mlquantify/neighbors/_kde.py +270 -0
- mlquantify/neighbors/_utils.py +135 -0
- mlquantify/neural/__init__.py +1 -0
- mlquantify/utils/__init__.py +47 -2
- mlquantify/utils/_artificial.py +27 -0
- mlquantify/utils/_constraints.py +219 -0
- mlquantify/utils/_context.py +21 -0
- mlquantify/utils/_decorators.py +36 -0
- mlquantify/utils/_exceptions.py +12 -0
- mlquantify/utils/_get_scores.py +159 -0
- mlquantify/utils/_load.py +18 -0
- mlquantify/utils/_parallel.py +6 -0
- mlquantify/utils/_random.py +36 -0
- mlquantify/utils/_sampling.py +273 -0
- mlquantify/utils/_tags.py +44 -0
- mlquantify/utils/_validation.py +447 -0
- mlquantify/utils/prevalence.py +61 -0
- {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/METADATA +2 -1
- mlquantify-0.1.9.dist-info/RECORD +53 -0
- mlquantify/classification/__init__.py +0 -1
- mlquantify/evaluation/__init__.py +0 -14
- mlquantify/evaluation/protocol.py +0 -291
- mlquantify/methods/__init__.py +0 -37
- mlquantify/methods/aggregative.py +0 -1159
- mlquantify/methods/meta.py +0 -472
- mlquantify/methods/mixture_models.py +0 -1003
- mlquantify/methods/non_aggregative.py +0 -136
- mlquantify/methods/threshold_optimization.py +0 -869
- mlquantify/model_selection.py +0 -377
- mlquantify/plots.py +0 -367
- mlquantify/utils/general.py +0 -371
- mlquantify/utils/method.py +0 -449
- mlquantify-0.1.7.dist-info/RECORD +0 -22
- {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/WHEEL +0 -0
- {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/top_level.txt +0 -0
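Taken together, the listing shows a full restructuring between 0.1.7 and 0.1.9: the monolithic `mlquantify/methods/*` and `mlquantify/evaluation/*` modules are deleted, and their contents move into task-oriented subpackages (`adjust_counting`, `likelihood`, `mixture`, `neighbors`, `metrics`, `model_selection`). As orientation, here is a sketch of the new import surface using only names that actually appear in the diff hunks below; it is not an exhaustive API map.

```python
# Imports visible in the 0.1.9 hunks below (a sketch, not a complete API map).
from mlquantify.adjust_counting import CC, PCC      # counting-based quantifiers
from mlquantify.base_aggregative import (           # learner-type mixins
    SoftLearnerQMixin,
    CrispLearnerQMixin,
)
from mlquantify.utils._constraints import Interval  # parameter-constraint helper
```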
mlquantify/adjust_counting/_counting.py (new file, matching the +145 entry above)

@@ -0,0 +1,145 @@

```python
import numpy as np

from mlquantify.base_aggregative import (
    SoftLearnerQMixin,
    CrispLearnerQMixin
)

from mlquantify.adjust_counting._base import BaseCount
from mlquantify.utils._validation import validate_predictions, validate_prevalences, check_classes_attribute
from mlquantify.utils._constraints import Interval


class CC(CrispLearnerQMixin, BaseCount):
    """Classify and Count (CC) quantifier.

    Implements the Classify and Count method for quantification as described in:

    [1] Forman, G. (2005). *Counting Positives Accurately Despite Inaccurate
        Classification.* ECML, pp. 564-575.
    [2] Forman, G. (2008). *Quantifying Counts and Costs via Classification.*
        Data Mining and Knowledge Discovery, 17(2), 164-206.

    Parameters
    ----------
    learner : estimator, optional
        A supervised learning estimator with fit and predict methods.
        If None, the ``aggregate`` method is expected to be called directly
        on precomputed predictions.
    threshold : float, default=0.5
        Decision threshold for converting predicted probabilities into class
        labels. Must be in the interval [0.0, 1.0].

    Attributes
    ----------
    learner : estimator
        Underlying classification model.
    classes : ndarray of shape (n_classes,)
        Unique class labels observed during training.

    Examples
    --------
    >>> from mlquantify.adjust_counting import CC
    >>> import numpy as np
    >>> from sklearn.linear_model import LogisticRegression
    >>> X = np.random.randn(100, 5)
    >>> y = np.random.randint(0, 2, 100)
    >>> q = CC(learner=LogisticRegression())
    >>> q.fit(X, y)
    >>> q.predict(X)
    {0: 0.47, 1: 0.53}
    >>> q2 = CC()
    >>> predictions = np.random.rand(200)
    >>> q2.aggregate(predictions)
    {0: 0.51, 1: 0.49}
    """

    _parameters_constraints = {
        "threshold": [
            Interval(0.0, 1.0),
            Interval(0, 1, discrete=True),
        ],
    }

    def __init__(self, learner=None, threshold=0.5):
        super().__init__(learner=learner)
        self.threshold = threshold

    def aggregate(self, predictions):
        predictions = validate_predictions(self, predictions)

        # Crisp counting: each class's prevalence is its share of the
        # hard-labelled predictions.
        self.classes_ = check_classes_attribute(self, np.unique(predictions))
        class_counts = np.array([np.count_nonzero(predictions == _class) for _class in self.classes_])
        prevalences = class_counts / len(predictions)

        prevalences = validate_prevalences(self, prevalences, self.classes_)
        return prevalences


class PCC(SoftLearnerQMixin, BaseCount):
    """Probabilistic Classify and Count (PCC) quantifier.

    Implements the Probabilistic Classify and Count method for quantification
    as described in:

    [1] Forman, G. (2005). *Counting Positives Accurately Despite Inaccurate
        Classification.* ECML, pp. 564-575.
    [2] Forman, G. (2008). *Quantifying Counts and Costs via Classification.*
        Data Mining and Knowledge Discovery, 17(2), 164-206.

    Parameters
    ----------
    learner : estimator, optional
        A supervised learning estimator with fit and predict_proba methods.
        If None, the ``aggregate`` method is expected to be called directly
        on precomputed predictions.

    Attributes
    ----------
    learner : estimator
        Underlying classification model.
    classes : ndarray of shape (n_classes,)
        Unique class labels observed during training.

    Examples
    --------
    >>> from mlquantify.adjust_counting import PCC
    >>> import numpy as np
    >>> from sklearn.linear_model import LogisticRegression
    >>> X = np.random.randn(100, 5)
    >>> y = np.random.randint(0, 2, 100)
    >>> q = PCC(learner=LogisticRegression())
    >>> q.fit(X, y)
    >>> q.predict(X)
    {0: 0.48, 1: 0.52}
    >>> q2 = PCC()
    >>> predictions = np.random.rand(200, 2)
    >>> q2.aggregate(predictions)
    {0: 0.50, 1: 0.50}
    """

    def __init__(self, learner=None):
        super().__init__(learner=learner)

    def aggregate(self, predictions):
        predictions = validate_predictions(self, predictions)

        # Handle categorical predictions: a 1D array with a non-numeric dtype
        # holds class labels, not scores. (The released code passed a tuple to
        # np.issubdtype, which raises TypeError; np.number is the working check.)
        if predictions.ndim == 1 and not np.issubdtype(predictions.dtype, np.number):
            self.classes_ = check_classes_attribute(self, np.unique(predictions))
            class_counts = np.array([np.count_nonzero(predictions == _class) for _class in self.classes_])
            prevalences = class_counts / len(predictions)
        else:
            # Handle probability predictions (2D array or 1D probabilities)
            if predictions.ndim == 2:
                self.classes_ = check_classes_attribute(self, np.arange(predictions.shape[1]))
            else:
                self.classes_ = check_classes_attribute(self, np.arange(2))
            prevalences = np.mean(predictions, axis=0) if predictions.ndim == 2 else predictions.mean()
            if predictions.ndim == 1:
                prevalences = np.array([1 - prevalences, prevalences])

        prevalences = validate_prevalences(self, prevalences, self.classes_)
        return prevalences
```
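The two quantifiers differ only in what they average: CC counts hard label assignments, while PCC averages posterior probabilities, which is usually smoother under classifier uncertainty. The numpy-only sketch below illustrates just the two aggregation rules on a hypothetical 1D score array; it does not call mlquantify itself.

```python
import numpy as np

rng = np.random.default_rng(0)
scores = rng.random(200)  # hypothetical positive-class probabilities

# CC-style aggregation: threshold to crisp labels at 0.5, then count shares.
crisp = (scores >= 0.5).astype(int)
cc_prev = {0: float(np.mean(crisp == 0)), 1: float(np.mean(crisp == 1))}

# PCC-style aggregation: average the probabilities themselves,
# mirroring the 1D branch above (prevalences = [1 - p, p]).
p = scores.mean()
pcc_prev = {0: float(1 - p), 1: float(p)}

print(cc_prev, pcc_prev)
```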
mlquantify/adjust_counting/_utils.py (new file, matching the +114 entry above)

@@ -0,0 +1,114 @@

```python
import numpy as np


def compute_table(y, y_pred, classes):
    """
    Compute the confusion matrix table for a binary classification task.

    Parameters
    ----------
    y : np.ndarray
        The true labels.
    y_pred : np.ndarray
        The predicted labels.
    classes : np.ndarray
        The unique classes in the dataset; classes[1] is treated as the
        positive class.

    Returns
    -------
    tuple
        A tuple containing the True Positives, False Positives,
        False Negatives, and True Negatives.
    """
    TP = np.logical_and(y == y_pred, y == classes[1]).sum()
    FP = np.logical_and(y != y_pred, y == classes[0]).sum()
    FN = np.logical_and(y != y_pred, y == classes[1]).sum()
    TN = np.logical_and(y == y_pred, y == classes[0]).sum()
    return TP, FP, FN, TN


def compute_tpr(TP, FN):
    """
    Compute the True Positive Rate (recall) for a binary classification task.

    Parameters
    ----------
    TP : int
        The number of True Positives.
    FN : int
        The number of False Negatives.

    Returns
    -------
    float
        The True Positive Rate (recall), or 0 when there are no positives.
    """
    if TP + FN == 0:
        return 0
    return TP / (TP + FN)


def compute_fpr(FP, TN):
    """
    Compute the False Positive Rate for a binary classification task.

    Parameters
    ----------
    FP : int
        The number of False Positives.
    TN : int
        The number of True Negatives.

    Returns
    -------
    float
        The False Positive Rate, or 0 when there are no negatives.
    """
    if FP + TN == 0:
        return 0
    return FP / (FP + TN)


def evaluate_thresholds(y, probabilities: np.ndarray, classes) -> tuple:
    """
    Evaluate candidate decision thresholds for a binary quantification task.

    Sweeps 101 evenly spaced thresholds over [0, 1] and records the True
    Positive Rate and False Positive Rate obtained at each one.

    Parameters
    ----------
    y : np.ndarray
        The true labels.
    probabilities : np.ndarray
        The predicted probabilities of the positive class.
    classes : np.ndarray
        The unique classes in the dataset.

    Returns
    -------
    tuple
        The evaluated thresholds and the corresponding arrays of True
        Positive Rates and False Positive Rates.
    """
    unique_scores = np.linspace(0, 1, 101)

    tprs = []
    fprs = []

    for threshold in unique_scores:
        y_pred = np.where(probabilities >= threshold, classes[1], classes[0])

        TP, FP, FN, TN = compute_table(y, y_pred, classes)

        tpr = compute_tpr(TP, FN)
        fpr = compute_fpr(FP, TN)

        tprs.append(tpr)
        fprs.append(fpr)

    return (unique_scores, np.asarray(tprs), np.asarray(fprs))
```
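These helpers feed the threshold-based adjustment methods (see `_adjustment.py` in the listing above): sweeping thresholds yields the tpr/fpr curves from which, for example, Forman's adjusted count rescales a raw classify-and-count estimate as `(p_cc - fpr) / (tpr - fpr)`. The sketch below shows one plausible way to consume the sweep; the private-module import path is taken from this hunk and may change between releases, and estimating tpr/fpr and p_cc from the same sample is done here only for brevity.

```python
import numpy as np
from mlquantify.adjust_counting._utils import evaluate_thresholds  # path as in this diff

rng = np.random.default_rng(1)
y = rng.integers(0, 2, 500)              # synthetic binary labels
probs = 0.3 * y + 0.7 * rng.random(500)  # noisy positive-class scores in [0, 1]
classes = np.array([0, 1])

thresholds, tprs, fprs = evaluate_thresholds(y, probs, classes)

# Adjusted Classify & Count at a fixed threshold t = 0.5 (Forman, 2008):
# p_adj = (p_cc - fpr(t)) / (tpr(t) - fpr(t)), clipped to [0, 1].
i = int(np.argmax(thresholds >= 0.5))
p_cc = float(np.mean(probs >= thresholds[i]))
denom = tprs[i] - fprs[i]
p_adj = float(np.clip((p_cc - fprs[i]) / denom, 0.0, 1.0)) if denom != 0 else p_cc
print(f"CC estimate: {p_cc:.3f}, adjusted: {p_adj:.3f}")
```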