mlquantify 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify/__init__.py +10 -29
- mlquantify/adjust_counting/__init__.py +24 -0
- mlquantify/adjust_counting/_adjustment.py +648 -0
- mlquantify/adjust_counting/_base.py +245 -0
- mlquantify/adjust_counting/_counting.py +153 -0
- mlquantify/adjust_counting/_utils.py +109 -0
- mlquantify/base.py +117 -519
- mlquantify/base_aggregative.py +209 -0
- mlquantify/calibration.py +1 -0
- mlquantify/confidence.py +329 -0
- mlquantify/likelihood/__init__.py +5 -0
- mlquantify/likelihood/_base.py +147 -0
- mlquantify/likelihood/_classes.py +430 -0
- mlquantify/meta/__init__.py +1 -0
- mlquantify/meta/_classes.py +785 -0
- mlquantify/metrics/__init__.py +21 -0
- mlquantify/metrics/_oq.py +109 -0
- mlquantify/metrics/_rq.py +98 -0
- mlquantify/{evaluation/measures.py → metrics/_slq.py} +51 -36
- mlquantify/mixture/__init__.py +7 -0
- mlquantify/mixture/_base.py +147 -0
- mlquantify/mixture/_classes.py +458 -0
- mlquantify/mixture/_utils.py +163 -0
- mlquantify/model_selection/__init__.py +9 -0
- mlquantify/model_selection/_protocol.py +358 -0
- mlquantify/model_selection/_search.py +315 -0
- mlquantify/model_selection/_split.py +1 -0
- mlquantify/multiclass.py +350 -0
- mlquantify/neighbors/__init__.py +9 -0
- mlquantify/neighbors/_base.py +168 -0
- mlquantify/neighbors/_classes.py +150 -0
- mlquantify/{classification/methods.py → neighbors/_classification.py} +37 -62
- mlquantify/neighbors/_kde.py +268 -0
- mlquantify/neighbors/_utils.py +131 -0
- mlquantify/neural/__init__.py +1 -0
- mlquantify/utils/__init__.py +47 -2
- mlquantify/utils/_artificial.py +27 -0
- mlquantify/utils/_constraints.py +219 -0
- mlquantify/utils/_context.py +21 -0
- mlquantify/utils/_decorators.py +36 -0
- mlquantify/utils/_exceptions.py +12 -0
- mlquantify/utils/_get_scores.py +159 -0
- mlquantify/utils/_load.py +18 -0
- mlquantify/utils/_parallel.py +6 -0
- mlquantify/utils/_random.py +36 -0
- mlquantify/utils/_sampling.py +273 -0
- mlquantify/utils/_tags.py +44 -0
- mlquantify/utils/_validation.py +447 -0
- mlquantify/utils/prevalence.py +64 -0
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/METADATA +2 -1
- mlquantify-0.1.10.dist-info/RECORD +53 -0
- mlquantify/classification/__init__.py +0 -1
- mlquantify/evaluation/__init__.py +0 -14
- mlquantify/evaluation/protocol.py +0 -289
- mlquantify/methods/__init__.py +0 -37
- mlquantify/methods/aggregative.py +0 -1159
- mlquantify/methods/meta.py +0 -472
- mlquantify/methods/mixture_models.py +0 -1003
- mlquantify/methods/non_aggregative.py +0 -136
- mlquantify/methods/threshold_optimization.py +0 -869
- mlquantify/model_selection.py +0 -377
- mlquantify/plots.py +0 -367
- mlquantify/utils/general.py +0 -371
- mlquantify/utils/method.py +0 -449
- mlquantify-0.1.8.dist-info/RECORD +0 -22
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/WHEEL +0 -0
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
def process_inputs(prev_pred, prev_real):
    """
    .. :noindex:

    Normalize prevalence inputs to NumPy arrays of equal length.

    Dict values are extracted in insertion order; any other sequence
    (list, tuple, ndarray) is converted with ``np.asarray``, which
    generalizes the original list-only conversion. The shorter vector
    is right-padded with zeros so both have the same number of classes.

    Parameters
    ----------
    prev_pred : array-like or dict
        Predicted prevalence values.
    prev_real : array-like or dict
        True prevalence values.

    Returns
    -------
    tuple of (ndarray, ndarray)
        ``(prev_real, prev_pred)`` as equal-length arrays.
    """
    if isinstance(prev_real, dict):
        prev_real = list(prev_real.values())
    if isinstance(prev_pred, dict):
        prev_pred = list(prev_pred.values())
    # np.asarray is a no-copy pass-through for ndarrays and also accepts
    # tuples, which the original isinstance(list) check missed.
    prev_real = np.asarray(prev_real)
    prev_pred = np.asarray(prev_pred)

    # Pad the shorter vector with zeros so lengths match.
    len_real = len(prev_real)
    len_pred = len(prev_pred)
    if len_real > len_pred:
        prev_pred = np.pad(prev_pred, (0, len_real - len_pred), constant_values=0)
    elif len_pred > len_real:
        prev_real = np.pad(prev_real, (0, len_pred - len_real), constant_values=0)

    return prev_real, prev_pred
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def NMD(prev_pred, prev_real, distances=None):
    r"""
    Compute the Normalized Match Distance (NMD), also known as Earth Mover's Distance (EMD),
    for ordinal quantification evaluation.

    Parameters
    ----------
    prev_pred : array-like or dict
        Predicted prevalence values for each ordered class.

    prev_real : array-like or dict
        True prevalence values for each ordered class.

    distances : array-like of shape (n_classes-1,), optional
        Distance between consecutive classes (d(y_i, y_{i+1})).
        If None, all distances are assumed to be 1.

    Returns
    -------
    nmd : float
        Normalized Match Distance between predicted and true prevalences.

    Raises
    ------
    ValueError
        If ``distances`` is given and its length is not ``n_classes - 1``.
    """
    prev_real, prev_pred = process_inputs(prev_pred, prev_real)
    n_classes = len(prev_real)

    # Degenerate single-class case: the distributions are trivially
    # identical; avoids the division by (n_classes - 1) == 0 below.
    if n_classes < 2:
        return 0.0

    if distances is None:
        distances = np.ones(n_classes - 1)
    else:
        distances = np.asarray(distances, dtype=float)
        if len(distances) != n_classes - 1:
            raise ValueError("Length of distances must be n_classes - 1.")

    # EMD over an ordered 1-D support equals the weighted sum of absolute
    # cumulative differences; the last cumulative term is dropped because
    # only n_classes - 1 inter-class gaps exist.
    cum_diffs = np.cumsum(prev_pred - prev_real)
    nmd = np.sum(distances * np.abs(cum_diffs[:-1])) / (n_classes - 1)
    return float(nmd)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def RNOD(prev_pred, prev_real, distances=None):
    r"""
    Compute the Root Normalised Order-aware Divergence (RNOD) for ordinal quantification evaluation.

    Parameters
    ----------
    prev_pred : array-like or dict
        Predicted prevalence values for each ordered class.

    prev_real : array-like or dict
        True prevalence values for each ordered class.

    distances : 2D array-like of shape (n_classes, n_classes), optional
        Distance matrix between classes (d(y_i, y_j)).
        If None, assumes d(y_i, y_j) = |i - j|.

    Returns
    -------
    rnod : float
        Root Normalised Order-aware Divergence between predicted and true prevalences.

    Raises
    ------
    ValueError
        If ``distances`` is given and is not of shape (n_classes, n_classes).
    """
    prev_real, prev_pred = process_inputs(prev_pred, prev_real)
    n_classes = len(prev_real)
    # Y* = classes with non-zero true prevalence; only these contribute.
    Y_star = np.where(prev_real > 0)[0]

    # Default distance: d(y_i, y_j) = |i - j|.
    if distances is None:
        idx = np.arange(n_classes)
        distances = np.abs(idx[:, None] - idx[None, :])
    else:
        distances = np.asarray(distances, dtype=float)
        if distances.shape != (n_classes, n_classes):
            raise ValueError("Distance matrix must be of shape (n_classes, n_classes).")

    diff_sq = (prev_real - prev_pred) ** 2
    # Vectorized equivalent of the original O(n^2) Python double loop:
    # sum over i in Y* and all j of distances[j, i] * diff_sq[j].
    total = float(np.sum(distances[:, Y_star] * diff_sq[:, None]))

    denom = len(Y_star) * (n_classes - 1)
    rnod = np.sqrt(total / denom)
    return float(rnod)
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from scipy.stats import cumfreq
|
|
3
|
+
from mlquantify.metrics._slq import SE
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def process_inputs(prev_pred, prev_real):
    """
    .. :noindex:

    Normalize regression-value inputs to NumPy arrays of equal length.

    Dict values are extracted in insertion order; any other sequence
    (list, tuple, ndarray) is converted with ``np.asarray``, which
    generalizes the original list-only conversion. The shorter vector
    is right-padded with zeros so both have the same length.

    Parameters
    ----------
    prev_pred : array-like or dict
        Predicted values.
    prev_real : array-like or dict
        True values.

    Returns
    -------
    tuple of (ndarray, ndarray)
        ``(prev_real, prev_pred)`` as equal-length arrays.
    """
    if isinstance(prev_real, dict):
        prev_real = list(prev_real.values())
    if isinstance(prev_pred, dict):
        prev_pred = list(prev_pred.values())
    # np.asarray is a no-copy pass-through for ndarrays and also accepts
    # tuples, which the original isinstance(list) check missed.
    prev_real = np.asarray(prev_real)
    prev_pred = np.asarray(prev_pred)

    # Pad the shorter vector with zeros so lengths match.
    len_real = len(prev_real)
    len_pred = len(prev_pred)
    if len_real > len_pred:
        prev_pred = np.pad(prev_pred, (0, len_real - len_pred), constant_values=0)
    elif len_pred > len_real:
        prev_real = np.pad(prev_real, (0, len_pred - len_real), constant_values=0)

    return prev_real, prev_pred
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def VSE(prev_pred, prev_real, train_values):
    r"""
    Compute the Variance-normalised Squared Error (VSE).

    The squared error between predicted and true values is divided by the
    sample variance (``ddof=1``) of the training values.

    Parameters
    ----------
    prev_real : array-like
        True regression values (from test set).

    prev_pred : array-like
        Predicted regression values (from test set).

    train_values : array-like or dict
        True regression values from training set, used to compute variance
        normalization.

    Returns
    -------
    verror : float
        Variance-normalised squared error, or ``nan`` when the training
        variance is zero (normalization undefined).
    """
    prev_real, prev_pred = process_inputs(prev_pred, prev_real)

    values = train_values
    if isinstance(values, dict):
        values = np.asarray(list(values.values()))

    # Zero training variance would make the ratio undefined.
    normalizer = np.var(values, ddof=1)
    if normalizer == 0:
        return np.nan
    return SE(prev_pred, prev_real) / normalizer
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def CvM_L1(prev_pred, prev_real, n_bins=100):
    r"""
    Compute the L1 version of the Cramér–von Mises statistic (Xiao et al., 2006)
    between two cumulative distributions, as suggested by Bella et al. (2014).

    Parameters
    ----------
    prev_real : array-like
        True regression values.

    prev_pred : array-like
        Predicted regression values.

    n_bins : int, optional
        Number of bins used to estimate cumulative distributions (default=100).

    Returns
    -------
    statistic : float
        L1 Cramér–von Mises distance between cumulative distributions.
    """
    prev_real, prev_pred = process_inputs(prev_pred, prev_real)

    # Shared support so both histograms use identical bin edges.
    lo = min(np.min(prev_real), np.min(prev_pred))
    hi = max(np.max(prev_real), np.max(prev_pred))

    cum_real = cumfreq(prev_real, numbins=n_bins, defaultreallimits=(lo, hi))
    cum_pred = cumfreq(prev_pred, numbins=n_bins, defaultreallimits=(lo, hi))

    # Empirical CDFs normalized to [0, 1] by the total count.
    ecdf_real = cum_real.cumcount / cum_real.cumcount[-1]
    ecdf_pred = cum_pred.cumcount / cum_pred.cumcount[-1]

    # Mean absolute difference approximates the L1 integral between CDFs.
    return float(np.mean(np.abs(ecdf_real - ecdf_pred)))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import numpy as np
|
|
2
2
|
|
|
3
|
-
def process_inputs(
|
|
3
|
+
def process_inputs(prev_pred, prev_real):
    """
    .. :noindex:

    Process the input data for internal use.

    Converts dict/list inputs to NumPy arrays and right-pads the shorter
    prevalence vector with zeros so both have the same length.
    """
    if isinstance(prev_real, dict):
        prev_real = np.asarray(list(prev_real.values()))
    if isinstance(prev_pred, dict):
        prev_pred = np.asarray(list(prev_pred.values()))
    # Leftover debug print() calls removed: library code must not write
    # to stdout on every metric evaluation.
    if isinstance(prev_real, list):
        prev_real = np.asarray(prev_real)
    if isinstance(prev_pred, list):
        prev_pred = np.asarray(prev_pred)

    # Pad with zeros if lengths differ
    len_real = len(prev_real)
    len_pred = len(prev_pred)

    if len_real > len_pred:
        prev_pred = np.pad(prev_pred, (0, len_real - len_pred), constant_values=0)
    elif len_pred > len_real:
        prev_real = np.pad(prev_real, (0, len_pred - len_real), constant_values=0)

    return prev_real, prev_pred
|
|
14
30
|
|
|
15
31
|
|
|
16
|
-
def
|
|
17
|
-
"""
|
|
32
|
+
def AE(prev_pred, prev_real):
|
|
33
|
+
r"""
|
|
18
34
|
Compute the absolute error for each class or a dictionary of errors if input is a dictionary.
|
|
19
35
|
|
|
20
36
|
Parameters
|
|
@@ -32,16 +48,16 @@ def absolute_error(prev_real, prev_pred):
|
|
|
32
48
|
"""
|
|
33
49
|
if isinstance(prev_real, dict):
|
|
34
50
|
classes = prev_real.keys()
|
|
35
|
-
prev_real, prev_pred = process_inputs(
|
|
51
|
+
prev_real, prev_pred = process_inputs(prev_pred, prev_real)
|
|
36
52
|
abs_errors = np.abs(prev_pred - prev_real)
|
|
37
53
|
return {class_: float(err) for class_, err in zip(classes, abs_errors)}
|
|
38
|
-
prev_real, prev_pred = process_inputs(
|
|
54
|
+
prev_real, prev_pred = process_inputs(prev_pred, prev_real)
|
|
39
55
|
return np.abs(prev_pred - prev_real)
|
|
40
56
|
|
|
41
57
|
|
|
42
58
|
|
|
43
|
-
def
|
|
44
|
-
"""
|
|
59
|
+
def MAE(prev_pred, prev_real):
|
|
60
|
+
r"""
|
|
45
61
|
Compute the mean absolute error between the real and predicted prevalences.
|
|
46
62
|
|
|
47
63
|
Parameters
|
|
@@ -57,12 +73,12 @@ def mean_absolute_error(prev_real, prev_pred):
|
|
|
57
73
|
error : float
|
|
58
74
|
Mean absolute error across all classes.
|
|
59
75
|
"""
|
|
60
|
-
prev_real, prev_pred = process_inputs(
|
|
61
|
-
return np.mean(
|
|
76
|
+
prev_real, prev_pred = process_inputs(prev_pred, prev_real)
|
|
77
|
+
return np.mean(AE(prev_pred, prev_real))
|
|
62
78
|
|
|
63
79
|
|
|
64
|
-
def
|
|
65
|
-
"""
|
|
80
|
+
def KLD(prev_pred, prev_real):
|
|
81
|
+
r"""
|
|
66
82
|
Compute the Kullback-Leibler divergence between the real and predicted prevalences.
|
|
67
83
|
|
|
68
84
|
Parameters
|
|
@@ -78,12 +94,12 @@ def kullback_leibler_divergence(prev_real, prev_pred):
|
|
|
78
94
|
divergence : array-like of shape (n_classes,)
|
|
79
95
|
Kullback-Leibler divergence for each class.
|
|
80
96
|
"""
|
|
81
|
-
prev_real, prev_pred = process_inputs(
|
|
97
|
+
prev_real, prev_pred = process_inputs(prev_pred, prev_real)
|
|
82
98
|
return prev_real * np.abs(np.log(prev_real / prev_pred))
|
|
83
99
|
|
|
84
100
|
|
|
85
|
-
def
|
|
86
|
-
"""
|
|
101
|
+
def SE(prev_pred, prev_real):
|
|
102
|
+
r"""
|
|
87
103
|
Compute the mean squared error between the real and predicted prevalences.
|
|
88
104
|
|
|
89
105
|
Parameters
|
|
@@ -99,13 +115,12 @@ def squared_error(prev_real, prev_pred):
|
|
|
99
115
|
error : float
|
|
100
116
|
Mean squared error across all classes.
|
|
101
117
|
"""
|
|
102
|
-
prev_real, prev_pred = process_inputs(
|
|
118
|
+
prev_real, prev_pred = process_inputs(prev_pred, prev_real)
|
|
103
119
|
return np.mean((prev_pred - prev_real) ** 2, axis=-1)
|
|
104
120
|
|
|
105
121
|
|
|
106
|
-
def
|
|
107
|
-
"""
|
|
108
|
-
Compute the mean squared error across all classes.
|
|
122
|
+
def MSE(prev_pred, prev_real):
|
|
123
|
+
r""" Mean Squared Error
|
|
109
124
|
|
|
110
125
|
Parameters
|
|
111
126
|
----------
|
|
@@ -120,12 +135,12 @@ def mean_squared_error(prev_real, prev_pred):
|
|
|
120
135
|
mse : float
|
|
121
136
|
Mean squared error across all classes.
|
|
122
137
|
"""
|
|
123
|
-
prev_real, prev_pred = process_inputs(
|
|
124
|
-
return
|
|
138
|
+
prev_real, prev_pred = process_inputs(prev_pred, prev_real)
|
|
139
|
+
return SE(prev_pred, prev_real).mean()
|
|
125
140
|
|
|
126
141
|
|
|
127
|
-
def
|
|
128
|
-
"""
|
|
142
|
+
def NAE(prev_pred, prev_real):
|
|
143
|
+
r"""
|
|
129
144
|
Compute the normalized absolute error between the real and predicted prevalences.
|
|
130
145
|
|
|
131
146
|
Parameters
|
|
@@ -141,14 +156,14 @@ def normalized_absolute_error(prev_real, prev_pred):
|
|
|
141
156
|
error : float
|
|
142
157
|
Normalized absolute error across all classes.
|
|
143
158
|
"""
|
|
144
|
-
prev_real, prev_pred = process_inputs(
|
|
145
|
-
abs_error =
|
|
159
|
+
prev_real, prev_pred = process_inputs(prev_pred, prev_real)
|
|
160
|
+
abs_error = MAE(prev_pred, prev_real)
|
|
146
161
|
z_abs_error = 2 * (1 - np.min(prev_real))
|
|
147
162
|
return abs_error / z_abs_error
|
|
148
163
|
|
|
149
164
|
|
|
150
|
-
def
|
|
151
|
-
"""
|
|
165
|
+
def NKLD(prev_pred, prev_real):
|
|
166
|
+
r"""
|
|
152
167
|
Compute the normalized Kullback-Leibler divergence between the real and predicted prevalences.
|
|
153
168
|
|
|
154
169
|
Parameters
|
|
@@ -164,14 +179,14 @@ def normalized_kullback_leibler_divergence(prev_real, prev_pred):
|
|
|
164
179
|
divergence : float
|
|
165
180
|
Normalized Kullback-Leibler divergence across all classes.
|
|
166
181
|
"""
|
|
167
|
-
prev_real, prev_pred = process_inputs(
|
|
168
|
-
kl_divergence =
|
|
182
|
+
prev_real, prev_pred = process_inputs(prev_pred, prev_real)
|
|
183
|
+
kl_divergence = KLD(prev_pred, prev_real)
|
|
169
184
|
euler = np.exp(kl_divergence)
|
|
170
185
|
return 2 * (euler / (euler + 1)) - 1
|
|
171
186
|
|
|
172
187
|
|
|
173
|
-
def
|
|
174
|
-
"""
|
|
188
|
+
def RAE(prev_pred, prev_real):
|
|
189
|
+
r"""
|
|
175
190
|
Compute the relative absolute error between the real and predicted prevalences.
|
|
176
191
|
|
|
177
192
|
Parameters
|
|
@@ -187,12 +202,12 @@ def relative_absolute_error(prev_real, prev_pred):
|
|
|
187
202
|
error : float
|
|
188
203
|
Relative absolute error across all classes.
|
|
189
204
|
"""
|
|
190
|
-
prev_real, prev_pred = process_inputs(
|
|
191
|
-
return (
|
|
205
|
+
prev_real, prev_pred = process_inputs(prev_pred, prev_real)
|
|
206
|
+
return (MAE(prev_pred, prev_real) / prev_real).mean(axis=-1)
|
|
192
207
|
|
|
193
208
|
|
|
194
|
-
def
|
|
195
|
-
"""
|
|
209
|
+
def NRAE(prev_pred, prev_real):
|
|
210
|
+
r"""
|
|
196
211
|
Compute the normalized relative absolute error between the real and predicted prevalences.
|
|
197
212
|
|
|
198
213
|
Parameters
|
|
@@ -208,8 +223,8 @@ def normalized_relative_absolute_error(prev_real, prev_pred):
|
|
|
208
223
|
error : float
|
|
209
224
|
Normalized relative absolute error across all classes.
|
|
210
225
|
"""
|
|
211
|
-
prev_real, prev_pred = process_inputs(
|
|
212
|
-
relative =
|
|
226
|
+
prev_real, prev_pred = process_inputs(prev_pred, prev_real)
|
|
227
|
+
relative = RAE(prev_pred, prev_real)
|
|
213
228
|
z_relative = (len(prev_real) - 1 + ((1 - np.min(prev_real)) / np.min(prev_real))) / len(prev_real)
|
|
214
229
|
return relative / z_relative
|
|
215
230
|
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from abc import abstractmethod
|
|
3
|
+
|
|
4
|
+
from mlquantify.base import BaseQuantifier
|
|
5
|
+
|
|
6
|
+
from mlquantify.mixture._utils import sqEuclidean
|
|
7
|
+
from mlquantify.utils._decorators import _fit_context
|
|
8
|
+
from mlquantify.utils._validation import validate_y, validate_data
|
|
9
|
+
from mlquantify.multiclass import define_binary
|
|
10
|
+
from mlquantify.mixture._utils import (
|
|
11
|
+
hellinger,
|
|
12
|
+
topsoe,
|
|
13
|
+
probsymm,
|
|
14
|
+
sqEuclidean
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
class BaseMixture(BaseQuantifier):
    r"""Base class for mixture-model quantifiers.

    Mixture Models (MM) for quantification estimate class prevalences by modeling
    the test set score distribution as a mixture of the individual class score
    distributions learned from training data. The goal is to find the mixture
    parameters, i.e., class proportions, that best represent the observed test data.

    Mixture-based quantifiers approximate class-conditional distributions typically
    via histograms or empirical distributions of classifier scores, treating the test
    distribution as a weighted sum (mixture) of these. Estimation proceeds by finding
    the mixture weights that minimize a distance or divergence measure between the
    observed test distribution and the mixture of training class distributions.

    Common distance measures used in evaluating mixtures include:
    - Hellinger distance
    - Topsoe distance (a symmetric Jensen-Shannon type divergence)
    - Probabilistic symmetric divergence
    - Squared Euclidean distance

    These distances compare probability distributions representing class-conditioned
    scores or histograms, and the choice of distance can affect quantification accuracy
    and robustness.

    The DyS framework [3]_ generalizes mixture models by introducing
    a variety of distribution dissimilarity measures, enabling flexible and effective
    quantification methods.


    Notes
    -----
    Mixture models are defined for only binary quantification problems. For multi-class
    problems, a one-vs-rest strategy is applied, training a binary mixture model for
    each class against the rest.


    Parameters
    ----------
    None directly; subclasses implement fitting and prediction logic.


    Attributes
    ----------
    _precomputed : bool
        Indicates if preprocess computations on data have been performed.
    distances : Any
        Stores intermediate or final distance computations used in model selection.
    classes_ : ndarray of shape (n_classes,)
        Unique class labels seen during training (set by :meth:`fit`).


    References
    ----------
    [1] Forman, G. (2005). *Counting Positives Accurately Despite Inaccurate Classification.* ECML, pp. 564-575.
    [2] Forman, G. (2008). *Quantifying Counts and Costs via Classification.* Data Mining and Knowledge Discovery, 17(2), 164-206.
    [3] Maletzke, A., dos Reis, D., Cherman, E., & Batista, G. (2019). *DyS: A Framework for Mixture Models in Quantification.* AAAI Conference on Artificial Intelligence.
    [4] Esuli, A., Moreo, A., & Sebastiani, F. (2023). *Learning to Quantify.* Springer.

    Examples
    --------
    >>> import numpy as np
    >>> class MyMixture(BaseMixture):
    ...     def best_mixture(self, X):
    ...         # Implementation example: estimate mixture weights minimizing Hellinger distance
    ...         pass
    >>> X_train = np.random.rand(100, 10)
    >>> y_train = np.random.randint(0, 2, size=100)
    >>> quantifier = MyMixture()
    >>> quantifier.fit(X_train, y_train)
    >>> prevalences = quantifier.predict(X_train)
    """

    def __init__(self):
        # Flag subclasses may flip once score distributions are precomputed.
        self._precomputed = False
        # Cache for distance computations produced during fitting.
        self.distances = None

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, *args, **kwargs):
        """Fit the quantifier using the provided data and learner.

        Validates ``X`` and ``y``, records the unique labels in
        ``classes_``, then delegates the actual fitting to the subclass
        ``_fit`` hook.

        Returns
        -------
        self : BaseMixture
            The fitted quantifier, to allow method chaining.
        """
        X, y = validate_data(self, X, y)
        validate_y(self, y)
        self.classes_ = np.unique(y)

        # Subclass-specific fitting logic (e.g. building score histograms).
        self._fit(X, y, *args, **kwargs)
        return self

    def predict(self, X, *args, **kwargs):
        """Predict class prevalences for the given data.

        Validates ``X`` and delegates to the subclass ``_predict`` hook.
        """
        X = validate_data(self, X)
        return self._predict(X, *args, **kwargs)

    def get_best_distance(self, *args, **kwargs):
        r"""Get the best distance value from the mixture fitting process.

        Delegates to :meth:`best_mixture` and returns only the distance
        component of its two-element result.

        Notes
        -----
        Depending on the subclass implementation of :meth:`best_mixture`,
        this call may fit the model if it has not been fitted yet.
        """
        _, best_distance = self.best_mixture(*args, **kwargs)
        return best_distance

    @abstractmethod
    def best_mixture(self, X):
        """Determine the best mixture parameters for the given data.

        Must return a two-element result whose second element is the
        best distance value (see :meth:`get_best_distance`).
        """
        pass

    @classmethod
    def get_distance(cls, dist_train, dist_test, measure="hellinger"):
        r"""Compute distance between two distributions.

        Parameters
        ----------
        dist_train : array-like
            First distribution (e.g. mixture of training class histograms).
        dist_test : array-like
            Second distribution (e.g. observed test histogram).
        measure : {"topsoe", "probsymm", "hellinger", "euclidean"}, default="hellinger"
            Which dissimilarity function to apply.

        Returns
        -------
        float
            The selected distance between the two distributions.

        Raises
        ------
        ValueError
            If either vector sums to (near) zero, if the vectors differ in
            length, or if ``measure`` is not one of the supported names.
        """
        if np.sum(dist_train) < 1e-20 or np.sum(dist_test) < 1e-20:
            raise ValueError("One or both vectors are zero (empty)...")
        if len(dist_train) != len(dist_test):
            raise ValueError("Arrays must have the same length.")

        # Floor entries at a tiny positive value so log/ratio-based
        # measures (e.g. topsoe) do not hit zero divisions or log(0).
        dist_train = np.maximum(dist_train, 1e-20)
        dist_test = np.maximum(dist_test, 1e-20)

        if measure == "topsoe":
            return topsoe(dist_train, dist_test)
        elif measure == "probsymm":
            return probsymm(dist_train, dist_test)
        elif measure == "hellinger":
            return hellinger(dist_train, dist_test)
        elif measure == "euclidean":
            return sqEuclidean(dist_train, dist_test)
        else:
            raise ValueError(f"Invalid measure: {measure}")