mlquantify 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. mlquantify/__init__.py +0 -29
  2. mlquantify/adjust_counting/__init__.py +14 -0
  3. mlquantify/adjust_counting/_adjustment.py +365 -0
  4. mlquantify/adjust_counting/_base.py +247 -0
  5. mlquantify/adjust_counting/_counting.py +145 -0
  6. mlquantify/adjust_counting/_utils.py +114 -0
  7. mlquantify/base.py +117 -519
  8. mlquantify/base_aggregative.py +209 -0
  9. mlquantify/calibration.py +1 -0
  10. mlquantify/confidence.py +335 -0
  11. mlquantify/likelihood/__init__.py +5 -0
  12. mlquantify/likelihood/_base.py +161 -0
  13. mlquantify/likelihood/_classes.py +414 -0
  14. mlquantify/meta/__init__.py +1 -0
  15. mlquantify/meta/_classes.py +761 -0
  16. mlquantify/metrics/__init__.py +21 -0
  17. mlquantify/metrics/_oq.py +109 -0
  18. mlquantify/metrics/_rq.py +98 -0
  19. mlquantify/{evaluation/measures.py → metrics/_slq.py} +43 -28
  20. mlquantify/mixture/__init__.py +7 -0
  21. mlquantify/mixture/_base.py +153 -0
  22. mlquantify/mixture/_classes.py +400 -0
  23. mlquantify/mixture/_utils.py +112 -0
  24. mlquantify/model_selection/__init__.py +9 -0
  25. mlquantify/model_selection/_protocol.py +358 -0
  26. mlquantify/model_selection/_search.py +315 -0
  27. mlquantify/model_selection/_split.py +1 -0
  28. mlquantify/multiclass.py +350 -0
  29. mlquantify/neighbors/__init__.py +9 -0
  30. mlquantify/neighbors/_base.py +198 -0
  31. mlquantify/neighbors/_classes.py +159 -0
  32. mlquantify/{classification/methods.py → neighbors/_classification.py} +48 -66
  33. mlquantify/neighbors/_kde.py +270 -0
  34. mlquantify/neighbors/_utils.py +135 -0
  35. mlquantify/neural/__init__.py +1 -0
  36. mlquantify/utils/__init__.py +47 -2
  37. mlquantify/utils/_artificial.py +27 -0
  38. mlquantify/utils/_constraints.py +219 -0
  39. mlquantify/utils/_context.py +21 -0
  40. mlquantify/utils/_decorators.py +36 -0
  41. mlquantify/utils/_exceptions.py +12 -0
  42. mlquantify/utils/_get_scores.py +159 -0
  43. mlquantify/utils/_load.py +18 -0
  44. mlquantify/utils/_parallel.py +6 -0
  45. mlquantify/utils/_random.py +36 -0
  46. mlquantify/utils/_sampling.py +273 -0
  47. mlquantify/utils/_tags.py +44 -0
  48. mlquantify/utils/_validation.py +447 -0
  49. mlquantify/utils/prevalence.py +61 -0
  50. {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/METADATA +2 -1
  51. mlquantify-0.1.9.dist-info/RECORD +53 -0
  52. mlquantify/classification/__init__.py +0 -1
  53. mlquantify/evaluation/__init__.py +0 -14
  54. mlquantify/evaluation/protocol.py +0 -291
  55. mlquantify/methods/__init__.py +0 -37
  56. mlquantify/methods/aggregative.py +0 -1159
  57. mlquantify/methods/meta.py +0 -472
  58. mlquantify/methods/mixture_models.py +0 -1003
  59. mlquantify/methods/non_aggregative.py +0 -136
  60. mlquantify/methods/threshold_optimization.py +0 -869
  61. mlquantify/model_selection.py +0 -377
  62. mlquantify/plots.py +0 -367
  63. mlquantify/utils/general.py +0 -371
  64. mlquantify/utils/method.py +0 -449
  65. mlquantify-0.1.7.dist-info/RECORD +0 -22
  66. {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/WHEEL +0 -0
  67. {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,21 @@
1
+ from ._oq import (
2
+ NMD,
3
+ RNOD,
4
+ )
5
+
6
+ from ._rq import (
7
+ VSE,
8
+ CvM_L1,
9
+ )
10
+
11
+ from ._slq import (
12
+ AE,
13
+ SE,
14
+ MAE,
15
+ MSE,
16
+ KLD,
17
+ RAE,
18
+ NAE,
19
+ NRAE,
20
+ NKLD,
21
+ )
@@ -0,0 +1,109 @@
1
+ import numpy as np
2
+
3
def process_inputs(prev_pred, prev_real):
    """
    .. :noindex:

    Normalize prevalence inputs for internal use.

    Dict inputs contribute their values (in insertion order) and list
    inputs are converted to numpy arrays; the shorter vector is then
    zero-padded so both share the same length.
    """
    def _as_array(values):
        # Dicts and lists become arrays; anything else passes through.
        if isinstance(values, dict):
            return np.asarray(list(values.values()))
        if isinstance(values, list):
            return np.asarray(values)
        return values

    prev_real = _as_array(prev_real)
    prev_pred = _as_array(prev_pred)

    # Zero-pad the shorter vector so entries align class-wise.
    gap = len(prev_real) - len(prev_pred)
    if gap > 0:
        prev_pred = np.pad(prev_pred, (0, gap), constant_values=0)
    elif gap < 0:
        prev_real = np.pad(prev_real, (0, -gap), constant_values=0)

    return prev_real, prev_pred
28
+
29
+
30
def NMD(prev_pred, prev_real, distances=None):
    """
    Compute the Normalized Match Distance (NMD), also known as Earth Mover's Distance (EMD),
    for ordinal quantification evaluation.

    Parameters
    ----------
    prev_real : array-like or dict
        True prevalence values for each ordered class.

    prev_pred : array-like or dict
        Predicted prevalence values for each ordered class.

    distances : array-like of shape (n_classes-1,), optional
        Distance between consecutive classes (d(y_i, y_{i+1})).
        If None, all distances are assumed to be 1.

    Returns
    -------
    nmd : float
        Normalized Match Distance between predicted and true prevalences.
    """
    prev_real, prev_pred = process_inputs(prev_pred, prev_real)
    n_classes = len(prev_real)

    if distances is None:
        gaps = np.ones(n_classes - 1)
    else:
        gaps = np.asarray(distances, dtype=float)
        if len(gaps) != n_classes - 1:
            raise ValueError("Length of distances must be n_classes - 1.")

    # Match distance: cost of moving the cumulative prevalence mismatch
    # across each consecutive-class gap. The final cumulative difference
    # is dropped (it is zero for proper distributions), and dividing by
    # (n_classes - 1) normalizes the worst case to 1.
    mass_mismatch = np.abs(np.cumsum(prev_pred - prev_real))[:-1]
    return float(np.sum(gaps * mass_mismatch) / (n_classes - 1))
66
+
67
+
68
def RNOD(prev_pred, prev_real, distances=None):
    """
    Compute the Root Normalised Order-aware Divergence (RNOD) for ordinal quantification evaluation.

    Parameters
    ----------
    prev_real : array-like or dict
        True prevalence values for each ordered class.

    prev_pred : array-like or dict
        Predicted prevalence values for each ordered class.

    distances : 2D array-like of shape (n_classes, n_classes), optional
        Distance matrix between classes (d(y_i, y_j)).
        If None, assumes d(y_i, y_j) = |i - j|.

    Returns
    -------
    rnod : float
        Root Normalised Order-aware Divergence between predicted and true prevalences.
    """
    prev_real, prev_pred = process_inputs(prev_pred, prev_real)
    n_classes = len(prev_real)
    # Y* = classes with non-zero true prevalence (Sakai's support set).
    Y_star = np.where(prev_real > 0)[0]

    # default distance: |i - j|
    if distances is None:
        idx = np.arange(n_classes)
        distances = np.abs(idx[:, None] - idx[None, :])
    else:
        distances = np.asarray(distances, dtype=float)
        if distances.shape != (n_classes, n_classes):
            raise ValueError("Distance matrix must be of shape (n_classes, n_classes).")

    diff_sq = (prev_real - prev_pred) ** 2
    # Vectorized form of sum_{i in Y*} sum_j d(y_j, y_i) * (p_j - p_hat_j)^2,
    # replacing the O(n^2) Python double loop with a single numpy reduction.
    total = float(np.sum(distances[:, Y_star] * diff_sq[:, None]))

    denom = len(Y_star) * (n_classes - 1)
    rnod = np.sqrt(total / denom)
    return float(rnod)
@@ -0,0 +1,98 @@
1
+ import numpy as np
2
+ from scipy.stats import cumfreq
3
+ from mlquantify.metrics._slq import SE
4
+
5
+
6
def process_inputs(prev_pred, prev_real):
    """
    .. :noindex:

    Normalize value inputs for internal use.

    Dict inputs contribute their values (in insertion order) and list
    inputs are converted to numpy arrays; the shorter vector is then
    zero-padded so both share the same length.
    """
    def _coerce(values):
        # Dicts and lists become arrays; anything else passes through.
        if isinstance(values, dict):
            return np.asarray(list(values.values()))
        if isinstance(values, list):
            return np.asarray(values)
        return values

    prev_real = _coerce(prev_real)
    prev_pred = _coerce(prev_pred)

    # Zero-pad the shorter vector so entries align element-wise.
    gap = len(prev_real) - len(prev_pred)
    if gap > 0:
        prev_pred = np.pad(prev_pred, (0, gap), constant_values=0)
    elif gap < 0:
        prev_real = np.pad(prev_real, (0, -gap), constant_values=0)

    return prev_real, prev_pred
31
+
32
+
33
def VSE(prev_pred, prev_real, train_values):
    """
    Compute the Variance-normalised Squared Error (VSE).

    Parameters
    ----------
    prev_real : array-like
        True regression values (from test set).

    prev_pred : array-like
        Predicted regression values (from test set).

    train_values : array-like
        True regression values from training set, used to compute variance normalization.

    Returns
    -------
    verror : float
        Variance-normalised squared error.
    """
    prev_real, prev_pred = process_inputs(prev_pred, prev_real)

    if isinstance(train_values, dict):
        train_values = np.asarray(list(train_values.values()))

    # Sample variance (ddof=1) of the training targets is the normalizer.
    variance = np.var(train_values, ddof=1)
    if variance == 0:
        # Degenerate (constant) training distribution: VSE is undefined.
        return np.nan

    return SE(prev_pred, prev_real) / variance
60
+
61
+
62
def CvM_L1(prev_pred, prev_real, n_bins=100):
    """
    Compute the L1 version of the Cramér–von Mises statistic (Xiao et al., 2006)
    between two cumulative distributions, as suggested by Bella et al. (2014).

    Parameters
    ----------
    prev_real : array-like
        True regression values.

    prev_pred : array-like
        Predicted regression values.

    n_bins : int, optional
        Number of bins used to estimate cumulative distributions (default=100).

    Returns
    -------
    statistic : float
        L1 Cramér–von Mises distance between cumulative distributions.
    """
    prev_real, prev_pred = process_inputs(prev_pred, prev_real)

    # Shared support so both empirical CDFs are estimated over identical bins.
    lo = min(np.min(prev_real), np.min(prev_pred))
    hi = max(np.max(prev_real), np.max(prev_pred))

    counts_real = cumfreq(prev_real, numbins=n_bins, defaultreallimits=(lo, hi)).cumcount
    counts_pred = cumfreq(prev_pred, numbins=n_bins, defaultreallimits=(lo, hi)).cumcount

    # Normalize cumulative counts into CDFs on [0, 1].
    ecdf_real = counts_real / counts_real[-1]
    ecdf_pred = counts_pred / counts_pred[-1]

    # Mean absolute gap between the CDFs approximates the L1 integral.
    return float(np.mean(np.abs(ecdf_real - ecdf_pred)))
@@ -1,6 +1,6 @@
1
1
  import numpy as np
2
2
 
3
- def process_inputs(prev_real, prev_pred):
3
+ def process_inputs(prev_pred, prev_real):
4
4
  """
5
5
  .. :noindex:
6
6
 
@@ -10,10 +10,26 @@ def process_inputs(prev_real, prev_pred):
10
10
  prev_real = np.asarray(list(prev_real.values()))
11
11
  if isinstance(prev_pred, dict):
12
12
  prev_pred = np.asarray(list(prev_pred.values()))
13
+ if isinstance(prev_real, list):
14
+ print(prev_real)
15
+ prev_real = np.asarray(prev_real)
16
+ if isinstance(prev_pred, list):
17
+ print(prev_pred)
18
+ prev_pred = np.asarray(prev_pred)
19
+
20
+ # Pad with zeros if lengths differ
21
+ len_real = len(prev_real)
22
+ len_pred = len(prev_pred)
23
+
24
+ if len_real > len_pred:
25
+ prev_pred = np.pad(prev_pred, (0, len_real - len_pred), constant_values=0)
26
+ elif len_pred > len_real:
27
+ prev_real = np.pad(prev_real, (0, len_pred - len_real), constant_values=0)
28
+
13
29
  return prev_real, prev_pred
14
30
 
15
31
 
16
- def absolute_error(prev_real, prev_pred):
32
+ def AE(prev_pred, prev_real):
17
33
  """
18
34
  Compute the absolute error for each class or a dictionary of errors if input is a dictionary.
19
35
 
@@ -32,15 +48,15 @@ def absolute_error(prev_real, prev_pred):
32
48
  """
33
49
  if isinstance(prev_real, dict):
34
50
  classes = prev_real.keys()
35
- prev_real, prev_pred = process_inputs(prev_real, prev_pred)
51
+ prev_real, prev_pred = process_inputs(prev_pred, prev_real)
36
52
  abs_errors = np.abs(prev_pred - prev_real)
37
53
  return {class_: float(err) for class_, err in zip(classes, abs_errors)}
38
- prev_real, prev_pred = process_inputs(prev_real, prev_pred)
54
+ prev_real, prev_pred = process_inputs(prev_pred, prev_real)
39
55
  return np.abs(prev_pred - prev_real)
40
56
 
41
57
 
42
58
 
43
- def mean_absolute_error(prev_real, prev_pred):
59
+ def MAE(prev_pred, prev_real):
44
60
  """
45
61
  Compute the mean absolute error between the real and predicted prevalences.
46
62
 
@@ -57,11 +73,11 @@ def mean_absolute_error(prev_real, prev_pred):
57
73
  error : float
58
74
  Mean absolute error across all classes.
59
75
  """
60
- prev_real, prev_pred = process_inputs(prev_real, prev_pred)
61
- return np.mean(absolute_error(prev_real, prev_pred))
76
+ prev_real, prev_pred = process_inputs(prev_pred, prev_real)
77
+ return np.mean(AE(prev_pred, prev_real))
62
78
 
63
79
 
64
- def kullback_leibler_divergence(prev_real, prev_pred):
80
+ def KLD(prev_pred, prev_real):
65
81
  """
66
82
  Compute the Kullback-Leibler divergence between the real and predicted prevalences.
67
83
 
@@ -78,11 +94,11 @@ def kullback_leibler_divergence(prev_real, prev_pred):
78
94
  divergence : array-like of shape (n_classes,)
79
95
  Kullback-Leibler divergence for each class.
80
96
  """
81
- prev_real, prev_pred = process_inputs(prev_real, prev_pred)
97
+ prev_real, prev_pred = process_inputs(prev_pred, prev_real)
82
98
  return prev_real * np.abs(np.log(prev_real / prev_pred))
83
99
 
84
100
 
85
- def squared_error(prev_real, prev_pred):
101
+ def SE(prev_pred, prev_real):
86
102
  """
87
103
  Compute the mean squared error between the real and predicted prevalences.
88
104
 
@@ -99,13 +115,12 @@ def squared_error(prev_real, prev_pred):
99
115
  error : float
100
116
  Mean squared error across all classes.
101
117
  """
102
- prev_real, prev_pred = process_inputs(prev_real, prev_pred)
118
+ prev_real, prev_pred = process_inputs(prev_pred, prev_real)
103
119
  return np.mean((prev_pred - prev_real) ** 2, axis=-1)
104
120
 
105
121
 
106
- def mean_squared_error(prev_real, prev_pred):
107
- """
108
- Compute the mean squared error across all classes.
122
+ def MSE(prev_pred, prev_real):
123
+ """ Mean Squared Error
109
124
 
110
125
  Parameters
111
126
  ----------
@@ -120,11 +135,11 @@ def mean_squared_error(prev_real, prev_pred):
120
135
  mse : float
121
136
  Mean squared error across all classes.
122
137
  """
123
- prev_real, prev_pred = process_inputs(prev_real, prev_pred)
124
- return squared_error(prev_real, prev_pred).mean()
138
+ prev_real, prev_pred = process_inputs(prev_pred, prev_real)
139
+ return SE(prev_pred, prev_real).mean()
125
140
 
126
141
 
127
- def normalized_absolute_error(prev_real, prev_pred):
142
+ def NAE(prev_pred, prev_real):
128
143
  """
129
144
  Compute the normalized absolute error between the real and predicted prevalences.
130
145
 
@@ -141,13 +156,13 @@ def normalized_absolute_error(prev_real, prev_pred):
141
156
  error : float
142
157
  Normalized absolute error across all classes.
143
158
  """
144
- prev_real, prev_pred = process_inputs(prev_real, prev_pred)
145
- abs_error = mean_absolute_error(prev_real, prev_pred)
159
+ prev_real, prev_pred = process_inputs(prev_pred, prev_real)
160
+ abs_error = MAE(prev_pred, prev_real)
146
161
  z_abs_error = 2 * (1 - np.min(prev_real))
147
162
  return abs_error / z_abs_error
148
163
 
149
164
 
150
- def normalized_kullback_leibler_divergence(prev_real, prev_pred):
165
+ def NKLD(prev_pred, prev_real):
151
166
  """
152
167
  Compute the normalized Kullback-Leibler divergence between the real and predicted prevalences.
153
168
 
@@ -164,13 +179,13 @@ def normalized_kullback_leibler_divergence(prev_real, prev_pred):
164
179
  divergence : float
165
180
  Normalized Kullback-Leibler divergence across all classes.
166
181
  """
167
- prev_real, prev_pred = process_inputs(prev_real, prev_pred)
168
- kl_divergence = kullback_leibler_divergence(prev_real, prev_pred)
182
+ prev_real, prev_pred = process_inputs(prev_pred, prev_real)
183
+ kl_divergence = KLD(prev_pred, prev_real)
169
184
  euler = np.exp(kl_divergence)
170
185
  return 2 * (euler / (euler + 1)) - 1
171
186
 
172
187
 
173
- def relative_absolute_error(prev_real, prev_pred):
188
+ def RAE(prev_pred, prev_real):
174
189
  """
175
190
  Compute the relative absolute error between the real and predicted prevalences.
176
191
 
@@ -187,11 +202,11 @@ def relative_absolute_error(prev_real, prev_pred):
187
202
  error : float
188
203
  Relative absolute error across all classes.
189
204
  """
190
- prev_real, prev_pred = process_inputs(prev_real, prev_pred)
191
- return (mean_absolute_error(prev_real, prev_pred) / prev_real).mean(axis=-1)
205
+ prev_real, prev_pred = process_inputs(prev_pred, prev_real)
206
+ return (MAE(prev_pred, prev_real) / prev_real).mean(axis=-1)
192
207
 
193
208
 
194
- def normalized_relative_absolute_error(prev_real, prev_pred):
209
+ def NRAE(prev_pred, prev_real):
195
210
  """
196
211
  Compute the normalized relative absolute error between the real and predicted prevalences.
197
212
 
@@ -208,8 +223,8 @@ def normalized_relative_absolute_error(prev_real, prev_pred):
208
223
  error : float
209
224
  Normalized relative absolute error across all classes.
210
225
  """
211
- prev_real, prev_pred = process_inputs(prev_real, prev_pred)
212
- relative = relative_absolute_error(prev_real, prev_pred)
226
+ prev_real, prev_pred = process_inputs(prev_pred, prev_real)
227
+ relative = RAE(prev_pred, prev_real)
213
228
  z_relative = (len(prev_real) - 1 + ((1 - np.min(prev_real)) / np.min(prev_real))) / len(prev_real)
214
229
  return relative / z_relative
215
230
 
@@ -0,0 +1,7 @@
1
+ from ._classes import (
2
+ HDy,
3
+ DyS,
4
+ SMM,
5
+ SORD,
6
+ HDx
7
+ )
@@ -0,0 +1,153 @@
1
+ import numpy as np
2
+ from abc import abstractmethod
3
+
4
+ from mlquantify.base import BaseQuantifier
5
+
6
+ from mlquantify.mixture._utils import sqEuclidean
7
+ from mlquantify.utils._decorators import _fit_context
8
+ from mlquantify.utils._validation import validate_y, validate_data
9
+ from mlquantify.multiclass import define_binary
10
+ from mlquantify.mixture._utils import (
11
+ hellinger,
12
+ topsoe,
13
+ probsymm,
14
+ sqEuclidean
15
+ )
16
+
17
class BaseMixture(BaseQuantifier):
    """
    Base class for mixture-model quantifiers.

    Mixture Models (MM) for quantification estimate class prevalences by modeling
    the test set score distribution as a mixture of the individual class score
    distributions learned from training data. The goal is to find the mixture
    parameters, i.e., class proportions, that best represent the observed test data.

    Mixture-based quantifiers approximate class-conditional distributions typically
    via histograms or empirical distributions of classifier scores, treating the test
    distribution as a weighted sum (mixture) of these. Estimation proceeds by finding
    the mixture weights that minimize a distance or divergence measure between the
    observed test distribution and the mixture of training class distributions.

    Common distance measures used in evaluating mixtures include:
    - Hellinger distance
    - Topsoe distance (a symmetric Jensen-Shannon type divergence)
    - Probabilistic symmetric divergence
    - Squared Euclidean distance

    These distances compare probability distributions representing class-conditioned
    scores or histograms, and the choice of distance can affect quantification accuracy
    and robustness.

    The DyS framework (Maletzke et al. 2019) generalizes mixture models by introducing
    a variety of distribution dissimilarity measures, enabling flexible and effective
    quantification methods.


    Notes
    -----
    Mixture models are defined for only binary quantification problems. For multi-class
    problems, a one-vs-rest strategy is applied, training a binary mixture model for
    each class against the rest.

    Subclasses must implement ``best_mixture`` and are expected to provide the
    ``_fit`` and ``_predict`` hooks invoked by :meth:`fit` and :meth:`predict`.

    Parameters
    ----------
    None directly; subclasses implement fitting and prediction logic.

    Attributes
    ----------
    _precomputed : bool
        Indicates if preprocess computations on data have been performed.
    distances : Any
        Stores intermediate or final distance computations used in model selection.
    classes_ : ndarray of shape (n_classes,)
        Unique class labels seen during training (set by :meth:`fit`).

    Methods
    -------
    fit(X, y, *args, **kwargs):
        Fit the mixture quantifier with training data. Validates input and
        calls internal fitting procedure.
    predict(X, *args, **kwargs):
        Predict class prevalences for input data by leveraging best mixture parameters.
    get_best_distance(*args, **kwargs):
        Return the best distance measure and associated mixture parameters found.
    best_mixture(X):
        Abstract method to determine optimal mixture parameters on input data.
    get_distance(dist_train, dist_test, measure="hellinger"):
        Compute a specified distance between two distributions.

    References
    ----------
    [1] Forman, G. (2005). *Counting Positives Accurately Despite Inaccurate Classification.* ECML, pp. 564-575.
    [2] Forman, G. (2008). *Quantifying Counts and Costs via Classification.* Data Mining and Knowledge Discovery, 17(2), 164-206.
    [3] Maletzke, A., dos Reis, D., Cherman, E., & Batista, G. (2019). *DyS: A Framework for Mixture Models in Quantification.* AAAI Conference on Artificial Intelligence.
    [4] Esuli, A., Moreo, A., & Sebastiani, F. (2023). *Learning to Quantify.* Springer.

    Examples
    --------
    >>> import numpy as np
    >>> class MyMixture(BaseMixture):
    ...     def best_mixture(self, X):
    ...         # Implementation example: estimate mixture weights minimizing Hellinger distance
    ...         pass
    >>> X_train = np.random.rand(100, 10)
    >>> y_train = np.random.randint(0, 2, size=100)
    >>> quantifier = MyMixture()
    >>> quantifier.fit(X_train, y_train)
    >>> prevalences = quantifier.predict(X_train)
    """

    def __init__(self):
        # No distance computations cached yet; subclasses flip/populate these.
        self._precomputed = False
        self.distances = None

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, *args, **kwargs):
        """Fit the quantifier using the provided data and learner.

        Validates ``X``/``y``, records the unique labels in ``classes_``,
        then delegates to the subclass-implemented ``_fit``. Returns ``self``.
        """
        X, y = validate_data(self, X, y)
        validate_y(self, y)
        self.classes_ = np.unique(y)

        self._fit(X, y, *args, **kwargs)
        return self

    def predict(self, X, *args, **kwargs):
        """Predict class prevalences for the given data.

        Validates ``X`` and delegates to the subclass-implemented ``_predict``.
        """
        X = validate_data(self, X)
        return self._predict(X, *args, **kwargs)

    def get_best_distance(self, *args, **kwargs):
        # Convenience accessor: forwards to best_mixture and discards the
        # mixture parameters, returning only the best distance value.
        _, best_distance = self.best_mixture(*args, **kwargs)
        return best_distance

    @abstractmethod
    def best_mixture(self, X):
        """Determine the best mixture parameters for the given data.

        Must return a 2-tuple whose second element is the best distance
        (see :meth:`get_best_distance`).
        """
        pass

    @classmethod
    def get_distance(cls, dist_train, dist_test, measure="hellinger"):
        """
        Compute distance between two distributions.

        Parameters
        ----------
        dist_train, dist_test : array-like
            Two distributions of equal length; must not be (near-)zero vectors.
        measure : {"topsoe", "probsymm", "hellinger", "euclidean"}, default="hellinger"
            Which distance function from ``mlquantify.mixture._utils`` to apply.

        Returns
        -------
        float
            The computed distance.

        Raises
        ------
        ValueError
            If either vector sums to (near) zero, lengths differ, or
            ``measure`` is not one of the supported names.
        """

        if np.sum(dist_train) < 1e-20 or np.sum(dist_test) < 1e-20:
            raise ValueError("One or both vectors are zero (empty)...")
        if len(dist_train) != len(dist_test):
            raise ValueError("Arrays must have the same length.")

        # Clamp away exact zeros so divergence measures that take logs or
        # ratios stay finite.
        dist_train = np.maximum(dist_train, 1e-20)
        dist_test = np.maximum(dist_test, 1e-20)

        if measure == "topsoe":
            return topsoe(dist_train, dist_test)
        elif measure == "probsymm":
            return probsymm(dist_train, dist_test)
        elif measure == "hellinger":
            return hellinger(dist_train, dist_test)
        elif measure == "euclidean":
            return sqEuclidean(dist_train, dist_test)
        else:
            raise ValueError(f"Invalid measure: {measure}")
153
+