mlquantify-0.1.7-py3-none-any.whl → mlquantify-0.1.9-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Files changed (67)
  1. mlquantify/__init__.py +0 -29
  2. mlquantify/adjust_counting/__init__.py +14 -0
  3. mlquantify/adjust_counting/_adjustment.py +365 -0
  4. mlquantify/adjust_counting/_base.py +247 -0
  5. mlquantify/adjust_counting/_counting.py +145 -0
  6. mlquantify/adjust_counting/_utils.py +114 -0
  7. mlquantify/base.py +117 -519
  8. mlquantify/base_aggregative.py +209 -0
  9. mlquantify/calibration.py +1 -0
  10. mlquantify/confidence.py +335 -0
  11. mlquantify/likelihood/__init__.py +5 -0
  12. mlquantify/likelihood/_base.py +161 -0
  13. mlquantify/likelihood/_classes.py +414 -0
  14. mlquantify/meta/__init__.py +1 -0
  15. mlquantify/meta/_classes.py +761 -0
  16. mlquantify/metrics/__init__.py +21 -0
  17. mlquantify/metrics/_oq.py +109 -0
  18. mlquantify/metrics/_rq.py +98 -0
  19. mlquantify/{evaluation/measures.py → metrics/_slq.py} +43 -28
  20. mlquantify/mixture/__init__.py +7 -0
  21. mlquantify/mixture/_base.py +153 -0
  22. mlquantify/mixture/_classes.py +400 -0
  23. mlquantify/mixture/_utils.py +112 -0
  24. mlquantify/model_selection/__init__.py +9 -0
  25. mlquantify/model_selection/_protocol.py +358 -0
  26. mlquantify/model_selection/_search.py +315 -0
  27. mlquantify/model_selection/_split.py +1 -0
  28. mlquantify/multiclass.py +350 -0
  29. mlquantify/neighbors/__init__.py +9 -0
  30. mlquantify/neighbors/_base.py +198 -0
  31. mlquantify/neighbors/_classes.py +159 -0
  32. mlquantify/{classification/methods.py → neighbors/_classification.py} +48 -66
  33. mlquantify/neighbors/_kde.py +270 -0
  34. mlquantify/neighbors/_utils.py +135 -0
  35. mlquantify/neural/__init__.py +1 -0
  36. mlquantify/utils/__init__.py +47 -2
  37. mlquantify/utils/_artificial.py +27 -0
  38. mlquantify/utils/_constraints.py +219 -0
  39. mlquantify/utils/_context.py +21 -0
  40. mlquantify/utils/_decorators.py +36 -0
  41. mlquantify/utils/_exceptions.py +12 -0
  42. mlquantify/utils/_get_scores.py +159 -0
  43. mlquantify/utils/_load.py +18 -0
  44. mlquantify/utils/_parallel.py +6 -0
  45. mlquantify/utils/_random.py +36 -0
  46. mlquantify/utils/_sampling.py +273 -0
  47. mlquantify/utils/_tags.py +44 -0
  48. mlquantify/utils/_validation.py +447 -0
  49. mlquantify/utils/prevalence.py +61 -0
  50. {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/METADATA +2 -1
  51. mlquantify-0.1.9.dist-info/RECORD +53 -0
  52. mlquantify/classification/__init__.py +0 -1
  53. mlquantify/evaluation/__init__.py +0 -14
  54. mlquantify/evaluation/protocol.py +0 -291
  55. mlquantify/methods/__init__.py +0 -37
  56. mlquantify/methods/aggregative.py +0 -1159
  57. mlquantify/methods/meta.py +0 -472
  58. mlquantify/methods/mixture_models.py +0 -1003
  59. mlquantify/methods/non_aggregative.py +0 -136
  60. mlquantify/methods/threshold_optimization.py +0 -869
  61. mlquantify/model_selection.py +0 -377
  62. mlquantify/plots.py +0 -367
  63. mlquantify/utils/general.py +0 -371
  64. mlquantify/utils/method.py +0 -449
  65. mlquantify-0.1.7.dist-info/RECORD +0 -22
  66. {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/WHEEL +0 -0
  67. {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/top_level.txt +0 -0
mlquantify/mixture/_classes.py (new file)
@@ -0,0 +1,400 @@
+ import numpy as np
+ from abc import abstractmethod
+
+ from mlquantify.base import BaseQuantifier
+ from mlquantify.base_aggregative import AggregationMixin, SoftLearnerQMixin, _get_learner_function
+ from mlquantify.mixture._base import BaseMixture
+ from mlquantify.multiclass import define_binary
+ from mlquantify.utils._constraints import Interval, Options
+ from mlquantify.utils._decorators import _fit_context
+ from mlquantify.utils._get_scores import apply_cross_validation
+ from mlquantify.utils._validation import check_classes_attribute, validate_predictions, validate_prevalences, validate_y
+ from mlquantify.mixture._utils import (
+     getHist,
+     ternary_search,
+ )
+
+
+
+ # =====================================================
+ # Base class
+ # =====================================================
+ @define_binary
+ class AggregativeMixture(SoftLearnerQMixin, AggregationMixin, BaseMixture):
+     """
+     Base class for Mixture-based Quantification Methods.
+
+     These methods assume that the test score distribution is a mixture
+     of the positive and negative score distributions from the training data.
+     """
+
+     _parameter_constraints = {
+         "strategy": [Options(["ovr", "ovo"])]
+     }
+
+     def __init__(self, learner = None, strategy="ovr"):
+         super().__init__()
+         self.learner = learner
+         self.pos_scores = None
+         self.neg_scores = None
+         self.distances = None
+         self.strategy = strategy
+
+     def _fit(self, X, y, learner_fitted=False, *args, **kwargs):
+         learner_function = _get_learner_function(self)
+
+         if learner_fitted:
+             train_predictions = getattr(self.learner, learner_function)(X)
+             train_y_values = y
+         else:
+             train_predictions, train_y_values = apply_cross_validation(
+                 self.learner,
+                 X,
+                 y,
+                 function= learner_function,
+                 cv= 5,
+                 stratified= True,
+                 random_state= None,
+                 shuffle= True
+             )
+
+         self.train_predictions = train_predictions
+         self.train_y_values = train_y_values
+
+         self._precompute_training(train_predictions, train_y_values)
+         return self
+
+     def _precompute_training(self, train_predictions, train_y_values):
+         """
+         Fit learner and store score distributions for positive and negative classes.
+         """
+         # Store scores for positive and negative classes
+         self.pos_scores = train_predictions[train_y_values == self.classes_[1], 1]
+         self.neg_scores = train_predictions[train_y_values == self.classes_[0], 1]
+         self._precomputed = True
+         return self
+
+     def _predict(self, X):
+         """Predict class prevalences for the given data."""
+         predictions = getattr(self.learner, _get_learner_function(self))(X)
+         prevalences = self.aggregate(predictions, self.train_predictions, self.train_y_values)
+
+         return prevalences
+
+     def aggregate(self, predictions, train_predictions, train_y_values):
+         predictions = validate_predictions(self, predictions)
+         self.classes_ = check_classes_attribute(self, np.unique(train_y_values))
+
+         if not self._precomputed:
+             self._precompute_training(train_predictions, train_y_values)
+             self._precomputed = True
+
+         pos_test_scores = predictions[:, 1]
+
+         best_alpha, _ = self.best_mixture(pos_test_scores, self.pos_scores, self.neg_scores)
+         prevalence = np.array([1 - best_alpha, best_alpha])
+         prevalence = validate_prevalences(self, prevalence, self.classes_)
+         return prevalence
+
+     @abstractmethod
+     def best_mixture(self, predictions, pos_scores, neg_scores):
+         ...
+
+ # =====================================================
+ # DyS
+ # =====================================================
+
+ class DyS(AggregativeMixture):
+     """Distribution y-Similarity (DyS) quantification method.
+
+     Uses mixture modeling with a dissimilarity measure between distributions
+     computed on histograms of classifier scores. This method optimizes mixture
+     weights by minimizing a chosen distance measure: Hellinger, Topsoe, or ProbSymm.
+
+     Parameters
+     ----------
+     learner : estimator, optional
+         Base probabilistic classifier.
+     measure : {'hellinger', 'topsoe', 'probsymm'}, default='topsoe'
+         Distance function to minimize.
+     bins_size : array-like or None
+         Histogram bin sizes to try for score representation. Defaults to a set of
+         bin sizes between 2 and 30.
+
+     References
+     ----------
+     [1] Maletzke et al. (2019). DyS: A Framework for Mixture Models in Quantification. AAAI 2019.
+     [2] Esuli et al. (2023). Learning to Quantify. Springer.
+
+     Examples
+     --------
+     >>> q = DyS(learner=my_learner, measure="hellinger")
+     >>> q.fit(X_train, y_train)
+     >>> prevalences = q.predict(X_test)
+     """
+
+     _parameter_constraints = {
+         "measure": [Options(["hellinger", "topsoe", "probsymm"])],
+         "bins_size": ["array-like", None]
+     }
+
+     def __init__(self, learner=None, measure="topsoe", bins_size=None):
+         super().__init__(learner)
+         if bins_size is None:
+             bins_size = np.append(np.linspace(2, 20, 10), 30)
+
+         self.measure = measure
+         self.bins_size = np.asarray(bins_size, dtype=int)
+
+     def best_mixture(self, predictions, pos_scores, neg_scores):
+
+         prevs = []
+         self.distances = []
+         for bins in self.bins_size:
+             pos = getHist(pos_scores, bins)
+             neg = getHist(neg_scores, bins)
+             test = getHist(predictions, bins)
+
+             def f(alpha):
+                 mix = self._mix(pos, neg, alpha)
+                 return BaseMixture.get_distance(mix, test, measure=self.measure)
+
+             alpha = ternary_search(0, 1, f)
+             prevs.append(alpha)
+             self.distances.append(f(alpha))
+         alpha = np.median(prevs)
+         best_distance = np.median(self.distances)
+         return alpha, best_distance
+
+     def _mix(self, pos_hist, neg_hist, alpha):
+         return alpha * pos_hist + (1 - alpha) * neg_hist
+
+
+ # =====================================================
+ # HDy
+ # =====================================================
+
+ class HDy(AggregativeMixture):
+     """Hellinger Distance Minimization (HDy) quantification method.
+
+     Estimates class prevalences by finding mixture weights that minimize
+     the Hellinger distance between the histogram of test scores and the mixture
+     of positive and negative class score histograms, evaluated over multiple bin sizes.
+
+     Parameters
+     ----------
+     learner : estimator, optional
+         Base probabilistic classifier.
+
+     References
+     ----------
+     [2] Esuli et al. (2023). Learning to Quantify. Springer.
+
+     """
+
+     def best_mixture(self, predictions, pos_scores, neg_scores):
+         bins_size = np.arange(10, 110, 11)
+         alpha_values = np.round(np.linspace(0, 1, 101), 2)
+
+         alphas, self.distances = [], []
+         for bins in bins_size:
+             pos = getHist(pos_scores, bins)
+             neg = getHist(neg_scores, bins)
+             test = getHist(predictions, bins)
+             dists = []
+             for a in alpha_values:
+                 mix = self._mix(pos, neg, a)
+                 dists.append(BaseMixture.get_distance(mix, test, measure="hellinger"))
+             a = alpha_values[np.argmin(dists)]
+             alphas.append(a)
+             self.distances.append(np.min(dists))
+
+         best_alpha = np.median(alphas)
+         best_distance = np.median(self.distances)
+
+         return best_alpha, best_distance
+
+     def _mix(self, pos_hist, neg_hist, alpha):
+         return alpha * pos_hist + (1 - alpha) * neg_hist
+
+
+
+ # =====================================================
+ # SMM
+ # =====================================================
+
+ class SMM(AggregativeMixture):
+     r"""Sample Mean Matching (SMM) quantification method.
+
+     Estimates class prevalence by matching the mean score of the test samples
+     to a convex combination of positive and negative training scores. The mixture
+     weight \( \alpha \) is computed as:
+
+     \[
+     \alpha = \frac{\bar{s}_{test} - \bar{s}_{neg}}{\bar{s}_{pos} - \bar{s}_{neg}}
+     \]
+
+     where \( \bar{s} \) denotes the sample mean.
+
+     Parameters
+     ----------
+     learner : estimator, optional
+         Base probabilistic classifier.
+
+     References
+     ----------
+     [2] Esuli et al. (2023). Learning to Quantify. Springer.
+     """
+
+     def best_mixture(self, predictions, pos_scores, neg_scores):
+         mean_pos = np.mean(pos_scores)
+         mean_neg = np.mean(neg_scores)
+         mean_test = np.mean(predictions)
+
+         alpha = (mean_test - mean_neg) / (mean_pos - mean_neg)
+         return alpha, None
+
+
+ # =====================================================
+ # SORD
+ # =====================================================
+
+ class SORD(AggregativeMixture):
+     """Sample Ordinal Distance (SORD) quantification method.
+
+     Estimates prevalence by minimizing the weighted sum of absolute score differences
+     between test data and training classes. The method creates weighted score
+     vectors for positive, negative, and test samples, sorts them, and computes
+     a cumulative absolute difference as the distance measure.
+
+     Parameters
+     ----------
+     learner : estimator, optional
+         Base probabilistic classifier.
+
+     References
+     ----------
+     [2] Esuli et al. (2023). Learning to Quantify. Springer.
+     """
+
+     def best_mixture(self, predictions, pos_scores, neg_scores):
+         alphas = np.linspace(0, 1, 101)
+         self.distances = []
+
+         pos, neg, test = pos_scores, neg_scores, predictions
+         n_pos, n_neg, n_test = len(pos), len(neg), len(test)
+         for a in alphas:
+             pos_w = np.full(n_pos, a / n_pos)
+             neg_w = np.full(n_neg, (1 - a) / n_neg)
+             test_w = np.full(n_test, -1 / n_test)
+             scores = np.concatenate([pos, neg, test])
+             weights = np.concatenate([pos_w, neg_w, test_w])
+             idx = np.argsort(scores)
+             sorted_scores = scores[idx]
+             sorted_weights = weights[idx]
+             cum_w = sorted_weights[0]
+             total = 0
+             for i in range(1, len(sorted_scores)):
+                 seg = sorted_scores[i] - sorted_scores[i - 1]
+                 total += abs(seg * cum_w)
+                 cum_w += sorted_weights[i]
+             self.distances.append(total)
+
+         best_distance_index = np.argmin(self.distances)
+         best_alpha = alphas[best_distance_index]
+         best_distance = self.distances[best_distance_index]
+         return best_alpha, best_distance
+
+
+
+
+
+ # =====================================================
+ # Non aggregative Mixture-based Quantifiers
+ # =====================================================
+
+ class HDx(BaseMixture):
+     """
+     Hellinger Distance-based Quantifier (HDx).
+
+     A non-aggregative mixture quantifier that estimates class prevalences by
+     minimizing the average Hellinger distance between class-wise feature histograms
+     of training data and test data. It iterates over mixture weights and histogram bin sizes,
+     evaluating distance per feature and aggregates the results.
+
+     Parameters
+     ----------
+     bins_size : array-like, optional
+         Histogram bin sizes to consider for discretizing features.
+     strategy : {'ovr', 'ovo'}, default='ovr'
+         Multiclass quantification strategy.
+
+     Attributes
+     ----------
+     pos_features : ndarray
+         Training samples of the positive class.
+     neg_features : ndarray
+         Training samples of the negative class.
+
+     References
+     ----------
+     [2] Esuli et al. (2023). Learning to Quantify. Springer.
+     """
+
+     _parameter_constraints = {
+         "bins_size": ["array-like", None],
+         "strategy": [Options(["ovr", "ovo"])]
+     }
+
+     def __init__(self, bins_size=None, strategy="ovr"):
+         super().__init__()
+         if bins_size is None:
+             bins_size = np.append(np.linspace(2, 20, 10), 30)
+
+         self.bins_size = bins_size
+         self.neg_features = None
+         self.pos_features = None
+         self.strategy = strategy
+
+
+     def _fit(self, X, y, *args, **kwargs):
+         self.pos_features = X[y == self.classes_[1]]
+         self.neg_features = X[y == self.classes_[0]]
+         return self
+
+     def _predict(self, X) -> np.ndarray:
+         alpha, _ = self.best_mixture(X, self.pos_features, self.neg_features)
+         prevalence = np.array([1 - alpha, alpha])
+         prevalence = validate_prevalences(self, prevalence, self.classes_)
+         return prevalence
+
+     def best_mixture(self, X, pos, neg):
+         alpha_values = np.round(np.linspace(0, 1, 101), 2)
+         self.distances = []
+
+         # Iterate over alpha values to compute the prevalence
+         for alpha in alpha_values:
+             distances = []
+
+             # For each feature, compute the Hellinger distance
+             for feature_idx in range(X.shape[1]):
+
+                 for bins in self.bins_size:
+
+                     pos_feature = pos[:, feature_idx]
+                     neg_feature = neg[:, feature_idx]
+                     test_feature = X[:, feature_idx]
+
+                     pos_hist = getHist(pos_feature, bins)
+                     neg_hist = getHist(neg_feature, bins)
+                     test_hist = getHist(test_feature, bins)
+
+                     mix_hist = alpha * pos_hist + (1 - alpha) * neg_hist
+                     distance = BaseMixture.get_distance(mix_hist, test_hist, measure="hellinger")
+                     distances.append(distance)
+
+             avg_distance = np.mean(distances)
+             self.distances.append(avg_distance)
+         best_alpha = alpha_values[np.argmin(self.distances)]
+         best_distance = np.min(self.distances)
+         return best_alpha, best_distance
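
The new mixture quantifiers follow the fit/predict pattern shown in the DyS docstring above. Below is a minimal usage sketch, not taken from the package docs: it assumes a scikit-learn probabilistic classifier as the learner, synthetic data, and that DyS, HDy and SMM are re-exported from mlquantify.mixture by the new __init__.py.

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from mlquantify.mixture import DyS, HDy, SMM  # assumed re-exports of the classes added above

    rng = np.random.default_rng(0)
    X_train = rng.normal(size=(500, 5))
    y_train = (X_train[:, 0] + rng.normal(scale=0.5, size=500) > 0).astype(int)

    # Build a test sample whose class balance (~70% positives) differs from training.
    X_pool = rng.normal(size=(2000, 5))
    y_pool = (X_pool[:, 0] + rng.normal(scale=0.5, size=2000) > 0).astype(int)
    X_test = np.vstack([X_pool[y_pool == 1][:350], X_pool[y_pool == 0][:150]])

    for q in (DyS(learner=LogisticRegression(), measure="hellinger"),
              HDy(learner=LogisticRegression()),
              SMM(learner=LogisticRegression())):
        q.fit(X_train, y_train)                      # training scores come from 5-fold cross-validation by default
        print(type(q).__name__, q.predict(X_test))   # estimated [negative, positive] prevalences, roughly [0.3, 0.7]
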
mlquantify/mixture/_utils.py (new file)
@@ -0,0 +1,112 @@
+ import numpy as np
+
+
+ # =====================================================
+ # Utility functions
+ # =====================================================
+
+ def getHist(scores, nbins):
+     """
+     Calculate histogram-like bin probabilities for a given set of scores.
+
+     This function divides the score range into equal bins and computes the proportion
+     of scores in each bin, normalized by the total count.
+
+     Parameters
+     ----------
+     scores : np.ndarray
+         A 1-dimensional array of scores.
+     nbins : int
+         Number of bins for dividing the score range.
+
+     Returns
+     -------
+     np.ndarray
+         An array containing the normalized bin probabilities.
+
+     Notes
+     -----
+     - The bins are equally spaced between 0 and 1, with an additional upper boundary
+       to include the maximum score.
+     - The returned probabilities are normalized to account for the total number of scores.
+     """
+     breaks = np.linspace(0, 1, int(nbins) + 1)
+     breaks = np.delete(breaks, -1)
+     breaks = np.append(breaks, 1.1)
+
+     re = np.repeat(1 / (len(breaks) - 1), (len(breaks) - 1))
+     for i in range(1, len(breaks)):
+         re[i - 1] = (re[i - 1] + len(np.where((scores >= breaks[i - 1]) & (scores < breaks[i]))[0])) / (len(scores) + 1)
+
+     return re
+
+
+ def ternary_search(left: float, right: float, func, tol: float = 1e-4) -> float:
+     """
+     Ternary search to find the minimum of a unimodal function in [left, right].
+
+     Parameters
+     ----------
+     left : float
+         Left bound.
+     right : float
+         Right bound.
+     func : callable
+         Function to minimize.
+     tol : float
+         Tolerance for termination.
+
+     Returns
+     -------
+     float
+         Approximate position of the minimum.
+     """
+     while right - left > tol:
+         m1 = left + (right - left) / 3
+         m2 = right - (right - left) / 3
+         f1, f2 = func(m1), func(m2)
+         if f1 < f2:
+             right = m2
+         else:
+             left = m1
+     return (left + right) / 2
+
+
+ def topsoe(p: np.ndarray, q: np.ndarray) -> float:
+     """
+     Topsoe distance between two probability distributions.
+
+     D_T(p, q) = sum( p*log(2p/(p+q)) + q*log(2q/(p+q)) )
+     """
+     p = np.maximum(p, 1e-20)
+     q = np.maximum(q, 1e-20)
+     return np.sum(p * np.log(2 * p / (p + q)) + q * np.log(2 * q / (p + q)))
+
+
+ def probsymm(p: np.ndarray, q: np.ndarray) -> float:
+     """
+     Probabilistic Symmetric distance.
+
+     D_PS(p, q) = sum( (p - q) * log(p / q) )
+     """
+     p = np.maximum(p, 1e-20)
+     q = np.maximum(q, 1e-20)
+     return np.sum((p - q) * np.log(p / q))
+
+
+ def hellinger(p: np.ndarray, q: np.ndarray) -> float:
+     """
+     Hellinger distance between two probability distributions.
+
+     H(p, q) = (1/sqrt(2)) * sqrt( sum( (sqrt(p) - sqrt(q))^2 ) )
+     """
+     p = np.maximum(p, 1e-20)
+     q = np.maximum(q, 1e-20)
+     return np.sqrt(0.5 * np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))
+
+
+ def sqEuclidean(p: np.ndarray, q: np.ndarray) -> float:
+     """
+     Squared Euclidean distance between two vectors.
+     """
+     return np.sum((p - q) ** 2)
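
These helpers implement the histogram-mixture fit used by DyS and HDx above: getHist builds smoothed score histograms and ternary_search minimizes a distance over the mixture weight. A self-contained sketch with synthetic scores (not from the package docs) showing how a known positive prevalence of 0.3 can be recovered:

    import numpy as np
    from mlquantify.mixture._utils import getHist, hellinger, ternary_search

    rng = np.random.default_rng(0)
    pos = np.clip(rng.normal(0.75, 0.10, 2000), 0, 1)   # training scores of the positive class
    neg = np.clip(rng.normal(0.25, 0.10, 2000), 0, 1)   # training scores of the negative class
    # Test scores drawn as a 30/70 mixture of the two score distributions.
    test = np.concatenate([np.clip(rng.normal(0.75, 0.10, 600), 0, 1),
                           np.clip(rng.normal(0.25, 0.10, 1400), 0, 1)])

    bins = 10
    pos_h, neg_h, test_h = getHist(pos, bins), getHist(neg, bins), getHist(test, bins)

    def objective(alpha):
        # Hellinger distance between the alpha-weighted training mixture and the test histogram.
        return hellinger(alpha * pos_h + (1 - alpha) * neg_h, test_h)

    print(ternary_search(0.0, 1.0, objective))  # close to the true positive prevalence of 0.3
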
mlquantify/model_selection/__init__.py (new file)
@@ -0,0 +1,9 @@
+ from ._protocol import (
+     BaseProtocol,
+     APP,
+     NPP,
+     UPP,
+     PPP
+ )
+
+ from ._search import GridSearchQ
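
This subpackage replaces the removed top-level module mlquantify/model_selection.py listed above. Assuming the re-exports shown in this hunk, the public import surface would be:

    # Protocols and grid search re-exported by the new model_selection package (names as in the hunk above).
    from mlquantify.model_selection import BaseProtocol, APP, NPP, UPP, PPP, GridSearchQ
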