mlquantify 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. mlquantify/__init__.py +10 -29
  2. mlquantify/adjust_counting/__init__.py +24 -0
  3. mlquantify/adjust_counting/_adjustment.py +648 -0
  4. mlquantify/adjust_counting/_base.py +245 -0
  5. mlquantify/adjust_counting/_counting.py +153 -0
  6. mlquantify/adjust_counting/_utils.py +109 -0
  7. mlquantify/base.py +117 -519
  8. mlquantify/base_aggregative.py +209 -0
  9. mlquantify/calibration.py +1 -0
  10. mlquantify/confidence.py +329 -0
  11. mlquantify/likelihood/__init__.py +5 -0
  12. mlquantify/likelihood/_base.py +147 -0
  13. mlquantify/likelihood/_classes.py +430 -0
  14. mlquantify/meta/__init__.py +1 -0
  15. mlquantify/meta/_classes.py +785 -0
  16. mlquantify/metrics/__init__.py +21 -0
  17. mlquantify/metrics/_oq.py +109 -0
  18. mlquantify/metrics/_rq.py +98 -0
  19. mlquantify/{evaluation/measures.py → metrics/_slq.py} +51 -36
  20. mlquantify/mixture/__init__.py +7 -0
  21. mlquantify/mixture/_base.py +147 -0
  22. mlquantify/mixture/_classes.py +458 -0
  23. mlquantify/mixture/_utils.py +163 -0
  24. mlquantify/model_selection/__init__.py +9 -0
  25. mlquantify/model_selection/_protocol.py +358 -0
  26. mlquantify/model_selection/_search.py +315 -0
  27. mlquantify/model_selection/_split.py +1 -0
  28. mlquantify/multiclass.py +350 -0
  29. mlquantify/neighbors/__init__.py +9 -0
  30. mlquantify/neighbors/_base.py +168 -0
  31. mlquantify/neighbors/_classes.py +150 -0
  32. mlquantify/{classification/methods.py → neighbors/_classification.py} +37 -62
  33. mlquantify/neighbors/_kde.py +268 -0
  34. mlquantify/neighbors/_utils.py +131 -0
  35. mlquantify/neural/__init__.py +1 -0
  36. mlquantify/utils/__init__.py +47 -2
  37. mlquantify/utils/_artificial.py +27 -0
  38. mlquantify/utils/_constraints.py +219 -0
  39. mlquantify/utils/_context.py +21 -0
  40. mlquantify/utils/_decorators.py +36 -0
  41. mlquantify/utils/_exceptions.py +12 -0
  42. mlquantify/utils/_get_scores.py +159 -0
  43. mlquantify/utils/_load.py +18 -0
  44. mlquantify/utils/_parallel.py +6 -0
  45. mlquantify/utils/_random.py +36 -0
  46. mlquantify/utils/_sampling.py +273 -0
  47. mlquantify/utils/_tags.py +44 -0
  48. mlquantify/utils/_validation.py +447 -0
  49. mlquantify/utils/prevalence.py +64 -0
  50. {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/METADATA +2 -1
  51. mlquantify-0.1.10.dist-info/RECORD +53 -0
  52. mlquantify/classification/__init__.py +0 -1
  53. mlquantify/evaluation/__init__.py +0 -14
  54. mlquantify/evaluation/protocol.py +0 -289
  55. mlquantify/methods/__init__.py +0 -37
  56. mlquantify/methods/aggregative.py +0 -1159
  57. mlquantify/methods/meta.py +0 -472
  58. mlquantify/methods/mixture_models.py +0 -1003
  59. mlquantify/methods/non_aggregative.py +0 -136
  60. mlquantify/methods/threshold_optimization.py +0 -869
  61. mlquantify/model_selection.py +0 -377
  62. mlquantify/plots.py +0 -367
  63. mlquantify/utils/general.py +0 -371
  64. mlquantify/utils/method.py +0 -449
  65. mlquantify-0.1.8.dist-info/RECORD +0 -22
  66. {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/WHEEL +0 -0
  67. {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,430 @@
import warnings

import numpy as np

from mlquantify.base_aggregative import SoftLearnerQMixin
from mlquantify.likelihood._base import BaseIterativeLikelihood
from mlquantify.metrics._slq import MAE
from mlquantify.multiclass import define_binary
from mlquantify.utils._constraints import (
    CallableConstraint,
    Interval,
    Options,
)
class EMQ(SoftLearnerQMixin, BaseIterativeLikelihood):
    r"""Expectation-Maximization Quantifier (EMQ).

    Estimates class prevalences under prior probability shift by alternating
    between expectation **(E)** and maximization **(M)** steps on posterior
    probabilities.

    E-step:
    .. math::
        p_i^{(s+1)}(x) = \frac{q_i^{(s)} p_i(x)}{\sum_j q_j^{(s)} p_j(x)}

    M-step:
    .. math::
        q_i^{(s+1)} = \frac{1}{N} \sum_{n=1}^N p_i^{(s+1)}(x_n)

    where
    - :math:`p_i(x)` are posterior probabilities predicted by the classifier
    - :math:`q_i^{(s)}` are class prevalence estimates at iteration :math:`s`
    - :math:`N` is the number of test instances.

    Calibrations supported on posterior probabilities before **EM** iteration:

    Temperature Scaling (TS):
    .. math::
        \hat{p} = \text{softmax}\left(\frac{\log(p)}{T}\right)

    Bias-Corrected Temperature Scaling (BCTS):
    .. math::
        \hat{p} = \text{softmax}\left(\frac{\log(p)}{T} + b\right)

    Vector Scaling (VS):
    .. math::
        \hat{p}_i = \text{softmax}(W_i \cdot \log(p_i) + b_i)

    No-Bias Vector Scaling (NBVS):
    .. math::
        \hat{p}_i = \text{softmax}(W_i \cdot \log(p_i))

    Parameters
    ----------
    learner : estimator, optional
        Probabilistic classifier supporting predict_proba.
    tol : float, default=1e-4
        Convergence threshold.
    max_iter : int, default=100
        Maximum EM iterations.
    calib_function : str or callable, optional
        Calibration method:
        - 'ts': Temperature Scaling
        - 'bcts': Bias-Corrected Temperature Scaling
        - 'vs': Vector Scaling
        - 'nbvs': No-Bias Vector Scaling
        - callable: custom calibration function
    criteria : callable, default=MAE
        Convergence metric.

    References
    ----------
    .. [1] Saerens, M., Latinne, P., & Decaestecker, C. (2002).
       Adjusting the Outputs of a Classifier to New a Priori Probabilities.
       Neural Computation, 14(1), 2141-2156.
    .. [2] Esuli, A., Moreo, A., & Sebastiani, F. (2023). Learning to Quantify. Springer.
    """

    # NOTE: a callable calib_function is documented and handled in
    # _apply_calibration, so the constraint list must accept callables too,
    # not only the string options.
    _parameter_constraints = {
        "tol": [Interval(0, None, inclusive_left=False)],
        "max_iter": [Interval(1, None, inclusive_left=True)],
        "calib_function": [
            Options(["bcts", "ts", "vs", "nbvs", None]),
            CallableConstraint(),
        ],
        "criteria": [CallableConstraint()],
    }

    def __init__(self,
                 learner=None,
                 tol=1e-4,
                 max_iter=100,
                 calib_function=None,
                 criteria=MAE):
        super().__init__(learner=learner, tol=tol, max_iter=max_iter)
        self.calib_function = calib_function
        self.criteria = criteria

    def _iterate(self, predictions, priors):
        r"""Perform EM quantification iteration.

        Steps:
        - Calibrate posterior predictions if calibration function specified.
        - Apply EM procedure to re-estimate prevalences, based on training
          priors and calibrated posteriors.

        Parameters
        ----------
        predictions : ndarray of shape (n_samples, n_classes)
            Posterior probabilities for each class on test data.
        priors : ndarray of shape (n_classes,)
            Training set class prevalences, serving as initial priors.

        Returns
        -------
        prevalences : ndarray of shape (n_classes,)
            Estimated class prevalences after EM iteration.
        """
        calibrated_predictions = self._apply_calibration(predictions)
        prevalences, _ = self.EM(
            posteriors=calibrated_predictions,
            priors=priors,
            tolerance=self.tol,
            max_iter=self.max_iter,
            criteria=self.criteria
        )
        return prevalences

    @classmethod
    def EM(cls, posteriors, priors, tolerance=1e-6, max_iter=100, criteria=MAE):
        r"""Class method implementing the EM algorithm for quantification.

        Parameters
        ----------
        posteriors : ndarray of shape (n_samples, n_classes)
            Posterior probability predictions.
        priors : ndarray of shape (n_classes,)
            Training class prior probabilities.
        tolerance : float
            Convergence threshold based on difference between iterations.
        max_iter : int
            Max number of EM iterations.
        criteria : callable
            Metric to assess convergence, e.g., MAE.

        Returns
        -------
        qs : ndarray of shape (n_classes,)
            Estimated test set class prevalences.
        ps : ndarray of shape (n_samples, n_classes)
            Updated soft membership probabilities per instance.
        """
        Px = np.array(posteriors, dtype=np.float64)
        Ptr = np.array(priors, dtype=np.float64)

        # If any training prior is exactly zero, smooth all priors so the
        # E-step ratio qs / Ptr stays finite.
        if np.prod(Ptr) == 0:
            Ptr += tolerance
            Ptr /= Ptr.sum()

        qs = np.copy(Ptr)
        s, converged = 0, False
        qs_prev_ = None

        while not converged and s < max_iter:
            # E-step: re-weight posteriors by the ratio of current estimates
            # to training priors, then renormalize per instance.
            ps_unnormalized = (qs / Ptr) * Px
            ps = ps_unnormalized / ps_unnormalized.sum(axis=1, keepdims=True)

            # M-step: new prevalence estimate is the mean soft membership.
            qs = ps.mean(axis=0)

            # Convergence requires a minimum of 10 iterations before the
            # criteria check is honored (guards against premature stops).
            if qs_prev_ is not None and criteria(qs_prev_, qs) < tolerance and s > 10:
                converged = True

            qs_prev_ = qs
            s += 1

        if not converged:
            warnings.warn(
                'the method has reached the maximum number of iterations; '
                'it might have not converged'
            )

        return qs, ps

    def _apply_calibration(self, predictions):
        r"""Calibrate posterior predictions with specified calibration method.

        Parameters
        ----------
        predictions : ndarray
            Posterior predictions to calibrate.

        Returns
        -------
        calibrated_predictions : ndarray
            Calibrated posterior predictions.

        Raises
        ------
        ValueError
            If calib_function is unrecognized.
        """
        if self.calib_function is None:
            return predictions

        if isinstance(self.calib_function, str):
            method = self.calib_function.lower()
            if method == "ts":
                return self._temperature_scaling(predictions)
            elif method == "bcts":
                return self._bias_corrected_temperature_scaling(predictions)
            elif method == "vs":
                return self._vector_scaling(predictions)
            elif method == "nbvs":
                return self._no_bias_vector_scaling(predictions)

        elif callable(self.calib_function):
            return self.calib_function(predictions)

        raise ValueError(
            f"Invalid calib_function '{self.calib_function}'. Expected one of {{'bcts', 'ts', 'vs', 'nbvs', None, callable}}."
        )

    def _temperature_scaling(self, preds):
        """Temperature Scaling calibration applied to logits.

        NOTE(review): T is fixed at 1.0 (not fitted), so this reduces to a
        renormalizing softmax over log-probabilities — confirm whether the
        temperature is intended to be learned elsewhere.
        """
        T = 1.0
        preds = np.clip(preds, 1e-12, 1.0)
        logits = np.log(preds)
        scaled = logits / T
        # Subtract the row max before exponentiating for numerical stability.
        exp_scaled = np.exp(scaled - np.max(scaled, axis=1, keepdims=True))
        return exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True)

    def _bias_corrected_temperature_scaling(self, preds):
        """Bias-Corrected Temperature Scaling calibration.

        NOTE(review): T=1 and bias=0 are fixed placeholders (not fitted),
        making this an identity-like transform — verify intent.
        """
        T = 1.0
        bias = np.zeros(preds.shape[1])
        preds = np.clip(preds, 1e-12, 1.0)
        logits = np.log(preds)
        logits = logits / T + bias
        exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
        return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

    def _vector_scaling(self, preds):
        """Vector Scaling calibration.

        NOTE(review): W=1 and b=0 are fixed placeholders (not fitted) —
        verify intent.
        """
        W = np.ones(preds.shape[1])
        b = np.zeros(preds.shape[1])
        preds = np.clip(preds, 1e-12, 1.0)
        logits = np.log(preds)
        scaled = logits * W + b
        exp_scaled = np.exp(scaled - np.max(scaled, axis=1, keepdims=True))
        return exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True)

    def _no_bias_vector_scaling(self, preds):
        """No-Bias Vector Scaling calibration.

        NOTE(review): W=1 is a fixed placeholder (not fitted) — verify intent.
        """
        W = np.ones(preds.shape[1])
        preds = np.clip(preds, 1e-12, 1.0)
        logits = np.log(preds)
        scaled = logits * W
        exp_scaled = np.exp(scaled - np.max(scaled, axis=1, keepdims=True))
        return exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True)
class MLPE(SoftLearnerQMixin, BaseIterativeLikelihood):
    r"""Maximum Likelihood Prevalence Estimation (MLPE).

    A trivial baseline quantifier: it always reports the class prevalences
    observed in the training set, performing no adaptation to the test data.

    Parameters
    ----------
    learner : estimator, optional
        Base classifier.

    References
    ----------
    .. [2] Esuli, A., Moreo, A., & Sebastiani, F. (2023). Learning to Quantify. Springer.
    """

    def __init__(self, learner=None):
        # A single "iteration" suffices: the estimate never changes.
        super().__init__(learner=learner, max_iter=1)

    def _iterate(self, predictions, priors):
        """Return the training priors unchanged.

        Parameters
        ----------
        predictions : array-like
            Ignored in this implementation.
        priors : array-like
            Training priors, returned as is.

        Returns
        -------
        prevalences : array-like
            Equal to the training priors.
        """
        # Test predictions are deliberately ignored — this baseline assumes
        # no distribution shift between training and test sets.
        return priors
@define_binary
class CDE(SoftLearnerQMixin, BaseIterativeLikelihood):
    r"""CDE-Iterate for binary classification prevalence estimation.

    Threshold :math:`\tau` from false positive and false negative costs:
    .. math::
        \tau = \frac{c_{FP}}{c_{FP} + c_{FN}}

    Hard classification by thresholding posterior probability :math:`p(+|x)` at :math:`\tau`:
    .. math::
        \hat{y}(x) = \mathbf{1}_{p(+|x) > \tau}

    Prevalence estimation via classify-and-count:
    .. math::
        \hat{p}_U(+) = \frac{1}{N} \sum_{n=1}^N \hat{y}(x_n)

    False positive cost update:
    .. math::
        c_{FP}^{new} = \frac{p_L(+)}{p_L(-)} \times \frac{\hat{p}_U(-)}{\hat{p}_U(+)} \times c_{FN}

    Parameters
    ----------
    learner : estimator, optional
        Wrapped probabilistic classifier; forwarded to the base class.
    tol : float, default=1e-4
        Convergence tolerance.
    max_iter : int, default=100
        Max iterations.
    init_cfp : float, default=1.0
        Initial false positive cost.

    References
    ----------
    .. [1] Esuli, A., Moreo, A., & Sebastiani, F. (2023). Learning to Quantify. Springer.
    """

    _parameter_constraints = {
        "tol": [Interval(0, None, inclusive_left=False)],
        "max_iter": [Interval(1, None, inclusive_left=True)],
        "init_cfp": [Interval(0, None, inclusive_left=False)]
    }

    def __init__(self, learner=None, tol=1e-4, max_iter=100, init_cfp=1.0):
        super().__init__(learner=learner, tol=tol, max_iter=max_iter)
        self.init_cfp = float(init_cfp)

    def _iterate(self, predictions, priors):
        r"""Iteratively estimate prevalences via cost-sensitive thresholding.

        Parameters
        ----------
        predictions : ndarray, shape (n_samples, 2)
            Posterior probabilities for binary classes [neg, pos].
        priors : ndarray, shape (2,)
            Training priors [p(neg), p(pos)].

        Returns
        -------
        prevalences : ndarray, shape (2,)
            Estimated prevalences for classes [neg, pos].

        Raises
        ------
        ValueError
            If ``predictions`` is not of shape (n_samples, 2).
        """
        P = np.asarray(predictions, dtype=np.float64)
        Ptr = np.asarray(priors, dtype=np.float64)

        # basic checks
        if P.ndim != 2 or P.shape[1] != 2:
            raise ValueError("CDE implementation here supports binary case only: predictions shape (n,2).")

        # ensure no zeros
        eps = 1e-12
        P = np.clip(P, eps, 1.0)

        # training priors pL(+), pL(-)
        # assume Ptr order matches columns of P; if Ptr sums to 1 but order unknown, user must match.
        pL_pos = Ptr[1]
        pL_neg = Ptr[0]
        if pL_pos <= 0 or pL_neg <= 0:
            # keep them positive to avoid divisions by zero
            pL_pos = max(pL_pos, eps)
            pL_neg = max(pL_neg, eps)

        # initialize costs
        cFN = 1.0
        cFP = float(self.init_cfp)

        prev_prev_pos = None
        s = 0

        # iterate: compute threshold from costs, classify, estimate prevalences via CC,
        # update cFP via eq. (4.27), repeat
        while s < self.max_iter:
            # decision threshold tau for positive class:
            # Derivation:
            #   predict positive if cost_FP * p(-|x) < cost_FN * p(+|x)
            #   => predict positive if p(+|x) / p(-|x) > cost_FP / cost_FN
            #   since p(+|x) / p(-|x) = p(+|x) / (1 - p(+|x)):
            #   p(+|x) > cost_FP / (cost_FP + cost_FN)
            tau = cFP / (cFP + cFN)

            # hard predictions for positive class using threshold on posterior for positive (col 1)
            pos_probs = P[:, 1]
            hard_pos = (pos_probs > tau).astype(float)

            # classify-and-count prevalence estimate on U
            prev_pos = hard_pos.mean()
            prev_neg = 1.0 - prev_pos

            # update cFP according to Eq. 4.27:
            #   cFP_new = (pL_pos / pL_neg) * (pU_hat(neg) / pU_hat(pos)) * cFN
            # guard against zero prev_pos / prev_neg
            prev_pos_safe = max(prev_pos, eps)
            prev_neg_safe = max(prev_neg, eps)

            cFP_new = (pL_pos / pL_neg) * (prev_neg_safe / prev_pos_safe) * cFN

            # check convergence on prevalences (absolute change)
            if prev_prev_pos is not None and abs(prev_pos - prev_prev_pos) < self.tol:
                break

            # prepare next iter
            cFP = cFP_new
            prev_prev_pos = prev_pos
            s += 1

        # If we exhausted max_iter without converging, keep the last estimate
        # but warn the user (the book warns about lack of Fisher consistency).
        if s >= self.max_iter:
            warnings.warn(
                'CDE-Iterate reached the maximum number of iterations; '
                'it might have not converged'
            )

        prevalences = np.array([prev_neg, prev_pos], dtype=np.float64)
        # ensure sums to 1 (numerical safety)
        prevalences = prevalences / prevalences.sum()

        return prevalences
@@ -0,0 +1 @@
1
+ from ._classes import EnsembleQ, QuaDapt, AggregativeBootstrap