mlquantify 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. mlquantify/__init__.py +0 -29
  2. mlquantify/adjust_counting/__init__.py +14 -0
  3. mlquantify/adjust_counting/_adjustment.py +365 -0
  4. mlquantify/adjust_counting/_base.py +247 -0
  5. mlquantify/adjust_counting/_counting.py +145 -0
  6. mlquantify/adjust_counting/_utils.py +114 -0
  7. mlquantify/base.py +117 -519
  8. mlquantify/base_aggregative.py +209 -0
  9. mlquantify/calibration.py +1 -0
  10. mlquantify/confidence.py +335 -0
  11. mlquantify/likelihood/__init__.py +5 -0
  12. mlquantify/likelihood/_base.py +161 -0
  13. mlquantify/likelihood/_classes.py +414 -0
  14. mlquantify/meta/__init__.py +1 -0
  15. mlquantify/meta/_classes.py +761 -0
  16. mlquantify/metrics/__init__.py +21 -0
  17. mlquantify/metrics/_oq.py +109 -0
  18. mlquantify/metrics/_rq.py +98 -0
  19. mlquantify/{evaluation/measures.py → metrics/_slq.py} +43 -28
  20. mlquantify/mixture/__init__.py +7 -0
  21. mlquantify/mixture/_base.py +153 -0
  22. mlquantify/mixture/_classes.py +400 -0
  23. mlquantify/mixture/_utils.py +112 -0
  24. mlquantify/model_selection/__init__.py +9 -0
  25. mlquantify/model_selection/_protocol.py +358 -0
  26. mlquantify/model_selection/_search.py +315 -0
  27. mlquantify/model_selection/_split.py +1 -0
  28. mlquantify/multiclass.py +350 -0
  29. mlquantify/neighbors/__init__.py +9 -0
  30. mlquantify/neighbors/_base.py +198 -0
  31. mlquantify/neighbors/_classes.py +159 -0
  32. mlquantify/{classification/methods.py → neighbors/_classification.py} +48 -66
  33. mlquantify/neighbors/_kde.py +270 -0
  34. mlquantify/neighbors/_utils.py +135 -0
  35. mlquantify/neural/__init__.py +1 -0
  36. mlquantify/utils/__init__.py +47 -2
  37. mlquantify/utils/_artificial.py +27 -0
  38. mlquantify/utils/_constraints.py +219 -0
  39. mlquantify/utils/_context.py +21 -0
  40. mlquantify/utils/_decorators.py +36 -0
  41. mlquantify/utils/_exceptions.py +12 -0
  42. mlquantify/utils/_get_scores.py +159 -0
  43. mlquantify/utils/_load.py +18 -0
  44. mlquantify/utils/_parallel.py +6 -0
  45. mlquantify/utils/_random.py +36 -0
  46. mlquantify/utils/_sampling.py +273 -0
  47. mlquantify/utils/_tags.py +44 -0
  48. mlquantify/utils/_validation.py +447 -0
  49. mlquantify/utils/prevalence.py +61 -0
  50. {mlquantify-0.1.8.dist-info → mlquantify-0.1.9.dist-info}/METADATA +2 -1
  51. mlquantify-0.1.9.dist-info/RECORD +53 -0
  52. mlquantify/classification/__init__.py +0 -1
  53. mlquantify/evaluation/__init__.py +0 -14
  54. mlquantify/evaluation/protocol.py +0 -289
  55. mlquantify/methods/__init__.py +0 -37
  56. mlquantify/methods/aggregative.py +0 -1159
  57. mlquantify/methods/meta.py +0 -472
  58. mlquantify/methods/mixture_models.py +0 -1003
  59. mlquantify/methods/non_aggregative.py +0 -136
  60. mlquantify/methods/threshold_optimization.py +0 -869
  61. mlquantify/model_selection.py +0 -377
  62. mlquantify/plots.py +0 -367
  63. mlquantify/utils/general.py +0 -371
  64. mlquantify/utils/method.py +0 -449
  65. mlquantify-0.1.8.dist-info/RECORD +0 -22
  66. {mlquantify-0.1.8.dist-info → mlquantify-0.1.9.dist-info}/WHEEL +0 -0
  67. {mlquantify-0.1.8.dist-info → mlquantify-0.1.9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,161 @@
1
+ import numpy as np
2
+ from abc import abstractmethod
3
+
4
+ from mlquantify.base import BaseQuantifier
5
+
6
+ from mlquantify.base_aggregative import (
7
+ AggregationMixin,
8
+ _get_learner_function
9
+ )
10
+ from mlquantify.adjust_counting import CC
11
+ from mlquantify.utils._decorators import _fit_context
12
+ from mlquantify.utils._validation import check_classes_attribute, validate_predictions, validate_y, validate_data, validate_prevalences
13
+
14
+
15
+
16
class BaseIterativeLikelihood(AggregationMixin, BaseQuantifier):
    """Base class for iterative, likelihood-maximizing quantifiers.

    Subclasses estimate test-set class prevalences under prior probability
    shift by iteratively maximizing the likelihood of the classifier's
    posterior outputs (e.g. via the EM procedure of Saerens et al., 2002),
    under the assumption that the class-conditional distributions are shared
    between the training and test domains.

    Subclasses must implement ``_iterate(predictions, priors)``, mapping
    posterior predictions and training priors to a prevalence vector.

    Parameters
    ----------
    learner : estimator, optional
        Probabilistic classifier exposing ``fit(X, y)`` and
        ``predict_proba(X)``.
    tol : float, default=1e-4
        Convergence tolerance on the prevalence update.
    max_iter : int, default=100
        Maximum number of iterative updates.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        Unique class labels seen during ``fit``.
    priors : ndarray of shape (n_classes,)
        Class distribution of the training set.
    y_train : array-like
        Training labels retained for aggregation.

    References
    ----------
    [1] Saerens, M., Latinne, P., & Decaestecker, C. (2002). *Adjusting the
        Outputs of a Classifier to New a Priori Probabilities: A Simple
        Procedure.* Neural Computation, 14(1), 2141-2156.
    [2] Esuli, A., Moreo, A., & Sebastiani, F. (2023). *Learning to
        Quantify.* The Information Retrieval Series 47, Springer.
        https://doi.org/10.1007/978-3-031-20467-8
    """

    @abstractmethod
    def __init__(self,
                 learner=None,
                 tol=1e-4,
                 max_iter=100):
        self.learner = learner
        self.tol = tol
        self.max_iter = max_iter

    def __mlquantify_tags__(self):
        # Likelihood-based aggregation does not require posterior
        # probabilities computed on the training data.
        tags = super().__mlquantify_tags__()
        tags.prediction_requirements.requires_train_proba = False
        return tags

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """Fit the underlying learner and record the training priors."""
        X, y = validate_data(self, X, y)
        validate_y(self, y)
        self.classes_ = np.unique(y)
        self.learner.fit(X, y)
        label_counts = np.array([(y == label).sum() for label in self.classes_])
        self.priors = label_counts / len(y)
        self.y_train = y
        return self

    def predict(self, X):
        """Estimate class prevalences for the samples in ``X``."""
        learner_method = _get_learner_function(self)
        scores = getattr(self.learner, learner_method)(X)
        return self.aggregate(scores, self.y_train)

    def aggregate(self, predictions, y_train):
        """Turn posterior predictions into a validated prevalence vector."""
        predictions = validate_predictions(self, predictions)
        self.classes_ = check_classes_attribute(self, np.unique(y_train))

        # Recompute priors when they are missing or inconsistent with the
        # current class set (e.g. aggregate() called without a prior fit()).
        priors_stale = (
            not hasattr(self, 'priors')
            or len(self.priors) != len(self.classes_)
        )
        if priors_stale:
            label_counts = np.array([(y_train == label).sum() for label in self.classes_])
            self.priors = label_counts / len(y_train)

        estimated = self._iterate(predictions, self.priors)
        return validate_prevalences(self, estimated, self.classes_)

    @abstractmethod
    def _iterate(self, predictions, priors):
        ...
@@ -0,0 +1,414 @@
1
+ import numpy as np
2
+ from mlquantify.base_aggregative import SoftLearnerQMixin
3
+ from mlquantify.likelihood._base import BaseIterativeLikelihood
4
+ from mlquantify.metrics._slq import MAE
5
+ from mlquantify.multiclass import define_binary
6
+ from mlquantify.utils._constraints import (
7
+ Interval,
8
+ CallableConstraint,
9
+ Options
10
+ )
11
+
12
class EMQ(SoftLearnerQMixin, BaseIterativeLikelihood):
    """Expectation-Maximization Quantifier.

    Implements iterative quantification using an EM algorithm to adjust class
    prevalences under prior probability shift, using posterior probabilities
    (soft predictions) from a probabilistic classifier.

    The EM procedure alternates between estimating posterior memberships of
    test instances (E-step) and re-estimating class prevalences (M-step),
    iterating until convergence (tolerance or max iterations) on prevalence
    change measured by a user-defined criteria (default: Mean Absolute
    Error, MAE).

    Supports optional calibration of predicted posteriors before iteration.

    Parameters
    ----------
    learner : estimator, optional
        Probabilistic classifier fit on training data with `predict_proba`.
    tol : float, default=1e-4
        Convergence threshold for EM iterative updates.
    max_iter : int, default=100
        Maximum number of EM iterations.
    calib_function : str or callable, optional
        Calibration method applied to posterior probabilities.
        Supported strings: 'bcts', 'ts', 'vs', 'nbvs'.
        NOTE(review): the built-in string calibrators currently use fixed
        identity parameters (T=1, W=1, b=0), so they only renormalize the
        posteriors; fitting calibration parameters is not implemented here.
    criteria : callable, default=MAE
        Function to measure convergence between prevalence estimates.

    References
    ----------
    [1] Saerens et al. (2002). Adjusting the Outputs of a Classifier to New a Priori Probabilities. Neural Computation, 14(1), 2141-2156.
    [2] Esuli et al. (2023). Learning to Quantify. Springer.
    """

    _parameter_constraints = {
        "tol": [Interval(0, None, inclusive_left=False)],
        "max_iter": [Interval(1, None, inclusive_left=True)],
        "calib_function": [
            Options(["bcts", "ts", "vs", "nbvs", None]),
        ],
        "criteria": [CallableConstraint()],
    }

    def __init__(self,
                 learner=None,
                 tol=1e-4,
                 max_iter=100,
                 calib_function=None,
                 criteria=MAE):
        super().__init__(learner=learner, tol=tol, max_iter=max_iter)
        self.calib_function = calib_function
        self.criteria = criteria

    def _iterate(self, predictions, priors):
        """
        Perform EM quantification iteration.

        Steps:
        - Calibrate posterior predictions if a calibration function is specified.
        - Apply the EM procedure to re-estimate prevalences, based on
          training priors and (calibrated) posteriors.

        Parameters
        ----------
        predictions : ndarray of shape (n_samples, n_classes)
            Posterior probabilities for each class on test data.
        priors : ndarray of shape (n_classes,)
            Training set class prevalences, serving as initial priors.

        Returns
        -------
        prevalences : ndarray of shape (n_classes,)
            Estimated class prevalences after EM iteration.
        """
        calibrated_predictions = self._apply_calibration(predictions)
        prevalences, _ = self.EM(
            posteriors=calibrated_predictions,
            priors=priors,
            tolerance=self.tol,
            max_iter=self.max_iter,
            criteria=self.criteria
        )
        return prevalences

    @classmethod
    def EM(cls, posteriors, priors, tolerance=1e-6, max_iter=100, criteria=MAE):
        """
        Classmethod implementing the EM algorithm for quantification.

        Parameters
        ----------
        posteriors : ndarray of shape (n_samples, n_classes)
            Posterior probability predictions.
        priors : ndarray of shape (n_classes,)
            Training class prior probabilities.
        tolerance : float
            Convergence threshold based on difference between iterations.
        max_iter : int
            Max number of EM iterations.
        criteria : callable
            Metric to assess convergence, e.g., MAE.

        Returns
        -------
        qs : ndarray of shape (n_classes,)
            Estimated test set class prevalences.
        ps : ndarray of shape (n_samples, n_classes)
            Updated soft membership probabilities per instance.
        """
        # Local import keeps the module's import block untouched.
        import warnings

        Px = np.array(posteriors, dtype=np.float64)
        Ptr = np.array(priors, dtype=np.float64)

        # Smooth zero priors so the ratio qs / Ptr is always defined.
        if np.prod(Ptr) == 0:
            Ptr += tolerance
            Ptr /= Ptr.sum()

        qs = np.copy(Ptr)
        s, converged = 0, False
        qs_prev_ = None

        while not converged and s < max_iter:
            # E-step: reweight posteriors by the ratio of current to
            # training priors and renormalize per instance.
            ps_unnormalized = (qs / Ptr) * Px
            ps = ps_unnormalized / ps_unnormalized.sum(axis=1, keepdims=True)

            # M-step: new prevalence estimate is the mean soft membership.
            qs = ps.mean(axis=0)

            # `s > 10` enforces a minimum number of iterations before
            # convergence may be declared.
            if qs_prev_ is not None and criteria(qs_prev_, qs) < tolerance and s > 10:
                converged = True

            qs_prev_ = qs
            s += 1

        if not converged:
            # Fixed: emit a proper warning (filterable/capturable by
            # callers) instead of printing to stdout.
            warnings.warn('[warning] the method has reached the maximum number of iterations; it might have not converged')

        return qs, ps

    def _apply_calibration(self, predictions):
        """
        Calibrate posterior predictions with the specified calibration method.

        Parameters
        ----------
        predictions : ndarray
            Posterior predictions to calibrate.

        Returns
        -------
        calibrated_predictions : ndarray
            Calibrated posterior predictions.

        Raises
        ------
        ValueError
            If calib_function is unrecognized.
        """
        if self.calib_function is None:
            return predictions

        if isinstance(self.calib_function, str):
            method = self.calib_function.lower()
            if method == "ts":
                return self._temperature_scaling(predictions)
            elif method == "bcts":
                return self._bias_corrected_temperature_scaling(predictions)
            elif method == "vs":
                return self._vector_scaling(predictions)
            elif method == "nbvs":
                return self._no_bias_vector_scaling(predictions)

        elif callable(self.calib_function):
            return self.calib_function(predictions)

        raise ValueError(
            f"Invalid calib_function '{self.calib_function}'. Expected one of {{'bcts', 'ts', 'vs', 'nbvs', None, callable}}."
        )

    def _temperature_scaling(self, preds):
        """Temperature Scaling applied to log-posteriors.

        NOTE(review): T is fixed at 1.0 (not fitted), so this currently
        amounts to renormalizing the clipped posteriors.
        """
        T = 1.0
        preds = np.clip(preds, 1e-12, 1.0)
        logits = np.log(preds)
        scaled = logits / T
        # Numerically stable softmax (subtract per-row max).
        exp_scaled = np.exp(scaled - np.max(scaled, axis=1, keepdims=True))
        return exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True)

    def _bias_corrected_temperature_scaling(self, preds):
        """Bias-Corrected Temperature Scaling calibration.

        NOTE(review): T=1 and bias=0 are fixed (not fitted), so this is
        currently an identity renormalization.
        """
        T = 1.0
        bias = np.zeros(preds.shape[1])
        preds = np.clip(preds, 1e-12, 1.0)
        logits = np.log(preds)
        logits = logits / T + bias
        exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
        return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

    def _vector_scaling(self, preds):
        """Vector Scaling calibration.

        NOTE(review): W=1 and b=0 are fixed (not fitted), so this is
        currently an identity renormalization.
        """
        W = np.ones(preds.shape[1])
        b = np.zeros(preds.shape[1])
        preds = np.clip(preds, 1e-12, 1.0)
        logits = np.log(preds)
        scaled = logits * W + b
        exp_scaled = np.exp(scaled - np.max(scaled, axis=1, keepdims=True))
        return exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True)

    def _no_bias_vector_scaling(self, preds):
        """No-Bias Vector Scaling calibration.

        NOTE(review): W=1 is fixed (not fitted), so this is currently an
        identity renormalization.
        """
        W = np.ones(preds.shape[1])
        preds = np.clip(preds, 1e-12, 1.0)
        logits = np.log(preds)
        scaled = logits * W
        exp_scaled = np.exp(scaled - np.max(scaled, axis=1, keepdims=True))
        return exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True)
241
+
242
+
243
+
244
class MLPE(SoftLearnerQMixin, BaseIterativeLikelihood):
    """Maximum Likelihood Prevalence Estimation (MLPE) quantifier.

    Degenerate likelihood-based quantifier that assumes no prior
    probability shift between training and test: the estimated test
    prevalences are simply the class priors observed on the training set,
    so no real iteration takes place.

    Parameters
    ----------
    learner : estimator, optional
        Base classifier, kept for interface compatibility and possible
        extension.
    """

    def __init__(self, learner=None):
        # A single "iteration" suffices since _iterate ignores predictions.
        super().__init__(learner=learner, max_iter=1)

    def _iterate(self, predictions, priors):
        """Return the training priors unchanged.

        Parameters
        ----------
        predictions : array-like
            Ignored by this implementation.
        priors : array-like
            Training-set class priors.

        Returns
        -------
        prevalences : array-like
            The ``priors`` argument, untouched.
        """
        return priors
278
+
279
@define_binary
class CDE(SoftLearnerQMixin, BaseIterativeLikelihood):
    """CDE-Iterate: cost-driven prevalence estimation for binary tasks.

    Iteratively estimates class prevalences under prior probability shift
    by repeatedly (i) deriving a decision threshold from the current
    false-positive / false-negative costs, (ii) applying classify-and-count
    to the thresholded positive posteriors, and (iii) updating the
    false-positive cost from the resulting prevalence estimate and the
    training priors — stopping once consecutive positive-prevalence
    estimates differ by less than ``tol`` or ``max_iter`` is reached.

    This implementation adopts the transductive thresholding variant
    described in Esuli et al. (2023), rather than retraining a
    cost-sensitive classifier as in Xue & Weiss (2009).

    Parameters
    ----------
    learner : estimator, optional
        Wrapped classifier (unused here but part of the base interface).
    tol : float, default=1e-4
        Absolute tolerance for convergence of the positive prevalence.
    max_iter : int, default=100
        Maximum number of threshold-update iterations.
    init_cfp : float, default=1.0
        Initial false positive cost coefficient.

    References
    ----------
    [2] Esuli, A., Moreo, A., & Sebastiani, F. (2023). Learning to
        Quantify. Springer.
    """

    _parameter_constraints = {
        "tol": [Interval(0, None, inclusive_left=False)],
        "max_iter": [Interval(1, None, inclusive_left=True)],
        "init_cfp": [Interval(0, None, inclusive_left=False)]
    }

    def __init__(self, learner=None, tol=1e-4, max_iter=100, init_cfp=1.0):
        super().__init__(learner=learner, tol=tol, max_iter=max_iter)
        self.init_cfp = float(init_cfp)

    def _iterate(self, predictions, priors):
        """
        Estimate binary prevalences via iterative cost-sensitive thresholding.

        Parameters
        ----------
        predictions : ndarray, shape (n_samples, 2)
            Posterior probabilities for binary classes [neg, pos].
        priors : ndarray, shape (2,)
            Training priors [p(neg), p(pos)].

        Returns
        -------
        prevalences : ndarray, shape (2,)
            Estimated prevalences for classes [neg, pos], normalized to 1.
        """
        posteriors = np.asarray(predictions, dtype=np.float64)
        train_priors = np.asarray(priors, dtype=np.float64)

        if posteriors.ndim != 2 or posteriors.shape[1] != 2:
            raise ValueError("CDE implementation here supports binary case only: predictions shape (n,2).")

        # Clip away exact zeros/ones to keep ratios well defined.
        eps = 1e-12
        posteriors = np.clip(posteriors, eps, 1.0)

        # Training priors; the column order of `posteriors` is assumed to
        # match the order of `priors`.
        train_pos = train_priors[1]
        train_neg = train_priors[0]
        if train_pos <= 0 or train_neg <= 0:
            # Keep both strictly positive to avoid division by zero.
            train_pos = max(train_pos, eps)
            train_neg = max(train_neg, eps)

        fn_cost = 1.0
        fp_cost = float(self.init_cfp)

        pos_probs = posteriors[:, 1]
        last_pos_prev = None
        pos_prev = 0.0
        neg_prev = 1.0

        for _ in range(self.max_iter):
            # Cost-sensitive Bayes rule rewritten as a posterior threshold:
            #   predict positive  iff  cFP * p(-|x) < cFN * p(+|x)
            #   <=>  p(+|x) > cFP / (cFP + cFN)
            threshold = fp_cost / (fp_cost + fn_cost)

            # Classify-and-count on the thresholded positive posteriors.
            pos_prev = float((pos_probs > threshold).astype(float).mean())
            neg_prev = 1.0 - pos_prev

            # Eq. 4.27 update of the false-positive cost, guarded against
            # degenerate (all-positive / all-negative) estimates.
            safe_pos = max(pos_prev, eps)
            safe_neg = max(neg_prev, eps)
            updated_fp_cost = (train_pos / train_neg) * (safe_neg / safe_pos) * fn_cost

            # Stop once the positive-prevalence estimate has stabilized;
            # the pending cost update is intentionally discarded.
            if last_pos_prev is not None and abs(pos_prev - last_pos_prev) < self.tol:
                break

            fp_cost = updated_fp_cost
            last_pos_prev = pos_prev

        # If max_iter is exhausted we simply keep the last estimate (the
        # book warns the method lacks Fisher consistency in general).
        result = np.array([neg_prev, pos_prev], dtype=np.float64)
        # Normalize for numerical safety.
        return result / result.sum()
@@ -0,0 +1 @@
1
+ from ._classes import EnsembleQ, QuaDapt, AggregativeBootstrap