mlquantify 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in that registry.
@@ -1,15 +1,16 @@
+ from mlquantify.base import BaseQuantifier
+ from mlquantify.base_aggregative import AggregationMixin
  import numpy as np
  from mlquantify.base_aggregative import SoftLearnerQMixin
- from mlquantify.likelihood._base import BaseIterativeLikelihood
  from mlquantify.metrics._slq import MAE
- from mlquantify.multiclass import define_binary
+ from mlquantify.utils import _fit_context, validate_data, check_classes_attribute, validate_predictions, validate_prevalences
  from mlquantify.utils._constraints import (
      Interval,
      CallableConstraint,
      Options
  )

- class EMQ(SoftLearnerQMixin, BaseIterativeLikelihood):
+ class EMQ(SoftLearnerQMixin, AggregationMixin, BaseQuantifier):
      r"""Expectation-Maximization Quantifier (EMQ).

      Estimates class prevalences under prior probability shift by alternating
@@ -81,45 +82,63 @@ class EMQ(SoftLearnerQMixin, BaseIterativeLikelihood):
          "criteria": [CallableConstraint()],
      }

+     def __mlquantify_tags__(self):
+         tags = super().__mlquantify_tags__()
+         tags.prediction_requirements.requires_train_proba = False
+         return tags
+
      def __init__(self,
                   learner=None,
                   tol=1e-4,
                   max_iter=100,
                   calib_function=None,
                   criteria=MAE):
-         super().__init__(learner=learner, tol=tol, max_iter=max_iter)
+         self.learner = learner
+         self.tol = tol
+         self.max_iter = max_iter
          self.calib_function = calib_function
          self.criteria = criteria
+
+     @_fit_context(prefer_skip_nested_validation=True)
+     def fit(self, X, y):
+         """Fit the quantifier using the provided data and learner."""
+         X, y = validate_data(self, X, y)
+         self.classes_ = np.unique(y)
+         self.learner.fit(X, y)
+         counts = np.array([np.count_nonzero(y == _class) for _class in self.classes_])
+         self.priors = counts / len(y)
+         self.y_train = y
+
+         return self
+
+     def predict(self, X):
+         """Predict the prevalence of each class."""
+         X = validate_data(self, X)
+         estimator_function = _get_learner_function(self)
+         predictions = getattr(self.learner, estimator_function)(X)
+         prevalences = self.aggregate(predictions, self.y_train)
+         return prevalences
+
+     def aggregate(self, predictions, y_train):
+         predictions = validate_predictions(self, predictions)
+         self.classes_ = check_classes_attribute(self, np.unique(y_train))

-     def _iterate(self, predictions, priors):
-         r"""Perform EM quantification iteration.
-
-         Steps:
-         - Calibrate posterior predictions if calibration function specified.
-         - Apply EM procedure to re-estimate prevalences, based on training priors and calibrated posteriors.
-
-         Parameters
-         ----------
-         predictions : ndarray of shape (n_samples, n_classes)
-             Posterior probabilities for each class on test data.
-         priors : ndarray of shape (n_classes,)
-             Training set class prevalences, serving as initial priors.
-
-         Returns
-         -------
-         prevalences : ndarray of shape (n_classes,)
-             Estimated class prevalences after EM iteration.
-         """
+         if not hasattr(self, 'priors') or len(self.priors) != len(self.classes_):
+             counts = np.array([np.count_nonzero(y_train == _class) for _class in self.classes_])
+             self.priors = counts / len(y_train)
+
          calibrated_predictions = self._apply_calibration(predictions)
          prevalences, _ = self.EM(
              posteriors=calibrated_predictions,
-             priors=priors,
+             priors=self.priors,
              tolerance=self.tol,
              max_iter=self.max_iter,
              criteria=self.criteria
          )
-         return prevalences

+         prevalences = validate_prevalences(self, prevalences, self.classes_)
+         return prevalences
+

      @classmethod
      def EM(cls, posteriors, priors, tolerance=1e-6, max_iter=100, criteria=MAE):
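For orientation, a minimal usage sketch of the refactored fit/predict interface introduced above, assuming a scikit-learn probabilistic classifier; the import path is an assumption (the diff only shows that EMQ previously built on mlquantify.likelihood._base), not something the diff confirms.

# Hypothetical usage of the new fit/predict API (import path assumed, not shown in this diff)
import numpy as np
from sklearn.linear_model import LogisticRegression
from mlquantify.likelihood import EMQ  # assumed location

rng = np.random.default_rng(0)
X_train, y_train = rng.random((200, 5)), rng.integers(0, 2, 200)
X_test = rng.random((100, 5))

emq = EMQ(learner=LogisticRegression(), tol=1e-4, max_iter=100)
emq.fit(X_train, y_train)           # stores classes_, priors and y_train
prevalences = emq.predict(X_test)   # calls aggregate(), which runs EM() internally
print(prevalences)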
@@ -254,177 +273,4 @@ class EMQ(SoftLearnerQMixin, BaseIterativeLikelihood):
          logits = np.log(preds)
          scaled = logits * W
          exp_scaled = np.exp(scaled - np.max(scaled, axis=1, keepdims=True))
-         return exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True)
-
-
-
- class MLPE(SoftLearnerQMixin, BaseIterativeLikelihood):
-     r"""Maximum Likelihood Prevalence Estimation (MLPE).
-
-     Returns training priors as prevalence estimates without adaptations.
-
-     Parameters
-     ----------
-     learner : estimator, optional
-         Base classifier.
-
-     References
-     ----------
-     .. [2] Esuli, A., Moreo, A., & Sebastiani, F. (2023). Learning to Quantify. Springer.
-     """
-
-     def __init__(self, learner=None):
-         super().__init__(learner=learner, max_iter=1)
-
-     def _iterate(self, predictions, priors):
-         """Returns training priors without adjustment.
-
-         Parameters
-         ----------
-         predictions : array-like
-             Ignored in this implementation.
-         priors : array-like
-             Training priors, returned as is.
-
-         Returns
-         -------
-         prevalences : array-like
-             Equal to the training priors.
-         """
-         return priors
-
- @define_binary
- class CDE(SoftLearnerQMixin, BaseIterativeLikelihood):
-     r"""CDE-Iterate for binary classification prevalence estimation.
-
-     Threshold :math:`\tau` from false positive and false negative costs:
-     .. math::
-         \tau = \frac{c_{FP}}{c_{FP} + c_{FN}}
-
-     Hard classification by thresholding posterior probability :math:`p(+|x)` at :math:`\tau`:
-     .. math::
-         \hat{y}(x) = \mathbf{1}_{p(+|x) > \tau}
-
-     Prevalence estimation via classify-and-count:
-     .. math::
-         \hat{p}_U(+) = \frac{1}{N} \sum_{n=1}^N \hat{y}(x_n)
-
-     False positive cost update:
-     .. math::
-         c_{FP}^{new} = \frac{p_L(+)}{p_L(-)} \times \frac{\hat{p}_U(-)}{\hat{p}_U(+)} \times c_{FN}
-
-     Parameters
-     ----------
-     learner : estimator, optional
-         Wrapped classifier (unused).
-     tol : float, default=1e-4
-         Convergence tolerance.
-     max_iter : int, default=100
-         Max iterations.
-     init_cfp : float, default=1.0
-         Initial false positive cost.
-
-     References
-     ----------
-     .. [1] Esuli, A., Moreo, A., & Sebastiani, F. (2023). Learning to Quantify. Springer.
-     """
-
-     _parameter_constraints = {
-         "tol": [Interval(0, None, inclusive_left=False)],
-         "max_iter": [Interval(1, None, inclusive_left=True)],
-         "init_cfp": [Interval(0, None, inclusive_left=False)]
-     }
-
-     def __init__(self, learner=None, tol=1e-4, max_iter=100, init_cfp=1.0):
-         super().__init__(learner=learner, tol=tol, max_iter=max_iter)
-         self.init_cfp = float(init_cfp)
-
-     def _iterate(self, predictions, priors):
-         r"""Iteratively estimate prevalences via cost-sensitive thresholding.
-
-         Parameters
-         ----------
-         predictions : ndarray, shape (n_samples, 2)
-             Posterior probabilities for binary classes [neg, pos].
-         priors : ndarray, shape (2,)
-             Training priors [p(neg), p(pos)].
-
-         Returns
-         -------
-         prevalences : ndarray, shape (2,)
-             Estimated prevalences for classes [neg, pos].
-         """
-         P = np.asarray(predictions, dtype=np.float64)
-         Ptr = np.asarray(priors, dtype=np.float64)
-
-         # basic checks
-         if P.ndim != 2 or P.shape[1] != 2:
-             raise ValueError("CDE implementation here supports binary case only: predictions shape (n,2).")
-
-         # ensure no zeros
-         eps = 1e-12
-         P = np.clip(P, eps, 1.0)
-
-         # training priors pL(+), pL(-)
-         # assume Ptr order matches columns of P; if Ptr sums to 1 but order unknown, user must match.
-         pL_pos = Ptr[1]
-         pL_neg = Ptr[0]
-         if pL_pos <= 0 or pL_neg <= 0:
-             # keep them positive to avoid divisions by zero
-             pL_pos = max(pL_pos, eps)
-             pL_neg = max(pL_neg, eps)
-
-         # initialize costs
-         cFN = 1.0
-         cFP = float(self.init_cfp)
-
-         prev_prev_pos = None
-         s = 0
-
-         # iterate: compute threshold from costs, classify, estimate prevalences via CC,
-         # update cFP via eq. (4.27), repeat
-         while s < self.max_iter:
-             # decision threshold tau for positive class:
-             # Derivation:
-             # predict positive if cost_FP * p(-|x) < cost_FN * p(+|x)
-             # => predict positive if p(+|x) / p(-|x) > cost_FP / cost_FN
-             # since p(+|x) / p(-|x) = p(+|x) / (1 - p(+|x)):
-             # p(+|x) > cost_FP / (cost_FP + cost_FN)
-             tau = cFP / (cFP + cFN)
-
-             # hard predictions for positive class using threshold on posterior for positive (col 1)
-             pos_probs = P[:, 1]
-             hard_pos = (pos_probs > tau).astype(float)
-
-             # classify-and-count prevalence estimate on U
-             prev_pos = hard_pos.mean()
-             prev_neg = 1.0 - prev_pos
-
-             # update cFP according to Eq. 4.27:
-             # cFP_new = (pL_pos / pL_neg) * (pU_hat(neg) / pU_hat(pos)) * cFN
-             # guard against zero prev_pos / prev_neg
-             prev_pos_safe = max(prev_pos, eps)
-             prev_neg_safe = max(prev_neg, eps)
-
-             cFP_new = (pL_pos / pL_neg) * (prev_neg_safe / prev_pos_safe) * cFN
-
-             # check convergence on prevalences (absolute change)
-             if prev_prev_pos is not None and abs(prev_pos - prev_prev_pos) < self.tol:
-                 break
-
-             # prepare next iter
-             cFP = cFP_new
-             prev_prev_pos = prev_pos
-             s += 1
-
-         # if didn't converge within max_iter we keep last estimate (book warns about lack of fisher consistency)
-         if s >= self.max_iter:
-             # optional: warning
-             # print('[warning] CDE-Iterate reached max_iter without converging')
-             pass
-
-         prevalences = np.array([prev_neg, prev_pos], dtype=np.float64)
-         # ensure sums to 1 (numerical safety)
-         prevalences = prevalences / prevalences.sum()
-
-         return prevalences
+         return exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True)
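The removed CDE class above is the CDE-Iterate procedure described by its docstring equations. For reference, a self-contained NumPy sketch of that loop (illustrative names, not part of the package):

import numpy as np

def cde_iterate(pos_posteriors, train_priors, tol=1e-4, max_iter=100, init_cfp=1.0):
    # Sketch of CDE-Iterate: threshold from costs, classify-and-count, cost update.
    pL_neg, pL_pos = train_priors                          # training priors [p(-), p(+)]
    cFN, cFP = 1.0, float(init_cfp)
    prev_pos, prev_neg, last = 0.5, 0.5, None
    eps = 1e-12
    for _ in range(max_iter):
        tau = cFP / (cFP + cFN)                            # tau = c_FP / (c_FP + c_FN)
        prev_pos = float(np.mean(pos_posteriors > tau))    # classify-and-count on U
        prev_neg = 1.0 - prev_pos
        if last is not None and abs(prev_pos - last) < tol:
            break
        # c_FP_new = (p_L(+) / p_L(-)) * (p_U(-) / p_U(+)) * c_FN
        cFP = (pL_pos / pL_neg) * (max(prev_neg, eps) / max(prev_pos, eps)) * cFN
        last = prev_pos
    return np.array([prev_neg, prev_pos])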
@@ -518,15 +518,15 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):

          if val_split is None:
              model.fit(X, y)
-             train_y_values = y
+             y_train = y
              train_predictions = getattr(model, learner_function)(X)
          else:
              X_fit, y_fit, X_val, y_val = train_test_split(X, y, test_size=val_split, random_state=self.random_state)
              model.fit(X_fit, y_fit)
-             train_y_values = y_val
+             y_train = y_val
              train_predictions = getattr(model, learner_function)(X_val)
          self.train_predictions = train_predictions
-         self.train_y_values = train_y_values
+         self.y_train = y_train

          return self

@@ -549,10 +549,10 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):

          predictions = getattr(model, learner_function)(X)

-         return self.aggregate(predictions, self.train_predictions, self.train_y_values)
+         return self.aggregate(predictions, self.train_predictions, self.y_train)


-     def aggregate(self, predictions, train_predictions, train_y_values):
+     def aggregate(self, predictions, train_predictions, y_train):
          r""" Aggregates the predictions using bootstrap resampling.

          Parameters
@@ -561,7 +561,7 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
              The input data.
          train_predictions : array-like of shape (n_samples, n_classes)
              The training predictions.
-         train_y_values : array-like of shape (n_samples,)
+         y_train : array-like of shape (n_samples,)
              The training target values.

          Returns
@@ -571,7 +571,7 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
          """
          prevalences = []

-         self.classes = np.unique(train_y_values)
+         self.classes = np.unique(y_train)

          for train_idx in bootstrap_sample_indices(
              n_samples=len(train_predictions),
@@ -580,7 +580,7 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
              random_state=self.random_state
          ):
              train_pred_boot = train_predictions[train_idx]
-             train_y_boot = train_y_values[train_idx]
+             train_y_boot = y_train[train_idx]

              for test_idx in bootstrap_sample_indices(
                  n_samples=len(predictions),
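The hunks above only rename train_y_values to y_train inside AggregativeBootstrap's nested bootstrap. As a schematic reference, the resampling pattern looks roughly like the plain-NumPy sketch below; the quantify callable and the function itself are placeholders, not the package's bootstrap_sample_indices API.

import numpy as np

def bootstrap_prevalences(quantify, test_pred, train_pred, y_train,
                          n_train_boot=5, n_test_boot=5, random_state=0):
    # Resample training and test predictions, quantify each pair,
    # and stack the resulting prevalence vectors (one row per pair).
    rng = np.random.default_rng(random_state)
    results = []
    for _ in range(n_train_boot):
        tr = rng.integers(0, len(train_pred), size=len(train_pred))    # train resample
        for _ in range(n_test_boot):
            te = rng.integers(0, len(test_pred), size=len(test_pred))  # test resample
            results.append(quantify(test_pred[te], train_pred[tr], y_train[tr]))
    return np.vstack(results)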
@@ -679,7 +679,7 @@ class QuaDapt(MetaquantifierMixin, BaseQuantifier):
              raise ValueError(f"The quantifier {self.quantifier.__class__.__name__} does not use training probabilities, which are required for QuaDapt.")

          self.quantifier.learner.fit(X, y)
-         self.train_y_values = y
+         self.y_train = y

          return self

@@ -691,64 +691,72 @@ class QuaDapt(MetaquantifierMixin, BaseQuantifier):

          predictions = getattr(model, "predict_proba")(X)

-         return self.aggregate(predictions, self.train_y_values)
+         return self.aggregate(predictions, self.y_train)


-     def aggregate(self, predictions, train_y_values):
+     def aggregate(self, predictions, y_train):

-         pos_predictions = predictions[:, 1]
-         m = self._get_best_merging_factor(pos_predictions)
+         prevalence, _, _ = self.best_mixture(predictions)
+         prevalences = np.asarray([1-prevalence, prevalence])

-         self.classes = self.classes if hasattr(self, 'classes') else np.unique(train_y_values)
-
-         moss_scores, moss_labels = self.MoSS(1000, 0.5, m)
-
-         prevalences = self.quantifier.aggregate(predictions,
-                                                 moss_scores,
-                                                 moss_labels)
+         self.classes = self.classes if hasattr(self, 'classes') else np.unique(y_train)

-         prevalences = {self.classes[i]: v for i, v in enumerate(prevalences.values())}
+         prevalences = validate_prevalences(self, prevalences, self.classes)
          return prevalences


-     def _get_best_merging_factor(self, predictions):
+     def best_mixture(self, predictions):
+         predictions = predictions[:, 1]

          MF = np.atleast_1d(np.round(self.merging_factors, 2)).astype(float)

          distances = []
+         alphas = []

          for mf in MF:
-             scores, labels = self.MoSS(1000, 0.5, mf)
+             scores, labels = self.MoSS(n=1000, alpha=0.5, merging_factor=mf)
              pos_scores = scores[labels == 1][:, 1]
              neg_scores = scores[labels == 0][:, 1]
+
+             if self.measure in ["hellinger", "topsoe", "probsymm"]:
+                 method = DyS(measure=self.measure)
+             elif self.measure == "sord":
+                 method = SORD()

-             best_distance = self._get_best_distance(predictions, pos_scores, neg_scores)
+             alpha, distance = method.best_mixture(predictions, pos_scores, neg_scores)

-             distances.append(best_distance)
+             distances.append(distance)
+             alphas.append(alpha)

          best_m = MF[np.argmin(distances)]
-         return best_m
+         best_alpha = alphas[np.argmin(distances)]
+         best_distance = np.min(distances)
+         return best_alpha, best_distance, best_m

-     def _get_best_distance(self, predictions, pos_scores, neg_scores):
-
-         if self.measure in ["hellinger", "topsoe", "probsymm"]:
-             method = DyS(measure=self.measure)
-         elif self.measure == "sord":
-             method = SORD()
+     def get_best_distance(self, predictions):

-         best_distance = method.get_best_distance(predictions, pos_scores, neg_scores)
-         return best_distance
+         _, distance, _ = self.get_best_merging_factor(predictions)
+
+         return distance


      @classmethod
-     def MoSS(cls, n, alpha, m):
+     def MoSS(cls, n, alpha, merging_factor):
          r"""Model for Score Simulation

-         MoSS has three key parameters:
-         (I) the number of observations `n`;
-         (II) the class proportion `\alpha`, which defines the prevalence of the positive class;
-         (III) the merging factor :math:`m`, which controls the overlap between positive and negative score distributions
-         (where :math:`m=0` represents easily separable classes and :math:`m=1` represents highly overlapping ones).
+         Parameters
+         ----------
+         n : int
+             Number of observations.
+         alpha : float
+             Class proportion, which defines the prevalence of the positive class.
+         m : float
+             Merging factor, which controls the overlap between positive and negative score distributions.
+
+         Returns
+         -------
+         tuple
+             Tuple of score and label arrays.

          .. math::

@@ -776,9 +784,9 @@ class QuaDapt(MetaquantifierMixin, BaseQuantifier):
          n_neg = n - n_pos

          # Positive scores
-         p_score = np.random.uniform(size=n_pos) ** m
+         p_score = np.random.uniform(size=n_pos) ** merging_factor
          # Negative scores
-         n_score = 1 - (np.random.uniform(size=n_neg) ** m)
+         n_score = 1 - (np.random.uniform(size=n_neg) ** merging_factor)

          # Build the feature arrays (two identical columns)
          moss = np.column_stack(
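The two hunks above rename the MoSS merging factor parameter. As a standalone reference (not the package's classmethod), the score generation amounts to:

import numpy as np

def moss_scores(n=1000, alpha=0.5, merging_factor=0.3, seed=0):
    # MoSS simulation: positive scores are uniform**m, negative scores are 1 - uniform**m,
    # so m near 0 gives well-separated classes and m near 1 gives heavy overlap.
    rng = np.random.default_rng(seed)
    n_pos = int(n * alpha)
    n_neg = n - n_pos
    pos = rng.uniform(size=n_pos) ** merging_factor
    neg = 1.0 - rng.uniform(size=n_neg) ** merging_factor
    scores = np.concatenate([pos, neg])
    labels = np.concatenate([np.ones(n_pos, dtype=int), np.zeros(n_neg, dtype=int)])
    return scores, labels

Building on that, the best_mixture refactor shown earlier picks the merging factor (and mixture weight) whose simulated score mixture best matches the test scores. A rough histogram-based sketch of that selection, using a Hellinger distance in place of the package's DyS/SORD machinery (function names are illustrative):

def hellinger(p, q):
    # Hellinger distance between two discrete distributions.
    return np.sqrt(0.5 * np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))

def best_mixture_sketch(test_scores, merging_factors, bins=10):
    # For each candidate merging factor, simulate MoSS scores, grid-search the
    # mixture weight alpha whose pos/neg histogram mixture best matches the
    # test-score histogram, and keep the factor with the smallest distance.
    edges = np.linspace(0.0, 1.0, bins + 1)
    hist = lambda s: np.histogram(s, bins=edges)[0] / max(len(s), 1)
    test_h = hist(test_scores)
    best = (0.5, np.inf, None)                    # (alpha, distance, merging factor)
    for m in merging_factors:
        scores, labels = moss_scores(merging_factor=m)
        pos_h, neg_h = hist(scores[labels == 1]), hist(scores[labels == 0])
        for alpha in np.linspace(0.0, 1.0, 101):
            d = hellinger(alpha * pos_h + (1 - alpha) * neg_h, test_h)
            if d < best[1]:
                best = (alpha, d, m)
    return best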
@@ -3,5 +3,6 @@ from ._classes import (
      DyS,
      SMM,
      SORD,
-     HDx
+     HDx,
+     MMD_RKHS
  )