mlquantify 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in that registry.
@@ -1,15 +1,16 @@
+ from mlquantify.base import BaseQuantifier
+ from mlquantify.base_aggregative import AggregationMixin
  import numpy as np
  from mlquantify.base_aggregative import SoftLearnerQMixin
- from mlquantify.likelihood._base import BaseIterativeLikelihood
  from mlquantify.metrics._slq import MAE
- from mlquantify.multiclass import define_binary
+ from mlquantify.utils import _fit_context, validate_data, check_classes_attribute, validate_predictions, validate_prevalences
  from mlquantify.utils._constraints import (
      Interval,
      CallableConstraint,
      Options
  )

- class EMQ(SoftLearnerQMixin, BaseIterativeLikelihood):
+ class EMQ(SoftLearnerQMixin, AggregationMixin, BaseQuantifier):
      r"""Expectation-Maximization Quantifier (EMQ).

      Estimates class prevalences under prior probability shift by alternating
@@ -81,45 +82,63 @@ class EMQ(SoftLearnerQMixin, BaseIterativeLikelihood):
          "criteria": [CallableConstraint()],
      }

+     def __mlquantify_tags__(self):
+         tags = super().__mlquantify_tags__()
+         tags.prediction_requirements.requires_train_proba = False
+         return tags
+
      def __init__(self,
                   learner=None,
                   tol=1e-4,
                   max_iter=100,
                   calib_function=None,
                   criteria=MAE):
-         super().__init__(learner=learner, tol=tol, max_iter=max_iter)
+         self.learner = learner
+         self.tol = tol
+         self.max_iter = max_iter
          self.calib_function = calib_function
          self.criteria = criteria
+
+     @_fit_context(prefer_skip_nested_validation=True)
+     def fit(self, X, y):
+         """Fit the quantifier using the provided data and learner."""
+         X, y = validate_data(self, X, y)
+         self.classes_ = np.unique(y)
+         self.learner.fit(X, y)
+         counts = np.array([np.count_nonzero(y == _class) for _class in self.classes_])
+         self.priors = counts / len(y)
+         self.y_train = y
+
+         return self
+
+     def predict(self, X):
+         """Predict the prevalence of each class."""
+         X = validate_data(self, X)
+         estimator_function = _get_learner_function(self)
+         predictions = getattr(self.learner, estimator_function)(X)
+         prevalences = self.aggregate(predictions, self.y_train)
+         return prevalences
+
+     def aggregate(self, predictions, y_train):
+         predictions = validate_predictions(self, predictions)
+         self.classes_ = check_classes_attribute(self, np.unique(y_train))

-     def _iterate(self, predictions, priors):
-         r"""Perform EM quantification iteration.
-
-         Steps:
-         - Calibrate posterior predictions if calibration function specified.
-         - Apply EM procedure to re-estimate prevalences, based on training priors and calibrated posteriors.
-
-         Parameters
-         ----------
-         predictions : ndarray of shape (n_samples, n_classes)
-             Posterior probabilities for each class on test data.
-         priors : ndarray of shape (n_classes,)
-             Training set class prevalences, serving as initial priors.
-
-         Returns
-         -------
-         prevalences : ndarray of shape (n_classes,)
-             Estimated class prevalences after EM iteration.
-         """
+         if not hasattr(self, 'priors') or len(self.priors) != len(self.classes_):
+             counts = np.array([np.count_nonzero(y_train == _class) for _class in self.classes_])
+             self.priors = counts / len(y_train)
+
          calibrated_predictions = self._apply_calibration(predictions)
          prevalences, _ = self.EM(
              posteriors=calibrated_predictions,
-             priors=priors,
+             priors=self.priors,
              tolerance=self.tol,
              max_iter=self.max_iter,
              criteria=self.criteria
          )
-         return prevalences

+         prevalences = validate_prevalences(self, prevalences, self.classes_)
+         return prevalences
+

      @classmethod
      def EM(cls, posteriors, priors, tolerance=1e-6, max_iter=100, criteria=MAE):
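For orientation, a minimal usage sketch of the refactored fit/predict interface introduced above, assuming a scikit-learn probabilistic classifier; the import path is an assumption (the diff only shows that EMQ previously built on mlquantify.likelihood._base), not something the diff confirms.

# Hypothetical usage of the new fit/predict API (import path assumed, not shown in this diff)
import numpy as np
from sklearn.linear_model import LogisticRegression
from mlquantify.likelihood import EMQ  # assumed location

rng = np.random.default_rng(0)
X_train, y_train = rng.random((200, 5)), rng.integers(0, 2, 200)
X_test = rng.random((100, 5))

emq = EMQ(learner=LogisticRegression(), tol=1e-4, max_iter=100)
emq.fit(X_train, y_train)           # stores classes_, priors and y_train
prevalences = emq.predict(X_test)   # calls aggregate(), which runs EM() internally
print(prevalences)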
@@ -254,177 +273,4 @@ class EMQ(SoftLearnerQMixin, BaseIterativeLikelihood):
          logits = np.log(preds)
          scaled = logits * W
          exp_scaled = np.exp(scaled - np.max(scaled, axis=1, keepdims=True))
-         return exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True)
-
-
-
- class MLPE(SoftLearnerQMixin, BaseIterativeLikelihood):
-     r"""Maximum Likelihood Prevalence Estimation (MLPE).
-
-     Returns training priors as prevalence estimates without adaptations.
-
-     Parameters
-     ----------
-     learner : estimator, optional
-         Base classifier.
-
-     References
-     ----------
-     .. [2] Esuli, A., Moreo, A., & Sebastiani, F. (2023). Learning to Quantify. Springer.
-     """
-
-     def __init__(self, learner=None):
-         super().__init__(learner=learner, max_iter=1)
-
-     def _iterate(self, predictions, priors):
-         """Returns training priors without adjustment.
-
-         Parameters
-         ----------
-         predictions : array-like
-             Ignored in this implementation.
-         priors : array-like
-             Training priors, returned as is.
-
-         Returns
-         -------
-         prevalences : array-like
-             Equal to the training priors.
-         """
-         return priors
-
- @define_binary
- class CDE(SoftLearnerQMixin, BaseIterativeLikelihood):
-     r"""CDE-Iterate for binary classification prevalence estimation.
-
-     Threshold :math:`\tau` from false positive and false negative costs:
-     .. math::
-         \tau = \frac{c_{FP}}{c_{FP} + c_{FN}}
-
-     Hard classification by thresholding posterior probability :math:`p(+|x)` at :math:`\tau`:
-     .. math::
-         \hat{y}(x) = \mathbf{1}_{p(+|x) > \tau}
-
-     Prevalence estimation via classify-and-count:
-     .. math::
-         \hat{p}_U(+) = \frac{1}{N} \sum_{n=1}^N \hat{y}(x_n)
-
-     False positive cost update:
-     .. math::
-         c_{FP}^{new} = \frac{p_L(+)}{p_L(-)} \times \frac{\hat{p}_U(-)}{\hat{p}_U(+)} \times c_{FN}
-
-     Parameters
-     ----------
-     learner : estimator, optional
-         Wrapped classifier (unused).
-     tol : float, default=1e-4
-         Convergence tolerance.
-     max_iter : int, default=100
-         Max iterations.
-     init_cfp : float, default=1.0
-         Initial false positive cost.
-
-     References
-     ----------
-     .. [1] Esuli, A., Moreo, A., & Sebastiani, F. (2023). Learning to Quantify. Springer.
-     """
-
-     _parameter_constraints = {
-         "tol": [Interval(0, None, inclusive_left=False)],
-         "max_iter": [Interval(1, None, inclusive_left=True)],
-         "init_cfp": [Interval(0, None, inclusive_left=False)]
-     }
-
-     def __init__(self, learner=None, tol=1e-4, max_iter=100, init_cfp=1.0):
-         super().__init__(learner=learner, tol=tol, max_iter=max_iter)
-         self.init_cfp = float(init_cfp)
-
-     def _iterate(self, predictions, priors):
-         r"""Iteratively estimate prevalences via cost-sensitive thresholding.
-
-         Parameters
-         ----------
-         predictions : ndarray, shape (n_samples, 2)
-             Posterior probabilities for binary classes [neg, pos].
-         priors : ndarray, shape (2,)
-             Training priors [p(neg), p(pos)].
-
-         Returns
-         -------
-         prevalences : ndarray, shape (2,)
-             Estimated prevalences for classes [neg, pos].
-         """
-         P = np.asarray(predictions, dtype=np.float64)
-         Ptr = np.asarray(priors, dtype=np.float64)
-
-         # basic checks
-         if P.ndim != 2 or P.shape[1] != 2:
-             raise ValueError("CDE implementation here supports binary case only: predictions shape (n,2).")
-
-         # ensure no zeros
-         eps = 1e-12
-         P = np.clip(P, eps, 1.0)
-
-         # training priors pL(+), pL(-)
-         # assume Ptr order matches columns of P; if Ptr sums to 1 but order unknown, user must match.
-         pL_pos = Ptr[1]
-         pL_neg = Ptr[0]
-         if pL_pos <= 0 or pL_neg <= 0:
-             # keep them positive to avoid divisions by zero
-             pL_pos = max(pL_pos, eps)
-             pL_neg = max(pL_neg, eps)
-
-         # initialize costs
-         cFN = 1.0
-         cFP = float(self.init_cfp)
-
-         prev_prev_pos = None
-         s = 0
-
-         # iterate: compute threshold from costs, classify, estimate prevalences via CC,
-         # update cFP via eq. (4.27), repeat
-         while s < self.max_iter:
-             # decision threshold tau for positive class:
-             # Derivation:
-             # predict positive if cost_FP * p(-|x) < cost_FN * p(+|x)
-             # => predict positive if p(+|x) / p(-|x) > cost_FP / cost_FN
-             # since p(+|x) / p(-|x) = p(+|x) / (1 - p(+|x)):
-             # p(+|x) > cost_FP / (cost_FP + cost_FN)
-             tau = cFP / (cFP + cFN)
-
-             # hard predictions for positive class using threshold on posterior for positive (col 1)
-             pos_probs = P[:, 1]
-             hard_pos = (pos_probs > tau).astype(float)
-
-             # classify-and-count prevalence estimate on U
-             prev_pos = hard_pos.mean()
-             prev_neg = 1.0 - prev_pos
-
-             # update cFP according to Eq. 4.27:
-             # cFP_new = (pL_pos / pL_neg) * (pU_hat(neg) / pU_hat(pos)) * cFN
-             # guard against zero prev_pos / prev_neg
-             prev_pos_safe = max(prev_pos, eps)
-             prev_neg_safe = max(prev_neg, eps)
-
-             cFP_new = (pL_pos / pL_neg) * (prev_neg_safe / prev_pos_safe) * cFN
-
-             # check convergence on prevalences (absolute change)
-             if prev_prev_pos is not None and abs(prev_pos - prev_prev_pos) < self.tol:
-                 break
-
-             # prepare next iter
-             cFP = cFP_new
-             prev_prev_pos = prev_pos
-             s += 1
-
-         # if didn't converge within max_iter we keep last estimate (book warns about lack of fisher consistency)
-         if s >= self.max_iter:
-             # optional: warning
-             # print('[warning] CDE-Iterate reached max_iter without converging')
-             pass
-
-         prevalences = np.array([prev_neg, prev_pos], dtype=np.float64)
-         # ensure sums to 1 (numerical safety)
-         prevalences = prevalences / prevalences.sum()
-
-         return prevalences
+         return exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True)
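The removed CDE class above is the CDE-Iterate procedure described by its docstring equations. For reference, a self-contained NumPy sketch of that loop (illustrative names, not part of the package):

import numpy as np

def cde_iterate(pos_posteriors, train_priors, tol=1e-4, max_iter=100, init_cfp=1.0):
    # Sketch of CDE-Iterate: threshold from costs, classify-and-count, cost update.
    pL_neg, pL_pos = train_priors                          # training priors [p(-), p(+)]
    cFN, cFP = 1.0, float(init_cfp)
    prev_pos, prev_neg, last = 0.5, 0.5, None
    eps = 1e-12
    for _ in range(max_iter):
        tau = cFP / (cFP + cFN)                            # tau = c_FP / (c_FP + c_FN)
        prev_pos = float(np.mean(pos_posteriors > tau))    # classify-and-count on U
        prev_neg = 1.0 - prev_pos
        if last is not None and abs(prev_pos - last) < tol:
            break
        # c_FP_new = (p_L(+) / p_L(-)) * (p_U(-) / p_U(+)) * c_FN
        cFP = (pL_pos / pL_neg) * (max(prev_neg, eps) / max(prev_pos, eps)) * cFN
        last = prev_pos
    return np.array([prev_neg, prev_pos])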
@@ -518,15 +518,15 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):

          if val_split is None:
              model.fit(X, y)
-             train_y_values = y
+             y_train = y
              train_predictions = getattr(model, learner_function)(X)
          else:
              X_fit, y_fit, X_val, y_val = train_test_split(X, y, test_size=val_split, random_state=self.random_state)
              model.fit(X_fit, y_fit)
-             train_y_values = y_val
+             y_train = y_val
              train_predictions = getattr(model, learner_function)(X_val)
          self.train_predictions = train_predictions
-         self.train_y_values = train_y_values
+         self.y_train = y_train

          return self

@@ -549,10 +549,10 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):

          predictions = getattr(model, learner_function)(X)

-         return self.aggregate(predictions, self.train_predictions, self.train_y_values)
+         return self.aggregate(predictions, self.train_predictions, self.y_train)


-     def aggregate(self, predictions, train_predictions, train_y_values):
+     def aggregate(self, predictions, train_predictions, y_train):
          r""" Aggregates the predictions using bootstrap resampling.

          Parameters
@@ -561,7 +561,7 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
              The input data.
          train_predictions : array-like of shape (n_samples, n_classes)
              The training predictions.
-         train_y_values : array-like of shape (n_samples,)
+         y_train : array-like of shape (n_samples,)
              The training target values.

          Returns
@@ -571,7 +571,7 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
          """
          prevalences = []

-         self.classes = np.unique(train_y_values)
+         self.classes = np.unique(y_train)

          for train_idx in bootstrap_sample_indices(
              n_samples=len(train_predictions),
@@ -580,7 +580,7 @@ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
              random_state=self.random_state
          ):
              train_pred_boot = train_predictions[train_idx]
-             train_y_boot = train_y_values[train_idx]
+             train_y_boot = y_train[train_idx]

              for test_idx in bootstrap_sample_indices(
                  n_samples=len(predictions),
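The hunks above only rename train_y_values to y_train inside AggregativeBootstrap's nested bootstrap. As a schematic reference, the resampling pattern looks roughly like the plain-NumPy sketch below; the quantify callable and the function itself are placeholders, not the package's bootstrap_sample_indices API.

import numpy as np

def bootstrap_prevalences(quantify, test_pred, train_pred, y_train,
                          n_train_boot=5, n_test_boot=5, random_state=0):
    # Resample training and test predictions, quantify each pair,
    # and stack the resulting prevalence vectors (one row per pair).
    rng = np.random.default_rng(random_state)
    results = []
    for _ in range(n_train_boot):
        tr = rng.integers(0, len(train_pred), size=len(train_pred))    # train resample
        for _ in range(n_test_boot):
            te = rng.integers(0, len(test_pred), size=len(test_pred))  # test resample
            results.append(quantify(test_pred[te], train_pred[tr], y_train[tr]))
    return np.vstack(results)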
@@ -679,7 +679,7 @@ class QuaDapt(MetaquantifierMixin, BaseQuantifier):
              raise ValueError(f"The quantifier {self.quantifier.__class__.__name__} does not use training probabilities, which are required for QuaDapt.")

          self.quantifier.learner.fit(X, y)
-         self.train_y_values = y
+         self.y_train = y

          return self

@@ -691,64 +691,72 @@ class QuaDapt(MetaquantifierMixin, BaseQuantifier):

          predictions = getattr(model, "predict_proba")(X)

-         return self.aggregate(predictions, self.train_y_values)
+         return self.aggregate(predictions, self.y_train)


-     def aggregate(self, predictions, train_y_values):
+     def aggregate(self, predictions, y_train):

-         pos_predictions = predictions[:, 1]
-         m = self._get_best_merging_factor(pos_predictions)
+         prevalence, _, _ = self.best_mixture(predictions)
+         prevalences = np.asarray([1-prevalence, prevalence])

-         self.classes = self.classes if hasattr(self, 'classes') else np.unique(train_y_values)
-
-         moss_scores, moss_labels = self.MoSS(1000, 0.5, m)
-
-         prevalences = self.quantifier.aggregate(predictions,
-                                                 moss_scores,
-                                                 moss_labels)
+         self.classes = self.classes if hasattr(self, 'classes') else np.unique(y_train)

-         prevalences = {self.classes[i]: v for i, v in enumerate(prevalences.values())}
+         prevalences = validate_prevalences(self, prevalences, self.classes)
          return prevalences


-     def _get_best_merging_factor(self, predictions):
+     def best_mixture(self, predictions):
+         predictions = predictions[:, 1]

          MF = np.atleast_1d(np.round(self.merging_factors, 2)).astype(float)

          distances = []
+         alphas = []

          for mf in MF:
-             scores, labels = self.MoSS(1000, 0.5, mf)
+             scores, labels = self.MoSS(n=1000, alpha=0.5, merging_factor=mf)
              pos_scores = scores[labels == 1][:, 1]
              neg_scores = scores[labels == 0][:, 1]
+
+             if self.measure in ["hellinger", "topsoe", "probsymm"]:
+                 method = DyS(measure=self.measure)
+             elif self.measure == "sord":
+                 method = SORD()

-             best_distance = self._get_best_distance(predictions, pos_scores, neg_scores)
+             alpha, distance = method.best_mixture(predictions, pos_scores, neg_scores)

-             distances.append(best_distance)
+             distances.append(distance)
+             alphas.append(alpha)

          best_m = MF[np.argmin(distances)]
-         return best_m
+         best_alpha = alphas[np.argmin(distances)]
+         best_distance = np.min(distances)
+         return best_alpha, best_distance, best_m

-     def _get_best_distance(self, predictions, pos_scores, neg_scores):
-
-         if self.measure in ["hellinger", "topsoe", "probsymm"]:
-             method = DyS(measure=self.measure)
-         elif self.measure == "sord":
-             method = SORD()
+     def get_best_distance(self, predictions):

-         best_distance = method.get_best_distance(predictions, pos_scores, neg_scores)
-         return best_distance
+         _, distance, _ = self.get_best_merging_factor(predictions)
+
+         return distance


      @classmethod
-     def MoSS(cls, n, alpha, m):
+     def MoSS(cls, n, alpha, merging_factor):
          r"""Model for Score Simulation

-         MoSS has three key parameters:
-         (I) the number of observations `n`;
-         (II) the class proportion `\alpha`, which defines the prevalence of the positive class;
-         (III) the merging factor :math:`m`, which controls the overlap between positive and negative score distributions
-         (where :math:`m=0` represents easily separable classes and :math:`m=1` represents highly overlapping ones).
+         Parameters
+         ----------
+         n : int
+             Number of observations.
+         alpha : float
+             Class proportion, which defines the prevalence of the positive class.
+         m : float
+             Merging factor, which controls the overlap between positive and negative score distributions.
+
+         Returns
+         -------
+         tuple
+             Tuple of score and label arrays.

          .. math::

@@ -776,9 +784,9 @@ class QuaDapt(MetaquantifierMixin, BaseQuantifier):
          n_neg = n - n_pos

          # Positive scores
-         p_score = np.random.uniform(size=n_pos) ** m
+         p_score = np.random.uniform(size=n_pos) ** merging_factor
          # Negative scores
-         n_score = 1 - (np.random.uniform(size=n_neg) ** m)
+         n_score = 1 - (np.random.uniform(size=n_neg) ** merging_factor)

          # Build the feature arrays (two identical columns)
          moss = np.column_stack(
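The two hunks above rename the MoSS merging factor parameter. As a standalone reference (not the package's classmethod), the score generation amounts to:

import numpy as np

def moss_scores(n=1000, alpha=0.5, merging_factor=0.3, seed=0):
    # MoSS simulation: positive scores are uniform**m, negative scores are 1 - uniform**m,
    # so m near 0 gives well-separated classes and m near 1 gives heavy overlap.
    rng = np.random.default_rng(seed)
    n_pos = int(n * alpha)
    n_neg = n - n_pos
    pos = rng.uniform(size=n_pos) ** merging_factor
    neg = 1.0 - rng.uniform(size=n_neg) ** merging_factor
    scores = np.concatenate([pos, neg])
    labels = np.concatenate([np.ones(n_pos, dtype=int), np.zeros(n_neg, dtype=int)])
    return scores, labels

Building on that, the best_mixture refactor shown earlier picks the merging factor (and mixture weight) whose simulated score mixture best matches the test scores. A rough histogram-based sketch of that selection, using a Hellinger distance in place of the package's DyS/SORD machinery (function names are illustrative):

def hellinger(p, q):
    # Hellinger distance between two discrete distributions.
    return np.sqrt(0.5 * np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))

def best_mixture_sketch(test_scores, merging_factors, bins=10):
    # For each candidate merging factor, simulate MoSS scores, grid-search the
    # mixture weight alpha whose pos/neg histogram mixture best matches the
    # test-score histogram, and keep the factor with the smallest distance.
    edges = np.linspace(0.0, 1.0, bins + 1)
    hist = lambda s: np.histogram(s, bins=edges)[0] / max(len(s), 1)
    test_h = hist(test_scores)
    best = (0.5, np.inf, None)                    # (alpha, distance, merging factor)
    for m in merging_factors:
        scores, labels = moss_scores(merging_factor=m)
        pos_h, neg_h = hist(scores[labels == 1]), hist(scores[labels == 0])
        for alpha in np.linspace(0.0, 1.0, 101):
            d = hellinger(alpha * pos_h + (1 - alpha) * neg_h, test_h)
            if d < best[1]:
                best = (alpha, d, m)
    return best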
@@ -3,5 +3,6 @@ from ._classes import (
      DyS,
      SMM,
      SORD,
-     HDx
+     HDx,
+     MMD_RKHS
  )