mlquantify 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl
- mlquantify/__init__.py +11 -1
- mlquantify/adjust_counting/__init__.py +11 -1
- mlquantify/adjust_counting/_adjustment.py +370 -87
- mlquantify/adjust_counting/_base.py +1 -3
- mlquantify/adjust_counting/_counting.py +27 -19
- mlquantify/adjust_counting/_utils.py +23 -28
- mlquantify/confidence.py +16 -22
- mlquantify/likelihood/_base.py +38 -52
- mlquantify/likelihood/_classes.py +88 -72
- mlquantify/meta/_classes.py +86 -62
- mlquantify/metrics/_oq.py +2 -2
- mlquantify/metrics/_rq.py +2 -2
- mlquantify/metrics/_slq.py +9 -9
- mlquantify/mixture/_base.py +13 -19
- mlquantify/mixture/_classes.py +68 -10
- mlquantify/mixture/_utils.py +62 -11
- mlquantify/model_selection/_protocol.py +6 -6
- mlquantify/model_selection/_search.py +1 -1
- mlquantify/neighbors/_base.py +35 -65
- mlquantify/neighbors/_classes.py +1 -10
- mlquantify/neighbors/_classification.py +5 -12
- mlquantify/neighbors/_kde.py +7 -9
- mlquantify/neighbors/_utils.py +17 -21
- mlquantify/utils/_validation.py +3 -3
- mlquantify/utils/prevalence.py +4 -1
- {mlquantify-0.1.9.dist-info → mlquantify-0.1.11.dist-info}/METADATA +10 -18
- mlquantify-0.1.11.dist-info/RECORD +53 -0
- mlquantify-0.1.9.dist-info/RECORD +0 -53
- {mlquantify-0.1.9.dist-info → mlquantify-0.1.11.dist-info}/WHEEL +0 -0
- {mlquantify-0.1.9.dist-info → mlquantify-0.1.11.dist-info}/top_level.txt +0 -0
mlquantify/__init__.py
CHANGED
@@ -1,3 +1,13 @@
 "mlquantify, a Python package for quantification"
 
-
+from . import neighbors
+from . import likelihood
+from . import mixture
+from . import meta
+from . import adjust_counting
+from . import model_selection
+from . import base_aggregative
+from . import base
+from . import calibration
+from . import confidence
+from . import multiclass
mlquantify/adjust_counting/__init__.py
CHANGED

@@ -1,4 +1,7 @@
-from ._counting import CC, PCC
+from ._counting import (
+    CC,
+    PCC
+)
 from ._adjustment import (
     ThresholdAdjustment,
     MatrixAdjustment,

@@ -11,4 +14,11 @@ from ._adjustment import (
     T50,
     MS,
     MS2,
+)
+
+from ._utils import (
+    compute_table,
+    compute_fpr,
+    compute_tpr,
+    evaluate_thresholds,
 )
mlquantify/adjust_counting/_adjustment.py
CHANGED

@@ -17,38 +17,26 @@ from mlquantify.utils._constraints import Interval, Options
 
 @define_binary
 class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
-    r"""
-    Applies threshold-based adjustment methods for quantification.
+    r"""Base class for threshold-based adjustment methods for quantification.
 
     This is the base class for methods such as ACC, X, MAX, T50, MS, and MS2,
-    which adjust prevalence estimates based on the classifier
+    which adjust prevalence estimates based on the classifier's ROC curve,
+    as proposed by [1]_.
 
-    These methods correct the bias in *Classify & Count (CC)* estimates caused
-    in class distributions between the training and test datasets.
-
-    Given:
-    - \( p' \): observed positive proportion from CC,
-    - \( \text{TPR} = P(\hat{y}=1|y=1) \),
-    - \( \text{FPR} = P(\hat{y}=1|y=0) \),
-
-    the adjusted prevalence is given by:
-
-    \[
-    \hat{p} = \frac{p' - \text{FPR}}{\text{TPR} - \text{FPR}}
-    \]
-
-    *Quantifying Counts and Costs via Classification*, DMKD 2008).
+    These methods correct the bias in *Classify & Count (CC)* estimates caused
+    by differences in class distributions between the training and test datasets.
+
+    The adjusted prevalence is calculated using the following formula:
+
+    .. math::
+
+        \hat{p} = \frac{p' - \text{FPR}}{\text{TPR} - \text{FPR}}
+
+    where:
+    - :math:`p'` is the observed positive proportion from CC,
+    - :math:`\text{TPR} = P(\hat{y}=1|y=1)` is the True Positive Rate,
+    - :math:`\text{FPR} = P(\hat{y}=1|y=0)` is the False Positive Rate.
 
     Parameters
     ----------
@@ -59,7 +47,6 @@ class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
     strategy : {'ovr'}, default='ovr'
         Strategy used for multiclass adaptation.
 
-
     Attributes
     ----------
     learner : estimator
@@ -67,6 +54,12 @@ class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
     classes : ndarray of shape (n_classes,)
         Unique class labels observed during training.
 
+    Notes
+    -----
+    - Defined only for binary quantification tasks.
+    - When applied to multiclass problems, the one-vs-rest strategy (`ovr`)
+      is used automatically.
+
 
     Examples
     --------
@@ -74,7 +67,7 @@ class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
     >>> from mlquantify.adjust_counting import ThresholdAdjustment
     >>> import numpy as np
     >>> class CustomThreshold(ThresholdAdjustment):
-    ...     def
+    ...     def get_best_threshold(self, thresholds, tprs, fprs):
    ...         idx = np.argmax(tprs - fprs)
    ...         return thresholds[idx], tprs[idx], fprs[idx]
    >>> X = np.random.randn(100, 4)
@@ -83,6 +76,13 @@ class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
     >>> q.fit(X, y)
     >>> q.predict(X)
     {0: 0.49, 1: 0.51}
+
+    References
+    ----------
+    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate
+       Classification", *Proceedings of ECML*, pp. 564-575.
+    .. [2] Forman, G. (2008). "Quantifying Counts and Costs via Classification",
+       *Data Mining and Knowledge Discovery*, 17(2), 164-206.
     """
 
     _parameter_constraints = {
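The correction these classes share is easy to sanity-check in isolation. A minimal, self-contained sketch of the formula (the function name is illustrative, not part of mlquantify):

import numpy as np

def adjusted_count(p_observed, tpr, fpr):
    # Correct a Classify & Count estimate p' given TPR/FPR, clipping to [0, 1].
    if np.isclose(tpr, fpr):
        return p_observed  # degenerate denominator: no usable signal at this threshold
    return float(np.clip((p_observed - fpr) / (tpr - fpr), 0.0, 1.0))

# CC reports 60% positives; with TPR=0.8 and FPR=0.2 the corrected prevalence is ~0.667.
print(adjusted_count(0.60, tpr=0.8, fpr=0.2))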
@@ -101,8 +101,8 @@ class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
         """Internal adjustment computation based on selected ROC threshold."""
         positive_scores = train_y_scores[:, 1]
 
-        thresholds, tprs, fprs = evaluate_thresholds(train_y_values, positive_scores
-        threshold, tpr, fpr = self.
+        thresholds, tprs, fprs = evaluate_thresholds(train_y_values, positive_scores)
+        threshold, tpr, fpr = self.get_best_threshold(thresholds, tprs, fprs)
 
         cc_predictions = CC(threshold).aggregate(predictions)[1]
 
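The `_adjust` body above leans on `evaluate_thresholds`, newly re-exported from `._utils`. Its implementation is not part of this diff; a plausible sketch of what such a helper computes (TPR and FPR over a grid of thresholds), for readers following along:

import numpy as np

def evaluate_thresholds_sketch(y_true, positive_scores, n_thresholds=101):
    # Hypothetical stand-in: the package's evaluate_thresholds may differ in
    # grid choice and tie handling. Assumes both classes are present in y_true.
    thresholds = np.linspace(0.0, 1.0, n_thresholds)
    y_true = np.asarray(y_true)
    pos, neg = y_true == 1, y_true == 0
    tprs = np.array([(positive_scores[pos] >= t).mean() for t in thresholds])
    fprs = np.array([(positive_scores[neg] >= t).mean() for t in thresholds])
    return thresholds, tprs, fprs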
@@ -114,42 +114,40 @@ class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
         return np.asarray([1 - prevalence, prevalence])
 
     @abstractmethod
-    def
+    def get_best_threshold(self, thresholds, tprs, fprs):
         """Select the best threshold according to the specific method."""
         ...
 
 
 class MatrixAdjustment(BaseAdjustCount):
-    r"""
-    Base class for matrix-based quantification adjustments (FM, GAC, GPAC).
+    r"""Base class for matrix-based quantification adjustments.
 
     This class implements the matrix correction model for quantification
-    as formulated in Firat (2016), which expresses the observed prevalences
-    a linear combination of true prevalences through the confusion matrix.
+    as formulated in Firat (2016) [1]_, which expresses the observed prevalences
+    as a linear combination of true prevalences through the confusion matrix.
 
-        \hat{\pi}_F \ge 0, \quad \sum_k \hat{\pi}_{F,k} = 1
-    \]
+    The system is modeled as:
+
+    .. math::
+
+        \mathbf{y} = \mathbf{C}\hat{\pi}_F + \varepsilon
+
+    subject to the constraints:
+
+    .. math::
+
+        \hat{\pi}_F \ge 0, \quad \sum_k \hat{\pi}_{F,k} = 1
 
     where:
-
-    -
-    - Constrained optimization (quadratic or least-squares).
+    - :math:`\mathbf{y}` is the vector of predicted prevalences in the test set,
+    - :math:`\mathbf{C}` is the confusion matrix,
+    - :math:`\hat{\pi}_F` is the true class prevalence vector (unknown),
+    - :math:`\varepsilon` is the residual error.
+
+    The model can be solved via:
+
+    - **Linear algebraic solution**: uses matrix inversion
+    - **Constrained optimization**: quadratic or least-squares approach
 
 
     Parameters
@@ -158,10 +156,10 @@ class MatrixAdjustment(BaseAdjustCount):
         Classifier with `fit` and `predict` methods.
     solver : {'optim', 'linear'}, optional
         Solver for the adjustment system:
+
         - `'linear'`: uses matrix inversion (e.g., GAC, GPAC)
         - `'optim'`: uses optimization (e.g., FM)
 
-
     Attributes
     ----------
     CM : ndarray of shape (n_classes, n_classes)
@@ -170,15 +168,11 @@ class MatrixAdjustment(BaseAdjustCount):
         Class labels observed in training.
 
 
-    References
-    ----------
-    - Firat, A. (2016). *Unified Framework for Quantification.* AAAI, pp. 1-8.
-
-
     Examples
     --------
     >>> from sklearn.linear_model import LogisticRegression
     >>> from mlquantify.adjust_counting import MatrixAdjustment
+    >>> import numpy as np
     >>> class MyMatrix(MatrixAdjustment):
     ...     def _compute_confusion_matrix(self, preds, y):
     ...         cm = np.ones((2, 2))
@@ -189,8 +183,15 @@ class MatrixAdjustment(BaseAdjustCount):
     >>> q.fit(X, y)
     >>> q.predict(X)
     {0: 0.5, 1: 0.5}
+
+    References
+    ----------
+    .. [1] Firat, A. (2016). "Unified Framework for Quantification",
+       *Proceedings of AAAI Conference on Artificial Intelligence*,
+       pp. 1-8.
     """
 
+
     _parameter_constraints = {"solver": Options(["optim", "linear"])}
 
     def __init__(self, learner=None, solver=None):
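The doctest's numbers are easy to reproduce by hand. A toy instance of the linear correction above, assuming the column convention C[i, k] = P(predicted i | true k), which is not stated in this hunk:

import numpy as np

C = np.array([[0.8, 0.3],
              [0.2, 0.7]])            # toy confusion matrix
p_observed = np.array([0.55, 0.45])   # prevalences the classifier reports

pi = np.linalg.solve(C, p_observed)   # linear solution: pi = C^{-1} p
pi = np.clip(pi, 0.0, None)
pi = pi / pi.sum()                    # project back onto the simplex
print(pi)                             # [0.5 0.5]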
@@ -215,11 +216,7 @@ class MatrixAdjustment(BaseAdjustCount):
 
     def _solve_linear(self, prevs_estim):
         r"""
-        Solve the system
-
-        \[
-        \hat{\pi}_F = \mathbf{C}^{-1} \mathbf{p}
-        \]
+        Solve the system using matrix inversion.
         """
         try:
             adjusted = np.linalg.solve(self.CM, prevs_estim)
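Only the `try:` side of `_solve_linear` is visible in this hunk; the recovery path is not shown. A sketch of one reasonable guard (the pseudo-inverse fallback is an assumption, not the package's confirmed behavior):

import numpy as np

def solve_linear_sketch(CM, prevs_estim):
    try:
        adjusted = np.linalg.solve(CM, prevs_estim)
    except np.linalg.LinAlgError:
        # Assumed fallback for a singular confusion matrix: least-squares via pinv.
        adjusted = np.linalg.pinv(CM) @ prevs_estim
    adjusted = np.clip(adjusted, 0.0, None)
    return adjusted / adjusted.sum()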
@@ -230,13 +227,26 @@ class MatrixAdjustment(BaseAdjustCount):
         return adjusted
 
     def _solve_optimization(self, prevs_estim, priors):
-        r"""
-
+        r"""Solve the system linearly.
+
+        The solution is obtained by matrix inversion:
 
-
-
-
-
+        .. math::
+
+            \hat{\pi}_F = \mathbf{C}^{-1} \mathbf{p}
+
+        where :math:`\mathbf{C}` is the confusion matrix and :math:`\mathbf{p}`
+        is the observed prevalence vector.
+
+        Parameters
+        ----------
+        p : ndarray of shape (n_classes,)
+            Observed prevalence vector from test set.
+
+        Returns
+        -------
+        ndarray of shape (n_classes,)
+            Adjusted prevalence estimates :math:`\hat{\pi}_F`.
         """
         def objective(prevs_pred):
             return np.linalg.norm(self.CM @ prevs_pred - prevs_estim)
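The `objective` above gives the residual norm; the constraint handling falls outside this hunk. A self-contained version of the documented minimization, with the simplex constraints wired up via scipy (the constraint setup is assumed from the docstring, not from visible code):

import numpy as np
from scipy.optimize import minimize

def solve_optimization_sketch(CM, prevs_estim):
    n = CM.shape[1]
    result = minimize(
        lambda pi: np.linalg.norm(CM @ pi - prevs_estim),  # || C pi - p ||
        x0=np.full(n, 1.0 / n),                            # start at uniform prevalences
        bounds=[(0.0, 1.0)] * n,                           # pi >= 0
        constraints=[{"type": "eq", "fun": lambda pi: pi.sum() - 1.0}],  # sum to 1
        method="SLSQP",
    )
    return result.x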
@@ -262,7 +272,63 @@ class MatrixAdjustment(BaseAdjustCount):
 
 
 class FM(SoftLearnerQMixin, MatrixAdjustment):
-    """
+    r"""Friedman Method for quantification adjustment.
+
+    This class implements the Friedman (2015) matrix-based quantification
+    adjustment, which formulates the quantification problem as a constrained
+    optimization problem. It adjusts the estimated class prevalences by
+    minimizing the difference between predicted and expected prevalences,
+    subject to valid prevalence constraints.
+
+    The confusion matrix is computed by applying estimated posterior probabilities
+    over true labels, enabling accurate correction of prevalence estimates under
+    concept drift.
+
+    The confusion matrix is estimated for each class :math:`k` by applying
+    thresholding on posterior probabilities against prior prevalence,
+    as described in the FM algorithm. This enables the correction using
+    a quadratic optimization approach.
+
+    The method solves:
+
+    .. math::
+
+        \min_{\hat{\pi}_F} \| \mathbf{C} \hat{\pi}_F - \mathbf{p} \|^2
+
+    subject to constraints:
+
+    .. math::
+
+        \hat{\pi}_F \geq 0, \quad \sum_k \hat{\pi}_{F,k} = 1
+
+    where :math:`\mathbf{C}` is the confusion matrix and :math:`\mathbf{p}` is the
+    vector of predicted prevalences.
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        Base classifier with `fit` and `predict_proba` methods.
+        If None, a default estimator will be used.
+
+    Attributes
+    ----------
+    CM : ndarray of shape (n_classes, n_classes)
+        Confusion matrix used for correction.
+
+    Examples
+    --------
+    >>> from mlquantify.adjust_counting import FM
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> import numpy as np
+    >>> X = np.random.randn(50, 4)
+    >>> y = np.random.randint(0, 2, 50)
+    >>> fm = FM(learner=LogisticRegression())
+    >>> fm.fit(X, y)
+    >>> fm.predict(X)
+    {0: 0.5, 1: 0.5}
+
+    References
+    ----------
+    .. [1] Friedman, J. H., et al. (2015). "Detecting and Dealing with Concept Drift",
+       *Proceedings of the IEEE*, 103(11), 1522-1541.
+    """
     def __init__(self, learner=None):
         super().__init__(learner=learner, solver='optim')
 
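The docstring's "thresholding posteriors against prior prevalence" is compact; one plausible reading of that confusion-matrix construction, written out (an illustrative reconstruction: the actual `_compute_confusion_matrix` is not in this diff, and the row/column orientation is assumed):

import numpy as np

def fm_confusion_matrix_sketch(posteriors, y_true, priors):
    # C[i, k]: fraction of class-i training examples whose posterior for
    # class k exceeds class k's training prior.
    n = len(priors)
    C = np.zeros((n, n))
    for i in range(n):
        mask = y_true == i
        for k in range(n):
            C[i, k] = np.mean(posteriors[mask, k] > priors[k])
    return C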
@@ -274,7 +340,52 @@ class FM(SoftLearnerQMixin, MatrixAdjustment):
 
 
 class GAC(CrispLearnerQMixin, MatrixAdjustment):
-    """
+    r"""Generalized Adjusted Count method.
+
+    This class implements the Generalized Adjusted Count (GAC) algorithm for
+    quantification adjustment as described in Firat (2016) [1]_. The method
+    adjusts the estimated class prevalences by normalizing the confusion matrix
+    based on prevalence estimates, providing a correction for bias caused by
+    distribution differences between training and test data.
+
+    The confusion matrix is normalized by dividing each column by the prevalence
+    estimate of the corresponding class. For classes with zero estimated prevalence,
+    the diagonal element is set to 1 to avoid division by zero.
+
+    This normalization ensures that the matrix best reflects the classifier's
+    behavior relative to the estimated class distributions, improving quantification
+    accuracy.
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        Base classifier with `fit` and `predict` methods.
+
+    Attributes
+    ----------
+    CM : ndarray of shape (n_classes, n_classes)
+        Normalized confusion matrix used for adjusting predicted prevalences.
+    classes_ : ndarray
+        Array of class labels observed during training.
+
+    Examples
+    --------
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> from mlquantify.adjust_counting import GAC
+    >>> import numpy as np
+    >>> gac = GAC(learner=LogisticRegression())
+    >>> X = np.random.randn(50, 4)
+    >>> y = np.random.randint(0, 2, 50)
+    >>> gac.fit(X, y)
+    >>> gac.predict(X)
+    {0: 0.5, 1: 0.5}
+
+    References
+    ----------
+    .. [1] Firat, A. (2016). "Unified Framework for Quantification",
+       *Proceedings of AAAI Conference on Artificial Intelligence*, pp. 1-8.
+    """
     def __init__(self, learner=None):
         super().__init__(learner=learner, solver='linear')
 
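The zero-prevalence guard described above is the interesting detail; as a standalone sketch (names illustrative, not the package's code):

import numpy as np

def normalize_confusion_matrix(counts, prevalences):
    # Divide each column by its class's estimated prevalence; for a class with
    # zero estimated prevalence, pin the diagonal entry to 1 instead.
    CM = counts.astype(float).copy()
    for k, p in enumerate(prevalences):
        if p == 0:
            CM[:, k] = 0.0
            CM[k, k] = 1.0
        else:
            CM[:, k] /= p
    return CM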
@@ -289,7 +400,51 @@ class GAC(CrispLearnerQMixin, MatrixAdjustment):
 
 
 class GPAC(SoftLearnerQMixin, MatrixAdjustment):
-    """Probabilistic
+    r"""Probabilistic Generalized Adjusted Count (GPAC) method.
+
+    This class implements the probabilistic extension of the Generalized Adjusted
+    Count method as presented in Firat (2016) [1]_. The GPAC method normalizes the
+    confusion matrix by the estimated prevalences from posterior probabilities,
+    enabling a probabilistic correction of class prevalences.
+
+    The normalization divides each column of the confusion matrix by the estimated
+    prevalence of the corresponding class. If a class has zero estimated prevalence,
+    the diagonal element for that class is set to 1 to maintain matrix validity.
+
+    GPAC extends the GAC approach by using soft probabilistic predictions (posterior
+    probabilities) rather than crisp class labels, potentially improving
+    quantification accuracy when posterior probabilities are well calibrated.
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        Base classifier with `fit` and `predict_proba` methods.
+
+    Attributes
+    ----------
+    CM : ndarray of shape (n_classes, n_classes)
+        Normalized confusion matrix used for adjustment.
+    classes_ : ndarray
+        Array of class labels observed during training.
+
+    Examples
+    --------
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> from mlquantify.adjust_counting import GPAC
+    >>> import numpy as np
+    >>> gpac = GPAC(learner=LogisticRegression())
+    >>> X = np.random.randn(50, 4)
+    >>> y = np.random.randint(0, 2, 50)
+    >>> gpac.fit(X, y)
+    >>> gpac.predict(X)
+    {0: 0.5, 1: 0.5}
+
+    References
+    ----------
+    .. [1] Firat, A. (2016). "Unified Framework for Quantification",
+       *Proceedings of AAAI Conference on Artificial Intelligence*, pp. 1-8.
+    """
     def __init__(self, learner=None):
         super().__init__(learner=learner, solver='linear')
 
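GPAC's only departure from GAC is the use of soft counts. A sketch of the probabilistic confusion matrix this implies, with column k as the mean posterior vector over class-k training examples, before the same column normalization (assumed construction; the code is not in this hunk):

import numpy as np

def soft_confusion_matrix(posteriors, y_true, classes):
    CM = np.zeros((len(classes), len(classes)))
    for k, c in enumerate(classes):
        # Average the full posterior vectors of examples whose true class is c.
        CM[:, k] = posteriors[y_true == c].mean(axis=0)
    return CM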
@@ -304,41 +459,145 @@ class GPAC(SoftLearnerQMixin, MatrixAdjustment):
 
 
 class ACC(ThresholdAdjustment):
-    """Adjusted Count (ACC) — baseline threshold correction.
-
+    r"""Adjusted Count (ACC) — baseline threshold correction.
+
+    This method corrects the bias in class prevalence estimates caused by imperfect
+    classification accuracy, by adjusting the observed positive count using estimates
+    of the classifier's true positive rate (TPR) and false positive rate (FPR).
+
+    It uses a fixed classification threshold and applies the formula:
+
+    .. math::
+
+        p = \frac{p' - \text{FPR}}{\text{TPR} - \text{FPR}}
+
+    where :math:`p'` is the observed positive proportion from :class:`CC`.
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        A supervised learning model with `fit` and `predict_proba` methods.
+    threshold : float, default=0.5
+        Classification threshold in [0, 1] applied to the :class:`CC` output.
+
+    References
+    ----------
+    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate Classification",
+       *ECML*, pp. 564-575.
+    """
+
+    def get_best_threshold(self, thresholds, tprs, fprs):
         tpr = tprs[thresholds == self.threshold][0]
         fpr = fprs[thresholds == self.threshold][0]
         return (self.threshold, tpr, fpr)
 
 
 class X_method(ThresholdAdjustment):
-    """X method — threshold where
-
+    r"""X method — threshold where :math:`\text{TPR} + \text{FPR} = 1`.
+
+    This method selects the classification threshold at which the sum of the true positive
+    rate (TPR) and false positive rate (FPR) equals one; at that point the false negative
+    rate :math:`1 - \text{TPR}` equals the FPR, balancing the two error rates and
+    improving quantification.
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        A supervised learning model with `fit` and `predict_proba` methods.
+    threshold : float, default=0.5
+        Classification threshold in [0, 1] applied to the :class:`CC` output.
+
+    References
+    ----------
+    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate Classification",
+       *ECML*, pp. 564-575.
+    """
+    def get_best_threshold(self, thresholds, tprs, fprs):
         idx = np.argmin(np.abs(1 - (tprs + fprs)))
         return thresholds[idx], tprs[idx], fprs[idx]
 
 
 class MAX(ThresholdAdjustment):
-    r"""MAX method — threshold maximizing
-
+    r"""MAX method — threshold maximizing :math:`\text{TPR} - \text{FPR}`.
+
+    This method selects the threshold that maximizes the difference between the true positive
+    rate (TPR) and the false positive rate (FPR), effectively optimizing classification
+    performance for quantification.
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        A supervised learning model with `fit` and `predict_proba` methods.
+    threshold : float, default=0.5
+        Classification threshold in [0, 1] applied to the :class:`CC` output.
+
+    References
+    ----------
+    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate Classification",
+       *ECML*, pp. 564-575.
+    """
+    def get_best_threshold(self, thresholds, tprs, fprs):
         idx = np.argmax(np.abs(tprs - fprs))
         return thresholds[idx], tprs[idx], fprs[idx]
 
 
 class T50(ThresholdAdjustment):
-    r"""T50 — selects threshold where
-
+    r"""T50 — selects threshold where :math:`\text{TPR} = 0.5`.
+
+    This method chooses the classification threshold such that the true positive rate (TPR)
+    equals 0.5, avoiding regions with unreliable estimates at extreme thresholds.
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        A supervised learning model with `fit` and `predict_proba` methods.
+    threshold : float, default=0.5
+        Classification threshold in [0, 1] applied to the :class:`CC` output.
+
+    References
+    ----------
+    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate Classification",
+       *ECML*, pp. 564-575.
+    """
+    def get_best_threshold(self, thresholds, tprs, fprs):
         idx = np.argmin(np.abs(tprs - 0.5))
         return thresholds[idx], tprs[idx], fprs[idx]
 
 
 class MS(ThresholdAdjustment):
-    r"""Median Sweep (MS) — median prevalence across all thresholds.
+    r"""Median Sweep (MS) — median prevalence estimate across all thresholds.
+
+    This method computes class prevalence estimates at multiple classification thresholds,
+    using the adjusted count formula for each, then returns the median of these estimates,
+    reducing variance caused by any single threshold selection.
+
+    It thus leverages the strengths of bootstrap-like variance reduction without heavy
+    computation.
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        A supervised learning model with `fit` and `predict_proba` methods.
+    threshold : float, default=0.5
+        Classification threshold in [0, 1] applied to the :class:`CC` output.
+
+    References
+    ----------
+    .. [1] Forman, G. (2008). "Quantifying Counts and Costs via Classification",
+       *Data Mining and Knowledge Discovery*, 17(2), 164-206.
+    """
     def _adjust(self, predictions, train_y_scores, train_y_values):
         positive_scores = train_y_scores[:, 1]
 
-        thresholds, tprs, fprs = evaluate_thresholds(train_y_values, positive_scores
-        thresholds, tprs, fprs = self.
+        thresholds, tprs, fprs = evaluate_thresholds(train_y_values, positive_scores)
+        thresholds, tprs, fprs = self.get_best_threshold(thresholds, tprs, fprs)
 
         prevs = []
         for thr, tpr, fpr in zip(thresholds, tprs, fprs):

@@ -349,13 +608,37 @@ class MS(ThresholdAdjustment):
         prevalence = np.median(prevs)
         return np.asarray([1 - prevalence, prevalence])
 
-    def
+    def get_best_threshold(self, thresholds, tprs, fprs):
         return thresholds, tprs, fprs
 
 
 class MS2(MS):
-    r"""MS2 — Median Sweep variant
-
+    r"""MS2 — Median Sweep variant constraining :math:`|\text{TPR} - \text{FPR}| > 0.25`.
+
+    This variant of Median Sweep excludes thresholds where the absolute difference
+    between true positive rate (TPR) and false positive rate (FPR) is below 0.25,
+    improving stability by avoiding ambiguous threshold regions.
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        A supervised learning model with `fit` and `predict_proba` methods.
+    threshold : float, default=0.5
+        Classification threshold in [0, 1] applied to the :class:`CC` output.
+
+    Warnings
+    --------
+    - Warns if all TPR or FPR values are zero.
+    - Warns if no thresholds satisfy the constraint.
+
+    References
+    ----------
+    .. [1] Forman, G. (2008). "Quantifying Counts and Costs via Classification",
+       *Data Mining and Knowledge Discovery*, 17(2), 164-206.
+    """
+    def get_best_threshold(self, thresholds, tprs, fprs):
         if np.all(tprs == 0) or np.all(fprs == 0):
             warnings.warn("All TPR or FPR values are zero.")
         indices = np.where(np.abs(tprs - fprs) > 0.25)[0]
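Taken together, MS and MS2 differ only in which thresholds survive the sweep. A compact standalone sketch of the shared idea (p_observed_at and min_gap are illustrative names; min_gap=0.25 mimics MS2's filter):

import numpy as np

def median_sweep_sketch(p_observed_at, thresholds, tprs, fprs, min_gap=0.0):
    keep = np.abs(tprs - fprs) > min_gap   # MS2 drops ambiguous thresholds
    prevs = [
        np.clip((p_observed_at(t) - fpr) / (tpr - fpr), 0.0, 1.0)
        for t, tpr, fpr in zip(thresholds[keep], tprs[keep], fprs[keep])
        if not np.isclose(tpr, fpr)        # guard degenerate denominators
    ]
    prevalence = float(np.median(prevs))   # median across surviving thresholds
    return np.asarray([1.0 - prevalence, prevalence])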
@@ -174,7 +174,7 @@ class BaseAdjustCount(AggregationMixin, BaseQuantifier):
|
|
|
174
174
|
--------
|
|
175
175
|
>>> from mlquantify.base_count import BaseAdjustCount
|
|
176
176
|
>>> import numpy as np
|
|
177
|
-
|
|
177
|
+
>>> from sklearn.linear_model import LogisticRegression
|
|
178
178
|
>>> class ACC(CrispLearnerQMixin, BaseAdjustCount):
|
|
179
179
|
... def _adjust(self, preds, train_preds, y_train):
|
|
180
180
|
... tpr = np.mean(train_preds[y_train == 1])
|
|
@@ -182,8 +182,6 @@ class BaseAdjustCount(AggregationMixin, BaseQuantifier):
|
|
|
182
182
|
... p_obs = np.mean(preds)
|
|
183
183
|
... p_adj = (p_obs - fpr) / (tpr - fpr)
|
|
184
184
|
... return np.clip([1 - p_adj, p_adj], 0, 1)
|
|
185
|
-
|
|
186
|
-
>>> from sklearn.linear_model import LogisticRegression
|
|
187
185
|
>>> X = np.random.randn(100, 5)
|
|
188
186
|
>>> y = np.random.randint(0, 2, 100)
|
|
189
187
|
>>> q = ACC(learner=LogisticRegression())
|