mlquantify 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. mlquantify/__init__.py +0 -29
  2. mlquantify/adjust_counting/__init__.py +14 -0
  3. mlquantify/adjust_counting/_adjustment.py +365 -0
  4. mlquantify/adjust_counting/_base.py +247 -0
  5. mlquantify/adjust_counting/_counting.py +145 -0
  6. mlquantify/adjust_counting/_utils.py +114 -0
  7. mlquantify/base.py +117 -519
  8. mlquantify/base_aggregative.py +209 -0
  9. mlquantify/calibration.py +1 -0
  10. mlquantify/confidence.py +335 -0
  11. mlquantify/likelihood/__init__.py +5 -0
  12. mlquantify/likelihood/_base.py +161 -0
  13. mlquantify/likelihood/_classes.py +414 -0
  14. mlquantify/meta/__init__.py +1 -0
  15. mlquantify/meta/_classes.py +761 -0
  16. mlquantify/metrics/__init__.py +21 -0
  17. mlquantify/metrics/_oq.py +109 -0
  18. mlquantify/metrics/_rq.py +98 -0
  19. mlquantify/{evaluation/measures.py → metrics/_slq.py} +43 -28
  20. mlquantify/mixture/__init__.py +7 -0
  21. mlquantify/mixture/_base.py +153 -0
  22. mlquantify/mixture/_classes.py +400 -0
  23. mlquantify/mixture/_utils.py +112 -0
  24. mlquantify/model_selection/__init__.py +9 -0
  25. mlquantify/model_selection/_protocol.py +358 -0
  26. mlquantify/model_selection/_search.py +315 -0
  27. mlquantify/model_selection/_split.py +1 -0
  28. mlquantify/multiclass.py +350 -0
  29. mlquantify/neighbors/__init__.py +9 -0
  30. mlquantify/neighbors/_base.py +198 -0
  31. mlquantify/neighbors/_classes.py +159 -0
  32. mlquantify/{classification/methods.py → neighbors/_classification.py} +48 -66
  33. mlquantify/neighbors/_kde.py +270 -0
  34. mlquantify/neighbors/_utils.py +135 -0
  35. mlquantify/neural/__init__.py +1 -0
  36. mlquantify/utils/__init__.py +47 -2
  37. mlquantify/utils/_artificial.py +27 -0
  38. mlquantify/utils/_constraints.py +219 -0
  39. mlquantify/utils/_context.py +21 -0
  40. mlquantify/utils/_decorators.py +36 -0
  41. mlquantify/utils/_exceptions.py +12 -0
  42. mlquantify/utils/_get_scores.py +159 -0
  43. mlquantify/utils/_load.py +18 -0
  44. mlquantify/utils/_parallel.py +6 -0
  45. mlquantify/utils/_random.py +36 -0
  46. mlquantify/utils/_sampling.py +273 -0
  47. mlquantify/utils/_tags.py +44 -0
  48. mlquantify/utils/_validation.py +447 -0
  49. mlquantify/utils/prevalence.py +61 -0
  50. {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/METADATA +2 -1
  51. mlquantify-0.1.9.dist-info/RECORD +53 -0
  52. mlquantify/classification/__init__.py +0 -1
  53. mlquantify/evaluation/__init__.py +0 -14
  54. mlquantify/evaluation/protocol.py +0 -291
  55. mlquantify/methods/__init__.py +0 -37
  56. mlquantify/methods/aggregative.py +0 -1159
  57. mlquantify/methods/meta.py +0 -472
  58. mlquantify/methods/mixture_models.py +0 -1003
  59. mlquantify/methods/non_aggregative.py +0 -136
  60. mlquantify/methods/threshold_optimization.py +0 -869
  61. mlquantify/model_selection.py +0 -377
  62. mlquantify/plots.py +0 -367
  63. mlquantify/utils/general.py +0 -371
  64. mlquantify/utils/method.py +0 -449
  65. mlquantify-0.1.7.dist-info/RECORD +0 -22
  66. {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/WHEEL +0 -0
  67. {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,21 @@
1
+ from ._oq import (
2
+ NMD,
3
+ RNOD,
4
+ )
5
+
6
+ from ._rq import (
7
+ VSE,
8
+ CvM_L1,
9
+ )
10
+
11
+ from ._slq import (
12
+ AE,
13
+ SE,
14
+ MAE,
15
+ MSE,
16
+ KLD,
17
+ RAE,
18
+ NAE,
19
+ NRAE,
20
+ NKLD,
21
+ )
@@ -0,0 +1,109 @@
1
+ import numpy as np
2
+
3
def process_inputs(prev_pred, prev_real):
    """
    .. :noindex:

    Normalize prevalence inputs for internal use.

    Dict inputs contribute their values (in insertion order) and list
    inputs are converted to numpy arrays; the shorter vector is then
    zero-padded so both share the same length.
    """
    def _as_array(values):
        # Dicts and lists become arrays; anything else passes through.
        if isinstance(values, dict):
            return np.asarray(list(values.values()))
        if isinstance(values, list):
            return np.asarray(values)
        return values

    prev_real = _as_array(prev_real)
    prev_pred = _as_array(prev_pred)

    # Zero-pad the shorter vector so entries align class-wise.
    gap = len(prev_real) - len(prev_pred)
    if gap > 0:
        prev_pred = np.pad(prev_pred, (0, gap), constant_values=0)
    elif gap < 0:
        prev_real = np.pad(prev_real, (0, -gap), constant_values=0)

    return prev_real, prev_pred
28
+
29
+
30
def NMD(prev_pred, prev_real, distances=None):
    """
    Compute the Normalized Match Distance (NMD), also known as Earth Mover's Distance (EMD),
    for ordinal quantification evaluation.

    Parameters
    ----------
    prev_real : array-like or dict
        True prevalence values for each ordered class.

    prev_pred : array-like or dict
        Predicted prevalence values for each ordered class.

    distances : array-like of shape (n_classes-1,), optional
        Distance between consecutive classes (d(y_i, y_{i+1})).
        If None, all distances are assumed to be 1.

    Returns
    -------
    nmd : float
        Normalized Match Distance between predicted and true prevalences.
    """
    prev_real, prev_pred = process_inputs(prev_pred, prev_real)
    n_classes = len(prev_real)

    if distances is None:
        gaps = np.ones(n_classes - 1)
    else:
        gaps = np.asarray(distances, dtype=float)
        if len(gaps) != n_classes - 1:
            raise ValueError("Length of distances must be n_classes - 1.")

    # Match distance: cost of moving the cumulative prevalence mismatch
    # across each consecutive-class gap. The final cumulative difference
    # is dropped (it is zero for proper distributions), and dividing by
    # (n_classes - 1) normalizes the worst case to 1.
    mass_mismatch = np.abs(np.cumsum(prev_pred - prev_real))[:-1]
    return float(np.sum(gaps * mass_mismatch) / (n_classes - 1))
66
+
67
+
68
def RNOD(prev_pred, prev_real, distances=None):
    """
    Compute the Root Normalised Order-aware Divergence (RNOD) for ordinal quantification evaluation.

    Parameters
    ----------
    prev_real : array-like or dict
        True prevalence values for each ordered class.

    prev_pred : array-like or dict
        Predicted prevalence values for each ordered class.

    distances : 2D array-like of shape (n_classes, n_classes), optional
        Distance matrix between classes (d(y_i, y_j)).
        If None, assumes d(y_i, y_j) = |i - j|.

    Returns
    -------
    rnod : float
        Root Normalised Order-aware Divergence between predicted and true prevalences.
    """
    prev_real, prev_pred = process_inputs(prev_pred, prev_real)
    n_classes = len(prev_real)
    # Y* = classes with non-zero true prevalence (Sakai's support set).
    Y_star = np.where(prev_real > 0)[0]

    # default distance: |i - j|
    if distances is None:
        idx = np.arange(n_classes)
        distances = np.abs(idx[:, None] - idx[None, :])
    else:
        distances = np.asarray(distances, dtype=float)
        if distances.shape != (n_classes, n_classes):
            raise ValueError("Distance matrix must be of shape (n_classes, n_classes).")

    diff_sq = (prev_real - prev_pred) ** 2
    # Vectorized form of sum_{i in Y*} sum_j d(y_j, y_i) * (p_j - p_hat_j)^2,
    # replacing the O(n^2) Python double loop with a single numpy reduction.
    total = float(np.sum(distances[:, Y_star] * diff_sq[:, None]))

    denom = len(Y_star) * (n_classes - 1)
    rnod = np.sqrt(total / denom)
    return float(rnod)
@@ -0,0 +1,98 @@
1
+ import numpy as np
2
+ from scipy.stats import cumfreq
3
+ from mlquantify.metrics._slq import SE
4
+
5
+
6
def process_inputs(prev_pred, prev_real):
    """
    .. :noindex:

    Normalize value inputs for internal use.

    Dict inputs contribute their values (in insertion order) and list
    inputs are converted to numpy arrays; the shorter vector is then
    zero-padded so both share the same length.
    """
    def _coerce(values):
        # Dicts and lists become arrays; anything else passes through.
        if isinstance(values, dict):
            return np.asarray(list(values.values()))
        if isinstance(values, list):
            return np.asarray(values)
        return values

    prev_real = _coerce(prev_real)
    prev_pred = _coerce(prev_pred)

    # Zero-pad the shorter vector so entries align element-wise.
    gap = len(prev_real) - len(prev_pred)
    if gap > 0:
        prev_pred = np.pad(prev_pred, (0, gap), constant_values=0)
    elif gap < 0:
        prev_real = np.pad(prev_real, (0, -gap), constant_values=0)

    return prev_real, prev_pred
31
+
32
+
33
def VSE(prev_pred, prev_real, train_values):
    """
    Compute the Variance-normalised Squared Error (VSE).

    Parameters
    ----------
    prev_real : array-like
        True regression values (from test set).

    prev_pred : array-like
        Predicted regression values (from test set).

    train_values : array-like
        True regression values from training set, used to compute variance normalization.

    Returns
    -------
    verror : float
        Variance-normalised squared error.
    """
    prev_real, prev_pred = process_inputs(prev_pred, prev_real)

    if isinstance(train_values, dict):
        train_values = np.asarray(list(train_values.values()))

    # Sample variance (ddof=1) of the training targets is the normalizer.
    variance = np.var(train_values, ddof=1)
    if variance == 0:
        # Degenerate (constant) training distribution: VSE is undefined.
        return np.nan

    return SE(prev_pred, prev_real) / variance
60
+
61
+
62
def CvM_L1(prev_pred, prev_real, n_bins=100):
    """
    Compute the L1 version of the Cramér–von Mises statistic (Xiao et al., 2006)
    between two cumulative distributions, as suggested by Bella et al. (2014).

    Parameters
    ----------
    prev_real : array-like
        True regression values.

    prev_pred : array-like
        Predicted regression values.

    n_bins : int, optional
        Number of bins used to estimate cumulative distributions (default=100).

    Returns
    -------
    statistic : float
        L1 Cramér–von Mises distance between cumulative distributions.
    """
    prev_real, prev_pred = process_inputs(prev_pred, prev_real)

    # Shared support so both empirical CDFs are estimated over identical bins.
    lo = min(np.min(prev_real), np.min(prev_pred))
    hi = max(np.max(prev_real), np.max(prev_pred))

    counts_real = cumfreq(prev_real, numbins=n_bins, defaultreallimits=(lo, hi)).cumcount
    counts_pred = cumfreq(prev_pred, numbins=n_bins, defaultreallimits=(lo, hi)).cumcount

    # Normalize cumulative counts into CDFs on [0, 1].
    ecdf_real = counts_real / counts_real[-1]
    ecdf_pred = counts_pred / counts_pred[-1]

    # Mean absolute gap between the CDFs approximates the L1 integral.
    return float(np.mean(np.abs(ecdf_real - ecdf_pred)))
@@ -1,6 +1,6 @@
1
1
  import numpy as np
2
2
 
3
- def process_inputs(prev_real, prev_pred):
3
+ def process_inputs(prev_pred, prev_real):
4
4
  """
5
5
  .. :noindex:
6
6
 
@@ -10,10 +10,26 @@ def process_inputs(prev_real, prev_pred):
10
10
  prev_real = np.asarray(list(prev_real.values()))
11
11
  if isinstance(prev_pred, dict):
12
12
  prev_pred = np.asarray(list(prev_pred.values()))
13
+ if isinstance(prev_real, list):
14
+ print(prev_real)
15
+ prev_real = np.asarray(prev_real)
16
+ if isinstance(prev_pred, list):
17
+ print(prev_pred)
18
+ prev_pred = np.asarray(prev_pred)
19
+
20
+ # Pad with zeros if lengths differ
21
+ len_real = len(prev_real)
22
+ len_pred = len(prev_pred)
23
+
24
+ if len_real > len_pred:
25
+ prev_pred = np.pad(prev_pred, (0, len_real - len_pred), constant_values=0)
26
+ elif len_pred > len_real:
27
+ prev_real = np.pad(prev_real, (0, len_pred - len_real), constant_values=0)
28
+
13
29
  return prev_real, prev_pred
14
30
 
15
31
 
16
- def absolute_error(prev_real, prev_pred):
32
+ def AE(prev_pred, prev_real):
17
33
  """
18
34
  Compute the absolute error for each class or a dictionary of errors if input is a dictionary.
19
35
 
@@ -32,15 +48,15 @@ def absolute_error(prev_real, prev_pred):
32
48
  """
33
49
  if isinstance(prev_real, dict):
34
50
  classes = prev_real.keys()
35
- prev_real, prev_pred = process_inputs(prev_real, prev_pred)
51
+ prev_real, prev_pred = process_inputs(prev_pred, prev_real)
36
52
  abs_errors = np.abs(prev_pred - prev_real)
37
53
  return {class_: float(err) for class_, err in zip(classes, abs_errors)}
38
- prev_real, prev_pred = process_inputs(prev_real, prev_pred)
54
+ prev_real, prev_pred = process_inputs(prev_pred, prev_real)
39
55
  return np.abs(prev_pred - prev_real)
40
56
 
41
57
 
42
58
 
43
- def mean_absolute_error(prev_real, prev_pred):
59
+ def MAE(prev_pred, prev_real):
44
60
  """
45
61
  Compute the mean absolute error between the real and predicted prevalences.
46
62
 
@@ -57,11 +73,11 @@ def mean_absolute_error(prev_real, prev_pred):
57
73
  error : float
58
74
  Mean absolute error across all classes.
59
75
  """
60
- prev_real, prev_pred = process_inputs(prev_real, prev_pred)
61
- return np.mean(absolute_error(prev_real, prev_pred))
76
+ prev_real, prev_pred = process_inputs(prev_pred, prev_real)
77
+ return np.mean(AE(prev_pred, prev_real))
62
78
 
63
79
 
64
- def kullback_leibler_divergence(prev_real, prev_pred):
80
+ def KLD(prev_pred, prev_real):
65
81
  """
66
82
  Compute the Kullback-Leibler divergence between the real and predicted prevalences.
67
83
 
@@ -78,11 +94,11 @@ def kullback_leibler_divergence(prev_real, prev_pred):
78
94
  divergence : array-like of shape (n_classes,)
79
95
  Kullback-Leibler divergence for each class.
80
96
  """
81
- prev_real, prev_pred = process_inputs(prev_real, prev_pred)
97
+ prev_real, prev_pred = process_inputs(prev_pred, prev_real)
82
98
  return prev_real * np.abs(np.log(prev_real / prev_pred))
83
99
 
84
100
 
85
- def squared_error(prev_real, prev_pred):
101
+ def SE(prev_pred, prev_real):
86
102
  """
87
103
  Compute the mean squared error between the real and predicted prevalences.
88
104
 
@@ -99,13 +115,12 @@ def squared_error(prev_real, prev_pred):
99
115
  error : float
100
116
  Mean squared error across all classes.
101
117
  """
102
- prev_real, prev_pred = process_inputs(prev_real, prev_pred)
118
+ prev_real, prev_pred = process_inputs(prev_pred, prev_real)
103
119
  return np.mean((prev_pred - prev_real) ** 2, axis=-1)
104
120
 
105
121
 
106
- def mean_squared_error(prev_real, prev_pred):
107
- """
108
- Compute the mean squared error across all classes.
122
+ def MSE(prev_pred, prev_real):
123
+ """ Mean Squared Error
109
124
 
110
125
  Parameters
111
126
  ----------
@@ -120,11 +135,11 @@ def mean_squared_error(prev_real, prev_pred):
120
135
  mse : float
121
136
  Mean squared error across all classes.
122
137
  """
123
- prev_real, prev_pred = process_inputs(prev_real, prev_pred)
124
- return squared_error(prev_real, prev_pred).mean()
138
+ prev_real, prev_pred = process_inputs(prev_pred, prev_real)
139
+ return SE(prev_pred, prev_real).mean()
125
140
 
126
141
 
127
- def normalized_absolute_error(prev_real, prev_pred):
142
+ def NAE(prev_pred, prev_real):
128
143
  """
129
144
  Compute the normalized absolute error between the real and predicted prevalences.
130
145
 
@@ -141,13 +156,13 @@ def normalized_absolute_error(prev_real, prev_pred):
141
156
  error : float
142
157
  Normalized absolute error across all classes.
143
158
  """
144
- prev_real, prev_pred = process_inputs(prev_real, prev_pred)
145
- abs_error = mean_absolute_error(prev_real, prev_pred)
159
+ prev_real, prev_pred = process_inputs(prev_pred, prev_real)
160
+ abs_error = MAE(prev_pred, prev_real)
146
161
  z_abs_error = 2 * (1 - np.min(prev_real))
147
162
  return abs_error / z_abs_error
148
163
 
149
164
 
150
- def normalized_kullback_leibler_divergence(prev_real, prev_pred):
165
+ def NKLD(prev_pred, prev_real):
151
166
  """
152
167
  Compute the normalized Kullback-Leibler divergence between the real and predicted prevalences.
153
168
 
@@ -164,13 +179,13 @@ def normalized_kullback_leibler_divergence(prev_real, prev_pred):
164
179
  divergence : float
165
180
  Normalized Kullback-Leibler divergence across all classes.
166
181
  """
167
- prev_real, prev_pred = process_inputs(prev_real, prev_pred)
168
- kl_divergence = kullback_leibler_divergence(prev_real, prev_pred)
182
+ prev_real, prev_pred = process_inputs(prev_pred, prev_real)
183
+ kl_divergence = KLD(prev_pred, prev_real)
169
184
  euler = np.exp(kl_divergence)
170
185
  return 2 * (euler / (euler + 1)) - 1
171
186
 
172
187
 
173
- def relative_absolute_error(prev_real, prev_pred):
188
+ def RAE(prev_pred, prev_real):
174
189
  """
175
190
  Compute the relative absolute error between the real and predicted prevalences.
176
191
 
@@ -187,11 +202,11 @@ def relative_absolute_error(prev_real, prev_pred):
187
202
  error : float
188
203
  Relative absolute error across all classes.
189
204
  """
190
- prev_real, prev_pred = process_inputs(prev_real, prev_pred)
191
- return (mean_absolute_error(prev_real, prev_pred) / prev_real).mean(axis=-1)
205
+ prev_real, prev_pred = process_inputs(prev_pred, prev_real)
206
+ return (MAE(prev_pred, prev_real) / prev_real).mean(axis=-1)
192
207
 
193
208
 
194
- def normalized_relative_absolute_error(prev_real, prev_pred):
209
+ def NRAE(prev_pred, prev_real):
195
210
  """
196
211
  Compute the normalized relative absolute error between the real and predicted prevalences.
197
212
 
@@ -208,8 +223,8 @@ def normalized_relative_absolute_error(prev_real, prev_pred):
208
223
  error : float
209
224
  Normalized relative absolute error across all classes.
210
225
  """
211
- prev_real, prev_pred = process_inputs(prev_real, prev_pred)
212
- relative = relative_absolute_error(prev_real, prev_pred)
226
+ prev_real, prev_pred = process_inputs(prev_pred, prev_real)
227
+ relative = RAE(prev_pred, prev_real)
213
228
  z_relative = (len(prev_real) - 1 + ((1 - np.min(prev_real)) / np.min(prev_real))) / len(prev_real)
214
229
  return relative / z_relative
215
230
 
@@ -0,0 +1,7 @@
1
+ from ._classes import (
2
+ HDy,
3
+ DyS,
4
+ SMM,
5
+ SORD,
6
+ HDx
7
+ )
@@ -0,0 +1,153 @@
1
+ import numpy as np
2
+ from abc import abstractmethod
3
+
4
+ from mlquantify.base import BaseQuantifier
5
+
6
+ from mlquantify.mixture._utils import sqEuclidean
7
+ from mlquantify.utils._decorators import _fit_context
8
+ from mlquantify.utils._validation import validate_y, validate_data
9
+ from mlquantify.multiclass import define_binary
10
+ from mlquantify.mixture._utils import (
11
+ hellinger,
12
+ topsoe,
13
+ probsymm,
14
+ sqEuclidean
15
+ )
16
+
17
class BaseMixture(BaseQuantifier):
    """
    Base class for mixture-model quantifiers.

    Mixture Models (MM) for quantification estimate class prevalences by modeling
    the test set score distribution as a mixture of the individual class score
    distributions learned from training data. The goal is to find the mixture
    parameters, i.e., class proportions, that best represent the observed test data.

    Mixture-based quantifiers approximate class-conditional distributions typically
    via histograms or empirical distributions of classifier scores, treating the test
    distribution as a weighted sum (mixture) of these. Estimation proceeds by finding
    the mixture weights that minimize a distance or divergence measure between the
    observed test distribution and the mixture of training class distributions.

    Common distance measures used in evaluating mixtures include:
    - Hellinger distance
    - Topsoe distance (a symmetric Jensen-Shannon type divergence)
    - Probabilistic symmetric divergence
    - Squared Euclidean distance

    These distances compare probability distributions representing class-conditioned
    scores or histograms, and the choice of distance can affect quantification accuracy
    and robustness.

    The DyS framework (Maletzke et al. 2019) generalizes mixture models by introducing
    a variety of distribution dissimilarity measures, enabling flexible and effective
    quantification methods.


    Notes
    -----
    Mixture models are defined for only binary quantification problems. For multi-class
    problems, a one-vs-rest strategy is applied, training a binary mixture model for
    each class against the rest.

    Subclasses must implement ``best_mixture`` and are expected to provide the
    ``_fit`` and ``_predict`` hooks invoked by :meth:`fit` and :meth:`predict`.

    Parameters
    ----------
    None directly; subclasses implement fitting and prediction logic.

    Attributes
    ----------
    _precomputed : bool
        Indicates if preprocess computations on data have been performed.
    distances : Any
        Stores intermediate or final distance computations used in model selection.
    classes_ : ndarray of shape (n_classes,)
        Unique class labels seen during training (set by :meth:`fit`).

    Methods
    -------
    fit(X, y, *args, **kwargs):
        Fit the mixture quantifier with training data. Validates input and
        calls internal fitting procedure.
    predict(X, *args, **kwargs):
        Predict class prevalences for input data by leveraging best mixture parameters.
    get_best_distance(*args, **kwargs):
        Return the best distance measure and associated mixture parameters found.
    best_mixture(X):
        Abstract method to determine optimal mixture parameters on input data.
    get_distance(dist_train, dist_test, measure="hellinger"):
        Compute a specified distance between two distributions.

    References
    ----------
    [1] Forman, G. (2005). *Counting Positives Accurately Despite Inaccurate Classification.* ECML, pp. 564-575.
    [2] Forman, G. (2008). *Quantifying Counts and Costs via Classification.* Data Mining and Knowledge Discovery, 17(2), 164-206.
    [3] Maletzke, A., dos Reis, D., Cherman, E., & Batista, G. (2019). *DyS: A Framework for Mixture Models in Quantification.* AAAI Conference on Artificial Intelligence.
    [4] Esuli, A., Moreo, A., & Sebastiani, F. (2023). *Learning to Quantify.* Springer.

    Examples
    --------
    >>> import numpy as np
    >>> class MyMixture(BaseMixture):
    ...     def best_mixture(self, X):
    ...         # Implementation example: estimate mixture weights minimizing Hellinger distance
    ...         pass
    >>> X_train = np.random.rand(100, 10)
    >>> y_train = np.random.randint(0, 2, size=100)
    >>> quantifier = MyMixture()
    >>> quantifier.fit(X_train, y_train)
    >>> prevalences = quantifier.predict(X_train)
    """

    def __init__(self):
        # No distance computations cached yet; subclasses flip/populate these.
        self._precomputed = False
        self.distances = None

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, *args, **kwargs):
        """Fit the quantifier using the provided data and learner.

        Validates ``X``/``y``, records the unique labels in ``classes_``,
        then delegates to the subclass-implemented ``_fit``. Returns ``self``.
        """
        X, y = validate_data(self, X, y)
        validate_y(self, y)
        self.classes_ = np.unique(y)

        self._fit(X, y, *args, **kwargs)
        return self

    def predict(self, X, *args, **kwargs):
        """Predict class prevalences for the given data.

        Validates ``X`` and delegates to the subclass-implemented ``_predict``.
        """
        X = validate_data(self, X)
        return self._predict(X, *args, **kwargs)

    def get_best_distance(self, *args, **kwargs):
        # Convenience accessor: forwards to best_mixture and discards the
        # mixture parameters, returning only the best distance value.
        _, best_distance = self.best_mixture(*args, **kwargs)
        return best_distance

    @abstractmethod
    def best_mixture(self, X):
        """Determine the best mixture parameters for the given data.

        Must return a 2-tuple whose second element is the best distance
        (see :meth:`get_best_distance`).
        """
        pass

    @classmethod
    def get_distance(cls, dist_train, dist_test, measure="hellinger"):
        """
        Compute distance between two distributions.

        Parameters
        ----------
        dist_train, dist_test : array-like
            Two distributions of equal length; must not be (near-)zero vectors.
        measure : {"topsoe", "probsymm", "hellinger", "euclidean"}, default="hellinger"
            Which distance function from ``mlquantify.mixture._utils`` to apply.

        Returns
        -------
        float
            The computed distance.

        Raises
        ------
        ValueError
            If either vector sums to (near) zero, lengths differ, or
            ``measure`` is not one of the supported names.
        """

        if np.sum(dist_train) < 1e-20 or np.sum(dist_test) < 1e-20:
            raise ValueError("One or both vectors are zero (empty)...")
        if len(dist_train) != len(dist_test):
            raise ValueError("Arrays must have the same length.")

        # Clamp away exact zeros so divergence measures that take logs or
        # ratios stay finite.
        dist_train = np.maximum(dist_train, 1e-20)
        dist_test = np.maximum(dist_test, 1e-20)

        if measure == "topsoe":
            return topsoe(dist_train, dist_test)
        elif measure == "probsymm":
            return probsymm(dist_train, dist_test)
        elif measure == "hellinger":
            return hellinger(dist_train, dist_test)
        elif measure == "euclidean":
            return sqEuclidean(dist_train, dist_test)
        else:
            raise ValueError(f"Invalid measure: {measure}")
153
+