mlquantify 0.1.18__py3-none-any.whl → 0.1.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/mlquantify/meta/_classes.py
+++ b/mlquantify/meta/_classes.py
@@ -696,59 +696,67 @@ class QuaDapt(MetaquantifierMixin, BaseQuantifier):
 
     def aggregate(self, predictions, train_y_values):
 
-        pos_predictions = predictions[:, 1]
-        m = self._get_best_merging_factor(pos_predictions)
+        prevalence, _, _ = self.best_mixture(predictions)
+        prevalences = np.asarray([1-prevalence, prevalence])
 
         self.classes = self.classes if hasattr(self, 'classes') else np.unique(train_y_values)
-
-        moss_scores, moss_labels = self.MoSS(1000, 0.5, m)
-
-        prevalences = self.quantifier.aggregate(predictions,
-                                                moss_scores,
-                                                moss_labels)
 
-        prevalences = {self.classes[i]: v for i, v in enumerate(prevalences.values())}
+        prevalences = validate_prevalences(self, prevalences, self.classes)
 
         return prevalences
 
 
-    def _get_best_merging_factor(self, predictions):
+    def best_mixture(self, predictions):
+        predictions = predictions[:, 1]
 
         MF = np.atleast_1d(np.round(self.merging_factors, 2)).astype(float)
 
         distances = []
+        alphas = []
 
         for mf in MF:
-            scores, labels = self.MoSS(1000, 0.5, mf)
+            scores, labels = self.MoSS(n=1000, alpha=0.5, merging_factor=mf)
             pos_scores = scores[labels == 1][:, 1]
             neg_scores = scores[labels == 0][:, 1]
+
+            if self.measure in ["hellinger", "topsoe", "probsymm"]:
+                method = DyS(measure=self.measure)
+            elif self.measure == "sord":
+                method = SORD()
 
-            best_distance = self._get_best_distance(predictions, pos_scores, neg_scores)
+            alpha, distance = method.best_mixture(predictions, pos_scores, neg_scores)
 
-            distances.append(best_distance)
+            distances.append(distance)
+            alphas.append(alpha)
 
         best_m = MF[np.argmin(distances)]
-        return best_m
+        best_alpha = alphas[np.argmin(distances)]
+        best_distance = np.min(distances)
+        return best_alpha, best_distance, best_m
 
-    def _get_best_distance(self, predictions, pos_scores, neg_scores):
-
-        if self.measure in ["hellinger", "topsoe", "probsymm"]:
-            method = DyS(measure=self.measure)
-        elif self.measure == "sord":
-            method = SORD()
+    def get_best_distance(self, predictions):
 
-        best_distance = method.get_best_distance(predictions, pos_scores, neg_scores)
-        return best_distance
+        _, distance, _ = self.get_best_merging_factor(predictions)
+
+        return distance
 
 
     @classmethod
-    def MoSS(cls, n, alpha, m):
+    def MoSS(cls, n, alpha, merging_factor):
         r"""Model for Score Simulation
 
-        MoSS has three key parameters:
-        (I) the number of observations `n`;
-        (II) the class proportion `\alpha`, which defines the prevalence of the positive class;
-        (III) the merging factor :math:`m`, which controls the overlap between positive and negative score distributions
-        (where :math:`m=0` represents easily separable classes and :math:`m=1` represents highly overlapping ones).
+        Parameters
+        ----------
+        n : int
+            Number of observations.
+        alpha : float
+            Class proportion, which defines the prevalence of the positive class.
+        m : float
+            Merging factor, which controls the overlap between positive and negative score distributions.
+
+        Returns
+        -------
+        tuple
+            Tuple of score and label arrays.
 
         .. math::
 
@@ -776,9 +784,9 @@ class QuaDapt(MetaquantifierMixin, BaseQuantifier):
         n_neg = n - n_pos
 
         # Positive scores
-        p_score = np.random.uniform(size=n_pos) ** m
+        p_score = np.random.uniform(size=n_pos) ** merging_factor
         # Negative scores
-        n_score = 1 - (np.random.uniform(size=n_neg) ** m)
+        n_score = 1 - (np.random.uniform(size=n_neg) ** merging_factor)
 
         # Build the feature arrays (two identical columns)
         moss = np.column_stack(
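
The generator behind this hunk is simple enough to sketch standalone: positive scores are uniform draws raised to the merging factor and negative scores are the mirror image, so a factor near 0 pushes the two classes to opposite ends of [0, 1] while a factor of 1 leaves both uniform and fully overlapping. A self-contained approximation (the packaged classmethod additionally stacks the scores into two identical feature columns, omitted here):

    import numpy as np

    def moss_sketch(n, alpha, merging_factor, seed=None):
        # n observations with positive-class prevalence alpha
        rng = np.random.default_rng(seed)
        n_pos = int(n * alpha)
        n_neg = n - n_pos
        p_score = rng.uniform(size=n_pos) ** merging_factor      # piles up near 1 for small factors
        n_score = 1 - rng.uniform(size=n_neg) ** merging_factor  # piles up near 0 for small factors
        scores = np.concatenate([p_score, n_score])
        labels = np.concatenate([np.ones(n_pos, dtype=int), np.zeros(n_neg, dtype=int)])
        return scores, labels
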
--- a/mlquantify/model_selection/_protocol.py
+++ b/mlquantify/model_selection/_protocol.py
@@ -8,6 +8,7 @@ from mlquantify.utils._sampling import (
     simplex_uniform_kraemer,
     simplex_uniform_sampling,
 )
+from mlquantify.utils._random import check_random_state
 from mlquantify.utils._validation import validate_data
 from abc import ABC, abstractmethod
 from logging import warning
@@ -170,6 +171,8 @@ class APP(BaseProtocol):
     def _iter_indices(self, X: np.ndarray, y: np.ndarray):
 
         n_dim = len(np.unique(y))
+
+        rng = check_random_state(self.random_state)
 
         for batch_size in self.batch_size:
             prevalences = simplex_grid_sampling(n_dim=n_dim,
@@ -178,9 +181,8 @@
                                                 min_val=self.min_prev,
                                                 max_val=self.max_prev)
             for prev in prevalences:
-                indexes = get_indexes_with_prevalence(y, prev, batch_size)
+                indexes = get_indexes_with_prevalence(y, prev, batch_size, random_state=rng)
                 yield indexes
-
 
 
 
@@ -221,10 +223,10 @@ class NPP(BaseProtocol):
         self.repeats = repeats
 
     def _iter_indices(self, X: np.ndarray, y: np.ndarray):
-
+        rng = check_random_state(self.random_state)
         for _ in range(self.n_samples):
             for batch_size in self.batch_size:
-                idx = np.random.choice(X.shape[0], batch_size, replace=True)
+                idx = rng.choice(X.shape[0], batch_size, replace=True)
                 for _ in range(self.repeats):
                     yield idx
 
@@ -289,6 +291,8 @@ class UPP(BaseProtocol):
     def _iter_indices(self, X: np.ndarray, y: np.ndarray):
 
         n_dim = len(np.unique(y))
+
+        rng = check_random_state(self.random_state)
 
         for batch_size in self.batch_size:
             if self.algorithm == 'kraemer':
@@ -296,16 +300,17 @@
                                                       n_prev=self.n_prevalences,
                                                       n_iter=self.repeats,
                                                       min_val=self.min_prev,
-                                                      max_val=self.max_prev)
+                                                      max_val=self.max_prev,
+                                                      random_state=rng)
             elif self.algorithm == 'uniform':
                 prevalences = simplex_uniform_sampling(n_dim=n_dim,
                                                        n_prev=self.n_prevalences,
                                                        n_iter=self.repeats,
                                                        min_val=self.min_prev,
-                                                       max_val=self.max_prev)
-
+                                                       max_val=self.max_prev,
+                                                       random_state=rng)
             for prev in prevalences:
-                indexes = get_indexes_with_prevalence(y, prev, batch_size)
+                indexes = get_indexes_with_prevalence(y, prev, batch_size, random_state=rng)
                 yield indexes
 
 
@@ -347,12 +352,12 @@ class PPP(BaseProtocol):
                          repeats=repeats)
 
     def _iter_indices(self, X: np.ndarray, y: np.ndarray):
-
+        rng = check_random_state(self.random_state)
         for batch_size in self.batch_size:
            for prev in self.prevalences:
                if isinstance(prev, float):
                    prev = [1-prev, prev]
 
-                indexes = get_indexes_with_prevalence(y, prev, batch_size)
+                indexes = get_indexes_with_prevalence(y, prev, batch_size, random_state=rng)
                 yield indexes
 
--- a/mlquantify/utils/_sampling.py
+++ b/mlquantify/utils/_sampling.py
@@ -1,8 +1,9 @@
 import numpy as np
+from mlquantify.utils import check_random_state
 import itertools
 
 
-def get_indexes_with_prevalence(y, prevalence: list, sample_size:int):
+def get_indexes_with_prevalence(y, prevalence: list, sample_size:int, random_state: int = None):
     """
     Get indexes for a stratified sample based on the prevalence of each class.
 
@@ -23,6 +24,7 @@ def get_indexes_with_prevalence(y, prevalence: list, sample_size:int):
         List of indexes for the stratified sample.
     """
     classes = np.unique(y)
+    rng = check_random_state(random_state)
 
     # Ensure the sum of prevalences is 1
     assert np.isclose(sum(prevalence), 1), "The sum of prevalences must be 1"
@@ -43,12 +45,12 @@ def get_indexes_with_prevalence(y, prevalence: list, sample_size:int):
         class_indexes = np.where(y == class_)[0]
 
         # Sample the indexes for the current class
-        sampled_class_indexes = np.random.choice(class_indexes, size=num_samples, replace=True)
+        sampled_class_indexes = rng.choice(class_indexes, size=num_samples, replace=True)
 
         sampled_indexes.extend(sampled_class_indexes)
         total_sampled += num_samples
 
-    np.random.shuffle(sampled_indexes) # Shuffle after collecting all indexes
+    rng.shuffle(sampled_indexes) # Shuffle after collecting all indexes
 
     return sampled_indexes
 
@@ -59,7 +61,8 @@ def simplex_uniform_kraemer(n_dim: int,
                            n_iter: int,
                            min_val: float = 0.0,
                            max_val: float = 1.0,
-                            max_tries: int = 1000) -> np.ndarray:
+                            max_tries: int = 1000,
+                            random_state: int = None) -> np.ndarray:
     """
     Generates n_prev prevalence vectors of n_dim classes uniformly
     distributed on the simplex, with optional lower and upper bounds.
@@ -91,28 +94,25 @@
     if min_val * n_dim > 1 or max_val * n_dim < 1:
         raise ValueError("Invalid bounds: they make it impossible to sum to 1.")
 
+    rng = check_random_state(random_state)
+
     effective_simplex_size = 1 - n_dim * min_val
     prevs = []
 
-    # Sample in blocks until n_prev valid vectors are collected
     tries = 0
-    batch_size = max(n_prev, 1000) # Generate in large blocks for efficiency
+    batch_size = n_prev
 
     while len(prevs) < n_prev and tries < max_tries:
         tries += 1
-
-        # Generate uniform points in the reduced simplex
-        u = np.random.uniform(0, 1, (batch_size, n_dim - 1))
+
+        u = rng.uniform(0, 1, (batch_size, n_dim - 1))
         u.sort(axis=1)
         simplex = np.diff(np.concatenate([np.zeros((batch_size, 1)), u, np.ones((batch_size, 1))], axis=1), axis=1)
 
-        # Scale to [min_val, max_val]
         scaled = min_val + simplex * effective_simplex_size
 
-        # Normalize to guarantee the sum is 1
         scaled /= scaled.sum(axis=1, keepdims=True)
 
-        # Keep only the valid vectors
         mask = np.all((scaled >= min_val) & (scaled <= max_val), axis=1)
         valid = scaled[mask]
@@ -122,11 +122,13 @@
     if not prevs:
         raise RuntimeError("No valid prevalences found with given constraints. Try adjusting min_val/max_val.")
 
-    if n_iter > 1:
-        prevs = np.tile(prevs, (n_iter, 1))
-
     result = np.vstack(prevs)
-    return result[:n_prev]
+    result = result[:n_prev]
+
+    if n_iter > 1:
+        result = np.repeat(result, n_iter, axis=0)
+
+    return result
 
 
 
@@ -135,7 +137,7 @@ def simplex_grid_sampling(
     n_prev: int,
     n_iter: int,
     min_val: float,
-    max_val: float
+    max_val: float,
 ) -> np.ndarray:
     """
     Efficiently generates artificial prevalence vectors that sum to 1
@@ -181,7 +183,7 @@
 
     # Repeat if necessary
     if n_iter > 1:
-        prevs = np.tile(prevs, (n_iter, 1))
+        prevs = np.repeat(prevs, n_iter, axis=0)
 
     return prevs
 
@@ -193,7 +195,8 @@ def simplex_uniform_sampling(
     n_prev: int,
     n_iter: int,
     min_val: float,
-    max_val: float
+    max_val: float,
+    random_state: int = None
 ) -> np.ndarray:
     """
     Generates uniformly distributed prevalence vectors within the simplex,
@@ -265,9 +268,8 @@
     np.ndarray
         Array containing indices for a bootstrap sample.
     """
-    if random_state is not None:
-        np.random.seed(random_state)
+    rng = check_random_state(random_state)
 
     for _ in range(n_bootstraps):
-        indices = np.random.choice(n_samples, size=batch_size, replace=True)
+        indices = rng.choice(n_samples, size=batch_size, replace=True)
         yield indices
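
Beyond determinism, dropping np.random.seed removes a side effect: seeding mutates NumPy's global stream for every other caller in the process, whereas a private RandomState is isolated. In plain NumPy terms:

    import numpy as np

    # old pattern: reseeding the global stream affects unrelated code
    np.random.seed(0)
    first = np.random.choice(10, 3)   # every later np.random call is now altered

    # new pattern: a private stream, reproducible but contained
    rng = np.random.RandomState(0)
    second = rng.choice(10, 3)        # the global np.random state is untouched

    assert np.array_equal(first, second)  # same seed, same draw
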
--- a/mlquantify/utils/prevalence.py
+++ b/mlquantify/utils/prevalence.py
@@ -3,27 +3,36 @@ import pandas as pd
 from collections import defaultdict
 
 
-def get_prev_from_labels(y, format="dict") -> dict:
+def get_prev_from_labels(y, format="dict", classes: list = None):
     """
     Get the real prevalence of each class in the target array.
-
+
     Parameters
     ----------
     y : np.ndarray or pd.Series
         Array of class labels.
-
+    format : str, default="dict"
+        Format of the output. Can be "array" or "dict".
+    classes : list, optional
+        List of unique classes. If provided, the output will be sorted by these classes.
+
     Returns
     -------
-    dict
-        Dictionary of class labels and their corresponding prevalence.
+    dict or np.ndarray
+        Dictionary of class labels and their corresponding prevalence or array of prevalences.
     """
     if isinstance(y, np.ndarray):
         y = pd.Series(y)
+
+    counts = y.value_counts(normalize=True).sort_index()
+
+    if classes is not None:
+        counts = counts.reindex(classes, fill_value=0.0)
+
     if format == "array":
-        prevalences = y.value_counts(normalize=True).sort_index().values
-        return prevalences
+        return counts.values
+
-    real_prevs = y.value_counts(normalize=True).to_dict()
-    real_prevs = dict(sorted(real_prevs.items()))
+    real_prevs = counts.to_dict()
     return real_prevs
 
 
--- a/mlquantify-0.1.18.dist-info/METADATA
+++ b/mlquantify-0.1.20.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mlquantify
-Version: 0.1.18
+Version: 0.1.20
 Summary: Quantification Library
 Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
 Maintainer: Luiz Fernando Luth Junior
--- a/mlquantify-0.1.18.dist-info/RECORD
+++ b/mlquantify-0.1.20.dist-info/RECORD
@@ -13,7 +13,7 @@ mlquantify/likelihood/__init__.py,sha256=3dC5uregNmquUKz0r0-3aPspfjZjKGn3TRBoZPO
 mlquantify/likelihood/_base.py,sha256=seu_Vb58QttcGbFjHKAplMYGZcVbIHqkyTXEK2cax9A,5830
 mlquantify/likelihood/_classes.py,sha256=PZ31cAwO8q5X3O2_oSmQ1FM6bY4EsB8hWEcAgcEmWXQ,14731
 mlquantify/meta/__init__.py,sha256=GzdGw4ky_kmd5VNWiLBULy06IdN_MLCDAuJKbnMOx4s,62
-mlquantify/meta/_classes.py,sha256=RKEVghPMBlyv516xrUtTyUkHvC2-5IsTUO_oVwAt3Gw,30930
+mlquantify/meta/_classes.py,sha256=0o3LBPGc-8znwJL0_TFo9zXjHrXqXc0QIPpzwaghFKQ,30898
 mlquantify/metrics/__init__.py,sha256=3bzzjSYTgrZIJsfAgJidQlB-bnjInwVYUvJ34bPhZxY,186
 mlquantify/metrics/_oq.py,sha256=koXDKeHWksl_vHpZuhc2pAps8wvu_MOgEztlSr04MmE,3544
 mlquantify/metrics/_rq.py,sha256=3yiEmGaRAGpzL29Et3tNqkJ3RMsLXwUX3uL9RoIgi40,3034
@@ -23,7 +23,7 @@ mlquantify/mixture/_base.py,sha256=1-yW64FPQXB_d9hH9KjSlDnmFtW9FY7S2hppXAd1DBg,5
 mlquantify/mixture/_classes.py,sha256=Dx0KWS-RtVVmJwXvPKIVWitsJhgcYRRiypLYrgE66x4,16420
 mlquantify/mixture/_utils.py,sha256=CKlC081nrkJ8Pil7lrPZvNZC_xfpXV8SsuQq3M_LHgA,4037
 mlquantify/model_selection/__init__.py,sha256=98I0uf8k6lbWAjazGyGjbOdPOvzU8aMRLqC3I7D3jzk,113
-mlquantify/model_selection/_protocol.py,sha256=2k0M_7YwZf7YLoQ8ElR2xMvLySVgtE_EvWieMXTIzTA,12499
+mlquantify/model_selection/_protocol.py,sha256=XhkNUN-XAuGkihm0jwQL665ps2G9bevxme_yrETNQHo,12902
 mlquantify/model_selection/_search.py,sha256=1UoP3tZ-pdfM25C-gOS89qjGKcDgQEeU7GTbwtsLKHU,10695
 mlquantify/model_selection/_split.py,sha256=chG3GNX2BBDTWIuSVfZUJ_YF_ZVBSoel2d_AN0OChS0,6
 mlquantify/neighbors/__init__.py,sha256=rIOuSaUhjqEXsUN9HNZ62P53QG0N7lJ3j1pvf8kJzms,93
@@ -43,11 +43,11 @@ mlquantify/utils/_get_scores.py,sha256=VlTvgg_t4D9MzcgsH7YvP_wIL5AZ8XmEtGpbFivdV
 mlquantify/utils/_load.py,sha256=cMGXIs-8mUB4blAmagyDNNvAaV2hysRgeInQMl5fDHg,303
 mlquantify/utils/_parallel.py,sha256=XotpX9nsj6nW-tNCmZ-ahTcRztgnn9oQKP2cl1rLdYM,196
 mlquantify/utils/_random.py,sha256=7F3nyy7Pa_kN8xP8P1L6MOM4WFu4BirE7bOfGTZ1Spk,1275
-mlquantify/utils/_sampling.py,sha256=QQxE2WKLdiCFUfPF6fKgzyrsOUIWYf74w_w8fbYVc2c,8409
+mlquantify/utils/_sampling.py,sha256=3W0vUuvLvoYrt-BZpSM0HM1XJEZr0XYIdkOcUP5hp-8,8350
 mlquantify/utils/_tags.py,sha256=Rz78TLpxgVxBKS0mKTlC9Qo_kn6HaEwVKNXh8pxFT7M,1095
 mlquantify/utils/_validation.py,sha256=zn4OHfa704YBaPKskhiThUG7wS5fvDoHBpcEgb1i8qM,18078
-mlquantify/utils/prevalence.py,sha256=FXLCJViQb2yDbyTXeGZt8WsPPnSZINhorQYZTKXOn14,1772
-mlquantify-0.1.18.dist-info/METADATA,sha256=XrQ188Icw5RZEAN8tvHRHTsRm1IKB1iwR_tm6G7uB0w,4701
-mlquantify-0.1.18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mlquantify-0.1.18.dist-info/top_level.txt,sha256=tGEkYkbbFElwULvqENjam3u1uXtyC1J9dRmibsq8_n0,11
-mlquantify-0.1.18.dist-info/RECORD,,
+mlquantify/utils/prevalence.py,sha256=LG-KXJ5Eb4w26WMpu4PoBpxMSHaqrmTQqdRlyqNRJ1o,2020
+mlquantify-0.1.20.dist-info/METADATA,sha256=VTVfeUzcWUpxdiPLHxr1wlkzfpyRAZ5ABhuAJksBg9E,4701
+mlquantify-0.1.20.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mlquantify-0.1.20.dist-info/top_level.txt,sha256=tGEkYkbbFElwULvqENjam3u1uXtyC1J9dRmibsq8_n0,11
+mlquantify-0.1.20.dist-info/RECORD,,