mlquantify 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,647 +1,297 @@
  from abc import ABC, abstractmethod
  import numpy as np
- import pandas as pd
- from typing import Union, List, Tuple, Any
- from sklearn.base import BaseEstimator
- from time import time
+ from typing import Generator, Tuple
  from tqdm import tqdm

- from ..methods import METHODS, AGGREGATIVE, NON_AGGREGATIVE
  from ..utils.general import *
- from ..utils.method import *
- from . import MEASURES
- from ..base import Quantifier
-
- import mlquantify as mq

  class Protocol(ABC):
      """Base class for evaluation protocols.

      Parameters
      ----------
-     models : Union[List[Union[str, Quantifier]], str, Quantifier]
-         List of quantification models, a single model name, or 'all' for all models.
-     learner : BaseEstimator, optional
-         Machine learning model to be used with the quantifiers. Required for model methods.
-     n_jobs : int, optional
-         Number of jobs to run in parallel. Default is 1.
+     batch_size : int or list of int
+         The size of the batches to be used in the evaluation.
      random_state : int, optional
-         Seed for random number generation. Default is 32.
-     verbose : bool, optional
-         Whether to print progress messages. Default is False.
-     return_type : str, optional
-         Type of return value ('predictions' or 'table'). Default is 'predictions'.
-     measures : List[str], optional
-         List of error measures to calculate. Must be in MEASURES or None. Default is None.
-     columns : List[str], optional
-         Columns to be included in the table. Default is ['ITERATION', 'QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS', 'BATCH_SIZE'].
-
+         The random seed for reproducibility.
+
      Attributes
      ----------
-     models : List[Quantifier]
-         List of quantification models.
-     learner : BaseEstimator
-         Machine learning model to be used with the quantifiers.
-     n_jobs : int
-         Number of jobs to run in parallel.
-     random_state : int
-         Seed for random number generation.
-     verbose : bool
-         Whether to print progress messages.
-     return_type : str
-         Type of return value ('predictions' or 'table').
-     measures : List[str]
-         List of error measures to calculate.
-     columns : List[str]
-         Columns to be included in the table.
-
+     n_combinations : int
+         The total number of batch/parameter combinations the protocol will generate.
+
      Raises
      ------
-     AssertionError
-         If measures contain invalid error measures.
-         If return_type is invalid.
-         If columns does not contain ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS'].
-
+     ValueError
+         If the batch size is not a positive integer or list of positive integers.
+
      Notes
      -----
-     - The 'models' parameter can be a list of Quantifiers, a single Quantifier, a list of model names, a single model name, or 'all'.
-     - If 'models' is a list of model names or 'all', 'learner' must be provided.
-     - The 'all' option for 'models' will use all quantification models available in the library.
-     - If 'models' is a Quantifier or list of Quantifier, 'learner' is not required. But the models must be initializated
-     - You can pass your own model by passing a Quantifier object.
-     - Columns must contain ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS'].
-     - If 'return_type' is 'table', the table will contain the columns specified in 'columns' and the error measures in 'measures'.
-     - For creating your own protocol, you must have the attributes 'models', 'learner', 'n_jobs', 'random_state', 'verbose', 'return_type', 'measures', and 'columns'., but columns can be changed, as long as it contains ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS'].
-
-     See Also
-     --------
-     APP : Artificial Prevalence Protocol.
-     NPP : Natural Prevalence Protocol.
-     Quantifier : Base class for quantification methods.
+     This class serves as a base class for different evaluation protocols, each with its own strategy for splitting the data into batches.

      Examples
      --------
-     import numpy as np
-     >>> from mlquantify.evaluation.protocol import Protocol
-     >>> from mlquantify.utils import get_real_prev
-     >>> from sklearn.ensemble import RandomForestClassifier
-     >>> from sklearn.datasets import load_breast_cancer
-     >>> from sklearn.model_selection import train_test_split
-     >>> import time as t
-     >>>
-     >>> class MyProtocol(Protocol):
-     ...     def __init__(self,
-     ...                  models,
-     ...                  learner,
-     ...                  n_jobs,
-     ...                  random_state,
-     ...                  verbose,
-     ...                  return_type,
-     ...                  measures,
-     ...                  sample_size,
-     ...                  iterations=10):
-     ...         super().__init__(models,
-     ...                          learner,
-     ...                          n_jobs,
-     ...                          random_state,
-     ...                          verbose,
-     ...                          return_type,
-     ...                          measures,
-     ...                          columns=['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS', 'TIME'])
-     ...         self.sample_size = sample_size
-     ...         self.iterations = iterations
-     ...
-     ...     def predict_protocol(self, X_test, y_test):
-     ...         predictions = []
-     ...
-     ...         X_sample, y_sample = self._new_sample(X_test, y_test)
-     ...
-     ...         for _ in range(self.iterations):
-     ...             for model in self.models:
-     ...                 quantifier = model.__class__.__name__
-     ...
-     ...                 real_prev = get_real_prev(y_sample)
+     >>> class MyCustomProtocol(Protocol):
+     ...     def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray]:
+     ...         for batch_size in self.batch_size:
+     ...             yield np.random.choice(X.shape[0], batch_size, replace=True)
      ...
-     ...                 start_time = t.time()
-     ...                 pred_prev = model.predict(X_sample)
-     ...                 end_time = t.time()
-     ...                 time = end_time - start_time
-     ...
-     ...                 predictions.append([quantifier, real_prev, pred_prev, time])
-     ...
-     ...         return predictions
-     ...
-     ...     def _new_sample(self, X_test, y_test):
-     ...         indexes = np.random.choice(len(X_test), size=self.sample_size, replace=False)
-     ...         return X_test[indexes], y_test[indexes]
-     >>>
-     >>>
-     >>> features, target = load_breast_cancer(return_X_y=True)
-     >>>
-     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.5, random_state=42)
-     >>>
-     >>> protocol = MyProtocol(models=["CC", "EMQ", "DyS"], # or [CC(learner), EMQ(learner), DyS(learner)]
-     ...                       learner=RandomForestClassifier(),
-     ...                       n_jobs=1,
-     ...                       random_state=42,
-     ...                       verbose=True,
-     ...                       return_type="table",
-     ...                       measures=None,
-     ...                       sample_size=100)
-     >>>
-     >>> protocol.fit(X_train, y_train)
-     >>> table = protocol.predict(X_test, y_test)
-     >>> print(table)
-
+     >>> protocol = MyCustomProtocol(batch_size=100, random_state=42)
+     >>> for batch_idx in protocol.split(X, y):
+     ...     # Evaluate a quantifier on each sampled batch
+     ...     pass
+
      """
-
-     def __init__(self,
-                  models: Union[List[Union[str, Quantifier]], str, Quantifier],
-                  learner: BaseEstimator = None,
-                  n_jobs: int = 1,
-                  random_state: int = 32,
-                  verbose: bool = False,
-                  return_type: str = "predictions",
-                  measures: List[str] = None,
-                  columns: List[str] = ["ITERATION", "QUANTIFIER", "REAL_PREVS", "PRED_PREVS", "BATCH_SIZE"]):
-
-         assert not measures or all(m in MEASURES for m in measures), \
-             f"Invalid measure(s) provided. Valid options: {list(MEASURES.keys())} or None"
-         assert return_type in ["predictions", "table"], \
-             "Invalid return_type. Valid options: ['predictions', 'table']"
-         assert all(col in columns for col in ["QUANTIFIER", "REAL_PREVS", "PRED_PREVS"]), \
-             "Columns must contain ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS']"
-
-         # Fixed parameters
-         self.models = self._initialize_models(models, learner)
-         self.learner = learner
-         self.n_jobs = n_jobs
-         self.random_state = random_state
-         self.verbose = verbose
-         self.return_type = return_type
-         self.measures = measures
-         self.columns = columns
-
-     def _initialize_models(self, models, learner):
-         """Initializes the quantification models.
-
-         Parameters
-         ----------
-         models : Union[List[Union[str, Quantifier]], str, Quantifier]
-             List of quantification models, a single model name, or 'all' for all models.
-         learner : BaseEstimator
-             Machine learning model to be used with the quantifiers.
-
-         Returns
-         -------
-         List[Quantifier]
-             List of quantification models.
-         """
-         if isinstance(models, list):
-             if all(isinstance(model, Quantifier) for model in models):
-                 return models
-             return [get_method(model)(learner) for model in models]
-
-         if isinstance(models, Quantifier):
-             return [models]

-         assert learner is not None, "Learner is required for model methods."
+     def __init__(self, batch_size, random_state=None, **kwargs):
+         if isinstance(batch_size, int):
+             self.n_combinations = 1
+         else:
+             self.n_combinations = len(batch_size)

-         model_dict = {
-             "all": METHODS.values,
-             "aggregative": AGGREGATIVE.values,
-             "non_aggregative": NON_AGGREGATIVE.values
-         }
+         self.batch_size = [batch_size] if isinstance(batch_size, int) else batch_size
+         self.random_state = random_state

-         if models in model_dict:
-             return [model(learner) if hasattr(model, "learner") else model() for model in model_dict[models]()]
-         return [get_method(models)(learner)]
-
-     def sout(self, msg):
-         """Prints a message if verbose is True."""
-         if self.verbose:
-             print('[APP]' + msg)
-
-     def fit(self, X_train, y_train):
-         """Fits the models with the training data.
-
-         Parameters
-         ----------
-         X_train : np.ndarray
-             Features of the training set.
-         y_train : np.ndarray
-             Labels of the training set.
-
-         Returns
-         -------
-         Protocol
-             Fitted protocol.
+         for name, value in kwargs.items():
+             setattr(self, name, value)
+             if isinstance(value, list):
+                 self.n_combinations *= len(value)
+             elif isinstance(value, (int, float)):
+                 self.n_combinations *= value
+             else:
+                 raise ValueError(f"Invalid argument {name}={value}: must be int/float or list of int/float.")
+
+
+     def split(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray, np.ndarray]:
          """
-         self.sout("Fitting models")
+         Split the data into samples for evaluation.

-         args = ((model, X_train, y_train) for model in self.models)
-
-         wrapper = tqdm if self.verbose else lambda x, **kwargs: x
-
-         self.models = Parallel(n_jobs=self.n_jobs)( # Parallel processing of models
-             delayed(self._delayed_fit)(*arg) for arg in wrapper(args, desc="Fitting models", total=len(self.models))
-         )
-         self.sout("Fit [Done]")
-         return self
-
-
-     def predict(self, X_test: np.ndarray, y_test: np.ndarray) -> Any:
-         """Predicts the prevalence for the test set.
-
          Parameters
          ----------
-         X_test : np.ndarray
-             Features of the test set.
-         y_test : np.ndarray
-             Labels of the test set.
-
-         Returns
-         -------
-         Any
-             Predictions for the test set. Can be a table or a tuple with the quantifier names, real prevalence, and predicted prevalence.
-         """
-         predictions = self.predict_protocol(X_test, y_test)
-         predictions_df = pd.DataFrame(predictions, columns=self.columns)
-
-         if self.return_type == "table":
-             if self.measures:
-                 smoothed_factor = 1 / (2 * len(X_test))
-
-                 def smooth(values: np.ndarray) -> np.ndarray:
-                     return (values + smoothed_factor) / (smoothed_factor * len(values) + 1)
-
-                 for metric in self.measures:
-                     predictions_df[metric] = predictions_df.apply(
-                         lambda row: get_measure(metric)(
-                             smooth(np.array(row["REAL_PREVS"])),
-                             smooth(np.array(row["PRED_PREVS"]))
-                         ),
-                         axis=1
-                     )
-                 return predictions_df
-
-         return (
-             predictions_df["QUANTIFIER"].to_numpy(), # Quantifier names
-             np.stack(predictions_df["REAL_PREVS"].to_numpy()), # REAL_PREVS
-             np.stack(predictions_df["PRED_PREVS"].to_numpy()) # PRED_PREVS
-         )
+         X : np.ndarray
+             The input features.
+         y : np.ndarray
+             The target labels.

-     @abstractmethod
-     def predict_protocol(self, X_test: np.ndarray, y_test: np.ndarray) -> np.ndarray:
-         """Abstract method that every protocol must implement
-
-         Parameters
-         ----------
-         X_test : np.ndarray
-             Features of the test set.
-         y_test : np.ndarray
-             Labels of the test set.
-
-         Returns
-         -------
-         np.ndarray
-             Predictions for the test set. With the same format as the column names attribute.
+         Yields
+         ------
+         Generator[np.ndarray, np.ndarray]
+             A generator that yields the indices for each split.
          """
-         ...
+         indices = np.arange(X.shape[0])
+         for idx in self._split_indices_masks(X, y):
+             indexes = indices[idx]
+             yield indexes

-     @abstractmethod
-     def _new_sample(self) -> Tuple[np.ndarray, np.ndarray]:
-         """Abstract method of sample extraction for each protocol.
+     def _split_indices_masks(self, X: np.ndarray, y: np.ndarray) -> Generator[Tuple[np.ndarray, np.ndarray]]:
+         for idx in self._iter_indices(X, y):

-         Returns:
-             Tuple[np.ndarray, np.ndarray]: Tuple containing X_sample and y_sample.
-         """
-         ...
+             mask = np.zeros(X.shape[0], dtype=bool)
+             mask[idx] = True

-     @staticmethod
-     def _delayed_fit(model, X_train, y_train):
-         """Method to fit the model in parallel.
-
-         Parameters
-         ----------
-         model : Quantifier
-             Quantification model.
-         X_train : np.ndarray
-             Features of the training set.
-         y_train : np.ndarray
-             Labels of the training set.
-
-         Returns
-         -------
-         Quantifier
-             Fitted quantification model
-         """
-         model_name = model.__class__.__name__
-         if model_name == "Ensemble" and isinstance(model.base_quantifier, Quantifier):
-             model_name = f"{model.__class__.__name__}_{model.base_quantifier.__class__.__name__}_{model.size}"
-
-         start = time()
-         model = model.fit(X=X_train, y=y_train)
-         duration = time() - start
-         print(f"\tFitted {model_name} in {duration:.3f} seconds")
-         return model
+             yield mask

-
-
-
+     @abstractmethod
+     def _iter_indices(self, X, y):
+         """Abstract method to be implemented by subclasses to yield indices for each batch."""
+         pass

+     def get_n_combinations(self) -> int:
+         """
+         Get the number of combinations for the current protocol.
+         """
+         return self.n_combinations


105
  class APP(Protocol):
356
- """Artificial Prevalence Protocol.
357
-
358
- This approach splits a test into several samples varying prevalence and sample size,
359
- with n iterations. For a list of Quantifiers, it computes training and testing for
360
- each one and returns either a table of results with error measures or just the predictions.
106
+ """Artificial Prevalence Protocol (APP) for evaluation.
107
+ This protocol generates artificial prevalence distributions for the evaluation in an exhaustive manner, testing all possible combinations of prevalences.
361
108
 
362
109
  Parameters
363
110
  ----------
364
- models : Union[List[Union[str, Quantifier]], str, Quantifier]
365
- List of quantification models, a single model name, or 'all' for all models.
366
- batch_size : Union[List[int], int]
367
- Size of the batches to be processed, or a list of sizes.
368
- learner : BaseEstimator, optional
369
- Machine learning model to be used with the quantifiers. Required for model methods.
370
- n_prevs : int, optional
371
- Number of prevalence points to generate. Default is 100.
372
- n_iterations : int, optional
373
- Number of iterations for the protocol. Default is 1.
374
- n_jobs : int, optional
375
- Number of jobs to run in parallel. Default is 1.
111
+ batch_size : int or list of int
112
+ The size of the batches to be used in the evaluation.
113
+ n_prevalences : int
114
+ The number of artificial prevalences to generate.
115
+ repeats : int, optional
116
+ The number of times to repeat the evaluation with different random seeds.
376
117
  random_state : int, optional
377
- Seed for random number generation. Default is 32.
378
- verbose : bool, optional
379
- Whether to print progress messages. Default is False.
380
- return_type : str, optional
381
- Type of return value ('predictions' or 'table'). Default is 'predictions'.
382
- measures : List[str], optional
383
- List of error measures to calculate. Must be in MEASURES or None. Default is None.
384
-
118
+ The random seed for reproducibility.
119
+
385
120
  Attributes
386
121
  ----------
387
- models : List[Quantifier]
388
- List of quantification models.
389
- batch_size : Union[List[int], int]
390
- Size of the batches to be processed.
391
- learner : BaseEstimator
392
- Machine learning model to be used with the quantifiers.
393
- n_prevs : int
394
- Number of prevalence points to generate.
395
- n_iterations : int
396
- Number of iterations for the protocol.
397
- n_jobs : int
398
- Number of jobs to run in parallel.
122
+ n_prevalences : int
123
+ The number of artificial prevalences to generate.
124
+ repeats : int
125
+ The number of times to repeat the evaluation with different random seeds.
399
126
  random_state : int
400
- Seed for random number generation.
401
- verbose : bool
402
- Whether to print progress messages.
403
- return_type : str
404
- Type of return value ('predictions' or 'table').
405
- measures : List[str]
406
- List of error measures to calculate.
127
+ The random seed for reproducibility.
407
128
 
408
- Raises
409
- ------
410
- AssertionError
411
- If return_type is invalid.
412
-
413
- See Also
414
- --------
415
- Protocol : Base class for evaluation protocols.
416
- NPP : Natural Prevalence Protocol.
417
- Quantifier : Base class for quantification methods.
418
-
129
+ Notes
130
+ -----
131
+ It is important to note that in case of multiclass problems, the time complexity of this protocol can be significantly higher due to the increased number of combinations to evaluate.
132
+
419
133
  Examples
420
134
  --------
421
- >>> from mlquantify.evaluation.protocol import APP
422
- >>> from sklearn.ensemble import RandomForestClassifier
423
- >>> from sklearn.datasets import load_breast_cancer
424
- >>> from sklearn.model_selection import train_test_split
425
- >>>
426
- >>> # Loading dataset from sklearn
427
- >>> features, target = load_breast_cancer(return_X_y=True)
428
- >>>
429
- >>> #Splitting into train and test
430
- >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
431
- >>>
432
- >>> app = APP(models=["CC", "EMQ", "DyS"],
433
- ... batch_size=[10, 50, 100],
434
- ... learner=RandomForestClassifier(),
435
- ... n_prevs=100, # Default
436
- ... n_jobs=-1,
437
- ... return_type="table",
438
- ... measures=["ae", "se"],
439
- ... verbose=True)
440
- >>>
441
- >>> app.fit(X_train, y_train)
442
- >>>
443
- >>> table = app.predict(X_test, y_test)
444
- >>>
445
- >>> print(table)
446
- """
447
-
448
- def __init__(self,
449
- models: Union[List[Union[str, Quantifier]], str, Quantifier],
450
- batch_size: Union[List[int], int],
451
- learner: BaseEstimator = None,
452
- n_prevs: int = 100,
453
- n_iterations: int = 1,
454
- n_jobs: int = 1,
455
- random_state: int = 32,
456
- verbose: bool = False,
457
- return_type: str = "predictions",
458
- measures: List[str] = None):
459
-
460
- super().__init__(models, learner, n_jobs, random_state, verbose, return_type, measures)
461
- self.n_prevs = n_prevs
462
- self.batch_size = batch_size if isinstance(batch_size, list) else [batch_size]
463
- self.n_prevs = n_prevs
464
- self.n_iterations = n_iterations
135
+ >>> protocol = APP(batch_size=[100, 200], n_prevalences=5, repeats=3, random_state=42)
136
+ >>> for train_idx, test_idx in protocol.split(X, y):
137
+ ... # Train and evaluate model
138
+ ... pass
465
139
 
140
+ """
466
141
 
-     def predict_protocol(self, X_test: np.ndarray, y_test: np.ndarray) -> Tuple:
-         """Generates several samples with artificial prevalences and sizes.
-         For each model, predicts with this sample, aggregating all together
-         with a pandas dataframe if requested, or else just the predictions.
+     def __init__(self, batch_size, n_prevalences, repeats=1, random_state=None):
+         super().__init__(batch_size=batch_size,
+                          random_state=random_state,
+                          n_prevalences=n_prevalences,
+                          repeats=repeats)

-         Parameters
-         ----------
-         X_test : np.ndarray
-             Features of the test set.
-         y_test : np.ndarray
-             Labels of the test set.
+     def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray]:

-         Returns
-         -------
-         Tuple
-             Tuple containing the (iteration, model name, prev, prev_pred, and batch size).
-         """
-
-         n_dim = len(np.unique(y_test))
-         prevs = generate_artificial_prevalences(n_dim, self.n_prevs, self.n_iterations)
-
-         args = [
-             (iteration, X_test, y_test, model, prev, bs, self.verbose)
-             for prev in prevs for bs in self.batch_size for model in self.models for iteration in range(self.n_iterations)
-         ]
+         n_dim = len(np.unique(y))

-         size = len(prevs) * len(self.models) * len(self.batch_size) * self.n_iterations
+         for batch_size in self.batch_size:
+             prevalences = generate_artificial_prevalences(n_dim=n_dim,
+                                                           n_prev=self.n_prevalences,
+                                                           n_iter=self.repeats)
+             for prev in prevalences:
+                 indexes = get_indexes_with_prevalence(y, prev, batch_size)
+                 yield indexes
+

-         predictions = []
-         for arg in tqdm(args, desc="Running APP", total=size):
-             predictions.append(self._predict(*arg))
-
-         return predictions
+

-     def _predict(self, iteration:int, X: np.ndarray, y: np.ndarray, model: Any, prev: List[float], batch_size: int, verbose: bool) -> Tuple:
-         """Method predicts into the new sample for each model and prevalence.
+ class NPP(Protocol):
+     """No Prevalence Protocol (NPP) for evaluation.
+     This protocol just samples the data without any consideration for prevalence, with all instances having equal probability of being selected.

-         Parameters
-         ----------
-         iteration : int
-             Current iteration.
-         X : np.ndarray
-             Features of the test set.
-         y : np.ndarray
-             Labels of the test set.
-         model : Any
-             Quantification model.
-         prev : List[float]
-             Prevalence values for the sample.
-         batch_size : int
-             Batch size for the sample.
-         verbose : bool
-             Whether to print progress messages.
-
-         Returns
-         -------
-         Tuple
-             Tuple containing the iteration, model name, prev, prev_pred, and batch size.
-         """
-         model_name = model.__class__.__name__
-         if model_name == "Ensemble" and isinstance(model.base_quantifier, Quantifier):
-             model_name = f"{model.__class__.__name__}_{model.base_quantifier.__class__.__name__}_{model.size}"
-
-         if verbose:
-             print(f'\t {model_name} with {batch_size} instances and prev {prev}')
-
-         X_sample, _ = self._new_sample(X, y, prev, batch_size)
-         prev_pred = np.asarray(list(model.predict(X_sample).values()))
-
-         if verbose:
-             print(f'\t \\--Ending {model_name} with {batch_size} instances and prev {prev}\n')
-
-         return (iteration+1, model_name, prev, prev_pred, batch_size)
+     Parameters
+     ----------
+     batch_size : int or list of int
+         The size of the batches to be used in the evaluation.
+     random_state : int, optional
+         The random seed for reproducibility.

+     Attributes
+     ----------
+     random_state : int
+         The random seed for reproducibility.

-     def _new_sample(self, X: np.ndarray, y: np.ndarray, prev: List[float], batch_size: int) -> Tuple[np.ndarray, np.ndarray]:
-         """Generates a new sample with a specified prevalence and size.
+     Examples
+     --------
+     >>> protocol = NPP(batch_size=100, random_state=42)
+     >>> for batch_idx in protocol.split(X, y):
+     ...     # Evaluate a quantifier on each sampled batch
+     ...     pass
+     """

-         Parameters
-         ----------
-         X : np.ndarray
-             Features of the test set.
-         y : np.ndarray
-             Labels of the test set.
-         prev : List[float]
-             Prevalence values for the sample.
-         batch_size : int
-             Batch size for the sample.
+     def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray]:

-         Returns
-         -------
-         Tuple[np.ndarray, np.ndarray]
-             Tuple containing the new sample features and labels.
-         """
-         sample_index = generate_artificial_indexes(y, prev, batch_size, np.unique(y))
-         return (np.take(X, sample_index, axis=0), np.take(y, sample_index, axis=0))
-
-
-
+         for batch_size in self.batch_size:
+             yield np.random.choice(X.shape[0], batch_size, replace=True)
+

+ class UPP(Protocol):
+     """Uniform Prevalence Protocol (UPP) for evaluation.
+     An extension of the APP that generates artificial prevalence distributions uniformly across all classes utilizing the Kraemer sampling method.

+     Parameters
+     ----------
+     batch_size : int or list of int
+         The size of the batches to be used in the evaluation.
+     n_prevalences : int
+         The number of artificial prevalences to generate.
+     repeats : int
+         The number of times to repeat the evaluation with different random seeds.
+     random_state : int, optional
+         The random seed for reproducibility.

+     Attributes
+     ----------
+     n_prevalences : int
+         The number of artificial prevalences to generate.
+     repeats : int
+         The number of times to repeat the evaluation with different random seeds.
+     random_state : int
+         The random seed for reproducibility.

+     Examples
+     --------
+     >>> protocol = UPP(batch_size=100, n_prevalences=5, repeats=3, random_state=42)
+     >>> for batch_idx in protocol.split(X, y):
+     ...     # Evaluate a quantifier on each sampled batch
+     ...     pass
+     """

+     def __init__(self, batch_size, n_prevalences, repeats=1, random_state=None):
+         super().__init__(batch_size=batch_size,
+                          random_state=random_state,
+                          n_prevalences=n_prevalences,
+                          repeats=repeats)

+     def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray]:
+
+         n_dim = len(np.unique(y))
+
+         for batch_size in self.batch_size:
+
+             prevalences = kraemer_sampling(n_dim=n_dim,
+                                            n_prev=self.n_prevalences,
+                                            n_iter=self.repeats)
+
+             for prev in prevalences:
+                 indexes = get_indexes_with_prevalence(y, prev, batch_size)
+                 yield indexes


+ class PPP(Protocol):
+     """Personalized Prevalence Protocol (PPP) for evaluation.
+     This protocol generates artificial prevalence distributions personalized for each class.

- class NPP(Protocol):
-     """Natural Prevalence Protocol.
-
-     This approach splits a test into several samples varying sample size,
-     with n iterations. For a list of Quantifiers, it computes training and testing for
-     each one and returns either a table of results with error measures or just the predictions.
-
      Parameters
      ----------
-     models : Union[List[Union[str, Quantifier]], str, Quantifier]
-         List of quantification models, a single model name, or 'all' for all models.
-     batch_size : Union[List[int], int]
-         Size of the batches to be processed, or a list of sizes.
-     learner : BaseEstimator, optional
-         Machine learning model to be used with the quantifiers. Required for model methods.
-     n_iterations : int, optional
-         Number of iterations for the protocol. Default is 1.
-     n_jobs : int, optional
-         Number of jobs to run in parallel. Default is 1.
+     batch_size : int or list of int
+         The size of the batches to be used in the evaluation.
+     prevalences : list of float
+         The list of artificial prevalences to generate for each class.
+     repeats : int
+         The number of times to repeat the evaluation with different random seeds.
      random_state : int, optional
-         Seed for random number generation. Default is 32.
-     verbose : bool, optional
-         Whether to print progress messages. Default is False.
-     return_type : str, optional
-         Type of return value ('predictions' or 'table'). Default is 'predictions'.
-     measures : List[str], optional
-         List of error measures to calculate. Must be in MEASURES or None. Default is None.
-
+         The random seed for reproducibility.
+
      Attributes
      ----------
-     models : List[Quantifier]
-         List of quantification models.
-     batch_size : Union[List[int], int]
-         Size of the batches to be processed.
-     learner : BaseEstimator
-         Machine learning model to be used with the quantifiers.
-     n_iterations : int
-         Number of iterations for the protocol.
-     n_jobs : int
-         Number of jobs to run in parallel.
+     prevalences : list of float
+         The list of artificial prevalences to generate for each class.
+     repeats : int
+         The number of times to repeat the evaluation with different random seeds.
      random_state : int
-         Seed for random number generation.
-     verbose : bool
-         Whether to print progress messages.
-     return_type : str
-         Type of return value ('predictions' or 'table').
-     measures : List[str]
-         List of error measures to calculate.
+         The random seed for reproducibility.
+
+     Examples
+     --------
+     >>> protocol = PPP(batch_size=100, prevalences=[0.1, 0.9], repeats=3, random_state=42)
+     >>> for batch_idx in protocol.split(X, y):
+     ...     # Evaluate a quantifier on each sampled batch
+     ...     pass
      """

+     def __init__(self, batch_size, prevalences, repeats=1, random_state=None):
+         super().__init__(batch_size=batch_size,
+                          random_state=random_state,
+                          prevalences=prevalences,
+                          repeats=repeats)

-     def __init__(self,
-                  models: Union[List[Union[str, Quantifier]], str, Quantifier],
-                  learner: BaseEstimator = None,
-                  n_jobs: int = 1,
-                  random_state: int = 32,
-                  verbose: bool = False,
-                  return_type: str = "predictions",
-                  measures: List[str] = None):
-
-         super().__init__(models, learner, n_jobs, random_state, verbose, return_type, measures)
-
-
-     def predict_protocol(self, X_test, y_test) -> tuple:
-         raise NotImplementedError
-
-
-     def _new_sample(self, X, y, prev: List[float], batch_size: int) -> tuple:
-         raise NotImplementedError
+     def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray]:

-
-     def _delayed_predict(self, args) -> tuple:
-         raise NotImplementedError
+         for batch_size in self.batch_size:
+             for prev in self.prevalences:
+                 if isinstance(prev, float):
+                     prev = [1-prev, prev]
+
+                 indexes = get_indexes_with_prevalence(y, prev, batch_size)
+                 yield indexes
+
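To make the refactored protocol API above concrete, here is a minimal, self-contained sketch (an editor's illustration, not code shipped in the package) that subclasses `Protocol` and consumes the batches produced by `split`. The import path and semantics are assumptions taken from the hunk above.

```python
# Hedged sketch of the 0.1.4 protocol API shown in the hunk above; the import
# path and behavior are assumptions based on this diff, not verified docs.
import numpy as np
from mlquantify.evaluation.protocol import Protocol, APP

class UniformBatchProtocol(Protocol):
    """Hypothetical custom protocol: one uniform random batch per batch size."""
    def _iter_indices(self, X, y):
        rng = np.random.default_rng(self.random_state)
        for batch_size in self.batch_size:
            # Uniform sampling with replacement, as NPP does above. Note that
            # split() converts the indices to a boolean mask, so duplicate
            # draws collapse and a yielded batch can come out slightly smaller.
            yield rng.choice(X.shape[0], batch_size, replace=True)

X = np.random.rand(500, 4)
y = np.random.randint(0, 2, size=500)

custom = UniformBatchProtocol(batch_size=[50, 100], random_state=0)
for batch_idx in custom.split(X, y):
    print(len(batch_idx))  # at most 50, then at most 100

app = APP(batch_size=100, n_prevalences=5, repeats=2, random_state=0)
print(app.get_n_combinations())  # 1 batch size * 5 prevalences * 2 repeats = 10
for batch_idx in app.split(X, y):
    X_batch, y_batch = X[batch_idx], y[batch_idx]  # evaluate a quantifier here
```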
@@ -26,12 +26,9 @@ def convert_columns_to_arrays(df, columns:list = ['PRED_PREVS', 'REAL_PREVS']):
      return df


-
-
-
- def generate_artificial_indexes(y, prevalence: list, sample_size:int, classes:list):
+ def get_indexes_with_prevalence(y, prevalence: list, sample_size:int):
      """
-     Generate indexes for a stratified sample based on the prevalence of each class.
+     Get indexes for a stratified sample based on the prevalence of each class.

      Parameters
      ----------
@@ -48,10 +45,13 @@ def generate_artificial_indexes(y, prevalence: list, sample_size:int, classes:li
      -------
      list
          List of indexes for the stratified sample.
-     """
+     """
+     classes = np.unique(y)
+
      # Ensure the sum of prevalences is 1
      assert np.isclose(sum(prevalence), 1), "The sum of prevalences must be 1"
      # Ensure the number of prevalences matches the number of classes
+     assert len(prevalence) == len(classes), "The number of prevalences must match the number of classes"

      sampled_indexes = []
      total_sampled = 0
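As a quick illustration of the renamed helper above (an editor's sketch; the import path `mlquantify.utils.general` is inferred from this diff), `get_indexes_with_prevalence` draws a stratified sample whose class proportions approximate the requested prevalence vector:

```python
# Editor's illustration of get_indexes_with_prevalence (assumed importable
# from mlquantify.utils.general per this diff); not code from the package.
import numpy as np
from mlquantify.utils.general import get_indexes_with_prevalence

y = np.array([0] * 700 + [1] * 300)
idx = get_indexes_with_prevalence(y, prevalence=[0.2, 0.8], sample_size=100)

sample = y[idx]
print(len(sample), (sample == 1).mean())  # ~100 instances, positive rate near 0.8
```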
@@ -78,6 +78,43 @@ def generate_artificial_indexes(y, prevalence: list, sample_size:int, classes:li



+ def kraemer_sampling(n_dim: int, n_prev: int, n_iter: int = 1) -> np.ndarray:
+     """
+     Uniform sampling from the unit simplex using Kraemer's algorithm.
+
+     Parameters
+     ----------
+     n_dim : int
+         Number of dimensions.
+     n_prev : int
+         Size of the sample.
+     n_iter : int
+         Number of iterations.
+
+     Returns
+     -------
+     np.ndarray
+         Array of sampled prevalences.
+     """
+
+     def _sampling(n_dim: int, n_prev: int) -> np.ndarray:
+         if n_dim == 2:
+             u = np.random.rand(n_prev)
+             return np.vstack([1 - u, u]).T
+         else:
+             u = np.random.rand(n_prev, n_dim - 1)
+             u.sort(axis=-1) # sort each row
+             _0s = np.zeros((n_prev, 1))
+             _1s = np.ones((n_prev, 1))
+             a = np.hstack([_0s, u])
+             b = np.hstack([u, _1s])
+             return b - a
+
+     # repeat n_iter times
+     prevs = _sampling(n_dim, n_prev)
+
+     return np.repeat(prevs, n_iter, axis=0) if n_iter > 1 else prevs
+

  def generate_artificial_prevalences(n_dim: int, n_prev: int, n_iter: int) -> np.ndarray:
      """Generates n artificial prevalences with n dimensions.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mlquantify
- Version: 0.1.3
+ Version: 0.1.4
  Summary: Quantification Library
  Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
  Maintainer: Luiz Fernando Luth Junior
@@ -40,9 +40,9 @@ ___

  ## Latest Release

- - **Version 0.0.11.6**: Inicial beta version. For a detailed list of changes, check the [changelog](#).
- - In case you need any help, refer to the [wiki](https://github.com/luizfernandolj/mlquantify/wiki).
- - Explore the [API documentation](#) for detailed developer information.
+ - **Version 0.1.3**: Initial beta version. For a detailed list of changes, check the [changelog](#).
+ - In case you need any help, refer to the [User Guide](https://luizfernandolj.github.io/mlquantify/user_guide.html).
+ - Explore the [API documentation](https://luizfernandolj.github.io/mlquantify/api/index.html) for detailed developer information.
  - See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)

  ___
@@ -70,7 +70,7 @@ ___
  | **21 Quantification Methods** | Methods for quantification, such as classify & Count Correct methods, Threshold Optimization, Mixture Models and more.|
  | **Dynamic class management** | All methods are dynamic, and handles multiclass and binary problems, in case of binary it makes One-Vs-All (OVA) automatically. |
  | **Model Selection** | Criteria and processes used to select the best model, such as grid-search for the case of quantification|
- | **Evaluation Metrics** | Specific metrics used to evaluate quantification performance, (e.g., AE, BIAS, NAE, SE, KLD, etc.). |
+ | **Evaluation Metrics** | Specific metrics used to evaluate quantification performance, (e.g., AE, MAE, NAE, SE, KLD, etc.). |
  | **Evaluation Protocols** | Evaluation protocols used, based on sampling generation (e.g., APP, NPP, etc.).. |
  | **Plotting Results** | Tools and techniques used to visualize results, such as the protocol results.|
  | **Comprehensive Documentation** | Complete documentation of the project, including code, data, and results. |
@@ -82,7 +82,10 @@ ___
  This code first loads the breast cancer dataset from _sklearn_, which is then split into training and testing sets. It uses the _Expectation Maximisation Quantifier (EMQ)_ with a RandomForest classifier to predict class prevalence. After training the model, it evaluates performance by calculating and printing the absolute error and bias between the real and predicted prevalences.

  ```python
- import mlquantify as mq
+ from mlquantify.methods import EMQ
+ from mlquantify.evaluation.measures import absolute_error, mean_absolute_error
+ from mlquantify.utils import get_real_prev
+
  from sklearn.ensemble import RandomForestClassifier
  from sklearn.datasets import load_breast_cancer
  from sklearn.model_selection import train_test_split
@@ -94,19 +97,19 @@ features, target = load_breast_cancer(return_X_y=True)
  X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)

  #Create the model, here it is the Expectation Maximisation Quantifier (EMQ) with a classifier
- model = mq.methods.EMQ(RandomForestClassifier())
+ model = EMQ(RandomForestClassifier())
  model.fit(X_train, y_train)

  #Predict the class prevalence for X_test
  pred_prevalence = model.predict(X_test)
- real_prevalence = mq.utils.get_real_prev(y_test)
+ real_prevalence = get_real_prev(y_test)

  #Get the error for the prediction
- ae = mq.evaluation.absolute_error(real_prevalence, pred_prevalence)
- bias = mq.evaluation.bias(real_prevalence, pred_prevalence)
+ ae = absolute_error(real_prevalence, pred_prevalence)
+ mae = mean_absolute_error(real_prevalence, pred_prevalence)

- print(f"Mean Squared Error (MSE) -> {ae:.4f}")
- print(f"Bias -> {bias}")
+ print(f"Absolute Error -> {ae}")
+ print(f"Mean Absolute Error -> {mae}")
  ```

  ___
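The refactored protocols pair naturally with the quick example above. The following editor's sketch (not part of the README; it assumes the 0.1.4 API shown earlier in this diff, plus the `model`, `X_test`, and `y_test` variables from the snippet) scores the fitted EMQ model over APP-generated batches:

```python
# Editor's continuation of the README example; assumes the 0.1.4 protocol API
# from this diff and reuses model, X_test, y_test, absolute_error, get_real_prev.
import numpy as np
from mlquantify.evaluation.protocol import APP

protocol = APP(batch_size=100, n_prevalences=11, random_state=42)

errors = []
for batch_idx in protocol.split(X_test, y_test):
    X_batch, y_batch = X_test[batch_idx], y_test[batch_idx]
    pred_prev = model.predict(X_batch)
    errors.append(absolute_error(get_real_prev(y_batch), pred_prev))

print(f"Mean AE over {len(errors)} artificial-prevalence batches -> {np.mean(errors):.4f}")
```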
@@ -125,7 +128,7 @@ ___

  ## Documentation

- ##### API is avaliable [here](#)
+ ##### API is available [here](https://luizfernandolj.github.io/mlquantify/api/index.html)

  - [Methods](https://github.com/luizfernandolj/mlquantify/wiki/Methods)
  - [Model Selection](https://github.com/luizfernandolj/mlquantify/wiki/Model-Selection)
@@ -6,7 +6,7 @@ mlquantify/classification/__init__.py,sha256=3FGf-F4SOM3gByUPsWdnBzjyC_31B3Mtzuo
  mlquantify/classification/methods.py,sha256=yDSbpoqM3hfF0a9ATzKqfG9S-44x-0Rq0lkAVJKTIEs,5006
  mlquantify/evaluation/__init__.py,sha256=x1grng0n_QeZpVBU8-pwagYdBMkbMRILtrp1qk_bLvk,447
  mlquantify/evaluation/measures.py,sha256=fIKyxxlD8em3oaj4u_BeXmNyUQG_A0vXWY8APPgNoJ0,6579
- mlquantify/evaluation/protocol.py,sha256=OsOXm_vf7sYlw9pQv08WxAvvgzo10bAqiDM-1cpz7nQ,24020
+ mlquantify/evaluation/protocol.py,sha256=__tzRyqW4cJz4Fl87TInf7dXxIJ6bSaYaSaw-SdkNmM,10365
  mlquantify/methods/__init__.py,sha256=ya3Mn7bcz2r3oaIT7yVR4iJkAfgEAwF4xDK54C0rZ7U,536
  mlquantify/methods/aggregative.py,sha256=F5Z-tGA9OcZgMBLKOeaos6wIgvvnDeriZ4y0TyMpDrc,39051
  mlquantify/methods/meta.py,sha256=sZWQHUGkm6iiqujmIpHDL_8tDdKQ161bzD5mcpXLWEY,19066
@@ -14,9 +14,9 @@ mlquantify/methods/mixture_models.py,sha256=si2Pzaka5Kbva4QKBzLolvb_8V0ZEjp68UBA
  mlquantify/methods/non_aggregative.py,sha256=xaBu21TUtiYkOEUKO16NaNMwdNa6-SNjfBsc5PpIMyI,4815
  mlquantify/methods/threshold_optimization.py,sha256=NYGKbYvtfmiBeU8wpTiFCdURkijcPRZtybPOt6vtXbY,30489
  mlquantify/utils/__init__.py,sha256=logWrL6B6mukP8tvYm_UPEdO9eNA-J-ySILr7-syDoc,44
- mlquantify/utils/general.py,sha256=Li5ix_dy19dUhYNgiUsNHdqqnSVYvznUBUuyr-zYSPI,7554
+ mlquantify/utils/general.py,sha256=wKJSmwF1KfSlSrDm0KTf92FMvB62BBOxf2Se9HyeWYE,8668
  mlquantify/utils/method.py,sha256=RL4vBJGl5_6DZ59Bs62hdNXI_hnoDIWilMMyMPiOjBg,12631
- mlquantify-0.1.3.dist-info/METADATA,sha256=FkF8Qt_lHsa0Lf0sXAQ36Ri5bs5aMkAoNVzubTPty1A,4940
- mlquantify-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- mlquantify-0.1.3.dist-info/top_level.txt,sha256=tGEkYkbbFElwULvqENjam3u1uXtyC1J9dRmibsq8_n0,11
- mlquantify-0.1.3.dist-info/RECORD,,
+ mlquantify-0.1.4.dist-info/METADATA,sha256=UtNxYnZnSt6HS0B8JsW5A5tvxlxFUH_GODjF1AXXsSY,5166
+ mlquantify-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ mlquantify-0.1.4.dist-info/top_level.txt,sha256=tGEkYkbbFElwULvqENjam3u1uXtyC1J9dRmibsq8_n0,11
+ mlquantify-0.1.4.dist-info/RECORD,,