mlquantify 0.0.11.2__py3-none-any.whl → 0.1.1__py3-none-any.whl
- mlquantify/__init__.py +32 -6
- mlquantify/base.py +559 -257
- mlquantify/classification/__init__.py +1 -1
- mlquantify/classification/methods.py +160 -0
- mlquantify/evaluation/__init__.py +14 -2
- mlquantify/evaluation/measures.py +215 -0
- mlquantify/evaluation/protocol.py +647 -0
- mlquantify/methods/__init__.py +37 -40
- mlquantify/methods/aggregative.py +1030 -0
- mlquantify/methods/meta.py +472 -0
- mlquantify/methods/mixture_models.py +1003 -0
- mlquantify/methods/non_aggregative.py +136 -0
- mlquantify/methods/threshold_optimization.py +957 -0
- mlquantify/model_selection.py +377 -232
- mlquantify/plots.py +367 -0
- mlquantify/utils/__init__.py +2 -2
- mlquantify/utils/general.py +334 -0
- mlquantify/utils/method.py +449 -0
- {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/METADATA +137 -122
- mlquantify-0.1.1.dist-info/RECORD +22 -0
- {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/WHEEL +1 -1
- mlquantify/classification/pwkclf.py +0 -73
- mlquantify/evaluation/measures/__init__.py +0 -26
- mlquantify/evaluation/measures/ae.py +0 -11
- mlquantify/evaluation/measures/bias.py +0 -16
- mlquantify/evaluation/measures/kld.py +0 -8
- mlquantify/evaluation/measures/mse.py +0 -12
- mlquantify/evaluation/measures/nae.py +0 -16
- mlquantify/evaluation/measures/nkld.py +0 -13
- mlquantify/evaluation/measures/nrae.py +0 -16
- mlquantify/evaluation/measures/rae.py +0 -12
- mlquantify/evaluation/measures/se.py +0 -12
- mlquantify/evaluation/protocol/_Protocol.py +0 -202
- mlquantify/evaluation/protocol/__init__.py +0 -2
- mlquantify/evaluation/protocol/app.py +0 -146
- mlquantify/evaluation/protocol/npp.py +0 -34
- mlquantify/methods/aggregative/ThreholdOptm/_ThreholdOptimization.py +0 -62
- mlquantify/methods/aggregative/ThreholdOptm/__init__.py +0 -7
- mlquantify/methods/aggregative/ThreholdOptm/acc.py +0 -27
- mlquantify/methods/aggregative/ThreholdOptm/max.py +0 -23
- mlquantify/methods/aggregative/ThreholdOptm/ms.py +0 -21
- mlquantify/methods/aggregative/ThreholdOptm/ms2.py +0 -25
- mlquantify/methods/aggregative/ThreholdOptm/pacc.py +0 -41
- mlquantify/methods/aggregative/ThreholdOptm/t50.py +0 -21
- mlquantify/methods/aggregative/ThreholdOptm/x.py +0 -23
- mlquantify/methods/aggregative/__init__.py +0 -9
- mlquantify/methods/aggregative/cc.py +0 -32
- mlquantify/methods/aggregative/emq.py +0 -86
- mlquantify/methods/aggregative/fm.py +0 -72
- mlquantify/methods/aggregative/gac.py +0 -96
- mlquantify/methods/aggregative/gpac.py +0 -87
- mlquantify/methods/aggregative/mixtureModels/_MixtureModel.py +0 -81
- mlquantify/methods/aggregative/mixtureModels/__init__.py +0 -5
- mlquantify/methods/aggregative/mixtureModels/dys.py +0 -55
- mlquantify/methods/aggregative/mixtureModels/dys_syn.py +0 -89
- mlquantify/methods/aggregative/mixtureModels/hdy.py +0 -46
- mlquantify/methods/aggregative/mixtureModels/smm.py +0 -27
- mlquantify/methods/aggregative/mixtureModels/sord.py +0 -77
- mlquantify/methods/aggregative/pcc.py +0 -33
- mlquantify/methods/aggregative/pwk.py +0 -38
- mlquantify/methods/meta/__init__.py +0 -1
- mlquantify/methods/meta/ensemble.py +0 -236
- mlquantify/methods/non_aggregative/__init__.py +0 -1
- mlquantify/methods/non_aggregative/hdx.py +0 -71
- mlquantify/plots/__init__.py +0 -2
- mlquantify/plots/distribution_plot.py +0 -109
- mlquantify/plots/protocol_plot.py +0 -193
- mlquantify/utils/general_purposes/__init__.py +0 -8
- mlquantify/utils/general_purposes/convert_col_to_array.py +0 -13
- mlquantify/utils/general_purposes/generate_artificial_indexes.py +0 -29
- mlquantify/utils/general_purposes/get_real_prev.py +0 -9
- mlquantify/utils/general_purposes/load_quantifier.py +0 -4
- mlquantify/utils/general_purposes/make_prevs.py +0 -23
- mlquantify/utils/general_purposes/normalize.py +0 -20
- mlquantify/utils/general_purposes/parallel.py +0 -10
- mlquantify/utils/general_purposes/round_protocol_df.py +0 -14
- mlquantify/utils/method_purposes/__init__.py +0 -6
- mlquantify/utils/method_purposes/distances.py +0 -21
- mlquantify/utils/method_purposes/getHist.py +0 -13
- mlquantify/utils/method_purposes/get_scores.py +0 -33
- mlquantify/utils/method_purposes/moss.py +0 -16
- mlquantify/utils/method_purposes/ternary_search.py +0 -14
- mlquantify/utils/method_purposes/tprfpr.py +0 -42
- mlquantify-0.0.11.2.dist-info/RECORD +0 -73
- {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/top_level.txt +0 -0
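The file list above reflects a flattening of the package layout in 0.1.1: the per-class modules under mlquantify/methods/aggregative/ (including its mixtureModels/ and ThreholdOptm/ sub-packages), the per-metric files under mlquantify/evaluation/measures/, and the utils sub-packages are consolidated into single modules (methods/aggregative.py, methods/mixture_models.py, methods/threshold_optimization.py, evaluation/measures.py, evaluation/protocol.py, utils/general.py, utils/method.py). A rough before/after import sketch follows; the 0.1.1 paths are taken from the new model_selection.py in this diff, while the 0.0.11.2 paths are only inferred from the deleted files listed above and are not verified against the old release:

# 0.0.11.2 layout (inferred from the deleted files above; hypothetical re-export paths):
#   from mlquantify.methods.aggregative.mixtureModels.dys import DyS
#   from mlquantify.evaluation.protocol.app import APP
# 0.1.1 layout (as imported by the new model_selection.py and its docstring example):
from mlquantify.methods.aggregative import DyS
from mlquantify.evaluation.protocol import APP, NPP
from mlquantify.utils.general import parallel, get_measure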
mlquantify/model_selection.py
CHANGED
@@ -1,232 +1,377 @@
-from .base import Quantifier
-from typing import Union, List
-import itertools
-from tqdm import tqdm
-import signal
-from copy import deepcopy
-import numpy as np
-from sklearn.model_selection import train_test_split
-from .utils import parallel
-from .evaluation import
-
-class GridSearchQ(Quantifier):
-    """
 [removed lines 14-232 (the old GridSearchQ docstring and implementation) were not captured in this view]
+from .base import Quantifier
+from typing import Union, List
+import itertools
+from tqdm import tqdm
+import signal
+from copy import deepcopy
+import numpy as np
+from sklearn.model_selection import train_test_split
+from .utils.general import parallel, get_measure
+from .evaluation.protocol import APP, NPP
+
+class GridSearchQ(Quantifier):
+    """Hyperparameter optimization for quantification models using grid search.
+
+    GridSearchQ allows hyperparameter tuning for quantification models
+    by minimizing a quantification-oriented loss over a parameter grid.
+    This method evaluates hyperparameter configurations using quantification
+    metrics rather than standard classification metrics, ensuring better
+    approximation of class distributions.
+
+    Parameters
+    ----------
+    model : Quantifier
+        The base quantification model to optimize.
+
+    param_grid : dict
+        Dictionary where keys are parameter names (str) and values are
+        lists of parameter settings to try.
+
+    protocol : str, default='app'
+        The quantification protocol to use. Supported options are:
+        - 'app': Artificial Prevalence Protocol.
+        - 'npp': Natural Prevalence Protocol.
+
+    n_prevs : int, default=None
+        Number of prevalence points to generate for APP.
+
+    n_repetitions : int, default=1
+        Number of repetitions to perform for NPP.
+
+    scoring : Union[List[str], str], default='mae'
+        Metric or metrics to evaluate the model's performance. Can be
+        a string (e.g., 'mae') or a list of metrics.
+
+    refit : bool, default=True
+        If True, refit the model using the best found hyperparameters
+        on the entire dataset.
+
+    val_split : float, default=0.4
+        Proportion of the training data to use for validation. Only
+        applicable if cross-validation is not used.
+
+    n_jobs : int, default=1
+        The number of jobs to run in parallel. -1 means using all processors.
+
+    random_seed : int, default=42
+        Random seed for reproducibility.
+
+    timeout : int, default=-1
+        Maximum time (in seconds) allowed for a single parameter combination.
+        A value of -1 disables the timeout.
+
+    verbose : bool, default=False
+        If True, print progress messages during grid search.
+
+    Attributes
+    ----------
+    best_params : dict
+        The parameter setting that gave the best results on the validation set.
+
+    best_score : float
+        The best score achieved during the grid search.
+
+    results : pandas.DataFrame
+        A DataFrame containing details of all evaluations, including parameters,
+        scores, and execution times.
+
+    References
+    ----------
+    The idea of using grid search for hyperparameter optimization in
+    quantification models was discussed in:
+    Moreo, Alejandro; Sebastiani, Fabrizio. "Re-assessing the 'Classify and Count'
+    Quantification Method". In: Advances in Information Retrieval:
+    43rd European Conference on IR Research, ECIR 2021, Virtual Event,
+    March 28–April 1, 2021, Proceedings, Part II. Springer International Publishing,
+    2021, pp. 75–91. [Link](https://link.springer.com/chapter/10.1007/978-3-030-72240-1_6).
+
+    Examples
+    --------
+    >>> from mlquantify.methods.aggregative import DyS
+    >>> from mlquantify.model_selection import GridSearchQ
+    >>> from sklearn.ensemble import RandomForestClassifier
+    >>> from sklearn.datasets import load_breast_cancer
+    >>> from sklearn.model_selection import train_test_split
+    >>>
+    >>> # Loading dataset from sklearn
+    >>> features, target = load_breast_cancer(return_X_y=True)
+    >>>
+    >>> # Splitting into train and test
+    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
+    >>>
+    >>> model = DyS(RandomForestClassifier())
+    >>>
+    >>> # Creating the hyperparameter grid
+    >>> param_grid = {
+    >>>     'learner__n_estimators': [100, 500, 1000],
+    >>>     'learner__criterion': ["gini", "entropy"],
+    >>>     'measure': ["topsoe", "hellinger"]
+    >>> }
+    >>>
+    >>> gs = GridSearchQ(
+    ...     model=model,
+    ...     param_grid=param_grid,
+    ...     protocol='app',  # Default
+    ...     n_prevs=100,     # Default
+    ...     scoring='nae',
+    ...     refit=True,      # Default
+    ...     val_split=0.3,
+    ...     n_jobs=-1,
+    ...     verbose=True)
+    >>>
+    >>> gs.fit(X_train, y_train)
+    [GridSearchQ]: Optimization complete. Best score: 0.0060630241297973545, with parameters: {'learner__n_estimators': 500, 'learner__criterion': 'entropy', 'measure': 'topsoe'}.
+    >>> predictions = gs.predict(X_test)
+    >>> predictions
+    {0: 0.4182508973311534, 1: 0.5817491026688466}
+    """
+
+
+    def __init__(self,
+                 model: Quantifier,
+                 param_grid: dict,
+                 protocol: str = 'app',
+                 n_prevs: int = 100,
+                 n_repetitions: int = 1,
+                 scoring: Union[List[str], str] = "ae",
+                 refit: bool = True,
+                 val_split: float = 0.4,
+                 n_jobs: int = 1,
+                 random_seed: int = 42,
+                 timeout: int = -1,
+                 verbose: bool = False):
+
+        self.model = model
+        self.param_grid = param_grid
+        self.protocol = protocol.lower()
+        self.n_prevs = n_prevs
+        self.n_repetitions = n_repetitions
+        self.refit = refit
+        self.val_split = val_split
+        self.n_jobs = n_jobs
+        self.random_seed = random_seed
+        self.timeout = timeout
+        self.verbose = verbose
+        self.scoring = [get_measure(measure) for measure in (scoring if isinstance(scoring, list) else [scoring])]
+
+        assert self.protocol in {'app', 'npp'}, 'Unknown protocol; valid ones are "app" or "npp".'
+
+        if self.protocol == 'npp' and self.n_repetitions <= 1:
+            raise ValueError('For "npp" protocol, n_repetitions must be greater than 1.')
+
+    def sout(self, msg):
+        """Prints messages if verbose is True."""
+        if self.verbose:
+            print(f'[{self.__class__.__name__}]: {msg}')
+
+    def __get_protocol(self, model, sample_size):
+        """Get the appropriate protocol instance.
+
+        Parameters
+        ----------
+        model : Quantifier
+            The quantification model.
+
+        sample_size : int
+            The sample size for batch processing.
+
+        Returns
+        -------
+        object
+            Instance of APP or NPP protocol, depending on the configured protocol.
+        """
+        protocol_params = {
+            'models': model,
+            'batch_size': sample_size,
+            'n_iterations': self.n_repetitions,
+            'n_jobs': self.n_jobs,
+            'verbose': False,
+            'random_state': 35,
+            'return_type': "predictions"
+        }
+        return APP(n_prevs=self.n_prevs, **protocol_params) if self.protocol == 'app' else NPP(**protocol_params)
+
+    def fit(self, X, y):
+        """Fit the quantifier model and perform grid search.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training features, where `n_samples` is the number of samples
+            and `n_features` is the number of features.
+
+        y : array-like of shape (n_samples,)
+            Training labels.
+
+        Returns
+        -------
+        self : GridSearchQ
+            Returns the fitted instance of GridSearchQ.
+        """
+        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=self.val_split, random_state=self.random_seed)
+        param_combinations = list(itertools.product(*self.param_grid.values()))
+        best_score, best_params = None, None
+
+        if self.timeout > 0:
+            signal.signal(signal.SIGALRM, self._timeout_handler)
+
+        def evaluate_combination(params):
+            """Evaluate a single combination of hyperparameters.
+
+            Parameters
+            ----------
+            params : tuple
+                A tuple of hyperparameter values.
+
+            Returns
+            -------
+            float or None
+                The evaluation score, or None if a timeout occurred.
+            """
+            if self.verbose:
+                print(f"\tEvaluating combination: {str(params)}")
+
+            model = deepcopy(self.model)
+            model.set_params(**dict(zip(self.param_grid.keys(), params)))
+            protocol_instance = self.__get_protocol(model, len(y_train))
+
+            try:
+                if self.timeout > 0:
+                    signal.alarm(self.timeout)
+
+                protocol_instance.fit(X_train, y_train)
+                _, real_prevs, pred_prevs = protocol_instance.predict(X_val, y_val)
+                scores = [np.mean([measure(rp, pp) for rp, pp in zip(real_prevs, pred_prevs)]) for measure in self.scoring]
+
+                if self.timeout > 0:
+                    signal.alarm(0)
+
+                if self.verbose:
+                    print(f"\t\\--Finished evaluation: {str(params)}")
+
+                return np.mean(scores) if scores else None
+            except TimeoutError:
+                self.sout(f'Timeout reached for combination: {params}.')
+                return None
+
+        results = parallel(
+            evaluate_combination,
+            tqdm(param_combinations, desc="Evaluating combinations", total=len(param_combinations)) if self.verbose else param_combinations,
+            n_jobs=self.n_jobs
+        )
+
+        for score, params in zip(results, param_combinations):
+            if score is not None and (best_score is None or score < best_score):
+                best_score, best_params = score, params
+
+        self.best_score = best_score
+        self.best_params = dict(zip(self.param_grid.keys(), best_params))
+        self.sout(f'Optimization complete. Best score: {self.best_score}, with parameters: {self.best_params}.')
+
+        if self.refit and self.best_params:
+            self.model.set_params(**self.best_params)
+            self.model.fit(X, y)
+            self.best_model_ = self.model
+
+        return self
+
+    def predict(self, X):
+        """Make predictions using the best found model.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Data to predict on.
+
+        Returns
+        -------
+        array-like
+            Predictions for the input data.
+
+        Raises
+        ------
+        RuntimeError
+            If the model has not been fitted yet.
+        """
+        if not hasattr(self, 'best_model_'):
+            raise RuntimeError("The model has not been fitted yet.")
+        return self.best_model_.predict(X)
+
+    @property
+    def classes_(self):
+        """Get the classes of the best model.
+
+        Returns
+        -------
+        array-like
+            The classes learned by the best model.
+        """
+        return self.best_model_.classes_
+
+    def set_params(self, **parameters):
+        """Set the hyperparameters for grid search.
+
+        Parameters
+        ----------
+        parameters : dict
+            Dictionary of hyperparameters to set.
+        """
+        self.param_grid = parameters
+
+    def get_params(self, deep=True):
+        """Get the parameters of the best model.
+
+        Parameters
+        ----------
+        deep : bool, optional, default=True
+            If True, will return the parameters for this estimator and
+            contained subobjects.
+
+        Returns
+        -------
+        dict
+            Parameters of the best model.
+
+        Raises
+        ------
+        ValueError
+            If called before the model has been fitted.
+        """
+        if hasattr(self, 'best_model_'):
+            return self.best_model_.get_params()
+        raise ValueError('get_params called before fit.')
+
+    def best_model(self):
+        """Return the best model after fitting.
+
+        Returns
+        -------
+        Quantifier
+            The best fitted model.
+
+        Raises
+        ------
+        ValueError
+            If called before fitting.
+        """
+        if hasattr(self, 'best_model_'):
+            return self.best_model_
+        raise ValueError('best_model called before fit.')
+
+    def _timeout_handler(self, signum, frame):
+        """Handle timeouts during evaluation.
+
+        Parameters
+        ----------
+        signum : int
+            Signal number.
+
+        frame : object
+            Current stack frame.
+
+        Raises
+        ------
+        TimeoutError
+            Raised when the timeout is reached.
+        """
+        raise TimeoutError
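For reference, a minimal usage sketch of the new GridSearchQ with the 'npp' protocol, complementing the APP doctest in the docstring above. It is based only on the constructor and docstring shown in this diff (the DyS/RandomForestClassifier pairing, the 'nae' scoring name, and the breast-cancer data come from there); note that the constructor above raises a ValueError for protocol='npp' unless n_repetitions is greater than 1:

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from mlquantify.methods.aggregative import DyS
from mlquantify.model_selection import GridSearchQ

# Placeholder data, as in the docstring example above.
features, target = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)

gs = GridSearchQ(
    model=DyS(RandomForestClassifier()),
    param_grid={'measure': ['topsoe', 'hellinger']},
    protocol='npp',        # Natural Prevalence Protocol
    n_repetitions=10,      # must be > 1 when protocol='npp'
    scoring='nae',         # resolved internally via get_measure
    val_split=0.3,
    n_jobs=-1,
)
gs.fit(X_train, y_train)
prevalences = gs.predict(X_test)  # dict of class -> estimated prevalence, as in the doctest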