mlquantify 0.0.11__tar.gz → 0.0.11.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mlquantify-0.0.11 → mlquantify-0.0.11.1}/PKG-INFO +4 -16
- {mlquantify-0.0.11 → mlquantify-0.0.11.1}/README.md +3 -7
- mlquantify-0.0.11.1/mlquantify/__init__.py +6 -0
- {mlquantify-0.0.11 → mlquantify-0.0.11.1}/mlquantify/base.py +2 -1
- mlquantify-0.0.11.1/mlquantify/classification/pwkclf.py +73 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/measures/__init__.py +26 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/measures/ae.py +11 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/measures/bias.py +16 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/measures/kld.py +8 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/measures/mse.py +12 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/measures/nae.py +16 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/measures/nkld.py +13 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/measures/nrae.py +16 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/measures/rae.py +12 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/measures/se.py +12 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/protocol/_Protocol.py +202 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/protocol/__init__.py +2 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/protocol/app.py +146 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/protocol/npp.py +34 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/ThreholdOptm/_ThreholdOptimization.py +62 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/ThreholdOptm/__init__.py +7 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/ThreholdOptm/acc.py +27 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/ThreholdOptm/max.py +23 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/ThreholdOptm/ms.py +21 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/ThreholdOptm/ms2.py +25 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/ThreholdOptm/pacc.py +41 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/ThreholdOptm/t50.py +21 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/ThreholdOptm/x.py +23 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/__init__.py +9 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/cc.py +32 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/emq.py +86 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/fm.py +72 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/gac.py +96 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/gpac.py +87 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/mixtureModels/_MixtureModel.py +81 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/mixtureModels/__init__.py +5 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/mixtureModels/dys.py +55 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/mixtureModels/dys_syn.py +89 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/mixtureModels/hdy.py +46 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/mixtureModels/smm.py +27 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/mixtureModels/sord.py +77 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/pcc.py +33 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/pwk.py +38 -0
- mlquantify-0.0.11.1/mlquantify/methods/meta/__init__.py +1 -0
- mlquantify-0.0.11.1/mlquantify/methods/meta/ensemble.py +236 -0
- mlquantify-0.0.11.1/mlquantify/methods/non_aggregative/__init__.py +1 -0
- mlquantify-0.0.11.1/mlquantify/methods/non_aggregative/hdx.py +71 -0
- mlquantify-0.0.11.1/mlquantify/plots/distribution_plot.py +109 -0
- mlquantify-0.0.11.1/mlquantify/plots/protocol_plot.py +157 -0
- mlquantify-0.0.11.1/mlquantify/utils/general_purposes/__init__.py +8 -0
- mlquantify-0.0.11.1/mlquantify/utils/general_purposes/convert_col_to_array.py +13 -0
- mlquantify-0.0.11.1/mlquantify/utils/general_purposes/generate_artificial_indexes.py +29 -0
- mlquantify-0.0.11.1/mlquantify/utils/general_purposes/get_real_prev.py +9 -0
- mlquantify-0.0.11.1/mlquantify/utils/general_purposes/load_quantifier.py +4 -0
- mlquantify-0.0.11.1/mlquantify/utils/general_purposes/make_prevs.py +23 -0
- mlquantify-0.0.11.1/mlquantify/utils/general_purposes/normalize.py +20 -0
- mlquantify-0.0.11.1/mlquantify/utils/general_purposes/parallel.py +10 -0
- mlquantify-0.0.11.1/mlquantify/utils/general_purposes/round_protocol_df.py +14 -0
- mlquantify-0.0.11.1/mlquantify/utils/method_purposes/__init__.py +6 -0
- mlquantify-0.0.11.1/mlquantify/utils/method_purposes/distances.py +21 -0
- mlquantify-0.0.11.1/mlquantify/utils/method_purposes/getHist.py +13 -0
- mlquantify-0.0.11.1/mlquantify/utils/method_purposes/get_scores.py +33 -0
- mlquantify-0.0.11.1/mlquantify/utils/method_purposes/moss.py +16 -0
- mlquantify-0.0.11.1/mlquantify/utils/method_purposes/ternary_search.py +14 -0
- mlquantify-0.0.11.1/mlquantify/utils/method_purposes/tprfpr.py +42 -0
- {mlquantify-0.0.11 → mlquantify-0.0.11.1}/mlquantify.egg-info/PKG-INFO +4 -16
- mlquantify-0.0.11.1/mlquantify.egg-info/SOURCES.txt +76 -0
- {mlquantify-0.0.11 → mlquantify-0.0.11.1}/setup.py +1 -1
- mlquantify-0.0.11/MANIFEST.in +0 -4
- mlquantify-0.0.11/mlquantify.egg-info/SOURCES.txt +0 -15
- {mlquantify-0.0.11 → mlquantify-0.0.11.1}/mlquantify/classification/__init__.py +0 -0
- {mlquantify-0.0.11 → mlquantify-0.0.11.1}/mlquantify/evaluation/__init__.py +0 -0
- {mlquantify-0.0.11 → mlquantify-0.0.11.1}/mlquantify/methods/__init__.py +0 -0
- {mlquantify-0.0.11 → mlquantify-0.0.11.1}/mlquantify/model_selection.py +0 -0
- {mlquantify-0.0.11 → mlquantify-0.0.11.1}/mlquantify/plots/__init__.py +0 -0
- {mlquantify-0.0.11 → mlquantify-0.0.11.1}/mlquantify/utils/__init__.py +0 -0
- {mlquantify-0.0.11 → mlquantify-0.0.11.1}/mlquantify.egg-info/dependency_links.txt +0 -0
- {mlquantify-0.0.11 → mlquantify-0.0.11.1}/mlquantify.egg-info/requires.txt +0 -0
- {mlquantify-0.0.11 → mlquantify-0.0.11.1}/mlquantify.egg-info/top_level.txt +0 -0
- {mlquantify-0.0.11 → mlquantify-0.0.11.1}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: mlquantify
|
|
3
|
-
Version: 0.0.11
|
|
3
|
+
Version: 0.0.11.1
|
|
4
4
|
Summary: Quantification Library
|
|
5
5
|
Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
|
|
6
6
|
Maintainer: Luiz Fernando Luth Junior
|
|
@@ -12,14 +12,6 @@ Classifier: Operating System :: Unix
|
|
|
12
12
|
Classifier: Operating System :: MacOS :: MacOS X
|
|
13
13
|
Classifier: Operating System :: Microsoft :: Windows
|
|
14
14
|
Description-Content-Type: text/markdown
|
|
15
|
-
Requires-Dist: scikit-learn
|
|
16
|
-
Requires-Dist: numpy
|
|
17
|
-
Requires-Dist: scipy
|
|
18
|
-
Requires-Dist: joblib
|
|
19
|
-
Requires-Dist: tqdm
|
|
20
|
-
Requires-Dist: pandas
|
|
21
|
-
Requires-Dist: xlrd
|
|
22
|
-
Requires-Dist: matplotlib
|
|
23
15
|
|
|
24
16
|
<h1 align="center">MLQuantify</h1>
|
|
25
17
|
<h4 align="center">A Python Package for Quantification</h4>
|
|
@@ -114,13 +106,9 @@ ___
|
|
|
114
106
|
##### API is available [here](#)
|
|
115
107
|
|
|
116
108
|
- [Methods](https://github.com/luizfernandolj/mlquantify/wiki/Methods)
|
|
117
|
-
- [Model Selection](
|
|
118
|
-
- [Evaluation](
|
|
119
|
-
- [Plotting](
|
|
109
|
+
- [Model Selection](https://github.com/luizfernandolj/mlquantify/wiki/Model-Selection)
|
|
110
|
+
- [Evaluation](https://github.com/luizfernandolj/mlquantify/wiki/Evaluation)
|
|
111
|
+
- [Plotting](https://github.com/luizfernandolj/mlquantify/wiki/Plotting)
|
|
120
112
|
|
|
121
113
|
|
|
122
114
|
___
|
|
123
|
-
|
|
124
|
-
### See the References in the pdf below
|
|
125
|
-
|
|
126
|
-
...
|
|
@@ -91,13 +91,9 @@ ___
|
|
|
91
91
|
##### API is available [here](#)
|
|
92
92
|
|
|
93
93
|
- [Methods](https://github.com/luizfernandolj/mlquantify/wiki/Methods)
|
|
94
|
-
- [Model Selection](
|
|
95
|
-
- [Evaluation](
|
|
96
|
-
- [Plotting](
|
|
94
|
+
- [Model Selection](https://github.com/luizfernandolj/mlquantify/wiki/Model-Selection)
|
|
95
|
+
- [Evaluation](https://github.com/luizfernandolj/mlquantify/wiki/Evaluation)
|
|
96
|
+
- [Plotting](https://github.com/luizfernandolj/mlquantify/wiki/Plotting)
|
|
97
97
|
|
|
98
98
|
|
|
99
99
|
___
|
|
100
|
-
|
|
101
|
-
### See the References in the pdf below
|
|
102
|
-
|
|
103
|
-
...
|
|
@@ -138,13 +138,14 @@ class AggregativeQuantifier(Quantifier, ABC):
|
|
|
138
138
|
return self.learner.get_params()
|
|
139
139
|
|
|
140
140
|
    def set_params(self, **params):
        """Set parameters on the quantifier and, optionally, on its learner.

        Keys that match attributes of this quantifier are assigned directly.
        Keys containing the ``learner__`` prefix are stripped of the prefix
        and forwarded to the wrapped learner's own ``set_params``.
        """

        # Model Params
        for key, value in params.items():
            if hasattr(self, key):
                setattr(self, key, value)

        # Learner Params
        # Explicit None check: a learner object that is falsy (e.g. an empty
        # container-like estimator) would otherwise be skipped.
        if self.learner is not None:
            learner_params = {k.replace('learner__', ''): v for k, v in params.items() if 'learner__' in k}
            if learner_params:
                self.learner.set_params(**learner_params)
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from sklearn.neighbors import NearestNeighbors
from sklearn.base import BaseEstimator
import numpy as np


class PWKCLF(BaseEstimator):
    """Learner based on k-Nearest Neighbors (KNN) for use with the PWK
    quantification method, which is itself KNN-based.

    Neighbors vote with class-dependent weights that down-weight majority
    classes: ``w_c = (count_c / min_count) ** (-1 / alpha)``.
    """

    def __init__(self,
                 alpha=1,
                 n_neighbors=10,
                 algorithm="auto",
                 metric="euclidean",
                 leaf_size=30,
                 p=2,
                 metric_params=None,
                 n_jobs=None):
        # alpha >= 1 is required: smaller values would invert the
        # minority-favoring weighting scheme.
        if alpha < 1:
            raise ValueError("alpha must not be smaller than 1")

        self.alpha = alpha
        self.n_neighbors = n_neighbors

        self.nbrs = NearestNeighbors(n_neighbors=n_neighbors,
                                     algorithm=algorithm,
                                     leaf_size=leaf_size,
                                     metric=metric,
                                     p=p,
                                     metric_params=metric_params,
                                     n_jobs=n_jobs)

        # Fitted state: sorted class labels, label -> column index,
        # per-class vote weights, and the positional training labels.
        self.Y = None
        self.Y_map = None
        self.w = None
        self.y = None

    def fit(self, X, y):
        """Fit the neighbor index and compute per-class vote weights.

        Args:
            X (array-like): Training features, shape (n_samples, n_features).
            y (array-like): Training labels.

        Returns:
            PWKCLF: self, fitted.
        """
        n_samples = X.shape[0]
        # Cannot request more neighbors than there are training points.
        if n_samples < self.n_neighbors:
            self.nbrs.set_params(n_neighbors=n_samples)

        # Store labels as a flat numpy array so that positional indexing in
        # predict() works for lists, numpy arrays, pandas Series and
        # single-column DataFrames alike. (The original only reset the index
        # of DataFrames; a Series with a non-default index, or integer
        # indexing into a DataFrame — which selects a *column* — would break.)
        self.y = np.asarray(y).ravel()

        Y_cts = np.unique(self.y, return_counts=True)
        self.Y = Y_cts[0]
        self.Y_map = dict(zip(self.Y, range(len(self.Y))))

        # Down-weight classes proportionally to how much larger they are
        # than the smallest class, tempered by alpha.
        min_class_count = np.min(Y_cts[1])
        self.w = (Y_cts[1] / min_class_count) ** (-1.0 / self.alpha)
        self.nbrs.fit(X)
        return self

    def predict(self, X):
        """Predict class labels by weighted neighbor voting.

        Args:
            X (array-like): Query features, shape (n_samples, n_features).

        Returns:
            np.ndarray: Predicted class label for each query sample.
        """
        n_samples = X.shape[0]
        nn_indices = self.nbrs.kneighbors(X, return_distance=False)

        # CM[i, c] counts the neighbors of sample i belonging to class c.
        CM = np.zeros((n_samples, len(self.Y)))

        for i in range(n_samples):
            for j in nn_indices[i]:
                CM[i, self.Y_map[self.y[j]]] += 1

        # Re-weight the raw votes, then pick the best class per row.
        # np.argmax(axis=1) is equivalent to the original
        # np.apply_along_axis(np.argmax, ...) but vectorized.
        CM = np.multiply(CM, self.w)
        predictions = np.argmax(CM, axis=1)

        return self.Y[predictions]
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from .ae import absolute_error
from .kld import kullback_leibler_divergence
from .nkld import normalized_kullback_leibler_divergence
from .rae import relative_absolute_error
from .nae import normalized_absolute_error
from .bias import bias
from .nrae import normalized_relative_absolute_error
from .se import squared_error
from .mse import mean_squared_error


# Registry mapping short measure codes to their implementations.
# NOTE(review): `bias` is imported above but not registered here — confirm
# whether that is intentional (it returns per-class values, not a scalar).
MEASURES = {
    "ae": absolute_error,
    "nae": normalized_absolute_error,
    "kld": kullback_leibler_divergence,
    "nkld": normalized_kullback_leibler_divergence,
    "nrae": normalized_relative_absolute_error,
    "rae": relative_absolute_error,
    "se": squared_error,
    "mse": mean_squared_error
}


def get_measure(measure: str):
    """Return the measure function registered under *measure*.

    Returns None when the code is not in MEASURES (dict.get semantics).
    """
    return MEASURES.get(measure)
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from typing import Union

import numpy as np


def absolute_error(prev_real: Union[dict, np.ndarray],
                   prev_pred: Union[dict, np.ndarray]) -> Union[float, np.ndarray]:
    """Mean absolute error between real and predicted prevalences.

    Dict inputs ({class: prevalence}) are converted to arrays of their
    values before computing.

    Args:
        prev_real: True prevalence values (dict or array-like).
        prev_pred: Predicted prevalence values (dict or array-like).

    Returns:
        Mean of |prev_pred - prev_real| over the last axis.
    """
    # `np.any` was previously used as the annotation; it is a reduction
    # function, not a type, so the signature now uses a real Union type.
    if isinstance(prev_real, dict):
        prev_real = np.asarray(list(prev_real.values()))
    if isinstance(prev_pred, dict):
        prev_pred = np.asarray(list(prev_pred.values()))

    abs_error = abs(prev_pred - prev_real).mean(axis=-1)

    return abs_error
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from typing import Union

import numpy as np


def bias(prev_real: Union[dict, np.ndarray],
         prev_pred: Union[dict, np.ndarray]) -> Union[dict, np.ndarray]:
    """Per-class absolute deviation between real and predicted prevalences.

    NOTE(review): despite the name, this returns the *unsigned* deviation
    |pred - real| per class, not a signed bias — confirm intended semantics.

    Args:
        prev_real: True prevalence values (dict or array-like).
        prev_pred: Predicted prevalence values (dict or array-like).

    Returns:
        A {class: deviation} dict when prev_real was a dict, otherwise an
        array of per-class deviations.
    """
    classes = None
    if isinstance(prev_real, dict):
        # Remember class labels so the result can be keyed the same way.
        classes = prev_real.keys()
        prev_real = np.asarray(list(prev_real.values()))
    if isinstance(prev_pred, dict):
        prev_pred = np.asarray(list(prev_pred.values()))

    abs_errors = abs(prev_pred - prev_real)

    if classes:
        return {class_: abs_error for class_, abs_error in zip(classes, abs_errors)}

    return abs_errors
@@ -0,0 +1,8 @@
|
|
|
1
|
+
from typing import Union

import numpy as np


def kullback_leibler_divergence(prev_real: Union[dict, np.ndarray],
                                prev_pred: Union[dict, np.ndarray]) -> np.ndarray:
    """Elementwise KL-divergence terms between real and predicted prevalences.

    Returns the per-class terms ``real * |log(real / pred)|`` (not summed).

    NOTE(review): the absolute value around the log is non-standard for KL
    divergence, and a zero predicted prevalence divides by zero — callers
    are expected to smooth first; confirm both against intended usage.

    Args:
        prev_real: True prevalence values (dict or array-like).
        prev_pred: Predicted prevalence values (dict or array-like).
    """
    # Signature fixed: `np.any` is a reduction function, not a type.
    if isinstance(prev_real, dict):
        prev_real = np.asarray(list(prev_real.values()))
    if isinstance(prev_pred, dict):
        prev_pred = np.asarray(list(prev_pred.values()))
    return prev_real * abs(np.log((prev_real / prev_pred)))
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from typing import Union

import numpy as np
from .se import squared_error


def mean_squared_error(prev_real: Union[dict, np.ndarray],
                       prev_pred: Union[dict, np.ndarray]) -> float:
    """Mean squared error between real and predicted prevalences.

    Delegates to squared_error (which already averages over the last
    axis) and then averages over any remaining axes.

    Args:
        prev_real: True prevalence values (dict or array-like).
        prev_pred: Predicted prevalence values (dict or array-like).
    """
    # Signature fixed: `np.any` is a reduction function, not a type.
    if isinstance(prev_real, dict):
        prev_real = np.asarray(list(prev_real.values()))
    if isinstance(prev_pred, dict):
        prev_pred = np.asarray(list(prev_pred.values()))

    mean_sq_error = squared_error(prev_real, prev_pred).mean()

    return mean_sq_error
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from typing import Union

import numpy as np
from .ae import absolute_error


def normalized_absolute_error(prev_real: Union[dict, np.ndarray],
                              prev_pred: Union[dict, np.ndarray]) -> float:
    """Absolute error normalized by its maximum attainable value.

    The normalizer is ``2 * (1 - min(prev_real))``, the largest possible
    mean absolute error for the given true prevalence vector.

    NOTE(review): when one class has prevalence 1 the normalizer is zero
    and this divides by zero — confirm callers smooth prevalences first.

    Args:
        prev_real: True prevalence values (dict or array-like).
        prev_pred: Predicted prevalence values (dict or array-like).
    """
    # Signature fixed: `np.any` is a reduction function, not a type.
    if isinstance(prev_real, dict):
        prev_real = np.asarray(list(prev_real.values()))
    if isinstance(prev_pred, dict):
        prev_pred = np.asarray(list(prev_pred.values()))

    abs_error = absolute_error(prev_real, prev_pred)

    z_abs_error = (2 * (1 - min(prev_real)))

    normalized = abs_error / z_abs_error

    return normalized
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from typing import Union

import numpy as np
from .kld import kullback_leibler_divergence


def normalized_kullback_leibler_divergence(prev_real: Union[dict, np.ndarray],
                                           prev_pred: Union[dict, np.ndarray]) -> np.ndarray:
    """KL divergence mapped to [0, 1) via the logistic-style transform
    ``2 * e^kld / (e^kld + 1) - 1``.

    Args:
        prev_real: True prevalence values (dict or array-like).
        prev_pred: Predicted prevalence values (dict or array-like).
    """
    # Signature fixed: `np.any` is a reduction function, not a type.
    if isinstance(prev_real, dict):
        prev_real = np.asarray(list(prev_real.values()))
    if isinstance(prev_pred, dict):
        prev_pred = np.asarray(list(prev_pred.values()))

    euler = np.exp(kullback_leibler_divergence(prev_real, prev_pred))
    normalized = 2 * (euler / (euler + 1)) - 1

    return normalized
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from typing import Union

import numpy as np
from .rae import relative_absolute_error


def normalized_relative_absolute_error(prev_real: Union[dict, np.ndarray],
                                       prev_pred: Union[dict, np.ndarray]) -> float:
    """Relative absolute error normalized by its maximum attainable value
    for the given true prevalence vector.

    NOTE(review): the normalizer divides by ``min(prev_real)``, so a zero
    true prevalence divides by zero — confirm callers smooth first.

    Args:
        prev_real: True prevalence values (dict or array-like).
        prev_pred: Predicted prevalence values (dict or array-like).
    """
    # Signature fixed: `np.any` is a reduction function, not a type.
    if isinstance(prev_real, dict):
        prev_real = np.asarray(list(prev_real.values()))
    if isinstance(prev_pred, dict):
        prev_pred = np.asarray(list(prev_pred.values()))

    relative = relative_absolute_error(prev_real, prev_pred)

    z_relative = (len(prev_real) - 1 + ((1 - min(prev_real)) / min(prev_real))) / len(prev_real)

    normalized = relative/z_relative

    return normalized
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from typing import Union

import numpy as np
# Import directly from the sibling module (was `from . import ...`), matching
# every other measure module and avoiding a dependency on the package
# __init__ import order.
from .ae import absolute_error


def relative_absolute_error(prev_real: Union[dict, np.ndarray],
                            prev_pred: Union[dict, np.ndarray]) -> float:
    """Absolute error relative to the true prevalences, averaged per class.

    NOTE(review): a zero true prevalence divides by zero — confirm callers
    smooth prevalences first.

    Args:
        prev_real: True prevalence values (dict or array-like).
        prev_pred: Predicted prevalence values (dict or array-like).
    """
    # Signature fixed: `np.any` is a reduction function, not a type.
    if isinstance(prev_real, dict):
        prev_real = np.asarray(list(prev_real.values()))
    if isinstance(prev_pred, dict):
        prev_pred = np.asarray(list(prev_pred.values()))

    relative = (absolute_error(prev_real, prev_pred) / prev_real).mean(axis=-1)

    return relative
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from typing import Union

import numpy as np


def squared_error(prev_real: Union[dict, np.ndarray],
                  prev_pred: Union[dict, np.ndarray]) -> Union[float, np.ndarray]:
    """Mean squared error between real and predicted prevalences.

    Dict inputs ({class: prevalence}) are converted to arrays of their
    values before computing. (The unused `absolute_error` import was
    removed; the computation is done inline.)

    Args:
        prev_real: True prevalence values (dict or array-like).
        prev_pred: Predicted prevalence values (dict or array-like).

    Returns:
        Mean of (prev_pred - prev_real)**2 over the last axis.
    """
    # Signature fixed: `np.any` is a reduction function, not a type.
    if isinstance(prev_real, dict):
        prev_real = np.asarray(list(prev_real.values()))
    if isinstance(prev_pred, dict):
        prev_pred = np.asarray(list(prev_pred.values()))

    sq_abs_error = ((prev_pred - prev_real) ** 2).mean(axis=-1)

    return sq_abs_error
@@ -0,0 +1,202 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
import numpy as np
import pandas as pd
from typing import Union, List
from sklearn.base import BaseEstimator
from time import time
from tqdm import tqdm

from ...methods import get_method, METHODS, AGGREGATIVE, NON_AGGREGATIVE
from ...utils import *
from ..measures import get_measure, MEASURES
from ...base import Quantifier, AggregativeQuantifier


class Protocol(ABC):
    """Base class for implementing different quantification protocols.

    This abstract class provides a structure for creating protocols that involve
    fitting quantification models to training data and generating predictions on test data.
    It supports parallel processing, multiple iterations, and different output formats.

    Args:
        models (Union[List[Union[str, Quantifier]], str, Quantifier]):
            List of quantification models, a single model name, or 'all' for all models.
        batch_size (Union[List[int], int]):
            Size of the batches to be processed, or a list of sizes.
        learner (BaseEstimator, optional):
            Machine learning model to be used with the quantifiers. Required for model methods.
        n_iterations (int, optional):
            Number of iterations for the protocol. Default is 1.
        n_jobs (int, optional):
            Number of jobs to run in parallel. Default is 1.
        random_state (int, optional):
            Seed for random number generation. Default is 32.
        verbose (bool, optional):
            Whether to print progress messages. Default is False.
        return_type (str, optional):
            Type of return value ('predictions' or 'table'). Default is 'predictions'.
        measures (List[str], optional):
            List of error measures to calculate. Must be in MEASURES or None. Default is None.
    """

    def __init__(self,
                 models: Union[List[Union[str, Quantifier]], str, Quantifier],
                 batch_size: Union[List[int], int],
                 learner: BaseEstimator = None,
                 n_iterations: int = 1,
                 n_jobs: int = 1,
                 random_state: int = 32,
                 verbose: bool = False,
                 return_type: str = "predictions",
                 measures: List[str] = None):

        # Validate requested error measures and output format up front.
        assert not measures or all(m in MEASURES for m in measures), \
            f"Invalid measure(s) provided. Valid options: {list(MEASURES.keys())} or None"
        assert return_type in ["predictions", "table"], \
            "Invalid return_type. Valid options: ['predictions', 'table']"

        self.models = self._initialize_models(models, learner)
        self.learner = learner
        self.batch_size = batch_size
        self.n_iterations = n_iterations
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        self.return_type = return_type
        self.measures = measures

    def _initialize_models(self, models, learner):
        """Resolve the `models` argument into a list of Quantifier instances.

        Accepts: a list of instances, a list of method names, a single
        instance, the keywords 'all' / 'aggregative' / 'non_aggregative',
        or a single method name.
        """
        if isinstance(models, list):
            # A list of ready-made quantifiers is used as-is (only the first
            # element is checked; mixed lists are assumed homogeneous).
            if isinstance(models[0], Quantifier):
                return models
            assert learner is not None, "Learner is required for model methods."
            return [get_method(model)(learner) for model in models]
        if isinstance(models, Quantifier):
            return [models]

        assert learner is not None, "Learner is required for model methods."

        if models == "all":
            # NOTE(review): debug print left in — presumably should be removed.
            print(hasattr(list(AGGREGATIVE.values())[0], "learner"))
            # Instantiate every registered method; only pass the learner to
            # methods that declare a `learner` attribute.
            models = [model(learner) if hasattr(model, "learner") else model() for model in METHODS.values()]
            return models
        if models == "aggregative":
            return [model(learner) for model in AGGREGATIVE.values()]
        if models == "non_aggregative":
            return [model() for model in NON_AGGREGATIVE.values()]

        # Fall through: a single method name string.
        return [get_method(models)(learner)]

    def sout(self, msg):
        """Print a progress message when verbose mode is on."""
        if self.verbose:
            print('[APP]' + msg)

    def fit(self, X_train, y_train):
        """Fit all methods into the training data.

        Args:
            X_train (array-like): Features of training.
            y_train (array-like): Labels of training.

        Returns:
            Protocol: self, with all models fitted.
        """
        self.sout("Fitting models")

        # Fit each model in parallel; tqdm wraps the arg generator only
        # in verbose mode.
        args = ((model, X_train, y_train, self.verbose) for model in self.models)
        self.models = parallel(
            self._delayed_fit,
            tqdm(args, desc="Fitting models", total=len(self.models)) if self.verbose else args,
            self.n_jobs)

        self.sout("Fit [Done]")
        return self

    def predict(self, X_test, y_test) -> Union[tuple, pd.DataFrame]:
        """Generate several samples with artificial prevalences, and sizes.
        And for each method, predicts with this sample, aggregating all together
        with a pandas dataframe if requested, or else just the predictions.

        Args:
            X_test (array-like): Features of test.
            y_test (array-like): Labels of test.

        Returns:
            tuple: tuple containing the model, real_prev and pred_prev, or.
            DataFrame: table of results, along with error measures if requested.
        """
        predictions = self.predict_protocol(X_test, y_test)

        predictions_df = pd.DataFrame(predictions)

        if self.return_type == "table":
            predictions_df.columns = ["QUANTIFIER", "REAL_PREVS", "PRED_PREVS", "BATCH_SIZE"]

            if self.measures:

                def smooth(values: np.ndarray) -> np.ndarray:
                    # Laplace-style smoothing so measures that divide by a
                    # prevalence (kld, rae, ...) do not hit zero values.
                    smoothed_factor = 1/(2 * len(X_test))

                    values = (values + smoothed_factor) / (smoothed_factor * len(values) + 1)

                    return values

                # One extra column per requested error measure.
                for metric in self.measures:
                    predictions_df[metric] = predictions_df.apply(
                        lambda row: get_measure(metric)(smooth(row["REAL_PREVS"]), smooth(row["PRED_PREVS"])),
                        axis=1
                    )

            return predictions_df

        # return_type == "predictions": unpack into parallel arrays.
        predictions_array = predictions_df.to_numpy()
        return (
            predictions_array[:, 0],  # Model names
            np.stack(predictions_array[:, 1]),  # Prev
            np.stack(predictions_array[:, 2])  # Prev_pred
        )

    @abstractmethod
    def predict_protocol(self) -> np.ndarray:
        """ Abstract method that every protocol has to implement """
        ...

    @abstractmethod
    def _new_sample(self) -> tuple:
        """ Abstract method of sample extraction for each protocol

        Returns:
            tuple: tuple containing the X_sample and the y_sample
        """
        ...

    @abstractmethod
    def _delayed_predict(self, args) -> tuple:
        """abstract method for predicting in the extracted
        samples, is delayed for running in parallel for
        efficiency purposes.
        """
        ...

    def _delayed_fit(self, args):
        """Fit a single model; unit of work for the parallel fit loop."""
        model, X_train, y_train, verbose = args

        if verbose:
            print(f"\tFitting {model.__class__.__name__}")
            # Timing is only tracked in verbose mode (start/end are both
            # guarded by the same flag).
            start = time()

        model = model.fit(X=X_train, y=y_train)

        if verbose:
            end = time()
            print(f"\t\\--Fit ended for {model.__class__.__name__} in {round(end - start, 3)} seconds")
        return model
@@ -0,0 +1,146 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from typing import Union, List
|
|
4
|
+
from sklearn.base import BaseEstimator
|
|
5
|
+
import itertools
|
|
6
|
+
from tqdm import tqdm
|
|
7
|
+
|
|
8
|
+
from ...utils import generate_artificial_indexes, parallel
|
|
9
|
+
from ...base import Quantifier
|
|
10
|
+
from ._Protocol import Protocol
|
|
11
|
+
|
|
12
|
+
class APP(Protocol):
|
|
13
|
+
"""Artificial Prevalence Protocol. It splits a test into several
|
|
14
|
+
samples varying prevalence and sample size, with n iterations.
|
|
15
|
+
For a list of Quantifiers, it computes training and testing
|
|
16
|
+
for each one and returns either a table of results with error measures
|
|
17
|
+
or just the predictions.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
def __init__(self,
|
|
21
|
+
models: Union[List[Union[str, Quantifier]], str, Quantifier],
|
|
22
|
+
batch_size: Union[List[int], int],
|
|
23
|
+
learner: BaseEstimator = None,
|
|
24
|
+
n_prevs: int = 100,
|
|
25
|
+
n_iterations: int = 1,
|
|
26
|
+
n_jobs: int = 1,
|
|
27
|
+
random_state: int = 32,
|
|
28
|
+
verbose: bool = False,
|
|
29
|
+
return_type: str = "predictions",
|
|
30
|
+
measures: List[str] = None):
|
|
31
|
+
|
|
32
|
+
super().__init__(models, batch_size, learner, n_iterations, n_jobs, random_state, verbose, return_type, measures)
|
|
33
|
+
self.n_prevs = n_prevs
|
|
34
|
+
|
|
35
|
+
def predict_protocol(self, X_test, y_test) -> tuple:
|
|
36
|
+
"""Generates several samples with artificial prevalences and sizes.
|
|
37
|
+
For each model, predicts with this sample, aggregating all together
|
|
38
|
+
with a pandas dataframe if requested, or else just the predictions.
|
|
39
|
+
|
|
40
|
+
Args:
|
|
41
|
+
X_test (array-like): Features of the test set.
|
|
42
|
+
y_test (array-like): Labels of the test set.
|
|
43
|
+
|
|
44
|
+
Returns:
|
|
45
|
+
tuple: predictions containing the model name, real prev, pred prev, and batch size
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
n_dim = len(np.unique(y_test))
|
|
49
|
+
prevs = self._generate_artificial_prevalences(n_dim, self.n_prevs, self.n_iterations)
|
|
50
|
+
|
|
51
|
+
args = self._generate_args(X_test, y_test, prevs)
|
|
52
|
+
batch_size = 1
|
|
53
|
+
|
|
54
|
+
if isinstance(self.batch_size, list):
|
|
55
|
+
batch_size = len(self.batch_size)
|
|
56
|
+
|
|
57
|
+
size = len(prevs) * len(self.models) * batch_size * self.n_iterations
|
|
58
|
+
|
|
59
|
+
predictions = parallel(
|
|
60
|
+
self._delayed_predict,
|
|
61
|
+
tqdm(args, desc="Running APP", total=size) if self.verbose else args,
|
|
62
|
+
n_jobs=self.n_jobs
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
return predictions
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _new_sample(self, X, y, prev: List[float], batch_size: int) -> tuple:
|
|
69
|
+
"""Generates a new sample with a specified prevalence and size.
|
|
70
|
+
|
|
71
|
+
Args:
|
|
72
|
+
X (array-like): Features from which to take the new sample.
|
|
73
|
+
y (array-like): Labels from which to take the new sample.
|
|
74
|
+
prev (List[float]): The specified prevalences.
|
|
75
|
+
batch_size (int): Sample size.
|
|
76
|
+
|
|
77
|
+
Returns:
|
|
78
|
+
tuple: New sample's features and labels.
|
|
79
|
+
"""
|
|
80
|
+
sample_index = generate_artificial_indexes(y, prev, batch_size, np.unique(y))
|
|
81
|
+
return np.take(X, sample_index, axis=0), np.take(y, sample_index, axis=0)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _delayed_predict(self, args) -> tuple:
|
|
86
|
+
"""Method predicts into the new sample, is delayed for running
|
|
87
|
+
in parallel for eficciency purposes
|
|
88
|
+
|
|
89
|
+
Args:
|
|
90
|
+
args (Any): arguments to use
|
|
91
|
+
|
|
92
|
+
Returns:
|
|
93
|
+
tuple: returns the (method name, real_prev, pred_prev and sample_size)
|
|
94
|
+
"""
|
|
95
|
+
|
|
96
|
+
X, y, model, prev, batch_size, verbose = args
|
|
97
|
+
|
|
98
|
+
if verbose:
|
|
99
|
+
print(f'\t {model.__class__.__name__} with {str(batch_size)} instances and prev {str(prev)}')
|
|
100
|
+
|
|
101
|
+
X_sample, _ = self._new_sample(X, y, prev, batch_size)
|
|
102
|
+
prev_pred = np.asarray(list(model.predict(X=X_sample).values()))
|
|
103
|
+
|
|
104
|
+
if verbose:
|
|
105
|
+
print(f'\t \\--Ending {model.__class__.__name__} with {str(batch_size)} instances and prev {str(prev)} \n')
|
|
106
|
+
|
|
107
|
+
return [model.__class__.__name__, prev, prev_pred, batch_size]
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _generate_artificial_prevalences(self, n_dim: int, n_prev: int, n_iter: int) -> np.ndarray:
|
|
113
|
+
"""Generates n artificial prevalences with n dimensions.
|
|
114
|
+
|
|
115
|
+
Args:
|
|
116
|
+
n_dim (int): Number of dimensions for the artificial prevalence.
|
|
117
|
+
n_prev (int): Number of prevalence points to generate.
|
|
118
|
+
n_iter (int): Number of iterations.
|
|
119
|
+
|
|
120
|
+
Returns:
|
|
121
|
+
np.ndarray: Generated artificial prevalences.
|
|
122
|
+
"""
|
|
123
|
+
s = np.linspace(0., 1., n_prev, endpoint=True)
|
|
124
|
+
prevs = np.array([p + (1 - sum(p),) for p in itertools.product(*(s,) * (n_dim - 1)) if sum(p) <= 1])
|
|
125
|
+
|
|
126
|
+
return np.repeat(prevs, n_iter, axis=0) if n_iter > 1 else prevs
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _generate_args(self, X_test, y_test, prevs):
|
|
131
|
+
"""Generates arguments for parallel processing based on the model, prevalence, and batch size.
|
|
132
|
+
|
|
133
|
+
Args:
|
|
134
|
+
X_test (array-like): Features of the test set.
|
|
135
|
+
y_test (array-like): Labels of the test set.
|
|
136
|
+
prevs (np.ndarray): Artificial prevalences generated.
|
|
137
|
+
|
|
138
|
+
Returns:
|
|
139
|
+
List[tuple]: List of arguments for parallel processing.
|
|
140
|
+
"""
|
|
141
|
+
if isinstance(self.batch_size, list):
|
|
142
|
+
return [(X_test, y_test, model, prev, bs, self.verbose)
|
|
143
|
+
for prev in prevs for bs in self.batch_size for model in self.models]
|
|
144
|
+
else:
|
|
145
|
+
return [(X_test, y_test, model, prev, self.batch_size, self.verbose)
|
|
146
|
+
for prev in prevs for model in self.models]
|