mlquantify 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
+ exclude mlquantify/**/*.py
+ include mlquantify/**/__init__.py
+ include mlquantify/base.py
+ include mlquantify/model_selection.py
@@ -0,0 +1,22 @@
+ Metadata-Version: 2.1
+ Name: mlquantify
+ Version: 0.0.1
+ Summary: Quantification Library
+ Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
+ Maintainer: Luiz Fernando Luth Junior
+ Keywords: python,machine learning,quantification,quantify
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Operating System :: Unix
+ Classifier: Operating System :: MacOS :: MacOS X
+ Classifier: Operating System :: Microsoft :: Windows
+ Description-Content-Type: text/markdown
+ Requires-Dist: scikit-learn
+ Requires-Dist: numpy
+ Requires-Dist: scipy
+ Requires-Dist: joblib
+ Requires-Dist: tqdm
+ Requires-Dist: pandas
+ Requires-Dist: xlrd
+ Requires-Dist: matplotlib
@@ -0,0 +1,2 @@
+ # LibQuantifiers
+ Quantification package
@@ -0,0 +1,256 @@
+ from abc import abstractmethod, ABC
+ from sklearn.base import BaseEstimator
+ from copy import deepcopy
+ import numpy as np
+ import joblib
+
+
+ from .utils import parallel, normalize_prevalence
+
+ class Quantifier(ABC, BaseEstimator):
+     """Abstract class for quantifiers."""
+
+     @abstractmethod
+     def fit(self, X, y) -> object: ...
+
+     @abstractmethod
+     def predict(self, X) -> dict: ...
+
+     @property
+     def classes(self) -> list:
+         return self._classes
+
+     @classes.setter
+     def classes(self, classes):
+         self._classes = sorted(list(classes))
+
+     @property
+     def n_class(self) -> int:
+         return len(self._classes)
+
+     @property
+     def multiclass_method(self) -> bool:
+         return True
+
+     @property
+     def binary_data(self) -> bool:
+         return len(self._classes) == 2
+
+
+     def save_quantifier(self, path: str = None) -> None:
+         if not path:
+             path = f"{self.__class__.__name__}.joblib"
+         joblib.dump(self, path)
+
+
+
+ class AggregativeQuantifier(Quantifier, ABC):
+     """Abstract class for all aggregative quantifiers, i.e. methods that use a
+     learner (usually a classifier) to generate the predictions they aggregate.
+     This class also detects whether the problem is binary or multiclass and
+     applies a One-vs-All strategy when the dataset is multiclass but the
+     quantification method itself is not.
+     """
+
+
+     def __init__(self):
+         # Dictionary to hold binary quantifiers for each class.
+         self.binary_quantifiers = {}
+         self.learner_fitted = False
+         self.cv_folds = 10
+
+     def fit(self, X, y, learner_fitted=False, cv_folds: int = 10, n_jobs: int = 1):
+         """Fit the quantifier model.
+
+         Args:
+             X (array-like): Training features.
+             y (array-like): Training labels.
+             learner_fitted (bool, optional): Whether the learner is already fitted. Defaults to False.
+             cv_folds (int, optional): Number of cross-validation folds. Defaults to 10.
+             n_jobs (int, optional): Number of parallel jobs. Defaults to 1.
+
+         Returns:
+             self: Fitted quantifier.
+         """
+         self.n_jobs = n_jobs
+         self.learner_fitted = learner_fitted
+         self.cv_folds = cv_folds
+
+         self.classes = np.unique(y)
+         if self.binary_data or self.multiclass_method:
+             return self._fit_method(X, y)
+
+         # Making one vs all
+         self.binary_quantifiers = {class_: deepcopy(self) for class_ in self.classes}
+         parallel(self.delayed_fit, self.classes, self.n_jobs, X, y)
+
+         return self
+
+     def predict(self, X) -> dict:
+         """Predict class prevalences for the given data.
+
+         Args:
+             X (array-like): Test features.
+
+         Returns:
+             dict: Dictionary with class prevalences.
+         """
+         if self.binary_data or self.multiclass_method:
+             prevalences = self._predict_method(X)
+             return normalize_prevalence(prevalences, self.classes)
+
+         # Making one vs all
+         prevalences = np.asarray(parallel(self.delayed_predict, self.classes, self.n_jobs, X))
+         return normalize_prevalence(prevalences, self.classes)
+
+     @abstractmethod
+     def _fit_method(self, X, y):
+         """Abstract fit method that each quantification method must implement.
+
+         Args:
+             X (array-like): Training features.
+             y (array-like): Training labels.
+         """
+         ...
+
+     @abstractmethod
+     def _predict_method(self, X) -> dict:
+         """Abstract predict method that each quantification method must implement.
+
+         Args:
+             X (array-like): Test data to generate class prevalences.
+
+         Returns:
+             dict: Dictionary with class:prevalence for each class.
+         """
+         ...
+
+     @property
+     def learner(self):
+         return self.learner_
+
+     @learner.setter
+     def learner(self, value):
+         self.learner_ = value
+
+
+     def get_params(self, deep=True):
+         return self.learner.get_params()
+
+     def set_params(self, **params):
+         # Model Params
+         for key, value in params.items():
+             if hasattr(self, key):
+                 setattr(self, key, value)
+
+         # Learner Params
+         if self.learner:
+             learner_params = {k.replace('learner__', ''): v for k, v in params.items() if 'learner__' in k}
+             if learner_params:
+                 self.learner.set_params(**learner_params)
+
+         return self
+
+
+     # MULTICLASS METHODS
+
+     def delayed_fit(self, class_, X, y):
+         """Delayed fit method for the one-vs-all strategy, run in parallel.
+
+         Args:
+             class_ (Any): The class for which the model is being fitted.
+             X (array-like): Training features.
+             y (array-like): Training labels.
+
+         Returns:
+             self: Fitted binary quantifier for the given class.
+         """
+         y_class = (y == class_).astype(int)
+         return self.binary_quantifiers[class_].fit(X, y_class)
+
+     def delayed_predict(self, class_, X):
+         """Delayed predict method for the one-vs-all strategy, run in parallel.
+
+         Args:
+             class_ (Any): The class for which the model is making predictions.
+             X (array-like): Test features.
+
+         Returns:
+             float: Predicted prevalence for the given class.
+         """
+         return self.binary_quantifiers[class_].predict(X)[1]
+
+
+ class NonAggregativeQuantifier(Quantifier):
+     """Abstract class for non-aggregative quantifiers, i.e. methods that do not
+     rely on a classifier or any other learner to produce their predictions.
+     """
+
+
+     def fit(self, X, y, n_jobs: int = 1):
+         """Fit the quantifier model.
+
+         Args:
+             X (array-like): Training features.
+             y (array-like): Training labels.
+             n_jobs (int, optional): Number of parallel jobs. Defaults to 1.
+
+         Returns:
+             self: Fitted quantifier.
+         """
+         self.n_jobs = n_jobs
+         self.classes = np.unique(y)
+         if self.binary_data or self.multiclass_method:
+             return self._fit_method(X, y)
+
+         # Making one vs all
+         self.binary_quantifiers = {class_: deepcopy(self) for class_ in self.classes}
+         parallel(self.delayed_fit, self.classes, self.n_jobs, X, y)
+
+         return self
+
+     def predict(self, X) -> dict:
+         """Predict class prevalences for the given data.
+
+         Args:
+             X (array-like): Test features.
+
+         Returns:
+             dict: Dictionary with class prevalences.
+         """
+         if self.binary_data or self.multiclass_method:
+             prevalences = self._predict_method(X)
+             return normalize_prevalence(prevalences, self.classes)
+
+         # Making one vs all
+         prevalences = np.asarray(parallel(self.delayed_predict, self.classes, self.n_jobs, X))
+         return normalize_prevalence(prevalences, self.classes)
+
+
+     @abstractmethod
+     def _fit_method(self, X, y):
+         """Abstract fit method that each quantification method must implement.
+
+         Args:
+             X (array-like): Training features.
+             y (array-like): Training labels.
+         """
+         ...
+
+     @abstractmethod
+     def _predict_method(self, X) -> dict:
+         """Abstract predict method that each quantification method must implement.
+
+         Args:
+             X (array-like): Test data to generate class prevalences.
+
+         Returns:
+             dict: Dictionary with class:prevalence for each class.
+         """
+         ...
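
base.py above defines the package's quantifier contract: subclasses implement `_fit_method` and `_predict_method`, while the base class handles class bookkeeping, model persistence, and the one-vs-all fallback for multiclass data. A minimal sketch of how a subclass could plug into that contract follows; the `NaiveCC` name, the `LogisticRegression` default, and the dict returned by `_predict_method` are illustrative assumptions, not part of the package (which ships its own classify-and-count method under `CC`):

```python
# Hypothetical sketch, not part of the package: a naive classify-and-count
# quantifier built on the AggregativeQuantifier contract from base.py.
import numpy as np
from sklearn.linear_model import LogisticRegression
from mlquantify.base import AggregativeQuantifier


class NaiveCC(AggregativeQuantifier):
    def __init__(self, learner=None):
        super().__init__()
        self.learner = learner or LogisticRegression()  # stored via the learner setter

    def _fit_method(self, X, y):
        if not self.learner_fitted:   # flag set by AggregativeQuantifier.fit
            self.learner.fit(X, y)
        return self

    def _predict_method(self, X) -> dict:
        preds = self.learner.predict(X)
        # Prevalence of each class = fraction of test items classified as it.
        return {c: float(np.mean(preds == c)) for c in self.classes}
```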
@@ -0,0 +1 @@
+ from .pwkclf import PWKCLF
@@ -0,0 +1,2 @@
+ from .measures import *
+ from .protocol import *
@@ -0,0 +1,40 @@
+ from .aggregative import *
+ from .non_aggregative import *
+ from .meta import *
+
+
+ AGGREGATIVE = {
+     "CC": CC,
+     "PCC": PCC,
+     "EMQ": EMQ,
+     "FM": FM,
+     "GAC": GAC,
+     "GPAC": GPAC,
+     "PWK": PWK,
+     "ACC": ACC,
+     "MAX": MAX,
+     "MS": MS,
+     "MS2": MS2,
+     "PACC": PACC,
+     "T50": T50,
+     "X": X_method,
+     "DyS": DyS,
+     "DySsyn": DySsyn,
+     "HDy": HDy,
+     "SMM": SMM,
+     "SORD": SORD,
+ }
+
+ NON_AGGREGATIVE = {
+     "HDx": HDx,
+ }
+
+ META = {
+     "ENSEMBLE": Ensemble
+ }
+
+
+ METHODS = AGGREGATIVE | NON_AGGREGATIVE
+
+ def get_method(method:str):
+     return METHODS.get(method)
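
The methods `__init__.py` above collects every quantifier into the `AGGREGATIVE`, `NON_AGGREGATIVE`, and `META` registries and exposes `get_method` for lookup by name. Note that `METHODS` merges only the first two dictionaries, so `"ENSEMBLE"` is not resolvable through `get_method`. A small usage sketch based solely on the code shown:

```python
from mlquantify.methods import METHODS, get_method

print(sorted(METHODS))        # registry keys: 'ACC', 'CC', 'DyS', 'HDx', ...

cc_cls = get_method("CC")     # returns the CC class from the registry
if get_method("ENSEMBLE") is None:
    # 'ENSEMBLE' lives in META, which is not merged into METHODS,
    # so get_method returns None for it.
    print("meta-methods are not resolvable via get_method")
```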
@@ -0,0 +1,232 @@
+ from .base import Quantifier
+ from typing import Union, List
+ import itertools
+ from tqdm import tqdm
+ import signal
+ from copy import deepcopy
+ import numpy as np
+ from sklearn.model_selection import train_test_split
+ from .utils import parallel
+ from .evaluation import get_measure, APP, NPP
+
+ class GridSearchQ(Quantifier):
+     """
+     Hyperparameter optimization for quantification models using grid search.
+
+     Args:
+         model (Quantifier): The base quantification model.
+         param_grid (dict): Hyperparameters to search over.
+         protocol (str, optional): Quantification protocol ('app' or 'npp'). Defaults to 'app'.
+         n_prevs (int, optional): Number of prevalence points for APP. Defaults to None.
+         n_repetitions (int, optional): Number of repetitions for NPP. Defaults to 1.
+         scoring (Union[List[str], str], optional): Metric(s) for evaluation. Defaults to "ae".
+         refit (bool, optional): Refit model on best parameters. Defaults to True.
+         val_split (float, optional): Proportion of data for validation. Defaults to 0.4.
+         n_jobs (int, optional): Number of parallel jobs. Defaults to 1.
+         random_seed (int, optional): Seed for reproducibility. Defaults to 42.
+         timeout (int, optional): Max time per parameter combination (seconds). Defaults to -1.
+         verbose (bool, optional): Verbosity of output. Defaults to False.
+     """
+
+     def __init__(self,
+                  model: Quantifier,
+                  param_grid: dict,
+                  protocol: str = 'app',
+                  n_prevs: int = None,
+                  n_repetitions: int = 1,
+                  scoring: Union[List[str], str] = "ae",
+                  refit: bool = True,
+                  val_split: float = 0.4,
+                  n_jobs: int = 1,
+                  random_seed: int = 42,
+                  timeout: int = -1,
+                  verbose: bool = False):
+
+         self.model = model
+         self.param_grid = param_grid
+         self.protocol = protocol.lower()
+         self.n_prevs = n_prevs
+         self.n_repetitions = n_repetitions
+         self.refit = refit
+         self.val_split = val_split
+         self.n_jobs = n_jobs
+         self.random_seed = random_seed
+         self.timeout = timeout
+         self.verbose = verbose
+         self.scoring = [get_measure(measure) for measure in (scoring if isinstance(scoring, list) else [scoring])]
+
+         assert self.protocol in {'app', 'npp'}, 'Unknown protocol; valid ones are "app" or "npp".'
+
+         if self.protocol == 'npp' and self.n_repetitions <= 1:
+             raise ValueError('For "npp" protocol, n_repetitions must be greater than 1.')
+
+     def sout(self, msg):
+         """Prints messages if verbose is True."""
+         if self.verbose:
+             print(f'[{self.__class__.__name__}]: {msg}')
+
+     def __get_protocol(self, model, sample_size):
+         """Get the appropriate protocol instance.
+
+         Args:
+             model (Quantifier): The quantification model.
+             sample_size (int): The sample size for batch processing.
+
+         Returns:
+             object: Instance of APP or NPP protocol.
+         """
+         protocol_params = {
+             'models': model,
+             'batch_size': sample_size,
+             'n_iterations': self.n_repetitions,
+             'n_jobs': self.n_jobs,
+             'verbose': False,
+             'random_state': 35,
+             'return_type': "predictions"
+         }
+         return APP(n_prevs=self.n_prevs, **protocol_params) if self.protocol == 'app' else NPP(**protocol_params)
+
+     def fit(self, X, y):
+         """Fit the quantifier model and perform grid search.
+
+         Args:
+             X (array-like): Training features.
+             y (array-like): Training labels.
+
+         Returns:
+             self: Fitted GridSearchQ instance.
+         """
+         X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=self.val_split, random_state=self.random_seed)
+         param_combinations = list(itertools.product(*self.param_grid.values()))
+         best_score, best_params = None, None
+
+         if self.timeout > 0:
+             signal.signal(signal.SIGALRM, self._timeout_handler)
+
+         def evaluate_combination(params):
+             """Evaluate a single combination of hyperparameters.
+
+             Args:
+                 params (tuple): A tuple of hyperparameter values.
+
+             Returns:
+                 float or None: The evaluation score, or None if timeout occurred.
+             """
+
+             if self.verbose:
+                 print(f"\tEvaluate Combination for {str(params)}")
+
+
+             model = deepcopy(self.model)
+             model.set_params(**dict(zip(self.param_grid.keys(), params)))
+             protocol_instance = self.__get_protocol(model, len(y_train))
+
+             try:
+                 if self.timeout > 0:
+                     signal.alarm(self.timeout)
+
+                 protocol_instance.fit(X_train, y_train)
+                 _, real_prevs, pred_prevs = protocol_instance.predict(X_val, y_val)
+                 scores = [np.mean([measure(rp, pp) for rp, pp in zip(real_prevs, pred_prevs)]) for measure in self.scoring]
+
+                 if self.timeout > 0:
+                     signal.alarm(0)
+
+
+
+                 if self.verbose:
+                     print(f"\t\\--ended evaluation of {str(params)}")
+
+                 return np.mean(scores) if scores else None
+             except TimeoutError:
+                 self.sout(f'Timeout reached for combination {params}.')
+                 return None
+
+         results = parallel(
+             evaluate_combination,
+             tqdm(param_combinations, desc="Evaluating combination", total=len(param_combinations)) if self.verbose else param_combinations,
+             n_jobs=self.n_jobs
+         )
+
+         for score, params in zip(results, param_combinations):
+             if score is not None and (best_score is None or score < best_score):
+                 best_score, best_params = score, params
+
+         self.best_score_ = best_score
+         self.best_params_ = dict(zip(self.param_grid.keys(), best_params))
+         self.sout(f'Optimization complete. Best score: {self.best_score_}, with parameters: {self.best_params_}.')
+
+         if self.refit and self.best_params_:
+             self.model.set_params(**self.best_params_)
+             self.model.fit(X, y)
+             self.best_model_ = self.model
+
+         return self
+
+     def predict(self, X):
+         """Make predictions using the best found model.
+
+         Args:
+             X (array-like): Data to predict on.
+
+         Returns:
+             array-like: Predictions.
+         """
+         if not hasattr(self, 'best_model_'):
+             raise RuntimeError("The model has not been fitted yet.")
+         return self.best_model_.predict(X)
+
+     @property
+     def classes_(self):
+         """Get the classes of the best model.
+
+         Returns:
+             array-like: The classes.
+         """
+         return self.best_model_.classes_
+
+     def set_params(self, **parameters):
+         """Set the hyperparameters for grid search.
+
+         Args:
+             parameters (dict): Hyperparameters to set.
+         """
+         self.param_grid = parameters
+
+     def get_params(self, deep=True):
+         """Get the parameters of the best model.
+
+         Args:
+             deep (bool, optional): If True, will return the parameters for this estimator and contained subobjects. Defaults to True.
+
+         Returns:
+             dict: Parameters of the best model.
+         """
+         if hasattr(self, 'best_model_'):
+             return self.best_model_.get_params()
+         raise ValueError('get_params called before fit')
+
+     def best_model(self):
+         """Return the best model after fitting.
+
+         Returns:
+             Quantifier: The best model.
+
+         Raises:
+             ValueError: If called before fitting.
+         """
+         if hasattr(self, 'best_model_'):
+             return self.best_model_
+         raise ValueError('best_model called before fit')
+
+     def _timeout_handler(self, signum, frame):
+         """Handle timeouts during evaluation.
+
+         Args:
+             signum (int): Signal number.
+             frame (object): Current stack frame.
+
+         Raises:
+             TimeoutError: When the timeout is reached.
+         """
+         raise TimeoutError()
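
A hedged sketch of how `GridSearchQ` might be driven, given the API above. The `CC` quantifier, the `learner__C` grid (forwarded to the underlying classifier by `set_params` in base.py), and the data variables `X`, `y`, and `X_test` are assumptions for illustration, not taken from the package:

```python
from sklearn.linear_model import LogisticRegression
from mlquantify.methods import get_method
from mlquantify.model_selection import GridSearchQ

model = get_method("CC")(LogisticRegression())   # constructor signature assumed

search = GridSearchQ(
    model=model,
    param_grid={"learner__C": [0.1, 1.0, 10.0]},  # 'learner__' keys reach the classifier
    protocol="app",      # artificial-prevalence protocol
    n_prevs=11,          # number of prevalence points evaluated by APP
    scoring="ae",        # matches the signature default
    refit=True,
    n_jobs=1,
    verbose=True,
)
search.fit(X, y)                                  # X, y assumed to be the training set
print(search.best_params_, search.best_score_)
prevalences = search.predict(X_test)              # X_test assumed available
```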
@@ -0,0 +1,2 @@
+ from .protocol_plot import protocol_boxplot, protocol_lineplot
+ from .distribution_plot import class_distribution_plot
@@ -0,0 +1,2 @@
+ from .general_purposes import *
+ from .method_purposes import *
@@ -0,0 +1,22 @@
+ Metadata-Version: 2.1
+ Name: mlquantify
+ Version: 0.0.1
+ Summary: Quantification Library
+ Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
+ Maintainer: Luiz Fernando Luth Junior
+ Keywords: python,machine learning,quantification,quantify
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Operating System :: Unix
+ Classifier: Operating System :: MacOS :: MacOS X
+ Classifier: Operating System :: Microsoft :: Windows
+ Description-Content-Type: text/markdown
+ Requires-Dist: scikit-learn
+ Requires-Dist: numpy
+ Requires-Dist: scipy
+ Requires-Dist: joblib
+ Requires-Dist: tqdm
+ Requires-Dist: pandas
+ Requires-Dist: xlrd
+ Requires-Dist: matplotlib
@@ -0,0 +1,15 @@
+ MANIFEST.in
+ README.md
+ setup.py
+ mlquantify/base.py
+ mlquantify/model_selection.py
+ mlquantify.egg-info/PKG-INFO
+ mlquantify.egg-info/SOURCES.txt
+ mlquantify.egg-info/dependency_links.txt
+ mlquantify.egg-info/requires.txt
+ mlquantify.egg-info/top_level.txt
+ mlquantify/classification/__init__.py
+ mlquantify/evaluation/__init__.py
+ mlquantify/methods/__init__.py
+ mlquantify/plots/__init__.py
+ mlquantify/utils/__init__.py
@@ -0,0 +1,8 @@
+ scikit-learn
+ numpy
+ scipy
+ joblib
+ tqdm
+ pandas
+ xlrd
+ matplotlib
@@ -0,0 +1 @@
+ mlquantify
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1,26 @@
+ from setuptools import setup, find_packages
+
+ VERSION = '0.0.1'
+ DESCRIPTION = 'Quantification Library'
+
+ # Setting up
+ setup(
+     name="mlquantify",
+     version=VERSION,
+     url="https://github.com/luizfernandolj/QuantifyML/tree/master",
+     maintainer="Luiz Fernando Luth Junior",
+     description=DESCRIPTION,
+     long_description_content_type="text/markdown",
+     packages=find_packages(),
+     include_package_data=True,
+     install_requires=['scikit-learn', 'numpy', 'scipy', 'joblib', 'tqdm', 'pandas', 'xlrd', 'matplotlib'],
+     keywords=['python', 'machine learning', 'quantification', 'quantify'],
+     classifiers=[
+         "Development Status :: 4 - Beta",
+         "Intended Audience :: Science/Research",
+         "Programming Language :: Python :: 3",
+         "Operating System :: Unix",
+         "Operating System :: MacOS :: MacOS X",
+         "Operating System :: Microsoft :: Windows",
+     ]
+ )