mlquantify-0.1.2-py3-none-any.whl → mlquantify-0.1.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -907,10 +907,140 @@ class PCC(AggregativeQuantifier):
 
 
 
+ class PACC(AggregativeQuantifier):
+     """
+     Probabilistic Adjusted Classify and Count (PACC).
+     This method extends the Adjusted Classify and Count (ACC) approach
+     by leveraging the average class-conditional confidences obtained
+     from a probabilistic classifier instead of relying solely on true
+     positive and false positive rates.
+ 
+     Parameters
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier to be used for quantification.
+     threshold : float, optional
+         The decision threshold for classification. Default is 0.5.
+ 
+     Attributes
+     ----------
+     learner : BaseEstimator
+         A scikit-learn compatible classifier.
+     threshold : float
+         Decision threshold for classification. Default is 0.5.
+     mean_pos : float
+         Mean positive-class probability over the positive training instances, computed during fitting.
+     mean_neg : float
+         Mean positive-class probability over the negative training instances, computed during fitting.
+ 
+     See Also
+     --------
+     ThresholdOptimization : Base class for threshold-based quantification methods.
+     ACC : Adjusted Classify and Count quantification method.
+     CC : Classify and Count quantification method.
+ 
+     References
+     ----------
+     A. Bella, C. Ferri, J. Hernández-Orallo and M. J. Ramírez-Quintana, "Quantification via Probability Estimators," 2010 IEEE International Conference on Data Mining, Sydney, NSW, Australia, 2010, pp. 737-742, doi: 10.1109/ICDM.2010.75. Available at: https://ieeexplore.ieee.org/abstract/document/5694031
 
+     Examples
+     --------
+     >>> from mlquantify.methods.aggregative import PACC
+     >>> from mlquantify.utils.general import get_real_prev
+     >>> from sklearn.datasets import load_breast_cancer
+     >>> from sklearn.svm import SVC
+     >>> from sklearn.model_selection import train_test_split
+     >>>
+     >>> features, target = load_breast_cancer(return_X_y=True)
+     >>>
+     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
+     >>>
+     >>> pacc = PACC(learner=SVC(probability=True))
+     >>> pacc.fit(X_train, y_train)
+     >>> y_pred = pacc.predict(X_test)
+     >>> y_pred
+     {0: 0.4664886119311328, 1: 0.5335113880688672}
+     >>> get_real_prev(y_test)
+     {0: 0.3991228070175439, 1: 0.6008771929824561}
+     """
 
+     def __init__(self, learner: BaseEstimator = None, threshold: float = 0.5):
+         self.learner = learner
+         self.threshold = threshold
+         self.mean_pos = None
+         self.mean_neg = None
+ 
+     @property
+     def is_probabilistic(self) -> bool:
+         return True
+ 
+     @property
+     def is_multiclass(self) -> bool:
+         return False
 
+     def _fit_method(self, X, y):
+         # Get predicted labels and probabilities, reusing precomputed posteriors if available
+         if mq.arguments["y_labels"] is not None and mq.arguments["posteriors_train"] is not None:
+             y_labels = mq.arguments["y_labels"]
+             probabilities = mq.arguments["posteriors_train"]
+         else:
+             y_labels, probabilities = get_scores(X, y, self.learner, self.cv_folds, self.learner_fitted)
+ 
+         # Compute the mean positive-class probability over the positive and negative training instances
+         self.mean_pos = np.mean(probabilities[y_labels == self.classes[1], 1])
+         self.mean_neg = np.mean(probabilities[y_labels != self.classes[1], 1])
+ 
+         return self
+ 
+ 
+     def _predict_method(self, X):
+         """
+         Predicts the class prevalence using the mean class-conditional
+         probabilities from a probabilistic classifier.
 
+         Parameters
+         ----------
+         X : array-like or sparse matrix of shape (n_samples, n_features)
+             The input data for prediction.
+ 
+         Returns
+         -------
+         dict
+             A dictionary with class labels as keys and their respective
+             prevalence estimates as values.
+ 
+         Notes
+         -----
+         The prevalence is adjusted using the formula:
+             prevalence = |mean_score - mean_neg| / (mean_pos - mean_neg),
+         where mean_score is the average positive-class probability on the
+         test data. If `mean_pos - mean_neg` equals zero, the unadjusted
+         mean score is returned instead.
+         """
+         prevalences = {}
+ 
+         # Calculate probabilities for the positive class
+         probabilities = self.predict_learner(X)[:, 1]
+ 
+         # Compute the mean score for the positive class
+         mean_scores = np.mean(probabilities)
+ 
+         # Adjust the prevalence using the mean class-conditional probabilities
+         if self.mean_pos - self.mean_neg == 0:
+             prevalence = mean_scores
+         else:
+             prevalence = np.clip(abs(mean_scores - self.mean_neg) / (self.mean_pos - self.mean_neg), 0, 1)
+ 
+         # Map the computed prevalence to the class labels
+         prevalences[self.classes[0]] = 1 - prevalence
+         prevalences[self.classes[1]] = prevalence
+ 
+         return prevalences
 
 
 class PWK(AggregativeQuantifier):
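
The adjustment the new `PACC._predict_method` applies is just a rescaling of the mean test-set score between the two class-conditional training means. A minimal standalone sketch of that arithmetic, using plain numpy and made-up numbers rather than the package's own classes:

```python
import numpy as np

# Class-conditional means estimated on training data (assumed values for illustration):
# mean_pos = average P(y=1|x) over truly positive training examples
# mean_neg = average P(y=1|x) over truly negative training examples
mean_pos, mean_neg = 0.82, 0.21

# Average P(y=1|x) over the (unlabeled) test sample
test_scores = np.array([0.9, 0.7, 0.3, 0.55, 0.8, 0.15])
mean_score = test_scores.mean()

# PACC-style adjustment: rescale and clip to [0, 1];
# fall back to the raw mean when the denominator vanishes
denom = mean_pos - mean_neg
prev_pos = mean_score if denom == 0 else np.clip(abs(mean_score - mean_neg) / denom, 0, 1)
prevalences = {0: 1 - prev_pos, 1: prev_pos}
print(prevalences)
```

The `np.clip` keeps the estimate inside [0, 1] when the test-set mean falls outside the interval spanned by the two training means.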
@@ -1012,7 +1142,6 @@ class PWK(AggregativeQuantifier):
 from . import threshold_optimization
 
 ACC = threshold_optimization.ACC
- PACC = threshold_optimization.PACC
 T50 = threshold_optimization.T50
 MAX = threshold_optimization.MAX
 X_method = threshold_optimization.X_method
@@ -659,157 +659,6 @@ class MS2(ThresholdOptimization):
 
         return np.asarray(prevalences)
 
- class PACC(ThresholdOptimization):
-     """
-     Probabilistic Adjusted Classify and Count (PACC).
-     This method extends the Adjusted Classify and Count (AC) approach
-     by leveraging the average class-conditional confidences obtained
-     from a probabilistic classifier instead of relying solely on true
-     positive and false positive rates.
- 
-     Parameters
-     ----------
-     learner : BaseEstimator
-         A scikit-learn compatible classifier to be used for quantification.
-     threshold : float, optional
-         The decision threshold for classification. Default is 0.5.
- 
-     Attributes
-     ----------
-     learner : BaseEstimator
-         A scikit-learn compatible classifier.
-     threshold : float
-         Decision threshold for classification. Default is 0.5.
-     tpr : float
-         True positive rate computed during the fitting process.
-     fpr : float
-         False positive rate computed during the fitting process.
- 
-     See Also
-     --------
-     ThresholdOptimization : Base class for threshold-based quantification methods.
-     ACC : Adjusted Classify and Count quantification method.
-     CC : Classify and Count quantification method.
- 
-     References
-     ----------
-     A. Bella, C. Ferri, J. Hernández-Orallo and M. J. Ramírez-Quintana, "Quantification via Probability Estimators," 2010 IEEE International Conference on Data Mining, Sydney, NSW, Australia, 2010, pp. 737-742, doi: 10.1109/ICDM.2010.75. Available at: https://ieeexplore.ieee.org/abstract/document/5694031
- 
-     Examples
-     --------
-     >>> from mlquantify.methods.aggregative import PACC
-     >>> from mlquantify.utils.general import get_real_prev
-     >>> from sklearn.datasets import load_breast_cancer
-     >>> from sklearn.svm import SVC
-     >>> from sklearn.model_selection import train_test_split
-     >>>
-     >>> features, target = load_breast_cancer(return_X_y=True)
-     >>>
-     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
-     >>>
-     >>> pacc = PACC(learner=SVC(probability=True))
-     >>> pacc.fit(X_train, y_train)
-     >>> y_pred = pacc.predict(X_test)
-     >>> y_pred
-     {0: 0.4664886119311328, 1: 0.5335113880688672}
-     >>> get_real_prev(y_test)
-     {0: 0.3991228070175439, 1: 0.6008771929824561}
-     """
- 
-     def __init__(self, learner: BaseEstimator=None, threshold: float = 0.5):
-         super().__init__(learner)
-         self.threshold = threshold
- 
-     def _predict_method(self, X):
-         """
-         Predicts the class prevalence using the mean class-conditional
-         probabilities from a probabilistic classifier.
- 
-         Parameters
-         ----------
-         X : array-like or sparse matrix of shape (n_samples, n_features)
-             The input data for prediction.
- 
-         Returns
-         -------
-         dict
-             A dictionary with class labels as keys and their respective
-             prevalence estimates as values.
- 
-         Notes
-         -----
-         The prevalence is adjusted using the formula:
-             prevalence = |mean_score - FPR| / (TPR - FPR),
-         where mean_score is the average probability for the positive class.
- 
-         Raises
-         ------
-         ZeroDivisionError
-             If `TPR - FPR` equals zero, indicating that the classifier's
-             performance does not vary across the threshold range.
-         """
-         prevalences = {}
- 
-         # Calculate probabilities for the positive class
-         probabilities = self.predict_learner(X)[:, 1]
- 
-         # Compute the mean score for the positive class
-         mean_scores = np.mean(probabilities)
- 
-         # Adjust prevalence based on TPR and FPR
-         if self.tpr - self.fpr == 0:
-             prevalence = mean_scores
-         else:
-             prevalence = np.clip(abs(mean_scores - self.fpr) / (self.tpr - self.fpr), 0, 1)
- 
-         # Map the computed prevalence to the class labels
-         prevalences[self.classes[0]] = 1 - prevalence
-         prevalences[self.classes[1]] = prevalence
- 
-         return prevalences
- 
-     def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
-         """
-         Finds the true positive rate (TPR) and false positive rate (FPR)
-         corresponding to the specified decision threshold.
- 
-         Parameters
-         ----------
-         thresholds : np.ndarray
-             An array of threshold values.
-         tprs : np.ndarray
-             An array of true positive rates corresponding to the thresholds.
-         fprs : np.ndarray
-             An array of false positive rates corresponding to the thresholds.
- 
-         Returns
-         -------
-         tuple
-             A tuple containing the specified threshold, TPR, and FPR.
- 
-         Raises
-         ------
-         IndexError
-             If the specified threshold is not found in the `thresholds` array.
-         """
-         # Locate TPR and FPR for the specified threshold
-         tpr = tprs[thresholds == self.threshold][0]
-         fpr = fprs[thresholds == self.threshold][0]
-         return (self.threshold, tpr, fpr)
- 
- 
- 
- 
-     def best_tprfpr(self, thresholds:np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
-         tpr = tprs[thresholds == self.threshold][0]
-         fpr = fprs[thresholds == self.threshold][0]
-         return (self.threshold, tpr, fpr)
- 
- 
- 
- 
- 
- 
 
 
 class T50(ThresholdOptimization):
@@ -26,12 +26,9 @@ def convert_columns_to_arrays(df, columns:list = ['PRED_PREVS', 'REAL_PREVS']):
     return df
 
 
- 
- 
- 
- def generate_artificial_indexes(y, prevalence: list, sample_size:int, classes:list):
+ def get_indexes_with_prevalence(y, prevalence: list, sample_size:int):
     """
-     Generate indexes for a stratified sample based on the prevalence of each class.
+     Get indexes for a stratified sample based on the prevalence of each class.
 
     Parameters
     ----------
@@ -48,10 +45,13 @@ def generate_artificial_indexes(y, prevalence: list, sample_size:int, classes:li
     -------
     list
         List of indexes for the stratified sample.
-     """
+     """
+     classes = np.unique(y)
+ 
     # Ensure the sum of prevalences is 1
     assert np.isclose(sum(prevalence), 1), "The sum of prevalences must be 1"
     # Ensure the number of prevalences matches the number of classes
+     assert len(prevalence) == len(classes), "The number of prevalences must match the number of classes"
 
     sampled_indexes = []
     total_sampled = 0
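
As context for the renamed `get_indexes_with_prevalence` helper, the core of such a routine is per-class bookkeeping: each class contributes roughly `prevalence * sample_size` indexes. A rough sketch of that idea with plain numpy and hypothetical names, not the package's implementation:

```python
import numpy as np

def sample_indexes_by_prevalence(y, prevalence, sample_size, rng=None):
    """Draw indexes whose class proportions approximate `prevalence` (illustrative sketch)."""
    rng = np.random.default_rng() if rng is None else rng
    y = np.asarray(y)
    classes = np.unique(y)
    assert np.isclose(sum(prevalence), 1), "The sum of prevalences must be 1"
    assert len(prevalence) == len(classes), "One prevalence per class is required"

    sampled = []
    for cls, p in zip(classes, prevalence):
        n_cls = int(round(p * sample_size))      # target count for this class
        pool = np.flatnonzero(y == cls)          # candidate indexes of this class
        take = rng.choice(pool, size=n_cls, replace=n_cls > len(pool))
        sampled.extend(take.tolist())
    return sampled

# Example: a sample of 100 indexes with ~30% of class 0 and ~70% of class 1
y = np.array([0] * 500 + [1] * 500)
idx = sample_indexes_by_prevalence(y, [0.3, 0.7], 100)
print(np.bincount(y[idx]) / len(idx))   # roughly [0.3, 0.7]
```

With rounding, the per-class counts may not sum exactly to `sample_size`, which is why an implementation typically also tracks how many indexes have been drawn so far.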
@@ -78,6 +78,43 @@ def generate_artificial_indexes(y, prevalence: list, sample_size:int, classes:li
 
 
 
+ def kraemer_sampling(n_dim: int, n_prev: int, n_iter: int = 1) -> np.ndarray:
+     """
+     Uniform sampling from the unit simplex using Kraemer's algorithm.
+ 
+     Parameters
+     ----------
+     n_dim : int
+         Number of dimensions (classes) of the simplex.
+     n_prev : int
+         Number of prevalence vectors to sample.
+     n_iter : int
+         Number of times each sampled prevalence vector is repeated. Default is 1.
+ 
+     Returns
+     -------
+     np.ndarray
+         Array of sampled prevalences.
+     """
+ 
+     def _sampling(n_dim: int, n_prev: int) -> np.ndarray:
+         if n_dim == 2:
+             u = np.random.rand(n_prev)
+             return np.vstack([1 - u, u]).T
+         else:
+             u = np.random.rand(n_prev, n_dim - 1)
+             u.sort(axis=-1)  # sort each row
+             _0s = np.zeros((n_prev, 1))
+             _1s = np.ones((n_prev, 1))
+             a = np.hstack([_0s, u])
+             b = np.hstack([u, _1s])
+             return b - a
+ 
+     # Draw the prevalences, then repeat each one n_iter times if requested
+     prevs = _sampling(n_dim, n_prev)
+ 
+     return np.repeat(prevs, n_iter, axis=0) if n_iter > 1 else prevs
+ 
 
 def generate_artificial_prevalences(n_dim: int, n_prev: int, n_iter: int) -> np.ndarray:
     """Generates n artificial prevalences with n dimensions.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mlquantify
- Version: 0.1.2
+ Version: 0.1.4
 Summary: Quantification Library
 Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
 Maintainer: Luiz Fernando Luth Junior
@@ -40,9 +40,9 @@ ___
 
 ## Latest Release
 
- - **Version 0.0.11.6**: Inicial beta version. For a detailed list of changes, check the [changelog](#).
- - In case you need any help, refer to the [wiki](https://github.com/luizfernandolj/mlquantify/wiki).
- - Explore the [API documentation](#) for detailed developer information.
+ - **Version 0.1.3**: Initial beta version. For a detailed list of changes, check the [changelog](#).
+ - In case you need any help, refer to the [User Guide](https://luizfernandolj.github.io/mlquantify/user_guide.html).
+ - Explore the [API documentation](https://luizfernandolj.github.io/mlquantify/api/index.html) for detailed developer information.
 - See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)
 
 ___
@@ -70,7 +70,7 @@ ___
 | **21 Quantification Methods** | Methods for quantification, such as classify & Count Correct methods, Threshold Optimization, Mixture Models and more.|
 | **Dynamic class management** | All methods are dynamic, and handles multiclass and binary problems, in case of binary it makes One-Vs-All (OVA) automatically. |
 | **Model Selection** | Criteria and processes used to select the best model, such as grid-search for the case of quantification|
- | **Evaluation Metrics** | Specific metrics used to evaluate quantification performance, (e.g., AE, BIAS, NAE, SE, KLD, etc.). |
+ | **Evaluation Metrics** | Specific metrics used to evaluate quantification performance (e.g., AE, MAE, NAE, SE, KLD, etc.). |
 | **Evaluation Protocols** | Evaluation protocols used, based on sampling generation (e.g., APP, NPP, etc.).. |
 | **Plotting Results** | Tools and techniques used to visualize results, such as the protocol results.|
 | **Comprehensive Documentation** | Complete documentation of the project, including code, data, and results. |
@@ -82,7 +82,10 @@ ___
 This code first loads the breast cancer dataset from _sklearn_, which is then split into training and testing sets. It uses the _Expectation Maximisation Quantifier (EMQ)_ with a RandomForest classifier to predict class prevalence. After training the model, it evaluates performance by calculating and printing the absolute error and bias between the real and predicted prevalences.
 
 ```python
- import mlquantify as mq
+ from mlquantify.methods import EMQ
+ from mlquantify.evaluation.measures import absolute_error, mean_absolute_error
+ from mlquantify.utils import get_real_prev
+ 
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.datasets import load_breast_cancer
 from sklearn.model_selection import train_test_split
@@ -94,19 +97,19 @@ features, target = load_breast_cancer(return_X_y=True)
 X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
 
 #Create the model, here it is the Expectation Maximisation Quantifier (EMQ) with a classifier
- model = mq.methods.EMQ(RandomForestClassifier())
+ model = EMQ(RandomForestClassifier())
 model.fit(X_train, y_train)
 
 #Predict the class prevalence for X_test
 pred_prevalence = model.predict(X_test)
- real_prevalence = mq.utils.get_real_prev(y_test)
+ real_prevalence = get_real_prev(y_test)
 
 #Get the error for the prediction
- ae = mq.evaluation.absolute_error(real_prevalence, pred_prevalence)
- bias = mq.evaluation.bias(real_prevalence, pred_prevalence)
+ ae = absolute_error(real_prevalence, pred_prevalence)
+ mae = mean_absolute_error(real_prevalence, pred_prevalence)
 
- print(f"Mean Squared Error (MSE) -> {ae:.4f}")
- print(f"Bias -> {bias}")
+ print(f"Absolute Error -> {ae}")
+ print(f"Mean Absolute Error -> {mae}")
 ```
 
 ___
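
Since the updated snippet prints AE and MAE over prevalence dictionaries, a hand-rolled version of those quantities can make them concrete. A sketch assuming prevalences are dicts mapping class label to proportion, as in the PACC example output shown earlier (hypothetical helper names, not the package API):

```python
import numpy as np

def real_prev(y):
    """Class proportions of a label array as a dict (illustrative helper)."""
    classes, counts = np.unique(y, return_counts=True)
    return dict(zip(classes.tolist(), (counts / counts.sum()).tolist()))

def abs_error(real, pred):
    """Mean absolute difference between real and predicted prevalence dicts."""
    return float(np.mean([abs(real[c] - pred[c]) for c in real]))

y_test = np.array([0, 1, 1, 0, 1, 1, 1, 0])
real = real_prev(y_test)                  # {0: 0.375, 1: 0.625}
pred = {0: 0.30, 1: 0.70}                 # e.g. the output of model.predict(X_test)
print(real, abs_error(real, pred))        # error of 0.075
```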
@@ -125,7 +128,7 @@ ___
 
 ## Documentation
 
- ##### API is avaliable [here](#)
+ ##### API is available [here](https://luizfernandolj.github.io/mlquantify/api/index.html)
 
 - [Methods](https://github.com/luizfernandolj/mlquantify/wiki/Methods)
 - [Model Selection](https://github.com/luizfernandolj/mlquantify/wiki/Model-Selection)
@@ -6,17 +6,17 @@ mlquantify/classification/__init__.py,sha256=3FGf-F4SOM3gByUPsWdnBzjyC_31B3Mtzuo
 mlquantify/classification/methods.py,sha256=yDSbpoqM3hfF0a9ATzKqfG9S-44x-0Rq0lkAVJKTIEs,5006
 mlquantify/evaluation/__init__.py,sha256=x1grng0n_QeZpVBU8-pwagYdBMkbMRILtrp1qk_bLvk,447
 mlquantify/evaluation/measures.py,sha256=fIKyxxlD8em3oaj4u_BeXmNyUQG_A0vXWY8APPgNoJ0,6579
- mlquantify/evaluation/protocol.py,sha256=OsOXm_vf7sYlw9pQv08WxAvvgzo10bAqiDM-1cpz7nQ,24020
+ mlquantify/evaluation/protocol.py,sha256=__tzRyqW4cJz4Fl87TInf7dXxIJ6bSaYaSaw-SdkNmM,10365
 mlquantify/methods/__init__.py,sha256=ya3Mn7bcz2r3oaIT7yVR4iJkAfgEAwF4xDK54C0rZ7U,536
- mlquantify/methods/aggregative.py,sha256=rL_xlX2nYECrxFSjBJNlxj6h3b-iIs7l_XgxIRSYHpw,34164
+ mlquantify/methods/aggregative.py,sha256=F5Z-tGA9OcZgMBLKOeaos6wIgvvnDeriZ4y0TyMpDrc,39051
 mlquantify/methods/meta.py,sha256=sZWQHUGkm6iiqujmIpHDL_8tDdKQ161bzD5mcpXLWEY,19066
 mlquantify/methods/mixture_models.py,sha256=si2Pzaka5Kbva4QKBzLolvb_8V0ZEjp68UBAiOwl49s,35166
 mlquantify/methods/non_aggregative.py,sha256=xaBu21TUtiYkOEUKO16NaNMwdNa6-SNjfBsc5PpIMyI,4815
- mlquantify/methods/threshold_optimization.py,sha256=-iOcP5YcXZd0XZHGvbmcoE72hXR6D9YCoTnr1l80-9k,35796
+ mlquantify/methods/threshold_optimization.py,sha256=NYGKbYvtfmiBeU8wpTiFCdURkijcPRZtybPOt6vtXbY,30489
 mlquantify/utils/__init__.py,sha256=logWrL6B6mukP8tvYm_UPEdO9eNA-J-ySILr7-syDoc,44
- mlquantify/utils/general.py,sha256=Li5ix_dy19dUhYNgiUsNHdqqnSVYvznUBUuyr-zYSPI,7554
+ mlquantify/utils/general.py,sha256=wKJSmwF1KfSlSrDm0KTf92FMvB62BBOxf2Se9HyeWYE,8668
 mlquantify/utils/method.py,sha256=RL4vBJGl5_6DZ59Bs62hdNXI_hnoDIWilMMyMPiOjBg,12631
- mlquantify-0.1.2.dist-info/METADATA,sha256=2j3pqrm5djMAPm7bKTIjBjtg71OzAbFpwC-_ofOoSlc,4940
- mlquantify-0.1.2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- mlquantify-0.1.2.dist-info/top_level.txt,sha256=tGEkYkbbFElwULvqENjam3u1uXtyC1J9dRmibsq8_n0,11
- mlquantify-0.1.2.dist-info/RECORD,,
+ mlquantify-0.1.4.dist-info/METADATA,sha256=UtNxYnZnSt6HS0B8JsW5A5tvxlxFUH_GODjF1AXXsSY,5166
+ mlquantify-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ mlquantify-0.1.4.dist-info/top_level.txt,sha256=tGEkYkbbFElwULvqENjam3u1uXtyC1J9dRmibsq8_n0,11
+ mlquantify-0.1.4.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
- Generator: setuptools (78.1.0)
+ Generator: setuptools (80.9.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 