mlquantify 0.0.11.4__tar.gz → 0.0.11.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/PKG-INFO +2 -2
  2. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/README.md +1 -1
  3. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/gac.py +16 -23
  4. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/gpac.py +17 -26
  5. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/mixtureModels/_MixtureModel.py +0 -1
  6. mlquantify-0.0.11.6/mlquantify/methods/aggregative/mixtureModels/dys.py +107 -0
  7. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/mixtureModels/dys_syn.py +63 -16
  8. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/mixtureModels/hdy.py +44 -7
  9. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify.egg-info/PKG-INFO +2 -2
  10. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/setup.py +1 -1
  11. mlquantify-0.0.11.4/mlquantify/methods/aggregative/mixtureModels/dys.py +0 -55
  12. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/__init__.py +0 -0
  13. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/base.py +0 -0
  14. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/classification/__init__.py +0 -0
  15. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/classification/pwkclf.py +0 -0
  16. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/__init__.py +0 -0
  17. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/measures/__init__.py +0 -0
  18. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/measures/ae.py +0 -0
  19. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/measures/bias.py +0 -0
  20. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/measures/kld.py +0 -0
  21. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/measures/mse.py +0 -0
  22. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/measures/nae.py +0 -0
  23. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/measures/nkld.py +0 -0
  24. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/measures/nrae.py +0 -0
  25. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/measures/rae.py +0 -0
  26. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/measures/se.py +0 -0
  27. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/protocol/_Protocol.py +0 -0
  28. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/protocol/__init__.py +0 -0
  29. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/protocol/app.py +0 -0
  30. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/protocol/npp.py +0 -0
  31. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/__init__.py +0 -0
  32. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/ThreholdOptm/_ThreholdOptimization.py +0 -0
  33. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/ThreholdOptm/__init__.py +0 -0
  34. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/ThreholdOptm/acc.py +0 -0
  35. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/ThreholdOptm/max.py +0 -0
  36. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/ThreholdOptm/ms.py +0 -0
  37. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/ThreholdOptm/ms2.py +0 -0
  38. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/ThreholdOptm/pacc.py +0 -0
  39. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/ThreholdOptm/t50.py +0 -0
  40. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/ThreholdOptm/x.py +0 -0
  41. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/__init__.py +0 -0
  42. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/cc.py +0 -0
  43. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/emq.py +0 -0
  44. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/fm.py +0 -0
  45. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/mixtureModels/__init__.py +0 -0
  46. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/mixtureModels/smm.py +0 -0
  47. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/mixtureModels/sord.py +0 -0
  48. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/pcc.py +0 -0
  49. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/pwk.py +0 -0
  50. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/meta/__init__.py +0 -0
  51. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/meta/ensemble.py +0 -0
  52. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/non_aggregative/__init__.py +0 -0
  53. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/non_aggregative/hdx.py +0 -0
  54. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/model_selection.py +0 -0
  55. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/plots/__init__.py +0 -0
  56. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/plots/distribution_plot.py +0 -0
  57. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/plots/protocol_plot.py +0 -0
  58. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/__init__.py +0 -0
  59. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/general_purposes/__init__.py +0 -0
  60. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/general_purposes/convert_col_to_array.py +0 -0
  61. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/general_purposes/generate_artificial_indexes.py +0 -0
  62. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/general_purposes/get_real_prev.py +0 -0
  63. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/general_purposes/load_quantifier.py +0 -0
  64. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/general_purposes/make_prevs.py +0 -0
  65. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/general_purposes/normalize.py +0 -0
  66. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/general_purposes/parallel.py +0 -0
  67. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/general_purposes/round_protocol_df.py +0 -0
  68. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/method_purposes/__init__.py +0 -0
  69. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/method_purposes/distances.py +0 -0
  70. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/method_purposes/getHist.py +0 -0
  71. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/method_purposes/get_scores.py +0 -0
  72. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/method_purposes/moss.py +0 -0
  73. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/method_purposes/ternary_search.py +0 -0
  74. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/method_purposes/tprfpr.py +0 -0
  75. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify.egg-info/SOURCES.txt +0 -0
  76. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify.egg-info/dependency_links.txt +0 -0
  77. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify.egg-info/requires.txt +0 -0
  78. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify.egg-info/top_level.txt +0 -0
  79. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/setup.cfg +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mlquantify
-Version: 0.0.11.4
+Version: 0.0.11.6
 Summary: Quantification Library
 Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
 Maintainer: Luiz Fernando Luth Junior
@@ -32,7 +32,7 @@ ___
 
 ## Latest Release
 
-- **Version 0.0.1**: Inicial beta version. For a detailed list of changes, check the [changelog](#).
+- **Version 0.0.11.6**: Inicial beta version. For a detailed list of changes, check the [changelog](#).
 - In case you need any help, refer to the [wiki](https://github.com/luizfernandolj/mlquantify/wiki).
 - Explore the [API documentation](#) for detailed developer information.
 - See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)
README.md
@@ -9,7 +9,7 @@ ___
 
 ## Latest Release
 
-- **Version 0.0.1**: Inicial beta version. For a detailed list of changes, check the [changelog](#).
+- **Version 0.0.11.6**: Inicial beta version. For a detailed list of changes, check the [changelog](#).
 - In case you need any help, refer to the [wiki](https://github.com/luizfernandolj/mlquantify/wiki).
 - Explore the [API documentation](#) for detailed developer information.
 - See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)
mlquantify/methods/aggregative/gac.py
@@ -2,7 +2,7 @@ import numpy as np
 import pandas as pd
 from sklearn.base import BaseEstimator
 from sklearn.metrics import confusion_matrix
-from sklearn.model_selection import StratifiedKFold
+from sklearn.model_selection import train_test_split
 
 from ...base import AggregativeQuantifier
 
@@ -13,10 +13,12 @@ class GAC(AggregativeQuantifier):
     and solve it via constrained least-squares regression.
     """
 
-    def __init__(self, learner: BaseEstimator):
+    def __init__(self, learner: BaseEstimator, train_size:float=0.6, random_state:int=None):
         assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
         self.learner = learner
         self.cond_prob_matrix = None
+        self.train_size = train_size
+        self.random_state = random_state
 
     def _fit_method(self, X, y):
         # Ensure X and y are DataFrames
@@ -29,26 +31,17 @@ class GAC(AggregativeQuantifier):
             y_pred = self.learner.predict(X)
             y_label = y
         else:
-            # Cross-validation for generating predictions
-            skf = StratifiedKFold(n_splits=self.cv_folds)
-            y_pred = []
-            y_label = []
+            X_train, X_val, y_train, y_val = train_test_split(
+                X, y, train_size=self.train_size, stratify=y, random_state=self.random_state
+            )
 
-            for train_index, valid_index in skf.split(X, y):
-
-                train_data = pd.DataFrame(X.iloc[train_index])
-                train_label = y.iloc[train_index]
-
-                valid_data = pd.DataFrame(X.iloc[valid_index])
-                valid_label = y.iloc[valid_index]
-
-                self.learner.fit(train_data, train_label)
-
-                y_pred.extend(self.learner.predict(valid_data))
-                y_label.extend(valid_label)
+            self.learner.fit(X_train, y_train)
+
+            y_label = y_val
+            y_pred = self.learner.predict(X_val)
 
         # Compute conditional probability matrix
-        self.cond_prob_matrix = self.get_cond_prob_matrix(self.classes, y, y_pred)
+        self.cond_prob_matrix = GAC.get_cond_prob_matrix(self.classes, y_label, y_pred)
 
         return self
 
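The net effect of the hunk above: GAC's `_fit_method` no longer pools out-of-fold predictions from a StratifiedKFold loop; it fits the learner once on a stratified holdout split and builds the misclassification estimates from the held-out part, controlled by the new `train_size` and `random_state` parameters. This is cheaper (one fit instead of `cv_folds` fits) but estimates the confusion matrix from a single validation subset rather than from every training example. A minimal sketch of the new flow; the classifier and synthetic data here are stand-ins, not part of the package:

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 5))       # stand-in feature matrix
    y = rng.integers(0, 2, size=200)    # stand-in binary labels

    # 0.0.11.6 behaviour: a single stratified split replaces k-fold CV
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, train_size=0.6, stratify=y, random_state=0
    )
    learner = LogisticRegression().fit(X_train, y_train)
    y_pred = learner.predict(X_val)     # held-out predictions feed the confusion matrix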
@@ -66,11 +59,11 @@ class GAC(AggregativeQuantifier):
         return adjusted_prevalences
 
     @classmethod
-    def get_cond_prob_matrix(cls, classes:list, true_labels:np.ndarray, predictions:np.ndarray) -> np.ndarray:
+    def get_cond_prob_matrix(cls, classes:list, y_labels:np.ndarray, predictions:np.ndarray) -> np.ndarray:
         """ Estimate the conditional probability matrix P(yi|yj)"""
 
-        CM = confusion_matrix(true_labels, predictions, labels=classes).T
-        CM = CM.astype(np.float32)
+        CM = confusion_matrix(y_labels, predictions, labels=classes).T
+        CM = CM.astype(float)
         class_counts = CM.sum(axis=0)
         for i, _ in enumerate(classes):
             if class_counts[i] == 0:
@@ -91,6 +84,6 @@ class GAC(AggregativeQuantifier):
             adjusted_prevalences = np.linalg.solve(A, B)
             adjusted_prevalences = np.clip(adjusted_prevalences, 0, 1)
             adjusted_prevalences /= adjusted_prevalences.sum()
-        except (np.linalg.LinAlgError, ValueError):
+        except (np.linalg.LinAlgError):
             adjusted_prevalences = predicted_prevalences # No way to adjust them
         return adjusted_prevalences
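For context on what the solver code in the hunk above is doing with `A` and `B`: GAC corrects classify-and-count prevalences by solving the linear system C·p_true = p_observed, where C is the conditional probability matrix, then clipping and renormalizing; if C is singular, the uncorrected prevalences are returned. A toy sketch under that reading (the function name and the 2-class numbers are illustrative, not the package's API):

    import numpy as np

    def adjust_prevalences(cond_prob_matrix, predicted_prevalences):
        # Solve P(predicted) = C @ P(true) for P(true); fall back if C is singular
        try:
            adjusted = np.linalg.solve(cond_prob_matrix, predicted_prevalences)
            adjusted = np.clip(adjusted, 0, 1)
            adjusted /= adjusted.sum()
        except np.linalg.LinAlgError:
            adjusted = predicted_prevalences  # no way to adjust them
        return adjusted

    C = np.array([[0.9, 0.2], [0.1, 0.8]])  # column j: P(predicted = i | true = j)
    b = np.array([0.62, 0.38])              # observed classify-and-count prevalences
    print(adjust_prevalences(C, b))         # -> [0.6 0.4]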
mlquantify/methods/aggregative/gpac.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pandas as pd
 from sklearn.base import BaseEstimator
-from sklearn.model_selection import StratifiedKFold
+from sklearn.model_selection import train_test_split
 
 from .gac import GAC
 from ...base import AggregativeQuantifier
@@ -14,10 +14,12 @@ class GPAC(AggregativeQuantifier):
     """
 
 
-    def __init__(self, learner: BaseEstimator):
+    def __init__(self, learner: BaseEstimator, train_size:float=0.6, random_state:int=None):
         assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
         self.learner = learner
         self.cond_prob_matrix = None
+        self.train_size = train_size
+        self.random_state = random_state
 
     def _fit_method(self, X, y):
         # Convert X and y to DataFrames if they are numpy arrays
@@ -28,31 +30,20 @@ class GPAC(AggregativeQuantifier):
 
         if self.learner_fitted:
             # Use existing model to predict
-            predictions = self.learner.predict(X)
-            true_labels = y
+            y_pred = self.learner.predict(X)
+            y_labels = y
         else:
-            # Perform cross-validation to generate predictions
-            skf = StratifiedKFold(n_splits=self.cv_folds)
-            predictions = []
-            true_labels = []
+            X_train, X_val, y_train, y_val = train_test_split(
+                X, y, train_size=self.train_size, stratify=y, random_state=self.random_state
+            )
 
-            for train_index, valid_index in skf.split(X, y):
-                # Split data into training and validation sets
-                train_data = pd.DataFrame(X.iloc[train_index])
-                train_labels = y.iloc[train_index]
-
-                valid_data = pd.DataFrame(X.iloc[valid_index])
-                valid_labels = y.iloc[valid_index]
-
-                # Train the learner
-                self.learner.fit(train_data, train_labels)
-
-                # Predict and collect results
-                predictions.extend(self.learner.predict(valid_data))
-                true_labels.extend(valid_labels)
+            self.learner.fit(X_train, y_train)
+
+            y_labels = y_val
+            y_pred = self.learner.predict(X_val)
 
         # Compute conditional probability matrix using GAC
-        self.cond_prob_matrix = GAC.get_cond_prob_matrix(self.classes, true_labels, predictions)
+        self.cond_prob_matrix = GAC.get_cond_prob_matrix(self.classes, y_labels, y_pred)
 
         return self
 
@@ -73,15 +64,15 @@ class GPAC(AggregativeQuantifier):
         return adjusted_prevalences
 
     @classmethod
-    def get_cond_prob_matrix(cls, classes:list, true_labels:np.ndarray, predictions:np.ndarray) -> np.ndarray:
+    def get_cond_prob_matrix(cls, classes:list, y_labels:np.ndarray, y_pred:np.ndarray) -> np.ndarray:
         """Estimate the matrix where entry (i,j) is the estimate of P(yi|yj)"""
 
         n_classes = len(classes)
         cond_prob_matrix = np.eye(n_classes)
 
         for i, class_ in enumerate(classes):
-            class_indices = true_labels == class_
+            class_indices = y_labels == class_
             if class_indices.any():
-                cond_prob_matrix[i] = predictions[class_indices].mean(axis=0)
+                cond_prob_matrix[i] = y_pred[class_indices].mean(axis=0)
 
         return cond_prob_matrix.T
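Unlike GAC, which builds a hard confusion matrix from predicted labels, GPAC's version of `get_cond_prob_matrix` averages the learner's probability vectors over the examples of each true class. A worked toy example of that logic (the numbers are illustrative):

    import numpy as np

    classes = [0, 1]
    y_labels = np.array([0, 0, 1, 1])
    # Each row is a predict_proba output for one validation example
    y_pred = np.array([[0.8, 0.2],
                       [0.6, 0.4],
                       [0.3, 0.7],
                       [0.1, 0.9]])

    cond_prob_matrix = np.eye(len(classes))
    for i, class_ in enumerate(classes):
        mask = y_labels == class_
        if mask.any():
            # Average probability vector over examples whose true class is class_
            cond_prob_matrix[i] = y_pred[mask].mean(axis=0)

    print(cond_prob_matrix.T)  # entry (i, j): estimate of P(predicted i | true j)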
mlquantify/methods/aggregative/mixtureModels/_MixtureModel.py
@@ -16,7 +16,6 @@ class MixtureModel(AggregativeQuantifier):
         self.learner = learner
         self.pos_scores = None
         self.neg_scores = None
-        self.distance = None
 
     @property
     def multiclass_method(self) -> bool:
mlquantify/methods/aggregative/mixtureModels/dys.py (new file)
@@ -0,0 +1,107 @@
+import numpy as np
+from sklearn.base import BaseEstimator
+
+from ._MixtureModel import MixtureModel
+from ....utils import getHist, ternary_search
+
+class DyS(MixtureModel):
+    """Distribution y-Similarity framework. Is a
+    method that generalises the HDy approach by
+    considering the dissimilarity function DS as
+    a parameter of the model
+    """
+
+    def __init__(self, learner:BaseEstimator, measure:str="topsoe", bins_size:np.ndarray=None):
+        assert measure in ["hellinger", "topsoe", "probsymm"], "measure not valid"
+        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
+        super().__init__(learner)
+
+        # Set up bins_size
+        if not bins_size:
+            bins_size = np.append(np.linspace(2,20,10), 30)
+        if isinstance(bins_size, list):
+            bins_size = np.asarray(bins_size)
+
+        self.bins_size = bins_size
+        self.measure = measure
+        self.prevs = None # Array of prevalences that minimizes the distances
+
+
+    def _compute_prevalence(self, test_scores:np.ndarray) -> float:
+
+        prevs = self.GetMinDistancesDyS(test_scores)
+        # Use the median of the prevalences as the final prevalence estimate
+        prevalence = np.median(prevs)
+
+        return prevalence
+
+
+
+    def best_distance(self, X_test) -> float:
+
+        test_scores = self.learner.predict_proba(X_test)
+
+        prevs = self.GetMinDistancesDyS(test_scores)
+
+        size = len(prevs)
+        best_prev = np.median(prevs)
+
+        if size % 2 != 0: # ODD
+            index = np.argmax(prevs == best_prev)
+            bin_size = self.bins_size[index]
+        else: # EVEN
+            # Sort the values in self.prevs
+            ordered_prevs = np.sort(prevs)
+
+            # Find the two middle indices
+            middle1 = np.floor(size / 2).astype(int)
+            middle2 = np.ceil(size / 2).astype(int)
+
+            # Get the values corresponding to the median positions
+            median1 = ordered_prevs[middle1]
+            median2 = ordered_prevs[middle2]
+
+            # Find the indices of median1 and median2 in prevs
+            index1 = np.argmax(prevs == median1)
+            index2 = np.argmax(prevs == median2)
+
+            # Calculate the average of the corresponding bin sizes
+            bin_size = np.mean([self.bins_size[index1], self.bins_size[index2]])
+
+
+        pos_bin_density = getHist(self.pos_scores, bin_size)
+        neg_bin_density = getHist(self.neg_scores, bin_size)
+        test_bin_density = getHist(test_scores, bin_size)
+
+        train_combined_density = (pos_bin_density * best_prev) + (neg_bin_density * (1 - best_prev))
+
+        distance = self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
+
+        return distance
+
+
+    def GetMinDistancesDyS(self, test_scores) -> list:
+        # Compute prevalence by evaluating the distance metric across various bin sizes
+
+        prevs = []
+
+        # Iterate over each bin size
+        for bins in self.bins_size:
+            # Compute histogram densities for positive, negative, and test scores
+            pos_bin_density = getHist(self.pos_scores, bins)
+            neg_bin_density = getHist(self.neg_scores, bins)
+            test_bin_density = getHist(test_scores, bins)
+
+            # Define the function to minimize
+            def f(x):
+                # Combine densities using a mixture of positive and negative densities
+                train_combined_density = (pos_bin_density * x) + (neg_bin_density * (1 - x))
+                # Calculate the distance between combined density and test density
+                return self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
+
+            # Use ternary search to find the best x that minimizes the distance
+            prevs.append(ternary_search(0, 1, f))
+
+        return prevs
+
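The new dys.py leans on two helpers from `mlquantify.utils` that do not appear in this diff: `getHist` and `ternary_search`. As a rough guide to what they must provide, here is a minimal sketch assuming `getHist` returns a normalized histogram of scores on [0, 1] and `ternary_search` minimizes a unimodal function over an interval; the package's real implementations may differ in binning and stopping details:

    import numpy as np

    def getHist(scores, nbins):
        # Assumed behaviour: normalized histogram of positive-class scores on [0, 1]
        hist, _ = np.histogram(scores, bins=int(nbins), range=(0, 1))
        return hist / max(hist.sum(), 1)

    def ternary_search(left, right, f, eps=1e-4):
        # Assumed behaviour: minimize a unimodal f over [left, right]
        while right - left > eps:
            m1 = left + (right - left) / 3
            m2 = right - (right - left) / 3
            if f(m1) < f(m2):
                right = m2
            else:
                left = m1
        return (left + right) / 2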
mlquantify/methods/aggregative/mixtureModels/dys_syn.py
@@ -34,6 +34,7 @@ class DySsyn(MixtureModel):
         self.m = None
 
 
+
     def _fit_method(self, X, y):
         if not self.learner_fitted:
             self.learner.fit(X, y)
@@ -45,16 +46,41 @@ class DySsyn(MixtureModel):
 
 
     def _compute_prevalence(self, test_scores:np.ndarray) -> float: #creating bins from 10 to 110 with step size 10
+
+        distances = self.GetMinDistancesDySsyn(test_scores)
+
+        # Use the median of the prevss as the final prevalence estimate
+        index = min(distances, key=lambda d: distances[d][0])
+        prevalence = distances[index][1]
+
+        return prevalence
+
+
+    def best_distance(self, X_test):
+
+        test_scores = self.learner.predict_proba(X_test)
+
+        distances = self.GetMinDistancesDySsyn(test_scores)
+
+        index = min(distances, key=lambda d: distances[d][0])
+
+        distance = distances[index][0]
+
+        return distance
+
+
+
+    def GetMinDistancesDySsyn(self, test_scores) -> list:
         # Compute prevalence by evaluating the distance metric across various bin sizes
         if self.n is None:
             self.n = len(test_scores)
 
-        distances = {}
+        values = {}
 
         # Iterate over each bin size
        for m in self.merge_factor:
             pos_scores, neg_scores = MoSS(self.n, self.alpha_train, m)
-            result = []
+            prevs = []
             for bins in self.bins_size:
                 # Compute histogram densities for positive, negative, and test scores
                 pos_bin_density = getHist(pos_scores, bins)
@@ -69,21 +95,42 @@ class DySsyn(MixtureModel):
                     return self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
 
                 # Use ternary search to find the best x that minimizes the distance
-                result.append(ternary_search(0, 1, f))
-            prevalence = np.median(result)
+                prevs.append(ternary_search(0, 1, f))
+
+            size = len(prevs)
+            best_prev = np.median(prevs)
+
+            if size % 2 != 0: # ODD
+                index = np.argmax(prevs == best_prev)
+                bin_size = self.bins_size[index]
+            else: # EVEN
+                # Sort the values in self.prevs
+                ordered_prevs = np.sort(prevs)
+
+                # Find the two middle indices
+                middle1 = np.floor(size / 2).astype(int)
+                middle2 = np.ceil(size / 2).astype(int)
+
+                # Get the values corresponding to the median positions
+                median1 = ordered_prevs[middle1]
+                median2 = ordered_prevs[middle2]
+
+                # Find the indices of median1 and median2 in prevs
+                index1 = np.argmax(prevs == median1)
+                index2 = np.argmax(prevs == median2)
+
+                # Calculate the average of the corresponding bin sizes
+                bin_size = np.mean([self.bins_size[index1], self.bins_size[index2]])
+
 
-            bins_size = self.bins_size[result == prevalence][0]
+            pos_bin_density = getHist(pos_scores, bin_size)
+            neg_bin_density = getHist(neg_scores, bin_size)
+            test_bin_density = getHist(test_scores, bin_size)
 
-            pos_bin_density = getHist(pos_scores, bins_size)
-            neg_bin_density = getHist(neg_scores, bins_size)
-            test_bin_density = getHist(test_scores, bins_size)
+            train_combined_density = (pos_bin_density * best_prev) + (neg_bin_density * (1 - best_prev))
 
-            train_combined_density = (pos_bin_density * prevalence) + (neg_bin_density * (1 - prevalence))
-            d = self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
-            distances[m] = (d, prevalence)
-        # Use the median of the results as the final prevalence estimate
-        index = min(distances, key=lambda d: distances[d][0])
-        prevalence = distances[index][1]
+            distance = self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
 
-        return prevalence
-
+            values[m] = (distance, best_prev)
+
+        return values
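After this refactor, `GetMinDistancesDySsyn` returns a dict keyed by merge factor `m` (despite the `-> list` annotation), and both `_compute_prevalence` and `best_distance` reduce it the same way: pick the entry with the smallest distance. In outline, with illustrative numbers:

    # Shape of the return value: {m: (distance, best_prev)}
    values = {0.1: (0.042, 0.31), 0.3: (0.017, 0.28), 0.5: (0.055, 0.35)}

    best_m = min(values, key=lambda m: values[m][0])
    prevalence = values[best_m][1]   # _compute_prevalence returns this -> 0.28
    distance = values[best_m][0]     # best_distance returns this       -> 0.017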
mlquantify/methods/aggregative/mixtureModels/hdy.py
@@ -14,15 +14,54 @@ class HDy(MixtureModel):
     def __init__(self, learner: BaseEstimator):
         assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
         super().__init__(learner)
+
 
     def _compute_prevalence(self, test_scores: np.ndarray) -> float:
+
+        best_alphas, _ = self.GetMinDistancesHDy(test_scores)
+        # Compute the median of the best alpha values as the final prevalence estimate
+        prevalence = np.median(best_alphas)
+
+        return prevalence
+
+
+
+    def best_distance(self, X_test) -> float:
+
+        test_scores = self.learner.predict_proba(X_test)
+
+        _, distances = self.GetMinDistancesHDy(test_scores)
+
+        size = len(distances)
+
+        if size % 2 != 0: # ODD
+            index = size // 2
+            distance = distances[index]
+        else: # EVEN
+            # Find the two middle indices
+            middle1 = np.floor(size / 2).astype(int)
+            middle2 = np.ceil(size / 2).astype(int)
+
+            # Get the values corresponding to the median positions
+            dist1 = distances[middle1]
+            dist2 = distances[middle2]
+
+            # Calculate the average of the corresponding distances
+            distance = np.mean([dist1, dist2])
+
+        return distance
+
+
+    def GetMinDistancesHDy(self, test_scores: np.ndarray) -> tuple:
+
         # Define bin sizes and alpha values
-        bin_size = np.arange(10, 110, 11) # Bins from 10 to 110 with a step size of 10
+        bins_size = np.arange(10, 110, 11) # Bins from 10 to 110 with a step size of 10
         alpha_values = np.round(np.linspace(0, 1, 101), 2) # Alpha values from 0 to 1, rounded to 2 decimal places
 
         best_alphas = []
-
-        for bins in bin_size:
+        distances = []
+
+        for bins in bins_size:
 
             pos_bin_density = getHist(self.pos_scores, bins)
             neg_bin_density = getHist(self.neg_scores, bins)
@@ -39,8 +78,6 @@ class HDy(MixtureModel):
 
             # Find the alpha value that minimizes the distance
             best_alphas.append(alpha_values[np.argmin(distances)])
-
-        # Compute the median of the best alpha values as the final prevalence estimate
-        prevalence = np.median(best_alphas)
+            distances.append(min(distances))
 
-        return prevalence
+        return best_alphas, distances
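The per-bin `distances` list consumed by `np.argmin` above is built in code this diff does not show; from the surrounding context, each bin's search scans the fixed alpha grid and keeps the mixture closest to the test histogram. A sketch of that inner step under those assumptions (the function name and signature are illustrative, not the package's API):

    import numpy as np

    alpha_values = np.round(np.linspace(0, 1, 101), 2)

    def best_alpha_for_bin(pos_density, neg_density, test_density, get_distance):
        # Distance of each candidate mixture alpha*pos + (1-alpha)*neg to the test histogram
        distances = [
            get_distance(alpha * pos_density + (1 - alpha) * neg_density, test_density)
            for alpha in alpha_values
        ]
        return alpha_values[np.argmin(distances)], min(distances)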
mlquantify.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mlquantify
-Version: 0.0.11.4
+Version: 0.0.11.6
 Summary: Quantification Library
 Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
 Maintainer: Luiz Fernando Luth Junior
@@ -32,7 +32,7 @@ ___
 
 ## Latest Release
 
-- **Version 0.0.1**: Inicial beta version. For a detailed list of changes, check the [changelog](#).
+- **Version 0.0.11.6**: Inicial beta version. For a detailed list of changes, check the [changelog](#).
 - In case you need any help, refer to the [wiki](https://github.com/luizfernandolj/mlquantify/wiki).
 - Explore the [API documentation](#) for detailed developer information.
 - See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)
setup.py
@@ -6,7 +6,7 @@ here = pathlib.Path(__file__).parent.resolve()
 
 long_description = (here / 'README.md').read_text(encoding='utf-8')
 
-VERSION = '0.0.11.4'
+VERSION = '0.0.11.6'
 DESCRIPTION = 'Quantification Library'
 
 # Setting up
mlquantify-0.0.11.4/mlquantify/methods/aggregative/mixtureModels/dys.py (file removed; superseded by the new dys.py above)
@@ -1,55 +0,0 @@
-import numpy as np
-from sklearn.base import BaseEstimator
-
-from ._MixtureModel import MixtureModel
-from ....utils import getHist, ternary_search
-
-class DyS(MixtureModel):
-    """Distribution y-Similarity framework. Is a
-    method that generalises the HDy approach by
-    considering the dissimilarity function DS as
-    a parameter of the model
-    """
-
-    def __init__(self, learner:BaseEstimator, measure:str="topsoe", bins_size:np.ndarray=None):
-        assert measure in ["hellinger", "topsoe", "probsymm"], "measure not valid"
-        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
-        super().__init__(learner)
-
-        # Set up bins_size
-        if not bins_size:
-            bins_size = np.append(np.linspace(2,20,10), 30)
-        if isinstance(bins_size, list):
-            bins_size = np.asarray(bins_size)
-
-        self.bins_size = bins_size
-        self.measure = measure
-
-
-    def _compute_prevalence(self, test_scores:np.ndarray) -> float: #creating bins from 10 to 110 with step size 10
-        # Compute prevalence by evaluating the distance metric across various bin sizes
-
-        result = []
-
-        # Iterate over each bin size
-        for bins in self.bins_size:
-            # Compute histogram densities for positive, negative, and test scores
-            pos_bin_density = getHist(self.pos_scores, bins)
-            neg_bin_density = getHist(self.neg_scores, bins)
-            test_bin_density = getHist(test_scores, bins)
-
-            # Define the function to minimize
-            def f(x):
-                # Combine densities using a mixture of positive and negative densities
-                train_combined_density = (pos_bin_density * x) + (neg_bin_density * (1 - x))
-                # Calculate the distance between combined density and test density
-                return self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
-
-            # Use ternary search to find the best x that minimizes the distance
-            result.append(ternary_search(0, 1, f))
-
-        # Use the median of the results as the final prevalence estimate
-        prevalence = np.median(result)
-
-        return prevalence
-