mlquantify 0.0.11__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. mlquantify/__init__.py +32 -6
  2. mlquantify/base.py +559 -256
  3. mlquantify/classification/__init__.py +1 -1
  4. mlquantify/classification/methods.py +160 -0
  5. mlquantify/evaluation/__init__.py +14 -2
  6. mlquantify/evaluation/measures.py +215 -0
  7. mlquantify/evaluation/protocol.py +647 -0
  8. mlquantify/methods/__init__.py +37 -40
  9. mlquantify/methods/aggregative.py +1030 -0
  10. mlquantify/methods/meta.py +472 -0
  11. mlquantify/methods/mixture_models.py +1003 -0
  12. mlquantify/methods/non_aggregative.py +136 -0
  13. mlquantify/methods/threshold_optimization.py +959 -0
  14. mlquantify/model_selection.py +377 -232
  15. mlquantify/plots.py +367 -0
  16. mlquantify/utils/__init__.py +2 -2
  17. mlquantify/utils/general.py +334 -0
  18. mlquantify/utils/method.py +449 -0
  19. {mlquantify-0.0.11.dist-info → mlquantify-0.1.0.dist-info}/METADATA +137 -126
  20. mlquantify-0.1.0.dist-info/RECORD +22 -0
  21. {mlquantify-0.0.11.dist-info → mlquantify-0.1.0.dist-info}/WHEEL +1 -1
  22. mlquantify/classification/pwkclf.py +0 -73
  23. mlquantify/evaluation/measures/__init__.py +0 -26
  24. mlquantify/evaluation/measures/ae.py +0 -11
  25. mlquantify/evaluation/measures/bias.py +0 -16
  26. mlquantify/evaluation/measures/kld.py +0 -8
  27. mlquantify/evaluation/measures/mse.py +0 -12
  28. mlquantify/evaluation/measures/nae.py +0 -16
  29. mlquantify/evaluation/measures/nkld.py +0 -13
  30. mlquantify/evaluation/measures/nrae.py +0 -16
  31. mlquantify/evaluation/measures/rae.py +0 -12
  32. mlquantify/evaluation/measures/se.py +0 -12
  33. mlquantify/evaluation/protocol/_Protocol.py +0 -202
  34. mlquantify/evaluation/protocol/__init__.py +0 -2
  35. mlquantify/evaluation/protocol/app.py +0 -146
  36. mlquantify/evaluation/protocol/npp.py +0 -34
  37. mlquantify/methods/aggregative/ThreholdOptm/_ThreholdOptimization.py +0 -62
  38. mlquantify/methods/aggregative/ThreholdOptm/__init__.py +0 -7
  39. mlquantify/methods/aggregative/ThreholdOptm/acc.py +0 -27
  40. mlquantify/methods/aggregative/ThreholdOptm/max.py +0 -23
  41. mlquantify/methods/aggregative/ThreholdOptm/ms.py +0 -21
  42. mlquantify/methods/aggregative/ThreholdOptm/ms2.py +0 -25
  43. mlquantify/methods/aggregative/ThreholdOptm/pacc.py +0 -41
  44. mlquantify/methods/aggregative/ThreholdOptm/t50.py +0 -21
  45. mlquantify/methods/aggregative/ThreholdOptm/x.py +0 -23
  46. mlquantify/methods/aggregative/__init__.py +0 -9
  47. mlquantify/methods/aggregative/cc.py +0 -32
  48. mlquantify/methods/aggregative/emq.py +0 -86
  49. mlquantify/methods/aggregative/fm.py +0 -72
  50. mlquantify/methods/aggregative/gac.py +0 -96
  51. mlquantify/methods/aggregative/gpac.py +0 -87
  52. mlquantify/methods/aggregative/mixtureModels/_MixtureModel.py +0 -81
  53. mlquantify/methods/aggregative/mixtureModels/__init__.py +0 -5
  54. mlquantify/methods/aggregative/mixtureModels/dys.py +0 -55
  55. mlquantify/methods/aggregative/mixtureModels/dys_syn.py +0 -89
  56. mlquantify/methods/aggregative/mixtureModels/hdy.py +0 -46
  57. mlquantify/methods/aggregative/mixtureModels/smm.py +0 -27
  58. mlquantify/methods/aggregative/mixtureModels/sord.py +0 -77
  59. mlquantify/methods/aggregative/pcc.py +0 -33
  60. mlquantify/methods/aggregative/pwk.py +0 -38
  61. mlquantify/methods/meta/__init__.py +0 -1
  62. mlquantify/methods/meta/ensemble.py +0 -236
  63. mlquantify/methods/non_aggregative/__init__.py +0 -1
  64. mlquantify/methods/non_aggregative/hdx.py +0 -71
  65. mlquantify/plots/__init__.py +0 -2
  66. mlquantify/plots/distribution_plot.py +0 -109
  67. mlquantify/plots/protocol_plot.py +0 -157
  68. mlquantify/utils/general_purposes/__init__.py +0 -8
  69. mlquantify/utils/general_purposes/convert_col_to_array.py +0 -13
  70. mlquantify/utils/general_purposes/generate_artificial_indexes.py +0 -29
  71. mlquantify/utils/general_purposes/get_real_prev.py +0 -9
  72. mlquantify/utils/general_purposes/load_quantifier.py +0 -4
  73. mlquantify/utils/general_purposes/make_prevs.py +0 -23
  74. mlquantify/utils/general_purposes/normalize.py +0 -20
  75. mlquantify/utils/general_purposes/parallel.py +0 -10
  76. mlquantify/utils/general_purposes/round_protocol_df.py +0 -14
  77. mlquantify/utils/method_purposes/__init__.py +0 -6
  78. mlquantify/utils/method_purposes/distances.py +0 -21
  79. mlquantify/utils/method_purposes/getHist.py +0 -13
  80. mlquantify/utils/method_purposes/get_scores.py +0 -33
  81. mlquantify/utils/method_purposes/moss.py +0 -16
  82. mlquantify/utils/method_purposes/ternary_search.py +0 -14
  83. mlquantify/utils/method_purposes/tprfpr.py +0 -42
  84. mlquantify-0.0.11.dist-info/RECORD +0 -73
  85. {mlquantify-0.0.11.dist-info → mlquantify-0.1.0.dist-info}/top_level.txt +0 -0
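The per-method files listed above (one file per quantifier) were consolidated in 0.1.0 into single modules such as mlquantify/methods/aggregative.py and mlquantify/methods/mixture_models.py. A rough sketch of what imports against 0.1.0 would presumably look like, assuming the class names below are re-exported from the consolidated modules (check mlquantify/methods/__init__.py in the new wheel to confirm); the 0.0.11 per-method files they replace appear in the removal hunks below:

# Hypothetical 0.1.0 import paths; not verified against the new wheel.
from mlquantify.methods.aggregative import CC, EMQ, FM, GAC, GPAC
from mlquantify.methods.mixture_models import DyS, HDy, SMM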
mlquantify/methods/aggregative/cc.py
@@ -1,32 +0,0 @@
- import numpy as np
- from sklearn.base import BaseEstimator
- from ...base import AggregativeQuantifier
-
- class CC(AggregativeQuantifier):
-     """Classify and Count. The simplest quantification method
-     involves classifying each instance and then counting the
-     number of instances assigned to each class to estimate
-     the class prevalence.
-     """
-
-     def __init__(self, learner: BaseEstimator):
-         assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
-         self.learner = learner
-
-
-     def _fit_method(self, X, y):
-         if not self.learner_fitted:
-             self.learner.fit(X, y)
-         return self
-
-
-     def _predict_method(self, X) -> dict:
-         predicted_labels = self.learner.predict(X)
-
-         # Count occurrences of each class in the predictions
-         class_counts = np.array([np.count_nonzero(predicted_labels == _class) for _class in self.classes])
-
-         # Calculate the prevalence of each class
-         prevalences = class_counts / len(predicted_labels)
-
-         return prevalences
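The classify-and-count estimate is just a normalized count of predicted labels. A minimal standalone sketch of the same idea, using any fitted scikit-learn classifier (the helper name is hypothetical, not part of mlquantify):

import numpy as np

def classify_and_count(clf, X_test, classes):
    # Predict a label per instance, then normalize per-class counts into prevalences.
    preds = clf.predict(X_test)
    counts = np.array([np.count_nonzero(preds == c) for c in classes])
    return counts / len(preds)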
mlquantify/methods/aggregative/emq.py
@@ -1,86 +0,0 @@
- import numpy as np
- import pandas as pd
- from sklearn.base import BaseEstimator
-
- from ...base import AggregativeQuantifier
-
- class EMQ(AggregativeQuantifier):
-     """Expectation Maximisation Quantifier. A method that
-     adjusts the prior and posterior probabilities of a learner.
-     """
-
-     MAX_ITER = 1000
-     EPSILON = 1e-6
-
-     def __init__(self, learner: BaseEstimator):
-         assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
-         self.learner = learner
-         self.priors = None
-
-     def _fit_method(self, X, y):
-
-         if not self.learner_fitted:
-             self.learner.fit(X, y)
-
-         counts = np.array([np.count_nonzero(y == _class) for _class in self.classes])
-         self.priors = counts / len(y)
-
-         return self
-
-     def _predict_method(self, X) -> dict:
-
-         posteriors = self.learner.predict_proba(X)
-         prevalences, _ = self.EM(self.priors, posteriors)
-
-         return prevalences
-
-
-     def predict_proba(self, X, epsilon:float=EPSILON, max_iter:int=MAX_ITER) -> np.ndarray:
-         posteriors = self.learner.predict_proba(X)
-         _, posteriors = self.EM(self.priors, posteriors, epsilon, max_iter)
-         return posteriors
-
-
-     @classmethod
-     def EM(cls, priors, posteriors, epsilon=EPSILON, max_iter=MAX_ITER):
-         """Expectation Maximization function. At each iteration step, both
-         the a posteriori and the a priori probabilities are re-estimated
-         sequentially for each new observation and each class. The iterative
-         procedure proceeds until the estimated probabilities converge.
-
-         Args:
-             priors (array-like): prior probabilities of the training set.
-             posteriors (array-like): posterior probabilities of the test set.
-             epsilon (float): tolerance used to detect convergence.
-             max_iter (int): maximum number of iterations.
-
-         Returns:
-             the predicted prevalence and the adjusted posteriors.
-         """
-
-         Px = posteriors
-         prev_prevalence = np.copy(priors)
-         running_estimate = np.copy(prev_prevalence)  # Initialized with the training prevalence
-
-         iteration, converged = 0, False
-         previous_estimate = None
-
-         while not converged and iteration < max_iter:
-             # E-step: ps is P(y|xi)
-             posteriors_unnormalized = (running_estimate / prev_prevalence) * Px
-             posteriors = posteriors_unnormalized / posteriors_unnormalized.sum(axis=1, keepdims=True)
-
-             # M-step:
-             running_estimate = posteriors.mean(axis=0)
-
-             if previous_estimate is not None and np.mean(np.abs(running_estimate - previous_estimate)) < epsilon and iteration > 10:
-                 converged = True
-
-             previous_estimate = running_estimate
-             iteration += 1
-
-         if not converged:
-             print('[Warning] The method has reached the maximum number of iterations; it might not have converged')
-
-         return running_estimate, posteriors
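The EM loop re-weights each test posterior by the ratio of the current prevalence estimate to the training priors, renormalizes, and averages. A compact re-implementation of that loop as a sketch (not the removed class itself; the convergence rule is simplified):

import numpy as np

def em_adjust(train_priors, test_posteriors, epsilon=1e-6, max_iter=1000):
    priors = np.asarray(train_priors, dtype=float)
    P = np.asarray(test_posteriors, dtype=float)
    estimate = priors.copy()
    for _ in range(max_iter):
        # E-step: re-weight posteriors by estimate/priors and renormalize per row.
        weighted = (estimate / priors) * P
        posteriors = weighted / weighted.sum(axis=1, keepdims=True)
        # M-step: the new prevalence estimate is the mean adjusted posterior.
        new_estimate = posteriors.mean(axis=0)
        if np.mean(np.abs(new_estimate - estimate)) < epsilon:
            return new_estimate, posteriors
        estimate = new_estimate
    return estimate, posteriors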
mlquantify/methods/aggregative/fm.py
@@ -1,72 +0,0 @@
- import numpy as np
- from sklearn.base import BaseEstimator
- from scipy.optimize import minimize
-
- from ...base import AggregativeQuantifier
- from ...utils import get_scores
-
- class FM(AggregativeQuantifier):
-     """The Friedman Method. Similar to GPAC,
-     but instead of averaging the confidence scores
-     from probabilistic classifiers, it uses the proportion
-     of confidence scores that are higher or lower than the
-     expected class frequencies found in the training data.
-     """
-
-
-     def __init__(self, learner: BaseEstimator):
-         assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
-         self.learner = learner
-         self.CM = None
-
-     def _fit_method(self, X, y):
-         # Get predicted labels and probabilities using cross-validation
-         y_labels, probabilities = get_scores(X, y, self.learner, self.cv_folds, self.learner_fitted)
-
-         # Fit the learner if it hasn't been fitted already
-         if not self.learner_fitted:
-             self.learner.fit(X, y)
-
-         # Initialize the confusion matrix
-         CM = np.zeros((self.n_class, self.n_class))
-
-         # Calculate the class priors
-         class_counts = np.array([np.count_nonzero(y_labels == _class) for _class in self.classes])
-         self.priors = class_counts / len(y_labels)
-
-         # Populate the confusion matrix
-         for i, _class in enumerate(self.classes):
-             indices = np.where(y_labels == _class)[0]
-             CM[:, i] = np.sum(probabilities[indices] > self.priors, axis=0)
-
-         # Normalize the confusion matrix by class counts
-         self.CM = CM / class_counts
-
-         return self
-
-     def _predict_method(self, X) -> dict:
-         posteriors = self.learner.predict_proba(X)
-
-         # Calculate the estimated prevalences in the test set
-         prevs_estim = np.sum(posteriors > self.priors, axis=0) / posteriors.shape[0]
-         # Define the objective function for optimization
-         def objective(prevs_pred):
-             return np.linalg.norm(self.CM @ prevs_pred - prevs_estim)
-
-         # Constraints for the optimization problem
-         constraints = [{'type': 'eq', 'fun': lambda prevs_pred: np.sum(prevs_pred) - 1.0},
-                        {'type': 'ineq', 'fun': lambda prevs_pred: prevs_pred}]
-
-         # Initial guess for the optimization
-         initial_guess = np.ones(self.CM.shape[1]) / self.CM.shape[1]
-
-         # Solve the optimization problem
-         result = minimize(objective, initial_guess, constraints=constraints, bounds=[(0, 1)]*self.CM.shape[1])
-
-         if result.success:
-             prevalences = result.x
-         else:
-             print("Optimization did not converge")
-             prevalences = self.priors
-
-         return prevalences
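The adjustment step is a constrained least-squares problem on the probability simplex: find p minimizing ||CM·p − q||, with p summing to 1. A minimal sketch of that step with scipy (the helper name is hypothetical):

import numpy as np
from scipy.optimize import minimize

def solve_prevalence(CM, q):
    # Minimize ||CM @ p - q|| subject to sum(p) == 1 and 0 <= p_i <= 1.
    k = CM.shape[1]
    constraints = [{'type': 'eq', 'fun': lambda p: p.sum() - 1.0}]
    result = minimize(lambda p: np.linalg.norm(CM @ p - q),
                      np.full(k, 1.0 / k),
                      bounds=[(0.0, 1.0)] * k,
                      constraints=constraints)
    return result.x if result.success else np.full(k, 1.0 / k)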
mlquantify/methods/aggregative/gac.py
@@ -1,96 +0,0 @@
- import numpy as np
- import pandas as pd
- from sklearn.base import BaseEstimator
- from sklearn.metrics import confusion_matrix
- from sklearn.model_selection import StratifiedKFold
-
- from ...base import AggregativeQuantifier
-
-
- class GAC(AggregativeQuantifier):
-     """Generalized Adjusted Count. It applies a
-     classifier to build a system of linear equations,
-     and solves it via constrained least-squares regression.
-     """
-
-     def __init__(self, learner: BaseEstimator):
-         assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
-         self.learner = learner
-         self.cond_prob_matrix = None
-
-     def _fit_method(self, X, y):
-         # Ensure X and y are DataFrames
-         if isinstance(X, np.ndarray):
-             X = pd.DataFrame(X)
-         if isinstance(y, np.ndarray):
-             y = pd.Series(y)
-
-         if self.learner_fitted:
-             y_pred = self.learner.predict(X)
-             y_label = y
-         else:
-             # Cross-validation for generating predictions
-             skf = StratifiedKFold(n_splits=self.cv_folds)
-             y_pred = []
-             y_label = []
-
-             for train_index, valid_index in skf.split(X, y):
-
-                 train_data = pd.DataFrame(X.iloc[train_index])
-                 train_label = y.iloc[train_index]
-
-                 valid_data = pd.DataFrame(X.iloc[valid_index])
-                 valid_label = y.iloc[valid_index]
-
-                 self.learner.fit(train_data, train_label)
-
-                 y_pred.extend(self.learner.predict(valid_data))
-                 y_label.extend(valid_label)
-
-         # Compute conditional probability matrix
-         self.cond_prob_matrix = self.get_cond_prob_matrix(self.classes, y, y_pred)
-
-         return self
-
-     def _predict_method(self, X) -> dict:
-         # Predict class labels for the test data
-         y_pred = self.learner.predict(X)
-
-         # Distribution of predictions in the test set
-         _, counts = np.unique(y_pred, return_counts=True)
-         predicted_prevalences = counts / counts.sum()
-
-         # Adjust prevalences based on conditional probability matrix
-         adjusted_prevalences = self.solve_adjustment(self.cond_prob_matrix, predicted_prevalences)
-
-         return adjusted_prevalences
-
-     @classmethod
-     def get_cond_prob_matrix(cls, classes:list, true_labels:np.ndarray, predictions:np.ndarray) -> np.ndarray:
-         """Estimate the conditional probability matrix P(yi|yj)"""
-
-         CM = confusion_matrix(true_labels, predictions, labels=classes).T
-         CM = CM.astype(np.float32)
-         class_counts = CM.sum(axis=0)
-         for i, _ in enumerate(classes):
-             if class_counts[i] == 0:
-                 CM[i, i] = 1
-             else:
-                 CM[:, i] /= class_counts[i]
-         return CM
-
-     @classmethod
-     def solve_adjustment(cls, cond_prob_matrix, predicted_prevalences):
-         """Solve the linear system Ax = B with A=cond_prob_matrix and B=predicted_prevalences"""
-
-         A = cond_prob_matrix
-         B = predicted_prevalences
-         try:
-             adjusted_prevalences = np.linalg.solve(A, B)
-             adjusted_prevalences = np.clip(adjusted_prevalences, 0, 1)
-             adjusted_prevalences /= adjusted_prevalences.sum()
-         except (np.linalg.LinAlgError, ValueError):
-             adjusted_prevalences = predicted_prevalences  # No way to adjust them
-         return adjusted_prevalences
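solve_adjustment undoes the misclassification mixing: the observed prediction rates q satisfy q = A·p, where A[i, j] = P(predicted i | true j). A tiny worked example with made-up numbers:

import numpy as np

A = np.array([[0.9, 0.2],   # P(pred 0 | true 0), P(pred 0 | true 1)
              [0.1, 0.8]])  # P(pred 1 | true 0), P(pred 1 | true 1)
q = np.array([0.55, 0.45])  # observed prediction rates on the test set

p = np.linalg.solve(A, q)   # raw estimate of the true prevalences
p = np.clip(p, 0, 1)
p /= p.sum()                # -> array([0.5, 0.5])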
mlquantify/methods/aggregative/gpac.py
@@ -1,87 +0,0 @@
- import numpy as np
- import pandas as pd
- from sklearn.base import BaseEstimator
- from sklearn.model_selection import StratifiedKFold
-
- from .gac import GAC
- from ...base import AggregativeQuantifier
-
- class GPAC(AggregativeQuantifier):
-     """Generalized Probabilistic Adjusted Count. Like
-     GAC, it also builds a system of linear equations, but
-     utilizes the confidence scores from probabilistic
-     classifiers as in the PAC method.
-     """
-
-
-     def __init__(self, learner: BaseEstimator):
-         assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
-         self.learner = learner
-         self.cond_prob_matrix = None
-
-     def _fit_method(self, X, y):
-         # Convert X and y to DataFrames if they are numpy arrays
-         if isinstance(X, np.ndarray):
-             X = pd.DataFrame(X)
-         if isinstance(y, np.ndarray):
-             y = pd.DataFrame(y)
-
-         if self.learner_fitted:
-             # Use existing model to predict
-             predictions = self.learner.predict(X)
-             true_labels = y
-         else:
-             # Perform cross-validation to generate predictions
-             skf = StratifiedKFold(n_splits=self.cv_folds)
-             predictions = []
-             true_labels = []
-
-             for train_index, valid_index in skf.split(X, y):
-                 # Split data into training and validation sets
-                 train_data = pd.DataFrame(X.iloc[train_index])
-                 train_labels = y.iloc[train_index]
-
-                 valid_data = pd.DataFrame(X.iloc[valid_index])
-                 valid_labels = y.iloc[valid_index]
-
-                 # Train the learner
-                 self.learner.fit(train_data, train_labels)
-
-                 # Predict and collect results
-                 predictions.extend(self.learner.predict(valid_data))
-                 true_labels.extend(valid_labels)
-
-         # Compute conditional probability matrix using GAC
-         self.cond_prob_matrix = GAC.get_cond_prob_matrix(self.classes, true_labels, predictions)
-
-         return self
-
-     def _predict_method(self, X) -> dict:
-         # Predict class labels for the test data
-         predictions = self.learner.predict(X)
-
-         # Calculate the distribution of predictions in the test set
-         predicted_prevalences = np.zeros(self.n_class)
-         _, counts = np.unique(predictions, return_counts=True)
-         predicted_prevalences[:len(counts)] = counts
-         predicted_prevalences = predicted_prevalences / predicted_prevalences.sum()
-
-         # Adjust prevalences based on the conditional probability matrix from GAC
-         adjusted_prevalences = GAC.solve_adjustment(self.cond_prob_matrix, predicted_prevalences)
-
-         # Map class labels to their corresponding prevalences
-         return adjusted_prevalences
-
-     @classmethod
-     def get_cond_prob_matrix(cls, classes:list, true_labels:np.ndarray, predictions:np.ndarray) -> np.ndarray:
-         """Estimate the matrix where entry (i,j) is the estimate of P(yi|yj)"""
-
-         n_classes = len(classes)
-         cond_prob_matrix = np.eye(n_classes)
-
-         for i, class_ in enumerate(classes):
-             class_indices = true_labels == class_
-             if class_indices.any():
-                 cond_prob_matrix[i] = predictions[class_indices].mean(axis=0)
-
-         return cond_prob_matrix.T
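GPAC's conditioning matrix replaces hard confusion counts with the mean predicted-probability vector per true class. A toy example with made-up posteriors:

import numpy as np

probas = np.array([[0.9, 0.1], [0.8, 0.2], [0.7, 0.3],   # three instances of class 0
                   [0.2, 0.8], [0.3, 0.7]])              # two instances of class 1
y_true = np.array([0, 0, 0, 1, 1])

M = np.eye(2)
for i, c in enumerate([0, 1]):
    M[i] = probas[y_true == c].mean(axis=0)  # mean posterior vector per true class
cond_prob_matrix = M.T                       # -> [[0.8, 0.25], [0.2, 0.75]]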
mlquantify/methods/aggregative/mixtureModels/_MixtureModel.py
@@ -1,81 +0,0 @@
- from abc import abstractmethod
- import numpy as np
- from sklearn.base import BaseEstimator
-
- from ....base import AggregativeQuantifier
- from ....utils import probsymm, sqEuclidean, topsoe, hellinger, get_scores
-
- class MixtureModel(AggregativeQuantifier):
-     """Generic class for the Mixture Model methods, which are
-     based on the assumption that the cumulative distribution of
-     the scores assigned to data points in the test set is a
-     mixture of the score distributions in the training data.
-     """
-
-     def __init__(self, learner: BaseEstimator):
-         self.learner = learner
-         self.pos_scores = None
-         self.neg_scores = None
-         self.distance = None
-
-     @property
-     def multiclass_method(self) -> bool:
-         return False
-
-     def _fit_method(self, X, y):
-         # Compute scores with cross validation and fit the learner if not already fitted
-         y_label, probabilities = get_scores(X, y, self.learner, self.cv_folds, self.learner_fitted)
-
-         # Separate positive and negative scores based on labels
-         self.pos_scores = probabilities[y_label == self.classes[1]][:, 1]
-         self.neg_scores = probabilities[y_label == self.classes[0]][:, 1]
-
-         return self
-
-     def _predict_method(self, X) -> dict:
-         prevalences = {}
-
-         # Get the predicted probabilities for the positive class
-         test_scores = self.learner.predict_proba(X)[:, 1]
-
-         # Compute the prevalence using the provided measure
-         prevalence = np.clip(self._compute_prevalence(test_scores), 0, 1)
-
-         # Clip the prevalence to be within the [0, 1] range and compute the complement for the other class
-         prevalences = np.asarray([1 - prevalence, prevalence])
-
-         return prevalences
-
-     @abstractmethod
-     def _compute_prevalence(self, test_scores: np.ndarray) -> float:
-         """Abstract method for computing the prevalence using the test scores"""
-         ...
-
-     def get_distance(self, dist_train, dist_test, measure: str) -> float:
-         """Compute the distance between training and test distributions using the specified metric"""
-
-         # Check if any vector is too small or if they have different lengths
-         if np.sum(dist_train) < 1e-20 or np.sum(dist_test) < 1e-20:
-             raise ValueError("One or both vectors are zero (empty)...")
-         if len(dist_train) != len(dist_test):
-             raise ValueError("Arrays need to be of equal size...")
-
-         # Convert distributions to numpy arrays for efficient computation
-         dist_train = np.array(dist_train, dtype=float)
-         dist_test = np.array(dist_test, dtype=float)
-
-         # Avoid division by zero by correcting zero values
-         dist_train[dist_train < 1e-20] = 1e-20
-         dist_test[dist_test < 1e-20] = 1e-20
-
-         # Compute and return the distance based on the selected metric
-         if measure == 'topsoe':
-             return topsoe(dist_train, dist_test)
-         elif measure == 'probsymm':
-             return probsymm(dist_train, dist_test)
-         elif measure == 'hellinger':
-             return hellinger(dist_train, dist_test)
-         elif measure == 'euclidean':
-             return sqEuclidean(dist_train, dist_test)
-         else:
-             return 100  # Default value if an unknown measure is provided
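The dissimilarity helpers (topsoe, hellinger, probsymm, sqEuclidean) live in mlquantify.utils and are not shown in this diff. The sketches below use the textbook definitions of two of them and may differ from the library's exact implementations:

import numpy as np

def hellinger(p, q):
    # Hellinger distance between two discrete distributions/histograms.
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)

def topsoe(p, q):
    # Topsoe divergence: a symmetric sum of two Kullback-Leibler-style terms.
    # (get_distance above already floors zero entries, so the logs are safe.)
    m = p + q
    return np.sum(p * np.log(2 * p / m) + q * np.log(2 * q / m))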
mlquantify/methods/aggregative/mixtureModels/__init__.py
@@ -1,5 +0,0 @@
- from .hdy import HDy
- from .dys import DyS
- from .sord import SORD
- from .smm import SMM
- from .dys_syn import DySsyn
mlquantify/methods/aggregative/mixtureModels/dys.py
@@ -1,55 +0,0 @@
- import numpy as np
- from sklearn.base import BaseEstimator
-
- from ._MixtureModel import MixtureModel
- from ....utils import getHist, ternary_search
-
- class DyS(MixtureModel):
-     """Distribution y-Similarity framework. A method that
-     generalises the HDy approach by treating the
-     dissimilarity function DS as a parameter
-     of the model.
-     """
-
-     def __init__(self, learner:BaseEstimator, measure:str="topsoe", bins_size:np.ndarray=None):
-         assert measure in ["hellinger", "topsoe", "probsymm"], "measure not valid"
-         assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
-         super().__init__(learner)
-
-         # Set up bins_size
-         if not bins_size:
-             bins_size = np.append(np.linspace(2, 20, 10), 30)
-         if isinstance(bins_size, list):
-             bins_size = np.asarray(bins_size)
-
-         self.bins_size = bins_size
-         self.measure = measure
-
-
-     def _compute_prevalence(self, test_scores:np.ndarray) -> float:
-         # Compute prevalence by evaluating the distance metric across various bin sizes
-
-         result = []
-
-         # Iterate over each bin size
-         for bins in self.bins_size:
-             # Compute histogram densities for positive, negative, and test scores
-             pos_bin_density = getHist(self.pos_scores, bins)
-             neg_bin_density = getHist(self.neg_scores, bins)
-             test_bin_density = getHist(test_scores, bins)
-
-             # Define the function to minimize
-             def f(x):
-                 # Combine densities using a mixture of positive and negative densities
-                 train_combined_density = (pos_bin_density * x) + (neg_bin_density * (1 - x))
-                 # Calculate the distance between combined density and test density
-                 return self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
-
-             # Use ternary search to find the best x that minimizes the distance
-             result.append(ternary_search(0, 1, f))
-
-         # Use the median of the results as the final prevalence estimate
-         prevalence = np.median(result)
-
-         return prevalence
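ternary_search comes from mlquantify.utils and is not part of this diff; a standard ternary search over a unimodal function would look roughly like this (a sketch under that assumption; the library's stopping rule may differ):

def ternary_search(left, right, f, tol=1e-4):
    # Shrink [left, right] around the minimum of a unimodal function f.
    while right - left > tol:
        m1 = left + (right - left) / 3
        m2 = right - (right - left) / 3
        if f(m1) > f(m2):
            left = m1
        else:
            right = m2
    return (left + right) / 2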
mlquantify/methods/aggregative/mixtureModels/dys_syn.py
@@ -1,89 +0,0 @@
- import numpy as np
- from sklearn.base import BaseEstimator
-
- from ._MixtureModel import MixtureModel
- from ....utils import getHist, ternary_search, MoSS, get_real_prev
-
- class DySsyn(MixtureModel):
-     """Synthetic Distribution y-Similarity. This method works the
-     same as the DyS method, but instead of using the training scores,
-     it generates them via MoSS (Model for Score Simulation), which
-     generates a spectrum of score distributions from highly separated
-     scores to fully mixed scores.
-     """
-
-     def __init__(self, learner:BaseEstimator, measure:str="topsoe", merge_factor:np.ndarray=None, bins_size:np.ndarray=None, alpha_train:float=0.5, n:int=None):
-         assert measure in ["hellinger", "topsoe", "probsymm"], "measure not valid"
-         assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
-         super().__init__(learner)
-
-         # Set up bins_size
-         if not bins_size:
-             bins_size = np.append(np.linspace(2, 20, 10), 30)
-         if isinstance(bins_size, list):
-             bins_size = np.asarray(bins_size)
-
-         if not merge_factor:
-             merge_factor = np.linspace(0.1, 0.4, 10)
-
-         self.bins_size = bins_size
-         self.merge_factor = merge_factor
-         self.alpha_train = alpha_train
-         self.n = n
-         self.measure = measure
-         self.m = None
-
-
-     def _fit_method(self, X, y):
-         if not self.learner_fitted:
-             self.learner.fit(X, y)
-
-         self.alpha_train = list(get_real_prev(y).values())[1]
-
-         return self
-
-
-     def _compute_prevalence(self, test_scores:np.ndarray) -> float:
-         # Compute prevalence by evaluating the distance metric across various bin sizes
-         if self.n is None:
-             self.n = len(test_scores)
-
-         distances = {}
-
-         # Iterate over each merge factor
-         for m in self.merge_factor:
-             pos_scores, neg_scores = MoSS(self.n, self.alpha_train, m)
-             result = []
-             for bins in self.bins_size:
-                 # Compute histogram densities for positive, negative, and test scores
-                 pos_bin_density = getHist(pos_scores, bins)
-                 neg_bin_density = getHist(neg_scores, bins)
-                 test_bin_density = getHist(test_scores, bins)
-
-                 # Define the function to minimize
-                 def f(x):
-                     # Combine densities using a mixture of positive and negative densities
-                     train_combined_density = (pos_bin_density * x) + (neg_bin_density * (1 - x))
-                     # Calculate the distance between combined density and test density
-                     return self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
-
-                 # Use ternary search to find the best x that minimizes the distance
-                 result.append(ternary_search(0, 1, f))
-             prevalence = np.median(result)
-
-             bins_size = self.bins_size[result == prevalence][0]
-
-             pos_bin_density = getHist(pos_scores, bins_size)
-             neg_bin_density = getHist(neg_scores, bins_size)
-             test_bin_density = getHist(test_scores, bins_size)
-
-             train_combined_density = (pos_bin_density * prevalence) + (neg_bin_density * (1 - prevalence))
-             d = self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
-             distances[m] = (d, prevalence)
-
-         # Keep the prevalence whose synthetic mixture lies closest to the test distribution
-         index = min(distances, key=lambda d: distances[d][0])
-         prevalence = distances[index][1]
-
-         return prevalence
1
- import numpy as np
2
- from sklearn.base import BaseEstimator
3
-
4
- from ._MixtureModel import MixtureModel
5
- from ....utils import getHist
6
-
7
- class HDy(MixtureModel):
8
- """Hellinger Distance Minimization. The method
9
- is based on computing the hellinger distance of
10
- two distributions, test distribution and the mixture
11
- of the positive and negative distribution of the train.
12
- """
13
-
14
- def __init__(self, learner: BaseEstimator):
15
- assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
16
- super().__init__(learner)
17
-
18
- def _compute_prevalence(self, test_scores: np.ndarray) -> float:
19
- # Define bin sizes and alpha values
20
- bin_size = np.arange(10, 110, 11) # Bins from 10 to 110 with a step size of 10
21
- alpha_values = np.round(np.linspace(0, 1, 101), 2) # Alpha values from 0 to 1, rounded to 2 decimal places
22
-
23
- best_alphas = []
24
-
25
- for bins in bin_size:
26
-
27
- pos_bin_density = getHist(self.pos_scores, bins)
28
- neg_bin_density = getHist(self.neg_scores, bins)
29
- test_bin_density = getHist(test_scores, bins)
30
-
31
- distances = []
32
-
33
- # Evaluate distance for each alpha value
34
- for x in alpha_values:
35
- # Combine densities using a mixture of positive and negative densities
36
- train_combined_density = (pos_bin_density * x) + (neg_bin_density * (1 - x))
37
- # Compute the distance using the Hellinger measure
38
- distances.append(self.get_distance(train_combined_density, test_bin_density, measure="hellinger"))
39
-
40
- # Find the alpha value that minimizes the distance
41
- best_alphas.append(alpha_values[np.argmin(distances)])
42
-
43
- # Compute the median of the best alpha values as the final prevalence estimate
44
- prevalence = np.median(best_alphas)
45
-
46
- return prevalence
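Stripped of the class machinery, the HDy estimate is a grid search over the mixing weight for each candidate bin count, followed by a median. A self-contained sketch; np.histogram stands in for mlquantify.utils.getHist, whose exact normalization may differ:

import numpy as np

def hdy_estimate(pos_scores, neg_scores, test_scores, dist):
    # dist is any histogram dissimilarity, e.g. a Hellinger distance.
    alphas = np.round(np.linspace(0, 1, 101), 2)
    best_alphas = []
    for b in np.arange(10, 110, 11):
        p, _ = np.histogram(pos_scores, bins=int(b), range=(0, 1))
        n, _ = np.histogram(neg_scores, bins=int(b), range=(0, 1))
        t, _ = np.histogram(test_scores, bins=int(b), range=(0, 1))
        p, n, t = p / p.sum(), n / n.sum(), t / t.sum()
        d = [dist(a * p + (1 - a) * n, t) for a in alphas]
        best_alphas.append(alphas[int(np.argmin(d))])
    return float(np.median(best_alphas))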
mlquantify/methods/aggregative/mixtureModels/smm.py
@@ -1,27 +0,0 @@
- import numpy as np
- from sklearn.base import BaseEstimator
-
- from ._MixtureModel import MixtureModel
-
- class SMM(MixtureModel):
-     """Sample Mean Matching. The method is
-     a member of the DyS framework that uses
-     simple means to represent the score
-     distribution for positive, negative
-     and unlabelled scores.
-     """
-
-     def __init__(self, learner: BaseEstimator):
-         assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
-         super().__init__(learner)
-
-     def _compute_prevalence(self, test_scores: np.ndarray) -> float:
-         mean_pos_score = np.mean(self.pos_scores)
-         mean_neg_score = np.mean(self.neg_scores)
-         mean_test_score = np.mean(test_scores)
-
-         # Calculate prevalence as the proportion of the positive class
-         # based on the mean test score relative to the mean positive and negative scores
-         prevalence = (mean_test_score - mean_neg_score) / (mean_pos_score - mean_neg_score)
-
-         return prevalence
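SMM reduces the mixture matching to means: if the mean positive training score is 0.8, the mean negative score is 0.2, and the mean test score is 0.5, the positive prevalence is (0.5 - 0.2) / (0.8 - 0.2) = 0.5; the base class then clips the result to [0, 1].

mean_pos, mean_neg, mean_test = 0.8, 0.2, 0.5
prevalence = (mean_test - mean_neg) / (mean_pos - mean_neg)  # -> 0.5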