mlquantify 0.0.11.2__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify/__init__.py +32 -6
- mlquantify/base.py +559 -257
- mlquantify/classification/__init__.py +1 -1
- mlquantify/classification/methods.py +160 -0
- mlquantify/evaluation/__init__.py +14 -2
- mlquantify/evaluation/measures.py +215 -0
- mlquantify/evaluation/protocol.py +647 -0
- mlquantify/methods/__init__.py +37 -40
- mlquantify/methods/aggregative.py +1030 -0
- mlquantify/methods/meta.py +472 -0
- mlquantify/methods/mixture_models.py +1003 -0
- mlquantify/methods/non_aggregative.py +136 -0
- mlquantify/methods/threshold_optimization.py +957 -0
- mlquantify/model_selection.py +377 -232
- mlquantify/plots.py +367 -0
- mlquantify/utils/__init__.py +2 -2
- mlquantify/utils/general.py +334 -0
- mlquantify/utils/method.py +449 -0
- {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/METADATA +137 -122
- mlquantify-0.1.1.dist-info/RECORD +22 -0
- {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/WHEEL +1 -1
- mlquantify/classification/pwkclf.py +0 -73
- mlquantify/evaluation/measures/__init__.py +0 -26
- mlquantify/evaluation/measures/ae.py +0 -11
- mlquantify/evaluation/measures/bias.py +0 -16
- mlquantify/evaluation/measures/kld.py +0 -8
- mlquantify/evaluation/measures/mse.py +0 -12
- mlquantify/evaluation/measures/nae.py +0 -16
- mlquantify/evaluation/measures/nkld.py +0 -13
- mlquantify/evaluation/measures/nrae.py +0 -16
- mlquantify/evaluation/measures/rae.py +0 -12
- mlquantify/evaluation/measures/se.py +0 -12
- mlquantify/evaluation/protocol/_Protocol.py +0 -202
- mlquantify/evaluation/protocol/__init__.py +0 -2
- mlquantify/evaluation/protocol/app.py +0 -146
- mlquantify/evaluation/protocol/npp.py +0 -34
- mlquantify/methods/aggregative/ThreholdOptm/_ThreholdOptimization.py +0 -62
- mlquantify/methods/aggregative/ThreholdOptm/__init__.py +0 -7
- mlquantify/methods/aggregative/ThreholdOptm/acc.py +0 -27
- mlquantify/methods/aggregative/ThreholdOptm/max.py +0 -23
- mlquantify/methods/aggregative/ThreholdOptm/ms.py +0 -21
- mlquantify/methods/aggregative/ThreholdOptm/ms2.py +0 -25
- mlquantify/methods/aggregative/ThreholdOptm/pacc.py +0 -41
- mlquantify/methods/aggregative/ThreholdOptm/t50.py +0 -21
- mlquantify/methods/aggregative/ThreholdOptm/x.py +0 -23
- mlquantify/methods/aggregative/__init__.py +0 -9
- mlquantify/methods/aggregative/cc.py +0 -32
- mlquantify/methods/aggregative/emq.py +0 -86
- mlquantify/methods/aggregative/fm.py +0 -72
- mlquantify/methods/aggregative/gac.py +0 -96
- mlquantify/methods/aggregative/gpac.py +0 -87
- mlquantify/methods/aggregative/mixtureModels/_MixtureModel.py +0 -81
- mlquantify/methods/aggregative/mixtureModels/__init__.py +0 -5
- mlquantify/methods/aggregative/mixtureModels/dys.py +0 -55
- mlquantify/methods/aggregative/mixtureModels/dys_syn.py +0 -89
- mlquantify/methods/aggregative/mixtureModels/hdy.py +0 -46
- mlquantify/methods/aggregative/mixtureModels/smm.py +0 -27
- mlquantify/methods/aggregative/mixtureModels/sord.py +0 -77
- mlquantify/methods/aggregative/pcc.py +0 -33
- mlquantify/methods/aggregative/pwk.py +0 -38
- mlquantify/methods/meta/__init__.py +0 -1
- mlquantify/methods/meta/ensemble.py +0 -236
- mlquantify/methods/non_aggregative/__init__.py +0 -1
- mlquantify/methods/non_aggregative/hdx.py +0 -71
- mlquantify/plots/__init__.py +0 -2
- mlquantify/plots/distribution_plot.py +0 -109
- mlquantify/plots/protocol_plot.py +0 -193
- mlquantify/utils/general_purposes/__init__.py +0 -8
- mlquantify/utils/general_purposes/convert_col_to_array.py +0 -13
- mlquantify/utils/general_purposes/generate_artificial_indexes.py +0 -29
- mlquantify/utils/general_purposes/get_real_prev.py +0 -9
- mlquantify/utils/general_purposes/load_quantifier.py +0 -4
- mlquantify/utils/general_purposes/make_prevs.py +0 -23
- mlquantify/utils/general_purposes/normalize.py +0 -20
- mlquantify/utils/general_purposes/parallel.py +0 -10
- mlquantify/utils/general_purposes/round_protocol_df.py +0 -14
- mlquantify/utils/method_purposes/__init__.py +0 -6
- mlquantify/utils/method_purposes/distances.py +0 -21
- mlquantify/utils/method_purposes/getHist.py +0 -13
- mlquantify/utils/method_purposes/get_scores.py +0 -33
- mlquantify/utils/method_purposes/moss.py +0 -16
- mlquantify/utils/method_purposes/ternary_search.py +0 -14
- mlquantify/utils/method_purposes/tprfpr.py +0 -42
- mlquantify-0.0.11.2.dist-info/RECORD +0 -73
- {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/top_level.txt +0 -0
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
from sklearn.base import BaseEstimator
|
|
3
|
-
from ...base import AggregativeQuantifier
|
|
4
|
-
|
|
5
|
-
class CC(AggregativeQuantifier):
    """Classify and Count (CC).

    The simplest quantification strategy: classify every test
    instance, then estimate each class prevalence as the fraction
    of instances assigned to that class.
    """

    def __init__(self, learner: BaseEstimator):
        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
        self.learner = learner

    def _fit_method(self, X, y):
        # Train the underlying classifier only when it is not already fitted.
        if not self.learner_fitted:
            self.learner.fit(X, y)
        return self

    def _predict_method(self, X) -> dict:
        labels = self.learner.predict(X)
        total = len(labels)
        # Fraction of predictions assigned to each known class.
        counts = [np.count_nonzero(labels == cls) for cls in self.classes]
        return np.asarray(counts) / total
|
|
@@ -1,86 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
import pandas as pd
|
|
3
|
-
from sklearn.base import BaseEstimator
|
|
4
|
-
|
|
5
|
-
from ...base import AggregativeQuantifier
|
|
6
|
-
|
|
7
|
-
class EMQ(AggregativeQuantifier):
    """Expectation Maximisation Quantifier (EMQ, a.k.a. SLD).

    Iteratively re-estimates the class priors and the classifier's
    posterior probabilities on the test set until the prevalence
    estimate stabilises.
    """

    # Stopping criteria for the EM loop.
    MAX_ITER = 1000
    EPSILON = 1e-6

    def __init__(self, learner: BaseEstimator):
        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
        self.learner = learner
        self.priors = None  # training prevalences, filled in by _fit_method

    def _fit_method(self, X, y):
        if not self.learner_fitted:
            self.learner.fit(X, y)

        # Empirical class distribution of the training labels.
        class_counts = np.array([np.count_nonzero(y == cls) for cls in self.classes])
        self.priors = class_counts / len(y)
        return self

    def _predict_method(self, X) -> dict:
        test_posteriors = self.learner.predict_proba(X)
        prevalences, _ = self.EM(self.priors, test_posteriors)
        return prevalences

    def predict_proba(self, X, epsilon: float = EPSILON, max_iter: int = MAX_ITER) -> np.ndarray:
        """Return the EM-adjusted posterior probabilities for X."""
        test_posteriors = self.learner.predict_proba(X)
        _, adjusted_posteriors = self.EM(self.priors, test_posteriors, epsilon, max_iter)
        return adjusted_posteriors

    @classmethod
    def EM(cls, priors, posteriors, epsilon=EPSILON, max_iter=MAX_ITER):
        """Run the EM fixed-point iteration.

        At each step the test posteriors are re-weighted by the ratio of
        the current prevalence estimate to the training priors (E-step),
        then the prevalence estimate is refreshed as the mean of the
        re-weighted posteriors (M-step), until convergence.

        Args:
            priors (array-like): prior probabilities from the training set.
            posteriors (array-like): posterior probabilities on the test set.
            epsilon (float): convergence tolerance on the mean absolute change.
            max_iter (int): maximum number of iterations.

        Returns:
            The estimated prevalence vector and the adjusted posteriors.
        """
        Px = posteriors
        # Training priors stay fixed in the E-step denominator.
        train_priors = np.copy(priors)
        running_estimate = np.copy(train_priors)

        iteration, converged = 0, False
        previous_estimate = None

        while not converged and iteration < max_iter:
            # E-step: re-weight posteriors by prevalence ratio and renormalise.
            unnormalized = (running_estimate / train_priors) * Px
            posteriors = unnormalized / unnormalized.sum(axis=1, keepdims=True)

            # M-step: prevalence is the mean of the adjusted posteriors.
            running_estimate = posteriors.mean(axis=0)

            # Require a minimum of 10 iterations before declaring convergence.
            if previous_estimate is not None and np.mean(np.abs(running_estimate - previous_estimate)) < epsilon and iteration > 10:
                converged = True

            previous_estimate = running_estimate
            iteration += 1

        if not converged:
            print('[Warning] The method has reached the maximum number of iterations; it might not have converged')

        return running_estimate, posteriors
|
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
from sklearn.base import BaseEstimator
|
|
3
|
-
from scipy.optimize import minimize
|
|
4
|
-
|
|
5
|
-
from ...base import AggregativeQuantifier
|
|
6
|
-
from ...utils import get_scores
|
|
7
|
-
|
|
8
|
-
class FM(AggregativeQuantifier):
    """The Friedman Method (FM).

    Similar to GPAC, but instead of averaging the confidence scores of
    a probabilistic classifier it uses the proportion of confidence
    scores lying above the class frequencies observed in training, and
    inverts that relation via constrained least squares.
    """

    def __init__(self, learner: BaseEstimator):
        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
        self.learner = learner
        self.CM = None  # per-class rate matrix estimated during fit

    def _fit_method(self, X, y):
        # Out-of-fold labels and probabilities via cross-validation.
        y_labels, probabilities = get_scores(X, y, self.learner, self.cv_folds, self.learner_fitted)

        # Ensure the learner itself is trained on the full data.
        if not self.learner_fitted:
            self.learner.fit(X, y)

        # Training class distribution.
        class_counts = np.array([np.count_nonzero(y_labels == cls) for cls in self.classes])
        self.priors = class_counts / len(y_labels)

        # Rate matrix: for each true class j (column), how often each
        # class's score exceeds its training prior.
        rate_matrix = np.zeros((self.n_class, self.n_class))
        for col, cls in enumerate(self.classes):
            members = np.where(y_labels == cls)[0]
            rate_matrix[:, col] = np.sum(probabilities[members] > self.priors, axis=0)

        # Column-wise normalisation by the class counts.
        self.CM = rate_matrix / class_counts
        return self

    def _predict_method(self, X) -> dict:
        posteriors = self.learner.predict_proba(X)

        # Observed above-prior rates on the test set.
        prevs_estim = np.sum(posteriors > self.priors, axis=0) / posteriors.shape[0]

        # Solve CM @ p = prevs_estim for p on the probability simplex.
        def objective(p):
            return np.linalg.norm(self.CM @ p - prevs_estim)

        simplex_constraints = [
            {'type': 'eq', 'fun': lambda p: np.sum(p) - 1.0},   # sums to one
            {'type': 'ineq', 'fun': lambda p: p},               # non-negative
        ]
        n_cols = self.CM.shape[1]
        start = np.full(n_cols, 1.0 / n_cols)  # uniform starting point

        solution = minimize(objective, start,
                            constraints=simplex_constraints,
                            bounds=[(0, 1)] * n_cols)

        if solution.success:
            return solution.x
        # Fall back to the training priors when optimisation fails.
        print("Optimization did not converge")
        return self.priors
|
|
@@ -1,96 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
import pandas as pd
|
|
3
|
-
from sklearn.base import BaseEstimator
|
|
4
|
-
from sklearn.metrics import confusion_matrix
|
|
5
|
-
from sklearn.model_selection import StratifiedKFold
|
|
6
|
-
|
|
7
|
-
from ...base import AggregativeQuantifier
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class GAC(AggregativeQuantifier):
    """Generalized Adjusted Count (GAC).

    Estimates a conditional probability matrix P(predicted | true) from
    (cross-validated) classifier predictions, then corrects the raw
    classify-and-count prevalences by solving the resulting linear
    system (falling back to the raw values when it is singular).
    """

    def __init__(self, learner: BaseEstimator):
        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
        self.learner = learner
        self.cond_prob_matrix = None  # column-stochastic P(pred=i | true=j)

    def _fit_method(self, X, y):
        """Estimate the conditional probability matrix from predictions."""
        # Normalise inputs to pandas containers for positional indexing.
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        if isinstance(y, np.ndarray):
            y = pd.Series(y)

        if self.learner_fitted:
            # Pre-fitted learner: predict directly on the training data.
            y_pred = self.learner.predict(X)
            y_label = y
        else:
            # Out-of-fold predictions via stratified cross-validation.
            skf = StratifiedKFold(n_splits=self.cv_folds)
            y_pred = []
            y_label = []

            for train_index, valid_index in skf.split(X, y):
                train_data = pd.DataFrame(X.iloc[train_index])
                train_label = y.iloc[train_index]

                valid_data = pd.DataFrame(X.iloc[valid_index])
                valid_label = y.iloc[valid_index]

                self.learner.fit(train_data, train_label)

                y_pred.extend(self.learner.predict(valid_data))
                y_label.extend(valid_label)

        # BUGFIX: pair the fold-ordered labels (y_label) with the
        # fold-ordered predictions; the previous code passed the
        # original-order `y`, misaligning the confusion matrix whenever
        # cross-validation was used.
        self.cond_prob_matrix = self.get_cond_prob_matrix(self.classes, y_label, y_pred)

        return self

    def _predict_method(self, X) -> dict:
        y_pred = self.learner.predict(X)

        # Count predictions per known class so the vector stays aligned
        # with self.classes even when some class is never predicted
        # (np.unique would silently drop it and shift the entries).
        counts = np.array([np.count_nonzero(y_pred == cls) for cls in self.classes],
                          dtype=float)
        predicted_prevalences = counts / counts.sum()

        # Correct the raw prevalences through the conditional matrix.
        adjusted_prevalences = self.solve_adjustment(self.cond_prob_matrix, predicted_prevalences)

        return adjusted_prevalences

    @classmethod
    def get_cond_prob_matrix(cls, classes: list, true_labels: np.ndarray, predictions: np.ndarray) -> np.ndarray:
        """Estimate the conditional probability matrix P(yi|yj).

        Entry (i, j) is the probability of predicting class i when the
        true class is j; each column is normalised by its class count.
        """
        CM = confusion_matrix(true_labels, predictions, labels=classes).T
        CM = CM.astype(np.float32)
        class_counts = CM.sum(axis=0)
        for i, _ in enumerate(classes):
            if class_counts[i] == 0:
                # Unseen class: assume it predicts itself.
                CM[i, i] = 1
            else:
                CM[:, i] /= class_counts[i]
        return CM

    @classmethod
    def solve_adjustment(cls, cond_prob_matrix, predicted_prevalences):
        """Solve A @ x = B with A=cond_prob_matrix, B=predicted_prevalences.

        The solution is clipped to [0, 1] and renormalised; if the system
        is singular or ill-posed, the raw prevalences are returned.
        """
        A = cond_prob_matrix
        B = predicted_prevalences
        try:
            adjusted_prevalences = np.linalg.solve(A, B)
            adjusted_prevalences = np.clip(adjusted_prevalences, 0, 1)
            adjusted_prevalences /= adjusted_prevalences.sum()
        except (np.linalg.LinAlgError, ValueError):
            adjusted_prevalences = predicted_prevalences  # No way to adjust them
        return adjusted_prevalences
|
|
@@ -1,87 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
import pandas as pd
|
|
3
|
-
from sklearn.base import BaseEstimator
|
|
4
|
-
from sklearn.model_selection import StratifiedKFold
|
|
5
|
-
|
|
6
|
-
from .gac import GAC
|
|
7
|
-
from ...base import AggregativeQuantifier
|
|
8
|
-
|
|
9
|
-
class GPAC(AggregativeQuantifier):
    """Generalized Probabilistic Adjusted Count (GPAC).

    Like GAC it builds and solves a system of linear equations, but it
    is designed around the confidence scores of probabilistic
    classifiers, as in the PAC method.
    """

    def __init__(self, learner: BaseEstimator):
        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
        self.learner = learner
        self.cond_prob_matrix = None  # estimated during fit via GAC

    def _fit_method(self, X, y):
        # Normalise inputs to pandas containers for positional indexing.
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        if isinstance(y, np.ndarray):
            # CONSISTENCY FIX: use a Series (as GAC does); wrapping the
            # labels in a DataFrame made y.iloc[...] yield 2-D frames.
            y = pd.Series(y)

        if self.learner_fitted:
            # Pre-fitted learner: predict directly on the training data.
            predictions = self.learner.predict(X)
            true_labels = y
        else:
            # Out-of-fold predictions via stratified cross-validation.
            skf = StratifiedKFold(n_splits=self.cv_folds)
            predictions = []
            true_labels = []

            for train_index, valid_index in skf.split(X, y):
                train_data = pd.DataFrame(X.iloc[train_index])
                train_labels = y.iloc[train_index]

                valid_data = pd.DataFrame(X.iloc[valid_index])
                valid_labels = y.iloc[valid_index]

                self.learner.fit(train_data, train_labels)

                predictions.extend(self.learner.predict(valid_data))
                true_labels.extend(valid_labels)

        # Conditional probability matrix shared with GAC.
        self.cond_prob_matrix = GAC.get_cond_prob_matrix(self.classes, true_labels, predictions)

        return self

    def _predict_method(self, X) -> dict:
        predictions = self.learner.predict(X)

        # Count predictions per known class so the vector stays aligned
        # with self.classes; the previous `[:len(counts)] = counts`
        # assignment misaligned entries whenever a non-trailing class
        # was never predicted.
        counts = np.array([np.count_nonzero(predictions == cls) for cls in self.classes],
                          dtype=float)
        predicted_prevalences = counts / counts.sum()

        # Correct the raw prevalences through the conditional matrix.
        adjusted_prevalences = GAC.solve_adjustment(self.cond_prob_matrix, predicted_prevalences)

        return adjusted_prevalences

    @classmethod
    def get_cond_prob_matrix(cls, classes: list, true_labels: np.ndarray, predictions: np.ndarray) -> np.ndarray:
        """Estimate the matrix whose entry (i, j) is the estimate of P(yi|yj).

        Expects `predictions` to be per-instance probability vectors;
        rows default to the identity for classes absent from the labels.
        """
        n_classes = len(classes)
        cond_prob_matrix = np.eye(n_classes)

        for i, class_ in enumerate(classes):
            class_indices = true_labels == class_
            if class_indices.any():
                cond_prob_matrix[i] = predictions[class_indices].mean(axis=0)

        return cond_prob_matrix.T
|
|
@@ -1,81 +0,0 @@
|
|
|
1
|
-
from abc import abstractmethod
|
|
2
|
-
import numpy as np
|
|
3
|
-
from sklearn.base import BaseEstimator
|
|
4
|
-
|
|
5
|
-
from ....base import AggregativeQuantifier
|
|
6
|
-
from ....utils import probsymm, sqEuclidean, topsoe, hellinger, get_scores
|
|
7
|
-
|
|
8
|
-
class MixtureModel(AggregativeQuantifier):
    """Base class for mixture-model quantifiers.

    These methods assume that the distribution of classifier scores on
    the test set is a mixture of the positive and negative score
    distributions observed on the training set.
    """

    def __init__(self, learner: BaseEstimator):
        self.learner = learner
        self.pos_scores = None  # training scores of the positive class
        self.neg_scores = None  # training scores of the negative class
        self.distance = None

    @property
    def multiclass_method(self) -> bool:
        # Mixture models as implemented here are strictly binary.
        return False

    def _fit_method(self, X, y):
        # Cross-validated scores; also fits the learner when needed.
        y_label, probabilities = get_scores(X, y, self.learner, self.cv_folds, self.learner_fitted)

        # Positive-class probability column, split by true label.
        self.pos_scores = probabilities[y_label == self.classes[1]][:, 1]
        self.neg_scores = probabilities[y_label == self.classes[0]][:, 1]

        return self

    def _predict_method(self, X) -> dict:
        # Positive-class scores on the test set.
        test_scores = self.learner.predict_proba(X)[:, 1]

        # Subclass-specific estimate, clipped to the valid [0, 1] range;
        # the negative prevalence is its complement.
        positive_prev = np.clip(self._compute_prevalence(test_scores), 0, 1)

        return np.asarray([1 - positive_prev, positive_prev])

    @abstractmethod
    def _compute_prevalence(self, test_scores: np.ndarray) -> float:
        """Compute the positive-class prevalence from the test scores."""
        ...

    def get_distance(self, dist_train, dist_test, measure: str) -> float:
        """Distance between the train mixture and the test histogram under `measure`."""
        # Reject degenerate or mismatched histograms up front.
        if np.sum(dist_train) < 1e-20 or np.sum(dist_test) < 1e-20:
            raise ValueError("One or both vectors are zero (empty)...")
        if len(dist_train) != len(dist_test):
            raise ValueError("Arrays need to be of equal size...")

        dist_train = np.array(dist_train, dtype=float)
        dist_test = np.array(dist_test, dtype=float)

        # Floor tiny entries so the metrics never divide by zero.
        dist_train[dist_train < 1e-20] = 1e-20
        dist_test[dist_test < 1e-20] = 1e-20

        # Dispatch on the requested metric.
        metrics = {
            'topsoe': topsoe,
            'probsymm': probsymm,
            'hellinger': hellinger,
            'euclidean': sqEuclidean,
        }
        metric = metrics.get(measure)
        # Unknown measures fall back to a large constant, per the original API.
        return metric(dist_train, dist_test) if metric is not None else 100
|
|
@@ -1,55 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
from sklearn.base import BaseEstimator
|
|
3
|
-
|
|
4
|
-
from ._MixtureModel import MixtureModel
|
|
5
|
-
from ....utils import getHist, ternary_search
|
|
6
|
-
|
|
7
|
-
class DyS(MixtureModel):
    """Distribution y-Similarity (DyS) framework.

    Generalises the HDy approach by taking the dissimilarity function
    DS as a parameter of the model; the prevalence is the median, over
    several histogram resolutions, of the mixture weight minimising
    that dissimilarity.
    """

    def __init__(self, learner: BaseEstimator, measure: str = "topsoe", bins_size: np.ndarray = None):
        assert measure in ["hellinger", "topsoe", "probsymm"], "measure not valid"
        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
        super().__init__(learner)

        # BUGFIX: `if not bins_size:` raises "truth value of an array is
        # ambiguous" whenever a multi-element ndarray is passed; test
        # explicitly for None and normalise to an ndarray afterwards.
        if bins_size is None:
            bins_size = np.append(np.linspace(2, 20, 10), 30)
        self.bins_size = np.asarray(bins_size)
        self.measure = measure

    def _compute_prevalence(self, test_scores: np.ndarray) -> float:
        """Median over bin sizes of the distance-minimising mixture weight."""
        estimates = []

        for bins in self.bins_size:
            # Histogram densities for positive, negative, and test scores.
            pos_density = getHist(self.pos_scores, bins)
            neg_density = getHist(self.neg_scores, bins)
            test_density = getHist(test_scores, bins)

            def distance_at(alpha):
                # Mixture of the training densities at weight alpha.
                mixture = (pos_density * alpha) + (neg_density * (1 - alpha))
                return self.get_distance(mixture, test_density, measure=self.measure)

            # Ternary search for the weight minimising the distance.
            estimates.append(ternary_search(0, 1, distance_at))

        # Median across bin sizes is the final estimate.
        return np.median(estimates)
|
|
55
|
-
|
|
@@ -1,89 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
from sklearn.base import BaseEstimator
|
|
3
|
-
|
|
4
|
-
from ._MixtureModel import MixtureModel
|
|
5
|
-
from ....utils import getHist, ternary_search, MoSS, get_real_prev
|
|
6
|
-
|
|
7
|
-
class DySsyn(MixtureModel):
    """Synthetic Distribution y-Similarity (DySsyn).

    Works like DyS, but instead of using the training scores it
    generates them via MoSS (Model for Score Simulation), which spans a
    spectrum of score distributions from highly separated to fully
    mixed; the estimate from the best-matching merge factor is kept.
    """

    def __init__(self, learner: BaseEstimator, measure: str = "topsoe", merge_factor: np.ndarray = None, bins_size: np.ndarray = None, alpha_train: float = 0.5, n: int = None):
        assert measure in ["hellinger", "topsoe", "probsymm"], "measure not valid"
        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
        super().__init__(learner)

        # BUGFIX: `if not array:` raises "truth value of an array is
        # ambiguous" for multi-element ndarrays; test for None instead.
        if bins_size is None:
            bins_size = np.append(np.linspace(2, 20, 10), 30)
        if merge_factor is None:
            merge_factor = np.linspace(0.1, 0.4, 10)

        self.bins_size = np.asarray(bins_size)
        self.merge_factor = np.asarray(merge_factor)
        self.alpha_train = alpha_train
        self.n = n
        self.measure = measure
        self.m = None

    def _fit_method(self, X, y):
        if not self.learner_fitted:
            self.learner.fit(X, y)

        # Training prevalence is taken from the data (overrides the
        # constructor value).
        self.alpha_train = list(get_real_prev(y).values())[1]

        return self

    def _compute_prevalence(self, test_scores: np.ndarray) -> float:
        """Best DyS estimate over all synthetic (MoSS) score distributions."""
        if self.n is None:
            self.n = len(test_scores)

        distances = {}

        for m in self.merge_factor:
            # Synthetic positive/negative scores for this merge factor.
            pos_scores, neg_scores = MoSS(self.n, self.alpha_train, m)

            estimates = []
            for bins in self.bins_size:
                pos_density = getHist(pos_scores, bins)
                neg_density = getHist(neg_scores, bins)
                test_density = getHist(test_scores, bins)

                def distance_at(alpha):
                    # Mixture of the synthetic densities at weight alpha.
                    mixture = (pos_density * alpha) + (neg_density * (1 - alpha))
                    return self.get_distance(mixture, test_density, measure=self.measure)

                estimates.append(ternary_search(0, 1, distance_at))

            estimates = np.asarray(estimates)
            prevalence = np.median(estimates)

            # BUGFIX: the previous `self.bins_size[result == prevalence][0]`
            # compared a Python list against a float (always False) and
            # failed whenever the median was not an exact element; pick
            # the bin size whose estimate is closest to the median instead.
            best_bins = self.bins_size[np.argmin(np.abs(estimates - prevalence))]

            pos_density = getHist(pos_scores, best_bins)
            neg_density = getHist(neg_scores, best_bins)
            test_density = getHist(test_scores, best_bins)

            mixture = (pos_density * prevalence) + (neg_density * (1 - prevalence))
            d = self.get_distance(mixture, test_density, measure=self.measure)
            distances[m] = (d, prevalence)

        # Keep the prevalence from the merge factor with the smallest distance.
        best_m = min(distances, key=lambda k: distances[k][0])
        return distances[best_m][1]
|
|
89
|
-
|
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
from sklearn.base import BaseEstimator
|
|
3
|
-
|
|
4
|
-
from ._MixtureModel import MixtureModel
|
|
5
|
-
from ....utils import getHist
|
|
6
|
-
|
|
7
|
-
class HDy(MixtureModel):
    """Hellinger Distance Minimization (HDy).

    Estimates the prevalence by finding, over several histogram
    resolutions, the mixture of the positive and negative training
    score distributions closest (in Hellinger distance) to the test
    score distribution.
    """

    def __init__(self, learner: BaseEstimator):
        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
        super().__init__(learner)

    def _compute_prevalence(self, test_scores: np.ndarray) -> float:
        # NOTE(review): the arange step is 11, yielding bin counts
        # 10, 21, ..., 109; the HDy literature uses 10..110 in steps of
        # 10 — confirm which is intended before changing.
        bin_sizes = np.arange(10, 110, 11)
        # Candidate prevalences 0.00, 0.01, ..., 1.00.
        alphas = np.round(np.linspace(0, 1, 101), 2)

        chosen_alphas = []

        for bins in bin_sizes:
            pos_density = getHist(self.pos_scores, bins)
            neg_density = getHist(self.neg_scores, bins)
            test_density = getHist(test_scores, bins)

            # Hellinger distance of each candidate mixture to the test density.
            dists = [
                self.get_distance(pos_density * a + neg_density * (1 - a),
                                  test_density, measure="hellinger")
                for a in alphas
            ]

            # Keep the alpha minimising the distance at this resolution.
            chosen_alphas.append(alphas[np.argmin(dists)])

        # Median of the per-resolution winners is the final estimate.
        return np.median(chosen_alphas)
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
from sklearn.base import BaseEstimator
|
|
3
|
-
|
|
4
|
-
from ._MixtureModel import MixtureModel
|
|
5
|
-
|
|
6
|
-
class SMM(MixtureModel):
    """Sample Mean Matching (SMM).

    A member of the DyS framework that summarises the positive,
    negative and unlabelled score distributions by their means and
    solves the one-dimensional mixture equation in closed form.
    """

    def __init__(self, learner: BaseEstimator):
        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
        super().__init__(learner)

    def _compute_prevalence(self, test_scores: np.ndarray) -> float:
        pos_mean = np.mean(self.pos_scores)
        neg_mean = np.mean(self.neg_scores)
        test_mean = np.mean(test_scores)

        # Where the test mean falls between the negative and positive
        # training means gives the positive-class proportion directly.
        # NOTE(review): divides by zero when pos_mean == neg_mean —
        # confirm callers guard against degenerate scores.
        return (test_mean - neg_mean) / (pos_mean - neg_mean)
|