mlquantify 0.0.11.2__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify/__init__.py +32 -6
- mlquantify/base.py +559 -257
- mlquantify/classification/__init__.py +1 -1
- mlquantify/classification/methods.py +160 -0
- mlquantify/evaluation/__init__.py +14 -2
- mlquantify/evaluation/measures.py +215 -0
- mlquantify/evaluation/protocol.py +647 -0
- mlquantify/methods/__init__.py +37 -40
- mlquantify/methods/aggregative.py +1030 -0
- mlquantify/methods/meta.py +472 -0
- mlquantify/methods/mixture_models.py +1003 -0
- mlquantify/methods/non_aggregative.py +136 -0
- mlquantify/methods/threshold_optimization.py +959 -0
- mlquantify/model_selection.py +377 -232
- mlquantify/plots.py +367 -0
- mlquantify/utils/__init__.py +2 -2
- mlquantify/utils/general.py +334 -0
- mlquantify/utils/method.py +449 -0
- {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.0.dist-info}/METADATA +137 -122
- mlquantify-0.1.0.dist-info/RECORD +22 -0
- {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.0.dist-info}/WHEEL +1 -1
- mlquantify/classification/pwkclf.py +0 -73
- mlquantify/evaluation/measures/__init__.py +0 -26
- mlquantify/evaluation/measures/ae.py +0 -11
- mlquantify/evaluation/measures/bias.py +0 -16
- mlquantify/evaluation/measures/kld.py +0 -8
- mlquantify/evaluation/measures/mse.py +0 -12
- mlquantify/evaluation/measures/nae.py +0 -16
- mlquantify/evaluation/measures/nkld.py +0 -13
- mlquantify/evaluation/measures/nrae.py +0 -16
- mlquantify/evaluation/measures/rae.py +0 -12
- mlquantify/evaluation/measures/se.py +0 -12
- mlquantify/evaluation/protocol/_Protocol.py +0 -202
- mlquantify/evaluation/protocol/__init__.py +0 -2
- mlquantify/evaluation/protocol/app.py +0 -146
- mlquantify/evaluation/protocol/npp.py +0 -34
- mlquantify/methods/aggregative/ThreholdOptm/_ThreholdOptimization.py +0 -62
- mlquantify/methods/aggregative/ThreholdOptm/__init__.py +0 -7
- mlquantify/methods/aggregative/ThreholdOptm/acc.py +0 -27
- mlquantify/methods/aggregative/ThreholdOptm/max.py +0 -23
- mlquantify/methods/aggregative/ThreholdOptm/ms.py +0 -21
- mlquantify/methods/aggregative/ThreholdOptm/ms2.py +0 -25
- mlquantify/methods/aggregative/ThreholdOptm/pacc.py +0 -41
- mlquantify/methods/aggregative/ThreholdOptm/t50.py +0 -21
- mlquantify/methods/aggregative/ThreholdOptm/x.py +0 -23
- mlquantify/methods/aggregative/__init__.py +0 -9
- mlquantify/methods/aggregative/cc.py +0 -32
- mlquantify/methods/aggregative/emq.py +0 -86
- mlquantify/methods/aggregative/fm.py +0 -72
- mlquantify/methods/aggregative/gac.py +0 -96
- mlquantify/methods/aggregative/gpac.py +0 -87
- mlquantify/methods/aggregative/mixtureModels/_MixtureModel.py +0 -81
- mlquantify/methods/aggregative/mixtureModels/__init__.py +0 -5
- mlquantify/methods/aggregative/mixtureModels/dys.py +0 -55
- mlquantify/methods/aggregative/mixtureModels/dys_syn.py +0 -89
- mlquantify/methods/aggregative/mixtureModels/hdy.py +0 -46
- mlquantify/methods/aggregative/mixtureModels/smm.py +0 -27
- mlquantify/methods/aggregative/mixtureModels/sord.py +0 -77
- mlquantify/methods/aggregative/pcc.py +0 -33
- mlquantify/methods/aggregative/pwk.py +0 -38
- mlquantify/methods/meta/__init__.py +0 -1
- mlquantify/methods/meta/ensemble.py +0 -236
- mlquantify/methods/non_aggregative/__init__.py +0 -1
- mlquantify/methods/non_aggregative/hdx.py +0 -71
- mlquantify/plots/__init__.py +0 -2
- mlquantify/plots/distribution_plot.py +0 -109
- mlquantify/plots/protocol_plot.py +0 -193
- mlquantify/utils/general_purposes/__init__.py +0 -8
- mlquantify/utils/general_purposes/convert_col_to_array.py +0 -13
- mlquantify/utils/general_purposes/generate_artificial_indexes.py +0 -29
- mlquantify/utils/general_purposes/get_real_prev.py +0 -9
- mlquantify/utils/general_purposes/load_quantifier.py +0 -4
- mlquantify/utils/general_purposes/make_prevs.py +0 -23
- mlquantify/utils/general_purposes/normalize.py +0 -20
- mlquantify/utils/general_purposes/parallel.py +0 -10
- mlquantify/utils/general_purposes/round_protocol_df.py +0 -14
- mlquantify/utils/method_purposes/__init__.py +0 -6
- mlquantify/utils/method_purposes/distances.py +0 -21
- mlquantify/utils/method_purposes/getHist.py +0 -13
- mlquantify/utils/method_purposes/get_scores.py +0 -33
- mlquantify/utils/method_purposes/moss.py +0 -16
- mlquantify/utils/method_purposes/ternary_search.py +0 -14
- mlquantify/utils/method_purposes/tprfpr.py +0 -42
- mlquantify-0.0.11.2.dist-info/RECORD +0 -73
- {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.0.dist-info}/top_level.txt +0 -0
--- mlquantify/evaluation/protocol/_Protocol.py
+++ /dev/null
@@ -1,202 +0,0 @@
-from abc import ABC, abstractmethod
-import numpy as np
-import pandas as pd
-from typing import Union, List
-from sklearn.base import BaseEstimator
-from time import time
-from tqdm import tqdm
-
-from ...methods import get_method, METHODS, AGGREGATIVE, NON_AGGREGATIVE
-from ...utils import *
-from ..measures import get_measure, MEASURES
-from ...base import Quantifier, AggregativeQuantifier
-
-class Protocol(ABC):
-    """Base class for implementing different quantification protocols.
-
-    This abstract class provides a structure for creating protocols that involve
-    fitting quantification models to training data and generating predictions on test data.
-    It supports parallel processing, multiple iterations, and different output formats.
-
-    Args:
-        models (Union[List[Union[str, Quantifier]], str, Quantifier]):
-            List of quantification models, a single model name, or 'all' for all models.
-        batch_size (Union[List[int], int]):
-            Size of the batches to be processed, or a list of sizes.
-        learner (BaseEstimator, optional):
-            Machine learning model to be used with the quantifiers. Required for model methods.
-        n_iterations (int, optional):
-            Number of iterations for the protocol. Default is 1.
-        n_jobs (int, optional):
-            Number of jobs to run in parallel. Default is 1.
-        random_state (int, optional):
-            Seed for random number generation. Default is 32.
-        verbose (bool, optional):
-            Whether to print progress messages. Default is False.
-        return_type (str, optional):
-            Type of return value ('predictions' or 'table'). Default is 'predictions'.
-        measures (List[str], optional):
-            List of error measures to calculate. Must be in MEASURES or None. Default is None.
-    """
-
-
-    def __init__(self,
-                 models: Union[List[Union[str, Quantifier]], str, Quantifier],
-                 batch_size: Union[List[int], int],
-                 learner: BaseEstimator = None,
-                 n_iterations: int = 1,
-                 n_jobs: int = 1,
-                 random_state: int = 32,
-                 verbose: bool = False,
-                 return_type: str = "predictions",
-                 measures: List[str] = None):
-
-        assert not measures or all(m in MEASURES for m in measures), \
-            f"Invalid measure(s) provided. Valid options: {list(MEASURES.keys())} or None"
-        assert return_type in ["predictions", "table"], \
-            "Invalid return_type. Valid options: ['predictions', 'table']"
-
-        self.models = self._initialize_models(models, learner)
-        self.learner = learner
-        self.batch_size = batch_size
-        self.n_iterations = n_iterations
-        self.n_jobs = n_jobs
-        self.random_state = random_state
-        self.verbose = verbose
-        self.return_type = return_type
-        self.measures = measures
-
-    def _initialize_models(self, models, learner):
-        if isinstance(models, list):
-            if isinstance(models[0], Quantifier):
-                return models
-            assert learner is not None, "Learner is required for model methods."
-            return [get_method(model)(learner) for model in models]
-        if isinstance(models, Quantifier):
-            return [models]
-
-        assert learner is not None, "Learner is required for model methods."
-
-        if models == "all":
-            print(hasattr(list(AGGREGATIVE.values())[0], "learner"))
-            models = [model(learner) if hasattr(model, "learner") else model() for model in METHODS.values()]
-            return models
-        if models == "aggregative":
-            return [model(learner) for model in AGGREGATIVE.values()]
-        if models == "non_aggregative":
-            return [model() for model in NON_AGGREGATIVE.values()]
-
-        return [get_method(models)(learner)]
-
-
-    def sout(self, msg):
-        if self.verbose:
-            print('[APP]' + msg)
-
-
-    def fit(self, X_train, y_train):
-        """Fit all methods into the training data.
-
-        Args:
-            X_train (array-like): Features of training.
-            y_train (array-like): Labels of training.
-        """
-        self.sout("Fitting models")
-
-        args = ((model, X_train, y_train, self.verbose) for model in self.models)
-        self.models = parallel(
-            self._delayed_fit,
-            tqdm(args, desc="Fitting models", total=len(self.models)) if self.verbose else args,
-            self.n_jobs)
-
-        self.sout("Fit [Done]")
-        return self
-
-
-    def predict(self, X_test, y_test) -> np.any:
-        """Generate several samples with artificial prevalences, and sizes.
-        And for each method, predicts with this sample, aggregating all together
-        with a pandas dataframe if requested, or else just the predictions.
-
-        Args:
-            X_test (array-like): Features of test.
-            y_test (array-like): Labels of test.
-
-        Returns:
-            tuple: tuple containing the model, real_prev and pred_prev, or
-            DataFrame: table of results, along with error measures if requested.
-        """
-
-        predictions = self.predict_protocol(X_test, y_test)
-
-        predictions_df = pd.DataFrame(predictions)
-
-        if self.return_type == "table":
-            predictions_df.columns = ["QUANTIFIER", "REAL_PREVS", "PRED_PREVS", "BATCH_SIZE"]
-
-            if self.measures:
-
-                def smooth(values: np.ndarray) -> np.ndarray:
-                    smoothed_factor = 1 / (2 * len(X_test))
-
-                    values = (values + smoothed_factor) / (smoothed_factor * len(values) + 1)
-
-                    return values
-
-                for metric in self.measures:
-                    predictions_df[metric] = predictions_df.apply(
-                        lambda row: get_measure(metric)(smooth(row["REAL_PREVS"]), smooth(row["PRED_PREVS"])),
-                        axis=1
-                    )
-
-            return predictions_df
-
-        predictions_array = predictions_df.to_numpy()
-        return (
-            predictions_array[:, 0],             # Model names
-            np.stack(predictions_array[:, 1]),   # Prev
-            np.stack(predictions_array[:, 2])    # Prev_pred
-        )
-
-
-    @abstractmethod
-    def predict_protocol(self) -> np.ndarray:
-        """ Abstract method that every protocol has to implement """
-        ...
-
-    @abstractmethod
-    def _new_sample(self) -> tuple:
-        """ Abstract method of sample extraction for each protocol
-
-        Returns:
-            tuple: tuple containing the X_sample and the y_sample
-        """
-        ...
-
-
-    @abstractmethod
-    def _delayed_predict(self, args) -> tuple:
-        """Abstract method for predicting on the extracted
-        samples; delayed so it can run in parallel for
-        efficiency purposes.
-        """
-        ...
-
-
-    def _delayed_fit(self, args):
-        model, X_train, y_train, verbose = args
-
-        if verbose:
-            print(f"\tFitting {model.__class__.__name__}")
-            start = time()
-
-        model = model.fit(X=X_train, y=y_train)
-
-        if verbose:
-            end = time()
-            print(f"\t\\--Fit ended for {model.__class__.__name__} in {round(end - start, 3)} seconds")
-        return model
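Worth noting in the removed `predict` above: before any error measure is computed, both the real and predicted prevalence vectors are smoothed with a factor eps = 1/(2 * len(X_test)), which keeps divergence-based measures such as KLD finite when a class has zero prevalence. A minimal standalone sketch of that step (the free function `smooth` and the `n_test` parameter are illustrative names, not part of the package API):

import numpy as np

def smooth(values: np.ndarray, n_test: int) -> np.ndarray:
    # Additive smoothing used before computing error measures:
    # p -> (p + eps) / (eps * k + 1), with eps = 1 / (2 * n_test).
    eps = 1.0 / (2 * n_test)
    return (values + eps) / (eps * len(values) + 1)

# A zero entry becomes strictly positive and the vector still sums to 1:
print(smooth(np.array([0.0, 1.0]), n_test=500))  # ~[0.000998, 0.999002]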
--- mlquantify/evaluation/protocol/app.py
+++ /dev/null
@@ -1,146 +0,0 @@
-import numpy as np
-import pandas as pd
-from typing import Union, List
-from sklearn.base import BaseEstimator
-import itertools
-from tqdm import tqdm
-
-from ...utils import generate_artificial_indexes, parallel
-from ...base import Quantifier
-from ._Protocol import Protocol
-
-class APP(Protocol):
-    """Artificial Prevalence Protocol. It splits a test into several
-    samples varying prevalence and sample size, with n iterations.
-    For a list of Quantifiers, it computes training and testing
-    for each one and returns either a table of results with error measures
-    or just the predictions.
-    """
-
-    def __init__(self,
-                 models: Union[List[Union[str, Quantifier]], str, Quantifier],
-                 batch_size: Union[List[int], int],
-                 learner: BaseEstimator = None,
-                 n_prevs: int = 100,
-                 n_iterations: int = 1,
-                 n_jobs: int = 1,
-                 random_state: int = 32,
-                 verbose: bool = False,
-                 return_type: str = "predictions",
-                 measures: List[str] = None):
-
-        super().__init__(models, batch_size, learner, n_iterations, n_jobs, random_state, verbose, return_type, measures)
-        self.n_prevs = n_prevs
-
-    def predict_protocol(self, X_test, y_test) -> tuple:
-        """Generates several samples with artificial prevalences and sizes.
-        For each model, predicts with this sample, aggregating all together
-        with a pandas dataframe if requested, or else just the predictions.
-
-        Args:
-            X_test (array-like): Features of the test set.
-            y_test (array-like): Labels of the test set.
-
-        Returns:
-            tuple: predictions containing the model name, real prev, pred prev, and batch size
-        """
-
-        n_dim = len(np.unique(y_test))
-        prevs = self._generate_artificial_prevalences(n_dim, self.n_prevs, self.n_iterations)
-
-        args = self._generate_args(X_test, y_test, prevs)
-        batch_size = 1
-
-        if isinstance(self.batch_size, list):
-            batch_size = len(self.batch_size)
-
-        size = len(prevs) * len(self.models) * batch_size * self.n_iterations
-
-        predictions = parallel(
-            self._delayed_predict,
-            tqdm(args, desc="Running APP", total=size) if self.verbose else args,
-            n_jobs=self.n_jobs
-        )
-
-        return predictions
-
-
-    def _new_sample(self, X, y, prev: List[float], batch_size: int) -> tuple:
-        """Generates a new sample with a specified prevalence and size.
-
-        Args:
-            X (array-like): Features from which to take the new sample.
-            y (array-like): Labels from which to take the new sample.
-            prev (List[float]): The specified prevalences.
-            batch_size (int): Sample size.
-
-        Returns:
-            tuple: New sample's features and labels.
-        """
-        sample_index = generate_artificial_indexes(y, prev, batch_size, np.unique(y))
-        return np.take(X, sample_index, axis=0), np.take(y, sample_index, axis=0)
-
-
-    def _delayed_predict(self, args) -> tuple:
-        """Predicts on the new sample; delayed so it can run
-        in parallel for efficiency purposes.
-
-        Args:
-            args (Any): arguments to use
-
-        Returns:
-            tuple: returns the (method name, real_prev, pred_prev and sample_size)
-        """
-
-        X, y, model, prev, batch_size, verbose = args
-
-        if verbose:
-            print(f'\t {model.__class__.__name__} with {str(batch_size)} instances and prev {str(prev)}')
-
-        X_sample, _ = self._new_sample(X, y, prev, batch_size)
-        prev_pred = np.asarray(list(model.predict(X=X_sample).values()))
-
-        if verbose:
-            print(f'\t \\--Ending {model.__class__.__name__} with {str(batch_size)} instances and prev {str(prev)} \n')
-
-        return [model.__class__.__name__, prev, prev_pred, batch_size]
-
-
-    def _generate_artificial_prevalences(self, n_dim: int, n_prev: int, n_iter: int) -> np.ndarray:
-        """Generates n artificial prevalences with n dimensions.
-
-        Args:
-            n_dim (int): Number of dimensions for the artificial prevalence.
-            n_prev (int): Number of prevalence points to generate.
-            n_iter (int): Number of iterations.
-
-        Returns:
-            np.ndarray: Generated artificial prevalences.
-        """
-        s = np.linspace(0., 1., n_prev, endpoint=True)
-        prevs = np.array([p + (1 - sum(p),) for p in itertools.product(*(s,) * (n_dim - 1)) if sum(p) <= 1])
-
-        return np.repeat(prevs, n_iter, axis=0) if n_iter > 1 else prevs
-
-
-    def _generate_args(self, X_test, y_test, prevs):
-        """Generates arguments for parallel processing based on the model, prevalence, and batch size.
-
-        Args:
-            X_test (array-like): Features of the test set.
-            y_test (array-like): Labels of the test set.
-            prevs (np.ndarray): Artificial prevalences generated.
-
-        Returns:
-            List[tuple]: List of arguments for parallel processing.
-        """
-        if isinstance(self.batch_size, list):
-            return [(X_test, y_test, model, prev, bs, self.verbose)
-                    for prev in prevs for bs in self.batch_size for model in self.models]
-        else:
-            return [(X_test, y_test, model, prev, self.batch_size, self.verbose)
-                    for prev in prevs for model in self.models]
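The `_generate_artificial_prevalences` helper above is the heart of APP: it places `n_prev` evenly spaced points on [0, 1] for each of the first `n_dim - 1` classes and lets the last class absorb the remainder, so every row is a valid prevalence vector on the simplex. The same construction as a standalone sketch (the free function name is illustrative, not part of the package API):

import itertools
import numpy as np

def artificial_prevalences(n_dim: int, n_prev: int) -> np.ndarray:
    # Grid over the first n_dim - 1 coordinates; keep points whose partial
    # sum is <= 1 and append the remainder as the last class's prevalence.
    s = np.linspace(0.0, 1.0, n_prev, endpoint=True)
    return np.array([p + (1 - sum(p),)
                     for p in itertools.product(*(s,) * (n_dim - 1))
                     if sum(p) <= 1])

# Binary case with 5 points:
# [[0.   1.  ] [0.25 0.75] [0.5  0.5 ] [0.75 0.25] [1.   0.  ]]
print(artificial_prevalences(n_dim=2, n_prev=5))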
--- mlquantify/evaluation/protocol/npp.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from typing import Union, List
-from sklearn.base import BaseEstimator
-
-from ...base import Quantifier
-from ._Protocol import Protocol
-
-
-class NPP(Protocol):
-
-
-    def __init__(self,
-                 models: Union[List[Union[str, Quantifier]], str, Quantifier],
-                 batch_size: Union[List[int], int],
-                 learner: BaseEstimator = None,
-                 n_iterations: int = 1,
-                 n_jobs: int = 1,
-                 random_state: int = 32,
-                 verbose: bool = False,
-                 return_type: str = "predictions",
-                 measures: List[str] = None):
-
-        super().__init__(models, batch_size, learner, n_iterations, n_jobs, random_state, verbose, return_type, measures)
-
-
-    def predict_protocol(self, X_test, y_test) -> tuple:
-        raise NotImplementedError
-
-
-    def _new_sample(self, X, y, prev: List[float], batch_size: int) -> tuple:
-        raise NotImplementedError
-
-
-    def _delayed_predict(self, args) -> tuple:
-        raise NotImplementedError
--- mlquantify/methods/aggregative/ThreholdOptm/_ThreholdOptimization.py
+++ /dev/null
@@ -1,62 +0,0 @@
-from abc import abstractmethod
-import numpy as np
-from sklearn.base import BaseEstimator
-
-from ....base import AggregativeQuantifier
-from ....utils import adjust_threshold, get_scores
-
-class ThresholdOptimization(AggregativeQuantifier):
-    """Generic class for methods that are based on adjustments
-    of the decision boundary of the underlying classifier in order
-    to make the ACC (base method for threshold methods) estimation
-    more numerically stable. Most of its strategies involve changing
-    the behavior of the denominator of the ACC equation.
-    """
-    # Class for optimizing classification thresholds
-
-    def __init__(self, learner: BaseEstimator):
-        self.learner = learner
-        self.threshold = None
-        self.cc_output = None
-        self.tpr = None
-        self.fpr = None
-
-    @property
-    def multiclass_method(self) -> bool:
-        """ All threshold methods are binary, i.e. non multiclass """
-        return False
-
-    def _fit_method(self, X, y):
-
-        y_labels, probabilities = get_scores(X, y, self.learner, self.cv_folds, self.learner_fitted)
-
-        # Adjust thresholds and compute true and false positive rates
-        thresholds, tprs, fprs = adjust_threshold(y_labels, probabilities[:, 1], self.classes)
-
-        # Find the best threshold based on TPR and FPR
-        self.threshold, self.tpr, self.fpr = self.best_tprfpr(thresholds, tprs, fprs)
-
-        return self
-
-    def _predict_method(self, X) -> dict:
-
-        probabilities = self.learner.predict_proba(X)[:, 1]
-
-        # Compute the classification count output
-        self.cc_output = len(probabilities[probabilities >= self.threshold]) / len(probabilities)
-
-        # Calculate prevalence, ensuring it's within [0, 1]
-        if self.tpr - self.fpr == 0:
-            prevalence = self.cc_output
-        else:
-            # Equation of all threshold methods to compute prevalence
-            prevalence = np.clip((self.cc_output - self.fpr) / (self.tpr - self.fpr), 0, 1)
-
-        prevalences = [1 - prevalence, prevalence]
-
-        return np.asarray(prevalences)
-
-    @abstractmethod
-    def best_tprfpr(self, thresholds: np.ndarray, tpr: np.ndarray, fpr: np.ndarray) -> float:
-        """Abstract method for determining the best TPR and FPR to use in the equation"""
-        ...
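All threshold methods above share the same estimation equation: the raw classify-and-count rate is corrected by the chosen tpr/fpr pair as prevalence = (cc_output - fpr) / (tpr - fpr), clipped to [0, 1]. A worked numeric sketch of that correction (a standalone function with fabricated numbers, not the package API):

import numpy as np

def adjusted_count(cc_output: float, tpr: float, fpr: float) -> float:
    # ACC-style correction shared by the threshold methods; falls back to
    # the raw classify-and-count rate when tpr == fpr (degenerate denominator).
    if tpr - fpr == 0:
        return cc_output
    return float(np.clip((cc_output - fpr) / (tpr - fpr), 0.0, 1.0))

# If the classifier flags 40% of the test set as positive, but on training
# data it showed tpr = 0.8 and fpr = 0.1, the corrected prevalence is:
print(adjusted_count(cc_output=0.40, tpr=0.8, fpr=0.1))  # (0.4-0.1)/(0.8-0.1) ~= 0.4286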
--- mlquantify/methods/aggregative/ThreholdOptm/acc.py
+++ /dev/null
@@ -1,27 +0,0 @@
-
-import numpy as np
-from sklearn.base import BaseEstimator
-
-from ._ThreholdOptimization import ThresholdOptimization
-
-class ACC(ThresholdOptimization):
-    """ Adjusted Classify and Count or Adjusted Count. It is the
-    base method for the threshold methods.
-    As described on the Threshold base class, this method
-    estimates the true positive and false positive rates from
-    the training data and utilizes them to adjust the output
-    of the CC method.
-    """
-
-    def __init__(self, learner: BaseEstimator, threshold: float = 0.5):
-        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
-        super().__init__(learner)
-        self.threshold = threshold
-
-
-    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
-        # Get the tpr and fpr where the threshold is equal to the base threshold, default is 0.5
-
-        tpr = tprs[thresholds == self.threshold][0]
-        fpr = fprs[thresholds == self.threshold][0]
-        return (self.threshold, tpr, fpr)
--- mlquantify/methods/aggregative/ThreholdOptm/max.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import numpy as np
-from sklearn.base import BaseEstimator
-
-from ._ThreholdOptimization import ThresholdOptimization
-
-class MAX(ThresholdOptimization):
-    """ Threshold MAX. This method uses the
-    threshold that maximizes the difference between
-    tpr and fpr, to be used in the denominator of the equation.
-    """
-
-    def __init__(self, learner: BaseEstimator):
-        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
-        super().__init__(learner)
-
-
-    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
-        max_index = np.argmax(np.abs(tprs - fprs))
-
-        threshold = thresholds[max_index]
-        tpr = tprs[max_index]
-        fpr = fprs[max_index]
-        return (threshold, tpr, fpr)
--- mlquantify/methods/aggregative/ThreholdOptm/ms.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import numpy as np
-from sklearn.base import BaseEstimator
-
-from ._ThreholdOptimization import ThresholdOptimization
-
-class MS(ThresholdOptimization):
-    """ Median Sweep. This method uses an
-    ensemble of such threshold-based methods and
-    takes the median prediction.
-    """
-
-    def __init__(self, learner: BaseEstimator, threshold: float = 0.5):
-        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
-        super().__init__(learner)
-        self.threshold = threshold
-
-
-    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
-        tpr = np.median(tprs)
-        fpr = np.median(fprs)
-        return (self.threshold, tpr, fpr)
--- mlquantify/methods/aggregative/ThreholdOptm/ms2.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import numpy as np
-from sklearn.base import BaseEstimator
-
-from ._ThreholdOptimization import ThresholdOptimization
-
-class MS2(ThresholdOptimization):
-    """ Median Sweep 2. It relies on the same
-    strategy as the Median Sweep, but computes
-    the median only for cases in which
-    tpr - fpr > 0.25.
-    """
-
-    def __init__(self, learner: BaseEstimator):
-        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
-        super().__init__(learner)
-
-
-    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
-        indices = np.where(np.abs(tprs - fprs) > 0.25)[0]
-
-        threshold = np.median(thresholds[indices])
-        tpr = np.median(tprs[indices])
-        fpr = np.median(fprs[indices])
-
-        return (threshold, tpr, fpr)
--- mlquantify/methods/aggregative/ThreholdOptm/pacc.py
+++ /dev/null
@@ -1,41 +0,0 @@
-
-import numpy as np
-from sklearn.base import BaseEstimator
-
-from ._ThreholdOptimization import ThresholdOptimization
-
-class PACC(ThresholdOptimization):
-    """ Probabilistic Adjusted Classify and Count.
-    This method adapts the AC approach by using average
-    class-conditional confidences from a probabilistic
-    classifier instead of true positive and false positive rates.
-    """
-
-    def __init__(self, learner: BaseEstimator, threshold: float = 0.5):
-        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
-        super().__init__(learner)
-        self.threshold = threshold
-
-    def _predict_method(self, X):
-        prevalences = {}
-
-        probabilities = self.learner.predict_proba(X)[:, 1]
-
-        mean_scores = np.mean(probabilities)
-
-        if self.tpr - self.fpr == 0:
-            prevalence = mean_scores
-        else:
-            prevalence = np.clip(abs(mean_scores - self.fpr) / (self.tpr - self.fpr), 0, 1)
-
-        prevalences[self.classes[1]] = prevalence
-        prevalences[self.classes[0]] = 1 - prevalence
-
-        return prevalences
-
-
-    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
-        tpr = tprs[thresholds == self.threshold][0]
-        fpr = fprs[thresholds == self.threshold][0]
-        return (self.threshold, tpr, fpr)
--- mlquantify/methods/aggregative/ThreholdOptm/t50.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import numpy as np
-from sklearn.base import BaseEstimator
-
-from ._ThreholdOptimization import ThresholdOptimization
-
-class T50(ThresholdOptimization):
-    """ Threshold 50. This method uses
-    the threshold where tpr = 0.5.
-    """
-
-    def __init__(self, learner: BaseEstimator):
-        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
-        super().__init__(learner)
-
-
-    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
-        min_index = np.argmin(np.abs(tprs - 0.5))
-        threshold = thresholds[min_index]
-        tpr = tprs[min_index]
-        fpr = fprs[min_index]
-        return (threshold, tpr, fpr)
--- mlquantify/methods/aggregative/ThreholdOptm/x.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import numpy as np
-from sklearn.base import BaseEstimator
-
-from ._ThreholdOptimization import ThresholdOptimization
-
-class X_method(ThresholdOptimization):
-    """ Threshold X. This method uses
-    the threshold where fpr = 1 - tpr.
-    """
-
-    def __init__(self, learner: BaseEstimator):
-        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
-        super().__init__(learner)
-
-
-    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
-        min_index = np.argmin(abs(1 - (tprs + fprs)))
-
-        threshold = thresholds[min_index]
-        tpr = tprs[min_index]
-        fpr = fprs[min_index]
-
-        return (threshold, tpr, fpr)
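Taken together, the subclasses removed above differ only in how they pick a (threshold, tpr, fpr) triple from the per-threshold rate arrays that `adjust_threshold` produces. A toy sketch contrasting the MAX, T50, and X selection rules on fabricated arrays (the curves are made up for illustration):

import numpy as np

thresholds = np.linspace(0.0, 1.0, 11)
tprs = np.linspace(1.0, 0.0, 11)        # fabricated monotone tpr curve
fprs = np.linspace(1.0, 0.0, 11) ** 3   # fabricated convex fpr curve

i_max = np.argmax(np.abs(tprs - fprs))         # MAX: widest tpr - fpr gap
i_t50 = np.argmin(np.abs(tprs - 0.5))          # T50: tpr closest to 0.5
i_x   = np.argmin(np.abs(1 - (tprs + fprs)))   # X: fpr closest to 1 - tpr

for name, i in [("MAX", i_max), ("T50", i_t50), ("X", i_x)]:
    print(f"{name}: threshold={thresholds[i]:.1f}, tpr={tprs[i]:.2f}, fpr={fprs[i]:.2f}")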