mlquantify 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. mlquantify/__init__.py +6 -0
  2. mlquantify/base.py +256 -0
  3. mlquantify/classification/__init__.py +1 -0
  4. mlquantify/classification/pwkclf.py +73 -0
  5. mlquantify/evaluation/__init__.py +2 -0
  6. mlquantify/evaluation/measures/__init__.py +26 -0
  7. mlquantify/evaluation/measures/ae.py +11 -0
  8. mlquantify/evaluation/measures/bias.py +16 -0
  9. mlquantify/evaluation/measures/kld.py +8 -0
  10. mlquantify/evaluation/measures/mse.py +12 -0
  11. mlquantify/evaluation/measures/nae.py +16 -0
  12. mlquantify/evaluation/measures/nkld.py +13 -0
  13. mlquantify/evaluation/measures/nrae.py +16 -0
  14. mlquantify/evaluation/measures/rae.py +12 -0
  15. mlquantify/evaluation/measures/se.py +12 -0
  16. mlquantify/evaluation/protocol/_Protocol.py +202 -0
  17. mlquantify/evaluation/protocol/__init__.py +2 -0
  18. mlquantify/evaluation/protocol/app.py +146 -0
  19. mlquantify/evaluation/protocol/npp.py +34 -0
  20. mlquantify/methods/__init__.py +40 -0
  21. mlquantify/methods/aggregative/ThreholdOptm/_ThreholdOptimization.py +62 -0
  22. mlquantify/methods/aggregative/ThreholdOptm/__init__.py +7 -0
  23. mlquantify/methods/aggregative/ThreholdOptm/acc.py +27 -0
  24. mlquantify/methods/aggregative/ThreholdOptm/max.py +23 -0
  25. mlquantify/methods/aggregative/ThreholdOptm/ms.py +21 -0
  26. mlquantify/methods/aggregative/ThreholdOptm/ms2.py +25 -0
  27. mlquantify/methods/aggregative/ThreholdOptm/pacc.py +41 -0
  28. mlquantify/methods/aggregative/ThreholdOptm/t50.py +21 -0
  29. mlquantify/methods/aggregative/ThreholdOptm/x.py +23 -0
  30. mlquantify/methods/aggregative/__init__.py +9 -0
  31. mlquantify/methods/aggregative/cc.py +32 -0
  32. mlquantify/methods/aggregative/emq.py +86 -0
  33. mlquantify/methods/aggregative/fm.py +72 -0
  34. mlquantify/methods/aggregative/gac.py +96 -0
  35. mlquantify/methods/aggregative/gpac.py +87 -0
  36. mlquantify/methods/aggregative/mixtureModels/_MixtureModel.py +81 -0
  37. mlquantify/methods/aggregative/mixtureModels/__init__.py +5 -0
  38. mlquantify/methods/aggregative/mixtureModels/dys.py +55 -0
  39. mlquantify/methods/aggregative/mixtureModels/dys_syn.py +89 -0
  40. mlquantify/methods/aggregative/mixtureModels/hdy.py +46 -0
  41. mlquantify/methods/aggregative/mixtureModels/smm.py +27 -0
  42. mlquantify/methods/aggregative/mixtureModels/sord.py +77 -0
  43. mlquantify/methods/aggregative/pcc.py +33 -0
  44. mlquantify/methods/aggregative/pwk.py +38 -0
  45. mlquantify/methods/meta/__init__.py +1 -0
  46. mlquantify/methods/meta/ensemble.py +236 -0
  47. mlquantify/methods/non_aggregative/__init__.py +1 -0
  48. mlquantify/methods/non_aggregative/hdx.py +71 -0
  49. mlquantify/model_selection.py +232 -0
  50. mlquantify/plots/__init__.py +2 -0
  51. mlquantify/plots/distribution_plot.py +109 -0
  52. mlquantify/plots/protocol_plot.py +157 -0
  53. mlquantify/utils/__init__.py +2 -0
  54. mlquantify/utils/general_purposes/__init__.py +8 -0
  55. mlquantify/utils/general_purposes/convert_col_to_array.py +13 -0
  56. mlquantify/utils/general_purposes/generate_artificial_indexes.py +29 -0
  57. mlquantify/utils/general_purposes/get_real_prev.py +9 -0
  58. mlquantify/utils/general_purposes/load_quantifier.py +4 -0
  59. mlquantify/utils/general_purposes/make_prevs.py +23 -0
  60. mlquantify/utils/general_purposes/normalize.py +20 -0
  61. mlquantify/utils/general_purposes/parallel.py +10 -0
  62. mlquantify/utils/general_purposes/round_protocol_df.py +14 -0
  63. mlquantify/utils/method_purposes/__init__.py +6 -0
  64. mlquantify/utils/method_purposes/distances.py +21 -0
  65. mlquantify/utils/method_purposes/getHist.py +13 -0
  66. mlquantify/utils/method_purposes/get_scores.py +33 -0
  67. mlquantify/utils/method_purposes/moss.py +16 -0
  68. mlquantify/utils/method_purposes/ternary_search.py +14 -0
  69. mlquantify/utils/method_purposes/tprfpr.py +42 -0
  70. mlquantify-0.0.1.dist-info/METADATA +23 -0
  71. mlquantify-0.0.1.dist-info/RECORD +73 -0
  72. mlquantify-0.0.1.dist-info/WHEEL +5 -0
  73. mlquantify-0.0.1.dist-info/top_level.txt +1 -0
mlquantify/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from .classification import *
+ from .evaluation import *
+ from .methods import *
+ from .utils import *
+ from .plots import *
+ from .model_selection import GridSearchQ
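
The wildcard imports above flatten the subpackage APIs at the package root; GridSearchQ is the only name re-exported explicitly, from model_selection. A minimal import sketch, assuming the wheel is installed under the name mlquantify (only names visible in this diff are used):

import mlquantify as mq               # classification, evaluation, methods, utils and plots at the top level
from mlquantify import GridSearchQ    # re-exported from mlquantify.model_selection
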
mlquantify/base.py ADDED
@@ -0,0 +1,256 @@
+ from abc import abstractmethod, ABC
+ from sklearn.base import BaseEstimator
+ from copy import deepcopy
+ import numpy as np
+ import joblib
+
+
+ from .utils import parallel, normalize_prevalence
+
+ class Quantifier(ABC, BaseEstimator):
+     """Abstract base class for quantifiers."""
+
+     @abstractmethod
+     def fit(self, X, y) -> object: ...
+
+     @abstractmethod
+     def predict(self, X) -> dict: ...
+
+     @property
+     def classes(self) -> list:
+         return self._classes
+
+     @classes.setter
+     def classes(self, classes):
+         self._classes = sorted(list(classes))
+
+     @property
+     def n_class(self) -> int:
+         return len(self._classes)
+
+     @property
+     def multiclass_method(self) -> bool:
+         return True
+
+     @property
+     def binary_data(self) -> bool:
+         return len(self._classes) == 2
+
+
+     def save_quantifier(self, path: str = None) -> None:
+         if not path:
+             path = f"{self.__class__.__name__}.joblib"
+         joblib.dump(self, path)
+
+
+
+ class AggregativeQuantifier(Quantifier, ABC):
+     """Abstract base class for aggregative quantifiers, i.e. quantifiers that use a
+     learner (typically a classifier) to generate the predictions they aggregate.
+     It also detects whether the problem is binary or multiclass, and applies a
+     One-vs-All decomposition when the data are multiclass but the method is binary-only.
+     """
+
+
+     def __init__(self):
+         # Dictionary to hold binary quantifiers for each class.
+         self.binary_quantifiers = {}
+         self.learner_fitted = False
+         self.cv_folds = 10
+
+     def fit(self, X, y, learner_fitted=False, cv_folds: int = 10, n_jobs: int = 1):
+         """Fit the quantifier model.
+
+         Args:
+             X (array-like): Training features.
+             y (array-like): Training labels.
+             learner_fitted (bool, optional): Whether the learner is already fitted. Defaults to False.
+             cv_folds (int, optional): Number of cross-validation folds. Defaults to 10.
+             n_jobs (int, optional): Number of parallel jobs. Defaults to 1.
+
+         Returns:
+             self: Fitted quantifier.
+         """
+         self.n_jobs = n_jobs
+         self.learner_fitted = learner_fitted
+         self.cv_folds = cv_folds
+
+         self.classes = np.unique(y)
+         if self.binary_data or self.multiclass_method:
+             return self._fit_method(X, y)
+
+         # One-vs-All decomposition
+         self.binary_quantifiers = {class_: deepcopy(self) for class_ in self.classes}
+         parallel(self.delayed_fit, self.classes, self.n_jobs, X, y)
+
+         return self
+
+     def predict(self, X) -> dict:
+         """Predict class prevalences for the given data.
+
+         Args:
+             X (array-like): Test features.
+
+         Returns:
+             dict: Dictionary with class prevalences.
+         """
+         if self.binary_data or self.multiclass_method:
+             prevalences = self._predict_method(X)
+             return normalize_prevalence(prevalences, self.classes)
+
+         # One-vs-All decomposition
+         prevalences = np.asarray(parallel(self.delayed_predict, self.classes, self.n_jobs, X))
+         return normalize_prevalence(prevalences, self.classes)
+
+     @abstractmethod
+     def _fit_method(self, X, y):
+         """Abstract fit method that each quantification method must implement.
+
+         Args:
+             X (array-like): Training features.
+             y (array-like): Training labels.
+         """
+         ...
+
+     @abstractmethod
+     def _predict_method(self, X) -> dict:
+         """Abstract predict method that each quantification method must implement.
+
+         Args:
+             X (array-like): Test data to generate class prevalences.
+
+         Returns:
+             dict: Dictionary with class:prevalence for each class.
+         """
+         ...
+
+     @property
+     def learner(self):
+         return self.learner_
+
+     @learner.setter
+     def learner(self, value):
+         self.learner_ = value
+
+
+     def get_params(self, deep=True):
+         return self.learner.get_params()
+
+     def set_params(self, **params):
+         # Model parameters
+         for key, value in params.items():
+             if hasattr(self, key):
+                 setattr(self, key, value)
+
+         # Learner parameters
+         if self.learner:
+             learner_params = {k.replace('learner__', ''): v for k, v in params.items() if 'learner__' in k}
+             if learner_params:
+                 self.learner.set_params(**learner_params)
+
+         return self
+
+
+     # MULTICLASS METHODS
+
+     def delayed_fit(self, class_, X, y):
+         """Delayed fit method for the One-vs-All strategy, run in parallel.
+
+         Args:
+             class_ (Any): The class for which the model is being fitted.
+             X (array-like): Training features.
+             y (array-like): Training labels.
+
+         Returns:
+             self: Fitted binary quantifier for the given class.
+         """
+         y_class = (y == class_).astype(int)
+         return self.binary_quantifiers[class_].fit(X, y_class)
+
+     def delayed_predict(self, class_, X):
+         """Delayed predict method for the One-vs-All strategy, run in parallel.
+
+         Args:
+             class_ (Any): The class for which the model is making predictions.
+             X (array-like): Test features.
+
+         Returns:
+             float: Predicted prevalence for the given class.
+         """
+         return self.binary_quantifiers[class_].predict(X)[1]
+
+
+ class NonAggregativeQuantifier(Quantifier):
+     """Abstract base class for non-aggregative quantifiers, i.e. methods that do not
+     rely on a classifier or any specific learner to produce their
+     predictions.
+     """
+
+
+     def fit(self, X, y, n_jobs: int = 1):
+         """Fit the quantifier model.
+
+         Args:
+             X (array-like): Training features.
+             y (array-like): Training labels.
+             n_jobs (int, optional): Number of parallel jobs. Defaults to 1.
+
+         Returns:
+             self: Fitted quantifier.
+         """
+         self.n_jobs = n_jobs
+         self.classes = np.unique(y)
+         if self.binary_data or self.multiclass_method:
+             return self._fit_method(X, y)
+
+         # One-vs-All decomposition
+         self.binary_quantifiers = {class_: deepcopy(self) for class_ in self.classes}
+         parallel(self.delayed_fit, self.classes, self.n_jobs, X, y)
+
+         return self
+
+     def predict(self, X) -> dict:
+         """Predict class prevalences for the given data.
+
+         Args:
+             X (array-like): Test features.
+
+         Returns:
+             dict: Dictionary with class prevalences.
+         """
+         if self.binary_data or self.multiclass_method:
+             prevalences = self._predict_method(X)
+             return normalize_prevalence(prevalences, self.classes)
+
+         # One-vs-All decomposition
+         prevalences = np.asarray(parallel(self.delayed_predict, self.classes, self.n_jobs, X))
+         return normalize_prevalence(prevalences, self.classes)
+
+
+     @abstractmethod
+     def _fit_method(self, X, y):
+         """Abstract fit method that each quantification method must implement.
+
+         Args:
+             X (array-like): Training features.
+             y (array-like): Training labels.
+         """
+         ...
+
+     @abstractmethod
+     def _predict_method(self, X) -> dict:
+         """Abstract predict method that each quantification method must implement.
+
+         Args:
+             X (array-like): Test data to generate class prevalences.
+
+         Returns:
+             dict: Dictionary with class:prevalence for each class.
+         """
+         ...
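
A concrete aggregative method only has to supply the two abstract hooks _fit_method and _predict_method; the base-class fit and predict take care of class detection, the One-vs-All decomposition, and prevalence normalization. Below is a minimal, hypothetical sketch (CountBasedQuantifier is illustrative only and is not part of the package; it assumes a scikit-learn style learner such as LogisticRegression):

import numpy as np
from sklearn.linear_model import LogisticRegression
from mlquantify.base import AggregativeQuantifier

class CountBasedQuantifier(AggregativeQuantifier):
    """Illustrative classify-and-count style quantifier (not shipped with mlquantify)."""

    def __init__(self, learner=None):
        super().__init__()
        # Any estimator with fit/predict would do; LogisticRegression is only an example.
        self.learner = learner if learner is not None else LogisticRegression()

    def _fit_method(self, X, y):
        # learner_fitted is set by AggregativeQuantifier.fit before this hook runs
        if not self.learner_fitted:
            self.learner.fit(X, y)
        return self

    def _predict_method(self, X) -> dict:
        # Predicted prevalence of each class = fraction of test items assigned to it
        labels = self.learner.predict(X)
        return {c: float(np.mean(labels == c)) for c in self.classes}

Used as quantifier = CountBasedQuantifier(); quantifier.fit(X_train, y_train); quantifier.predict(X_test), this yields a dictionary mapping each class to an estimated prevalence, post-processed by normalize_prevalence.
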
mlquantify/classification/__init__.py ADDED
@@ -0,0 +1 @@
+ from .pwkclf import PWKCLF
mlquantify/classification/pwkclf.py ADDED
@@ -0,0 +1,73 @@
+ from sklearn.neighbors import NearestNeighbors
+ from sklearn.base import BaseEstimator
+ import numpy as np
+ import pandas as pd
+
+ class PWKCLF(BaseEstimator):
+     """Learner based on k-Nearest Neighbors (KNN), intended for use with the PWK
+     quantification method, which is itself KNN-based.
+     """
+
+
+     def __init__(self,
+                  alpha=1,
+                  n_neighbors=10,
+                  algorithm="auto",
+                  metric="euclidean",
+                  leaf_size=30,
+                  p=2,
+                  metric_params=None,
+                  n_jobs=None):
+
+         if alpha < 1:
+             raise ValueError("alpha must not be smaller than 1")
+
+         self.alpha = alpha
+         self.n_neighbors = n_neighbors
+
+         self.nbrs = NearestNeighbors(n_neighbors=n_neighbors,
+                                      algorithm=algorithm,
+                                      leaf_size=leaf_size,
+                                      metric=metric,
+                                      p=p,
+                                      metric_params=metric_params,
+                                      n_jobs=n_jobs)
+
+         self.Y = None
+         self.Y_map = None
+         self.w = None
+         self.y = None
+
+     def fit(self, X, y):
+         n_samples = X.shape[0]
+         if n_samples < self.n_neighbors:
+             self.nbrs.set_params(n_neighbors=n_samples)
+
+         self.y = y
+
+         if isinstance(y, pd.DataFrame):
+             self.y = y.reset_index(drop=True)
+
+         Y_cts = np.unique(y, return_counts=True)
+         self.Y = Y_cts[0]
+         self.Y_map = dict(zip(self.Y, range(len(self.Y))))
+
+         min_class_count = np.min(Y_cts[1])
+         self.w = (Y_cts[1] / min_class_count) ** (-1.0 / self.alpha)
+         self.nbrs.fit(X)
+         return self
+
+     def predict(self, X):
+         n_samples = X.shape[0]
+         nn_indices = self.nbrs.kneighbors(X, return_distance=False)
+
+         CM = np.zeros((n_samples, len(self.Y)))
+
+         for i in range(n_samples):
+             for j in nn_indices[i]:
+                 CM[i, self.Y_map[self.y[j]]] += 1
+
+         CM = np.multiply(CM, self.w)
+         predictions = np.apply_along_axis(np.argmax, axis=1, arr=CM)
+
+         return self.Y[predictions]
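
PWKCLF follows the scikit-learn estimator API, so it is fitted and queried like any other classifier; alpha (which must be at least 1) controls how strongly the class-frequency weights w down-weight neighbors from majority classes, with larger alpha giving milder re-weighting. A small usage sketch on synthetic data (shapes and values are illustrative only):

import numpy as np
from mlquantify.classification import PWKCLF

rng = np.random.default_rng(0)
X_train = rng.normal(size=(100, 5))
y_train = rng.integers(0, 2, size=100)    # binary labels 0/1
X_test = rng.normal(size=(20, 5))

clf = PWKCLF(alpha=2, n_neighbors=10)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)              # array of predicted class labels
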
mlquantify/evaluation/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .measures import *
+ from .protocol import *
mlquantify/evaluation/measures/__init__.py ADDED
@@ -0,0 +1,26 @@
+ from .ae import absolute_error
+ from .kld import kullback_leibler_divergence
+ from .nkld import normalized_kullback_leibler_divergence
+ from .rae import relative_absolute_error
+ from .nae import normalized_absolute_error
+ from .bias import bias
+ from .nrae import normalized_relative_absolute_error
+ from .se import squared_error
+ from .mse import mean_squared_error
+
+
+
+ MEASURES = {
+     "ae": absolute_error,
+     "nae": normalized_absolute_error,
+     "kld": kullback_leibler_divergence,
+     "nkld": normalized_kullback_leibler_divergence,
+     "nrae": normalized_relative_absolute_error,
+     "rae": relative_absolute_error,
+     "se": squared_error,
+     "mse": mean_squared_error
+ }
+
+
+ def get_measure(measure: str):
+     return MEASURES.get(measure)
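
get_measure resolves a measure by its short key in MEASURES and returns None for unknown keys; note that bias is imported but not registered in the dictionary, so it can only be called directly. A short usage sketch (prevalence values are illustrative):

from mlquantify.evaluation.measures import get_measure

real = {"neg": 0.7, "pos": 0.3}
pred = {"neg": 0.6, "pos": 0.4}   # dict values are read positionally, so keep the same class order

ae = get_measure("ae")            # the absolute_error function
print(ae(real, pred))             # ~0.1, the mean absolute per-class difference
print(get_measure("unknown"))     # None
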
mlquantify/evaluation/measures/ae.py ADDED
@@ -0,0 +1,11 @@
+ import numpy as np
+
+ def absolute_error(prev_real: np.any, prev_pred: np.any):
+     if isinstance(prev_real, dict):
+         prev_real = np.asarray(list(prev_real.values()))
+     if isinstance(prev_pred, dict):
+         prev_pred = np.asarray(list(prev_pred.values()))
+
+     abs_error = abs(prev_pred - prev_real).mean(axis=-1)
+
+     return abs_error
mlquantify/evaluation/measures/bias.py ADDED
@@ -0,0 +1,16 @@
+ import numpy as np
+
+ def bias(prev_real: np.any, prev_pred: np.any):
+     classes = None
+     if isinstance(prev_real, dict):
+         classes = prev_real.keys()
+         prev_real = np.asarray(list(prev_real.values()))
+     if isinstance(prev_pred, dict):
+         prev_pred = np.asarray(list(prev_pred.values()))
+
+     abs_errors = abs(prev_pred - prev_real)
+
+     if classes:
+         return {class_: abs_error for class_, abs_error in zip(classes, abs_errors)}
+
+     return abs_errors
mlquantify/evaluation/measures/kld.py ADDED
@@ -0,0 +1,8 @@
+ import numpy as np
+
+ def kullback_leibler_divergence(prev_real: np.any, prev_pred: np.any):
+     if isinstance(prev_real, dict):
+         prev_real = np.asarray(list(prev_real.values()))
+     if isinstance(prev_pred, dict):
+         prev_pred = np.asarray(list(prev_pred.values()))
+     return prev_real * abs(np.log((prev_real / prev_pred)))
mlquantify/evaluation/measures/mse.py ADDED
@@ -0,0 +1,12 @@
+ import numpy as np
+ from .se import squared_error
+
+ def mean_squared_error(prev_real: np.any, prev_pred: np.any):
+     if isinstance(prev_real, dict):
+         prev_real = np.asarray(list(prev_real.values()))
+     if isinstance(prev_pred, dict):
+         prev_pred = np.asarray(list(prev_pred.values()))
+
+     mean_sq_error = squared_error(prev_real, prev_pred).mean()
+
+     return mean_sq_error
mlquantify/evaluation/measures/nae.py ADDED
@@ -0,0 +1,16 @@
+ import numpy as np
+ from .ae import absolute_error
+
+ def normalized_absolute_error(prev_real: np.any, prev_pred: np.any):
+     if isinstance(prev_real, dict):
+         prev_real = np.asarray(list(prev_real.values()))
+     if isinstance(prev_pred, dict):
+         prev_pred = np.asarray(list(prev_pred.values()))
+
+     abs_error = absolute_error(prev_real, prev_pred)
+
+     z_abs_error = (2 * (1 - min(prev_real)))
+
+     normalized = abs_error / z_abs_error
+
+     return normalized
mlquantify/evaluation/measures/nkld.py ADDED
@@ -0,0 +1,13 @@
+ import numpy as np
+ from .kld import kullback_leibler_divergence
+
+ def normalized_kullback_leibler_divergence(prev_real: np.any, prev_pred: np.any):
+     if isinstance(prev_real, dict):
+         prev_real = np.asarray(list(prev_real.values()))
+     if isinstance(prev_pred, dict):
+         prev_pred = np.asarray(list(prev_pred.values()))
+
+     euler = np.exp(kullback_leibler_divergence(prev_real, prev_pred))
+     normalized = 2 * (euler / (euler + 1)) - 1
+
+     return normalized
mlquantify/evaluation/measures/nrae.py ADDED
@@ -0,0 +1,16 @@
+ import numpy as np
+ from .rae import relative_absolute_error
+
+ def normalized_relative_absolute_error(prev_real: np.any, prev_pred: np.any):
+     if isinstance(prev_real, dict):
+         prev_real = np.asarray(list(prev_real.values()))
+     if isinstance(prev_pred, dict):
+         prev_pred = np.asarray(list(prev_pred.values()))
+
+     relative = relative_absolute_error(prev_real, prev_pred)
+
+     z_relative = (len(prev_real) - 1 + ((1 - min(prev_real)) / min(prev_real))) / len(prev_real)
+
+     normalized = relative / z_relative
+
+     return normalized
mlquantify/evaluation/measures/rae.py ADDED
@@ -0,0 +1,12 @@
+ import numpy as np
+ from .ae import absolute_error
+
+ def relative_absolute_error(prev_real: np.any, prev_pred: np.any):
+     if isinstance(prev_real, dict):
+         prev_real = np.asarray(list(prev_real.values()))
+     if isinstance(prev_pred, dict):
+         prev_pred = np.asarray(list(prev_pred.values()))
+
+     relative = (absolute_error(prev_real, prev_pred) / prev_real).mean(axis=-1)
+
+     return relative
mlquantify/evaluation/measures/se.py ADDED
@@ -0,0 +1,12 @@
+ import numpy as np
+ from .ae import absolute_error
+
+ def squared_error(prev_real: np.any, prev_pred: np.any):
+     if isinstance(prev_real, dict):
+         prev_real = np.asarray(list(prev_real.values()))
+     if isinstance(prev_pred, dict):
+         prev_pred = np.asarray(list(prev_pred.values()))
+
+     sq_abs_error = ((prev_pred - prev_real) ** 2).mean(axis=-1)
+
+     return sq_abs_error
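
squared_error averages the squared per-class differences along the last axis, and mean_squared_error (mse.py above) averages the squared_error output, so the two coincide on a single pair of prevalence vectors and differ only when a batch of vectors is evaluated at once. A short sketch (values are illustrative):

import numpy as np
from mlquantify.evaluation.measures import squared_error, mean_squared_error

real = {"neg": 0.7, "pos": 0.3}
pred = {"neg": 0.6, "pos": 0.4}
print(squared_error(real, pred))          # ~0.01
print(mean_squared_error(real, pred))     # ~0.01, identical for a single vector pair

real_batch = np.array([[0.7, 0.3], [0.5, 0.5]])   # one true prevalence vector per row
pred_batch = np.array([[0.6, 0.4], [0.4, 0.6]])
print(squared_error(real_batch, pred_batch))      # ~[0.01, 0.01], one value per row
print(mean_squared_error(real_batch, pred_batch)) # ~0.01, averaged over rows
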