mlquantify 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. mlquantify/__init__.py +0 -29
  2. mlquantify/adjust_counting/__init__.py +14 -0
  3. mlquantify/adjust_counting/_adjustment.py +365 -0
  4. mlquantify/adjust_counting/_base.py +247 -0
  5. mlquantify/adjust_counting/_counting.py +145 -0
  6. mlquantify/adjust_counting/_utils.py +114 -0
  7. mlquantify/base.py +117 -519
  8. mlquantify/base_aggregative.py +209 -0
  9. mlquantify/calibration.py +1 -0
  10. mlquantify/confidence.py +335 -0
  11. mlquantify/likelihood/__init__.py +5 -0
  12. mlquantify/likelihood/_base.py +161 -0
  13. mlquantify/likelihood/_classes.py +414 -0
  14. mlquantify/meta/__init__.py +1 -0
  15. mlquantify/meta/_classes.py +761 -0
  16. mlquantify/metrics/__init__.py +21 -0
  17. mlquantify/metrics/_oq.py +109 -0
  18. mlquantify/metrics/_rq.py +98 -0
  19. mlquantify/{evaluation/measures.py → metrics/_slq.py} +43 -28
  20. mlquantify/mixture/__init__.py +7 -0
  21. mlquantify/mixture/_base.py +153 -0
  22. mlquantify/mixture/_classes.py +400 -0
  23. mlquantify/mixture/_utils.py +112 -0
  24. mlquantify/model_selection/__init__.py +9 -0
  25. mlquantify/model_selection/_protocol.py +358 -0
  26. mlquantify/model_selection/_search.py +315 -0
  27. mlquantify/model_selection/_split.py +1 -0
  28. mlquantify/multiclass.py +350 -0
  29. mlquantify/neighbors/__init__.py +9 -0
  30. mlquantify/neighbors/_base.py +198 -0
  31. mlquantify/neighbors/_classes.py +159 -0
  32. mlquantify/{classification/methods.py → neighbors/_classification.py} +48 -66
  33. mlquantify/neighbors/_kde.py +270 -0
  34. mlquantify/neighbors/_utils.py +135 -0
  35. mlquantify/neural/__init__.py +1 -0
  36. mlquantify/utils/__init__.py +47 -2
  37. mlquantify/utils/_artificial.py +27 -0
  38. mlquantify/utils/_constraints.py +219 -0
  39. mlquantify/utils/_context.py +21 -0
  40. mlquantify/utils/_decorators.py +36 -0
  41. mlquantify/utils/_exceptions.py +12 -0
  42. mlquantify/utils/_get_scores.py +159 -0
  43. mlquantify/utils/_load.py +18 -0
  44. mlquantify/utils/_parallel.py +6 -0
  45. mlquantify/utils/_random.py +36 -0
  46. mlquantify/utils/_sampling.py +273 -0
  47. mlquantify/utils/_tags.py +44 -0
  48. mlquantify/utils/_validation.py +447 -0
  49. mlquantify/utils/prevalence.py +61 -0
  50. {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/METADATA +2 -1
  51. mlquantify-0.1.9.dist-info/RECORD +53 -0
  52. mlquantify/classification/__init__.py +0 -1
  53. mlquantify/evaluation/__init__.py +0 -14
  54. mlquantify/evaluation/protocol.py +0 -291
  55. mlquantify/methods/__init__.py +0 -37
  56. mlquantify/methods/aggregative.py +0 -1159
  57. mlquantify/methods/meta.py +0 -472
  58. mlquantify/methods/mixture_models.py +0 -1003
  59. mlquantify/methods/non_aggregative.py +0 -136
  60. mlquantify/methods/threshold_optimization.py +0 -869
  61. mlquantify/model_selection.py +0 -377
  62. mlquantify/plots.py +0 -367
  63. mlquantify/utils/general.py +0 -371
  64. mlquantify/utils/method.py +0 -449
  65. mlquantify-0.1.7.dist-info/RECORD +0 -22
  66. {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/WHEEL +0 -0
  67. {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/top_level.txt +0 -0
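The 0.1.9 wheel reorganizes the package: the monolithic mlquantify/methods and mlquantify/evaluation modules are removed, and their contents are split across new subpackages (adjust_counting, likelihood, meta, metrics, mixture, model_selection, neighbors). As a rough guide to what this means for imports — the 0.1.9 paths below are inferred from the file moves above and are assumptions, not verified against the released public API:

    # 0.1.7 (modules deleted in this diff)
    from mlquantify.methods import Ensemble
    from mlquantify.evaluation import measures

    # 0.1.9 (inferred from the new file layout; which names each
    # __init__.py actually re-exports is an assumption)
    from mlquantify import meta      # ensemble code now lives in mlquantify/meta/_classes.py
    from mlquantify import metrics   # absorbs evaluation/measures.py as metrics/_slq.py

The removed mlquantify/methods/meta.py (the Ensemble quantifier, -472 lines) is shown in full below.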
@@ -1,472 +0,0 @@
- import numpy as np
- import pandas as pd
- from copy import deepcopy
- from tqdm import tqdm
- from sklearn.linear_model import LogisticRegression
- from sklearn.model_selection import GridSearchCV, cross_val_predict
- from ..evaluation import measures
- from ..base import Quantifier
- from ..utils.method import getHist, hellinger
- from ..utils.general import make_prevs, normalize_prevalence, parallel, get_indexes_with_prevalence
-
- class Ensemble(Quantifier):
-     """Ensemble of Quantification Models.
-
-     This class implements an ensemble of quantification methods,
-     allowing parallel processing for evaluation. The ensemble
-     method is based on the articles by Pérez-Gállego et al. (2017, 2019).
-
-     The approach draws multiple training samples, varying the class
-     proportions of each, and at prediction time keeps the k models
-     that score best on the selection metric.
-
-     Attributes
-     ----------
-     base_quantifier : Quantifier
-         The base quantifier model to be used in the ensemble.
-     size : int
-         The number of samples to be generated for the ensemble.
-     min_prop : float
-         The minimum proportion of each class in the generated samples.
-     selection_metric : str
-         The metric used for selecting the best models in the ensemble.
-         Valid options are 'all', 'ptr', and 'ds'.
-         - all -> returns all the predictions
-         - ptr -> ranks models by the error between each model's training
-           prevalence and the ensemble's initial estimate of the test prevalence
-         - ds -> ranks models by the Hellinger distance between the training
-           and test posterior distributions
-     p_metric : float
-         The proportion of models to be selected based on the selection metric.
-     return_type : str
-         The type of aggregation to be used for the final prediction.
-         Valid options are 'mean' and 'median'.
-     max_sample_size : int or None
-         The maximum size of the samples to be generated. If None, the entire dataset is used.
-     max_trials : int
-         The maximum number of trials to generate valid samples.
-     n_jobs : int
-         The number of parallel jobs to run.
-     verbose : bool
-         If True, prints progress messages during fitting and prediction.
-
-     See Also
-     --------
-     joblib.Parallel : Parallel processing utility for Python.
-
-     Parameters
-     ----------
-     quantifier : Quantifier
-         The base quantifier model to be used in the ensemble.
-     size : int, optional (default=50)
-         The number of samples to be generated for the ensemble.
-     min_prop : float, optional (default=0.1)
-         The minimum proportion of each class in the generated samples.
-     selection_metric : str, optional (default='all')
-         The metric used for selecting the best models in the ensemble.
-         Valid options are 'all', 'ptr', and 'ds'.
-     p_metric : float, optional (default=0.25)
-         The proportion of models to be selected based on the selection metric.
-     return_type : str, optional (default='mean')
-         The type of aggregation to be used for the final prediction.
-         Valid options are 'mean' and 'median'.
-     max_sample_size : int or None, optional (default=None)
-         The maximum size of the samples to be generated. If None, the entire dataset is used.
-     max_trials : int, optional (default=100)
-         The maximum number of trials to generate valid samples.
-     n_jobs : int, optional (default=1)
-         The number of parallel jobs to run.
-     verbose : bool, optional (default=False)
-         If True, prints progress messages during fitting and prediction.
-
-     References
-     ----------
-     .. [1] PÉREZ-GÁLLEGO, Pablo; QUEVEDO, José Ramón; DEL COZ, Juan José. Using ensembles for problems with characterizable changes in data distribution: A case study on quantification. Information Fusion, v. 34, p. 87-100, 2017. Available at https://www.sciencedirect.com/science/article/abs/pii/S1566253516300628?casa_token=XblH-3kwhf4AAAAA:oxNRiCdHZQQa1C8BCJM5PBnFrd26p8-9SSBdm8Luf1Dm35w88w0NdpvoCf1RxBBqtshjyAhNpsDd
-     .. [2] PÉREZ-GÁLLEGO, Pablo et al. Dynamic ensemble selection for quantification tasks. Information Fusion, v. 45, p. 1-15, 2019. Available at https://www.sciencedirect.com/science/article/abs/pii/S1566253517303652?casa_token=jWmc592j5uMAAAAA:2YNeZGAGD0NJEMkcO-YBr7Ak-Ik7njLEcG8SKdowLdpbJ0mwPjYKKiqvQ-C3qICG8yU0m4xUZ3Yv
-
-     Examples
-     --------
-     >>> from mlquantify.methods import FM, Ensemble
-     >>> from mlquantify.utils.general import get_real_prev
-     >>> from sklearn.ensemble import RandomForestClassifier
-     >>> from sklearn.datasets import load_breast_cancer
-     >>> from sklearn.model_selection import train_test_split
-     >>>
-     >>> features, target = load_breast_cancer(return_X_y=True)
-     >>>
-     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
-     >>>
-     >>> model = FM(RandomForestClassifier())
-     >>> ensemble = Ensemble(quantifier=model,
-     ...                     size=50,
-     ...                     selection_metric='ptr',
-     ...                     return_type='median',
-     ...                     n_jobs=-1,
-     ...                     verbose=False)
-     >>>
-     >>> ensemble.fit(X_train, y_train)
-     >>>
-     >>> predictions = ensemble.predict(X_test)
-     >>> predictions
-     {0: 0.4589857954621449, 1: 0.5410142045378551}
-     >>> get_real_prev(y_test)
-     {0: 0.45614035087719296, 1: 0.543859649122807}
-     """
-
-     SELECTION_METRICS = {'all', 'ptr', 'ds'}
-
-     def __init__(self,
-                  quantifier: Quantifier,
-                  size: int = 50,
-                  min_prop: float = 0.1,
-                  selection_metric: str = 'all',
-                  p_metric: float = 0.25,
-                  return_type: str = "mean",
-                  max_sample_size: int = None,
-                  max_trials: int = 100,
-                  n_jobs: int = 1,
-                  verbose: bool = False):
-
-         assert selection_metric in Ensemble.SELECTION_METRICS, \
-             f'unknown selection_metric={selection_metric}; valid are {Ensemble.SELECTION_METRICS}'
-         assert max_sample_size is None or max_sample_size > 0, \
-             'wrong value for max_sample_size; set it to a positive number or None'
-
-         self.base_quantifier = quantifier
-         self.size = size
-         self.min_prop = min_prop
-         self.p_metric = p_metric
-         self.selection_metric = selection_metric
-         self.return_type = return_type
-         self.n_jobs = n_jobs
-         self.proba_generator = None
-         self.verbose = verbose
-         self.max_sample_size = max_sample_size
-         self.max_trials = max_trials
-
-     def sout(self, msg):
-         """Prints a message if verbose is True."""
-         if self.verbose:
-             print('[Ensemble]' + msg)
-
-     def fit(self, X, y):
-         """Fits the ensemble model to the given training data.
-
-         Parameters
-         ----------
-         X : array-like of shape (n_samples, n_features)
-             The input data.
-         y : array-like of shape (n_samples,)
-             The target values.
-
-         Returns
-         -------
-         self : Ensemble
-             The fitted ensemble model.
-         """
-         self.sout('Fit')
-
-         self.classes = np.unique(y)
-
-         if self.selection_metric == 'ds' and not self.binary_data:
-             raise ValueError('ds selection_metric is only defined for binary quantification, but this dataset is not binary')
-
-         # Randomly choose the prevalences for each member of the ensemble
-         # (preventing any class proportion from falling below min_prop).
-         sample_size = len(y) if self.max_sample_size is None else min(self.max_sample_size, len(y))
-         prevs = [_draw_simplex(ndim=self.n_class, min_val=self.min_prop, max_trials=self.max_trials) for _ in range(self.size)]
-
-         posteriors = None
-         if self.selection_metric == 'ds':
-             # precompute the training posterior probabilities
-             posteriors, self.proba_generator = self.ds_get_posteriors(X, y)
-
-         args = (
-             (X, y, self.base_quantifier, prev, posteriors, self.verbose, sample_size)
-             for prev in prevs
-         )
-
-         self.ensemble = parallel(
-             _delayed_new_sample,
-             tqdm(args, desc='fitting ensemble', total=self.size) if self.verbose else args,
-             n_jobs=self.n_jobs)
-
-         self.sout('Fit [Done]')
-         return self
-
-     def predict(self, X):
-         """Predicts the class prevalences for the given test data.
-
-         Parameters
-         ----------
-         X : array-like of shape (n_samples, n_features)
-             The input data.
-
-         Returns
-         -------
-         prevalences : dict
-             A dictionary mapping each class to its predicted prevalence,
-             normalized so that the values sum to one.
-         """
-         self.sout('Predict')
-
-         args = ((Qi, X) for Qi in self.ensemble)
-
-         prevalences = np.asarray(
-             parallel(_delayed_predict,
-                      tqdm(args, desc="Predicting Ensemble", total=len(self.ensemble)) if self.verbose else args,
-                      n_jobs=self.n_jobs)
-         )
-
-         prevalences = pd.DataFrame(prevalences).to_numpy()
-
-         # Number of models to keep; kept in a local variable so that repeated
-         # calls to predict do not mutate the p_metric hyperparameter.
-         k = int(len(prevalences) * self.p_metric)
-
-         if self.selection_metric == 'ptr':
-             prevalences = self.ptr_selection_metric(prevalences, k)
-         elif self.selection_metric == 'ds':
-             prevalences = self.ds_selection_metric(prevalences, X, k)
-
-         if self.return_type == "median":
-             prevalences = np.median(prevalences, axis=0)
-         else:
-             prevalences = np.mean(prevalences, axis=0)
-
-         self.sout('Predict [Done]')
-         return normalize_prevalence(prevalences, self.classes)
-
-
-     def ptr_selection_metric(self, prevalences, k):
-         """
-         Selects the prevalence estimates from the k models trained on samples whose prevalence is most similar
-         to an initial approximation of the test prevalence as estimated by all models in the ensemble.
-
-         Parameters
-         ----------
-         prevalences : numpy.ndarray
-             An array of prevalence estimates provided by each model in the ensemble.
-         k : int
-             The number of models to keep.
-
-         Returns
-         -------
-         numpy.ndarray
-             The selected prevalence estimates after applying the PTR selection metric.
-         """
-         test_prev_estim = prevalences.mean(axis=0)
-         tr_prevs = [m[1] for m in self.ensemble]
-         ptr_differences = [measures.mean_squared_error(test_prev_estim, ptr_i) for ptr_i in tr_prevs]
-         order = np.argsort(ptr_differences)
-         return _select_k(prevalences, order, k=k)
-
-     def ds_get_posteriors(self, X, y):
-         """
-         Generate posterior probabilities using cross-validated logistic regression.
-
-         This method computes posterior probabilities for the training data via cross-validation,
-         using a logistic regression classifier with hyperparameters optimized through grid search.
-         It also returns a function to generate posterior probabilities for new data.
-
-         Parameters
-         ----------
-         X : array-like of shape (n_samples, n_features)
-             The feature matrix representing the training data.
-         y : array-like of shape (n_samples,)
-             The target vector representing class labels for the training data.
-
-         Returns
-         -------
-         posteriors : ndarray of shape (n_samples, n_classes)
-             Posterior probabilities for the training data obtained through cross-validation.
-         posteriors_generator : callable
-             A function that computes posterior probabilities for new input data.
-
-         Notes
-         -----
-         - In scenarios where the quantifier is not based on a probabilistic classifier, it's necessary
-           to train a separate probabilistic model to obtain posterior probabilities.
-         - Using cross-validation ensures that the posterior probabilities for the training data are unbiased,
-           as each data point is evaluated by a model not trained on that point.
-         - Hyperparameters for the logistic regression classifier are optimized using a grid search with
-           cross-validation to improve the model's performance.
-         """
-         lr_base = LogisticRegression(class_weight='balanced', max_iter=1000)
-
-         optim = GridSearchCV(
-             lr_base, param_grid={'C': np.logspace(-4, 4, 9)}, cv=5, n_jobs=self.n_jobs, refit=True
-         ).fit(X, y)
-
-         posteriors = cross_val_predict(
-             optim.best_estimator_, X, y, cv=5, n_jobs=self.n_jobs, method='predict_proba'
-         )
-         posteriors_generator = optim.best_estimator_.predict_proba
-
-         return posteriors, posteriors_generator
-
-
-     def ds_selection_metric(self, prevalences, test, k):
-         """
-         Selects the prevalence estimates from the k models trained on samples whose distribution of posterior
-         probabilities is most similar to the distribution of posterior probabilities for the test data.
-
-         Parameters
-         ----------
-         prevalences : numpy.ndarray
-             An array of prevalence estimates provided by each model in the ensemble.
-         test : array-like of shape (n_samples, n_features)
-             The feature matrix representing the test data.
-         k : int
-             The number of models to keep.
-
-         Returns
-         -------
-         numpy.ndarray
-             The selected prevalence estimates after applying the DS selection metric.
-         """
-         test_posteriors = self.proba_generator(test)
-         test_distribution = getHist(test_posteriors, 8)
-         tr_distributions = [m[2] for m in self.ensemble]
-         dist = [hellinger(tr_dist_i, test_distribution) for tr_dist_i in tr_distributions]
-         order = np.argsort(dist)
-         return _select_k(prevalences, order, k=k)
-
- def _select_k(elements, order, k):
-     """
-     Selects the first k elements from the list, following the given order.
-     If the resulting selection is empty, the original list is returned.
-
-     Parameters
-     ----------
-     elements : array-like
-         The array of elements to be selected from.
-     order : array-like
-         The indices defining the order of the elements.
-     k : int
-         The number of elements to be selected.
-
-     Returns
-     -------
-     array-like
-         The selected elements.
-     """
-     elements_k = [elements[idx] for idx in order[:k]]
-     if elements_k:
-         return elements_k
-     print(f"Unable to take {k} for elements with size {len(elements)}")
-     return elements
-
-
-
- def _delayed_new_sample(args):
-     """
-     Fits one ensemble member on a newly drawn artificial sample.
-     This function exists for parallelization purposes, generating a new artificial sample for each quantifier.
-
-     Parameters
-     ----------
-     args : tuple
-         A tuple containing the following elements:
-
-         X : array-like of shape (n_samples, n_features)
-             The feature matrix representing the training data.
-         y : array-like of shape (n_samples,)
-             The target vector representing class labels for the training data.
-         base_quantifier : Quantifier
-             The base quantifier model to be used in the ensemble.
-         prev : array-like of shape (n_classes,)
-             The class prevalences for the new sample.
-         posteriors : array-like of shape (n_samples, n_classes)
-             The posterior probabilities for the training data obtained through cross-validation.
-         verbose : bool
-             If True, prints progress messages during fitting and prediction.
-         sample_size : int
-             The size of the sample to be generated.
-
-     Returns
-     -------
-     tuple
-         A tuple containing the following elements:
-
-         model : Quantifier
-             The fitted quantifier model.
-         tr_prevalence : array-like of shape (n_classes,)
-             The class prevalences for the new sample.
-         tr_distribution : array-like
-             The histogram of posterior probabilities for the new sample
-             (None when posteriors is None).
-         X : array-like of shape (n_samples, n_features)
-             The feature matrix representing the training data.
-         y : array-like of shape (n_samples,)
-             The target vector representing class labels for the training data.
-     """
-     X, y, base_quantifier, prev, posteriors, verbose, sample_size = args
-     if verbose:
-         print(f'\tfit-start for prev {str(np.round(prev, 3))}, sample_size={sample_size}')
-     model = deepcopy(base_quantifier)
-
-     # Draw a sample whose class proportions match the requested prevalence.
-     sample_index = get_indexes_with_prevalence(y, prev, sample_size)
-     X_sample = np.take(X, sample_index, axis=0)
-     y_sample = np.take(y, sample_index, axis=0)
-
-     model.fit(X_sample, y_sample)
-
-     tr_prevalence = prev
-     tr_distribution = getHist(posteriors[sample_index], 8) if (posteriors is not None) else None
-     if verbose:
-         print(f'\t \\--fit-ended for prev {str(np.round(prev, 3))}')
-     return (model, tr_prevalence, tr_distribution, X, y)
-
-
- def _delayed_predict(args):
-     """
-     Predicts the class prevalences for the given test data.
-
-     Parameters
-     ----------
-     args : tuple
-         A tuple containing the following elements:
-
-         quantifier : tuple
-             An ensemble member as returned by _delayed_new_sample; its first
-             element is the fitted quantifier model.
-         X : array-like of shape (n_samples, n_features)
-             The input data.
-
-     Returns
-     -------
-     list of length n_classes
-         The predicted prevalence for each class.
-     """
-     quantifier, X = args
-     return list(quantifier[0].predict(X).values())
-
-
- def _draw_simplex(ndim, min_val, max_trials=100):
-     """
-     Return a uniform sample from the ndim-dimensional simplex, ensuring all dimensions are >= min_val.
-
-     Note:
-         For min_val > 0, the sampling is not truly uniform because the simplex is restricted.
-
-     Parameters:
-         ndim (int): Number of dimensions of the simplex.
-         min_val (float): Minimum allowed value for each dimension. Must be less than 1 / ndim.
-         max_trials (int, optional): Maximum number of attempts to find a valid sample (default is 100).
-
-     Returns:
-         numpy.ndarray: A sample from the ndim-dimensional simplex where all dimensions are >= min_val.
-
-     Raises:
-         ValueError: If min_val >= 1 / ndim, or if a valid sample cannot be found within max_trials trials.
-     """
-     if min_val >= 1 / ndim:
-         raise ValueError(f'no sample can be drawn from the {ndim}-dimensional simplex so that '
-                          f'all its values are >= {min_val} (try a smaller value for min_val)')
-     trials = 0
-     while True:
-         u = make_prevs(ndim)
-         if all(u >= min_val):
-             return u
-         trials += 1
-         if trials >= max_trials:
-             raise ValueError(f'it looks like finding a random simplex with all its dimensions being '
-                              f'>= {min_val} is unlikely (it failed after {max_trials} trials)')
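
For reference, the prevalence-drawing step in the removed module above (_draw_simplex together with make_prevs) is plain rejection sampling on the probability simplex. Below is a self-contained numpy sketch of the same idea; sample_simplex stands in for the removed make_prevs helper and is an illustration, not mlquantify API:

    import numpy as np

    def sample_simplex(ndim, rng):
        # Uniform draw from the simplex via sorted uniform spacings:
        # ndim - 1 cuts in [0, 1] partition the interval into ndim parts.
        cuts = np.sort(rng.uniform(size=ndim - 1))
        return np.diff(np.concatenate(([0.0], cuts, [1.0])))

    def draw_simplex(ndim, min_val, max_trials=100, seed=None):
        # Rejection sampling: redraw until every coordinate is >= min_val.
        if min_val >= 1 / ndim:
            raise ValueError('min_val must be smaller than 1/ndim')
        rng = np.random.default_rng(seed)
        for _ in range(max_trials):
            u = sample_simplex(ndim, rng)
            if np.all(u >= min_val):
                return u
        raise ValueError(f'no valid sample found in {max_trials} trials')

    print(draw_simplex(ndim=3, min_val=0.1))  # prints something like [0.43 0.12 0.45]

The acceptance rate of the rejection loop shrinks as min_val approaches 1/ndim, which is why _draw_simplex bounds the number of attempts with max_trials.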