mlquantify 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
+ exclude mlquantify/**/*.py
+ include mlquantify/**/__init__.py
+ include mlquantify/base.py
+ include mlquantify/model_selection.py
@@ -0,0 +1,22 @@
+ Metadata-Version: 2.1
+ Name: mlquantify
+ Version: 0.0.1
+ Summary: Quantification Library
+ Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
+ Maintainer: Luiz Fernando Luth Junior
+ Keywords: python,machine learning,quantification,quantify
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Operating System :: Unix
+ Classifier: Operating System :: MacOS :: MacOS X
+ Classifier: Operating System :: Microsoft :: Windows
+ Description-Content-Type: text/markdown
+ Requires-Dist: scikit-learn
+ Requires-Dist: numpy
+ Requires-Dist: scipy
+ Requires-Dist: joblib
+ Requires-Dist: tqdm
+ Requires-Dist: pandas
+ Requires-Dist: xlrd
+ Requires-Dist: matplotlib
@@ -0,0 +1,2 @@
+ # LibQuantifiers
+ Quantification package
@@ -0,0 +1,256 @@
+ from abc import abstractmethod, ABC
+ from sklearn.base import BaseEstimator
+ from copy import deepcopy
+ import numpy as np
+ import joblib
+
+
+ from .utils import parallel, normalize_prevalence
+
+ class Quantifier(ABC, BaseEstimator):
+     """Abstract class for quantifiers."""
+
+     @abstractmethod
+     def fit(self, X, y) -> object: ...
+
+     @abstractmethod
+     def predict(self, X) -> dict: ...
+
+     @property
+     def classes(self) -> list:
+         return self._classes
+
+     @classes.setter
+     def classes(self, classes):
+         self._classes = sorted(list(classes))
+
+     @property
+     def n_class(self) -> int:
+         return len(self._classes)
+
+     @property
+     def multiclass_method(self) -> bool:
+         return True
+
+     @property
+     def binary_data(self) -> bool:
+         return len(self._classes) == 2
+
+
+     def save_quantifier(self, path: str = None) -> None:
+         if not path:
+             path = f"{self.__class__.__name__}.joblib"
+         joblib.dump(self, path)
+
+
+
+ class AggregativeQuantifier(Quantifier, ABC):
+     """Abstract class for all aggregative quantifiers, i.e. methods that use a
+     learner (usually a classifier) to generate the predictions they aggregate.
+     This class also detects whether the problem is binary or multiclass and
+     applies a One-vs-All strategy when the dataset is multiclass but the
+     quantification method itself is not.
+     """
+
+
+     def __init__(self):
+         # Dictionary to hold binary quantifiers for each class.
+         self.binary_quantifiers = {}
+         self.learner_fitted = False
+         self.cv_folds = 10
+
+     def fit(self, X, y, learner_fitted=False, cv_folds: int = 10, n_jobs: int = 1):
+         """Fit the quantifier model.
+
+         Args:
+             X (array-like): Training features.
+             y (array-like): Training labels.
+             learner_fitted (bool, optional): Whether the learner is already fitted. Defaults to False.
+             cv_folds (int, optional): Number of cross-validation folds. Defaults to 10.
+             n_jobs (int, optional): Number of parallel jobs. Defaults to 1.
+
+         Returns:
+             self: Fitted quantifier.
+         """
+         self.n_jobs = n_jobs
+         self.learner_fitted = learner_fitted
+         self.cv_folds = cv_folds
+
+         self.classes = np.unique(y)
+         if self.binary_data or self.multiclass_method:
+             return self._fit_method(X, y)
+
+         # Making one vs all
+         self.binary_quantifiers = {class_: deepcopy(self) for class_ in self.classes}
+         parallel(self.delayed_fit, self.classes, self.n_jobs, X, y)
+
+         return self
+
+     def predict(self, X) -> dict:
+         """Predict class prevalences for the given data.
+
+         Args:
+             X (array-like): Test features.
+
+         Returns:
+             dict: Dictionary with class prevalences.
+         """
+         if self.binary_data or self.multiclass_method:
+             prevalences = self._predict_method(X)
+             return normalize_prevalence(prevalences, self.classes)
+
+         # Making one vs all
+         prevalences = np.asarray(parallel(self.delayed_predict, self.classes, self.n_jobs, X))
+         return normalize_prevalence(prevalences, self.classes)
+
+     @abstractmethod
+     def _fit_method(self, X, y):
+         """Abstract fit method that each quantification method must implement.
+
+         Args:
+             X (array-like): Training features.
+             y (array-like): Training labels.
+         """
+         ...
+
+     @abstractmethod
+     def _predict_method(self, X) -> dict:
+         """Abstract predict method that each quantification method must implement.
+
+         Args:
+             X (array-like): Test data to generate class prevalences.
+
+         Returns:
+             dict: Dictionary with class:prevalence for each class.
+         """
+         ...
+
+     @property
+     def learner(self):
+         return self.learner_
+
+     @learner.setter
+     def learner(self, value):
+         self.learner_ = value
+
+
+     def get_params(self, deep=True):
+         return self.learner.get_params()
+
+     def set_params(self, **params):
+         # Model Params
+         for key, value in params.items():
+             if hasattr(self, key):
+                 setattr(self, key, value)
+
+         # Learner Params
+         if self.learner:
+             learner_params = {k.replace('learner__', ''): v for k, v in params.items() if 'learner__' in k}
+             if learner_params:
+                 self.learner.set_params(**learner_params)
+
+         return self
+
+
+     # MULTICLASS METHODS
+
+     def delayed_fit(self, class_, X, y):
+         """Delayed fit method for the one-vs-all strategy, run in parallel.
+
+         Args:
+             class_ (Any): The class for which the model is being fitted.
+             X (array-like): Training features.
+             y (array-like): Training labels.
+
+         Returns:
+             self: Fitted binary quantifier for the given class.
+         """
+         y_class = (y == class_).astype(int)
+         return self.binary_quantifiers[class_].fit(X, y_class)
+
+     def delayed_predict(self, class_, X):
+         """Delayed predict method for the one-vs-all strategy, run in parallel.
+
+         Args:
+             class_ (Any): The class for which the model is making predictions.
+             X (array-like): Test features.
+
+         Returns:
+             float: Predicted prevalence for the given class.
+         """
+         return self.binary_quantifiers[class_].predict(X)[1]
+
+
+ class NonAggregativeQuantifier(Quantifier):
+     """Abstract class for non-aggregative quantifiers, i.e. methods that do not
+     rely on a classifier or any other learner to produce their predictions.
+     """
+
+
+     def fit(self, X, y, n_jobs: int = 1):
+         """Fit the quantifier model.
+
+         Args:
+             X (array-like): Training features.
+             y (array-like): Training labels.
+             n_jobs (int, optional): Number of parallel jobs. Defaults to 1.
+
+         Returns:
+             self: Fitted quantifier.
+         """
+         self.n_jobs = n_jobs
+         self.classes = np.unique(y)
+         if self.binary_data or self.multiclass_method:
+             return self._fit_method(X, y)
+
+         # Making one vs all
+         self.binary_quantifiers = {class_: deepcopy(self) for class_ in self.classes}
+         parallel(self.delayed_fit, self.classes, self.n_jobs, X, y)
+
+         return self
+
+     def predict(self, X) -> dict:
+         """Predict class prevalences for the given data.
+
+         Args:
+             X (array-like): Test features.
+
+         Returns:
+             dict: Dictionary with class prevalences.
+         """
+         if self.binary_data or self.multiclass_method:
+             prevalences = self._predict_method(X)
+             return normalize_prevalence(prevalences, self.classes)
+
+         # Making one vs all
+         prevalences = np.asarray(parallel(self.delayed_predict, self.classes, self.n_jobs, X))
+         return normalize_prevalence(prevalences, self.classes)
+
+
+     @abstractmethod
+     def _fit_method(self, X, y):
+         """Abstract fit method that each quantification method must implement.
+
+         Args:
+             X (array-like): Training features.
+             y (array-like): Training labels.
+         """
+         ...
+
+     @abstractmethod
+     def _predict_method(self, X) -> dict:
+         """Abstract predict method that each quantification method must implement.
+
+         Args:
+             X (array-like): Test data to generate class prevalences.
+
+         Returns:
+             dict: Dictionary with class:prevalence for each class.
+         """
+         ...
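
base.py above defines the package's quantifier contract: subclasses implement `_fit_method` and `_predict_method`, while the base class handles class bookkeeping, model persistence, and the one-vs-all fallback for multiclass data. A minimal sketch of how a subclass could plug into that contract follows; the `NaiveCC` name, the `LogisticRegression` default, and the dict returned by `_predict_method` are illustrative assumptions, not part of the package (which ships its own classify-and-count method under `CC`):

```python
# Hypothetical sketch, not part of the package: a naive classify-and-count
# quantifier built on the AggregativeQuantifier contract from base.py.
import numpy as np
from sklearn.linear_model import LogisticRegression
from mlquantify.base import AggregativeQuantifier


class NaiveCC(AggregativeQuantifier):
    def __init__(self, learner=None):
        super().__init__()
        self.learner = learner or LogisticRegression()  # stored via the learner setter

    def _fit_method(self, X, y):
        if not self.learner_fitted:   # flag set by AggregativeQuantifier.fit
            self.learner.fit(X, y)
        return self

    def _predict_method(self, X) -> dict:
        preds = self.learner.predict(X)
        # Prevalence of each class = fraction of test items classified as it.
        return {c: float(np.mean(preds == c)) for c in self.classes}
```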
@@ -0,0 +1 @@
+ from .pwkclf import PWKCLF
@@ -0,0 +1,2 @@
+ from .measures import *
+ from .protocol import *
@@ -0,0 +1,40 @@
+ from .aggregative import *
+ from .non_aggregative import *
+ from .meta import *
+
+
+ AGGREGATIVE = {
+     "CC": CC,
+     "PCC": PCC,
+     "EMQ": EMQ,
+     "FM": FM,
+     "GAC": GAC,
+     "GPAC": GPAC,
+     "PWK": PWK,
+     "ACC": ACC,
+     "MAX": MAX,
+     "MS": MS,
+     "MS2": MS2,
+     "PACC": PACC,
+     "T50": T50,
+     "X": X_method,
+     "DyS": DyS,
+     "DySsyn": DySsyn,
+     "HDy": HDy,
+     "SMM": SMM,
+     "SORD": SORD,
+ }
+
+ NON_AGGREGATIVE = {
+     "HDx": HDx,
+ }
+
+ META = {
+     "ENSEMBLE": Ensemble
+ }
+
+
+ METHODS = AGGREGATIVE | NON_AGGREGATIVE
+
+ def get_method(method:str):
+     return METHODS.get(method)
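
The methods `__init__.py` above collects every quantifier into the `AGGREGATIVE`, `NON_AGGREGATIVE`, and `META` registries and exposes `get_method` for lookup by name. Note that `METHODS` merges only the first two dictionaries, so `"ENSEMBLE"` is not resolvable through `get_method`. A small usage sketch based solely on the code shown:

```python
from mlquantify.methods import METHODS, get_method

print(sorted(METHODS))        # registry keys: 'ACC', 'CC', 'DyS', 'HDx', ...

cc_cls = get_method("CC")     # returns the CC class from the registry
if get_method("ENSEMBLE") is None:
    # 'ENSEMBLE' lives in META, which is not merged into METHODS,
    # so get_method returns None for it.
    print("meta-methods are not resolvable via get_method")
```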
@@ -0,0 +1,232 @@
+ from .base import Quantifier
+ from typing import Union, List
+ import itertools
+ from tqdm import tqdm
+ import signal
+ from copy import deepcopy
+ import numpy as np
+ from sklearn.model_selection import train_test_split
+ from .utils import parallel
+ from .evaluation import get_measure, APP, NPP
+
+ class GridSearchQ(Quantifier):
+     """
+     Hyperparameter optimization for quantification models using grid search.
+
+     Args:
+         model (Quantifier): The base quantification model.
+         param_grid (dict): Hyperparameters to search over.
+         protocol (str, optional): Quantification protocol ('app' or 'npp'). Defaults to 'app'.
+         n_prevs (int, optional): Number of prevalence points for APP. Defaults to None.
+         n_repetitions (int, optional): Number of repetitions for NPP. Defaults to 1.
+         scoring (Union[List[str], str], optional): Metric(s) for evaluation. Defaults to "ae".
+         refit (bool, optional): Refit model on best parameters. Defaults to True.
+         val_split (float, optional): Proportion of data for validation. Defaults to 0.4.
+         n_jobs (int, optional): Number of parallel jobs. Defaults to 1.
+         random_seed (int, optional): Seed for reproducibility. Defaults to 42.
+         timeout (int, optional): Max time per parameter combination (seconds). Defaults to -1.
+         verbose (bool, optional): Verbosity of output. Defaults to False.
+     """
+
+     def __init__(self,
+                  model: Quantifier,
+                  param_grid: dict,
+                  protocol: str = 'app',
+                  n_prevs: int = None,
+                  n_repetitions: int = 1,
+                  scoring: Union[List[str], str] = "ae",
+                  refit: bool = True,
+                  val_split: float = 0.4,
+                  n_jobs: int = 1,
+                  random_seed: int = 42,
+                  timeout: int = -1,
+                  verbose: bool = False):
+
+         self.model = model
+         self.param_grid = param_grid
+         self.protocol = protocol.lower()
+         self.n_prevs = n_prevs
+         self.n_repetitions = n_repetitions
+         self.refit = refit
+         self.val_split = val_split
+         self.n_jobs = n_jobs
+         self.random_seed = random_seed
+         self.timeout = timeout
+         self.verbose = verbose
+         self.scoring = [get_measure(measure) for measure in (scoring if isinstance(scoring, list) else [scoring])]
+
+         assert self.protocol in {'app', 'npp'}, 'Unknown protocol; valid ones are "app" or "npp".'
+
+         if self.protocol == 'npp' and self.n_repetitions <= 1:
+             raise ValueError('For "npp" protocol, n_repetitions must be greater than 1.')
+
+     def sout(self, msg):
+         """Prints messages if verbose is True."""
+         if self.verbose:
+             print(f'[{self.__class__.__name__}]: {msg}')
+
+     def __get_protocol(self, model, sample_size):
+         """Get the appropriate protocol instance.
+
+         Args:
+             model (Quantifier): The quantification model.
+             sample_size (int): The sample size for batch processing.
+
+         Returns:
+             object: Instance of APP or NPP protocol.
+         """
+         protocol_params = {
+             'models': model,
+             'batch_size': sample_size,
+             'n_iterations': self.n_repetitions,
+             'n_jobs': self.n_jobs,
+             'verbose': False,
+             'random_state': 35,
+             'return_type': "predictions"
+         }
+         return APP(n_prevs=self.n_prevs, **protocol_params) if self.protocol == 'app' else NPP(**protocol_params)
+
+     def fit(self, X, y):
+         """Fit the quantifier model and perform grid search.
+
+         Args:
+             X (array-like): Training features.
+             y (array-like): Training labels.
+
+         Returns:
+             self: Fitted GridSearchQ instance.
+         """
+         X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=self.val_split, random_state=self.random_seed)
+         param_combinations = list(itertools.product(*self.param_grid.values()))
+         best_score, best_params = None, None
+
+         if self.timeout > 0:
+             signal.signal(signal.SIGALRM, self._timeout_handler)
+
+         def evaluate_combination(params):
+             """Evaluate a single combination of hyperparameters.
+
+             Args:
+                 params (tuple): A tuple of hyperparameter values.
+
+             Returns:
+                 float or None: The evaluation score, or None if timeout occurred.
+             """
+
+             if self.verbose:
+                 print(f"\tEvaluate Combination for {str(params)}")
+
+
+             model = deepcopy(self.model)
+             model.set_params(**dict(zip(self.param_grid.keys(), params)))
+             protocol_instance = self.__get_protocol(model, len(y_train))
+
+             try:
+                 if self.timeout > 0:
+                     signal.alarm(self.timeout)
+
+                 protocol_instance.fit(X_train, y_train)
+                 _, real_prevs, pred_prevs = protocol_instance.predict(X_val, y_val)
+                 scores = [np.mean([measure(rp, pp) for rp, pp in zip(real_prevs, pred_prevs)]) for measure in self.scoring]
+
+                 if self.timeout > 0:
+                     signal.alarm(0)
+
+
+
+                 if self.verbose:
+                     print(f"\t\\--ended evaluation of {str(params)}")
+
+                 return np.mean(scores) if scores else None
+             except TimeoutError:
+                 self.sout(f'Timeout reached for combination {params}.')
+                 return None
+
+         results = parallel(
+             evaluate_combination,
+             tqdm(param_combinations, desc="Evaluating combination", total=len(param_combinations)) if self.verbose else param_combinations,
+             n_jobs=self.n_jobs
+         )
+
+         for score, params in zip(results, param_combinations):
+             if score is not None and (best_score is None or score < best_score):
+                 best_score, best_params = score, params
+
+         self.best_score_ = best_score
+         self.best_params_ = dict(zip(self.param_grid.keys(), best_params))
+         self.sout(f'Optimization complete. Best score: {self.best_score_}, with parameters: {self.best_params_}.')
+
+         if self.refit and self.best_params_:
+             self.model.set_params(**self.best_params_)
+             self.model.fit(X, y)
+             self.best_model_ = self.model
+
+         return self
+
+     def predict(self, X):
+         """Make predictions using the best found model.
+
+         Args:
+             X (array-like): Data to predict on.
+
+         Returns:
+             array-like: Predictions.
+         """
+         if not hasattr(self, 'best_model_'):
+             raise RuntimeError("The model has not been fitted yet.")
+         return self.best_model_.predict(X)
+
+     @property
+     def classes_(self):
+         """Get the classes of the best model.
+
+         Returns:
+             array-like: The classes.
+         """
+         return self.best_model_.classes_
+
+     def set_params(self, **parameters):
+         """Set the hyperparameters for grid search.
+
+         Args:
+             parameters (dict): Hyperparameters to set.
+         """
+         self.param_grid = parameters
+
+     def get_params(self, deep=True):
+         """Get the parameters of the best model.
+
+         Args:
+             deep (bool, optional): If True, will return the parameters for this estimator and contained subobjects. Defaults to True.
+
+         Returns:
+             dict: Parameters of the best model.
+         """
+         if hasattr(self, 'best_model_'):
+             return self.best_model_.get_params()
+         raise ValueError('get_params called before fit')
+
+     def best_model(self):
+         """Return the best model after fitting.
+
+         Returns:
+             Quantifier: The best model.
+
+         Raises:
+             ValueError: If called before fitting.
+         """
+         if hasattr(self, 'best_model_'):
+             return self.best_model_
+         raise ValueError('best_model called before fit')
+
+     def _timeout_handler(self, signum, frame):
+         """Handle timeouts during evaluation.
+
+         Args:
+             signum (int): Signal number.
+             frame (object): Current stack frame.
+
+         Raises:
+             TimeoutError: When the timeout is reached.
+         """
+         raise TimeoutError()
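
A hedged sketch of how `GridSearchQ` might be driven, given the API above. The `CC` quantifier, the `learner__C` grid (forwarded to the underlying classifier by `set_params` in base.py), and the data variables `X`, `y`, and `X_test` are assumptions for illustration, not taken from the package:

```python
from sklearn.linear_model import LogisticRegression
from mlquantify.methods import get_method
from mlquantify.model_selection import GridSearchQ

model = get_method("CC")(LogisticRegression())   # constructor signature assumed

search = GridSearchQ(
    model=model,
    param_grid={"learner__C": [0.1, 1.0, 10.0]},  # 'learner__' keys reach the classifier
    protocol="app",      # artificial-prevalence protocol
    n_prevs=11,          # number of prevalence points evaluated by APP
    scoring="ae",        # matches the signature default
    refit=True,
    n_jobs=1,
    verbose=True,
)
search.fit(X, y)                                  # X, y assumed to be the training set
print(search.best_params_, search.best_score_)
prevalences = search.predict(X_test)              # X_test assumed available
```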
@@ -0,0 +1,2 @@
+ from .protocol_plot import protocol_boxplot, protocol_lineplot
+ from .distribution_plot import class_distribution_plot
@@ -0,0 +1,2 @@
+ from .general_purposes import *
+ from .method_purposes import *
@@ -0,0 +1,22 @@
+ Metadata-Version: 2.1
+ Name: mlquantify
+ Version: 0.0.1
+ Summary: Quantification Library
+ Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
+ Maintainer: Luiz Fernando Luth Junior
+ Keywords: python,machine learning,quantification,quantify
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Operating System :: Unix
+ Classifier: Operating System :: MacOS :: MacOS X
+ Classifier: Operating System :: Microsoft :: Windows
+ Description-Content-Type: text/markdown
+ Requires-Dist: scikit-learn
+ Requires-Dist: numpy
+ Requires-Dist: scipy
+ Requires-Dist: joblib
+ Requires-Dist: tqdm
+ Requires-Dist: pandas
+ Requires-Dist: xlrd
+ Requires-Dist: matplotlib
@@ -0,0 +1,15 @@
+ MANIFEST.in
+ README.md
+ setup.py
+ mlquantify/base.py
+ mlquantify/model_selection.py
+ mlquantify.egg-info/PKG-INFO
+ mlquantify.egg-info/SOURCES.txt
+ mlquantify.egg-info/dependency_links.txt
+ mlquantify.egg-info/requires.txt
+ mlquantify.egg-info/top_level.txt
+ mlquantify/classification/__init__.py
+ mlquantify/evaluation/__init__.py
+ mlquantify/methods/__init__.py
+ mlquantify/plots/__init__.py
+ mlquantify/utils/__init__.py
@@ -0,0 +1,8 @@
+ scikit-learn
+ numpy
+ scipy
+ joblib
+ tqdm
+ pandas
+ xlrd
+ matplotlib
@@ -0,0 +1 @@
+ mlquantify
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1,26 @@
+ from setuptools import setup, find_packages
+
+ VERSION = '0.0.1'
+ DESCRIPTION = 'Quantification Library'
+
+ # Setting up
+ setup(
+     name="mlquantify",
+     version=VERSION,
+     url="https://github.com/luizfernandolj/QuantifyML/tree/master",
+     maintainer="Luiz Fernando Luth Junior",
+     description=DESCRIPTION,
+     long_description_content_type="text/markdown",
+     packages=find_packages(),
+     include_package_data=True,
+     install_requires=['scikit-learn', 'numpy', 'scipy', 'joblib', 'tqdm', 'pandas', 'xlrd', 'matplotlib'],
+     keywords=['python', 'machine learning', 'quantification', 'quantify'],
+     classifiers=[
+         "Development Status :: 4 - Beta",
+         "Intended Audience :: Science/Research",
+         "Programming Language :: Python :: 3",
+         "Operating System :: Unix",
+         "Operating System :: MacOS :: MacOS X",
+         "Operating System :: Microsoft :: Windows",
+     ]
+ )