mlquantify 0.0.1__tar.gz → 0.0.11.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify-0.0.11.1/PKG-INFO +114 -0
- mlquantify-0.0.11.1/README.md +99 -0
- mlquantify-0.0.11.1/mlquantify/__init__.py +6 -0
- {mlquantify-0.0.1 → mlquantify-0.0.11.1}/mlquantify/base.py +2 -1
- mlquantify-0.0.11.1/mlquantify/classification/pwkclf.py +73 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/measures/__init__.py +26 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/measures/ae.py +11 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/measures/bias.py +16 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/measures/kld.py +8 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/measures/mse.py +12 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/measures/nae.py +16 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/measures/nkld.py +13 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/measures/nrae.py +16 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/measures/rae.py +12 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/measures/se.py +12 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/protocol/_Protocol.py +202 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/protocol/__init__.py +2 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/protocol/app.py +146 -0
- mlquantify-0.0.11.1/mlquantify/evaluation/protocol/npp.py +34 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/ThreholdOptm/_ThreholdOptimization.py +62 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/ThreholdOptm/__init__.py +7 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/ThreholdOptm/acc.py +27 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/ThreholdOptm/max.py +23 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/ThreholdOptm/ms.py +21 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/ThreholdOptm/ms2.py +25 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/ThreholdOptm/pacc.py +41 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/ThreholdOptm/t50.py +21 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/ThreholdOptm/x.py +23 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/__init__.py +9 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/cc.py +32 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/emq.py +86 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/fm.py +72 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/gac.py +96 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/gpac.py +87 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/mixtureModels/_MixtureModel.py +81 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/mixtureModels/__init__.py +5 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/mixtureModels/dys.py +55 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/mixtureModels/dys_syn.py +89 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/mixtureModels/hdy.py +46 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/mixtureModels/smm.py +27 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/mixtureModels/sord.py +77 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/pcc.py +33 -0
- mlquantify-0.0.11.1/mlquantify/methods/aggregative/pwk.py +38 -0
- mlquantify-0.0.11.1/mlquantify/methods/meta/__init__.py +1 -0
- mlquantify-0.0.11.1/mlquantify/methods/meta/ensemble.py +236 -0
- mlquantify-0.0.11.1/mlquantify/methods/non_aggregative/__init__.py +1 -0
- mlquantify-0.0.11.1/mlquantify/methods/non_aggregative/hdx.py +71 -0
- mlquantify-0.0.11.1/mlquantify/plots/distribution_plot.py +109 -0
- mlquantify-0.0.11.1/mlquantify/plots/protocol_plot.py +157 -0
- mlquantify-0.0.11.1/mlquantify/utils/general_purposes/__init__.py +8 -0
- mlquantify-0.0.11.1/mlquantify/utils/general_purposes/convert_col_to_array.py +13 -0
- mlquantify-0.0.11.1/mlquantify/utils/general_purposes/generate_artificial_indexes.py +29 -0
- mlquantify-0.0.11.1/mlquantify/utils/general_purposes/get_real_prev.py +9 -0
- mlquantify-0.0.11.1/mlquantify/utils/general_purposes/load_quantifier.py +4 -0
- mlquantify-0.0.11.1/mlquantify/utils/general_purposes/make_prevs.py +23 -0
- mlquantify-0.0.11.1/mlquantify/utils/general_purposes/normalize.py +20 -0
- mlquantify-0.0.11.1/mlquantify/utils/general_purposes/parallel.py +10 -0
- mlquantify-0.0.11.1/mlquantify/utils/general_purposes/round_protocol_df.py +14 -0
- mlquantify-0.0.11.1/mlquantify/utils/method_purposes/__init__.py +6 -0
- mlquantify-0.0.11.1/mlquantify/utils/method_purposes/distances.py +21 -0
- mlquantify-0.0.11.1/mlquantify/utils/method_purposes/getHist.py +13 -0
- mlquantify-0.0.11.1/mlquantify/utils/method_purposes/get_scores.py +33 -0
- mlquantify-0.0.11.1/mlquantify/utils/method_purposes/moss.py +16 -0
- mlquantify-0.0.11.1/mlquantify/utils/method_purposes/ternary_search.py +14 -0
- mlquantify-0.0.11.1/mlquantify/utils/method_purposes/tprfpr.py +42 -0
- mlquantify-0.0.11.1/mlquantify.egg-info/PKG-INFO +114 -0
- mlquantify-0.0.11.1/mlquantify.egg-info/SOURCES.txt +76 -0
- {mlquantify-0.0.1 → mlquantify-0.0.11.1}/setup.py +8 -1
- mlquantify-0.0.1/MANIFEST.in +0 -4
- mlquantify-0.0.1/PKG-INFO +0 -22
- mlquantify-0.0.1/README.md +0 -2
- mlquantify-0.0.1/mlquantify.egg-info/PKG-INFO +0 -22
- mlquantify-0.0.1/mlquantify.egg-info/SOURCES.txt +0 -15
- {mlquantify-0.0.1 → mlquantify-0.0.11.1}/mlquantify/classification/__init__.py +0 -0
- {mlquantify-0.0.1 → mlquantify-0.0.11.1}/mlquantify/evaluation/__init__.py +0 -0
- {mlquantify-0.0.1 → mlquantify-0.0.11.1}/mlquantify/methods/__init__.py +0 -0
- {mlquantify-0.0.1 → mlquantify-0.0.11.1}/mlquantify/model_selection.py +0 -0
- {mlquantify-0.0.1 → mlquantify-0.0.11.1}/mlquantify/plots/__init__.py +0 -0
- {mlquantify-0.0.1 → mlquantify-0.0.11.1}/mlquantify/utils/__init__.py +0 -0
- {mlquantify-0.0.1 → mlquantify-0.0.11.1}/mlquantify.egg-info/dependency_links.txt +0 -0
- {mlquantify-0.0.1 → mlquantify-0.0.11.1}/mlquantify.egg-info/requires.txt +0 -0
- {mlquantify-0.0.1 → mlquantify-0.0.11.1}/mlquantify.egg-info/top_level.txt +0 -0
- {mlquantify-0.0.1 → mlquantify-0.0.11.1}/setup.cfg +0 -0
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: mlquantify
|
|
3
|
+
Version: 0.0.11.1
|
|
4
|
+
Summary: Quantification Library
|
|
5
|
+
Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
|
|
6
|
+
Maintainer: Luiz Fernando Luth Junior
|
|
7
|
+
Keywords: python,machine learning,quantification,quantify
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Operating System :: Unix
|
|
12
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
|
13
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
<h1 align="center">MLQuantify</h1>
|
|
17
|
+
<h4 align="center">A Python Package for Quantification</h4>
|
|
18
|
+
|
|
19
|
+
___
|
|
20
|
+
|
|
21
|
+
**mlquantify** is a Python library for quantification, also known as supervised prevalence estimation, designed to estimate the distribution of classes within datasets. It offers a range of tools for various quantification methods, model selection tailored for quantification tasks, evaluation metrics, and protocols to assess quantification performance. Additionally, mlquantify includes popular datasets and visualization tools to help analyze and interpret results.
|
|
22
|
+
|
|
23
|
+
___
|
|
24
|
+
|
|
25
|
+
## Latest Release
|
|
26
|
+
|
|
27
|
+
- **Version 0.0.11.1**: Initial beta version. For a detailed list of changes, check the [changelog](#).
|
|
28
|
+
- In case you need any help, refer to the [wiki](https://github.com/luizfernandolj/mlquantify/wiki).
|
|
29
|
+
- Explore the [API documentation](#) for detailed developer information.
|
|
30
|
+
- See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)
|
|
31
|
+
|
|
32
|
+
___
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
To install mlquantify, run the following command:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install mlquantify
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
___
|
|
43
|
+
|
|
44
|
+
## Contents
|
|
45
|
+
|
|
46
|
+
| Section | Description |
|
|
47
|
+
|---|---|
|
|
48
|
+
| **Quantification Methods** | Methods for quantification, such as classify & Count Correct methods, Threshold Optimization, Mixture Models and more.|
|
|
49
|
+
| **Dynamic class management** | All methods are dynamic, and handles multiclass and binary problems, in case of binary it makes One-Vs-All (OVA) automatically. |
|
|
50
|
+
| **Model Selection** | Criteria and processes used to select the best model, such as grid-search for the case of quantification|
|
|
51
|
+
| **Evaluation Metrics** | Specific metrics used to evaluate quantification performance, (e.g., AE, BIAS, NAE, SE, KLD, etc.). |
|
|
52
|
+
| **Evaluation Protocols** | Evaluation protocols used, based on sampling generation (e.g., APP, NPP, etc.). |
|
|
53
|
+
| **Plotting Results** | Tools and techniques used to visualize results, such as the protocol results.|
|
|
54
|
+
| **Comprehensive Documentation** | Complete documentation of the project, including code, data, and results. |
|
|
55
|
+
|
|
56
|
+
___
|
|
57
|
+
|
|
58
|
+
## Quick example:
|
|
59
|
+
|
|
60
|
+
This code first loads the breast cancer dataset from _sklearn_, which is then split into training and testing sets. It uses the _Expectation Maximisation Quantifier (EMQ)_ with a RandomForest classifier to predict class prevalence. After training the model, it evaluates performance by calculating and printing the absolute error and bias between the real and predicted prevalences.
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
import mlquantify as mq
|
|
64
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
65
|
+
from sklearn.datasets import load_breast_cancer
|
|
66
|
+
from sklearn.model_selection import train_test_split
|
|
67
|
+
|
|
68
|
+
# Loading dataset from sklearn
|
|
69
|
+
features, target = load_breast_cancer(return_X_y=True)
|
|
70
|
+
|
|
71
|
+
#Splitting into train and test
|
|
72
|
+
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
|
|
73
|
+
|
|
74
|
+
#Create the model, here it is the Expectation Maximisation Quantifier (EMQ) with a classifier
|
|
75
|
+
model = mq.methods.EMQ(RandomForestClassifier())
|
|
76
|
+
model.fit(X_train, y_train)
|
|
77
|
+
|
|
78
|
+
#Predict the class prevalence for X_test
|
|
79
|
+
pred_prevalence = model.predict(X_test)
|
|
80
|
+
real_prevalence = mq.utils.get_real_prev(y_test)
|
|
81
|
+
|
|
82
|
+
#Get the error for the prediction
|
|
83
|
+
ae = mq.evaluation.absolute_error(real_prevalence, pred_prevalence)
|
|
84
|
+
bias = mq.evaluation.bias(real_prevalence, pred_prevalence)
|
|
85
|
+
|
|
86
|
+
print(f"Absolute Error (AE) -> {ae:.4f}")
|
|
87
|
+
print(f"Bias -> {bias}")
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
___
|
|
91
|
+
|
|
92
|
+
## Requirements
|
|
93
|
+
|
|
94
|
+
- Scikit-learn
|
|
95
|
+
- pandas
|
|
96
|
+
- numpy
|
|
97
|
+
- joblib
|
|
98
|
+
- tqdm
|
|
99
|
+
- matplotlib
|
|
100
|
+
- xlrd
|
|
101
|
+
|
|
102
|
+
___
|
|
103
|
+
|
|
104
|
+
## Documentation
|
|
105
|
+
|
|
106
|
+
##### API is available [here](#)
|
|
107
|
+
|
|
108
|
+
- [Methods](https://github.com/luizfernandolj/mlquantify/wiki/Methods)
|
|
109
|
+
- [Model Selection](https://github.com/luizfernandolj/mlquantify/wiki/Model-Selection)
|
|
110
|
+
- [Evaluation](https://github.com/luizfernandolj/mlquantify/wiki/Evaluation)
|
|
111
|
+
- [Plotting](https://github.com/luizfernandolj/mlquantify/wiki/Plotting)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
___
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
<h1 align="center">MLQuantify</h1>
|
|
2
|
+
<h4 align="center">A Python Package for Quantification</h4>
|
|
3
|
+
|
|
4
|
+
___
|
|
5
|
+
|
|
6
|
+
**mlquantify** is a Python library for quantification, also known as supervised prevalence estimation, designed to estimate the distribution of classes within datasets. It offers a range of tools for various quantification methods, model selection tailored for quantification tasks, evaluation metrics, and protocols to assess quantification performance. Additionally, mlquantify includes popular datasets and visualization tools to help analyze and interpret results.
|
|
7
|
+
|
|
8
|
+
___
|
|
9
|
+
|
|
10
|
+
## Latest Release
|
|
11
|
+
|
|
12
|
+
- **Version 0.0.11.1**: Initial beta version. For a detailed list of changes, check the [changelog](#).
|
|
13
|
+
- In case you need any help, refer to the [wiki](https://github.com/luizfernandolj/mlquantify/wiki).
|
|
14
|
+
- Explore the [API documentation](#) for detailed developer information.
|
|
15
|
+
- See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)
|
|
16
|
+
|
|
17
|
+
___
|
|
18
|
+
|
|
19
|
+
## Installation
|
|
20
|
+
|
|
21
|
+
To install mlquantify, run the following command:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install mlquantify
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
___
|
|
28
|
+
|
|
29
|
+
## Contents
|
|
30
|
+
|
|
31
|
+
| Section | Description |
|
|
32
|
+
|---|---|
|
|
33
|
+
| **Quantification Methods** | Methods for quantification, such as classify & Count Correct methods, Threshold Optimization, Mixture Models and more.|
|
|
34
|
+
| **Dynamic class management** | All methods are dynamic, and handles multiclass and binary problems, in case of binary it makes One-Vs-All (OVA) automatically. |
|
|
35
|
+
| **Model Selection** | Criteria and processes used to select the best model, such as grid-search for the case of quantification|
|
|
36
|
+
| **Evaluation Metrics** | Specific metrics used to evaluate quantification performance, (e.g., AE, BIAS, NAE, SE, KLD, etc.). |
|
|
37
|
+
| **Evaluation Protocols** | Evaluation protocols used, based on sampling generation (e.g., APP, NPP, etc.). |
|
|
38
|
+
| **Plotting Results** | Tools and techniques used to visualize results, such as the protocol results.|
|
|
39
|
+
| **Comprehensive Documentation** | Complete documentation of the project, including code, data, and results. |
|
|
40
|
+
|
|
41
|
+
___
|
|
42
|
+
|
|
43
|
+
## Quick example:
|
|
44
|
+
|
|
45
|
+
This code first loads the breast cancer dataset from _sklearn_, which is then split into training and testing sets. It uses the _Expectation Maximisation Quantifier (EMQ)_ with a RandomForest classifier to predict class prevalence. After training the model, it evaluates performance by calculating and printing the absolute error and bias between the real and predicted prevalences.
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
import mlquantify as mq
|
|
49
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
50
|
+
from sklearn.datasets import load_breast_cancer
|
|
51
|
+
from sklearn.model_selection import train_test_split
|
|
52
|
+
|
|
53
|
+
# Loading dataset from sklearn
|
|
54
|
+
features, target = load_breast_cancer(return_X_y=True)
|
|
55
|
+
|
|
56
|
+
#Splitting into train and test
|
|
57
|
+
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
|
|
58
|
+
|
|
59
|
+
#Create the model, here it is the Expectation Maximisation Quantifier (EMQ) with a classifier
|
|
60
|
+
model = mq.methods.EMQ(RandomForestClassifier())
|
|
61
|
+
model.fit(X_train, y_train)
|
|
62
|
+
|
|
63
|
+
#Predict the class prevalence for X_test
|
|
64
|
+
pred_prevalence = model.predict(X_test)
|
|
65
|
+
real_prevalence = mq.utils.get_real_prev(y_test)
|
|
66
|
+
|
|
67
|
+
#Get the error for the prediction
|
|
68
|
+
ae = mq.evaluation.absolute_error(real_prevalence, pred_prevalence)
|
|
69
|
+
bias = mq.evaluation.bias(real_prevalence, pred_prevalence)
|
|
70
|
+
|
|
71
|
+
print(f"Absolute Error (AE) -> {ae:.4f}")
|
|
72
|
+
print(f"Bias -> {bias}")
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
___
|
|
76
|
+
|
|
77
|
+
## Requirements
|
|
78
|
+
|
|
79
|
+
- Scikit-learn
|
|
80
|
+
- pandas
|
|
81
|
+
- numpy
|
|
82
|
+
- joblib
|
|
83
|
+
- tqdm
|
|
84
|
+
- matplotlib
|
|
85
|
+
- xlrd
|
|
86
|
+
|
|
87
|
+
___
|
|
88
|
+
|
|
89
|
+
## Documentation
|
|
90
|
+
|
|
91
|
+
##### API is available [here](#)
|
|
92
|
+
|
|
93
|
+
- [Methods](https://github.com/luizfernandolj/mlquantify/wiki/Methods)
|
|
94
|
+
- [Model Selection](https://github.com/luizfernandolj/mlquantify/wiki/Model-Selection)
|
|
95
|
+
- [Evaluation](https://github.com/luizfernandolj/mlquantify/wiki/Evaluation)
|
|
96
|
+
- [Plotting](https://github.com/luizfernandolj/mlquantify/wiki/Plotting)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
___
|
|
@@ -138,13 +138,14 @@ class AggregativeQuantifier(Quantifier, ABC):
|
|
|
138
138
|
return self.learner.get_params()
|
|
139
139
|
|
|
140
140
|
def set_params(self, **params):
|
|
141
|
+
|
|
141
142
|
# Model Params
|
|
142
143
|
for key, value in params.items():
|
|
143
144
|
if hasattr(self, key):
|
|
144
145
|
setattr(self, key, value)
|
|
145
146
|
|
|
146
147
|
# Learner Params
|
|
147
|
-
if self.learner:
|
|
148
|
+
if self.learner is not None:
|
|
148
149
|
learner_params = {k.replace('learner__', ''): v for k, v in params.items() if 'learner__' in k}
|
|
149
150
|
if learner_params:
|
|
150
151
|
self.learner.set_params(**learner_params)
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from sklearn.neighbors import NearestNeighbors
|
|
2
|
+
from sklearn.base import BaseEstimator
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
class PWKCLF(BaseEstimator):
    """Class-weighted k-Nearest Neighbors (KNN) classifier for the PWK method.

    Each neighbour's vote is down-weighted for majority classes via
    ``w_c = (n_c / n_min) ** (-1 / alpha)``, where ``n_c`` is the training
    count of class ``c`` and ``n_min`` the count of the rarest class, so
    that class imbalance does not dominate the vote.

    Args:
        alpha: Imbalance-correction exponent; must be >= 1 (alpha == 1
            gives the strongest correction, alpha -> inf plain KNN).
        n_neighbors: Number of neighbours to query (clipped to the number
            of training samples in ``fit``).
        algorithm, metric, leaf_size, p, metric_params, n_jobs: Forwarded
            to :class:`sklearn.neighbors.NearestNeighbors`.

    Raises:
        ValueError: If ``alpha`` is smaller than 1.
    """

    def __init__(self,
                 alpha=1,
                 n_neighbors=10,
                 algorithm="auto",
                 metric="euclidean",
                 leaf_size=30,
                 p=2,
                 metric_params=None,
                 n_jobs=None):

        if alpha < 1:
            raise ValueError("alpha must not be smaller than 1")

        self.alpha = alpha
        self.n_neighbors = n_neighbors

        self.nbrs = NearestNeighbors(n_neighbors=n_neighbors,
                                     algorithm=algorithm,
                                     leaf_size=leaf_size,
                                     metric=metric,
                                     p=p,
                                     metric_params=metric_params,
                                     n_jobs=n_jobs)

        # Learned state, filled in by fit():
        self.Y = None       # sorted array of class labels
        self.Y_map = None   # label -> column index in the vote matrix
        self.w = None       # per-class vote weights
        self.y = None       # training labels, positionally indexable

    def fit(self, X, y):
        """Index the training data and compute the per-class vote weights.

        Args:
            X (array-like): Training features of shape (n_samples, n_features).
            y (array-like): Training labels.

        Returns:
            PWKCLF: self, for chaining.
        """
        n_samples = X.shape[0]
        # NearestNeighbors cannot return more neighbours than samples.
        if n_samples < self.n_neighbors:
            self.nbrs.set_params(n_neighbors=n_samples)

        self.y = y

        # predict() indexes self.y positionally with neighbour indices;
        # reset any pandas index so positions and labels line up.
        # (Bug fix: the original handled only DataFrame, so a Series with
        # a non-default index raised KeyError in predict.)
        if isinstance(self.y, (pd.Series, pd.DataFrame)):
            self.y = self.y.reset_index(drop=True)

        classes, counts = np.unique(y, return_counts=True)
        self.Y = classes
        self.Y_map = {label: idx for idx, label in enumerate(classes)}

        # Weight each class inversely to its relative frequency.
        self.w = (counts / counts.min()) ** (-1.0 / self.alpha)
        self.nbrs.fit(X)
        return self

    def predict(self, X):
        """Predict a class label for each row of X by weighted KNN vote."""
        n_samples = X.shape[0]
        nn_indices = self.nbrs.kneighbors(X, return_distance=False)

        # CM[i, c] counts how many of sample i's neighbours belong to class c.
        CM = np.zeros((n_samples, len(self.Y)))
        for i in range(n_samples):
            for j in nn_indices[i]:
                CM[i, self.Y_map[self.y[j]]] += 1

        # Apply the imbalance weights, then pick the best class per row.
        CM = CM * self.w
        return self.Y[np.argmax(CM, axis=1)]
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from .ae import absolute_error
|
|
2
|
+
from .kld import kullback_leibler_divergence
|
|
3
|
+
from .nkld import normalized_kullback_leibler_divergence
|
|
4
|
+
from .rae import relative_absolute_error
|
|
5
|
+
from .nae import normalized_absolute_error
|
|
6
|
+
from .bias import bias
|
|
7
|
+
from .nrae import normalized_relative_absolute_error
|
|
8
|
+
from .se import squared_error
|
|
9
|
+
from .mse import mean_squared_error
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Registry mapping each measure's short name to its implementation.
MEASURES = {
    "ae": absolute_error,
    "nae": normalized_absolute_error,
    "kld": kullback_leibler_divergence,
    "nkld": normalized_kullback_leibler_divergence,
    "nrae": normalized_relative_absolute_error,
    "rae": relative_absolute_error,
    "se": squared_error,
    "mse": mean_squared_error,
}


def get_measure(measure: str):
    """Look up an evaluation-measure function by its short name.

    Returns None when the name is not registered in MEASURES.
    """
    return MEASURES.get(measure)
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
def absolute_error(prev_real, prev_pred):
    """Compute the Absolute Error (AE) between two prevalence vectors.

    AE is the mean, over classes, of the absolute difference between the
    predicted and the real class prevalences.

    Args:
        prev_real: Real prevalences as an array-like of floats, or a dict
            mapping class -> prevalence.
        prev_pred: Predicted prevalences, same accepted formats.

    Returns:
        The mean absolute difference, taken over the last axis.
    """
    # Accept dict inputs (class -> prevalence); assumes both dicts list
    # their classes in the same order.
    if isinstance(prev_real, dict):
        prev_real = np.asarray(list(prev_real.values()))
    if isinstance(prev_pred, dict):
        prev_pred = np.asarray(list(prev_pred.values()))

    # Note: the original annotated parameters as np.any, which is a
    # function, not a type — removed.
    return np.abs(prev_pred - prev_real).mean(axis=-1)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
def bias(prev_real, prev_pred):
    """Compute the per-class deviation between predicted and real prevalences.

    Args:
        prev_real: Real prevalences as an array-like, or a dict mapping
            class -> prevalence.
        prev_pred: Predicted prevalences, same accepted formats.

    Returns:
        dict: class -> error, when ``prev_real`` was a dict.
        ndarray: per-class errors otherwise.

    NOTE(review): despite the name, this returns the *absolute* (unsigned)
    per-class error, not a signed bias (prev_pred - prev_real) — confirm
    whether a signed value was intended.
    """
    classes = None
    if isinstance(prev_real, dict):
        # Remember the class labels so the result can be keyed by them.
        classes = prev_real.keys()
        prev_real = np.asarray(list(prev_real.values()))
    if isinstance(prev_pred, dict):
        prev_pred = np.asarray(list(prev_pred.values()))

    abs_errors = np.abs(prev_pred - prev_real)

    if classes:
        return {class_: abs_error for class_, abs_error in zip(classes, abs_errors)}

    return abs_errors
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
def kullback_leibler_divergence(prev_real, prev_pred):
    """Compute a per-class Kullback-Leibler style divergence term.

    Returns ``prev_real * |log(prev_real / prev_pred)|`` element-wise.

    Args:
        prev_real: Real prevalences as an array-like, or a dict mapping
            class -> prevalence. Values must be strictly positive.
        prev_pred: Predicted prevalences, same accepted formats.

    Returns:
        ndarray: per-class divergence terms (not summed).

    NOTE(review): standard KLD sums p*log(p/q) without the absolute
    value; this variant takes |log(.)| per class — confirm intended.
    """
    if isinstance(prev_real, dict):
        prev_real = np.asarray(list(prev_real.values()))
    if isinstance(prev_pred, dict):
        prev_pred = np.asarray(list(prev_pred.values()))
    return prev_real * np.abs(np.log(prev_real / prev_pred))
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from .se import squared_error
|
|
3
|
+
|
|
4
|
+
def mean_squared_error(prev_real, prev_pred):
    """Compute the Mean Squared Error (MSE) between prevalence vectors.

    Args:
        prev_real: Real prevalences as an array-like, or a dict mapping
            class -> prevalence.
        prev_pred: Predicted prevalences, same accepted formats.

    Returns:
        The mean of squared_error over its result.
    """
    # Dict conversion also happens inside squared_error; kept here so the
    # contract of this function stands on its own.
    if isinstance(prev_real, dict):
        prev_real = np.asarray(list(prev_real.values()))
    if isinstance(prev_pred, dict):
        prev_pred = np.asarray(list(prev_pred.values()))

    return squared_error(prev_real, prev_pred).mean()
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from .ae import absolute_error
|
|
3
|
+
|
|
4
|
+
def normalized_absolute_error(prev_real, prev_pred):
    """Compute the Normalized Absolute Error (NAE).

    The absolute error is divided by its theoretical maximum for the
    given real prevalence vector, ``2 * (1 - min(prev_real))``, so the
    score lies in [0, 1].

    Args:
        prev_real: Real prevalences as an array-like, or a dict mapping
            class -> prevalence.
        prev_pred: Predicted prevalences, same accepted formats.

    Returns:
        The normalized absolute error.
    """
    if isinstance(prev_real, dict):
        prev_real = np.asarray(list(prev_real.values()))
    if isinstance(prev_pred, dict):
        prev_pred = np.asarray(list(prev_pred.values()))

    abs_error = absolute_error(prev_real, prev_pred)

    # Maximum attainable AE for this prev_real, used as the normalizer.
    z_abs_error = 2 * (1 - min(prev_real))

    return abs_error / z_abs_error
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from .kld import kullback_leibler_divergence
|
|
3
|
+
|
|
4
|
+
def normalized_kullback_leibler_divergence(prev_real, prev_pred):
    """Compute the Normalized Kullback-Leibler Divergence (NKLD).

    Maps the (unbounded) KLD into [0, 1) via the logistic-style transform
    ``2 * e^KLD / (e^KLD + 1) - 1``.

    Args:
        prev_real: Real prevalences as an array-like, or a dict mapping
            class -> prevalence.
        prev_pred: Predicted prevalences, same accepted formats.

    Returns:
        The normalized divergence (per class, since the underlying KLD
        helper returns per-class terms).
    """
    if isinstance(prev_real, dict):
        prev_real = np.asarray(list(prev_real.values()))
    if isinstance(prev_pred, dict):
        prev_pred = np.asarray(list(prev_pred.values()))

    euler = np.exp(kullback_leibler_divergence(prev_real, prev_pred))
    return 2 * (euler / (euler + 1)) - 1
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from .rae import relative_absolute_error
|
|
3
|
+
|
|
4
|
+
def normalized_relative_absolute_error(prev_real, prev_pred):
    """Compute the Normalized Relative Absolute Error (NRAE).

    The relative absolute error is divided by its theoretical maximum for
    the given real prevalence vector so the score lies in [0, 1].

    Args:
        prev_real: Real prevalences as an array-like, or a dict mapping
            class -> prevalence. Values must be strictly positive.
        prev_pred: Predicted prevalences, same accepted formats.

    Returns:
        The normalized relative absolute error.
    """
    if isinstance(prev_real, dict):
        prev_real = np.asarray(list(prev_real.values()))
    if isinstance(prev_pred, dict):
        prev_pred = np.asarray(list(prev_pred.values()))

    relative = relative_absolute_error(prev_real, prev_pred)

    # Maximum attainable RAE for this prev_real, used as the normalizer.
    z_relative = (len(prev_real) - 1 + ((1 - min(prev_real)) / min(prev_real))) / len(prev_real)

    return relative / z_relative
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from . import absolute_error
|
|
3
|
+
|
|
4
|
+
def relative_absolute_error(prev_real, prev_pred):
    """Compute the Relative Absolute Error (RAE).

    RAE is the mean, over classes, of the absolute prevalence error
    divided by the real prevalence of that class:
    ``mean_i(|pred_i - real_i| / real_i)``.

    Args:
        prev_real: Real prevalences as an array-like, or a dict mapping
            class -> prevalence. Values must be strictly positive.
        prev_pred: Predicted prevalences, same accepted formats.

    Returns:
        The relative absolute error, averaged over the last axis.
    """
    if isinstance(prev_real, dict):
        prev_real = np.asarray(list(prev_real.values()))
    if isinstance(prev_pred, dict):
        prev_pred = np.asarray(list(prev_pred.values()))

    prev_real = np.asarray(prev_real)
    prev_pred = np.asarray(prev_pred)

    # Bug fix: the previous implementation divided the already-averaged
    # absolute error by prev_real and averaged again, which computes
    # AE * mean(1/p) instead of the standard mean(|pred - real| / real).
    return (np.abs(prev_pred - prev_real) / prev_real).mean(axis=-1)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from .ae import absolute_error
|
|
3
|
+
|
|
4
|
+
def squared_error(prev_real, prev_pred):
    """Compute the Squared Error (SE) between prevalence vectors.

    SE is the mean, over classes, of the squared difference between the
    predicted and the real class prevalences.

    Args:
        prev_real: Real prevalences as an array-like, or a dict mapping
            class -> prevalence.
        prev_pred: Predicted prevalences, same accepted formats.

    Returns:
        The mean squared difference, taken over the last axis.
    """
    if isinstance(prev_real, dict):
        prev_real = np.asarray(list(prev_real.values()))
    if isinstance(prev_pred, dict):
        prev_pred = np.asarray(list(prev_pred.values()))

    return ((prev_pred - prev_real) ** 2).mean(axis=-1)
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from typing import Union, List
|
|
5
|
+
from sklearn.base import BaseEstimator
|
|
6
|
+
from time import time
|
|
7
|
+
from tqdm import tqdm
|
|
8
|
+
|
|
9
|
+
from ...methods import get_method, METHODS, AGGREGATIVE, NON_AGGREGATIVE
|
|
10
|
+
from ...utils import *
|
|
11
|
+
from ..measures import get_measure, MEASURES
|
|
12
|
+
from ...base import Quantifier, AggregativeQuantifier
|
|
13
|
+
|
|
14
|
+
class Protocol(ABC):
    """Base class for implementing different quantification protocols.

    This abstract class provides a structure for creating protocols that involve
    fitting quantification models to training data and generating predictions on
    test data. It supports parallel processing, multiple iterations, and
    different output formats.

    Args:
        models (Union[List[Union[str, Quantifier]], str, Quantifier]):
            List of quantification models, a single model name, or 'all'
            for all models.
        batch_size (Union[List[int], int]):
            Size of the batches to be processed, or a list of sizes.
        learner (BaseEstimator, optional):
            Machine learning model to be used with the quantifiers.
            Required for model methods.
        n_iterations (int, optional):
            Number of iterations for the protocol. Default is 1.
        n_jobs (int, optional):
            Number of jobs to run in parallel. Default is 1.
        random_state (int, optional):
            Seed for random number generation. Default is 32.
        verbose (bool, optional):
            Whether to print progress messages. Default is False.
        return_type (str, optional):
            Type of return value ('predictions' or 'table'). Default is
            'predictions'.
        measures (List[str], optional):
            List of error measures to calculate. Must be in MEASURES or
            None. Default is None.
    """

    def __init__(self,
                 models: Union[List[Union[str, Quantifier]], str, Quantifier],
                 batch_size: Union[List[int], int],
                 learner: BaseEstimator = None,
                 n_iterations: int = 1,
                 n_jobs: int = 1,
                 random_state: int = 32,
                 verbose: bool = False,
                 return_type: str = "predictions",
                 measures: List[str] = None):

        assert not measures or all(m in MEASURES for m in measures), \
            f"Invalid measure(s) provided. Valid options: {list(MEASURES.keys())} or None"
        assert return_type in ["predictions", "table"], \
            "Invalid return_type. Valid options: ['predictions', 'table']"

        self.models = self._initialize_models(models, learner)
        self.learner = learner
        self.batch_size = batch_size
        self.n_iterations = n_iterations
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        self.return_type = return_type
        self.measures = measures

    def _initialize_models(self, models, learner):
        """Resolve the *models* argument into a list of Quantifier instances.

        Accepts ready-made Quantifier objects, method-name strings, or the
        group keywords 'all' / 'aggregative' / 'non_aggregative'.
        """
        if isinstance(models, list):
            # A list of ready instances is used as-is; a list of names is
            # resolved through the method registry.
            if isinstance(models[0], Quantifier):
                return models
            assert learner is not None, "Learner is required for model methods."
            return [get_method(model)(learner) for model in models]
        if isinstance(models, Quantifier):
            return [models]

        assert learner is not None, "Learner is required for model methods."

        if models == "all":
            # NOTE(review): hasattr on the *class* only detects class-level
            # attributes; a 'learner' attribute set in __init__ will not be
            # seen here — confirm this selects the intended constructors.
            # (A leftover debug print of this exact check was removed.)
            return [model(learner) if hasattr(model, "learner") else model()
                    for model in METHODS.values()]
        if models == "aggregative":
            return [model(learner) for model in AGGREGATIVE.values()]
        if models == "non_aggregative":
            return [model() for model in NON_AGGREGATIVE.values()]

        # A single method-name string.
        return [get_method(models)(learner)]

    def sout(self, msg):
        """Print *msg* with the protocol prefix when verbose mode is on."""
        if self.verbose:
            print('[APP]' + msg)

    def fit(self, X_train, y_train):
        """Fit all methods into the training data.

        Args:
            X_train (array-like): Features of training.
            y_train (array-like): Labels of training.
        """
        self.sout("Fitting models")

        # Fit every model in parallel; tqdm wraps the argument generator
        # only when verbose output was requested.
        args = ((model, X_train, y_train, self.verbose) for model in self.models)
        self.models = parallel(
            self._delayed_fit,
            tqdm(args, desc="Fitting models", total=len(self.models)) if self.verbose else args,
            self.n_jobs)

        self.sout("Fit [Done]")
        return self

    def predict(self, X_test, y_test):
        """Generate several samples with artificial prevalences and sizes.

        For each method, predicts with each sample, aggregating everything
        into a pandas DataFrame if requested, or else just the predictions.

        Args:
            X_test (array-like): Features of test.
            y_test (array-like): Labels of test.

        Returns:
            tuple: tuple containing the model names, real prevalences and
                predicted prevalences, or
            DataFrame: table of results, along with error measures if
                requested.
        """
        predictions = self.predict_protocol(X_test, y_test)

        predictions_df = pd.DataFrame(predictions)

        if self.return_type == "table":
            predictions_df.columns = ["QUANTIFIER", "REAL_PREVS", "PRED_PREVS", "BATCH_SIZE"]

            if self.measures:

                def smooth(values: np.ndarray) -> np.ndarray:
                    # Laplace-style smoothing to avoid zero prevalences
                    # (which would break ratio/log based measures).
                    smoothed_factor = 1 / (2 * len(X_test))
                    values = (values + smoothed_factor) / (smoothed_factor * len(values) + 1)
                    return values

                for metric in self.measures:
                    predictions_df[metric] = predictions_df.apply(
                        lambda row: get_measure(metric)(smooth(row["REAL_PREVS"]), smooth(row["PRED_PREVS"])),
                        axis=1
                    )

            return predictions_df

        predictions_array = predictions_df.to_numpy()
        return (
            predictions_array[:, 0],             # Model names
            np.stack(predictions_array[:, 1]),   # Real prevalences
            np.stack(predictions_array[:, 2])    # Predicted prevalences
        )

    @abstractmethod
    def predict_protocol(self) -> np.ndarray:
        """ Abstract method that every protocol has to implement """
        ...

    @abstractmethod
    def _new_sample(self) -> tuple:
        """ Abstract method of sample extraction for each protocol

        Returns:
            tuple: tuple containing the X_sample and the y_sample
        """
        ...

    @abstractmethod
    def _delayed_predict(self, args) -> tuple:
        """Abstract method for predicting on the extracted samples;
        delayed so it can run in parallel for efficiency purposes.
        """
        ...

    def _delayed_fit(self, args):
        """Fit a single model; delayed so parallel() can dispatch it."""
        model, X_train, y_train, verbose = args

        if verbose:
            print(f"\tFitting {model.__class__.__name__}")
            start = time()

        model = model.fit(X=X_train, y=y_train)

        if verbose:
            end = time()
            print(f"\t\\--Fit ended for {model.__class__.__name__} in {round(end - start, 3)} seconds")
        return model