BenchmarkDPFair 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- benchmarkdpfair-0.1.0/LICENSE +21 -0
- benchmarkdpfair-0.1.0/PKG-INFO +165 -0
- benchmarkdpfair-0.1.0/README.md +110 -0
- benchmarkdpfair-0.1.0/pyproject.toml +46 -0
- benchmarkdpfair-0.1.0/setup.cfg +4 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/__init__.py +4 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/benchmark.py +282 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/dataconf.py +58 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/utils/__init__.py +0 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/utils/auxiliar.py +94 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/utils/benchmark.py +176 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/utils/inp.py +141 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/utils/pos.py +183 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/utils/pre.py +233 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/utils/types.py +15 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/utils/verifiers.py +102 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair/DataGenerator/__init__.py +5 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair/DataGenerator/dataconf.py +94 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair/DataGenerator/datagen.py +246 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair/DataGenerator/utils/verifiers.py +28 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair/__init__.py +4 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair.egg-info/PKG-INFO +165 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair.egg-info/SOURCES.txt +27 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair.egg-info/dependency_links.txt +1 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair.egg-info/requires.txt +21 -0
- benchmarkdpfair-0.1.0/src/BenchmarkDPFair.egg-info/top_level.txt +1 -0
- benchmarkdpfair-0.1.0/tests/test_benchmark.py +31 -0
- benchmarkdpfair-0.1.0/tests/test_dataconf.py +25 -0
- benchmarkdpfair-0.1.0/tests/test_datagen.py +0 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Vinicius Gabriel Angelozzi Verona de Resende, Héber Hwang Arcolezi
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: BenchmarkDPFair
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A differentially private data synthesizer and fairness intervention benchmark framework
|
|
5
|
+
Author-email: Vinicius Gabriel Angelozzi Verona de Resende <verona.projects@tutanota.com>, Héber Hwang Arcolezi <heber.hwang-arcolezi@etsmtl.ca>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2025 Vinicius Gabriel Angelozzi Verona de Resende, Héber Hwang Arcolezi
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/vinicius-verona/dp-fair-intervention-benchmark
|
|
29
|
+
Project-URL: Issues, https://github.com/vinicius-verona/dp-fair-intervention-benchmark/issues
|
|
30
|
+
Requires-Python: >=3.9
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
License-File: LICENSE
|
|
33
|
+
Requires-Dist: aif360>=0.6.1
|
|
34
|
+
Requires-Dist: aif360[inFairness]>=0.6.1
|
|
35
|
+
Requires-Dist: fairlearn>=0.12.0
|
|
36
|
+
Requires-Dist: inFairness>=0.2.3
|
|
37
|
+
Requires-Dist: matplotlib>=3.9.4
|
|
38
|
+
Requires-Dist: matplotlib-inline>=0.1.7
|
|
39
|
+
Requires-Dist: numpy>=1.26.4
|
|
40
|
+
Requires-Dist: pandas>=2.2.3
|
|
41
|
+
Requires-Dist: scikit-learn>=1.6.1
|
|
42
|
+
Requires-Dist: scipy>=1.13.1
|
|
43
|
+
Requires-Dist: smartnoise-sql>=1.0.6
|
|
44
|
+
Requires-Dist: smartnoise-synth>=1.0.5
|
|
45
|
+
Requires-Dist: tabulate>=0.9.0
|
|
46
|
+
Requires-Dist: ucimlrepo>=0.0.7
|
|
47
|
+
Requires-Dist: xgboost>=2.1.1
|
|
48
|
+
Requires-Dist: tensorflow>=2.19.0
|
|
49
|
+
Requires-Dist: tensorflow-io-gcs-filesystem>=0.37.1
|
|
50
|
+
Requires-Dist: cvxpy>=1.6.5
|
|
51
|
+
Requires-Dist: jax<0.5,>=0.4.30
|
|
52
|
+
Requires-Dist: jaxlib<0.5,>=0.4.30
|
|
53
|
+
Requires-Dist: chex>=0.1.87
|
|
54
|
+
Dynamic: license-file
|
|
55
|
+
|
|
56
|
+
# DP+Fair Benchmarking Framework
|
|
57
|
+
|
|
58
|
+
This repository provides a Python framework for **benchmarking fairness mechanisms** on **Differentially Private Synthetic Data**.
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Features
|
|
64
|
+
|
|
65
|
+
- ⚡ Simple, reproducible setup for benchmarking algorithms
|
|
66
|
+
- 🧩 Flexible API to plug in any classifier implementing `fit`, `predict`, and `predict_proba`
|
|
67
|
+
- 📊 Pre-offered datasets included under `data/`
|
|
68
|
+
- 🔬 Configurable experiment settings: dataset schema, dataset synthesizer, seeds, privacy-budget, input/outputs, classifier, data pre-processing.
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
## Installation
|
|
73
|
+
|
|
74
|
+
To install, clone the repository and install dependencies:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
git clone https://github.com/vinicius-verona/dp-fair-intervention-benchmark.git
|
|
78
|
+
cd dp-fair-intervention-benchmark
|
|
79
|
+
pip install -e .
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Alternatively, you can install from **PyPI** (Yet to be made available):
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
pip install BenchmarkDPFair
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## Repository Structure
|
|
91
|
+
|
|
92
|
+
```
|
|
93
|
+
├── data/ # Pre-offered datasets
|
|
94
|
+
├── src/ # Core source code
|
|
95
|
+
├── examples/ # Some demo
|
|
96
|
+
├── tests/ # Unit tests
|
|
97
|
+
└── README.md
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Quick Start
|
|
103
|
+
|
|
104
|
+
Here is a minimal usage example:
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
from BenchmarkDPFair.DataGenerator import generate_data, DatasetGeneratorConfig
|
|
108
|
+
from BenchmarkDPFair.Benchmark import BenchmarkDatasetConfig, BenchmarkInfo
from BenchmarkDPFair.Benchmark.benchmark import benchmark
|
|
109
|
+
|
|
110
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
111
|
+
|
|
112
|
+
# Generate Data
|
|
113
|
+
data_conf = DatasetGeneratorConfig(
|
|
114
|
+
name = "Adult",
|
|
115
|
+
target= "...",
|
|
116
|
+
synthesizer = "aim",
|
|
117
|
+
root_dir="./data",
|
|
118
|
+
sensitive_attr = "...",
|
|
119
|
+
categorical_cols = [...],
|
|
120
|
+
sensitive_cols = [...],
|
|
121
|
+
privacy_budgets=[...],
|
|
122
|
+
binary_encoder=...
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
generate_data("adult.csv", data_conf, verbose=True) # Saves as CSV
|
|
126
|
+
|
|
127
|
+
# Dataset configuration
|
|
128
|
+
benchmark_config = BenchmarkInfo(
|
|
129
|
+
dp_method="aim",
|
|
130
|
+
output_dir="./data/Adult/output/",
|
|
131
|
+
seeds = [...],
|
|
132
|
+
eps = [...]
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
benchmark_dataset = BenchmarkDatasetConfig(
|
|
136
|
+
name = "Adult",
|
|
137
|
+
target= "income",
|
|
138
|
+
root_dir="./data",
|
|
139
|
+
sensitive_attr = "...",
|
|
140
|
+
index_col="...",
|
|
141
|
+
categorical_cols = [...],
|
|
142
|
+
sensitive_cols = [...],
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
benchmark(benchmark_info=benchmark_config, data_conf=benchmark_dataset)
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
More detailed examples can be found in the [`example/`](example/) directory.
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## License
|
|
153
|
+
|
|
154
|
+
License: **MIT**
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## Contributing
|
|
159
|
+
|
|
160
|
+
Contributions are welcome:
|
|
161
|
+
|
|
162
|
+
* Open an issue for bug reports or feature requests
|
|
163
|
+
* Submit a pull request to the `main` branch for code contributions
|
|
164
|
+
|
|
165
|
+
---
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# DP+Fair Benchmarking Framework
|
|
2
|
+
|
|
3
|
+
This repository provides a Python framework for **benchmarking fairness mechanisms** on **Differentially Private Synthetic Data**.
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## Features
|
|
9
|
+
|
|
10
|
+
- ⚡ Simple, reproducible setup for benchmarking algorithms
|
|
11
|
+
- 🧩 Flexible API to plug in any classifier implementing `fit`, `predict`, and `predict_proba`
|
|
12
|
+
- 📊 Pre-offered datasets included under `data/`
|
|
13
|
+
- 🔬 Configurable experiment settings: dataset schema, dataset synthesizer, seeds, privacy-budget, input/outputs, classifier, data pre-processing.
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
To install, clone the repository and install dependencies:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
git clone https://github.com/vinicius-verona/dp-fair-intervention-benchmark.git
|
|
23
|
+
cd dp-fair-intervention-benchmark
|
|
24
|
+
pip install -e .
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Alternatively, you can install from **PyPI** (Yet to be made available):
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install BenchmarkDPFair
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## Repository Structure
|
|
36
|
+
|
|
37
|
+
```
|
|
38
|
+
├── data/ # Pre-offered datasets
|
|
39
|
+
├── src/ # Core source code
|
|
40
|
+
├── examples/ # Some demo
|
|
41
|
+
├── tests/ # Unit tests
|
|
42
|
+
└── README.md
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## Quick Start
|
|
48
|
+
|
|
49
|
+
Here is a minimal usage example:
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from BenchmarkDPFair.DataGenerator import generate_data, DatasetGeneratorConfig
|
|
53
|
+
from BenchmarkDPFair.Benchmark import BenchmarkDatasetConfig, BenchmarkInfo
from BenchmarkDPFair.Benchmark.benchmark import benchmark
|
|
54
|
+
|
|
55
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
56
|
+
|
|
57
|
+
# Generate Data
|
|
58
|
+
data_conf = DatasetGeneratorConfig(
|
|
59
|
+
name = "Adult",
|
|
60
|
+
target= "...",
|
|
61
|
+
synthesizer = "aim",
|
|
62
|
+
root_dir="./data",
|
|
63
|
+
sensitive_attr = "...",
|
|
64
|
+
categorical_cols = [...],
|
|
65
|
+
sensitive_cols = [...],
|
|
66
|
+
privacy_budgets=[...],
|
|
67
|
+
binary_encoder=...
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
generate_data("adult.csv", data_conf, verbose=True) # Saves as CSV
|
|
71
|
+
|
|
72
|
+
# Dataset configuration
|
|
73
|
+
benchmark_config = BenchmarkInfo(
|
|
74
|
+
dp_method="aim",
|
|
75
|
+
output_dir="./data/Adult/output/",
|
|
76
|
+
seeds = [...],
|
|
77
|
+
eps = [...]
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
benchmark_dataset = BenchmarkDatasetConfig(
|
|
81
|
+
name = "Adult",
|
|
82
|
+
target= "income",
|
|
83
|
+
root_dir="./data",
|
|
84
|
+
sensitive_attr = "...",
|
|
85
|
+
index_col="...",
|
|
86
|
+
categorical_cols = [...],
|
|
87
|
+
sensitive_cols = [...],
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
benchmark(benchmark_info=benchmark_config, data_conf=benchmark_dataset)
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
More detailed examples can be found in the [`example/`](example/) directory.
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## License
|
|
98
|
+
|
|
99
|
+
License: **MIT**
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## Contributing
|
|
104
|
+
|
|
105
|
+
Contributions are welcome:
|
|
106
|
+
|
|
107
|
+
* Open an issue for bug reports or feature requests
|
|
108
|
+
* Submit a pull request to the `main` branch for code contributions
|
|
109
|
+
|
|
110
|
+
---
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "BenchmarkDPFair"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A differentially private data synthesizer and fairness intervention benchmark framework"
|
|
9
|
+
authors = [
|
|
10
|
+
{ name="Vinicius Gabriel Angelozzi Verona de Resende", email="verona.projects@tutanota.com" },
|
|
11
|
+
{ name="Héber Hwang Arcolezi", email="heber.hwang-arcolezi@etsmtl.ca" }
|
|
12
|
+
]
|
|
13
|
+
readme = "README.md"
|
|
14
|
+
requires-python = ">=3.9"
|
|
15
|
+
dependencies = [
|
|
16
|
+
"aif360>=0.6.1",
|
|
17
|
+
"aif360[inFairness]>=0.6.1",
|
|
18
|
+
"fairlearn>=0.12.0",
|
|
19
|
+
"inFairness>=0.2.3",
|
|
20
|
+
"matplotlib>=3.9.4",
|
|
21
|
+
"matplotlib-inline>=0.1.7",
|
|
22
|
+
"numpy>=1.26.4",
|
|
23
|
+
"pandas>=2.2.3",
|
|
24
|
+
"scikit-learn>=1.6.1",
|
|
25
|
+
"scipy>=1.13.1",
|
|
26
|
+
"smartnoise-sql>=1.0.6",
|
|
27
|
+
"smartnoise-synth>=1.0.5",
|
|
28
|
+
"tabulate>=0.9.0",
|
|
29
|
+
"ucimlrepo>=0.0.7",
|
|
30
|
+
"xgboost>=2.1.1",
|
|
31
|
+
"tensorflow>=2.19.0",
|
|
32
|
+
"tensorflow-io-gcs-filesystem>=0.37.1",
|
|
33
|
+
"cvxpy>=1.6.5",
|
|
34
|
+
"jax>=0.4.30,<0.5",
|
|
35
|
+
"jaxlib>=0.4.30,<0.5",
|
|
36
|
+
"chex>=0.1.87"
|
|
37
|
+
|
|
38
|
+
]
|
|
39
|
+
license = { file = "LICENSE" }
|
|
40
|
+
|
|
41
|
+
[tool.setuptools.packages.find]
|
|
42
|
+
where = ["src"]
|
|
43
|
+
|
|
44
|
+
[project.urls]
|
|
45
|
+
Homepage = "https://github.com/vinicius-verona/dp-fair-intervention-benchmark"
|
|
46
|
+
Issues = "https://github.com/vinicius-verona/dp-fair-intervention-benchmark/issues"
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import warnings
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import numpy as np
|
|
5
|
+
import inspect
|
|
6
|
+
|
|
7
|
+
from typing import Callable, List, Any, Optional, Tuple, Union
|
|
8
|
+
from sklearn.model_selection import train_test_split
|
|
9
|
+
from tabulate import tabulate
|
|
10
|
+
|
|
11
|
+
from .dataconf import BenchmarkDatasetConfig
|
|
12
|
+
from .utils.types import FloatOrTuple, DFTuple
|
|
13
|
+
from .utils.verifiers import check_data_loader, check_splitdata, check_target, read_verification, check_dict
|
|
14
|
+
|
|
15
|
+
from .utils.benchmark import Benchmark
|
|
16
|
+
from .utils.auxiliar import save_experiment
|
|
17
|
+
|
|
18
|
+
# Random seeds used when the caller does not pass `seeds` to BenchmarkInfo.
# They are integers, so the annotation is List[int].
DEFAULT_SEEDS: List[int] = [
    5, 42, 253, 4112, 32645, 602627, 153073, 53453, 178753, 243421,
    767707, 113647, 796969, 553067, 96797, 133843, 6977, 460403, 126613, 583879,
]

# Privacy budgets (epsilon) evaluated when the caller does not pass `eps`.
DEFAULT_EPS: List[float] = [0.05, 0.1, 0.25, 0.5, 0.75, 1, 2, 3, 5, 10, 15, 20]

# Name of the DP synthesizer currently in use. It is overwritten every time a
# BenchmarkInfo is constructed and read by `_load_data` to build dataset paths.
DP_ALGORITHM: str = ""
|
|
21
|
+
|
|
22
|
+
class BenchmarkInfo:
    """Settings that drive a benchmark run (synthesizer, seeds, budgets, loader, classifier)."""

    def __init__(self, dp_method: str, output_dir: str, data_loader: Optional[Callable[..., DFTuple]] = None,
                 dlkwargs: Optional[Union[dict, set]] = None,
                 split_data: Optional[FloatOrTuple] = None, normalize: bool = True, seeds: List[float] = DEFAULT_SEEDS,
                 eps: List[Union[float, int]] = DEFAULT_EPS, classifier: Any = None,
                 classifier_kwargs: Optional[Union[dict, set]] = None):
        """
        Set of possible configurations for the Benchmark experiments.

        **In case you do not use our own generator, read the documentation first to understand how the benchmark expects the data to be organized.**

        Constructing an instance also overwrites the module-level `DP_ALGORITHM`
        with `dp_method`; the default data loader reads that global to build paths.

        Parameters
        ----------
        dp_method : str
            Which DP synthetic data generator was used.
        output_dir : str
            Directory to save the experiment logs and metrics.
        data_loader : Callable, optional
            In case a new data loader needs to be used, refer to the documentation to understand the default data loader's behaviour. data_loader must accept seed as an argument and also kwargs.
        dlkwargs : dict | set, optional
            Custom parameters for the data loader. Defaults to a fresh empty dict per instance.
        split_data : FloatOrTuple, optional
            Split distributions used while loading data. If not provided, the final distributions are **0.6, 0.2 and 0.2**, which is `split_data = (0.4, 0.5)`.
        normalize : bool, optional
            Allow MinMax normalization of the data. Default is **True**.
        seeds : List[int], optional
            List of integer seeds for the benchmark. Used to increase reproducibility.
        eps : List[float|int], optional
            List of DP epsilons (privacy budget) analysed during the benchmark.
        classifier : Any, optional
            Custom classifier. **Must implement fit, predict and predict_proba**. Default is [XGBoost](https://xgboost.readthedocs.io/en/stable/).
        classifier_kwargs : dict | set, optional
            Custom parameters for the classifier.
        """

        self.dp_method = dp_method
        self.output_dir = output_dir
        self.normalize = normalize
        # Copy so that mutating an instance's lists never alters the shared
        # module-level defaults (or the caller's own list).
        self.seeds = list(seeds)
        self.eps = list(eps)

        global DP_ALGORITHM
        DP_ALGORITHM = self.dp_method

        # Validate before storing; check_splitdata is expected to raise on bad input.
        check_splitdata(split_data)
        self.split = split_data

        # Wrap user-supplied function with enforcement
        self.data_loader = check_data_loader(data_loader) if data_loader is not None else self.__data_loader
        self.custom_loader = data_loader is not None
        # A `{}` default in the signature would be shared by every instance.
        self.dlkwargs = {} if dlkwargs is None else dlkwargs

        self.classifier = classifier
        self.classifier_kwargs = classifier_kwargs

    def dataloader(self, **kwargs) -> DFTuple:
        """
        Call the configured data loader, forwarding every keyword argument unchanged.

        The default loader assumes that a CSV file with the name given in `filename` exists in the dataset directory.

        If no separate test file is found, the loaded file is split into three sets following `split_data`.
        The split happens sequentially: the first value splits train from the held-out part, and the second
        value splits the held-out part into validation and test. If only one number has been provided, it is
        used for both steps.

        **Please refer to the documentation to understand how the default dataloader expects the directory structure to be like.**

        Parameters
        ----------
        data_conf : DatasetConf
            Configuration of the desired dataset.
        filename : str
            The name of the CSV file to load.
        seed : int
            The current seed used to load the file and split the data.
        verbose : bool, optional
            If `true` prints information on the loaded dataset.
        extra_processing : Callable, optional
            Custom (user) processing function applied to loaded data. Will be called using kwargs and the loaded data as arguments.
        kwargs : Any, optional
            If an extra processing function is provided, will be forwarded while calling, with the loaded dataset.

        Returns
        ----------
        With the default loader, three `(X, y)` pairs in the order train, validation, test.
        A custom loader returns whatever it was written to return.
        """
        return self.data_loader(**kwargs)

    @check_data_loader
    def __data_loader(self, data_conf: BenchmarkDatasetConfig, filename: str, seed: int, **kwargs) -> DFTuple:
        # Default loader: delegate to the module-level helper with this instance's split.
        return _load_data(data_conf, filename, seed, split=self.split, **kwargs)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _read_dataset_csv(path, data_conf, columns, extra_processing, extra_kwargs):
    """Read one CSV restricted to `columns`, set the index, verify it, then run the user hook."""
    frame = pd.read_csv(path, usecols=lambda col: col in columns)

    if data_conf.index_col:
        frame.set_index(data_conf.index_col, inplace=True)

    # Verify if data was read successfully
    read_verification(frame, data_conf.usecols)

    # The hook's return value is ignored, so it has to modify `frame` in place.
    if extra_processing is not None:
        extra_processing(frame, **extra_kwargs)

    return frame


def _load_data(data_conf: BenchmarkDatasetConfig, filename: str, seed: int, epsilon: Optional[float] = None,
               verbose: bool = True, split: Optional[FloatOrTuple] = None, extra_processing: Optional[Callable] = None,
               **kwargs) -> DFTuple:
    """
    Load a dataset CSV and return train, validation and test sets.

    A companion test file named `<prefix>_test<ext>` is looked up in a
    `DP-dataset-test` directory, where `<prefix>` is the file name up to its
    last underscore. When it exists, the whole of `filename` is used for
    training and the test file is split into validation and test. Otherwise
    `filename` itself is split into the three sets.

    Parameters
    ----------
    data_conf : BenchmarkDatasetConfig
        Dataset configuration; `name`, `dir`, `usecols`, `index_col`,
        `categorical_cols` and `target` are read.
    filename : str
        CSV to load. A bare file name is resolved under
        `<dir>/<name>/<DP_ALGORITHM>/DP-dataset-train` (or
        `DP-dataset-epsilon-<epsilon>` when `epsilon` is given). A name that
        already contains a directory is used as is.
    seed : int
        Random state passed to every `train_test_split` call.
    epsilon : float, optional
        Privacy budget selecting the synthetic-data sub-directory.
    verbose : bool, optional
        Print progress, split information and the resulting shapes.
    split : FloatOrTuple, optional
        Either `(holdout_fraction, test_fraction_of_holdout)` or a single
        number used for both steps. Defaults to `(0.4, 0.5)`.
    extra_processing : Callable, optional
        Called as `extra_processing(df, **kwargs)` on each loaded frame; its
        return value is ignored.
    kwargs : Any
        Forwarded to `extra_processing`.

    Returns
    -------
    DFTuple
        `(X_train, y_train), (X_val, y_val), (X_test, y_test)`.
    """
    if verbose:
        print(f"** Loading dataset {data_conf.name.upper()} **")

    # Remember whether the caller supplied a tuple before the default is applied,
    # so the "you provided a tuple" warning is not shown for the default split.
    caller_gave_tuple = isinstance(split, tuple)
    if split is None:
        split = (0.4, 0.5)

    if isinstance(split, tuple):
        holdout_size, test_size = split[0], split[1]
    else:
        holdout_size = test_size = split

    # Derive the test file name from the basename only; using the full path
    # would prepend the training directory to the test file name.
    stem, ext = os.path.splitext(os.path.basename(filename))
    test_filename = f"{stem.rsplit('_', 1)[0]}_test{ext}"

    if os.path.dirname(filename):
        # The test directory is a sibling of the directory holding `filename`.
        test_dir = os.path.join(os.path.dirname(os.path.dirname(filename)), "DP-dataset-test")
    else:
        dataset_root = f"{data_conf.dir}/{data_conf.name}/{DP_ALGORITHM}"
        test_dir = os.path.join(dataset_root, "DP-dataset-test")
        subdir = f"DP-dataset-epsilon-{epsilon}" if epsilon is not None else "DP-dataset-train"
        filename = os.path.join(dataset_root, subdir, filename)

    test_file = os.path.join(test_dir, test_filename)

    # Requested columns plus the index column, de-duplicated in order.
    cols = list(data_conf.usecols)
    if data_conf.index_col:
        cols.append(data_conf.index_col)
    cols = list(dict.fromkeys(cols))

    ds = _read_dataset_csv(filename, data_conf, cols, extra_processing, kwargs)

    # Ensure all dataset is numerical. The category levels are kept so that a
    # separate test file can be encoded with the same value -> code mapping.
    category_levels = {}
    for col in data_conf.categorical_cols:
        if not pd.api.types.is_numeric_dtype(ds[col]):
            as_category = ds[col].astype('category')
            category_levels[col] = as_category.cat.categories
            ds[col] = as_category.cat.codes  # Int encode

    X = ds.drop(columns=[data_conf.target])
    y = ds[data_conf.target]

    if not os.path.isfile(test_file):
        if verbose:
            distributions = (
                1 - holdout_size,
                holdout_size * (1 - test_size),
                holdout_size * test_size,
            )
            print(f"[WARN] Test directory and/or file with test set not found, the provided {filename} will be split into three sets with distributions {distributions}.")
            print(f"       This is the path we are looking for: {test_file}.\n")

        # No test file found, so carve the held-out part out of `filename`.
        X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=holdout_size, random_state=seed)
    else:
        X_train, y_train = X, y

        test_ds = _read_dataset_csv(test_file, data_conf, cols, extra_processing, kwargs)

        # Apply the training encoding to the test file; values unseen in
        # training become -1.
        for col, levels in category_levels.items():
            if col in test_ds.columns and not pd.api.types.is_numeric_dtype(test_ds[col]):
                test_ds[col] = pd.Categorical(test_ds[col], categories=levels).codes

        X_holdout = test_ds.drop(columns=[data_conf.target])
        y_holdout = test_ds[data_conf.target]

        if caller_gave_tuple:
            print(f"[WARN] You provided a tuple {split} of splitting distribution and a test directory and file has been found in {test_dir}, the second value of the tuple will be used.\n")

    # The held-out part is always divided into validation and test.
    X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=test_size, random_state=seed)

    if verbose:
        data = [
            ["X_train", X_train.shape],
            ["X_val", X_val.shape],
            ["X_test", X_test.shape],
            ["y_train", y_train.shape],
            ["y_val", y_val.shape],
            ["y_test", y_test.shape],
        ]
        print("\n#### Data Information ####")
        print(tabulate(data, headers=["Dataset", "Shape"], tablefmt="github"))
        print("###########################\n")

    # Check that the target column is binary
    check_target(y_train, data_conf.target)
    check_target(y_val, data_conf.target)
    check_target(y_test, data_conf.target)

    return (X_train, y_train), (X_val, y_val), (X_test, y_test)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
############# Experiments #############
|
|
215
|
+
def _experiment(seed, dataset_conf: BenchmarkDatasetConfig, benchmark_info: BenchmarkInfo, savefile):
    """
    Run the baseline experiment and one DP experiment per epsilon for a single seed.

    NumPy's global RNG is seeded with `seed`. Each experiment is saved through
    `save_experiment` into `<output_dir>/<dataset name>/<dp_method>/results/`
    under the file name `savefile`.
    """
    np.random.seed(seed)
    results_dir = f"{benchmark_info.output_dir}/{dataset_conf.name}/{benchmark_info.dp_method}/results/"

    def make_kwargs(suffix, epsilon):
        # Keyword arguments handed to Benchmark as `ekwargs`; only the file
        # suffix and epsilon differ between the baseline and DP runs.
        return {
            "data_conf": dataset_conf,
            "filename": dataset_conf.name + f"_split_dataset_seed_{seed}_{suffix}.csv",
            "custom_loader": benchmark_info.custom_loader,
            "epsilon": epsilon,
            "seed": seed,
            "classifier": benchmark_info.classifier,
            "classifier_kwargs": benchmark_info.classifier_kwargs,
        }

    print(f"\n*********************** Fair-only - seed = {seed} ***********************\n")
    baseline = Benchmark(
        name="baseline",
        data_loader=benchmark_info.data_loader,
        normalize=benchmark_info.normalize,
        seed=seed,
        dlkwargs=benchmark_info.dlkwargs,
        ekwargs=make_kwargs("train", None),
    )
    baseline.run()
    save_experiment(baseline, seed, filename=savefile, path=results_dir, synth=benchmark_info.dp_method)

    # Drop the reference before moving on to the DP runs.
    del baseline

    for epsilon in benchmark_info.eps:
        print(f"\n*********************** DP & DP+Fair | ε={epsilon} ***********************\n")
        dp_run = Benchmark(
            name="dp",
            data_loader=benchmark_info.data_loader,
            normalize=benchmark_info.normalize,
            seed=seed,
            dlkwargs=benchmark_info.dlkwargs,
            ekwargs=make_kwargs(f"epsilon-{epsilon}", epsilon),
        )
        dp_run.run()
        save_experiment(dp_run, seed, epsilon, filename=savefile, path=results_dir, synth=benchmark_info.dp_method)

        # Remove the loader attribute first, then the experiment itself.
        del dp_run.data_loader
        del dp_run
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def benchmark(data_conf: BenchmarkDatasetConfig, benchmark_info: BenchmarkInfo):
    """
    Run the fairness-intervention benchmark once for every configured seed.

    Models are evaluated on the original data and on differentially private
    synthetic data.

    **The results obtained are output into a csv file in the defined output directory.**
    The file name encodes every seed, every epsilon and the synthesizer name.

    Parameters
    -----------
    data_conf: BenchmarkDatasetConfig
        Configurations on the dataset used

    benchmark_info: BenchmarkInfo
        Configurations about the experiments
    """
    print(f"Running DP Benchmark on dataset: '{data_conf.name}' with target: '{data_conf.target}' and sensitive attribute: '{data_conf.sensitive_attr}'")

    seeds_tag = "_".join(map(str, benchmark_info.seeds))
    eps_tag = "_".join(map(str, benchmark_info.eps))
    savefile = f"benchmark_results_seeds_{seeds_tag}_eps_{eps_tag}_synth_{benchmark_info.dp_method}.csv"

    for current_seed in benchmark_info.seeds:
        _experiment(current_seed, data_conf, benchmark_info, savefile)
|