BenchmarkDPFair 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. benchmarkdpfair-0.1.0/LICENSE +21 -0
  2. benchmarkdpfair-0.1.0/PKG-INFO +165 -0
  3. benchmarkdpfair-0.1.0/README.md +110 -0
  4. benchmarkdpfair-0.1.0/pyproject.toml +46 -0
  5. benchmarkdpfair-0.1.0/setup.cfg +4 -0
  6. benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/__init__.py +4 -0
  7. benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/benchmark.py +282 -0
  8. benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/dataconf.py +58 -0
  9. benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/utils/__init__.py +0 -0
  10. benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/utils/auxiliar.py +94 -0
  11. benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/utils/benchmark.py +176 -0
  12. benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/utils/inp.py +141 -0
  13. benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/utils/pos.py +183 -0
  14. benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/utils/pre.py +233 -0
  15. benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/utils/types.py +15 -0
  16. benchmarkdpfair-0.1.0/src/BenchmarkDPFair/Benchmark/utils/verifiers.py +102 -0
  17. benchmarkdpfair-0.1.0/src/BenchmarkDPFair/DataGenerator/__init__.py +5 -0
  18. benchmarkdpfair-0.1.0/src/BenchmarkDPFair/DataGenerator/dataconf.py +94 -0
  19. benchmarkdpfair-0.1.0/src/BenchmarkDPFair/DataGenerator/datagen.py +246 -0
  20. benchmarkdpfair-0.1.0/src/BenchmarkDPFair/DataGenerator/utils/verifiers.py +28 -0
  21. benchmarkdpfair-0.1.0/src/BenchmarkDPFair/__init__.py +4 -0
  22. benchmarkdpfair-0.1.0/src/BenchmarkDPFair.egg-info/PKG-INFO +165 -0
  23. benchmarkdpfair-0.1.0/src/BenchmarkDPFair.egg-info/SOURCES.txt +27 -0
  24. benchmarkdpfair-0.1.0/src/BenchmarkDPFair.egg-info/dependency_links.txt +1 -0
  25. benchmarkdpfair-0.1.0/src/BenchmarkDPFair.egg-info/requires.txt +21 -0
  26. benchmarkdpfair-0.1.0/src/BenchmarkDPFair.egg-info/top_level.txt +1 -0
  27. benchmarkdpfair-0.1.0/tests/test_benchmark.py +31 -0
  28. benchmarkdpfair-0.1.0/tests/test_dataconf.py +25 -0
  29. benchmarkdpfair-0.1.0/tests/test_datagen.py +0 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Vinicius Gabriel Angelozzi Verona de Resende, Héber Hwang Arcolezi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,165 @@
1
+ Metadata-Version: 2.4
2
+ Name: BenchmarkDPFair
3
+ Version: 0.1.0
4
+ Summary: A differentially private data synthesizer and fairness intervention benchmark framework
5
+ Author-email: Vinicius Gabriel Angelozzi Verona de Resende <verona.projects@tutanota.com>, Héber Hwang Arcolezi <heber.hwang-arcolezi@etsmtl.ca>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2025 Vinicius Gabriel Angelozzi Verona de Resende, Héber Hwang Arcolezi
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/vinicius-verona/dp-fair-intervention-benchmark
29
+ Project-URL: Issues, https://github.com/vinicius-verona/dp-fair-intervention-benchmark/issues
30
+ Requires-Python: >=3.9
31
+ Description-Content-Type: text/markdown
32
+ License-File: LICENSE
33
+ Requires-Dist: aif360>=0.6.1
34
+ Requires-Dist: aif360[inFairness]>=0.6.1
35
+ Requires-Dist: fairlearn>=0.12.0
36
+ Requires-Dist: inFairness>=0.2.3
37
+ Requires-Dist: matplotlib>=3.9.4
38
+ Requires-Dist: matplotlib-inline>=0.1.7
39
+ Requires-Dist: numpy>=1.26.4
40
+ Requires-Dist: pandas>=2.2.3
41
+ Requires-Dist: scikit-learn>=1.6.1
42
+ Requires-Dist: scipy>=1.13.1
43
+ Requires-Dist: smartnoise-sql>=1.0.6
44
+ Requires-Dist: smartnoise-synth>=1.0.5
45
+ Requires-Dist: tabulate>=0.9.0
46
+ Requires-Dist: ucimlrepo>=0.0.7
47
+ Requires-Dist: xgboost>=2.1.1
48
+ Requires-Dist: tensorflow>=2.19.0
49
+ Requires-Dist: tensorflow-io-gcs-filesystem>=0.37.1
50
+ Requires-Dist: cvxpy>=1.6.5
51
+ Requires-Dist: jax<0.5,>=0.4.30
52
+ Requires-Dist: jaxlib<0.5,>=0.4.30
53
+ Requires-Dist: chex>=0.1.87
54
+ Dynamic: license-file
55
+
56
+ # DP+Fair Benchmarking Framework
57
+
58
+ This repository provides a Python framework for **benchmarking fairness mechanisms** on **Differentially Private Synthetic Data**.
59
+
60
+
61
+ ---
62
+
63
+ ## Features
64
+
65
+ - ⚡ Simple, reproducible setup for benchmarking algorithms
66
+ - 🧩 Flexible API to plug in any classifier implementing `fit`, `predict`, and `predict_proba`
67
+ - 📊 Pre-offered datasets included under `data/`
68
+ - 🔬 Configurable experiment settings: dataset schema, dataset synthesizer, seeds, privacy-budget, input/outputs, classifier, data pre-processing.
69
+
70
+ ---
71
+
72
+ ## Installation
73
+
74
+ To install, clone the repository and install dependencies:
75
+
76
+ ```bash
77
+ git clone https://github.com/vinicius-verona/dp-fair-intervention-benchmark.git
78
+ cd dp-fair-intervention-benchmark
79
+ pip install -e .
80
+ ```
81
+
82
+ Alternatively, you can install from **PyPI** (Yet to be made available):
83
+
84
+ ```bash
85
+ pip install BenchmarkDPFair
86
+ ```
87
+
88
+ ---
89
+
90
+ ## Repository Structure
91
+
92
+ ```
93
+ ├── data/ # Pre-offered datasets
94
+ ├── src/ # Core source code
95
+ ├── examples/ # Some demo
96
+ ├── tests/ # Unit tests
97
+ └── README.md
98
+ ```
99
+
100
+ ---
101
+
102
+ ## Quick Start
103
+
104
+ Here is a minimal usage example:
105
+
106
+ ```python
107
+ from BenchmarkDPFair.DataGenerator import generate_data, DatasetGeneratorConfig
108
+ from BenchmarkDPFair.Benchmark import BenchmarkDatasetConfig, BenchmarkInfo, benchmark
109
+
110
+ from sklearn.ensemble import RandomForestClassifier
111
+
112
+ # Generate Data
113
+ data_conf = DatasetGeneratorConfig(
114
+ name = "Adult",
115
+ target= "...",
116
+ synthesizer = "aim",
117
+ root_dir="./data",
118
+ sensitive_attr = "...",
119
+ categorical_cols = [...],
120
+ sensitive_cols = [...],
121
+ privacy_budgets=[...],
122
+ binary_encoder=...
123
+ )
124
+
125
+ generate_data("adult.csv", data_conf, verbose=True) # Saves as CSV
126
+
127
+ # Dataset configuration
128
+ benchmark_config = BenchmarkInfo(
129
+ dp_method="aim",
130
+ output_dir="./data/Adult/output/",
131
+ seeds = [...],
132
+ eps = [...]
133
+ )
134
+
135
+ benchmark_dataset = BenchmarkDatasetConfig(
136
+ name = "Adult",
137
+ target= "income",
138
+ root_dir="./data",
139
+ sensitive_attr = "...",
140
+ index_col="...",
141
+ categorical_cols = [...],
142
+ sensitive_cols = [...],
143
+ )
144
+
145
+ benchmark(benchmark_info=benchmark_config, data_conf=benchmark_dataset)
146
+ ```
147
+
148
+ More detailed examples can be found in the [`examples/`](examples/) directory.
149
+
150
+ ---
151
+
152
+ ## License
153
+
154
+ License: **MIT**
155
+
156
+ ---
157
+
158
+ ## Contributing
159
+
160
+ Contributions are welcome:
161
+
162
+ * Open an issue for bug reports or feature requests
163
+ * Submit a pull request to the `main` branch for code contributions
164
+
165
+ ---
@@ -0,0 +1,110 @@
1
+ # DP+Fair Benchmarking Framework
2
+
3
+ This repository provides a Python framework for **benchmarking fairness mechanisms** on **Differentially Private Synthetic Data**.
4
+
5
+
6
+ ---
7
+
8
+ ## Features
9
+
10
+ - ⚡ Simple, reproducible setup for benchmarking algorithms
11
+ - 🧩 Flexible API to plug in any classifier implementing `fit`, `predict`, and `predict_proba`
12
+ - 📊 Pre-offered datasets included under `data/`
13
+ - 🔬 Configurable experiment settings: dataset schema, dataset synthesizer, seeds, privacy-budget, input/outputs, classifier, data pre-processing.
14
+
15
+ ---
16
+
17
+ ## Installation
18
+
19
+ To install, clone the repository and install dependencies:
20
+
21
+ ```bash
22
+ git clone https://github.com/vinicius-verona/dp-fair-intervention-benchmark.git
23
+ cd dp-fair-intervention-benchmark
24
+ pip install -e .
25
+ ```
26
+
27
+ Alternatively, you can install from **PyPI** (Yet to be made available):
28
+
29
+ ```bash
30
+ pip install BenchmarkDPFair
31
+ ```
32
+
33
+ ---
34
+
35
+ ## Repository Structure
36
+
37
+ ```
38
+ ├── data/ # Pre-offered datasets
39
+ ├── src/ # Core source code
40
+ ├── examples/ # Some demo
41
+ ├── tests/ # Unit tests
42
+ └── README.md
43
+ ```
44
+
45
+ ---
46
+
47
+ ## Quick Start
48
+
49
+ Here is a minimal usage example:
50
+
51
+ ```python
52
+ from BenchmarkDPFair.DataGenerator import generate_data, DatasetGeneratorConfig
53
+ from BenchmarkDPFair.Benchmark import BenchmarkDatasetConfig, BenchmarkInfo, benchmark
54
+
55
+ from sklearn.ensemble import RandomForestClassifier
56
+
57
+ # Generate Data
58
+ data_conf = DatasetGeneratorConfig(
59
+ name = "Adult",
60
+ target= "...",
61
+ synthesizer = "aim",
62
+ root_dir="./data",
63
+ sensitive_attr = "...",
64
+ categorical_cols = [...],
65
+ sensitive_cols = [...],
66
+ privacy_budgets=[...],
67
+ binary_encoder=...
68
+ )
69
+
70
+ generate_data("adult.csv", data_conf, verbose=True) # Saves as CSV
71
+
72
+ # Dataset configuration
73
+ benchmark_config = BenchmarkInfo(
74
+ dp_method="aim",
75
+ output_dir="./data/Adult/output/",
76
+ seeds = [...],
77
+ eps = [...]
78
+ )
79
+
80
+ benchmark_dataset = BenchmarkDatasetConfig(
81
+ name = "Adult",
82
+ target= "income",
83
+ root_dir="./data",
84
+ sensitive_attr = "...",
85
+ index_col="...",
86
+ categorical_cols = [...],
87
+ sensitive_cols = [...],
88
+ )
89
+
90
+ benchmark(benchmark_info=benchmark_config, data_conf=benchmark_dataset)
91
+ ```
92
+
93
+ More detailed examples can be found in the [`examples/`](examples/) directory.
94
+
95
+ ---
96
+
97
+ ## License
98
+
99
+ License: **MIT**
100
+
101
+ ---
102
+
103
+ ## Contributing
104
+
105
+ Contributions are welcome:
106
+
107
+ * Open an issue for bug reports or feature requests
108
+ * Submit a pull request to the `main` branch for code contributions
109
+
110
+ ---
@@ -0,0 +1,46 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "BenchmarkDPFair"
7
+ version = "0.1.0"
8
+ description = "A differentially private data synthesizer and fairness intervention benchmark framework"
9
+ authors = [
10
+ { name="Vinicius Gabriel Angelozzi Verona de Resende", email="verona.projects@tutanota.com" },
11
+ { name="Héber Hwang Arcolezi", email="heber.hwang-arcolezi@etsmtl.ca" }
12
+ ]
13
+ readme = "README.md"
14
+ requires-python = ">=3.9"
15
+ dependencies = [
16
+ "aif360>=0.6.1",
17
+ "aif360[inFairness]>=0.6.1",
18
+ "fairlearn>=0.12.0",
19
+ "inFairness>=0.2.3",
20
+ "matplotlib>=3.9.4",
21
+ "matplotlib-inline>=0.1.7",
22
+ "numpy>=1.26.4",
23
+ "pandas>=2.2.3",
24
+ "scikit-learn>=1.6.1",
25
+ "scipy>=1.13.1",
26
+ "smartnoise-sql>=1.0.6",
27
+ "smartnoise-synth>=1.0.5",
28
+ "tabulate>=0.9.0",
29
+ "ucimlrepo>=0.0.7",
30
+ "xgboost>=2.1.1",
31
+ "tensorflow>=2.19.0",
32
+ "tensorflow-io-gcs-filesystem>=0.37.1",
33
+ "cvxpy>=1.6.5",
34
+ "jax>=0.4.30,<0.5",
35
+ "jaxlib>=0.4.30,<0.5",
36
+ "chex>=0.1.87"
37
+
38
+ ]
39
+ license = { file = "LICENSE" }
40
+
41
+ [tool.setuptools.packages.find]
42
+ where = ["src"]
43
+
44
+ [project.urls]
45
+ Homepage = "https://github.com/vinicius-verona/dp-fair-intervention-benchmark"
46
+ Issues = "https://github.com/vinicius-verona/dp-fair-intervention-benchmark/issues"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,4 @@
1
+ from .benchmark import BenchmarkInfo,benchmark
2
+ from .dataconf import BenchmarkDatasetConfig
3
+
4
+ __all__ = ["BenchmarkInfo", "BenchmarkDatasetConfig", "benchmark"]
@@ -0,0 +1,282 @@
1
+ import os
2
+ import warnings
3
+ import pandas as pd
4
+ import numpy as np
5
+ import inspect
6
+
7
+ from typing import Callable, List, Any, Optional, Tuple, Union
8
+ from sklearn.model_selection import train_test_split
9
+ from tabulate import tabulate
10
+
11
+ from .dataconf import BenchmarkDatasetConfig
12
+ from .utils.types import FloatOrTuple, DFTuple
13
+ from .utils.verifiers import check_data_loader, check_splitdata, check_target, read_verification, check_dict
14
+
15
+ from .utils.benchmark import Benchmark
16
+ from .utils.auxiliar import save_experiment
17
+
18
# Seeds used when BenchmarkInfo is built without an explicit `seeds` list.
DEFAULT_SEEDS: List[int] = [
    5, 42, 253, 4112, 32645, 602627, 153073, 53453, 178753, 243421,
    767707, 113647, 796969, 553067, 96797, 133843, 6977, 460403, 126613, 583879,
]

# Privacy budgets (epsilon) used when BenchmarkInfo is built without an explicit `eps` list.
DEFAULT_EPS: List[Union[float, int]] = [0.05, 0.1, 0.25, 0.5, 0.75, 1, 2, 3, 5, 10, 15, 20]

# Name of the DP synthesizer currently being benchmarked. BenchmarkInfo.__init__
# overwrites it and _load_data reads it to build the default dataset paths.
DP_ALGORITHM: str = ""
21
+
22
class BenchmarkInfo:
    """Settings shared by every experiment of a benchmark run."""

    def __init__(self, dp_method: str, output_dir: str, data_loader: Optional[Callable[..., DFTuple]] = None, dlkwargs: Optional[Union[dict, set]] = None,
                 split_data: Optional[FloatOrTuple] = None, normalize: bool = True, seeds: List[float] = DEFAULT_SEEDS,
                 eps: List[Union[float, int]] = DEFAULT_EPS, classifier: Any = None, classifier_kwargs: Optional[Union[dict, set]] = None):
        """
        Set of possible configurations for the Benchmark experiments.

        **In case you do not use our own generator, read the documentation first to understand how the benchmark expects the data to be organized.**

        Parameters
        ----------
        dp_method : str
            Which DP synthetic data generator was used.
        output_dir : str
            Directory to save the experiment logs and metrics.
        data_loader : Callable, optional
            In case a new data loader needs to be used, refer to the documentation to understand the default data loader's behaviour. data_loader must accept seed as an argument and also kwargs.
        dlkwargs : dict | set, optional
            Custom parameters for the data loader. Defaults to an empty dict.
        split_data : FloatOrTuple, optional
            Split distributions used while loading data. If not provided, the final distributions are **0.6, 0.2 and 0.2**, which is `split_data = (0.4, 0.5)`.
        normalize : bool, optional
            Allow MinMax normalization of the data. Default is **True**.
        seeds : List[int], optional
            List of seeds for the benchmark. Used to increase reproducibility.
        eps : List[float|int], optional
            List of DP epsilons (privacy budget) analysed during the benchmark.
        classifier : Any, optional
            Custom classifier. **Must implement fit, predict and predict_proba**. Default is [XGBoost](https://xgboost.readthedocs.io/en/stable/).
        classifier_kwargs : dict | set, optional
            Custom parameters for the classifier.
        """

        self.dp_method = dp_method
        self.output_dir = output_dir
        self.normalize = normalize
        # Copy the lists so that editing an instance never alters the
        # module-level defaults (or a list still owned by the caller).
        self.seeds = list(seeds)
        self.eps = list(eps)

        # The default loader builds its paths from this module-level name.
        global DP_ALGORITHM
        DP_ALGORITHM = self.dp_method

        check_splitdata(split_data)
        self.split = split_data

        # Wrap user-supplied function with enforcement
        self.data_loader = check_data_loader(data_loader) if data_loader is not None else self.__data_loader
        self.custom_loader = data_loader is not None
        # A fresh dict per instance instead of one mutable default shared by all.
        self.dlkwargs = {} if dlkwargs is None else dlkwargs

        self.classifier = classifier
        self.classifier_kwargs = classifier_kwargs

    def dataloader(self, **kwargs) -> DFTuple:
        """
        Call the configured data loader with the given keyword arguments.

        When no custom loader was supplied, the call is forwarded to `_load_data`
        together with the `split_data` value given at construction time. The
        parameters below are the ones the default loader understands.

        If the default loader finds no separate test file, the loaded file is split
        sequentially: first into train and a held-out part, then the held-out part
        into validation and test. With a tuple the two values drive the two splits;
        with a single number the same value is used for both.

        **Please refer to the documentation to understand how the default dataloader expects the directory structure to be like.**

        Parameters
        ----------
        data_conf : BenchmarkDatasetConfig
            Configuration of the desired dataset.
        filename : str
            The name of the CSV file to load.
        seed : int
            The current seed used to load the file and split the data.
        epsilon : float, optional
            Privacy budget of the file to load; selects the directory the default loader reads from.
        verbose : bool, optional
            If `True` prints information on the loaded dataset.
        extra_processing : Callable, optional
            Custom (user) processing function applied to loaded data. Will be called with the loaded data and the remaining kwargs.
        kwargs : Any, optional
            If an extra processing function is provided, forwarded to it together with the loaded dataset.

        Returns
        ----------
        Three tuple[pd.DataFrame, pd.DataFrame]
            - The default loader returns `(X_train, y_train), (X_val, y_val), (X_test, y_test)`.
        """
        return self.data_loader(**kwargs)

    @check_data_loader
    def __data_loader(self, data_conf: BenchmarkDatasetConfig, filename: str, seed: int, **kwargs) -> DFTuple:
        """Default loader: delegate to `_load_data` using this instance's split setting."""
        return _load_data(data_conf, filename, seed, split=self.split, **kwargs)
113
+
114
+
115
def _read_split_file(path, cols, data_conf, extra_processing, kwargs):
    """Read one CSV split, set its index, validate its columns and run the optional user hook."""
    frame = pd.read_csv(path, usecols=lambda col: col in cols)

    if data_conf.index_col:
        frame.set_index(data_conf.index_col, inplace=True)

    # Verify if data was read successfully
    read_verification(frame, data_conf.usecols)

    # The hook's return value is ignored, so it has to modify the frame in place.
    if extra_processing is not None:
        extra_processing(frame, **kwargs)

    return frame


def _load_data(data_conf: BenchmarkDatasetConfig, filename: str, seed: int, epsilon: Optional[float] = None,
               verbose: bool = True, split: Optional[FloatOrTuple] = None, extra_processing: Optional[Callable] = None, **kwargs) -> DFTuple:
    """
    Load a dataset CSV and return its train, validation and test sets.

    If `filename` has no directory part, it is looked up under
    `{data_conf.dir}/{data_conf.name}/{DP_ALGORITHM}/DP-dataset-train/` (or
    `DP-dataset-epsilon-{epsilon}/` when `epsilon` is given), and the test file
    under the sibling `DP-dataset-test/` directory. If `filename` has a directory
    part, the test file is looked up in `DP-dataset-test/` next to that directory.
    The test file name is `filename` with its last `_`-separated token replaced by `test`.

    When the test file exists, the loaded file is used entirely for training and
    the test file is split into validation and test. Otherwise the loaded file is
    split twice: train vs. held-out, then held-out into validation and test.

    Parameters
    ----------
    data_conf : BenchmarkDatasetConfig
        Dataset description; `name`, `dir`, `usecols`, `index_col`, `categorical_cols` and `target` are read.
    filename : str
        CSV file to load, with or without a directory part.
    seed : int
        Random state passed to both `train_test_split` calls.
    epsilon : float, optional
        Privacy budget used to pick the default directory.
    verbose : bool, optional
        Print progress, the split warning and the resulting shapes.
    split : FloatOrTuple, optional
        A tuple `(first, second)` of test sizes for the two splits, or one number
        used for both. Defaults to `(0.4, 0.5)`.
    extra_processing : Callable, optional
        Called as `extra_processing(frame, **kwargs)` on each loaded frame.
    kwargs : Any, optional
        Forwarded to `extra_processing`.

    Returns
    ----------
    `(X_train, y_train), (X_val, y_val), (X_test, y_test)`
    """
    if verbose:
        print(f"** Loading dataset {data_conf.name.upper()} **")

    if split is None:
        split = (0.4, 0.5)

    # A tuple carries one test size per split; a single number is reused for both.
    if isinstance(split, tuple):
        first_cut, second_cut = split[0], split[1]
    else:
        first_cut = second_cut = split

    # Derive the test file name from the bare file name so that a directory part
    # of `filename` never leaks into it.
    source_dir = os.path.dirname(filename)
    stem, ext = os.path.splitext(os.path.basename(filename))
    test_filename = f"{stem.rsplit('_', 1)[0]}_test{ext}"

    if source_dir:
        # The test directory sits next to the directory holding `filename`.
        test_dir = os.path.join(os.path.dirname(source_dir), "DP-dataset-test")
    else:
        dataset_root = f"{data_conf.dir}/{data_conf.name}/{DP_ALGORITHM}"
        test_dir = f"{dataset_root}/DP-dataset-test"
        split_dir = f"epsilon-{epsilon}" if epsilon is not None else "train"
        filename = f"{dataset_root}/DP-dataset-{split_dir}/{filename}"

    test_file = os.path.join(test_dir, test_filename)

    cols = list(dict.fromkeys(data_conf.usecols + [data_conf.index_col] if data_conf.index_col else data_conf.usecols))
    ds = _read_split_file(filename, cols, data_conf, extra_processing, kwargs)

    # Ensure all dataset is numerical. The categories seen here are remembered
    # so that a separate test file can be encoded with the same mapping.
    train_categories = {}
    for col in data_conf.categorical_cols:
        if not pd.api.types.is_numeric_dtype(ds[col]):
            as_category = ds[col].astype("category")
            train_categories[col] = as_category.cat.categories
            ds[col] = as_category.cat.codes  # Int encode

    X = ds.drop(columns=[data_conf.target])
    y = ds[data_conf.target]

    # Split data
    if not os.path.exists(test_file):
        if verbose:
            distribution = (1 - first_cut, first_cut * (1 - second_cut), first_cut * second_cut)
            print(f"[WARN] Test directory and/or file with test set not found, the provided {filename} will be split into three sets with distributions {distribution}.")
            print(f" This is the path we are looking for: {test_file}.\n")

        # No test file found, so split the data from filename
        X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=first_cut, random_state=seed)
        X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=second_cut, random_state=seed)

    else:
        X_train = X
        y_train = y

        test_ds = _read_split_file(test_file, cols, data_conf, extra_processing, kwargs)

        # Encode the test set with the training categories; a value that never
        # appeared in the training file becomes -1.
        for col, categories in train_categories.items():
            if not pd.api.types.is_numeric_dtype(test_ds[col]):
                test_ds[col] = pd.Categorical(test_ds[col], categories=categories).codes

        X_test = test_ds.drop(columns=[data_conf.target])
        y_test = test_ds[data_conf.target]

        if isinstance(split, tuple):
            print(f"[WARN] You provided a tuple {split} of splitting distribution and a test directory and file has been found in {test_dir}, the second value of the tuple will be used.\n")

        X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=second_cut, random_state=seed)

    if verbose:
        data = [
            ["X_train", X_train.shape],
            ["X_val", X_val.shape],
            ["X_test", X_test.shape],
            ["y_train", y_train.shape],
            ["y_val", y_val.shape],
            ["y_test", y_test.shape],
        ]
        print("\n#### Data Information ####")
        print(tabulate(data, headers=["Dataset", "Shape"], tablefmt="github"))
        print("###########################\n")

    # Check that the target column is binary
    check_target(y_train, data_conf.target)
    check_target(y_val, data_conf.target)
    check_target(y_test, data_conf.target)

    return (X_train, y_train), (X_val, y_val), (X_test, y_test)
212
+
213
+
214
+ ############# Experiments #############
215
def _experiment(seed, dataset_conf: BenchmarkDatasetConfig, benchmark_info: BenchmarkInfo, savefile):
    """
    Run and save the baseline experiment, then one DP experiment per epsilon, for a single seed.

    NumPy's global random state is seeded with `seed` first. Every experiment is
    saved through `save_experiment` into
    `{output_dir}/{dataset name}/{dp_method}/results/` under the name `savefile`.
    """
    np.random.seed(seed)
    results_dir = f"{benchmark_info.output_dir}/{dataset_conf.name}/{benchmark_info.dp_method}/results/"

    def launch(name, filename, epsilon):
        # Build and run one Benchmark; only name, file and epsilon vary between runs.
        run = Benchmark(
            name=name,
            data_loader=benchmark_info.data_loader,
            normalize=benchmark_info.normalize,
            seed=seed,
            dlkwargs=benchmark_info.dlkwargs,
            ekwargs={
                "data_conf": dataset_conf,
                "filename": filename,
                "custom_loader": benchmark_info.custom_loader,
                "epsilon": epsilon,
                "seed": seed,
                "classifier": benchmark_info.classifier,
                "classifier_kwargs": benchmark_info.classifier_kwargs,
            },
        )
        run.run()
        return run

    print(f"\n*********************** Fair-only - seed = {seed} ***********************\n")
    baseline_run = launch("baseline", dataset_conf.name + f"_split_dataset_seed_{seed}_train.csv", None)
    save_experiment(baseline_run, seed, filename=savefile, path=results_dir, synth=benchmark_info.dp_method)
    del baseline_run

    for epsilon in benchmark_info.eps:
        print(f"\n*********************** DP & DP+Fair | ε={epsilon} ***********************\n")
        dp_run = launch("dp", dataset_conf.name + f"_split_dataset_seed_{seed}_epsilon-{epsilon}.csv", epsilon)
        save_experiment(dp_run, seed, epsilon, filename=savefile, path=results_dir, synth=benchmark_info.dp_method)

        # Drop the run (and its loader reference) before the next epsilon starts.
        del dp_run.data_loader, dp_run
260
+
261
+
262
def benchmark(data_conf: BenchmarkDatasetConfig, benchmark_info: BenchmarkInfo):
    """
    Benchmark fairness interventions on models trained on original and on differentially private synthetic data.

    One experiment series is run per seed in `benchmark_info.seeds`.

    **The results are written to a csv file whose name lists every seed, every epsilon and the synthesizer.**

    Parameters
    -----------
    data_conf: BenchmarkDatasetConfig
        Configurations on the dataset used

    benchmark_info: BenchmarkInfo
        Configurations about the experiments
    """

    print(f"Running DP Benchmark on dataset: '{data_conf.name}' with target: '{data_conf.target}' and sensitive attribute: '{data_conf.sensitive_attr}'")

    # All seeds share one results file, so its name is built once up front.
    seeds_tag = "_".join(map(str, benchmark_info.seeds))
    eps_tag = "_".join(map(str, benchmark_info.eps))
    savefile = f"benchmark_results_seeds_{seeds_tag}_eps_{eps_tag}_synth_{benchmark_info.dp_method}.csv"

    for current_seed in benchmark_info.seeds:
        _experiment(current_seed, data_conf, benchmark_info, savefile)