synthyverse 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. synthyverse-0.1.0/LICENSE +21 -0
  2. synthyverse-0.1.0/MANIFEST.in +5 -0
  3. synthyverse-0.1.0/PKG-INFO +195 -0
  4. synthyverse-0.1.0/README.md +137 -0
  5. synthyverse-0.1.0/logo/logo.png +0 -0
  6. synthyverse-0.1.0/pyproject.toml +3 -0
  7. synthyverse-0.1.0/requirements/evaluation/eval.txt +6 -0
  8. synthyverse-0.1.0/requirements/generators/arf.txt +2 -0
  9. synthyverse-0.1.0/requirements/generators/base.txt +1 -0
  10. synthyverse-0.1.0/requirements/generators/bn.txt +3 -0
  11. synthyverse-0.1.0/requirements/generators/ctgan.txt +2 -0
  12. synthyverse-0.1.0/requirements/generators/tvae.txt +2 -0
  13. synthyverse-0.1.0/setup.cfg +4 -0
  14. synthyverse-0.1.0/setup.py +60 -0
  15. synthyverse-0.1.0/synthyverse/__init__.py +0 -0
  16. synthyverse-0.1.0/synthyverse/__version__.py +1 -0
  17. synthyverse-0.1.0/synthyverse/benchmark/__init__.py +1 -0
  18. synthyverse-0.1.0/synthyverse/benchmark/benchmark.py +88 -0
  19. synthyverse-0.1.0/synthyverse/evaluation/__init__.py +1 -0
  20. synthyverse-0.1.0/synthyverse/evaluation/eval.py +120 -0
  21. synthyverse-0.1.0/synthyverse/evaluation/fidelity.py +230 -0
  22. synthyverse-0.1.0/synthyverse/evaluation/privacy.py +88 -0
  23. synthyverse-0.1.0/synthyverse/evaluation/utility.py +115 -0
  24. synthyverse-0.1.0/synthyverse/generators/__init__.py +23 -0
  25. synthyverse-0.1.0/synthyverse/generators/arf_generator/__init__.py +1 -0
  26. synthyverse-0.1.0/synthyverse/generators/arf_generator/arf.py +21 -0
  27. synthyverse-0.1.0/synthyverse/generators/base.py +43 -0
  28. synthyverse-0.1.0/synthyverse/generators/bn_generator/__init__.py +1 -0
  29. synthyverse-0.1.0/synthyverse/generators/bn_generator/bn.py +45 -0
  30. synthyverse-0.1.0/synthyverse/generators/ctgan_generator/__init__.py +1 -0
  31. synthyverse-0.1.0/synthyverse/generators/ctgan_generator/ct_gan.py +49 -0
  32. synthyverse-0.1.0/synthyverse/generators/tvae_generator/__init__.py +1 -0
  33. synthyverse-0.1.0/synthyverse/generators/tvae_generator/tvae.py +38 -0
  34. synthyverse-0.1.0/synthyverse/utils/__init__.py +3 -0
  35. synthyverse-0.1.0/synthyverse/utils/oneclass.py +316 -0
  36. synthyverse-0.1.0/synthyverse/utils/preprocessing.py +73 -0
  37. synthyverse-0.1.0/synthyverse/utils/reproducibility.py +18 -0
  38. synthyverse-0.1.0/synthyverse/utils/utils.py +122 -0
  39. synthyverse-0.1.0/synthyverse/utils/xgb_utils.py +20 -0
  40. synthyverse-0.1.0/synthyverse.egg-info/PKG-INFO +195 -0
  41. synthyverse-0.1.0/synthyverse.egg-info/SOURCES.txt +43 -0
  42. synthyverse-0.1.0/synthyverse.egg-info/dependency_links.txt +1 -0
  43. synthyverse-0.1.0/synthyverse.egg-info/requires.txt +37 -0
  44. synthyverse-0.1.0/synthyverse.egg-info/top_level.txt +1 -0
  45. synthyverse-0.1.0/tutorial.ipynb +185 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Jim Achterberg
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,5 @@
1
+ include requirements/**/*
2
+ include README.md
3
+ include LICENSE
4
+ include logo/*
5
+ include tutorial.ipynb
@@ -0,0 +1,195 @@
1
+ Metadata-Version: 2.1
2
+ Name: synthyverse
3
+ Version: 0.1.0
4
+ Summary: Synthetic data generation and evaluation library
5
+ Home-page: https://github.com/synthyverse/synthyverse
6
+ Author: Jim Achterberg, Saif Ul Islam, Zia Ur Rehman
7
+ Author-email:
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.8
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Python: >=3.8
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Provides-Extra: eval
24
+ Requires-Dist: torch; extra == "eval"
25
+ Requires-Dist: pandas; extra == "eval"
26
+ Requires-Dist: numpy; extra == "eval"
27
+ Requires-Dist: scikit-learn; extra == "eval"
28
+ Requires-Dist: xgboost; extra == "eval"
29
+ Requires-Dist: sdmetrics; extra == "eval"
30
+ Provides-Extra: ctgan
31
+ Requires-Dist: pandas; extra == "ctgan"
32
+ Requires-Dist: ctgan==0.10.0; extra == "ctgan"
33
+ Requires-Dist: pandas; extra == "ctgan"
34
+ Provides-Extra: arf
35
+ Requires-Dist: pandas; extra == "arf"
36
+ Requires-Dist: arfpy; extra == "arf"
37
+ Requires-Dist: pandas; extra == "arf"
38
+ Provides-Extra: bn
39
+ Requires-Dist: pandas; extra == "bn"
40
+ Requires-Dist: synthcity; extra == "bn"
41
+ Requires-Dist: opacus==1.5.3; extra == "bn"
42
+ Requires-Dist: pandas; extra == "bn"
43
+ Provides-Extra: tvae
44
+ Requires-Dist: pandas; extra == "tvae"
45
+ Requires-Dist: ctgan==0.10.0; extra == "tvae"
46
+ Requires-Dist: pandas; extra == "tvae"
47
+ Provides-Extra: full
48
+ Requires-Dist: sdmetrics; extra == "full"
49
+ Requires-Dist: arfpy; extra == "full"
50
+ Requires-Dist: opacus==1.5.3; extra == "full"
51
+ Requires-Dist: torch; extra == "full"
52
+ Requires-Dist: xgboost; extra == "full"
53
+ Requires-Dist: pandas; extra == "full"
54
+ Requires-Dist: scikit-learn; extra == "full"
55
+ Requires-Dist: numpy; extra == "full"
56
+ Requires-Dist: synthcity; extra == "full"
57
+ Requires-Dist: ctgan==0.10.0; extra == "full"
58
+
59
+ <table align="center" border="0">
60
+ <tr>
61
+ <td align="center">
62
+
63
+ <img src="logo/logo.png" alt="Synthyverse logo" width="250" height="auto">
64
+
65
+ <br/>
66
+ <br/>
67
+
68
+ Welcome to the synthyverse!
69
+
70
+ The most extensive ecosystem for synthetic data generation and evaluation in Python.
71
+
72
+ _The synthyverse is a work in progress. Please provide any suggestions through a GitHub Issue._
73
+
74
+ </td>
75
+ </tr>
76
+ </table>
77
+
78
+ <div style="clear: both;"></div>
79
+
80
+ # Features
81
+ - 🔧 **Highly modular installation.** Install only those modules which you require to keep your installation lightweight.
82
+ - 📚 **Most extensive library for synthetic data.** Any generator or metric can be quickly added without dependency conflicts due to synthyverse's modular installation. This allows the synthyverse to host the most generators and evaluation metrics out of any synthetic data library.
83
+ - ⚙️ **Benchmarking module for simplified synthetic data pipelines.** The benchmarking module executes a modular pipeline of synthetic data generation and evaluation. Choose a generator, set of evaluation metrics, and pipeline parameters, and obtain results on synthetic data quality.
84
+ - 👷 **Minimal preprocessing required.** All preprocessing is handled under the hood in the synthyverse, so no need for scaling, one-hot encoding, or handling missing values.
85
+
86
+ # Installation
87
+ The synthyverse is unique in its modular installation set-up. To avoid conflicting dependencies, we provide various installation templates. Each template installs only those dependencies which are required to access certain modules.
88
+
89
+ Templates provide installation for specific generators, the evaluation module, and more. Install multiple templates to get access to multiple modules of the synthyverse, e.g., multiple generators and evaluation.
90
+
91
+ _We strongly advise installing only the templates you require for a specific run. Installing multiple templates may give rise to dependency conflicts. Use separate virtual environments for different installations. Note that the core installation without any template doesn't install any modules._
92
+
93
+ See the [overview of templates](synthyverse/TEMPLATES.md).
94
+
95
+ ### General Installation Template
96
+
97
+ ```bash
98
+ pip install synthyverse[template]
99
+ ```
100
+
101
+ ### Installation Examples
102
+ ```bash
103
+ pip install synthyverse[ctgan]
104
+ ```
105
+
106
+ ```bash
107
+ pip install synthyverse[arf,bn,ctgan,tvae]
108
+ ```
109
+
110
+ ```bash
111
+ pip install synthyverse[ctgan,eval]
112
+ ```
113
+
114
+
115
+ # Usage
116
+
117
+ ### Synthetic Data Generation
118
+ Import the desired generator. Note that you can only import generators that are covered by your installed synthyverse template.
119
+
120
+ See [all available generators](synthyverse/generators/GENERATORS.md).
121
+ ```python
122
+ from synthyverse.generators import ARFGenerator
123
+ generator = ARFGenerator(num_trees=20, random_state=0)
124
+ ```
125
+
126
+ Fit the generator.
127
+ ```python
128
+ from sklearn.datasets import load_breast_cancer
129
+ X = load_breast_cancer(as_frame=True).frame
130
+ generator.fit(X, discrete_features=["target"])
131
+ ```
132
+
133
+ Sample a synthetic dataset.
134
+ ```python
135
+ syn = generator.generate(len(X))
136
+ ```
137
+
138
+ ### Synthetic Data Evaluation
139
+ Choose a set of metrics. Either pass the metric names as a list to use their default settings, or provide a dictionary mapping each metric name to its hyperparameters. To compute several configurations of the same metric, append a dash and a suffix to the metric name (e.g., `mle-trts` and `mle-tstr`).
140
+
141
+ See [all available metrics](synthyverse/evaluation/METRICS.md).
142
+ ```python
143
+ metrics = ["mle", "dcr", "similarity"]
144
+ metrics={
145
+ "mle-trts": {"train_set": "real"},
146
+ "mle-tstr": {"train_set": "synthetic"},
147
+ "dcr": {"estimates": ["mean", 0.01, 0.05]},
148
+ "similarity":{}
149
+ }
150
+ ```
151
+
152
+ Set up a MetricEvaluator object.
153
+
154
+ ```python
155
+ from synthyverse.evaluation import MetricEvaluator
156
+
157
+ evaluator = MetricEvaluator(
158
+ metrics=metrics,
159
+ discrete_features=["target"],
160
+ target_column="target",
161
+ random_state=0
162
+ )
163
+ ```
164
+
165
+ Evaluate the metrics with respect to the synthetic data, the training data used to fit the generator, and an independent holdout/test set of real data.
166
+
167
+ ```python
168
+ results = evaluator.evaluate(X_train, X_test, syn)
169
+ ```
170
+
171
+ ### Benchmarking
172
+
173
+ Set up a benchmarking object. Supply the [generator name and its parameters](synthyverse/generators/GENERATORS.md), the [evaluation metrics](synthyverse/evaluation/METRICS.md), the number of random train-test splits, the number of random initializations of the generator per split, the number of synthetic sets to sample from each fitted generator, and the size of the test set.
174
+
175
+ ```python
176
+ from synthyverse.benchmark import TabularBenchmark
177
+
178
+ benchmark = TabularBenchmark(
179
+ generator_name="arf",
180
+ generator_params={"num_trees": 20},
181
+ n_random_splits=3,
182
+ n_inits=3,
183
+ n_generated_datasets=20,
184
+ metrics=["classifier_test", "mle", "dcr"],
185
+ test_size=0.3,
186
+ )
187
+ ```
188
+
189
+ Run the benchmarking pipeline on a dataset.
190
+ ```python
191
+ results = benchmark.run(X, target_column="target", discrete_columns=["target"])
192
+ ```
193
+
194
+ # Tutorials
195
+ - [Tabular Synthetic Data with the synthyverse: Introduction](tutorial.ipynb)
@@ -0,0 +1,137 @@
1
+ <table align="center" border="0">
2
+ <tr>
3
+ <td align="center">
4
+
5
+ <img src="logo/logo.png" alt="Synthyverse logo" width="250" height="auto">
6
+
7
+ <br/>
8
+ <br/>
9
+
10
+ Welcome to the synthyverse!
11
+
12
+ The most extensive ecosystem for synthetic data generation and evaluation in Python.
13
+
14
+ _The synthyverse is a work in progress. Please provide any suggestions through a GitHub Issue._
15
+
16
+ </td>
17
+ </tr>
18
+ </table>
19
+
20
+ <div style="clear: both;"></div>
21
+
22
+ # Features
23
+ - 🔧 **Highly modular installation.** Install only those modules which you require to keep your installation lightweight.
24
+ - 📚 **Most extensive library for synthetic data.** Any generator or metric can be quickly added without dependency conflicts due to synthyverse's modular installation. This allows the synthyverse to host the most generators and evaluation metrics out of any synthetic data library.
25
+ - ⚙️ **Benchmarking module for simplified synthetic data pipelines.** The benchmarking module executes a modular pipeline of synthetic data generation and evaluation. Choose a generator, set of evaluation metrics, and pipeline parameters, and obtain results on synthetic data quality.
26
+ - 👷 **Minimal preprocessing required.** All preprocessing is handled under the hood in the synthyverse, so no need for scaling, one-hot encoding, or handling missing values.
27
+
28
+ # Installation
29
+ The synthyverse is unique in its modular installation set-up. To avoid conflicting dependencies, we provide various installation templates. Each template installs only those dependencies which are required to access certain modules.
30
+
31
+ Templates provide installation for specific generators, the evaluation module, and more. Install multiple templates to get access to multiple modules of the synthyverse, e.g., multiple generators and evaluation.
32
+
33
+ _We strongly advise installing only the templates you require for a specific run. Installing multiple templates may give rise to dependency conflicts. Use separate virtual environments for different installations. Note that the core installation without any template doesn't install any modules._
34
+
35
+ See the [overview of templates](synthyverse/TEMPLATES.md).
36
+
37
+ ### General Installation Template
38
+
39
+ ```bash
40
+ pip install synthyverse[template]
41
+ ```
42
+
43
+ ### Installation Examples
44
+ ```bash
45
+ pip install synthyverse[ctgan]
46
+ ```
47
+
48
+ ```bash
49
+ pip install synthyverse[arf,bn,ctgan,tvae]
50
+ ```
51
+
52
+ ```bash
53
+ pip install synthyverse[ctgan,eval]
54
+ ```
55
+
56
+
57
+ # Usage
58
+
59
+ ### Synthetic Data Generation
60
+ Import the desired generator. Note that you can only import generators that are covered by your installed synthyverse template.
61
+
62
+ See [all available generators](synthyverse/generators/GENERATORS.md).
63
+ ```python
64
+ from synthyverse.generators import ARFGenerator
65
+ generator = ARFGenerator(num_trees=20, random_state=0)
66
+ ```
67
+
68
+ Fit the generator.
69
+ ```python
70
+ from sklearn.datasets import load_breast_cancer
71
+ X = load_breast_cancer(as_frame=True).frame
72
+ generator.fit(X, discrete_features=["target"])
73
+ ```
74
+
75
+ Sample a synthetic dataset.
76
+ ```python
77
+ syn = generator.generate(len(X))
78
+ ```
79
+
80
+ ### Synthetic Data Evaluation
81
+ Choose a set of metrics. Either pass the metric names as a list to use their default settings, or provide a dictionary mapping each metric name to its hyperparameters. To compute several configurations of the same metric, append a dash and a suffix to the metric name (e.g., `mle-trts` and `mle-tstr`).
82
+
83
+ See [all available metrics](synthyverse/evaluation/METRICS.md).
84
+ ```python
85
+ metrics = ["mle", "dcr", "similarity"]
86
+ metrics={
87
+ "mle-trts": {"train_set": "real"},
88
+ "mle-tstr": {"train_set": "synthetic"},
89
+ "dcr": {"estimates": ["mean", 0.01, 0.05]},
90
+ "similarity":{}
91
+ }
92
+ ```
93
+
94
+ Set up a MetricEvaluator object.
95
+
96
+ ```python
97
+ from synthyverse.evaluation import MetricEvaluator
98
+
99
+ evaluator = MetricEvaluator(
100
+ metrics=metrics,
101
+ discrete_features=["target"],
102
+ target_column="target",
103
+ random_state=0
104
+ )
105
+ ```
106
+
107
+ Evaluate the metrics with respect to the synthetic data, the training data used to fit the generator, and an independent holdout/test set of real data.
108
+
109
+ ```python
110
+ results = evaluator.evaluate(X_train, X_test, syn)
111
+ ```
112
+
113
+ ### Benchmarking
114
+
115
+ Set up a benchmarking object. Supply the [generator name and its parameters](synthyverse/generators/GENERATORS.md), the [evaluation metrics](synthyverse/evaluation/METRICS.md), the number of random train-test splits, the number of random initializations of the generator per split, the number of synthetic sets to sample from each fitted generator, and the size of the test set.
116
+
117
+ ```python
118
+ from synthyverse.benchmark import TabularBenchmark
119
+
120
+ benchmark = TabularBenchmark(
121
+ generator_name="arf",
122
+ generator_params={"num_trees": 20},
123
+ n_random_splits=3,
124
+ n_inits=3,
125
+ n_generated_datasets=20,
126
+ metrics=["classifier_test", "mle", "dcr"],
127
+ test_size=0.3,
128
+ )
129
+ ```
130
+
131
+ Run the benchmarking pipeline on a dataset.
132
+ ```python
133
+ results = benchmark.run(X, target_column="target", discrete_columns=["target"])
134
+ ```
135
+
136
+ # Tutorials
137
+ - [Tabular Synthetic Data with the synthyverse: Introduction](tutorial.ipynb)
Binary file
@@ -0,0 +1,3 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
@@ -0,0 +1,6 @@
1
+ torch
2
+ pandas
3
+ numpy
4
+ scikit-learn
5
+ xgboost
6
+ sdmetrics
@@ -0,0 +1,2 @@
1
+ arfpy
2
+ pandas
@@ -0,0 +1 @@
1
+ pandas
@@ -0,0 +1,3 @@
1
+ synthcity
2
+ opacus==1.5.3
3
+ pandas
@@ -0,0 +1,2 @@
1
+ ctgan==0.10.0
2
+ pandas
@@ -0,0 +1,2 @@
1
+ ctgan==0.10.0
2
+ pandas
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,60 @@
1
+ from setuptools import setup, find_packages
2
+ import os
3
+
4
+ version = "0.1.0"
5
+
6
+
7
def read_requirements(filename):
    """Read a requirements file from the ``requirements`` folder next to setup.py.

    Args:
        filename: Path of the requirements file relative to the
            ``requirements`` directory (e.g. ``"generators/arf.txt"``).

    Returns:
        A list with one stripped requirement string per non-empty,
        non-comment line, in file order.
    """
    # Resolve against this file's directory so the result does not depend on
    # the current working directory.
    setup_dir = os.path.dirname(os.path.abspath(__file__))
    filepath = os.path.join(setup_dir, "requirements", filename)
    with open(filepath, "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        # Test the *stripped* line so that indented comments are skipped too.
        return [
            line for line in stripped_lines if line and not line.startswith("#")
        ]
13
+
14
+
15
# Define extras dynamically from requirements folder
def _dedupe(requirements):
    """Return *requirements* without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(requirements))


base_requirements = read_requirements("generators/base.txt")
# base.txt and the per-generator files both list pandas; de-duplicate so each
# requirement is emitted only once per extra in the package metadata.
extras = {
    "eval": _dedupe(read_requirements("evaluation/eval.txt")),
    "ctgan": _dedupe(base_requirements + read_requirements("generators/ctgan.txt")),
    "arf": _dedupe(base_requirements + read_requirements("generators/arf.txt")),
    "bn": _dedupe(base_requirements + read_requirements("generators/bn.txt")),
    "tvae": _dedupe(base_requirements + read_requirements("generators/tvae.txt")),
}

# Create a "full" extra that includes all extras.
# The union is computed before the "full" key is inserted (so the list never
# extends itself) and sorted so the generated metadata is reproducible.
extras["full"] = sorted({req for reqs in extras.values() for req in reqs})

core_dependencies = []  # Always-installed dependencies

# Read the README relative to this file and close the handle deterministically.
readme_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "README.md")
with open(readme_path, encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="synthyverse",
    version=version,
    description="Synthetic data generation and evaluation library",
    author="Jim Achterberg, Saif Ul Islam, Zia Ur Rehman",
    author_email=" ",
    packages=find_packages(),
    install_requires=core_dependencies,
    extras_require=extras,
    python_requires=">=3.8",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/synthyverse/synthyverse",
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: Software Development :: Libraries :: Python Modules",
    ],
)
File without changes
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1 @@
1
+ from .benchmark import TabularBenchmark
@@ -0,0 +1,88 @@
1
+ from sklearn.model_selection import train_test_split
2
+ from ..evaluation.eval import MetricEvaluator
3
+ from ..utils.utils import get_generator, free_up_memory
4
+ from ..utils.reproducibility import set_seed
5
+ import pandas as pd
6
+ from time import time
7
+
8
+
9
class TabularBenchmark:
    """Run a generate-then-evaluate pipeline over repeated splits and seeds.

    For every random train/test split, and for every generator
    initialization on that split, a generator is fitted once and then
    sampled ``n_generated_datasets`` times; each sampled dataset is scored
    with ``MetricEvaluator``.
    """

    def __init__(
        self,
        generator_name: str = "arf",
        generator_params: dict = None,
        n_random_splits: int = 1,
        n_inits: int = 1,
        n_generated_datasets: int = 1,
        metrics: list = None,
        test_size: float = 0.3,
    ):
        """Store the benchmark configuration.

        Args:
            generator_name: Name passed to ``get_generator`` to look up the
                generator class.
            generator_params: Keyword arguments forwarded to the generator
                constructor. Defaults to an empty dict.
            n_random_splits: Number of train/test splits; the split index is
                used as the ``random_state`` of ``train_test_split``.
            n_inits: Number of generator initializations per split; the
                init index is used as the seed.
            n_generated_datasets: Number of synthetic datasets sampled from
                each fitted generator.
            metrics: Metrics forwarded to ``MetricEvaluator``. Defaults to
                ``["classifier_test", "mle", "dcr"]``.
            test_size: ``test_size`` forwarded to ``train_test_split``.
        """
        self.generator_name = generator_name
        # Fresh containers per instance: mutable default arguments would be
        # shared (and silently mutated) across all instances.
        self.generator_params = {} if generator_params is None else generator_params
        self.n_random_splits = n_random_splits
        self.n_inits = n_inits
        self.n_generated_datasets = n_generated_datasets
        self.metrics = (
            ["classifier_test", "mle", "dcr"] if metrics is None else metrics
        )
        self.test_size = test_size

    def run(self, X: pd.DataFrame, target_column: str, discrete_columns: list):
        """Execute the benchmark on ``X`` and return nested results.

        Args:
            X: Full dataset; it is split into train and test parts here.
            target_column: Column forwarded to ``MetricEvaluator`` as the
                target. The split is stratified on it when it is listed in
                ``discrete_columns``.
            discrete_columns: Discrete column names, forwarded to
                ``generator.fit`` and to ``MetricEvaluator``.

        Returns:
            A dict keyed ``"split_{i}"`` -> ``"init_{j}"``. Each init entry
            holds ``"training_time"`` plus one ``"generated_dataset_{k}"``
            dict containing ``"inference_time"``, ``"evaluation_time"`` and
            whatever ``MetricEvaluator.evaluate`` returned (merged in with
            ``dict.update``).
        """
        results = {}
        generator_ = get_generator(self.generator_name)
        for split_i in range(self.n_random_splits):
            split_results = results[f"split_{split_i}"] = {}

            # split data according to current seed
            stratify = X[target_column] if target_column in discrete_columns else None
            X_train, X_test = train_test_split(
                X, stratify=stratify, test_size=self.test_size, random_state=split_i
            )
            X_train = X_train.reset_index(drop=True)
            X_test = X_test.reset_index(drop=True)

            for init_i in range(self.n_inits):
                init_results = split_results[f"init_{init_i}"] = {}
                set_seed(init_i)
                generator = generator_(random_state=init_i, **self.generator_params)
                start_time = time()
                generator.fit(X_train, discrete_columns)
                init_results["training_time"] = time() - start_time

                # potentially generate multiple datasets
                for generated_dataset_i in range(self.n_generated_datasets):
                    dataset_results = {}
                    init_results[f"generated_dataset_{generated_dataset_i}"] = (
                        dataset_results
                    )

                    start_time = time()
                    # NOTE(review): samples len(X) rows (the full dataset),
                    # not len(X_train) -- confirm this is intended.
                    X_syn = generator.generate(len(X))
                    dataset_results["inference_time"] = time() - start_time

                    # Evaluation time includes constructing the evaluator.
                    start_time = time()
                    evaluator = MetricEvaluator(
                        metrics=self.metrics,
                        discrete_features=discrete_columns,
                        target_column=target_column,
                        random_state=init_i,
                    )
                    metric_results = evaluator.evaluate(X_train, X_test, X_syn)
                    dataset_results["evaluation_time"] = time() - start_time
                    dataset_results.update(metric_results)

                    # free up memory for next iteration
                    # NOTE(review): the nesting level of this call could not
                    # be recovered from the flattened source; it is placed in
                    # the innermost loop -- confirm against the original.
                    free_up_memory()

        return results
@@ -0,0 +1 @@
1
+ from .eval import MetricEvaluator