synthyverse 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- synthyverse-0.1.0/LICENSE +21 -0
- synthyverse-0.1.0/MANIFEST.in +5 -0
- synthyverse-0.1.0/PKG-INFO +195 -0
- synthyverse-0.1.0/README.md +137 -0
- synthyverse-0.1.0/logo/logo.png +0 -0
- synthyverse-0.1.0/pyproject.toml +3 -0
- synthyverse-0.1.0/requirements/evaluation/eval.txt +6 -0
- synthyverse-0.1.0/requirements/generators/arf.txt +2 -0
- synthyverse-0.1.0/requirements/generators/base.txt +1 -0
- synthyverse-0.1.0/requirements/generators/bn.txt +3 -0
- synthyverse-0.1.0/requirements/generators/ctgan.txt +2 -0
- synthyverse-0.1.0/requirements/generators/tvae.txt +2 -0
- synthyverse-0.1.0/setup.cfg +4 -0
- synthyverse-0.1.0/setup.py +60 -0
- synthyverse-0.1.0/synthyverse/__init__.py +0 -0
- synthyverse-0.1.0/synthyverse/__version__.py +1 -0
- synthyverse-0.1.0/synthyverse/benchmark/__init__.py +1 -0
- synthyverse-0.1.0/synthyverse/benchmark/benchmark.py +88 -0
- synthyverse-0.1.0/synthyverse/evaluation/__init__.py +1 -0
- synthyverse-0.1.0/synthyverse/evaluation/eval.py +120 -0
- synthyverse-0.1.0/synthyverse/evaluation/fidelity.py +230 -0
- synthyverse-0.1.0/synthyverse/evaluation/privacy.py +88 -0
- synthyverse-0.1.0/synthyverse/evaluation/utility.py +115 -0
- synthyverse-0.1.0/synthyverse/generators/__init__.py +23 -0
- synthyverse-0.1.0/synthyverse/generators/arf_generator/__init__.py +1 -0
- synthyverse-0.1.0/synthyverse/generators/arf_generator/arf.py +21 -0
- synthyverse-0.1.0/synthyverse/generators/base.py +43 -0
- synthyverse-0.1.0/synthyverse/generators/bn_generator/__init__.py +1 -0
- synthyverse-0.1.0/synthyverse/generators/bn_generator/bn.py +45 -0
- synthyverse-0.1.0/synthyverse/generators/ctgan_generator/__init__.py +1 -0
- synthyverse-0.1.0/synthyverse/generators/ctgan_generator/ct_gan.py +49 -0
- synthyverse-0.1.0/synthyverse/generators/tvae_generator/__init__.py +1 -0
- synthyverse-0.1.0/synthyverse/generators/tvae_generator/tvae.py +38 -0
- synthyverse-0.1.0/synthyverse/utils/__init__.py +3 -0
- synthyverse-0.1.0/synthyverse/utils/oneclass.py +316 -0
- synthyverse-0.1.0/synthyverse/utils/preprocessing.py +73 -0
- synthyverse-0.1.0/synthyverse/utils/reproducibility.py +18 -0
- synthyverse-0.1.0/synthyverse/utils/utils.py +122 -0
- synthyverse-0.1.0/synthyverse/utils/xgb_utils.py +20 -0
- synthyverse-0.1.0/synthyverse.egg-info/PKG-INFO +195 -0
- synthyverse-0.1.0/synthyverse.egg-info/SOURCES.txt +43 -0
- synthyverse-0.1.0/synthyverse.egg-info/dependency_links.txt +1 -0
- synthyverse-0.1.0/synthyverse.egg-info/requires.txt +37 -0
- synthyverse-0.1.0/synthyverse.egg-info/top_level.txt +1 -0
- synthyverse-0.1.0/tutorial.ipynb +185 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Jim Achterberg
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: synthyverse
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Synthetic data generation and evaluation library
|
|
5
|
+
Home-page: https://github.com/synthyverse/synthyverse
|
|
6
|
+
Author: Jim Achterberg, Saif Ul Islam, Zia Ur Rehman
|
|
7
|
+
Author-email:
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Requires-Python: >=3.8
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Provides-Extra: eval
|
|
24
|
+
Requires-Dist: torch; extra == "eval"
|
|
25
|
+
Requires-Dist: pandas; extra == "eval"
|
|
26
|
+
Requires-Dist: numpy; extra == "eval"
|
|
27
|
+
Requires-Dist: scikit-learn; extra == "eval"
|
|
28
|
+
Requires-Dist: xgboost; extra == "eval"
|
|
29
|
+
Requires-Dist: sdmetrics; extra == "eval"
|
|
30
|
+
Provides-Extra: ctgan
|
|
31
|
+
Requires-Dist: pandas; extra == "ctgan"
|
|
32
|
+
Requires-Dist: ctgan==0.10.0; extra == "ctgan"
|
|
33
|
+
Requires-Dist: pandas; extra == "ctgan"
|
|
34
|
+
Provides-Extra: arf
|
|
35
|
+
Requires-Dist: pandas; extra == "arf"
|
|
36
|
+
Requires-Dist: arfpy; extra == "arf"
|
|
37
|
+
Requires-Dist: pandas; extra == "arf"
|
|
38
|
+
Provides-Extra: bn
|
|
39
|
+
Requires-Dist: pandas; extra == "bn"
|
|
40
|
+
Requires-Dist: synthcity; extra == "bn"
|
|
41
|
+
Requires-Dist: opacus==1.5.3; extra == "bn"
|
|
42
|
+
Requires-Dist: pandas; extra == "bn"
|
|
43
|
+
Provides-Extra: tvae
|
|
44
|
+
Requires-Dist: pandas; extra == "tvae"
|
|
45
|
+
Requires-Dist: ctgan==0.10.0; extra == "tvae"
|
|
46
|
+
Requires-Dist: pandas; extra == "tvae"
|
|
47
|
+
Provides-Extra: full
|
|
48
|
+
Requires-Dist: sdmetrics; extra == "full"
|
|
49
|
+
Requires-Dist: arfpy; extra == "full"
|
|
50
|
+
Requires-Dist: opacus==1.5.3; extra == "full"
|
|
51
|
+
Requires-Dist: torch; extra == "full"
|
|
52
|
+
Requires-Dist: xgboost; extra == "full"
|
|
53
|
+
Requires-Dist: pandas; extra == "full"
|
|
54
|
+
Requires-Dist: scikit-learn; extra == "full"
|
|
55
|
+
Requires-Dist: numpy; extra == "full"
|
|
56
|
+
Requires-Dist: synthcity; extra == "full"
|
|
57
|
+
Requires-Dist: ctgan==0.10.0; extra == "full"
|
|
58
|
+
|
|
59
|
+
<table align="center" border="0">
|
|
60
|
+
<tr>
|
|
61
|
+
<td align="center">
|
|
62
|
+
|
|
63
|
+
<img src="logo/logo.png" alt="Synthyverse logo" width="250" height="auto">
|
|
64
|
+
|
|
65
|
+
<br/>
|
|
66
|
+
<br/>
|
|
67
|
+
|
|
68
|
+
Welcome to the synthyverse!
|
|
69
|
+
|
|
70
|
+
The most extensive ecosystem for synthetic data generation and evaluation in Python.
|
|
71
|
+
|
|
72
|
+
_The synthyverse is a work in progress. Please provide any suggestions through a GitHub Issue._
|
|
73
|
+
|
|
74
|
+
</td>
|
|
75
|
+
</tr>
|
|
76
|
+
</table>
|
|
77
|
+
|
|
78
|
+
<div style="clear: both;"></div>
|
|
79
|
+
|
|
80
|
+
# Features
|
|
81
|
+
- 🔧 **Highly modular installation.** Install only those modules which you require to keep your installation lightweight.
|
|
82
|
+
- 📚 **Most extensive library for synthetic data.** Any generator or metric can be quickly added without dependency conflicts due to synthyverse's modular installation. This allows the synthyverse to host the most generators and evaluation metrics out of any synthetic data library.
|
|
83
|
+
- ⚙️ **Benchmarking module for simplified synthetic data pipelines.** The benchmarking module executes a modular pipeline of synthetic data generation and evaluation. Choose a generator, set of evaluation metrics, and pipeline parameters, and obtain results on synthetic data quality.
|
|
84
|
+
- 👷 **Minimal preprocessing required.** All preprocessing is handled under the hood in the synthyverse, so no need for scaling, one-hot encoding, or handling missing values.
|
|
85
|
+
|
|
86
|
+
# Installation
|
|
87
|
+
The synthyverse is unique in its modular installation set-up. To avoid conflicting dependencies, we provide various installation templates. Each template installs only those dependencies which are required to access certain modules.
|
|
88
|
+
|
|
89
|
+
Templates provide installation for specific generators, the evaluation module, and more. Install multiple templates to get access to multiple modules of the synthyverse, e.g., multiple generators and evaluation.
|
|
90
|
+
|
|
91
|
+
_We strongly advise installing only the templates you require for a specific run. Installing multiple templates can give rise to dependency conflicts, so use separate virtual environments for separate installations. Note that the core installation without any template doesn't install any modules._
|
|
92
|
+
|
|
93
|
+
See the [overview of templates](synthyverse/TEMPLATES.md).
|
|
94
|
+
|
|
95
|
+
### General Installation Template
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
pip install synthyverse[template]
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Installation Examples
|
|
102
|
+
```bash
|
|
103
|
+
pip install synthyverse[ctgan]
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
pip install synthyverse[arf,bn,ctgan,tvae]
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
pip install synthyverse[ctgan,eval]
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
# Usage
|
|
116
|
+
|
|
117
|
+
### Synthetic Data Generation
|
|
118
|
+
Import desired generator. Note that you can only import generators according to your installed synthyverse template.
|
|
119
|
+
|
|
120
|
+
See [all available generators](synthyverse/generators/GENERATORS.md).
|
|
121
|
+
```python
|
|
122
|
+
from synthyverse.generators import ARFGenerator
|
|
123
|
+
generator = ARFGenerator(num_trees=20, random_state=0)
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
Fit the generator.
|
|
127
|
+
```python
|
|
128
|
+
from sklearn.datasets import load_breast_cancer
|
|
129
|
+
X = load_breast_cancer(as_frame=True).frame
|
|
130
|
+
generator.fit(X, discrete_features=["target"])
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
Sample a synthetic dataset.
|
|
134
|
+
```python
|
|
135
|
+
syn = generator.generate(len(X))
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Synthetic Data Evaluation
|
|
139
|
+
Choose a set of metrics. Either pass the metric names as a list to use their default settings, or pass a dictionary that maps each metric name to its hyperparameters. To compute several configurations of the same metric, append a dash and a suffix to the metric name (e.g. `mle-trts` and `mle-tstr`).
|
|
140
|
+
|
|
141
|
+
See [all available metrics](synthyverse/evaluation/METRICS.md).
|
|
142
|
+
```python
|
|
143
|
+
metrics = ["mle", "dcr", "similarity"]
|
|
144
|
+
metrics={
|
|
145
|
+
"mle-trts": {"train_set": "real"},
|
|
146
|
+
"mle-tstr": {"train_set": "synthetic"},
|
|
147
|
+
"dcr": {"estimates": ["mean", 0.01, 0.05]},
|
|
148
|
+
"similarity":{}
|
|
149
|
+
}
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
Set up a `MetricEvaluator` object.
|
|
153
|
+
|
|
154
|
+
```python
|
|
155
|
+
from synthyverse.evaluation import MetricEvaluator
|
|
156
|
+
|
|
157
|
+
evaluator = MetricEvaluator(
|
|
158
|
+
metrics=metrics,
|
|
159
|
+
discrete_features=["target"],
|
|
160
|
+
target_column="target",
|
|
161
|
+
random_state=seed
|
|
162
|
+
)
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
Evaluate the metrics with respect to the synthetic data, the training data used to fit the generator, and an independent holdout/test set of real data.
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
results = evaluator.evaluate(X_train, X_test, syn)
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### Benchmarking
|
|
172
|
+
|
|
173
|
+
Set up a benchmarking object. Supply the [generator name and its parameters](synthyverse/generators/GENERATORS.md), the [evaluation metrics](synthyverse/evaluation/METRICS.md), the number of random train-test splits to fit the generator to, the number of random initializations per split, the number of synthetic sets to sample from each fitted generator, and the size of the test set.
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
from synthyverse.benchmark import TabularBenchmark
|
|
177
|
+
|
|
178
|
+
benchmark = TabularBenchmark(
|
|
179
|
+
generator_name="arf",
|
|
180
|
+
generator_params={"num_trees": 20},
|
|
181
|
+
n_random_splits=3,
|
|
182
|
+
n_inits=3,
|
|
183
|
+
n_generated_datasets=20,
|
|
184
|
+
metrics=["classifier_test", "mle", "dcr"],
|
|
185
|
+
test_size=0.3,
|
|
186
|
+
)
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
Run the benchmarking pipeline on a dataset.
|
|
190
|
+
```python
|
|
191
|
+
results = benchmark.run(X, target_column="target", discrete_columns=["target"])
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
# Tutorials
|
|
195
|
+
- [Tabular Synthetic Data with the synthyverse: Introduction](tutorial.ipynb)
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
<table align="center" border="0">
|
|
2
|
+
<tr>
|
|
3
|
+
<td align="center">
|
|
4
|
+
|
|
5
|
+
<img src="logo/logo.png" alt="Synthyverse logo" width="250" height="auto">
|
|
6
|
+
|
|
7
|
+
<br/>
|
|
8
|
+
<br/>
|
|
9
|
+
|
|
10
|
+
Welcome to the synthyverse!
|
|
11
|
+
|
|
12
|
+
The most extensive ecosystem for synthetic data generation and evaluation in Python.
|
|
13
|
+
|
|
14
|
+
_The synthyverse is a work in progress. Please provide any suggestions through a GitHub Issue._
|
|
15
|
+
|
|
16
|
+
</td>
|
|
17
|
+
</tr>
|
|
18
|
+
</table>
|
|
19
|
+
|
|
20
|
+
<div style="clear: both;"></div>
|
|
21
|
+
|
|
22
|
+
# Features
|
|
23
|
+
- 🔧 **Highly modular installation.** Install only those modules which you require to keep your installation lightweight.
|
|
24
|
+
- 📚 **Most extensive library for synthetic data.** Any generator or metric can be quickly added without dependency conflicts due to synthyverse's modular installation. This allows the synthyverse to host the most generators and evaluation metrics out of any synthetic data library.
|
|
25
|
+
- ⚙️ **Benchmarking module for simplified synthetic data pipelines.** The benchmarking module executes a modular pipeline of synthetic data generation and evaluation. Choose a generator, set of evaluation metrics, and pipeline parameters, and obtain results on synthetic data quality.
|
|
26
|
+
- 👷 **Minimal preprocessing required.** All preprocessing is handled under the hood in the synthyverse, so no need for scaling, one-hot encoding, or handling missing values.
|
|
27
|
+
|
|
28
|
+
# Installation
|
|
29
|
+
The synthyverse is unique in its modular installation set-up. To avoid conflicting dependencies, we provide various installation templates. Each template installs only those dependencies which are required to access certain modules.
|
|
30
|
+
|
|
31
|
+
Templates provide installation for specific generators, the evaluation module, and more. Install multiple templates to get access to multiple modules of the synthyverse, e.g., multiple generators and evaluation.
|
|
32
|
+
|
|
33
|
+
_We strongly advise installing only the templates you require for a specific run. Installing multiple templates can give rise to dependency conflicts, so use separate virtual environments for separate installations. Note that the core installation without any template doesn't install any modules._
|
|
34
|
+
|
|
35
|
+
See the [overview of templates](synthyverse/TEMPLATES.md).
|
|
36
|
+
|
|
37
|
+
### General Installation Template
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install synthyverse[template]
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### Installation Examples
|
|
44
|
+
```bash
|
|
45
|
+
pip install synthyverse[ctgan]
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install synthyverse[arf,bn,ctgan,tvae]
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install synthyverse[ctgan,eval]
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# Usage
|
|
58
|
+
|
|
59
|
+
### Synthetic Data Generation
|
|
60
|
+
Import desired generator. Note that you can only import generators according to your installed synthyverse template.
|
|
61
|
+
|
|
62
|
+
See [all available generators](synthyverse/generators/GENERATORS.md).
|
|
63
|
+
```python
|
|
64
|
+
from synthyverse.generators import ARFGenerator
|
|
65
|
+
generator = ARFGenerator(num_trees=20, random_state=0)
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Fit the generator.
|
|
69
|
+
```python
|
|
70
|
+
from sklearn.datasets import load_breast_cancer
|
|
71
|
+
X = load_breast_cancer(as_frame=True).frame
|
|
72
|
+
generator.fit(X, discrete_features=["target"])
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Sample a synthetic dataset.
|
|
76
|
+
```python
|
|
77
|
+
syn = generator.generate(len(X))
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Synthetic Data Evaluation
|
|
81
|
+
Choose a set of metrics. Either pass the metric names as a list to use their default settings, or pass a dictionary that maps each metric name to its hyperparameters. To compute several configurations of the same metric, append a dash and a suffix to the metric name (e.g. `mle-trts` and `mle-tstr`).
|
|
82
|
+
|
|
83
|
+
See [all available metrics](synthyverse/evaluation/METRICS.md).
|
|
84
|
+
```python
|
|
85
|
+
metrics = ["mle", "dcr", "similarity"]
|
|
86
|
+
metrics={
|
|
87
|
+
"mle-trts": {"train_set": "real"},
|
|
88
|
+
"mle-tstr": {"train_set": "synthetic"},
|
|
89
|
+
"dcr": {"estimates": ["mean", 0.01, 0.05]},
|
|
90
|
+
"similarity":{}
|
|
91
|
+
}
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Set up a `MetricEvaluator` object.
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
from synthyverse.evaluation import MetricEvaluator
|
|
98
|
+
|
|
99
|
+
evaluator = MetricEvaluator(
|
|
100
|
+
metrics=metrics,
|
|
101
|
+
discrete_features=["target"],
|
|
102
|
+
target_column="target",
|
|
103
|
+
random_state=seed
|
|
104
|
+
)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Evaluate the metrics with respect to the synthetic data, the training data used to fit the generator, and an independent holdout/test set of real data.
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
results = evaluator.evaluate(X_train, X_test, syn)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Benchmarking
|
|
114
|
+
|
|
115
|
+
Set up a benchmarking object. Supply the [generator name and its parameters](synthyverse/generators/GENERATORS.md), the [evaluation metrics](synthyverse/evaluation/METRICS.md), the number of random train-test splits to fit the generator to, the number of random initializations per split, the number of synthetic sets to sample from each fitted generator, and the size of the test set.
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from synthyverse.benchmark import TabularBenchmark
|
|
119
|
+
|
|
120
|
+
benchmark = TabularBenchmark(
|
|
121
|
+
generator_name="arf",
|
|
122
|
+
generator_params={"num_trees": 20},
|
|
123
|
+
n_random_splits=3,
|
|
124
|
+
n_inits=3,
|
|
125
|
+
n_generated_datasets=20,
|
|
126
|
+
metrics=["classifier_test", "mle", "dcr"],
|
|
127
|
+
test_size=0.3,
|
|
128
|
+
)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Run the benchmarking pipeline on a dataset.
|
|
132
|
+
```python
|
|
133
|
+
results = benchmark.run(X, target_column="target", discrete_columns=["target"])
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
# Tutorials
|
|
137
|
+
- [Tabular Synthetic Data with the synthyverse: Introduction](tutorial.ipynb)
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pandas
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from setuptools import setup, find_packages
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
# Release version passed to setup(); synthyverse/__version__.py holds the same
# string separately, so the two have to be updated together.
version = "0.1.0"
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def read_requirements(filename):
    """Return the requirement specifiers listed in ``requirements/<filename>``.

    The path is resolved relative to the directory that contains this
    ``setup.py``, so the result does not depend on the current working
    directory.

    Args:
        filename: Path of the requirements file relative to the
            ``requirements`` folder, e.g. ``"generators/base.txt"``.

    Returns:
        A list with one entry per non-empty line, stripped of surrounding
        whitespace. Comment lines (first non-blank character ``#``) are
        left out.
    """
    # Get the directory where setup.py is located
    setup_dir = os.path.dirname(os.path.abspath(__file__))
    filepath = os.path.join(setup_dir, "requirements", filename)
    with open(filepath, "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        # Test the stripped text so that indented comments are skipped too.
        return [
            line for line in stripped_lines if line and not line.startswith("#")
        ]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Define extras dynamically from requirements folder
base_requirements = read_requirements("generators/base.txt")


def _generator_requirements(filename):
    """Return the base requirements followed by those in ``generators/<filename>``.

    A requirement string that appears in both files is listed only once.
    """
    combined = base_requirements + read_requirements(f"generators/{filename}")
    # dict.fromkeys drops repeated entries while keeping first-seen order.
    return list(dict.fromkeys(combined))


extras = {
    "eval": read_requirements("evaluation/eval.txt"),
    "ctgan": _generator_requirements("ctgan.txt"),
    "arf": _generator_requirements("arf.txt"),
    "bn": _generator_requirements("bn.txt"),
    "tvae": _generator_requirements("tvae.txt"),
}

# Create a "full" extra that includes all extras.
# The union is computed before the "full" key exists, and it is sorted so the
# generated metadata does not depend on set iteration order.
extras["full"] = sorted({req for reqs in extras.values() for req in reqs})

core_dependencies = []  # Always-installed dependencies
|
|
32
|
+
|
|
33
|
+
# Read the long description from the README that sits next to this file.
# The path is anchored on __file__ (as read_requirements does) rather than on
# the working directory, and the handle is closed by the with-block.
readme_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "README.md")
with open(readme_path, encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="synthyverse",
    version=version,
    description="Synthetic data generation and evaluation library",
    author="Jim Achterberg, Saif Ul Islam, Zia Ur Rehman",
    author_email=" ",
    packages=find_packages(),
    # No unconditional dependencies: everything is pulled in through extras.
    install_requires=core_dependencies,
    extras_require=extras,
    python_requires=">=3.8",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/synthyverse/synthyverse",
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: Software Development :: Libraries :: Python Modules",
    ],
)
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Package version string; setup.py declares the same value separately.
__version__ = "0.1.0"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .benchmark import TabularBenchmark
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
from sklearn.model_selection import train_test_split
|
|
2
|
+
from ..evaluation.eval import MetricEvaluator
|
|
3
|
+
from ..utils.utils import get_generator, free_up_memory
|
|
4
|
+
from ..utils.reproducibility import set_seed
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from time import time
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TabularBenchmark:
    """Fit a generator repeatedly and evaluate the synthetic data it produces.

    For every random train/test split, the generator is fitted once per
    initialization seed, and each fitted generator is sampled and evaluated
    one or more times. Timings and metric results are collected in a nested
    dictionary (see :meth:`run`).
    """

    def __init__(
        self,
        generator_name: str = "arf",
        generator_params: "dict | None" = None,
        n_random_splits: int = 1,
        n_inits: int = 1,
        n_generated_datasets: int = 1,
        metrics: "list | dict | None" = None,
        test_size: float = 0.3,
    ):
        """Store the benchmark configuration.

        Args:
            generator_name: Name handed to ``get_generator`` to look up the
                generator to benchmark.
            generator_params: Keyword arguments forwarded to the generator
                constructor. ``None`` (the default) means no extra arguments.
            n_random_splits: Number of train/test splits; split ``i`` uses
                ``random_state=i``.
            n_inits: Number of generator fits per split; fit ``j`` uses seed
                ``j``.
            n_generated_datasets: Number of synthetic datasets sampled from
                each fitted generator.
            metrics: Metrics forwarded to ``MetricEvaluator``. ``None`` (the
                default) selects ``["classifier_test", "mle", "dcr"]``.
            test_size: ``test_size`` argument passed to ``train_test_split``.
        """
        self.generator_name = generator_name
        # Build fresh containers per instance instead of sharing a mutable
        # default object between every benchmark that relies on the defaults.
        self.generator_params = {} if generator_params is None else generator_params
        self.n_random_splits = n_random_splits
        self.n_inits = n_inits
        self.n_generated_datasets = n_generated_datasets
        self.metrics = (
            ["classifier_test", "mle", "dcr"] if metrics is None else metrics
        )
        self.test_size = test_size

    def run(self, X: pd.DataFrame, target_column: str, discrete_columns: list):
        """Run the benchmark on ``X`` and return timings and metric results.

        Args:
            X: Dataset to split, fit the generator on and evaluate against.
            target_column: Column passed to the evaluator as target. The
                train/test split is stratified on it when it is also listed
                in ``discrete_columns``.
            discrete_columns: Columns passed as discrete features to the
                generator's ``fit`` and to the evaluator.

        Returns:
            A nested dictionary keyed as
            ``results["split_<i>"]["init_<j>"]``. Each init entry holds
            ``"training_time"`` plus one ``"generated_dataset_<k>"`` entry per
            sampled dataset, which in turn holds ``"inference_time"``,
            ``"evaluation_time"`` and the evaluator's metric results.
        """
        results = {}
        generator_ = get_generator(self.generator_name)
        for split_i in range(self.n_random_splits):
            split_results = {}
            results[f"split_{split_i}"] = split_results

            # split data according to current seed
            stratify = None
            if target_column in discrete_columns:
                stratify = X[target_column]
            X_train, X_test = train_test_split(
                X, stratify=stratify, test_size=self.test_size, random_state=split_i
            )
            X_train = X_train.reset_index(drop=True)
            X_test = X_test.reset_index(drop=True)

            for init_i in range(self.n_inits):
                init_results = {}
                split_results[f"init_{init_i}"] = init_results

                set_seed(init_i)
                generator = generator_(random_state=init_i, **self.generator_params)
                start_time = time()
                generator.fit(X_train, discrete_columns)
                init_results["training_time"] = time() - start_time

                # potentially generate multiple datasets
                for generated_dataset_i in range(self.n_generated_datasets):
                    dataset_results = {}
                    init_results[f"generated_dataset_{generated_dataset_i}"] = (
                        dataset_results
                    )

                    start_time = time()
                    # NOTE(review): samples len(X) rows (the full input), not
                    # len(X_train) -- confirm this is the intended size.
                    X_syn = generator.generate(len(X))
                    dataset_results["inference_time"] = time() - start_time

                    # evaluation_time covers evaluator construction as well
                    # as the evaluate() call.
                    start_time = time()
                    evaluator = MetricEvaluator(
                        metrics=self.metrics,
                        discrete_features=discrete_columns,
                        target_column=target_column,
                        random_state=init_i,
                    )
                    metric_results = evaluator.evaluate(X_train, X_test, X_syn)
                    dataset_results["evaluation_time"] = time() - start_time
                    dataset_results.update(metric_results)

                    # free up memory for next iteration
                    free_up_memory()

        return results
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .eval import MetricEvaluator
|