sdgym 0.13.1.dev0__tar.gz → 0.13.2.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sdgym-0.13.1.dev0/sdgym.egg-info → sdgym-0.13.2.dev0}/PKG-INFO +3 -4
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/README.md +2 -3
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/pyproject.toml +1 -1
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/__init__.py +1 -1
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/result_explorer/result_explorer.py +112 -21
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/result_explorer/result_handler.py +101 -30
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/utils.py +5 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0/sdgym.egg-info}/PKG-INFO +3 -4
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/LICENSE +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/_benchmark/__init__.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/_benchmark/benchmark.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/_benchmark/config_utils.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/_benchmark/credentials_utils.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/_dataset_utils.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/benchmark.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/cli/__init__.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/cli/__main__.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/cli/collect.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/cli/summary.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/cli/utils.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/dataset_explorer.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/datasets.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/errors.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/metrics.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/progress.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/result_explorer/__init__.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/result_writer.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/run_benchmark/__init__.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/run_benchmark/run_benchmark.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/run_benchmark/upload_benchmark_results.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/run_benchmark/utils.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/s3.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/synthesizer_descriptions.yaml +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/synthesizers/__init__.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/synthesizers/base.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/synthesizers/column.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/synthesizers/generate.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/synthesizers/identity.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/synthesizers/realtabformer.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/synthesizers/sdv.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/synthesizers/uniform.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/synthesizers/utils.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym.egg-info/SOURCES.txt +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym.egg-info/dependency_links.txt +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym.egg-info/entry_points.txt +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym.egg-info/requires.txt +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym.egg-info/top_level.txt +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/setup.cfg +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/tests/test_scripts.py +0 -0
- {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/tests/test_tasks.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sdgym
|
|
3
|
-
Version: 0.13.1.dev0
|
|
3
|
+
Version: 0.13.2.dev0
|
|
4
4
|
Summary: Benchmark tabular synthetic data generators using a variety of datasets
|
|
5
5
|
Author-email: "DataCebo, Inc." <info@sdv.dev>
|
|
6
6
|
License-Expression: BUSL-1.1
|
|
@@ -178,9 +178,7 @@ Now, we can benchmark the different techniques:
|
|
|
178
178
|
```python
|
|
179
179
|
import sdgym
|
|
180
180
|
|
|
181
|
-
sdgym.benchmark_single_table(
|
|
182
|
-
synthesizers=(sdv_synthesizers + baseline_synthesizers)
|
|
183
|
-
)
|
|
181
|
+
sdgym.benchmark_single_table(synthesizers=(sdv_synthesizers + baseline_synthesizers))
|
|
184
182
|
```
|
|
185
183
|
|
|
186
184
|
The result is a detailed performance, memory and quality evaluation across the synthesizers
|
|
@@ -197,6 +195,7 @@ def my_training_logic(data, metadata):
|
|
|
197
195
|
# train it using the data
|
|
198
196
|
return synthesizer
|
|
199
197
|
|
|
198
|
+
|
|
200
199
|
def my_sampling_logic(trained_synthesizer, num_rows):
|
|
201
200
|
# use the trained synthesizer to create
|
|
202
201
|
# num_rows of synthetic data
|
|
@@ -73,9 +73,7 @@ Now, we can benchmark the different techniques:
|
|
|
73
73
|
```python
|
|
74
74
|
import sdgym
|
|
75
75
|
|
|
76
|
-
sdgym.benchmark_single_table(
|
|
77
|
-
synthesizers=(sdv_synthesizers + baseline_synthesizers)
|
|
78
|
-
)
|
|
76
|
+
sdgym.benchmark_single_table(synthesizers=(sdv_synthesizers + baseline_synthesizers))
|
|
79
77
|
```
|
|
80
78
|
|
|
81
79
|
The result is a detailed performance, memory and quality evaluation across the synthesizers
|
|
@@ -92,6 +90,7 @@ def my_training_logic(data, metadata):
|
|
|
92
90
|
# train it using the data
|
|
93
91
|
return synthesizer
|
|
94
92
|
|
|
93
|
+
|
|
95
94
|
def my_sampling_logic(trained_synthesizer, num_rows):
|
|
96
95
|
# use the trained synthesizer to create
|
|
97
96
|
# num_rows of synthetic data
|
|
@@ -161,7 +161,7 @@ namespaces = false
|
|
|
161
161
|
version = {attr = 'sdgym.__version__'}
|
|
162
162
|
|
|
163
163
|
[tool.bumpversion]
|
|
164
|
-
current_version = "0.13.1.dev0"
|
|
164
|
+
current_version = "0.13.2.dev0"
|
|
165
165
|
parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
|
|
166
166
|
serialize = [
|
|
167
167
|
'{major}.{minor}.{patch}.{release}{candidate}',
|
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
"""SDGym Results Explorer for accessing and managing benchmark results."""
|
|
2
2
|
|
|
3
|
+
import operator
|
|
3
4
|
import os
|
|
5
|
+
from datetime import datetime
|
|
4
6
|
|
|
5
7
|
from sdgym.datasets import _load_dataset_with_client
|
|
6
8
|
from sdgym.result_explorer.result_handler import (
|
|
9
|
+
RESULTS_FOLDER_PREFIX,
|
|
7
10
|
SYNTHESIZER_BASELINE,
|
|
8
11
|
LocalResultsHandler,
|
|
9
12
|
S3ResultsHandler,
|
|
@@ -39,6 +42,29 @@ def _resolve_effective_path(path, modality):
|
|
|
39
42
|
class ResultsExplorer:
|
|
40
43
|
"""Explorer for SDGym benchmark results, supporting both local and S3 storage."""
|
|
41
44
|
|
|
45
|
+
def _get_latest_run(self):
|
|
46
|
+
"""Get the folder name of the latest SDGym run."""
|
|
47
|
+
candidates = []
|
|
48
|
+
for name in self._handler.list():
|
|
49
|
+
name = name.rstrip('/')
|
|
50
|
+
if not name.startswith(RESULTS_FOLDER_PREFIX):
|
|
51
|
+
continue
|
|
52
|
+
|
|
53
|
+
date_str = name[len(RESULTS_FOLDER_PREFIX) :]
|
|
54
|
+
try:
|
|
55
|
+
date_obj = datetime.strptime(date_str, '%m_%d_%Y')
|
|
56
|
+
except ValueError:
|
|
57
|
+
continue
|
|
58
|
+
|
|
59
|
+
candidates.append((date_obj, name))
|
|
60
|
+
if not candidates:
|
|
61
|
+
raise ValueError(
|
|
62
|
+
f'No run folders found. Expected folders like '
|
|
63
|
+
f"'{RESULTS_FOLDER_PREFIX}MM_DD_YYYY' under: {self.path}/{self.modality}"
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
return max(candidates, key=operator.itemgetter(0))[1]
|
|
67
|
+
|
|
42
68
|
def _create_results_handler(self, original_path, effective_path):
|
|
43
69
|
"""Create the appropriate results handler for local or S3 storage."""
|
|
44
70
|
baseline_synthesizer = _BASELINE_BY_MODALITY.get(self.modality, SYNTHESIZER_BASELINE)
|
|
@@ -86,22 +112,59 @@ class ResultsExplorer:
|
|
|
86
112
|
|
|
87
113
|
return self._handler.get_file_path(path_parts, end_filename)
|
|
88
114
|
|
|
89
|
-
def load_synthesizer(self,
|
|
90
|
-
"""Load the synthesizer for a given dataset and synthesizer.
|
|
115
|
+
def load_synthesizer(self, dataset_name, synthesizer_name, results_folder_name=None):
|
|
116
|
+
"""Load the synthesizer for a given dataset and synthesizer.
|
|
117
|
+
|
|
118
|
+
Args:
|
|
119
|
+
dataset_name (str):
|
|
120
|
+
The name of the dataset.
|
|
121
|
+
synthesizer_name (str):
|
|
122
|
+
The name of the synthesizer.
|
|
123
|
+
results_folder_name (str, optional):
|
|
124
|
+
The name of the results folder to load from. If not provided,
|
|
125
|
+
the latest run will be used. Defaults to None.
|
|
126
|
+
"""
|
|
127
|
+
results_folder_name = results_folder_name or self._get_latest_run()
|
|
91
128
|
file_path = self._get_file_path(
|
|
92
|
-
results_folder_name,
|
|
129
|
+
results_folder_name=results_folder_name,
|
|
130
|
+
dataset_name=dataset_name,
|
|
131
|
+
synthesizer_name=synthesizer_name,
|
|
132
|
+
file_type='synthesizer',
|
|
93
133
|
)
|
|
94
134
|
return self._handler.load_synthesizer(file_path)
|
|
95
135
|
|
|
96
|
-
def load_synthetic_data(self,
|
|
97
|
-
"""Load the synthetic data for a given dataset and synthesizer.
|
|
136
|
+
def load_synthetic_data(self, dataset_name, synthesizer_name, results_folder_name=None):
|
|
137
|
+
"""Load the synthetic data for a given dataset and synthesizer.
|
|
138
|
+
|
|
139
|
+
Args:
|
|
140
|
+
dataset_name (str):
|
|
141
|
+
The name of the dataset.
|
|
142
|
+
synthesizer_name (str):
|
|
143
|
+
The name of the synthesizer.
|
|
144
|
+
results_folder_name (str, optional):
|
|
145
|
+
The name of the results folder to load from. If not provided,
|
|
146
|
+
the latest run will be used. Defaults to None.
|
|
147
|
+
"""
|
|
148
|
+
results_folder_name = results_folder_name or self._get_latest_run()
|
|
98
149
|
file_path = self._get_file_path(
|
|
99
|
-
results_folder_name,
|
|
150
|
+
results_folder_name=results_folder_name,
|
|
151
|
+
dataset_name=dataset_name,
|
|
152
|
+
synthesizer_name=synthesizer_name,
|
|
153
|
+
file_type='synthetic_data',
|
|
100
154
|
)
|
|
101
155
|
return self._handler.load_synthetic_data(file_path)
|
|
102
156
|
|
|
103
157
|
def load_real_data(self, dataset_name):
|
|
104
|
-
"""Load the real data for a given dataset.
|
|
158
|
+
"""Load the real data for a given dataset.
|
|
159
|
+
|
|
160
|
+
Args:
|
|
161
|
+
dataset_name (str):
|
|
162
|
+
The name of the dataset.
|
|
163
|
+
|
|
164
|
+
Returns:
|
|
165
|
+
pd.DataFrame:
|
|
166
|
+
A DataFrame containing the real data for the specified dataset.
|
|
167
|
+
"""
|
|
105
168
|
data, _ = _load_dataset_with_client(
|
|
106
169
|
modality=self.modality,
|
|
107
170
|
dataset=dataset_name,
|
|
@@ -109,46 +172,74 @@ class ResultsExplorer:
|
|
|
109
172
|
)
|
|
110
173
|
return data
|
|
111
174
|
|
|
112
|
-
def summarize(self,
|
|
175
|
+
def summarize(self, results_folder_name=None):
|
|
113
176
|
"""Summarize the results in the specified folder.
|
|
114
177
|
|
|
115
178
|
Args:
|
|
116
|
-
|
|
117
|
-
The name of the results folder to summarize.
|
|
179
|
+
results_folder_name (str, optional):
|
|
180
|
+
The name of the results folder to summarize. If not provided,
|
|
181
|
+
the latest run will be used. Defaults to None.
|
|
118
182
|
|
|
119
183
|
Returns:
|
|
120
184
|
tuple (pd.DataFrame, pd.DataFrame):
|
|
121
185
|
- A summary DataFrame with the number of Wins per synthesizer.
|
|
122
186
|
- A DataFrame with the results of the benchmark for the specified folder.
|
|
123
187
|
"""
|
|
124
|
-
|
|
188
|
+
results_folder_name = results_folder_name or self._get_latest_run()
|
|
189
|
+
return self._handler.summarize(results_folder_name=results_folder_name)
|
|
125
190
|
|
|
126
|
-
def all_runs_complete(self,
|
|
191
|
+
def all_runs_complete(self, results_folder_name=None):
|
|
127
192
|
"""Check if all runs in the specified folder are complete."""
|
|
128
|
-
|
|
193
|
+
results_folder_name = results_folder_name or self._get_latest_run()
|
|
194
|
+
return self._handler.all_runs_complete(results_folder_name=results_folder_name)
|
|
129
195
|
|
|
130
|
-
def load_results(
|
|
196
|
+
def load_results(
|
|
197
|
+
self, dataset_names=None, synthesizer_names=None, summary=False, results_folder_name=None
|
|
198
|
+
):
|
|
131
199
|
"""Load and aggregate all the results CSV files from the specified results folder.
|
|
132
200
|
|
|
133
201
|
Args:
|
|
134
|
-
|
|
135
|
-
|
|
202
|
+
dataset_names (list[str], optional):
|
|
203
|
+
A list of dataset names to filter results for. If None, results for all
|
|
204
|
+
datasets will be loaded. Defaults to None.
|
|
205
|
+
synthesizer_names (list[str], optional):
|
|
206
|
+
A list of synthesizer names to filter results for. If None, results for all
|
|
207
|
+
synthesizers will be loaded. Defaults to None.
|
|
208
|
+
summary (bool, optional):
|
|
209
|
+
If True, only return the summary results which include the following columns:
|
|
210
|
+
- 'Dataset'
|
|
211
|
+
- 'Synthesizer'
|
|
212
|
+
- 'Adjusted_Total_Time'
|
|
213
|
+
- 'Adjusted_Quality_Score'
|
|
214
|
+
- 'Diagnostic_Score'
|
|
215
|
+
Defaults to False.
|
|
216
|
+
results_folder_name (str, optional):
|
|
217
|
+
The name of the results folder to load results from. If not provided,
|
|
218
|
+
the latest run will be used. Defaults to None.
|
|
136
219
|
|
|
137
220
|
Returns:
|
|
138
221
|
pd.DataFrame:
|
|
139
222
|
A DataFrame containing the results of the specified folder.
|
|
140
223
|
"""
|
|
141
|
-
|
|
224
|
+
results_folder_name = results_folder_name or self._get_latest_run()
|
|
225
|
+
return self._handler.load_results(
|
|
226
|
+
results_folder_name=results_folder_name,
|
|
227
|
+
dataset_names=dataset_names,
|
|
228
|
+
synthesizer_names=synthesizer_names,
|
|
229
|
+
summary=summary,
|
|
230
|
+
)
|
|
142
231
|
|
|
143
|
-
def load_metainfo(self, results_folder_name):
|
|
232
|
+
def load_metainfo(self, results_folder_name=None):
|
|
144
233
|
"""Load and aggregate all the metainfo YAML files from the specified results folder.
|
|
145
234
|
|
|
146
235
|
Args:
|
|
147
|
-
results_folder_name (str):
|
|
148
|
-
The name of the results folder to load metainfo from.
|
|
236
|
+
results_folder_name (str, optional):
|
|
237
|
+
The name of the results folder to load metainfo from. If not provided,
|
|
238
|
+
the latest run will be used. Defaults to None.
|
|
149
239
|
|
|
150
240
|
Returns:
|
|
151
241
|
dict:
|
|
152
242
|
A dictionary containing the metainfo of the specified folder.
|
|
153
243
|
"""
|
|
154
|
-
|
|
244
|
+
results_folder_name = results_folder_name or self._get_latest_run()
|
|
245
|
+
return self._handler.load_metainfo(results_folder_name=results_folder_name)
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
import io
|
|
4
4
|
import operator
|
|
5
5
|
import os
|
|
6
|
+
import warnings
|
|
6
7
|
from abc import ABC, abstractmethod
|
|
7
8
|
from datetime import datetime
|
|
8
9
|
|
|
@@ -12,6 +13,7 @@ import yaml
|
|
|
12
13
|
from botocore.exceptions import ClientError
|
|
13
14
|
|
|
14
15
|
from sdgym._dataset_utils import _read_zipped_data
|
|
16
|
+
from sdgym.utils import _is_list_of_type
|
|
15
17
|
|
|
16
18
|
SYNTHESIZER_BASELINE = 'GaussianCopulaSynthesizer'
|
|
17
19
|
RESULTS_FOLDER_PREFIX = 'SDGym_results_'
|
|
@@ -19,6 +21,13 @@ metainfo_PREFIX = 'metainfo'
|
|
|
19
21
|
RESULTS_FILE_PREFIX = 'results'
|
|
20
22
|
NUM_DIGITS_DATE = 10
|
|
21
23
|
REGEX_SYNTHESIZER_NAME = r'\s*\(\d+\)\s*$'
|
|
24
|
+
SUMMARY_COLUMNS = [
|
|
25
|
+
'Dataset',
|
|
26
|
+
'Synthesizer',
|
|
27
|
+
'Adjusted_Total_Time',
|
|
28
|
+
'Adjusted_Quality_Score',
|
|
29
|
+
'Diagnostic_Score',
|
|
30
|
+
]
|
|
22
31
|
|
|
23
32
|
|
|
24
33
|
class ResultsHandler(ABC):
|
|
@@ -151,13 +160,15 @@ class ResultsHandler(ABC):
|
|
|
151
160
|
filtered_results = filtered_results.sort_values(by=['Dataset', 'Synthesizer'])
|
|
152
161
|
return filtered_results.reset_index(drop=True)
|
|
153
162
|
|
|
154
|
-
def summarize(self,
|
|
163
|
+
def summarize(self, results_folder_name):
|
|
155
164
|
"""Summarize the results in the specified folder."""
|
|
156
165
|
all_folders = [f for f in self.list() if f.startswith(RESULTS_FOLDER_PREFIX)]
|
|
157
|
-
if
|
|
158
|
-
raise ValueError(
|
|
166
|
+
if results_folder_name not in all_folders:
|
|
167
|
+
raise ValueError(
|
|
168
|
+
f'Folder "{results_folder_name}" does not exist in the results directory.'
|
|
169
|
+
)
|
|
159
170
|
|
|
160
|
-
date = pd.to_datetime(
|
|
171
|
+
date = pd.to_datetime(results_folder_name[-NUM_DIGITS_DATE:], format='%m_%d_%Y')
|
|
161
172
|
folder_to_results = {}
|
|
162
173
|
for folder in all_folders:
|
|
163
174
|
folder_date = pd.to_datetime(folder[len(RESULTS_FOLDER_PREFIX) :], format='%m_%d_%Y')
|
|
@@ -181,28 +192,86 @@ class ResultsHandler(ABC):
|
|
|
181
192
|
|
|
182
193
|
summarized_table = self._get_summarize_table(folder_to_results, folder_infos)
|
|
183
194
|
|
|
184
|
-
return summarized_table, folder_to_results[
|
|
195
|
+
return summarized_table, folder_to_results[results_folder_name]
|
|
196
|
+
|
|
197
|
+
def _validate_load_results_filters(self, dataset_names, synthesizer_names, summary):
|
|
198
|
+
if dataset_names is not None:
|
|
199
|
+
if not _is_list_of_type(dataset_names, str):
|
|
200
|
+
raise ValueError('`dataset_names` must be a list of strings or None.')
|
|
201
|
+
|
|
202
|
+
if synthesizer_names is not None:
|
|
203
|
+
if not _is_list_of_type(synthesizer_names, str):
|
|
204
|
+
raise ValueError('`synthesizer_names` must be a list of strings or None.')
|
|
185
205
|
|
|
186
|
-
|
|
206
|
+
if not isinstance(summary, bool):
|
|
207
|
+
raise ValueError('`summary` must be a boolean.')
|
|
208
|
+
|
|
209
|
+
def load_results(
|
|
210
|
+
self, results_folder_name, dataset_names=None, synthesizer_names=None, summary=False
|
|
211
|
+
):
|
|
187
212
|
"""Load and aggregate all the results CSV files from the specified results folder.
|
|
188
213
|
|
|
189
214
|
Args:
|
|
190
215
|
results_folder_name (str):
|
|
191
216
|
The name of the results folder to load results from.
|
|
217
|
+
dataset_names (list of str, optional):
|
|
218
|
+
A list of dataset names to filter results for. If None, results for all
|
|
219
|
+
datasets will be loaded. Defaults to None.
|
|
220
|
+
synthesizer_names (list of str, optional):
|
|
221
|
+
A list of synthesizer names to filter results for. If None, results for all
|
|
222
|
+
synthesizers will be loaded. Defaults to None.
|
|
223
|
+
summary (bool, optional):
|
|
224
|
+
If True, only return the summary results which include the following columns:
|
|
225
|
+
- 'Dataset'
|
|
226
|
+
- 'Synthesizer'
|
|
227
|
+
- 'Adjusted_Total_Time'
|
|
228
|
+
- 'Adjusted_Quality_Score'
|
|
229
|
+
- 'Diagnostic_Score'
|
|
230
|
+
Defaults to False.
|
|
192
231
|
|
|
193
232
|
Returns:
|
|
194
233
|
pd.DataFrame:
|
|
195
234
|
A DataFrame containing the results of the specified folder.
|
|
196
235
|
"""
|
|
236
|
+
has_dataset_filter = dataset_names is not None
|
|
237
|
+
has_synthesizer_filter = synthesizer_names is not None
|
|
197
238
|
self._validate_folder_name(results_folder_name)
|
|
239
|
+
self._validate_load_results_filters(dataset_names, synthesizer_names, summary)
|
|
198
240
|
result_filenames = self._get_results_files(
|
|
199
241
|
results_folder_name, prefix=RESULTS_FILE_PREFIX, suffix='.csv'
|
|
200
242
|
)
|
|
201
243
|
|
|
202
|
-
|
|
244
|
+
result = pd.concat(
|
|
203
245
|
self._get_results(results_folder_name, result_filenames),
|
|
204
246
|
ignore_index=True,
|
|
205
247
|
)
|
|
248
|
+
if has_dataset_filter:
|
|
249
|
+
result = result[result['Dataset'].isin(dataset_names)]
|
|
250
|
+
|
|
251
|
+
if has_synthesizer_filter:
|
|
252
|
+
result = result[result['Synthesizer'].isin(synthesizer_names)]
|
|
253
|
+
|
|
254
|
+
if result.empty:
|
|
255
|
+
filters = []
|
|
256
|
+
if has_dataset_filter:
|
|
257
|
+
filters.append(f'- Datasets: {", ".join(dataset_names)}')
|
|
258
|
+
if has_synthesizer_filter:
|
|
259
|
+
filters.append(f'- Synthesizers: {", ".join(synthesizer_names)}')
|
|
260
|
+
|
|
261
|
+
if filters:
|
|
262
|
+
filters_text = '\n'.join(filters)
|
|
263
|
+
warning_message = (
|
|
264
|
+
f'No results found in folder "{results_folder_name}" '
|
|
265
|
+
f'matching the specified filters:\n'
|
|
266
|
+
f'{filters_text}'
|
|
267
|
+
)
|
|
268
|
+
else:
|
|
269
|
+
warning_message = f'No results found in folder "{results_folder_name}".'
|
|
270
|
+
|
|
271
|
+
warnings.warn(warning_message)
|
|
272
|
+
|
|
273
|
+
result = result[SUMMARY_COLUMNS] if summary else result
|
|
274
|
+
return result.reset_index(drop=True)
|
|
206
275
|
|
|
207
276
|
def load_metainfo(self, results_folder_name):
|
|
208
277
|
"""Load and aggregate all the metainfo YAML files from the specified results folder.
|
|
@@ -227,14 +296,16 @@ class ResultsHandler(ABC):
|
|
|
227
296
|
|
|
228
297
|
return results
|
|
229
298
|
|
|
230
|
-
def all_runs_complete(self,
|
|
299
|
+
def all_runs_complete(self, results_folder_name):
|
|
231
300
|
"""Check if all runs in the specified folder are complete."""
|
|
232
|
-
yaml_files = self._get_results_files(
|
|
301
|
+
yaml_files = self._get_results_files(
|
|
302
|
+
results_folder_name, prefix=metainfo_PREFIX, suffix='.yaml'
|
|
303
|
+
)
|
|
233
304
|
if not yaml_files:
|
|
234
305
|
return False
|
|
235
306
|
|
|
236
307
|
for yaml_file in yaml_files:
|
|
237
|
-
metainfo_info = self._load_yaml_file(
|
|
308
|
+
metainfo_info = self._load_yaml_file(results_folder_name, yaml_file)
|
|
238
309
|
if metainfo_info.get('completed_date') is None:
|
|
239
310
|
return False
|
|
240
311
|
|
|
@@ -278,21 +349,21 @@ class LocalResultsHandler(ResultsHandler):
|
|
|
278
349
|
|
|
279
350
|
return pd.read_csv(full_path)
|
|
280
351
|
|
|
281
|
-
def _get_results_files(self,
|
|
352
|
+
def _get_results_files(self, results_folder_name, prefix, suffix):
|
|
282
353
|
return [
|
|
283
354
|
f
|
|
284
|
-
for f in os.listdir(os.path.join(self.base_path,
|
|
355
|
+
for f in os.listdir(os.path.join(self.base_path, results_folder_name))
|
|
285
356
|
if f.endswith(suffix) and f.startswith(prefix)
|
|
286
357
|
]
|
|
287
358
|
|
|
288
|
-
def _get_results(self,
|
|
359
|
+
def _get_results(self, results_folder_name, file_names):
|
|
289
360
|
return [
|
|
290
|
-
pd.read_csv(os.path.join(self.base_path,
|
|
361
|
+
pd.read_csv(os.path.join(self.base_path, results_folder_name, file_name))
|
|
291
362
|
for file_name in file_names
|
|
292
363
|
]
|
|
293
364
|
|
|
294
|
-
def _load_yaml_file(self,
|
|
295
|
-
file_path = os.path.join(self.base_path,
|
|
365
|
+
def _load_yaml_file(self, results_folder_name, file_name):
|
|
366
|
+
file_path = os.path.join(self.base_path, results_folder_name, file_name)
|
|
296
367
|
with open(file_path, 'r') as f:
|
|
297
368
|
return yaml.safe_load(f)
|
|
298
369
|
|
|
@@ -396,29 +467,29 @@ class S3ResultsHandler(ResultsHandler):
|
|
|
396
467
|
|
|
397
468
|
return pd.read_csv(io.BytesIO(body))
|
|
398
469
|
|
|
399
|
-
def _get_results_files(self,
|
|
400
|
-
s3_prefix = f'{self.prefix}{
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
470
|
+
def _get_results_files(self, results_folder_name, prefix, suffix):
|
|
471
|
+
s3_prefix = f'{self.prefix}{results_folder_name}/'
|
|
472
|
+
paginator = self.s3_client.get_paginator('list_objects_v2')
|
|
473
|
+
files = []
|
|
474
|
+
for page in paginator.paginate(Bucket=self.bucket_name, Prefix=s3_prefix):
|
|
475
|
+
for obj in page.get('Contents', []):
|
|
476
|
+
key = obj['Key']
|
|
477
|
+
if key.startswith(s3_prefix + prefix) and key.endswith(suffix):
|
|
478
|
+
files.append(key.rsplit('/', 1)[-1])
|
|
404
479
|
|
|
405
|
-
return
|
|
406
|
-
obj['Key'].split('/')[-1]
|
|
407
|
-
for obj in response['Contents']
|
|
408
|
-
if obj['Key'].startswith(s3_prefix + prefix) and obj['Key'].endswith(suffix)
|
|
409
|
-
]
|
|
480
|
+
return files
|
|
410
481
|
|
|
411
|
-
def _get_results(self,
|
|
482
|
+
def _get_results(self, results_folder_name, file_names):
|
|
412
483
|
results = []
|
|
413
484
|
for file_name in file_names:
|
|
414
|
-
s3_key = f'{self.prefix}{
|
|
485
|
+
s3_key = f'{self.prefix}{results_folder_name}/{file_name}'
|
|
415
486
|
response = self.s3_client.get_object(Bucket=self.bucket_name, Key=s3_key)
|
|
416
487
|
result_df = pd.read_csv(io.BytesIO(response['Body'].read()))
|
|
417
488
|
results.append(result_df)
|
|
418
489
|
|
|
419
490
|
return results
|
|
420
491
|
|
|
421
|
-
def _load_yaml_file(self,
|
|
422
|
-
s3_key = f'{self.prefix}{
|
|
492
|
+
def _load_yaml_file(self, results_folder_name, file_name):
|
|
493
|
+
s3_key = f'{self.prefix}{results_folder_name}/{file_name}'
|
|
423
494
|
response = self.s3_client.get_object(Bucket=self.bucket_name, Key=s3_key)
|
|
424
495
|
return yaml.safe_load(response['Body'])
|
|
@@ -204,3 +204,8 @@ def _set_column_width(writer, df, sheet_name):
|
|
|
204
204
|
max_length = max(df[column].astype(str).map(len).max(), len(column))
|
|
205
205
|
column_letter = get_column_letter(col_idx)
|
|
206
206
|
worksheet.column_dimensions[column_letter].width = max_length + 2
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _is_list_of_type(values, type_to_check=str):
|
|
210
|
+
"""Checks that 'values' is a list and all elements are of type 'type_to_check'."""
|
|
211
|
+
return isinstance(values, list) and all(isinstance(value, type_to_check) for value in values)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sdgym
|
|
3
|
-
Version: 0.13.1.dev0
|
|
3
|
+
Version: 0.13.2.dev0
|
|
4
4
|
Summary: Benchmark tabular synthetic data generators using a variety of datasets
|
|
5
5
|
Author-email: "DataCebo, Inc." <info@sdv.dev>
|
|
6
6
|
License-Expression: BUSL-1.1
|
|
@@ -178,9 +178,7 @@ Now, we can benchmark the different techniques:
|
|
|
178
178
|
```python
|
|
179
179
|
import sdgym
|
|
180
180
|
|
|
181
|
-
sdgym.benchmark_single_table(
|
|
182
|
-
synthesizers=(sdv_synthesizers + baseline_synthesizers)
|
|
183
|
-
)
|
|
181
|
+
sdgym.benchmark_single_table(synthesizers=(sdv_synthesizers + baseline_synthesizers))
|
|
184
182
|
```
|
|
185
183
|
|
|
186
184
|
The result is a detailed performance, memory and quality evaluation across the synthesizers
|
|
@@ -197,6 +195,7 @@ def my_training_logic(data, metadata):
|
|
|
197
195
|
# train it using the data
|
|
198
196
|
return synthesizer
|
|
199
197
|
|
|
198
|
+
|
|
200
199
|
def my_sampling_logic(trained_synthesizer, num_rows):
|
|
201
200
|
# use the trained synthesizer to create
|
|
202
201
|
# num_rows of synthetic data
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|