sdgym 0.13.1.dev0__tar.gz → 0.13.2.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. {sdgym-0.13.1.dev0/sdgym.egg-info → sdgym-0.13.2.dev0}/PKG-INFO +3 -4
  2. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/README.md +2 -3
  3. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/pyproject.toml +1 -1
  4. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/__init__.py +1 -1
  5. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/result_explorer/result_explorer.py +112 -21
  6. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/result_explorer/result_handler.py +101 -30
  7. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/utils.py +5 -0
  8. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0/sdgym.egg-info}/PKG-INFO +3 -4
  9. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/LICENSE +0 -0
  10. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/_benchmark/__init__.py +0 -0
  11. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/_benchmark/benchmark.py +0 -0
  12. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/_benchmark/config_utils.py +0 -0
  13. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/_benchmark/credentials_utils.py +0 -0
  14. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/_dataset_utils.py +0 -0
  15. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/benchmark.py +0 -0
  16. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/cli/__init__.py +0 -0
  17. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/cli/__main__.py +0 -0
  18. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/cli/collect.py +0 -0
  19. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/cli/summary.py +0 -0
  20. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/cli/utils.py +0 -0
  21. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/dataset_explorer.py +0 -0
  22. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/datasets.py +0 -0
  23. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/errors.py +0 -0
  24. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/metrics.py +0 -0
  25. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/progress.py +0 -0
  26. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/result_explorer/__init__.py +0 -0
  27. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/result_writer.py +0 -0
  28. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/run_benchmark/__init__.py +0 -0
  29. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/run_benchmark/run_benchmark.py +0 -0
  30. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/run_benchmark/upload_benchmark_results.py +0 -0
  31. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/run_benchmark/utils.py +0 -0
  32. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/s3.py +0 -0
  33. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/synthesizer_descriptions.yaml +0 -0
  34. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/synthesizers/__init__.py +0 -0
  35. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/synthesizers/base.py +0 -0
  36. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/synthesizers/column.py +0 -0
  37. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/synthesizers/generate.py +0 -0
  38. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/synthesizers/identity.py +0 -0
  39. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/synthesizers/realtabformer.py +0 -0
  40. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/synthesizers/sdv.py +0 -0
  41. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/synthesizers/uniform.py +0 -0
  42. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym/synthesizers/utils.py +0 -0
  43. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym.egg-info/SOURCES.txt +0 -0
  44. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym.egg-info/dependency_links.txt +0 -0
  45. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym.egg-info/entry_points.txt +0 -0
  46. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym.egg-info/requires.txt +0 -0
  47. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/sdgym.egg-info/top_level.txt +0 -0
  48. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/setup.cfg +0 -0
  49. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/tests/test_scripts.py +0 -0
  50. {sdgym-0.13.1.dev0 → sdgym-0.13.2.dev0}/tests/test_tasks.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdgym
3
- Version: 0.13.1.dev0
3
+ Version: 0.13.2.dev0
4
4
  Summary: Benchmark tabular synthetic data generators using a variety of datasets
5
5
  Author-email: "DataCebo, Inc." <info@sdv.dev>
6
6
  License-Expression: BUSL-1.1
@@ -178,9 +178,7 @@ Now, we can benchmark the different techniques:
178
178
  ```python
179
179
  import sdgym
180
180
 
181
- sdgym.benchmark_single_table(
182
- synthesizers=(sdv_synthesizers + baseline_synthesizers)
183
- )
181
+ sdgym.benchmark_single_table(synthesizers=(sdv_synthesizers + baseline_synthesizers))
184
182
  ```
185
183
 
186
184
  The result is a detailed performance, memory and quality evaluation across the synthesizers
@@ -197,6 +195,7 @@ def my_training_logic(data, metadata):
197
195
  # train it using the data
198
196
  return synthesizer
199
197
 
198
+
200
199
  def my_sampling_logic(trained_synthesizer, num_rows):
201
200
  # use the trained synthesizer to create
202
201
  # num_rows of synthetic data
@@ -73,9 +73,7 @@ Now, we can benchmark the different techniques:
73
73
  ```python
74
74
  import sdgym
75
75
 
76
- sdgym.benchmark_single_table(
77
- synthesizers=(sdv_synthesizers + baseline_synthesizers)
78
- )
76
+ sdgym.benchmark_single_table(synthesizers=(sdv_synthesizers + baseline_synthesizers))
79
77
  ```
80
78
 
81
79
  The result is a detailed performance, memory and quality evaluation across the synthesizers
@@ -92,6 +90,7 @@ def my_training_logic(data, metadata):
92
90
  # train it using the data
93
91
  return synthesizer
94
92
 
93
+
95
94
  def my_sampling_logic(trained_synthesizer, num_rows):
96
95
  # use the trained synthesizer to create
97
96
  # num_rows of synthetic data
@@ -161,7 +161,7 @@ namespaces = false
161
161
  version = {attr = 'sdgym.__version__'}
162
162
 
163
163
  [tool.bumpversion]
164
- current_version = "0.13.1.dev0"
164
+ current_version = "0.13.2.dev0"
165
165
  parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
166
166
  serialize = [
167
167
  '{major}.{minor}.{patch}.{release}{candidate}',
@@ -8,7 +8,7 @@ __author__ = 'DataCebo, Inc.'
8
8
  __copyright__ = 'Copyright (c) 2022 DataCebo, Inc.'
9
9
  __email__ = 'info@sdv.dev'
10
10
  __license__ = 'BSL-1.1'
11
- __version__ = '0.13.1.dev0'
11
+ __version__ = '0.13.2.dev0'
12
12
 
13
13
  import logging
14
14
 
@@ -1,9 +1,12 @@
1
1
  """SDGym Results Explorer for accessing and managing benchmark results."""
2
2
 
3
+ import operator
3
4
  import os
5
+ from datetime import datetime
4
6
 
5
7
  from sdgym.datasets import _load_dataset_with_client
6
8
  from sdgym.result_explorer.result_handler import (
9
+ RESULTS_FOLDER_PREFIX,
7
10
  SYNTHESIZER_BASELINE,
8
11
  LocalResultsHandler,
9
12
  S3ResultsHandler,
@@ -39,6 +42,29 @@ def _resolve_effective_path(path, modality):
39
42
  class ResultsExplorer:
40
43
  """Explorer for SDGym benchmark results, supporting both local and S3 storage."""
41
44
 
45
+ def _get_latest_run(self):
46
+ """Get the folder name of the latest SDGym run."""
47
+ candidates = []
48
+ for name in self._handler.list():
49
+ name = name.rstrip('/')
50
+ if not name.startswith(RESULTS_FOLDER_PREFIX):
51
+ continue
52
+
53
+ date_str = name[len(RESULTS_FOLDER_PREFIX) :]
54
+ try:
55
+ date_obj = datetime.strptime(date_str, '%m_%d_%Y')
56
+ except ValueError:
57
+ continue
58
+
59
+ candidates.append((date_obj, name))
60
+ if not candidates:
61
+ raise ValueError(
62
+ f'No run folders found. Expected folders like '
63
+ f"'{RESULTS_FOLDER_PREFIX}MM_DD_YYYY' under: {self.path}/{self.modality}"
64
+ )
65
+
66
+ return max(candidates, key=operator.itemgetter(0))[1]
67
+
42
68
  def _create_results_handler(self, original_path, effective_path):
43
69
  """Create the appropriate results handler for local or S3 storage."""
44
70
  baseline_synthesizer = _BASELINE_BY_MODALITY.get(self.modality, SYNTHESIZER_BASELINE)
@@ -86,22 +112,59 @@ class ResultsExplorer:
86
112
 
87
113
  return self._handler.get_file_path(path_parts, end_filename)
88
114
 
89
- def load_synthesizer(self, results_folder_name, dataset_name, synthesizer_name):
90
- """Load the synthesizer for a given dataset and synthesizer."""
115
+ def load_synthesizer(self, dataset_name, synthesizer_name, results_folder_name=None):
116
+ """Load the synthesizer for a given dataset and synthesizer.
117
+
118
+ Args:
119
+ dataset_name (str):
120
+ The name of the dataset.
121
+ synthesizer_name (str):
122
+ The name of the synthesizer.
123
+ results_folder_name (str, optional):
124
+ The name of the results folder to load from. If not provided,
125
+ the latest run will be used. Defaults to None.
126
+ """
127
+ results_folder_name = results_folder_name or self._get_latest_run()
91
128
  file_path = self._get_file_path(
92
- results_folder_name, dataset_name, synthesizer_name, 'synthesizer'
129
+ results_folder_name=results_folder_name,
130
+ dataset_name=dataset_name,
131
+ synthesizer_name=synthesizer_name,
132
+ file_type='synthesizer',
93
133
  )
94
134
  return self._handler.load_synthesizer(file_path)
95
135
 
96
- def load_synthetic_data(self, results_folder_name, dataset_name, synthesizer_name):
97
- """Load the synthetic data for a given dataset and synthesizer."""
136
+ def load_synthetic_data(self, dataset_name, synthesizer_name, results_folder_name=None):
137
+ """Load the synthetic data for a given dataset and synthesizer.
138
+
139
+ Args:
140
+ dataset_name (str):
141
+ The name of the dataset.
142
+ synthesizer_name (str):
143
+ The name of the synthesizer.
144
+ results_folder_name (str, optional):
145
+ The name of the results folder to load from. If not provided,
146
+ the latest run will be used. Defaults to None.
147
+ """
148
+ results_folder_name = results_folder_name or self._get_latest_run()
98
149
  file_path = self._get_file_path(
99
- results_folder_name, dataset_name, synthesizer_name, 'synthetic_data'
150
+ results_folder_name=results_folder_name,
151
+ dataset_name=dataset_name,
152
+ synthesizer_name=synthesizer_name,
153
+ file_type='synthetic_data',
100
154
  )
101
155
  return self._handler.load_synthetic_data(file_path)
102
156
 
103
157
  def load_real_data(self, dataset_name):
104
- """Load the real data for a given dataset."""
158
+ """Load the real data for a given dataset.
159
+
160
+ Args:
161
+ dataset_name (str):
162
+ The name of the dataset.
163
+
164
+ Returns:
165
+ pd.DataFrame:
166
+ A DataFrame containing the real data for the specified dataset.
167
+ """
105
168
  data, _ = _load_dataset_with_client(
106
169
  modality=self.modality,
107
170
  dataset=dataset_name,
@@ -109,46 +172,74 @@ class ResultsExplorer:
109
172
  )
110
173
  return data
111
174
 
112
- def summarize(self, folder_name):
175
+ def summarize(self, results_folder_name=None):
113
176
  """Summarize the results in the specified folder.
114
177
 
115
178
  Args:
116
- folder_name (str):
117
- The name of the results folder to summarize.
179
+ results_folder_name (str, optional):
180
+ The name of the results folder to summarize. If not provided,
181
+ the latest run will be used. Defaults to None.
118
182
 
119
183
  Returns:
120
184
  tuple (pd.DataFrame, pd.DataFrame):
121
185
  - A summary DataFrame with the number of Wins per synthesizer.
122
186
  - A DataFrame with the results of the benchmark for the specified folder.
123
187
  """
124
- return self._handler.summarize(folder_name)
188
+ results_folder_name = results_folder_name or self._get_latest_run()
189
+ return self._handler.summarize(results_folder_name=results_folder_name)
125
190
 
126
- def all_runs_complete(self, folder_name):
191
+ def all_runs_complete(self, results_folder_name=None):
127
192
  """Check if all runs in the specified folder are complete."""
128
- return self._handler.all_runs_complete(folder_name)
193
+ results_folder_name = results_folder_name or self._get_latest_run()
194
+ return self._handler.all_runs_complete(results_folder_name=results_folder_name)
129
195
 
130
- def load_results(self, results_folder_name):
196
+ def load_results(
197
+ self, dataset_names=None, synthesizer_names=None, summary=False, results_folder_name=None
198
+ ):
131
199
  """Load and aggregate all the results CSV files from the specified results folder.
132
200
 
133
201
  Args:
134
- results_folder_name (str):
135
- The name of the results folder to load results from.
202
+ dataset_names (list[str], optional):
203
+ A list of dataset names to filter results for. If None, results for all
204
+ datasets will be loaded. Defaults to None.
205
+ synthesizer_names (list[str], optional):
206
+ A list of synthesizer names to filter results for. If None, results for all
207
+ synthesizers will be loaded. Defaults to None.
208
+ summary (bool, optional):
209
+ If True, only return the summary results which include the following columns:
210
+ - 'Dataset'
211
+ - 'Synthesizer'
212
+ - 'Adjusted_Total_Time'
213
+ - 'Adjusted_Quality_Score'
214
+ - 'Diagnostic_Score'
215
+ Defaults to False.
216
+ results_folder_name (str, optional):
217
+ The name of the results folder to load results from. If not provided,
218
+ the latest run will be used. Defaults to None.
136
219
 
137
220
  Returns:
138
221
  pd.DataFrame:
139
222
  A DataFrame containing the results of the specified folder.
140
223
  """
141
- return self._handler.load_results(results_folder_name)
224
+ results_folder_name = results_folder_name or self._get_latest_run()
225
+ return self._handler.load_results(
226
+ results_folder_name=results_folder_name,
227
+ dataset_names=dataset_names,
228
+ synthesizer_names=synthesizer_names,
229
+ summary=summary,
230
+ )
142
231
 
143
- def load_metainfo(self, results_folder_name):
232
+ def load_metainfo(self, results_folder_name=None):
144
233
  """Load and aggregate all the metainfo YAML files from the specified results folder.
145
234
 
146
235
  Args:
147
- results_folder_name (str):
148
- The name of the results folder to load metainfo from.
236
+ results_folder_name (str, optional):
237
+ The name of the results folder to load metainfo from. If not provided,
238
+ the latest run will be used. Defaults to None.
149
239
 
150
240
  Returns:
151
241
  dict:
152
242
  A dictionary containing the metainfo of the specified folder.
153
243
  """
154
- return self._handler.load_metainfo(results_folder_name)
244
+ results_folder_name = results_folder_name or self._get_latest_run()
245
+ return self._handler.load_metainfo(results_folder_name=results_folder_name)
@@ -3,6 +3,7 @@
3
3
  import io
4
4
  import operator
5
5
  import os
6
+ import warnings
6
7
  from abc import ABC, abstractmethod
7
8
  from datetime import datetime
8
9
 
@@ -12,6 +13,7 @@ import yaml
12
13
  from botocore.exceptions import ClientError
13
14
 
14
15
  from sdgym._dataset_utils import _read_zipped_data
16
+ from sdgym.utils import _is_list_of_type
15
17
 
16
18
  SYNTHESIZER_BASELINE = 'GaussianCopulaSynthesizer'
17
19
  RESULTS_FOLDER_PREFIX = 'SDGym_results_'
@@ -19,6 +21,13 @@ metainfo_PREFIX = 'metainfo'
19
21
  RESULTS_FILE_PREFIX = 'results'
20
22
  NUM_DIGITS_DATE = 10
21
23
  REGEX_SYNTHESIZER_NAME = r'\s*\(\d+\)\s*$'
24
+ SUMMARY_COLUMNS = [
25
+ 'Dataset',
26
+ 'Synthesizer',
27
+ 'Adjusted_Total_Time',
28
+ 'Adjusted_Quality_Score',
29
+ 'Diagnostic_Score',
30
+ ]
22
31
 
23
32
 
24
33
  class ResultsHandler(ABC):
@@ -151,13 +160,15 @@ class ResultsHandler(ABC):
151
160
  filtered_results = filtered_results.sort_values(by=['Dataset', 'Synthesizer'])
152
161
  return filtered_results.reset_index(drop=True)
153
162
 
154
- def summarize(self, folder_name):
163
+ def summarize(self, results_folder_name):
155
164
  """Summarize the results in the specified folder."""
156
165
  all_folders = [f for f in self.list() if f.startswith(RESULTS_FOLDER_PREFIX)]
157
- if folder_name not in all_folders:
158
- raise ValueError(f'Folder "{folder_name}" does not exist in the results directory.')
166
+ if results_folder_name not in all_folders:
167
+ raise ValueError(
168
+ f'Folder "{results_folder_name}" does not exist in the results directory.'
169
+ )
159
170
 
160
- date = pd.to_datetime(folder_name[-NUM_DIGITS_DATE:], format='%m_%d_%Y')
171
+ date = pd.to_datetime(results_folder_name[-NUM_DIGITS_DATE:], format='%m_%d_%Y')
161
172
  folder_to_results = {}
162
173
  for folder in all_folders:
163
174
  folder_date = pd.to_datetime(folder[len(RESULTS_FOLDER_PREFIX) :], format='%m_%d_%Y')
@@ -181,28 +192,86 @@ class ResultsHandler(ABC):
181
192
 
182
193
  summarized_table = self._get_summarize_table(folder_to_results, folder_infos)
183
194
 
184
- return summarized_table, folder_to_results[folder_name]
195
+ return summarized_table, folder_to_results[results_folder_name]
196
+
197
+ def _validate_load_results_filters(self, dataset_names, synthesizer_names, summary):
198
+ if dataset_names is not None:
199
+ if not _is_list_of_type(dataset_names, str):
200
+ raise ValueError('`dataset_names` must be a list of strings or None.')
201
+
202
+ if synthesizer_names is not None:
203
+ if not _is_list_of_type(synthesizer_names, str):
204
+ raise ValueError('`synthesizer_names` must be a list of strings or None.')
185
205
 
186
- def load_results(self, results_folder_name):
206
+ if not isinstance(summary, bool):
207
+ raise ValueError('`summary` must be a boolean.')
208
+
209
+ def load_results(
210
+ self, results_folder_name, dataset_names=None, synthesizer_names=None, summary=False
211
+ ):
187
212
  """Load and aggregate all the results CSV files from the specified results folder.
188
213
 
189
214
  Args:
190
215
  results_folder_name (str):
191
216
  The name of the results folder to load results from.
217
+ dataset_names (list of str, optional):
218
+ A list of dataset names to filter results for. If None, results for all
219
+ datasets will be loaded. Defaults to None.
220
+ synthesizer_names (list of str, optional):
221
+ A list of synthesizer names to filter results for. If None, results for all
222
+ synthesizers will be loaded. Defaults to None.
223
+ summary (bool, optional):
224
+ If True, only return the summary results which include the following columns:
225
+ - 'Dataset'
226
+ - 'Synthesizer'
227
+ - 'Adjusted_Total_Time'
228
+ - 'Adjusted_Quality_Score'
229
+ - 'Diagnostic_Score'
230
+ Defaults to False.
192
231
 
193
232
  Returns:
194
233
  pd.DataFrame:
195
234
  A DataFrame containing the results of the specified folder.
196
235
  """
236
+ has_dataset_filter = dataset_names is not None
237
+ has_synthesizer_filter = synthesizer_names is not None
197
238
  self._validate_folder_name(results_folder_name)
239
+ self._validate_load_results_filters(dataset_names, synthesizer_names, summary)
198
240
  result_filenames = self._get_results_files(
199
241
  results_folder_name, prefix=RESULTS_FILE_PREFIX, suffix='.csv'
200
242
  )
201
243
 
202
- return pd.concat(
244
+ result = pd.concat(
203
245
  self._get_results(results_folder_name, result_filenames),
204
246
  ignore_index=True,
205
247
  )
248
+ if has_dataset_filter:
249
+ result = result[result['Dataset'].isin(dataset_names)]
250
+
251
+ if has_synthesizer_filter:
252
+ result = result[result['Synthesizer'].isin(synthesizer_names)]
253
+
254
+ if result.empty:
255
+ filters = []
256
+ if has_dataset_filter:
257
+ filters.append(f'- Datasets: {", ".join(dataset_names)}')
258
+ if has_synthesizer_filter:
259
+ filters.append(f'- Synthesizers: {", ".join(synthesizer_names)}')
260
+
261
+ if filters:
262
+ filters_text = '\n'.join(filters)
263
+ warning_message = (
264
+ f'No results found in folder "{results_folder_name}" '
265
+ f'matching the specified filters:\n'
266
+ f'{filters_text}'
267
+ )
268
+ else:
269
+ warning_message = f'No results found in folder "{results_folder_name}".'
270
+
271
+ warnings.warn(warning_message)
272
+
273
+ result = result[SUMMARY_COLUMNS] if summary else result
274
+ return result.reset_index(drop=True)
206
275
 
207
276
  def load_metainfo(self, results_folder_name):
208
277
  """Load and aggregate all the metainfo YAML files from the specified results folder.
@@ -227,14 +296,16 @@ class ResultsHandler(ABC):
227
296
 
228
297
  return results
229
298
 
230
- def all_runs_complete(self, folder_name):
299
+ def all_runs_complete(self, results_folder_name):
231
300
  """Check if all runs in the specified folder are complete."""
232
- yaml_files = self._get_results_files(folder_name, prefix=metainfo_PREFIX, suffix='.yaml')
301
+ yaml_files = self._get_results_files(
302
+ results_folder_name, prefix=metainfo_PREFIX, suffix='.yaml'
303
+ )
233
304
  if not yaml_files:
234
305
  return False
235
306
 
236
307
  for yaml_file in yaml_files:
237
- metainfo_info = self._load_yaml_file(folder_name, yaml_file)
308
+ metainfo_info = self._load_yaml_file(results_folder_name, yaml_file)
238
309
  if metainfo_info.get('completed_date') is None:
239
310
  return False
240
311
 
@@ -278,21 +349,21 @@ class LocalResultsHandler(ResultsHandler):
278
349
 
279
350
  return pd.read_csv(full_path)
280
351
 
281
- def _get_results_files(self, folder_name, prefix, suffix):
352
+ def _get_results_files(self, results_folder_name, prefix, suffix):
282
353
  return [
283
354
  f
284
- for f in os.listdir(os.path.join(self.base_path, folder_name))
355
+ for f in os.listdir(os.path.join(self.base_path, results_folder_name))
285
356
  if f.endswith(suffix) and f.startswith(prefix)
286
357
  ]
287
358
 
288
- def _get_results(self, folder_name, file_names):
359
+ def _get_results(self, results_folder_name, file_names):
289
360
  return [
290
- pd.read_csv(os.path.join(self.base_path, folder_name, file_name))
361
+ pd.read_csv(os.path.join(self.base_path, results_folder_name, file_name))
291
362
  for file_name in file_names
292
363
  ]
293
364
 
294
- def _load_yaml_file(self, folder_name, file_name):
295
- file_path = os.path.join(self.base_path, folder_name, file_name)
365
+ def _load_yaml_file(self, results_folder_name, file_name):
366
+ file_path = os.path.join(self.base_path, results_folder_name, file_name)
296
367
  with open(file_path, 'r') as f:
297
368
  return yaml.safe_load(f)
298
369
 
@@ -396,29 +467,29 @@ class S3ResultsHandler(ResultsHandler):
396
467
 
397
468
  return pd.read_csv(io.BytesIO(body))
398
469
 
399
- def _get_results_files(self, folder_name, prefix, suffix):
400
- s3_prefix = f'{self.prefix}{folder_name}/'
401
- response = self.s3_client.list_objects_v2(Bucket=self.bucket_name, Prefix=s3_prefix)
402
- if 'Contents' not in response:
403
- return []
470
+ def _get_results_files(self, results_folder_name, prefix, suffix):
471
+ s3_prefix = f'{self.prefix}{results_folder_name}/'
472
+ paginator = self.s3_client.get_paginator('list_objects_v2')
473
+ files = []
474
+ for page in paginator.paginate(Bucket=self.bucket_name, Prefix=s3_prefix):
475
+ for obj in page.get('Contents', []):
476
+ key = obj['Key']
477
+ if key.startswith(s3_prefix + prefix) and key.endswith(suffix):
478
+ files.append(key.rsplit('/', 1)[-1])
404
479
 
405
- return [
406
- obj['Key'].split('/')[-1]
407
- for obj in response['Contents']
408
- if obj['Key'].startswith(s3_prefix + prefix) and obj['Key'].endswith(suffix)
409
- ]
480
+ return files
410
481
 
411
- def _get_results(self, folder_name, file_names):
482
+ def _get_results(self, results_folder_name, file_names):
412
483
  results = []
413
484
  for file_name in file_names:
414
- s3_key = f'{self.prefix}{folder_name}/{file_name}'
485
+ s3_key = f'{self.prefix}{results_folder_name}/{file_name}'
415
486
  response = self.s3_client.get_object(Bucket=self.bucket_name, Key=s3_key)
416
487
  result_df = pd.read_csv(io.BytesIO(response['Body'].read()))
417
488
  results.append(result_df)
418
489
 
419
490
  return results
420
491
 
421
- def _load_yaml_file(self, folder_name, file_name):
422
- s3_key = f'{self.prefix}{folder_name}/{file_name}'
492
+ def _load_yaml_file(self, results_folder_name, file_name):
493
+ s3_key = f'{self.prefix}{results_folder_name}/{file_name}'
423
494
  response = self.s3_client.get_object(Bucket=self.bucket_name, Key=s3_key)
424
495
  return yaml.safe_load(response['Body'])
@@ -204,3 +204,8 @@ def _set_column_width(writer, df, sheet_name):
204
204
  max_length = max(df[column].astype(str).map(len).max(), len(column))
205
205
  column_letter = get_column_letter(col_idx)
206
206
  worksheet.column_dimensions[column_letter].width = max_length + 2
207
+
208
+
209
+ def _is_list_of_type(values, type_to_check=str):
210
+ """Checks that 'values' is a list and all elements are of type 'type_to_check'."""
211
+ return isinstance(values, list) and all(isinstance(value, type_to_check) for value in values)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdgym
3
- Version: 0.13.1.dev0
3
+ Version: 0.13.2.dev0
4
4
  Summary: Benchmark tabular synthetic data generators using a variety of datasets
5
5
  Author-email: "DataCebo, Inc." <info@sdv.dev>
6
6
  License-Expression: BUSL-1.1
@@ -178,9 +178,7 @@ Now, we can benchmark the different techniques:
178
178
  ```python
179
179
  import sdgym
180
180
 
181
- sdgym.benchmark_single_table(
182
- synthesizers=(sdv_synthesizers + baseline_synthesizers)
183
- )
181
+ sdgym.benchmark_single_table(synthesizers=(sdv_synthesizers + baseline_synthesizers))
184
182
  ```
185
183
 
186
184
  The result is a detailed performance, memory and quality evaluation across the synthesizers
@@ -197,6 +195,7 @@ def my_training_logic(data, metadata):
197
195
  # train it using the data
198
196
  return synthesizer
199
197
 
198
+
200
199
  def my_sampling_logic(trained_synthesizer, num_rows):
201
200
  # use the trained synthesizer to create
202
201
  # num_rows of synthetic data
File without changes
File without changes
File without changes
File without changes