sdgym 0.13.2.dev0__tar.gz → 0.14.1.dev0__tar.gz

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (50)
  1. {sdgym-0.13.2.dev0/sdgym.egg-info → sdgym-0.14.1.dev0}/PKG-INFO +1 -1
  2. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/pyproject.toml +1 -1
  3. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/__init__.py +1 -1
  4. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/benchmark.py +1 -0
  5. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/result_explorer/result_handler.py +71 -20
  6. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/run_benchmark/run_benchmark.py +2 -1
  7. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0/sdgym.egg-info}/PKG-INFO +1 -1
  8. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/LICENSE +0 -0
  9. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/README.md +0 -0
  10. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/_benchmark/__init__.py +0 -0
  11. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/_benchmark/benchmark.py +0 -0
  12. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/_benchmark/config_utils.py +0 -0
  13. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/_benchmark/credentials_utils.py +0 -0
  14. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/_dataset_utils.py +0 -0
  15. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/cli/__init__.py +0 -0
  16. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/cli/__main__.py +0 -0
  17. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/cli/collect.py +0 -0
  18. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/cli/summary.py +0 -0
  19. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/cli/utils.py +0 -0
  20. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/dataset_explorer.py +0 -0
  21. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/datasets.py +0 -0
  22. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/errors.py +0 -0
  23. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/metrics.py +0 -0
  24. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/progress.py +0 -0
  25. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/result_explorer/__init__.py +0 -0
  26. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/result_explorer/result_explorer.py +0 -0
  27. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/result_writer.py +0 -0
  28. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/run_benchmark/__init__.py +0 -0
  29. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/run_benchmark/upload_benchmark_results.py +0 -0
  30. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/run_benchmark/utils.py +0 -0
  31. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/s3.py +0 -0
  32. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/synthesizer_descriptions.yaml +0 -0
  33. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/synthesizers/__init__.py +0 -0
  34. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/synthesizers/base.py +0 -0
  35. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/synthesizers/column.py +0 -0
  36. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/synthesizers/generate.py +0 -0
  37. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/synthesizers/identity.py +0 -0
  38. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/synthesizers/realtabformer.py +0 -0
  39. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/synthesizers/sdv.py +0 -0
  40. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/synthesizers/uniform.py +0 -0
  41. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/synthesizers/utils.py +0 -0
  42. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym/utils.py +0 -0
  43. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym.egg-info/SOURCES.txt +0 -0
  44. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym.egg-info/dependency_links.txt +0 -0
  45. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym.egg-info/entry_points.txt +0 -0
  46. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym.egg-info/requires.txt +0 -0
  47. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/sdgym.egg-info/top_level.txt +0 -0
  48. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/setup.cfg +0 -0
  49. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/tests/test_scripts.py +0 -0
  50. {sdgym-0.13.2.dev0 → sdgym-0.14.1.dev0}/tests/test_tasks.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdgym
3
- Version: 0.13.2.dev0
3
+ Version: 0.14.1.dev0
4
4
  Summary: Benchmark tabular synthetic data generators using a variety of datasets
5
5
  Author-email: "DataCebo, Inc." <info@sdv.dev>
6
6
  License-Expression: BUSL-1.1
@@ -161,7 +161,7 @@ namespaces = false
161
161
  version = {attr = 'sdgym.__version__'}
162
162
 
163
163
  [tool.bumpversion]
164
- current_version = "0.13.2.dev0"
164
+ current_version = "0.14.1.dev0"
165
165
  parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
166
166
  serialize = [
167
167
  '{major}.{minor}.{patch}.{release}{candidate}',
@@ -8,7 +8,7 @@ __author__ = 'DataCebo, Inc.'
8
8
  __copyright__ = 'Copyright (c) 2022 DataCebo, Inc.'
9
9
  __email__ = 'info@sdv.dev'
10
10
  __license__ = 'BSL-1.1'
11
- __version__ = '0.13.2.dev0'
11
+ __version__ = '0.14.1.dev0'
12
12
 
13
13
  import logging
14
14
 
@@ -64,6 +64,7 @@ from sdgym.utils import (
64
64
  used_memory,
65
65
  )
66
66
 
67
+ TIMEOUT = 345600
67
68
  LOGGER = logging.getLogger(__name__)
68
69
  DEFAULT_SINGLE_TABLE_SYNTHESIZERS = [
69
70
  'GaussianCopulaSynthesizer',
@@ -13,6 +13,7 @@ import yaml
13
13
  from botocore.exceptions import ClientError
14
14
 
15
15
  from sdgym._dataset_utils import _read_zipped_data
16
+ from sdgym.benchmark import TIMEOUT, _add_adjusted_scores
16
17
  from sdgym.utils import _is_list_of_type
17
18
 
18
19
  SYNTHESIZER_BASELINE = 'GaussianCopulaSynthesizer'
@@ -67,26 +68,59 @@ class ResultsHandler(ABC):
67
68
  if folder_name not in all_folders:
68
69
  raise ValueError(f"Folder '{folder_name}' does not exist in the results directory.")
69
70
 
71
+ def _compute_dataset_pareto_frontier(self, dataset_results):
72
+ """Compute whether synthesizers are on the Pareto frontier for the dataset."""
73
+ qualities = dataset_results['Adjusted_Quality_Score'].to_numpy()
74
+ runtimes = dataset_results['Adjusted_Total_Time'].to_numpy()
75
+
76
+ is_on_frontier = []
77
+ for quality, runtime in zip(qualities, runtimes):
78
+ dominated = ((qualities > quality) & (runtimes < runtime)).any()
79
+ is_on_frontier.append(not dominated)
80
+
81
+ return pd.Series(is_on_frontier, index=dataset_results.index)
82
+
83
+ def _compute_pareto_frontier(self, result):
84
+ """Compute whether synthesizers are on the Pareto frontier for all datasets."""
85
+ frontier_masks = []
86
+ for _, dataset_results in result.groupby('Dataset', sort=False):
87
+ dataset_frontier = self._compute_dataset_pareto_frontier(dataset_results)
88
+ dataset_frontier = pd.Series(dataset_frontier, index=dataset_results.index)
89
+ frontier_masks.append(dataset_frontier)
90
+
91
+ frontier_mask = pd.concat(frontier_masks).reindex(result.index)
92
+ return frontier_mask.astype(bool)
93
+
94
+ def _compute_meets_baseline_quality(self, result):
95
+ """Compute whether synthesizers meet or exceed the baseline quality for all datasets."""
96
+ baseline_scores = (
97
+ result
98
+ .loc[
99
+ result['Synthesizer'] == self.baseline_synthesizer,
100
+ ['Dataset', 'Adjusted_Quality_Score'],
101
+ ]
102
+ .drop_duplicates(subset='Dataset')
103
+ .rename(columns={'Adjusted_Quality_Score': 'Baseline_Quality_Score'})
104
+ )
105
+
106
+ result_with_baseline = result.merge(baseline_scores, on='Dataset', how='left')
107
+ meet_baseline = (
108
+ result_with_baseline['Adjusted_Quality_Score']
109
+ >= result_with_baseline['Baseline_Quality_Score']
110
+ )
111
+
112
+ return meet_baseline
113
+
70
114
  def _compute_wins(self, result):
71
- synthesizers = result['Synthesizer'].unique()
72
- datasets = result['Dataset'].unique()
73
- result['Win'] = 0
74
- for dataset in datasets:
75
- score_baseline = result.loc[
76
- (result['Synthesizer'] == self.baseline_synthesizer)
77
- & (result['Dataset'] == dataset)
78
- ]['Quality_Score'].to_numpy()
79
- if score_baseline.size == 0:
80
- continue
115
+ """Compute wins based on baseline quality and Pareto frontier."""
116
+ result['Meets_Baseline_Quality'] = self._compute_meets_baseline_quality(result)
117
+ result['On_Pareto_Frontier'] = self._compute_pareto_frontier(result)
118
+ result['Win'] = (result['Meets_Baseline_Quality'] & result['On_Pareto_Frontier']).astype(
119
+ int
120
+ )
121
+ result = result.drop(columns=['Meets_Baseline_Quality', 'On_Pareto_Frontier'])
81
122
 
82
- for synthesizer in synthesizers:
83
- loc_synthesizer = (result['Synthesizer'] == synthesizer) & (
84
- result['Dataset'] == dataset
85
- )
86
- score_synthesizer = result.loc[loc_synthesizer]['Quality_Score'].to_numpy()
87
- result.loc[loc_synthesizer, 'Win'] = (score_synthesizer > score_baseline).astype(
88
- int
89
- )
123
+ return result
90
124
 
91
125
  def _get_summarize_table(self, folder_to_results, folder_infos):
92
126
  """Create a summary table from the results."""
@@ -99,7 +133,6 @@ class ResultsHandler(ABC):
99
133
  f' - # datasets: {folder_infos[folder]["# datasets"]}'
100
134
  f' - sdgym version: {folder_infos[folder]["sdgym_version"]}'
101
135
  )
102
- results = results.loc[results['Synthesizer'] != self.baseline_synthesizer]
103
136
  column_data = results.groupby(['Synthesizer'])['Win'].sum()
104
137
  columns.append((date_obj, column_name, column_data))
105
138
 
@@ -147,6 +180,21 @@ class ResultsHandler(ABC):
147
180
  aggregated_results = aggregated_results.drop_duplicates(
148
181
  subset=['Dataset', 'Synthesizer'], keep='first'
149
182
  )
183
+ aggregated_results = _add_adjusted_scores(aggregated_results, timeout=TIMEOUT)
184
+
185
+ # Backward compatibility for runs done before graceful degradation logic existed
186
+ fallback_columns = {
187
+ 'Adjusted_Quality_Score': aggregated_results['Quality_Score'],
188
+ 'Adjusted_Total_Time': (
189
+ aggregated_results['Train_Time'] + aggregated_results['Sample_Time']
190
+ ),
191
+ }
192
+ missing_adjusted_columns = [
193
+ column for column in fallback_columns if aggregated_results[column].isna().all()
194
+ ]
195
+ for column in missing_adjusted_columns:
196
+ aggregated_results[column] = fallback_columns[column]
197
+
150
198
  all_synthesizers = aggregated_results['Synthesizer'].unique()
151
199
  dataset_synth_counts = aggregated_results.groupby('Dataset')['Synthesizer'].nunique()
152
200
  valid_datasets = dataset_synth_counts[dataset_synth_counts == len(all_synthesizers)].index
@@ -158,6 +206,9 @@ class ResultsHandler(ABC):
158
206
  )
159
207
 
160
208
  filtered_results = filtered_results.sort_values(by=['Dataset', 'Synthesizer'])
209
+ if missing_adjusted_columns:
210
+ filtered_results = filtered_results.drop(columns=missing_adjusted_columns)
211
+
161
212
  return filtered_results.reset_index(drop=True)
162
213
 
163
214
  def summarize(self, results_folder_name):
@@ -186,7 +237,7 @@ class ResultsHandler(ABC):
186
237
  continue
187
238
 
188
239
  aggregated_results = self._process_results(results)
189
- self._compute_wins(aggregated_results)
240
+ aggregated_results = self._compute_wins(aggregated_results)
190
241
  folder_to_results[folder] = aggregated_results
191
242
  folder_infos = self._get_column_name_infos(folder_to_results)
192
243
 
@@ -10,6 +10,7 @@ from sdgym._benchmark.benchmark import (
10
10
  _benchmark_multi_table_compute_gcp,
11
11
  _benchmark_single_table_compute_gcp,
12
12
  )
13
+ from sdgym.benchmark import TIMEOUT
13
14
  from sdgym.run_benchmark.utils import (
14
15
  KEY_DATE_FILE,
15
16
  OUTPUT_DESTINATION_AWS,
@@ -195,7 +196,7 @@ def main():
195
196
  credential_filepath=os.getenv('CREDENTIALS_FILEPATH'),
196
197
  synthesizers=synthesizers,
197
198
  sdv_datasets=datasets,
198
- timeout=345600, # 4 days
199
+ timeout=TIMEOUT, # 4 days
199
200
  )
200
201
 
201
202
  append_benchmark_run(aws_access_key_id, aws_secret_access_key, date_str, modality=modality)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdgym
3
- Version: 0.13.2.dev0
3
+ Version: 0.14.1.dev0
4
4
  Summary: Benchmark tabular synthetic data generators using a variety of datasets
5
5
  Author-email: "DataCebo, Inc." <info@sdv.dev>
6
6
  License-Expression: BUSL-1.1
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes