sdgym 0.12.2.dev0__tar.gz → 0.13.1.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. {sdgym-0.12.2.dev0/sdgym.egg-info → sdgym-0.13.1.dev0}/PKG-INFO +31 -19
  2. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/pyproject.toml +32 -20
  3. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/__init__.py +1 -1
  4. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/_benchmark/credentials_utils.py +1 -1
  5. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/benchmark.py +12 -12
  6. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/cli/__main__.py +4 -4
  7. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/cli/summary.py +6 -6
  8. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/result_writer.py +2 -0
  9. sdgym-0.13.1.dev0/sdgym/run_benchmark/run_benchmark.py +206 -0
  10. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/run_benchmark/upload_benchmark_results.py +20 -14
  11. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/run_benchmark/utils.py +41 -16
  12. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/utils.py +9 -0
  13. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0/sdgym.egg-info}/PKG-INFO +31 -19
  14. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym.egg-info/requires.txt +29 -13
  15. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/tests/test_tasks.py +1 -1
  16. sdgym-0.12.2.dev0/sdgym/run_benchmark/run_benchmark.py +0 -152
  17. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/LICENSE +0 -0
  18. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/README.md +0 -0
  19. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/_benchmark/__init__.py +0 -0
  20. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/_benchmark/benchmark.py +0 -0
  21. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/_benchmark/config_utils.py +0 -0
  22. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/_dataset_utils.py +0 -0
  23. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/cli/__init__.py +0 -0
  24. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/cli/collect.py +0 -0
  25. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/cli/utils.py +0 -0
  26. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/dataset_explorer.py +0 -0
  27. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/datasets.py +0 -0
  28. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/errors.py +0 -0
  29. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/metrics.py +0 -0
  30. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/progress.py +0 -0
  31. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/result_explorer/__init__.py +0 -0
  32. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/result_explorer/result_explorer.py +0 -0
  33. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/result_explorer/result_handler.py +0 -0
  34. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/run_benchmark/__init__.py +0 -0
  35. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/s3.py +0 -0
  36. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/synthesizer_descriptions.yaml +0 -0
  37. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/synthesizers/__init__.py +0 -0
  38. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/synthesizers/base.py +0 -0
  39. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/synthesizers/column.py +0 -0
  40. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/synthesizers/generate.py +0 -0
  41. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/synthesizers/identity.py +0 -0
  42. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/synthesizers/realtabformer.py +0 -0
  43. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/synthesizers/sdv.py +0 -0
  44. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/synthesizers/uniform.py +0 -0
  45. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym/synthesizers/utils.py +0 -0
  46. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym.egg-info/SOURCES.txt +0 -0
  47. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym.egg-info/dependency_links.txt +0 -0
  48. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym.egg-info/entry_points.txt +0 -0
  49. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/sdgym.egg-info/top_level.txt +0 -0
  50. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/setup.cfg +0 -0
  51. {sdgym-0.12.2.dev0 → sdgym-0.13.1.dev0}/tests/test_scripts.py +0 -0
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdgym
3
- Version: 0.12.2.dev0
3
+ Version: 0.13.1.dev0
4
4
  Summary: Benchmark tabular synthetic data generators using a variety of datasets
5
5
  Author-email: "DataCebo, Inc." <info@sdv.dev>
6
- License: BSL-1.1
6
+ License-Expression: BUSL-1.1
7
7
  Project-URL: Source Code, https://github.com/sdv-dev/SDGym/
8
8
  Project-URL: Issue Tracker, https://github.com/sdv-dev/SDGym/issues
9
9
  Project-URL: Changes, https://github.com/sdv-dev/SDGym/blob/main/HISTORY.md
@@ -12,7 +12,6 @@ Project-URL: Chat, https://bit.ly/sdv-slack-invite
12
12
  Keywords: machine learning,synthetic data generation,benchmark,generative models
13
13
  Classifier: Development Status :: 2 - Pre-Alpha
14
14
  Classifier: Intended Audience :: Developers
15
- Classifier: License :: Free for non-commercial use
16
15
  Classifier: Natural Language :: English
17
16
  Classifier: Programming Language :: Python :: 3
18
17
  Classifier: Programming Language :: Python :: 3.9
@@ -20,44 +19,57 @@ Classifier: Programming Language :: Python :: 3.10
20
19
  Classifier: Programming Language :: Python :: 3.11
21
20
  Classifier: Programming Language :: Python :: 3.12
22
21
  Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Programming Language :: Python :: 3.14
23
23
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
- Requires-Python: <3.14,>=3.9
24
+ Requires-Python: <3.15,>=3.9
25
25
  Description-Content-Type: text/markdown
26
26
  License-File: LICENSE
27
27
  Requires-Dist: appdirs>=1.3
28
28
  Requires-Dist: boto3<2,>=1.28
29
29
  Requires-Dist: botocore<2,>=1.31
30
- Requires-Dist: cloudpickle>=2.1.0
30
+ Requires-Dist: cloudpickle>=2.1.0; python_version < "3.14"
31
+ Requires-Dist: cloudpickle>=3.1.1; python_version >= "3.14"
31
32
  Requires-Dist: compress-pickle>=1.2.0
32
- Requires-Dist: google-cloud-compute>=1.0.0
33
- Requires-Dist: google-auth>=2.0.0
33
+ Requires-Dist: google-cloud-compute>=1.30.0
34
+ Requires-Dist: google-auth>=2.14.1
34
35
  Requires-Dist: humanfriendly>=10.0
35
36
  Requires-Dist: numpy>=1.22.2; python_version < "3.10"
36
37
  Requires-Dist: numpy>=1.24.0; python_version >= "3.10" and python_version < "3.12"
37
38
  Requires-Dist: numpy>=1.26.0; python_version >= "3.12" and python_version < "3.13"
38
- Requires-Dist: numpy>=2.1.0; python_version >= "3.13"
39
+ Requires-Dist: numpy>=2.1.0; python_version >= "3.13" and python_version < "3.14"
40
+ Requires-Dist: numpy>=2.3.2; python_version >= "3.14"
39
41
  Requires-Dist: openpyxl>=3.1.2
40
- Requires-Dist: pandas<3.0.0,>=1.4.0; python_version < "3.11"
41
- Requires-Dist: pandas<3.0.0,>=1.5.0; python_version >= "3.11" and python_version < "3.12"
42
- Requires-Dist: pandas<3.0.0,>=2.1.1; python_version >= "3.12" and python_version < "3.13"
43
- Requires-Dist: pandas<3.0.0,>=2.2.3; python_version >= "3.13"
44
- Requires-Dist: psutil>=5.7
42
+ Requires-Dist: pandas<3,>=1.4.0; python_version < "3.11"
43
+ Requires-Dist: pandas<3,>=1.5.0; python_version >= "3.11" and python_version < "3.12"
44
+ Requires-Dist: pandas<3,>=2.1.1; python_version >= "3.12" and python_version < "3.13"
45
+ Requires-Dist: pandas<3,>=2.2.3; python_version >= "3.13" and python_version < "3.14"
46
+ Requires-Dist: pandas<3,>=2.3.3; python_version >= "3.14"
47
+ Requires-Dist: psutil>=5.8
45
48
  Requires-Dist: scikit-learn>=1.0.2; python_version < "3.10"
46
49
  Requires-Dist: scikit-learn>=1.1.0; python_version >= "3.10" and python_version < "3.11"
47
50
  Requires-Dist: scikit-learn>=1.1.3; python_version >= "3.11" and python_version < "3.12"
48
51
  Requires-Dist: scikit-learn>=1.3.1; python_version >= "3.12" and python_version < "3.13"
49
- Requires-Dist: scikit-learn>=1.5.2; python_version >= "3.13"
52
+ Requires-Dist: scikit-learn>=1.5.2; python_version >= "3.13" and python_version < "3.14"
53
+ Requires-Dist: scikit-learn>=1.8.0; python_version >= "3.14"
50
54
  Requires-Dist: scipy>=1.7.3; python_version < "3.10"
51
55
  Requires-Dist: scipy>=1.9.2; python_version >= "3.10" and python_version < "3.12"
52
56
  Requires-Dist: scipy>=1.12.0; python_version >= "3.12" and python_version < "3.13"
53
- Requires-Dist: scipy>=1.14.1; python_version >= "3.13"
57
+ Requires-Dist: scipy>=1.14.1; python_version >= "3.13" and python_version < "3.14"
58
+ Requires-Dist: scipy>=1.16.1; python_version >= "3.14"
54
59
  Requires-Dist: tabulate<0.9,>=0.8.3
55
- Requires-Dist: torch>=2.6.0
60
+ Requires-Dist: torch>=1.13.0; python_version < "3.11"
61
+ Requires-Dist: torch>=2.0.0; python_version >= "3.11" and python_version < "3.12"
62
+ Requires-Dist: torch>=2.3.0; python_version >= "3.12" and python_version < "3.13"
63
+ Requires-Dist: torch>=2.6.0; python_version >= "3.13" and python_version < "3.14"
64
+ Requires-Dist: torch>=2.9.0; python_version >= "3.14"
56
65
  Requires-Dist: tqdm>=4.66.3
57
66
  Requires-Dist: XlsxWriter>=1.2.8
58
- Requires-Dist: rdt>=1.17.0
59
- Requires-Dist: sdmetrics>=0.20.1
60
- Requires-Dist: sdv>=1.21.0
67
+ Requires-Dist: rdt>=1.18.2; python_version < "3.14"
68
+ Requires-Dist: rdt>=1.20.0; python_version >= "3.14"
69
+ Requires-Dist: sdmetrics>=0.21.0; python_version < "3.14"
70
+ Requires-Dist: sdmetrics>=0.26.0; python_version >= "3.14"
71
+ Requires-Dist: sdv>=1.21.0; python_version < "3.14"
72
+ Requires-Dist: sdv>=1.33.0; python_version >= "3.14"
61
73
  Provides-Extra: dask
62
74
  Requires-Dist: dask; extra == "dask"
63
75
  Requires-Dist: distributed; extra == "dask"
@@ -5,7 +5,6 @@ authors = [{ name = 'DataCebo, Inc.', email = 'info@sdv.dev' }]
5
5
  classifiers = [
6
6
  'Development Status :: 2 - Pre-Alpha',
7
7
  'Intended Audience :: Developers',
8
- 'License :: Free for non-commercial use',
9
8
  'Natural Language :: English',
10
9
  'Programming Language :: Python :: 3',
11
10
  'Programming Language :: Python :: 3.9',
@@ -13,48 +12,62 @@ classifiers = [
13
12
  'Programming Language :: Python :: 3.11',
14
13
  'Programming Language :: Python :: 3.12',
15
14
  'Programming Language :: Python :: 3.13',
15
+ 'Programming Language :: Python :: 3.14',
16
16
  'Topic :: Scientific/Engineering :: Artificial Intelligence',
17
17
  ]
18
18
  keywords = ['machine learning', 'synthetic data generation', 'benchmark', 'generative models']
19
19
  dynamic = ['version']
20
- license = { text = 'BSL-1.1' }
21
- requires-python = '>=3.9,<3.14'
20
+ license = 'BUSL-1.1'
21
+ license-files = ['LICENSE']
22
+ requires-python = '>=3.9,<3.15'
22
23
  readme = 'README.md'
23
24
  dependencies = [
24
25
  'appdirs>=1.3',
25
26
  'boto3>=1.28,<2',
26
27
  'botocore>=1.31,<2',
27
- 'cloudpickle>=2.1.0',
28
+ "cloudpickle>=2.1.0;python_version<'3.14'",
29
+ "cloudpickle>=3.1.1;python_version>='3.14'",
28
30
  'compress-pickle>=1.2.0',
29
- 'google-cloud-compute>=1.0.0',
30
- 'google-auth>=2.0.0',
31
+ 'google-cloud-compute>=1.30.0',
32
+ 'google-auth>=2.14.1',
31
33
  'humanfriendly>=10.0',
32
34
  "numpy>=1.22.2;python_version<'3.10'",
33
35
  "numpy>=1.24.0;python_version>='3.10' and python_version<'3.12'",
34
36
  "numpy>=1.26.0;python_version>='3.12' and python_version<'3.13'",
35
- "numpy>=2.1.0;python_version>='3.13'",
37
+ "numpy>=2.1.0;python_version>='3.13' and python_version<'3.14'",
38
+ "numpy>=2.3.2;python_version>='3.14'",
36
39
  'openpyxl>=3.1.2',
37
- "pandas>=1.4.0,<3.0.0;python_version<'3.11'",
38
- "pandas>=1.5.0,<3.0.0;python_version>='3.11' and python_version<'3.12'",
39
- "pandas>=2.1.1,<3.0.0;python_version>='3.12' and python_version<'3.13'",
40
- "pandas>=2.2.3,<3.0.0;python_version>='3.13'",
41
- 'psutil>=5.7',
40
+ "pandas>=1.4.0,<3;python_version<'3.11'",
41
+ "pandas>=1.5.0,<3;python_version>='3.11' and python_version<'3.12'",
42
+ "pandas>=2.1.1,<3;python_version>='3.12' and python_version<'3.13'",
43
+ "pandas>=2.2.3,<3;python_version>='3.13' and python_version<'3.14'",
44
+ "pandas>=2.3.3,<3;python_version>='3.14'",
45
+ 'psutil>=5.8',
42
46
  "scikit-learn>=1.0.2;python_version<'3.10'",
43
47
  "scikit-learn>=1.1.0;python_version>='3.10' and python_version<'3.11'",
44
48
  "scikit-learn>=1.1.3;python_version>='3.11' and python_version<'3.12'",
45
49
  "scikit-learn>=1.3.1;python_version>='3.12' and python_version<'3.13'",
46
- "scikit-learn>=1.5.2;python_version>='3.13'",
50
+ "scikit-learn>=1.5.2;python_version>='3.13' and python_version<'3.14'",
51
+ "scikit-learn>=1.8.0;python_version>='3.14'",
47
52
  "scipy>=1.7.3;python_version<'3.10'",
48
53
  "scipy>=1.9.2;python_version>='3.10' and python_version<'3.12'",
49
54
  "scipy>=1.12.0;python_version>='3.12' and python_version<'3.13'",
50
- "scipy>=1.14.1;python_version>='3.13'",
55
+ "scipy>=1.14.1;python_version>='3.13' and python_version<'3.14'",
56
+ "scipy>=1.16.1;python_version>='3.14'",
51
57
  'tabulate>=0.8.3,<0.9',
52
- "torch>=2.6.0",
58
+ "torch>=1.13.0;python_version<'3.11'",
59
+ "torch>=2.0.0;python_version>='3.11' and python_version<'3.12'",
60
+ "torch>=2.3.0;python_version>='3.12' and python_version<'3.13'",
61
+ "torch>=2.6.0;python_version>='3.13' and python_version<'3.14'",
62
+ "torch>=2.9.0;python_version>='3.14'",
53
63
  'tqdm>=4.66.3',
54
64
  'XlsxWriter>=1.2.8',
55
- 'rdt>=1.17.0',
56
- 'sdmetrics>=0.20.1',
57
- 'sdv>=1.21.0',
65
+ "rdt>=1.18.2;python_version<'3.14'",
66
+ "rdt>=1.20.0;python_version>='3.14'",
67
+ "sdmetrics>=0.21.0;python_version<'3.14'",
68
+ "sdmetrics>=0.26.0;python_version>='3.14'",
69
+ "sdv>=1.21.0;python_version<'3.14'",
70
+ "sdv>=1.33.0;python_version>='3.14'",
58
71
  ]
59
72
 
60
73
  [project.urls]
@@ -113,7 +126,6 @@ all = [
113
126
 
114
127
  [tool.setuptools]
115
128
  include-package-data = true
116
- license-files = ['LICENSE']
117
129
 
118
130
  [tool.setuptools.packages.find]
119
131
  include = ['sdgym', 'sdgym.*']
@@ -149,7 +161,7 @@ namespaces = false
149
161
  version = {attr = 'sdgym.__version__'}
150
162
 
151
163
  [tool.bumpversion]
152
- current_version = "0.12.2.dev0"
164
+ current_version = "0.13.1.dev0"
153
165
  parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
154
166
  serialize = [
155
167
  '{major}.{minor}.{patch}.{release}{candidate}',
@@ -8,7 +8,7 @@ __author__ = 'DataCebo, Inc.'
8
8
  __copyright__ = 'Copyright (c) 2022 DataCebo, Inc.'
9
9
  __email__ = 'info@sdv.dev'
10
10
  __license__ = 'BSL-1.1'
11
- __version__ = '0.12.2.dev0'
11
+ __version__ = '0.13.1.dev0'
12
12
 
13
13
  import logging
14
14
 
@@ -74,7 +74,7 @@ def sdv_install_cmd(credentials):
74
74
  pip install sdv-installer
75
75
 
76
76
  python -c "from sdv_installer.installation.installer import install_packages; \\
77
- install_packages(username='{username}', license_key='{license_key}', package='sdv-enterprise')"
77
+ install_packages(username='{username}', license_key='{license_key}')"
78
78
  """)
79
79
 
80
80
 
@@ -514,7 +514,7 @@ def _compute_scores(
514
514
  for metric_name, metric in metrics.items():
515
515
  scores.append({
516
516
  'metric': metric_name,
517
- 'error': 'Metric Timeout',
517
+ 'Error': 'Metric Timeout',
518
518
  })
519
519
  # re-inject list to multiprocessing output
520
520
  output['scores'] = scores
@@ -537,7 +537,7 @@ def _compute_scores(
537
537
  scores[-1].update({
538
538
  'score': score,
539
539
  'normalized_score': normalized_score,
540
- 'error': error,
540
+ 'Error': error,
541
541
  'metric_time': calculate_score_time(start),
542
542
  })
543
543
  # re-inject list to multiprocessing output
@@ -603,7 +603,7 @@ def _score(
603
603
  output = {}
604
604
 
605
605
  output['timeout'] = True # To be deleted if there is no error
606
- output['error'] = 'Load Timeout' # To be deleted if there is no error
606
+ output['Error'] = 'Load Timeout' # To be deleted if there is no error
607
607
  try:
608
608
  LOGGER.info(
609
609
  'Running %s on %s dataset %s; %s',
@@ -615,7 +615,7 @@ def _score(
615
615
 
616
616
  output['dataset_size'] = get_size_of(data) / N_BYTES_IN_MB
617
617
  # To be deleted if there is no error
618
- output['error'] = 'Synthesizer Timeout'
618
+ output['Error'] = 'Synthesizer Timeout'
619
619
 
620
620
  try:
621
621
  synthetic_data, train_time, sample_time, synthesizer_size, peak_memory = _synthesize(
@@ -642,7 +642,7 @@ def _score(
642
642
  )
643
643
 
644
644
  # No error so far. _compute_scores tracks its own errors by metric
645
- del output['error']
645
+ del output['Error']
646
646
  _compute_scores(
647
647
  metrics,
648
648
  data,
@@ -671,14 +671,14 @@ def _score(
671
671
  output['peak_memory'] = err.peak_memory
672
672
 
673
673
  output['exception'] = err.exception
674
- output['error'] = err.error
674
+ output['Error'] = err.error
675
675
  output['timeout'] = False
676
676
 
677
677
  except Exception:
678
678
  LOGGER.exception('Error running %s on dataset %s;', synthesizer['name'], dataset_name)
679
679
  exception, error = format_exception()
680
680
  output['exception'] = exception
681
- output['error'] = error
681
+ output['Error'] = error
682
682
  output['timeout'] = False # There was no timeout
683
683
 
684
684
  finally:
@@ -744,7 +744,7 @@ def _score_with_timeout(
744
744
  thread.join(timeout)
745
745
  if thread.is_alive():
746
746
  LOGGER.error('Timeout running %s on dataset %s;', synthesizer['name'], dataset_name)
747
- return {'timeout': True, 'error': 'Synthesizer Timeout'}
747
+ return {'timeout': True, 'Error': 'Synthesizer Timeout'}
748
748
 
749
749
  return output
750
750
 
@@ -815,8 +815,8 @@ def _format_output(
815
815
  for score in output.get('scores', []):
816
816
  scores.insert(len(scores.columns), score['metric'], score['normalized_score'])
817
817
 
818
- if 'error' in output:
819
- scores['error'] = output['error']
818
+ if 'Error' in output:
819
+ scores['Error'] = output['Error']
820
820
 
821
821
  return scores
822
822
 
@@ -1085,8 +1085,8 @@ def _add_adjusted_scores(scores, timeout):
1085
1085
 
1086
1086
  fit_times = scores.loc[dataset_mask, 'Train_Time'].fillna(0)
1087
1087
  sample_times = scores.loc[dataset_mask, 'Sample_Time'].fillna(0)
1088
- if 'error' in scores.columns:
1089
- errors = scores.loc[dataset_mask, 'error']
1088
+ if 'Error' in scores.columns:
1089
+ errors = scores.loc[dataset_mask, 'Error']
1090
1090
  else:
1091
1091
  errors = pd.Series([None] * dataset_mask.sum(), index=scores.index[dataset_mask])
1092
1092
 
@@ -39,13 +39,13 @@ def _print_table(data, sort=None, reverse=False, format=None):
39
39
  for field, formatter in format.items():
40
40
  data[field] = data[field].apply(formatter)
41
41
 
42
- if 'error' in data:
43
- error = data['error']
42
+ if 'Error' in data:
43
+ error = data['Error']
44
44
  if pd.isna(error).all():
45
- del data['error']
45
+ del data['Error']
46
46
  else:
47
47
  long_error = error.str.len() > 30
48
- data.loc[long_error, 'error'] = error[long_error].str[:30] + '...'
48
+ data.loc[long_error, 'Error'] = error[long_error].str[:30] + '...'
49
49
 
50
50
  print(tabulate.tabulate(data, tablefmt='github', headers=data.columns, showindex=False)) # noqa: T201
51
51
 
@@ -35,11 +35,11 @@ def preprocess(data):
35
35
  bydataset = grouped.mean()
36
36
  data = bydataset.reset_index()
37
37
 
38
- if 'error' in data.columns:
38
+ if 'Error' in data.columns:
39
39
  errors = data.error.fillna('')
40
40
  for message, column in KNOWN_ERRORS:
41
41
  data[column] = errors.str.contains(message)
42
- data.loc[data[column], 'error'] = np.nan
42
+ data.loc[data[column], 'Error'] = np.nan
43
43
 
44
44
  return data
45
45
 
@@ -122,7 +122,7 @@ def summarize(data, baselines=(), datasets=None):
122
122
  baseline_scores = baseline_data.set_index('Dataset').Quality_Score
123
123
  results[f'beat_{baseline.lower()}'] = _beat_baseline(data, baseline_scores)
124
124
 
125
- if 'error' in data.columns:
125
+ if 'Error' in data.columns:
126
126
  grouped = data.groupby('Synthesizer')
127
127
  for _, error_column in KNOWN_ERRORS:
128
128
  results[error_column] = grouped[error_column].sum()
@@ -135,7 +135,7 @@ def summarize(data, baselines=(), datasets=None):
135
135
 
136
136
 
137
137
def _error_counts(data):
    """Count how many times each error message occurs in the results.

    Args:
        data (pandas.DataFrame):
            Benchmark results. Error messages, if any, are read from the
            ``Error`` column.

    Returns:
        pandas.Series or int:
            The number of occurrences of each distinct error message, or ``0``
            when ``data`` has no ``Error`` column.
    """
    if 'Error' in data.columns:
        # The column is spelled ``Error`` (capitalised), so attribute access
        # through ``data.error`` would raise AttributeError. Index by label.
        return data['Error'].value_counts()

    return 0
141
141
 
@@ -158,8 +158,8 @@ def errors_summary(data):
158
158
  Returns:
159
159
  pandas.DataFrame
160
160
  """
161
- if 'error' in data.columns:
162
- all_errors = pd.DataFrame(_error_counts(data)).rename(columns={'error': 'all'})
161
+ if 'Error' in data.columns:
162
+ all_errors = pd.DataFrame(_error_counts(data)).rename(columns={'Error': 'all'})
163
163
  synthesizer_errors = data.groupby('Synthesizer').apply(_error_counts).pivot_table(level=0)
164
164
  for synthesizer, errors in synthesizer_errors.items():
165
165
  all_errors[synthesizer] = errors.fillna(0).astype(int)
@@ -12,6 +12,7 @@ import yaml
12
12
  from openpyxl import load_workbook
13
13
 
14
14
  from sdgym.s3 import parse_s3_path
15
+ from sdgym.utils import _set_column_width
15
16
 
16
17
 
17
18
  class ResultsWriter(ABC):
@@ -79,6 +80,7 @@ class LocalResultsWriter:
79
80
  with writer:
80
81
  for sheet_name, df in data.items():
81
82
  df.to_excel(writer, sheet_name=sheet_name, index=index)
83
+ _set_column_width(writer, df, sheet_name)
82
84
 
83
85
  wb = load_workbook(file_path)
84
86
  for sheet_name in reversed(data.keys()):
@@ -0,0 +1,206 @@
1
+ """Script to run a benchmark and upload results to S3."""
2
+
3
+ import json
4
+ import os
5
+ from datetime import datetime, timezone
6
+
7
+ from botocore.exceptions import ClientError
8
+
9
+ from sdgym._benchmark.benchmark import (
10
+ _benchmark_multi_table_compute_gcp,
11
+ _benchmark_single_table_compute_gcp,
12
+ )
13
+ from sdgym.run_benchmark.utils import (
14
+ KEY_DATE_FILE,
15
+ OUTPUT_DESTINATION_AWS,
16
+ _exclude_datasets,
17
+ _parse_args,
18
+ get_result_folder_name,
19
+ post_benchmark_launch_message,
20
+ )
21
+ from sdgym.s3 import get_s3_client, parse_s3_path
22
+
23
+ SINGLE_TABLE_DATASETS = [
24
+ 'adult',
25
+ 'alarm',
26
+ 'census',
27
+ 'child',
28
+ 'covtype',
29
+ 'expedia_hotel_logs',
30
+ 'insurance',
31
+ 'intrusion',
32
+ 'news',
33
+ ]
34
+ MULTI_TABLE_DATASETS = [
35
+ 'WebKP',
36
+ 'DCG',
37
+ 'UW_std',
38
+ 'Same_gen',
39
+ 'CORA',
40
+ 'got_families',
41
+ 'SalesDB',
42
+ 'UTube',
43
+ 'Student_loan',
44
+ 'Hepatitis_std',
45
+ 'Elti',
46
+ 'Bupa',
47
+ 'Toxicology',
48
+ 'imdb_ijs',
49
+ 'ftp',
50
+ 'imdb_small',
51
+ 'imdb_MovieLens',
52
+ 'Pima',
53
+ 'university',
54
+ 'legalActs',
55
+ 'Dunur',
56
+ 'Mesh',
57
+ 'world',
58
+ 'airbnb-simplified',
59
+ 'trains',
60
+ 'FNHK',
61
+ 'fake_hotels',
62
+ 'SAT',
63
+ 'genes',
64
+ 'Biodegradability',
65
+ 'Pyrimidine',
66
+ 'mutagenesis',
67
+ 'restbase',
68
+ 'Triazine',
69
+ 'Carcinogenesis',
70
+ 'fake_hotels_extended',
71
+ 'Mooney_Family',
72
+ 'PTE',
73
+ 'Facebook',
74
+ 'multi_table_ID_demo_dataset',
75
+ 'SAP',
76
+ 'Chess',
77
+ 'Countries',
78
+ 'NCAA',
79
+ 'Atherosclerosis',
80
+ 'nations',
81
+ 'TubePricing',
82
+ 'financial',
83
+ 'Accidents',
84
+ 'MuskSmall',
85
+ 'NBA',
86
+ 'AustralianFootball',
87
+ 'PremierLeague',
88
+ 'OMOP_CDM_dayz',
89
+ ]
90
+
91
+
92
def _get_benchmark_setup(modality):
    """Get the benchmark setup for a given modality.

    The setup includes the method to run the benchmark and the job split,
    which is a list of tuples where each tuple contains a list of synthesizers and
    a list of datasets to run those synthesizers on.

    Args:
        modality (str):
            Either ``'single_table'`` or ``'multi_table'``.

    Returns:
        dict:
            A dictionary with the keys ``'method'`` (the callable that runs the
            benchmark) and ``'job_split'`` (the list of
            ``(synthesizers, datasets)`` tuples).

    Raises:
        ValueError:
            If ``modality`` is not one of the supported values.
    """
    if modality == 'single_table':
        real_tab_former_to_exclude = ['covtype', 'intrusion', 'expedia_hotel_logs', 'census']
        gan_to_exclude = ['covtype', 'intrusion']
        job_split = [
            (['ColumnSynthesizer', 'GaussianCopulaSynthesizer'], SINGLE_TABLE_DATASETS),
            (['TVAESynthesizer'], SINGLE_TABLE_DATASETS),
            (['SegmentSynthesizer'], SINGLE_TABLE_DATASETS),
            (['XGCSynthesizer'], SINGLE_TABLE_DATASETS),
            (['BootstrapSynthesizer'], SINGLE_TABLE_DATASETS),
            (['CTGANSynthesizer'], _exclude_datasets(SINGLE_TABLE_DATASETS, gan_to_exclude)),
            (['CopulaGANSynthesizer'], _exclude_datasets(SINGLE_TABLE_DATASETS, gan_to_exclude)),
            (
                ['RealTabFormerSynthesizer'],
                _exclude_datasets(SINGLE_TABLE_DATASETS, real_tab_former_to_exclude),
            ),
        ]
        # Every dataset excluded from a shared job above is given a job of its own.
        for dataset in real_tab_former_to_exclude:
            job_split.append((['RealTabFormerSynthesizer'], [dataset]))

        for dataset in gan_to_exclude:
            job_split.append((['CTGANSynthesizer'], [dataset]))
            job_split.append((['CopulaGANSynthesizer'], [dataset]))

        return {
            'method': _benchmark_single_table_compute_gcp,
            'job_split': job_split,
        }

    if modality == 'multi_table':
        hma_to_exclude = [
            'Accidents',
            'AustralianFootball',
            'Countries',
            'MuskSmall',
            'NBA',
            'OMOP_CDM_dayz',
            'PremierLeague',
            'SalesDB',
            'airbnb-simplified',
            'imdb_ijs',
            'legalActs',
            'SAP',
            'imdb_MovieLens',
        ]
        job_split = [
            (['HSASynthesizer', 'IndependentSynthesizer'], MULTI_TABLE_DATASETS),
            (['HMASynthesizer'], _exclude_datasets(MULTI_TABLE_DATASETS, hma_to_exclude)),
        ]
        # Same pattern as single table: one dedicated HMA job per excluded dataset.
        for dataset in hma_to_exclude:
            job_split.append((['HMASynthesizer'], [dataset]))

        return {
            'method': _benchmark_multi_table_compute_gcp,
            'job_split': job_split,
        }

    # Fail loudly here instead of returning None, which would only surface later
    # as an opaque TypeError when the caller subscripts the result.
    raise ValueError(
        f"Unsupported modality '{modality}'. Expected 'single_table' or 'multi_table'."
    )
154
+
155
+
156
def append_benchmark_run(
    aws_access_key_id, aws_secret_access_key, date_str, modality='single_table'
):
    """Append a new benchmark run to the benchmark dates file in S3.

    The dates file is read from ``<prefix><modality>/<KEY_DATE_FILE>`` under
    ``OUTPUT_DESTINATION_AWS``, the new run is added, the runs are sorted by
    date and the file is written back to the same key.

    Args:
        aws_access_key_id (str):
            AWS access key id passed to ``get_s3_client``.
        aws_secret_access_key (str):
            AWS secret access key passed to ``get_s3_client``.
        date_str (str):
            Date of the run; also used to derive the result folder name.
        modality (str):
            Benchmark modality whose dates file is updated.
            Defaults to ``'single_table'``.

    Raises:
        RuntimeError:
            If the dates file cannot be read for any reason other than the key
            not existing yet.
    """
    s3_client = get_s3_client(
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS)
    key = f'{prefix}{modality}/{KEY_DATE_FILE}'
    try:
        response = s3_client.get_object(Bucket=bucket, Key=key)
        body = response['Body'].read().decode('utf-8')
        data = json.loads(body)
    except ClientError as e:
        if e.response['Error']['Code'] != 'NoSuchKey':
            # Chain the original error so the S3 failure details are not lost.
            raise RuntimeError(f'Failed to read {KEY_DATE_FILE} from S3: {e}') from e

        # No dates file yet: start a fresh one.
        data = {'runs': []}

    data['runs'].append({'date': date_str, 'folder_name': get_result_folder_name(date_str)})
    data['runs'] = sorted(data['runs'], key=lambda x: x['date'])
    s3_client.put_object(
        Bucket=bucket,
        Key=key,
        Body=json.dumps(data).encode('utf-8'),
    )
182
+
183
+
184
def main():
    """Run the benchmark jobs for the requested modality and record the run.

    Reads the modality from the parsed command line arguments, calls the
    benchmark method once per ``(synthesizers, datasets)`` job, then appends the
    run to the benchmark dates file and posts the launch message.
    """
    modality = _parse_args().modality
    access_key = os.getenv('AWS_ACCESS_KEY_ID')
    secret_key = os.getenv('AWS_SECRET_ACCESS_KEY')
    # The run date (UTC) is taken before any job is started.
    run_date = datetime.now(timezone.utc).strftime('%Y-%m-%d')

    setup = _get_benchmark_setup(modality)
    run_job = setup['method']
    four_days_in_seconds = 4 * 24 * 60 * 60
    for synthesizers, datasets in setup['job_split']:
        run_job(
            output_destination=OUTPUT_DESTINATION_AWS,
            credential_filepath=os.getenv('CREDENTIALS_FILEPATH'),
            synthesizers=synthesizers,
            sdv_datasets=datasets,
            timeout=four_days_in_seconds,
        )

    append_benchmark_run(access_key, secret_key, run_date, modality=modality)
    post_benchmark_launch_message(run_date, compute_service='GCP', modality=modality)
203
+
204
+
205
+ if __name__ == '__main__':
206
+ main()
@@ -29,6 +29,7 @@ from sdgym.run_benchmark.utils import (
29
29
  get_df_to_plot,
30
30
  )
31
31
  from sdgym.s3 import S3_REGION, parse_s3_path
32
+ from sdgym.utils import _set_column_width
32
33
 
33
34
  LOGGER = logging.getLogger(__name__)
34
35
  SYNTHESIZER_TO_GLOBAL_POSITION = {
@@ -231,7 +232,6 @@ def get_model_details(summary, results, df_to_plot, modality):
231
232
  with open(SYNTHESIZER_DESCRIPTION_PATH, 'r', encoding='utf-8') as f:
232
233
  synthesizer_info = yaml.safe_load(f) or {}
233
234
 
234
- err_column = 'error' if 'error' in results.columns else 'Error'
235
235
  paretos_synthesizers = (
236
236
  df_to_plot.loc[df_to_plot['Pareto'].eq(True), 'Synthesizer'].astype(str).add('Synthesizer')
237
237
  )
@@ -258,18 +258,23 @@ def get_model_details(summary, results, df_to_plot, modality):
258
258
  model_details['Number of datasets - Wins'] = (
259
259
  model_details['Synthesizer'].map(wins).fillna(0).astype(int)
260
260
  )
261
- timeout_counts = (
262
- results
263
- .loc[results[err_column].eq('Synthesizer Timeout')]
264
- .groupby('Synthesizer')['Dataset']
265
- .nunique()
266
- )
267
- error_counts = (
268
- results
269
- .loc[results[err_column].notna() & ~results[err_column].eq('Synthesizer Timeout')]
270
- .groupby('Synthesizer')['Dataset']
271
- .nunique()
272
- )
261
+ if 'Error' in results.columns:
262
+ timeout_counts = (
263
+ results
264
+ .loc[results['Error'].eq('Synthesizer Timeout')]
265
+ .groupby('Synthesizer')['Dataset']
266
+ .nunique()
267
+ )
268
+ error_counts = (
269
+ results
270
+ .loc[results['Error'].notna() & ~results['Error'].eq('Synthesizer Timeout')]
271
+ .groupby('Synthesizer')['Dataset']
272
+ .nunique()
273
+ )
274
+ else:
275
+ timeout_counts = pd.Series(0, index=model_details['Synthesizer'])
276
+ error_counts = pd.Series(0, index=model_details['Synthesizer'])
277
+
273
278
  model_details['Number of datasets - Timeout'] = (
274
279
  model_details['Synthesizer'].map(timeout_counts).fillna(0).astype(int)
275
280
  )
@@ -313,7 +318,8 @@ def update_table_aws(s3_client, bucket, filename, table, reference_column):
313
318
  updated_table = pd.concat([existing_table, table], ignore_index=True)
314
319
  output = io.BytesIO()
315
320
  with pd.ExcelWriter(output, engine='openpyxl') as writer:
316
- updated_table.to_excel(writer, index=False)
321
+ updated_table.to_excel(writer, index=False, sheet_name='Sheet1')
322
+ _set_column_width(writer, updated_table, 'Sheet1')
317
323
 
318
324
  output.seek(0)
319
325
  s3_client.upload_fileobj(output, bucket, filename)
@@ -6,13 +6,15 @@ from datetime import datetime
6
6
  from urllib.parse import parse_qs, quote_plus, urlparse
7
7
 
8
8
  import numpy as np
9
+ import pandas as pd
10
+ from scipy.interpolate import interp1d
9
11
  from slack_sdk import WebClient
10
12
 
11
13
  from sdgym.s3 import parse_s3_path
12
14
 
13
15
  OUTPUT_DESTINATION_AWS = 's3://sdgym-benchmark/Benchmarks/'
14
16
  DEBUG_SLACK_CHANNEL = 'sdv-alerts-debug'
15
- SLACK_CHANNEL = 'sdv-alerts'
17
+ SLACK_CHANNEL = 'sdgym'
16
18
  KEY_DATE_FILE = '_BENCHMARK_DATES.json'
17
19
  PLOTLY_MARKERS = [
18
20
  'circle',
@@ -45,18 +47,7 @@ PLOTLY_MARKERS = [
45
47
  'diamond-cross',
46
48
  'diamond-x',
47
49
  ]
48
-
49
- # The synthesizers inside the same list will be run by the same ec2 instance
50
- SYNTHESIZERS_SPLIT_SINGLE_TABLE = [
51
- ['UniformSynthesizer', 'ColumnSynthesizer', 'GaussianCopulaSynthesizer', 'TVAESynthesizer'],
52
- ['CopulaGANSynthesizer'],
53
- ['CTGANSynthesizer'],
54
- ['RealTabFormerSynthesizer'],
55
- ]
56
- SYNTHESIZERS_SPLIT_MULTI_TABLE = [
57
- ['HMASynthesizer'],
58
- ['HSASynthesizer', 'IndependentSynthesizer', 'MultiTableUniformSynthesizer'],
59
- ]
50
+ PLOT_PADDING = 0.25
60
51
 
61
52
 
62
53
  def _get_filename_to_gdrive_link():
@@ -104,7 +95,7 @@ def post_slack_message(channel, text):
104
95
 
105
96
 
106
97
  def post_benchmark_launch_message(date_str, compute_service='AWS', modality='single_table'):
107
- """Post a message to the SDV Alerts Slack channel when the benchmark is launched."""
98
+ """Post a message to the sdgym Slack channel when the benchmark is launched."""
108
99
  channel = SLACK_CHANNEL
109
100
  folder_name = get_result_folder_name(date_str)
110
101
  bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS)
@@ -116,7 +107,7 @@ def post_benchmark_launch_message(date_str, compute_service='AWS', modality='sin
116
107
 
117
108
 
118
109
  def post_benchmark_uploaded_message(folder_name, commit_url=None, modality='single_table'):
119
- """Post benchmark uploaded message to sdv-alerts slack channel."""
110
+ """Post benchmark uploaded message to the sdgym Slack channel."""
120
111
  file_to_gdrive_link = _get_filename_to_gdrive_link()
121
112
  channel = SLACK_CHANNEL
122
113
  bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS)
@@ -136,6 +127,34 @@ def post_benchmark_uploaded_message(folder_name, commit_url=None, modality='sing
136
127
  post_slack_message(channel, body)
137
128
 
138
129
 
130
def _add_pareto_curve_extremity_points(df_to_plot):
    """Add extremity points to the Pareto curve for better visualization.

    The Pareto points are linearly interpolated (``Quality_Score`` as a function of
    ``Log10 Aggregated_Time``) and extrapolated to ``PLOT_PADDING`` below the smallest
    and above the largest log10 ``Aggregated_Time`` found in ``df_to_plot``. The two
    extrapolated points are appended as extra rows flagged as Pareto points, with no
    synthesizer name and no marker.

    Args:
        df_to_plot (pandas.DataFrame):
            Data to plot. The columns ``Pareto`` (used as a row mask),
            ``Aggregated_Time``, ``Log10 Aggregated_Time`` and ``Quality_Score``
            are read.

    Returns:
        pandas.DataFrame:
            A copy of the input with a fresh 0..n-1 index. When at least two Pareto
            points exist, two extra rows (lower and upper extremity) are appended.
    """
    pareto = df_to_plot.loc[df_to_plot['Pareto']].sort_values('Aggregated_Time')
    if len(pareto) < 2:
        return df_to_plot.reset_index(drop=True)  # Not enough points to define a curve

    interp = interp1d(
        pareto['Log10 Aggregated_Time'],
        pareto['Quality_Score'],
        kind='linear',
        fill_value='extrapolate',
    )

    # Pad the time range (in log10 units) covered by *all* rows, not only Pareto rows.
    min_log = np.log10(df_to_plot['Aggregated_Time'].min()) - PLOT_PADDING
    max_log = np.log10(df_to_plot['Aggregated_Time'].max()) + PLOT_PADDING
    log_bounds = np.array([min_log, max_log])
    extremities = pd.DataFrame({
        'Synthesizer': np.nan,
        'Aggregated_Time': 10**log_bounds,
        'Quality_Score': interp(log_bounds),
        'Log10 Aggregated_Time': log_bounds,
        'Pareto': True,
        'Color': '#01E0C9',
        'Marker': np.nan,
    })

    # ``ignore_index=True`` already produces a fresh RangeIndex, so no extra
    # ``reset_index`` is needed.
    return pd.concat([df_to_plot, extremities], ignore_index=True)
156
+
157
+
139
158
  def get_df_to_plot(benchmark_result):
140
159
  """Get the data to plot from the benchmark result.
141
160
 
@@ -177,8 +196,9 @@ def get_df_to_plot(benchmark_result):
177
196
  }
178
197
  df_to_plot['Marker'] = df_to_plot['Synthesizer'].map(marker_map)
179
198
  df_to_plot = df_to_plot.rename(columns={'Adjusted_Quality_Score': 'Quality_Score'})
199
+ df_to_plot = df_to_plot.drop(columns=['Cumulative Quality Score'])
180
200
 
181
- return df_to_plot.drop(columns=['Cumulative Quality Score']).reset_index(drop=True)
201
+ return _add_pareto_curve_extremity_points(df_to_plot)
182
202
 
183
203
 
184
204
  def _parse_args():
@@ -203,3 +223,8 @@ def _extract_google_file_id(google_drive_link):
203
223
  return parsed.path.split(marker, 1)[1].split('/', 1)[0]
204
224
 
205
225
  raise ValueError(f'Invalid Google Drive link format: {google_drive_link}')
226
+
227
+
228
def _exclude_datasets(datasets, dataset_to_exclude):
    """Exclude datasets that are in the dataset_to_exclude list.

    Args:
        datasets (iterable):
            Datasets to filter. Their relative order is preserved.
        dataset_to_exclude (iterable, str or None):
            Datasets to drop. ``None`` means nothing is excluded. A single string
            is treated as one dataset name rather than being searched as a
            substring.

    Returns:
        list:
            The entries of ``datasets`` that are not in ``dataset_to_exclude``.
    """
    if dataset_to_exclude is None:
        return list(datasets)

    if isinstance(dataset_to_exclude, str):
        # ``'adult' in 'adult_extended'`` is a substring test; wrap the name so
        # only exact matches are removed.
        excluded = (dataset_to_exclude,)
    else:
        # Materialize once so one-shot iterators are not exhausted by the first lookup.
        excluded = tuple(dataset_to_exclude)

    return [dataset for dataset in datasets if dataset not in excluded]
@@ -11,6 +11,7 @@ import humanfriendly
11
11
  import numpy as np
12
12
  import pandas as pd
13
13
  import psutil
14
+ from openpyxl.utils import get_column_letter
14
15
 
15
16
  from sdgym.errors import SDGymError
16
17
  from sdgym.synthesizers.base import BaselineSynthesizer
@@ -195,3 +196,11 @@ def convert_metadata_to_sdmetrics(metadata_dict):
195
196
  """Convert a sdv metadata dictionary into sdmetrics expected metadata."""
196
197
  table_name = next(iter(metadata_dict['tables']))
197
198
  return metadata_dict['tables'][table_name]
199
+
200
+
201
def _set_column_width(writer, df, sheet_name):
    """Widen every column of a written sheet to fit its longest value.

    For each column of ``df`` the width is set to the length of the longest
    string representation among the header and the cell values, plus 2.

    Args:
        writer:
            Object exposing ``sheets[sheet_name]``; the worksheet must provide
            ``column_dimensions[<letter>].width`` (presumably a
            ``pandas.ExcelWriter`` backed by openpyxl -- confirm against callers).
        df (pandas.DataFrame):
            The table that was written to the sheet, in the same column order.
        sheet_name (str):
            Name of the sheet to adjust.
    """
    worksheet = writer.sheets[sheet_name]
    for col_idx, column in enumerate(df.columns, 1):
        # Column labels are not guaranteed to be strings (e.g. integer labels).
        header_length = len(str(column))

        # Select by position so duplicated labels still yield a single column.
        cell_lengths = df.iloc[:, col_idx - 1].astype(str).map(len)

        # ``max()`` of an empty Series is NaN, which would propagate into the width.
        longest_cell = int(cell_lengths.max()) if len(cell_lengths) else 0

        max_length = max(longest_cell, header_length)
        column_letter = get_column_letter(col_idx)
        worksheet.column_dimensions[column_letter].width = max_length + 2
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdgym
3
- Version: 0.12.2.dev0
3
+ Version: 0.13.1.dev0
4
4
  Summary: Benchmark tabular synthetic data generators using a variety of datasets
5
5
  Author-email: "DataCebo, Inc." <info@sdv.dev>
6
- License: BSL-1.1
6
+ License-Expression: BUSL-1.1
7
7
  Project-URL: Source Code, https://github.com/sdv-dev/SDGym/
8
8
  Project-URL: Issue Tracker, https://github.com/sdv-dev/SDGym/issues
9
9
  Project-URL: Changes, https://github.com/sdv-dev/SDGym/blob/main/HISTORY.md
@@ -12,7 +12,6 @@ Project-URL: Chat, https://bit.ly/sdv-slack-invite
12
12
  Keywords: machine learning,synthetic data generation,benchmark,generative models
13
13
  Classifier: Development Status :: 2 - Pre-Alpha
14
14
  Classifier: Intended Audience :: Developers
15
- Classifier: License :: Free for non-commercial use
16
15
  Classifier: Natural Language :: English
17
16
  Classifier: Programming Language :: Python :: 3
18
17
  Classifier: Programming Language :: Python :: 3.9
@@ -20,44 +19,57 @@ Classifier: Programming Language :: Python :: 3.10
20
19
  Classifier: Programming Language :: Python :: 3.11
21
20
  Classifier: Programming Language :: Python :: 3.12
22
21
  Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Programming Language :: Python :: 3.14
23
23
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
- Requires-Python: <3.14,>=3.9
24
+ Requires-Python: <3.15,>=3.9
25
25
  Description-Content-Type: text/markdown
26
26
  License-File: LICENSE
27
27
  Requires-Dist: appdirs>=1.3
28
28
  Requires-Dist: boto3<2,>=1.28
29
29
  Requires-Dist: botocore<2,>=1.31
30
- Requires-Dist: cloudpickle>=2.1.0
30
+ Requires-Dist: cloudpickle>=2.1.0; python_version < "3.14"
31
+ Requires-Dist: cloudpickle>=3.1.1; python_version >= "3.14"
31
32
  Requires-Dist: compress-pickle>=1.2.0
32
- Requires-Dist: google-cloud-compute>=1.0.0
33
- Requires-Dist: google-auth>=2.0.0
33
+ Requires-Dist: google-cloud-compute>=1.30.0
34
+ Requires-Dist: google-auth>=2.14.1
34
35
  Requires-Dist: humanfriendly>=10.0
35
36
  Requires-Dist: numpy>=1.22.2; python_version < "3.10"
36
37
  Requires-Dist: numpy>=1.24.0; python_version >= "3.10" and python_version < "3.12"
37
38
  Requires-Dist: numpy>=1.26.0; python_version >= "3.12" and python_version < "3.13"
38
- Requires-Dist: numpy>=2.1.0; python_version >= "3.13"
39
+ Requires-Dist: numpy>=2.1.0; python_version >= "3.13" and python_version < "3.14"
40
+ Requires-Dist: numpy>=2.3.2; python_version >= "3.14"
39
41
  Requires-Dist: openpyxl>=3.1.2
40
- Requires-Dist: pandas<3.0.0,>=1.4.0; python_version < "3.11"
41
- Requires-Dist: pandas<3.0.0,>=1.5.0; python_version >= "3.11" and python_version < "3.12"
42
- Requires-Dist: pandas<3.0.0,>=2.1.1; python_version >= "3.12" and python_version < "3.13"
43
- Requires-Dist: pandas<3.0.0,>=2.2.3; python_version >= "3.13"
44
- Requires-Dist: psutil>=5.7
42
+ Requires-Dist: pandas<3,>=1.4.0; python_version < "3.11"
43
+ Requires-Dist: pandas<3,>=1.5.0; python_version >= "3.11" and python_version < "3.12"
44
+ Requires-Dist: pandas<3,>=2.1.1; python_version >= "3.12" and python_version < "3.13"
45
+ Requires-Dist: pandas<3,>=2.2.3; python_version >= "3.13" and python_version < "3.14"
46
+ Requires-Dist: pandas<3,>=2.3.3; python_version >= "3.14"
47
+ Requires-Dist: psutil>=5.8
45
48
  Requires-Dist: scikit-learn>=1.0.2; python_version < "3.10"
46
49
  Requires-Dist: scikit-learn>=1.1.0; python_version >= "3.10" and python_version < "3.11"
47
50
  Requires-Dist: scikit-learn>=1.1.3; python_version >= "3.11" and python_version < "3.12"
48
51
  Requires-Dist: scikit-learn>=1.3.1; python_version >= "3.12" and python_version < "3.13"
49
- Requires-Dist: scikit-learn>=1.5.2; python_version >= "3.13"
52
+ Requires-Dist: scikit-learn>=1.5.2; python_version >= "3.13" and python_version < "3.14"
53
+ Requires-Dist: scikit-learn>=1.8.0; python_version >= "3.14"
50
54
  Requires-Dist: scipy>=1.7.3; python_version < "3.10"
51
55
  Requires-Dist: scipy>=1.9.2; python_version >= "3.10" and python_version < "3.12"
52
56
  Requires-Dist: scipy>=1.12.0; python_version >= "3.12" and python_version < "3.13"
53
- Requires-Dist: scipy>=1.14.1; python_version >= "3.13"
57
+ Requires-Dist: scipy>=1.14.1; python_version >= "3.13" and python_version < "3.14"
58
+ Requires-Dist: scipy>=1.16.1; python_version >= "3.14"
54
59
  Requires-Dist: tabulate<0.9,>=0.8.3
55
- Requires-Dist: torch>=2.6.0
60
+ Requires-Dist: torch>=1.13.0; python_version < "3.11"
61
+ Requires-Dist: torch>=2.0.0; python_version >= "3.11" and python_version < "3.12"
62
+ Requires-Dist: torch>=2.3.0; python_version >= "3.12" and python_version < "3.13"
63
+ Requires-Dist: torch>=2.6.0; python_version >= "3.13" and python_version < "3.14"
64
+ Requires-Dist: torch>=2.9.0; python_version >= "3.14"
56
65
  Requires-Dist: tqdm>=4.66.3
57
66
  Requires-Dist: XlsxWriter>=1.2.8
58
- Requires-Dist: rdt>=1.17.0
59
- Requires-Dist: sdmetrics>=0.20.1
60
- Requires-Dist: sdv>=1.21.0
67
+ Requires-Dist: rdt>=1.18.2; python_version < "3.14"
68
+ Requires-Dist: rdt>=1.20.0; python_version >= "3.14"
69
+ Requires-Dist: sdmetrics>=0.21.0; python_version < "3.14"
70
+ Requires-Dist: sdmetrics>=0.26.0; python_version >= "3.14"
71
+ Requires-Dist: sdv>=1.21.0; python_version < "3.14"
72
+ Requires-Dist: sdv>=1.33.0; python_version >= "3.14"
61
73
  Provides-Extra: dask
62
74
  Requires-Dist: dask; extra == "dask"
63
75
  Requires-Dist: distributed; extra == "dask"
@@ -1,20 +1,15 @@
1
1
  appdirs>=1.3
2
2
  boto3<2,>=1.28
3
3
  botocore<2,>=1.31
4
- cloudpickle>=2.1.0
5
4
  compress-pickle>=1.2.0
6
- google-cloud-compute>=1.0.0
7
- google-auth>=2.0.0
5
+ google-cloud-compute>=1.30.0
6
+ google-auth>=2.14.1
8
7
  humanfriendly>=10.0
9
8
  openpyxl>=3.1.2
10
- psutil>=5.7
9
+ psutil>=5.8
11
10
  tabulate<0.9,>=0.8.3
12
- torch>=2.6.0
13
11
  tqdm>=4.66.3
14
12
  XlsxWriter>=1.2.8
15
- rdt>=1.17.0
16
- sdmetrics>=0.20.1
17
- sdv>=1.21.0
18
13
 
19
14
  [:python_version < "3.10"]
20
15
  numpy>=1.22.2
@@ -22,7 +17,14 @@ scikit-learn>=1.0.2
22
17
  scipy>=1.7.3
23
18
 
24
19
  [:python_version < "3.11"]
25
- pandas<3.0.0,>=1.4.0
20
+ pandas<3,>=1.4.0
21
+ torch>=1.13.0
22
+
23
+ [:python_version < "3.14"]
24
+ cloudpickle>=2.1.0
25
+ rdt>=1.18.2
26
+ sdmetrics>=0.21.0
27
+ sdv>=1.21.0
26
28
 
27
29
  [:python_version >= "3.10" and python_version < "3.11"]
28
30
  scikit-learn>=1.1.0
@@ -32,20 +34,34 @@ numpy>=1.24.0
32
34
  scipy>=1.9.2
33
35
 
34
36
  [:python_version >= "3.11" and python_version < "3.12"]
35
- pandas<3.0.0,>=1.5.0
37
+ pandas<3,>=1.5.0
36
38
  scikit-learn>=1.1.3
39
+ torch>=2.0.0
37
40
 
38
41
  [:python_version >= "3.12" and python_version < "3.13"]
39
42
  numpy>=1.26.0
40
- pandas<3.0.0,>=2.1.1
43
+ pandas<3,>=2.1.1
41
44
  scikit-learn>=1.3.1
42
45
  scipy>=1.12.0
46
+ torch>=2.3.0
43
47
 
44
- [:python_version >= "3.13"]
48
+ [:python_version >= "3.13" and python_version < "3.14"]
45
49
  numpy>=2.1.0
46
- pandas<3.0.0,>=2.2.3
50
+ pandas<3,>=2.2.3
47
51
  scikit-learn>=1.5.2
48
52
  scipy>=1.14.1
53
+ torch>=2.6.0
54
+
55
+ [:python_version >= "3.14"]
56
+ cloudpickle>=3.1.1
57
+ numpy>=2.3.2
58
+ pandas<3,>=2.3.3
59
+ scikit-learn>=1.8.0
60
+ scipy>=1.16.1
61
+ torch>=2.9.0
62
+ rdt>=1.20.0
63
+ sdmetrics>=0.26.0
64
+ sdv>=1.33.0
49
65
 
50
66
  [all]
51
67
  sdgym[dask,dev,test]
@@ -118,7 +118,7 @@ def _get_example_pyproject_dict():
118
118
  ],
119
119
  },
120
120
  'readme': 'README.md',
121
- 'requires-python': '>=3.9,<3.13',
121
+ 'requires-python': '>=3.9,<3.15',
122
122
  },
123
123
  'tool': {
124
124
  'bumpversion': {
@@ -1,152 +0,0 @@
1
- """Script to run a benchmark and upload results to S3."""
2
-
3
- import json
4
- import os
5
- from datetime import datetime, timezone
6
-
7
- from botocore.exceptions import ClientError
8
-
9
- from sdgym._benchmark.benchmark import (
10
- _benchmark_multi_table_compute_gcp,
11
- _benchmark_single_table_compute_gcp,
12
- )
13
- from sdgym.run_benchmark.utils import (
14
- KEY_DATE_FILE,
15
- OUTPUT_DESTINATION_AWS,
16
- SYNTHESIZERS_SPLIT_MULTI_TABLE,
17
- SYNTHESIZERS_SPLIT_SINGLE_TABLE,
18
- _parse_args,
19
- get_result_folder_name,
20
- post_benchmark_launch_message,
21
- )
22
- from sdgym.s3 import get_s3_client, parse_s3_path
23
-
24
# Datasets benchmarked for the single-table modality.
_SINGLE_TABLE_DATASETS = [
    'adult',
    'alarm',
    'census',
    'child',
    'covtype',
    'expedia_hotel_logs',
    'insurance',
    'intrusion',
    'news',
]

# Datasets benchmarked for the multi-table modality.
_MULTI_TABLE_DATASETS = [
    'WebKP',
    'DCG',
    'UW_std',
    'Same_gen',
    'CORA',
    'got_families',
    'SalesDB',
    'UTube',
    'Student_loan',
    'Hepatitis_std',
    'Elti',
    'Bupa',
    'Toxicology',
    'imdb_ijs',
    'ftp',
    'imdb_small',
    'imdb_MovieLens',
    'Pima',
    'university',
    'legalActs',
    'Dunur',
    'Mesh',
    'world',
    'airbnb-simplified',
    'trains',
    'FNHK',
    'fake_hotels',
    'SAT',
    'genes',
    'Biodegradability',
    'Pyrimidine',
    'mutagenesis',
    'restbase',
    'Triazine',
    'Carcinogenesis',
    'fake_hotels_extended',
    'Mooney_Family',
    'PTE',
    'Facebook',
    'multi_table_ID_demo_dataset',
    'SAP',
    'Chess',
    'Countries',
    'NCAA',
    'Atherosclerosis',
    'nations',
    'TubePricing',
    'financial',
    'Accidents',
    'MuskSmall',
    'NBA',
    'AustralianFootball',
    'PremierLeague',
    'OMOP_CDM_dayz',
]

# Per-modality benchmark setup: the launcher to call ('method'), the groups of
# synthesizers passed to it one group per call ('synthesizers_split') and the
# datasets to benchmark ('datasets').
MODALITY_TO_SETUP = {
    'single_table': {
        'method': _benchmark_single_table_compute_gcp,
        'synthesizers_split': SYNTHESIZERS_SPLIT_SINGLE_TABLE,
        'datasets': _SINGLE_TABLE_DATASETS,
    },
    'multi_table': {
        'method': _benchmark_multi_table_compute_gcp,
        'synthesizers_split': SYNTHESIZERS_SPLIT_MULTI_TABLE,
        'datasets': _MULTI_TABLE_DATASETS,
    },
}
101
-
102
-
103
def append_benchmark_run(
    aws_access_key_id, aws_secret_access_key, date_str, modality='single_table'
):
    """Append a new benchmark run to the benchmark dates file in S3.

    The dates file (``KEY_DATE_FILE`` under the modality folder of
    ``OUTPUT_DESTINATION_AWS``) is read, the new run is added, the runs are
    re-sorted by date and the file is written back. A missing file is treated
    as an empty list of runs.

    Args:
        aws_access_key_id (str):
            AWS access key id passed to ``get_s3_client``.
        aws_secret_access_key (str):
            AWS secret access key passed to ``get_s3_client``.
        date_str (str):
            Date of the run; stored as ``date`` and used to derive ``folder_name``.
        modality (str):
            Benchmark modality used as the S3 sub-folder. Defaults to ``'single_table'``.

    Raises:
        RuntimeError:
            If reading the dates file fails with a ``ClientError`` other than
            ``NoSuchKey``.
    """
    s3_client = get_s3_client(
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS)
    key = f'{prefix}{modality}/{KEY_DATE_FILE}'
    try:
        # Named ``response`` to avoid shadowing the ``object`` builtin.
        response = s3_client.get_object(Bucket=bucket, Key=key)
        body = response['Body'].read().decode('utf-8')
        data = json.loads(body)
    except ClientError as error:
        if error.response['Error']['Code'] != 'NoSuchKey':
            # Chain explicitly so the original S3 error stays attached as the cause.
            raise RuntimeError(f'Failed to read {KEY_DATE_FILE} from S3: {error}') from error

        # First run for this modality: start a new dates file.
        data = {'runs': []}

    data['runs'].append({'date': date_str, 'folder_name': get_result_folder_name(date_str)})
    data['runs'] = sorted(data['runs'], key=lambda run: run['date'])
    s3_client.put_object(
        Bucket=bucket,
        Key=key,
        Body=json.dumps(data).encode('utf-8'),
    )
129
-
130
-
131
def main():
    """Launch the benchmark for the requested modality and record the run.

    One benchmark call is made per synthesizer group of the selected modality;
    afterwards the run is appended to the S3 dates file and a launch message is
    posted to Slack.
    """
    args = _parse_args()
    aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
    aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
    date_str = datetime.now(timezone.utc).strftime('%Y-%m-%d')
    modality = args.modality
    setup = MODALITY_TO_SETUP[modality]
    run_benchmark = setup['method']
    for synthesizer_group in setup['synthesizers_split']:
        run_benchmark(
            output_destination=OUTPUT_DESTINATION_AWS,
            credential_filepath=os.getenv('CREDENTIALS_FILEPATH'),
            synthesizers=synthesizer_group,
            sdv_datasets=setup['datasets'],
            timeout=345600,  # 4 days
        )

    append_benchmark_run(aws_access_key_id, aws_secret_access_key, date_str, modality=modality)
    post_benchmark_launch_message(date_str, compute_service='GCP', modality=modality)


if __name__ == '__main__':
    main()
File without changes
File without changes
File without changes
File without changes
File without changes