sdgym 0.9.0.dev0__tar.gz → 0.9.1.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/PKG-INFO +2 -1
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/pyproject.toml +2 -1
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/__init__.py +1 -1
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/benchmark.py +70 -31
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/synthesizers/generate.py +17 -6
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/utils.py +5 -1
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym.egg-info/PKG-INFO +2 -1
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym.egg-info/requires.txt +1 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/LICENSE +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/README.md +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/cli/__init__.py +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/cli/__main__.py +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/cli/collect.py +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/cli/summary.py +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/cli/utils.py +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/datasets.py +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/errors.py +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/metrics.py +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/progress.py +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/s3.py +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/synthesizers/__init__.py +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/synthesizers/base.py +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/synthesizers/column.py +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/synthesizers/identity.py +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/synthesizers/sdv.py +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/synthesizers/uniform.py +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym.egg-info/SOURCES.txt +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym.egg-info/dependency_links.txt +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym.egg-info/entry_points.txt +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym.egg-info/top_level.txt +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/setup.cfg +0 -0
- {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/tests/test_tasks.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: sdgym
|
|
3
|
-
Version: 0.9.0.dev0
|
|
3
|
+
Version: 0.9.1.dev0
|
|
4
4
|
Summary: Benchmark tabular synthetic data generators using a variety of datasets
|
|
5
5
|
Author-email: "DataCebo, Inc." <info@sdv.dev>
|
|
6
6
|
License: BSL-1.1
|
|
@@ -27,6 +27,7 @@ License-File: LICENSE
|
|
|
27
27
|
Requires-Dist: appdirs>=1.3
|
|
28
28
|
Requires-Dist: boto3<2,>=1.28
|
|
29
29
|
Requires-Dist: botocore<2,>=1.31
|
|
30
|
+
Requires-Dist: cloudpickle>=2.1.0
|
|
30
31
|
Requires-Dist: compress-pickle>=1.2.0
|
|
31
32
|
Requires-Dist: humanfriendly>=8.2
|
|
32
33
|
Requires-Dist: numpy<2.0.0,>=1.21.0; python_version < "3.10"
|
|
@@ -24,6 +24,7 @@ dependencies = [
|
|
|
24
24
|
'appdirs>=1.3',
|
|
25
25
|
'boto3>=1.28,<2',
|
|
26
26
|
'botocore>=1.31,<2',
|
|
27
|
+
'cloudpickle>=2.1.0',
|
|
27
28
|
'compress-pickle>=1.2.0',
|
|
28
29
|
'humanfriendly>=8.2',
|
|
29
30
|
"numpy>=1.21.0,<2.0.0;python_version<'3.10'",
|
|
@@ -133,7 +134,7 @@ namespaces = false
|
|
|
133
134
|
version = {attr = 'sdgym.__version__'}
|
|
134
135
|
|
|
135
136
|
[tool.bumpversion]
|
|
136
|
-
current_version = "0.9.0.dev0"
|
|
137
|
+
current_version = "0.9.1.dev0"
|
|
137
138
|
parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
|
|
138
139
|
serialize = [
|
|
139
140
|
'{major}.{minor}.{patch}.{release}{candidate}',
|
|
@@ -7,10 +7,12 @@ import os
|
|
|
7
7
|
import pickle
|
|
8
8
|
import tracemalloc
|
|
9
9
|
import warnings
|
|
10
|
+
from contextlib import contextmanager
|
|
10
11
|
from datetime import datetime
|
|
11
12
|
from pathlib import Path
|
|
12
13
|
|
|
13
14
|
import boto3
|
|
15
|
+
import cloudpickle
|
|
14
16
|
import compress_pickle
|
|
15
17
|
import numpy as np
|
|
16
18
|
import pandas as pd
|
|
@@ -146,11 +148,18 @@ def _generate_job_args_list(
|
|
|
146
148
|
|
|
147
149
|
def _synthesize(synthesizer_dict, real_data, metadata):
|
|
148
150
|
synthesizer = synthesizer_dict['synthesizer']
|
|
149
|
-
|
|
151
|
+
if isinstance(synthesizer, type):
|
|
152
|
+
assert issubclass(
|
|
153
|
+
synthesizer, BaselineSynthesizer
|
|
154
|
+
), '`synthesizer` must be a synthesizer class'
|
|
155
|
+
synthesizer = synthesizer()
|
|
156
|
+
else:
|
|
157
|
+
assert issubclass(
|
|
158
|
+
type(synthesizer), BaselineSynthesizer
|
|
159
|
+
), '`synthesizer` must be an instance of a synthesizer class.'
|
|
150
160
|
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
sample_from_synthesizer = synthesizer_object.sample_from_synthesizer
|
|
161
|
+
get_synthesizer = synthesizer.get_trained_synthesizer
|
|
162
|
+
sample_from_synthesizer = synthesizer.sample_from_synthesizer
|
|
154
163
|
data = real_data.copy()
|
|
155
164
|
num_samples = len(data)
|
|
156
165
|
|
|
@@ -190,7 +199,8 @@ def _compute_scores(
|
|
|
190
199
|
'metric': metric_name,
|
|
191
200
|
'error': 'Metric Timeout',
|
|
192
201
|
})
|
|
193
|
-
|
|
202
|
+
# re-inject list to multiprocessing output
|
|
203
|
+
output['scores'] = scores
|
|
194
204
|
|
|
195
205
|
error = None
|
|
196
206
|
score = None
|
|
@@ -213,7 +223,8 @@ def _compute_scores(
|
|
|
213
223
|
'error': error,
|
|
214
224
|
'metric_time': (datetime.utcnow() - start).total_seconds(),
|
|
215
225
|
})
|
|
216
|
-
|
|
226
|
+
# re-inject list to multiprocessing output
|
|
227
|
+
output['scores'] = scores
|
|
217
228
|
|
|
218
229
|
if compute_diagnostic_score:
|
|
219
230
|
start = datetime.utcnow()
|
|
@@ -264,7 +275,8 @@ def _score(
|
|
|
264
275
|
)
|
|
265
276
|
|
|
266
277
|
output['dataset_size'] = get_size_of(data) / N_BYTES_IN_MB
|
|
267
|
-
|
|
278
|
+
# To be deleted if there is no error
|
|
279
|
+
output['error'] = 'Synthesizer Timeout'
|
|
268
280
|
synthetic_data, train_time, sample_time, synthesizer_size, peak_memory = _synthesize(
|
|
269
281
|
synthesizer, data.copy(), metadata
|
|
270
282
|
)
|
|
@@ -283,7 +295,8 @@ def _score(
|
|
|
283
295
|
used_memory(),
|
|
284
296
|
)
|
|
285
297
|
|
|
286
|
-
|
|
298
|
+
# No error so far. _compute_scores tracks its own errors by metric
|
|
299
|
+
del output['error']
|
|
287
300
|
_compute_scores(
|
|
288
301
|
metrics,
|
|
289
302
|
data,
|
|
@@ -314,6 +327,26 @@ def _score(
|
|
|
314
327
|
return output
|
|
315
328
|
|
|
316
329
|
|
|
330
|
+
@contextmanager
|
|
331
|
+
def multiprocessing_context():
|
|
332
|
+
"""Override multiprocessing ForkingPickler to use cloudpickle."""
|
|
333
|
+
original_dump = multiprocessing.reduction.ForkingPickler.dumps
|
|
334
|
+
original_load = multiprocessing.reduction.ForkingPickler.loads
|
|
335
|
+
original_method = multiprocessing.get_start_method()
|
|
336
|
+
|
|
337
|
+
multiprocessing.set_start_method('spawn', force=True)
|
|
338
|
+
multiprocessing.reduction.ForkingPickler.dumps = cloudpickle.dumps
|
|
339
|
+
multiprocessing.reduction.ForkingPickler.loads = cloudpickle.loads
|
|
340
|
+
|
|
341
|
+
try:
|
|
342
|
+
yield
|
|
343
|
+
finally:
|
|
344
|
+
# Restore original methods
|
|
345
|
+
multiprocessing.set_start_method(original_method, force=True)
|
|
346
|
+
multiprocessing.reduction.ForkingPickler.dumps = original_dump
|
|
347
|
+
multiprocessing.reduction.ForkingPickler.loads = original_load
|
|
348
|
+
|
|
349
|
+
|
|
317
350
|
def _score_with_timeout(
|
|
318
351
|
timeout,
|
|
319
352
|
synthesizer,
|
|
@@ -325,32 +358,33 @@ def _score_with_timeout(
|
|
|
325
358
|
modality=None,
|
|
326
359
|
dataset_name=None,
|
|
327
360
|
):
|
|
328
|
-
with
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
361
|
+
with multiprocessing_context():
|
|
362
|
+
with multiprocessing.Manager() as manager:
|
|
363
|
+
output = manager.dict()
|
|
364
|
+
process = multiprocessing.Process(
|
|
365
|
+
target=_score,
|
|
366
|
+
args=(
|
|
367
|
+
synthesizer,
|
|
368
|
+
data,
|
|
369
|
+
metadata,
|
|
370
|
+
metrics,
|
|
371
|
+
output,
|
|
372
|
+
compute_quality_score,
|
|
373
|
+
compute_diagnostic_score,
|
|
374
|
+
modality,
|
|
375
|
+
dataset_name,
|
|
376
|
+
),
|
|
377
|
+
)
|
|
344
378
|
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
379
|
+
process.start()
|
|
380
|
+
process.join(timeout)
|
|
381
|
+
process.terminate()
|
|
348
382
|
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
383
|
+
output = dict(output)
|
|
384
|
+
if output.get('timeout'):
|
|
385
|
+
LOGGER.error('Timeout running %s on dataset %s;', synthesizer['name'], dataset_name)
|
|
352
386
|
|
|
353
|
-
|
|
387
|
+
return output
|
|
354
388
|
|
|
355
389
|
|
|
356
390
|
def _format_output(
|
|
@@ -756,6 +790,11 @@ def benchmark_single_table(
|
|
|
756
790
|
'package_name': 'dask' or 'multiprocessing',
|
|
757
791
|
'num_workers': 4
|
|
758
792
|
}
|
|
793
|
+
run_on_ec2 (bool):
|
|
794
|
+
The flag is used to run the benchmark on an EC2 instance that will be created
|
|
795
|
+
by a script using the authentication of the current user. The EC2 instance
|
|
796
|
+
uses the LATEST released version of sdgym. Local changes or changes NOT
|
|
797
|
+
in the released version will NOT be used in the ec2 instance.
|
|
759
798
|
|
|
760
799
|
Returns:
|
|
761
800
|
pandas.DataFrame:
|
|
@@ -124,7 +124,7 @@ def create_single_table_synthesizer(
|
|
|
124
124
|
obj:
|
|
125
125
|
The trained synthesizer.
|
|
126
126
|
"""
|
|
127
|
-
return get_trained_synthesizer_fn(data, metadata)
|
|
127
|
+
return self.synthesizer_fn['get_trained_synthesizer_fn'](data, metadata)
|
|
128
128
|
|
|
129
129
|
def sample_from_synthesizer(self, synthesizer, num_samples):
|
|
130
130
|
"""Sample the desired number of samples from the given synthesizer.
|
|
@@ -139,11 +139,22 @@ def create_single_table_synthesizer(
|
|
|
139
139
|
pandas.DataFrame:
|
|
140
140
|
The synthetic data.
|
|
141
141
|
"""
|
|
142
|
-
return sample_from_synthesizer_fn(synthesizer, num_samples)
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
142
|
+
return self.synthesizer_fn['sample_from_synthesizer_fn'](synthesizer, num_samples)
|
|
143
|
+
|
|
144
|
+
CustomSynthesizer = type(
|
|
145
|
+
f'Custom:{display_name}',
|
|
146
|
+
(NewSynthesizer,),
|
|
147
|
+
{
|
|
148
|
+
'synthesizer_fn': {
|
|
149
|
+
'get_trained_synthesizer_fn': get_trained_synthesizer_fn,
|
|
150
|
+
'sample_from_synthesizer_fn': sample_from_synthesizer_fn,
|
|
151
|
+
},
|
|
152
|
+
},
|
|
153
|
+
)
|
|
154
|
+
CustomSynthesizer.__name__ = f'Custom:{display_name}'
|
|
155
|
+
CustomSynthesizer.__module__ = 'sdgym.synthesizers.generate'
|
|
156
|
+
globals()[f'Custom:{display_name}'] = CustomSynthesizer
|
|
157
|
+
return CustomSynthesizer
|
|
147
158
|
|
|
148
159
|
|
|
149
160
|
def create_multi_table_synthesizer(
|
|
@@ -60,8 +60,12 @@ def get_synthesizers(synthesizers):
|
|
|
60
60
|
else:
|
|
61
61
|
raise SDGymError(f'Unknown synthesizer {synthesizer}') from None
|
|
62
62
|
|
|
63
|
+
if isinstance(synthesizer, type) or hasattr(synthesizer, '__name__'):
|
|
64
|
+
synthesizer_name = getattr(synthesizer, '__name__', 'undefined')
|
|
65
|
+
else:
|
|
66
|
+
synthesizer_name = getattr(type(synthesizer), '__name__', 'undefined')
|
|
63
67
|
synthesizers_dicts.append({
|
|
64
|
-
'name':
|
|
68
|
+
'name': synthesizer_name,
|
|
65
69
|
'synthesizer': synthesizer,
|
|
66
70
|
})
|
|
67
71
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: sdgym
|
|
3
|
-
Version: 0.9.0.dev0
|
|
3
|
+
Version: 0.9.1.dev0
|
|
4
4
|
Summary: Benchmark tabular synthetic data generators using a variety of datasets
|
|
5
5
|
Author-email: "DataCebo, Inc." <info@sdv.dev>
|
|
6
6
|
License: BSL-1.1
|
|
@@ -27,6 +27,7 @@ License-File: LICENSE
|
|
|
27
27
|
Requires-Dist: appdirs>=1.3
|
|
28
28
|
Requires-Dist: boto3<2,>=1.28
|
|
29
29
|
Requires-Dist: botocore<2,>=1.31
|
|
30
|
+
Requires-Dist: cloudpickle>=2.1.0
|
|
30
31
|
Requires-Dist: compress-pickle>=1.2.0
|
|
31
32
|
Requires-Dist: humanfriendly>=8.2
|
|
32
33
|
Requires-Dist: numpy<2.0.0,>=1.21.0; python_version < "3.10"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|