sdgym 0.9.0.dev0__tar.gz → 0.9.1.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/PKG-INFO +2 -1
  2. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/pyproject.toml +2 -1
  3. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/__init__.py +1 -1
  4. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/benchmark.py +70 -31
  5. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/synthesizers/generate.py +17 -6
  6. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/utils.py +5 -1
  7. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym.egg-info/PKG-INFO +2 -1
  8. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym.egg-info/requires.txt +1 -0
  9. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/LICENSE +0 -0
  10. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/README.md +0 -0
  11. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/cli/__init__.py +0 -0
  12. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/cli/__main__.py +0 -0
  13. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/cli/collect.py +0 -0
  14. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/cli/summary.py +0 -0
  15. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/cli/utils.py +0 -0
  16. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/datasets.py +0 -0
  17. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/errors.py +0 -0
  18. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/metrics.py +0 -0
  19. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/progress.py +0 -0
  20. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/s3.py +0 -0
  21. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/synthesizers/__init__.py +0 -0
  22. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/synthesizers/base.py +0 -0
  23. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/synthesizers/column.py +0 -0
  24. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/synthesizers/identity.py +0 -0
  25. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/synthesizers/sdv.py +0 -0
  26. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/synthesizers/uniform.py +0 -0
  27. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym.egg-info/SOURCES.txt +0 -0
  28. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym.egg-info/dependency_links.txt +0 -0
  29. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym.egg-info/entry_points.txt +0 -0
  30. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym.egg-info/top_level.txt +0 -0
  31. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/setup.cfg +0 -0
  32. {sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/tests/test_tasks.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sdgym
3
- Version: 0.9.0.dev0
3
+ Version: 0.9.1.dev0
4
4
  Summary: Benchmark tabular synthetic data generators using a variety of datasets
5
5
  Author-email: "DataCebo, Inc." <info@sdv.dev>
6
6
  License: BSL-1.1
@@ -27,6 +27,7 @@ License-File: LICENSE
27
27
  Requires-Dist: appdirs>=1.3
28
28
  Requires-Dist: boto3<2,>=1.28
29
29
  Requires-Dist: botocore<2,>=1.31
30
+ Requires-Dist: cloudpickle>=2.1.0
30
31
  Requires-Dist: compress-pickle>=1.2.0
31
32
  Requires-Dist: humanfriendly>=8.2
32
33
  Requires-Dist: numpy<2.0.0,>=1.21.0; python_version < "3.10"
@@ -24,6 +24,7 @@ dependencies = [
24
24
  'appdirs>=1.3',
25
25
  'boto3>=1.28,<2',
26
26
  'botocore>=1.31,<2',
27
+ 'cloudpickle>=2.1.0',
27
28
  'compress-pickle>=1.2.0',
28
29
  'humanfriendly>=8.2',
29
30
  "numpy>=1.21.0,<2.0.0;python_version<'3.10'",
@@ -133,7 +134,7 @@ namespaces = false
133
134
  version = {attr = 'sdgym.__version__'}
134
135
 
135
136
  [tool.bumpversion]
136
- current_version = "0.9.0.dev0"
137
+ current_version = "0.9.1.dev0"
137
138
  parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
138
139
  serialize = [
139
140
  '{major}.{minor}.{patch}.{release}{candidate}',
@@ -8,7 +8,7 @@ __author__ = 'DataCebo, Inc.'
8
8
  __copyright__ = 'Copyright (c) 2022 DataCebo, Inc.'
9
9
  __email__ = 'info@sdv.dev'
10
10
  __license__ = 'BSL-1.1'
11
- __version__ = '0.9.0.dev0'
11
+ __version__ = '0.9.1.dev0'
12
12
 
13
13
  import logging
14
14
 
@@ -7,10 +7,12 @@ import os
7
7
  import pickle
8
8
  import tracemalloc
9
9
  import warnings
10
+ from contextlib import contextmanager
10
11
  from datetime import datetime
11
12
  from pathlib import Path
12
13
 
13
14
  import boto3
15
+ import cloudpickle
14
16
  import compress_pickle
15
17
  import numpy as np
16
18
  import pandas as pd
@@ -146,11 +148,18 @@ def _generate_job_args_list(
146
148
 
147
149
  def _synthesize(synthesizer_dict, real_data, metadata):
148
150
  synthesizer = synthesizer_dict['synthesizer']
149
- assert issubclass(synthesizer, BaselineSynthesizer), '`synthesizer` must be a synthesizer class'
151
+ if isinstance(synthesizer, type):
152
+ assert issubclass(
153
+ synthesizer, BaselineSynthesizer
154
+ ), '`synthesizer` must be a synthesizer class'
155
+ synthesizer = synthesizer()
156
+ else:
157
+ assert issubclass(
158
+ type(synthesizer), BaselineSynthesizer
159
+ ), '`synthesizer` must be an instance of a synthesizer class.'
150
160
 
151
- synthesizer_object = synthesizer()
152
- get_synthesizer = synthesizer_object.get_trained_synthesizer
153
- sample_from_synthesizer = synthesizer_object.sample_from_synthesizer
161
+ get_synthesizer = synthesizer.get_trained_synthesizer
162
+ sample_from_synthesizer = synthesizer.sample_from_synthesizer
154
163
  data = real_data.copy()
155
164
  num_samples = len(data)
156
165
 
@@ -190,7 +199,8 @@ def _compute_scores(
190
199
  'metric': metric_name,
191
200
  'error': 'Metric Timeout',
192
201
  })
193
- output['scores'] = scores # re-inject list to multiprocessing output
202
+ # re-inject list to multiprocessing output
203
+ output['scores'] = scores
194
204
 
195
205
  error = None
196
206
  score = None
@@ -213,7 +223,8 @@ def _compute_scores(
213
223
  'error': error,
214
224
  'metric_time': (datetime.utcnow() - start).total_seconds(),
215
225
  })
216
- output['scores'] = scores # re-inject list to multiprocessing output
226
+ # re-inject list to multiprocessing output
227
+ output['scores'] = scores
217
228
 
218
229
  if compute_diagnostic_score:
219
230
  start = datetime.utcnow()
@@ -264,7 +275,8 @@ def _score(
264
275
  )
265
276
 
266
277
  output['dataset_size'] = get_size_of(data) / N_BYTES_IN_MB
267
- output['error'] = 'Synthesizer Timeout' # To be deleted if there is no error
278
+ # To be deleted if there is no error
279
+ output['error'] = 'Synthesizer Timeout'
268
280
  synthetic_data, train_time, sample_time, synthesizer_size, peak_memory = _synthesize(
269
281
  synthesizer, data.copy(), metadata
270
282
  )
@@ -283,7 +295,8 @@ def _score(
283
295
  used_memory(),
284
296
  )
285
297
 
286
- del output['error'] # No error so far. _compute_scores tracks its own errors by metric
298
+ # No error so far. _compute_scores tracks its own errors by metric
299
+ del output['error']
287
300
  _compute_scores(
288
301
  metrics,
289
302
  data,
@@ -314,6 +327,26 @@ def _score(
314
327
  return output
315
328
 
316
329
 
330
+ @contextmanager
331
+ def multiprocessing_context():
332
+ """Override multiprocessing ForkingPickler to use cloudpickle."""
333
+ original_dump = multiprocessing.reduction.ForkingPickler.dumps
334
+ original_load = multiprocessing.reduction.ForkingPickler.loads
335
+ original_method = multiprocessing.get_start_method()
336
+
337
+ multiprocessing.set_start_method('spawn', force=True)
338
+ multiprocessing.reduction.ForkingPickler.dumps = cloudpickle.dumps
339
+ multiprocessing.reduction.ForkingPickler.loads = cloudpickle.loads
340
+
341
+ try:
342
+ yield
343
+ finally:
344
+ # Restore original methods
345
+ multiprocessing.set_start_method(original_method, force=True)
346
+ multiprocessing.reduction.ForkingPickler.dumps = original_dump
347
+ multiprocessing.reduction.ForkingPickler.loads = original_load
348
+
349
+
317
350
  def _score_with_timeout(
318
351
  timeout,
319
352
  synthesizer,
@@ -325,32 +358,33 @@ def _score_with_timeout(
325
358
  modality=None,
326
359
  dataset_name=None,
327
360
  ):
328
- with multiprocessing.Manager() as manager:
329
- output = manager.dict()
330
- process = multiprocessing.Process(
331
- target=_score,
332
- args=(
333
- synthesizer,
334
- data,
335
- metadata,
336
- metrics,
337
- output,
338
- compute_quality_score,
339
- compute_diagnostic_score,
340
- modality,
341
- dataset_name,
342
- ),
343
- )
361
+ with multiprocessing_context():
362
+ with multiprocessing.Manager() as manager:
363
+ output = manager.dict()
364
+ process = multiprocessing.Process(
365
+ target=_score,
366
+ args=(
367
+ synthesizer,
368
+ data,
369
+ metadata,
370
+ metrics,
371
+ output,
372
+ compute_quality_score,
373
+ compute_diagnostic_score,
374
+ modality,
375
+ dataset_name,
376
+ ),
377
+ )
344
378
 
345
- process.start()
346
- process.join(timeout)
347
- process.terminate()
379
+ process.start()
380
+ process.join(timeout)
381
+ process.terminate()
348
382
 
349
- output = dict(output)
350
- if output.get('timeout'):
351
- LOGGER.error('Timeout running %s on dataset %s;', synthesizer['name'], dataset_name)
383
+ output = dict(output)
384
+ if output.get('timeout'):
385
+ LOGGER.error('Timeout running %s on dataset %s;', synthesizer['name'], dataset_name)
352
386
 
353
- return output
387
+ return output
354
388
 
355
389
 
356
390
  def _format_output(
@@ -756,6 +790,11 @@ def benchmark_single_table(
756
790
  'package_name': 'dask' or 'multiprocessing',
757
791
  'num_workers': 4
758
792
  }
793
+ run_on_ec2 (bool):
794
+ The flag is used to run the benchmark on an EC2 instance that will be created
795
+ by a scriptusing the authentication of the current user. The EC2 instance
796
+ uses the LATEST released version of sdgym. Local changes or changes NOT
797
+ in the released version will NOT be used in the EC2 instance.
759
798
 
760
799
  Returns:
761
800
  pandas.DataFrame:
@@ -124,7 +124,7 @@ def create_single_table_synthesizer(
124
124
  obj:
125
125
  The trained synthesizer.
126
126
  """
127
- return get_trained_synthesizer_fn(data, metadata)
127
+ return self.synthesizer_fn['get_trained_synthesizer_fn'](data, metadata)
128
128
 
129
129
  def sample_from_synthesizer(self, synthesizer, num_samples):
130
130
  """Sample the desired number of samples from the given synthesizer.
@@ -139,11 +139,22 @@ def create_single_table_synthesizer(
139
139
  pandas.DataFrame:
140
140
  The synthetic data.
141
141
  """
142
- return sample_from_synthesizer_fn(synthesizer, num_samples)
143
-
144
- NewSynthesizer.__name__ = f'Custom:{display_name}'
145
-
146
- return NewSynthesizer
142
+ return self.synthesizer_fn['sample_from_synthesizer_fn'](synthesizer, num_samples)
143
+
144
+ CustomSynthesizer = type(
145
+ f'Custom:{display_name}',
146
+ (NewSynthesizer,),
147
+ {
148
+ 'synthesizer_fn': {
149
+ 'get_trained_synthesizer_fn': get_trained_synthesizer_fn,
150
+ 'sample_from_synthesizer_fn': sample_from_synthesizer_fn,
151
+ },
152
+ },
153
+ )
154
+ CustomSynthesizer.__name__ = f'Custom:{display_name}'
155
+ CustomSynthesizer.__module__ = 'sdgym.synthesizers.generate'
156
+ globals()[f'Custom:{display_name}'] = CustomSynthesizer
157
+ return CustomSynthesizer
147
158
 
148
159
 
149
160
  def create_multi_table_synthesizer(
@@ -60,8 +60,12 @@ def get_synthesizers(synthesizers):
60
60
  else:
61
61
  raise SDGymError(f'Unknown synthesizer {synthesizer}') from None
62
62
 
63
+ if isinstance(synthesizer, type) or hasattr(synthesizer, '__name__'):
64
+ synthesizer_name = getattr(synthesizer, '__name__', 'undefined')
65
+ else:
66
+ synthesizer_name = getattr(type(synthesizer), '__name__', 'undefined')
63
67
  synthesizers_dicts.append({
64
- 'name': getattr(synthesizer, '__name__', 'undefined'),
68
+ 'name': synthesizer_name,
65
69
  'synthesizer': synthesizer,
66
70
  })
67
71
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sdgym
3
- Version: 0.9.0.dev0
3
+ Version: 0.9.1.dev0
4
4
  Summary: Benchmark tabular synthetic data generators using a variety of datasets
5
5
  Author-email: "DataCebo, Inc." <info@sdv.dev>
6
6
  License: BSL-1.1
@@ -27,6 +27,7 @@ License-File: LICENSE
27
27
  Requires-Dist: appdirs>=1.3
28
28
  Requires-Dist: boto3<2,>=1.28
29
29
  Requires-Dist: botocore<2,>=1.31
30
+ Requires-Dist: cloudpickle>=2.1.0
30
31
  Requires-Dist: compress-pickle>=1.2.0
31
32
  Requires-Dist: humanfriendly>=8.2
32
33
  Requires-Dist: numpy<2.0.0,>=1.21.0; python_version < "3.10"
@@ -1,6 +1,7 @@
1
1
  appdirs>=1.3
2
2
  boto3<2,>=1.28
3
3
  botocore<2,>=1.31
4
+ cloudpickle>=2.1.0
4
5
  compress-pickle>=1.2.0
5
6
  humanfriendly>=8.2
6
7
  psutil>=5.7
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes