sdgym 0.8.0.dev1__tar.gz → 0.9.0.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/PKG-INFO +4 -4
  2. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/pyproject.toml +23 -10
  3. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/__init__.py +1 -1
  4. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/benchmark.py +65 -11
  5. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/cli/__main__.py +4 -4
  6. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/cli/collect.py +2 -2
  7. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/cli/summary.py +7 -7
  8. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/cli/utils.py +1 -1
  9. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/synthesizers/__init__.py +2 -2
  10. sdgym-0.8.0.dev1/sdgym/synthesizers/independent.py → sdgym-0.9.0.dev0/sdgym/synthesizers/column.py +2 -2
  11. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym.egg-info/PKG-INFO +4 -4
  12. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym.egg-info/SOURCES.txt +1 -1
  13. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym.egg-info/requires.txt +3 -3
  14. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/LICENSE +0 -0
  15. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/README.md +0 -0
  16. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/cli/__init__.py +0 -0
  17. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/datasets.py +0 -0
  18. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/errors.py +0 -0
  19. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/metrics.py +0 -0
  20. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/progress.py +0 -0
  21. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/s3.py +0 -0
  22. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/synthesizers/base.py +0 -0
  23. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/synthesizers/generate.py +0 -0
  24. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/synthesizers/identity.py +0 -0
  25. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/synthesizers/sdv.py +0 -0
  26. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/synthesizers/uniform.py +0 -0
  27. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/utils.py +0 -0
  28. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym.egg-info/dependency_links.txt +0 -0
  29. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym.egg-info/entry_points.txt +0 -0
  30. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym.egg-info/top_level.txt +0 -0
  31. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/setup.cfg +0 -0
  32. {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/tests/test_tasks.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sdgym
3
- Version: 0.8.0.dev1
3
+ Version: 0.9.0.dev0
4
4
  Summary: Benchmark tabular synthetic data generators using a variety of datasets
5
5
  Author-email: "DataCebo, Inc." <info@sdv.dev>
6
6
  License: BSL-1.1
@@ -29,9 +29,9 @@ Requires-Dist: boto3<2,>=1.28
29
29
  Requires-Dist: botocore<2,>=1.31
30
30
  Requires-Dist: compress-pickle>=1.2.0
31
31
  Requires-Dist: humanfriendly>=8.2
32
- Requires-Dist: numpy>=1.21.0; python_version < "3.10"
33
- Requires-Dist: numpy<2,>=1.23.3; python_version >= "3.10" and python_version < "3.12"
34
- Requires-Dist: numpy<2,>=1.26.0; python_version >= "3.12"
32
+ Requires-Dist: numpy<2.0.0,>=1.21.0; python_version < "3.10"
33
+ Requires-Dist: numpy<2.0.0,>=1.23.3; python_version >= "3.10" and python_version < "3.12"
34
+ Requires-Dist: numpy<2.0.0,>=1.26.0; python_version >= "3.12"
35
35
  Requires-Dist: pandas>=1.4.0; python_version < "3.11"
36
36
  Requires-Dist: pandas>=1.5.0; python_version >= "3.11" and python_version < "3.12"
37
37
  Requires-Dist: pandas>=2.1.1; python_version >= "3.12"
@@ -26,9 +26,9 @@ dependencies = [
26
26
  'botocore>=1.31,<2',
27
27
  'compress-pickle>=1.2.0',
28
28
  'humanfriendly>=8.2',
29
- "numpy>=1.21.0;python_version<'3.10'",
30
- "numpy>=1.23.3,<2;python_version>='3.10' and python_version<'3.12'",
31
- "numpy>=1.26.0,<2;python_version>='3.12'",
29
+ "numpy>=1.21.0,<2.0.0;python_version<'3.10'",
30
+ "numpy>=1.23.3,<2.0.0;python_version>='3.10' and python_version<'3.12'",
31
+ "numpy>=1.26.0,<2.0.0;python_version>='3.12'",
32
32
  "pandas>=1.4.0;python_version<'3.11'",
33
33
  "pandas>=1.5.0;python_version>='3.11' and python_version<'3.12'",
34
34
  "pandas>=2.1.1;python_version>='3.12'",
@@ -133,7 +133,7 @@ namespaces = false
133
133
  version = {attr = 'sdgym.__version__'}
134
134
 
135
135
  [tool.bumpversion]
136
- current_version = "0.8.0.dev1"
136
+ current_version = "0.9.0.dev0"
137
137
  parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
138
138
  serialize = [
139
139
  '{major}.{minor}.{patch}.{release}{candidate}',
@@ -179,7 +179,8 @@ exclude = [
179
179
  ".tox",
180
180
  ".git",
181
181
  "__pycache__",
182
- ".ipynb_checkpoints"
182
+ ".ipynb_checkpoints",
183
+ "tasks.py",
183
184
  ]
184
185
 
185
186
  [tool.ruff.lint]
@@ -189,14 +190,22 @@ select = [
189
190
  # Pycodestyle
190
191
  "E",
191
192
  "W",
192
- "D200",
193
+ # pydocstyle
194
+ "D",
193
195
  # isort
194
196
  "I001",
197
+ # print statements
198
+ "T201",
199
+ # pandas-vet
200
+ "PD"
195
201
  ]
196
202
  ignore = [
197
203
  "E501",
204
+ # pydocstyle
198
205
  "D107", # Missing docstring in __init__
199
206
  "D417", # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449
207
+ "PD901",
208
+ "PD101",
200
209
  ]
201
210
 
202
211
  [tool.ruff.format]
@@ -206,14 +215,18 @@ preview = true
206
215
  docstring-code-format = true
207
216
  docstring-code-line-length = "dynamic"
208
217
 
209
- [tool.ruff.lint.pep8-naming]
210
- extend-ignore-names = ["X", "C", "X_padded", "Y", "Y_padded"]
211
-
212
218
  [tool.ruff.lint.isort]
213
219
  known-first-party = ["sdgym"]
220
+ lines-between-types = 0
214
221
 
215
222
  [tool.ruff.lint.per-file-ignores]
216
223
  "__init__.py" = ["F401", "E402", "F403", "F405", "E501", "I001"]
224
+ "errors.py" = ["D105"]
225
+ "tests/**.py" = ["D"]
217
226
 
218
227
  [tool.ruff.lint.pydocstyle]
219
- convention = "google"
228
+ convention = "google"
229
+
230
+ [tool.ruff.lint.pycodestyle]
231
+ max-doc-length = 100
232
+ max-line-length = 100
@@ -8,7 +8,7 @@ __author__ = 'DataCebo, Inc.'
8
8
  __copyright__ = 'Copyright (c) 2022 DataCebo, Inc.'
9
9
  __email__ = 'info@sdv.dev'
10
10
  __license__ = 'BSL-1.1'
11
- __version__ = '0.8.0.dev1'
11
+ __version__ = '0.9.0.dev0'
12
12
 
13
13
  import logging
14
14
 
@@ -15,8 +15,18 @@ import compress_pickle
15
15
  import numpy as np
16
16
  import pandas as pd
17
17
  import tqdm
18
- from sdmetrics.reports.multi_table import QualityReport as MultiTableQualityReport
19
- from sdmetrics.reports.single_table import QualityReport as SingleTableQualityReport
18
+ from sdmetrics.reports.multi_table import (
19
+ DiagnosticReport as MultiTableDiagnosticReport,
20
+ )
21
+ from sdmetrics.reports.multi_table import (
22
+ QualityReport as MultiTableQualityReport,
23
+ )
24
+ from sdmetrics.reports.single_table import (
25
+ DiagnosticReport as SingleTableDiagnosticReport,
26
+ )
27
+ from sdmetrics.reports.single_table import (
28
+ QualityReport as SingleTableQualityReport,
29
+ )
20
30
 
21
31
  from sdgym.datasets import get_dataset_paths, load_dataset
22
32
  from sdgym.errors import SDGymError
@@ -88,6 +98,7 @@ def _generate_job_args_list(
88
98
  detailed_results_folder,
89
99
  timeout,
90
100
  compute_quality_score,
101
+ compute_diagnostic_score,
91
102
  synthesizers,
92
103
  custom_synthesizers,
93
104
  ):
@@ -124,6 +135,7 @@ def _generate_job_args_list(
124
135
  detailed_results_folder,
125
136
  timeout,
126
137
  compute_quality_score,
138
+ compute_diagnostic_score,
127
139
  dataset.name,
128
140
  'single_table',
129
141
  )
@@ -164,6 +176,7 @@ def _compute_scores(
164
176
  metadata,
165
177
  output,
166
178
  compute_quality_score,
179
+ compute_diagnostic_score,
167
180
  modality,
168
181
  dataset_name,
169
182
  ):
@@ -202,6 +215,17 @@ def _compute_scores(
202
215
  })
203
216
  output['scores'] = scores # re-inject list to multiprocessing output
204
217
 
218
+ if compute_diagnostic_score:
219
+ start = datetime.utcnow()
220
+ if modality == 'single_table':
221
+ diagnostic_report = SingleTableDiagnosticReport()
222
+ else:
223
+ diagnostic_report = MultiTableDiagnosticReport()
224
+
225
+ diagnostic_report.generate(real_data, synthetic_data, metadata, verbose=False)
226
+ output['diagnostic_score_time'] = (datetime.utcnow() - start).total_seconds()
227
+ output['diagnostic_score'] = diagnostic_report.get_score()
228
+
205
229
  if compute_quality_score:
206
230
  start = datetime.utcnow()
207
231
  if modality == 'single_table':
@@ -221,6 +245,7 @@ def _score(
221
245
  metrics,
222
246
  output=None,
223
247
  compute_quality_score=False,
248
+ compute_diagnostic_score=False,
224
249
  modality=None,
225
250
  dataset_name=None,
226
251
  ):
@@ -266,6 +291,7 @@ def _score(
266
291
  metadata,
267
292
  output,
268
293
  compute_quality_score,
294
+ compute_diagnostic_score,
269
295
  modality,
270
296
  dataset_name,
271
297
  )
@@ -295,6 +321,7 @@ def _score_with_timeout(
295
321
  metadata,
296
322
  metrics,
297
323
  compute_quality_score=False,
324
+ compute_diagnostic_score=False,
298
325
  modality=None,
299
326
  dataset_name=None,
300
327
  ):
@@ -309,6 +336,7 @@ def _score_with_timeout(
309
336
  metrics,
310
337
  output,
311
338
  compute_quality_score,
339
+ compute_diagnostic_score,
312
340
  modality,
313
341
  dataset_name,
314
342
  ),
@@ -325,15 +353,26 @@ def _score_with_timeout(
325
353
  return output
326
354
 
327
355
 
328
- def _format_output(output, name, dataset_name, compute_quality_score, cache_dir):
329
- evaluate_time = None
330
- if 'scores' in output or 'quality_score_time' in output:
331
- evaluate_time = output.get('quality_score_time', 0)
356
+ def _format_output(
357
+ output, name, dataset_name, compute_quality_score, compute_diagnostic_score, cache_dir
358
+ ):
359
+ evaluate_time = 0
360
+ if 'quality_score_time' in output:
361
+ evaluate_time += output.get('quality_score_time', 0)
362
+ if 'diagnostic_score_time' in output:
363
+ evaluate_time += output.get('diagnostic_score_time', 0)
332
364
 
333
365
  for score in output.get('scores', []):
334
- if score['metric'] == 'NewRowSynthesis':
366
+ if 'metric_time' in score and not np.isnan(score['metric_time']):
335
367
  evaluate_time += score['metric_time']
336
368
 
369
+ if (
370
+ 'quality_score_time' not in output
371
+ and 'scores' not in output
372
+ and 'diagnostic_score_time' not in output
373
+ ):
374
+ evaluate_time = None
375
+
337
376
  scores = pd.DataFrame({
338
377
  'Synthesizer': [name],
339
378
  'Dataset': [dataset_name],
@@ -345,6 +384,9 @@ def _format_output(output, name, dataset_name, compute_quality_score, cache_dir)
345
384
  'Evaluate_Time': [evaluate_time],
346
385
  })
347
386
 
387
+ if compute_diagnostic_score:
388
+ scores.insert(len(scores.columns), 'Diagnostic_Score', output.get('diagnostic_score'))
389
+
348
390
  if compute_quality_score:
349
391
  scores.insert(len(scores.columns), 'Quality_Score', output.get('quality_score'))
350
392
 
@@ -381,6 +423,7 @@ def _run_job(args):
381
423
  cache_dir,
382
424
  timeout,
383
425
  compute_quality_score,
426
+ compute_diagnostic_score,
384
427
  dataset_name,
385
428
  modality,
386
429
  ) = args
@@ -404,6 +447,7 @@ def _run_job(args):
404
447
  metadata=metadata,
405
448
  metrics=metrics,
406
449
  compute_quality_score=compute_quality_score,
450
+ compute_diagnostic_score=compute_diagnostic_score,
407
451
  modality=modality,
408
452
  dataset_name=dataset_name,
409
453
  )
@@ -414,13 +458,16 @@ def _run_job(args):
414
458
  metadata=metadata,
415
459
  metrics=metrics,
416
460
  compute_quality_score=compute_quality_score,
461
+ compute_diagnostic_score=compute_diagnostic_score,
417
462
  modality=modality,
418
463
  dataset_name=dataset_name,
419
464
  )
420
465
  except Exception as error:
421
466
  output['exception'] = error
422
467
 
423
- scores = _format_output(output, name, dataset_name, compute_quality_score, cache_dir)
468
+ scores = _format_output(
469
+ output, name, dataset_name, compute_quality_score, compute_diagnostic_score, cache_dir
470
+ )
424
471
 
425
472
  return scores
426
473
 
@@ -482,7 +529,7 @@ def _run_jobs(multi_processing_config, job_args_list, show_progress):
482
529
  return scores
483
530
 
484
531
 
485
- def _get_empty_dataframe(compute_quality_score, sdmetrics):
532
+ def _get_empty_dataframe(compute_diagnostic_score, compute_quality_score, sdmetrics):
486
533
  warnings.warn('No datasets/synthesizers found.')
487
534
 
488
535
  scores = pd.DataFrame({
@@ -496,6 +543,8 @@ def _get_empty_dataframe(compute_quality_score, sdmetrics):
496
543
  'Evaluate_Time': [],
497
544
  })
498
545
 
546
+ if compute_diagnostic_score:
547
+ scores['Diagnostic_Score'] = []
499
548
  if compute_quality_score:
500
549
  scores['Quality_Score'] = []
501
550
  if sdmetrics:
@@ -564,7 +613,7 @@ from io import StringIO
564
613
  import sdgym
565
614
  from sdgym.synthesizers.sdv import (CopulaGANSynthesizer, CTGANSynthesizer,
566
615
  GaussianCopulaSynthesizer, HMASynthesizer, PARSynthesizer, SDVRelationalSynthesizer,
567
- SDVTabularSynthesizer,TVAESynthesizer)
616
+ SDVTabularSynthesizer, TVAESynthesizer)
568
617
 
569
618
  results = sdgym.benchmark_single_table(
570
619
  {synthesizer_string}, custom_synthesizers={params['custom_synthesizers']},
@@ -572,6 +621,7 @@ results = sdgym.benchmark_single_table(
572
621
  additional_datasets_folder={params['additional_datasets_folder']},
573
622
  limit_dataset_size={params['limit_dataset_size']},
574
623
  compute_quality_score={params['compute_quality_score']},
624
+ compute_diagnostic_score={params['compute_diagnostic_score']},
575
625
  sdmetrics={params['sdmetrics']}, timeout={params['timeout']},
576
626
  detailed_results_folder={params['detailed_results_folder']},
577
627
  multi_processing_config={params['multi_processing_config']}
@@ -643,6 +693,7 @@ def benchmark_single_table(
643
693
  additional_datasets_folder=None,
644
694
  limit_dataset_size=False,
645
695
  compute_quality_score=True,
696
+ compute_diagnostic_score=True,
646
697
  sdmetrics=DEFAULT_METRICS,
647
698
  timeout=None,
648
699
  output_filepath=None,
@@ -680,6 +731,8 @@ def benchmark_single_table(
680
731
  columns.
681
732
  compute_quality_score (bool):
682
733
  Whether or not to evaluate an overall quality score.
734
+ compute_diagnostic_score (bool):
735
+ Whether or not to evaluate an overall diagnostic score.
683
736
  sdmetrics (list[str]):
684
737
  A list of the different SDMetrics to use. If you'd like to input specific parameters
685
738
  into the metric, provide a tuple with the metric name followed by a dictionary of
@@ -729,6 +782,7 @@ def benchmark_single_table(
729
782
  detailed_results_folder,
730
783
  timeout,
731
784
  compute_quality_score,
785
+ compute_diagnostic_score,
732
786
  synthesizers,
733
787
  custom_synthesizers,
734
788
  )
@@ -738,7 +792,7 @@ def benchmark_single_table(
738
792
 
739
793
  # If no synthesizers/datasets are passed, return an empty dataframe
740
794
  else:
741
- scores = _get_empty_dataframe(compute_quality_score, sdmetrics)
795
+ scores = _get_empty_dataframe(compute_diagnostic_score, compute_quality_score, sdmetrics)
742
796
 
743
797
  if output_filepath:
744
798
  write_csv(scores, output_filepath, None, None)
@@ -41,13 +41,13 @@ def _print_table(data, sort=None, reverse=False, format=None):
41
41
 
42
42
  if 'error' in data:
43
43
  error = data['error']
44
- if pd.isnull(error).all():
44
+ if pd.isna(error).all():
45
45
  del data['error']
46
46
  else:
47
47
  long_error = error.str.len() > 30
48
48
  data.loc[long_error, 'error'] = error[long_error].str[:30] + '...'
49
49
 
50
- print(tabulate.tabulate(data, tablefmt='github', headers=data.columns, showindex=False))
50
+ print(tabulate.tabulate(data, tablefmt='github', headers=data.columns, showindex=False)) # noqa: T201
51
51
 
52
52
 
53
53
  def _run(args):
@@ -110,7 +110,7 @@ def _download_datasets(args):
110
110
  def _list_downloaded(args):
111
111
  datasets = sdgym.cli.utils.get_downloaded_datasets(args.datasets_path)
112
112
  _print_table(datasets, args.sort, args.reverse, {'size': humanfriendly.format_size})
113
- print(f'Found {len(datasets)} downloaded datasets')
113
+ print(f'Found {len(datasets)} downloaded datasets') # noqa: T201
114
114
 
115
115
 
116
116
  def _list_available(args):
@@ -395,7 +395,7 @@ def main():
395
395
  try:
396
396
  args.action(args)
397
397
  except sdgym.errors.SDGymError as error:
398
- print(f'ERROR: {error}')
398
+ print(f'ERROR: {error}') # noqa: T201
399
399
 
400
400
 
401
401
  if __name__ == '__main__':
@@ -22,7 +22,7 @@ def collect_results(input_path, output_file=None, aws_key=None, aws_secret=None)
22
22
  If an ``aws_secret`` is provided, the given secret access key will be used to read
23
23
  from and/or write to any s3 paths.
24
24
  """
25
- print(f'Reading results from {input_path}')
25
+ print(f'Reading results from {input_path}') # noqa: T201
26
26
  scores = read_csv_from_path(input_path, aws_key, aws_secret)
27
27
  scores = scores.drop_duplicates()
28
28
 
@@ -31,5 +31,5 @@ def collect_results(input_path, output_file=None, aws_key=None, aws_secret=None)
31
31
  else:
32
32
  output = f'{input_path}/results.csv'
33
33
 
34
- print(f'Storing results at {output}')
34
+ print(f'Storing results at {output}') # noqa: T201
35
35
  write_csv(scores, output, aws_key, aws_secret)
@@ -15,7 +15,7 @@ KNOWN_ERRORS = (
15
15
  )
16
16
 
17
17
  MODALITY_BASELINES = {
18
- 'single-table': ['Uniform', 'Independent', 'CLBN', 'PrivBN'],
18
+ 'single-table': ['Uniform', 'Column', 'CLBN', 'PrivBN'],
19
19
  'multi-table': ['Uniform', 'Independent'],
20
20
  'timeseries': [],
21
21
  }
@@ -46,7 +46,7 @@ def preprocess(data):
46
46
 
47
47
  def _coverage(data):
48
48
  total = len(data.Dataset.unique())
49
- scores = data.groupby('Synthesizer').apply(lambda x: x.Quality_Score.notnull().sum())
49
+ scores = data.groupby('Synthesizer').apply(lambda x: x.Quality_Score.notna().sum())
50
50
  coverage_perc = scores / total
51
51
  coverage_str = scores.astype(str) + f' / {total}'
52
52
  return coverage_perc, coverage_str
@@ -102,7 +102,7 @@ def summarize(data, baselines=(), datasets=None):
102
102
  no_identity = data[data.Synthesizer != 'DataIdentity']
103
103
 
104
104
  coverage_perc, coverage_str = _coverage(data)
105
- solved = data.groupby('Synthesizer').apply(lambda x: x.Quality_Score.notnull().sum())
105
+ solved = data.groupby('Synthesizer').apply(lambda x: x.Quality_Score.notna().sum())
106
106
 
107
107
  results = {
108
108
  'total': len(data.Dataset.unique()),
@@ -127,7 +127,7 @@ def summarize(data, baselines=(), datasets=None):
127
127
  for _, error_column in KNOWN_ERRORS:
128
128
  results[error_column] = grouped[error_column].sum()
129
129
 
130
- results['errors'] = grouped.error.apply(lambda x: x.notnull().sum())
130
+ results['errors'] = grouped.error.apply(lambda x: x.notna().sum())
131
131
  total_errors = results['errors']
132
132
  results['metric_errors'] = results['total'] - results['solved'] - total_errors
133
133
 
@@ -160,7 +160,7 @@ def errors_summary(data):
160
160
  """
161
161
  if 'error' in data.columns:
162
162
  all_errors = pd.DataFrame(_error_counts(data)).rename(columns={'error': 'all'})
163
- synthesizer_errors = data.groupby('Synthesizer').apply(_error_counts).unstack(level=0)
163
+ synthesizer_errors = data.groupby('Synthesizer').apply(_error_counts).pivot_table(level=0)
164
164
  for synthesizer, errors in synthesizer_errors.items():
165
165
  all_errors[synthesizer] = errors.fillna(0).astype(int)
166
166
 
@@ -217,7 +217,7 @@ def _find_library(synthesizer):
217
217
 
218
218
  def _add_summary_libraries(summary_data):
219
219
  summary_data['library'] = summary_data.index.map(_find_library)
220
- summary_data['library'].fillna('Other', inplace=True)
220
+ summary_data['library'] = summary_data['library'].fillna('Other')
221
221
  return summary_data
222
222
 
223
223
 
@@ -240,7 +240,7 @@ def _add_summary(data, modality, baselines, writer):
240
240
  },
241
241
  axis=1,
242
242
  )
243
- summary.drop(index='Identity', inplace=True, errors='ignore')
243
+ summary = summary.drop(index='Identity', errors='ignore')
244
244
  summary = _add_summary_libraries(summary)
245
245
 
246
246
  beat_baseline_headers = ['beat_' + b.lower() for b in baselines]
@@ -67,7 +67,7 @@ def read_csv_from_path(path, aws_key, aws_secret):
67
67
  All csv content within a path will be read and returned in a
68
68
  DataFrame. The path can be either local or an s3 directory.
69
69
 
70
- args:
70
+ Args:
71
71
  path (str):
72
72
  The path to read from, which can be either local or an s3 path.
73
73
  aws_key (str):
@@ -8,7 +8,7 @@ from sdgym.synthesizers.generate import (
8
8
  create_single_table_synthesizer,
9
9
  )
10
10
  from sdgym.synthesizers.identity import DataIdentity
11
- from sdgym.synthesizers.independent import IndependentSynthesizer
11
+ from sdgym.synthesizers.column import ColumnSynthesizer
12
12
  from sdgym.synthesizers.sdv import (
13
13
  CopulaGANSynthesizer,
14
14
  CTGANSynthesizer,
@@ -23,7 +23,7 @@ from sdgym.synthesizers.uniform import UniformSynthesizer
23
23
 
24
24
  __all__ = (
25
25
  'DataIdentity',
26
- 'IndependentSynthesizer',
26
+ 'ColumnSynthesizer',
27
27
  'CTGANSynthesizer',
28
28
  'TVAESynthesizer',
29
29
  'UniformSynthesizer',
@@ -1,4 +1,4 @@
1
- """IndependentSynthesizer module."""
1
+ """ColumnSynthesizer module."""
2
2
 
3
3
  import pandas as pd
4
4
  from rdt.hyper_transformer import HyperTransformer
@@ -7,7 +7,7 @@ from sklearn.mixture import GaussianMixture
7
7
  from sdgym.synthesizers.base import BaselineSynthesizer
8
8
 
9
9
 
10
- class IndependentSynthesizer(BaselineSynthesizer):
10
+ class ColumnSynthesizer(BaselineSynthesizer):
11
11
  """Synthesizer that learns each column independently.
12
12
 
13
13
  Categorical columns are sampled using empirical frequencies.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sdgym
3
- Version: 0.8.0.dev1
3
+ Version: 0.9.0.dev0
4
4
  Summary: Benchmark tabular synthetic data generators using a variety of datasets
5
5
  Author-email: "DataCebo, Inc." <info@sdv.dev>
6
6
  License: BSL-1.1
@@ -29,9 +29,9 @@ Requires-Dist: boto3<2,>=1.28
29
29
  Requires-Dist: botocore<2,>=1.31
30
30
  Requires-Dist: compress-pickle>=1.2.0
31
31
  Requires-Dist: humanfriendly>=8.2
32
- Requires-Dist: numpy>=1.21.0; python_version < "3.10"
33
- Requires-Dist: numpy<2,>=1.23.3; python_version >= "3.10" and python_version < "3.12"
34
- Requires-Dist: numpy<2,>=1.26.0; python_version >= "3.12"
32
+ Requires-Dist: numpy<2.0.0,>=1.21.0; python_version < "3.10"
33
+ Requires-Dist: numpy<2.0.0,>=1.23.3; python_version >= "3.10" and python_version < "3.12"
34
+ Requires-Dist: numpy<2.0.0,>=1.26.0; python_version >= "3.12"
35
35
  Requires-Dist: pandas>=1.4.0; python_version < "3.11"
36
36
  Requires-Dist: pandas>=1.5.0; python_version >= "3.11" and python_version < "3.12"
37
37
  Requires-Dist: pandas>=2.1.1; python_version >= "3.12"
@@ -22,9 +22,9 @@ sdgym/cli/summary.py
22
22
  sdgym/cli/utils.py
23
23
  sdgym/synthesizers/__init__.py
24
24
  sdgym/synthesizers/base.py
25
+ sdgym/synthesizers/column.py
25
26
  sdgym/synthesizers/generate.py
26
27
  sdgym/synthesizers/identity.py
27
- sdgym/synthesizers/independent.py
28
28
  sdgym/synthesizers/sdv.py
29
29
  sdgym/synthesizers/uniform.py
30
30
  tests/test_tasks.py
@@ -12,7 +12,7 @@ sdmetrics>=0.14.1
12
12
  sdv>=1.13.1
13
13
 
14
14
  [:python_version < "3.10"]
15
- numpy>=1.21.0
15
+ numpy<2.0.0,>=1.21.0
16
16
  scikit-learn>=1.0.2
17
17
  scipy>=1.7.3
18
18
  torch>=1.9.0
@@ -24,7 +24,7 @@ pandas>=1.4.0
24
24
  scikit-learn>=1.1.0
25
25
 
26
26
  [:python_version >= "3.10" and python_version < "3.12"]
27
- numpy<2,>=1.23.3
27
+ numpy<2.0.0,>=1.23.3
28
28
  scipy>=1.9.2
29
29
  torch>=2.0.0
30
30
 
@@ -33,7 +33,7 @@ pandas>=1.5.0
33
33
  scikit-learn>=1.1.3
34
34
 
35
35
  [:python_version >= "3.12"]
36
- numpy<2,>=1.26.0
36
+ numpy<2.0.0,>=1.26.0
37
37
  pandas>=2.1.1
38
38
  scikit-learn>=1.3.1
39
39
  scipy>=1.12.0
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes