sdgym 0.8.0.dev1__tar.gz → 0.9.0.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/PKG-INFO +4 -4
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/pyproject.toml +23 -10
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/__init__.py +1 -1
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/benchmark.py +65 -11
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/cli/__main__.py +4 -4
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/cli/collect.py +2 -2
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/cli/summary.py +7 -7
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/cli/utils.py +1 -1
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/synthesizers/__init__.py +2 -2
- sdgym-0.8.0.dev1/sdgym/synthesizers/independent.py → sdgym-0.9.0.dev0/sdgym/synthesizers/column.py +2 -2
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym.egg-info/PKG-INFO +4 -4
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym.egg-info/SOURCES.txt +1 -1
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym.egg-info/requires.txt +3 -3
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/LICENSE +0 -0
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/README.md +0 -0
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/cli/__init__.py +0 -0
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/datasets.py +0 -0
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/errors.py +0 -0
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/metrics.py +0 -0
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/progress.py +0 -0
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/s3.py +0 -0
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/synthesizers/base.py +0 -0
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/synthesizers/generate.py +0 -0
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/synthesizers/identity.py +0 -0
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/synthesizers/sdv.py +0 -0
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/synthesizers/uniform.py +0 -0
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/utils.py +0 -0
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym.egg-info/dependency_links.txt +0 -0
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym.egg-info/entry_points.txt +0 -0
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym.egg-info/top_level.txt +0 -0
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/setup.cfg +0 -0
- {sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/tests/test_tasks.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: sdgym
|
|
3
|
-
Version: 0.8.0.dev1
|
|
3
|
+
Version: 0.9.0.dev0
|
|
4
4
|
Summary: Benchmark tabular synthetic data generators using a variety of datasets
|
|
5
5
|
Author-email: "DataCebo, Inc." <info@sdv.dev>
|
|
6
6
|
License: BSL-1.1
|
|
@@ -29,9 +29,9 @@ Requires-Dist: boto3<2,>=1.28
|
|
|
29
29
|
Requires-Dist: botocore<2,>=1.31
|
|
30
30
|
Requires-Dist: compress-pickle>=1.2.0
|
|
31
31
|
Requires-Dist: humanfriendly>=8.2
|
|
32
|
-
Requires-Dist: numpy>=1.21.0; python_version < "3.10"
|
|
33
|
-
Requires-Dist: numpy<2,>=1.23.3; python_version >= "3.10" and python_version < "3.12"
|
|
34
|
-
Requires-Dist: numpy<2,>=1.26.0; python_version >= "3.12"
|
|
32
|
+
Requires-Dist: numpy<2.0.0,>=1.21.0; python_version < "3.10"
|
|
33
|
+
Requires-Dist: numpy<2.0.0,>=1.23.3; python_version >= "3.10" and python_version < "3.12"
|
|
34
|
+
Requires-Dist: numpy<2.0.0,>=1.26.0; python_version >= "3.12"
|
|
35
35
|
Requires-Dist: pandas>=1.4.0; python_version < "3.11"
|
|
36
36
|
Requires-Dist: pandas>=1.5.0; python_version >= "3.11" and python_version < "3.12"
|
|
37
37
|
Requires-Dist: pandas>=2.1.1; python_version >= "3.12"
|
|
@@ -26,9 +26,9 @@ dependencies = [
|
|
|
26
26
|
'botocore>=1.31,<2',
|
|
27
27
|
'compress-pickle>=1.2.0',
|
|
28
28
|
'humanfriendly>=8.2',
|
|
29
|
-
"numpy>=1.21.0;python_version<'3.10'",
|
|
30
|
-
"numpy>=1.23.3,<2;python_version>='3.10' and python_version<'3.12'",
|
|
31
|
-
"numpy>=1.26.0,<2;python_version>='3.12'",
|
|
29
|
+
"numpy>=1.21.0,<2.0.0;python_version<'3.10'",
|
|
30
|
+
"numpy>=1.23.3,<2.0.0;python_version>='3.10' and python_version<'3.12'",
|
|
31
|
+
"numpy>=1.26.0,<2.0.0;python_version>='3.12'",
|
|
32
32
|
"pandas>=1.4.0;python_version<'3.11'",
|
|
33
33
|
"pandas>=1.5.0;python_version>='3.11' and python_version<'3.12'",
|
|
34
34
|
"pandas>=2.1.1;python_version>='3.12'",
|
|
@@ -133,7 +133,7 @@ namespaces = false
|
|
|
133
133
|
version = {attr = 'sdgym.__version__'}
|
|
134
134
|
|
|
135
135
|
[tool.bumpversion]
|
|
136
|
-
current_version = "0.8.0.dev1"
|
|
136
|
+
current_version = "0.9.0.dev0"
|
|
137
137
|
parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
|
|
138
138
|
serialize = [
|
|
139
139
|
'{major}.{minor}.{patch}.{release}{candidate}',
|
|
@@ -179,7 +179,8 @@ exclude = [
|
|
|
179
179
|
".tox",
|
|
180
180
|
".git",
|
|
181
181
|
"__pycache__",
|
|
182
|
-
".ipynb_checkpoints"
|
|
182
|
+
".ipynb_checkpoints",
|
|
183
|
+
"tasks.py",
|
|
183
184
|
]
|
|
184
185
|
|
|
185
186
|
[tool.ruff.lint]
|
|
@@ -189,14 +190,22 @@ select = [
|
|
|
189
190
|
# Pycodestyle
|
|
190
191
|
"E",
|
|
191
192
|
"W",
|
|
192
|
-
|
|
193
|
+
# pydocstyle
|
|
194
|
+
"D",
|
|
193
195
|
# isort
|
|
194
196
|
"I001",
|
|
197
|
+
# print statements
|
|
198
|
+
"T201",
|
|
199
|
+
# pandas-vet
|
|
200
|
+
"PD"
|
|
195
201
|
]
|
|
196
202
|
ignore = [
|
|
197
203
|
"E501",
|
|
204
|
+
# pydocstyle
|
|
198
205
|
"D107", # Missing docstring in __init__
|
|
199
206
|
"D417", # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449
|
|
207
|
+
"PD901",
|
|
208
|
+
"PD101",
|
|
200
209
|
]
|
|
201
210
|
|
|
202
211
|
[tool.ruff.format]
|
|
@@ -206,14 +215,18 @@ preview = true
|
|
|
206
215
|
docstring-code-format = true
|
|
207
216
|
docstring-code-line-length = "dynamic"
|
|
208
217
|
|
|
209
|
-
[tool.ruff.lint.pep8-naming]
|
|
210
|
-
extend-ignore-names = ["X", "C", "X_padded", "Y", "Y_padded"]
|
|
211
|
-
|
|
212
218
|
[tool.ruff.lint.isort]
|
|
213
219
|
known-first-party = ["sdgym"]
|
|
220
|
+
lines-between-types = 0
|
|
214
221
|
|
|
215
222
|
[tool.ruff.lint.per-file-ignores]
|
|
216
223
|
"__init__.py" = ["F401", "E402", "F403", "F405", "E501", "I001"]
|
|
224
|
+
"errors.py" = ["D105"]
|
|
225
|
+
"tests/**.py" = ["D"]
|
|
217
226
|
|
|
218
227
|
[tool.ruff.lint.pydocstyle]
|
|
219
|
-
convention = "google"
|
|
228
|
+
convention = "google"
|
|
229
|
+
|
|
230
|
+
[tool.ruff.lint.pycodestyle]
|
|
231
|
+
max-doc-length = 100
|
|
232
|
+
max-line-length = 100
|
|
@@ -15,8 +15,18 @@ import compress_pickle
|
|
|
15
15
|
import numpy as np
|
|
16
16
|
import pandas as pd
|
|
17
17
|
import tqdm
|
|
18
|
-
from sdmetrics.reports.multi_table import QualityReport as MultiTableQualityReport
|
|
19
|
-
|
|
18
|
+
from sdmetrics.reports.multi_table import (
|
|
19
|
+
DiagnosticReport as MultiTableDiagnosticReport,
|
|
20
|
+
)
|
|
21
|
+
from sdmetrics.reports.multi_table import (
|
|
22
|
+
QualityReport as MultiTableQualityReport,
|
|
23
|
+
)
|
|
24
|
+
from sdmetrics.reports.single_table import (
|
|
25
|
+
DiagnosticReport as SingleTableDiagnosticReport,
|
|
26
|
+
)
|
|
27
|
+
from sdmetrics.reports.single_table import (
|
|
28
|
+
QualityReport as SingleTableQualityReport,
|
|
29
|
+
)
|
|
20
30
|
|
|
21
31
|
from sdgym.datasets import get_dataset_paths, load_dataset
|
|
22
32
|
from sdgym.errors import SDGymError
|
|
@@ -88,6 +98,7 @@ def _generate_job_args_list(
|
|
|
88
98
|
detailed_results_folder,
|
|
89
99
|
timeout,
|
|
90
100
|
compute_quality_score,
|
|
101
|
+
compute_diagnostic_score,
|
|
91
102
|
synthesizers,
|
|
92
103
|
custom_synthesizers,
|
|
93
104
|
):
|
|
@@ -124,6 +135,7 @@ def _generate_job_args_list(
|
|
|
124
135
|
detailed_results_folder,
|
|
125
136
|
timeout,
|
|
126
137
|
compute_quality_score,
|
|
138
|
+
compute_diagnostic_score,
|
|
127
139
|
dataset.name,
|
|
128
140
|
'single_table',
|
|
129
141
|
)
|
|
@@ -164,6 +176,7 @@ def _compute_scores(
|
|
|
164
176
|
metadata,
|
|
165
177
|
output,
|
|
166
178
|
compute_quality_score,
|
|
179
|
+
compute_diagnostic_score,
|
|
167
180
|
modality,
|
|
168
181
|
dataset_name,
|
|
169
182
|
):
|
|
@@ -202,6 +215,17 @@ def _compute_scores(
|
|
|
202
215
|
})
|
|
203
216
|
output['scores'] = scores # re-inject list to multiprocessing output
|
|
204
217
|
|
|
218
|
+
if compute_diagnostic_score:
|
|
219
|
+
start = datetime.utcnow()
|
|
220
|
+
if modality == 'single_table':
|
|
221
|
+
diagnostic_report = SingleTableDiagnosticReport()
|
|
222
|
+
else:
|
|
223
|
+
diagnostic_report = MultiTableDiagnosticReport()
|
|
224
|
+
|
|
225
|
+
diagnostic_report.generate(real_data, synthetic_data, metadata, verbose=False)
|
|
226
|
+
output['diagnostic_score_time'] = (datetime.utcnow() - start).total_seconds()
|
|
227
|
+
output['diagnostic_score'] = diagnostic_report.get_score()
|
|
228
|
+
|
|
205
229
|
if compute_quality_score:
|
|
206
230
|
start = datetime.utcnow()
|
|
207
231
|
if modality == 'single_table':
|
|
@@ -221,6 +245,7 @@ def _score(
|
|
|
221
245
|
metrics,
|
|
222
246
|
output=None,
|
|
223
247
|
compute_quality_score=False,
|
|
248
|
+
compute_diagnostic_score=False,
|
|
224
249
|
modality=None,
|
|
225
250
|
dataset_name=None,
|
|
226
251
|
):
|
|
@@ -266,6 +291,7 @@ def _score(
|
|
|
266
291
|
metadata,
|
|
267
292
|
output,
|
|
268
293
|
compute_quality_score,
|
|
294
|
+
compute_diagnostic_score,
|
|
269
295
|
modality,
|
|
270
296
|
dataset_name,
|
|
271
297
|
)
|
|
@@ -295,6 +321,7 @@ def _score_with_timeout(
|
|
|
295
321
|
metadata,
|
|
296
322
|
metrics,
|
|
297
323
|
compute_quality_score=False,
|
|
324
|
+
compute_diagnostic_score=False,
|
|
298
325
|
modality=None,
|
|
299
326
|
dataset_name=None,
|
|
300
327
|
):
|
|
@@ -309,6 +336,7 @@ def _score_with_timeout(
|
|
|
309
336
|
metrics,
|
|
310
337
|
output,
|
|
311
338
|
compute_quality_score,
|
|
339
|
+
compute_diagnostic_score,
|
|
312
340
|
modality,
|
|
313
341
|
dataset_name,
|
|
314
342
|
),
|
|
@@ -325,15 +353,26 @@ def _score_with_timeout(
|
|
|
325
353
|
return output
|
|
326
354
|
|
|
327
355
|
|
|
328
|
-
def _format_output(output, name, dataset_name, compute_quality_score, cache_dir):
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
356
|
+
def _format_output(
|
|
357
|
+
output, name, dataset_name, compute_quality_score, compute_diagnostic_score, cache_dir
|
|
358
|
+
):
|
|
359
|
+
evaluate_time = 0
|
|
360
|
+
if 'quality_score_time' in output:
|
|
361
|
+
evaluate_time += output.get('quality_score_time', 0)
|
|
362
|
+
if 'diagnostic_score_time' in output:
|
|
363
|
+
evaluate_time += output.get('diagnostic_score_time', 0)
|
|
332
364
|
|
|
333
365
|
for score in output.get('scores', []):
|
|
334
|
-
if
|
|
366
|
+
if 'metric_time' in score and not np.isnan(score['metric_time']):
|
|
335
367
|
evaluate_time += score['metric_time']
|
|
336
368
|
|
|
369
|
+
if (
|
|
370
|
+
'quality_score_time' not in output
|
|
371
|
+
and 'scores' not in output
|
|
372
|
+
and 'diagnostic_score_time' not in output
|
|
373
|
+
):
|
|
374
|
+
evaluate_time = None
|
|
375
|
+
|
|
337
376
|
scores = pd.DataFrame({
|
|
338
377
|
'Synthesizer': [name],
|
|
339
378
|
'Dataset': [dataset_name],
|
|
@@ -345,6 +384,9 @@ def _format_output(output, name, dataset_name, compute_quality_score, cache_dir)
|
|
|
345
384
|
'Evaluate_Time': [evaluate_time],
|
|
346
385
|
})
|
|
347
386
|
|
|
387
|
+
if compute_diagnostic_score:
|
|
388
|
+
scores.insert(len(scores.columns), 'Diagnostic_Score', output.get('diagnostic_score'))
|
|
389
|
+
|
|
348
390
|
if compute_quality_score:
|
|
349
391
|
scores.insert(len(scores.columns), 'Quality_Score', output.get('quality_score'))
|
|
350
392
|
|
|
@@ -381,6 +423,7 @@ def _run_job(args):
|
|
|
381
423
|
cache_dir,
|
|
382
424
|
timeout,
|
|
383
425
|
compute_quality_score,
|
|
426
|
+
compute_diagnostic_score,
|
|
384
427
|
dataset_name,
|
|
385
428
|
modality,
|
|
386
429
|
) = args
|
|
@@ -404,6 +447,7 @@ def _run_job(args):
|
|
|
404
447
|
metadata=metadata,
|
|
405
448
|
metrics=metrics,
|
|
406
449
|
compute_quality_score=compute_quality_score,
|
|
450
|
+
compute_diagnostic_score=compute_diagnostic_score,
|
|
407
451
|
modality=modality,
|
|
408
452
|
dataset_name=dataset_name,
|
|
409
453
|
)
|
|
@@ -414,13 +458,16 @@ def _run_job(args):
|
|
|
414
458
|
metadata=metadata,
|
|
415
459
|
metrics=metrics,
|
|
416
460
|
compute_quality_score=compute_quality_score,
|
|
461
|
+
compute_diagnostic_score=compute_diagnostic_score,
|
|
417
462
|
modality=modality,
|
|
418
463
|
dataset_name=dataset_name,
|
|
419
464
|
)
|
|
420
465
|
except Exception as error:
|
|
421
466
|
output['exception'] = error
|
|
422
467
|
|
|
423
|
-
scores = _format_output(
|
|
468
|
+
scores = _format_output(
|
|
469
|
+
output, name, dataset_name, compute_quality_score, compute_diagnostic_score, cache_dir
|
|
470
|
+
)
|
|
424
471
|
|
|
425
472
|
return scores
|
|
426
473
|
|
|
@@ -482,7 +529,7 @@ def _run_jobs(multi_processing_config, job_args_list, show_progress):
|
|
|
482
529
|
return scores
|
|
483
530
|
|
|
484
531
|
|
|
485
|
-
def _get_empty_dataframe(compute_quality_score, sdmetrics):
|
|
532
|
+
def _get_empty_dataframe(compute_diagnostic_score, compute_quality_score, sdmetrics):
|
|
486
533
|
warnings.warn('No datasets/synthesizers found.')
|
|
487
534
|
|
|
488
535
|
scores = pd.DataFrame({
|
|
@@ -496,6 +543,8 @@ def _get_empty_dataframe(compute_quality_score, sdmetrics):
|
|
|
496
543
|
'Evaluate_Time': [],
|
|
497
544
|
})
|
|
498
545
|
|
|
546
|
+
if compute_diagnostic_score:
|
|
547
|
+
scores['Diagnostic_Score'] = []
|
|
499
548
|
if compute_quality_score:
|
|
500
549
|
scores['Quality_Score'] = []
|
|
501
550
|
if sdmetrics:
|
|
@@ -564,7 +613,7 @@ from io import StringIO
|
|
|
564
613
|
import sdgym
|
|
565
614
|
from sdgym.synthesizers.sdv import (CopulaGANSynthesizer, CTGANSynthesizer,
|
|
566
615
|
GaussianCopulaSynthesizer, HMASynthesizer, PARSynthesizer, SDVRelationalSynthesizer,
|
|
567
|
-
SDVTabularSynthesizer,TVAESynthesizer)
|
|
616
|
+
SDVTabularSynthesizer, TVAESynthesizer)
|
|
568
617
|
|
|
569
618
|
results = sdgym.benchmark_single_table(
|
|
570
619
|
{synthesizer_string}, custom_synthesizers={params['custom_synthesizers']},
|
|
@@ -572,6 +621,7 @@ results = sdgym.benchmark_single_table(
|
|
|
572
621
|
additional_datasets_folder={params['additional_datasets_folder']},
|
|
573
622
|
limit_dataset_size={params['limit_dataset_size']},
|
|
574
623
|
compute_quality_score={params['compute_quality_score']},
|
|
624
|
+
compute_diagnostic_score={params['compute_diagnostic_score']},
|
|
575
625
|
sdmetrics={params['sdmetrics']}, timeout={params['timeout']},
|
|
576
626
|
detailed_results_folder={params['detailed_results_folder']},
|
|
577
627
|
multi_processing_config={params['multi_processing_config']}
|
|
@@ -643,6 +693,7 @@ def benchmark_single_table(
|
|
|
643
693
|
additional_datasets_folder=None,
|
|
644
694
|
limit_dataset_size=False,
|
|
645
695
|
compute_quality_score=True,
|
|
696
|
+
compute_diagnostic_score=True,
|
|
646
697
|
sdmetrics=DEFAULT_METRICS,
|
|
647
698
|
timeout=None,
|
|
648
699
|
output_filepath=None,
|
|
@@ -680,6 +731,8 @@ def benchmark_single_table(
|
|
|
680
731
|
columns.
|
|
681
732
|
compute_quality_score (bool):
|
|
682
733
|
Whether or not to evaluate an overall quality score.
|
|
734
|
+
compute_diagnostic_score (bool):
|
|
735
|
+
Whether or not to evaluate an overall diagnostic score.
|
|
683
736
|
sdmetrics (list[str]):
|
|
684
737
|
A list of the different SDMetrics to use. If you'd like to input specific parameters
|
|
685
738
|
into the metric, provide a tuple with the metric name followed by a dictionary of
|
|
@@ -729,6 +782,7 @@ def benchmark_single_table(
|
|
|
729
782
|
detailed_results_folder,
|
|
730
783
|
timeout,
|
|
731
784
|
compute_quality_score,
|
|
785
|
+
compute_diagnostic_score,
|
|
732
786
|
synthesizers,
|
|
733
787
|
custom_synthesizers,
|
|
734
788
|
)
|
|
@@ -738,7 +792,7 @@ def benchmark_single_table(
|
|
|
738
792
|
|
|
739
793
|
# If no synthesizers/datasets are passed, return an empty dataframe
|
|
740
794
|
else:
|
|
741
|
-
scores = _get_empty_dataframe(compute_quality_score, sdmetrics)
|
|
795
|
+
scores = _get_empty_dataframe(compute_diagnostic_score, compute_quality_score, sdmetrics)
|
|
742
796
|
|
|
743
797
|
if output_filepath:
|
|
744
798
|
write_csv(scores, output_filepath, None, None)
|
|
@@ -41,13 +41,13 @@ def _print_table(data, sort=None, reverse=False, format=None):
|
|
|
41
41
|
|
|
42
42
|
if 'error' in data:
|
|
43
43
|
error = data['error']
|
|
44
|
-
if pd.
|
|
44
|
+
if pd.isna(error).all():
|
|
45
45
|
del data['error']
|
|
46
46
|
else:
|
|
47
47
|
long_error = error.str.len() > 30
|
|
48
48
|
data.loc[long_error, 'error'] = error[long_error].str[:30] + '...'
|
|
49
49
|
|
|
50
|
-
print(tabulate.tabulate(data, tablefmt='github', headers=data.columns, showindex=False))
|
|
50
|
+
print(tabulate.tabulate(data, tablefmt='github', headers=data.columns, showindex=False)) # noqa: T201
|
|
51
51
|
|
|
52
52
|
|
|
53
53
|
def _run(args):
|
|
@@ -110,7 +110,7 @@ def _download_datasets(args):
|
|
|
110
110
|
def _list_downloaded(args):
|
|
111
111
|
datasets = sdgym.cli.utils.get_downloaded_datasets(args.datasets_path)
|
|
112
112
|
_print_table(datasets, args.sort, args.reverse, {'size': humanfriendly.format_size})
|
|
113
|
-
print(f'Found {len(datasets)} downloaded datasets')
|
|
113
|
+
print(f'Found {len(datasets)} downloaded datasets') # noqa: T201
|
|
114
114
|
|
|
115
115
|
|
|
116
116
|
def _list_available(args):
|
|
@@ -395,7 +395,7 @@ def main():
|
|
|
395
395
|
try:
|
|
396
396
|
args.action(args)
|
|
397
397
|
except sdgym.errors.SDGymError as error:
|
|
398
|
-
print(f'ERROR: {error}')
|
|
398
|
+
print(f'ERROR: {error}') # noqa: T201
|
|
399
399
|
|
|
400
400
|
|
|
401
401
|
if __name__ == '__main__':
|
|
@@ -22,7 +22,7 @@ def collect_results(input_path, output_file=None, aws_key=None, aws_secret=None)
|
|
|
22
22
|
If an ``aws_secret`` is provided, the given secret access key will be used to read
|
|
23
23
|
from and/or write to any s3 paths.
|
|
24
24
|
"""
|
|
25
|
-
print(f'Reading results from {input_path}')
|
|
25
|
+
print(f'Reading results from {input_path}') # noqa: T201
|
|
26
26
|
scores = read_csv_from_path(input_path, aws_key, aws_secret)
|
|
27
27
|
scores = scores.drop_duplicates()
|
|
28
28
|
|
|
@@ -31,5 +31,5 @@ def collect_results(input_path, output_file=None, aws_key=None, aws_secret=None)
|
|
|
31
31
|
else:
|
|
32
32
|
output = f'{input_path}/results.csv'
|
|
33
33
|
|
|
34
|
-
print(f'Storing results at {output}')
|
|
34
|
+
print(f'Storing results at {output}') # noqa: T201
|
|
35
35
|
write_csv(scores, output, aws_key, aws_secret)
|
|
@@ -15,7 +15,7 @@ KNOWN_ERRORS = (
|
|
|
15
15
|
)
|
|
16
16
|
|
|
17
17
|
MODALITY_BASELINES = {
|
|
18
|
-
'single-table': ['Uniform', 'Independent', 'CLBN', 'PrivBN'],
|
|
18
|
+
'single-table': ['Uniform', 'Column', 'CLBN', 'PrivBN'],
|
|
19
19
|
'multi-table': ['Uniform', 'Independent'],
|
|
20
20
|
'timeseries': [],
|
|
21
21
|
}
|
|
@@ -46,7 +46,7 @@ def preprocess(data):
|
|
|
46
46
|
|
|
47
47
|
def _coverage(data):
|
|
48
48
|
total = len(data.Dataset.unique())
|
|
49
|
-
scores = data.groupby('Synthesizer').apply(lambda x: x.Quality_Score.
|
|
49
|
+
scores = data.groupby('Synthesizer').apply(lambda x: x.Quality_Score.notna().sum())
|
|
50
50
|
coverage_perc = scores / total
|
|
51
51
|
coverage_str = scores.astype(str) + f' / {total}'
|
|
52
52
|
return coverage_perc, coverage_str
|
|
@@ -102,7 +102,7 @@ def summarize(data, baselines=(), datasets=None):
|
|
|
102
102
|
no_identity = data[data.Synthesizer != 'DataIdentity']
|
|
103
103
|
|
|
104
104
|
coverage_perc, coverage_str = _coverage(data)
|
|
105
|
-
solved = data.groupby('Synthesizer').apply(lambda x: x.Quality_Score.
|
|
105
|
+
solved = data.groupby('Synthesizer').apply(lambda x: x.Quality_Score.notna().sum())
|
|
106
106
|
|
|
107
107
|
results = {
|
|
108
108
|
'total': len(data.Dataset.unique()),
|
|
@@ -127,7 +127,7 @@ def summarize(data, baselines=(), datasets=None):
|
|
|
127
127
|
for _, error_column in KNOWN_ERRORS:
|
|
128
128
|
results[error_column] = grouped[error_column].sum()
|
|
129
129
|
|
|
130
|
-
results['errors'] = grouped.error.apply(lambda x: x.
|
|
130
|
+
results['errors'] = grouped.error.apply(lambda x: x.notna().sum())
|
|
131
131
|
total_errors = results['errors']
|
|
132
132
|
results['metric_errors'] = results['total'] - results['solved'] - total_errors
|
|
133
133
|
|
|
@@ -160,7 +160,7 @@ def errors_summary(data):
|
|
|
160
160
|
"""
|
|
161
161
|
if 'error' in data.columns:
|
|
162
162
|
all_errors = pd.DataFrame(_error_counts(data)).rename(columns={'error': 'all'})
|
|
163
|
-
synthesizer_errors = data.groupby('Synthesizer').apply(_error_counts).
|
|
163
|
+
synthesizer_errors = data.groupby('Synthesizer').apply(_error_counts).pivot_table(level=0)
|
|
164
164
|
for synthesizer, errors in synthesizer_errors.items():
|
|
165
165
|
all_errors[synthesizer] = errors.fillna(0).astype(int)
|
|
166
166
|
|
|
@@ -217,7 +217,7 @@ def _find_library(synthesizer):
|
|
|
217
217
|
|
|
218
218
|
def _add_summary_libraries(summary_data):
|
|
219
219
|
summary_data['library'] = summary_data.index.map(_find_library)
|
|
220
|
-
summary_data['library'].fillna('Other'
|
|
220
|
+
summary_data['library'] = summary_data['library'].fillna('Other')
|
|
221
221
|
return summary_data
|
|
222
222
|
|
|
223
223
|
|
|
@@ -240,7 +240,7 @@ def _add_summary(data, modality, baselines, writer):
|
|
|
240
240
|
},
|
|
241
241
|
axis=1,
|
|
242
242
|
)
|
|
243
|
-
summary.drop(index='Identity',
|
|
243
|
+
summary = summary.drop(index='Identity', errors='ignore')
|
|
244
244
|
summary = _add_summary_libraries(summary)
|
|
245
245
|
|
|
246
246
|
beat_baseline_headers = ['beat_' + b.lower() for b in baselines]
|
|
@@ -67,7 +67,7 @@ def read_csv_from_path(path, aws_key, aws_secret):
|
|
|
67
67
|
All csv content within a path will be read and returned in a
|
|
68
68
|
DataFrame. The path can be either local or an s3 directory.
|
|
69
69
|
|
|
70
|
-
|
|
70
|
+
Args:
|
|
71
71
|
path (str):
|
|
72
72
|
The path to read from, which can be either local or an s3 path.
|
|
73
73
|
aws_key (str):
|
|
@@ -8,7 +8,7 @@ from sdgym.synthesizers.generate import (
|
|
|
8
8
|
create_single_table_synthesizer,
|
|
9
9
|
)
|
|
10
10
|
from sdgym.synthesizers.identity import DataIdentity
|
|
11
|
-
from sdgym.synthesizers.independent import IndependentSynthesizer
|
|
11
|
+
from sdgym.synthesizers.column import ColumnSynthesizer
|
|
12
12
|
from sdgym.synthesizers.sdv import (
|
|
13
13
|
CopulaGANSynthesizer,
|
|
14
14
|
CTGANSynthesizer,
|
|
@@ -23,7 +23,7 @@ from sdgym.synthesizers.uniform import UniformSynthesizer
|
|
|
23
23
|
|
|
24
24
|
__all__ = (
|
|
25
25
|
'DataIdentity',
|
|
26
|
-
'IndependentSynthesizer',
|
|
26
|
+
'ColumnSynthesizer',
|
|
27
27
|
'CTGANSynthesizer',
|
|
28
28
|
'TVAESynthesizer',
|
|
29
29
|
'UniformSynthesizer',
|
sdgym-0.8.0.dev1/sdgym/synthesizers/independent.py → sdgym-0.9.0.dev0/sdgym/synthesizers/column.py
RENAMED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""ColumnSynthesizer module."""
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
4
|
from rdt.hyper_transformer import HyperTransformer
|
|
@@ -7,7 +7,7 @@ from sklearn.mixture import GaussianMixture
|
|
|
7
7
|
from sdgym.synthesizers.base import BaselineSynthesizer
|
|
8
8
|
|
|
9
9
|
|
|
10
|
-
class IndependentSynthesizer(BaselineSynthesizer):
|
|
10
|
+
class ColumnSynthesizer(BaselineSynthesizer):
|
|
11
11
|
"""Synthesizer that learns each column independently.
|
|
12
12
|
|
|
13
13
|
Categorical columns are sampled using empirical frequencies.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: sdgym
|
|
3
|
-
Version: 0.8.0.dev1
|
|
3
|
+
Version: 0.9.0.dev0
|
|
4
4
|
Summary: Benchmark tabular synthetic data generators using a variety of datasets
|
|
5
5
|
Author-email: "DataCebo, Inc." <info@sdv.dev>
|
|
6
6
|
License: BSL-1.1
|
|
@@ -29,9 +29,9 @@ Requires-Dist: boto3<2,>=1.28
|
|
|
29
29
|
Requires-Dist: botocore<2,>=1.31
|
|
30
30
|
Requires-Dist: compress-pickle>=1.2.0
|
|
31
31
|
Requires-Dist: humanfriendly>=8.2
|
|
32
|
-
Requires-Dist: numpy>=1.21.0; python_version < "3.10"
|
|
33
|
-
Requires-Dist: numpy<2,>=1.23.3; python_version >= "3.10" and python_version < "3.12"
|
|
34
|
-
Requires-Dist: numpy<2,>=1.26.0; python_version >= "3.12"
|
|
32
|
+
Requires-Dist: numpy<2.0.0,>=1.21.0; python_version < "3.10"
|
|
33
|
+
Requires-Dist: numpy<2.0.0,>=1.23.3; python_version >= "3.10" and python_version < "3.12"
|
|
34
|
+
Requires-Dist: numpy<2.0.0,>=1.26.0; python_version >= "3.12"
|
|
35
35
|
Requires-Dist: pandas>=1.4.0; python_version < "3.11"
|
|
36
36
|
Requires-Dist: pandas>=1.5.0; python_version >= "3.11" and python_version < "3.12"
|
|
37
37
|
Requires-Dist: pandas>=2.1.1; python_version >= "3.12"
|
|
@@ -22,9 +22,9 @@ sdgym/cli/summary.py
|
|
|
22
22
|
sdgym/cli/utils.py
|
|
23
23
|
sdgym/synthesizers/__init__.py
|
|
24
24
|
sdgym/synthesizers/base.py
|
|
25
|
+
sdgym/synthesizers/column.py
|
|
25
26
|
sdgym/synthesizers/generate.py
|
|
26
27
|
sdgym/synthesizers/identity.py
|
|
27
|
-
sdgym/synthesizers/independent.py
|
|
28
28
|
sdgym/synthesizers/sdv.py
|
|
29
29
|
sdgym/synthesizers/uniform.py
|
|
30
30
|
tests/test_tasks.py
|
|
@@ -12,7 +12,7 @@ sdmetrics>=0.14.1
|
|
|
12
12
|
sdv>=1.13.1
|
|
13
13
|
|
|
14
14
|
[:python_version < "3.10"]
|
|
15
|
-
numpy>=1.21.0
|
|
15
|
+
numpy<2.0.0,>=1.21.0
|
|
16
16
|
scikit-learn>=1.0.2
|
|
17
17
|
scipy>=1.7.3
|
|
18
18
|
torch>=1.9.0
|
|
@@ -24,7 +24,7 @@ pandas>=1.4.0
|
|
|
24
24
|
scikit-learn>=1.1.0
|
|
25
25
|
|
|
26
26
|
[:python_version >= "3.10" and python_version < "3.12"]
|
|
27
|
-
numpy<2,>=1.23.3
|
|
27
|
+
numpy<2.0.0,>=1.23.3
|
|
28
28
|
scipy>=1.9.2
|
|
29
29
|
torch>=2.0.0
|
|
30
30
|
|
|
@@ -33,7 +33,7 @@ pandas>=1.5.0
|
|
|
33
33
|
scikit-learn>=1.1.3
|
|
34
34
|
|
|
35
35
|
[:python_version >= "3.12"]
|
|
36
|
-
numpy<2,>=1.26.0
|
|
36
|
+
numpy<2.0.0,>=1.26.0
|
|
37
37
|
pandas>=2.1.1
|
|
38
38
|
scikit-learn>=1.3.1
|
|
39
39
|
scipy>=1.12.0
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|