synthyverse 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- synthyverse/__init__.py +0 -0
- synthyverse/__version__.py +1 -0
- synthyverse/benchmark/__init__.py +1 -0
- synthyverse/benchmark/benchmark.py +88 -0
- synthyverse/evaluation/__init__.py +1 -0
- synthyverse/evaluation/eval.py +120 -0
- synthyverse/evaluation/fidelity.py +230 -0
- synthyverse/evaluation/privacy.py +88 -0
- synthyverse/evaluation/utility.py +115 -0
- synthyverse/generators/__init__.py +23 -0
- synthyverse/generators/arf_generator/__init__.py +1 -0
- synthyverse/generators/arf_generator/arf.py +21 -0
- synthyverse/generators/base.py +43 -0
- synthyverse/generators/bn_generator/__init__.py +1 -0
- synthyverse/generators/bn_generator/bn.py +45 -0
- synthyverse/generators/ctgan_generator/__init__.py +1 -0
- synthyverse/generators/ctgan_generator/ct_gan.py +49 -0
- synthyverse/generators/tvae_generator/__init__.py +1 -0
- synthyverse/generators/tvae_generator/tvae.py +38 -0
- synthyverse/utils/__init__.py +3 -0
- synthyverse/utils/oneclass.py +316 -0
- synthyverse/utils/preprocessing.py +73 -0
- synthyverse/utils/reproducibility.py +18 -0
- synthyverse/utils/utils.py +122 -0
- synthyverse/utils/xgb_utils.py +20 -0
- synthyverse-0.1.0.dist-info/LICENSE +21 -0
- synthyverse-0.1.0.dist-info/METADATA +191 -0
- synthyverse-0.1.0.dist-info/RECORD +30 -0
- synthyverse-0.1.0.dist-info/WHEEL +5 -0
- synthyverse-0.1.0.dist-info/top_level.txt +1 -0
synthyverse/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .benchmark import TabularBenchmark
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
from sklearn.model_selection import train_test_split
|
|
2
|
+
from ..evaluation.eval import MetricEvaluator
|
|
3
|
+
from ..utils.utils import get_generator, free_up_memory
|
|
4
|
+
from ..utils.reproducibility import set_seed
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from time import time
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TabularBenchmark:
    """Benchmark one synthetic-data generator across splits, seeds and samples.

    For every random train/test split, every generator initialisation and
    every generated dataset, the generator is fitted on the training part,
    sampled, and scored with ``MetricEvaluator``. Wall-clock timings for
    fitting, sampling and evaluation are stored next to the metric values.
    """

    # Metrics used when the caller does not pass ``metrics``.
    _DEFAULT_METRICS = ("classifier_test", "mle", "dcr")

    def __init__(
        self,
        generator_name: str = "arf",
        generator_params: dict = None,
        n_random_splits: int = 1,
        n_inits: int = 1,
        n_generated_datasets: int = 1,
        metrics: list = None,
        test_size: float = 0.3,
    ):
        """Store the benchmark configuration.

        Args:
            generator_name: Name resolved to a generator class through
                ``get_generator``.
            generator_params: Extra keyword arguments passed to the generator
                constructor. ``None`` (the default) means no extra arguments.
            n_random_splits: Number of train/test splits; split ``i`` uses
                ``random_state=i``.
            n_inits: Number of generator initialisations per split;
                initialisation ``i`` is seeded with ``i``.
            n_generated_datasets: Number of synthetic datasets sampled from
                each fitted generator.
            metrics: Metric specification forwarded to ``MetricEvaluator``.
                ``None`` (the default) selects ``classifier_test``, ``mle``
                and ``dcr``.
            test_size: Forwarded to ``train_test_split``.
        """
        self.generator_name = generator_name
        # Fresh containers per instance: a literal ``{}`` / ``[...]`` default
        # would be one object shared by every benchmark created without it.
        self.generator_params = {} if generator_params is None else generator_params
        self.n_random_splits = n_random_splits
        self.n_inits = n_inits
        self.n_generated_datasets = n_generated_datasets
        self.metrics = list(self._DEFAULT_METRICS) if metrics is None else metrics
        self.test_size = test_size

    def run(self, X: pd.DataFrame, target_column: str, discrete_columns: list):
        """Run the full benchmark on ``X``.

        Args:
            X: Complete real dataset; it is split into train and test parts.
            target_column: Name of the target column. The split is stratified
                on it when it is listed in ``discrete_columns``.
            discrete_columns: Names of the discrete columns.

        Returns:
            Nested dict ``results["split_i"]["init_j"]`` holding
            ``"training_time"`` plus one ``"generated_dataset_k"`` dict per
            sampled dataset with ``"inference_time"``, ``"evaluation_time"``
            and the metric results.
        """
        results = {}
        generator_cls = get_generator(self.generator_name)

        # The stratification target does not depend on the split seed.
        stratify = X[target_column] if target_column in discrete_columns else None

        for split_i in range(self.n_random_splits):
            split_results = {}
            results[f"split_{split_i}"] = split_results

            # split data according to current seed
            X_train, X_test = train_test_split(
                X, stratify=stratify, test_size=self.test_size, random_state=split_i
            )
            X_train = X_train.reset_index(drop=True)
            X_test = X_test.reset_index(drop=True)

            for init_i in range(self.n_inits):
                init_results = {}
                split_results[f"init_{init_i}"] = init_results

                set_seed(init_i)
                generator = generator_cls(random_state=init_i, **self.generator_params)
                start_time = time()
                generator.fit(X_train, discrete_columns)
                init_results["training_time"] = time() - start_time

                # potentially generate multiple datasets
                for generated_dataset_i in range(self.n_generated_datasets):
                    dataset_results = {}
                    init_results[f"generated_dataset_{generated_dataset_i}"] = (
                        dataset_results
                    )

                    # Sample as many rows as the full real dataset has.
                    start_time = time()
                    X_syn = generator.generate(len(X))
                    dataset_results["inference_time"] = time() - start_time

                    start_time = time()
                    evaluator = MetricEvaluator(
                        metrics=self.metrics,
                        discrete_features=discrete_columns,
                        target_column=target_column,
                        random_state=init_i,
                    )
                    metric_results = evaluator.evaluate(X_train, X_test, X_syn)
                    dataset_results["evaluation_time"] = time() - start_time
                    dataset_results.update(metric_results)

                    # free up memory for next iteration
                    # NOTE(review): the nesting level of this call could not be
                    # recovered from the flattened source; confirm it belongs
                    # to the innermost loop.
                    free_up_memory()

        return results
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .eval import MetricEvaluator
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
from typing import Union
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from ..utils.preprocessing import scale
|
|
4
|
+
|
|
5
|
+
from .fidelity import (
|
|
6
|
+
ClassifierTest,
|
|
7
|
+
AlphaPrecisionBetaRecallAuthenticity,
|
|
8
|
+
Similarity,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
from .utility import MLE
|
|
12
|
+
|
|
13
|
+
from .privacy import DCR
|
|
14
|
+
|
|
15
|
+
# Registry of the metric identifiers that can be looked up by name, mapped to
# the class implementing each metric.
METRICS = dict(
    classifier_test=ClassifierTest,
    mle=MLE,
    dcr=DCR,
    similarity=Similarity,
    prauth=AlphaPrecisionBetaRecallAuthenticity,
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class MetricEvaluator:
    """Evaluate a configurable set of metrics on real and synthetic data.

    Metric classes are looked up in ``METRICS``. Class attributes on each
    metric (``needs_discrete_features``, ``needs_target_column``,
    ``needs_random_state``, ``data_requirement``) decide which constructor
    arguments are injected and which datasets are passed to ``evaluate``.
    """

    def __init__(
        self,
        metrics: Union[dict, list],
        discrete_features: list = None,
        target_column: str = "target",
        random_state: int = 0,
    ):
        """Store the evaluation configuration.

        Args:
            metrics: Either a list of metric keys or a dict mapping metric
                keys to constructor keyword arguments. The part of a key
                before the first ``-`` (stripped, lower-cased) selects the
                metric class.
            discrete_features: Names of the discrete columns. ``None`` (the
                default) means no discrete columns.
            target_column: Name of the target column.
            random_state: Seed handed to metrics that request one.
        """
        if isinstance(metrics, list):
            self.metrics = {metric: {} for metric in metrics}
        else:
            self.metrics = metrics
        # A fresh list per instance instead of a shared mutable default.
        self.discrete_features = [] if discrete_features is None else discrete_features
        self.target_column = target_column
        self.random_state = random_state

    def _metric_kwargs(self, metric_key, metric_cls):
        """Return constructor kwargs for ``metric_cls`` without touching ``self.metrics``."""
        # Copy so the caller-supplied configuration is never modified.
        kwargs = dict(self.metrics[metric_key])
        if getattr(metric_cls, "needs_discrete_features", False):
            kwargs["discrete_features"] = self.discrete_features
        if getattr(metric_cls, "needs_target_column", False):
            kwargs["target_column"] = self.target_column
        if getattr(metric_cls, "needs_random_state", False):
            kwargs["random_state"] = self.random_state
        return kwargs

    def evaluate(
        self, X_train: pd.DataFrame, X_test: pd.DataFrame, X_syn: pd.DataFrame
    ):
        """Compute every configured metric and merge the results.

        Args:
            X_train: Real training data.
            X_test: Real holdout data.
            X_syn: Synthetic data.

        Returns:
            One flat dict holding the entries of every metric result that is
            itself a dict; other result types are ignored.

        Raises:
            KeyError: If a metric key does not map to an entry in ``METRICS``.
            Exception: If a metric class has no recognised
                ``data_requirement``.
        """
        X_train = X_train.reset_index(drop=True)
        X_test = X_test.reset_index(drop=True)
        X_syn = X_syn.reset_index(drop=True)

        # ensure that we do not evaluate a larger real dataset than synthetic
        n_syn = len(X_syn)
        X_train, X_test = X_train[:n_syn], X_test[:n_syn]

        # one hot, label encode, standard scale
        X_tr_scaled, X_te_scaled, X_syn_scaled = scale(
            X_train,
            X_test,
            X_syn,
            discrete_features=self.discrete_features,
        )

        results = {}
        for metric_key in self.metrics:
            metric_name = metric_key.split("-")[0].strip().lower()
            metric_cls = METRICS[metric_name]
            metric = metric_cls(**self._metric_kwargs(metric_key, metric_cls))

            # Train-based metrics get the head of the synthetic data and
            # test-based metrics the tail.
            data_req = getattr(metric_cls, "data_requirement", None)
            if data_req == "test":
                inputs = (X_test, X_syn[-len(X_test) :])
            elif data_req == "train":
                inputs = (X_train, X_syn[: len(X_train)])
            elif data_req == "test_preprocessed":
                inputs = (X_te_scaled, X_syn_scaled[-len(X_test) :])
            elif data_req == "train_preprocessed":
                inputs = (X_tr_scaled, X_syn_scaled[: len(X_train)])
            elif data_req == "train_and_test":
                inputs = (X_train, X_test, X_syn)
            elif data_req == "train_and_test_preprocessed":
                inputs = (X_tr_scaled, X_te_scaled, X_syn_scaled)
            else:
                raise Exception(
                    f"Metric {metric_name} not (fully) implemented or missing data_requirement property"
                )
            metric_result = metric.evaluate(*inputs)

            # add result to dict (note that quantitative metrics have to output a dict, else they won't get added here)
            if isinstance(metric_result, dict):
                results.update(metric_result)

        return results
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
from sklearn.metrics import roc_auc_score
|
|
4
|
+
from xgboost import XGBClassifier
|
|
5
|
+
from ..utils.xgb_utils import get_xgb_tree_method
|
|
6
|
+
from ..utils.oneclass import OneClassLayer
|
|
7
|
+
from ..utils.utils import suppress_print
|
|
8
|
+
import torch
|
|
9
|
+
from sklearn.neighbors import NearestNeighbors
|
|
10
|
+
from sdmetrics.reports.single_table import QualityReport
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ClassifierTest:
    """
    AUC score of XGB classifier which aims to distinguish synthetic from real data.

    The classifier is fitted on the real training rows (label 0) plus the
    leading synthetic rows (label 1) and scored on the real test rows plus
    the trailing synthetic rows.
    """

    data_requirement = "train_and_test"
    needs_discrete_features = True
    needs_random_state = True

    def __init__(
        self,
        discrete_features: list = None,
        random_state: int = 0,
    ):
        """
        Args:
            discrete_features: Names of the discrete columns. ``None`` (the
                default) means no discrete columns.
            random_state: Seed passed to the XGBoost classifier.
        """
        super().__init__()
        self.random_state = random_state
        # A fresh list per instance instead of a shared mutable default.
        self.discrete_features = [] if discrete_features is None else discrete_features

    def _shared_categories(self, *frames):
        """Return, per discrete column, the categories seen across all ``frames``."""
        return {
            col: pd.concat([frame[col] for frame in frames], ignore_index=True)
            .astype("category")
            .cat.categories
            for col in self.discrete_features
        }

    @staticmethod
    def _to_model_frame(frame, numerical_features, categories):
        """Return a re-indexed copy of ``frame`` with float and categorical dtypes applied."""
        frame = frame.reset_index(drop=True)
        for col in numerical_features:
            frame[col] = frame[col].astype(float)
        for col, cats in categories.items():
            frame[col] = frame[col].astype(pd.CategoricalDtype(categories=cats))
        return frame

    def evaluate(
        self,
        train: pd.DataFrame,
        test: pd.DataFrame,
        sd: pd.DataFrame,
    ):
        """Fit the real-vs-synthetic classifier and return its test AUC.

        Args:
            train: Real training data.
            test: Real holdout data.
            sd: Synthetic data; its first ``len(train)`` rows are used for
                fitting and its last ``len(test)`` rows for scoring.

        Returns:
            ``{"classifiertest.auc": <float>}``.
        """
        numerical_features = [
            col for col in train.columns if col not in self.discrete_features
        ]
        # Use one category set for both frames: casting each frame on its own
        # can assign different codes to the same value, and XGBoost's
        # categorical support expects a consistent encoding between fit and
        # predict.
        categories = self._shared_categories(train, test, sd)

        sd_fit = sd[: len(train)]
        X = self._to_model_frame(
            pd.concat((train, sd_fit)), numerical_features, categories
        )
        # Label lengths follow the actual slices, so a short ``sd`` cannot
        # produce a label vector longer than the feature frame.
        y = np.concatenate(
            (np.zeros(len(train), dtype=int), np.ones(len(sd_fit), dtype=int))
        )

        model = XGBClassifier(
            tree_method=get_xgb_tree_method(),
            enable_categorical=True,
            random_state=self.random_state,
            max_depth=3,
        )
        model.fit(X, y)

        sd_eval = sd[-len(test) :]
        X_te = self._to_model_frame(
            pd.concat((test, sd_eval)), numerical_features, categories
        )
        y_te = np.concatenate(
            (np.zeros(len(test), dtype=int), np.ones(len(sd_eval), dtype=int))
        )

        preds = model.predict_proba(X_te)
        score = roc_auc_score(y_te, preds[:, 1])

        return {"classifiertest.auc": float(score)}
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class AlphaPrecisionBetaRecallAuthenticity:
    """
    alpha-Precision, Beta-Recall, Authenticity score from the Alaa et al. paper.

    Real and synthetic rows are passed through a ``OneClassLayer`` fitted on
    the real data; all three scores are computed from distances between the
    resulting outputs.
    """

    data_requirement = "train_preprocessed"
    needs_random_state = True

    def __init__(
        self,
        discrete_features: list = None,
        random_state: int = 0,
    ):
        """
        Args:
            discrete_features: Names of the discrete columns. ``None`` (the
                default) means no discrete columns. Stored only; ``evaluate``
                does not read it.
            random_state: Stored only; ``evaluate`` does not read it.
        """
        super().__init__()
        self.random_state = random_state
        # A fresh list per instance instead of a shared mutable default.
        self.discrete_features = [] if discrete_features is None else discrete_features

    def evaluate(
        self,
        rd: pd.DataFrame,
        sd: pd.DataFrame,
    ):
        """Compute the three scores for real data ``rd`` and synthetic data ``sd``.

        Returns:
            Dict with the keys ``"alphaprecision.oc.score"``,
            ``"betacoverage.oc.score"`` and ``"authenticity.oc.score"``.
        """
        n_features = rd.shape[1]
        OC_params = {
            "input_dim": n_features,
            "rep_dim": n_features,
            "num_layers": 4,
            "num_hidden": 32,
            "activation": "ReLU",
            "dropout_prob": 0.2,
            "dropout_active": False,
            "LossFn": "SoftBoundary",
            "lr": 2e-3,
            "epochs": 1000,
            "warm_up_epochs": 20,
            "train_prop": 1.0,
            "weight_decay": 2e-3,
        }
        OC_hyperparams = {
            "Radius": 1,
            "nu": 1e-2,
            # *10 is what is used in synthcity
            "center": torch.ones(OC_params["rep_dim"]) * 10,
        }
        OC_model = OneClassLayer(params=OC_params, hyperparams=OC_hyperparams)
        OC_model.fit(rd.values, verbosity=True)
        # presumably numpy arrays with one row per input row; verify against
        # OneClassLayer.predict
        real = OC_model.predict(rd.values)
        syn = OC_model.predict(sd.values)
        emb_center = OC_model.c.detach().cpu().numpy()

        n_steps = 30
        alphas = np.linspace(0, 1, n_steps)

        # Quantiles of the real rows' distances to the one-class centre.
        Radii = np.quantile(np.sqrt(np.sum((real - emb_center) ** 2, axis=1)), alphas)

        synth_center = np.mean(syn, axis=0)
        synth_to_center = np.sqrt(np.sum((syn - emb_center) ** 2, axis=1))

        # Column 0 of the 2-NN query of ``real`` against itself is skipped;
        # column 1 is used as the real-to-real distance.
        nbrs_real = NearestNeighbors(n_neighbors=2, n_jobs=-1, p=2).fit(real)
        real_to_real, _ = nbrs_real.kneighbors(real)

        nbrs_synth = NearestNeighbors(n_neighbors=1, n_jobs=-1, p=2).fit(syn)
        real_to_synth, real_to_synth_args = nbrs_synth.kneighbors(real)

        real_to_real = real_to_real[:, 1].squeeze()
        real_to_synth = real_to_synth.squeeze()
        real_to_synth_args = real_to_synth_args.squeeze()

        real_synth_closest = syn[real_to_synth_args]
        real_synth_closest_d = np.sqrt(
            np.sum((real_synth_closest - synth_center) ** 2, axis=1)
        )
        closest_synth_Radii = np.quantile(real_synth_closest_d, alphas)

        # Does not depend on the quantile level, so it is computed once.
        synth_within_real_nn = real_to_synth <= real_to_real

        alpha_precision_curve = []
        beta_coverage_curve = []
        for radius, closest_radius in zip(Radii, closest_synth_Radii):
            alpha_precision_curve.append(np.mean(synth_to_center <= radius))
            beta_coverage_curve.append(
                np.mean(synth_within_real_nn * (real_synth_closest_d <= closest_radius))
            )

        authen = real_to_real[real_to_synth_args] < real_to_synth
        authenticity = np.mean(authen)

        # One minus the summed absolute gap between each curve and the
        # diagonal, normalised by the sum of the quantile levels.
        Delta_precision_alpha = 1 - np.sum(
            np.abs(alphas - np.array(alpha_precision_curve))
        ) / np.sum(alphas)

        Delta_coverage_beta = 1 - np.sum(
            np.abs(alphas - np.array(beta_coverage_curve))
        ) / np.sum(alphas)

        return {
            "alphaprecision.oc.score": float(Delta_precision_alpha),
            "betacoverage.oc.score": float(Delta_coverage_beta),
            "authenticity.oc.score": float(authenticity),
        }
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
class Similarity:
    """
    Column Shapes and Column Pair Trends from the SDMetrics library.
    Indicates quality of marginal distributions and correlations in synthetic data, respectively.
    """

    data_requirement = "train"
    needs_discrete_features = True

    def __init__(
        self,
        discrete_features: list = None,
    ):
        """
        Args:
            discrete_features: Names of the columns reported to SDMetrics as
                ``categorical``; every other column is reported as
                ``numerical``. ``None`` (the default) means no discrete
                columns.
        """
        super().__init__()
        # A fresh list per instance instead of a shared mutable default.
        self.discrete_features = [] if discrete_features is None else discrete_features

    @staticmethod
    def _property_score(scores, property_name):
        """Return the ``Score`` of the row whose ``Property`` equals ``property_name``."""
        matches = scores.loc[scores["Property"] == property_name, "Score"]
        # ``float()`` on a one-element Series is deprecated in recent pandas,
        # so the scalar is extracted explicitly.
        return float(matches.iloc[0])

    @suppress_print
    def evaluate(
        self,
        rd: pd.DataFrame,
        sd: pd.DataFrame,
    ):
        """Generate an SDMetrics ``QualityReport`` and return its two property scores.

        Args:
            rd: Real data.
            sd: Synthetic data.

        Returns:
            Dict with the keys ``"similarity.shape"`` and
            ``"similarity.trend"``.
        """
        columns = {
            col: {
                "sdtype": "categorical" if col in self.discrete_features else "numerical"
            }
            for col in rd.columns
        }
        # NOTE(review): "index" is declared as primary key although no such
        # column is added here — confirm SDMetrics accepts this.
        metadata = {"columns": columns, "primary_key": "index"}

        report = QualityReport()
        report.generate(rd, sd, metadata)
        scores = report.get_properties()

        return {
            "similarity.shape": self._property_score(scores, "Column Shapes"),
            "similarity.trend": self._property_score(scores, "Column Pair Trends"),
        }
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
from sklearn.metrics import pairwise_distances_argmin_min
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class DCR:
    """
    Distance to Closest Record scores.
    Indicates closeness of synthetic data to the training data, and an independent holdout set.
    """

    data_requirement = "train_and_test_preprocessed"

    # Summary statistics reported when the caller does not pass ``estimates``.
    _DEFAULT_ESTIMATES = ("mean", 0.01, 0.05, 0.1, 0.25, 0.5)

    def __init__(
        self,
        estimates: list = None,
        batch_size: int = 16000,
    ):
        """
        Args:
            estimates: Statistics to report; ``"mean"`` or a quantile level
                passed to ``np.quantile``. ``None`` (the default) selects the
                mean and the 0.01, 0.05, 0.1, 0.25 and 0.5 quantiles.
            batch_size: Number of query rows per distance computation;
                ``None`` processes all rows in one call.
        """
        super().__init__()
        # A fresh list per instance instead of a shared mutable default.
        self.estimates = (
            list(self._DEFAULT_ESTIMATES) if estimates is None else estimates
        )
        self.batch_size = batch_size

    def _compute_min_distances_batch(
        self, query_data: pd.DataFrame, reference_data: pd.DataFrame
    ) -> np.ndarray:
        """
        Compute minimum distances between query_data and reference_data in batches.

        Args:
            query_data: DataFrame containing query points
            reference_data: DataFrame containing reference points

        Returns:
            Array of minimum distances for each query point

        Raises:
            ValueError: If ``batch_size`` is neither ``None`` nor positive.
        """
        if self.batch_size is None:
            # Single call when batching is disabled
            _, min_distances = pairwise_distances_argmin_min(
                query_data, reference_data, metric="euclidean"
            )
            return min_distances

        if self.batch_size <= 0:
            # A negative step would otherwise yield no batches and silently
            # return an empty result.
            raise ValueError("batch_size must be a positive integer or None")

        batch_distances = [
            pairwise_distances_argmin_min(
                query_data.iloc[start : start + self.batch_size],
                reference_data,
                metric="euclidean",
            )[1]
            for start in range(0, len(query_data), self.batch_size)
        ]
        if not batch_distances:
            # ``np.concatenate`` rejects an empty sequence.
            return np.array([])
        return np.concatenate(batch_distances)

    def evaluate(self, train: pd.DataFrame, test: pd.DataFrame, sd: pd.DataFrame):
        """Summarise synthetic-to-train and test-to-train closest-record distances.

        Args:
            train: Training data, used as the reference set.
            test: Holdout data.
            sd: Synthetic data; only its first ``len(train)`` rows are used.

        Returns:
            Dict with ``"dcr.train.<estimate>"`` (synthetic rows vs. train)
            and ``"dcr.test.<estimate>"`` (test rows vs. train) for every
            configured estimate.
        """
        sd = sd[: len(train)]

        min_distances_syn = self._compute_min_distances_batch(sd, train)
        min_distances_test = self._compute_min_distances_batch(test, train)

        dictionary = {}
        for estimate in self.estimates:
            if estimate == "mean":
                score_train = min_distances_syn.mean()
                score_test = min_distances_test.mean()
            else:
                score_train = np.quantile(min_distances_syn, estimate)
                score_test = np.quantile(min_distances_test, estimate)
            dictionary[f"dcr.train.{estimate}"] = float(score_train)
            dictionary[f"dcr.test.{estimate}"] = float(score_test)

        return dictionary
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
from sklearn.metrics import roc_auc_score, r2_score
|
|
4
|
+
from xgboost import XGBClassifier, XGBRegressor
|
|
5
|
+
from sklearn.preprocessing import LabelEncoder
|
|
6
|
+
from ..utils.xgb_utils import get_xgb_tree_method
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class MLE:
    """
    Machine Learning Efficacy from a XGB classifier.
    AUC score for discrete target columns, R^2 score for continuous target columns.
    """

    data_requirement = "train_and_test"
    needs_discrete_features = True
    needs_target_column = True
    needs_random_state = True

    def __init__(
        self,
        target_column: str = "target",
        discrete_features: list = None,
        random_state: int = 0,
        train_set: str = "synthetic",
    ):
        """
        Args:
            target_column: Name of the column to predict.
            discrete_features: Names of the discrete columns. The target is
                treated as a classification target when it is listed here.
                ``None`` (the default) means no discrete columns.
            random_state: Seed passed to the XGBoost model.
            train_set: ``"synthetic"`` fits on synthetic data and scores on
                the real test set; any other value fits on the real training
                data and scores on the trailing synthetic rows.
        """
        super().__init__()
        self.random_state = random_state
        # A fresh list per instance instead of a shared mutable default.
        self.discrete_features = [] if discrete_features is None else discrete_features
        self.target_column = target_column
        self.train_set = train_set

    def _split_feature_types(self, feature_columns):
        """Split the predictor columns into ``(numerical, discrete)`` name lists."""
        numerical = [
            col for col in feature_columns if col not in self.discrete_features
        ]
        discrete = [
            col for col in self.discrete_features if col != self.target_column
        ]
        return numerical, discrete

    @staticmethod
    def _cast_features(frames, numerical, discrete):
        """Cast the columns of every frame in place, sharing one category set per column."""
        # Categories come from all frames together so the same value gets the
        # same code in every frame; XGBoost's categorical support expects a
        # consistent encoding between fit and predict.
        categories = {
            col: pd.concat([frame[col] for frame in frames], ignore_index=True)
            .astype("category")
            .cat.categories
            for col in discrete
        }
        for frame in frames:
            for col in numerical:
                frame[col] = frame[col].astype(float)
            for col, cats in categories.items():
                frame[col] = frame[col].astype(pd.CategoricalDtype(categories=cats))

    def evaluate(
        self,
        train: pd.DataFrame,
        test: pd.DataFrame,
        sd: pd.DataFrame,
    ):
        """Score a model for the configured train/test pairing and a real-real baseline.

        Args:
            train: Real training data including the target column.
            test: Real holdout data including the target column.
            sd: Synthetic data including the target column.

        Returns:
            Dict with two entries: the score for the configured pairing and
            the train-real/test-real score. Key suffix is ``auc`` for
            discrete targets and ``r2`` otherwise.
        """
        y_tr = train[self.target_column]
        y_te = test[self.target_column]
        y_s = sd[self.target_column]
        # ``drop`` returns new frames, so the in-place casts below do not
        # touch the caller's data.
        x_tr = train.drop(columns=[self.target_column])
        x_te = test.drop(columns=[self.target_column])
        x_s = sd.drop(columns=[self.target_column])

        # Column types are taken from the predictor frame. Building them from
        # ``train.columns`` would include a continuous target, which is no
        # longer present in ``x_tr`` and would raise a KeyError.
        numerical_features, discrete_features = self._split_feature_types(
            x_tr.columns
        )
        self._cast_features((x_tr, x_te, x_s), numerical_features, discrete_features)

        is_classification = self.target_column in self.discrete_features
        if is_classification:
            le = LabelEncoder()
            le.fit(pd.concat((y_tr, y_te, y_s)))
            y_tr = le.transform(y_tr)
            y_te = le.transform(y_te)
            y_s = le.transform(y_s)
            model_cls = XGBClassifier
        else:
            model_cls = XGBRegressor
        model = model_cls(
            tree_method=get_xgb_tree_method(),
            enable_categorical=True,
            random_state=self.random_state,
            max_depth=3,
        )

        if self.train_set == "synthetic":
            model.fit(x_s[: len(x_tr)], y_s[: len(x_tr)])
            score = self._get_score(y_te, x_te, model)
            test_set = "real"
        else:
            model.fit(x_tr, y_tr)
            score = self._get_score(y_s[-len(x_te) :], x_s[-len(x_te) :], model)
            test_set = "synthetic"

        # also add trtr score
        model.fit(x_tr, y_tr)
        score_trtr = self._get_score(y_te, x_te, model)

        suffix = "auc" if is_classification else "r2"
        return {
            f"mle.train-{self.train_set}-test-{test_set}.{suffix}": float(score),
            f"mle.train-real-test-real.{suffix}": float(score_trtr),
        }

    def _get_score(self, y_te, X_te, model):
        """Return ROC AUC for a discrete target, otherwise R^2."""
        if self.target_column in self.discrete_features:
            preds = model.predict_proba(X_te)
            if np.unique(y_te).shape[0] > 2:
                score = roc_auc_score(y_te, preds, multi_class="ovr", average="micro")
            else:
                score = roc_auc_score(y_te, preds[:, 1])
        else:
            preds = model.predict(X_te)
            score = r2_score(y_te, preds)
        return score
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from .base import BaseGenerator
|
|
2
|
+
|
|
3
|
+
# Each generator is imported on its own so that a failing import of one of
# them does not prevent the package from loading; a generator that cannot be
# imported is bound to ``None``.
try:
    from .arf_generator import ARFGenerator
except ImportError:
    ARFGenerator = None

try:
    from .bn_generator import BNGenerator
except ImportError:
    BNGenerator = None

try:
    from .ctgan_generator import CTGANGenerator
except ImportError:
    CTGANGenerator = None

try:
    from .tvae_generator import TVAEGenerator
except ImportError:
    TVAEGenerator = None

# Fixed-order list of the generator classes; entries are ``None`` for
# generators whose import failed.
all_generators = [
    ARFGenerator,
    BNGenerator,
    CTGANGenerator,
    TVAEGenerator,
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .arf import ARFGenerator
|