sdg-core-lib 0.1.7.dev2__tar.gz → 0.1.7.dev4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/PKG-INFO +1 -1
  2. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/pyproject.toml +1 -1
  3. sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/data_generator/models/GANs/CTGANComponents.py +315 -0
  4. sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/data_generator/models/GANs/implementation/CTGAN.py +193 -0
  5. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/data_generator/models/ModelInfo.py +1 -1
  6. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/data_generator/models/UnspecializedModel.py +2 -4
  7. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/data_generator/models/keras → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/data_generator/models/VAEs}/implementation/AutoTabularVAE.py +2 -2
  8. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/data_generator/models/keras → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/data_generator/models/VAEs}/implementation/TabularVAE.py +2 -2
  9. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/data_generator/models/keras → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/data_generator/models/VAEs}/implementation/TimeSeriesVAE.py +2 -2
  10. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/job.py +9 -6
  11. sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/preprocess/__init__.py +0 -0
  12. sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/preprocess/strategies/__init__.py +0 -0
  13. sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/preprocess/strategies/ctgan_strategy.py +23 -0
  14. sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/preprocess/strategies/steps.py +259 -0
  15. sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/preprocess/strategies/steps.py +0 -137
  16. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/README.md +0 -0
  17. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/__init__.py +0 -0
  18. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/browser.py +0 -0
  19. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/commons.py +0 -0
  20. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/data_generator/__init__.py +0 -0
  21. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/data_generator/models → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/data_generator/models/GANs}/__init__.py +0 -0
  22. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/data_generator/models/keras → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/data_generator/models/GANs/implementation}/__init__.py +0 -0
  23. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/data_generator/models/TrainingInfo.py +0 -0
  24. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/data_generator/models/keras → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/data_generator/models/VAEs}/KerasBaseVAE.py +0 -0
  25. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/data_generator/models/keras → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/data_generator/models/VAEs}/VAE.py +0 -0
  26. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/data_generator/models/keras/implementation → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/data_generator/models/VAEs}/__init__.py +0 -0
  27. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/dataset → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/data_generator/models/VAEs/implementation}/__init__.py +0 -0
  28. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/evaluate → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/data_generator/models}/__init__.py +0 -0
  29. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/post_process → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/dataset}/__init__.py +0 -0
  30. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/dataset/columns.py +0 -0
  31. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/dataset/datasets.py +0 -0
  32. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/post_process/functions → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/evaluate}/__init__.py +0 -0
  33. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/evaluate/base_evaluator.py +0 -0
  34. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/evaluate/metrics.py +0 -0
  35. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/evaluate/tables.py +0 -0
  36. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/evaluate/time_series.py +0 -0
  37. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/post_process/FunctionApplier.py +0 -0
  38. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/post_process/functions/distribution_evaluator → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/post_process}/__init__.py +0 -0
  39. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/post_process/function_factory.py +0 -0
  40. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/post_process/function_utils.py +0 -0
  41. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/post_process/functions/Parameter.py +0 -0
  42. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/post_process/functions/UnspecializedFunction.py +0 -0
  43. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/post_process/functions/distribution_evaluator/implementation → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/post_process/functions}/__init__.py +0 -0
  44. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/post_process/functions/filter → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/post_process/functions/distribution_evaluator}/__init__.py +0 -0
  45. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/post_process/functions/distribution_evaluator/implementation/NormalTester.py +0 -0
  46. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/post_process/functions/filter → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/post_process/functions/distribution_evaluator}/implementation/__init__.py +0 -0
  47. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/post_process/functions/filter/IntervalThreshold.py +0 -0
  48. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/post_process/functions/filter/MonoThreshold.py +0 -0
  49. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/post_process/functions/generation → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/post_process/functions/filter}/__init__.py +0 -0
  50. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/post_process/functions/filter/implementation/InnerThreshold.py +0 -0
  51. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/post_process/functions/filter/implementation/LowerThreshold.py +0 -0
  52. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/post_process/functions/filter/implementation/OuterThreshold.py +0 -0
  53. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/post_process/functions/filter/implementation/UpperThreshold.py +0 -0
  54. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/post_process/functions/generation → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/post_process/functions/filter}/implementation/__init__.py +0 -0
  55. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/post_process/functions/modification → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/post_process/functions/generation}/__init__.py +0 -0
  56. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/post_process/functions/generation/implementation/LinearFunction.py +0 -0
  57. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/post_process/functions/generation/implementation/NormalDistributionSample.py +0 -0
  58. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/post_process/functions/generation/implementation/QuadraticFunction.py +0 -0
  59. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/post_process/functions/generation/implementation/SinusoidalFunction.py +0 -0
  60. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/post_process/functions/modification → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/post_process/functions/generation}/implementation/__init__.py +0 -0
  61. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/preprocess → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/post_process/functions/modification}/__init__.py +0 -0
  62. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/post_process/functions/modification/implementation/BurstNoiseAdder.py +0 -0
  63. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/post_process/functions/modification/implementation/WhiteNoiseAdder.py +0 -0
  64. {sdg_core_lib-0.1.7.dev2/src/sdg_core_lib/preprocess/strategies → sdg_core_lib-0.1.7.dev4/src/sdg_core_lib/post_process/functions/modification/implementation}/__init__.py +0 -0
  65. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/preprocess/base_processor.py +0 -0
  66. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/preprocess/strategies/base_strategy.py +0 -0
  67. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/preprocess/strategies/vae_strategy.py +0 -0
  68. {sdg_core_lib-0.1.7.dev2 → sdg_core_lib-0.1.7.dev4}/src/sdg_core_lib/preprocess/table_processor.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: sdg-core-lib
3
- Version: 0.1.7.dev2
3
+ Version: 0.1.7.dev4
4
4
  Summary: Add your description here
5
5
  Author: emiliocimino
6
6
  Author-email: emiliocimino <emilio.cimino@outlook.it>
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "sdg-core-lib"
3
- version = "0.1.7.dev2"
3
+ version = "0.1.7.dev4"
4
4
  description = "Add your description here"
5
5
  readme = "README.md"
6
6
  authors = [
@@ -0,0 +1,315 @@
1
+ import tensorflow as tf
2
+ from keras import ops
3
+ from keras.api import layers
4
+ import numpy as np
5
+ import keras
6
+
7
+
8
class CTGANCritic(keras.Model):
    """Critic network of the CTGAN that scores rows in packs.

    ``pac_size`` consecutive rows are concatenated along the feature axis and
    scored together by a two-hidden-layer MLP, so the output holds one score
    per pack rather than one per row.

    :param pac_size: number of rows concatenated into a single critic input.
    :param hidden: width of the two hidden Dense layers.
    :param dropout: dropout rate applied after each hidden activation.
    """

    def __init__(self, pac_size: int = 10, hidden: int = 256, dropout: float = 0.2, **kwargs):
        super().__init__(**kwargs)
        self.pac_size = pac_size
        self.fc1 = layers.Dense(hidden)
        self.fc2 = layers.Dense(hidden)
        self.out = layers.Dense(1)
        self.leaky = layers.LeakyReLU(negative_slope=0.2)
        self.drop = layers.Dropout(dropout)

    def get_config(self):
        """Return the base config extended with this critic's constructor arguments."""
        config = super().get_config()
        config.update({
            'pac_size': self.pac_size,
            'hidden': self.fc1.units,
            'dropout': self.drop.rate,
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild a critic from ``config``, reading only the constructor arguments."""
        # Filter out only the parameters our constructor expects
        constructor_params = {
            'pac_size': config.get('pac_size', 10),
            'hidden': config.get('hidden', 256),
            'dropout': config.get('dropout', 0.2),
        }
        return cls(**constructor_params)

    def call(self, x, training=False):
        """Score ``x`` pack by pack.

        :param x: 2-D batch of rows (indexed as ``[row, feature]`` below).
        :param training: forwarded to the dropout layers.
        :return: float64 vector with one score per pack of ``pac_size`` rows.
        """
        batch_size = tf.shape(x)[0]
        # Prefer the static feature size so the reshape below keeps a defined
        # last dimension for the first Dense layer; fall back to the dynamic
        # shape only when the static one is unknown.
        static_feature_dim = x.shape[-1]
        if static_feature_dim is not None:
            feature_dim = static_feature_dim
        else:
            feature_dim = tf.shape(x)[1]
        remainder = batch_size % self.pac_size

        def pad_batch():
            # Repeat the last row until the batch is a multiple of pac_size.
            padding_size = self.pac_size - remainder
            last_sample = tf.expand_dims(x[-1], axis=0)
            padding = tf.tile(last_sample, [padding_size, 1])
            return tf.concat([x, padding], axis=0)

        x_padded = tf.cond(remainder > 0, pad_batch, lambda: x)

        x_reshaped = tf.reshape(x_padded, [-1, self.pac_size * feature_dim])

        h = self.fc1(x_reshaped)
        h = self.leaky(h)
        h = self.drop(h, training=training)
        h = self.fc2(h)
        h = self.leaky(h)
        h = self.drop(h, training=training)
        score = tf.squeeze(self.out(h), axis=1)

        # After padding there are exactly ceil(batch_size / pac_size) packs,
        # so every score is kept (the partially padded pack included).
        return tf.cast(score, tf.float64)
81
+
82
+
83
def gumbel_softmax(logits, tau=0.2, hard=True):
    """Draw a Gumbel-softmax sample over the last axis of ``logits``.

    :param logits: unnormalised scores; the softmax is taken over the last axis.
    :param tau: temperature dividing the perturbed logits.
    :param hard: when True, the forward value is an exact one-hot vector while
        gradients flow through the soft sample (straight-through).
    :return: tensor shaped like ``logits``.
    """
    u = tf.random.uniform(tf.shape(logits), minval=0, maxval=1)
    # The small constants keep both logarithms finite when u is 0.
    gumbel = -tf.math.log(-tf.math.log(u + 1e-20) + 1e-20)
    y = tf.nn.softmax((logits + gumbel) / tau)
    if hard:
        # argmax picks exactly one index per row, so the result stays one-hot
        # even when several entries share the maximum value.
        y_hard = tf.one_hot(
            tf.argmax(y, axis=-1), depth=tf.shape(y)[-1], dtype=y.dtype
        )
        y = tf.stop_gradient(y_hard - y) + y
    return y
91
+
92
+
93
class CTGANGenerator(keras.Model):
    """Generator network of the CTGAN.

    A shared two-layer trunk feeds one scalar head and one mode head per
    continuous column, and one category head per discrete column. The head
    outputs are concatenated in the column order given by ``skeleton``.
    """

    def __init__(self, skeleton, modes_per_continuous_column, categories_per_discrete_column, hidden=256):
        super().__init__()
        self.skeleton = skeleton
        self.tau = 0.2
        self.modes_cont = modes_per_continuous_column
        self.cats_disc = categories_per_discrete_column
        self.fc1 = layers.Dense(hidden)
        self.bn1 = layers.BatchNormalization()
        self.fc2 = layers.Dense(hidden)
        self.bn2 = layers.BatchNormalization()
        self.alpha_heads = [layers.Dense(1) for _ in self.modes_cont]
        self.beta_heads = [layers.Dense(m) for m in self.modes_cont]
        self.d_heads = [layers.Dense(d) for d in self.cats_disc]

    def get_config(self):
        """Return the base config extended with the constructor arguments."""
        config = super().get_config()
        config.update({
            'skeleton': self.skeleton,
            'modes_per_continuous_column': self.modes_cont,
            'categories_per_discrete_column': self.cats_disc,
            'hidden': self.fc1.units,
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild a generator from ``config``, reading only the constructor arguments."""
        # Only the keys the constructor understands are forwarded.
        return cls(
            skeleton=config.get('skeleton'),
            modes_per_continuous_column=config.get('modes_per_continuous_column'),
            categories_per_discrete_column=config.get('categories_per_discrete_column'),
            hidden=config.get('hidden', 256),
        )

    def call(self, inputs, training=False):
        """Generate a batch of rows from ``(noise, condition)``.

        :param inputs: pair ``(z, cond)`` concatenated along axis 1.
        :param training: forwarded to the batch-normalisation layers.
        :return: ``(full_row, alphas, betas, ds)`` where ``full_row`` is the
            concatenation of all head outputs in skeleton order.
        """
        z, cond = inputs
        trunk = tf.concat([z, cond], axis=1)
        trunk = tf.nn.relu(self.bn1(self.fc1(trunk), training=training))
        trunk = tf.nn.relu(self.bn2(self.fc2(trunk), training=training))

        alphas, betas = [], []
        # Heads are evaluated pairwise, alpha first, for each continuous column.
        for alpha_head, beta_head in zip(self.alpha_heads, self.beta_heads):
            alphas.append(tf.nn.tanh(alpha_head(trunk)))
            betas.append(gumbel_softmax(beta_head(trunk), self.tau))
        ds = [gumbel_softmax(d_head(trunk), self.tau) for d_head in self.d_heads]

        # Interleave the head outputs following the column order of the skeleton.
        parts = []
        cont_pos = 0
        disc_pos = 0
        for col in self.skeleton:
            if col["feature_type"] == "continuous":
                parts.extend((alphas[cont_pos], betas[cont_pos]))
                cont_pos += 1
            else:
                parts.append(ds[disc_pos])
                disc_pos += 1

        full_row = tf.concat(parts, axis=1)
        return full_row, alphas, betas, ds
155
+
156
+
157
class CTGANModel(keras.Model):
    """Couples a CTGAN generator and critic and implements their training step.

    :param generator: network producing synthetic rows from noise and a condition.
    :param critic: network scoring real and synthetic rows.
    :param onehot_discrete_indexes: column indexes of the training tensor that
        hold the one-hot encoded discrete features.
    """

    def __init__(self, generator: CTGANGenerator, critic: CTGANCritic, onehot_discrete_indexes: list[int] | None = None):
        super().__init__()
        self.generator = generator
        self.critic = critic
        self.onehot_discrete_indexes = onehot_discrete_indexes
        self.gen_loss_tracker = keras.metrics.Mean(name="generator_loss")
        self.critic_loss_tracker = keras.metrics.Mean(name="discriminator_loss")
        # Both are filled in by the caller before fitting / generating.
        self._train_data = None
        self.probability_mass_function_list = None
        # One scalar plus the mode one-hot per continuous column, plus the
        # category one-hot of every discrete column.
        self.row_dim = sum(generator.modes_cont) + sum(generator.cats_disc) + len(generator.modes_cont)

    @property
    def metrics(self):
        """Loss trackers exposed to Keras."""
        return [self.gen_loss_tracker, self.critic_loss_tracker]

    @tf.function
    def generate_batch_cond(self, batch_size):
        """Sample one-hot condition vectors.

        For every row a discrete column is picked uniformly, then a category of
        that column is drawn from ``probability_mass_function_list``.

        :param batch_size: number of condition vectors to draw.
        :return: float32 tensor of shape ``[batch_size, sum(cats_disc)]`` with a
            single 1 per row.
        """
        num_cats = len(self.generator.cats_disc)
        total_cond_dim = sum(self.generator.cats_disc)
        cats_disc = tf.convert_to_tensor(self.generator.cats_disc, dtype=tf.int32)

        col_indices = tf.random.uniform(
            shape=[batch_size], minval=0, maxval=num_cats, dtype=tf.int32
        )

        relevant_pmfs = tf.gather(self.probability_mass_function_list, col_indices)

        # Zero entries (the padding added by get_pmfs) become -inf logits, so
        # they carry no probability mass in the draw below.
        cat_indices = tf.random.categorical(tf.math.log(relevant_pmfs), num_samples=1)
        cat_indices = tf.cast(tf.squeeze(cat_indices, axis=1), tf.int32)

        # Offset of each discrete column inside the concatenated condition vector.
        offsets_table = tf.concat([[0], tf.cumsum(cats_disc)[:-1]], axis=0)
        batch_offsets = tf.gather(offsets_table, col_indices)

        global_hot_indices = batch_offsets + cat_indices
        row_indices = tf.range(batch_size)
        scatter_indices = tf.stack([row_indices, global_hot_indices], axis=1)

        cond_batch = tf.scatter_nd(
            indices=scatter_indices,
            updates=tf.ones([batch_size], dtype=tf.float32),
            shape=[batch_size, total_cond_dim],
        )

        return cond_batch

    @staticmethod
    @tf.function
    def sample_real_data(train_tensor, cond, discrete_onehot_indexes):
        """Pick, for every condition, a random training row that satisfies it.

        :param train_tensor: full training data, one row per sample.
        :param cond: one-hot condition vectors (a single vector is promoted to
            a batch of one).
        :param discrete_onehot_indexes: training-tensor column index of every
            position of the condition vector.
        :return: one training row per condition.
        """
        if tf.rank(cond) == 1:
            cond = tf.expand_dims(cond, axis=0)

        discrete_indices = tf.constant(discrete_onehot_indexes, dtype=tf.int32)
        cond_indices = tf.cast(tf.argmax(cond, axis=1), tf.int32)
        target_columns = tf.gather(discrete_indices, cond_indices)

        def sample_single_row(col):
            # Keep only the rows whose one-hot column `col` is set.
            mask = tf.equal(train_tensor[:, col], 1.0)
            elems = tf.boolean_mask(train_tensor, mask)
            num_elems = tf.shape(elems)[0]
            tf.Assert(num_elems > 0, ["No row found for condition!"])
            # All-zero logits give a uniform draw over the matching rows.
            logits = tf.zeros([1, num_elems])
            random_idx = tf.random.categorical(logits, 1)
            random_idx = tf.cast(tf.reshape(random_idx, []), tf.int32)
            return tf.gather(elems, random_idx)

        return tf.map_fn(
            sample_single_row, target_columns, fn_output_signature=train_tensor.dtype
        )

    @tf.function
    def train_critic(self, real_data, z, cond):
        """Run one critic update and return its loss.

        The loss is ``mean(fake score) - mean(real score)`` plus a gradient
        penalty (weight 10) evaluated on random interpolations of real and
        generated rows.
        """
        with tf.GradientTape() as tape:
            fake_data, _, _, _ = self.generator([z, cond], training=True)

            alpha = tf.random.uniform([ops.shape(real_data)[0], 1], 0.0, 1.0)
            alpha = tf.cast(alpha, tf.float64)
            real_data = tf.cast(real_data, tf.float64)
            fake_data = tf.cast(fake_data, tf.float64)
            interpolated = alpha * real_data + (1 - alpha) * fake_data

            with tf.GradientTape() as gp_tape:
                gp_tape.watch(interpolated)
                pred = self.critic(interpolated, training=True)
            grads = gp_tape.gradient(pred, [interpolated])[0]
            # The epsilon keeps the square root differentiable at zero.
            norm = tf.sqrt(tf.reduce_sum(tf.square(grads), axis=1) + 1e-12)
            gp = tf.cast(tf.reduce_mean((norm - 1.0) ** 2) * 10.0, tf.float64)

            real_score = self.critic(real_data, training=True)
            fake_score = self.critic(fake_data, training=True)
            c_loss = tf.reduce_mean(fake_score) - tf.reduce_mean(real_score) + gp

        grads_c = tape.gradient(c_loss, self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(
            zip(grads_c, self.critic.trainable_variables)
        )
        return c_loss

    @tf.function
    def train_gen(self, z, cond):
        """Run one generator update and return its loss.

        The loss is the negated mean critic score of the generated rows plus a
        conditional term comparing the generated discrete outputs with ``cond``.
        """
        with tf.GradientTape() as tape:
            fake_data, _, _, d_list = self.generator([z, cond], training=True)
            fake_score = self.critic(fake_data, training=True)

            adv_loss = -tf.reduce_mean(fake_score)
            d_logits = tf.concat(d_list, axis=1)
            cond_loss = -tf.reduce_mean(
                tf.reduce_sum(cond * tf.math.log(d_logits + 1e-8), axis=1)
            )
            g_loss = adv_loss + tf.cast(cond_loss, tf.float64)

        grads_g = tape.gradient(g_loss, self.generator.trainable_variables)
        self.generator.optimizer.apply_gradients(
            zip(grads_g, self.generator.trainable_variables)
        )
        return g_loss

    def get_pmfs(self, train_data):
        """Compute the log-frequency distribution of every discrete column.

        Each distribution is zero-padded on the right up to the largest number
        of categories, so the returned vectors all share one length and can be
        stacked into a single matrix even when the columns have different
        cardinalities.

        :param train_data: training rows containing the one-hot discrete columns.
        :return: list with one vector per discrete column.
        """
        onehot_all = tf.gather(train_data, self.onehot_discrete_indexes, axis=1)
        max_categories = max(self.generator.cats_disc, default=0)
        pmfs = []
        curr = 0
        for sz in self.generator.cats_disc:
            chunk = onehot_all[:, curr:curr + sz]
            log_freqs = tf.math.log(tf.reduce_sum(chunk, axis=0) + 1.0)
            pmf = log_freqs / tf.reduce_sum(log_freqs)
            pmfs.append(tf.pad(pmf, [[0, max_categories - sz]]))
            curr += sz
        return pmfs

    def train_step(self, data):
        """Run one critic update followed by one generator update.

        Only the shape of ``data`` is used; the real rows are re-sampled from
        ``self._train_data`` so that they match the drawn conditions.
        """
        batch = ops.shape(data)[0]
        self.row_dim = ops.shape(data)[1]
        z = tf.random.normal([batch, self.row_dim - sum(self.generator.cats_disc)])
        cond = self.generate_batch_cond(batch)
        real_batch = CTGANModel.sample_real_data(
            self._train_data, cond, self.onehot_discrete_indexes
        )
        c_loss = self.train_critic(real_batch, z, cond)
        g_loss = self.train_gen(z, cond)

        self.gen_loss_tracker.update_state(g_loss)
        self.critic_loss_tracker.update_state(c_loss)
        return {
            "g_loss": self.gen_loss_tracker.result(),
            "d_loss": self.critic_loss_tracker.result(),
        }

    def compile(self, g_optimizer, d_optimizer):
        """Compile the wrapper and give generator and critic their own optimizer."""
        super().compile()
        self.generator.compile(g_optimizer)
        self.critic.compile(d_optimizer)

    def generate(self, batch_size: int = 100, seed: int | None = None) -> np.ndarray:
        """Generate ``batch_size`` synthetic rows.

        :param batch_size: number of rows to generate.
        :param seed: optional seed for the latent noise. Left as ``None`` the
            noise differs between calls; an integer makes it repeatable.
        :return: generated rows as a NumPy array.
        :raises RuntimeError: if no dataset has been fitted yet.
        """
        if self.generator is None or self.probability_mass_function_list is None:
            raise RuntimeError("In order to generate some data you need to fit a dataset first!")

        # A hard-coded integer seed would reproduce the very same noise on
        # every call, so the seed is only applied when the caller asks for it.
        z = keras.random.normal(
            shape=(batch_size, self.row_dim - sum(self.generator.cats_disc)),
            seed=seed,
        )
        cond = self.generate_batch_cond(batch_size)
        gen_x, _, _, _ = self.generator([z, cond], training=False)
        return ops.convert_to_numpy(gen_x)
@@ -0,0 +1,193 @@
1
+ import os
2
+
3
+ from sdg_core_lib.data_generator.models.UnspecializedModel import UnspecializedModel
4
+ from sdg_core_lib.data_generator.models.ModelInfo import ModelInfo
5
+ from sdg_core_lib.commons import AllowedData, DataType
6
+
7
+ os.environ["KERAS_BACKEND"] = "tensorflow"
8
+ from sdg_core_lib.data_generator.models.GANs.CTGANComponents import (
9
+ CTGANGenerator,
10
+ CTGANCritic,
11
+ CTGANModel,
12
+ )
13
+ import keras
14
+ from sdg_core_lib.data_generator.models.TrainingInfo import TrainingInfo
15
+ import numpy as np
16
+
17
+
18
class CTGAN(UnspecializedModel):
    """Conditional Tabular GAN wrapper exposing the common model interface.

    :param metadata: data schema, one dict per column with the keys
        ``feature_type`` and ``feature_size``.
    :param model_name: name identifying the model.
    :param input_shape: forwarded to the base class.
    :param load_path: forwarded to the base class.
    :param gen_hidden: width of the generator hidden layers.
    :param critic_hidden: width of the critic hidden layers.
    :param pac_size: number of rows the critic scores together.
    :param learning_rate: learning rate of both Adam optimizers.
    :param batch_size: batch size used when fitting.
    :param epochs: number of training epochs.
    :param gen_steps: stored on the instance; not read by the methods of this class.
    :param critic_dropout: dropout rate of the critic.
    """

    def __init__(
        self,
        metadata: list[dict],
        model_name: str,
        input_shape: str = None,
        load_path: str = None,
        gen_hidden=256,
        critic_hidden=256,
        pac_size=10,
        learning_rate=1e-3,
        batch_size=100,
        epochs=10,
        gen_steps=4,
        critic_dropout=0.2,
    ):
        super().__init__(metadata, model_name, input_shape, load_path)
        self._batch_size = batch_size
        self._epochs = epochs
        self._gen_steps = gen_steps
        self._pac_size = pac_size
        self._gen_hidden = gen_hidden
        self._critic_hidden = critic_hidden
        self._learning_rate = learning_rate
        self._critic_dropout = critic_dropout
        self._instantiate()

    @staticmethod
    def infer_data_structure(skeleton):
        """Derive the column layout from the data schema.

        :param skeleton: list of column descriptors.
        :return: ``(cats, modes, idxs)``: the number of categories of every
            categorical column, the number of modes (``feature_size - 1``) of
            every other column, and the global indexes of all one-hot
            categorical positions.
        :raises AttributeError: if a descriptor misses a key, a non-categorical
            column is narrower than 2, or no categorical column is present.
        """
        cats, modes, idxs = [], [], []
        true_index = 0
        for col in skeleton:
            try:
                f_size = int(col["feature_size"])
                if col["feature_type"] == "categorical":
                    cats.append(f_size)
                    # These are the actual global column indices in the train_tensor
                    idxs.extend(range(true_index, true_index + f_size))
                elif f_size <= 1:
                    raise AttributeError(
                        "Continuous column after normalization must have at least size 2 "
                        "(1 column for the norm values and another for indicating the "
                        "onehot of a single mode)"
                    )
                else:
                    modes.append(f_size - 1)
            except KeyError as e:
                raise AttributeError(
                    f"The CTGAN needs a valid data schema for each column, "
                    f"key {e.args[0]} is missing"
                ) from e
            true_index += f_size
        if not cats:
            raise AttributeError("At least a categorical column must be passed!")
        return cats, modes, idxs

    def _build(self, input_shape: tuple[int, ...]):
        """Create generator, critic and the model that couples them.

        :param input_shape: not used by this implementation.
        :return: the assembled ``CTGANModel``.
        :raises AttributeError: if the metadata is not a non-empty list of
            non-empty dicts.
        """
        if (
            not isinstance(self._metadata, list)
            or not self._metadata
            or any(not isinstance(item, dict) or not item for item in self._metadata)
        ):
            raise AttributeError("CTGAN needs a data schema in order to work!")
        # Infer dimensions and indices
        (
            categories_per_discrete_column,
            modes_per_continuous_column,
            onehot_discrete_indexes,
        ) = CTGAN.infer_data_structure(self._metadata)
        self.generator = CTGANGenerator(
            self._metadata,
            modes_per_continuous_column,
            categories_per_discrete_column,
            self._gen_hidden,
        )
        self.critic = CTGANCritic(
            self._pac_size, self._critic_hidden, self._critic_dropout
        )
        return CTGANModel(self.generator, self.critic, onehot_discrete_indexes)

    def _load(self, folder_path: str):
        """Restore generator, critic and sampling distributions from ``folder_path``.

        Sets ``self._model``; nothing is returned.
        """
        _, _, onehot_discrete_indexes = CTGAN.infer_data_structure(self._metadata)
        # Hand the custom classes to Keras explicitly instead of relying on it
        # resolving them on its own.
        custom_objects = {
            "CTGANCritic": CTGANCritic,
            "CTGANGenerator": CTGANGenerator,
        }
        critic = keras.saving.load_model(
            os.path.join(folder_path, "critic.keras"), custom_objects=custom_objects
        )
        generator = keras.saving.load_model(
            os.path.join(folder_path, "generator.keras"),
            custom_objects=custom_objects,
        )
        self._model = CTGANModel(generator, critic, onehot_discrete_indexes)

        # Load probability_mass_function_list if it exists
        pmf_path = os.path.join(folder_path, "probability_mass_function_list.npy")
        if os.path.exists(pmf_path):
            # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
            # load model folders that come from a trusted source.
            self._model.probability_mass_function_list = np.load(pmf_path, allow_pickle=True)

    def save(self, folder_path: str):
        """Write generator, critic and sampling distributions into ``folder_path``."""
        os.makedirs(folder_path, exist_ok=True)
        keras.saving.save_model(
            self._model.generator, os.path.join(folder_path, "generator.keras")
        )
        keras.saving.save_model(
            self._model.critic, os.path.join(folder_path, "critic.keras")
        )

        pmfs = getattr(self._model, "probability_mass_function_list", None)
        if pmfs is not None:
            np.save(
                os.path.join(folder_path, "probability_mass_function_list.npy"),
                pmfs,
            )

    def train(self, data: np.ndarray):
        """Fit the model on ``data`` and record the training info.

        Both networks get their own Adam optimizer built from the configured
        learning rate. The training data and the per-column sampling
        distributions are attached to the model before fitting.

        :param data: preprocessed training rows.
        :return: Nothing
        """
        self._model.compile(
            g_optimizer=keras.optimizers.Adam(self._learning_rate, beta_1=0.5, beta_2=0.9),
            d_optimizer=keras.optimizers.Adam(self._learning_rate, beta_1=0.5, beta_2=0.9),
        )
        self._model._train_data = data
        probability_mass_function_list = self._model.get_pmfs(data)
        self._model.probability_mass_function_list = keras.ops.convert_to_numpy(probability_mass_function_list)
        history = self._model.fit(data, batch_size=self._batch_size, epochs=self._epochs, verbose=1)
        # The history entry may be a scalar tensor or already a plain float
        # (presumably depending on the Keras version); float() accepts both.
        final_generator_loss = float(history.history["g_loss"][-1])
        self.training_info = TrainingInfo(
            loss_fn="Generator Adversary Loss + Log-frequency weighted cross-entropy",
            train_loss=final_generator_loss,
            train_samples=data.shape[0],
            validation_loss=-1,
            validation_samples=0,
        )

    def fine_tune(self, data: np.ndarray, **kwargs):
        """Fine tuning is not supported by this model."""
        raise NotImplementedError

    def infer(self, n_rows: int, **kwargs):
        """Generate ``n_rows`` synthetic rows."""
        return self._model.generate(n_rows)

    def set_hyperparameters(self, **kwargs):
        """Override hyperparameters with the values found in ``kwargs``.

        Missing keys keep their current value. Every value is cast to its
        numeric type, so string inputs are accepted for all of them.

        :param kwargs: any of ``batch_size``, ``epochs``, ``pac_size``,
            ``gen_hidden``, ``critic_hidden``, ``learning_rate``,
            ``critic_dropout``.
        :return: Nothing
        """
        self._batch_size = int(kwargs.get("batch_size", self._batch_size))
        self._epochs = int(kwargs.get("epochs", self._epochs))
        self._pac_size = int(kwargs.get("pac_size", self._pac_size))
        self._gen_hidden = int(kwargs.get("gen_hidden", self._gen_hidden))
        self._critic_hidden = int(kwargs.get("critic_hidden", self._critic_hidden))
        self._learning_rate = float(kwargs.get("learning_rate", self._learning_rate))
        self._critic_dropout = float(kwargs.get("critic_dropout", self._critic_dropout))

    @classmethod
    def self_describe(cls):
        """Return the model description built through ``ModelInfo``."""
        return ModelInfo(
            name=f"{cls.__module__}.{cls.__qualname__}",
            default_loss_function="Mean",
            description="A Conditional Tabular Generative Adversarial Network for data generation",
            allowed_data=[
                AllowedData(DataType.float32, False),
                AllowedData(DataType.int32, False),
                AllowedData(DataType.int32, True),
                AllowedData(DataType.string, True),
            ],
        ).get_model_info()
@@ -14,7 +14,7 @@ class ModelInfo:
14
14
  self.description = description
15
15
  self.allowed_data = allowed_data
16
16
 
17
- def get_model_info(self):
17
+ def get_model_info(self) -> dict:
18
18
  """
19
19
  Returns a dictionary containing the model information.
20
20
 
@@ -8,18 +8,17 @@ class UnspecializedModel(ABC):
8
8
  by all subclasses.
9
9
 
10
10
  Attributes:
11
- _metadata (dict): A dictionary containing miscellaneous information about the model.
11
+ _metadata (dict): A dictionary containing miscellaneous information about the data structure used by a model.
12
12
  model_name (str): The model name, used to identify the model itself.
13
13
  input_shape (tuple): A tuple containing the input shape of the model.
14
14
  _load_path (str): A string containing the path where to load the model from.
15
15
  _model (keras.Model): The model instance.
16
- _scaler (Scaler): The scaler instance.
17
16
  training_info (TrainingInfo): The training info instance.
18
17
  """
19
18
 
20
19
  def __init__(
21
20
  self,
22
- metadata: dict,
21
+ metadata: list[dict],
23
22
  model_name: str,
24
23
  input_shape: str = None,
25
24
  load_path: str = None,
@@ -30,7 +29,6 @@ class UnspecializedModel(ABC):
30
29
  self._load_path = load_path
31
30
  self._model = None # Placeholder for the model instance
32
31
  self.training_info = None # Placeholder for training info
33
- self._model_misc = None # Placeholder for model miscellaneous info
34
32
 
35
33
  @abstractmethod
36
34
  def _build(self, input_shape: tuple[int, ...]):
@@ -7,8 +7,8 @@ from keras_tuner import HyperParameters
7
7
  from sdg_core_lib.data_generator.models.TrainingInfo import TrainingInfo
8
8
  from sdg_core_lib.data_generator.models.ModelInfo import ModelInfo
9
9
  from sdg_core_lib.commons import AllowedData, DataType
10
- from sdg_core_lib.data_generator.models.keras.VAE import Sampling, VAE
11
- from sdg_core_lib.data_generator.models.keras.implementation.TabularVAE import (
10
+ from sdg_core_lib.data_generator.models.VAEs.VAE import Sampling, VAE
11
+ from sdg_core_lib.data_generator.models.VAEs.implementation.TabularVAE import (
12
12
  TabularVAE,
13
13
  )
14
14
 
@@ -3,8 +3,8 @@ from keras import layers
3
3
 
4
4
  from sdg_core_lib.data_generator.models.ModelInfo import ModelInfo
5
5
  from sdg_core_lib.commons import AllowedData, DataType
6
- from sdg_core_lib.data_generator.models.keras.KerasBaseVAE import KerasBaseVAE
7
- from sdg_core_lib.data_generator.models.keras.VAE import Sampling, VAE
6
+ from sdg_core_lib.data_generator.models.VAEs.KerasBaseVAE import KerasBaseVAE
7
+ from sdg_core_lib.data_generator.models.VAEs.VAE import Sampling, VAE
8
8
 
9
9
 
10
10
  class TabularVAE(KerasBaseVAE):
@@ -3,10 +3,10 @@ import keras
3
3
 
4
4
  from sdg_core_lib.data_generator.models.ModelInfo import ModelInfo
5
5
  from sdg_core_lib.commons import AllowedData, DataType
6
- from sdg_core_lib.data_generator.models.keras.KerasBaseVAE import KerasBaseVAE
6
+ from sdg_core_lib.data_generator.models.VAEs.KerasBaseVAE import KerasBaseVAE
7
7
  from keras import layers
8
8
 
9
- from sdg_core_lib.data_generator.models.keras.VAE import Sampling, VAE
9
+ from sdg_core_lib.data_generator.models.VAEs.VAE import Sampling, VAE
10
10
 
11
11
 
12
12
  class TimeSeriesVAE(KerasBaseVAE):
@@ -1,10 +1,11 @@
1
1
  from typing import Optional
2
2
 
3
+ from sdg_core_lib.data_generator.models.GANs.implementation.CTGAN import CTGAN
3
4
  from sdg_core_lib.data_generator.models.UnspecializedModel import UnspecializedModel
4
- from sdg_core_lib.data_generator.models.keras.implementation.TabularVAE import (
5
+ from sdg_core_lib.data_generator.models.VAEs.implementation.TabularVAE import (
5
6
  TabularVAE,
6
7
  )
7
- from sdg_core_lib.data_generator.models.keras.implementation.TimeSeriesVAE import (
8
+ from sdg_core_lib.data_generator.models.VAEs.implementation.TimeSeriesVAE import (
8
9
  TimeSeriesVAE,
9
10
  )
10
11
  from sdg_core_lib.dataset.datasets import Dataset, Table, TimeSeries
@@ -16,6 +17,7 @@ from sdg_core_lib.preprocess.strategies.vae_strategy import (
16
17
  TabularVAEPreprocessingStrategy,
17
18
  TimeSeriesVAEPreprocessingStrategy,
18
19
  )
20
+ from sdg_core_lib.preprocess.strategies.ctgan_strategy import CTGANPreprocessingStrategy
19
21
  from sdg_core_lib.evaluate.tables import TabularComparisonEvaluator
20
22
  from sdg_core_lib.evaluate.time_series import TimeSeriesComparisonEvaluator
21
23
  import importlib
@@ -23,13 +25,13 @@ import os
23
25
 
24
26
 
25
27
  def get_hyperparameters() -> dict:
28
+ hyperparams_name = ["EPOCHS", "LEARNING_RATE", "BATCH_SIZE"]
26
29
  return {
27
- "epochs": os.environ.get("EPOCHS"),
28
- "learning_rate": os.environ.get("LEARNING_RATE"),
29
- "batch_size": os.environ.get("BATCH_SIZE"),
30
+ hp.lower(): os.environ.get(hp)
31
+ for hp in hyperparams_name
32
+ if os.environ.get(hp) is not None
30
33
  }
31
34
 
32
-
33
35
  class Job:
34
36
  dataset_mapping = {
35
37
  "table": {
@@ -47,6 +49,7 @@ class Job:
47
49
  model_strategy_mapping: dict[type, BasePreprocessingStrategy] = {
48
50
  TabularVAE: TabularVAEPreprocessingStrategy,
49
51
  TimeSeriesVAE: TimeSeriesVAEPreprocessingStrategy,
52
+ CTGAN: CTGANPreprocessingStrategy,
50
53
  }
51
54
 
52
55
  def __init__(
@@ -0,0 +1,23 @@
1
+ from sdg_core_lib.dataset.columns import Column, Numeric, Categorical
2
+ from sdg_core_lib.preprocess.strategies.base_strategy import BasePreprocessingStrategy
3
+ from sdg_core_lib.preprocess.strategies.steps import (
4
+ Step,
5
+ NoneStep,
6
+ PerModeNormalization,
7
+ OneHotEncoderWrapper,
8
+ )
9
+
10
+
11
class CTGANPreprocessingStrategy(BasePreprocessingStrategy):
    """Preprocessing strategy that picks the steps applied to each column for CTGAN."""

    @staticmethod
    def get_steps_per_feature(feature: Column) -> list[Step]:
        """Return the list of preprocessing steps for a single column.

        * ``Numeric`` columns get a ``PerModeNormalization`` step.
        * ``Categorical`` columns get a ``OneHotEncoderWrapper`` step.
        * Plain ``Column`` instances (exact type, not subclasses) get a
          ``NoneStep``.

        Raises:
            NotImplementedError: for any other column type.
        """
        if isinstance(feature, Numeric):
            return [PerModeNormalization(feature.position, feature.name)]
        if isinstance(feature, Categorical):
            return [OneHotEncoderWrapper(feature.position, feature.name)]
        # Exact type check on purpose: other Column subclasses must not
        # silently fall through to the pass-through step.
        if type(feature) is Column:
            return [NoneStep(feature.position)]
        raise NotImplementedError()
@@ -0,0 +1,259 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Literal
3
+
4
+ import numpy as np
5
+ from sklearn.preprocessing import (
6
+ MinMaxScaler,
7
+ StandardScaler,
8
+ OneHotEncoder,
9
+ OrdinalEncoder,
10
+ )
11
+ import os
12
+ import skops.io as sio
13
+ from sklearn.mixture import BayesianGaussianMixture
14
+
15
+
16
class Step(ABC):
    """Abstract preprocessing step bound to one column.

    A step wraps an ``operator`` object that is created by
    ``_set_operator`` and to which fitting and transforming are delegated.
    The operator can be written to / read from a ``.skops`` file whose name
    is built from the step's position, column name, mode and type name.
    """

    def __init__(self, type_name: str, position: int, col_name: str, mode: str):
        # No operator exists until fit_transform() builds one or load()
        # restores one from disk.
        self.operator = None
        self.position = position
        self.col_name = col_name
        self.mode = mode
        self.type_name = type_name
        self.filename = (
            f"{self.position}_{self.col_name}_{self.mode}_{self.type_name}.skops"
        )

    @abstractmethod
    def _set_operator(self):
        """Build and return a fresh, unfitted operator for this step."""
        raise NotImplementedError

    def save_if_not_exist(self, directory_path: str):
        """Dump the operator into ``directory_path`` unless the file already exists.

        The directory is created when missing.

        Raises:
            ValueError: if no operator has been created yet.
        """
        if self.operator is None:
            raise ValueError("Operator is not created")
        os.makedirs(directory_path, exist_ok=True)
        target_path = os.path.join(directory_path, self.filename)
        if os.path.exists(target_path):
            # Never overwrite a previously stored operator.
            return
        sio.dump(self.operator, target_path)

    def load(self, directory_path: str):
        """Restore the operator from its file inside ``directory_path``.

        Raises:
            FileNotFoundError: if the expected operator file is not a file.
        """
        filename = os.path.join(directory_path, self.filename)
        if os.path.isfile(filename):
            self.operator = sio.load(filename)
            return
        raise FileNotFoundError(f"Operator file not found: {filename}")

    def fit_transform(self, data: np.ndarray) -> np.ndarray:
        """Create a new operator, fit it on ``data`` and return the transformed data."""
        fresh_operator = self._set_operator()
        self.operator = fresh_operator
        return fresh_operator.fit_transform(data)

    def transform(self, data: np.ndarray) -> np.ndarray:
        """Transform ``data`` with the current operator.

        Raises:
            ValueError: if the operator has not been created or loaded.
        """
        current = self.operator
        if current is None:
            raise ValueError("Operator not initialized")
        return current.transform(data)

    def inverse_transform(self, data: np.ndarray) -> np.ndarray:
        """Map transformed ``data`` back with the current operator.

        Raises:
            ValueError: if the operator has not been created or loaded.
        """
        current = self.operator
        if current is None:
            raise ValueError("Operator not initialized")
        return current.inverse_transform(data)
58
+
59
+
60
class NoneStep(Step):
    """Pass-through step: data is returned untouched and nothing is persisted."""

    def __init__(self, position: int, mode="", type_name="none"):
        # The step is created with an empty column name.
        super().__init__(type_name, position, "", mode)

    def save_if_not_exist(self, directory_path: str):
        """Do nothing: there is no operator to store."""
        return None

    def load(self, directory_path: str):
        """Do nothing: there is no operator to restore."""
        return None

    def _set_operator(self):
        """Return ``None``: this step never builds an operator."""
        return None

    def fit_transform(self, data: np.ndarray) -> np.ndarray:
        """Return ``data`` as is."""
        return data

    def transform(self, data: np.ndarray) -> np.ndarray:
        """Return ``data`` as is."""
        return data

    def inverse_transform(self, data: np.ndarray) -> np.ndarray:
        """Return ``data`` as is."""
        return data
81
+
82
+
83
class ScalerWrapper(Step):
    """Step that scales a column with a min-max or a standard scaler, chosen by ``mode``."""

    def __init__(
        self,
        position: int,
        col_name: str,
        mode: Literal["minmax", "standard"] = "standard",
        type_name="scaler",
    ):
        super().__init__(type_name, position, col_name, mode)

    def _set_operator(self):
        """Return a new scaler matching ``self.mode``.

        Raises:
            ValueError: if ``self.mode`` is neither ``"minmax"`` nor
                ``"standard"``.
        """
        candidates = (("minmax", MinMaxScaler), ("standard", StandardScaler))
        for label, scaler_cls in candidates:
            if self.mode == label:
                return scaler_cls()
        raise ValueError("Invalid mode while setting the scaler")
102
+
103
+
104
class OrdinalEncoderWrapper(Step):
    """Step that encodes a categorical column with an ``OrdinalEncoder``."""

    def __init__(
        self, position: int, col_name: str, mode="ordinal", type_name="encoder"
    ):
        super().__init__(type_name, position, col_name, mode)

    def _set_operator(self):
        """Return a new ordinal encoder configured to encode unknown categories as NaN."""
        encoder_options = {
            "handle_unknown": "use_encoded_value",
            "unknown_value": np.nan,
        }
        return OrdinalEncoder(**encoder_options)
114
+
115
+
116
class OneHotEncoderWrapper(Step):
    """Step that one-hot encodes a categorical column and returns dense arrays."""

    def __init__(
        self, position: int, col_name: str, mode="one_hot", type_name="encoder"
    ):
        super().__init__(type_name, position, col_name, mode)

    def _set_operator(self):
        """Return a new one-hot encoder that raises on unknown categories."""
        return OneHotEncoder(handle_unknown="error")

    def fit_transform(self, data: np.ndarray) -> np.ndarray:
        """Fit the encoder on ``data`` and return the encoding as a dense array."""
        encoded = super().fit_transform(data)
        return encoded.toarray()

    def transform(self, data: np.ndarray) -> np.ndarray:
        """Encode ``data`` and return the encoding as a dense array."""
        encoded = super().transform(data)
        return encoded.toarray()

    def inverse_transform(self, data: np.ndarray) -> np.ndarray:
        """Decode one-hot rows back to categories.

        A tiny constant is added to every entry first, for numerical
        stability when a row is all zeros.
        """
        epsilon = np.full(data.shape, 1e-9)
        return super().inverse_transform(data + epsilon)
137
+
138
+
139
class PerModeNormalization(Step):
    """Mode-specific normalization of a continuous column, as used by CTGAN.

    A variational (Bayesian) Gaussian mixture is fitted on the column to
    estimate the modes the values may come from. Every value is encoded as
    ``[normalized_value, one_hot_mode...]``: a mode is sampled from the
    mixture responsibilities of the value, and the value is normalized with
    the mean and standard deviation of that mode. The fitted mixture is the
    step's operator, so the information needed by ``inverse_transform`` is
    stored together with the step.
    """

    # Mixture components whose weight does not exceed this value are ignored.
    _ACTIVE_WEIGHT_THRESHOLD = 0.01
    # A value is divided by this many standard deviations of its mode.
    _STD_MULTIPLIER = 4.0

    def __init__(
        self,
        position: int,
        col_name: str,
        mode: str = "",
        type_name="per_mode_normalization",
        n_components=10,
        max_iter=1000,
        random_state=42,
    ):
        super().__init__(
            type_name=type_name, position=position, col_name=col_name, mode=mode
        )
        self.n_components = n_components
        self.max_iter = max_iter
        self.random_state = random_state

    def _set_operator(self):
        """Return a new, unfitted Bayesian Gaussian mixture."""
        return BayesianGaussianMixture(
            n_components=self.n_components,
            weight_concentration_prior_type="dirichlet_process",
            covariance_type="full",
            # Honor the constructor argument (it used to be hard-coded to
            # 1000, which silently ignored the caller's choice).
            max_iter=self.max_iter,
            random_state=self.random_state,
        )

    def _active_mode_parameters(self):
        """Return ``(weights, means, stds)`` of the active mixture components.

        A component is active when its weight is above
        ``_ACTIVE_WEIGHT_THRESHOLD``. If no component qualifies, the
        heaviest one is kept so that there is always at least one mode.
        """
        all_weights = self.operator.weights_
        active = np.flatnonzero(all_weights > self._ACTIVE_WEIGHT_THRESHOLD)
        if active.size == 0:
            active = np.array([np.argmax(all_weights)])
        means = self.operator.means_[active].flatten()
        stds = np.sqrt(self.operator.covariances_[active].flatten())
        return all_weights[active], means, stds

    def fit_transform(self, data: np.ndarray) -> np.ndarray:
        """Fit a new mixture on ``data`` and return the encoded data."""
        self.operator = self._set_operator()
        self.operator.fit(data)
        return self.transform(data)

    def transform(self, data: np.ndarray) -> np.ndarray:
        """Encode ``data`` as ``[normalized_value, one_hot_mode...]`` rows.

        ``data`` is flattened to a single column. The result has one row
        per value and ``1 + number_of_active_modes`` columns. Mode sampling
        uses a generator seeded with ``random_state`` on every call.

        Raises:
            ValueError: if the operator has not been fitted or loaded.
        """
        if self.operator is None:
            raise ValueError("Operator not initialized")
        column = data.reshape(-1, 1)
        weights, means, stds = self._active_mode_parameters()

        # One (n, 1) column of weighted densities per active mode.
        weighted_densities = np.hstack(
            [
                w * self._gaussian_probability_density_function(column, m, s)
                for w, m, s in zip(weights, means, stds)
            ]
        )
        with np.errstate(divide="ignore", invalid="ignore"):
            responsibilities = self._compute_responsibilities(weighted_densities)

        # Values far away from every mode make all densities underflow to
        # zero, which yields 0/0 = NaN and would make the sampling below
        # fail. Recompute only those rows in log space.
        degenerate = ~np.isfinite(responsibilities).all(axis=1)
        if degenerate.any():
            responsibilities[degenerate] = self._log_space_responsibilities(
                column[degenerate], weights, means, stds
            )

        rng = np.random.default_rng(self.random_state)
        n_rows, n_modes = responsibilities.shape
        # fromiter keeps an integer dtype even when there are no rows.
        sampled_mode = np.fromiter(
            (rng.choice(n_modes, p=row) for row in responsibilities),
            dtype=np.intp,
            count=n_rows,
        )

        one_hot = np.zeros((n_rows, n_modes), dtype=int)
        one_hot[np.arange(n_rows), sampled_mode] = 1
        normalized_value = (column.reshape(-1) - means[sampled_mode]) / (
            self._STD_MULTIPLIER * stds[sampled_mode]
        )
        return np.concatenate([normalized_value.reshape(-1, 1), one_hot], axis=1)

    def inverse_transform(self, data: np.ndarray) -> np.ndarray:
        """Map encoded rows back to values of the original column.

        The first column of ``data`` is the normalized value; the remaining
        columns select the mode (the largest entry wins). A 1D input is
        treated as a single row.

        Returns:
            np.ndarray: array of shape ``(n_samples, 1)``.

        Raises:
            ValueError: if the operator has not been fitted or loaded.
        """
        if self.operator is None:
            raise ValueError("Operator not initialized")
        _, means, stds = self._active_mode_parameters()

        if data.ndim == 1:
            data = data.reshape(1, -1)

        active_modes = np.argmax(data[:, 1:], axis=1)
        values = (
            data[:, 0] * self._STD_MULTIPLIER * stds[active_modes]
            + means[active_modes]
        )
        return values.reshape(-1, 1)

    @staticmethod
    def _gaussian_probability_density_function(
        x: np.ndarray, mean: np.ndarray, std: np.ndarray
    ):
        """Evaluate the density of a Gaussian with the given mean and standard deviation at ``x``."""
        return (1 / (np.sqrt(2 * np.pi) * std)) * np.exp(
            -0.5 * (x - mean) ** 2 / (std**2)
        )

    @staticmethod
    def _compute_responsibilities(pdf_per_mode: np.ndarray) -> np.ndarray:
        """Normalize each row of weighted densities so that it sums to one."""
        return pdf_per_mode / pdf_per_mode.sum(axis=1, keepdims=True)

    @staticmethod
    def _log_space_responsibilities(
        column: np.ndarray, weights: np.ndarray, means: np.ndarray, stds: np.ndarray
    ) -> np.ndarray:
        """Compute responsibilities without underflow for an ``(n, 1)`` column.

        Works on log densities and subtracts the row maximum before
        exponentiating. The ``sqrt(2 * pi)`` factor is omitted because it is
        common to all modes and cancels in the normalization.
        """
        standardized = (column - means) / stds
        log_density = np.log(weights) - np.log(stds) - 0.5 * standardized**2
        log_density = log_density - log_density.max(axis=1, keepdims=True)
        unnormalized = np.exp(log_density)
        return unnormalized / unnormalized.sum(axis=1, keepdims=True)
@@ -1,137 +0,0 @@
1
- from abc import ABC, abstractmethod
2
- from typing import Literal
3
-
4
- import numpy as np
5
- from sklearn.preprocessing import (
6
- MinMaxScaler,
7
- StandardScaler,
8
- OneHotEncoder,
9
- OrdinalEncoder,
10
- )
11
- import os
12
- import skops.io as sio
13
-
14
- # TODO: What if steps change data types? Should I implement some "old_type / new_type" mechanics?
15
-
16
-
17
- class Step(ABC):
18
- def __init__(self, type_name: str, position: int, col_name: str, mode: str):
19
- self.type_name = type_name
20
- self.mode = mode
21
- self.position = position
22
- self.col_name = col_name
23
- self.operator = None
24
- self.filename = (
25
- f"{self.position}_{self.col_name}_{self.mode}_{self.type_name}.skops"
26
- )
27
-
28
- @abstractmethod
29
- def _set_operator(self):
30
- raise NotImplementedError
31
-
32
- def save_if_not_exist(self, directory_path: str):
33
- if self.operator is None:
34
- raise ValueError("Operator is not created")
35
- os.makedirs(directory_path, exist_ok=True)
36
- filename = os.path.join(directory_path, self.filename)
37
- if not os.path.exists(filename):
38
- sio.dump(self.operator, filename)
39
-
40
- def load(self, directory_path: str):
41
- filename = os.path.join(directory_path, self.filename)
42
- if not os.path.isfile(filename):
43
- raise FileNotFoundError(f"Operator file not found: {filename}")
44
- self.operator = sio.load(filename)
45
-
46
- def fit_transform(self, data: np.ndarray) -> np.ndarray:
47
- self.operator = self._set_operator()
48
- return self.operator.fit_transform(data)
49
-
50
- def transform(self, data: np.ndarray) -> np.ndarray:
51
- if self.operator is None:
52
- raise ValueError("Operator not initialized")
53
- return self.operator.transform(data)
54
-
55
- def inverse_transform(self, data: np.ndarray) -> np.ndarray:
56
- if self.operator is None:
57
- raise ValueError("Operator not initialized")
58
- return self.operator.inverse_transform(data)
59
-
60
-
61
- class NoneStep(Step):
62
- def __init__(self, position: int, mode="", type_name="none"):
63
- super().__init__(type_name=type_name, position=position, col_name="", mode=mode)
64
-
65
- def save_if_not_exist(self, directory_path: str):
66
- pass
67
-
68
- def load(self, directory_path: str):
69
- pass
70
-
71
- def _set_operator(self):
72
- pass
73
-
74
- def fit_transform(self, data: np.ndarray) -> np.ndarray:
75
- return data
76
-
77
- def transform(self, data: np.ndarray) -> np.ndarray:
78
- return data
79
-
80
- def inverse_transform(self, data: np.ndarray) -> np.ndarray:
81
- return data
82
-
83
-
84
- class ScalerWrapper(Step):
85
- def __init__(
86
- self,
87
- position: int,
88
- col_name: str,
89
- mode: Literal["minmax", "standard"] = "standard",
90
- type_name="scaler",
91
- ):
92
- super().__init__(
93
- type_name=type_name, position=position, col_name=col_name, mode=mode
94
- )
95
-
96
- def _set_operator(self):
97
- if self.mode == "minmax":
98
- return MinMaxScaler()
99
- elif self.mode == "standard":
100
- return StandardScaler()
101
- else:
102
- raise ValueError("Invalid mode while setting the scaler")
103
-
104
-
105
- class OrdinalEncoderWrapper(Step):
106
- def __init__(
107
- self, position: int, col_name: str, mode="ordinal", type_name="encoder"
108
- ):
109
- super().__init__(
110
- type_name=type_name, position=position, col_name=col_name, mode=mode
111
- )
112
-
113
- def _set_operator(self):
114
- return OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
115
-
116
-
117
- class OneHotEncoderWrapper(Step):
118
- def __init__(
119
- self, position: int, col_name: str, mode="one_hot", type_name="encoder"
120
- ):
121
- super().__init__(
122
- type_name=type_name, position=position, col_name=col_name, mode=mode
123
- )
124
-
125
- def _set_operator(self):
126
- return OneHotEncoder(handle_unknown="error")
127
-
128
- def fit_transform(self, data: np.ndarray) -> np.ndarray:
129
- return super().fit_transform(data).toarray()
130
-
131
- def transform(self, data: np.ndarray) -> np.ndarray:
132
- return super().transform(data).toarray()
133
-
134
- def inverse_transform(self, data: np.ndarray) -> np.ndarray:
135
- # Numerical stability for all zeros
136
- data = data + np.ones(data.shape) * 1e-9
137
- return super().inverse_transform(data)