misata-0.2.0b0-py3-none-any.whl → misata-0.3.1b0-py3-none-any.whl
This diff compares the contents of two package versions as they were publicly released to a supported registry. It is provided for informational purposes only.
- misata/__init__.py +77 -2
- misata/cache.py +258 -0
- misata/constraints.py +307 -0
- misata/context.py +259 -0
- misata/exceptions.py +277 -0
- misata/generators/__init__.py +29 -0
- misata/generators/base.py +586 -0
- misata/profiles.py +332 -0
- misata/simulator.py +133 -12
- misata/smart_values.py +171 -2
- misata/streaming.py +228 -0
- {misata-0.2.0b0.dist-info → misata-0.3.1b0.dist-info}/METADATA +1 -1
- {misata-0.2.0b0.dist-info → misata-0.3.1b0.dist-info}/RECORD +18 -10
- /misata/{generators.py → generators_legacy.py} +0 -0
- {misata-0.2.0b0.dist-info → misata-0.3.1b0.dist-info}/WHEEL +0 -0
- {misata-0.2.0b0.dist-info → misata-0.3.1b0.dist-info}/entry_points.txt +0 -0
- {misata-0.2.0b0.dist-info → misata-0.3.1b0.dist-info}/licenses/LICENSE +0 -0
- {misata-0.2.0b0.dist-info → misata-0.3.1b0.dist-info}/top_level.txt +0 -0
misata/generators/base.py
@@ -0,0 +1,586 @@
+"""
+Base generator interface and factory for Misata.
+
+Provides abstract base class for all generators and a factory
+pattern for creating generators based on column type.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, List, Optional, Type, Union
+
+import numpy as np
+
+from misata.exceptions import ColumnGenerationError
+
+
+class BaseGenerator(ABC):
+    """Abstract base class for all data generators.
+
+    All generators must implement the `generate` method which produces
+    a numpy array of values.
+
+    Example:
+        class IntegerGenerator(BaseGenerator):
+            def generate(self, size: int, params: dict) -> np.ndarray:
+                return np.random.randint(params['min'], params['max'], size)
+    """
+
+    @abstractmethod
+    def generate(self, size: int, params: Dict[str, Any]) -> np.ndarray:
+        """Generate an array of values.
+
+        Args:
+            size: Number of values to generate
+            params: Distribution parameters specific to this generator
+
+        Returns:
+            numpy array of generated values
+
+        Raises:
+            ColumnGenerationError: If generation fails
+        """
+        pass
+
+    def validate_params(self, params: Dict[str, Any]) -> None:
+        """Validate parameters before generation.
+
+        Override this method to add custom validation.
+
+        Args:
+            params: Parameters to validate
+
+        Raises:
+            ColumnGenerationError: If validation fails
+        """
+        pass
+
+    def inject_nulls(
+        self,
+        values: np.ndarray,
+        null_rate: float = 0.0,
+        rng: Optional[np.random.Generator] = None
+    ) -> np.ndarray:
+        """Inject null values into generated data.
+
+        Args:
+            values: Generated values array
+            null_rate: Fraction of values to make null (0.0 to 1.0)
+            rng: Random number generator for reproducibility
+
+        Returns:
+            Array with nulls injected (converted to object dtype if needed)
+        """
+        if null_rate <= 0:
+            return values
+
+        if rng is None:
+            rng = np.random.default_rng()
+
+        mask = rng.random(len(values)) < null_rate
+
+        # Convert to object dtype to support None values
+        result = values.astype(object)
+        result[mask] = None
+
+        return result
+
+    def inject_outliers(
+        self,
+        values: np.ndarray,
+        outlier_rate: float = 0.0,
+        multiplier: float = 3.0,
+        rng: Optional[np.random.Generator] = None
+    ) -> np.ndarray:
+        """Inject outlier values into numeric data.
+
+        Args:
+            values: Generated numeric values
+            outlier_rate: Fraction of values to make outliers (0.0 to 1.0)
+            multiplier: How many std devs to offset outliers
+            rng: Random number generator for reproducibility
+
+        Returns:
+            Array with outliers injected
+        """
+        if outlier_rate <= 0 or not np.issubdtype(values.dtype, np.number):
+            return values
+
+        if rng is None:
+            rng = np.random.default_rng()
+
+        mask = rng.random(len(values)) < outlier_rate
+        n_outliers = mask.sum()
+
+        if n_outliers == 0:
+            return values
+
+        mean = np.mean(values)
+        std = np.std(values)
+
+        if std == 0:
+            std = 1.0  # Avoid division by zero
+
+        # Generate outliers at mean ± multiplier * std
+        outlier_values = mean + rng.choice([-1, 1], n_outliers) * multiplier * std
+
+        result = values.copy()
+        result[mask] = outlier_values
+
+        return result
+
+    def post_process(
+        self,
+        values: np.ndarray,
+        params: Dict[str, Any],
+        rng: Optional[np.random.Generator] = None
+    ) -> np.ndarray:
+        """Apply post-processing: nulls, outliers, etc.
+
+        Args:
+            values: Generated values
+            params: Parameters including null_rate, outlier_rate
+            rng: Random number generator
+
+        Returns:
+            Post-processed values
+        """
+        null_rate = params.get("null_rate", 0.0)
+        outlier_rate = params.get("outlier_rate", 0.0)
+
+        # Apply outliers first (on numeric data)
+        if outlier_rate > 0:
+            values = self.inject_outliers(values, outlier_rate, rng=rng)
+
+        # Apply nulls last
+        if null_rate > 0:
+            values = self.inject_nulls(values, null_rate, rng=rng)
+
+        return values
+
+
+class IntegerGenerator(BaseGenerator):
+    """Generator for integer values with various distributions."""
+
+    def generate(self, size: int, params: Dict[str, Any]) -> np.ndarray:
+        distribution = params.get("distribution", "uniform")
+
+        if distribution == "sequence":
+            start = params.get("start", 1)
+            return np.arange(start, start + size)
+
+        elif distribution == "uniform":
+            min_val = params.get("min", 0)
+            max_val = params.get("max", 100)
+            return np.random.randint(min_val, max_val + 1, size)
+
+        elif distribution == "normal":
+            mean = params.get("mean", 50)
+            std = params.get("std", 10)
+            return np.clip(np.random.normal(mean, std, size).astype(int), 0, None)
+
+        elif distribution == "poisson":
+            lam = params.get("lambda", 5)
+            return np.random.poisson(lam, size)
+
+        elif distribution == "binomial":
+            n = params.get("n", 10)
+            p = params.get("p", 0.5)
+            return np.random.binomial(n, p, size)
+
+        else:
+            raise ColumnGenerationError(
+                f"Unknown integer distribution: {distribution}",
+                column_type="int",
+                suggestion="Use 'uniform', 'normal', 'poisson', 'binomial', or 'sequence'"
+            )
+
+
+class FloatGenerator(BaseGenerator):
+    """Generator for floating-point values with various distributions."""
+
+    def generate(self, size: int, params: Dict[str, Any]) -> np.ndarray:
+        distribution = params.get("distribution", "uniform")
+        decimals = params.get("decimals", 2)
+
+        if distribution == "uniform":
+            min_val = params.get("min", 0.0)
+            max_val = params.get("max", 100.0)
+            values = np.random.uniform(min_val, max_val, size)
+
+        elif distribution == "normal":
+            mean = params.get("mean", 50.0)
+            std = params.get("std", 10.0)
+            values = np.random.normal(mean, std, size)
+
+        elif distribution == "exponential":
+            scale = params.get("scale", 1.0)
+            values = np.random.exponential(scale, size)
+
+        elif distribution == "lognormal":
+            mean = params.get("mean", 0.0)
+            sigma = params.get("sigma", 1.0)
+            values = np.random.lognormal(mean, sigma, size)
+
+        elif distribution == "beta":
+            a = params.get("a", 2.0)
+            b = params.get("b", 5.0)
+            values = np.random.beta(a, b, size)
+
+        else:
+            raise ColumnGenerationError(
+                f"Unknown float distribution: {distribution}",
+                column_type="float",
+                suggestion="Use 'uniform', 'normal', 'exponential', 'lognormal', or 'beta'"
+            )
+
+        return np.round(values, decimals)
+
+
+class BooleanGenerator(BaseGenerator):
+    """Generator for boolean values."""
+
+    def generate(self, size: int, params: Dict[str, Any]) -> np.ndarray:
+        probability = params.get("probability", 0.5)
+        return np.random.random(size) < probability
+
+
+class CategoricalGenerator(BaseGenerator):
+    """Generator for categorical values with optional weights."""
+
+    def generate(self, size: int, params: Dict[str, Any]) -> np.ndarray:
+        choices = params.get("choices", [])
+        if not choices:
+            raise ColumnGenerationError(
+                "No choices provided for categorical column",
+                column_type="categorical",
+                suggestion="Add 'choices' parameter with list of values"
+            )
+
+        weights = params.get("weights")
+        if weights:
+            if len(weights) != len(choices):
+                raise ColumnGenerationError(
+                    f"Weights length ({len(weights)}) doesn't match choices length ({len(choices)})",
+                    column_type="categorical",
+                    suggestion="Ensure weights and choices have the same length"
+                )
+            # Normalize weights
+            weights = np.array(weights) / sum(weights)
+
+        return np.random.choice(choices, size=size, p=weights)
+
+
+class DateGenerator(BaseGenerator):
+    """Generator for date values."""
+
+    def generate(self, size: int, params: Dict[str, Any]) -> np.ndarray:
+        import pandas as pd
+
+        start = params.get("start", "2020-01-01")
+        end = params.get("end", "2024-12-31")
+        distribution = params.get("distribution", "uniform")
+
+        start_ts = pd.Timestamp(start).value // 10**9
+        end_ts = pd.Timestamp(end).value // 10**9
+
+        if distribution == "uniform":
+            timestamps = np.random.randint(start_ts, end_ts, size)
+        elif distribution == "recent":
+            # Bias towards recent dates (exponential decay)
+            u = np.random.exponential(0.3, size)
+            u = np.clip(u / u.max(), 0, 1)
+            timestamps = (start_ts + (end_ts - start_ts) * u).astype(int)
+        else:
+            timestamps = np.random.randint(start_ts, end_ts, size)
+
+        return pd.to_datetime(timestamps, unit='s').strftime('%Y-%m-%d').values
+
+
+class TextGenerator(BaseGenerator):
+    """Generator for text values using Faker or patterns."""
+
+    def __init__(self):
+        try:
+            from faker import Faker
+            self._faker = Faker()
+        except ImportError:
+            self._faker = None
+
+    def generate(self, size: int, params: Dict[str, Any]) -> np.ndarray:
+        text_type = params.get("text_type", params.get("distribution", "uuid"))
+
+        if text_type == "uuid" or text_type == "text":
+            import uuid
+            return np.array([str(uuid.uuid4()) for _ in range(size)])
+
+        if self._faker is None:
+            # Fallback without faker
+            return np.array([f"text_{i}" for i in range(size)])
+
+        faker_methods = {
+            "name": self._faker.name,
+            "fake.name": self._faker.name,
+            "email": self._faker.email,
+            "fake.email": self._faker.email,
+            "address": self._faker.address,
+            "fake.address": self._faker.address,
+            "company": self._faker.company,
+            "fake.company": self._faker.company,
+            "phone": self._faker.phone_number,
+            "fake.phone": self._faker.phone_number,
+            "city": self._faker.city,
+            "country": self._faker.country,
+            "job": self._faker.job,
+            "sentence": self._faker.sentence,
+            "paragraph": self._faker.paragraph,
+        }
+
+        method = faker_methods.get(text_type)
+        if method:
+            return np.array([method() for _ in range(size)])
+
+        # Default to name
+        return np.array([self._faker.name() for _ in range(size)])
+
+
+class ForeignKeyGenerator(BaseGenerator):
+    """Generator for foreign key references."""
+
+    def __init__(self, parent_ids: Optional[np.ndarray] = None):
+        self.parent_ids = parent_ids
+
+    def set_parent_ids(self, parent_ids: np.ndarray) -> None:
+        """Set the valid parent IDs for foreign key generation."""
+        self.parent_ids = parent_ids
+
+    def generate(self, size: int, params: Dict[str, Any]) -> np.ndarray:
+        if self.parent_ids is None or len(self.parent_ids) == 0:
+            raise ColumnGenerationError(
+                "No parent IDs available for foreign key generation",
+                column_type="foreign_key",
+                suggestion="Ensure parent table is generated before child table"
+            )
+
+        return np.random.choice(self.parent_ids, size=size)
+
+
+# ============ Generator Factory ============
+
+class GeneratorFactory:
+    """Factory for creating generators based on column type.
+
+    Example:
+        factory = GeneratorFactory()
+        gen = factory.get_generator("int")
+        values = gen.generate(1000, {"min": 1, "max": 100})
+    """
+
+    _generators: Dict[str, Type[BaseGenerator]] = {
+        "int": IntegerGenerator,
+        "integer": IntegerGenerator,
+        "float": FloatGenerator,
+        "double": FloatGenerator,
+        "decimal": FloatGenerator,
+        "boolean": BooleanGenerator,
+        "bool": BooleanGenerator,
+        "categorical": CategoricalGenerator,
+        "category": CategoricalGenerator,
+        "date": DateGenerator,
+        "datetime": DateGenerator,
+        "text": TextGenerator,
+        "string": TextGenerator,
+        "varchar": TextGenerator,
+        "foreign_key": ForeignKeyGenerator,
+        "fk": ForeignKeyGenerator,
+    }
+
+    _instances: Dict[str, BaseGenerator] = {}
+
+    @classmethod
+    def register(cls, column_type: str, generator_class: Type[BaseGenerator]) -> None:
+        """Register a custom generator for a column type.
+
+        Args:
+            column_type: Type name (e.g., "custom_int")
+            generator_class: Generator class to use
+        """
+        cls._generators[column_type.lower()] = generator_class
+
+    @classmethod
+    def get_generator(cls, column_type: str) -> BaseGenerator:
+        """Get a generator instance for the given column type.
+
+        Args:
+            column_type: Column type (e.g., "int", "text", "date")
+
+        Returns:
+            Generator instance
+
+        Raises:
+            ColumnGenerationError: If column type is not supported
+        """
+        column_type = column_type.lower()
+
+        if column_type not in cls._generators:
+            raise ColumnGenerationError(
+                f"Unsupported column type: {column_type}",
+                column_type=column_type,
+                suggestion=f"Supported types: {', '.join(cls._generators.keys())}"
+            )
+
+        # Get or create instance
+        if column_type not in cls._instances:
+            cls._instances[column_type] = cls._generators[column_type]()
+
+        return cls._instances[column_type]
+
+    @classmethod
+    def create_foreign_key_generator(cls, parent_ids: np.ndarray) -> ForeignKeyGenerator:
+        """Create a foreign key generator with parent IDs.
+
+        Args:
+            parent_ids: Array of valid parent IDs
+
+        Returns:
+            Configured ForeignKeyGenerator
+        """
+        gen = ForeignKeyGenerator(parent_ids)
+        return gen
+
+
+class ConditionalCategoricalGenerator(BaseGenerator):
+    """Generator for categorical values that depend on another column.
+
+    Use this for hierarchical data like state/country, department/role.
+
+    Example:
+        lookup = {
+            "USA": ["California", "Texas", "New York"],
+            "UK": ["England", "Scotland", "Wales"],
+            "Germany": ["Bavaria", "Berlin", "Hamburg"],
+        }
+        gen = ConditionalCategoricalGenerator(lookup, "country")
+        states = gen.generate(1000, {"parent_values": country_column})
+    """
+
+    def __init__(
+        self,
+        lookup: Dict[str, List[str]],
+        parent_column: str,
+        default_values: Optional[List[str]] = None
+    ):
+        """Initialize conditional generator.
+
+        Args:
+            lookup: Mapping from parent value to list of child values
+            parent_column: Name of the parent column
+            default_values: Values to use if parent not in lookup
+        """
+        self.lookup = lookup
+        self.parent_column = parent_column
+        self.default_values = default_values or (list(lookup.values())[0] if lookup else ["Unknown"])
+
+    def generate(self, size: int, params: Dict[str, Any]) -> np.ndarray:
+        """Generate values conditioned on parent column.
+
+        Args:
+            size: Number of values to generate
+            params: Must include 'parent_values' array
+
+        Returns:
+            Array of generated values
+        """
+        parent_values = params.get("parent_values")
+
+        if parent_values is None:
+            # No parent values, use uniform random from all possible values
+            all_values = []
+            for values in self.lookup.values():
+                all_values.extend(values)
+            if not all_values:
+                all_values = self.default_values
+            return np.random.choice(all_values, size=size)
+
+        # Convert to array if needed
+        parent_values = np.asarray(parent_values)
+
+        if len(parent_values) != size:
+            raise ColumnGenerationError(
+                f"Parent values length ({len(parent_values)}) doesn't match size ({size})",
+                column_type="conditional_categorical",
+                suggestion="Ensure parent column is generated first"
+            )
+
+        # Generate conditional values
+        result = np.empty(size, dtype=object)
+        for i, parent in enumerate(parent_values):
+            choices = self.lookup.get(str(parent), self.default_values)
+            result[i] = np.random.choice(choices)
+
+        return result
+
+
+# ============ Built-in Lookup Tables ============
+
+CONDITIONAL_LOOKUPS = {
+    "country_to_state": {
+        "USA": ["California", "Texas", "New York", "Florida", "Illinois", "Pennsylvania", "Ohio", "Georgia", "Michigan", "North Carolina"],
+        "UK": ["England", "Scotland", "Wales", "Northern Ireland"],
+        "Germany": ["Bavaria", "Berlin", "Hamburg", "Hesse", "North Rhine-Westphalia", "Baden-Württemberg"],
+        "France": ["Île-de-France", "Provence", "Normandy", "Brittany", "Alsace"],
+        "Canada": ["Ontario", "Quebec", "British Columbia", "Alberta", "Manitoba"],
+        "Australia": ["New South Wales", "Victoria", "Queensland", "Western Australia", "South Australia"],
+        "India": ["Maharashtra", "Karnataka", "Tamil Nadu", "Delhi", "Gujarat", "Uttar Pradesh"],
+        "Japan": ["Tokyo", "Osaka", "Kyoto", "Hokkaido", "Okinawa"],
+    },
+    "department_to_role": {
+        "Engineering": ["Software Engineer", "Senior Engineer", "Staff Engineer", "Principal Engineer", "Engineering Manager"],
+        "Product": ["Product Manager", "Senior PM", "Product Director", "VP Product", "Product Analyst"],
+        "Design": ["UX Designer", "UI Designer", "Product Designer", "Design Lead", "Design Director"],
+        "Sales": ["Sales Rep", "Account Executive", "Sales Manager", "Sales Director", "VP Sales"],
+        "Marketing": ["Marketing Manager", "Content Strategist", "Growth Manager", "Marketing Director", "CMO"],
+        "HR": ["HR Manager", "Recruiter", "HR Director", "People Partner", "VP People"],
+        "Finance": ["Financial Analyst", "Accountant", "Controller", "Finance Director", "CFO"],
+    },
+    "category_to_subcategory": {
+        "Electronics": ["Smartphones", "Laptops", "Tablets", "Accessories", "Wearables"],
+        "Clothing": ["Men's Apparel", "Women's Apparel", "Kids", "Shoes", "Accessories"],
+        "Home & Garden": ["Furniture", "Decor", "Kitchen", "Outdoor", "Bedding"],
+        "Sports": ["Fitness", "Outdoor Sports", "Team Sports", "Water Sports", "Winter Sports"],
+        "Books": ["Fiction", "Non-Fiction", "Academic", "Children's", "Comics"],
+    },
+    "industry_to_company_type": {
+        "Technology": ["SaaS", "Consumer Tech", "Enterprise Software", "AI/ML", "Cybersecurity"],
+        "Healthcare": ["Hospital", "Pharmaceutical", "Biotech", "Medical Device", "Health Insurance"],
+        "Finance": ["Bank", "Investment Firm", "Insurance", "Fintech", "Credit Union"],
+        "Retail": ["E-commerce", "Brick & Mortar", "Wholesale", "Specialty Retail", "Marketplace"],
+        "Manufacturing": ["Automotive", "Electronics", "Consumer Goods", "Industrial", "Aerospace"],
+    },
+}
+
+
+def create_conditional_generator(
+    lookup_name: str,
+    parent_column: str
+) -> ConditionalCategoricalGenerator:
+    """Create a conditional generator from built-in lookup tables.
+
+    Args:
+        lookup_name: Name of the lookup (e.g., "country_to_state")
+        parent_column: Name of the parent column
+
+    Returns:
+        Configured ConditionalCategoricalGenerator
+    """
+    if lookup_name not in CONDITIONAL_LOOKUPS:
+        available = ", ".join(CONDITIONAL_LOOKUPS.keys())
+        raise ColumnGenerationError(
+            f"Unknown lookup: {lookup_name}",
+            column_type="conditional_categorical",
+            suggestion=f"Available lookups: {available}"
+        )
+
+    return ConditionalCategoricalGenerator(
+        lookup=CONDITIONAL_LOOKUPS[lookup_name],
+        parent_column=parent_column
+    )