misata-0.2.0b0-py3-none-any.whl → misata-0.3.1b0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +77 -2
- misata/cache.py +258 -0
- misata/constraints.py +307 -0
- misata/context.py +259 -0
- misata/exceptions.py +277 -0
- misata/generators/__init__.py +29 -0
- misata/generators/base.py +586 -0
- misata/profiles.py +332 -0
- misata/simulator.py +133 -12
- misata/smart_values.py +171 -2
- misata/streaming.py +228 -0
- {misata-0.2.0b0.dist-info → misata-0.3.1b0.dist-info}/METADATA +1 -1
- {misata-0.2.0b0.dist-info → misata-0.3.1b0.dist-info}/RECORD +18 -10
- /misata/{generators.py → generators_legacy.py} +0 -0
- {misata-0.2.0b0.dist-info → misata-0.3.1b0.dist-info}/WHEEL +0 -0
- {misata-0.2.0b0.dist-info → misata-0.3.1b0.dist-info}/entry_points.txt +0 -0
- {misata-0.2.0b0.dist-info → misata-0.3.1b0.dist-info}/licenses/LICENSE +0 -0
- {misata-0.2.0b0.dist-info → misata-0.3.1b0.dist-info}/top_level.txt +0 -0
misata/profiles.py
ADDED
@@ -0,0 +1,332 @@
+"""
+Distribution Profiles for Realistic Data Generation.
+
+Pre-configured distribution parameters that match real-world patterns
+for common data types like age, salary, prices, etc.
+"""
+
+from typing import Any, Dict, List, Optional, Union
+import numpy as np
+
+
+class DistributionProfile:
+    """A named distribution configuration for realistic generation.
+
+    Example:
+        profile = DistributionProfile(
+            name="age",
+            distribution="mixture",
+            params={
+                "components": [
+                    {"mean": 35, "std": 12, "weight": 0.6},  # Working age
+                    {"mean": 70, "std": 8, "weight": 0.2},   # Retirees
+                    {"mean": 12, "std": 4, "weight": 0.2},   # Children
+                ]
+            }
+        )
+        values = profile.generate(1000)
+    """
+
+    def __init__(
+        self,
+        name: str,
+        distribution: str,
+        params: Dict[str, Any],
+        min_value: Optional[float] = None,
+        max_value: Optional[float] = None,
+        decimals: Optional[int] = None,
+    ):
+        self.name = name
+        self.distribution = distribution
+        self.params = params
+        self.min_value = min_value
+        self.max_value = max_value
+        self.decimals = decimals
+
+    def generate(
+        self,
+        size: int,
+        rng: Optional[np.random.Generator] = None
+    ) -> np.ndarray:
+        """Generate values according to this profile."""
+        if rng is None:
+            rng = np.random.default_rng()
+
+        if self.distribution == "normal":
+            mean = self.params.get("mean", 50)
+            std = self.params.get("std", 10)
+            values = rng.normal(mean, std, size)
+
+        elif self.distribution == "lognormal":
+            mean = self.params.get("mean", 0)
+            sigma = self.params.get("sigma", 1)
+            values = rng.lognormal(mean, sigma, size)
+
+        elif self.distribution == "exponential":
+            scale = self.params.get("scale", 1.0)
+            values = rng.exponential(scale, size)
+
+        elif self.distribution == "pareto":
+            alpha = self.params.get("alpha", 2.0)
+            min_val = self.params.get("min", 1.0)
+            values = (rng.pareto(alpha, size) + 1) * min_val
+
+        elif self.distribution == "beta":
+            a = self.params.get("a", 2)
+            b = self.params.get("b", 5)
+            scale = self.params.get("scale", 1.0)
+            values = rng.beta(a, b, size) * scale
+
+        elif self.distribution == "mixture":
+            # Gaussian mixture model
+            components = self.params.get("components", [])
+            if not components:
+                values = rng.normal(0, 1, size)
+            else:
+                weights = np.array([c.get("weight", 1) for c in components])
+                weights = weights / weights.sum()
+
+                # Sample component indices
+                component_indices = rng.choice(
+                    len(components), size=size, p=weights
+                )
+
+                values = np.zeros(size)
+                for i, comp in enumerate(components):
+                    mask = component_indices == i
+                    n = mask.sum()
+                    if n > 0:
+                        values[mask] = rng.normal(
+                            comp.get("mean", 0),
+                            comp.get("std", 1),
+                            n
+                        )
+
+        elif self.distribution == "zipf":
+            # Zipf distribution for long-tail data
+            a = self.params.get("alpha", 2.0)
+            values = rng.zipf(a, size).astype(float)
+
+        elif self.distribution == "uniform":
+            low = self.params.get("min", 0)
+            high = self.params.get("max", 100)
+            values = rng.uniform(low, high, size)
+
+        else:
+            # Default to uniform
+            values = rng.uniform(0, 100, size)
+
+        # Apply constraints
+        if self.min_value is not None:
+            values = np.maximum(values, self.min_value)
+        if self.max_value is not None:
+            values = np.minimum(values, self.max_value)
+        if self.decimals is not None:
+            values = np.round(values, self.decimals)
+
+        return values
+
+
+# ============ Pre-built Profiles ============
+
+PROFILES: Dict[str, DistributionProfile] = {}
+
+
+def _register_profile(profile: DistributionProfile) -> None:
+    """Register a profile by name."""
+    PROFILES[profile.name] = profile
+
+
+# Age distributions
+_register_profile(DistributionProfile(
+    name="age_adult",
+    distribution="mixture",
+    params={
+        "components": [
+            {"mean": 28, "std": 6, "weight": 0.3},    # Young adults
+            {"mean": 42, "std": 10, "weight": 0.45},  # Middle age
+            {"mean": 62, "std": 8, "weight": 0.25},   # Older adults
+        ]
+    },
+    min_value=18,
+    max_value=100,
+    decimals=0,
+))
+
+_register_profile(DistributionProfile(
+    name="age_population",
+    distribution="mixture",
+    params={
+        "components": [
+            {"mean": 8, "std": 4, "weight": 0.15},    # Children
+            {"mean": 25, "std": 8, "weight": 0.25},   # Young adults
+            {"mean": 42, "std": 12, "weight": 0.35},  # Middle age
+            {"mean": 68, "std": 10, "weight": 0.25},  # Seniors
+        ]
+    },
+    min_value=0,
+    max_value=105,
+    decimals=0,
+))
+
+# Salary distributions
+_register_profile(DistributionProfile(
+    name="salary_usd",
+    distribution="lognormal",
+    params={"mean": 11.0, "sigma": 0.5},  # Log of ~$60k median
+    min_value=25000,
+    max_value=500000,
+    decimals=0,
+))
+
+_register_profile(DistributionProfile(
+    name="salary_tech",
+    distribution="mixture",
+    params={
+        "components": [
+            {"mean": 75000, "std": 15000, "weight": 0.2},   # Junior
+            {"mean": 120000, "std": 25000, "weight": 0.4},  # Mid
+            {"mean": 180000, "std": 40000, "weight": 0.3},  # Senior
+            {"mean": 280000, "std": 60000, "weight": 0.1},  # Staff+
+        ]
+    },
+    min_value=50000,
+    max_value=600000,
+    decimals=0,
+))
+
+# Price distributions
+_register_profile(DistributionProfile(
+    name="price_retail",
+    distribution="lognormal",
+    params={"mean": 3.5, "sigma": 1.2},  # ~$30 median
+    min_value=0.99,
+    max_value=10000,
+    decimals=2,
+))
+
+_register_profile(DistributionProfile(
+    name="price_saas",
+    distribution="mixture",
+    params={
+        "components": [
+            {"mean": 15, "std": 5, "weight": 0.3},     # Basic tier
+            {"mean": 49, "std": 15, "weight": 0.4},    # Pro tier
+            {"mean": 199, "std": 50, "weight": 0.25},  # Enterprise
+            {"mean": 999, "std": 200, "weight": 0.05}, # Custom
+        ]
+    },
+    min_value=0,
+    max_value=5000,
+    decimals=0,
+))
+
+# Transaction amounts
+_register_profile(DistributionProfile(
+    name="transaction_amount",
+    distribution="pareto",
+    params={"alpha": 2.5, "min": 10},
+    min_value=1,
+    max_value=100000,
+    decimals=2,
+))
+
+# Counts / quantities
+_register_profile(DistributionProfile(
+    name="order_quantity",
+    distribution="zipf",
+    params={"alpha": 2.0},
+    min_value=1,
+    max_value=100,
+    decimals=0,
+))
+
+# Time-related
+_register_profile(DistributionProfile(
+    name="session_duration_seconds",
+    distribution="lognormal",
+    params={"mean": 5.5, "sigma": 1.5},  # ~4 min median
+    min_value=1,
+    max_value=7200,  # 2 hours max
+    decimals=0,
+))
+
+# Ratings and scores
+_register_profile(DistributionProfile(
+    name="rating_5star",
+    distribution="beta",
+    params={"a": 5, "b": 2, "scale": 5},  # Skewed towards higher ratings
+    min_value=1,
+    max_value=5,
+    decimals=1,
+))
+
+_register_profile(DistributionProfile(
+    name="nps_score",
+    distribution="mixture",
+    params={
+        "components": [
+            {"mean": 3, "std": 2, "weight": 0.15},   # Detractors
+            {"mean": 7, "std": 1, "weight": 0.25},   # Passives
+            {"mean": 9, "std": 0.8, "weight": 0.6},  # Promoters
+        ]
+    },
+    min_value=0,
+    max_value=10,
+    decimals=0,
+))
+
+# Percentages
+_register_profile(DistributionProfile(
+    name="conversion_rate",
+    distribution="beta",
+    params={"a": 2, "b": 50, "scale": 100},  # Low conversion (1-5%)
+    min_value=0,
+    max_value=100,
+    decimals=2,
+))
+
+_register_profile(DistributionProfile(
+    name="churn_rate",
+    distribution="beta",
+    params={"a": 1.5, "b": 30, "scale": 100},  # ~5% typical
+    min_value=0,
+    max_value=100,
+    decimals=2,
+))
+
+
+def get_profile(name: str) -> Optional[DistributionProfile]:
+    """Get a profile by name."""
+    return PROFILES.get(name)
+
+
+def list_profiles() -> List[str]:
+    """List all available profile names."""
+    return list(PROFILES.keys())
+
+
+def generate_with_profile(
+    profile_name: str,
+    size: int,
+    rng: Optional[np.random.Generator] = None
+) -> np.ndarray:
+    """Generate values using a named profile.
+
+    Args:
+        profile_name: Name of the profile (e.g., "salary_tech")
+        size: Number of values to generate
+        rng: Random number generator
+
+    Returns:
+        Array of generated values
+
+    Raises:
+        ValueError: If profile not found
+    """
+    profile = get_profile(profile_name)
+    if profile is None:
+        available = ", ".join(list_profiles())
+        raise ValueError(f"Unknown profile: {profile_name}. Available: {available}")
+
+    return profile.generate(size, rng)
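The module's public surface is small: `list_profiles()` to discover names, `get_profile()` for direct access, and `generate_with_profile()` as a one-call entry point. A minimal usage sketch based on the API above (the seed value is illustrative):

```python
import numpy as np
from misata.profiles import generate_with_profile, list_profiles

rng = np.random.default_rng(42)  # illustrative seed for reproducibility

print(list_profiles())  # ['age_adult', 'age_population', 'salary_usd', ...]

# Draw 1,000 tech salaries from the pre-built mixture profile; per the
# profile definition, values are clipped to [50000, 600000] and rounded
# to whole dollars.
salaries = generate_with_profile("salary_tech", size=1000, rng=rng)
```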
misata/simulator.py
CHANGED
@@ -16,7 +16,9 @@ from typing import Any, Dict, List, Optional
 import numpy as np
 import pandas as pd
 
-from misata.generators import TextGenerator
+from misata.generators.base import TextGenerator as _FactoryTextGenerator  # Generator factory version
+# Use the original generators.py TextGenerator which supports seed
+from misata.generators_legacy import TextGenerator
 from misata.schema import Column, Relationship, ScenarioEvent, SchemaConfig
 
 
@@ -34,6 +36,10 @@ class DataSimulator:
         rng: NumPy random generator for reproducibility
     """
 
+    # Performance constants
+    MAX_CONTEXT_ROWS = 50000  # Cap context storage for memory efficiency
+    TEXT_POOL_SIZE = 10000  # Size of text value pools for vectorized sampling
+
     def __init__(self, config: SchemaConfig,
                  apply_semantic_fixes: bool = True, batch_size: int = 10_000,
                  smart_mode: bool = False, use_llm: bool = True):

@@ -57,6 +63,7 @@ class DataSimulator:
         self._unique_pools: Dict[str, np.ndarray] = {}  # Store pre-generated unique values
         self._unique_counters: Dict[str, int] = {}  # Track usage of unique pools
         self._smart_pools: Dict[str, np.ndarray] = {}  # Cache smart value pools
+        self._text_pools: Dict[str, np.ndarray] = {}  # Cache text pools for vectorized sampling
 
         # Apply semantic inference to fix column types
         if apply_semantic_fixes:
@@ -199,10 +206,24 @@ class DataSimulator:
         ctx_df = df[cols_to_store].copy()
 
         if table_name not in self.context:
+            # First batch: store up to MAX_CONTEXT_ROWS
+            if len(ctx_df) > self.MAX_CONTEXT_ROWS:
+                ctx_df = ctx_df.sample(n=self.MAX_CONTEXT_ROWS, random_state=self.config.seed)
             self.context[table_name] = ctx_df
         else:
-            # Append to existing context
-
+            # Append to existing context, but cap at MAX_CONTEXT_ROWS
+            current_len = len(self.context[table_name])
+            if current_len >= self.MAX_CONTEXT_ROWS:
+                # Already at capacity, use reservoir sampling for randomness
+                # Replace some existing rows with new ones (probability-based)
+                return  # Skip appending, we have enough IDs
+
+            remaining_space = self.MAX_CONTEXT_ROWS - current_len
+            rows_to_add = ctx_df.iloc[:remaining_space]
+            self.context[table_name] = pd.concat(
+                [self.context[table_name], rows_to_add],
+                ignore_index=True
+            )
 
     def generate_column(
         self,
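Note that, the comment notwithstanding, the shipped branch does not reservoir-sample: once the cap is reached it returns without appending, so rows from later batches can never enter the context. For reference, a true reservoir update (Algorithm R), which the comment alludes to, would look roughly like this hypothetical sketch, not part of the package:

```python
import numpy as np
import pandas as pd

def reservoir_update(reservoir: pd.DataFrame, new_rows: pd.DataFrame,
                     seen_so_far: int, cap: int,
                     rng: np.random.Generator) -> int:
    """Hypothetical Algorithm R sketch: keep a uniform random sample of
    `cap` rows over a stream. Assumes len(reservoir) == cap on entry;
    returns the updated count of rows seen."""
    for i in range(len(new_rows)):
        seen_so_far += 1
        # The j-th stream element replaces a uniformly random slot
        # with probability cap / j.
        slot = rng.integers(0, seen_so_far)
        if slot < cap:
            reservoir.iloc[slot] = new_rows.iloc[i]
    return seen_so_far
```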
@@ -225,6 +246,70 @@ class DataSimulator:
         """
         params = column.distribution_params
 
+        # ========== CORRELATED COLUMN GENERATION ==========
+        # If this column depends on another column's value, use conditional distribution
+        if "depends_on" in params and table_data is not None:
+            parent_col = params["depends_on"]
+            mapping = params.get("mapping", {})
+
+            if parent_col in table_data.columns and mapping:
+                parent_values = table_data[parent_col].values
+
+                # Check if it's numeric or categorical mapping
+                first_val = next(iter(mapping.values()))
+                if isinstance(first_val, dict) and "mean" in first_val:
+                    # Numeric conditional distribution (e.g., salary based on job_title)
+                    # mapping = {"Intern": {"mean": 40000, "std": 5000}, "CTO": {"mean": 200000, "std": 30000}}
+                    values = np.zeros(size)
+                    for key, dist in mapping.items():
+                        mask = parent_values == key
+                        count = mask.sum()
+                        if count > 0:
+                            mean = dist.get("mean", 50000)
+                            std = dist.get("std", mean * 0.1)
+                            values[mask] = self.rng.normal(mean, std, count)
+
+                    # Handle values that didn't match any key (use default)
+                    default = params.get("default", {"mean": 50000, "std": 10000})
+                    unmatched = ~np.isin(parent_values, list(mapping.keys()))
+                    if unmatched.sum() > 0:
+                        values[unmatched] = self.rng.normal(
+                            default.get("mean", 50000),
+                            default.get("std", 10000),
+                            unmatched.sum()
+                        )
+                    return values
+
+                elif isinstance(first_val, list):
+                    # Categorical conditional (e.g., state based on country)
+                    # mapping = {"USA": ["CA", "TX", "NY"], "UK": ["England", "Scotland"]}
+                    values = np.empty(size, dtype=object)
+                    for key, choices in mapping.items():
+                        mask = parent_values == key
+                        count = mask.sum()
+                        if count > 0:
+                            values[mask] = self.rng.choice(choices, count)
+
+                    # Default for unmatched
+                    default_choices = params.get("default", ["Unknown"])
+                    unmatched = values == None  # noqa
+                    if unmatched.sum() > 0:
+                        values[unmatched] = self.rng.choice(default_choices, unmatched.sum())
+                    return values
+
+                elif isinstance(first_val, (int, float)):
+                    # Probability-based boolean (e.g., churn probability based on plan)
+                    # mapping = {"free": 0.3, "pro": 0.1, "enterprise": 0.05}
+                    values = np.zeros(size, dtype=bool)
+                    for key, prob in mapping.items():
+                        mask = parent_values == key
+                        count = mask.sum()
+                        if count > 0:
+                            values[mask] = self.rng.random(count) < prob
+                    return values
+
+        # ========== STANDARD COLUMN GENERATION ==========
+
         # CATEGORICAL
         if column.type == "categorical":
             choices = params.get("choices", ["A", "B", "C"])
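The `mapping` value type selects between three conditional forms, as the inline comments above show. Illustrative `distribution_params` payloads for each form (the column and value names are made up for the example):

```python
# 1. Dict values with "mean"/"std": per-group normal draws (numeric).
salary_params = {
    "depends_on": "job_title",
    "mapping": {
        "Intern": {"mean": 40000, "std": 5000},
        "CTO": {"mean": 200000, "std": 30000},
    },
    "default": {"mean": 50000, "std": 10000},  # used for unmatched parents
}

# 2. List values: per-group uniform choice (categorical).
state_params = {
    "depends_on": "country",
    "mapping": {"USA": ["CA", "TX", "NY"], "UK": ["England", "Scotland"]},
    "default": ["Unknown"],
}

# 3. Scalar values: per-group Bernoulli probability (boolean).
churned_params = {
    "depends_on": "plan",
    "mapping": {"free": 0.3, "pro": 0.1, "enterprise": 0.05},
}
```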
@@ -469,23 +554,59 @@
             return values
 
         if text_type == "name":
-
+            pool_key = "text_name"
+            if pool_key not in self._text_pools:
+                pool_size = min(size, self.TEXT_POOL_SIZE)
+                self._text_pools[pool_key] = np.array([self.text_gen.name() for _ in range(pool_size)])
+            values = self.rng.choice(self._text_pools[pool_key], size=size)
         elif text_type == "email":
-
+            pool_key = "text_email"
+            if pool_key not in self._text_pools:
+                pool_size = min(size, self.TEXT_POOL_SIZE)
+                self._text_pools[pool_key] = np.array([self.text_gen.email() for _ in range(pool_size)])
+            values = self.rng.choice(self._text_pools[pool_key], size=size)
         elif text_type == "company":
-
+            pool_key = "text_company"
+            if pool_key not in self._text_pools:
+                pool_size = min(size, self.TEXT_POOL_SIZE)
+                self._text_pools[pool_key] = np.array([self.text_gen.company() for _ in range(pool_size)])
+            values = self.rng.choice(self._text_pools[pool_key], size=size)
         elif text_type == "sentence":
-
+            pool_key = "text_sentence"
+            if pool_key not in self._text_pools:
+                pool_size = min(size, self.TEXT_POOL_SIZE)
+                self._text_pools[pool_key] = np.array([self.text_gen.sentence() for _ in range(pool_size)])
+            values = self.rng.choice(self._text_pools[pool_key], size=size)
         elif text_type == "word":
-
+            pool_key = "text_word"
+            if pool_key not in self._text_pools:
+                pool_size = min(size, self.TEXT_POOL_SIZE)
+                self._text_pools[pool_key] = np.array([self.text_gen.word() for _ in range(pool_size)])
+            values = self.rng.choice(self._text_pools[pool_key], size=size)
         elif text_type == "address":
-
+            pool_key = "text_address"
+            if pool_key not in self._text_pools:
+                pool_size = min(size, self.TEXT_POOL_SIZE)
+                self._text_pools[pool_key] = np.array([self.text_gen.full_address() for _ in range(pool_size)])
+            values = self.rng.choice(self._text_pools[pool_key], size=size)
         elif text_type == "phone":
-
+            pool_key = "text_phone"
+            if pool_key not in self._text_pools:
+                pool_size = min(size, self.TEXT_POOL_SIZE)
+                self._text_pools[pool_key] = np.array([self.text_gen.phone_number() for _ in range(pool_size)])
+            values = self.rng.choice(self._text_pools[pool_key], size=size)
         elif text_type == "url":
-
+            pool_key = "text_url"
+            if pool_key not in self._text_pools:
+                pool_size = min(size, self.TEXT_POOL_SIZE)
+                self._text_pools[pool_key] = np.array([self.text_gen.url() for _ in range(pool_size)])
+            values = self.rng.choice(self._text_pools[pool_key], size=size)
         else:
-
+            pool_key = "text_sentence"
+            if pool_key not in self._text_pools:
+                pool_size = min(size, self.TEXT_POOL_SIZE)
+                self._text_pools[pool_key] = np.array([self.text_gen.sentence() for _ in range(pool_size)])
+            values = self.rng.choice(self._text_pools[pool_key], size=size)
 
         return values
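Every branch above repeats the same pool-and-sample pattern: lazily build at most `TEXT_POOL_SIZE` values with the per-row text generator, then draw the whole column from that pool with replacement, trading string uniqueness for vectorized speed (large columns will contain duplicates). The eight near-identical blocks could collapse into one helper; a sketch of that refactor (the method name is hypothetical, not part of the package):

```python
def _sample_from_pool(self, pool_key: str, make_value, size: int) -> np.ndarray:
    """Hypothetical helper equivalent to each branch above: build a capped
    pool of generated strings once, then sample it with replacement."""
    if pool_key not in self._text_pools:
        pool_size = min(size, self.TEXT_POOL_SIZE)
        self._text_pools[pool_key] = np.array([make_value() for _ in range(pool_size)])
    return self.rng.choice(self._text_pools[pool_key], size=size)

# e.g. values = self._sample_from_pool("text_name", self.text_gen.name, size)
```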