misata-0.1.0b0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +48 -0
- misata/api.py +460 -0
- misata/audit.py +415 -0
- misata/benchmark.py +376 -0
- misata/cli.py +680 -0
- misata/codegen.py +153 -0
- misata/curve_fitting.py +106 -0
- misata/customization.py +256 -0
- misata/feedback.py +433 -0
- misata/formulas.py +362 -0
- misata/generators.py +247 -0
- misata/hybrid.py +398 -0
- misata/llm_parser.py +493 -0
- misata/noise.py +346 -0
- misata/schema.py +252 -0
- misata/semantic.py +185 -0
- misata/simulator.py +742 -0
- misata/story_parser.py +425 -0
- misata/templates/__init__.py +444 -0
- misata/validation.py +313 -0
- misata-0.1.0b0.dist-info/METADATA +291 -0
- misata-0.1.0b0.dist-info/RECORD +25 -0
- misata-0.1.0b0.dist-info/WHEEL +5 -0
- misata-0.1.0b0.dist-info/entry_points.txt +2 -0
- misata-0.1.0b0.dist-info/top_level.txt +1 -0
misata/simulator.py
ADDED
@@ -0,0 +1,742 @@
+"""
+Core DataSimulator class for high-performance synthetic data generation.
+
+This module implements vectorized data generation with support for:
+- Topological sorting of table dependencies
+- Vectorized column generation (NO LOOPS)
+- Referential integrity enforcement
+- Scenario event application
+- Pure Python text generation (no external dependencies)
+"""
+
+import warnings
+from collections import defaultdict, deque
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+import pandas as pd
+
+from misata.generators import TextGenerator
+from misata.schema import Column, Relationship, ScenarioEvent, SchemaConfig
+
+
+class DataSimulator:
+    """
+    High-performance synthetic data simulator.
+
+    Generates synthetic datasets based on SchemaConfig definitions,
+    using vectorized operations for maximum performance.
+
+    Attributes:
+        config: Schema configuration
+        data: Generated dataframes (table_name -> DataFrame)
+        text_gen: TextGenerator for entity generation
+        rng: NumPy random generator for reproducibility
+    """
+
+    def __init__(self, config: SchemaConfig,
+                 apply_semantic_fixes: bool = True, batch_size: int = 10_000):
+        """
+        Initialize the simulator.
+
+        Args:
+            config: Schema configuration defining tables, columns, and relationships
+            apply_semantic_fixes: Auto-fix column types based on semantic patterns
+            batch_size: Number of rows to generate per batch
+        """
+        self.config = config
+        self.context: Dict[str, pd.DataFrame] = {}  # Lightweight context (IDs only)
+        self.text_gen = TextGenerator(seed=config.seed)
+        self.batch_size = batch_size
+        self._unique_pools: Dict[str, np.ndarray] = {}  # Store pre-generated unique values
+        self._unique_counters: Dict[str, int] = {}  # Track usage of unique pools
+
+        # Apply semantic inference to fix column types
+        if apply_semantic_fixes:
+            from misata.semantic import apply_semantic_inference
+            self.config.columns = apply_semantic_inference(self.config.columns)
+
+        # Set random seed if provided
+        seed = config.seed if config.seed is not None else np.random.randint(0, 2**32 - 1)
+        self.rng = np.random.default_rng(seed)
+        np.random.seed(seed)  # For legacy numpy.random calls
+
+    def topological_sort(self) -> List[str]:
+        """
+        Determine table generation order using topological sort.
+
+        Parent tables must be generated before child tables to ensure
+        referential integrity.
+
+        Returns:
+            List of table names in dependency order
+
+        Raises:
+            ValueError: If circular dependencies are detected
+        """
+        # Build adjacency list and in-degree map
+        graph = defaultdict(list)
+        in_degree = {table.name: 0 for table in self.config.tables}
+
+        for rel in self.config.relationships:
+            graph[rel.parent_table].append(rel.child_table)
+            in_degree[rel.child_table] += 1
+
+        # Kahn's algorithm for topological sort
+        queue = deque([name for name, degree in in_degree.items() if degree == 0])
+        sorted_tables = []
+
+        while queue:
+            table_name = queue.popleft()
+            sorted_tables.append(table_name)
+
+            for neighbor in graph[table_name]:
+                in_degree[neighbor] -= 1
+                if in_degree[neighbor] == 0:
+                    queue.append(neighbor)
+
+        # Check for circular dependencies
+        if len(sorted_tables) != len(self.config.tables):
+            raise ValueError(
+                f"Circular dependency detected in relationships. "
+                f"Generated {len(sorted_tables)} / {len(self.config.tables)} tables."
+            )
+
+        return sorted_tables
+
+    def _get_parent_ids(self, relationship: Relationship) -> np.ndarray:
+        """
+        Get valid parent IDs for foreign key generation, applying filters if defined.
+
+        Args:
+            relationship: Relationship definition
+
+        Returns:
+            Array of valid parent IDs
+        """
+        if relationship.parent_table not in self.context:
+            return np.array([])
+
+        parent_df = self.context[relationship.parent_table]
+        if relationship.parent_key not in parent_df.columns:
+            return np.array([])
+
+        # Apply filters if defined (Logic Gap Fix)
+        # Apply filters if defined (Logic Gap Fix)
+        if relationship.filters:
+            mask = np.ones(len(parent_df), dtype=bool)
+            for col, val in relationship.filters.items():
+                if col in parent_df.columns:
+                    mask &= (parent_df[col] == val)
+                else:
+                    # If filter column missing from context, can't filter.
+                    # Assume mismatch if column missing.
+                    mask[:] = False
+
+            valid_ids = parent_df.loc[mask, relationship.parent_key].values
+        else:
+            valid_ids = parent_df[relationship.parent_key].values
+
+        return valid_ids
+
+    def _update_context(self, table_name: str, df: pd.DataFrame) -> None:
+        """
+        Update the context with key columns from the generated batch.
+
+        Smart Context Logic:
+        1. Store Primary Key ('id')
+        2. Store columns used as foreign keys by children (parent_key)
+        3. Store columns used in Relationship filters (Logic Gap fix)
+        4. Store columns used in 'relative_to' date constraints (Time Travel fix)
+        """
+        needed_cols = {'id'}
+
+        # 2. FK and Filter dependencies
+        for rel in self.config.relationships:
+            if rel.parent_table == table_name:
+                needed_cols.add(rel.parent_key)
+                if rel.filters:
+                    for col in rel.filters.keys():
+                        needed_cols.add(col)
+
+        # 4. Filter 'relative_to' dependencies
+        # This requires scanning ALL columns of ALL child tables to see if they reference this table
+        # Optimization: Build this dependency map once in __init__?
+        # For now, we scan here. It's fast enough for schema sizes < 100 tables.
+        for child_table in self.config.tables:
+            child_cols = self.config.get_columns(child_table.name)
+            for col in child_cols:
+                if col.type == 'date' and 'relative_to' in col.distribution_params:
+                    # Format: "parent_table.column"
+                    try:
+                        ptable, pcol = col.distribution_params['relative_to'].split('.')
+                        if ptable == table_name:
+                            needed_cols.add(pcol)
+                    except:
+                        pass
+
+        cols_to_store = [c for c in needed_cols if c in df.columns]
+        if not cols_to_store:
+            return
+
+        ctx_df = df[cols_to_store].copy()
+
+        if table_name not in self.context:
+            self.context[table_name] = ctx_df
+        else:
+            # Append to existing context
+            self.context[table_name] = pd.concat([self.context[table_name], ctx_df], ignore_index=True)
+
+    def generate_column(
+        self,
+        table_name: str,
+        column: Column,
+        size: int,
+        table_data: Optional[pd.DataFrame] = None,
+    ) -> np.ndarray:
+        """
+        Generate a single column using vectorized operations.
+
+        Args:
+            table_name: Name of the table being generated
+            column: Column definition
+            size: Number of values to generate
+            table_data: Partially generated table (for columns that depend on other columns)
+
+        Returns:
+            Numpy array of generated values
+        """
+        params = column.distribution_params
+
+        # CATEGORICAL
+        if column.type == "categorical":
+            choices = params["choices"]
+            probabilities = params.get("probabilities", None)
+
+            if probabilities is not None:
+                # Normalize probabilities
+                probabilities = np.array(probabilities)
+                probabilities = probabilities / probabilities.sum()
+
+            values = self.rng.choice(choices, size=size, p=probabilities)
+            return values
+
+        # INTEGER
+        elif column.type == "int":
+            # Handle unique integer generation
+            if column.unique:
+                pool_key = f"{table_name}.{column.name}"
+
+                # Verify we aren't asking for more uniques than possible
+                low = params.get("min", 0)
+                high = params.get("max", 1000)
+                total_needed_for_table = self.config.get_table(table_name).row_count
+
+                if pool_key not in self._unique_pools:
+                    # Check range capacity
+                    if (high - low) < total_needed_for_table:
+                        # Auto-expand range to fix user error (common in tests/small ranges)
+                        warnings.warn(f"Range {high-low} too small for unique column {column.name} (needs {total_needed_for_table}). Extending max.")
+                        high = low + total_needed_for_table + 100
+
+                    # Generate full permutation
+                    pool = np.arange(low, high)
+                    self.rng.shuffle(pool)
+                    self._unique_pools[pool_key] = pool
+                    self._unique_counters[pool_key] = 0
+
+                # Fetch chunk
+                current_idx = self._unique_counters[pool_key]
+                if current_idx + size > len(self._unique_pools[pool_key]):
+                    raise ValueError(f"Exhausted unique values for {column.name}")
+
+                values = self._unique_pools[pool_key][current_idx : current_idx + size]
+                self._unique_counters[pool_key] += size
+                return values.astype(int)
+
+            distribution = params.get("distribution", "normal")
+
+            # Handle categorical distribution (fixed choices)
+            if distribution == "categorical" or "choices" in params:
+                choices = params.get("choices", [1, 2, 3, 4, 5])
+                probabilities = params.get("probabilities", None)
+                if probabilities is not None:
+                    probabilities = np.array(probabilities)
+                    probabilities = probabilities / probabilities.sum()
+                values = self.rng.choice(choices, size=size, p=probabilities)
+                return np.array(values).astype(int)
+            elif distribution == "normal":
+                mean = params.get("mean", 100)
+                std = params.get("std", 20)
+                values = self.rng.normal(mean, std, size=size).astype(int)
+            elif distribution == "uniform":
+                low = params.get("min", 0)
+                high = params.get("max", 1000)
+                values = self.rng.integers(low, high, size=size)
+            elif distribution == "poisson":
+                lam = params.get("lambda", 10)
+                values = self.rng.poisson(lam, size=size)
+            else:
+                low = params.get("min", 0)
+                high = params.get("max", 1000)
+                values = self.rng.integers(low, high, size=size)
+
+            if "min" in params:
+                values = np.maximum(values, params["min"])
+            if "max" in params:
+                values = np.minimum(values, params["max"])
+
+            return values
+
+        # FLOAT
+        elif column.type == "float":
+            distribution = params.get("distribution", "normal")
+
+            if distribution == "categorical" or "choices" in params:
+                choices = params.get("choices", [1.0, 2.0, 3.0])
+                probabilities = params.get("probabilities", None)
+                if probabilities is not None:
+                    probabilities = np.array(probabilities)
+                    probabilities = probabilities / probabilities.sum()
+                values = self.rng.choice(choices, size=size, p=probabilities)
+                return np.array(values).astype(float)
+            elif distribution == "normal":
+                mean = params.get("mean", 100.0)
+                std = params.get("std", 20.0)
+                values = self.rng.normal(mean, std, size=size)
+            elif distribution == "uniform":
+                low = params.get("min", 0.0)
+                high = params.get("max", 1000.0)
+                values = self.rng.uniform(low, high, size=size)
+            elif distribution == "exponential":
+                scale = params.get("scale", 1.0)
+                values = self.rng.exponential(scale, size=size)
+            else:
+                low = params.get("min", 0.0)
+                high = params.get("max", 1000.0)
+                values = self.rng.uniform(low, high, size=size)
+
+            if "min" in params:
+                values = np.maximum(values, params["min"])
+            if "max" in params:
+                values = np.minimum(values, params["max"])
+            if "decimals" in params:
+                values = np.round(values, params["decimals"])
+
+            return values
+
+        # DATE
+        elif column.type == "date":
+            # Parent-Relative Date Generation (Time Travel Fix)
+            if "relative_to" in params:
+                # Format: "parent_table.column_name"
+                try:
+                    rel_table, rel_col = params["relative_to"].split(".")
+                    # Find relationship
+                    relationship = None
+                    for rel in self.config.relationships:
+                        if rel.child_table == table_name and rel.parent_table == rel_table:
+                            relationship = rel
+                            break
+
+                    if relationship and table_data is not None and relationship.child_key in table_data.columns:
+                        # Vectorized lookup!
+                        child_fk_values = table_data[relationship.child_key].values
+                        parent_df = self.context.get(rel_table)
+
+                        if parent_df is not None and rel_col in parent_df.columns:
+                            # Map FK to Parent Date
+                            # Create a lookup series/dict
+                            parent_date_map = parent_df.set_index(relationship.parent_key)[rel_col]
+                            parent_dates = parent_date_map.reindex(child_fk_values).values
+
+                            # Generate deltas
+                            min_delta = params.get("min_delta_days", 0)
+                            max_delta = params.get("max_delta_days", 365)
+                            deltas = self.rng.integers(min_delta, max_delta, size=size)
+                            deltas_ns = deltas.astype('timedelta64[D]')
+
+                            # Child Date = Parent Date + Delta
+                            values = parent_dates + deltas_ns
+                            return values
+                except Exception as e:
+                    warnings.warn(f"Failed to generate relative date: {e}. Falling back to random range.")
+
+            start = pd.to_datetime(params["start"])
+            end = pd.to_datetime(params["end"])
+
+            start_int = start.value
+            end_int = end.value
+            random_ints = self.rng.integers(start_int, end_int, size=size)
+            values = pd.to_datetime(random_ints)
+
+            return values
+
+        # FOREIGN KEY
+        elif column.type == "foreign_key":
+            relationship = None
+            for rel in self.config.relationships:
+                if rel.child_table == table_name and rel.child_key == column.name:
+                    relationship = rel
+                    break
+
+            if relationship is None:
+                warnings.warn(
+                    f"No relationship defined for foreign key '{column.name}' "
+                    f"in table '{table_name}'. Generating sequential IDs instead."
+                )
+                values = self.rng.integers(1, max(size // 10, 100), size=size)
+                return values
+
+            # Check context instead of data
+            if relationship.parent_table not in self.context:
+                warnings.warn(
+                    f"Parent table '{relationship.parent_table}' not yet generated for "
+                    f"foreign key '{column.name}'. Generating sequential IDs instead."
+                )
+                values = self.rng.integers(1, max(size // 10, 100), size=size)
+                return values
+
+            parent_ids = self._get_parent_ids(relationship)
+
+            if len(parent_ids) == 0:
+                warnings.warn(
+                    f"Parent table '{relationship.parent_table}' has no valid IDs in context (after filters). "
+                    f"Generating sequential IDs for foreign key '{column.name}'."
+                )
+                values = self.rng.integers(1, max(size // 10, 100), size=size)
+                return values
+
+            values = self.rng.choice(parent_ids, size=size)
+            return values
+
+        # TEXT
+        elif column.type == "text":
+            text_type = params.get("text_type", "sentence")
+
+            if text_type == "name":
+                values = np.array([self.text_gen.name() for _ in range(size)])
+            elif text_type == "email":
+                values = np.array([self.text_gen.email() for _ in range(size)])
+            elif text_type == "company":
+                values = np.array([self.text_gen.company() for _ in range(size)])
+            elif text_type == "sentence":
+                values = np.array([self.text_gen.sentence() for _ in range(size)])
+            elif text_type == "word":
+                values = np.array([self.text_gen.word() for _ in range(size)])
+            elif text_type == "address":
+                values = np.array([self.text_gen.full_address() for _ in range(size)])
+            elif text_type == "phone":
+                values = np.array([self.text_gen.phone_number() for _ in range(size)])
+            elif text_type == "url":
+                values = np.array([self.text_gen.url() for _ in range(size)])
+            else:
+                values = np.array([self.text_gen.sentence() for _ in range(size)])
+
+            return values
+
+        # BOOLEAN
+        elif column.type == "boolean":
+            probability = params.get("probability", 0.5)
+            values = self.rng.random(size) < probability
+            return values
+
+        else:
+            raise ValueError(f"Unknown column type: {column.type}")
+
+    def apply_event(self, df: pd.DataFrame, event: ScenarioEvent) -> pd.DataFrame:
+        """Apply a scenario event to modify data based on conditions."""
+        try:
+            mask = df.eval(event.condition)
+        except Exception as e:
+            warnings.warn(f"Failed to evaluate condition '{event.condition}' for event '{event.name}': {e}")
+            return df
+
+        if event.modifier_type == "multiply":
+            df.loc[mask, event.column] *= event.modifier_value
+        elif event.modifier_type == "add":
+            df.loc[mask, event.column] += event.modifier_value
+        elif event.modifier_type == "set":
+            df.loc[mask, event.column] = event.modifier_value
+        elif event.modifier_type == "function":
+            warnings.warn(f"Function modifiers not yet implemented for event '{event.name}'")
+
+        return df
+
+    def _update_context(self, table_name: str, df: pd.DataFrame) -> None:
+        """
+        Update the context with key columns from the generated batch.
+
+        Smart Context Logic:
+        1. Store Primary Key ('id')
+        2. Store columns used as foreign keys by children (parent_key)
+        3. Store columns used in Relationship filters (Logic Gap fix)
+        4. Store columns used in 'relative_to' date constraints (Time Travel fix)
+        """
+        needed_cols = {'id'}
+
+        # 2. FK and Filter dependencies
+        for rel in self.config.relationships:
+            if rel.parent_table == table_name:
+                needed_cols.add(rel.parent_key)
+                if rel.filters:
+                    for col in rel.filters.keys():
+                        needed_cols.add(col)
+
+        # 4. Filter 'relative_to' dependencies
+        # This requires scanning ALL columns of ALL child tables to see if they reference this table
+        # Optimization: Build this dependency map once in __init__?
+        # For now, we scan here. It's fast enough for schema sizes < 100 tables.
+        for child_table in self.config.tables:
+            child_cols = self.config.get_columns(child_table.name)
+            for col in child_cols:
+                if col.type == 'date' and 'relative_to' in col.distribution_params:
+                    # Format: "parent_table.column"
+                    try:
+                        ptable, pcol = col.distribution_params['relative_to'].split('.')
+                        if ptable == table_name:
+                            needed_cols.add(pcol)
+                    except Exception:
+                        pass
+
+        cols_to_store = [c for c in needed_cols if c in df.columns]
+        if not cols_to_store:
+            return
+
+        ctx_df = df[cols_to_store].copy()
+
+        if table_name not in self.context:
+            self.context[table_name] = ctx_df
+        else:
+            # Append to existing context
+            self.context[table_name] = pd.concat([self.context[table_name], ctx_df], ignore_index=True)
+
+    def generate_batches(self, table_name: str) -> Any:
+        """
+        Yield batches of generated data for a table.
+
+        Args:
+            table_name: Name of the table to generate
+
+        Yields:
+            DataFrame batch
+        """
+        table = self.config.get_table(table_name)
+        if table is None:
+            raise ValueError(f"Table '{table_name}' not found in schema")
+
+        # Reference table with inline data - yield as single batch
+        if table.is_reference and table.inline_data:
+            df = pd.DataFrame(table.inline_data)
+            self._update_context(table_name, df)
+            yield df
+            return
+
+        columns = self.config.get_columns(table_name)
+        total_rows = table.row_count
+
+        rows_generated = 0
+
+        while rows_generated < total_rows:
+            batch_size = min(self.batch_size, total_rows - rows_generated)
+
+            # Generate batch
+            data = {}
+            df_batch = pd.DataFrame()
+
+            for column in columns:
+                values = self.generate_column(table_name, column, batch_size, df_batch)
+                data[column.name] = values
+                df_batch[column.name] = values
+
+            df_batch = pd.DataFrame(data)
+
+            # Apply formulas
+            df_batch = self._apply_formula_columns(df_batch, table_name)
+
+            # Post-process
+            df_batch = self._fix_correlated_columns(df_batch, table_name)
+
+            # Apply events
+            table_events = [e for e in self.config.events if e.table == table_name]
+            for event in table_events:
+                df_batch = self.apply_event(df_batch, event)
+
+            # Apply business rule constraints
+            df_batch = self.apply_constraints(df_batch, table)
+
+            # Update context for future batches/tables
+            self._update_context(table_name, df_batch)
+
+            yield df_batch
+
+            rows_generated += batch_size
+
+    def apply_constraints(self, df: pd.DataFrame, table: Any) -> pd.DataFrame:
+        """
+        Apply business rule constraints to generated data.
+
+        Args:
+            df: DataFrame batch to constrain
+            table: Table definition containing constraints
+
+        Returns:
+            Constrained DataFrame
+        """
+        if not hasattr(table, 'constraints') or not table.constraints:
+            return df
+
+        for constraint in table.constraints:
+            df = self._apply_single_constraint(df, constraint)
+
+        return df
+
+    def _apply_single_constraint(self, df: pd.DataFrame, constraint: Any) -> pd.DataFrame:
+        """Apply a single constraint to the DataFrame."""
+
+        # Validate required columns exist
+        for col in constraint.group_by:
+            if col not in df.columns:
+                warnings.warn(f"Constraint '{constraint.name}': Column '{col}' not found. Skipping.")
+                return df
+
+        if constraint.column and constraint.column not in df.columns:
+            warnings.warn(f"Constraint '{constraint.name}': Target column '{constraint.column}' not found. Skipping.")
+            return df
+
+        if constraint.type == "max_per_group":
+            # Cap values per group (e.g., max 8 hours per employee per day)
+            if constraint.action == "cap":
+                # Simple cap: clip the value column
+                df[constraint.column] = df.groupby(constraint.group_by)[constraint.column].transform(
+                    lambda x: x.clip(upper=constraint.value)
+                )
+            elif constraint.action == "redistribute":
+                # More complex: redistribute excess across the group
+                # For now, just cap
+                df[constraint.column] = df.groupby(constraint.group_by)[constraint.column].transform(
+                    lambda x: x.clip(upper=constraint.value)
+                )
+
+        elif constraint.type == "sum_limit":
+            # Limit sum per group (e.g., max 8 total hours per employee per day across projects)
+            def cap_sum(group):
+                total = group[constraint.column].sum()
+                if total > constraint.value:
+                    # Scale down proportionally
+                    scale = constraint.value / total
+                    group[constraint.column] = group[constraint.column] * scale
+                return group
+
+            df = df.groupby(constraint.group_by, group_keys=False).apply(cap_sum)
+
+        elif constraint.type == "unique_combination":
+            # Ensure unique combinations (e.g., one timesheet per employee-project-date)
+            if constraint.action == "drop":
+                df = df.drop_duplicates(subset=constraint.group_by, keep='first')
+
+        elif constraint.type == "min_per_group":
+            # Floor values per group
+            if constraint.action == "cap":
+                df[constraint.column] = df.groupby(constraint.group_by)[constraint.column].transform(
+                    lambda x: x.clip(lower=constraint.value)
+                )
+
+        return df
+
+    def _apply_formula_columns(self, df: pd.DataFrame, table_name: str) -> pd.DataFrame:
+        """Apply formula-based derived columns using context for lookups."""
+        try:
+            from misata.formulas import FormulaEngine
+        except ImportError:
+            return df
+
+        columns = self.config.get_columns(table_name)
+        formula_cols = [c for c in columns if c.distribution_params.get("formula")]
+
+        if not formula_cols:
+            return df
+
+        # FormulaEngine now needs context, not full data
+        # BUT FormulaEngine expects full DataFrames in tables dict for lookups
+        # Our self.context IS a Dict[str, pd.DataFrame], just restricted columns
+        # So it should work if the formulas only ref columns in context (like id, price)
+        # Note: We need to make sure context has columns needed for formulas!
+        # Current _update_context only saves PK/FKs.
+        # TODO: Analyze formulas to find required context columns?
+        # For now, simplistic approach: formulas usually look up 'price', 'cost' etc.
+        # We might need to store more in context.
+        # Let's trust user or update _update_context to be smarter later.
+
+        engine = FormulaEngine(self.context)
+
+        for col in formula_cols:
+            formula = col.distribution_params["formula"]
+            # For correctness, lookups should work.
+            # If context doesn't have the column, FormulaEngine raises Error.
+            try:
+                result = engine.evaluate_with_lookups(df, formula)
+                df[col.name] = result
+            except ValueError as e:
+                # Warn and skip if context missing
+                warnings.warn(f"Formula evaluation failed (context missing?): {e}")
+
+        return df
+
+    def _fix_correlated_columns(self, df: pd.DataFrame, table_name: str) -> pd.DataFrame:
+        """Post-process to fix common semantically correlated columns."""
+        columns = list(df.columns)
+        if "plan" in columns and "price" in columns:
+            plan_prices = {
+                "free": 0.0, "basic": 9.99, "starter": 9.99, "premium": 19.99,
+                "pro": 19.99, "professional": 29.99, "enterprise": 49.99,
+                "business": 49.99, "unlimited": 99.99,
+            }
+            df["price"] = df["plan"].map(lambda p: plan_prices.get(str(p).lower(), df["price"].iloc[0]))
+        return df
+
+    def generate_all(self):
+        """
+        Generate all tables in dependency order.
+
+        Yields:
+            Tuple[str, pd.DataFrame]: (table_name, batch_df)
+        """
+        sorted_tables = self.topological_sort()
+
+        for table_name in sorted_tables:
+            for batch in self.generate_batches(table_name):
+                yield table_name, batch
+
+    def export_to_csv(self, output_dir: str = ".") -> None:
+        """
+        Export all generated tables to CSV files, creating files progressively.
+        """
+        import os
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Track open file handles or just append?
+        # Appending is safer.
+        files_created = set()
+
+        for table_name, batch_df in self.generate_all():
+            output_path = os.path.join(output_dir, f"{table_name}.csv")
+            mode = 'a' if table_name in files_created else 'w'
+            header = table_name not in files_created
+
+            batch_df.to_csv(output_path, mode=mode, header=header, index=False)
+            files_created.add(table_name)
+
+    def get_summary(self) -> str:
+        """
+        Get a summary of generated data (from context).
+        Only shows context info since full data isn't kept.
+        """
+        summary_lines = ["Generated Context Summary (Lightweight):", "=" * 50]
+
+        for table_name, df in self.context.items():
+            summary_lines.append(f"\n{table_name}: {len(df):,} rows tracked in context")
+            summary_lines.append(f" Context Columns: {list(df.columns)}")
+            summary_lines.append(f" Context Memory: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
+
+        return "\n".join(summary_lines)