misata 0.1.0b0__py3-none-any.whl

misata/simulator.py ADDED
@@ -0,0 +1,742 @@
+ """
+ Core DataSimulator class for high-performance synthetic data generation.
+
+ This module implements vectorized data generation with support for:
+ - Topological sorting of table dependencies
+ - Vectorized column generation (no per-row loops for numeric columns)
+ - Referential integrity enforcement
+ - Scenario event application
+ - Pure Python text generation (no external dependencies)
+ """
+
+ import warnings
+ from collections import defaultdict, deque
+ from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+ import numpy as np
+ import pandas as pd
+
+ from misata.generators import TextGenerator
+ from misata.schema import Column, Relationship, ScenarioEvent, SchemaConfig
+
+
+ class DataSimulator:
+     """
+     High-performance synthetic data simulator.
+
+     Generates synthetic datasets based on SchemaConfig definitions,
+     using vectorized operations for maximum performance.
+
+     Attributes:
+         config: Schema configuration
+         context: Lightweight per-table context (key columns only, table_name -> DataFrame)
+         text_gen: TextGenerator for entity generation
+         rng: NumPy random generator for reproducibility
+     """
+
+     def __init__(self, config: SchemaConfig,
+                  apply_semantic_fixes: bool = True, batch_size: int = 10_000):
+         """
+         Initialize the simulator.
+
+         Args:
+             config: Schema configuration defining tables, columns, and relationships
+             apply_semantic_fixes: Auto-fix column types based on semantic patterns
+             batch_size: Number of rows to generate per batch
+         """
+         self.config = config
+         self.context: Dict[str, pd.DataFrame] = {}  # Lightweight context (key columns only)
+         self.text_gen = TextGenerator(seed=config.seed)
+         self.batch_size = batch_size
+         self._unique_pools: Dict[str, np.ndarray] = {}  # Pre-generated unique value pools
+         self._unique_counters: Dict[str, int] = {}  # Track usage of unique pools
+
+         # Apply semantic inference to fix column types
+         if apply_semantic_fixes:
+             from misata.semantic import apply_semantic_inference
+             self.config.columns = apply_semantic_inference(self.config.columns)
+
+         # Seed both the new-style Generator and the legacy global state
+         seed = config.seed if config.seed is not None else np.random.randint(0, 2**32 - 1)
+         self.rng = np.random.default_rng(seed)
+         np.random.seed(seed)  # For legacy numpy.random calls
+
+     def topological_sort(self) -> List[str]:
+         """
+         Determine table generation order using topological sort.
+
+         Parent tables must be generated before child tables to ensure
+         referential integrity.
+
+         Returns:
+             List of table names in dependency order
+
+         Raises:
+             ValueError: If circular dependencies are detected
+         """
+         # Build adjacency list and in-degree map
+         graph = defaultdict(list)
+         in_degree = {table.name: 0 for table in self.config.tables}
+
+         for rel in self.config.relationships:
+             graph[rel.parent_table].append(rel.child_table)
+             in_degree[rel.child_table] += 1
+
+         # Kahn's algorithm for topological sort
+         queue = deque([name for name, degree in in_degree.items() if degree == 0])
+         sorted_tables = []
+
+         while queue:
+             table_name = queue.popleft()
+             sorted_tables.append(table_name)
+
+             for neighbor in graph[table_name]:
+                 in_degree[neighbor] -= 1
+                 if in_degree[neighbor] == 0:
+                     queue.append(neighbor)
+
+         # Check for circular dependencies
+         if len(sorted_tables) != len(self.config.tables):
+             raise ValueError(
+                 f"Circular dependency detected in relationships. "
+                 f"Generated {len(sorted_tables)} / {len(self.config.tables)} tables."
+             )
+
+         return sorted_tables
+
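+     # A minimal standalone sketch of the Kahn's-algorithm ordering above, using
+     # a hypothetical three-table schema (names are illustrative only):
+     #
+     #     from collections import defaultdict, deque
+     #     edges = [("users", "orders"), ("products", "orders")]  # parent -> child
+     #     graph, in_degree = defaultdict(list), {"users": 0, "orders": 0, "products": 0}
+     #     for parent, child in edges:
+     #         graph[parent].append(child)
+     #         in_degree[child] += 1
+     #     queue = deque(n for n, d in in_degree.items() if d == 0)
+     #     order = []
+     #     while queue:
+     #         name = queue.popleft()
+     #         order.append(name)
+     #         for nbr in graph[name]:
+     #             in_degree[nbr] -= 1
+     #             if in_degree[nbr] == 0:
+     #                 queue.append(nbr)
+     #     # order == ["users", "products", "orders"]: both parents before the child
+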
+     def _get_parent_ids(self, relationship: Relationship) -> np.ndarray:
+         """
+         Get valid parent IDs for foreign key generation, applying filters if defined.
+
+         Args:
+             relationship: Relationship definition
+
+         Returns:
+             Array of valid parent IDs
+         """
+         if relationship.parent_table not in self.context:
+             return np.array([])
+
+         parent_df = self.context[relationship.parent_table]
+         if relationship.parent_key not in parent_df.columns:
+             return np.array([])
+
+         # Apply filters if defined (Logic Gap fix)
+         if relationship.filters:
+             mask = np.ones(len(parent_df), dtype=bool)
+             for col, val in relationship.filters.items():
+                 if col in parent_df.columns:
+                     mask &= (parent_df[col] == val)
+                 else:
+                     # If the filter column is missing from the context we cannot
+                     # evaluate it; treat every row as a mismatch.
+                     mask[:] = False
+
+             valid_ids = parent_df.loc[mask, relationship.parent_key].values
+         else:
+             valid_ids = parent_df[relationship.parent_key].values
+
+         return valid_ids
+
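+     # A small sketch of the masking above on a toy context frame (column names
+     # and values hypothetical):
+     #
+     #     import numpy as np, pandas as pd
+     #     parent_df = pd.DataFrame({"id": [1, 2, 3], "status": ["active", "churned", "active"]})
+     #     mask = np.ones(len(parent_df), dtype=bool)
+     #     mask &= (parent_df["status"] == "active")
+     #     valid_ids = parent_df.loc[mask, "id"].values  # array([1, 3])
+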
+     def _update_context(self, table_name: str, df: pd.DataFrame) -> None:
+         """
+         Update the context with key columns from the generated batch.
+
+         Smart Context Logic:
+         1. Store the primary key ('id')
+         2. Store columns used as foreign keys by children (parent_key)
+         3. Store columns used in Relationship filters (Logic Gap fix)
+         4. Store columns used in 'relative_to' date constraints (Time Travel fix)
+         """
+         needed_cols = {'id'}
+
+         # 2 & 3. FK and filter dependencies
+         for rel in self.config.relationships:
+             if rel.parent_table == table_name:
+                 needed_cols.add(rel.parent_key)
+                 if rel.filters:
+                     for col in rel.filters.keys():
+                         needed_cols.add(col)
+
+         # 4. 'relative_to' dependencies.
+         # This scans all columns of all child tables to see if they reference
+         # this table. Building the dependency map once in __init__ would be an
+         # optimization, but the scan is fast enough for schemas under ~100 tables.
+         for child_table in self.config.tables:
+             child_cols = self.config.get_columns(child_table.name)
+             for col in child_cols:
+                 if col.type == 'date' and 'relative_to' in col.distribution_params:
+                     # Format: "parent_table.column"
+                     try:
+                         ptable, pcol = col.distribution_params['relative_to'].split('.')
+                         if ptable == table_name:
+                             needed_cols.add(pcol)
+                     except Exception:
+                         pass
+
+         cols_to_store = [c for c in needed_cols if c in df.columns]
+         if not cols_to_store:
+             return
+
+         ctx_df = df[cols_to_store].copy()
+
+         if table_name not in self.context:
+             self.context[table_name] = ctx_df
+         else:
+             # Append to existing context
+             self.context[table_name] = pd.concat([self.context[table_name], ctx_df], ignore_index=True)
+
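+     # A sketch of the context-accumulation pattern above: only key columns are
+     # kept, and successive batches are concatenated (toy values, hypothetical names):
+     #
+     #     import pandas as pd
+     #     context = {}
+     #     for batch in (pd.DataFrame({"id": [1, 2]}), pd.DataFrame({"id": [3, 4]})):
+     #         ctx = batch[["id"]].copy()
+     #         context["users"] = ctx if "users" not in context else pd.concat(
+     #             [context["users"], ctx], ignore_index=True)
+     #     # context["users"]["id"].tolist() == [1, 2, 3, 4]
+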
+     def generate_column(
+         self,
+         table_name: str,
+         column: Column,
+         size: int,
+         table_data: Optional[pd.DataFrame] = None,
+     ) -> np.ndarray:
+         """
+         Generate a single column using vectorized operations.
+
+         Args:
+             table_name: Name of the table being generated
+             column: Column definition
+             size: Number of values to generate
+             table_data: Partially generated table (for columns that depend on other columns)
+
+         Returns:
+             Numpy array of generated values
+         """
+         params = column.distribution_params
+
+         # CATEGORICAL
+         if column.type == "categorical":
+             choices = params["choices"]
+             probabilities = params.get("probabilities", None)
+
+             if probabilities is not None:
+                 # Normalize so the probabilities sum to exactly 1.0
+                 probabilities = np.array(probabilities)
+                 probabilities = probabilities / probabilities.sum()
+
+             values = self.rng.choice(choices, size=size, p=probabilities)
+             return values
+
+         # INTEGER
+         elif column.type == "int":
+             # Handle unique integer generation
+             if column.unique:
+                 pool_key = f"{table_name}.{column.name}"
+
+                 # Verify we aren't asking for more unique values than the range allows
+                 low = params.get("min", 0)
+                 high = params.get("max", 1000)
+                 total_needed_for_table = self.config.get_table(table_name).row_count
+
+                 if pool_key not in self._unique_pools:
+                     # Check range capacity
+                     if (high - low) < total_needed_for_table:
+                         # Auto-expand the range to fix user error (common in tests/small ranges)
+                         warnings.warn(
+                             f"Range {high - low} too small for unique column {column.name} "
+                             f"(needs {total_needed_for_table}). Extending max."
+                         )
+                         high = low + total_needed_for_table + 100
+
+                     # Generate a full shuffled permutation of the range
+                     pool = np.arange(low, high)
+                     self.rng.shuffle(pool)
+                     self._unique_pools[pool_key] = pool
+                     self._unique_counters[pool_key] = 0
+
+                 # Fetch the next chunk from the pool
+                 current_idx = self._unique_counters[pool_key]
+                 if current_idx + size > len(self._unique_pools[pool_key]):
+                     raise ValueError(f"Exhausted unique values for {column.name}")
+
+                 values = self._unique_pools[pool_key][current_idx : current_idx + size]
+                 self._unique_counters[pool_key] += size
+                 return values.astype(int)
+
+             distribution = params.get("distribution", "normal")
+
+             # Handle categorical distribution (fixed choices)
+             if distribution == "categorical" or "choices" in params:
+                 choices = params.get("choices", [1, 2, 3, 4, 5])
+                 probabilities = params.get("probabilities", None)
+                 if probabilities is not None:
+                     probabilities = np.array(probabilities)
+                     probabilities = probabilities / probabilities.sum()
+                 values = self.rng.choice(choices, size=size, p=probabilities)
+                 return np.array(values).astype(int)
+             elif distribution == "normal":
+                 mean = params.get("mean", 100)
+                 std = params.get("std", 20)
+                 values = self.rng.normal(mean, std, size=size).astype(int)
+             elif distribution == "uniform":
+                 low = params.get("min", 0)
+                 high = params.get("max", 1000)
+                 values = self.rng.integers(low, high, size=size)
+             elif distribution == "poisson":
+                 lam = params.get("lambda", 10)
+                 values = self.rng.poisson(lam, size=size)
+             else:
+                 low = params.get("min", 0)
+                 high = params.get("max", 1000)
+                 values = self.rng.integers(low, high, size=size)
+
+             # Clamp to explicit bounds if provided
+             if "min" in params:
+                 values = np.maximum(values, params["min"])
+             if "max" in params:
+                 values = np.minimum(values, params["max"])
+
+             return values
+
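+         # A standalone sketch of the shuffled-pool technique above for unique
+         # integers, consumed in chunks across batches (range and sizes hypothetical):
+         #
+         #     import numpy as np
+         #     rng = np.random.default_rng(42)
+         #     pool = np.arange(0, 1000)
+         #     rng.shuffle(pool)
+         #     first_batch, second_batch = pool[0:300], pool[300:600]
+         #     # No value repeats across batches; exhaustion occurs past index 999.
+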
+         # FLOAT
+         elif column.type == "float":
+             distribution = params.get("distribution", "normal")
+
+             if distribution == "categorical" or "choices" in params:
+                 choices = params.get("choices", [1.0, 2.0, 3.0])
+                 probabilities = params.get("probabilities", None)
+                 if probabilities is not None:
+                     probabilities = np.array(probabilities)
+                     probabilities = probabilities / probabilities.sum()
+                 values = self.rng.choice(choices, size=size, p=probabilities)
+                 return np.array(values).astype(float)
+             elif distribution == "normal":
+                 mean = params.get("mean", 100.0)
+                 std = params.get("std", 20.0)
+                 values = self.rng.normal(mean, std, size=size)
+             elif distribution == "uniform":
+                 low = params.get("min", 0.0)
+                 high = params.get("max", 1000.0)
+                 values = self.rng.uniform(low, high, size=size)
+             elif distribution == "exponential":
+                 scale = params.get("scale", 1.0)
+                 values = self.rng.exponential(scale, size=size)
+             else:
+                 low = params.get("min", 0.0)
+                 high = params.get("max", 1000.0)
+                 values = self.rng.uniform(low, high, size=size)
+
+             if "min" in params:
+                 values = np.maximum(values, params["min"])
+             if "max" in params:
+                 values = np.minimum(values, params["max"])
+             if "decimals" in params:
+                 values = np.round(values, params["decimals"])
+
+             return values
+
+         # DATE
+         elif column.type == "date":
+             # Parent-relative date generation (Time Travel fix)
+             if "relative_to" in params:
+                 # Format: "parent_table.column_name"
+                 try:
+                     rel_table, rel_col = params["relative_to"].split(".")
+                     # Find the relationship linking this table to the parent
+                     relationship = None
+                     for rel in self.config.relationships:
+                         if rel.child_table == table_name and rel.parent_table == rel_table:
+                             relationship = rel
+                             break
+
+                     if relationship and table_data is not None and relationship.child_key in table_data.columns:
+                         # Vectorized lookup: map each child FK to its parent's date
+                         child_fk_values = table_data[relationship.child_key].values
+                         parent_df = self.context.get(rel_table)
+
+                         if parent_df is not None and rel_col in parent_df.columns:
+                             # Build a lookup series indexed by the parent key
+                             parent_date_map = parent_df.set_index(relationship.parent_key)[rel_col]
+                             parent_dates = parent_date_map.reindex(child_fk_values).values
+
+                             # Generate day offsets
+                             min_delta = params.get("min_delta_days", 0)
+                             max_delta = params.get("max_delta_days", 365)
+                             deltas = self.rng.integers(min_delta, max_delta, size=size)
+                             deltas_days = deltas.astype('timedelta64[D]')
+
+                             # Child date = parent date + delta
+                             values = parent_dates + deltas_days
+                             return values
+                 except Exception as e:
+                     warnings.warn(f"Failed to generate relative date: {e}. Falling back to random range.")
+
+             start = pd.to_datetime(params["start"])
+             end = pd.to_datetime(params["end"])
+
+             # Sample uniformly between the two timestamps (as nanosecond integers)
+             start_int = start.value
+             end_int = end.value
+             random_ints = self.rng.integers(start_int, end_int, size=size)
+             values = pd.to_datetime(random_ints)
+
+             return values
+
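+         # A sketch of the FK-to-parent-date mapping above on toy frames
+         # (table and column names hypothetical):
+         #
+         #     import numpy as np, pandas as pd
+         #     parent = pd.DataFrame({"id": [1, 2],
+         #                            "signup_date": pd.to_datetime(["2024-01-01", "2024-02-01"])})
+         #     child_fks = np.array([2, 1, 2])
+         #     parent_dates = parent.set_index("id")["signup_date"].reindex(child_fks).values
+         #     deltas = np.array([1, 5, 10]).astype("timedelta64[D]")
+         #     child_dates = parent_dates + deltas  # each child date follows its parent's date
+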
+         # FOREIGN KEY
+         elif column.type == "foreign_key":
+             relationship = None
+             for rel in self.config.relationships:
+                 if rel.child_table == table_name and rel.child_key == column.name:
+                     relationship = rel
+                     break
+
+             if relationship is None:
+                 warnings.warn(
+                     f"No relationship defined for foreign key '{column.name}' "
+                     f"in table '{table_name}'. Generating random fallback IDs instead."
+                 )
+                 values = self.rng.integers(1, max(size // 10, 100), size=size)
+                 return values
+
+             # Check the lightweight context rather than full generated data
+             if relationship.parent_table not in self.context:
+                 warnings.warn(
+                     f"Parent table '{relationship.parent_table}' not yet generated for "
+                     f"foreign key '{column.name}'. Generating random fallback IDs instead."
+                 )
+                 values = self.rng.integers(1, max(size // 10, 100), size=size)
+                 return values
+
+             parent_ids = self._get_parent_ids(relationship)
+
+             if len(parent_ids) == 0:
+                 warnings.warn(
+                     f"Parent table '{relationship.parent_table}' has no valid IDs in context (after filters). "
+                     f"Generating random fallback IDs for foreign key '{column.name}'."
+                 )
+                 values = self.rng.integers(1, max(size // 10, 100), size=size)
+                 return values
+
+             values = self.rng.choice(parent_ids, size=size)
+             return values
+
+         # TEXT
+         elif column.type == "text":
+             text_type = params.get("text_type", "sentence")
+
+             if text_type == "name":
+                 values = np.array([self.text_gen.name() for _ in range(size)])
+             elif text_type == "email":
+                 values = np.array([self.text_gen.email() for _ in range(size)])
+             elif text_type == "company":
+                 values = np.array([self.text_gen.company() for _ in range(size)])
+             elif text_type == "sentence":
+                 values = np.array([self.text_gen.sentence() for _ in range(size)])
+             elif text_type == "word":
+                 values = np.array([self.text_gen.word() for _ in range(size)])
+             elif text_type == "address":
+                 values = np.array([self.text_gen.full_address() for _ in range(size)])
+             elif text_type == "phone":
+                 values = np.array([self.text_gen.phone_number() for _ in range(size)])
+             elif text_type == "url":
+                 values = np.array([self.text_gen.url() for _ in range(size)])
+             else:
+                 values = np.array([self.text_gen.sentence() for _ in range(size)])
+
+             return values
+
+         # BOOLEAN
+         elif column.type == "boolean":
+             probability = params.get("probability", 0.5)
+             values = self.rng.random(size) < probability
+             return values
+
+         else:
+             raise ValueError(f"Unknown column type: {column.type}")
+
+     def apply_event(self, df: pd.DataFrame, event: ScenarioEvent) -> pd.DataFrame:
+         """Apply a scenario event to modify data based on conditions."""
+         try:
+             mask = df.eval(event.condition)
+         except Exception as e:
+             warnings.warn(f"Failed to evaluate condition '{event.condition}' for event '{event.name}': {e}")
+             return df
+
+         if event.modifier_type == "multiply":
+             df.loc[mask, event.column] *= event.modifier_value
+         elif event.modifier_type == "add":
+             df.loc[mask, event.column] += event.modifier_value
+         elif event.modifier_type == "set":
+             df.loc[mask, event.column] = event.modifier_value
+         elif event.modifier_type == "function":
+             warnings.warn(f"Function modifiers not yet implemented for event '{event.name}'")
+
+         return df
+
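+     # A toy sketch of the condition/modifier mechanics above (column names and
+     # values hypothetical):
+     #
+     #     import pandas as pd
+     #     df = pd.DataFrame({"region": ["EU", "US", "EU"], "sales": [100.0, 100.0, 100.0]})
+     #     mask = df.eval("region == 'EU'")
+     #     df.loc[mask, "sales"] *= 1.5  # "multiply" modifier: EU rows become 150.0
+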
+     def generate_batches(self, table_name: str) -> Iterator[pd.DataFrame]:
+         """
+         Yield batches of generated data for a table.
+
+         Args:
+             table_name: Name of the table to generate
+
+         Yields:
+             DataFrame batches of up to `batch_size` rows
+         """
+         table = self.config.get_table(table_name)
+         if table is None:
+             raise ValueError(f"Table '{table_name}' not found in schema")
+
+         # Reference table with inline data - yield as a single batch
+         if table.is_reference and table.inline_data:
+             df = pd.DataFrame(table.inline_data)
+             self._update_context(table_name, df)
+             yield df
+             return
+
+         columns = self.config.get_columns(table_name)
+         total_rows = table.row_count
+
+         rows_generated = 0
+
+         while rows_generated < total_rows:
+             batch_size = min(self.batch_size, total_rows - rows_generated)
+
+             # Generate the batch column by column; the partially built frame is
+             # passed along so later columns can depend on earlier ones.
+             df_batch = pd.DataFrame()
+             for column in columns:
+                 values = self.generate_column(table_name, column, batch_size, df_batch)
+                 df_batch[column.name] = values
+
+             # Apply formulas
+             df_batch = self._apply_formula_columns(df_batch, table_name)
+
+             # Post-process semantically correlated columns
+             df_batch = self._fix_correlated_columns(df_batch, table_name)
+
+             # Apply scenario events
+             table_events = [e for e in self.config.events if e.table == table_name]
+             for event in table_events:
+                 df_batch = self.apply_event(df_batch, event)
+
+             # Apply business rule constraints
+             df_batch = self.apply_constraints(df_batch, table)
+
+             # Update context for future batches/tables
+             self._update_context(table_name, df_batch)
+
+             yield df_batch
+
+             rows_generated += batch_size
+
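+     # A sketch of the incremental column-building pattern above: the growing
+     # frame lets a later column read an earlier one (names hypothetical):
+     #
+     #     import numpy as np, pandas as pd
+     #     rng = np.random.default_rng(0)
+     #     df_batch = pd.DataFrame()
+     #     df_batch["user_id"] = rng.integers(1, 100, size=5)    # generated first
+     #     df_batch["signup_bucket"] = df_batch["user_id"] % 10  # depends on user_id
+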
+     def apply_constraints(self, df: pd.DataFrame, table: Any) -> pd.DataFrame:
+         """
+         Apply business rule constraints to generated data.
+
+         Args:
+             df: DataFrame batch to constrain
+             table: Table definition containing constraints
+
+         Returns:
+             Constrained DataFrame
+         """
+         if not hasattr(table, 'constraints') or not table.constraints:
+             return df
+
+         for constraint in table.constraints:
+             df = self._apply_single_constraint(df, constraint)
+
+         return df
+
+     def _apply_single_constraint(self, df: pd.DataFrame, constraint: Any) -> pd.DataFrame:
+         """Apply a single constraint to the DataFrame."""
+
+         # Validate that the required columns exist
+         for col in constraint.group_by:
+             if col not in df.columns:
+                 warnings.warn(f"Constraint '{constraint.name}': Column '{col}' not found. Skipping.")
+                 return df
+
+         if constraint.column and constraint.column not in df.columns:
+             warnings.warn(f"Constraint '{constraint.name}': Target column '{constraint.column}' not found. Skipping.")
+             return df
+
+         if constraint.type == "max_per_group":
+             # Cap values per group (e.g., max 8 hours per employee per day)
+             if constraint.action == "cap":
+                 # Simple cap: clip the value column
+                 df[constraint.column] = df.groupby(constraint.group_by)[constraint.column].transform(
+                     lambda x: x.clip(upper=constraint.value)
+                 )
+             elif constraint.action == "redistribute":
+                 # More complex: redistribute excess across the group.
+                 # Not yet implemented; fall back to a simple cap for now.
+                 df[constraint.column] = df.groupby(constraint.group_by)[constraint.column].transform(
+                     lambda x: x.clip(upper=constraint.value)
+                 )
+
+         elif constraint.type == "sum_limit":
+             # Limit the sum per group (e.g., max 8 total hours per employee per day across projects)
+             def cap_sum(group):
+                 total = group[constraint.column].sum()
+                 if total > constraint.value:
+                     # Scale down proportionally
+                     scale = constraint.value / total
+                     group[constraint.column] = group[constraint.column] * scale
+                 return group
+
+             df = df.groupby(constraint.group_by, group_keys=False).apply(cap_sum)
+
+         elif constraint.type == "unique_combination":
+             # Ensure unique combinations (e.g., one timesheet per employee-project-date)
+             if constraint.action == "drop":
+                 df = df.drop_duplicates(subset=constraint.group_by, keep='first')
+
+         elif constraint.type == "min_per_group":
+             # Floor values per group
+             if constraint.action == "cap":
+                 df[constraint.column] = df.groupby(constraint.group_by)[constraint.column].transform(
+                     lambda x: x.clip(lower=constraint.value)
+                 )
+
+         return df
+
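+     # A toy sketch of the "sum_limit" proportional scaling above (column names
+     # hypothetical):
+     #
+     #     import pandas as pd
+     #     df = pd.DataFrame({"employee": ["a", "a", "b"], "hours": [6.0, 6.0, 4.0]})
+     #     def cap_sum(group, limit=8.0):
+     #         total = group["hours"].sum()
+     #         if total > limit:
+     #             group["hours"] *= limit / total
+     #         return group
+     #     df = df.groupby("employee", group_keys=False).apply(cap_sum)
+     #     # employee "a": 12 hours scaled to 4.0 + 4.0 = 8.0; "b" untouched
+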
+     def _apply_formula_columns(self, df: pd.DataFrame, table_name: str) -> pd.DataFrame:
+         """Apply formula-based derived columns using context for lookups."""
+         try:
+             from misata.formulas import FormulaEngine
+         except ImportError:
+             return df
+
+         columns = self.config.get_columns(table_name)
+         formula_cols = [c for c in columns if c.distribution_params.get("formula")]
+
+         if not formula_cols:
+             return df
+
+         # FormulaEngine expects a Dict[str, pd.DataFrame] for lookups, and
+         # self.context is exactly that, just restricted to key columns. Lookups
+         # therefore only work for columns retained by _update_context (PKs, FKs,
+         # filter and relative_to columns).
+         # TODO: analyze formulas to find and retain the context columns they need
+         # (e.g., 'price' or 'cost'); until then, lookups on unsaved columns fail.
+
+         engine = FormulaEngine(self.context)
+
+         for col in formula_cols:
+             formula = col.distribution_params["formula"]
+             # If the context lacks a referenced column, FormulaEngine raises ValueError.
+             try:
+                 result = engine.evaluate_with_lookups(df, formula)
+                 df[col.name] = result
+             except ValueError as e:
+                 # Warn and skip this column if the context is missing data
+                 warnings.warn(f"Formula evaluation failed (context missing?): {e}")
+
+         return df
+
+     def _fix_correlated_columns(self, df: pd.DataFrame, table_name: str) -> pd.DataFrame:
+         """Post-process to fix common semantically correlated columns."""
+         columns = list(df.columns)
+         if "plan" in columns and "price" in columns:
+             plan_prices = {
+                 "free": 0.0, "basic": 9.99, "starter": 9.99, "premium": 19.99,
+                 "pro": 19.99, "professional": 29.99, "enterprise": 49.99,
+                 "business": 49.99, "unlimited": 99.99,
+             }
+             # Fall back to the first generated price for unknown plan names
+             default_price = df["price"].iloc[0]
+             df["price"] = df["plan"].map(lambda p: plan_prices.get(str(p).lower(), default_price))
+         return df
+
+     def generate_all(self) -> Iterator[Tuple[str, pd.DataFrame]]:
+         """
+         Generate all tables in dependency order.
+
+         Yields:
+             Tuple[str, pd.DataFrame]: (table_name, batch_df)
+         """
+         sorted_tables = self.topological_sort()
+
+         for table_name in sorted_tables:
+             for batch in self.generate_batches(table_name):
+                 yield table_name, batch
+
+     def export_to_csv(self, output_dir: str = ".") -> None:
+         """
+         Export all generated tables to CSV files, writing batches progressively.
+         """
+         import os
+         os.makedirs(output_dir, exist_ok=True)
+
+         # Append batches rather than holding file handles open: the first batch
+         # of each table writes the header, later batches append rows.
+         files_created = set()
+
+         for table_name, batch_df in self.generate_all():
+             output_path = os.path.join(output_dir, f"{table_name}.csv")
+             mode = 'a' if table_name in files_created else 'w'
+             header = table_name not in files_created
+
+             batch_df.to_csv(output_path, mode=mode, header=header, index=False)
+             files_created.add(table_name)
+
+     def get_summary(self) -> str:
+         """
+         Get a summary of generated data (from context).
+
+         Only reports context information, since the full data is not kept in memory.
+         """
+         summary_lines = ["Generated Context Summary (Lightweight):", "=" * 50]
+
+         for table_name, df in self.context.items():
+             summary_lines.append(f"\n{table_name}: {len(df):,} rows tracked in context")
+             summary_lines.append(f"  Context Columns: {list(df.columns)}")
+             summary_lines.append(f"  Context Memory: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
+
+         return "\n".join(summary_lines)