misata 0.1.0b0__py3-none-any.whl → 0.3.0b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
misata/simulator.py CHANGED
@@ -35,7 +35,8 @@ class DataSimulator:
35
35
  """
36
36
 
37
37
  def __init__(self, config: SchemaConfig,
38
- apply_semantic_fixes: bool = True, batch_size: int = 10_000):
38
+ apply_semantic_fixes: bool = True, batch_size: int = 10_000,
39
+ smart_mode: bool = False, use_llm: bool = True):
39
40
  """
40
41
  Initialize the simulator.
41
42
 
@@ -43,13 +44,19 @@ class DataSimulator:
43
44
  config: Schema configuration defining tables, columns, and relationships
44
45
  apply_semantic_fixes: Auto-fix column types based on semantic patterns
45
46
  batch_size: Number of rows to generate per batch
47
+ smart_mode: Enable LLM-powered context-aware value generation
48
+ use_llm: If smart_mode is True, whether to use LLM (vs curated fallbacks)
46
49
  """
47
50
  self.config = config
48
51
  self.context: Dict[str, pd.DataFrame] = {} # Lightweight context (IDs only)
49
52
  self.text_gen = TextGenerator(seed=config.seed)
50
53
  self.batch_size = batch_size
54
+ self.smart_mode = smart_mode
55
+ self.use_llm = use_llm
56
+ self._smart_gen = None # Lazy init
51
57
  self._unique_pools: Dict[str, np.ndarray] = {} # Store pre-generated unique values
52
58
  self._unique_counters: Dict[str, int] = {} # Track usage of unique pools
59
+ self._smart_pools: Dict[str, np.ndarray] = {} # Cache smart value pools
53
60
 
54
61
  # Apply semantic inference to fix column types
55
62
  if apply_semantic_fixes:
@@ -60,6 +67,16 @@ class DataSimulator:
60
67
  seed = config.seed if config.seed is not None else np.random.randint(0, 2**32 - 1)
61
68
  self.rng = np.random.default_rng(seed)
62
69
  np.random.seed(seed) # For legacy numpy.random calls
70
+
71
+ def _get_smart_gen(self):
72
+ """Lazy initialize SmartValueGenerator."""
73
+ if self._smart_gen is None:
74
+ try:
75
+ from misata.smart_values import SmartValueGenerator
76
+ self._smart_gen = SmartValueGenerator()
77
+ except Exception:
78
+ self._smart_gen = None
79
+ return self._smart_gen
63
80
 
64
81
  def topological_sort(self) -> List[str]:
65
82
  """
@@ -210,13 +227,21 @@ class DataSimulator:
210
227
 
211
228
  # CATEGORICAL
212
229
  if column.type == "categorical":
213
- choices = params["choices"]
230
+ choices = params.get("choices", ["A", "B", "C"])
214
231
  probabilities = params.get("probabilities", None)
215
232
 
233
+ # Ensure choices is a list
234
+ if not isinstance(choices, list):
235
+ choices = list(choices)
236
+
216
237
  if probabilities is not None:
217
- # Normalize probabilities
218
- probabilities = np.array(probabilities)
219
- probabilities = probabilities / probabilities.sum()
238
+ # Convert to float array and normalize
239
+ probabilities = np.array(probabilities, dtype=float)
240
+ prob_sum = probabilities.sum()
241
+ if prob_sum > 0:
242
+ probabilities = probabilities / prob_sum
243
+ else:
244
+ probabilities = None
220
245
 
221
246
  values = self.rng.choice(choices, size=size, p=probabilities)
222
247
  return values
@@ -413,6 +438,35 @@ class DataSimulator:
413
438
  # TEXT
414
439
  elif column.type == "text":
415
440
  text_type = params.get("text_type", "sentence")
441
+
442
+ # Smart value generation - check for domain-specific content
443
+ smart_generate = params.get("smart_generate", False) or self.smart_mode
444
+ if smart_generate:
445
+ smart_gen = self._get_smart_gen()
446
+ if smart_gen:
447
+ # Check for explicit domain hint or auto-detect
448
+ domain_hint = params.get("domain_hint")
449
+ context = params.get("context", "")
450
+
451
+ # Create cache key for this column's pool
452
+ pool_key = f"{table_name}.{column.name}"
453
+
454
+ if pool_key not in self._smart_pools:
455
+ pool = smart_gen.get_pool(
456
+ column_name=column.name,
457
+ table_name=table_name,
458
+ domain_hint=domain_hint,
459
+ context=context,
460
+ size=100,
461
+ use_llm=self.use_llm,
462
+ )
463
+ if pool:
464
+ self._smart_pools[pool_key] = np.array(pool)
465
+
466
+ if pool_key in self._smart_pools:
467
+ pool = self._smart_pools[pool_key]
468
+ values = self.rng.choice(pool, size=size)
469
+ return values
416
470
 
417
471
  if text_type == "name":
418
472
  values = np.array([self.text_gen.name() for _ in range(size)])
@@ -441,6 +495,28 @@ class DataSimulator:
441
495
  values = self.rng.random(size) < probability
442
496
  return values
443
497
 
498
+ # TIME
499
+ elif column.type == "time":
500
+ # Generate random times as HH:MM:SS strings
501
+ start_hour = params.get("start_hour", 0)
502
+ end_hour = params.get("end_hour", 24)
503
+ hours = self.rng.integers(start_hour, end_hour, size=size)
504
+ minutes = self.rng.integers(0, 60, size=size)
505
+ seconds = self.rng.integers(0, 60, size=size)
506
+ values = np.array([f"{h:02d}:{m:02d}:{s:02d}" for h, m, s in zip(hours, minutes, seconds)])
507
+ return values
508
+
509
+ # DATETIME
510
+ elif column.type == "datetime":
511
+ # Generate random datetimes within a range
512
+ start = pd.to_datetime(params.get("start", "2020-01-01"))
513
+ end = pd.to_datetime(params.get("end", "2024-12-31"))
514
+ start_int = start.value
515
+ end_int = end.value
516
+ random_ints = self.rng.integers(start_int, end_int, size=size)
517
+ values = pd.to_datetime(random_ints)
518
+ return values
519
+
444
520
  else:
445
521
  raise ValueError(f"Unknown column type: {column.type}")
446
522