misata 0.1.0b0__py3-none-any.whl → 0.3.0b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +89 -3
- misata/cache.py +258 -0
- misata/constraints.py +307 -0
- misata/context.py +259 -0
- misata/exceptions.py +277 -0
- misata/generators/__init__.py +29 -0
- misata/generators/base.py +586 -0
- misata/llm_parser.py +41 -2
- misata/profiles.py +332 -0
- misata/quality.py +329 -0
- misata/schema.py +8 -3
- misata/simulator.py +81 -5
- misata/smart_values.py +762 -0
- misata/streaming.py +228 -0
- misata/templates/library.py +344 -0
- {misata-0.1.0b0.dist-info → misata-0.3.0b0.dist-info}/METADATA +4 -2
- misata-0.3.0b0.dist-info/RECORD +37 -0
- misata-0.3.0b0.dist-info/licenses/LICENSE +21 -0
- misata-0.1.0b0.dist-info/RECORD +0 -25
- {misata-0.1.0b0.dist-info → misata-0.3.0b0.dist-info}/WHEEL +0 -0
- {misata-0.1.0b0.dist-info → misata-0.3.0b0.dist-info}/entry_points.txt +0 -0
- {misata-0.1.0b0.dist-info → misata-0.3.0b0.dist-info}/top_level.txt +0 -0
misata/simulator.py
CHANGED
|
@@ -35,7 +35,8 @@ class DataSimulator:
|
|
|
35
35
|
"""
|
|
36
36
|
|
|
37
37
|
def __init__(self, config: SchemaConfig,
|
|
38
|
-
apply_semantic_fixes: bool = True, batch_size: int = 10_000
|
|
38
|
+
apply_semantic_fixes: bool = True, batch_size: int = 10_000,
|
|
39
|
+
smart_mode: bool = False, use_llm: bool = True):
|
|
39
40
|
"""
|
|
40
41
|
Initialize the simulator.
|
|
41
42
|
|
|
@@ -43,13 +44,19 @@ class DataSimulator:
|
|
|
43
44
|
config: Schema configuration defining tables, columns, and relationships
|
|
44
45
|
apply_semantic_fixes: Auto-fix column types based on semantic patterns
|
|
45
46
|
batch_size: Number of rows to generate per batch
|
|
47
|
+
smart_mode: Enable LLM-powered context-aware value generation
|
|
48
|
+
use_llm: If smart_mode is True, whether to use LLM (vs curated fallbacks)
|
|
46
49
|
"""
|
|
47
50
|
self.config = config
|
|
48
51
|
self.context: Dict[str, pd.DataFrame] = {} # Lightweight context (IDs only)
|
|
49
52
|
self.text_gen = TextGenerator(seed=config.seed)
|
|
50
53
|
self.batch_size = batch_size
|
|
54
|
+
self.smart_mode = smart_mode
|
|
55
|
+
self.use_llm = use_llm
|
|
56
|
+
self._smart_gen = None # Lazy init
|
|
51
57
|
self._unique_pools: Dict[str, np.ndarray] = {} # Store pre-generated unique values
|
|
52
58
|
self._unique_counters: Dict[str, int] = {} # Track usage of unique pools
|
|
59
|
+
self._smart_pools: Dict[str, np.ndarray] = {} # Cache smart value pools
|
|
53
60
|
|
|
54
61
|
# Apply semantic inference to fix column types
|
|
55
62
|
if apply_semantic_fixes:
|
|
@@ -60,6 +67,16 @@ class DataSimulator:
|
|
|
60
67
|
seed = config.seed if config.seed is not None else np.random.randint(0, 2**32 - 1)
|
|
61
68
|
self.rng = np.random.default_rng(seed)
|
|
62
69
|
np.random.seed(seed) # For legacy numpy.random calls
|
|
70
|
+
|
|
71
|
+
def _get_smart_gen(self):
|
|
72
|
+
"""Lazy initialize SmartValueGenerator."""
|
|
73
|
+
if self._smart_gen is None:
|
|
74
|
+
try:
|
|
75
|
+
from misata.smart_values import SmartValueGenerator
|
|
76
|
+
self._smart_gen = SmartValueGenerator()
|
|
77
|
+
except Exception:
|
|
78
|
+
self._smart_gen = None
|
|
79
|
+
return self._smart_gen
|
|
63
80
|
|
|
64
81
|
def topological_sort(self) -> List[str]:
|
|
65
82
|
"""
|
|
@@ -210,13 +227,21 @@ class DataSimulator:
|
|
|
210
227
|
|
|
211
228
|
# CATEGORICAL
|
|
212
229
|
if column.type == "categorical":
|
|
213
|
-
choices = params
|
|
230
|
+
choices = params.get("choices", ["A", "B", "C"])
|
|
214
231
|
probabilities = params.get("probabilities", None)
|
|
215
232
|
|
|
233
|
+
# Ensure choices is a list
|
|
234
|
+
if not isinstance(choices, list):
|
|
235
|
+
choices = list(choices)
|
|
236
|
+
|
|
216
237
|
if probabilities is not None:
|
|
217
|
-
#
|
|
218
|
-
probabilities = np.array(probabilities)
|
|
219
|
-
|
|
238
|
+
# Convert to float array and normalize
|
|
239
|
+
probabilities = np.array(probabilities, dtype=float)
|
|
240
|
+
prob_sum = probabilities.sum()
|
|
241
|
+
if prob_sum > 0:
|
|
242
|
+
probabilities = probabilities / prob_sum
|
|
243
|
+
else:
|
|
244
|
+
probabilities = None
|
|
220
245
|
|
|
221
246
|
values = self.rng.choice(choices, size=size, p=probabilities)
|
|
222
247
|
return values
|
|
@@ -413,6 +438,35 @@ class DataSimulator:
|
|
|
413
438
|
# TEXT
|
|
414
439
|
elif column.type == "text":
|
|
415
440
|
text_type = params.get("text_type", "sentence")
|
|
441
|
+
|
|
442
|
+
# Smart value generation - check for domain-specific content
|
|
443
|
+
smart_generate = params.get("smart_generate", False) or self.smart_mode
|
|
444
|
+
if smart_generate:
|
|
445
|
+
smart_gen = self._get_smart_gen()
|
|
446
|
+
if smart_gen:
|
|
447
|
+
# Check for explicit domain hint or auto-detect
|
|
448
|
+
domain_hint = params.get("domain_hint")
|
|
449
|
+
context = params.get("context", "")
|
|
450
|
+
|
|
451
|
+
# Create cache key for this column's pool
|
|
452
|
+
pool_key = f"{table_name}.{column.name}"
|
|
453
|
+
|
|
454
|
+
if pool_key not in self._smart_pools:
|
|
455
|
+
pool = smart_gen.get_pool(
|
|
456
|
+
column_name=column.name,
|
|
457
|
+
table_name=table_name,
|
|
458
|
+
domain_hint=domain_hint,
|
|
459
|
+
context=context,
|
|
460
|
+
size=100,
|
|
461
|
+
use_llm=self.use_llm,
|
|
462
|
+
)
|
|
463
|
+
if pool:
|
|
464
|
+
self._smart_pools[pool_key] = np.array(pool)
|
|
465
|
+
|
|
466
|
+
if pool_key in self._smart_pools:
|
|
467
|
+
pool = self._smart_pools[pool_key]
|
|
468
|
+
values = self.rng.choice(pool, size=size)
|
|
469
|
+
return values
|
|
416
470
|
|
|
417
471
|
if text_type == "name":
|
|
418
472
|
values = np.array([self.text_gen.name() for _ in range(size)])
|
|
@@ -441,6 +495,28 @@ class DataSimulator:
|
|
|
441
495
|
values = self.rng.random(size) < probability
|
|
442
496
|
return values
|
|
443
497
|
|
|
498
|
+
# TIME
|
|
499
|
+
elif column.type == "time":
|
|
500
|
+
# Generate random times as HH:MM:SS strings
|
|
501
|
+
start_hour = params.get("start_hour", 0)
|
|
502
|
+
end_hour = params.get("end_hour", 24)
|
|
503
|
+
hours = self.rng.integers(start_hour, end_hour, size=size)
|
|
504
|
+
minutes = self.rng.integers(0, 60, size=size)
|
|
505
|
+
seconds = self.rng.integers(0, 60, size=size)
|
|
506
|
+
values = np.array([f"{h:02d}:{m:02d}:{s:02d}" for h, m, s in zip(hours, minutes, seconds)])
|
|
507
|
+
return values
|
|
508
|
+
|
|
509
|
+
# DATETIME
|
|
510
|
+
elif column.type == "datetime":
|
|
511
|
+
# Generate random datetimes within a range
|
|
512
|
+
start = pd.to_datetime(params.get("start", "2020-01-01"))
|
|
513
|
+
end = pd.to_datetime(params.get("end", "2024-12-31"))
|
|
514
|
+
start_int = start.value
|
|
515
|
+
end_int = end.value
|
|
516
|
+
random_ints = self.rng.integers(start_int, end_int, size=size)
|
|
517
|
+
values = pd.to_datetime(random_ints)
|
|
518
|
+
return values
|
|
519
|
+
|
|
444
520
|
else:
|
|
445
521
|
raise ValueError(f"Unknown column type: {column.type}")
|
|
446
522
|
|