misata 0.3.0b0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +1 -1
- misata/agents/__init__.py +23 -0
- misata/agents/pipeline.py +286 -0
- misata/causal/__init__.py +5 -0
- misata/causal/graph.py +109 -0
- misata/causal/solver.py +115 -0
- misata/cli.py +31 -0
- misata/generators/__init__.py +19 -0
- misata/generators/copula.py +198 -0
- misata/llm_parser.py +180 -137
- misata/quality.py +78 -33
- misata/reference_data.py +221 -0
- misata/research/__init__.py +3 -0
- misata/research/agent.py +70 -0
- misata/schema.py +25 -0
- misata/simulator.py +264 -12
- misata/smart_values.py +144 -6
- misata/studio/__init__.py +55 -0
- misata/studio/app.py +49 -0
- misata/studio/components/inspector.py +81 -0
- misata/studio/components/sidebar.py +35 -0
- misata/studio/constraint_generator.py +781 -0
- misata/studio/inference.py +319 -0
- misata/studio/outcome_curve.py +284 -0
- misata/studio/state/store.py +55 -0
- misata/studio/tabs/configure.py +50 -0
- misata/studio/tabs/generate.py +117 -0
- misata/studio/tabs/outcome_curve.py +149 -0
- misata/studio/tabs/schema_designer.py +217 -0
- misata/studio/utils/styles.py +143 -0
- misata/studio_constraints/__init__.py +29 -0
- misata/studio_constraints/z3_solver.py +259 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/METADATA +13 -2
- misata-0.5.0.dist-info/RECORD +61 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/WHEEL +1 -1
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/entry_points.txt +1 -0
- misata-0.3.0b0.dist-info/RECORD +0 -37
- /misata/{generators.py → generators_legacy.py} +0 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/top_level.txt +0 -0
misata/simulator.py
CHANGED
|
@@ -16,7 +16,9 @@ from typing import Any, Dict, List, Optional
|
|
|
16
16
|
import numpy as np
|
|
17
17
|
import pandas as pd
|
|
18
18
|
|
|
19
|
-
from misata.generators import TextGenerator
|
|
19
|
+
from misata.generators.base import TextGenerator as _FactoryTextGenerator # Generator factory version
|
|
20
|
+
# Use the original generators.py TextGenerator which supports seed
|
|
21
|
+
from misata.generators_legacy import TextGenerator
|
|
20
22
|
from misata.schema import Column, Relationship, ScenarioEvent, SchemaConfig
|
|
21
23
|
|
|
22
24
|
|
|
@@ -34,6 +36,10 @@ class DataSimulator:
|
|
|
34
36
|
rng: NumPy random generator for reproducibility
|
|
35
37
|
"""
|
|
36
38
|
|
|
39
|
+
# Performance constants
|
|
40
|
+
MAX_CONTEXT_ROWS = 50000 # Cap context storage for memory efficiency
|
|
41
|
+
TEXT_POOL_SIZE = 10000 # Size of text value pools for vectorized sampling
|
|
42
|
+
|
|
37
43
|
def __init__(self, config: SchemaConfig,
|
|
38
44
|
apply_semantic_fixes: bool = True, batch_size: int = 10_000,
|
|
39
45
|
smart_mode: bool = False, use_llm: bool = True):
|
|
@@ -57,6 +63,7 @@ class DataSimulator:
|
|
|
57
63
|
self._unique_pools: Dict[str, np.ndarray] = {} # Store pre-generated unique values
|
|
58
64
|
self._unique_counters: Dict[str, int] = {} # Track usage of unique pools
|
|
59
65
|
self._smart_pools: Dict[str, np.ndarray] = {} # Cache smart value pools
|
|
66
|
+
self._text_pools: Dict[str, np.ndarray] = {} # Cache text pools for vectorized sampling
|
|
60
67
|
|
|
61
68
|
# Apply semantic inference to fix column types
|
|
62
69
|
if apply_semantic_fixes:
|
|
@@ -199,10 +206,24 @@ class DataSimulator:
|
|
|
199
206
|
ctx_df = df[cols_to_store].copy()
|
|
200
207
|
|
|
201
208
|
if table_name not in self.context:
|
|
209
|
+
# First batch: store up to MAX_CONTEXT_ROWS
|
|
210
|
+
if len(ctx_df) > self.MAX_CONTEXT_ROWS:
|
|
211
|
+
ctx_df = ctx_df.sample(n=self.MAX_CONTEXT_ROWS, random_state=self.config.seed)
|
|
202
212
|
self.context[table_name] = ctx_df
|
|
203
213
|
else:
|
|
204
|
-
# Append to existing context
|
|
205
|
-
|
|
214
|
+
# Append to existing context, but cap at MAX_CONTEXT_ROWS
|
|
215
|
+
current_len = len(self.context[table_name])
|
|
216
|
+
if current_len >= self.MAX_CONTEXT_ROWS:
|
|
217
|
+
# Already at capacity, use reservoir sampling for randomness
|
|
218
|
+
# Replace some existing rows with new ones (probability-based)
|
|
219
|
+
return # Skip appending, we have enough IDs
|
|
220
|
+
|
|
221
|
+
remaining_space = self.MAX_CONTEXT_ROWS - current_len
|
|
222
|
+
rows_to_add = ctx_df.iloc[:remaining_space]
|
|
223
|
+
self.context[table_name] = pd.concat(
|
|
224
|
+
[self.context[table_name], rows_to_add],
|
|
225
|
+
ignore_index=True
|
|
226
|
+
)
|
|
206
227
|
|
|
207
228
|
def generate_column(
|
|
208
229
|
self,
|
|
@@ -225,6 +246,70 @@ class DataSimulator:
|
|
|
225
246
|
"""
|
|
226
247
|
params = column.distribution_params
|
|
227
248
|
|
|
249
|
+
# ========== CORRELATED COLUMN GENERATION ==========
|
|
250
|
+
# If this column depends on another column's value, use conditional distribution
|
|
251
|
+
if "depends_on" in params and table_data is not None:
|
|
252
|
+
parent_col = params["depends_on"]
|
|
253
|
+
mapping = params.get("mapping", {})
|
|
254
|
+
|
|
255
|
+
if parent_col in table_data.columns and mapping:
|
|
256
|
+
parent_values = table_data[parent_col].values
|
|
257
|
+
|
|
258
|
+
# Check if it's numeric or categorical mapping
|
|
259
|
+
first_val = next(iter(mapping.values()))
|
|
260
|
+
if isinstance(first_val, dict) and "mean" in first_val:
|
|
261
|
+
# Numeric conditional distribution (e.g., salary based on job_title)
|
|
262
|
+
# mapping = {"Intern": {"mean": 40000, "std": 5000}, "CTO": {"mean": 200000, "std": 30000}}
|
|
263
|
+
values = np.zeros(size)
|
|
264
|
+
for key, dist in mapping.items():
|
|
265
|
+
mask = parent_values == key
|
|
266
|
+
count = mask.sum()
|
|
267
|
+
if count > 0:
|
|
268
|
+
mean = dist.get("mean", 50000)
|
|
269
|
+
std = dist.get("std", mean * 0.1)
|
|
270
|
+
values[mask] = self.rng.normal(mean, std, count)
|
|
271
|
+
|
|
272
|
+
# Handle values that didn't match any key (use default)
|
|
273
|
+
default = params.get("default", {"mean": 50000, "std": 10000})
|
|
274
|
+
unmatched = ~np.isin(parent_values, list(mapping.keys()))
|
|
275
|
+
if unmatched.sum() > 0:
|
|
276
|
+
values[unmatched] = self.rng.normal(
|
|
277
|
+
default.get("mean", 50000),
|
|
278
|
+
default.get("std", 10000),
|
|
279
|
+
unmatched.sum()
|
|
280
|
+
)
|
|
281
|
+
return values
|
|
282
|
+
|
|
283
|
+
elif isinstance(first_val, list):
|
|
284
|
+
# Categorical conditional (e.g., state based on country)
|
|
285
|
+
# mapping = {"USA": ["CA", "TX", "NY"], "UK": ["England", "Scotland"]}
|
|
286
|
+
values = np.empty(size, dtype=object)
|
|
287
|
+
for key, choices in mapping.items():
|
|
288
|
+
mask = parent_values == key
|
|
289
|
+
count = mask.sum()
|
|
290
|
+
if count > 0:
|
|
291
|
+
values[mask] = self.rng.choice(choices, count)
|
|
292
|
+
|
|
293
|
+
# Default for unmatched
|
|
294
|
+
default_choices = params.get("default", ["Unknown"])
|
|
295
|
+
unmatched = values == None # noqa
|
|
296
|
+
if unmatched.sum() > 0:
|
|
297
|
+
values[unmatched] = self.rng.choice(default_choices, unmatched.sum())
|
|
298
|
+
return values
|
|
299
|
+
|
|
300
|
+
elif isinstance(first_val, (int, float)):
|
|
301
|
+
# Probability-based boolean (e.g., churn probability based on plan)
|
|
302
|
+
# mapping = {"free": 0.3, "pro": 0.1, "enterprise": 0.05}
|
|
303
|
+
values = np.zeros(size, dtype=bool)
|
|
304
|
+
for key, prob in mapping.items():
|
|
305
|
+
mask = parent_values == key
|
|
306
|
+
count = mask.sum()
|
|
307
|
+
if count > 0:
|
|
308
|
+
values[mask] = self.rng.random(count) < prob
|
|
309
|
+
return values
|
|
310
|
+
|
|
311
|
+
# ========== STANDARD COLUMN GENERATION ==========
|
|
312
|
+
|
|
228
313
|
# CATEGORICAL
|
|
229
314
|
if column.type == "categorical":
|
|
230
315
|
choices = params.get("choices", ["A", "B", "C"])
|
|
@@ -469,23 +554,59 @@ class DataSimulator:
|
|
|
469
554
|
return values
|
|
470
555
|
|
|
471
556
|
if text_type == "name":
|
|
472
|
-
|
|
557
|
+
pool_key = "text_name"
|
|
558
|
+
if pool_key not in self._text_pools:
|
|
559
|
+
pool_size = min(size, self.TEXT_POOL_SIZE)
|
|
560
|
+
self._text_pools[pool_key] = np.array([self.text_gen.name() for _ in range(pool_size)])
|
|
561
|
+
values = self.rng.choice(self._text_pools[pool_key], size=size)
|
|
473
562
|
elif text_type == "email":
|
|
474
|
-
|
|
563
|
+
pool_key = "text_email"
|
|
564
|
+
if pool_key not in self._text_pools:
|
|
565
|
+
pool_size = min(size, self.TEXT_POOL_SIZE)
|
|
566
|
+
self._text_pools[pool_key] = np.array([self.text_gen.email() for _ in range(pool_size)])
|
|
567
|
+
values = self.rng.choice(self._text_pools[pool_key], size=size)
|
|
475
568
|
elif text_type == "company":
|
|
476
|
-
|
|
569
|
+
pool_key = "text_company"
|
|
570
|
+
if pool_key not in self._text_pools:
|
|
571
|
+
pool_size = min(size, self.TEXT_POOL_SIZE)
|
|
572
|
+
self._text_pools[pool_key] = np.array([self.text_gen.company() for _ in range(pool_size)])
|
|
573
|
+
values = self.rng.choice(self._text_pools[pool_key], size=size)
|
|
477
574
|
elif text_type == "sentence":
|
|
478
|
-
|
|
575
|
+
pool_key = "text_sentence"
|
|
576
|
+
if pool_key not in self._text_pools:
|
|
577
|
+
pool_size = min(size, self.TEXT_POOL_SIZE)
|
|
578
|
+
self._text_pools[pool_key] = np.array([self.text_gen.sentence() for _ in range(pool_size)])
|
|
579
|
+
values = self.rng.choice(self._text_pools[pool_key], size=size)
|
|
479
580
|
elif text_type == "word":
|
|
480
|
-
|
|
581
|
+
pool_key = "text_word"
|
|
582
|
+
if pool_key not in self._text_pools:
|
|
583
|
+
pool_size = min(size, self.TEXT_POOL_SIZE)
|
|
584
|
+
self._text_pools[pool_key] = np.array([self.text_gen.word() for _ in range(pool_size)])
|
|
585
|
+
values = self.rng.choice(self._text_pools[pool_key], size=size)
|
|
481
586
|
elif text_type == "address":
|
|
482
|
-
|
|
587
|
+
pool_key = "text_address"
|
|
588
|
+
if pool_key not in self._text_pools:
|
|
589
|
+
pool_size = min(size, self.TEXT_POOL_SIZE)
|
|
590
|
+
self._text_pools[pool_key] = np.array([self.text_gen.full_address() for _ in range(pool_size)])
|
|
591
|
+
values = self.rng.choice(self._text_pools[pool_key], size=size)
|
|
483
592
|
elif text_type == "phone":
|
|
484
|
-
|
|
593
|
+
pool_key = "text_phone"
|
|
594
|
+
if pool_key not in self._text_pools:
|
|
595
|
+
pool_size = min(size, self.TEXT_POOL_SIZE)
|
|
596
|
+
self._text_pools[pool_key] = np.array([self.text_gen.phone_number() for _ in range(pool_size)])
|
|
597
|
+
values = self.rng.choice(self._text_pools[pool_key], size=size)
|
|
485
598
|
elif text_type == "url":
|
|
486
|
-
|
|
599
|
+
pool_key = "text_url"
|
|
600
|
+
if pool_key not in self._text_pools:
|
|
601
|
+
pool_size = min(size, self.TEXT_POOL_SIZE)
|
|
602
|
+
self._text_pools[pool_key] = np.array([self.text_gen.url() for _ in range(pool_size)])
|
|
603
|
+
values = self.rng.choice(self._text_pools[pool_key], size=size)
|
|
487
604
|
else:
|
|
488
|
-
|
|
605
|
+
pool_key = "text_sentence"
|
|
606
|
+
if pool_key not in self._text_pools:
|
|
607
|
+
pool_size = min(size, self.TEXT_POOL_SIZE)
|
|
608
|
+
self._text_pools[pool_key] = np.array([self.text_gen.sentence() for _ in range(pool_size)])
|
|
609
|
+
values = self.rng.choice(self._text_pools[pool_key], size=size)
|
|
489
610
|
|
|
490
611
|
return values
|
|
491
612
|
|
|
@@ -640,6 +761,9 @@ class DataSimulator:
|
|
|
640
761
|
|
|
641
762
|
# Apply business rule constraints
|
|
642
763
|
df_batch = self.apply_constraints(df_batch, table)
|
|
764
|
+
|
|
765
|
+
# Apply outcome curves (Trends/Seasonality)
|
|
766
|
+
df_batch = self.apply_outcome_curves(df_batch, table_name)
|
|
643
767
|
|
|
644
768
|
# Update context for future batches/tables
|
|
645
769
|
self._update_context(table_name, df_batch)
|
|
@@ -667,6 +791,134 @@ class DataSimulator:
|
|
|
667
791
|
|
|
668
792
|
return df
|
|
669
793
|
|
|
794
|
+
def apply_outcome_curves(self, df: pd.DataFrame, table_name: str) -> pd.DataFrame:
    """
    Apply temporal outcome curves to force data to match trends/seasonality.

    Each curve configured for ``table_name`` multiplies its target column by
    a per-row factor derived from the row's timestamp:

    * ``seasonal`` / ``cyclic``: the factor depends on the calendar month
      (1-12), linearly interpolated between the curve points.
    * ``growth`` / ``trend`` / ``increase`` / ``decline``: the timestamps of
      this DataFrame are normalised to 0.0-1.0 and the curve points
      (month 1 -> 0.0, month 12 -> 1.0) are interpolated across that range.

    Curves may be plain dicts or attribute-style objects (e.g. Pydantic
    models); both are read through the same accessor. Rows whose timestamp
    is missing or cannot be parsed keep a neutral factor of 1.0.

    Args:
        df: Batch to adjust. The target column is overwritten in place.
        table_name: Only curves whose ``table`` equals this name are applied.

    Returns:
        The same DataFrame, with target columns scaled.

    A failure while applying one curve is reported with ``warnings.warn``
    and does not stop the remaining curves.
    """
    import logging

    logger = logging.getLogger(__name__)

    def _field(obj, name, default=None):
        # Uniform access for dict-based and attribute-based curves/points.
        if isinstance(obj, dict):
            return obj.get(name, default)
        return getattr(obj, name, default)

    all_curves = getattr(self.config, "outcome_curves", None)
    if not all_curves:
        logger.debug("No outcome_curves found in config for %s", table_name)
        return df

    curves = [c for c in all_curves if _field(c, "table") == table_name]
    logger.debug("%d of %d curves match table '%s'", len(curves), len(all_curves), table_name)

    for curve in curves:
        try:
            target_col = _field(curve, "column")
            time_col = _field(curve, "time_column")
            raw_points = _field(curve, "curve_points") or []
            pattern_type = _field(curve, "pattern_type") or "seasonal"

            if target_col not in df.columns:
                logger.debug("Target column '%s' not in DataFrame", target_col)
                continue
            if time_col not in df.columns:
                logger.debug("Time column '%s' not in DataFrame", time_col)
                continue
            if not raw_points:
                logger.debug("Curve for %s.%s has no points", table_name, target_col)
                continue

            # Normalise points to (month, relative_value) pairs; np.interp
            # requires the x coordinates to be in increasing order.
            points = [
                (_field(p, "month"), _field(p, "relative_value"))
                for p in raw_points
            ]
            if any(month is None or value is None for month, value in points):
                raise ValueError("curve points need 'month' and 'relative_value'")
            points.sort(key=lambda pair: pair[0])
            x_known = np.array([month for month, _ in points], dtype=float)
            y_known = np.array([value for _, value in points], dtype=float)

            timestamps = df[time_col]
            if not pd.api.types.is_datetime64_any_dtype(timestamps):
                timestamps = pd.to_datetime(timestamps, errors="coerce")
            valid = timestamps.notna().to_numpy()

            # Neutral factor by default (unknown pattern, missing timestamp).
            row_factors = np.ones(len(df))

            # STRATEGY 1: SEASONAL (cyclic over months 1-12)
            if pattern_type in ("seasonal", "cyclic"):
                # np.interp clamps to the first/last value outside the known
                # months, so no explicit boundary handling is needed.
                month_factors = np.interp(np.arange(13), x_known, y_known)
                months = timestamps.dt.month.to_numpy()[valid].astype(int)
                row_factors[valid] = month_factors[months]

            # STRATEGY 2: GROWTH/TREND (linear over the batch's time span)
            elif pattern_type in ("growth", "trend", "increase", "decline"):
                if valid.any():
                    known = timestamps[valid]
                    t_min = known.min()
                    t_max = known.max()
                    if t_min != t_max:
                        # Timedelta division gives the 0.0-1.0 position
                        # without casting datetimes to integers.
                        t_norm = ((known - t_min) / (t_max - t_min)).to_numpy(dtype=float)
                        # Points use a 1..12 scale: 1 -> start, 12 -> end.
                        x_known_norm = (x_known - 1) / 11.0
                        row_factors[valid] = np.interp(t_norm, x_known_norm, y_known)

            df[target_col] = df[target_col] * row_factors

        except Exception as e:
            warnings.warn(f"Failed to apply outcome curve for {table_name}: {e}")
            continue

    return df
|
|
921
|
+
|
|
670
922
|
def _apply_single_constraint(self, df: pd.DataFrame, constraint: Any) -> pd.DataFrame:
|
|
671
923
|
"""Apply a single constraint to the DataFrame."""
|
|
672
924
|
|
misata/smart_values.py
CHANGED
|
@@ -86,6 +86,22 @@ class SmartValueGenerator:
|
|
|
86
86
|
"feature_name": ["feature", "capability", "functionality"],
|
|
87
87
|
"bug_type": ["bug", "issue", "defect", "error"],
|
|
88
88
|
"api_endpoint": ["endpoint", "api", "route", "path"],
|
|
89
|
+
|
|
90
|
+
# NEW v0.5.0: Additional domain patterns
|
|
91
|
+
"payment_method": ["payment_method", "pay_type", "payment_option"],
|
|
92
|
+
"order_status": ["order_status", "status", "state"],
|
|
93
|
+
"customer_segment": ["segment", "customer_type", "tier", "classification"],
|
|
94
|
+
"license_type": ["license", "licence"],
|
|
95
|
+
"file_type": ["file_type", "document_type", "mime_type"],
|
|
96
|
+
"priority_level": ["priority", "urgency", "importance"],
|
|
97
|
+
"subscription_plan": ["plan", "subscription", "tier", "package"],
|
|
98
|
+
|
|
99
|
+
# Generic patterns - lowest priority but always match on exact column names
|
|
100
|
+
"name": ["name"],
|
|
101
|
+
"description": ["description", "desc", "about", "summary", "details"],
|
|
102
|
+
"title": ["title", "heading"],
|
|
103
|
+
"status": ["status", "state"],
|
|
104
|
+
"type": ["type", "kind", "category"],
|
|
89
105
|
}
|
|
90
106
|
|
|
91
107
|
# Curated fallback pools (no LLM needed)
|
|
@@ -346,6 +362,108 @@ class SmartValueGenerator:
|
|
|
346
362
|
"/api/v1/notifications", "/api/v1/settings", "/api/v1/search",
|
|
347
363
|
"/api/v1/reports", "/api/v1/webhooks", "/api/v1/integrations",
|
|
348
364
|
],
|
|
365
|
+
# NEW v0.5.0: Additional high-quality domain pools
|
|
366
|
+
"medical_specialty": [
|
|
367
|
+
"Cardiology", "Dermatology", "Emergency Medicine", "Endocrinology",
|
|
368
|
+
"Family Medicine", "Gastroenterology", "General Surgery", "Geriatrics",
|
|
369
|
+
"Hematology", "Infectious Disease", "Internal Medicine", "Nephrology",
|
|
370
|
+
"Neurology", "Obstetrics & Gynecology", "Oncology", "Ophthalmology",
|
|
371
|
+
"Orthopedic Surgery", "Otolaryngology", "Pediatrics", "Psychiatry",
|
|
372
|
+
"Pulmonology", "Radiology", "Rheumatology", "Urology", "Anesthesiology",
|
|
373
|
+
],
|
|
374
|
+
"transaction_type": [
|
|
375
|
+
"Purchase", "Refund", "Transfer", "Deposit", "Withdrawal",
|
|
376
|
+
"Payment", "Credit", "Debit", "Fee", "Interest",
|
|
377
|
+
"Dividend", "Commission", "Bonus", "Adjustment", "Reversal",
|
|
378
|
+
"Wire Transfer", "ACH Transfer", "Direct Deposit", "Check Payment",
|
|
379
|
+
"Cash Advance", "Balance Transfer", "Loan Disbursement", "Bill Payment",
|
|
380
|
+
],
|
|
381
|
+
"account_type": [
|
|
382
|
+
"Checking Account", "Savings Account", "Money Market Account",
|
|
383
|
+
"Certificate of Deposit", "Individual Retirement Account (IRA)",
|
|
384
|
+
"401(k) Account", "Brokerage Account", "Business Checking",
|
|
385
|
+
"Business Savings", "Health Savings Account (HSA)", "Joint Account",
|
|
386
|
+
"Trust Account", "Custodial Account", "Student Account", "Premium Account",
|
|
387
|
+
],
|
|
388
|
+
"brand": [
|
|
389
|
+
"Apple", "Samsung", "Sony", "LG", "Nike", "Adidas", "Puma", "Under Armour",
|
|
390
|
+
"Toyota", "Honda", "Ford", "Tesla", "Microsoft", "Google", "Amazon",
|
|
391
|
+
"Dell", "HP", "Lenovo", "ASUS", "Acer", "Canon", "Nikon", "Bose",
|
|
392
|
+
"JBL", "Philips", "Panasonic", "Whirlpool", "GE", "Bosch", "Dyson",
|
|
393
|
+
"IKEA", "Williams-Sonoma", "Crate & Barrel", "West Elm", "Pottery Barn",
|
|
394
|
+
],
|
|
395
|
+
"payment_method": [
|
|
396
|
+
"Credit Card (Visa)", "Credit Card (Mastercard)", "Credit Card (Amex)",
|
|
397
|
+
"Debit Card", "PayPal", "Apple Pay", "Google Pay", "Bank Transfer",
|
|
398
|
+
"Wire Transfer", "Check", "Cash", "Cryptocurrency", "Venmo",
|
|
399
|
+
"Klarna", "Afterpay", "Shop Pay", "Amazon Pay", "ACH Direct Debit",
|
|
400
|
+
],
|
|
401
|
+
"order_status": [
|
|
402
|
+
"Pending", "Confirmed", "Processing", "Shipped", "In Transit",
|
|
403
|
+
"Out for Delivery", "Delivered", "Completed", "Cancelled", "Refunded",
|
|
404
|
+
"On Hold", "Backordered", "Returned", "Partially Shipped", "Failed",
|
|
405
|
+
],
|
|
406
|
+
"customer_segment": [
|
|
407
|
+
"Enterprise", "Mid-Market", "Small Business", "Startup", "Individual",
|
|
408
|
+
"Premium", "Standard", "Basic", "Trial", "Churned", "At-Risk",
|
|
409
|
+
"Champion", "Loyal", "New Customer", "VIP", "Wholesale", "Retail",
|
|
410
|
+
],
|
|
411
|
+
"license_type": [
|
|
412
|
+
"MIT License", "Apache License 2.0", "GNU GPL v3", "BSD 3-Clause",
|
|
413
|
+
"Creative Commons BY 4.0", "Proprietary", "Commercial", "Educational",
|
|
414
|
+
"Open Source", "Freeware", "Shareware", "Enterprise License",
|
|
415
|
+
"Single User", "Multi-User", "Site License", "Perpetual License",
|
|
416
|
+
],
|
|
417
|
+
"file_type": [
|
|
418
|
+
"PDF Document", "Word Document", "Excel Spreadsheet", "PowerPoint Presentation",
|
|
419
|
+
"JPEG Image", "PNG Image", "MP4 Video", "MP3 Audio", "ZIP Archive",
|
|
420
|
+
"CSV File", "JSON File", "XML File", "HTML Page", "Python Script",
|
|
421
|
+
"JavaScript File", "SQL Database", "Markdown Document", "Text File",
|
|
422
|
+
],
|
|
423
|
+
"priority_level": [
|
|
424
|
+
"Critical", "High", "Medium", "Low", "Trivial",
|
|
425
|
+
"Urgent", "Normal", "Deferred", "Blocked", "In Review",
|
|
426
|
+
],
|
|
427
|
+
"subscription_plan": [
|
|
428
|
+
"Free Tier", "Basic Plan", "Professional Plan", "Business Plan",
|
|
429
|
+
"Enterprise Plan", "Starter Plan", "Growth Plan", "Scale Plan",
|
|
430
|
+
"Team Plan", "Individual Plan", "Student Plan", "Nonprofit Plan",
|
|
431
|
+
"Annual Pro", "Monthly Basic", "Lifetime Access", "Pay-As-You-Go",
|
|
432
|
+
],
|
|
433
|
+
# Generic fallbacks for common column patterns
|
|
434
|
+
"name": [
|
|
435
|
+
"Alpha Project", "Beta Initiative", "Gamma Solution", "Delta System",
|
|
436
|
+
"Epsilon Framework", "Zeta Platform", "Eta Service", "Theta Module",
|
|
437
|
+
"Iota Component", "Kappa Engine", "Lambda Protocol", "Mu Architecture",
|
|
438
|
+
"Strategic Modernization", "Digital Transformation", "Innovation Hub",
|
|
439
|
+
"Next Generation Platform", "Cloud Migration", "Data Integration Suite",
|
|
440
|
+
],
|
|
441
|
+
"description": [
|
|
442
|
+
"High-performance solution designed for enterprise-scale deployments with robust security features.",
|
|
443
|
+
"User-friendly platform offering seamless integration with existing workflows and systems.",
|
|
444
|
+
"Cutting-edge technology stack built for reliability, scalability, and maintainability.",
|
|
445
|
+
"Comprehensive toolkit featuring advanced analytics and real-time monitoring capabilities.",
|
|
446
|
+
"Industry-leading service with proven track record of customer satisfaction and uptime.",
|
|
447
|
+
"Streamlined workflow automation reducing manual effort and improving efficiency.",
|
|
448
|
+
"Innovative approach combining best practices with modern architectural patterns.",
|
|
449
|
+
"Full-featured solution supporting multiple deployment options and configuration flexibility.",
|
|
450
|
+
],
|
|
451
|
+
"title": [
|
|
452
|
+
"Senior Software Engineer", "Product Manager", "Data Analyst",
|
|
453
|
+
"Marketing Director", "Sales Representative", "Customer Success Manager",
|
|
454
|
+
"Technical Lead", "UX Designer", "DevOps Engineer", "Quality Analyst",
|
|
455
|
+
"Project Coordinator", "Business Analyst", "Account Executive",
|
|
456
|
+
],
|
|
457
|
+
"status": [
|
|
458
|
+
"Active", "Inactive", "Pending", "Approved", "Rejected",
|
|
459
|
+
"Under Review", "Completed", "In Progress", "On Hold", "Archived",
|
|
460
|
+
"Draft", "Published", "Expired", "Suspended", "Verified",
|
|
461
|
+
],
|
|
462
|
+
"type": [
|
|
463
|
+
"Standard", "Premium", "Custom", "Default", "Advanced",
|
|
464
|
+
"Basic", "Professional", "Enterprise", "Starter", "Legacy",
|
|
465
|
+
"Internal", "External", "Public", "Private", "Hybrid",
|
|
466
|
+
],
|
|
349
467
|
"skill": [
|
|
350
468
|
"Python", "JavaScript", "SQL", "Machine Learning", "Data Analysis",
|
|
351
469
|
"Project Management", "Communication", "Leadership", "Problem Solving",
|
|
@@ -541,14 +659,28 @@ Return ONLY a JSON array of strings, no explanation. Example:
|
|
|
541
659
|
use_llm: Whether to use LLM for generation
|
|
542
660
|
|
|
543
661
|
Returns:
|
|
544
|
-
List of domain-appropriate values
|
|
662
|
+
List of domain-appropriate values (NEVER empty - falls back to generic pools)
|
|
545
663
|
"""
|
|
546
664
|
# Determine domain
|
|
547
665
|
domain = domain_hint or self.detect_domain(column_name, table_name)
|
|
548
666
|
|
|
667
|
+
# If no domain detected, infer from column name patterns
|
|
549
668
|
if domain is None:
|
|
550
|
-
|
|
551
|
-
|
|
669
|
+
col_lower = column_name.lower()
|
|
670
|
+
# Try to match generic patterns
|
|
671
|
+
if "name" in col_lower:
|
|
672
|
+
domain = "name"
|
|
673
|
+
elif "desc" in col_lower or "about" in col_lower:
|
|
674
|
+
domain = "description"
|
|
675
|
+
elif "title" in col_lower:
|
|
676
|
+
domain = "title"
|
|
677
|
+
elif "status" in col_lower or "state" in col_lower:
|
|
678
|
+
domain = "status"
|
|
679
|
+
elif "type" in col_lower or "kind" in col_lower:
|
|
680
|
+
domain = "type"
|
|
681
|
+
else:
|
|
682
|
+
# Ultimate fallback - use "name" pool for any unknown TEXT column
|
|
683
|
+
domain = "name"
|
|
552
684
|
|
|
553
685
|
# Build context string
|
|
554
686
|
full_context = context or f"{table_name} {column_name}".strip()
|
|
@@ -570,10 +702,16 @@ Return ONLY a JSON array of strings, no explanation. Example:
|
|
|
570
702
|
else:
|
|
571
703
|
pool = self.FALLBACK_POOLS.get(domain, [])[:size]
|
|
572
704
|
|
|
705
|
+
# Ensure we never return empty - cascade through fallbacks
|
|
706
|
+
if not pool:
|
|
707
|
+
pool = self.FALLBACK_POOLS.get(domain, [])[:size]
|
|
708
|
+
if not pool:
|
|
709
|
+
# Absolute fallback - use generic name pool
|
|
710
|
+
pool = self.FALLBACK_POOLS.get("name", ["Item A", "Item B", "Item C"])[:size]
|
|
711
|
+
|
|
573
712
|
# Cache the pool
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
self._save_pool_to_cache(cache_key, pool)
|
|
713
|
+
self._pool_cache[cache_key] = pool
|
|
714
|
+
self._save_pool_to_cache(cache_key, pool)
|
|
577
715
|
|
|
578
716
|
return pool
|
|
579
717
|
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Misata Studio - Visual Schema Designer & Reverse Graph Editor
|
|
3
|
+
|
|
4
|
+
The GUI for reverse-engineering schemas from sample data and
|
|
5
|
+
designing custom distributions visually.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
pip install misata[studio]
|
|
9
|
+
misata studio
|
|
10
|
+
|
|
11
|
+
# Or from Python:
|
|
12
|
+
from misata.studio import launch
|
|
13
|
+
launch()
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from typing import Optional
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def launch(
    port: int = 8501,
    host: str = "localhost",
    open_browser: bool = True,
) -> None:
    """Launch Misata Studio GUI.

    Rewrites ``sys.argv`` to a ``streamlit run <app.py>`` command line and
    hands control to Streamlit's CLI entry point.

    Args:
        port: Port to run on (default 8501)
        host: Host to bind to (default localhost)
        open_browser: Open browser automatically; when False the server is
            started with ``--server.headless=true``.

    Raises:
        ImportError: If streamlit is not installed. Only the import itself is
            guarded, so an ImportError raised while the app is running is not
            misreported as a missing streamlit installation.
    """
    import os
    import sys

    try:
        import streamlit.web.cli as stcli
    except ImportError as exc:
        raise ImportError(
            "Misata Studio requires streamlit. Install with:\n"
            " pip install misata[studio]"
        ) from exc

    # app.py lives next to this module.
    app_path = os.path.join(os.path.dirname(__file__), "app.py")

    argv = [
        "streamlit", "run", app_path,
        f"--server.port={port}",
        f"--server.address={host}",
    ]
    if not open_browser:
        argv.append("--server.headless=true")

    # Streamlit's CLI reads its arguments from sys.argv.
    sys.argv = argv
    stcli.main()
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
__all__ = ["launch"]
|
misata/studio/app.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import streamlit as st
|
|
2
|
+
from misata.studio.state.store import StudioStore
|
|
3
|
+
from misata.studio.utils.styles import apply_custom_styles
|
|
4
|
+
from misata.studio.components.sidebar import render_sidebar
|
|
5
|
+
from misata.studio.tabs.schema_designer import render_schema_tab
|
|
6
|
+
from misata.studio.tabs.outcome_curve import render_outcome_tab
|
|
7
|
+
from misata.studio.tabs.configure import render_configure_tab
|
|
8
|
+
from misata.studio.tabs.generate import render_generate_tab
|
|
9
|
+
|
|
10
|
+
# Page Config
|
|
11
|
+
st.set_page_config(
|
|
12
|
+
page_title="Misata Studio",
|
|
13
|
+
page_icon="M",
|
|
14
|
+
layout="wide",
|
|
15
|
+
initial_sidebar_state="expanded"
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
def main():
    """Entry point for the Studio page: set up state, draw chrome, route to a tab."""

    # Session state and CSS must be ready before any widget is drawn.
    StudioStore.init()
    apply_custom_styles()

    render_sidebar()

    # Map each tab name kept in the store to the function that renders it.
    tab_renderers = {
        "Schema": render_schema_tab,
        "Outcome": render_outcome_tab,
        "Configure": render_configure_tab,
        "Generate": render_generate_tab,
    }
    active_tab = StudioStore.get("active_tab", "Schema")

    with st.container():
        render_tab = tab_renderers.get(active_tab)
        if render_tab is None:
            st.error(f"Unknown View: {active_tab}")
        else:
            render_tab()
|
|
47
|
+
|
|
48
|
+
if __name__ == "__main__":
|
|
49
|
+
main()
|