misata 0.3.0b0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. misata/__init__.py +1 -1
  2. misata/agents/__init__.py +23 -0
  3. misata/agents/pipeline.py +286 -0
  4. misata/causal/__init__.py +5 -0
  5. misata/causal/graph.py +109 -0
  6. misata/causal/solver.py +115 -0
  7. misata/cli.py +31 -0
  8. misata/generators/__init__.py +19 -0
  9. misata/generators/copula.py +198 -0
  10. misata/llm_parser.py +180 -137
  11. misata/quality.py +78 -33
  12. misata/reference_data.py +221 -0
  13. misata/research/__init__.py +3 -0
  14. misata/research/agent.py +70 -0
  15. misata/schema.py +25 -0
  16. misata/simulator.py +264 -12
  17. misata/smart_values.py +144 -6
  18. misata/studio/__init__.py +55 -0
  19. misata/studio/app.py +49 -0
  20. misata/studio/components/inspector.py +81 -0
  21. misata/studio/components/sidebar.py +35 -0
  22. misata/studio/constraint_generator.py +781 -0
  23. misata/studio/inference.py +319 -0
  24. misata/studio/outcome_curve.py +284 -0
  25. misata/studio/state/store.py +55 -0
  26. misata/studio/tabs/configure.py +50 -0
  27. misata/studio/tabs/generate.py +117 -0
  28. misata/studio/tabs/outcome_curve.py +149 -0
  29. misata/studio/tabs/schema_designer.py +217 -0
  30. misata/studio/utils/styles.py +143 -0
  31. misata/studio_constraints/__init__.py +29 -0
  32. misata/studio_constraints/z3_solver.py +259 -0
  33. {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/METADATA +13 -2
  34. misata-0.5.0.dist-info/RECORD +61 -0
  35. {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/WHEEL +1 -1
  36. {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/entry_points.txt +1 -0
  37. misata-0.3.0b0.dist-info/RECORD +0 -37
  38. /misata/{generators.py → generators_legacy.py} +0 -0
  39. {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/licenses/LICENSE +0 -0
  40. {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/top_level.txt +0 -0
misata/simulator.py CHANGED
@@ -16,7 +16,9 @@ from typing import Any, Dict, List, Optional
  import numpy as np
  import pandas as pd

- from misata.generators import TextGenerator
+ from misata.generators.base import TextGenerator as _FactoryTextGenerator  # Generator factory version
+ # Use the original generators.py TextGenerator, which supports seeding
+ from misata.generators_legacy import TextGenerator
  from misata.schema import Column, Relationship, ScenarioEvent, SchemaConfig

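Why the legacy import matters: per the inline comment, only the original `generators.py` TextGenerator takes a seed, so only it can regenerate identical batches. A minimal sketch of that reproducibility property, assuming the legacy class accepts a `seed` keyword (the constructor signature is not shown in this diff):

```python
# Sketch only: assumes misata.generators_legacy.TextGenerator(seed=...) exists,
# as the inline comment above implies. Not taken verbatim from the package.
from misata.generators_legacy import TextGenerator

gen_a = TextGenerator(seed=42)
gen_b = TextGenerator(seed=42)

# Deterministic: two generators with the same seed yield the same value stream
assert [gen_a.name() for _ in range(5)] == [gen_b.name() for _ in range(5)]
```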
@@ -34,6 +36,10 @@ class DataSimulator:
      rng: NumPy random generator for reproducibility
      """

+     # Performance constants
+     MAX_CONTEXT_ROWS = 50000  # Cap context storage for memory efficiency
+     TEXT_POOL_SIZE = 10000    # Size of text value pools for vectorized sampling
+
      def __init__(self, config: SchemaConfig,
                   apply_semantic_fixes: bool = True, batch_size: int = 10_000,
                   smart_mode: bool = False, use_llm: bool = True):
@@ -57,6 +63,7 @@ class DataSimulator:
          self._unique_pools: Dict[str, np.ndarray] = {}   # Store pre-generated unique values
          self._unique_counters: Dict[str, int] = {}       # Track usage of unique pools
          self._smart_pools: Dict[str, np.ndarray] = {}    # Cache smart value pools
+         self._text_pools: Dict[str, np.ndarray] = {}     # Cache text pools for vectorized sampling

          # Apply semantic inference to fix column types
          if apply_semantic_fixes:
@@ -199,10 +206,24 @@ class DataSimulator:
          ctx_df = df[cols_to_store].copy()

          if table_name not in self.context:
+             # First batch: store up to MAX_CONTEXT_ROWS
+             if len(ctx_df) > self.MAX_CONTEXT_ROWS:
+                 ctx_df = ctx_df.sample(n=self.MAX_CONTEXT_ROWS, random_state=self.config.seed)
              self.context[table_name] = ctx_df
          else:
-             # Append to existing context
-             self.context[table_name] = pd.concat([self.context[table_name], ctx_df], ignore_index=True)
+             # Append to existing context, but cap at MAX_CONTEXT_ROWS
+             current_len = len(self.context[table_name])
+             if current_len >= self.MAX_CONTEXT_ROWS:
+                 # Already at capacity: skip appending, we have enough IDs
+                 return
+
+             remaining_space = self.MAX_CONTEXT_ROWS - current_len
+             rows_to_add = ctx_df.iloc[:remaining_space]
+             self.context[table_name] = pd.concat(
+                 [self.context[table_name], rows_to_add],
+                 ignore_index=True,
+             )

      def generate_column(
          self,
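The capping logic above is self-contained enough to sketch in isolation. A minimal, hypothetical reproduction of the same append-with-cap behavior (`MAX_ROWS` stands in for `MAX_CONTEXT_ROWS`; names are made up):

```python
import pandas as pd

MAX_ROWS = 5  # stands in for DataSimulator.MAX_CONTEXT_ROWS

def append_capped(store: dict, name: str, batch: pd.DataFrame, seed: int = 0) -> None:
    """Append batch rows to store[name], never exceeding MAX_ROWS."""
    if name not in store:
        # First batch: downsample if it already exceeds the cap
        if len(batch) > MAX_ROWS:
            batch = batch.sample(n=MAX_ROWS, random_state=seed)
        store[name] = batch
        return
    remaining = MAX_ROWS - len(store[name])
    if remaining <= 0:
        return  # at capacity: later batches are dropped, matching the diff
    store[name] = pd.concat([store[name], batch.iloc[:remaining]], ignore_index=True)

store = {}
append_capped(store, "users", pd.DataFrame({"id": range(8)}))      # sampled down to 5
append_capped(store, "users", pd.DataFrame({"id": range(8, 12)}))  # dropped: already full
assert len(store["users"]) == 5
```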
@@ -225,6 +246,70 @@ class DataSimulator:
          """
          params = column.distribution_params

+         # ========== CORRELATED COLUMN GENERATION ==========
+         # If this column depends on another column's value, use conditional distribution
+         if "depends_on" in params and table_data is not None:
+             parent_col = params["depends_on"]
+             mapping = params.get("mapping", {})
+
+             if parent_col in table_data.columns and mapping:
+                 parent_values = table_data[parent_col].values
+
+                 # Check if it's a numeric or categorical mapping
+                 first_val = next(iter(mapping.values()))
+                 if isinstance(first_val, dict) and "mean" in first_val:
+                     # Numeric conditional distribution (e.g., salary based on job_title):
+                     # mapping = {"Intern": {"mean": 40000, "std": 5000}, "CTO": {"mean": 200000, "std": 30000}}
+                     values = np.zeros(size)
+                     for key, dist in mapping.items():
+                         mask = parent_values == key
+                         count = mask.sum()
+                         if count > 0:
+                             mean = dist.get("mean", 50000)
+                             std = dist.get("std", mean * 0.1)
+                             values[mask] = self.rng.normal(mean, std, count)
+
+                     # Handle values that didn't match any key (use default)
+                     default = params.get("default", {"mean": 50000, "std": 10000})
+                     unmatched = ~np.isin(parent_values, list(mapping.keys()))
+                     if unmatched.sum() > 0:
+                         values[unmatched] = self.rng.normal(
+                             default.get("mean", 50000),
+                             default.get("std", 10000),
+                             unmatched.sum()
+                         )
+                     return values
+
+                 elif isinstance(first_val, list):
+                     # Categorical conditional (e.g., state based on country):
+                     # mapping = {"USA": ["CA", "TX", "NY"], "UK": ["England", "Scotland"]}
+                     values = np.empty(size, dtype=object)
+                     for key, choices in mapping.items():
+                         mask = parent_values == key
+                         count = mask.sum()
+                         if count > 0:
+                             values[mask] = self.rng.choice(choices, count)
+
+                     # Default for unmatched
+                     default_choices = params.get("default", ["Unknown"])
+                     unmatched = values == None  # noqa
+                     if unmatched.sum() > 0:
+                         values[unmatched] = self.rng.choice(default_choices, unmatched.sum())
+                     return values
+
+                 elif isinstance(first_val, (int, float)):
+                     # Probability-based boolean (e.g., churn probability based on plan):
+                     # mapping = {"free": 0.3, "pro": 0.1, "enterprise": 0.05}
+                     values = np.zeros(size, dtype=bool)
+                     for key, prob in mapping.items():
+                         mask = parent_values == key
+                         count = mask.sum()
+                         if count > 0:
+                             values[mask] = self.rng.random(count) < prob
+                     return values
+
+         # ========== STANDARD COLUMN GENERATION ==========
+
          # CATEGORICAL
          if column.type == "categorical":
              choices = params.get("choices", ["A", "B", "C"])
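The `depends_on`/`mapping` mechanism is straightforward to reproduce standalone. A sketch of the numeric branch with hypothetical column names and parameters, showing how per-group means produce a correlated column:

```python
import numpy as np

rng = np.random.default_rng(0)

# Hypothetical parent column and conditional spec, mirroring the numeric branch above
job_titles = rng.choice(["Intern", "Engineer", "CTO"], size=1000)
mapping = {
    "Intern":   {"mean": 40_000,  "std": 5_000},
    "Engineer": {"mean": 95_000,  "std": 12_000},
    "CTO":      {"mean": 200_000, "std": 30_000},
}

salaries = np.zeros(job_titles.size)
for title, dist in mapping.items():
    mask = job_titles == title            # rows whose parent value matches this key
    salaries[mask] = rng.normal(dist["mean"], dist["std"], mask.sum())

# The generated column is now correlated with its parent
assert salaries[job_titles == "Intern"].mean() < salaries[job_titles == "CTO"].mean()
```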
@@ -469,23 +554,59 @@ class DataSimulator:
              return values

          if text_type == "name":
-             values = np.array([self.text_gen.name() for _ in range(size)])
+             pool_key = "text_name"
+             if pool_key not in self._text_pools:
+                 pool_size = min(size, self.TEXT_POOL_SIZE)
+                 self._text_pools[pool_key] = np.array([self.text_gen.name() for _ in range(pool_size)])
+             values = self.rng.choice(self._text_pools[pool_key], size=size)
          elif text_type == "email":
-             values = np.array([self.text_gen.email() for _ in range(size)])
+             pool_key = "text_email"
+             if pool_key not in self._text_pools:
+                 pool_size = min(size, self.TEXT_POOL_SIZE)
+                 self._text_pools[pool_key] = np.array([self.text_gen.email() for _ in range(pool_size)])
+             values = self.rng.choice(self._text_pools[pool_key], size=size)
          elif text_type == "company":
-             values = np.array([self.text_gen.company() for _ in range(size)])
+             pool_key = "text_company"
+             if pool_key not in self._text_pools:
+                 pool_size = min(size, self.TEXT_POOL_SIZE)
+                 self._text_pools[pool_key] = np.array([self.text_gen.company() for _ in range(pool_size)])
+             values = self.rng.choice(self._text_pools[pool_key], size=size)
          elif text_type == "sentence":
-             values = np.array([self.text_gen.sentence() for _ in range(size)])
+             pool_key = "text_sentence"
+             if pool_key not in self._text_pools:
+                 pool_size = min(size, self.TEXT_POOL_SIZE)
+                 self._text_pools[pool_key] = np.array([self.text_gen.sentence() for _ in range(pool_size)])
+             values = self.rng.choice(self._text_pools[pool_key], size=size)
          elif text_type == "word":
-             values = np.array([self.text_gen.word() for _ in range(size)])
+             pool_key = "text_word"
+             if pool_key not in self._text_pools:
+                 pool_size = min(size, self.TEXT_POOL_SIZE)
+                 self._text_pools[pool_key] = np.array([self.text_gen.word() for _ in range(pool_size)])
+             values = self.rng.choice(self._text_pools[pool_key], size=size)
          elif text_type == "address":
-             values = np.array([self.text_gen.full_address() for _ in range(size)])
+             pool_key = "text_address"
+             if pool_key not in self._text_pools:
+                 pool_size = min(size, self.TEXT_POOL_SIZE)
+                 self._text_pools[pool_key] = np.array([self.text_gen.full_address() for _ in range(pool_size)])
+             values = self.rng.choice(self._text_pools[pool_key], size=size)
          elif text_type == "phone":
-             values = np.array([self.text_gen.phone_number() for _ in range(size)])
+             pool_key = "text_phone"
+             if pool_key not in self._text_pools:
+                 pool_size = min(size, self.TEXT_POOL_SIZE)
+                 self._text_pools[pool_key] = np.array([self.text_gen.phone_number() for _ in range(pool_size)])
+             values = self.rng.choice(self._text_pools[pool_key], size=size)
          elif text_type == "url":
-             values = np.array([self.text_gen.url() for _ in range(size)])
+             pool_key = "text_url"
+             if pool_key not in self._text_pools:
+                 pool_size = min(size, self.TEXT_POOL_SIZE)
+                 self._text_pools[pool_key] = np.array([self.text_gen.url() for _ in range(pool_size)])
+             values = self.rng.choice(self._text_pools[pool_key], size=size)
          else:
-             values = np.array([self.text_gen.sentence() for _ in range(size)])
+             pool_key = "text_sentence"
+             if pool_key not in self._text_pools:
+                 pool_size = min(size, self.TEXT_POOL_SIZE)
+                 self._text_pools[pool_key] = np.array([self.text_gen.sentence() for _ in range(pool_size)])
+             values = self.rng.choice(self._text_pools[pool_key], size=size)

          return values

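Every branch above repeats one pattern: build a pool of at most `TEXT_POOL_SIZE` values once, then sample from it with vectorized `rng.choice`. A condensed sketch of that pattern (the lambda stands in for `self.text_gen`; names are illustrative):

```python
import numpy as np

rng = np.random.default_rng(0)
POOL_SIZE = 10_000  # stands in for DataSimulator.TEXT_POOL_SIZE
_pools = {}

def sample_text(kind, make_one, size):
    """Sample `size` values from a lazily built pool instead of calling make_one `size` times."""
    if kind not in _pools:
        pool_size = min(size, POOL_SIZE)
        _pools[kind] = np.array([make_one() for _ in range(pool_size)])
    return rng.choice(_pools[kind], size=size)

# 1M emails cost at most POOL_SIZE generator calls; the rest is vectorized sampling
emails = sample_text("email", lambda: f"user{rng.integers(1_000_000)}@example.com", 1_000_000)
```

The trade-off is repetition: once `size` exceeds the pool, values recur, which is presumably why unique ID columns go through `_unique_pools` rather than these text pools.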
@@ -640,6 +761,9 @@ class DataSimulator:

          # Apply business rule constraints
          df_batch = self.apply_constraints(df_batch, table)
+
+         # Apply outcome curves (Trends/Seasonality)
+         df_batch = self.apply_outcome_curves(df_batch, table_name)

          # Update context for future batches/tables
          self._update_context(table_name, df_batch)
@@ -667,6 +791,134 @@ class DataSimulator:

          return df

+     def apply_outcome_curves(self, df: pd.DataFrame, table_name: str) -> pd.DataFrame:
+         """
+         Apply temporal outcome curves to force data to match trends/seasonality.
+
+         This overrides the base distribution with the high-level constraints
+         defined in the prompt (e.g. "seasonal peaks", "upward trend").
+         """
+         if not hasattr(self.config, 'outcome_curves') or not self.config.outcome_curves:
+             print(f"[CURVE DEBUG] No outcome_curves found in config for {table_name}")
+             return df
+
+         print(f"[CURVE DEBUG] Found {len(self.config.outcome_curves)} curves in config")
+
+         # Filter curves for this table (handle both dict and Pydantic object)
+         curves = []
+         for c in self.config.outcome_curves:
+             c_table = c.table if hasattr(c, 'table') else c.get('table')
+             if c_table == table_name:
+                 curves.append(c)
+
+         print(f"[CURVE DEBUG] {len(curves)} curves match table '{table_name}'")
+
+         for curve in curves:
+             try:
+                 # Access attributes (Pydantic) or dict keys
+                 target_col = curve.column if hasattr(curve, 'column') else curve['column']
+                 time_col = curve.time_column if hasattr(curve, 'time_column') else curve['time_column']
+                 points = curve.curve_points if hasattr(curve, 'curve_points') else curve.get('curve_points', [])
+                 pattern_type = curve.pattern_type if hasattr(curve, 'pattern_type') else curve.get('pattern_type', 'seasonal')
+
+                 print(f"[CURVE DEBUG] Applying curve: table={table_name}, col={target_col}, time_col={time_col}, pattern={pattern_type}")
+                 print(f"[CURVE DEBUG] DF columns: {list(df.columns)}")
+
+                 if target_col not in df.columns:
+                     print(f"[CURVE DEBUG] Target column '{target_col}' not in DataFrame!")
+                     continue
+                 if time_col not in df.columns:
+                     print(f"[CURVE DEBUG] Time column '{time_col}' not in DataFrame!")
+                     continue
+                 if not points:
+                     print(f"[CURVE DEBUG] No curve points!")
+                     continue
+
+                 # Convert Pydantic CurvePoint objects to dicts if needed
+                 point_dicts = []
+                 for p in points:
+                     if hasattr(p, 'month'):
+                         point_dicts.append({'month': p.month, 'relative_value': p.relative_value})
+                     else:
+                         point_dicts.append(p)
+                 points = point_dicts
+
+                 print(f"[CURVE DEBUG] Points: {points}")
+
+                 # Sort points by order (month or progress)
+                 points.sort(key=lambda x: x.get('month', x.get('x', 0)))
+
+                 # Extract time components
+                 if not pd.api.types.is_datetime64_any_dtype(df[time_col]):
+                     timestamps = pd.to_datetime(df[time_col], errors='coerce')
+                 else:
+                     timestamps = df[time_col]
+
+                 # Initialize factors
+                 row_factors = np.ones(len(df))
+
+                 # STRATEGY 1: SEASONAL (cyclic, months 1-12)
+                 if pattern_type in ['seasonal', 'cyclic']:
+                     months = timestamps.dt.month
+                     scaling_factors = np.ones(13)  # indexed 1-12
+
+                     x_known = np.array([p['month'] for p in points])
+                     y_known = np.array([p['relative_value'] for p in points])
+
+                     for m in range(1, 13):
+                         if m < x_known.min():
+                             scaling_factors[m] = y_known[0]
+                         elif m > x_known.max():
+                             scaling_factors[m] = y_known[-1]
+                         else:
+                             scaling_factors[m] = np.interp(m, x_known, y_known)
+
+                     row_factors = scaling_factors[months.fillna(1).astype(int).values]
+
+                 # STRATEGY 2: GROWTH/TREND (linear over absolute time)
+                 elif pattern_type in ['growth', 'trend', 'increase', 'decline']:
+                     t_min = timestamps.min()
+                     t_max = timestamps.max()
+
+                     if t_min == t_max:
+                         row_factors = np.ones(len(df))
+                     else:
+                         # Convert to numeric (nanosecond timestamps)
+                         t_numerics = timestamps.astype(np.int64)
+                         t_start = t_numerics.min()
+                         t_range = t_numerics.max() - t_start
+
+                         # Normalize to the 0.0-1.0 range
+                         t_norm = (t_numerics - t_start) / t_range
+
+                         # The LLM usually emits curve points on a 1-12 "month" scale;
+                         # map that scale across the whole time range (1 = start, 12 = end)
+                         x_known = np.array([p['month'] for p in points])
+                         y_known = np.array([p['relative_value'] for p in points])
+
+                         # Normalize x_known to 0.0-1.0 (assuming the 1..12 scale)
+                         x_known_norm = (x_known - 1) / 11.0  # 1 -> 0, 12 -> 1
+
+                         # Interpolate
+                         row_factors = np.interp(t_norm, x_known_norm, y_known)
+
+                 # Apply the factors
+                 df[target_col] = df[target_col] * row_factors
+
+             except Exception as e:
+                 warnings.warn(f"Failed to apply outcome curve for {table_name}: {e}")
+                 continue
+
+         return df
+
      def _apply_single_constraint(self, df: pd.DataFrame, constraint: Any) -> pd.DataFrame:
          """Apply a single constraint to the DataFrame."""

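The seasonal strategy boils down to interpolating sparse `(month, relative_value)` points onto all twelve months, then indexing the resulting factors by each row's month. A standalone sketch with made-up curve points:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)

# Hypothetical curve: December peak, summer trough (values relative to a 1.0 baseline)
points = [
    {"month": 1, "relative_value": 1.0},
    {"month": 7, "relative_value": 0.6},
    {"month": 12, "relative_value": 1.8},
]

x_known = np.array([p["month"] for p in points])
y_known = np.array([p["relative_value"] for p in points])

# Per-month scaling factors; np.interp clamps to the end values outside [1, 12],
# which is what the explicit min()/max() branches in the diff achieve
factors = np.interp(np.arange(1, 13), x_known, y_known)

# Scale a year of synthetic daily sales by each row's month factor
dates = pd.date_range("2024-01-01", "2024-12-31", freq="D")
sales = rng.normal(100, 10, len(dates)) * factors[dates.month - 1]
```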
misata/smart_values.py CHANGED
@@ -86,6 +86,22 @@ class SmartValueGenerator:
      "feature_name": ["feature", "capability", "functionality"],
      "bug_type": ["bug", "issue", "defect", "error"],
      "api_endpoint": ["endpoint", "api", "route", "path"],
+
+     # NEW v0.5.0: Additional domain patterns
+     "payment_method": ["payment_method", "pay_type", "payment_option"],
+     "order_status": ["order_status", "status", "state"],
+     "customer_segment": ["segment", "customer_type", "tier", "classification"],
+     "license_type": ["license", "licence"],
+     "file_type": ["file_type", "document_type", "mime_type"],
+     "priority_level": ["priority", "urgency", "importance"],
+     "subscription_plan": ["plan", "subscription", "tier", "package"],
+
+     # Generic patterns - lowest priority, but always match on exact column names
+     "name": ["name"],
+     "description": ["description", "desc", "about", "summary", "details"],
+     "title": ["title", "heading"],
+     "status": ["status", "state"],
+     "type": ["type", "kind", "category"],
  }

  # Curated fallback pools (no LLM needed)
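These tables are keyword lists keyed by domain; a column name is matched by substring against each list. A sketch of how such a table is typically consulted (the real `detect_domain` may differ in its details):

```python
from typing import Optional

# Truncated copy of the keyword tables above, for illustration only
PATTERNS = {
    "payment_method": ["payment_method", "pay_type", "payment_option"],
    "priority_level": ["priority", "urgency", "importance"],
    "status": ["status", "state"],  # generic entry, deliberately last
}

def detect_domain(column_name: str) -> Optional[str]:
    """Return the first domain whose keywords occur in the column name."""
    col = column_name.lower()
    for domain, keywords in PATTERNS.items():
        if any(kw in col for kw in keywords):
            return domain
    return None

assert detect_domain("ticket_priority") == "priority_level"
assert detect_domain("workflow_state") == "status"
```

Because substring matching is greedy, ordering matters: generic entries like `status` and `type` must sit after the specific ones, which is what the "lowest priority" comment above signals.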
@@ -346,6 +362,108 @@ class SmartValueGenerator:
          "/api/v1/notifications", "/api/v1/settings", "/api/v1/search",
          "/api/v1/reports", "/api/v1/webhooks", "/api/v1/integrations",
      ],
+     # NEW v0.5.0: Additional high-quality domain pools
+     "medical_specialty": [
+         "Cardiology", "Dermatology", "Emergency Medicine", "Endocrinology",
+         "Family Medicine", "Gastroenterology", "General Surgery", "Geriatrics",
+         "Hematology", "Infectious Disease", "Internal Medicine", "Nephrology",
+         "Neurology", "Obstetrics & Gynecology", "Oncology", "Ophthalmology",
+         "Orthopedic Surgery", "Otolaryngology", "Pediatrics", "Psychiatry",
+         "Pulmonology", "Radiology", "Rheumatology", "Urology", "Anesthesiology",
+     ],
+     "transaction_type": [
+         "Purchase", "Refund", "Transfer", "Deposit", "Withdrawal",
+         "Payment", "Credit", "Debit", "Fee", "Interest",
+         "Dividend", "Commission", "Bonus", "Adjustment", "Reversal",
+         "Wire Transfer", "ACH Transfer", "Direct Deposit", "Check Payment",
+         "Cash Advance", "Balance Transfer", "Loan Disbursement", "Bill Payment",
+     ],
+     "account_type": [
+         "Checking Account", "Savings Account", "Money Market Account",
+         "Certificate of Deposit", "Individual Retirement Account (IRA)",
+         "401(k) Account", "Brokerage Account", "Business Checking",
+         "Business Savings", "Health Savings Account (HSA)", "Joint Account",
+         "Trust Account", "Custodial Account", "Student Account", "Premium Account",
+     ],
+     "brand": [
+         "Apple", "Samsung", "Sony", "LG", "Nike", "Adidas", "Puma", "Under Armour",
+         "Toyota", "Honda", "Ford", "Tesla", "Microsoft", "Google", "Amazon",
+         "Dell", "HP", "Lenovo", "ASUS", "Acer", "Canon", "Nikon", "Bose",
+         "JBL", "Philips", "Panasonic", "Whirlpool", "GE", "Bosch", "Dyson",
+         "IKEA", "Williams-Sonoma", "Crate & Barrel", "West Elm", "Pottery Barn",
+     ],
+     "payment_method": [
+         "Credit Card (Visa)", "Credit Card (Mastercard)", "Credit Card (Amex)",
+         "Debit Card", "PayPal", "Apple Pay", "Google Pay", "Bank Transfer",
+         "Wire Transfer", "Check", "Cash", "Cryptocurrency", "Venmo",
+         "Klarna", "Afterpay", "Shop Pay", "Amazon Pay", "ACH Direct Debit",
+     ],
+     "order_status": [
+         "Pending", "Confirmed", "Processing", "Shipped", "In Transit",
+         "Out for Delivery", "Delivered", "Completed", "Cancelled", "Refunded",
+         "On Hold", "Backordered", "Returned", "Partially Shipped", "Failed",
+     ],
+     "customer_segment": [
+         "Enterprise", "Mid-Market", "Small Business", "Startup", "Individual",
+         "Premium", "Standard", "Basic", "Trial", "Churned", "At-Risk",
+         "Champion", "Loyal", "New Customer", "VIP", "Wholesale", "Retail",
+     ],
+     "license_type": [
+         "MIT License", "Apache License 2.0", "GNU GPL v3", "BSD 3-Clause",
+         "Creative Commons BY 4.0", "Proprietary", "Commercial", "Educational",
+         "Open Source", "Freeware", "Shareware", "Enterprise License",
+         "Single User", "Multi-User", "Site License", "Perpetual License",
+     ],
+     "file_type": [
+         "PDF Document", "Word Document", "Excel Spreadsheet", "PowerPoint Presentation",
+         "JPEG Image", "PNG Image", "MP4 Video", "MP3 Audio", "ZIP Archive",
+         "CSV File", "JSON File", "XML File", "HTML Page", "Python Script",
+         "JavaScript File", "SQL Database", "Markdown Document", "Text File",
+     ],
+     "priority_level": [
+         "Critical", "High", "Medium", "Low", "Trivial",
+         "Urgent", "Normal", "Deferred", "Blocked", "In Review",
+     ],
+     "subscription_plan": [
+         "Free Tier", "Basic Plan", "Professional Plan", "Business Plan",
+         "Enterprise Plan", "Starter Plan", "Growth Plan", "Scale Plan",
+         "Team Plan", "Individual Plan", "Student Plan", "Nonprofit Plan",
+         "Annual Pro", "Monthly Basic", "Lifetime Access", "Pay-As-You-Go",
+     ],
+     # Generic fallbacks for common column patterns
+     "name": [
+         "Alpha Project", "Beta Initiative", "Gamma Solution", "Delta System",
+         "Epsilon Framework", "Zeta Platform", "Eta Service", "Theta Module",
+         "Iota Component", "Kappa Engine", "Lambda Protocol", "Mu Architecture",
+         "Strategic Modernization", "Digital Transformation", "Innovation Hub",
+         "Next Generation Platform", "Cloud Migration", "Data Integration Suite",
+     ],
+     "description": [
+         "High-performance solution designed for enterprise-scale deployments with robust security features.",
+         "User-friendly platform offering seamless integration with existing workflows and systems.",
+         "Cutting-edge technology stack built for reliability, scalability, and maintainability.",
+         "Comprehensive toolkit featuring advanced analytics and real-time monitoring capabilities.",
+         "Industry-leading service with proven track record of customer satisfaction and uptime.",
+         "Streamlined workflow automation reducing manual effort and improving efficiency.",
+         "Innovative approach combining best practices with modern architectural patterns.",
+         "Full-featured solution supporting multiple deployment options and configuration flexibility.",
+     ],
+     "title": [
+         "Senior Software Engineer", "Product Manager", "Data Analyst",
+         "Marketing Director", "Sales Representative", "Customer Success Manager",
+         "Technical Lead", "UX Designer", "DevOps Engineer", "Quality Analyst",
+         "Project Coordinator", "Business Analyst", "Account Executive",
+     ],
+     "status": [
+         "Active", "Inactive", "Pending", "Approved", "Rejected",
+         "Under Review", "Completed", "In Progress", "On Hold", "Archived",
+         "Draft", "Published", "Expired", "Suspended", "Verified",
+     ],
+     "type": [
+         "Standard", "Premium", "Custom", "Default", "Advanced",
+         "Basic", "Professional", "Enterprise", "Starter", "Legacy",
+         "Internal", "External", "Public", "Private", "Hybrid",
+     ],
      "skill": [
          "Python", "JavaScript", "SQL", "Machine Learning", "Data Analysis",
          "Project Management", "Communication", "Leadership", "Problem Solving",
@@ -541,14 +659,28 @@ Return ONLY a JSON array of strings, no explanation. Example:
          use_llm: Whether to use LLM for generation

      Returns:
-         List of domain-appropriate values
+         List of domain-appropriate values (NEVER empty - falls back to generic pools)
      """
      # Determine domain
      domain = domain_hint or self.detect_domain(column_name, table_name)

+     # If no domain detected, infer one from column name patterns
      if domain is None:
-         # No domain detected, return empty
-         return []
+         col_lower = column_name.lower()
+         # Try to match generic patterns
+         if "name" in col_lower:
+             domain = "name"
+         elif "desc" in col_lower or "about" in col_lower:
+             domain = "description"
+         elif "title" in col_lower:
+             domain = "title"
+         elif "status" in col_lower or "state" in col_lower:
+             domain = "status"
+         elif "type" in col_lower or "kind" in col_lower:
+             domain = "type"
+         else:
+             # Ultimate fallback - use the "name" pool for any unknown TEXT column
+             domain = "name"

      # Build context string
      full_context = context or f"{table_name} {column_name}".strip()
@@ -570,10 +702,16 @@ Return ONLY a JSON array of strings, no explanation. Example:
      else:
          pool = self.FALLBACK_POOLS.get(domain, [])[:size]

+     # Ensure we never return empty - cascade through fallbacks
+     if not pool:
+         pool = self.FALLBACK_POOLS.get(domain, [])[:size]
+         if not pool:
+             # Absolute fallback - use the generic name pool
+             pool = self.FALLBACK_POOLS.get("name", ["Item A", "Item B", "Item C"])[:size]
+
      # Cache the pool
-     if pool:
-         self._pool_cache[cache_key] = pool
-         self._save_pool_to_cache(cache_key, pool)
+     self._pool_cache[cache_key] = pool
+     self._save_pool_to_cache(cache_key, pool)

      return pool

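Taken together, the two hunks above guarantee a non-empty result via a cascade: detected domain → name-pattern inference → curated pool → generic `name` pool → hardcoded items. A condensed sketch of that cascade (pools truncated for illustration):

```python
FALLBACK_POOLS = {
    "name": ["Alpha Project", "Beta Initiative", "Gamma Solution"],
    "status": ["Active", "Inactive", "Pending"],
}

def get_pool(column_name: str, size: int = 10) -> list:
    """Never returns empty: unknown columns fall through to the generic 'name' pool."""
    col = column_name.lower()
    if "status" in col or "state" in col:
        domain = "status"
    else:
        domain = "name"  # ultimate fallback, as in the diff
    pool = FALLBACK_POOLS.get(domain, [])[:size]
    if not pool:
        # Absolute fallback, mirroring the hardcoded items above
        pool = FALLBACK_POOLS.get("name", ["Item A", "Item B", "Item C"])[:size]
    return pool

assert get_pool("deployment_region")  # unknown column still yields values
```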
misata/studio/__init__.py ADDED
@@ -0,0 +1,55 @@
+ """
+ Misata Studio - Visual Schema Designer & Reverse Graph Editor
+
+ The GUI for reverse-engineering schemas from sample data and
+ designing custom distributions visually.
+
+ Usage:
+     pip install misata[studio]
+     misata studio
+
+     # Or from Python:
+     from misata.studio import launch
+     launch()
+ """
+
+ from typing import Optional
+
+
+ def launch(
+     port: int = 8501,
+     host: str = "localhost",
+     open_browser: bool = True,
+ ) -> None:
+     """Launch the Misata Studio GUI.
+
+     Args:
+         port: Port to run on (default 8501)
+         host: Host to bind to (default localhost)
+         open_browser: Open the browser automatically
+     """
+     try:
+         import streamlit.web.cli as stcli
+         import sys
+         import os
+
+         # Get the path to app.py
+         app_path = os.path.join(os.path.dirname(__file__), "app.py")
+
+         sys.argv = [
+             "streamlit", "run", app_path,
+             f"--server.port={port}",
+             f"--server.address={host}",
+             "--server.headless=true" if not open_browser else "",
+         ]
+         sys.argv = [arg for arg in sys.argv if arg]  # Remove empty strings
+
+         stcli.main()
+     except ImportError:
+         raise ImportError(
+             "Misata Studio requires streamlit. Install with:\n"
+             "    pip install misata[studio]"
+         )
+
+
+ __all__ = ["launch"]
misata/studio/app.py ADDED
@@ -0,0 +1,49 @@
+ import streamlit as st
+
+ from misata.studio.state.store import StudioStore
+ from misata.studio.utils.styles import apply_custom_styles
+ from misata.studio.components.sidebar import render_sidebar
+ from misata.studio.tabs.schema_designer import render_schema_tab
+ from misata.studio.tabs.outcome_curve import render_outcome_tab
+ from misata.studio.tabs.configure import render_configure_tab
+ from misata.studio.tabs.generate import render_generate_tab
+
+ # Page Config
+ st.set_page_config(
+     page_title="Misata Studio",
+     page_icon="M",
+     layout="wide",
+     initial_sidebar_state="expanded",
+ )
+
+
+ def main():
+     """Main Orchestrator for Misata Studio."""
+
+     # 1. Initialize State & Styles
+     StudioStore.init()
+     apply_custom_styles()
+
+     # 2. Render Sidebar
+     render_sidebar()
+
+     # 3. Router
+     active_tab = StudioStore.get("active_tab", "Schema")
+
+     # Content Area
+     with st.container():
+         if active_tab == "Schema":
+             render_schema_tab()
+         elif active_tab == "Outcome":
+             render_outcome_tab()
+         elif active_tab == "Configure":
+             render_configure_tab()
+         elif active_tab == "Generate":
+             render_generate_tab()
+         else:
+             st.error(f"Unknown View: {active_tab}")
+
+
+ if __name__ == "__main__":
+     main()
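`app.py` leans on a `StudioStore` exposing `init()` and `get()`; that module (`misata/studio/state/store.py`, +55 lines) is listed in the file table above but not shown in this excerpt. A minimal sketch of what such a session-state wrapper could look like, purely as an assumption about its shape:

```python
import streamlit as st

class StudioStore:
    """Thin wrapper over st.session_state (a sketch; the real store.py is not shown in this diff)."""

    DEFAULTS = {"active_tab": "Schema", "schema_config": None}

    @classmethod
    def init(cls) -> None:
        # Seed session state once per session with default values
        for key, value in cls.DEFAULTS.items():
            st.session_state.setdefault(key, value)

    @classmethod
    def get(cls, key: str, default=None):
        return st.session_state.get(key, default)

    @classmethod
    def set(cls, key: str, value) -> None:
        st.session_state[key] = value
```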