PyPI - misata - Versions diffs - 0.3.0b0__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

misata 0.3.0b0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

misata/__init__.py +1 -1
misata/agents/__init__.py +23 -0
misata/agents/pipeline.py +286 -0
misata/causal/__init__.py +5 -0
misata/causal/graph.py +109 -0
misata/causal/solver.py +115 -0
misata/cli.py +31 -0
misata/generators/__init__.py +19 -0
misata/generators/copula.py +198 -0
misata/llm_parser.py +180 -137
misata/quality.py +78 -33
misata/reference_data.py +221 -0
misata/research/__init__.py +3 -0
misata/research/agent.py +70 -0
misata/schema.py +25 -0
misata/simulator.py +264 -12
misata/smart_values.py +144 -6
misata/studio/__init__.py +55 -0
misata/studio/app.py +49 -0
misata/studio/components/inspector.py +81 -0
misata/studio/components/sidebar.py +35 -0
misata/studio/constraint_generator.py +781 -0
misata/studio/inference.py +319 -0
misata/studio/outcome_curve.py +284 -0
misata/studio/state/store.py +55 -0
misata/studio/tabs/configure.py +50 -0
misata/studio/tabs/generate.py +117 -0
misata/studio/tabs/outcome_curve.py +149 -0
misata/studio/tabs/schema_designer.py +217 -0
misata/studio/utils/styles.py +143 -0
misata/studio_constraints/__init__.py +29 -0
misata/studio_constraints/z3_solver.py +259 -0
{misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/METADATA +13 -2
misata-0.5.0.dist-info/RECORD +61 -0
{misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/WHEEL +1 -1
{misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/entry_points.txt +1 -0
misata-0.3.0b0.dist-info/RECORD +0 -37
/misata/{generators.py → generators_legacy.py} +0 -0
{misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/licenses/LICENSE +0 -0
{misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/top_level.txt +0 -0

misata/simulator.py CHANGED Viewed

@@ -16,7 +16,9 @@ from typing import Any, Dict, List, Optional
 import numpy as np
 import pandas as pd
-from misata.generators import TextGenerator
+from misata.generators.base import TextGenerator as _FactoryTextGenerator  # Generator factory version
+# Use the original generators.py TextGenerator which supports seed
+from misata.generators_legacy import TextGenerator
 from misata.schema import Column, Relationship, ScenarioEvent, SchemaConfig
@@ -34,6 +36,10 @@ class DataSimulator:
         rng: NumPy random generator for reproducibility
     """
+    # Performance constants
+    MAX_CONTEXT_ROWS = 50000  # Cap context storage for memory efficiency
+    TEXT_POOL_SIZE = 10000    # Size of text value pools for vectorized sampling
     def __init__(self, config: SchemaConfig,
                  apply_semantic_fixes: bool = True, batch_size: int = 10_000,
                  smart_mode: bool = False, use_llm: bool = True):
@@ -57,6 +63,7 @@ class DataSimulator:
         self._unique_pools: Dict[str, np.ndarray] = {}  # Store pre-generated unique values
         self._unique_counters: Dict[str, int] = {}      # Track usage of unique pools
         self._smart_pools: Dict[str, np.ndarray] = {}   # Cache smart value pools
+        self._text_pools: Dict[str, np.ndarray] = {}    # Cache text pools for vectorized sampling
         # Apply semantic inference to fix column types
         if apply_semantic_fixes:
@@ -199,10 +206,24 @@ class DataSimulator:
         ctx_df = df[cols_to_store].copy()
         if table_name not in self.context:
+            # First batch: store up to MAX_CONTEXT_ROWS
+            if len(ctx_df) > self.MAX_CONTEXT_ROWS:
+                ctx_df = ctx_df.sample(n=self.MAX_CONTEXT_ROWS, random_state=self.config.seed)
             self.context[table_name] = ctx_df
         else:
-            # Append to existing context
-            self.context[table_name] = pd.concat([self.context[table_name], ctx_df], ignore_index=True)
+            # Append to existing context, but cap at MAX_CONTEXT_ROWS
+            current_len = len(self.context[table_name])
+            if current_len >= self.MAX_CONTEXT_ROWS:
+                # Already at capacity, use reservoir sampling for randomness
+                # Replace some existing rows with new ones (probability-based)
+                return  # Skip appending, we have enough IDs
+            remaining_space = self.MAX_CONTEXT_ROWS - current_len
+            rows_to_add = ctx_df.iloc[:remaining_space]
+            self.context[table_name] = pd.concat(
+                [self.context[table_name], rows_to_add],
+                ignore_index=True
+            )
     def generate_column(
         self,
@@ -225,6 +246,70 @@ class DataSimulator:
         """
         params = column.distribution_params
+        # ========== CORRELATED COLUMN GENERATION ==========
+        # If this column depends on another column's value, use conditional distribution
+        if "depends_on" in params and table_data is not None:
+            parent_col = params["depends_on"]
+            mapping = params.get("mapping", {})
+            if parent_col in table_data.columns and mapping:
+                parent_values = table_data[parent_col].values
+                # Check if it's numeric or categorical mapping
+                first_val = next(iter(mapping.values()))
+                if isinstance(first_val, dict) and "mean" in first_val:
+                    # Numeric conditional distribution (e.g., salary based on job_title)
+                    # mapping = {"Intern": {"mean": 40000, "std": 5000}, "CTO": {"mean": 200000, "std": 30000}}
+                    values = np.zeros(size)
+                    for key, dist in mapping.items():
+                        mask = parent_values == key
+                        count = mask.sum()
+                        if count > 0:
+                            mean = dist.get("mean", 50000)
+                            std = dist.get("std", mean * 0.1)
+                            values[mask] = self.rng.normal(mean, std, count)
+                    # Handle values that didn't match any key (use default)
+                    default = params.get("default", {"mean": 50000, "std": 10000})
+                    unmatched = ~np.isin(parent_values, list(mapping.keys()))
+                    if unmatched.sum() > 0:
+                        values[unmatched] = self.rng.normal(
+                            default.get("mean", 50000),
+                            default.get("std", 10000),
+                            unmatched.sum()
+                        )
+                    return values
+                elif isinstance(first_val, list):
+                    # Categorical conditional (e.g., state based on country)
+                    # mapping = {"USA": ["CA", "TX", "NY"], "UK": ["England", "Scotland"]}
+                    values = np.empty(size, dtype=object)
+                    for key, choices in mapping.items():
+                        mask = parent_values == key
+                        count = mask.sum()
+                        if count > 0:
+                            values[mask] = self.rng.choice(choices, count)
+                    # Default for unmatched
+                    default_choices = params.get("default", ["Unknown"])
+                    unmatched = values == None  # noqa
+                    if unmatched.sum() > 0:
+                        values[unmatched] = self.rng.choice(default_choices, unmatched.sum())
+                    return values
+                elif isinstance(first_val, (int, float)):
+                    # Probability-based boolean (e.g., churn probability based on plan)
+                    # mapping = {"free": 0.3, "pro": 0.1, "enterprise": 0.05}
+                    values = np.zeros(size, dtype=bool)
+                    for key, prob in mapping.items():
+                        mask = parent_values == key
+                        count = mask.sum()
+                        if count > 0:
+                            values[mask] = self.rng.random(count) < prob
+                    return values
+        # ========== STANDARD COLUMN GENERATION ==========
         # CATEGORICAL
         if column.type == "categorical":
             choices = params.get("choices", ["A", "B", "C"])
@@ -469,23 +554,59 @@ class DataSimulator:
                         return values
             if text_type == "name":
-                values = np.array([self.text_gen.name() for _ in range(size)])
+                pool_key = "text_name"
+                if pool_key not in self._text_pools:
+                    pool_size = min(size, self.TEXT_POOL_SIZE)
+                    self._text_pools[pool_key] = np.array([self.text_gen.name() for _ in range(pool_size)])
+                values = self.rng.choice(self._text_pools[pool_key], size=size)
             elif text_type == "email":
-                values = np.array([self.text_gen.email() for _ in range(size)])
+                pool_key = "text_email"
+                if pool_key not in self._text_pools:
+                    pool_size = min(size, self.TEXT_POOL_SIZE)
+                    self._text_pools[pool_key] = np.array([self.text_gen.email() for _ in range(pool_size)])
+                values = self.rng.choice(self._text_pools[pool_key], size=size)
             elif text_type == "company":
-                values = np.array([self.text_gen.company() for _ in range(size)])
+                pool_key = "text_company"
+                if pool_key not in self._text_pools:
+                    pool_size = min(size, self.TEXT_POOL_SIZE)
+                    self._text_pools[pool_key] = np.array([self.text_gen.company() for _ in range(pool_size)])
+                values = self.rng.choice(self._text_pools[pool_key], size=size)
             elif text_type == "sentence":
-                values = np.array([self.text_gen.sentence() for _ in range(size)])
+                pool_key = "text_sentence"
+                if pool_key not in self._text_pools:
+                    pool_size = min(size, self.TEXT_POOL_SIZE)
+                    self._text_pools[pool_key] = np.array([self.text_gen.sentence() for _ in range(pool_size)])
+                values = self.rng.choice(self._text_pools[pool_key], size=size)
             elif text_type == "word":
-                values = np.array([self.text_gen.word() for _ in range(size)])
+                pool_key = "text_word"
+                if pool_key not in self._text_pools:
+                    pool_size = min(size, self.TEXT_POOL_SIZE)
+                    self._text_pools[pool_key] = np.array([self.text_gen.word() for _ in range(pool_size)])
+                values = self.rng.choice(self._text_pools[pool_key], size=size)
             elif text_type == "address":
-                values = np.array([self.text_gen.full_address() for _ in range(size)])
+                pool_key = "text_address"
+                if pool_key not in self._text_pools:
+                    pool_size = min(size, self.TEXT_POOL_SIZE)
+                    self._text_pools[pool_key] = np.array([self.text_gen.full_address() for _ in range(pool_size)])
+                values = self.rng.choice(self._text_pools[pool_key], size=size)
             elif text_type == "phone":
-                values = np.array([self.text_gen.phone_number() for _ in range(size)])
+                pool_key = "text_phone"
+                if pool_key not in self._text_pools:
+                    pool_size = min(size, self.TEXT_POOL_SIZE)
+                    self._text_pools[pool_key] = np.array([self.text_gen.phone_number() for _ in range(pool_size)])
+                values = self.rng.choice(self._text_pools[pool_key], size=size)
             elif text_type == "url":
-                values = np.array([self.text_gen.url() for _ in range(size)])
+                pool_key = "text_url"
+                if pool_key not in self._text_pools:
+                    pool_size = min(size, self.TEXT_POOL_SIZE)
+                    self._text_pools[pool_key] = np.array([self.text_gen.url() for _ in range(pool_size)])
+                values = self.rng.choice(self._text_pools[pool_key], size=size)
             else:
-                values = np.array([self.text_gen.sentence() for _ in range(size)])
+                pool_key = "text_sentence"
+                if pool_key not in self._text_pools:
+                    pool_size = min(size, self.TEXT_POOL_SIZE)
+                    self._text_pools[pool_key] = np.array([self.text_gen.sentence() for _ in range(pool_size)])
+                values = self.rng.choice(self._text_pools[pool_key], size=size)
             return values
@@ -640,6 +761,9 @@ class DataSimulator:
             # Apply business rule constraints
             df_batch = self.apply_constraints(df_batch, table)
+            # Apply outcome curves (Trends/Seasonality)
+            df_batch = self.apply_outcome_curves(df_batch, table_name)
             # Update context for future batches/tables
             self._update_context(table_name, df_batch)
@@ -667,6 +791,134 @@ class DataSimulator:
         return df
+    def apply_outcome_curves(self, df: pd.DataFrame, table_name: str) -> pd.DataFrame:
+        """
+        Scale columns of ``df`` by the outcome curves configured for ``table_name``.
+        Each curve in ``self.config.outcome_curves`` (an attribute-style object or a
+        plain dict) names a target column, a time column, a pattern type and a list
+        of curve points (``month`` -> ``relative_value``). The target column is
+        multiplied row-by-row by a factor interpolated from those points:
+          * 'seasonal' / 'cyclic': the factor is looked up by calendar month (1-12).
+          * 'growth' / 'trend' / 'increase' / 'decline': the time column is
+            normalized to 0.0-1.0 over the batch and point months 1..12 are mapped
+            onto that same range.
+          * any other pattern type leaves the column unchanged (factor 1.0).
+        Args:
+            df: Batch to adjust; the target column is reassigned on this frame.
+            table_name: Only curves whose ``table`` equals this name are applied.
+        Returns:
+            The same DataFrame object, with affected columns scaled.
+        A curve that fails for any reason is skipped with a warning rather than
+        aborting generation.
+        """
+        import logging
+        log = logging.getLogger(__name__)
+        def _field(obj, key, *default):
+            """Read ``key`` from an attribute-style object, else from a dict."""
+            if hasattr(obj, key):
+                return getattr(obj, key)
+            return obj.get(key, *default) if default else obj[key]
+        all_curves = getattr(self.config, 'outcome_curves', None)
+        if not all_curves:
+            log.debug("No outcome_curves found in config for %s", table_name)
+            return df
+        curves = [c for c in all_curves if _field(c, 'table', None) == table_name]
+        log.debug("%d of %d curves match table '%s'", len(curves), len(all_curves), table_name)
+        for curve in curves:
+            try:
+                target_col = _field(curve, 'column')
+                time_col = _field(curve, 'time_column')
+                points = _field(curve, 'curve_points', [])
+                # Read once through _field: calling curve.get() directly fails on
+                # attribute-style curves and would silently disable them.
+                pattern_type = _field(curve, 'pattern_type', 'seasonal')
+                if target_col not in df.columns:
+                    log.debug("Target column '%s' not in DataFrame", target_col)
+                    continue
+                if time_col not in df.columns:
+                    log.debug("Time column '%s' not in DataFrame", time_col)
+                    continue
+                if not points:
+                    log.debug("No curve points for %s.%s", table_name, target_col)
+                    continue
+                # Normalize point objects to dicts and order them along the x axis;
+                # np.interp requires increasing x values.
+                points = sorted(
+                    (
+                        {'month': p.month, 'relative_value': p.relative_value}
+                        if hasattr(p, 'month') else p
+                        for p in points
+                    ),
+                    key=lambda p: p.get('month', p.get('x', 0)),
+                )
+                timestamps = df[time_col]
+                if not pd.api.types.is_datetime64_any_dtype(timestamps):
+                    timestamps = pd.to_datetime(timestamps, errors='coerce')
+                row_factors = np.ones(len(df))
+                # STRATEGY 1: SEASONAL (Cyclic 1-12)
+                if pattern_type in ('seasonal', 'cyclic'):
+                    x_known = np.array([p['month'] for p in points])
+                    y_known = np.array([p['relative_value'] for p in points])
+                    scaling_factors = np.ones(13)  # index 0 unused, months are 1-12
+                    # np.interp holds the first/last value outside the known range.
+                    scaling_factors[1:] = np.interp(np.arange(1, 13), x_known, y_known)
+                    months = timestamps.dt.month.fillna(1).astype(int).values
+                    row_factors = scaling_factors[months]
+                # STRATEGY 2: GROWTH/TREND (Linear over absolute time)
+                elif pattern_type in ('growth', 'trend', 'increase', 'decline'):
+                    # Rows whose timestamp could not be parsed keep a factor of 1.0
+                    # instead of being cast to an integer.
+                    valid = timestamps.notna().to_numpy()
+                    t_valid = timestamps[valid]
+                    if valid.any() and t_valid.min() != t_valid.max():
+                        t_numeric = t_valid.astype(np.int64).to_numpy()
+                        t_start = t_numeric.min()
+                        t_norm = (t_numeric - t_start) / (t_numeric.max() - t_start)
+                        x_known = np.array([p['month'] for p in points])
+                        y_known = np.array([p['relative_value'] for p in points])
+                        # Point months are treated as a 1..12 scale: 1 -> 0.0, 12 -> 1.0
+                        x_known_norm = (x_known - 1) / 11.0
+                        row_factors[valid] = np.interp(t_norm, x_known_norm, y_known)
+                df[target_col] = df[target_col] * row_factors
+            except Exception as e:
+                warnings.warn(f"Failed to apply outcome curve for {table_name}: {e}")
+                continue
+        return df
     def _apply_single_constraint(self, df: pd.DataFrame, constraint: Any) -> pd.DataFrame:
         """Apply a single constraint to the DataFrame."""

misata/smart_values.py CHANGED Viewed

@@ -86,6 +86,22 @@ class SmartValueGenerator:
         "feature_name": ["feature", "capability", "functionality"],
         "bug_type": ["bug", "issue", "defect", "error"],
         "api_endpoint": ["endpoint", "api", "route", "path"],
+        # NEW v0.5.0: Additional domain patterns
+        "payment_method": ["payment_method", "pay_type", "payment_option"],
+        "order_status": ["order_status", "status", "state"],
+        "customer_segment": ["segment", "customer_type", "tier", "classification"],
+        "license_type": ["license", "licence"],
+        "file_type": ["file_type", "document_type", "mime_type"],
+        "priority_level": ["priority", "urgency", "importance"],
+        "subscription_plan": ["plan", "subscription", "tier", "package"],
+        # Generic patterns - lowest priority but always match on exact column names
+        "name": ["name"],
+        "description": ["description", "desc", "about", "summary", "details"],
+        "title": ["title", "heading"],
+        "status": ["status", "state"],
+        "type": ["type", "kind", "category"],
     }
     # Curated fallback pools (no LLM needed)
@@ -346,6 +362,108 @@ class SmartValueGenerator:
             "/api/v1/notifications", "/api/v1/settings", "/api/v1/search",
             "/api/v1/reports", "/api/v1/webhooks", "/api/v1/integrations",
         ],
+        # NEW v0.5.0: Additional high-quality domain pools
+        "medical_specialty": [
+            "Cardiology", "Dermatology", "Emergency Medicine", "Endocrinology",
+            "Family Medicine", "Gastroenterology", "General Surgery", "Geriatrics",
+            "Hematology", "Infectious Disease", "Internal Medicine", "Nephrology",
+            "Neurology", "Obstetrics & Gynecology", "Oncology", "Ophthalmology",
+            "Orthopedic Surgery", "Otolaryngology", "Pediatrics", "Psychiatry",
+            "Pulmonology", "Radiology", "Rheumatology", "Urology", "Anesthesiology",
+        ],
+        "transaction_type": [
+            "Purchase", "Refund", "Transfer", "Deposit", "Withdrawal",
+            "Payment", "Credit", "Debit", "Fee", "Interest",
+            "Dividend", "Commission", "Bonus", "Adjustment", "Reversal",
+            "Wire Transfer", "ACH Transfer", "Direct Deposit", "Check Payment",
+            "Cash Advance", "Balance Transfer", "Loan Disbursement", "Bill Payment",
+        ],
+        "account_type": [
+            "Checking Account", "Savings Account", "Money Market Account",
+            "Certificate of Deposit", "Individual Retirement Account (IRA)",
+            "401(k) Account", "Brokerage Account", "Business Checking",
+            "Business Savings", "Health Savings Account (HSA)", "Joint Account",
+            "Trust Account", "Custodial Account", "Student Account", "Premium Account",
+        ],
+        "brand": [
+            "Apple", "Samsung", "Sony", "LG", "Nike", "Adidas", "Puma", "Under Armour",
+            "Toyota", "Honda", "Ford", "Tesla", "Microsoft", "Google", "Amazon",
+            "Dell", "HP", "Lenovo", "ASUS", "Acer", "Canon", "Nikon", "Bose",
+            "JBL", "Philips", "Panasonic", "Whirlpool", "GE", "Bosch", "Dyson",
+            "IKEA", "Williams-Sonoma", "Crate & Barrel", "West Elm", "Pottery Barn",
+        ],
+        "payment_method": [
+            "Credit Card (Visa)", "Credit Card (Mastercard)", "Credit Card (Amex)",
+            "Debit Card", "PayPal", "Apple Pay", "Google Pay", "Bank Transfer",
+            "Wire Transfer", "Check", "Cash", "Cryptocurrency", "Venmo",
+            "Klarna", "Afterpay", "Shop Pay", "Amazon Pay", "ACH Direct Debit",
+        ],
+        "order_status": [
+            "Pending", "Confirmed", "Processing", "Shipped", "In Transit",
+            "Out for Delivery", "Delivered", "Completed", "Cancelled", "Refunded",
+            "On Hold", "Backordered", "Returned", "Partially Shipped", "Failed",
+        ],
+        "customer_segment": [
+            "Enterprise", "Mid-Market", "Small Business", "Startup", "Individual",
+            "Premium", "Standard", "Basic", "Trial", "Churned", "At-Risk",
+            "Champion", "Loyal", "New Customer", "VIP", "Wholesale", "Retail",
+        ],
+        "license_type": [
+            "MIT License", "Apache License 2.0", "GNU GPL v3", "BSD 3-Clause",
+            "Creative Commons BY 4.0", "Proprietary", "Commercial", "Educational",
+            "Open Source", "Freeware", "Shareware", "Enterprise License",
+            "Single User", "Multi-User", "Site License", "Perpetual License",
+        ],
+        "file_type": [
+            "PDF Document", "Word Document", "Excel Spreadsheet", "PowerPoint Presentation",
+            "JPEG Image", "PNG Image", "MP4 Video", "MP3 Audio", "ZIP Archive",
+            "CSV File", "JSON File", "XML File", "HTML Page", "Python Script",
+            "JavaScript File", "SQL Database", "Markdown Document", "Text File",
+        ],
+        "priority_level": [
+            "Critical", "High", "Medium", "Low", "Trivial",
+            "Urgent", "Normal", "Deferred", "Blocked", "In Review",
+        ],
+        "subscription_plan": [
+            "Free Tier", "Basic Plan", "Professional Plan", "Business Plan",
+            "Enterprise Plan", "Starter Plan", "Growth Plan", "Scale Plan",
+            "Team Plan", "Individual Plan", "Student Plan", "Nonprofit Plan",
+            "Annual Pro", "Monthly Basic", "Lifetime Access", "Pay-As-You-Go",
+        ],
+        # Generic fallbacks for common column patterns
+        "name": [
+            "Alpha Project", "Beta Initiative", "Gamma Solution", "Delta System",
+            "Epsilon Framework", "Zeta Platform", "Eta Service", "Theta Module",
+            "Iota Component", "Kappa Engine", "Lambda Protocol", "Mu Architecture",
+            "Strategic Modernization", "Digital Transformation", "Innovation Hub",
+            "Next Generation Platform", "Cloud Migration", "Data Integration Suite",
+        ],
+        "description": [
+            "High-performance solution designed for enterprise-scale deployments with robust security features.",
+            "User-friendly platform offering seamless integration with existing workflows and systems.",
+            "Cutting-edge technology stack built for reliability, scalability, and maintainability.",
+            "Comprehensive toolkit featuring advanced analytics and real-time monitoring capabilities.",
+            "Industry-leading service with proven track record of customer satisfaction and uptime.",
+            "Streamlined workflow automation reducing manual effort and improving efficiency.",
+            "Innovative approach combining best practices with modern architectural patterns.",
+            "Full-featured solution supporting multiple deployment options and configuration flexibility.",
+        ],
+        "title": [
+            "Senior Software Engineer", "Product Manager", "Data Analyst",
+            "Marketing Director", "Sales Representative", "Customer Success Manager",
+            "Technical Lead", "UX Designer", "DevOps Engineer", "Quality Analyst",
+            "Project Coordinator", "Business Analyst", "Account Executive",
+        ],
+        "status": [
+            "Active", "Inactive", "Pending", "Approved", "Rejected",
+            "Under Review", "Completed", "In Progress", "On Hold", "Archived",
+            "Draft", "Published", "Expired", "Suspended", "Verified",
+        ],
+        "type": [
+            "Standard", "Premium", "Custom", "Default", "Advanced",
+            "Basic", "Professional", "Enterprise", "Starter", "Legacy",
+            "Internal", "External", "Public", "Private", "Hybrid",
+        ],
         "skill": [
             "Python", "JavaScript", "SQL", "Machine Learning", "Data Analysis",
             "Project Management", "Communication", "Leadership", "Problem Solving",
@@ -541,14 +659,28 @@ Return ONLY a JSON array of strings, no explanation. Example:
             use_llm: Whether to use LLM for generation
         Returns:
-            List of domain-appropriate values
+            List of domain-appropriate values (NEVER empty - falls back to generic pools)
         """
         # Determine domain
         domain = domain_hint or self.detect_domain(column_name, table_name)
+        # If no domain detected, infer from column name patterns
         if domain is None:
-            # No domain detected, return empty
-            return []
+            col_lower = column_name.lower()
+            # Try to match generic patterns
+            if "name" in col_lower:
+                domain = "name"
+            elif "desc" in col_lower or "about" in col_lower:
+                domain = "description"
+            elif "title" in col_lower:
+                domain = "title"
+            elif "status" in col_lower or "state" in col_lower:
+                domain = "status"
+            elif "type" in col_lower or "kind" in col_lower:
+                domain = "type"
+            else:
+                # Ultimate fallback - use "name" pool for any unknown TEXT column
+                domain = "name"
         # Build context string
         full_context = context or f"{table_name} {column_name}".strip()
@@ -570,10 +702,16 @@ Return ONLY a JSON array of strings, no explanation. Example:
         else:
             pool = self.FALLBACK_POOLS.get(domain, [])[:size]
+        # Ensure we never return empty - cascade through fallbacks
+        if not pool:
+            pool = self.FALLBACK_POOLS.get(domain, [])[:size]
+        if not pool:
+            # Absolute fallback - use generic name pool
+            pool = self.FALLBACK_POOLS.get("name", ["Item A", "Item B", "Item C"])[:size]
         # Cache the pool
-        if pool:
-            self._pool_cache[cache_key] = pool
-            self._save_pool_to_cache(cache_key, pool)
+        self._pool_cache[cache_key] = pool
+        self._save_pool_to_cache(cache_key, pool)
         return pool

misata/studio/__init__.py ADDED Viewed

@@ -0,0 +1,55 @@
+"""
+Misata Studio - Visual Schema Designer & Reverse Graph Editor
+The GUI for reverse-engineering schemas from sample data and
+designing custom distributions visually.
+Usage:
+    pip install misata[studio]
+    misata studio
+    # Or from Python:
+    from misata.studio import launch
+    launch()
+"""
+from typing import Optional
+def launch(
+    port: int = 8501,
+    host: str = "localhost",
+    open_browser: bool = True,
+) -> None:
+    """Launch Misata Studio GUI.
+    Args:
+        port: Port to run on (default 8501)
+        host: Host to bind to (default localhost)
+        open_browser: Open browser automatically; when False the
+            ``--server.headless=true`` flag is passed instead
+    Raises:
+        ImportError: If streamlit cannot be imported. The original import
+            error is attached as the cause.
+    """
+    import os
+    import sys
+    # Guard only the streamlit import: an ImportError raised later, while the
+    # app itself is running, must not be reported as "streamlit is missing".
+    try:
+        import streamlit.web.cli as stcli
+    except ImportError as err:
+        raise ImportError(
+            "Misata Studio requires streamlit. Install with:\n"
+            "  pip install misata[studio]"
+        ) from err
+    # Get the path to app.py, which sits next to this module
+    app_path = os.path.join(os.path.dirname(__file__), "app.py")
+    argv = [
+        "streamlit", "run", app_path,
+        f"--server.port={port}",
+        f"--server.address={host}",
+    ]
+    if not open_browser:
+        argv.append("--server.headless=true")
+    # sys.argv is replaced so the CLI entry point sees a `streamlit run` command line
+    sys.argv = argv
+    stcli.main()
+__all__ = ["launch"]

misata/studio/app.py ADDED Viewed

@@ -0,0 +1,49 @@
+import streamlit as st
+from misata.studio.state.store import StudioStore
+from misata.studio.utils.styles import apply_custom_styles
+from misata.studio.components.sidebar import render_sidebar
+from misata.studio.tabs.schema_designer import render_schema_tab
+from misata.studio.tabs.outcome_curve import render_outcome_tab
+from misata.studio.tabs.configure import render_configure_tab
+from misata.studio.tabs.generate import render_generate_tab
+# Page Config
+st.set_page_config(
+    page_title="Misata Studio",
+    page_icon="M",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+def main():
+    """Entry point for the Studio page.
+    Initializes the store and styles, draws the sidebar, then renders the
+    view named by the store's ``active_tab`` value (``"Schema"`` when unset).
+    An unrecognized tab name is reported with ``st.error``.
+    """
+    # State and styling first, then the sidebar
+    StudioStore.init()
+    apply_custom_styles()
+    render_sidebar()
+    # Ordered route table: tab label -> render function
+    routes = (
+        ("Schema", render_schema_tab),
+        ("Outcome", render_outcome_tab),
+        ("Configure", render_configure_tab),
+        ("Generate", render_generate_tab),
+    )
+    active_tab = StudioStore.get("active_tab", "Schema")
+    with st.container():
+        for label, render in routes:
+            if active_tab == label:
+                render()
+                break
+        else:
+            # No route matched the stored tab name
+            st.error(f"Unknown View: {active_tab}")
+if __name__ == "__main__":
+    main()

misata 0.3.0b0__py3-none-any.whl → 0.5.0__py3-none-any.whl

misata 0.3.0b0__py3-none-any.whl → 0.5.0__py3-none-any.whl