misata 0.3.0b0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +1 -1
- misata/agents/__init__.py +23 -0
- misata/agents/pipeline.py +286 -0
- misata/causal/__init__.py +5 -0
- misata/causal/graph.py +109 -0
- misata/causal/solver.py +115 -0
- misata/cli.py +31 -0
- misata/generators/__init__.py +19 -0
- misata/generators/copula.py +198 -0
- misata/llm_parser.py +180 -137
- misata/quality.py +78 -33
- misata/reference_data.py +221 -0
- misata/research/__init__.py +3 -0
- misata/research/agent.py +70 -0
- misata/schema.py +25 -0
- misata/simulator.py +264 -12
- misata/smart_values.py +144 -6
- misata/studio/__init__.py +55 -0
- misata/studio/app.py +49 -0
- misata/studio/components/inspector.py +81 -0
- misata/studio/components/sidebar.py +35 -0
- misata/studio/constraint_generator.py +781 -0
- misata/studio/inference.py +319 -0
- misata/studio/outcome_curve.py +284 -0
- misata/studio/state/store.py +55 -0
- misata/studio/tabs/configure.py +50 -0
- misata/studio/tabs/generate.py +117 -0
- misata/studio/tabs/outcome_curve.py +149 -0
- misata/studio/tabs/schema_designer.py +217 -0
- misata/studio/utils/styles.py +143 -0
- misata/studio_constraints/__init__.py +29 -0
- misata/studio_constraints/z3_solver.py +259 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/METADATA +13 -2
- misata-0.5.0.dist-info/RECORD +61 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/WHEEL +1 -1
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/entry_points.txt +1 -0
- misata-0.3.0b0.dist-info/RECORD +0 -37
- /misata/{generators.py → generators_legacy.py} +0 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,319 @@
"""
Schema Inference Module - Reverse-engineer schemas from sample data.

This module analyzes uploaded CSV/JSON data and infers:
- Column types (int, float, categorical, date, text, email, uuid, etc.)
- Distribution parameters (min, max, mean, std, choices, etc.)
- Correlations between columns
"""

import re
from typing import Any, Dict, List, Optional, Tuple
from datetime import datetime

import numpy as np
import pandas as pd

from misata.schema import Column, SchemaConfig, Table


# ============ Type Detection Patterns ============

EMAIL_PATTERN = re.compile(r'^[\w\.-]+@[\w\.-]+\.\w+$')
UUID_PATTERN = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
PHONE_PATTERN = re.compile(r'^[\d\s\-\+\(\)]{7,20}$')
URL_PATTERN = re.compile(r'^https?://')


def detect_column_type(series: pd.Series) -> Tuple[str, Dict[str, Any]]:
    """Detect the type and distribution parameters of a column.

    Args:
        series: Pandas Series to analyze

    Returns:
        Tuple of (type_name, distribution_params)
    """
    # Drop nulls for analysis
    clean = series.dropna()
    if len(clean) == 0:
        return "text", {"text_type": "sentence"}

    # Check for boolean
    unique_vals = set(clean.unique())
    if unique_vals <= {True, False, 0, 1, "true", "false", "True", "False", "yes", "no", "Yes", "No"}:
        # Calculate probability of True
        bool_vals = clean.map(lambda x: str(x).lower() in ('true', '1', 'yes'))
        prob = bool_vals.mean()
        return "boolean", {"probability": round(prob, 2)}

    # Check for UUID
    if clean.dtype == object:
        sample = str(clean.iloc[0])
        if UUID_PATTERN.match(sample):
            return "text", {"text_type": "uuid"}

        # Check for email
        if EMAIL_PATTERN.match(sample):
            return "text", {"text_type": "email"}

        # Check for URL
        if URL_PATTERN.match(sample):
            return "text", {"text_type": "url"}

        # Check for phone
        if PHONE_PATTERN.match(sample):
            return "text", {"text_type": "phone"}

    # Check for date
    if clean.dtype == 'datetime64[ns]' or pd.api.types.is_datetime64_any_dtype(clean):
        return "date", {
            "start": str(clean.min().date()),
            "end": str(clean.max().date())
        }

    # Try parsing as date
    if clean.dtype == object:
        try:
            parsed = pd.to_datetime(clean, errors='coerce')
            if parsed.notna().mean() > 0.9:  # 90%+ parse as dates
                return "date", {
                    "start": str(parsed.min().date()),
                    "end": str(parsed.max().date())
                }
        except:
            pass

    # Check for categorical (limited unique values)
    n_unique = clean.nunique()
    if n_unique <= min(20, len(clean) * 0.2):  # <=20 or <=20% unique
        value_counts = clean.value_counts(normalize=True)
        choices = value_counts.index.tolist()
        probabilities = [round(p, 3) for p in value_counts.values.tolist()]
        return "categorical", {
            "choices": choices,
            "probabilities": probabilities
        }

    # Check for numeric
    if pd.api.types.is_integer_dtype(clean):
        return "int", {
            "min": int(clean.min()),
            "max": int(clean.max()),
            "distribution": "uniform"
        }

    if pd.api.types.is_float_dtype(clean):
        # Check if it looks like currency (2 decimal places)
        decimals = clean.apply(lambda x: len(str(x).split('.')[-1]) if '.' in str(x) else 0)
        if decimals.mode().iloc[0] == 2:
            return "float", {
                "min": round(float(clean.min()), 2),
                "max": round(float(clean.max()), 2),
                "distribution": "lognormal",
                "decimals": 2
            }
        return "float", {
            "min": float(clean.min()),
            "max": float(clean.max()),
            "distribution": "normal",
            "mean": float(clean.mean()),
            "std": float(clean.std())
        }

    # Try converting to numeric
    try:
        numeric = pd.to_numeric(clean, errors='coerce')
        if numeric.notna().mean() > 0.9:  # 90%+ are numeric
            if numeric.apply(float.is_integer).all():
                return "int", {
                    "min": int(numeric.min()),
                    "max": int(numeric.max())
                }
            return "float", {
                "min": float(numeric.min()),
                "max": float(numeric.max()),
                "mean": float(numeric.mean()),
                "std": float(numeric.std())
            }
    except:
        pass

    # Default to text
    # Try to detect text type from column name
    col_name = series.name.lower() if series.name else ""

    if "name" in col_name:
        return "text", {"text_type": "name"}
    elif "email" in col_name:
        return "text", {"text_type": "email"}
    elif "address" in col_name:
        return "text", {"text_type": "address"}
    elif "company" in col_name or "org" in col_name:
        return "text", {"text_type": "company"}
    elif "phone" in col_name:
        return "text", {"text_type": "phone"}
    elif "url" in col_name or "website" in col_name:
        return "text", {"text_type": "url"}

    return "text", {"text_type": "sentence"}


def fit_distribution(series: pd.Series) -> Dict[str, Any]:
    """Fit a statistical distribution to numeric data.

    Args:
        series: Numeric pandas Series

    Returns:
        Distribution parameters including type and fitted params
    """
    clean = pd.to_numeric(series.dropna(), errors='coerce').dropna()
    if len(clean) < 5:
        return {"distribution": "uniform", "min": 0, "max": 100}

    mean = float(clean.mean())
    std = float(clean.std())
    min_val = float(clean.min())
    max_val = float(clean.max())
    skew = float(clean.skew())

    # Determine best distribution based on characteristics
    if abs(skew) < 0.5:
        # Roughly symmetric → Normal
        return {
            "distribution": "normal",
            "mean": mean,
            "std": std,
            "min": min_val,
            "max": max_val
        }
    elif skew > 1.0 and min_val >= 0:
        # Right-skewed, positive → Lognormal
        return {
            "distribution": "lognormal",
            "mean": np.log(mean) if mean > 0 else 0,
            "sigma": std / mean if mean > 0 else 1,
            "min": min_val,
            "max": max_val
        }
    else:
        # Use empirical (histogram-based)
        hist, bins = np.histogram(clean, bins=20, density=True)
        control_points = []
        for i in range(len(hist)):
            x = (bins[i] + bins[i+1]) / 2
            y = float(hist[i])
            control_points.append({"x": x, "y": y})

        return {
            "distribution": "custom",
            "control_points": control_points,
            "min": min_val,
            "max": max_val
        }


def infer_schema(
    data: pd.DataFrame,
    table_name: str = "data",
    row_count: Optional[int] = None
) -> SchemaConfig:
    """Infer a complete schema from sample data.

    Args:
        data: Sample DataFrame to analyze
        table_name: Name for the inferred table
        row_count: Target row count (default: 100x input)

    Returns:
        SchemaConfig ready for generation
    """
    if row_count is None:
        row_count = max(len(data) * 100, 1000)

    columns = []
    for col_name in data.columns:
        col_type, params = detect_column_type(data[col_name])

        # Check for unique constraint
        is_unique = data[col_name].nunique() == len(data)

        column = Column(
            name=str(col_name),
            table_name=table_name,
            type=col_type,
            distribution_params=params,
            nullable=data[col_name].isna().any(),
            unique=is_unique
        )
        columns.append(column)

    return SchemaConfig(
        name=f"Inferred: {table_name}",
        tables=[Table(
            name=table_name,
            row_count=row_count,
            columns=[c.name for c in columns]
        )],
        columns={table_name: columns},
        relationships=[]
    )


def detect_correlations(data: pd.DataFrame) -> List[Dict[str, Any]]:
    """Detect correlations between numeric columns.

    Args:
        data: DataFrame to analyze

    Returns:
        List of correlation dicts with column pairs and strength
    """
    numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
    if len(numeric_cols) < 2:
        return []

    correlations = []
    corr_matrix = data[numeric_cols].corr()

    for i, col1 in enumerate(numeric_cols):
        for col2 in numeric_cols[i+1:]:
            corr = corr_matrix.loc[col1, col2]
            if abs(corr) > 0.5:  # Only report strong correlations
                correlations.append({
                    "column1": col1,
                    "column2": col2,
                    "correlation": round(corr, 3),
                    "strength": "strong" if abs(corr) > 0.7 else "moderate"
                })

    return correlations


def schema_to_dict(schema: SchemaConfig) -> Dict[str, Any]:
    """Convert schema to a JSON-serializable dict for the UI."""
    return {
        "name": schema.name,
        "tables": [
            {
                "name": t.name,
                "row_count": t.row_count,
                "columns": t.columns
            }
            for t in schema.tables
        ],
        "columns": {
            table_name: [
                {
                    "name": c.name,
                    "type": c.type,
                    "params": c.distribution_params,
                    "nullable": c.nullable,
                    "unique": c.unique
                }
                for c in cols
            ]
            for table_name, cols in schema.columns.items()
        }
    }
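For context, here is a minimal usage sketch of the inference helpers added above. The import path is assumed from the misata/studio/inference.py entry in the file list, and the sample DataFrame is purely illustrative:

import pandas as pd

from misata.studio.inference import detect_correlations, infer_schema, schema_to_dict  # assumed path

# Illustrative sample of the data to reverse-engineer.
sample = pd.DataFrame({
    "customer_id": [1, 2, 3, 4, 5],
    "plan": ["free", "pro", "free", "pro", "enterprise"],
    "mrr": [0.0, 49.99, 0.0, 49.99, 299.00],
    "signup_date": ["2024-01-03", "2024-02-11", "2024-03-20", "2024-04-02", "2024-05-15"],
})

# Infer a SchemaConfig sized at 100x the sample (minimum 1000 rows) and inspect it.
schema = infer_schema(sample, table_name="customers")
print(schema_to_dict(schema))

# List column pairs whose absolute correlation exceeds 0.5.
print(detect_correlations(sample))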
@@ -0,0 +1,284 @@
"""
Outcome Curve Designer - Reverse Time-Series Generation

The killer feature: Users draw the aggregated outcome they want,
and Misata generates individual transactions that produce that exact curve.

Example:
    User draws: Revenue = [$100K, $150K, $200K, $180K, ...] over 12 months
    Misata generates: 50,000 individual orders with dates/amounts
    When aggregated: SUM(amount) GROUP BY month = exactly the drawn curve

Algorithm:
    1. Parse curve control points into time buckets
    2. For each bucket, calculate target aggregate
    3. Distribute transactions across bucket:
       - Determine transaction count (based on avg ticket or specified)
       - Generate individual amounts that sum to target
    4. Add variance/noise for realism
"""

from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
import pandas as pd


@dataclass
class CurvePoint:
    """A single point on the outcome curve."""
    timestamp: datetime
    value: float


@dataclass
class OutcomeCurve:
    """Represents the target outcome curve drawn by user."""
    metric_name: str  # e.g., "revenue", "signups", "orders"
    time_unit: str  # "day", "week", "month"
    points: List[CurvePoint]

    # Optional constraints
    avg_transaction_value: Optional[float] = None  # For revenue curves
    min_transactions_per_period: int = 10
    max_transactions_per_period: int = 10000


def interpolate_curve(curve: OutcomeCurve, num_buckets: int) -> List[float]:
    """Interpolate curve to get values for each time bucket."""
    if len(curve.points) < 2:
        return [curve.points[0].value] * num_buckets

    # Extract x (time indices) and y (values)
    x = np.array([i for i in range(len(curve.points))])
    y = np.array([p.value for p in curve.points])

    # Interpolate to num_buckets
    x_new = np.linspace(0, len(curve.points) - 1, num_buckets)

    from scipy.interpolate import interp1d
    f = interp1d(x, y, kind='cubic', fill_value='extrapolate')
    return list(np.maximum(f(x_new), 0))  # Ensure non-negative


def generate_transactions_for_bucket(
    target_value: float,
    bucket_start: datetime,
    bucket_end: datetime,
    avg_transaction: Optional[float] = None,
    min_transactions: int = 10,
    max_transactions: int = 1000,
    rng: Optional[np.random.Generator] = None
) -> pd.DataFrame:
    """Generate individual transactions that sum to target_value for a time bucket.

    Returns DataFrame with columns: [timestamp, amount]
    """
    if rng is None:
        rng = np.random.default_rng()

    if target_value <= 0:
        return pd.DataFrame(columns=['timestamp', 'amount'])

    # Determine number of transactions
    if avg_transaction:
        n_transactions = int(target_value / avg_transaction)
        n_transactions = max(min_transactions, min(n_transactions, max_transactions))
    else:
        # Estimate based on target value
        n_transactions = max(min_transactions, min(int(target_value / 50), max_transactions))

    # Generate amounts that sum to target using Dirichlet distribution
    # This ensures realistic variation while hitting exact target
    proportions = rng.dirichlet(np.ones(n_transactions) * 2)  # alpha=2 for moderate variance
    amounts = proportions * target_value

    # Add some variance to make it more realistic
    # Small noise that doesn't change the sum significantly
    noise = rng.normal(0, abs(target_value) * 0.001, n_transactions)
    amounts = amounts + noise

    # Adjust to hit exact target (compensate for noise)
    amounts = amounts * (target_value / amounts.sum())

    # Ensure all positive
    amounts = np.maximum(amounts, 0.01)
    amounts = amounts * (target_value / amounts.sum())  # Re-normalize

    # Generate timestamps uniformly distributed within bucket
    bucket_duration = (bucket_end - bucket_start).total_seconds()
    random_seconds = rng.uniform(0, bucket_duration, n_transactions)
    timestamps = [bucket_start + timedelta(seconds=s) for s in random_seconds]

    # Sort by timestamp
    df = pd.DataFrame({
        'timestamp': timestamps,
        'amount': amounts.round(2)
    }).sort_values('timestamp').reset_index(drop=True)

    return df


def generate_from_outcome_curve(
    curve: OutcomeCurve,
    start_date: Optional[datetime] = None,
    seed: int = 42
) -> pd.DataFrame:
    """Generate a full transaction dataset from an outcome curve.

    Args:
        curve: The target outcome curve
        start_date: Start date (defaults to today minus curve duration)
        seed: Random seed for reproducibility

    Returns:
        DataFrame with columns: [id, timestamp, amount] where
        SUM(amount) GROUP BY period = the drawn curve
    """
    rng = np.random.default_rng(seed)

    n_periods = len(curve.points)

    # Determine bucket duration
    if curve.time_unit == "day":
        bucket_delta = timedelta(days=1)
    elif curve.time_unit == "week":
        bucket_delta = timedelta(weeks=1)
    elif curve.time_unit == "month":
        bucket_delta = timedelta(days=30)  # Approximate
    else:
        bucket_delta = timedelta(days=1)

    # Set start date
    if start_date is None:
        start_date = datetime.now() - (bucket_delta * n_periods)

    all_transactions = []

    for i, point in enumerate(curve.points):
        bucket_start = start_date + (bucket_delta * i)
        bucket_end = bucket_start + bucket_delta

        transactions = generate_transactions_for_bucket(
            target_value=point.value,
            bucket_start=bucket_start,
            bucket_end=bucket_end,
            avg_transaction=curve.avg_transaction_value,
            min_transactions=curve.min_transactions_per_period,
            max_transactions=curve.max_transactions_per_period,
            rng=rng
        )

        all_transactions.append(transactions)

    # Combine all transactions
    df = pd.concat(all_transactions, ignore_index=True)
    df.insert(0, 'id', range(1, len(df) + 1))

    return df


def verify_curve_match(
    transactions: pd.DataFrame,
    curve: OutcomeCurve,
    start_date: datetime
) -> Dict[str, Any]:
    """Verify that generated transactions aggregate to match the target curve.

    Returns:
        Dict with 'match_score', 'expected', 'actual', 'error_pct'
    """
    n_periods = len(curve.points)
    expected = np.array([p.value for p in curve.points])

    # Determine bucket duration
    if curve.time_unit == "day":
        bucket_delta = timedelta(days=1)
    elif curve.time_unit == "week":
        bucket_delta = timedelta(weeks=1)
    else:  # month
        bucket_delta = timedelta(days=30)

    # Assign each transaction to a bucket index based on time offset from start
    def get_bucket_index(ts):
        offset = (ts - start_date).total_seconds()
        bucket_seconds = bucket_delta.total_seconds()
        return min(int(offset / bucket_seconds), n_periods - 1)

    transactions = transactions.copy()
    transactions['bucket_idx'] = transactions['timestamp'].apply(get_bucket_index)

    # Aggregate by bucket index
    actual_by_bucket = transactions.groupby('bucket_idx')['amount'].sum()

    # Build actual array matching expected length
    actual = np.zeros(n_periods)
    for idx, val in actual_by_bucket.items():
        if 0 <= idx < n_periods:
            actual[idx] = val

    # Calculate match score
    error_pct = np.abs(actual - expected) / np.maximum(expected, 1) * 100
    avg_error = error_pct.mean()
    match_score = max(0, 100 - avg_error)

    return {
        'match_score': round(match_score, 2),
        'expected': expected.tolist(),
        'actual': actual.tolist(),
        'error_pct': error_pct.tolist(),
        'avg_error_pct': round(avg_error, 2)
    }


# ============ Preset Curve Shapes ============

def get_curve_presets() -> Dict[str, List[float]]:
    """Get preset curve shapes for common business patterns."""
    return {
        "Linear Growth": [100, 120, 140, 160, 180, 200, 220, 240, 260, 280, 300, 320],
        "Exponential Growth": [100, 115, 132, 152, 175, 201, 231, 266, 306, 352, 405, 466],
        "Hockey Stick": [100, 102, 105, 108, 112, 118, 140, 180, 250, 350, 500, 700],
        "Seasonal (Retail)": [100, 80, 70, 90, 100, 120, 110, 100, 130, 160, 200, 300],
        "SaaS Growth": [10, 18, 30, 50, 80, 120, 170, 230, 300, 380, 470, 570],
        "Churn Decline": [1000, 920, 850, 790, 740, 700, 665, 635, 610, 590, 575, 560],
        "V-shaped Recovery": [100, 80, 60, 50, 45, 50, 65, 85, 110, 140, 170, 200],
        "Plateau": [100, 150, 200, 240, 270, 290, 300, 305, 308, 310, 311, 312],
    }


def create_curve_from_preset(
    preset_name: str,
    metric_name: str = "revenue",
    time_unit: str = "month",
    start_date: datetime = None,
    scale: float = 1000  # Multiply preset values by this
) -> OutcomeCurve:
    """Create an OutcomeCurve from a preset shape."""
    presets = get_curve_presets()
    values = presets.get(preset_name, presets["Linear Growth"])

    if start_date is None:
        start_date = datetime.now() - timedelta(days=30 * len(values))

    if time_unit == "day":
        delta = timedelta(days=1)
    elif time_unit == "week":
        delta = timedelta(weeks=1)
    else:
        delta = timedelta(days=30)

    points = [
        CurvePoint(
            timestamp=start_date + delta * i,
            value=v * scale
        )
        for i, v in enumerate(values)
    ]

    return OutcomeCurve(
        metric_name=metric_name,
        time_unit=time_unit,
        points=points
    )
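A minimal end-to-end sketch of the outcome-curve flow described in the module docstring above (preset → individual transactions → verification). The import path is assumed from the misata/studio/outcome_curve.py entry in the file list; the preset name, start date, and scale are illustrative:

from datetime import datetime

from misata.studio.outcome_curve import (  # assumed path
    create_curve_from_preset,
    generate_from_outcome_curve,
    verify_curve_match,
)

start = datetime(2024, 1, 1)

# 12-month "Hockey Stick" revenue target, scaled from preset units to dollars.
curve = create_curve_from_preset(
    "Hockey Stick",
    metric_name="revenue",
    time_unit="month",
    start_date=start,
    scale=1000,
)

# Generate transactions whose per-month sums trace the target curve.
transactions = generate_from_outcome_curve(curve, start_date=start, seed=42)

# Check the aggregation against the drawn targets; passing the same explicit
# start date keeps the verification buckets aligned with generation.
report = verify_curve_match(transactions, curve, start_date=start)
print(report["match_score"], report["avg_error_pct"])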
@@ -0,0 +1,55 @@
import streamlit as st
from datetime import datetime

class StudioStore:
    """Centralized state management for Misata Studio."""

    @staticmethod
    def init():
        """Initialize all session state variables with smart defaults."""
        defaults = {
            # Navigation
            "active_tab": "Schema",
            "sidebar_expanded": True,

            # Data & Schema
            "schema_config": None,
            "schema_source": "Template",  # "Template" or "AI"
            "warehouse_schema": {
                "type": "service_company",
                "customer_count": 500,
                "project_count": 2000
            },

            # Constraint Configuration
            "selected_constraint": None,  # e.g. "invoices.amount"
            "warehouse_curve": [100000] * 12,  # Default annual curve
            "start_date_input": datetime.now().date(),

            # Generation Config
            "warehouse_config": {
                "avg_transaction": 50.0,
                "seed": 42,
                "tier_distribution": [0.5, 0.3, 0.2]
            },

            # Results
            "generated_warehouse": None,
            "warehouse_generated": False
        }

        for key, default_val in defaults.items():
            if key not in st.session_state:
                st.session_state[key] = default_val

    @staticmethod
    def get(key, default=None):
        return st.session_state.get(key, default)

    @staticmethod
    def set(key, value):
        st.session_state[key] = value

    @property
    def schema(self):
        return st.session_state.get("schema_config")
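Finally, a minimal sketch of how this store might be wired into a Streamlit page. The import path is an assumption based on the misata/studio/state/store.py entry in the file list (the hunk could equally be the 55-line misata/studio/__init__.py), and the snippet only runs inside a live Streamlit session:

import streamlit as st

from misata.studio.state.store import StudioStore  # assumed path

# Seed session state with defaults before any widget reads it.
StudioStore.init()

seed = StudioStore.get("warehouse_config")["seed"]
st.write(f"Current generation seed: {seed}")

# "Generate" is an illustrative tab name.
if st.button("Go to Generate"):
    StudioStore.set("active_tab", "Generate")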