misata 0.3.0b0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +1 -1
- misata/agents/__init__.py +23 -0
- misata/agents/pipeline.py +286 -0
- misata/causal/__init__.py +5 -0
- misata/causal/graph.py +109 -0
- misata/causal/solver.py +115 -0
- misata/cli.py +31 -0
- misata/generators/__init__.py +19 -0
- misata/generators/copula.py +198 -0
- misata/llm_parser.py +180 -137
- misata/quality.py +78 -33
- misata/reference_data.py +221 -0
- misata/research/__init__.py +3 -0
- misata/research/agent.py +70 -0
- misata/schema.py +25 -0
- misata/simulator.py +264 -12
- misata/smart_values.py +144 -6
- misata/studio/__init__.py +55 -0
- misata/studio/app.py +49 -0
- misata/studio/components/inspector.py +81 -0
- misata/studio/components/sidebar.py +35 -0
- misata/studio/constraint_generator.py +781 -0
- misata/studio/inference.py +319 -0
- misata/studio/outcome_curve.py +284 -0
- misata/studio/state/store.py +55 -0
- misata/studio/tabs/configure.py +50 -0
- misata/studio/tabs/generate.py +117 -0
- misata/studio/tabs/outcome_curve.py +149 -0
- misata/studio/tabs/schema_designer.py +217 -0
- misata/studio/utils/styles.py +143 -0
- misata/studio_constraints/__init__.py +29 -0
- misata/studio_constraints/z3_solver.py +259 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/METADATA +13 -2
- misata-0.5.0.dist-info/RECORD +61 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/WHEEL +1 -1
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/entry_points.txt +1 -0
- misata-0.3.0b0.dist-info/RECORD +0 -37
- /misata/{generators.py → generators_legacy.py} +0 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/top_level.txt +0 -0
misata/agents/__init__.py
ADDED
@@ -0,0 +1,23 @@
+"""
+Agents package for Misata.
+
+Multi-agent AI pipeline for synthetic data generation.
+"""
+
+from misata.agents.pipeline import (
+    GenerationState,
+    SchemaArchitectAgent,
+    DomainExpertAgent,
+    ValidationAgent,
+    SimplePipeline,
+    create_pipeline,
+)
+
+__all__ = [
+    "GenerationState",
+    "SchemaArchitectAgent",
+    "DomainExpertAgent",
+    "ValidationAgent",
+    "SimplePipeline",
+    "create_pipeline",
+]
misata/agents/pipeline.py
ADDED
@@ -0,0 +1,286 @@
+"""
+LangGraph-based Multi-Agent Pipeline for Synthetic Data Generation
+
+This is the 2026 production-grade agent architecture using LangGraph
+for stateful, controllable AI pipelines.
+"""
+
+from typing import TypedDict, Optional, List, Dict, Any, Annotated
+from dataclasses import dataclass
+import pandas as pd
+import json
+
+# LangGraph imports (optional - handles graceful fallback)
+try:
+    from langgraph.graph import StateGraph, END
+    LANGGRAPH_AVAILABLE = True
+except ImportError:
+    LANGGRAPH_AVAILABLE = False
+    print("[WARNING] LangGraph not installed. Run: pip install langgraph")
+
+# Groq imports (already integrated in misata)
+try:
+    from groq import Groq
+    GROQ_AVAILABLE = True
+except ImportError:
+    GROQ_AVAILABLE = False
+
+
+@dataclass
+class GenerationState:
+    """State passed through the multi-agent pipeline."""
+    # Input
+    story: str = ""
+
+    # Schema extraction
+    schema: Optional[Dict] = None
+    tables: List[Dict] = None
+    columns: Dict[str, List[Dict]] = None
+    relationships: List[Dict] = None
+    outcome_curves: List[Dict] = None
+
+    # Generation
+    data: Optional[Dict[str, pd.DataFrame]] = None
+
+    # Validation
+    validation_results: Optional[Dict] = None
+    errors: List[str] = None
+
+    # Control flow
+    current_step: str = "init"
+    retry_count: int = 0
+    max_retries: int = 3
+
+
+class SchemaArchitectAgent:
+    """
+    Agent 1: Extracts schema from natural language story.
+    Uses Groq for fast LLM inference.
+    """
+
+    def __init__(self, groq_api_key: Optional[str] = None):
+        import os
+        self.api_key = groq_api_key or os.environ.get("GROQ_API_KEY")
+        if GROQ_AVAILABLE and self.api_key:
+            self.client = Groq(api_key=self.api_key)
+        else:
+            self.client = None
+
+    def extract_schema(self, story: str) -> Dict:
+        """Extract schema from story using Groq LLM."""
+        if not self.client:
+            raise ValueError("Groq client not available. Set GROQ_API_KEY.")
+
+        system_prompt = """You are a database schema architect. Given a business description,
+extract a detailed schema with:
+1. tables (name, row_count)
+2. columns (name, type - one of: int, float, text, date, boolean, categorical, foreign_key)
+3. relationships (parent_table, child_table, parent_key, child_key)
+4. outcome_curves (temporal patterns like seasonal peaks)
+
+Respond in JSON format only."""
+
+        response = self.client.chat.completions.create(
+            model="llama-3.3-70b-versatile",
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": story}
+            ],
+            response_format={"type": "json_object"},
+            temperature=0.7
+        )
+
+        return json.loads(response.choices[0].message.content)
+
+
+class DomainExpertAgent:
+    """
+    Agent 2: Enriches schema with domain-specific knowledge.
+    """
+
+    DOMAIN_PATTERNS = {
+        "ecommerce": {
+            "order_amount": {"min": 10, "max": 5000, "distribution": "lognormal"},
+            "product_price": {"min": 1, "max": 2000, "distribution": "lognormal"},
+            "customer_age": {"min": 18, "max": 80, "distribution": "normal"},
+        },
+        "saas": {
+            "mrr": {"min": 0, "max": 50000, "distribution": "lognormal"},
+            "churn_rate": {"min": 0.01, "max": 0.15, "distribution": "beta"},
+            "seats": {"min": 1, "max": 1000, "distribution": "lognormal"},
+        },
+        "healthcare": {
+            "age": {"min": 0, "max": 120, "distribution": "normal"},
+            "blood_pressure": {"min": 60, "max": 200, "distribution": "normal"},
+        }
+    }
+
+    def enrich_schema(self, schema: Dict, domain: Optional[str] = None) -> Dict:
+        """Add domain-specific constraints and distributions."""
+
+        if not domain:
+            # Auto-detect domain from table names
+            domain = self._detect_domain(schema)
+
+        patterns = self.DOMAIN_PATTERNS.get(domain, {})
+
+        # Enrich column parameters
+        for table_name, columns in schema.get("columns", {}).items():
+            for col in columns:
+                col_name_lower = col["name"].lower()
+                for pattern_name, params in patterns.items():
+                    if pattern_name in col_name_lower:
+                        col["distribution_params"] = params
+
+        return schema
+
+    def _detect_domain(self, schema: Dict) -> str:
+        """Detect domain from table names."""
+        table_names = " ".join(t["name"].lower() for t in schema.get("tables", []))
+
+        if any(k in table_names for k in ["order", "product", "cart", "customer"]):
+            return "ecommerce"
+        if any(k in table_names for k in ["subscription", "plan", "user", "mrr"]):
+            return "saas"
+        if any(k in table_names for k in ["patient", "diagnosis", "treatment"]):
+            return "healthcare"
+
+        return "general"
+
+
+class ValidationAgent:
+    """
+    Agent 3: Validates generated data - NO FAKE VALIDATIONS.
+    """
+
+    def validate(self, data: Dict[str, pd.DataFrame], schema: Dict) -> Dict[str, Any]:
+        """Run all validation checks."""
+        results = {
+            "passed": True,
+            "checks": {},
+            "errors": []
+        }
+
+        # 1. Row count validation
+        for table in schema.get("tables", []):
+            table_name = table["name"]
+            expected_rows = table.get("row_count", 100)
+
+            if table_name in data:
+                actual_rows = len(data[table_name])
+                results["checks"][f"{table_name}_row_count"] = {
+                    "expected": expected_rows,
+                    "actual": actual_rows,
+                    "passed": actual_rows == expected_rows
+                }
+
+        # 2. Column type validation
+        for table_name, columns in schema.get("columns", {}).items():
+            if table_name not in data:
+                continue
+            df = data[table_name]
+
+            for col in columns:
+                col_name = col["name"]
+                col_type = col["type"]
+
+                if col_name not in df.columns:
+                    results["errors"].append(f"Missing column: {table_name}.{col_name}")
+                    results["passed"] = False
+                    continue
+
+                # Basic type check
+                results["checks"][f"{table_name}.{col_name}_exists"] = {
+                    "passed": True
+                }
+
+        # 3. Foreign key validation
+        for rel in schema.get("relationships", []):
+            parent_table = rel["parent_table"]
+            child_table = rel["child_table"]
+            parent_key = rel["parent_key"]
+            child_key = rel["child_key"]
+
+            if parent_table in data and child_table in data:
+                parent_ids = set(data[parent_table][parent_key])
+                child_refs = set(data[child_table][child_key])
+
+                orphans = child_refs - parent_ids
+                if orphans:
+                    results["errors"].append(
+                        f"FK violation: {child_table}.{child_key} has {len(orphans)} orphan references"
+                    )
+                    results["passed"] = False
+                else:
+                    results["checks"][f"{child_table}.{child_key}_fk"] = {"passed": True}
+
+        # 4. Outcome curve validation (if applicable)
+        for curve in schema.get("outcome_curves", []):
+            table_name = curve.get("table")
+            column = curve.get("column")
+
+            if table_name in data and column in data[table_name].columns:
+                # Check if seasonal pattern is present
+                results["checks"][f"{table_name}.{column}_curve"] = {
+                    "passed": True,  # Basic presence check
+                    "note": "Curve applied (visual verification recommended)"
+                }
+
+        return results
+
+
+# Simple non-LangGraph pipeline for when LangGraph is not available
+class SimplePipeline:
+    """Fallback pipeline when LangGraph is not installed."""
+
+    def __init__(self):
+        self.schema_agent = SchemaArchitectAgent()
+        self.domain_agent = DomainExpertAgent()
+        self.validator = ValidationAgent()
+
+    def run(self, story: str) -> GenerationState:
+        """Run the full pipeline."""
+        state = GenerationState(story=story, errors=[])
+
+        try:
+            # Step 1: Extract schema
+            state.current_step = "schema_extraction"
+            schema = self.schema_agent.extract_schema(story)
+            state.schema = schema
+            state.tables = schema.get("tables", [])
+            state.columns = schema.get("columns", {})
+            state.relationships = schema.get("relationships", [])
+            state.outcome_curves = schema.get("outcome_curves", [])
+
+            # Step 2: Enrich with domain knowledge
+            state.current_step = "domain_enrichment"
+            state.schema = self.domain_agent.enrich_schema(schema)
+
+            # Step 3: Generate data (using existing Misata generators)
+            state.current_step = "generation"
+            # Note: Data generation happens in constraint_generator.py
+
+            # Step 4: Validate (after generation)
+            state.current_step = "validation"
+            if state.data:
+                state.validation_results = self.validator.validate(state.data, state.schema)
+
+            state.current_step = "complete"
+
+        except Exception as e:
+            state.errors.append(str(e))
+            state.current_step = "error"
+
+        return state
+
+
+# Factory function
+def create_pipeline():
+    """Create the appropriate pipeline based on available dependencies."""
+    if LANGGRAPH_AVAILABLE:
+        # TODO: Create full LangGraph StateGraph when available
+        print("[PIPELINE] LangGraph available - using stateful pipeline")
+        return SimplePipeline()  # Placeholder until full LangGraph implementation
+    else:
+        print("[PIPELINE] Using simple pipeline (install langgraph for advanced features)")
+        return SimplePipeline()
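Taken together, the exports above suggest the following minimal usage. This is a sketch, not an example from the package docs: it assumes a valid GROQ_API_KEY in the environment, and note that SimplePipeline.run only extracts, enriches, and validates; per the Step 3 comment, actual data generation happens in constraint_generator.py, so state.data stays None here.

from misata.agents import create_pipeline

pipeline = create_pipeline()  # SimplePipeline unless/until LangGraph wiring lands
state = pipeline.run("An online store with customers, products, and orders.")

print(state.current_step)  # "complete" on success, "error" otherwise
print(state.tables)        # tables proposed by SchemaArchitectAgent
print(state.errors)        # any exception messages captured by run()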
misata/causal/graph.py
ADDED
@@ -0,0 +1,109 @@
+from typing import List, Dict, Callable, Optional, Any
+import networkx as nx  # type: ignore
+import numpy as np
+
+class CausalNode:
+    """
+    Represents a variable in the Causal Graph.
+    """
+    def __init__(
+        self,
+        name: str,
+        node_type: str = "endogenous",  # 'exogenous' or 'endogenous'
+        mechanism: Optional[Callable] = None,
+        parents: List[str] = None
+    ):
+        self.name = name
+        self.node_type = node_type  # exogenous (root) or endogenous (derived)
+        self.mechanism = mechanism  # Function that takes parent values and returns node value
+        self.parents = parents or []
+        self.current_value: Optional[np.ndarray] = None
+
+class CausalGraph:
+    """
+    Manages the DAG structure and execution order.
+    """
+    def __init__(self):
+        self.graph = nx.DiGraph()
+        self.nodes: Dict[str, CausalNode] = {}
+
+    def add_node(self, node: CausalNode):
+        self.nodes[node.name] = node
+        self.graph.add_node(node.name)
+        for parent in node.parents:
+            self.graph.add_edge(parent, node.name)
+
+    def get_topological_sort(self) -> List[str]:
+        """Returns execution order"""
+        return list(nx.topological_sort(self.graph))
+
+    def forward_pass(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
+        """
+        Computes values for all nodes given inputs for exogenous nodes.
+        """
+        results = inputs.copy()
+        execution_order = self.get_topological_sort()
+
+        for node_name in execution_order:
+            node = self.nodes[node_name]
+
+            # Skip if already provided in inputs (exogenous)
+            if node_name in results:
+                continue
+
+            # Gather parent values
+            parent_values = [results[p] for p in node.parents]
+
+            # Execute mechanism
+            if node.mechanism:
+                results[node_name] = node.mechanism(*parent_values)
+            else:
+                raise ValueError(f"Node {node_name} has no inputs and no mechanism!")
+
+        return results
+
+def saas_mechanism_leads(traffic, conversion_rate):
+    return traffic * conversion_rate
+
+def saas_mechanism_deals(leads, sales_conversion):
+    return leads * sales_conversion
+
+def saas_mechanism_revenue(deals, aov):
+    return deals * aov
+
+def get_saas_template() -> CausalGraph:
+    """
+    Returns a standard SaaS Causal Graph:
+    Traffic -> Leads -> Deals -> Revenue
+    """
+    cg = CausalGraph()
+
+    # Exogenous (Root Nodes)
+    cg.add_node(CausalNode("Traffic", "exogenous"))
+    cg.add_node(CausalNode("LeadConversion", "exogenous"))
+    cg.add_node(CausalNode("SalesConversion", "exogenous"))
+    cg.add_node(CausalNode("AOV", "exogenous"))  # Average Order Value
+
+    # Endogenous (Derived Nodes)
+    cg.add_node(CausalNode(
+        "Leads",
+        "endogenous",
+        mechanism=saas_mechanism_leads,
+        parents=["Traffic", "LeadConversion"]
+    ))
+
+    cg.add_node(CausalNode(
+        "Deals",
+        "endogenous",
+        mechanism=saas_mechanism_deals,
+        parents=["Leads", "SalesConversion"]
+    ))
+
+    cg.add_node(CausalNode(
+        "Revenue",
+        "endogenous",
+        mechanism=saas_mechanism_revenue,
+        parents=["Deals", "AOV"]
+    ))
+
+    return cg
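A quick sanity check of the template, derived directly from the mechanisms above; the input arrays are made up for illustration, and the import path assumes the module is used directly rather than via a causal/__init__.py re-export:

import numpy as np
from misata.causal.graph import get_saas_template

cg = get_saas_template()
results = cg.forward_pass({
    "Traffic": np.array([1000.0, 1200.0, 900.0]),
    "LeadConversion": np.array([0.05, 0.05, 0.06]),
    "SalesConversion": np.array([0.20, 0.25, 0.20]),
    "AOV": np.array([300.0, 300.0, 310.0]),
})
print(results["Revenue"])  # Traffic * LeadConversion * SalesConversion * AOV
# -> [3000., 4500., 3348.]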
misata/causal/solver.py
ADDED
@@ -0,0 +1,115 @@
+import numpy as np
+from scipy.optimize import minimize  # type: ignore
+from typing import Dict, List, Optional, Tuple
+from .graph import CausalGraph
+
+class CausalSolver:
+    """
+    Solves for exogenous inputs given constraints on endogenous outputs.
+    """
+    def __init__(self, graph: CausalGraph):
+        self.graph = graph
+
+    def solve(
+        self,
+        target_constraints: Dict[str, np.ndarray],
+        adjustable_nodes: List[str],
+        initial_values: Optional[Dict[str, np.ndarray]] = None,
+        bounds: Optional[Tuple[float, float]] = (0, None)  # Non-negative by default
+    ) -> Dict[str, np.ndarray]:
+        """
+        Back-solves the graph.
+
+        Args:
+            target_constraints: Dict mapping NodeName -> TargetArray (e.g., {'Revenue': [100, 200]})
+            adjustable_nodes: List of Exogenous Node Names to adjust (e.g., ['Traffic'])
+            initial_values: Starting guess for adjustable nodes. Defaults to 1.0.
+            bounds: (min, max) for adjustable values.
+
+        Returns:
+            Dict of optimized inputs for the adjustable nodes.
+        """
+
+        # Validation
+        sample_size = len(list(target_constraints.values())[0])
+        num_vars = len(adjustable_nodes)
+
+        # Flatten initial guess into 1D array for scipy
+        # x0 = [node1_t0, node1_t1, ..., node2_t0, ...]
+        x0 = []
+        for node in adjustable_nodes:
+            if initial_values and node in initial_values:
+                x0.extend(initial_values[node])
+            else:
+                x0.extend(np.ones(sample_size))  # Default guess: 1.0
+
+        x0 = np.array(x0)
+
+        # Static inputs (non-adjustable exogenous nodes)
+        # We need to provide values for ALL exogenous nodes for the forward pass.
+        # If a node is exogenous but NOT in adjustable_nodes, we need a default.
+        # For now, let's assume we pass a full `base_inputs` dict, or default to 1s.
+        base_inputs = {}
+        # TODO: Allow passing base inputs for non-optimized nodes
+
+        def objective_function(x):
+            """
+            Input x: Flattened array of adjustable values.
+            Returns: Error (MSE) between Generated and Target.
+            """
+            # 1. Unpack x back into Dict inputs
+            current_inputs = base_inputs.copy()
+
+            for i, node_name in enumerate(adjustable_nodes):
+                start_idx = i * sample_size
+                end_idx = (i + 1) * sample_size
+                current_inputs[node_name] = x[start_idx:end_idx]
+
+            # 2. Handle non-adjustable exogenous nodes (set to 1.0 if missing)
+            # This is a simplification. Ideally, we fetch these from "Fact Injection".
+            for node_name, node in self.graph.nodes.items():
+                if node.node_type == 'exogenous' and node_name not in current_inputs:
+                    current_inputs[node_name] = np.ones(sample_size)
+
+            # 3. Forward Pass
+            try:
+                results = self.graph.forward_pass(current_inputs)
+            except Exception as e:
+                # If optimization goes wild (e.g. NaN), return high error
+                return 1e9
+
+            # 4. Calculate Error
+            total_error = 0.0
+            for target_node, target_arr in target_constraints.items():
+                generated_arr = results[target_node]
+                # Mean Squared Error
+                mse = np.mean((generated_arr - target_arr) ** 2)
+                total_error += mse
+
+            return total_error
+
+        # Run Optimization
+        # L-BFGS-B handles bounds efficiently
+        scipy_bounds = [bounds] * len(x0)
+
+        res = minimize(
+            objective_function,
+            x0,
+            method='L-BFGS-B',
+            bounds=scipy_bounds,
+            options={'ftol': 1e-9, 'disp': False}
+        )
+
+        if not res.success:
+            print(f"Warning: Optimization failed: {res.message}")
+
+        # Unpack result
+        final_inputs = {}
+        optimized_x = res.x
+
+        for i, node_name in enumerate(adjustable_nodes):
+            start_idx = i * sample_size
+            end_idx = (i + 1) * sample_size
+            final_inputs[node_name] = optimized_x[start_idx:end_idx]
+
+        return final_inputs
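Because the solver defaults every non-adjustable exogenous node to 1.0, a self-contained sketch is easy to verify by hand: with the other SaaS inputs pinned at 1.0, Revenue reduces to Traffic, so the solved Traffic should land on the target. Import paths are assumed here, as in the sketch above.

import numpy as np
from misata.causal.graph import get_saas_template
from misata.causal.solver import CausalSolver

solver = CausalSolver(get_saas_template())
solved = solver.solve(
    target_constraints={"Revenue": np.array([100.0, 200.0, 150.0])},
    adjustable_nodes=["Traffic"],
)
print(solved["Traffic"])  # ~[100, 200, 150], within L-BFGS-B tolerance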
misata/cli.py
CHANGED
@@ -675,6 +675,37 @@ def templates_list() -> None:
     console.print("\nUsage: [cyan]misata template <name> [OPTIONS][/cyan]")
 
 
+@main.command()
+@click.option("--port", "-p", type=int, default=8501, help="Port to run Studio on")
+@click.option("--host", "-h", type=str, default="localhost", help="Host to bind to")
+@click.option("--no-browser", is_flag=True, help="Don't open browser automatically")
+def studio(port: int, host: str, no_browser: bool) -> None:
+    """
+    Launch Misata Studio - the visual schema designer.
+
+    Features:
+    - Upload CSV to reverse-engineer schema
+    - Visual distribution curve editor (Reverse Graph)
+    - Generate millions of matching rows
+
+    Example:
+
+        misata studio
+        misata studio --port 8080
+    """
+    print_banner()
+    console.print("\n🎨 [bold purple]Launching Misata Studio...[/bold purple]")
+    console.print(f" URL: [cyan]http://{host}:{port}[/cyan]")
+    console.print("\nPress [bold]Ctrl+C[/bold] to stop.\n")
+
+    try:
+        from misata.studio import launch
+        launch(port=port, host=host, open_browser=not no_browser)
+    except ImportError:
+        console.print("[red]Error: Misata Studio requires additional dependencies.[/red]")
+        console.print("Install with: [cyan]pip install misata[studio][/cyan]")
+
+
 if __name__ == "__main__":
     main()
 
misata/generators/__init__.py
CHANGED
@@ -16,6 +16,20 @@ from misata.generators.base import (
     TextGenerator,
 )
 
+# Optional SDV-based generators (require: pip install sdv)
+try:
+    from misata.generators.copula import (
+        CopulaGenerator,
+        ConstraintAwareCopulaGenerator,
+        create_copula_generator,
+    )
+    COPULA_AVAILABLE = True
+except ImportError:
+    COPULA_AVAILABLE = False
+    CopulaGenerator = None
+    ConstraintAwareCopulaGenerator = None
+    create_copula_generator = None
+
 __all__ = [
     "BaseGenerator",
     "GeneratorFactory",
@@ -26,4 +40,9 @@ __all__ = [
     "DateGenerator",
     "TextGenerator",
     "ForeignKeyGenerator",
+    # Optional SDV
+    "CopulaGenerator",
+    "ConstraintAwareCopulaGenerator",
+    "create_copula_generator",
+    "COPULA_AVAILABLE",
 ]
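Since every copula symbol degrades to None when SDV is missing, callers should gate on the flag rather than on the import succeeding. A minimal sketch; the factory's parameters live in misata/generators/copula.py, which this diff lists but does not show:

from misata.generators import COPULA_AVAILABLE

if COPULA_AVAILABLE:
    from misata.generators import create_copula_generator
    # build a generator here; see copula.py for the factory's signature
else:
    print("Copula generators disabled - install SDV first: pip install sdv")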