misata 0.1.0b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
misata/llm_parser.py ADDED
"""
LLM-powered schema generator using Groq Llama 3.3.

This module provides intelligent schema generation from natural language,
including:
- Reference tables with actual LLM-generated data (exercises, plans, meals)
- Transactional tables with foreign keys to reference tables
- Industry-realistic column configurations
"""

import json
import os
from pathlib import Path
from typing import Dict, Optional

from groq import Groq

from misata.curve_fitting import CurveFitter
from misata.schema import Column, Relationship, ScenarioEvent, SchemaConfig, Table


# Load .env file if it exists
def _load_env():
    """Load environment variables from the first .env file found."""
    env_paths = [
        Path.cwd() / ".env",
        Path(__file__).parent.parent / ".env",
        Path.home() / ".misata" / ".env",
    ]

    for env_path in env_paths:
        if env_path.exists():
            with open(env_path) as f:
                for line in f:
                    line = line.strip()
                    if line and not line.startswith("#") and "=" in line:
                        key, _, value = line.partition("=")
                        os.environ.setdefault(key.strip(), value.strip())
            break

_load_env()
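# A .env file is plain KEY=value lines; a minimal sketch of what the loader
# above accepts (values are illustrative placeholders, not real keys):
#
#   # comments and blank lines are skipped
#   GROQ_API_KEY=gsk_example_not_a_real_key
#   MISATA_PROVIDER=groq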


SYSTEM_PROMPT = """You are Misata, an expert synthetic data architect. Generate realistic database schemas with TWO types of tables:

## TABLE TYPES

### 1. REFERENCE TABLES (is_reference: true)
Small lookup tables with ACTUAL DATA you generate. Include realistic rows.
Examples: plans, exercises, categories, products, meal_types

For reference tables, provide:
- is_reference: true
- inline_data: Array of actual rows with realistic values

### 2. TRANSACTIONAL TABLES (is_reference: false)
Large tables generated by code using foreign keys to reference tables.
Examples: users, subscriptions, orders, workouts, payments

For transactional tables, provide:
- row_count: Number of rows to generate
- Columns with distribution parameters

## CRITICAL RULES

### Reference Table Requirements:
- ALWAYS include an "id" column (integer, sequential from 1)
- Provide 5-20 realistic rows in inline_data
- Prices in reference tables are the SOURCE OF TRUTH

### Transactional Table Requirements:
- Use foreign_key type to reference parent tables (reference tables or other transactional parents)
- Users: type="text" with text_type="name" or "email"
- Metrics use distribution parameters

### Foreign Key Rules:
- foreign_key columns reference the parent table's "id" column
- The parent can be either a reference table (plans.id) or a transactional table (users.id)

### Advanced Distributions (Optional):
Instead of guessing parameters, you can provide "control_points" to draw the shape.
Format: {"distribution": "normal", "control_points": [{"x": 10, "y": 0.1}, {"x": 50, "y": 0.9}]}
Misata will mathematically solve for the best parameters.

## OUTPUT FORMAT

{
  "name": "Dataset Name",
  "description": "Description",
  "seed": 42,
  "tables": [
    {
      "name": "plans",
      "is_reference": true,
      "inline_data": [
        {"id": 1, "name": "Free", "price": 0.0, "features": "Basic features"},
        {"id": 2, "name": "Basic", "price": 9.99, "features": "All free + analytics"},
        {"id": 3, "name": "Premium", "price": 19.99, "features": "All basic + priority support"},
        {"id": 4, "name": "Enterprise", "price": 49.99, "features": "All premium + custom integrations"}
      ]
    },
    {
      "name": "exercises",
      "is_reference": true,
      "inline_data": [
        {"id": 1, "name": "Running", "category": "Cardio", "calories_per_minute": 10},
        {"id": 2, "name": "Cycling", "category": "Cardio", "calories_per_minute": 8},
        {"id": 3, "name": "Yoga", "category": "Flexibility", "calories_per_minute": 3},
        {"id": 4, "name": "Weightlifting", "category": "Strength", "calories_per_minute": 6},
        {"id": 5, "name": "Swimming", "category": "Cardio", "calories_per_minute": 9},
        {"id": 6, "name": "HIIT", "category": "Cardio", "calories_per_minute": 12},
        {"id": 7, "name": "Pilates", "category": "Flexibility", "calories_per_minute": 4},
        {"id": 8, "name": "Boxing", "category": "Cardio", "calories_per_minute": 11}
      ]
    },
    {
      "name": "users",
      "row_count": 50000,
      "is_reference": false
    },
    {
      "name": "subscriptions",
      "row_count": 20000,
      "is_reference": false
    },
    {
      "name": "workouts",
      "row_count": 100000,
      "is_reference": false
    }
  ],
  "columns": {
    "users": [
      {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 50000}, "unique": true},
      {"name": "name", "type": "text", "distribution_params": {"text_type": "name"}},
      {"name": "email", "type": "text", "distribution_params": {"text_type": "email"}},
      {"name": "age", "type": "int", "distribution_params": {"distribution": "uniform", "min": 18, "max": 65}}
    ],
    "subscriptions": [
      {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 20000}},
      {"name": "user_id", "type": "foreign_key", "distribution_params": {}},
      {"name": "plan_id", "type": "foreign_key", "distribution_params": {}},
      {"name": "status", "type": "categorical", "distribution_params": {"choices": ["active", "cancelled", "paused"], "probabilities": [0.7, 0.2, 0.1]}},
      {"name": "start_date", "type": "date", "distribution_params": {"start": "2022-01-01", "end": "2024-12-31"}}
    ],
    "workouts": [
      {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 100000}},
      {"name": "user_id", "type": "foreign_key", "distribution_params": {}},
      {"name": "exercise_id", "type": "foreign_key", "distribution_params": {}},
      {"name": "duration_minutes", "type": "int", "distribution_params": {"distribution": "uniform", "min": 15, "max": 90}},
      {"name": "date", "type": "date", "distribution_params": {"start": "2023-01-01", "end": "2024-12-31"}}
    ]
  },
  "relationships": [
    {"parent_table": "users", "child_table": "subscriptions", "parent_key": "id", "child_key": "user_id"},
    {"parent_table": "plans", "child_table": "subscriptions", "parent_key": "id", "child_key": "plan_id"},
    {"parent_table": "users", "child_table": "workouts", "parent_key": "id", "child_key": "user_id"},
    {"parent_table": "exercises", "child_table": "workouts", "parent_key": "id", "child_key": "exercise_id"}
  ],
  "events": []
}

## KEY POINTS:
- Reference tables have ACTUAL DATA in inline_data (plans with real prices!)
- Transactional tables use foreign_key to REFERENCE those tables
- When workouts.exercise_id = 3, it means "Yoga" because the exercises table has {id: 3, name: "Yoga"}

Generate schemas following this exact pattern. The reference table inline_data is the source of truth."""


GRAPH_REVERSE_PROMPT = """You are Misata, an expert at reverse-engineering data patterns.
Given a description of a desired chart or graph pattern, generate a schema that will
produce data matching that EXACT pattern when plotted.

Follow the same two-tier table structure:
- Reference tables with inline_data for lookup values
- Transactional tables with foreign keys for mass data

The user will describe a chart they want. Your job is to generate data that,
when plotted, produces that exact chart."""


class LLMSchemaGenerator:
    """
    Generate realistic schemas from natural language using LLMs.

    Supports multiple providers:
    - groq: Groq Cloud (Llama 3.3) - Fast, free tier
    - openai: OpenAI (GPT-4o) - Best quality
    - ollama: Local Ollama - Free, private

    This is the "brain" of Misata - what makes it genuinely AI-powered.
    """

    # Provider configurations
    PROVIDERS = {
        "groq": {
            "base_url": None,  # Uses default
            "env_key": "GROQ_API_KEY",
            "default_model": "llama-3.3-70b-versatile",
        },
        "openai": {
            "base_url": None,
            "env_key": "OPENAI_API_KEY",
            "default_model": "gpt-4o-mini",
        },
        "ollama": {
            "base_url": "http://localhost:11434/v1",
            "env_key": None,  # No key needed for local
            "default_model": "llama3",
        },
    }
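    # Provider selection (sketch): the active provider comes from the
    # MISATA_PROVIDER env var unless passed explicitly, and each provider
    # reads its key from the env var named above, e.g.:
    #   MISATA_PROVIDER=ollama  -> local Ollama, no API key required
    #   GROQ_API_KEY=gsk_...    -> used when the provider is "groq" (the default)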

    def __init__(
        self,
        provider: Optional[str] = None,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        base_url: Optional[str] = None,
    ):
        """
        Initialize the LLM generator.

        Args:
            provider: LLM provider ("groq", "openai", "ollama").
                Defaults to MISATA_PROVIDER env var or "groq".
            api_key: API key. If not provided, reads from the provider's env var.
            model: Model name. If not provided, uses the provider default.
            base_url: Custom API base URL (for Ollama or compatible APIs).
        """
        # Determine provider (case-insensitive, whether passed in or read from env)
        self.provider = (provider or os.environ.get("MISATA_PROVIDER", "groq")).lower()

        if self.provider not in self.PROVIDERS:
            raise ValueError(f"Unknown provider: {self.provider}. Use: {list(self.PROVIDERS.keys())}")

        config = self.PROVIDERS[self.provider]

        # Get API key
        self.api_key = api_key
        if not self.api_key and config["env_key"]:
            self.api_key = os.environ.get(config["env_key"])

        if not self.api_key and self.provider != "ollama":
            env_key = config["env_key"]
            raise ValueError(
                f"{self.provider.title()} API key required. "
                f"Set {env_key} environment variable or pass api_key parameter."
            )

        # Set model
        self.model = model or config["default_model"]

        # Set base URL
        self.base_url = base_url or config["base_url"]

        # Initialize client (all providers use an OpenAI-compatible API)
        if self.provider == "groq":
            self.client = Groq(api_key=self.api_key)
        else:
            # OpenAI and Ollama use the openai package
            try:
                from openai import OpenAI
            except ImportError:
                raise ImportError(
                    f"openai package required for {self.provider}. "
                    "Install with: pip install openai"
                )

            client_kwargs = {}
            if self.api_key:
                client_kwargs["api_key"] = self.api_key
            if self.base_url:
                client_kwargs["base_url"] = self.base_url

            # Ollama doesn't need a real API key
            if self.provider == "ollama":
                client_kwargs["api_key"] = "ollama"

            self.client = OpenAI(**client_kwargs)
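    # Construction examples (sketch; model names fall back to the defaults above):
    #   LLMSchemaGenerator()                     # Groq, needs GROQ_API_KEY
    #   LLMSchemaGenerator(provider="openai")    # needs OPENAI_API_KEY
    #   LLMSchemaGenerator(provider="ollama")    # local, no key needed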

    def generate_from_story(
        self,
        story: str,
        default_rows: int = 10000,
        temperature: float = 0.3,
    ) -> SchemaConfig:
        """
        Generate a realistic schema from a natural language story.

        Args:
            story: Natural language description of the data needs
            default_rows: Default row count if not specified in the story
            temperature: LLM temperature (lower = more consistent)

        Returns:
            SchemaConfig ready for data generation
        """
        user_prompt = f"""Generate a complete synthetic data schema in JSON format for:

{story}

IMPORTANT:
1. Create REFERENCE TABLES with inline_data for: plans, exercises, categories, products, etc.
2. Create TRANSACTIONAL TABLES with row_count for: users, subscriptions, orders, workouts, etc.
3. Use foreign_key to link transactional tables to reference tables
4. Default row count for transactional tables: {default_rows}

Output valid JSON. Think about what lookup/reference data is needed, then what transactional data references it."""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt}
            ],
            temperature=temperature,
            max_tokens=6000,
            response_format={"type": "json_object"}
        )

        schema_dict = json.loads(response.choices[0].message.content)
        return self._parse_schema(schema_dict)

    def generate_from_graph(
        self,
        graph_description: str,
        temperature: float = 0.2,
    ) -> SchemaConfig:
        """
        REVERSE ENGINEERING: Generate a schema that produces the desired graph pattern.
        """
        user_prompt = f"""Generate a JSON schema that will produce this chart pattern:

{graph_description}

Include reference tables with inline_data for lookup values and transactional tables for mass data. Output valid JSON."""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": GRAPH_REVERSE_PROMPT},
                {"role": "user", "content": user_prompt}
            ],
            temperature=temperature,
            max_tokens=6000,
            response_format={"type": "json_object"}
        )

        schema_dict = json.loads(response.choices[0].message.content)
        return self._parse_schema(schema_dict)

    def _normalize_distribution_params(self, col_type: str, params: Dict) -> Dict:
        """Normalize LLM output variations in distribution_params."""
        normalized = params.copy()

        # Normalize date column parameters
        if col_type == "date":
            if "start_date" in normalized and "start" not in normalized:
                normalized["start"] = normalized.pop("start_date")
            if "end_date" in normalized and "end" not in normalized:
                normalized["end"] = normalized.pop("end_date")
            if "start" not in normalized:
                normalized["start"] = "2023-01-01"
            if "end" not in normalized:
                normalized["end"] = "2024-12-31"

        # Normalize categorical parameters
        if col_type == "categorical":
            if "options" in normalized and "choices" not in normalized:
                normalized["choices"] = normalized.pop("options")
            if "choices" not in normalized:
                normalized["choices"] = ["A", "B", "C"]

        # Curve fitting for 'control_points'
        if "control_points" in normalized:
            try:
                points = normalized.pop("control_points")
                dist_type = normalized.get("distribution", "normal")
                fitter = CurveFitter()
                fitted_params = fitter.fit_distribution(points, dist_type)
                # Fitted parameters overwrite any manually provided values
                normalized.update(fitted_params)
            except Exception:
                # If fitting fails, fall back to whatever parameters we already have
                pass

        return normalized
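    # Normalization examples (sketch, derived from the branches above):
    #   date:        {"start_date": "2023-01-01"}  -> {"start": "2023-01-01", "end": "2024-12-31"}
    #   categorical: {"options": ["a", "b"]}       -> {"choices": ["a", "b"]}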

    def _parse_schema(self, schema_dict: Dict) -> SchemaConfig:
        """Parse LLM output into a validated SchemaConfig."""

        # Parse tables
        tables = []
        for t in schema_dict.get("tables", []):
            is_ref = t.get("is_reference", False)
            inline = t.get("inline_data", None)
            row_count = t.get("row_count", len(inline) if inline else 100)

            tables.append(Table(
                name=t["name"],
                row_count=row_count,
                description=t.get("description"),
                is_reference=is_ref,
                inline_data=inline
            ))

        # Parse columns (only for transactional tables; reference tables use inline_data)
        columns = {}
        for table_name, cols in schema_dict.get("columns", {}).items():
            columns[table_name] = []
            for c in cols:
                col_type = c.get("type", "text")
                raw_params = c.get("distribution_params", {})
                normalized_params = self._normalize_distribution_params(col_type, raw_params)

                columns[table_name].append(Column(
                    name=c["name"],
                    type=col_type,
                    distribution_params=normalized_params,
                    nullable=c.get("nullable", False),
                    unique=c.get("unique", False)
                ))

        # For reference tables without columns, create columns from inline_data
        for table in tables:
            if table.is_reference and table.inline_data and table.name not in columns:
                # Infer column types from the first row of inline_data
                first_row = table.inline_data[0]
                columns[table.name] = []
                for col_name, value in first_row.items():
                    if isinstance(value, int):
                        col_type = "int"
                    elif isinstance(value, float):
                        col_type = "float"
                    else:
                        col_type = "text"
                    columns[table.name].append(Column(
                        name=col_name,
                        type=col_type,
                        distribution_params={}
                    ))

        # Parse relationships
        relationships = []
        for r in schema_dict.get("relationships", []):
            relationships.append(Relationship(
                parent_table=r["parent_table"],
                child_table=r["child_table"],
                parent_key=r["parent_key"],
                child_key=r["child_key"],
                temporal_constraint=r.get("temporal_constraint", False)
            ))

        # Parse events, skipping any that are missing required fields
        events = []
        for e in schema_dict.get("events", []):
            if not all(key in e for key in ["name", "table", "column", "condition", "modifier_type", "modifier_value"]):
                continue
            events.append(ScenarioEvent(
                name=e["name"],
                table=e["table"],
                column=e["column"],
                condition=e["condition"],
                modifier_type=e["modifier_type"],
                modifier_value=e["modifier_value"],
                description=e.get("description")
            ))

        return SchemaConfig(
            name=schema_dict.get("name", "Generated Dataset"),
            description=schema_dict.get("description"),
            tables=tables,
            columns=columns,
            relationships=relationships,
            events=events,
            seed=schema_dict.get("seed", 42)
        )


# Convenience functions
def generate_schema(story: str, api_key: Optional[str] = None) -> SchemaConfig:
    """Quick helper to generate a schema from a story."""
    generator = LLMSchemaGenerator(api_key=api_key)
    return generator.generate_from_story(story)


def generate_from_chart(description: str, api_key: Optional[str] = None) -> SchemaConfig:
    """Quick helper to reverse-engineer a schema from a chart description."""
    generator = LLMSchemaGenerator(api_key=api_key)
    return generator.generate_from_graph(description)
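
# Quick-start (sketch, assuming GROQ_API_KEY is set in the environment or a .env file):
#   from misata.llm_parser import generate_schema
#   config = generate_schema("A fitness app with users, plans, and workouts")
#   print(config.name, [t.name for t in config.tables])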