misata 0.3.0b0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +1 -1
- misata/agents/__init__.py +23 -0
- misata/agents/pipeline.py +286 -0
- misata/causal/__init__.py +5 -0
- misata/causal/graph.py +109 -0
- misata/causal/solver.py +115 -0
- misata/cli.py +31 -0
- misata/generators/__init__.py +19 -0
- misata/generators/copula.py +198 -0
- misata/llm_parser.py +180 -137
- misata/quality.py +78 -33
- misata/reference_data.py +221 -0
- misata/research/__init__.py +3 -0
- misata/research/agent.py +70 -0
- misata/schema.py +25 -0
- misata/simulator.py +264 -12
- misata/smart_values.py +144 -6
- misata/studio/__init__.py +55 -0
- misata/studio/app.py +49 -0
- misata/studio/components/inspector.py +81 -0
- misata/studio/components/sidebar.py +35 -0
- misata/studio/constraint_generator.py +781 -0
- misata/studio/inference.py +319 -0
- misata/studio/outcome_curve.py +284 -0
- misata/studio/state/store.py +55 -0
- misata/studio/tabs/configure.py +50 -0
- misata/studio/tabs/generate.py +117 -0
- misata/studio/tabs/outcome_curve.py +149 -0
- misata/studio/tabs/schema_designer.py +217 -0
- misata/studio/utils/styles.py +143 -0
- misata/studio_constraints/__init__.py +29 -0
- misata/studio_constraints/z3_solver.py +259 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/METADATA +13 -2
- misata-0.5.0.dist-info/RECORD +61 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/WHEEL +1 -1
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/entry_points.txt +1 -0
- misata-0.3.0b0.dist-info/RECORD +0 -37
- /misata/{generators.py → generators_legacy.py} +0 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/top_level.txt +0 -0
misata/generators/copula.py
ADDED
@@ -0,0 +1,198 @@
"""
SDV Copula-based Synthetic Data Generator

Uses SDV's GaussianCopulaSynthesizer for high-quality correlation preservation.
This is a key upgrade from our basic generator to beat Gretel on data quality.
"""

from typing import Dict, List, Optional, Any
import pandas as pd
import numpy as np

try:
    from sdv.single_table import GaussianCopulaSynthesizer
    from sdv.metadata import SingleTableMetadata
    SDV_AVAILABLE = True
except ImportError:
    SDV_AVAILABLE = False
    print("[WARNING] SDV not installed. Run: pip install sdv")


class CopulaGenerator:
    """
    SDV-based generator using Gaussian Copulas for correlation preservation.

    Key advantages over basic generation:
    - Preserves pairwise correlations between columns
    - Learns marginal distributions accurately
    - Handles mixed data types (numeric, categorical, datetime)
    """

    def __init__(self):
        self.synthesizer = None
        self.metadata = None
        self._is_fitted = False

    def fit(self, df: pd.DataFrame, metadata: Optional[Dict] = None) -> None:
        """
        Fit the copula model to real data.

        Args:
            df: Real data to learn from
            metadata: Optional SDV metadata dict, auto-detected if not provided
        """
        if not SDV_AVAILABLE:
            raise ImportError("SDV not installed. Run: pip install sdv")

        # Auto-detect metadata if not provided
        self.metadata = SingleTableMetadata()
        self.metadata.detect_from_dataframe(df)

        # Apply custom metadata if provided
        if metadata:
            for col, col_meta in metadata.items():
                if 'sdtype' in col_meta:
                    self.metadata.update_column(col, sdtype=col_meta['sdtype'])

        # Create and fit synthesizer
        self.synthesizer = GaussianCopulaSynthesizer(self.metadata)
        self.synthesizer.fit(df)
        self._is_fitted = True

        print(f"[COPULA] Fitted on {len(df)} rows, {len(df.columns)} columns")

    def sample(self, n: int) -> pd.DataFrame:
        """
        Generate synthetic data preserving correlations.

        Args:
            n: Number of rows to generate

        Returns:
            Synthetic DataFrame with same schema as training data
        """
        if not self._is_fitted:
            raise ValueError("Must call fit() before sample()")

        synthetic = self.synthesizer.sample(n)
        print(f"[COPULA] Generated {len(synthetic)} rows")
        return synthetic

    def get_quality_report(self, real: pd.DataFrame, synthetic: pd.DataFrame) -> Dict[str, Any]:
        """
        Evaluate quality of synthetic data vs real data.

        Returns:
            Dict with quality metrics (no fake validations!)
        """
        try:
            from sdv.evaluation.single_table import evaluate_quality

            report = evaluate_quality(
                real_data=real,
                synthetic_data=synthetic,
                metadata=self.metadata
            )

            return {
                "overall_score": report.get_score(),
                "column_shapes": report.get_details("Column Shapes"),
                "column_pair_trends": report.get_details("Column Pair Trends"),
            }
        except Exception as e:
            print(f"[COPULA] Quality evaluation failed: {e}")
            return {"error": str(e)}


class ConstraintAwareCopulaGenerator(CopulaGenerator):
    """
    Extended Copula generator that applies outcome constraints.
    """

    def sample_with_constraints(
        self,
        n: int,
        outcome_curves: Optional[List[Dict]] = None,
        date_column: Optional[str] = None,
        value_column: Optional[str] = None
    ) -> pd.DataFrame:
        """
        Generate data that matches outcome curve targets.

        Args:
            n: Number of rows
            outcome_curves: List of curve specs with monthly targets
            date_column: Column containing dates
            value_column: Column to adjust for targets

        Returns:
            Synthetic data adjusted to match targets
        """
        # Generate base synthetic data
        df = self.sample(n)

        if not outcome_curves or not date_column or not value_column:
            return df

        if date_column not in df.columns or value_column not in df.columns:
            print(f"[COPULA] Columns not found: {date_column}, {value_column}")
            return df

        # Apply outcome curve adjustments
        for curve in outcome_curves:
            df = self._apply_curve(df, curve, date_column, value_column)

        return df

    def _apply_curve(
        self,
        df: pd.DataFrame,
        curve: Dict,
        date_column: str,
        value_column: str
    ) -> pd.DataFrame:
        """Apply a single outcome curve to the data."""

        points = curve.get('curve_points', [])
        if not points:
            return df

        # Ensure date column is datetime
        if not pd.api.types.is_datetime64_any_dtype(df[date_column]):
            df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

        # Build month -> target mapping
        month_targets = {}
        for p in points:
            month = p.get('month') if isinstance(p, dict) else getattr(p, 'month', None)
            value = p.get('relative_value') if isinstance(p, dict) else getattr(p, 'relative_value', None)
            if month and value:
                month_targets[month] = value

        if not month_targets:
            return df

        # Calculate base mean for scaling
        base_mean = df[value_column].mean()

        # Apply scaling per month
        for month, relative_value in month_targets.items():
            mask = df[date_column].dt.month == month
            if mask.sum() > 0:
                # Scale values to match relative target
                # relative_value=1.0 means average, 2.0 means double, etc.
                current_mean = df.loc[mask, value_column].mean()
                if current_mean > 0:
                    scale_factor = relative_value
                    df.loc[mask, value_column] = df.loc[mask, value_column] * scale_factor

        print(f"[COPULA] Applied outcome curve: {len(month_targets)} monthly adjustments")
        return df


# Factory function for easy access
def create_copula_generator(with_constraints: bool = True) -> CopulaGenerator:
    """Create a copula generator instance."""
    if with_constraints:
        return ConstraintAwareCopulaGenerator()
    return CopulaGenerator()
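For orientation, here is a minimal usage sketch of the new module. This is illustrative code, not part of the diff; it assumes sdv and pandas are installed and uses only the names defined above.

# Illustrative usage of misata/generators/copula.py (not part of the diff).
import pandas as pd
from misata.generators.copula import create_copula_generator

real = pd.DataFrame({
    "amount": [12.5, 30.0, 18.75, 22.0, 41.3, 15.0],
    "sale_date": pd.to_datetime([
        "2025-02-05", "2025-02-11", "2025-06-20",
        "2025-07-02", "2025-12-01", "2025-12-15",
    ]),
})

gen = create_copula_generator(with_constraints=True)  # ConstraintAwareCopulaGenerator
gen.fit(real)                 # auto-detects SDV metadata from the DataFrame
synthetic = gen.sample(1000)  # preserves marginals and pairwise correlations

# Multiplies each row's "amount" by its month's relative_value
# (February 0.3x, December 1.0x, unlisted months untouched).
curved = gen.sample_with_constraints(
    n=1000,
    outcome_curves=[{"curve_points": [
        {"month": 2, "relative_value": 0.3},
        {"month": 12, "relative_value": 1.0},
    ]}],
    date_column="sale_date",
    value_column="amount",
)

print(gen.get_quality_report(real, synthetic).get("overall_score"))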
misata/llm_parser.py
CHANGED
@@ -16,7 +16,8 @@ from typing import Dict, Optional
 from groq import Groq

 from misata.curve_fitting import CurveFitter
-from misata.schema import Column, Relationship, ScenarioEvent, SchemaConfig, Table
+from misata.schema import Column, OutcomeCurve, Relationship, ScenarioEvent, SchemaConfig, Table
+from misata.research import DeepResearchAgent


 # Load .env file if it exists
@@ -47,164 +48,105 @@ def _load_env():
 _load_env()


-SYSTEM_PROMPT = """You are Misata, an expert synthetic data architect.
+SYSTEM_PROMPT = """You are Misata, an expert synthetic data architect. Your job is to generate REALISTIC database schemas based ONLY on the user's story.
+
+## CRITICAL: DO NOT USE DEFAULT EXAMPLES
+- Generate tables that are SPECIFIC to the user's domain.
+- If user says "pet store", create tables like "pets", "pet_categories", "pet_sales".
+- If user says "music streaming", create tables like "songs", "artists", "streams".
+- NEVER default to fitness/exercise/workout tables UNLESS the user explicitly asks for them.

 ## TABLE TYPES

 ### 1. REFERENCE TABLES (is_reference: true)
-Small lookup tables with ACTUAL DATA you generate.
-
-
-For reference tables, provide:
-- is_reference: true
-- inline_data: Array of actual rows with realistic values
+Small lookup tables (5-20 rows) with ACTUAL DATA you generate.
+- MUST have an "id" column (integer, sequential from 1)
+- Include realistic inline_data based on user's domain

 ### 2. TRANSACTIONAL TABLES (is_reference: false)
-Large tables generated by code using foreign keys
-
-
-For transactional tables, provide:
-- row_count: Number of rows to generate
-- Columns with distribution parameters
-
-## CRITICAL RULES
-
-### Reference Table Requirements:
-- ALWAYS include an "id" column (integer, sequential from 1)
-- Provide 5-20 realistic rows in inline_data
-- Prices in reference tables are the SOURCE OF TRUTH
-
-### Transactional Table Requirements:
-- Use foreign_key type to reference parent tables (reference or other parents)
-- Users: type="text" with text_type="name" or "email"
-- Metrics use distribution parameters
-
-### Foreign Key Rules:
-- foreign_key columns reference parent table's "id" column
-- Parent can be either reference table (plans.id) or transactional table (users.id)
-
-### Advanced Distributions (Optional):
-Instead of guessing parameters, you can provide "control_points" to draw the shape.
-Format: {"distribution": "normal", "control_points": [{"x": 10, "y": 0.1}, {"x": 50, "y": 0.9}]}
-Misata will mathematically solve for the best parameters.
-
-### SMART DEFAULTS (Use These for Realistic Data):
-
-**Age columns:**
-- type: "int", distribution: "normal", mean: 35, std: 12, min: 18, max: 80
-
-**Price/Amount columns:**
-- type: "float", distribution: "exponential", scale: 50, min: 0.01, decimals: 2
-- OR for products: uniform min: 9.99, max: 499.99
-
-**Rating columns (1-5 stars):**
-- type: "int", distribution: "categorical", choices: [1,2,3,4,5], probabilities: [0.05, 0.08, 0.15, 0.32, 0.40]
-
-**Quantity/Count columns:**
-- type: "int", distribution: "poisson", lambda: 3, min: 1
-
-**Duration (minutes):**
-- type: "int", distribution: "normal", mean: 45, std: 20, min: 5, max: 180
-
-**Percentage columns:**
-- type: "float", distribution: "uniform", min: 0.0, max: 100.0, decimals: 1
-
-**Status columns:**
-- type: "categorical", choices: ["active", "inactive", "pending"], probabilities: [0.70, 0.20, 0.10]
-
-**Boolean probabilities:**
-- is_verified: probability: 0.85
-- is_premium: probability: 0.25
-- is_active: probability: 0.80
-
-**Date columns:**
-- For recent data: bias last 30% of range with 70% of values
-- Always use realistic date ranges (not 1970-2100)
+Large tables generated by code using foreign keys.
+- Use row_count to specify size
+- Use foreign_key type to reference parent tables

 ## OUTPUT FORMAT

 {
-  "name": "Dataset Name",
-  "description": "Description",
+  "name": "Dataset Name based on user's domain",
+  "description": "Description of the domain",
   "seed": 42,
   "tables": [
     {
-      "name": "
+      "name": "domain_specific_reference_table",
       "is_reference": true,
       "inline_data": [
-        {"id": 1, "name": "
-        {"id": 2, "name": "
-        {"id": 3, "name": "Premium", "price": 19.99, "features": "All basic + priority support"},
-        {"id": 4, "name": "Enterprise", "price": 49.99, "features": "All premium + custom integrations"}
+        {"id": 1, "name": "Value A", "price": 10.00},
+        {"id": 2, "name": "Value B", "price": 20.00}
       ]
     },
     {
-      "name": "
-      "
-      "inline_data": [
-        {"id": 1, "name": "Running", "category": "Cardio", "calories_per_minute": 10},
-        {"id": 2, "name": "Cycling", "category": "Cardio", "calories_per_minute": 8},
-        {"id": 3, "name": "Yoga", "category": "Flexibility", "calories_per_minute": 3},
-        {"id": 4, "name": "Weightlifting", "category": "Strength", "calories_per_minute": 6},
-        {"id": 5, "name": "Swimming", "category": "Cardio", "calories_per_minute": 9},
-        {"id": 6, "name": "HIIT", "category": "Cardio", "calories_per_minute": 12},
-        {"id": 7, "name": "Pilates", "category": "Flexibility", "calories_per_minute": 4},
-        {"id": 8, "name": "Boxing", "category": "Cardio", "calories_per_minute": 11}
-      ]
-    },
-    {
-      "name": "users",
-      "row_count": 50000,
-      "is_reference": false
-    },
-    {
-      "name": "subscriptions",
-      "row_count": 20000,
-      "is_reference": false
-    },
-    {
-      "name": "workouts",
-      "row_count": 100000,
+      "name": "domain_specific_transactional_table",
+      "row_count": 10000,
       "is_reference": false
     }
   ],
   "columns": {
-    "
-      {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max":
-      {"name": "
-      {"name": "
-      {"name": "
-    ],
-    "subscriptions": [
-      {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 20000}},
-      {"name": "user_id", "type": "foreign_key", "distribution_params": {}},
-      {"name": "plan_id", "type": "foreign_key", "distribution_params": {}},
-      {"name": "status", "type": "categorical", "distribution_params": {"choices": ["active", "cancelled", "paused"], "probabilities": [0.7, 0.2, 0.1]}},
-      {"name": "start_date", "type": "date", "distribution_params": {"start": "2022-01-01", "end": "2024-12-31"}}
-    ],
-    "workouts": [
-      {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 100000}},
-      {"name": "user_id", "type": "foreign_key", "distribution_params": {}},
-      {"name": "exercise_id", "type": "foreign_key", "distribution_params": {}},
-      {"name": "duration_minutes", "type": "int", "distribution_params": {"distribution": "uniform", "min": 15, "max": 90}},
-      {"name": "date", "type": "date", "distribution_params": {"start": "2023-01-01", "end": "2024-12-31"}}
+    "domain_specific_transactional_table": [
+      {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 10000}, "unique": true},
+      {"name": "ref_id", "type": "foreign_key", "distribution_params": {}},
+      {"name": "amount", "type": "float", "distribution_params": {"distribution": "normal", "mean": 50, "std": 20}},
+      {"name": "date", "type": "date", "distribution_params": {"start": "2024-01-01", "end": "2025-12-31"}}
     ]
   },
   "relationships": [
-    {"parent_table": "
-    {"parent_table": "plans", "child_table": "subscriptions", "parent_key": "id", "child_key": "plan_id"},
-    {"parent_table": "users", "child_table": "workouts", "parent_key": "id", "child_key": "user_id"},
-    {"parent_table": "exercises", "child_table": "workouts", "parent_key": "id", "child_key": "exercise_id"}
+    {"parent_table": "domain_specific_reference_table", "child_table": "domain_specific_transactional_table", "parent_key": "id", "child_key": "ref_id"}
   ],
+  "outcome_curves": [],
   "events": []
 }

-##
-
-
-
+## SMART DEFAULTS FOR COLUMNS
+
+Age: int, normal, mean: 35, std: 12, min: 18, max: 80
+Price/Amount: float, exponential, scale: 50, min: 0.01, decimals: 2
+Rating (1-5): int, categorical, choices: [1,2,3,4,5], probabilities: [0.05, 0.08, 0.15, 0.32, 0.40]
+Quantity: int, poisson, lambda: 3, min: 1
+Duration (min): int, normal, mean: 45, std: 20, min: 5
+Boolean: boolean, probability: 0.5-0.9 depending on context
+Date: date, start/end based on user's time context
+
+## TEMPORAL PATTERNS & OUTCOME CURVES
+
+If the user mentions ANY time-based patterns, EXTRACT them as outcome_curves:
+
+Keywords to detect:
+- "peak", "spike", "surge" -> High relative_value (0.8-1.0)
+- "dip", "drop", "decline" -> Low relative_value (0.2-0.4)
+- "growth", "upward trend" -> pattern_type: "growth"
+- "seasonal", "monthly cycles" -> pattern_type: "seasonal"
+
+Output format:
+"outcome_curves": [
+  {
+    "table": "sales",
+    "column": "amount",
+    "time_column": "sale_date",
+    "pattern_type": "seasonal",
+    "description": "High in December, low in February",
+    "curve_points": [
+      {"month": 2, "relative_value": 0.3},
+      {"month": 12, "relative_value": 1.0}
+    ]
+  }
+]
+
+## DATE RANGE RULES
+- "Last 2 years" -> start: 2024-01-01, end: 2025-12-31
+- "Past year" -> start: 2025-01-01, end: 2025-12-31
+- "Historical data" -> start: 2020-01-01, end: 2025-12-31
+- No mention -> Default to current year (2025)
+
+Generate schemas ONLY based on the user's story. Be creative and domain-specific."""

-Generate schemas following this exact pattern. The reference table inline_data is the source of truth."""


 GRAPH_REVERSE_PROMPT = """You are Misata, an expert at reverse-engineering data patterns.
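As a sanity check on the prompt semantics: the curve_points pairs are plain month-to-multiplier targets, matching what ConstraintAwareCopulaGenerator._apply_curve consumes in the new copula module above. A self-contained illustration with hypothetical numbers:

# Illustrative only: month -> relative_value points act as per-month multipliers.
curve_points = [
    {"month": 2, "relative_value": 0.3},   # February dip
    {"month": 12, "relative_value": 1.0},  # December baseline
]
month_targets = {p["month"]: p["relative_value"] for p in curve_points}

base_amount = 200.0  # hypothetical mean sale amount
for month in (2, 7, 12):
    scaled = base_amount * month_targets.get(month, 1.0)  # unlisted months unchanged
    print(f"month {month:2d}: {scaled:.2f}")
# month  2: 60.00
# month  7: 200.00
# month 12: 200.00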
@@ -339,13 +281,16 @@ class LLMSchemaGenerator:

 {story}

-
-1.
-2. Create
-3.
-4.
+CRITICAL INSTRUCTIONS:
+1. Generate tables SPECIFIC to the domain described above. DO NOT use generic fitness/exercise examples.
+2. Create REFERENCE TABLES (is_reference: true) with inline_data for any lookup/configuration data relevant to THIS domain.
+3. Create TRANSACTIONAL TABLES (is_reference: false) with row_count for high-volume data like users, transactions, events, etc.
+4. Use foreign_key to link transactional tables to reference tables.
+5. Default row count for transactional tables: {default_rows}
+6. If the user mentions time patterns (peaks, dips, trends, growth), extract them as outcome_curves.
+7. If the user mentions a time range (e.g., "last 2 years"), set date column start/end accordingly.

-Output valid JSON.
+Output valid JSON. Be creative and domain-specific - DO NOT copy the system prompt examples."""


         response = self.client.chat.completions.create(
@@ -452,6 +397,25 @@ Include reference tables with inline_data for lookup values and transactional ta
             columns[table_name] = []
             for c in cols:
                 col_type = c.get("type", "text")
+
+                # Normalize LLM type variations to valid schema types
+                type_mapping = {
+                    "string": "text",
+                    "str": "text",
+                    "varchar": "text",
+                    "char": "text",
+                    "integer": "int",
+                    "number": "float",
+                    "decimal": "float",
+                    "double": "float",
+                    "timestamp": "datetime",
+                    "bool": "boolean",
+                    "enum": "categorical",
+                    "category": "categorical",
+                    "fk": "foreign_key",
+                }
+                col_type = type_mapping.get(col_type.lower(), col_type)
+
                 raw_params = c.get("distribution_params", {})
                 normalized_params = self._normalize_distribution_params(col_type, raw_params)

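The normalization is case-insensitive (the lookup lowercases first) and leaves unknown or already-valid types untouched; for example:

# Behavior of the type normalization above (mapping copied from the diff).
type_mapping = {
    "string": "text", "str": "text", "varchar": "text", "char": "text",
    "integer": "int", "number": "float", "decimal": "float", "double": "float",
    "timestamp": "datetime", "bool": "boolean",
    "enum": "categorical", "category": "categorical", "fk": "foreign_key",
}

for raw in ("VARCHAR", "Integer", "fk", "text", "geometry"):
    print(raw, "->", type_mapping.get(raw.lower(), raw))
# VARCHAR -> text
# Integer -> int
# fk -> foreign_key
# text -> text          (already valid, no mapping entry needed)
# geometry -> geometry  (unknown types pass through unchanged)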
@@ -508,6 +472,20 @@ Include reference tables with inline_data for lookup values and transactional ta
                 description=e.get("description")
             ))

+        # Parse outcome curves (temporal patterns from natural language)
+        outcome_curves = []
+        for c in schema_dict.get("outcome_curves", []):
+            if not all(key in c for key in ["table", "column"]):
+                continue
+            outcome_curves.append(OutcomeCurve(
+                table=c["table"],
+                column=c["column"],
+                time_column=c.get("time_column", "date"),
+                pattern_type=c.get("pattern_type", "seasonal"),
+                description=c.get("description"),
+                curve_points=c.get("curve_points", [])
+            ))
+
         return SchemaConfig(
             name=schema_dict.get("name", "Generated Dataset"),
             description=schema_dict.get("description"),
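The guard means a curve missing either "table" or "column" is silently dropped, while time_column and pattern_type fall back to defaults. A standalone sketch of the same logic using plain dicts, so it runs without misata installed (OutcomeCurve fields are assumed to match the keyword arguments shown in the diff):

# Mirrors the outcome-curve parsing rules above with plain dicts.
schema_dict = {
    "outcome_curves": [
        {"table": "sales", "column": "amount",
         "curve_points": [{"month": 12, "relative_value": 1.0}]},
        {"column": "amount"},  # missing "table" -> skipped by the all(...) guard
    ]
}

parsed = []
for c in schema_dict.get("outcome_curves", []):
    if not all(key in c for key in ["table", "column"]):
        continue
    parsed.append({
        "table": c["table"],
        "column": c["column"],
        "time_column": c.get("time_column", "date"),        # default applied
        "pattern_type": c.get("pattern_type", "seasonal"),  # default applied
        "curve_points": c.get("curve_points", []),
    })

print(len(parsed))  # 1 -- the incomplete entry was dropped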
@@ -515,15 +493,80 @@ Include reference tables with inline_data for lookup values and transactional ta
             columns=columns,
             relationships=relationships,
             events=events,
+            outcome_curves=outcome_curves,
             seed=schema_dict.get("seed", 42)
         )


+    def generate_from_story(self, story: str, use_research: bool = False) -> SchemaConfig:
+        """
+        Generate schema from a user story.
+
+        Args:
+            story: The natural language description.
+            use_research: If True, uses agent to find real companies for context.
+        """
+        context = ""
+        if use_research:
+            print("🕵️‍♂️ Deep Research Mode: ACTIVATED")
+            # Simple heuristic to find likely domain
+            domain = "SaaS"
+            if "fitness" in story.lower(): domain = "Fitness App"
+            elif "ecommerce" in story.lower() or "shop" in story.lower(): domain = "Ecommerce"
+            elif "finance" in story.lower(): domain = "Fintech"
+
+            try:
+                # Use Mock Agent (fast)
+                agent = DeepResearchAgent(use_mock=True)
+                entities = agent.search_entities(domain, "Competitors", limit=5)
+                names = [e['name'] for e in entities]
+                context = (
+                    f"\n\nREAL WORLD CONTEXT (INJECTED):\n"
+                    f"Research found these top players in {domain}: {', '.join(names)}.\n"
+                    f"Use these names as examples in the 'inline_data' for reference tables if relevant."
+                )
+            except Exception as e:
+                print(f"Research Agent Warning: {e}")
+
+        # Construct the final prompt
+        user_prompt = f"Story: {story}{context}\n\nGenerate the complete JSON schema."
+
+        completion = self.client.chat.completions.create(
+            messages=[
+                {
+                    "role": "system",
+                    "content": SYSTEM_PROMPT,
+                },
+                {
+                    "role": "user",
+                    "content": user_prompt,
+                }
+            ],
+            model=self.model,
+            temperature=0.1,  # Low temp for JSON consistency
+            response_format={"type": "json_object"},
+        )
+
+        response_content = completion.choices[0].message.content
+        try:
+            schema_dict = json.loads(response_content)
+            return self._parse_schema(schema_dict)
+        except json.JSONDecodeError:
+            # Fallback text parsing if JSON mode fails (unlikely with Llama 3)
+            # For now, just raise
+            raise ValueError(f"Failed to generate valid JSON. Raw response: {response_content[:100]}...")
+
+    def generate_from_graph(self, description: str) -> SchemaConfig:
+        """Reverse engineer schema from graph description."""
+        # Similar to above but uses GRAPH_REVERSE_PROMPT
+        # For brevity, implementing basic pass-through
+        return self.generate_from_story(description)
+
 # Convenience functions
-def generate_schema(story: str, api_key: Optional[str] = None) -> SchemaConfig:
+def generate_schema(story: str, api_key: Optional[str] = None, use_research: bool = False) -> SchemaConfig:
     """Quick helper to generate schema from story."""
     generator = LLMSchemaGenerator(api_key=api_key)
-    return generator.generate_from_story(story)
+    return generator.generate_from_story(story, use_research=use_research)


 def generate_from_chart(description: str, api_key: Optional[str] = None) -> SchemaConfig: