misata 0.3.0b0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +1 -1
- misata/agents/__init__.py +23 -0
- misata/agents/pipeline.py +286 -0
- misata/causal/__init__.py +5 -0
- misata/causal/graph.py +109 -0
- misata/causal/solver.py +115 -0
- misata/cli.py +31 -0
- misata/generators/__init__.py +19 -0
- misata/generators/copula.py +198 -0
- misata/llm_parser.py +180 -137
- misata/quality.py +78 -33
- misata/reference_data.py +221 -0
- misata/research/__init__.py +3 -0
- misata/research/agent.py +70 -0
- misata/schema.py +25 -0
- misata/simulator.py +264 -12
- misata/smart_values.py +144 -6
- misata/studio/__init__.py +55 -0
- misata/studio/app.py +49 -0
- misata/studio/components/inspector.py +81 -0
- misata/studio/components/sidebar.py +35 -0
- misata/studio/constraint_generator.py +781 -0
- misata/studio/inference.py +319 -0
- misata/studio/outcome_curve.py +284 -0
- misata/studio/state/store.py +55 -0
- misata/studio/tabs/configure.py +50 -0
- misata/studio/tabs/generate.py +117 -0
- misata/studio/tabs/outcome_curve.py +149 -0
- misata/studio/tabs/schema_designer.py +217 -0
- misata/studio/utils/styles.py +143 -0
- misata/studio_constraints/__init__.py +29 -0
- misata/studio_constraints/z3_solver.py +259 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/METADATA +13 -2
- misata-0.5.0.dist-info/RECORD +61 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/WHEEL +1 -1
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/entry_points.txt +1 -0
- misata-0.3.0b0.dist-info/RECORD +0 -37
- /misata/{generators.py → generators_legacy.py} +0 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/top_level.txt +0 -0
misata/quality.py
CHANGED
@@ -11,13 +11,15 @@ This module validates generated synthetic data for:
 from typing import Dict, List, Any, Optional, Tuple
 from dataclasses import dataclass, field
 import warnings
+import numpy as np
+import pandas as pd  # type: ignore
 
 
 @dataclass
 class QualityIssue:
     """Represents a single data quality issue."""
     severity: str  # "error", "warning", "info"
-    category: str  # "distribution", "integrity", "temporal", "domain"
+    category: str  # "distribution", "integrity", "temporal", "domain", "time_series"
     table: str
     column: Optional[str]
     message: str
@@ -107,19 +109,12 @@ class DataQualityChecker:
 
     def check_distribution_plausibility(
         self,
-        df:
+        df: pd.DataFrame,
         table_name: str,
     ) -> None:
         """
         Check if numeric distributions are plausible for their domains.
-
-        Args:
-            df: DataFrame to check
-            table_name: Name of the table
         """
-        import pandas as pd
-        import numpy as np
-
         for col in df.columns:
             col_lower = col.lower()
 
@@ -162,15 +157,11 @@ class DataQualityChecker:
 
     def check_referential_integrity(
         self,
-        tables: Dict[str,
+        tables: Dict[str, pd.DataFrame],
         relationships: List[Any],
     ) -> None:
         """
         Verify all foreign key references are valid.
-
-        Args:
-            tables: Dict of table_name -> DataFrame
-            relationships: List of Relationship objects
         """
         for rel in relationships:
             parent_table = rel.parent_table
@@ -221,19 +212,12 @@ class DataQualityChecker:
 
     def check_temporal_consistency(
         self,
-        df:
+        df: pd.DataFrame,
         table_name: str,
     ) -> None:
         """
         Ensure temporal columns are consistent.
-
-        Checks:
-        - created_at < updated_at
-        - start_date < end_date
-        - birth_date in past
         """
-        import pandas as pd
-
         date_cols = [c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])]
 
         # Check created < updated
@@ -266,23 +250,83 @@
                 f"{future_births} rows have birth_date in the future",
                 {"violation_count": future_births}
             )
+
+    def check_time_series_properties(
+        self,
+        df: pd.DataFrame,
+        table_name: str,
+    ) -> None:
+        """
+        Analyze time series properties (Autocorrelation, Trend, Seasonality).
+        Adds 'info' level insights to the report.
+        """
+        # Find Date Column
+        date_cols = [c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])]
+        if not date_cols:
+            return
+
+        time_col = date_cols[0]  # Use first date col
+
+        # Find Metric Columns (Float/Int)
+        numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and c not in ['id']]
+
+        for col in numeric_cols:
+            # Skip if low cardinality
+            if df[col].nunique() < 10:
+                continue
+
+            # Sort by time
+            ts_df = df.sort_values(time_col)
+            series = ts_df[col].values
+
+            if len(series) < 5:
+                continue
+
+            # 1. Autocorrelation (Lag-1)
+            # Simple manual calculation
+            if len(series) > 2:
+                # Handle possible NaNs
+                s_clean = series[~np.isnan(series)]
+                if len(s_clean) > 2:
+                    lag1 = np.corrcoef(s_clean[:-1], s_clean[1:])[0, 1]
+
+                    if not np.isnan(lag1):
+                        if abs(lag1) > 0.7:
+                            msg = f"Strong temporal logic detected (Lag-1 Autocorrelation: {lag1:.2f})"
+                            self._add_issue("info", "time_series", table_name, col, msg, {"lag1": lag1})
+                        elif abs(lag1) < 0.1:
+                            msg = f"Data appears random/noisy (Lag-1 Autocorrelation: {lag1:.2f})"
+                            self._add_issue("info", "time_series", table_name, col, msg, {"lag1": lag1})
+
+            # 2. Trend Detection
+            if len(series) > 10:
+                # Linear fit
+                x = np.arange(len(series))
+                # Handle NaNs replacement for trend check
+                s_filled = pd.Series(series).fillna(method='ffill').fillna(0).values
+
+                slope, _ = np.polyfit(x, s_filled, 1)
+
+                # Normalize slope to be % change per step relative to mean
+                mean_val = np.mean(s_filled)
+                if abs(mean_val) > 0.01:
+                    normalized_slope = slope / mean_val
+                    if abs(normalized_slope) * len(series) > 0.2:  # Total change > 20%
+                        trend_dir = "Growth" if slope > 0 else "Decline"
+                        self._add_issue(
+                            "info", "time_series", table_name, col,
+                            f"Significant {trend_dir} Trend Detected",
+                            {"slope": slope}
+                        )
 
     def check_all(
         self,
-        tables: Dict[str,
+        tables: Dict[str, pd.DataFrame],
         relationships: Optional[List[Any]] = None,
         schema: Optional[Any] = None,
     ) -> QualityReport:
         """
         Run all quality checks and generate a report.
-
-        Args:
-            tables: Dict of table_name -> DataFrame
-            relationships: Optional list of Relationship objects
-            schema: Optional SchemaConfig for additional checks
-
-        Returns:
-            QualityReport with score and issues
         """
         self.issues = []  # Reset
 
@@ -290,6 +334,7 @@ class DataQualityChecker:
         for table_name, df in tables.items():
             self.check_distribution_plausibility(df, table_name)
             self.check_temporal_consistency(df, table_name)
+            self.check_time_series_properties(df, table_name)
 
         # Check referential integrity
         if relationships:
@@ -303,7 +348,7 @@
             elif issue.severity == "warning":
                 base_score -= 3
             else:
-                base_score -= 1
+                base_score -= 1  # Info subtracts 1 for now (maybe 0 later)
 
         score = max(0, min(100, base_score))
 
@@ -323,7 +368,7 @@
         )
 
 
-def check_quality(tables: Dict[str,
+def check_quality(tables: Dict[str, pd.DataFrame], **kwargs) -> QualityReport:
     """Convenience function for quick quality checks."""
     checker = DataQualityChecker()
     return checker.check_all(tables, **kwargs)
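The new time-series pass is wired into check_all, so it also runs through the check_quality convenience function. Its trend branch fires when |slope/mean| × n_rows exceeds 0.2, i.e. when the fitted line drifts by more than 20% of the series mean over the window. A minimal sketch of the end-to-end behavior (table and column names are arbitrary; a cumulative-sum series has lag-1 autocorrelation near 1 and a steady drift, so both the autocorrelation and trend branches should emit "info" issues — report.score and report.issues follow the QualityReport docstring shown above):

import numpy as np
import pandas as pd
from misata.quality import check_quality

# 100 days of auto-correlated, upward-trending "revenue"
rng = np.random.default_rng(42)
df = pd.DataFrame({
    "date": pd.date_range("2024-01-01", periods=100, freq="D"),
    "revenue": np.cumsum(rng.normal(1.0, 0.5, size=100)),
})

report = check_quality({"sales": df})
print(report.score)
for issue in report.issues:
    print(issue.severity, issue.category, issue.table, issue.message)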
misata/reference_data.py
ADDED
@@ -0,0 +1,221 @@
+"""
+Domain-Aware Reference Data Library
+
+Pre-built realistic data templates for common business domains.
+This ensures reference tables (plans, exercises, categories) have
+sensible, domain-appropriate values instead of random garbage.
+
+Usage:
+    from misata.reference_data import get_reference_data, detect_domain
+
+    domain = detect_domain(["plans", "subscriptions", "users"])
+    plans_data = get_reference_data(domain, "plans")
+"""
+
+from typing import Any, Dict, List, Optional
+
+
+# ============ DOMAIN TEMPLATES ============
+
+REFERENCE_DATA_LIBRARY: Dict[str, Dict[str, List[Dict[str, Any]]]] = {
+
+    # ===== SaaS / Subscription Business =====
+    "saas": {
+        "plans": [
+            {"id": 1, "name": "Free", "price": 0.00, "features": "Basic features, Community support"},
+            {"id": 2, "name": "Starter", "price": 9.99, "features": "5GB storage, Email support"},
+            {"id": 3, "name": "Pro", "price": 29.99, "features": "50GB storage, Priority support, Analytics"},
+            {"id": 4, "name": "Business", "price": 79.99, "features": "200GB storage, Dedicated support, API access"},
+            {"id": 5, "name": "Enterprise", "price": 199.99, "features": "Unlimited storage, SLA, Custom integrations"},
+        ],
+        "tiers": [
+            {"id": 1, "name": "Bronze", "discount_pct": 0},
+            {"id": 2, "name": "Silver", "discount_pct": 10},
+            {"id": 3, "name": "Gold", "discount_pct": 20},
+            {"id": 4, "name": "Platinum", "discount_pct": 30},
+        ],
+    },
+
+    # ===== Fitness / Health App =====
+    "fitness": {
+        "exercises": [
+            {"id": 1, "name": "Running", "category": "Cardio", "calories_per_minute": 10, "difficulty": "Medium"},
+            {"id": 2, "name": "Swimming", "category": "Cardio", "calories_per_minute": 9, "difficulty": "Medium"},
+            {"id": 3, "name": "Cycling", "category": "Cardio", "calories_per_minute": 8, "difficulty": "Easy"},
+            {"id": 4, "name": "HIIT", "category": "Cardio", "calories_per_minute": 12, "difficulty": "Hard"},
+            {"id": 5, "name": "Yoga", "category": "Flexibility", "calories_per_minute": 3, "difficulty": "Easy"},
+            {"id": 6, "name": "Pilates", "category": "Flexibility", "calories_per_minute": 4, "difficulty": "Medium"},
+            {"id": 7, "name": "Weight Training", "category": "Strength", "calories_per_minute": 6, "difficulty": "Medium"},
+            {"id": 8, "name": "CrossFit", "category": "Strength", "calories_per_minute": 11, "difficulty": "Hard"},
+        ],
+        "plans": [
+            {"id": 1, "name": "Free", "price": 0.00, "features": "Basic workouts"},
+            {"id": 2, "name": "Basic", "price": 9.99, "features": "All workouts, Progress tracking"},
+            {"id": 3, "name": "Premium", "price": 19.99, "features": "Personal trainer, Meal plans"},
+            {"id": 4, "name": "Elite", "price": 49.99, "features": "1-on-1 coaching, Custom programs"},
+        ],
+        "workout_types": [
+            {"id": 1, "name": "Morning Cardio", "duration_minutes": 30, "intensity": "Medium"},
+            {"id": 2, "name": "Full Body Strength", "duration_minutes": 45, "intensity": "High"},
+            {"id": 3, "name": "Relaxing Yoga", "duration_minutes": 60, "intensity": "Low"},
+            {"id": 4, "name": "HIIT Blast", "duration_minutes": 20, "intensity": "Very High"},
+        ],
+    },
+
+    # ===== E-commerce / Retail =====
+    "ecommerce": {
+        "categories": [
+            {"id": 1, "name": "Electronics", "description": "Phones, laptops, gadgets"},
+            {"id": 2, "name": "Clothing", "description": "Fashion and apparel"},
+            {"id": 3, "name": "Home & Garden", "description": "Furniture, decor, outdoor"},
+            {"id": 4, "name": "Sports & Outdoors", "description": "Fitness, camping, sports gear"},
+            {"id": 5, "name": "Books & Media", "description": "Books, music, movies"},
+            {"id": 6, "name": "Health & Beauty", "description": "Skincare, supplements, wellness"},
+        ],
+        "products": [
+            {"id": 1, "name": "Wireless Headphones", "category_id": 1, "price": 79.99},
+            {"id": 2, "name": "Smart Watch", "category_id": 1, "price": 199.99},
+            {"id": 3, "name": "Cotton T-Shirt", "category_id": 2, "price": 24.99},
+            {"id": 4, "name": "Running Shoes", "category_id": 4, "price": 89.99},
+            {"id": 5, "name": "Yoga Mat", "category_id": 4, "price": 29.99},
+        ],
+        "shipping_methods": [
+            {"id": 1, "name": "Standard", "days": 5, "price": 4.99},
+            {"id": 2, "name": "Express", "days": 2, "price": 9.99},
+            {"id": 3, "name": "Next Day", "days": 1, "price": 19.99},
+            {"id": 4, "name": "Free Shipping", "days": 7, "price": 0.00},
+        ],
+    },
+
+    # ===== Finance / Banking =====
+    "finance": {
+        "account_types": [
+            {"id": 1, "name": "Checking", "interest_rate": 0.01, "monthly_fee": 0.00},
+            {"id": 2, "name": "Savings", "interest_rate": 0.50, "monthly_fee": 0.00},
+            {"id": 3, "name": "Money Market", "interest_rate": 1.00, "monthly_fee": 5.00},
+            {"id": 4, "name": "Premium Checking", "interest_rate": 0.10, "monthly_fee": 15.00},
+        ],
+        "transaction_types": [
+            {"id": 1, "name": "Deposit", "category": "Income"},
+            {"id": 2, "name": "Withdrawal", "category": "Expense"},
+            {"id": 3, "name": "Transfer", "category": "Transfer"},
+            {"id": 4, "name": "Payment", "category": "Expense"},
+            {"id": 5, "name": "Refund", "category": "Income"},
+        ],
+    },
+
+    # ===== Education / LMS =====
+    "education": {
+        "courses": [
+            {"id": 1, "name": "Python Fundamentals", "level": "Beginner", "duration_hours": 20, "price": 49.99},
+            {"id": 2, "name": "Data Science Bootcamp", "level": "Intermediate", "duration_hours": 60, "price": 199.99},
+            {"id": 3, "name": "Machine Learning", "level": "Advanced", "duration_hours": 40, "price": 149.99},
+            {"id": 4, "name": "Web Development", "level": "Beginner", "duration_hours": 30, "price": 79.99},
+        ],
+        "difficulty_levels": [
+            {"id": 1, "name": "Beginner", "description": "No prior experience needed"},
+            {"id": 2, "name": "Intermediate", "description": "Some experience required"},
+            {"id": 3, "name": "Advanced", "description": "Strong foundation needed"},
+            {"id": 4, "name": "Expert", "description": "Professional level"},
+        ],
+    },
+}
+
+
+# ============ DOMAIN DETECTION ============
+
+# Keywords that indicate a specific domain
+DOMAIN_KEYWORDS = {
+    "saas": ["subscription", "plan", "tier", "billing", "invoice", "tenant"],
+    "fitness": ["exercise", "workout", "calories", "fitness", "gym", "training", "health"],
+    "ecommerce": ["product", "category", "cart", "order", "shipping", "inventory", "catalog"],
+    "finance": ["account", "transaction", "balance", "payment", "transfer", "bank"],
+    "education": ["course", "student", "lesson", "enrollment", "grade", "instructor"],
+}
+
+
+def detect_domain(table_names: List[str]) -> str:
+    """
+    Detect the business domain based on table names.
+
+    Args:
+        table_names: List of table names in the schema
+
+    Returns:
+        Domain name (saas, fitness, ecommerce, finance, education, or 'generic')
+    """
+    table_names_lower = [t.lower() for t in table_names]
+    all_text = " ".join(table_names_lower)
+
+    domain_scores = {}
+    for domain, keywords in DOMAIN_KEYWORDS.items():
+        score = sum(1 for kw in keywords if kw in all_text)
+        if score > 0:
+            domain_scores[domain] = score
+
+    if domain_scores:
+        return max(domain_scores, key=domain_scores.get)
+
+    return "generic"
+
+
+def get_reference_data(domain: str, table_name: str) -> Optional[List[Dict[str, Any]]]:
+    """
+    Get pre-built reference data for a table.
+
+    Strategy:
+    1. Check specific domain (exact match)
+    2. Check specific domain (singular/plural match)
+    3. GLOBAL FALLBACK: Check ALL domains for exact match
+    4. GLOBAL FALLBACK: Check ALL domains for partial match
+    """
+    # Normalize table name
+    table_key = table_name.lower().rstrip('s')  # Remove plural 's'
+
+    # 1. Try specific domain first
+    domain_data = REFERENCE_DATA_LIBRARY.get(domain, {})
+
+    # Exact match in domain
+    if table_name in domain_data:
+        return domain_data[table_name]
+
+    # Singular match in domain
+    if table_key in domain_data:
+        return domain_data[table_key]
+
+    # Partial match in domain
+    for key, data in domain_data.items():
+        if table_key in key or key in table_key:
+            return data
+
+    # 2. GLOBAL SEARCH: Check all other domains
+    # This handles mixed schemas (e.g. "fitness app with products")
+    for other_domain, tables in REFERENCE_DATA_LIBRARY.items():
+        if other_domain == domain:
+            continue
+
+        # Exact match
+        if table_name in tables:
+            return tables[table_name]
+
+        # Singular match
+        if table_key in tables:
+            return tables[table_key]
+
+    # 3. GLOBAL PARTIAL SEARCH
+    for other_domain, tables in REFERENCE_DATA_LIBRARY.items():
+        for key, data in tables.items():
+            if table_key in key or key in table_key:
+                return data
+
+    return None
+
+
+def get_all_domains() -> List[str]:
+    """Get list of all supported domains."""
+    return list(REFERENCE_DATA_LIBRARY.keys())
+
+
+def get_domain_tables(domain: str) -> List[str]:
+    """Get list of tables available for a domain."""
+    return list(REFERENCE_DATA_LIBRARY.get(domain, {}).keys())
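To make the lookup cascade concrete, here is a short sketch against the shipped library (the table names mirror the module docstring; comments trace which lookup step matches):

from misata.reference_data import detect_domain, get_reference_data

domain = detect_domain(["plans", "subscriptions", "users"])
print(domain)  # "saas" -- the "plan" and "subscription" keywords both score

plans = get_reference_data(domain, "plans")          # step 1: exact match in "saas"
exercises = get_reference_data(domain, "exercises")  # step 3: global fallback into "fitness"
print(plans[0]["name"], exercises[0]["name"])        # Free Running

One quirk worth knowing: rstrip('s') strips every trailing "s", not just one, so a hypothetical table named "boss" would normalize to "bo". That is harmless for the shipped table names but worth remembering when adding new templates.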
misata/research/agent.py
ADDED
@@ -0,0 +1,70 @@
+
+"""
+Misata Deep Research Agent 🕵️‍♂️
+-----------------------------
+Responsible for fetching "Ground Truth" data from the real world.
+Uses Agentic Search (Tavily/LangGraph) to find competitors, market stats, and pricing.
+"""
+
+from typing import List, Dict, Any, Optional
+import time
+
+class DeepResearchAgent:
+    def __init__(self, api_key: Optional[str] = None, use_mock: bool = True):
+        self.api_key = api_key
+        self.use_mock = use_mock
+        # TODO: Initialize LangGraph / Tavily client here
+
+    def search_entities(self, domain: str, entity_type: str, limit: int = 10) -> List[Dict[str, Any]]:
+        """
+        Finds real-world entities for a given domain.
+        E.g. domain="Fitness App", entity_type="Competitors" -> Returns ["Strava", "MyFitnessPal", ...]
+        """
+        if self.use_mock:
+            return self._mock_search(domain, entity_type, limit)
+
+        # TODO: Implement Real Search
+        return []
+
+    def search_market_stats(self, domain: str) -> Dict[str, Any]:
+        """
+        Finds market stats (average price, market size).
+        """
+        if self.use_mock:
+            return {
+                "market_size": "5B",
+                "avg_price_monthly": 14.99,
+                "cagr": "12%"
+            }
+        return {}
+
+    def _mock_search(self, domain: str, entity_type: str, limit: int) -> List[Dict[str, Any]]:
+        """Returns plausible fake data for demo purposes."""
+        print(f"🕵️‍♂️ [Agent] Mock Researching: {entity_type} in {domain}...")
+        time.sleep(1.0)  # Simulate latency
+
+        domain_lower = domain.lower()
+
+        if "fitness" in domain_lower:
+            return [
+                {"name": "Strava", "revenue": "200M", "users": "100M"},
+                {"name": "MyFitnessPal", "revenue": "150M", "users": "80M"},
+                {"name": "Nike Run Club", "revenue": "N/A", "users": "50M"},
+                {"name": "Peloton", "revenue": "2B", "users": "10M"},
+            ][:limit]
+
+        elif "ecommerce" in domain_lower or "retail" in domain_lower:
+            return [
+                {"name": "Amazon", "revenue": "500B"},
+                {"name": "Shopify", "revenue": "5B"},
+                {"name": "Walmart", "revenue": "600B"},
+            ][:limit]
+
+        elif "saas" in domain_lower:
+            return [
+                {"name": "Salesforce", "revenue": "30B"},
+                {"name": "HubSpot", "revenue": "2B"},
+                {"name": "Atlassian", "revenue": "4B"},
+            ][:limit]
+
+        return [{"name": f"{domain} Competitor {i+1}"} for i in range(limit)]
+
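Real search is still a TODO in this release (the non-mock branches return empty results), so the agent is only usable in mock mode today. A minimal sketch of that path; all output values come from the hard-coded mock tables above:

from misata.research.agent import DeepResearchAgent

agent = DeepResearchAgent(use_mock=True)  # the default; no API key needed
competitors = agent.search_entities("Fitness App", "Competitors", limit=3)
stats = agent.search_market_stats("Fitness App")

print([c["name"] for c in competitors])  # ['Strava', 'MyFitnessPal', 'Nike Run Club']
print(stats["avg_price_monthly"])        # 14.99

Note the mock branch is selected by substring match on the lowercased domain string, so "Fitness App", "fitness tracker", etc. all hit the fitness table; unrecognized domains fall back to generic "Competitor N" placeholders.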
misata/schema.py
CHANGED
@@ -192,6 +192,29 @@ class ScenarioEvent(BaseModel):
     description: Optional[str] = None
 
 
+class OutcomeCurve(BaseModel):
+    """
+    Defines a temporal/seasonal pattern for a numeric column.
+
+    This is extracted from natural language descriptions like:
+    "Revenue with a dip in September and peak in December"
+
+    Attributes:
+        table: Table containing the column to constrain
+        column: Numeric column to apply the curve to
+        time_column: Date/time column for grouping
+        pattern_type: Type of pattern (seasonal, growth, decline, etc.)
+        description: Human-readable description of the pattern
+        curve_points: Monthly relative values (0.0-1.0)
+    """
+    table: str
+    column: str
+    time_column: str = "date"
+    pattern_type: str = "seasonal"
+    description: Optional[str] = None
+    curve_points: List[Dict[str, float]] = Field(default_factory=list)
+
+
 class SchemaConfig(BaseModel):
     """
     Complete configuration for synthetic data generation.
@@ -206,6 +229,7 @@ class SchemaConfig(BaseModel):
         columns: Mapping of table names to their column definitions
         relationships: List of inter-table relationships
         events: List of scenario events to apply
+        outcome_curves: List of temporal patterns for constrained generation
         seed: Random seed for reproducibility
     """
 
@@ -215,6 +239,7 @@ class SchemaConfig(BaseModel):
     columns: Dict[str, List[Column]]
     relationships: List[Relationship] = Field(default_factory=list)
     events: List[ScenarioEvent] = Field(default_factory=list)
+    outcome_curves: List[OutcomeCurve] = Field(default_factory=list)
    seed: Optional[int] = None
 
     @field_validator("columns")
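A minimal sketch of constructing the new model. Field names come straight from the diff, but the schema does not pin down the key names inside each curve_points dict (it only requires List[Dict[str, float]]), so the {"month": ..., "value": ...} shape below is one plausible convention, not a documented one:

from misata.schema import OutcomeCurve

curve = OutcomeCurve(
    table="sales",
    column="revenue",
    time_column="date",
    pattern_type="seasonal",
    description="Revenue with a dip in September and peak in December",
    # ASSUMPTION: per-point key names are illustrative only; the model
    # accepts any Dict[str, float] entries.
    curve_points=[{"month": 9.0, "value": 0.4}, {"month": 12.0, "value": 1.0}],
)

Because SchemaConfig gains outcome_curves with an empty-list default, existing configs written against 0.3.0b0 continue to validate unchanged.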
|