misata-0.1.0b0-py3-none-any.whl

misata/noise.py ADDED
@@ -0,0 +1,346 @@
+ """
+ Noise injection module for realistic ML training data.
+
+ Adds real-world imperfections to synthetic data:
+ - Missing values (nulls/NaN)
+ - Outliers
+ - Typos and data entry errors
+ - Duplicates and near-duplicates
+ - Distribution drift over time
+ """
+
+ import random
+ import string
+ from typing import Any, Dict, List, Optional
+
+ import numpy as np
+ import pandas as pd
+
+
+ class NoiseInjector:
+     """
+     Inject realistic noise and imperfections into synthetic data.
+
+     Makes data suitable for ML training by adding real-world issues:
+     - Missing values at configurable rates
+     - Statistical outliers
+     - Typos in text fields
+     - Duplicate rows
+     - Temporal distribution shifts
+
+     Usage:
+         injector = NoiseInjector(seed=42)
+         noisy_df = injector.apply(df, config={
+             "null_rate": 0.05,
+             "outlier_rate": 0.02,
+             "typo_rate": 0.01,
+             "duplicate_rate": 0.03,
+         })
+     """
+
+     def __init__(self, seed: Optional[int] = None):
+         """Initialize with optional random seed for reproducibility."""
+         self.rng = np.random.default_rng(seed)
+         self.py_rng = random.Random(seed)
+
+     def apply(
+         self,
+         df: pd.DataFrame,
+         config: Optional[Dict[str, Any]] = None,
+     ) -> pd.DataFrame:
+         """
+         Apply all configured noise types to a DataFrame.
+
+         Args:
+             df: Input DataFrame
+             config: Noise configuration dict with rates for each type
+
+         Returns:
+             DataFrame with noise applied
+         """
+         if config is None:
+             config = {}
+
+         result = df.copy()
+
+         # Apply each noise type
+         if config.get("null_rate", 0) > 0:
+             result = self.inject_nulls(result, rate=config["null_rate"],
+                                        columns=config.get("null_columns"))
+
+         if config.get("outlier_rate", 0) > 0:
+             result = self.inject_outliers(result, rate=config["outlier_rate"],
+                                           columns=config.get("outlier_columns"))
+
+         if config.get("typo_rate", 0) > 0:
+             result = self.inject_typos(result, rate=config["typo_rate"],
+                                        columns=config.get("typo_columns"))
+
+         if config.get("duplicate_rate", 0) > 0:
+             result = self.inject_duplicates(result, rate=config["duplicate_rate"])
+
+         return result
+
+     def inject_nulls(
+         self,
+         df: pd.DataFrame,
+         rate: float = 0.05,
+         columns: Optional[List[str]] = None,
+     ) -> pd.DataFrame:
+         """
+         Inject null/missing values at a specified rate.
+
+         Args:
+             df: Input DataFrame
+             rate: Probability of any cell becoming null (0.0-1.0)
+             columns: Specific columns to apply to (default: all except ID columns)
+
+         Returns:
+             DataFrame with nulls injected
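+
+         Example (illustrative; "age" is an arbitrary non-ID column):
+             injector = NoiseInjector(seed=42)
+             sparse_df = injector.inject_nulls(df, rate=0.1, columns=["age"])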
+ """
101
+ result = df.copy()
102
+
103
+ # Default: skip ID columns
104
+ if columns is None:
105
+ columns = [c for c in df.columns if not c.endswith('_id') and c != 'id']
106
+
107
+ for col in columns:
108
+ if col not in result.columns:
109
+ continue
110
+
111
+ mask = self.rng.random(len(result)) < rate
112
+ result.loc[mask, col] = np.nan
113
+
114
+ return result
115
+
116
+ def inject_outliers(
117
+ self,
118
+ df: pd.DataFrame,
119
+ rate: float = 0.02,
120
+ columns: Optional[List[str]] = None,
121
+ multiplier: float = 5.0,
122
+ ) -> pd.DataFrame:
123
+ """
124
+ Inject statistical outliers into numeric columns.
125
+
126
+ Args:
127
+ df: Input DataFrame
128
+ rate: Probability of any numeric cell becoming an outlier
129
+ columns: Specific columns (default: all numeric)
130
+ multiplier: How extreme the outliers should be (times std dev)
131
+
132
+ Returns:
133
+ DataFrame with outliers injected
134
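+
+         Example (illustrative; "revenue" is an arbitrary numeric column):
+             wild_df = injector.inject_outliers(df, rate=0.05, columns=["revenue"], multiplier=10.0)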
+ """
135
+ result = df.copy()
136
+
137
+ # Default: all numeric columns
138
+ if columns is None:
139
+ columns = result.select_dtypes(include=[np.number]).columns.tolist()
140
+ columns = [c for c in columns if not c.endswith('_id') and c != 'id']
141
+
142
+ for col in columns:
143
+ if col not in result.columns:
144
+ continue
145
+
146
+ series = result[col]
147
+ if not np.issubdtype(series.dtype, np.number):
148
+ continue
149
+
150
+ mean = series.mean()
151
+ std = series.std()
152
+
153
+ if std == 0 or np.isnan(std):
154
+ continue
155
+
156
+ mask = self.rng.random(len(result)) < rate
157
+ n_outliers = mask.sum()
158
+
159
+ if n_outliers > 0:
160
+ # Generate outliers above or below mean
161
+ direction = self.rng.choice([-1, 1], size=n_outliers)
162
+ outlier_values = mean + direction * multiplier * std * (1 + self.rng.random(n_outliers))
163
+ result.loc[mask, col] = outlier_values
164
+
165
+ return result
166
+
167
+ def inject_typos(
168
+ self,
169
+ df: pd.DataFrame,
170
+ rate: float = 0.01,
171
+ columns: Optional[List[str]] = None,
172
+ ) -> pd.DataFrame:
173
+ """
174
+ Inject typos into text columns.
175
+
176
+ Typo types:
177
+ - Character swap
178
+ - Character deletion
179
+ - Character insertion
180
+ - Case change
181
+
182
+ Args:
183
+ df: Input DataFrame
184
+ rate: Probability of any text cell getting a typo
185
+ columns: Specific columns (default: all object/string)
186
+
187
+ Returns:
188
+ DataFrame with typos injected
189
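+
+         Example (illustrative; "city" is an arbitrary text column):
+             messy_df = injector.inject_typos(df, rate=0.02, columns=["city"])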
+ """
190
+ result = df.copy()
191
+
192
+ # Default: all text columns
193
+ if columns is None:
194
+ columns = result.select_dtypes(include=['object', 'string']).columns.tolist()
195
+ # Skip columns that look like IDs or structured data
196
+ columns = [c for c in columns if 'id' not in c.lower() and 'email' not in c.lower()]
197
+
198
+ for col in columns:
199
+ if col not in result.columns:
200
+ continue
201
+
202
+ mask = self.rng.random(len(result)) < rate
203
+
204
+ for idx in result.index[mask]:
205
+ value = result.at[idx, col]
206
+ if pd.isna(value) or not isinstance(value, str) or len(value) < 2:
207
+ continue
208
+
209
+ result.at[idx, col] = self._add_typo(value)
210
+
211
+ return result
212
+
213
+ def _add_typo(self, text: str) -> str:
214
+ """Add a single typo to a text string."""
215
+ if len(text) < 2:
216
+ return text
217
+
218
+ typo_type = self.py_rng.choice(['swap', 'delete', 'insert', 'case'])
219
+ chars = list(text)
220
+ pos = self.py_rng.randint(0, len(chars) - 1)
221
+
222
+ if typo_type == 'swap' and pos < len(chars) - 1:
223
+ chars[pos], chars[pos + 1] = chars[pos + 1], chars[pos]
224
+ elif typo_type == 'delete':
225
+ chars.pop(pos)
226
+ elif typo_type == 'insert':
227
+ chars.insert(pos, self.py_rng.choice(string.ascii_lowercase))
228
+ elif typo_type == 'case':
229
+ chars[pos] = chars[pos].swapcase()
230
+
231
+ return ''.join(chars)
232
+
233
+ def inject_duplicates(
234
+ self,
235
+ df: pd.DataFrame,
236
+ rate: float = 0.03,
237
+ exact: bool = True,
238
+ ) -> pd.DataFrame:
239
+ """
240
+ Inject duplicate rows.
241
+
242
+ Args:
243
+ df: Input DataFrame
244
+ rate: Rate of rows to duplicate
245
+ exact: If True, exact duplicates. If False, near-duplicates with slight variations.
246
+
247
+ Returns:
248
+ DataFrame with duplicates added
249
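+
+         Example (illustrative):
+             padded_df = injector.inject_duplicates(df, rate=0.05, exact=False)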
+ """
250
+ n_duplicates = int(len(df) * rate)
251
+
252
+ if n_duplicates == 0:
253
+ return df
254
+
255
+ # Select random rows to duplicate
256
+ dup_indices = self.rng.choice(df.index, size=n_duplicates, replace=True)
257
+ duplicates = df.loc[dup_indices].copy()
258
+
259
+ if not exact:
260
+ # Add slight variations to numeric columns
261
+ for col in duplicates.select_dtypes(include=[np.number]).columns:
262
+ if col.endswith('_id') or col == 'id':
263
+ continue
264
+ noise = self.rng.normal(0, 0.01, len(duplicates))
265
+ duplicates[col] = duplicates[col] * (1 + noise)
266
+
267
+ return pd.concat([df, duplicates], ignore_index=True)
268
+
269
+ def apply_temporal_drift(
270
+ self,
271
+ df: pd.DataFrame,
272
+ date_column: str,
273
+ value_column: str,
274
+ drift_rate: float = 0.1,
275
+ drift_direction: str = "up",
276
+ ) -> pd.DataFrame:
277
+ """
278
+ Apply temporal distribution drift to simulate changing trends.
279
+
280
+ Args:
281
+ df: Input DataFrame
282
+ date_column: Column containing dates
283
+ value_column: Numeric column to apply drift to
284
+ drift_rate: Rate of drift (0.1 = 10% change over time range)
285
+ drift_direction: "up" for increasing, "down" for decreasing
286
+
287
+ Returns:
288
+ DataFrame with temporal drift applied
289
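+
+         Example (illustrative; "date" and "revenue" are arbitrary columns):
+             trending_df = injector.apply_temporal_drift(df, "date", "revenue", drift_rate=0.2)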
+ """
290
+ result = df.copy()
291
+
292
+ if date_column not in result.columns or value_column not in result.columns:
293
+ return result
294
+
295
+ dates = pd.to_datetime(result[date_column])
296
+ min_date = dates.min()
297
+ max_date = dates.max()
298
+
299
+ if min_date == max_date:
300
+ return result
301
+
302
+ # Normalize dates to 0-1 range
303
+ time_fraction = (dates - min_date) / (max_date - min_date)
304
+
305
+ # Calculate drift multiplier
306
+ multiplier = 1 + (drift_rate * time_fraction if drift_direction == "up"
307
+ else -drift_rate * time_fraction)
308
+
309
+ result[value_column] = result[value_column] * multiplier
310
+
311
+ return result
312
+
313
+
314
+ # Convenience function
315
+ def add_noise(
316
+ df: pd.DataFrame,
317
+ null_rate: float = 0.0,
318
+ outlier_rate: float = 0.0,
319
+ typo_rate: float = 0.0,
320
+ duplicate_rate: float = 0.0,
321
+ seed: Optional[int] = None,
322
+ ) -> pd.DataFrame:
323
+ """
324
+ Convenience function to add noise to a DataFrame.
325
+
326
+ Args:
327
+ df: Input DataFrame
328
+ null_rate: Rate of null value injection (0.0-1.0)
329
+ outlier_rate: Rate of outlier injection
330
+ typo_rate: Rate of typo injection in text
331
+ duplicate_rate: Rate of duplicate rows
332
+ seed: Random seed for reproducibility
333
+
334
+ Returns:
335
+ DataFrame with noise applied
336
+
337
+ Example:
338
+ noisy_df = add_noise(df, null_rate=0.05, outlier_rate=0.02)
339
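+
+         # Pass a seed for reproducible noise:
+         noisy_df = add_noise(df, null_rate=0.05, outlier_rate=0.02, seed=42)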
+ """
340
+ injector = NoiseInjector(seed=seed)
341
+ return injector.apply(df, config={
342
+ "null_rate": null_rate,
343
+ "outlier_rate": outlier_rate,
344
+ "typo_rate": typo_rate,
345
+ "duplicate_rate": duplicate_rate,
346
+ })
misata/schema.py ADDED
@@ -0,0 +1,252 @@
+ """
+ Pydantic models for Misata configuration.
+
+ These models define the blueprint for synthetic data generation,
+ including tables, columns, relationships, and scenario events.
+ """
+
+ from typing import Any, Dict, List, Literal, Optional, Union
+
+ from pydantic import BaseModel, Field, field_validator
+
+
+ class Column(BaseModel):
+     """
+     Defines a single column in a table.
+
+     Attributes:
+         name: Column name
+         type: Data type (int, float, date, categorical, foreign_key, text, boolean)
+         distribution_params: Parameters for data generation (mean, std, choices, etc.)
+         nullable: Whether the column can contain NULL values
+         unique: Whether values must be unique
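+
+     Example (illustrative):
+         Column(name="age", type="int", distribution_params={"mean": 35, "std": 10})
+         Column(name="plan", type="categorical", distribution_params={"choices": ["free", "pro"]})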
+ """
24
+
25
+ name: str
26
+ type: Literal["int", "float", "date", "categorical", "foreign_key", "text", "boolean"]
27
+ distribution_params: Dict[str, Any] = Field(default_factory=dict)
28
+ nullable: bool = False
29
+ unique: bool = False
30
+
31
+ @field_validator("distribution_params")
32
+ @classmethod
33
+ def validate_params(cls, v: Dict[str, Any], info: Any) -> Dict[str, Any]:
34
+ """Validate distribution parameters based on column type."""
35
+ col_type = info.data.get("type")
36
+
37
+ if col_type == "categorical" and "choices" not in v:
38
+ raise ValueError("Categorical columns must have 'choices' in distribution_params")
39
+
40
+ if col_type == "date":
41
+ if "relative_to" not in v:
42
+ if "start" not in v or "end" not in v:
43
+ raise ValueError("Date columns must have 'start' and 'end' OR 'relative_to' in distribution_params")
44
+
45
+ if col_type in ["int", "float"]:
46
+ if "distribution" not in v:
47
+ v["distribution"] = "normal" # Default to normal distribution
48
+
49
+ return v
50
+
51
+
52
+ class Table(BaseModel):
53
+ """
54
+ Defines a table to be generated.
55
+
56
+ Tables can be either:
57
+ - Reference tables: Small lookup tables with LLM-generated actual data (exercises, plans)
58
+ - Transactional tables: Mass-generated tables using foreign keys to reference tables
59
+
60
+ Attributes:
61
+ name: Table name
62
+ row_count: Number of rows to generate (ignored if inline_data is provided)
63
+ description: Optional description of the table's purpose
64
+ is_reference: If True, this is a lookup/reference table
65
+ inline_data: Actual data rows for reference tables (list of dicts)
66
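+
+     Example (illustrative):
+         Table(name="plans", is_reference=True,
+               inline_data=[{"plan_id": 1, "name": "free"}, {"plan_id": 2, "name": "pro"}])
+         Table(name="subscriptions", row_count=10_000)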
+ """
67
+
68
+ name: str
69
+ row_count: int = Field(default=100, gt=0)
70
+ description: Optional[str] = None
71
+ is_reference: bool = False
72
+ inline_data: Optional[List[Dict[str, Any]]] = None
73
+ constraints: List["Constraint"] = Field(default_factory=list)
74
+
75
+
76
+
77
+ class Relationship(BaseModel):
78
+ """
79
+ Defines a parent-child relationship between tables.
80
+
81
+ Ensures referential integrity by constraining child foreign keys
82
+ to existing parent primary keys.
83
+
84
+ Attributes:
85
+ parent_table: Name of the parent table
86
+ child_table: Name of the child table
87
+ parent_key: Column name in parent table (usually primary key)
88
+ child_key: Column name in child table (foreign key)
89
+ temporal_constraint: If True, child events must occur after parent events
90
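+
+     Example (illustrative):
+         Relationship(parent_table="users", child_table="orders",
+                      parent_key="user_id", child_key="user_id")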
+ """
91
+
92
+ parent_table: str
93
+ child_table: str
94
+ parent_key: str
95
+ child_key: str
96
+ temporal_constraint: bool = False
97
+ filters: Optional[Dict[str, Any]] = None # e.g., {"status": "active"}
98
+
99
+
100
+ class Constraint(BaseModel):
101
+ """
102
+ Defines a business rule constraint to enforce during generation.
103
+
104
+ Constraints are applied after generating a batch to ensure data
105
+ adheres to real-world business rules.
106
+
107
+ Attributes:
108
+ name: Descriptive name of the constraint
109
+ type: Type of constraint (max_per_group, min_per_group, unique_combination, sum_limit)
110
+ group_by: List of columns to group by (e.g., ["employee_id", "date"])
111
+ column: The column to constrain
112
+ value: The constraint value (e.g., 8 for max 8 hours)
113
+ action: What to do when constraint is violated (cap, redistribute, error)
114
+
115
+ Examples:
116
+ # Max 8 hours per employee per day
117
+ Constraint(
118
+ name="max_daily_hours",
119
+ type="max_per_group",
120
+ group_by=["employee_id", "date"],
121
+ column="hours",
122
+ value=8,
123
+ action="cap"
124
+ )
125
+
126
+ # Each employee-project-date combination must be unique
127
+ Constraint(
128
+ name="unique_timesheet_entry",
129
+ type="unique_combination",
130
+ group_by=["employee_id", "project_id", "date"],
131
+ action="drop"
132
+ )
133
+ """
134
+
135
+ name: str
136
+ type: Literal["max_per_group", "min_per_group", "sum_limit", "unique_combination"]
137
+ group_by: List[str] = Field(default_factory=list)
138
+ column: Optional[str] = None # Not needed for unique_combination
139
+ value: Optional[float] = None # The limit value
140
+ action: Literal["cap", "redistribute", "drop", "error"] = "cap"
141
+
+
+ class ScenarioEvent(BaseModel):
+     """
+     Defines a time-based or conditional modifier to apply to data.
+
+     This is the "story" layer - events that force data to follow
+     specific patterns (growth, crashes, seasonality, etc.).
+
+     Attributes:
+         name: Descriptive name of the event
+         table: Table to apply the event to
+         column: Column to modify
+         condition: Python expression evaluated on the DataFrame (e.g., "date > '2023-11-01'")
+         modifier_type: Type of modification (multiply, add, set, function)
+         modifier_value: Value or function to apply
+         description: Optional description of what this event represents
+
+     Examples:
+         # Revenue crash
+         ScenarioEvent(
+             name="Q3_Revenue_Crash",
+             table="sales",
+             column="revenue",
+             condition="date >= '2023-07-01' and date < '2023-10-01'",
+             modifier_type="multiply",
+             modifier_value=0.5
+         )
+
+         # Flag early signups as churned
+         ScenarioEvent(
+             name="Churn_Flag",
+             table="users",
+             column="churned",
+             condition="signup_date < '2023-06-01'",
+             modifier_type="set",
+             modifier_value=True
+         )
+     """
+
+     name: str
+     table: str
+     column: str
+     condition: str
+     modifier_type: Literal["multiply", "add", "set", "function"]
+     modifier_value: Union[int, float, str, bool]
+     description: Optional[str] = None
+
+
+ class SchemaConfig(BaseModel):
+     """
+     Complete configuration for synthetic data generation.
+
+     This is the root configuration object that defines all tables,
+     columns, relationships, and scenario events.
+
+     Attributes:
+         name: Name of the dataset/scenario
+         description: Description of what this data represents
+         tables: List of tables to generate
+         columns: Mapping of table names to their column definitions
+         relationships: List of inter-table relationships
+         events: List of scenario events to apply
+         seed: Random seed for reproducibility
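+
+     Example (illustrative; a minimal single-table schema):
+         config = SchemaConfig(
+             name="demo",
+             tables=[Table(name="users", row_count=100)],
+             columns={"users": [Column(name="age", type="int")]},
+             seed=42,
+         )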
+ """
206
+
207
+ name: str
208
+ description: Optional[str] = None
209
+ tables: List[Table]
210
+ columns: Dict[str, List[Column]]
211
+ relationships: List[Relationship] = Field(default_factory=list)
212
+ events: List[ScenarioEvent] = Field(default_factory=list)
213
+ seed: Optional[int] = None
214
+
215
+ @field_validator("columns")
216
+ @classmethod
217
+ def validate_columns(cls, v: Dict[str, List[Column]], info: Any) -> Dict[str, List[Column]]:
218
+ """Ensure all tables have column definitions."""
219
+ tables = info.data.get("tables", [])
220
+ table_names = {t.name for t in tables}
221
+
222
+ for table_name in table_names:
223
+ if table_name not in v:
224
+ raise ValueError(f"Table '{table_name}' has no column definitions")
225
+
226
+ return v
227
+
228
+ @field_validator("relationships")
229
+ @classmethod
230
+ def validate_relationships(cls, v: List[Relationship], info: Any) -> List[Relationship]:
231
+ """Ensure relationship references exist."""
232
+ tables = info.data.get("tables", [])
233
+ table_names = {t.name for t in tables}
234
+
235
+ for rel in v:
236
+ if rel.parent_table not in table_names:
237
+ raise ValueError(f"Parent table '{rel.parent_table}' not found in schema")
238
+ if rel.child_table not in table_names:
239
+ raise ValueError(f"Child table '{rel.child_table}' not found in schema")
240
+
241
+ return v
242
+
243
+ def get_table(self, name: str) -> Optional[Table]:
244
+ """Get a table by name."""
245
+ for table in self.tables:
246
+ if table.name == name:
247
+ return table
248
+ return None
249
+
250
+ def get_columns(self, table_name: str) -> List[Column]:
251
+ """Get columns for a specific table."""
252
+ return self.columns.get(table_name, [])