misata 0.1.0b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
misata/codegen.py ADDED
@@ -0,0 +1,153 @@
1
+ """
2
+ Code generation module for creating standalone Python scripts.
3
+
4
+ Generates executable Python scripts from SchemaConfig that can be
5
+ run independently to produce synthetic data.
6
+ """
7
+
8
+ import json
9
+ from pathlib import Path
10
+
11
+ from misata.schema import SchemaConfig
12
+
13
+
14
class ScriptGenerator:
    """
    Generates standalone Python scripts for data generation.

    NOTE(review): the emitted script is not fully self-contained — its
    import block includes ``from misata.generators import TextGenerator``
    and its placeholder body tells the user to ``pip install misata``, so
    Misata must still be importable when the generated script runs.
    """

    def __init__(self, config: SchemaConfig):
        """
        Initialize the script generator.

        Args:
            config: Schema configuration to generate script from
        """
        self.config = config

    def _generate_imports(self) -> str:
        """Generate import statements for the emitted script.

        Returns the literal import block written verbatim at the top of the
        generated file (the string contents are part of program output —
        do not reformat).
        """
        return """import pandas as pd
import numpy as np
import warnings
from collections import defaultdict, deque
import os

# Pure Python text generator (no external dependencies)
from misata.generators import TextGenerator
"""

    def _generate_config_dict(self) -> str:
        """Generate the configuration as a Python assignment.

        Serializes the pydantic model to a plain dict and embeds it as a
        JSON-formatted ``CONFIG = {...}`` statement in the emitted script.
        """
        config_dict = self.config.model_dump()
        return f"CONFIG = {json.dumps(config_dict, indent=2)}"

    def _generate_simulator_class(self) -> str:
        """Generate the DataSimulator class code.

        NOTE(review): this helper is never called by ``generate()``; it
        appears to be groundwork for inlining the simulator source later.
        """
        # Read the simulator.py file and extract the class
        simulator_path = Path(__file__).parent / "simulator.py"
        with open(simulator_path, "r") as f:
            content = f.read()

        # Extract just the class definition (simplified - in production use AST)
        # For now, include the entire simulator module.  The replace() only
        # comments out the schema import so the inlined copy doesn't need it.
        return content.replace("from misata.schema import", "# from misata.schema import")

    def generate(self, output_path: str, include_export: bool = True) -> None:
        """
        Generate a standalone Python script.

        Builds the script line-by-line (header docstring, imports, CONFIG
        block, placeholder ``main()``) and writes it to ``output_path``,
        creating parent directories as needed.

        Args:
            output_path: Path where the script should be saved
            include_export: Whether to include CSV export code at the end
        """
        script_parts = []

        # Header comment — module docstring of the emitted script.
        script_parts.append('"""')
        script_parts.append("Auto-generated synthetic data script by Misata")
        script_parts.append(f"Dataset: {self.config.name}")
        if self.config.description:
            script_parts.append(f"Description: {self.config.description}")
        script_parts.append('"""')
        script_parts.append("")

        # Imports
        script_parts.append(self._generate_imports())
        script_parts.append("")

        # Configuration
        script_parts.append("# " + "=" * 70)
        script_parts.append("# CONFIGURATION")
        script_parts.append("# " + "=" * 70)
        script_parts.append(self._generate_config_dict())
        script_parts.append("")

        # Main execution — currently a placeholder that seeds RNGs and
        # instructs the user to use the Misata library directly.
        script_parts.append("# " + "=" * 70)
        script_parts.append("# MAIN EXECUTION")
        script_parts.append("# " + "=" * 70)
        script_parts.append("")
        script_parts.append("def main():")
        script_parts.append("    \"\"\"Generate synthetic data and export to CSV.\"\"\"")
        script_parts.append("    ")
        script_parts.append("    # Note: This is a simplified version.")
        script_parts.append("    # For full functionality, use the Misata library directly.")
        script_parts.append("    ")
        script_parts.append("    print('Generating synthetic data...')")
        script_parts.append("    print(f'Dataset: {CONFIG[\"name\"]}')")
        script_parts.append("    print(f'Tables: {len(CONFIG[\"tables\"])}')")
        script_parts.append("    ")
        script_parts.append("    # Initialize random seed")
        script_parts.append("    seed = CONFIG.get('seed', 42)")
        script_parts.append("    np.random.seed(seed)")
        script_parts.append("    rng = np.random.default_rng(seed)")
        script_parts.append("    text_gen = TextGenerator(seed=seed)")
        script_parts.append("    ")
        script_parts.append("    # TODO: Import and use DataSimulator from Misata")
        script_parts.append("    # For now, this is a placeholder")
        script_parts.append("    print('Please install misata library:')")
        script_parts.append("    print('    pip install misata')")
        script_parts.append("    print('Then use:')")
        script_parts.append("    print('    from misata import DataSimulator, SchemaConfig')")
        script_parts.append("    ")

        if include_export:
            # Emits directory-creation code only; no actual CSV export yet.
            script_parts.append("    # Export placeholder")
            script_parts.append("    output_dir = 'generated_data'")
            script_parts.append("    os.makedirs(output_dir, exist_ok=True)")
            script_parts.append("    print(f'Output directory: {output_dir}')")

        script_parts.append("")
        script_parts.append("")
        script_parts.append("if __name__ == '__main__':")
        script_parts.append("    main()")

        # Write to file, creating parent directories if necessary.
        output_path_obj = Path(output_path)
        output_path_obj.parent.mkdir(parents=True, exist_ok=True)
        output_path_obj.write_text("\n".join(script_parts))

        print(f"Generated script saved to: {output_path}")

    def generate_yaml_config(self, output_path: str) -> None:
        """
        Generate a YAML configuration file from the schema config.

        Args:
            output_path: Path where the YAML should be saved
        """
        # Imported lazily so PyYAML is only required when YAML output is used.
        import yaml

        config_dict = self.config.model_dump()

        output_path_obj = Path(output_path)
        output_path_obj.parent.mkdir(parents=True, exist_ok=True)

        # sort_keys=False preserves the schema's declared field order.
        with open(output_path_obj, "w") as f:
            yaml.dump(config_dict, f, default_flow_style=False, sort_keys=False)

        print(f"Generated YAML config saved to: {output_path}")
@@ -0,0 +1,106 @@
1
+
2
+ """
3
+ Mathematical Graph Solver for Misata.
4
+
5
+ This module uses scipy.optimize to find the best distribution parameters
6
+ that match a set of control points provided by the LLM or user.
7
+ This allows users/LLMs to "draw" a distribution shape rather than
8
+ guessing abstract parameters like alpha/beta/gamma.
9
+ """
10
+
11
+ from typing import Dict, List
12
+
13
+ import numpy as np
14
+ from scipy.optimize import minimize
15
+ from scipy.stats import norm, lognorm, expon, beta, gamma, uniform
16
+
17
class CurveFitter:
    """
    Fits statistical distributions to control points using optimization.

    Given a handful of (x, target-density) control points, searches for
    the distribution parameters whose PDF best matches them via
    least-squares Nelder-Mead minimization.
    """

    def __init__(self):
        """Initialize the curve fitter with the supported distribution map."""
        self.distributions = {
            "normal": norm,
            "lognormal": lognorm,
            "exponential": expon,
            "beta": beta,
            "gamma": gamma,
            "uniform": uniform
        }

    def fit_distribution(
        self,
        targets: List[Dict[str, float]],
        distribution_type: str = "normal"
    ) -> Dict[str, float]:
        """
        Find best parameters for a distribution to match target points.

        Args:
            targets: List of point dicts [{"x": 10, "y": 0.1}, ...]
                where x is the value and y is the desired PDF density at x.
            distribution_type: Name of distribution to fit (one of the
                keys in ``self.distributions``).

        Returns:
            Dictionary of best-fit named parameters
            (e.g., {"mean": 10, "std": 5}).

        Raises:
            ValueError: If ``distribution_type`` is not supported.
        """
        if distribution_type not in self.distributions:
            raise ValueError(f"Unsupported distribution: {distribution_type}")

        dist_func = self.distributions[distribution_type]
        points = np.array([(p["x"], p["y"]) for p in targets])
        x_vals = points[:, 0]
        y_targets = points[:, 1]

        def objective(params):
            """Mean squared error between candidate PDF and target densities.

            Maps the generic parameter vector onto each distribution's
            scipy signature; abs() keeps scale/shape parameters positive
            without constrained optimization.
            """
            try:
                if distribution_type == "normal":
                    # params[0] = mean (loc), params[1] = std (scale)
                    y_pred = dist_func.pdf(x_vals, loc=params[0], scale=abs(params[1]))
                elif distribution_type == "exponential":
                    # params[0] = scale (1/lambda)
                    y_pred = dist_func.pdf(x_vals, scale=abs(params[0]))
                elif distribution_type == "uniform":
                    # params[0] = min (loc), params[1] = range (scale)
                    y_pred = dist_func.pdf(x_vals, loc=params[0], scale=abs(params[1]))
                elif distribution_type == "lognormal":
                    # s = shape, scale = exp(mean of the underlying normal)
                    y_pred = dist_func.pdf(x_vals, s=abs(params[0]), scale=abs(params[1]))
                elif distribution_type == "gamma":
                    # params[0] = shape (a), params[1] = scale
                    y_pred = dist_func.pdf(x_vals, a=abs(params[0]), scale=abs(params[1]))
                elif distribution_type == "beta":
                    # params[0] = a, params[1] = b (support is [0, 1])
                    y_pred = dist_func.pdf(x_vals, a=abs(params[0]), b=abs(params[1]))
                else:
                    # Unreachable given the membership check above.
                    return 1e9

                return np.mean((y_pred - y_targets) ** 2)
            except Exception:
                # Invalid parameter region: penalize heavily so the
                # optimizer steers away from it.
                return 1e9

        # Initial guesses derived from the control-point x values.
        initial_guess = [np.mean(x_vals), np.std(x_vals)]
        if distribution_type == "exponential":
            initial_guess = [np.mean(x_vals)]
        elif distribution_type == "lognormal":
            initial_guess = [1.0, np.mean(x_vals)]
        elif distribution_type == "gamma":
            # shape ~2 with scale = mean/shape matches the sample mean.
            initial_guess = [2.0, max(np.mean(x_vals) / 2.0, 1e-6)]
        elif distribution_type == "beta":
            # Symmetric hump on [0, 1] as a neutral starting point.
            initial_guess = [2.0, 2.0]

        # Nelder-Mead: derivative-free, robust to the abs() kinks above.
        result = minimize(objective, initial_guess, method='Nelder-Mead')
        best_params = result.x

        # Map the optimizer's raw vector back to named parameters.
        if distribution_type == "normal":
            return {"mean": float(best_params[0]), "std": float(abs(best_params[1]))}
        elif distribution_type == "exponential":
            return {"scale": float(abs(best_params[0]))}
        elif distribution_type == "uniform":
            return {"min": float(best_params[0]), "max": float(best_params[0] + abs(best_params[1]))}
        elif distribution_type == "lognormal":
            return {"shape": float(abs(best_params[0])), "scale": float(abs(best_params[1]))}
        elif distribution_type == "gamma":
            return {"shape": float(abs(best_params[0])), "scale": float(abs(best_params[1]))}
        elif distribution_type == "beta":
            return {"a": float(abs(best_params[0])), "b": float(abs(best_params[1]))}

        return {}
@@ -0,0 +1,256 @@
1
+ """
2
+ Attribute customization module for fine-grained control over data generation.
3
+
4
+ Allows users to:
5
+ - Override column values with custom generators
6
+ - Apply conditional logic to values
7
+ - Define custom value pools per column
8
+ - Apply transformations post-generation
9
+ """
10
+
11
+ from typing import Any, Callable, Dict, List, Optional
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+
16
+
17
class ColumnOverride:
    """
    Custom generation logic for a single column.

    Usage:
        override = ColumnOverride(
            name="price",
            generator=lambda n: np.random.uniform(10, 100, n),
            post_process=lambda x: round(x, 2)
        )
    """

    def __init__(
        self,
        name: str,
        generator: Optional[Callable[[int], np.ndarray]] = None,
        value_pool: Optional[List[Any]] = None,
        conditional: Optional[Dict[str, Any]] = None,
        post_process: Optional[Callable[[Any], Any]] = None,
        null_rate: float = 0.0,
    ):
        """
        Initialize a column override.

        Args:
            name: Column name to override
            generator: Function that takes size N and returns N values
            value_pool: List of values to sample from
            conditional: Dict with {condition_column: {value: override_value}}
            post_process: Function to apply to each value after generation
            null_rate: Rate of null values to inject
        """
        self.name = name
        self.generator = generator
        self.value_pool = value_pool
        self.conditional = conditional
        self.post_process = post_process
        self.null_rate = null_rate

    def apply(self, df: pd.DataFrame, rng: np.random.Generator) -> pd.DataFrame:
        """Return a copy of *df* with this override applied to its column."""
        out = df.copy()
        n_rows = len(df)

        # Step 1: wholesale regeneration — a custom generator takes
        # precedence over sampling from a value pool.
        if self.generator is not None:
            out[self.name] = self.generator(n_rows)
        elif self.value_pool is not None:
            out[self.name] = rng.choice(self.value_pool, size=n_rows)

        # Step 2: conditional substitutions keyed on other columns.
        if self.conditional is not None:
            for trigger_col, replacements in self.conditional.items():
                if trigger_col not in out.columns:
                    continue
                for trigger_val, new_val in replacements.items():
                    out.loc[out[trigger_col] == trigger_val, self.name] = new_val

        # Step 3: per-value transformation.
        if self.post_process is not None:
            out[self.name] = out[self.name].apply(self.post_process)

        # Step 4: random null injection at the configured rate.
        if self.null_rate > 0:
            null_mask = rng.random(len(out)) < self.null_rate
            out.loc[null_mask, self.name] = np.nan

        return out
85
+
86
+
87
class Customizer:
    """
    Central customization engine for attribute-level control.

    Usage:
        customizer = Customizer()
        customizer.add_override("users", ColumnOverride(
            name="age",
            generator=lambda n: np.random.normal(35, 10, n).clip(18, 80).astype(int)
        ))
        customizer.add_conditional("orders", "shipping_cost", {
            "country": {"US": 5.99, "UK": 9.99, "CA": 7.99}
        })

        df = customizer.apply(df, "users")
    """

    def __init__(self, seed: Optional[int] = None):
        """Initialize the customizer with an optional RNG seed."""
        # Overrides are grouped per table and applied in insertion order.
        self.overrides: Dict[str, List[ColumnOverride]] = {}
        self.rng = np.random.default_rng(seed)

    def add_override(self, table: str, override: ColumnOverride) -> "Customizer":
        """Register *override* under *table*; returns self for chaining."""
        self.overrides.setdefault(table, []).append(override)
        return self

    def add_conditional(
        self,
        table: str,
        column: str,
        conditions: Dict[str, Dict[Any, Any]],
    ) -> "Customizer":
        """
        Add a conditional value override.

        Args:
            table: Table name
            column: Column to override
            conditions: {condition_column: {condition_value: new_value}}

        Example:
            customizer.add_conditional("products", "tax_rate", {
                "category": {"Electronics": 0.08, "Food": 0.0, "Clothing": 0.05}
            })
        """
        return self.add_override(table, ColumnOverride(name=column, conditional=conditions))

    def add_value_pool(
        self,
        table: str,
        column: str,
        values: List[Any],
        probabilities: Optional[List[float]] = None,
    ) -> "Customizer":
        """
        Add a custom value pool for a column.

        Args:
            table: Table name
            column: Column to override
            values: List of possible values
            probabilities: Optional weights (must sum to 1)
        """
        # Closure samples from the pool, weighted when probabilities given.
        def sample_pool(n):
            if probabilities:
                return self.rng.choice(values, size=n, p=probabilities)
            return self.rng.choice(values, size=n)

        return self.add_override(table, ColumnOverride(name=column, generator=sample_pool))

    def add_formula(
        self,
        table: str,
        column: str,
        formula: Callable[[pd.DataFrame], pd.Series],
    ) -> "Customizer":
        """
        Add a formula-based column using other columns.

        Args:
            table: Table name
            column: Column to create/override
            formula: Function that takes DataFrame and returns Series

        Example:
            customizer.add_formula("orders", "total",
                lambda df: df["quantity"] * df["unit_price"] * (1 + df["tax_rate"]))
        """
        # Specialized override whose apply() evaluates the formula against
        # the full frame instead of generating standalone values.
        class FormulaOverride(ColumnOverride):
            def __init__(self, name, formula_fn):
                super().__init__(name=name)
                self.formula_fn = formula_fn

            def apply(self, df: pd.DataFrame, rng: np.random.Generator) -> pd.DataFrame:
                frame = df.copy()
                frame[self.name] = self.formula_fn(frame)
                return frame

        return self.add_override(table, FormulaOverride(column, formula))

    def apply(self, df: pd.DataFrame, table: str) -> pd.DataFrame:
        """Apply all overrides registered for *table* to a copy of *df*."""
        transformed = df.copy()
        for override in self.overrides.get(table, []):
            transformed = override.apply(transformed, self.rng)
        return transformed
207
+
208
+
209
+ # Convenience functions for common patterns
210
+
211
def price_generator(min_val: float = 1.0, max_val: float = 1000.0, decimals: int = 2):
    """Create a price generator with realistic distribution.

    Samples uniformly in log-space (log-uniform), so cheap items are more
    frequent than expensive ones.
    """
    # Log-space bounds are invariant per generator; min is floored at 0.01
    # to keep the log finite.
    lo = np.log(max(min_val, 0.01))
    hi = np.log(max_val)

    def gen(n):
        # Uniform in log-space, mapped back and rounded to price precision.
        return np.round(np.exp(np.random.uniform(lo, hi, n)), decimals)

    return gen
221
+
222
+
223
def age_generator(mean: int = 35, std: int = 12, min_age: int = 18, max_age: int = 80):
    """Create an age generator with realistic distribution.

    Draws from a normal distribution, then truncates to [min_age, max_age]
    and casts to whole years.
    """
    def gen(n):
        return np.random.normal(mean, std, n).clip(min_age, max_age).astype(int)

    return gen
229
+
230
+
231
def rating_generator(min_rating: float = 1.0, max_rating: float = 5.0, skew: str = "positive"):
    """Create a rating generator with configurable skew.

    "positive" skew clusters ratings near the top (Beta(5,2)), "negative"
    near the bottom (Beta(2,5)); any other value gives uniform ratings.
    """
    span = max_rating - min_rating

    def gen(n):
        if skew == "positive":
            base = np.random.beta(5, 2, n)
        elif skew == "negative":
            base = np.random.beta(2, 5, n)
        else:
            # No skew requested: plain uniform over the rating range.
            return np.round(np.random.uniform(min_rating, max_rating, n), 1)
        # Stretch the unit-interval Beta sample onto the rating range.
        return np.round(base * span + min_rating, 1)

    return gen
243
+
244
+
245
def percentage_generator(realistic: bool = True):
    """Create a percentage generator.

    In realistic mode, values cluster around common round percentages with
    a little jitter; otherwise they are uniform on [0, 100].
    """
    # Anchor values that real-world percentages tend to cluster around.
    anchors = [0, 5, 10, 15, 20, 25, 30, 50, 75, 100]

    def gen(n):
        if not realistic:
            return np.random.uniform(0, 100, n)
        picked = np.random.choice(anchors, n)
        jitter = np.random.uniform(-2, 2, n)
        # Jitter can push anchors past the limits; clamp back to [0, 100].
        return np.clip(picked + jitter, 0, 100)

    return gen