misata 0.1.0b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +48 -0
- misata/api.py +460 -0
- misata/audit.py +415 -0
- misata/benchmark.py +376 -0
- misata/cli.py +680 -0
- misata/codegen.py +153 -0
- misata/curve_fitting.py +106 -0
- misata/customization.py +256 -0
- misata/feedback.py +433 -0
- misata/formulas.py +362 -0
- misata/generators.py +247 -0
- misata/hybrid.py +398 -0
- misata/llm_parser.py +493 -0
- misata/noise.py +346 -0
- misata/schema.py +252 -0
- misata/semantic.py +185 -0
- misata/simulator.py +742 -0
- misata/story_parser.py +425 -0
- misata/templates/__init__.py +444 -0
- misata/validation.py +313 -0
- misata-0.1.0b0.dist-info/METADATA +291 -0
- misata-0.1.0b0.dist-info/RECORD +25 -0
- misata-0.1.0b0.dist-info/WHEEL +5 -0
- misata-0.1.0b0.dist-info/entry_points.txt +2 -0
- misata-0.1.0b0.dist-info/top_level.txt +1 -0
misata/codegen.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Code generation module for creating standalone Python scripts.
|
|
3
|
+
|
|
4
|
+
Generates executable Python scripts from SchemaConfig that can be
|
|
5
|
+
run independently to produce synthetic data.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from misata.schema import SchemaConfig
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ScriptGenerator:
    """
    Generates standalone Python scripts for data generation.

    NOTE(review): the emitted script's import preamble still pulls
    ``TextGenerator`` from ``misata.generators`` and its ``main()`` is a
    placeholder that tells the user to install misata — so the script is
    not currently runnable without the library installed; confirm before
    documenting it as self-contained.
    """

    def __init__(self, config: "SchemaConfig"):
        """
        Initialize the script generator.

        Args:
            config: Schema configuration to generate script from
                (forward-reference annotation so the class can be defined
                without eagerly resolving the schema module).
        """
        self.config = config

    def _generate_imports(self) -> str:
        """Generate import statements for the emitted script."""
        return """import pandas as pd
import numpy as np
import warnings
from collections import defaultdict, deque
import os

# Pure Python text generator (no external dependencies)
from misata.generators import TextGenerator
"""

    def _generate_config_dict(self) -> str:
        """
        Generate the configuration as a Python dictionary assignment.

        Uses ``pprint.pformat`` instead of ``json.dumps`` because JSON
        renders ``None``/``True``/``False`` as ``null``/``true``/``false``,
        which is not valid Python source — any config containing those
        values would produce a script that fails to compile.
        """
        from pprint import pformat  # local import: avoids touching module deps

        config_dict = self.config.model_dump()
        return f"CONFIG = {pformat(config_dict, indent=2, sort_dicts=False)}"

    def _generate_simulator_class(self) -> str:
        """
        Generate the DataSimulator class code.

        Currently unused by :meth:`generate`; kept for future inlining of
        the simulator module into generated scripts.
        """
        # Read the simulator.py file and extract the class
        simulator_path = Path(__file__).parent / "simulator.py"
        with open(simulator_path, "r", encoding="utf-8") as f:
            content = f.read()

        # Extract just the class definition (simplified - in production use AST)
        # For now, include the entire simulator module
        return content.replace("from misata.schema import", "# from misata.schema import")

    def generate(self, output_path: str, include_export: bool = True) -> None:
        """
        Generate a standalone Python script.

        Args:
            output_path: Path where the script should be saved
            include_export: Whether to include CSV export code at the end
        """
        script_parts = []

        # Header comment
        script_parts.append('"""')
        script_parts.append("Auto-generated synthetic data script by Misata")
        script_parts.append(f"Dataset: {self.config.name}")
        if self.config.description:
            script_parts.append(f"Description: {self.config.description}")
        script_parts.append('"""')
        script_parts.append("")

        # Imports
        script_parts.append(self._generate_imports())
        script_parts.append("")

        # Configuration
        script_parts.append("# " + "=" * 70)
        script_parts.append("# CONFIGURATION")
        script_parts.append("# " + "=" * 70)
        script_parts.append(self._generate_config_dict())
        script_parts.append("")

        # Main execution
        script_parts.append("# " + "=" * 70)
        script_parts.append("# MAIN EXECUTION")
        script_parts.append("# " + "=" * 70)
        script_parts.append("")
        script_parts.append("def main():")
        script_parts.append("    \"\"\"Generate synthetic data and export to CSV.\"\"\"")
        script_parts.append("    ")
        script_parts.append("    # Note: This is a simplified version.")
        script_parts.append("    # For full functionality, use the Misata library directly.")
        script_parts.append("    ")
        script_parts.append("    print('Generating synthetic data...')")
        script_parts.append("    print(f'Dataset: {CONFIG[\"name\"]}')")
        script_parts.append("    print(f'Tables: {len(CONFIG[\"tables\"])}')")
        script_parts.append("    ")
        script_parts.append("    # Initialize random seed")
        script_parts.append("    seed = CONFIG.get('seed', 42)")
        script_parts.append("    np.random.seed(seed)")
        script_parts.append("    rng = np.random.default_rng(seed)")
        script_parts.append("    text_gen = TextGenerator(seed=seed)")
        script_parts.append("    ")
        script_parts.append("    # TODO: Import and use DataSimulator from Misata")
        script_parts.append("    # For now, this is a placeholder")
        script_parts.append("    print('Please install misata library:')")
        script_parts.append("    print('  pip install misata')")
        script_parts.append("    print('Then use:')")
        script_parts.append("    print('  from misata import DataSimulator, SchemaConfig')")
        script_parts.append("    ")

        if include_export:
            script_parts.append("    # Export placeholder")
            script_parts.append("    output_dir = 'generated_data'")
            script_parts.append("    os.makedirs(output_dir, exist_ok=True)")
            script_parts.append("    print(f'Output directory: {output_dir}')")

        script_parts.append("")
        script_parts.append("")
        script_parts.append("if __name__ == '__main__':")
        script_parts.append("    main()")

        # Write to file. Explicit UTF-8 avoids platform-default encodings;
        # the trailing newline makes the output a well-formed text file.
        output_path_obj = Path(output_path)
        output_path_obj.parent.mkdir(parents=True, exist_ok=True)
        output_path_obj.write_text("\n".join(script_parts) + "\n", encoding="utf-8")

        print(f"Generated script saved to: {output_path}")

    def generate_yaml_config(self, output_path: str) -> None:
        """
        Generate a YAML configuration file.

        Args:
            output_path: Path where the YAML should be saved
        """
        import yaml  # optional dependency: imported lazily, as in the original

        config_dict = self.config.model_dump()

        output_path_obj = Path(output_path)
        output_path_obj.parent.mkdir(parents=True, exist_ok=True)

        # Explicit encoding for reproducible output across platforms
        with open(output_path_obj, "w", encoding="utf-8") as f:
            yaml.dump(config_dict, f, default_flow_style=False, sort_keys=False)

        print(f"Generated YAML config saved to: {output_path}")
|
misata/curve_fitting.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
|
|
2
|
+
"""
|
|
3
|
+
Mathematical Graph Solver for Misata.
|
|
4
|
+
|
|
5
|
+
This module uses scipy.optimize to find the best distribution parameters
|
|
6
|
+
that match a set of control points provided by the LLM or user.
|
|
7
|
+
This allows users/LLMs to "draw" a distribution shape rather than
|
|
8
|
+
guessing abstract parameters like alpha/beta/gamma.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from typing import Dict, List
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
from scipy.optimize import minimize
|
|
15
|
+
from scipy.stats import norm, lognorm, expon, beta, gamma, uniform
|
|
16
|
+
|
|
17
|
+
class CurveFitter:
    """
    Fits statistical distributions to control points using optimization.

    Given (x, y) control points describing a desired probability density,
    finds the parameters whose PDF best matches the points in the
    least-squares sense, so users/LLMs can "draw" a shape instead of
    guessing abstract parameters.
    """

    def __init__(self):
        """Initialize the curve fitter with the supported distributions."""
        self.distributions = {
            "normal": norm,
            "lognormal": lognorm,
            "exponential": expon,
            "beta": beta,
            "gamma": gamma,
            "uniform": uniform,
        }

    def fit_distribution(
        self,
        targets: List[Dict[str, float]],
        distribution_type: str = "normal"
    ) -> Dict[str, float]:
        """
        Find best parameters for a distribution to match target points.

        Args:
            targets: List of point dicts [{"x": 10, "y": 0.1}, ...]
                where x is the value and y is the desired PDF density.
            distribution_type: Name of distribution to fit.

        Returns:
            Dictionary of best-fit parameters (e.g., {"mean": 10, "std": 5}).
            All six advertised distributions are now mapped; previously
            "beta" and "gamma" silently returned {}.

        Raises:
            ValueError: If the distribution is unsupported or targets is empty.
        """
        if distribution_type not in self.distributions:
            raise ValueError(f"Unsupported distribution: {distribution_type}")
        if not targets:
            raise ValueError("At least one target point is required")

        dist_func = self.distributions[distribution_type]
        points = np.array([(p["x"], p["y"]) for p in targets], dtype=float)
        x_vals = points[:, 0]
        y_targets = points[:, 1]

        def objective(params):
            """MSE between the candidate PDF and the target densities."""
            try:
                # Map the generic parameter vector onto each distribution's
                # scipy parameterization; abs() keeps scale/shape positive.
                if distribution_type == "normal":
                    y_pred = dist_func.pdf(x_vals, loc=params[0], scale=abs(params[1]))
                elif distribution_type == "exponential":
                    y_pred = dist_func.pdf(x_vals, scale=abs(params[0]))
                elif distribution_type == "uniform":
                    y_pred = dist_func.pdf(x_vals, loc=params[0], scale=abs(params[1]))
                elif distribution_type == "lognormal":
                    y_pred = dist_func.pdf(x_vals, s=abs(params[0]), scale=abs(params[1]))
                elif distribution_type == "gamma":
                    y_pred = dist_func.pdf(x_vals, a=abs(params[0]), scale=abs(params[1]))
                elif distribution_type == "beta":
                    y_pred = dist_func.pdf(x_vals, a=abs(params[0]), b=abs(params[1]))
                else:  # unreachable: guarded by the membership check above
                    return 1e9

                mse = np.mean((y_pred - y_targets) ** 2)
                # Penalize NaN/inf (e.g. points outside a bounded support)
                if not np.isfinite(mse):
                    return 1e9
                return mse
            except Exception:
                return 1e9

        # Initial guesses; the 1e-3 floors avoid degenerate zero scales
        # (e.g. a single control point has std == 0).
        initial_guess = [np.mean(x_vals), max(np.std(x_vals), 1e-3)]
        if distribution_type == "exponential":
            initial_guess = [max(np.mean(x_vals), 1e-3)]
        elif distribution_type == "lognormal":
            initial_guess = [1.0, max(np.mean(x_vals), 1e-3)]
        elif distribution_type == "gamma":
            # gamma mean = shape * scale; start at shape 2
            initial_guess = [2.0, max(np.mean(x_vals) / 2.0, 1e-3)]
        elif distribution_type == "beta":
            initial_guess = [2.0, 2.0]

        # Nelder-Mead: derivative-free, robust to the 1e9 penalty plateaus
        result = minimize(objective, initial_guess, method='Nelder-Mead')
        best_params = result.x

        # Map back to named parameters
        if distribution_type == "normal":
            return {"mean": float(best_params[0]), "std": float(abs(best_params[1]))}
        if distribution_type == "exponential":
            return {"scale": float(abs(best_params[0]))}
        if distribution_type == "uniform":
            return {"min": float(best_params[0]), "max": float(best_params[0] + abs(best_params[1]))}
        if distribution_type == "lognormal":
            return {"shape": float(abs(best_params[0])), "scale": float(abs(best_params[1]))}
        if distribution_type == "gamma":
            return {"shape": float(abs(best_params[0])), "scale": float(abs(best_params[1]))}
        # beta is the only remaining case
        return {"alpha": float(abs(best_params[0])), "beta": float(abs(best_params[1]))}
|
misata/customization.py
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Attribute customization module for fine-grained control over data generation.
|
|
3
|
+
|
|
4
|
+
Allows users to:
|
|
5
|
+
- Override column values with custom generators
|
|
6
|
+
- Apply conditional logic to values
|
|
7
|
+
- Define custom value pools per column
|
|
8
|
+
- Apply transformations post-generation
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
import pandas as pd
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ColumnOverride:
    """
    Custom generation logic for a single column.

    Usage:
        override = ColumnOverride(
            name="price",
            generator=lambda n: np.random.uniform(10, 100, n),
            post_process=lambda x: round(x, 2)
        )
    """

    def __init__(
        self,
        name: str,
        generator: Optional[Callable[[int], np.ndarray]] = None,
        value_pool: Optional[List[Any]] = None,
        conditional: Optional[Dict[str, Any]] = None,
        post_process: Optional[Callable[[Any], Any]] = None,
        null_rate: float = 0.0,
    ):
        """
        Store the override settings.

        Args:
            name: Column name to override
            generator: Callable mapping a row count N to N values
            value_pool: Values to sample from
            conditional: Mapping {condition_column: {value: override_value}}
            post_process: Applied element-wise after generation
            null_rate: Fraction of rows whose value is replaced with NaN
        """
        self.name = name
        self.generator = generator
        self.value_pool = value_pool
        self.conditional = conditional
        self.post_process = post_process
        self.null_rate = null_rate

    def apply(self, df: pd.DataFrame, rng: np.random.Generator) -> pd.DataFrame:
        """Return a copy of *df* with this override applied."""
        out = df.copy()
        n_rows = len(df)

        # Primary value source: an explicit generator wins over a value pool.
        if self.generator is not None:
            out[self.name] = self.generator(n_rows)
        elif self.value_pool is not None:
            out[self.name] = rng.choice(self.value_pool, size=n_rows)

        # Conditional overwrites keyed on other columns' values.
        for cond_col, value_map in (self.conditional or {}).items():
            if cond_col not in out.columns:
                continue
            for cond_value, new_value in value_map.items():
                out.loc[out[cond_col] == cond_value, self.name] = new_value

        # Element-wise post-processing hook.
        if self.post_process is not None:
            out[self.name] = out[self.name].apply(self.post_process)

        # Null injection at the requested rate.
        if self.null_rate > 0:
            null_mask = rng.random(len(out)) < self.null_rate
            out.loc[null_mask, self.name] = np.nan

        return out
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class Customizer:
    """
    Central customization engine for attribute-level control.

    Usage:
        customizer = Customizer()
        customizer.add_override("users", ColumnOverride(
            name="age",
            generator=lambda n: np.random.normal(35, 10, n).clip(18, 80).astype(int)
        ))
        customizer.add_conditional("orders", "shipping_cost", {
            "country": {"US": 5.99, "UK": 9.99, "CA": 7.99}
        })

        df = customizer.apply(df, "users")
    """

    def __init__(self, seed: Optional[int] = None):
        """Initialize the customizer with an optional reproducibility seed."""
        # Per-table lists of overrides, applied in registration order.
        self.overrides: Dict[str, List["ColumnOverride"]] = {}
        self.rng = np.random.default_rng(seed)

    def add_override(self, table: str, override: "ColumnOverride") -> "Customizer":
        """Add a column override for a table; returns self for chaining."""
        self.overrides.setdefault(table, []).append(override)
        return self

    def add_conditional(
        self,
        table: str,
        column: str,
        conditions: Dict[str, Dict[Any, Any]],
    ) -> "Customizer":
        """
        Add a conditional value override.

        Args:
            table: Table name
            column: Column to override
            conditions: {condition_column: {condition_value: new_value}}

        Example:
            customizer.add_conditional("products", "tax_rate", {
                "category": {"Electronics": 0.08, "Food": 0.0, "Clothing": 0.05}
            })
        """
        override = ColumnOverride(name=column, conditional=conditions)
        return self.add_override(table, override)

    def add_value_pool(
        self,
        table: str,
        column: str,
        values: List[Any],
        probabilities: Optional[List[float]] = None,
    ) -> "Customizer":
        """
        Add a custom value pool for a column.

        Args:
            table: Table name
            column: Column to override
            values: List of possible values
            probabilities: Optional weights (must match len(values) and sum to 1)

        Raises:
            ValueError: If probabilities are provided with the wrong length or
                a sum other than 1. Validating eagerly fails at setup time
                instead of later inside rng.choice() during generation.
        """
        if probabilities:
            if len(probabilities) != len(values):
                raise ValueError(
                    f"probabilities has {len(probabilities)} entries "
                    f"for {len(values)} values"
                )
            if abs(sum(probabilities) - 1.0) > 1e-6:
                raise ValueError("probabilities must sum to 1")

            def gen(n):
                return self.rng.choice(values, size=n, p=probabilities)
        else:
            def gen(n):
                return self.rng.choice(values, size=n)

        override = ColumnOverride(name=column, generator=gen)
        return self.add_override(table, override)

    def add_formula(
        self,
        table: str,
        column: str,
        formula: Callable[[pd.DataFrame], pd.Series],
    ) -> "Customizer":
        """
        Add a formula-based column using other columns.

        Args:
            table: Table name
            column: Column to create/override
            formula: Function that takes DataFrame and returns Series

        Example:
            customizer.add_formula("orders", "total",
                lambda df: df["quantity"] * df["unit_price"] * (1 + df["tax_rate"]))
        """
        # Specialized override whose apply() evaluates the formula against
        # the whole frame instead of generating independent values.
        class FormulaOverride(ColumnOverride):
            def __init__(self, name, formula_fn):
                super().__init__(name=name)
                self.formula_fn = formula_fn

            def apply(self, df: pd.DataFrame, rng: np.random.Generator) -> pd.DataFrame:
                result = df.copy()
                result[self.name] = self.formula_fn(result)
                return result

        return self.add_override(table, FormulaOverride(column, formula))

    def apply(self, df: pd.DataFrame, table: str) -> pd.DataFrame:
        """Apply all overrides registered for *table*, returning a new frame."""
        result = df.copy()
        for override in self.overrides.get(table, []):
            result = override.apply(result, self.rng)
        return result
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
# Convenience functions for common patterns
|
|
210
|
+
|
|
211
|
+
def price_generator(min_val: float = 1.0, max_val: float = 1000.0, decimals: int = 2):
    """
    Create a price generator skewed toward cheaper items.

    Prices are drawn log-uniformly between *min_val* and *max_val*
    (uniform in log space, i.e. a reciprocal distribution — not
    log-normal as a previous comment claimed): density is proportional
    to 1/x, so small prices are far more common than large ones.

    Args:
        min_val: Minimum price (floored at 0.01 so the log is defined)
        max_val: Maximum price
        decimals: Number of decimal places to round prices to

    Returns:
        Callable mapping a count n to an array of n prices.
    """
    def gen(n):
        # Uniform sampling in log space => log-uniform prices in value space
        log_min = np.log(max(min_val, 0.01))
        log_max = np.log(max_val)
        log_prices = np.random.uniform(log_min, log_max, n)
        prices = np.exp(log_prices)
        return np.round(prices, decimals)
    return gen
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def age_generator(mean: int = 35, std: int = 12, min_age: int = 18, max_age: int = 80):
    """
    Create an age generator: normal around *mean*, clipped to [min_age, max_age].

    Returns a callable mapping a count n to an array of n integer ages.
    """
    def gen(n):
        raw = np.random.normal(mean, std, n)
        bounded = np.clip(raw, min_age, max_age)
        return bounded.astype(int)
    return gen
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def rating_generator(min_rating: float = 1.0, max_rating: float = 5.0, skew: str = "positive"):
    """
    Create a rating generator with configurable skew.

    "positive" clusters ratings near the top (beta(5, 2)); "negative"
    clusters near the bottom (beta(2, 5)); any other value yields uniform
    ratings. Samples are scaled into [min_rating, max_rating] and rounded
    to one decimal place.
    """
    def gen(n):
        if skew == "positive":
            samples = np.random.beta(5, 2, n) * (max_rating - min_rating) + min_rating
        elif skew == "negative":
            samples = np.random.beta(2, 5, n) * (max_rating - min_rating) + min_rating
        else:
            samples = np.random.uniform(min_rating, max_rating, n)
        return np.round(samples, 1)
    return gen
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def percentage_generator(realistic: bool = True):
    """
    Create a percentage generator returning values in [0, 100].

    With ``realistic=True``, values cluster around common round numbers
    (0, 5, 10, ...) plus a little uniform jitter; otherwise they are
    drawn uniformly over the whole range.
    """
    def gen(n):
        if not realistic:
            return np.random.uniform(0, 100, n)
        anchors = np.random.choice([0, 5, 10, 15, 20, 25, 30, 50, 75, 100], n)
        jitter = np.random.uniform(-2, 2, n)
        return np.clip(anchors + jitter, 0, 100)
    return gen
|