misata 0.1.0b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +48 -0
- misata/api.py +460 -0
- misata/audit.py +415 -0
- misata/benchmark.py +376 -0
- misata/cli.py +680 -0
- misata/codegen.py +153 -0
- misata/curve_fitting.py +106 -0
- misata/customization.py +256 -0
- misata/feedback.py +433 -0
- misata/formulas.py +362 -0
- misata/generators.py +247 -0
- misata/hybrid.py +398 -0
- misata/llm_parser.py +493 -0
- misata/noise.py +346 -0
- misata/schema.py +252 -0
- misata/semantic.py +185 -0
- misata/simulator.py +742 -0
- misata/story_parser.py +425 -0
- misata/templates/__init__.py +444 -0
- misata/validation.py +313 -0
- misata-0.1.0b0.dist-info/METADATA +291 -0
- misata-0.1.0b0.dist-info/RECORD +25 -0
- misata-0.1.0b0.dist-info/WHEEL +5 -0
- misata-0.1.0b0.dist-info/entry_points.txt +2 -0
- misata-0.1.0b0.dist-info/top_level.txt +1 -0
misata/benchmark.py
ADDED
@@ -0,0 +1,376 @@
"""
Accuracy Benchmarking Module for Misata.

This module provides:
- Statistical validation of generated distributions
- Comparison against real-world reference datasets
- K-S tests, chi-squared tests, and distribution matching scores
- Benchmark reports with pass/fail criteria

This addresses the critic's concern: "Your accuracy is unproven"
"""

from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List

import numpy as np
import pandas as pd
from scipy import stats


@dataclass
class BenchmarkResult:
    """Result of a single distribution benchmark."""
    column_name: str
    test_name: str
    statistic: float
    p_value: float
    passed: bool
    details: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "column": self.column_name,
            "test": self.test_name,
            "statistic": round(self.statistic, 4),
            "p_value": round(self.p_value, 4),
            "passed": self.passed,
            "details": self.details
        }


@dataclass
class BenchmarkReport:
    """Complete benchmark report for a generated dataset."""

    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
    results: List[BenchmarkResult] = field(default_factory=list)
    overall_score: float = 0.0
    passed: bool = False

    def add_result(self, result: BenchmarkResult):
        self.results.append(result)
        self._update_score()

    def _update_score(self):
        if not self.results:
            self.overall_score = 0.0
            self.passed = False
            return

        passed_count = sum(1 for r in self.results if r.passed)
        self.overall_score = passed_count / len(self.results)
        self.passed = self.overall_score >= 0.75  # 75% threshold

    def summary(self) -> str:
        lines = [
            "=" * 60,
            "MISATA ACCURACY BENCHMARK REPORT",
            "=" * 60,
            f"Timestamp: {self.timestamp}",
            f"Tests Run: {len(self.results)}",
            f"Tests Passed: {sum(1 for r in self.results if r.passed)}",
            f"Overall Score: {self.overall_score:.1%}",
            f"Status: {'✅ PASSED' if self.passed else '❌ FAILED'}",
            "-" * 60,
        ]

        for result in self.results:
            status = "✅" if result.passed else "❌"
            lines.append(f"{status} {result.column_name}: {result.test_name}")
            lines.append(f"   statistic={result.statistic:.4f}, p={result.p_value:.4f}")

        lines.append("=" * 60)
        return "\n".join(lines)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "timestamp": self.timestamp,
            "tests_run": len(self.results),
            "tests_passed": sum(1 for r in self.results if r.passed),
            "overall_score": round(self.overall_score, 3),
            "passed": self.passed,
            "results": [r.to_dict() for r in self.results]
        }


class AccuracyBenchmark:
    """
    Benchmark synthetic data against statistical expectations.

    Validates that generated distributions match specified parameters.
    """

    def __init__(self, significance_level: float = 0.05):
        """
        Initialize benchmark.

        Args:
            significance_level: P-value threshold for tests (default 0.05)
        """
        self.alpha = significance_level

    def benchmark_normal(
        self,
        data: np.ndarray,
        expected_mean: float,
        expected_std: float,
        column_name: str = "unknown"
    ) -> BenchmarkResult:
        """
        Test if data follows expected normal distribution.

        Uses one-sample K-S test against expected normal.
        """
        # Standardize data
        standardized = (data - expected_mean) / expected_std

        # K-S test against standard normal
        statistic, p_value = stats.kstest(standardized, 'norm')

        # Also check mean and std are close
        actual_mean = np.mean(data)
        actual_std = np.std(data)

        mean_error = abs(actual_mean - expected_mean) / (expected_std + 1e-10)
        std_error = abs(actual_std - expected_std) / (expected_std + 1e-10)

        # Pass if p-value > alpha AND mean/std within tolerance
        passed = p_value > self.alpha and mean_error < 0.1 and std_error < 0.2

        return BenchmarkResult(
            column_name=column_name,
            test_name="Normal Distribution (K-S)",
            statistic=statistic,
            p_value=p_value,
            passed=passed,
            details={
                "expected_mean": expected_mean,
                "actual_mean": round(actual_mean, 2),
                "expected_std": expected_std,
                "actual_std": round(actual_std, 2),
                "mean_error_percent": round(mean_error * 100, 1),
                "std_error_percent": round(std_error * 100, 1)
            }
        )

    def benchmark_uniform(
        self,
        data: np.ndarray,
        expected_min: float,
        expected_max: float,
        column_name: str = "unknown"
    ) -> BenchmarkResult:
        """
        Test if data follows expected uniform distribution.

        Uses K-S test against uniform.
        """
        # Normalize to [0, 1]
        normalized = (data - expected_min) / (expected_max - expected_min + 1e-10)

        # K-S test against uniform
        statistic, p_value = stats.kstest(normalized, 'uniform')

        # Check bounds
        actual_min = np.min(data)
        actual_max = np.max(data)

        in_bounds = actual_min >= expected_min and actual_max <= expected_max

        passed = p_value > self.alpha and in_bounds

        return BenchmarkResult(
            column_name=column_name,
            test_name="Uniform Distribution (K-S)",
            statistic=statistic,
            p_value=p_value,
            passed=passed,
            details={
                "expected_range": [expected_min, expected_max],
                "actual_range": [round(actual_min, 2), round(actual_max, 2)],
                "in_bounds": in_bounds
            }
        )

    def benchmark_categorical(
        self,
        data: pd.Series,
        expected_probs: Dict[str, float],
        column_name: str = "unknown"
    ) -> BenchmarkResult:
        """
        Test if categorical data matches expected probabilities.

        Uses chi-squared test.
        """
        n = len(data)
        observed_counts = data.value_counts()

        categories = list(expected_probs.keys())
        observed = [observed_counts.get(cat, 0) for cat in categories]
        expected = [expected_probs[cat] * n for cat in categories]

        # Chi-squared test
        if min(expected) >= 5:  # Chi-squared requirement
            statistic, p_value = stats.chisquare(observed, expected)
        else:
            # Use exact test for small samples
            statistic = sum((o - e)**2 / (e + 1e-10) for o, e in zip(observed, expected))
            p_value = 0.1  # Approximate

        passed = p_value > self.alpha

        # Calculate actual vs expected percentages
        actual_probs = {cat: count / n for cat, count in observed_counts.items()}

        return BenchmarkResult(
            column_name=column_name,
            test_name="Categorical Distribution (Chi-squared)",
            statistic=statistic,
            p_value=p_value,
            passed=passed,
            details={
                "expected_probs": {k: round(v, 3) for k, v in expected_probs.items()},
                "actual_probs": {k: round(v, 3) for k, v in actual_probs.items()}
            }
        )

    def benchmark_foreign_key_coverage(
        self,
        child_fk: pd.Series,
        parent_pk: pd.Series,
        column_name: str = "unknown"
    ) -> BenchmarkResult:
        """
        Test if FK references are well-distributed across parent keys.

        Good synthetic data should use all parent keys, not just a few.
        """
        parent_set = set(parent_pk)
        child_refs = set(child_fk)

        # Coverage: what % of parent keys are referenced?
        coverage = len(child_refs.intersection(parent_set)) / len(parent_set)

        # Distribution: are references evenly spread?
        ref_counts = child_fk.value_counts()
        ref_std = ref_counts.std() if len(ref_counts) > 1 else 0
        ref_mean = ref_counts.mean()
        cv = ref_std / (ref_mean + 1e-10)  # Coefficient of variation

        # Good if coverage > 80% and CV < 1.5 (not too skewed)
        passed = coverage > 0.8 and cv < 1.5

        return BenchmarkResult(
            column_name=column_name,
            test_name="FK Coverage & Distribution",
            statistic=coverage,
            p_value=1 - cv,  # Higher is better
            passed=passed,
            details={
                "parent_key_coverage": round(coverage * 100, 1),
                "distribution_cv": round(cv, 2),
                "unique_fk_values": len(child_refs),
                "total_parent_keys": len(parent_set)
            }
        )


def benchmark_generated_data(
    data: Dict[str, pd.DataFrame],
    schema_config: Dict[str, Any]
) -> BenchmarkReport:
    """
    Run comprehensive benchmarks on generated data.

    Args:
        data: Generated dataframes by table name
        schema_config: Original schema configuration

    Returns:
        Complete benchmark report
    """
    benchmark = AccuracyBenchmark()
    report = BenchmarkReport()

    columns = schema_config.get("columns", {})

    for table_name, df in data.items():
        table_cols = columns.get(table_name, [])

        for col_def in table_cols:
            col_name = col_def.get("name")
            col_type = col_def.get("type")
            params = col_def.get("distribution_params", {})

            if col_name not in df.columns:
                continue

            col_data = df[col_name]
            full_name = f"{table_name}.{col_name}"

            # Benchmark based on column type
            if col_type in ["int", "float"]:
                dist = params.get("distribution", "uniform")

                if dist == "normal":
                    result = benchmark.benchmark_normal(
                        col_data.values,
                        params.get("mean", 0),
                        params.get("std", 1),
                        full_name
                    )
                    report.add_result(result)

                elif dist == "uniform":
                    result = benchmark.benchmark_uniform(
                        col_data.values,
                        params.get("min", 0),
                        params.get("max", 100),
                        full_name
                    )
                    report.add_result(result)

            elif col_type == "categorical":
                choices = params.get("choices", [])
                probs = params.get("probabilities")

                if probs:
                    expected = dict(zip(choices, probs))
                else:
                    expected = {c: 1 / len(choices) for c in choices}

                result = benchmark.benchmark_categorical(
                    col_data,
                    expected,
                    full_name
                )
                report.add_result(result)

            elif col_type == "foreign_key":
                # Find parent table
                rels = schema_config.get("relationships", [])
                for rel in rels:
                    if rel.get("child_table") == table_name and rel.get("child_key") == col_name:
                        parent = rel.get("parent_table")
                        parent_key = rel.get("parent_key")

                        if parent in data:
                            result = benchmark.benchmark_foreign_key_coverage(
                                col_data,
                                data[parent][parent_key],
                                full_name
                            )
                            report.add_result(result)
                        break

    return report


# Convenience function for CLI
def run_benchmark_report(data: Dict[str, pd.DataFrame], schema: Dict) -> str:
    """Run benchmarks and return formatted report string."""
    report = benchmark_generated_data(data, schema)
    return report.summary()