misata 0.1.0b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +48 -0
- misata/api.py +460 -0
- misata/audit.py +415 -0
- misata/benchmark.py +376 -0
- misata/cli.py +680 -0
- misata/codegen.py +153 -0
- misata/curve_fitting.py +106 -0
- misata/customization.py +256 -0
- misata/feedback.py +433 -0
- misata/formulas.py +362 -0
- misata/generators.py +247 -0
- misata/hybrid.py +398 -0
- misata/llm_parser.py +493 -0
- misata/noise.py +346 -0
- misata/schema.py +252 -0
- misata/semantic.py +185 -0
- misata/simulator.py +742 -0
- misata/story_parser.py +425 -0
- misata/templates/__init__.py +444 -0
- misata/validation.py +313 -0
- misata-0.1.0b0.dist-info/METADATA +291 -0
- misata-0.1.0b0.dist-info/RECORD +25 -0
- misata-0.1.0b0.dist-info/WHEEL +5 -0
- misata-0.1.0b0.dist-info/entry_points.txt +2 -0
- misata-0.1.0b0.dist-info/top_level.txt +1 -0
misata/validation.py
ADDED
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data validation layer for post-generation quality checks.
|
|
3
|
+
|
|
4
|
+
This module validates generated data to ensure:
|
|
5
|
+
- No negative values where inappropriate
|
|
6
|
+
- Valid date ranges
|
|
7
|
+
- Referential integrity (FK -> PK exists)
|
|
8
|
+
- Business logic rules
|
|
9
|
+
- Statistical distribution accuracy
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from enum import Enum
|
|
14
|
+
from typing import Any, Dict, List, Optional
|
|
15
|
+
|
|
16
|
+
import pandas as pd
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Severity(Enum):
    """Validation issue severity levels."""

    # Informational note; does not by itself indicate a data problem.
    INFO = "info"
    # Suspicious data that may still be acceptable.
    WARNING = "warning"
    # A definite data-quality violation.
    ERROR = "error"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class ValidationIssue:
    """A single validation issue found in the data."""
    severity: Severity
    table: str
    column: Optional[str]
    message: str
    affected_rows: int = 0
    sample_values: List[Any] = field(default_factory=list)

    def __str__(self):
        # Map severity onto a display icon, then format one report line.
        icons = {"info": "ℹ️", "warning": "⚠️", "error": "❌"}
        icon = icons[self.severity.value]
        if self.column:
            location = f"{self.table}.{self.column}"
        else:
            location = self.table
        return f"{icon} [{location}] {self.message} ({self.affected_rows} rows)"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class ValidationReport:
    """Complete validation report for generated data."""
    issues: List[ValidationIssue] = field(default_factory=list)
    tables_checked: int = 0
    columns_checked: int = 0
    total_rows: int = 0

    @property
    def has_errors(self) -> bool:
        """True when at least one ERROR-level issue was recorded."""
        return any(issue.severity == Severity.ERROR for issue in self.issues)

    @property
    def has_warnings(self) -> bool:
        """True when at least one WARNING-level issue was recorded."""
        return any(issue.severity == Severity.WARNING for issue in self.issues)

    @property
    def is_clean(self) -> bool:
        """True when no issues of any severity were recorded."""
        return not self.issues

    def summary(self) -> str:
        """Get a summary of the validation report."""
        # Tally issues per severity in a single pass.
        counts = {sev: 0 for sev in Severity}
        for issue in self.issues:
            counts[issue.severity] += 1

        separator = "=" * 50
        divider = "-" * 50
        lines = [
            separator,
            "DATA VALIDATION REPORT",
            separator,
            f"Tables checked: {self.tables_checked}",
            f"Columns checked: {self.columns_checked}",
            f"Total rows: {self.total_rows:,}",
            divider,
            f"❌ Errors: {counts[Severity.ERROR]}",
            f"⚠️ Warnings: {counts[Severity.WARNING]}",
            f"ℹ️ Info: {counts[Severity.INFO]}",
            divider,
        ]

        if self.is_clean:
            lines.append("✅ All validations passed!")
        else:
            lines.append("Issues found:")
            lines.extend(f"  {issue}" for issue in self.issues)

        lines.append(separator)
        return "\n".join(lines)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class DataValidator:
    """
    Validates generated data for quality and accuracy.

    Per-column checks cover nulls, sign/range heuristics for numeric
    columns (driven by column-name patterns), date bounds, and simple
    format checks for strings. A final pass verifies foreign-key
    references when a schema config is provided.
    """

    def __init__(
        self,
        tables: Dict[str, pd.DataFrame],
        schema_config: Optional[Any] = None,
    ):
        """
        Initialize validator with generated tables.

        Args:
            tables: Dict mapping table name to DataFrame
            schema_config: Optional schema config for relationship checking.
                Assumed to expose a ``relationships`` iterable whose items
                have parent_table/child_table/parent_key/child_key
                attributes — TODO confirm against the schema module.
        """
        self.tables = tables
        self.schema_config = schema_config
        self.issues: List[ValidationIssue] = []

    def validate_all(self) -> ValidationReport:
        """
        Run all validation checks.

        Returns:
            Complete validation report
        """
        # Reset so repeated calls do not accumulate duplicate issues.
        self.issues = []

        for table_name, df in self.tables.items():
            self._validate_table(table_name, df)

        # Cross-table check; no-op unless a schema config was provided.
        self._validate_referential_integrity()

        return ValidationReport(
            issues=self.issues,
            tables_checked=len(self.tables),
            columns_checked=sum(len(df.columns) for df in self.tables.values()),
            total_rows=sum(len(df) for df in self.tables.values()),
        )

    def _validate_table(self, table_name: str, df: pd.DataFrame) -> None:
        """Validate a single table, column by column."""
        for col in df.columns:
            self._validate_column(table_name, df, col)

    def _validate_column(self, table_name: str, df: pd.DataFrame, col: str) -> None:
        """Validate a single column, dispatching on its dtype."""
        # NOTE: a dead `col.lower()` statement (result discarded) was
        # removed here; the dtype-specific validators compute their own
        # lowercased name where they need it.
        values = df[col]

        # Nulls are reported as INFO only: missing values may be intentional.
        null_count = values.isna().sum()
        if null_count > 0:
            self.issues.append(ValidationIssue(
                severity=Severity.INFO,
                table=table_name,
                column=col,
                message=f"Contains {null_count} null values",
                affected_rows=null_count,
            ))

        # Numeric column checks
        if pd.api.types.is_numeric_dtype(values):
            self._validate_numeric_column(table_name, col, values)

        # Date column checks
        if pd.api.types.is_datetime64_any_dtype(values):
            self._validate_date_column(table_name, col, values)

        # String column checks (object dtype is how pandas stores most text)
        if pd.api.types.is_string_dtype(values) or pd.api.types.is_object_dtype(values):
            self._validate_string_column(table_name, col, values)

    def _validate_numeric_column(self, table_name: str, col: str, values: pd.Series) -> None:
        """Validate numeric columns using column-name heuristics."""
        col_lower = col.lower()

        # Column-name substrings that imply the values should be >= 0.
        positive_patterns = ['price', 'cost', 'amount', 'age', 'quantity', 'count',
                             'duration', 'weight', 'height', 'salary', 'revenue']

        if any(p in col_lower for p in positive_patterns):
            negative_count = (values < 0).sum()
            if negative_count > 0:
                self.issues.append(ValidationIssue(
                    severity=Severity.ERROR,
                    table=table_name,
                    column=col,
                    message=f"Contains {negative_count} negative values (should be positive)",
                    affected_rows=negative_count,
                    sample_values=values[values < 0].head(5).tolist(),
                ))

        # Ages outside [0, 150] are almost certainly generation artifacts.
        if 'age' in col_lower:
            invalid_ages = ((values < 0) | (values > 150)).sum()
            if invalid_ages > 0:
                self.issues.append(ValidationIssue(
                    severity=Severity.WARNING,
                    table=table_name,
                    column=col,
                    message=f"Contains {invalid_ages} unrealistic age values (< 0 or > 150)",
                    affected_rows=invalid_ages,
                ))

        # Very large prices are possible but worth flagging as INFO.
        if 'price' in col_lower or 'cost' in col_lower:
            very_high = (values > 1000000).sum()
            if very_high > 0:
                self.issues.append(ValidationIssue(
                    severity=Severity.INFO,
                    table=table_name,
                    column=col,
                    message=f"Contains {very_high} values over $1M",
                    affected_rows=very_high,
                ))

    def _validate_date_column(self, table_name: str, col: str, values: pd.Series) -> None:
        """Validate date columns against plausible past/future bounds."""
        # Dates more than five years out are suspicious but not fatal.
        future_cutoff = pd.Timestamp.now() + pd.Timedelta(days=365*5)
        far_future = (values > future_cutoff).sum()
        if far_future > 0:
            self.issues.append(ValidationIssue(
                severity=Severity.WARNING,
                table=table_name,
                column=col,
                message=f"Contains {far_future} dates more than 5 years in the future",
                affected_rows=far_future,
            ))

        # Pre-1900 dates are treated as hard errors.
        past_cutoff = pd.Timestamp('1900-01-01')
        far_past = (values < past_cutoff).sum()
        if far_past > 0:
            self.issues.append(ValidationIssue(
                severity=Severity.ERROR,
                table=table_name,
                column=col,
                message=f"Contains {far_past} dates before 1900",
                affected_rows=far_past,
            ))

    def _validate_string_column(self, table_name: str, col: str, values: pd.Series) -> None:
        """Validate string columns (email format, empty strings)."""
        col_lower = col.lower()

        # Email format: the check is intentionally loose (just "@" present).
        if 'email' in col_lower:
            invalid_emails = (~values.astype(str).str.contains('@', na=False)).sum()
            if invalid_emails > 0:
                self.issues.append(ValidationIssue(
                    severity=Severity.ERROR,
                    table=table_name,
                    column=col,
                    message=f"Contains {invalid_emails} invalid email addresses",
                    affected_rows=invalid_emails,
                ))

        # Whitespace-only strings count as empty; NaN does not (it becomes
        # the literal string "nan" after astype(str)).
        empty_count = (values.astype(str).str.strip() == '').sum()
        if empty_count > 0:
            self.issues.append(ValidationIssue(
                severity=Severity.WARNING,
                table=table_name,
                column=col,
                message=f"Contains {empty_count} empty strings",
                affected_rows=empty_count,
            ))

    def _validate_referential_integrity(self) -> None:
        """Validate foreign key relationships (FK values exist as PKs)."""
        if not self.schema_config:
            return

        for rel in self.schema_config.relationships:
            # Skip relationships whose tables were not generated.
            if rel.parent_table not in self.tables or rel.child_table not in self.tables:
                continue

            parent_df = self.tables[rel.parent_table]
            child_df = self.tables[rel.child_table]

            # Skip relationships whose key columns are missing.
            if rel.parent_key not in parent_df.columns or rel.child_key not in child_df.columns:
                continue

            parent_ids = set(parent_df[rel.parent_key].dropna())
            child_fks = child_df[rel.child_key].dropna()

            # An orphan is a non-null FK with no matching parent PK.
            orphans = ~child_fks.isin(parent_ids)
            orphan_count = orphans.sum()

            if orphan_count > 0:
                self.issues.append(ValidationIssue(
                    severity=Severity.ERROR,
                    table=rel.child_table,
                    column=rel.child_key,
                    message=f"Contains {orphan_count} orphan references (FK not found in {rel.parent_table})",
                    affected_rows=orphan_count,
                ))
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def validate_data(
    tables: Dict[str, pd.DataFrame],
    schema_config: Optional[Any] = None,
) -> ValidationReport:
    """
    Quick validation of generated data.

    Convenience wrapper: builds a DataValidator and runs every check.

    Args:
        tables: Generated tables
        schema_config: Optional schema for FK validation

    Returns:
        Validation report
    """
    return DataValidator(tables, schema_config).validate_all()
|
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: misata
|
|
3
|
+
Version: 0.1.0b0
|
|
4
|
+
Summary: AI-Powered Synthetic Data Engine - Generate realistic multi-table datasets from natural language
|
|
5
|
+
Author-email: Muhammed Rasin <rasinbinabdulla@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/rasinmuhammed/misata
|
|
8
|
+
Project-URL: Documentation, https://github.com/rasinmuhammed/misata#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/rasinmuhammed/misata
|
|
10
|
+
Project-URL: Issues, https://github.com/rasinmuhammed/misata/issues
|
|
11
|
+
Keywords: synthetic-data,data-generation,fake-data,machine-learning,testing,llm,ai,database
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Software Development :: Testing
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
|
+
Classifier: Topic :: Database
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
Requires-Dist: pandas>=2.0.0
|
|
27
|
+
Requires-Dist: numpy>=1.24.0
|
|
28
|
+
Requires-Dist: pydantic>=2.0.0
|
|
29
|
+
Requires-Dist: click>=8.1.0
|
|
30
|
+
Requires-Dist: pyyaml>=6.0
|
|
31
|
+
Requires-Dist: rich>=13.0.0
|
|
32
|
+
Requires-Dist: groq>=0.4.0
|
|
33
|
+
Requires-Dist: fastapi>=0.109.0
|
|
34
|
+
Requires-Dist: uvicorn>=0.27.0
|
|
35
|
+
Requires-Dist: python-multipart>=0.0.6
|
|
36
|
+
Requires-Dist: simpleeval>=0.9.0
|
|
37
|
+
Requires-Dist: scipy>=1.10.0
|
|
38
|
+
Provides-Extra: dev
|
|
39
|
+
Requires-Dist: pytest>=7.4.0; extra == "dev"
|
|
40
|
+
Requires-Dist: pytest-benchmark>=4.0.0; extra == "dev"
|
|
41
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
42
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
43
|
+
Requires-Dist: mypy>=1.5.0; extra == "dev"
|
|
44
|
+
|
|
45
|
+
# 🧠 Misata
|
|
46
|
+
|
|
47
|
+
**Generate realistic multi-table datasets from natural language.**
|
|
48
|
+
|
|
49
|
+
No schema writing. No training data. Just describe what you need.
|
|
50
|
+
|
|
51
|
+
[]()
|
|
52
|
+
[]()
|
|
53
|
+
[]()
|
|
54
|
+
|
|
55
|
+
## ✨ What Makes Misata Different
|
|
56
|
+
|
|
57
|
+
| Feature | Faker | SDV | **Misata** |
|
|
58
|
+
|---------|-------|-----|------------|
|
|
59
|
+
| Natural language input | ❌ | ❌ | ✅ |
|
|
60
|
+
| Auto schema generation | ❌ | ❌ | ✅ |
|
|
61
|
+
| Relational integrity | ❌ | ✅ | ✅ |
|
|
62
|
+
| Business constraints | ❌ | ❌ | ✅ |
|
|
63
|
+
| No training data needed | ✅ | ❌ | ✅ |
|
|
64
|
+
| Streaming (10M+ rows) | ❌ | ❌ | ✅ |
|
|
65
|
+
|
|
66
|
+
## 🚀 Quick Start
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
pip install misata
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### With Groq (Free, Fast)
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
export GROQ_API_KEY=your_key # Get free: https://console.groq.com
|
|
76
|
+
misata generate --story "A SaaS with 50K users, subscriptions, and payments" --use-llm
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### With OpenAI
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
export OPENAI_API_KEY=your_key
|
|
83
|
+
misata generate --story "E-commerce with products and orders" --use-llm --provider openai
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### With Ollama (Local, Free, Private)
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
ollama run llama3 # Start Ollama first
|
|
90
|
+
misata generate --story "Fitness app with workouts" --use-llm --provider ollama
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## 📊 Example Output
|
|
94
|
+
|
|
95
|
+
```
|
|
96
|
+
$ misata generate --story "A fitness app with 50K users" --use-llm
|
|
97
|
+
|
|
98
|
+
🧠 Using Groq (llama-3.3-70b-versatile) for intelligent parsing...
|
|
99
|
+
✅ LLM schema generated successfully!
|
|
100
|
+
|
|
101
|
+
📋 Schema: FitnessApp
|
|
102
|
+
Tables: 5
|
|
103
|
+
Relationships: 4
|
|
104
|
+
|
|
105
|
+
🔧 Generating 5 table(s)...
|
|
106
|
+
|
|
107
|
+
✓ exercises (10 rows)
|
|
108
|
+
✓ plans (5 rows)
|
|
109
|
+
✓ users (50,000 rows)
|
|
110
|
+
✓ subscriptions (45,000 rows)
|
|
111
|
+
✓ workouts (500,000 rows)
|
|
112
|
+
|
|
113
|
+
⏱️ Generation time: 2.34 seconds
|
|
114
|
+
🚀 Performance: 213,675 rows/second
|
|
115
|
+
💾 Data saved to: ./generated_data
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## 💻 Python API
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
from misata import DataSimulator, SchemaConfig
|
|
122
|
+
from misata.llm_parser import LLMSchemaGenerator
|
|
123
|
+
|
|
124
|
+
# Generate schema from story
|
|
125
|
+
llm = LLMSchemaGenerator(provider="groq") # or "openai", "ollama"
|
|
126
|
+
config = llm.generate_from_story(
|
|
127
|
+
"A mobile fitness app with 50K users, workout tracking, "
|
|
128
|
+
"premium subscriptions, and January signup spikes"
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
# Generate data
|
|
132
|
+
for table_name, batch in DataSimulator(config).generate_all():
|
|
133
|
+
print(f"Generated {len(batch)} rows for {table_name}")
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## 🔧 CLI Reference
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
# Basic generation (rule-based, no API key needed)
|
|
140
|
+
misata generate --story "SaaS company with users and subscriptions"
|
|
141
|
+
|
|
142
|
+
# LLM-powered generation
|
|
143
|
+
misata generate --story "..." --use-llm
|
|
144
|
+
|
|
145
|
+
# Specify provider and model
|
|
146
|
+
misata generate --story "..." --use-llm --provider ollama --model llama3
|
|
147
|
+
|
|
148
|
+
# Custom output directory
|
|
149
|
+
misata generate --story "..." --use-llm --output-dir ./my_data
|
|
150
|
+
|
|
151
|
+
# Set row count
|
|
152
|
+
misata generate --story "..." --use-llm --rows 100000
|
|
153
|
+
|
|
154
|
+
# Reproducible with seed
|
|
155
|
+
misata generate --story "..." --use-llm --seed 42
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## 🎯 Business Rule Constraints
|
|
159
|
+
|
|
160
|
+
Define rules like "employees can't log >8 hours/day":
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
from misata import Constraint, Table
|
|
164
|
+
|
|
165
|
+
timesheets = Table(
|
|
166
|
+
name="timesheets",
|
|
167
|
+
row_count=10000,
|
|
168
|
+
constraints=[
|
|
169
|
+
Constraint(
|
|
170
|
+
name="max_daily_hours",
|
|
171
|
+
type="sum_limit",
|
|
172
|
+
group_by=["employee_id", "date"],
|
|
173
|
+
column="hours",
|
|
174
|
+
value=8.0,
|
|
175
|
+
action="redistribute"
|
|
176
|
+
)
|
|
177
|
+
]
|
|
178
|
+
)
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
## 🔑 LLM Providers
|
|
182
|
+
|
|
183
|
+
| Provider | Env Variable | Free Tier | Notes |
|
|
184
|
+
|----------|--------------|-----------|-------|
|
|
185
|
+
| **Groq** | `GROQ_API_KEY` | ✅ 30 req/min | Fastest, recommended |
|
|
186
|
+
| **OpenAI** | `OPENAI_API_KEY` | ❌ | Best quality |
|
|
187
|
+
| **Ollama** | None | ✅ Local | Private, no internet |
|
|
188
|
+
|
|
189
|
+
## 📈 Extending Data Pools
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
from misata import TextGenerator
|
|
193
|
+
|
|
194
|
+
# Add custom names
|
|
195
|
+
TextGenerator.extend_pool("first_names", ["Arjun", "Priya", "Rahul"])
|
|
196
|
+
|
|
197
|
+
# Load from file
|
|
198
|
+
TextGenerator.load_pools_from_file("custom_pools.json")
|
|
199
|
+
|
|
200
|
+
# Save for reuse
|
|
201
|
+
TextGenerator.save_pools_to_file("expanded_pools.json")
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
## 🤖 ML Training Data
|
|
205
|
+
|
|
206
|
+
Make your synthetic data **indistinguishable from real-world data** with noise injection:
|
|
207
|
+
|
|
208
|
+
```python
|
|
209
|
+
from misata import add_noise, NoiseInjector
|
|
210
|
+
|
|
211
|
+
# Quick noise injection
|
|
212
|
+
noisy_df = add_noise(df,
|
|
213
|
+
null_rate=0.05, # 5% missing values
|
|
214
|
+
outlier_rate=0.02, # 2% statistical outliers
|
|
215
|
+
typo_rate=0.01, # 1% typos in text
|
|
216
|
+
duplicate_rate=0.03, # 3% duplicate rows
|
|
217
|
+
seed=42
|
|
218
|
+
)
|
|
219
|
+
|
|
220
|
+
# Advanced: Temporal distribution drift
|
|
221
|
+
injector = NoiseInjector(seed=42)
|
|
222
|
+
df = injector.apply_temporal_drift(df,
|
|
223
|
+
date_column="created_at",
|
|
224
|
+
value_column="revenue",
|
|
225
|
+
drift_rate=0.15, # 15% increase over time
|
|
226
|
+
drift_direction="up"
|
|
227
|
+
)
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
### Attribute Customization
|
|
231
|
+
|
|
232
|
+
```python
|
|
233
|
+
from misata import Customizer, ColumnOverride
|
|
234
|
+
import numpy as np
|
|
235
|
+
|
|
236
|
+
customizer = Customizer(seed=42)
|
|
237
|
+
|
|
238
|
+
# Custom age distribution (realistic, not uniform)
|
|
239
|
+
customizer.add_override("users", ColumnOverride(
|
|
240
|
+
name="age",
|
|
241
|
+
generator=lambda n: np.random.normal(35, 12, n).clip(18, 80).astype(int)
|
|
242
|
+
))
|
|
243
|
+
|
|
244
|
+
# Conditional values based on other columns
|
|
245
|
+
customizer.add_conditional("orders", "shipping_cost", {
|
|
246
|
+
"country": {"US": 5.99, "UK": 9.99, "DE": 7.99}
|
|
247
|
+
})
|
|
248
|
+
|
|
249
|
+
# Apply to generated data
|
|
250
|
+
df = customizer.apply(df, "users")
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
## ⚡ Performance
|
|
254
|
+
|
|
255
|
+
| Rows | Time | Speed |
|
|
256
|
+
|------|------|-------|
|
|
257
|
+
| 10K | 0.03s | 333K rows/sec |
|
|
258
|
+
| 100K | 0.26s | 385K rows/sec |
|
|
259
|
+
| 1M | 2.6s | 390K rows/sec |
|
|
260
|
+
| 10M | 26s | 390K rows/sec (streaming) |
|
|
261
|
+
|
|
262
|
+
## 🚀 Try It Now
|
|
263
|
+
|
|
264
|
+
[Open in Colab](https://colab.research.google.com/github/rasinmuhammed/misata/blob/main/examples/getting_started.ipynb)
|
|
265
|
+
|
|
266
|
+
Try Misata in your browser without installing anything!
|
|
267
|
+
|
|
268
|
+
## 💼 Enterprise & Consulting
|
|
269
|
+
|
|
270
|
+
**Need help with complex scenarios?**
|
|
271
|
+
|
|
272
|
+
- 🏢 Custom enterprise data schemas (10M+ rows)
|
|
273
|
+
- 🔧 Integration with your existing pipelines
|
|
274
|
+
- 📊 Industry-specific realistic data generation
|
|
275
|
+
- 🎓 Training and onboarding for your team
|
|
276
|
+
|
|
277
|
+
📧 **Contact: rasinbinabdulla@gmail.com**
|
|
278
|
+
|
|
279
|
+
## 📄 License
|
|
280
|
+
|
|
281
|
+
MIT License
|
|
282
|
+
|
|
283
|
+
## 👤 Author
|
|
284
|
+
|
|
285
|
+
Built by **Muhammed Rasin**
|
|
286
|
+
|
|
287
|
+
---
|
|
288
|
+
|
|
289
|
+
**Misata** - From story to synthetic database in one command.
|
|
290
|
+
|
|
291
|
+
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
misata/__init__.py,sha256=QebW3oeV4C6fLQ8B3O3YK_N7A4tInyoi-lbuFohALIY,1023
|
|
2
|
+
misata/api.py,sha256=Wq2H3iJzocNTsCzb9vhYJxDyag3Yiucvb-GVF0tdKhI,14999
|
|
3
|
+
misata/audit.py,sha256=4eUCHT2STptemfakWeNODbVuBRhyD8Q32LlB2eufvuw,12291
|
|
4
|
+
misata/benchmark.py,sha256=Y1-tuKegJyAlTneROQpPo276qnfmMmupGDbVDs9k5J8,12358
|
|
5
|
+
misata/cli.py,sha256=a7YijZCUYrkCYGVYJ2nZSL9J3JfFqbXQQOad6bhy7zM,22642
|
|
6
|
+
misata/codegen.py,sha256=m7ykTtLgITvaqzVB1cVhs1b9Puo2X4uyzngZ85wi6J0,5791
|
|
7
|
+
misata/curve_fitting.py,sha256=gLj4BkIxNWKkfo3QKZFI_aq60bsXlI53K5yZX4hc9EU,4126
|
|
8
|
+
misata/customization.py,sha256=pw-BEsPKN091hyOrQWWQoRhTrlmQ9_PXXopm2FZSEvs,8551
|
|
9
|
+
misata/feedback.py,sha256=HBEsoKi_vdRqwRzMoVFVj_cjfzQ5SUAaGz40s1HMD50,13313
|
|
10
|
+
misata/formulas.py,sha256=KOTq5YN_19vv1ERd92bdzKot9yo9rrrwjOuWO13nFCg,11210
|
|
11
|
+
misata/generators.py,sha256=NrMF12i6CB7K6fUsqcqurmZBBQ382ZhVnYB9oMBIZCE,8844
|
|
12
|
+
misata/hybrid.py,sha256=5oopAdfOLWUYzdRWlc0plVeVEVg7Nu1CVGNNCDSjQt8,13104
|
|
13
|
+
misata/llm_parser.py,sha256=1BrX5jUkRqewpHDypfJdglgVXtsLILByAJmI7zrSQ3o,19118
|
|
14
|
+
misata/noise.py,sha256=UO7MokzQ5Y5Vj7JaayDUG0JwCLnpHtnpQTcJ4UHWibo,10460
|
|
15
|
+
misata/schema.py,sha256=vHy-KesQIaHJIH6CxRed_uQ6oUxjkXYhME0mAacAPd8,8696
|
|
16
|
+
misata/semantic.py,sha256=0fauGWJ75wlbHVqT0hohYTN4m_nscdaMaVAIfkhTZXk,7087
|
|
17
|
+
misata/simulator.py,sha256=XF8jLCOak_-9NsSU2pGH_AqWIu5zGpJF86LNG6tAcHE,30550
|
|
18
|
+
misata/story_parser.py,sha256=7N7so3KWisl2UxkOtENQwP-4hN2cs9vTKsPHVRZB2Mc,15964
|
|
19
|
+
misata/validation.py,sha256=5yJSN7jecVNLJ8ss6y7l2U4hF1Ljn27Q6Xs9N1iDPcw,10791
|
|
20
|
+
misata/templates/__init__.py,sha256=0RcZz9d4bmCqLAr77h0gpMfHncqAPeZCguqsuGCz7rE,25245
|
|
21
|
+
misata-0.1.0b0.dist-info/METADATA,sha256=6MaVv1gWkZ-2g5hW_qDnGAkka95ZrgG3R3XaHx2VXnA,8070
|
|
22
|
+
misata-0.1.0b0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
23
|
+
misata-0.1.0b0.dist-info/entry_points.txt,sha256=k3SDuju7VnqB4AcY0Vufw-j1tWU3Ay612G3DGqoNs0U,43
|
|
24
|
+
misata-0.1.0b0.dist-info/top_level.txt,sha256=dpwR99XWKUAXqNg7WiNLu_XYd7WYGmZpJzrfQXbAZFs,7
|
|
25
|
+
misata-0.1.0b0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
misata
|