misata 0.1.0b0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata-0.1.0b0/PKG-INFO +291 -0
- misata-0.1.0b0/README.md +247 -0
- misata-0.1.0b0/misata/__init__.py +48 -0
- misata-0.1.0b0/misata/api.py +460 -0
- misata-0.1.0b0/misata/audit.py +415 -0
- misata-0.1.0b0/misata/benchmark.py +376 -0
- misata-0.1.0b0/misata/cli.py +680 -0
- misata-0.1.0b0/misata/codegen.py +153 -0
- misata-0.1.0b0/misata/curve_fitting.py +106 -0
- misata-0.1.0b0/misata/customization.py +256 -0
- misata-0.1.0b0/misata/feedback.py +433 -0
- misata-0.1.0b0/misata/formulas.py +362 -0
- misata-0.1.0b0/misata/generators.py +247 -0
- misata-0.1.0b0/misata/hybrid.py +398 -0
- misata-0.1.0b0/misata/llm_parser.py +493 -0
- misata-0.1.0b0/misata/noise.py +346 -0
- misata-0.1.0b0/misata/schema.py +252 -0
- misata-0.1.0b0/misata/semantic.py +185 -0
- misata-0.1.0b0/misata/simulator.py +742 -0
- misata-0.1.0b0/misata/story_parser.py +425 -0
- misata-0.1.0b0/misata/templates/__init__.py +444 -0
- misata-0.1.0b0/misata/validation.py +313 -0
- misata-0.1.0b0/misata.egg-info/PKG-INFO +291 -0
- misata-0.1.0b0/misata.egg-info/SOURCES.txt +42 -0
- misata-0.1.0b0/misata.egg-info/dependency_links.txt +1 -0
- misata-0.1.0b0/misata.egg-info/entry_points.txt +2 -0
- misata-0.1.0b0/misata.egg-info/requires.txt +19 -0
- misata-0.1.0b0/misata.egg-info/top_level.txt +1 -0
- misata-0.1.0b0/pyproject.toml +94 -0
- misata-0.1.0b0/setup.cfg +4 -0
- misata-0.1.0b0/tests/test_api.py +145 -0
- misata-0.1.0b0/tests/test_cli.py +100 -0
- misata-0.1.0b0/tests/test_constraints.py +124 -0
- misata-0.1.0b0/tests/test_curve_fitting.py +58 -0
- misata-0.1.0b0/tests/test_enterprise.py +269 -0
- misata-0.1.0b0/tests/test_formulas.py +175 -0
- misata-0.1.0b0/tests/test_integrity.py +144 -0
- misata-0.1.0b0/tests/test_llm_parser.py +193 -0
- misata-0.1.0b0/tests/test_schema.py +156 -0
- misata-0.1.0b0/tests/test_security.py +67 -0
- misata-0.1.0b0/tests/test_semantic.py +115 -0
- misata-0.1.0b0/tests/test_simulator.py +251 -0
- misata-0.1.0b0/tests/test_templates.py +179 -0
- misata-0.1.0b0/tests/test_validation.py +145 -0
misata-0.1.0b0/PKG-INFO
ADDED
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: misata
|
|
3
|
+
Version: 0.1.0b0
|
|
4
|
+
Summary: AI-Powered Synthetic Data Engine - Generate realistic multi-table datasets from natural language
|
|
5
|
+
Author-email: Muhammed Rasin <rasinbinabdulla@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/rasinmuhammed/misata
|
|
8
|
+
Project-URL: Documentation, https://github.com/rasinmuhammed/misata#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/rasinmuhammed/misata
|
|
10
|
+
Project-URL: Issues, https://github.com/rasinmuhammed/misata/issues
|
|
11
|
+
Keywords: synthetic-data,data-generation,fake-data,machine-learning,testing,llm,ai,database
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Software Development :: Testing
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
|
+
Classifier: Topic :: Database
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
Requires-Dist: pandas>=2.0.0
|
|
27
|
+
Requires-Dist: numpy>=1.24.0
|
|
28
|
+
Requires-Dist: pydantic>=2.0.0
|
|
29
|
+
Requires-Dist: click>=8.1.0
|
|
30
|
+
Requires-Dist: pyyaml>=6.0
|
|
31
|
+
Requires-Dist: rich>=13.0.0
|
|
32
|
+
Requires-Dist: groq>=0.4.0
|
|
33
|
+
Requires-Dist: fastapi>=0.109.0
|
|
34
|
+
Requires-Dist: uvicorn>=0.27.0
|
|
35
|
+
Requires-Dist: python-multipart>=0.0.6
|
|
36
|
+
Requires-Dist: simpleeval>=0.9.0
|
|
37
|
+
Requires-Dist: scipy>=1.10.0
|
|
38
|
+
Provides-Extra: dev
|
|
39
|
+
Requires-Dist: pytest>=7.4.0; extra == "dev"
|
|
40
|
+
Requires-Dist: pytest-benchmark>=4.0.0; extra == "dev"
|
|
41
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
42
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
43
|
+
Requires-Dist: mypy>=1.5.0; extra == "dev"
|
|
44
|
+
|
|
45
|
+
# 🧠 Misata
|
|
46
|
+
|
|
47
|
+
**Generate realistic multi-table datasets from natural language.**
|
|
48
|
+
|
|
49
|
+
No schema writing. No training data. Just describe what you need.
|
|
50
|
+
|
|
51
|
+
[]()
|
|
52
|
+
[]()
|
|
53
|
+
[]()
|
|
54
|
+
|
|
55
|
+
## ✨ What Makes Misata Different
|
|
56
|
+
|
|
57
|
+
| Feature | Faker | SDV | **Misata** |
|
|
58
|
+
|---------|-------|-----|------------|
|
|
59
|
+
| Natural language input | ❌ | ❌ | ✅ |
|
|
60
|
+
| Auto schema generation | ❌ | ❌ | ✅ |
|
|
61
|
+
| Relational integrity | ❌ | ✅ | ✅ |
|
|
62
|
+
| Business constraints | ❌ | ❌ | ✅ |
|
|
63
|
+
| No training data needed | ✅ | ❌ | ✅ |
|
|
64
|
+
| Streaming (10M+ rows) | ❌ | ❌ | ✅ |
|
|
65
|
+
|
|
66
|
+
## 🚀 Quick Start
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
pip install misata
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### With Groq (Free, Fast)
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
export GROQ_API_KEY=your_key # Get free: https://console.groq.com
|
|
76
|
+
misata generate --story "A SaaS with 50K users, subscriptions, and payments" --use-llm
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### With OpenAI
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
export OPENAI_API_KEY=your_key
|
|
83
|
+
misata generate --story "E-commerce with products and orders" --use-llm --provider openai
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### With Ollama (Local, Free, Private)
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
ollama run llama3 # Start Ollama first
|
|
90
|
+
misata generate --story "Fitness app with workouts" --use-llm --provider ollama
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## 📊 Example Output
|
|
94
|
+
|
|
95
|
+
```
|
|
96
|
+
$ misata generate --story "A fitness app with 50K users" --use-llm
|
|
97
|
+
|
|
98
|
+
🧠 Using Groq (llama-3.3-70b-versatile) for intelligent parsing...
|
|
99
|
+
✅ LLM schema generated successfully!
|
|
100
|
+
|
|
101
|
+
📋 Schema: FitnessApp
|
|
102
|
+
Tables: 5
|
|
103
|
+
Relationships: 4
|
|
104
|
+
|
|
105
|
+
🔧 Generating 5 table(s)...
|
|
106
|
+
|
|
107
|
+
✓ exercises (10 rows)
|
|
108
|
+
✓ plans (5 rows)
|
|
109
|
+
✓ users (50,000 rows)
|
|
110
|
+
✓ subscriptions (45,000 rows)
|
|
111
|
+
✓ workouts (500,000 rows)
|
|
112
|
+
|
|
113
|
+
⏱️ Generation time: 2.34 seconds
|
|
114
|
+
🚀 Performance: 213,675 rows/second
|
|
115
|
+
💾 Data saved to: ./generated_data
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## 💻 Python API
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
from misata import DataSimulator, SchemaConfig
|
|
122
|
+
from misata.llm_parser import LLMSchemaGenerator
|
|
123
|
+
|
|
124
|
+
# Generate schema from story
|
|
125
|
+
llm = LLMSchemaGenerator(provider="groq") # or "openai", "ollama"
|
|
126
|
+
config = llm.generate_from_story(
|
|
127
|
+
"A mobile fitness app with 50K users, workout tracking, "
|
|
128
|
+
"premium subscriptions, and January signup spikes"
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
# Generate data
|
|
132
|
+
for table_name, batch in DataSimulator(config).generate_all():
|
|
133
|
+
print(f"Generated {len(batch)} rows for {table_name}")
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## 🔧 CLI Reference
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
# Basic generation (rule-based, no API key needed)
|
|
140
|
+
misata generate --story "SaaS company with users and subscriptions"
|
|
141
|
+
|
|
142
|
+
# LLM-powered generation
|
|
143
|
+
misata generate --story "..." --use-llm
|
|
144
|
+
|
|
145
|
+
# Specify provider and model
|
|
146
|
+
misata generate --story "..." --use-llm --provider ollama --model llama3
|
|
147
|
+
|
|
148
|
+
# Custom output directory
|
|
149
|
+
misata generate --story "..." --use-llm --output-dir ./my_data
|
|
150
|
+
|
|
151
|
+
# Set row count
|
|
152
|
+
misata generate --story "..." --use-llm --rows 100000
|
|
153
|
+
|
|
154
|
+
# Reproducible with seed
|
|
155
|
+
misata generate --story "..." --use-llm --seed 42
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## 🎯 Business Rule Constraints
|
|
159
|
+
|
|
160
|
+
Define rules like "employees can't log >8 hours/day":
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
from misata import Constraint, Table
|
|
164
|
+
|
|
165
|
+
timesheets = Table(
|
|
166
|
+
name="timesheets",
|
|
167
|
+
row_count=10000,
|
|
168
|
+
constraints=[
|
|
169
|
+
Constraint(
|
|
170
|
+
name="max_daily_hours",
|
|
171
|
+
type="sum_limit",
|
|
172
|
+
group_by=["employee_id", "date"],
|
|
173
|
+
column="hours",
|
|
174
|
+
value=8.0,
|
|
175
|
+
action="redistribute"
|
|
176
|
+
)
|
|
177
|
+
]
|
|
178
|
+
)
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
## 🔑 LLM Providers
|
|
182
|
+
|
|
183
|
+
| Provider | Env Variable | Free Tier | Notes |
|
|
184
|
+
|----------|--------------|-----------|-------|
|
|
185
|
+
| **Groq** | `GROQ_API_KEY` | ✅ 30 req/min | Fastest, recommended |
|
|
186
|
+
| **OpenAI** | `OPENAI_API_KEY` | ❌ | Best quality |
|
|
187
|
+
| **Ollama** | None | ✅ Local | Private, no internet |
|
|
188
|
+
|
|
189
|
+
## 📈 Extending Data Pools
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
from misata import TextGenerator
|
|
193
|
+
|
|
194
|
+
# Add custom names
|
|
195
|
+
TextGenerator.extend_pool("first_names", ["Arjun", "Priya", "Rahul"])
|
|
196
|
+
|
|
197
|
+
# Load from file
|
|
198
|
+
TextGenerator.load_pools_from_file("custom_pools.json")
|
|
199
|
+
|
|
200
|
+
# Save for reuse
|
|
201
|
+
TextGenerator.save_pools_to_file("expanded_pools.json")
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
## 🤖 ML Training Data
|
|
205
|
+
|
|
206
|
+
Make your synthetic data **indistinguishable from real-world data** with noise injection:
|
|
207
|
+
|
|
208
|
+
```python
|
|
209
|
+
from misata import add_noise, NoiseInjector
|
|
210
|
+
|
|
211
|
+
# Quick noise injection
|
|
212
|
+
noisy_df = add_noise(df,
|
|
213
|
+
null_rate=0.05, # 5% missing values
|
|
214
|
+
outlier_rate=0.02, # 2% statistical outliers
|
|
215
|
+
typo_rate=0.01, # 1% typos in text
|
|
216
|
+
duplicate_rate=0.03, # 3% duplicate rows
|
|
217
|
+
seed=42
|
|
218
|
+
)
|
|
219
|
+
|
|
220
|
+
# Advanced: Temporal distribution drift
|
|
221
|
+
injector = NoiseInjector(seed=42)
|
|
222
|
+
df = injector.apply_temporal_drift(df,
|
|
223
|
+
date_column="created_at",
|
|
224
|
+
value_column="revenue",
|
|
225
|
+
drift_rate=0.15, # 15% increase over time
|
|
226
|
+
drift_direction="up"
|
|
227
|
+
)
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
### Attribute Customization
|
|
231
|
+
|
|
232
|
+
```python
|
|
233
|
+
from misata import Customizer, ColumnOverride
|
|
234
|
+
import numpy as np
|
|
235
|
+
|
|
236
|
+
customizer = Customizer(seed=42)
|
|
237
|
+
|
|
238
|
+
# Custom age distribution (realistic, not uniform)
|
|
239
|
+
customizer.add_override("users", ColumnOverride(
|
|
240
|
+
name="age",
|
|
241
|
+
generator=lambda n: np.random.normal(35, 12, n).clip(18, 80).astype(int)
|
|
242
|
+
))
|
|
243
|
+
|
|
244
|
+
# Conditional values based on other columns
|
|
245
|
+
customizer.add_conditional("orders", "shipping_cost", {
|
|
246
|
+
"country": {"US": 5.99, "UK": 9.99, "DE": 7.99}
|
|
247
|
+
})
|
|
248
|
+
|
|
249
|
+
# Apply to generated data
|
|
250
|
+
df = customizer.apply(df, "users")
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
## ⚡ Performance
|
|
254
|
+
|
|
255
|
+
| Rows | Time | Speed |
|
|
256
|
+
|------|------|-------|
|
|
257
|
+
| 10K | 0.03s | 333K rows/sec |
|
|
258
|
+
| 100K | 0.26s | 385K rows/sec |
|
|
259
|
+
| 1M | 2.6s | 390K rows/sec |
|
|
260
|
+
| 10M | 26s | 390K rows/sec (streaming) |
|
|
261
|
+
|
|
262
|
+
## Try It Now
|
|
263
|
+
|
|
264
|
+
[Open in Colab](https://colab.research.google.com/github/rasinmuhammed/misata/blob/main/examples/getting_started.ipynb)
|
|
265
|
+
|
|
266
|
+
Try Misata in your browser without installing anything!
|
|
267
|
+
|
|
268
|
+
## 💼 Enterprise & Consulting
|
|
269
|
+
|
|
270
|
+
**Need help with complex scenarios?**
|
|
271
|
+
|
|
272
|
+
- 🏢 Custom enterprise data schemas (10M+ rows)
|
|
273
|
+
- 🔧 Integration with your existing pipelines
|
|
274
|
+
- 📊 Industry-specific realistic data generation
|
|
275
|
+
- 🎓 Training and onboarding for your team
|
|
276
|
+
|
|
277
|
+
📧 **Contact: rasinbinabdulla@gmail.com**
|
|
278
|
+
|
|
279
|
+
## 📄 License
|
|
280
|
+
|
|
281
|
+
MIT License
|
|
282
|
+
|
|
283
|
+
## 👤 Author
|
|
284
|
+
|
|
285
|
+
Built by **Muhammed Rasin**
|
|
286
|
+
|
|
287
|
+
---
|
|
288
|
+
|
|
289
|
+
**Misata** - From story to synthetic database in one command.
|
|
290
|
+
|
|
291
|
+
|
misata-0.1.0b0/README.md
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
# 🧠 Misata
|
|
2
|
+
|
|
3
|
+
**Generate realistic multi-table datasets from natural language.**
|
|
4
|
+
|
|
5
|
+
No schema writing. No training data. Just describe what you need.
|
|
6
|
+
|
|
7
|
+
[]()
|
|
8
|
+
[]()
|
|
9
|
+
[]()
|
|
10
|
+
|
|
11
|
+
## ✨ What Makes Misata Different
|
|
12
|
+
|
|
13
|
+
| Feature | Faker | SDV | **Misata** |
|
|
14
|
+
|---------|-------|-----|------------|
|
|
15
|
+
| Natural language input | ❌ | ❌ | ✅ |
|
|
16
|
+
| Auto schema generation | ❌ | ❌ | ✅ |
|
|
17
|
+
| Relational integrity | ❌ | ✅ | ✅ |
|
|
18
|
+
| Business constraints | ❌ | ❌ | ✅ |
|
|
19
|
+
| No training data needed | ✅ | ❌ | ✅ |
|
|
20
|
+
| Streaming (10M+ rows) | ❌ | ❌ | ✅ |
|
|
21
|
+
|
|
22
|
+
## 🚀 Quick Start
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install misata
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### With Groq (Free, Fast)
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
export GROQ_API_KEY=your_key # Get free: https://console.groq.com
|
|
32
|
+
misata generate --story "A SaaS with 50K users, subscriptions, and payments" --use-llm
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### With OpenAI
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
export OPENAI_API_KEY=your_key
|
|
39
|
+
misata generate --story "E-commerce with products and orders" --use-llm --provider openai
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### With Ollama (Local, Free, Private)
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
ollama run llama3 # Start Ollama first
|
|
46
|
+
misata generate --story "Fitness app with workouts" --use-llm --provider ollama
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## 📊 Example Output
|
|
50
|
+
|
|
51
|
+
```
|
|
52
|
+
$ misata generate --story "A fitness app with 50K users" --use-llm
|
|
53
|
+
|
|
54
|
+
🧠 Using Groq (llama-3.3-70b-versatile) for intelligent parsing...
|
|
55
|
+
✅ LLM schema generated successfully!
|
|
56
|
+
|
|
57
|
+
📋 Schema: FitnessApp
|
|
58
|
+
Tables: 5
|
|
59
|
+
Relationships: 4
|
|
60
|
+
|
|
61
|
+
🔧 Generating 5 table(s)...
|
|
62
|
+
|
|
63
|
+
✓ exercises (10 rows)
|
|
64
|
+
✓ plans (5 rows)
|
|
65
|
+
✓ users (50,000 rows)
|
|
66
|
+
✓ subscriptions (45,000 rows)
|
|
67
|
+
✓ workouts (500,000 rows)
|
|
68
|
+
|
|
69
|
+
⏱️ Generation time: 2.34 seconds
|
|
70
|
+
🚀 Performance: 213,675 rows/second
|
|
71
|
+
💾 Data saved to: ./generated_data
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## 💻 Python API
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from misata import DataSimulator, SchemaConfig
|
|
78
|
+
from misata.llm_parser import LLMSchemaGenerator
|
|
79
|
+
|
|
80
|
+
# Generate schema from story
|
|
81
|
+
llm = LLMSchemaGenerator(provider="groq") # or "openai", "ollama"
|
|
82
|
+
config = llm.generate_from_story(
|
|
83
|
+
"A mobile fitness app with 50K users, workout tracking, "
|
|
84
|
+
"premium subscriptions, and January signup spikes"
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
# Generate data
|
|
88
|
+
for table_name, batch in DataSimulator(config).generate_all():
|
|
89
|
+
print(f"Generated {len(batch)} rows for {table_name}")
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## 🔧 CLI Reference
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
# Basic generation (rule-based, no API key needed)
|
|
96
|
+
misata generate --story "SaaS company with users and subscriptions"
|
|
97
|
+
|
|
98
|
+
# LLM-powered generation
|
|
99
|
+
misata generate --story "..." --use-llm
|
|
100
|
+
|
|
101
|
+
# Specify provider and model
|
|
102
|
+
misata generate --story "..." --use-llm --provider ollama --model llama3
|
|
103
|
+
|
|
104
|
+
# Custom output directory
|
|
105
|
+
misata generate --story "..." --use-llm --output-dir ./my_data
|
|
106
|
+
|
|
107
|
+
# Set row count
|
|
108
|
+
misata generate --story "..." --use-llm --rows 100000
|
|
109
|
+
|
|
110
|
+
# Reproducible with seed
|
|
111
|
+
misata generate --story "..." --use-llm --seed 42
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## 🎯 Business Rule Constraints
|
|
115
|
+
|
|
116
|
+
Define rules like "employees can't log >8 hours/day":
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
from misata import Constraint, Table
|
|
120
|
+
|
|
121
|
+
timesheets = Table(
|
|
122
|
+
name="timesheets",
|
|
123
|
+
row_count=10000,
|
|
124
|
+
constraints=[
|
|
125
|
+
Constraint(
|
|
126
|
+
name="max_daily_hours",
|
|
127
|
+
type="sum_limit",
|
|
128
|
+
group_by=["employee_id", "date"],
|
|
129
|
+
column="hours",
|
|
130
|
+
value=8.0,
|
|
131
|
+
action="redistribute"
|
|
132
|
+
)
|
|
133
|
+
]
|
|
134
|
+
)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## 🔑 LLM Providers
|
|
138
|
+
|
|
139
|
+
| Provider | Env Variable | Free Tier | Notes |
|
|
140
|
+
|----------|--------------|-----------|-------|
|
|
141
|
+
| **Groq** | `GROQ_API_KEY` | ✅ 30 req/min | Fastest, recommended |
|
|
142
|
+
| **OpenAI** | `OPENAI_API_KEY` | ❌ | Best quality |
|
|
143
|
+
| **Ollama** | None | ✅ Local | Private, no internet |
|
|
144
|
+
|
|
145
|
+
## 📈 Extending Data Pools
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
from misata import TextGenerator
|
|
149
|
+
|
|
150
|
+
# Add custom names
|
|
151
|
+
TextGenerator.extend_pool("first_names", ["Arjun", "Priya", "Rahul"])
|
|
152
|
+
|
|
153
|
+
# Load from file
|
|
154
|
+
TextGenerator.load_pools_from_file("custom_pools.json")
|
|
155
|
+
|
|
156
|
+
# Save for reuse
|
|
157
|
+
TextGenerator.save_pools_to_file("expanded_pools.json")
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## 🤖 ML Training Data
|
|
161
|
+
|
|
162
|
+
Make your synthetic data **indistinguishable from real-world data** with noise injection:
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
from misata import add_noise, NoiseInjector
|
|
166
|
+
|
|
167
|
+
# Quick noise injection
|
|
168
|
+
noisy_df = add_noise(df,
|
|
169
|
+
null_rate=0.05, # 5% missing values
|
|
170
|
+
outlier_rate=0.02, # 2% statistical outliers
|
|
171
|
+
typo_rate=0.01, # 1% typos in text
|
|
172
|
+
duplicate_rate=0.03, # 3% duplicate rows
|
|
173
|
+
seed=42
|
|
174
|
+
)
|
|
175
|
+
|
|
176
|
+
# Advanced: Temporal distribution drift
|
|
177
|
+
injector = NoiseInjector(seed=42)
|
|
178
|
+
df = injector.apply_temporal_drift(df,
|
|
179
|
+
date_column="created_at",
|
|
180
|
+
value_column="revenue",
|
|
181
|
+
drift_rate=0.15, # 15% increase over time
|
|
182
|
+
drift_direction="up"
|
|
183
|
+
)
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
### Attribute Customization
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
from misata import Customizer, ColumnOverride
|
|
190
|
+
import numpy as np
|
|
191
|
+
|
|
192
|
+
customizer = Customizer(seed=42)
|
|
193
|
+
|
|
194
|
+
# Custom age distribution (realistic, not uniform)
|
|
195
|
+
customizer.add_override("users", ColumnOverride(
|
|
196
|
+
name="age",
|
|
197
|
+
generator=lambda n: np.random.normal(35, 12, n).clip(18, 80).astype(int)
|
|
198
|
+
))
|
|
199
|
+
|
|
200
|
+
# Conditional values based on other columns
|
|
201
|
+
customizer.add_conditional("orders", "shipping_cost", {
|
|
202
|
+
"country": {"US": 5.99, "UK": 9.99, "DE": 7.99}
|
|
203
|
+
})
|
|
204
|
+
|
|
205
|
+
# Apply to generated data
|
|
206
|
+
df = customizer.apply(df, "users")
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
## ⚡ Performance
|
|
210
|
+
|
|
211
|
+
| Rows | Time | Speed |
|
|
212
|
+
|------|------|-------|
|
|
213
|
+
| 10K | 0.03s | 333K rows/sec |
|
|
214
|
+
| 100K | 0.26s | 385K rows/sec |
|
|
215
|
+
| 1M | 2.6s | 390K rows/sec |
|
|
216
|
+
| 10M | 26s | 390K rows/sec (streaming) |
|
|
217
|
+
|
|
218
|
+
## Try It Now
|
|
219
|
+
|
|
220
|
+
[Open in Colab](https://colab.research.google.com/github/rasinmuhammed/misata/blob/main/examples/getting_started.ipynb)
|
|
221
|
+
|
|
222
|
+
Try Misata in your browser without installing anything!
|
|
223
|
+
|
|
224
|
+
## 💼 Enterprise & Consulting
|
|
225
|
+
|
|
226
|
+
**Need help with complex scenarios?**
|
|
227
|
+
|
|
228
|
+
- 🏢 Custom enterprise data schemas (10M+ rows)
|
|
229
|
+
- 🔧 Integration with your existing pipelines
|
|
230
|
+
- 📊 Industry-specific realistic data generation
|
|
231
|
+
- 🎓 Training and onboarding for your team
|
|
232
|
+
|
|
233
|
+
📧 **Contact: rasinbinabdulla@gmail.com**
|
|
234
|
+
|
|
235
|
+
## 📄 License
|
|
236
|
+
|
|
237
|
+
MIT License
|
|
238
|
+
|
|
239
|
+
## 👤 Author
|
|
240
|
+
|
|
241
|
+
Built by **Muhammed Rasin**
|
|
242
|
+
|
|
243
|
+
---
|
|
244
|
+
|
|
245
|
+
**Misata** - From story to synthetic database in one command.
|
|
246
|
+
|
|
247
|
+
|
misata-0.1.0b0/misata/__init__.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Misata - AI-Powered Synthetic Data Engine
|
|
3
|
+
|
|
4
|
+
Generate realistic multi-table datasets from natural language descriptions.
|
|
5
|
+
Supports OpenAI, Groq, Gemini, and Ollama for intelligent schema generation.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
from misata import DataSimulator, SchemaConfig
|
|
9
|
+
|
|
10
|
+
# Or use the CLI:
|
|
11
|
+
# misata generate --story "A SaaS with 50k users..."
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
__version__ = "0.1.0-beta"
|
|
15
|
+
__author__ = "Muhammed Rasin"
|
|
16
|
+
|
|
17
|
+
from misata.schema import (
|
|
18
|
+
Column,
|
|
19
|
+
Constraint,
|
|
20
|
+
Relationship,
|
|
21
|
+
ScenarioEvent,
|
|
22
|
+
SchemaConfig,
|
|
23
|
+
Table,
|
|
24
|
+
)
|
|
25
|
+
from misata.simulator import DataSimulator
|
|
26
|
+
from misata.generators import TextGenerator
|
|
27
|
+
from misata.noise import NoiseInjector, add_noise
|
|
28
|
+
from misata.customization import Customizer, ColumnOverride
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
# Core
|
|
32
|
+
"Column",
|
|
33
|
+
"Constraint",
|
|
34
|
+
"Relationship",
|
|
35
|
+
"ScenarioEvent",
|
|
36
|
+
"SchemaConfig",
|
|
37
|
+
"Table",
|
|
38
|
+
"DataSimulator",
|
|
39
|
+
# Extensibility
|
|
40
|
+
"TextGenerator",
|
|
41
|
+
# ML-ready features
|
|
42
|
+
"NoiseInjector",
|
|
43
|
+
"add_noise",
|
|
44
|
+
"Customizer",
|
|
45
|
+
"ColumnOverride",
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
|