misata 0.1.0b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +48 -0
- misata/api.py +460 -0
- misata/audit.py +415 -0
- misata/benchmark.py +376 -0
- misata/cli.py +680 -0
- misata/codegen.py +153 -0
- misata/curve_fitting.py +106 -0
- misata/customization.py +256 -0
- misata/feedback.py +433 -0
- misata/formulas.py +362 -0
- misata/generators.py +247 -0
- misata/hybrid.py +398 -0
- misata/llm_parser.py +493 -0
- misata/noise.py +346 -0
- misata/schema.py +252 -0
- misata/semantic.py +185 -0
- misata/simulator.py +742 -0
- misata/story_parser.py +425 -0
- misata/templates/__init__.py +444 -0
- misata/validation.py +313 -0
- misata-0.1.0b0.dist-info/METADATA +291 -0
- misata-0.1.0b0.dist-info/RECORD +25 -0
- misata-0.1.0b0.dist-info/WHEEL +5 -0
- misata-0.1.0b0.dist-info/entry_points.txt +2 -0
- misata-0.1.0b0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,444 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Industry templates for quick-start synthetic data generation.
|
|
3
|
+
|
|
4
|
+
Each template provides:
|
|
5
|
+
- Reference tables with realistic inline data
|
|
6
|
+
- Transactional tables with proper relationships
|
|
7
|
+
- Industry-specific column definitions
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from typing import Dict, List, Any
|
|
11
|
+
|
|
12
|
+
from misata.schema import SchemaConfig, Table, Column, Relationship
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# ============================================================================
|
|
16
|
+
# SAAS TEMPLATE
|
|
17
|
+
# ============================================================================
|
|
18
|
+
|
|
19
|
+
# Template describing a SaaS business: one reference table ("plans") with
# inline rows, plus four generated tables that declare only a row count.
SAAS_TEMPLATE = {
    "name": "SaaS Company Dataset",
    "description": "Complete SaaS company data with users, plans, subscriptions, and payments",
    "seed": 42,
    "tables": [
        # Reference table: rows are given verbatim via "inline_data".
        {
            "name": "plans",
            "is_reference": True,
            "inline_data": [
                {"id": 1, "name": "Free", "price": 0.0, "billing_period": "monthly", "features": "Basic features, 1 user"},
                {"id": 2, "name": "Starter", "price": 9.99, "billing_period": "monthly", "features": "All free + 5 users, analytics"},
                {"id": 3, "name": "Professional", "price": 29.99, "billing_period": "monthly", "features": "All starter + 25 users, API access"},
                {"id": 4, "name": "Enterprise", "price": 99.99, "billing_period": "monthly", "features": "Unlimited users, custom integrations, SLA"},
            ],
        },
        # Generated tables: only a name and a target row count.
        {"name": "users", "row_count": 10000, "is_reference": False},
        {"name": "subscriptions", "row_count": 8000, "is_reference": False},
        {"name": "payments", "row_count": 50000, "is_reference": False},
        {"name": "usage_events", "row_count": 100000, "is_reference": False},
    ],
    # Column definitions keyed by table name. Columns typed "foreign_key"
    # match the "child_key" entries listed under "relationships" below.
    "columns": {
        "users": [
            {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 10000}, "unique": True},
            {"name": "name", "type": "text", "distribution_params": {"text_type": "name"}},
            {"name": "email", "type": "text", "distribution_params": {"text_type": "email"}},
            {"name": "company", "type": "text", "distribution_params": {"text_type": "company"}},
            {"name": "created_at", "type": "date", "distribution_params": {"start": "2022-01-01", "end": "2024-12-31"}},
        ],
        "subscriptions": [
            {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 8000}},
            {"name": "user_id", "type": "foreign_key", "distribution_params": {}},
            {"name": "plan_id", "type": "foreign_key", "distribution_params": {}},
            {"name": "status", "type": "categorical", "distribution_params": {"choices": ["active", "cancelled", "paused", "trial"], "probabilities": [0.7, 0.15, 0.1, 0.05]}},
            {"name": "started_at", "type": "date", "distribution_params": {"start": "2022-01-01", "end": "2024-12-31"}},
        ],
        "payments": [
            {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 50000}},
            {"name": "subscription_id", "type": "foreign_key", "distribution_params": {}},
            # Amount choices equal the prices of the paid plans listed above.
            {"name": "amount", "type": "categorical", "distribution_params": {"choices": [9.99, 29.99, 99.99], "probabilities": [0.5, 0.35, 0.15]}},
            {"name": "status", "type": "categorical", "distribution_params": {"choices": ["completed", "pending", "failed", "refunded"], "probabilities": [0.9, 0.05, 0.03, 0.02]}},
            {"name": "paid_at", "type": "date", "distribution_params": {"start": "2022-01-01", "end": "2024-12-31"}},
        ],
        "usage_events": [
            {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 100000}},
            {"name": "user_id", "type": "foreign_key", "distribution_params": {}},
            {"name": "event_type", "type": "categorical", "distribution_params": {"choices": ["login", "api_call", "export", "invite_user", "report_view"]}},
            {"name": "created_at", "type": "date", "distribution_params": {"start": "2023-01-01", "end": "2024-12-31"}},
        ],
    },
    # Parent/child links between the tables declared above.
    "relationships": [
        {"parent_table": "users", "child_table": "subscriptions", "parent_key": "id", "child_key": "user_id"},
        {"parent_table": "plans", "child_table": "subscriptions", "parent_key": "id", "child_key": "plan_id"},
        {"parent_table": "subscriptions", "child_table": "payments", "parent_key": "id", "child_key": "subscription_id"},
        {"parent_table": "users", "child_table": "usage_events", "parent_key": "id", "child_key": "user_id"},
    ],
    "events": [],
}
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# ============================================================================
|
|
79
|
+
# E-COMMERCE TEMPLATE
|
|
80
|
+
# ============================================================================
|
|
81
|
+
|
|
82
|
+
# Template describing an online store: two reference tables ("categories",
# "products") with inline rows, plus four generated tables.
ECOMMERCE_TEMPLATE = {
    "name": "E-Commerce Store Dataset",
    "description": "Complete e-commerce data with products, orders, and reviews",
    "seed": 42,
    "tables": [
        # Reference table: product categories.
        {
            "name": "categories",
            "is_reference": True,
            "inline_data": [
                {"id": 1, "name": "Electronics", "description": "Phones, computers, accessories"},
                {"id": 2, "name": "Clothing", "description": "Apparel and fashion"},
                {"id": 3, "name": "Home & Garden", "description": "Furniture and decor"},
                {"id": 4, "name": "Sports", "description": "Sports equipment and apparel"},
                {"id": 5, "name": "Books", "description": "Books and media"},
            ],
        },
        # Reference table: product catalogue; "category_id" values use the
        # ids of the "categories" rows above.
        {
            "name": "products",
            "is_reference": True,
            "inline_data": [
                {"id": 1, "name": "iPhone 15 Pro", "category_id": 1, "price": 999.99, "stock": 150},
                {"id": 2, "name": "MacBook Air M3", "category_id": 1, "price": 1299.99, "stock": 80},
                {"id": 3, "name": "AirPods Pro", "category_id": 1, "price": 249.99, "stock": 500},
                {"id": 4, "name": "Classic T-Shirt", "category_id": 2, "price": 29.99, "stock": 1000},
                {"id": 5, "name": "Running Shoes", "category_id": 4, "price": 89.99, "stock": 300},
                {"id": 6, "name": "Yoga Mat", "category_id": 4, "price": 39.99, "stock": 450},
                {"id": 7, "name": "Coffee Table", "category_id": 3, "price": 199.99, "stock": 75},
                {"id": 8, "name": "Desk Lamp", "category_id": 3, "price": 49.99, "stock": 200},
                {"id": 9, "name": "Python Cookbook", "category_id": 5, "price": 49.99, "stock": 120},
                {"id": 10, "name": "Data Science Handbook", "category_id": 5, "price": 59.99, "stock": 100},
            ],
        },
        # Generated tables: only a name and a target row count.
        {"name": "customers", "row_count": 10000, "is_reference": False},
        {"name": "orders", "row_count": 25000, "is_reference": False},
        {"name": "order_items", "row_count": 50000, "is_reference": False},
        {"name": "reviews", "row_count": 15000, "is_reference": False},
    ],
    # Column definitions keyed by table name. Columns typed "foreign_key"
    # match the "child_key" entries listed under "relationships" below.
    "columns": {
        "customers": [
            {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 10000}, "unique": True},
            {"name": "name", "type": "text", "distribution_params": {"text_type": "name"}},
            {"name": "email", "type": "text", "distribution_params": {"text_type": "email"}},
            {"name": "address", "type": "text", "distribution_params": {"text_type": "address"}},
            {"name": "created_at", "type": "date", "distribution_params": {"start": "2020-01-01", "end": "2024-12-31"}},
        ],
        "orders": [
            {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 25000}},
            {"name": "customer_id", "type": "foreign_key", "distribution_params": {}},
            {"name": "status", "type": "categorical", "distribution_params": {"choices": ["pending", "shipped", "delivered", "cancelled", "returned"], "probabilities": [0.1, 0.15, 0.65, 0.05, 0.05]}},
            {"name": "total", "type": "float", "distribution_params": {"distribution": "exponential", "scale": 150, "min": 10, "max": 5000}},
            {"name": "ordered_at", "type": "date", "distribution_params": {"start": "2022-01-01", "end": "2024-12-31"}},
        ],
        "order_items": [
            {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 50000}},
            {"name": "order_id", "type": "foreign_key", "distribution_params": {}},
            {"name": "product_id", "type": "foreign_key", "distribution_params": {}},
            {"name": "quantity", "type": "int", "distribution_params": {"distribution": "poisson", "lambda": 2, "min": 1, "max": 10}},
        ],
        "reviews": [
            {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 15000}},
            {"name": "product_id", "type": "foreign_key", "distribution_params": {}},
            {"name": "customer_id", "type": "foreign_key", "distribution_params": {}},
            # NOTE(review): typed "int" but parameterised with choices/probabilities
            # like the categorical columns - confirm the generator supports this.
            {"name": "rating", "type": "int", "distribution_params": {"choices": [1, 2, 3, 4, 5], "probabilities": [0.05, 0.05, 0.15, 0.35, 0.40]}},
            {"name": "created_at", "type": "date", "distribution_params": {"start": "2022-01-01", "end": "2024-12-31"}},
        ],
    },
    # Parent/child links between the tables declared above.
    "relationships": [
        {"parent_table": "customers", "child_table": "orders", "parent_key": "id", "child_key": "customer_id"},
        {"parent_table": "orders", "child_table": "order_items", "parent_key": "id", "child_key": "order_id"},
        {"parent_table": "products", "child_table": "order_items", "parent_key": "id", "child_key": "product_id"},
        {"parent_table": "products", "child_table": "reviews", "parent_key": "id", "child_key": "product_id"},
        {"parent_table": "customers", "child_table": "reviews", "parent_key": "id", "child_key": "customer_id"},
    ],
    "events": [],
}
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
# ============================================================================
|
|
160
|
+
# FITNESS TEMPLATE
|
|
161
|
+
# ============================================================================
|
|
162
|
+
|
|
163
|
+
# Template describing a fitness app: three reference tables ("plans",
# "exercises", "meal_types") with inline rows, plus four generated tables.
FITNESS_TEMPLATE = {
    "name": "Fitness App Dataset",
    "description": "Fitness app data with exercises, workouts, and nutrition",
    "seed": 42,
    "tables": [
        # Reference table: subscription plans.
        {
            "name": "plans",
            "is_reference": True,
            "inline_data": [
                {"id": 1, "name": "Free", "price": 0.0, "features": "Basic workout tracking"},
                {"id": 2, "name": "Premium", "price": 9.99, "features": "All workouts + nutrition tracking"},
                {"id": 3, "name": "Pro", "price": 19.99, "features": "Everything + personal coaching"},
            ],
        },
        # Reference table: exercise catalogue.
        {
            "name": "exercises",
            "is_reference": True,
            "inline_data": [
                {"id": 1, "name": "Running", "category": "Cardio", "calories_per_minute": 10, "difficulty": "medium"},
                {"id": 2, "name": "Cycling", "category": "Cardio", "calories_per_minute": 8, "difficulty": "easy"},
                {"id": 3, "name": "Swimming", "category": "Cardio", "calories_per_minute": 9, "difficulty": "medium"},
                {"id": 4, "name": "Yoga", "category": "Flexibility", "calories_per_minute": 3, "difficulty": "easy"},
                {"id": 5, "name": "Pilates", "category": "Flexibility", "calories_per_minute": 4, "difficulty": "medium"},
                {"id": 6, "name": "Weightlifting", "category": "Strength", "calories_per_minute": 6, "difficulty": "hard"},
                {"id": 7, "name": "HIIT", "category": "Cardio", "calories_per_minute": 12, "difficulty": "hard"},
                {"id": 8, "name": "Boxing", "category": "Cardio", "calories_per_minute": 11, "difficulty": "hard"},
                {"id": 9, "name": "Stretching", "category": "Flexibility", "calories_per_minute": 2, "difficulty": "easy"},
                {"id": 10, "name": "Walking", "category": "Cardio", "calories_per_minute": 4, "difficulty": "easy"},
            ],
        },
        # Reference table: meal types.
        {
            "name": "meal_types",
            "is_reference": True,
            "inline_data": [
                {"id": 1, "name": "Breakfast", "typical_calories": 400},
                {"id": 2, "name": "Lunch", "typical_calories": 600},
                {"id": 3, "name": "Dinner", "typical_calories": 700},
                {"id": 4, "name": "Snack", "typical_calories": 200},
            ],
        },
        # Generated tables: only a name and a target row count.
        {"name": "users", "row_count": 10000, "is_reference": False},
        {"name": "subscriptions", "row_count": 8000, "is_reference": False},
        {"name": "workouts", "row_count": 100000, "is_reference": False},
        {"name": "meals", "row_count": 50000, "is_reference": False},
    ],
    # Column definitions keyed by table name. Columns typed "foreign_key"
    # match the "child_key" entries listed under "relationships" below.
    "columns": {
        "users": [
            {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 10000}, "unique": True},
            {"name": "name", "type": "text", "distribution_params": {"text_type": "name"}},
            {"name": "email", "type": "text", "distribution_params": {"text_type": "email"}},
            {"name": "age", "type": "int", "distribution_params": {"distribution": "uniform", "min": 18, "max": 65}},
            {"name": "weight_kg", "type": "float", "distribution_params": {"distribution": "normal", "mean": 75, "std": 15, "min": 40, "max": 150}},
            {"name": "height_cm", "type": "float", "distribution_params": {"distribution": "normal", "mean": 170, "std": 10, "min": 140, "max": 210}},
            {"name": "goal", "type": "categorical", "distribution_params": {"choices": ["lose_weight", "build_muscle", "maintain", "improve_endurance"]}},
        ],
        "subscriptions": [
            {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 8000}},
            {"name": "user_id", "type": "foreign_key", "distribution_params": {}},
            {"name": "plan_id", "type": "foreign_key", "distribution_params": {}},
            {"name": "status", "type": "categorical", "distribution_params": {"choices": ["active", "cancelled", "paused"], "probabilities": [0.75, 0.15, 0.10]}},
            {"name": "started_at", "type": "date", "distribution_params": {"start": "2022-01-01", "end": "2024-12-31"}},
        ],
        "workouts": [
            {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 100000}},
            {"name": "user_id", "type": "foreign_key", "distribution_params": {}},
            {"name": "exercise_id", "type": "foreign_key", "distribution_params": {}},
            {"name": "duration_minutes", "type": "int", "distribution_params": {"distribution": "uniform", "min": 15, "max": 90}},
            {"name": "calories_burned", "type": "int", "distribution_params": {"distribution": "normal", "mean": 300, "std": 150, "min": 50, "max": 1500}},
            {"name": "date", "type": "date", "distribution_params": {"start": "2023-01-01", "end": "2024-12-31"}},
        ],
        "meals": [
            {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 50000}},
            {"name": "user_id", "type": "foreign_key", "distribution_params": {}},
            {"name": "meal_type_id", "type": "foreign_key", "distribution_params": {}},
            {"name": "calories", "type": "int", "distribution_params": {"distribution": "normal", "mean": 500, "std": 200, "min": 100, "max": 1500}},
            {"name": "date", "type": "date", "distribution_params": {"start": "2023-01-01", "end": "2024-12-31"}},
        ],
    },
    # Parent/child links between the tables declared above.
    "relationships": [
        {"parent_table": "users", "child_table": "subscriptions", "parent_key": "id", "child_key": "user_id"},
        {"parent_table": "plans", "child_table": "subscriptions", "parent_key": "id", "child_key": "plan_id"},
        {"parent_table": "users", "child_table": "workouts", "parent_key": "id", "child_key": "user_id"},
        {"parent_table": "exercises", "child_table": "workouts", "parent_key": "id", "child_key": "exercise_id"},
        {"parent_table": "users", "child_table": "meals", "parent_key": "id", "child_key": "user_id"},
        {"parent_table": "meal_types", "child_table": "meals", "parent_key": "id", "child_key": "meal_type_id"},
    ],
    "events": [],
}
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
# ============================================================================
|
|
254
|
+
# HEALTHCARE TEMPLATE
|
|
255
|
+
# ============================================================================
|
|
256
|
+
|
|
257
|
+
# Template describing a healthcare system: two reference tables
# ("departments", "diagnoses_catalog") with inline rows, plus four
# generated tables.
HEALTHCARE_TEMPLATE = {
    "name": "Healthcare System Dataset",
    "description": "Healthcare data with patients, doctors, appointments, and diagnoses",
    "seed": 42,
    "tables": [
        # Reference table: hospital departments.
        {
            "name": "departments",
            "is_reference": True,
            "inline_data": [
                {"id": 1, "name": "Cardiology", "floor": 3},
                {"id": 2, "name": "Orthopedics", "floor": 4},
                {"id": 3, "name": "Pediatrics", "floor": 2},
                {"id": 4, "name": "Neurology", "floor": 5},
                {"id": 5, "name": "General Medicine", "floor": 1},
                {"id": 6, "name": "Emergency", "floor": 1},
            ],
        },
        # Reference table: catalogue of diagnosis codes.
        {
            "name": "diagnoses_catalog",
            "is_reference": True,
            "inline_data": [
                {"id": 1, "code": "J06.9", "name": "Acute upper respiratory infection", "category": "Respiratory"},
                {"id": 2, "code": "I10", "name": "Essential hypertension", "category": "Cardiovascular"},
                {"id": 3, "code": "E11.9", "name": "Type 2 diabetes", "category": "Endocrine"},
                {"id": 4, "code": "M54.5", "name": "Low back pain", "category": "Musculoskeletal"},
                {"id": 5, "code": "J18.9", "name": "Pneumonia", "category": "Respiratory"},
                {"id": 6, "code": "K21.0", "name": "GERD", "category": "Digestive"},
                {"id": 7, "code": "F32.9", "name": "Major depressive disorder", "category": "Mental Health"},
                {"id": 8, "code": "G43.909", "name": "Migraine", "category": "Neurological"},
            ],
        },
        # Generated tables: only a name and a target row count.
        {"name": "doctors", "row_count": 100, "is_reference": False},
        {"name": "patients", "row_count": 10000, "is_reference": False},
        {"name": "appointments", "row_count": 50000, "is_reference": False},
        {"name": "patient_diagnoses", "row_count": 30000, "is_reference": False},
    ],
    # Column definitions keyed by table name. Columns typed "foreign_key"
    # match the "child_key" entries listed under "relationships" below.
    "columns": {
        "doctors": [
            {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 100}, "unique": True},
            {"name": "name", "type": "text", "distribution_params": {"text_type": "name"}},
            {"name": "department_id", "type": "foreign_key", "distribution_params": {}},
            {"name": "specialization", "type": "categorical", "distribution_params": {"choices": ["MD", "DO", "Specialist", "Surgeon"]}},
            {"name": "years_experience", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 35}},
        ],
        "patients": [
            {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 10000}, "unique": True},
            {"name": "name", "type": "text", "distribution_params": {"text_type": "name"}},
            {"name": "date_of_birth", "type": "date", "distribution_params": {"start": "1940-01-01", "end": "2010-12-31"}},
            {"name": "gender", "type": "categorical", "distribution_params": {"choices": ["Male", "Female", "Other"], "probabilities": [0.48, 0.48, 0.04]}},
            {"name": "phone", "type": "text", "distribution_params": {"text_type": "phone"}},
            {"name": "blood_type", "type": "categorical", "distribution_params": {"choices": ["A+", "A-", "B+", "B-", "AB+", "AB-", "O+", "O-"]}},
        ],
        "appointments": [
            {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 50000}},
            {"name": "patient_id", "type": "foreign_key", "distribution_params": {}},
            {"name": "doctor_id", "type": "foreign_key", "distribution_params": {}},
            {"name": "scheduled_at", "type": "date", "distribution_params": {"start": "2023-01-01", "end": "2025-12-31"}},
            {"name": "status", "type": "categorical", "distribution_params": {"choices": ["scheduled", "completed", "cancelled", "no_show"], "probabilities": [0.2, 0.65, 0.10, 0.05]}},
            # NOTE(review): typed "int" but parameterised with choices/probabilities
            # like the categorical columns - confirm the generator supports this.
            {"name": "duration_minutes", "type": "int", "distribution_params": {"choices": [15, 30, 45, 60], "probabilities": [0.3, 0.4, 0.2, 0.1]}},
        ],
        "patient_diagnoses": [
            {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 30000}},
            {"name": "patient_id", "type": "foreign_key", "distribution_params": {}},
            {"name": "diagnosis_id", "type": "foreign_key", "distribution_params": {}},
            {"name": "diagnosed_at", "type": "date", "distribution_params": {"start": "2020-01-01", "end": "2024-12-31"}},
            {"name": "severity", "type": "categorical", "distribution_params": {"choices": ["mild", "moderate", "severe"], "probabilities": [0.5, 0.35, 0.15]}},
        ],
    },
    # Parent/child links between the tables declared above.
    "relationships": [
        {"parent_table": "departments", "child_table": "doctors", "parent_key": "id", "child_key": "department_id"},
        {"parent_table": "patients", "child_table": "appointments", "parent_key": "id", "child_key": "patient_id"},
        {"parent_table": "doctors", "child_table": "appointments", "parent_key": "id", "child_key": "doctor_id"},
        {"parent_table": "patients", "child_table": "patient_diagnoses", "parent_key": "id", "child_key": "patient_id"},
        {"parent_table": "diagnoses_catalog", "child_table": "patient_diagnoses", "parent_key": "id", "child_key": "diagnosis_id"},
    ],
    "events": [],
}
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
# ============================================================================
|
|
337
|
+
# TEMPLATE REGISTRY
|
|
338
|
+
# ============================================================================
|
|
339
|
+
|
|
340
|
+
# Registry mapping each public template name to its template dictionary.
# Insertion order is the order in which names are listed to callers.
TEMPLATES = dict(
    saas=SAAS_TEMPLATE,
    ecommerce=ECOMMERCE_TEMPLATE,
    fitness=FITNESS_TEMPLATE,
    healthcare=HEALTHCARE_TEMPLATE,
)
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def get_template(name: str) -> Dict[str, Any]:
    """
    Look up a registered template.

    The returned dictionary is the registry's own object, not a copy.

    Args:
        name: Key in ``TEMPLATES`` (saas, ecommerce, fitness, healthcare)

    Returns:
        The template dictionary registered under ``name``

    Raises:
        ValueError: If ``name`` is not a registered template; the message
            lists the available names
    """
    # Happy path first; everything below handles the unknown-name case.
    if name in TEMPLATES:
        return TEMPLATES[name]

    available = ", ".join(TEMPLATES)
    raise ValueError(f"Template '{name}' not found. Available: {available}")
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def list_templates() -> List[str]:
    """Return the registered template names as a new list, in registry order."""
    return [*TEMPLATES]
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def template_to_schema(template_name: str, row_multiplier: float = 1.0) -> SchemaConfig:
    """
    Convert a template to a SchemaConfig.

    The registered template is treated as read-only. Scaled row counts are
    computed per call instead of being written back into the template, so
    repeated calls with a ``row_multiplier`` do not compound and later
    callers still see the template's base row counts.

    Args:
        template_name: Name of template
        row_multiplier: Multiply row counts by this factor. Applied only to
            non-reference tables that declare an explicit ``row_count``;
            the scaled value is truncated with ``int()``.

    Returns:
        SchemaConfig ready for generation

    Raises:
        ValueError: If template not found (raised by ``get_template``)
    """
    template = get_template(template_name)

    # Parse tables, scaling row counts locally (never mutate the template).
    tables = []
    for t in template["tables"]:
        is_reference = t.get("is_reference", False)
        inline_data = t.get("inline_data")

        if "row_count" in t:
            row_count = t["row_count"]
            if row_multiplier != 1.0 and not is_reference:
                row_count = int(row_count * row_multiplier)
        else:
            # No explicit count: use the number of inline rows, or 100 when
            # there are none. ``or []`` tolerates an explicit None value.
            row_count = len(inline_data or []) or 100

        tables.append(Table(
            name=t["name"],
            row_count=row_count,
            is_reference=is_reference,
            inline_data=inline_data,
        ))

    # Parse columns
    columns = {
        table_name: [
            Column(
                name=c["name"],
                type=c["type"],
                distribution_params=c.get("distribution_params", {}),
                nullable=c.get("nullable", False),
                unique=c.get("unique", False),
            )
            for c in cols
        ]
        for table_name, cols in template["columns"].items()
    }

    # Add inferred columns for reference tables that have inline rows but no
    # explicit column list: one column per key of the first inline row.
    for table in tables:
        if table.is_reference and table.inline_data and table.name not in columns:
            first_row = table.inline_data[0]
            columns[table.name] = [
                Column(
                    name=col_name,
                    type="text",  # Will be inferred
                    distribution_params={},
                )
                for col_name in first_row
            ]

    # Parse relationships
    relationships = [
        Relationship(
            parent_table=r["parent_table"],
            child_table=r["child_table"],
            parent_key=r["parent_key"],
            child_key=r["child_key"],
        )
        for r in template["relationships"]
    ]

    return SchemaConfig(
        name=template["name"],
        description=template.get("description"),
        tables=tables,
        columns=columns,
        relationships=relationships,
        events=[],
        seed=template.get("seed", 42),
    )
|