misata 0.1.0b0__py3-none-any.whl → 0.2.0b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +13 -2
- misata/llm_parser.py +41 -2
- misata/quality.py +329 -0
- misata/schema.py +8 -3
- misata/simulator.py +81 -5
- misata/smart_values.py +593 -0
- misata/templates/library.py +344 -0
- {misata-0.1.0b0.dist-info → misata-0.2.0b0.dist-info}/METADATA +4 -2
- {misata-0.1.0b0.dist-info → misata-0.2.0b0.dist-info}/RECORD +13 -9
- misata-0.2.0b0.dist-info/licenses/LICENSE +21 -0
- {misata-0.1.0b0.dist-info → misata-0.2.0b0.dist-info}/WHEEL +0 -0
- {misata-0.1.0b0.dist-info → misata-0.2.0b0.dist-info}/entry_points.txt +0 -0
- {misata-0.1.0b0.dist-info → misata-0.2.0b0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pre-built schema templates for common use cases.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
from misata.templates.library import load_template, list_templates
|
|
6
|
+
|
|
7
|
+
# See available templates
|
|
8
|
+
print(list_templates())
|
|
9
|
+
|
|
10
|
+
# Load a template
|
|
11
|
+
config = load_template("ecommerce")
|
|
12
|
+
|
|
13
|
+
# Generate data
|
|
14
|
+
from misata import DataSimulator
|
|
15
|
+
for table, batch in DataSimulator(config).generate_all():
|
|
16
|
+
print(f"Generated {len(batch)} rows for {table}")
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from misata.schema import Column, Relationship, SchemaConfig, Table
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def list_templates() -> list:
    """Return the names of the built-in schema templates.

    Returns:
        A fresh list of template names, in the order "ecommerce", "saas",
        "healthcare", "fintech". Mutating it does not affect later calls.
    """
    builtin_names = ("ecommerce", "saas", "healthcare", "fintech")
    return list(builtin_names)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _scale_row_counts(config: SchemaConfig, row_multiplier: float) -> None:
    """Scale every non-reference table of *config* in place by *row_multiplier*."""
    for table in config.tables:
        if table.is_reference:
            # Reference tables carry fixed inline data; their size never changes.
            continue

        # Clamp to one row: truncating a small table to zero rows would leave
        # every child table without a parent row to reference.
        table.row_count = max(1, int(table.row_count * row_multiplier))

        # The built-in templates declare each unique integer id with
        # max == row_count. When scaling up, widen that range so it can still
        # hold row_count distinct values.
        # NOTE(review): assumes SchemaConfig.columns is the table-name -> list
        # of Column mapping the templates pass in -- confirm against schema.py.
        for column in config.columns.get(table.name, []):
            params = column.distribution_params
            if not (column.unique and column.type == "int" and params and "max" in params):
                continue
            lowest = params.get("min", 0)
            if params["max"] - lowest + 1 < table.row_count:
                params["max"] = lowest + table.row_count - 1


def load_template(name: str, row_multiplier: float = 1.0) -> SchemaConfig:
    """
    Load a pre-built schema template, optionally scaling its row counts.

    Args:
        name: Template name (ecommerce, saas, healthcare, fintech)
        row_multiplier: Positive, finite factor applied to the row count of
            every non-reference table (e.g., 0.1 for 10%, 2.0 for 2x).
            Scaled counts are truncated to an integer but never drop below 1.

    Returns:
        SchemaConfig ready for DataSimulator

    Raises:
        ValueError: If row_multiplier is not a positive finite number, or if
            name is not a known template.
    """
    # A single chained comparison rejects zero, negatives, infinity and NaN
    # (NaN fails every ordering comparison).
    if not 0 < row_multiplier < float("inf"):
        raise ValueError(
            f"row_multiplier must be a positive finite number, got {row_multiplier!r}"
        )

    templates = {
        "ecommerce": _ecommerce_template,
        "saas": _saas_template,
        "healthcare": _healthcare_template,
        "fintech": _fintech_template,
    }

    if name not in templates:
        raise ValueError(f"Unknown template: {name}. Available: {list(templates.keys())}")

    config = templates[name]()

    # Apply row multiplier
    if row_multiplier != 1.0:
        _scale_row_counts(config, row_multiplier)

    return config
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _ecommerce_template() -> SchemaConfig:
    """Build the e-commerce template.

    Two reference tables (categories, shipping_methods) carry inline rows;
    customers, products, orders, order_items and reviews are generated tables
    whose unique ``id`` column spans 1..row_count.
    """
    # Generated tables and their sizes, in declaration order.
    row_counts = {
        "customers": 10000,
        "products": 500,
        "orders": 50000,
        "order_items": 150000,
        "reviews": 20000,
    }

    def column(name, kind, params):
        # Plain (non-unique) column.
        return Column(name=name, type=kind, distribution_params=params)

    def primary_key(table_name):
        # Unique integer id covering 1..row_count of the given table.
        return Column(
            name="id",
            type="int",
            distribution_params={"min": 1, "max": row_counts[table_name]},
            unique=True,
        )

    def foreign_key(name):
        return column(name, "foreign_key", {})

    def link(parent, child, child_key):
        # Every relationship in this template targets the parent's "id".
        return Relationship(
            parent_table=parent,
            child_table=child,
            parent_key="id",
            child_key=child_key,
        )

    # Reference tables
    category_rows = [
        ("Electronics", 15),
        ("Clothing", 40),
        ("Home & Garden", 25),
        ("Sports", 30),
        ("Books", 35),
        ("Beauty", 50),
    ]
    shipping_rows = [
        ("Standard", 5, 4.99),
        ("Express", 2, 9.99),
        ("Next Day", 1, 19.99),
        ("Free Shipping", 7, 0.00),
    ]
    categories = Table(
        name="categories",
        is_reference=True,
        inline_data=[
            {"id": row_id, "name": label, "margin_pct": margin}
            for row_id, (label, margin) in enumerate(category_rows, start=1)
        ],
    )
    shipping_methods = Table(
        name="shipping_methods",
        is_reference=True,
        inline_data=[
            {"id": row_id, "name": label, "days": days, "cost": cost}
            for row_id, (label, days, cost) in enumerate(shipping_rows, start=1)
        ],
    )

    # Transactional tables
    generated = [Table(name=table, row_count=count) for table, count in row_counts.items()]

    columns = {
        "customers": [
            primary_key("customers"),
            column("name", "text", {"text_type": "name"}),
            column("email", "text", {"text_type": "email"}),
            column("city", "text", {"text_type": "word", "smart_generate": True}),
            column("created_at", "date", {"start": "2020-01-01", "end": "2024-12-31"}),
            column("is_premium", "boolean", {"probability": 0.15}),
        ],
        "products": [
            primary_key("products"),
            column("name", "text", {"text_type": "sentence"}),
            foreign_key("category_id"),
            column("price", "float", {"distribution": "uniform", "min": 9.99, "max": 299.99, "decimals": 2}),
            column("stock", "int", {"distribution": "poisson", "lambda": 50}),
        ],
        "orders": [
            primary_key("orders"),
            foreign_key("customer_id"),
            foreign_key("shipping_method_id"),
            column("order_date", "date", {"start": "2023-01-01", "end": "2024-12-31"}),
            column(
                "status",
                "categorical",
                {
                    "choices": ["completed", "pending", "shipped", "cancelled"],
                    "probabilities": [0.6, 0.15, 0.2, 0.05],
                },
            ),
            column("total", "float", {"distribution": "exponential", "scale": 75, "min": 10, "decimals": 2}),
        ],
        "order_items": [
            primary_key("order_items"),
            foreign_key("order_id"),
            foreign_key("product_id"),
            column("quantity", "int", {"distribution": "poisson", "lambda": 2, "min": 1}),
            column("unit_price", "float", {"distribution": "uniform", "min": 5.0, "max": 200.0, "decimals": 2}),
        ],
        "reviews": [
            primary_key("reviews"),
            foreign_key("product_id"),
            foreign_key("customer_id"),
            column(
                "rating",
                "int",
                {
                    "distribution": "categorical",
                    "choices": [1, 2, 3, 4, 5],
                    "probabilities": [0.05, 0.08, 0.15, 0.32, 0.40],
                },
            ),
            column("title", "text", {"text_type": "sentence", "smart_generate": True}),
            column("created_at", "date", {"start": "2023-01-01", "end": "2024-12-31"}),
        ],
    }

    relationships = [
        link("categories", "products", "category_id"),
        link("customers", "orders", "customer_id"),
        link("shipping_methods", "orders", "shipping_method_id"),
        link("orders", "order_items", "order_id"),
        link("products", "order_items", "product_id"),
        link("products", "reviews", "product_id"),
        link("customers", "reviews", "customer_id"),
    ]

    return SchemaConfig(
        name="E-commerce Platform",
        description="Complete e-commerce dataset with products, orders, and reviews",
        seed=42,
        tables=[categories, shipping_methods, *generated],
        columns=columns,
        relationships=relationships,
    )
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _saas_template() -> SchemaConfig:
    """Build the B2B SaaS template.

    One reference table (plans) carries inline rows; companies, users,
    subscriptions and usage_events are generated tables whose unique ``id``
    column spans 1..row_count.
    """
    # Generated tables and their sizes, in declaration order.
    row_counts = {
        "companies": 1000,
        "users": 25000,
        "subscriptions": 1200,
        "usage_events": 500000,
    }

    def column(name, kind, params):
        # Plain (non-unique) column.
        return Column(name=name, type=kind, distribution_params=params)

    def primary_key(table_name):
        # Unique integer id covering 1..row_count of the given table.
        return Column(
            name="id",
            type="int",
            distribution_params={"min": 1, "max": row_counts[table_name]},
            unique=True,
        )

    def foreign_key(name):
        return column(name, "foreign_key", {})

    def link(parent, child, child_key):
        # Every relationship in this template targets the parent's "id".
        return Relationship(
            parent_table=parent,
            child_table=child,
            parent_key="id",
            child_key=child_key,
        )

    plan_rows = [
        ("Free", 0, 1, "Basic"),
        ("Starter", 29, 5, "Core features"),
        ("Professional", 99, 20, "All features"),
        ("Enterprise", 299, 100, "Custom"),
    ]
    plans = Table(
        name="plans",
        is_reference=True,
        inline_data=[
            {"id": row_id, "name": label, "price": price, "seats": seats, "features": features}
            for row_id, (label, price, seats, features) in enumerate(plan_rows, start=1)
        ],
    )
    generated = [Table(name=table, row_count=count) for table, count in row_counts.items()]

    columns = {
        "companies": [
            primary_key("companies"),
            column("name", "text", {"text_type": "company"}),
            column("industry", "text", {"text_type": "word", "smart_generate": True}),
            column("employee_count", "int", {"distribution": "exponential", "scale": 50, "min": 1}),
            column("created_at", "date", {"start": "2020-01-01", "end": "2024-06-30"}),
        ],
        "users": [
            primary_key("users"),
            foreign_key("company_id"),
            column("name", "text", {"text_type": "name"}),
            column("email", "text", {"text_type": "email"}),
            column(
                "role",
                "categorical",
                {"choices": ["admin", "member", "viewer"], "probabilities": [0.1, 0.6, 0.3]},
            ),
            column("is_active", "boolean", {"probability": 0.85}),
            column("last_login", "date", {"start": "2024-01-01", "end": "2024-12-31"}),
        ],
        "subscriptions": [
            primary_key("subscriptions"),
            foreign_key("company_id"),
            foreign_key("plan_id"),
            column(
                "status",
                "categorical",
                {
                    "choices": ["active", "cancelled", "trial", "past_due"],
                    "probabilities": [0.7, 0.1, 0.15, 0.05],
                },
            ),
            column("start_date", "date", {"start": "2022-01-01", "end": "2024-12-31"}),
            column("mrr", "float", {"distribution": "exponential", "scale": 100, "min": 0, "decimals": 2}),
        ],
        "usage_events": [
            primary_key("usage_events"),
            foreign_key("user_id"),
            column(
                "event_type",
                "categorical",
                {
                    "choices": ["page_view", "api_call", "export", "login", "feature_use"],
                    "probabilities": [0.4, 0.3, 0.1, 0.1, 0.1],
                },
            ),
            column("timestamp", "datetime", {"start": "2024-01-01", "end": "2024-12-31"}),
        ],
    }

    relationships = [
        link("companies", "users", "company_id"),
        link("companies", "subscriptions", "company_id"),
        link("plans", "subscriptions", "plan_id"),
        link("users", "usage_events", "user_id"),
    ]

    return SchemaConfig(
        name="SaaS Platform",
        description="B2B SaaS with companies, users, subscriptions, and usage tracking",
        seed=42,
        tables=[plans, *generated],
        columns=columns,
        relationships=relationships,
    )
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _healthcare_template() -> SchemaConfig:
    """Build the healthcare template.

    One reference table (specialties) carries inline rows; patients, doctors,
    appointments and prescriptions are generated tables whose unique ``id``
    column spans 1..row_count.
    """
    # Generated tables and their sizes, in declaration order.
    row_counts = {
        "patients": 10000,
        "doctors": 100,
        "appointments": 50000,
        "prescriptions": 75000,
    }

    def column(name, kind, params):
        # Plain (non-unique) column.
        return Column(name=name, type=kind, distribution_params=params)

    def primary_key(table_name):
        # Unique integer id covering 1..row_count of the given table.
        return Column(
            name="id",
            type="int",
            distribution_params={"min": 1, "max": row_counts[table_name]},
            unique=True,
        )

    def foreign_key(name):
        return column(name, "foreign_key", {})

    def link(parent, child, child_key):
        # Every relationship in this template targets the parent's "id".
        return Relationship(
            parent_table=parent,
            child_table=child,
            parent_key="id",
            child_key=child_key,
        )

    specialty_rows = [
        ("General Practice", 15),
        ("Cardiology", 30),
        ("Dermatology", 20),
        ("Orthopedics", 25),
        ("Pediatrics", 20),
        ("Psychiatry", 45),
        ("Neurology", 30),
    ]
    specialties = Table(
        name="specialties",
        is_reference=True,
        inline_data=[
            {"id": row_id, "name": label, "avg_consult_mins": minutes}
            for row_id, (label, minutes) in enumerate(specialty_rows, start=1)
        ],
    )
    generated = [Table(name=table, row_count=count) for table, count in row_counts.items()]

    columns = {
        "patients": [
            primary_key("patients"),
            column("name", "text", {"text_type": "name"}),
            column("date_of_birth", "date", {"start": "1940-01-01", "end": "2020-12-31"}),
            column(
                "gender",
                "categorical",
                {"choices": ["M", "F", "Other"], "probabilities": [0.48, 0.48, 0.04]},
            ),
            column("phone", "text", {"text_type": "phone"}),
            column("insurance_id", "text", {"text_type": "word"}),
        ],
        "doctors": [
            primary_key("doctors"),
            column("name", "text", {"text_type": "name"}),
            foreign_key("specialty_id"),
            column(
                "years_experience",
                "int",
                {"distribution": "normal", "mean": 15, "std": 8, "min": 1, "max": 40},
            ),
            column("is_accepting_patients", "boolean", {"probability": 0.8}),
        ],
        "appointments": [
            primary_key("appointments"),
            foreign_key("patient_id"),
            foreign_key("doctor_id"),
            column("appointment_date", "datetime", {"start": "2023-01-01", "end": "2024-12-31"}),
            column(
                "duration_mins",
                "int",
                {"distribution": "normal", "mean": 25, "std": 10, "min": 10, "max": 60},
            ),
            column(
                "status",
                "categorical",
                {
                    "choices": ["completed", "scheduled", "cancelled", "no_show"],
                    "probabilities": [0.65, 0.2, 0.1, 0.05],
                },
            ),
            column("notes", "text", {"text_type": "sentence"}),
        ],
        "prescriptions": [
            primary_key("prescriptions"),
            foreign_key("appointment_id"),
            column("medication", "text", {"text_type": "word", "smart_generate": True}),
            column("dosage", "text", {"text_type": "word"}),
            column(
                "duration_days",
                "int",
                {
                    "distribution": "categorical",
                    "choices": [7, 14, 30, 60, 90],
                    "probabilities": [0.3, 0.25, 0.25, 0.1, 0.1],
                },
            ),
        ],
    }

    relationships = [
        link("specialties", "doctors", "specialty_id"),
        link("patients", "appointments", "patient_id"),
        link("doctors", "appointments", "doctor_id"),
        link("appointments", "prescriptions", "appointment_id"),
    ]

    return SchemaConfig(
        name="Healthcare System",
        description="Hospital management with patients, appointments, and prescriptions",
        seed=42,
        tables=[specialties, *generated],
        columns=columns,
        relationships=relationships,
    )
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def _fintech_template() -> SchemaConfig:
    """Build the fintech template.

    Two reference tables (account_types, transaction_types) carry inline rows;
    customers, accounts and transactions are generated tables whose unique
    ``id`` column spans 1..row_count. Transactions include an ``is_fraud``
    boolean label.
    """
    # Generated tables and their sizes, in declaration order.
    row_counts = {
        "customers": 25000,
        "accounts": 35000,
        "transactions": 500000,
    }

    def column(name, kind, params):
        # Plain (non-unique) column.
        return Column(name=name, type=kind, distribution_params=params)

    def primary_key(table_name):
        # Unique integer id covering 1..row_count of the given table.
        return Column(
            name="id",
            type="int",
            distribution_params={"min": 1, "max": row_counts[table_name]},
            unique=True,
        )

    def foreign_key(name):
        return column(name, "foreign_key", {})

    def link(parent, child, child_key):
        # Every relationship in this template targets the parent's "id".
        return Relationship(
            parent_table=parent,
            child_table=child,
            parent_key="id",
            child_key=child_key,
        )

    account_type_rows = [
        ("Checking", 0, 0),
        ("Savings", 100, 0),
        ("Premium", 5000, 15),
        ("Business", 1000, 25),
    ]
    transaction_type_rows = [
        ("deposit", "in"),
        ("withdrawal", "out"),
        ("transfer", "both"),
        ("payment", "out"),
        ("refund", "in"),
    ]
    account_types = Table(
        name="account_types",
        is_reference=True,
        inline_data=[
            {"id": row_id, "name": label, "min_balance": floor, "monthly_fee": fee}
            for row_id, (label, floor, fee) in enumerate(account_type_rows, start=1)
        ],
    )
    transaction_types = Table(
        name="transaction_types",
        is_reference=True,
        inline_data=[
            {"id": row_id, "name": label, "direction": direction}
            for row_id, (label, direction) in enumerate(transaction_type_rows, start=1)
        ],
    )
    generated = [Table(name=table, row_count=count) for table, count in row_counts.items()]

    columns = {
        "customers": [
            primary_key("customers"),
            column("name", "text", {"text_type": "name"}),
            column("email", "text", {"text_type": "email"}),
            column("phone", "text", {"text_type": "phone"}),
            column("created_at", "date", {"start": "2018-01-01", "end": "2024-12-31"}),
            column(
                "risk_score",
                "int",
                {"distribution": "normal", "mean": 30, "std": 20, "min": 0, "max": 100},
            ),
            column("is_verified", "boolean", {"probability": 0.92}),
        ],
        "accounts": [
            primary_key("accounts"),
            foreign_key("customer_id"),
            foreign_key("account_type_id"),
            column("balance", "float", {"distribution": "exponential", "scale": 5000, "min": 0, "decimals": 2}),
            column("opened_date", "date", {"start": "2018-01-01", "end": "2024-12-31"}),
            column("is_active", "boolean", {"probability": 0.88}),
        ],
        "transactions": [
            primary_key("transactions"),
            foreign_key("account_id"),
            foreign_key("transaction_type_id"),
            column("amount", "float", {"distribution": "exponential", "scale": 150, "min": 0.01, "decimals": 2}),
            column("timestamp", "datetime", {"start": "2024-01-01", "end": "2024-12-31"}),
            column("merchant", "text", {"text_type": "company"}),
            column("is_fraud", "boolean", {"probability": 0.012}),  # 1.2% fraud rate
        ],
    }

    relationships = [
        link("customers", "accounts", "customer_id"),
        link("account_types", "accounts", "account_type_id"),
        link("accounts", "transactions", "account_id"),
        link("transaction_types", "transactions", "transaction_type_id"),
    ]

    return SchemaConfig(
        name="Fintech Platform",
        description="Banking/payments platform with accounts, transactions, and fraud labels",
        seed=42,
        tables=[account_types, transaction_types, *generated],
        columns=columns,
        relationships=relationships,
    )
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: misata
|
|
3
|
-
Version: 0.1.0b0
|
|
3
|
+
Version: 0.2.0b0
|
|
4
4
|
Summary: AI-Powered Synthetic Data Engine - Generate realistic multi-table datasets from natural language
|
|
5
5
|
Author-email: Muhammed Rasin <rasinbinabdulla@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -23,6 +23,7 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
23
23
|
Classifier: Topic :: Database
|
|
24
24
|
Requires-Python: >=3.10
|
|
25
25
|
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
26
27
|
Requires-Dist: pandas>=2.0.0
|
|
27
28
|
Requires-Dist: numpy>=1.24.0
|
|
28
29
|
Requires-Dist: pydantic>=2.0.0
|
|
@@ -41,6 +42,7 @@ Requires-Dist: pytest-benchmark>=4.0.0; extra == "dev"
|
|
|
41
42
|
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
42
43
|
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
43
44
|
Requires-Dist: mypy>=1.5.0; extra == "dev"
|
|
45
|
+
Dynamic: license-file
|
|
44
46
|
|
|
45
47
|
# 🧠 Misata
|
|
46
48
|
|
|
@@ -48,7 +50,7 @@ Requires-Dist: mypy>=1.5.0; extra == "dev"
|
|
|
48
50
|
|
|
49
51
|
No schema writing. No training data. Just describe what you need.
|
|
50
52
|
|
|
51
|
-
[]()
|
|
52
54
|
[]()
|
|
53
55
|
[]()
|
|
54
56
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
misata/__init__.py,sha256=
|
|
1
|
+
misata/__init__.py,sha256=rTXPG74KAXwnJD9TchtkbOr3kVPaC6W6CrhTKEeSSyk,1404
|
|
2
2
|
misata/api.py,sha256=Wq2H3iJzocNTsCzb9vhYJxDyag3Yiucvb-GVF0tdKhI,14999
|
|
3
3
|
misata/audit.py,sha256=4eUCHT2STptemfakWeNODbVuBRhyD8Q32LlB2eufvuw,12291
|
|
4
4
|
misata/benchmark.py,sha256=Y1-tuKegJyAlTneROQpPo276qnfmMmupGDbVDs9k5J8,12358
|
|
@@ -10,16 +10,20 @@ misata/feedback.py,sha256=HBEsoKi_vdRqwRzMoVFVj_cjfzQ5SUAaGz40s1HMD50,13313
|
|
|
10
10
|
misata/formulas.py,sha256=KOTq5YN_19vv1ERd92bdzKot9yo9rrrwjOuWO13nFCg,11210
|
|
11
11
|
misata/generators.py,sha256=NrMF12i6CB7K6fUsqcqurmZBBQ382ZhVnYB9oMBIZCE,8844
|
|
12
12
|
misata/hybrid.py,sha256=5oopAdfOLWUYzdRWlc0plVeVEVg7Nu1CVGNNCDSjQt8,13104
|
|
13
|
-
misata/llm_parser.py,sha256=
|
|
13
|
+
misata/llm_parser.py,sha256=2SVozbKtb0kaPaR4ERz9FtIIxK5jQVaYJ8L_xC6gU10,20662
|
|
14
14
|
misata/noise.py,sha256=UO7MokzQ5Y5Vj7JaayDUG0JwCLnpHtnpQTcJ4UHWibo,10460
|
|
15
|
-
misata/
|
|
15
|
+
misata/quality.py,sha256=VSntJfMnF1tVWJ05fvbVJOMcAPEB7QtuEg18k6aEwhA,11685
|
|
16
|
+
misata/schema.py,sha256=zMYDPCgPfcy_STgANiS-Ow3dUETpW3Ayo02G88jmBe0,8954
|
|
16
17
|
misata/semantic.py,sha256=0fauGWJ75wlbHVqT0hohYTN4m_nscdaMaVAIfkhTZXk,7087
|
|
17
|
-
misata/simulator.py,sha256=
|
|
18
|
+
misata/simulator.py,sha256=nq9KxOS-4oUMNu7a2Ten0TQyhT2u_rTo2ImmvdkMRbU,34037
|
|
19
|
+
misata/smart_values.py,sha256=_jVE3kqqSnFqfKchFDwlsuzFPmyJhtgmpJr4O-wpXrA,28274
|
|
18
20
|
misata/story_parser.py,sha256=7N7so3KWisl2UxkOtENQwP-4hN2cs9vTKsPHVRZB2Mc,15964
|
|
19
21
|
misata/validation.py,sha256=5yJSN7jecVNLJ8ss6y7l2U4hF1Ljn27Q6Xs9N1iDPcw,10791
|
|
20
22
|
misata/templates/__init__.py,sha256=0RcZz9d4bmCqLAr77h0gpMfHncqAPeZCguqsuGCz7rE,25245
|
|
21
|
-
misata
|
|
22
|
-
misata-0.
|
|
23
|
-
misata-0.
|
|
24
|
-
misata-0.
|
|
25
|
-
misata-0.
|
|
23
|
+
misata/templates/library.py,sha256=eMex18ZKlzQqIkGFgs1uy9QGs7PmUN_VVL4txKvxynM,20930
|
|
24
|
+
misata-0.2.0b0.dist-info/licenses/LICENSE,sha256=oagkechmfr9iT214N871zCm7TnB0KTfPjAUWxHsYJ4I,1071
|
|
25
|
+
misata-0.2.0b0.dist-info/METADATA,sha256=t5yL_ZD7DNiH7TuCfaF4ZNT-5wYTiIaUSj8puQ4TROw,8114
|
|
26
|
+
misata-0.2.0b0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
27
|
+
misata-0.2.0b0.dist-info/entry_points.txt,sha256=k3SDuju7VnqB4AcY0Vufw-j1tWU3Ay612G3DGqoNs0U,43
|
|
28
|
+
misata-0.2.0b0.dist-info/top_level.txt,sha256=dpwR99XWKUAXqNg7WiNLu_XYd7WYGmZpJzrfQXbAZFs,7
|
|
29
|
+
misata-0.2.0b0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Muhammed Rasin
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|