misata 0.1.0b0-py3-none-any.whl → 0.3.0b0-py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
misata/streaming.py ADDED
@@ -0,0 +1,228 @@
+ """
+ Streaming export utilities for Misata.
+
+ Provides streaming CSV/Parquet export to handle large datasets
+ without loading everything into memory.
+ """
+
+ import csv
+ import os
+ from pathlib import Path
+ from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Union
+
+ import numpy as np
+ import pandas as pd
+
+ from misata.exceptions import ExportError, FileWriteError
+
+
+ class StreamingExporter:
+     """Export data in streaming fashion to handle large datasets.
+
+     Instead of building a full DataFrame and then exporting, this writes
+     batches directly to files as they are generated.
+
+     Example:
+         exporter = StreamingExporter(output_dir="./data")
+
+         for table_name, batch_df in simulator.generate_all():
+             exporter.write_batch(table_name, batch_df)
+
+         exporter.finalize()
+     """
+
+     def __init__(
+         self,
+         output_dir: str,
+         format: str = "csv",
+         progress_callback: Optional[Callable[[str, int], None]] = None,
+     ):
+         """Initialize the exporter.
+
+         Args:
+             output_dir: Directory to write files to
+             format: Export format ('csv' or 'parquet')
+             progress_callback: Optional callback(table_name, rows_written)
+         """
+         self.output_dir = Path(output_dir)
+         self.format = format.lower()
+         self.progress_callback = progress_callback
+
+         self._file_handles: Dict[str, Any] = {}
+         self._csv_writers: Dict[str, csv.writer] = {}
+         self._rows_written: Dict[str, int] = {}
+         self._headers_written: Dict[str, bool] = {}
+
+         # Create output directory
+         self.output_dir.mkdir(parents=True, exist_ok=True)
+
+     def write_batch(self, table_name: str, df: pd.DataFrame) -> int:
+         """Write a batch of data to the appropriate file.
+
+         Args:
+             table_name: Name of the table
+             df: Batch DataFrame to write
+
+         Returns:
+             Number of rows written
+         """
+         if self.format == "csv":
+             return self._write_csv_batch(table_name, df)
+         elif self.format == "parquet":
+             return self._write_parquet_batch(table_name, df)
+         else:
+             raise ExportError(f"Unsupported format: {self.format}")
+
+     def _write_csv_batch(self, table_name: str, df: pd.DataFrame) -> int:
+         """Write a batch to CSV file."""
+         file_path = self.output_dir / f"{table_name}.csv"
+
+         try:
+             # First batch: write header
+             if table_name not in self._headers_written:
+                 with open(file_path, 'w', newline='', encoding='utf-8') as f:
+                     writer = csv.writer(f)
+                     writer.writerow(df.columns.tolist())
+                 self._headers_written[table_name] = True
+                 self._rows_written[table_name] = 0
+
+             # Append data
+             with open(file_path, 'a', newline='', encoding='utf-8') as f:
+                 writer = csv.writer(f)
+                 for _, row in df.iterrows():
+                     writer.writerow(row.tolist())
+
+             rows = len(df)
+             self._rows_written[table_name] = self._rows_written.get(table_name, 0) + rows
+
+             if self.progress_callback:
+                 self.progress_callback(table_name, self._rows_written[table_name])
+
+             return rows
+
+         except Exception as e:
+             raise FileWriteError(f"Failed to write CSV: {e}", path=str(file_path))
+
+     def _write_parquet_batch(self, table_name: str, df: pd.DataFrame) -> int:
+         """Write a batch to Parquet file using append mode."""
+         try:
+             import pyarrow as pa
+             import pyarrow.parquet as pq
+         except ImportError:
+             raise ExportError(
+                 "PyArrow required for Parquet export",
+                 details={"suggestion": "pip install pyarrow"}
+             )
+
+         file_path = self.output_dir / f"{table_name}.parquet"
+
+         try:
+             table = pa.Table.from_pandas(df)
+
+             if file_path.exists():
+                 # Append to existing file
+                 existing = pq.read_table(file_path)
+                 combined = pa.concat_tables([existing, table])
+                 pq.write_table(combined, file_path)
+             else:
+                 pq.write_table(table, file_path)
+
+             rows = len(df)
+             self._rows_written[table_name] = self._rows_written.get(table_name, 0) + rows
+
+             if self.progress_callback:
+                 self.progress_callback(table_name, self._rows_written[table_name])
+
+             return rows
+
+         except Exception as e:
+             raise FileWriteError(f"Failed to write Parquet: {e}", path=str(file_path))
+
+     def finalize(self) -> Dict[str, int]:
+         """Finalize all exports and return summary.
+
+         Returns:
+             Dict mapping table names to row counts
+         """
+         # Close any open file handles
+         for handle in self._file_handles.values():
+             try:
+                 handle.close()
+             except Exception:
+                 pass
+
+         self._file_handles.clear()
+         self._csv_writers.clear()
+
+         return self._rows_written.copy()
+
+     def get_file_paths(self) -> Dict[str, Path]:
+         """Get paths to all exported files."""
+         ext = ".csv" if self.format == "csv" else ".parquet"
+         return {
+             table: self.output_dir / f"{table}{ext}"
+             for table in self._rows_written.keys()
+         }
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Get export statistics."""
+         return {
+             "output_dir": str(self.output_dir),
+             "format": self.format,
+             "tables": self._rows_written.copy(),
+             "total_rows": sum(self._rows_written.values()),
+         }
+
+
+ def stream_to_csv(
+     generator: Generator[tuple, None, None],
+     output_dir: str,
+     progress_callback: Optional[Callable[[str, int], None]] = None,
+ ) -> Dict[str, int]:
+     """Stream data from a generator directly to CSV files.
+
+     Args:
+         generator: Iterator yielding (table_name, batch_df) tuples
+         output_dir: Directory to write files to
+         progress_callback: Optional callback(table_name, rows_written)
+
+     Returns:
+         Dict mapping table names to final row counts
+     """
+     exporter = StreamingExporter(
+         output_dir=output_dir,
+         format="csv",
+         progress_callback=progress_callback,
+     )
+
+     for table_name, batch_df in generator:
+         exporter.write_batch(table_name, batch_df)
+
+     return exporter.finalize()
+
+
+ def stream_to_parquet(
+     generator: Generator[tuple, None, None],
+     output_dir: str,
+     progress_callback: Optional[Callable[[str, int], None]] = None,
+ ) -> Dict[str, int]:
+     """Stream data from a generator directly to Parquet files.
+
+     Args:
+         generator: Iterator yielding (table_name, batch_df) tuples
+         output_dir: Directory to write files to
+         progress_callback: Optional callback(table_name, rows_written)
+
+     Returns:
+         Dict mapping table names to final row counts
+     """
+     exporter = StreamingExporter(
+         output_dir=output_dir,
+         format="parquet",
+         progress_callback=progress_callback,
+     )
+
+     for table_name, batch_df in generator:
+         exporter.write_batch(table_name, batch_df)
+
+     return exporter.finalize()
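
For orientation, here is a minimal sketch of how the new streaming API is meant to be driven, based on the docstrings above. The fake_batches generator and its table name are illustrative stand-ins for simulator.generate_all(), not part of the package, and Parquet output assumes pyarrow is installed:

    import pandas as pd

    from misata.streaming import StreamingExporter, stream_to_parquet

    def fake_batches():
        # Illustrative stand-in for simulator.generate_all(): yields (table_name, DataFrame) pairs.
        for i in range(3):
            yield "orders", pd.DataFrame({"id": range(i * 100, (i + 1) * 100), "total": 9.99})

    # Manual control: one exporter, one file per table, batches appended as they arrive.
    exporter = StreamingExporter(
        output_dir="./data",
        format="csv",
        progress_callback=lambda table, rows: print(f"{table}: {rows} rows so far"),
    )
    for table_name, batch_df in fake_batches():
        exporter.write_batch(table_name, batch_df)
    print(exporter.finalize())   # {'orders': 300}
    print(exporter.get_stats())  # output_dir, format, per-table and total row counts

    # Or the one-call convenience wrapper (Parquet requires pyarrow):
    row_counts = stream_to_parquet(fake_batches(), output_dir="./data_parquet")
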
misata/templates/library.py ADDED
@@ -0,0 +1,344 @@
+ """
+ Pre-built schema templates for common use cases.
+
+ Usage:
+     from misata.templates.library import load_template, list_templates
+
+     # See available templates
+     print(list_templates())
+
+     # Load a template
+     config = load_template("ecommerce")
+
+     # Generate data
+     from misata import DataSimulator
+     for table, batch in DataSimulator(config).generate_all():
+         print(f"Generated {len(batch)} rows for {table}")
+ """
+
+ from misata.schema import Column, Relationship, SchemaConfig, Table
+
+
+ def list_templates() -> list:
+     """List all available built-in templates."""
+     return ["ecommerce", "saas", "healthcare", "fintech"]
+
+
+ def load_template(name: str, row_multiplier: float = 1.0) -> SchemaConfig:
+     """
+     Load a pre-built schema template.
+
+     Args:
+         name: Template name (ecommerce, saas, healthcare, fintech)
+         row_multiplier: Scale row counts (e.g., 0.1 for 10%, 2.0 for 2x)
+
+     Returns:
+         SchemaConfig ready for DataSimulator
+     """
+     templates = {
+         "ecommerce": _ecommerce_template,
+         "saas": _saas_template,
+         "healthcare": _healthcare_template,
+         "fintech": _fintech_template,
+     }
+
+     if name not in templates:
+         raise ValueError(f"Unknown template: {name}. Available: {list(templates.keys())}")
+
+     config = templates[name]()
+
+     # Apply row multiplier
+     if row_multiplier != 1.0:
+         for table in config.tables:
+             if not table.is_reference:
+                 table.row_count = int(table.row_count * row_multiplier)
+
+     return config
+
+
+ def _ecommerce_template() -> SchemaConfig:
+     """E-commerce platform with products, orders, reviews."""
+     return SchemaConfig(
+         name="E-commerce Platform",
+         description="Complete e-commerce dataset with products, orders, and reviews",
+         seed=42,
+         tables=[
+             # Reference tables
+             Table(
+                 name="categories",
+                 is_reference=True,
+                 inline_data=[
+                     {"id": 1, "name": "Electronics", "margin_pct": 15},
+                     {"id": 2, "name": "Clothing", "margin_pct": 40},
+                     {"id": 3, "name": "Home & Garden", "margin_pct": 25},
+                     {"id": 4, "name": "Sports", "margin_pct": 30},
+                     {"id": 5, "name": "Books", "margin_pct": 35},
+                     {"id": 6, "name": "Beauty", "margin_pct": 50},
+                 ],
+             ),
+             Table(
+                 name="shipping_methods",
+                 is_reference=True,
+                 inline_data=[
+                     {"id": 1, "name": "Standard", "days": 5, "cost": 4.99},
+                     {"id": 2, "name": "Express", "days": 2, "cost": 9.99},
+                     {"id": 3, "name": "Next Day", "days": 1, "cost": 19.99},
+                     {"id": 4, "name": "Free Shipping", "days": 7, "cost": 0.00},
+                 ],
+             ),
+             # Transactional tables
+             Table(name="customers", row_count=10000),
+             Table(name="products", row_count=500),
+             Table(name="orders", row_count=50000),
+             Table(name="order_items", row_count=150000),
+             Table(name="reviews", row_count=20000),
+         ],
+         columns={
+             "customers": [
+                 Column(name="id", type="int", distribution_params={"min": 1, "max": 10000}, unique=True),
+                 Column(name="name", type="text", distribution_params={"text_type": "name"}),
+                 Column(name="email", type="text", distribution_params={"text_type": "email"}),
+                 Column(name="city", type="text", distribution_params={"text_type": "word", "smart_generate": True}),
+                 Column(name="created_at", type="date", distribution_params={"start": "2020-01-01", "end": "2024-12-31"}),
+                 Column(name="is_premium", type="boolean", distribution_params={"probability": 0.15}),
+             ],
+             "products": [
+                 Column(name="id", type="int", distribution_params={"min": 1, "max": 500}, unique=True),
+                 Column(name="name", type="text", distribution_params={"text_type": "sentence"}),
+                 Column(name="category_id", type="foreign_key", distribution_params={}),
+                 Column(name="price", type="float", distribution_params={"distribution": "uniform", "min": 9.99, "max": 299.99, "decimals": 2}),
+                 Column(name="stock", type="int", distribution_params={"distribution": "poisson", "lambda": 50}),
+             ],
+             "orders": [
+                 Column(name="id", type="int", distribution_params={"min": 1, "max": 50000}, unique=True),
+                 Column(name="customer_id", type="foreign_key", distribution_params={}),
+                 Column(name="shipping_method_id", type="foreign_key", distribution_params={}),
+                 Column(name="order_date", type="date", distribution_params={"start": "2023-01-01", "end": "2024-12-31"}),
+                 Column(name="status", type="categorical", distribution_params={"choices": ["completed", "pending", "shipped", "cancelled"], "probabilities": [0.6, 0.15, 0.2, 0.05]}),
+                 Column(name="total", type="float", distribution_params={"distribution": "exponential", "scale": 75, "min": 10, "decimals": 2}),
+             ],
+             "order_items": [
+                 Column(name="id", type="int", distribution_params={"min": 1, "max": 150000}, unique=True),
+                 Column(name="order_id", type="foreign_key", distribution_params={}),
+                 Column(name="product_id", type="foreign_key", distribution_params={}),
+                 Column(name="quantity", type="int", distribution_params={"distribution": "poisson", "lambda": 2, "min": 1}),
+                 Column(name="unit_price", type="float", distribution_params={"distribution": "uniform", "min": 5.0, "max": 200.0, "decimals": 2}),
+             ],
+             "reviews": [
+                 Column(name="id", type="int", distribution_params={"min": 1, "max": 20000}, unique=True),
+                 Column(name="product_id", type="foreign_key", distribution_params={}),
+                 Column(name="customer_id", type="foreign_key", distribution_params={}),
+                 Column(name="rating", type="int", distribution_params={"distribution": "categorical", "choices": [1, 2, 3, 4, 5], "probabilities": [0.05, 0.08, 0.15, 0.32, 0.40]}),
+                 Column(name="title", type="text", distribution_params={"text_type": "sentence", "smart_generate": True}),
+                 Column(name="created_at", type="date", distribution_params={"start": "2023-01-01", "end": "2024-12-31"}),
+             ],
+         },
+         relationships=[
+             Relationship(parent_table="categories", child_table="products", parent_key="id", child_key="category_id"),
+             Relationship(parent_table="customers", child_table="orders", parent_key="id", child_key="customer_id"),
+             Relationship(parent_table="shipping_methods", child_table="orders", parent_key="id", child_key="shipping_method_id"),
+             Relationship(parent_table="orders", child_table="order_items", parent_key="id", child_key="order_id"),
+             Relationship(parent_table="products", child_table="order_items", parent_key="id", child_key="product_id"),
+             Relationship(parent_table="products", child_table="reviews", parent_key="id", child_key="product_id"),
+             Relationship(parent_table="customers", child_table="reviews", parent_key="id", child_key="customer_id"),
+         ],
+     )
+
+
+ def _saas_template() -> SchemaConfig:
+     """SaaS platform with users, subscriptions, and usage events."""
+     return SchemaConfig(
+         name="SaaS Platform",
+         description="B2B SaaS with companies, users, subscriptions, and usage tracking",
+         seed=42,
+         tables=[
+             Table(
+                 name="plans",
+                 is_reference=True,
+                 inline_data=[
+                     {"id": 1, "name": "Free", "price": 0, "seats": 1, "features": "Basic"},
+                     {"id": 2, "name": "Starter", "price": 29, "seats": 5, "features": "Core features"},
+                     {"id": 3, "name": "Professional", "price": 99, "seats": 20, "features": "All features"},
+                     {"id": 4, "name": "Enterprise", "price": 299, "seats": 100, "features": "Custom"},
+                 ],
+             ),
+             Table(name="companies", row_count=1000),
+             Table(name="users", row_count=25000),
+             Table(name="subscriptions", row_count=1200),
+             Table(name="usage_events", row_count=500000),
+         ],
+         columns={
+             "companies": [
+                 Column(name="id", type="int", distribution_params={"min": 1, "max": 1000}, unique=True),
+                 Column(name="name", type="text", distribution_params={"text_type": "company"}),
+                 Column(name="industry", type="text", distribution_params={"text_type": "word", "smart_generate": True}),
+                 Column(name="employee_count", type="int", distribution_params={"distribution": "exponential", "scale": 50, "min": 1}),
+                 Column(name="created_at", type="date", distribution_params={"start": "2020-01-01", "end": "2024-06-30"}),
+             ],
+             "users": [
+                 Column(name="id", type="int", distribution_params={"min": 1, "max": 25000}, unique=True),
+                 Column(name="company_id", type="foreign_key", distribution_params={}),
+                 Column(name="name", type="text", distribution_params={"text_type": "name"}),
+                 Column(name="email", type="text", distribution_params={"text_type": "email"}),
+                 Column(name="role", type="categorical", distribution_params={"choices": ["admin", "member", "viewer"], "probabilities": [0.1, 0.6, 0.3]}),
+                 Column(name="is_active", type="boolean", distribution_params={"probability": 0.85}),
+                 Column(name="last_login", type="date", distribution_params={"start": "2024-01-01", "end": "2024-12-31"}),
+             ],
+             "subscriptions": [
+                 Column(name="id", type="int", distribution_params={"min": 1, "max": 1200}, unique=True),
+                 Column(name="company_id", type="foreign_key", distribution_params={}),
+                 Column(name="plan_id", type="foreign_key", distribution_params={}),
+                 Column(name="status", type="categorical", distribution_params={"choices": ["active", "cancelled", "trial", "past_due"], "probabilities": [0.7, 0.1, 0.15, 0.05]}),
+                 Column(name="start_date", type="date", distribution_params={"start": "2022-01-01", "end": "2024-12-31"}),
+                 Column(name="mrr", type="float", distribution_params={"distribution": "exponential", "scale": 100, "min": 0, "decimals": 2}),
+             ],
+             "usage_events": [
+                 Column(name="id", type="int", distribution_params={"min": 1, "max": 500000}, unique=True),
+                 Column(name="user_id", type="foreign_key", distribution_params={}),
+                 Column(name="event_type", type="categorical", distribution_params={"choices": ["page_view", "api_call", "export", "login", "feature_use"], "probabilities": [0.4, 0.3, 0.1, 0.1, 0.1]}),
+                 Column(name="timestamp", type="datetime", distribution_params={"start": "2024-01-01", "end": "2024-12-31"}),
+             ],
+         },
+         relationships=[
+             Relationship(parent_table="companies", child_table="users", parent_key="id", child_key="company_id"),
+             Relationship(parent_table="companies", child_table="subscriptions", parent_key="id", child_key="company_id"),
+             Relationship(parent_table="plans", child_table="subscriptions", parent_key="id", child_key="plan_id"),
+             Relationship(parent_table="users", child_table="usage_events", parent_key="id", child_key="user_id"),
+         ],
+     )
+
+
+ def _healthcare_template() -> SchemaConfig:
+     """Healthcare system with patients, doctors, appointments, prescriptions."""
+     return SchemaConfig(
+         name="Healthcare System",
+         description="Hospital management with patients, appointments, and prescriptions",
+         seed=42,
+         tables=[
+             Table(
+                 name="specialties",
+                 is_reference=True,
+                 inline_data=[
+                     {"id": 1, "name": "General Practice", "avg_consult_mins": 15},
+                     {"id": 2, "name": "Cardiology", "avg_consult_mins": 30},
+                     {"id": 3, "name": "Dermatology", "avg_consult_mins": 20},
+                     {"id": 4, "name": "Orthopedics", "avg_consult_mins": 25},
+                     {"id": 5, "name": "Pediatrics", "avg_consult_mins": 20},
+                     {"id": 6, "name": "Psychiatry", "avg_consult_mins": 45},
+                     {"id": 7, "name": "Neurology", "avg_consult_mins": 30},
+                 ],
+             ),
+             Table(name="patients", row_count=10000),
+             Table(name="doctors", row_count=100),
+             Table(name="appointments", row_count=50000),
+             Table(name="prescriptions", row_count=75000),
+         ],
+         columns={
+             "patients": [
+                 Column(name="id", type="int", distribution_params={"min": 1, "max": 10000}, unique=True),
+                 Column(name="name", type="text", distribution_params={"text_type": "name"}),
+                 Column(name="date_of_birth", type="date", distribution_params={"start": "1940-01-01", "end": "2020-12-31"}),
+                 Column(name="gender", type="categorical", distribution_params={"choices": ["M", "F", "Other"], "probabilities": [0.48, 0.48, 0.04]}),
+                 Column(name="phone", type="text", distribution_params={"text_type": "phone"}),
+                 Column(name="insurance_id", type="text", distribution_params={"text_type": "word"}),
+             ],
+             "doctors": [
+                 Column(name="id", type="int", distribution_params={"min": 1, "max": 100}, unique=True),
+                 Column(name="name", type="text", distribution_params={"text_type": "name"}),
+                 Column(name="specialty_id", type="foreign_key", distribution_params={}),
+                 Column(name="years_experience", type="int", distribution_params={"distribution": "normal", "mean": 15, "std": 8, "min": 1, "max": 40}),
+                 Column(name="is_accepting_patients", type="boolean", distribution_params={"probability": 0.8}),
+             ],
+             "appointments": [
+                 Column(name="id", type="int", distribution_params={"min": 1, "max": 50000}, unique=True),
+                 Column(name="patient_id", type="foreign_key", distribution_params={}),
+                 Column(name="doctor_id", type="foreign_key", distribution_params={}),
+                 Column(name="appointment_date", type="datetime", distribution_params={"start": "2023-01-01", "end": "2024-12-31"}),
+                 Column(name="duration_mins", type="int", distribution_params={"distribution": "normal", "mean": 25, "std": 10, "min": 10, "max": 60}),
+                 Column(name="status", type="categorical", distribution_params={"choices": ["completed", "scheduled", "cancelled", "no_show"], "probabilities": [0.65, 0.2, 0.1, 0.05]}),
+                 Column(name="notes", type="text", distribution_params={"text_type": "sentence"}),
+             ],
+             "prescriptions": [
+                 Column(name="id", type="int", distribution_params={"min": 1, "max": 75000}, unique=True),
+                 Column(name="appointment_id", type="foreign_key", distribution_params={}),
+                 Column(name="medication", type="text", distribution_params={"text_type": "word", "smart_generate": True}),
+                 Column(name="dosage", type="text", distribution_params={"text_type": "word"}),
+                 Column(name="duration_days", type="int", distribution_params={"distribution": "categorical", "choices": [7, 14, 30, 60, 90], "probabilities": [0.3, 0.25, 0.25, 0.1, 0.1]}),
+             ],
+         },
+         relationships=[
+             Relationship(parent_table="specialties", child_table="doctors", parent_key="id", child_key="specialty_id"),
+             Relationship(parent_table="patients", child_table="appointments", parent_key="id", child_key="patient_id"),
+             Relationship(parent_table="doctors", child_table="appointments", parent_key="id", child_key="doctor_id"),
+             Relationship(parent_table="appointments", child_table="prescriptions", parent_key="id", child_key="appointment_id"),
+         ],
+     )
+
+
+ def _fintech_template() -> SchemaConfig:
+     """Fintech platform with accounts, transactions, and fraud detection."""
+     return SchemaConfig(
+         name="Fintech Platform",
+         description="Banking/payments platform with accounts, transactions, and fraud labels",
+         seed=42,
+         tables=[
+             Table(
+                 name="account_types",
+                 is_reference=True,
+                 inline_data=[
+                     {"id": 1, "name": "Checking", "min_balance": 0, "monthly_fee": 0},
+                     {"id": 2, "name": "Savings", "min_balance": 100, "monthly_fee": 0},
+                     {"id": 3, "name": "Premium", "min_balance": 5000, "monthly_fee": 15},
+                     {"id": 4, "name": "Business", "min_balance": 1000, "monthly_fee": 25},
+                 ],
+             ),
+             Table(
+                 name="transaction_types",
+                 is_reference=True,
+                 inline_data=[
+                     {"id": 1, "name": "deposit", "direction": "in"},
+                     {"id": 2, "name": "withdrawal", "direction": "out"},
+                     {"id": 3, "name": "transfer", "direction": "both"},
+                     {"id": 4, "name": "payment", "direction": "out"},
+                     {"id": 5, "name": "refund", "direction": "in"},
+                 ],
+             ),
+             Table(name="customers", row_count=25000),
+             Table(name="accounts", row_count=35000),
+             Table(name="transactions", row_count=500000),
+         ],
+         columns={
+             "customers": [
+                 Column(name="id", type="int", distribution_params={"min": 1, "max": 25000}, unique=True),
+                 Column(name="name", type="text", distribution_params={"text_type": "name"}),
+                 Column(name="email", type="text", distribution_params={"text_type": "email"}),
+                 Column(name="phone", type="text", distribution_params={"text_type": "phone"}),
+                 Column(name="created_at", type="date", distribution_params={"start": "2018-01-01", "end": "2024-12-31"}),
+                 Column(name="risk_score", type="int", distribution_params={"distribution": "normal", "mean": 30, "std": 20, "min": 0, "max": 100}),
+                 Column(name="is_verified", type="boolean", distribution_params={"probability": 0.92}),
+             ],
+             "accounts": [
+                 Column(name="id", type="int", distribution_params={"min": 1, "max": 35000}, unique=True),
+                 Column(name="customer_id", type="foreign_key", distribution_params={}),
+                 Column(name="account_type_id", type="foreign_key", distribution_params={}),
+                 Column(name="balance", type="float", distribution_params={"distribution": "exponential", "scale": 5000, "min": 0, "decimals": 2}),
+                 Column(name="opened_date", type="date", distribution_params={"start": "2018-01-01", "end": "2024-12-31"}),
+                 Column(name="is_active", type="boolean", distribution_params={"probability": 0.88}),
+             ],
+             "transactions": [
+                 Column(name="id", type="int", distribution_params={"min": 1, "max": 500000}, unique=True),
+                 Column(name="account_id", type="foreign_key", distribution_params={}),
+                 Column(name="transaction_type_id", type="foreign_key", distribution_params={}),
+                 Column(name="amount", type="float", distribution_params={"distribution": "exponential", "scale": 150, "min": 0.01, "decimals": 2}),
+                 Column(name="timestamp", type="datetime", distribution_params={"start": "2024-01-01", "end": "2024-12-31"}),
+                 Column(name="merchant", type="text", distribution_params={"text_type": "company"}),
+                 Column(name="is_fraud", type="boolean", distribution_params={"probability": 0.012}), # 1.2% fraud rate
+             ],
+         },
+         relationships=[
+             Relationship(parent_table="customers", child_table="accounts", parent_key="id", child_key="customer_id"),
+             Relationship(parent_table="account_types", child_table="accounts", parent_key="id", child_key="account_type_id"),
+             Relationship(parent_table="accounts", child_table="transactions", parent_key="id", child_key="account_id"),
+             Relationship(parent_table="transaction_types", child_table="transactions", parent_key="id", child_key="transaction_type_id"),
+         ],
+     )
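
Combined with the streaming module above, the template library gives a short end-to-end flow. The following is a sketch under the assumptions stated in the module docstring (notably that DataSimulator(config).generate_all() yields (table, batch) pairs); the output directory and the printed counts are illustrative:

    from misata import DataSimulator
    from misata.streaming import stream_to_csv
    from misata.templates.library import list_templates, load_template

    print(list_templates())  # ['ecommerce', 'saas', 'healthcare', 'fintech']

    # Load the e-commerce schema at 10% scale; reference tables (categories,
    # shipping_methods) keep their inline rows, only generated tables shrink.
    config = load_template("ecommerce", row_multiplier=0.1)

    # Stream every generated batch straight to ./demo_data/<table>.csv.
    row_counts = stream_to_csv(
        DataSimulator(config).generate_all(),
        output_dir="./demo_data",
    )
    print(row_counts)  # e.g. {'customers': 1000, 'orders': 5000, ...}
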
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: misata
- Version: 0.1.0b0
+ Version: 0.3.0b0
  Summary: AI-Powered Synthetic Data Engine - Generate realistic multi-table datasets from natural language
  Author-email: Muhammed Rasin <rasinbinabdulla@gmail.com>
  License: MIT
@@ -23,6 +23,7 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Classifier: Topic :: Database
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
+ License-File: LICENSE
  Requires-Dist: pandas>=2.0.0
  Requires-Dist: numpy>=1.24.0
  Requires-Dist: pydantic>=2.0.0
@@ -41,6 +42,7 @@ Requires-Dist: pytest-benchmark>=4.0.0; extra == "dev"
  Requires-Dist: black>=23.0.0; extra == "dev"
  Requires-Dist: ruff>=0.1.0; extra == "dev"
  Requires-Dist: mypy>=1.5.0; extra == "dev"
+ Dynamic: license-file
 
  # 🧠 Misata
 
@@ -48,7 +50,7 @@ Requires-Dist: mypy>=1.5.0; extra == "dev"
 
  No schema writing. No training data. Just describe what you need.
 
- [![Version](https://img.shields.io/badge/version-0.1.0--beta-purple.svg)]()
+ [![Version](https://img.shields.io/badge/version-0.2.0--beta-purple.svg)]()
  [![License](https://img.shields.io/badge/license-MIT-blue.svg)]()
  [![Python](https://img.shields.io/badge/python-3.10+-green.svg)]()
 
@@ -0,0 +1,37 @@
+ misata/__init__.py,sha256=Vra5zMkd5Y6HTzhGRc76jTv10Z0yuhw33MDUoLpACrE,3144
+ misata/api.py,sha256=Wq2H3iJzocNTsCzb9vhYJxDyag3Yiucvb-GVF0tdKhI,14999
+ misata/audit.py,sha256=4eUCHT2STptemfakWeNODbVuBRhyD8Q32LlB2eufvuw,12291
+ misata/benchmark.py,sha256=Y1-tuKegJyAlTneROQpPo276qnfmMmupGDbVDs9k5J8,12358
+ misata/cache.py,sha256=fuLk7cQ7hOEmlqEWmm-O516L26btZ6zFO8FdrqFCRLg,7087
+ misata/cli.py,sha256=a7YijZCUYrkCYGVYJ2nZSL9J3JfFqbXQQOad6bhy7zM,22642
+ misata/codegen.py,sha256=m7ykTtLgITvaqzVB1cVhs1b9Puo2X4uyzngZ85wi6J0,5791
+ misata/constraints.py,sha256=8jUKlA2VyVomDnl2zz0RDkUqxEkxlwUnBbHTXr_SA5g,9937
+ misata/context.py,sha256=tjYrU67wjII07Pl3MKV_uCMl_s55DIOQZCouxAryyzE,8509
+ misata/curve_fitting.py,sha256=gLj4BkIxNWKkfo3QKZFI_aq60bsXlI53K5yZX4hc9EU,4126
+ misata/customization.py,sha256=pw-BEsPKN091hyOrQWWQoRhTrlmQ9_PXXopm2FZSEvs,8551
+ misata/exceptions.py,sha256=C3IGMk8xAy9AmRVWeSAnLHHui7drv6rzgzvOmr6gh50,8335
+ misata/feedback.py,sha256=HBEsoKi_vdRqwRzMoVFVj_cjfzQ5SUAaGz40s1HMD50,13313
+ misata/formulas.py,sha256=KOTq5YN_19vv1ERd92bdzKot9yo9rrrwjOuWO13nFCg,11210
+ misata/generators.py,sha256=NrMF12i6CB7K6fUsqcqurmZBBQ382ZhVnYB9oMBIZCE,8844
+ misata/hybrid.py,sha256=5oopAdfOLWUYzdRWlc0plVeVEVg7Nu1CVGNNCDSjQt8,13104
+ misata/llm_parser.py,sha256=2SVozbKtb0kaPaR4ERz9FtIIxK5jQVaYJ8L_xC6gU10,20662
+ misata/noise.py,sha256=UO7MokzQ5Y5Vj7JaayDUG0JwCLnpHtnpQTcJ4UHWibo,10460
+ misata/profiles.py,sha256=0djys8wWvH8VP74KmGn6cGLuOb64h9Hk0g0bkXOfxP4,9578
+ misata/quality.py,sha256=VSntJfMnF1tVWJ05fvbVJOMcAPEB7QtuEg18k6aEwhA,11685
+ misata/schema.py,sha256=zMYDPCgPfcy_STgANiS-Ow3dUETpW3Ayo02G88jmBe0,8954
+ misata/semantic.py,sha256=0fauGWJ75wlbHVqT0hohYTN4m_nscdaMaVAIfkhTZXk,7087
+ misata/simulator.py,sha256=nq9KxOS-4oUMNu7a2Ten0TQyhT2u_rTo2ImmvdkMRbU,34037
+ misata/smart_values.py,sha256=8-TYBK5cVBst9tfGuQXXetOLSqgns_NKnIl14rpVrbk,35870
+ misata/story_parser.py,sha256=7N7so3KWisl2UxkOtENQwP-4hN2cs9vTKsPHVRZB2Mc,15964
+ misata/streaming.py,sha256=qbEnoFRfn9a7H_gWlq5C3TwbNUnP5U98OPo1EdU_cQ0,7578
+ misata/validation.py,sha256=5yJSN7jecVNLJ8ss6y7l2U4hF1Ljn27Q6Xs9N1iDPcw,10791
+ misata/generators/__init__.py,sha256=V4I_1IucuywRJZH3cLxKvBd2Ib7kE0WIJ7tq8y4lkx8,568
+ misata/generators/base.py,sha256=iON9iAONMEQdbq2Fdric3V3bWn3caD1ITC16DTCK0Og,21329
+ misata/templates/__init__.py,sha256=0RcZz9d4bmCqLAr77h0gpMfHncqAPeZCguqsuGCz7rE,25245
+ misata/templates/library.py,sha256=eMex18ZKlzQqIkGFgs1uy9QGs7PmUN_VVL4txKvxynM,20930
+ misata-0.3.0b0.dist-info/licenses/LICENSE,sha256=oagkechmfr9iT214N871zCm7TnB0KTfPjAUWxHsYJ4I,1071
+ misata-0.3.0b0.dist-info/METADATA,sha256=Wxpa2V0Sum-CFOpNnmRd27eEDfyT9CKIy-4nGZnrCys,8114
+ misata-0.3.0b0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ misata-0.3.0b0.dist-info/entry_points.txt,sha256=k3SDuju7VnqB4AcY0Vufw-j1tWU3Ay612G3DGqoNs0U,43
+ misata-0.3.0b0.dist-info/top_level.txt,sha256=dpwR99XWKUAXqNg7WiNLu_XYd7WYGmZpJzrfQXbAZFs,7
+ misata-0.3.0b0.dist-info/RECORD,,