misata 0.1.0b0__py3-none-any.whl → 0.3.0b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +89 -3
- misata/cache.py +258 -0
- misata/constraints.py +307 -0
- misata/context.py +259 -0
- misata/exceptions.py +277 -0
- misata/generators/__init__.py +29 -0
- misata/generators/base.py +586 -0
- misata/llm_parser.py +41 -2
- misata/profiles.py +332 -0
- misata/quality.py +329 -0
- misata/schema.py +8 -3
- misata/simulator.py +81 -5
- misata/smart_values.py +762 -0
- misata/streaming.py +228 -0
- misata/templates/library.py +344 -0
- {misata-0.1.0b0.dist-info → misata-0.3.0b0.dist-info}/METADATA +4 -2
- misata-0.3.0b0.dist-info/RECORD +37 -0
- misata-0.3.0b0.dist-info/licenses/LICENSE +21 -0
- misata-0.1.0b0.dist-info/RECORD +0 -25
- {misata-0.1.0b0.dist-info → misata-0.3.0b0.dist-info}/WHEEL +0 -0
- {misata-0.1.0b0.dist-info → misata-0.3.0b0.dist-info}/entry_points.txt +0 -0
- {misata-0.1.0b0.dist-info → misata-0.3.0b0.dist-info}/top_level.txt +0 -0
misata/streaming.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Streaming export utilities for Misata.
|
|
3
|
+
|
|
4
|
+
Provides streaming CSV/Parquet export to handle large datasets
|
|
5
|
+
without loading everything into memory.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import csv
|
|
9
|
+
import os
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Union
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
import pandas as pd
|
|
15
|
+
|
|
16
|
+
from misata.exceptions import ExportError, FileWriteError
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class StreamingExporter:
    """Export data in streaming fashion to handle large datasets.

    Instead of building a full DataFrame and then exporting, each batch is
    written to ``<output_dir>/<table_name>.<ext>`` as soon as it is handed to
    :meth:`write_batch`.

    The first batch written for a table starts a fresh file; later batches
    for the same table are appended to it.

    Example:
        exporter = StreamingExporter(output_dir="./data")

        for table_name, batch_df in simulator.generate_all():
            exporter.write_batch(table_name, batch_df)

        exporter.finalize()
    """

    def __init__(
        self,
        output_dir: str,
        format: str = "csv",
        progress_callback: Optional[Callable[[str, int], None]] = None,
    ):
        """Initialize the exporter.

        Args:
            output_dir: Directory to write files to (created if missing)
            format: Export format ('csv' or 'parquet'); compared
                case-insensitively. An unsupported value is only reported
                when :meth:`write_batch` is called.
            progress_callback: Optional callback(table_name, rows_written),
                invoked after every batch with the running row total.
        """
        self.output_dir = Path(output_dir)
        self.format = format.lower()
        self.progress_callback = progress_callback

        # Nothing in this class registers handles or writers; the two dicts
        # are kept so finalize() has something to close if that changes.
        self._file_handles: Dict[str, Any] = {}
        self._csv_writers: Dict[str, csv.writer] = {}
        # Running row totals per table; also records which tables this
        # exporter instance has already written at least once.
        self._rows_written: Dict[str, int] = {}
        self._headers_written: Dict[str, bool] = {}

        # Create output directory
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def write_batch(self, table_name: str, df: pd.DataFrame) -> int:
        """Write a batch of data to the appropriate file.

        Args:
            table_name: Name of the table
            df: Batch DataFrame to write

        Returns:
            Number of rows written

        Raises:
            ExportError: If the configured format is not 'csv' or 'parquet',
                or PyArrow is missing for a Parquet export.
            FileWriteError: If writing the batch fails.
        """
        if self.format == "csv":
            return self._write_csv_batch(table_name, df)
        elif self.format == "parquet":
            return self._write_parquet_batch(table_name, df)
        else:
            raise ExportError(f"Unsupported format: {self.format}")

    def _write_csv_batch(self, table_name: str, df: pd.DataFrame) -> int:
        """Write a batch to CSV file.

        The first batch for a table truncates the file and writes the header
        row; subsequent batches are appended without a header.
        """
        file_path = self.output_dir / f"{table_name}.csv"

        try:
            first_batch = table_name not in self._headers_written
            mode = "w" if first_batch else "a"

            with open(file_path, mode, newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                if first_batch:
                    writer.writerow(df.columns.tolist())
                # itertuples() keeps each column's own dtype. iterrows()
                # builds one Series per row, which upcasts ints to floats
                # whenever the frame also has a float column ("1" -> "1.0").
                writer.writerows(df.itertuples(index=False, name=None))

            if first_batch:
                self._headers_written[table_name] = True
                self._rows_written[table_name] = 0

            rows = len(df)
            self._rows_written[table_name] = self._rows_written.get(table_name, 0) + rows

            if self.progress_callback:
                self.progress_callback(table_name, self._rows_written[table_name])

            return rows

        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise FileWriteError(f"Failed to write CSV: {e}", path=str(file_path)) from e

    def _write_parquet_batch(self, table_name: str, df: pd.DataFrame) -> int:
        """Write a batch to Parquet file using append mode.

        Appending is done by reading the file written so far, concatenating
        the new batch and rewriting the file, so the table accumulated so far
        is loaded into memory on every call after the first.
        """
        try:
            import pyarrow as pa
            import pyarrow.parquet as pq
        except ImportError as e:
            raise ExportError(
                "PyArrow required for Parquet export",
                details={"suggestion": "pip install pyarrow"}
            ) from e

        file_path = self.output_dir / f"{table_name}.parquet"

        try:
            table = pa.Table.from_pandas(df)

            # Append only to a file this exporter produced. A file left over
            # from an earlier run is overwritten, matching the CSV path,
            # which truncates on the first batch.
            already_started = table_name in self._rows_written
            if already_started and file_path.exists():
                existing = pq.read_table(file_path)
                combined = pa.concat_tables([existing, table])
                pq.write_table(combined, file_path)
            else:
                pq.write_table(table, file_path)

            rows = len(df)
            self._rows_written[table_name] = self._rows_written.get(table_name, 0) + rows

            if self.progress_callback:
                self.progress_callback(table_name, self._rows_written[table_name])

            return rows

        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise FileWriteError(f"Failed to write Parquet: {e}", path=str(file_path)) from e

    def finalize(self) -> Dict[str, int]:
        """Finalize all exports and return summary.

        Returns:
            Dict mapping table names to row counts
        """
        # Best-effort close of any registered handles: a failure to close one
        # handle must not prevent the others from being closed.
        for handle in self._file_handles.values():
            try:
                handle.close()
            except Exception:
                pass

        self._file_handles.clear()
        self._csv_writers.clear()

        # Return a copy so callers cannot mutate the internal counters.
        return self._rows_written.copy()

    def get_file_paths(self) -> Dict[str, Path]:
        """Get paths to all exported files, keyed by table name."""
        # Any format other than 'csv' is reported with the .parquet suffix.
        ext = ".csv" if self.format == "csv" else ".parquet"
        return {
            table: self.output_dir / f"{table}{ext}"
            for table in self._rows_written
        }

    def get_stats(self) -> Dict[str, Any]:
        """Get export statistics.

        Returns:
            Dict with the output directory, the format, a copy of the
            per-table row counts and the total number of rows written.
        """
        return {
            "output_dir": str(self.output_dir),
            "format": self.format,
            "tables": self._rows_written.copy(),
            "total_rows": sum(self._rows_written.values()),
        }
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def stream_to_csv(
    generator: Generator[tuple, None, None],
    output_dir: str,
    progress_callback: Optional[Callable[[str, int], None]] = None,
) -> Dict[str, int]:
    """Drain a batch generator into one CSV file per table.

    Args:
        generator: Iterator yielding (table_name, batch_df) tuples
        output_dir: Directory to write files to
        progress_callback: Optional callback(table_name, rows_written)

    Returns:
        Dict mapping table names to final row counts
    """
    sink = StreamingExporter(
        output_dir=output_dir,
        format="csv",
        progress_callback=progress_callback,
    )

    # Every batch is handed to the exporter as soon as it is produced.
    for name, frame in generator:
        sink.write_batch(name, frame)

    summary = sink.finalize()
    return summary
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def stream_to_parquet(
    generator: Generator[tuple, None, None],
    output_dir: str,
    progress_callback: Optional[Callable[[str, int], None]] = None,
) -> Dict[str, int]:
    """Drain a batch generator into one Parquet file per table.

    Args:
        generator: Iterator yielding (table_name, batch_df) tuples
        output_dir: Directory to write files to
        progress_callback: Optional callback(table_name, rows_written)

    Returns:
        Dict mapping table names to final row counts
    """
    sink = StreamingExporter(
        output_dir=output_dir,
        format="parquet",
        progress_callback=progress_callback,
    )

    # Every batch is handed to the exporter as soon as it is produced.
    for name, frame in generator:
        sink.write_batch(name, frame)

    summary = sink.finalize()
    return summary
|
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pre-built schema templates for common use cases.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
from misata.templates.library import load_template, list_templates
|
|
6
|
+
|
|
7
|
+
# See available templates
|
|
8
|
+
print(list_templates())
|
|
9
|
+
|
|
10
|
+
# Load a template
|
|
11
|
+
config = load_template("ecommerce")
|
|
12
|
+
|
|
13
|
+
# Generate data
|
|
14
|
+
from misata import DataSimulator
|
|
15
|
+
for table, batch in DataSimulator(config).generate_all():
|
|
16
|
+
print(f"Generated {len(batch)} rows for {table}")
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from misata.schema import Column, Relationship, SchemaConfig, Table
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def list_templates() -> list:
    """Return the names of the built-in templates as a new list."""
    names = ("ecommerce", "saas", "healthcare", "fintech")
    return list(names)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def load_template(name: str, row_multiplier: float = 1.0) -> SchemaConfig:
    """
    Load a pre-built schema template.

    Args:
        name: Template name (ecommerce, saas, healthcare, fintech)
        row_multiplier: Scale row counts (e.g., 0.1 for 10%, 2.0 for 2x).
            Must be greater than zero. Reference tables are not scaled.

    Returns:
        SchemaConfig ready for DataSimulator

    Raises:
        ValueError: If ``name`` is not a known template or ``row_multiplier``
            is not a positive number.
    """
    templates = {
        "ecommerce": _ecommerce_template,
        "saas": _saas_template,
        "healthcare": _healthcare_template,
        "fintech": _fintech_template,
    }

    if name not in templates:
        raise ValueError(f"Unknown template: {name}. Available: {list(templates.keys())}")

    # Written as "not > 0" so that NaN is rejected along with zero and
    # negative values.
    if not row_multiplier > 0:
        raise ValueError(f"row_multiplier must be greater than 0, got {row_multiplier}")

    config = templates[name]()

    # Apply row multiplier
    if row_multiplier != 1.0:
        for table in config.tables:
            if not table.is_reference:
                # int() truncates, so a small multiplier could otherwise
                # turn a table into zero rows; keep at least one row.
                table.row_count = max(1, int(table.row_count * row_multiplier))

    return config
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _ecommerce_template() -> SchemaConfig:
    """Build the e-commerce schema: customers, products, orders, items, reviews.

    Two reference tables (categories, shipping_methods) carry inline rows;
    the remaining tables are declared with a row count only.
    """

    def pk(max_id):
        # Unique integer "id" column spanning 1..max_id.
        return Column(name="id", type="int", distribution_params={"min": 1, "max": max_id}, unique=True)

    def fk(column_name):
        # Foreign-key column with no distribution parameters of its own.
        return Column(name=column_name, type="foreign_key", distribution_params={})

    def col(column_name, kind, params):
        return Column(name=column_name, type=kind, distribution_params=params)

    def link(parent, child, child_key):
        # Every relationship in this template points at the parent's "id".
        return Relationship(parent_table=parent, child_table=child, parent_key="id", child_key=child_key)

    # Reference tables
    categories = Table(
        name="categories",
        is_reference=True,
        inline_data=[
            {"id": 1, "name": "Electronics", "margin_pct": 15},
            {"id": 2, "name": "Clothing", "margin_pct": 40},
            {"id": 3, "name": "Home & Garden", "margin_pct": 25},
            {"id": 4, "name": "Sports", "margin_pct": 30},
            {"id": 5, "name": "Books", "margin_pct": 35},
            {"id": 6, "name": "Beauty", "margin_pct": 50},
        ],
    )
    shipping_methods = Table(
        name="shipping_methods",
        is_reference=True,
        inline_data=[
            {"id": 1, "name": "Standard", "days": 5, "cost": 4.99},
            {"id": 2, "name": "Express", "days": 2, "cost": 9.99},
            {"id": 3, "name": "Next Day", "days": 1, "cost": 19.99},
            {"id": 4, "name": "Free Shipping", "days": 7, "cost": 0.00},
        ],
    )

    # Transactional tables
    tables = [
        categories,
        shipping_methods,
        Table(name="customers", row_count=10000),
        Table(name="products", row_count=500),
        Table(name="orders", row_count=50000),
        Table(name="order_items", row_count=150000),
        Table(name="reviews", row_count=20000),
    ]

    columns = {
        "customers": [
            pk(10000),
            col("name", "text", {"text_type": "name"}),
            col("email", "text", {"text_type": "email"}),
            col("city", "text", {"text_type": "word", "smart_generate": True}),
            col("created_at", "date", {"start": "2020-01-01", "end": "2024-12-31"}),
            col("is_premium", "boolean", {"probability": 0.15}),
        ],
        "products": [
            pk(500),
            col("name", "text", {"text_type": "sentence"}),
            fk("category_id"),
            col("price", "float", {"distribution": "uniform", "min": 9.99, "max": 299.99, "decimals": 2}),
            col("stock", "int", {"distribution": "poisson", "lambda": 50}),
        ],
        "orders": [
            pk(50000),
            fk("customer_id"),
            fk("shipping_method_id"),
            col("order_date", "date", {"start": "2023-01-01", "end": "2024-12-31"}),
            col("status", "categorical", {"choices": ["completed", "pending", "shipped", "cancelled"], "probabilities": [0.6, 0.15, 0.2, 0.05]}),
            col("total", "float", {"distribution": "exponential", "scale": 75, "min": 10, "decimals": 2}),
        ],
        "order_items": [
            pk(150000),
            fk("order_id"),
            fk("product_id"),
            col("quantity", "int", {"distribution": "poisson", "lambda": 2, "min": 1}),
            col("unit_price", "float", {"distribution": "uniform", "min": 5.0, "max": 200.0, "decimals": 2}),
        ],
        "reviews": [
            pk(20000),
            fk("product_id"),
            fk("customer_id"),
            col("rating", "int", {"distribution": "categorical", "choices": [1, 2, 3, 4, 5], "probabilities": [0.05, 0.08, 0.15, 0.32, 0.40]}),
            col("title", "text", {"text_type": "sentence", "smart_generate": True}),
            col("created_at", "date", {"start": "2023-01-01", "end": "2024-12-31"}),
        ],
    }

    relationships = [
        link("categories", "products", "category_id"),
        link("customers", "orders", "customer_id"),
        link("shipping_methods", "orders", "shipping_method_id"),
        link("orders", "order_items", "order_id"),
        link("products", "order_items", "product_id"),
        link("products", "reviews", "product_id"),
        link("customers", "reviews", "customer_id"),
    ]

    return SchemaConfig(
        name="E-commerce Platform",
        description="Complete e-commerce dataset with products, orders, and reviews",
        seed=42,
        tables=tables,
        columns=columns,
        relationships=relationships,
    )
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _saas_template() -> SchemaConfig:
    """Build the SaaS schema: companies, users, subscriptions, usage events.

    The ``plans`` reference table carries inline rows; the remaining tables
    are declared with a row count only.
    """

    def pk(max_id):
        # Unique integer "id" column spanning 1..max_id.
        return Column(name="id", type="int", distribution_params={"min": 1, "max": max_id}, unique=True)

    def fk(column_name):
        # Foreign-key column with no distribution parameters of its own.
        return Column(name=column_name, type="foreign_key", distribution_params={})

    def col(column_name, kind, params):
        return Column(name=column_name, type=kind, distribution_params=params)

    def link(parent, child, child_key):
        # Every relationship in this template points at the parent's "id".
        return Relationship(parent_table=parent, child_table=child, parent_key="id", child_key=child_key)

    plans = Table(
        name="plans",
        is_reference=True,
        inline_data=[
            {"id": 1, "name": "Free", "price": 0, "seats": 1, "features": "Basic"},
            {"id": 2, "name": "Starter", "price": 29, "seats": 5, "features": "Core features"},
            {"id": 3, "name": "Professional", "price": 99, "seats": 20, "features": "All features"},
            {"id": 4, "name": "Enterprise", "price": 299, "seats": 100, "features": "Custom"},
        ],
    )

    tables = [
        plans,
        Table(name="companies", row_count=1000),
        Table(name="users", row_count=25000),
        Table(name="subscriptions", row_count=1200),
        Table(name="usage_events", row_count=500000),
    ]

    columns = {
        "companies": [
            pk(1000),
            col("name", "text", {"text_type": "company"}),
            col("industry", "text", {"text_type": "word", "smart_generate": True}),
            col("employee_count", "int", {"distribution": "exponential", "scale": 50, "min": 1}),
            col("created_at", "date", {"start": "2020-01-01", "end": "2024-06-30"}),
        ],
        "users": [
            pk(25000),
            fk("company_id"),
            col("name", "text", {"text_type": "name"}),
            col("email", "text", {"text_type": "email"}),
            col("role", "categorical", {"choices": ["admin", "member", "viewer"], "probabilities": [0.1, 0.6, 0.3]}),
            col("is_active", "boolean", {"probability": 0.85}),
            col("last_login", "date", {"start": "2024-01-01", "end": "2024-12-31"}),
        ],
        "subscriptions": [
            pk(1200),
            fk("company_id"),
            fk("plan_id"),
            col("status", "categorical", {"choices": ["active", "cancelled", "trial", "past_due"], "probabilities": [0.7, 0.1, 0.15, 0.05]}),
            col("start_date", "date", {"start": "2022-01-01", "end": "2024-12-31"}),
            col("mrr", "float", {"distribution": "exponential", "scale": 100, "min": 0, "decimals": 2}),
        ],
        "usage_events": [
            pk(500000),
            fk("user_id"),
            col("event_type", "categorical", {"choices": ["page_view", "api_call", "export", "login", "feature_use"], "probabilities": [0.4, 0.3, 0.1, 0.1, 0.1]}),
            col("timestamp", "datetime", {"start": "2024-01-01", "end": "2024-12-31"}),
        ],
    }

    relationships = [
        link("companies", "users", "company_id"),
        link("companies", "subscriptions", "company_id"),
        link("plans", "subscriptions", "plan_id"),
        link("users", "usage_events", "user_id"),
    ]

    return SchemaConfig(
        name="SaaS Platform",
        description="B2B SaaS with companies, users, subscriptions, and usage tracking",
        seed=42,
        tables=tables,
        columns=columns,
        relationships=relationships,
    )
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _healthcare_template() -> SchemaConfig:
    """Build the healthcare schema: patients, doctors, appointments, prescriptions.

    The ``specialties`` reference table carries inline rows; the remaining
    tables are declared with a row count only.
    """

    def pk(max_id):
        # Unique integer "id" column spanning 1..max_id.
        return Column(name="id", type="int", distribution_params={"min": 1, "max": max_id}, unique=True)

    def fk(column_name):
        # Foreign-key column with no distribution parameters of its own.
        return Column(name=column_name, type="foreign_key", distribution_params={})

    def col(column_name, kind, params):
        return Column(name=column_name, type=kind, distribution_params=params)

    def link(parent, child, child_key):
        # Every relationship in this template points at the parent's "id".
        return Relationship(parent_table=parent, child_table=child, parent_key="id", child_key=child_key)

    specialties = Table(
        name="specialties",
        is_reference=True,
        inline_data=[
            {"id": 1, "name": "General Practice", "avg_consult_mins": 15},
            {"id": 2, "name": "Cardiology", "avg_consult_mins": 30},
            {"id": 3, "name": "Dermatology", "avg_consult_mins": 20},
            {"id": 4, "name": "Orthopedics", "avg_consult_mins": 25},
            {"id": 5, "name": "Pediatrics", "avg_consult_mins": 20},
            {"id": 6, "name": "Psychiatry", "avg_consult_mins": 45},
            {"id": 7, "name": "Neurology", "avg_consult_mins": 30},
        ],
    )

    tables = [
        specialties,
        Table(name="patients", row_count=10000),
        Table(name="doctors", row_count=100),
        Table(name="appointments", row_count=50000),
        Table(name="prescriptions", row_count=75000),
    ]

    columns = {
        "patients": [
            pk(10000),
            col("name", "text", {"text_type": "name"}),
            col("date_of_birth", "date", {"start": "1940-01-01", "end": "2020-12-31"}),
            col("gender", "categorical", {"choices": ["M", "F", "Other"], "probabilities": [0.48, 0.48, 0.04]}),
            col("phone", "text", {"text_type": "phone"}),
            col("insurance_id", "text", {"text_type": "word"}),
        ],
        "doctors": [
            pk(100),
            col("name", "text", {"text_type": "name"}),
            fk("specialty_id"),
            col("years_experience", "int", {"distribution": "normal", "mean": 15, "std": 8, "min": 1, "max": 40}),
            col("is_accepting_patients", "boolean", {"probability": 0.8}),
        ],
        "appointments": [
            pk(50000),
            fk("patient_id"),
            fk("doctor_id"),
            col("appointment_date", "datetime", {"start": "2023-01-01", "end": "2024-12-31"}),
            col("duration_mins", "int", {"distribution": "normal", "mean": 25, "std": 10, "min": 10, "max": 60}),
            col("status", "categorical", {"choices": ["completed", "scheduled", "cancelled", "no_show"], "probabilities": [0.65, 0.2, 0.1, 0.05]}),
            col("notes", "text", {"text_type": "sentence"}),
        ],
        "prescriptions": [
            pk(75000),
            fk("appointment_id"),
            col("medication", "text", {"text_type": "word", "smart_generate": True}),
            col("dosage", "text", {"text_type": "word"}),
            col("duration_days", "int", {"distribution": "categorical", "choices": [7, 14, 30, 60, 90], "probabilities": [0.3, 0.25, 0.25, 0.1, 0.1]}),
        ],
    }

    relationships = [
        link("specialties", "doctors", "specialty_id"),
        link("patients", "appointments", "patient_id"),
        link("doctors", "appointments", "doctor_id"),
        link("appointments", "prescriptions", "appointment_id"),
    ]

    return SchemaConfig(
        name="Healthcare System",
        description="Hospital management with patients, appointments, and prescriptions",
        seed=42,
        tables=tables,
        columns=columns,
        relationships=relationships,
    )
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def _fintech_template() -> SchemaConfig:
    """Build the fintech schema: customers, accounts, and labelled transactions.

    Two reference tables (account_types, transaction_types) carry inline
    rows; the remaining tables are declared with a row count only.
    """

    def pk(max_id):
        # Unique integer "id" column spanning 1..max_id.
        return Column(name="id", type="int", distribution_params={"min": 1, "max": max_id}, unique=True)

    def fk(column_name):
        # Foreign-key column with no distribution parameters of its own.
        return Column(name=column_name, type="foreign_key", distribution_params={})

    def col(column_name, kind, params):
        return Column(name=column_name, type=kind, distribution_params=params)

    def link(parent, child, child_key):
        # Every relationship in this template points at the parent's "id".
        return Relationship(parent_table=parent, child_table=child, parent_key="id", child_key=child_key)

    account_types = Table(
        name="account_types",
        is_reference=True,
        inline_data=[
            {"id": 1, "name": "Checking", "min_balance": 0, "monthly_fee": 0},
            {"id": 2, "name": "Savings", "min_balance": 100, "monthly_fee": 0},
            {"id": 3, "name": "Premium", "min_balance": 5000, "monthly_fee": 15},
            {"id": 4, "name": "Business", "min_balance": 1000, "monthly_fee": 25},
        ],
    )
    transaction_types = Table(
        name="transaction_types",
        is_reference=True,
        inline_data=[
            {"id": 1, "name": "deposit", "direction": "in"},
            {"id": 2, "name": "withdrawal", "direction": "out"},
            {"id": 3, "name": "transfer", "direction": "both"},
            {"id": 4, "name": "payment", "direction": "out"},
            {"id": 5, "name": "refund", "direction": "in"},
        ],
    )

    tables = [
        account_types,
        transaction_types,
        Table(name="customers", row_count=25000),
        Table(name="accounts", row_count=35000),
        Table(name="transactions", row_count=500000),
    ]

    columns = {
        "customers": [
            pk(25000),
            col("name", "text", {"text_type": "name"}),
            col("email", "text", {"text_type": "email"}),
            col("phone", "text", {"text_type": "phone"}),
            col("created_at", "date", {"start": "2018-01-01", "end": "2024-12-31"}),
            col("risk_score", "int", {"distribution": "normal", "mean": 30, "std": 20, "min": 0, "max": 100}),
            col("is_verified", "boolean", {"probability": 0.92}),
        ],
        "accounts": [
            pk(35000),
            fk("customer_id"),
            fk("account_type_id"),
            col("balance", "float", {"distribution": "exponential", "scale": 5000, "min": 0, "decimals": 2}),
            col("opened_date", "date", {"start": "2018-01-01", "end": "2024-12-31"}),
            col("is_active", "boolean", {"probability": 0.88}),
        ],
        "transactions": [
            pk(500000),
            fk("account_id"),
            fk("transaction_type_id"),
            col("amount", "float", {"distribution": "exponential", "scale": 150, "min": 0.01, "decimals": 2}),
            col("timestamp", "datetime", {"start": "2024-01-01", "end": "2024-12-31"}),
            col("merchant", "text", {"text_type": "company"}),
            # 1.2% of transactions are flagged as fraud.
            col("is_fraud", "boolean", {"probability": 0.012}),
        ],
    }

    relationships = [
        link("customers", "accounts", "customer_id"),
        link("account_types", "accounts", "account_type_id"),
        link("accounts", "transactions", "account_id"),
        link("transaction_types", "transactions", "transaction_type_id"),
    ]

    return SchemaConfig(
        name="Fintech Platform",
        description="Banking/payments platform with accounts, transactions, and fraud labels",
        seed=42,
        tables=tables,
        columns=columns,
        relationships=relationships,
    )
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: misata
|
|
3
|
-
Version: 0.1.0b0
|
|
3
|
+
Version: 0.3.0b0
|
|
4
4
|
Summary: AI-Powered Synthetic Data Engine - Generate realistic multi-table datasets from natural language
|
|
5
5
|
Author-email: Muhammed Rasin <rasinbinabdulla@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -23,6 +23,7 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
23
23
|
Classifier: Topic :: Database
|
|
24
24
|
Requires-Python: >=3.10
|
|
25
25
|
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
26
27
|
Requires-Dist: pandas>=2.0.0
|
|
27
28
|
Requires-Dist: numpy>=1.24.0
|
|
28
29
|
Requires-Dist: pydantic>=2.0.0
|
|
@@ -41,6 +42,7 @@ Requires-Dist: pytest-benchmark>=4.0.0; extra == "dev"
|
|
|
41
42
|
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
42
43
|
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
43
44
|
Requires-Dist: mypy>=1.5.0; extra == "dev"
|
|
45
|
+
Dynamic: license-file
|
|
44
46
|
|
|
45
47
|
# 🧠 Misata
|
|
46
48
|
|
|
@@ -48,7 +50,7 @@ Requires-Dist: mypy>=1.5.0; extra == "dev"
|
|
|
48
50
|
|
|
49
51
|
No schema writing. No training data. Just describe what you need.
|
|
50
52
|
|
|
51
|
-
[]()
|
|
52
54
|
[]()
|
|
53
55
|
[]()
|
|
54
56
|
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
misata/__init__.py,sha256=Vra5zMkd5Y6HTzhGRc76jTv10Z0yuhw33MDUoLpACrE,3144
|
|
2
|
+
misata/api.py,sha256=Wq2H3iJzocNTsCzb9vhYJxDyag3Yiucvb-GVF0tdKhI,14999
|
|
3
|
+
misata/audit.py,sha256=4eUCHT2STptemfakWeNODbVuBRhyD8Q32LlB2eufvuw,12291
|
|
4
|
+
misata/benchmark.py,sha256=Y1-tuKegJyAlTneROQpPo276qnfmMmupGDbVDs9k5J8,12358
|
|
5
|
+
misata/cache.py,sha256=fuLk7cQ7hOEmlqEWmm-O516L26btZ6zFO8FdrqFCRLg,7087
|
|
6
|
+
misata/cli.py,sha256=a7YijZCUYrkCYGVYJ2nZSL9J3JfFqbXQQOad6bhy7zM,22642
|
|
7
|
+
misata/codegen.py,sha256=m7ykTtLgITvaqzVB1cVhs1b9Puo2X4uyzngZ85wi6J0,5791
|
|
8
|
+
misata/constraints.py,sha256=8jUKlA2VyVomDnl2zz0RDkUqxEkxlwUnBbHTXr_SA5g,9937
|
|
9
|
+
misata/context.py,sha256=tjYrU67wjII07Pl3MKV_uCMl_s55DIOQZCouxAryyzE,8509
|
|
10
|
+
misata/curve_fitting.py,sha256=gLj4BkIxNWKkfo3QKZFI_aq60bsXlI53K5yZX4hc9EU,4126
|
|
11
|
+
misata/customization.py,sha256=pw-BEsPKN091hyOrQWWQoRhTrlmQ9_PXXopm2FZSEvs,8551
|
|
12
|
+
misata/exceptions.py,sha256=C3IGMk8xAy9AmRVWeSAnLHHui7drv6rzgzvOmr6gh50,8335
|
|
13
|
+
misata/feedback.py,sha256=HBEsoKi_vdRqwRzMoVFVj_cjfzQ5SUAaGz40s1HMD50,13313
|
|
14
|
+
misata/formulas.py,sha256=KOTq5YN_19vv1ERd92bdzKot9yo9rrrwjOuWO13nFCg,11210
|
|
15
|
+
misata/generators.py,sha256=NrMF12i6CB7K6fUsqcqurmZBBQ382ZhVnYB9oMBIZCE,8844
|
|
16
|
+
misata/hybrid.py,sha256=5oopAdfOLWUYzdRWlc0plVeVEVg7Nu1CVGNNCDSjQt8,13104
|
|
17
|
+
misata/llm_parser.py,sha256=2SVozbKtb0kaPaR4ERz9FtIIxK5jQVaYJ8L_xC6gU10,20662
|
|
18
|
+
misata/noise.py,sha256=UO7MokzQ5Y5Vj7JaayDUG0JwCLnpHtnpQTcJ4UHWibo,10460
|
|
19
|
+
misata/profiles.py,sha256=0djys8wWvH8VP74KmGn6cGLuOb64h9Hk0g0bkXOfxP4,9578
|
|
20
|
+
misata/quality.py,sha256=VSntJfMnF1tVWJ05fvbVJOMcAPEB7QtuEg18k6aEwhA,11685
|
|
21
|
+
misata/schema.py,sha256=zMYDPCgPfcy_STgANiS-Ow3dUETpW3Ayo02G88jmBe0,8954
|
|
22
|
+
misata/semantic.py,sha256=0fauGWJ75wlbHVqT0hohYTN4m_nscdaMaVAIfkhTZXk,7087
|
|
23
|
+
misata/simulator.py,sha256=nq9KxOS-4oUMNu7a2Ten0TQyhT2u_rTo2ImmvdkMRbU,34037
|
|
24
|
+
misata/smart_values.py,sha256=8-TYBK5cVBst9tfGuQXXetOLSqgns_NKnIl14rpVrbk,35870
|
|
25
|
+
misata/story_parser.py,sha256=7N7so3KWisl2UxkOtENQwP-4hN2cs9vTKsPHVRZB2Mc,15964
|
|
26
|
+
misata/streaming.py,sha256=qbEnoFRfn9a7H_gWlq5C3TwbNUnP5U98OPo1EdU_cQ0,7578
|
|
27
|
+
misata/validation.py,sha256=5yJSN7jecVNLJ8ss6y7l2U4hF1Ljn27Q6Xs9N1iDPcw,10791
|
|
28
|
+
misata/generators/__init__.py,sha256=V4I_1IucuywRJZH3cLxKvBd2Ib7kE0WIJ7tq8y4lkx8,568
|
|
29
|
+
misata/generators/base.py,sha256=iON9iAONMEQdbq2Fdric3V3bWn3caD1ITC16DTCK0Og,21329
|
|
30
|
+
misata/templates/__init__.py,sha256=0RcZz9d4bmCqLAr77h0gpMfHncqAPeZCguqsuGCz7rE,25245
|
|
31
|
+
misata/templates/library.py,sha256=eMex18ZKlzQqIkGFgs1uy9QGs7PmUN_VVL4txKvxynM,20930
|
|
32
|
+
misata-0.3.0b0.dist-info/licenses/LICENSE,sha256=oagkechmfr9iT214N871zCm7TnB0KTfPjAUWxHsYJ4I,1071
|
|
33
|
+
misata-0.3.0b0.dist-info/METADATA,sha256=Wxpa2V0Sum-CFOpNnmRd27eEDfyT9CKIy-4nGZnrCys,8114
|
|
34
|
+
misata-0.3.0b0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
35
|
+
misata-0.3.0b0.dist-info/entry_points.txt,sha256=k3SDuju7VnqB4AcY0Vufw-j1tWU3Ay612G3DGqoNs0U,43
|
|
36
|
+
misata-0.3.0b0.dist-info/top_level.txt,sha256=dpwR99XWKUAXqNg7WiNLu_XYd7WYGmZpJzrfQXbAZFs,7
|
|
37
|
+
misata-0.3.0b0.dist-info/RECORD,,
|