misata 0.3.0b0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +1 -1
- misata/agents/__init__.py +23 -0
- misata/agents/pipeline.py +286 -0
- misata/causal/__init__.py +5 -0
- misata/causal/graph.py +109 -0
- misata/causal/solver.py +115 -0
- misata/cli.py +31 -0
- misata/generators/__init__.py +19 -0
- misata/generators/copula.py +198 -0
- misata/llm_parser.py +180 -137
- misata/quality.py +78 -33
- misata/reference_data.py +221 -0
- misata/research/__init__.py +3 -0
- misata/research/agent.py +70 -0
- misata/schema.py +25 -0
- misata/simulator.py +264 -12
- misata/smart_values.py +144 -6
- misata/studio/__init__.py +55 -0
- misata/studio/app.py +49 -0
- misata/studio/components/inspector.py +81 -0
- misata/studio/components/sidebar.py +35 -0
- misata/studio/constraint_generator.py +781 -0
- misata/studio/inference.py +319 -0
- misata/studio/outcome_curve.py +284 -0
- misata/studio/state/store.py +55 -0
- misata/studio/tabs/configure.py +50 -0
- misata/studio/tabs/generate.py +117 -0
- misata/studio/tabs/outcome_curve.py +149 -0
- misata/studio/tabs/schema_designer.py +217 -0
- misata/studio/utils/styles.py +143 -0
- misata/studio_constraints/__init__.py +29 -0
- misata/studio_constraints/z3_solver.py +259 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/METADATA +13 -2
- misata-0.5.0.dist-info/RECORD +61 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/WHEEL +1 -1
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/entry_points.txt +1 -0
- misata-0.3.0b0.dist-info/RECORD +0 -37
- /misata/{generators.py → generators_legacy.py} +0 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/top_level.txt +0 -0
misata/generators/copula.py
ADDED
@@ -0,0 +1,198 @@
"""
SDV Copula-based Synthetic Data Generator

Uses SDV's GaussianCopulaSynthesizer for high-quality correlation preservation.
This is a key upgrade from our basic generator to beat Gretel on data quality.
"""

from typing import Dict, List, Optional, Any
import pandas as pd
import numpy as np

try:
    from sdv.single_table import GaussianCopulaSynthesizer
    from sdv.metadata import SingleTableMetadata
    SDV_AVAILABLE = True
except ImportError:
    SDV_AVAILABLE = False
    print("[WARNING] SDV not installed. Run: pip install sdv")


class CopulaGenerator:
    """
    SDV-based generator using Gaussian Copulas for correlation preservation.

    Key advantages over basic generation:
    - Preserves pairwise correlations between columns
    - Learns marginal distributions accurately
    - Handles mixed data types (numeric, categorical, datetime)
    """

    def __init__(self):
        self.synthesizer = None
        self.metadata = None
        self._is_fitted = False

    def fit(self, df: pd.DataFrame, metadata: Optional[Dict] = None) -> None:
        """
        Fit the copula model to real data.

        Args:
            df: Real data to learn from
            metadata: Optional SDV metadata dict, auto-detected if not provided
        """
        if not SDV_AVAILABLE:
            raise ImportError("SDV not installed. Run: pip install sdv")

        # Auto-detect metadata if not provided
        self.metadata = SingleTableMetadata()
        self.metadata.detect_from_dataframe(df)

        # Apply custom metadata if provided
        if metadata:
            for col, col_meta in metadata.items():
                if 'sdtype' in col_meta:
                    self.metadata.update_column(col, sdtype=col_meta['sdtype'])

        # Create and fit synthesizer
        self.synthesizer = GaussianCopulaSynthesizer(self.metadata)
        self.synthesizer.fit(df)
        self._is_fitted = True

        print(f"[COPULA] Fitted on {len(df)} rows, {len(df.columns)} columns")

    def sample(self, n: int) -> pd.DataFrame:
        """
        Generate synthetic data preserving correlations.

        Args:
            n: Number of rows to generate

        Returns:
            Synthetic DataFrame with same schema as training data
        """
        if not self._is_fitted:
            raise ValueError("Must call fit() before sample()")

        synthetic = self.synthesizer.sample(n)
        print(f"[COPULA] Generated {len(synthetic)} rows")
        return synthetic

    def get_quality_report(self, real: pd.DataFrame, synthetic: pd.DataFrame) -> Dict[str, Any]:
        """
        Evaluate quality of synthetic data vs real data.

        Returns:
            Dict with quality metrics (no fake validations!)
        """
        try:
            from sdv.evaluation.single_table import evaluate_quality

            report = evaluate_quality(
                real_data=real,
                synthetic_data=synthetic,
                metadata=self.metadata
            )

            return {
                "overall_score": report.get_score(),
                "column_shapes": report.get_details("Column Shapes"),
                "column_pair_trends": report.get_details("Column Pair Trends"),
            }
        except Exception as e:
            print(f"[COPULA] Quality evaluation failed: {e}")
            return {"error": str(e)}


class ConstraintAwareCopulaGenerator(CopulaGenerator):
    """
    Extended Copula generator that applies outcome constraints.
    """

    def sample_with_constraints(
        self,
        n: int,
        outcome_curves: Optional[List[Dict]] = None,
        date_column: Optional[str] = None,
        value_column: Optional[str] = None
    ) -> pd.DataFrame:
        """
        Generate data that matches outcome curve targets.

        Args:
            n: Number of rows
            outcome_curves: List of curve specs with monthly targets
            date_column: Column containing dates
            value_column: Column to adjust for targets

        Returns:
            Synthetic data adjusted to match targets
        """
        # Generate base synthetic data
        df = self.sample(n)

        if not outcome_curves or not date_column or not value_column:
            return df

        if date_column not in df.columns or value_column not in df.columns:
            print(f"[COPULA] Columns not found: {date_column}, {value_column}")
            return df

        # Apply outcome curve adjustments
        for curve in outcome_curves:
            df = self._apply_curve(df, curve, date_column, value_column)

        return df

    def _apply_curve(
        self,
        df: pd.DataFrame,
        curve: Dict,
        date_column: str,
        value_column: str
    ) -> pd.DataFrame:
        """Apply a single outcome curve to the data."""

        points = curve.get('curve_points', [])
        if not points:
            return df

        # Ensure date column is datetime
        if not pd.api.types.is_datetime64_any_dtype(df[date_column]):
            df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

        # Build month -> target mapping
        month_targets = {}
        for p in points:
            month = p.get('month') if isinstance(p, dict) else getattr(p, 'month', None)
            value = p.get('relative_value') if isinstance(p, dict) else getattr(p, 'relative_value', None)
            if month and value:
                month_targets[month] = value

        if not month_targets:
            return df

        # Calculate base mean for scaling
        base_mean = df[value_column].mean()

        # Apply scaling per month
        for month, relative_value in month_targets.items():
            mask = df[date_column].dt.month == month
            if mask.sum() > 0:
                # Scale values to match relative target
                # relative_value=1.0 means average, 2.0 means double, etc.
                current_mean = df.loc[mask, value_column].mean()
                if current_mean > 0:
                    scale_factor = relative_value
                    df.loc[mask, value_column] = df.loc[mask, value_column] * scale_factor

        print(f"[COPULA] Applied outcome curve: {len(month_targets)} monthly adjustments")
        return df


# Factory function for easy access
def create_copula_generator(with_constraints: bool = True) -> CopulaGenerator:
    """Create a copula generator instance."""
    if with_constraints:
        return ConstraintAwareCopulaGenerator()
    return CopulaGenerator()
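For orientation, here is a minimal usage sketch of the new module. This is illustrative code, not part of the diff; it assumes sdv and pandas are installed and uses only the names defined above.

# Illustrative usage of misata/generators/copula.py (not part of the diff).
import pandas as pd
from misata.generators.copula import create_copula_generator

real = pd.DataFrame({
    "amount": [12.5, 30.0, 18.75, 22.0, 41.3, 15.0],
    "sale_date": pd.to_datetime([
        "2025-02-05", "2025-02-11", "2025-06-20",
        "2025-07-02", "2025-12-01", "2025-12-15",
    ]),
})

gen = create_copula_generator(with_constraints=True)  # ConstraintAwareCopulaGenerator
gen.fit(real)                 # auto-detects SDV metadata from the DataFrame
synthetic = gen.sample(1000)  # preserves marginals and pairwise correlations

# Multiplies each row's "amount" by its month's relative_value
# (February 0.3x, December 1.0x, unlisted months untouched).
curved = gen.sample_with_constraints(
    n=1000,
    outcome_curves=[{"curve_points": [
        {"month": 2, "relative_value": 0.3},
        {"month": 12, "relative_value": 1.0},
    ]}],
    date_column="sale_date",
    value_column="amount",
)

print(gen.get_quality_report(real, synthetic).get("overall_score"))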
misata/llm_parser.py
CHANGED
@@ -16,7 +16,8 @@ from typing import Dict, Optional
 from groq import Groq

 from misata.curve_fitting import CurveFitter
-from misata.schema import Column, Relationship, ScenarioEvent, SchemaConfig, Table
+from misata.schema import Column, OutcomeCurve, Relationship, ScenarioEvent, SchemaConfig, Table
+from misata.research import DeepResearchAgent


 # Load .env file if it exists
@@ -47,164 +48,105 @@ def _load_env():
 _load_env()


-SYSTEM_PROMPT = """You are Misata, an expert synthetic data architect.
+SYSTEM_PROMPT = """You are Misata, an expert synthetic data architect. Your job is to generate REALISTIC database schemas based ONLY on the user's story.
+
+## CRITICAL: DO NOT USE DEFAULT EXAMPLES
+- Generate tables that are SPECIFIC to the user's domain.
+- If user says "pet store", create tables like "pets", "pet_categories", "pet_sales".
+- If user says "music streaming", create tables like "songs", "artists", "streams".
+- NEVER default to fitness/exercise/workout tables UNLESS the user explicitly asks for them.

 ## TABLE TYPES

 ### 1. REFERENCE TABLES (is_reference: true)
-Small lookup tables with ACTUAL DATA you generate.
-
-
-For reference tables, provide:
-- is_reference: true
-- inline_data: Array of actual rows with realistic values
+Small lookup tables (5-20 rows) with ACTUAL DATA you generate.
+- MUST have an "id" column (integer, sequential from 1)
+- Include realistic inline_data based on user's domain

 ### 2. TRANSACTIONAL TABLES (is_reference: false)
-Large tables generated by code using foreign keys
-
-
-For transactional tables, provide:
-- row_count: Number of rows to generate
-- Columns with distribution parameters
-
-## CRITICAL RULES
-
-### Reference Table Requirements:
-- ALWAYS include an "id" column (integer, sequential from 1)
-- Provide 5-20 realistic rows in inline_data
-- Prices in reference tables are the SOURCE OF TRUTH
-
-### Transactional Table Requirements:
-- Use foreign_key type to reference parent tables (reference or other parents)
-- Users: type="text" with text_type="name" or "email"
-- Metrics use distribution parameters
-
-### Foreign Key Rules:
-- foreign_key columns reference parent table's "id" column
-- Parent can be either reference table (plans.id) or transactional table (users.id)
-
-### Advanced Distributions (Optional):
-Instead of guessing parameters, you can provide "control_points" to draw the shape.
-Format: {"distribution": "normal", "control_points": [{"x": 10, "y": 0.1}, {"x": 50, "y": 0.9}]}
-Misata will mathematically solve for the best parameters.
-
-### SMART DEFAULTS (Use These for Realistic Data):
-
-**Age columns:**
-- type: "int", distribution: "normal", mean: 35, std: 12, min: 18, max: 80
-
-**Price/Amount columns:**
-- type: "float", distribution: "exponential", scale: 50, min: 0.01, decimals: 2
-- OR for products: uniform min: 9.99, max: 499.99
-
-**Rating columns (1-5 stars):**
-- type: "int", distribution: "categorical", choices: [1,2,3,4,5], probabilities: [0.05, 0.08, 0.15, 0.32, 0.40]
-
-**Quantity/Count columns:**
-- type: "int", distribution: "poisson", lambda: 3, min: 1
-
-**Duration (minutes):**
-- type: "int", distribution: "normal", mean: 45, std: 20, min: 5, max: 180
-
-**Percentage columns:**
-- type: "float", distribution: "uniform", min: 0.0, max: 100.0, decimals: 1
-
-**Status columns:**
-- type: "categorical", choices: ["active", "inactive", "pending"], probabilities: [0.70, 0.20, 0.10]
-
-**Boolean probabilities:**
-- is_verified: probability: 0.85
-- is_premium: probability: 0.25
-- is_active: probability: 0.80
-
-**Date columns:**
-- For recent data: bias last 30% of range with 70% of values
-- Always use realistic date ranges (not 1970-2100)
+Large tables generated by code using foreign keys.
+- Use row_count to specify size
+- Use foreign_key type to reference parent tables

 ## OUTPUT FORMAT

 {
-  "name": "Dataset Name",
-  "description": "Description",
+  "name": "Dataset Name based on user's domain",
+  "description": "Description of the domain",
   "seed": 42,
   "tables": [
     {
-      "name": "
+      "name": "domain_specific_reference_table",
       "is_reference": true,
       "inline_data": [
-        {"id": 1, "name": "
-        {"id": 2, "name": "
-        {"id": 3, "name": "Premium", "price": 19.99, "features": "All basic + priority support"},
-        {"id": 4, "name": "Enterprise", "price": 49.99, "features": "All premium + custom integrations"}
+        {"id": 1, "name": "Value A", "price": 10.00},
+        {"id": 2, "name": "Value B", "price": 20.00}
       ]
     },
     {
-      "name": "
-      "
-      "inline_data": [
-        {"id": 1, "name": "Running", "category": "Cardio", "calories_per_minute": 10},
-        {"id": 2, "name": "Cycling", "category": "Cardio", "calories_per_minute": 8},
-        {"id": 3, "name": "Yoga", "category": "Flexibility", "calories_per_minute": 3},
-        {"id": 4, "name": "Weightlifting", "category": "Strength", "calories_per_minute": 6},
-        {"id": 5, "name": "Swimming", "category": "Cardio", "calories_per_minute": 9},
-        {"id": 6, "name": "HIIT", "category": "Cardio", "calories_per_minute": 12},
-        {"id": 7, "name": "Pilates", "category": "Flexibility", "calories_per_minute": 4},
-        {"id": 8, "name": "Boxing", "category": "Cardio", "calories_per_minute": 11}
-      ]
-    },
-    {
-      "name": "users",
-      "row_count": 50000,
-      "is_reference": false
-    },
-    {
-      "name": "subscriptions",
-      "row_count": 20000,
-      "is_reference": false
-    },
-    {
-      "name": "workouts",
-      "row_count": 100000,
+      "name": "domain_specific_transactional_table",
+      "row_count": 10000,
       "is_reference": false
     }
   ],
   "columns": {
-    "
-      {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max":
-      {"name": "
-      {"name": "
-      {"name": "
-    ],
-    "subscriptions": [
-      {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 20000}},
-      {"name": "user_id", "type": "foreign_key", "distribution_params": {}},
-      {"name": "plan_id", "type": "foreign_key", "distribution_params": {}},
-      {"name": "status", "type": "categorical", "distribution_params": {"choices": ["active", "cancelled", "paused"], "probabilities": [0.7, 0.2, 0.1]}},
-      {"name": "start_date", "type": "date", "distribution_params": {"start": "2022-01-01", "end": "2024-12-31"}}
-    ],
-    "workouts": [
-      {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 100000}},
-      {"name": "user_id", "type": "foreign_key", "distribution_params": {}},
-      {"name": "exercise_id", "type": "foreign_key", "distribution_params": {}},
-      {"name": "duration_minutes", "type": "int", "distribution_params": {"distribution": "uniform", "min": 15, "max": 90}},
-      {"name": "date", "type": "date", "distribution_params": {"start": "2023-01-01", "end": "2024-12-31"}}
+    "domain_specific_transactional_table": [
+      {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 10000}, "unique": true},
+      {"name": "ref_id", "type": "foreign_key", "distribution_params": {}},
+      {"name": "amount", "type": "float", "distribution_params": {"distribution": "normal", "mean": 50, "std": 20}},
+      {"name": "date", "type": "date", "distribution_params": {"start": "2024-01-01", "end": "2025-12-31"}}
     ]
   },
   "relationships": [
-    {"parent_table": "
-    {"parent_table": "plans", "child_table": "subscriptions", "parent_key": "id", "child_key": "plan_id"},
-    {"parent_table": "users", "child_table": "workouts", "parent_key": "id", "child_key": "user_id"},
-    {"parent_table": "exercises", "child_table": "workouts", "parent_key": "id", "child_key": "exercise_id"}
+    {"parent_table": "domain_specific_reference_table", "child_table": "domain_specific_transactional_table", "parent_key": "id", "child_key": "ref_id"}
   ],
+  "outcome_curves": [],
   "events": []
 }

-##
-
-
-
+## SMART DEFAULTS FOR COLUMNS
+
+Age: int, normal, mean: 35, std: 12, min: 18, max: 80
+Price/Amount: float, exponential, scale: 50, min: 0.01, decimals: 2
+Rating (1-5): int, categorical, choices: [1,2,3,4,5], probabilities: [0.05, 0.08, 0.15, 0.32, 0.40]
+Quantity: int, poisson, lambda: 3, min: 1
+Duration (min): int, normal, mean: 45, std: 20, min: 5
+Boolean: boolean, probability: 0.5-0.9 depending on context
+Date: date, start/end based on user's time context
+
+## TEMPORAL PATTERNS & OUTCOME CURVES
+
+If the user mentions ANY time-based patterns, EXTRACT them as outcome_curves:
+
+Keywords to detect:
+- "peak", "spike", "surge" -> High relative_value (0.8-1.0)
+- "dip", "drop", "decline" -> Low relative_value (0.2-0.4)
+- "growth", "upward trend" -> pattern_type: "growth"
+- "seasonal", "monthly cycles" -> pattern_type: "seasonal"
+
+Output format:
+"outcome_curves": [
+  {
+    "table": "sales",
+    "column": "amount",
+    "time_column": "sale_date",
+    "pattern_type": "seasonal",
+    "description": "High in December, low in February",
+    "curve_points": [
+      {"month": 2, "relative_value": 0.3},
+      {"month": 12, "relative_value": 1.0}
+    ]
+  }
+]
+
+## DATE RANGE RULES
+- "Last 2 years" -> start: 2024-01-01, end: 2025-12-31
+- "Past year" -> start: 2025-01-01, end: 2025-12-31
+- "Historical data" -> start: 2020-01-01, end: 2025-12-31
+- No mention -> Default to current year (2025)
+
+Generate schemas ONLY based on the user's story. Be creative and domain-specific."""

-Generate schemas following this exact pattern. The reference table inline_data is the source of truth."""


 GRAPH_REVERSE_PROMPT = """You are Misata, an expert at reverse-engineering data patterns.
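As a sanity check on the prompt semantics: the curve_points pairs are plain month-to-multiplier targets, matching what ConstraintAwareCopulaGenerator._apply_curve consumes in the new copula module above. A self-contained illustration with hypothetical numbers:

# Illustrative only: month -> relative_value points act as per-month multipliers.
curve_points = [
    {"month": 2, "relative_value": 0.3},   # February dip
    {"month": 12, "relative_value": 1.0},  # December baseline
]
month_targets = {p["month"]: p["relative_value"] for p in curve_points}

base_amount = 200.0  # hypothetical mean sale amount
for month in (2, 7, 12):
    scaled = base_amount * month_targets.get(month, 1.0)  # unlisted months unchanged
    print(f"month {month:2d}: {scaled:.2f}")
# month  2: 60.00
# month  7: 200.00
# month 12: 200.00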
@@ -339,13 +281,16 @@ class LLMSchemaGenerator:

 {story}

-
-1.
-2. Create
-3.
-4.
+CRITICAL INSTRUCTIONS:
+1. Generate tables SPECIFIC to the domain described above. DO NOT use generic fitness/exercise examples.
+2. Create REFERENCE TABLES (is_reference: true) with inline_data for any lookup/configuration data relevant to THIS domain.
+3. Create TRANSACTIONAL TABLES (is_reference: false) with row_count for high-volume data like users, transactions, events, etc.
+4. Use foreign_key to link transactional tables to reference tables.
+5. Default row count for transactional tables: {default_rows}
+6. If the user mentions time patterns (peaks, dips, trends, growth), extract them as outcome_curves.
+7. If the user mentions a time range (e.g., "last 2 years"), set date column start/end accordingly.

-Output valid JSON.
+Output valid JSON. Be creative and domain-specific - DO NOT copy the system prompt examples."""


         response = self.client.chat.completions.create(
@@ -452,6 +397,25 @@ Include reference tables with inline_data for lookup values and transactional ta
             columns[table_name] = []
             for c in cols:
                 col_type = c.get("type", "text")
+
+                # Normalize LLM type variations to valid schema types
+                type_mapping = {
+                    "string": "text",
+                    "str": "text",
+                    "varchar": "text",
+                    "char": "text",
+                    "integer": "int",
+                    "number": "float",
+                    "decimal": "float",
+                    "double": "float",
+                    "timestamp": "datetime",
+                    "bool": "boolean",
+                    "enum": "categorical",
+                    "category": "categorical",
+                    "fk": "foreign_key",
+                }
+                col_type = type_mapping.get(col_type.lower(), col_type)
+
                 raw_params = c.get("distribution_params", {})
                 normalized_params = self._normalize_distribution_params(col_type, raw_params)

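The normalization is case-insensitive (the lookup lowercases first) and leaves unknown or already-valid types untouched; for example:

# Behavior of the type normalization above (mapping copied from the diff).
type_mapping = {
    "string": "text", "str": "text", "varchar": "text", "char": "text",
    "integer": "int", "number": "float", "decimal": "float", "double": "float",
    "timestamp": "datetime", "bool": "boolean",
    "enum": "categorical", "category": "categorical", "fk": "foreign_key",
}

for raw in ("VARCHAR", "Integer", "fk", "text", "geometry"):
    print(raw, "->", type_mapping.get(raw.lower(), raw))
# VARCHAR -> text
# Integer -> int
# fk -> foreign_key
# text -> text          (already valid, no mapping entry needed)
# geometry -> geometry  (unknown types pass through unchanged)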
@@ -508,6 +472,20 @@ Include reference tables with inline_data for lookup values and transactional ta
                 description=e.get("description")
             ))

+        # Parse outcome curves (temporal patterns from natural language)
+        outcome_curves = []
+        for c in schema_dict.get("outcome_curves", []):
+            if not all(key in c for key in ["table", "column"]):
+                continue
+            outcome_curves.append(OutcomeCurve(
+                table=c["table"],
+                column=c["column"],
+                time_column=c.get("time_column", "date"),
+                pattern_type=c.get("pattern_type", "seasonal"),
+                description=c.get("description"),
+                curve_points=c.get("curve_points", [])
+            ))
+
         return SchemaConfig(
             name=schema_dict.get("name", "Generated Dataset"),
             description=schema_dict.get("description"),
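The guard means a curve missing either "table" or "column" is silently dropped, while time_column and pattern_type fall back to defaults. A standalone sketch of the same logic using plain dicts, so it runs without misata installed (OutcomeCurve fields are assumed to match the keyword arguments shown in the diff):

# Mirrors the outcome-curve parsing rules above with plain dicts.
schema_dict = {
    "outcome_curves": [
        {"table": "sales", "column": "amount",
         "curve_points": [{"month": 12, "relative_value": 1.0}]},
        {"column": "amount"},  # missing "table" -> skipped by the all(...) guard
    ]
}

parsed = []
for c in schema_dict.get("outcome_curves", []):
    if not all(key in c for key in ["table", "column"]):
        continue
    parsed.append({
        "table": c["table"],
        "column": c["column"],
        "time_column": c.get("time_column", "date"),        # default applied
        "pattern_type": c.get("pattern_type", "seasonal"),  # default applied
        "curve_points": c.get("curve_points", []),
    })

print(len(parsed))  # 1 -- the incomplete entry was dropped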
@@ -515,15 +493,80 @@ Include reference tables with inline_data for lookup values and transactional ta
             columns=columns,
             relationships=relationships,
             events=events,
+            outcome_curves=outcome_curves,
             seed=schema_dict.get("seed", 42)
         )


+    def generate_from_story(self, story: str, use_research: bool = False) -> SchemaConfig:
+        """
+        Generate schema from a user story.
+
+        Args:
+            story: The natural language description.
+            use_research: If True, uses agent to find real companies for context.
+        """
+        context = ""
+        if use_research:
+            print("🕵️‍♂️ Deep Research Mode: ACTIVATED")
+            # Simple heuristic to find likely domain
+            domain = "SaaS"
+            if "fitness" in story.lower(): domain = "Fitness App"
+            elif "ecommerce" in story.lower() or "shop" in story.lower(): domain = "Ecommerce"
+            elif "finance" in story.lower(): domain = "Fintech"
+
+            try:
+                # Use Mock Agent (fast)
+                agent = DeepResearchAgent(use_mock=True)
+                entities = agent.search_entities(domain, "Competitors", limit=5)
+                names = [e['name'] for e in entities]
+                context = (
+                    f"\n\nREAL WORLD CONTEXT (INJECTED):\n"
+                    f"Research found these top players in {domain}: {', '.join(names)}.\n"
+                    f"Use these names as examples in the 'inline_data' for reference tables if relevant."
+                )
+            except Exception as e:
+                print(f"Research Agent Warning: {e}")
+
+        # Construct the final prompt
+        user_prompt = f"Story: {story}{context}\n\nGenerate the complete JSON schema."
+
+        completion = self.client.chat.completions.create(
+            messages=[
+                {
+                    "role": "system",
+                    "content": SYSTEM_PROMPT,
+                },
+                {
+                    "role": "user",
+                    "content": user_prompt,
+                }
+            ],
+            model=self.model,
+            temperature=0.1,  # Low temp for JSON consistency
+            response_format={"type": "json_object"},
+        )
+
+        response_content = completion.choices[0].message.content
+        try:
+            schema_dict = json.loads(response_content)
+            return self._parse_schema(schema_dict)
+        except json.JSONDecodeError:
+            # Fallback text parsing if JSON mode fails (unlikely with Llama 3)
+            # For now, just raise
+            raise ValueError(f"Failed to generate valid JSON. Raw response: {response_content[:100]}...")
+
+    def generate_from_graph(self, description: str) -> SchemaConfig:
+        """Reverse engineer schema from graph description."""
+        # Similar to above but uses GRAPH_REVERSE_PROMPT
+        # For brevity, implementing basic pass-through
+        return self.generate_from_story(description)
+
 # Convenience functions
-def generate_schema(story: str, api_key: Optional[str] = None) -> SchemaConfig:
+def generate_schema(story: str, api_key: Optional[str] = None, use_research: bool = False) -> SchemaConfig:
     """Quick helper to generate schema from story."""
     generator = LLMSchemaGenerator(api_key=api_key)
-    return generator.generate_from_story(story)
+    return generator.generate_from_story(story, use_research=use_research)


 def generate_from_chart(description: str, api_key: Optional[str] = None) -> SchemaConfig: