misata 0.3.0b0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. misata/__init__.py +1 -1
  2. misata/agents/__init__.py +23 -0
  3. misata/agents/pipeline.py +286 -0
  4. misata/causal/__init__.py +5 -0
  5. misata/causal/graph.py +109 -0
  6. misata/causal/solver.py +115 -0
  7. misata/cli.py +31 -0
  8. misata/generators/__init__.py +19 -0
  9. misata/generators/copula.py +198 -0
  10. misata/llm_parser.py +180 -137
  11. misata/quality.py +78 -33
  12. misata/reference_data.py +221 -0
  13. misata/research/__init__.py +3 -0
  14. misata/research/agent.py +70 -0
  15. misata/schema.py +25 -0
  16. misata/simulator.py +264 -12
  17. misata/smart_values.py +144 -6
  18. misata/studio/__init__.py +55 -0
  19. misata/studio/app.py +49 -0
  20. misata/studio/components/inspector.py +81 -0
  21. misata/studio/components/sidebar.py +35 -0
  22. misata/studio/constraint_generator.py +781 -0
  23. misata/studio/inference.py +319 -0
  24. misata/studio/outcome_curve.py +284 -0
  25. misata/studio/state/store.py +55 -0
  26. misata/studio/tabs/configure.py +50 -0
  27. misata/studio/tabs/generate.py +117 -0
  28. misata/studio/tabs/outcome_curve.py +149 -0
  29. misata/studio/tabs/schema_designer.py +217 -0
  30. misata/studio/utils/styles.py +143 -0
  31. misata/studio_constraints/__init__.py +29 -0
  32. misata/studio_constraints/z3_solver.py +259 -0
  33. {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/METADATA +13 -2
  34. misata-0.5.0.dist-info/RECORD +61 -0
  35. {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/WHEEL +1 -1
  36. {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/entry_points.txt +1 -0
  37. misata-0.3.0b0.dist-info/RECORD +0 -37
  38. /misata/{generators.py → generators_legacy.py} +0 -0
  39. {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/licenses/LICENSE +0 -0
  40. {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/top_level.txt +0 -0
misata/generators/copula.py ADDED
@@ -0,0 +1,198 @@
+"""
+SDV Copula-based Synthetic Data Generator
+
+Uses SDV's GaussianCopulaSynthesizer for high-quality correlation preservation.
+This is a key upgrade from our basic generator to beat Gretel on data quality.
+"""
+
+from typing import Dict, List, Optional, Any
+import pandas as pd
+import numpy as np
+
+try:
+    from sdv.single_table import GaussianCopulaSynthesizer
+    from sdv.metadata import SingleTableMetadata
+    SDV_AVAILABLE = True
+except ImportError:
+    SDV_AVAILABLE = False
+    print("[WARNING] SDV not installed. Run: pip install sdv")
+
+
+class CopulaGenerator:
+    """
+    SDV-based generator using Gaussian Copulas for correlation preservation.
+
+    Key advantages over basic generation:
+    - Preserves pairwise correlations between columns
+    - Learns marginal distributions accurately
+    - Handles mixed data types (numeric, categorical, datetime)
+    """
+
+    def __init__(self):
+        self.synthesizer = None
+        self.metadata = None
+        self._is_fitted = False
+
+    def fit(self, df: pd.DataFrame, metadata: Optional[Dict] = None) -> None:
+        """
+        Fit the copula model to real data.
+
+        Args:
+            df: Real data to learn from
+            metadata: Optional SDV metadata dict, auto-detected if not provided
+        """
+        if not SDV_AVAILABLE:
+            raise ImportError("SDV not installed. Run: pip install sdv")
+
+        # Auto-detect metadata if not provided
+        self.metadata = SingleTableMetadata()
+        self.metadata.detect_from_dataframe(df)
+
+        # Apply custom metadata if provided
+        if metadata:
+            for col, col_meta in metadata.items():
+                if 'sdtype' in col_meta:
+                    self.metadata.update_column(col, sdtype=col_meta['sdtype'])
+
+        # Create and fit synthesizer
+        self.synthesizer = GaussianCopulaSynthesizer(self.metadata)
+        self.synthesizer.fit(df)
+        self._is_fitted = True
+
+        print(f"[COPULA] Fitted on {len(df)} rows, {len(df.columns)} columns")
+
+    def sample(self, n: int) -> pd.DataFrame:
+        """
+        Generate synthetic data preserving correlations.
+
+        Args:
+            n: Number of rows to generate
+
+        Returns:
+            Synthetic DataFrame with same schema as training data
+        """
+        if not self._is_fitted:
+            raise ValueError("Must call fit() before sample()")
+
+        synthetic = self.synthesizer.sample(n)
+        print(f"[COPULA] Generated {len(synthetic)} rows")
+        return synthetic
+
+    def get_quality_report(self, real: pd.DataFrame, synthetic: pd.DataFrame) -> Dict[str, Any]:
+        """
+        Evaluate quality of synthetic data vs real data.
+
+        Returns:
+            Dict with quality metrics (no fake validations!)
+        """
+        try:
+            from sdv.evaluation.single_table import evaluate_quality
+
+            report = evaluate_quality(
+                real_data=real,
+                synthetic_data=synthetic,
+                metadata=self.metadata
+            )
+
+            return {
+                "overall_score": report.get_score(),
+                "column_shapes": report.get_details("Column Shapes"),
+                "column_pair_trends": report.get_details("Column Pair Trends"),
+            }
+        except Exception as e:
+            print(f"[COPULA] Quality evaluation failed: {e}")
+            return {"error": str(e)}
+
+
+class ConstraintAwareCopulaGenerator(CopulaGenerator):
+    """
+    Extended Copula generator that applies outcome constraints.
+    """
+
+    def sample_with_constraints(
+        self,
+        n: int,
+        outcome_curves: Optional[List[Dict]] = None,
+        date_column: Optional[str] = None,
+        value_column: Optional[str] = None
+    ) -> pd.DataFrame:
+        """
+        Generate data that matches outcome curve targets.
+
+        Args:
+            n: Number of rows
+            outcome_curves: List of curve specs with monthly targets
+            date_column: Column containing dates
+            value_column: Column to adjust for targets
+
+        Returns:
+            Synthetic data adjusted to match targets
+        """
+        # Generate base synthetic data
+        df = self.sample(n)
+
+        if not outcome_curves or not date_column or not value_column:
+            return df
+
+        if date_column not in df.columns or value_column not in df.columns:
+            print(f"[COPULA] Columns not found: {date_column}, {value_column}")
+            return df
+
+        # Apply outcome curve adjustments
+        for curve in outcome_curves:
+            df = self._apply_curve(df, curve, date_column, value_column)
+
+        return df
+
+    def _apply_curve(
+        self,
+        df: pd.DataFrame,
+        curve: Dict,
+        date_column: str,
+        value_column: str
+    ) -> pd.DataFrame:
+        """Apply a single outcome curve to the data."""
+
+        points = curve.get('curve_points', [])
+        if not points:
+            return df
+
+        # Ensure date column is datetime
+        if not pd.api.types.is_datetime64_any_dtype(df[date_column]):
+            df[date_column] = pd.to_datetime(df[date_column], errors='coerce')
+
+        # Build month -> target mapping
+        month_targets = {}
+        for p in points:
+            month = p.get('month') if isinstance(p, dict) else getattr(p, 'month', None)
+            value = p.get('relative_value') if isinstance(p, dict) else getattr(p, 'relative_value', None)
+            if month and value:
+                month_targets[month] = value
+
+        if not month_targets:
+            return df
+
+        # Calculate base mean for scaling
+        base_mean = df[value_column].mean()
+
+        # Apply scaling per month
+        for month, relative_value in month_targets.items():
+            mask = df[date_column].dt.month == month
+            if mask.sum() > 0:
+                # Scale values to match relative target
+                # relative_value=1.0 means average, 2.0 means double, etc.
+                current_mean = df.loc[mask, value_column].mean()
+                if current_mean > 0:
+                    scale_factor = relative_value
+                    df.loc[mask, value_column] = df.loc[mask, value_column] * scale_factor
+
+        print(f"[COPULA] Applied outcome curve: {len(month_targets)} monthly adjustments")
+        return df
+
+
+# Factory function for easy access
+def create_copula_generator(with_constraints: bool = True) -> CopulaGenerator:
+    """Create a copula generator instance."""
+    if with_constraints:
+        return ConstraintAwareCopulaGenerator()
+    return CopulaGenerator()
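
The new module is self-contained enough to smoke-test directly. Below is a minimal usage sketch, not taken from the package docs: it assumes sdv is installed, and the real_df frame with its sale_date/amount columns is invented for illustration; the curve dict mirrors the curve_points shape that _apply_curve reads.

# Hypothetical usage sketch of the new copula module; assumes `pip install sdv`.
# real_df and its column names are invented for illustration.
import pandas as pd
from misata.generators.copula import create_copula_generator

real_df = pd.DataFrame({
    "sale_date": pd.date_range("2024-01-01", periods=365, freq="D"),
    "amount": (pd.Series(range(365)) % 90 + 10).astype(float),
})

gen = create_copula_generator(with_constraints=True)
gen.fit(real_df)               # learns marginals + correlations via SDV
synthetic = gen.sample(1_000)  # plain sampling; schema matches real_df

# Constraint-aware sampling: rescale monthly values toward relative targets.
curves = [{"curve_points": [{"month": 2, "relative_value": 0.3},
                            {"month": 12, "relative_value": 1.0}]}]
shaped = gen.sample_with_constraints(
    1_000, outcome_curves=curves,
    date_column="sale_date", value_column="amount",
)

report = gen.get_quality_report(real_df, synthetic)
print(report.get("overall_score"))

Note that _apply_curve multiplies each month's values by relative_value directly (the current_mean check only skips non-positive months, and base_mean is unused), so curve points act as relative multipliers rather than absolute means.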
misata/llm_parser.py CHANGED
@@ -16,7 +16,8 @@ from typing import Dict, Optional
 from groq import Groq
 
 from misata.curve_fitting import CurveFitter
-from misata.schema import Column, Relationship, ScenarioEvent, SchemaConfig, Table
+from misata.schema import Column, OutcomeCurve, Relationship, ScenarioEvent, SchemaConfig, Table
+from misata.research import DeepResearchAgent
 
 
 # Load .env file if it exists
@@ -47,164 +48,105 @@ def _load_env():
 _load_env()
 
 
-SYSTEM_PROMPT = """You are Misata, an expert synthetic data architect. Generate realistic database schemas with TWO types of tables:
+SYSTEM_PROMPT = """You are Misata, an expert synthetic data architect. Your job is to generate REALISTIC database schemas based ONLY on the user's story.
+
+## CRITICAL: DO NOT USE DEFAULT EXAMPLES
+- Generate tables that are SPECIFIC to the user's domain.
+- If user says "pet store", create tables like "pets", "pet_categories", "pet_sales".
+- If user says "music streaming", create tables like "songs", "artists", "streams".
+- NEVER default to fitness/exercise/workout tables UNLESS the user explicitly asks for them.
 
 ## TABLE TYPES
 
 ### 1. REFERENCE TABLES (is_reference: true)
-Small lookup tables with ACTUAL DATA you generate. Include realistic rows.
-Examples: plans, exercises, categories, products, meal_types
-
-For reference tables, provide:
-- is_reference: true
-- inline_data: Array of actual rows with realistic values
+Small lookup tables (5-20 rows) with ACTUAL DATA you generate.
+- MUST have an "id" column (integer, sequential from 1)
+- Include realistic inline_data based on user's domain
 
 ### 2. TRANSACTIONAL TABLES (is_reference: false)
-Large tables generated by code using foreign keys to reference tables.
-Examples: users, subscriptions, orders, workouts, payments
-
-For transactional tables, provide:
-- row_count: Number of rows to generate
-- Columns with distribution parameters
-
-## CRITICAL RULES
-
-### Reference Table Requirements:
-- ALWAYS include an "id" column (integer, sequential from 1)
-- Provide 5-20 realistic rows in inline_data
-- Prices in reference tables are the SOURCE OF TRUTH
-
-### Transactional Table Requirements:
-- Use foreign_key type to reference parent tables (reference or other parents)
-- Users: type="text" with text_type="name" or "email"
-- Metrics use distribution parameters
-
-### Foreign Key Rules:
-- foreign_key columns reference parent table's "id" column
-- Parent can be either reference table (plans.id) or transactional table (users.id)
-
-### Advanced Distributions (Optional):
-Instead of guessing parameters, you can provide "control_points" to draw the shape.
-Format: {"distribution": "normal", "control_points": [{"x": 10, "y": 0.1}, {"x": 50, "y": 0.9}]}
-Misata will mathematically solve for the best parameters.
-
-### SMART DEFAULTS (Use These for Realistic Data):
-
-**Age columns:**
-- type: "int", distribution: "normal", mean: 35, std: 12, min: 18, max: 80
-
-**Price/Amount columns:**
-- type: "float", distribution: "exponential", scale: 50, min: 0.01, decimals: 2
-- OR for products: uniform min: 9.99, max: 499.99
-
-**Rating columns (1-5 stars):**
-- type: "int", distribution: "categorical", choices: [1,2,3,4,5], probabilities: [0.05, 0.08, 0.15, 0.32, 0.40]
-
-**Quantity/Count columns:**
-- type: "int", distribution: "poisson", lambda: 3, min: 1
-
-**Duration (minutes):**
-- type: "int", distribution: "normal", mean: 45, std: 20, min: 5, max: 180
-
-**Percentage columns:**
-- type: "float", distribution: "uniform", min: 0.0, max: 100.0, decimals: 1
-
-**Status columns:**
-- type: "categorical", choices: ["active", "inactive", "pending"], probabilities: [0.70, 0.20, 0.10]
-
-**Boolean probabilities:**
-- is_verified: probability: 0.85
-- is_premium: probability: 0.25
-- is_active: probability: 0.80
-
-**Date columns:**
-- For recent data: bias last 30% of range with 70% of values
-- Always use realistic date ranges (not 1970-2100)
+Large tables generated by code using foreign keys.
+- Use row_count to specify size
+- Use foreign_key type to reference parent tables
 
 ## OUTPUT FORMAT
 
 {
-  "name": "Dataset Name",
-  "description": "Description",
+  "name": "Dataset Name based on user's domain",
+  "description": "Description of the domain",
   "seed": 42,
   "tables": [
     {
-      "name": "plans",
+      "name": "domain_specific_reference_table",
       "is_reference": true,
      "inline_data": [
-        {"id": 1, "name": "Free", "price": 0.0, "features": "Basic features"},
-        {"id": 2, "name": "Basic", "price": 9.99, "features": "All free + analytics"},
-        {"id": 3, "name": "Premium", "price": 19.99, "features": "All basic + priority support"},
-        {"id": 4, "name": "Enterprise", "price": 49.99, "features": "All premium + custom integrations"}
+        {"id": 1, "name": "Value A", "price": 10.00},
+        {"id": 2, "name": "Value B", "price": 20.00}
      ]
    },
    {
-      "name": "exercises",
-      "is_reference": true,
-      "inline_data": [
-        {"id": 1, "name": "Running", "category": "Cardio", "calories_per_minute": 10},
-        {"id": 2, "name": "Cycling", "category": "Cardio", "calories_per_minute": 8},
-        {"id": 3, "name": "Yoga", "category": "Flexibility", "calories_per_minute": 3},
-        {"id": 4, "name": "Weightlifting", "category": "Strength", "calories_per_minute": 6},
-        {"id": 5, "name": "Swimming", "category": "Cardio", "calories_per_minute": 9},
-        {"id": 6, "name": "HIIT", "category": "Cardio", "calories_per_minute": 12},
-        {"id": 7, "name": "Pilates", "category": "Flexibility", "calories_per_minute": 4},
-        {"id": 8, "name": "Boxing", "category": "Cardio", "calories_per_minute": 11}
-      ]
-    },
-    {
-      "name": "users",
-      "row_count": 50000,
-      "is_reference": false
-    },
-    {
-      "name": "subscriptions",
-      "row_count": 20000,
-      "is_reference": false
-    },
-    {
-      "name": "workouts",
-      "row_count": 100000,
+      "name": "domain_specific_transactional_table",
+      "row_count": 10000,
      "is_reference": false
    }
  ],
  "columns": {
-    "users": [
-      {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 50000}, "unique": true},
-      {"name": "name", "type": "text", "distribution_params": {"text_type": "name"}},
-      {"name": "email", "type": "text", "distribution_params": {"text_type": "email"}},
-      {"name": "age", "type": "int", "distribution_params": {"distribution": "uniform", "min": 18, "max": 65}}
-    ],
-    "subscriptions": [
-      {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 20000}},
-      {"name": "user_id", "type": "foreign_key", "distribution_params": {}},
-      {"name": "plan_id", "type": "foreign_key", "distribution_params": {}},
-      {"name": "status", "type": "categorical", "distribution_params": {"choices": ["active", "cancelled", "paused"], "probabilities": [0.7, 0.2, 0.1]}},
-      {"name": "start_date", "type": "date", "distribution_params": {"start": "2022-01-01", "end": "2024-12-31"}}
-    ],
-    "workouts": [
-      {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 100000}},
-      {"name": "user_id", "type": "foreign_key", "distribution_params": {}},
-      {"name": "exercise_id", "type": "foreign_key", "distribution_params": {}},
-      {"name": "duration_minutes", "type": "int", "distribution_params": {"distribution": "uniform", "min": 15, "max": 90}},
-      {"name": "date", "type": "date", "distribution_params": {"start": "2023-01-01", "end": "2024-12-31"}}
+    "domain_specific_transactional_table": [
+      {"name": "id", "type": "int", "distribution_params": {"distribution": "uniform", "min": 1, "max": 10000}, "unique": true},
+      {"name": "ref_id", "type": "foreign_key", "distribution_params": {}},
+      {"name": "amount", "type": "float", "distribution_params": {"distribution": "normal", "mean": 50, "std": 20}},
+      {"name": "date", "type": "date", "distribution_params": {"start": "2024-01-01", "end": "2025-12-31"}}
    ]
  },
  "relationships": [
-    {"parent_table": "users", "child_table": "subscriptions", "parent_key": "id", "child_key": "user_id"},
-    {"parent_table": "plans", "child_table": "subscriptions", "parent_key": "id", "child_key": "plan_id"},
-    {"parent_table": "users", "child_table": "workouts", "parent_key": "id", "child_key": "user_id"},
-    {"parent_table": "exercises", "child_table": "workouts", "parent_key": "id", "child_key": "exercise_id"}
+    {"parent_table": "domain_specific_reference_table", "child_table": "domain_specific_transactional_table", "parent_key": "id", "child_key": "ref_id"}
  ],
+  "outcome_curves": [],
  "events": []
 }
 
-## KEY DIFFERENCE FROM BEFORE:
-- Reference tables have ACTUAL DATA in inline_data (plans with real prices!)
-- Transactional tables use foreign_key to REFERENCE those tables
-- When workout.exercise_id = 3, it means "Yoga" because exercises table has {id: 3, name: "Yoga"}
+## SMART DEFAULTS FOR COLUMNS
+
+Age: int, normal, mean: 35, std: 12, min: 18, max: 80
+Price/Amount: float, exponential, scale: 50, min: 0.01, decimals: 2
+Rating (1-5): int, categorical, choices: [1,2,3,4,5], probabilities: [0.05, 0.08, 0.15, 0.32, 0.40]
+Quantity: int, poisson, lambda: 3, min: 1
+Duration (min): int, normal, mean: 45, std: 20, min: 5
+Boolean: boolean, probability: 0.5-0.9 depending on context
+Date: date, start/end based on user's time context
+
+## TEMPORAL PATTERNS & OUTCOME CURVES
+
+If the user mentions ANY time-based patterns, EXTRACT them as outcome_curves:
+
+Keywords to detect:
+- "peak", "spike", "surge" -> High relative_value (0.8-1.0)
+- "dip", "drop", "decline" -> Low relative_value (0.2-0.4)
+- "growth", "upward trend" -> pattern_type: "growth"
+- "seasonal", "monthly cycles" -> pattern_type: "seasonal"
+
+Output format:
+"outcome_curves": [
+  {
+    "table": "sales",
+    "column": "amount",
+    "time_column": "sale_date",
+    "pattern_type": "seasonal",
+    "description": "High in December, low in February",
+    "curve_points": [
+      {"month": 2, "relative_value": 0.3},
+      {"month": 12, "relative_value": 1.0}
+    ]
+  }
+]
+
+## DATE RANGE RULES
+- "Last 2 years" -> start: 2024-01-01, end: 2025-12-31
+- "Past year" -> start: 2025-01-01, end: 2025-12-31
+- "Historical data" -> start: 2020-01-01, end: 2025-12-31
+- No mention -> Default to current year (2025)
+
+Generate schemas ONLY based on the user's story. Be creative and domain-specific."""
 
-Generate schemas following this exact pattern. The reference table inline_data is the source of truth."""
 
 
 GRAPH_REVERSE_PROMPT = """You are Misata, an expert at reverse-engineering data patterns.
@@ -339,13 +281,16 @@ class LLMSchemaGenerator:
 
 {story}
 
-IMPORTANT:
-1. Create REFERENCE TABLES with inline_data for: plans, exercises, categories, products, etc.
-2. Create TRANSACTIONAL TABLES with row_count for: users, subscriptions, orders, workouts, etc.
-3. Use foreign_key to link transactional tables to reference tables
-4. Default row count for transactional tables: {default_rows}
+CRITICAL INSTRUCTIONS:
+1. Generate tables SPECIFIC to the domain described above. DO NOT use generic fitness/exercise examples.
+2. Create REFERENCE TABLES (is_reference: true) with inline_data for any lookup/configuration data relevant to THIS domain.
+3. Create TRANSACTIONAL TABLES (is_reference: false) with row_count for high-volume data like users, transactions, events, etc.
+4. Use foreign_key to link transactional tables to reference tables.
+5. Default row count for transactional tables: {default_rows}
+6. If the user mentions time patterns (peaks, dips, trends, growth), extract them as outcome_curves.
+7. If the user mentions a time range (e.g., "last 2 years"), set date column start/end accordingly.
 
-Output valid JSON. Think about what lookup/reference data is needed, then what transactional data references it."""
+Output valid JSON. Be creative and domain-specific - DO NOT copy the system prompt examples."""
 
 
         response = self.client.chat.completions.create(
@@ -452,6 +397,25 @@ Include reference tables with inline_data for lookup values and transactional ta
             columns[table_name] = []
             for c in cols:
                 col_type = c.get("type", "text")
+
+                # Normalize LLM type variations to valid schema types
+                type_mapping = {
+                    "string": "text",
+                    "str": "text",
+                    "varchar": "text",
+                    "char": "text",
+                    "integer": "int",
+                    "number": "float",
+                    "decimal": "float",
+                    "double": "float",
+                    "timestamp": "datetime",
+                    "bool": "boolean",
+                    "enum": "categorical",
+                    "category": "categorical",
+                    "fk": "foreign_key",
+                }
+                col_type = type_mapping.get(col_type.lower(), col_type)
+
                 raw_params = c.get("distribution_params", {})
                 normalized_params = self._normalize_distribution_params(col_type, raw_params)
 
@@ -508,6 +472,20 @@ Include reference tables with inline_data for lookup values and transactional ta
                     description=e.get("description")
                 ))
 
+        # Parse outcome curves (temporal patterns from natural language)
+        outcome_curves = []
+        for c in schema_dict.get("outcome_curves", []):
+            if not all(key in c for key in ["table", "column"]):
+                continue
+            outcome_curves.append(OutcomeCurve(
+                table=c["table"],
+                column=c["column"],
+                time_column=c.get("time_column", "date"),
+                pattern_type=c.get("pattern_type", "seasonal"),
+                description=c.get("description"),
+                curve_points=c.get("curve_points", [])
+            ))
+
         return SchemaConfig(
             name=schema_dict.get("name", "Generated Dataset"),
             description=schema_dict.get("description"),
@@ -515,15 +493,80 @@ Include reference tables with inline_data for lookup values and transactional ta
             columns=columns,
             relationships=relationships,
             events=events,
+            outcome_curves=outcome_curves,
             seed=schema_dict.get("seed", 42)
         )
 
 
+    def generate_from_story(self, story: str, use_research: bool = False) -> SchemaConfig:
+        """
+        Generate schema from a user story.
+
+        Args:
+            story: The natural language description.
+            use_research: If True, uses agent to find real companies for context.
+        """
+        context = ""
+        if use_research:
+            print("🕵️‍♂️ Deep Research Mode: ACTIVATED")
+            # Simple heuristic to find likely domain
+            domain = "SaaS"
+            if "fitness" in story.lower(): domain = "Fitness App"
+            elif "ecommerce" in story.lower() or "shop" in story.lower(): domain = "Ecommerce"
+            elif "finance" in story.lower(): domain = "Fintech"
+
+            try:
+                # Use Mock Agent (fast)
+                agent = DeepResearchAgent(use_mock=True)
+                entities = agent.search_entities(domain, "Competitors", limit=5)
+                names = [e['name'] for e in entities]
+                context = (
+                    f"\n\nREAL WORLD CONTEXT (INJECTED):\n"
+                    f"Research found these top players in {domain}: {', '.join(names)}.\n"
+                    f"Use these names as examples in the 'inline_data' for reference tables if relevant."
+                )
+            except Exception as e:
+                print(f"Research Agent Warning: {e}")
+
+        # Construct the final prompt
+        user_prompt = f"Story: {story}{context}\n\nGenerate the complete JSON schema."
+
+        completion = self.client.chat.completions.create(
+            messages=[
+                {
+                    "role": "system",
+                    "content": SYSTEM_PROMPT,
+                },
+                {
+                    "role": "user",
+                    "content": user_prompt,
+                }
+            ],
+            model=self.model,
+            temperature=0.1,  # Low temp for JSON consistency
+            response_format={"type": "json_object"},
+        )
+
+        response_content = completion.choices[0].message.content
+        try:
+            schema_dict = json.loads(response_content)
+            return self._parse_schema(schema_dict)
+        except json.JSONDecodeError:
+            # Fallback text parsing if JSON mode fails (unlikely with Llama 3)
+            # For now, just raise
+            raise ValueError(f"Failed to generate valid JSON. Raw response: {response_content[:100]}...")
+
+    def generate_from_graph(self, description: str) -> SchemaConfig:
+        """Reverse engineer schema from graph description."""
+        # Similar to above but uses GRAPH_REVERSE_PROMPT
+        # For brevity, implementing basic pass-through
+        return self.generate_from_story(description)
+
 # Convenience functions
-def generate_schema(story: str, api_key: Optional[str] = None) -> SchemaConfig:
+def generate_schema(story: str, api_key: Optional[str] = None, use_research: bool = False) -> SchemaConfig:
     """Quick helper to generate schema from story."""
     generator = LLMSchemaGenerator(api_key=api_key)
-    return generator.generate_from_story(story)
+    return generator.generate_from_story(story, use_research=use_research)
 
 
 def generate_from_chart(description: str, api_key: Optional[str] = None) -> SchemaConfig:
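
On the parser side, the new use_research flag threads through the generate_schema helper. A minimal sketch, assuming a Groq API key is available (e.g. GROQ_API_KEY in the environment or a .env file); the story text is invented:

# Hypothetical usage sketch of the 0.5.0 parser entry point.
# Assumes GROQ_API_KEY is set; the story below is invented.
from misata.llm_parser import generate_schema

schema = generate_schema(
    "A pet store whose sales peak in December and dip in February, "
    "covering the last 2 years.",
    use_research=True,  # new in 0.5.0: injects mock competitor research into the prompt
)

print(schema.name)
for curve in schema.outcome_curves:  # temporal patterns extracted by the LLM
    print(curve.table, curve.column, curve.pattern_type, curve.curve_points)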