misata-0.3.0b0-py3-none-any.whl → misata-0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. misata/__init__.py +1 -1
  2. misata/agents/__init__.py +23 -0
  3. misata/agents/pipeline.py +286 -0
  4. misata/causal/__init__.py +5 -0
  5. misata/causal/graph.py +109 -0
  6. misata/causal/solver.py +115 -0
  7. misata/cli.py +31 -0
  8. misata/generators/__init__.py +19 -0
  9. misata/generators/copula.py +198 -0
  10. misata/llm_parser.py +180 -137
  11. misata/quality.py +78 -33
  12. misata/reference_data.py +221 -0
  13. misata/research/__init__.py +3 -0
  14. misata/research/agent.py +70 -0
  15. misata/schema.py +25 -0
  16. misata/simulator.py +264 -12
  17. misata/smart_values.py +144 -6
  18. misata/studio/__init__.py +55 -0
  19. misata/studio/app.py +49 -0
  20. misata/studio/components/inspector.py +81 -0
  21. misata/studio/components/sidebar.py +35 -0
  22. misata/studio/constraint_generator.py +781 -0
  23. misata/studio/inference.py +319 -0
  24. misata/studio/outcome_curve.py +284 -0
  25. misata/studio/state/store.py +55 -0
  26. misata/studio/tabs/configure.py +50 -0
  27. misata/studio/tabs/generate.py +117 -0
  28. misata/studio/tabs/outcome_curve.py +149 -0
  29. misata/studio/tabs/schema_designer.py +217 -0
  30. misata/studio/utils/styles.py +143 -0
  31. misata/studio_constraints/__init__.py +29 -0
  32. misata/studio_constraints/z3_solver.py +259 -0
  33. {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/METADATA +13 -2
  34. misata-0.5.0.dist-info/RECORD +61 -0
  35. {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/WHEEL +1 -1
  36. {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/entry_points.txt +1 -0
  37. misata-0.3.0b0.dist-info/RECORD +0 -37
  38. misata/{generators.py → generators_legacy.py} +0 -0
  39. {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/licenses/LICENSE +0 -0
  40. {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/top_level.txt +0 -0
misata/quality.py CHANGED
@@ -11,13 +11,15 @@ This module validates generated synthetic data for:
 from typing import Dict, List, Any, Optional, Tuple
 from dataclasses import dataclass, field
 import warnings
+import numpy as np
+import pandas as pd  # type: ignore


 @dataclass
 class QualityIssue:
     """Represents a single data quality issue."""
     severity: str  # "error", "warning", "info"
-    category: str  # "distribution", "integrity", "temporal", "domain"
+    category: str  # "distribution", "integrity", "temporal", "domain", "time_series"
     table: str
     column: Optional[str]
     message: str
@@ -107,19 +109,12 @@ class DataQualityChecker:

     def check_distribution_plausibility(
         self,
-        df: "pd.DataFrame",
+        df: pd.DataFrame,
         table_name: str,
     ) -> None:
         """
         Check if numeric distributions are plausible for their domains.
-
-        Args:
-            df: DataFrame to check
-            table_name: Name of the table
         """
-        import pandas as pd
-        import numpy as np
-
         for col in df.columns:
             col_lower = col.lower()

@@ -162,15 +157,11 @@ class DataQualityChecker:

     def check_referential_integrity(
         self,
-        tables: Dict[str, "pd.DataFrame"],
+        tables: Dict[str, pd.DataFrame],
         relationships: List[Any],
     ) -> None:
         """
         Verify all foreign key references are valid.
-
-        Args:
-            tables: Dict of table_name -> DataFrame
-            relationships: List of Relationship objects
         """
         for rel in relationships:
             parent_table = rel.parent_table
@@ -221,19 +212,12 @@ class DataQualityChecker:

     def check_temporal_consistency(
         self,
-        df: "pd.DataFrame",
+        df: pd.DataFrame,
         table_name: str,
     ) -> None:
         """
         Ensure temporal columns are consistent.
-
-        Checks:
-        - created_at < updated_at
-        - start_date < end_date
-        - birth_date in past
         """
-        import pandas as pd
-
         date_cols = [c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])]

         # Check created < updated
@@ -266,23 +250,83 @@ class DataQualityChecker:
                 f"{future_births} rows have birth_date in the future",
                 {"violation_count": future_births}
             )
+
+    def check_time_series_properties(
+        self,
+        df: pd.DataFrame,
+        table_name: str,
+    ) -> None:
+        """
+        Analyze time-series properties (autocorrelation, trend, seasonality).
+        Adds 'info'-level insights to the report.
+        """
+        # Find a date column
+        date_cols = [c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])]
+        if not date_cols:
+            return
+
+        time_col = date_cols[0]  # Use the first date column
+
+        # Find metric columns (float/int)
+        numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and c not in ['id']]
+
+        for col in numeric_cols:
+            # Skip low-cardinality columns
+            if df[col].nunique() < 10:
+                continue
+
+            # Sort by time
+            ts_df = df.sort_values(time_col)
+            series = ts_df[col].values
+
+            if len(series) < 5:
+                continue
+
+            # 1. Lag-1 autocorrelation (simple manual calculation)
+            if len(series) > 2:
+                # Drop NaNs before correlating
+                s_clean = series[~np.isnan(series)]
+                if len(s_clean) > 2:
+                    lag1 = np.corrcoef(s_clean[:-1], s_clean[1:])[0, 1]
+
+                    if not np.isnan(lag1):
+                        if abs(lag1) > 0.7:
+                            msg = f"Strong temporal structure detected (lag-1 autocorrelation: {lag1:.2f})"
+                            self._add_issue("info", "time_series", table_name, col, msg, {"lag1": lag1})
+                        elif abs(lag1) < 0.1:
+                            msg = f"Data appears random/noisy (lag-1 autocorrelation: {lag1:.2f})"
+                            self._add_issue("info", "time_series", table_name, col, msg, {"lag1": lag1})
+
+            # 2. Trend detection
+            if len(series) > 10:
+                # Linear fit
+                x = np.arange(len(series))
+                # Forward-fill NaNs before fitting (fillna(method=...) is deprecated)
+                s_filled = pd.Series(series).ffill().fillna(0).values
+
+                slope, _ = np.polyfit(x, s_filled, 1)
+
+                # Normalize slope to % change per step relative to the mean
+                mean_val = np.mean(s_filled)
+                if abs(mean_val) > 0.01:
+                    normalized_slope = slope / mean_val
+                    if abs(normalized_slope) * len(series) > 0.2:  # Total change > 20%
+                        trend_dir = "growth" if slope > 0 else "decline"
+                        self._add_issue(
+                            "info", "time_series", table_name, col,
+                            f"Significant {trend_dir} trend detected",
+                            {"slope": slope}
+                        )

     def check_all(
         self,
-        tables: Dict[str, "pd.DataFrame"],
+        tables: Dict[str, pd.DataFrame],
         relationships: Optional[List[Any]] = None,
         schema: Optional[Any] = None,
     ) -> QualityReport:
         """
         Run all quality checks and generate a report.
-
-        Args:
-            tables: Dict of table_name -> DataFrame
-            relationships: Optional list of Relationship objects
-            schema: Optional SchemaConfig for additional checks
-
-        Returns:
-            QualityReport with score and issues
         """
         self.issues = []  # Reset

@@ -290,6 +334,7 @@ class DataQualityChecker:
         for table_name, df in tables.items():
             self.check_distribution_plausibility(df, table_name)
             self.check_temporal_consistency(df, table_name)
+            self.check_time_series_properties(df, table_name)

         # Check referential integrity
         if relationships:
@@ -303,7 +348,7 @@ class DataQualityChecker:
             elif issue.severity == "warning":
                 base_score -= 3
             else:
-                base_score -= 1
+                base_score -= 1  # Info subtracts 1 for now (maybe 0 later)

         score = max(0, min(100, base_score))

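For a feel of the arithmetic above, a rough sketch of the scoring loop; the penalty for "error" issues and the starting base_score sit outside this hunk, so the -10 and 100 below are assumptions for illustration only:

# Hedged sketch of the scoring shown above; -10 per error and a base of
# 100 are assumed, only the -3 / -1 penalties are visible in this hunk.
severities = ["error", "warning", "warning", "info", "info", "info"]
base_score = 100  # assumed starting value
for sev in severities:
    base_score -= 10 if sev == "error" else 3 if sev == "warning" else 1
score = max(0, min(100, base_score))  # 100 - 10 - 3 - 3 - 1 - 1 - 1 = 81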
@@ -323,7 +368,7 @@ class DataQualityChecker:
         )


-def check_quality(tables: Dict[str, "pd.DataFrame"], **kwargs) -> QualityReport:
+def check_quality(tables: Dict[str, pd.DataFrame], **kwargs) -> QualityReport:
     """Convenience function for quick quality checks."""
     checker = DataQualityChecker()
     return checker.check_all(tables, **kwargs)
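To see the new time-series insights end to end, a minimal sketch; it assumes QualityReport exposes `score` and `issues`, as the removed docstring suggests:

import numpy as np
import pandas as pd

from misata.quality import check_quality

# 120 days of steadily growing revenue: the checker should report strong
# lag-1 autocorrelation and a significant growth trend as "info" insights.
rng = np.random.default_rng(0)
df = pd.DataFrame({
    "date": pd.date_range("2024-01-01", periods=120, freq="D"),
    "revenue": np.linspace(1000.0, 2000.0, 120) + rng.normal(0, 20, 120),
})

report = check_quality({"daily_revenue": df})
for issue in report.issues:
    if issue.category == "time_series":
        print(issue.severity, issue.column, issue.message)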
misata/reference_data.py ADDED
@@ -0,0 +1,221 @@
+"""
+Domain-Aware Reference Data Library
+
+Pre-built realistic data templates for common business domains.
+This ensures reference tables (plans, exercises, categories) have
+sensible, domain-appropriate values instead of random garbage.
+
+Usage:
+    from misata.reference_data import get_reference_data, detect_domain
+
+    domain = detect_domain(["plans", "subscriptions", "users"])
+    plans_data = get_reference_data(domain, "plans")
+"""
+
+from typing import Any, Dict, List, Optional
+
+
+# ============ DOMAIN TEMPLATES ============
+
+REFERENCE_DATA_LIBRARY: Dict[str, Dict[str, List[Dict[str, Any]]]] = {
+
+    # ===== SaaS / Subscription Business =====
+    "saas": {
+        "plans": [
+            {"id": 1, "name": "Free", "price": 0.00, "features": "Basic features, Community support"},
+            {"id": 2, "name": "Starter", "price": 9.99, "features": "5GB storage, Email support"},
+            {"id": 3, "name": "Pro", "price": 29.99, "features": "50GB storage, Priority support, Analytics"},
+            {"id": 4, "name": "Business", "price": 79.99, "features": "200GB storage, Dedicated support, API access"},
+            {"id": 5, "name": "Enterprise", "price": 199.99, "features": "Unlimited storage, SLA, Custom integrations"},
+        ],
+        "tiers": [
+            {"id": 1, "name": "Bronze", "discount_pct": 0},
+            {"id": 2, "name": "Silver", "discount_pct": 10},
+            {"id": 3, "name": "Gold", "discount_pct": 20},
+            {"id": 4, "name": "Platinum", "discount_pct": 30},
+        ],
+    },
+
+    # ===== Fitness / Health App =====
+    "fitness": {
+        "exercises": [
+            {"id": 1, "name": "Running", "category": "Cardio", "calories_per_minute": 10, "difficulty": "Medium"},
+            {"id": 2, "name": "Swimming", "category": "Cardio", "calories_per_minute": 9, "difficulty": "Medium"},
+            {"id": 3, "name": "Cycling", "category": "Cardio", "calories_per_minute": 8, "difficulty": "Easy"},
+            {"id": 4, "name": "HIIT", "category": "Cardio", "calories_per_minute": 12, "difficulty": "Hard"},
+            {"id": 5, "name": "Yoga", "category": "Flexibility", "calories_per_minute": 3, "difficulty": "Easy"},
+            {"id": 6, "name": "Pilates", "category": "Flexibility", "calories_per_minute": 4, "difficulty": "Medium"},
+            {"id": 7, "name": "Weight Training", "category": "Strength", "calories_per_minute": 6, "difficulty": "Medium"},
+            {"id": 8, "name": "CrossFit", "category": "Strength", "calories_per_minute": 11, "difficulty": "Hard"},
+        ],
+        "plans": [
+            {"id": 1, "name": "Free", "price": 0.00, "features": "Basic workouts"},
+            {"id": 2, "name": "Basic", "price": 9.99, "features": "All workouts, Progress tracking"},
+            {"id": 3, "name": "Premium", "price": 19.99, "features": "Personal trainer, Meal plans"},
+            {"id": 4, "name": "Elite", "price": 49.99, "features": "1-on-1 coaching, Custom programs"},
+        ],
+        "workout_types": [
+            {"id": 1, "name": "Morning Cardio", "duration_minutes": 30, "intensity": "Medium"},
+            {"id": 2, "name": "Full Body Strength", "duration_minutes": 45, "intensity": "High"},
+            {"id": 3, "name": "Relaxing Yoga", "duration_minutes": 60, "intensity": "Low"},
+            {"id": 4, "name": "HIIT Blast", "duration_minutes": 20, "intensity": "Very High"},
+        ],
+    },
+
+    # ===== E-commerce / Retail =====
+    "ecommerce": {
+        "categories": [
+            {"id": 1, "name": "Electronics", "description": "Phones, laptops, gadgets"},
+            {"id": 2, "name": "Clothing", "description": "Fashion and apparel"},
+            {"id": 3, "name": "Home & Garden", "description": "Furniture, decor, outdoor"},
+            {"id": 4, "name": "Sports & Outdoors", "description": "Fitness, camping, sports gear"},
+            {"id": 5, "name": "Books & Media", "description": "Books, music, movies"},
+            {"id": 6, "name": "Health & Beauty", "description": "Skincare, supplements, wellness"},
+        ],
+        "products": [
+            {"id": 1, "name": "Wireless Headphones", "category_id": 1, "price": 79.99},
+            {"id": 2, "name": "Smart Watch", "category_id": 1, "price": 199.99},
+            {"id": 3, "name": "Cotton T-Shirt", "category_id": 2, "price": 24.99},
+            {"id": 4, "name": "Running Shoes", "category_id": 4, "price": 89.99},
+            {"id": 5, "name": "Yoga Mat", "category_id": 4, "price": 29.99},
+        ],
+        "shipping_methods": [
+            {"id": 1, "name": "Standard", "days": 5, "price": 4.99},
+            {"id": 2, "name": "Express", "days": 2, "price": 9.99},
+            {"id": 3, "name": "Next Day", "days": 1, "price": 19.99},
+            {"id": 4, "name": "Free Shipping", "days": 7, "price": 0.00},
+        ],
+    },
+
+    # ===== Finance / Banking =====
+    "finance": {
+        "account_types": [
+            {"id": 1, "name": "Checking", "interest_rate": 0.01, "monthly_fee": 0.00},
+            {"id": 2, "name": "Savings", "interest_rate": 0.50, "monthly_fee": 0.00},
+            {"id": 3, "name": "Money Market", "interest_rate": 1.00, "monthly_fee": 5.00},
+            {"id": 4, "name": "Premium Checking", "interest_rate": 0.10, "monthly_fee": 15.00},
+        ],
+        "transaction_types": [
+            {"id": 1, "name": "Deposit", "category": "Income"},
+            {"id": 2, "name": "Withdrawal", "category": "Expense"},
+            {"id": 3, "name": "Transfer", "category": "Transfer"},
+            {"id": 4, "name": "Payment", "category": "Expense"},
+            {"id": 5, "name": "Refund", "category": "Income"},
+        ],
+    },
+
+    # ===== Education / LMS =====
+    "education": {
+        "courses": [
+            {"id": 1, "name": "Python Fundamentals", "level": "Beginner", "duration_hours": 20, "price": 49.99},
+            {"id": 2, "name": "Data Science Bootcamp", "level": "Intermediate", "duration_hours": 60, "price": 199.99},
+            {"id": 3, "name": "Machine Learning", "level": "Advanced", "duration_hours": 40, "price": 149.99},
+            {"id": 4, "name": "Web Development", "level": "Beginner", "duration_hours": 30, "price": 79.99},
+        ],
+        "difficulty_levels": [
+            {"id": 1, "name": "Beginner", "description": "No prior experience needed"},
+            {"id": 2, "name": "Intermediate", "description": "Some experience required"},
+            {"id": 3, "name": "Advanced", "description": "Strong foundation needed"},
+            {"id": 4, "name": "Expert", "description": "Professional level"},
+        ],
+    },
+}
+
+
+# ============ DOMAIN DETECTION ============
+
+# Keywords that indicate a specific domain
+DOMAIN_KEYWORDS = {
+    "saas": ["subscription", "plan", "tier", "billing", "invoice", "tenant"],
+    "fitness": ["exercise", "workout", "calories", "fitness", "gym", "training", "health"],
+    "ecommerce": ["product", "category", "cart", "order", "shipping", "inventory", "catalog"],
+    "finance": ["account", "transaction", "balance", "payment", "transfer", "bank"],
+    "education": ["course", "student", "lesson", "enrollment", "grade", "instructor"],
+}
+
+
+def detect_domain(table_names: List[str]) -> str:
+    """
+    Detect the business domain based on table names.
+
+    Args:
+        table_names: List of table names in the schema
+
+    Returns:
+        Domain name (saas, fitness, ecommerce, finance, education, or 'generic')
+    """
+    table_names_lower = [t.lower() for t in table_names]
+    all_text = " ".join(table_names_lower)
+
+    domain_scores = {}
+    for domain, keywords in DOMAIN_KEYWORDS.items():
+        score = sum(1 for kw in keywords if kw in all_text)
+        if score > 0:
+            domain_scores[domain] = score
+
+    if domain_scores:
+        return max(domain_scores, key=domain_scores.get)
+
+    return "generic"
+
+
+def get_reference_data(domain: str, table_name: str) -> Optional[List[Dict[str, Any]]]:
+    """
+    Get pre-built reference data for a table.
+
+    Strategy:
+    1. Check the specific domain (exact match)
+    2. Check the specific domain (singular/plural match)
+    3. GLOBAL FALLBACK: check ALL domains for an exact match
+    4. GLOBAL FALLBACK: check ALL domains for a partial match
+    """
+    # Normalize the table name (naive singularization: strips trailing 's' characters)
+    table_key = table_name.lower().rstrip('s')
+
+    # 1. Try the specific domain first
+    domain_data = REFERENCE_DATA_LIBRARY.get(domain, {})
+
+    # Exact match in domain
+    if table_name in domain_data:
+        return domain_data[table_name]
+
+    # Singular match in domain
+    if table_key in domain_data:
+        return domain_data[table_key]
+
+    # Partial match in domain
+    for key, data in domain_data.items():
+        if table_key in key or key in table_key:
+            return data
+
+    # 2. GLOBAL SEARCH: check all other domains
+    # This handles mixed schemas (e.g. "fitness app with products")
+    for other_domain, tables in REFERENCE_DATA_LIBRARY.items():
+        if other_domain == domain:
+            continue
+
+        # Exact match
+        if table_name in tables:
+            return tables[table_name]
+
+        # Singular match
+        if table_key in tables:
+            return tables[table_key]
+
+    # 3. GLOBAL PARTIAL SEARCH
+    for other_domain, tables in REFERENCE_DATA_LIBRARY.items():
+        for key, data in tables.items():
+            if table_key in key or key in table_key:
+                return data
+
+    return None
+
+
+def get_all_domains() -> List[str]:
+    """Get the list of all supported domains."""
+    return list(REFERENCE_DATA_LIBRARY.keys())
+
+
+def get_domain_tables(domain: str) -> List[str]:
+    """Get the list of tables available for a domain."""
+    return list(REFERENCE_DATA_LIBRARY.get(domain, {}).keys())
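A quick sanity check of the detection and fallback logic above, using only tables and keywords defined in this file ("workout" and "exercise" give fitness two keyword hits, while "plan" gives saas one):

from misata.reference_data import detect_domain, get_reference_data

domain = detect_domain(["users", "workouts", "exercises", "plans"])
print(domain)  # "fitness" (2 keyword hits) beats "saas" (1 hit from "plan")

plans = get_reference_data(domain, "plans")                # exact match inside "fitness"
shipping = get_reference_data(domain, "shipping_methods")  # global fallback into "ecommerce"
print(plans[0]["name"], shipping[0]["name"])               # Free Standard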
misata/research/__init__.py ADDED
@@ -0,0 +1,3 @@
+from .agent import DeepResearchAgent
+
+__all__ = ["DeepResearchAgent"]
misata/research/agent.py ADDED
@@ -0,0 +1,70 @@
+"""
+Misata Deep Research Agent 🕵️‍♂️
+-----------------------------
+Responsible for fetching "ground truth" data from the real world.
+Uses agentic search (Tavily/LangGraph) to find competitors, market stats, and pricing.
+"""
+
+from typing import List, Dict, Any, Optional
+import time
+
+
+class DeepResearchAgent:
+    def __init__(self, api_key: Optional[str] = None, use_mock: bool = True):
+        self.api_key = api_key
+        self.use_mock = use_mock
+        # TODO: Initialize LangGraph / Tavily client here
+
+    def search_entities(self, domain: str, entity_type: str, limit: int = 10) -> List[Dict[str, Any]]:
+        """
+        Find real-world entities for a given domain.
+        E.g. domain="Fitness App", entity_type="Competitors" -> returns ["Strava", "MyFitnessPal", ...]
+        """
+        if self.use_mock:
+            return self._mock_search(domain, entity_type, limit)
+
+        # TODO: Implement real search
+        return []
+
+    def search_market_stats(self, domain: str) -> Dict[str, Any]:
+        """Find market statistics (average price, market size)."""
+        if self.use_mock:
+            return {
+                "market_size": "5B",
+                "avg_price_monthly": 14.99,
+                "cagr": "12%",
+            }
+        return {}
+
+    def _mock_search(self, domain: str, entity_type: str, limit: int) -> List[Dict[str, Any]]:
+        """Return plausible fake data for demo purposes."""
+        print(f"🕵️‍♂️ [Agent] Mock researching: {entity_type} in {domain}...")
+        time.sleep(1.0)  # Simulate latency
+
+        domain_lower = domain.lower()
+
+        if "fitness" in domain_lower:
+            return [
+                {"name": "Strava", "revenue": "200M", "users": "100M"},
+                {"name": "MyFitnessPal", "revenue": "150M", "users": "80M"},
+                {"name": "Nike Run Club", "revenue": "N/A", "users": "50M"},
+                {"name": "Peloton", "revenue": "2B", "users": "10M"},
+            ][:limit]
+
+        elif "ecommerce" in domain_lower or "retail" in domain_lower:
+            return [
+                {"name": "Amazon", "revenue": "500B"},
+                {"name": "Shopify", "revenue": "5B"},
+                {"name": "Walmart", "revenue": "600B"},
+            ][:limit]
+
+        elif "saas" in domain_lower:
+            return [
+                {"name": "Salesforce", "revenue": "30B"},
+                {"name": "HubSpot", "revenue": "2B"},
+                {"name": "Atlassian", "revenue": "4B"},
+            ][:limit]
+
+        return [{"name": f"{domain} Competitor {i+1}"} for i in range(limit)]
misata/schema.py CHANGED
@@ -192,6 +192,29 @@ class ScenarioEvent(BaseModel):
     description: Optional[str] = None


+class OutcomeCurve(BaseModel):
+    """
+    Defines a temporal/seasonal pattern for a numeric column.
+
+    This is extracted from natural language descriptions like:
+    "Revenue with a dip in September and peak in December"
+
+    Attributes:
+        table: Table containing the column to constrain
+        column: Numeric column to apply the curve to
+        time_column: Date/time column for grouping
+        pattern_type: Type of pattern (seasonal, growth, decline, etc.)
+        description: Human-readable description of the pattern
+        curve_points: Monthly relative values (0.0-1.0)
+    """
+    table: str
+    column: str
+    time_column: str = "date"
+    pattern_type: str = "seasonal"
+    description: Optional[str] = None
+    curve_points: List[Dict[str, float]] = Field(default_factory=list)
+
+
 class SchemaConfig(BaseModel):
     """
     Complete configuration for synthetic data generation.
@@ -206,6 +229,7 @@ class SchemaConfig(BaseModel):
         columns: Mapping of table names to their column definitions
         relationships: List of inter-table relationships
         events: List of scenario events to apply
+        outcome_curves: List of temporal patterns for constrained generation
         seed: Random seed for reproducibility
     """

@@ -215,6 +239,7 @@ class SchemaConfig(BaseModel):
     columns: Dict[str, List[Column]]
     relationships: List[Relationship] = Field(default_factory=list)
     events: List[ScenarioEvent] = Field(default_factory=list)
+    outcome_curves: List[OutcomeCurve] = Field(default_factory=list)
     seed: Optional[int] = None

     @field_validator("columns")
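The model only constrains curve_points to a list of string-to-float dicts; the "month"/"value" keys in this sketch are a guess based on the field comment, not a documented contract:

from misata.schema import OutcomeCurve

# Hypothetical point format: {"month": ..., "value": ...} is assumed here.
curve = OutcomeCurve(
    table="transactions",
    column="revenue",
    time_column="created_at",
    pattern_type="seasonal",
    description="Revenue with a dip in September and peak in December",
    curve_points=[
        {"month": 9.0, "value": 0.4},   # September dip
        {"month": 12.0, "value": 1.0},  # December peak
    ],
)
print(curve.pattern_type, len(curve.curve_points))  # seasonal 2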