misata 0.3.0b0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata/__init__.py +1 -1
- misata/agents/__init__.py +23 -0
- misata/agents/pipeline.py +286 -0
- misata/causal/__init__.py +5 -0
- misata/causal/graph.py +109 -0
- misata/causal/solver.py +115 -0
- misata/cli.py +31 -0
- misata/generators/__init__.py +19 -0
- misata/generators/copula.py +198 -0
- misata/llm_parser.py +180 -137
- misata/quality.py +78 -33
- misata/reference_data.py +221 -0
- misata/research/__init__.py +3 -0
- misata/research/agent.py +70 -0
- misata/schema.py +25 -0
- misata/simulator.py +264 -12
- misata/smart_values.py +144 -6
- misata/studio/__init__.py +55 -0
- misata/studio/app.py +49 -0
- misata/studio/components/inspector.py +81 -0
- misata/studio/components/sidebar.py +35 -0
- misata/studio/constraint_generator.py +781 -0
- misata/studio/inference.py +319 -0
- misata/studio/outcome_curve.py +284 -0
- misata/studio/state/store.py +55 -0
- misata/studio/tabs/configure.py +50 -0
- misata/studio/tabs/generate.py +117 -0
- misata/studio/tabs/outcome_curve.py +149 -0
- misata/studio/tabs/schema_designer.py +217 -0
- misata/studio/utils/styles.py +143 -0
- misata/studio_constraints/__init__.py +29 -0
- misata/studio_constraints/z3_solver.py +259 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/METADATA +13 -2
- misata-0.5.0.dist-info/RECORD +61 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/WHEEL +1 -1
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/entry_points.txt +1 -0
- misata-0.3.0b0.dist-info/RECORD +0 -37
- /misata/{generators.py → generators_legacy.py} +0 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {misata-0.3.0b0.dist-info → misata-0.5.0.dist-info}/top_level.txt +0 -0
misata/quality.py
CHANGED
@@ -11,13 +11,15 @@ This module validates generated synthetic data for:
 from typing import Dict, List, Any, Optional, Tuple
 from dataclasses import dataclass, field
 import warnings
+import numpy as np
+import pandas as pd  # type: ignore
 
 
 @dataclass
 class QualityIssue:
     """Represents a single data quality issue."""
     severity: str  # "error", "warning", "info"
-    category: str  # "distribution", "integrity", "temporal", "domain"
+    category: str  # "distribution", "integrity", "temporal", "domain", "time_series"
     table: str
     column: Optional[str]
     message: str
@@ -107,19 +109,12 @@ class DataQualityChecker:
 
     def check_distribution_plausibility(
         self,
-        df:
+        df: pd.DataFrame,
         table_name: str,
     ) -> None:
         """
         Check if numeric distributions are plausible for their domains.
-
-        Args:
-            df: DataFrame to check
-            table_name: Name of the table
         """
-        import pandas as pd
-        import numpy as np
-
         for col in df.columns:
             col_lower = col.lower()
 
@@ -162,15 +157,11 @@ class DataQualityChecker:
 
     def check_referential_integrity(
         self,
-        tables: Dict[str,
+        tables: Dict[str, pd.DataFrame],
         relationships: List[Any],
     ) -> None:
         """
         Verify all foreign key references are valid.
-
-        Args:
-            tables: Dict of table_name -> DataFrame
-            relationships: List of Relationship objects
         """
         for rel in relationships:
             parent_table = rel.parent_table
@@ -221,19 +212,12 @@ class DataQualityChecker:
 
     def check_temporal_consistency(
         self,
-        df:
+        df: pd.DataFrame,
         table_name: str,
     ) -> None:
         """
         Ensure temporal columns are consistent.
-
-        Checks:
-        - created_at < updated_at
-        - start_date < end_date
-        - birth_date in past
         """
-        import pandas as pd
-
         date_cols = [c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])]
 
         # Check created < updated
@@ -266,23 +250,83 @@
                 f"{future_births} rows have birth_date in the future",
                 {"violation_count": future_births}
             )
+
+    def check_time_series_properties(
+        self,
+        df: pd.DataFrame,
+        table_name: str,
+    ) -> None:
+        """
+        Analyze time series properties (Autocorrelation, Trend, Seasonality).
+        Adds 'info' level insights to the report.
+        """
+        # Find Date Column
+        date_cols = [c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])]
+        if not date_cols:
+            return
+
+        time_col = date_cols[0]  # Use first date col
+
+        # Find Metric Columns (Float/Int)
+        numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and c not in ['id']]
+
+        for col in numeric_cols:
+            # Skip if low cardinality
+            if df[col].nunique() < 10:
+                continue
+
+            # Sort by time
+            ts_df = df.sort_values(time_col)
+            series = ts_df[col].values
+
+            if len(series) < 5:
+                continue
+
+            # 1. Autocorrelation (Lag-1)
+            # Simple manual calculation
+            if len(series) > 2:
+                # Handle possible NaNs
+                s_clean = series[~np.isnan(series)]
+                if len(s_clean) > 2:
+                    lag1 = np.corrcoef(s_clean[:-1], s_clean[1:])[0, 1]
+
+                    if not np.isnan(lag1):
+                        if abs(lag1) > 0.7:
+                            msg = f"Strong temporal logic detected (Lag-1 Autocorrelation: {lag1:.2f})"
+                            self._add_issue("info", "time_series", table_name, col, msg, {"lag1": lag1})
+                        elif abs(lag1) < 0.1:
+                            msg = f"Data appears random/noisy (Lag-1 Autocorrelation: {lag1:.2f})"
+                            self._add_issue("info", "time_series", table_name, col, msg, {"lag1": lag1})
+
+            # 2. Trend Detection
+            if len(series) > 10:
+                # Linear fit
+                x = np.arange(len(series))
+                # Handle NaNs replacement for trend check
+                s_filled = pd.Series(series).fillna(method='ffill').fillna(0).values
+
+                slope, _ = np.polyfit(x, s_filled, 1)
+
+                # Normalize slope to be % change per step relative to mean
+                mean_val = np.mean(s_filled)
+                if abs(mean_val) > 0.01:
+                    normalized_slope = slope / mean_val
+                    if abs(normalized_slope) * len(series) > 0.2:  # Total change > 20%
+                        trend_dir = "Growth" if slope > 0 else "Decline"
+                        self._add_issue(
+                            "info", "time_series", table_name, col,
+                            f"Significant {trend_dir} Trend Detected",
+                            {"slope": slope}
+                        )
 
     def check_all(
         self,
-        tables: Dict[str,
+        tables: Dict[str, pd.DataFrame],
         relationships: Optional[List[Any]] = None,
         schema: Optional[Any] = None,
     ) -> QualityReport:
         """
         Run all quality checks and generate a report.
-
-        Args:
-            tables: Dict of table_name -> DataFrame
-            relationships: Optional list of Relationship objects
-            schema: Optional SchemaConfig for additional checks
-
-        Returns:
-            QualityReport with score and issues
         """
         self.issues = []  # Reset
 
@@ -290,6 +334,7 @@ class DataQualityChecker:
         for table_name, df in tables.items():
             self.check_distribution_plausibility(df, table_name)
             self.check_temporal_consistency(df, table_name)
+            self.check_time_series_properties(df, table_name)
 
         # Check referential integrity
         if relationships:
@@ -303,7 +348,7 @@
             elif issue.severity == "warning":
                 base_score -= 3
             else:
-                base_score -= 1
+                base_score -= 1  # Info subtracts 1 for now (maybe 0 later)
 
         score = max(0, min(100, base_score))
 
@@ -323,7 +368,7 @@
         )
 
 
-def check_quality(tables: Dict[str,
+def check_quality(tables: Dict[str, pd.DataFrame], **kwargs) -> QualityReport:
     """Convenience function for quick quality checks."""
     checker = DataQualityChecker()
     return checker.check_all(tables, **kwargs)
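The new time-series pass is wired into check_all, so it also runs through the check_quality convenience function. Its trend branch fires when |slope/mean| × n_rows exceeds 0.2, i.e. when the fitted line drifts by more than 20% of the series mean over the window. A minimal sketch of the end-to-end behavior (table and column names are arbitrary; a cumulative-sum series has lag-1 autocorrelation near 1 and a steady drift, so both the autocorrelation and trend branches should emit "info" issues — report.score and report.issues follow the QualityReport docstring shown above):

import numpy as np
import pandas as pd
from misata.quality import check_quality

# 100 days of auto-correlated, upward-trending "revenue"
rng = np.random.default_rng(42)
df = pd.DataFrame({
    "date": pd.date_range("2024-01-01", periods=100, freq="D"),
    "revenue": np.cumsum(rng.normal(1.0, 0.5, size=100)),
})

report = check_quality({"sales": df})
print(report.score)
for issue in report.issues:
    print(issue.severity, issue.category, issue.table, issue.message)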
misata/reference_data.py
ADDED
@@ -0,0 +1,221 @@
+"""
+Domain-Aware Reference Data Library
+
+Pre-built realistic data templates for common business domains.
+This ensures reference tables (plans, exercises, categories) have
+sensible, domain-appropriate values instead of random garbage.
+
+Usage:
+    from misata.reference_data import get_reference_data, detect_domain
+
+    domain = detect_domain(["plans", "subscriptions", "users"])
+    plans_data = get_reference_data(domain, "plans")
+"""
+
+from typing import Any, Dict, List, Optional
+
+
+# ============ DOMAIN TEMPLATES ============
+
+REFERENCE_DATA_LIBRARY: Dict[str, Dict[str, List[Dict[str, Any]]]] = {
+
+    # ===== SaaS / Subscription Business =====
+    "saas": {
+        "plans": [
+            {"id": 1, "name": "Free", "price": 0.00, "features": "Basic features, Community support"},
+            {"id": 2, "name": "Starter", "price": 9.99, "features": "5GB storage, Email support"},
+            {"id": 3, "name": "Pro", "price": 29.99, "features": "50GB storage, Priority support, Analytics"},
+            {"id": 4, "name": "Business", "price": 79.99, "features": "200GB storage, Dedicated support, API access"},
+            {"id": 5, "name": "Enterprise", "price": 199.99, "features": "Unlimited storage, SLA, Custom integrations"},
+        ],
+        "tiers": [
+            {"id": 1, "name": "Bronze", "discount_pct": 0},
+            {"id": 2, "name": "Silver", "discount_pct": 10},
+            {"id": 3, "name": "Gold", "discount_pct": 20},
+            {"id": 4, "name": "Platinum", "discount_pct": 30},
+        ],
+    },
+
+    # ===== Fitness / Health App =====
+    "fitness": {
+        "exercises": [
+            {"id": 1, "name": "Running", "category": "Cardio", "calories_per_minute": 10, "difficulty": "Medium"},
+            {"id": 2, "name": "Swimming", "category": "Cardio", "calories_per_minute": 9, "difficulty": "Medium"},
+            {"id": 3, "name": "Cycling", "category": "Cardio", "calories_per_minute": 8, "difficulty": "Easy"},
+            {"id": 4, "name": "HIIT", "category": "Cardio", "calories_per_minute": 12, "difficulty": "Hard"},
+            {"id": 5, "name": "Yoga", "category": "Flexibility", "calories_per_minute": 3, "difficulty": "Easy"},
+            {"id": 6, "name": "Pilates", "category": "Flexibility", "calories_per_minute": 4, "difficulty": "Medium"},
+            {"id": 7, "name": "Weight Training", "category": "Strength", "calories_per_minute": 6, "difficulty": "Medium"},
+            {"id": 8, "name": "CrossFit", "category": "Strength", "calories_per_minute": 11, "difficulty": "Hard"},
+        ],
+        "plans": [
+            {"id": 1, "name": "Free", "price": 0.00, "features": "Basic workouts"},
+            {"id": 2, "name": "Basic", "price": 9.99, "features": "All workouts, Progress tracking"},
+            {"id": 3, "name": "Premium", "price": 19.99, "features": "Personal trainer, Meal plans"},
+            {"id": 4, "name": "Elite", "price": 49.99, "features": "1-on-1 coaching, Custom programs"},
+        ],
+        "workout_types": [
+            {"id": 1, "name": "Morning Cardio", "duration_minutes": 30, "intensity": "Medium"},
+            {"id": 2, "name": "Full Body Strength", "duration_minutes": 45, "intensity": "High"},
+            {"id": 3, "name": "Relaxing Yoga", "duration_minutes": 60, "intensity": "Low"},
+            {"id": 4, "name": "HIIT Blast", "duration_minutes": 20, "intensity": "Very High"},
+        ],
+    },
+
+    # ===== E-commerce / Retail =====
+    "ecommerce": {
+        "categories": [
+            {"id": 1, "name": "Electronics", "description": "Phones, laptops, gadgets"},
+            {"id": 2, "name": "Clothing", "description": "Fashion and apparel"},
+            {"id": 3, "name": "Home & Garden", "description": "Furniture, decor, outdoor"},
+            {"id": 4, "name": "Sports & Outdoors", "description": "Fitness, camping, sports gear"},
+            {"id": 5, "name": "Books & Media", "description": "Books, music, movies"},
+            {"id": 6, "name": "Health & Beauty", "description": "Skincare, supplements, wellness"},
+        ],
+        "products": [
+            {"id": 1, "name": "Wireless Headphones", "category_id": 1, "price": 79.99},
+            {"id": 2, "name": "Smart Watch", "category_id": 1, "price": 199.99},
+            {"id": 3, "name": "Cotton T-Shirt", "category_id": 2, "price": 24.99},
+            {"id": 4, "name": "Running Shoes", "category_id": 4, "price": 89.99},
+            {"id": 5, "name": "Yoga Mat", "category_id": 4, "price": 29.99},
+        ],
+        "shipping_methods": [
+            {"id": 1, "name": "Standard", "days": 5, "price": 4.99},
+            {"id": 2, "name": "Express", "days": 2, "price": 9.99},
+            {"id": 3, "name": "Next Day", "days": 1, "price": 19.99},
+            {"id": 4, "name": "Free Shipping", "days": 7, "price": 0.00},
+        ],
+    },
+
+    # ===== Finance / Banking =====
+    "finance": {
+        "account_types": [
+            {"id": 1, "name": "Checking", "interest_rate": 0.01, "monthly_fee": 0.00},
+            {"id": 2, "name": "Savings", "interest_rate": 0.50, "monthly_fee": 0.00},
+            {"id": 3, "name": "Money Market", "interest_rate": 1.00, "monthly_fee": 5.00},
+            {"id": 4, "name": "Premium Checking", "interest_rate": 0.10, "monthly_fee": 15.00},
+        ],
+        "transaction_types": [
+            {"id": 1, "name": "Deposit", "category": "Income"},
+            {"id": 2, "name": "Withdrawal", "category": "Expense"},
+            {"id": 3, "name": "Transfer", "category": "Transfer"},
+            {"id": 4, "name": "Payment", "category": "Expense"},
+            {"id": 5, "name": "Refund", "category": "Income"},
+        ],
+    },
+
+    # ===== Education / LMS =====
+    "education": {
+        "courses": [
+            {"id": 1, "name": "Python Fundamentals", "level": "Beginner", "duration_hours": 20, "price": 49.99},
+            {"id": 2, "name": "Data Science Bootcamp", "level": "Intermediate", "duration_hours": 60, "price": 199.99},
+            {"id": 3, "name": "Machine Learning", "level": "Advanced", "duration_hours": 40, "price": 149.99},
+            {"id": 4, "name": "Web Development", "level": "Beginner", "duration_hours": 30, "price": 79.99},
+        ],
+        "difficulty_levels": [
+            {"id": 1, "name": "Beginner", "description": "No prior experience needed"},
+            {"id": 2, "name": "Intermediate", "description": "Some experience required"},
+            {"id": 3, "name": "Advanced", "description": "Strong foundation needed"},
+            {"id": 4, "name": "Expert", "description": "Professional level"},
+        ],
+    },
+}
+
+
+# ============ DOMAIN DETECTION ============
+
+# Keywords that indicate a specific domain
+DOMAIN_KEYWORDS = {
+    "saas": ["subscription", "plan", "tier", "billing", "invoice", "tenant"],
+    "fitness": ["exercise", "workout", "calories", "fitness", "gym", "training", "health"],
+    "ecommerce": ["product", "category", "cart", "order", "shipping", "inventory", "catalog"],
+    "finance": ["account", "transaction", "balance", "payment", "transfer", "bank"],
+    "education": ["course", "student", "lesson", "enrollment", "grade", "instructor"],
+}
+
+
+def detect_domain(table_names: List[str]) -> str:
+    """
+    Detect the business domain based on table names.
+
+    Args:
+        table_names: List of table names in the schema
+
+    Returns:
+        Domain name (saas, fitness, ecommerce, finance, education, or 'generic')
+    """
+    table_names_lower = [t.lower() for t in table_names]
+    all_text = " ".join(table_names_lower)
+
+    domain_scores = {}
+    for domain, keywords in DOMAIN_KEYWORDS.items():
+        score = sum(1 for kw in keywords if kw in all_text)
+        if score > 0:
+            domain_scores[domain] = score
+
+    if domain_scores:
+        return max(domain_scores, key=domain_scores.get)
+
+    return "generic"
+
+
+def get_reference_data(domain: str, table_name: str) -> Optional[List[Dict[str, Any]]]:
+    """
+    Get pre-built reference data for a table.
+
+    Strategy:
+    1. Check specific domain (exact match)
+    2. Check specific domain (singular/plural match)
+    3. GLOBAL FALLBACK: Check ALL domains for exact match
+    4. GLOBAL FALLBACK: Check ALL domains for partial match
+    """
+    # Normalize table name
+    table_key = table_name.lower().rstrip('s')  # Remove plural 's'
+
+    # 1. Try specific domain first
+    domain_data = REFERENCE_DATA_LIBRARY.get(domain, {})
+
+    # Exact match in domain
+    if table_name in domain_data:
+        return domain_data[table_name]
+
+    # Singular match in domain
+    if table_key in domain_data:
+        return domain_data[table_key]
+
+    # Partial match in domain
+    for key, data in domain_data.items():
+        if table_key in key or key in table_key:
+            return data
+
+    # 2. GLOBAL SEARCH: Check all other domains
+    # This handles mixed schemas (e.g. "fitness app with products")
+    for other_domain, tables in REFERENCE_DATA_LIBRARY.items():
+        if other_domain == domain:
+            continue
+
+        # Exact match
+        if table_name in tables:
+            return tables[table_name]
+
+        # Singular match
+        if table_key in tables:
+            return tables[table_key]
+
+    # 3. GLOBAL PARTIAL SEARCH
+    for other_domain, tables in REFERENCE_DATA_LIBRARY.items():
+        for key, data in tables.items():
+            if table_key in key or key in table_key:
+                return data
+
+    return None
+
+
+def get_all_domains() -> List[str]:
+    """Get list of all supported domains."""
+    return list(REFERENCE_DATA_LIBRARY.keys())
+
+
+def get_domain_tables(domain: str) -> List[str]:
+    """Get list of tables available for a domain."""
+    return list(REFERENCE_DATA_LIBRARY.get(domain, {}).keys())
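To make the lookup cascade concrete, here is a short sketch against the shipped library (the table names mirror the module docstring; comments trace which lookup step matches):

from misata.reference_data import detect_domain, get_reference_data

domain = detect_domain(["plans", "subscriptions", "users"])
print(domain)  # "saas" -- the "plan" and "subscription" keywords both score

plans = get_reference_data(domain, "plans")          # step 1: exact match in "saas"
exercises = get_reference_data(domain, "exercises")  # step 3: global fallback into "fitness"
print(plans[0]["name"], exercises[0]["name"])        # Free Running

One quirk worth knowing: rstrip('s') strips every trailing "s", not just one, so a hypothetical table named "boss" would normalize to "bo". That is harmless for the shipped table names but worth remembering when adding new templates.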
misata/research/agent.py
ADDED
@@ -0,0 +1,70 @@
+
+"""
+Misata Deep Research Agent 🕵️‍♂️
+-----------------------------
+Responsible for fetching "Ground Truth" data from the real world.
+Uses Agentic Search (Tavily/LangGraph) to find competitors, market stats, and pricing.
+"""
+
+from typing import List, Dict, Any, Optional
+import time
+
+class DeepResearchAgent:
+    def __init__(self, api_key: Optional[str] = None, use_mock: bool = True):
+        self.api_key = api_key
+        self.use_mock = use_mock
+        # TODO: Initialize LangGraph / Tavily client here
+
+    def search_entities(self, domain: str, entity_type: str, limit: int = 10) -> List[Dict[str, Any]]:
+        """
+        Finds real-world entities for a given domain.
+        E.g. domain="Fitness App", entity_type="Competitors" -> Returns ["Strava", "MyFitnessPal", ...]
+        """
+        if self.use_mock:
+            return self._mock_search(domain, entity_type, limit)
+
+        # TODO: Implement Real Search
+        return []
+
+    def search_market_stats(self, domain: str) -> Dict[str, Any]:
+        """
+        Finds market stats (average price, market size).
+        """
+        if self.use_mock:
+            return {
+                "market_size": "5B",
+                "avg_price_monthly": 14.99,
+                "cagr": "12%"
+            }
+        return {}
+
+    def _mock_search(self, domain: str, entity_type: str, limit: int) -> List[Dict[str, Any]]:
+        """Returns plausible fake data for demo purposes."""
+        print(f"🕵️‍♂️ [Agent] Mock Researching: {entity_type} in {domain}...")
+        time.sleep(1.0)  # Simulate latency
+
+        domain_lower = domain.lower()
+
+        if "fitness" in domain_lower:
+            return [
+                {"name": "Strava", "revenue": "200M", "users": "100M"},
+                {"name": "MyFitnessPal", "revenue": "150M", "users": "80M"},
+                {"name": "Nike Run Club", "revenue": "N/A", "users": "50M"},
+                {"name": "Peloton", "revenue": "2B", "users": "10M"},
+            ][:limit]
+
+        elif "ecommerce" in domain_lower or "retail" in domain_lower:
+            return [
+                {"name": "Amazon", "revenue": "500B"},
+                {"name": "Shopify", "revenue": "5B"},
+                {"name": "Walmart", "revenue": "600B"},
+            ][:limit]
+
+        elif "saas" in domain_lower:
+            return [
+                {"name": "Salesforce", "revenue": "30B"},
+                {"name": "HubSpot", "revenue": "2B"},
+                {"name": "Atlassian", "revenue": "4B"},
+            ][:limit]
+
+        return [{"name": f"{domain} Competitor {i+1}"} for i in range(limit)]
+
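Real search is still a TODO in this release (the non-mock branches return empty results), so the agent is only usable in mock mode today. A minimal sketch of that path; all output values come from the hard-coded mock tables above:

from misata.research.agent import DeepResearchAgent

agent = DeepResearchAgent(use_mock=True)  # the default; no API key needed
competitors = agent.search_entities("Fitness App", "Competitors", limit=3)
stats = agent.search_market_stats("Fitness App")

print([c["name"] for c in competitors])  # ['Strava', 'MyFitnessPal', 'Nike Run Club']
print(stats["avg_price_monthly"])        # 14.99

Note the mock branch is selected by substring match on the lowercased domain string, so "Fitness App", "fitness tracker", etc. all hit the fitness table; unrecognized domains fall back to generic "Competitor N" placeholders.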
misata/schema.py
CHANGED
@@ -192,6 +192,29 @@ class ScenarioEvent(BaseModel):
     description: Optional[str] = None
 
 
+class OutcomeCurve(BaseModel):
+    """
+    Defines a temporal/seasonal pattern for a numeric column.
+
+    This is extracted from natural language descriptions like:
+    "Revenue with a dip in September and peak in December"
+
+    Attributes:
+        table: Table containing the column to constrain
+        column: Numeric column to apply the curve to
+        time_column: Date/time column for grouping
+        pattern_type: Type of pattern (seasonal, growth, decline, etc.)
+        description: Human-readable description of the pattern
+        curve_points: Monthly relative values (0.0-1.0)
+    """
+    table: str
+    column: str
+    time_column: str = "date"
+    pattern_type: str = "seasonal"
+    description: Optional[str] = None
+    curve_points: List[Dict[str, float]] = Field(default_factory=list)
+
+
 class SchemaConfig(BaseModel):
     """
     Complete configuration for synthetic data generation.
@@ -206,6 +229,7 @@ class SchemaConfig(BaseModel):
         columns: Mapping of table names to their column definitions
         relationships: List of inter-table relationships
         events: List of scenario events to apply
+        outcome_curves: List of temporal patterns for constrained generation
         seed: Random seed for reproducibility
     """
 
@@ -215,6 +239,7 @@ class SchemaConfig(BaseModel):
     columns: Dict[str, List[Column]]
     relationships: List[Relationship] = Field(default_factory=list)
     events: List[ScenarioEvent] = Field(default_factory=list)
+    outcome_curves: List[OutcomeCurve] = Field(default_factory=list)
    seed: Optional[int] = None
 
     @field_validator("columns")
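A minimal sketch of constructing the new model. Field names come straight from the diff, but the schema does not pin down the key names inside each curve_points dict (it only requires List[Dict[str, float]]), so the {"month": ..., "value": ...} shape below is one plausible convention, not a documented one:

from misata.schema import OutcomeCurve

curve = OutcomeCurve(
    table="sales",
    column="revenue",
    time_column="date",
    pattern_type="seasonal",
    description="Revenue with a dip in September and peak in December",
    # ASSUMPTION: per-point key names are illustrative only; the model
    # accepts any Dict[str, float] entries.
    curve_points=[{"month": 9.0, "value": 0.4}, {"month": 12.0, "value": 1.0}],
)

Because SchemaConfig gains outcome_curves with an empty-list default, existing configs written against 0.3.0b0 continue to validate unchanged.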
|