misata-0.1.0b0.tar.gz → misata-0.2.0b0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. misata-0.2.0b0/LICENSE +21 -0
  2. {misata-0.1.0b0 → misata-0.2.0b0}/PKG-INFO +4 -2
  3. {misata-0.1.0b0 → misata-0.2.0b0}/README.md +1 -1
  4. {misata-0.1.0b0 → misata-0.2.0b0}/misata/__init__.py +13 -2
  5. {misata-0.1.0b0 → misata-0.2.0b0}/misata/llm_parser.py +41 -2
  6. misata-0.2.0b0/misata/quality.py +329 -0
  7. {misata-0.1.0b0 → misata-0.2.0b0}/misata/schema.py +8 -3
  8. {misata-0.1.0b0 → misata-0.2.0b0}/misata/simulator.py +81 -5
  9. misata-0.2.0b0/misata/smart_values.py +593 -0
  10. misata-0.2.0b0/misata/templates/library.py +344 -0
  11. {misata-0.1.0b0 → misata-0.2.0b0}/misata.egg-info/PKG-INFO +4 -2
  12. {misata-0.1.0b0 → misata-0.2.0b0}/misata.egg-info/SOURCES.txt +4 -0
  13. {misata-0.1.0b0 → misata-0.2.0b0}/pyproject.toml +1 -1
  14. {misata-0.1.0b0 → misata-0.2.0b0}/misata/api.py +0 -0
  15. {misata-0.1.0b0 → misata-0.2.0b0}/misata/audit.py +0 -0
  16. {misata-0.1.0b0 → misata-0.2.0b0}/misata/benchmark.py +0 -0
  17. {misata-0.1.0b0 → misata-0.2.0b0}/misata/cli.py +0 -0
  18. {misata-0.1.0b0 → misata-0.2.0b0}/misata/codegen.py +0 -0
  19. {misata-0.1.0b0 → misata-0.2.0b0}/misata/curve_fitting.py +0 -0
  20. {misata-0.1.0b0 → misata-0.2.0b0}/misata/customization.py +0 -0
  21. {misata-0.1.0b0 → misata-0.2.0b0}/misata/feedback.py +0 -0
  22. {misata-0.1.0b0 → misata-0.2.0b0}/misata/formulas.py +0 -0
  23. {misata-0.1.0b0 → misata-0.2.0b0}/misata/generators.py +0 -0
  24. {misata-0.1.0b0 → misata-0.2.0b0}/misata/hybrid.py +0 -0
  25. {misata-0.1.0b0 → misata-0.2.0b0}/misata/noise.py +0 -0
  26. {misata-0.1.0b0 → misata-0.2.0b0}/misata/semantic.py +0 -0
  27. {misata-0.1.0b0 → misata-0.2.0b0}/misata/story_parser.py +0 -0
  28. {misata-0.1.0b0 → misata-0.2.0b0}/misata/templates/__init__.py +0 -0
  29. {misata-0.1.0b0 → misata-0.2.0b0}/misata/validation.py +0 -0
  30. {misata-0.1.0b0 → misata-0.2.0b0}/misata.egg-info/dependency_links.txt +0 -0
  31. {misata-0.1.0b0 → misata-0.2.0b0}/misata.egg-info/entry_points.txt +0 -0
  32. {misata-0.1.0b0 → misata-0.2.0b0}/misata.egg-info/requires.txt +0 -0
  33. {misata-0.1.0b0 → misata-0.2.0b0}/misata.egg-info/top_level.txt +0 -0
  34. {misata-0.1.0b0 → misata-0.2.0b0}/setup.cfg +0 -0
  35. {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_api.py +0 -0
  36. {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_cli.py +0 -0
  37. {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_constraints.py +0 -0
  38. {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_curve_fitting.py +0 -0
  39. {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_enterprise.py +0 -0
  40. {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_formulas.py +0 -0
  41. {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_integrity.py +0 -0
  42. {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_llm_parser.py +0 -0
  43. {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_schema.py +0 -0
  44. {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_security.py +0 -0
  45. {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_semantic.py +0 -0
  46. {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_simulator.py +0 -0
  47. {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_templates.py +0 -0
  48. {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_validation.py +0 -0
misata-0.2.0b0/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Muhammed Rasin
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
{misata-0.1.0b0 → misata-0.2.0b0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: misata
- Version: 0.1.0b0
+ Version: 0.2.0b0
  Summary: AI-Powered Synthetic Data Engine - Generate realistic multi-table datasets from natural language
  Author-email: Muhammed Rasin <rasinbinabdulla@gmail.com>
  License: MIT
@@ -23,6 +23,7 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Classifier: Topic :: Database
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
+ License-File: LICENSE
  Requires-Dist: pandas>=2.0.0
  Requires-Dist: numpy>=1.24.0
  Requires-Dist: pydantic>=2.0.0
@@ -41,6 +42,7 @@ Requires-Dist: pytest-benchmark>=4.0.0; extra == "dev"
  Requires-Dist: black>=23.0.0; extra == "dev"
  Requires-Dist: ruff>=0.1.0; extra == "dev"
  Requires-Dist: mypy>=1.5.0; extra == "dev"
+ Dynamic: license-file

  # 🧠 Misata

@@ -48,7 +50,7 @@ Requires-Dist: mypy>=1.5.0; extra == "dev"

  No schema writing. No training data. Just describe what you need.

- [![Version](https://img.shields.io/badge/version-0.1.0--beta-purple.svg)]()
+ [![Version](https://img.shields.io/badge/version-0.2.0--beta-purple.svg)]()
  [![License](https://img.shields.io/badge/license-MIT-blue.svg)]()
  [![Python](https://img.shields.io/badge/python-3.10+-green.svg)]()

{misata-0.1.0b0 → misata-0.2.0b0}/README.md
@@ -4,7 +4,7 @@

  No schema writing. No training data. Just describe what you need.

- [![Version](https://img.shields.io/badge/version-0.1.0--beta-purple.svg)]()
+ [![Version](https://img.shields.io/badge/version-0.2.0--beta-purple.svg)]()
  [![License](https://img.shields.io/badge/license-MIT-blue.svg)]()
  [![Python](https://img.shields.io/badge/python-3.10+-green.svg)]()

{misata-0.1.0b0 → misata-0.2.0b0}/misata/__init__.py
@@ -9,9 +9,13 @@ Usage:

      # Or use the CLI:
      # misata generate --story "A SaaS with 50k users..."
+
+     # Or use pre-built templates:
+     from misata.templates.library import load_template
+     config = load_template("ecommerce")
  """

- __version__ = "0.1.0-beta"
+ __version__ = "0.2.0-beta"
  __author__ = "Muhammed Rasin"

  from misata.schema import (
@@ -26,6 +30,8 @@ from misata.simulator import DataSimulator
  from misata.generators import TextGenerator
  from misata.noise import NoiseInjector, add_noise
  from misata.customization import Customizer, ColumnOverride
+ from misata.quality import DataQualityChecker, check_quality
+ from misata.templates.library import load_template, list_templates

  __all__ = [
      # Core
@@ -43,6 +49,11 @@ __all__ = [
      "add_noise",
      "Customizer",
      "ColumnOverride",
+     # Quality
+     "DataQualityChecker",
+     "check_quality",
+     # Templates
+     "load_template",
+     "list_templates",
  ]

-
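
Taken together, the new exports sketch the 0.2.0 workflow: load a template, simulate, then validate. A minimal illustration, hand-written against the names exported above rather than taken from the package itself:

    from misata import list_templates, load_template, DataSimulator

    print(list_templates())               # names of the bundled templates
    config = load_template("ecommerce")   # ready-made schema config, per the docstring above

    # The config feeds straight into DataSimulator, optionally with the new
    # smart_mode flag introduced in misata/simulator.py further down.
    sim = DataSimulator(config)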
{misata-0.1.0b0 → misata-0.2.0b0}/misata/llm_parser.py
@@ -24,7 +24,11 @@ def _load_env():
      """Load environment variables from .env file."""
      env_paths = [
          Path.cwd() / ".env",
-         Path(__file__).parent.parent / ".env",
+         Path.cwd().parent / ".env",  # apps/.env or api parent
+         Path.cwd().parent.parent / ".env",  # Misata root from apps/api
+         Path(__file__).parent.parent / ".env",  # packages/core/.env
+         Path(__file__).parent.parent.parent / ".env",  # packages/.env
+         Path(__file__).parent.parent.parent.parent / ".env",  # Misata root from packages/core/misata
          Path.home() / ".misata" / ".env",
      ]

@@ -35,7 +39,9 @@ def _load_env():
              line = line.strip()
              if line and not line.startswith("#") and "=" in line:
                  key, _, value = line.partition("=")
-                 os.environ.setdefault(key.strip(), value.strip())
+                 # Remove quotes if present
+                 value = value.strip().strip("'\"")
+                 os.environ.setdefault(key.strip(), value)
          break

 _load_env()
@@ -82,6 +88,39 @@ Instead of guessing parameters, you can provide "control_points" to draw the sha
  Format: {"distribution": "normal", "control_points": [{"x": 10, "y": 0.1}, {"x": 50, "y": 0.9}]}
  Misata will mathematically solve for the best parameters.

+ ### SMART DEFAULTS (Use These for Realistic Data):
+
+ **Age columns:**
+ - type: "int", distribution: "normal", mean: 35, std: 12, min: 18, max: 80
+
+ **Price/Amount columns:**
+ - type: "float", distribution: "exponential", scale: 50, min: 0.01, decimals: 2
+ - OR for products: uniform min: 9.99, max: 499.99
+
+ **Rating columns (1-5 stars):**
+ - type: "int", distribution: "categorical", choices: [1,2,3,4,5], probabilities: [0.05, 0.08, 0.15, 0.32, 0.40]
+
+ **Quantity/Count columns:**
+ - type: "int", distribution: "poisson", lambda: 3, min: 1
+
+ **Duration (minutes):**
+ - type: "int", distribution: "normal", mean: 45, std: 20, min: 5, max: 180
+
+ **Percentage columns:**
+ - type: "float", distribution: "uniform", min: 0.0, max: 100.0, decimals: 1
+
+ **Status columns:**
+ - type: "categorical", choices: ["active", "inactive", "pending"], probabilities: [0.70, 0.20, 0.10]
+
+ **Boolean probabilities:**
+ - is_verified: probability: 0.85
+ - is_premium: probability: 0.25
+ - is_active: probability: 0.80
+
+ **Date columns:**
+ - For recent data: bias last 30% of range with 70% of values
+ - Always use realistic date ranges (not 1970-2100)
+
  ## OUTPUT FORMAT

  {
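
To make the new prompt defaults concrete, this is the kind of column spec they steer the LLM toward, sketched here by hand against the `Column` model from misata/schema.py (an illustration, not output captured from the parser):

    from misata.schema import Column

    # 1-5 star rating, skewed positive per the prompt's suggested probabilities
    rating = Column(
        name="rating",
        type="int",
        distribution_params={
            "distribution": "categorical",
            "choices": [1, 2, 3, 4, 5],
            "probabilities": [0.05, 0.08, 0.15, 0.32, 0.40],
        },
    )

    # Age as a clipped normal, matching the "Age columns" default
    age = Column(
        name="age",
        type="int",
        distribution_params={"distribution": "normal", "mean": 35,
                             "std": 12, "min": 18, "max": 80},
    )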
misata-0.2.0b0/misata/quality.py ADDED
@@ -0,0 +1,329 @@
+ """
+ Data Quality Checker for Synthetic Data Validation.
+
+ This module validates generated synthetic data for:
+ - Distribution plausibility
+ - Referential integrity
+ - Temporal consistency
+ - Domain-specific rules
+ """
+
+ from typing import Dict, List, Any, Optional, Tuple
+ from dataclasses import dataclass, field
+ import warnings
+
+
+ @dataclass
+ class QualityIssue:
+     """Represents a single data quality issue."""
+     severity: str  # "error", "warning", "info"
+     category: str  # "distribution", "integrity", "temporal", "domain"
+     table: str
+     column: Optional[str]
+     message: str
+     details: Dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class QualityReport:
+     """Complete quality report for generated data."""
+     score: float  # 0-100
+     issues: List[QualityIssue]
+     stats: Dict[str, Any]
+
+     @property
+     def passed(self) -> bool:
+         """Returns True if no errors (warnings OK)."""
+         return not any(i.severity == "error" for i in self.issues)
+
+     def summary(self) -> str:
+         """Human-readable summary."""
+         errors = sum(1 for i in self.issues if i.severity == "error")
+         warnings = sum(1 for i in self.issues if i.severity == "warning")
+         return f"Quality Score: {self.score:.1f}/100 | Errors: {errors} | Warnings: {warnings}"
+
+
+ class DataQualityChecker:
+     """
+     Validate generated synthetic data for realism and correctness.
+
+     Usage:
+         checker = DataQualityChecker()
+         report = checker.check_all(tables, relationships, schema)
+
+         if not report.passed:
+             print("Issues found:", report.issues)
+     """
+
+     # Domain-specific plausibility rules
+     PLAUSIBILITY_RULES = {
+         # Column name patterns -> (min, max, description)
+         "age": (0, 120, "Human age"),
+         "price": (0, 1_000_000, "Price"),
+         "quantity": (0, 10_000, "Quantity"),
+         "rating": (1, 5, "Rating"),
+         "percentage": (0, 100, "Percentage"),
+         "year": (1900, 2100, "Year"),
+         "month": (1, 12, "Month"),
+         "day": (1, 31, "Day"),
+         "hour": (0, 23, "Hour"),
+         "minute": (0, 59, "Minute"),
+         "score": (0, 100, "Score"),
+         "count": (0, 1_000_000, "Count"),
+         "duration": (0, 10_000, "Duration"),
+     }
+
+     def __init__(self, strict: bool = False):
+         """
+         Initialize the quality checker.
+
+         Args:
+             strict: If True, warnings become errors
+         """
+         self.strict = strict
+         self.issues: List[QualityIssue] = []
+
+     def _add_issue(
+         self,
+         severity: str,
+         category: str,
+         table: str,
+         column: Optional[str],
+         message: str,
+         details: Optional[Dict] = None,
+     ):
+         """Add an issue to the list."""
+         if self.strict and severity == "warning":
+             severity = "error"
+
+         self.issues.append(QualityIssue(
+             severity=severity,
+             category=category,
+             table=table,
+             column=column,
+             message=message,
+             details=details or {},
+         ))
+
+     def check_distribution_plausibility(
+         self,
+         df: "pd.DataFrame",
+         table_name: str,
+     ) -> None:
+         """
+         Check if numeric distributions are plausible for their domains.
+
+         Args:
+             df: DataFrame to check
+             table_name: Name of the table
+         """
+         import pandas as pd
+         import numpy as np
+
+         for col in df.columns:
+             col_lower = col.lower()
+
+             # Check against plausibility rules
+             for pattern, (min_val, max_val, description) in self.PLAUSIBILITY_RULES.items():
+                 if pattern in col_lower:
+                     if pd.api.types.is_numeric_dtype(df[col]):
+                         actual_min = df[col].min()
+                         actual_max = df[col].max()
+
+                         if actual_min < min_val:
+                             self._add_issue(
+                                 "warning", "distribution", table_name, col,
+                                 f"{description} column '{col}' has min {actual_min} < expected {min_val}",
+                                 {"actual_min": actual_min, "expected_min": min_val}
+                             )
+
+                         if actual_max > max_val:
+                             self._add_issue(
+                                 "warning", "distribution", table_name, col,
+                                 f"{description} column '{col}' has max {actual_max} > expected {max_val}",
+                                 {"actual_max": actual_max, "expected_max": max_val}
+                             )
+                     break
+
+             # Check for all-null columns
+             if df[col].isna().all():
+                 self._add_issue(
+                     "error", "distribution", table_name, col,
+                     f"Column '{col}' is entirely NULL",
+                 )
+
+             # Check for zero variance (all same value)
+             if pd.api.types.is_numeric_dtype(df[col]) and df[col].std() == 0:
+                 self._add_issue(
+                     "warning", "distribution", table_name, col,
+                     f"Column '{col}' has zero variance (all values identical)",
+                     {"value": df[col].iloc[0]}
+                 )
+
+     def check_referential_integrity(
+         self,
+         tables: Dict[str, "pd.DataFrame"],
+         relationships: List[Any],
+     ) -> None:
+         """
+         Verify all foreign key references are valid.
+
+         Args:
+             tables: Dict of table_name -> DataFrame
+             relationships: List of Relationship objects
+         """
+         for rel in relationships:
+             parent_table = rel.parent_table
+             child_table = rel.child_table
+             parent_key = rel.parent_key
+             child_key = rel.child_key
+
+             if parent_table not in tables:
+                 self._add_issue(
+                     "error", "integrity", child_table, child_key,
+                     f"Parent table '{parent_table}' not found for FK '{child_key}'",
+                 )
+                 continue
+
+             if child_table not in tables:
+                 continue  # Child table might not exist yet
+
+             parent_df = tables[parent_table]
+             child_df = tables[child_table]
+
+             if parent_key not in parent_df.columns:
+                 self._add_issue(
+                     "error", "integrity", parent_table, parent_key,
+                     f"Parent key '{parent_key}' not found in table '{parent_table}'",
+                 )
+                 continue
+
+             if child_key not in child_df.columns:
+                 self._add_issue(
+                     "error", "integrity", child_table, child_key,
+                     f"Child key '{child_key}' not found in table '{child_table}'",
+                 )
+                 continue
+
+             # Check for orphaned records
+             parent_ids = set(parent_df[parent_key].dropna().unique())
+             child_ids = set(child_df[child_key].dropna().unique())
+             orphans = child_ids - parent_ids
+
+             if orphans:
+                 orphan_pct = len(orphans) / len(child_ids) * 100
+                 self._add_issue(
+                     "error" if orphan_pct > 1 else "warning",
+                     "integrity", child_table, child_key,
+                     f"{len(orphans)} orphaned FK values ({orphan_pct:.1f}%) in '{child_key}' -> '{parent_table}.{parent_key}'",
+                     {"orphan_count": len(orphans), "orphan_pct": orphan_pct}
+                 )
+
+     def check_temporal_consistency(
+         self,
+         df: "pd.DataFrame",
+         table_name: str,
+     ) -> None:
+         """
+         Ensure temporal columns are consistent.
+
+         Checks:
+         - created_at < updated_at
+         - start_date < end_date
+         - birth_date in past
+         """
+         import pandas as pd
+
+         date_cols = [c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])]
+
+         # Check created < updated
+         if "created_at" in date_cols and "updated_at" in date_cols:
+             violations = (df["created_at"] > df["updated_at"]).sum()
+             if violations > 0:
+                 self._add_issue(
+                     "error", "temporal", table_name, "created_at",
+                     f"{violations} rows have created_at > updated_at",
+                     {"violation_count": violations}
+                 )
+
+         # Check start < end
+         if "start_date" in date_cols and "end_date" in date_cols:
+             violations = (df["start_date"] > df["end_date"]).sum()
+             if violations > 0:
+                 self._add_issue(
+                     "error", "temporal", table_name, "start_date",
+                     f"{violations} rows have start_date > end_date",
+                     {"violation_count": violations}
+                 )
+
+         # Check birth_date is in past
+         if "birth_date" in date_cols or "date_of_birth" in date_cols:
+             col = "birth_date" if "birth_date" in date_cols else "date_of_birth"
+             future_births = (df[col] > pd.Timestamp.now()).sum()
+             if future_births > 0:
+                 self._add_issue(
+                     "error", "temporal", table_name, col,
+                     f"{future_births} rows have birth_date in the future",
+                     {"violation_count": future_births}
+                 )
+
+     def check_all(
+         self,
+         tables: Dict[str, "pd.DataFrame"],
+         relationships: Optional[List[Any]] = None,
+         schema: Optional[Any] = None,
+     ) -> QualityReport:
+         """
+         Run all quality checks and generate a report.
+
+         Args:
+             tables: Dict of table_name -> DataFrame
+             relationships: Optional list of Relationship objects
+             schema: Optional SchemaConfig for additional checks
+
+         Returns:
+             QualityReport with score and issues
+         """
+         self.issues = []  # Reset
+
+         # Check each table
+         for table_name, df in tables.items():
+             self.check_distribution_plausibility(df, table_name)
+             self.check_temporal_consistency(df, table_name)
+
+         # Check referential integrity
+         if relationships:
+             self.check_referential_integrity(tables, relationships)
+
+         # Calculate score
+         base_score = 100
+         for issue in self.issues:
+             if issue.severity == "error":
+                 base_score -= 10
+             elif issue.severity == "warning":
+                 base_score -= 3
+             else:
+                 base_score -= 1
+
+         score = max(0, min(100, base_score))
+
+         # Gather stats
+         stats = {
+             "tables_checked": len(tables),
+             "total_rows": sum(len(df) for df in tables.values()),
+             "total_columns": sum(len(df.columns) for df in tables.values()),
+             "error_count": sum(1 for i in self.issues if i.severity == "error"),
+             "warning_count": sum(1 for i in self.issues if i.severity == "warning"),
+         }
+
+         return QualityReport(
+             score=score,
+             issues=self.issues.copy(),
+             stats=stats,
+         )
+
+
+ def check_quality(tables: Dict[str, "pd.DataFrame"], **kwargs) -> QualityReport:
+     """Convenience function for quick quality checks."""
+     checker = DataQualityChecker()
+     return checker.check_all(tables, **kwargs)
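
A self-contained sketch of the checker on hand-built frames; the out-of-range age, future birth date, and orphaned foreign key are deliberate. `check_referential_integrity` only reads `parent_table`/`parent_key`/`child_table`/`child_key` from each relationship, so a `SimpleNamespace` stands in here for the real relationship model:

    import pandas as pd
    from types import SimpleNamespace
    from misata.quality import check_quality

    users = pd.DataFrame({
        "user_id": [1, 2, 3],
        "age": [25, 34, 150],  # 150 exceeds the "age" plausibility max of 120 -> warning
        "birth_date": pd.to_datetime(["1999-03-01", "1990-06-15", "2099-01-01"]),  # future -> error
    })
    orders = pd.DataFrame({"order_id": [10, 11], "user_id": [1, 99]})  # 99 is an orphan -> error

    rel = SimpleNamespace(parent_table="users", parent_key="user_id",
                          child_table="orders", child_key="user_id")

    report = check_quality({"users": users, "orders": orders}, relationships=[rel])
    print(report.summary())  # expected: Quality Score: 77.0/100 | Errors: 2 | Warnings: 1
    for issue in report.issues:
        print(issue.severity, issue.table, issue.column, issue.message)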
{misata-0.1.0b0 → misata-0.2.0b0}/misata/schema.py
@@ -23,7 +23,7 @@ class Column(BaseModel):
      """

      name: str
-     type: Literal["int", "float", "date", "categorical", "foreign_key", "text", "boolean"]
+     type: Literal["int", "float", "date", "time", "datetime", "categorical", "foreign_key", "text", "boolean"]
      distribution_params: Dict[str, Any] = Field(default_factory=dict)
      nullable: bool = False
      unique: bool = False
@@ -39,8 +39,13 @@ class Column(BaseModel):

          if col_type == "date":
              if "relative_to" not in v:
-                 if "start" not in v or "end" not in v:
-                     raise ValueError("Date columns must have 'start' and 'end' OR 'relative_to' in distribution_params")
+                 # Provide sensible defaults if start/end not specified
+                 if "start" not in v:
+                     from datetime import datetime, timedelta
+                     v["start"] = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")
+                 if "end" not in v:
+                     from datetime import datetime
+                     v["end"] = datetime.now().strftime("%Y-%m-%d")

          if col_type in ["int", "float"]:
              if "distribution" not in v:
{misata-0.1.0b0 → misata-0.2.0b0}/misata/simulator.py
@@ -35,7 +35,8 @@ class DataSimulator:
      """

      def __init__(self, config: SchemaConfig,
-                  apply_semantic_fixes: bool = True, batch_size: int = 10_000):
+                  apply_semantic_fixes: bool = True, batch_size: int = 10_000,
+                  smart_mode: bool = False, use_llm: bool = True):
          """
          Initialize the simulator.

@@ -43,13 +44,19 @@ class DataSimulator:
              config: Schema configuration defining tables, columns, and relationships
              apply_semantic_fixes: Auto-fix column types based on semantic patterns
              batch_size: Number of rows to generate per batch
+             smart_mode: Enable LLM-powered context-aware value generation
+             use_llm: If smart_mode is True, whether to use LLM (vs curated fallbacks)
          """
          self.config = config
          self.context: Dict[str, pd.DataFrame] = {}  # Lightweight context (IDs only)
          self.text_gen = TextGenerator(seed=config.seed)
          self.batch_size = batch_size
+         self.smart_mode = smart_mode
+         self.use_llm = use_llm
+         self._smart_gen = None  # Lazy init
          self._unique_pools: Dict[str, np.ndarray] = {}  # Store pre-generated unique values
          self._unique_counters: Dict[str, int] = {}  # Track usage of unique pools
+         self._smart_pools: Dict[str, np.ndarray] = {}  # Cache smart value pools

          # Apply semantic inference to fix column types
          if apply_semantic_fixes:
@@ -60,6 +67,16 @@ class DataSimulator:
          seed = config.seed if config.seed is not None else np.random.randint(0, 2**32 - 1)
          self.rng = np.random.default_rng(seed)
          np.random.seed(seed)  # For legacy numpy.random calls
+
+     def _get_smart_gen(self):
+         """Lazy initialize SmartValueGenerator."""
+         if self._smart_gen is None:
+             try:
+                 from misata.smart_values import SmartValueGenerator
+                 self._smart_gen = SmartValueGenerator()
+             except Exception:
+                 self._smart_gen = None
+         return self._smart_gen

      def topological_sort(self) -> List[str]:
          """
@@ -210,13 +227,21 @@ class DataSimulator:

          # CATEGORICAL
          if column.type == "categorical":
-             choices = params["choices"]
+             choices = params.get("choices", ["A", "B", "C"])
              probabilities = params.get("probabilities", None)

+             # Ensure choices is a list
+             if not isinstance(choices, list):
+                 choices = list(choices)
+
              if probabilities is not None:
-                 # Normalize probabilities
-                 probabilities = np.array(probabilities)
-                 probabilities = probabilities / probabilities.sum()
+                 # Convert to float array and normalize
+                 probabilities = np.array(probabilities, dtype=float)
+                 prob_sum = probabilities.sum()
+                 if prob_sum > 0:
+                     probabilities = probabilities / prob_sum
+                 else:
+                     probabilities = None

              values = self.rng.choice(choices, size=size, p=probabilities)
              return values
@@ -413,6 +438,35 @@ class DataSimulator:
          # TEXT
          elif column.type == "text":
              text_type = params.get("text_type", "sentence")
+
+             # Smart value generation - check for domain-specific content
+             smart_generate = params.get("smart_generate", False) or self.smart_mode
+             if smart_generate:
+                 smart_gen = self._get_smart_gen()
+                 if smart_gen:
+                     # Check for explicit domain hint or auto-detect
+                     domain_hint = params.get("domain_hint")
+                     context = params.get("context", "")
+
+                     # Create cache key for this column's pool
+                     pool_key = f"{table_name}.{column.name}"
+
+                     if pool_key not in self._smart_pools:
+                         pool = smart_gen.get_pool(
+                             column_name=column.name,
+                             table_name=table_name,
+                             domain_hint=domain_hint,
+                             context=context,
+                             size=100,
+                             use_llm=self.use_llm,
+                         )
+                         if pool:
+                             self._smart_pools[pool_key] = np.array(pool)
+
+                     if pool_key in self._smart_pools:
+                         pool = self._smart_pools[pool_key]
+                         values = self.rng.choice(pool, size=size)
+                         return values

              if text_type == "name":
                  values = np.array([self.text_gen.name() for _ in range(size)])
@@ -441,6 +495,28 @@ class DataSimulator:
              values = self.rng.random(size) < probability
              return values

+         # TIME
+         elif column.type == "time":
+             # Generate random times as HH:MM:SS strings
+             start_hour = params.get("start_hour", 0)
+             end_hour = params.get("end_hour", 24)
+             hours = self.rng.integers(start_hour, end_hour, size=size)
+             minutes = self.rng.integers(0, 60, size=size)
+             seconds = self.rng.integers(0, 60, size=size)
+             values = np.array([f"{h:02d}:{m:02d}:{s:02d}" for h, m, s in zip(hours, minutes, seconds)])
+             return values
+
+         # DATETIME
+         elif column.type == "datetime":
+             # Generate random datetimes within a range
+             start = pd.to_datetime(params.get("start", "2020-01-01"))
+             end = pd.to_datetime(params.get("end", "2024-12-31"))
+             start_int = start.value
+             end_int = end.value
+             random_ints = self.rng.integers(start_int, end_int, size=size)
+             values = pd.to_datetime(random_ints)
+             return values
+
          else:
              raise ValueError(f"Unknown column type: {column.type}")

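
Putting the simulator changes together: `smart_mode` and `use_llm` are constructor-level switches, while `smart_generate`, `domain_hint`, and `context` are read per column from `distribution_params` in the TEXT branch. A hedged sketch of both opt-in styles, using only names visible in the diff above:

    from misata import DataSimulator, load_template
    from misata.schema import Column

    # Global opt-in: route every text column through SmartValueGenerator pools;
    # use_llm=False keeps it on curated fallbacks (no API calls).
    config = load_template("ecommerce")
    sim = DataSimulator(config, smart_mode=True, use_llm=False)

    # Per-column opt-in: smart_generate triggers the same path for one column.
    product_name = Column(
        name="product_name",
        type="text",
        distribution_params={
            "smart_generate": True,
            "domain_hint": "ecommerce",        # forwarded to SmartValueGenerator.get_pool
            "context": "outdoor sports gear",  # ditto; a 100-value pool is cached per table.column
        },
    )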