misata 0.1.0b0__tar.gz → 0.2.0b0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misata-0.2.0b0/LICENSE +21 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/PKG-INFO +4 -2
- {misata-0.1.0b0 → misata-0.2.0b0}/README.md +1 -1
- {misata-0.1.0b0 → misata-0.2.0b0}/misata/__init__.py +13 -2
- {misata-0.1.0b0 → misata-0.2.0b0}/misata/llm_parser.py +41 -2
- misata-0.2.0b0/misata/quality.py +329 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata/schema.py +8 -3
- {misata-0.1.0b0 → misata-0.2.0b0}/misata/simulator.py +81 -5
- misata-0.2.0b0/misata/smart_values.py +593 -0
- misata-0.2.0b0/misata/templates/library.py +344 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata.egg-info/PKG-INFO +4 -2
- {misata-0.1.0b0 → misata-0.2.0b0}/misata.egg-info/SOURCES.txt +4 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/pyproject.toml +1 -1
- {misata-0.1.0b0 → misata-0.2.0b0}/misata/api.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata/audit.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata/benchmark.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata/cli.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata/codegen.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata/curve_fitting.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata/customization.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata/feedback.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata/formulas.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata/generators.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata/hybrid.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata/noise.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata/semantic.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata/story_parser.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata/templates/__init__.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata/validation.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata.egg-info/dependency_links.txt +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata.egg-info/entry_points.txt +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata.egg-info/requires.txt +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/misata.egg-info/top_level.txt +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/setup.cfg +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_api.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_cli.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_constraints.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_curve_fitting.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_enterprise.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_formulas.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_integrity.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_llm_parser.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_schema.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_security.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_semantic.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_simulator.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_templates.py +0 -0
- {misata-0.1.0b0 → misata-0.2.0b0}/tests/test_validation.py +0 -0
misata-0.2.0b0/LICENSE (ADDED)

```
MIT License

Copyright (c) 2024 Muhammed Rasin

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
```
{misata-0.1.0b0 → misata-0.2.0b0}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: misata
-Version: 0.1.0b0
+Version: 0.2.0b0
 Summary: AI-Powered Synthetic Data Engine - Generate realistic multi-table datasets from natural language
 Author-email: Muhammed Rasin <rasinbinabdulla@gmail.com>
 License: MIT
@@ -23,6 +23,7 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Database
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
+License-File: LICENSE
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: numpy>=1.24.0
 Requires-Dist: pydantic>=2.0.0
@@ -41,6 +42,7 @@ Requires-Dist: pytest-benchmark>=4.0.0; extra == "dev"
 Requires-Dist: black>=23.0.0; extra == "dev"
 Requires-Dist: ruff>=0.1.0; extra == "dev"
 Requires-Dist: mypy>=1.5.0; extra == "dev"
+Dynamic: license-file
 
 # 🧠 Misata
 
@@ -48,7 +50,7 @@ Requires-Dist: mypy>=1.5.0; extra == "dev"
 
 No schema writing. No training data. Just describe what you need.
 
-[]()
+[]()
 []()
 []()
 
```
{misata-0.1.0b0 → misata-0.2.0b0}/README.md

```diff
@@ -4,7 +4,7 @@
 
 No schema writing. No training data. Just describe what you need.
 
-[]()
+[]()
 []()
 []()
 
```
{misata-0.1.0b0 → misata-0.2.0b0}/misata/__init__.py

```diff
@@ -9,9 +9,13 @@ Usage:
 
     # Or use the CLI:
     # misata generate --story "A SaaS with 50k users..."
+
+    # Or use pre-built templates:
+    from misata.templates.library import load_template
+    config = load_template("ecommerce")
 """
 
-__version__ = "0.1.0-beta"
+__version__ = "0.2.0-beta"
 __author__ = "Muhammed Rasin"
 
 from misata.schema import (
@@ -26,6 +30,8 @@ from misata.simulator import DataSimulator
 from misata.generators import TextGenerator
 from misata.noise import NoiseInjector, add_noise
 from misata.customization import Customizer, ColumnOverride
+from misata.quality import DataQualityChecker, check_quality
+from misata.templates.library import load_template, list_templates
 
 __all__ = [
     # Core
@@ -43,6 +49,11 @@ __all__ = [
     "add_noise",
     "Customizer",
     "ColumnOverride",
+    # Quality
+    "DataQualityChecker",
+    "check_quality",
+    # Templates
+    "load_template",
+    "list_templates",
 ]
 
-
```
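Taken together, the new exports give 0.2.0b0 a template-to-quality-report round trip. A minimal sketch of that flow, assuming the simulator exposes a generation entry point that returns a dict of DataFrames (the `run()` call is hypothetical; this diff only shows the exports, not the generation API):

```python
# Sketch of the new 0.2.0b0 surface: load a pre-built template, simulate,
# then score the output. `sim.run()` is an assumption, not shown in this diff.
from misata import DataSimulator, load_template, check_quality

config = load_template("ecommerce")          # template name from the docstring above
sim = DataSimulator(config, smart_mode=False)
tables = sim.run()                           # hypothetical: {table_name: DataFrame}

report = check_quality(tables)
print(report.summary())                      # "Quality Score: ... | Errors: ... | Warnings: ..."
if not report.passed:
    for issue in report.issues:
        print(issue.severity, issue.table, issue.column, issue.message)
```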
{misata-0.1.0b0 → misata-0.2.0b0}/misata/llm_parser.py

```diff
@@ -24,7 +24,11 @@ def _load_env():
     """Load environment variables from .env file."""
     env_paths = [
         Path.cwd() / ".env",
-        Path(
+        Path.cwd().parent / ".env",  # apps/.env or api parent
+        Path.cwd().parent.parent / ".env",  # Misata root from apps/api
+        Path(__file__).parent.parent / ".env",  # packages/core/.env
+        Path(__file__).parent.parent.parent / ".env",  # packages/.env
+        Path(__file__).parent.parent.parent.parent / ".env",  # Misata root from packages/core/misata
         Path.home() / ".misata" / ".env",
     ]
@@ -35,7 +39,9 @@ def _load_env():
                 line = line.strip()
                 if line and not line.startswith("#") and "=" in line:
                     key, _, value = line.partition("=")
-
+                    # Remove quotes if present
+                    value = value.strip().strip("'\"")
+                    os.environ.setdefault(key.strip(), value)
             break
 
 _load_env()
```
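The practical effect of the new quote handling: a `KEY="value"` line in `.env` no longer exports the surrounding quotes as part of the value. A standalone sketch of the parsing step (the key name and value are made up for illustration):

```python
# Reproduces the new .env line handling in isolation; MY_API_KEY is an
# illustrative key name, not one the package requires.
import os

line = 'MY_API_KEY="s3cret"'
if line and not line.startswith("#") and "=" in line:
    key, _, value = line.partition("=")
    value = value.strip().strip("'\"")      # new in 0.2.0b0: strip wrapping quotes
    os.environ.setdefault(key.strip(), value)

print(os.environ["MY_API_KEY"])             # s3cret  (previously the quotes survived)
```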
{misata-0.1.0b0 → misata-0.2.0b0}/misata/llm_parser.py (continued)

```diff
@@ -82,6 +88,39 @@ Instead of guessing parameters, you can provide "control_points" to draw the shape
 Format: {"distribution": "normal", "control_points": [{"x": 10, "y": 0.1}, {"x": 50, "y": 0.9}]}
 Misata will mathematically solve for the best parameters.
 
+### SMART DEFAULTS (Use These for Realistic Data):
+
+**Age columns:**
+- type: "int", distribution: "normal", mean: 35, std: 12, min: 18, max: 80
+
+**Price/Amount columns:**
+- type: "float", distribution: "exponential", scale: 50, min: 0.01, decimals: 2
+- OR for products: uniform min: 9.99, max: 499.99
+
+**Rating columns (1-5 stars):**
+- type: "int", distribution: "categorical", choices: [1,2,3,4,5], probabilities: [0.05, 0.08, 0.15, 0.32, 0.40]
+
+**Quantity/Count columns:**
+- type: "int", distribution: "poisson", lambda: 3, min: 1
+
+**Duration (minutes):**
+- type: "int", distribution: "normal", mean: 45, std: 20, min: 5, max: 180
+
+**Percentage columns:**
+- type: "float", distribution: "uniform", min: 0.0, max: 100.0, decimals: 1
+
+**Status columns:**
+- type: "categorical", choices: ["active", "inactive", "pending"], probabilities: [0.70, 0.20, 0.10]
+
+**Boolean probabilities:**
+- is_verified: probability: 0.85
+- is_premium: probability: 0.25
+- is_active: probability: 0.80
+
+**Date columns:**
+- For recent data: bias last 30% of range with 70% of values
+- Always use realistic date ranges (not 1970-2100)
+
 ## OUTPUT FORMAT
 
 {
```
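Following these defaults, a "rating" column should come back from the LLM shaped roughly like this. The field names match the `Column` model changed later in this diff; the full OUTPUT FORMAT envelope is truncated out of this hunk, so the exact wrapper is an assumption:

```python
# Hedged sketch: a column spec produced under the "Rating columns" default.
# Keys follow the Column schema (name/type/distribution_params); the JSON
# envelope the parser actually emits is not shown in this diff.
rating_column = {
    "name": "rating",
    "type": "int",
    "distribution_params": {
        "distribution": "categorical",
        "choices": [1, 2, 3, 4, 5],
        "probabilities": [0.05, 0.08, 0.15, 0.32, 0.40],  # skews toward 4-5 stars
    },
}
```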
misata-0.2.0b0/misata/quality.py (ADDED, 329 lines)

```python
"""
Data Quality Checker for Synthetic Data Validation.

This module validates generated synthetic data for:
- Distribution plausibility
- Referential integrity
- Temporal consistency
- Domain-specific rules
"""

from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass, field
import warnings


@dataclass
class QualityIssue:
    """Represents a single data quality issue."""
    severity: str  # "error", "warning", "info"
    category: str  # "distribution", "integrity", "temporal", "domain"
    table: str
    column: Optional[str]
    message: str
    details: Dict[str, Any] = field(default_factory=dict)


@dataclass
class QualityReport:
    """Complete quality report for generated data."""
    score: float  # 0-100
    issues: List[QualityIssue]
    stats: Dict[str, Any]

    @property
    def passed(self) -> bool:
        """Returns True if no errors (warnings OK)."""
        return not any(i.severity == "error" for i in self.issues)

    def summary(self) -> str:
        """Human-readable summary."""
        errors = sum(1 for i in self.issues if i.severity == "error")
        warnings = sum(1 for i in self.issues if i.severity == "warning")
        return f"Quality Score: {self.score:.1f}/100 | Errors: {errors} | Warnings: {warnings}"


class DataQualityChecker:
    """
    Validate generated synthetic data for realism and correctness.

    Usage:
        checker = DataQualityChecker()
        report = checker.check_all(tables, relationships, schema)

        if not report.passed:
            print("Issues found:", report.issues)
    """

    # Domain-specific plausibility rules
    PLAUSIBILITY_RULES = {
        # Column name patterns -> (min, max, description)
        "age": (0, 120, "Human age"),
        "price": (0, 1_000_000, "Price"),
        "quantity": (0, 10_000, "Quantity"),
        "rating": (1, 5, "Rating"),
        "percentage": (0, 100, "Percentage"),
        "year": (1900, 2100, "Year"),
        "month": (1, 12, "Month"),
        "day": (1, 31, "Day"),
        "hour": (0, 23, "Hour"),
        "minute": (0, 59, "Minute"),
        "score": (0, 100, "Score"),
        "count": (0, 1_000_000, "Count"),
        "duration": (0, 10_000, "Duration"),
    }

    def __init__(self, strict: bool = False):
        """
        Initialize the quality checker.

        Args:
            strict: If True, warnings become errors
        """
        self.strict = strict
        self.issues: List[QualityIssue] = []

    def _add_issue(
        self,
        severity: str,
        category: str,
        table: str,
        column: Optional[str],
        message: str,
        details: Optional[Dict] = None,
    ):
        """Add an issue to the list."""
        if self.strict and severity == "warning":
            severity = "error"

        self.issues.append(QualityIssue(
            severity=severity,
            category=category,
            table=table,
            column=column,
            message=message,
            details=details or {},
        ))

    def check_distribution_plausibility(
        self,
        df: "pd.DataFrame",
        table_name: str,
    ) -> None:
        """
        Check if numeric distributions are plausible for their domains.

        Args:
            df: DataFrame to check
            table_name: Name of the table
        """
        import pandas as pd
        import numpy as np

        for col in df.columns:
            col_lower = col.lower()

            # Check against plausibility rules
            for pattern, (min_val, max_val, description) in self.PLAUSIBILITY_RULES.items():
                if pattern in col_lower:
                    if pd.api.types.is_numeric_dtype(df[col]):
                        actual_min = df[col].min()
                        actual_max = df[col].max()

                        if actual_min < min_val:
                            self._add_issue(
                                "warning", "distribution", table_name, col,
                                f"{description} column '{col}' has min {actual_min} < expected {min_val}",
                                {"actual_min": actual_min, "expected_min": min_val}
                            )

                        if actual_max > max_val:
                            self._add_issue(
                                "warning", "distribution", table_name, col,
                                f"{description} column '{col}' has max {actual_max} > expected {max_val}",
                                {"actual_max": actual_max, "expected_max": max_val}
                            )
                    break

            # Check for all-null columns
            if df[col].isna().all():
                self._add_issue(
                    "error", "distribution", table_name, col,
                    f"Column '{col}' is entirely NULL",
                )

            # Check for zero variance (all same value)
            if pd.api.types.is_numeric_dtype(df[col]) and df[col].std() == 0:
                self._add_issue(
                    "warning", "distribution", table_name, col,
                    f"Column '{col}' has zero variance (all values identical)",
                    {"value": df[col].iloc[0]}
                )

    def check_referential_integrity(
        self,
        tables: Dict[str, "pd.DataFrame"],
        relationships: List[Any],
    ) -> None:
        """
        Verify all foreign key references are valid.

        Args:
            tables: Dict of table_name -> DataFrame
            relationships: List of Relationship objects
        """
        for rel in relationships:
            parent_table = rel.parent_table
            child_table = rel.child_table
            parent_key = rel.parent_key
            child_key = rel.child_key

            if parent_table not in tables:
                self._add_issue(
                    "error", "integrity", child_table, child_key,
                    f"Parent table '{parent_table}' not found for FK '{child_key}'",
                )
                continue

            if child_table not in tables:
                continue  # Child table might not exist yet

            parent_df = tables[parent_table]
            child_df = tables[child_table]

            if parent_key not in parent_df.columns:
                self._add_issue(
                    "error", "integrity", parent_table, parent_key,
                    f"Parent key '{parent_key}' not found in table '{parent_table}'",
                )
                continue

            if child_key not in child_df.columns:
                self._add_issue(
                    "error", "integrity", child_table, child_key,
                    f"Child key '{child_key}' not found in table '{child_table}'",
                )
                continue

            # Check for orphaned records
            parent_ids = set(parent_df[parent_key].dropna().unique())
            child_ids = set(child_df[child_key].dropna().unique())
            orphans = child_ids - parent_ids

            if orphans:
                orphan_pct = len(orphans) / len(child_ids) * 100
                self._add_issue(
                    "error" if orphan_pct > 1 else "warning",
                    "integrity", child_table, child_key,
                    f"{len(orphans)} orphaned FK values ({orphan_pct:.1f}%) in '{child_key}' -> '{parent_table}.{parent_key}'",
                    {"orphan_count": len(orphans), "orphan_pct": orphan_pct}
                )

    def check_temporal_consistency(
        self,
        df: "pd.DataFrame",
        table_name: str,
    ) -> None:
        """
        Ensure temporal columns are consistent.

        Checks:
        - created_at < updated_at
        - start_date < end_date
        - birth_date in past
        """
        import pandas as pd

        date_cols = [c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])]

        # Check created < updated
        if "created_at" in date_cols and "updated_at" in date_cols:
            violations = (df["created_at"] > df["updated_at"]).sum()
            if violations > 0:
                self._add_issue(
                    "error", "temporal", table_name, "created_at",
                    f"{violations} rows have created_at > updated_at",
                    {"violation_count": violations}
                )

        # Check start < end
        if "start_date" in date_cols and "end_date" in date_cols:
            violations = (df["start_date"] > df["end_date"]).sum()
            if violations > 0:
                self._add_issue(
                    "error", "temporal", table_name, "start_date",
                    f"{violations} rows have start_date > end_date",
                    {"violation_count": violations}
                )

        # Check birth_date is in past
        if "birth_date" in date_cols or "date_of_birth" in date_cols:
            col = "birth_date" if "birth_date" in date_cols else "date_of_birth"
            future_births = (df[col] > pd.Timestamp.now()).sum()
            if future_births > 0:
                self._add_issue(
                    "error", "temporal", table_name, col,
                    f"{future_births} rows have birth_date in the future",
                    {"violation_count": future_births}
                )

    def check_all(
        self,
        tables: Dict[str, "pd.DataFrame"],
        relationships: Optional[List[Any]] = None,
        schema: Optional[Any] = None,
    ) -> QualityReport:
        """
        Run all quality checks and generate a report.

        Args:
            tables: Dict of table_name -> DataFrame
            relationships: Optional list of Relationship objects
            schema: Optional SchemaConfig for additional checks

        Returns:
            QualityReport with score and issues
        """
        self.issues = []  # Reset

        # Check each table
        for table_name, df in tables.items():
            self.check_distribution_plausibility(df, table_name)
            self.check_temporal_consistency(df, table_name)

        # Check referential integrity
        if relationships:
            self.check_referential_integrity(tables, relationships)

        # Calculate score
        base_score = 100
        for issue in self.issues:
            if issue.severity == "error":
                base_score -= 10
            elif issue.severity == "warning":
                base_score -= 3
            else:
                base_score -= 1

        score = max(0, min(100, base_score))

        # Gather stats
        stats = {
            "tables_checked": len(tables),
            "total_rows": sum(len(df) for df in tables.values()),
            "total_columns": sum(len(df.columns) for df in tables.values()),
            "error_count": sum(1 for i in self.issues if i.severity == "error"),
            "warning_count": sum(1 for i in self.issues if i.severity == "warning"),
        }

        return QualityReport(
            score=score,
            issues=self.issues.copy(),
            stats=stats,
        )


def check_quality(tables: Dict[str, "pd.DataFrame"], **kwargs) -> QualityReport:
    """Convenience function for quick quality checks."""
    checker = DataQualityChecker()
    return checker.check_all(tables, **kwargs)
```
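A quick demonstration of the new checker on two toy tables, including an orphaned foreign key. The `Relationship` stub below uses `SimpleNamespace` because `check_referential_integrity` only reads the four attributes shown:

```python
import pandas as pd
from types import SimpleNamespace
from misata.quality import DataQualityChecker

users = pd.DataFrame({"user_id": [1, 2, 3], "age": [25, 34, 51]})
orders = pd.DataFrame({"order_id": [10, 11], "user_id": [1, 99]})   # 99 has no parent

# Stub carrying the four attributes the checker reads (rel.parent_table, etc.).
rel = SimpleNamespace(parent_table="users", child_table="orders",
                      parent_key="user_id", child_key="user_id")

report = DataQualityChecker().check_all({"users": users, "orders": orders}, [rel])
print(report.summary())   # 50% orphans -> one "error" issue, score 90.0/100
print(report.passed)      # False
```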
{misata-0.1.0b0 → misata-0.2.0b0}/misata/schema.py

```diff
@@ -23,7 +23,7 @@ class Column(BaseModel):
     """
 
     name: str
-    type: Literal["int", "float", "date", "categorical", "foreign_key", "text", "boolean"]
+    type: Literal["int", "float", "date", "time", "datetime", "categorical", "foreign_key", "text", "boolean"]
     distribution_params: Dict[str, Any] = Field(default_factory=dict)
     nullable: bool = False
     unique: bool = False
@@ -39,8 +39,13 @@
 
         if col_type == "date":
             if "relative_to" not in v:
-
-
+                # Provide sensible defaults if start/end not specified
+                if "start" not in v:
+                    from datetime import datetime, timedelta
+                    v["start"] = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")
+                if "end" not in v:
+                    from datetime import datetime
+                    v["end"] = datetime.now().strftime("%Y-%m-%d")
 
         if col_type in ["int", "float"]:
            if "distribution" not in v:
```
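With these validator defaults, a date column no longer needs explicit bounds: empty `distribution_params` gets a trailing one-year window. A sketch, assuming the validator runs on explicitly passed params as pydantic field validators normally do:

```python
# Sketch: date columns now self-fill start/end. The printed dates depend on
# the day you run this; the values below are for illustration only.
from misata.schema import Column

col = Column(name="signup_date", type="date", distribution_params={})
print(col.distribution_params)
# e.g. {'start': '2024-01-15', 'end': '2025-01-15'}
```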
{misata-0.1.0b0 → misata-0.2.0b0}/misata/simulator.py

```diff
@@ -35,7 +35,8 @@ class DataSimulator:
     """
 
     def __init__(self, config: SchemaConfig,
-                 apply_semantic_fixes: bool = True, batch_size: int = 10_000):
+                 apply_semantic_fixes: bool = True, batch_size: int = 10_000,
+                 smart_mode: bool = False, use_llm: bool = True):
         """
         Initialize the simulator.
 
```
```diff
@@ -43,13 +44,19 @@
             config: Schema configuration defining tables, columns, and relationships
             apply_semantic_fixes: Auto-fix column types based on semantic patterns
             batch_size: Number of rows to generate per batch
+            smart_mode: Enable LLM-powered context-aware value generation
+            use_llm: If smart_mode is True, whether to use LLM (vs curated fallbacks)
         """
         self.config = config
         self.context: Dict[str, pd.DataFrame] = {}  # Lightweight context (IDs only)
         self.text_gen = TextGenerator(seed=config.seed)
         self.batch_size = batch_size
+        self.smart_mode = smart_mode
+        self.use_llm = use_llm
+        self._smart_gen = None  # Lazy init
         self._unique_pools: Dict[str, np.ndarray] = {}  # Store pre-generated unique values
         self._unique_counters: Dict[str, int] = {}  # Track usage of unique pools
+        self._smart_pools: Dict[str, np.ndarray] = {}  # Cache smart value pools
 
         # Apply semantic inference to fix column types
         if apply_semantic_fixes:
```
```diff
@@ -60,6 +67,16 @@ class DataSimulator:
         seed = config.seed if config.seed is not None else np.random.randint(0, 2**32 - 1)
         self.rng = np.random.default_rng(seed)
         np.random.seed(seed)  # For legacy numpy.random calls
+
+    def _get_smart_gen(self):
+        """Lazy initialize SmartValueGenerator."""
+        if self._smart_gen is None:
+            try:
+                from misata.smart_values import SmartValueGenerator
+                self._smart_gen = SmartValueGenerator()
+            except Exception:
+                self._smart_gen = None
+        return self._smart_gen
 
     def topological_sort(self) -> List[str]:
         """
```
```diff
@@ -210,13 +227,21 @@ class DataSimulator:
 
         # CATEGORICAL
         if column.type == "categorical":
-            choices = params
+            choices = params.get("choices", ["A", "B", "C"])
             probabilities = params.get("probabilities", None)
 
+            # Ensure choices is a list
+            if not isinstance(choices, list):
+                choices = list(choices)
+
             if probabilities is not None:
-                #
-                probabilities = np.array(probabilities)
-
+                # Convert to float array and normalize
+                probabilities = np.array(probabilities, dtype=float)
+                prob_sum = probabilities.sum()
+                if prob_sum > 0:
+                    probabilities = probabilities / prob_sum
+                else:
+                    probabilities = None
 
             values = self.rng.choice(choices, size=size, p=probabilities)
             return values
```
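The normalization matters because `Generator.choice` raises `ValueError: probabilities do not sum to 1` on unnormalized weights, which LLM-suggested probabilities often are. The same step in isolation:

```python
# Standalone reproduction of the new normalization step.
import numpy as np

rng = np.random.default_rng(42)
probabilities = np.array([0.5, 0.3, 0.3], dtype=float)   # sums to 1.1, would crash choice()
probabilities = probabilities / probabilities.sum()      # rescale to sum to 1.0
print(rng.choice(["A", "B", "C"], size=5, p=probabilities))
```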
{misata-0.1.0b0 → misata-0.2.0b0}/misata/simulator.py (continued)

```diff
@@ -413,6 +438,35 @@ class DataSimulator:
         # TEXT
         elif column.type == "text":
             text_type = params.get("text_type", "sentence")
+
+            # Smart value generation - check for domain-specific content
+            smart_generate = params.get("smart_generate", False) or self.smart_mode
+            if smart_generate:
+                smart_gen = self._get_smart_gen()
+                if smart_gen:
+                    # Check for explicit domain hint or auto-detect
+                    domain_hint = params.get("domain_hint")
+                    context = params.get("context", "")
+
+                    # Create cache key for this column's pool
+                    pool_key = f"{table_name}.{column.name}"
+
+                    if pool_key not in self._smart_pools:
+                        pool = smart_gen.get_pool(
+                            column_name=column.name,
+                            table_name=table_name,
+                            domain_hint=domain_hint,
+                            context=context,
+                            size=100,
+                            use_llm=self.use_llm,
+                        )
+                        if pool:
+                            self._smart_pools[pool_key] = np.array(pool)
+
+                    if pool_key in self._smart_pools:
+                        pool = self._smart_pools[pool_key]
+                        values = self.rng.choice(pool, size=size)
+                        return values
 
             if text_type == "name":
                 values = np.array([self.text_gen.name() for _ in range(size)])
```
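Per this branch, smart generation can be opted into per column through `distribution_params` rather than globally via `smart_mode`. A sketch of such a spec; the `smart_generate`, `domain_hint`, and `context` keys are all read in the hunk above, while the hint and context values here are illustrative:

```python
# Illustrative distribution_params for a text column that should draw from a
# cached SmartValueGenerator pool instead of generic generated sentences.
text_params = {
    "text_type": "sentence",        # fallback if the smart pool is unavailable
    "smart_generate": True,         # per-column opt-in (or DataSimulator(smart_mode=True))
    "domain_hint": "product_name",  # illustrative hint passed to get_pool()
    "context": "items sold by an outdoor-gear store",
}
```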
{misata-0.1.0b0 → misata-0.2.0b0}/misata/simulator.py (continued)

```diff
@@ -441,6 +495,28 @@ class DataSimulator:
             values = self.rng.random(size) < probability
             return values
 
+        # TIME
+        elif column.type == "time":
+            # Generate random times as HH:MM:SS strings
+            start_hour = params.get("start_hour", 0)
+            end_hour = params.get("end_hour", 24)
+            hours = self.rng.integers(start_hour, end_hour, size=size)
+            minutes = self.rng.integers(0, 60, size=size)
+            seconds = self.rng.integers(0, 60, size=size)
+            values = np.array([f"{h:02d}:{m:02d}:{s:02d}" for h, m, s in zip(hours, minutes, seconds)])
+            return values
+
+        # DATETIME
+        elif column.type == "datetime":
+            # Generate random datetimes within a range
+            start = pd.to_datetime(params.get("start", "2020-01-01"))
+            end = pd.to_datetime(params.get("end", "2024-12-31"))
+            start_int = start.value
+            end_int = end.value
+            random_ints = self.rng.integers(start_int, end_int, size=size)
+            values = pd.to_datetime(random_ints)
+            return values
+
         else:
             raise ValueError(f"Unknown column type: {column.type}")
 
```
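The new `datetime` branch works by sampling uniform int64 nanosecond offsets and letting pandas convert them back. Reproduced standalone:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
start = pd.to_datetime("2020-01-01")
end = pd.to_datetime("2024-12-31")

# Timestamp.value is nanoseconds since the Unix epoch; sampling integers in
# [start.value, end.value) gives uniformly distributed datetimes in the range.
random_ints = rng.integers(start.value, end.value, size=3)
print(pd.to_datetime(random_ints))
```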