misata-0.1.0b0-py3-none-any.whl → misata-0.3.0b0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- misata/__init__.py +89 -3
- misata/cache.py +258 -0
- misata/constraints.py +307 -0
- misata/context.py +259 -0
- misata/exceptions.py +277 -0
- misata/generators/__init__.py +29 -0
- misata/generators/base.py +586 -0
- misata/llm_parser.py +41 -2
- misata/profiles.py +332 -0
- misata/quality.py +329 -0
- misata/schema.py +8 -3
- misata/simulator.py +81 -5
- misata/smart_values.py +762 -0
- misata/streaming.py +228 -0
- misata/templates/library.py +344 -0
- {misata-0.1.0b0.dist-info → misata-0.3.0b0.dist-info}/METADATA +4 -2
- misata-0.3.0b0.dist-info/RECORD +37 -0
- misata-0.3.0b0.dist-info/licenses/LICENSE +21 -0
- misata-0.1.0b0.dist-info/RECORD +0 -25
- {misata-0.1.0b0.dist-info → misata-0.3.0b0.dist-info}/WHEEL +0 -0
- {misata-0.1.0b0.dist-info → misata-0.3.0b0.dist-info}/entry_points.txt +0 -0
- {misata-0.1.0b0.dist-info → misata-0.3.0b0.dist-info}/top_level.txt +0 -0
misata/profiles.py
ADDED
@@ -0,0 +1,332 @@
+"""
+Distribution Profiles for Realistic Data Generation.
+
+Pre-configured distribution parameters that match real-world patterns
+for common data types like age, salary, prices, etc.
+"""
+
+from typing import Any, Dict, List, Optional, Union
+import numpy as np
+
+
+class DistributionProfile:
+    """A named distribution configuration for realistic generation.
+
+    Example:
+        profile = DistributionProfile(
+            name="age",
+            distribution="mixture",
+            params={
+                "components": [
+                    {"mean": 35, "std": 12, "weight": 0.6},  # Working age
+                    {"mean": 70, "std": 8, "weight": 0.2},   # Retirees
+                    {"mean": 12, "std": 4, "weight": 0.2},   # Children
+                ]
+            }
+        )
+        values = profile.generate(1000)
+    """
+
+    def __init__(
+        self,
+        name: str,
+        distribution: str,
+        params: Dict[str, Any],
+        min_value: Optional[float] = None,
+        max_value: Optional[float] = None,
+        decimals: Optional[int] = None,
+    ):
+        self.name = name
+        self.distribution = distribution
+        self.params = params
+        self.min_value = min_value
+        self.max_value = max_value
+        self.decimals = decimals
+
+    def generate(
+        self,
+        size: int,
+        rng: Optional[np.random.Generator] = None
+    ) -> np.ndarray:
+        """Generate values according to this profile."""
+        if rng is None:
+            rng = np.random.default_rng()
+
+        if self.distribution == "normal":
+            mean = self.params.get("mean", 50)
+            std = self.params.get("std", 10)
+            values = rng.normal(mean, std, size)
+
+        elif self.distribution == "lognormal":
+            mean = self.params.get("mean", 0)
+            sigma = self.params.get("sigma", 1)
+            values = rng.lognormal(mean, sigma, size)
+
+        elif self.distribution == "exponential":
+            scale = self.params.get("scale", 1.0)
+            values = rng.exponential(scale, size)
+
+        elif self.distribution == "pareto":
+            alpha = self.params.get("alpha", 2.0)
+            min_val = self.params.get("min", 1.0)
+            values = (rng.pareto(alpha, size) + 1) * min_val
+
+        elif self.distribution == "beta":
+            a = self.params.get("a", 2)
+            b = self.params.get("b", 5)
+            scale = self.params.get("scale", 1.0)
+            values = rng.beta(a, b, size) * scale
+
+        elif self.distribution == "mixture":
+            # Gaussian mixture model
+            components = self.params.get("components", [])
+            if not components:
+                values = rng.normal(0, 1, size)
+            else:
+                weights = np.array([c.get("weight", 1) for c in components])
+                weights = weights / weights.sum()
+
+                # Sample component indices
+                component_indices = rng.choice(
+                    len(components), size=size, p=weights
+                )
+
+                values = np.zeros(size)
+                for i, comp in enumerate(components):
+                    mask = component_indices == i
+                    n = mask.sum()
+                    if n > 0:
+                        values[mask] = rng.normal(
+                            comp.get("mean", 0),
+                            comp.get("std", 1),
+                            n
+                        )
+
+        elif self.distribution == "zipf":
+            # Zipf distribution for long-tail data
+            a = self.params.get("alpha", 2.0)
+            values = rng.zipf(a, size).astype(float)
+
+        elif self.distribution == "uniform":
+            low = self.params.get("min", 0)
+            high = self.params.get("max", 100)
+            values = rng.uniform(low, high, size)
+
+        else:
+            # Default to uniform
+            values = rng.uniform(0, 100, size)
+
+        # Apply constraints
+        if self.min_value is not None:
+            values = np.maximum(values, self.min_value)
+        if self.max_value is not None:
+            values = np.minimum(values, self.max_value)
+        if self.decimals is not None:
+            values = np.round(values, self.decimals)
+
+        return values
+
+
+# ============ Pre-built Profiles ============
+
+PROFILES: Dict[str, DistributionProfile] = {}
+
+
+def _register_profile(profile: DistributionProfile) -> None:
+    """Register a profile by name."""
+    PROFILES[profile.name] = profile
+
+
+# Age distributions
+_register_profile(DistributionProfile(
+    name="age_adult",
+    distribution="mixture",
+    params={
+        "components": [
+            {"mean": 28, "std": 6, "weight": 0.3},   # Young adults
+            {"mean": 42, "std": 10, "weight": 0.45}, # Middle age
+            {"mean": 62, "std": 8, "weight": 0.25},  # Older adults
+        ]
+    },
+    min_value=18,
+    max_value=100,
+    decimals=0,
+))
+
+_register_profile(DistributionProfile(
+    name="age_population",
+    distribution="mixture",
+    params={
+        "components": [
+            {"mean": 8, "std": 4, "weight": 0.15},   # Children
+            {"mean": 25, "std": 8, "weight": 0.25},  # Young adults
+            {"mean": 42, "std": 12, "weight": 0.35}, # Middle age
+            {"mean": 68, "std": 10, "weight": 0.25}, # Seniors
+        ]
+    },
+    min_value=0,
+    max_value=105,
+    decimals=0,
+))
+
+# Salary distributions
+_register_profile(DistributionProfile(
+    name="salary_usd",
+    distribution="lognormal",
+    params={"mean": 11.0, "sigma": 0.5},  # Log of ~$60k median
+    min_value=25000,
+    max_value=500000,
+    decimals=0,
+))
+
+_register_profile(DistributionProfile(
+    name="salary_tech",
+    distribution="mixture",
+    params={
+        "components": [
+            {"mean": 75000, "std": 15000, "weight": 0.2},   # Junior
+            {"mean": 120000, "std": 25000, "weight": 0.4},  # Mid
+            {"mean": 180000, "std": 40000, "weight": 0.3},  # Senior
+            {"mean": 280000, "std": 60000, "weight": 0.1},  # Staff+
+        ]
+    },
+    min_value=50000,
+    max_value=600000,
+    decimals=0,
+))
+
+# Price distributions
+_register_profile(DistributionProfile(
+    name="price_retail",
+    distribution="lognormal",
+    params={"mean": 3.5, "sigma": 1.2},  # ~$30 median
+    min_value=0.99,
+    max_value=10000,
+    decimals=2,
+))
+
+_register_profile(DistributionProfile(
+    name="price_saas",
+    distribution="mixture",
+    params={
+        "components": [
+            {"mean": 15, "std": 5, "weight": 0.3},     # Basic tier
+            {"mean": 49, "std": 15, "weight": 0.4},    # Pro tier
+            {"mean": 199, "std": 50, "weight": 0.25},  # Enterprise
+            {"mean": 999, "std": 200, "weight": 0.05}, # Custom
+        ]
+    },
+    min_value=0,
+    max_value=5000,
+    decimals=0,
+))
+
+# Transaction amounts
+_register_profile(DistributionProfile(
+    name="transaction_amount",
+    distribution="pareto",
+    params={"alpha": 2.5, "min": 10},
+    min_value=1,
+    max_value=100000,
+    decimals=2,
+))
+
+# Counts / quantities
+_register_profile(DistributionProfile(
+    name="order_quantity",
+    distribution="zipf",
+    params={"alpha": 2.0},
+    min_value=1,
+    max_value=100,
+    decimals=0,
+))
+
+# Time-related
+_register_profile(DistributionProfile(
+    name="session_duration_seconds",
+    distribution="lognormal",
+    params={"mean": 5.5, "sigma": 1.5},  # ~4 min median
+    min_value=1,
+    max_value=7200,  # 2 hours max
+    decimals=0,
+))
+
+# Ratings and scores
+_register_profile(DistributionProfile(
+    name="rating_5star",
+    distribution="beta",
+    params={"a": 5, "b": 2, "scale": 5},  # Skewed towards higher ratings
+    min_value=1,
+    max_value=5,
+    decimals=1,
+))
+
+_register_profile(DistributionProfile(
+    name="nps_score",
+    distribution="mixture",
+    params={
+        "components": [
+            {"mean": 3, "std": 2, "weight": 0.15},   # Detractors
+            {"mean": 7, "std": 1, "weight": 0.25},   # Passives
+            {"mean": 9, "std": 0.8, "weight": 0.6},  # Promoters
+        ]
+    },
+    min_value=0,
+    max_value=10,
+    decimals=0,
+))
+
+# Percentages
+_register_profile(DistributionProfile(
+    name="conversion_rate",
+    distribution="beta",
+    params={"a": 2, "b": 50, "scale": 100},  # Low conversion (1-5%)
+    min_value=0,
+    max_value=100,
+    decimals=2,
+))
+
+_register_profile(DistributionProfile(
+    name="churn_rate",
+    distribution="beta",
+    params={"a": 1.5, "b": 30, "scale": 100},  # ~5% typical
+    min_value=0,
+    max_value=100,
+    decimals=2,
+))
+
+
+def get_profile(name: str) -> Optional[DistributionProfile]:
+    """Get a profile by name."""
+    return PROFILES.get(name)
+
+
+def list_profiles() -> List[str]:
+    """List all available profile names."""
+    return list(PROFILES.keys())
+
+
+def generate_with_profile(
+    profile_name: str,
+    size: int,
+    rng: Optional[np.random.Generator] = None
+) -> np.ndarray:
+    """Generate values using a named profile.
+
+    Args:
+        profile_name: Name of the profile (e.g., "salary_tech")
+        size: Number of values to generate
+        rng: Random number generator
+
+    Returns:
+        Array of generated values
+
+    Raises:
+        ValueError: If profile not found
+    """
+    profile = get_profile(profile_name)
+    if profile is None:
+        available = ", ".join(list_profiles())
+        raise ValueError(f"Unknown profile: {profile_name}. Available: {available}")
+
+    return profile.generate(size, rng)
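For orientation, here is a minimal usage sketch of the new profiles module. The import path `misata.profiles` follows the file layout above; the seed and the output comments are illustrative, not from the package:

```python
import numpy as np
from misata.profiles import generate_with_profile, list_profiles

rng = np.random.default_rng(42)  # fixed seed for reproducible output
print(list_profiles())           # ["age_adult", "age_population", "salary_usd", ...]

# 1,000 tech salaries drawn from the 4-component Gaussian mixture defined above,
# clipped to the profile's [50000, 600000] range and rounded to whole dollars.
salaries = generate_with_profile("salary_tech", size=1000, rng=rng)
```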
misata/quality.py
ADDED
@@ -0,0 +1,329 @@
+"""
+Data Quality Checker for Synthetic Data Validation.
+
+This module validates generated synthetic data for:
+- Distribution plausibility
+- Referential integrity
+- Temporal consistency
+- Domain-specific rules
+"""
+
+from typing import Dict, List, Any, Optional, Tuple
+from dataclasses import dataclass, field
+import warnings
+
+
+@dataclass
+class QualityIssue:
+    """Represents a single data quality issue."""
+    severity: str  # "error", "warning", "info"
+    category: str  # "distribution", "integrity", "temporal", "domain"
+    table: str
+    column: Optional[str]
+    message: str
+    details: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class QualityReport:
+    """Complete quality report for generated data."""
+    score: float  # 0-100
+    issues: List[QualityIssue]
+    stats: Dict[str, Any]
+
+    @property
+    def passed(self) -> bool:
+        """Returns True if no errors (warnings OK)."""
+        return not any(i.severity == "error" for i in self.issues)
+
+    def summary(self) -> str:
+        """Human-readable summary."""
+        errors = sum(1 for i in self.issues if i.severity == "error")
+        warnings = sum(1 for i in self.issues if i.severity == "warning")
+        return f"Quality Score: {self.score:.1f}/100 | Errors: {errors} | Warnings: {warnings}"
+
+
+class DataQualityChecker:
+    """
+    Validate generated synthetic data for realism and correctness.
+
+    Usage:
+        checker = DataQualityChecker()
+        report = checker.check_all(tables, relationships, schema)
+
+        if not report.passed:
+            print("Issues found:", report.issues)
+    """
+
+    # Domain-specific plausibility rules
+    PLAUSIBILITY_RULES = {
+        # Column name patterns -> (min, max, description)
+        "age": (0, 120, "Human age"),
+        "price": (0, 1_000_000, "Price"),
+        "quantity": (0, 10_000, "Quantity"),
+        "rating": (1, 5, "Rating"),
+        "percentage": (0, 100, "Percentage"),
+        "year": (1900, 2100, "Year"),
+        "month": (1, 12, "Month"),
+        "day": (1, 31, "Day"),
+        "hour": (0, 23, "Hour"),
+        "minute": (0, 59, "Minute"),
+        "score": (0, 100, "Score"),
+        "count": (0, 1_000_000, "Count"),
+        "duration": (0, 10_000, "Duration"),
+    }
+
+    def __init__(self, strict: bool = False):
+        """
+        Initialize the quality checker.
+
+        Args:
+            strict: If True, warnings become errors
+        """
+        self.strict = strict
+        self.issues: List[QualityIssue] = []
+
+    def _add_issue(
+        self,
+        severity: str,
+        category: str,
+        table: str,
+        column: Optional[str],
+        message: str,
+        details: Optional[Dict] = None,
+    ):
+        """Add an issue to the list."""
+        if self.strict and severity == "warning":
+            severity = "error"
+
+        self.issues.append(QualityIssue(
+            severity=severity,
+            category=category,
+            table=table,
+            column=column,
+            message=message,
+            details=details or {},
+        ))
+
+    def check_distribution_plausibility(
+        self,
+        df: "pd.DataFrame",
+        table_name: str,
+    ) -> None:
+        """
+        Check if numeric distributions are plausible for their domains.
+
+        Args:
+            df: DataFrame to check
+            table_name: Name of the table
+        """
+        import pandas as pd
+        import numpy as np
+
+        for col in df.columns:
+            col_lower = col.lower()
+
+            # Check against plausibility rules
+            for pattern, (min_val, max_val, description) in self.PLAUSIBILITY_RULES.items():
+                if pattern in col_lower:
+                    if pd.api.types.is_numeric_dtype(df[col]):
+                        actual_min = df[col].min()
+                        actual_max = df[col].max()
+
+                        if actual_min < min_val:
+                            self._add_issue(
+                                "warning", "distribution", table_name, col,
+                                f"{description} column '{col}' has min {actual_min} < expected {min_val}",
+                                {"actual_min": actual_min, "expected_min": min_val}
+                            )
+
+                        if actual_max > max_val:
+                            self._add_issue(
+                                "warning", "distribution", table_name, col,
+                                f"{description} column '{col}' has max {actual_max} > expected {max_val}",
+                                {"actual_max": actual_max, "expected_max": max_val}
+                            )
+                    break
+
+            # Check for all-null columns
+            if df[col].isna().all():
+                self._add_issue(
+                    "error", "distribution", table_name, col,
+                    f"Column '{col}' is entirely NULL",
+                )
+
+            # Check for zero variance (all same value)
+            if pd.api.types.is_numeric_dtype(df[col]) and df[col].std() == 0:
+                self._add_issue(
+                    "warning", "distribution", table_name, col,
+                    f"Column '{col}' has zero variance (all values identical)",
+                    {"value": df[col].iloc[0]}
+                )
+
+    def check_referential_integrity(
+        self,
+        tables: Dict[str, "pd.DataFrame"],
+        relationships: List[Any],
+    ) -> None:
+        """
+        Verify all foreign key references are valid.
+
+        Args:
+            tables: Dict of table_name -> DataFrame
+            relationships: List of Relationship objects
+        """
+        for rel in relationships:
+            parent_table = rel.parent_table
+            child_table = rel.child_table
+            parent_key = rel.parent_key
+            child_key = rel.child_key
+
+            if parent_table not in tables:
+                self._add_issue(
+                    "error", "integrity", child_table, child_key,
+                    f"Parent table '{parent_table}' not found for FK '{child_key}'",
+                )
+                continue
+
+            if child_table not in tables:
+                continue  # Child table might not exist yet
+
+            parent_df = tables[parent_table]
+            child_df = tables[child_table]
+
+            if parent_key not in parent_df.columns:
+                self._add_issue(
+                    "error", "integrity", parent_table, parent_key,
+                    f"Parent key '{parent_key}' not found in table '{parent_table}'",
+                )
+                continue
+
+            if child_key not in child_df.columns:
+                self._add_issue(
+                    "error", "integrity", child_table, child_key,
+                    f"Child key '{child_key}' not found in table '{child_table}'",
+                )
+                continue
+
+            # Check for orphaned records
+            parent_ids = set(parent_df[parent_key].dropna().unique())
+            child_ids = set(child_df[child_key].dropna().unique())
+            orphans = child_ids - parent_ids
+
+            if orphans:
+                orphan_pct = len(orphans) / len(child_ids) * 100
+                self._add_issue(
+                    "error" if orphan_pct > 1 else "warning",
+                    "integrity", child_table, child_key,
+                    f"{len(orphans)} orphaned FK values ({orphan_pct:.1f}%) in '{child_key}' -> '{parent_table}.{parent_key}'",
+                    {"orphan_count": len(orphans), "orphan_pct": orphan_pct}
+                )
+
+    def check_temporal_consistency(
+        self,
+        df: "pd.DataFrame",
+        table_name: str,
+    ) -> None:
+        """
+        Ensure temporal columns are consistent.
+
+        Checks:
+        - created_at < updated_at
+        - start_date < end_date
+        - birth_date in past
+        """
+        import pandas as pd
+
+        date_cols = [c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])]
+
+        # Check created < updated
+        if "created_at" in date_cols and "updated_at" in date_cols:
+            violations = (df["created_at"] > df["updated_at"]).sum()
+            if violations > 0:
+                self._add_issue(
+                    "error", "temporal", table_name, "created_at",
+                    f"{violations} rows have created_at > updated_at",
+                    {"violation_count": violations}
+                )
+
+        # Check start < end
+        if "start_date" in date_cols and "end_date" in date_cols:
+            violations = (df["start_date"] > df["end_date"]).sum()
+            if violations > 0:
+                self._add_issue(
+                    "error", "temporal", table_name, "start_date",
+                    f"{violations} rows have start_date > end_date",
+                    {"violation_count": violations}
+                )
+
+        # Check birth_date is in past
+        if "birth_date" in date_cols or "date_of_birth" in date_cols:
+            col = "birth_date" if "birth_date" in date_cols else "date_of_birth"
+            future_births = (df[col] > pd.Timestamp.now()).sum()
+            if future_births > 0:
+                self._add_issue(
+                    "error", "temporal", table_name, col,
+                    f"{future_births} rows have birth_date in the future",
+                    {"violation_count": future_births}
+                )
+
+    def check_all(
+        self,
+        tables: Dict[str, "pd.DataFrame"],
+        relationships: Optional[List[Any]] = None,
+        schema: Optional[Any] = None,
+    ) -> QualityReport:
+        """
+        Run all quality checks and generate a report.
+
+        Args:
+            tables: Dict of table_name -> DataFrame
+            relationships: Optional list of Relationship objects
+            schema: Optional SchemaConfig for additional checks
+
+        Returns:
+            QualityReport with score and issues
+        """
+        self.issues = []  # Reset
+
+        # Check each table
+        for table_name, df in tables.items():
+            self.check_distribution_plausibility(df, table_name)
+            self.check_temporal_consistency(df, table_name)
+
+        # Check referential integrity
+        if relationships:
+            self.check_referential_integrity(tables, relationships)
+
+        # Calculate score
+        base_score = 100
+        for issue in self.issues:
+            if issue.severity == "error":
+                base_score -= 10
+            elif issue.severity == "warning":
+                base_score -= 3
+            else:
+                base_score -= 1
+
+        score = max(0, min(100, base_score))
+
+        # Gather stats
+        stats = {
+            "tables_checked": len(tables),
+            "total_rows": sum(len(df) for df in tables.values()),
+            "total_columns": sum(len(df.columns) for df in tables.values()),
+            "error_count": sum(1 for i in self.issues if i.severity == "error"),
+            "warning_count": sum(1 for i in self.issues if i.severity == "warning"),
+        }
+
+        return QualityReport(
+            score=score,
+            issues=self.issues.copy(),
+            stats=stats,
+        )
+
+
+def check_quality(tables: Dict[str, "pd.DataFrame"], **kwargs) -> QualityReport:
+    """Convenience function for quick quality checks."""
+    checker = DataQualityChecker()
+    return checker.check_all(tables, **kwargs)
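Likewise, a hedged sketch of the new quality checker in use. The toy `users` table is invented for illustration; the calls match the module above, where each error costs 10 points and each warning 3:

```python
import pandas as pd
from misata.quality import check_quality

tables = {
    "users": pd.DataFrame({
        "user_id": [1, 2, 3],
        "age": [25, 40, 150],  # 150 violates the "age" plausibility rule (0-120)
    }),
}

report = check_quality(tables)
print(report.summary())  # e.g. "Quality Score: 97.0/100 | Errors: 0 | Warnings: 1"
print(report.passed)     # True: warnings alone do not fail the check
for issue in report.issues:
    print(issue.severity, issue.category, issue.message)
```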
misata/schema.py
CHANGED
@@ -23,7 +23,7 @@ class Column(BaseModel):
     """
 
     name: str
-    type: Literal["int", "float", "date", "categorical", "foreign_key", "text", "boolean"]
+    type: Literal["int", "float", "date", "time", "datetime", "categorical", "foreign_key", "text", "boolean"]
     distribution_params: Dict[str, Any] = Field(default_factory=dict)
     nullable: bool = False
     unique: bool = False
@@ -39,8 +39,13 @@ class Column(BaseModel):
 
         if col_type == "date":
             if "relative_to" not in v:
-
-
+                # Provide sensible defaults if start/end not specified
+                if "start" not in v:
+                    from datetime import datetime, timedelta
+                    v["start"] = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")
+                if "end" not in v:
+                    from datetime import datetime
+                    v["end"] = datetime.now().strftime("%Y-%m-%d")
 
         if col_type in ["int", "float"]:
             if "distribution" not in v:
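Taken together, the two hunks mean a `Column` can now be typed as `time` or `datetime`, and a `date` column no longer needs explicit bounds. A speculative sketch, assuming the validator shown also runs on the default empty `distribution_params`:

```python
from misata.schema import Column

# "time" and "datetime" are now accepted by the type Literal.
logged_at = Column(name="logged_at", type="datetime")

# A "date" column without "start"/"end" now gets defaults spanning the past
# year to today, instead of relying on the caller to supply them.
signup = Column(name="signup_date", type="date")
print(signup.distribution_params)  # {"start": "<one year ago>", "end": "<today>"}
```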