misata 0.1.0b0__py3-none-any.whl → 0.3.0b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
misata/profiles.py ADDED
@@ -0,0 +1,332 @@
+ """
+ Distribution Profiles for Realistic Data Generation.
+
+ Pre-configured distribution parameters that match real-world patterns
+ for common data types like age, salary, prices, etc.
+ """
+
+ from typing import Any, Dict, List, Optional, Union
+ import numpy as np
+
+
+ class DistributionProfile:
+     """A named distribution configuration for realistic generation.
+
+     Example:
+         profile = DistributionProfile(
+             name="age",
+             distribution="mixture",
+             params={
+                 "components": [
+                     {"mean": 35, "std": 12, "weight": 0.6},  # Working age
+                     {"mean": 70, "std": 8, "weight": 0.2},   # Retirees
+                     {"mean": 12, "std": 4, "weight": 0.2},   # Children
+                 ]
+             }
+         )
+         values = profile.generate(1000)
+     """
+
+     def __init__(
+         self,
+         name: str,
+         distribution: str,
+         params: Dict[str, Any],
+         min_value: Optional[float] = None,
+         max_value: Optional[float] = None,
+         decimals: Optional[int] = None,
+     ):
+         self.name = name
+         self.distribution = distribution
+         self.params = params
+         self.min_value = min_value
+         self.max_value = max_value
+         self.decimals = decimals
+
+     def generate(
+         self,
+         size: int,
+         rng: Optional[np.random.Generator] = None
+     ) -> np.ndarray:
+         """Generate values according to this profile."""
+         if rng is None:
+             rng = np.random.default_rng()
+
+         if self.distribution == "normal":
+             mean = self.params.get("mean", 50)
+             std = self.params.get("std", 10)
+             values = rng.normal(mean, std, size)
+
+         elif self.distribution == "lognormal":
+             mean = self.params.get("mean", 0)
+             sigma = self.params.get("sigma", 1)
+             values = rng.lognormal(mean, sigma, size)
+
+         elif self.distribution == "exponential":
+             scale = self.params.get("scale", 1.0)
+             values = rng.exponential(scale, size)
+
+         elif self.distribution == "pareto":
+             alpha = self.params.get("alpha", 2.0)
+             min_val = self.params.get("min", 1.0)
+             values = (rng.pareto(alpha, size) + 1) * min_val
+
+         elif self.distribution == "beta":
+             a = self.params.get("a", 2)
+             b = self.params.get("b", 5)
+             scale = self.params.get("scale", 1.0)
+             values = rng.beta(a, b, size) * scale
+
+         elif self.distribution == "mixture":
+             # Gaussian mixture model
+             components = self.params.get("components", [])
+             if not components:
+                 values = rng.normal(0, 1, size)
+             else:
+                 weights = np.array([c.get("weight", 1) for c in components])
+                 weights = weights / weights.sum()
+
+                 # Sample component indices
+                 component_indices = rng.choice(
+                     len(components), size=size, p=weights
+                 )
+
+                 values = np.zeros(size)
+                 for i, comp in enumerate(components):
+                     mask = component_indices == i
+                     n = mask.sum()
+                     if n > 0:
+                         values[mask] = rng.normal(
+                             comp.get("mean", 0),
+                             comp.get("std", 1),
+                             n
+                         )
+
+         elif self.distribution == "zipf":
+             # Zipf distribution for long-tail data
+             a = self.params.get("alpha", 2.0)
+             values = rng.zipf(a, size).astype(float)
+
+         elif self.distribution == "uniform":
+             low = self.params.get("min", 0)
+             high = self.params.get("max", 100)
+             values = rng.uniform(low, high, size)
+
+         else:
+             # Default to uniform
+             values = rng.uniform(0, 100, size)
+
+         # Apply constraints
+         if self.min_value is not None:
+             values = np.maximum(values, self.min_value)
+         if self.max_value is not None:
+             values = np.minimum(values, self.max_value)
+         if self.decimals is not None:
+             values = np.round(values, self.decimals)
+
+         return values
+
+
+ # ============ Pre-built Profiles ============
+
+ PROFILES: Dict[str, DistributionProfile] = {}
+
+
+ def _register_profile(profile: DistributionProfile) -> None:
+     """Register a profile by name."""
+     PROFILES[profile.name] = profile
+
+
+ # Age distributions
+ _register_profile(DistributionProfile(
+     name="age_adult",
+     distribution="mixture",
+     params={
+         "components": [
+             {"mean": 28, "std": 6, "weight": 0.3},    # Young adults
+             {"mean": 42, "std": 10, "weight": 0.45},  # Middle age
+             {"mean": 62, "std": 8, "weight": 0.25},   # Older adults
+         ]
+     },
+     min_value=18,
+     max_value=100,
+     decimals=0,
+ ))
+
+ _register_profile(DistributionProfile(
+     name="age_population",
+     distribution="mixture",
+     params={
+         "components": [
+             {"mean": 8, "std": 4, "weight": 0.15},    # Children
+             {"mean": 25, "std": 8, "weight": 0.25},   # Young adults
+             {"mean": 42, "std": 12, "weight": 0.35},  # Middle age
+             {"mean": 68, "std": 10, "weight": 0.25},  # Seniors
+         ]
+     },
+     min_value=0,
+     max_value=105,
+     decimals=0,
+ ))
+
+ # Salary distributions
+ _register_profile(DistributionProfile(
+     name="salary_usd",
+     distribution="lognormal",
+     params={"mean": 11.0, "sigma": 0.5},  # Log of ~$60k median
+     min_value=25000,
+     max_value=500000,
+     decimals=0,
+ ))
+
+ _register_profile(DistributionProfile(
+     name="salary_tech",
+     distribution="mixture",
+     params={
+         "components": [
+             {"mean": 75000, "std": 15000, "weight": 0.2},   # Junior
+             {"mean": 120000, "std": 25000, "weight": 0.4},  # Mid
+             {"mean": 180000, "std": 40000, "weight": 0.3},  # Senior
+             {"mean": 280000, "std": 60000, "weight": 0.1},  # Staff+
+         ]
+     },
+     min_value=50000,
+     max_value=600000,
+     decimals=0,
+ ))
+
+ # Price distributions
+ _register_profile(DistributionProfile(
+     name="price_retail",
+     distribution="lognormal",
+     params={"mean": 3.5, "sigma": 1.2},  # ~$30 median
+     min_value=0.99,
+     max_value=10000,
+     decimals=2,
+ ))
+
+ _register_profile(DistributionProfile(
+     name="price_saas",
+     distribution="mixture",
+     params={
+         "components": [
+             {"mean": 15, "std": 5, "weight": 0.3},      # Basic tier
+             {"mean": 49, "std": 15, "weight": 0.4},     # Pro tier
+             {"mean": 199, "std": 50, "weight": 0.25},   # Enterprise
+             {"mean": 999, "std": 200, "weight": 0.05},  # Custom
+         ]
+     },
+     min_value=0,
+     max_value=5000,
+     decimals=0,
+ ))
+
+ # Transaction amounts
+ _register_profile(DistributionProfile(
+     name="transaction_amount",
+     distribution="pareto",
+     params={"alpha": 2.5, "min": 10},
+     min_value=1,
+     max_value=100000,
+     decimals=2,
+ ))
+
+ # Counts / quantities
+ _register_profile(DistributionProfile(
+     name="order_quantity",
+     distribution="zipf",
+     params={"alpha": 2.0},
+     min_value=1,
+     max_value=100,
+     decimals=0,
+ ))
+
+ # Time-related
+ _register_profile(DistributionProfile(
+     name="session_duration_seconds",
+     distribution="lognormal",
+     params={"mean": 5.5, "sigma": 1.5},  # ~4 min median
+     min_value=1,
+     max_value=7200,  # 2 hours max
+     decimals=0,
+ ))
+
+ # Ratings and scores
+ _register_profile(DistributionProfile(
+     name="rating_5star",
+     distribution="beta",
+     params={"a": 5, "b": 2, "scale": 5},  # Skewed towards higher ratings
+     min_value=1,
+     max_value=5,
+     decimals=1,
+ ))
+
+ _register_profile(DistributionProfile(
+     name="nps_score",
+     distribution="mixture",
+     params={
+         "components": [
+             {"mean": 3, "std": 2, "weight": 0.15},   # Detractors
+             {"mean": 7, "std": 1, "weight": 0.25},   # Passives
+             {"mean": 9, "std": 0.8, "weight": 0.6},  # Promoters
+         ]
+     },
+     min_value=0,
+     max_value=10,
+     decimals=0,
+ ))
+
+ # Percentages
+ _register_profile(DistributionProfile(
+     name="conversion_rate",
+     distribution="beta",
+     params={"a": 2, "b": 50, "scale": 100},  # Low conversion (1-5%)
+     min_value=0,
+     max_value=100,
+     decimals=2,
+ ))
+
+ _register_profile(DistributionProfile(
+     name="churn_rate",
+     distribution="beta",
+     params={"a": 1.5, "b": 30, "scale": 100},  # ~5% typical
+     min_value=0,
+     max_value=100,
+     decimals=2,
+ ))
+
+
+ def get_profile(name: str) -> Optional[DistributionProfile]:
+     """Get a profile by name."""
+     return PROFILES.get(name)
+
+
+ def list_profiles() -> List[str]:
+     """List all available profile names."""
+     return list(PROFILES.keys())
+
+
+ def generate_with_profile(
+     profile_name: str,
+     size: int,
+     rng: Optional[np.random.Generator] = None
+ ) -> np.ndarray:
+     """Generate values using a named profile.
+
+     Args:
+         profile_name: Name of the profile (e.g., "salary_tech")
+         size: Number of values to generate
+         rng: Random number generator
+
+     Returns:
+         Array of generated values
+
+     Raises:
+         ValueError: If profile not found
+     """
+     profile = get_profile(profile_name)
+     if profile is None:
+         available = ", ".join(list_profiles())
+         raise ValueError(f"Unknown profile: {profile_name}. Available: {available}")
+
+     return profile.generate(size, rng)
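
The diff above is the full public surface of the new module. As a quick illustration (not part of the package diff itself), the added API could be exercised like this; the seed and the chosen profile names are arbitrary:

import numpy as np
from misata.profiles import PROFILES, generate_with_profile, list_profiles

rng = np.random.default_rng(42)  # fixed seed so the sketch is reproducible

print(list_profiles())  # ['age_adult', 'age_population', 'salary_usd', ...]

# Named-profile path: four-component salary mixture, clamped to [50000, 600000]
salaries = generate_with_profile("salary_tech", size=1_000, rng=rng)
assert 50000 <= salaries.min() and salaries.max() <= 600000

# Direct path: full-population age mixture, rounded to whole years (decimals=0)
ages = PROFILES["age_population"].generate(1_000, rng=rng)
assert 0 <= ages.min() and ages.max() <= 105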
misata/quality.py ADDED
@@ -0,0 +1,329 @@
+ """
+ Data Quality Checker for Synthetic Data Validation.
+
+ This module validates generated synthetic data for:
+ - Distribution plausibility
+ - Referential integrity
+ - Temporal consistency
+ - Domain-specific rules
+ """
+
+ from typing import Dict, List, Any, Optional, Tuple
+ from dataclasses import dataclass, field
+ import warnings
+
+
+ @dataclass
+ class QualityIssue:
+     """Represents a single data quality issue."""
+     severity: str  # "error", "warning", "info"
+     category: str  # "distribution", "integrity", "temporal", "domain"
+     table: str
+     column: Optional[str]
+     message: str
+     details: Dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class QualityReport:
+     """Complete quality report for generated data."""
+     score: float  # 0-100
+     issues: List[QualityIssue]
+     stats: Dict[str, Any]
+
+     @property
+     def passed(self) -> bool:
+         """Returns True if no errors (warnings OK)."""
+         return not any(i.severity == "error" for i in self.issues)
+
+     def summary(self) -> str:
+         """Human-readable summary."""
+         errors = sum(1 for i in self.issues if i.severity == "error")
+         warnings = sum(1 for i in self.issues if i.severity == "warning")
+         return f"Quality Score: {self.score:.1f}/100 | Errors: {errors} | Warnings: {warnings}"
+
+
+ class DataQualityChecker:
+     """
+     Validate generated synthetic data for realism and correctness.
+
+     Usage:
+         checker = DataQualityChecker()
+         report = checker.check_all(tables, relationships, schema)
+
+         if not report.passed:
+             print("Issues found:", report.issues)
+     """
+
+     # Domain-specific plausibility rules
+     PLAUSIBILITY_RULES = {
+         # Column name patterns -> (min, max, description)
+         "age": (0, 120, "Human age"),
+         "price": (0, 1_000_000, "Price"),
+         "quantity": (0, 10_000, "Quantity"),
+         "rating": (1, 5, "Rating"),
+         "percentage": (0, 100, "Percentage"),
+         "year": (1900, 2100, "Year"),
+         "month": (1, 12, "Month"),
+         "day": (1, 31, "Day"),
+         "hour": (0, 23, "Hour"),
+         "minute": (0, 59, "Minute"),
+         "score": (0, 100, "Score"),
+         "count": (0, 1_000_000, "Count"),
+         "duration": (0, 10_000, "Duration"),
+     }
+
+     def __init__(self, strict: bool = False):
+         """
+         Initialize the quality checker.
+
+         Args:
+             strict: If True, warnings become errors
+         """
+         self.strict = strict
+         self.issues: List[QualityIssue] = []
+
+     def _add_issue(
+         self,
+         severity: str,
+         category: str,
+         table: str,
+         column: Optional[str],
+         message: str,
+         details: Optional[Dict] = None,
+     ):
+         """Add an issue to the list."""
+         if self.strict and severity == "warning":
+             severity = "error"
+
+         self.issues.append(QualityIssue(
+             severity=severity,
+             category=category,
+             table=table,
+             column=column,
+             message=message,
+             details=details or {},
+         ))
+
+     def check_distribution_plausibility(
+         self,
+         df: "pd.DataFrame",
+         table_name: str,
+     ) -> None:
+         """
+         Check if numeric distributions are plausible for their domains.
+
+         Args:
+             df: DataFrame to check
+             table_name: Name of the table
+         """
+         import pandas as pd
+         import numpy as np
+
+         for col in df.columns:
+             col_lower = col.lower()
+
+             # Check against plausibility rules
+             for pattern, (min_val, max_val, description) in self.PLAUSIBILITY_RULES.items():
+                 if pattern in col_lower:
+                     if pd.api.types.is_numeric_dtype(df[col]):
+                         actual_min = df[col].min()
+                         actual_max = df[col].max()
+
+                         if actual_min < min_val:
+                             self._add_issue(
+                                 "warning", "distribution", table_name, col,
+                                 f"{description} column '{col}' has min {actual_min} < expected {min_val}",
+                                 {"actual_min": actual_min, "expected_min": min_val}
+                             )
+
+                         if actual_max > max_val:
+                             self._add_issue(
+                                 "warning", "distribution", table_name, col,
+                                 f"{description} column '{col}' has max {actual_max} > expected {max_val}",
+                                 {"actual_max": actual_max, "expected_max": max_val}
+                             )
+                     break
+
+             # Check for all-null columns
+             if df[col].isna().all():
+                 self._add_issue(
+                     "error", "distribution", table_name, col,
+                     f"Column '{col}' is entirely NULL",
+                 )
+
+             # Check for zero variance (all same value)
+             if pd.api.types.is_numeric_dtype(df[col]) and df[col].std() == 0:
+                 self._add_issue(
+                     "warning", "distribution", table_name, col,
+                     f"Column '{col}' has zero variance (all values identical)",
+                     {"value": df[col].iloc[0]}
+                 )
+
+     def check_referential_integrity(
+         self,
+         tables: Dict[str, "pd.DataFrame"],
+         relationships: List[Any],
+     ) -> None:
+         """
+         Verify all foreign key references are valid.
+
+         Args:
+             tables: Dict of table_name -> DataFrame
+             relationships: List of Relationship objects
+         """
+         for rel in relationships:
+             parent_table = rel.parent_table
+             child_table = rel.child_table
+             parent_key = rel.parent_key
+             child_key = rel.child_key
+
+             if parent_table not in tables:
+                 self._add_issue(
+                     "error", "integrity", child_table, child_key,
+                     f"Parent table '{parent_table}' not found for FK '{child_key}'",
+                 )
+                 continue
+
+             if child_table not in tables:
+                 continue  # Child table might not exist yet
+
+             parent_df = tables[parent_table]
+             child_df = tables[child_table]
+
+             if parent_key not in parent_df.columns:
+                 self._add_issue(
+                     "error", "integrity", parent_table, parent_key,
+                     f"Parent key '{parent_key}' not found in table '{parent_table}'",
+                 )
+                 continue
+
+             if child_key not in child_df.columns:
+                 self._add_issue(
+                     "error", "integrity", child_table, child_key,
+                     f"Child key '{child_key}' not found in table '{child_table}'",
+                 )
+                 continue
+
+             # Check for orphaned records
+             parent_ids = set(parent_df[parent_key].dropna().unique())
+             child_ids = set(child_df[child_key].dropna().unique())
+             orphans = child_ids - parent_ids
+
+             if orphans:
+                 orphan_pct = len(orphans) / len(child_ids) * 100
+                 self._add_issue(
+                     "error" if orphan_pct > 1 else "warning",
+                     "integrity", child_table, child_key,
+                     f"{len(orphans)} orphaned FK values ({orphan_pct:.1f}%) in '{child_key}' -> '{parent_table}.{parent_key}'",
+                     {"orphan_count": len(orphans), "orphan_pct": orphan_pct}
+                 )
+
+     def check_temporal_consistency(
+         self,
+         df: "pd.DataFrame",
+         table_name: str,
+     ) -> None:
+         """
+         Ensure temporal columns are consistent.
+
+         Checks:
+         - created_at < updated_at
+         - start_date < end_date
+         - birth_date in past
+         """
+         import pandas as pd
+
+         date_cols = [c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])]
+
+         # Check created < updated
+         if "created_at" in date_cols and "updated_at" in date_cols:
+             violations = (df["created_at"] > df["updated_at"]).sum()
+             if violations > 0:
+                 self._add_issue(
+                     "error", "temporal", table_name, "created_at",
+                     f"{violations} rows have created_at > updated_at",
+                     {"violation_count": violations}
+                 )
+
+         # Check start < end
+         if "start_date" in date_cols and "end_date" in date_cols:
+             violations = (df["start_date"] > df["end_date"]).sum()
+             if violations > 0:
+                 self._add_issue(
+                     "error", "temporal", table_name, "start_date",
+                     f"{violations} rows have start_date > end_date",
+                     {"violation_count": violations}
+                 )
+
+         # Check birth_date is in past
+         if "birth_date" in date_cols or "date_of_birth" in date_cols:
+             col = "birth_date" if "birth_date" in date_cols else "date_of_birth"
+             future_births = (df[col] > pd.Timestamp.now()).sum()
+             if future_births > 0:
+                 self._add_issue(
+                     "error", "temporal", table_name, col,
+                     f"{future_births} rows have birth_date in the future",
+                     {"violation_count": future_births}
+                 )
+
+     def check_all(
+         self,
+         tables: Dict[str, "pd.DataFrame"],
+         relationships: Optional[List[Any]] = None,
+         schema: Optional[Any] = None,
+     ) -> QualityReport:
+         """
+         Run all quality checks and generate a report.
+
+         Args:
+             tables: Dict of table_name -> DataFrame
+             relationships: Optional list of Relationship objects
+             schema: Optional SchemaConfig for additional checks
+
+         Returns:
+             QualityReport with score and issues
+         """
+         self.issues = []  # Reset
+
+         # Check each table
+         for table_name, df in tables.items():
+             self.check_distribution_plausibility(df, table_name)
+             self.check_temporal_consistency(df, table_name)
+
+         # Check referential integrity
+         if relationships:
+             self.check_referential_integrity(tables, relationships)
+
+         # Calculate score
+         base_score = 100
+         for issue in self.issues:
+             if issue.severity == "error":
+                 base_score -= 10
+             elif issue.severity == "warning":
+                 base_score -= 3
+             else:
+                 base_score -= 1
+
+         score = max(0, min(100, base_score))
+
+         # Gather stats
+         stats = {
+             "tables_checked": len(tables),
+             "total_rows": sum(len(df) for df in tables.values()),
+             "total_columns": sum(len(df.columns) for df in tables.values()),
+             "error_count": sum(1 for i in self.issues if i.severity == "error"),
+             "warning_count": sum(1 for i in self.issues if i.severity == "warning"),
+         }
+
+         return QualityReport(
+             score=score,
+             issues=self.issues.copy(),
+             stats=stats,
+         )
+
+
+ def check_quality(tables: Dict[str, "pd.DataFrame"], **kwargs) -> QualityReport:
+     """Convenience function for quick quality checks."""
+     checker = DataQualityChecker()
+     return checker.check_all(tables, **kwargs)
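
Again as an illustration outside the diff itself, here is a minimal, invented exercise of the new checker. The two DataFrames are made up, and SimpleNamespace merely stands in for misata's Relationship object, since check_referential_integrity only reads the four key attributes:

from types import SimpleNamespace
import pandas as pd
from misata.quality import check_quality

tables = {
    "users": pd.DataFrame({"user_id": [1, 2, 3], "age": [25, 40, 150]}),  # 150 breaches the (0, 120) "age" rule
    "orders": pd.DataFrame({"order_id": [10, 11], "user_id": [1, 9]}),    # user 9 does not exist -> orphaned FK
}
fk = SimpleNamespace(parent_table="users", child_table="orders",
                     parent_key="user_id", child_key="user_id")

report = check_quality(tables, relationships=[fk])
print(report.summary())  # should print "Quality Score: 87.0/100 | Errors: 1 | Warnings: 1"
print(report.passed)     # False: the 50% orphan rate is scored as an error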
misata/schema.py CHANGED
@@ -23,7 +23,7 @@ class Column(BaseModel):
      """
 
      name: str
-     type: Literal["int", "float", "date", "categorical", "foreign_key", "text", "boolean"]
+     type: Literal["int", "float", "date", "time", "datetime", "categorical", "foreign_key", "text", "boolean"]
      distribution_params: Dict[str, Any] = Field(default_factory=dict)
      nullable: bool = False
      unique: bool = False
@@ -39,8 +39,13 @@ class Column(BaseModel):
 
          if col_type == "date":
              if "relative_to" not in v:
-                 if "start" not in v or "end" not in v:
-                     raise ValueError("Date columns must have 'start' and 'end' OR 'relative_to' in distribution_params")
+                 # Provide sensible defaults if start/end not specified
+                 if "start" not in v:
+                     from datetime import datetime, timedelta
+                     v["start"] = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")
+                 if "end" not in v:
+                     from datetime import datetime
+                     v["end"] = datetime.now().strftime("%Y-%m-%d")
 
          if col_type in ["int", "float"]:
              if "distribution" not in v:
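
The net effect of the schema.py change, sketched under the assumption that the validator runs when a Column is constructed; only Column's import path and the fields shown in the hunks are taken from the diff, everything else about the model is assumed unchanged:

from misata.schema import Column

# 0.1.0b0 raised ValueError for a "date" column without start/end or relative_to;
# 0.3.0b0 fills in defaults instead: start = one year ago, end = today.
signup = Column(name="signup_date", type="date")
print(signup.distribution_params)  # {'start': '<one year ago>', 'end': '<today>'}

# "time" and "datetime" are newly accepted type literals
# (assuming they need no extra distribution_params; the hunk ends before any such check).
ts = Column(name="created_at", type="datetime")

Note that the second hunk converts a hard validation error into a silent default, so schemas that previously failed fast on under-specified date columns will now pass.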