truthound-dashboard 1.3.1__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound_dashboard/api/alerts.py +258 -0
- truthound_dashboard/api/anomaly.py +1302 -0
- truthound_dashboard/api/cross_alerts.py +352 -0
- truthound_dashboard/api/deps.py +143 -0
- truthound_dashboard/api/drift_monitor.py +540 -0
- truthound_dashboard/api/lineage.py +1151 -0
- truthound_dashboard/api/maintenance.py +363 -0
- truthound_dashboard/api/middleware.py +373 -1
- truthound_dashboard/api/model_monitoring.py +805 -0
- truthound_dashboard/api/notifications_advanced.py +2452 -0
- truthound_dashboard/api/plugins.py +2096 -0
- truthound_dashboard/api/profile.py +211 -14
- truthound_dashboard/api/reports.py +853 -0
- truthound_dashboard/api/router.py +147 -0
- truthound_dashboard/api/rule_suggestions.py +310 -0
- truthound_dashboard/api/schema_evolution.py +231 -0
- truthound_dashboard/api/sources.py +47 -3
- truthound_dashboard/api/triggers.py +190 -0
- truthound_dashboard/api/validations.py +13 -0
- truthound_dashboard/api/validators.py +333 -4
- truthound_dashboard/api/versioning.py +309 -0
- truthound_dashboard/api/websocket.py +301 -0
- truthound_dashboard/core/__init__.py +27 -0
- truthound_dashboard/core/anomaly.py +1395 -0
- truthound_dashboard/core/anomaly_explainer.py +633 -0
- truthound_dashboard/core/cache.py +206 -0
- truthound_dashboard/core/cached_services.py +422 -0
- truthound_dashboard/core/charts.py +352 -0
- truthound_dashboard/core/connections.py +1069 -42
- truthound_dashboard/core/cross_alerts.py +837 -0
- truthound_dashboard/core/drift_monitor.py +1477 -0
- truthound_dashboard/core/drift_sampling.py +669 -0
- truthound_dashboard/core/i18n/__init__.py +42 -0
- truthound_dashboard/core/i18n/detector.py +173 -0
- truthound_dashboard/core/i18n/messages.py +564 -0
- truthound_dashboard/core/lineage.py +971 -0
- truthound_dashboard/core/maintenance.py +443 -5
- truthound_dashboard/core/model_monitoring.py +1043 -0
- truthound_dashboard/core/notifications/channels.py +1020 -1
- truthound_dashboard/core/notifications/deduplication/__init__.py +143 -0
- truthound_dashboard/core/notifications/deduplication/policies.py +274 -0
- truthound_dashboard/core/notifications/deduplication/service.py +400 -0
- truthound_dashboard/core/notifications/deduplication/stores.py +2365 -0
- truthound_dashboard/core/notifications/deduplication/strategies.py +422 -0
- truthound_dashboard/core/notifications/dispatcher.py +43 -0
- truthound_dashboard/core/notifications/escalation/__init__.py +149 -0
- truthound_dashboard/core/notifications/escalation/backends.py +1384 -0
- truthound_dashboard/core/notifications/escalation/engine.py +429 -0
- truthound_dashboard/core/notifications/escalation/models.py +336 -0
- truthound_dashboard/core/notifications/escalation/scheduler.py +1187 -0
- truthound_dashboard/core/notifications/escalation/state_machine.py +330 -0
- truthound_dashboard/core/notifications/escalation/stores.py +2896 -0
- truthound_dashboard/core/notifications/events.py +49 -0
- truthound_dashboard/core/notifications/metrics/__init__.py +115 -0
- truthound_dashboard/core/notifications/metrics/base.py +528 -0
- truthound_dashboard/core/notifications/metrics/collectors.py +583 -0
- truthound_dashboard/core/notifications/routing/__init__.py +169 -0
- truthound_dashboard/core/notifications/routing/combinators.py +184 -0
- truthound_dashboard/core/notifications/routing/config.py +375 -0
- truthound_dashboard/core/notifications/routing/config_parser.py +867 -0
- truthound_dashboard/core/notifications/routing/engine.py +382 -0
- truthound_dashboard/core/notifications/routing/expression_engine.py +1269 -0
- truthound_dashboard/core/notifications/routing/jinja2_engine.py +774 -0
- truthound_dashboard/core/notifications/routing/rules.py +625 -0
- truthound_dashboard/core/notifications/routing/validator.py +678 -0
- truthound_dashboard/core/notifications/service.py +2 -0
- truthound_dashboard/core/notifications/stats_aggregator.py +850 -0
- truthound_dashboard/core/notifications/throttling/__init__.py +83 -0
- truthound_dashboard/core/notifications/throttling/builder.py +311 -0
- truthound_dashboard/core/notifications/throttling/stores.py +1859 -0
- truthound_dashboard/core/notifications/throttling/throttlers.py +633 -0
- truthound_dashboard/core/openlineage.py +1028 -0
- truthound_dashboard/core/plugins/__init__.py +39 -0
- truthound_dashboard/core/plugins/docs/__init__.py +39 -0
- truthound_dashboard/core/plugins/docs/extractor.py +703 -0
- truthound_dashboard/core/plugins/docs/renderers.py +804 -0
- truthound_dashboard/core/plugins/hooks/__init__.py +63 -0
- truthound_dashboard/core/plugins/hooks/decorators.py +367 -0
- truthound_dashboard/core/plugins/hooks/manager.py +403 -0
- truthound_dashboard/core/plugins/hooks/protocols.py +265 -0
- truthound_dashboard/core/plugins/lifecycle/__init__.py +41 -0
- truthound_dashboard/core/plugins/lifecycle/hot_reload.py +584 -0
- truthound_dashboard/core/plugins/lifecycle/machine.py +419 -0
- truthound_dashboard/core/plugins/lifecycle/states.py +266 -0
- truthound_dashboard/core/plugins/loader.py +504 -0
- truthound_dashboard/core/plugins/registry.py +810 -0
- truthound_dashboard/core/plugins/reporter_executor.py +588 -0
- truthound_dashboard/core/plugins/sandbox/__init__.py +59 -0
- truthound_dashboard/core/plugins/sandbox/code_validator.py +243 -0
- truthound_dashboard/core/plugins/sandbox/engines.py +770 -0
- truthound_dashboard/core/plugins/sandbox/protocols.py +194 -0
- truthound_dashboard/core/plugins/sandbox.py +617 -0
- truthound_dashboard/core/plugins/security/__init__.py +68 -0
- truthound_dashboard/core/plugins/security/analyzer.py +535 -0
- truthound_dashboard/core/plugins/security/policies.py +311 -0
- truthound_dashboard/core/plugins/security/protocols.py +296 -0
- truthound_dashboard/core/plugins/security/signing.py +842 -0
- truthound_dashboard/core/plugins/security.py +446 -0
- truthound_dashboard/core/plugins/validator_executor.py +401 -0
- truthound_dashboard/core/plugins/versioning/__init__.py +51 -0
- truthound_dashboard/core/plugins/versioning/constraints.py +377 -0
- truthound_dashboard/core/plugins/versioning/dependencies.py +541 -0
- truthound_dashboard/core/plugins/versioning/semver.py +266 -0
- truthound_dashboard/core/profile_comparison.py +601 -0
- truthound_dashboard/core/report_history.py +570 -0
- truthound_dashboard/core/reporters/__init__.py +57 -0
- truthound_dashboard/core/reporters/base.py +296 -0
- truthound_dashboard/core/reporters/csv_reporter.py +155 -0
- truthound_dashboard/core/reporters/html_reporter.py +598 -0
- truthound_dashboard/core/reporters/i18n/__init__.py +65 -0
- truthound_dashboard/core/reporters/i18n/base.py +494 -0
- truthound_dashboard/core/reporters/i18n/catalogs.py +930 -0
- truthound_dashboard/core/reporters/json_reporter.py +160 -0
- truthound_dashboard/core/reporters/junit_reporter.py +233 -0
- truthound_dashboard/core/reporters/markdown_reporter.py +207 -0
- truthound_dashboard/core/reporters/pdf_reporter.py +209 -0
- truthound_dashboard/core/reporters/registry.py +272 -0
- truthound_dashboard/core/rule_generator.py +2088 -0
- truthound_dashboard/core/scheduler.py +822 -12
- truthound_dashboard/core/schema_evolution.py +858 -0
- truthound_dashboard/core/services.py +152 -9
- truthound_dashboard/core/statistics.py +718 -0
- truthound_dashboard/core/streaming_anomaly.py +883 -0
- truthound_dashboard/core/triggers/__init__.py +45 -0
- truthound_dashboard/core/triggers/base.py +226 -0
- truthound_dashboard/core/triggers/evaluators.py +609 -0
- truthound_dashboard/core/triggers/factory.py +363 -0
- truthound_dashboard/core/unified_alerts.py +870 -0
- truthound_dashboard/core/validation_limits.py +509 -0
- truthound_dashboard/core/versioning.py +709 -0
- truthound_dashboard/core/websocket/__init__.py +59 -0
- truthound_dashboard/core/websocket/manager.py +512 -0
- truthound_dashboard/core/websocket/messages.py +130 -0
- truthound_dashboard/db/__init__.py +30 -0
- truthound_dashboard/db/models.py +3375 -3
- truthound_dashboard/main.py +22 -0
- truthound_dashboard/schemas/__init__.py +396 -1
- truthound_dashboard/schemas/anomaly.py +1258 -0
- truthound_dashboard/schemas/base.py +4 -0
- truthound_dashboard/schemas/cross_alerts.py +334 -0
- truthound_dashboard/schemas/drift_monitor.py +890 -0
- truthound_dashboard/schemas/lineage.py +428 -0
- truthound_dashboard/schemas/maintenance.py +154 -0
- truthound_dashboard/schemas/model_monitoring.py +374 -0
- truthound_dashboard/schemas/notifications_advanced.py +1363 -0
- truthound_dashboard/schemas/openlineage.py +704 -0
- truthound_dashboard/schemas/plugins.py +1293 -0
- truthound_dashboard/schemas/profile.py +420 -34
- truthound_dashboard/schemas/profile_comparison.py +242 -0
- truthound_dashboard/schemas/reports.py +285 -0
- truthound_dashboard/schemas/rule_suggestion.py +434 -0
- truthound_dashboard/schemas/schema_evolution.py +164 -0
- truthound_dashboard/schemas/source.py +117 -2
- truthound_dashboard/schemas/triggers.py +511 -0
- truthound_dashboard/schemas/unified_alerts.py +223 -0
- truthound_dashboard/schemas/validation.py +25 -1
- truthound_dashboard/schemas/validators/__init__.py +11 -0
- truthound_dashboard/schemas/validators/base.py +151 -0
- truthound_dashboard/schemas/versioning.py +152 -0
- truthound_dashboard/static/index.html +2 -2
- {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/METADATA +147 -23
- truthound_dashboard-1.4.1.dist-info/RECORD +239 -0
- truthound_dashboard/static/assets/index-BZG20KuF.js +0 -586
- truthound_dashboard/static/assets/index-D_HyZ3pb.css +0 -1
- truthound_dashboard/static/assets/unmerged_dictionaries-CtpqQBm0.js +0 -1
- truthound_dashboard-1.3.1.dist-info/RECORD +0 -110
- {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/WHEEL +0 -0
- {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/entry_points.txt +0 -0
- {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/licenses/LICENSE +0 -0
truthound_dashboard/core/rule_generator.py
@@ -0,0 +1,2088 @@
"""Rule generation service.

This module provides functionality for automatically generating
validation rules based on profile data analysis.

Features:
- Multiple strictness levels (loose, medium, strict)
- Preset templates for different use cases
- Category-based filtering
- Multiple export formats (YAML, JSON, Python, TOML)
"""

from __future__ import annotations

import json
import re
from datetime import datetime
from typing import Any

import yaml
from sqlalchemy.ext.asyncio import AsyncSession

from truthound_dashboard.db import Profile, Rule, Schema, Source
from truthound_dashboard.core.services import ProfileRepository, RuleRepository
from truthound_dashboard.schemas.rule_suggestion import (
    ApplyRulesResponse,
    CrossColumnRuleSuggestion,
    CrossColumnRuleType,
    ExportRulesResponse,
    PresetInfo,
    PresetsResponse,
    RuleCategory,
    RuleExportFormat,
    RulePreset,
    RuleSuggestionResponse,
    StrictnessLevel,
    SuggestedRule,
)


# Common email pattern
EMAIL_PATTERN = re.compile(
    r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
)

# Common date patterns
DATE_PATTERNS = [
    r"\d{4}-\d{2}-\d{2}",  # YYYY-MM-DD
    r"\d{2}/\d{2}/\d{4}",  # MM/DD/YYYY
    r"\d{2}-\d{2}-\d{4}",  # DD-MM-YYYY
]


# =============================================================================
# Statistical Confidence Calculation Helpers
# =============================================================================


def calculate_pattern_confidence(
    match_rate: float,
    sample_size: int,
    min_sample: int = 100,
    base_confidence: float = 0.5,
) -> float:
    """Calculate confidence score based on pattern matching and sample size.

    Uses a statistical approach to compute confidence:
    - Higher match rates increase confidence
    - Larger sample sizes increase confidence
    - Small samples (<min_sample) are penalized

    Args:
        match_rate: Rate of pattern matches (0.0 to 1.0).
        sample_size: Number of samples analyzed.
        min_sample: Minimum sample size for full confidence.
        base_confidence: Starting confidence level.

    Returns:
        Confidence score between 0.0 and 1.0.
    """
    if sample_size == 0:
        return base_confidence

    # Sample size factor: penalize small samples
    size_factor = min(1.0, sample_size / min_sample)

    # Match rate contribution (higher is better)
    rate_contribution = match_rate * 0.4

    # Size contribution (larger samples = more reliable)
    size_contribution = size_factor * 0.1

    # Base contribution
    confidence = base_confidence + rate_contribution + size_contribution

    # Clamp to valid range
    return max(0.0, min(1.0, confidence))


def calculate_uniqueness_confidence(
    unique_ratio: float,
    total_count: int,
    cardinality: int,
) -> float:
    """Calculate confidence for uniqueness-based rules.

    Args:
        unique_ratio: Ratio of unique values.
        total_count: Total number of rows.
        cardinality: Number of distinct values.

    Returns:
        Confidence score.
    """
    if total_count == 0:
        return 0.5

    # High uniqueness = likely primary key
    if unique_ratio >= 0.99:
        base = 0.85
    elif unique_ratio >= 0.95:
        base = 0.75
    elif unique_ratio >= 0.8:
        base = 0.65
    else:
        base = 0.5

    # Bonus for larger datasets (more statistically significant)
    size_bonus = min(0.1, total_count / 10000 * 0.1)

    return min(1.0, base + size_bonus)


def calculate_correlation_confidence(
    pattern_strength: str,
    column_count: int = 2,
) -> float:
    """Calculate confidence for correlation-based rules.

    Args:
        pattern_strength: 'strong', 'medium', 'weak'.
        column_count: Number of columns involved.

    Returns:
        Confidence score.
    """
    strength_scores = {
        "strong": 0.85,
        "medium": 0.7,
        "weak": 0.55,
    }
    base = strength_scores.get(pattern_strength, 0.6)

    # Penalize for more columns (harder to maintain relationship)
    column_penalty = max(0, (column_count - 2) * 0.05)

    return max(0.5, base - column_penalty)


def extract_sample_violations(
    profile_data: dict[str, Any],
    columns: list[str],
    rule_type: str,
    max_samples: int = 5,
) -> list[dict[str, Any]]:
    """Extract sample violations from profile data.

    This function attempts to find potential violations based on the
    profile statistics. In production, this would query actual data.

    Args:
        profile_data: Profile statistics for columns.
        columns: Column names involved in the rule.
        rule_type: Type of cross-column rule.
        max_samples: Maximum number of sample violations to return.

    Returns:
        List of sample violation records.
    """
    violations: list[dict[str, Any]] = []

    # Check if profile has outlier or anomaly data
    for col in columns:
        col_data = profile_data.get(col, {})

        # Check for outliers that might indicate violations
        outliers = col_data.get("outliers", [])
        if outliers:
            for outlier in outliers[:max_samples]:
                violations.append({
                    "row_index": outlier.get("row", 0),
                    "column": col,
                    "value": outlier.get("value"),
                    "reason": f"Outlier detected in {col}",
                })

        # Check for null/missing values that might cause violations
        null_count = col_data.get("null_count", 0)
        if null_count > 0 and rule_type in ("column_coexistence", "column_dependency"):
            violations.append({
                "row_index": "multiple",
                "column": col,
                "value": None,
                "reason": f"{null_count} null values in {col}",
            })

    return violations[:max_samples]


# =============================================================================
# Preset Definitions
# =============================================================================


PRESET_DEFINITIONS: dict[RulePreset, PresetInfo] = {
    RulePreset.DEFAULT: PresetInfo(
        name=RulePreset.DEFAULT,
        display_name="Default",
        description="General purpose validation rules. Balanced coverage and thresholds.",
        strictness=StrictnessLevel.MEDIUM,
        categories=[
            RuleCategory.SCHEMA,
            RuleCategory.COMPLETENESS,
            RuleCategory.UNIQUENESS,
            RuleCategory.STATISTICS,
        ],
        recommended_for="Most data validation scenarios",
    ),
    RulePreset.STRICT: PresetInfo(
        name=RulePreset.STRICT,
        display_name="Strict",
        description="Tight thresholds for production data. High confidence rules only.",
        strictness=StrictnessLevel.STRICT,
        categories=[
            RuleCategory.SCHEMA,
            RuleCategory.COMPLETENESS,
            RuleCategory.UNIQUENESS,
            RuleCategory.STATISTICS,
            RuleCategory.PATTERN,
        ],
        recommended_for="Production data pipelines, data quality gates",
    ),
    RulePreset.LOOSE: PresetInfo(
        name=RulePreset.LOOSE,
        display_name="Loose",
        description="Permissive thresholds for development/testing.",
        strictness=StrictnessLevel.LOOSE,
        categories=[
            RuleCategory.SCHEMA,
            RuleCategory.COMPLETENESS,
        ],
        recommended_for="Development, testing, exploratory analysis",
    ),
    RulePreset.MINIMAL: PresetInfo(
        name=RulePreset.MINIMAL,
        display_name="Minimal",
        description="Essential rules only. Focus on critical data integrity.",
        strictness=StrictnessLevel.MEDIUM,
        categories=[
            RuleCategory.SCHEMA,
            RuleCategory.COMPLETENESS,
        ],
        recommended_for="Quick validation, minimal overhead",
    ),
    RulePreset.COMPREHENSIVE: PresetInfo(
        name=RulePreset.COMPREHENSIVE,
        display_name="Comprehensive",
        description="All available rules. Maximum validation coverage.",
        strictness=StrictnessLevel.MEDIUM,
        categories=[
            RuleCategory.SCHEMA,
            RuleCategory.COMPLETENESS,
            RuleCategory.UNIQUENESS,
            RuleCategory.STATISTICS,
            RuleCategory.PATTERN,
            RuleCategory.DISTRIBUTION,
        ],
        recommended_for="Full data audit, compliance checks",
    ),
    RulePreset.CI_CD: PresetInfo(
        name=RulePreset.CI_CD,
        display_name="CI/CD",
        description="Optimized for continuous integration. Fast execution, clear failures.",
        strictness=StrictnessLevel.STRICT,
        categories=[
            RuleCategory.SCHEMA,
            RuleCategory.COMPLETENESS,
            RuleCategory.UNIQUENESS,
        ],
        recommended_for="CI/CD pipelines, automated testing",
    ),
    RulePreset.SCHEMA_ONLY: PresetInfo(
        name=RulePreset.SCHEMA_ONLY,
        display_name="Schema Only",
        description="Structure validation only. No statistical checks.",
        strictness=StrictnessLevel.MEDIUM,
        categories=[RuleCategory.SCHEMA],
        recommended_for="Schema drift detection, structure validation",
    ),
    RulePreset.FORMAT_ONLY: PresetInfo(
        name=RulePreset.FORMAT_ONLY,
        display_name="Format Only",
        description="Format and pattern rules only.",
        strictness=StrictnessLevel.MEDIUM,
        categories=[RuleCategory.PATTERN],
        recommended_for="Data format validation, PII detection",
    ),
    RulePreset.CROSS_COLUMN: PresetInfo(
        name=RulePreset.CROSS_COLUMN,
        display_name="Cross-Column",
        description="Focus on cross-column relationships and constraints.",
        strictness=StrictnessLevel.MEDIUM,
        categories=[
            RuleCategory.RELATIONSHIP,
            RuleCategory.MULTI_COLUMN,
            RuleCategory.UNIQUENESS,
        ],
        recommended_for="Data integrity, referential constraints, composite keys",
    ),
    RulePreset.DATA_INTEGRITY: PresetInfo(
        name=RulePreset.DATA_INTEGRITY,
        display_name="Data Integrity",
        description="Comprehensive data integrity validation including cross-column rules.",
        strictness=StrictnessLevel.STRICT,
        categories=[
            RuleCategory.SCHEMA,
            RuleCategory.COMPLETENESS,
            RuleCategory.UNIQUENESS,
            RuleCategory.RELATIONSHIP,
            RuleCategory.MULTI_COLUMN,
        ],
        recommended_for="Database migrations, data warehouse validation",
    ),
}


# Strictness thresholds
STRICTNESS_THRESHOLDS = {
    StrictnessLevel.LOOSE: {
        "min_confidence": 0.3,
        "null_threshold": 10.0,
        "unique_threshold": 90.0,
        "range_buffer": 0.2,  # 20% buffer on ranges
    },
    StrictnessLevel.MEDIUM: {
        "min_confidence": 0.5,
        "null_threshold": 5.0,
        "unique_threshold": 95.0,
        "range_buffer": 0.1,  # 10% buffer
    },
    StrictnessLevel.STRICT: {
        "min_confidence": 0.7,
        "null_threshold": 1.0,
        "unique_threshold": 99.0,
        "range_buffer": 0.0,  # No buffer
    },
}


def _parse_percentage(value: str | None) -> float:
    """Parse percentage string to float.

    Args:
        value: Percentage string like "25.5%".

    Returns:
        Float value (0.0-100.0).
    """
    if not value:
        return 0.0
    try:
        return float(value.replace("%", ""))
    except (ValueError, AttributeError):
        return 0.0


class RuleGeneratorService:
    """Service for generating validation rules from profile data."""

    def __init__(self, session: AsyncSession):
        """Initialize service.

        Args:
            session: Database session.
        """
        self.session = session
        self.profile_repo = ProfileRepository(session)
        self.rule_repo = RuleRepository(session)

    def _suggest_null_rules(
        self,
        column: dict[str, Any],
        strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
    ) -> list[SuggestedRule]:
        """Suggest null-related validators based on null percentage.

        Args:
            column: Column profile data.
            strictness: Strictness level for thresholds.

        Returns:
            List of suggested rules.
        """
        suggestions = []
        col_name = column.get("name", "")
        null_pct = _parse_percentage(column.get("null_pct"))
        thresholds = STRICTNESS_THRESHOLDS[strictness]
        null_threshold = thresholds["null_threshold"]

        if null_pct == 0.0:
            # Column has no nulls - suggest NotNull
            suggestions.append(
                SuggestedRule(
                    column=col_name,
                    validator_name="NotNull",
                    params={},
                    confidence=0.95,
                    reason="Column has 0% null values",
                    severity_suggestion="high",
                    category=RuleCategory.COMPLETENESS,
                )
            )
        elif null_pct < 1.0:
            # Very few nulls - suggest Null with mostly
            mostly = 0.99 if strictness == StrictnessLevel.STRICT else 0.98
            suggestions.append(
                SuggestedRule(
                    column=col_name,
                    validator_name="Null",
                    params={"mostly": mostly},
                    confidence=0.85,
                    reason=f"Column has only {null_pct}% null values",
                    severity_suggestion="medium",
                    category=RuleCategory.COMPLETENESS,
                )
            )
        elif null_pct < null_threshold:
            # Some nulls - suggest Null with lower threshold
            mostly = 1 - (null_pct / 100) - 0.01
            suggestions.append(
                SuggestedRule(
                    column=col_name,
                    validator_name="Null",
                    params={"mostly": round(mostly, 2)},
                    confidence=0.7,
                    reason=f"Column has {null_pct}% null values",
                    severity_suggestion="low",
                    category=RuleCategory.COMPLETENESS,
                )
            )

        return suggestions

    def _suggest_uniqueness_rules(
        self,
        column: dict[str, Any],
        strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
    ) -> list[SuggestedRule]:
        """Suggest uniqueness validators based on unique percentage.

        Args:
            column: Column profile data.
            strictness: Strictness level for thresholds.

        Returns:
            List of suggested rules.
        """
        suggestions = []
        col_name = column.get("name", "")
        unique_pct = _parse_percentage(column.get("unique_pct"))
        distinct_count = column.get("distinct_count")
        thresholds = STRICTNESS_THRESHOLDS[strictness]
        unique_threshold = thresholds["unique_threshold"]

        if unique_pct >= 99.9:
            # Nearly unique - suggest Unique
            suggestions.append(
                SuggestedRule(
                    column=col_name,
                    validator_name="Unique",
                    params={},
                    confidence=0.95,
                    reason=f"Column has {unique_pct}% unique values (likely primary key)",
                    severity_suggestion="high",
                    category=RuleCategory.UNIQUENESS,
                )
            )
        elif unique_pct >= unique_threshold:
            # High uniqueness - suggest Unique with tolerance
            mostly = unique_pct / 100
            suggestions.append(
                SuggestedRule(
                    column=col_name,
                    validator_name="Unique",
                    params={"mostly": round(mostly, 2)},
                    confidence=0.8,
                    reason=f"Column has {unique_pct}% unique values",
                    severity_suggestion="medium",
                    category=RuleCategory.UNIQUENESS,
                )
            )
        elif unique_pct < 10.0 and distinct_count and distinct_count < 50:
            # Low cardinality - suggest DistinctSet
            buffer = 10 if strictness == StrictnessLevel.LOOSE else 5
            suggestions.append(
                SuggestedRule(
                    column=col_name,
                    validator_name="DistinctSet",
                    params={"max_distinct": distinct_count + buffer},
                    confidence=0.75,
                    reason=f"Column has low cardinality ({distinct_count} distinct values)",
                    severity_suggestion="low",
                    category=RuleCategory.DISTRIBUTION,
                )
            )

        return suggestions

    def _suggest_range_rules(
        self,
        column: dict[str, Any],
        strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
    ) -> list[SuggestedRule]:
        """Suggest range validators based on min/max values.

        Args:
            column: Column profile data.
            strictness: Strictness level for thresholds.

        Returns:
            List of suggested rules.
        """
        suggestions = []
        col_name = column.get("name", "")
        dtype = column.get("dtype", "").lower()
        min_val = column.get("min")
        max_val = column.get("max")
        thresholds = STRICTNESS_THRESHOLDS[strictness]
        buffer = thresholds["range_buffer"]

        # Only suggest for numeric types
        if dtype not in ("int64", "int32", "float64", "float32", "number", "integer"):
            return suggestions

        if min_val is not None and max_val is not None:
            try:
                min_num = float(min_val)
                max_num = float(max_val)

                # Only suggest if range seems reasonable
                if max_num > min_num:
                    # Apply buffer to range
                    range_size = max_num - min_num
                    buffered_min = min_num - (range_size * buffer)
                    buffered_max = max_num + (range_size * buffer)

                    suggestions.append(
                        SuggestedRule(
                            column=col_name,
                            validator_name="Range",
                            params={
                                "min_value": round(buffered_min, 2),
                                "max_value": round(buffered_max, 2),
                            },
                            confidence=0.7,
                            reason=f"Column values range from {min_num} to {max_num}",
                            severity_suggestion="medium",
                            category=RuleCategory.STATISTICS,
                        )
                    )
            except (ValueError, TypeError):
                pass

        return suggestions

    def _suggest_type_rules(
        self,
        column: dict[str, Any],
        schema_column: dict[str, Any] | None = None,
        strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
    ) -> list[SuggestedRule]:
        """Suggest type and pattern validators based on data type.

        Args:
            column: Column profile data.
            schema_column: Optional schema column definition.
            strictness: Strictness level.

        Returns:
            List of suggested rules.
        """
        suggestions = []
        col_name = column.get("name", "").lower()
        original_name = column.get("name", "")
        dtype = column.get("dtype", "").lower()

        # Email detection by column name
        if any(hint in col_name for hint in ("email", "e_mail", "mail")):
            suggestions.append(
                SuggestedRule(
                    column=original_name,
                    validator_name="Email",
                    params={},
                    confidence=0.85,
                    reason="Column name suggests email content",
                    severity_suggestion="medium",
                    category=RuleCategory.PATTERN,
                )
            )

        # Phone detection by column name
        if any(hint in col_name for hint in ("phone", "tel", "mobile", "cell")):
            suggestions.append(
                SuggestedRule(
                    column=original_name,
                    validator_name="Phone",
                    params={},
                    confidence=0.75,
                    reason="Column name suggests phone number",
                    severity_suggestion="low",
                    category=RuleCategory.PATTERN,
                )
            )

        # URL detection by column name
        if any(hint in col_name for hint in ("url", "link", "website", "href")):
            suggestions.append(
                SuggestedRule(
                    column=original_name,
                    validator_name="URL",
                    params={},
                    confidence=0.8,
                    reason="Column name suggests URL content",
                    severity_suggestion="low",
                    category=RuleCategory.PATTERN,
                )
            )

        # Date/datetime type detection
        if dtype in ("datetime64", "date", "timestamp"):
            suggestions.append(
                SuggestedRule(
                    column=original_name,
                    validator_name="DateParseable",
                    params={},
                    confidence=0.9,
                    reason=f"Column has {dtype} data type",
                    severity_suggestion="medium",
                    category=RuleCategory.SCHEMA,
                )
            )

        # Positive number detection for common column names
        if dtype in ("int64", "int32", "float64", "float32"):
            positive_hints = ("id", "count", "quantity", "amount", "price", "age")
            if any(col_name.endswith(hint) or col_name == hint for hint in positive_hints):
                min_val = column.get("min")
                if min_val is not None:
                    try:
                        if float(min_val) >= 0:
                            suggestions.append(
                                SuggestedRule(
                                    column=original_name,
                                    validator_name="Positive",
                                    params={},
                                    confidence=0.75,
                                    reason=f"Column name suggests positive values (min={min_val})",
                                    severity_suggestion="low",
                                    category=RuleCategory.STATISTICS,
                                )
                            )
                    except (ValueError, TypeError):
                        pass

        return suggestions

    def _suggest_statistical_rules(
        self,
        column: dict[str, Any],
        strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
    ) -> list[SuggestedRule]:
        """Suggest statistical validators based on distribution.

        Args:
            column: Column profile data.
            strictness: Strictness level.

        Returns:
            List of suggested rules.
        """
        suggestions = []
        col_name = column.get("name", "")
        mean = column.get("mean")
        std = column.get("std")

        # Suggest Z-score based outlier detection if we have distribution data
        if mean is not None and std is not None and std > 0:
            # Adjust threshold based on strictness
            threshold = {
                StrictnessLevel.LOOSE: 4.0,
                StrictnessLevel.MEDIUM: 3.0,
                StrictnessLevel.STRICT: 2.5,
            }[strictness]

            suggestions.append(
                SuggestedRule(
                    column=col_name,
                    validator_name="ZScore",
                    params={"threshold": threshold},
                    confidence=0.6,
                    reason=f"Column has mean={mean:.2f}, std={std:.2f}",
                    severity_suggestion="low",
                    category=RuleCategory.STATISTICS,
                )
            )

        return suggestions

    # =============================================================================
    # Cross-Column Rule Suggestion Methods
    # =============================================================================

    def _suggest_composite_key_rules(
        self,
        columns: list[dict[str, Any]],
        strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
    ) -> list[CrossColumnRuleSuggestion]:
        """Suggest composite key (multi-column uniqueness) rules.

        Analyzes column combinations to detect potential composite keys
        based on uniqueness patterns.

        Args:
            columns: List of column profile data.
            strictness: Strictness level.

        Returns:
            List of cross-column suggestions.
        """
        suggestions = []

        # Find columns that might form composite keys
        # Look for ID-like columns or columns with moderate cardinality
        id_columns = []
        categorical_columns = []

        for col in columns:
            col_name = col.get("name", "")
            unique_pct = _parse_percentage(col.get("unique_pct"))
            distinct_count = col.get("distinct_count", 0)

            # ID-like columns (high but not 100% uniqueness)
            if 50 <= unique_pct < 99.9 and any(
                hint in col_name.lower()
                for hint in ("id", "key", "code", "num", "ref")
            ):
                id_columns.append(col_name)

            # Categorical columns with moderate cardinality
            elif distinct_count and 2 < distinct_count < 100:
                categorical_columns.append(col_name)

        # Suggest composite keys from ID column pairs
        if len(id_columns) >= 2:
            for i in range(len(id_columns)):
                for j in range(i + 1, min(i + 3, len(id_columns))):
                    cols = [id_columns[i], id_columns[j]]
                    suggestions.append(
                        CrossColumnRuleSuggestion(
                            rule_type=CrossColumnRuleType.COMPOSITE_KEY,
                            columns=cols,
                            validator_name="MultiColumnUnique",
                            params={"columns": cols},
                            confidence=0.75,
                            reason=f"Columns {cols[0]} and {cols[1]} may form a composite key",
                            severity_suggestion="high",
                            evidence={
                                "pattern": "id_column_combination",
                                "columns": cols,
                            },
                        )
                    )

        # Suggest composite keys from ID + categorical combinations
        for id_col in id_columns[:2]:  # Limit to avoid explosion
            for cat_col in categorical_columns[:3]:
                suggestions.append(
                    CrossColumnRuleSuggestion(
                        rule_type=CrossColumnRuleType.COMPOSITE_KEY,
                        columns=[id_col, cat_col],
                        validator_name="MultiColumnUnique",
                        params={"columns": [id_col, cat_col]},
                        confidence=0.65,
                        reason=f"{id_col} combined with {cat_col} may form a natural key",
                        severity_suggestion="medium",
                        evidence={
                            "pattern": "id_categorical_combination",
                        },
                    )
                )

        return suggestions

    def _suggest_comparison_rules(
        self,
        columns: list[dict[str, Any]],
        strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
    ) -> list[CrossColumnRuleSuggestion]:
        """Suggest column comparison rules (e.g., end_date > start_date).

        Args:
            columns: List of column profile data.
            strictness: Strictness level.

        Returns:
            List of cross-column suggestions.
        """
        suggestions = []
        col_map = {col.get("name", ""): col for col in columns}

        # Common comparison patterns
        date_pairs = [
            ("start_date", "end_date", ">"),
            ("created_at", "updated_at", "<="),
            ("birth_date", "death_date", "<="),
            ("hire_date", "termination_date", "<="),
            ("order_date", "ship_date", "<="),
            ("start_time", "end_time", "<"),
        ]

        numeric_pairs = [
            ("min_value", "max_value", "<="),
            ("min_price", "max_price", "<="),
            ("min_quantity", "max_quantity", "<="),
            ("low", "high", "<="),
            ("floor", "ceiling", "<="),
            ("cost", "price", "<="),
        ]

        # Check date comparison patterns
        for start_hint, end_hint, operator in date_pairs:
            start_cols = [
                c for c in col_map
                if start_hint in c.lower() or c.lower().endswith("_start")
            ]
            end_cols = [
                c for c in col_map
                if end_hint in c.lower() or c.lower().endswith("_end")
            ]

            for start_col in start_cols:
                for end_col in end_cols:
                    # Avoid matching same column
                    if start_col == end_col:
                        continue
                    # Check if they share a common prefix/suffix
                    start_base = start_col.replace("_start", "").replace("start_", "")
                    end_base = end_col.replace("_end", "").replace("end_", "")
                    base_match = (
                        start_base.lower() == end_base.lower()
                        or start_base.lower().replace("date", "") == end_base.lower().replace("date", "")
                    )

                    confidence = 0.85 if base_match else 0.7

                    suggestions.append(
                        CrossColumnRuleSuggestion(
                            rule_type=CrossColumnRuleType.COLUMN_COMPARISON,
                            columns=[end_col, start_col],
                            validator_name="ColumnComparison",
                            params={
                                "column_a": end_col,
                                "column_b": start_col,
                                "operator": operator,
                            },
                            confidence=confidence,
                            reason=f"{end_col} should be {operator} {start_col}",
                            severity_suggestion="high" if confidence >= 0.8 else "medium",
                            evidence={
                                "pattern": "date_range",
                                "base_match": base_match,
                            },
                        )
                    )

        # Check numeric comparison patterns
        for min_hint, max_hint, operator in numeric_pairs:
            min_cols = [c for c in col_map if min_hint in c.lower()]
            max_cols = [c for c in col_map if max_hint in c.lower()]

            for min_col in min_cols:
                for max_col in max_cols:
                    if min_col == max_col:
                        continue

                    suggestions.append(
                        CrossColumnRuleSuggestion(
                            rule_type=CrossColumnRuleType.COLUMN_COMPARISON,
                            columns=[max_col, min_col],
                            validator_name="ColumnComparison",
                            params={
                                "column_a": max_col,
                                "column_b": min_col,
                                "operator": ">=",
                            },
                            confidence=0.8,
                            reason=f"{max_col} should be >= {min_col}",
                            severity_suggestion="high",
                            evidence={
                                "pattern": "numeric_range",
                            },
                        )
                    )

        return suggestions

    def _suggest_arithmetic_rules(
        self,
        columns: list[dict[str, Any]],
        strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
    ) -> list[CrossColumnRuleSuggestion]:
        """Suggest arithmetic relationship rules (sum, product, etc.).

        Args:
            columns: List of column profile data.
            strictness: Strictness level.

        Returns:
            List of cross-column suggestions.
        """
        suggestions = []
        col_map = {col.get("name", "").lower(): col.get("name", "") for col in columns}
        numeric_cols = [
            col.get("name", "") for col in columns
            if col.get("dtype", "").lower() in ("int64", "int32", "float64", "float32", "number", "integer")
        ]

        # Common sum patterns: subtotal + tax + shipping = total
        sum_patterns = [
            (["subtotal", "tax", "shipping"], "total", "Order total calculation"),
            (["subtotal", "tax"], "total", "Subtotal + tax = total"),
            (["quantity", "unit_price"], None, "Quantity * unit_price"),  # Product
            (["hours", "rate"], None, "Hours * rate"),  # Product
            (["principal", "interest"], "total_amount", "Principal + interest"),
            (["base_salary", "bonus"], "total_compensation", "Salary + bonus"),
        ]

        for pattern_cols, result_col, description in sum_patterns:
            matched_cols = []
            for p in pattern_cols:
                for col_lower, col_name in col_map.items():
                    if p in col_lower:
                        matched_cols.append(col_name)
                        break

            if len(matched_cols) >= 2:
                # Check for result column
                result_found = None
                if result_col:
                    for col_lower, col_name in col_map.items():
                        if result_col in col_lower:
                            result_found = col_name
                            break

                if result_found:
                    suggestions.append(
                        CrossColumnRuleSuggestion(
                            rule_type=CrossColumnRuleType.COLUMN_SUM,
                            columns=[*matched_cols, result_found],
                            validator_name="ColumnSum",
                            params={
                                "columns": matched_cols,
                                "target_column": result_found,
                                "tolerance": 0.01,
                            },
                            confidence=0.75,
                            reason=f"Sum of {', '.join(matched_cols)} should equal {result_found}",
                            severity_suggestion="high",
                            evidence={
                                "pattern": "arithmetic_sum",
                                "description": description,
                            },
                        )
                    )

        # Percentage/ratio patterns
        percentage_patterns = [
            ("percentage", "total", "part", "Percentage calculation"),
            ("rate", "amount", "base", "Rate calculation"),
            ("discount_pct", "discount", "subtotal", "Discount percentage"),
        ]

        return suggestions

    def _suggest_correlation_rules(
        self,
        columns: list[dict[str, Any]],
        strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
    ) -> list[CrossColumnRuleSuggestion]:
        """Suggest column correlation rules for numeric columns.

        Analyzes profile data to identify potentially correlated numeric columns
        based on naming patterns and statistical properties.

        Args:
            columns: List of column profile data.
            strictness: Strictness level.

        Returns:
            List of cross-column suggestions.
        """
        suggestions = []

        # Filter numeric columns only
        numeric_cols = [
            col for col in columns
            if col.get("dtype", "").lower() in (
                "int64", "int32", "float64", "float32", "number", "integer", "float"
            )
        ]

        if len(numeric_cols) < 2:
            return suggestions

        # Common correlation patterns based on naming conventions
        correlation_patterns = [
            # High positive correlation expected
            (["price", "cost"], "positive", 0.7, "Price/cost related columns"),
            (["quantity", "total"], "positive", 0.5, "Quantity affects total"),
            (["height", "weight"], "positive", 0.3, "Physical measurements"),
            (["income", "expenditure"], "positive", 0.4, "Financial metrics"),
            (["age", "experience"], "positive", 0.5, "Age correlates with experience"),
            (["views", "clicks"], "positive", 0.6, "Engagement metrics"),
            (["revenue", "profit"], "positive", 0.6, "Revenue correlates with profit"),
            # Negative correlation expected
            (["discount", "price"], "negative", -0.3, "Discount inversely affects price"),
            (["errors", "quality"], "negative", -0.5, "Errors reduce quality score"),
        ]

        col_name_map = {col.get("name", "").lower(): col.get("name", "") for col in numeric_cols}

        for hints, direction, expected_correlation, description in correlation_patterns:
            matched_cols = []
            for hint in hints:
                for col_lower, col_name in col_name_map.items():
                    if hint in col_lower and col_name not in matched_cols:
                        matched_cols.append(col_name)
                        break

            if len(matched_cols) >= 2:
                # Suggest correlation check for the first pair found
                col_a, col_b = matched_cols[0], matched_cols[1]
                if direction == "positive":
                    min_corr = expected_correlation
                    max_corr = 1.0
                else:
                    min_corr = -1.0
                    max_corr = expected_correlation

                suggestions.append(
                    CrossColumnRuleSuggestion(
                        rule_type=CrossColumnRuleType.COLUMN_CORRELATION,
                        columns=[col_a, col_b],
                        validator_name="ColumnCorrelation",
                        params={
                            "column_a": col_a,
                            "column_b": col_b,
                            "min_correlation": min_corr,
                            "max_correlation": max_corr,
                        },
                        confidence=0.65,
                        reason=f"{description}: {col_a} and {col_b} may be correlated",
                        severity_suggestion="medium",
                        evidence={
                            "pattern": "correlation_pattern",
                            "direction": direction,
                            "expected_correlation": expected_correlation,
                        },
                    )
                )

        # Also suggest correlation check for columns with similar names (e.g., metric_v1, metric_v2)
        for i, col1 in enumerate(numeric_cols):
            for col2 in numeric_cols[i + 1:]:
                name1 = col1.get("name", "")
                name2 = col2.get("name", "")

                # Check for versioned or indexed columns
                base1 = re.sub(r"[_\-]?(v?\d+|old|new|prev|next)$", "", name1.lower())
                base2 = re.sub(r"[_\-]?(v?\d+|old|new|prev|next)$", "", name2.lower())

                if base1 and base1 == base2 and name1 != name2:
                    suggestions.append(
                        CrossColumnRuleSuggestion(
                            rule_type=CrossColumnRuleType.COLUMN_CORRELATION,
                            columns=[name1, name2],
                            validator_name="ColumnCorrelation",
                            params={
                                "column_a": name1,
                                "column_b": name2,
                                "min_correlation": 0.5,
                                "max_correlation": 1.0,
                            },
                            confidence=0.7,
                            reason=f"Versioned columns {name1} and {name2} should be correlated",
                            severity_suggestion="low",
                            evidence={
                                "pattern": "versioned_columns",
                                "base_name": base1,
                            },
                        )
                    )

        return suggestions

    def _suggest_chain_comparison_rules(
        self,
        columns: list[dict[str, Any]],
        strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
    ) -> list[CrossColumnRuleSuggestion]:
        """Suggest chain comparison rules (a < b < c).

        Args:
            columns: List of column profile data.
            strictness: Strictness level.

        Returns:
            List of cross-column suggestions.
        """
        suggestions = []
        col_map = {col.get("name", "").lower(): col.get("name", "") for col in columns}

        # Common chain comparison patterns
        chain_patterns = [
            # Date chains
            (["created", "updated", "deleted"], "<=", "Lifecycle date ordering"),
            (["start_date", "mid_date", "end_date"], "<=", "Date range ordering"),
            (["ordered", "shipped", "delivered"], "<=", "Order timeline"),
            (["submitted", "approved", "completed"], "<=", "Workflow dates"),
            # Numeric chains
            (["min", "avg", "max"], "<=", "Statistical ordering"),
            (["low", "mid", "high"], "<=", "Range tier ordering"),
            (["bronze", "silver", "gold"], "<=", "Tier value ordering"),
            (["small", "medium", "large"], "<=", "Size value ordering"),
            (["floor_price", "price", "ceiling_price"], "<=", "Price bounds ordering"),
            (["cost", "price", "msrp"], "<=", "Pricing chain"),
        ]

        for hints, operator, description in chain_patterns:
            matched_cols = []
            for hint in hints:
                for col_lower, col_name in col_map.items():
                    if hint in col_lower and col_name not in matched_cols:
                        matched_cols.append(col_name)
                        break

            if len(matched_cols) >= 3:
                suggestions.append(
                    CrossColumnRuleSuggestion(
                        rule_type=CrossColumnRuleType.COLUMN_CHAIN_COMPARISON,
                        columns=matched_cols[:3],  # Limit to 3 columns
                        validator_name="ColumnChainComparison",
                        params={
                            "columns": matched_cols[:3],
                            "operator": operator,
                        },
                        confidence=0.75,
                        reason=f"{description}: {' {0} '.format(operator).join(matched_cols[:3])}",
                        severity_suggestion="medium",
                        evidence={
                            "pattern": "chain_comparison",
                            "operator": operator,
                        },
                    )
                )

        return suggestions

    def _suggest_advanced_arithmetic_rules(
        self,
        columns: list[dict[str, Any]],
        strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
    ) -> list[CrossColumnRuleSuggestion]:
        """Suggest advanced arithmetic relationship rules (product, ratio, percentage).

        Args:
            columns: List of column profile data.
            strictness: Strictness level.

        Returns:
            List of cross-column suggestions.
        """
        suggestions = []
        col_map = {col.get("name", "").lower(): col.get("name", "") for col in columns}
        numeric_cols = [
            col.get("name", "") for col in columns
            if col.get("dtype", "").lower() in (
                "int64", "int32", "float64", "float32", "number", "integer", "float"
            )
        ]

        # Product patterns (a * b = c)
        product_patterns = [
            (["quantity", "unit_price"], "total", "Line item total"),
            (["quantity", "price"], "amount", "Order amount"),
            (["hours", "rate"], "cost", "Labor cost"),
            (["hours", "hourly_rate"], "total_cost", "Total labor cost"),
            (["length", "width"], "area", "Area calculation"),
            (["principal", "rate"], "interest", "Interest calculation"),
        ]

        for factors, result_hint, description in product_patterns:
            factor_cols = []
            for factor in factors:
                for col_lower, col_name in col_map.items():
                    if factor in col_lower and col_name not in factor_cols:
                        factor_cols.append(col_name)
                        break

            if len(factor_cols) >= 2:
                # Find result column
                result_col = None
                for col_lower, col_name in col_map.items():
                    if result_hint in col_lower and col_name not in factor_cols:
                        result_col = col_name
                        break

                if result_col:
                    suggestions.append(
                        CrossColumnRuleSuggestion(
                            rule_type=CrossColumnRuleType.COLUMN_PRODUCT,
                            columns=[*factor_cols, result_col],
                            validator_name="ColumnProduct",
                            params={
                                "columns": factor_cols,
                                "target_column": result_col,
                                "tolerance": 0.01,
                            },
                            confidence=0.75,
                            reason=f"{description}: {' × '.join(factor_cols)} = {result_col}",
                            severity_suggestion="high",
                            evidence={
                                "pattern": "arithmetic_product",
                                "description": description,
                            },
                        )
                    )

        # Ratio patterns (a / b = expected ratio or a / b ≈ c)
        ratio_patterns = [
            ("profit", "revenue", "margin", "Profit margin"),
            ("tax", "subtotal", "tax_rate", "Tax rate"),
            ("discount", "price", "discount_rate", "Discount rate"),
            ("part", "total", "ratio", "Part to total ratio"),
            ("completed", "total", "completion_rate", "Completion rate"),
        ]

        for numerator_hint, denominator_hint, result_hint, description in ratio_patterns:
            numerator_col = None
            denominator_col = None
            result_col = None

            for col_lower, col_name in col_map.items():
                if numerator_hint in col_lower and not numerator_col:
                    numerator_col = col_name
                elif denominator_hint in col_lower and not denominator_col:
                    denominator_col = col_name
                elif result_hint in col_lower and not result_col:
                    result_col = col_name

            if numerator_col and denominator_col:
                if result_col:
                    # Ratio with result column
                    suggestions.append(
                        CrossColumnRuleSuggestion(
                            rule_type=CrossColumnRuleType.COLUMN_RATIO,
                            columns=[numerator_col, denominator_col, result_col],
                            validator_name="ColumnRatio",
                            params={
                                "numerator_column": numerator_col,
                                "denominator_column": denominator_col,
                                "result_column": result_col,
                                "tolerance": 0.01,
                            },
                            confidence=0.7,
                            reason=f"{description}: {numerator_col} / {denominator_col} = {result_col}",
                            severity_suggestion="medium",
                            evidence={
                                "pattern": "arithmetic_ratio",
                                "description": description,
                            },
                        )
                    )

        # Percentage patterns
        percentage_patterns = [
            ("discount_pct", "subtotal", "discount", "Discount percentage"),
            ("tax_pct", "subtotal", "tax", "Tax percentage"),
            ("commission_pct", "sales", "commission", "Commission percentage"),
            ("margin_pct", "revenue", "profit", "Margin percentage"),
        ]

        for pct_hint, base_hint, result_hint, description in percentage_patterns:
            pct_col = None
            base_col = None
            result_col = None

            for col_lower, col_name in col_map.items():
                if pct_hint in col_lower and not pct_col:
                    pct_col = col_name
                elif base_hint in col_lower and not base_col:
|
|
1312
|
+
base_col = col_name
|
|
1313
|
+
elif result_hint in col_lower and not result_col:
|
|
1314
|
+
result_col = col_name
|
|
1315
|
+
|
|
1316
|
+
if pct_col and base_col and result_col:
|
|
1317
|
+
suggestions.append(
|
|
1318
|
+
CrossColumnRuleSuggestion(
|
|
1319
|
+
rule_type=CrossColumnRuleType.COLUMN_PERCENTAGE,
|
|
1320
|
+
columns=[pct_col, base_col, result_col],
|
|
1321
|
+
validator_name="ColumnPercentage",
|
|
1322
|
+
params={
|
|
1323
|
+
"percentage_column": pct_col,
|
|
1324
|
+
"base_column": base_col,
|
|
1325
|
+
"result_column": result_col,
|
|
1326
|
+
"tolerance": 0.01,
|
|
1327
|
+
},
|
|
1328
|
+
confidence=0.7,
|
|
1329
|
+
reason=f"{description}: {base_col} × {pct_col}% = {result_col}",
|
|
1330
|
+
severity_suggestion="medium",
|
|
1331
|
+
evidence={
|
|
1332
|
+
"pattern": "arithmetic_percentage",
|
|
1333
|
+
"description": description,
|
|
1334
|
+
},
|
|
1335
|
+
)
|
|
1336
|
+
)
|
|
1337
|
+
|
|
1338
|
+
# Difference patterns (a - b = c)
|
|
1339
|
+
difference_patterns = [
|
|
1340
|
+
("gross", "deductions", "net", "Net calculation"),
|
|
1341
|
+
("revenue", "cost", "profit", "Profit calculation"),
|
|
1342
|
+
("end_value", "start_value", "change", "Change calculation"),
|
|
1343
|
+
("current", "previous", "delta", "Delta calculation"),
|
|
1344
|
+
]
|
|
1345
|
+
|
|
1346
|
+
for minuend_hint, subtrahend_hint, result_hint, description in difference_patterns:
|
|
1347
|
+
minuend_col = None
|
|
1348
|
+
subtrahend_col = None
|
|
1349
|
+
result_col = None
|
|
1350
|
+
|
|
1351
|
+
for col_lower, col_name in col_map.items():
|
|
1352
|
+
if minuend_hint in col_lower and not minuend_col:
|
|
1353
|
+
minuend_col = col_name
|
|
1354
|
+
elif subtrahend_hint in col_lower and not subtrahend_col:
|
|
1355
|
+
subtrahend_col = col_name
|
|
1356
|
+
elif result_hint in col_lower and not result_col:
|
|
1357
|
+
result_col = col_name
|
|
1358
|
+
|
|
1359
|
+
if minuend_col and subtrahend_col and result_col:
|
|
1360
|
+
suggestions.append(
|
|
1361
|
+
CrossColumnRuleSuggestion(
|
|
1362
|
+
rule_type=CrossColumnRuleType.COLUMN_DIFFERENCE,
|
|
1363
|
+
columns=[minuend_col, subtrahend_col, result_col],
|
|
1364
|
+
validator_name="ColumnDifference",
|
|
1365
|
+
params={
|
|
1366
|
+
"minuend_column": minuend_col,
|
|
1367
|
+
"subtrahend_column": subtrahend_col,
|
|
1368
|
+
"result_column": result_col,
|
|
1369
|
+
"tolerance": 0.01,
|
|
1370
|
+
},
|
|
1371
|
+
confidence=0.75,
|
|
1372
|
+
reason=f"{description}: {minuend_col} - {subtrahend_col} = {result_col}",
|
|
1373
|
+
severity_suggestion="high",
|
|
1374
|
+
evidence={
|
|
1375
|
+
"pattern": "arithmetic_difference",
|
|
1376
|
+
"description": description,
|
|
1377
|
+
},
|
|
1378
|
+
)
|
|
1379
|
+
)
|
|
1380
|
+
|
|
1381
|
+
return suggestions
|
|
1382
|
+
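
A ColumnProduct suggestion only records the relationship; the arithmetic it implies can be illustrated with a tiny standalone check. This is a sketch with invented rows and is not the validator's actual implementation.

# Illustrative sketch; invented data, not the ColumnProduct validator itself.
# It checks the relationship the suggestion encodes: quantity * unit_price
# should equal total within the suggested tolerance.
rows = [
    {"quantity": 3, "unit_price": 9.99, "total": 29.97},
    {"quantity": 2, "unit_price": 5.00, "total": 10.50},  # off by 0.50 -> violation
]
tolerance = 0.01

for row in rows:
    expected = row["quantity"] * row["unit_price"]
    ok = abs(expected - row["total"]) <= tolerance
    print(row, "OK" if ok else "VIOLATION")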

    def _suggest_dependency_rules(
        self,
        columns: list[dict[str, Any]],
        strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
    ) -> list[CrossColumnRuleSuggestion]:
        """Suggest functional dependency and implication rules.

        Args:
            columns: List of column profile data.
            strictness: Strictness level.

        Returns:
            List of cross-column suggestions.
        """
        suggestions = []
        col_map = {col.get("name", ""): col for col in columns}

        # Common dependency patterns
        dependency_patterns = [
            # If status is 'active', email must not be null
            ("status", "email", "active", "If status is active, email is required"),
            ("status", "phone", "active", "If status is active, phone is required"),
            # If is_premium, subscription_tier must be set
            ("is_premium", "subscription_tier", True, "Premium users must have subscription tier"),
            # Country determines currency
            ("country", "currency", None, "Country determines currency"),
            ("country_code", "currency_code", None, "Country code determines currency code"),
        ]

        for det_hint, dep_hint, condition, description in dependency_patterns:
            det_col = None
            dep_col = None

            for col_name in col_map:
                if det_hint in col_name.lower():
                    det_col = col_name
                if dep_hint in col_name.lower():
                    dep_col = col_name

            if det_col and dep_col and det_col != dep_col:
                if condition is not None:
                    # Implication rule: if condition then dependent not null
                    suggestions.append(
                        CrossColumnRuleSuggestion(
                            rule_type=CrossColumnRuleType.COLUMN_IMPLICATION,
                            columns=[det_col, dep_col],
                            validator_name="ColumnImplication",
                            params={
                                "determinant_column": det_col,
                                "dependent_column": dep_col,
                                "condition_value": condition,
                            },
                            confidence=0.7,
                            reason=description,
                            severity_suggestion="medium",
                            evidence={
                                "pattern": "conditional_dependency",
                            },
                        )
                    )
                else:
                    # Functional dependency
                    suggestions.append(
                        CrossColumnRuleSuggestion(
                            rule_type=CrossColumnRuleType.COLUMN_DEPENDENCY,
                            columns=[det_col, dep_col],
                            validator_name="ColumnDependency",
                            params={
                                "determinant_column": det_col,
                                "dependent_column": dep_col,
                            },
                            confidence=0.65,
                            reason=description,
                            severity_suggestion="medium",
                            evidence={
                                "pattern": "functional_dependency",
                            },
                        )
                    )

        # Coexistence patterns (all null or all non-null)
        coexistence_groups = [
            (["address_line1", "city", "postal_code"], "Address fields should coexist"),
            (["latitude", "longitude"], "Coordinates should coexist"),
            (["first_name", "last_name"], "Name fields should coexist"),
            (["start_date", "end_date"], "Date range fields should coexist"),
        ]

        for hints, description in coexistence_groups:
            found_cols = []
            for hint in hints:
                for col_name in col_map:
                    if hint in col_name.lower():
                        found_cols.append(col_name)
                        break

            if len(found_cols) >= 2:
                suggestions.append(
                    CrossColumnRuleSuggestion(
                        rule_type=CrossColumnRuleType.COLUMN_COEXISTENCE,
                        columns=found_cols,
                        validator_name="ColumnCoexistence",
                        params={"columns": found_cols},
                        confidence=0.7,
                        reason=description,
                        severity_suggestion="medium",
                        evidence={
                            "pattern": "coexistence",
                            "matched_hints": hints[:len(found_cols)],
                        },
                    )
                )

        # Mutual exclusivity patterns
        mutex_groups = [
            (["phone_home", "phone_work", "phone_mobile"], "At most one phone type"),
            (["payment_card", "payment_bank", "payment_crypto"], "One payment method"),
        ]

        for hints, description in mutex_groups:
            found_cols = []
            for hint in hints:
                for col_name in col_map:
                    if hint in col_name.lower():
                        found_cols.append(col_name)
                        break

            if len(found_cols) >= 2:
                suggestions.append(
                    CrossColumnRuleSuggestion(
                        rule_type=CrossColumnRuleType.COLUMN_MUTUAL_EXCLUSIVITY,
                        columns=found_cols,
                        validator_name="ColumnMutualExclusivity",
                        params={"columns": found_cols},
                        confidence=0.6,
                        reason=description,
                        severity_suggestion="low",
                        evidence={
                            "pattern": "mutual_exclusivity",
                        },
                    )
                )

        return suggestions
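
For reference, the coexistence groups above encode an all-null-or-all-non-null property per row, as the comment in the method states. The sketch below, with invented rows, illustrates that property; it is not the ColumnCoexistence validator's implementation.

# Illustrative sketch; invented rows, not part of the module above.
rows = [
    {"latitude": 52.52, "longitude": 13.40},   # both present -> OK
    {"latitude": None, "longitude": None},     # both absent  -> OK
    {"latitude": 48.85, "longitude": None},    # mixed        -> violation
]

for row in rows:
    present = [v is not None for v in row.values()]
    ok = all(present) or not any(present)
    print(row, "OK" if ok else "VIOLATION")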

    def _generate_cross_column_suggestions(
        self,
        columns: list[dict[str, Any]],
        strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
        include_types: list[CrossColumnRuleType] | None = None,
        exclude_types: list[CrossColumnRuleType] | None = None,
    ) -> list[CrossColumnRuleSuggestion]:
        """Generate all cross-column rule suggestions.

        Args:
            columns: List of column profile data.
            strictness: Strictness level.
            include_types: Only include these cross-column rule types.
            exclude_types: Exclude these cross-column rule types.

        Returns:
            List of cross-column suggestions.
        """
        all_suggestions: list[CrossColumnRuleSuggestion] = []

        # Generate suggestions by type
        # Each generator method may produce multiple rule types
        type_generators = {
            CrossColumnRuleType.COMPOSITE_KEY: self._suggest_composite_key_rules,
            CrossColumnRuleType.COLUMN_COMPARISON: self._suggest_comparison_rules,
            CrossColumnRuleType.COLUMN_SUM: self._suggest_arithmetic_rules,
            CrossColumnRuleType.COLUMN_DEPENDENCY: self._suggest_dependency_rules,
            CrossColumnRuleType.COLUMN_IMPLICATION: self._suggest_dependency_rules,
            CrossColumnRuleType.COLUMN_COEXISTENCE: self._suggest_dependency_rules,
            CrossColumnRuleType.COLUMN_MUTUAL_EXCLUSIVITY: self._suggest_dependency_rules,
            # New generators for comprehensive cross-column support
            CrossColumnRuleType.COLUMN_CORRELATION: self._suggest_correlation_rules,
            CrossColumnRuleType.COLUMN_CHAIN_COMPARISON: self._suggest_chain_comparison_rules,
            CrossColumnRuleType.COLUMN_PRODUCT: self._suggest_advanced_arithmetic_rules,
            CrossColumnRuleType.COLUMN_RATIO: self._suggest_advanced_arithmetic_rules,
            CrossColumnRuleType.COLUMN_PERCENTAGE: self._suggest_advanced_arithmetic_rules,
            CrossColumnRuleType.COLUMN_DIFFERENCE: self._suggest_advanced_arithmetic_rules,
        }

        # Determine which types to generate
        types_to_generate = set(type_generators.keys())

        if include_types:
            types_to_generate &= set(include_types)

        if exclude_types:
            types_to_generate -= set(exclude_types)

        # Generate suggestions
        generated_methods = set()
        for rule_type in types_to_generate:
            generator = type_generators.get(rule_type)
            if generator and generator not in generated_methods:
                generated_methods.add(generator)
                suggestions = generator(columns, strictness)
                all_suggestions.extend(suggestions)

        # Filter by min confidence based on strictness
        thresholds = STRICTNESS_THRESHOLDS[strictness]
        min_confidence = thresholds["min_confidence"]
        all_suggestions = [s for s in all_suggestions if s.confidence >= min_confidence]

        # Deduplicate and sort by confidence
        seen = set()
        unique_suggestions = []
        for s in all_suggestions:
            key = (s.rule_type.value, tuple(sorted(s.columns)))
            if key not in seen:
                seen.add(key)
                unique_suggestions.append(s)

        unique_suggestions.sort(key=lambda s: s.confidence, reverse=True)

        # Assign unique IDs
        import uuid
        for s in unique_suggestions:
            s.id = str(uuid.uuid4())[:8]

        return unique_suggestions
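
The dedup key above is the rule type value plus the sorted column tuple, so the same rule over the same columns listed in a different order collapses to one suggestion. A standalone sketch, with invented rule-type strings standing in for the enum values:

# Illustrative sketch; invented rule-type strings, not part of the module above.
seen: set[tuple[str, tuple[str, ...]]] = set()
candidates = [
    ("column_sum", ["net", "tax", "gross"]),
    ("column_sum", ["tax", "net", "gross"]),  # same columns, different order
    ("column_ratio", ["profit", "revenue"]),
]

unique = []
for rule_type, cols in candidates:
    key = (rule_type, tuple(sorted(cols)))
    if key not in seen:
        seen.add(key)
        unique.append((rule_type, cols))

print(len(unique))  # 2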

    def _get_categories_for_preset(
        self, preset: RulePreset | None
    ) -> list[RuleCategory] | None:
        """Get categories for a preset.

        Args:
            preset: Preset name.

        Returns:
            List of categories or None for all.
        """
        if preset is None:
            return None
        preset_info = PRESET_DEFINITIONS.get(preset)
        if preset_info:
            return preset_info.categories
        return None

    def _filter_by_category(
        self,
        suggestions: list[SuggestedRule],
        include_categories: list[RuleCategory] | None,
        exclude_categories: list[RuleCategory] | None,
    ) -> list[SuggestedRule]:
        """Filter suggestions by category.

        Args:
            suggestions: List of suggestions.
            include_categories: Categories to include (None = all).
            exclude_categories: Categories to exclude.

        Returns:
            Filtered list.
        """
        result = suggestions

        if include_categories:
            include_set = set(c.value if isinstance(c, RuleCategory) else c for c in include_categories)
            result = [
                s for s in result
                if (s.category.value if isinstance(s.category, RuleCategory) else s.category) in include_set
            ]

        if exclude_categories:
            exclude_set = set(c.value if isinstance(c, RuleCategory) else c for c in exclude_categories)
            result = [
                s for s in result
                if (s.category.value if isinstance(s.category, RuleCategory) else s.category) not in exclude_set
            ]

        return result

    async def generate_suggestions(
        self,
        source: Source,
        profile: Profile,
        schema: Schema | None = None,
        *,
        min_confidence: float = 0.5,
        strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
        preset: RulePreset | None = None,
        include_categories: list[RuleCategory] | None = None,
        exclude_categories: list[RuleCategory] | None = None,
        enable_cross_column: bool = True,
        include_cross_column_types: list[CrossColumnRuleType] | None = None,
        exclude_cross_column_types: list[CrossColumnRuleType] | None = None,
    ) -> RuleSuggestionResponse:
        """Generate rule suggestions based on profile data.

        Args:
            source: Source record.
            profile: Profile record.
            schema: Optional schema for additional context.
            min_confidence: Minimum confidence threshold.
            strictness: Strictness level for rule thresholds.
            preset: Preset template to use.
            include_categories: Categories to include.
            exclude_categories: Categories to exclude.
            enable_cross_column: Whether to generate cross-column rules.
            include_cross_column_types: Cross-column types to include.
            exclude_cross_column_types: Cross-column types to exclude.

        Returns:
            Rule suggestion response.
        """
        suggestions: list[SuggestedRule] = []
        cross_column_suggestions: list[CrossColumnRuleSuggestion] = []

        # Apply preset settings if specified
        if preset:
            preset_info = PRESET_DEFINITIONS.get(preset)
            if preset_info:
                strictness = preset_info.strictness
                include_categories = preset_info.categories

        # Adjust min_confidence based on strictness
        thresholds = STRICTNESS_THRESHOLDS[strictness]
        effective_min_confidence = max(min_confidence, thresholds["min_confidence"])

        # Get columns from profile
        columns = profile.columns if hasattr(profile, "columns") else []
        if not columns and profile.profile_json:
            columns = profile.profile_json.get("columns", [])

        # Get schema columns for additional context
        schema_columns = {}
        if schema and schema.schema_json:
            schema_columns = schema.schema_json.get("columns", {})

        # Generate single-column suggestions for each column
        for column in columns:
            col_name = column.get("name", "")
            schema_col = schema_columns.get(col_name)

            # Collect all suggestions for this column with strictness
            suggestions.extend(self._suggest_null_rules(column, strictness))
            suggestions.extend(self._suggest_uniqueness_rules(column, strictness))
            suggestions.extend(self._suggest_range_rules(column, strictness))
            suggestions.extend(self._suggest_type_rules(column, schema_col, strictness))
            suggestions.extend(self._suggest_statistical_rules(column, strictness))

        # Filter by category
        suggestions = self._filter_by_category(
            suggestions, include_categories, exclude_categories
        )

        # Filter by confidence threshold
        suggestions = [s for s in suggestions if s.confidence >= effective_min_confidence]

        # Sort by confidence (highest first)
        suggestions.sort(key=lambda s: s.confidence, reverse=True)

        # Generate cross-column suggestions if enabled
        if enable_cross_column and columns:
            cross_column_suggestions = self._generate_cross_column_suggestions(
                columns,
                strictness,
                include_cross_column_types,
                exclude_cross_column_types,
            )
            # Filter by min confidence
            cross_column_suggestions = [
                s for s in cross_column_suggestions
                if s.confidence >= effective_min_confidence
            ]

        # Count high confidence suggestions (single + cross-column)
        high_confidence = sum(1 for s in suggestions if s.confidence >= 0.8)
        high_confidence += sum(1 for s in cross_column_suggestions if s.confidence >= 0.8)

        # Count by category
        by_category: dict[str, int] = {}
        for s in suggestions:
            cat_value = s.category.value if isinstance(s.category, RuleCategory) else str(s.category)
            by_category[cat_value] = by_category.get(cat_value, 0) + 1

        # Add cross-column categories
        if cross_column_suggestions:
            by_category["relationship"] = len([
                s for s in cross_column_suggestions
                if s.rule_type in (
                    CrossColumnRuleType.COLUMN_COMPARISON,
                    CrossColumnRuleType.COLUMN_DEPENDENCY,
                    CrossColumnRuleType.COLUMN_IMPLICATION,
                )
            ])
            by_category["multi_column"] = len([
                s for s in cross_column_suggestions
                if s.rule_type in (
                    CrossColumnRuleType.COMPOSITE_KEY,
                    CrossColumnRuleType.COLUMN_SUM,
                    CrossColumnRuleType.COLUMN_COEXISTENCE,
                    CrossColumnRuleType.COLUMN_MUTUAL_EXCLUSIVITY,
                )
            ])

        # Count by cross-column type
        by_cross_column_type: dict[str, int] = {}
        for s in cross_column_suggestions:
            type_value = s.rule_type.value
            by_cross_column_type[type_value] = by_cross_column_type.get(type_value, 0) + 1

        # Collect unique categories
        categories_included = list(set(
            s.category if isinstance(s.category, RuleCategory) else RuleCategory(s.category)
            for s in suggestions
        ))
        if cross_column_suggestions:
            if RuleCategory.RELATIONSHIP not in categories_included:
                categories_included.append(RuleCategory.RELATIONSHIP)
            if RuleCategory.MULTI_COLUMN not in categories_included:
                categories_included.append(RuleCategory.MULTI_COLUMN)

        # Total suggestions count
        total_suggestions = len(suggestions) + len(cross_column_suggestions)

        return RuleSuggestionResponse(
            source_id=source.id,
            source_name=source.name,
            profile_id=profile.id,
            suggestions=suggestions,
            cross_column_suggestions=cross_column_suggestions,
            total_suggestions=total_suggestions,
            high_confidence_count=high_confidence,
            cross_column_count=len(cross_column_suggestions),
            generated_at=datetime.utcnow(),
            strictness=strictness,
            preset=preset,
            categories_included=categories_included,
            by_category=by_category,
            by_cross_column_type=by_cross_column_type,
        )
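
Note that effective_min_confidence only tightens the caller's threshold: the strictness floor wins whenever it is higher than the requested minimum. A standalone sketch with invented floor values (the real ones come from STRICTNESS_THRESHOLDS):

# Illustrative sketch; the threshold numbers are invented, not the library's.
strictness_floor = {"low": 0.5, "medium": 0.6, "high": 0.75}

def effective_min_confidence(requested: float, strictness: str) -> float:
    return max(requested, strictness_floor[strictness])

print(effective_min_confidence(0.5, "high"))    # 0.75 -> the strictness floor wins
print(effective_min_confidence(0.9, "medium"))  # 0.9  -> the caller's threshold wins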

    def _build_rules_dict(
        self, suggestions: list[SuggestedRule]
    ) -> tuple[dict[str, Any], list[str]]:
        """Build rules dictionary from suggestions.

        Args:
            suggestions: List of suggestions.

        Returns:
            Tuple of (rules dict, validator names).
        """
        rules_dict: dict[str, Any] = {"columns": {}}
        validators_applied = []

        for suggestion in suggestions:
            col_name = suggestion.column
            validator_name = suggestion.validator_name

            if col_name not in rules_dict["columns"]:
                rules_dict["columns"][col_name] = {}

            # Add validator with params
            if suggestion.params:
                rules_dict["columns"][col_name][validator_name.lower()] = suggestion.params
            else:
                rules_dict["columns"][col_name][validator_name.lower()] = True

            validators_applied.append(validator_name)

        return rules_dict, validators_applied
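
The resulting dict nests validators under each column, storing the params mapping when present and True otherwise. The condensed sketch below shows the same shape using a hypothetical stand-in for SuggestedRule, reduced to the three attributes the method reads; the column and validator names are invented.

# Illustrative sketch; Rule is a hypothetical stand-in, not the real SuggestedRule.
from dataclasses import dataclass, field
from typing import Any

@dataclass
class Rule:
    column: str
    validator_name: str
    params: dict[str, Any] = field(default_factory=dict)

rules = [
    Rule("age", "Range", {"min_value": 0, "max_value": 120}),
    Rule("email", "NotNull"),
]

rules_dict: dict[str, Any] = {"columns": {}}
for r in rules:
    # Same effect as the if/else above, condensed with setdefault.
    rules_dict["columns"].setdefault(r.column, {})[r.validator_name.lower()] = r.params or True

print(rules_dict)
# {'columns': {'age': {'range': {'min_value': 0, 'max_value': 120}}, 'email': {'notnull': True}}}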

    async def apply_suggestions(
        self,
        source: Source,
        suggestions: list[SuggestedRule],
        *,
        rule_name: str | None = None,
        rule_description: str | None = None,
    ) -> ApplyRulesResponse:
        """Apply selected rule suggestions to create validation rules.

        Args:
            source: Source record.
            suggestions: Selected suggestions to apply.
            rule_name: Optional name for the rule set.
            rule_description: Optional description.

        Returns:
            Apply rules response.
        """
        # Build rules from suggestions
        rules_dict, validators_applied = self._build_rules_dict(suggestions)

        # Create YAML string
        rules_yaml = yaml.dump(rules_dict, default_flow_style=False)

        # Create rule record
        final_name = rule_name or f"Auto-generated rules for {source.name}"
        final_description = rule_description or (
            f"Automatically generated from profile analysis. "
            f"Includes {len(suggestions)} validators."
        )

        # Deactivate existing rules
        existing_rules = await self.rule_repo.get_for_source(
            source.id, active_only=True
        )
        for rule in existing_rules:
            rule.is_active = False

        # Create new rule
        rule = await self.rule_repo.create(
            source_id=source.id,
            name=final_name,
            description=final_description,
            rules_yaml=rules_yaml,
            rules_json=rules_dict,
            is_active=True,
        )

        await self.session.commit()

        return ApplyRulesResponse(
            source_id=source.id,
            rule_id=rule.id,
            rule_name=rule.name,
            applied_count=len(suggestions),
            validators=list(set(validators_applied)),
            created_at=rule.created_at,
        )

    def export_rules(
        self,
        suggestions: list[SuggestedRule],
        format: RuleExportFormat = RuleExportFormat.YAML,
        *,
        rule_name: str = "auto_generated_rules",
        description: str | None = None,
        include_metadata: bool = True,
    ) -> ExportRulesResponse:
        """Export rules in various formats.

        Args:
            suggestions: Rules to export.
            format: Export format.
            rule_name: Name for the rule set.
            description: Optional description.
            include_metadata: Include generation metadata.

        Returns:
            Export response with content.
        """
        rules_dict, validators = self._build_rules_dict(suggestions)

        # Add metadata if requested
        if include_metadata:
            rules_dict["_metadata"] = {
                "name": rule_name,
                "description": description or f"Auto-generated rules ({len(suggestions)} validators)",
                "generated_at": datetime.utcnow().isoformat(),
                "rule_count": len(suggestions),
                "validators": list(set(validators)),
            }

        # Generate content based on format
        if format == RuleExportFormat.YAML:
            content = yaml.dump(rules_dict, default_flow_style=False, sort_keys=False)
            filename = f"{rule_name}.yaml"
        elif format == RuleExportFormat.JSON:
            content = json.dumps(rules_dict, indent=2)
            filename = f"{rule_name}.json"
        elif format == RuleExportFormat.TOML:
            content = self._to_toml(rules_dict)
            filename = f"{rule_name}.toml"
        elif format == RuleExportFormat.PYTHON:
            content = self._to_python(rules_dict, rule_name, description)
            filename = f"{rule_name}.py"
        else:
            content = yaml.dump(rules_dict, default_flow_style=False)
            filename = f"{rule_name}.yaml"

        return ExportRulesResponse(
            content=content,
            format=format,
            filename=filename,
            rule_count=len(suggestions),
            generated_at=datetime.utcnow(),
        )
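
For the YAML branch, the export is a direct yaml.dump of the rules dict. A sketch of the output shape with metadata omitted, assuming PyYAML (imported as yaml by the module above) and invented column and validator names:

# Illustrative sketch; invented rules dict, not produced by the module above.
import yaml

rules_dict = {
    "columns": {
        "age": {"range": {"min_value": 0, "max_value": 120}},
        "email": {"notnull": True},
    }
}
print(yaml.dump(rules_dict, default_flow_style=False, sort_keys=False))
# columns:
#   age:
#     range:
#       min_value: 0
#       max_value: 120
#   email:
#     notnull: true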

    def _to_toml(self, rules_dict: dict[str, Any]) -> str:
        """Convert rules to TOML format.

        Args:
            rules_dict: Rules dictionary.

        Returns:
            TOML string.
        """
        try:
            import toml
            return toml.dumps(rules_dict)
        except ImportError:
            # Fallback to simple TOML generation
            lines = []
            if "_metadata" in rules_dict:
                lines.append("[_metadata]")
                for k, v in rules_dict["_metadata"].items():
                    if isinstance(v, str):
                        lines.append(f'{k} = "{v}"')
                    elif isinstance(v, list):
                        lines.append(f'{k} = {json.dumps(v)}')
                    else:
                        lines.append(f"{k} = {v}")
                lines.append("")

            if "columns" in rules_dict:
                for col_name, validators in rules_dict["columns"].items():
                    lines.append(f'[columns."{col_name}"]')
                    for val_name, val_config in validators.items():
                        if isinstance(val_config, dict):
                            lines.append(f"[columns.\"{col_name}\".{val_name}]")
                            for pk, pv in val_config.items():
                                if isinstance(pv, str):
                                    lines.append(f'{pk} = "{pv}"')
                                else:
                                    lines.append(f"{pk} = {pv}")
                        else:
                            lines.append(f"{val_name} = {str(val_config).lower()}")
                    lines.append("")

            return "\n".join(lines)

    def _to_python(
        self,
        rules_dict: dict[str, Any],
        rule_name: str,
        description: str | None,
    ) -> str:
        """Convert rules to Python code.

        Args:
            rules_dict: Rules dictionary.
            rule_name: Name for the validation suite.
            description: Optional description.

        Returns:
            Python code string.
        """
        lines = [
            '"""Auto-generated validation rules.',
            "",
            f"Name: {rule_name}",
        ]
        if description:
            lines.append(f"Description: {description}")
        lines.extend([
            '"""',
            "",
            "from truthound import th",
            "",
            "",
            f"def validate_{rule_name.replace('-', '_').replace(' ', '_')}(df):",
            f'    """Run auto-generated validation rules."""',
            "    result = th.check(",
            "        df,",
            "        validators=[",
        ])

        # Add validators
        for col_name, validators in rules_dict.get("columns", {}).items():
            for val_name, val_config in validators.items():
                if isinstance(val_config, dict):
                    params_str = ", ".join(
                        f"{k}={repr(v)}" for k, v in val_config.items()
                    )
                    lines.append(f'            ("{col_name}", "{val_name}", {{{params_str}}}),')
                else:
                    lines.append(f'            ("{col_name}", "{val_name}"),')

        lines.extend([
            "        ],",
            "    )",
            "    return result",
            "",
            "",
            'if __name__ == "__main__":',
            "    import pandas as pd",
            "    # df = pd.read_csv('your_data.csv')",
            f"    # result = validate_{rule_name.replace('-', '_').replace(' ', '_')}(df)",
            "    # print(result)",
            "",
        ])

        return "\n".join(lines)

    @staticmethod
    def get_presets() -> PresetsResponse:
        """Get available presets and configuration options.

        Returns:
            Presets response.
        """
        return PresetsResponse(
            presets=list(PRESET_DEFINITIONS.values()),
            strictness_levels=[level.value for level in StrictnessLevel],
            categories=[cat.value for cat in RuleCategory],
            export_formats=[fmt.value for fmt in RuleExportFormat],
        )