truthound-dashboard 1.3.1__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. truthound_dashboard/api/alerts.py +258 -0
  2. truthound_dashboard/api/anomaly.py +1302 -0
  3. truthound_dashboard/api/cross_alerts.py +352 -0
  4. truthound_dashboard/api/deps.py +143 -0
  5. truthound_dashboard/api/drift_monitor.py +540 -0
  6. truthound_dashboard/api/lineage.py +1151 -0
  7. truthound_dashboard/api/maintenance.py +363 -0
  8. truthound_dashboard/api/middleware.py +373 -1
  9. truthound_dashboard/api/model_monitoring.py +805 -0
  10. truthound_dashboard/api/notifications_advanced.py +2452 -0
  11. truthound_dashboard/api/plugins.py +2096 -0
  12. truthound_dashboard/api/profile.py +211 -14
  13. truthound_dashboard/api/reports.py +853 -0
  14. truthound_dashboard/api/router.py +147 -0
  15. truthound_dashboard/api/rule_suggestions.py +310 -0
  16. truthound_dashboard/api/schema_evolution.py +231 -0
  17. truthound_dashboard/api/sources.py +47 -3
  18. truthound_dashboard/api/triggers.py +190 -0
  19. truthound_dashboard/api/validations.py +13 -0
  20. truthound_dashboard/api/validators.py +333 -4
  21. truthound_dashboard/api/versioning.py +309 -0
  22. truthound_dashboard/api/websocket.py +301 -0
  23. truthound_dashboard/core/__init__.py +27 -0
  24. truthound_dashboard/core/anomaly.py +1395 -0
  25. truthound_dashboard/core/anomaly_explainer.py +633 -0
  26. truthound_dashboard/core/cache.py +206 -0
  27. truthound_dashboard/core/cached_services.py +422 -0
  28. truthound_dashboard/core/charts.py +352 -0
  29. truthound_dashboard/core/connections.py +1069 -42
  30. truthound_dashboard/core/cross_alerts.py +837 -0
  31. truthound_dashboard/core/drift_monitor.py +1477 -0
  32. truthound_dashboard/core/drift_sampling.py +669 -0
  33. truthound_dashboard/core/i18n/__init__.py +42 -0
  34. truthound_dashboard/core/i18n/detector.py +173 -0
  35. truthound_dashboard/core/i18n/messages.py +564 -0
  36. truthound_dashboard/core/lineage.py +971 -0
  37. truthound_dashboard/core/maintenance.py +443 -5
  38. truthound_dashboard/core/model_monitoring.py +1043 -0
  39. truthound_dashboard/core/notifications/channels.py +1020 -1
  40. truthound_dashboard/core/notifications/deduplication/__init__.py +143 -0
  41. truthound_dashboard/core/notifications/deduplication/policies.py +274 -0
  42. truthound_dashboard/core/notifications/deduplication/service.py +400 -0
  43. truthound_dashboard/core/notifications/deduplication/stores.py +2365 -0
  44. truthound_dashboard/core/notifications/deduplication/strategies.py +422 -0
  45. truthound_dashboard/core/notifications/dispatcher.py +43 -0
  46. truthound_dashboard/core/notifications/escalation/__init__.py +149 -0
  47. truthound_dashboard/core/notifications/escalation/backends.py +1384 -0
  48. truthound_dashboard/core/notifications/escalation/engine.py +429 -0
  49. truthound_dashboard/core/notifications/escalation/models.py +336 -0
  50. truthound_dashboard/core/notifications/escalation/scheduler.py +1187 -0
  51. truthound_dashboard/core/notifications/escalation/state_machine.py +330 -0
  52. truthound_dashboard/core/notifications/escalation/stores.py +2896 -0
  53. truthound_dashboard/core/notifications/events.py +49 -0
  54. truthound_dashboard/core/notifications/metrics/__init__.py +115 -0
  55. truthound_dashboard/core/notifications/metrics/base.py +528 -0
  56. truthound_dashboard/core/notifications/metrics/collectors.py +583 -0
  57. truthound_dashboard/core/notifications/routing/__init__.py +169 -0
  58. truthound_dashboard/core/notifications/routing/combinators.py +184 -0
  59. truthound_dashboard/core/notifications/routing/config.py +375 -0
  60. truthound_dashboard/core/notifications/routing/config_parser.py +867 -0
  61. truthound_dashboard/core/notifications/routing/engine.py +382 -0
  62. truthound_dashboard/core/notifications/routing/expression_engine.py +1269 -0
  63. truthound_dashboard/core/notifications/routing/jinja2_engine.py +774 -0
  64. truthound_dashboard/core/notifications/routing/rules.py +625 -0
  65. truthound_dashboard/core/notifications/routing/validator.py +678 -0
  66. truthound_dashboard/core/notifications/service.py +2 -0
  67. truthound_dashboard/core/notifications/stats_aggregator.py +850 -0
  68. truthound_dashboard/core/notifications/throttling/__init__.py +83 -0
  69. truthound_dashboard/core/notifications/throttling/builder.py +311 -0
  70. truthound_dashboard/core/notifications/throttling/stores.py +1859 -0
  71. truthound_dashboard/core/notifications/throttling/throttlers.py +633 -0
  72. truthound_dashboard/core/openlineage.py +1028 -0
  73. truthound_dashboard/core/plugins/__init__.py +39 -0
  74. truthound_dashboard/core/plugins/docs/__init__.py +39 -0
  75. truthound_dashboard/core/plugins/docs/extractor.py +703 -0
  76. truthound_dashboard/core/plugins/docs/renderers.py +804 -0
  77. truthound_dashboard/core/plugins/hooks/__init__.py +63 -0
  78. truthound_dashboard/core/plugins/hooks/decorators.py +367 -0
  79. truthound_dashboard/core/plugins/hooks/manager.py +403 -0
  80. truthound_dashboard/core/plugins/hooks/protocols.py +265 -0
  81. truthound_dashboard/core/plugins/lifecycle/__init__.py +41 -0
  82. truthound_dashboard/core/plugins/lifecycle/hot_reload.py +584 -0
  83. truthound_dashboard/core/plugins/lifecycle/machine.py +419 -0
  84. truthound_dashboard/core/plugins/lifecycle/states.py +266 -0
  85. truthound_dashboard/core/plugins/loader.py +504 -0
  86. truthound_dashboard/core/plugins/registry.py +810 -0
  87. truthound_dashboard/core/plugins/reporter_executor.py +588 -0
  88. truthound_dashboard/core/plugins/sandbox/__init__.py +59 -0
  89. truthound_dashboard/core/plugins/sandbox/code_validator.py +243 -0
  90. truthound_dashboard/core/plugins/sandbox/engines.py +770 -0
  91. truthound_dashboard/core/plugins/sandbox/protocols.py +194 -0
  92. truthound_dashboard/core/plugins/sandbox.py +617 -0
  93. truthound_dashboard/core/plugins/security/__init__.py +68 -0
  94. truthound_dashboard/core/plugins/security/analyzer.py +535 -0
  95. truthound_dashboard/core/plugins/security/policies.py +311 -0
  96. truthound_dashboard/core/plugins/security/protocols.py +296 -0
  97. truthound_dashboard/core/plugins/security/signing.py +842 -0
  98. truthound_dashboard/core/plugins/security.py +446 -0
  99. truthound_dashboard/core/plugins/validator_executor.py +401 -0
  100. truthound_dashboard/core/plugins/versioning/__init__.py +51 -0
  101. truthound_dashboard/core/plugins/versioning/constraints.py +377 -0
  102. truthound_dashboard/core/plugins/versioning/dependencies.py +541 -0
  103. truthound_dashboard/core/plugins/versioning/semver.py +266 -0
  104. truthound_dashboard/core/profile_comparison.py +601 -0
  105. truthound_dashboard/core/report_history.py +570 -0
  106. truthound_dashboard/core/reporters/__init__.py +57 -0
  107. truthound_dashboard/core/reporters/base.py +296 -0
  108. truthound_dashboard/core/reporters/csv_reporter.py +155 -0
  109. truthound_dashboard/core/reporters/html_reporter.py +598 -0
  110. truthound_dashboard/core/reporters/i18n/__init__.py +65 -0
  111. truthound_dashboard/core/reporters/i18n/base.py +494 -0
  112. truthound_dashboard/core/reporters/i18n/catalogs.py +930 -0
  113. truthound_dashboard/core/reporters/json_reporter.py +160 -0
  114. truthound_dashboard/core/reporters/junit_reporter.py +233 -0
  115. truthound_dashboard/core/reporters/markdown_reporter.py +207 -0
  116. truthound_dashboard/core/reporters/pdf_reporter.py +209 -0
  117. truthound_dashboard/core/reporters/registry.py +272 -0
  118. truthound_dashboard/core/rule_generator.py +2088 -0
  119. truthound_dashboard/core/scheduler.py +822 -12
  120. truthound_dashboard/core/schema_evolution.py +858 -0
  121. truthound_dashboard/core/services.py +152 -9
  122. truthound_dashboard/core/statistics.py +718 -0
  123. truthound_dashboard/core/streaming_anomaly.py +883 -0
  124. truthound_dashboard/core/triggers/__init__.py +45 -0
  125. truthound_dashboard/core/triggers/base.py +226 -0
  126. truthound_dashboard/core/triggers/evaluators.py +609 -0
  127. truthound_dashboard/core/triggers/factory.py +363 -0
  128. truthound_dashboard/core/unified_alerts.py +870 -0
  129. truthound_dashboard/core/validation_limits.py +509 -0
  130. truthound_dashboard/core/versioning.py +709 -0
  131. truthound_dashboard/core/websocket/__init__.py +59 -0
  132. truthound_dashboard/core/websocket/manager.py +512 -0
  133. truthound_dashboard/core/websocket/messages.py +130 -0
  134. truthound_dashboard/db/__init__.py +30 -0
  135. truthound_dashboard/db/models.py +3375 -3
  136. truthound_dashboard/main.py +22 -0
  137. truthound_dashboard/schemas/__init__.py +396 -1
  138. truthound_dashboard/schemas/anomaly.py +1258 -0
  139. truthound_dashboard/schemas/base.py +4 -0
  140. truthound_dashboard/schemas/cross_alerts.py +334 -0
  141. truthound_dashboard/schemas/drift_monitor.py +890 -0
  142. truthound_dashboard/schemas/lineage.py +428 -0
  143. truthound_dashboard/schemas/maintenance.py +154 -0
  144. truthound_dashboard/schemas/model_monitoring.py +374 -0
  145. truthound_dashboard/schemas/notifications_advanced.py +1363 -0
  146. truthound_dashboard/schemas/openlineage.py +704 -0
  147. truthound_dashboard/schemas/plugins.py +1293 -0
  148. truthound_dashboard/schemas/profile.py +420 -34
  149. truthound_dashboard/schemas/profile_comparison.py +242 -0
  150. truthound_dashboard/schemas/reports.py +285 -0
  151. truthound_dashboard/schemas/rule_suggestion.py +434 -0
  152. truthound_dashboard/schemas/schema_evolution.py +164 -0
  153. truthound_dashboard/schemas/source.py +117 -2
  154. truthound_dashboard/schemas/triggers.py +511 -0
  155. truthound_dashboard/schemas/unified_alerts.py +223 -0
  156. truthound_dashboard/schemas/validation.py +25 -1
  157. truthound_dashboard/schemas/validators/__init__.py +11 -0
  158. truthound_dashboard/schemas/validators/base.py +151 -0
  159. truthound_dashboard/schemas/versioning.py +152 -0
  160. truthound_dashboard/static/index.html +2 -2
  161. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/METADATA +147 -23
  162. truthound_dashboard-1.4.1.dist-info/RECORD +239 -0
  163. truthound_dashboard/static/assets/index-BZG20KuF.js +0 -586
  164. truthound_dashboard/static/assets/index-D_HyZ3pb.css +0 -1
  165. truthound_dashboard/static/assets/unmerged_dictionaries-CtpqQBm0.js +0 -1
  166. truthound_dashboard-1.3.1.dist-info/RECORD +0 -110
  167. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/WHEEL +0 -0
  168. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/entry_points.txt +0 -0
  169. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,2088 @@
1
+ """Rule generation service.
2
+
3
+ This module provides functionality for automatically generating
4
+ validation rules based on profile data analysis.
5
+
6
+ Features:
7
+ - Multiple strictness levels (loose, medium, strict)
8
+ - Preset templates for different use cases
9
+ - Category-based filtering
10
+ - Multiple export formats (YAML, JSON, Python, TOML)
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ import re
17
+ from datetime import datetime
18
+ from typing import Any
19
+
20
+ import yaml
21
+ from sqlalchemy.ext.asyncio import AsyncSession
22
+
23
+ from truthound_dashboard.db import Profile, Rule, Schema, Source
24
+ from truthound_dashboard.core.services import ProfileRepository, RuleRepository
25
+ from truthound_dashboard.schemas.rule_suggestion import (
26
+ ApplyRulesResponse,
27
+ CrossColumnRuleSuggestion,
28
+ CrossColumnRuleType,
29
+ ExportRulesResponse,
30
+ PresetInfo,
31
+ PresetsResponse,
32
+ RuleCategory,
33
+ RuleExportFormat,
34
+ RulePreset,
35
+ RuleSuggestionResponse,
36
+ StrictnessLevel,
37
+ SuggestedRule,
38
+ )
39
+
40
+
41
# Regex used to heuristically recognise email-shaped values.
# NOTE(review): intentionally permissive — it also accepts some technically
# invalid addresses (e.g. a trailing dot in the domain part); confirm this
# matches the Email validator's own pattern.
EMAIL_PATTERN = re.compile(
    r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
)

# Regex fragments for commonly seen date layouts (unanchored).
DATE_PATTERNS = [
    r"\d{4}-\d{2}-\d{2}",  # YYYY-MM-DD (ISO 8601)
    r"\d{2}/\d{2}/\d{4}",  # MM/DD/YYYY
    r"\d{2}-\d{2}-\d{4}",  # DD-MM-YYYY
]
52
+
53
+
54
+ # =============================================================================
55
+ # Statistical Confidence Calculation Helpers
56
+ # =============================================================================
57
+
58
+
59
def calculate_pattern_confidence(
    match_rate: float,
    sample_size: int,
    min_sample: int = 100,
    base_confidence: float = 0.5,
) -> float:
    """Compute a confidence score for a pattern-based rule suggestion.

    The score starts from ``base_confidence`` and grows with both the
    observed match rate (weight 0.4) and the relative sample size
    (weight 0.1, saturating once ``min_sample`` rows were seen).

    Args:
        match_rate: Fraction of values matching the pattern (0.0 to 1.0).
        sample_size: Number of values that were inspected.
        min_sample: Sample size at which the size bonus saturates.
        base_confidence: Score returned when no samples are available.

    Returns:
        Confidence value clamped to the [0.0, 1.0] range.
    """
    if not sample_size:
        # No data at all: fall back to the neutral prior.
        return base_confidence

    # Saturating size factor: samples beyond min_sample add nothing.
    size_weight = sample_size / min_sample
    if size_weight > 1.0:
        size_weight = 1.0

    # Prior + match-rate evidence + small-sample penalty (as a bonus
    # that only reaches 0.1 at full sample size).
    score = base_confidence + match_rate * 0.4 + size_weight * 0.1

    # Keep the result inside the valid confidence range.
    if score < 0.0:
        return 0.0
    return score if score <= 1.0 else 1.0
98
+
99
+
100
def calculate_uniqueness_confidence(
    unique_ratio: float,
    total_count: int,
    cardinality: int,
) -> float:
    """Score how confident we are that a column should be unique.

    A base score is read off a small ladder of uniqueness ratios, then a
    bonus of up to 0.1 is added for larger datasets, since the same
    ratio is more statistically significant on more rows.

    Args:
        unique_ratio: Fraction of values that are distinct (0.0 to 1.0).
        total_count: Number of rows profiled.
        cardinality: Number of distinct values. NOTE(review): currently
            unused; kept for interface stability.

    Returns:
        Confidence score; 0.5 when there are no rows.
    """
    if total_count == 0:
        return 0.5

    # Ladder of (threshold, base score) pairs, highest threshold first.
    for threshold, base in ((0.99, 0.85), (0.95, 0.75), (0.8, 0.65)):
        if unique_ratio >= threshold:
            break
    else:
        base = 0.5

    # Up to +0.1 extra for datasets of 10k rows or more.
    bonus = total_count / 10000 * 0.1
    if bonus > 0.1:
        bonus = 0.1

    return min(1.0, base + bonus)
132
+
133
+
134
def calculate_correlation_confidence(
    pattern_strength: str,
    column_count: int = 2,
) -> float:
    """Score a correlation-based rule suggestion.

    Args:
        pattern_strength: One of "strong", "medium" or "weak"; any other
            value falls back to a neutral 0.6 base.
        column_count: How many columns participate. Each column beyond
            the second subtracts 0.05, because wider relationships are
            harder to keep valid.

    Returns:
        Confidence score, never below 0.5.
    """
    # Base score per labelled strength; unknown labels get 0.6.
    if pattern_strength == "strong":
        base = 0.85
    elif pattern_strength == "medium":
        base = 0.7
    elif pattern_strength == "weak":
        base = 0.55
    else:
        base = 0.6

    extra_columns = column_count - 2
    penalty = extra_columns * 0.05 if extra_columns > 0 else 0

    score = base - penalty
    # Floor the result so a wide relationship never drops below 0.5.
    return score if score > 0.5 else 0.5
158
+
159
+
160
def extract_sample_violations(
    profile_data: dict[str, Any],
    columns: list[str],
    rule_type: str,
    max_samples: int = 5,
) -> list[dict[str, Any]]:
    """Derive example violation records from profile statistics.

    No raw data is queried: outliers already recorded in the profile are
    reported as potential violations, and for coexistence/dependency
    rules a single aggregate record is emitted per column that contains
    nulls.

    Args:
        profile_data: Per-column profile statistics.
        columns: Column names involved in the rule.
        rule_type: Cross-column rule type identifier.
        max_samples: Cap on the number of records returned.

    Returns:
        At most ``max_samples`` sample violation dicts.
    """
    null_sensitive_rules = ("column_coexistence", "column_dependency")
    samples: list[dict[str, Any]] = []

    for name in columns:
        stats = profile_data.get(name, {})

        # Recorded outliers are the best available violation hints.
        outliers = stats.get("outliers", [])
        if outliers:
            for entry in outliers[:max_samples]:
                samples.append(
                    {
                        "row_index": entry.get("row", 0),
                        "column": name,
                        "value": entry.get("value"),
                        "reason": f"Outlier detected in {name}",
                    }
                )

        # Nulls only matter for rules that need values to be present.
        nulls = stats.get("null_count", 0)
        if nulls > 0 and rule_type in null_sensitive_rules:
            samples.append(
                {
                    "row_index": "multiple",
                    "column": name,
                    "value": None,
                    "reason": f"{nulls} null values in {name}",
                }
            )

    # Final cap: per-column appends may have exceeded the limit.
    return samples[:max_samples]
208
+
209
+
210
+ # =============================================================================
211
+ # Preset Definitions
212
+ # =============================================================================
213
+
214
+
215
# Registry of rule-generation presets. Each entry pairs a preset with its
# display metadata, the strictness level it implies, and the rule
# categories it enables. Served to clients via the presets endpoint
# (presumably through PresetsResponse — verify against the API layer).
PRESET_DEFINITIONS: dict[RulePreset, PresetInfo] = {
    RulePreset.DEFAULT: PresetInfo(
        name=RulePreset.DEFAULT,
        display_name="Default",
        description="General purpose validation rules. Balanced coverage and thresholds.",
        strictness=StrictnessLevel.MEDIUM,
        categories=[
            RuleCategory.SCHEMA,
            RuleCategory.COMPLETENESS,
            RuleCategory.UNIQUENESS,
            RuleCategory.STATISTICS,
        ],
        recommended_for="Most data validation scenarios",
    ),
    RulePreset.STRICT: PresetInfo(
        name=RulePreset.STRICT,
        display_name="Strict",
        description="Tight thresholds for production data. High confidence rules only.",
        strictness=StrictnessLevel.STRICT,
        categories=[
            RuleCategory.SCHEMA,
            RuleCategory.COMPLETENESS,
            RuleCategory.UNIQUENESS,
            RuleCategory.STATISTICS,
            RuleCategory.PATTERN,
        ],
        recommended_for="Production data pipelines, data quality gates",
    ),
    RulePreset.LOOSE: PresetInfo(
        name=RulePreset.LOOSE,
        display_name="Loose",
        description="Permissive thresholds for development/testing.",
        strictness=StrictnessLevel.LOOSE,
        categories=[
            RuleCategory.SCHEMA,
            RuleCategory.COMPLETENESS,
        ],
        recommended_for="Development, testing, exploratory analysis",
    ),
    RulePreset.MINIMAL: PresetInfo(
        name=RulePreset.MINIMAL,
        display_name="Minimal",
        description="Essential rules only. Focus on critical data integrity.",
        strictness=StrictnessLevel.MEDIUM,
        categories=[
            RuleCategory.SCHEMA,
            RuleCategory.COMPLETENESS,
        ],
        recommended_for="Quick validation, minimal overhead",
    ),
    RulePreset.COMPREHENSIVE: PresetInfo(
        name=RulePreset.COMPREHENSIVE,
        display_name="Comprehensive",
        description="All available rules. Maximum validation coverage.",
        strictness=StrictnessLevel.MEDIUM,
        categories=[
            RuleCategory.SCHEMA,
            RuleCategory.COMPLETENESS,
            RuleCategory.UNIQUENESS,
            RuleCategory.STATISTICS,
            RuleCategory.PATTERN,
            RuleCategory.DISTRIBUTION,
        ],
        recommended_for="Full data audit, compliance checks",
    ),
    RulePreset.CI_CD: PresetInfo(
        name=RulePreset.CI_CD,
        display_name="CI/CD",
        description="Optimized for continuous integration. Fast execution, clear failures.",
        strictness=StrictnessLevel.STRICT,
        categories=[
            RuleCategory.SCHEMA,
            RuleCategory.COMPLETENESS,
            RuleCategory.UNIQUENESS,
        ],
        recommended_for="CI/CD pipelines, automated testing",
    ),
    RulePreset.SCHEMA_ONLY: PresetInfo(
        name=RulePreset.SCHEMA_ONLY,
        display_name="Schema Only",
        description="Structure validation only. No statistical checks.",
        strictness=StrictnessLevel.MEDIUM,
        categories=[RuleCategory.SCHEMA],
        recommended_for="Schema drift detection, structure validation",
    ),
    RulePreset.FORMAT_ONLY: PresetInfo(
        name=RulePreset.FORMAT_ONLY,
        display_name="Format Only",
        description="Format and pattern rules only.",
        strictness=StrictnessLevel.MEDIUM,
        categories=[RuleCategory.PATTERN],
        recommended_for="Data format validation, PII detection",
    ),
    RulePreset.CROSS_COLUMN: PresetInfo(
        name=RulePreset.CROSS_COLUMN,
        display_name="Cross-Column",
        description="Focus on cross-column relationships and constraints.",
        strictness=StrictnessLevel.MEDIUM,
        categories=[
            RuleCategory.RELATIONSHIP,
            RuleCategory.MULTI_COLUMN,
            RuleCategory.UNIQUENESS,
        ],
        recommended_for="Data integrity, referential constraints, composite keys",
    ),
    RulePreset.DATA_INTEGRITY: PresetInfo(
        name=RulePreset.DATA_INTEGRITY,
        display_name="Data Integrity",
        description="Comprehensive data integrity validation including cross-column rules.",
        strictness=StrictnessLevel.STRICT,
        categories=[
            RuleCategory.SCHEMA,
            RuleCategory.COMPLETENESS,
            RuleCategory.UNIQUENESS,
            RuleCategory.RELATIONSHIP,
            RuleCategory.MULTI_COLUMN,
        ],
        recommended_for="Database migrations, data warehouse validation",
    ),
}
335
+
336
+
337
+ # Strictness thresholds
338
# Numeric tuning knobs per strictness level:
#   null_threshold   - highest null % for which a tolerant Null rule is
#                      still suggested (used by _suggest_null_rules)
#   unique_threshold - lowest unique % for a tolerant Unique rule
#                      (used by _suggest_uniqueness_rules)
#   range_buffer     - fractional padding added around observed min/max
#                      (used by _suggest_range_rules)
#   min_confidence   - presumably a floor for keeping suggestions; not
#                      referenced in this part of the module — verify.
STRICTNESS_THRESHOLDS = {
    StrictnessLevel.LOOSE: {
        "min_confidence": 0.3,
        "null_threshold": 10.0,
        "unique_threshold": 90.0,
        "range_buffer": 0.2,  # 20% buffer on ranges
    },
    StrictnessLevel.MEDIUM: {
        "min_confidence": 0.5,
        "null_threshold": 5.0,
        "unique_threshold": 95.0,
        "range_buffer": 0.1,  # 10% buffer
    },
    StrictnessLevel.STRICT: {
        "min_confidence": 0.7,
        "null_threshold": 1.0,
        "unique_threshold": 99.0,
        "range_buffer": 0.0,  # No buffer
    },
}
358
+
359
+
360
+ def _parse_percentage(value: str | None) -> float:
361
+ """Parse percentage string to float.
362
+
363
+ Args:
364
+ value: Percentage string like "25.5%".
365
+
366
+ Returns:
367
+ Float value (0.0-100.0).
368
+ """
369
+ if not value:
370
+ return 0.0
371
+ try:
372
+ return float(value.replace("%", ""))
373
+ except (ValueError, AttributeError):
374
+ return 0.0
375
+
376
+
377
+ class RuleGeneratorService:
378
+ """Service for generating validation rules from profile data."""
379
+
380
+ def __init__(self, session: AsyncSession):
381
+ """Initialize service.
382
+
383
+ Args:
384
+ session: Database session.
385
+ """
386
+ self.session = session
387
+ self.profile_repo = ProfileRepository(session)
388
+ self.rule_repo = RuleRepository(session)
389
+
390
+ def _suggest_null_rules(
391
+ self,
392
+ column: dict[str, Any],
393
+ strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
394
+ ) -> list[SuggestedRule]:
395
+ """Suggest null-related validators based on null percentage.
396
+
397
+ Args:
398
+ column: Column profile data.
399
+ strictness: Strictness level for thresholds.
400
+
401
+ Returns:
402
+ List of suggested rules.
403
+ """
404
+ suggestions = []
405
+ col_name = column.get("name", "")
406
+ null_pct = _parse_percentage(column.get("null_pct"))
407
+ thresholds = STRICTNESS_THRESHOLDS[strictness]
408
+ null_threshold = thresholds["null_threshold"]
409
+
410
+ if null_pct == 0.0:
411
+ # Column has no nulls - suggest NotNull
412
+ suggestions.append(
413
+ SuggestedRule(
414
+ column=col_name,
415
+ validator_name="NotNull",
416
+ params={},
417
+ confidence=0.95,
418
+ reason="Column has 0% null values",
419
+ severity_suggestion="high",
420
+ category=RuleCategory.COMPLETENESS,
421
+ )
422
+ )
423
+ elif null_pct < 1.0:
424
+ # Very few nulls - suggest Null with mostly
425
+ mostly = 0.99 if strictness == StrictnessLevel.STRICT else 0.98
426
+ suggestions.append(
427
+ SuggestedRule(
428
+ column=col_name,
429
+ validator_name="Null",
430
+ params={"mostly": mostly},
431
+ confidence=0.85,
432
+ reason=f"Column has only {null_pct}% null values",
433
+ severity_suggestion="medium",
434
+ category=RuleCategory.COMPLETENESS,
435
+ )
436
+ )
437
+ elif null_pct < null_threshold:
438
+ # Some nulls - suggest Null with lower threshold
439
+ mostly = 1 - (null_pct / 100) - 0.01
440
+ suggestions.append(
441
+ SuggestedRule(
442
+ column=col_name,
443
+ validator_name="Null",
444
+ params={"mostly": round(mostly, 2)},
445
+ confidence=0.7,
446
+ reason=f"Column has {null_pct}% null values",
447
+ severity_suggestion="low",
448
+ category=RuleCategory.COMPLETENESS,
449
+ )
450
+ )
451
+
452
+ return suggestions
453
+
454
+ def _suggest_uniqueness_rules(
455
+ self,
456
+ column: dict[str, Any],
457
+ strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
458
+ ) -> list[SuggestedRule]:
459
+ """Suggest uniqueness validators based on unique percentage.
460
+
461
+ Args:
462
+ column: Column profile data.
463
+ strictness: Strictness level for thresholds.
464
+
465
+ Returns:
466
+ List of suggested rules.
467
+ """
468
+ suggestions = []
469
+ col_name = column.get("name", "")
470
+ unique_pct = _parse_percentage(column.get("unique_pct"))
471
+ distinct_count = column.get("distinct_count")
472
+ thresholds = STRICTNESS_THRESHOLDS[strictness]
473
+ unique_threshold = thresholds["unique_threshold"]
474
+
475
+ if unique_pct >= 99.9:
476
+ # Nearly unique - suggest Unique
477
+ suggestions.append(
478
+ SuggestedRule(
479
+ column=col_name,
480
+ validator_name="Unique",
481
+ params={},
482
+ confidence=0.95,
483
+ reason=f"Column has {unique_pct}% unique values (likely primary key)",
484
+ severity_suggestion="high",
485
+ category=RuleCategory.UNIQUENESS,
486
+ )
487
+ )
488
+ elif unique_pct >= unique_threshold:
489
+ # High uniqueness - suggest Unique with tolerance
490
+ mostly = unique_pct / 100
491
+ suggestions.append(
492
+ SuggestedRule(
493
+ column=col_name,
494
+ validator_name="Unique",
495
+ params={"mostly": round(mostly, 2)},
496
+ confidence=0.8,
497
+ reason=f"Column has {unique_pct}% unique values",
498
+ severity_suggestion="medium",
499
+ category=RuleCategory.UNIQUENESS,
500
+ )
501
+ )
502
+ elif unique_pct < 10.0 and distinct_count and distinct_count < 50:
503
+ # Low cardinality - suggest DistinctSet
504
+ buffer = 10 if strictness == StrictnessLevel.LOOSE else 5
505
+ suggestions.append(
506
+ SuggestedRule(
507
+ column=col_name,
508
+ validator_name="DistinctSet",
509
+ params={"max_distinct": distinct_count + buffer},
510
+ confidence=0.75,
511
+ reason=f"Column has low cardinality ({distinct_count} distinct values)",
512
+ severity_suggestion="low",
513
+ category=RuleCategory.DISTRIBUTION,
514
+ )
515
+ )
516
+
517
+ return suggestions
518
+
519
+ def _suggest_range_rules(
520
+ self,
521
+ column: dict[str, Any],
522
+ strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
523
+ ) -> list[SuggestedRule]:
524
+ """Suggest range validators based on min/max values.
525
+
526
+ Args:
527
+ column: Column profile data.
528
+ strictness: Strictness level for thresholds.
529
+
530
+ Returns:
531
+ List of suggested rules.
532
+ """
533
+ suggestions = []
534
+ col_name = column.get("name", "")
535
+ dtype = column.get("dtype", "").lower()
536
+ min_val = column.get("min")
537
+ max_val = column.get("max")
538
+ thresholds = STRICTNESS_THRESHOLDS[strictness]
539
+ buffer = thresholds["range_buffer"]
540
+
541
+ # Only suggest for numeric types
542
+ if dtype not in ("int64", "int32", "float64", "float32", "number", "integer"):
543
+ return suggestions
544
+
545
+ if min_val is not None and max_val is not None:
546
+ try:
547
+ min_num = float(min_val)
548
+ max_num = float(max_val)
549
+
550
+ # Only suggest if range seems reasonable
551
+ if max_num > min_num:
552
+ # Apply buffer to range
553
+ range_size = max_num - min_num
554
+ buffered_min = min_num - (range_size * buffer)
555
+ buffered_max = max_num + (range_size * buffer)
556
+
557
+ suggestions.append(
558
+ SuggestedRule(
559
+ column=col_name,
560
+ validator_name="Range",
561
+ params={
562
+ "min_value": round(buffered_min, 2),
563
+ "max_value": round(buffered_max, 2),
564
+ },
565
+ confidence=0.7,
566
+ reason=f"Column values range from {min_num} to {max_num}",
567
+ severity_suggestion="medium",
568
+ category=RuleCategory.STATISTICS,
569
+ )
570
+ )
571
+ except (ValueError, TypeError):
572
+ pass
573
+
574
+ return suggestions
575
+
576
+ def _suggest_type_rules(
577
+ self,
578
+ column: dict[str, Any],
579
+ schema_column: dict[str, Any] | None = None,
580
+ strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
581
+ ) -> list[SuggestedRule]:
582
+ """Suggest type and pattern validators based on data type.
583
+
584
+ Args:
585
+ column: Column profile data.
586
+ schema_column: Optional schema column definition.
587
+ strictness: Strictness level.
588
+
589
+ Returns:
590
+ List of suggested rules.
591
+ """
592
+ suggestions = []
593
+ col_name = column.get("name", "").lower()
594
+ original_name = column.get("name", "")
595
+ dtype = column.get("dtype", "").lower()
596
+
597
+ # Email detection by column name
598
+ if any(hint in col_name for hint in ("email", "e_mail", "mail")):
599
+ suggestions.append(
600
+ SuggestedRule(
601
+ column=original_name,
602
+ validator_name="Email",
603
+ params={},
604
+ confidence=0.85,
605
+ reason="Column name suggests email content",
606
+ severity_suggestion="medium",
607
+ category=RuleCategory.PATTERN,
608
+ )
609
+ )
610
+
611
+ # Phone detection by column name
612
+ if any(hint in col_name for hint in ("phone", "tel", "mobile", "cell")):
613
+ suggestions.append(
614
+ SuggestedRule(
615
+ column=original_name,
616
+ validator_name="Phone",
617
+ params={},
618
+ confidence=0.75,
619
+ reason="Column name suggests phone number",
620
+ severity_suggestion="low",
621
+ category=RuleCategory.PATTERN,
622
+ )
623
+ )
624
+
625
+ # URL detection by column name
626
+ if any(hint in col_name for hint in ("url", "link", "website", "href")):
627
+ suggestions.append(
628
+ SuggestedRule(
629
+ column=original_name,
630
+ validator_name="URL",
631
+ params={},
632
+ confidence=0.8,
633
+ reason="Column name suggests URL content",
634
+ severity_suggestion="low",
635
+ category=RuleCategory.PATTERN,
636
+ )
637
+ )
638
+
639
+ # Date/datetime type detection
640
+ if dtype in ("datetime64", "date", "timestamp"):
641
+ suggestions.append(
642
+ SuggestedRule(
643
+ column=original_name,
644
+ validator_name="DateParseable",
645
+ params={},
646
+ confidence=0.9,
647
+ reason=f"Column has {dtype} data type",
648
+ severity_suggestion="medium",
649
+ category=RuleCategory.SCHEMA,
650
+ )
651
+ )
652
+
653
+ # Positive number detection for common column names
654
+ if dtype in ("int64", "int32", "float64", "float32"):
655
+ positive_hints = ("id", "count", "quantity", "amount", "price", "age")
656
+ if any(col_name.endswith(hint) or col_name == hint for hint in positive_hints):
657
+ min_val = column.get("min")
658
+ if min_val is not None:
659
+ try:
660
+ if float(min_val) >= 0:
661
+ suggestions.append(
662
+ SuggestedRule(
663
+ column=original_name,
664
+ validator_name="Positive",
665
+ params={},
666
+ confidence=0.75,
667
+ reason=f"Column name suggests positive values (min={min_val})",
668
+ severity_suggestion="low",
669
+ category=RuleCategory.STATISTICS,
670
+ )
671
+ )
672
+ except (ValueError, TypeError):
673
+ pass
674
+
675
+ return suggestions
676
+
677
+ def _suggest_statistical_rules(
678
+ self,
679
+ column: dict[str, Any],
680
+ strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
681
+ ) -> list[SuggestedRule]:
682
+ """Suggest statistical validators based on distribution.
683
+
684
+ Args:
685
+ column: Column profile data.
686
+ strictness: Strictness level.
687
+
688
+ Returns:
689
+ List of suggested rules.
690
+ """
691
+ suggestions = []
692
+ col_name = column.get("name", "")
693
+ mean = column.get("mean")
694
+ std = column.get("std")
695
+
696
+ # Suggest Z-score based outlier detection if we have distribution data
697
+ if mean is not None and std is not None and std > 0:
698
+ # Adjust threshold based on strictness
699
+ threshold = {
700
+ StrictnessLevel.LOOSE: 4.0,
701
+ StrictnessLevel.MEDIUM: 3.0,
702
+ StrictnessLevel.STRICT: 2.5,
703
+ }[strictness]
704
+
705
+ suggestions.append(
706
+ SuggestedRule(
707
+ column=col_name,
708
+ validator_name="ZScore",
709
+ params={"threshold": threshold},
710
+ confidence=0.6,
711
+ reason=f"Column has mean={mean:.2f}, std={std:.2f}",
712
+ severity_suggestion="low",
713
+ category=RuleCategory.STATISTICS,
714
+ )
715
+ )
716
+
717
+ return suggestions
718
+
719
+ # =============================================================================
720
+ # Cross-Column Rule Suggestion Methods
721
+ # =============================================================================
722
+
723
+ def _suggest_composite_key_rules(
724
+ self,
725
+ columns: list[dict[str, Any]],
726
+ strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
727
+ ) -> list[CrossColumnRuleSuggestion]:
728
+ """Suggest composite key (multi-column uniqueness) rules.
729
+
730
+ Analyzes column combinations to detect potential composite keys
731
+ based on uniqueness patterns.
732
+
733
+ Args:
734
+ columns: List of column profile data.
735
+ strictness: Strictness level.
736
+
737
+ Returns:
738
+ List of cross-column suggestions.
739
+ """
740
+ suggestions = []
741
+
742
+ # Find columns that might form composite keys
743
+ # Look for ID-like columns or columns with moderate cardinality
744
+ id_columns = []
745
+ categorical_columns = []
746
+
747
+ for col in columns:
748
+ col_name = col.get("name", "")
749
+ unique_pct = _parse_percentage(col.get("unique_pct"))
750
+ distinct_count = col.get("distinct_count", 0)
751
+
752
+ # ID-like columns (high but not 100% uniqueness)
753
+ if 50 <= unique_pct < 99.9 and any(
754
+ hint in col_name.lower()
755
+ for hint in ("id", "key", "code", "num", "ref")
756
+ ):
757
+ id_columns.append(col_name)
758
+
759
+ # Categorical columns with moderate cardinality
760
+ elif distinct_count and 2 < distinct_count < 100:
761
+ categorical_columns.append(col_name)
762
+
763
+ # Suggest composite keys from ID column pairs
764
+ if len(id_columns) >= 2:
765
+ for i in range(len(id_columns)):
766
+ for j in range(i + 1, min(i + 3, len(id_columns))):
767
+ cols = [id_columns[i], id_columns[j]]
768
+ suggestions.append(
769
+ CrossColumnRuleSuggestion(
770
+ rule_type=CrossColumnRuleType.COMPOSITE_KEY,
771
+ columns=cols,
772
+ validator_name="MultiColumnUnique",
773
+ params={"columns": cols},
774
+ confidence=0.75,
775
+ reason=f"Columns {cols[0]} and {cols[1]} may form a composite key",
776
+ severity_suggestion="high",
777
+ evidence={
778
+ "pattern": "id_column_combination",
779
+ "columns": cols,
780
+ },
781
+ )
782
+ )
783
+
784
+ # Suggest composite keys from ID + categorical combinations
785
+ for id_col in id_columns[:2]: # Limit to avoid explosion
786
+ for cat_col in categorical_columns[:3]:
787
+ suggestions.append(
788
+ CrossColumnRuleSuggestion(
789
+ rule_type=CrossColumnRuleType.COMPOSITE_KEY,
790
+ columns=[id_col, cat_col],
791
+ validator_name="MultiColumnUnique",
792
+ params={"columns": [id_col, cat_col]},
793
+ confidence=0.65,
794
+ reason=f"{id_col} combined with {cat_col} may form a natural key",
795
+ severity_suggestion="medium",
796
+ evidence={
797
+ "pattern": "id_categorical_combination",
798
+ },
799
+ )
800
+ )
801
+
802
+ return suggestions
803
+
804
+ def _suggest_comparison_rules(
805
+ self,
806
+ columns: list[dict[str, Any]],
807
+ strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
808
+ ) -> list[CrossColumnRuleSuggestion]:
809
+ """Suggest column comparison rules (e.g., end_date > start_date).
810
+
811
+ Args:
812
+ columns: List of column profile data.
813
+ strictness: Strictness level.
814
+
815
+ Returns:
816
+ List of cross-column suggestions.
817
+ """
818
+ suggestions = []
819
+ col_map = {col.get("name", ""): col for col in columns}
820
+
821
+ # Common comparison patterns
822
+ date_pairs = [
823
+ ("start_date", "end_date", ">"),
824
+ ("created_at", "updated_at", "<="),
825
+ ("birth_date", "death_date", "<="),
826
+ ("hire_date", "termination_date", "<="),
827
+ ("order_date", "ship_date", "<="),
828
+ ("start_time", "end_time", "<"),
829
+ ]
830
+
831
+ numeric_pairs = [
832
+ ("min_value", "max_value", "<="),
833
+ ("min_price", "max_price", "<="),
834
+ ("min_quantity", "max_quantity", "<="),
835
+ ("low", "high", "<="),
836
+ ("floor", "ceiling", "<="),
837
+ ("cost", "price", "<="),
838
+ ]
839
+
840
+ # Check date comparison patterns
841
+ for start_hint, end_hint, operator in date_pairs:
842
+ start_cols = [
843
+ c for c in col_map
844
+ if start_hint in c.lower() or c.lower().endswith("_start")
845
+ ]
846
+ end_cols = [
847
+ c for c in col_map
848
+ if end_hint in c.lower() or c.lower().endswith("_end")
849
+ ]
850
+
851
+ for start_col in start_cols:
852
+ for end_col in end_cols:
853
+ # Avoid matching same column
854
+ if start_col == end_col:
855
+ continue
856
+ # Check if they share a common prefix/suffix
857
+ start_base = start_col.replace("_start", "").replace("start_", "")
858
+ end_base = end_col.replace("_end", "").replace("end_", "")
859
+ base_match = (
860
+ start_base.lower() == end_base.lower()
861
+ or start_base.lower().replace("date", "") == end_base.lower().replace("date", "")
862
+ )
863
+
864
+ confidence = 0.85 if base_match else 0.7
865
+
866
+ suggestions.append(
867
+ CrossColumnRuleSuggestion(
868
+ rule_type=CrossColumnRuleType.COLUMN_COMPARISON,
869
+ columns=[end_col, start_col],
870
+ validator_name="ColumnComparison",
871
+ params={
872
+ "column_a": end_col,
873
+ "column_b": start_col,
874
+ "operator": operator,
875
+ },
876
+ confidence=confidence,
877
+ reason=f"{end_col} should be {operator} {start_col}",
878
+ severity_suggestion="high" if confidence >= 0.8 else "medium",
879
+ evidence={
880
+ "pattern": "date_range",
881
+ "base_match": base_match,
882
+ },
883
+ )
884
+ )
885
+
886
+ # Check numeric comparison patterns
887
+ for min_hint, max_hint, operator in numeric_pairs:
888
+ min_cols = [c for c in col_map if min_hint in c.lower()]
889
+ max_cols = [c for c in col_map if max_hint in c.lower()]
890
+
891
+ for min_col in min_cols:
892
+ for max_col in max_cols:
893
+ if min_col == max_col:
894
+ continue
895
+
896
+ suggestions.append(
897
+ CrossColumnRuleSuggestion(
898
+ rule_type=CrossColumnRuleType.COLUMN_COMPARISON,
899
+ columns=[max_col, min_col],
900
+ validator_name="ColumnComparison",
901
+ params={
902
+ "column_a": max_col,
903
+ "column_b": min_col,
904
+ "operator": ">=",
905
+ },
906
+ confidence=0.8,
907
+ reason=f"{max_col} should be >= {min_col}",
908
+ severity_suggestion="high",
909
+ evidence={
910
+ "pattern": "numeric_range",
911
+ },
912
+ )
913
+ )
914
+
915
+ return suggestions
916
+
917
+ def _suggest_arithmetic_rules(
918
+ self,
919
+ columns: list[dict[str, Any]],
920
+ strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
921
+ ) -> list[CrossColumnRuleSuggestion]:
922
+ """Suggest arithmetic relationship rules (sum, product, etc.).
923
+
924
+ Args:
925
+ columns: List of column profile data.
926
+ strictness: Strictness level.
927
+
928
+ Returns:
929
+ List of cross-column suggestions.
930
+ """
931
+ suggestions = []
932
+ col_map = {col.get("name", "").lower(): col.get("name", "") for col in columns}
933
+ numeric_cols = [
934
+ col.get("name", "") for col in columns
935
+ if col.get("dtype", "").lower() in ("int64", "int32", "float64", "float32", "number", "integer")
936
+ ]
937
+
938
+ # Common sum patterns: subtotal + tax + shipping = total
939
+ sum_patterns = [
940
+ (["subtotal", "tax", "shipping"], "total", "Order total calculation"),
941
+ (["subtotal", "tax"], "total", "Subtotal + tax = total"),
942
+ (["quantity", "unit_price"], None, "Quantity * unit_price"), # Product
943
+ (["hours", "rate"], None, "Hours * rate"), # Product
944
+ (["principal", "interest"], "total_amount", "Principal + interest"),
945
+ (["base_salary", "bonus"], "total_compensation", "Salary + bonus"),
946
+ ]
947
+
948
+ for pattern_cols, result_col, description in sum_patterns:
949
+ matched_cols = []
950
+ for p in pattern_cols:
951
+ for col_lower, col_name in col_map.items():
952
+ if p in col_lower:
953
+ matched_cols.append(col_name)
954
+ break
955
+
956
+ if len(matched_cols) >= 2:
957
+ # Check for result column
958
+ result_found = None
959
+ if result_col:
960
+ for col_lower, col_name in col_map.items():
961
+ if result_col in col_lower:
962
+ result_found = col_name
963
+ break
964
+
965
+ if result_found:
966
+ suggestions.append(
967
+ CrossColumnRuleSuggestion(
968
+ rule_type=CrossColumnRuleType.COLUMN_SUM,
969
+ columns=[*matched_cols, result_found],
970
+ validator_name="ColumnSum",
971
+ params={
972
+ "columns": matched_cols,
973
+ "target_column": result_found,
974
+ "tolerance": 0.01,
975
+ },
976
+ confidence=0.75,
977
+ reason=f"Sum of {', '.join(matched_cols)} should equal {result_found}",
978
+ severity_suggestion="high",
979
+ evidence={
980
+ "pattern": "arithmetic_sum",
981
+ "description": description,
982
+ },
983
+ )
984
+ )
985
+
986
+ # Percentage/ratio patterns
987
+ percentage_patterns = [
988
+ ("percentage", "total", "part", "Percentage calculation"),
989
+ ("rate", "amount", "base", "Rate calculation"),
990
+ ("discount_pct", "discount", "subtotal", "Discount percentage"),
991
+ ]
992
+
993
+ return suggestions
994
+
995
+ def _suggest_correlation_rules(
996
+ self,
997
+ columns: list[dict[str, Any]],
998
+ strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
999
+ ) -> list[CrossColumnRuleSuggestion]:
1000
+ """Suggest column correlation rules for numeric columns.
1001
+
1002
+ Analyzes profile data to identify potentially correlated numeric columns
1003
+ based on naming patterns and statistical properties.
1004
+
1005
+ Args:
1006
+ columns: List of column profile data.
1007
+ strictness: Strictness level.
1008
+
1009
+ Returns:
1010
+ List of cross-column suggestions.
1011
+ """
1012
+ suggestions = []
1013
+
1014
+ # Filter numeric columns only
1015
+ numeric_cols = [
1016
+ col for col in columns
1017
+ if col.get("dtype", "").lower() in (
1018
+ "int64", "int32", "float64", "float32", "number", "integer", "float"
1019
+ )
1020
+ ]
1021
+
1022
+ if len(numeric_cols) < 2:
1023
+ return suggestions
1024
+
1025
+ # Common correlation patterns based on naming conventions
1026
+ correlation_patterns = [
1027
+ # High positive correlation expected
1028
+ (["price", "cost"], "positive", 0.7, "Price/cost related columns"),
1029
+ (["quantity", "total"], "positive", 0.5, "Quantity affects total"),
1030
+ (["height", "weight"], "positive", 0.3, "Physical measurements"),
1031
+ (["income", "expenditure"], "positive", 0.4, "Financial metrics"),
1032
+ (["age", "experience"], "positive", 0.5, "Age correlates with experience"),
1033
+ (["views", "clicks"], "positive", 0.6, "Engagement metrics"),
1034
+ (["revenue", "profit"], "positive", 0.6, "Revenue correlates with profit"),
1035
+ # Negative correlation expected
1036
+ (["discount", "price"], "negative", -0.3, "Discount inversely affects price"),
1037
+ (["errors", "quality"], "negative", -0.5, "Errors reduce quality score"),
1038
+ ]
1039
+
1040
+ col_name_map = {col.get("name", "").lower(): col.get("name", "") for col in numeric_cols}
1041
+
1042
+ for hints, direction, expected_correlation, description in correlation_patterns:
1043
+ matched_cols = []
1044
+ for hint in hints:
1045
+ for col_lower, col_name in col_name_map.items():
1046
+ if hint in col_lower and col_name not in matched_cols:
1047
+ matched_cols.append(col_name)
1048
+ break
1049
+
1050
+ if len(matched_cols) >= 2:
1051
+ # Suggest correlation check for the first pair found
1052
+ col_a, col_b = matched_cols[0], matched_cols[1]
1053
+ if direction == "positive":
1054
+ min_corr = expected_correlation
1055
+ max_corr = 1.0
1056
+ else:
1057
+ min_corr = -1.0
1058
+ max_corr = expected_correlation
1059
+
1060
+ suggestions.append(
1061
+ CrossColumnRuleSuggestion(
1062
+ rule_type=CrossColumnRuleType.COLUMN_CORRELATION,
1063
+ columns=[col_a, col_b],
1064
+ validator_name="ColumnCorrelation",
1065
+ params={
1066
+ "column_a": col_a,
1067
+ "column_b": col_b,
1068
+ "min_correlation": min_corr,
1069
+ "max_correlation": max_corr,
1070
+ },
1071
+ confidence=0.65,
1072
+ reason=f"{description}: {col_a} and {col_b} may be correlated",
1073
+ severity_suggestion="medium",
1074
+ evidence={
1075
+ "pattern": "correlation_pattern",
1076
+ "direction": direction,
1077
+ "expected_correlation": expected_correlation,
1078
+ },
1079
+ )
1080
+ )
1081
+
1082
+ # Also suggest correlation check for columns with similar names (e.g., metric_v1, metric_v2)
1083
+ for i, col1 in enumerate(numeric_cols):
1084
+ for col2 in numeric_cols[i + 1:]:
1085
+ name1 = col1.get("name", "")
1086
+ name2 = col2.get("name", "")
1087
+
1088
+ # Check for versioned or indexed columns
1089
+ base1 = re.sub(r"[_\-]?(v?\d+|old|new|prev|next)$", "", name1.lower())
1090
+ base2 = re.sub(r"[_\-]?(v?\d+|old|new|prev|next)$", "", name2.lower())
1091
+
1092
+ if base1 and base1 == base2 and name1 != name2:
1093
+ suggestions.append(
1094
+ CrossColumnRuleSuggestion(
1095
+ rule_type=CrossColumnRuleType.COLUMN_CORRELATION,
1096
+ columns=[name1, name2],
1097
+ validator_name="ColumnCorrelation",
1098
+ params={
1099
+ "column_a": name1,
1100
+ "column_b": name2,
1101
+ "min_correlation": 0.5,
1102
+ "max_correlation": 1.0,
1103
+ },
1104
+ confidence=0.7,
1105
+ reason=f"Versioned columns {name1} and {name2} should be correlated",
1106
+ severity_suggestion="low",
1107
+ evidence={
1108
+ "pattern": "versioned_columns",
1109
+ "base_name": base1,
1110
+ },
1111
+ )
1112
+ )
1113
+
1114
+ return suggestions
1115
+
1116
+ def _suggest_chain_comparison_rules(
1117
+ self,
1118
+ columns: list[dict[str, Any]],
1119
+ strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
1120
+ ) -> list[CrossColumnRuleSuggestion]:
1121
+ """Suggest chain comparison rules (a < b < c).
1122
+
1123
+ Args:
1124
+ columns: List of column profile data.
1125
+ strictness: Strictness level.
1126
+
1127
+ Returns:
1128
+ List of cross-column suggestions.
1129
+ """
1130
+ suggestions = []
1131
+ col_map = {col.get("name", "").lower(): col.get("name", "") for col in columns}
1132
+
1133
+ # Common chain comparison patterns
1134
+ chain_patterns = [
1135
+ # Date chains
1136
+ (["created", "updated", "deleted"], "<=", "Lifecycle date ordering"),
1137
+ (["start_date", "mid_date", "end_date"], "<=", "Date range ordering"),
1138
+ (["ordered", "shipped", "delivered"], "<=", "Order timeline"),
1139
+ (["submitted", "approved", "completed"], "<=", "Workflow dates"),
1140
+ # Numeric chains
1141
+ (["min", "avg", "max"], "<=", "Statistical ordering"),
1142
+ (["low", "mid", "high"], "<=", "Range tier ordering"),
1143
+ (["bronze", "silver", "gold"], "<=", "Tier value ordering"),
1144
+ (["small", "medium", "large"], "<=", "Size value ordering"),
1145
+ (["floor_price", "price", "ceiling_price"], "<=", "Price bounds ordering"),
1146
+ (["cost", "price", "msrp"], "<=", "Pricing chain"),
1147
+ ]
1148
+
1149
+ for hints, operator, description in chain_patterns:
1150
+ matched_cols = []
1151
+ for hint in hints:
1152
+ for col_lower, col_name in col_map.items():
1153
+ if hint in col_lower and col_name not in matched_cols:
1154
+ matched_cols.append(col_name)
1155
+ break
1156
+
1157
+ if len(matched_cols) >= 3:
1158
+ suggestions.append(
1159
+ CrossColumnRuleSuggestion(
1160
+ rule_type=CrossColumnRuleType.COLUMN_CHAIN_COMPARISON,
1161
+ columns=matched_cols[:3], # Limit to 3 columns
1162
+ validator_name="ColumnChainComparison",
1163
+ params={
1164
+ "columns": matched_cols[:3],
1165
+ "operator": operator,
1166
+ },
1167
+ confidence=0.75,
1168
+ reason=f"{description}: {' {0} '.format(operator).join(matched_cols[:3])}",
1169
+ severity_suggestion="medium",
1170
+ evidence={
1171
+ "pattern": "chain_comparison",
1172
+ "operator": operator,
1173
+ },
1174
+ )
1175
+ )
1176
+
1177
+ return suggestions
1178
+
1179
+ def _suggest_advanced_arithmetic_rules(
1180
+ self,
1181
+ columns: list[dict[str, Any]],
1182
+ strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
1183
+ ) -> list[CrossColumnRuleSuggestion]:
1184
+ """Suggest advanced arithmetic relationship rules (product, ratio, percentage).
1185
+
1186
+ Args:
1187
+ columns: List of column profile data.
1188
+ strictness: Strictness level.
1189
+
1190
+ Returns:
1191
+ List of cross-column suggestions.
1192
+ """
1193
+ suggestions = []
1194
+ col_map = {col.get("name", "").lower(): col.get("name", "") for col in columns}
1195
+ numeric_cols = [
1196
+ col.get("name", "") for col in columns
1197
+ if col.get("dtype", "").lower() in (
1198
+ "int64", "int32", "float64", "float32", "number", "integer", "float"
1199
+ )
1200
+ ]
1201
+
1202
+ # Product patterns (a * b = c)
1203
+ product_patterns = [
1204
+ (["quantity", "unit_price"], "total", "Line item total"),
1205
+ (["quantity", "price"], "amount", "Order amount"),
1206
+ (["hours", "rate"], "cost", "Labor cost"),
1207
+ (["hours", "hourly_rate"], "total_cost", "Total labor cost"),
1208
+ (["length", "width"], "area", "Area calculation"),
1209
+ (["principal", "rate"], "interest", "Interest calculation"),
1210
+ ]
1211
+
1212
+ for factors, result_hint, description in product_patterns:
1213
+ factor_cols = []
1214
+ for factor in factors:
1215
+ for col_lower, col_name in col_map.items():
1216
+ if factor in col_lower and col_name not in factor_cols:
1217
+ factor_cols.append(col_name)
1218
+ break
1219
+
1220
+ if len(factor_cols) >= 2:
1221
+ # Find result column
1222
+ result_col = None
1223
+ for col_lower, col_name in col_map.items():
1224
+ if result_hint in col_lower and col_name not in factor_cols:
1225
+ result_col = col_name
1226
+ break
1227
+
1228
+ if result_col:
1229
+ suggestions.append(
1230
+ CrossColumnRuleSuggestion(
1231
+ rule_type=CrossColumnRuleType.COLUMN_PRODUCT,
1232
+ columns=[*factor_cols, result_col],
1233
+ validator_name="ColumnProduct",
1234
+ params={
1235
+ "columns": factor_cols,
1236
+ "target_column": result_col,
1237
+ "tolerance": 0.01,
1238
+ },
1239
+ confidence=0.75,
1240
+ reason=f"{description}: {' × '.join(factor_cols)} = {result_col}",
1241
+ severity_suggestion="high",
1242
+ evidence={
1243
+ "pattern": "arithmetic_product",
1244
+ "description": description,
1245
+ },
1246
+ )
1247
+ )
1248
+
1249
+ # Ratio patterns (a / b = expected ratio or a / b ≈ c)
1250
+ ratio_patterns = [
1251
+ ("profit", "revenue", "margin", "Profit margin"),
1252
+ ("tax", "subtotal", "tax_rate", "Tax rate"),
1253
+ ("discount", "price", "discount_rate", "Discount rate"),
1254
+ ("part", "total", "ratio", "Part to total ratio"),
1255
+ ("completed", "total", "completion_rate", "Completion rate"),
1256
+ ]
1257
+
1258
+ for numerator_hint, denominator_hint, result_hint, description in ratio_patterns:
1259
+ numerator_col = None
1260
+ denominator_col = None
1261
+ result_col = None
1262
+
1263
+ for col_lower, col_name in col_map.items():
1264
+ if numerator_hint in col_lower and not numerator_col:
1265
+ numerator_col = col_name
1266
+ elif denominator_hint in col_lower and not denominator_col:
1267
+ denominator_col = col_name
1268
+ elif result_hint in col_lower and not result_col:
1269
+ result_col = col_name
1270
+
1271
+ if numerator_col and denominator_col:
1272
+ if result_col:
1273
+ # Ratio with result column
1274
+ suggestions.append(
1275
+ CrossColumnRuleSuggestion(
1276
+ rule_type=CrossColumnRuleType.COLUMN_RATIO,
1277
+ columns=[numerator_col, denominator_col, result_col],
1278
+ validator_name="ColumnRatio",
1279
+ params={
1280
+ "numerator_column": numerator_col,
1281
+ "denominator_column": denominator_col,
1282
+ "result_column": result_col,
1283
+ "tolerance": 0.01,
1284
+ },
1285
+ confidence=0.7,
1286
+ reason=f"{description}: {numerator_col} / {denominator_col} = {result_col}",
1287
+ severity_suggestion="medium",
1288
+ evidence={
1289
+ "pattern": "arithmetic_ratio",
1290
+ "description": description,
1291
+ },
1292
+ )
1293
+ )
1294
+
1295
+ # Percentage patterns
1296
+ percentage_patterns = [
1297
+ ("discount_pct", "subtotal", "discount", "Discount percentage"),
1298
+ ("tax_pct", "subtotal", "tax", "Tax percentage"),
1299
+ ("commission_pct", "sales", "commission", "Commission percentage"),
1300
+ ("margin_pct", "revenue", "profit", "Margin percentage"),
1301
+ ]
1302
+
1303
+ for pct_hint, base_hint, result_hint, description in percentage_patterns:
1304
+ pct_col = None
1305
+ base_col = None
1306
+ result_col = None
1307
+
1308
+ for col_lower, col_name in col_map.items():
1309
+ if pct_hint in col_lower and not pct_col:
1310
+ pct_col = col_name
1311
+ elif base_hint in col_lower and not base_col:
1312
+ base_col = col_name
1313
+ elif result_hint in col_lower and not result_col:
1314
+ result_col = col_name
1315
+
1316
+ if pct_col and base_col and result_col:
1317
+ suggestions.append(
1318
+ CrossColumnRuleSuggestion(
1319
+ rule_type=CrossColumnRuleType.COLUMN_PERCENTAGE,
1320
+ columns=[pct_col, base_col, result_col],
1321
+ validator_name="ColumnPercentage",
1322
+ params={
1323
+ "percentage_column": pct_col,
1324
+ "base_column": base_col,
1325
+ "result_column": result_col,
1326
+ "tolerance": 0.01,
1327
+ },
1328
+ confidence=0.7,
1329
+ reason=f"{description}: {base_col} × {pct_col}% = {result_col}",
1330
+ severity_suggestion="medium",
1331
+ evidence={
1332
+ "pattern": "arithmetic_percentage",
1333
+ "description": description,
1334
+ },
1335
+ )
1336
+ )
1337
+
1338
+ # Difference patterns (a - b = c)
1339
+ difference_patterns = [
1340
+ ("gross", "deductions", "net", "Net calculation"),
1341
+ ("revenue", "cost", "profit", "Profit calculation"),
1342
+ ("end_value", "start_value", "change", "Change calculation"),
1343
+ ("current", "previous", "delta", "Delta calculation"),
1344
+ ]
1345
+
1346
+ for minuend_hint, subtrahend_hint, result_hint, description in difference_patterns:
1347
+ minuend_col = None
1348
+ subtrahend_col = None
1349
+ result_col = None
1350
+
1351
+ for col_lower, col_name in col_map.items():
1352
+ if minuend_hint in col_lower and not minuend_col:
1353
+ minuend_col = col_name
1354
+ elif subtrahend_hint in col_lower and not subtrahend_col:
1355
+ subtrahend_col = col_name
1356
+ elif result_hint in col_lower and not result_col:
1357
+ result_col = col_name
1358
+
1359
+ if minuend_col and subtrahend_col and result_col:
1360
+ suggestions.append(
1361
+ CrossColumnRuleSuggestion(
1362
+ rule_type=CrossColumnRuleType.COLUMN_DIFFERENCE,
1363
+ columns=[minuend_col, subtrahend_col, result_col],
1364
+ validator_name="ColumnDifference",
1365
+ params={
1366
+ "minuend_column": minuend_col,
1367
+ "subtrahend_column": subtrahend_col,
1368
+ "result_column": result_col,
1369
+ "tolerance": 0.01,
1370
+ },
1371
+ confidence=0.75,
1372
+ reason=f"{description}: {minuend_col} - {subtrahend_col} = {result_col}",
1373
+ severity_suggestion="high",
1374
+ evidence={
1375
+ "pattern": "arithmetic_difference",
1376
+ "description": description,
1377
+ },
1378
+ )
1379
+ )
1380
+
1381
+ return suggestions
1382
+
1383
    def _suggest_dependency_rules(
        self,
        columns: list[dict[str, Any]],
        strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
    ) -> list[CrossColumnRuleSuggestion]:
        """Suggest functional dependency and implication rules.

        Three families of name-based heuristics are applied:
        implication/dependency pairs (determinant -> dependent),
        coexistence groups (fields expected to be set together), and
        mutual-exclusivity groups. Only column names are inspected, never
        data values.

        Args:
            columns: List of column profile data.
            strictness: Strictness level. NOTE(review): accepted for
                signature symmetry with the sibling generators but not
                referenced anywhere in this body.

        Returns:
            List of cross-column suggestions.
        """
        suggestions: list[CrossColumnRuleSuggestion] = []
        col_map = {col.get("name", ""): col for col in columns}

        # Common dependency patterns
        # (determinant hint, dependent hint, condition value, description).
        # A non-None condition produces an implication rule; None produces a
        # functional-dependency rule (see branch below).
        dependency_patterns = [
            # If status is 'active', email must not be null
            ("status", "email", "active", "If status is active, email is required"),
            ("status", "phone", "active", "If status is active, phone is required"),
            # If is_premium, subscription_tier must be set
            ("is_premium", "subscription_tier", True, "Premium users must have subscription tier"),
            # Country determines currency
            ("country", "currency", None, "Country determines currency"),
            ("country_code", "currency_code", None, "Country code determines currency code"),
        ]

        for det_hint, dep_hint, condition, description in dependency_patterns:
            det_col = None
            dep_col = None

            # No break: when several columns contain the hint, the LAST match
            # in column order wins. NOTE(review): the other generators take
            # the first match — confirm last-match-wins is intentional here.
            for col_name in col_map:
                if det_hint in col_name.lower():
                    det_col = col_name
                if dep_hint in col_name.lower():
                    dep_col = col_name

            if det_col and dep_col and det_col != dep_col:
                if condition is not None:
                    # Implication rule: if condition then dependent not null
                    suggestions.append(
                        CrossColumnRuleSuggestion(
                            rule_type=CrossColumnRuleType.COLUMN_IMPLICATION,
                            columns=[det_col, dep_col],
                            validator_name="ColumnImplication",
                            params={
                                "determinant_column": det_col,
                                "dependent_column": dep_col,
                                "condition_value": condition,
                            },
                            confidence=0.7,
                            reason=description,
                            severity_suggestion="medium",
                            evidence={
                                "pattern": "conditional_dependency",
                            },
                        )
                    )
                else:
                    # Functional dependency
                    suggestions.append(
                        CrossColumnRuleSuggestion(
                            rule_type=CrossColumnRuleType.COLUMN_DEPENDENCY,
                            columns=[det_col, dep_col],
                            validator_name="ColumnDependency",
                            params={
                                "determinant_column": det_col,
                                "dependent_column": dep_col,
                            },
                            confidence=0.65,
                            reason=description,
                            severity_suggestion="medium",
                            evidence={
                                "pattern": "functional_dependency",
                            },
                        )
                    )

        # Coexistence patterns (all null or all non-null)
        coexistence_groups = [
            (["address_line1", "city", "postal_code"], "Address fields should coexist"),
            (["latitude", "longitude"], "Coordinates should coexist"),
            (["first_name", "last_name"], "Name fields should coexist"),
            (["start_date", "end_date"], "Date range fields should coexist"),
        ]

        for hints, description in coexistence_groups:
            found_cols = []
            # Each hint claims the first column containing it (break on match).
            for hint in hints:
                for col_name in col_map:
                    if hint in col_name.lower():
                        found_cols.append(col_name)
                        break

            # Suggest even on a partial match, as long as two fields exist.
            if len(found_cols) >= 2:
                suggestions.append(
                    CrossColumnRuleSuggestion(
                        rule_type=CrossColumnRuleType.COLUMN_COEXISTENCE,
                        columns=found_cols,
                        validator_name="ColumnCoexistence",
                        params={"columns": found_cols},
                        confidence=0.7,
                        reason=description,
                        severity_suggestion="medium",
                        evidence={
                            "pattern": "coexistence",
                            # NOTE(review): this slices hints positionally and
                            # may not be the hints that actually matched when
                            # an early hint found no column — verify intent.
                            "matched_hints": hints[:len(found_cols)],
                        },
                    )
                )

        # Mutual exclusivity patterns
        mutex_groups = [
            (["phone_home", "phone_work", "phone_mobile"], "At most one phone type"),
            (["payment_card", "payment_bank", "payment_crypto"], "One payment method"),
        ]

        for hints, description in mutex_groups:
            found_cols = []
            for hint in hints:
                for col_name in col_map:
                    if hint in col_name.lower():
                        found_cols.append(col_name)
                        break

            if len(found_cols) >= 2:
                suggestions.append(
                    CrossColumnRuleSuggestion(
                        rule_type=CrossColumnRuleType.COLUMN_MUTUAL_EXCLUSIVITY,
                        columns=found_cols,
                        validator_name="ColumnMutualExclusivity",
                        params={"columns": found_cols},
                        confidence=0.6,
                        reason=description,
                        severity_suggestion="low",
                        evidence={
                            "pattern": "mutual_exclusivity",
                        },
                    )
                )

        return suggestions
1527
+
1528
+ def _generate_cross_column_suggestions(
1529
+ self,
1530
+ columns: list[dict[str, Any]],
1531
+ strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
1532
+ include_types: list[CrossColumnRuleType] | None = None,
1533
+ exclude_types: list[CrossColumnRuleType] | None = None,
1534
+ ) -> list[CrossColumnRuleSuggestion]:
1535
+ """Generate all cross-column rule suggestions.
1536
+
1537
+ Args:
1538
+ columns: List of column profile data.
1539
+ strictness: Strictness level.
1540
+ include_types: Only include these cross-column rule types.
1541
+ exclude_types: Exclude these cross-column rule types.
1542
+
1543
+ Returns:
1544
+ List of cross-column suggestions.
1545
+ """
1546
+ all_suggestions: list[CrossColumnRuleSuggestion] = []
1547
+
1548
+ # Generate suggestions by type
1549
+ # Each generator method may produce multiple rule types
1550
+ type_generators = {
1551
+ CrossColumnRuleType.COMPOSITE_KEY: self._suggest_composite_key_rules,
1552
+ CrossColumnRuleType.COLUMN_COMPARISON: self._suggest_comparison_rules,
1553
+ CrossColumnRuleType.COLUMN_SUM: self._suggest_arithmetic_rules,
1554
+ CrossColumnRuleType.COLUMN_DEPENDENCY: self._suggest_dependency_rules,
1555
+ CrossColumnRuleType.COLUMN_IMPLICATION: self._suggest_dependency_rules,
1556
+ CrossColumnRuleType.COLUMN_COEXISTENCE: self._suggest_dependency_rules,
1557
+ CrossColumnRuleType.COLUMN_MUTUAL_EXCLUSIVITY: self._suggest_dependency_rules,
1558
+ # New generators for comprehensive cross-column support
1559
+ CrossColumnRuleType.COLUMN_CORRELATION: self._suggest_correlation_rules,
1560
+ CrossColumnRuleType.COLUMN_CHAIN_COMPARISON: self._suggest_chain_comparison_rules,
1561
+ CrossColumnRuleType.COLUMN_PRODUCT: self._suggest_advanced_arithmetic_rules,
1562
+ CrossColumnRuleType.COLUMN_RATIO: self._suggest_advanced_arithmetic_rules,
1563
+ CrossColumnRuleType.COLUMN_PERCENTAGE: self._suggest_advanced_arithmetic_rules,
1564
+ CrossColumnRuleType.COLUMN_DIFFERENCE: self._suggest_advanced_arithmetic_rules,
1565
+ }
1566
+
1567
+ # Determine which types to generate
1568
+ types_to_generate = set(type_generators.keys())
1569
+
1570
+ if include_types:
1571
+ types_to_generate &= set(include_types)
1572
+
1573
+ if exclude_types:
1574
+ types_to_generate -= set(exclude_types)
1575
+
1576
+ # Generate suggestions
1577
+ generated_methods = set()
1578
+ for rule_type in types_to_generate:
1579
+ generator = type_generators.get(rule_type)
1580
+ if generator and generator not in generated_methods:
1581
+ generated_methods.add(generator)
1582
+ suggestions = generator(columns, strictness)
1583
+ all_suggestions.extend(suggestions)
1584
+
1585
+ # Filter by min confidence based on strictness
1586
+ thresholds = STRICTNESS_THRESHOLDS[strictness]
1587
+ min_confidence = thresholds["min_confidence"]
1588
+ all_suggestions = [s for s in all_suggestions if s.confidence >= min_confidence]
1589
+
1590
+ # Deduplicate and sort by confidence
1591
+ seen = set()
1592
+ unique_suggestions = []
1593
+ for s in all_suggestions:
1594
+ key = (s.rule_type.value, tuple(sorted(s.columns)))
1595
+ if key not in seen:
1596
+ seen.add(key)
1597
+ unique_suggestions.append(s)
1598
+
1599
+ unique_suggestions.sort(key=lambda s: s.confidence, reverse=True)
1600
+
1601
+ # Assign unique IDs
1602
+ import uuid
1603
+ for s in unique_suggestions:
1604
+ s.id = str(uuid.uuid4())[:8]
1605
+
1606
+ return unique_suggestions
1607
+
1608
+ def _get_categories_for_preset(
1609
+ self, preset: RulePreset | None
1610
+ ) -> list[RuleCategory] | None:
1611
+ """Get categories for a preset.
1612
+
1613
+ Args:
1614
+ preset: Preset name.
1615
+
1616
+ Returns:
1617
+ List of categories or None for all.
1618
+ """
1619
+ if preset is None:
1620
+ return None
1621
+ preset_info = PRESET_DEFINITIONS.get(preset)
1622
+ if preset_info:
1623
+ return preset_info.categories
1624
+ return None
1625
+
1626
+ def _filter_by_category(
1627
+ self,
1628
+ suggestions: list[SuggestedRule],
1629
+ include_categories: list[RuleCategory] | None,
1630
+ exclude_categories: list[RuleCategory] | None,
1631
+ ) -> list[SuggestedRule]:
1632
+ """Filter suggestions by category.
1633
+
1634
+ Args:
1635
+ suggestions: List of suggestions.
1636
+ include_categories: Categories to include (None = all).
1637
+ exclude_categories: Categories to exclude.
1638
+
1639
+ Returns:
1640
+ Filtered list.
1641
+ """
1642
+ result = suggestions
1643
+
1644
+ if include_categories:
1645
+ include_set = set(c.value if isinstance(c, RuleCategory) else c for c in include_categories)
1646
+ result = [
1647
+ s for s in result
1648
+ if (s.category.value if isinstance(s.category, RuleCategory) else s.category) in include_set
1649
+ ]
1650
+
1651
+ if exclude_categories:
1652
+ exclude_set = set(c.value if isinstance(c, RuleCategory) else c for c in exclude_categories)
1653
+ result = [
1654
+ s for s in result
1655
+ if (s.category.value if isinstance(s.category, RuleCategory) else s.category) not in exclude_set
1656
+ ]
1657
+
1658
+ return result
1659
+
1660
    async def generate_suggestions(
        self,
        source: Source,
        profile: Profile,
        schema: Schema | None = None,
        *,
        min_confidence: float = 0.5,
        strictness: StrictnessLevel = StrictnessLevel.MEDIUM,
        preset: RulePreset | None = None,
        include_categories: list[RuleCategory] | None = None,
        exclude_categories: list[RuleCategory] | None = None,
        enable_cross_column: bool = True,
        include_cross_column_types: list[CrossColumnRuleType] | None = None,
        exclude_cross_column_types: list[CrossColumnRuleType] | None = None,
    ) -> RuleSuggestionResponse:
        """Generate rule suggestions based on profile data.

        Pipeline: apply preset overrides -> generate per-column suggestions
        -> filter by category and confidence -> optionally generate
        cross-column suggestions -> aggregate counts and build the response.

        Args:
            source: Source record.
            profile: Profile record.
            schema: Optional schema for additional context.
            min_confidence: Minimum confidence threshold (may be raised
                further by the strictness level, never lowered).
            strictness: Strictness level for rule thresholds.
            preset: Preset template to use.
            include_categories: Categories to include.
            exclude_categories: Categories to exclude.
            enable_cross_column: Whether to generate cross-column rules.
            include_cross_column_types: Cross-column types to include.
            exclude_cross_column_types: Cross-column types to exclude.

        Returns:
            Rule suggestion response.
        """
        suggestions: list[SuggestedRule] = []
        cross_column_suggestions: list[CrossColumnRuleSuggestion] = []

        # Apply preset settings if specified
        # NOTE(review): a known preset unconditionally overrides both the
        # caller-supplied strictness and include_categories — confirm this
        # precedence is intended.
        if preset:
            preset_info = PRESET_DEFINITIONS.get(preset)
            if preset_info:
                strictness = preset_info.strictness
                include_categories = preset_info.categories

        # Adjust min_confidence based on strictness: the stricter of the
        # caller's floor and the strictness level's floor wins.
        thresholds = STRICTNESS_THRESHOLDS[strictness]
        effective_min_confidence = max(min_confidence, thresholds["min_confidence"])

        # Get columns from profile; fall back to the raw profile JSON when
        # the ORM relationship is absent or empty.
        columns = profile.columns if hasattr(profile, "columns") else []
        if not columns and profile.profile_json:
            columns = profile.profile_json.get("columns", [])

        # Get schema columns for additional context (used by type rules).
        schema_columns = {}
        if schema and schema.schema_json:
            schema_columns = schema.schema_json.get("columns", {})

        # Generate single-column suggestions for each column
        for column in columns:
            col_name = column.get("name", "")
            schema_col = schema_columns.get(col_name)

            # Collect all suggestions for this column with strictness
            suggestions.extend(self._suggest_null_rules(column, strictness))
            suggestions.extend(self._suggest_uniqueness_rules(column, strictness))
            suggestions.extend(self._suggest_range_rules(column, strictness))
            suggestions.extend(self._suggest_type_rules(column, schema_col, strictness))
            suggestions.extend(self._suggest_statistical_rules(column, strictness))

        # Filter by category
        suggestions = self._filter_by_category(
            suggestions, include_categories, exclude_categories
        )

        # Filter by confidence threshold
        suggestions = [s for s in suggestions if s.confidence >= effective_min_confidence]

        # Sort by confidence (highest first)
        suggestions.sort(key=lambda s: s.confidence, reverse=True)

        # Generate cross-column suggestions if enabled
        if enable_cross_column and columns:
            cross_column_suggestions = self._generate_cross_column_suggestions(
                columns,
                strictness,
                include_cross_column_types,
                exclude_cross_column_types,
            )
            # Filter by min confidence (same effective floor as single-column)
            cross_column_suggestions = [
                s for s in cross_column_suggestions
                if s.confidence >= effective_min_confidence
            ]

        # Count high confidence suggestions (single + cross-column)
        high_confidence = sum(1 for s in suggestions if s.confidence >= 0.8)
        high_confidence += sum(1 for s in cross_column_suggestions if s.confidence >= 0.8)

        # Count by category
        by_category: dict[str, int] = {}
        for s in suggestions:
            cat_value = s.category.value if isinstance(s.category, RuleCategory) else str(s.category)
            by_category[cat_value] = by_category.get(cat_value, 0) + 1

        # Add cross-column categories
        # NOTE(review): these assignments overwrite (not add to) any
        # "relationship"/"multi_column" counts contributed by single-column
        # suggestions above — confirm no single-column generator produces
        # these categories.
        if cross_column_suggestions:
            by_category["relationship"] = len([
                s for s in cross_column_suggestions
                if s.rule_type in (
                    CrossColumnRuleType.COLUMN_COMPARISON,
                    CrossColumnRuleType.COLUMN_DEPENDENCY,
                    CrossColumnRuleType.COLUMN_IMPLICATION,
                )
            ])
            by_category["multi_column"] = len([
                s for s in cross_column_suggestions
                if s.rule_type in (
                    CrossColumnRuleType.COMPOSITE_KEY,
                    CrossColumnRuleType.COLUMN_SUM,
                    CrossColumnRuleType.COLUMN_COEXISTENCE,
                    CrossColumnRuleType.COLUMN_MUTUAL_EXCLUSIVITY,
                )
            ])

        # Count by cross-column type
        by_cross_column_type: dict[str, int] = {}
        for s in cross_column_suggestions:
            type_value = s.rule_type.value
            by_cross_column_type[type_value] = by_cross_column_type.get(type_value, 0) + 1

        # Collect unique categories (normalized to RuleCategory members)
        categories_included = list(set(
            s.category if isinstance(s.category, RuleCategory) else RuleCategory(s.category)
            for s in suggestions
        ))
        if cross_column_suggestions:
            if RuleCategory.RELATIONSHIP not in categories_included:
                categories_included.append(RuleCategory.RELATIONSHIP)
            if RuleCategory.MULTI_COLUMN not in categories_included:
                categories_included.append(RuleCategory.MULTI_COLUMN)

        # Total suggestions count
        total_suggestions = len(suggestions) + len(cross_column_suggestions)

        return RuleSuggestionResponse(
            source_id=source.id,
            source_name=source.name,
            profile_id=profile.id,
            suggestions=suggestions,
            cross_column_suggestions=cross_column_suggestions,
            total_suggestions=total_suggestions,
            high_confidence_count=high_confidence,
            cross_column_count=len(cross_column_suggestions),
            # NOTE(review): datetime.utcnow() is naive and deprecated in
            # Python 3.12+; consider datetime.now(timezone.utc) if the
            # response model tolerates aware datetimes.
            generated_at=datetime.utcnow(),
            strictness=strictness,
            preset=preset,
            categories_included=categories_included,
            by_category=by_category,
            by_cross_column_type=by_cross_column_type,
        )
1820
+
1821
+ def _build_rules_dict(
1822
+ self, suggestions: list[SuggestedRule]
1823
+ ) -> tuple[dict[str, Any], list[str]]:
1824
+ """Build rules dictionary from suggestions.
1825
+
1826
+ Args:
1827
+ suggestions: List of suggestions.
1828
+
1829
+ Returns:
1830
+ Tuple of (rules dict, validator names).
1831
+ """
1832
+ rules_dict: dict[str, Any] = {"columns": {}}
1833
+ validators_applied = []
1834
+
1835
+ for suggestion in suggestions:
1836
+ col_name = suggestion.column
1837
+ validator_name = suggestion.validator_name
1838
+
1839
+ if col_name not in rules_dict["columns"]:
1840
+ rules_dict["columns"][col_name] = {}
1841
+
1842
+ # Add validator with params
1843
+ if suggestion.params:
1844
+ rules_dict["columns"][col_name][validator_name.lower()] = suggestion.params
1845
+ else:
1846
+ rules_dict["columns"][col_name][validator_name.lower()] = True
1847
+
1848
+ validators_applied.append(validator_name)
1849
+
1850
+ return rules_dict, validators_applied
1851
+
1852
+ async def apply_suggestions(
1853
+ self,
1854
+ source: Source,
1855
+ suggestions: list[SuggestedRule],
1856
+ *,
1857
+ rule_name: str | None = None,
1858
+ rule_description: str | None = None,
1859
+ ) -> ApplyRulesResponse:
1860
+ """Apply selected rule suggestions to create validation rules.
1861
+
1862
+ Args:
1863
+ source: Source record.
1864
+ suggestions: Selected suggestions to apply.
1865
+ rule_name: Optional name for the rule set.
1866
+ rule_description: Optional description.
1867
+
1868
+ Returns:
1869
+ Apply rules response.
1870
+ """
1871
+ # Build rules from suggestions
1872
+ rules_dict, validators_applied = self._build_rules_dict(suggestions)
1873
+
1874
+ # Create YAML string
1875
+ rules_yaml = yaml.dump(rules_dict, default_flow_style=False)
1876
+
1877
+ # Create rule record
1878
+ final_name = rule_name or f"Auto-generated rules for {source.name}"
1879
+ final_description = rule_description or (
1880
+ f"Automatically generated from profile analysis. "
1881
+ f"Includes {len(suggestions)} validators."
1882
+ )
1883
+
1884
+ # Deactivate existing rules
1885
+ existing_rules = await self.rule_repo.get_for_source(
1886
+ source.id, active_only=True
1887
+ )
1888
+ for rule in existing_rules:
1889
+ rule.is_active = False
1890
+
1891
+ # Create new rule
1892
+ rule = await self.rule_repo.create(
1893
+ source_id=source.id,
1894
+ name=final_name,
1895
+ description=final_description,
1896
+ rules_yaml=rules_yaml,
1897
+ rules_json=rules_dict,
1898
+ is_active=True,
1899
+ )
1900
+
1901
+ await self.session.commit()
1902
+
1903
+ return ApplyRulesResponse(
1904
+ source_id=source.id,
1905
+ rule_id=rule.id,
1906
+ rule_name=rule.name,
1907
+ applied_count=len(suggestions),
1908
+ validators=list(set(validators_applied)),
1909
+ created_at=rule.created_at,
1910
+ )
1911
+
1912
+ def export_rules(
1913
+ self,
1914
+ suggestions: list[SuggestedRule],
1915
+ format: RuleExportFormat = RuleExportFormat.YAML,
1916
+ *,
1917
+ rule_name: str = "auto_generated_rules",
1918
+ description: str | None = None,
1919
+ include_metadata: bool = True,
1920
+ ) -> ExportRulesResponse:
1921
+ """Export rules in various formats.
1922
+
1923
+ Args:
1924
+ suggestions: Rules to export.
1925
+ format: Export format.
1926
+ rule_name: Name for the rule set.
1927
+ description: Optional description.
1928
+ include_metadata: Include generation metadata.
1929
+
1930
+ Returns:
1931
+ Export response with content.
1932
+ """
1933
+ rules_dict, validators = self._build_rules_dict(suggestions)
1934
+
1935
+ # Add metadata if requested
1936
+ if include_metadata:
1937
+ rules_dict["_metadata"] = {
1938
+ "name": rule_name,
1939
+ "description": description or f"Auto-generated rules ({len(suggestions)} validators)",
1940
+ "generated_at": datetime.utcnow().isoformat(),
1941
+ "rule_count": len(suggestions),
1942
+ "validators": list(set(validators)),
1943
+ }
1944
+
1945
+ # Generate content based on format
1946
+ if format == RuleExportFormat.YAML:
1947
+ content = yaml.dump(rules_dict, default_flow_style=False, sort_keys=False)
1948
+ filename = f"{rule_name}.yaml"
1949
+ elif format == RuleExportFormat.JSON:
1950
+ content = json.dumps(rules_dict, indent=2)
1951
+ filename = f"{rule_name}.json"
1952
+ elif format == RuleExportFormat.TOML:
1953
+ content = self._to_toml(rules_dict)
1954
+ filename = f"{rule_name}.toml"
1955
+ elif format == RuleExportFormat.PYTHON:
1956
+ content = self._to_python(rules_dict, rule_name, description)
1957
+ filename = f"{rule_name}.py"
1958
+ else:
1959
+ content = yaml.dump(rules_dict, default_flow_style=False)
1960
+ filename = f"{rule_name}.yaml"
1961
+
1962
+ return ExportRulesResponse(
1963
+ content=content,
1964
+ format=format,
1965
+ filename=filename,
1966
+ rule_count=len(suggestions),
1967
+ generated_at=datetime.utcnow(),
1968
+ )
1969
+
1970
+ def _to_toml(self, rules_dict: dict[str, Any]) -> str:
1971
+ """Convert rules to TOML format.
1972
+
1973
+ Args:
1974
+ rules_dict: Rules dictionary.
1975
+
1976
+ Returns:
1977
+ TOML string.
1978
+ """
1979
+ try:
1980
+ import toml
1981
+ return toml.dumps(rules_dict)
1982
+ except ImportError:
1983
+ # Fallback to simple TOML generation
1984
+ lines = []
1985
+ if "_metadata" in rules_dict:
1986
+ lines.append("[_metadata]")
1987
+ for k, v in rules_dict["_metadata"].items():
1988
+ if isinstance(v, str):
1989
+ lines.append(f'{k} = "{v}"')
1990
+ elif isinstance(v, list):
1991
+ lines.append(f'{k} = {json.dumps(v)}')
1992
+ else:
1993
+ lines.append(f"{k} = {v}")
1994
+ lines.append("")
1995
+
1996
+ if "columns" in rules_dict:
1997
+ for col_name, validators in rules_dict["columns"].items():
1998
+ lines.append(f'[columns."{col_name}"]')
1999
+ for val_name, val_config in validators.items():
2000
+ if isinstance(val_config, dict):
2001
+ lines.append(f"[columns.\"{col_name}\".{val_name}]")
2002
+ for pk, pv in val_config.items():
2003
+ if isinstance(pv, str):
2004
+ lines.append(f'{pk} = "{pv}"')
2005
+ else:
2006
+ lines.append(f"{pk} = {pv}")
2007
+ else:
2008
+ lines.append(f"{val_name} = {str(val_config).lower()}")
2009
+ lines.append("")
2010
+
2011
+ return "\n".join(lines)
2012
+
2013
+ def _to_python(
2014
+ self,
2015
+ rules_dict: dict[str, Any],
2016
+ rule_name: str,
2017
+ description: str | None,
2018
+ ) -> str:
2019
+ """Convert rules to Python code.
2020
+
2021
+ Args:
2022
+ rules_dict: Rules dictionary.
2023
+ rule_name: Name for the validation suite.
2024
+ description: Optional description.
2025
+
2026
+ Returns:
2027
+ Python code string.
2028
+ """
2029
+ lines = [
2030
+ '"""Auto-generated validation rules.',
2031
+ "",
2032
+ f"Name: {rule_name}",
2033
+ ]
2034
+ if description:
2035
+ lines.append(f"Description: {description}")
2036
+ lines.extend([
2037
+ '"""',
2038
+ "",
2039
+ "from truthound import th",
2040
+ "",
2041
+ "",
2042
+ f"def validate_{rule_name.replace('-', '_').replace(' ', '_')}(df):",
2043
+ f' """Run auto-generated validation rules."""',
2044
+ " result = th.check(",
2045
+ " df,",
2046
+ " validators=[",
2047
+ ])
2048
+
2049
+ # Add validators
2050
+ for col_name, validators in rules_dict.get("columns", {}).items():
2051
+ for val_name, val_config in validators.items():
2052
+ if isinstance(val_config, dict):
2053
+ params_str = ", ".join(
2054
+ f"{k}={repr(v)}" for k, v in val_config.items()
2055
+ )
2056
+ lines.append(f' ("{col_name}", "{val_name}", {{{params_str}}}),')
2057
+ else:
2058
+ lines.append(f' ("{col_name}", "{val_name}"),')
2059
+
2060
+ lines.extend([
2061
+ " ],",
2062
+ " )",
2063
+ " return result",
2064
+ "",
2065
+ "",
2066
+ 'if __name__ == "__main__":',
2067
+ " import pandas as pd",
2068
+ " # df = pd.read_csv('your_data.csv')",
2069
+ f" # result = validate_{rule_name.replace('-', '_').replace(' ', '_')}(df)",
2070
+ " # print(result)",
2071
+ "",
2072
+ ])
2073
+
2074
+ return "\n".join(lines)
2075
+
2076
+ @staticmethod
2077
+ def get_presets() -> PresetsResponse:
2078
+ """Get available presets and configuration options.
2079
+
2080
+ Returns:
2081
+ Presets response.
2082
+ """
2083
+ return PresetsResponse(
2084
+ presets=list(PRESET_DEFINITIONS.values()),
2085
+ strictness_levels=[level.value for level in StrictnessLevel],
2086
+ categories=[cat.value for cat in RuleCategory],
2087
+ export_formats=[fmt.value for fmt in RuleExportFormat],
2088
+ )