truthound-dashboard 1.4.4__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205)
  1. truthound_dashboard/api/alerts.py +75 -86
  2. truthound_dashboard/api/anomaly.py +7 -13
  3. truthound_dashboard/api/cross_alerts.py +38 -52
  4. truthound_dashboard/api/drift.py +49 -59
  5. truthound_dashboard/api/drift_monitor.py +234 -79
  6. truthound_dashboard/api/enterprise_sampling.py +498 -0
  7. truthound_dashboard/api/history.py +57 -5
  8. truthound_dashboard/api/lineage.py +3 -48
  9. truthound_dashboard/api/maintenance.py +104 -49
  10. truthound_dashboard/api/mask.py +1 -2
  11. truthound_dashboard/api/middleware.py +2 -1
  12. truthound_dashboard/api/model_monitoring.py +435 -311
  13. truthound_dashboard/api/notifications.py +227 -191
  14. truthound_dashboard/api/notifications_advanced.py +21 -20
  15. truthound_dashboard/api/observability.py +586 -0
  16. truthound_dashboard/api/plugins.py +2 -433
  17. truthound_dashboard/api/profile.py +199 -37
  18. truthound_dashboard/api/quality_reporter.py +701 -0
  19. truthound_dashboard/api/reports.py +7 -16
  20. truthound_dashboard/api/router.py +66 -0
  21. truthound_dashboard/api/rule_suggestions.py +5 -5
  22. truthound_dashboard/api/scan.py +17 -19
  23. truthound_dashboard/api/schedules.py +85 -50
  24. truthound_dashboard/api/schema_evolution.py +6 -6
  25. truthound_dashboard/api/schema_watcher.py +667 -0
  26. truthound_dashboard/api/sources.py +98 -27
  27. truthound_dashboard/api/tiering.py +1323 -0
  28. truthound_dashboard/api/triggers.py +14 -11
  29. truthound_dashboard/api/validations.py +12 -11
  30. truthound_dashboard/api/versioning.py +1 -6
  31. truthound_dashboard/core/__init__.py +129 -3
  32. truthound_dashboard/core/actions/__init__.py +62 -0
  33. truthound_dashboard/core/actions/custom.py +426 -0
  34. truthound_dashboard/core/actions/notifications.py +910 -0
  35. truthound_dashboard/core/actions/storage.py +472 -0
  36. truthound_dashboard/core/actions/webhook.py +281 -0
  37. truthound_dashboard/core/anomaly.py +262 -67
  38. truthound_dashboard/core/anomaly_explainer.py +4 -3
  39. truthound_dashboard/core/backends/__init__.py +67 -0
  40. truthound_dashboard/core/backends/base.py +299 -0
  41. truthound_dashboard/core/backends/errors.py +191 -0
  42. truthound_dashboard/core/backends/factory.py +423 -0
  43. truthound_dashboard/core/backends/mock_backend.py +451 -0
  44. truthound_dashboard/core/backends/truthound_backend.py +718 -0
  45. truthound_dashboard/core/checkpoint/__init__.py +87 -0
  46. truthound_dashboard/core/checkpoint/adapters.py +814 -0
  47. truthound_dashboard/core/checkpoint/checkpoint.py +491 -0
  48. truthound_dashboard/core/checkpoint/runner.py +270 -0
  49. truthound_dashboard/core/connections.py +437 -10
  50. truthound_dashboard/core/converters/__init__.py +14 -0
  51. truthound_dashboard/core/converters/truthound.py +620 -0
  52. truthound_dashboard/core/cross_alerts.py +540 -320
  53. truthound_dashboard/core/datasource_factory.py +1672 -0
  54. truthound_dashboard/core/drift_monitor.py +216 -20
  55. truthound_dashboard/core/enterprise_sampling.py +1291 -0
  56. truthound_dashboard/core/interfaces/__init__.py +225 -0
  57. truthound_dashboard/core/interfaces/actions.py +652 -0
  58. truthound_dashboard/core/interfaces/base.py +247 -0
  59. truthound_dashboard/core/interfaces/checkpoint.py +676 -0
  60. truthound_dashboard/core/interfaces/protocols.py +664 -0
  61. truthound_dashboard/core/interfaces/reporters.py +650 -0
  62. truthound_dashboard/core/interfaces/routing.py +646 -0
  63. truthound_dashboard/core/interfaces/triggers.py +619 -0
  64. truthound_dashboard/core/lineage.py +407 -71
  65. truthound_dashboard/core/model_monitoring.py +431 -3
  66. truthound_dashboard/core/notifications/base.py +4 -0
  67. truthound_dashboard/core/notifications/channels.py +501 -1203
  68. truthound_dashboard/core/notifications/deduplication/__init__.py +81 -115
  69. truthound_dashboard/core/notifications/deduplication/service.py +131 -348
  70. truthound_dashboard/core/notifications/dispatcher.py +202 -11
  71. truthound_dashboard/core/notifications/escalation/__init__.py +119 -106
  72. truthound_dashboard/core/notifications/escalation/engine.py +168 -358
  73. truthound_dashboard/core/notifications/routing/__init__.py +88 -128
  74. truthound_dashboard/core/notifications/routing/engine.py +90 -317
  75. truthound_dashboard/core/notifications/stats_aggregator.py +246 -1
  76. truthound_dashboard/core/notifications/throttling/__init__.py +67 -50
  77. truthound_dashboard/core/notifications/throttling/builder.py +117 -255
  78. truthound_dashboard/core/notifications/truthound_adapter.py +842 -0
  79. truthound_dashboard/core/phase5/collaboration.py +1 -1
  80. truthound_dashboard/core/plugins/lifecycle/__init__.py +0 -13
  81. truthound_dashboard/core/quality_reporter.py +1359 -0
  82. truthound_dashboard/core/report_history.py +0 -6
  83. truthound_dashboard/core/reporters/__init__.py +175 -14
  84. truthound_dashboard/core/reporters/adapters.py +943 -0
  85. truthound_dashboard/core/reporters/base.py +0 -3
  86. truthound_dashboard/core/reporters/builtin/__init__.py +18 -0
  87. truthound_dashboard/core/reporters/builtin/csv_reporter.py +111 -0
  88. truthound_dashboard/core/reporters/builtin/html_reporter.py +270 -0
  89. truthound_dashboard/core/reporters/builtin/json_reporter.py +127 -0
  90. truthound_dashboard/core/reporters/compat.py +266 -0
  91. truthound_dashboard/core/reporters/csv_reporter.py +2 -35
  92. truthound_dashboard/core/reporters/factory.py +526 -0
  93. truthound_dashboard/core/reporters/interfaces.py +745 -0
  94. truthound_dashboard/core/reporters/registry.py +1 -10
  95. truthound_dashboard/core/scheduler.py +165 -0
  96. truthound_dashboard/core/schema_evolution.py +3 -3
  97. truthound_dashboard/core/schema_watcher.py +1528 -0
  98. truthound_dashboard/core/services.py +595 -76
  99. truthound_dashboard/core/store_manager.py +810 -0
  100. truthound_dashboard/core/streaming_anomaly.py +169 -4
  101. truthound_dashboard/core/tiering.py +1309 -0
  102. truthound_dashboard/core/triggers/evaluators.py +178 -8
  103. truthound_dashboard/core/truthound_adapter.py +2620 -197
  104. truthound_dashboard/core/unified_alerts.py +23 -20
  105. truthound_dashboard/db/__init__.py +8 -0
  106. truthound_dashboard/db/database.py +8 -2
  107. truthound_dashboard/db/models.py +944 -25
  108. truthound_dashboard/db/repository.py +2 -0
  109. truthound_dashboard/main.py +11 -0
  110. truthound_dashboard/schemas/__init__.py +177 -16
  111. truthound_dashboard/schemas/base.py +44 -23
  112. truthound_dashboard/schemas/collaboration.py +19 -6
  113. truthound_dashboard/schemas/cross_alerts.py +19 -3
  114. truthound_dashboard/schemas/drift.py +61 -55
  115. truthound_dashboard/schemas/drift_monitor.py +67 -23
  116. truthound_dashboard/schemas/enterprise_sampling.py +653 -0
  117. truthound_dashboard/schemas/lineage.py +0 -33
  118. truthound_dashboard/schemas/mask.py +10 -8
  119. truthound_dashboard/schemas/model_monitoring.py +89 -10
  120. truthound_dashboard/schemas/notifications_advanced.py +13 -0
  121. truthound_dashboard/schemas/observability.py +453 -0
  122. truthound_dashboard/schemas/plugins.py +0 -280
  123. truthound_dashboard/schemas/profile.py +154 -247
  124. truthound_dashboard/schemas/quality_reporter.py +403 -0
  125. truthound_dashboard/schemas/reports.py +2 -2
  126. truthound_dashboard/schemas/rule_suggestion.py +8 -1
  127. truthound_dashboard/schemas/scan.py +4 -24
  128. truthound_dashboard/schemas/schedule.py +11 -3
  129. truthound_dashboard/schemas/schema_watcher.py +727 -0
  130. truthound_dashboard/schemas/source.py +17 -2
  131. truthound_dashboard/schemas/tiering.py +822 -0
  132. truthound_dashboard/schemas/triggers.py +16 -0
  133. truthound_dashboard/schemas/unified_alerts.py +7 -0
  134. truthound_dashboard/schemas/validation.py +0 -13
  135. truthound_dashboard/schemas/validators/base.py +41 -21
  136. truthound_dashboard/schemas/validators/business_rule_validators.py +244 -0
  137. truthound_dashboard/schemas/validators/localization_validators.py +273 -0
  138. truthound_dashboard/schemas/validators/ml_feature_validators.py +308 -0
  139. truthound_dashboard/schemas/validators/profiling_validators.py +275 -0
  140. truthound_dashboard/schemas/validators/referential_validators.py +312 -0
  141. truthound_dashboard/schemas/validators/registry.py +93 -8
  142. truthound_dashboard/schemas/validators/timeseries_validators.py +389 -0
  143. truthound_dashboard/schemas/versioning.py +1 -6
  144. truthound_dashboard/static/index.html +2 -2
  145. truthound_dashboard-1.5.0.dist-info/METADATA +309 -0
  146. {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/RECORD +149 -148
  147. truthound_dashboard/core/plugins/hooks/__init__.py +0 -63
  148. truthound_dashboard/core/plugins/hooks/decorators.py +0 -367
  149. truthound_dashboard/core/plugins/hooks/manager.py +0 -403
  150. truthound_dashboard/core/plugins/hooks/protocols.py +0 -265
  151. truthound_dashboard/core/plugins/lifecycle/hot_reload.py +0 -584
  152. truthound_dashboard/core/reporters/junit_reporter.py +0 -233
  153. truthound_dashboard/core/reporters/markdown_reporter.py +0 -207
  154. truthound_dashboard/core/reporters/pdf_reporter.py +0 -209
  155. truthound_dashboard/static/assets/_baseUniq-BcrSP13d.js +0 -1
  156. truthound_dashboard/static/assets/arc-DlYjKwIL.js +0 -1
  157. truthound_dashboard/static/assets/architectureDiagram-VXUJARFQ-Bb2drbQM.js +0 -36
  158. truthound_dashboard/static/assets/blockDiagram-VD42YOAC-BlsPG1CH.js +0 -122
  159. truthound_dashboard/static/assets/c4Diagram-YG6GDRKO-B9JdUoaC.js +0 -10
  160. truthound_dashboard/static/assets/channel-Q6mHF1Hd.js +0 -1
  161. truthound_dashboard/static/assets/chunk-4BX2VUAB-DmyoPVuJ.js +0 -1
  162. truthound_dashboard/static/assets/chunk-55IACEB6-Bcz6Siv8.js +0 -1
  163. truthound_dashboard/static/assets/chunk-B4BG7PRW-Br3G5Rum.js +0 -165
  164. truthound_dashboard/static/assets/chunk-DI55MBZ5-DuM9c23u.js +0 -220
  165. truthound_dashboard/static/assets/chunk-FMBD7UC4-DNU-5mvT.js +0 -15
  166. truthound_dashboard/static/assets/chunk-QN33PNHL-Im2yNcmS.js +0 -1
  167. truthound_dashboard/static/assets/chunk-QZHKN3VN-kZr8XFm1.js +0 -1
  168. truthound_dashboard/static/assets/chunk-TZMSLE5B-Q__360q_.js +0 -1
  169. truthound_dashboard/static/assets/classDiagram-2ON5EDUG-vtixxUyK.js +0 -1
  170. truthound_dashboard/static/assets/classDiagram-v2-WZHVMYZB-vtixxUyK.js +0 -1
  171. truthound_dashboard/static/assets/clone-BOt2LwD0.js +0 -1
  172. truthound_dashboard/static/assets/cose-bilkent-S5V4N54A-CBDw6iac.js +0 -1
  173. truthound_dashboard/static/assets/dagre-6UL2VRFP-XdKqmmY9.js +0 -4
  174. truthound_dashboard/static/assets/diagram-PSM6KHXK-DAZ8nx9V.js +0 -24
  175. truthound_dashboard/static/assets/diagram-QEK2KX5R-BRvDTbGD.js +0 -43
  176. truthound_dashboard/static/assets/diagram-S2PKOQOG-bQcczUkl.js +0 -24
  177. truthound_dashboard/static/assets/erDiagram-Q2GNP2WA-DPje7VMN.js +0 -60
  178. truthound_dashboard/static/assets/flowDiagram-NV44I4VS-B7BVtFVS.js +0 -162
  179. truthound_dashboard/static/assets/ganttDiagram-JELNMOA3-D6WKSS7U.js +0 -267
  180. truthound_dashboard/static/assets/gitGraphDiagram-NY62KEGX-D3vtVd3y.js +0 -65
  181. truthound_dashboard/static/assets/graph-BKgNKZVp.js +0 -1
  182. truthound_dashboard/static/assets/index-C6JSrkHo.css +0 -1
  183. truthound_dashboard/static/assets/index-DkU82VsU.js +0 -1800
  184. truthound_dashboard/static/assets/infoDiagram-WHAUD3N6-DnNCT429.js +0 -2
  185. truthound_dashboard/static/assets/journeyDiagram-XKPGCS4Q-DGiMozqS.js +0 -139
  186. truthound_dashboard/static/assets/kanban-definition-3W4ZIXB7-BV2gUgli.js +0 -89
  187. truthound_dashboard/static/assets/katex-Cu_Erd72.js +0 -261
  188. truthound_dashboard/static/assets/layout-DI2MfQ5G.js +0 -1
  189. truthound_dashboard/static/assets/min-DYdgXVcT.js +0 -1
  190. truthound_dashboard/static/assets/mindmap-definition-VGOIOE7T-C7x4ruxz.js +0 -68
  191. truthound_dashboard/static/assets/pieDiagram-ADFJNKIX-CAJaAB9f.js +0 -30
  192. truthound_dashboard/static/assets/quadrantDiagram-AYHSOK5B-DeqwDI46.js +0 -7
  193. truthound_dashboard/static/assets/requirementDiagram-UZGBJVZJ-e3XDpZIM.js +0 -64
  194. truthound_dashboard/static/assets/sankeyDiagram-TZEHDZUN-CNnAv5Ux.js +0 -10
  195. truthound_dashboard/static/assets/sequenceDiagram-WL72ISMW-Dsne-Of3.js +0 -145
  196. truthound_dashboard/static/assets/stateDiagram-FKZM4ZOC-Ee0sQXyb.js +0 -1
  197. truthound_dashboard/static/assets/stateDiagram-v2-4FDKWEC3-B26KqW_W.js +0 -1
  198. truthound_dashboard/static/assets/timeline-definition-IT6M3QCI-DZYi2yl3.js +0 -61
  199. truthound_dashboard/static/assets/treemap-KMMF4GRG-CY3f8In2.js +0 -128
  200. truthound_dashboard/static/assets/unmerged_dictionaries-Dd7xcPWG.js +0 -1
  201. truthound_dashboard/static/assets/xychartDiagram-PRI3JC2R-CS7fydZZ.js +0 -7
  202. truthound_dashboard-1.4.4.dist-info/METADATA +0 -507
  203. {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/WHEEL +0 -0
  204. {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/entry_points.txt +0 -0
  205. {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -6,18 +6,41 @@ enabling non-blocking validation operations in the FastAPI application.
6
6
  The adapter uses ThreadPoolExecutor to run synchronous truthound
7
7
  functions without blocking the async event loop.
8
8
 
9
+ Architecture:
10
+ Dashboard Services
11
+
12
+ TruthoundAdapter (this module)
13
+
14
+ truthound library (external)
15
+
16
+ The adapter is designed for loose coupling with truthound:
17
+ - Protocol-based interfaces for type checking
18
+ - Graceful fallbacks when truthound versions differ
19
+ - All truthound interactions are isolated in this module
20
+
9
21
  Features:
10
- - Async wrappers for all truthound functions
22
+ - Async wrappers for all truthound functions (check, learn, profile, compare, scan, mask)
23
+ - Support for both file paths and DataSource objects
11
24
  - Automatic sampling for large datasets (100MB+ files)
25
+ - ValidationResult conversion for reporter integration
12
26
  - Configurable sample size and sampling methods
13
27
 
14
28
  Example:
15
29
  adapter = get_adapter()
30
+
31
+ # With file path
16
32
  result = await adapter.check("/path/to/data.csv")
17
- schema = await adapter.learn("/path/to/data.csv")
33
+
34
+ # With DataSource
35
+ from truthound_dashboard.core.datasource_factory import create_datasource
36
+ source = create_datasource({"type": "postgresql", "table": "users", ...})
37
+ result = await adapter.check(source)
18
38
 
19
39
  # With auto-sampling for large files
20
40
  result = await adapter.check_with_sampling("/path/to/large.csv")
41
+
42
+ # Convert to ValidationResult for reporters
43
+ validation_result = result.to_validation_result()
21
44
  """
22
45
 
23
46
  from __future__ import annotations
@@ -28,12 +51,18 @@ from concurrent.futures import ThreadPoolExecutor
28
51
  from dataclasses import dataclass
29
52
  from functools import partial
30
53
  from pathlib import Path
31
- from typing import Any, Protocol, runtime_checkable
54
+ from typing import TYPE_CHECKING, Any, Protocol, Union, runtime_checkable
32
55
 
33
56
  import yaml
34
57
 
58
+ if TYPE_CHECKING:
59
+ from truthound_dashboard.core.datasource_factory import SourceConfig
60
+
35
61
  logger = logging.getLogger(__name__)
36
62
 
63
+ # Type alias for data input - can be path string or DataSource object
64
+ DataInput = Union[str, Any]
65
+
37
66
 
38
67
  @runtime_checkable
39
68
  class TruthoundResult(Protocol):
@@ -47,6 +76,9 @@ class TruthoundResult(Protocol):
47
76
  class CheckResult:
48
77
  """Validation check result.
49
78
 
79
+ This class wraps truthound's Report/ValidationResult and provides
80
+ a consistent interface for the dashboard regardless of truthound version.
81
+
50
82
  Attributes:
51
83
  passed: Whether validation passed (no issues).
52
84
  has_critical: Whether critical issues were found.
@@ -56,10 +88,13 @@ class CheckResult:
56
88
  high_issues: Number of high severity issues.
57
89
  medium_issues: Number of medium severity issues.
58
90
  low_issues: Number of low severity issues.
59
- source: Data source path.
91
+ source: Data source path or name.
60
92
  row_count: Number of rows validated.
61
93
  column_count: Number of columns.
62
94
  issues: List of validation issues.
95
+ run_id: Optional run identifier for tracking.
96
+ run_time: Optional timestamp of the validation run.
97
+ _raw_result: Internal reference to the original truthound result.
63
98
  """
64
99
 
65
100
  passed: bool
@@ -74,10 +109,13 @@ class CheckResult:
74
109
  row_count: int
75
110
  column_count: int
76
111
  issues: list[dict[str, Any]]
112
+ run_id: str | None = None
113
+ run_time: Any = None
114
+ _raw_result: Any = None
77
115
 
78
116
  def to_dict(self) -> dict[str, Any]:
79
117
  """Convert to dictionary."""
80
- return {
118
+ result = {
81
119
  "passed": self.passed,
82
120
  "has_critical": self.has_critical,
83
121
  "has_high": self.has_high,
@@ -91,6 +129,39 @@ class CheckResult:
91
129
  "column_count": self.column_count,
92
130
  "issues": self.issues,
93
131
  }
132
+ if self.run_id:
133
+ result["run_id"] = self.run_id
134
+ if self.run_time:
135
+ result["run_time"] = (
136
+ self.run_time.isoformat()
137
+ if hasattr(self.run_time, "isoformat")
138
+ else str(self.run_time)
139
+ )
140
+ return result
141
+
142
+ def to_validation_result(self) -> Any:
143
+ """Convert to truthound's ValidationResult format for reporters.
144
+
145
+ This enables using truthound's reporters directly with this result.
146
+
147
+ Returns:
148
+ An object that implements the ValidationResult interface expected
149
+ by truthound reporters, or the raw result if available.
150
+ """
151
+ # If we have the raw truthound result, prefer using it
152
+ if self._raw_result is not None:
153
+ # Check if it's already a ValidationResult
154
+ if hasattr(self._raw_result, "results") and hasattr(
155
+ self._raw_result, "run_id"
156
+ ):
157
+ return self._raw_result
158
+ # It's a Report - try to convert
159
+ return self._create_validation_result_mock()
160
+ return self._create_validation_result_mock()
161
+
162
+ def _create_validation_result_mock(self) -> "_ValidationResultMock":
163
+ """Create a mock ValidationResult for reporter compatibility."""
164
+ return _ValidationResultMock(self)
94
165
 
95
166
 
96
167
  @dataclass
@@ -122,32 +193,190 @@ class LearnResult:
122
193
  }
123
194
 
124
195
 
196
+ @dataclass
197
+ class ColumnProfileResult:
198
+ """Column-level profile result matching truthound's ColumnProfile structure.
199
+
200
+ Attributes:
201
+ name: Column name.
202
+ physical_type: Polars data type (string).
203
+ inferred_type: Inferred logical type (e.g., email, phone, integer).
204
+ row_count: Number of rows.
205
+ null_count: Number of null values.
206
+ null_ratio: Ratio of null values (0.0-1.0).
207
+ empty_string_count: Number of empty strings.
208
+ distinct_count: Number of distinct values.
209
+ unique_ratio: Ratio of unique values (0.0-1.0).
210
+ is_unique: Whether all values are unique.
211
+ is_constant: Whether all values are the same.
212
+ distribution: Statistical distribution (for numeric columns).
213
+ top_values: Most frequent values.
214
+ bottom_values: Least frequent values.
215
+ min_length: Minimum string length (for string columns).
216
+ max_length: Maximum string length (for string columns).
217
+ avg_length: Average string length (for string columns).
218
+ detected_patterns: Detected patterns (for string columns).
219
+ min_date: Minimum date (for datetime columns).
220
+ max_date: Maximum date (for datetime columns).
221
+ date_gaps: Number of date gaps (for datetime columns).
222
+ suggested_validators: List of suggested validator names.
223
+ profile_duration_ms: Time taken to profile this column.
224
+ """
225
+
226
+ name: str
227
+ physical_type: str
228
+ inferred_type: str = "unknown"
229
+ row_count: int = 0
230
+ null_count: int = 0
231
+ null_ratio: float = 0.0
232
+ empty_string_count: int = 0
233
+ distinct_count: int = 0
234
+ unique_ratio: float = 0.0
235
+ is_unique: bool = False
236
+ is_constant: bool = False
237
+ distribution: dict[str, Any] | None = None
238
+ top_values: list[dict[str, Any]] | None = None
239
+ bottom_values: list[dict[str, Any]] | None = None
240
+ min_length: int | None = None
241
+ max_length: int | None = None
242
+ avg_length: float | None = None
243
+ detected_patterns: list[dict[str, Any]] | None = None
244
+ min_date: str | None = None
245
+ max_date: str | None = None
246
+ date_gaps: int = 0
247
+ suggested_validators: list[str] | None = None
248
+ profile_duration_ms: float = 0.0
249
+
250
+ def to_dict(self) -> dict[str, Any]:
251
+ """Convert to dictionary."""
252
+ result = {
253
+ "name": self.name,
254
+ "physical_type": self.physical_type,
255
+ "inferred_type": self.inferred_type,
256
+ "row_count": self.row_count,
257
+ "null_count": self.null_count,
258
+ "null_ratio": self.null_ratio,
259
+ "empty_string_count": self.empty_string_count,
260
+ "distinct_count": self.distinct_count,
261
+ "unique_ratio": self.unique_ratio,
262
+ "is_unique": self.is_unique,
263
+ "is_constant": self.is_constant,
264
+ "profile_duration_ms": self.profile_duration_ms,
265
+ }
266
+ if self.distribution:
267
+ result["distribution"] = self.distribution
268
+ if self.top_values:
269
+ result["top_values"] = self.top_values
270
+ if self.bottom_values:
271
+ result["bottom_values"] = self.bottom_values
272
+ if self.min_length is not None:
273
+ result["min_length"] = self.min_length
274
+ result["max_length"] = self.max_length
275
+ result["avg_length"] = self.avg_length
276
+ if self.detected_patterns:
277
+ result["detected_patterns"] = self.detected_patterns
278
+ if self.min_date:
279
+ result["min_date"] = self.min_date
280
+ result["max_date"] = self.max_date
281
+ result["date_gaps"] = self.date_gaps
282
+ if self.suggested_validators:
283
+ result["suggested_validators"] = self.suggested_validators
284
+ return result
285
+
286
+
125
287
  @dataclass
126
288
  class ProfileResult:
127
- """Data profiling result.
289
+ """Data profiling result matching truthound's TableProfile structure.
128
290
 
129
291
  Attributes:
130
- source: Data source path.
292
+ name: Table/source name.
293
+ source: Data source path or name.
131
294
  row_count: Number of rows.
132
295
  column_count: Number of columns.
133
- size_bytes: Data size in bytes.
134
- columns: List of column profile dictionaries.
296
+ estimated_memory_bytes: Estimated memory usage in bytes.
297
+ columns: List of column profile results.
298
+ duplicate_row_count: Number of duplicate rows.
299
+ duplicate_row_ratio: Ratio of duplicate rows.
300
+ correlations: Column correlation pairs with coefficients.
301
+ profiled_at: Timestamp when profile was created.
302
+ profile_duration_ms: Total profiling duration in milliseconds.
303
+ size_bytes: Data size in bytes (backward compatibility).
135
304
  """
136
305
 
306
+ name: str
137
307
  source: str
138
308
  row_count: int
139
309
  column_count: int
140
- size_bytes: int
141
- columns: list[dict[str, Any]]
310
+ estimated_memory_bytes: int
311
+ columns: list[ColumnProfileResult]
312
+ duplicate_row_count: int = 0
313
+ duplicate_row_ratio: float = 0.0
314
+ correlations: list[tuple[str, str, float]] | None = None
315
+ profiled_at: str | None = None
316
+ profile_duration_ms: float = 0.0
317
+ size_bytes: int = 0 # Backward compatibility
142
318
 
143
319
  def to_dict(self) -> dict[str, Any]:
144
320
  """Convert to dictionary."""
145
321
  return {
322
+ "name": self.name,
146
323
  "source": self.source,
147
324
  "row_count": self.row_count,
148
325
  "column_count": self.column_count,
149
- "size_bytes": self.size_bytes,
150
- "columns": self.columns,
326
+ "estimated_memory_bytes": self.estimated_memory_bytes,
327
+ "size_bytes": self.size_bytes or self.estimated_memory_bytes,
328
+ "duplicate_row_count": self.duplicate_row_count,
329
+ "duplicate_row_ratio": self.duplicate_row_ratio,
330
+ "correlations": self.correlations,
331
+ "profiled_at": self.profiled_at,
332
+ "profile_duration_ms": self.profile_duration_ms,
333
+ "columns": [col.to_dict() for col in self.columns],
334
+ }
335
+
336
+ def get_column(self, name: str) -> ColumnProfileResult | None:
337
+ """Get column profile by name."""
338
+ for col in self.columns:
339
+ if col.name == name:
340
+ return col
341
+ return None
342
+
343
+ @property
344
+ def column_names(self) -> list[str]:
345
+ """Get list of column names."""
346
+ return [col.name for col in self.columns]
347
+
348
+
349
+ @dataclass
350
+ class GenerateSuiteResult:
351
+ """Validation suite generation result.
352
+
353
+ Result from generating validation rules based on profile data.
354
+
355
+ Attributes:
356
+ rules: List of generated validation rules.
357
+ rule_count: Total number of rules generated.
358
+ categories: Categories of rules generated.
359
+ strictness: Strictness level used for generation.
360
+ yaml_content: Generated rules as YAML string.
361
+ json_content: Generated rules as JSON-serializable dict.
362
+ """
363
+
364
+ rules: list[dict[str, Any]]
365
+ rule_count: int
366
+ categories: list[str]
367
+ strictness: str
368
+ yaml_content: str
369
+ json_content: dict[str, Any]
370
+
371
+ def to_dict(self) -> dict[str, Any]:
372
+ """Convert to dictionary."""
373
+ return {
374
+ "rules": self.rules,
375
+ "rule_count": self.rule_count,
376
+ "categories": self.categories,
377
+ "strictness": self.strictness,
378
+ "yaml_content": self.yaml_content,
379
+ "json_content": self.json_content,
151
380
  }
152
381
 
153
382
 
@@ -156,8 +385,8 @@ class CompareResult:
156
385
  """Drift comparison result.
157
386
 
158
387
  Attributes:
159
- baseline_source: Baseline data source path.
160
- current_source: Current data source path.
388
+ baseline_source: Baseline data source path or name.
389
+ current_source: Current data source path or name.
161
390
  baseline_rows: Number of rows in baseline.
162
391
  current_rows: Number of rows in current.
163
392
  has_drift: Whether drift was detected.
@@ -197,7 +426,7 @@ class ScanResult:
197
426
  """PII scan result.
198
427
 
199
428
  Attributes:
200
- source: Data source path.
429
+ source: Data source path or name.
201
430
  row_count: Number of rows scanned.
202
431
  column_count: Number of columns.
203
432
  total_columns_scanned: Total columns that were scanned.
@@ -241,7 +470,7 @@ class MaskResult:
241
470
  """Data masking result.
242
471
 
243
472
  Attributes:
244
- source: Original data source path.
473
+ source: Original data source path or name.
245
474
  output_path: Path to the masked output file.
246
475
  row_count: Number of rows in the masked data.
247
476
  column_count: Number of columns in the masked data.
@@ -271,12 +500,30 @@ class MaskResult:
271
500
  }
272
501
 
273
502
 
503
+ def _get_source_name(data: DataInput) -> str:
504
+ """Get source name from data input.
505
+
506
+ Args:
507
+ data: File path string or DataSource object.
508
+
509
+ Returns:
510
+ Source name string.
511
+ """
512
+ if isinstance(data, str):
513
+ return data
514
+ # DataSource objects have a name property
515
+ return getattr(data, "name", str(type(data).__name__))
516
+
517
+
274
518
  class TruthoundAdapter:
275
519
  """Async wrapper for truthound functions.
276
520
 
277
521
  This adapter provides an async interface to truthound operations,
278
522
  running them in a thread pool to avoid blocking the event loop.
279
523
 
524
+ The adapter supports both file paths and DataSource objects for
525
+ validation, profiling, and other operations.
526
+
280
527
  Attributes:
281
528
  max_workers: Maximum number of worker threads.
282
529
  """
@@ -292,15 +539,13 @@ class TruthoundAdapter:
292
539
 
293
540
  async def check(
294
541
  self,
295
- data: str,
542
+ data: DataInput,
296
543
  *,
297
544
  validators: list[str] | None = None,
298
- validator_params: dict[str, dict[str, Any]] | None = None,
545
+ validator_config: dict[str, dict[str, Any]] | None = None,
299
546
  schema: str | None = None,
300
547
  auto_schema: bool = False,
301
- columns: list[str] | None = None,
302
548
  min_severity: str | None = None,
303
- strict: bool = False,
304
549
  parallel: bool = False,
305
550
  max_workers: int | None = None,
306
551
  pushdown: bool | None = None,
@@ -311,16 +556,17 @@ class TruthoundAdapter:
311
556
  All parameters map directly to th.check() for maximum flexibility.
312
557
 
313
558
  Args:
314
- data: Data source path (CSV, Parquet, etc.).
559
+ data: Data source - can be:
560
+ - File path string (CSV, Parquet, JSON, etc.)
561
+ - DataSource object (SQL, Cloud DW, etc.)
315
562
  validators: Optional list of validator names to run.
316
- validator_params: Optional dict of per-validator parameters.
563
+ validator_config: Optional dict of per-validator configuration.
317
564
  Format: {"ValidatorName": {"param1": value1, "param2": value2}}
318
- Example: {"Null": {"columns": ["a", "b"], "mostly": 0.95}}
565
+ Example: {"Null": {"columns": ("a", "b"), "mostly": 0.95}}
566
+ Note: In truthound 2.x, columns should be tuples, not lists.
319
567
  schema: Optional path to schema YAML file.
320
568
  auto_schema: If True, auto-learns schema for validation.
321
- columns: Columns to validate. If None, validates all columns.
322
569
  min_severity: Minimum severity to report ("low", "medium", "high", "critical").
323
- strict: If True, raises exception on validation failures.
324
570
  parallel: If True, uses DAG-based parallel execution.
325
571
  max_workers: Max threads for parallel execution.
326
572
  pushdown: Enable query pushdown for SQL sources. None uses auto-detection.
@@ -331,36 +577,38 @@ class TruthoundAdapter:
331
577
  Raises:
332
578
  ImportError: If truthound is not installed.
333
579
  FileNotFoundError: If data file doesn't exist.
334
- ValidationError: If strict=True and validation fails.
335
580
  """
336
581
  import truthound as th
337
582
 
338
583
  # Build kwargs dynamically to avoid passing None for optional params
339
- # This ensures truthound uses its own defaults when params are not specified
340
- kwargs: dict[str, Any] = {
341
- "validators": validators,
342
- "schema": schema,
343
- "auto_schema": auto_schema,
344
- "parallel": parallel,
345
- }
584
+ # Use 'source' parameter for DataSource objects (truthound 2.x API)
585
+ if isinstance(data, str):
586
+ kwargs: dict[str, Any] = {"data": data}
587
+ else:
588
+ kwargs = {"source": data}
589
+
590
+ kwargs.update(
591
+ {
592
+ "validators": validators,
593
+ "schema": schema,
594
+ "auto_schema": auto_schema,
595
+ "parallel": parallel,
596
+ }
597
+ )
346
598
 
347
- # Add per-validator parameters if provided
348
- if validator_params:
349
- kwargs["validator_params"] = validator_params
599
+ # Add per-validator configuration if provided (truthound 2.x uses validator_config)
600
+ if validator_config:
601
+ kwargs["validator_config"] = validator_config
350
602
 
351
603
  # Only add optional params if explicitly set
352
- if columns is not None:
353
- kwargs["columns"] = columns
354
604
  if min_severity is not None:
355
605
  kwargs["min_severity"] = min_severity
356
- if strict:
357
- kwargs["strict"] = strict
358
606
  if max_workers is not None:
359
607
  kwargs["max_workers"] = max_workers
360
608
  if pushdown is not None:
361
609
  kwargs["pushdown"] = pushdown
362
610
 
363
- func = partial(th.check, data, **kwargs)
611
+ func = partial(th.check, **kwargs)
364
612
 
365
613
  loop = asyncio.get_event_loop()
366
614
  result = await loop.run_in_executor(self._executor, func)
@@ -369,7 +617,7 @@ class TruthoundAdapter:
369
617
 
370
618
  async def learn(
371
619
  self,
372
- source: str,
620
+ source: DataInput,
373
621
  *,
374
622
  infer_constraints: bool = True,
375
623
  categorical_threshold: int | None = None,
@@ -378,23 +626,36 @@ class TruthoundAdapter:
378
626
  """Learn schema from data asynchronously.
379
627
 
380
628
  Uses truthound's th.learn() to analyze data and generate schema.
381
- Supports all th.learn() parameters for maximum flexibility.
629
+ If sample_size is provided, delegates to learn_with_sampling() which
630
+ handles dashboard-level sampling before calling th.learn().
631
+
632
+ Note: th.learn() only supports (data, infer_constraints, categorical_threshold).
633
+ sample_size is handled at dashboard level, not passed to truthound.
382
634
 
383
635
  Args:
384
- source: Data source path.
636
+ source: Data source - can be:
637
+ - File path string
638
+ - DataSource object
385
639
  infer_constraints: If True, infers constraints (min/max, allowed values)
386
640
  from data statistics.
387
641
  categorical_threshold: Maximum unique values for categorical detection.
388
642
  Columns with unique values <= threshold are treated as categorical
389
643
  and will have allowed_values inferred. If None, uses truthound
390
644
  default (20).
391
- sample_size: Number of rows to sample for large datasets.
392
- If None, uses all rows. Sampling improves performance but may
393
- miss rare values.
645
+ sample_size: Sample size for large datasets. Handled at dashboard level
646
+ by pre-sampling data before passing to th.learn().
394
647
 
395
648
  Returns:
396
649
  LearnResult with schema information.
397
650
  """
651
+ if sample_size is not None:
652
+ return await self.learn_with_sampling(
653
+ source,
654
+ infer_constraints=infer_constraints,
655
+ categorical_threshold=categorical_threshold,
656
+ sample_size=sample_size,
657
+ )
658
+
398
659
  import truthound as th
399
660
 
400
661
  # Build kwargs dynamically to let truthound use its defaults when not specified
@@ -402,8 +663,6 @@ class TruthoundAdapter:
402
663
 
403
664
  if categorical_threshold is not None:
404
665
  kwargs["categorical_threshold"] = categorical_threshold
405
- if sample_size is not None:
406
- kwargs["sample_size"] = sample_size
407
666
 
408
667
  func = partial(th.learn, source, **kwargs)
409
668
 
@@ -414,75 +673,329 @@ class TruthoundAdapter:
414
673
 
415
674
  async def profile(
416
675
  self,
417
- source: str,
418
- *,
419
- sample_size: int | None = None,
676
+ source: DataInput,
420
677
  ) -> ProfileResult:
421
678
  """Run data profiling asynchronously.
422
679
 
680
+ Note: truthound's th.profile() only accepts (data, source) parameters.
681
+ Advanced configuration options are NOT supported by the underlying library.
682
+
423
683
  Args:
424
- source: Data source path.
425
- sample_size: Maximum number of rows to sample for profiling.
426
- If None, profiles all data. Useful for large datasets.
684
+ source: Data source - can be:
685
+ - File path string
686
+ - DataSource object
427
687
 
428
688
  Returns:
429
689
  ProfileResult with profiling information.
430
690
  """
431
691
  import truthound as th
432
692
 
433
- # Build kwargs dynamically to let truthound use its defaults
434
- kwargs: dict[str, Any] = {}
435
- if sample_size is not None:
436
- kwargs["sample_size"] = sample_size
437
-
438
- func = partial(th.profile, source, **kwargs)
693
+ func = partial(th.profile, source)
439
694
 
440
695
  loop = asyncio.get_event_loop()
441
696
  result = await loop.run_in_executor(self._executor, func)
697
+ return self._convert_profile_result(result)
698
+
699
+ async def profile_advanced(
700
+ self,
701
+ source: DataInput,
702
+ *,
703
+ config: dict[str, Any] | None = None,
704
+ ) -> ProfileResult:
705
+ """Run advanced data profiling with full ProfilerConfig support.
706
+
707
+ This method provides direct access to all ProfilerConfig options
708
+ through a configuration dictionary.
709
+
710
+ Note: DataProfiler.profile() only accepts LazyFrame, so file paths
711
+ are converted to LazyFrame first. For simple profiling without
712
+ advanced config, use profile() method instead.
713
+
714
+ Args:
715
+ source: Data source - file path string or DataSource object.
716
+ config: ProfilerConfig options as dictionary. Supported keys:
717
+ - sample_size: int | None (max rows to sample)
718
+ - random_seed: int (default 42)
719
+ - include_patterns: bool (default True)
720
+ - include_correlations: bool (default False)
721
+ - include_distributions: bool (default True)
722
+ - top_n_values: int (default 10)
723
+ - pattern_sample_size: int (default 1000)
724
+ - correlation_threshold: float (default 0.7)
725
+ - min_pattern_match_ratio: float (default 0.8)
726
+ - n_jobs: int (default 1)
727
+
728
+ Returns:
729
+ ProfileResult with comprehensive profiling information.
730
+
731
+ Raises:
732
+ ImportError: If truthound.profiler module is not available.
733
+ """
734
+ import polars as pl
735
+
736
+ from truthound.profiler import DataProfiler, ProfilerConfig
737
+
738
+ config = config or {}
739
+
740
+ profiler_config = ProfilerConfig(
741
+ sample_size=config.get("sample_size"),
742
+ random_seed=config.get("random_seed", 42),
743
+ include_patterns=config.get("include_patterns", True),
744
+ include_correlations=config.get("include_correlations", False),
745
+ include_distributions=config.get("include_distributions", True),
746
+ top_n_values=config.get("top_n_values", 10),
747
+ pattern_sample_size=config.get("pattern_sample_size", 1000),
748
+ correlation_threshold=config.get("correlation_threshold", 0.7),
749
+ min_pattern_match_ratio=config.get("min_pattern_match_ratio", 0.8),
750
+ n_jobs=config.get("n_jobs", 1),
751
+ )
752
+
753
+ profiler = DataProfiler(config=profiler_config)
754
+
755
+ # DataProfiler.profile() only accepts LazyFrame
756
+ # Convert file path to LazyFrame
757
+ if isinstance(source, str):
758
+ # Determine file format and create LazyFrame
759
+ source_lower = source.lower()
760
+ if source_lower.endswith(".csv"):
761
+ lf = pl.scan_csv(source)
762
+ elif source_lower.endswith(".parquet"):
763
+ lf = pl.scan_parquet(source)
764
+ elif source_lower.endswith((".json", ".ndjson", ".jsonl")):
765
+ lf = pl.scan_ndjson(source)
766
+ else:
767
+ # Fallback to th.profile() for unsupported formats
768
+ import truthound as th
769
+
770
+ func = partial(th.profile, source)
771
+ loop = asyncio.get_event_loop()
772
+ result = await loop.run_in_executor(self._executor, func)
773
+ return self._convert_profile_result(result)
774
+
775
+ func = partial(profiler.profile, lf, name=source, source=source)
776
+ elif hasattr(source, "lazy"):
777
+ # DataFrame with .lazy() method
778
+ func = partial(profiler.profile, source.lazy())
779
+ elif hasattr(source, "collect"):
780
+ # Already a LazyFrame
781
+ func = partial(profiler.profile, source)
782
+ else:
783
+ # Fallback to th.profile() for other types
784
+ import truthound as th
442
785
 
786
+ func = partial(th.profile, source)
787
+ loop = asyncio.get_event_loop()
788
+ result = await loop.run_in_executor(self._executor, func)
789
+ return self._convert_profile_result(result)
790
+
791
+ loop = asyncio.get_event_loop()
792
+ result = await loop.run_in_executor(self._executor, func)
443
793
  return self._convert_profile_result(result)
444
794
 
445
- async def scan(
795
+ async def generate_suite(
446
796
  self,
447
- data: str,
797
+ profile: ProfileResult | dict[str, Any],
448
798
  *,
449
- columns: list[str] | None = None,
450
- regulations: list[str] | None = None,
451
- min_confidence: float = 0.8,
452
- ) -> ScanResult:
453
- """Run PII scan on data asynchronously.
799
+ strictness: str = "medium",
800
+ preset: str = "default",
801
+ include: list[str] | None = None,
802
+ exclude: list[str] | None = None,
803
+ output_format: str = "yaml",
804
+ ) -> GenerateSuiteResult:
805
+ """Generate validation suite from profile.
454
806
 
455
- Uses truthound's th.scan() to detect personally identifiable information
456
- and check compliance with privacy regulations.
807
+ Uses truthound's generate_suite() to automatically create validation
808
+ rules based on profiled data characteristics.
457
809
 
458
810
  Args:
459
- data: Data source path (CSV, Parquet, etc.).
460
- columns: Optional list of columns to scan. If None, scans all columns.
461
- regulations: Optional list of regulations to check compliance.
462
- Supported: "gdpr", "ccpa", "lgpd"
463
- min_confidence: Minimum confidence threshold for PII detection (0.0-1.0).
464
- Default is 0.8.
811
+ profile: Profile result from profile() or profile_advanced(),
812
+ or a dictionary representation of a profile.
813
+ strictness: Strictness level for rule generation:
814
+ - "loose": Permissive thresholds, fewer rules
815
+ - "medium": Balanced defaults (default)
816
+ - "strict": Tight thresholds, comprehensive rules
817
+ preset: Configuration preset for rule generation:
818
+ - "default": General purpose
819
+ - "strict": Production data
820
+ - "loose": Development/testing
821
+ - "minimal": Essential rules only
822
+ - "comprehensive": All available rules
823
+ - "ci_cd": Optimized for CI/CD pipelines
824
+ - "schema_only": Structure validation only
825
+ - "format_only": Format/pattern rules only
826
+ include: List of rule categories to include (None = all).
827
+ Categories: schema, stats, pattern, completeness, uniqueness, distribution
828
+ exclude: List of rule categories to exclude.
829
+ output_format: Output format ("yaml", "json", "python").
465
830
 
466
831
  Returns:
467
- ScanResult with PII findings and regulation violations.
832
+ GenerateSuiteResult with generated rules.
468
833
 
469
834
  Raises:
470
- ImportError: If truthound is not installed.
471
- FileNotFoundError: If data file doesn't exist.
835
+ ImportError: If truthound.profiler module is not available.
472
836
  """
473
- import truthound as th
837
+ from truthound.profiler import generate_suite
838
+ from truthound.profiler.generators import Strictness
839
+
840
+ # Convert strictness string to enum
841
+ strictness_map = {
842
+ "loose": Strictness.LOOSE,
843
+ "medium": Strictness.MEDIUM,
844
+ "strict": Strictness.STRICT,
845
+ }
846
+ strictness_enum = strictness_map.get(strictness.lower(), Strictness.MEDIUM)
847
+
848
+ # Convert ProfileResult to dict if needed
849
+ if isinstance(profile, ProfileResult):
850
+ profile_data = profile.to_dict()
851
+ else:
852
+ profile_data = profile
474
853
 
475
- # Build kwargs dynamically to let truthound use its defaults
854
+ # Build kwargs
476
855
  kwargs: dict[str, Any] = {
477
- "min_confidence": min_confidence,
856
+ "strictness": strictness_enum,
857
+ "preset": preset,
478
858
  }
859
+ if include:
860
+ kwargs["include"] = include
861
+ if exclude:
862
+ kwargs["exclude"] = exclude
479
863
 
480
- if columns is not None:
481
- kwargs["columns"] = columns
482
- if regulations is not None:
483
- kwargs["regulations"] = regulations
864
+ # Generate suite in thread pool
865
+ def _generate():
866
+ return generate_suite(profile_data, **kwargs)
867
+
868
+ loop = asyncio.get_event_loop()
869
+ suite = await loop.run_in_executor(self._executor, _generate)
870
+
871
+ return self._convert_suite_result(suite, strictness, output_format)
872
+
873
+ async def generate_suite_from_source(
874
+ self,
875
+ source: DataInput,
876
+ *,
877
+ strictness: str = "medium",
878
+ preset: str = "default",
879
+ include: list[str] | None = None,
880
+ exclude: list[str] | None = None,
881
+ sample_size: int | None = None,
882
+ include_patterns: bool = True,
883
+ ) -> GenerateSuiteResult:
884
+ """Profile a source and generate validation suite in one step.
885
+
886
+ Convenience method that combines profile() and generate_suite().
887
+
888
+ Args:
889
+ source: Data source - file path string or DataSource object.
890
+ strictness: Strictness level ("loose", "medium", "strict").
891
+ preset: Rule generation preset.
892
+ include: Rule categories to include.
893
+ exclude: Rule categories to exclude.
894
+ sample_size: Number of rows to sample for profiling.
895
+ include_patterns: Enable pattern detection during profiling.
896
+
897
+ Returns:
898
+ GenerateSuiteResult with generated rules.
899
+ """
900
+ # Profile the source first
901
+ profile = await self.profile(
902
+ source,
903
+ sample_size=sample_size,
904
+ include_patterns=include_patterns,
905
+ )
906
+
907
+ # Generate suite from profile
908
+ return await self.generate_suite(
909
+ profile,
910
+ strictness=strictness,
911
+ preset=preset,
912
+ include=include,
913
+ exclude=exclude,
914
+ )
915
+
916
+ def _convert_suite_result(
917
+ self,
918
+ suite: Any,
919
+ strictness: str,
920
+ output_format: str,
921
+ ) -> GenerateSuiteResult:
922
+ """Convert truthound ValidationSuite to GenerateSuiteResult.
923
+
924
+ Args:
925
+ suite: ValidationSuite from generate_suite().
926
+ strictness: Strictness level used.
927
+ output_format: Requested output format.
928
+
929
+ Returns:
930
+ GenerateSuiteResult.
931
+ """
932
+ # Extract rules from suite
933
+ rules = []
934
+ categories = set()
935
+
936
+ if hasattr(suite, "rules"):
937
+ for rule in suite.rules:
938
+ rule_dict = {
939
+ "name": getattr(rule, "name", ""),
940
+ "validator": getattr(rule, "validator", ""),
941
+ "column": getattr(rule, "column", None),
942
+ "params": getattr(rule, "params", {}),
943
+ "severity": getattr(rule, "severity", "medium"),
944
+ "category": getattr(rule, "category", "unknown"),
945
+ }
946
+ rules.append(rule_dict)
947
+ if rule_dict["category"]:
948
+ categories.add(rule_dict["category"])
949
+
950
+ # Generate YAML content
951
+ yaml_content = ""
952
+ if hasattr(suite, "to_yaml"):
953
+ yaml_content = suite.to_yaml()
954
+ else:
955
+ yaml_content = yaml.dump(
956
+ {"rules": rules},
957
+ default_flow_style=False,
958
+ sort_keys=False,
959
+ allow_unicode=True,
960
+ )
961
+
962
+ # Generate JSON content
963
+ json_content = {"rules": rules}
964
+ if hasattr(suite, "to_dict"):
965
+ json_content = suite.to_dict()
966
+
967
+ return GenerateSuiteResult(
968
+ rules=rules,
969
+ rule_count=len(rules),
970
+ categories=sorted(categories),
971
+ strictness=strictness,
972
+ yaml_content=yaml_content,
973
+ json_content=json_content,
974
+ )
975
+
976
+ async def scan(self, data: DataInput) -> ScanResult:
977
+ """Run PII scan on data asynchronously.
978
+
979
+ Uses truthound's th.scan() to detect personally identifiable information.
980
+
981
+ Note: truthound's th.scan() does not support any configuration parameters.
982
+ The scan runs on all columns with default settings.
983
+
984
+ Args:
985
+ data: Data source - can be:
986
+ - File path string (CSV, Parquet, etc.)
987
+ - DataSource object
988
+
989
+ Returns:
990
+ ScanResult with PII findings.
991
+
992
+ Raises:
993
+ ImportError: If truthound is not installed.
994
+ FileNotFoundError: If data file doesn't exist.
995
+ """
996
+ import truthound as th
484
997
 
485
- func = partial(th.scan, data, **kwargs)
998
+ func = partial(th.scan, data)
486
999
 
487
1000
  loop = asyncio.get_event_loop()
488
1001
  result = await loop.run_in_executor(self._executor, func)
@@ -491,20 +1004,19 @@ class TruthoundAdapter:
491
1004
 
492
1005
  async def compare(
493
1006
  self,
494
- baseline: str,
495
- current: str,
1007
+ baseline: DataInput,
1008
+ current: DataInput,
496
1009
  *,
497
1010
  columns: list[str] | None = None,
498
1011
  method: str = "auto",
499
1012
  threshold: float | None = None,
500
- correction: str | None = None,
501
1013
  sample_size: int | None = None,
502
1014
  ) -> CompareResult:
503
1015
  """Compare two datasets for drift detection.
504
1016
 
505
1017
  Args:
506
- baseline: Reference data path.
507
- current: Current data path to compare.
1018
+ baseline: Reference data - can be path string or DataSource.
1019
+ current: Current data to compare - can be path string or DataSource.
508
1020
  columns: Optional list of columns to compare. If None, all common columns.
509
1021
  method: Detection method. Supported methods:
510
1022
  - "auto": Smart selection (numeric → PSI, categorical → chi2)
@@ -518,12 +1030,6 @@ class TruthoundAdapter:
518
1030
  - "anderson": Anderson-Darling (tail-weighted)
519
1031
  threshold: Optional custom threshold for drift detection.
520
1032
  Defaults vary by method: KS/chi2/cvm/anderson=0.05, PSI/JS/KL/wasserstein=0.1
521
- correction: Multiple testing correction method:
522
- - None: Use truthound default (bh for multiple columns)
523
- - "none": No correction
524
- - "bonferroni": Conservative, independent tests
525
- - "holm": Sequential adjustment
526
- - "bh": Benjamini-Hochberg FDR control
527
1033
  sample_size: Optional sample size for large datasets.
528
1034
 
529
1035
  Returns:
@@ -531,17 +1037,13 @@ class TruthoundAdapter:
531
1037
  """
532
1038
  import truthound as th
533
1039
 
534
- # Build kwargs dynamically to avoid passing None for optional params
535
1040
  kwargs: dict[str, Any] = {
536
1041
  "columns": columns,
537
1042
  "method": method,
538
1043
  }
539
1044
 
540
- # Only add optional params if explicitly set
541
1045
  if threshold is not None:
542
1046
  kwargs["threshold"] = threshold
543
- if correction is not None:
544
- kwargs["correction"] = correction
545
1047
  if sample_size is not None:
546
1048
  kwargs["sample_size"] = sample_size
547
1049
 
@@ -554,7 +1056,7 @@ class TruthoundAdapter:
554
1056
 
555
1057
  async def mask(
556
1058
  self,
557
- data: str,
1059
+ data: DataInput,
558
1060
  output: str,
559
1061
  *,
560
1062
  columns: list[str] | None = None,
@@ -566,7 +1068,9 @@ class TruthoundAdapter:
566
1068
  three strategies: redact, hash, and fake.
567
1069
 
568
1070
  Args:
569
- data: Data source path (CSV, Parquet, etc.).
1071
+ data: Data source - can be:
1072
+ - File path string (CSV, Parquet, etc.)
1073
+ - DataSource object
570
1074
  output: Output file path for the masked data.
571
1075
  columns: Optional list of columns to mask. If None, auto-detects PII.
572
1076
  strategy: Masking strategy:
@@ -607,10 +1111,10 @@ class TruthoundAdapter:
607
1111
 
608
1112
  async def check_with_sampling(
609
1113
  self,
610
- data: str,
1114
+ data: DataInput,
611
1115
  *,
612
1116
  validators: list[str] | None = None,
613
- validator_params: dict[str, dict[str, Any]] | None = None,
1117
+ validator_config: dict[str, dict[str, Any]] | None = None,
614
1118
  schema: str | None = None,
615
1119
  auto_schema: bool = False,
616
1120
  columns: list[str] | None = None,
@@ -628,9 +1132,14 @@ class TruthoundAdapter:
628
1132
  before running validation, which significantly improves performance
629
1133
  while maintaining validation accuracy for most use cases.
630
1134
 
1135
+ Note: Sampling is only applied to file-based sources. DataSource
1136
+ objects handle their own data fetching and should use query-level
1137
+ sampling if needed.
1138
+
631
1139
  Args:
632
- data: Data source path (CSV, Parquet, etc.).
1140
+ data: Data source - can be file path or DataSource.
633
1141
  validators: Optional list of validator names to run.
1142
+ validator_config: Optional dict of per-validator configuration.
634
1143
  schema: Optional path to schema YAML file.
635
1144
  auto_schema: If True, auto-learns schema for validation.
636
1145
  columns: Columns to validate. If None, validates all columns.
@@ -649,40 +1158,42 @@ class TruthoundAdapter:
649
1158
  The result.row_count reflects the sampled row count when sampling
650
1159
  was performed. Check the sampling metadata for original row count.
651
1160
  """
652
- from truthound_dashboard.core.sampling import SamplingMethod, get_sampler
653
-
654
- sampler = get_sampler()
655
-
656
- # Check if sampling is needed and perform if so
657
- path = Path(data)
658
- if path.exists() and sampler.needs_sampling(path):
659
- # Determine sampling method
660
- method = None
661
- if sampling_method:
662
- try:
663
- method = SamplingMethod(sampling_method)
664
- except ValueError:
665
- logger.warning(f"Unknown sampling method: {sampling_method}")
666
-
667
- # Perform sampling
668
- sample_result = await sampler.auto_sample(
669
- path,
670
- n=sample_size,
671
- method=method,
672
- )
673
-
674
- if sample_result.was_sampled:
675
- logger.info(
676
- f"Sampled {sample_result.sampled_rows} rows from "
677
- f"{sample_result.original_rows} ({sample_result.size_reduction_pct:.1f}% reduction)"
1161
+ # Only apply sampling to file paths
1162
+ if isinstance(data, str):
1163
+ from truthound_dashboard.core.sampling import SamplingMethod, get_sampler
1164
+
1165
+ sampler = get_sampler()
1166
+
1167
+ # Check if sampling is needed and perform if so
1168
+ path = Path(data)
1169
+ if path.exists() and sampler.needs_sampling(path):
1170
+ # Determine sampling method
1171
+ method = None
1172
+ if sampling_method:
1173
+ try:
1174
+ method = SamplingMethod(sampling_method)
1175
+ except ValueError:
1176
+ logger.warning(f"Unknown sampling method: {sampling_method}")
1177
+
1178
+ # Perform sampling
1179
+ sample_result = await sampler.auto_sample(
1180
+ path,
1181
+ n=sample_size,
1182
+ method=method,
678
1183
  )
679
- data = sample_result.sampled_path
1184
+
1185
+ if sample_result.was_sampled:
1186
+ logger.info(
1187
+ f"Sampled {sample_result.sampled_rows} rows from "
1188
+ f"{sample_result.original_rows} ({sample_result.size_reduction_pct:.1f}% reduction)"
1189
+ )
1190
+ data = sample_result.sampled_path
680
1191
 
681
1192
  # Run validation on (possibly sampled) data
682
1193
  return await self.check(
683
1194
  data,
684
1195
  validators=validators,
685
- validator_params=validator_params,
1196
+ validator_config=validator_config,
686
1197
  schema=schema,
687
1198
  auto_schema=auto_schema,
688
1199
  columns=columns,
@@ -695,7 +1206,7 @@ class TruthoundAdapter:
695
1206
 
696
1207
  async def learn_with_sampling(
697
1208
  self,
698
- source: str,
1209
+ source: DataInput,
699
1210
  *,
700
1211
  infer_constraints: bool = True,
701
1212
  categorical_threshold: int | None = None,
@@ -706,8 +1217,10 @@ class TruthoundAdapter:
706
1217
  This method first applies dashboard-level sampling for very large files,
707
1218
  then passes the sample_size to th.learn() if specified.
708
1219
 
1220
+ Note: Sampling is only applied to file-based sources.
1221
+
709
1222
  Args:
710
- source: Data source path.
1223
+ source: Data source - can be file path or DataSource.
711
1224
  infer_constraints: If True, infer constraints from statistics.
712
1225
  categorical_threshold: Maximum unique values for categorical detection.
713
1226
  sample_size: Number of rows to sample. Used both for dashboard sampling
@@ -716,57 +1229,138 @@ class TruthoundAdapter:
716
1229
  Returns:
717
1230
  LearnResult with schema information.
718
1231
  """
719
- from truthound_dashboard.core.sampling import get_sampler
720
-
721
- sampler = get_sampler()
722
-
723
- # Sample if needed (dashboard-level sampling for very large files)
724
- path = Path(source)
725
- if path.exists() and sampler.needs_sampling(path):
726
- sample_result = await sampler.auto_sample(path, n=sample_size)
727
- if sample_result.was_sampled:
728
- logger.info(
729
- f"Sampled {sample_result.sampled_rows} rows for schema learning"
730
- )
731
- source = sample_result.sampled_path
732
-
1232
+ # Only apply sampling to file paths
1233
+ if isinstance(source, str):
1234
+ from truthound_dashboard.core.sampling import get_sampler
1235
+
1236
+ sampler = get_sampler()
1237
+
1238
+ path = Path(source)
1239
+ if path.exists() and sampler.needs_sampling(path):
1240
+ sample_result = await sampler.auto_sample(path, n=sample_size)
1241
+ if sample_result.was_sampled:
1242
+ logger.info(
1243
+ f"Sampled {sample_result.sampled_rows} rows for schema learning"
1244
+ )
1245
+ source = sample_result.sampled_path
1246
+
1247
+ # sample_size already handled by dashboard-level sampling above,
1248
+ # do NOT pass it to self.learn() — th.learn() doesn't support it
733
1249
  return await self.learn(
734
1250
  source,
735
1251
  infer_constraints=infer_constraints,
736
1252
  categorical_threshold=categorical_threshold,
737
- sample_size=sample_size,
738
1253
  )
739
1254
 
740
1255
  async def profile_with_sampling(
741
1256
  self,
742
- source: str,
1257
+ source: DataInput,
743
1258
  *,
744
1259
  sample_size: int | None = None,
1260
+ include_patterns: bool = True,
1261
+ include_correlations: bool = False,
745
1262
  ) -> ProfileResult:
746
1263
  """Run data profiling with automatic sampling for large datasets.
747
1264
 
1265
+ Note: Sampling is only applied to file-based sources.
1266
+
748
1267
  Args:
749
- source: Data source path.
1268
+ source: Data source - can be file path or DataSource.
750
1269
  sample_size: Number of rows to sample. Uses config default if not specified.
1270
+ include_patterns: Enable pattern detection. Default True.
1271
+ include_correlations: Calculate correlations. Default False.
751
1272
 
752
1273
  Returns:
753
1274
  ProfileResult with profiling information.
754
1275
  """
755
- from truthound_dashboard.core.sampling import get_sampler
1276
+ # Only apply sampling to file paths
1277
+ if isinstance(source, str):
1278
+ from truthound_dashboard.core.sampling import get_sampler
1279
+
1280
+ sampler = get_sampler()
1281
+
1282
+ path = Path(source)
1283
+ if path.exists() and sampler.needs_sampling(path):
1284
+ sample_result = await sampler.auto_sample(path, n=sample_size)
1285
+ if sample_result.was_sampled:
1286
+ logger.info(
1287
+ f"Sampled {sample_result.sampled_rows} rows for profiling"
1288
+ )
1289
+ source = sample_result.sampled_path
1290
+
1291
+ return await self.profile(
1292
+ source,
1293
+ sample_size=sample_size,
1294
+ include_patterns=include_patterns,
1295
+ include_correlations=include_correlations,
1296
+ )
1297
+
1298
+ async def check_from_config(
1299
+ self,
1300
+ source_config: "SourceConfig | dict[str, Any]",
1301
+ *,
1302
+ validators: list[str] | None = None,
1303
+ validator_config: dict[str, dict[str, Any]] | None = None,
1304
+ schema: str | None = None,
1305
+ auto_schema: bool = False,
1306
+ columns: list[str] | None = None,
1307
+ min_severity: str | None = None,
1308
+ strict: bool = False,
1309
+ parallel: bool = False,
1310
+ max_workers: int | None = None,
1311
+ pushdown: bool | None = None,
1312
+ ) -> CheckResult:
1313
+ """Run validation using source configuration.
756
1314
 
757
- sampler = get_sampler()
1315
+ This convenience method creates a DataSource from config
1316
+ and runs validation.
758
1317
 
759
- # Sample if needed
760
- path = Path(source)
761
- if path.exists() and sampler.needs_sampling(path):
762
- sample_result = await sampler.auto_sample(path, n=sample_size)
763
- if sample_result.was_sampled:
764
- logger.info(
765
- f"Sampled {sample_result.sampled_rows} rows for profiling"
766
- )
767
- source = sample_result.sampled_path
1318
+ Args:
1319
+ source_config: Source configuration (SourceConfig or dict).
1320
+ validators: Optional list of validator names to run.
1321
+ validator_config: Optional dict of per-validator configuration.
1322
+ schema: Optional path to schema YAML file.
1323
+ auto_schema: If True, auto-learns schema for validation.
1324
+ columns: Columns to validate.
1325
+ min_severity: Minimum severity to report.
1326
+ strict: If True, raises exception on validation failures.
1327
+ parallel: If True, uses parallel execution.
1328
+ max_workers: Max threads for parallel execution.
1329
+ pushdown: Enable query pushdown for SQL sources.
1330
+
1331
+ Returns:
1332
+ CheckResult with validation results.
1333
+ """
1334
+ from truthound_dashboard.core.datasource_factory import (
1335
+ SourceConfig,
1336
+ SourceType,
1337
+ create_datasource,
1338
+ )
1339
+
1340
+ if isinstance(source_config, dict):
1341
+ config = SourceConfig.from_dict(source_config)
1342
+ else:
1343
+ config = source_config
1344
+
1345
+ # For file sources, use path directly
1346
+ if SourceType.is_file_type(config.source_type) and config.path:
1347
+ data: DataInput = config.path
1348
+ else:
1349
+ data = create_datasource(config)
768
1350
 
769
- return await self.profile(source)
1351
+ return await self.check(
1352
+ data,
1353
+ validators=validators,
1354
+ validator_config=validator_config,
1355
+ schema=schema,
1356
+ auto_schema=auto_schema,
1357
+ columns=columns,
1358
+ min_severity=min_severity,
1359
+ strict=strict,
1360
+ parallel=parallel,
1361
+ max_workers=max_workers,
1362
+ pushdown=pushdown,
1363
+ )
770
1364
 
771
1365
  def _convert_check_result(self, result: Any) -> CheckResult:
772
1366
  """Convert truthound Report to CheckResult.
@@ -779,7 +1373,15 @@ class TruthoundAdapter:
779
1373
  - has_issues: bool
780
1374
  - has_critical: bool
781
1375
  - has_high: bool
1376
+
1377
+ Also handles truthound 2.x ValidationResult format with:
1378
+ - run_id: str
1379
+ - run_time: datetime
1380
+ - results: list[ValidatorResult]
1381
+ - statistics: ResultStatistics
782
1382
  """
1383
+ from datetime import datetime
1384
+
783
1385
  issues = result.issues
784
1386
  severity_counts = {"critical": 0, "high": 0, "medium": 0, "low": 0}
785
1387
 
@@ -797,10 +1399,19 @@ class TruthoundAdapter:
797
1399
  "details": getattr(issue, "details", None),
798
1400
  "expected": getattr(issue, "expected", None),
799
1401
  "actual": getattr(issue, "actual", None),
1402
+ "validator_name": getattr(issue, "validator_name", issue.issue_type),
1403
+ "message": getattr(issue, "message", ""),
1404
+ "sample_values": getattr(issue, "sample_values", None),
800
1405
  }
801
1406
  for issue in issues
802
1407
  ]
803
1408
 
1409
+ # Extract run_id and run_time if available (truthound 2.x)
1410
+ run_id = getattr(result, "run_id", None)
1411
+ run_time = getattr(result, "run_time", None)
1412
+ if run_time is None:
1413
+ run_time = datetime.now()
1414
+
804
1415
  return CheckResult(
805
1416
  passed=not result.has_issues,
806
1417
  has_critical=result.has_critical,
@@ -814,6 +1425,9 @@ class TruthoundAdapter:
814
1425
  row_count=result.row_count,
815
1426
  column_count=result.column_count,
816
1427
  issues=converted_issues,
1428
+ run_id=run_id,
1429
+ run_time=run_time,
1430
+ _raw_result=result, # Store raw result for reporter integration
817
1431
  )
818
1432
 
819
1433
  def _convert_learn_result(self, result: Any) -> LearnResult:
@@ -842,35 +1456,241 @@ class TruthoundAdapter:
842
1456
  )
843
1457
 
844
1458
  def _convert_profile_result(self, result: Any) -> ProfileResult:
845
- """Convert truthound ProfileReport to ProfileResult.
1459
+ """Convert truthound TableProfile/ProfileReport to ProfileResult.
1460
+
1461
+ The truthound TableProfile (new API) contains:
1462
+ - name: str
1463
+ - row_count: int
1464
+ - column_count: int
1465
+ - estimated_memory_bytes: int
1466
+ - columns: tuple[ColumnProfile, ...]
1467
+ - duplicate_row_count: int
1468
+ - duplicate_row_ratio: float
1469
+ - correlations: tuple[tuple[str, str, float], ...]
1470
+ - source: str
1471
+ - profiled_at: datetime
1472
+ - profile_duration_ms: float
1473
+
1474
+ Each ColumnProfile contains:
1475
+ - name: str
1476
+ - physical_type: str
1477
+ - inferred_type: DataType enum
1478
+ - row_count, null_count, null_ratio, empty_string_count
1479
+ - distinct_count, unique_ratio, is_unique, is_constant
1480
+ - distribution: DistributionStats | None
1481
+ - top_values, bottom_values: tuple[ValueFrequency, ...]
1482
+ - min_length, max_length, avg_length (string columns)
1483
+ - detected_patterns: tuple[PatternMatch, ...]
1484
+ - min_date, max_date, date_gaps (datetime columns)
1485
+ - suggested_validators: tuple[str, ...]
1486
+ - profile_duration_ms: float
1487
+
1488
+ Also supports legacy ProfileReport format for backward compatibility.
1489
+ """
1490
+ # Check if this is the new TableProfile or legacy ProfileReport
1491
+ if hasattr(result, "estimated_memory_bytes"):
1492
+ # New TableProfile format
1493
+ return self._convert_table_profile(result)
1494
+ else:
1495
+ # Legacy ProfileReport format - convert to new structure
1496
+ return self._convert_legacy_profile(result)
1497
+
1498
+ def _convert_table_profile(self, result: Any) -> ProfileResult:
1499
+ """Convert new truthound TableProfile to ProfileResult."""
1500
+ from datetime import datetime
1501
+
1502
+ columns = []
1503
+ for col in result.columns:
1504
+ # Extract distribution stats if present
1505
+ distribution = None
1506
+ if col.distribution:
1507
+ distribution = {
1508
+ "mean": getattr(col.distribution, "mean", None),
1509
+ "std": getattr(col.distribution, "std", None),
1510
+ "min": getattr(col.distribution, "min", None),
1511
+ "max": getattr(col.distribution, "max", None),
1512
+ "median": getattr(col.distribution, "median", None),
1513
+ "q1": getattr(col.distribution, "q1", None),
1514
+ "q3": getattr(col.distribution, "q3", None),
1515
+ "skewness": getattr(col.distribution, "skewness", None),
1516
+ "kurtosis": getattr(col.distribution, "kurtosis", None),
1517
+ }
1518
+
1519
+ # Convert top_values
1520
+ top_values = None
1521
+ if col.top_values:
1522
+ top_values = [
1523
+ {
1524
+ "value": str(v.value) if v.value is not None else None,
1525
+ "count": v.count,
1526
+ "ratio": v.ratio,
1527
+ }
1528
+ for v in col.top_values
1529
+ ]
1530
+
1531
+ # Convert bottom_values
1532
+ bottom_values = None
1533
+ if col.bottom_values:
1534
+ bottom_values = [
1535
+ {
1536
+ "value": str(v.value) if v.value is not None else None,
1537
+ "count": v.count,
1538
+ "ratio": v.ratio,
1539
+ }
1540
+ for v in col.bottom_values
1541
+ ]
1542
+
1543
+ # Convert detected_patterns
1544
+ detected_patterns = None
1545
+ if col.detected_patterns:
1546
+ detected_patterns = [
1547
+ {
1548
+ "pattern": getattr(p, "pattern", None),
1549
+ "regex": getattr(p, "regex", None),
1550
+ "match_ratio": getattr(p, "match_ratio", 0.0),
1551
+ "sample_matches": list(getattr(p, "sample_matches", [])),
1552
+ }
1553
+ for p in col.detected_patterns
1554
+ ]
1555
+
1556
+ # Get inferred type value
1557
+ inferred_type = "unknown"
1558
+ if hasattr(col, "inferred_type"):
1559
+ inferred_type = (
1560
+ col.inferred_type.value
1561
+ if hasattr(col.inferred_type, "value")
1562
+ else str(col.inferred_type)
1563
+ )
1564
+
1565
+ # Convert datetime fields to ISO strings
1566
+ min_date = None
1567
+ max_date = None
1568
+ if col.min_date:
1569
+ min_date = (
1570
+ col.min_date.isoformat()
1571
+ if isinstance(col.min_date, datetime)
1572
+ else str(col.min_date)
1573
+ )
1574
+ if col.max_date:
1575
+ max_date = (
1576
+ col.max_date.isoformat()
1577
+ if isinstance(col.max_date, datetime)
1578
+ else str(col.max_date)
1579
+ )
1580
+
1581
+ col_result = ColumnProfileResult(
1582
+ name=col.name,
1583
+ physical_type=col.physical_type,
1584
+ inferred_type=inferred_type,
1585
+ row_count=col.row_count,
1586
+ null_count=col.null_count,
1587
+ null_ratio=col.null_ratio,
1588
+ empty_string_count=col.empty_string_count,
1589
+ distinct_count=col.distinct_count,
1590
+ unique_ratio=col.unique_ratio,
1591
+ is_unique=col.is_unique,
1592
+ is_constant=col.is_constant,
1593
+ distribution=distribution,
1594
+ top_values=top_values,
1595
+ bottom_values=bottom_values,
1596
+ min_length=col.min_length,
1597
+ max_length=col.max_length,
1598
+ avg_length=col.avg_length,
1599
+ detected_patterns=detected_patterns,
1600
+ min_date=min_date,
1601
+ max_date=max_date,
1602
+ date_gaps=col.date_gaps,
1603
+ suggested_validators=list(col.suggested_validators)
1604
+ if col.suggested_validators
1605
+ else None,
1606
+ profile_duration_ms=col.profile_duration_ms,
1607
+ )
1608
+ columns.append(col_result)
1609
+
1610
+ # Convert correlations
1611
+ correlations = None
1612
+ if result.correlations:
1613
+ correlations = [
1614
+ (c[0], c[1], c[2]) for c in result.correlations
1615
+ ]
1616
+
1617
+ # Get profiled_at as ISO string
1618
+ profiled_at = None
1619
+ if hasattr(result, "profiled_at") and result.profiled_at:
1620
+ profiled_at = (
1621
+ result.profiled_at.isoformat()
1622
+ if isinstance(result.profiled_at, datetime)
1623
+ else str(result.profiled_at)
1624
+ )
1625
+
1626
+ return ProfileResult(
1627
+ name=getattr(result, "name", ""),
1628
+ source=getattr(result, "source", ""),
1629
+ row_count=result.row_count,
1630
+ column_count=result.column_count,
1631
+ estimated_memory_bytes=result.estimated_memory_bytes,
1632
+ columns=columns,
1633
+ duplicate_row_count=result.duplicate_row_count,
1634
+ duplicate_row_ratio=result.duplicate_row_ratio,
1635
+ correlations=correlations,
1636
+ profiled_at=profiled_at,
1637
+ profile_duration_ms=getattr(result, "profile_duration_ms", 0.0),
1638
+ size_bytes=result.estimated_memory_bytes,
1639
+ )
1640
+
1641
+ def _convert_legacy_profile(self, result: Any) -> ProfileResult:
1642
+ """Convert legacy truthound ProfileReport to ProfileResult.
846
1643
 
847
- The truthound ProfileReport contains:
1644
+ Legacy ProfileReport contains:
848
1645
  - source: str
849
1646
  - row_count: int
850
1647
  - column_count: int
851
1648
  - size_bytes: int
852
- - columns: list[dict]
1649
+ - columns: list[dict] with name, dtype, null_pct, unique_pct, min, max, mean, std
853
1650
  """
854
- columns = [
855
- {
856
- "name": col["name"],
857
- "dtype": col["dtype"],
858
- "null_pct": col.get("null_pct", "0%"),
859
- "unique_pct": col.get("unique_pct", "0%"),
860
- "min": col.get("min"),
861
- "max": col.get("max"),
862
- "mean": col.get("mean"),
863
- "std": col.get("std"),
864
- }
865
- for col in result.columns
866
- ]
1651
+ columns = []
1652
+ for col in result.columns:
1653
+ # Parse null_pct and unique_pct
1654
+ null_ratio = 0.0
1655
+ unique_ratio = 0.0
1656
+ if isinstance(col.get("null_pct"), str):
1657
+ null_ratio = float(col["null_pct"].rstrip("%")) / 100.0
1658
+ elif isinstance(col.get("null_pct"), (int, float)):
1659
+ null_ratio = float(col["null_pct"])
1660
+ if isinstance(col.get("unique_pct"), str):
1661
+ unique_ratio = float(col["unique_pct"].rstrip("%")) / 100.0
1662
+ elif isinstance(col.get("unique_pct"), (int, float)):
1663
+ unique_ratio = float(col["unique_pct"])
1664
+
1665
+ # Build distribution if numeric stats present
1666
+ distribution = None
1667
+ if col.get("min") is not None or col.get("mean") is not None:
1668
+ distribution = {
1669
+ "min": col.get("min"),
1670
+ "max": col.get("max"),
1671
+ "mean": col.get("mean"),
1672
+ "std": col.get("std"),
1673
+ }
1674
+
1675
+ col_result = ColumnProfileResult(
1676
+ name=col["name"],
1677
+ physical_type=col.get("dtype", "unknown"),
1678
+ inferred_type=col.get("dtype", "unknown"),
1679
+ row_count=result.row_count,
1680
+ null_ratio=null_ratio,
1681
+ unique_ratio=unique_ratio,
1682
+ distribution=distribution,
1683
+ )
1684
+ columns.append(col_result)
867
1685
 
868
1686
  return ProfileResult(
1687
+ name=getattr(result, "source", ""),
869
1688
  source=result.source,
870
1689
  row_count=result.row_count,
871
1690
  column_count=result.column_count,
872
- size_bytes=result.size_bytes,
1691
+ estimated_memory_bytes=getattr(result, "size_bytes", 0),
873
1692
  columns=columns,
1693
+ size_bytes=getattr(result, "size_bytes", 0),
874
1694
  )
875
1695
 
876
1696
  def _convert_scan_result(self, result: Any) -> ScanResult:
@@ -887,7 +1707,7 @@ class TruthoundAdapter:
887
1707
  Each PIIFinding has:
888
1708
  - column: str
889
1709
  - pii_type: str
890
- - confidence: float
1710
+ - confidence: float (0-100)
891
1711
  - sample_count: int
892
1712
  - sample_values: list[str] (optional)
893
1713
 
@@ -897,19 +1717,41 @@ class TruthoundAdapter:
897
1717
  - pii_type: str
898
1718
  - message: str
899
1719
  - severity: str (optional)
1720
+
1721
+ Args:
1722
+ result: truthound PIIReport object.
1723
+
1724
+ Returns:
1725
+ ScanResult with PII findings.
900
1726
  """
901
1727
  # Convert findings to dictionaries
902
1728
  findings = []
903
1729
  columns_with_pii = set()
904
1730
  for finding in result.findings:
905
- columns_with_pii.add(finding.column)
1731
+ # Handle both dict and object-style findings
1732
+ if isinstance(finding, dict):
1733
+ confidence = finding.get("confidence", 0)
1734
+ column = finding.get("column", "")
1735
+ pii_type = finding.get("pii_type", "unknown")
1736
+ sample_count = finding.get("count", finding.get("sample_count", 0))
1737
+ sample_values = finding.get("sample_values")
1738
+ else:
1739
+ confidence = getattr(finding, "confidence", 0)
1740
+ column = getattr(finding, "column", "")
1741
+ pii_type = getattr(finding, "pii_type", "unknown")
1742
+ sample_count = getattr(finding, "sample_count", getattr(finding, "count", 0))
1743
+ sample_values = getattr(finding, "sample_values", None)
1744
+
1745
+ columns_with_pii.add(column)
1746
+ # Normalize confidence to 0-1 range if it's in 0-100 range
1747
+ normalized_confidence = confidence / 100.0 if confidence > 1 else confidence
906
1748
  findings.append(
907
1749
  {
908
- "column": finding.column,
909
- "pii_type": finding.pii_type,
910
- "confidence": finding.confidence,
911
- "sample_count": finding.sample_count,
912
- "sample_values": getattr(finding, "sample_values", None),
1750
+ "column": column,
1751
+ "pii_type": pii_type,
1752
+ "confidence": normalized_confidence,
1753
+ "sample_count": sample_count,
1754
+ "sample_values": sample_values,
913
1755
  }
914
1756
  )
915
1757
 
@@ -926,11 +1768,14 @@ class TruthoundAdapter:
926
1768
  }
927
1769
  )
928
1770
 
1771
+ # Get column_count with fallback (not present in some truthound versions)
1772
+ column_count = getattr(result, "column_count", len(columns_with_pii) if columns_with_pii else 0)
1773
+
929
1774
  return ScanResult(
930
1775
  source=result.source,
931
1776
  row_count=result.row_count,
932
- column_count=result.column_count,
933
- total_columns_scanned=result.column_count,
1777
+ column_count=column_count,
1778
+ total_columns_scanned=column_count,
934
1779
  columns_with_pii=len(columns_with_pii),
935
1780
  total_findings=len(findings),
936
1781
  has_violations=getattr(result, "has_violations", len(violations) > 0),
@@ -992,7 +1837,7 @@ class TruthoundAdapter:
992
1837
 
993
1838
  def _convert_mask_result(
994
1839
  self,
995
- source: str,
1840
+ source: DataInput,
996
1841
  output: str,
997
1842
  masked_df: Any,
998
1843
  strategy: str,
@@ -1001,7 +1846,7 @@ class TruthoundAdapter:
1001
1846
  """Convert truthound mask result to MaskResult.
1002
1847
 
1003
1848
  Args:
1004
- source: Original data source path.
1849
+ source: Original data source (path or DataSource).
1005
1850
  output: Output file path.
1006
1851
  masked_df: Polars DataFrame with masked data.
1007
1852
  strategy: Masking strategy used.
@@ -1033,7 +1878,7 @@ class TruthoundAdapter:
1033
1878
  masked_df.write_csv(output)
1034
1879
 
1035
1880
  return MaskResult(
1036
- source=source,
1881
+ source=_get_source_name(source),
1037
1882
  output_path=str(output_path.absolute()),
1038
1883
  row_count=row_count,
1039
1884
  column_count=len(all_columns),
@@ -1047,7 +1892,272 @@ class TruthoundAdapter:
1047
1892
  self._executor.shutdown(wait=False)
1048
1893
 
1049
1894
 
1050
- # Singleton instance
1895
+ # =============================================================================
1896
+ # ValidationResult Mock for Reporter Integration
1897
+ # =============================================================================
1898
+
1899
+
1900
+ class _ValidationResultMock:
1901
+ """Mock object that mimics truthound's ValidationResult interface.
1902
+
1903
+ This enables using truthound reporters with CheckResult objects from
1904
+ this adapter, maintaining loose coupling with the truthound library.
1905
+
1906
+ The mock provides compatibility with truthound reporters that expect:
1907
+ - ValidationResult from truthound.stores.results (new API)
1908
+ - Report from truthound.report (legacy API)
1909
+ """
1910
+
1911
+ def __init__(self, check_result: CheckResult) -> None:
1912
+ from datetime import datetime
1913
+
1914
+ self._result = check_result
1915
+ self._results = [
1916
+ _ValidatorResultMock(issue) for issue in check_result.issues
1917
+ ]
1918
+ self._statistics = _ResultStatisticsMock(check_result)
1919
+ self._run_time = check_result.run_time or datetime.now()
1920
+
1921
+ # === ValidationResult interface (new API) ===
1922
+
1923
+ @property
1924
+ def run_id(self) -> str:
1925
+ return self._result.run_id or f"run-{id(self._result)}"
1926
+
1927
+ @property
1928
+ def run_time(self) -> Any:
1929
+ return self._run_time
1930
+
1931
+ @property
1932
+ def data_asset(self) -> str:
1933
+ return self._result.source
1934
+
1935
+ @property
1936
+ def status(self) -> "_ResultStatusMock":
1937
+ return _ResultStatusMock(self._result.passed)
1938
+
1939
+ @property
1940
+ def success(self) -> bool:
1941
+ return self._result.passed
1942
+
1943
+ @property
1944
+ def results(self) -> list["_ValidatorResultMock"]:
1945
+ return self._results
1946
+
1947
+ @property
1948
+ def statistics(self) -> "_ResultStatisticsMock":
1949
+ return self._statistics
1950
+
1951
+ @property
1952
+ def tags(self) -> dict[str, Any]:
1953
+ return {}
1954
+
1955
+ # === Report interface (legacy API) ===
1956
+
1957
+ @property
1958
+ def source(self) -> str:
1959
+ return self._result.source
1960
+
1961
+ @property
1962
+ def row_count(self) -> int:
1963
+ return self._result.row_count
1964
+
1965
+ @property
1966
+ def column_count(self) -> int:
1967
+ return self._result.column_count
1968
+
1969
+ @property
1970
+ def issues(self) -> list["_ValidatorResultMock"]:
1971
+ return self._results
1972
+
1973
+ @property
1974
+ def has_issues(self) -> bool:
1975
+ return self._result.total_issues > 0
1976
+
1977
+ @property
1978
+ def has_critical(self) -> bool:
1979
+ return self._result.has_critical
1980
+
1981
+ @property
1982
+ def has_high(self) -> bool:
1983
+ return self._result.has_high
1984
+
1985
+ @property
1986
+ def suite_name(self) -> str:
1987
+ return "Truthound Validation"
1988
+
1989
+ def to_dict(self) -> dict[str, Any]:
1990
+ return {
1991
+ "run_id": self.run_id,
1992
+ "run_time": (
1993
+ self._run_time.isoformat()
1994
+ if hasattr(self._run_time, "isoformat")
1995
+ else str(self._run_time)
1996
+ ),
1997
+ "data_asset": self.data_asset,
1998
+ "status": self.status.value,
1999
+ "success": self.success,
2000
+ "results": [r.to_dict() for r in self.results],
2001
+ "statistics": self._statistics.to_dict(),
2002
+ }
2003
+
2004
+ def to_json(self, indent: int | None = 2) -> str:
2005
+ import json
2006
+
2007
+ return json.dumps(self.to_dict(), indent=indent, default=str)
2008
+
2009
+
2010
+ class _ResultStatusMock:
2011
+ """Mock ResultStatus enum for reporter compatibility."""
2012
+
2013
+ def __init__(self, passed: bool) -> None:
2014
+ self._passed = passed
2015
+
2016
+ @property
2017
+ def value(self) -> str:
2018
+ return "SUCCESS" if self._passed else "FAILURE"
2019
+
2020
+ def __str__(self) -> str:
2021
+ return self.value
2022
+
2023
+
2024
+ class _ResultStatisticsMock:
2025
+ """Mock ResultStatistics for reporter compatibility."""
2026
+
2027
+ def __init__(self, check_result: CheckResult) -> None:
2028
+ self._result = check_result
2029
+
2030
+ @property
2031
+ def total_issues(self) -> int:
2032
+ return self._result.total_issues
2033
+
2034
+ @property
2035
+ def total_rows(self) -> int:
2036
+ return self._result.row_count
2037
+
2038
+ @property
2039
+ def total_columns(self) -> int:
2040
+ return self._result.column_count
2041
+
2042
+ @property
2043
+ def critical_count(self) -> int:
2044
+ return self._result.critical_issues
2045
+
2046
+ @property
2047
+ def high_count(self) -> int:
2048
+ return self._result.high_issues
2049
+
2050
+ @property
2051
+ def medium_count(self) -> int:
2052
+ return self._result.medium_issues
2053
+
2054
+ @property
2055
+ def low_count(self) -> int:
2056
+ return self._result.low_issues
2057
+
2058
+ @property
2059
+ def passed(self) -> bool:
2060
+ return self._result.passed
2061
+
2062
+ def to_dict(self) -> dict[str, Any]:
2063
+ return {
2064
+ "total_issues": self.total_issues,
2065
+ "total_rows": self.total_rows,
2066
+ "total_columns": self.total_columns,
2067
+ "critical_count": self.critical_count,
2068
+ "high_count": self.high_count,
2069
+ "medium_count": self.medium_count,
2070
+ "low_count": self.low_count,
2071
+ "passed": self.passed,
2072
+ }
2073
+
2074
+
2075
+ class _ValidatorResultMock:
2076
+ """Mock ValidatorResult for reporter compatibility."""
2077
+
2078
+ def __init__(self, issue: dict[str, Any]) -> None:
2079
+ self._issue = issue
2080
+
2081
+ @property
2082
+ def validator_name(self) -> str:
2083
+ return self._issue.get("validator_name") or self._issue.get("issue_type", "")
2084
+
2085
+ @property
2086
+ def column(self) -> str | None:
2087
+ return self._issue.get("column")
2088
+
2089
+ @property
2090
+ def issue_type(self) -> str:
2091
+ return self._issue.get("issue_type", "")
2092
+
2093
+ @property
2094
+ def severity(self) -> "_SeverityMock":
2095
+ return _SeverityMock(self._issue.get("severity", "medium"))
2096
+
2097
+ @property
2098
+ def message(self) -> str:
2099
+ return self._issue.get("message", "")
2100
+
2101
+ @property
2102
+ def count(self) -> int:
2103
+ return self._issue.get("count", 0)
2104
+
2105
+ @property
2106
+ def success(self) -> bool:
2107
+ return False # All issues are failures
2108
+
2109
+ @property
2110
+ def expected(self) -> Any:
2111
+ return self._issue.get("expected")
2112
+
2113
+ @property
2114
+ def actual(self) -> Any:
2115
+ return self._issue.get("actual")
2116
+
2117
+ @property
2118
+ def details(self) -> dict[str, Any]:
2119
+ return self._issue.get("details") or {}
2120
+
2121
+ @property
2122
+ def sample_values(self) -> list[Any]:
2123
+ return self._issue.get("sample_values") or []
2124
+
2125
+ def to_dict(self) -> dict[str, Any]:
2126
+ return {
2127
+ "validator_name": self.validator_name,
2128
+ "column": self.column,
2129
+ "issue_type": self.issue_type,
2130
+ "severity": self.severity.value,
2131
+ "message": self.message,
2132
+ "count": self.count,
2133
+ "success": self.success,
2134
+ "expected": self.expected,
2135
+ "actual": self.actual,
2136
+ "details": self.details,
2137
+ "sample_values": self.sample_values,
2138
+ }
2139
+
2140
+
2141
+ class _SeverityMock:
2142
+ """Mock Severity enum for reporter compatibility."""
2143
+
2144
+ def __init__(self, value: str) -> None:
2145
+ self._value = value.lower() if isinstance(value, str) else str(value).lower()
2146
+
2147
+ @property
2148
+ def value(self) -> str:
2149
+ return self._value
2150
+
2151
+ def __str__(self) -> str:
2152
+ return self._value
2153
+
2154
+
2155
+ # =============================================================================
2156
+ # Singleton Management
2157
+ # =============================================================================
2158
+
2159
+
2160
+ # Singleton instance
1051
2161
  _adapter: TruthoundAdapter | None = None
1052
2162
 
1053
2163
 
@@ -1072,3 +2182,1316 @@ def reset_adapter() -> None:
1072
2182
  if _adapter is not None:
1073
2183
  _adapter.shutdown()
1074
2184
  _adapter = None
2185
+
2186
+
2187
+ # =============================================================================
2188
+ # Schema Evolution API (truthound.profiler.evolution)
2189
+ # =============================================================================
2190
+
2191
+
2192
+ @dataclass
2193
+ class SchemaChangeResult:
2194
+ """Schema change detection result.
2195
+
2196
+ Represents a single detected change between schema versions.
2197
+
2198
+ Attributes:
2199
+ change_type: Type of change (column_added, column_removed, type_changed, etc.)
2200
+ column_name: Name of the affected column.
2201
+ old_value: Previous value (type, nullable, etc.)
2202
+ new_value: New value.
2203
+ severity: Change severity (info, warning, critical).
2204
+ breaking: Whether this is a breaking change.
2205
+ description: Human-readable description.
2206
+ migration_hint: Suggestion for handling the change.
2207
+ """
2208
+
2209
+ change_type: str
2210
+ column_name: str
2211
+ old_value: Any
2212
+ new_value: Any
2213
+ severity: str
2214
+ breaking: bool
2215
+ description: str
2216
+ migration_hint: str | None = None
2217
+
2218
+ def to_dict(self) -> dict[str, Any]:
2219
+ """Convert to dictionary."""
2220
+ return {
2221
+ "change_type": self.change_type,
2222
+ "column_name": self.column_name,
2223
+ "old_value": self.old_value,
2224
+ "new_value": self.new_value,
2225
+ "severity": self.severity,
2226
+ "breaking": self.breaking,
2227
+ "description": self.description,
2228
+ "migration_hint": self.migration_hint,
2229
+ }
2230
+
2231
+
2232
+ @dataclass
2233
+ class SchemaDetectionResult:
2234
+ """Schema evolution detection result.
2235
+
2236
+ Result from comparing two schemas.
2237
+
2238
+ Attributes:
2239
+ total_changes: Total number of changes detected.
2240
+ breaking_changes: Number of breaking changes.
2241
+ compatibility_level: Compatibility assessment (compatible, minor, breaking).
2242
+ changes: List of individual changes.
2243
+ """
2244
+
2245
+ total_changes: int
2246
+ breaking_changes: int
2247
+ compatibility_level: str
2248
+ changes: list[SchemaChangeResult]
2249
+
2250
+ def to_dict(self) -> dict[str, Any]:
2251
+ """Convert to dictionary."""
2252
+ return {
2253
+ "total_changes": self.total_changes,
2254
+ "breaking_changes": self.breaking_changes,
2255
+ "compatibility_level": self.compatibility_level,
2256
+ "changes": [c.to_dict() for c in self.changes],
2257
+ }
2258
+
2259
+
2260
+ @dataclass
2261
+ class RenameDetectionResult:
2262
+ """Column rename detection result.
2263
+
2264
+ Attributes:
2265
+ old_name: Original column name.
2266
+ new_name: New column name.
2267
+ similarity: Similarity score (0.0-1.0).
2268
+ confidence: Confidence level (high, medium, low).
2269
+ reasons: Reasons for the rename detection.
2270
+ """
2271
+
2272
+ old_name: str
2273
+ new_name: str
2274
+ similarity: float
2275
+ confidence: str
2276
+ reasons: list[str]
2277
+
2278
+ def to_dict(self) -> dict[str, Any]:
2279
+ """Convert to dictionary."""
2280
+ return {
2281
+ "old_name": self.old_name,
2282
+ "new_name": self.new_name,
2283
+ "similarity": self.similarity,
2284
+ "confidence": self.confidence,
2285
+ "reasons": self.reasons,
2286
+ }
2287
+
2288
+
2289
+ @dataclass
2290
+ class RenameDetectionSummary:
2291
+ """Summary of rename detection results.
2292
+
2293
+ Attributes:
2294
+ confirmed_renames: High-confidence confirmed renames.
2295
+ possible_renames: Lower-confidence possible renames.
2296
+ unmatched_added: Columns added without rename match.
2297
+ unmatched_removed: Columns removed without rename match.
2298
+ """
2299
+
2300
+ confirmed_renames: list[RenameDetectionResult]
2301
+ possible_renames: list[RenameDetectionResult]
2302
+ unmatched_added: list[str]
2303
+ unmatched_removed: list[str]
2304
+
2305
+ def to_dict(self) -> dict[str, Any]:
2306
+ """Convert to dictionary."""
2307
+ return {
2308
+ "confirmed_renames": [r.to_dict() for r in self.confirmed_renames],
2309
+ "possible_renames": [r.to_dict() for r in self.possible_renames],
2310
+ "unmatched_added": self.unmatched_added,
2311
+ "unmatched_removed": self.unmatched_removed,
2312
+ }
2313
+
2314
+
2315
+ @dataclass
2316
+ class SchemaVersionResult:
2317
+ """Schema version information.
2318
+
2319
+ Attributes:
2320
+ id: Version identifier (hash or version string).
2321
+ version: Version string (e.g., "1.0.0", "20260129.143000").
2322
+ schema: Schema dictionary.
2323
+ metadata: Optional metadata.
2324
+ created_at: Creation timestamp.
2325
+ has_breaking_changes: Whether this version has breaking changes from parent.
2326
+ changes_from_parent: List of changes from parent version.
2327
+ """
2328
+
2329
+ id: str
2330
+ version: str
2331
+ schema: dict[str, Any]
2332
+ metadata: dict[str, Any] | None
2333
+ created_at: str | None
2334
+ has_breaking_changes: bool = False
2335
+ changes_from_parent: list[SchemaChangeResult] | None = None
2336
+
2337
+ def to_dict(self) -> dict[str, Any]:
2338
+ """Convert to dictionary."""
2339
+ return {
2340
+ "id": self.id,
2341
+ "version": self.version,
2342
+ "schema": self.schema,
2343
+ "metadata": self.metadata,
2344
+ "created_at": self.created_at,
2345
+ "has_breaking_changes": self.has_breaking_changes,
2346
+ "changes_from_parent": (
2347
+ [c.to_dict() for c in self.changes_from_parent]
2348
+ if self.changes_from_parent
2349
+ else None
2350
+ ),
2351
+ }
2352
+
2353
+
2354
+ @dataclass
2355
+ class SchemaDiffResult:
2356
+ """Schema diff between two versions.
2357
+
2358
+ Attributes:
2359
+ from_version: Source version string.
2360
+ to_version: Target version string.
2361
+ changes: List of changes.
2362
+ text_diff: Human-readable text diff.
2363
+ """
2364
+
2365
+ from_version: str
2366
+ to_version: str
2367
+ changes: list[SchemaChangeResult]
2368
+ text_diff: str
2369
+
2370
+ def to_dict(self) -> dict[str, Any]:
2371
+ """Convert to dictionary."""
2372
+ return {
2373
+ "from_version": self.from_version,
2374
+ "to_version": self.to_version,
2375
+ "changes": [c.to_dict() for c in self.changes],
2376
+ "text_diff": self.text_diff,
2377
+ }
2378
+
2379
+
2380
+ @dataclass
2381
+ class SchemaWatcherEvent:
2382
+ """Schema watcher change event.
2383
+
2384
+ Attributes:
2385
+ source: Source name that changed.
2386
+ has_breaking_changes: Whether breaking changes were detected.
2387
+ total_changes: Total number of changes.
2388
+ changes: List of changes.
2389
+ timestamp: Event timestamp.
2390
+ """
2391
+
2392
+ source: str
2393
+ has_breaking_changes: bool
2394
+ total_changes: int
2395
+ changes: list[SchemaChangeResult]
2396
+ timestamp: str
2397
+
2398
+ def to_dict(self) -> dict[str, Any]:
2399
+ """Convert to dictionary."""
2400
+ return {
2401
+ "source": self.source,
2402
+ "has_breaking_changes": self.has_breaking_changes,
2403
+ "total_changes": self.total_changes,
2404
+ "changes": [c.to_dict() for c in self.changes],
2405
+ "timestamp": self.timestamp,
2406
+ }
2407
+
2408
+
2409
+ @dataclass
2410
+ class BreakingChangeAlert:
2411
+ """Breaking change alert with impact analysis.
2412
+
2413
+ Attributes:
2414
+ alert_id: Unique alert identifier.
2415
+ title: Alert title.
2416
+ source: Source name.
2417
+ changes: List of breaking changes.
2418
+ impact_scope: Impact scope (local, downstream, system).
2419
+ affected_consumers: List of affected consumers.
2420
+ data_risk_level: Risk level (1-5).
2421
+ recommendations: List of recommendations.
2422
+ status: Alert status (open, acknowledged, resolved).
2423
+ created_at: Creation timestamp.
2424
+ acknowledged_at: Acknowledgment timestamp.
2425
+ resolved_at: Resolution timestamp.
2426
+ """
2427
+
2428
+ alert_id: str
2429
+ title: str
2430
+ source: str
2431
+ changes: list[SchemaChangeResult]
2432
+ impact_scope: str
2433
+ affected_consumers: list[str]
2434
+ data_risk_level: int
2435
+ recommendations: list[str]
2436
+ status: str
2437
+ created_at: str
2438
+ acknowledged_at: str | None = None
2439
+ resolved_at: str | None = None
2440
+
2441
+ def to_dict(self) -> dict[str, Any]:
2442
+ """Convert to dictionary."""
2443
+ return {
2444
+ "alert_id": self.alert_id,
2445
+ "title": self.title,
2446
+ "source": self.source,
2447
+ "changes": [c.to_dict() for c in self.changes],
2448
+ "impact_scope": self.impact_scope,
2449
+ "affected_consumers": self.affected_consumers,
2450
+ "data_risk_level": self.data_risk_level,
2451
+ "recommendations": self.recommendations,
2452
+ "status": self.status,
2453
+ "created_at": self.created_at,
2454
+ "acknowledged_at": self.acknowledged_at,
2455
+ "resolved_at": self.resolved_at,
2456
+ }
2457
+
2458
+
2459
+ class SchemaEvolutionAdapter:
2460
+ """Async wrapper for truthound schema evolution functions.
2461
+
2462
+ This adapter provides an async interface to truthound's schema evolution
2463
+ module (truthound.profiler.evolution), including:
2464
+ - SchemaEvolutionDetector for change detection
2465
+ - SchemaHistory for version management
2466
+ - SchemaWatcher for continuous monitoring
2467
+ - ColumnRenameDetector for rename detection
2468
+ - BreakingChangeAlertManager for alert management
2469
+ - ImpactAnalyzer for impact analysis
2470
+
2471
+ All operations run in a thread pool to avoid blocking the event loop.
2472
+ """
2473
+
2474
+ def __init__(self, max_workers: int = 4) -> None:
2475
+ """Initialize adapter.
2476
+
2477
+ Args:
2478
+ max_workers: Maximum worker threads for concurrent operations.
2479
+ """
2480
+ self._executor = ThreadPoolExecutor(max_workers=max_workers)
2481
+ self._watchers: dict[str, Any] = {} # watcher_id -> SchemaWatcher
2482
+ self._histories: dict[str, Any] = {} # history_id -> SchemaHistory
2483
+ self._alert_manager: Any = None
2484
+ self._impact_analyzer: Any = None
2485
+
2486
+ async def detect_changes(
2487
+ self,
2488
+ current_schema: dict[str, Any],
2489
+ baseline_schema: dict[str, Any],
2490
+ *,
2491
+ detect_renames: bool = True,
2492
+ rename_similarity_threshold: float = 0.8,
2493
+ ) -> SchemaDetectionResult:
2494
+ """Detect schema changes between two schemas.
2495
+
2496
+ Uses truthound's SchemaEvolutionDetector for comprehensive change
2497
+ detection including column additions, removals, type changes, and renames.
2498
+
2499
+ Args:
2500
+ current_schema: Current schema dictionary ({"column": "Type"}).
2501
+ baseline_schema: Baseline schema dictionary.
2502
+ detect_renames: Enable rename detection.
2503
+ rename_similarity_threshold: Threshold for considering a rename (0.0-1.0).
2504
+
2505
+ Returns:
2506
+ SchemaDetectionResult with all detected changes.
2507
+ """
2508
+ from truthound.profiler.evolution import SchemaEvolutionDetector
2509
+
2510
+ def _detect():
2511
+ detector = SchemaEvolutionDetector(
2512
+ detect_renames=detect_renames,
2513
+ rename_similarity_threshold=rename_similarity_threshold,
2514
+ )
2515
+ changes = detector.detect_changes(current_schema, baseline_schema)
2516
+ summary = detector.get_change_summary(changes)
2517
+ return changes, summary
2518
+
2519
+ loop = asyncio.get_event_loop()
2520
+ changes, summary = await loop.run_in_executor(self._executor, _detect)
2521
+
2522
+ return self._convert_detection_result(changes, summary)
2523
+
2524
+ async def detect_renames(
2525
+ self,
2526
+ added_columns: dict[str, str],
2527
+ removed_columns: dict[str, str],
2528
+ *,
2529
+ similarity_threshold: float = 0.8,
2530
+ require_type_match: bool = True,
2531
+ allow_compatible_types: bool = True,
2532
+ algorithm: str = "composite",
2533
+ ) -> RenameDetectionSummary:
2534
+ """Detect column renames between added and removed columns.
2535
+
2536
+ Uses truthound's ColumnRenameDetector with configurable similarity
2537
+ algorithms for accurate rename detection.
2538
+
2539
+ Args:
2540
+ added_columns: Dict of added columns {"name": "Type"}.
2541
+ removed_columns: Dict of removed columns {"name": "Type"}.
2542
+ similarity_threshold: Threshold for considering a rename (0.0-1.0).
2543
+ require_type_match: Require matching types for rename.
2544
+ allow_compatible_types: Allow compatible type changes (e.g., Int32->Int64).
2545
+ algorithm: Similarity algorithm:
2546
+ - "composite": Weighted combination (default)
2547
+ - "levenshtein": Edit distance
2548
+ - "jaro_winkler": Short strings, prefixes
2549
+ - "ngram": Partial matches
2550
+ - "token": snake_case/camelCase names
2551
+
2552
+ Returns:
2553
+ RenameDetectionSummary with confirmed and possible renames.
2554
+ """
2555
+ from truthound.profiler.evolution import ColumnRenameDetector
2556
+
2557
+ def _detect():
2558
+ detector = ColumnRenameDetector(
2559
+ similarity_threshold=similarity_threshold,
2560
+ require_type_match=require_type_match,
2561
+ allow_compatible_types=allow_compatible_types,
2562
+ )
2563
+ return detector.detect(
2564
+ added_columns=added_columns,
2565
+ removed_columns=removed_columns,
2566
+ )
2567
+
2568
+ loop = asyncio.get_event_loop()
2569
+ result = await loop.run_in_executor(self._executor, _detect)
2570
+
2571
+ return self._convert_rename_result(result)
2572
+
2573
+ async def create_history(
2574
+ self,
2575
+ history_id: str,
2576
+ storage_path: str,
2577
+ *,
2578
+ version_strategy: str = "semantic",
2579
+ max_versions: int = 100,
2580
+ compress: bool = True,
2581
+ ) -> str:
2582
+ """Create a new schema history storage.
2583
+
2584
+ Uses truthound's SchemaHistory for version management with support
2585
+ for semantic, incremental, timestamp, and git versioning strategies.
2586
+
2587
+ Args:
2588
+ history_id: Unique identifier for this history instance.
2589
+ storage_path: Path for file-based storage.
2590
+ version_strategy: Version numbering strategy:
2591
+ - "semantic": 1.2.3 format, auto-bumps based on change type
2592
+ - "incremental": 1, 2, 3 simple numbers
2593
+ - "timestamp": 20260128.143052 time-based
2594
+ - "git": a1b2c3d4 git-like hashes
2595
+ max_versions: Maximum versions to keep.
2596
+ compress: Compress stored files.
2597
+
2598
+ Returns:
2599
+ History ID for future operations.
2600
+ """
2601
+ from truthound.profiler.evolution import SchemaHistory
2602
+
2603
+ def _create():
2604
+ return SchemaHistory.create(
2605
+ storage_type="file",
2606
+ path=storage_path,
2607
+ version_strategy=version_strategy,
2608
+ max_versions=max_versions,
2609
+ compress=compress,
2610
+ )
2611
+
2612
+ loop = asyncio.get_event_loop()
2613
+ history = await loop.run_in_executor(self._executor, _create)
2614
+
2615
+ self._histories[history_id] = history
2616
+ return history_id
2617
+
2618
+ async def save_schema_version(
2619
+ self,
2620
+ history_id: str,
2621
+ schema: dict[str, Any],
2622
+ *,
2623
+ version: str | None = None,
2624
+ metadata: dict[str, Any] | None = None,
2625
+ ) -> SchemaVersionResult:
2626
+ """Save a schema version to history.
2627
+
2628
+ Args:
2629
+ history_id: History instance ID.
2630
+ schema: Schema dictionary to save.
2631
+ version: Optional explicit version string.
2632
+ metadata: Optional metadata (author, message, etc.).
2633
+
2634
+ Returns:
2635
+ SchemaVersionResult with version info.
2636
+
2637
+ Raises:
2638
+ ValueError: If history_id not found.
2639
+ """
2640
+ if history_id not in self._histories:
2641
+ raise ValueError(f"History '{history_id}' not found")
2642
+
2643
+ history = self._histories[history_id]
2644
+
2645
+ def _save():
2646
+ kwargs: dict[str, Any] = {}
2647
+ if version:
2648
+ kwargs["version"] = version
2649
+ if metadata:
2650
+ kwargs["metadata"] = metadata
2651
+ return history.save(schema, **kwargs)
2652
+
2653
+ loop = asyncio.get_event_loop()
2654
+ result = await loop.run_in_executor(self._executor, _save)
2655
+
2656
+ return self._convert_version_result(result)
2657
+
2658
+ async def get_schema_version(
2659
+ self,
2660
+ history_id: str,
2661
+ version: str,
2662
+ ) -> SchemaVersionResult | None:
2663
+ """Get a specific schema version.
2664
+
2665
+ Args:
2666
+ history_id: History instance ID.
2667
+ version: Version string or ID.
2668
+
2669
+ Returns:
2670
+ SchemaVersionResult or None if not found.
2671
+ """
2672
+ if history_id not in self._histories:
2673
+ raise ValueError(f"History '{history_id}' not found")
2674
+
2675
+ history = self._histories[history_id]
2676
+
2677
+ def _get():
2678
+ try:
2679
+ return history.get_by_version(version)
2680
+ except Exception:
2681
+ return history.get(version)
2682
+
2683
+ loop = asyncio.get_event_loop()
2684
+ result = await loop.run_in_executor(self._executor, _get)
2685
+
2686
+ if result is None:
2687
+ return None
2688
+ return self._convert_version_result(result)
2689
+
2690
+ async def list_schema_versions(
2691
+ self,
2692
+ history_id: str,
2693
+ *,
2694
+ limit: int = 50,
2695
+ since: str | None = None,
2696
+ ) -> list[SchemaVersionResult]:
2697
+ """List schema versions in history.
2698
+
2699
+ Args:
2700
+ history_id: History instance ID.
2701
+ limit: Maximum versions to return.
2702
+ since: Filter versions since this datetime (ISO format).
2703
+
2704
+ Returns:
2705
+ List of SchemaVersionResult.
2706
+ """
2707
+ from datetime import datetime, timedelta
2708
+
2709
+ if history_id not in self._histories:
2710
+ raise ValueError(f"History '{history_id}' not found")
2711
+
2712
+ history = self._histories[history_id]
2713
+
2714
+ def _list():
2715
+ kwargs: dict[str, Any] = {"limit": limit}
2716
+ if since:
2717
+ kwargs["since"] = datetime.fromisoformat(since)
2718
+ return history.list(**kwargs)
2719
+
2720
+ loop = asyncio.get_event_loop()
2721
+ versions = await loop.run_in_executor(self._executor, _list)
2722
+
2723
+ return [self._convert_version_result(v) for v in versions]
2724
+
2725
+ async def get_latest_version(
2726
+ self,
2727
+ history_id: str,
2728
+ ) -> SchemaVersionResult | None:
2729
+ """Get the latest schema version.
2730
+
2731
+ Args:
2732
+ history_id: History instance ID.
2733
+
2734
+ Returns:
2735
+ Latest SchemaVersionResult or None.
2736
+ """
2737
+ if history_id not in self._histories:
2738
+ raise ValueError(f"History '{history_id}' not found")
2739
+
2740
+ history = self._histories[history_id]
2741
+
2742
+ loop = asyncio.get_event_loop()
2743
+ result = await loop.run_in_executor(
2744
+ self._executor, lambda: history.latest
2745
+ )
2746
+
2747
+ if result is None:
2748
+ return None
2749
+ return self._convert_version_result(result)
2750
+
2751
+ async def diff_versions(
2752
+ self,
2753
+ history_id: str,
2754
+ from_version: str,
2755
+ to_version: str | None = None,
2756
+ ) -> SchemaDiffResult:
2757
+ """Get diff between two schema versions.
2758
+
2759
+ Args:
2760
+ history_id: History instance ID.
2761
+ from_version: Source version string.
2762
+ to_version: Target version string (None = latest).
2763
+
2764
+ Returns:
2765
+ SchemaDiffResult with changes and text diff.
2766
+ """
2767
+ if history_id not in self._histories:
2768
+ raise ValueError(f"History '{history_id}' not found")
2769
+
2770
+ history = self._histories[history_id]
2771
+
2772
+ def _diff():
2773
+ if to_version:
2774
+ return history.diff(from_version, to_version)
2775
+ else:
2776
+ return history.diff(from_version)
2777
+
2778
+ loop = asyncio.get_event_loop()
2779
+ diff = await loop.run_in_executor(self._executor, _diff)
2780
+
2781
+ return self._convert_diff_result(diff, from_version, to_version or "latest")
2782
+
2783
+ async def has_breaking_changes_since(
2784
+ self,
2785
+ history_id: str,
2786
+ version: str,
2787
+ ) -> bool:
2788
+ """Check if there are breaking changes since a version.
2789
+
2790
+ Args:
2791
+ history_id: History instance ID.
2792
+ version: Version to check from.
2793
+
2794
+ Returns:
2795
+ True if breaking changes exist.
2796
+ """
2797
+ if history_id not in self._histories:
2798
+ raise ValueError(f"History '{history_id}' not found")
2799
+
2800
+ history = self._histories[history_id]
2801
+
2802
+ loop = asyncio.get_event_loop()
2803
+ return await loop.run_in_executor(
2804
+ self._executor, lambda: history.has_breaking_changes_since(version)
2805
+ )
2806
+
2807
+ async def rollback_version(
2808
+ self,
2809
+ history_id: str,
2810
+ to_version: str,
2811
+ *,
2812
+ reason: str | None = None,
2813
+ ) -> SchemaVersionResult:
2814
+ """Rollback to a previous version.
2815
+
2816
+ Creates a new version that matches the specified version.
2817
+
2818
+ Args:
2819
+ history_id: History instance ID.
2820
+ to_version: Version to rollback to.
2821
+ reason: Reason for rollback.
2822
+
2823
+ Returns:
2824
+ New SchemaVersionResult after rollback.
2825
+ """
2826
+ if history_id not in self._histories:
2827
+ raise ValueError(f"History '{history_id}' not found")
2828
+
2829
+ history = self._histories[history_id]
2830
+
2831
+ def _rollback():
2832
+ kwargs: dict[str, Any] = {}
2833
+ if reason:
2834
+ kwargs["reason"] = reason
2835
+ return history.rollback(to_version, **kwargs)
2836
+
2837
+ loop = asyncio.get_event_loop()
2838
+ result = await loop.run_in_executor(self._executor, _rollback)
2839
+
2840
+ return self._convert_version_result(result)
2841
+
2842
+ async def create_watcher(
2843
+ self,
2844
+ watcher_id: str,
2845
+ sources: list[dict[str, Any]],
2846
+ *,
2847
+ poll_interval: int = 60,
2848
+ only_breaking: bool = False,
2849
+ enable_history: bool = True,
2850
+ history_path: str | None = None,
2851
+ ) -> str:
2852
+ """Create a new schema watcher.
2853
+
2854
+ Uses truthound's SchemaWatcher for continuous monitoring with
2855
+ configurable sources, handlers, and polling.
2856
+
2857
+ Args:
2858
+ watcher_id: Unique identifier for this watcher.
2859
+ sources: List of source configurations, each with:
2860
+ - type: "file", "dict", or "polars"
2861
+ - path: For file sources
2862
+ - schema: For dict sources
2863
+ - name: Source name
2864
+ poll_interval: Polling interval in seconds.
2865
+ only_breaking: Only alert on breaking changes.
2866
+ enable_history: Enable history tracking.
2867
+ history_path: Path for history storage.
2868
+
2869
+ Returns:
2870
+ Watcher ID for future operations.
2871
+ """
2872
+ from truthound.profiler.evolution import (
2873
+ SchemaWatcher,
2874
+ FileSchemaSource,
2875
+ DictSchemaSource,
2876
+ LoggingEventHandler,
2877
+ HistoryEventHandler,
2878
+ SchemaHistory,
2879
+ )
2880
+
2881
+ def _create():
2882
+ watcher = SchemaWatcher()
2883
+
2884
+ # Add sources
2885
+ for src in sources:
2886
+ src_type = src.get("type", "file")
2887
+ if src_type == "file":
2888
+ watcher.add_source(FileSchemaSource(src["path"]))
2889
+ elif src_type == "dict":
2890
+ watcher.add_source(
2891
+ DictSchemaSource(src["schema"], src.get("name", "dict"))
2892
+ )
2893
+
2894
+ # Add logging handler
2895
+ watcher.add_handler(LoggingEventHandler())
2896
+
2897
+ # Add history handler if enabled
2898
+ if enable_history and history_path:
2899
+ history = SchemaHistory.create(
2900
+ storage_type="file",
2901
+ path=history_path,
2902
+ )
2903
+ watcher.add_handler(HistoryEventHandler(history))
2904
+
2905
+ return watcher
2906
+
2907
+ loop = asyncio.get_event_loop()
2908
+ watcher = await loop.run_in_executor(self._executor, _create)
2909
+
2910
+ self._watchers[watcher_id] = {
2911
+ "watcher": watcher,
2912
+ "poll_interval": poll_interval,
2913
+ "only_breaking": only_breaking,
2914
+ "status": "created",
2915
+ }
2916
+ return watcher_id
2917
+
2918
+ async def start_watcher(
2919
+ self,
2920
+ watcher_id: str,
2921
+ *,
2922
+ daemon: bool = True,
2923
+ ) -> None:
2924
+ """Start a schema watcher.
2925
+
2926
+ Args:
2927
+ watcher_id: Watcher ID to start.
2928
+ daemon: Run as daemon thread.
2929
+
2930
+ Raises:
2931
+ ValueError: If watcher_id not found.
2932
+ """
2933
+ if watcher_id not in self._watchers:
2934
+ raise ValueError(f"Watcher '{watcher_id}' not found")
2935
+
2936
+ watcher_data = self._watchers[watcher_id]
2937
+ watcher = watcher_data["watcher"]
2938
+ poll_interval = watcher_data["poll_interval"]
2939
+
2940
+ def _start():
2941
+ watcher.start(poll_interval=poll_interval, daemon=daemon)
2942
+
2943
+ loop = asyncio.get_event_loop()
2944
+ await loop.run_in_executor(self._executor, _start)
2945
+
2946
+ watcher_data["status"] = "running"
2947
+
2948
+ async def stop_watcher(self, watcher_id: str) -> None:
2949
+ """Stop a schema watcher.
2950
+
2951
+ Args:
2952
+ watcher_id: Watcher ID to stop.
2953
+ """
2954
+ if watcher_id not in self._watchers:
2955
+ raise ValueError(f"Watcher '{watcher_id}' not found")
2956
+
2957
+ watcher_data = self._watchers[watcher_id]
2958
+ watcher = watcher_data["watcher"]
2959
+
2960
+ loop = asyncio.get_event_loop()
2961
+ await loop.run_in_executor(self._executor, watcher.stop)
2962
+
2963
+ watcher_data["status"] = "stopped"
2964
+
2965
+ async def pause_watcher(self, watcher_id: str) -> None:
2966
+ """Pause a schema watcher.
2967
+
2968
+ Args:
2969
+ watcher_id: Watcher ID to pause.
2970
+ """
2971
+ if watcher_id not in self._watchers:
2972
+ raise ValueError(f"Watcher '{watcher_id}' not found")
2973
+
2974
+ watcher_data = self._watchers[watcher_id]
2975
+ watcher = watcher_data["watcher"]
2976
+
2977
+ loop = asyncio.get_event_loop()
2978
+ await loop.run_in_executor(self._executor, watcher.pause)
2979
+
2980
+ watcher_data["status"] = "paused"
2981
+
2982
+ async def resume_watcher(self, watcher_id: str) -> None:
2983
+ """Resume a paused schema watcher.
2984
+
2985
+ Args:
2986
+ watcher_id: Watcher ID to resume.
2987
+ """
2988
+ if watcher_id not in self._watchers:
2989
+ raise ValueError(f"Watcher '{watcher_id}' not found")
2990
+
2991
+ watcher_data = self._watchers[watcher_id]
2992
+ watcher = watcher_data["watcher"]
2993
+
2994
+ loop = asyncio.get_event_loop()
2995
+ await loop.run_in_executor(self._executor, watcher.resume)
2996
+
2997
+ watcher_data["status"] = "running"
2998
+
2999
+ async def check_watcher_now(
3000
+ self,
3001
+ watcher_id: str,
3002
+ ) -> list[SchemaWatcherEvent]:
3003
+ """Execute immediate check for a watcher.
3004
+
3005
+ Args:
3006
+ watcher_id: Watcher ID to check.
3007
+
3008
+ Returns:
3009
+ List of SchemaWatcherEvent for any detected changes.
3010
+ """
3011
+ if watcher_id not in self._watchers:
3012
+ raise ValueError(f"Watcher '{watcher_id}' not found")
3013
+
3014
+ watcher_data = self._watchers[watcher_id]
3015
+ watcher = watcher_data["watcher"]
3016
+
3017
+ loop = asyncio.get_event_loop()
3018
+ events = await loop.run_in_executor(self._executor, watcher.check_now)
3019
+
3020
+ return [self._convert_watcher_event(e) for e in events]
3021
+
3022
+ async def get_watcher_status(self, watcher_id: str) -> dict[str, Any]:
3023
+ """Get watcher status.
3024
+
3025
+ Args:
3026
+ watcher_id: Watcher ID.
3027
+
3028
+ Returns:
3029
+ Status dictionary with status, poll_interval, only_breaking.
3030
+ """
3031
+ if watcher_id not in self._watchers:
3032
+ raise ValueError(f"Watcher '{watcher_id}' not found")
3033
+
3034
+ watcher_data = self._watchers[watcher_id]
3035
+ return {
3036
+ "watcher_id": watcher_id,
3037
+ "status": watcher_data["status"],
3038
+ "poll_interval": watcher_data["poll_interval"],
3039
+ "only_breaking": watcher_data["only_breaking"],
3040
+ }
3041
+
3042
+ async def delete_watcher(self, watcher_id: str) -> None:
3043
+ """Delete a watcher.
3044
+
3045
+ Stops the watcher if running and removes it.
3046
+
3047
+ Args:
3048
+ watcher_id: Watcher ID to delete.
3049
+ """
3050
+ if watcher_id not in self._watchers:
3051
+ raise ValueError(f"Watcher '{watcher_id}' not found")
3052
+
3053
+ watcher_data = self._watchers[watcher_id]
3054
+ if watcher_data["status"] == "running":
3055
+ await self.stop_watcher(watcher_id)
3056
+
3057
+ del self._watchers[watcher_id]
3058
+
3059
+ async def setup_impact_analyzer(
3060
+ self,
3061
+ consumers: dict[str, list[str]] | None = None,
3062
+ queries: dict[str, list[str]] | None = None,
3063
+ ) -> None:
3064
+ """Setup impact analyzer with consumer mappings.
3065
+
3066
+ Args:
3067
+ consumers: Dict of consumer name -> list of sources it depends on.
3068
+ queries: Dict of source name -> list of queries using it.
3069
+ """
3070
+ from truthound.profiler.evolution import ImpactAnalyzer
3071
+
3072
+ def _setup():
3073
+ analyzer = ImpactAnalyzer()
3074
+ if consumers:
3075
+ for consumer, sources in consumers.items():
3076
+ analyzer.register_consumer(consumer, sources)
3077
+ if queries:
3078
+ for source, query_list in queries.items():
3079
+ for query in query_list:
3080
+ analyzer.register_query(source, query)
3081
+ return analyzer
3082
+
3083
+ loop = asyncio.get_event_loop()
3084
+ self._impact_analyzer = await loop.run_in_executor(self._executor, _setup)
3085
+
3086
+ async def setup_alert_manager(
3087
+ self,
3088
+ alert_storage_path: str,
3089
+ ) -> None:
3090
+ """Setup breaking change alert manager.
3091
+
3092
+ Args:
3093
+ alert_storage_path: Path for alert storage.
3094
+ """
3095
+ from truthound.profiler.evolution import BreakingChangeAlertManager
3096
+
3097
+ def _setup():
3098
+ return BreakingChangeAlertManager(
3099
+ impact_analyzer=self._impact_analyzer,
3100
+ alert_storage_path=alert_storage_path,
3101
+ )
3102
+
3103
+ loop = asyncio.get_event_loop()
3104
+ self._alert_manager = await loop.run_in_executor(self._executor, _setup)
3105
+
3106
+ async def create_alert(
3107
+ self,
3108
+ changes: list[dict[str, Any]],
3109
+ source: str,
3110
+ ) -> BreakingChangeAlert:
3111
+ """Create a breaking change alert.
3112
+
3113
+ Args:
3114
+ changes: List of change dictionaries from detect_changes.
3115
+ source: Source name.
3116
+
3117
+ Returns:
3118
+ BreakingChangeAlert with impact analysis.
3119
+
3120
+ Raises:
3121
+ ValueError: If alert manager not setup.
3122
+ """
3123
+ if self._alert_manager is None:
3124
+ raise ValueError("Alert manager not setup. Call setup_alert_manager first.")
3125
+
3126
+ def _create():
3127
+ return self._alert_manager.create_alert(changes, source=source)
3128
+
3129
+ loop = asyncio.get_event_loop()
3130
+ alert = await loop.run_in_executor(self._executor, _create)
3131
+
3132
+ return self._convert_alert_result(alert)
3133
+
3134
+ async def acknowledge_alert(self, alert_id: str) -> None:
3135
+ """Acknowledge an alert.
3136
+
3137
+ Args:
3138
+ alert_id: Alert ID to acknowledge.
3139
+ """
3140
+ if self._alert_manager is None:
3141
+ raise ValueError("Alert manager not setup.")
3142
+
3143
+ loop = asyncio.get_event_loop()
3144
+ await loop.run_in_executor(
3145
+ self._executor, lambda: self._alert_manager.acknowledge_alert(alert_id)
3146
+ )
3147
+
3148
+ async def resolve_alert(self, alert_id: str) -> None:
3149
+ """Resolve an alert.
3150
+
3151
+ Args:
3152
+ alert_id: Alert ID to resolve.
3153
+ """
3154
+ if self._alert_manager is None:
3155
+ raise ValueError("Alert manager not setup.")
3156
+
3157
+ loop = asyncio.get_event_loop()
3158
+ await loop.run_in_executor(
3159
+ self._executor, lambda: self._alert_manager.resolve_alert(alert_id)
3160
+ )
3161
+
3162
+ async def get_alert_history(
3163
+ self,
3164
+ *,
3165
+ status: str | None = None,
3166
+ ) -> list[BreakingChangeAlert]:
3167
+ """Get alert history.
3168
+
3169
+ Args:
3170
+ status: Filter by status (open, acknowledged, resolved).
3171
+
3172
+ Returns:
3173
+ List of BreakingChangeAlert.
3174
+ """
3175
+ if self._alert_manager is None:
3176
+ raise ValueError("Alert manager not setup.")
3177
+
3178
+ def _get():
3179
+ kwargs: dict[str, Any] = {}
3180
+ if status:
3181
+ kwargs["status"] = status
3182
+ return self._alert_manager.get_alert_history(**kwargs)
3183
+
3184
+ loop = asyncio.get_event_loop()
3185
+ alerts = await loop.run_in_executor(self._executor, _get)
3186
+
3187
+ return [self._convert_alert_result(a) for a in alerts]
3188
+
3189
+ async def get_alert_stats(self) -> dict[str, int]:
3190
+ """Get alert statistics.
3191
+
3192
+ Returns:
3193
+ Dict with total, open, acknowledged, resolved counts.
3194
+ """
3195
+ if self._alert_manager is None:
3196
+ raise ValueError("Alert manager not setup.")
3197
+
3198
+ loop = asyncio.get_event_loop()
3199
+ return await loop.run_in_executor(
3200
+ self._executor, self._alert_manager.get_stats
3201
+ )
3202
+
3203
+ # =========================================================================
3204
+ # Result Conversion Methods
3205
+ # =========================================================================
3206
+
3207
+ def _convert_detection_result(
3208
+ self,
3209
+ changes: list[Any],
3210
+ summary: Any,
3211
+ ) -> SchemaDetectionResult:
3212
+ """Convert truthound detection result."""
3213
+ converted_changes = []
3214
+ for c in changes:
3215
+ converted_changes.append(
3216
+ SchemaChangeResult(
3217
+ change_type=c.change_type.value if hasattr(c.change_type, "value") else str(c.change_type),
3218
+ column_name=getattr(c, "column", getattr(c, "column_name", "")),
3219
+ old_value=getattr(c, "old_value", None),
3220
+ new_value=getattr(c, "new_value", None),
3221
+ severity=c.severity.value if hasattr(c.severity, "value") else str(c.severity),
3222
+ breaking=getattr(c, "breaking", False),
3223
+ description=getattr(c, "description", ""),
3224
+ migration_hint=getattr(c, "migration_hint", None),
3225
+ )
3226
+ )
3227
+
3228
+ compatibility = "compatible"
3229
+ if hasattr(summary, "compatibility_level"):
3230
+ compatibility = (
3231
+ summary.compatibility_level.value
3232
+ if hasattr(summary.compatibility_level, "value")
3233
+ else str(summary.compatibility_level)
3234
+ )
3235
+
3236
+ return SchemaDetectionResult(
3237
+ total_changes=getattr(summary, "total_changes", len(changes)),
3238
+ breaking_changes=getattr(summary, "breaking_changes", 0),
3239
+ compatibility_level=compatibility,
3240
+ changes=converted_changes,
3241
+ )
3242
+
3243
+ def _convert_rename_result(self, result: Any) -> RenameDetectionSummary:
3244
+ """Convert truthound rename detection result."""
3245
+ confirmed = []
3246
+ for r in getattr(result, "confirmed_renames", []):
3247
+ confirmed.append(
3248
+ RenameDetectionResult(
3249
+ old_name=r.old_name,
3250
+ new_name=r.new_name,
3251
+ similarity=r.similarity,
3252
+ confidence=r.confidence.value if hasattr(r.confidence, "value") else str(r.confidence),
3253
+ reasons=list(getattr(r, "reasons", [])),
3254
+ )
3255
+ )
3256
+
3257
+ possible = []
3258
+ for r in getattr(result, "possible_renames", []):
3259
+ possible.append(
3260
+ RenameDetectionResult(
3261
+ old_name=r.old_name,
3262
+ new_name=r.new_name,
3263
+ similarity=r.similarity,
3264
+ confidence=r.confidence.value if hasattr(r.confidence, "value") else str(r.confidence),
3265
+ reasons=list(getattr(r, "reasons", [])),
3266
+ )
3267
+ )
3268
+
3269
+ return RenameDetectionSummary(
3270
+ confirmed_renames=confirmed,
3271
+ possible_renames=possible,
3272
+ unmatched_added=list(getattr(result, "unmatched_added", [])),
3273
+ unmatched_removed=list(getattr(result, "unmatched_removed", [])),
3274
+ )
3275
+
3276
+ def _convert_version_result(self, result: Any) -> SchemaVersionResult:
3277
+ """Convert truthound version result."""
3278
+ from datetime import datetime
3279
+
3280
+ created_at = None
3281
+ if hasattr(result, "created_at") and result.created_at:
3282
+ created_at = (
3283
+ result.created_at.isoformat()
3284
+ if isinstance(result.created_at, datetime)
3285
+ else str(result.created_at)
3286
+ )
3287
+
3288
+ changes = None
3289
+ if hasattr(result, "changes_from_parent") and result.changes_from_parent:
3290
+ changes = [
3291
+ SchemaChangeResult(
3292
+ change_type=c.change_type.value if hasattr(c.change_type, "value") else str(c.change_type),
3293
+ column_name=getattr(c, "column", getattr(c, "column_name", "")),
3294
+ old_value=getattr(c, "old_value", None),
3295
+ new_value=getattr(c, "new_value", None),
3296
+ severity=c.severity.value if hasattr(c.severity, "value") else str(c.severity),
3297
+ breaking=getattr(c, "breaking", False),
3298
+ description=getattr(c, "description", ""),
3299
+ migration_hint=getattr(c, "migration_hint", None),
3300
+ )
3301
+ for c in result.changes_from_parent
3302
+ ]
3303
+
3304
+ # Get schema as dict
3305
+ schema = {}
3306
+ if hasattr(result, "schema"):
3307
+ schema = result.schema if isinstance(result.schema, dict) else {}
3308
+ elif hasattr(result, "to_dict"):
3309
+ schema = result.to_dict().get("schema", {})
3310
+
3311
+ return SchemaVersionResult(
3312
+ id=getattr(result, "id", getattr(result, "version_id", "")),
3313
+ version=str(getattr(result, "version", "")),
3314
+ schema=schema,
3315
+ metadata=getattr(result, "metadata", None),
3316
+ created_at=created_at,
3317
+ has_breaking_changes=getattr(result, "has_breaking_changes", False),
3318
+ changes_from_parent=changes,
3319
+ )
3320
+
3321
+ def _convert_diff_result(
3322
+ self,
3323
+ diff: Any,
3324
+ from_version: str,
3325
+ to_version: str,
3326
+ ) -> SchemaDiffResult:
3327
+ """Convert truthound diff result."""
3328
+ changes = []
3329
+ for c in getattr(diff, "changes", []):
3330
+ changes.append(
3331
+ SchemaChangeResult(
3332
+ change_type=c.change_type.value if hasattr(c.change_type, "value") else str(c.change_type),
3333
+ column_name=getattr(c, "column", getattr(c, "column_name", "")),
3334
+ old_value=getattr(c, "old_value", None),
3335
+ new_value=getattr(c, "new_value", None),
3336
+ severity=c.severity.value if hasattr(c.severity, "value") else str(c.severity),
3337
+ breaking=getattr(c, "breaking", False),
3338
+ description=getattr(c, "description", ""),
3339
+ migration_hint=getattr(c, "migration_hint", None),
3340
+ )
3341
+ )
3342
+
3343
+ text_diff = ""
3344
+ if hasattr(diff, "format_text"):
3345
+ text_diff = diff.format_text()
3346
+
3347
+ return SchemaDiffResult(
3348
+ from_version=from_version,
3349
+ to_version=to_version,
3350
+ changes=changes,
3351
+ text_diff=text_diff,
3352
+ )
3353
+
3354
+ def _convert_watcher_event(self, event: Any) -> SchemaWatcherEvent:
3355
+ """Convert truthound watcher event."""
3356
+ from datetime import datetime
3357
+
3358
+ changes = []
3359
+ for c in getattr(event, "changes", []):
3360
+ changes.append(
3361
+ SchemaChangeResult(
3362
+ change_type=c.change_type.value if hasattr(c.change_type, "value") else str(c.change_type),
3363
+ column_name=getattr(c, "column", getattr(c, "column_name", "")),
3364
+ old_value=getattr(c, "old_value", None),
3365
+ new_value=getattr(c, "new_value", None),
3366
+ severity=c.severity.value if hasattr(c.severity, "value") else str(c.severity),
3367
+ breaking=getattr(c, "breaking", False),
3368
+ description=getattr(c, "description", ""),
3369
+ migration_hint=getattr(c, "migration_hint", None),
3370
+ )
3371
+ )
3372
+
3373
+ timestamp = datetime.utcnow().isoformat()
3374
+ if hasattr(event, "timestamp"):
3375
+ timestamp = (
3376
+ event.timestamp.isoformat()
3377
+ if isinstance(event.timestamp, datetime)
3378
+ else str(event.timestamp)
3379
+ )
3380
+
3381
+ return SchemaWatcherEvent(
3382
+ source=getattr(event, "source", ""),
3383
+ has_breaking_changes=event.has_breaking_changes() if callable(getattr(event, "has_breaking_changes", None)) else getattr(event, "has_breaking_changes", False),
3384
+ total_changes=len(changes),
3385
+ changes=changes,
3386
+ timestamp=timestamp,
3387
+ )
3388
+
3389
    def _convert_alert_result(self, alert: Any) -> BreakingChangeAlert:
        """Convert a truthound alert object into a BreakingChangeAlert.

        Handles changes expressed either as plain dicts or as objects with
        attribute access, and tolerates missing fields via defaults.
        """
        from datetime import datetime

        changes = []
        for c in getattr(alert, "changes", []):
            # Changes may arrive serialized (dict) or as change objects; the
            # two branches below extract the same fields either way.
            if isinstance(c, dict):
                changes.append(
                    SchemaChangeResult(
                        change_type=c.get("change_type", "unknown"),
                        column_name=c.get("column_name", c.get("column", "")),
                        old_value=c.get("old_value"),
                        new_value=c.get("new_value"),
                        severity=c.get("severity", "info"),
                        breaking=c.get("breaking", False),
                        description=c.get("description", ""),
                        migration_hint=c.get("migration_hint"),
                    )
                )
            else:
                changes.append(
                    SchemaChangeResult(
                        # Enum members expose .value; otherwise stringify.
                        change_type=c.change_type.value if hasattr(c.change_type, "value") else str(c.change_type),
                        column_name=getattr(c, "column", getattr(c, "column_name", "")),
                        old_value=getattr(c, "old_value", None),
                        new_value=getattr(c, "new_value", None),
                        severity=c.severity.value if hasattr(c.severity, "value") else str(c.severity),
                        breaking=getattr(c, "breaking", False),
                        description=getattr(c, "description", ""),
                        migration_hint=getattr(c, "migration_hint", None),
                    )
                )

        # Extract impact info (defaults used when no impact analysis attached).
        impact = getattr(alert, "impact", None)
        impact_scope = "local"
        affected_consumers: list[str] = []
        data_risk_level = 1
        recommendations: list[str] = []

        if impact:
            impact_scope = impact.scope.value if hasattr(impact.scope, "value") else str(impact.scope)
            affected_consumers = list(getattr(impact, "affected_consumers", []))
            data_risk_level = getattr(impact, "data_risk_level", 1)
            recommendations = list(getattr(impact, "recommendations", []))

        # Extract timestamps: ISO string for datetimes, str() for anything else.
        def _format_dt(dt: Any) -> str | None:
            if dt is None:
                return None
            if isinstance(dt, datetime):
                return dt.isoformat()
            return str(dt)

        return BreakingChangeAlert(
            alert_id=getattr(alert, "alert_id", ""),
            title=getattr(alert, "title", ""),
            source=getattr(alert, "source", ""),
            changes=changes,
            impact_scope=impact_scope,
            affected_consumers=affected_consumers,
            data_risk_level=data_risk_level,
            recommendations=recommendations,
            status=getattr(alert, "status", "open"),
            # Falls back to "now" when the alert has no created_at.
            # NOTE(review): datetime.utcnow() is deprecated since Python 3.12.
            created_at=_format_dt(getattr(alert, "created_at", None)) or datetime.utcnow().isoformat(),
            acknowledged_at=_format_dt(getattr(alert, "acknowledged_at", None)),
            resolved_at=_format_dt(getattr(alert, "resolved_at", None)),
        )
3457
+
3458
+ def shutdown(self) -> None:
3459
+ """Shutdown the executor and stop all watchers."""
3460
+ # Stop all watchers
3461
+ for watcher_id in list(self._watchers.keys()):
3462
+ watcher_data = self._watchers[watcher_id]
3463
+ if watcher_data["status"] == "running":
3464
+ watcher_data["watcher"].stop()
3465
+
3466
+ self._watchers.clear()
3467
+ self._histories.clear()
3468
+ self._executor.shutdown(wait=False)
3469
+
3470
+
3471
# Module-level singleton for schema evolution; lazily created by
# get_schema_evolution_adapter and cleared by reset_schema_evolution_adapter.
_schema_evolution_adapter: SchemaEvolutionAdapter | None = None
3473
+
3474
+
3475
def get_schema_evolution_adapter() -> SchemaEvolutionAdapter:
    """Return the process-wide SchemaEvolutionAdapter, creating it on first use.

    Returns:
        SchemaEvolutionAdapter singleton.
    """
    global _schema_evolution_adapter
    if _schema_evolution_adapter is None:
        from truthound_dashboard.config import get_settings

        # Worker-thread count comes from dashboard settings.
        _schema_evolution_adapter = SchemaEvolutionAdapter(
            max_workers=get_settings().max_workers
        )
    return _schema_evolution_adapter
3490
+
3491
+
3492
def reset_schema_evolution_adapter() -> None:
    """Tear down and clear the adapter singleton (intended for testing)."""
    global _schema_evolution_adapter
    adapter = _schema_evolution_adapter
    if adapter is not None:
        # Stops watchers and shuts down the thread pool before dropping the ref.
        adapter.shutdown()
    _schema_evolution_adapter = None