truthound-dashboard 1.4.4__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205) hide show
  1. truthound_dashboard/api/alerts.py +75 -86
  2. truthound_dashboard/api/anomaly.py +7 -13
  3. truthound_dashboard/api/cross_alerts.py +38 -52
  4. truthound_dashboard/api/drift.py +49 -59
  5. truthound_dashboard/api/drift_monitor.py +234 -79
  6. truthound_dashboard/api/enterprise_sampling.py +498 -0
  7. truthound_dashboard/api/history.py +57 -5
  8. truthound_dashboard/api/lineage.py +3 -48
  9. truthound_dashboard/api/maintenance.py +104 -49
  10. truthound_dashboard/api/mask.py +1 -2
  11. truthound_dashboard/api/middleware.py +2 -1
  12. truthound_dashboard/api/model_monitoring.py +435 -311
  13. truthound_dashboard/api/notifications.py +227 -191
  14. truthound_dashboard/api/notifications_advanced.py +21 -20
  15. truthound_dashboard/api/observability.py +586 -0
  16. truthound_dashboard/api/plugins.py +2 -433
  17. truthound_dashboard/api/profile.py +199 -37
  18. truthound_dashboard/api/quality_reporter.py +701 -0
  19. truthound_dashboard/api/reports.py +7 -16
  20. truthound_dashboard/api/router.py +66 -0
  21. truthound_dashboard/api/rule_suggestions.py +5 -5
  22. truthound_dashboard/api/scan.py +17 -19
  23. truthound_dashboard/api/schedules.py +85 -50
  24. truthound_dashboard/api/schema_evolution.py +6 -6
  25. truthound_dashboard/api/schema_watcher.py +667 -0
  26. truthound_dashboard/api/sources.py +98 -27
  27. truthound_dashboard/api/tiering.py +1323 -0
  28. truthound_dashboard/api/triggers.py +14 -11
  29. truthound_dashboard/api/validations.py +12 -11
  30. truthound_dashboard/api/versioning.py +1 -6
  31. truthound_dashboard/core/__init__.py +129 -3
  32. truthound_dashboard/core/actions/__init__.py +62 -0
  33. truthound_dashboard/core/actions/custom.py +426 -0
  34. truthound_dashboard/core/actions/notifications.py +910 -0
  35. truthound_dashboard/core/actions/storage.py +472 -0
  36. truthound_dashboard/core/actions/webhook.py +281 -0
  37. truthound_dashboard/core/anomaly.py +262 -67
  38. truthound_dashboard/core/anomaly_explainer.py +4 -3
  39. truthound_dashboard/core/backends/__init__.py +67 -0
  40. truthound_dashboard/core/backends/base.py +299 -0
  41. truthound_dashboard/core/backends/errors.py +191 -0
  42. truthound_dashboard/core/backends/factory.py +423 -0
  43. truthound_dashboard/core/backends/mock_backend.py +451 -0
  44. truthound_dashboard/core/backends/truthound_backend.py +718 -0
  45. truthound_dashboard/core/checkpoint/__init__.py +87 -0
  46. truthound_dashboard/core/checkpoint/adapters.py +814 -0
  47. truthound_dashboard/core/checkpoint/checkpoint.py +491 -0
  48. truthound_dashboard/core/checkpoint/runner.py +270 -0
  49. truthound_dashboard/core/connections.py +437 -10
  50. truthound_dashboard/core/converters/__init__.py +14 -0
  51. truthound_dashboard/core/converters/truthound.py +620 -0
  52. truthound_dashboard/core/cross_alerts.py +540 -320
  53. truthound_dashboard/core/datasource_factory.py +1672 -0
  54. truthound_dashboard/core/drift_monitor.py +216 -20
  55. truthound_dashboard/core/enterprise_sampling.py +1291 -0
  56. truthound_dashboard/core/interfaces/__init__.py +225 -0
  57. truthound_dashboard/core/interfaces/actions.py +652 -0
  58. truthound_dashboard/core/interfaces/base.py +247 -0
  59. truthound_dashboard/core/interfaces/checkpoint.py +676 -0
  60. truthound_dashboard/core/interfaces/protocols.py +664 -0
  61. truthound_dashboard/core/interfaces/reporters.py +650 -0
  62. truthound_dashboard/core/interfaces/routing.py +646 -0
  63. truthound_dashboard/core/interfaces/triggers.py +619 -0
  64. truthound_dashboard/core/lineage.py +407 -71
  65. truthound_dashboard/core/model_monitoring.py +431 -3
  66. truthound_dashboard/core/notifications/base.py +4 -0
  67. truthound_dashboard/core/notifications/channels.py +501 -1203
  68. truthound_dashboard/core/notifications/deduplication/__init__.py +81 -115
  69. truthound_dashboard/core/notifications/deduplication/service.py +131 -348
  70. truthound_dashboard/core/notifications/dispatcher.py +202 -11
  71. truthound_dashboard/core/notifications/escalation/__init__.py +119 -106
  72. truthound_dashboard/core/notifications/escalation/engine.py +168 -358
  73. truthound_dashboard/core/notifications/routing/__init__.py +88 -128
  74. truthound_dashboard/core/notifications/routing/engine.py +90 -317
  75. truthound_dashboard/core/notifications/stats_aggregator.py +246 -1
  76. truthound_dashboard/core/notifications/throttling/__init__.py +67 -50
  77. truthound_dashboard/core/notifications/throttling/builder.py +117 -255
  78. truthound_dashboard/core/notifications/truthound_adapter.py +842 -0
  79. truthound_dashboard/core/phase5/collaboration.py +1 -1
  80. truthound_dashboard/core/plugins/lifecycle/__init__.py +0 -13
  81. truthound_dashboard/core/quality_reporter.py +1359 -0
  82. truthound_dashboard/core/report_history.py +0 -6
  83. truthound_dashboard/core/reporters/__init__.py +175 -14
  84. truthound_dashboard/core/reporters/adapters.py +943 -0
  85. truthound_dashboard/core/reporters/base.py +0 -3
  86. truthound_dashboard/core/reporters/builtin/__init__.py +18 -0
  87. truthound_dashboard/core/reporters/builtin/csv_reporter.py +111 -0
  88. truthound_dashboard/core/reporters/builtin/html_reporter.py +270 -0
  89. truthound_dashboard/core/reporters/builtin/json_reporter.py +127 -0
  90. truthound_dashboard/core/reporters/compat.py +266 -0
  91. truthound_dashboard/core/reporters/csv_reporter.py +2 -35
  92. truthound_dashboard/core/reporters/factory.py +526 -0
  93. truthound_dashboard/core/reporters/interfaces.py +745 -0
  94. truthound_dashboard/core/reporters/registry.py +1 -10
  95. truthound_dashboard/core/scheduler.py +165 -0
  96. truthound_dashboard/core/schema_evolution.py +3 -3
  97. truthound_dashboard/core/schema_watcher.py +1528 -0
  98. truthound_dashboard/core/services.py +595 -76
  99. truthound_dashboard/core/store_manager.py +810 -0
  100. truthound_dashboard/core/streaming_anomaly.py +169 -4
  101. truthound_dashboard/core/tiering.py +1309 -0
  102. truthound_dashboard/core/triggers/evaluators.py +178 -8
  103. truthound_dashboard/core/truthound_adapter.py +2620 -197
  104. truthound_dashboard/core/unified_alerts.py +23 -20
  105. truthound_dashboard/db/__init__.py +8 -0
  106. truthound_dashboard/db/database.py +8 -2
  107. truthound_dashboard/db/models.py +944 -25
  108. truthound_dashboard/db/repository.py +2 -0
  109. truthound_dashboard/main.py +11 -0
  110. truthound_dashboard/schemas/__init__.py +177 -16
  111. truthound_dashboard/schemas/base.py +44 -23
  112. truthound_dashboard/schemas/collaboration.py +19 -6
  113. truthound_dashboard/schemas/cross_alerts.py +19 -3
  114. truthound_dashboard/schemas/drift.py +61 -55
  115. truthound_dashboard/schemas/drift_monitor.py +67 -23
  116. truthound_dashboard/schemas/enterprise_sampling.py +653 -0
  117. truthound_dashboard/schemas/lineage.py +0 -33
  118. truthound_dashboard/schemas/mask.py +10 -8
  119. truthound_dashboard/schemas/model_monitoring.py +89 -10
  120. truthound_dashboard/schemas/notifications_advanced.py +13 -0
  121. truthound_dashboard/schemas/observability.py +453 -0
  122. truthound_dashboard/schemas/plugins.py +0 -280
  123. truthound_dashboard/schemas/profile.py +154 -247
  124. truthound_dashboard/schemas/quality_reporter.py +403 -0
  125. truthound_dashboard/schemas/reports.py +2 -2
  126. truthound_dashboard/schemas/rule_suggestion.py +8 -1
  127. truthound_dashboard/schemas/scan.py +4 -24
  128. truthound_dashboard/schemas/schedule.py +11 -3
  129. truthound_dashboard/schemas/schema_watcher.py +727 -0
  130. truthound_dashboard/schemas/source.py +17 -2
  131. truthound_dashboard/schemas/tiering.py +822 -0
  132. truthound_dashboard/schemas/triggers.py +16 -0
  133. truthound_dashboard/schemas/unified_alerts.py +7 -0
  134. truthound_dashboard/schemas/validation.py +0 -13
  135. truthound_dashboard/schemas/validators/base.py +41 -21
  136. truthound_dashboard/schemas/validators/business_rule_validators.py +244 -0
  137. truthound_dashboard/schemas/validators/localization_validators.py +273 -0
  138. truthound_dashboard/schemas/validators/ml_feature_validators.py +308 -0
  139. truthound_dashboard/schemas/validators/profiling_validators.py +275 -0
  140. truthound_dashboard/schemas/validators/referential_validators.py +312 -0
  141. truthound_dashboard/schemas/validators/registry.py +93 -8
  142. truthound_dashboard/schemas/validators/timeseries_validators.py +389 -0
  143. truthound_dashboard/schemas/versioning.py +1 -6
  144. truthound_dashboard/static/index.html +2 -2
  145. truthound_dashboard-1.5.0.dist-info/METADATA +309 -0
  146. {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/RECORD +149 -148
  147. truthound_dashboard/core/plugins/hooks/__init__.py +0 -63
  148. truthound_dashboard/core/plugins/hooks/decorators.py +0 -367
  149. truthound_dashboard/core/plugins/hooks/manager.py +0 -403
  150. truthound_dashboard/core/plugins/hooks/protocols.py +0 -265
  151. truthound_dashboard/core/plugins/lifecycle/hot_reload.py +0 -584
  152. truthound_dashboard/core/reporters/junit_reporter.py +0 -233
  153. truthound_dashboard/core/reporters/markdown_reporter.py +0 -207
  154. truthound_dashboard/core/reporters/pdf_reporter.py +0 -209
  155. truthound_dashboard/static/assets/_baseUniq-BcrSP13d.js +0 -1
  156. truthound_dashboard/static/assets/arc-DlYjKwIL.js +0 -1
  157. truthound_dashboard/static/assets/architectureDiagram-VXUJARFQ-Bb2drbQM.js +0 -36
  158. truthound_dashboard/static/assets/blockDiagram-VD42YOAC-BlsPG1CH.js +0 -122
  159. truthound_dashboard/static/assets/c4Diagram-YG6GDRKO-B9JdUoaC.js +0 -10
  160. truthound_dashboard/static/assets/channel-Q6mHF1Hd.js +0 -1
  161. truthound_dashboard/static/assets/chunk-4BX2VUAB-DmyoPVuJ.js +0 -1
  162. truthound_dashboard/static/assets/chunk-55IACEB6-Bcz6Siv8.js +0 -1
  163. truthound_dashboard/static/assets/chunk-B4BG7PRW-Br3G5Rum.js +0 -165
  164. truthound_dashboard/static/assets/chunk-DI55MBZ5-DuM9c23u.js +0 -220
  165. truthound_dashboard/static/assets/chunk-FMBD7UC4-DNU-5mvT.js +0 -15
  166. truthound_dashboard/static/assets/chunk-QN33PNHL-Im2yNcmS.js +0 -1
  167. truthound_dashboard/static/assets/chunk-QZHKN3VN-kZr8XFm1.js +0 -1
  168. truthound_dashboard/static/assets/chunk-TZMSLE5B-Q__360q_.js +0 -1
  169. truthound_dashboard/static/assets/classDiagram-2ON5EDUG-vtixxUyK.js +0 -1
  170. truthound_dashboard/static/assets/classDiagram-v2-WZHVMYZB-vtixxUyK.js +0 -1
  171. truthound_dashboard/static/assets/clone-BOt2LwD0.js +0 -1
  172. truthound_dashboard/static/assets/cose-bilkent-S5V4N54A-CBDw6iac.js +0 -1
  173. truthound_dashboard/static/assets/dagre-6UL2VRFP-XdKqmmY9.js +0 -4
  174. truthound_dashboard/static/assets/diagram-PSM6KHXK-DAZ8nx9V.js +0 -24
  175. truthound_dashboard/static/assets/diagram-QEK2KX5R-BRvDTbGD.js +0 -43
  176. truthound_dashboard/static/assets/diagram-S2PKOQOG-bQcczUkl.js +0 -24
  177. truthound_dashboard/static/assets/erDiagram-Q2GNP2WA-DPje7VMN.js +0 -60
  178. truthound_dashboard/static/assets/flowDiagram-NV44I4VS-B7BVtFVS.js +0 -162
  179. truthound_dashboard/static/assets/ganttDiagram-JELNMOA3-D6WKSS7U.js +0 -267
  180. truthound_dashboard/static/assets/gitGraphDiagram-NY62KEGX-D3vtVd3y.js +0 -65
  181. truthound_dashboard/static/assets/graph-BKgNKZVp.js +0 -1
  182. truthound_dashboard/static/assets/index-C6JSrkHo.css +0 -1
  183. truthound_dashboard/static/assets/index-DkU82VsU.js +0 -1800
  184. truthound_dashboard/static/assets/infoDiagram-WHAUD3N6-DnNCT429.js +0 -2
  185. truthound_dashboard/static/assets/journeyDiagram-XKPGCS4Q-DGiMozqS.js +0 -139
  186. truthound_dashboard/static/assets/kanban-definition-3W4ZIXB7-BV2gUgli.js +0 -89
  187. truthound_dashboard/static/assets/katex-Cu_Erd72.js +0 -261
  188. truthound_dashboard/static/assets/layout-DI2MfQ5G.js +0 -1
  189. truthound_dashboard/static/assets/min-DYdgXVcT.js +0 -1
  190. truthound_dashboard/static/assets/mindmap-definition-VGOIOE7T-C7x4ruxz.js +0 -68
  191. truthound_dashboard/static/assets/pieDiagram-ADFJNKIX-CAJaAB9f.js +0 -30
  192. truthound_dashboard/static/assets/quadrantDiagram-AYHSOK5B-DeqwDI46.js +0 -7
  193. truthound_dashboard/static/assets/requirementDiagram-UZGBJVZJ-e3XDpZIM.js +0 -64
  194. truthound_dashboard/static/assets/sankeyDiagram-TZEHDZUN-CNnAv5Ux.js +0 -10
  195. truthound_dashboard/static/assets/sequenceDiagram-WL72ISMW-Dsne-Of3.js +0 -145
  196. truthound_dashboard/static/assets/stateDiagram-FKZM4ZOC-Ee0sQXyb.js +0 -1
  197. truthound_dashboard/static/assets/stateDiagram-v2-4FDKWEC3-B26KqW_W.js +0 -1
  198. truthound_dashboard/static/assets/timeline-definition-IT6M3QCI-DZYi2yl3.js +0 -61
  199. truthound_dashboard/static/assets/treemap-KMMF4GRG-CY3f8In2.js +0 -128
  200. truthound_dashboard/static/assets/unmerged_dictionaries-Dd7xcPWG.js +0 -1
  201. truthound_dashboard/static/assets/xychartDiagram-PRI3JC2R-CS7fydZZ.js +0 -7
  202. truthound_dashboard-1.4.4.dist-info/METADATA +0 -507
  203. {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/WHEEL +0 -0
  204. {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/entry_points.txt +0 -0
  205. {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,718 @@
1
+ """Truthound backend implementation.
2
+
3
+ This module provides the concrete implementation of the data quality
4
+ backend using the truthound library. All truthound imports are isolated
5
+ here with lazy loading for better independence.
6
+
7
+ Updated for truthound 2.x API:
8
+ - Uses truthound.datasources.get_datasource() for auto-detection
9
+ - Supports both old and new import paths for backward compatibility
10
+ - Uses DataSourceCapability for feature detection
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ from functools import partial
17
+ from typing import Any
18
+
19
+ from truthound_dashboard.core.converters import TruthoundResultConverter
20
+ from truthound_dashboard.core.interfaces import DataInput, DataSourceCapability
21
+ from truthound_dashboard.core.truthound_adapter import (
22
+ CheckResult,
23
+ ColumnProfileResult,
24
+ CompareResult,
25
+ GenerateSuiteResult,
26
+ LearnResult,
27
+ MaskResult,
28
+ ProfileResult,
29
+ ScanResult,
30
+ )
31
+
32
+ from .base import BaseDataQualityBackend
33
+ from .errors import BackendOperationError, BackendUnavailableError
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
+ class TruthoundBackend(BaseDataQualityBackend):
39
+ """Truthound-based data quality backend.
40
+
41
+ This backend uses the truthound library for all data quality operations.
42
+ Truthound imports are lazy-loaded to allow the dashboard to start
43
+ even if truthound is not installed (for testing or limited functionality).
44
+
45
+ Example:
46
+ backend = TruthoundBackend()
47
+ if backend.is_available():
48
+ result = await backend.check("data.csv")
49
+ """
50
+
51
def __init__(self, max_workers: int = 4) -> None:
    """Set up the backend without importing truthound.

    Args:
        max_workers: Maximum worker threads for async operations;
            forwarded unchanged to the base backend constructor.
    """
    super().__init__(max_workers=max_workers)
    # Converter used by the _convert_* helpers on raw truthound results.
    self._converter = TruthoundResultConverter()
    # Placeholder for the truthound module; filled in by _get_truthound().
    self._th = None
60
+
61
+ def _get_truthound(self):
62
+ """Get truthound module with lazy loading.
63
+
64
+ Returns:
65
+ Truthound module.
66
+
67
+ Raises:
68
+ BackendUnavailableError: If truthound is not installed.
69
+ """
70
+ if self._th is None:
71
+ try:
72
+ import truthound as th
73
+ self._th = th
74
+ except ImportError as e:
75
+ raise BackendUnavailableError(
76
+ "truthound",
77
+ "Library not installed. Install with: pip install truthound"
78
+ ) from e
79
+ return self._th
80
+
81
+ def is_available(self) -> bool:
82
+ """Check if truthound is available.
83
+
84
+ Returns:
85
+ True if truthound is installed and importable.
86
+ """
87
+ try:
88
+ import truthound
89
+ return True
90
+ except ImportError:
91
+ return False
92
+
93
+ def get_version(self) -> str | None:
94
+ """Get truthound version.
95
+
96
+ Returns:
97
+ Truthound version string or None if not available.
98
+ """
99
+ try:
100
+ import truthound
101
+ return getattr(truthound, "__version__", None)
102
+ except ImportError:
103
+ return None
104
+
105
+ def _resolve_data_input(self, data: DataInput) -> Any:
106
+ """Resolve DataInput to a format truthound can process.
107
+
108
+ Truthound 2.x accepts DataSource objects directly, so we try to
109
+ pass them through. For backward compatibility, we also support
110
+ extracting LazyFrames from DataSource objects.
111
+
112
+ Args:
113
+ data: File path string, DataSource object, or DataFrame.
114
+
115
+ Returns:
116
+ File path string, DataSource, or DataFrame that truthound can process.
117
+ """
118
+ if isinstance(data, str):
119
+ return data
120
+
121
+ # Check if it's a truthound DataSource (new API)
122
+ # These should be passed directly to truthound functions
123
+ if hasattr(data, "capabilities"):
124
+ # It's likely a truthound 2.x DataSource
125
+ return data
126
+
127
+ # Check if it's a DataSource with to_polars_lazyframe method (legacy)
128
+ if hasattr(data, "to_polars_lazyframe"):
129
+ try:
130
+ return data.to_polars_lazyframe()
131
+ except Exception:
132
+ # If extraction fails, try passing the object directly
133
+ return data
134
+
135
+ # If it's already a LazyFrame or DataFrame, return as-is
136
+ return data
137
+
138
+ def _get_source_capabilities(self, data: DataInput) -> set[str]:
139
+ """Get capabilities from a data source if available.
140
+
141
+ Args:
142
+ data: DataInput object.
143
+
144
+ Returns:
145
+ Set of capability names, or empty set if not available.
146
+ """
147
+ if hasattr(data, "capabilities"):
148
+ try:
149
+ capabilities = data.capabilities
150
+ return {c.name for c in capabilities}
151
+ except Exception:
152
+ pass
153
+ return set()
154
+
155
async def check(
    self,
    data: DataInput,
    *,
    validators: list[str] | None = None,
    validator_config: dict[str, dict[str, Any]] | None = None,
    schema: str | None = None,
    auto_schema: bool = False,
    columns: list[str] | None = None,
    min_severity: str | None = None,
    strict: bool = False,
    parallel: bool = False,
    max_workers: int | None = None,
    pushdown: bool | None = None,
) -> CheckResult:
    """Validate a dataset with ``truthound.check``.

    The resolved input is passed as ``source`` when it exposes a
    ``capabilities`` attribute (truthound 2.x DataSource) and as ``data``
    otherwise (file path or DataFrame).

    Args:
        data: File path, DataSource object, or DataFrame.
        validators: Names of the validators to run.
        validator_config: Per-validator configuration; forwarded only
            when non-empty.
        schema: Path to a schema YAML file.
        auto_schema: Auto-learn a schema for validation.
        columns: Columns to validate; forwarded only when not None.
        min_severity: Minimum severity to report; forwarded only when
            not None.
        strict: Raise on failures; forwarded only when truthy.
        parallel: Use parallel execution.
        max_workers: Max threads for parallel execution; forwarded only
            when not None.
        pushdown: Enable query pushdown. When None and the source
            reports the ``SQL_PUSHDOWN`` capability, it is switched on.

    Returns:
        CheckResult with validation results.

    Raises:
        BackendUnavailableError: If truthound is not installed.
        BackendOperationError: If the call raises an exception whose
            class is defined in a module with "truthound" in its name.
    """
    th = self._get_truthound()
    target = self._resolve_data_input(data)

    if hasattr(target, "capabilities"):
        # truthound 2.x DataSource: goes in through 'source'.
        input_key = "source"
        if pushdown is None and "SQL_PUSHDOWN" in self._get_source_capabilities(target):
            pushdown = True
    else:
        # File path or DataFrame: goes in through 'data'.
        input_key = "data"

    call_args: dict[str, Any] = {
        input_key: target,
        "validators": validators,
        "schema": schema,
        "auto_schema": auto_schema,
        "parallel": parallel,
    }

    # Optional arguments are added only when the caller supplied them.
    if validator_config:
        call_args["validator_config"] = validator_config
    if columns is not None:
        call_args["columns"] = columns
    if min_severity is not None:
        call_args["min_severity"] = min_severity
    if strict:
        call_args["strict"] = strict
    if max_workers is not None:
        call_args["max_workers"] = max_workers
    if pushdown is not None:
        call_args["pushdown"] = pushdown

    try:
        report = await self._run_in_executor(partial(th.check, **call_args))
        return self._convert_check_result(report)
    except Exception as exc:
        # Only exceptions originating from truthound-named modules are
        # wrapped; everything else propagates untouched.
        if "truthound" not in str(type(exc).__module__):
            raise
        raise BackendOperationError(
            "truthound", "check", str(exc), original_error=exc
        ) from exc
245
+
246
async def learn(
    self,
    source: DataInput,
    *,
    infer_constraints: bool = True,
    categorical_threshold: int | None = None,
    sample_size: int | None = None,
) -> LearnResult:
    """Learn a schema from data with ``truthound.learn``.

    Args:
        source: File path or DataSource object.
        infer_constraints: Infer constraints from statistics.
        categorical_threshold: Max unique values for categorical;
            forwarded only when not None.
        sample_size: Number of rows to sample; forwarded only when not
            None.

    Returns:
        LearnResult with schema information.

    Raises:
        BackendUnavailableError: If truthound is not installed.
        BackendOperationError: If the call raises an exception whose
            class is defined in a module with "truthound" in its name.
    """
    th = self._get_truthound()
    target = self._resolve_data_input(source)

    options: dict[str, Any] = {"infer_constraints": infer_constraints}
    optional_settings = (
        ("categorical_threshold", categorical_threshold),
        ("sample_size", sample_size),
    )
    for key, value in optional_settings:
        if value is not None:
            options[key] = value

    try:
        learned = await self._run_in_executor(partial(th.learn, target, **options))
        return self._convert_learn_result(learned)
    except Exception as exc:
        if "truthound" not in str(type(exc).__module__):
            raise
        raise BackendOperationError(
            "truthound", "learn", str(exc), original_error=exc
        ) from exc
286
+
287
async def profile(
    self,
    source: DataInput,
    *,
    sample_size: int | None = None,
    include_patterns: bool = True,
    include_correlations: bool = False,
    include_distributions: bool = True,
    top_n_values: int = 10,
    pattern_sample_size: int = 1000,
    correlation_threshold: float = 0.7,
    min_pattern_match_ratio: float = 0.8,
    n_jobs: int = 1,
) -> ProfileResult:
    """Run data profiling using ``truthound.profile``.

    NOTE: only ``source`` is passed to ``th.profile()``. Every keyword
    option below is accepted for interface compatibility but is
    currently NOT forwarded, because ``th.profile()`` does not support
    the advanced ProfilerConfig options (those are only available via
    DataProfiler with LazyFrame input).
    See: .truthound_docs/python-api/core-functions.md

    Args:
        source: File path or DataSource object.
        sample_size: Max rows to sample (currently ignored).
        include_patterns: Enable pattern detection (currently ignored).
        include_correlations: Calculate correlations (currently ignored).
        include_distributions: Include distribution stats (currently
            ignored).
        top_n_values: Top/bottom values per column (currently ignored).
        pattern_sample_size: Sample size for pattern matching (currently
            ignored).
        correlation_threshold: Minimum correlation to report (currently
            ignored).
        min_pattern_match_ratio: Minimum pattern match ratio (currently
            ignored).
        n_jobs: Number of parallel jobs (currently ignored).

    Returns:
        ProfileResult with profiling information.

    Raises:
        BackendUnavailableError: If truthound is not installed.
        BackendOperationError: If the call raises an exception whose
            class is defined in a module with "truthound" in its name.
    """
    # Resolve DataSource to LazyFrame if needed
    resolved_source = self._resolve_data_input(source)

    th = self._get_truthound()

    # Same error translation as check/learn/compare/scan/mask, so that
    # callers see one exception type for every backend operation.
    try:
        func = partial(th.profile, resolved_source)
        result = await self._run_in_executor(func)
        return self._convert_profile_result(result)
    except Exception as e:
        if "truthound" in str(type(e).__module__):
            raise BackendOperationError(
                "truthound", "profile", str(e), original_error=e
            ) from e
        raise
330
+
331
async def compare(
    self,
    baseline: DataInput,
    current: DataInput,
    *,
    columns: list[str] | None = None,
    method: str = "auto",
    threshold: float | None = None,
    sample_size: int | None = None,
) -> CompareResult:
    """Detect drift between two datasets with ``truthound.compare``.

    Args:
        baseline: Reference data.
        current: Current data to compare against the baseline.
        columns: Columns to compare (always forwarded, even when None).
        method: Detection method.
        threshold: Drift threshold; forwarded only when not None.
        sample_size: Sample size for large datasets; forwarded only
            when not None.

    Returns:
        CompareResult with drift results.

    Raises:
        BackendUnavailableError: If truthound is not installed.
        BackendOperationError: If the call raises an exception whose
            class is defined in a module with "truthound" in its name.
    """
    th = self._get_truthound()

    reference = self._resolve_data_input(baseline)
    candidate = self._resolve_data_input(current)

    options: dict[str, Any] = {"columns": columns, "method": method}
    for key, value in (("threshold", threshold), ("sample_size", sample_size)):
        if value is not None:
            options[key] = value

    try:
        drift = await self._run_in_executor(
            partial(th.compare, reference, candidate, **options)
        )
        return self._convert_compare_result(drift)
    except Exception as exc:
        if "truthound" not in str(type(exc).__module__):
            raise
        raise BackendOperationError(
            "truthound", "compare", str(exc), original_error=exc
        ) from exc
380
+
381
async def scan(
    self,
    data: DataInput,
    *,
    columns: list[str] | None = None,
    regulations: list[str] | None = None,
    min_confidence: float = 0.8,
) -> ScanResult:
    """Scan a dataset for PII with ``truthound.scan``.

    ``th.scan()`` is called with the data only; ``columns``,
    ``regulations`` and ``min_confidence`` are handed to
    ``_convert_scan_result`` so the filtering happens after scanning.
    See: .truthound_docs/python-api/core-functions.md

    Args:
        data: File path or DataSource object.
        columns: Columns to scan.
        regulations: Regulations to check.
        min_confidence: Minimum PII confidence.

    Returns:
        ScanResult with PII findings.

    Raises:
        BackendUnavailableError: If truthound is not installed.
        BackendOperationError: If the call raises an exception whose
            class is defined in a module with "truthound" in its name.
    """
    th = self._get_truthound()
    target = self._resolve_data_input(data)

    # Filters are applied by the converter, not by truthound itself.
    post_filters = {
        "min_confidence": min_confidence,
        "columns": columns,
        "regulations": regulations,
    }

    try:
        findings = await self._run_in_executor(partial(th.scan, target))
        return self._convert_scan_result(findings, **post_filters)
    except Exception as exc:
        if "truthound" not in str(type(exc).__module__):
            raise
        raise BackendOperationError(
            "truthound", "scan", str(exc), original_error=exc
        ) from exc
424
+
425
+ async def mask(
426
+ self,
427
+ data: DataInput,
428
+ output: str,
429
+ *,
430
+ columns: list[str] | None = None,
431
+ strategy: str = "redact",
432
+ ) -> MaskResult:
433
+ """Mask sensitive data using truthound.
434
+
435
+ Args:
436
+ data: File path or DataSource object.
437
+ output: Output file path.
438
+ columns: Columns to mask.
439
+ strategy: Masking strategy.
440
+
441
+ Returns:
442
+ MaskResult with masking details.
443
+ """
444
+ th = self._get_truthound()
445
+
446
+ # Resolve DataSource to LazyFrame if needed
447
+ resolved_data = self._resolve_data_input(data)
448
+
449
+ if strategy not in ("redact", "hash", "fake"):
450
+ raise ValueError(
451
+ f"Invalid strategy: {strategy}. Use 'redact', 'hash', or 'fake'."
452
+ )
453
+
454
+ kwargs: dict[str, Any] = {
455
+ "strategy": strategy,
456
+ }
457
+
458
+ if columns is not None:
459
+ kwargs["columns"] = columns
460
+
461
+ try:
462
+ func = partial(th.mask, resolved_data, **kwargs)
463
+ masked_df = await self._run_in_executor(func)
464
+ return self._convert_mask_result(data, output, masked_df, strategy, columns)
465
+ except Exception as e:
466
+ if "truthound" in str(type(e).__module__):
467
+ raise BackendOperationError(
468
+ "truthound", "mask", str(e), original_error=e
469
+ ) from e
470
+ raise
471
+
472
async def generate_suite(
    self,
    profile: ProfileResult | dict[str, Any],
    *,
    strictness: str = "medium",
    preset: str = "default",
    include: list[str] | None = None,
    exclude: list[str] | None = None,
    output_format: str = "yaml",
) -> GenerateSuiteResult:
    """Build a validation suite from a profile via truthound's profiler.

    Args:
        profile: Profile result or dictionary. A ProfileResult is
            converted with ``to_dict()`` first.
        strictness: Rule strictness level ('loose', 'medium', 'strict',
            case-insensitive). Unrecognised values fall back to medium.
        preset: Rule generation preset.
        include: Rule categories to include; forwarded only when
            non-empty.
        exclude: Rule categories to exclude; forwarded only when
            non-empty.
        output_format: Output format, passed to the result converter.

    Returns:
        GenerateSuiteResult with generated rules.
    """
    from truthound.profiler import generate_suite as build_suite
    from truthound.profiler.generators import Strictness

    levels = {
        "loose": Strictness.LOOSE,
        "medium": Strictness.MEDIUM,
        "strict": Strictness.STRICT,
    }
    level = levels.get(strictness.lower(), Strictness.MEDIUM)

    profile_data = profile.to_dict() if isinstance(profile, ProfileResult) else profile

    options: dict[str, Any] = {"strictness": level, "preset": preset}
    if include:
        options["include"] = include
    if exclude:
        options["exclude"] = exclude

    suite = await self._run_in_executor(partial(build_suite, profile_data, **options))
    # The converter gets the caller's strictness string, not the enum.
    return self._convert_suite_result(suite, strictness, output_format)
524
+
525
+ # =========================================================================
526
+ # Result Conversion Methods
527
+ # =========================================================================
528
+
529
def _convert_check_result(self, result: Any) -> CheckResult:
    """Build a CheckResult from a raw truthound check result.

    The raw result is first passed through the converter; every field
    listed below must be present in the converted mapping.
    """
    payload = self._converter.convert_check_result(result)
    field_names = (
        "passed",
        "has_critical",
        "has_high",
        "total_issues",
        "critical_issues",
        "high_issues",
        "medium_issues",
        "low_issues",
        "source",
        "row_count",
        "column_count",
        "issues",
    )
    return CheckResult(**{name: payload[name] for name in field_names})
546
+
547
def _convert_learn_result(self, result: Any) -> LearnResult:
    """Build a LearnResult from a raw truthound learn result.

    The raw result is first passed through the converter; every field
    listed below must be present in the converted mapping.
    """
    payload = self._converter.convert_learn_result(result)
    field_names = ("schema", "schema_yaml", "row_count", "column_count", "columns")
    return LearnResult(**{name: payload[name] for name in field_names})
557
+
558
def _convert_profile_result(self, result: Any) -> ProfileResult:
    """Build a ProfileResult from a raw truthound profile result.

    The raw result is first passed through the converter. Column entries
    must provide ``name`` and ``physical_type``; the table must provide
    ``name``, ``source``, ``row_count``, ``column_count`` and
    ``estimated_memory_bytes``. All other fields fall back to the
    defaults listed below when missing.
    """
    payload = self._converter.convert_profile_result(result)

    # (field, default) pairs for the optional per-column fields.
    column_defaults = (
        ("inferred_type", "unknown"),
        ("row_count", 0),
        ("null_count", 0),
        ("null_ratio", 0.0),
        ("empty_string_count", 0),
        ("distinct_count", 0),
        ("unique_ratio", 0.0),
        ("is_unique", False),
        ("is_constant", False),
        ("distribution", None),
        ("top_values", None),
        ("bottom_values", None),
        ("min_length", None),
        ("max_length", None),
        ("avg_length", None),
        ("detected_patterns", None),
        ("min_date", None),
        ("max_date", None),
        ("date_gaps", 0),
        ("suggested_validators", None),
        ("profile_duration_ms", 0.0),
    )

    column_profiles = []
    for entry in payload["columns"]:
        optional_fields = {
            field: entry.get(field, default) for field, default in column_defaults
        }
        column_profiles.append(
            ColumnProfileResult(
                name=entry["name"],
                physical_type=entry["physical_type"],
                **optional_fields,
            )
        )

    # (field, default) pairs for the optional table-level fields.
    table_defaults = (
        ("duplicate_row_count", 0),
        ("duplicate_row_ratio", 0.0),
        ("correlations", None),
        ("profiled_at", None),
        ("profile_duration_ms", 0.0),
        ("size_bytes", 0),
    )

    return ProfileResult(
        name=payload["name"],
        source=payload["source"],
        row_count=payload["row_count"],
        column_count=payload["column_count"],
        estimated_memory_bytes=payload["estimated_memory_bytes"],
        columns=column_profiles,
        **{field: payload.get(field, default) for field, default in table_defaults},
    )
605
+
606
def _convert_compare_result(self, result: Any) -> CompareResult:
    """Build a CompareResult from a truthound DriftReport.

    Args:
        result: truthound DriftReport object.

    Returns:
        CompareResult whose fields are read by key from the output of
        ``self._converter.convert_compare_result``; a missing key raises
        ``KeyError``.
    """
    drift = self._converter.convert_compare_result(result)
    required_keys = (
        "baseline_source",
        "current_source",
        "baseline_rows",
        "current_rows",
        "has_drift",
        "has_high_drift",
        "total_columns",
        "drifted_columns",
        "columns",
    )
    return CompareResult(**{key: drift[key] for key in required_keys})
620
+
621
def _convert_scan_result(
    self,
    result: Any,
    *,
    min_confidence: float = 0.8,
    columns: list[str] | None = None,
    regulations: list[str] | None = None,
) -> ScanResult:
    """Convert truthound PIIReport to ScanResult with optional filtering.

    Only the findings are filtered. ``columns_with_pii`` and
    ``total_findings`` are recomputed from the filtered findings, while
    ``has_violations``, ``total_violations`` and ``violations`` are passed
    through from the converter unchanged.

    Args:
        result: truthound PIIReport object.
        min_confidence: Filter findings by minimum confidence (0.0-1.0).
            A finding without a ``confidence`` key is treated as 100 and
            is therefore always kept by this filter.
        columns: Filter findings to specific columns only. ``None`` or an
            empty list disables the filter.
        regulations: Filter findings by regulation types. Findings that
            carry no regulation info are kept. ``None`` or an empty list
            disables the filter.

    Returns:
        ScanResult with filtered PII findings.
    """
    data = self._converter.convert_scan_result(result)

    # Filter findings based on parameters
    findings = data["findings"]
    if findings:
        # Finding confidence is on a 0-100 scale, min_confidence on 0.0-1.0.
        # Scale the finding down instead of the threshold up:
        # ``min_confidence * 100`` can overshoot by one ulp (for example
        # 0.55 * 100 == 55.00000000000001), which would drop a finding that
        # sits exactly on the requested threshold.
        findings = [
            f for f in findings
            if f.get("confidence", 100) / 100 >= min_confidence
        ]

        # Filter by columns
        if columns:
            findings = [f for f in findings if f.get("column") in columns]

        # Filter by regulations; findings without regulation info are kept.
        if regulations:
            findings = [
                f for f in findings
                if not f.get("regulation") or f.get("regulation") in regulations
            ]

    # Recalculate summary stats after filtering
    columns_with_pii = len({f.get("column") for f in findings if f.get("column")})

    return ScanResult(
        source=data["source"],
        row_count=data["row_count"],
        column_count=data["column_count"],
        total_columns_scanned=data["total_columns_scanned"],
        columns_with_pii=columns_with_pii,
        total_findings=len(findings),
        has_violations=data["has_violations"],
        total_violations=data["total_violations"],
        findings=findings,
        violations=data["violations"],
    )
680
+
681
def _convert_mask_result(
    self,
    source: DataInput,
    output: str,
    masked_df: Any,
    strategy: str,
    columns: list[str] | None,
) -> MaskResult:
    """Build a MaskResult from the outcome of a truthound mask run.

    All five arguments are forwarded, in order, to
    ``self._converter.convert_mask_result``; the MaskResult fields are then
    read by key from what the converter returns.

    Args:
        source: Data input that was masked.
        output: Output location passed through to the converter.
        masked_df: Masked data object produced by truthound.
        strategy: Masking strategy that was applied.
        columns: Columns requested for masking, or None.

    Returns:
        MaskResult populated from the converted data.
    """
    converted = self._converter.convert_mask_result(
        source, output, masked_df, strategy, columns
    )
    wanted = (
        "source",
        "output_path",
        "row_count",
        "column_count",
        "columns_masked",
        "strategy",
        "original_columns",
    )
    return MaskResult(**{key: converted[key] for key in wanted})
702
+
703
def _convert_suite_result(
    self,
    suite: Any,
    strictness: str,
    output_format: str,
) -> GenerateSuiteResult:
    """Build a GenerateSuiteResult from a truthound ValidationSuite.

    Args:
        suite: truthound ValidationSuite object.
        strictness: Strictness level forwarded to the converter.
        output_format: Output format forwarded to the converter.

    Returns:
        GenerateSuiteResult whose fields are read by key from the output
        of ``self._converter.convert_suite_result``.
    """
    converted = self._converter.convert_suite_result(
        suite, strictness, output_format
    )

    kwargs = {}
    for key in (
        "rules",
        "rule_count",
        "categories",
        "strictness",
        "yaml_content",
        "json_content",
    ):
        kwargs[key] = converted[key]
    return GenerateSuiteResult(**kwargs)