truthound-dashboard 1.4.4-py3-none-any.whl → 1.5.0-py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (205)
  1. truthound_dashboard/api/alerts.py +75 -86
  2. truthound_dashboard/api/anomaly.py +7 -13
  3. truthound_dashboard/api/cross_alerts.py +38 -52
  4. truthound_dashboard/api/drift.py +49 -59
  5. truthound_dashboard/api/drift_monitor.py +234 -79
  6. truthound_dashboard/api/enterprise_sampling.py +498 -0
  7. truthound_dashboard/api/history.py +57 -5
  8. truthound_dashboard/api/lineage.py +3 -48
  9. truthound_dashboard/api/maintenance.py +104 -49
  10. truthound_dashboard/api/mask.py +1 -2
  11. truthound_dashboard/api/middleware.py +2 -1
  12. truthound_dashboard/api/model_monitoring.py +435 -311
  13. truthound_dashboard/api/notifications.py +227 -191
  14. truthound_dashboard/api/notifications_advanced.py +21 -20
  15. truthound_dashboard/api/observability.py +586 -0
  16. truthound_dashboard/api/plugins.py +2 -433
  17. truthound_dashboard/api/profile.py +199 -37
  18. truthound_dashboard/api/quality_reporter.py +701 -0
  19. truthound_dashboard/api/reports.py +7 -16
  20. truthound_dashboard/api/router.py +66 -0
  21. truthound_dashboard/api/rule_suggestions.py +5 -5
  22. truthound_dashboard/api/scan.py +17 -19
  23. truthound_dashboard/api/schedules.py +85 -50
  24. truthound_dashboard/api/schema_evolution.py +6 -6
  25. truthound_dashboard/api/schema_watcher.py +667 -0
  26. truthound_dashboard/api/sources.py +98 -27
  27. truthound_dashboard/api/tiering.py +1323 -0
  28. truthound_dashboard/api/triggers.py +14 -11
  29. truthound_dashboard/api/validations.py +12 -11
  30. truthound_dashboard/api/versioning.py +1 -6
  31. truthound_dashboard/core/__init__.py +129 -3
  32. truthound_dashboard/core/actions/__init__.py +62 -0
  33. truthound_dashboard/core/actions/custom.py +426 -0
  34. truthound_dashboard/core/actions/notifications.py +910 -0
  35. truthound_dashboard/core/actions/storage.py +472 -0
  36. truthound_dashboard/core/actions/webhook.py +281 -0
  37. truthound_dashboard/core/anomaly.py +262 -67
  38. truthound_dashboard/core/anomaly_explainer.py +4 -3
  39. truthound_dashboard/core/backends/__init__.py +67 -0
  40. truthound_dashboard/core/backends/base.py +299 -0
  41. truthound_dashboard/core/backends/errors.py +191 -0
  42. truthound_dashboard/core/backends/factory.py +423 -0
  43. truthound_dashboard/core/backends/mock_backend.py +451 -0
  44. truthound_dashboard/core/backends/truthound_backend.py +718 -0
  45. truthound_dashboard/core/checkpoint/__init__.py +87 -0
  46. truthound_dashboard/core/checkpoint/adapters.py +814 -0
  47. truthound_dashboard/core/checkpoint/checkpoint.py +491 -0
  48. truthound_dashboard/core/checkpoint/runner.py +270 -0
  49. truthound_dashboard/core/connections.py +437 -10
  50. truthound_dashboard/core/converters/__init__.py +14 -0
  51. truthound_dashboard/core/converters/truthound.py +620 -0
  52. truthound_dashboard/core/cross_alerts.py +540 -320
  53. truthound_dashboard/core/datasource_factory.py +1672 -0
  54. truthound_dashboard/core/drift_monitor.py +216 -20
  55. truthound_dashboard/core/enterprise_sampling.py +1291 -0
  56. truthound_dashboard/core/interfaces/__init__.py +225 -0
  57. truthound_dashboard/core/interfaces/actions.py +652 -0
  58. truthound_dashboard/core/interfaces/base.py +247 -0
  59. truthound_dashboard/core/interfaces/checkpoint.py +676 -0
  60. truthound_dashboard/core/interfaces/protocols.py +664 -0
  61. truthound_dashboard/core/interfaces/reporters.py +650 -0
  62. truthound_dashboard/core/interfaces/routing.py +646 -0
  63. truthound_dashboard/core/interfaces/triggers.py +619 -0
  64. truthound_dashboard/core/lineage.py +407 -71
  65. truthound_dashboard/core/model_monitoring.py +431 -3
  66. truthound_dashboard/core/notifications/base.py +4 -0
  67. truthound_dashboard/core/notifications/channels.py +501 -1203
  68. truthound_dashboard/core/notifications/deduplication/__init__.py +81 -115
  69. truthound_dashboard/core/notifications/deduplication/service.py +131 -348
  70. truthound_dashboard/core/notifications/dispatcher.py +202 -11
  71. truthound_dashboard/core/notifications/escalation/__init__.py +119 -106
  72. truthound_dashboard/core/notifications/escalation/engine.py +168 -358
  73. truthound_dashboard/core/notifications/routing/__init__.py +88 -128
  74. truthound_dashboard/core/notifications/routing/engine.py +90 -317
  75. truthound_dashboard/core/notifications/stats_aggregator.py +246 -1
  76. truthound_dashboard/core/notifications/throttling/__init__.py +67 -50
  77. truthound_dashboard/core/notifications/throttling/builder.py +117 -255
  78. truthound_dashboard/core/notifications/truthound_adapter.py +842 -0
  79. truthound_dashboard/core/phase5/collaboration.py +1 -1
  80. truthound_dashboard/core/plugins/lifecycle/__init__.py +0 -13
  81. truthound_dashboard/core/quality_reporter.py +1359 -0
  82. truthound_dashboard/core/report_history.py +0 -6
  83. truthound_dashboard/core/reporters/__init__.py +175 -14
  84. truthound_dashboard/core/reporters/adapters.py +943 -0
  85. truthound_dashboard/core/reporters/base.py +0 -3
  86. truthound_dashboard/core/reporters/builtin/__init__.py +18 -0
  87. truthound_dashboard/core/reporters/builtin/csv_reporter.py +111 -0
  88. truthound_dashboard/core/reporters/builtin/html_reporter.py +270 -0
  89. truthound_dashboard/core/reporters/builtin/json_reporter.py +127 -0
  90. truthound_dashboard/core/reporters/compat.py +266 -0
  91. truthound_dashboard/core/reporters/csv_reporter.py +2 -35
  92. truthound_dashboard/core/reporters/factory.py +526 -0
  93. truthound_dashboard/core/reporters/interfaces.py +745 -0
  94. truthound_dashboard/core/reporters/registry.py +1 -10
  95. truthound_dashboard/core/scheduler.py +165 -0
  96. truthound_dashboard/core/schema_evolution.py +3 -3
  97. truthound_dashboard/core/schema_watcher.py +1528 -0
  98. truthound_dashboard/core/services.py +595 -76
  99. truthound_dashboard/core/store_manager.py +810 -0
  100. truthound_dashboard/core/streaming_anomaly.py +169 -4
  101. truthound_dashboard/core/tiering.py +1309 -0
  102. truthound_dashboard/core/triggers/evaluators.py +178 -8
  103. truthound_dashboard/core/truthound_adapter.py +2620 -197
  104. truthound_dashboard/core/unified_alerts.py +23 -20
  105. truthound_dashboard/db/__init__.py +8 -0
  106. truthound_dashboard/db/database.py +8 -2
  107. truthound_dashboard/db/models.py +944 -25
  108. truthound_dashboard/db/repository.py +2 -0
  109. truthound_dashboard/main.py +11 -0
  110. truthound_dashboard/schemas/__init__.py +177 -16
  111. truthound_dashboard/schemas/base.py +44 -23
  112. truthound_dashboard/schemas/collaboration.py +19 -6
  113. truthound_dashboard/schemas/cross_alerts.py +19 -3
  114. truthound_dashboard/schemas/drift.py +61 -55
  115. truthound_dashboard/schemas/drift_monitor.py +67 -23
  116. truthound_dashboard/schemas/enterprise_sampling.py +653 -0
  117. truthound_dashboard/schemas/lineage.py +0 -33
  118. truthound_dashboard/schemas/mask.py +10 -8
  119. truthound_dashboard/schemas/model_monitoring.py +89 -10
  120. truthound_dashboard/schemas/notifications_advanced.py +13 -0
  121. truthound_dashboard/schemas/observability.py +453 -0
  122. truthound_dashboard/schemas/plugins.py +0 -280
  123. truthound_dashboard/schemas/profile.py +154 -247
  124. truthound_dashboard/schemas/quality_reporter.py +403 -0
  125. truthound_dashboard/schemas/reports.py +2 -2
  126. truthound_dashboard/schemas/rule_suggestion.py +8 -1
  127. truthound_dashboard/schemas/scan.py +4 -24
  128. truthound_dashboard/schemas/schedule.py +11 -3
  129. truthound_dashboard/schemas/schema_watcher.py +727 -0
  130. truthound_dashboard/schemas/source.py +17 -2
  131. truthound_dashboard/schemas/tiering.py +822 -0
  132. truthound_dashboard/schemas/triggers.py +16 -0
  133. truthound_dashboard/schemas/unified_alerts.py +7 -0
  134. truthound_dashboard/schemas/validation.py +0 -13
  135. truthound_dashboard/schemas/validators/base.py +41 -21
  136. truthound_dashboard/schemas/validators/business_rule_validators.py +244 -0
  137. truthound_dashboard/schemas/validators/localization_validators.py +273 -0
  138. truthound_dashboard/schemas/validators/ml_feature_validators.py +308 -0
  139. truthound_dashboard/schemas/validators/profiling_validators.py +275 -0
  140. truthound_dashboard/schemas/validators/referential_validators.py +312 -0
  141. truthound_dashboard/schemas/validators/registry.py +93 -8
  142. truthound_dashboard/schemas/validators/timeseries_validators.py +389 -0
  143. truthound_dashboard/schemas/versioning.py +1 -6
  144. truthound_dashboard/static/index.html +2 -2
  145. truthound_dashboard-1.5.0.dist-info/METADATA +309 -0
  146. {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/RECORD +149 -148
  147. truthound_dashboard/core/plugins/hooks/__init__.py +0 -63
  148. truthound_dashboard/core/plugins/hooks/decorators.py +0 -367
  149. truthound_dashboard/core/plugins/hooks/manager.py +0 -403
  150. truthound_dashboard/core/plugins/hooks/protocols.py +0 -265
  151. truthound_dashboard/core/plugins/lifecycle/hot_reload.py +0 -584
  152. truthound_dashboard/core/reporters/junit_reporter.py +0 -233
  153. truthound_dashboard/core/reporters/markdown_reporter.py +0 -207
  154. truthound_dashboard/core/reporters/pdf_reporter.py +0 -209
  155. truthound_dashboard/static/assets/_baseUniq-BcrSP13d.js +0 -1
  156. truthound_dashboard/static/assets/arc-DlYjKwIL.js +0 -1
  157. truthound_dashboard/static/assets/architectureDiagram-VXUJARFQ-Bb2drbQM.js +0 -36
  158. truthound_dashboard/static/assets/blockDiagram-VD42YOAC-BlsPG1CH.js +0 -122
  159. truthound_dashboard/static/assets/c4Diagram-YG6GDRKO-B9JdUoaC.js +0 -10
  160. truthound_dashboard/static/assets/channel-Q6mHF1Hd.js +0 -1
  161. truthound_dashboard/static/assets/chunk-4BX2VUAB-DmyoPVuJ.js +0 -1
  162. truthound_dashboard/static/assets/chunk-55IACEB6-Bcz6Siv8.js +0 -1
  163. truthound_dashboard/static/assets/chunk-B4BG7PRW-Br3G5Rum.js +0 -165
  164. truthound_dashboard/static/assets/chunk-DI55MBZ5-DuM9c23u.js +0 -220
  165. truthound_dashboard/static/assets/chunk-FMBD7UC4-DNU-5mvT.js +0 -15
  166. truthound_dashboard/static/assets/chunk-QN33PNHL-Im2yNcmS.js +0 -1
  167. truthound_dashboard/static/assets/chunk-QZHKN3VN-kZr8XFm1.js +0 -1
  168. truthound_dashboard/static/assets/chunk-TZMSLE5B-Q__360q_.js +0 -1
  169. truthound_dashboard/static/assets/classDiagram-2ON5EDUG-vtixxUyK.js +0 -1
  170. truthound_dashboard/static/assets/classDiagram-v2-WZHVMYZB-vtixxUyK.js +0 -1
  171. truthound_dashboard/static/assets/clone-BOt2LwD0.js +0 -1
  172. truthound_dashboard/static/assets/cose-bilkent-S5V4N54A-CBDw6iac.js +0 -1
  173. truthound_dashboard/static/assets/dagre-6UL2VRFP-XdKqmmY9.js +0 -4
  174. truthound_dashboard/static/assets/diagram-PSM6KHXK-DAZ8nx9V.js +0 -24
  175. truthound_dashboard/static/assets/diagram-QEK2KX5R-BRvDTbGD.js +0 -43
  176. truthound_dashboard/static/assets/diagram-S2PKOQOG-bQcczUkl.js +0 -24
  177. truthound_dashboard/static/assets/erDiagram-Q2GNP2WA-DPje7VMN.js +0 -60
  178. truthound_dashboard/static/assets/flowDiagram-NV44I4VS-B7BVtFVS.js +0 -162
  179. truthound_dashboard/static/assets/ganttDiagram-JELNMOA3-D6WKSS7U.js +0 -267
  180. truthound_dashboard/static/assets/gitGraphDiagram-NY62KEGX-D3vtVd3y.js +0 -65
  181. truthound_dashboard/static/assets/graph-BKgNKZVp.js +0 -1
  182. truthound_dashboard/static/assets/index-C6JSrkHo.css +0 -1
  183. truthound_dashboard/static/assets/index-DkU82VsU.js +0 -1800
  184. truthound_dashboard/static/assets/infoDiagram-WHAUD3N6-DnNCT429.js +0 -2
  185. truthound_dashboard/static/assets/journeyDiagram-XKPGCS4Q-DGiMozqS.js +0 -139
  186. truthound_dashboard/static/assets/kanban-definition-3W4ZIXB7-BV2gUgli.js +0 -89
  187. truthound_dashboard/static/assets/katex-Cu_Erd72.js +0 -261
  188. truthound_dashboard/static/assets/layout-DI2MfQ5G.js +0 -1
  189. truthound_dashboard/static/assets/min-DYdgXVcT.js +0 -1
  190. truthound_dashboard/static/assets/mindmap-definition-VGOIOE7T-C7x4ruxz.js +0 -68
  191. truthound_dashboard/static/assets/pieDiagram-ADFJNKIX-CAJaAB9f.js +0 -30
  192. truthound_dashboard/static/assets/quadrantDiagram-AYHSOK5B-DeqwDI46.js +0 -7
  193. truthound_dashboard/static/assets/requirementDiagram-UZGBJVZJ-e3XDpZIM.js +0 -64
  194. truthound_dashboard/static/assets/sankeyDiagram-TZEHDZUN-CNnAv5Ux.js +0 -10
  195. truthound_dashboard/static/assets/sequenceDiagram-WL72ISMW-Dsne-Of3.js +0 -145
  196. truthound_dashboard/static/assets/stateDiagram-FKZM4ZOC-Ee0sQXyb.js +0 -1
  197. truthound_dashboard/static/assets/stateDiagram-v2-4FDKWEC3-B26KqW_W.js +0 -1
  198. truthound_dashboard/static/assets/timeline-definition-IT6M3QCI-DZYi2yl3.js +0 -61
  199. truthound_dashboard/static/assets/treemap-KMMF4GRG-CY3f8In2.js +0 -128
  200. truthound_dashboard/static/assets/unmerged_dictionaries-Dd7xcPWG.js +0 -1
  201. truthound_dashboard/static/assets/xychartDiagram-PRI3JC2R-CS7fydZZ.js +0 -7
  202. truthound_dashboard-1.4.4.dist-info/METADATA +0 -507
  203. {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/WHEEL +0 -0
  204. {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/entry_points.txt +0 -0
  205. {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -4,16 +4,25 @@ This module contains service classes that implement business logic
 for the dashboard, separating concerns from API handlers.
 
 Services handle:
-- Data source management
+- Data source management with multi-backend support
 - Schema learning and storage
 - Validation execution and tracking
 - Data profiling with history
 - Drift detection
 - Schedule management
+
+Supports various data backends through truthound's DataSource abstraction:
+- File: CSV, Parquet, JSON, NDJSON, JSONL
+- SQL: SQLite, PostgreSQL, MySQL
+- Cloud DW: BigQuery, Snowflake, Redshift, Databricks
+- Enterprise: Oracle, SQL Server
+- NoSQL: MongoDB, Elasticsearch (async)
+- Streaming: Kafka (async)
 """
 
 from __future__ import annotations
 
+import logging
 from collections import Counter, defaultdict
 from collections.abc import Sequence
 from datetime import datetime, timedelta
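The new module docstring lists the backend families now reachable through truthound's DataSource abstraction. As a rough orientation, the sketch below shows what per-backend source configs might look like when fed to the new datasource_factory helpers; the exact keys accepted by create_datasource() live in datasource_factory.py and are assumed here for illustration, not confirmed by this diff.

    # Hypothetical configs; key names are illustrative assumptions.
    file_cfg = {"type": "csv", "path": "/data/orders.csv"}
    sql_cfg = {"type": "postgresql", "host": "db.internal", "database": "sales", "table": "orders"}
    nosql_cfg = {"type": "mongodb", "uri": "mongodb://localhost:27017", "database": "app", "collection": "events"}
    # The services below build {"type": source_type, **config} and hand it to
    # create_datasource() (or create_datasource_async() for MongoDB/Elasticsearch/Kafka).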
@@ -35,13 +44,24 @@ from truthound_dashboard.db import (
     Validation,
 )
 
+from .datasource_factory import (
+    SourceConfig,
+    SourceType,
+    create_datasource,
+    get_source_path_or_datasource,
+)
 from .truthound_adapter import (
     CheckResult,
+    DataInput,
+    GenerateSuiteResult,
     MaskResult,
+    ProfileResult,
     ScanResult,
     get_adapter,
 )
 
+logger = logging.getLogger(__name__)
+
 
 class SourceRepository(BaseRepository[Source]):
     """Repository for Source model operations."""
@@ -82,6 +102,74 @@ class SourceRepository(BaseRepository[Source]):
         return result.scalar_one_or_none()
 
 
+def get_data_input_from_source(source: Source) -> DataInput:
+    """Get DataInput (path or DataSource object) from Source model.
+
+    This helper function creates the appropriate data input for truthound
+    operations based on the source type and configuration.
+
+    For file-based sources, returns the file path string.
+    For database sources, creates and returns a DataSource object.
+
+    Args:
+        source: Source database model.
+
+    Returns:
+        File path string for file sources, DataSource object for others.
+
+    Raises:
+        ValueError: If source configuration is invalid.
+    """
+    source_type = source.type.lower()
+    config = source.config or {}
+
+    # For file sources, return path directly
+    if SourceType.is_file_type(source_type):
+        path = config.get("path") or source.source_path
+        if not path:
+            raise ValueError(f"No path configured for file source: {source.name}")
+        return path
+
+    # For database sources, create DataSource object
+    try:
+        full_config = {"type": source_type, **config}
+        return create_datasource(full_config)
+    except Exception as e:
+        logger.error(f"Failed to create DataSource for {source.name}: {e}")
+        raise ValueError(f"Failed to create DataSource: {e}") from e
+
+
+async def get_async_data_input_from_source(source: Source) -> DataInput:
+    """Get DataInput for async sources (MongoDB, Elasticsearch, Kafka).
+
+    This helper function creates DataSource objects for sources that
+    require async initialization.
+
+    Args:
+        source: Source database model.
+
+    Returns:
+        DataSource object.
+
+    Raises:
+        ValueError: If source type doesn't require async or config is invalid.
+    """
+    from .datasource_factory import create_datasource_async
+
+    source_type = source.type.lower()
+    config = source.config or {}
+
+    if not SourceType.is_async_type(source_type):
+        raise ValueError(f"Source type '{source_type}' doesn't require async creation")
+
+    try:
+        full_config = {"type": source_type, **config}
+        return await create_datasource_async(full_config)
+    except Exception as e:
+        logger.error(f"Failed to create async DataSource for {source.name}: {e}")
+        raise ValueError(f"Failed to create async DataSource: {e}") from e
+
+
 class SchemaRepository(BaseRepository[Schema]):
     """Repository for Schema model operations."""
 
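These two helpers are the seam every service below goes through. A minimal sketch of the calling pattern they enable, mirroring the branching used repeatedly later in this file (the wrapper name resolve_input is hypothetical):

    # Resolve a Source row to the input truthound expects: a file path for
    # file-type sources, a DataSource object otherwise.
    async def resolve_input(source: Source) -> DataInput:
        if SourceType.is_async_type(source.type):
            # MongoDB, Elasticsearch, Kafka need async construction
            return await get_async_data_input_from_source(source)
        return get_data_input_from_source(source)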
@@ -196,22 +284,28 @@ class ValidationRepository(BaseRepository[Validation]):
         self,
         source_id: str,
         *,
+        offset: int = 0,
         limit: int = 20,
-    ) -> Sequence[Validation]:
-        """Get validations for a source.
+    ) -> tuple[Sequence[Validation], int]:
+        """Get validations for a source with pagination.
 
         Args:
             source_id: Source ID.
+            offset: Number of items to skip.
             limit: Maximum to return.
 
         Returns:
-            Sequence of validations.
+            Tuple of (validations, total_count).
         """
-        return await self.list(
+        filters = [Validation.source_id == source_id]
+        validations = await self.list(
+            offset=offset,
             limit=limit,
-            filters=[Validation.source_id == source_id],
+            filters=filters,
             order_by=Validation.created_at.desc(),
         )
+        total = await self.count(filters=filters)
+        return validations, total
 
     async def get_latest_for_source(self, source_id: str) -> Validation | None:
         """Get most recent validation for a source.
@@ -230,6 +324,24 @@ class ValidationRepository(BaseRepository[Validation]):
         )
         return result.scalar_one_or_none()
 
+    async def get_with_source(self, validation_id: str) -> Validation | None:
+        """Get validation by ID with source eagerly loaded.
+
+        Args:
+            validation_id: Validation ID.
+
+        Returns:
+            Validation with source loaded, or None.
+        """
+        from sqlalchemy.orm import selectinload
+
+        result = await self.session.execute(
+            select(Validation)
+            .options(selectinload(Validation.source))
+            .where(Validation.id == validation_id)
+        )
+        return result.scalar_one_or_none()
+
 
 class SourceService:
     """Service for managing data sources.
@@ -273,6 +385,19 @@ class SourceService:
             return await self.repository.get_active(offset=offset, limit=limit)
         return await self.repository.list(offset=offset, limit=limit)
 
+    async def count(self, *, active_only: bool = True) -> int:
+        """Count sources.
+
+        Args:
+            active_only: Only count active sources.
+
+        Returns:
+            Total count of sources.
+        """
+        if active_only:
+            return await self.repository.count(filters=[Source.is_active == True])
+        return await self.repository.count()
+
     async def create(
         self,
         *,
@@ -372,7 +497,8 @@ class SourceService:
         Returns:
             Sequence of validations.
         """
-        return await self.validation_repo.get_for_source(source_id, limit=limit)
+        validations, _ = await self.validation_repo.get_for_source(source_id, limit=limit)
+        return validations
 
 
 class ValidationService:
@@ -380,6 +506,14 @@ class ValidationService:
 
     Handles validation execution, result storage, and history.
     Supports both built-in truthound validators and custom validators.
+
+    Supports various data backends through truthound's DataSource abstraction:
+    - File: CSV, Parquet, JSON, NDJSON, JSONL
+    - SQL: SQLite, PostgreSQL, MySQL
+    - Cloud DW: BigQuery, Snowflake, Redshift, Databricks
+    - Enterprise: Oracle, SQL Server
+    - NoSQL: MongoDB, Elasticsearch (async)
+    - Streaming: Kafka (async)
     """
 
     def __init__(self, session: AsyncSession) -> None:
@@ -399,13 +533,11 @@ class ValidationService:
         source_id: str,
         *,
         validators: list[str] | None = None,
-        validator_params: dict[str, dict[str, Any]] | None = None,
+        validator_config: dict[str, dict[str, Any]] | None = None,
         custom_validators: list[dict[str, Any]] | None = None,
         schema_path: str | None = None,
         auto_schema: bool = False,
-        columns: list[str] | None = None,
         min_severity: str | None = None,
-        strict: bool = False,
         parallel: bool = False,
         max_workers: int | None = None,
         pushdown: bool | None = None,
@@ -416,20 +548,22 @@ class ValidationService:
         allowing fine-grained control over validation behavior. It also supports
         running custom validators alongside built-in validators.
 
+        Supports all data source types including files, SQL databases,
+        cloud data warehouses, and async sources (MongoDB, Elasticsearch, Kafka).
+
         Args:
             source_id: Source ID to validate.
            validators: Optional validator list. If None, all validators run.
-            validator_params: Optional per-validator parameters.
+            validator_config: Optional per-validator configuration (truthound 2.x).
                Format: {"ValidatorName": {"param1": value1, "param2": value2}}
-                Example: {"Null": {"columns": ["email"], "mostly": 0.95},
+                Example: {"Null": {"columns": ("email",), "mostly": 0.95},
                    "CompletenessRatio": {"column": "phone", "min_ratio": 0.98}}
+                Note: columns should be tuples, not lists, for truthound 2.x.
            custom_validators: Optional list of custom validator configs.
                Format: [{"validator_id": "...", "column": "...", "params": {...}}]
            schema_path: Optional schema file path.
            auto_schema: Auto-learn schema if True.
-            columns: Columns to validate. If None, validates all columns.
            min_severity: Minimum severity to report ("low", "medium", "high", "critical").
-            strict: If True, raises exception on validation failures.
            parallel: If True, uses DAG-based parallel execution.
            max_workers: Max threads for parallel execution (requires parallel=True).
            pushdown: Enable query pushdown for SQL sources. None uses auto-detection.
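The validator_params → validator_config rename is the user-visible part of the truthound 2.x migration; per the note above, column selections move from lists to tuples. Spelled out, a config following the documented format might look like this (validator names are taken from the docstring example, everything else is illustrative):

    validator_config = {
        "Null": {"columns": ("email",), "mostly": 0.95},              # tuple, not list
        "CompletenessRatio": {"column": "phone", "min_ratio": 0.98},
    }
    # Passed to the service method above together with, e.g.,
    # validators=["Null", "CompletenessRatio"], min_severity="medium", parallel=True.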
@@ -438,7 +572,7 @@ class ValidationService:
             Validation record with results.
 
         Raises:
-            ValueError: If source not found.
+            ValueError: If source not found or data source creation fails.
         """
         # Get source
         source = await self.source_repo.get_by_id(source_id)
@@ -453,16 +587,21 @@ class ValidationService:
         )
 
         try:
+            # Get data input based on source type
+            # For async sources (MongoDB, Elasticsearch, Kafka), use async creation
+            if SourceType.is_async_type(source.type):
+                data_input = await get_async_data_input_from_source(source)
+            else:
+                data_input = get_data_input_from_source(source)
+
             # Run built-in validation with all supported parameters
             result = await self.adapter.check(
-                source.source_path or "",
+                data_input,
                 validators=validators,
-                validator_params=validator_params,
+                validator_config=validator_config,
                 schema=schema_path,
                 auto_schema=auto_schema,
-                columns=columns,
                 min_severity=min_severity,
-                strict=strict,
                 parallel=parallel,
                 max_workers=max_workers,
                 pushdown=pushdown,
@@ -649,39 +788,49 @@ class ValidationService:
             delta = validation.completed_at - validation.started_at
             validation.duration_ms = int(delta.total_seconds() * 1000)
 
-    async def get_validation(self, validation_id: str) -> Validation | None:
+    async def get_validation(
+        self, validation_id: str, *, with_source: bool = False
+    ) -> Validation | None:
         """Get validation by ID.
 
         Args:
             validation_id: Validation ID.
+            with_source: If True, eagerly load the source relationship.
 
         Returns:
             Validation or None.
         """
+        if with_source:
+            return await self.validation_repo.get_with_source(validation_id)
         return await self.validation_repo.get_by_id(validation_id)
 
     async def list_for_source(
         self,
         source_id: str,
         *,
+        offset: int = 0,
         limit: int = 20,
-    ) -> Sequence[Validation]:
-        """List validations for a source.
+    ) -> tuple[Sequence[Validation], int]:
+        """List validations for a source with pagination.
 
         Args:
             source_id: Source ID.
+            offset: Number of items to skip.
             limit: Maximum to return.
 
         Returns:
-            Sequence of validations.
+            Tuple of (validations, total_count).
         """
-        return await self.validation_repo.get_for_source(source_id, limit=limit)
+        return await self.validation_repo.get_for_source(
+            source_id, offset=offset, limit=limit
+        )
 
 
 class SchemaService:
     """Service for schema learning and management.
 
     Handles schema learning, storage, and retrieval.
+    Supports all data source types through DataSource abstraction.
     """
 
     def __init__(self, session: AsyncSession) -> None:
@@ -706,7 +855,7 @@ class SchemaService:
         """Learn and store schema for a source.
 
         Wraps truthound's th.learn() with full parameter support for schema
-        inference customization.
+        inference customization. Supports all data source types.
 
         Args:
             source_id: Source ID.
@@ -722,16 +871,22 @@ class SchemaService:
             Created schema record.
 
         Raises:
-            ValueError: If source not found.
+            ValueError: If source not found or data source creation fails.
         """
         # Get source
         source = await self.source_repo.get_by_id(source_id)
         if source is None:
             raise ValueError(f"Source '{source_id}' not found")
 
+        # Get data input based on source type
+        if SourceType.is_async_type(source.type):
+            data_input = await get_async_data_input_from_source(source)
+        else:
+            data_input = get_data_input_from_source(source)
+
         # Learn schema with all parameters
         result = await self.adapter.learn(
-            source.source_path or "",
+            data_input,
             infer_constraints=infer_constraints,
             categorical_threshold=categorical_threshold,
             sample_size=sample_size,
@@ -1019,17 +1174,20 @@ class ProfileRepository(BaseRepository[Profile]):
         source_id: str,
         *,
         limit: int = 20,
+        offset: int = 0,
     ) -> Sequence[Profile]:
         """Get profiles for a source.
 
         Args:
             source_id: Source ID.
             limit: Maximum to return.
+            offset: Number to skip.
 
         Returns:
             Sequence of profiles.
         """
         return await self.list(
+            offset=offset,
             limit=limit,
             filters=[Profile.source_id == source_id],
             order_by=Profile.created_at.desc(),
@@ -1167,6 +1325,10 @@ class ProfileService:
     """Service for data profiling with history tracking.
 
     Handles data profiling operations and stores results.
+    Uses the new truthound profiler API with ProfilerConfig for
+    fine-grained control over profiling behavior.
+
+    Supports all data source types through DataSource abstraction.
     """
 
     def __init__(self, session: AsyncSession) -> None:
@@ -1184,30 +1346,104 @@ class ProfileService:
         self,
         source_id: str,
         *,
-        sample_size: int | None = None,
         save: bool = True,
     ) -> Profile:
         """Profile a data source and optionally save result.
 
+        Note: truthound's th.profile() only accepts (data, source) parameters.
+        Advanced configuration options are NOT supported by the underlying library.
+
+        Supports all data source types including files, SQL databases,
+        cloud data warehouses, and async sources.
+
         Args:
             source_id: Source ID to profile.
-            sample_size: Maximum number of rows to sample for profiling.
-                If None, profiles all data. Useful for large datasets.
             save: Whether to save profile to database.
 
         Returns:
             Profile model with results.
 
         Raises:
-            ValueError: If source not found.
+            ValueError: If source not found or data source creation fails.
         """
         source = await self.source_repo.get_by_id(source_id)
         if source is None:
             raise ValueError(f"Source '{source_id}' not found")
 
-        result = await self.adapter.profile(
-            source.source_path or "",
-            sample_size=sample_size,
+        # Get data input based on source type
+        if SourceType.is_async_type(source.type):
+            data_input = await get_async_data_input_from_source(source)
+        else:
+            data_input = get_data_input_from_source(source)
+
+        result = await self.adapter.profile(data_input)
+
+        if save:
+            profile = await self.profile_repo.create(
+                source_id=source_id,
+                profile_json=result.to_dict(),
+                row_count=result.row_count,
+                column_count=result.column_count,
+                size_bytes=result.size_bytes or result.estimated_memory_bytes,
+            )
+            return profile
+
+        # Return unsaved profile object
+        profile = Profile(
+            source_id=source_id,
+            profile_json=result.to_dict(),
+            row_count=result.row_count,
+            column_count=result.column_count,
+            size_bytes=result.size_bytes or result.estimated_memory_bytes,
+        )
+        return profile
+
+    async def profile_source_advanced(
+        self,
+        source_id: str,
+        *,
+        config: dict[str, Any] | None = None,
+        save: bool = True,
+    ) -> Profile:
+        """Profile a data source with full ProfilerConfig support.
+
+        Provides direct access to all ProfilerConfig options through
+        a configuration dictionary for maximum flexibility.
+
+        Args:
+            source_id: Source ID to profile.
+            config: ProfilerConfig options as dictionary:
+                - sample_size: int | None (max rows to sample)
+                - random_seed: int (default 42)
+                - include_patterns: bool (default True)
+                - include_correlations: bool (default False)
+                - include_distributions: bool (default True)
+                - top_n_values: int (default 10)
+                - pattern_sample_size: int (default 1000)
+                - correlation_threshold: float (default 0.7)
+                - min_pattern_match_ratio: float (default 0.8)
+                - n_jobs: int (default 1)
+            save: Whether to save profile to database.
+
+        Returns:
+            Profile model with results.
+
+        Raises:
+            ValueError: If source not found or data source creation fails.
+        """
+        source = await self.source_repo.get_by_id(source_id)
+        if source is None:
+            raise ValueError(f"Source '{source_id}' not found")
+
+        # Get data input based on source type
+        if SourceType.is_async_type(source.type):
+            data_input = await get_async_data_input_from_source(source)
+        else:
+            data_input = get_data_input_from_source(source)
+
+        result = await self.adapter.profile_advanced(
+            data_input,
+            config=config,
         )
 
         if save:
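profile_source() is now deliberately minimal, while profile_source_advanced() forwards a plain dict of ProfilerConfig options. A hedged usage sketch using only keys listed in the docstring above (the service variable name is assumed):

    profile = await profile_service.profile_source_advanced(
        source_id,
        config={
            "sample_size": 100_000,        # cap rows for large datasets
            "include_patterns": True,
            "include_correlations": False,
            "top_n_values": 10,
        },
        save=True,
    )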
@@ -1216,7 +1452,7 @@ class ProfileService:
                 profile_json=result.to_dict(),
                 row_count=result.row_count,
                 column_count=result.column_count,
-                size_bytes=result.size_bytes,
+                size_bytes=result.size_bytes or result.estimated_memory_bytes,
             )
             return profile
 
@@ -1226,10 +1462,115 @@ class ProfileService:
             profile_json=result.to_dict(),
             row_count=result.row_count,
             column_count=result.column_count,
-            size_bytes=result.size_bytes,
+            size_bytes=result.size_bytes or result.estimated_memory_bytes,
         )
         return profile
 
+    async def generate_rules_from_profile(
+        self,
+        source_id: str,
+        *,
+        strictness: str = "medium",
+        preset: str = "default",
+        include_categories: list[str] | None = None,
+        exclude_categories: list[str] | None = None,
+        profile_if_needed: bool = True,
+        sample_size: int | None = None,
+    ) -> dict[str, Any]:
+        """Generate validation rules from source profile.
+
+        Uses truthound's generate_suite() to automatically create
+        validation rules based on the profiled data characteristics.
+
+        Args:
+            source_id: Source ID to generate rules for.
+            strictness: Rule strictness level:
+                - "loose": Permissive thresholds, fewer rules
+                - "medium": Balanced defaults (default)
+                - "strict": Tight thresholds, comprehensive rules
+            preset: Rule generation preset:
+                - "default": General purpose
+                - "strict": Production data
+                - "loose": Development/testing
+                - "minimal": Essential rules only
+                - "comprehensive": All available rules
+                - "ci_cd": CI/CD optimized
+                - "schema_only": Structure validation only
+                - "format_only": Format/pattern rules only
+            include_categories: Rule categories to include (None = all).
+            exclude_categories: Rule categories to exclude.
+            profile_if_needed: If True, profile source if no recent profile exists.
+            sample_size: Sample size for profiling if needed.
+
+        Returns:
+            Dictionary with generated rules, YAML content, and metadata.
+
+        Raises:
+            ValueError: If source not found or no profile available.
+        """
+        source = await self.source_repo.get_by_id(source_id)
+        if source is None:
+            raise ValueError(f"Source '{source_id}' not found")
+
+        # Get or create profile
+        profile = await self.profile_repo.get_latest_for_source(source_id)
+
+        if profile is None:
+            if not profile_if_needed:
+                raise ValueError(
+                    f"No profile found for source '{source_id}'. "
+                    "Run profile_source() first or set profile_if_needed=True."
+                )
+            # Create profile
+            profile = await self.profile_source(
+                source_id,
+                sample_size=sample_size,
+                include_patterns=True,
+                save=True,
+            )
+
+        # Generate rules from profile
+        result = await self.adapter.generate_suite(
+            profile.profile_json,
+            strictness=strictness,
+            preset=preset,
+            include=include_categories,
+            exclude=exclude_categories,
+        )
+
+        return {
+            "source_id": source_id,
+            "profile_id": str(profile.id) if profile.id else None,
+            "rules": result.rules,
+            "rule_count": result.rule_count,
+            "categories": result.categories,
+            "strictness": result.strictness,
+            "yaml_content": result.yaml_content,
+            "json_content": result.json_content,
+        }
+
+    async def get(self, profile_id: str) -> Profile | None:
+        """Get a profile by ID.
+
+        Args:
+            profile_id: Profile ID.
+
+        Returns:
+            Profile or None.
+        """
+        return await self.profile_repo.get_by_id(profile_id)
+
+    async def get_latest(self, source_id: str) -> Profile | None:
+        """Get the latest profile for a source.
+
+        Args:
+            source_id: Source ID.
+
+        Returns:
+            Latest profile or None.
+        """
+        return await self.profile_repo.get_latest_for_source(source_id)
+
     async def get_latest_profile(self, source_id: str) -> Profile | None:
         """Get the latest profile for a source.
 
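generate_rules_from_profile() turns the latest stored profile into a rule suite via the adapter's generate_suite(). A minimal sketch, using strictness/preset values listed in the docstring and the keys of the returned dictionary (the service variable and persistence step are assumptions):

    result = await profile_service.generate_rules_from_profile(
        source_id,
        strictness="strict",
        preset="ci_cd",
        profile_if_needed=True,
    )
    print(result["rule_count"], result["categories"])
    rules_yaml = result["yaml_content"]  # persist or review as needed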
@@ -1258,6 +1599,157 @@ class ProfileService:
         """
         return await self.profile_repo.get_for_source(source_id, limit=limit)
 
+    async def compare_profiles(
+        self,
+        source_id: str,
+        profile_id_1: str | None = None,
+        profile_id_2: str | None = None,
+    ) -> dict[str, Any]:
+        """Compare two profiles for the same source.
+
+        Useful for detecting schema evolution and data drift over time.
+
+        Args:
+            source_id: Source ID.
+            profile_id_1: First profile ID (None = second-latest).
+            profile_id_2: Second profile ID (None = latest).
+
+        Returns:
+            Comparison result with changes and drift indicators.
+
+        Raises:
+            ValueError: If not enough profiles exist.
+        """
+        profiles = await self.profile_repo.get_for_source(source_id, limit=10)
+
+        if len(profiles) < 2:
+            raise ValueError(
+                f"Need at least 2 profiles to compare. Source '{source_id}' has {len(profiles)}."
+            )
+
+        # Get profiles to compare
+        if profile_id_2 is None:
+            profile_2 = profiles[0]  # Latest
+        else:
+            profile_2 = await self.profile_repo.get_by_id(profile_id_2)
+            if profile_2 is None:
+                raise ValueError(f"Profile '{profile_id_2}' not found")
+
+        if profile_id_1 is None:
+            profile_1 = profiles[1]  # Second-latest
+        else:
+            profile_1 = await self.profile_repo.get_by_id(profile_id_1)
+            if profile_1 is None:
+                raise ValueError(f"Profile '{profile_id_1}' not found")
+
+        # Compare profiles
+        return self._compare_profile_data(
+            profile_1.profile_json,
+            profile_2.profile_json,
+            profile_1_id=str(profile_1.id),
+            profile_2_id=str(profile_2.id),
+        )
+
+    def _compare_profile_data(
+        self,
+        profile_1: dict[str, Any],
+        profile_2: dict[str, Any],
+        profile_1_id: str,
+        profile_2_id: str,
+    ) -> dict[str, Any]:
+        """Compare two profile data dictionaries.
+
+        Args:
+            profile_1: Older profile data.
+            profile_2: Newer profile data.
+            profile_1_id: Older profile ID.
+            profile_2_id: Newer profile ID.
+
+        Returns:
+            Comparison result.
+        """
+        changes = []
+        column_diffs = []
+
+        # Extract column data
+        cols_1 = {c["name"]: c for c in profile_1.get("columns", [])}
+        cols_2 = {c["name"]: c for c in profile_2.get("columns", [])}
+
+        # Detect added/removed columns
+        added_cols = set(cols_2.keys()) - set(cols_1.keys())
+        removed_cols = set(cols_1.keys()) - set(cols_2.keys())
+        common_cols = set(cols_1.keys()) & set(cols_2.keys())
+
+        for col in added_cols:
+            changes.append({
+                "type": "column_added",
+                "column": col,
+                "details": cols_2[col],
+            })
+
+        for col in removed_cols:
+            changes.append({
+                "type": "column_removed",
+                "column": col,
+                "details": cols_1[col],
+            })
+
+        # Compare common columns
+        for col in common_cols:
+            col_1 = cols_1[col]
+            col_2 = cols_2[col]
+            col_changes = []
+
+            # Type change
+            if col_1.get("inferred_type") != col_2.get("inferred_type"):
+                col_changes.append({
+                    "field": "inferred_type",
+                    "old": col_1.get("inferred_type"),
+                    "new": col_2.get("inferred_type"),
+                })
+
+            # Null ratio change
+            old_null = col_1.get("null_ratio", 0)
+            new_null = col_2.get("null_ratio", 0)
+            if abs(old_null - new_null) > 0.05:  # 5% threshold
+                col_changes.append({
+                    "field": "null_ratio",
+                    "old": old_null,
+                    "new": new_null,
+                    "change": new_null - old_null,
+                })
+
+            # Unique ratio change
+            old_unique = col_1.get("unique_ratio", 0)
+            new_unique = col_2.get("unique_ratio", 0)
+            if abs(old_unique - new_unique) > 0.1:  # 10% threshold
+                col_changes.append({
+                    "field": "unique_ratio",
+                    "old": old_unique,
+                    "new": new_unique,
+                    "change": new_unique - old_unique,
+                })
+
+            if col_changes:
+                column_diffs.append({
+                    "column": col,
+                    "changes": col_changes,
+                })
+
+        return {
+            "profile_1_id": profile_1_id,
+            "profile_2_id": profile_2_id,
+            "row_count_change": profile_2.get("row_count", 0) - profile_1.get("row_count", 0),
+            "column_count_change": profile_2.get("column_count", 0) - profile_1.get("column_count", 0),
+            "added_columns": list(added_cols),
+            "removed_columns": list(removed_cols),
+            "schema_changes": changes,
+            "column_diffs": column_diffs,
+            "has_breaking_changes": len(removed_cols) > 0 or any(
+                c.get("field") == "inferred_type" for cd in column_diffs for c in cd.get("changes", [])
+            ),
+        }
+
 
 class HistoryService:
     """Service for validation history and analytics.
@@ -1399,6 +1891,7 @@ class DriftService:
     """Service for drift detection.
 
     Handles drift comparison between datasets.
+    Supports all data source types through DataSource abstraction.
     """
 
     def __init__(self, session: AsyncSession) -> None:
@@ -1420,12 +1913,14 @@ class DriftService:
         columns: list[str] | None = None,
         method: str = "auto",
         threshold: float | None = None,
-        correction: str | None = None,
         sample_size: int | None = None,
         save: bool = True,
     ) -> DriftComparison:
         """Compare two datasets for drift detection.
 
+        Supports comparing data from various source types including files,
+        SQL databases, cloud data warehouses, and async sources.
+
         Args:
             baseline_source_id: Baseline source ID.
             current_source_id: Current source ID.
@@ -1433,7 +1928,6 @@ class DriftService:
             method: Detection method. Supported:
                 auto, ks, psi, chi2, js, kl, wasserstein, cvm, anderson
             threshold: Optional custom threshold.
-            correction: Multiple testing correction (none, bonferroni, holm, bh).
             sample_size: Optional sample size.
             save: Whether to save comparison to database.
 
@@ -1441,7 +1935,7 @@ class DriftService:
             DriftComparison model with results.
 
         Raises:
-            ValueError: If source not found.
+            ValueError: If source not found or data source creation fails.
         """
         baseline = await self.source_repo.get_by_id(baseline_source_id)
         if baseline is None:
@@ -1451,13 +1945,23 @@ class DriftService:
         if current is None:
             raise ValueError(f"Current source '{current_source_id}' not found")
 
+        # Get data inputs based on source types
+        if SourceType.is_async_type(baseline.type):
+            baseline_input = await get_async_data_input_from_source(baseline)
+        else:
+            baseline_input = get_data_input_from_source(baseline)
+
+        if SourceType.is_async_type(current.type):
+            current_input = await get_async_data_input_from_source(current)
+        else:
+            current_input = get_data_input_from_source(current)
+
         result = await self.adapter.compare(
-            baseline.source_path or "",
-            current.source_path or "",
+            baseline_input,
+            current_input,
             columns=columns,
             method=method,
             threshold=threshold,
-            correction=correction,
             sample_size=sample_size,
         )
 
@@ -1465,7 +1969,6 @@ class DriftService:
             "columns": columns,
             "method": method,
             "threshold": threshold,
-            "correction": correction,
             "sample_size": sample_size,
         }
 
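Drift comparison drops the correction parameter and now accepts DataSource-backed inputs on both sides. A hedged sketch of a call; the DriftService method name is not visible in these hunks, so compare_sources() is an assumption, while the keyword arguments are the ones documented above:

    comparison = await drift_service.compare_sources(
        baseline_source_id=baseline_id,
        current_source_id=current_id,
        method="psi",          # one of: auto, ks, psi, chi2, js, kl, wasserstein, cvm, anderson
        threshold=0.2,
        sample_size=50_000,
        save=True,
    )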
@@ -1552,6 +2055,8 @@ class ScheduleService:
         *,
         name: str,
         cron_expression: str,
+        trigger_type: str = "cron",
+        trigger_config: dict[str, Any] | None = None,
         notify_on_failure: bool = True,
         config: dict[str, Any] | None = None,
     ) -> Schedule:
@@ -1581,6 +2086,8 @@ class ScheduleService:
             name=name,
             source_id=source_id,
             cron_expression=cron_expression,
+            trigger_type=trigger_type,
+            trigger_config=trigger_config,
             is_active=True,
             notify_on_failure=notify_on_failure,
             next_run_at=next_run,
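Schedules gain trigger_type and trigger_config alongside the existing cron_expression. A tentative sketch of creating a cron-style schedule; the service method name and positional source_id are assumptions, and the accepted trigger_config keys are not shown in this diff, so it is left as None:

    schedule = await schedule_service.create(
        source_id,
        name="nightly-orders-validation",
        cron_expression="0 2 * * *",
        trigger_type="cron",      # default
        trigger_config=None,
        notify_on_failure=True,
    )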
@@ -1782,6 +2289,7 @@ class PIIScanService:
     """Service for PII scanning operations.
 
     Handles PII detection and regulation compliance checking using th.scan().
+    Supports all data source types through DataSource abstraction.
     """
 
     def __init__(self, session: AsyncSession) -> None:
@@ -1795,31 +2303,23 @@ class PIIScanService:
         self.scan_repo = PIIScanRepository(session)
         self.adapter = get_adapter()
 
-    async def run_scan(
-        self,
-        source_id: str,
-        *,
-        columns: list[str] | None = None,
-        regulations: list[str] | None = None,
-        min_confidence: float = 0.8,
-    ) -> PIIScan:
+    async def run_scan(self, source_id: str) -> PIIScan:
         """Run PII scan on a source.
 
-        This method provides access to truthound's th.scan() parameters,
-        allowing detection of personally identifiable information and
-        checking compliance with privacy regulations.
+        Note: truthound's th.scan() does not support configuration parameters.
+        The scan runs on all columns with default settings.
+
+        Supports all data source types including files, SQL databases,
+        cloud data warehouses, and async sources.
 
         Args:
             source_id: Source ID to scan.
-            columns: Optional columns to scan. If None, scans all columns.
-            regulations: Optional regulations to check (gdpr, ccpa, lgpd).
-            min_confidence: Minimum confidence threshold (0.0-1.0). Default 0.8.
 
         Returns:
             PIIScan record with results.
 
         Raises:
-            ValueError: If source not found.
+            ValueError: If source not found or data source creation fails.
         """
         # Get source
         source = await self.source_repo.get_by_id(source_id)
@@ -1830,19 +2330,18 @@ class PIIScanService:
         scan = await self.scan_repo.create(
             source_id=source_id,
             status="running",
-            min_confidence=min_confidence,
-            regulations_checked=regulations,
             started_at=datetime.utcnow(),
         )
 
         try:
-            # Run scan
-            result = await self.adapter.scan(
-                source.source_path or "",
-                columns=columns,
-                regulations=regulations,
-                min_confidence=min_confidence,
-            )
+            # Get data input based on source type
+            if SourceType.is_async_type(source.type):
+                data_input = await get_async_data_input_from_source(source)
+            else:
+                data_input = get_data_input_from_source(source)
+
+            # Run scan - truthound's th.scan() does not support parameters
+            result = await self.adapter.scan(data_input)
 
             # Update scan with results
             await self._update_scan_success(scan, result)
@@ -1972,6 +2471,8 @@ class MaskService:
     - redact: Replace values with asterisks
     - hash: Replace values with SHA256 hash (deterministic)
     - fake: Replace values with realistic fake data
+
+    Supports all data source types through DataSource abstraction.
     """
 
     def __init__(self, session: AsyncSession) -> None:
@@ -1991,18 +2492,22 @@ class MaskService:
         *,
         columns: list[str] | None = None,
         strategy: str = "redact",
-        output_format: str = "csv",
     ) -> DataMask:
         """Run data masking on a source.
 
         This method provides access to truthound's th.mask() with
         three masking strategies for PII protection.
 
+        Supports all data source types including files, SQL databases,
+        cloud data warehouses, and async sources.
+
+        Note: output_format parameter was removed as truthound's th.mask()
+        does not support this parameter. Output is always CSV format.
+
         Args:
             source_id: Source ID to mask.
             columns: Optional columns to mask. If None, auto-detects PII.
             strategy: Masking strategy (redact, hash, fake). Default is redact.
-            output_format: Output file format (csv, parquet, json). Default is csv.
 
         Returns:
             DataMask record with results.
@@ -2010,6 +2515,9 @@ class MaskService:
         Raises:
             ValueError: If source not found or invalid strategy.
         """
+        from pathlib import Path
+        import tempfile
+
         # Validate strategy
         if strategy not in ("redact", "hash", "fake"):
             raise ValueError(
@@ -2022,14 +2530,19 @@ class MaskService:
             raise ValueError(f"Source '{source_id}' not found")
 
         # Determine output path
-        source_path = source.source_path or ""
-        import os
-        from pathlib import Path
+        # For file sources, use the same directory structure
+        # For other sources, use a temp directory or configured output directory
+        if SourceType.is_file_type(source.type):
+            source_path = source.source_path or source.config.get("path", "")
+            base_path = Path(source_path)
+            output_dir = base_path.parent / "masked"
+        else:
+            # For non-file sources, use a temp directory
+            output_dir = Path(tempfile.gettempdir()) / "truthound_masked"
 
-        base_path = Path(source_path)
-        output_dir = base_path.parent / "masked"
         output_dir.mkdir(exist_ok=True)
-        output_filename = f"{base_path.stem}_masked_{strategy}.{output_format}"
+        # Output format is always CSV as truthound's th.mask() does not support format selection
+        output_filename = f"{source.name}_masked_{strategy}.csv"
         output_path = str(output_dir / output_filename)
 
         # Create mask record
@@ -2042,9 +2555,15 @@ class MaskService:
         )
 
        try:
+            # Get data input based on source type
+            if SourceType.is_async_type(source.type):
+                data_input = await get_async_data_input_from_source(source)
+            else:
+                data_input = get_data_input_from_source(source)
+
            # Run masking
            result = await self.adapter.mask(
-                source_path,
+                data_input,
                output_path,
                columns=columns,
                strategy=strategy,
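The diff is truncated here, but the masking flow is already clear: resolve the source to a data input, then call the adapter with an output path, optional columns, and a strategy. A hedged usage sketch; the method name run_mask is inferred from the docstring wording and not shown verbatim in these hunks:

    mask = await mask_service.run_mask(
        source_id,
        columns=None,        # None = auto-detect PII columns
        strategy="hash",     # one of: redact, hash, fake; output is always CSV
    )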