truthound-dashboard 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound_dashboard/__init__.py +11 -0
- truthound_dashboard/__main__.py +6 -0
- truthound_dashboard/api/__init__.py +15 -0
- truthound_dashboard/api/deps.py +153 -0
- truthound_dashboard/api/drift.py +179 -0
- truthound_dashboard/api/error_handlers.py +287 -0
- truthound_dashboard/api/health.py +78 -0
- truthound_dashboard/api/history.py +62 -0
- truthound_dashboard/api/middleware.py +626 -0
- truthound_dashboard/api/notifications.py +561 -0
- truthound_dashboard/api/profile.py +52 -0
- truthound_dashboard/api/router.py +83 -0
- truthound_dashboard/api/rules.py +277 -0
- truthound_dashboard/api/schedules.py +329 -0
- truthound_dashboard/api/schemas.py +136 -0
- truthound_dashboard/api/sources.py +229 -0
- truthound_dashboard/api/validations.py +125 -0
- truthound_dashboard/cli.py +226 -0
- truthound_dashboard/config.py +132 -0
- truthound_dashboard/core/__init__.py +264 -0
- truthound_dashboard/core/base.py +185 -0
- truthound_dashboard/core/cache.py +479 -0
- truthound_dashboard/core/connections.py +331 -0
- truthound_dashboard/core/encryption.py +409 -0
- truthound_dashboard/core/exceptions.py +627 -0
- truthound_dashboard/core/logging.py +488 -0
- truthound_dashboard/core/maintenance.py +542 -0
- truthound_dashboard/core/notifications/__init__.py +56 -0
- truthound_dashboard/core/notifications/base.py +390 -0
- truthound_dashboard/core/notifications/channels.py +557 -0
- truthound_dashboard/core/notifications/dispatcher.py +453 -0
- truthound_dashboard/core/notifications/events.py +155 -0
- truthound_dashboard/core/notifications/service.py +744 -0
- truthound_dashboard/core/sampling.py +626 -0
- truthound_dashboard/core/scheduler.py +311 -0
- truthound_dashboard/core/services.py +1531 -0
- truthound_dashboard/core/truthound_adapter.py +659 -0
- truthound_dashboard/db/__init__.py +67 -0
- truthound_dashboard/db/base.py +108 -0
- truthound_dashboard/db/database.py +196 -0
- truthound_dashboard/db/models.py +732 -0
- truthound_dashboard/db/repository.py +237 -0
- truthound_dashboard/main.py +309 -0
- truthound_dashboard/schemas/__init__.py +150 -0
- truthound_dashboard/schemas/base.py +96 -0
- truthound_dashboard/schemas/drift.py +118 -0
- truthound_dashboard/schemas/history.py +74 -0
- truthound_dashboard/schemas/profile.py +91 -0
- truthound_dashboard/schemas/rule.py +199 -0
- truthound_dashboard/schemas/schedule.py +88 -0
- truthound_dashboard/schemas/schema.py +121 -0
- truthound_dashboard/schemas/source.py +138 -0
- truthound_dashboard/schemas/validation.py +192 -0
- truthound_dashboard/static/assets/index-BqJMyAHX.js +110 -0
- truthound_dashboard/static/assets/index-DMDxHCTs.js +465 -0
- truthound_dashboard/static/assets/index-Dm2D11TK.css +1 -0
- truthound_dashboard/static/index.html +15 -0
- truthound_dashboard/static/mockServiceWorker.js +349 -0
- truthound_dashboard-1.0.0.dist-info/METADATA +218 -0
- truthound_dashboard-1.0.0.dist-info/RECORD +62 -0
- truthound_dashboard-1.0.0.dist-info/WHEEL +4 -0
- truthound_dashboard-1.0.0.dist-info/entry_points.txt +5 -0
|
@@ -0,0 +1,1531 @@
|
|
|
1
|
+
"""Business logic services.
|
|
2
|
+
|
|
3
|
+
This module contains service classes that implement business logic
|
|
4
|
+
for the dashboard, separating concerns from API handlers.
|
|
5
|
+
|
|
6
|
+
Services handle:
|
|
7
|
+
- Data source management
|
|
8
|
+
- Schema learning and storage
|
|
9
|
+
- Validation execution and tracking
|
|
10
|
+
- Data profiling with history
|
|
11
|
+
- Drift detection
|
|
12
|
+
- Schedule management
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from collections import Counter, defaultdict
|
|
18
|
+
from collections.abc import Sequence
|
|
19
|
+
from datetime import datetime, timedelta
|
|
20
|
+
from typing import Any, Literal
|
|
21
|
+
|
|
22
|
+
from sqlalchemy import and_, select
|
|
23
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
24
|
+
|
|
25
|
+
from truthound_dashboard.db import (
|
|
26
|
+
BaseRepository,
|
|
27
|
+
DriftComparison,
|
|
28
|
+
Profile,
|
|
29
|
+
Rule,
|
|
30
|
+
Schedule,
|
|
31
|
+
Schema,
|
|
32
|
+
Source,
|
|
33
|
+
Validation,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
from .truthound_adapter import (
|
|
37
|
+
CheckResult,
|
|
38
|
+
get_adapter,
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class SourceRepository(BaseRepository[Source]):
    """Data-access layer for ``Source`` records."""

    model = Source

    async def get_active(
        self,
        *,
        offset: int = 0,
        limit: int = 100,
    ) -> Sequence[Source]:
        """Return only sources whose ``is_active`` flag is set.

        Args:
            offset: Number of rows to skip.
            limit: Maximum number of rows to return.

        Returns:
            Sequence of active sources.
        """
        return await self.list(offset=offset, limit=limit, filters=[Source.is_active])

    async def get_by_name(self, name: str) -> Source | None:
        """Look up a source by its name.

        Args:
            name: Source name to find.

        Returns:
            The matching source, or ``None`` when absent.
        """
        stmt = select(Source).where(Source.name == name)
        res = await self.session.execute(stmt)
        return res.scalar_one_or_none()
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class SchemaRepository(BaseRepository[Schema]):
    """Data-access layer for ``Schema`` records."""

    model = Schema

    async def get_active_for_source(self, source_id: str) -> Schema | None:
        """Return the newest active schema attached to a source.

        Args:
            source_id: Source ID.

        Returns:
            The active schema, or ``None`` when the source has none.
        """
        stmt = (
            select(Schema)
            .where(Schema.source_id == source_id, Schema.is_active)
            .order_by(Schema.created_at.desc())
            .limit(1)
        )
        res = await self.session.execute(stmt)
        return res.scalar_one_or_none()

    async def deactivate_for_source(self, source_id: str) -> None:
        """Mark every active schema of a source as inactive.

        Args:
            source_id: Source ID.
        """
        stmt = select(Schema).where(Schema.source_id == source_id, Schema.is_active)
        res = await self.session.execute(stmt)
        for stale in res.scalars().all():
            stale.is_active = False
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class RuleRepository(BaseRepository[Rule]):
    """Data-access layer for ``Rule`` records."""

    model = Rule

    async def get_for_source(
        self,
        source_id: str,
        *,
        limit: int = 50,
        active_only: bool = False,
    ) -> Sequence[Rule]:
        """Fetch rules belonging to a source, newest first.

        Args:
            source_id: Source ID.
            limit: Maximum number of rules to return.
            active_only: When True, restrict results to active rules.

        Returns:
            Sequence of rules.
        """
        conditions = [Rule.source_id == source_id]
        if active_only:
            conditions.append(Rule.is_active)
        return await self.list(
            limit=limit,
            filters=conditions,
            order_by=Rule.created_at.desc(),
        )

    async def get_active_for_source(self, source_id: str) -> Rule | None:
        """Fetch the newest active rule for a source.

        Args:
            source_id: Source ID.

        Returns:
            The active rule, or ``None`` when the source has none.
        """
        stmt = (
            select(Rule)
            .where(Rule.source_id == source_id, Rule.is_active)
            .order_by(Rule.created_at.desc())
            .limit(1)
        )
        res = await self.session.execute(stmt)
        return res.scalar_one_or_none()

    async def deactivate_for_source(self, source_id: str) -> int:
        """Mark every active rule of a source as inactive.

        Args:
            source_id: Source ID.

        Returns:
            Number of rules that were deactivated.
        """
        stmt = select(Rule).where(Rule.source_id == source_id, Rule.is_active)
        res = await self.session.execute(stmt)
        targets = res.scalars().all()
        for target in targets:
            target.is_active = False
        return len(targets)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
class ValidationRepository(BaseRepository[Validation]):
    """Data-access layer for ``Validation`` records."""

    model = Validation

    async def get_for_source(
        self,
        source_id: str,
        *,
        limit: int = 20,
    ) -> Sequence[Validation]:
        """Fetch validations belonging to a source, newest first.

        Args:
            source_id: Source ID.
            limit: Maximum number of validations to return.

        Returns:
            Sequence of validations.
        """
        return await self.list(
            limit=limit,
            filters=[Validation.source_id == source_id],
            order_by=Validation.created_at.desc(),
        )

    async def get_latest_for_source(self, source_id: str) -> Validation | None:
        """Fetch the single most recent validation for a source.

        Args:
            source_id: Source ID.

        Returns:
            The latest validation, or ``None`` when none exist.
        """
        stmt = (
            select(Validation)
            .where(Validation.source_id == source_id)
            .order_by(Validation.created_at.desc())
            .limit(1)
        )
        res = await self.session.execute(stmt)
        return res.scalar_one_or_none()
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
class SourceService:
    """Service layer for data-source management.

    Wraps repository access for sources plus their schemas and validations,
    keeping business logic out of the API handlers.
    """

    def __init__(self, session: AsyncSession) -> None:
        """Set up the service and its repositories.

        Args:
            session: Database session.
        """
        self.session = session
        self.repository = SourceRepository(session)
        self.schema_repo = SchemaRepository(session)
        self.validation_repo = ValidationRepository(session)

    async def get_by_id(self, id: str) -> Source | None:
        """Fetch a single source by its ID."""
        return await self.repository.get_by_id(id)

    async def list(
        self,
        *,
        offset: int = 0,
        limit: int = 100,
        active_only: bool = True,
    ) -> Sequence[Source]:
        """List sources, optionally restricted to active ones.

        Args:
            offset: Number of rows to skip.
            limit: Maximum number of rows to return.
            active_only: When True (default), return only active sources.

        Returns:
            Sequence of sources.
        """
        if not active_only:
            return await self.repository.list(offset=offset, limit=limit)
        return await self.repository.get_active(offset=offset, limit=limit)

    async def create(
        self,
        *,
        name: str,
        type: str,
        config: dict[str, Any],
        description: str | None = None,
    ) -> Source:
        """Create a new source.

        Args:
            name: Source name.
            type: Source type (file, postgresql, etc.).
            config: Source configuration.
            description: Optional description.

        Returns:
            The created source.
        """
        return await self.repository.create(
            name=name,
            type=type,
            config=config,
            description=description,
        )

    async def update(
        self,
        id: str,
        *,
        name: str | None = None,
        config: dict[str, Any] | None = None,
        description: str | None = None,
        is_active: bool | None = None,
    ) -> Source | None:
        """Apply a partial update to a source.

        Only arguments that are not ``None`` are applied; passing nothing
        returns the source unchanged.

        Args:
            id: Source ID.
            name: New name.
            config: New config.
            description: New description.
            is_active: New active status.

        Returns:
            The updated source, or ``None`` when not found.
        """
        changes = {
            field: value
            for field, value in {
                "name": name,
                "config": config,
                "description": description,
                "is_active": is_active,
            }.items()
            if value is not None
        }

        if not changes:
            # Nothing to change; return current state.
            return await self.repository.get_by_id(id)

        return await self.repository.update(id, **changes)

    async def delete(self, id: str) -> bool:
        """Delete a source and related data.

        Args:
            id: Source ID.

        Returns:
            True if a record was deleted.
        """
        return await self.repository.delete(id)

    async def get_schema(self, source_id: str) -> Schema | None:
        """Return the active schema for a source, if any.

        Args:
            source_id: Source ID.

        Returns:
            Active schema or ``None``.
        """
        return await self.schema_repo.get_active_for_source(source_id)

    async def get_validations(
        self,
        source_id: str,
        *,
        limit: int = 20,
    ) -> Sequence[Validation]:
        """Return recent validations for a source.

        Args:
            source_id: Source ID.
            limit: Maximum number of validations to return.

        Returns:
            Sequence of validations.
        """
        return await self.validation_repo.get_for_source(source_id, limit=limit)
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
class ValidationService:
    """Service for running and managing validations.

    Handles validation execution, result storage, and history. A validation
    record is created in ``running`` state before the check executes, then
    updated in place with either the results or an error message.
    """

    def __init__(self, session: AsyncSession) -> None:
        """Initialize service.

        Args:
            session: Database session.
        """
        self.session = session
        self.source_repo = SourceRepository(session)
        self.schema_repo = SchemaRepository(session)
        self.validation_repo = ValidationRepository(session)
        # Adapter bridging to the underlying truthound validation engine.
        self.adapter = get_adapter()

    async def run_validation(
        self,
        source_id: str,
        *,
        validators: list[str] | None = None,
        schema_path: str | None = None,
        auto_schema: bool = False,
    ) -> Validation:
        """Run validation on a source.

        The validation record is created up-front with ``status="running"``
        so in-flight runs are visible; it is finalized (success, failed, or
        error) before this method returns.

        Args:
            source_id: Source ID to validate.
            validators: Optional validator list.
            schema_path: Optional schema file path.
            auto_schema: Auto-learn schema if True.

        Returns:
            Validation record with results.

        Raises:
            ValueError: If source not found.
        """
        # Get source
        source = await self.source_repo.get_by_id(source_id)
        if source is None:
            raise ValueError(f"Source '{source_id}' not found")

        # Create validation record
        validation = await self.validation_repo.create(
            source_id=source_id,
            status="running",
            # NOTE(review): datetime.utcnow() is naive and deprecated in
            # Python 3.12+; consider datetime.now(UTC) — verify the column
            # type expects naive timestamps before changing.
            started_at=datetime.utcnow(),
        )

        try:
            # Run validation
            result = await self.adapter.check(
                source.source_path or "",
                validators=validators,
                schema=schema_path,
                auto_schema=auto_schema,
            )

            # Update validation with results
            await self._update_validation_success(validation, result)

            # Update source last validated
            source.last_validated_at = datetime.utcnow()

        except Exception as e:
            # Deliberately broad: any adapter failure is captured on the
            # validation record rather than propagated, so the run is
            # recorded as errored instead of crashing the caller.
            validation.mark_error(str(e))

        # Flush + refresh so the returned object reflects persisted state.
        await self.session.flush()
        await self.session.refresh(validation)
        return validation

    async def _update_validation_success(
        self,
        validation: Validation,
        result: CheckResult,
    ) -> None:
        """Copy a completed check result onto the validation record.

        Args:
            validation: Validation record to update.
            result: Check result from adapter.
        """
        # "success" means the check ran AND passed; a completed run with
        # failing checks is recorded as "failed".
        validation.status = "success" if result.passed else "failed"
        validation.passed = result.passed
        validation.has_critical = result.has_critical
        validation.has_high = result.has_high
        validation.total_issues = result.total_issues
        validation.critical_issues = result.critical_issues
        validation.high_issues = result.high_issues
        validation.medium_issues = result.medium_issues
        validation.low_issues = result.low_issues
        validation.row_count = result.row_count
        validation.column_count = result.column_count
        validation.result_json = result.to_dict()
        validation.completed_at = datetime.utcnow()

        # Derive wall-clock duration when the start timestamp is present.
        if validation.started_at:
            delta = validation.completed_at - validation.started_at
            validation.duration_ms = int(delta.total_seconds() * 1000)

    async def get_validation(self, validation_id: str) -> Validation | None:
        """Get validation by ID.

        Args:
            validation_id: Validation ID.

        Returns:
            Validation or None.
        """
        return await self.validation_repo.get_by_id(validation_id)

    async def list_for_source(
        self,
        source_id: str,
        *,
        limit: int = 20,
    ) -> Sequence[Validation]:
        """List validations for a source.

        Args:
            source_id: Source ID.
            limit: Maximum to return.

        Returns:
            Sequence of validations.
        """
        return await self.validation_repo.get_for_source(source_id, limit=limit)
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
class SchemaService:
    """Service layer for learning and maintaining source schemas."""

    def __init__(self, session: AsyncSession) -> None:
        """Set up the service and its repositories.

        Args:
            session: Database session.
        """
        self.session = session
        self.source_repo = SourceRepository(session)
        self.schema_repo = SchemaRepository(session)
        self.adapter = get_adapter()

    async def learn_schema(
        self,
        source_id: str,
        *,
        infer_constraints: bool = True,
    ) -> Schema:
        """Learn a fresh schema for a source and make it the active one.

        Any previously active schemas for the source are deactivated.

        Args:
            source_id: Source ID.
            infer_constraints: Infer constraints from data.

        Returns:
            The newly created schema record.

        Raises:
            ValueError: If source not found.
        """
        source = await self.source_repo.get_by_id(source_id)
        if source is None:
            raise ValueError(f"Source '{source_id}' not found")

        # Ask the adapter to profile the data and derive a schema.
        learned = await self.adapter.learn(
            source.source_path or "",
            infer_constraints=infer_constraints,
        )

        # Retire any previously active schemas before installing the new one.
        await self.schema_repo.deactivate_for_source(source_id)

        return await self.schema_repo.create(
            source_id=source_id,
            schema_yaml=learned.schema_yaml,
            schema_json=learned.schema,
            row_count=learned.row_count,
            column_count=learned.column_count,
            is_active=True,
        )

    async def get_schema(self, source_id: str) -> Schema | None:
        """Return the active schema for a source, if any.

        Args:
            source_id: Source ID.

        Returns:
            Active schema or ``None``.
        """
        return await self.schema_repo.get_active_for_source(source_id)

    async def update_schema(
        self,
        source_id: str,
        schema_yaml: str,
    ) -> Schema | None:
        """Replace the YAML of a source's active schema.

        Args:
            source_id: Source ID.
            schema_yaml: New schema YAML.

        Returns:
            Updated schema, or ``None`` when no active schema exists.
        """
        import yaml

        schema = await self.schema_repo.get_active_for_source(source_id)
        if schema is None:
            return None

        # Best-effort parse: the raw YAML is stored even when it fails to
        # parse, in which case the JSON mirror is cleared.
        try:
            parsed = yaml.safe_load(schema_yaml)
        except yaml.YAMLError:
            parsed = None

        schema.schema_yaml = schema_yaml
        schema.schema_json = parsed

        await self.session.flush()
        await self.session.refresh(schema)
        return schema
|
|
611
|
+
|
|
612
|
+
|
|
613
|
+
class RuleService:
    """Service for managing custom validation rules.

    Handles rule CRUD operations and YAML parsing. At most one rule per
    source is active at a time; activation helpers enforce that invariant.
    """

    def __init__(self, session: AsyncSession) -> None:
        """Initialize service.

        Args:
            session: Database session.
        """
        self.session = session
        self.source_repo = SourceRepository(session)
        self.rule_repo = RuleRepository(session)

    async def get_rule(self, rule_id: str) -> Rule | None:
        """Get rule by ID.

        Args:
            rule_id: Rule ID.

        Returns:
            Rule or None.
        """
        return await self.rule_repo.get_by_id(rule_id)

    async def get_rules_for_source(
        self,
        source_id: str,
        *,
        limit: int = 50,
        active_only: bool = False,
    ) -> Sequence[Rule]:
        """Get all rules for a source.

        Args:
            source_id: Source ID.
            limit: Maximum to return.
            active_only: Only return active rules.

        Returns:
            Sequence of rules.
        """
        return await self.rule_repo.get_for_source(
            source_id,
            limit=limit,
            active_only=active_only,
        )

    async def get_active_rule(self, source_id: str) -> Rule | None:
        """Get the active rule for a source.

        Args:
            source_id: Source ID.

        Returns:
            Active rule or None.
        """
        return await self.rule_repo.get_active_for_source(source_id)

    async def create_rule(
        self,
        source_id: str,
        *,
        rules_yaml: str,
        name: str = "Default Rules",
        description: str | None = None,
        version: str | None = None,
        activate: bool = True,
    ) -> Rule:
        """Create a new rule for a source.

        Args:
            source_id: Source ID.
            rules_yaml: YAML content defining rules.
            name: Human-readable name.
            description: Optional description.
            version: Optional version string.
            activate: Whether to make this the active rule.

        Returns:
            Created rule.

        Raises:
            ValueError: If source not found or YAML is invalid.
        """
        import yaml

        # Verify source exists before doing any work.
        source = await self.source_repo.get_by_id(source_id)
        if source is None:
            raise ValueError(f"Source '{source_id}' not found")

        # Parse YAML into the JSON mirror stored alongside the raw text.
        try:
            rules_json = yaml.safe_load(rules_yaml)
        except yaml.YAMLError as e:
            # Chain explicitly so the original parse error is preserved as
            # the cause (PEP 3134 / ruff B904).
            raise ValueError(f"Invalid YAML: {e}") from e

        # Deactivate existing rules if this one becomes the active rule,
        # preserving the one-active-rule-per-source invariant.
        if activate:
            await self.rule_repo.deactivate_for_source(source_id)

        return await self.rule_repo.create(
            source_id=source_id,
            name=name,
            description=description,
            rules_yaml=rules_yaml,
            rules_json=rules_json,
            is_active=activate,
            version=version,
        )

    async def update_rule(
        self,
        rule_id: str,
        *,
        name: str | None = None,
        description: str | None = None,
        rules_yaml: str | None = None,
        version: str | None = None,
        is_active: bool | None = None,
    ) -> Rule | None:
        """Update an existing rule.

        Only arguments that are not ``None`` are applied.

        Args:
            rule_id: Rule ID.
            name: New name.
            description: New description.
            rules_yaml: New YAML content.
            version: New version.
            is_active: New active status.

        Returns:
            Updated rule or None if not found.

        Raises:
            ValueError: If YAML is invalid.
        """
        import yaml

        rule = await self.rule_repo.get_by_id(rule_id)
        if rule is None:
            return None

        # Update simple fields.
        if name is not None:
            rule.name = name
        if description is not None:
            rule.description = description
        if version is not None:
            rule.version = version

        # Update YAML and its parsed JSON mirror; validate before mutating
        # so an invalid payload leaves the rule untouched.
        if rules_yaml is not None:
            try:
                rules_json = yaml.safe_load(rules_yaml)
            except yaml.YAMLError as e:
                # Explicit cause chaining (PEP 3134 / ruff B904).
                raise ValueError(f"Invalid YAML: {e}") from e
            rule.rules_yaml = rules_yaml
            rule.rules_json = rules_json

        # Handle activation: activating this rule deactivates its siblings.
        if is_active is not None:
            if is_active and not rule.is_active:
                # Deactivate other rules when activating this one.
                await self.rule_repo.deactivate_for_source(rule.source_id)
            rule.is_active = is_active

        await self.session.flush()
        await self.session.refresh(rule)
        return rule

    async def delete_rule(self, rule_id: str) -> bool:
        """Delete a rule.

        Args:
            rule_id: Rule ID.

        Returns:
            True if deleted.
        """
        return await self.rule_repo.delete(rule_id)

    async def activate_rule(self, rule_id: str) -> Rule | None:
        """Activate a rule and deactivate others for the same source.

        Args:
            rule_id: Rule ID to activate.

        Returns:
            Activated rule or None if not found.
        """
        rule = await self.rule_repo.get_by_id(rule_id)
        if rule is None:
            return None

        # Deactivate other rules (this also clears the current rule's flag;
        # it is re-set immediately below).
        await self.rule_repo.deactivate_for_source(rule.source_id)

        # Activate this rule.
        rule.is_active = True

        await self.session.flush()
        await self.session.refresh(rule)
        return rule
|
|
823
|
+
|
|
824
|
+
|
|
825
|
+
class ProfileRepository(BaseRepository[Profile]):
    """Data-access layer for ``Profile`` records."""

    model = Profile

    async def get_for_source(
        self,
        source_id: str,
        *,
        limit: int = 20,
    ) -> Sequence[Profile]:
        """Fetch profiles belonging to a source, newest first.

        Args:
            source_id: Source ID.
            limit: Maximum number of profiles to return.

        Returns:
            Sequence of profiles.
        """
        return await self.list(
            limit=limit,
            filters=[Profile.source_id == source_id],
            order_by=Profile.created_at.desc(),
        )

    async def get_latest_for_source(self, source_id: str) -> Profile | None:
        """Fetch the single most recent profile for a source.

        Args:
            source_id: Source ID.

        Returns:
            The latest profile, or ``None`` when none exist.
        """
        stmt = (
            select(Profile)
            .where(Profile.source_id == source_id)
            .order_by(Profile.created_at.desc())
            .limit(1)
        )
        res = await self.session.execute(stmt)
        return res.scalar_one_or_none()
|
|
867
|
+
|
|
868
|
+
|
|
869
|
+
class ScheduleRepository(BaseRepository[Schedule]):
    """Data-access layer for ``Schedule`` records."""

    model = Schedule

    async def get_active(
        self,
        *,
        offset: int = 0,
        limit: int = 100,
    ) -> Sequence[Schedule]:
        """Return only schedules whose ``is_active`` flag is set.

        Args:
            offset: Number of rows to skip.
            limit: Maximum number of rows to return.

        Returns:
            Sequence of active schedules.
        """
        return await self.list(
            offset=offset,
            limit=limit,
            filters=[Schedule.is_active],
        )

    async def get_for_source(
        self,
        source_id: str,
        *,
        limit: int = 50,
    ) -> Sequence[Schedule]:
        """Fetch schedules belonging to a source, newest first.

        Args:
            source_id: Source ID.
            limit: Maximum number of schedules to return.

        Returns:
            Sequence of schedules.
        """
        return await self.list(
            limit=limit,
            filters=[Schedule.source_id == source_id],
            order_by=Schedule.created_at.desc(),
        )
|
|
915
|
+
|
|
916
|
+
|
|
917
|
+
class DriftComparisonRepository(BaseRepository[DriftComparison]):
    """Repository for DriftComparison model operations."""

    model = DriftComparison

    async def get_for_sources(
        self,
        baseline_source_id: str | None = None,
        current_source_id: str | None = None,
        *,
        limit: int = 20,
    ) -> Sequence[DriftComparison]:
        """Get drift comparisons, optionally filtered by either source.

        Args:
            baseline_source_id: Optional baseline source ID.
            current_source_id: Optional current source ID.
            limit: Maximum to return.

        Returns:
            Sequence of drift comparisons.
        """
        conditions = []
        if baseline_source_id:
            conditions.append(
                DriftComparison.baseline_source_id == baseline_source_id
            )
        if current_source_id:
            conditions.append(
                DriftComparison.current_source_id == current_source_id
            )

        # An empty filter list is passed as None so the base repository
        # applies no WHERE clause at all.
        return await self.list(
            limit=limit,
            filters=conditions or None,
            order_by=DriftComparison.created_at.desc(),
        )

    async def get_latest(
        self,
        baseline_source_id: str,
        current_source_id: str,
    ) -> DriftComparison | None:
        """Get the most recent comparison between two specific sources.

        Args:
            baseline_source_id: Baseline source ID.
            current_source_id: Current source ID.

        Returns:
            Latest comparison or None.
        """
        stmt = (
            select(DriftComparison)
            .where(
                and_(
                    DriftComparison.baseline_source_id == baseline_source_id,
                    DriftComparison.current_source_id == current_source_id,
                )
            )
            .order_by(DriftComparison.created_at.desc())
            .limit(1)
        )
        rows = await self.session.execute(stmt)
        return rows.scalar_one_or_none()
|
|
977
|
+
|
|
978
|
+
|
|
979
|
+
class ProfileService:
    """Service for data profiling with history tracking.

    Handles data profiling operations and stores results.
    """

    def __init__(self, session: AsyncSession) -> None:
        """Initialize service.

        Args:
            session: Database session.
        """
        self.session = session
        self.source_repo = SourceRepository(session)
        self.profile_repo = ProfileRepository(session)
        self.adapter = get_adapter()

    async def profile_source(self, source_id: str, *, save: bool = True) -> Profile:
        """Profile a data source and optionally save result.

        Args:
            source_id: Source ID to profile.
            save: Whether to save profile to database.

        Returns:
            Profile model with results.

        Raises:
            ValueError: If source not found.
        """
        source = await self.source_repo.get_by_id(source_id)
        if source is None:
            raise ValueError(f"Source '{source_id}' not found")

        result = await self.adapter.profile(source.source_path or "")

        # Single source of truth for the profile fields so the persisted
        # and transient code paths cannot drift apart (previously the five
        # kwargs were duplicated in both branches).
        fields: dict[str, Any] = {
            "source_id": source_id,
            "profile_json": result.to_dict(),
            "row_count": result.row_count,
            "column_count": result.column_count,
            "size_bytes": result.size_bytes,
        }

        if save:
            return await self.profile_repo.create(**fields)

        # Caller asked for a transient object; nothing is persisted.
        return Profile(**fields)

    async def get_latest_profile(self, source_id: str) -> Profile | None:
        """Get the latest profile for a source.

        Args:
            source_id: Source ID.

        Returns:
            Latest profile or None.
        """
        return await self.profile_repo.get_latest_for_source(source_id)

    async def list_profiles(
        self,
        source_id: str,
        *,
        limit: int = 20,
    ) -> Sequence[Profile]:
        """List profiles for a source.

        Args:
            source_id: Source ID.
            limit: Maximum to return.

        Returns:
            Sequence of profiles.
        """
        return await self.profile_repo.get_for_source(source_id, limit=limit)
|
|
1062
|
+
|
|
1063
|
+
|
|
1064
|
+
class HistoryService:
    """Service for validation history and analytics.

    Provides aggregated views of validation history with trend analysis.
    """

    def __init__(self, session: AsyncSession) -> None:
        """Initialize service.

        Args:
            session: Database session.
        """
        self.session = session
        self.validation_repo = ValidationRepository(session)

    async def get_history(
        self,
        source_id: str,
        *,
        period: Literal["7d", "30d", "90d"] = "30d",
        granularity: Literal["hourly", "daily", "weekly"] = "daily",
    ) -> dict[str, Any]:
        """Get validation history with trend data.

        Args:
            source_id: Source ID.
            period: Time period to analyze.
            granularity: Aggregation granularity.

        Returns:
            Dictionary with summary, trend, failure_frequency, and recent_validations.
        """
        lookback_days = {"7d": 7, "30d": 30, "90d": 90}[period]
        cutoff = datetime.utcnow() - timedelta(days=lookback_days)

        # Pull every validation in the window, newest first.
        stmt = (
            select(Validation)
            .where(Validation.source_id == source_id)
            .where(Validation.created_at >= cutoff)
            .order_by(Validation.created_at.desc())
        )
        rows = await self.session.execute(stmt)
        validations = list(rows.scalars().all())

        # Summary counts. Note: "failed" means passed is explicitly False,
        # so a None/unknown outcome is counted in neither bucket.
        total = len(validations)
        passed = sum(1 for v in validations if v.passed)
        failed = sum(1 for v in validations if v.passed is False)
        rate = (passed / total * 100) if total > 0 else 0

        # Ten most recent runs, already sorted by the query above.
        recent = [
            {
                "id": v.id,
                "status": v.status,
                "passed": v.passed,
                "has_critical": v.has_critical,
                "has_high": v.has_high,
                "total_issues": v.total_issues,
                "created_at": v.created_at.isoformat(),
            }
            for v in validations[:10]
        ]

        return {
            "summary": {
                "total_runs": total,
                "passed_runs": passed,
                "failed_runs": failed,
                "success_rate": round(rate, 2),
            },
            "trend": self._aggregate_by_period(validations, granularity),
            "failure_frequency": self._calculate_failure_frequency(validations),
            "recent_validations": recent,
        }

    def _aggregate_by_period(
        self,
        validations: list[Validation],
        granularity: Literal["hourly", "daily", "weekly"],
    ) -> list[dict[str, Any]]:
        """Aggregate validations by time period."""

        def bucket_key(v: Validation) -> str:
            # Map each validation to its time-bucket label.
            if granularity == "hourly":
                return v.created_at.strftime("%Y-%m-%d %H:00")
            if granularity == "daily":
                return v.created_at.strftime("%Y-%m-%d")
            # weekly: anchor each run to the Monday of its week
            monday = v.created_at - timedelta(days=v.created_at.weekday())
            return monday.strftime("%Y-%m-%d")

        buckets: dict[str, list[Validation]] = defaultdict(list)
        for v in validations:
            buckets[bucket_key(v)].append(v)

        trend: list[dict[str, Any]] = []
        for date, members in sorted(buckets.items()):
            ok = sum(1 for m in members if m.passed)
            rate = (ok / len(members) * 100) if members else 0
            trend.append(
                {
                    "date": date,
                    "success_rate": round(rate, 2),
                    "run_count": len(members),
                    "passed_count": ok,
                    "failed_count": len(members) - ok,
                }
            )
        return trend

    def _calculate_failure_frequency(
        self,
        validations: list[Validation],
    ) -> list[dict[str, Any]]:
        """Calculate failure frequency by issue type."""
        tally: Counter[str] = Counter()

        for v in validations:
            payload = v.result_json
            if not payload or "issues" not in payload:
                continue
            for issue in payload["issues"]:
                col = issue.get("column", "unknown")
                kind = issue.get("issue_type", "unknown")
                # Weight by the issue's own count when present.
                tally[f"{col}.{kind}"] += issue.get("count", 1)

        return [{"issue": key, "count": n} for key, n in tally.most_common(10)]
|
|
1198
|
+
|
|
1199
|
+
|
|
1200
|
+
class DriftService:
    """Service for drift detection.

    Handles drift comparison between datasets.
    """

    def __init__(self, session: AsyncSession) -> None:
        """Initialize service.

        Args:
            session: Database session.
        """
        self.session = session
        self.source_repo = SourceRepository(session)
        self.drift_repo = DriftComparisonRepository(session)
        self.adapter = get_adapter()

    async def compare(
        self,
        baseline_source_id: str,
        current_source_id: str,
        *,
        columns: list[str] | None = None,
        method: str = "auto",
        threshold: float | None = None,
        sample_size: int | None = None,
        save: bool = True,
    ) -> DriftComparison:
        """Compare two datasets for drift detection.

        Args:
            baseline_source_id: Baseline source ID.
            current_source_id: Current source ID.
            columns: Optional list of columns to compare.
            method: Detection method.
            threshold: Optional custom threshold.
            sample_size: Optional sample size.
            save: Whether to save comparison to database.

        Returns:
            DriftComparison model with results.

        Raises:
            ValueError: If source not found.
        """
        baseline = await self.source_repo.get_by_id(baseline_source_id)
        if baseline is None:
            raise ValueError(f"Baseline source '{baseline_source_id}' not found")

        current = await self.source_repo.get_by_id(current_source_id)
        if current is None:
            raise ValueError(f"Current source '{current_source_id}' not found")

        result = await self.adapter.compare(
            baseline.source_path or "",
            current.source_path or "",
            columns=columns,
            method=method,
            threshold=threshold,
            sample_size=sample_size,
        )

        # Record the request parameters alongside the result for audit.
        config = {
            "columns": columns,
            "method": method,
            "threshold": threshold,
            "sample_size": sample_size,
        }

        # Single source of truth for the comparison fields so the persisted
        # and transient code paths cannot drift apart (previously all eight
        # kwargs were duplicated in both branches).
        fields: dict[str, Any] = {
            "baseline_source_id": baseline_source_id,
            "current_source_id": current_source_id,
            "has_drift": result.has_drift,
            "has_high_drift": result.has_high_drift,
            "total_columns": result.total_columns,
            "drifted_columns": len(result.drifted_columns),
            "result_json": result.to_dict(),
            "config": config,
        }

        if save:
            return await self.drift_repo.create(**fields)

        # Caller asked for a transient object; nothing is persisted.
        return DriftComparison(**fields)

    async def get_comparison(self, comparison_id: str) -> DriftComparison | None:
        """Get a drift comparison by ID.

        Args:
            comparison_id: Comparison ID.

        Returns:
            DriftComparison or None.
        """
        return await self.drift_repo.get_by_id(comparison_id)

    async def list_comparisons(
        self,
        *,
        baseline_source_id: str | None = None,
        current_source_id: str | None = None,
        limit: int = 20,
    ) -> Sequence[DriftComparison]:
        """List drift comparisons.

        Args:
            baseline_source_id: Optional baseline source ID filter.
            current_source_id: Optional current source ID filter.
            limit: Maximum to return.

        Returns:
            Sequence of drift comparisons.
        """
        return await self.drift_repo.get_for_sources(
            baseline_source_id=baseline_source_id,
            current_source_id=current_source_id,
            limit=limit,
        )
|
|
1328
|
+
|
|
1329
|
+
|
|
1330
|
+
class ScheduleService:
    """Service for managing validation schedules.

    Handles schedule CRUD and integrates with APScheduler.
    """

    def __init__(self, session: AsyncSession) -> None:
        """Initialize service.

        Args:
            session: Database session.
        """
        self.session = session
        self.source_repo = SourceRepository(session)
        self.schedule_repo = ScheduleRepository(session)

    async def create_schedule(
        self,
        source_id: str,
        *,
        name: str,
        cron_expression: str,
        notify_on_failure: bool = True,
        config: dict[str, Any] | None = None,
    ) -> Schedule:
        """Create a new schedule.

        Args:
            source_id: Source ID to schedule.
            name: Schedule name.
            cron_expression: Cron expression.
            notify_on_failure: Send notification on failure.
            config: Additional configuration.

        Returns:
            Created schedule.

        Raises:
            ValueError: If source not found or invalid cron expression.
        """
        source = await self.source_repo.get_by_id(source_id)
        if source is None:
            raise ValueError(f"Source '{source_id}' not found")

        # Validate cron expression (raises ValueError if invalid).
        next_run = self._get_next_run(cron_expression)

        schedule = await self.schedule_repo.create(
            name=name,
            source_id=source_id,
            cron_expression=cron_expression,
            is_active=True,
            notify_on_failure=notify_on_failure,
            next_run_at=next_run,
            config=config,
        )

        return schedule

    async def get_schedule(self, schedule_id: str) -> Schedule | None:
        """Get schedule by ID.

        Args:
            schedule_id: Schedule ID.

        Returns:
            Schedule or None.
        """
        return await self.schedule_repo.get_by_id(schedule_id)

    async def list_schedules(
        self,
        *,
        source_id: str | None = None,
        active_only: bool = False,
        limit: int = 100,
    ) -> Sequence[Schedule]:
        """List schedules.

        Args:
            source_id: Optional source ID filter.
            active_only: Only return active schedules.
            limit: Maximum to return.

        Returns:
            Sequence of schedules.
        """
        # NOTE: source_id takes precedence; active_only is ignored when a
        # source filter is given (matches existing caller expectations).
        if source_id:
            return await self.schedule_repo.get_for_source(source_id, limit=limit)
        if active_only:
            return await self.schedule_repo.get_active(limit=limit)
        return await self.schedule_repo.list(limit=limit)

    async def update_schedule(
        self,
        schedule_id: str,
        *,
        name: str | None = None,
        cron_expression: str | None = None,
        notify_on_failure: bool | None = None,
        config: dict[str, Any] | None = None,
    ) -> Schedule | None:
        """Update a schedule.

        Args:
            schedule_id: Schedule ID.
            name: New name.
            cron_expression: New cron expression.
            notify_on_failure: New notification setting.
            config: New configuration.

        Returns:
            Updated schedule or None.
        """
        schedule = await self.schedule_repo.get_by_id(schedule_id)
        if schedule is None:
            return None

        if name is not None:
            schedule.name = name
        if cron_expression is not None:
            schedule.cron_expression = cron_expression
            # Recompute the next fire time whenever the expression changes.
            schedule.next_run_at = self._get_next_run(cron_expression)
        if notify_on_failure is not None:
            schedule.notify_on_failure = notify_on_failure
        if config is not None:
            schedule.config = config

        await self.session.flush()
        await self.session.refresh(schedule)
        return schedule

    async def delete_schedule(self, schedule_id: str) -> bool:
        """Delete a schedule.

        Args:
            schedule_id: Schedule ID.

        Returns:
            True if deleted.
        """
        return await self.schedule_repo.delete(schedule_id)

    async def pause_schedule(self, schedule_id: str) -> Schedule | None:
        """Pause a schedule.

        Args:
            schedule_id: Schedule ID.

        Returns:
            Updated schedule or None.
        """
        schedule = await self.schedule_repo.get_by_id(schedule_id)
        if schedule is None:
            return None

        schedule.pause()
        await self.session.flush()
        await self.session.refresh(schedule)
        return schedule

    async def resume_schedule(self, schedule_id: str) -> Schedule | None:
        """Resume a paused schedule.

        Args:
            schedule_id: Schedule ID.

        Returns:
            Updated schedule or None.
        """
        schedule = await self.schedule_repo.get_by_id(schedule_id)
        if schedule is None:
            return None

        schedule.resume()
        # A paused schedule's next_run_at is stale; recompute it.
        schedule.next_run_at = self._get_next_run(schedule.cron_expression)
        await self.session.flush()
        await self.session.refresh(schedule)
        return schedule

    def _get_next_run(self, cron_expression: str) -> datetime:
        """Calculate next run time from cron expression.

        Args:
            cron_expression: Cron expression.

        Returns:
            Next run datetime.

        Raises:
            ValueError: If invalid cron expression, or if no future fire
                time can be computed from it.
        """
        try:
            from apscheduler.triggers.cron import CronTrigger

            trigger = CronTrigger.from_crontab(cron_expression)
        except Exception as e:
            # Chain the original parse error so the root cause survives
            # in the traceback (previously lost: no `from e`).
            raise ValueError(f"Invalid cron expression: {e}") from e

        # Kept outside the try above: previously this ValueError was caught
        # by our own broad except and re-wrapped into the misleading message
        # "Invalid cron expression: Could not calculate next run time".
        next_fire = trigger.get_next_fire_time(None, datetime.utcnow())
        if next_fire is None:
            raise ValueError("Could not calculate next run time")
        return next_fire
|