PyPI - truthound-dashboard - Versions diffs - 1.4.4__py3-none-any.whl → 1.5.0__py3-none-any.whl - Mend

truthound-dashboard: 1.4.4 (py3-none-any.whl) → 1.5.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (205)

truthound_dashboard/api/alerts.py +75 -86
truthound_dashboard/api/anomaly.py +7 -13
truthound_dashboard/api/cross_alerts.py +38 -52
truthound_dashboard/api/drift.py +49 -59
truthound_dashboard/api/drift_monitor.py +234 -79
truthound_dashboard/api/enterprise_sampling.py +498 -0
truthound_dashboard/api/history.py +57 -5
truthound_dashboard/api/lineage.py +3 -48
truthound_dashboard/api/maintenance.py +104 -49
truthound_dashboard/api/mask.py +1 -2
truthound_dashboard/api/middleware.py +2 -1
truthound_dashboard/api/model_monitoring.py +435 -311
truthound_dashboard/api/notifications.py +227 -191
truthound_dashboard/api/notifications_advanced.py +21 -20
truthound_dashboard/api/observability.py +586 -0
truthound_dashboard/api/plugins.py +2 -433
truthound_dashboard/api/profile.py +199 -37
truthound_dashboard/api/quality_reporter.py +701 -0
truthound_dashboard/api/reports.py +7 -16
truthound_dashboard/api/router.py +66 -0
truthound_dashboard/api/rule_suggestions.py +5 -5
truthound_dashboard/api/scan.py +17 -19
truthound_dashboard/api/schedules.py +85 -50
truthound_dashboard/api/schema_evolution.py +6 -6
truthound_dashboard/api/schema_watcher.py +667 -0
truthound_dashboard/api/sources.py +98 -27
truthound_dashboard/api/tiering.py +1323 -0
truthound_dashboard/api/triggers.py +14 -11
truthound_dashboard/api/validations.py +12 -11
truthound_dashboard/api/versioning.py +1 -6
truthound_dashboard/core/__init__.py +129 -3
truthound_dashboard/core/actions/__init__.py +62 -0
truthound_dashboard/core/actions/custom.py +426 -0
truthound_dashboard/core/actions/notifications.py +910 -0
truthound_dashboard/core/actions/storage.py +472 -0
truthound_dashboard/core/actions/webhook.py +281 -0
truthound_dashboard/core/anomaly.py +262 -67
truthound_dashboard/core/anomaly_explainer.py +4 -3
truthound_dashboard/core/backends/__init__.py +67 -0
truthound_dashboard/core/backends/base.py +299 -0
truthound_dashboard/core/backends/errors.py +191 -0
truthound_dashboard/core/backends/factory.py +423 -0
truthound_dashboard/core/backends/mock_backend.py +451 -0
truthound_dashboard/core/backends/truthound_backend.py +718 -0
truthound_dashboard/core/checkpoint/__init__.py +87 -0
truthound_dashboard/core/checkpoint/adapters.py +814 -0
truthound_dashboard/core/checkpoint/checkpoint.py +491 -0
truthound_dashboard/core/checkpoint/runner.py +270 -0
truthound_dashboard/core/connections.py +437 -10
truthound_dashboard/core/converters/__init__.py +14 -0
truthound_dashboard/core/converters/truthound.py +620 -0
truthound_dashboard/core/cross_alerts.py +540 -320
truthound_dashboard/core/datasource_factory.py +1672 -0
truthound_dashboard/core/drift_monitor.py +216 -20
truthound_dashboard/core/enterprise_sampling.py +1291 -0
truthound_dashboard/core/interfaces/__init__.py +225 -0
truthound_dashboard/core/interfaces/actions.py +652 -0
truthound_dashboard/core/interfaces/base.py +247 -0
truthound_dashboard/core/interfaces/checkpoint.py +676 -0
truthound_dashboard/core/interfaces/protocols.py +664 -0
truthound_dashboard/core/interfaces/reporters.py +650 -0
truthound_dashboard/core/interfaces/routing.py +646 -0
truthound_dashboard/core/interfaces/triggers.py +619 -0
truthound_dashboard/core/lineage.py +407 -71
truthound_dashboard/core/model_monitoring.py +431 -3
truthound_dashboard/core/notifications/base.py +4 -0
truthound_dashboard/core/notifications/channels.py +501 -1203
truthound_dashboard/core/notifications/deduplication/__init__.py +81 -115
truthound_dashboard/core/notifications/deduplication/service.py +131 -348
truthound_dashboard/core/notifications/dispatcher.py +202 -11
truthound_dashboard/core/notifications/escalation/__init__.py +119 -106
truthound_dashboard/core/notifications/escalation/engine.py +168 -358
truthound_dashboard/core/notifications/routing/__init__.py +88 -128
truthound_dashboard/core/notifications/routing/engine.py +90 -317
truthound_dashboard/core/notifications/stats_aggregator.py +246 -1
truthound_dashboard/core/notifications/throttling/__init__.py +67 -50
truthound_dashboard/core/notifications/throttling/builder.py +117 -255
truthound_dashboard/core/notifications/truthound_adapter.py +842 -0
truthound_dashboard/core/phase5/collaboration.py +1 -1
truthound_dashboard/core/plugins/lifecycle/__init__.py +0 -13
truthound_dashboard/core/quality_reporter.py +1359 -0
truthound_dashboard/core/report_history.py +0 -6
truthound_dashboard/core/reporters/__init__.py +175 -14
truthound_dashboard/core/reporters/adapters.py +943 -0
truthound_dashboard/core/reporters/base.py +0 -3
truthound_dashboard/core/reporters/builtin/__init__.py +18 -0
truthound_dashboard/core/reporters/builtin/csv_reporter.py +111 -0
truthound_dashboard/core/reporters/builtin/html_reporter.py +270 -0
truthound_dashboard/core/reporters/builtin/json_reporter.py +127 -0
truthound_dashboard/core/reporters/compat.py +266 -0
truthound_dashboard/core/reporters/csv_reporter.py +2 -35
truthound_dashboard/core/reporters/factory.py +526 -0
truthound_dashboard/core/reporters/interfaces.py +745 -0
truthound_dashboard/core/reporters/registry.py +1 -10
truthound_dashboard/core/scheduler.py +165 -0
truthound_dashboard/core/schema_evolution.py +3 -3
truthound_dashboard/core/schema_watcher.py +1528 -0
truthound_dashboard/core/services.py +595 -76
truthound_dashboard/core/store_manager.py +810 -0
truthound_dashboard/core/streaming_anomaly.py +169 -4
truthound_dashboard/core/tiering.py +1309 -0
truthound_dashboard/core/triggers/evaluators.py +178 -8
truthound_dashboard/core/truthound_adapter.py +2620 -197
truthound_dashboard/core/unified_alerts.py +23 -20
truthound_dashboard/db/__init__.py +8 -0
truthound_dashboard/db/database.py +8 -2
truthound_dashboard/db/models.py +944 -25
truthound_dashboard/db/repository.py +2 -0
truthound_dashboard/main.py +11 -0
truthound_dashboard/schemas/__init__.py +177 -16
truthound_dashboard/schemas/base.py +44 -23
truthound_dashboard/schemas/collaboration.py +19 -6
truthound_dashboard/schemas/cross_alerts.py +19 -3
truthound_dashboard/schemas/drift.py +61 -55
truthound_dashboard/schemas/drift_monitor.py +67 -23
truthound_dashboard/schemas/enterprise_sampling.py +653 -0
truthound_dashboard/schemas/lineage.py +0 -33
truthound_dashboard/schemas/mask.py +10 -8
truthound_dashboard/schemas/model_monitoring.py +89 -10
truthound_dashboard/schemas/notifications_advanced.py +13 -0
truthound_dashboard/schemas/observability.py +453 -0
truthound_dashboard/schemas/plugins.py +0 -280
truthound_dashboard/schemas/profile.py +154 -247
truthound_dashboard/schemas/quality_reporter.py +403 -0
truthound_dashboard/schemas/reports.py +2 -2
truthound_dashboard/schemas/rule_suggestion.py +8 -1
truthound_dashboard/schemas/scan.py +4 -24
truthound_dashboard/schemas/schedule.py +11 -3
truthound_dashboard/schemas/schema_watcher.py +727 -0
truthound_dashboard/schemas/source.py +17 -2
truthound_dashboard/schemas/tiering.py +822 -0
truthound_dashboard/schemas/triggers.py +16 -0
truthound_dashboard/schemas/unified_alerts.py +7 -0
truthound_dashboard/schemas/validation.py +0 -13
truthound_dashboard/schemas/validators/base.py +41 -21
truthound_dashboard/schemas/validators/business_rule_validators.py +244 -0
truthound_dashboard/schemas/validators/localization_validators.py +273 -0
truthound_dashboard/schemas/validators/ml_feature_validators.py +308 -0
truthound_dashboard/schemas/validators/profiling_validators.py +275 -0
truthound_dashboard/schemas/validators/referential_validators.py +312 -0
truthound_dashboard/schemas/validators/registry.py +93 -8
truthound_dashboard/schemas/validators/timeseries_validators.py +389 -0
truthound_dashboard/schemas/versioning.py +1 -6
truthound_dashboard/static/index.html +2 -2
truthound_dashboard-1.5.0.dist-info/METADATA +309 -0
{truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/RECORD +149 -148
truthound_dashboard/core/plugins/hooks/__init__.py +0 -63
truthound_dashboard/core/plugins/hooks/decorators.py +0 -367
truthound_dashboard/core/plugins/hooks/manager.py +0 -403
truthound_dashboard/core/plugins/hooks/protocols.py +0 -265
truthound_dashboard/core/plugins/lifecycle/hot_reload.py +0 -584
truthound_dashboard/core/reporters/junit_reporter.py +0 -233
truthound_dashboard/core/reporters/markdown_reporter.py +0 -207
truthound_dashboard/core/reporters/pdf_reporter.py +0 -209
truthound_dashboard/static/assets/_baseUniq-BcrSP13d.js +0 -1
truthound_dashboard/static/assets/arc-DlYjKwIL.js +0 -1
truthound_dashboard/static/assets/architectureDiagram-VXUJARFQ-Bb2drbQM.js +0 -36
truthound_dashboard/static/assets/blockDiagram-VD42YOAC-BlsPG1CH.js +0 -122
truthound_dashboard/static/assets/c4Diagram-YG6GDRKO-B9JdUoaC.js +0 -10
truthound_dashboard/static/assets/channel-Q6mHF1Hd.js +0 -1
truthound_dashboard/static/assets/chunk-4BX2VUAB-DmyoPVuJ.js +0 -1
truthound_dashboard/static/assets/chunk-55IACEB6-Bcz6Siv8.js +0 -1
truthound_dashboard/static/assets/chunk-B4BG7PRW-Br3G5Rum.js +0 -165
truthound_dashboard/static/assets/chunk-DI55MBZ5-DuM9c23u.js +0 -220
truthound_dashboard/static/assets/chunk-FMBD7UC4-DNU-5mvT.js +0 -15
truthound_dashboard/static/assets/chunk-QN33PNHL-Im2yNcmS.js +0 -1
truthound_dashboard/static/assets/chunk-QZHKN3VN-kZr8XFm1.js +0 -1
truthound_dashboard/static/assets/chunk-TZMSLE5B-Q__360q_.js +0 -1
truthound_dashboard/static/assets/classDiagram-2ON5EDUG-vtixxUyK.js +0 -1
truthound_dashboard/static/assets/classDiagram-v2-WZHVMYZB-vtixxUyK.js +0 -1
truthound_dashboard/static/assets/clone-BOt2LwD0.js +0 -1
truthound_dashboard/static/assets/cose-bilkent-S5V4N54A-CBDw6iac.js +0 -1
truthound_dashboard/static/assets/dagre-6UL2VRFP-XdKqmmY9.js +0 -4
truthound_dashboard/static/assets/diagram-PSM6KHXK-DAZ8nx9V.js +0 -24
truthound_dashboard/static/assets/diagram-QEK2KX5R-BRvDTbGD.js +0 -43
truthound_dashboard/static/assets/diagram-S2PKOQOG-bQcczUkl.js +0 -24
truthound_dashboard/static/assets/erDiagram-Q2GNP2WA-DPje7VMN.js +0 -60
truthound_dashboard/static/assets/flowDiagram-NV44I4VS-B7BVtFVS.js +0 -162
truthound_dashboard/static/assets/ganttDiagram-JELNMOA3-D6WKSS7U.js +0 -267
truthound_dashboard/static/assets/gitGraphDiagram-NY62KEGX-D3vtVd3y.js +0 -65
truthound_dashboard/static/assets/graph-BKgNKZVp.js +0 -1
truthound_dashboard/static/assets/index-C6JSrkHo.css +0 -1
truthound_dashboard/static/assets/index-DkU82VsU.js +0 -1800
truthound_dashboard/static/assets/infoDiagram-WHAUD3N6-DnNCT429.js +0 -2
truthound_dashboard/static/assets/journeyDiagram-XKPGCS4Q-DGiMozqS.js +0 -139
truthound_dashboard/static/assets/kanban-definition-3W4ZIXB7-BV2gUgli.js +0 -89
truthound_dashboard/static/assets/katex-Cu_Erd72.js +0 -261
truthound_dashboard/static/assets/layout-DI2MfQ5G.js +0 -1
truthound_dashboard/static/assets/min-DYdgXVcT.js +0 -1
truthound_dashboard/static/assets/mindmap-definition-VGOIOE7T-C7x4ruxz.js +0 -68
truthound_dashboard/static/assets/pieDiagram-ADFJNKIX-CAJaAB9f.js +0 -30
truthound_dashboard/static/assets/quadrantDiagram-AYHSOK5B-DeqwDI46.js +0 -7
truthound_dashboard/static/assets/requirementDiagram-UZGBJVZJ-e3XDpZIM.js +0 -64
truthound_dashboard/static/assets/sankeyDiagram-TZEHDZUN-CNnAv5Ux.js +0 -10
truthound_dashboard/static/assets/sequenceDiagram-WL72ISMW-Dsne-Of3.js +0 -145
truthound_dashboard/static/assets/stateDiagram-FKZM4ZOC-Ee0sQXyb.js +0 -1
truthound_dashboard/static/assets/stateDiagram-v2-4FDKWEC3-B26KqW_W.js +0 -1
truthound_dashboard/static/assets/timeline-definition-IT6M3QCI-DZYi2yl3.js +0 -61
truthound_dashboard/static/assets/treemap-KMMF4GRG-CY3f8In2.js +0 -128
truthound_dashboard/static/assets/unmerged_dictionaries-Dd7xcPWG.js +0 -1
truthound_dashboard/static/assets/xychartDiagram-PRI3JC2R-CS7fydZZ.js +0 -7
truthound_dashboard-1.4.4.dist-info/METADATA +0 -507
{truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/WHEEL +0 -0
{truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/entry_points.txt +0 -0
{truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/licenses/LICENSE +0 -0

truthound_dashboard/core/services.py (changed)

@@ -4,16 +4,25 @@ This module contains service classes that implement business logic
 for the dashboard, separating concerns from API handlers.
 Services handle:
-- Data source management
+- Data source management with multi-backend support
 - Schema learning and storage
 - Validation execution and tracking
 - Data profiling with history
 - Drift detection
 - Schedule management
+Supports various data backends through truthound's DataSource abstraction:
+- File: CSV, Parquet, JSON, NDJSON, JSONL
+- SQL: SQLite, PostgreSQL, MySQL
+- Cloud DW: BigQuery, Snowflake, Redshift, Databricks
+- Enterprise: Oracle, SQL Server
+- NoSQL: MongoDB, Elasticsearch (async)
+- Streaming: Kafka (async)
 """
 from __future__ import annotations
+import logging
 from collections import Counter, defaultdict
 from collections.abc import Sequence
 from datetime import datetime, timedelta
@@ -35,13 +44,24 @@ from truthound_dashboard.db import (
     Validation,
 )
+from .datasource_factory import (
+    SourceConfig,
+    SourceType,
+    create_datasource,
+    get_source_path_or_datasource,
+)
 from .truthound_adapter import (
     CheckResult,
+    DataInput,
+    GenerateSuiteResult,
     MaskResult,
+    ProfileResult,
     ScanResult,
     get_adapter,
 )
+logger = logging.getLogger(__name__)
 class SourceRepository(BaseRepository[Source]):
     """Repository for Source model operations."""
@@ -82,6 +102,74 @@ class SourceRepository(BaseRepository[Source]):
         return result.scalar_one_or_none()
+def get_data_input_from_source(source: Source) -> DataInput:
+    """Get DataInput (path or DataSource object) from Source model.
+    This helper function creates the appropriate data input for truthound
+    operations based on the source type and configuration.
+    For file-based sources, returns the file path string.
+    For database sources, creates and returns a DataSource object.
+    Args:
+        source: Source database model.
+    Returns:
+        File path string for file sources, DataSource object for others.
+    Raises:
+        ValueError: If source configuration is invalid.
+    """
+    source_type = source.type.lower()
+    config = source.config or {}
+    # For file sources, return path directly
+    if SourceType.is_file_type(source_type):
+        path = config.get("path") or source.source_path
+        if not path:
+            raise ValueError(f"No path configured for file source: {source.name}")
+        return path
+    # For database sources, create DataSource object
+    try:
+        full_config = {"type": source_type, **config}
+        return create_datasource(full_config)
+    except Exception as e:
+        logger.error(f"Failed to create DataSource for {source.name}: {e}")
+        raise ValueError(f"Failed to create DataSource: {e}") from e
+async def get_async_data_input_from_source(source: Source) -> DataInput:
+    """Get DataInput for async sources (MongoDB, Elasticsearch, Kafka).
+    This helper function creates DataSource objects for sources that
+    require async initialization.
+    Args:
+        source: Source database model.
+    Returns:
+        DataSource object.
+    Raises:
+        ValueError: If source type doesn't require async or config is invalid.
+    """
+    from .datasource_factory import create_datasource_async
+    source_type = source.type.lower()
+    config = source.config or {}
+    if not SourceType.is_async_type(source_type):
+        raise ValueError(f"Source type '{source_type}' doesn't require async creation")
+    try:
+        full_config = {"type": source_type, **config}
+        return await create_datasource_async(full_config)
+    except Exception as e:
+        logger.error(f"Failed to create async DataSource for {source.name}: {e}")
+        raise ValueError(f"Failed to create async DataSource: {e}") from e
 class SchemaRepository(BaseRepository[Schema]):
     """Repository for Schema model operations."""
@@ -196,22 +284,28 @@ class ValidationRepository(BaseRepository[Validation]):
         self,
         source_id: str,
         *,
+        offset: int = 0,
         limit: int = 20,
-    ) -> Sequence[Validation]:
-        """Get validations for a source.
+    ) -> tuple[Sequence[Validation], int]:
+        """Get validations for a source with pagination.
         Args:
             source_id: Source ID.
+            offset: Number of items to skip.
             limit: Maximum to return.
         Returns:
-            Sequence of validations.
+            Tuple of (validations, total_count).
         """
-        return await self.list(
+        filters = [Validation.source_id == source_id]
+        validations = await self.list(
+            offset=offset,
             limit=limit,
-            filters=[Validation.source_id == source_id],
+            filters=filters,
             order_by=Validation.created_at.desc(),
         )
+        total = await self.count(filters=filters)
+        return validations, total
     async def get_latest_for_source(self, source_id: str) -> Validation | None:
         """Get most recent validation for a source.
@@ -230,6 +324,24 @@ class ValidationRepository(BaseRepository[Validation]):
         )
         return result.scalar_one_or_none()
+    async def get_with_source(self, validation_id: str) -> Validation | None:
+        """Get validation by ID with source eagerly loaded.
+        Args:
+            validation_id: Validation ID.
+        Returns:
+            Validation with source loaded, or None.
+        """
+        from sqlalchemy.orm import selectinload
+        result = await self.session.execute(
+            select(Validation)
+            .options(selectinload(Validation.source))
+            .where(Validation.id == validation_id)
+        )
+        return result.scalar_one_or_none()
 class SourceService:
     """Service for managing data sources.
@@ -273,6 +385,19 @@ class SourceService:
             return await self.repository.get_active(offset=offset, limit=limit)
         return await self.repository.list(offset=offset, limit=limit)
+    async def count(self, *, active_only: bool = True) -> int:
+        """Count sources.
+        Args:
+            active_only: Only count active sources.
+        Returns:
+            Total count of sources.
+        """
+        if active_only:
+            return await self.repository.count(filters=[Source.is_active == True])
+        return await self.repository.count()
     async def create(
         self,
         *,
@@ -372,7 +497,8 @@ class SourceService:
         Returns:
             Sequence of validations.
         """
-        return await self.validation_repo.get_for_source(source_id, limit=limit)
+        validations, _ = await self.validation_repo.get_for_source(source_id, limit=limit)
+        return validations
 class ValidationService:
@@ -380,6 +506,14 @@ class ValidationService:
     Handles validation execution, result storage, and history.
     Supports both built-in truthound validators and custom validators.
+    Supports various data backends through truthound's DataSource abstraction:
+    - File: CSV, Parquet, JSON, NDJSON, JSONL
+    - SQL: SQLite, PostgreSQL, MySQL
+    - Cloud DW: BigQuery, Snowflake, Redshift, Databricks
+    - Enterprise: Oracle, SQL Server
+    - NoSQL: MongoDB, Elasticsearch (async)
+    - Streaming: Kafka (async)
     """
     def __init__(self, session: AsyncSession) -> None:
@@ -399,13 +533,11 @@ class ValidationService:
         source_id: str,
         *,
         validators: list[str] | None = None,
-        validator_params: dict[str, dict[str, Any]] | None = None,
+        validator_config: dict[str, dict[str, Any]] | None = None,
         custom_validators: list[dict[str, Any]] | None = None,
         schema_path: str | None = None,
         auto_schema: bool = False,
-        columns: list[str] | None = None,
         min_severity: str | None = None,
-        strict: bool = False,
         parallel: bool = False,
         max_workers: int | None = None,
         pushdown: bool | None = None,
@@ -416,20 +548,22 @@ class ValidationService:
         allowing fine-grained control over validation behavior. It also supports
         running custom validators alongside built-in validators.
+        Supports all data source types including files, SQL databases,
+        cloud data warehouses, and async sources (MongoDB, Elasticsearch, Kafka).
         Args:
             source_id: Source ID to validate.
             validators: Optional validator list. If None, all validators run.
-            validator_params: Optional per-validator parameters.
+            validator_config: Optional per-validator configuration (truthound 2.x).
                 Format: {"ValidatorName": {"param1": value1, "param2": value2}}
-                Example: {"Null": {"columns": ["email"], "mostly": 0.95},
+                Example: {"Null": {"columns": ("email",), "mostly": 0.95},
                           "CompletenessRatio": {"column": "phone", "min_ratio": 0.98}}
+                Note: columns should be tuples, not lists, for truthound 2.x.
             custom_validators: Optional list of custom validator configs.
                 Format: [{"validator_id": "...", "column": "...", "params": {...}}]
             schema_path: Optional schema file path.
             auto_schema: Auto-learn schema if True.
-            columns: Columns to validate. If None, validates all columns.
             min_severity: Minimum severity to report ("low", "medium", "high", "critical").
-            strict: If True, raises exception on validation failures.
             parallel: If True, uses DAG-based parallel execution.
             max_workers: Max threads for parallel execution (requires parallel=True).
             pushdown: Enable query pushdown for SQL sources. None uses auto-detection.
@@ -438,7 +572,7 @@ class ValidationService:
             Validation record with results.
         Raises:
-            ValueError: If source not found.
+            ValueError: If source not found or data source creation fails.
         """
         # Get source
         source = await self.source_repo.get_by_id(source_id)
@@ -453,16 +587,21 @@ class ValidationService:
         )
         try:
+            # Get data input based on source type
+            # For async sources (MongoDB, Elasticsearch, Kafka), use async creation
+            if SourceType.is_async_type(source.type):
+                data_input = await get_async_data_input_from_source(source)
+            else:
+                data_input = get_data_input_from_source(source)
             # Run built-in validation with all supported parameters
             result = await self.adapter.check(
-                source.source_path or "",
+                data_input,
                 validators=validators,
-                validator_params=validator_params,
+                validator_config=validator_config,
                 schema=schema_path,
                 auto_schema=auto_schema,
-                columns=columns,
                 min_severity=min_severity,
-                strict=strict,
                 parallel=parallel,
                 max_workers=max_workers,
                 pushdown=pushdown,
@@ -649,39 +788,49 @@ class ValidationService:
             delta = validation.completed_at - validation.started_at
             validation.duration_ms = int(delta.total_seconds() * 1000)
-    async def get_validation(self, validation_id: str) -> Validation | None:
+    async def get_validation(
+        self, validation_id: str, *, with_source: bool = False
+    ) -> Validation | None:
         """Get validation by ID.
         Args:
             validation_id: Validation ID.
+            with_source: If True, eagerly load the source relationship.
         Returns:
             Validation or None.
         """
+        if with_source:
+            return await self.validation_repo.get_with_source(validation_id)
         return await self.validation_repo.get_by_id(validation_id)
     async def list_for_source(
         self,
         source_id: str,
         *,
+        offset: int = 0,
         limit: int = 20,
-    ) -> Sequence[Validation]:
-        """List validations for a source.
+    ) -> tuple[Sequence[Validation], int]:
+        """List validations for a source with pagination.
         Args:
             source_id: Source ID.
+            offset: Number of items to skip.
             limit: Maximum to return.
         Returns:
-            Sequence of validations.
+            Tuple of (validations, total_count).
         """
-        return await self.validation_repo.get_for_source(source_id, limit=limit)
+        return await self.validation_repo.get_for_source(
+            source_id, offset=offset, limit=limit
+        )
 class SchemaService:
     """Service for schema learning and management.
     Handles schema learning, storage, and retrieval.
+    Supports all data source types through DataSource abstraction.
     """
     def __init__(self, session: AsyncSession) -> None:
@@ -706,7 +855,7 @@ class SchemaService:
         """Learn and store schema for a source.
         Wraps truthound's th.learn() with full parameter support for schema
-        inference customization.
+        inference customization. Supports all data source types.
         Args:
             source_id: Source ID.
@@ -722,16 +871,22 @@ class SchemaService:
             Created schema record.
         Raises:
-            ValueError: If source not found.
+            ValueError: If source not found or data source creation fails.
         """
         # Get source
         source = await self.source_repo.get_by_id(source_id)
         if source is None:
             raise ValueError(f"Source '{source_id}' not found")
+        # Get data input based on source type
+        if SourceType.is_async_type(source.type):
+            data_input = await get_async_data_input_from_source(source)
+        else:
+            data_input = get_data_input_from_source(source)
         # Learn schema with all parameters
         result = await self.adapter.learn(
-            source.source_path or "",
+            data_input,
             infer_constraints=infer_constraints,
             categorical_threshold=categorical_threshold,
             sample_size=sample_size,
@@ -1019,17 +1174,20 @@ class ProfileRepository(BaseRepository[Profile]):
         source_id: str,
         *,
         limit: int = 20,
+        offset: int = 0,
     ) -> Sequence[Profile]:
         """Get profiles for a source.
         Args:
             source_id: Source ID.
             limit: Maximum to return.
+            offset: Number to skip.
         Returns:
             Sequence of profiles.
         """
         return await self.list(
+            offset=offset,
             limit=limit,
             filters=[Profile.source_id == source_id],
             order_by=Profile.created_at.desc(),
@@ -1167,6 +1325,10 @@ class ProfileService:
     """Service for data profiling with history tracking.
     Handles data profiling operations and stores results.
+    Uses the new truthound profiler API with ProfilerConfig for
+    fine-grained control over profiling behavior.
+    Supports all data source types through DataSource abstraction.
     """
     def __init__(self, session: AsyncSession) -> None:
@@ -1184,30 +1346,104 @@ class ProfileService:
         self,
         source_id: str,
         *,
-        sample_size: int | None = None,
         save: bool = True,
     ) -> Profile:
         """Profile a data source and optionally save result.
+        Note: truthound's th.profile() only accepts (data, source) parameters.
+        Advanced configuration options are NOT supported by the underlying library.
+        Supports all data source types including files, SQL databases,
+        cloud data warehouses, and async sources.
         Args:
             source_id: Source ID to profile.
-            sample_size: Maximum number of rows to sample for profiling.
-                If None, profiles all data. Useful for large datasets.
             save: Whether to save profile to database.
         Returns:
             Profile model with results.
         Raises:
-            ValueError: If source not found.
+            ValueError: If source not found or data source creation fails.
         """
         source = await self.source_repo.get_by_id(source_id)
         if source is None:
             raise ValueError(f"Source '{source_id}' not found")
-        result = await self.adapter.profile(
-            source.source_path or "",
-            sample_size=sample_size,
+        # Get data input based on source type
+        if SourceType.is_async_type(source.type):
+            data_input = await get_async_data_input_from_source(source)
+        else:
+            data_input = get_data_input_from_source(source)
+        result = await self.adapter.profile(data_input)
+        if save:
+            profile = await self.profile_repo.create(
+                source_id=source_id,
+                profile_json=result.to_dict(),
+                row_count=result.row_count,
+                column_count=result.column_count,
+                size_bytes=result.size_bytes or result.estimated_memory_bytes,
+            )
+            return profile
+        # Return unsaved profile object
+        profile = Profile(
+            source_id=source_id,
+            profile_json=result.to_dict(),
+            row_count=result.row_count,
+            column_count=result.column_count,
+            size_bytes=result.size_bytes or result.estimated_memory_bytes,
+        )
+        return profile
+    async def profile_source_advanced(
+        self,
+        source_id: str,
+        *,
+        config: dict[str, Any] | None = None,
+        save: bool = True,
+    ) -> Profile:
+        """Profile a data source with full ProfilerConfig support.
+        Provides direct access to all ProfilerConfig options through
+        a configuration dictionary for maximum flexibility.
+        Args:
+            source_id: Source ID to profile.
+            config: ProfilerConfig options as dictionary:
+                - sample_size: int | None (max rows to sample)
+                - random_seed: int (default 42)
+                - include_patterns: bool (default True)
+                - include_correlations: bool (default False)
+                - include_distributions: bool (default True)
+                - top_n_values: int (default 10)
+                - pattern_sample_size: int (default 1000)
+                - correlation_threshold: float (default 0.7)
+                - min_pattern_match_ratio: float (default 0.8)
+                - n_jobs: int (default 1)
+            save: Whether to save profile to database.
+        Returns:
+            Profile model with results.
+        Raises:
+            ValueError: If source not found or data source creation fails.
+        """
+        source = await self.source_repo.get_by_id(source_id)
+        if source is None:
+            raise ValueError(f"Source '{source_id}' not found")
+        # Get data input based on source type
+        if SourceType.is_async_type(source.type):
+            data_input = await get_async_data_input_from_source(source)
+        else:
+            data_input = get_data_input_from_source(source)
+        result = await self.adapter.profile_advanced(
+            data_input,
+            config=config,
         )
         if save:
@@ -1216,7 +1452,7 @@ class ProfileService:
                 profile_json=result.to_dict(),
                 row_count=result.row_count,
                 column_count=result.column_count,
-                size_bytes=result.size_bytes,
+                size_bytes=result.size_bytes or result.estimated_memory_bytes,
             )
             return profile
@@ -1226,10 +1462,115 @@ class ProfileService:
             profile_json=result.to_dict(),
             row_count=result.row_count,
             column_count=result.column_count,
-            size_bytes=result.size_bytes,
+            size_bytes=result.size_bytes or result.estimated_memory_bytes,
         )
         return profile
+    async def generate_rules_from_profile(
+        self,
+        source_id: str,
+        *,
+        strictness: str = "medium",
+        preset: str = "default",
+        include_categories: list[str] | None = None,
+        exclude_categories: list[str] | None = None,
+        profile_if_needed: bool = True,
+        sample_size: int | None = None,
+    ) -> dict[str, Any]:
+        """Generate validation rules from source profile.
+        Uses truthound's generate_suite() to automatically create
+        validation rules based on the profiled data characteristics.
+        Args:
+            source_id: Source ID to generate rules for.
+            strictness: Rule strictness level:
+                - "loose": Permissive thresholds, fewer rules
+                - "medium": Balanced defaults (default)
+                - "strict": Tight thresholds, comprehensive rules
+            preset: Rule generation preset:
+                - "default": General purpose
+                - "strict": Production data
+                - "loose": Development/testing
+                - "minimal": Essential rules only
+                - "comprehensive": All available rules
+                - "ci_cd": CI/CD optimized
+                - "schema_only": Structure validation only
+                - "format_only": Format/pattern rules only
+            include_categories: Rule categories to include (None = all).
+            exclude_categories: Rule categories to exclude.
+            profile_if_needed: If True, profile source if no recent profile exists.
+            sample_size: Sample size for profiling if needed.
+        Returns:
+            Dictionary with generated rules, YAML content, and metadata.
+        Raises:
+            ValueError: If source not found or no profile available.
+        """
+        source = await self.source_repo.get_by_id(source_id)
+        if source is None:
+            raise ValueError(f"Source '{source_id}' not found")
+        # Get or create profile
+        profile = await self.profile_repo.get_latest_for_source(source_id)
+        if profile is None:
+            if not profile_if_needed:
+                raise ValueError(
+                    f"No profile found for source '{source_id}'. "
+                    "Run profile_source() first or set profile_if_needed=True."
+                )
+            # Create profile
+            profile = await self.profile_source(
+                source_id,
+                sample_size=sample_size,
+                include_patterns=True,
+                save=True,
+            )
+        # Generate rules from profile
+        result = await self.adapter.generate_suite(
+            profile.profile_json,
+            strictness=strictness,
+            preset=preset,
+            include=include_categories,
+            exclude=exclude_categories,
+        )
+        return {
+            "source_id": source_id,
+            "profile_id": str(profile.id) if profile.id else None,
+            "rules": result.rules,
+            "rule_count": result.rule_count,
+            "categories": result.categories,
+            "strictness": result.strictness,
+            "yaml_content": result.yaml_content,
+            "json_content": result.json_content,
+        }
+    async def get(self, profile_id: str) -> Profile | None:
+        """Get a profile by ID.
+        Args:
+            profile_id: Profile ID.
+        Returns:
+            Profile or None.
+        """
+        return await self.profile_repo.get_by_id(profile_id)
+    async def get_latest(self, source_id: str) -> Profile | None:
+        """Get the latest profile for a source.
+        Args:
+            source_id: Source ID.
+        Returns:
+            Latest profile or None.
+        """
+        return await self.profile_repo.get_latest_for_source(source_id)
     async def get_latest_profile(self, source_id: str) -> Profile | None:
         """Get the latest profile for a source.
@@ -1258,6 +1599,157 @@ class ProfileService:
         """
         return await self.profile_repo.get_for_source(source_id, limit=limit)
+    async def compare_profiles(
+        self,
+        source_id: str,
+        profile_id_1: str | None = None,
+        profile_id_2: str | None = None,
+    ) -> dict[str, Any]:
+        """Compare two profiles for the same source.
+        Useful for detecting schema evolution and data drift over time.
+        Args:
+            source_id: Source ID.
+            profile_id_1: First profile ID (None = second-latest).
+            profile_id_2: Second profile ID (None = latest).
+        Returns:
+            Comparison result with changes and drift indicators.
+        Raises:
+            ValueError: If not enough profiles exist.
+        """
+        profiles = await self.profile_repo.get_for_source(source_id, limit=10)
+        if len(profiles) < 2:
+            raise ValueError(
+                f"Need at least 2 profiles to compare. Source '{source_id}' has {len(profiles)}."
+            )
+        # Get profiles to compare
+        if profile_id_2 is None:
+            profile_2 = profiles[0]  # Latest
+        else:
+            profile_2 = await self.profile_repo.get_by_id(profile_id_2)
+            if profile_2 is None:
+                raise ValueError(f"Profile '{profile_id_2}' not found")
+        if profile_id_1 is None:
+            profile_1 = profiles[1]  # Second-latest
+        else:
+            profile_1 = await self.profile_repo.get_by_id(profile_id_1)
+            if profile_1 is None:
+                raise ValueError(f"Profile '{profile_id_1}' not found")
+        # Compare profiles
+        return self._compare_profile_data(
+            profile_1.profile_json,
+            profile_2.profile_json,
+            profile_1_id=str(profile_1.id),
+            profile_2_id=str(profile_2.id),
+        )
+    def _compare_profile_data(
+        self,
+        profile_1: dict[str, Any],
+        profile_2: dict[str, Any],
+        profile_1_id: str,
+        profile_2_id: str,
+    ) -> dict[str, Any]:
+        """Compare two profile data dictionaries.
+        Args:
+            profile_1: Older profile data.
+            profile_2: Newer profile data.
+            profile_1_id: Older profile ID.
+            profile_2_id: Newer profile ID.
+        Returns:
+            Comparison result.
+        """
+        changes = []
+        column_diffs = []
+        # Extract column data
+        cols_1 = {c["name"]: c for c in profile_1.get("columns", [])}
+        cols_2 = {c["name"]: c for c in profile_2.get("columns", [])}
+        # Detect added/removed columns
+        added_cols = set(cols_2.keys()) - set(cols_1.keys())
+        removed_cols = set(cols_1.keys()) - set(cols_2.keys())
+        common_cols = set(cols_1.keys()) & set(cols_2.keys())
+        for col in added_cols:
+            changes.append({
+                "type": "column_added",
+                "column": col,
+                "details": cols_2[col],
+            })
+        for col in removed_cols:
+            changes.append({
+                "type": "column_removed",
+                "column": col,
+                "details": cols_1[col],
+            })
+        # Compare common columns
+        for col in common_cols:
+            col_1 = cols_1[col]
+            col_2 = cols_2[col]
+            col_changes = []
+            # Type change
+            if col_1.get("inferred_type") != col_2.get("inferred_type"):
+                col_changes.append({
+                    "field": "inferred_type",
+                    "old": col_1.get("inferred_type"),
+                    "new": col_2.get("inferred_type"),
+                })
+            # Null ratio change
+            old_null = col_1.get("null_ratio", 0)
+            new_null = col_2.get("null_ratio", 0)
+            if abs(old_null - new_null) > 0.05:  # 5% threshold
+                col_changes.append({
+                    "field": "null_ratio",
+                    "old": old_null,
+                    "new": new_null,
+                    "change": new_null - old_null,
+                })
+            # Unique ratio change
+            old_unique = col_1.get("unique_ratio", 0)
+            new_unique = col_2.get("unique_ratio", 0)
+            if abs(old_unique - new_unique) > 0.1:  # 10% threshold
+                col_changes.append({
+                    "field": "unique_ratio",
+                    "old": old_unique,
+                    "new": new_unique,
+                    "change": new_unique - old_unique,
+                })
+            if col_changes:
+                column_diffs.append({
+                    "column": col,
+                    "changes": col_changes,
+                })
+        return {
+            "profile_1_id": profile_1_id,
+            "profile_2_id": profile_2_id,
+            "row_count_change": profile_2.get("row_count", 0) - profile_1.get("row_count", 0),
+            "column_count_change": profile_2.get("column_count", 0) - profile_1.get("column_count", 0),
+            "added_columns": list(added_cols),
+            "removed_columns": list(removed_cols),
+            "schema_changes": changes,
+            "column_diffs": column_diffs,
+            "has_breaking_changes": len(removed_cols) > 0 or any(
+                c.get("field") == "inferred_type" for cd in column_diffs for c in cd.get("changes", [])
+            ),
+        }
 class HistoryService:
     """Service for validation history and analytics.
@@ -1399,6 +1891,7 @@ class DriftService:
     """Service for drift detection.
     Handles drift comparison between datasets.
+    Supports all data source types through DataSource abstraction.
     """
     def __init__(self, session: AsyncSession) -> None:
@@ -1420,12 +1913,14 @@ class DriftService:
         columns: list[str] | None = None,
         method: str = "auto",
         threshold: float | None = None,
-        correction: str | None = None,
         sample_size: int | None = None,
         save: bool = True,
     ) -> DriftComparison:
         """Compare two datasets for drift detection.
+        Supports comparing data from various source types including files,
+        SQL databases, cloud data warehouses, and async sources.
         Args:
             baseline_source_id: Baseline source ID.
             current_source_id: Current source ID.
@@ -1433,7 +1928,6 @@ class DriftService:
             method: Detection method. Supported:
                 auto, ks, psi, chi2, js, kl, wasserstein, cvm, anderson
             threshold: Optional custom threshold.
-            correction: Multiple testing correction (none, bonferroni, holm, bh).
             sample_size: Optional sample size.
             save: Whether to save comparison to database.
@@ -1441,7 +1935,7 @@ class DriftService:
             DriftComparison model with results.
         Raises:
-            ValueError: If source not found.
+            ValueError: If source not found or data source creation fails.
         """
         baseline = await self.source_repo.get_by_id(baseline_source_id)
         if baseline is None:
@@ -1451,13 +1945,23 @@ class DriftService:
         if current is None:
             raise ValueError(f"Current source '{current_source_id}' not found")
+        # Get data inputs based on source types
+        if SourceType.is_async_type(baseline.type):
+            baseline_input = await get_async_data_input_from_source(baseline)
+        else:
+            baseline_input = get_data_input_from_source(baseline)
+        if SourceType.is_async_type(current.type):
+            current_input = await get_async_data_input_from_source(current)
+        else:
+            current_input = get_data_input_from_source(current)
         result = await self.adapter.compare(
-            baseline.source_path or "",
-            current.source_path or "",
+            baseline_input,
+            current_input,
             columns=columns,
             method=method,
             threshold=threshold,
-            correction=correction,
             sample_size=sample_size,
         )
@@ -1465,7 +1969,6 @@ class DriftService:
             "columns": columns,
             "method": method,
             "threshold": threshold,
-            "correction": correction,
             "sample_size": sample_size,
         }
@@ -1552,6 +2055,8 @@ class ScheduleService:
         *,
         name: str,
         cron_expression: str,
+        trigger_type: str = "cron",
+        trigger_config: dict[str, Any] | None = None,
         notify_on_failure: bool = True,
         config: dict[str, Any] | None = None,
     ) -> Schedule:
@@ -1581,6 +2086,8 @@ class ScheduleService:
             name=name,
             source_id=source_id,
             cron_expression=cron_expression,
+            trigger_type=trigger_type,
+            trigger_config=trigger_config,
             is_active=True,
             notify_on_failure=notify_on_failure,
             next_run_at=next_run,
@@ -1782,6 +2289,7 @@ class PIIScanService:
     """Service for PII scanning operations.
     Handles PII detection and regulation compliance checking using th.scan().
+    Supports all data source types through DataSource abstraction.
     """
     def __init__(self, session: AsyncSession) -> None:
@@ -1795,31 +2303,23 @@ class PIIScanService:
         self.scan_repo = PIIScanRepository(session)
         self.adapter = get_adapter()
-    async def run_scan(
-        self,
-        source_id: str,
-        *,
-        columns: list[str] | None = None,
-        regulations: list[str] | None = None,
-        min_confidence: float = 0.8,
-    ) -> PIIScan:
+    async def run_scan(self, source_id: str) -> PIIScan:
         """Run PII scan on a source.
-        This method provides access to truthound's th.scan() parameters,
-        allowing detection of personally identifiable information and
-        checking compliance with privacy regulations.
+        Note: truthound's th.scan() does not support configuration parameters.
+        The scan runs on all columns with default settings.
+        Supports all data source types including files, SQL databases,
+        cloud data warehouses, and async sources.
         Args:
             source_id: Source ID to scan.
-            columns: Optional columns to scan. If None, scans all columns.
-            regulations: Optional regulations to check (gdpr, ccpa, lgpd).
-            min_confidence: Minimum confidence threshold (0.0-1.0). Default 0.8.
         Returns:
             PIIScan record with results.
         Raises:
-            ValueError: If source not found.
+            ValueError: If source not found or data source creation fails.
         """
         # Get source
         source = await self.source_repo.get_by_id(source_id)
@@ -1830,19 +2330,18 @@ class PIIScanService:
         scan = await self.scan_repo.create(
             source_id=source_id,
             status="running",
-            min_confidence=min_confidence,
-            regulations_checked=regulations,
             started_at=datetime.utcnow(),
         )
         try:
-            # Run scan
-            result = await self.adapter.scan(
-                source.source_path or "",
-                columns=columns,
-                regulations=regulations,
-                min_confidence=min_confidence,
-            )
+            # Get data input based on source type
+            if SourceType.is_async_type(source.type):
+                data_input = await get_async_data_input_from_source(source)
+            else:
+                data_input = get_data_input_from_source(source)
+            # Run scan - truthound's th.scan() does not support parameters
+            result = await self.adapter.scan(data_input)
             # Update scan with results
             await self._update_scan_success(scan, result)
@@ -1972,6 +2471,8 @@ class MaskService:
     - redact: Replace values with asterisks
     - hash: Replace values with SHA256 hash (deterministic)
     - fake: Replace values with realistic fake data
+    Supports all data source types through DataSource abstraction.
     """
     def __init__(self, session: AsyncSession) -> None:
@@ -1991,18 +2492,22 @@ class MaskService:
         *,
         columns: list[str] | None = None,
         strategy: str = "redact",
-        output_format: str = "csv",
     ) -> DataMask:
         """Run data masking on a source.
         This method provides access to truthound's th.mask() with
         three masking strategies for PII protection.
+        Supports all data source types including files, SQL databases,
+        cloud data warehouses, and async sources.
+        Note: output_format parameter was removed as truthound's th.mask()
+        does not support this parameter. Output is always CSV format.
         Args:
             source_id: Source ID to mask.
             columns: Optional columns to mask. If None, auto-detects PII.
             strategy: Masking strategy (redact, hash, fake). Default is redact.
-            output_format: Output file format (csv, parquet, json). Default is csv.
         Returns:
             DataMask record with results.
@@ -2010,6 +2515,9 @@ class MaskService:
         Raises:
             ValueError: If source not found or invalid strategy.
         """
+        from pathlib import Path
+        import tempfile
         # Validate strategy
         if strategy not in ("redact", "hash", "fake"):
             raise ValueError(
@@ -2022,14 +2530,19 @@ class MaskService:
             raise ValueError(f"Source '{source_id}' not found")
         # Determine output path
-        source_path = source.source_path or ""
-        import os
-        from pathlib import Path
+        # For file sources, use the same directory structure
+        # For other sources, use a temp directory or configured output directory
+        if SourceType.is_file_type(source.type):
+            source_path = source.source_path or source.config.get("path", "")
+            base_path = Path(source_path)
+            output_dir = base_path.parent / "masked"
+        else:
+            # For non-file sources, use a temp directory
+            output_dir = Path(tempfile.gettempdir()) / "truthound_masked"
-        base_path = Path(source_path)
-        output_dir = base_path.parent / "masked"
         output_dir.mkdir(exist_ok=True)
-        output_filename = f"{base_path.stem}_masked_{strategy}.{output_format}"
+        # Output format is always CSV as truthound's th.mask() does not support format selection
+        output_filename = f"{source.name}_masked_{strategy}.csv"
         output_path = str(output_dir / output_filename)
         # Create mask record
@@ -2042,9 +2555,15 @@ class MaskService:
         )
         try:
+            # Get data input based on source type
+            if SourceType.is_async_type(source.type):
+                data_input = await get_async_data_input_from_source(source)
+            else:
+                data_input = get_data_input_from_source(source)
             # Run masking
             result = await self.adapter.mask(
-                source_path,
+                data_input,
                 output_path,
                 columns=columns,
                 strategy=strategy,

truthound-dashboard 1.4.4__py3-none-any.whl → 1.5.0__py3-none-any.whl

truthound-dashboard 1.4.4py3-none-any.whl → 1.5.0py3-none-any.whl