truthound-dashboard 1.2.1-py3-none-any.whl → 1.3.1-py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
Files changed (54)
  1. truthound_dashboard/api/deps.py +28 -0
  2. truthound_dashboard/api/drift.py +1 -0
  3. truthound_dashboard/api/mask.py +164 -0
  4. truthound_dashboard/api/profile.py +11 -3
  5. truthound_dashboard/api/router.py +22 -0
  6. truthound_dashboard/api/scan.py +168 -0
  7. truthound_dashboard/api/schemas.py +13 -4
  8. truthound_dashboard/api/validations.py +33 -1
  9. truthound_dashboard/api/validators.py +85 -0
  10. truthound_dashboard/core/__init__.py +8 -0
  11. truthound_dashboard/core/phase5/activity.py +1 -1
  12. truthound_dashboard/core/services.py +457 -7
  13. truthound_dashboard/core/truthound_adapter.py +441 -26
  14. truthound_dashboard/db/__init__.py +6 -0
  15. truthound_dashboard/db/models.py +250 -1
  16. truthound_dashboard/schemas/__init__.py +52 -1
  17. truthound_dashboard/schemas/collaboration.py +1 -1
  18. truthound_dashboard/schemas/drift.py +118 -3
  19. truthound_dashboard/schemas/mask.py +209 -0
  20. truthound_dashboard/schemas/profile.py +45 -2
  21. truthound_dashboard/schemas/scan.py +312 -0
  22. truthound_dashboard/schemas/schema.py +30 -2
  23. truthound_dashboard/schemas/validation.py +60 -3
  24. truthound_dashboard/schemas/validators/__init__.py +59 -0
  25. truthound_dashboard/schemas/validators/aggregate_validators.py +238 -0
  26. truthound_dashboard/schemas/validators/anomaly_validators.py +723 -0
  27. truthound_dashboard/schemas/validators/base.py +263 -0
  28. truthound_dashboard/schemas/validators/completeness_validators.py +269 -0
  29. truthound_dashboard/schemas/validators/cross_table_validators.py +375 -0
  30. truthound_dashboard/schemas/validators/datetime_validators.py +253 -0
  31. truthound_dashboard/schemas/validators/distribution_validators.py +422 -0
  32. truthound_dashboard/schemas/validators/drift_validators.py +615 -0
  33. truthound_dashboard/schemas/validators/geospatial_validators.py +486 -0
  34. truthound_dashboard/schemas/validators/multi_column_validators.py +706 -0
  35. truthound_dashboard/schemas/validators/privacy_validators.py +531 -0
  36. truthound_dashboard/schemas/validators/query_validators.py +510 -0
  37. truthound_dashboard/schemas/validators/registry.py +318 -0
  38. truthound_dashboard/schemas/validators/schema_validators.py +408 -0
  39. truthound_dashboard/schemas/validators/string_validators.py +396 -0
  40. truthound_dashboard/schemas/validators/table_validators.py +412 -0
  41. truthound_dashboard/schemas/validators/uniqueness_validators.py +355 -0
  42. truthound_dashboard/schemas/validators.py +59 -0
  43. truthound_dashboard/static/assets/index-BZG20KuF.js +586 -0
  44. truthound_dashboard/static/assets/index-D_HyZ3pb.css +1 -0
  45. truthound_dashboard/static/assets/unmerged_dictionaries-CtpqQBm0.js +1 -0
  46. truthound_dashboard/static/index.html +2 -2
  47. {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.1.dist-info}/METADATA +50 -11
  48. {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.1.dist-info}/RECORD +51 -27
  49. truthound_dashboard/static/assets/index-BqXVFyqj.js +0 -574
  50. truthound_dashboard/static/assets/index-o8qHVDte.css +0 -1
  51. truthound_dashboard/static/assets/unmerged_dictionaries-n_T3wZTf.js +0 -1
  52. {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.1.dist-info}/WHEEL +0 -0
  53. {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.1.dist-info}/entry_points.txt +0 -0
  54. {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.1.dist-info}/licenses/LICENSE +0 -0
truthound_dashboard/schemas/mask.py (new file)
@@ -0,0 +1,209 @@
+"""Pydantic schemas for data masking (th.mask) operations.
+
+Provides schemas for masking requests, responses, and history.
+Supports three masking strategies: redact, hash, fake.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime
+from enum import Enum
+from typing import Literal
+
+from pydantic import Field
+
+from .base import BaseSchema, IDMixin, TimestampMixin
+
+
+class MaskingStrategy(str, Enum):
+    """Masking strategy options.
+
+    - redact: Replace values with asterisks (e.g., "john@example.com" -> "****")
+    - hash: Replace values with SHA256 hash (deterministic, can be used for joins)
+    - fake: Replace values with realistic fake data (e.g., "john@example.com" -> "alice@test.org")
+    """
+
+    REDACT = "redact"
+    HASH = "hash"
+    FAKE = "fake"
+
+
+MaskingStrategyLiteral = Literal["redact", "hash", "fake"]
+
+
+class MaskStatus(str, Enum):
+    """Status of a masking operation."""
+
+    PENDING = "pending"
+    RUNNING = "running"
+    SUCCESS = "success"
+    FAILED = "failed"
+    ERROR = "error"
+
+
+class MaskRequest(BaseSchema):
+    """Request body for running a masking operation.
+
+    Attributes:
+        columns: Optional list of columns to mask. If None, auto-detects PII.
+        strategy: Masking strategy to use. Defaults to "redact".
+        output_format: Output file format. Defaults to "csv".
+    """
+
+    columns: list[str] | None = Field(
+        default=None,
+        description="Columns to mask. If not specified, auto-detects PII columns.",
+    )
+    strategy: MaskingStrategyLiteral = Field(
+        default="redact",
+        description="Masking strategy: 'redact' (asterisks), 'hash' (SHA256), 'fake' (realistic data)",
+    )
+    output_format: Literal["csv", "parquet", "json"] = Field(
+        default="csv",
+        description="Output file format",
+    )
+
+
+class MaskSummary(BaseSchema):
+    """Summary of a masking operation.
+
+    Attributes:
+        source_id: ID of the source that was masked.
+        source_name: Name of the source.
+        status: Current status of the operation.
+        strategy: Masking strategy used.
+        columns_masked: Number of columns that were masked.
+        row_count: Number of rows processed.
+        duration_ms: Operation duration in milliseconds.
+    """
+
+    source_id: str
+    source_name: str | None = None
+    status: str
+    strategy: str
+    columns_masked: int
+    row_count: int | None = None
+    duration_ms: int | None = None
+
+
+class MaskResponse(BaseSchema, IDMixin, TimestampMixin):
+    """Response for a masking operation.
+
+    Attributes:
+        id: Unique identifier for the masking operation.
+        source_id: ID of the source that was masked.
+        status: Current status (pending, running, success, failed, error).
+        strategy: Masking strategy used.
+        output_path: Path to the masked output file.
+        columns_masked: List of columns that were masked.
+        auto_detected: Whether PII columns were auto-detected.
+        row_count: Number of rows processed.
+        column_count: Total number of columns.
+        duration_ms: Operation duration in milliseconds.
+        error_message: Error message if operation failed.
+        started_at: When the operation started.
+        completed_at: When the operation completed.
+    """
+
+    source_id: str
+    status: str
+    strategy: str
+    output_path: str | None = None
+    columns_masked: list[str] | None = None
+    auto_detected: bool = False
+    row_count: int | None = None
+    column_count: int | None = None
+    duration_ms: int | None = None
+    error_message: str | None = None
+    started_at: datetime | None = None
+    completed_at: datetime | None = None
+
+    @classmethod
+    def from_db(cls, db_mask: object) -> MaskResponse:
+        """Create response from database model.
+
+        Args:
+            db_mask: DataMask database model instance.
+
+        Returns:
+            MaskResponse instance.
+        """
+        return cls(
+            id=db_mask.id,
+            source_id=db_mask.source_id,
+            status=db_mask.status,
+            strategy=db_mask.strategy,
+            output_path=db_mask.output_path,
+            columns_masked=db_mask.columns_masked,
+            auto_detected=db_mask.auto_detected,
+            row_count=db_mask.row_count,
+            column_count=db_mask.column_count,
+            duration_ms=db_mask.duration_ms,
+            error_message=db_mask.error_message,
+            started_at=db_mask.started_at,
+            completed_at=db_mask.completed_at,
+            created_at=db_mask.created_at,
+            updated_at=getattr(db_mask, "updated_at", None),
+        )
+
+
+class MaskListItem(BaseSchema, IDMixin):
+    """List item for masking operations.
+
+    Attributes:
+        id: Unique identifier.
+        source_id: ID of the source.
+        source_name: Name of the source.
+        status: Current status.
+        strategy: Masking strategy used.
+        columns_masked: Number of columns masked.
+        row_count: Number of rows processed.
+        duration_ms: Operation duration in milliseconds.
+        created_at: When the operation was created.
+    """
+
+    source_id: str
+    source_name: str | None = None
+    status: str
+    strategy: str
+    columns_masked: int = 0
+    row_count: int | None = None
+    duration_ms: int | None = None
+    created_at: datetime
+
+    @classmethod
+    def from_db(cls, db_mask: object, source_name: str | None = None) -> MaskListItem:
+        """Create list item from database model.
+
+        Args:
+            db_mask: DataMask database model instance.
+            source_name: Optional source name.
+
+        Returns:
+            MaskListItem instance.
+        """
+        return cls(
+            id=db_mask.id,
+            source_id=db_mask.source_id,
+            source_name=source_name or getattr(db_mask.source, "name", None),
+            status=db_mask.status,
+            strategy=db_mask.strategy,
+            columns_masked=len(db_mask.columns_masked) if db_mask.columns_masked else 0,
+            row_count=db_mask.row_count,
+            duration_ms=db_mask.duration_ms,
+            created_at=db_mask.created_at,
+        )
+
+
+class MaskListResponse(BaseSchema):
+    """Response for listing masking operations.
+
+    Attributes:
+        data: List of masking operation items.
+        total: Total number of items.
+        limit: Maximum items per page.
+    """
+
+    data: list[MaskListItem]
+    total: int
+    limit: int = 20
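
To make the request shape concrete, here is a minimal usage sketch of the new MaskRequest schema. It assumes the 1.3.1 wheel is installed and that BaseSchema behaves like a standard Pydantic model; the column names are hypothetical.

from truthound_dashboard.schemas.mask import MaskRequest

# Hash masking is deterministic (per the MaskingStrategy docstring), so hashed
# columns stay joinable across datasets; "redact" and "fake" do not.
req = MaskRequest(columns=["email", "ssn"], strategy="hash", output_format="parquet")
print(req.columns, req.strategy, req.output_format)
# ['email', 'ssn'] hash parquet

# Omitting columns requests auto-detection of PII columns on the server side.
auto = MaskRequest(strategy="fake")
print(auto.columns, auto.strategy, auto.output_format)
# None fake csv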
truthound_dashboard/schemas/profile.py
@@ -12,6 +12,22 @@ from pydantic import Field
 from .base import BaseSchema
 
 
+class ProfileRequest(BaseSchema):
+    """Request schema for data profiling.
+
+    Provides optional configuration for profiling operations.
+    All fields are optional with sensible defaults.
+    """
+
+    sample_size: int | None = Field(
+        default=None,
+        ge=1,
+        description="Maximum number of rows to sample for profiling. "
+        "If None, profiles all data. Useful for large datasets.",
+        examples=[10000, 50000, 100000],
+    )
+
+
 class ColumnProfile(BaseSchema):
     """Profile information for a single column."""
 
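
A short sketch of the new ProfileRequest, assuming the installed wheel and Pydantic-style validation from BaseSchema; the sample sizes are illustrative.

from truthound_dashboard.schemas.profile import ProfileRequest

# No sample_size profiles the full dataset; a positive value caps the sample.
full = ProfileRequest()
sampled = ProfileRequest(sample_size=50_000)
print(full.sample_size, sampled.sample_size)
# None 50000

# Values below the ge=1 bound fail validation, e.g. ProfileRequest(sample_size=0).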
truthound_dashboard/schemas/profile.py (continued)
@@ -60,14 +76,41 @@ class ProfileResponse(BaseSchema):
 
     @classmethod
     def from_result(cls, result: Any) -> ProfileResponse:
-        """Create response from adapter result.
+        """Create response from adapter result or Profile model.
 
         Args:
-            result: ProfileResult from adapter.
+            result: ProfileResult from adapter or Profile model.
 
         Returns:
             ProfileResponse instance.
         """
+        # Handle Profile model (from database)
+        if hasattr(result, "profile_json"):
+            profile_json = result.profile_json
+            source_name = profile_json.get("source", result.source_id)
+            columns_data = profile_json.get("columns", [])
+            columns = [
+                ColumnProfile(
+                    name=col["name"],
+                    dtype=col["dtype"],
+                    null_pct=col.get("null_pct", "0%"),
+                    unique_pct=col.get("unique_pct", "0%"),
+                    min=col.get("min"),
+                    max=col.get("max"),
+                    mean=col.get("mean"),
+                    std=col.get("std"),
+                )
+                for col in columns_data
+            ]
+            return cls(
+                source=source_name,
+                row_count=result.row_count or 0,
+                column_count=result.column_count or 0,
+                size_bytes=result.size_bytes or 0,
+                columns=columns,
+            )
+
+        # Handle ProfileResult (from adapter)
         columns = [
             ColumnProfile(
                 name=col["name"],
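
The updated from_result() dispatches on the presence of a profile_json attribute: database Profile rows take the new branch, while adapter ProfileResult objects fall through to the existing path. A sketch with a hypothetical stub object (all IDs and values invented) illustrates the database branch, assuming the installed wheel:

from types import SimpleNamespace

from truthound_dashboard.schemas.profile import ProfileResponse

# Stub carrying only the attributes the database branch reads.
db_profile = SimpleNamespace(
    source_id="src-123",  # hypothetical
    row_count=1000,
    column_count=2,
    size_bytes=20_480,
    profile_json={
        "source": "customers.csv",  # hypothetical
        "columns": [
            {"name": "email", "dtype": "str", "null_pct": "0%", "unique_pct": "100%"},
            {"name": "age", "dtype": "int", "null_pct": "2%", "unique_pct": "8%",
             "min": 18, "max": 92, "mean": 41.3, "std": 12.7},
        ],
    },
)

# hasattr(db_profile, "profile_json") is True, so the database branch runs.
resp = ProfileResponse.from_result(db_profile)
print(resp.source, resp.row_count, len(resp.columns))
# customers.csv 1000 2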
truthound_dashboard/schemas/scan.py (new file)
@@ -0,0 +1,312 @@
+"""PII scan-related Pydantic schemas.
+
+This module defines schemas for PII scan API operations using th.scan().
+
+The scan functionality detects personally identifiable information (PII)
+in datasets and checks compliance with privacy regulations (GDPR, CCPA, LGPD).
+"""
+
+from __future__ import annotations
+
+from datetime import datetime
+from enum import Enum
+from typing import Any, Literal
+
+from pydantic import Field
+
+from .base import BaseSchema, IDMixin, ListResponseWrapper
+
+
+class Regulation(str, Enum):
+    """Supported privacy regulations for compliance checking."""
+
+    GDPR = "gdpr"
+    CCPA = "ccpa"
+    LGPD = "lgpd"
+
+
+# Type alias for regulation literal
+RegulationLiteral = Literal["gdpr", "ccpa", "lgpd"]
+
+# PII type categories commonly detected
+PII_TYPES = [
+    "email",
+    "phone",
+    "ssn",
+    "credit_card",
+    "ip_address",
+    "date_of_birth",
+    "address",
+    "name",
+    "passport",
+    "driver_license",
+    "national_id",
+    "bank_account",
+    "medical_record",
+    "biometric",
+]
+
+
+class PIIScanRequest(BaseSchema):
+    """Request to run PII scan on a data source.
+
+    This schema maps to truthound's th.scan() parameters for maximum flexibility.
+    All optional parameters default to None to use truthound's defaults.
+    """
+
+    # Column filtering
+    columns: list[str] | None = Field(
+        default=None,
+        description="Columns to scan. If None, all columns are scanned.",
+        examples=[["email", "phone", "ssn"]],
+    )
+
+    # Regulation compliance checking
+    regulations: list[RegulationLiteral] | None = Field(
+        default=None,
+        description="Privacy regulations to check compliance: gdpr, ccpa, lgpd",
+        examples=[["gdpr", "ccpa"]],
+    )
+
+    # Confidence threshold
+    min_confidence: float = Field(
+        default=0.8,
+        ge=0.0,
+        le=1.0,
+        description="Minimum confidence threshold for PII detection (0.0-1.0)",
+        examples=[0.8, 0.9],
+    )
+
+
+class PIIFinding(BaseSchema):
+    """Single PII finding detected in a column.
+
+    Represents one type of PII detected within a specific column,
+    including confidence score and sample information.
+    """
+
+    column: str = Field(..., description="Column where PII was detected")
+    pii_type: str = Field(
+        ...,
+        description="Type of PII detected",
+        examples=["email", "ssn", "phone", "credit_card"],
+    )
+    confidence: float = Field(
+        ...,
+        ge=0.0,
+        le=1.0,
+        description="Confidence score for this detection (0.0-1.0)",
+    )
+    sample_count: int = Field(
+        ...,
+        ge=0,
+        description="Number of values matching this PII type",
+    )
+    sample_values: list[str] | None = Field(
+        default=None,
+        description="Sample values that matched (redacted for privacy)",
+    )
+
+
+class RegulationViolation(BaseSchema):
+    """Regulation compliance violation.
+
+    Represents a violation of a specific privacy regulation
+    detected in the scanned data.
+    """
+
+    regulation: RegulationLiteral = Field(
+        ...,
+        description="Violated regulation",
+    )
+    column: str = Field(
+        ...,
+        description="Column with violation",
+    )
+    pii_type: str = Field(
+        ...,
+        description="Type of PII causing the violation",
+    )
+    message: str = Field(
+        ...,
+        description="Human-readable violation description",
+    )
+    severity: Literal["low", "medium", "high", "critical"] = Field(
+        default="high",
+        description="Severity level of the violation",
+    )
+
+
+class PIIScanSummary(BaseSchema):
+    """Summary statistics for a PII scan run."""
+
+    total_columns_scanned: int = Field(
+        default=0,
+        ge=0,
+        description="Total number of columns scanned",
+    )
+    columns_with_pii: int = Field(
+        default=0,
+        ge=0,
+        description="Number of columns containing PII",
+    )
+    total_findings: int = Field(
+        default=0,
+        ge=0,
+        description="Total number of PII findings",
+    )
+    has_violations: bool = Field(
+        default=False,
+        description="Whether any regulation violations were found",
+    )
+    total_violations: int = Field(
+        default=0,
+        ge=0,
+        description="Total number of regulation violations",
+    )
+
+
+class PIIScanResponse(IDMixin, PIIScanSummary):
+    """Full PII scan response with all details."""
+
+    source_id: str = Field(..., description="Source that was scanned")
+    status: Literal["pending", "running", "success", "failed", "error"] = Field(
+        ...,
+        description="Current scan status",
+    )
+
+    # Data statistics
+    row_count: int | None = Field(default=None, description="Number of rows scanned")
+    column_count: int | None = Field(default=None, description="Number of columns")
+
+    # Scan configuration used
+    min_confidence: float = Field(
+        default=0.8,
+        description="Confidence threshold used for this scan",
+    )
+    regulations_checked: list[str] | None = Field(
+        default=None,
+        description="Regulations that were checked",
+    )
+
+    # Findings (full details)
+    findings: list[PIIFinding] = Field(
+        default_factory=list,
+        description="List of PII findings",
+    )
+
+    # Regulation violations
+    violations: list[RegulationViolation] = Field(
+        default_factory=list,
+        description="List of regulation violations",
+    )
+
+    # Error info (if status is 'error')
+    error_message: str | None = Field(
+        default=None,
+        description="Error message if scan failed",
+    )
+
+    # Timing
+    duration_ms: int | None = Field(
+        default=None,
+        ge=0,
+        description="Scan duration in milliseconds",
+    )
+    started_at: datetime | None = Field(default=None, description="Start timestamp")
+    completed_at: datetime | None = Field(
+        default=None,
+        description="Completion timestamp",
+    )
+    created_at: datetime = Field(..., description="Record creation timestamp")
+
+    @classmethod
+    def from_model(cls, scan: Any) -> PIIScanResponse:
+        """Create response from model.
+
+        Args:
+            scan: PIIScan model instance.
+
+        Returns:
+            PIIScanResponse instance.
+        """
+        findings = []
+        if scan.result_json and "findings" in scan.result_json:
+            findings = [
+                PIIFinding(**finding) for finding in scan.result_json["findings"]
+            ]
+
+        violations = []
+        if scan.result_json and "violations" in scan.result_json:
+            violations = [
+                RegulationViolation(**violation)
+                for violation in scan.result_json["violations"]
+            ]
+
+        return cls(
+            id=scan.id,
+            source_id=scan.source_id,
+            status=scan.status,
+            total_columns_scanned=scan.total_columns_scanned or 0,
+            columns_with_pii=scan.columns_with_pii or 0,
+            total_findings=scan.total_findings or 0,
+            has_violations=scan.has_violations or False,
+            total_violations=scan.total_violations or 0,
+            row_count=scan.row_count,
+            column_count=scan.column_count,
+            min_confidence=scan.min_confidence or 0.8,
+            regulations_checked=scan.regulations_checked,
+            findings=findings,
+            violations=violations,
+            error_message=scan.error_message,
+            duration_ms=scan.duration_ms,
+            started_at=scan.started_at,
+            completed_at=scan.completed_at,
+            created_at=scan.created_at,
+        )
+
+
+class PIIScanListItem(IDMixin, PIIScanSummary):
+    """PII scan list item (without full findings/violations)."""
+
+    source_id: str
+    status: Literal["pending", "running", "success", "failed", "error"]
+    row_count: int | None = None
+    column_count: int | None = None
+    min_confidence: float = 0.8
+    regulations_checked: list[str] | None = None
+    duration_ms: int | None = None
+    created_at: datetime
+
+    @classmethod
+    def from_model(cls, scan: Any) -> PIIScanListItem:
+        """Create list item from model.
+
+        Args:
+            scan: PIIScan model instance.
+
+        Returns:
+            PIIScanListItem instance.
+        """
+        return cls(
+            id=scan.id,
+            source_id=scan.source_id,
+            status=scan.status,
+            total_columns_scanned=scan.total_columns_scanned or 0,
+            columns_with_pii=scan.columns_with_pii or 0,
+            total_findings=scan.total_findings or 0,
+            has_violations=scan.has_violations or False,
+            total_violations=scan.total_violations or 0,
+            row_count=scan.row_count,
+            column_count=scan.column_count,
+            min_confidence=scan.min_confidence or 0.8,
+            regulations_checked=scan.regulations_checked,
+            duration_ms=scan.duration_ms,
+            created_at=scan.created_at,
+        )
+
+
+class PIIScanListResponse(ListResponseWrapper[PIIScanListItem]):
+    """Paginated PII scan list response."""
+
+    pass
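
For reference, a minimal sketch of building a PIIScanRequest, assuming the installed wheel; the column list and thresholds are illustrative.

from truthound_dashboard.schemas.scan import PIIScanRequest

# Scan only the likely-PII columns, check GDPR and CCPA, and require 90% confidence.
req = PIIScanRequest(
    columns=["email", "phone", "ssn"],
    regulations=["gdpr", "ccpa"],
    min_confidence=0.9,
)
print(req.columns, req.regulations, req.min_confidence)
# ['email', 'phone', 'ssn'] ['gdpr', 'ccpa'] 0.9

# The ge/le bounds reject out-of-range values, e.g. PIIScanRequest(min_confidence=1.5).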
truthound_dashboard/schemas/schema.py
@@ -40,11 +40,39 @@ class ColumnSchema(BaseSchema):
 
 
 class SchemaLearnRequest(BaseSchema):
-    """Request to learn schema from source."""
+    """Request to learn schema from source.
+
+    Maps to truthound's th.learn() parameters for schema inference.
+
+    Attributes:
+        infer_constraints: If True, infers min/max, allowed values from data.
+        categorical_threshold: Maximum unique values for categorical detection.
+            Columns with unique values <= this threshold are treated as categorical.
+        sample_size: Number of rows to sample for large datasets.
+            If None, uses all rows. Useful for performance with large files.
+    """
 
     infer_constraints: bool = Field(
         default=True,
-        description="Infer constraints from data statistics",
+        description="Infer constraints (min/max, allowed values) from data statistics",
+    )
+    categorical_threshold: int | None = Field(
+        default=None,
+        ge=1,
+        le=1000,
+        description=(
+            "Maximum unique values for categorical detection. "
+            "Columns with unique values <= threshold are treated as categorical. "
+            "If None, uses truthound default (20)."
+        ),
+    )
+    sample_size: int | None = Field(
+        default=None,
+        ge=100,
+        description=(
+            "Number of rows to sample for schema learning. "
+            "If None, uses all rows. Useful for large datasets."
+        ),
     )
 
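
Finally, a sketch of the extended SchemaLearnRequest, assuming the installed wheel; the threshold and sample size are illustrative values within the declared bounds.

from truthound_dashboard.schemas.schema import SchemaLearnRequest

# Learn a schema from a 100k-row sample; columns with <= 50 distinct values are
# treated as categorical, and min/max/allowed-value constraints are still inferred.
req = SchemaLearnRequest(categorical_threshold=50, sample_size=100_000)
print(req.infer_constraints, req.categorical_threshold, req.sample_size)
# True 50 100000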