truthound-dashboard 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff shows the content of publicly released versions of the package as published to their respective public registries. It is provided for informational purposes only.
- truthound_dashboard/api/deps.py +28 -0
- truthound_dashboard/api/drift.py +1 -0
- truthound_dashboard/api/mask.py +164 -0
- truthound_dashboard/api/profile.py +11 -3
- truthound_dashboard/api/router.py +22 -0
- truthound_dashboard/api/scan.py +168 -0
- truthound_dashboard/api/schemas.py +13 -4
- truthound_dashboard/api/validations.py +33 -1
- truthound_dashboard/api/validators.py +85 -0
- truthound_dashboard/core/__init__.py +8 -0
- truthound_dashboard/core/phase5/activity.py +1 -1
- truthound_dashboard/core/services.py +457 -7
- truthound_dashboard/core/truthound_adapter.py +441 -26
- truthound_dashboard/db/__init__.py +6 -0
- truthound_dashboard/db/models.py +250 -1
- truthound_dashboard/schemas/__init__.py +52 -1
- truthound_dashboard/schemas/collaboration.py +1 -1
- truthound_dashboard/schemas/drift.py +118 -3
- truthound_dashboard/schemas/mask.py +209 -0
- truthound_dashboard/schemas/profile.py +45 -2
- truthound_dashboard/schemas/scan.py +312 -0
- truthound_dashboard/schemas/schema.py +30 -2
- truthound_dashboard/schemas/validation.py +60 -3
- truthound_dashboard/schemas/validators/__init__.py +59 -0
- truthound_dashboard/schemas/validators/aggregate_validators.py +238 -0
- truthound_dashboard/schemas/validators/anomaly_validators.py +723 -0
- truthound_dashboard/schemas/validators/base.py +263 -0
- truthound_dashboard/schemas/validators/completeness_validators.py +269 -0
- truthound_dashboard/schemas/validators/cross_table_validators.py +375 -0
- truthound_dashboard/schemas/validators/datetime_validators.py +253 -0
- truthound_dashboard/schemas/validators/distribution_validators.py +422 -0
- truthound_dashboard/schemas/validators/drift_validators.py +615 -0
- truthound_dashboard/schemas/validators/geospatial_validators.py +486 -0
- truthound_dashboard/schemas/validators/multi_column_validators.py +706 -0
- truthound_dashboard/schemas/validators/privacy_validators.py +531 -0
- truthound_dashboard/schemas/validators/query_validators.py +510 -0
- truthound_dashboard/schemas/validators/registry.py +318 -0
- truthound_dashboard/schemas/validators/schema_validators.py +408 -0
- truthound_dashboard/schemas/validators/string_validators.py +396 -0
- truthound_dashboard/schemas/validators/table_validators.py +412 -0
- truthound_dashboard/schemas/validators/uniqueness_validators.py +355 -0
- truthound_dashboard/schemas/validators.py +59 -0
- truthound_dashboard/static/assets/{index-BqXVFyqj.js → index-BCA8H1hO.js} +95 -95
- truthound_dashboard/static/assets/index-BNsSQ2fN.css +1 -0
- truthound_dashboard/static/assets/unmerged_dictionaries-CsJWCRx9.js +1 -0
- truthound_dashboard/static/index.html +2 -2
- {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.0.dist-info}/METADATA +46 -11
- {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.0.dist-info}/RECORD +51 -27
- truthound_dashboard/static/assets/index-o8qHVDte.css +0 -1
- truthound_dashboard/static/assets/unmerged_dictionaries-n_T3wZTf.js +0 -1
- {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.0.dist-info}/WHEEL +0 -0
- {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.0.dist-info}/entry_points.txt +0 -0
- {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.0.dist-info}/licenses/LICENSE +0 -0
truthound_dashboard/db/models.py
CHANGED
@@ -598,6 +598,253 @@ class DriftComparison(Base, UUIDMixin, TimestampMixin):
         return []


+class MaskingStrategy(str, Enum):
+    """Masking strategy enum."""
+
+    REDACT = "redact"
+    HASH = "hash"
+    FAKE = "fake"
+
+
+class DataMask(Base, UUIDMixin):
+    """Data masking operation model.
+
+    Stores results from th.mask() data masking operations.
+    Supports three strategies: redact (asterisks), hash (SHA256), fake (realistic data).
+
+    Attributes:
+        id: Unique identifier (UUID).
+        source_id: Reference to parent Source.
+        status: Current status (pending, running, success, failed, error).
+        strategy: Masking strategy used (redact, hash, fake).
+        output_path: Path to the masked output file.
+        columns_masked: List of columns that were masked.
+        row_count: Number of rows processed.
+        column_count: Number of columns in the data.
+        auto_detected: Whether PII columns were auto-detected.
+        result_json: Full mask result as JSON.
+        duration_ms: Operation duration in milliseconds.
+    """
+
+    __tablename__ = "data_masks"
+
+    # Composite index for efficient history queries (source + time ordering)
+    __table_args__ = (
+        Index("idx_data_masks_source_created", "source_id", "created_at"),
+    )
+
+    source_id: Mapped[str] = mapped_column(
+        String(36),
+        ForeignKey("sources.id", ondelete="CASCADE"),
+        nullable=False,
+        index=True,
+    )
+
+    # Status tracking
+    status: Mapped[str] = mapped_column(
+        String(20),
+        nullable=False,
+        default="pending",
+        index=True,
+    )
+
+    # Masking configuration
+    strategy: Mapped[str] = mapped_column(
+        String(20),
+        nullable=False,
+        default=MaskingStrategy.REDACT.value,
+        index=True,
+    )
+    output_path: Mapped[str | None] = mapped_column(Text, nullable=True)
+    columns_masked: Mapped[list[str] | None] = mapped_column(JSON, nullable=True)
+    auto_detected: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)
+
+    # Data statistics
+    row_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
+    column_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
+
+    # Full result and timing
+    result_json: Mapped[dict[str, Any] | None] = mapped_column(JSON, nullable=True)
+    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
+    duration_ms: Mapped[int | None] = mapped_column(Integer, nullable=True)
+
+    # Timestamps
+    created_at: Mapped[datetime] = mapped_column(
+        DateTime, default=datetime.utcnow, nullable=False
+    )
+    started_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
+    completed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
+
+    # Relationships
+    source: Mapped[Source] = relationship(
+        "Source",
+        backref="data_masks",
+    )
+
+    @property
+    def is_complete(self) -> bool:
+        """Check if masking operation has completed."""
+        return self.status in ("success", "failed", "error")
+
+    @property
+    def masked_column_count(self) -> int:
+        """Get number of columns that were masked."""
+        return len(self.columns_masked) if self.columns_masked else 0
+
+    def mark_started(self) -> None:
+        """Mark operation as started."""
+        self.status = "running"
+        self.started_at = datetime.utcnow()
+
+    def mark_completed(
+        self,
+        result: dict[str, Any],
+    ) -> None:
+        """Mark operation as completed with results."""
+        self.status = "success"
+        self.result_json = result
+        self.completed_at = datetime.utcnow()
+
+        if self.started_at:
+            delta = self.completed_at - self.started_at
+            self.duration_ms = int(delta.total_seconds() * 1000)
+
+    def mark_error(self, message: str) -> None:
+        """Mark operation as errored."""
+        self.status = "error"
+        self.error_message = message
+        self.completed_at = datetime.utcnow()
+
+        if self.started_at:
+            delta = self.completed_at - self.started_at
+            self.duration_ms = int(delta.total_seconds() * 1000)
+
+
+class PIIScan(Base, UUIDMixin):
+    """PII scan result model.
+
+    Stores results from th.scan() PII detection runs.
+
+    Attributes:
+        id: Unique identifier (UUID).
+        source_id: Reference to parent Source.
+        status: Current status (pending, running, success, failed, error).
+        total_columns_scanned: Total columns that were scanned.
+        columns_with_pii: Number of columns containing PII.
+        total_findings: Total number of PII findings.
+        has_violations: Whether any regulation violations were found.
+        total_violations: Number of regulation violations.
+        min_confidence: Confidence threshold used for this scan.
+        regulations_checked: List of regulations checked.
+        result_json: Full scan result as JSON.
+        duration_ms: Scan duration in milliseconds.
+    """
+
+    __tablename__ = "pii_scans"
+
+    # Composite index for efficient history queries (source + time ordering)
+    __table_args__ = (
+        Index("idx_pii_scans_source_created", "source_id", "created_at"),
+    )
+
+    source_id: Mapped[str] = mapped_column(
+        String(36),
+        ForeignKey("sources.id", ondelete="CASCADE"),
+        nullable=False,
+        index=True,
+    )
+
+    # Status tracking
+    status: Mapped[str] = mapped_column(
+        String(20),
+        nullable=False,
+        default="pending",
+        index=True,
+    )
+
+    # Scan summary
+    total_columns_scanned: Mapped[int | None] = mapped_column(Integer, nullable=True)
+    columns_with_pii: Mapped[int | None] = mapped_column(Integer, nullable=True)
+    total_findings: Mapped[int | None] = mapped_column(Integer, nullable=True)
+    has_violations: Mapped[bool | None] = mapped_column(Boolean, nullable=True)
+    total_violations: Mapped[int | None] = mapped_column(Integer, nullable=True)
+
+    # Data statistics
+    row_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
+    column_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
+
+    # Configuration used
+    min_confidence: Mapped[float | None] = mapped_column(Float, nullable=True)
+    regulations_checked: Mapped[list[str] | None] = mapped_column(JSON, nullable=True)
+
+    # Full result and timing
+    result_json: Mapped[dict[str, Any] | None] = mapped_column(JSON, nullable=True)
+    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
+    duration_ms: Mapped[int | None] = mapped_column(Integer, nullable=True)
+
+    # Timestamps
+    created_at: Mapped[datetime] = mapped_column(
+        DateTime, default=datetime.utcnow, nullable=False
+    )
+    started_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
+    completed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
+
+    # Relationships
+    source: Mapped[Source] = relationship(
+        "Source",
+        backref="pii_scans",
+    )
+
+    @property
+    def findings(self) -> list[dict[str, Any]]:
+        """Get list of PII findings from result JSON."""
+        if self.result_json and "findings" in self.result_json:
+            return self.result_json["findings"]
+        return []
+
+    @property
+    def violations(self) -> list[dict[str, Any]]:
+        """Get list of regulation violations from result JSON."""
+        if self.result_json and "violations" in self.result_json:
+            return self.result_json["violations"]
+        return []
+
+    @property
+    def is_complete(self) -> bool:
+        """Check if scan has completed (success, failed, or error)."""
+        return self.status in ("success", "failed", "error")
+
+    def mark_started(self) -> None:
+        """Mark scan as started."""
+        self.status = "running"
+        self.started_at = datetime.utcnow()
+
+    def mark_completed(
+        self,
+        has_violations: bool,
+        result: dict[str, Any],
+    ) -> None:
+        """Mark scan as completed with results."""
+        self.status = "success" if not has_violations else "failed"
+        self.has_violations = has_violations
+        self.result_json = result
+        self.completed_at = datetime.utcnow()
+
+        if self.started_at:
+            delta = self.completed_at - self.started_at
+            self.duration_ms = int(delta.total_seconds() * 1000)
+
+    def mark_error(self, message: str) -> None:
+        """Mark scan as errored."""
+        self.status = "error"
+        self.error_message = message
+        self.completed_at = datetime.utcnow()
+
+        if self.started_at:
+            delta = self.completed_at - self.started_at
+            self.duration_ms = int(delta.total_seconds() * 1000)
+
+
 class AppSettings(Base):
     """Application settings model.

@@ -1411,7 +1658,9 @@ class Activity(Base, UUIDMixin):
     action: Mapped[str] = mapped_column(String(30), nullable=False, index=True)
     actor_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
     description: Mapped[str | None] = mapped_column(Text, nullable=True)
-
+    activity_metadata: Mapped[dict[str, Any] | None] = mapped_column(
+        "metadata", JSON, nullable=True
+    )
     created_at: Mapped[datetime] = mapped_column(
         DateTime,
         default=datetime.utcnow,
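Note (annotation, not part of the diff): the new DataMask and PIIScan models share the same status lifecycle (mark_started / mark_completed / mark_error, with duration_ms derived from the timestamps). Below is a minimal sketch of how a caller might drive that lifecycle; the open SQLAlchemy session and the result dict (e.g. produced via the truthound adapter) are assumptions for illustration, not code from this package.

# Sketch only: exercises the lifecycle helpers shown in the diff above.
# `session` (a SQLAlchemy Session) and `result` are assumed inputs.
from typing import Any

from truthound_dashboard.db.models import DataMask, MaskingStrategy


def record_mask_run(session, source_id: str, result: dict[str, Any] | None,
                    error: str | None = None) -> DataMask:
    mask = DataMask(source_id=source_id, strategy=MaskingStrategy.HASH.value)
    session.add(mask)
    mask.mark_started()                      # status="running", started_at set
    if error is None and result is not None:
        mask.mark_completed(result=result)   # status="success", result_json stored, duration_ms computed
    else:
        mask.mark_error(error or "unknown error")
    session.commit()
    return mask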
truthound_dashboard/schemas/__init__.py
CHANGED

@@ -20,12 +20,18 @@ from .base import (
 )
 from .drift import (
     ColumnDriftResult,
+    CorrectionMethod,
+    CorrectionMethodLiteral,
+    DEFAULT_THRESHOLDS,
     DriftCompareRequest,
     DriftComparisonListItem,
     DriftComparisonListResponse,
     DriftComparisonResponse,
+    DriftMethod,
+    DriftMethodLiteral,
     DriftResult,
     DriftSourceSummary,
+    get_default_threshold,
 )
 from .history import (
     FailureFrequencyItem,
@@ -35,7 +41,7 @@ from .history import (
     RecentValidation,
     TrendDataPoint,
 )
-from .profile import ColumnProfile, ProfileResponse
+from .profile import ColumnProfile, ProfileRequest, ProfileResponse
 from .rule import (
     RuleBase,
     RuleCreate,
@@ -114,6 +120,26 @@ from .collaboration import (
     CommentUpdate,
     ResourceType,
 )
+from .mask import (
+    MaskingStrategy,
+    MaskingStrategyLiteral,
+    MaskListItem,
+    MaskListResponse,
+    MaskRequest,
+    MaskResponse,
+    MaskStatus,
+    MaskSummary,
+)
+from .scan import (
+    PIIFinding,
+    PIIScanListItem,
+    PIIScanListResponse,
+    PIIScanRequest,
+    PIIScanResponse,
+    PIIScanSummary,
+    Regulation,
+    RegulationViolation,
+)
 from .schema import (
     ColumnSchema,
     SchemaLearnRequest,
@@ -182,8 +208,27 @@ __all__ = [
     "SchemaResponse",
     "SchemaSummary",
     # Profile
+    "ProfileRequest",
     "ColumnProfile",
     "ProfileResponse",
+    # Data Masking
+    "MaskingStrategy",
+    "MaskingStrategyLiteral",
+    "MaskStatus",
+    "MaskRequest",
+    "MaskSummary",
+    "MaskResponse",
+    "MaskListItem",
+    "MaskListResponse",
+    # PII Scan
+    "Regulation",
+    "PIIScanRequest",
+    "PIIFinding",
+    "RegulationViolation",
+    "PIIScanSummary",
+    "PIIScanResponse",
+    "PIIScanListItem",
+    "PIIScanListResponse",
     # History
     "TrendDataPoint",
     "FailureFrequencyItem",
@@ -192,6 +237,12 @@ __all__ = [
     "HistoryResponse",
     "HistoryQueryParams",
     # Drift
+    "DriftMethod",
+    "DriftMethodLiteral",
+    "CorrectionMethod",
+    "CorrectionMethodLiteral",
+    "DEFAULT_THRESHOLDS",
+    "get_default_threshold",
     "DriftCompareRequest",
     "ColumnDriftResult",
     "DriftResult",
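Note (annotation, not part of the diff): with these re-exports, the new drift helpers and the mask/scan schemas become importable directly from truthound_dashboard.schemas. A small sketch, assuming version 1.3.0 is installed:

# Sketch: the names below are exactly those added to __all__ in the diff above.
from truthound_dashboard.schemas import (
    DEFAULT_THRESHOLDS,
    DriftMethod,
    MaskRequest,        # defined in truthound_dashboard/schemas/mask.py
    PIIScanRequest,     # defined in truthound_dashboard/schemas/scan.py
    get_default_threshold,
)

# Per-method defaults come straight from DEFAULT_THRESHOLDS in schemas/drift.py.
assert get_default_threshold(DriftMethod.PSI) == DEFAULT_THRESHOLDS[DriftMethod.PSI]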
truthound_dashboard/schemas/collaboration.py
CHANGED

@@ -142,7 +142,7 @@ class ActivityResponse(BaseSchema, IDMixin):
             action=ActivityAction(activity.action),
             actor_id=activity.actor_id,
             description=activity.description,
-            metadata=activity.
+            metadata=activity.activity_metadata,
             created_at=activity.created_at,
         )

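Note (annotation, not part of the diff): the attribute is named activity_metadata because SQLAlchemy reserves metadata on declarative classes, while mapped_column("metadata", ...) keeps the database column name unchanged; ActivityResponse simply reads the renamed attribute. A standalone sketch of the same pattern (not this package's actual Base):

# Standalone sketch: mapping a Python attribute onto a column literally named "metadata".
from typing import Any

from sqlalchemy import JSON
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column


class Base(DeclarativeBase):
    pass


class Activity(Base):
    __tablename__ = "activities"

    id: Mapped[int] = mapped_column(primary_key=True)
    # "metadata" clashes with Declarative's reserved Base.metadata attribute,
    # so the attribute is renamed; the first positional argument keeps the
    # database column name as "metadata".
    activity_metadata: Mapped[dict[str, Any] | None] = mapped_column(
        "metadata", JSON, nullable=True
    )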
truthound_dashboard/schemas/drift.py
CHANGED

@@ -1,10 +1,26 @@
 """Drift detection schemas.

 Schemas for drift comparison request/response.
+
+Drift Methods (from truthound):
+- ks: Kolmogorov-Smirnov test (continuous distributions)
+- psi: Population Stability Index (any distribution, industry standard)
+- chi2: Chi-Square test (categorical data)
+- js: Jensen-Shannon divergence (probability distributions)
+- kl: Kullback-Leibler divergence (distribution difference)
+- wasserstein: Wasserstein/Earth Mover's Distance (distribution transport)
+- cvm: Cramér-von Mises test (more sensitive to tails than KS)
+- anderson: Anderson-Darling test (weighted for tail sensitivity)
+
+Multiple Testing Correction:
+- bonferroni: Conservative, independent tests
+- holm: Sequential adjustment, less conservative
+- bh: Benjamini-Hochberg (FDR control, default for multiple columns)
 """

 from __future__ import annotations

+from enum import Enum
 from typing import Any, Literal

 from pydantic import BaseModel, Field
@@ -12,6 +28,87 @@ from pydantic import BaseModel, Field
 from .base import IDMixin, TimestampMixin


+class DriftMethod(str, Enum):
+    """Drift detection methods supported by truthound.
+
+    Each method has different characteristics and use cases:
+    - auto: Smart selection based on data type (numeric → PSI, categorical → chi2)
+    - ks: Kolmogorov-Smirnov test - best for continuous distributions
+    - psi: Population Stability Index - industry standard, any distribution
+    - chi2: Chi-Square test - best for categorical data
+    - js: Jensen-Shannon divergence - symmetric, bounded (0-1)
+    - kl: Kullback-Leibler divergence - information loss measure
+    - wasserstein: Earth Mover's Distance - metric, meaningful for non-overlapping
+    - cvm: Cramér-von Mises - more sensitive to tail differences than KS
+    - anderson: Anderson-Darling - weighted for tail sensitivity
+    """
+
+    AUTO = "auto"
+    KS = "ks"
+    PSI = "psi"
+    CHI2 = "chi2"
+    JS = "js"
+    KL = "kl"
+    WASSERSTEIN = "wasserstein"
+    CVM = "cvm"
+    ANDERSON = "anderson"
+
+
+class CorrectionMethod(str, Enum):
+    """Multiple testing correction methods.
+
+    When comparing multiple columns, correction adjusts p-values to control
+    false discovery rate:
+    - none: No correction (use with caution)
+    - bonferroni: Conservative, suitable for independent tests
+    - holm: Sequential adjustment, less conservative than Bonferroni
+    - bh: Benjamini-Hochberg (FDR control), default for multiple columns
+    """
+
+    NONE = "none"
+    BONFERRONI = "bonferroni"
+    HOLM = "holm"
+    BH = "bh"
+
+
+# Default thresholds for each detection method
+DEFAULT_THRESHOLDS: dict[DriftMethod, float] = {
+    DriftMethod.AUTO: 0.05,
+    DriftMethod.KS: 0.05,
+    DriftMethod.PSI: 0.1,
+    DriftMethod.CHI2: 0.05,
+    DriftMethod.JS: 0.1,
+    DriftMethod.KL: 0.1,
+    DriftMethod.WASSERSTEIN: 0.1,  # Scale-dependent, adjust based on data
+    DriftMethod.CVM: 0.05,
+    DriftMethod.ANDERSON: 0.05,
+}
+
+
+def get_default_threshold(method: DriftMethod | str) -> float:
+    """Get default threshold for a drift detection method.
+
+    Args:
+        method: Drift detection method
+
+    Returns:
+        Default threshold value for the method
+    """
+    if isinstance(method, str):
+        try:
+            method = DriftMethod(method)
+        except ValueError:
+            return 0.05  # Fallback default
+    return DEFAULT_THRESHOLDS.get(method, 0.05)
+
+
+# Type alias for method values (for Literal type hints)
+DriftMethodLiteral = Literal[
+    "auto", "ks", "psi", "chi2", "js", "kl", "wasserstein", "cvm", "anderson"
+]
+CorrectionMethodLiteral = Literal["none", "bonferroni", "holm", "bh"]
+
+
 class DriftCompareRequest(BaseModel):
     """Request body for drift comparison."""

@@ -20,10 +117,28 @@ class DriftCompareRequest(BaseModel):
     columns: list[str] | None = Field(
         None, description="Columns to compare (None = all)"
     )
-    method:
-        "auto",
+    method: DriftMethodLiteral = Field(
+        "auto",
+        description=(
+            "Drift detection method: "
+            "auto (smart selection), ks (Kolmogorov-Smirnov), psi (Population Stability Index), "
+            "chi2 (Chi-Square), js (Jensen-Shannon), kl (Kullback-Leibler), "
+            "wasserstein (Earth Mover's), cvm (Cramér-von Mises), anderson (Anderson-Darling)"
+        ),
+    )
+    threshold: float | None = Field(
+        None,
+        ge=0,
+        le=1,
+        description="Custom threshold (default varies by method: KS/chi2/cvm/anderson=0.05, PSI/JS/KL/wasserstein=0.1)",
+    )
+    correction: CorrectionMethodLiteral | None = Field(
+        None,
+        description=(
+            "Multiple testing correction: none, bonferroni (conservative), "
+            "holm (sequential), bh (Benjamini-Hochberg FDR, default for multiple columns)"
+        ),
     )
-    threshold: float | None = Field(None, ge=0, le=1, description="Custom threshold")
     sample_size: int | None = Field(
         None, ge=100, description="Sample size for large datasets"
    )
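Note (annotation, not part of the diff): get_default_threshold accepts either a DriftMethod member or its string value and falls back to 0.05 for unknown strings, which is how a caller can resolve the per-method default when DriftCompareRequest.threshold is left as None. A short sketch, assuming version 1.3.0 is installed:

# Sketch: default-threshold resolution exactly as defined in schemas/drift.py above.
from truthound_dashboard.schemas.drift import CorrectionMethod, DriftMethod, get_default_threshold

assert get_default_threshold(DriftMethod.KS) == 0.05    # p-value based tests default to 0.05
assert get_default_threshold("psi") == 0.1              # string values are coerced to DriftMethod
assert get_default_threshold("not-a-method") == 0.05    # unknown values use the fallback
assert CorrectionMethod.BH.value == "bh"                # FDR correction, default for multiple columns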