PyPI - validmind - Versions diffs - 2.9.1__py3-none-any.whl → 2.9.3__py3-none-any.whl - Mend

validmind 2.9.1__py3-none-any.whl → 2.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

validmind/vm_models/result/pii_filter.py ADDED Viewed

@@ -0,0 +1,202 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+"""
+PII filtering utilities using Microsoft Presidio for detecting and masking
+personally identifiable information in test result data.
+"""
+import os
+from enum import Enum
+from typing import Dict
+import pandas as pd
+from ...logging import get_logger
+logger = get_logger(__name__)
+class PIIDetectionMode(Enum):
+    """Modes controlling where PII detection is applied.
+    Each member's value is the lowercase string that `get_pii_detection_mode`
+    accepts from the ``VALIDMIND_PII_DETECTION`` environment variable.
+    """
+    # `scan_text` and `scan_df` return True immediately without scanning.
+    DISABLED = "disabled"
+    # Enables the table/description scan performed when a test result is logged.
+    TEST_RESULTS = "test_results"
+    # NOTE(review): presumably limits scanning to generated test descriptions;
+    # no consumer of this mode is visible in this module - confirm.
+    TEST_DESCRIPTIONS = "test_descriptions"
+    # Also enables the scan performed when a test result is logged.
+    ALL = "all"
+# Presidio entity types that `scan_text` asks the text analyzer to look for.
+DEFAULT_ENTITIES = [
+    "PERSON",
+    "EMAIL_ADDRESS",
+    "PHONE_NUMBER",
+    "CREDIT_CARD",
+    "US_SSN",
+    "US_DRIVER_LICENSE",
+    "IP_ADDRESS",
+    "LOCATION",
+    "DATE_TIME",
+    "US_PASSPORT",
+    "US_BANK_NUMBER",
+    "IBAN_CODE",
+]
+# Confidence threshold: `scan_text` ignores findings scoring below it, and
+# `scan_df` passes it to the structured analysis as `mixed_strategy_threshold`.
+DEFAULT_THRESHOLD = 0.5
+# Maximum number of leading rows of a DataFrame that `scan_df` analyzes.
+SAMPLE_SIZE = 100
+def get_pii_detection_mode() -> PIIDetectionMode:
+    """
+    Resolve the effective PII detection mode from the environment.
+    The mode is read from ``VALIDMIND_PII_DETECTION`` (case-insensitive,
+    defaulting to "disabled" when unset).
+    Returns:
+        PIIDetectionMode.DISABLED when the variable is unset, set to
+        "disabled", holds an unrecognized value (a warning is logged), or
+        names an active mode while Presidio is unavailable (a warning is
+        logged). Otherwise the requested mode.
+    """
+    requested = os.getenv("VALIDMIND_PII_DETECTION", "disabled").lower()
+    try:
+        selected = PIIDetectionMode(requested)
+    except ValueError:
+        valid_options = ", ".join([member.value for member in PIIDetectionMode])
+        logger.warning(
+            f"Invalid PII detection mode '{requested}'. "
+            f"Valid options: {valid_options}. "
+            f"Defaulting to 'disabled'."
+        )
+        return PIIDetectionMode.DISABLED
+    # Only probe for Presidio when scanning was actually requested.
+    if selected == PIIDetectionMode.DISABLED or _is_presidio_available():
+        return selected
+    logger.warning(
+        f"PII detection mode '{selected.value}' requested but Presidio not available. "
+        "Falling back to 'disabled' mode. Install with: pip install validmind[pii-detection]"
+    )
+    return PIIDetectionMode.DISABLED
+def _is_presidio_available() -> bool:
+    """
+    Check whether at least one Presidio component can be created.
+    Each getter imports its Presidio package lazily, so a missing package
+    surfaces as ImportError. That is treated as "not available" rather than
+    propagated, which lets callers fall back to the disabled mode.
+    Returns:
+        True if the text analyzer or the DataFrame analysis builder could be
+        created, False if neither package is importable.
+    """
+    for factory in (_get_presidio_text, _get_presidio_df):
+        try:
+            if factory() is not None:
+                return True
+        except ImportError:
+            # This component is not installed; try the next one.
+            continue
+    return False
+def _get_presidio_text():
+    """
+    Create a Presidio ``AnalyzerEngine`` for free-text analysis.
+    The import is done lazily inside the function, so ImportError is raised
+    here when ``presidio_analyzer`` is not installed.
+    """
+    from presidio_analyzer import AnalyzerEngine as engine_cls
+    text_analyzer = engine_cls()
+    return text_analyzer
+def _get_presidio_df():
+    """
+    Create a Presidio Structured ``PandasAnalysisBuilder`` for DataFrames.
+    The import is done lazily inside the function, so ImportError is raised
+    here when ``presidio_structured`` is not installed.
+    """
+    from presidio_structured import PandasAnalysisBuilder as builder_cls
+    analysis_builder = builder_cls()
+    return analysis_builder
+def scan_text(text: str) -> bool:
+    """
+    Scan text for PII content. Raises ValueError if PII is found.
+    Args:
+        text: The text to scan for PII. Empty or None text is treated as clean.
+    Returns:
+        True if no PII is found, the text is empty, or detection is disabled
+    Raises:
+        ValueError: If PII is detected. The message lists the detected entity
+            types in sorted order and never includes the matched text itself.
+    """
+    # Nothing to analyze; returning early also avoids building the analyzer.
+    if not text:
+        return True
+    if get_pii_detection_mode() == PIIDetectionMode.DISABLED:
+        return True
+    analyzer = _get_presidio_text()
+    findings = analyzer.analyze(text=text, entities=DEFAULT_ENTITIES, language="en")
+    # Keep only the entity types of confident findings. The matched substrings
+    # are deliberately not collected so the PII is not copied around, and the
+    # types are sorted so the error message is deterministic.
+    entity_types = sorted(
+        {
+            finding.entity_type
+            for finding in findings
+            if finding.score >= DEFAULT_THRESHOLD
+        }
+    )
+    if entity_types:
+        raise ValueError(
+            f"PII detected in text content. Entity types found: {', '.join(entity_types)}."
+        )
+    return True
+def scan_df(df: pd.DataFrame) -> bool:
+    """
+    Scan a pandas DataFrame for PII content. Raises ValueError if PII is found.
+    Only text-like columns (object or pandas string dtype) are considered, and
+    at most the first SAMPLE_SIZE rows are analyzed.
+    Args:
+        df: The DataFrame to scan
+    Returns:
+        True if no PII is found, there is nothing to scan, or detection is
+        disabled
+    Raises:
+        ValueError: If PII is detected
+    """
+    if get_pii_detection_mode() == PIIDetectionMode.DISABLED:
+        return True
+    # Use the pandas dtype helpers rather than comparing against "object":
+    # columns stored with the dedicated string dtype hold text too. Iterating
+    # `df.dtypes` also works when column labels are duplicated.
+    text_columns = [
+        column
+        for column, dtype in df.dtypes.items()
+        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
+    ]
+    if not text_columns:
+        return True
+    # Limit the number of rows to scan for performance
+    sample_df = df.head(SAMPLE_SIZE)
+    if sample_df.empty:
+        # No rows means no values that could contain PII.
+        return True
+    builder = _get_presidio_df()
+    tabular_analysis = builder.generate_analysis(
+        sample_df,
+        selection_strategy="mixed",
+        mixed_strategy_threshold=DEFAULT_THRESHOLD,
+    )
+    entity_mapping: Dict[str, str] = (
+        getattr(tabular_analysis, "entity_mapping", None) or {}
+    )
+    # Pair each flagged text column with the entity type reported for it.
+    flagged = [
+        (column, entity_mapping[column])
+        for column in text_columns
+        if entity_mapping.get(column)
+    ]
+    if flagged:
+        # str() keeps the message buildable for non-string column labels.
+        column_names = ", ".join(str(column) for column, _ in flagged)
+        entity_names = ", ".join(str(entity) for _, entity in flagged)
+        raise ValueError(
+            f"PII detected in DataFrame columns: {column_names}. "
+            f"Entity types found: {entity_names}."
+        )
+    return True

validmind/vm_models/result/result.py CHANGED Viewed

@@ -31,10 +31,10 @@ from ...utils import (
 )
 from ..figure import Figure, create_figure
 from ..input import VMInput
+from .pii_filter import PIIDetectionMode, get_pii_detection_mode, scan_df, scan_text
 from .utils import (
     AI_REVISION_NAME,
     DEFAULT_REVISION_NAME,
-    check_for_sensitive_data,
     figures_to_widgets,
     get_result_template,
     tables_to_widgets,
@@ -222,8 +222,10 @@ class TestResult(Result):
             description = super().__getattribute__("description")
             if isinstance(description, DescriptionFuture):
-                self._was_description_generated = True
-                self.description = description.get_description()
+                (
+                    self.description,
+                    self._was_description_generated,
+                ) = description.get_description()
         return super().__getattribute__(name)
@@ -465,8 +467,10 @@ class TestResult(Result):
             )
         )
-        if self.metric is not None:
-            # metrics are logged as separate entities
+        # Only log unit metrics when the metric is a scalar value.
+        # Some tests may assign a list/array of per-row metrics to `self.metric`.
+        # Those should not be sent to the unit-metric endpoint which expects scalars.
+        if self.metric is not None and not hasattr(self.metric, "__len__"):
             tasks.append(
                 api_client.alog_metric(
                     key=self.result_id,
@@ -521,7 +525,7 @@ class TestResult(Result):
         return await asyncio.gather(*tasks)
-    def log(
+    def log(  # noqa: C901
         self,
         section_id: str = None,
         content_id: str = None,
@@ -552,9 +556,15 @@ class TestResult(Result):
         self.check_result_id_exist()
-        if not unsafe:
+        if not unsafe and get_pii_detection_mode() in [
+            PIIDetectionMode.TEST_RESULTS,
+            PIIDetectionMode.ALL,
+        ]:
             for table in self.tables or []:
-                check_for_sensitive_data(table.data, self._get_flat_inputs())
+                scan_df(table.data)
+            if self.description:
+                scan_text(self.description)
         if section_id:
             self._validate_section_id_for_block(section_id, position)
@@ -701,6 +711,22 @@ class TextGenerationResult(Result):
             position (int): The position (index) within the section to insert the test
                 result.
         """
+        # Check description text for PII when available
+        if self.description:
+            try:
+                from .pii_filter import check_text_for_pii
+                check_text_for_pii(self.description, raise_on_detection=True)
+            except ImportError:
+                logger.debug(
+                    "PII detection not available - skipping PII check for description"
+                )
+            except ValueError:
+                # Re-raise PII detection errors
+                raise
+            except Exception as e:
+                logger.warning(f"PII detection failed for description: {e}")
         run_async(
             self.log_async,
             content_id=content_id,

validmind/vm_models/result/utils.py CHANGED Viewed

@@ -5,15 +5,12 @@
 import os
 from typing import TYPE_CHECKING, Dict, List, Union
-import pandas as pd
 from ipywidgets import HTML, GridBox, Layout
 from jinja2 import Template
 from ... import api_client
 from ...logging import get_logger
-from ..dataset import VMDataset
 from ..figure import Figure
-from ..input import VMInput
 if TYPE_CHECKING:
     from .result import ResultTable
@@ -52,30 +49,6 @@ async def update_metadata(content_id: str, text: str, _json: Union[Dict, List] =
     await api_client.alog_metadata(content_id, text, _json)
-def check_for_sensitive_data(data: pd.DataFrame, inputs: List[VMInput]):
-    """Check if the data contains sensitive information from input datasets."""
-    dataset_columns = {
-        col: len(input_obj.df)
-        for input_obj in inputs
-        if isinstance(input_obj, VMDataset)
-        for col in input_obj.columns
-    }
-    table_columns = {col: len(data) for col in data.columns}
-    offending_columns = [
-        col
-        for col in table_columns
-        if col in dataset_columns and table_columns[col] == dataset_columns[col]
-    ]
-    if offending_columns:
-        raise ValueError(
-            f"Raw input data found in table, pass `unsafe=True` "
-            f"or remove the offending columns: {offending_columns}"
-        )
 def tables_to_widgets(tables: List["ResultTable"]):
     """Convert a list of tables to ipywidgets."""
     widgets = [

validmind 2.9.1__py3-none-any.whl → 2.9.3__py3-none-any.whl

validmind 2.9.1__py3-none-any.whl → 2.9.3__py3-none-any.whl