PyPI - validmind - Versions diffs - 2.8.28__py3-none-any.whl → 2.9.1__py3-none-any.whl - Mend

validmind 2.8.28__py3-none-any.whl → 2.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (233) hide show

validmind/tests/ongoing_monitoring/ClassImbalanceDrift.py CHANGED Viewed

@@ -2,7 +2,7 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-from typing import List
+from typing import Dict, List, Tuple
 import pandas as pd
 import plotly.graph_objs as go
@@ -18,7 +18,7 @@ def ClassImbalanceDrift(
     datasets: List[VMDataset],
     drift_pct_threshold: float = 5.0,
     title: str = "Class Distribution Drift",
-):
+) -> Tuple[go.Figure, Dict[str, pd.DataFrame], bool]:
     """
     Evaluates drift in class distribution between reference and monitoring datasets.

validmind/tests/ongoing_monitoring/ClassificationAccuracyDrift.py CHANGED Viewed

@@ -2,7 +2,7 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-from typing import List
+from typing import Dict, List, Tuple
 import numpy as np
 import pandas as pd
@@ -18,7 +18,7 @@ from validmind.vm_models import VMDataset, VMModel
 @tasks("classification", "text_classification")
 def ClassificationAccuracyDrift(
     datasets: List[VMDataset], model: VMModel, drift_pct_threshold=20
-):
+) -> Tuple[Dict[str, pd.DataFrame], bool, RawData]:
     """
     Compares classification accuracy metrics between reference and monitoring datasets.

validmind/tests/ongoing_monitoring/ConfusionMatrixDrift.py CHANGED Viewed

@@ -2,7 +2,7 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-from typing import List
+from typing import Dict, List, Tuple
 import numpy as np
 import pandas as pd
@@ -18,7 +18,7 @@ from validmind.vm_models import VMDataset, VMModel
 @tasks("classification", "text_classification")
 def ConfusionMatrixDrift(
     datasets: List[VMDataset], model: VMModel, drift_pct_threshold=20
-):
+) -> Tuple[Dict[str, pd.DataFrame], bool, RawData]:
     """
     Compares confusion matrix metrics between reference and monitoring datasets.

validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py CHANGED Viewed

@@ -2,7 +2,7 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-from typing import List
+from typing import List, Tuple
 import numpy as np
 import plotly.graph_objects as go
@@ -17,7 +17,7 @@ from validmind.vm_models import VMDataset, VMModel
 def CumulativePredictionProbabilitiesDrift(
     datasets: List[VMDataset],
     model: VMModel,
-):
+) -> Tuple[go.Figure, RawData]:
     """
     Compares cumulative prediction probability distributions between reference and monitoring datasets.

validmind/tests/ongoing_monitoring/FeatureDrift.py CHANGED Viewed

@@ -2,11 +2,14 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+from typing import Dict, List, Tuple
 import numpy as np
 import pandas as pd
 import plotly.graph_objects as go
 from validmind import RawData, tags, tasks
+from validmind.vm_models import VMDataset
 def calculate_psi_score(actual, expected):
@@ -92,11 +95,11 @@ def create_distribution_plot(feature_name, reference_dist, monitoring_dist, bins
 @tags("visualization")
 @tasks("monitoring")
 def FeatureDrift(
-    datasets,
+    datasets: List[VMDataset],
     bins=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
     feature_columns=None,
     psi_threshold=0.2,
-):
+) -> Tuple[Dict[str, pd.DataFrame], go.Figure, bool, RawData]:
     """
     Evaluates changes in feature distribution over time to identify potential model drift.

validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py CHANGED Viewed

@@ -3,14 +3,19 @@
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+from typing import List, Tuple
 import matplotlib.pyplot as plt
 from validmind import RawData, tags, tasks
+from validmind.vm_models import VMDataset, VMModel
 @tags("visualization")
 @tasks("monitoring")
-def PredictionAcrossEachFeature(datasets, model):
+def PredictionAcrossEachFeature(
+    datasets: List[VMDataset], model: VMModel
+) -> Tuple[plt.Figure, RawData]:
     """
     Assesses differences in model predictions across individual features between reference and monitoring datasets
     through visual analysis.

validmind/tests/ongoing_monitoring/PredictionCorrelation.py CHANGED Viewed

@@ -2,15 +2,22 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+from typing import Dict, List, Tuple
 import pandas as pd
 import plotly.graph_objects as go
 from validmind import RawData, tags, tasks
+from validmind.vm_models import VMDataset, VMModel
 @tags("visualization")
 @tasks("monitoring")
-def PredictionCorrelation(datasets, model, drift_pct_threshold=20):
+def PredictionCorrelation(
+    datasets: List[VMDataset],
+    model: VMModel,
+    drift_pct_threshold: float = 20,
+) -> Tuple[Dict[str, pd.DataFrame], go.Figure, bool, RawData]:
     """
     Assesses correlation changes between model predictions from reference and monitoring datasets to detect potential
     target drift.

validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py CHANGED Viewed

@@ -2,7 +2,7 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-from typing import List
+from typing import Dict, List, Tuple
 import numpy as np
 import pandas as pd
@@ -21,7 +21,7 @@ def PredictionProbabilitiesHistogramDrift(
     model: VMModel,
     title="Prediction Probabilities Histogram Drift",
     drift_pct_threshold: float = 20.0,
-):
+) -> Tuple[go.Figure, Dict[str, pd.DataFrame], bool, RawData]:
     """
     Compares prediction probability distributions between reference and monitoring datasets.

validmind/tests/ongoing_monitoring/PredictionQuantilesAcrossFeatures.py CHANGED Viewed

@@ -2,15 +2,20 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+from typing import List, Tuple
 import plotly.graph_objects as go
 from plotly.subplots import make_subplots
 from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
 @tags("visualization")
 @tasks("monitoring")
-def PredictionQuantilesAcrossFeatures(datasets, model):
+def PredictionQuantilesAcrossFeatures(
+    datasets: List[VMDataset], model: VMModel
+) -> Tuple[go.Figure, ...]:
     """
     Assesses differences in model prediction distributions across individual features between reference
     and monitoring datasets through quantile analysis.

validmind/tests/ongoing_monitoring/ROCCurveDrift.py CHANGED Viewed

@@ -2,7 +2,7 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-from typing import List
+from typing import List, Tuple
 import numpy as np
 import plotly.graph_objects as go
@@ -20,7 +20,9 @@ from validmind.vm_models import VMDataset, VMModel
     "visualization",
 )
 @tasks("classification", "text_classification")
-def ROCCurveDrift(datasets: List[VMDataset], model: VMModel):
+def ROCCurveDrift(
+    datasets: List[VMDataset], model: VMModel
+) -> Tuple[go.Figure, go.Figure, RawData]:
     """
     Compares ROC curves between reference and monitoring datasets.

validmind/tests/ongoing_monitoring/ScoreBandsDrift.py CHANGED Viewed

@@ -2,7 +2,7 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-from typing import List
+from typing import Dict, List, Tuple
 import numpy as np
 import pandas as pd
@@ -19,7 +19,7 @@ def ScoreBandsDrift(
     score_column: str = "score",
     score_bands: list = None,
     drift_threshold: float = 20.0,
-):
+) -> Tuple[Dict[str, pd.DataFrame], bool, RawData]:
     """
     Analyzes drift in population distribution and default rates across score bands.

validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py CHANGED Viewed

@@ -2,7 +2,7 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-from typing import List
+from typing import Dict, List, Tuple
 import numpy as np
 import pandas as pd
@@ -21,7 +21,7 @@ def ScorecardHistogramDrift(
     score_column: str = "score",
     title: str = "Scorecard Histogram Drift",
     drift_pct_threshold: float = 20.0,
-):
+) -> Tuple[go.Figure, Dict[str, pd.DataFrame], bool]:
     """
     Compares score distributions between reference and monitoring datasets for each class.

validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py CHANGED Viewed

@@ -2,17 +2,24 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+from typing import Dict, List, Tuple
 import pandas as pd
 import plotly.figure_factory as ff
 import plotly.graph_objects as go
 from scipy.stats import kurtosis, skew
 from validmind import RawData, tags, tasks
+from validmind.vm_models import VMDataset, VMModel
 @tags("visualization")
 @tasks("monitoring")
-def TargetPredictionDistributionPlot(datasets, model, drift_pct_threshold=20):
+def TargetPredictionDistributionPlot(
+    datasets: List[VMDataset],
+    model: VMModel,
+    drift_pct_threshold: float = 20,
+) -> Tuple[Dict[str, pd.DataFrame], go.Figure, bool, RawData]:
     """
     Assesses differences in prediction distributions between a reference dataset and a monitoring dataset to identify
     potential data drift.

validmind/tests/output.py CHANGED Viewed

@@ -45,7 +45,13 @@ class BooleanOutputHandler(OutputHandler):
 class MetricOutputHandler(OutputHandler):
     def can_handle(self, item: Any) -> bool:
-        return isinstance(item, (int, float))
+        # Accept individual numbers
+        if isinstance(item, (int, float)):
+            return True
+        # Accept lists/arrays of numbers for per-row metrics
+        if isinstance(item, (list, tuple, np.ndarray)):
+            return all(isinstance(x, (int, float, np.number)) for x in item)
+        return False
     def process(self, item: Any, result: TestResult) -> None:
         if result.metric is not None:
@@ -169,11 +175,12 @@ def process_output(item: Any, result: TestResult) -> None:
     """Process a single test output item and update the TestResult."""
     handlers = [
         BooleanOutputHandler(),
-        MetricOutputHandler(),
         FigureOutputHandler(),
         TableOutputHandler(),
         RawDataOutputHandler(),
         StringOutputHandler(),
+        # Unit metrics should be processed last
+        MetricOutputHandler(),
     ]
     for handler in handlers:

validmind/tests/plots/BoxPlot.py ADDED Viewed

@@ -0,0 +1,260 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+from typing import List, Optional
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+from validmind import tags, tasks
+from validmind.errors import SkipTestError
+from validmind.vm_models import VMDataset
+def _validate_inputs(
+    dataset: VMDataset, columns: Optional[List[str]], group_by: Optional[str]
+):
+    """Validate inputs and return validated columns."""
+    if columns is None:
+        columns = dataset.feature_columns_numeric
+    else:
+        available_columns = set(dataset.feature_columns_numeric)
+        columns = [col for col in columns if col in available_columns]
+    if not columns:
+        raise SkipTestError("No numerical columns found for box plotting")
+    if group_by is not None:
+        if group_by not in dataset.df.columns:
+            raise SkipTestError(f"Group column '{group_by}' not found in dataset")
+        if group_by in columns:
+            columns.remove(group_by)
+    return columns
+def _create_grouped_boxplot(
+    dataset, columns, group_by, colors, show_outliers, title_prefix, width, height
+):
+    """Create grouped box plots."""
+    fig = go.Figure()
+    groups = dataset.df[group_by].dropna().unique()
+    for col_idx, column in enumerate(columns):
+        for group_idx, group_value in enumerate(groups):
+            data_subset = dataset.df[dataset.df[group_by] == group_value][
+                column
+            ].dropna()
+            if len(data_subset) > 0:
+                color = colors[group_idx % len(colors)]
+                fig.add_trace(
+                    go.Box(
+                        y=data_subset,
+                        name=f"{group_value}",
+                        marker_color=color,
+                        boxpoints="outliers" if show_outliers else False,
+                        jitter=0.3,
+                        pointpos=-1.8,
+                        legendgroup=f"{group_value}",
+                        showlegend=(col_idx == 0),
+                        offsetgroup=group_idx,
+                        x=[column] * len(data_subset),
+                    )
+                )
+    fig.update_layout(
+        title=f"{title_prefix} Features by {group_by}",
+        xaxis_title="Features",
+        yaxis_title="Values",
+        boxmode="group",
+        width=width,
+        height=height,
+        template="plotly_white",
+    )
+    return fig
+def _create_single_boxplot(
+    dataset, column, colors, show_outliers, title_prefix, width, height
+):
+    """Create single column box plot."""
+    data = dataset.df[column].dropna()
+    if len(data) == 0:
+        raise SkipTestError(f"No data available for column {column}")
+    fig = go.Figure()
+    fig.add_trace(
+        go.Box(
+            y=data,
+            name=column,
+            marker_color=colors[0],
+            boxpoints="outliers" if show_outliers else False,
+            jitter=0.3,
+            pointpos=-1.8,
+        )
+    )
+    fig.update_layout(
+        title=f"{title_prefix} {column}",
+        yaxis_title=column,
+        width=width,
+        height=height,
+        template="plotly_white",
+        showlegend=False,
+    )
+    return fig
+def _create_multiple_boxplots(
+    dataset, columns, colors, show_outliers, title_prefix, width, height
+):
+    """Create multiple column box plots in subplot layout."""
+    n_cols = min(3, len(columns))
+    n_rows = (len(columns) + n_cols - 1) // n_cols
+    subplot_titles = [f"{title_prefix} {col}" for col in columns]
+    fig = make_subplots(
+        rows=n_rows,
+        cols=n_cols,
+        subplot_titles=subplot_titles,
+        vertical_spacing=0.1,
+        horizontal_spacing=0.1,
+    )
+    for idx, column in enumerate(columns):
+        row = (idx // n_cols) + 1
+        col = (idx % n_cols) + 1
+        data = dataset.df[column].dropna()
+        if len(data) > 0:
+            color = colors[idx % len(colors)]
+            fig.add_trace(
+                go.Box(
+                    y=data,
+                    name=column,
+                    marker_color=color,
+                    boxpoints="outliers" if show_outliers else False,
+                    jitter=0.3,
+                    pointpos=-1.8,
+                    showlegend=False,
+                ),
+                row=row,
+                col=col,
+            )
+            fig.update_yaxes(title_text=column, row=row, col=col)
+        else:
+            fig.add_annotation(
+                text=f"No data available<br>for {column}",
+                x=0.5,
+                y=0.5,
+                xref=f"x{idx+1} domain" if idx > 0 else "x domain",
+                yref=f"y{idx+1} domain" if idx > 0 else "y domain",
+                showarrow=False,
+                row=row,
+                col=col,
+            )
+    fig.update_layout(
+        title="Dataset Feature Distributions",
+        width=width,
+        height=height,
+        template="plotly_white",
+        showlegend=False,
+    )
+    return fig
+@tags("tabular_data", "visualization", "data_quality")
+@tasks("classification", "regression", "clustering")
+def BoxPlot(
+    dataset: VMDataset,
+    columns: Optional[List[str]] = None,
+    group_by: Optional[str] = None,
+    width: int = 1200,
+    height: int = 600,
+    colors: Optional[List[str]] = None,
+    show_outliers: bool = True,
+    title_prefix: str = "Box Plot of",
+) -> go.Figure:
+    """
+    Generates customizable box plots for numerical features in a dataset with optional grouping using Plotly.
+    ### Purpose
+    This test provides a flexible way to visualize the distribution of numerical features
+    through interactive box plots, with optional grouping by categorical variables. Box plots are
+    effective for identifying outliers, comparing distributions across groups, and
+    understanding the spread and central tendency of the data.
+    ### Test Mechanism
+    The test creates interactive box plots for specified numerical columns (or all numerical columns
+    if none specified). It supports various customization options including:
+    - Grouping by categorical variables
+    - Customizable colors and styling
+    - Outlier display options
+    - Interactive hover information
+    - Zoom and pan capabilities
+    ### Signs of High Risk
+    - Presence of many outliers indicating data quality issues
+    - Highly skewed distributions
+    - Large differences in variance across groups
+    - Unexpected patterns in grouped data
+    ### Strengths
+    - Clear visualization of distribution statistics (median, quartiles, outliers)
+    - Interactive Plotly plots with hover information and zoom capabilities
+    - Effective for comparing distributions across groups
+    - Handles missing values appropriately
+    - Highly customizable appearance
+    ### Limitations
+    - Limited to numerical features only
+    - May not be suitable for continuous variables with many unique values
+    - Visual interpretation may be subjective
+    - Less effective with very large datasets
+    """
+    # Validate inputs
+    columns = _validate_inputs(dataset, columns, group_by)
+    # Set default colors
+    if colors is None:
+        colors = [
+            "steelblue",
+            "orange",
+            "green",
+            "red",
+            "purple",
+            "brown",
+            "pink",
+            "gray",
+            "olive",
+            "cyan",
+        ]
+    # Create appropriate plot type
+    if group_by is not None:
+        return _create_grouped_boxplot(
+            dataset,
+            columns,
+            group_by,
+            colors,
+            show_outliers,
+            title_prefix,
+            width,
+            height,
+        )
+    elif len(columns) == 1:
+        return _create_single_boxplot(
+            dataset, columns[0], colors, show_outliers, title_prefix, width, height
+        )
+    else:
+        return _create_multiple_boxplots(
+            dataset, columns, colors, show_outliers, title_prefix, width, height
+        )

validmind 2.8.28__py3-none-any.whl → 2.9.1__py3-none-any.whl

validmind 2.8.28__py3-none-any.whl → 2.9.1__py3-none-any.whl