validmind 2.0.0__py3-none-any.whl → 2.0.7__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. validmind/__init__.py +4 -1
  2. validmind/__version__.py +1 -1
  3. validmind/ai.py +197 -0
  4. validmind/api_client.py +16 -4
  5. validmind/client.py +23 -3
  6. validmind/datasets/classification/customer_churn.py +2 -2
  7. validmind/datasets/nlp/__init__.py +5 -0
  8. validmind/datasets/nlp/cnn_dailymail.py +98 -0
  9. validmind/datasets/nlp/datasets/cnn_dailymail_100_with_predictions.csv +255 -0
  10. validmind/datasets/nlp/datasets/cnn_dailymail_500_with_predictions.csv +1277 -0
  11. validmind/datasets/nlp/datasets/sentiments_with_predictions.csv +4847 -0
  12. validmind/errors.py +11 -1
  13. validmind/models/huggingface.py +2 -2
  14. validmind/models/pytorch.py +3 -3
  15. validmind/models/sklearn.py +4 -4
  16. validmind/tests/__init__.py +47 -9
  17. validmind/tests/data_validation/DatasetDescription.py +0 -1
  18. validmind/tests/data_validation/PiTCreditScoresHistogram.py +8 -3
  19. validmind/tests/data_validation/TargetRateBarPlots.py +3 -1
  20. validmind/tests/data_validation/nlp/StopWords.py +1 -6
  21. validmind/tests/data_validation/nlp/TextDescription.py +20 -9
  22. validmind/tests/decorator.py +189 -0
  23. validmind/tests/model_validation/MeteorScore.py +92 -0
  24. validmind/tests/model_validation/RegardHistogram.py +5 -6
  25. validmind/tests/model_validation/RegardScore.py +3 -5
  26. validmind/tests/model_validation/RougeMetrics.py +6 -4
  27. validmind/tests/model_validation/SelfCheckNLIScore.py +112 -0
  28. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +17 -22
  29. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +3 -1
  30. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +30 -4
  31. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +9 -3
  32. validmind/tests/model_validation/statsmodels/ADF.py +27 -1
  33. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
  34. validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py +1 -13
  35. validmind/tests/prompt_validation/ai_powered_test.py +2 -0
  36. validmind/unit_metrics/__init__.py +0 -2
  37. validmind/unit_metrics/composite.py +275 -0
  38. validmind/unit_metrics/regression/GiniCoefficient.py +39 -0
  39. validmind/unit_metrics/regression/HuberLoss.py +27 -0
  40. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +36 -0
  41. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +22 -0
  42. validmind/unit_metrics/regression/MeanBiasDeviation.py +22 -0
  43. validmind/unit_metrics/regression/QuantileLoss.py +25 -0
  44. validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +27 -0
  45. validmind/unit_metrics/regression/sklearn/MeanAbsoluteError.py +22 -0
  46. validmind/unit_metrics/regression/sklearn/MeanSquaredError.py +22 -0
  47. validmind/unit_metrics/regression/sklearn/RSquaredScore.py +22 -0
  48. validmind/unit_metrics/regression/sklearn/RootMeanSquaredError.py +23 -0
  49. validmind/unit_metrics/sklearn/classification/Accuracy.py +2 -0
  50. validmind/unit_metrics/sklearn/classification/F1.py +2 -0
  51. validmind/unit_metrics/sklearn/classification/Precision.py +2 -0
  52. validmind/unit_metrics/sklearn/classification/ROC_AUC.py +2 -0
  53. validmind/unit_metrics/sklearn/classification/Recall.py +2 -0
  54. validmind/utils.py +17 -1
  55. validmind/vm_models/dataset.py +376 -21
  56. validmind/vm_models/figure.py +52 -17
  57. validmind/vm_models/test/metric.py +33 -30
  58. validmind/vm_models/test/output_template.py +0 -27
  59. validmind/vm_models/test/result_wrapper.py +57 -24
  60. validmind/vm_models/test/test.py +2 -1
  61. validmind/vm_models/test/threshold_test.py +24 -13
  62. validmind/vm_models/test_context.py +7 -0
  63. validmind/vm_models/test_suite/runner.py +1 -1
  64. validmind/vm_models/test_suite/test.py +1 -1
  65. {validmind-2.0.0.dist-info → validmind-2.0.7.dist-info}/METADATA +9 -13
  66. {validmind-2.0.0.dist-info → validmind-2.0.7.dist-info}/RECORD +69 -48
  67. validmind-2.0.7.dist-info/entry_points.txt +3 -0
  68. {validmind-2.0.0.dist-info → validmind-2.0.7.dist-info}/LICENSE +0 -0
  69. {validmind-2.0.0.dist-info → validmind-2.0.7.dist-info}/WHEEL +0 -0
@@ -59,21 +59,19 @@ class RegardScore(Metric):
 
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
         y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
-        input_text = self.inputs.dataset.df[self.inputs.dataset.text_column]
 
-        if not len(y_true) == len(y_pred) == len(input_text):
+        if not len(y_true) == len(y_pred):
             raise ValueError(
                 "Inconsistent lengths among input text, true summaries, and predicted summaries."
             )
 
-        return input_text, y_true, y_pred
+        return y_true, y_pred
 
     def regard_line_plot(self):
         regard_tool = evaluate.load("regard")
-        input_text, y_true, y_pred = self._get_datasets()
+        y_true, y_pred = self._get_datasets()
 
         dataframes = {
-            "Input Text": input_text,
             "Target Text": y_true,
             "Predicted Summaries": y_pred,
         }
@@ -76,7 +76,6 @@ class RougeMetrics(Metric):
         if r_metrics is None:
             raise ValueError("rouge_metrics must be provided in params")
 
-        # With all
        if not (
            set(self.default_params.get("rouge_metrics")).intersection(r_metrics)
            == set(r_metrics)
@@ -97,12 +96,13 @@ class RougeMetrics(Metric):
 
         metrics_df = pd.DataFrame(score_list)
         figures = []
+
         for m in metrics_df.columns:
             df_scores = pd.DataFrame(metrics_df[m].tolist())
             # Visualization part
             fig = go.Figure()
 
-            # Adding the line plots
+            # Adding the line plots for precision, recall, and F1-score with lines and markers
             fig.add_trace(
                 go.Scatter(
                     x=df_scores.index,
@@ -129,11 +129,13 @@ class RougeMetrics(Metric):
             )
 
             fig.update_layout(
-                title="ROUGE Scores for Each Row",
+                title=f"ROUGE Scores for {m}",
                 xaxis_title="Row Index",
                 yaxis_title="Score",
             )
-            k = m.replace("-", "")
+
+            # Ensure a unique key for each metric
+            k = f"{m.replace('-', '')}_{len(figures)}"
             figures.append(
                 Figure(
                     for_object=self,
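
Note on the new figure key: it concatenates the dash-stripped metric name with the running figure count, so each ROUGE variant gets a distinct Figure key. A tiny illustration of the values it produces (metric names here are just typical ROUGE identifiers, not necessarily the package defaults):

    figures = []
    for m in ["rouge-1", "rouge-2", "rouge-l"]:
        k = f"{m.replace('-', '')}_{len(figures)}"  # same expression as in the diff
        figures.append(k)
    print(figures)  # ['rouge1_0', 'rouge2_1', 'rougel_2']
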
@@ -0,0 +1,112 @@
1
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
2
+ # See the LICENSE file in the root of this repository for details.
3
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
+
5
+ import itertools
6
+ from dataclasses import dataclass
7
+
8
+ import pandas as pd
9
+ import plotly.graph_objects as go
10
+ import torch
11
+ from selfcheckgpt.modeling_selfcheck import SelfCheckNLI
12
+ from tqdm import tqdm
13
+
14
+ from validmind.vm_models import Figure, Metric
15
+
16
+
17
+ @dataclass
18
+ class SelfCheckNLIScore(Metric):
19
+ """
20
+ Evaluates text generation models' performance by quantifying the level of hallucination in generated texts compared to reference texts.
21
+
22
+ **Purpose**: The HallucinationScore metric is designed to assess the factual accuracy and reliability of text generated by models, focusing on the detection and quantification of hallucinations—instances where generated content deviates from factual or expected outputs. By comparing generated texts against reference texts, this metric highlights discrepancies indicative of hallucinations, offering insights into the model's ability to produce contextually and factually coherent content.
23
+
24
+ **Test Mechanism**: To compute the HallucinationScore, the metric employs a comparison between the generated texts (model predictions) and the provided reference texts (true values). Using the SelfCheckNLI model, it evaluates each generated text's level of factual congruence with the reference, assigning a hallucination score based on the semantic coherence and factual accuracy. The scores for each text instance are then visualized in a line plot, allowing for the examination of hallucination trends across the dataset.
25
+
26
+ **Signs of High Risk**:
27
+ - High hallucination scores across a significant portion of the dataset, indicating a prevalence of factually inaccurate or irrelevant content generation.
28
+ - Patterns of consistent hallucination in specific contexts or subjects, suggesting gaps in the model's understanding or knowledge.
29
+ - Sharp fluctuations in hallucination scores, which may reveal inconsistencies in the model's performance or sensitivity to certain types of input.
30
+
31
+ **Strengths**:
32
+ - Directly addresses the critical aspect of factual accuracy in generated text, beyond mere linguistic or stylistic coherence.
33
+ - Provides a granular, instance-by-instance analysis of model performance, allowing for targeted improvements and diagnostics.
34
+ - Facilitates a deeper understanding of a model's capabilities and limitations in producing reliable and accurate content.
35
+
36
+ **Limitations**:
37
+ - Reliance on the SelfCheckNLI model means the accuracy and effectiveness of the HallucinationScore are contingent upon the performance and suitability of the underlying NLI model.
38
+ - May not fully capture the subtleties of certain factual inaccuracies or the contextual relevance of reference texts, especially in complex or nuanced domains.
39
+ - Potentially resource-intensive, given the computational demands of running advanced NLI models for large datasets.
40
+ """
41
+
42
+ name = "self_check_nli_score"
43
+ required_inputs = ["model", "dataset"]
44
+
45
+ def run(self):
46
+ # Assuming the dataset is structured with generated sentences and reference samples
47
+ y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
48
+ y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
49
+
50
+ hallucination_scores = self.compute_hallucination_scores(y_pred, y_true)
51
+
52
+ # Visualization of scores
53
+ figures = self.visualize_scores(hallucination_scores)
54
+
55
+ return self.cache_results(figures=figures)
56
+
57
+ def compute_hallucination_scores(self, predictions, references):
58
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
59
+ selfcheck_nli = SelfCheckNLI(device=device)
60
+ hallucination_scores = []
61
+
62
+ print("Starting hallucination score computation...")
63
+
64
+ for index, (sentences, samples) in enumerate(
65
+ tqdm(zip(predictions, references), total=len(predictions))
66
+ ):
67
+ sent_scores_nli = selfcheck_nli.predict(
68
+ sentences=sentences, sampled_passages=samples
69
+ )
70
+
71
+ # Compute the mean of the hallucination scores for this row
72
+ average_score = sent_scores_nli.mean()
73
+ hallucination_scores.append(average_score)
74
+
75
+ # Print a progress update for each row
76
+ print(
77
+ f"Row {index + 1}/{len(predictions)}: Average hallucination score: {average_score}"
78
+ )
79
+
80
+ print("Completed hallucination score computation.")
81
+
82
+ return hallucination_scores
83
+
84
+ def visualize_scores(self, scores):
85
+ scores_df = pd.DataFrame(scores, columns=["Hallucination Score"])
86
+
87
+ fig = go.Figure()
88
+ fig.add_trace(
89
+ go.Scatter(
90
+ x=scores_df.index,
91
+ y=scores_df["Hallucination Score"],
92
+ mode="lines+markers",
93
+ name="Hallucination Score",
94
+ )
95
+ )
96
+
97
+ fig.update_layout(
98
+ title="Hallucination Scores Across Text Instances",
99
+ xaxis_title="Text Instance Index",
100
+ yaxis_title="Hallucination Score",
101
+ )
102
+
103
+ # Wrapping the plotly figure for compatibility with your framework might be needed
104
+ figures = [
105
+ Figure(
106
+ for_object=self,
107
+ key=self.key,
108
+ figure=fig,
109
+ )
110
+ ]
111
+
112
+ return figures
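
The new metric scores each row by averaging per-sentence NLI contradiction scores. A minimal standalone sketch of that per-row step, using only the selfcheckgpt calls shown in the diff (the example sentences and passages below are made up):

    # Sketch of the per-row scoring performed by SelfCheckNLIScore.compute_hallucination_scores
    import torch
    from selfcheckgpt.modeling_selfcheck import SelfCheckNLI

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    nli = SelfCheckNLI(device=device)

    # One hypothetical row: sentences generated by the model under test, plus
    # sampled reference passages for the same prompt.
    sentences = ["The Eiffel Tower is in Berlin.", "It was completed in 1889."]
    sampled_passages = [
        "The Eiffel Tower is a wrought-iron tower in Paris, completed in 1889.",
        "Gustave Eiffel's company built the tower for the 1889 World's Fair in Paris.",
    ]

    # predict() returns one contradiction-style score per sentence; the metric
    # averages these to get a single hallucination score for the row.
    sent_scores = nli.predict(sentences=sentences, sampled_passages=sampled_passages)
    print(sent_scores.mean())
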
@@ -59,30 +59,25 @@ class DescriptiveAnalytics(Metric):
     }
 
     def run(self):
-        mean = np.mean(self.inputs.dataset.y_pred(self.inputs.model.input_id))
-        median = np.median(self.inputs.dataset.y_pred(self.inputs.model.input_id))
-        std = np.std(self.inputs.dataset.y_pred(self.inputs.model.input_id))
+        # Assuming y_pred returns a 2D array of embeddings [samples, features]
+        preds = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+
+        # Calculate statistics across the embedding dimensions, not across all embeddings
+        means = np.mean(preds, axis=0)  # Mean of each feature across all samples
+        medians = np.median(preds, axis=0)  # Median of each feature across all samples
+        stds = np.std(preds, axis=0)  # Std. dev. of each feature across all samples
+
+        # Plot histograms of the calculated statistics
+        mean_fig = px.histogram(x=means, title="Distribution of Embedding Means")
+        median_fig = px.histogram(x=medians, title="Distribution of Embedding Medians")
+        std_fig = px.histogram(
+            x=stds, title="Distribution of Embedding Standard Deviations"
+        )
 
         return self.cache_results(
             figures=[
-                Figure(
-                    for_object=self,
-                    key=self.key,
-                    figure=px.histogram(mean, title="Distribution of Embedding Means"),
-                ),
-                Figure(
-                    for_object=self,
-                    key=self.key,
-                    figure=px.histogram(
-                        median, title="Distribution of Embedding Medians"
-                    ),
-                ),
-                Figure(
-                    for_object=self,
-                    key=self.key,
-                    figure=px.histogram(
-                        std, title="Distribution of Embedding Standard Deviations"
-                    ),
-                ),
+                Figure(for_object=self, key=f"{self.key}_mean", figure=mean_fig),
+                Figure(for_object=self, key=f"{self.key}_median", figure=median_fig),
+                Figure(for_object=self, key=f"{self.key}_std", figure=std_fig),
             ],
         )
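
The substantive change here is that the statistics are now computed per embedding dimension (axis=0) rather than over the flattened prediction array, so each histogram shows a distribution of per-dimension values instead of a single scalar. A small standalone illustration of that axis behaviour, with made-up embeddings:

    import numpy as np

    # Hypothetical batch of 4 embeddings with 3 dimensions each
    preds = np.array(
        [
            [0.1, 0.5, -0.2],
            [0.0, 0.4, -0.1],
            [0.2, 0.6, -0.3],
            [0.1, 0.5, -0.2],
        ]
    )

    print(np.mean(preds))          # old behaviour: one scalar over all values
    print(np.mean(preds, axis=0))  # new behaviour: one mean per dimension -> shape (3,)
    print(np.std(preds, axis=0))   # likewise, one standard deviation per dimension
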
@@ -131,7 +131,9 @@ class ClassifierPerformance(Metric):
         y_true = self.y_true()
         class_pred = self.y_pred()
 
-        report = metrics.classification_report(y_true, class_pred, output_dict=True)
+        report = metrics.classification_report(
+            y_true, class_pred, output_dict=True, zero_division=0
+        )
         report["roc_auc"] = multiclass_roc_auc_score(y_true, class_pred)
 
         return self.cache_results(report)
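
The added zero_division=0 only changes how scikit-learn reports classes with no predicted (or no true) samples: the undefined precision/recall/F1 entries are set to 0.0 without raising an UndefinedMetricWarning. A quick standalone illustration with a toy label set:

    from sklearn import metrics

    y_true = [0, 0, 1, 1]
    y_pred = [0, 0, 0, 0]  # class 1 is never predicted, so its precision is undefined

    # With zero_division=0 the undefined precision is reported as 0.0 silently;
    # the default, zero_division="warn", also returns 0.0 but emits a warning.
    report = metrics.classification_report(
        y_true, y_pred, output_dict=True, zero_division=0
    )
    print(report["1"]["precision"])  # 0.0
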
@@ -8,6 +8,7 @@ from dataclasses import dataclass
 import matplotlib.pyplot as plt
 import shap
 
+from validmind.errors import UnsupportedModelForSHAPError
 from validmind.logging import get_logger
 from validmind.vm_models import Figure, Metric
 
@@ -72,6 +73,9 @@ class SHAPGlobalImportance(Metric):
             "visualization",
         ],
     }
+    default_params = {
+        "kernel_explainer_samples": 10,
+    }
 
     def _generate_shap_plot(self, type_, shap_values, x_test):
         """
@@ -127,22 +131,44 @@ class SHAPGlobalImportance(Metric):
             model_class == "XGBClassifier"
             or model_class == "RandomForestClassifier"
            or model_class == "CatBoostClassifier"
+            or model_class == "DecisionTreeClassifier"
         ):
             explainer = shap.TreeExplainer(trained_model)
         elif (
             model_class == "LogisticRegression"
             or model_class == "XGBRegressor"
             or model_class == "LinearRegression"
+            or model_class == "LinearSVC"
         ):
             explainer = shap.LinearExplainer(trained_model, self.inputs.dataset.x)
+        elif model_class == "SVC":
+            # KernelExplainer is slow so we use shap.sample to speed it up
+            explainer = shap.KernelExplainer(
+                trained_model.predict,
+                shap.sample(
+                    self.inputs.dataset.x,
+                    self.params["kernel_explainer_samples"],
+                ),
+            )
+        else:
+            raise UnsupportedModelForSHAPError(
+                f"Model {model_class} not supported for SHAP importance."
+            )
+
+        # KernelExplainer is slow so we use shap.sample to speed it up
+        if isinstance(explainer, shap.KernelExplainer):
+            shap_sample = shap.sample(
+                self.inputs.dataset.x,
+                self.params["kernel_explainer_samples"],
+            )
         else:
-            raise ValueError(f"Model {model_class} not supported for SHAP importance.")
+            shap_sample = self.inputs.dataset.x
 
-        shap_values = explainer.shap_values(self.inputs.dataset.x)
+        shap_values = explainer.shap_values(shap_sample)
 
         figures = [
-            self._generate_shap_plot("mean", shap_values, self.inputs.dataset.x),
-            self._generate_shap_plot("summary", shap_values, self.inputs.dataset.x),
+            self._generate_shap_plot("mean", shap_values, shap_sample),
+            self._generate_shap_plot("summary", shap_values, shap_sample),
         ]
 
         # restore warnings
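
For kernel-based explanation the change subsamples both the background data and the rows being explained to keep runtime manageable. A rough standalone sketch of the same pattern (the sample size of 10 mirrors the new kernel_explainer_samples default; the model and data below are purely illustrative):

    import shap
    from sklearn.datasets import make_classification
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=200, n_features=5, random_state=0)
    model = SVC().fit(X, y)

    # KernelExplainer is model-agnostic but slow, so the background data and the
    # rows being explained are both subsampled with shap.sample, as in the diff.
    background = shap.sample(X, 10)
    explainer = shap.KernelExplainer(model.predict, background)

    shap_sample = shap.sample(X, 10)
    shap_values = explainer.shap_values(shap_sample)
    # shap_values has one row per sampled instance and one column per feature
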
@@ -129,12 +129,12 @@ class TrainingTestDegradation(ThresholdTest):
         y_test_true = y_test_true.astype(y_test_pred.dtype)
 
         report_train = metrics.classification_report(
-            y_train_true, y_train_pred, output_dict=True
+            y_train_true, y_train_pred, output_dict=True, zero_division=0
         )
         report_train["roc_auc"] = multiclass_roc_auc_score(y_train_true, y_train_pred)
 
         report_test = metrics.classification_report(
-            y_test_true, y_test_pred, output_dict=True
+            y_test_true, y_test_pred, output_dict=True, zero_division=0
         )
         report_test["roc_auc"] = multiclass_roc_auc_score(y_test_true, y_test_pred)
 
@@ -145,7 +145,13 @@ class TrainingTestDegradation(ThresholdTest):
             for metric_name in ["precision", "recall", "f1-score"]:
                 train_score = report_train[class_name][metric_name]
                 test_score = report_test[class_name][metric_name]
-                degradation = (train_score - test_score) / train_score
+
+                # If training score is 0, degradation is assumed to be 100%
+                if train_score == 0:
+                    degradation = 1.0
+                else:
+                    degradation = (train_score - test_score) / train_score
+
                 passed = degradation < self.params["max_threshold"]
                 test_results.append(
                     ThresholdTestResult(
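
Relative degradation is still (train − test) / train; the new branch only guards the division when a class scores exactly zero on the training set. A small worked sketch, using a hypothetical max_threshold of 0.10 (not necessarily the test's default):

    def degradation(train_score, test_score):
        # Mirrors the updated logic: a zero training score counts as 100% degradation.
        if train_score == 0:
            return 1.0
        return (train_score - test_score) / train_score

    max_threshold = 0.10  # illustrative value

    print(degradation(0.90, 0.85) < max_threshold)  # ~0.056 degradation -> passes
    print(degradation(0.90, 0.72) < max_threshold)  # 0.20 degradation -> fails
    print(degradation(0.00, 0.10) < max_threshold)  # treated as 1.0 -> fails instead of ZeroDivisionError
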
@@ -2,9 +2,10 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
+from pandas import DataFrame
 from statsmodels.tsa.stattools import adfuller
 
-from validmind.vm_models import Metric
+from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
 
 
 class ADF(Metric):
@@ -51,6 +52,31 @@ class ADF(Metric):
         ],
     }
 
+    def summary(self, metric_value: dict):
+        table = DataFrame.from_dict(metric_value, orient="index")
+        table = table.reset_index()
+        table.columns = [
+            "Feature",
+            "ADF Statistic",
+            "P-Value",
+            "Used Lag",
+            "Number of Observations",
+            "Critical Values",
+            "IC Best",
+        ]
+        table = table.rename_axis("Index", axis=1)
+
+        return ResultSummary(
+            results=[
+                ResultTable(
+                    data=table,
+                    metadata=ResultTableMetadata(
+                        title="ADF Test Results for Each Feature"
+                    ),
+                ),
+            ]
+        )
+
     def run(self):
         """
         Calculates ADF metric for each of the dataset features
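
The new summary() method assumes each cached value is the tuple returned by statsmodels' adfuller, which is why the table has exactly seven columns: the feature name plus the six tuple elements. A quick standalone check of that tuple shape (the random-walk series below is illustrative):

    import numpy as np
    from statsmodels.tsa.stattools import adfuller

    # Illustrative series: a random walk, non-stationary by construction
    rng = np.random.default_rng(0)
    series = np.cumsum(rng.normal(size=200))

    # adfuller returns (ADF statistic, p-value, used lag, number of observations,
    # critical values dict, best information criterion), matching the table
    # columns once the feature name is prepended via reset_index().
    adf_stat, p_value, used_lag, n_obs, critical_values, ic_best = adfuller(series)
    print(adf_stat, p_value, used_lag, n_obs, critical_values, ic_best)
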
@@ -79,7 +79,7 @@ class RegressionModelsPerformance(Metric):
     def sample_performance_ols(self, models, datasets):
         evaluation_results = []
 
-        for (model, dataset) in zip(models, datasets):
+        for model, dataset in zip(models, datasets):
             X_columns = dataset.get_features_columns()
             y_true = dataset.y
             y_pred = dataset.y_pred(model.input_id)
@@ -73,18 +73,6 @@ class ResidualsVisualInspection(Metric):
         "tags": ["statsmodels", "visualization"],
     }
 
-    def get_residuals(self, column, series):
-        """
-        Get the seasonal decomposition residuals from the test
-        context or re-compute them if not available. This allows
-        running the test individually or as part of a test suite.
-        """
-        sd_all_columns = self.test_context.get_context_data("seasonal_decompose")
-        if sd_all_columns is None or column not in sd_all_columns:
-            return seasonal_decompose(series, model="additive")
-
-        return sd_all_columns[column]
-
     @staticmethod
     def residual_analysis(residuals, variable_name, axes):
         residuals = residuals.dropna().reset_index(
@@ -115,7 +103,7 @@ class ResidualsVisualInspection(Metric):
 
         # TODO: specify which columns to plot via params
         for col in x_train.columns:
-            sd = self.get_residuals(col, x_train[col])
+            sd = seasonal_decompose(x_train[col], model="additive")
 
             # Remove NaN values from the residuals and reset the index
             residuals = pd.Series(sd.resid).dropna().reset_index(drop=True)
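
The test now recomputes the additive decomposition inline for each column instead of fetching it from the shared test context. A standalone sketch of that call and the same residual cleanup, on a synthetic monthly series (the data and frequency here are made up):

    import numpy as np
    import pandas as pd
    from statsmodels.tsa.seasonal import seasonal_decompose

    # Synthetic monthly series: trend + yearly seasonality + noise
    idx = pd.date_range("2020-01-01", periods=48, freq="MS")
    values = np.arange(48) * 0.5 + 10 * np.sin(np.arange(48) * 2 * np.pi / 12)
    series = pd.Series(values + np.random.default_rng(0).normal(size=48), index=idx)

    # Period is inferred from the DatetimeIndex frequency, as in the test's columns
    sd = seasonal_decompose(series, model="additive")
    residuals = pd.Series(sd.resid).dropna().reset_index(drop=True)  # same cleanup as the test
    print(residuals.head())
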
@@ -57,6 +57,8 @@ class AIPoweredTest:
                 {"role": "system", "content": system_prompt},
                 {"role": "user", "content": user_prompt},
             ],
+            temperature=0.0,
+            seed=42,
         )
         .choices[0]
         .message.content
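
Setting temperature=0.0 and a fixed seed makes the LLM-backed prompt tests as repeatable as the API allows (OpenAI documents seed as best-effort determinism, not a guarantee). A hedged sketch of an equivalent call outside the framework, assuming the openai>=1.x client and an illustrative model name:

    from openai import OpenAI

    client = OpenAI()  # reads OPENAI_API_KEY from the environment

    response = client.chat.completions.create(
        model="gpt-4",  # illustrative; the framework's configured model may differ
        messages=[
            {"role": "system", "content": "You are a prompt evaluation assistant."},
            {"role": "user", "content": "Score this prompt from 1 to 10: ..."},
        ],
        temperature=0.0,  # greedy decoding: removes sampling randomness
        seed=42,          # best-effort reproducibility across identical requests
    )
    print(response.choices[0].message.content)
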
@@ -237,8 +237,6 @@ def run_metric(metric_id=None, inputs=None, params=None):
     # Run the metric
     result = metric.run()
 
-    cache_key = get_metric_cache_key(metric_id, params, inputs)
-
     unit_metric_results_cache[cache_key] = result
 
     return result