validmind-2.5.24-py3-none-any.whl → validmind-2.6.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.7.dist-info/METADATA +137 -0
  179. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.24.dist-info/METADATA +0 -118
  196. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py

@@ -2,23 +2,24 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import numpy as np
 import pandas as pd
 import plotly.graph_objects as go
 from sklearn.metrics import r2_score
 from sklearn.utils import check_random_state
 
-from validmind.errors import SkipTestError
+from validmind import tags, tasks
 from validmind.logging import get_logger
-from validmind.vm_models import Figure, Metric
+from validmind.vm_models import VMDataset, VMModel
 
 logger = get_logger(__name__)
 
 
-@dataclass
-class RegressionPermutationFeatureImportance(Metric):
+@tags("statsmodels", "feature_importance", "visualization")
+@tasks("regression")
+def RegressionPermutationFeatureImportance(
+    dataset: VMDataset, model: VMModel, fontsize: int = 12, figure_height: int = 500
+):
     """
     Assesses the significance of each feature in a model by evaluating the impact on model performance when feature
     values are randomly rearranged.
@@ -55,79 +56,45 @@ class RegressionPermutationFeatureImportance(Metric):
     features.
     - Assumes independence of features when calculating importance, which might not always hold true.
     """
-
-    name = "regression_pfi"
-    required_inputs = ["model", "dataset"]
-    default_params = {
-        "fontsize": 12,
-        "figure_height": 500,
-    }
-    tasks = ["regression"]
-    tags = [
-        "statsmodels",
-        "feature_importance",
-        "visualization",
-    ]
-
-    def run(self):
-        x = self.inputs.dataset.x_df()
-        y = self.inputs.dataset.y_df()
-
-        model = self.inputs.model.model
-        if not hasattr(model, "predict"):
-            raise SkipTestError(
-                "Model does not support 'predict' method required for PFI"
-            )
-
-        # Calculate baseline performance
-        baseline_performance = r2_score(y, model.predict(x))
-        importances = pd.DataFrame(index=x.columns, columns=["Importance", "Std Dev"])
-
-        for column in x.columns:
-            shuffled_scores = []
-            for _ in range(30):  # Default number of shuffles
-                x_shuffled = x.copy()
-                x_shuffled[column] = check_random_state(0).permutation(
-                    x_shuffled[column]
-                )
-                permuted_performance = r2_score(y, model.predict(x_shuffled))
-                shuffled_scores.append(baseline_performance - permuted_performance)
-
-            importances.loc[column] = {
-                "Importance": np.mean(shuffled_scores),
-                "Std Dev": np.std(shuffled_scores),
-            }
-
-        sorted_idx = importances["Importance"].argsort()
-
-        # Plotting the results
-        fig = go.Figure()
-        fig.add_trace(
-            go.Bar(
-                y=importances.index[sorted_idx],
-                x=importances.loc[importances.index[sorted_idx], "Importance"],
-                orientation="h",
-                error_x=dict(
-                    type="data",
-                    array=importances.loc[importances.index[sorted_idx], "Std Dev"],
-                ),
-            )
-        )
-        fig.update_layout(
-            title_text="Permutation Feature Importances",
-            yaxis=dict(
-                tickmode="linear", dtick=1, tickfont=dict(size=self.params["fontsize"])
+    y_true = dataset.y
+
+    baseline_performance = r2_score(y_true, dataset.y_pred(model))
+
+    importances = pd.DataFrame(
+        index=dataset.feature_columns, columns=["Importance", "Std Dev"]
+    )
+
+    for column in dataset.feature_columns:
+        shuffled_scores = []
+        for _ in range(30):  # Default number of shuffles
+            x_shuffled = dataset.x_df()
+            x_shuffled[column] = check_random_state(0).permutation(x_shuffled[column])
+            permuted_performance = r2_score(y_true, model.predict(x_shuffled))
+            shuffled_scores.append(baseline_performance - permuted_performance)
+
+        importances.loc[column] = {
+            "Importance": np.mean(shuffled_scores),
+            "Std Dev": np.std(shuffled_scores),
+        }
+
+    sorted_idx = importances["Importance"].argsort()
+
+    fig = go.Figure()
+    fig.add_trace(
+        go.Bar(
+            y=importances.index[sorted_idx],
+            x=importances.loc[importances.index[sorted_idx], "Importance"],
+            orientation="h",
+            error_x=dict(
+                type="data",
+                array=importances.loc[importances.index[sorted_idx], "Std Dev"],
             ),
-            height=self.params["figure_height"],
-        )
-
-        return self.cache_results(
-            metric_value=importances.to_dict(),
-            figures=[
-                Figure(
-                    for_object=self,
-                    key="regression_pfi",
-                    figure=fig,
-                ),
-            ],
         )
+    )
+    fig.update_layout(
+        title_text="Permutation Feature Importances",
+        yaxis=dict(tickmode="linear", dtick=1, tickfont=dict(size=fontsize)),
+        height=figure_height,
+    )
+
+    return fig
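The change above is representative of the 2.6.x migration that runs through this whole diff: class-based Metric/ThresholdTest subclasses with run() and cache_results() become plain functions decorated with @tags/@tasks that take their inputs and parameters directly and simply return figures, tables, numbers, or booleans. A minimal sketch of how the migrated test might be exercised; the run_test test ID string, input keys, result.show(), and the previously initialized vm_dataset/vm_model objects are assumptions about the usual ValidMind harness, not part of this diff:

# Sketch only: assumes `vm_dataset` and `vm_model` were created earlier via
# vm.init_dataset(...) / vm.init_model(...) and that predictions have been
# assigned so that dataset.y_pred(model) works.
from validmind.tests import run_test

result = run_test(
    "validmind.model_validation.statsmodels.RegressionPermutationFeatureImportance",
    inputs={"dataset": vm_dataset, "model": vm_model},
    params={"fontsize": 10, "figure_height": 400},
)
result.show()  # renders the Plotly figure returned by the test function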
validmind/tests/ongoing_monitoring/PredictionCorrelation.py

@@ -92,6 +92,8 @@ def PredictionCorrelation(datasets, model):
     plt.legend()
     plt.tight_layout()
 
+    plt.close()
+
     corr_final["Features"] = corr_final.index
     corr_final = corr_final[
         ["Features", "Reference Predictions", "Monitoring Predictions"]
validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py

@@ -52,14 +52,16 @@ def TargetPredictionDistributionPlot(datasets, model):
 
     fig = plt.figure()
     plot = sns.kdeplot(
-        pred_ref["Reference Prediction"], shade=True, label="Reference Prediction"
+        pred_ref["Reference Prediction"], fill=True, label="Reference Prediction"
     )
     plot = sns.kdeplot(
-        pred_monitor["Monitoring Prediction"], shade=True, label="Monitor Prediction"
+        pred_monitor["Monitoring Prediction"], fill=True, label="Monitor Prediction"
     )
     plot.set(
         xlabel="Prediction", title="Distribution of Reference & Monitor Predictions"
     )
     plot.legend()
 
+    plt.close()
+
     return fig
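Two small but recurring fixes appear in the ongoing-monitoring tests above: sns.kdeplot(shade=True) becomes fill=True (shade has been deprecated in seaborn since 0.11), and plt.close() is called before returning the figure so matplotlib does not also render it implicitly when the test runs in a notebook. A minimal standalone sketch of the same pattern, with made-up sample data:

import matplotlib.pyplot as plt
import seaborn as sns

def kde_figure(values, label):
    """Return a closed matplotlib figure containing a filled KDE plot."""
    fig = plt.figure()
    sns.kdeplot(values, fill=True, label=label)  # fill=True replaces deprecated shade=True
    plt.legend()
    plt.close()  # keep the figure from auto-displaying; the caller decides when to show it
    return fig

fig = kde_figure([0.1, 0.35, 0.4, 0.62, 0.9], label="Reference Prediction")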
validmind/tests/output.py (new file)

@@ -0,0 +1,120 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Union
+from uuid import uuid4
+
+import numpy as np
+import pandas as pd
+
+from validmind.vm_models.figure import (
+    Figure,
+    is_matplotlib_figure,
+    is_plotly_figure,
+    is_png_image,
+)
+from validmind.vm_models.result import ResultTable, TestResult
+
+
+class OutputHandler(ABC):
+    """Base class for handling different types of test outputs"""
+
+    @abstractmethod
+    def can_handle(self, item: Any) -> bool:
+        """Check if this handler can process the given item"""
+        pass
+
+    @abstractmethod
+    def process(self, item: Any, result: TestResult) -> None:
+        """Process the item and update the TestResult"""
+        pass
+
+
+class BooleanOutputHandler(OutputHandler):
+    def can_handle(self, item: Any) -> bool:
+        return isinstance(item, (bool, np.bool_))
+
+    def process(self, item: Any, result: TestResult) -> None:
+        if result.passed is not None:
+            raise ValueError("Test returned more than one boolean value")
+        result.passed = bool(item)
+
+
+class MetricOutputHandler(OutputHandler):
+    def can_handle(self, item: Any) -> bool:
+        return isinstance(item, (int, float))
+
+    def process(self, item: Any, result: TestResult) -> None:
+        if result.metric is not None:
+            raise ValueError("Only one unit metric may be returned per test.")
+        result.metric = item
+
+
+class FigureOutputHandler(OutputHandler):
+    def can_handle(self, item: Any) -> bool:
+        return (
+            isinstance(item, Figure)
+            or is_matplotlib_figure(item)
+            or is_plotly_figure(item)
+            or is_png_image(item)
+        )
+
+    def process(self, item: Any, result: TestResult) -> None:
+        if isinstance(item, Figure):
+            result.add_figure(item)
+        else:
+            random_id = str(uuid4())[:4]
+            result.add_figure(
+                Figure(
+                    key=f"{result.result_id}:{random_id}",
+                    figure=item,
+                    ref_id=result.ref_id,
+                )
+            )
+
+
+class TableOutputHandler(OutputHandler):
+    def can_handle(self, item: Any) -> bool:
+        return isinstance(item, (list, pd.DataFrame, dict, ResultTable))
+
+    def process(
+        self,
+        item: Union[List[Dict[str, Any]], pd.DataFrame, Dict[str, Any], ResultTable],
+        result: TestResult,
+    ) -> None:
+        tables = item if isinstance(item, dict) else {"": item}
+
+        for table_name, table_data in tables.items():
+            # if already a ResultTable, add it directly
+            if isinstance(table_data, ResultTable):
+                result.add_table(table_data)
+                continue
+
+            if not isinstance(table_data, (list, pd.DataFrame)):
+                raise ValueError(
+                    "Invalid table format: must be a list of dictionaries or a DataFrame"
+                )
+
+            if isinstance(table_data, list):
+                table_data = pd.DataFrame(table_data)
+
+            result.add_table(ResultTable(data=table_data, title=table_name or None))
+
+
+def process_output(item: Any, result: TestResult) -> None:
+    """Process a single test output item and update the TestResult."""
+    handlers = [
+        BooleanOutputHandler(),
+        MetricOutputHandler(),
+        FigureOutputHandler(),
+        TableOutputHandler(),
+    ]
+
+    for handler in handlers:
+        if handler.can_handle(item):
+            handler.process(item, result)
+            return
+
+    raise ValueError(f"Invalid test output type: {type(item)}")
validmind/tests/prompt_validation/Bias.py

@@ -2,19 +2,8 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-from typing import List
-
-import pandas as pd
-
+from validmind import tags, tasks
 from validmind.errors import MissingRequiredTestInputError
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)
 
 from .ai_powered_test import (
     call_model,
@@ -23,9 +12,40 @@ from .ai_powered_test import (
     missing_prompt_message,
 )
 
+SYSTEM = """
+You are a prompt evaluation AI. You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different best practices. You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
+
+Consider the following best practice documentation:
+```
+Its been shown that, when passing examples to an LLM via the prompt for the purpose of Sentiment Analysis, a distribution with a majority of positive examples led the LLM to classify ambiguous sentences as "Positive", and vice versa. However, for more straightforward tasks, such as sentiment classification, biases may be harder to induce. The order of presentation also appeared to influence the LLM's outputs, especially when combined with a skewed distribution of exemplars.
+Tactics for Avoiding Bias in Prompting:
+1. Balanced Distribution: Avoid heavily skewed distributions of examples. Instead, offer amore balanced number of examples for each label.
+2. Randomize Order: To prevent order-induced biases, shuffle the sequence of examples ensuring no fixed pattern, such as presenting all positive examples first.
+3. Continued Experimentation: Regularly test and tweak prompts, especially for complex tasks where the LLM might not have extensive knowledge. This iterative process helps in identifying potential pitfalls and refining prompting strategies.
+Example:
+For instance, using 8 positive and 2 negative examples led the LLM to classify an ambiguous sentence "I feel something" as "Positive". However, with a more balanced distribution, the LLM showed unbiased behavior.
+```
+
+Score the bias of the user-submitted prompt. Return a score from 1 to 10 where 10 is a perfect score. Also provide a short explanation for your score.
+
+Response Format:
+```
+Score: <score>
+Explanation: <explanation>
+```
+""".strip()
+
+USER = '''
+Prompt:
+"""
+{prompt_to_test}
+"""
+'''.strip()
+
 
-@dataclass
-class Bias(ThresholdTest):
+@tags("llm", "few_shot")
+@tasks("text_classification", "text_summarization")
+def Bias(model, min_threshold=7):
     """
     Assesses potential bias in a Large Language Model by analyzing the distribution and order of exemplars in the
     prompt.
@@ -74,87 +94,24 @@
     - The use of a grading mechanism to gauge bias may not be entirely accurate in every case, particularly when the
     difference between threshold and score is narrow.
     """
-
-    name = "bias"
-    required_inputs = ["model.prompt"]
-    default_params = {"min_threshold": 7}
-    tasks = ["text_classification", "text_summarization"]
-    tags = ["llm", "few_shot"]
-
-    system_prompt = """
-    You are a prompt evaluation AI. You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different best practices. You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
-
-    Consider the following best practice documentation:
-    ```
-    Its been shown that, when passing examples to an LLM via the prompt for the purpose of Sentiment Analysis, a distribution with a majority of positive examples led the LLM to classify ambiguous sentences as "Positive", and vice versa. However, for more straightforward tasks, such as sentiment classification, biases may be harder to induce. The order of presentation also appeared to influence the LLM's outputs, especially when combined with a skewed distribution of exemplars.
-    Tactics for Avoiding Bias in Prompting:
-    1. Balanced Distribution: Avoid heavily skewed distributions of examples. Instead, offer amore balanced number of examples for each label.
-    2. Randomize Order: To prevent order-induced biases, shuffle the sequence of examples ensuring no fixed pattern, such as presenting all positive examples first.
-    3. Continued Experimentation: Regularly test and tweak prompts, especially for complex tasks where the LLM might not have extensive knowledge. This iterative process helps in identifying potential pitfalls and refining prompting strategies.
-    Example:
-    For instance, using 8 positive and 2 negative examples led the LLM to classify an ambiguous sentence "I feel something" as "Positive". However, with a more balanced distribution, the LLM showed unbiased behavior.
-    ```
-
-    Score the bias of the user-submitted prompt. Return a score from 1 to 10 where 10 is a perfect score. Also provide a short explanation for your score.
-
-    Response Format:
-    ```
-    Score: <score>
-    Explanation: <explanation>
-    ```
-    """.strip()
-    user_prompt = '''
-    Prompt:
-    """
-    {prompt_to_test}
-    """
-    '''.strip()
-
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        result = results[0]
-        results_table = [
-            {
-                "Score": result.values["score"],
-                "Threshold": result.values["threshold"],
-                "Explanation": result.values["explanation"],
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-        ]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=pd.DataFrame(results_table),
-                    metadata=ResultTableMetadata(
-                        title="Bias Test on Prompt",
-                    ),
-                )
-            ]
-        )
-
-    def run(self):
-        if not hasattr(self.inputs.model, "prompt"):
-            raise MissingRequiredTestInputError(missing_prompt_message)
-
-        response = call_model(
-            system_prompt=self.system_prompt,
-            user_prompt=self.user_prompt.format(
-                prompt_to_test=self.inputs.model.prompt.template
-            ),
-        )
-        score = get_score(response)
-        explanation = get_explanation(response)
-
-        passed = score > self.params["min_threshold"]
-        results = [
-            ThresholdTestResult(
-                passed=passed,
-                values={
-                    "score": score,
-                    "explanation": explanation,
-                    "threshold": self.params["min_threshold"],
-                },
-            )
-        ]
-
-        return self.cache_results(results, passed=passed)
+    if not hasattr(model, "prompt"):
+        raise MissingRequiredTestInputError(missing_prompt_message)
+
+    response = call_model(
+        system_prompt=SYSTEM,
+        user_prompt=USER.format(prompt_to_test=model.prompt.template),
+    )
+
+    score = get_score(response)
+    explanation = get_explanation(response)
+
+    passed = score > min_threshold
+
+    return [
+        {
+            "Score": score,
+            "Explanation": explanation,
+            "Threshold": min_threshold,
+            "Pass/Fail": "Pass" if passed else "Fail",
+        }
+    ], passed
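The prompt-validation tests now follow the same functional pattern: module-level SYSTEM/USER prompt constants, a decorated function that takes the model and a min_threshold parameter, and a (table rows, passed) return value that the runner unpacks through the output handlers shown earlier. A custom check written in the same style might look like the following sketch; the test itself, its name, and the character budget are illustrative and not part of the package:

from validmind import tags, tasks


@tags("llm", "zero_shot")
@tasks("text_classification")
def PromptLength(model, max_chars: int = 2000):
    """Checks that the prompt template stays within a character budget."""
    length = len(model.prompt.template)
    passed = length <= max_chars

    return [
        {
            "Characters": length,
            "Budget": max_chars,
            "Pass/Fail": "Pass" if passed else "Fail",
        }
    ], passed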
validmind/tests/prompt_validation/Clarity.py

@@ -2,19 +2,8 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-from typing import List
-
-import pandas as pd
-
+from validmind import tags, tasks
 from validmind.errors import MissingRequiredTestInputError
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)
 
 from .ai_powered_test import (
     call_model,
@@ -23,9 +12,41 @@ from .ai_powered_test import (
     missing_prompt_message,
 )
 
+SYSTEM = """
+You are a prompt evaluation AI. You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics. You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
+
+Consider the following documentation on prompt clarity guidelines when evaluating the prompt:
+'''
+Clear prompts minimize the room for misinterpretation, allowing the LLM to generate more relevant and accurate responses. Ambiguous or vague instructions might leave the model guessing, leading to suboptimal outputs.
+
+Tactics for Ensuring Clarity that will be referenced during evaluation:
+1. Detail Inclusion: Provide essential details or context to prevent the LLM from making assumptions.
+2. Adopt a Persona: Use system messages to specify the desired persona for the LLM's responses.
+3. Specify Steps: For certain tasks, delineate the required steps explicitly, helping the model in sequential understanding.
+4. Provide Examples: While general instructions are efficient, in some scenarios, "few-shot" prompting or style examples can guide the LLM more effectively.
+5. Determine Output Length: Define the targeted length of the response, whether in terms of paragraphs, bullet points, or other units. While word counts aren't always precise, specifying formats like paragraphs can offer more predictable results.
+'''
+
+Score the clarity of the user-submitted prompt. Return a score from 1 to 10 where 10 is a perfect score. Also provide a short explanation for your score.
 
-@dataclass
-class Clarity(ThresholdTest):
+Response Format:
+```
+Score: <score>
+Explanation: <explanation>
+```
+"""
+
+USER = """
+Prompt:
+'''
+{prompt_to_test}
+'''
+"""
+
+
+@tags("llm", "zero_shot", "few_shot")
+@tasks("text_classification", "text_summarization")
+def Clarity(model, min_threshold=7):
     """
     Evaluates and scores the clarity of prompts in a Large Language Model based on specified guidelines.
 
@@ -62,88 +83,24 @@ class Clarity(ThresholdTest):
     examples, and specification of output length) contribute equally to clarity, which might not always be the case
     - The evaluation may not be as effective if used on non-textual models
     """
-
-    name = "clarity"
-    required_inputs = ["model.prompt"]
-    default_params = {"min_threshold": 7}
-    tasks = ["text_classification", "text_summarization"]
-    tags = ["llm", "zero_shot", "few_shot"]
-
-    system_prompt = """
-    You are a prompt evaluation AI. You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics. You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
-
-    Consider the following documentation on prompt clarity guidelines when evaluating the prompt:
-    '''
-    Clear prompts minimize the room for misinterpretation, allowing the LLM to generate more relevant and accurate responses. Ambiguous or vague instructions might leave the model guessing, leading to suboptimal outputs.
-
-    Tactics for Ensuring Clarity that will be referenced during evaluation:
-    1. Detail Inclusion: Provide essential details or context to prevent the LLM from making assumptions.
-    2. Adopt a Persona: Use system messages to specify the desired persona for the LLM's responses.
-    3. Specify Steps: For certain tasks, delineate the required steps explicitly, helping the model in sequential understanding.
-    4. Provide Examples: While general instructions are efficient, in some scenarios, "few-shot" prompting or style examples can guide the LLM more effectively.
-    5. Determine Output Length: Define the targeted length of the response, whether in terms of paragraphs, bullet points, or other units. While word counts aren't always precise, specifying formats like paragraphs can offer more predictable results.
-    '''
-
-    Score the clarity of the user-submitted prompt. Return a score from 1 to 10 where 10 is a perfect score. Also provide a short explanation for your score.
-
-    Response Format:
-    ```
-    Score: <score>
-    Explanation: <explanation>
-    ```
-    """.strip()
-    user_prompt = '''
-    Prompt:
-    """
-    {prompt_to_test}
-    """
-    '''.strip()
-
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        result = results[0]
-        results_table = [
-            {
-                "Score": result.values["score"],
-                "Threshold": result.values["threshold"],
-                "Explanation": result.values["explanation"],
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-        ]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=pd.DataFrame(results_table),
-                    metadata=ResultTableMetadata(
-                        title="Clarity Test for LLM Prompt",
-                    ),
-                )
-            ]
-        )
-
-    def run(self):
-        if not hasattr(self.inputs.model, "prompt"):
-            raise MissingRequiredTestInputError(missing_prompt_message)
-
-        response = call_model(
-            system_prompt=self.system_prompt,
-            user_prompt=self.user_prompt.format(
-                prompt_to_test=self.inputs.model.prompt.template
-            ),
-        )
-        score = get_score(response)
-        explanation = get_explanation(response)
-
-        passed = score > self.params["min_threshold"]
-        results = [
-            ThresholdTestResult(
-                passed=passed,
-                values={
-                    "score": score,
-                    "explanation": explanation,
-                    "threshold": self.params["min_threshold"],
-                },
-            )
-        ]
-
-        return self.cache_results(results, passed=passed)
+    if not hasattr(model, "prompt"):
+        raise MissingRequiredTestInputError(missing_prompt_message)
+
+    response = call_model(
+        system_prompt=SYSTEM,
+        user_prompt=USER.format(prompt_to_test=model.prompt.template),
+    )
+
+    score = get_score(response)
+    explanation = get_explanation(response)
+
+    passed = score > min_threshold
+
+    return [
+        {
+            "Score": score,
+            "Explanation": explanation,
+            "Threshold": min_threshold,
+            "Pass/Fail": "Pass" if passed else "Fail",
+        }
+    ], passed