validmind 2.5.24 → 2.6.7 (py3-none-any.whl)
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.7.dist-info/METADATA +137 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.24.dist-info/METADATA +0 -118
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
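Note the module renames in the list above (`DFGLSArch.py → DickeyFullerGLS.py`, `AspectCritique.py → AspectCritic.py`, `AnswerRelevance.py → ResponseRelevancy.py`, `AnswerSimilarity.py → SemanticSimilarity.py`). Since test IDs mirror these module paths, hard-coded IDs need updating after the upgrade. A minimal sketch, assuming the `validmind.tests.run_test` entry point and a dataset object initialized elsewhere (`vm_dataset` is illustrative):

```python
import validmind as vm

# The test ID mirrors the renamed module path; the old ID
# ("validmind.data_validation.DFGLSArch") no longer resolves after the upgrade.
result = vm.tests.run_test(
    "validmind.data_validation.DickeyFullerGLS",
    inputs={"dataset": vm_dataset},  # vm_dataset: a VMDataset initialized elsewhere (assumption)
)
result.log()  # assumption: results still expose log() for sending them to the platform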
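```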
validmind/tests/model_validation/ragas/Faithfulness.py

````diff
@@ -14,22 +14,26 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import faithfulness
+    from ragas.metrics import Faithfulness as faithfulness
 except ImportError as e:
-    …
-    …
-    …
-    …
-    …
-    …
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for Faithfulness. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
 
 
 @tags("ragas", "llm", "rag_performance")
 @tasks("text_qa", "text_generation", "text_summarization")
 def Faithfulness(
     dataset,
-    …
-    …
+    user_input_column="user_input",
+    response_column="response",
+    retrieved_contexts_column="retrieved_contexts",
 ):  # noqa
     """
     Evaluates the faithfulness of the generated answers with respect to retrieved contexts.
@@ -54,20 +58,23 @@ def Faithfulness(
 
     This metric requires the following columns in your dataset:
 
-    - `…
+    - `user_input` (str): The user input that the model is responding to.
+    - `retrieved_contexts` (List[str]): A list of text contexts which are retrieved to generate
       the answer.
-    - `…
+    - `response` (str): The response generated by the model which will be evaluated for
       faithfulness against the given contexts.
 
     If the above data is not in the appropriate column, you can specify different column
-    names for these fields using the parameters `…
+    names for these fields using the parameters `retrieved_contexts_column` and
+    `response_column`.
 
     For example, if your dataset has this data stored in different columns, you can
     pass the following parameters:
     ```python
     {
-        "…
-        "…
+        "retrieved_contexts_column": "context_info",
+        "response_column": "my_answer_col",
+        "user_input_column": "user_input",
     }
     ```
 
@@ -76,8 +83,9 @@ def Faithfulness(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "…
-        "…
+        "retrieved_contexts_column": f"{pred_col}.retrieved_contexts",
+        "response_column": f"{pred_col}.response",
+        "user_input_column": "user_input",
     }
     ```
 
@@ -85,8 +93,9 @@ def Faithfulness(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "…
-        "…
+        "retrieved_contexts_column": lambda row: [row[pred_col]["context_message"]],
+        "response_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+        "user_input_column": "user_input",
     }
     ```
     """
@@ -97,31 +106,34 @@ def Faithfulness(
     )
 
     required_columns = {
-        "…
-        "…
+        "response": response_column,
+        "retrieved_contexts": retrieved_contexts_column,
+        "user_input": user_input_column,
     }
 
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df), metrics=[faithfulness], **get_ragas_config()
+        Dataset.from_pandas(df), metrics=[faithfulness()], **get_ragas_config()
     ).to_pandas()
 
-    …
-    …
+    score_column = "faithfulness"
+
+    fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
+    fig_box = px.box(x=result_df[score_column].to_list())
 
     return (
         {
-            # "Scores (will not be uploaded to …
-            #     ["…
+            # "Scores (will not be uploaded to ValidMind Platform)": result_df[
+            #     ["retrieved_contexts", "response", "faithfulness"]
             # ],
             "Aggregate Scores": [
                 {
-                    "Mean Score": result_df[…
-                    "Median Score": result_df[…
-                    "Max Score": result_df[…
-                    "Min Score": result_df[…
-                    "Standard Deviation": result_df[…
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
                     "Count": result_df.shape[0],
                 }
             ],
````
validmind/tests/model_validation/ragas/NoiseSensitivity.py

````diff
@@ -14,23 +14,30 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import …
+    from ragas.metrics import NoiseSensitivity as noise_sensitivity
 except ImportError as e:
-    …
-    …
-    …
-    …
-    …
-    …
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for NoiseSensitivity. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
+
+VALID_FOCUS_VALUES = ["relevant", "irrelevant"]
 
 
 @tags("ragas", "llm", "rag_performance")
 @tasks("text_qa", "text_generation", "text_summarization")
 def NoiseSensitivity(
     dataset,
-    …
-    …
-    …
+    response_column="response",
+    retrieved_contexts_column="retrieved_contexts",
+    reference_column="reference",
+    focus="relevant",
+    user_input_column="user_input",
 ):
     """
     Assesses the sensitivity of a Large Language Model (LLM) to noise in retrieved context by measuring how often it
@@ -58,20 +65,22 @@ def NoiseSensitivity(
 
     This metric requires the following columns in your dataset:
 
-    - `…
+    - `retrieved_contexts` (List[str]): A list of text contexts which are retrieved to generate
       the answer.
-    - `…
-    - `…
-    …
+    - `response` (str): The response generated by the model
+    - `reference` (str): The "correct" answer to the question
+    - `user_input` (str): The user input question
     If the above data is not in the appropriate column, you can specify different column
-    names for these fields using the parameters `…
+    names for these fields using the parameters `retrieved_contexts_column` and `response_column`.
 
     For example, if your dataset has this data stored in different columns, you can
     pass the following parameters:
     ```python
     {
-        "…
-        "…
+        "retrieved_contexts_column": "context_info",
+        "response_column": "my_answer_col",
+        "reference_column": "reference",
+        "user_input_column": "user_input",
     }
     ```
 
@@ -80,8 +89,10 @@ def NoiseSensitivity(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "…
-        "…
+        "reference_column": "reference",
+        "retrieved_contexts_column": f"{pred_col}.retrieved_contexts",
+        "response_column": f"{pred_col}.response",
+        "user_input_column": "user_input",
     }
     ```
 
@@ -89,8 +100,10 @@ def NoiseSensitivity(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "…
-        "…
+        "reference_column": "reference",
+        "retrieved_contexts_column": lambda row: [row[pred_col]["context_message"]],
+        "response_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+        "user_input_column": "user_input",
     }
 
     ### Signs of High Risk
@@ -118,37 +131,48 @@ def NoiseSensitivity(
         message="promote has been superseded by promote_options='default'.",
     )
 
+    if focus not in VALID_FOCUS_VALUES:
+        raise ValueError(
+            f"Invalid focus parameter: '{focus}'. "
+            f"Must be one of: {VALID_FOCUS_VALUES}"
+        )
+
     required_columns = {
-        "…
-        "…
-        "…
+        "response": response_column,
+        "retrieved_contexts": retrieved_contexts_column,
+        "reference": reference_column,
+        "user_input": user_input_column,
     }
 
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
         Dataset.from_pandas(df),
-        metrics=[…
+        metrics=[noise_sensitivity(focus=focus)],
         **get_ragas_config(),
     ).to_pandas()
 
+    score_column = f"noise_sensitivity_{focus}"
+
     fig_histogram = px.histogram(
-        x=result_df[…
+        x=result_df[score_column].to_list(),
+        nbins=10,
+        title=f"Noise Sensitivity ({focus})",
+    )
+    fig_box = px.box(
+        x=result_df[score_column].to_list(),
+        title=f"Noise Sensitivity Distribution ({focus})",
     )
-    fig_box = px.box(x=result_df["noise_sensitivity_relevant"].to_list())
 
     return (
         {
-            # "Scores (will not be uploaded to UI)": result_df[
-            #     ["contexts", "answer", "ground_truth", "noise_sensitivity_relevant"]
-            # ],
             "Aggregate Scores": [
                 {
-                    "Mean Score": result_df[…
-                    "Median Score": result_df[…
-                    "Max Score": result_df[…
-                    "Min Score": result_df[…
-                    "Standard Deviation": result_df[…
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
                     "Count": result_df.shape[0],
                 }
             ],
````
validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py}

````diff
@@ -14,36 +14,39 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import …
+    from ragas.metrics import ResponseRelevancy as response_relevancy
 except ImportError as e:
-    …
-    …
-    …
-    …
-    …
-    …
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for AnswerRelevance. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
 
 
 @tags("ragas", "llm", "rag_performance")
 @tasks("text_qa", "text_generation", "text_summarization")
-def AnswerRelevance(
+def ResponseRelevancy(
     dataset,
-    …
-    …
-    …
+    user_input_column="user_input",
+    retrieved_contexts_column=None,
+    response_column="response",
 ):
     """
     Assesses how pertinent the generated answer is to the given prompt.
 
-    The evaluation metric, …
+    The evaluation metric, Response Relevancy, focuses on assessing how pertinent the
     generated answer is to the given prompt. A lower score is assigned to answers that
     are incomplete or contain redundant information and higher scores indicate better
-    relevancy. This metric is computed using the `…
-    `…
+    relevancy. This metric is computed using the `user_input`, the `retrieved_contexts`
+    and the `response`.
 
-    The …
-    `…
-    based on the `…
+    The Response Relevancy is defined as the mean cosine similartiy of the original
+    `user_input` to a number of artifical questions, which are generated (reverse-engineered)
+    based on the `response`:
 
     $$
     \\text{answer relevancy} = \\frac{1}{N} \\sum_{i=1}^{N} cos(E_{g_i}, E_o)
@@ -66,10 +69,10 @@ def AnswerRelevance(
 
     This metric requires the following columns in your dataset:
 
-    - `…
-    - `…
-      generating an answer.
-    - `…
+    - `user_input` (str): The text query that was input into the model.
+    - `retrieved_contexts` (List[str]): Any contextual information retrieved by the model
+      before generating an answer.
+    - `response` (str): The response generated by the model.
 
     If the above data is not in the appropriate column, you can specify different column
     names for these fields using the parameters `question_column`, `answer_column`, and
@@ -79,9 +82,9 @@ def AnswerRelevance(
     pass the following parameters:
     ```python
     params = {
-        "…
-        "…
-        "…
+        "user_input_column": "input_text",
+        "response_column": "output_text",
+        "retrieved_contexts_column": "context_info"
     }
     ```
 
@@ -90,8 +93,8 @@ def AnswerRelevance(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "…
-        "…
+        "response_column": f"{pred_col}.generated_answer",
+        "retrieved_contexts_column": f"{pred_col}.contexts",
     }
     ```
 
@@ -99,8 +102,8 @@ def AnswerRelevance(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "…
-        "…
+        "response_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+        "retrieved_contexts_column": lambda row: [row[pred_col]["context_message"]],
     }
     ```
     """
@@ -111,32 +114,40 @@ def AnswerRelevance(
     )
 
     required_columns = {
-        "…
-        "…
-        "contexts": contexts_column,
+        "user_input": user_input_column,
+        "response": response_column,
     }
 
+    if retrieved_contexts_column:
+        required_columns["retrieved_contexts"] = retrieved_contexts_column
+
     df = get_renamed_columns(dataset._df, required_columns)
 
+    metrics = [response_relevancy()]
+
     result_df = evaluate(
-        Dataset.from_pandas(df), …
+        Dataset.from_pandas(df),
+        metrics=metrics,
+        **get_ragas_config(),
     ).to_pandas()
 
-    …
-    …
+    score_column = "answer_relevancy"
+
+    fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
+    fig_box = px.box(x=result_df[score_column].to_list())
 
     return (
         {
-            # "Scores (will not be uploaded to …
-            #     ["…
+            # "Scores (will not be uploaded to ValidMind Platform)": result_df[
+            #     ["user_input", "retrieved_contexts", "response", "answer_relevancy"]
             # ],
             "Aggregate Scores": [
                 {
-                    "Mean Score": result_df[…
-                    "Median Score": result_df[…
-                    "Max Score": result_df[…
-                    "Min Score": result_df[…
-                    "Standard Deviation": result_df[…
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
                     "Count": result_df.shape[0],
                 }
             ],
````
validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py}

````diff
@@ -14,30 +14,33 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import …
+    from ragas.metrics import SemanticSimilarity as semantic_similarity
 except ImportError as e:
-    …
-    …
-    …
-    …
-    …
-    …
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for AnswerSimilarity. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
 
 
 @tags("ragas", "llm")
 @tasks("text_qa", "text_generation", "text_summarization")
-def AnswerSimilarity(
+def SemanticSimilarity(
     dataset,
-    …
-    …
+    response_column="response",
+    reference_column="reference",
 ):
     """
-    Calculates the semantic similarity between generated …
+    Calculates the semantic similarity between generated responses and ground truths
 
     The concept of Answer Semantic Similarity pertains to the assessment of the semantic
     resemblance between the generated answer and the ground truth. This evaluation is
-    based on the `…
-    of 0 to 1. A higher score signifies a better alignment between the generated
+    based on the `reference` and the `response`, with values falling within the range
+    of 0 to 1. A higher score signifies a better alignment between the generated response
     and the ground truth.
 
     Measuring the semantic similarity between answers can offer valuable insights into
@@ -55,19 +58,19 @@ def AnswerSimilarity(
 
     This metric requires the following columns in your dataset:
 
-    - `…
-    - `…
+    - `response` (str): The text response generated by the model.
+    - `reference` (str): The ground truth answer that the generated answer is compared
       against.
 
     If the above data is not in the appropriate column, you can specify different column
-    names for these fields using the parameters `…
+    names for these fields using the parameters `response_column`, and `reference_column`.
 
     For example, if your dataset has this data stored in different columns, you can
     pass the following parameters:
     ```python
     {
-        "…
-        "…
+        "response_column": "llm_output_col",
+        "reference_column": "my_ground_truth_col",
     }
     ```
 
@@ -76,8 +79,8 @@ def AnswerSimilarity(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "…
-        "…
+        "response_column": f"{pred_col}.generated_answer",
+        "reference_column": "my_ground_truth_col",
     }
     ```
 
@@ -85,8 +88,8 @@ def AnswerSimilarity(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "…
-        "…
+        "response_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+        "reference_column": "my_ground_truth_col",
     }
     ```
     """
@@ -97,31 +100,33 @@ def AnswerSimilarity(
     )
 
     required_columns = {
-        "…
-        "…
+        "response": response_column,
+        "reference": reference_column,
     }
 
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df), metrics=[…
+        Dataset.from_pandas(df), metrics=[semantic_similarity()], **get_ragas_config()
     ).to_pandas()
 
-    …
-    …
+    score_column = "semantic_similarity"
+
+    fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
+    fig_box = px.box(x=result_df[score_column].to_list())
 
     return (
         {
-            # "Scores (will not be uploaded to …
-            #     ["…
+            # "Scores (will not be uploaded to ValidMind Platform)": result_df[
+            #     ["response", "reference", "semantic_similarity"]
             # ],
             "Aggregate Scores": [
                 {
-                    "Mean Score": result_df[…
-                    "Median Score": result_df[…
-                    "Max Score": result_df[…
-                    "Min Score": result_df[…
-                    "Standard Deviation": result_df[…
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
                     "Count": result_df.shape[0],
                 }
             ],
````
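The ragas-based tests above all move from the older ragas column names to `user_input`, `response`, `retrieved_contexts`, and `reference`, and now instantiate the class-based ragas metrics (e.g. `faithfulness()`). A hedged sketch of mapping an existing evaluation dataset onto the new parameter names, using NoiseSensitivity as the example; the source column names and `rag_eval_ds` are illustrative, and `run_test` is assumed to be the usual entry point:

```python
import validmind as vm

# Illustrative source column names; only the parameter keys and the allowed
# "focus" values come from the diffs above.
vm.tests.run_test(
    "validmind.model_validation.ragas.NoiseSensitivity",
    inputs={"dataset": rag_eval_ds},  # rag_eval_ds: an initialized VMDataset (assumption)
    params={
        "user_input_column": "question_text",
        "response_column": "model_answer",
        "retrieved_contexts_column": "retrieved_passages",
        "reference_column": "gold_answer",
        "focus": "relevant",  # must be "relevant" or "irrelevant" per VALID_FOCUS_VALUES
    },
)
```

The `focus` value also selects which ragas output column is summarized, since the test reads `noise_sensitivity_{focus}` from the evaluation result.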
validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py

````diff
@@ -2,15 +2,15 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from …
+from sklearn.metrics import adjusted_mutual_info_score
 
-from …
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
 
-from .ClusterPerformance import ClusterPerformance
 
-…
-@…
-…
+@tags("sklearn", "model_performance", "clustering")
+@tasks("clustering")
+def AdjustedMutualInformation(model: VMModel, dataset: VMDataset):
     """
     Evaluates clustering model performance by measuring mutual information between true and predicted labels, adjusting
     for chance.
@@ -52,14 +52,11 @@ class AdjustedMutualInformation(ClusterPerformance):
     - The interpretability of the score can be complex as it depends on the understanding of information theory
       concepts.
     """
-    …
-    …
-    …
-    …
-    …
-    …
-    …
+    return [
+        {
+            "Adjusted Mutual Information": adjusted_mutual_info_score(
+                labels_true=dataset.y,
+                labels_pred=dataset.y_pred(model),
+            )
+        }
     ]
-    …
-    def metric_info(self):
-        return {"Adjusted Mutual Information": metrics.adjusted_mutual_info_score}
````