validmind 2.5.25__py3-none-any.whl → 2.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.8.dist-info/METADATA +137 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.25.dist-info/METADATA +0 -118
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/LICENSE +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/WHEEL +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/entry_points.txt +0 -0
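
Several of the listed changes are renames (for example `DFGLSArch.py → DickeyFullerGLS.py`, `AspectCritique.py → AspectCritic.py`, `AnswerRelevance.py → ResponseRelevancy.py`, `AnswerSimilarity.py → SemanticSimilarity.py`), so code that refers to the old test names needs updating after the upgrade. As a rough illustration only — the dotted test-ID strings below are inferred from the file paths above and are an assumption, not taken from the package — a migration shim might look like this:

```python
# Hypothetical mapping from 2.5.x test IDs to their 2.6.x names, based on the
# renamed files listed above. The dotted test-ID format is an assumption.
RENAMED_TESTS = {
    "validmind.data_validation.DFGLSArch": "validmind.data_validation.DickeyFullerGLS",
    "validmind.model_validation.ragas.AspectCritique": "validmind.model_validation.ragas.AspectCritic",
    "validmind.model_validation.ragas.AnswerRelevance": "validmind.model_validation.ragas.ResponseRelevancy",
    "validmind.model_validation.ragas.AnswerSimilarity": "validmind.model_validation.ragas.SemanticSimilarity",
}


def migrate_test_id(test_id: str) -> str:
    """Return the renamed test ID if the test was moved, otherwise the original."""
    return RENAMED_TESTS.get(test_id, test_id)


print(migrate_test_id("validmind.model_validation.ragas.AspectCritique"))
# -> validmind.model_validation.ragas.AspectCritic
```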
validmind/tests/model_validation/ragas/ContextEntityRecall.py

````diff
@@ -14,22 +14,25 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import context_entity_recall
+    from ragas.metrics import ContextEntityRecall as context_entity_recall
 except ImportError as e:
-
-
-
-
-
-
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for ContextEntityRecall. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
 
 
 @tags("ragas", "llm", "retrieval_performance")
 @tasks("text_qa", "text_generation", "text_summarization")
 def ContextEntityRecall(
     dataset,
-
-
+    retrieved_contexts_column: str = "retrieved_contexts",
+    reference_column: str = "reference",
 ):
     """
     Evaluates the context entity recall for dataset entries and visualizes the results.
@@ -37,18 +40,18 @@ def ContextEntityRecall(
     ### Overview
 
     This metric gives the measure of recall of the retrieved context, based on the
-    number of entities present in both `
-    number of entities present in the `
-    of what fraction of entities are recalled from `
+    number of entities present in both `reference` and `retrieved_contexts` relative to the
+    number of entities present in the `reference` alone. Simply put, it is a measure
+    of what fraction of entities are recalled from `reference`. This metric is
     useful in fact-based use cases like tourism help desk, historical QA, etc. This
     metric can help evaluate the retrieval mechanism for entities, based on comparison
-    with entities present in `
-    we need the `
+    with entities present in `reference`, because in cases where entities matter,
+    we need the `retrieved_contexts` which cover them.
 
     ### Formula
 
     To compute this metric, we use two sets, $GE$ and $CE$, representing the set of
-    entities present in `
+    entities present in `reference` and set of entities present in `retrieved_contexts`
     respectively. We then take the number of elements in intersection of these sets and
     divide it by the number of elements present in the $GE$, given by the formula:
 
@@ -60,20 +63,20 @@ def ContextEntityRecall(
 
     This metric requires the following columns in your dataset:
 
-    - `
-      sure if they contain the entities present in the
-    - `
-      extracted and compared with the entities in the `
+    - `retrieved_contexts` (List[str]): A list of text contexts which will be evaluated to make
+      sure if they contain the entities present in the `reference`.
+    - `reference` (str): The ground truth text from which the entities will be
+      extracted and compared with the entities in the `retrieved_contexts`.
 
     If the above data is not in the appropriate column, you can specify different column
-    names for these fields using the parameters `
+    names for these fields using the parameters `retrieved_contexts_column`, and `reference_column`.
 
     For example, if your dataset has this data stored in different columns, you can
     pass the following parameters:
     ```python
     {
-        "
-        "
+        "retrieved_contexts_column": "context_info",
+        "reference_column": "my_ground_truth_col",
     }
     ```
 
@@ -82,8 +85,8 @@ def ContextEntityRecall(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "
-        "
+        "retrieved_contexts_column": f"{pred_col}.contexts",
+        "reference_column": "my_ground_truth_col",
     }
     ```
 
@@ -91,8 +94,8 @@ def ContextEntityRecall(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "
-        "
+        "retrieved_contexts_column": lambda row: [row[pred_col]["context_message"]],
+        "reference_column": "my_ground_truth_col",
     }
     ```
     """
@@ -103,37 +106,37 @@ def ContextEntityRecall(
     )
 
     required_columns = {
-        "
-        "
+        "reference": reference_column,
+        "retrieved_contexts": retrieved_contexts_column,
     }
 
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df), metrics=[context_entity_recall], **get_ragas_config()
+        Dataset.from_pandas(df), metrics=[context_entity_recall()], **get_ragas_config()
     ).to_pandas()
 
-
-
-    )
-    fig_box = px.box(x=result_df[
+    score_column = "context_entity_recall"
+
+    fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
+    fig_box = px.box(x=result_df[score_column].to_list())
 
     return (
         {
-            # "Scores (will not be uploaded to
+            # "Scores (will not be uploaded to ValidMind Platform)": result_df[
             #     [
-            #         "
-            #         "
+            #         "retrieved_contexts",
+            #         "reference",
             #         "context_entity_recall",
             #     ]
             # ],
             "Aggregate Scores": [
                 {
-                    "Mean Score": result_df[
-                    "Median Score": result_df[
-                    "Max Score": result_df[
-                    "Min Score": result_df[
-                    "Standard Deviation": result_df[
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
                     "Count": result_df.shape[0],
                 }
             ],
````
validmind/tests/model_validation/ragas/ContextPrecision.py

````diff
@@ -14,23 +14,26 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import context_precision
+    from ragas.metrics import LLMContextPrecisionWithReference as context_precision
 except ImportError as e:
-
-
-
-
-
-
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for ContextPrecision. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
 
 
 @tags("ragas", "llm", "retrieval_performance")
 @tasks("text_qa", "text_generation", "text_summarization", "text_classification")
 def ContextPrecision(
     dataset,
-
-
-
+    user_input_column: str = "user_input",
+    retrieved_contexts_column: str = "retrieved_contexts",
+    reference_column: str = "reference",
 ):  # noqa: B950
     """
     Context Precision is a metric that evaluates whether all of the ground-truth
@@ -53,22 +56,22 @@ def ContextPrecision(
 
     This metric requires the following columns in your dataset:
 
-    - `
-    - `
+    - `user_input` (str): The text query that was input into the model.
+    - `retrieved_contexts` (List[str]): A list of text contexts which are retrieved and which
       will be evaluated to make sure they contain relevant info in the correct order.
-    - `
+    - `reference` (str): The ground truth text to compare with the retrieved contexts.
 
     If the above data is not in the appropriate column, you can specify different column
-    names for these fields using the parameters `
-    and `
+    names for these fields using the parameters `user_input_column`, `retrieved_contexts_column`
+    and `reference_column`.
 
     For example, if your dataset has this data stored in different columns, you can
     pass the following parameters:
     ```python
     {
-        "
-        "
-        "
+        "user_input_column": "question",
+        "retrieved_contexts_column": "context_info",
+        "reference_column": "my_ground_truth_col",
     }
     ```
 
@@ -77,8 +80,8 @@ def ContextPrecision(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "
-        "
+        "retrieved_contexts_column": f"{pred_col}.retrieved_contexts",
+        "reference_column": "my_ground_truth_col",
     }
     ```
 
@@ -86,8 +89,8 @@ def ContextPrecision(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "
-        "
+        "retrieved_contexts_column": lambda x: [x[pred_col]["context_message"]],
+        "reference_column": "my_ground_truth_col",
     }
     ```
     """
@@ -98,32 +101,34 @@ def ContextPrecision(
     )
 
     required_columns = {
-        "
-        "
-        "
+        "user_input": user_input_column,
+        "retrieved_contexts": retrieved_contexts_column,
+        "reference": reference_column,
     }
 
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df), metrics=[context_precision], **get_ragas_config()
+        Dataset.from_pandas(df), metrics=[context_precision()], **get_ragas_config()
     ).to_pandas()
 
-
-
+    score_column = "llm_context_precision_with_reference"
+
+    fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
+    fig_box = px.box(x=result_df[score_column].to_list())
 
     return (
         {
-            # "Scores (will not be uploaded to
-            #     ["
+            # "Scores (will not be uploaded to ValidMind Platform)": result_df[
+            #     ["user_input", "retrieved_contexts", "reference", "llm_context_precision_with_reference"]
             # ],
             "Aggregate Scores": [
                 {
-                    "Mean Score": result_df[
-                    "Median Score": result_df[
-                    "Max Score": result_df[
-                    "Min Score": result_df[
-                    "Standard Deviation": result_df[
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
                     "Count": result_df.shape[0],
                 }
             ],
````
validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py (new file)

````diff
@@ -0,0 +1,133 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+
+from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
+
+from .utils import get_ragas_config, get_renamed_columns
+
+try:
+    from ragas import evaluate
+    from ragas.metrics import LLMContextPrecisionWithoutReference as context_precision
+except ImportError as e:
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for ContextPrecision. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
+
+
+@tags("ragas", "llm", "retrieval_performance")
+@tasks("text_qa", "text_generation", "text_summarization", "text_classification")
+def ContextPrecisionWithoutReference(
+    dataset,
+    user_input_column: str = "user_input",
+    retrieved_contexts_column: str = "retrieved_contexts",
+    response_column: str = "response",
+):  # noqa: B950
+    """
+    Context Precision Without Reference is a metric used to evaluate the relevance of
+    retrieved contexts compared to the expected response for a given user input. This
+    metric compares each retrieved context (or chunk) with the response to estimate
+    if the retrieved context is relevant.
+
+    This metric can be used when you have both retrieved contexts and associated
+    reference contexts for a `user_input`. Using a Language Model (LLM), it determines
+    the relevance of each retrieved context by comparing it directly with the response,
+    producing scores between 0 and 1, where higher scores indicate better precision in
+    retrieving relevant contexts.
+
+    ### Configuring Columns
+
+    This metric requires the following columns in your dataset:
+
+    - `user_input` (str): The user query or input to the model.
+    - `retrieved_contexts` (List[str]): A list of text contexts retrieved for the
+      user input that will be evaluated for relevance.
+    - `response` (str): The model’s output response associated with the user input.
+
+    If your dataset stores this data in different columns, you can specify alternate
+    column names using the parameters `user_input_column`, `retrieved_contexts_column`,
+    and `response_column`.
+
+    Example configuration for custom column names:
+    ```python
+    {
+        "user_input_column": "user_query",
+        "retrieved_contexts_column": "retrieved_texts",
+        "response_column": "model_output",
+    }
+    ```
+
+    For datasets with data stored as dictionaries in other columns, specify the
+    column and key like so:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "retrieved_contexts_column": f"{pred_col}.contexts",
+        "response_column": f"{pred_col}.response",
+    }
+    ```
+
+    Alternatively, for complex situations, you may use a function to extract data:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "retrieved_contexts_column": lambda x: [x[pred_col]["context_message"]],
+        "response_column": "my_response_col",
+    }
+    ```
+    """
+
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "user_input": user_input_column,
+        "retrieved_contexts": retrieved_contexts_column,
+        "response": response_column,
+    }
+
+    df = get_renamed_columns(dataset._df, required_columns)
+
+    result_df = evaluate(
+        Dataset.from_pandas(df), metrics=[context_precision()], **get_ragas_config()
+    ).to_pandas()
+
+    score_column = "llm_context_precision_without_reference"
+
+    fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
+    fig_box = px.box(x=result_df[score_column].to_list())
+
+    return (
+        {
+            # "Scores (will not be uploaded to ValidMind Platform)": result_df[
+            #     ["user_input", "retrieved_contexts", "response", "llm_context_precision_without_reference"]
+            # ],
+            "Aggregate Scores": [
+                {
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
+                    "Count": result_df.shape[0],
+                }
+            ],
+        },
+        fig_histogram,
+        fig_box,
+    )
````
validmind/tests/model_validation/ragas/ContextRecall.py

````diff
@@ -14,23 +14,26 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import context_recall
+    from ragas.metrics import LLMContextRecall as context_recall
 except ImportError as e:
-
-
-
-
-
-
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for ContextRecall. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
 
 
 @tags("ragas", "llm", "retrieval_performance")
 @tasks("text_qa", "text_generation", "text_summarization", "text_classification")
 def ContextRecall(
     dataset,
-
-
-
+    user_input_column: str = "user_input",
+    retrieved_contexts_column: str = "retrieved_contexts",
+    reference_column: str = "reference",
 ):
     """
     Context recall measures the extent to which the retrieved context aligns with the
@@ -53,22 +56,22 @@ def ContextRecall(
 
     This metric requires the following columns in your dataset:
 
-    - `
-    - `
-      will be evaluated to make sure they contain all items in the ground truth.
-    - `
+    - `user_input` (str): The text query that was input into the model.
+    - `retrieved_contexts` (List[str]): A list of text contexts which are retrieved and
+      which will be evaluated to make sure they contain all items in the ground truth.
+    - `reference` (str): The ground truth text to compare with the retrieved contexts.
 
     If the above data is not in the appropriate column, you can specify different column
-    names for these fields using the parameters `
-    and `
+    names for these fields using the parameters `user_input_column`,
+    `retrieved_contexts_column` and `reference_column`.
 
     For example, if your dataset has this data stored in different columns, you can
     pass the following parameters:
     ```python
     {
-        "
-        "
-        "
+        "user_input_column": "user_input",
+        "retrieved_contexts_column": "retrieved_contexts",
+        "reference_column": "reference",
     }
     ```
 
@@ -77,8 +80,8 @@ def ContextRecall(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "
-        "
+        "retrieved_contexts_column": f"{pred_col}.retrieved_contexts",
+        "reference_column": f"{pred_col}.reference",
     }
     ```
 
@@ -86,8 +89,8 @@ def ContextRecall(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "
-        "
+        "retrieved_contexts_column": lambda x: [x[pred_col]["retrieved_contexts"]],
+        "reference_column": "my_ground_truth_col",
     }
     ```
     """
@@ -98,32 +101,34 @@ def ContextRecall(
     )
 
     required_columns = {
-        "
-        "
-        "
+        "user_input": user_input_column,
+        "retrieved_contexts": retrieved_contexts_column,
+        "reference": reference_column,
     }
 
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df), metrics=[context_recall], **get_ragas_config()
+        Dataset.from_pandas(df), metrics=[context_recall()], **get_ragas_config()
     ).to_pandas()
 
-
-
+    score_column = "context_recall"
+
+    fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
+    fig_box = px.box(x=result_df[score_column].to_list())
 
     return (
        {
-            # "Scores (will not be uploaded to
+            # "Scores (will not be uploaded to ValidMind Platform)": result_df[
             #     ["question", "contexts", "ground_truth", "context_recall"]
             # ],
             "Aggregate Scores": [
                 {
-                    "Mean Score": result_df[
-                    "Median Score": result_df[
-                    "Max Score": result_df[
-                    "Min Score": result_df[
-                    "Standard Deviation": result_df[
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
                     "Count": result_df.shape[0],
                 }
             ],
````
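
All four updated ragas tests now return the same shape of result: a table of aggregate statistics over a single score column plus a histogram and a box plot. The self-contained sketch below reproduces that aggregation pattern with made-up scores; it mirrors the code in the diffs above but is not part of the package.

```python
import pandas as pd
import plotly.express as px

# Stand-in scores; in the real tests this column comes from ragas' evaluate().
result_df = pd.DataFrame({"context_recall": [0.2, 0.5, 0.75, 0.9, 1.0]})
score_column = "context_recall"

aggregate_scores = [
    {
        "Mean Score": result_df[score_column].mean(),
        "Median Score": result_df[score_column].median(),
        "Max Score": result_df[score_column].max(),
        "Min Score": result_df[score_column].min(),
        "Standard Deviation": result_df[score_column].std(),
        "Count": result_df.shape[0],
    }
]

fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
fig_box = px.box(x=result_df[score_column].to_list())

print(aggregate_scores)
```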