validmind 2.1.1__py3-none-any.whl → 2.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai.py +72 -49
- validmind/api_client.py +42 -16
- validmind/client.py +68 -25
- validmind/datasets/llm/rag/__init__.py +11 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
- validmind/datasets/llm/rag/rfp.py +41 -0
- validmind/errors.py +1 -1
- validmind/html_templates/__init__.py +0 -0
- validmind/html_templates/content_blocks.py +89 -14
- validmind/models/__init__.py +7 -4
- validmind/models/foundation.py +8 -34
- validmind/models/function.py +51 -0
- validmind/models/huggingface.py +16 -46
- validmind/models/metadata.py +42 -0
- validmind/models/pipeline.py +66 -0
- validmind/models/pytorch.py +8 -42
- validmind/models/r_model.py +33 -82
- validmind/models/sklearn.py +39 -38
- validmind/template.py +8 -26
- validmind/tests/__init__.py +43 -20
- validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
- validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
- validmind/tests/data_validation/Duplicates.py +1 -1
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
- validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
- validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
- validmind/tests/data_validation/nlp/Punctuations.py +11 -12
- validmind/tests/data_validation/nlp/Sentiment.py +57 -0
- validmind/tests/data_validation/nlp/Toxicity.py +45 -0
- validmind/tests/decorator.py +12 -7
- validmind/tests/model_validation/BertScore.py +100 -98
- validmind/tests/model_validation/BleuScore.py +93 -64
- validmind/tests/model_validation/ContextualRecall.py +74 -91
- validmind/tests/model_validation/MeteorScore.py +86 -74
- validmind/tests/model_validation/RegardScore.py +103 -121
- validmind/tests/model_validation/RougeScore.py +118 -0
- validmind/tests/model_validation/TokenDisparity.py +84 -121
- validmind/tests/model_validation/ToxicityScore.py +109 -123
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
- validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
- validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
- validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
- validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
- validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
- validmind/tests/model_validation/ragas/utils.py +66 -0
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
- validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -11
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
- validmind/unit_metrics/__init__.py +26 -49
- validmind/unit_metrics/composite.py +13 -7
- validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
- validmind/utils.py +99 -6
- validmind/vm_models/__init__.py +1 -1
- validmind/vm_models/dataset/__init__.py +7 -0
- validmind/vm_models/dataset/dataset.py +560 -0
- validmind/vm_models/dataset/utils.py +146 -0
- validmind/vm_models/model.py +97 -72
- validmind/vm_models/test/metric.py +9 -24
- validmind/vm_models/test/result_wrapper.py +124 -28
- validmind/vm_models/test/threshold_test.py +10 -28
- validmind/vm_models/test_context.py +1 -1
- validmind/vm_models/test_suite/summary.py +3 -4
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/METADATA +5 -3
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/RECORD +103 -78
- validmind/models/catboost.py +0 -33
- validmind/models/statsmodels.py +0 -50
- validmind/models/xgboost.py +0 -30
- validmind/tests/model_validation/BertScoreAggregate.py +0 -90
- validmind/tests/model_validation/RegardHistogram.py +0 -148
- validmind/tests/model_validation/RougeMetrics.py +0 -147
- validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
- validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
- validmind/tests/model_validation/ToxicityHistogram.py +0 -136
- validmind/vm_models/dataset.py +0 -1303
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/LICENSE +0 -0
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/WHEEL +0 -0
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/entry_points.txt +0 -0
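
The hunks below add a new family of `ragas`-based tests under `validmind/tests/model_validation/ragas/`. As a rough illustration (not part of the diff itself), a test such as `AnswerRelevance` would typically be invoked through the library's `run_test` entry point, mirroring the `run_test` usage shown in the `AspectCritique` docstring further down; the dataset variable and column names here are placeholders:

```python
# Hypothetical invocation sketch; `vm_dataset` stands for a dataset that has
# already been initialized with validmind (e.g. via vm.init_dataset).
import validmind as vm

vm.tests.run_test(
    "validmind.model_validation.ragas.AnswerRelevance",
    inputs={"dataset": vm_dataset},
    params={
        "question_column": "input_text",    # placeholder column names
        "contexts_column": "context_info",
        "answer_column": "output_text",
    },
)
```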
validmind/tests/model_validation/ragas/AnswerRelevance.py
@@ -0,0 +1,134 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+from ragas import evaluate
+from ragas.metrics import answer_relevancy
+
+from validmind import tags, tasks
+
+from .utils import get_renamed_columns
+
+
+@tags("ragas", "llm", "rag_performance")
+@tasks("text_qa", "text_generation", "text_summarization")
+def AnswerRelevance(
+    dataset,
+    question_column="question",
+    contexts_column="contexts",
+    answer_column="answer",
+):
+    """
+    Assesses how pertinent the generated answer is to the given prompt.
+
+    The evaluation metric, Answer Relevancy, focuses on assessing how pertinent the
+    generated answer is to the given prompt. A lower score is assigned to answers that
+    are incomplete or contain redundant information and higher scores indicate better
+    relevancy. This metric is computed using the `question`, the `contexts` and the
+    `answer`.
+
+    The Answer Relevancy is defined as the mean cosine similarity of the original
+    `question` to a number of artificial questions, which are generated (reverse-engineered)
+    based on the `answer`:
+
+    $$
+    \\text{answer relevancy} = \\frac{1}{N} \\sum_{i=1}^{N} cos(E_{g_i}, E_o)
+    $$
+    $$
+    \\text{answer relevancy} = \\frac{1}{N} \\sum_{i=1}^{N} \\frac{E_{g_i} \\cdot E_o}{\\|E_{g_i}\\|\\|E_o\\|}
+    $$
+
+    Where:
+    - $E_{g_i}$ is the embedding of the generated question $i$.
+    - $E_o$ is the embedding of the original question.
+    - $N$ is the number of generated questions (3 by default).
+
+    **Note**: *This is a reference-free metric, meaning that it does not require a
+    `ground_truth` answer to compare against. A similar metric that does evaluate the
+    correctness of a generated answer with respect to a `ground_truth` answer is
+    `validmind.model_validation.ragas.AnswerCorrectness`.*
+
+    ### Configuring Columns
+
+    This metric requires the following columns in your dataset:
+    - `question` (str): The text query that was input into the model.
+    - `contexts` (List[str]): Any contextual information retrieved by the model before
+        generating an answer.
+    - `answer` (str): The response generated by the model.
+
+    If the above data is not in the appropriate column, you can specify different column
+    names for these fields using the parameters `question_column`, `answer_column`, and
+    `contexts_column`.
+
+    For example, if your dataset has this data stored in different columns, you can
+    pass the following parameters:
+    ```python
+    params = {
+        "question_column": "input_text",
+        "answer_column": "output_text",
+        "contexts_column": "context_info"
+    }
+    ```
+
+    If answer and contexts are stored as a dictionary in another column, specify the
+    column and key like this:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "answer_column": f"{pred_col}.generated_answer",
+        "contexts_column": f"{pred_col}.contexts",
+    }
+    ```
+
+    For more complex data structures, you can use a function to extract the answers:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+        "contexts_column": lambda row: [row[pred_col]["context_message"]],
+    }
+    ```
+    """
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "question": question_column,
+        "answer": answer_column,
+        "contexts": contexts_column,
+    }
+
+    df = get_renamed_columns(dataset.df, required_columns)
+
+    result_df = evaluate(
+        Dataset.from_pandas(df),
+        metrics=[answer_relevancy],
+    ).to_pandas()
+
+    fig_histogram = px.histogram(x=result_df["answer_relevancy"].to_list(), nbins=10)
+    fig_box = px.box(x=result_df["answer_relevancy"].to_list())
+
+    return (
+        {
+            "Scores": result_df[["question", "contexts", "answer", "answer_relevancy"]],
+            "Aggregate Scores": [
+                {
+                    "Mean Score": result_df["answer_relevancy"].mean(),
+                    "Median Score": result_df["answer_relevancy"].median(),
+                    "Max Score": result_df["answer_relevancy"].max(),
+                    "Min Score": result_df["answer_relevancy"].min(),
+                    "Standard Deviation": result_df["answer_relevancy"].std(),
+                    "Count": len(result_df),
+                }
+            ],
+        },
+        fig_histogram,
+        fig_box,
+    )
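
Each of the ragas tests in this diff routes its column parameters through `get_renamed_columns` from the new `utils.py` (+66 lines, not shown here). A minimal sketch of the behavior the docstrings describe — plain column names, `"column.key"` dictionary access, and per-row callables — might look like the following; the helper name and implementation details are assumptions, not the shipped code:

```python
# Hypothetical sketch only; the real get_renamed_columns lives in
# validmind/tests/model_validation/ragas/utils.py and may differ in detail.
import pandas as pd


def get_renamed_columns_sketch(df: pd.DataFrame, column_map: dict) -> pd.DataFrame:
    """Build a DataFrame whose column names match what the ragas metric expects."""
    out = pd.DataFrame(index=df.index)
    for target, spec in column_map.items():
        if callable(spec):
            # e.g. lambda row: "\n\n".join(row[pred_col]["messages"])
            out[target] = df.apply(spec, axis=1)
        elif isinstance(spec, str) and "." in spec and spec not in df.columns:
            # "prediction_col.key" -> pull a key out of a dict-valued column
            col, key = spec.split(".", 1)
            out[target] = df[col].apply(lambda value: value[key])
        else:
            # plain column name
            out[target] = df[spec]
    return out
```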
validmind/tests/model_validation/ragas/AnswerSimilarity.py
@@ -0,0 +1,119 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+from ragas import evaluate
+from ragas.metrics import answer_similarity
+
+from validmind import tags, tasks
+
+from .utils import get_renamed_columns
+
+
+@tags("ragas", "llm")
+@tasks("text_qa", "text_generation", "text_summarization")
+def AnswerSimilarity(
+    dataset,
+    answer_column="answer",
+    ground_truth_column="ground_truth",
+):
+    """
+    Calculates the semantic similarity between generated answers and ground truths.
+
+    The concept of Answer Semantic Similarity pertains to the assessment of the semantic
+    resemblance between the generated answer and the ground truth. This evaluation is
+    based on the `ground_truth` and the `answer`, with values falling within the range
+    of 0 to 1. A higher score signifies a better alignment between the generated answer
+    and the ground truth.
+
+    Measuring the semantic similarity between answers can offer valuable insights into
+    the quality of the generated response. This evaluation utilizes a cross-encoder
+    model to calculate the semantic similarity score.
+
+    See this paper for more details: https://arxiv.org/pdf/2108.06130.pdf
+
+    The following steps are involved in computing the answer similarity score:
+    1. Vectorize the ground truth answer using the specified embedding model.
+    2. Vectorize the generated answer using the same embedding model.
+    3. Compute the cosine similarity between the two vectors.
+
+    ### Configuring Columns
+
+    This metric requires the following columns in your dataset:
+    - `answer` (str): The text response generated by the model.
+    - `ground_truth` (str): The ground truth answer that the generated answer is compared
+        against.
+
+    If the above data is not in the appropriate column, you can specify different column
+    names for these fields using the parameters `answer_column` and `ground_truth_column`.
+
+    For example, if your dataset has this data stored in different columns, you can
+    pass the following parameters:
+    ```python
+    {
+        "answer_column": "llm_output_col",
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    If the answer is stored as a dictionary in another column, specify the column and key
+    like this:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "answer_column": f"{pred_col}.generated_answer",
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    For more complex situations, you can use a function to extract the data:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+    """
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "answer": answer_column,
+        "ground_truth": ground_truth_column,
+    }
+
+    df = get_renamed_columns(dataset.df, required_columns)
+
+    result_df = evaluate(
+        Dataset.from_pandas(df),
+        metrics=[answer_similarity],
+    ).to_pandas()
+
+    fig_histogram = px.histogram(x=result_df["answer_similarity"].to_list(), nbins=10)
+    fig_box = px.box(x=result_df["answer_similarity"].to_list())
+
+    return (
+        {
+            "Scores": result_df[["answer", "ground_truth", "answer_similarity"]],
+            "Aggregate Scores": [
+                {
+                    "Mean Score": result_df["answer_similarity"].mean(),
+                    "Median Score": result_df["answer_similarity"].median(),
+                    "Max Score": result_df["answer_similarity"].max(),
+                    "Min Score": result_df["answer_similarity"].min(),
+                    "Standard Deviation": result_df["answer_similarity"].std(),
+                    "Count": len(result_df),
+                }
+            ],
+        },
+        fig_histogram,
+        fig_box,
+    )
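
The three steps listed in the `AnswerSimilarity` docstring reduce to a cosine similarity between two embedding vectors. A minimal numeric illustration, using made-up three-dimensional embeddings rather than output from any real embedding model:

```python
# Toy illustration of step 3 above; the vectors are placeholders, not real embeddings.
import numpy as np


def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


answer_vec = np.array([0.1, 0.7, 0.2])        # hypothetical embedding of the answer
ground_truth_vec = np.array([0.1, 0.6, 0.3])  # hypothetical embedding of the ground truth
print(cosine_similarity(answer_vec, ground_truth_vec))  # ~0.98
```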
validmind/tests/model_validation/ragas/AspectCritique.py
@@ -0,0 +1,167 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+from ragas import evaluate
+from ragas.metrics.critique import AspectCritique as _AspectCritique
+from ragas.metrics.critique import (
+    coherence,
+    conciseness,
+    correctness,
+    harmfulness,
+    maliciousness,
+)
+
+from validmind import tags, tasks
+
+from .utils import get_renamed_columns
+
+aspect_map = {
+    "coherence": coherence,
+    "conciseness": conciseness,
+    "correctness": correctness,
+    "harmfulness": harmfulness,
+    "maliciousness": maliciousness,
+}
+
+
+@tags("ragas", "llm", "qualitative")
+@tasks("text_summarization", "text_generation", "text_qa")
+def AspectCritique(
+    dataset,
+    question_column="question",
+    answer_column="answer",
+    contexts_column="contexts",
+    aspects: list = [
+        "coherence",
+        "conciseness",
+        "correctness",
+        "harmfulness",
+        "maliciousness",
+    ],
+    additional_aspects: list = [],
+):
+    """
+    Evaluates generations against the following aspects: harmfulness, maliciousness,
+    coherence, correctness, and conciseness.
+
+    ### Overview:
+
+    This is designed to assess submissions against predefined and user-defined "aspects".
+    For each aspect, a judge LLM is prompted to critique a piece of generated text based
+    on a description of the aspect. The output of this evaluation is a binary (0/1 = yes/no)
+    score that indicates whether the submission aligns with the defined aspect or not.
+
+    ### Inputs and Outputs:
+
+    The input to this metric is a dataset containing the input `question` (prompt to the LLM)
+    and the `answer` (text generated by the LLM). Any retrieved `contexts` can also be
+    included to enhance the evaluation.
+
+    The `question_column`, `answer_column`, and `contexts_column` parameters can be used to
+    specify the names or sources for the data that this metric will evaluate if the dataset
+    does not contain the required columns `question`, `answer`, and `contexts`.
+
+    By default, the aspects evaluated are harmfulness, maliciousness, coherence,
+    correctness, and conciseness. To change the aspects evaluated, the `aspects` parameter
+    can be set to a list containing any of these aspects.
+
+    To add custom aspects, the `additional_aspects` parameter can be passed as a list
+    of tuples where each tuple contains the aspect name and a description of the aspect
+    that the judge LLM will use to critique the submission.
+
+    The output of this metric is a table of scores for each aspect where the aspect score
+    is the number of "yes" scores divided by the total number of submissions:
+    $$
+    \\text{aspect score} = \\frac{\\text{number of "yes" scores}}{\\text{total number of submissions}}
+    $$
+
+    ### Examples:
+
+    - **Mapping to Required Columns:** If the dataset does not contain the columns required
+    to run this metric (i.e., `question`, `answer`, and `contexts`), the column parameters can be used to map them as shown below:
+
+    ```python
+    pred_col = my_vm_dataset.prediction_column(my_vm_model)
+    run_test(
+        "validmind.model_validation.ragas.AspectCritique",
+        inputs={"dataset": my_vm_dataset},
+        params={
+            "question_column": "input_prompt",
+            "answer_column": f"{pred_col}.llm_output",
+            "contexts_column": lambda row: [row[pred_col]["context_message"]],
+        },
+    )
+    ```
+
+    - **Custom Aspects:** To evaluate custom aspects, the `additional_aspects` parameter can
+    be set to a list of tuples where each tuple contains the aspect name and a description
+    of the aspect that the judge LLM will use to critique the submission. For example, to
+    evaluate whether the LLM-generated text has a "professional tone", the `additional_aspects`
+    parameter can be set like this:
+
+    ```python
+    run_test(
+        "validmind.model_validation.ragas.AspectCritique",
+        inputs={"dataset": my_vm_dataset},
+        params={
+            "additional_aspects": [
+                ("professionalism", "Does the text have a professional tone?"),
+            ],
+        },
+    )
+    ```
+    """
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "question": question_column,
+        "answer": answer_column,
+        "contexts": contexts_column,
+    }
+
+    df = get_renamed_columns(dataset.df, required_columns)
+
+    built_in_aspects = [aspect_map[aspect] for aspect in aspects]
+    custom_aspects = [
+        _AspectCritique(name=name, definition=description)
+        for name, description in additional_aspects
+    ]
+    all_aspects = [*built_in_aspects, *custom_aspects]
+
+    result_df = evaluate(Dataset.from_pandas(df), metrics=all_aspects).to_pandas()
+
+    df_melted = result_df.melt(
+        id_vars=["question", "answer", "contexts"],
+        value_vars=[aspect.name for aspect in all_aspects],
+        var_name="Metric",
+        value_name="Result",
+    )
+    df_counts = df_melted.groupby(["Metric", "Result"]).size().reset_index(name="Count")
+    df_counts["Result"] = df_counts["Result"].map({0: "Fail", 1: "Pass"})
+
+    fig = px.bar(
+        df_counts,
+        x="Metric",
+        y="Count",
+        color="Result",
+        color_discrete_map={"Fail": "red", "Pass": "green"},
+        labels={"Count": "Pass vs Fail Count", "Metric": "Aspect Name"},
+        barmode="group",
+        title="Aspect Critique Results",
+    )
+
+    return {
+        "Aspect Scores": [
+            {"Aspect": aspect, "Score": result_df[aspect].mean()}
+            for aspect in aspects + [aspect.name for aspect in custom_aspects]
+        ]
+    }, fig
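
As a worked illustration of the aspect-score formula in the `AspectCritique` docstring, with hypothetical numbers: if 8 out of 10 submissions receive a "yes" verdict for coherence, the reported coherence score is

$$
\text{coherence score} = \frac{8}{10} = 0.8
$$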
validmind/tests/model_validation/ragas/ContextEntityRecall.py
@@ -0,0 +1,133 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+from ragas import evaluate
+from ragas.metrics import context_entity_recall
+
+from validmind import tags, tasks
+
+from .utils import get_renamed_columns
+
+
+@tags("ragas", "llm", "retrieval_performance")
+@tasks("text_qa", "text_generation", "text_summarization")
+def ContextEntityRecall(
+    dataset,
+    contexts_column: str = "contexts",
+    ground_truth_column: str = "ground_truth",
+):
+    """
+    Evaluates the context entity recall for dataset entries and visualizes the results.
+
+    ### Overview
+
+    This metric gives the measure of recall of the retrieved context, based on the
+    number of entities present in both `ground_truths` and `contexts` relative to the
+    number of entities present in the `ground_truths` alone. Simply put, it measures
+    the fraction of entities from `ground_truths` that are recalled. This metric is
+    useful in fact-based use cases like tourism help desks, historical QA, etc. It
+    can help evaluate the retrieval mechanism for entities, based on comparison
+    with entities present in `ground_truths`, because in cases where entities matter,
+    we need the `contexts` which cover them.
+
+    ### Formula
+
+    To compute this metric, we use two sets, $GE$ and $CE$, representing the set of
+    entities present in `ground_truths` and the set of entities present in `contexts`
+    respectively. We then take the number of elements in the intersection of these sets
+    and divide it by the number of elements present in $GE$, given by the formula:
+
+    $$
+    \\text{context entity recall} = \\frac{| CE \\cap GE |}{| GE |}
+    $$
+
+    ### Configuring Columns
+
+    This metric requires the following columns in your dataset:
+    - `contexts` (List[str]): A list of text contexts which will be evaluated to make
+        sure they contain the entities present in the ground truth.
+    - `ground_truth` (str): The ground truth text from which the entities will be
+        extracted and compared with the entities in the `contexts`.
+
+    If the above data is not in the appropriate column, you can specify different column
+    names for these fields using the parameters `contexts_column` and `ground_truth_column`.
+
+    For example, if your dataset has this data stored in different columns, you can
+    pass the following parameters:
+    ```python
+    {
+        "contexts_column": "context_info",
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    If the data is stored as a dictionary in another column, specify the column and key
+    like this:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": f"{pred_col}.contexts",
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    For more complex situations, you can use a function to extract the data:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": lambda row: [row[pred_col]["context_message"]],
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+    """
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "ground_truth": ground_truth_column,
+        "contexts": contexts_column,
+    }
+
+    df = get_renamed_columns(dataset.df, required_columns)
+
+    result_df = evaluate(
+        Dataset.from_pandas(df),
+        metrics=[context_entity_recall],
+    ).to_pandas()
+
+    fig_histogram = px.histogram(
+        x=result_df["context_entity_recall"].to_list(), nbins=10
+    )
+    fig_box = px.box(x=result_df["context_entity_recall"].to_list())
+
+    return (
+        {
+            "Scores": result_df[
+                [
+                    "contexts",
+                    "ground_truth",
+                    "context_entity_recall",
+                ]
+            ],
+            "Aggregate Scores": [
+                {
+                    "Mean Score": result_df["context_entity_recall"].mean(),
+                    "Median Score": result_df["context_entity_recall"].median(),
+                    "Max Score": result_df["context_entity_recall"].max(),
+                    "Min Score": result_df["context_entity_recall"].min(),
+                    "Standard Deviation": result_df["context_entity_recall"].std(),
+                    "Count": len(result_df),
+                }
+            ],
+        },
+        fig_histogram,
+        fig_box,
+    )
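
A worked illustration of the context entity recall formula above, with hypothetical entity sets: if the ground truth mentions the entities {"Paris", "Eiffel Tower", "1889"} and the retrieved contexts mention only {"Paris", "Eiffel Tower"}, then

$$
\text{context entity recall} = \frac{| CE \cap GE |}{| GE |} = \frac{2}{3} \approx 0.67
$$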
validmind/tests/model_validation/ragas/ContextPrecision.py
@@ -0,0 +1,123 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+from ragas import evaluate
+from ragas.metrics import context_precision
+
+from validmind import tags, tasks
+
+from .utils import get_renamed_columns
+
+
+@tags("ragas", "llm", "retrieval_performance")
+@tasks("text_qa", "text_generation", "text_summarization", "text_classification")
+def ContextPrecision(
+    dataset,
+    question_column: str = "question",
+    contexts_column: str = "contexts",
+    ground_truth_column: str = "ground_truth",
+):
+    """
+    Context Precision is a metric that evaluates whether all of the ground-truth
+    relevant items present in the contexts are ranked near the top or not. Ideally,
+    all the relevant chunks should appear at the top ranks. This metric is computed
+    using the `question`, `ground_truth` and the `contexts`, with values ranging
+    between 0 and 1, where higher scores indicate better precision.
+
+    $$
+    \\text{Context Precision@K} = \\frac{\\sum_{k=1}^{K} \\left( \\text{Precision@k} \\times v_k \\right)}{\\text{Total number of relevant items in the top } K \\text{ results}}
+    $$
+    $$
+    \\text{Precision@k} = {\\text{true positives@k} \\over (\\text{true positives@k} + \\text{false positives@k})}
+    $$
+
+    Where $K$ is the total number of chunks in contexts and $v_k \\in \\{0, 1\\}$ is the
+    relevance indicator at rank $k$.
+
+    ### Configuring Columns
+
+    This metric requires the following columns in your dataset:
+    - `question` (str): The text query that was input into the model.
+    - `contexts` (List[str]): A list of text contexts which are retrieved and which
+        will be evaluated to make sure they contain relevant info in the correct order.
+    - `ground_truth` (str): The ground truth text to compare with the retrieved contexts.
+
+    If the above data is not in the appropriate column, you can specify different column
+    names for these fields using the parameters `question_column`, `contexts_column`
+    and `ground_truth_column`.
+
+    For example, if your dataset has this data stored in different columns, you can
+    pass the following parameters:
+    ```python
+    {
+        "question_column": "question",
+        "contexts_column": "context_info",
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    If the data is stored as a dictionary in another column, specify the column and key
+    like this:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": f"{pred_col}.contexts",
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    For more complex situations, you can use a function to extract the data:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": lambda x: [x[pred_col]["context_message"]],
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+    """
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "question": question_column,
+        "contexts": contexts_column,
+        "ground_truth": ground_truth_column,
+    }
+
+    df = get_renamed_columns(dataset.df, required_columns)
+
+    result_df = evaluate(
+        Dataset.from_pandas(df),
+        metrics=[context_precision],
+    ).to_pandas()
+
+    fig_histogram = px.histogram(x=result_df["context_precision"].to_list(), nbins=10)
+    fig_box = px.box(x=result_df["context_precision"].to_list())
+
+    return (
+        {
+            "Scores": result_df[
+                ["question", "contexts", "ground_truth", "context_precision"]
+            ],
+            "Aggregate Scores": [
+                {
+                    "Mean Score": result_df["context_precision"].mean(),
+                    "Median Score": result_df["context_precision"].median(),
+                    "Max Score": result_df["context_precision"].max(),
+                    "Min Score": result_df["context_precision"].min(),
+                    "Standard Deviation": result_df["context_precision"].std(),
+                    "Count": len(result_df),
+                }
+            ],
+        },
+        fig_histogram,
+        fig_box,
+    )
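
A worked illustration of the Context Precision@K formula above, with hypothetical relevance indicators: suppose $K = 3$ chunks are retrieved and $v = (1, 0, 1)$, so two of the three chunks are relevant. Then $\text{Precision@1} = 1$, $\text{Precision@2} = 1/2$, $\text{Precision@3} = 2/3$, and

$$
\text{Context Precision@3} = \frac{1 \cdot 1 + \tfrac{1}{2} \cdot 0 + \tfrac{2}{3} \cdot 1}{2} = \frac{5}{6} \approx 0.83
$$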