validmind 2.1.0__py3-none-any.whl → 2.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai.py +3 -3
- validmind/api_client.py +2 -3
- validmind/client.py +68 -25
- validmind/datasets/llm/rag/__init__.py +11 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
- validmind/datasets/llm/rag/rfp.py +41 -0
- validmind/html_templates/__init__.py +0 -0
- validmind/html_templates/content_blocks.py +89 -14
- validmind/models/__init__.py +7 -4
- validmind/models/foundation.py +8 -34
- validmind/models/function.py +51 -0
- validmind/models/huggingface.py +16 -46
- validmind/models/metadata.py +42 -0
- validmind/models/pipeline.py +66 -0
- validmind/models/pytorch.py +8 -42
- validmind/models/r_model.py +33 -82
- validmind/models/sklearn.py +39 -38
- validmind/template.py +8 -26
- validmind/tests/__init__.py +43 -20
- validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
- validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
- validmind/tests/data_validation/Duplicates.py +1 -1
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
- validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
- validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
- validmind/tests/data_validation/nlp/Punctuations.py +11 -12
- validmind/tests/data_validation/nlp/Sentiment.py +57 -0
- validmind/tests/data_validation/nlp/Toxicity.py +45 -0
- validmind/tests/decorator.py +2 -2
- validmind/tests/model_validation/BertScore.py +100 -98
- validmind/tests/model_validation/BleuScore.py +93 -64
- validmind/tests/model_validation/ContextualRecall.py +74 -91
- validmind/tests/model_validation/MeteorScore.py +86 -74
- validmind/tests/model_validation/RegardScore.py +103 -121
- validmind/tests/model_validation/RougeScore.py +118 -0
- validmind/tests/model_validation/TokenDisparity.py +84 -121
- validmind/tests/model_validation/ToxicityScore.py +109 -123
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
- validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
- validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
- validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
- validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
- validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
- validmind/tests/model_validation/ragas/utils.py +66 -0
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
- validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +14 -12
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
- validmind/unit_metrics/__init__.py +26 -49
- validmind/unit_metrics/composite.py +5 -1
- validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
- validmind/utils.py +56 -6
- validmind/vm_models/__init__.py +1 -1
- validmind/vm_models/dataset/__init__.py +7 -0
- validmind/vm_models/dataset/dataset.py +558 -0
- validmind/vm_models/dataset/utils.py +146 -0
- validmind/vm_models/model.py +97 -72
- validmind/vm_models/test/result_wrapper.py +61 -24
- validmind/vm_models/test_context.py +1 -1
- validmind/vm_models/test_suite/summary.py +3 -4
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/METADATA +5 -3
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/RECORD +100 -75
- validmind/models/catboost.py +0 -33
- validmind/models/statsmodels.py +0 -50
- validmind/models/xgboost.py +0 -30
- validmind/tests/model_validation/BertScoreAggregate.py +0 -90
- validmind/tests/model_validation/RegardHistogram.py +0 -148
- validmind/tests/model_validation/RougeMetrics.py +0 -147
- validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
- validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
- validmind/tests/model_validation/ToxicityHistogram.py +0 -136
- validmind/vm_models/dataset.py +0 -1303
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/LICENSE +0 -0
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/WHEEL +0 -0
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/ragas/AnswerCorrectness.py
@@ -0,0 +1,131 @@
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

import warnings

import plotly.express as px
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_correctness

from validmind import tags, tasks

from .utils import get_renamed_columns


@tags("ragas", "llm")
@tasks("text_qa", "text_generation", "text_summarization")
def AnswerCorrectness(
    dataset,
    question_column="question",
    answer_column="answer",
    ground_truth_column="ground_truth",
):
    """
    Evaluates the correctness of answers in a dataset with respect to the provided ground
    truths and visualizes the results in a histogram.

    The assessment of Answer Correctness involves gauging the accuracy of the generated
    answer when compared to the ground truth. This evaluation relies on the `ground truth`
    and the `answer`, with scores ranging from 0 to 1. A higher score indicates a closer
    alignment between the generated answer and the ground truth, signifying better
    correctness.

    Answer correctness encompasses two critical aspects: semantic similarity between the
    generated answer and the ground truth, as well as factual similarity. These aspects
    are combined using a weighted scheme to formulate the answer correctness score. Users
    also have the option to employ a `threshold` value to round the resulting score to
    a binary value (0 or 1) based on the threshold.

    Factual correctness quantifies the factual overlap between the generated answer and
    the ground truth answer. This is done using the concepts of:

    - TP (True Positive): Facts or statements that are present in both the ground truth
      and the generated answer.
    - FP (False Positive): Facts or statements that are present in the generated answer
      but not in the ground truth.
    - FN (False Negative): Facts or statements that are present in the ground truth but
      not in the generated answer.

    ### Configuring Columns

    This metric requires specific columns to be present in the dataset:
    - `question` (str): The text prompt or query that was input into the model.
    - `answer` (str): The text response generated by the model.
    - `ground_truth` (str): The ground truth answer that the generated answer is compared
      against.

    If the above data is not in the appropriate column, you can specify different column
    names for these fields using the parameters `question_column`, `answer_column`, and
    `ground_truth_column`.

    For example, if your dataset has this data stored in different columns, you can
    pass the following parameters:
    ```python
    params = {
        "question_column": "input_text",
        "answer_column": "output_text",
        "ground_truth_column": "human_answer",
    }
    ```

    If answer and contexts are stored as a dictionary in another column, specify the
    column and key like this:
    ```python
    pred_col = dataset.prediction_column(model)
    params = {
        "answer_column": f"{pred_col}.generated_answer",
        "ground_truth_column": f"{pred_col}.contexts",
    }
    ```

    For more complex data structures, you can use a function to extract the answers:
    ```python
    pred_col = dataset.prediction_column(model)
    params = {
        "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
        "ground_truth_column": lambda row: [row[pred_col]["context_message"]],
    }
    ```
    """
    warnings.filterwarnings(
        "ignore",
        category=FutureWarning,
        message="promote has been superseded by promote_options='default'.",
    )

    required_columns = {
        "question": question_column,
        "answer": answer_column,
        "ground_truth": ground_truth_column,
    }

    df = get_renamed_columns(dataset.df, required_columns)

    result_df = evaluate(
        Dataset.from_pandas(df), metrics=[answer_correctness]
    ).to_pandas()

    fig_histogram = px.histogram(x=result_df["answer_correctness"].to_list(), nbins=10)
    fig_box = px.box(x=result_df["answer_correctness"].to_list())

    return (
        {
            "Scores": result_df[
                ["question", "answer", "ground_truth", "answer_correctness"]
            ],
            "Aggregate Scores": [
                {
                    "Mean Score": result_df["answer_correctness"].mean(),
                    "Median Score": result_df["answer_correctness"].median(),
                    "Max Score": result_df["answer_correctness"].max(),
                    "Min Score": result_df["answer_correctness"].min(),
                    "Standard Deviation": result_df["answer_correctness"].std(),
                    "Count": len(result_df),
                }
            ],
        },
        fig_histogram,
        fig_box,
    )
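
Note: the column-mapping styles described in the docstring above (a plain column name, a `"column.key"` lookup into a dictionary column, and a per-row callable) are resolved by `get_renamed_columns` from the neighbouring `utils.py`, which this diff adds (+66 lines) but whose body is not shown here. The snippet below is only a rough, hypothetical sketch of that behaviour with a made-up DataFrame and helper name — not the actual implementation:

```python
# Illustrative sketch only; sketch_get_renamed_columns and the sample data are
# invented here to mimic the three mapping styles the docstrings describe.
import pandas as pd


def sketch_get_renamed_columns(df: pd.DataFrame, column_map: dict) -> pd.DataFrame:
    out = pd.DataFrame(index=df.index)
    for target, source in column_map.items():
        if callable(source):  # e.g. lambda row: ...
            out[target] = df.apply(source, axis=1)
        elif isinstance(source, str) and "." in source:  # e.g. "prediction.generated_answer"
            col, key = source.split(".", 1)
            out[target] = df[col].apply(lambda value: value[key])
        else:  # plain column name
            out[target] = df[source]
    return out


df = pd.DataFrame(
    {
        "input_text": ["What is the capital of France?"],
        "prediction": [{"generated_answer": "Paris", "messages": ["Paris is the capital."]}],
        "human_answer": ["Paris"],
    }
)
renamed = sketch_get_renamed_columns(
    df,
    {
        "question": "input_text",
        "answer": "prediction.generated_answer",
        "ground_truth": "human_answer",
    },
)
print(renamed.columns.tolist())  # ['question', 'answer', 'ground_truth']
```

Whatever the real helper does internally, the point is the same: every ragas test in this diff first normalizes the user's columns into the names ragas expects (`question`, `answer`, `contexts`, `ground_truth`) before calling `evaluate`.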
validmind/tests/model_validation/ragas/AnswerRelevance.py
@@ -0,0 +1,134 @@
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

import warnings

import plotly.express as px
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_relevancy

from validmind import tags, tasks

from .utils import get_renamed_columns


@tags("ragas", "llm", "rag_performance")
@tasks("text_qa", "text_generation", "text_summarization")
def AnswerRelevance(
    dataset,
    question_column="question",
    contexts_column="contexts",
    answer_column="answer",
):
    """
    Assesses how pertinent the generated answer is to the given prompt.

    The evaluation metric, Answer Relevancy, focuses on assessing how pertinent the
    generated answer is to the given prompt. A lower score is assigned to answers that
    are incomplete or contain redundant information and higher scores indicate better
    relevancy. This metric is computed using the `question`, the `contexts` and the
    `answer`.

    The Answer Relevancy is defined as the mean cosine similarity of the original
    `question` to a number of artificial questions, which are generated (reverse-engineered)
    based on the `answer`:

    $$
    \\text{answer relevancy} = \\frac{1}{N} \\sum_{i=1}^{N} cos(E_{g_i}, E_o)
    $$
    $$
    \\text{answer relevancy} = \\frac{1}{N} \\sum_{i=1}^{N} \\frac{E_{g_i} \\cdot E_o}{\\|E_{g_i}\\|\\|E_o\\|}
    $$

    Where:
    - $E_{g_i}$ is the embedding of the generated question $i$.
    - $E_o$ is the embedding of the original question.
    - $N$ is the number of generated questions (3 by default).

    **Note**: *This is a reference-free metric, meaning that it does not require a
    `ground_truth` answer to compare against. A similar metric that does evaluate the
    correctness of a generated answer with respect to a `ground_truth` answer is
    `validmind.model_validation.ragas.AnswerCorrectness`.*

    ### Configuring Columns

    This metric requires the following columns in your dataset:
    - `question` (str): The text query that was input into the model.
    - `contexts` (List[str]): Any contextual information retrieved by the model before
      generating an answer.
    - `answer` (str): The response generated by the model.

    If the above data is not in the appropriate column, you can specify different column
    names for these fields using the parameters `question_column`, `answer_column`, and
    `contexts_column`.

    For example, if your dataset has this data stored in different columns, you can
    pass the following parameters:
    ```python
    params = {
        "question_column": "input_text",
        "answer_column": "output_text",
        "contexts_column": "context_info",
    }
    ```

    If answer and contexts are stored as a dictionary in another column, specify the
    column and key like this:
    ```python
    pred_col = dataset.prediction_column(model)
    params = {
        "answer_column": f"{pred_col}.generated_answer",
        "contexts_column": f"{pred_col}.contexts",
    }
    ```

    For more complex data structures, you can use a function to extract the answers:
    ```python
    pred_col = dataset.prediction_column(model)
    params = {
        "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
        "contexts_column": lambda row: [row[pred_col]["context_message"]],
    }
    ```
    """
    warnings.filterwarnings(
        "ignore",
        category=FutureWarning,
        message="promote has been superseded by promote_options='default'.",
    )

    required_columns = {
        "question": question_column,
        "answer": answer_column,
        "contexts": contexts_column,
    }

    df = get_renamed_columns(dataset.df, required_columns)

    result_df = evaluate(
        Dataset.from_pandas(df),
        metrics=[answer_relevancy],
    ).to_pandas()

    fig_histogram = px.histogram(x=result_df["answer_relevancy"].to_list(), nbins=10)
    fig_box = px.box(x=result_df["answer_relevancy"].to_list())

    return (
        {
            "Scores": result_df[["question", "contexts", "answer", "answer_relevancy"]],
            "Aggregate Scores": [
                {
                    "Mean Score": result_df["answer_relevancy"].mean(),
                    "Median Score": result_df["answer_relevancy"].median(),
                    "Max Score": result_df["answer_relevancy"].max(),
                    "Min Score": result_df["answer_relevancy"].min(),
                    "Standard Deviation": result_df["answer_relevancy"].std(),
                    "Count": len(result_df),
                }
            ],
        },
        fig_histogram,
        fig_box,
    )
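
To make the answer-relevancy formula above concrete, here is a small self-contained numeric sketch with made-up embedding vectors. In ragas the artificial questions and their embeddings come from an LLM and an embedding model; none of that is reproduced here — this only shows the averaging of cosine similarities.

```python
# Toy illustration of: answer relevancy = (1/N) * sum_i cos(E_g_i, E_o)
import numpy as np


def cosine(a: np.ndarray, b: np.ndarray) -> float:
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


E_o = np.array([0.9, 0.1, 0.3])  # embedding of the original question (made up)
E_g = np.array(  # embeddings of N = 3 generated questions (made up)
    [
        [0.8, 0.2, 0.3],
        [0.7, 0.0, 0.4],
        [0.9, 0.3, 0.1],
    ]
)

answer_relevancy = np.mean([cosine(e, E_o) for e in E_g])
print(round(answer_relevancy, 3))
```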
validmind/tests/model_validation/ragas/AnswerSimilarity.py
@@ -0,0 +1,119 @@
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

import warnings

import plotly.express as px
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_similarity

from validmind import tags, tasks

from .utils import get_renamed_columns


@tags("ragas", "llm")
@tasks("text_qa", "text_generation", "text_summarization")
def AnswerSimilarity(
    dataset,
    answer_column="answer",
    ground_truth_column="ground_truth",
):
    """
    Calculates the semantic similarity between generated answers and ground truths.

    The concept of Answer Semantic Similarity pertains to the assessment of the semantic
    resemblance between the generated answer and the ground truth. This evaluation is
    based on the `ground_truth` and the `answer`, with values falling within the range
    of 0 to 1. A higher score signifies a better alignment between the generated answer
    and the ground truth.

    Measuring the semantic similarity between answers can offer valuable insights into
    the quality of the generated response. This evaluation utilizes a cross-encoder
    model to calculate the semantic similarity score.

    See this paper for more details: https://arxiv.org/pdf/2108.06130.pdf

    The following steps are involved in computing the answer similarity score:
    1. Vectorize the ground truth answer using the specified embedding model.
    2. Vectorize the generated answer using the same embedding model.
    3. Compute the cosine similarity between the two vectors.

    ### Configuring Columns

    This metric requires the following columns in your dataset:
    - `answer` (str): The text response generated by the model.
    - `ground_truth` (str): The ground truth answer that the generated answer is compared
      against.

    If the above data is not in the appropriate column, you can specify different column
    names for these fields using the parameters `answer_column` and `ground_truth_column`.

    For example, if your dataset has this data stored in different columns, you can
    pass the following parameters:
    ```python
    {
        "answer_column": "llm_output_col",
        "ground_truth_column": "my_ground_truth_col",
    }
    ```

    If answer is stored as a dictionary in another column, specify the column and key
    like this:
    ```python
    pred_col = dataset.prediction_column(model)
    params = {
        "answer_column": f"{pred_col}.generated_answer",
        "ground_truth_column": "my_ground_truth_col",
    }
    ```

    For more complex situations, you can use a function to extract the data:
    ```python
    pred_col = dataset.prediction_column(model)
    params = {
        "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
        "ground_truth_column": "my_ground_truth_col",
    }
    ```
    """
    warnings.filterwarnings(
        "ignore",
        category=FutureWarning,
        message="promote has been superseded by promote_options='default'.",
    )

    required_columns = {
        "answer": answer_column,
        "ground_truth": ground_truth_column,
    }

    df = get_renamed_columns(dataset.df, required_columns)

    result_df = evaluate(
        Dataset.from_pandas(df),
        metrics=[answer_similarity],
    ).to_pandas()

    fig_histogram = px.histogram(x=result_df["answer_similarity"].to_list(), nbins=10)
    fig_box = px.box(x=result_df["answer_similarity"].to_list())

    return (
        {
            "Scores": result_df[["answer", "ground_truth", "answer_similarity"]],
            "Aggregate Scores": [
                {
                    "Mean Score": result_df["answer_similarity"].mean(),
                    "Median Score": result_df["answer_similarity"].median(),
                    "Max Score": result_df["answer_similarity"].max(),
                    "Min Score": result_df["answer_similarity"].min(),
                    "Standard Deviation": result_df["answer_similarity"].std(),
                    "Count": len(result_df),
                }
            ],
        },
        fig_histogram,
        fig_box,
    )
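
A toy walk-through of the three steps in the docstring above, using a TF-IDF vectorizer as a stand-in for the embedding model. The real metric scores with a text-embedding model via ragas; this sketch only illustrates the mechanics of "vectorize both texts, then take the cosine similarity".

```python
# Stand-in embedding (TF-IDF) purely for illustration; the texts are made up.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

ground_truth = "The loan was approved because the applicant's income exceeded the threshold."
answer = "The applicant's income was above the threshold, so the loan was approved."

vectors = TfidfVectorizer().fit_transform([ground_truth, answer])  # steps 1 and 2
score = cosine_similarity(vectors[0], vectors[1])[0, 0]            # step 3
print(round(float(score), 3))
```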
validmind/tests/model_validation/ragas/AspectCritique.py
@@ -0,0 +1,167 @@
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

import warnings

import plotly.express as px
from datasets import Dataset
from ragas import evaluate
from ragas.metrics.critique import AspectCritique as _AspectCritique
from ragas.metrics.critique import (
    coherence,
    conciseness,
    correctness,
    harmfulness,
    maliciousness,
)

from validmind import tags, tasks

from .utils import get_renamed_columns

aspect_map = {
    "coherence": coherence,
    "conciseness": conciseness,
    "correctness": correctness,
    "harmfulness": harmfulness,
    "maliciousness": maliciousness,
}


@tags("ragas", "llm", "qualitative")
@tasks("text_summarization", "text_generation", "text_qa")
def AspectCritique(
    dataset,
    question_column="question",
    answer_column="answer",
    contexts_column="contexts",
    aspects: list = [
        "coherence",
        "conciseness",
        "correctness",
        "harmfulness",
        "maliciousness",
    ],
    additional_aspects: list = [],
):
    """
    Evaluates generations against the following aspects: harmfulness, maliciousness,
    coherence, correctness, and conciseness.

    ### Overview:

    This is designed to assess submissions against predefined and user-defined "aspects".
    For each aspect, a judge LLM is prompted to critique a piece of generated text based
    on a description of the aspect. The output of this evaluation is a binary (0/1 = yes/no)
    score that indicates whether the submission aligns with the defined aspect or not.

    ### Inputs and Outputs:

    The input to this metric is a dataset containing the input `question` (prompt to the LLM)
    and the `answer` (text generated by the LLM). Any retrieved `contexts` can also be
    included to enhance the evaluation.

    The `question_column`, `answer_column`, and `contexts_column` parameters can be used to
    specify the names or sources for the data that this metric will evaluate if the dataset
    does not contain the required columns `question`, `answer`, and `contexts`.

    By default, the aspects evaluated are harmfulness, maliciousness, coherence,
    correctness, and conciseness. To change the aspects evaluated, the `aspects` parameter
    can be set to a list containing any of these aspects.

    To add custom aspects, the `additional_aspects` parameter can be passed as a list
    of tuples where each tuple contains the aspect name and a description of the aspect
    that the judge LLM will use to critique the submission.

    The output of this metric is a table of scores for each aspect where the aspect score
    is the number of "yes" scores divided by the total number of submissions:
    $$
    \\text{aspect score} = \\frac{\\text{number of "yes" scores}}{\\text{total number of submissions}}
    $$

    ### Examples:

    - **Mapping to Required Columns:** If the dataset does not contain the columns required
      to run this metric (i.e., `question`, `answer`, and `contexts`), the `question_column`,
      `answer_column`, and `contexts_column` parameters can be used to map them like this:

    ```python
    pred_col = my_vm_dataset.prediction_column(my_vm_model)
    run_test(
        "validmind.model_validation.ragas.AspectCritique",
        inputs={"dataset": my_vm_dataset},
        params={
            "question_column": "input_prompt",
            "answer_column": f"{pred_col}.llm_output",
            "contexts_column": lambda row: [row[pred_col]["context_message"]],
        },
    )
    ```

    - **Custom Aspects:** To evaluate custom aspects, the `additional_aspects` parameter can
      be set to a list of tuples where each tuple contains the aspect name and a description
      of the aspect that the judge LLM will use to critique the submission. For example, to
      evaluate whether the LLM-generated text has a "professional tone", the `additional_aspects`
      parameter can be set like this:

    ```python
    run_test(
        "validmind.model_validation.ragas.AspectCritique",
        inputs={"dataset": my_vm_dataset},
        params={
            "additional_aspects": [
                ("professionalism", "Does the text have a professional tone?"),
            ],
        },
    )
    ```
    """
    warnings.filterwarnings(
        "ignore",
        category=FutureWarning,
        message="promote has been superseded by promote_options='default'.",
    )

    required_columns = {
        "question": question_column,
        "answer": answer_column,
        "contexts": contexts_column,
    }

    df = get_renamed_columns(dataset.df, required_columns)

    built_in_aspects = [aspect_map[aspect] for aspect in aspects]
    custom_aspects = [
        _AspectCritique(name=name, definition=description)
        for name, description in additional_aspects
    ]
    all_aspects = [*built_in_aspects, *custom_aspects]

    result_df = evaluate(Dataset.from_pandas(df), metrics=all_aspects).to_pandas()

    df_melted = result_df.melt(
        id_vars=["question", "answer", "contexts"],
        value_vars=[aspect.name for aspect in all_aspects],
        var_name="Metric",
        value_name="Result",
    )
    df_counts = df_melted.groupby(["Metric", "Result"]).size().reset_index(name="Count")
    df_counts["Result"] = df_counts["Result"].map({0: "Fail", 1: "Pass"})

    fig = px.bar(
        df_counts,
        x="Metric",
        y="Count",
        color="Result",
        color_discrete_map={"Fail": "red", "Pass": "green"},
        labels={"Count": "Pass vs Fail Count", "Metric": "Aspect Name"},
        barmode="group",
        title="Aspect Critique Results",
    )

    return {
        "Aspect Scores": [
            {"Aspect": aspect, "Score": result_df[aspect].mean()}
            for aspect in aspects + [aspect.name for aspect in custom_aspects]
        ]
    }, fig
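
For clarity on how the per-row 0/1 critiques become the reported numbers, here is a tiny self-contained sketch of the roll-up: the aspect score is just the mean of the binary column, and the bar chart counts pass/fail per aspect via the same melt/groupby pattern as the code above. The judge-LLM scoring itself is not reproduced; the 0/1 values below are made up.

```python
import pandas as pd

# Pretend ragas already returned binary critiques for 4 submissions.
result_df = pd.DataFrame(
    {
        "coherence": [1, 1, 0, 1],
        "harmfulness": [0, 0, 0, 0],
    }
)

aspect_scores = result_df.mean()  # coherence -> 0.75, harmfulness -> 0.0

counts = (
    result_df.melt(var_name="Metric", value_name="Result")
    .groupby(["Metric", "Result"])
    .size()
    .reset_index(name="Count")
)
counts["Result"] = counts["Result"].map({0: "Fail", 1: "Pass"})

print(aspect_scores)
print(counts)
```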
validmind/tests/model_validation/ragas/ContextEntityRecall.py
@@ -0,0 +1,133 @@
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

import warnings

import plotly.express as px
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import context_entity_recall

from validmind import tags, tasks

from .utils import get_renamed_columns


@tags("ragas", "llm", "retrieval_performance")
@tasks("text_qa", "text_generation", "text_summarization")
def ContextEntityRecall(
    dataset,
    contexts_column: str = "contexts",
    ground_truth_column: str = "ground_truth",
):
    """
    Evaluates the context entity recall for dataset entries and visualizes the results.

    ### Overview

    This metric gives the measure of recall of the retrieved context, based on the
    number of entities present in both `ground_truths` and `contexts` relative to the
    number of entities present in the `ground_truths` alone. Simply put, it is a measure
    of what fraction of entities are recalled from `ground_truths`. This metric is
    useful in fact-based use cases like tourism help desk, historical QA, etc. This
    metric can help evaluate the retrieval mechanism for entities, based on comparison
    with entities present in `ground_truths`, because in cases where entities matter,
    we need the `contexts` which cover them.

    ### Formula

    To compute this metric, we use two sets, $GE$ and $CE$, representing the set of
    entities present in `ground_truths` and the set of entities present in `contexts`
    respectively. We then take the number of elements in the intersection of these sets
    and divide it by the number of elements present in $GE$, given by the formula:

    $$
    \\text{context entity recall} = \\frac{| CE \\cap GE |}{| GE |}
    $$

    ### Configuring Columns

    This metric requires the following columns in your dataset:
    - `contexts` (List[str]): A list of text contexts which will be checked to see
      whether they contain the entities present in the ground truth.
    - `ground_truth` (str): The ground truth text from which the entities will be
      extracted and compared with the entities in the `contexts`.

    If the above data is not in the appropriate column, you can specify different column
    names for these fields using the parameters `contexts_column` and `ground_truth_column`.

    For example, if your dataset has this data stored in different columns, you can
    pass the following parameters:
    ```python
    {
        "contexts_column": "context_info",
        "ground_truth_column": "my_ground_truth_col",
    }
    ```

    If the data is stored as a dictionary in another column, specify the column and key
    like this:
    ```python
    pred_col = dataset.prediction_column(model)
    params = {
        "contexts_column": f"{pred_col}.contexts",
        "ground_truth_column": "my_ground_truth_col",
    }
    ```

    For more complex situations, you can use a function to extract the data:
    ```python
    pred_col = dataset.prediction_column(model)
    params = {
        "contexts_column": lambda row: [row[pred_col]["context_message"]],
        "ground_truth_column": "my_ground_truth_col",
    }
    ```
    """
    warnings.filterwarnings(
        "ignore",
        category=FutureWarning,
        message="promote has been superseded by promote_options='default'.",
    )

    required_columns = {
        "ground_truth": ground_truth_column,
        "contexts": contexts_column,
    }

    df = get_renamed_columns(dataset.df, required_columns)

    result_df = evaluate(
        Dataset.from_pandas(df),
        metrics=[context_entity_recall],
    ).to_pandas()

    fig_histogram = px.histogram(
        x=result_df["context_entity_recall"].to_list(), nbins=10
    )
    fig_box = px.box(x=result_df["context_entity_recall"].to_list())

    return (
        {
            "Scores": result_df[
                [
                    "contexts",
                    "ground_truth",
                    "context_entity_recall",
                ]
            ],
            "Aggregate Scores": [
                {
                    "Mean Score": result_df["context_entity_recall"].mean(),
                    "Median Score": result_df["context_entity_recall"].median(),
                    "Max Score": result_df["context_entity_recall"].max(),
                    "Min Score": result_df["context_entity_recall"].min(),
                    "Standard Deviation": result_df["context_entity_recall"].std(),
                    "Count": len(result_df),
                }
            ],
        },
        fig_histogram,
        fig_box,
    )
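
A worked example of the context entity recall formula above, with hand-labelled entity sets (in the real metric, ragas extracts the entities with an LLM; here they are given directly): the ground truth mentions three entities and the retrieved contexts cover two of them, so recall is 2/3.

```python
# |CE ∩ GE| / |GE| with made-up entity sets.
ground_truth_entities = {"Eiffel Tower", "Paris", "1889"}   # GE
context_entities = {"Eiffel Tower", "Paris", "France"}      # CE

recall = len(context_entities & ground_truth_entities) / len(ground_truth_entities)
print(round(recall, 3))  # 0.667 -> two of the three ground-truth entities were recalled
```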