validmind 2.5.15__py3-none-any.whl → 2.5.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +54 -112
- validmind/ai/test_result_description/config.yaml +29 -0
- validmind/ai/test_result_description/context.py +73 -0
- validmind/ai/test_result_description/image_processing.py +124 -0
- validmind/ai/test_result_description/system.jinja +39 -0
- validmind/ai/test_result_description/user.jinja +25 -0
- validmind/datasets/credit_risk/__init__.py +1 -0
- validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
- validmind/datasets/credit_risk/lending_club_bias.py +142 -0
- validmind/errors.py +17 -0
- validmind/tests/__types__.py +19 -10
- validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +20 -24
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +4 -1
- validmind/tests/{model_validation/statsmodels → data_validation}/JarqueBera.py +22 -30
- validmind/tests/{model_validation/statsmodels → data_validation}/LJungBox.py +23 -27
- validmind/tests/data_validation/ProtectedClassesCombination.py +205 -0
- validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
- validmind/tests/data_validation/ProtectedClassesDisparity.py +141 -0
- validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +180 -0
- validmind/tests/{model_validation/statsmodels → data_validation}/RunsTest.py +17 -20
- validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +20 -22
- validmind/tests/data_validation/nlp/Hashtags.py +15 -20
- validmind/tests/data_validation/nlp/TextDescription.py +3 -1
- validmind/tests/load.py +21 -5
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +12 -5
- validmind/tests/model_validation/ragas/AnswerRelevance.py +12 -6
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +12 -6
- validmind/tests/model_validation/ragas/AspectCritique.py +22 -17
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +12 -6
- validmind/tests/model_validation/ragas/ContextPrecision.py +12 -6
- validmind/tests/model_validation/ragas/ContextRecall.py +12 -6
- validmind/tests/model_validation/ragas/ContextUtilization.py +161 -0
- validmind/tests/model_validation/ragas/Faithfulness.py +12 -6
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +158 -0
- validmind/tests/model_validation/sklearn/FeatureImportance.py +3 -3
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +1 -1
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -2
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +59 -0
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +40 -20
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +0 -1
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +1 -1
- validmind/utils.py +4 -0
- validmind/vm_models/test/metric.py +1 -0
- validmind/vm_models/test/result_wrapper.py +50 -26
- validmind/vm_models/test/threshold_test.py +1 -0
- {validmind-2.5.15.dist-info → validmind-2.5.19.dist-info}/METADATA +4 -3
- {validmind-2.5.15.dist-info → validmind-2.5.19.dist-info}/RECORD +52 -39
- {validmind-2.5.15.dist-info → validmind-2.5.19.dist-info}/WHEEL +1 -1
- {validmind-2.5.15.dist-info → validmind-2.5.19.dist-info}/LICENSE +0 -0
- {validmind-2.5.15.dist-info → validmind-2.5.19.dist-info}/entry_points.txt +0 -0

validmind/tests/model_validation/ContextualRecall.py
@@ -58,6 +58,9 @@ def ContextualRecall(dataset, model):
     - Models that effectively use infrequent words might be undervalued, as these words might not overlap as often.
     """
 
+    # download nltk data
+    nltk.download("punkt_tab", quiet=True)
+
     y_true = dataset.y
     y_pred = dataset.y_pred(model)
 
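
A minimal sanity check of the new tokenizer dependency, outside the test itself (illustrative only; assumes `nltk` is installed, and that on recent NLTK releases `word_tokenize` looks for the `punkt_tab` resource the patch downloads):

```python
import nltk
from nltk.tokenize import word_tokenize

# Same resource the patched test fetches before tokenizing reference/candidate text.
nltk.download("punkt_tab", quiet=True)

print(word_tokenize("Contextual recall compares token overlap between texts."))
```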

validmind/tests/model_validation/ragas/AnswerCorrectness.py
@@ -8,9 +8,21 @@ import plotly.express as px
 from datasets import Dataset
 
 from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
 
 from .utils import get_ragas_config, get_renamed_columns
 
+try:
+    from ragas import evaluate
+    from ragas.metrics import answer_correctness
+except ImportError as e:
+    raise MissingDependencyError(
+        "Missing required package `ragas` for AnswerCorrectness. "
+        "Please run `pip install validmind[llm]` to use LLM tests",
+        required_dependencies=["ragas"],
+        extra="llm",
+    ) from e
+
 
 @tags("ragas", "llm")
 @tasks("text_qa", "text_generation", "text_summarization")
@@ -88,11 +100,6 @@ def AnswerCorrectness(
     }
     ```
     """
-    try:
-        from ragas import evaluate
-        from ragas.metrics import answer_correctness
-    except ImportError:
-        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
 
     warnings.filterwarnings(
         "ignore",
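
The same guard is applied across every ragas-backed test in this release: the optional import is attempted at module load time and, on failure, re-raised as a `MissingDependencyError` that carries the missing packages and the matching pip extra. A standalone sketch of the pattern for reference (the error class below is a stand-in for illustration, not the `validmind.errors` implementation):

```python
# Stand-in error type; validmind.errors defines its own version with this shape.
class MissingDependencyError(ImportError):
    def __init__(self, message, required_dependencies=None, extra=None):
        super().__init__(message)
        self.required_dependencies = required_dependencies or []
        self.extra = extra


try:
    import ragas  # noqa: F401  # optional dependency guarded at import time
except ImportError as e:
    raise MissingDependencyError(
        "Missing required package `ragas`. "
        "Please run `pip install validmind[llm]` to use LLM tests",
        required_dependencies=["ragas"],
        extra="llm",
    ) from e
```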

validmind/tests/model_validation/ragas/AnswerRelevance.py
@@ -8,9 +8,21 @@ import plotly.express as px
 from datasets import Dataset
 
 from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
 
 from .utils import get_ragas_config, get_renamed_columns
 
+try:
+    from ragas import evaluate
+    from ragas.metrics import answer_relevancy
+except ImportError as e:
+    raise MissingDependencyError(
+        "Missing required package `ragas` for AnswerRelevance. "
+        "Please run `pip install validmind[llm]` to use LLM tests",
+        required_dependencies=["ragas"],
+        extra="llm",
+    ) from e
+
 
 @tags("ragas", "llm", "rag_performance")
 @tasks("text_qa", "text_generation", "text_summarization")
@@ -92,12 +104,6 @@ def AnswerRelevance(
     }
     ```
     """
-    try:
-        from ragas import evaluate
-        from ragas.metrics import answer_relevancy
-    except ImportError:
-        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
-
     warnings.filterwarnings(
         "ignore",
         category=FutureWarning,

validmind/tests/model_validation/ragas/AnswerSimilarity.py
@@ -8,9 +8,21 @@ import plotly.express as px
 from datasets import Dataset
 
 from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
 
 from .utils import get_ragas_config, get_renamed_columns
 
+try:
+    from ragas import evaluate
+    from ragas.metrics import answer_similarity
+except ImportError as e:
+    raise MissingDependencyError(
+        "Missing required package `ragas` for AnswerSimilarity. "
+        "Please run `pip install validmind[llm]` to use LLM tests",
+        required_dependencies=["ragas"],
+        extra="llm",
+    ) from e
+
 
 @tags("ragas", "llm")
 @tasks("text_qa", "text_generation", "text_summarization")
@@ -78,12 +90,6 @@ def AnswerSimilarity(
     }
     ```
     """
-    try:
-        from ragas import evaluate
-        from ragas.metrics import answer_similarity
-    except ImportError:
-        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
-
     warnings.filterwarnings(
         "ignore",
         category=FutureWarning,

validmind/tests/model_validation/ragas/AspectCritique.py
@@ -8,9 +8,28 @@ import plotly.express as px
 from datasets import Dataset
 
 from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
 
 from .utils import get_ragas_config, get_renamed_columns
 
+try:
+    from ragas import evaluate
+    from ragas.metrics import AspectCritic
+    from ragas.metrics._aspect_critic import (
+        coherence,
+        conciseness,
+        correctness,
+        harmfulness,
+        maliciousness,
+    )
+except ImportError as e:
+    raise MissingDependencyError(
+        "Missing required package `ragas` for AspectCritique. "
+        "Please run `pip install validmind[llm]` to use LLM tests",
+        required_dependencies=["ragas"],
+        extra="llm",
+    ) from e
+
 LOWER_IS_BETTER_ASPECTS = ["harmfulness", "maliciousness"]
 
 
@@ -101,20 +120,7 @@ def AspectCritique(
     )
     ```
     """
-    try:
-        from ragas import evaluate
-        from ragas.metrics.critique import AspectCritique as _AspectCritique
-        from ragas.metrics.critique import (
-            coherence,
-            conciseness,
-            correctness,
-            harmfulness,
-            maliciousness,
-        )
-    except ImportError:
-        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
-
-    aspect_map = {
+    built_in_aspects = {
         "coherence": coherence,
         "conciseness": conciseness,
         "correctness": correctness,
@@ -136,16 +142,15 @@
 
     df = get_renamed_columns(dataset._df, required_columns)
 
-    built_in_aspects = [aspect_map[aspect] for aspect in aspects]
     custom_aspects = (
         [
-            _AspectCritique(name=name, definition=description)
+            AspectCritic(name=name, definition=description)
             for name, description in additional_aspects
        ]
        if additional_aspects
        else []
    )
-    all_aspects = [
+    all_aspects = [built_in_aspects[aspect] for aspect in aspects] + custom_aspects
 
     result_df = evaluate(
         Dataset.from_pandas(df), metrics=all_aspects, **get_ragas_config()
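
For reference, this file migrates from the old `ragas.metrics.critique.AspectCritique` import path to the new `AspectCritic` class, and custom aspects are now built directly from it. A small illustrative sketch of defining a custom aspect, mirroring the constructor call used in the hunk above (the aspect name and definition here are made up):

```python
from ragas.metrics import AspectCritic

# Custom aspect built the same way the diff builds `custom_aspects`.
politeness = AspectCritic(
    name="politeness",
    definition="Is the submission written in a polite and respectful tone?",
)
```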

validmind/tests/model_validation/ragas/ContextEntityRecall.py
@@ -8,9 +8,21 @@ import plotly.express as px
 from datasets import Dataset
 
 from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
 
 from .utils import get_ragas_config, get_renamed_columns
 
+try:
+    from ragas import evaluate
+    from ragas.metrics import context_entity_recall
+except ImportError as e:
+    raise MissingDependencyError(
+        "Missing required package `ragas` for ContextEntityRecall. "
+        "Please run `pip install validmind[llm]` to use LLM tests",
+        required_dependencies=["ragas"],
+        extra="llm",
+    ) from e
+
 
 @tags("ragas", "llm", "retrieval_performance")
 @tasks("text_qa", "text_generation", "text_summarization")
@@ -84,12 +96,6 @@ def ContextEntityRecall(
     }
     ```
     """
-    try:
-        from ragas import evaluate
-        from ragas.metrics import context_entity_recall
-    except ImportError:
-        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
-
     warnings.filterwarnings(
         "ignore",
         category=FutureWarning,

validmind/tests/model_validation/ragas/ContextPrecision.py
@@ -8,9 +8,21 @@ import plotly.express as px
 from datasets import Dataset
 
 from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
 
 from .utils import get_ragas_config, get_renamed_columns
 
+try:
+    from ragas import evaluate
+    from ragas.metrics import context_precision
+except ImportError as e:
+    raise MissingDependencyError(
+        "Missing required package `ragas` for ContextPrecision. "
+        "Please run `pip install validmind[llm]` to use LLM tests",
+        required_dependencies=["ragas"],
+        extra="llm",
+    ) from e
+
 
 @tags("ragas", "llm", "retrieval_performance")
 @tasks("text_qa", "text_generation", "text_summarization", "text_classification")
@@ -79,12 +91,6 @@ def ContextPrecision(
     }
     ```
     """
-    try:
-        from ragas import evaluate
-        from ragas.metrics import context_precision
-    except ImportError:
-        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
-
     warnings.filterwarnings(
         "ignore",
         category=FutureWarning,

validmind/tests/model_validation/ragas/ContextRecall.py
@@ -8,9 +8,21 @@ import plotly.express as px
 from datasets import Dataset
 
 from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
 
 from .utils import get_ragas_config, get_renamed_columns
 
+try:
+    from ragas import evaluate
+    from ragas.metrics import context_recall
+except ImportError as e:
+    raise MissingDependencyError(
+        "Missing required package `ragas` for ContextRecall. "
+        "Please run `pip install validmind[llm]` to use LLM tests",
+        required_dependencies=["ragas"],
+        extra="llm",
+    ) from e
+
 
 @tags("ragas", "llm", "retrieval_performance")
 @tasks("text_qa", "text_generation", "text_summarization", "text_classification")
@@ -79,12 +91,6 @@ def ContextRecall(
     }
     ```
     """
-    try:
-        from ragas import evaluate
-        from ragas.metrics import context_recall
-    except ImportError:
-        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
-
     warnings.filterwarnings(
         "ignore",
         category=FutureWarning,

validmind/tests/model_validation/ragas/ContextUtilization.py (new file)
@@ -0,0 +1,161 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+
+from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
+
+from .utils import get_ragas_config, get_renamed_columns
+
+try:
+    from ragas import evaluate
+    from ragas.metrics import context_utilization
+except ImportError as e:
+    raise MissingDependencyError(
+        "Missing required package `ragas` for ContextUtilization. "
+        "Please run `pip install validmind[llm]` to use LLM tests",
+        required_dependencies=["ragas"],
+        extra="llm",
+    ) from e
+
+
+@tags("ragas", "llm", "retrieval_performance")
+@tasks("text_qa", "text_generation", "text_summarization", "text_classification")
+def ContextUtilization(
+    dataset,
+    question_column: str = "question",
+    contexts_column: str = "contexts",
+    answer_column: str = "answer",
+):  # noqa: B950
+    """
+    Assesses how effectively relevant context chunks are utilized in generating answers by evaluating their ranking
+    within the provided contexts.
+
+    ### Purpose
+
+    The Context Utilization test evaluates whether all of the answer-relevant items present in the contexts are ranked
+    higher within the provided retrieval results. This metric is essential for assessing the performance of models,
+    especially those involved in tasks such as text QA, text generation, text summarization, and text classification.
+
+    ### Test Mechanism
+
+    The test calculates Context Utilization using the formula:
+
+    $$
+    \\text{Context Utilization@K} = \\frac{\\sum_{k=1}^{K} \\left( \\text{Precision@k} \\times v_k \\right)}{\\text{Total number of relevant items in the top } K \\text{ results}}
+    $$
+    $$
+    \\text{Precision@k} = {\\text{true positives@k} \\over (\\text{true positives@k} + \\text{false positives@k})}
+    $$
+
+    Where $K$ is the total number of chunks in `contexts` and $v_k \\in \\{0, 1\\}$ is the relevance indicator at rank $k$.
+
+
+    This test uses columns for questions, contexts, and answers from the dataset and computes context utilization
+    scores, generating a histogram and box plot for visualization.
+
+    #### Configuring Columns
+
+    This metric requires the following columns in your dataset:
+
+    - `question` (str): The text query that was input into the model.
+    - `contexts` (List[str]): A list of text contexts which are retrieved and which will be evaluated to
+    make sure they contain relevant info in the correct order.
+    - `answer` (str): The llm-generated response for the input `question`.
+
+    If the above data is not in the appropriate column, you can specify different column
+    names for these fields using the parameters `question_column`, `contexts_column`
+    and `ground_truth_column`.
+
+    For example, if your dataset has this data stored in different columns, you can
+    pass the following parameters:
+    ```python
+    {
+        "question_column": "question",
+        "contexts_column": "context_info"
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    If the data is stored as a dictionary in another column, specify the column and key
+    like this:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": f"{pred_col}.contexts",
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    For more complex situations, you can use a function to extract the data:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": lambda x: [x[pred_col]["context_message"]],
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    ### Signs of High Risk
+
+    - Very low mean or median context utilization scores, indicating poor usage of retrieved contexts.
+    - High standard deviation, suggesting inconsistent model performance.
+    - Low or minimal max scores, pointing to the model's failure to rank relevant contexts at top positions.
+
+    ### Strengths
+
+    - Quantifies the rank of relevant context chunks in generating responses.
+    - Provides clear visualizations through histograms and box plots for ease of interpretation.
+    - Adapts to different dataset schema by allowing configurable column names.
+
+    ### Limitations
+
+    - Assumes the relevance of context chunks is binary and may not capture nuances of partial relevance.
+    - Requires proper context retrieval to be effective; irrelevant context chunks can skew the results.
+    - Dependent on large sample sizes to provide stable and reliable estimates of utilization performance.
+    """
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "question": question_column,
+        "contexts": contexts_column,
+        "answer": answer_column,
+    }
+
+    df = get_renamed_columns(dataset._df, required_columns)
+
+    result_df = evaluate(
+        Dataset.from_pandas(df), metrics=[context_utilization], **get_ragas_config()
+    ).to_pandas()
+
+    fig_histogram = px.histogram(x=result_df["context_utilization"].to_list(), nbins=10)
+    fig_box = px.box(x=result_df["context_utilization"].to_list())
+
+    return (
+        {
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["question", "contexts", "answer", "context_utilization"]
+            # ],
+            "Aggregate Scores": [
+                {
+                    "Mean Score": result_df["context_utilization"].mean(),
+                    "Median Score": result_df["context_utilization"].median(),
+                    "Max Score": result_df["context_utilization"].max(),
+                    "Min Score": result_df["context_utilization"].min(),
+                    "Standard Deviation": result_df["context_utilization"].std(),
+                    "Count": result_df.shape[0],
+                }
+            ],
+        },
+        fig_histogram,
+        fig_box,
+    )
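
To make the Context Utilization@K formula above concrete, here is a toy, self-contained computation of the precision-weighted score for one made-up ranking of retrieved chunks (an illustration of the equation only, not the ragas implementation):

```python
def context_utilization_at_k(relevance):
    """Precision-weighted score over a ranked list of 0/1 relevance flags."""
    total_relevant = sum(relevance)
    if total_relevant == 0:
        return 0.0
    hits, score = 0, 0.0
    for k, v_k in enumerate(relevance, start=1):
        hits += v_k
        score += (hits / k) * v_k  # Precision@k counted only at relevant ranks
    return score / total_relevant


# Made-up ranking: chunks 1 and 3 are relevant, 2 and 4 are not.
print(context_utilization_at_k([1, 0, 1, 0]))  # (1/1 + 2/3) / 2 ≈ 0.833
```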

validmind/tests/model_validation/ragas/Faithfulness.py
@@ -8,9 +8,21 @@ import plotly.express as px
 from datasets import Dataset
 
 from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
 
 from .utils import get_ragas_config, get_renamed_columns
 
+try:
+    from ragas import evaluate
+    from ragas.metrics import faithfulness
+except ImportError as e:
+    raise MissingDependencyError(
+        "Missing required package `ragas` for Faithfulness. "
+        "Please run `pip install validmind[llm]` to use LLM tests",
+        required_dependencies=["ragas"],
+        extra="llm",
+    ) from e
+
 
 @tags("ragas", "llm", "rag_performance")
 @tasks("text_qa", "text_generation", "text_summarization")
@@ -78,12 +90,6 @@ def Faithfulness(
     }
     ```
     """
-    try:
-        from ragas import evaluate
-        from ragas.metrics import faithfulness
-    except ImportError:
-        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
-
     warnings.filterwarnings(
         "ignore",
         category=FutureWarning,

validmind/tests/model_validation/ragas/NoiseSensitivity.py (new file)
@@ -0,0 +1,158 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+
+from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
+
+from .utils import get_ragas_config, get_renamed_columns
+
+try:
+    from ragas import evaluate
+    from ragas.metrics import noise_sensitivity_relevant
+except ImportError as e:
+    raise MissingDependencyError(
+        "Missing required package `ragas` for NoiseSensitivity. "
+        "Please run `pip install validmind[llm]` to use LLM tests",
+        required_dependencies=["ragas"],
+        extra="llm",
+    ) from e
+
+
+@tags("ragas", "llm", "rag_performance")
+@tasks("text_qa", "text_generation", "text_summarization")
+def NoiseSensitivity(
+    dataset,
+    answer_column="answer",
+    contexts_column="contexts",
+    ground_truth_column="ground_truth",
+):
+    """
+    Assesses the sensitivity of a Large Language Model (LLM) to noise in retrieved context by measuring how often it
+    generates incorrect responses.
+
+    ### Purpose
+
+    The Noise Sensitivity test aims to measure how sensitive an LLM is to irrelevant or noisy information within the
+    contextual data used to generate its responses. A lower noise sensitivity score suggests better model robustness in
+    generating accurate answers from given contexts.
+
+    ### Test Mechanism
+
+    This test evaluates the model's answers by comparing the claims made in the generated response against the ground
+    truth and the retrieved context. The noise sensitivity score is calculated as:
+
+    $$
+    \\text{noise sensitivity} = {|\\text{Number of incorrect claims in answer}| \\over |\\text{Number of total claims in answer}|}
+    $$
+
+    The formula computes the fraction of incorrect claims to the total claims in the answer, using a dataset where
+    'answer', 'context', and 'ground_truth' columns are specified.
+
+    #### Configuring Columns
+
+    This metric requires the following columns in your dataset:
+
+    - `contexts` (List[str]): A list of text contexts which are retrieved to generate
+    the answer.
+    - `answer` (str): The response generated by the model
+    - `ground_truth` (str): The "correct" answer to the question
+
+    If the above data is not in the appropriate column, you can specify different column
+    names for these fields using the parameters `contexts_column` and `answer_column`.
+
+    For example, if your dataset has this data stored in different columns, you can
+    pass the following parameters:
+    ```python
+    {
+        "contexts_column": "context_info"
+        "answer_column": "my_answer_col",
+    }
+    ```
+
+    If the data is stored as a dictionary in another column, specify the column and key
+    like this:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": f"{pred_col}.contexts",
+        "answer_column": f"{pred_col}.answer",
+    }
+    ```
+
+    For more complex situations, you can use a function to extract the data:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": lambda row: [row[pred_col]["context_message"]],
+        "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+    }
+
+    ### Signs of High Risk
+
+    - High noise sensitivity scores across multiple samples.
+    - Significant deviation between mean and median noise sensitivity scores.
+    - High standard deviation indicating inconsistency in the model's performance.
+
+    ### Strengths
+
+    - Provides a quantitative measure of how well the LLM handles noisy or irrelevant context.
+    - Easy integration and configuration using column parameters.
+    - Utilizes both histogram and box plot visualizations to analyze score distribution.
+
+    ### Limitations
+
+    - Requires accurate ground truth that aligns with the generated answers.
+    - Assumes the context provided is sufficiently granular to assess noise sensitivity.
+    - Primarily applicable to tasks like text QA, text generation, and text summarization where contextual relevance is
+    critical.
+    """
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "answer": answer_column,
+        "contexts": contexts_column,
+        "ground_truth": ground_truth_column,
+    }
+
+    df = get_renamed_columns(dataset._df, required_columns)
+
+    result_df = evaluate(
+        Dataset.from_pandas(df),
+        metrics=[noise_sensitivity_relevant],
+        **get_ragas_config(),
+    ).to_pandas()
+
+    fig_histogram = px.histogram(
+        x=result_df["noise_sensitivity_relevant"].to_list(), nbins=10
+    )
+    fig_box = px.box(x=result_df["noise_sensitivity_relevant"].to_list())
+
+    return (
+        {
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["contexts", "answer", "ground_truth", "noise_sensitivity_relevant"]
+            # ],
+            "Aggregate Scores": [
+                {
+                    "Mean Score": result_df["noise_sensitivity_relevant"].mean(),
+                    "Median Score": result_df["noise_sensitivity_relevant"].median(),
+                    "Max Score": result_df["noise_sensitivity_relevant"].max(),
+                    "Min Score": result_df["noise_sensitivity_relevant"].min(),
+                    "Standard Deviation": result_df["noise_sensitivity_relevant"].std(),
+                    "Count": result_df.shape[0],
+                }
+            ],
+        },
+        fig_histogram,
+        fig_box,
+    )
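
As a concrete reading of the ratio above: if a generated answer contains five claims and two of them are not supported by the ground truth, the score is 2/5 = 0.4, and lower is better. A one-line illustration with made-up claim counts:

```python
# Made-up counts; in ragas the claims are extracted and judged by an LLM.
incorrect_claims, total_claims = 2, 5
noise_sensitivity = incorrect_claims / total_claims
print(noise_sensitivity)  # 0.4 — lower is better
```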

validmind/tests/model_validation/sklearn/FeatureImportance.py
@@ -81,9 +81,9 @@ def FeatureImportance(dataset, model, num_features=3):
     # Dynamically add feature columns to the result
     for i in range(num_features):
         if i < len(top_features):
-            result[
-                f"
-
+            result[
+                f"Feature {i + 1}"
+            ] = f"[{top_features[i][0]}; {top_features[i][1]:.4f}]"
         else:
             result[f"Feature {i + 1}"] = None
 

validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py
@@ -109,7 +109,7 @@ class PermutationFeatureImportance(Metric):
             )
         )
         fig.update_layout(
-            title_text="Permutation Importances
+            title_text="Permutation Importances",
             yaxis=dict(
                 tickmode="linear",  # set tick mode to linear
                 dtick=1,  # set interval between ticks

validmind/tests/model_validation/sklearn/RegressionR2Square.py
@@ -3,11 +3,10 @@
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
 import pandas as pd
-
 from sklearn import metrics
 
-from validmind.tests.model_validation.statsmodels.statsutils import adj_r2_score
 from validmind import tags, tasks
+from validmind.tests.model_validation.statsmodels.statsutils import adj_r2_score
 
 
 @tags("sklearn", "model_performance")
|