validmind 2.5.15__py3-none-any.whl → 2.5.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +54 -112
  3. validmind/ai/test_result_description/config.yaml +29 -0
  4. validmind/ai/test_result_description/context.py +73 -0
  5. validmind/ai/test_result_description/image_processing.py +124 -0
  6. validmind/ai/test_result_description/system.jinja +39 -0
  7. validmind/ai/test_result_description/user.jinja +25 -0
  8. validmind/datasets/credit_risk/__init__.py +1 -0
  9. validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
  10. validmind/datasets/credit_risk/lending_club_bias.py +142 -0
  11. validmind/errors.py +17 -0
  12. validmind/tests/__types__.py +19 -10
  13. validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +20 -24
  14. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +4 -1
  15. validmind/tests/{model_validation/statsmodels → data_validation}/JarqueBera.py +22 -30
  16. validmind/tests/{model_validation/statsmodels → data_validation}/LJungBox.py +23 -27
  17. validmind/tests/data_validation/ProtectedClassesCombination.py +205 -0
  18. validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
  19. validmind/tests/data_validation/ProtectedClassesDisparity.py +141 -0
  20. validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +180 -0
  21. validmind/tests/{model_validation/statsmodels → data_validation}/RunsTest.py +17 -20
  22. validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +20 -22
  23. validmind/tests/data_validation/nlp/Hashtags.py +15 -20
  24. validmind/tests/data_validation/nlp/TextDescription.py +3 -1
  25. validmind/tests/load.py +21 -5
  26. validmind/tests/model_validation/ContextualRecall.py +3 -0
  27. validmind/tests/model_validation/ragas/AnswerCorrectness.py +12 -5
  28. validmind/tests/model_validation/ragas/AnswerRelevance.py +12 -6
  29. validmind/tests/model_validation/ragas/AnswerSimilarity.py +12 -6
  30. validmind/tests/model_validation/ragas/AspectCritique.py +22 -17
  31. validmind/tests/model_validation/ragas/ContextEntityRecall.py +12 -6
  32. validmind/tests/model_validation/ragas/ContextPrecision.py +12 -6
  33. validmind/tests/model_validation/ragas/ContextRecall.py +12 -6
  34. validmind/tests/model_validation/ragas/ContextUtilization.py +161 -0
  35. validmind/tests/model_validation/ragas/Faithfulness.py +12 -6
  36. validmind/tests/model_validation/ragas/NoiseSensitivity.py +158 -0
  37. validmind/tests/model_validation/sklearn/FeatureImportance.py +3 -3
  38. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +1 -1
  39. validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -2
  40. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +59 -0
  41. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +40 -20
  42. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +0 -1
  43. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +1 -1
  44. validmind/utils.py +4 -0
  45. validmind/vm_models/test/metric.py +1 -0
  46. validmind/vm_models/test/result_wrapper.py +50 -26
  47. validmind/vm_models/test/threshold_test.py +1 -0
  48. {validmind-2.5.15.dist-info → validmind-2.5.19.dist-info}/METADATA +4 -3
  49. {validmind-2.5.15.dist-info → validmind-2.5.19.dist-info}/RECORD +52 -39
  50. {validmind-2.5.15.dist-info → validmind-2.5.19.dist-info}/WHEEL +1 -1
  51. {validmind-2.5.15.dist-info → validmind-2.5.19.dist-info}/LICENSE +0 -0
  52. {validmind-2.5.15.dist-info → validmind-2.5.19.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/ContextualRecall.py
@@ -58,6 +58,9 @@ def ContextualRecall(dataset, model):
     - Models that effectively use infrequent words might be undervalued, as these words might not overlap as often.
     """
 
+    # download nltk data
+    nltk.download("punkt_tab", quiet=True)
+
     y_true = dataset.y
     y_pred = dataset.y_pred(model)
 
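Note: the added `nltk.download("punkt_tab", quiet=True)` call makes the test fetch the tokenizer data it needs at run time instead of assuming it was installed beforehand. A minimal standalone sketch of the same pattern (the helper name is illustrative, not from the package):

```python
import nltk


def ensure_punkt_tab() -> None:
    # Idempotent: nltk skips the download if punkt_tab is already present;
    # quiet=True suppresses progress output on repeat calls.
    nltk.download("punkt_tab", quiet=True)


ensure_punkt_tab()
print(nltk.word_tokenize("Contextual recall compares token overlap between texts."))
```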
validmind/tests/model_validation/ragas/AnswerCorrectness.py
@@ -8,9 +8,21 @@ import plotly.express as px
 from datasets import Dataset
 
 from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
 
 from .utils import get_ragas_config, get_renamed_columns
 
+try:
+    from ragas import evaluate
+    from ragas.metrics import answer_correctness
+except ImportError as e:
+    raise MissingDependencyError(
+        "Missing required package `ragas` for AnswerCorrectness. "
+        "Please run `pip install validmind[llm]` to use LLM tests",
+        required_dependencies=["ragas"],
+        extra="llm",
+    ) from e
+
 
 @tags("ragas", "llm")
 @tasks("text_qa", "text_generation", "text_summarization")
@@ -88,11 +100,6 @@ def AnswerCorrectness(
     }
     ```
     """
-    try:
-        from ragas import evaluate
-        from ragas.metrics import answer_correctness
-    except ImportError:
-        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
 
     warnings.filterwarnings(
         "ignore",
validmind/tests/model_validation/ragas/AnswerRelevance.py
@@ -8,9 +8,21 @@ import plotly.express as px
 from datasets import Dataset
 
 from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
 
 from .utils import get_ragas_config, get_renamed_columns
 
+try:
+    from ragas import evaluate
+    from ragas.metrics import answer_relevancy
+except ImportError as e:
+    raise MissingDependencyError(
+        "Missing required package `ragas` for AnswerRelevance. "
+        "Please run `pip install validmind[llm]` to use LLM tests",
+        required_dependencies=["ragas"],
+        extra="llm",
+    ) from e
+
 
 @tags("ragas", "llm", "rag_performance")
 @tasks("text_qa", "text_generation", "text_summarization")
@@ -92,12 +104,6 @@ def AnswerRelevance(
     }
     ```
     """
-    try:
-        from ragas import evaluate
-        from ragas.metrics import answer_relevancy
-    except ImportError:
-        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
-
     warnings.filterwarnings(
         "ignore",
         category=FutureWarning,
validmind/tests/model_validation/ragas/AnswerSimilarity.py
@@ -8,9 +8,21 @@ import plotly.express as px
 from datasets import Dataset
 
 from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
 
 from .utils import get_ragas_config, get_renamed_columns
 
+try:
+    from ragas import evaluate
+    from ragas.metrics import answer_similarity
+except ImportError as e:
+    raise MissingDependencyError(
+        "Missing required package `ragas` for AnswerSimilarity. "
+        "Please run `pip install validmind[llm]` to use LLM tests",
+        required_dependencies=["ragas"],
+        extra="llm",
+    ) from e
+
 
 @tags("ragas", "llm")
 @tasks("text_qa", "text_generation", "text_summarization")
@@ -78,12 +90,6 @@ def AnswerSimilarity(
     }
     ```
     """
-    try:
-        from ragas import evaluate
-        from ragas.metrics import answer_similarity
-    except ImportError:
-        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
-
 
     warnings.filterwarnings(
         "ignore",
validmind/tests/model_validation/ragas/AspectCritique.py
@@ -8,9 +8,28 @@ import plotly.express as px
 from datasets import Dataset
 
 from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
 
 from .utils import get_ragas_config, get_renamed_columns
 
+try:
+    from ragas import evaluate
+    from ragas.metrics import AspectCritic
+    from ragas.metrics._aspect_critic import (
+        coherence,
+        conciseness,
+        correctness,
+        harmfulness,
+        maliciousness,
+    )
+except ImportError as e:
+    raise MissingDependencyError(
+        "Missing required package `ragas` for AspectCritique. "
+        "Please run `pip install validmind[llm]` to use LLM tests",
+        required_dependencies=["ragas"],
+        extra="llm",
+    ) from e
+
 LOWER_IS_BETTER_ASPECTS = ["harmfulness", "maliciousness"]
 
 
@@ -101,20 +120,7 @@ def AspectCritique(
     )
     ```
     """
-    try:
-        from ragas import evaluate
-        from ragas.metrics.critique import AspectCritique as _AspectCritique
-        from ragas.metrics.critique import (
-            coherence,
-            conciseness,
-            correctness,
-            harmfulness,
-            maliciousness,
-        )
-    except ImportError:
-        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
-
-    aspect_map = {
+    built_in_aspects = {
         "coherence": coherence,
         "conciseness": conciseness,
         "correctness": correctness,
@@ -136,16 +142,15 @@
 
     df = get_renamed_columns(dataset._df, required_columns)
 
-    built_in_aspects = [aspect_map[aspect] for aspect in aspects]
     custom_aspects = (
         [
-            _AspectCritique(name=name, definition=description)
+            AspectCritic(name=name, definition=description)
             for name, description in additional_aspects
         ]
         if additional_aspects
        else []
     )
-    all_aspects = [*built_in_aspects, *custom_aspects]
+    all_aspects = [built_in_aspects[aspect] for aspect in aspects] + custom_aspects
 
     result_df = evaluate(
         Dataset.from_pandas(df), metrics=all_aspects, **get_ragas_config()
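Note: beyond the import guard, this file also tracks ragas' rename of the aspect-critique class — `AspectCritique` (from `ragas.metrics.critique`) becomes `AspectCritic` (from `ragas.metrics`), with the built-in aspects now imported from `ragas.metrics._aspect_critic`. A sketch of how a custom aspect is constructed with the new class, mirroring the `custom_aspects` code above (the aspect itself is made up for illustration):

```python
from ragas.metrics import AspectCritic

# Same constructor call the test uses for each entry in `additional_aspects`.
politeness = AspectCritic(
    name="politeness",
    definition="Is the response respectful and free of dismissive language?",
)
```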
validmind/tests/model_validation/ragas/ContextEntityRecall.py
@@ -8,9 +8,21 @@ import plotly.express as px
 from datasets import Dataset
 
 from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
 
 from .utils import get_ragas_config, get_renamed_columns
 
+try:
+    from ragas import evaluate
+    from ragas.metrics import context_entity_recall
+except ImportError as e:
+    raise MissingDependencyError(
+        "Missing required package `ragas` for ContextEntityRecall. "
+        "Please run `pip install validmind[llm]` to use LLM tests",
+        required_dependencies=["ragas"],
+        extra="llm",
+    ) from e
+
 
 @tags("ragas", "llm", "retrieval_performance")
 @tasks("text_qa", "text_generation", "text_summarization")
@@ -84,12 +96,6 @@ def ContextEntityRecall(
     }
     ```
     """
-    try:
-        from ragas import evaluate
-        from ragas.metrics import context_entity_recall
-    except ImportError:
-        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
-
 
     warnings.filterwarnings(
         "ignore",
validmind/tests/model_validation/ragas/ContextPrecision.py
@@ -8,9 +8,21 @@ import plotly.express as px
 from datasets import Dataset
 
 from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
 
 from .utils import get_ragas_config, get_renamed_columns
 
+try:
+    from ragas import evaluate
+    from ragas.metrics import context_precision
+except ImportError as e:
+    raise MissingDependencyError(
+        "Missing required package `ragas` for ContextPrecision. "
+        "Please run `pip install validmind[llm]` to use LLM tests",
+        required_dependencies=["ragas"],
+        extra="llm",
+    ) from e
+
 
 @tags("ragas", "llm", "retrieval_performance")
 @tasks("text_qa", "text_generation", "text_summarization", "text_classification")
@@ -79,12 +91,6 @@ def ContextPrecision(
     }
     ```
     """
-    try:
-        from ragas import evaluate
-        from ragas.metrics import context_precision
-    except ImportError:
-        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
-
 
     warnings.filterwarnings(
         "ignore",
validmind/tests/model_validation/ragas/ContextRecall.py
@@ -8,9 +8,21 @@ import plotly.express as px
 from datasets import Dataset
 
 from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
 
 from .utils import get_ragas_config, get_renamed_columns
 
+try:
+    from ragas import evaluate
+    from ragas.metrics import context_recall
+except ImportError as e:
+    raise MissingDependencyError(
+        "Missing required package `ragas` for ContextRecall. "
+        "Please run `pip install validmind[llm]` to use LLM tests",
+        required_dependencies=["ragas"],
+        extra="llm",
+    ) from e
+
 
 @tags("ragas", "llm", "retrieval_performance")
 @tasks("text_qa", "text_generation", "text_summarization", "text_classification")
@@ -79,12 +91,6 @@ def ContextRecall(
     }
     ```
     """
-    try:
-        from ragas import evaluate
-        from ragas.metrics import context_recall
-    except ImportError:
-        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
-
 
     warnings.filterwarnings(
         "ignore",
validmind/tests/model_validation/ragas/ContextUtilization.py (new file)
@@ -0,0 +1,161 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+
+from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
+
+from .utils import get_ragas_config, get_renamed_columns
+
+try:
+    from ragas import evaluate
+    from ragas.metrics import context_utilization
+except ImportError as e:
+    raise MissingDependencyError(
+        "Missing required package `ragas` for ContextUtilization. "
+        "Please run `pip install validmind[llm]` to use LLM tests",
+        required_dependencies=["ragas"],
+        extra="llm",
+    ) from e
+
+
+@tags("ragas", "llm", "retrieval_performance")
+@tasks("text_qa", "text_generation", "text_summarization", "text_classification")
+def ContextUtilization(
+    dataset,
+    question_column: str = "question",
+    contexts_column: str = "contexts",
+    answer_column: str = "answer",
+):  # noqa: B950
+    """
+    Assesses how effectively relevant context chunks are utilized in generating answers by evaluating their ranking
+    within the provided contexts.
+
+    ### Purpose
+
+    The Context Utilization test evaluates whether all of the answer-relevant items present in the contexts are ranked
+    higher within the provided retrieval results. This metric is essential for assessing the performance of models,
+    especially those involved in tasks such as text QA, text generation, text summarization, and text classification.
+
+    ### Test Mechanism
+
+    The test calculates Context Utilization using the formula:
+
+    $$
+    \\text{Context Utilization@K} = \\frac{\\sum_{k=1}^{K} \\left( \\text{Precision@k} \\times v_k \\right)}{\\text{Total number of relevant items in the top } K \\text{ results}}
+    $$
+    $$
+    \\text{Precision@k} = {\\text{true positives@k} \\over (\\text{true positives@k} + \\text{false positives@k})}
+    $$
+
+    Where $K$ is the total number of chunks in `contexts` and $v_k \\in \\{0, 1\\}$ is the relevance indicator at rank $k$.
+
+
+    This test uses columns for questions, contexts, and answers from the dataset and computes context utilization
+    scores, generating a histogram and box plot for visualization.
+
+    #### Configuring Columns
+
+    This metric requires the following columns in your dataset:
+
+    - `question` (str): The text query that was input into the model.
+    - `contexts` (List[str]): A list of text contexts which are retrieved and which will be evaluated to
+      make sure they contain relevant info in the correct order.
+    - `answer` (str): The llm-generated response for the input `question`.
+
+    If the above data is not in the appropriate column, you can specify different column
+    names for these fields using the parameters `question_column`, `contexts_column`
+    and `ground_truth_column`.
+
+    For example, if your dataset has this data stored in different columns, you can
+    pass the following parameters:
+    ```python
+    {
+        "question_column": "question",
+        "contexts_column": "context_info"
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    If the data is stored as a dictionary in another column, specify the column and key
+    like this:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": f"{pred_col}.contexts",
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    For more complex situations, you can use a function to extract the data:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": lambda x: [x[pred_col]["context_message"]],
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    ### Signs of High Risk
+
+    - Very low mean or median context utilization scores, indicating poor usage of retrieved contexts.
+    - High standard deviation, suggesting inconsistent model performance.
+    - Low or minimal max scores, pointing to the model's failure to rank relevant contexts at top positions.
+
+    ### Strengths
+
+    - Quantifies the rank of relevant context chunks in generating responses.
+    - Provides clear visualizations through histograms and box plots for ease of interpretation.
+    - Adapts to different dataset schema by allowing configurable column names.
+
+    ### Limitations
+
+    - Assumes the relevance of context chunks is binary and may not capture nuances of partial relevance.
+    - Requires proper context retrieval to be effective; irrelevant context chunks can skew the results.
+    - Dependent on large sample sizes to provide stable and reliable estimates of utilization performance.
+    """
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "question": question_column,
+        "contexts": contexts_column,
+        "answer": answer_column,
+    }
+
+    df = get_renamed_columns(dataset._df, required_columns)
+
+    result_df = evaluate(
+        Dataset.from_pandas(df), metrics=[context_utilization], **get_ragas_config()
+    ).to_pandas()
+
+    fig_histogram = px.histogram(x=result_df["context_utilization"].to_list(), nbins=10)
+    fig_box = px.box(x=result_df["context_utilization"].to_list())
+
+    return (
+        {
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["question", "contexts", "answer", "context_utilization"]
+            # ],
+            "Aggregate Scores": [
+                {
+                    "Mean Score": result_df["context_utilization"].mean(),
+                    "Median Score": result_df["context_utilization"].median(),
+                    "Max Score": result_df["context_utilization"].max(),
+                    "Min Score": result_df["context_utilization"].min(),
+                    "Standard Deviation": result_df["context_utilization"].std(),
+                    "Count": result_df.shape[0],
+                }
+            ],
+        },
+        fig_histogram,
+        fig_box,
+    )
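Note: for reference, here is a small worked example of the Context Utilization@K formula from the docstring above (the numbers are hypothetical, not produced by the test):

```python
# K = 3 retrieved chunks; the chunks at ranks 1 and 3 are answer-relevant (v_k = 1).
relevance = [1, 0, 1]

hits = 0
weighted_precision = 0.0
for k, v_k in enumerate(relevance, start=1):
    hits += v_k
    if v_k:
        weighted_precision += hits / k  # Precision@k, counted only at relevant ranks

score = weighted_precision / sum(relevance)
print(round(score, 3))  # (1/1 + 2/3) / 2 = 0.833
```

Had the relevant chunks sat at ranks 2 and 3 instead, the same calculation would give (1/2 + 2/3) / 2 ≈ 0.583, which is the rank sensitivity the metric is designed to capture.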
validmind/tests/model_validation/ragas/Faithfulness.py
@@ -8,9 +8,21 @@ import plotly.express as px
 from datasets import Dataset
 
 from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
 
 from .utils import get_ragas_config, get_renamed_columns
 
+try:
+    from ragas import evaluate
+    from ragas.metrics import faithfulness
+except ImportError as e:
+    raise MissingDependencyError(
+        "Missing required package `ragas` for Faithfulness. "
+        "Please run `pip install validmind[llm]` to use LLM tests",
+        required_dependencies=["ragas"],
+        extra="llm",
+    ) from e
+
 
 @tags("ragas", "llm", "rag_performance")
 @tasks("text_qa", "text_generation", "text_summarization")
@@ -78,12 +90,6 @@ def Faithfulness(
     }
     ```
     """
-    try:
-        from ragas import evaluate
-        from ragas.metrics import faithfulness
-    except ImportError:
-        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
-
 
     warnings.filterwarnings(
        "ignore",
validmind/tests/model_validation/ragas/NoiseSensitivity.py (new file)
@@ -0,0 +1,158 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+
+from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
+
+from .utils import get_ragas_config, get_renamed_columns
+
+try:
+    from ragas import evaluate
+    from ragas.metrics import noise_sensitivity_relevant
+except ImportError as e:
+    raise MissingDependencyError(
+        "Missing required package `ragas` for NoiseSensitivity. "
+        "Please run `pip install validmind[llm]` to use LLM tests",
+        required_dependencies=["ragas"],
+        extra="llm",
+    ) from e
+
+
+@tags("ragas", "llm", "rag_performance")
+@tasks("text_qa", "text_generation", "text_summarization")
+def NoiseSensitivity(
+    dataset,
+    answer_column="answer",
+    contexts_column="contexts",
+    ground_truth_column="ground_truth",
+):
+    """
+    Assesses the sensitivity of a Large Language Model (LLM) to noise in retrieved context by measuring how often it
+    generates incorrect responses.
+
+    ### Purpose
+
+    The Noise Sensitivity test aims to measure how sensitive an LLM is to irrelevant or noisy information within the
+    contextual data used to generate its responses. A lower noise sensitivity score suggests better model robustness in
+    generating accurate answers from given contexts.
+
+    ### Test Mechanism
+
+    This test evaluates the model's answers by comparing the claims made in the generated response against the ground
+    truth and the retrieved context. The noise sensitivity score is calculated as:
+
+    $$
+    \\text{noise sensitivity} = {|\\text{Number of incorrect claims in answer}| \\over |\\text{Number of total claims in answer}|}
+    $$
+
+    The formula computes the fraction of incorrect claims to the total claims in the answer, using a dataset where
+    'answer', 'context', and 'ground_truth' columns are specified.
+
+    #### Configuring Columns
+
+    This metric requires the following columns in your dataset:
+
+    - `contexts` (List[str]): A list of text contexts which are retrieved to generate
+      the answer.
+    - `answer` (str): The response generated by the model
+    - `ground_truth` (str): The "correct" answer to the question
+
+    If the above data is not in the appropriate column, you can specify different column
+    names for these fields using the parameters `contexts_column` and `answer_column`.
+
+    For example, if your dataset has this data stored in different columns, you can
+    pass the following parameters:
+    ```python
+    {
+        "contexts_column": "context_info"
+        "answer_column": "my_answer_col",
+    }
+    ```
+
+    If the data is stored as a dictionary in another column, specify the column and key
+    like this:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": f"{pred_col}.contexts",
+        "answer_column": f"{pred_col}.answer",
+    }
+    ```
+
+    For more complex situations, you can use a function to extract the data:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": lambda row: [row[pred_col]["context_message"]],
+        "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+    }
+
+    ### Signs of High Risk
+
+    - High noise sensitivity scores across multiple samples.
+    - Significant deviation between mean and median noise sensitivity scores.
+    - High standard deviation indicating inconsistency in the model's performance.
+
+    ### Strengths
+
+    - Provides a quantitative measure of how well the LLM handles noisy or irrelevant context.
+    - Easy integration and configuration using column parameters.
+    - Utilizes both histogram and box plot visualizations to analyze score distribution.
+
+    ### Limitations
+
+    - Requires accurate ground truth that aligns with the generated answers.
+    - Assumes the context provided is sufficiently granular to assess noise sensitivity.
+    - Primarily applicable to tasks like text QA, text generation, and text summarization where contextual relevance is
+      critical.
+    """
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "answer": answer_column,
+        "contexts": contexts_column,
+        "ground_truth": ground_truth_column,
+    }
+
+    df = get_renamed_columns(dataset._df, required_columns)
+
+    result_df = evaluate(
+        Dataset.from_pandas(df),
+        metrics=[noise_sensitivity_relevant],
+        **get_ragas_config(),
+    ).to_pandas()
+
+    fig_histogram = px.histogram(
+        x=result_df["noise_sensitivity_relevant"].to_list(), nbins=10
+    )
+    fig_box = px.box(x=result_df["noise_sensitivity_relevant"].to_list())
+
+    return (
+        {
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["contexts", "answer", "ground_truth", "noise_sensitivity_relevant"]
+            # ],
+            "Aggregate Scores": [
+                {
+                    "Mean Score": result_df["noise_sensitivity_relevant"].mean(),
+                    "Median Score": result_df["noise_sensitivity_relevant"].median(),
+                    "Max Score": result_df["noise_sensitivity_relevant"].max(),
+                    "Min Score": result_df["noise_sensitivity_relevant"].min(),
+                    "Standard Deviation": result_df["noise_sensitivity_relevant"].std(),
+                    "Count": result_df.shape[0],
+                }
+            ],
+        },
+        fig_histogram,
+        fig_box,
+    )
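Note: a possible way to invoke the new test with renamed columns, assuming the standard `validmind.tests.run_test` entry point and an already-initialized ValidMind dataset object (both outside the scope of this diff; column names here are hypothetical):

```python
import validmind as vm

# `vm_dataset` is assumed to be a ValidMind dataset whose dataframe stores the
# retrieved contexts, generated answer, and reference answer under custom names.
result = vm.tests.run_test(
    "validmind.model_validation.ragas.NoiseSensitivity",
    inputs={"dataset": vm_dataset},
    params={
        "contexts_column": "retrieved_contexts",
        "answer_column": "generated_answer",
        "ground_truth_column": "reference_answer",
    },
)
result.log()  # optionally push the result to the ValidMind platform
```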
validmind/tests/model_validation/sklearn/FeatureImportance.py
@@ -81,9 +81,9 @@ def FeatureImportance(dataset, model, num_features=3):
     # Dynamically add feature columns to the result
     for i in range(num_features):
         if i < len(top_features):
-            result[f"Feature {i + 1}"] = (
-                f"[{top_features[i][0]}; {top_features[i][1]:.4f}]"
-            )
+            result[
+                f"Feature {i + 1}"
+            ] = f"[{top_features[i][0]}; {top_features[i][1]:.4f}]"
         else:
             result[f"Feature {i + 1}"] = None
 
validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py
@@ -109,7 +109,7 @@ class PermutationFeatureImportance(Metric):
             )
         )
         fig.update_layout(
-            title_text="Permutation Importances (train set)",
+            title_text="Permutation Importances",
            yaxis=dict(
                tickmode="linear",  # set tick mode to linear
                dtick=1,  # set interval between ticks
validmind/tests/model_validation/sklearn/RegressionR2Square.py
@@ -3,11 +3,10 @@
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
 import pandas as pd
-
 from sklearn import metrics
 
-from validmind.tests.model_validation.statsmodels.statsutils import adj_r2_score
 from validmind import tags, tasks
+from validmind.tests.model_validation.statsmodels.statsutils import adj_r2_score
 
 
 @tags("sklearn", "model_performance")