validmind 2.1.1__py3-none-any.whl → 2.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. validmind/__version__.py +1 -1
  2. validmind/ai.py +3 -3
  3. validmind/api_client.py +2 -3
  4. validmind/client.py +68 -25
  5. validmind/datasets/llm/rag/__init__.py +11 -0
  6. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
  7. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
  8. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
  9. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
  10. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
  11. validmind/datasets/llm/rag/rfp.py +41 -0
  12. validmind/html_templates/__init__.py +0 -0
  13. validmind/html_templates/content_blocks.py +89 -14
  14. validmind/models/__init__.py +7 -4
  15. validmind/models/foundation.py +8 -34
  16. validmind/models/function.py +51 -0
  17. validmind/models/huggingface.py +16 -46
  18. validmind/models/metadata.py +42 -0
  19. validmind/models/pipeline.py +66 -0
  20. validmind/models/pytorch.py +8 -42
  21. validmind/models/r_model.py +33 -82
  22. validmind/models/sklearn.py +39 -38
  23. validmind/template.py +8 -26
  24. validmind/tests/__init__.py +43 -20
  25. validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
  26. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
  27. validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
  28. validmind/tests/data_validation/Duplicates.py +1 -1
  29. validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
  30. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
  31. validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
  32. validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
  33. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
  34. validmind/tests/data_validation/nlp/Punctuations.py +11 -12
  35. validmind/tests/data_validation/nlp/Sentiment.py +57 -0
  36. validmind/tests/data_validation/nlp/Toxicity.py +45 -0
  37. validmind/tests/decorator.py +2 -2
  38. validmind/tests/model_validation/BertScore.py +100 -98
  39. validmind/tests/model_validation/BleuScore.py +93 -64
  40. validmind/tests/model_validation/ContextualRecall.py +74 -91
  41. validmind/tests/model_validation/MeteorScore.py +86 -74
  42. validmind/tests/model_validation/RegardScore.py +103 -121
  43. validmind/tests/model_validation/RougeScore.py +118 -0
  44. validmind/tests/model_validation/TokenDisparity.py +84 -121
  45. validmind/tests/model_validation/ToxicityScore.py +109 -123
  46. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
  47. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
  48. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
  49. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
  50. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
  51. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
  52. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
  53. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
  54. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
  55. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
  56. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
  57. validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
  58. validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
  59. validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
  60. validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
  61. validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
  62. validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
  63. validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
  64. validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
  65. validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
  66. validmind/tests/model_validation/ragas/utils.py +66 -0
  67. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
  68. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
  69. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
  70. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
  71. validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
  72. validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
  73. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
  74. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -11
  75. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
  76. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
  77. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
  78. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
  79. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
  80. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
  81. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
  82. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
  83. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
  84. validmind/unit_metrics/__init__.py +26 -49
  85. validmind/unit_metrics/composite.py +5 -1
  86. validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
  87. validmind/utils.py +56 -6
  88. validmind/vm_models/__init__.py +1 -1
  89. validmind/vm_models/dataset/__init__.py +7 -0
  90. validmind/vm_models/dataset/dataset.py +558 -0
  91. validmind/vm_models/dataset/utils.py +146 -0
  92. validmind/vm_models/model.py +97 -72
  93. validmind/vm_models/test/result_wrapper.py +61 -24
  94. validmind/vm_models/test_context.py +1 -1
  95. validmind/vm_models/test_suite/summary.py +3 -4
  96. {validmind-2.1.1.dist-info → validmind-2.2.2.dist-info}/METADATA +5 -3
  97. {validmind-2.1.1.dist-info → validmind-2.2.2.dist-info}/RECORD +100 -75
  98. validmind/models/catboost.py +0 -33
  99. validmind/models/statsmodels.py +0 -50
  100. validmind/models/xgboost.py +0 -30
  101. validmind/tests/model_validation/BertScoreAggregate.py +0 -90
  102. validmind/tests/model_validation/RegardHistogram.py +0 -148
  103. validmind/tests/model_validation/RougeMetrics.py +0 -147
  104. validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
  105. validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
  106. validmind/tests/model_validation/ToxicityHistogram.py +0 -136
  107. validmind/vm_models/dataset.py +0 -1303
  108. {validmind-2.1.1.dist-info → validmind-2.2.2.dist-info}/LICENSE +0 -0
  109. {validmind-2.1.1.dist-info → validmind-2.2.2.dist-info}/WHEEL +0 -0
  110. {validmind-2.1.1.dist-info → validmind-2.2.2.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/ragas/AnswerCorrectness.py
@@ -0,0 +1,131 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ import warnings
+
+ import plotly.express as px
+ from datasets import Dataset
+ from ragas import evaluate
+ from ragas.metrics import answer_correctness
+
+ from validmind import tags, tasks
+
+ from .utils import get_renamed_columns
+
+
+ @tags("ragas", "llm")
+ @tasks("text_qa", "text_generation", "text_summarization")
+ def AnswerCorrectness(
+     dataset,
+     question_column="question",
+     answer_column="answer",
+     ground_truth_column="ground_truth",
+ ):
+     """
+     Evaluates the correctness of answers in a dataset with respect to the provided ground
+     truths and visualizes the results in a histogram.
+
+     The assessment of Answer Correctness involves gauging the accuracy of the generated
+     answer when compared to the ground truth. This evaluation relies on the `ground_truth`
+     and the `answer`, with scores ranging from 0 to 1. A higher score indicates a closer
+     alignment between the generated answer and the ground truth, signifying better
+     correctness.
+
+     Answer correctness encompasses two critical aspects: semantic similarity between the
+     generated answer and the ground truth, as well as factual similarity. These aspects
+     are combined using a weighted scheme to formulate the answer correctness score. Users
+     also have the option to employ a `threshold` value to round the resulting score to
+     a binary value (0 or 1) based on the threshold.
+
+     Factual correctness quantifies the factual overlap between the generated answer and
+     the ground truth answer. This is done using the concepts of:
+
+     - TP (True Positive): Facts or statements that are present in both the ground truth
+       and the generated answer.
+     - FP (False Positive): Facts or statements that are present in the generated answer
+       but not in the ground truth.
+     - FN (False Negative): Facts or statements that are present in the ground truth but
+       not in the generated answer.
+
+     ### Configuring Columns
+
+     This metric requires specific columns to be present in the dataset:
+     - `question` (str): The text prompt or query that was input into the model.
+     - `answer` (str): The text response generated by the model.
+     - `ground_truth` (str): The ground truth answer that the generated answer is compared
+       against.
+
+     If the above data is not in the appropriate column, you can specify different column
+     names for these fields using the parameters `question_column`, `answer_column`, and
+     `ground_truth_column`.
+
+     For example, if your dataset has this data stored in different columns, you can
+     pass the following parameters:
+     ```python
+     params = {
+         "question_column": "input_text",
+         "answer_column": "output_text",
+         "ground_truth_column": "human_answer",
+     }
+     ```
+
+     If the answer is stored as a dictionary in another column, specify the
+     column and key like this:
+     ```python
+     pred_col = dataset.prediction_column(model)
+     params = {
+         "answer_column": f"{pred_col}.generated_answer",
+         "ground_truth_column": "my_ground_truth_col",
+     }
+     ```
+
+     For more complex data structures, you can use a function to extract the answers:
+     ```python
+     pred_col = dataset.prediction_column(model)
+     params = {
+         "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+         "ground_truth_column": "my_ground_truth_col",
+     }
+     ```
+     """
+     warnings.filterwarnings(
+         "ignore",
+         category=FutureWarning,
+         message="promote has been superseded by promote_options='default'.",
+     )
+
+     required_columns = {
+         "question": question_column,
+         "answer": answer_column,
+         "ground_truth": ground_truth_column,
+     }
+
+     df = get_renamed_columns(dataset.df, required_columns)
+
+     result_df = evaluate(
+         Dataset.from_pandas(df), metrics=[answer_correctness]
+     ).to_pandas()
+
+     fig_histogram = px.histogram(x=result_df["answer_correctness"].to_list(), nbins=10)
+     fig_box = px.box(x=result_df["answer_correctness"].to_list())
+
+     return (
+         {
+             "Scores": result_df[
+                 ["question", "answer", "ground_truth", "answer_correctness"]
+             ],
+             "Aggregate Scores": [
+                 {
+                     "Mean Score": result_df["answer_correctness"].mean(),
+                     "Median Score": result_df["answer_correctness"].median(),
+                     "Max Score": result_df["answer_correctness"].max(),
+                     "Min Score": result_df["answer_correctness"].min(),
+                     "Standard Deviation": result_df["answer_correctness"].std(),
+                     "Count": len(result_df),
+                 }
+             ],
+         },
+         fig_histogram,
+         fig_box,
+     )
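
To make the weighted scheme described in `AnswerCorrectness` concrete, here is a minimal sketch of how a factual F1 score built from the TP/FP/FN counts above could be blended with a semantic similarity score. The statement counts and the 0.75/0.25 weighting are illustrative assumptions, not the exact values or defaults used by `ragas`.

```python
# Illustrative sketch only: the counts and the 0.75/0.25 weights are assumptions,
# not the exact values or defaults used by ragas.

def factual_f1(tp: int, fp: int, fn: int) -> float:
    """F1 over factual statements, using TP/FP/FN as defined in the docstring."""
    if tp == 0:
        return 0.0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)

# Example: 3 shared facts, 1 extra fact in the answer, 1 ground-truth fact missed
f1 = factual_f1(tp=3, fp=1, fn=1)  # 0.75
semantic_similarity = 0.90         # assumed embedding-similarity component

# Weighted combination of the factual and semantic components (weights assumed)
answer_correctness = 0.75 * f1 + 0.25 * semantic_similarity
print(answer_correctness)  # ≈ 0.79
```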
validmind/tests/model_validation/ragas/AnswerRelevance.py
@@ -0,0 +1,134 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ import warnings
+
+ import plotly.express as px
+ from datasets import Dataset
+ from ragas import evaluate
+ from ragas.metrics import answer_relevancy
+
+ from validmind import tags, tasks
+
+ from .utils import get_renamed_columns
+
+
+ @tags("ragas", "llm", "rag_performance")
+ @tasks("text_qa", "text_generation", "text_summarization")
+ def AnswerRelevance(
+     dataset,
+     question_column="question",
+     contexts_column="contexts",
+     answer_column="answer",
+ ):
+     """
+     Assesses how pertinent the generated answer is to the given prompt.
+
+     The evaluation metric, Answer Relevancy, focuses on assessing how pertinent the
+     generated answer is to the given prompt. A lower score is assigned to answers that
+     are incomplete or contain redundant information, and higher scores indicate better
+     relevancy. This metric is computed using the `question`, the `contexts`, and the
+     `answer`.
+
+     The Answer Relevancy is defined as the mean cosine similarity of the original
+     `question` to a number of artificial questions, which are generated (reverse-engineered)
+     based on the `answer`:
+
+     $$
+     \\text{answer relevancy} = \\frac{1}{N} \\sum_{i=1}^{N} cos(E_{g_i}, E_o)
+     $$
+     $$
+     \\text{answer relevancy} = \\frac{1}{N} \\sum_{i=1}^{N} \\frac{E_{g_i} \\cdot E_o}{\\|E_{g_i}\\|\\|E_o\\|}
+     $$
+
+     Where:
+     - $E_{g_i}$ is the embedding of the generated question $i$.
+     - $E_o$ is the embedding of the original question.
+     - $N$ is the number of generated questions (3 by default).
+
+     **Note**: *This is a reference-free metric, meaning that it does not require a
+     `ground_truth` answer to compare against. A similar metric that does evaluate the
+     correctness of a generated answer with respect to a `ground_truth` answer is
+     `validmind.model_validation.ragas.AnswerCorrectness`.*
+
+     ### Configuring Columns
+
+     This metric requires the following columns in your dataset:
+     - `question` (str): The text query that was input into the model.
+     - `contexts` (List[str]): Any contextual information retrieved by the model before
+       generating an answer.
+     - `answer` (str): The response generated by the model.
+
+     If the above data is not in the appropriate column, you can specify different column
+     names for these fields using the parameters `question_column`, `answer_column`, and
+     `contexts_column`.
+
+     For example, if your dataset has this data stored in different columns, you can
+     pass the following parameters:
+     ```python
+     params = {
+         "question_column": "input_text",
+         "answer_column": "output_text",
+         "contexts_column": "context_info",
+     }
+     ```
+
+     If the answer and contexts are stored as a dictionary in another column, specify the
+     column and key like this:
+     ```python
+     pred_col = dataset.prediction_column(model)
+     params = {
+         "answer_column": f"{pred_col}.generated_answer",
+         "contexts_column": f"{pred_col}.contexts",
+     }
+     ```
+
+     For more complex data structures, you can use a function to extract the answers:
+     ```python
+     pred_col = dataset.prediction_column(model)
+     params = {
+         "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+         "contexts_column": lambda row: [row[pred_col]["context_message"]],
+     }
+     ```
+     """
+     warnings.filterwarnings(
+         "ignore",
+         category=FutureWarning,
+         message="promote has been superseded by promote_options='default'.",
+     )
+
+     required_columns = {
+         "question": question_column,
+         "answer": answer_column,
+         "contexts": contexts_column,
+     }
+
+     df = get_renamed_columns(dataset.df, required_columns)
+
+     result_df = evaluate(
+         Dataset.from_pandas(df),
+         metrics=[answer_relevancy],
+     ).to_pandas()
+
+     fig_histogram = px.histogram(x=result_df["answer_relevancy"].to_list(), nbins=10)
+     fig_box = px.box(x=result_df["answer_relevancy"].to_list())
+
+     return (
+         {
+             "Scores": result_df[["question", "contexts", "answer", "answer_relevancy"]],
+             "Aggregate Scores": [
+                 {
+                     "Mean Score": result_df["answer_relevancy"].mean(),
+                     "Median Score": result_df["answer_relevancy"].median(),
+                     "Max Score": result_df["answer_relevancy"].max(),
+                     "Min Score": result_df["answer_relevancy"].min(),
+                     "Standard Deviation": result_df["answer_relevancy"].std(),
+                     "Count": len(result_df),
+                 }
+             ],
+         },
+         fig_histogram,
+         fig_box,
+     )
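
The relevancy formula in the docstring above reduces to an average of cosine similarities between embedding vectors. A minimal sketch with toy vectors follows; the embeddings are made up for illustration, whereas in practice they come from the configured embedding model.

```python
import numpy as np


def cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity between two embedding vectors."""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


# Toy embeddings: E_o is the original question, E_g holds N=3 reverse-engineered questions
E_o = np.array([0.1, 0.8, 0.3])
E_g = [
    np.array([0.2, 0.7, 0.4]),
    np.array([0.0, 0.9, 0.2]),
    np.array([0.3, 0.6, 0.5]),
]

# Mean cosine similarity of the generated questions to the original question
answer_relevancy = float(np.mean([cosine(e, E_o) for e in E_g]))
print(answer_relevancy)  # close to 1 when the answer stays on-topic
```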
validmind/tests/model_validation/ragas/AnswerSimilarity.py
@@ -0,0 +1,119 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ import warnings
+
+ import plotly.express as px
+ from datasets import Dataset
+ from ragas import evaluate
+ from ragas.metrics import answer_similarity
+
+ from validmind import tags, tasks
+
+ from .utils import get_renamed_columns
+
+
+ @tags("ragas", "llm")
+ @tasks("text_qa", "text_generation", "text_summarization")
+ def AnswerSimilarity(
+     dataset,
+     answer_column="answer",
+     ground_truth_column="ground_truth",
+ ):
+     """
+     Calculates the semantic similarity between generated answers and ground truths.
+
+     The concept of Answer Semantic Similarity pertains to the assessment of the semantic
+     resemblance between the generated answer and the ground truth. This evaluation is
+     based on the `ground_truth` and the `answer`, with values falling within the range
+     of 0 to 1. A higher score signifies a better alignment between the generated answer
+     and the ground truth.
+
+     Measuring the semantic similarity between answers can offer valuable insights into
+     the quality of the generated response. This evaluation utilizes a cross-encoder
+     model to calculate the semantic similarity score.
+
+     See this paper for more details: https://arxiv.org/pdf/2108.06130.pdf
+
+     The following steps are involved in computing the answer similarity score:
+     1. Vectorize the ground truth answer using the specified embedding model.
+     2. Vectorize the generated answer using the same embedding model.
+     3. Compute the cosine similarity between the two vectors.
+
+     ### Configuring Columns
+
+     This metric requires the following columns in your dataset:
+     - `answer` (str): The text response generated by the model.
+     - `ground_truth` (str): The ground truth answer that the generated answer is compared
+       against.
+
+     If the above data is not in the appropriate column, you can specify different column
+     names for these fields using the parameters `answer_column` and `ground_truth_column`.
+
+     For example, if your dataset has this data stored in different columns, you can
+     pass the following parameters:
+     ```python
+     params = {
+         "answer_column": "llm_output_col",
+         "ground_truth_column": "my_ground_truth_col",
+     }
+     ```
+
+     If the answer is stored as a dictionary in another column, specify the column and key
+     like this:
+     ```python
+     pred_col = dataset.prediction_column(model)
+     params = {
+         "answer_column": f"{pred_col}.generated_answer",
+         "ground_truth_column": "my_ground_truth_col",
+     }
+     ```
+
+     For more complex situations, you can use a function to extract the data:
+     ```python
+     pred_col = dataset.prediction_column(model)
+     params = {
+         "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+         "ground_truth_column": "my_ground_truth_col",
+     }
+     ```
+     """
+     warnings.filterwarnings(
+         "ignore",
+         category=FutureWarning,
+         message="promote has been superseded by promote_options='default'.",
+     )
+
+     required_columns = {
+         "answer": answer_column,
+         "ground_truth": ground_truth_column,
+     }
+
+     df = get_renamed_columns(dataset.df, required_columns)
+
+     result_df = evaluate(
+         Dataset.from_pandas(df),
+         metrics=[answer_similarity],
+     ).to_pandas()
+
+     fig_histogram = px.histogram(x=result_df["answer_similarity"].to_list(), nbins=10)
+     fig_box = px.box(x=result_df["answer_similarity"].to_list())
+
+     return (
+         {
+             "Scores": result_df[["answer", "ground_truth", "answer_similarity"]],
+             "Aggregate Scores": [
+                 {
+                     "Mean Score": result_df["answer_similarity"].mean(),
+                     "Median Score": result_df["answer_similarity"].median(),
+                     "Max Score": result_df["answer_similarity"].max(),
+                     "Min Score": result_df["answer_similarity"].min(),
+                     "Standard Deviation": result_df["answer_similarity"].std(),
+                     "Count": len(result_df),
+                 }
+             ],
+         },
+         fig_histogram,
+         fig_box,
+     )
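
These ragas tests are run through the ValidMind test harness using the same `run_test` pattern shown in the `AspectCritique` docstring below. A hedged sketch for `AnswerSimilarity`, where `vm_dataset` is assumed to be an already-initialized ValidMind dataset and the column names are placeholders:

```python
# Sketch only: `vm_dataset` is assumed to be an already-initialized ValidMind dataset
# whose underlying DataFrame contains `llm_output_col` and `my_ground_truth_col`.
from validmind.tests import run_test

run_test(
    "validmind.model_validation.ragas.AnswerSimilarity",
    inputs={"dataset": vm_dataset},
    params={
        "answer_column": "llm_output_col",
        "ground_truth_column": "my_ground_truth_col",
    },
)
```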
validmind/tests/model_validation/ragas/AspectCritique.py
@@ -0,0 +1,167 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ import warnings
+
+ import plotly.express as px
+ from datasets import Dataset
+ from ragas import evaluate
+ from ragas.metrics.critique import AspectCritique as _AspectCritique
+ from ragas.metrics.critique import (
+     coherence,
+     conciseness,
+     correctness,
+     harmfulness,
+     maliciousness,
+ )
+
+ from validmind import tags, tasks
+
+ from .utils import get_renamed_columns
+
+ aspect_map = {
+     "coherence": coherence,
+     "conciseness": conciseness,
+     "correctness": correctness,
+     "harmfulness": harmfulness,
+     "maliciousness": maliciousness,
+ }
+
+
+ @tags("ragas", "llm", "qualitative")
+ @tasks("text_summarization", "text_generation", "text_qa")
+ def AspectCritique(
+     dataset,
+     question_column="question",
+     answer_column="answer",
+     contexts_column="contexts",
+     aspects: list = [
+         "coherence",
+         "conciseness",
+         "correctness",
+         "harmfulness",
+         "maliciousness",
+     ],
+     additional_aspects: list = [],
+ ):
+     """
+     Evaluates generations against the following aspects: harmfulness, maliciousness,
+     coherence, correctness, and conciseness.
+
+     ### Overview:
+
+     This is designed to assess submissions against predefined and user-defined "aspects".
+     For each aspect, a judge LLM is prompted to critique a piece of generated text based
+     on a description of the aspect. The output of this evaluation is a binary (0/1 = yes/no)
+     score that indicates whether the submission aligns with the defined aspect or not.
+
+     ### Inputs and Outputs:
+
+     The input to this metric is a dataset containing the input `question` (prompt to the LLM)
+     and the `answer` (text generated by the LLM). Any retrieved `contexts` can also be
+     included to enhance the evaluation.
+
+     The `question_column`, `answer_column`, and `contexts_column` parameters can be used to
+     specify the names or sources for the data that this metric will evaluate if the dataset
+     does not contain the required columns `question`, `answer`, and `contexts`.
+
+     By default, the aspects evaluated are harmfulness, maliciousness, coherence,
+     correctness, and conciseness. To change the aspects evaluated, the `aspects` parameter
+     can be set to a list containing any of these aspects.
+
+     To add custom aspects, the `additional_aspects` parameter can be passed as a list
+     of tuples where each tuple contains the aspect name and a description of the aspect
+     that the judge LLM will use to critique the submission.
+
+     The output of this metric is a table of scores for each aspect where the aspect score
+     is the number of "yes" scores divided by the total number of submissions:
+     $$
+     \\text{aspect score} = \\frac{\\text{number of "yes" scores}}{\\text{total number of submissions}}
+     $$
+
+     ### Examples:
+
+     - **Mapping to Required Columns:** If the dataset does not contain the columns required
+       to run this metric (i.e., `question`, `answer`, and `contexts`), the column parameters
+       can be used to map existing columns, keys, or row-level functions to the required fields:
+
+     ```python
+     pred_col = my_vm_dataset.prediction_column(my_vm_model)
+     run_test(
+         "validmind.model_validation.ragas.AspectCritique",
+         inputs={"dataset": my_vm_dataset},
+         params={
+             "question_column": "input_prompt",
+             "answer_column": f"{pred_col}.llm_output",
+             "contexts_column": lambda row: [row[pred_col]["context_message"]],
+         },
+     )
+     ```
+
+     - **Custom Aspects:** To evaluate custom aspects, the `additional_aspects` parameter can
+       be set to a list of tuples where each tuple contains the aspect name and a description
+       of the aspect that the judge LLM will use to critique the submission. For example, to
+       evaluate whether the LLM-generated text has a "professional tone", the `additional_aspects`
+       parameter can be set like this:
+
+     ```python
+     run_test(
+         "validmind.model_validation.ragas.AspectCritique",
+         inputs={"dataset": my_vm_dataset},
+         params={
+             "additional_aspects": [
+                 ("professionalism", "Does the text have a professional tone?"),
+             ],
+         },
+     )
+     ```
+     """
+     warnings.filterwarnings(
+         "ignore",
+         category=FutureWarning,
+         message="promote has been superseded by promote_options='default'.",
+     )
+
+     required_columns = {
+         "question": question_column,
+         "answer": answer_column,
+         "contexts": contexts_column,
+     }
+
+     df = get_renamed_columns(dataset.df, required_columns)
+
+     built_in_aspects = [aspect_map[aspect] for aspect in aspects]
+     custom_aspects = [
+         _AspectCritique(name=name, definition=description)
+         for name, description in additional_aspects
+     ]
+     all_aspects = [*built_in_aspects, *custom_aspects]
+
+     result_df = evaluate(Dataset.from_pandas(df), metrics=all_aspects).to_pandas()
+
+     df_melted = result_df.melt(
+         id_vars=["question", "answer", "contexts"],
+         value_vars=[aspect.name for aspect in all_aspects],
+         var_name="Metric",
+         value_name="Result",
+     )
+     df_counts = df_melted.groupby(["Metric", "Result"]).size().reset_index(name="Count")
+     df_counts["Result"] = df_counts["Result"].map({0: "Fail", 1: "Pass"})
+
+     fig = px.bar(
+         df_counts,
+         x="Metric",
+         y="Count",
+         color="Result",
+         color_discrete_map={"Fail": "red", "Pass": "green"},
+         labels={"Count": "Pass vs Fail Count", "Metric": "Aspect Name"},
+         barmode="group",
+         title="Aspect Critique Results",
+     )
+
+     return {
+         "Aspect Scores": [
+             {"Aspect": aspect, "Score": result_df[aspect].mean()}
+             for aspect in aspects + [aspect.name for aspect in custom_aspects]
+         ]
+     }, fig
validmind/tests/model_validation/ragas/ContextEntityRecall.py
@@ -0,0 +1,133 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ import warnings
+
+ import plotly.express as px
+ from datasets import Dataset
+ from ragas import evaluate
+ from ragas.metrics import context_entity_recall
+
+ from validmind import tags, tasks
+
+ from .utils import get_renamed_columns
+
+
+ @tags("ragas", "llm", "retrieval_performance")
+ @tasks("text_qa", "text_generation", "text_summarization")
+ def ContextEntityRecall(
+     dataset,
+     contexts_column: str = "contexts",
+     ground_truth_column: str = "ground_truth",
+ ):
+     """
+     Evaluates the context entity recall for dataset entries and visualizes the results.
+
+     ### Overview
+
+     This metric gives the measure of recall of the retrieved context, based on the
+     number of entities present in both `ground_truths` and `contexts` relative to the
+     number of entities present in the `ground_truths` alone. Simply put, it is a measure
+     of what fraction of entities are recalled from `ground_truths`. This metric is
+     useful in fact-based use cases like tourism help desks or historical QA. It
+     can help evaluate the retrieval mechanism for entities, based on comparison
+     with entities present in `ground_truths`, because in cases where entities matter,
+     we need the `contexts` which cover them.
+
+     ### Formula
+
+     To compute this metric, we use two sets, $GE$ and $CE$, representing the set of
+     entities present in `ground_truths` and the set of entities present in `contexts`,
+     respectively. We then take the number of elements in the intersection of these sets and
+     divide it by the number of elements present in $GE$, given by the formula:
+
+     $$
+     \\text{context entity recall} = \\frac{| CE \\cap GE |}{| GE |}
+     $$
+
+     ### Configuring Columns
+
+     This metric requires the following columns in your dataset:
+     - `contexts` (List[str]): A list of text contexts which will be evaluated to make
+       sure they contain the entities present in the ground truth.
+     - `ground_truth` (str): The ground truth text from which the entities will be
+       extracted and compared with the entities in the `contexts`.
+
+     If the above data is not in the appropriate column, you can specify different column
+     names for these fields using the parameters `contexts_column` and `ground_truth_column`.
+
+     For example, if your dataset has this data stored in different columns, you can
+     pass the following parameters:
+     ```python
+     params = {
+         "contexts_column": "context_info",
+         "ground_truth_column": "my_ground_truth_col",
+     }
+     ```
+
+     If the data is stored as a dictionary in another column, specify the column and key
+     like this:
+     ```python
+     pred_col = dataset.prediction_column(model)
+     params = {
+         "contexts_column": f"{pred_col}.contexts",
+         "ground_truth_column": "my_ground_truth_col",
+     }
+     ```
+
+     For more complex situations, you can use a function to extract the data:
+     ```python
+     pred_col = dataset.prediction_column(model)
+     params = {
+         "contexts_column": lambda row: [row[pred_col]["context_message"]],
+         "ground_truth_column": "my_ground_truth_col",
+     }
+     ```
+     """
+     warnings.filterwarnings(
+         "ignore",
+         category=FutureWarning,
+         message="promote has been superseded by promote_options='default'.",
+     )
+
+     required_columns = {
+         "ground_truth": ground_truth_column,
+         "contexts": contexts_column,
+     }
+
+     df = get_renamed_columns(dataset.df, required_columns)
+
+     result_df = evaluate(
+         Dataset.from_pandas(df),
+         metrics=[context_entity_recall],
+     ).to_pandas()
+
+     fig_histogram = px.histogram(
+         x=result_df["context_entity_recall"].to_list(), nbins=10
+     )
+     fig_box = px.box(x=result_df["context_entity_recall"].to_list())
+
+     return (
+         {
+             "Scores": result_df[
+                 [
+                     "contexts",
+                     "ground_truth",
+                     "context_entity_recall",
+                 ]
+             ],
+             "Aggregate Scores": [
+                 {
+                     "Mean Score": result_df["context_entity_recall"].mean(),
+                     "Median Score": result_df["context_entity_recall"].median(),
+                     "Max Score": result_df["context_entity_recall"].max(),
+                     "Min Score": result_df["context_entity_recall"].min(),
+                     "Standard Deviation": result_df["context_entity_recall"].std(),
+                     "Count": len(result_df),
+                 }
+             ],
+         },
+         fig_histogram,
+         fig_box,
+     )
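
The context entity recall formula above is plain set recall over extracted entities. A toy illustration follows; in ragas the entities are extracted from the texts by an LLM, so the hard-coded sets below are stand-ins.

```python
# Toy illustration of |CE ∩ GE| / |GE| with hand-picked entity sets.
# In practice the entities are extracted from the texts by an LLM; these are stand-ins.
ground_truth_entities = {"Eiffel Tower", "Paris", "1889"}       # GE
context_entities = {"Eiffel Tower", "Paris", "Gustave Eiffel"}  # CE

recall = len(context_entities & ground_truth_entities) / len(ground_truth_entities)
print(recall)  # 2 of the 3 ground-truth entities appear in the contexts -> ~0.67
```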