validmind 2.5.25__py3-none-any.whl → 2.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.8.dist-info/METADATA +137 -0
  179. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.25.dist-info/METADATA +0 -118
  196. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/ragas/Faithfulness.py
@@ -14,22 +14,26 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import faithfulness
+    from ragas.metrics import Faithfulness as faithfulness
 except ImportError as e:
-    raise MissingDependencyError(
-        "Missing required package `ragas` for Faithfulness. "
-        "Please run `pip install validmind[llm]` to use LLM tests",
-        required_dependencies=["ragas"],
-        extra="llm",
-    ) from e
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for Faithfulness. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
 
 
 @tags("ragas", "llm", "rag_performance")
 @tasks("text_qa", "text_generation", "text_summarization")
 def Faithfulness(
     dataset,
-    answer_column="answer",
-    contexts_column="contexts",
+    user_input_column="user_input",
+    response_column="response",
+    retrieved_contexts_column="retrieved_contexts",
 ):  # noqa
     """
     Evaluates the faithfulness of the generated answers with respect to retrieved contexts.
@@ -54,20 +58,23 @@ def Faithfulness(
 
     This metric requires the following columns in your dataset:
 
-    - `contexts` (List[str]): A list of text contexts which are retrieved to generate
+    - `user_input` (str): The user input that the model is responding to.
+    - `retrieved_contexts` (List[str]): A list of text contexts which are retrieved to generate
       the answer.
-    - `answer` (str): The response generated by the model which will be evaluated for
+    - `response` (str): The response generated by the model which will be evaluated for
      faithfulness against the given contexts.
 
     If the above data is not in the appropriate column, you can specify different column
-    names for these fields using the parameters `contexts_column` and `answer_column`.
+    names for these fields using the parameters `retrieved_contexts_column` and
+    `response_column`.
 
     For example, if your dataset has this data stored in different columns, you can
     pass the following parameters:
     ```python
     {
-        "contexts_column": "context_info"
-        "answer_column": "my_answer_col",
+        "retrieved_contexts_column": "context_info",
+        "response_column": "my_answer_col",
+        "user_input_column": "user_input",
     }
     ```
 
@@ -76,8 +83,9 @@ def Faithfulness(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "contexts_column": f"{pred_col}.contexts",
-        "answer_column": f"{pred_col}.answer",
+        "retrieved_contexts_column": f"{pred_col}.retrieved_contexts",
+        "response_column": f"{pred_col}.response",
+        "user_input_column": "user_input",
     }
     ```
 
@@ -85,8 +93,9 @@ def Faithfulness(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "contexts_column": lambda row: [row[pred_col]["context_message"]],
-        "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+        "retrieved_contexts_column": lambda row: [row[pred_col]["context_message"]],
+        "response_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+        "user_input_column": "user_input",
     }
     ```
     """
@@ -97,31 +106,34 @@ def Faithfulness(
     )
 
     required_columns = {
-        "answer": answer_column,
-        "contexts": contexts_column,
+        "response": response_column,
+        "retrieved_contexts": retrieved_contexts_column,
+        "user_input": user_input_column,
     }
 
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df), metrics=[faithfulness], **get_ragas_config()
+        Dataset.from_pandas(df), metrics=[faithfulness()], **get_ragas_config()
     ).to_pandas()
 
-    fig_histogram = px.histogram(x=result_df["faithfulness"].to_list(), nbins=10)
-    fig_box = px.box(x=result_df["faithfulness"].to_list())
+    score_column = "faithfulness"
+
+    fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
+    fig_box = px.box(x=result_df[score_column].to_list())
 
     return (
         {
-            # "Scores (will not be uploaded to UI)": result_df[
-            #     ["contexts", "answer", "faithfulness"]
+            # "Scores (will not be uploaded to ValidMind Platform)": result_df[
+            #     ["retrieved_contexts", "response", "faithfulness"]
             # ],
             "Aggregate Scores": [
                 {
-                    "Mean Score": result_df["faithfulness"].mean(),
-                    "Median Score": result_df["faithfulness"].median(),
-                    "Max Score": result_df["faithfulness"].max(),
-                    "Min Score": result_df["faithfulness"].min(),
-                    "Standard Deviation": result_df["faithfulness"].std(),
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
                     "Count": result_df.shape[0],
                 }
            ],
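
For callers, the parameter renames above (`contexts_column` → `retrieved_contexts_column`, `answer_column` → `response_column`, plus the new `user_input_column`) are a breaking change. Below is a minimal migration sketch, not taken from the diff: it assumes `vm.init()` has already been run with project credentials, that `validmind[llm]` is installed with an LLM configured for ragas, and that `vm.init_dataset` / `vm.tests.run_test` behave as in the ValidMind docs; the DataFrame and column names are illustrative only.

```python
import pandas as pd
import validmind as vm  # assumes vm.init(...) was already called with project credentials

# Illustrative RAG evaluation data with non-default column names.
df = pd.DataFrame(
    {
        "query": ["What is the capital of France?"],
        "context_info": [["Paris is the capital of France."]],
        "my_answer_col": ["Paris is the capital of France."],
    }
)

vm_dataset = vm.init_dataset(dataset=df, input_id="rag_eval_ds")

# 2.5.x -> 2.6.x parameter names for this test:
#   contexts_column -> retrieved_contexts_column
#   answer_column   -> response_column
#   (new)              user_input_column
vm.tests.run_test(
    "validmind.model_validation.ragas.Faithfulness",
    inputs={"dataset": vm_dataset},
    params={
        "user_input_column": "query",
        "retrieved_contexts_column": "context_info",
        "response_column": "my_answer_col",
    },
)
```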
validmind/tests/model_validation/ragas/NoiseSensitivity.py
@@ -14,23 +14,30 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import noise_sensitivity_relevant
+    from ragas.metrics import NoiseSensitivity as noise_sensitivity
 except ImportError as e:
-    raise MissingDependencyError(
-        "Missing required package `ragas` for NoiseSensitivity. "
-        "Please run `pip install validmind[llm]` to use LLM tests",
-        required_dependencies=["ragas"],
-        extra="llm",
-    ) from e
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for NoiseSensitivity. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
+
+VALID_FOCUS_VALUES = ["relevant", "irrelevant"]
 
 
 @tags("ragas", "llm", "rag_performance")
 @tasks("text_qa", "text_generation", "text_summarization")
 def NoiseSensitivity(
     dataset,
-    answer_column="answer",
-    contexts_column="contexts",
-    ground_truth_column="ground_truth",
+    response_column="response",
+    retrieved_contexts_column="retrieved_contexts",
+    reference_column="reference",
+    focus="relevant",
+    user_input_column="user_input",
 ):
     """
     Assesses the sensitivity of a Large Language Model (LLM) to noise in retrieved context by measuring how often it
@@ -58,20 +65,22 @@ def NoiseSensitivity(
 
     This metric requires the following columns in your dataset:
 
-    - `contexts` (List[str]): A list of text contexts which are retrieved to generate
+    - `retrieved_contexts` (List[str]): A list of text contexts which are retrieved to generate
      the answer.
-    - `answer` (str): The response generated by the model
-    - `ground_truth` (str): The "correct" answer to the question
-
+    - `response` (str): The response generated by the model
+    - `reference` (str): The "correct" answer to the question
+    - `user_input` (str): The user input question
     If the above data is not in the appropriate column, you can specify different column
-    names for these fields using the parameters `contexts_column` and `answer_column`.
+    names for these fields using the parameters `retrieved_contexts_column` and `response_column`.
 
     For example, if your dataset has this data stored in different columns, you can
     pass the following parameters:
     ```python
     {
-        "contexts_column": "context_info"
-        "answer_column": "my_answer_col",
+        "retrieved_contexts_column": "context_info",
+        "response_column": "my_answer_col",
+        "reference_column": "reference",
+        "user_input_column": "user_input",
     }
     ```
 
@@ -80,8 +89,10 @@ def NoiseSensitivity(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "contexts_column": f"{pred_col}.contexts",
-        "answer_column": f"{pred_col}.answer",
+        "reference_column": "reference",
+        "retrieved_contexts_column": f"{pred_col}.retrieved_contexts",
+        "response_column": f"{pred_col}.response",
+        "user_input_column": "user_input",
     }
     ```
 
@@ -89,8 +100,10 @@ def NoiseSensitivity(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "contexts_column": lambda row: [row[pred_col]["context_message"]],
-        "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+        "reference_column": "reference",
+        "retrieved_contexts_column": lambda row: [row[pred_col]["context_message"]],
+        "response_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+        "user_input_column": "user_input",
     }
 
     ### Signs of High Risk
@@ -118,37 +131,48 @@ def NoiseSensitivity(
         message="promote has been superseded by promote_options='default'.",
     )
 
+    if focus not in VALID_FOCUS_VALUES:
+        raise ValueError(
+            f"Invalid focus parameter: '{focus}'. "
+            f"Must be one of: {VALID_FOCUS_VALUES}"
+        )
+
     required_columns = {
-        "answer": answer_column,
-        "contexts": contexts_column,
-        "ground_truth": ground_truth_column,
+        "response": response_column,
+        "retrieved_contexts": retrieved_contexts_column,
+        "reference": reference_column,
+        "user_input": user_input_column,
     }
 
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
         Dataset.from_pandas(df),
-        metrics=[noise_sensitivity_relevant],
+        metrics=[noise_sensitivity(focus=focus)],
         **get_ragas_config(),
     ).to_pandas()
 
+    score_column = f"noise_sensitivity_{focus}"
+
     fig_histogram = px.histogram(
-        x=result_df["noise_sensitivity_relevant"].to_list(), nbins=10
+        x=result_df[score_column].to_list(),
+        nbins=10,
+        title=f"Noise Sensitivity ({focus})",
+    )
+    fig_box = px.box(
+        x=result_df[score_column].to_list(),
+        title=f"Noise Sensitivity Distribution ({focus})",
     )
-    fig_box = px.box(x=result_df["noise_sensitivity_relevant"].to_list())
 
     return (
         {
-            # "Scores (will not be uploaded to UI)": result_df[
-            #     ["contexts", "answer", "ground_truth", "noise_sensitivity_relevant"]
-            # ],
            "Aggregate Scores": [
                 {
-                    "Mean Score": result_df["noise_sensitivity_relevant"].mean(),
-                    "Median Score": result_df["noise_sensitivity_relevant"].median(),
-                    "Max Score": result_df["noise_sensitivity_relevant"].max(),
-                    "Min Score": result_df["noise_sensitivity_relevant"].min(),
-                    "Standard Deviation": result_df["noise_sensitivity_relevant"].std(),
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
                     "Count": result_df.shape[0],
                 }
            ],
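
In addition to the column renames, `NoiseSensitivity` now takes a `focus` argument that selects the ragas metric variant and determines which score column is read back. The helper below is a standalone sketch of that new guard and naming logic taken from the diff above; the function name `resolve_focus` is ours, not part of the package.

```python
VALID_FOCUS_VALUES = ["relevant", "irrelevant"]


def resolve_focus(focus: str) -> str:
    """Mirror the guard added in NoiseSensitivity and return the score column name."""
    if focus not in VALID_FOCUS_VALUES:
        raise ValueError(
            f"Invalid focus parameter: '{focus}'. Must be one of: {VALID_FOCUS_VALUES}"
        )
    return f"noise_sensitivity_{focus}"


print(resolve_focus("relevant"))    # noise_sensitivity_relevant (the 2.5.x behaviour)
print(resolve_focus("irrelevant"))  # noise_sensitivity_irrelevant (new option in 2.6.x)
```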
validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py}
@@ -14,36 +14,39 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import answer_relevancy
+    from ragas.metrics import ResponseRelevancy as response_relevancy
 except ImportError as e:
-    raise MissingDependencyError(
-        "Missing required package `ragas` for AnswerRelevance. "
-        "Please run `pip install validmind[llm]` to use LLM tests",
-        required_dependencies=["ragas"],
-        extra="llm",
-    ) from e
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for AnswerRelevance. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
 
 
 @tags("ragas", "llm", "rag_performance")
 @tasks("text_qa", "text_generation", "text_summarization")
-def AnswerRelevance(
+def ResponseRelevancy(
     dataset,
-    question_column="question",
-    contexts_column="contexts",
-    answer_column="answer",
+    user_input_column="user_input",
+    retrieved_contexts_column=None,
+    response_column="response",
 ):
     """
     Assesses how pertinent the generated answer is to the given prompt.
 
-    The evaluation metric, Answer Relevancy, focuses on assessing how pertinent the
+    The evaluation metric, Response Relevancy, focuses on assessing how pertinent the
     generated answer is to the given prompt. A lower score is assigned to answers that
     are incomplete or contain redundant information and higher scores indicate better
-    relevancy. This metric is computed using the `question`, the `contexts` and the
-    `answer`.
+    relevancy. This metric is computed using the `user_input`, the `retrieved_contexts`
+    and the `response`.
 
-    The Answer Relevancy is defined as the mean cosine similartiy of the original
-    `question` to a number of artifical questions, which are generated (reverse-engineered)
-    based on the `answer`:
+    The Response Relevancy is defined as the mean cosine similartiy of the original
+    `user_input` to a number of artifical questions, which are generated (reverse-engineered)
+    based on the `response`:
 
     $$
     \\text{answer relevancy} = \\frac{1}{N} \\sum_{i=1}^{N} cos(E_{g_i}, E_o)
@@ -66,10 +69,10 @@ def AnswerRelevance(
 
     This metric requires the following columns in your dataset:
 
-    - `question` (str): The text query that was input into the model.
-    - `contexts` (List[str]): Any contextual information retrieved by the model before
-      generating an answer.
-    - `answer` (str): The response generated by the model.
+    - `user_input` (str): The text query that was input into the model.
+    - `retrieved_contexts` (List[str]): Any contextual information retrieved by the model
+      before generating an answer.
+    - `response` (str): The response generated by the model.
 
     If the above data is not in the appropriate column, you can specify different column
     names for these fields using the parameters `question_column`, `answer_column`, and
@@ -79,9 +82,9 @@ def AnswerRelevance(
     pass the following parameters:
     ```python
     params = {
-        "question_column": "input_text",
-        "answer_column": "output_text",
-        "contexts_column": "context_info"
+        "user_input_column": "input_text",
+        "response_column": "output_text",
+        "retrieved_contexts_column": "context_info"
     }
     ```
 
@@ -90,8 +93,8 @@ def AnswerRelevance(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "answer_column": f"{pred_col}.generated_answer",
-        "contexts_column": f"{pred_col}.contexts",
+        "response_column": f"{pred_col}.generated_answer",
+        "retrieved_contexts_column": f"{pred_col}.contexts",
     }
     ```
 
@@ -99,8 +102,8 @@ def AnswerRelevance(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
-        "contexts_column": lambda row: [row[pred_col]["context_message"]],
+        "response_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+        "retrieved_contexts_column": lambda row: [row[pred_col]["context_message"]],
     }
     ```
     """
@@ -111,32 +114,40 @@ def AnswerRelevance(
     )
 
     required_columns = {
-        "question": question_column,
-        "answer": answer_column,
-        "contexts": contexts_column,
+        "user_input": user_input_column,
+        "response": response_column,
     }
 
+    if retrieved_contexts_column:
+        required_columns["retrieved_contexts"] = retrieved_contexts_column
+
     df = get_renamed_columns(dataset._df, required_columns)
 
+    metrics = [response_relevancy()]
+
     result_df = evaluate(
-        Dataset.from_pandas(df), metrics=[answer_relevancy], **get_ragas_config()
+        Dataset.from_pandas(df),
+        metrics=metrics,
+        **get_ragas_config(),
     ).to_pandas()
 
-    fig_histogram = px.histogram(x=result_df["answer_relevancy"].to_list(), nbins=10)
-    fig_box = px.box(x=result_df["answer_relevancy"].to_list())
+    score_column = "answer_relevancy"
+
+    fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
+    fig_box = px.box(x=result_df[score_column].to_list())
 
     return (
        {
-            # "Scores (will not be uploaded to UI)": result_df[
-            #     ["question", "contexts", "answer", "answer_relevancy"]
+            # "Scores (will not be uploaded to ValidMind Platform)": result_df[
+            #     ["user_input", "retrieved_contexts", "response", "answer_relevancy"]
            # ],
            "Aggregate Scores": [
                {
-                    "Mean Score": result_df["answer_relevancy"].mean(),
-                    "Median Score": result_df["answer_relevancy"].median(),
-                    "Max Score": result_df["answer_relevancy"].max(),
-                    "Min Score": result_df["answer_relevancy"].min(),
-                    "Standard Deviation": result_df["answer_relevancy"].std(),
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
                     "Count": result_df.shape[0],
                }
            ],
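
Two caller-visible changes here: the test id moves with the file rename, and `retrieved_contexts_column` now defaults to `None`, so the contexts mapping is only applied when supplied. A hedged invocation sketch, reusing the hypothetical `vm_dataset` and column names from the Faithfulness example above and assuming the same `run_test` interface:

```python
import validmind as vm

vm.tests.run_test(
    # 2.5.x id: "validmind.model_validation.ragas.AnswerRelevance"
    "validmind.model_validation.ragas.ResponseRelevancy",
    inputs={"dataset": vm_dataset},  # the VMDataset initialized in the Faithfulness sketch
    params={
        "user_input_column": "query",
        "response_column": "my_answer_col",
        # "retrieved_contexts_column" may now be omitted entirely
    },
)
```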
validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py}
@@ -14,30 +14,33 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import answer_similarity
+    from ragas.metrics import SemanticSimilarity as semantic_similarity
 except ImportError as e:
-    raise MissingDependencyError(
-        "Missing required package `ragas` for AnswerSimilarity. "
-        "Please run `pip install validmind[llm]` to use LLM tests",
-        required_dependencies=["ragas"],
-        extra="llm",
-    ) from e
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for AnswerSimilarity. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
 
 
 @tags("ragas", "llm")
 @tasks("text_qa", "text_generation", "text_summarization")
-def AnswerSimilarity(
+def SemanticSimilarity(
     dataset,
-    answer_column="answer",
-    ground_truth_column="ground_truth",
+    response_column="response",
+    reference_column="reference",
 ):
     """
-    Calculates the semantic similarity between generated answers and ground truths
+    Calculates the semantic similarity between generated responses and ground truths
 
     The concept of Answer Semantic Similarity pertains to the assessment of the semantic
     resemblance between the generated answer and the ground truth. This evaluation is
-    based on the `ground_truth` and the `answer`, with values falling within the range
-    of 0 to 1. A higher score signifies a better alignment between the generated answer
+    based on the `reference` and the `response`, with values falling within the range
+    of 0 to 1. A higher score signifies a better alignment between the generated response
     and the ground truth.
 
     Measuring the semantic similarity between answers can offer valuable insights into
@@ -55,19 +58,19 @@ def AnswerSimilarity(
 
     This metric requires the following columns in your dataset:
 
-    - `answer` (str): The text response generated by the model.
-    - `ground_truth` (str): The ground truth answer that the generated answer is compared
+    - `response` (str): The text response generated by the model.
+    - `reference` (str): The ground truth answer that the generated answer is compared
      against.
 
     If the above data is not in the appropriate column, you can specify different column
-    names for these fields using the parameters `answer_column`, and `ground_truth_column`.
+    names for these fields using the parameters `response_column`, and `reference_column`.
 
     For example, if your dataset has this data stored in different columns, you can
     pass the following parameters:
     ```python
     {
-        "answer_column": "llm_output_col",
-        "ground_truth_column": "my_ground_truth_col",
+        "response_column": "llm_output_col",
+        "reference_column": "my_ground_truth_col",
     }
     ```
 
@@ -76,8 +79,8 @@ def AnswerSimilarity(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "answer_column": f"{pred_col}.generated_answer",
-        "ground_truth_column": "my_ground_truth_col",
+        "response_column": f"{pred_col}.generated_answer",
+        "reference_column": "my_ground_truth_col",
     }
     ```
 
@@ -85,8 +88,8 @@ def AnswerSimilarity(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
-        "ground_truth_column": "my_ground_truth_col",
+        "response_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+        "reference_column": "my_ground_truth_col",
     }
     ```
     """
@@ -97,31 +100,33 @@ def AnswerSimilarity(
     )
 
     required_columns = {
-        "answer": answer_column,
-        "ground_truth": ground_truth_column,
+        "response": response_column,
+        "reference": reference_column,
     }
 
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df), metrics=[answer_similarity], **get_ragas_config()
+        Dataset.from_pandas(df), metrics=[semantic_similarity()], **get_ragas_config()
     ).to_pandas()
 
-    fig_histogram = px.histogram(x=result_df["answer_similarity"].to_list(), nbins=10)
-    fig_box = px.box(x=result_df["answer_similarity"].to_list())
+    score_column = "semantic_similarity"
+
+    fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
+    fig_box = px.box(x=result_df[score_column].to_list())
 
     return (
        {
-            # "Scores (will not be uploaded to UI)": result_df[
-            #     ["answer", "ground_truth", "answer_similarity"]
+            # "Scores (will not be uploaded to ValidMind Platform)": result_df[
+            #     ["response", "reference", "semantic_similarity"]
            # ],
            "Aggregate Scores": [
                {
-                    "Mean Score": result_df["answer_similarity"].mean(),
-                    "Median Score": result_df["answer_similarity"].median(),
-                    "Max Score": result_df["answer_similarity"].max(),
-                    "Min Score": result_df["answer_similarity"].min(),
-                    "Standard Deviation": result_df["answer_similarity"].std(),
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
                     "Count": result_df.shape[0],
                }
            ],
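
Across the four ragas tests above the same vocabulary shift recurs: `question`/`contexts`/`answer`/`ground_truth` become `user_input`/`retrieved_contexts`/`response`/`reference`, and two tests are renamed outright (a third rename, AspectCritique → AspectCritic, appears in the file list). The lookup below is purely illustrative, assembled from these diffs and the file list rather than shipped by the package, and may help translate existing `params` dictionaries:

```python
# 2.5.x -> 2.6.x renames as read from the diffs and file list above.
RAGAS_PARAM_RENAMES = {
    "question_column": "user_input_column",
    "contexts_column": "retrieved_contexts_column",
    "answer_column": "response_column",
    "ground_truth_column": "reference_column",
}

RAGAS_TEST_RENAMES = {
    "AnswerRelevance": "ResponseRelevancy",
    "AnswerSimilarity": "SemanticSimilarity",
    "AspectCritique": "AspectCritic",
}


def migrate_params(old_params: dict) -> dict:
    """Rename 2.5.x ragas parameter keys to their 2.6.x equivalents."""
    return {RAGAS_PARAM_RENAMES.get(key, key): value for key, value in old_params.items()}


print(migrate_params({"answer_column": "llm_output_col", "ground_truth_column": "gold"}))
# {'response_column': 'llm_output_col', 'reference_column': 'gold'}
```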
validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py
@@ -2,15 +2,15 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
+from sklearn.metrics import adjusted_mutual_info_score
 
-from sklearn import metrics
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
 
-from .ClusterPerformance import ClusterPerformance
 
-
-@dataclass
-class AdjustedMutualInformation(ClusterPerformance):
+@tags("sklearn", "model_performance", "clustering")
+@tasks("clustering")
+def AdjustedMutualInformation(model: VMModel, dataset: VMDataset):
     """
     Evaluates clustering model performance by measuring mutual information between true and predicted labels, adjusting
     for chance.
@@ -52,14 +52,11 @@ class AdjustedMutualInformation(ClusterPerformance):
     - The interpretability of the score can be complex as it depends on the understanding of information theory
      concepts.
     """
-
-    name = "adjusted_mutual_information"
-    required_inputs = ["model", "dataset"]
-    tasks = ["clustering"]
-    tags = [
-        "sklearn",
-        "model_performance",
+    return [
+        {
+            "Adjusted Mutual Information": adjusted_mutual_info_score(
+                labels_true=dataset.y,
+                labels_pred=dataset.y_pred(model),
+            )
+        }
     ]
-
-    def metric_info(self):
-        return {"Adjusted Mutual Information": metrics.adjusted_mutual_info_score}
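
This last diff replaces the `ClusterPerformance` subclass with a plain function decorated by `@tags`/`@tasks` that returns a one-row summary table. The sketch below reproduces that computation with scikit-learn directly; the labels are made up and stand in for `dataset.y` and `dataset.y_pred(model)`.

```python
from sklearn.metrics import adjusted_mutual_info_score

# Made-up cluster labels standing in for dataset.y and dataset.y_pred(model).
labels_true = [0, 0, 1, 1, 2, 2]
labels_pred = [0, 0, 1, 2, 2, 2]

# The refactored test returns a list with a single summary row like this one.
summary = [
    {
        "Adjusted Mutual Information": adjusted_mutual_info_score(
            labels_true=labels_true,
            labels_pred=labels_pred,
        )
    }
]
print(summary)
```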