validmind 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those package versions.
Files changed (233)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +80 -119
  3. validmind/ai/test_result_description/config.yaml +29 -0
  4. validmind/ai/test_result_description/context.py +73 -0
  5. validmind/ai/test_result_description/image_processing.py +124 -0
  6. validmind/ai/test_result_description/system.jinja +39 -0
  7. validmind/ai/test_result_description/user.jinja +25 -0
  8. validmind/api_client.py +89 -43
  9. validmind/client.py +2 -2
  10. validmind/client_config.py +11 -14
  11. validmind/datasets/credit_risk/__init__.py +1 -0
  12. validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
  13. validmind/datasets/credit_risk/lending_club_bias.py +142 -0
  14. validmind/datasets/regression/fred_timeseries.py +67 -138
  15. validmind/template.py +1 -0
  16. validmind/test_suites/__init__.py +0 -2
  17. validmind/test_suites/statsmodels_timeseries.py +1 -1
  18. validmind/test_suites/summarization.py +0 -1
  19. validmind/test_suites/time_series.py +0 -43
  20. validmind/tests/__types__.py +14 -15
  21. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  22. validmind/tests/data_validation/ADF.py +31 -24
  23. validmind/tests/data_validation/AutoAR.py +9 -9
  24. validmind/tests/data_validation/AutoMA.py +23 -16
  25. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  26. validmind/tests/data_validation/AutoStationarity.py +21 -16
  27. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  28. validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +34 -34
  29. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +85 -124
  30. validmind/tests/data_validation/ClassImbalance.py +15 -12
  31. validmind/tests/data_validation/DFGLSArch.py +19 -13
  32. validmind/tests/data_validation/DatasetDescription.py +17 -11
  33. validmind/tests/data_validation/DatasetSplit.py +7 -5
  34. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  35. validmind/tests/data_validation/Duplicates.py +33 -25
  36. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  37. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  38. validmind/tests/data_validation/HighCardinality.py +19 -12
  39. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  40. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  41. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  42. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  43. validmind/tests/data_validation/JarqueBera.py +70 -0
  44. validmind/tests/data_validation/KPSS.py +34 -29
  45. validmind/tests/data_validation/LJungBox.py +66 -0
  46. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  47. validmind/tests/data_validation/MissingValues.py +32 -27
  48. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  49. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  50. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  51. validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
  52. validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
  53. validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
  54. validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
  55. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  56. validmind/tests/data_validation/RunsTest.py +72 -0
  57. validmind/tests/data_validation/ScatterPlot.py +63 -78
  58. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  59. validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +35 -30
  60. validmind/tests/data_validation/Skewness.py +35 -37
  61. validmind/tests/data_validation/SpreadPlot.py +35 -35
  62. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  63. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  64. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  65. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  66. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  67. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  68. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  69. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  70. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  71. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  72. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  73. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  74. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  75. validmind/tests/data_validation/UniqueRows.py +11 -6
  76. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  77. validmind/tests/data_validation/WOEBinTable.py +35 -30
  78. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  79. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  80. validmind/tests/data_validation/nlp/Hashtags.py +42 -40
  81. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  82. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  83. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  84. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  85. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  86. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  87. validmind/tests/data_validation/nlp/TextDescription.py +39 -36
  88. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  89. validmind/tests/decorator.py +81 -42
  90. validmind/tests/model_validation/BertScore.py +36 -27
  91. validmind/tests/model_validation/BleuScore.py +25 -19
  92. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  93. validmind/tests/model_validation/ContextualRecall.py +38 -13
  94. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  95. validmind/tests/model_validation/MeteorScore.py +46 -33
  96. validmind/tests/model_validation/ModelMetadata.py +32 -64
  97. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  98. validmind/tests/model_validation/RegardScore.py +30 -14
  99. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  100. validmind/tests/model_validation/RougeScore.py +36 -30
  101. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  102. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  103. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  104. validmind/tests/model_validation/TokenDisparity.py +31 -23
  105. validmind/tests/model_validation/ToxicityScore.py +26 -17
  106. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  107. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  108. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  109. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  110. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  111. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  112. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  113. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  114. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  115. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  116. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  117. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  118. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  119. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  120. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  121. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  122. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  123. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  124. validmind/tests/model_validation/ragas/AspectCritique.py +12 -6
  125. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  126. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  127. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  128. validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
  129. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  130. validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
  131. validmind/tests/model_validation/ragas/utils.py +6 -0
  132. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  133. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  134. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  135. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  136. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  137. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  138. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  139. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  140. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  141. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  142. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  143. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  144. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  145. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  146. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  147. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  148. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  149. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  150. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +32 -26
  151. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  152. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  153. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  154. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  155. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  156. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  157. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -94
  158. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  159. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
  160. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +66 -5
  161. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  162. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  163. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  164. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  165. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  166. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  167. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +59 -32
  168. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  169. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  170. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  171. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +86 -119
  172. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  173. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  174. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  175. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  176. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  177. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  178. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  179. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  180. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  181. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  182. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  183. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  184. validmind/tests/prompt_validation/Bias.py +14 -11
  185. validmind/tests/prompt_validation/Clarity.py +16 -14
  186. validmind/tests/prompt_validation/Conciseness.py +7 -5
  187. validmind/tests/prompt_validation/Delimitation.py +23 -22
  188. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  189. validmind/tests/prompt_validation/Robustness.py +12 -10
  190. validmind/tests/prompt_validation/Specificity.py +13 -11
  191. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  192. validmind/tests/run.py +68 -23
  193. validmind/unit_metrics/__init__.py +81 -144
  194. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  195. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  196. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  197. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  198. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  199. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  200. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  201. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  202. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  203. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  204. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  205. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  206. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  207. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  208. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  209. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  210. validmind/utils.py +4 -0
  211. validmind/vm_models/dataset/dataset.py +2 -0
  212. validmind/vm_models/figure.py +5 -0
  213. validmind/vm_models/test/metric.py +1 -0
  214. validmind/vm_models/test/result_wrapper.py +143 -158
  215. validmind/vm_models/test/threshold_test.py +1 -0
  216. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
  217. validmind-2.5.18.dist-info/RECORD +324 -0
  218. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  219. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  220. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  221. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  222. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  223. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  224. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  225. validmind/tests/model_validation/statsmodels/JarqueBera.py +0 -73
  226. validmind/tests/model_validation/statsmodels/LJungBox.py +0 -66
  227. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  228. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  229. validmind/tests/model_validation/statsmodels/RunsTest.py +0 -71
  230. validmind-2.5.8.dist-info/RECORD +0 -318
  231. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
  232. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
  233. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/ragas/AspectCritique.py

@@ -11,6 +11,8 @@ from validmind import tags, tasks
 
 from .utils import get_ragas_config, get_renamed_columns
 
+LOWER_IS_BETTER_ASPECTS = ["harmfulness", "maliciousness"]
+
 
 @tags("ragas", "llm", "qualitative")
 @tasks("text_summarization", "text_generation", "text_qa")
@@ -101,8 +103,8 @@ def AspectCritique(
     """
     try:
         from ragas import evaluate
-        from ragas.metrics.critique import AspectCritique as _AspectCritique
-        from ragas.metrics.critique import (
+        from ragas.metrics import AspectCritic
+        from ragas.metrics._aspect_critic import (
             coherence,
             conciseness,
             correctness,
@@ -112,7 +114,7 @@ def AspectCritique(
     except ImportError:
         raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
 
-    aspect_map = {
+    built_in_aspects = {
         "coherence": coherence,
         "conciseness": conciseness,
         "correctness": correctness,
@@ -134,21 +136,25 @@ def AspectCritique(
 
     df = get_renamed_columns(dataset._df, required_columns)
 
-    built_in_aspects = [aspect_map[aspect] for aspect in aspects]
     custom_aspects = (
         [
-            _AspectCritique(name=name, definition=description)
+            AspectCritic(name=name, definition=description)
             for name, description in additional_aspects
         ]
         if additional_aspects
         else []
     )
-    all_aspects = [*built_in_aspects, *custom_aspects]
+    all_aspects = [built_in_aspects[aspect] for aspect in aspects] + custom_aspects
 
     result_df = evaluate(
         Dataset.from_pandas(df), metrics=all_aspects, **get_ragas_config()
     ).to_pandas()
 
+    # reverse the score for aspects where lower is better
+    for aspect in LOWER_IS_BETTER_ASPECTS:
+        if aspect in result_df.columns:
+            result_df[aspect] = 1 - result_df[aspect]
+
     df_melted = result_df.melt(
         id_vars=["question", "answer", "contexts"],
         value_vars=[aspect.name for aspect in all_aspects],
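
The hunk above tracks the ragas rename of `AspectCritique` to `AspectCritic` and adds a post-processing step that flips scores for aspects where a lower raw value is better. A minimal, illustrative sketch of that flip (the column names mirror the diff; the data is made up):

```python
import pandas as pd

LOWER_IS_BETTER_ASPECTS = ["harmfulness", "maliciousness"]

# Illustrative binary verdicts as a critique metric might return them
# (1 = the aspect criterion is met, e.g. the answer is harmful).
result_df = pd.DataFrame(
    {
        "coherence": [1, 1, 0],
        "harmfulness": [0, 1, 0],
    }
)

# Flip lower-is-better aspects so that, like the others, higher now means better.
for aspect in LOWER_IS_BETTER_ASPECTS:
    if aspect in result_df.columns:
        result_df[aspect] = 1 - result_df[aspect]

print(result_df)  # harmfulness becomes [1, 0, 1]
```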
validmind/tests/model_validation/ragas/ContextEntityRecall.py

@@ -47,6 +47,7 @@ def ContextEntityRecall(
     ### Configuring Columns
 
     This metric requires the following columns in your dataset:
+
     - `contexts` (List[str]): A list of text contexts which will be evaluated to make
       sure if they contain the entities present in the ground truth.
     - `ground_truth` (str): The ground truth text from which the entities will be
@@ -113,13 +114,13 @@ def ContextEntityRecall(
 
     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-                [
-                    "contexts",
-                    "ground_truth",
-                    "context_entity_recall",
-                ]
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     [
+            #         "contexts",
+            #         "ground_truth",
+            #         "context_entity_recall",
+            #     ]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["context_entity_recall"].mean(),
@@ -127,7 +128,7 @@ def ContextEntityRecall(
                     "Max Score": result_df["context_entity_recall"].max(),
                     "Min Score": result_df["context_entity_recall"].min(),
                     "Standard Deviation": result_df["context_entity_recall"].std(),
-                    "Count": len(result_df),
+                    "Count": result_df.shape[0],
                 }
             ],
         },
validmind/tests/model_validation/ragas/ContextPrecision.py

@@ -40,6 +40,7 @@ def ContextPrecision(
     ### Configuring Columns
 
     This metric requires the following columns in your dataset:
+
     - `question` (str): The text query that was input into the model.
     - `contexts` (List[str]): A list of text contexts which are retrieved and which
       will be evaluated to make sure they contain relevant info in the correct order.
@@ -107,9 +108,9 @@ def ContextPrecision(
 
     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-                ["question", "contexts", "ground_truth", "context_precision"]
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["question", "contexts", "ground_truth", "context_precision"]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["context_precision"].mean(),
@@ -117,7 +118,7 @@ def ContextPrecision(
                     "Max Score": result_df["context_precision"].max(),
                     "Min Score": result_df["context_precision"].min(),
                     "Standard Deviation": result_df["context_precision"].std(),
-                    "Count": len(result_df),
+                    "Count": result_df.shape[0],
                 }
             ],
         },
validmind/tests/model_validation/ragas/ContextRecall.py

@@ -40,6 +40,7 @@ def ContextRecall(
     ### Configuring Columns
 
     This metric requires the following columns in your dataset:
+
     - `question` (str): The text query that was input into the model.
     - `contexts` (List[str]): A list of text contexts which are retrieved and which
       will be evaluated to make sure they contain all items in the ground truth.
@@ -107,9 +108,9 @@ def ContextRecall(
 
     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-                ["question", "contexts", "ground_truth", "context_recall"]
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["question", "contexts", "ground_truth", "context_recall"]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["context_recall"].mean(),
@@ -117,7 +118,7 @@ def ContextRecall(
                     "Max Score": result_df["context_recall"].max(),
                     "Min Score": result_df["context_recall"].min(),
                     "Standard Deviation": result_df["context_recall"].std(),
-                    "Count": len(result_df),
+                    "Count": result_df.shape[0],
                 }
             ],
         },
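
The ragas metric tests above share one pattern: the per-row scores table is commented out so only aggregate statistics are returned, and `len(result_df)` becomes the equivalent `result_df.shape[0]`. A self-contained sketch of that aggregation, with an illustrative score column:

```python
import pandas as pd

# Illustrative per-row scores; in the tests this comes from ragas' evaluate().to_pandas()
result_df = pd.DataFrame({"context_recall": [0.8, 0.5, 1.0, 0.9]})

aggregate_scores = [
    {
        "Mean Score": result_df["context_recall"].mean(),
        "Median Score": result_df["context_recall"].median(),
        "Max Score": result_df["context_recall"].max(),
        "Min Score": result_df["context_recall"].min(),
        "Standard Deviation": result_df["context_recall"].std(),
        # shape[0] counts rows, the same value len(result_df) returned before the change
        "Count": result_df.shape[0],
    }
]
print(aggregate_scores)
```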
validmind/tests/model_validation/ragas/ContextUtilization.py (new file)

@@ -0,0 +1,155 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+
+from validmind import tags, tasks
+
+from .utils import get_ragas_config, get_renamed_columns
+
+
+@tags("ragas", "llm", "retrieval_performance")
+@tasks("text_qa", "text_generation", "text_summarization", "text_classification")
+def ContextUtilization(
+    dataset,
+    question_column: str = "question",
+    contexts_column: str = "contexts",
+    answer_column: str = "answer",
+):  # noqa: B950
+    """
+    Assesses how effectively relevant context chunks are utilized in generating answers by evaluating their ranking
+    within the provided contexts.
+
+    ### Purpose
+
+    The Context Utilization test evaluates whether all of the answer-relevant items present in the contexts are ranked
+    higher within the provided retrieval results. This metric is essential for assessing the performance of models,
+    especially those involved in tasks such as text QA, text generation, text summarization, and text classification.
+
+    ### Test Mechanism
+
+    The test calculates Context Utilization using the formula:
+
+    $$
+    \\text{Context Utilization@K} = \\frac{\\sum_{k=1}^{K} \\left( \\text{Precision@k} \\times v_k \\right)}{\\text{Total number of relevant items in the top } K \\text{ results}}
+    $$
+    $$
+    \\text{Precision@k} = {\\text{true positives@k} \\over (\\text{true positives@k} + \\text{false positives@k})}
+    $$
+
+    Where $K$ is the total number of chunks in `contexts` and $v_k \\in \\{0, 1\\}$ is the relevance indicator at rank $k$.
+
+
+    This test uses columns for questions, contexts, and answers from the dataset and computes context utilization
+    scores, generating a histogram and box plot for visualization.
+
+    #### Configuring Columns
+
+    This metric requires the following columns in your dataset:
+
+    - `question` (str): The text query that was input into the model.
+    - `contexts` (List[str]): A list of text contexts which are retrieved and which will be evaluated to
+      make sure they contain relevant info in the correct order.
+    - `answer` (str): The llm-generated response for the input `question`.
+
+    If the above data is not in the appropriate column, you can specify different column
+    names for these fields using the parameters `question_column`, `contexts_column`
+    and `ground_truth_column`.
+
+    For example, if your dataset has this data stored in different columns, you can
+    pass the following parameters:
+    ```python
+    {
+        "question_column": "question",
+        "contexts_column": "context_info"
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    If the data is stored as a dictionary in another column, specify the column and key
+    like this:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": f"{pred_col}.contexts",
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    For more complex situations, you can use a function to extract the data:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": lambda x: [x[pred_col]["context_message"]],
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    ### Signs of High Risk
+
+    - Very low mean or median context utilization scores, indicating poor usage of retrieved contexts.
+    - High standard deviation, suggesting inconsistent model performance.
+    - Low or minimal max scores, pointing to the model's failure to rank relevant contexts at top positions.
+
+    ### Strengths
+
+    - Quantifies the rank of relevant context chunks in generating responses.
+    - Provides clear visualizations through histograms and box plots for ease of interpretation.
+    - Adapts to different dataset schema by allowing configurable column names.
+
+    ### Limitations
+
+    - Assumes the relevance of context chunks is binary and may not capture nuances of partial relevance.
+    - Requires proper context retrieval to be effective; irrelevant context chunks can skew the results.
+    - Dependent on large sample sizes to provide stable and reliable estimates of utilization performance.
+    """
+    try:
+        from ragas import evaluate
+        from ragas.metrics import context_utilization
+    except ImportError:
+        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
+
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "question": question_column,
+        "contexts": contexts_column,
+        "answer": answer_column,
+    }
+
+    df = get_renamed_columns(dataset._df, required_columns)
+
+    result_df = evaluate(
+        Dataset.from_pandas(df), metrics=[context_utilization], **get_ragas_config()
+    ).to_pandas()
+
+    fig_histogram = px.histogram(x=result_df["context_utilization"].to_list(), nbins=10)
+    fig_box = px.box(x=result_df["context_utilization"].to_list())
+
+    return (
+        {
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["question", "contexts", "answer", "context_utilization"]
+            # ],
+            "Aggregate Scores": [
+                {
+                    "Mean Score": result_df["context_utilization"].mean(),
+                    "Median Score": result_df["context_utilization"].median(),
+                    "Max Score": result_df["context_utilization"].max(),
+                    "Min Score": result_df["context_utilization"].min(),
+                    "Standard Deviation": result_df["context_utilization"].std(),
+                    "Count": result_df.shape[0],
+                }
+            ],
+        },
+        fig_histogram,
+        fig_box,
+    )
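
As a worked illustration of the Context Utilization@K formula in the docstring above (this is not the ragas implementation, only the arithmetic), computed from a binary relevance vector over the ranked context chunks:

```python
def context_utilization_at_k(relevance):
    """Compute Context Utilization@K from 0/1 relevance indicators, ordered by
    the rank of the retrieved context chunks."""
    total_relevant = sum(relevance)
    if total_relevant == 0:
        return 0.0
    score = 0.0
    true_positives = 0
    for k, v_k in enumerate(relevance, start=1):
        true_positives += v_k
        precision_at_k = true_positives / k  # TP@k / (TP@k + FP@k)
        score += precision_at_k * v_k
    return score / total_relevant


# Relevant chunks ranked 1st and 3rd out of 4 retrieved contexts:
print(context_utilization_at_k([1, 0, 1, 0]))  # (1/1 + 2/3) / 2 ≈ 0.83
```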
validmind/tests/model_validation/ragas/Faithfulness.py

@@ -41,6 +41,7 @@ def Faithfulness(
     ### Configuring Columns
 
     This metric requires the following columns in your dataset:
+
     - `contexts` (List[str]): A list of text contexts which are retrieved to generate
       the answer.
     - `answer` (str): The response generated by the model which will be evaluated for
@@ -105,9 +106,9 @@ def Faithfulness(
 
     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-                ["contexts", "answer", "faithfulness"]
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["contexts", "answer", "faithfulness"]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["faithfulness"].mean(),
@@ -115,7 +116,7 @@ def Faithfulness(
                     "Max Score": result_df["faithfulness"].max(),
                     "Min Score": result_df["faithfulness"].min(),
                     "Standard Deviation": result_df["faithfulness"].std(),
-                    "Count": len(result_df),
+                    "Count": result_df.shape[0],
                 }
            ],
         },
validmind/tests/model_validation/ragas/NoiseSensitivity.py (new file)

@@ -0,0 +1,152 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+
+from validmind import tags, tasks
+
+from .utils import get_ragas_config, get_renamed_columns
+
+
+@tags("ragas", "llm", "rag_performance")
+@tasks("text_qa", "text_generation", "text_summarization")
+def NoiseSensitivity(
+    dataset,
+    answer_column="answer",
+    contexts_column="contexts",
+    ground_truth_column="ground_truth",
+):
+    """
+    Assesses the sensitivity of a Large Language Model (LLM) to noise in retrieved context by measuring how often it
+    generates incorrect responses.
+
+    ### Purpose
+
+    The Noise Sensitivity test aims to measure how sensitive an LLM is to irrelevant or noisy information within the
+    contextual data used to generate its responses. A lower noise sensitivity score suggests better model robustness in
+    generating accurate answers from given contexts.
+
+    ### Test Mechanism
+
+    This test evaluates the model's answers by comparing the claims made in the generated response against the ground
+    truth and the retrieved context. The noise sensitivity score is calculated as:
+
+    $$
+    \\text{noise sensitivity} = {|\\text{Number of incorrect claims in answer}| \\over |\\text{Number of total claims in answer}|}
+    $$
+
+    The formula computes the fraction of incorrect claims to the total claims in the answer, using a dataset where
+    'answer', 'context', and 'ground_truth' columns are specified.
+
+    #### Configuring Columns
+
+    This metric requires the following columns in your dataset:
+
+    - `contexts` (List[str]): A list of text contexts which are retrieved to generate
+      the answer.
+    - `answer` (str): The response generated by the model
+    - `ground_truth` (str): The "correct" answer to the question
+
+    If the above data is not in the appropriate column, you can specify different column
+    names for these fields using the parameters `contexts_column` and `answer_column`.
+
+    For example, if your dataset has this data stored in different columns, you can
+    pass the following parameters:
+    ```python
+    {
+        "contexts_column": "context_info"
+        "answer_column": "my_answer_col",
+    }
+    ```
+
+    If the data is stored as a dictionary in another column, specify the column and key
+    like this:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": f"{pred_col}.contexts",
+        "answer_column": f"{pred_col}.answer",
+    }
+    ```
+
+    For more complex situations, you can use a function to extract the data:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": lambda row: [row[pred_col]["context_message"]],
+        "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+    }
+
+    ### Signs of High Risk
+
+    - High noise sensitivity scores across multiple samples.
+    - Significant deviation between mean and median noise sensitivity scores.
+    - High standard deviation indicating inconsistency in the model's performance.
+
+    ### Strengths
+
+    - Provides a quantitative measure of how well the LLM handles noisy or irrelevant context.
+    - Easy integration and configuration using column parameters.
+    - Utilizes both histogram and box plot visualizations to analyze score distribution.
+
+    ### Limitations
+
+    - Requires accurate ground truth that aligns with the generated answers.
+    - Assumes the context provided is sufficiently granular to assess noise sensitivity.
+    - Primarily applicable to tasks like text QA, text generation, and text summarization where contextual relevance is
+      critical.
+    """
+    try:
+        from ragas import evaluate
+        from ragas.metrics import noise_sensitivity_relevant
+    except ImportError:
+        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
+
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "answer": answer_column,
+        "contexts": contexts_column,
+        "ground_truth": ground_truth_column,
+    }
+
+    df = get_renamed_columns(dataset._df, required_columns)
+
+    result_df = evaluate(
+        Dataset.from_pandas(df),
+        metrics=[noise_sensitivity_relevant],
+        **get_ragas_config(),
+    ).to_pandas()
+
+    fig_histogram = px.histogram(
+        x=result_df["noise_sensitivity_relevant"].to_list(), nbins=10
+    )
+    fig_box = px.box(x=result_df["noise_sensitivity_relevant"].to_list())
+
+    return (
+        {
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["contexts", "answer", "ground_truth", "noise_sensitivity_relevant"]
+            # ],
+            "Aggregate Scores": [
+                {
+                    "Mean Score": result_df["noise_sensitivity_relevant"].mean(),
+                    "Median Score": result_df["noise_sensitivity_relevant"].median(),
+                    "Max Score": result_df["noise_sensitivity_relevant"].max(),
+                    "Min Score": result_df["noise_sensitivity_relevant"].min(),
+                    "Standard Deviation": result_df["noise_sensitivity_relevant"].std(),
+                    "Count": result_df.shape[0],
+                }
+            ],
+        },
+        fig_histogram,
+        fig_box,
+    )
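
As a quick illustration of the noise sensitivity formula above (the real ragas metric uses an LLM to extract and verify claims; this only shows the arithmetic on toy claim annotations):

```python
# Toy claim-level annotations: for each generated answer, a list of booleans
# marking whether each claim is incorrect (True) or supported (False).
answers_claims = [
    [False, False, True],          # 1 of 3 claims incorrect
    [False, False, False, False],  # fully supported answer
    [True, True, False],           # 2 of 3 claims incorrect
]


def noise_sensitivity(claim_flags):
    """noise sensitivity = incorrect claims / total claims in the answer."""
    return sum(claim_flags) / len(claim_flags) if claim_flags else 0.0


scores = [noise_sensitivity(flags) for flags in answers_claims]
print(scores)                     # [0.333..., 0.0, 0.666...]
print(sum(scores) / len(scores))  # mean score reported by the test, ≈ 0.33
```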
validmind/tests/model_validation/ragas/utils.py

@@ -5,11 +5,17 @@
 import os
 
 from validmind.ai.utils import get_client_and_model
+from validmind.client_config import client_config
 
 EMBEDDINGS_MODEL = "text-embedding-3-small"
 
 
 def get_ragas_config():
+    if not client_config.can_generate_llm_test_descriptions():
+        raise ValueError(
+            "LLM based descriptions are not enabled in the current configuration."
+        )
+
     # import here since its an optional dependency
     try:
         from langchain_openai import ChatOpenAI, OpenAIEmbeddings
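
The new guard makes ragas-backed tests fail fast when the connected environment disallows LLM calls. A minimal reproduction of the gating pattern with a stand-in config object (the real check lives on `validmind.client_config.client_config`):

```python
class _StubClientConfig:
    """Stand-in for validmind.client_config.client_config, for illustration only."""

    def __init__(self, llm_enabled: bool):
        self._llm_enabled = llm_enabled

    def can_generate_llm_test_descriptions(self) -> bool:
        return self._llm_enabled


def get_ragas_config(client_config=_StubClientConfig(llm_enabled=False)):
    # Same guard as in the diff: fail fast before any LLM client is constructed.
    if not client_config.can_generate_llm_test_descriptions():
        raise ValueError(
            "LLM based descriptions are not enabled in the current configuration."
        )
    # ...the real function goes on to build the ChatOpenAI / OpenAIEmbeddings clients.
    return {}


try:
    get_ragas_config()
except ValueError as exc:
    print(exc)
```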
validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py

@@ -15,29 +15,36 @@ class AdjustedMutualInformation(ClusterPerformance):
     Evaluates clustering model performance by measuring mutual information between true and predicted labels, adjusting
     for chance.
 
-    **1. Purpose**: The purpose of this metric (Adjusted Mutual Information) is to evaluate the performance of a
-    machine learning model, more specifically, a clustering model. It measures the mutual information between the true
-    labels and the ones predicted by the model, adjusting for chance.
+    ### Purpose
 
-    **2. Test Mechanism**: The Adjusted Mutual Information (AMI) uses sklearn's `adjusted_mutual_info_score` function.
-    This function calculates the mutual information between the true labels and the ones predicted while correcting for
-    the chance correlation expected due to random label assignments. This test requires the model, the training
-    dataset, and the test dataset as inputs.
+    The purpose of this metric (Adjusted Mutual Information) is to evaluate the performance of a machine learning
+    model, more specifically, a clustering model. It measures the mutual information between the true labels and the
+    ones predicted by the model, adjusting for chance.
+
+    ### Test Mechanism
+
+    The Adjusted Mutual Information (AMI) uses sklearn's `adjusted_mutual_info_score` function. This function
+    calculates the mutual information between the true labels and the ones predicted while correcting for the chance
+    correlation expected due to random label assignments. This test requires the model, the training dataset, and the
+    test dataset as inputs.
+
+    ### Signs of High Risk
 
-    **3. Signs of High Risk**:
     - Low Adjusted Mutual Information Score: This score ranges between 0 and 1. A low score (closer to 0) can indicate
     poor model performance as the predicted labels do not align well with the true labels.
-    - In case of high dimensional data, if the algorithm shows high scores, this could also be a potential risk as AMI
+    - In case of high-dimensional data, if the algorithm shows high scores, this could also be a potential risk as AMI
     may not perform reliably.
 
-    **4. Strengths**:
+    ### Strengths
+
     - The AMI metric takes into account the randomness of the predicted labels, which makes it more robust than the
     simple Mutual Information.
     - The scale of AMI is not dependent on the sizes of the clustering, allowing for comparability between different
     datasets or models.
     - Good for comparing the output of clustering algorithms where the number of clusters is not known a priori.
 
-    **5. Limitations**:
+    ### Limitations
+
     - Adjusted Mutual Information does not take into account the continuous nature of some data. As a result, it may
     not be the best choice for regression or other continuous types of tasks.
     - AMI has the drawback of being biased towards clusterings with a higher number of clusters.
@@ -47,7 +54,7 @@ class AdjustedMutualInformation(ClusterPerformance):
     """
 
     name = "adjusted_mutual_information"
-    required_inputs = ["model", "datasets"]
+    required_inputs = ["model", "dataset"]
     tasks = ["clustering"]
     tags = [
         "sklearn",
validmind/tests/model_validation/sklearn/AdjustedRandIndex.py

@@ -15,38 +15,43 @@ class AdjustedRandIndex(ClusterPerformance):
     Measures the similarity between two data clusters using the Adjusted Rand Index (ARI) metric in clustering machine
     learning models.
 
-    **1. Purpose:**
+    ### Purpose
+
     The Adjusted Rand Index (ARI) metric is intended to measure the similarity between two data clusters. This metric
-    is specifically being used for clustering machine learning models to validly quantify how well the model is
-    clustering and producing data groups. It involves comparing the model's produced clusters against the actual (true)
-    clusters found in the dataset.
+    is specifically used for clustering machine learning models to quantify how well the model is clustering and
+    producing data groups. It involves comparing the model's produced clusters against the actual (true) clusters found
+    in the dataset.
+
+    ### Test Mechanism
+
+    The Adjusted Rand Index (ARI) is calculated using the `adjusted_rand_score` method from the `sklearn.metrics`
+    module in Python. The test requires inputs including the model itself and the model's training and test datasets.
+    The model's computed clusters and the true clusters are compared, and the similarities are measured to compute the
+    ARI.
 
-    **2. Test Mechanism:**
-    The Adjusted Rand Index (ARI) is calculated by using the `adjusted_rand_score` method from the sklearn metrics in
-    Python. The test requires inputs including the model itself and the model's training and test datasets. The model's
-    computed clusters and the true clusters are compared, and the similarities are measured to compute the ARI.
+    ### Signs of High Risk
 
-    **3. Signs of High Risk:**
-    - If the ARI is close to zero, it signifies that the model's cluster assignments are random and don't match the
+    - If the ARI is close to zero, it signifies that the model's cluster assignments are random and do not match the
     actual dataset clusters, indicating a high risk.
     - An ARI of less than zero indicates that the model's clustering performance is worse than random.
 
-    **4. Strengths:**
-    - ARI is normalized and it hence gives a consistent metric between -1 and +1, irrespective of raw cluster sizes or
+    ### Strengths
+
+    - ARI is normalized and provides a consistent metric between -1 and +1, irrespective of raw cluster sizes or
     dataset size variations.
-    - It doesn’t require a ground truth for computation which makes it ideal for unsupervised learning model
-    evaluations.
+    - It does not require a ground truth for computation, making it ideal for unsupervised learning model evaluations.
     - It penalizes for false positives and false negatives, providing a robust measure of clustering quality.
 
-    **5. Limitations:**
+    ### Limitations
+
     - In real-world situations, true clustering is often unknown, which can hinder the practical application of the ARI.
     - The ARI requires all individual data instances to be independent, which may not always hold true.
-    - It may be difficult to interpret the implications of an ARI score without a context or a benchmark, as it is
+    - It may be difficult to interpret the implications of an ARI score without context or a benchmark, as it is
     heavily dependent on the characteristics of the dataset used.
     """
 
     name = "adjusted_rand_index"
-    required_inputs = ["model", "datasets"]
+    required_inputs = ["model", "dataset"]
     tasks = ["clustering"]
     tags = [
         "sklearn",