validmind 2.5.25__py3-none-any.whl → 2.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.8.dist-info/METADATA +137 -0
  179. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.25.dist-info/METADATA +0 -118
  196. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/ragas/ContextEntityRecall.py

@@ -14,22 +14,25 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import context_entity_recall
+    from ragas.metrics import ContextEntityRecall as context_entity_recall
 except ImportError as e:
-    raise MissingDependencyError(
-        "Missing required package `ragas` for ContextEntityRecall. "
-        "Please run `pip install validmind[llm]` to use LLM tests",
-        required_dependencies=["ragas"],
-        extra="llm",
-    ) from e
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for ContextEntityRecall. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
 
 
 @tags("ragas", "llm", "retrieval_performance")
 @tasks("text_qa", "text_generation", "text_summarization")
 def ContextEntityRecall(
     dataset,
-    contexts_column: str = "contexts",
-    ground_truth_column: str = "ground_truth",
+    retrieved_contexts_column: str = "retrieved_contexts",
+    reference_column: str = "reference",
 ):
     """
     Evaluates the context entity recall for dataset entries and visualizes the results.
@@ -37,18 +40,18 @@ def ContextEntityRecall(
     ### Overview
 
     This metric gives the measure of recall of the retrieved context, based on the
-    number of entities present in both `ground_truths` and `contexts` relative to the
-    number of entities present in the `ground_truths` alone. Simply put, it is a measure
-    of what fraction of entities are recalled from `ground_truths`. This metric is
+    number of entities present in both `reference` and `retrieved_contexts` relative to the
+    number of entities present in the `reference` alone. Simply put, it is a measure
+    of what fraction of entities are recalled from `reference`. This metric is
     useful in fact-based use cases like tourism help desk, historical QA, etc. This
     metric can help evaluate the retrieval mechanism for entities, based on comparison
-    with entities present in `ground_truths`, because in cases where entities matter,
-    we need the `contexts` which cover them.
+    with entities present in `reference`, because in cases where entities matter,
+    we need the `retrieved_contexts` which cover them.
 
     ### Formula
 
     To compute this metric, we use two sets, $GE$ and $CE$, representing the set of
-    entities present in `ground_truths` and set of entities present in `contexts`
+    entities present in `reference` and set of entities present in `retrieved_contexts`
     respectively. We then take the number of elements in intersection of these sets and
     divide it by the number of elements present in the $GE$, given by the formula:
 
@@ -60,20 +63,20 @@
 
     This metric requires the following columns in your dataset:
 
-    - `contexts` (List[str]): A list of text contexts which will be evaluated to make
-      sure if they contain the entities present in the ground truth.
-    - `ground_truth` (str): The ground truth text from which the entities will be
-      extracted and compared with the entities in the `contexts`.
+    - `retrieved_contexts` (List[str]): A list of text contexts which will be evaluated to make
+      sure if they contain the entities present in the `reference`.
+    - `reference` (str): The ground truth text from which the entities will be
+      extracted and compared with the entities in the `retrieved_contexts`.
 
     If the above data is not in the appropriate column, you can specify different column
-    names for these fields using the parameters `contexts_column`, and `ground_truth_column`.
+    names for these fields using the parameters `retrieved_contexts_column`, and `reference_column`.
 
     For example, if your dataset has this data stored in different columns, you can
     pass the following parameters:
     ```python
     {
-        "contexts_column": "context_info"
-        "ground_truth_column": "my_ground_truth_col",
+        "retrieved_contexts_column": "context_info",
+        "reference_column": "my_ground_truth_col",
     }
     ```
 
@@ -82,8 +85,8 @@
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "contexts_column": f"{pred_col}.contexts",
-        "ground_truth_column": "my_ground_truth_col",
+        "retrieved_contexts_column": f"{pred_col}.contexts",
+        "reference_column": "my_ground_truth_col",
     }
     ```
 
@@ -91,8 +94,8 @@
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "contexts_column": lambda row: [row[pred_col]["context_message"]],
-        "ground_truth_column": "my_ground_truth_col",
+        "retrieved_contexts_column": lambda row: [row[pred_col]["context_message"]],
+        "reference_column": "my_ground_truth_col",
     }
     ```
     """
@@ -103,37 +106,37 @@
     )
 
     required_columns = {
-        "ground_truth": ground_truth_column,
-        "contexts": contexts_column,
+        "reference": reference_column,
+        "retrieved_contexts": retrieved_contexts_column,
     }
 
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df), metrics=[context_entity_recall], **get_ragas_config()
+        Dataset.from_pandas(df), metrics=[context_entity_recall()], **get_ragas_config()
     ).to_pandas()
 
-    fig_histogram = px.histogram(
-        x=result_df["context_entity_recall"].to_list(), nbins=10
-    )
-    fig_box = px.box(x=result_df["context_entity_recall"].to_list())
+    score_column = "context_entity_recall"
+
+    fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
+    fig_box = px.box(x=result_df[score_column].to_list())
 
     return (
         {
-            # "Scores (will not be uploaded to UI)": result_df[
+            # "Scores (will not be uploaded to ValidMind Platform)": result_df[
             #     [
-            #         "contexts",
-            #         "ground_truth",
+            #         "retrieved_contexts",
+            #         "reference",
             #         "context_entity_recall",
             #     ]
             # ],
             "Aggregate Scores": [
                 {
-                    "Mean Score": result_df["context_entity_recall"].mean(),
-                    "Median Score": result_df["context_entity_recall"].median(),
-                    "Max Score": result_df["context_entity_recall"].max(),
-                    "Min Score": result_df["context_entity_recall"].min(),
-                    "Standard Deviation": result_df["context_entity_recall"].std(),
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
                     "Count": result_df.shape[0],
                 }
             ],
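After this rename, the ContextEntityRecall test reads `retrieved_contexts` and `reference` columns by default instead of `contexts` and `ground_truth`. A minimal sketch of invoking the updated test through `vm.tests.run_test`, assuming a configured ValidMind environment; the toy DataFrame and `input_id` are placeholders:

```python
import pandas as pd
import validmind as vm

# Toy data already laid out in the new default column names.
df = pd.DataFrame(
    {
        "retrieved_contexts": [["Paris is the capital of France."]],
        "reference": ["Paris is the capital of France."],
    }
)

vm_ds = vm.init_dataset(dataset=df, input_id="ragas_eval_ds")

# Columns stored under other names would be mapped via `params`, e.g.
# {"retrieved_contexts_column": "context_info", "reference_column": "my_ground_truth_col"}.
vm.tests.run_test(
    "validmind.model_validation.ragas.ContextEntityRecall",
    inputs={"dataset": vm_ds},
)
```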
validmind/tests/model_validation/ragas/ContextPrecision.py

@@ -14,23 +14,26 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import context_precision
+    from ragas.metrics import LLMContextPrecisionWithReference as context_precision
 except ImportError as e:
-    raise MissingDependencyError(
-        "Missing required package `ragas` for ContextPrecision. "
-        "Please run `pip install validmind[llm]` to use LLM tests",
-        required_dependencies=["ragas"],
-        extra="llm",
-    ) from e
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for ContextPrecision. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
 
 
 @tags("ragas", "llm", "retrieval_performance")
 @tasks("text_qa", "text_generation", "text_summarization", "text_classification")
 def ContextPrecision(
     dataset,
-    question_column: str = "question",
-    contexts_column: str = "contexts",
-    ground_truth_column: str = "ground_truth",
+    user_input_column: str = "user_input",
+    retrieved_contexts_column: str = "retrieved_contexts",
+    reference_column: str = "reference",
 ):  # noqa: B950
     """
     Context Precision is a metric that evaluates whether all of the ground-truth
@@ -53,22 +56,22 @@ def ContextPrecision(
 
     This metric requires the following columns in your dataset:
 
-    - `question` (str): The text query that was input into the model.
-    - `contexts` (List[str]): A list of text contexts which are retrieved and which
+    - `user_input` (str): The text query that was input into the model.
+    - `retrieved_contexts` (List[str]): A list of text contexts which are retrieved and which
       will be evaluated to make sure they contain relevant info in the correct order.
-    - `ground_truth` (str): The ground truth text to compare with the retrieved contexts.
+    - `reference` (str): The ground truth text to compare with the retrieved contexts.
 
     If the above data is not in the appropriate column, you can specify different column
-    names for these fields using the parameters `question_column`, `contexts_column`
-    and `ground_truth_column`.
+    names for these fields using the parameters `user_input_column`, `retrieved_contexts_column`
+    and `reference_column`.
 
     For example, if your dataset has this data stored in different columns, you can
    pass the following parameters:
     ```python
     {
-        "question_column": "question",
-        "contexts_column": "context_info"
-        "ground_truth_column": "my_ground_truth_col",
+        "user_input_column": "question",
+        "retrieved_contexts_column": "context_info",
+        "reference_column": "my_ground_truth_col",
     }
     ```
 
@@ -77,8 +80,8 @@
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "contexts_column": f"{pred_col}.contexts",
-        "ground_truth_column": "my_ground_truth_col",
+        "retrieved_contexts_column": f"{pred_col}.retrieved_contexts",
+        "reference_column": "my_ground_truth_col",
     }
     ```
 
@@ -86,8 +89,8 @@
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "contexts_column": lambda x: [x[pred_col]["context_message"]],
-        "ground_truth_column": "my_ground_truth_col",
+        "retrieved_contexts_column": lambda x: [x[pred_col]["context_message"]],
+        "reference_column": "my_ground_truth_col",
     }
     ```
     """
@@ -98,32 +101,34 @@
     )
 
     required_columns = {
-        "question": question_column,
-        "contexts": contexts_column,
-        "ground_truth": ground_truth_column,
+        "user_input": user_input_column,
+        "retrieved_contexts": retrieved_contexts_column,
+        "reference": reference_column,
     }
 
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df), metrics=[context_precision], **get_ragas_config()
+        Dataset.from_pandas(df), metrics=[context_precision()], **get_ragas_config()
     ).to_pandas()
 
-    fig_histogram = px.histogram(x=result_df["context_precision"].to_list(), nbins=10)
-    fig_box = px.box(x=result_df["context_precision"].to_list())
+    score_column = "llm_context_precision_with_reference"
+
+    fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
+    fig_box = px.box(x=result_df[score_column].to_list())
 
     return (
         {
-            # "Scores (will not be uploaded to UI)": result_df[
-            #     ["question", "contexts", "ground_truth", "context_precision"]
+            # "Scores (will not be uploaded to ValidMind Platform)": result_df[
+            #     ["user_input", "retrieved_contexts", "reference", "llm_context_precision_with_reference"]
             # ],
             "Aggregate Scores": [
                 {
-                    "Mean Score": result_df["context_precision"].mean(),
-                    "Median Score": result_df["context_precision"].median(),
-                    "Max Score": result_df["context_precision"].max(),
-                    "Min Score": result_df["context_precision"].min(),
-                    "Standard Deviation": result_df["context_precision"].std(),
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
                     "Count": result_df.shape[0],
                 }
             ],
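The same renames apply to this test's parameters (`question_column` → `user_input_column`, `contexts_column` → `retrieved_contexts_column`, `ground_truth_column` → `reference_column`). A small illustrative helper, not part of the package, for translating a params dict written against 2.5.x:

```python
# Hypothetical migration helper; the mapping mirrors the parameter renames above.
PARAM_RENAMES = {
    "question_column": "user_input_column",
    "contexts_column": "retrieved_contexts_column",
    "ground_truth_column": "reference_column",
}


def migrate_ragas_params(params: dict) -> dict:
    """Return a copy of `params` with pre-2.6 ragas parameter names translated."""
    return {PARAM_RENAMES.get(key, key): value for key, value in params.items()}


print(migrate_ragas_params({"contexts_column": "context_info"}))
# -> {'retrieved_contexts_column': 'context_info'}
```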
validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py

@@ -0,0 +1,133 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+
+from validmind import tags, tasks
+from validmind.errors import MissingDependencyError
+
+from .utils import get_ragas_config, get_renamed_columns
+
+try:
+    from ragas import evaluate
+    from ragas.metrics import LLMContextPrecisionWithoutReference as context_precision
+except ImportError as e:
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for ContextPrecision. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
+
+
+@tags("ragas", "llm", "retrieval_performance")
+@tasks("text_qa", "text_generation", "text_summarization", "text_classification")
+def ContextPrecisionWithoutReference(
+    dataset,
+    user_input_column: str = "user_input",
+    retrieved_contexts_column: str = "retrieved_contexts",
+    response_column: str = "response",
+):  # noqa: B950
+    """
+    Context Precision Without Reference is a metric used to evaluate the relevance of
+    retrieved contexts compared to the expected response for a given user input. This
+    metric compares each retrieved context (or chunk) with the response to estimate
+    if the retrieved context is relevant.
+
+    This metric can be used when you have both retrieved contexts and associated
+    reference contexts for a `user_input`. Using a Language Model (LLM), it determines
+    the relevance of each retrieved context by comparing it directly with the response,
+    producing scores between 0 and 1, where higher scores indicate better precision in
+    retrieving relevant contexts.
+
+    ### Configuring Columns
+
+    This metric requires the following columns in your dataset:
+
+    - `user_input` (str): The user query or input to the model.
+    - `retrieved_contexts` (List[str]): A list of text contexts retrieved for the
+      user input that will be evaluated for relevance.
+    - `response` (str): The model’s output response associated with the user input.
+
+    If your dataset stores this data in different columns, you can specify alternate
+    column names using the parameters `user_input_column`, `retrieved_contexts_column`,
+    and `response_column`.
+
+    Example configuration for custom column names:
+    ```python
+    {
+        "user_input_column": "user_query",
+        "retrieved_contexts_column": "retrieved_texts",
+        "response_column": "model_output",
+    }
+    ```
+
+    For datasets with data stored as dictionaries in other columns, specify the
+    column and key like so:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "retrieved_contexts_column": f"{pred_col}.contexts",
+        "response_column": f"{pred_col}.response",
+    }
+    ```
+
+    Alternatively, for complex situations, you may use a function to extract data:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "retrieved_contexts_column": lambda x: [x[pred_col]["context_message"]],
+        "response_column": "my_response_col",
+    }
+    ```
+    """
+
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "user_input": user_input_column,
+        "retrieved_contexts": retrieved_contexts_column,
+        "response": response_column,
+    }
+
+    df = get_renamed_columns(dataset._df, required_columns)
+
+    result_df = evaluate(
+        Dataset.from_pandas(df), metrics=[context_precision()], **get_ragas_config()
+    ).to_pandas()
+
+    score_column = "llm_context_precision_without_reference"
+
+    fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
+    fig_box = px.box(x=result_df[score_column].to_list())
+
+    return (
+        {
+            # "Scores (will not be uploaded to ValidMind Platform)": result_df[
+            #     ["user_input", "retrieved_contexts", "response", "llm_context_precision_without_reference"]
+            # ],
+            "Aggregate Scores": [
+                {
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
+                    "Count": result_df.shape[0],
+                }
+            ],
+        },
+        fig_histogram,
+        fig_box,
+    )
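Because this new test compares retrieved contexts with the model response rather than a ground-truth reference, it can be run on datasets that have no `reference` column. A sketch of invoking it with the custom column names from the docstring example; the DataFrame contents and `input_id` are placeholders:

```python
import pandas as pd
import validmind as vm

# Toy RAG outputs stored under custom column names, mapped via `params` below.
rag_df = pd.DataFrame(
    {
        "user_query": ["What is the capital of France?"],
        "retrieved_texts": [["Paris is the capital of France."]],
        "model_output": ["The capital of France is Paris."],
    }
)

vm_ds = vm.init_dataset(dataset=rag_df, input_id="rag_eval_ds")

vm.tests.run_test(
    "validmind.model_validation.ragas.ContextPrecisionWithoutReference",
    inputs={"dataset": vm_ds},
    params={
        "user_input_column": "user_query",
        "retrieved_contexts_column": "retrieved_texts",
        "response_column": "model_output",
    },
)
```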
validmind/tests/model_validation/ragas/ContextRecall.py

@@ -14,23 +14,26 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import context_recall
+    from ragas.metrics import LLMContextRecall as context_recall
 except ImportError as e:
-    raise MissingDependencyError(
-        "Missing required package `ragas` for ContextRecall. "
-        "Please run `pip install validmind[llm]` to use LLM tests",
-        required_dependencies=["ragas"],
-        extra="llm",
-    ) from e
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for ContextRecall. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
 
 
 @tags("ragas", "llm", "retrieval_performance")
 @tasks("text_qa", "text_generation", "text_summarization", "text_classification")
 def ContextRecall(
     dataset,
-    question_column: str = "question",
-    contexts_column: str = "contexts",
-    ground_truth_column: str = "ground_truth",
+    user_input_column: str = "user_input",
+    retrieved_contexts_column: str = "retrieved_contexts",
+    reference_column: str = "reference",
 ):
     """
     Context recall measures the extent to which the retrieved context aligns with the
@@ -53,22 +56,22 @@ def ContextRecall(
 
     This metric requires the following columns in your dataset:
 
-    - `question` (str): The text query that was input into the model.
-    - `contexts` (List[str]): A list of text contexts which are retrieved and which
-      will be evaluated to make sure they contain all items in the ground truth.
-    - `ground_truth` (str): The ground truth text to compare with the retrieved contexts.
+    - `user_input` (str): The text query that was input into the model.
+    - `retrieved_contexts` (List[str]): A list of text contexts which are retrieved and
+      which will be evaluated to make sure they contain all items in the ground truth.
+    - `reference` (str): The ground truth text to compare with the retrieved contexts.
 
     If the above data is not in the appropriate column, you can specify different column
-    names for these fields using the parameters `question_column`, `contexts_column`
-    and `ground_truth_column`.
+    names for these fields using the parameters `user_input_column`,
+    `retrieved_contexts_column` and `reference_column`.
 
     For example, if your dataset has this data stored in different columns, you can
     pass the following parameters:
     ```python
     {
-        "question_column": "question",
-        "contexts_column": "context_info"
-        "ground_truth_column": "my_ground_truth_col",
+        "user_input_column": "user_input",
+        "retrieved_contexts_column": "retrieved_contexts",
+        "reference_column": "reference",
     }
     ```
 
@@ -77,8 +80,8 @@
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "contexts_column": f"{pred_col}.contexts",
-        "ground_truth_column": "my_ground_truth_col",
+        "retrieved_contexts_column": f"{pred_col}.retrieved_contexts",
+        "reference_column": f"{pred_col}.reference",
     }
     ```
 
@@ -86,8 +89,8 @@
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "contexts_column": lambda x: [x[pred_col]["context_message"]],
-        "ground_truth_column": "my_ground_truth_col",
+        "retrieved_contexts_column": lambda x: [x[pred_col]["retrieved_contexts"]],
+        "reference_column": "my_ground_truth_col",
     }
     ```
     """
@@ -98,32 +101,34 @@
     )
 
     required_columns = {
-        "question": question_column,
-        "contexts": contexts_column,
-        "ground_truth": ground_truth_column,
+        "user_input": user_input_column,
+        "retrieved_contexts": retrieved_contexts_column,
+        "reference": reference_column,
     }
 
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df), metrics=[context_recall], **get_ragas_config()
+        Dataset.from_pandas(df), metrics=[context_recall()], **get_ragas_config()
     ).to_pandas()
 
-    fig_histogram = px.histogram(x=result_df["context_recall"].to_list(), nbins=10)
-    fig_box = px.box(x=result_df["context_recall"].to_list())
+    score_column = "context_recall"
+
+    fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
+    fig_box = px.box(x=result_df[score_column].to_list())
 
     return (
         {
-            # "Scores (will not be uploaded to UI)": result_df[
+            # "Scores (will not be uploaded to ValidMind Platform)": result_df[
            #     ["question", "contexts", "ground_truth", "context_recall"]
            # ],
            "Aggregate Scores": [
                {
-                    "Mean Score": result_df["context_recall"].mean(),
-                    "Median Score": result_df["context_recall"].median(),
-                    "Max Score": result_df["context_recall"].max(),
-                    "Min Score": result_df["context_recall"].min(),
-                    "Standard Deviation": result_df["context_recall"].std(),
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
                    "Count": result_df.shape[0],
                }
            ],
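All four ragas tests above now compute their summary over a single `score_column` and return an aggregate-scores table alongside histogram and box-plot figures. A standalone sketch of that aggregation, mirroring the dict built in the diffs (the sample scores are made up):

```python
import pandas as pd


def aggregate_scores(scores: pd.Series) -> dict:
    # Mirrors the "Aggregate Scores" entry returned by the ragas tests.
    return {
        "Mean Score": scores.mean(),
        "Median Score": scores.median(),
        "Max Score": scores.max(),
        "Min Score": scores.min(),
        "Standard Deviation": scores.std(),
        "Count": scores.shape[0],
    }


print(aggregate_scores(pd.Series([0.2, 0.8, 1.0])))
```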