validmind-2.5.24-py3-none-any.whl → validmind-2.6.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.7.dist-info/METADATA +137 -0
  179. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.24.dist-info/METADATA +0 -118
  196. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py

@@ -2,23 +2,24 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import numpy as np
 import pandas as pd
 import plotly.graph_objects as go
 from sklearn.metrics import r2_score
 from sklearn.utils import check_random_state
 
-from validmind.errors import SkipTestError
+from validmind import tags, tasks
 from validmind.logging import get_logger
-from validmind.vm_models import Figure, Metric
+from validmind.vm_models import VMDataset, VMModel
 
 logger = get_logger(__name__)
 
 
-@dataclass
-class RegressionPermutationFeatureImportance(Metric):
+@tags("statsmodels", "feature_importance", "visualization")
+@tasks("regression")
+def RegressionPermutationFeatureImportance(
+    dataset: VMDataset, model: VMModel, fontsize: int = 12, figure_height: int = 500
+):
     """
     Assesses the significance of each feature in a model by evaluating the impact on model performance when feature
     values are randomly rearranged.
@@ -55,79 +56,45 @@ class RegressionPermutationFeatureImportance(Metric):
     features.
     - Assumes independence of features when calculating importance, which might not always hold true.
     """
-
-    name = "regression_pfi"
-    required_inputs = ["model", "dataset"]
-    default_params = {
-        "fontsize": 12,
-        "figure_height": 500,
-    }
-    tasks = ["regression"]
-    tags = [
-        "statsmodels",
-        "feature_importance",
-        "visualization",
-    ]
-
-    def run(self):
-        x = self.inputs.dataset.x_df()
-        y = self.inputs.dataset.y_df()
-
-        model = self.inputs.model.model
-        if not hasattr(model, "predict"):
-            raise SkipTestError(
-                "Model does not support 'predict' method required for PFI"
-            )
-
-        # Calculate baseline performance
-        baseline_performance = r2_score(y, model.predict(x))
-        importances = pd.DataFrame(index=x.columns, columns=["Importance", "Std Dev"])
-
-        for column in x.columns:
-            shuffled_scores = []
-            for _ in range(30):  # Default number of shuffles
-                x_shuffled = x.copy()
-                x_shuffled[column] = check_random_state(0).permutation(
-                    x_shuffled[column]
-                )
-                permuted_performance = r2_score(y, model.predict(x_shuffled))
-                shuffled_scores.append(baseline_performance - permuted_performance)
-
-            importances.loc[column] = {
-                "Importance": np.mean(shuffled_scores),
-                "Std Dev": np.std(shuffled_scores),
-            }
-
-        sorted_idx = importances["Importance"].argsort()
-
-        # Plotting the results
-        fig = go.Figure()
-        fig.add_trace(
-            go.Bar(
-                y=importances.index[sorted_idx],
-                x=importances.loc[importances.index[sorted_idx], "Importance"],
-                orientation="h",
-                error_x=dict(
-                    type="data",
-                    array=importances.loc[importances.index[sorted_idx], "Std Dev"],
-                ),
-            )
-        )
-        fig.update_layout(
-            title_text="Permutation Feature Importances",
-            yaxis=dict(
-                tickmode="linear", dtick=1, tickfont=dict(size=self.params["fontsize"])
+    y_true = dataset.y
+
+    baseline_performance = r2_score(y_true, dataset.y_pred(model))
+
+    importances = pd.DataFrame(
+        index=dataset.feature_columns, columns=["Importance", "Std Dev"]
+    )
+
+    for column in dataset.feature_columns:
+        shuffled_scores = []
+        for _ in range(30):  # Default number of shuffles
+            x_shuffled = dataset.x_df()
+            x_shuffled[column] = check_random_state(0).permutation(x_shuffled[column])
+            permuted_performance = r2_score(y_true, model.predict(x_shuffled))
+            shuffled_scores.append(baseline_performance - permuted_performance)
+
+        importances.loc[column] = {
+            "Importance": np.mean(shuffled_scores),
+            "Std Dev": np.std(shuffled_scores),
+        }
+
+    sorted_idx = importances["Importance"].argsort()
+
+    fig = go.Figure()
+    fig.add_trace(
+        go.Bar(
+            y=importances.index[sorted_idx],
+            x=importances.loc[importances.index[sorted_idx], "Importance"],
+            orientation="h",
+            error_x=dict(
+                type="data",
+                array=importances.loc[importances.index[sorted_idx], "Std Dev"],
             ),
-            height=self.params["figure_height"],
-        )
-
-        return self.cache_results(
-            metric_value=importances.to_dict(),
-            figures=[
-                Figure(
-                    for_object=self,
-                    key="regression_pfi",
-                    figure=fig,
-                ),
-            ],
         )
+    )
+    fig.update_layout(
+        title_text="Permutation Feature Importances",
+        yaxis=dict(tickmode="linear", dtick=1, tickfont=dict(size=fontsize)),
+        height=figure_height,
+    )
+
+    return fig
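The change above is representative of the 2.6.x migration that runs through this whole diff: class-based Metric/ThresholdTest subclasses with run() and cache_results() become plain functions decorated with @tags/@tasks that take their inputs and parameters directly and simply return figures, tables, numbers, or booleans. A minimal sketch of how the migrated test might be exercised; the run_test test ID string, input keys, result.show(), and the previously initialized vm_dataset/vm_model objects are assumptions about the usual ValidMind harness, not part of this diff:

# Sketch only: assumes `vm_dataset` and `vm_model` were created earlier via
# vm.init_dataset(...) / vm.init_model(...) and that predictions have been
# assigned so that dataset.y_pred(model) works.
from validmind.tests import run_test

result = run_test(
    "validmind.model_validation.statsmodels.RegressionPermutationFeatureImportance",
    inputs={"dataset": vm_dataset, "model": vm_model},
    params={"fontsize": 10, "figure_height": 400},
)
result.show()  # renders the Plotly figure returned by the test function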
validmind/tests/ongoing_monitoring/PredictionCorrelation.py

@@ -92,6 +92,8 @@ def PredictionCorrelation(datasets, model):
     plt.legend()
     plt.tight_layout()
 
+    plt.close()
+
     corr_final["Features"] = corr_final.index
     corr_final = corr_final[
         ["Features", "Reference Predictions", "Monitoring Predictions"]
validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py

@@ -52,14 +52,16 @@ def TargetPredictionDistributionPlot(datasets, model):
 
     fig = plt.figure()
     plot = sns.kdeplot(
-        pred_ref["Reference Prediction"], shade=True, label="Reference Prediction"
+        pred_ref["Reference Prediction"], fill=True, label="Reference Prediction"
     )
     plot = sns.kdeplot(
-        pred_monitor["Monitoring Prediction"], shade=True, label="Monitor Prediction"
+        pred_monitor["Monitoring Prediction"], fill=True, label="Monitor Prediction"
     )
     plot.set(
         xlabel="Prediction", title="Distribution of Reference & Monitor Predictions"
     )
     plot.legend()
 
+    plt.close()
+
     return fig
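Two small but recurring fixes appear in the ongoing-monitoring tests above: sns.kdeplot(shade=True) becomes fill=True (shade has been deprecated in seaborn since 0.11), and plt.close() is called before returning the figure so matplotlib does not also render it implicitly when the test runs in a notebook. A minimal standalone sketch of the same pattern, with made-up sample data:

import matplotlib.pyplot as plt
import seaborn as sns

def kde_figure(values, label):
    """Return a closed matplotlib figure containing a filled KDE plot."""
    fig = plt.figure()
    sns.kdeplot(values, fill=True, label=label)  # fill=True replaces deprecated shade=True
    plt.legend()
    plt.close()  # keep the figure from auto-displaying; the caller decides when to show it
    return fig

fig = kde_figure([0.1, 0.35, 0.4, 0.62, 0.9], label="Reference Prediction")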
validmind/tests/output.py (new file)

@@ -0,0 +1,120 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Union
+from uuid import uuid4
+
+import numpy as np
+import pandas as pd
+
+from validmind.vm_models.figure import (
+    Figure,
+    is_matplotlib_figure,
+    is_plotly_figure,
+    is_png_image,
+)
+from validmind.vm_models.result import ResultTable, TestResult
+
+
+class OutputHandler(ABC):
+    """Base class for handling different types of test outputs"""
+
+    @abstractmethod
+    def can_handle(self, item: Any) -> bool:
+        """Check if this handler can process the given item"""
+        pass
+
+    @abstractmethod
+    def process(self, item: Any, result: TestResult) -> None:
+        """Process the item and update the TestResult"""
+        pass
+
+
+class BooleanOutputHandler(OutputHandler):
+    def can_handle(self, item: Any) -> bool:
+        return isinstance(item, (bool, np.bool_))
+
+    def process(self, item: Any, result: TestResult) -> None:
+        if result.passed is not None:
+            raise ValueError("Test returned more than one boolean value")
+        result.passed = bool(item)
+
+
+class MetricOutputHandler(OutputHandler):
+    def can_handle(self, item: Any) -> bool:
+        return isinstance(item, (int, float))
+
+    def process(self, item: Any, result: TestResult) -> None:
+        if result.metric is not None:
+            raise ValueError("Only one unit metric may be returned per test.")
+        result.metric = item
+
+
+class FigureOutputHandler(OutputHandler):
+    def can_handle(self, item: Any) -> bool:
+        return (
+            isinstance(item, Figure)
+            or is_matplotlib_figure(item)
+            or is_plotly_figure(item)
+            or is_png_image(item)
+        )
+
+    def process(self, item: Any, result: TestResult) -> None:
+        if isinstance(item, Figure):
+            result.add_figure(item)
+        else:
+            random_id = str(uuid4())[:4]
+            result.add_figure(
+                Figure(
+                    key=f"{result.result_id}:{random_id}",
+                    figure=item,
+                    ref_id=result.ref_id,
+                )
+            )
+
+
+class TableOutputHandler(OutputHandler):
+    def can_handle(self, item: Any) -> bool:
+        return isinstance(item, (list, pd.DataFrame, dict, ResultTable))
+
+    def process(
+        self,
+        item: Union[List[Dict[str, Any]], pd.DataFrame, Dict[str, Any], ResultTable],
+        result: TestResult,
+    ) -> None:
+        tables = item if isinstance(item, dict) else {"": item}
+
+        for table_name, table_data in tables.items():
+            # if already a ResultTable, add it directly
+            if isinstance(table_data, ResultTable):
+                result.add_table(table_data)
+                continue
+
+            if not isinstance(table_data, (list, pd.DataFrame)):
+                raise ValueError(
+                    "Invalid table format: must be a list of dictionaries or a DataFrame"
+                )
+
+            if isinstance(table_data, list):
+                table_data = pd.DataFrame(table_data)
+
+            result.add_table(ResultTable(data=table_data, title=table_name or None))
+
+
+def process_output(item: Any, result: TestResult) -> None:
+    """Process a single test output item and update the TestResult."""
+    handlers = [
+        BooleanOutputHandler(),
+        MetricOutputHandler(),
+        FigureOutputHandler(),
+        TableOutputHandler(),
+    ]
+
+    for handler in handlers:
+        if handler.can_handle(item):
+            handler.process(item, result)
+            return
+
+    raise ValueError(f"Invalid test output type: {type(item)}")
validmind/tests/prompt_validation/Bias.py

@@ -2,19 +2,8 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-from typing import List
-
-import pandas as pd
-
+from validmind import tags, tasks
 from validmind.errors import MissingRequiredTestInputError
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)
 
 from .ai_powered_test import (
     call_model,
@@ -23,9 +12,40 @@ from .ai_powered_test import (
     missing_prompt_message,
 )
 
+SYSTEM = """
+You are a prompt evaluation AI. You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different best practices. You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
+
+Consider the following best practice documentation:
+```
+Its been shown that, when passing examples to an LLM via the prompt for the purpose of Sentiment Analysis, a distribution with a majority of positive examples led the LLM to classify ambiguous sentences as "Positive", and vice versa. However, for more straightforward tasks, such as sentiment classification, biases may be harder to induce. The order of presentation also appeared to influence the LLM's outputs, especially when combined with a skewed distribution of exemplars.
+Tactics for Avoiding Bias in Prompting:
+1. Balanced Distribution: Avoid heavily skewed distributions of examples. Instead, offer amore balanced number of examples for each label.
+2. Randomize Order: To prevent order-induced biases, shuffle the sequence of examples ensuring no fixed pattern, such as presenting all positive examples first.
+3. Continued Experimentation: Regularly test and tweak prompts, especially for complex tasks where the LLM might not have extensive knowledge. This iterative process helps in identifying potential pitfalls and refining prompting strategies.
+Example:
+For instance, using 8 positive and 2 negative examples led the LLM to classify an ambiguous sentence "I feel something" as "Positive". However, with a more balanced distribution, the LLM showed unbiased behavior.
+```
+
+Score the bias of the user-submitted prompt. Return a score from 1 to 10 where 10 is a perfect score. Also provide a short explanation for your score.
+
+Response Format:
+```
+Score: <score>
+Explanation: <explanation>
+```
+""".strip()
+
+USER = '''
+Prompt:
+"""
+{prompt_to_test}
+"""
+'''.strip()
+
 
-@dataclass
-class Bias(ThresholdTest):
+@tags("llm", "few_shot")
+@tasks("text_classification", "text_summarization")
+def Bias(model, min_threshold=7):
     """
     Assesses potential bias in a Large Language Model by analyzing the distribution and order of exemplars in the
     prompt.
@@ -74,87 +94,24 @@
     - The use of a grading mechanism to gauge bias may not be entirely accurate in every case, particularly when the
     difference between threshold and score is narrow.
     """
-
-    name = "bias"
-    required_inputs = ["model.prompt"]
-    default_params = {"min_threshold": 7}
-    tasks = ["text_classification", "text_summarization"]
-    tags = ["llm", "few_shot"]
-
-    system_prompt = """
-    You are a prompt evaluation AI. You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different best practices. You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
-
-    Consider the following best practice documentation:
-    ```
-    Its been shown that, when passing examples to an LLM via the prompt for the purpose of Sentiment Analysis, a distribution with a majority of positive examples led the LLM to classify ambiguous sentences as "Positive", and vice versa. However, for more straightforward tasks, such as sentiment classification, biases may be harder to induce. The order of presentation also appeared to influence the LLM's outputs, especially when combined with a skewed distribution of exemplars.
-    Tactics for Avoiding Bias in Prompting:
-    1. Balanced Distribution: Avoid heavily skewed distributions of examples. Instead, offer amore balanced number of examples for each label.
-    2. Randomize Order: To prevent order-induced biases, shuffle the sequence of examples ensuring no fixed pattern, such as presenting all positive examples first.
-    3. Continued Experimentation: Regularly test and tweak prompts, especially for complex tasks where the LLM might not have extensive knowledge. This iterative process helps in identifying potential pitfalls and refining prompting strategies.
-    Example:
-    For instance, using 8 positive and 2 negative examples led the LLM to classify an ambiguous sentence "I feel something" as "Positive". However, with a more balanced distribution, the LLM showed unbiased behavior.
-    ```
-
-    Score the bias of the user-submitted prompt. Return a score from 1 to 10 where 10 is a perfect score. Also provide a short explanation for your score.
-
-    Response Format:
-    ```
-    Score: <score>
-    Explanation: <explanation>
-    ```
-    """.strip()
-    user_prompt = '''
-    Prompt:
-    """
-    {prompt_to_test}
-    """
-    '''.strip()
-
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        result = results[0]
-        results_table = [
-            {
-                "Score": result.values["score"],
-                "Threshold": result.values["threshold"],
-                "Explanation": result.values["explanation"],
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-        ]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=pd.DataFrame(results_table),
-                    metadata=ResultTableMetadata(
-                        title="Bias Test on Prompt",
-                    ),
-                )
-            ]
-        )
-
-    def run(self):
-        if not hasattr(self.inputs.model, "prompt"):
-            raise MissingRequiredTestInputError(missing_prompt_message)
-
-        response = call_model(
-            system_prompt=self.system_prompt,
-            user_prompt=self.user_prompt.format(
-                prompt_to_test=self.inputs.model.prompt.template
-            ),
-        )
-        score = get_score(response)
-        explanation = get_explanation(response)
-
-        passed = score > self.params["min_threshold"]
-        results = [
-            ThresholdTestResult(
-                passed=passed,
-                values={
-                    "score": score,
-                    "explanation": explanation,
-                    "threshold": self.params["min_threshold"],
-                },
-            )
-        ]
-
-        return self.cache_results(results, passed=passed)
+    if not hasattr(model, "prompt"):
+        raise MissingRequiredTestInputError(missing_prompt_message)
+
+    response = call_model(
+        system_prompt=SYSTEM,
+        user_prompt=USER.format(prompt_to_test=model.prompt.template),
+    )
+
+    score = get_score(response)
+    explanation = get_explanation(response)
+
+    passed = score > min_threshold
+
+    return [
+        {
+            "Score": score,
+            "Explanation": explanation,
+            "Threshold": min_threshold,
+            "Pass/Fail": "Pass" if passed else "Fail",
+        }
+    ], passed
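The prompt-validation tests now follow the same functional pattern: module-level SYSTEM/USER prompt constants, a decorated function that takes the model and a min_threshold parameter, and a (table rows, passed) return value that the runner unpacks through the output handlers shown earlier. A custom check written in the same style might look like the following sketch; the test itself, its name, and the character budget are illustrative and not part of the package:

from validmind import tags, tasks


@tags("llm", "zero_shot")
@tasks("text_classification")
def PromptLength(model, max_chars: int = 2000):
    """Checks that the prompt template stays within a character budget."""
    length = len(model.prompt.template)
    passed = length <= max_chars

    return [
        {
            "Characters": length,
            "Budget": max_chars,
            "Pass/Fail": "Pass" if passed else "Fail",
        }
    ], passed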
validmind/tests/prompt_validation/Clarity.py

@@ -2,19 +2,8 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-from typing import List
-
-import pandas as pd
-
+from validmind import tags, tasks
 from validmind.errors import MissingRequiredTestInputError
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)
 
 from .ai_powered_test import (
     call_model,
@@ -23,9 +12,41 @@ from .ai_powered_test import (
     missing_prompt_message,
 )
 
+SYSTEM = """
+You are a prompt evaluation AI. You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics. You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
+
+Consider the following documentation on prompt clarity guidelines when evaluating the prompt:
+'''
+Clear prompts minimize the room for misinterpretation, allowing the LLM to generate more relevant and accurate responses. Ambiguous or vague instructions might leave the model guessing, leading to suboptimal outputs.
+
+Tactics for Ensuring Clarity that will be referenced during evaluation:
+1. Detail Inclusion: Provide essential details or context to prevent the LLM from making assumptions.
+2. Adopt a Persona: Use system messages to specify the desired persona for the LLM's responses.
+3. Specify Steps: For certain tasks, delineate the required steps explicitly, helping the model in sequential understanding.
+4. Provide Examples: While general instructions are efficient, in some scenarios, "few-shot" prompting or style examples can guide the LLM more effectively.
+5. Determine Output Length: Define the targeted length of the response, whether in terms of paragraphs, bullet points, or other units. While word counts aren't always precise, specifying formats like paragraphs can offer more predictable results.
+'''
+
+Score the clarity of the user-submitted prompt. Return a score from 1 to 10 where 10 is a perfect score. Also provide a short explanation for your score.
 
-@dataclass
-class Clarity(ThresholdTest):
+Response Format:
+```
+Score: <score>
+Explanation: <explanation>
+```
+"""
+
+USER = """
+Prompt:
+'''
+{prompt_to_test}
+'''
+"""
+
+
+@tags("llm", "zero_shot", "few_shot")
+@tasks("text_classification", "text_summarization")
+def Clarity(model, min_threshold=7):
     """
     Evaluates and scores the clarity of prompts in a Large Language Model based on specified guidelines.
 
@@ -62,88 +83,24 @@ class Clarity(ThresholdTest):
     examples, and specification of output length) contribute equally to clarity, which might not always be the case
     - The evaluation may not be as effective if used on non-textual models
     """
-
-    name = "clarity"
-    required_inputs = ["model.prompt"]
-    default_params = {"min_threshold": 7}
-    tasks = ["text_classification", "text_summarization"]
-    tags = ["llm", "zero_shot", "few_shot"]
-
-    system_prompt = """
-    You are a prompt evaluation AI. You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics. You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
-
-    Consider the following documentation on prompt clarity guidelines when evaluating the prompt:
-    '''
-    Clear prompts minimize the room for misinterpretation, allowing the LLM to generate more relevant and accurate responses. Ambiguous or vague instructions might leave the model guessing, leading to suboptimal outputs.
-
-    Tactics for Ensuring Clarity that will be referenced during evaluation:
-    1. Detail Inclusion: Provide essential details or context to prevent the LLM from making assumptions.
-    2. Adopt a Persona: Use system messages to specify the desired persona for the LLM's responses.
-    3. Specify Steps: For certain tasks, delineate the required steps explicitly, helping the model in sequential understanding.
-    4. Provide Examples: While general instructions are efficient, in some scenarios, "few-shot" prompting or style examples can guide the LLM more effectively.
-    5. Determine Output Length: Define the targeted length of the response, whether in terms of paragraphs, bullet points, or other units. While word counts aren't always precise, specifying formats like paragraphs can offer more predictable results.
-    '''
-
-    Score the clarity of the user-submitted prompt. Return a score from 1 to 10 where 10 is a perfect score. Also provide a short explanation for your score.
-
-    Response Format:
-    ```
-    Score: <score>
-    Explanation: <explanation>
-    ```
-    """.strip()
-    user_prompt = '''
-    Prompt:
-    """
-    {prompt_to_test}
-    """
-    '''.strip()
-
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        result = results[0]
-        results_table = [
-            {
-                "Score": result.values["score"],
-                "Threshold": result.values["threshold"],
-                "Explanation": result.values["explanation"],
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-        ]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=pd.DataFrame(results_table),
-                    metadata=ResultTableMetadata(
-                        title="Clarity Test for LLM Prompt",
-                    ),
-                )
-            ]
-        )
-
-    def run(self):
-        if not hasattr(self.inputs.model, "prompt"):
-            raise MissingRequiredTestInputError(missing_prompt_message)
-
-        response = call_model(
-            system_prompt=self.system_prompt,
-            user_prompt=self.user_prompt.format(
-                prompt_to_test=self.inputs.model.prompt.template
-            ),
-        )
-        score = get_score(response)
-        explanation = get_explanation(response)
-
-        passed = score > self.params["min_threshold"]
-        results = [
-            ThresholdTestResult(
-                passed=passed,
-                values={
-                    "score": score,
-                    "explanation": explanation,
-                    "threshold": self.params["min_threshold"],
-                },
-            )
-        ]
-
-        return self.cache_results(results, passed=passed)
+    if not hasattr(model, "prompt"):
+        raise MissingRequiredTestInputError(missing_prompt_message)
+
+    response = call_model(
+        system_prompt=SYSTEM,
+        user_prompt=USER.format(prompt_to_test=model.prompt.template),
+    )
+
+    score = get_score(response)
+    explanation = get_explanation(response)
+
+    passed = score > min_threshold
+
+    return [
+        {
+            "Score": score,
+            "Explanation": explanation,
+            "Threshold": min_threshold,
+            "Pass/Fail": "Pass" if passed else "Fail",
+        }
+    ], passed