validmind 2.5.6__py3-none-any.whl → 2.5.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (212)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +26 -7
  3. validmind/api_client.py +89 -43
  4. validmind/client.py +2 -2
  5. validmind/client_config.py +11 -14
  6. validmind/datasets/regression/fred_timeseries.py +67 -138
  7. validmind/template.py +1 -0
  8. validmind/test_suites/__init__.py +0 -2
  9. validmind/test_suites/statsmodels_timeseries.py +1 -1
  10. validmind/test_suites/summarization.py +0 -1
  11. validmind/test_suites/time_series.py +0 -43
  12. validmind/tests/__types__.py +3 -13
  13. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  14. validmind/tests/data_validation/ADF.py +31 -24
  15. validmind/tests/data_validation/AutoAR.py +9 -9
  16. validmind/tests/data_validation/AutoMA.py +23 -16
  17. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  18. validmind/tests/data_validation/AutoStationarity.py +21 -16
  19. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  20. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +82 -124
  21. validmind/tests/data_validation/ClassImbalance.py +15 -12
  22. validmind/tests/data_validation/DFGLSArch.py +19 -13
  23. validmind/tests/data_validation/DatasetDescription.py +17 -11
  24. validmind/tests/data_validation/DatasetSplit.py +7 -5
  25. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  26. validmind/tests/data_validation/Duplicates.py +33 -25
  27. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  28. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  29. validmind/tests/data_validation/HighCardinality.py +19 -12
  30. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  31. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  32. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  33. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  34. validmind/tests/data_validation/KPSS.py +34 -29
  35. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  36. validmind/tests/data_validation/MissingValues.py +32 -27
  37. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  38. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  39. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  40. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  41. validmind/tests/data_validation/ScatterPlot.py +63 -78
  42. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  43. validmind/tests/data_validation/Skewness.py +35 -37
  44. validmind/tests/data_validation/SpreadPlot.py +35 -35
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  47. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  48. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  49. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  50. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  51. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  52. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  53. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  54. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  55. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  56. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  57. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  58. validmind/tests/data_validation/UniqueRows.py +11 -6
  59. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  60. validmind/tests/data_validation/WOEBinTable.py +35 -30
  61. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  62. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  63. validmind/tests/data_validation/nlp/Hashtags.py +27 -20
  64. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  65. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  66. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  67. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  68. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  69. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  70. validmind/tests/data_validation/nlp/TextDescription.py +36 -35
  71. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  72. validmind/tests/decorator.py +81 -42
  73. validmind/tests/model_validation/BertScore.py +36 -27
  74. validmind/tests/model_validation/BleuScore.py +25 -19
  75. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  76. validmind/tests/model_validation/ContextualRecall.py +35 -13
  77. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  78. validmind/tests/model_validation/MeteorScore.py +46 -33
  79. validmind/tests/model_validation/ModelMetadata.py +32 -64
  80. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  81. validmind/tests/model_validation/RegardScore.py +30 -14
  82. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  83. validmind/tests/model_validation/RougeScore.py +36 -30
  84. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  85. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  86. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  87. validmind/tests/model_validation/TokenDisparity.py +31 -23
  88. validmind/tests/model_validation/ToxicityScore.py +26 -17
  89. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  90. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  91. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  92. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  93. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  94. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  95. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  96. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  97. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  98. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  99. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  100. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  101. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  102. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  103. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  104. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  105. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  106. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  107. validmind/tests/model_validation/ragas/AspectCritique.py +7 -0
  108. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  109. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  110. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  111. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  112. validmind/tests/model_validation/ragas/utils.py +6 -0
  113. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  114. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  115. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  116. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  117. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  118. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  119. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  120. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  121. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  122. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  123. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  124. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  125. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  126. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  127. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  128. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  129. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  130. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  131. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +31 -25
  132. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  133. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  134. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  135. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  136. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  137. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  138. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -93
  139. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  140. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +113 -73
  141. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -5
  142. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  143. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  144. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  145. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  146. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  147. validmind/tests/model_validation/statsmodels/BoxPierce.py +14 -10
  148. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  149. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +19 -12
  150. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  151. validmind/tests/model_validation/statsmodels/JarqueBera.py +27 -22
  152. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  153. validmind/tests/model_validation/statsmodels/LJungBox.py +32 -28
  154. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  155. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +87 -119
  156. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  157. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  158. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  159. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  160. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  161. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  162. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  163. validmind/tests/model_validation/statsmodels/RunsTest.py +32 -28
  164. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  165. validmind/tests/model_validation/statsmodels/ShapiroWilk.py +15 -8
  166. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  167. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  168. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  169. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  170. validmind/tests/prompt_validation/Bias.py +14 -11
  171. validmind/tests/prompt_validation/Clarity.py +16 -14
  172. validmind/tests/prompt_validation/Conciseness.py +7 -5
  173. validmind/tests/prompt_validation/Delimitation.py +23 -22
  174. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  175. validmind/tests/prompt_validation/Robustness.py +12 -10
  176. validmind/tests/prompt_validation/Specificity.py +13 -11
  177. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  178. validmind/tests/run.py +68 -23
  179. validmind/unit_metrics/__init__.py +81 -144
  180. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  181. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  182. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  183. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  184. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  185. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  186. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  187. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  188. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  189. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  190. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  191. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  192. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  193. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  194. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  195. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  196. validmind/vm_models/dataset/dataset.py +2 -0
  197. validmind/vm_models/figure.py +5 -0
  198. validmind/vm_models/test/result_wrapper.py +93 -132
  199. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/METADATA +1 -1
  200. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/RECORD +203 -210
  201. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  202. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  203. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  204. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  205. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  206. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  207. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  208. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  209. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  210. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/LICENSE +0 -0
  211. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/WHEEL +0 -0
  212. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/entry_points.txt +0 -0
validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py CHANGED
@@ -12,29 +12,37 @@ from validmind import tags, tasks
 @tasks("monitoring")
 def TargetPredictionDistributionPlot(datasets, model):
     """
-    **Purpose:**
-    This test provides the prediction distributions from the reference dataset and the new monitoring dataset. If there
-    are significant differences in the distributions, it might indicate different underlying data characteristics that
-    warrant further investigation into the root causes.
-
-    **Test Mechanism:**
-    The methodology involves generating Kernel Density Estimation (KDE) plots for the prediction probabilities from
-    both the reference and monitoring datasets. By comparing these KDE plots, one can visually assess any significant
-    differences in the prediction distributions between the two datasets.
-
-    **Signs of High Risk:**
-    - Significant divergence between the distribution curves of the reference and monitoring predictions
-    - Unusual shifts or bimodal distribution in the monitoring predictions compared to the reference predictions
-
-    **Strengths:**
-    - Visual representation makes it easy to spot differences in prediction distributions
-    - Useful for identifying potential data drift or changes in underlying data characteristics
-    - Simple and efficient to implement using standard plotting libraries
-
-    **Limitations:**
-    - Subjective interpretation of the visual plots
-    - Might not pinpoint the exact cause of distribution changes
-    - Less effective if the differences in distributions are subtle and not easily visible
+    Assesses differences in prediction distributions between a reference dataset and a monitoring dataset to identify
+    potential data drift.
+
+    ### Purpose
+
+    The Target Prediction Distribution Plot test aims to evaluate potential changes in the prediction distributions
+    between the reference and new monitoring datasets. It seeks to identify underlying shifts in data characteristics
+    that warrant further investigation.
+
+    ### Test Mechanism
+
+    This test generates Kernel Density Estimation (KDE) plots for prediction probabilities from both the reference and
+    monitoring datasets. By visually comparing the KDE plots, it assesses significant differences in the prediction
+    distributions between the two datasets.
+
+    ### Signs of High Risk
+
+    - Significant divergence between the distribution curves of reference and monitoring predictions.
+    - Unusual shifts or bimodal distribution in the monitoring predictions compared to the reference predictions.
+
+    ### Strengths
+
+    - Visual representation makes it easy to spot differences in prediction distributions.
+    - Useful for identifying potential data drift or changes in underlying data characteristics.
+    - Simple and efficient to implement using standard plotting libraries.
+
+    ### Limitations
+
+    - Subjective interpretation of the visual plots.
+    - Might not pinpoint the exact cause of distribution changes.
+    - Less effective if the differences in distributions are subtle and not easily visible.
     """
 
     pred_ref = datasets[0].y_prob_df(model)
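For orientation, the mechanism described in the rewritten docstring boils down to overlaying two kernel density estimates of prediction probabilities. The sketch below is illustrative only; it is not the ValidMind implementation, and it assumes plain NumPy arrays of probabilities extracted from the reference and monitoring datasets.

    # Illustrative sketch only -- not the ValidMind implementation.
    import numpy as np
    from scipy.stats import gaussian_kde
    import matplotlib.pyplot as plt

    def plot_prediction_kde(pred_ref: np.ndarray, pred_monitor: np.ndarray) -> None:
        """Overlay KDEs of prediction probabilities to eyeball drift."""
        grid = np.linspace(0, 1, 200)  # classification probabilities live in [0, 1]
        plt.plot(grid, gaussian_kde(pred_ref)(grid), label="Reference")
        plt.plot(grid, gaussian_kde(pred_monitor)(grid), label="Monitoring")
        plt.xlabel("Predicted probability")
        plt.ylabel("Density")
        plt.legend()
        plt.title("Prediction distribution: reference vs. monitoring")
        plt.show()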
validmind/tests/prompt_validation/Bias.py CHANGED
@@ -27,42 +27,45 @@ from .ai_powered_test import (
 @dataclass
 class Bias(ThresholdTest):
     """
-    Evaluates bias in a Large Language Model based on the order and distribution of exemplars in a prompt.
+    Assesses potential bias in a Large Language Model by analyzing the distribution and order of exemplars in the
+    prompt.
+
+    ### Purpose
 
-    **Purpose:**
     The Bias Evaluation test calculates if and how the order and distribution of exemplars (examples) in a few-shot
     learning prompt affect the output of a Large Language Model (LLM). The results of this evaluation can be used to
     fine-tune the model's performance and manage any unintended biases in its results.
 
-    **Test Mechanism:**
+    ### Test Mechanism
+
     This test uses two checks:
 
-    1. *Distribution of Exemplars:* The number of positive vs. negative examples in a prompt is varied. The test then
+    1. **Distribution of Exemplars:** The number of positive vs. negative examples in a prompt is varied. The test then
     examines the LLM's classification of a neutral or ambiguous statement under these circumstances.
-    2. *Order of Exemplars:* The sequence in which positive and negative examples are presented to the model is
+    2. **Order of Exemplars:** The sequence in which positive and negative examples are presented to the model is
     modified. Their resultant effect on the LLM's response is studied.
 
     For each test case, the LLM grades the input prompt on a scale of 1 to 10. It evaluates whether the examples in the
     prompt could produce biased responses. The test only passes if the score meets or exceeds a predetermined minimum
-    threshold. This threshold is set at 7 by default, but it can be modified as per the requirements via the test
+    threshold. This threshold is set at 7 by default but can be modified as per the requirements via the test
     parameters.
 
-    **Signs of High Risk:**
+    ### Signs of High Risk
 
     - A skewed result favoring either positive or negative responses may suggest potential bias in the model. This skew
     could be caused by an unbalanced distribution of positive and negative exemplars.
     - If the score given by the model is less than the set minimum threshold, it might indicate a risk of high bias and
     hence poor performance.
 
-    **Strengths:**
+    ### Strengths
 
-    - This test provides a quantitative measure of potential bias, providing clear guidelines for developers about
+    - This test provides a quantitative measure of potential bias, offering clear guidelines for developers about
     whether their Large Language Model (LLM) contains significant bias.
-    - It's useful in evaluating the impartiality of the model based on the distribution and sequence of examples.
+    - It is useful in evaluating the impartiality of the model based on the distribution and sequence of examples.
     - The flexibility to adjust the minimum required threshold allows tailoring this test to stricter or more lenient
     bias standards.
 
-    **Limitations:**
+    ### Limitations
 
     - The test may not pick up on more subtle forms of bias or biases that are not directly related to the distribution
     or order of exemplars.
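The prompt-validation docstrings reworked in this release (Bias above, Clarity through Specificity below) all describe the same threshold-test pattern: an LLM grades the prompt from 1 to 10 and the test passes at or above a configurable minimum. A minimal sketch of invoking one of them follows, assuming the usual `validmind.prompt_validation.<TestName>` test ID and a `min_threshold` parameter name (both assumptions, not confirmed by this diff).

    import validmind as vm

    def run_bias_check(vm_model, threshold: int = 8):
        # vm_model: a model previously registered via vm.init_model(...)
        return vm.tests.run_test(
            "validmind.prompt_validation.Bias",   # assumed test ID
            inputs={"model": vm_model},
            params={"min_threshold": threshold},  # docstring: defaults to 7
        )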
validmind/tests/prompt_validation/Clarity.py CHANGED
@@ -29,36 +29,38 @@ class Clarity(ThresholdTest):
     """
     Evaluates and scores the clarity of prompts in a Large Language Model based on specified guidelines.
 
-    **Purpose:**
+    ### Purpose
+
     The Clarity evaluation metric is used to assess how clear the prompts of a Large Language Model (LLM) are. This
     assessment is particularly important because clear prompts assist the LLM in more accurately interpreting and
     responding to instructions.
 
-    **Test Mechanism:**
+    ### Test Mechanism
+
     The evaluation uses an LLM to scrutinize the clarity of prompts, factoring in considerations such as the inclusion
-    of relevant details, persona adoption, step-by-step instructions, usage of examples and specification of desired
+    of relevant details, persona adoption, step-by-step instructions, usage of examples, and specification of desired
     output length. Each prompt is rated on a clarity scale of 1 to 10, and any prompt scoring at or above the preset
     threshold (default of 7) will be marked as clear. It is important to note that this threshold can be adjusted via
     test parameters, providing flexibility in the evaluation process.
 
-    **Signs of High Risk:**
+    ### Signs of High Risk
 
     - Prompts that consistently score below the clarity threshold
-    - Repeated failure of prompts to adhere to guidelines for clarity. These guidelines could include detail inclusion,
-    persona adoption, explicit step-by-step instructions, use of examples, and specification of output length.
+    - Repeated failure of prompts to adhere to guidelines for clarity, including detail inclusion, persona adoption,
+    explicit step-by-step instructions, use of examples, and specification of output length
 
-    **Strengths:**
+    ### Strengths
 
-    - Encourages the development of more effective prompts that aid the LLM in interpreting instructions accurately.
-    - Applies a quantifiable measure (a score from 1 to 10) to evaluate the clarity of prompts.
-    - Threshold for clarity is adjustable, allowing for flexible evaluation depending on the context.
+    - Encourages the development of more effective prompts that aid the LLM in interpreting instructions accurately
+    - Applies a quantifiable measure (a score from 1 to 10) to evaluate the clarity of prompts
+    - Threshold for clarity is adjustable, allowing for flexible evaluation depending on the context
 
-    **Limitations:**
+    ### Limitations
 
-    - Scoring system is subjective and relies on the AI’s interpretation of 'clarity'.
+    - Scoring system is subjective and relies on the AI’s interpretation of 'clarity'
     - The test assumes that all required factors (detail inclusion, persona adoption, step-by-step instructions, use of
-    examples, and specification of output length) contribute equally to clarity, which might not always be the case.
-    - The evaluation may not be as effective if used on non-textual models.
+    examples, and specification of output length) contribute equally to clarity, which might not always be the case
+    - The evaluation may not be as effective if used on non-textual models
     """
 
     name = "clarity"
validmind/tests/prompt_validation/Conciseness.py CHANGED
@@ -29,31 +29,33 @@ class Conciseness(ThresholdTest):
     """
     Analyzes and grades the conciseness of prompts provided to a Large Language Model.
 
-    **Purpose:**
+    ### Purpose
+
    The Conciseness Assessment is designed to evaluate the brevity and succinctness of prompts provided to a Language
     Learning Model (LLM). A concise prompt strikes a balance between offering clear instructions and eliminating
     redundant or unnecessary information, ensuring that the LLM receives relevant input without being overwhelmed.
 
-    **Test Mechanism:**
+    ### Test Mechanism
+
     Using an LLM, this test conducts a conciseness analysis on input prompts. The analysis grades the prompt on a scale
     from 1 to 10, where the grade reflects how well the prompt delivers clear instructions without being verbose.
     Prompts that score equal to or above a predefined threshold (default set to 7) are deemed successfully concise.
     This threshold can be adjusted to meet specific requirements.
 
-    **Signs of High Risk:**
+    ### Signs of High Risk
 
     - Prompts that consistently score below the predefined threshold.
     - Prompts that are overly wordy or contain unnecessary information.
     - Prompts that create confusion or ambiguity due to excess or unnecessary information.
 
-    **Strengths:**
+    ### Strengths
 
     - Ensures clarity and effectiveness of the prompts.
     - Promotes brevity and preciseness in prompts without sacrificing essential information.
     - Useful for models like LLMs, where input prompt length and clarity greatly influence model performance.
     - Provides a quantifiable measure of prompt conciseness.
 
-    **Limitations:**
+    ### Limitations
 
     - The conciseness score is based on an AI's assessment, which might not fully capture human interpretation of
     conciseness.
validmind/tests/prompt_validation/Delimitation.py CHANGED
@@ -29,38 +29,39 @@ class Delimitation(ThresholdTest):
     """
     Evaluates the proper use of delimiters in prompts provided to Large Language Models.
 
-    **Purpose:**
-    This test, dubbed the "Delimitation Test", is engineered to assess whether prompts provided to the Language
-    Learning Model (LLM) correctly use delimiters to mark different sections of the input. Well-delimited prompts
-    simplify the interpretation process for LLM, ensuring responses are precise and accurate.
+    ### Purpose
+
+    The Delimitation Test aims to assess whether prompts provided to the Language Learning Model (LLM) correctly use
+    delimiters to mark different sections of the input. Well-delimited prompts help simplify the interpretation process
+    for the LLM, ensuring that the responses are precise and accurate.
+
+    ### Test Mechanism
 
-    **Test Mechanism:**
     The test employs an LLM to examine prompts for appropriate use of delimiters such as triple quotation marks, XML
-    tags, and section titles. Each prompt is assigned a score from 1 to 10 based on its delimitation integrity. Those
+    tags, and section titles. Each prompt is assigned a score from 1 to 10 based on its delimitation integrity. Prompts
     with scores equal to or above the preset threshold (which is 7 by default, although it can be adjusted as
     necessary) pass the test.
 
-    **Signs of High Risk:**
+    ### Signs of High Risk
 
-    - The test identifies prompts where a delimiter is missing, improperly placed, or incorrect, which can lead to
-    misinterpretation by the LLM.
-    - A high-risk scenario may involve complex prompts with multiple tasks or diverse data where correct delimitation
-    is integral to understanding.
-    - Low scores (below the threshold) are a clear indicator of high risk.
+    - Prompts missing, improperly placed, or incorrectly used delimiters, leading to misinterpretation by the LLM.
+    - High-risk scenarios with complex prompts involving multiple tasks or diverse data where correct delimitation is
+    crucial.
+    - Scores below the threshold, indicating a high risk.
 
-    **Strengths:**
+    ### Strengths
 
-    - This test ensures clarity in the demarcation of different components of given prompts.
-    - It helps reduce ambiguity in understanding prompts, particularly for complex tasks.
-    - Scoring allows for quantified insight into the appropriateness of delimiter usage, aiding continuous improvement.
+    - Ensures clarity in demarcating different components of given prompts.
+    - Reduces ambiguity in understanding prompts, especially for complex tasks.
+    - Provides a quantified insight into the appropriateness of delimiter usage, aiding continuous improvement.
 
-    **Limitations:**
+    ### Limitations
 
-    - The test only checks for the presence and placement of delimiter, not whether the correct delimiter type is used
-    for the specific data or task.
-    - It may not fully reveal the impacts of poor delimitation on LLM's final performance.
-    - Depending on the complexity of the tasks and prompts, the preset score threshold may not be refined enough,
-    requiring regular manual adjustment.
+    - Only checks for the presence and placement of delimiters, not whether the correct delimiter type is used for the
+    specific data or task.
+    - May not fully reveal the impacts of poor delimitation on the LLM's final performance.
+    - The preset score threshold may not be refined enough for complex tasks and prompts, requiring regular manual
+    adjustment.
     """
 
     name = "delimitation"
validmind/tests/prompt_validation/NegativeInstruction.py CHANGED
@@ -29,34 +29,36 @@ class NegativeInstruction(ThresholdTest):
     """
     Evaluates and grades the use of affirmative, proactive language over negative instructions in LLM prompts.
 
-    **Purpose:**
+    ### Purpose
+
     The Negative Instruction test is utilized to scrutinize the prompts given to a Large Language Model (LLM). The
     objective is to ensure these prompts are expressed using proactive, affirmative language. The focus is on
     instructions indicating what needs to be done rather than what needs to be avoided, thereby guiding the LLM more
     efficiently towards the desired output.
 
-    **Test Mechanism:**
+    ### Test Mechanism
+
     An LLM is employed to evaluate each prompt. The prompt is graded based on its use of positive instructions with
     scores ranging between 1-10. This grade reflects how effectively the prompt leverages affirmative language while
     shying away from negative or restrictive instructions. A prompt that attains a grade equal to or above a
     predetermined threshold (7 by default) is regarded as adhering effectively to the best practices of positive
     instruction. This threshold can be custom-tailored through the test parameters.
 
-    **Signs of High Risk:**
+    ### Signs of High Risk
 
     - Low score obtained from the LLM analysis, indicating heavy reliance on negative instructions in the prompts.
     - Failure to surpass the preset minimum threshold.
     - The LLM generates ambiguous or undesirable outputs as a consequence of the negative instructions used in the
     prompt.
 
-    **Strengths:**
+    ### Strengths
 
     - Encourages the usage of affirmative, proactive language in prompts, aiding in more accurate and advantageous
     model responses.
     - The test result provides a comprehensible score, helping to understand how well a prompt follows the positive
     instruction best practices.
 
-    **Limitations:**
+    ### Limitations
 
     - Despite an adequate score, a prompt could still be misleading or could lead to undesired responses due to factors
     not covered by this test.
validmind/tests/prompt_validation/Robustness.py CHANGED
@@ -24,31 +24,33 @@ class Robustness(ThresholdTest):
     """
     Assesses the robustness of prompts provided to a Large Language Model under varying conditions and contexts.
 
-    **Purpose:**
+    ### Purpose
+
     The Robustness test is meant to evaluate the resilience and reliability of prompts provided to a Language Learning
-    Model (LLM). The aim of this test is to guarantee that the prompts consistently generate accurate and the expected
-    outputs, despite being in diverse or challenging scenarios.
+    Model (LLM). The aim of this test is to guarantee that the prompts consistently generate accurate and expected
+    outputs, even in diverse or challenging scenarios.
+
+    ### Test Mechanism
 
-    **Test Mechanism:**
     The Robustness test appraises prompts under various conditions, alterations, and contexts to ascertain their
-    stability in producing consistent responses from the LLM. Factors evaluated range from different phrasings,
-    inclusion of potential distracting elements, and various input complexities. By default, the test generates 10
-    inputs for a prompt but can be adjusted according to test parameters.
+    stability in producing consistent responses from the LLM. Factors evaluated include different phrasings, inclusion
+    of potential distracting elements, and various input complexities. By default, the test generates 10 inputs for a
+    prompt but can be adjusted according to test parameters.
 
-    **Signs of High Risk:**
+    ### Signs of High Risk
 
     - If the output from the tests diverges extensively from the expected results, this indicates high risk.
     - When the prompt doesn't give a consistent performance across various tests.
     - A high risk is indicated when the prompt is susceptible to breaking, especially when the output is expected to be
     of a specific type.
 
-    **Strengths:**
+    ### Strengths
 
     - The robustness test helps to ensure stable performance of the LLM prompts and lowers the chances of generating
     unexpected or off-target outputs.
     - This test is vital for applications where predictability and reliability of the LLM’s output are crucial.
 
-    **Limitations:**
+    ### Limitations
 
     - Currently, the test only supports single-variable prompts, which restricts its application to more complex models.
     - When there are too many target classes (over 10), the test is skipped, which can leave potential vulnerabilities
validmind/tests/prompt_validation/Specificity.py CHANGED
@@ -27,40 +27,42 @@ from .ai_powered_test import (
 @dataclass
 class Specificity(ThresholdTest):
     """
-    Evaluates and scores the specificity of prompts provided to a Large Language Model (LLM), based on clarity,
-    detail, and relevance.
+    Evaluates and scores the specificity of prompts provided to a Large Language Model (LLM), based on clarity, detail,
+    and relevance.
+
+    ### Purpose
 
-    **Purpose:**
     The Specificity Test evaluates the clarity, precision, and effectiveness of the prompts provided to a Language
-    Learning Model (LLM). It aims to ensure that the instructions embedded in a prompt are indisputably clear and
-    relevant, thereby helping to yank out ambiguity and steer the LLM towards desired outputs. This level of
-    specificity significantly affects the accuracy and relevance of LLM outputs.
+    Model (LLM). It aims to ensure that the instructions embedded in a prompt are indisputably clear and relevant,
+    thereby helping to remove ambiguity and steer the LLM towards desired outputs. This level of specificity
+    significantly affects the accuracy and relevance of LLM outputs.
+
+    ### Test Mechanism
 
-    **Test Mechanism:**
     The Specificity Test employs an LLM to grade each prompt based on clarity, detail, and relevance parameters within
     a specificity scale that extends from 1 to 10. On this scale, prompts scoring equal to or more than a predefined
     threshold (set to 7 by default) pass the evaluation, while those scoring below this threshold fail it. Users can
     adjust this threshold as per their requirements.
 
-    **Signs of High Risk:**
+    ### Signs of High Risk
 
     - Prompts scoring consistently below the established threshold
     - Vague or ambiguous prompts that do not provide clear direction to the LLM
     - Overly verbose prompts that may confuse the LLM instead of providing clear guidance
 
-    **Strengths:**
+    ### Strengths
 
     - Enables precise and clear communication with the LLM to achieve desired outputs
     - Serves as a crucial means to measure the effectiveness of prompts
     - Highly customizable, allowing users to set their threshold based on specific use cases
 
-    **Limitations:**
+    ### Limitations
 
     - This test doesn't consider the content comprehension capability of the LLM
     - High specificity score doesn't guarantee a high-quality response from the LLM, as the model's performance is also
     dependent on various other factors
     - Striking a balance between specificity and verbosity can be challenging, as overly detailed prompts might confuse
-    or mislead the model.
+    or mislead the model
     """
 
     name = "specificity"
validmind/tests/prompt_validation/ai_powered_test.py CHANGED
@@ -5,6 +5,7 @@
 import re
 
 from validmind.ai.utils import get_client_and_model
+from validmind.client_config import client_config
 
 missing_prompt_message = """
 Cannot run prompt validation tests on a model with no prompt.
@@ -24,6 +25,11 @@ def call_model(
     system_prompt: str, user_prompt: str, temperature: float = 0.0, seed: int = 42
 ):
     """Call LLM with the given prompts and return the response"""
+    if not client_config.can_generate_llm_test_descriptions():
+        raise ValueError(
+            "LLM based descriptions are not enabled for your organization."
+        )
+
     client, model = get_client_and_model()
 
     return (
validmind/tests/run.py CHANGED
@@ -17,6 +17,7 @@ from validmind.vm_models import (
     MetricResult,
     ResultSummary,
     ResultTable,
+    ResultTableMetadata,
     TestContext,
     TestInput,
     ThresholdTestResults,
@@ -147,6 +148,26 @@ def _combine_figures(figure_lists: List[List[Any]], input_groups: List[Dict[str,
     return [figure for figures in figure_lists for figure in figures]
 
 
+def _combine_unit_metrics(results: List[MetricResultWrapper]):
+    if not results[0].scalar:
+        return
+
+    for result in results:
+        table = ResultTable(
+            data=[{"value": result.scalar}],
+            metadata=ResultTableMetadata(title="Unit Metrics"),
+        )
+        if not result.metric:
+            result.metric = MetricResult(
+                ref_id="will_be_overwritten",
+                key=result.result_id,
+                value=result.scalar,
+                summary=ResultSummary(results=[table]),
+            )
+        else:
+            result.metric.summary.results.append(table)
+
+
 def metric_comparison(
     results: List[MetricResultWrapper],
     test_id: TestID,
@@ -172,22 +193,41 @@ def metric_comparison(
                 raise ValueError(f"Unsupported type for value: {v}")
         input_group_strings.append(new_group)
 
-    merged_summary = _combine_summaries(
-        [
-            {"inputs": input_group_strings[i], "summary": result.metric.summary}
-            for i, result in enumerate(results)
-        ]
-    )
-    merged_figures = _combine_figures(
-        [result.figures for result in results], input_groups
-    )
-
-    # Patch figure metadata so they are connected to the comparison result
-    if merged_figures and len(merged_figures):
-        for i, figure in enumerate(merged_figures):
-            figure.key = f"{figure.key}-{i}"
-            figure.metadata["_name"] = test_id
-            figure.metadata["_ref_id"] = ref_id
+    # handle unit metrics (scalar values) by adding it to the summary
+    _combine_unit_metrics(results)
+
+    # Check if the results list contains a result object with a metric
+    if any(
+        hasattr(result, "metric")
+        and hasattr(result.metric, "summary")
+        and result.metric.summary
+        for result in results
+    ):
+        # Compute merged summaries only if there is a result with a metric
+        merged_summary = _combine_summaries(
+            [
+                {"inputs": input_group_strings[i], "summary": result.metric.summary}
+                for i, result in enumerate(results)
+            ]
+        )
+    else:
+        merged_summary = None
+
+    # Check if the results list contains a result object with figures
+    if any(hasattr(result, "figures") and result.figures for result in results):
+        # Compute merged figures only if there is at least one result with figures
+        merged_figures = _combine_figures(
+            [result.figures for result in results],
+            input_groups,
+        )
+        # Patch figure metadata so they are connected to the comparison result
+        if merged_figures and len(merged_figures):
+            for i, figure in enumerate(merged_figures):
+                figure.key = f"{figure.key}-{i}"
+                figure.metadata["_name"] = test_id
+                figure.metadata["_ref_id"] = ref_id
+    else:
+        merged_figures = None
 
     return MetricResultWrapper(
         result_id=test_id,
@@ -196,7 +236,7 @@
                 test_id=test_id,
                 default_description=f"Comparison test result for {test_id}",
                 summary=merged_summary.serialize() if merged_summary else None,
-                figures=merged_figures,
+                figures=merged_figures if merged_figures else None,
                 should_generate=generate_description,
             ),
         ],
@@ -294,6 +334,8 @@ def threshold_test_comparison(
 def run_comparison_test(
     test_id: TestID,
     input_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]]],
+    name: str = None,
+    unit_metrics: List[TestID] = None,
     params: Dict[str, Any] = None,
     show: bool = True,
     output_template: str = None,
@@ -308,6 +350,8 @@ def run_comparison_test(
     results = [
         run_test(
             test_id,
+            name=name,
+            unit_metrics=unit_metrics,
             inputs=inputs,
             show=False,
             params=params,
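Since `run_comparison_test` now forwards `name` and `unit_metrics` to each inner `run_test` call, comparison grids and composite metrics can be combined. A minimal sketch of the `input_grid` call shape this code path serves; the input IDs below are placeholders assumed to have been registered via `vm.init_dataset` / `vm.init_model`:

    import validmind as vm

    # Run one test across two datasets; metric_comparison merges the results.
    result = vm.tests.run_test(
        "validmind.model_validation.sklearn.ClassifierPerformance",
        input_grid={
            "dataset": ["train_ds", "test_ds"],  # placeholder input_id values
            "model": ["my_model"],
        },
    )
    result.log()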
@@ -387,33 +431,34 @@ def run_test(
             "When providing an `input_grid`, you cannot also provide `inputs` or `kwargs`"
         )
 
+    if unit_metrics:
+        metric_id_name = "".join(word[0].upper() + word[1:] for word in name.split())
+        test_id = f"validmind.composite_metric.{metric_id_name}" or test_id
+
     if input_grid:
         return run_comparison_test(
             test_id,
             input_grid,
+            name=name,
+            unit_metrics=unit_metrics,
             params=params,
             output_template=output_template,
             show=show,
             generate_description=__generate_description,
         )
 
-    if test_id and test_id.startswith("validmind.unit_metrics"):
+    if test_id.startswith("validmind.unit_metrics"):
         # TODO: as we move towards a more unified approach to metrics
         # we will want to make everything functional and remove the
         # separation between unit metrics and "normal" metrics
         return run_metric(test_id, inputs=inputs, params=params, show=show)
 
     if unit_metrics:
-        metric_id_name = "".join(word[0].upper() + word[1:] for word in name.split())
-        test_id = f"validmind.composite_metric.{metric_id_name}"
-
         error, TestClass = load_composite_metric(
             unit_metrics=unit_metrics, metric_name=metric_id_name
         )
-
         if error:
             raise LoadTestError(error)
-
     else:
         TestClass = load_test(test_id, reload=True)
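The relocated `unit_metrics` block above derives the composite test ID from `name` before any dispatch, which appears intended to let composite metrics be used together with `input_grid`. A minimal sketch of the call shape, using unit-metric IDs that match the modules renamed in this release; the input IDs are placeholders:

    import validmind as vm

    result = vm.tests.run_test(
        name="Classification Quality",  # -> validmind.composite_metric.ClassificationQuality
        unit_metrics=[
            "validmind.unit_metrics.classification.F1",
            "validmind.unit_metrics.classification.Precision",
            "validmind.unit_metrics.classification.Recall",
        ],
        inputs={"dataset": "test_ds", "model": "my_model"},  # placeholder input IDs
    )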