validmind 2.5.8__py3-none-any.whl → 2.5.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +26 -7
- validmind/api_client.py +89 -43
- validmind/client.py +2 -2
- validmind/client_config.py +11 -14
- validmind/datasets/regression/fred_timeseries.py +67 -138
- validmind/template.py +1 -0
- validmind/test_suites/__init__.py +0 -2
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/test_suites/summarization.py +0 -1
- validmind/test_suites/time_series.py +0 -43
- validmind/tests/__types__.py +3 -13
- validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
- validmind/tests/data_validation/ADF.py +31 -24
- validmind/tests/data_validation/AutoAR.py +9 -9
- validmind/tests/data_validation/AutoMA.py +23 -16
- validmind/tests/data_validation/AutoSeasonality.py +18 -16
- validmind/tests/data_validation/AutoStationarity.py +21 -16
- validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +82 -124
- validmind/tests/data_validation/ClassImbalance.py +15 -12
- validmind/tests/data_validation/DFGLSArch.py +19 -13
- validmind/tests/data_validation/DatasetDescription.py +17 -11
- validmind/tests/data_validation/DatasetSplit.py +7 -5
- validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
- validmind/tests/data_validation/Duplicates.py +33 -25
- validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
- validmind/tests/data_validation/HighCardinality.py +19 -12
- validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
- validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
- validmind/tests/data_validation/IQROutliersTable.py +40 -36
- validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
- validmind/tests/data_validation/KPSS.py +34 -29
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
- validmind/tests/data_validation/MissingValues.py +32 -27
- validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
- validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
- validmind/tests/data_validation/RollingStatsPlot.py +31 -23
- validmind/tests/data_validation/ScatterPlot.py +63 -78
- validmind/tests/data_validation/SeasonalDecompose.py +38 -34
- validmind/tests/data_validation/Skewness.py +35 -37
- validmind/tests/data_validation/SpreadPlot.py +35 -35
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
- validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
- validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
- validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
- validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
- validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
- validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
- validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
- validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
- validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
- validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
- validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
- validmind/tests/data_validation/TooManyZeroValues.py +16 -11
- validmind/tests/data_validation/UniqueRows.py +11 -6
- validmind/tests/data_validation/WOEBinPlots.py +23 -16
- validmind/tests/data_validation/WOEBinTable.py +35 -30
- validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
- validmind/tests/data_validation/nlp/CommonWords.py +21 -14
- validmind/tests/data_validation/nlp/Hashtags.py +27 -20
- validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
- validmind/tests/data_validation/nlp/Mentions.py +21 -15
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
- validmind/tests/data_validation/nlp/Punctuations.py +24 -20
- validmind/tests/data_validation/nlp/Sentiment.py +27 -8
- validmind/tests/data_validation/nlp/StopWords.py +26 -19
- validmind/tests/data_validation/nlp/TextDescription.py +36 -35
- validmind/tests/data_validation/nlp/Toxicity.py +32 -9
- validmind/tests/decorator.py +81 -42
- validmind/tests/model_validation/BertScore.py +36 -27
- validmind/tests/model_validation/BleuScore.py +25 -19
- validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
- validmind/tests/model_validation/ContextualRecall.py +35 -13
- validmind/tests/model_validation/FeaturesAUC.py +32 -13
- validmind/tests/model_validation/MeteorScore.py +46 -33
- validmind/tests/model_validation/ModelMetadata.py +32 -64
- validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
- validmind/tests/model_validation/RegardScore.py +30 -14
- validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
- validmind/tests/model_validation/RougeScore.py +36 -30
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
- validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
- validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
- validmind/tests/model_validation/TokenDisparity.py +31 -23
- validmind/tests/model_validation/ToxicityScore.py +26 -17
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +7 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
- validmind/tests/model_validation/ragas/utils.py +6 -0
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
- validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
- validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +31 -25
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
- validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
- validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
- validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
- validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -93
- validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -5
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
- validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
- validmind/tests/model_validation/statsmodels/BoxPierce.py +14 -10
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +19 -12
- validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
- validmind/tests/model_validation/statsmodels/JarqueBera.py +27 -22
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
- validmind/tests/model_validation/statsmodels/LJungBox.py +32 -28
- validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +87 -119
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
- validmind/tests/model_validation/statsmodels/RunsTest.py +32 -28
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +15 -8
- validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +16 -14
- validmind/tests/prompt_validation/Conciseness.py +7 -5
- validmind/tests/prompt_validation/Delimitation.py +23 -22
- validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
- validmind/tests/prompt_validation/Robustness.py +12 -10
- validmind/tests/prompt_validation/Specificity.py +13 -11
- validmind/tests/prompt_validation/ai_powered_test.py +6 -0
- validmind/tests/run.py +68 -23
- validmind/unit_metrics/__init__.py +81 -144
- validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
- validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
- validmind/unit_metrics/regression/HuberLoss.py +1 -1
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
- validmind/unit_metrics/regression/QuantileLoss.py +1 -1
- validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
- validmind/vm_models/dataset/dataset.py +2 -0
- validmind/vm_models/figure.py +5 -0
- validmind/vm_models/test/result_wrapper.py +93 -132
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/METADATA +1 -1
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/RECORD +203 -210
- validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
- validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
- validmind/tests/data_validation/BivariateHistograms.py +0 -117
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
- validmind/tests/data_validation/MissingValuesRisk.py +0 -88
- validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
- validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/LICENSE +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/WHEEL +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/entry_points.txt +0 -0
@@ -29,31 +29,33 @@ class Conciseness(ThresholdTest):
|
|
29
29
|
"""
|
30
30
|
Analyzes and grades the conciseness of prompts provided to a Large Language Model.
|
31
31
|
|
32
|
-
|
32
|
+
### Purpose
|
33
|
+
|
33
34
|
The Conciseness Assessment is designed to evaluate the brevity and succinctness of prompts provided to a Large
|
34
35
|
Language Model (LLM). A concise prompt strikes a balance between offering clear instructions and eliminating
|
35
36
|
redundant or unnecessary information, ensuring that the LLM receives relevant input without being overwhelmed.
|
36
37
|
|
37
|
-
|
38
|
+
### Test Mechanism
|
39
|
+
|
38
40
|
Using an LLM, this test conducts a conciseness analysis on input prompts. The analysis grades the prompt on a scale
|
39
41
|
from 1 to 10, where the grade reflects how well the prompt delivers clear instructions without being verbose.
|
40
42
|
Prompts that score equal to or above a predefined threshold (default set to 7) are deemed successfully concise.
|
41
43
|
This threshold can be adjusted to meet specific requirements.
|
42
44
|
|
43
|
-
|
45
|
+
### Signs of High Risk
|
44
46
|
|
45
47
|
- Prompts that consistently score below the predefined threshold.
|
46
48
|
- Prompts that are overly wordy or contain unnecessary information.
|
47
49
|
- Prompts that create confusion or ambiguity due to excess or unnecessary information.
|
48
50
|
|
49
|
-
|
51
|
+
### Strengths
|
50
52
|
|
51
53
|
- Ensures clarity and effectiveness of the prompts.
|
52
54
|
- Promotes brevity and preciseness in prompts without sacrificing essential information.
|
53
55
|
- Useful for models like LLMs, where input prompt length and clarity greatly influence model performance.
|
54
56
|
- Provides a quantifiable measure of prompt conciseness.
|
55
57
|
|
56
|
-
|
58
|
+
### Limitations
|
57
59
|
|
58
60
|
- The conciseness score is based on an AI's assessment, which might not fully capture human interpretation of
|
59
61
|
conciseness.
|
@@ -29,38 +29,39 @@ class Delimitation(ThresholdTest):
|
|
29
29
|
"""
|
30
30
|
Evaluates the proper use of delimiters in prompts provided to Large Language Models.
|
31
31
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
32
|
+
### Purpose
|
33
|
+
|
34
|
+
The Delimitation Test aims to assess whether prompts provided to the Large Language Model (LLM) correctly use
|
35
|
+
delimiters to mark different sections of the input. Well-delimited prompts help simplify the interpretation process
|
36
|
+
for the LLM, ensuring that the responses are precise and accurate.
|
37
|
+
|
38
|
+
### Test Mechanism
|
36
39
|
|
37
|
-
**Test Mechanism:**
|
38
40
|
The test employs an LLM to examine prompts for appropriate use of delimiters such as triple quotation marks, XML
|
39
|
-
tags, and section titles. Each prompt is assigned a score from 1 to 10 based on its delimitation integrity.
|
41
|
+
tags, and section titles. Each prompt is assigned a score from 1 to 10 based on its delimitation integrity. Prompts
|
40
42
|
with scores equal to or above the preset threshold (which is 7 by default, although it can be adjusted as
|
41
43
|
necessary) pass the test.
|
42
44
|
|
43
|
-
|
45
|
+
### Signs of High Risk
|
44
46
|
|
45
|
-
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
- Low scores (below the threshold) are a clear indicator of high risk.
|
47
|
+
- Prompts missing, improperly placed, or incorrectly used delimiters, leading to misinterpretation by the LLM.
|
48
|
+
- High-risk scenarios with complex prompts involving multiple tasks or diverse data where correct delimitation is
|
49
|
+
crucial.
|
50
|
+
- Scores below the threshold, indicating a high risk.
|
50
51
|
|
51
|
-
|
52
|
+
### Strengths
|
52
53
|
|
53
|
-
-
|
54
|
-
-
|
55
|
-
-
|
54
|
+
- Ensures clarity in demarcating different components of given prompts.
|
55
|
+
- Reduces ambiguity in understanding prompts, especially for complex tasks.
|
56
|
+
- Provides a quantified insight into the appropriateness of delimiter usage, aiding continuous improvement.
|
56
57
|
|
57
|
-
|
58
|
+
### Limitations
|
58
59
|
|
59
|
-
-
|
60
|
-
|
61
|
-
-
|
62
|
-
-
|
63
|
-
|
60
|
+
- Only checks for the presence and placement of delimiters, not whether the correct delimiter type is used for the
|
61
|
+
specific data or task.
|
62
|
+
- May not fully reveal the impacts of poor delimitation on the LLM's final performance.
|
63
|
+
- The preset score threshold may not be refined enough for complex tasks and prompts, requiring regular manual
|
64
|
+
adjustment.
|
64
65
|
"""
|
65
66
|
|
66
67
|
name = "delimitation"
|
@@ -29,34 +29,36 @@ class NegativeInstruction(ThresholdTest):
|
|
29
29
|
"""
|
30
30
|
Evaluates and grades the use of affirmative, proactive language over negative instructions in LLM prompts.
|
31
31
|
|
32
|
-
|
32
|
+
### Purpose
|
33
|
+
|
33
34
|
The Negative Instruction test is utilized to scrutinize the prompts given to a Large Language Model (LLM). The
|
34
35
|
objective is to ensure these prompts are expressed using proactive, affirmative language. The focus is on
|
35
36
|
instructions indicating what needs to be done rather than what needs to be avoided, thereby guiding the LLM more
|
36
37
|
efficiently towards the desired output.
|
37
38
|
|
38
|
-
|
39
|
+
### Test Mechanism
|
40
|
+
|
39
41
|
An LLM is employed to evaluate each prompt. The prompt is graded based on its use of positive instructions with
|
40
42
|
scores ranging between 1-10. This grade reflects how effectively the prompt leverages affirmative language while
|
41
43
|
shying away from negative or restrictive instructions. A prompt that attains a grade equal to or above a
|
42
44
|
predetermined threshold (7 by default) is regarded as adhering effectively to the best practices of positive
|
43
45
|
instruction. This threshold can be custom-tailored through the test parameters.
|
44
46
|
|
45
|
-
|
47
|
+
### Signs of High Risk
|
46
48
|
|
47
49
|
- Low score obtained from the LLM analysis, indicating heavy reliance on negative instructions in the prompts.
|
48
50
|
- Failure to surpass the preset minimum threshold.
|
49
51
|
- The LLM generates ambiguous or undesirable outputs as a consequence of the negative instructions used in the
|
50
52
|
prompt.
|
51
53
|
|
52
|
-
|
54
|
+
### Strengths
|
53
55
|
|
54
56
|
- Encourages the usage of affirmative, proactive language in prompts, aiding in more accurate and advantageous
|
55
57
|
model responses.
|
56
58
|
- The test result provides a comprehensible score, helping to understand how well a prompt follows the positive
|
57
59
|
instruction best practices.
|
58
60
|
|
59
|
-
|
61
|
+
### Limitations
|
60
62
|
|
61
63
|
- Despite an adequate score, a prompt could still be misleading or could lead to undesired responses due to factors
|
62
64
|
not covered by this test.
|
@@ -24,31 +24,33 @@ class Robustness(ThresholdTest):
|
|
24
24
|
"""
|
25
25
|
Assesses the robustness of prompts provided to a Large Language Model under varying conditions and contexts.
|
26
26
|
|
27
|
-
|
27
|
+
### Purpose
|
28
|
+
|
28
29
|
The Robustness test is meant to evaluate the resilience and reliability of prompts provided to a Large Language
|
29
|
-
Model (LLM). The aim of this test is to guarantee that the prompts consistently generate accurate and
|
30
|
-
outputs,
|
30
|
+
Model (LLM). The aim of this test is to guarantee that the prompts consistently generate accurate and expected
|
31
|
+
outputs, even in diverse or challenging scenarios.
|
32
|
+
|
33
|
+
### Test Mechanism
|
31
34
|
|
32
|
-
**Test Mechanism:**
|
33
35
|
The Robustness test appraises prompts under various conditions, alterations, and contexts to ascertain their
|
34
|
-
stability in producing consistent responses from the LLM. Factors evaluated
|
35
|
-
|
36
|
-
|
36
|
+
stability in producing consistent responses from the LLM. Factors evaluated include different phrasings, inclusion
|
37
|
+
of potential distracting elements, and various input complexities. By default, the test generates 10 inputs for a
|
38
|
+
prompt but can be adjusted according to test parameters.
|
37
39
|
|
38
|
-
|
40
|
+
### Signs of High Risk
|
39
41
|
|
40
42
|
- If the output from the tests diverges extensively from the expected results, this indicates high risk.
|
41
43
|
- When the prompt doesn't give a consistent performance across various tests.
|
42
44
|
- A high risk is indicated when the prompt is susceptible to breaking, especially when the output is expected to be
|
43
45
|
of a specific type.
|
44
46
|
|
45
|
-
|
47
|
+
### Strengths
|
46
48
|
|
47
49
|
- The robustness test helps to ensure stable performance of the LLM prompts and lowers the chances of generating
|
48
50
|
unexpected or off-target outputs.
|
49
51
|
- This test is vital for applications where predictability and reliability of the LLM’s output are crucial.
|
50
52
|
|
51
|
-
|
53
|
+
### Limitations
|
52
54
|
|
53
55
|
- Currently, the test only supports single-variable prompts, which restricts its application to more complex models.
|
54
56
|
- When there are too many target classes (over 10), the test is skipped, which can leave potential vulnerabilities
|
@@ -27,40 +27,42 @@ from .ai_powered_test import (
|
|
27
27
|
@dataclass
|
28
28
|
class Specificity(ThresholdTest):
|
29
29
|
"""
|
30
|
-
Evaluates and scores the specificity of prompts provided to a Large Language Model (LLM), based on clarity,
|
31
|
-
|
30
|
+
Evaluates and scores the specificity of prompts provided to a Large Language Model (LLM), based on clarity, detail,
|
31
|
+
and relevance.
|
32
|
+
|
33
|
+
### Purpose
|
32
34
|
|
33
|
-
**Purpose:**
|
34
35
|
The Specificity Test evaluates the clarity, precision, and effectiveness of the prompts provided to a Language
|
35
|
-
|
36
|
-
|
37
|
-
|
36
|
+
Model (LLM). It aims to ensure that the instructions embedded in a prompt are indisputably clear and relevant,
|
37
|
+
thereby helping to remove ambiguity and steer the LLM towards desired outputs. This level of specificity
|
38
|
+
significantly affects the accuracy and relevance of LLM outputs.
|
39
|
+
|
40
|
+
### Test Mechanism
|
38
41
|
|
39
|
-
**Test Mechanism:**
|
40
42
|
The Specificity Test employs an LLM to grade each prompt based on clarity, detail, and relevance parameters within
|
41
43
|
a specificity scale that extends from 1 to 10. On this scale, prompts scoring equal to or more than a predefined
|
42
44
|
threshold (set to 7 by default) pass the evaluation, while those scoring below this threshold fail it. Users can
|
43
45
|
adjust this threshold as per their requirements.
|
44
46
|
|
45
|
-
|
47
|
+
### Signs of High Risk
|
46
48
|
|
47
49
|
- Prompts scoring consistently below the established threshold
|
48
50
|
- Vague or ambiguous prompts that do not provide clear direction to the LLM
|
49
51
|
- Overly verbose prompts that may confuse the LLM instead of providing clear guidance
|
50
52
|
|
51
|
-
|
53
|
+
### Strengths
|
52
54
|
|
53
55
|
- Enables precise and clear communication with the LLM to achieve desired outputs
|
54
56
|
- Serves as a crucial means to measure the effectiveness of prompts
|
55
57
|
- Highly customizable, allowing users to set their threshold based on specific use cases
|
56
58
|
|
57
|
-
|
59
|
+
### Limitations
|
58
60
|
|
59
61
|
- This test doesn't consider the content comprehension capability of the LLM
|
60
62
|
- High specificity score doesn't guarantee a high-quality response from the LLM, as the model's performance is also
|
61
63
|
dependent on various other factors
|
62
64
|
- Striking a balance between specificity and verbosity can be challenging, as overly detailed prompts might confuse
|
63
|
-
or mislead the model
|
65
|
+
or mislead the model
|
64
66
|
"""
|
65
67
|
|
66
68
|
name = "specificity"
|
@@ -5,6 +5,7 @@
|
|
5
5
|
import re
|
6
6
|
|
7
7
|
from validmind.ai.utils import get_client_and_model
|
8
|
+
from validmind.client_config import client_config
|
8
9
|
|
9
10
|
missing_prompt_message = """
|
10
11
|
Cannot run prompt validation tests on a model with no prompt.
|
@@ -24,6 +25,11 @@ def call_model(
|
|
24
25
|
system_prompt: str, user_prompt: str, temperature: float = 0.0, seed: int = 42
|
25
26
|
):
|
26
27
|
"""Call LLM with the given prompts and return the response"""
|
28
|
+
if not client_config.can_generate_llm_test_descriptions():
|
29
|
+
raise ValueError(
|
30
|
+
"LLM based descriptions are not enabled for your organization."
|
31
|
+
)
|
32
|
+
|
27
33
|
client, model = get_client_and_model()
|
28
34
|
|
29
35
|
return (
|
validmind/tests/run.py
CHANGED
@@ -17,6 +17,7 @@ from validmind.vm_models import (
|
|
17
17
|
MetricResult,
|
18
18
|
ResultSummary,
|
19
19
|
ResultTable,
|
20
|
+
ResultTableMetadata,
|
20
21
|
TestContext,
|
21
22
|
TestInput,
|
22
23
|
ThresholdTestResults,
|
@@ -147,6 +148,26 @@ def _combine_figures(figure_lists: List[List[Any]], input_groups: List[Dict[str,
|
|
147
148
|
return [figure for figures in figure_lists for figure in figures]
|
148
149
|
|
149
150
|
|
151
|
+
def _combine_unit_metrics(results: List[MetricResultWrapper]):
|
152
|
+
if not results[0].scalar:
|
153
|
+
return
|
154
|
+
|
155
|
+
for result in results:
|
156
|
+
table = ResultTable(
|
157
|
+
data=[{"value": result.scalar}],
|
158
|
+
metadata=ResultTableMetadata(title="Unit Metrics"),
|
159
|
+
)
|
160
|
+
if not result.metric:
|
161
|
+
result.metric = MetricResult(
|
162
|
+
ref_id="will_be_overwritten",
|
163
|
+
key=result.result_id,
|
164
|
+
value=result.scalar,
|
165
|
+
summary=ResultSummary(results=[table]),
|
166
|
+
)
|
167
|
+
else:
|
168
|
+
result.metric.summary.results.append(table)
|
169
|
+
|
170
|
+
|
150
171
|
def metric_comparison(
|
151
172
|
results: List[MetricResultWrapper],
|
152
173
|
test_id: TestID,
|
@@ -172,22 +193,41 @@ def metric_comparison(
|
|
172
193
|
raise ValueError(f"Unsupported type for value: {v}")
|
173
194
|
input_group_strings.append(new_group)
|
174
195
|
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
196
|
+
# handle unit metrics (scalar values) by adding it to the summary
|
197
|
+
_combine_unit_metrics(results)
|
198
|
+
|
199
|
+
# Check if the results list contains a result object with a metric
|
200
|
+
if any(
|
201
|
+
hasattr(result, "metric")
|
202
|
+
and hasattr(result.metric, "summary")
|
203
|
+
and result.metric.summary
|
204
|
+
for result in results
|
205
|
+
):
|
206
|
+
# Compute merged summaries only if there is a result with a metric
|
207
|
+
merged_summary = _combine_summaries(
|
208
|
+
[
|
209
|
+
{"inputs": input_group_strings[i], "summary": result.metric.summary}
|
210
|
+
for i, result in enumerate(results)
|
211
|
+
]
|
212
|
+
)
|
213
|
+
else:
|
214
|
+
merged_summary = None
|
215
|
+
|
216
|
+
# Check if the results list contains a result object with figures
|
217
|
+
if any(hasattr(result, "figures") and result.figures for result in results):
|
218
|
+
# Compute merged figures only if there is at least one result with figures
|
219
|
+
merged_figures = _combine_figures(
|
220
|
+
[result.figures for result in results],
|
221
|
+
input_groups,
|
222
|
+
)
|
223
|
+
# Patch figure metadata so they are connected to the comparison result
|
224
|
+
if merged_figures and len(merged_figures):
|
225
|
+
for i, figure in enumerate(merged_figures):
|
226
|
+
figure.key = f"{figure.key}-{i}"
|
227
|
+
figure.metadata["_name"] = test_id
|
228
|
+
figure.metadata["_ref_id"] = ref_id
|
229
|
+
else:
|
230
|
+
merged_figures = None
|
191
231
|
|
192
232
|
return MetricResultWrapper(
|
193
233
|
result_id=test_id,
|
@@ -196,7 +236,7 @@ def metric_comparison(
|
|
196
236
|
test_id=test_id,
|
197
237
|
default_description=f"Comparison test result for {test_id}",
|
198
238
|
summary=merged_summary.serialize() if merged_summary else None,
|
199
|
-
figures=merged_figures,
|
239
|
+
figures=merged_figures if merged_figures else None,
|
200
240
|
should_generate=generate_description,
|
201
241
|
),
|
202
242
|
],
|
@@ -294,6 +334,8 @@ def threshold_test_comparison(
|
|
294
334
|
def run_comparison_test(
|
295
335
|
test_id: TestID,
|
296
336
|
input_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]]],
|
337
|
+
name: str = None,
|
338
|
+
unit_metrics: List[TestID] = None,
|
297
339
|
params: Dict[str, Any] = None,
|
298
340
|
show: bool = True,
|
299
341
|
output_template: str = None,
|
@@ -308,6 +350,8 @@ def run_comparison_test(
|
|
308
350
|
results = [
|
309
351
|
run_test(
|
310
352
|
test_id,
|
353
|
+
name=name,
|
354
|
+
unit_metrics=unit_metrics,
|
311
355
|
inputs=inputs,
|
312
356
|
show=False,
|
313
357
|
params=params,
|
@@ -387,33 +431,34 @@ def run_test(
|
|
387
431
|
"When providing an `input_grid`, you cannot also provide `inputs` or `kwargs`"
|
388
432
|
)
|
389
433
|
|
434
|
+
if unit_metrics:
|
435
|
+
metric_id_name = "".join(word[0].upper() + word[1:] for word in name.split())
|
436
|
+
test_id = f"validmind.composite_metric.{metric_id_name}" or test_id
|
437
|
+
|
390
438
|
if input_grid:
|
391
439
|
return run_comparison_test(
|
392
440
|
test_id,
|
393
441
|
input_grid,
|
442
|
+
name=name,
|
443
|
+
unit_metrics=unit_metrics,
|
394
444
|
params=params,
|
395
445
|
output_template=output_template,
|
396
446
|
show=show,
|
397
447
|
generate_description=__generate_description,
|
398
448
|
)
|
399
449
|
|
400
|
-
if test_id
|
450
|
+
if test_id.startswith("validmind.unit_metrics"):
|
401
451
|
# TODO: as we move towards a more unified approach to metrics
|
402
452
|
# we will want to make everything functional and remove the
|
403
453
|
# separation between unit metrics and "normal" metrics
|
404
454
|
return run_metric(test_id, inputs=inputs, params=params, show=show)
|
405
455
|
|
406
456
|
if unit_metrics:
|
407
|
-
metric_id_name = "".join(word[0].upper() + word[1:] for word in name.split())
|
408
|
-
test_id = f"validmind.composite_metric.{metric_id_name}"
|
409
|
-
|
410
457
|
error, TestClass = load_composite_metric(
|
411
458
|
unit_metrics=unit_metrics, metric_name=metric_id_name
|
412
459
|
)
|
413
|
-
|
414
460
|
if error:
|
415
461
|
raise LoadTestError(error)
|
416
|
-
|
417
462
|
else:
|
418
463
|
TestClass = load_test(test_id, reload=True)
|
419
464
|
|