validmind 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +80 -119
- validmind/ai/test_result_description/config.yaml +29 -0
- validmind/ai/test_result_description/context.py +73 -0
- validmind/ai/test_result_description/image_processing.py +124 -0
- validmind/ai/test_result_description/system.jinja +39 -0
- validmind/ai/test_result_description/user.jinja +25 -0
- validmind/api_client.py +89 -43
- validmind/client.py +2 -2
- validmind/client_config.py +11 -14
- validmind/datasets/credit_risk/__init__.py +1 -0
- validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
- validmind/datasets/credit_risk/lending_club_bias.py +142 -0
- validmind/datasets/regression/fred_timeseries.py +67 -138
- validmind/template.py +1 -0
- validmind/test_suites/__init__.py +0 -2
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/test_suites/summarization.py +0 -1
- validmind/test_suites/time_series.py +0 -43
- validmind/tests/__types__.py +14 -15
- validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
- validmind/tests/data_validation/ADF.py +31 -24
- validmind/tests/data_validation/AutoAR.py +9 -9
- validmind/tests/data_validation/AutoMA.py +23 -16
- validmind/tests/data_validation/AutoSeasonality.py +18 -16
- validmind/tests/data_validation/AutoStationarity.py +21 -16
- validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
- validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +34 -34
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +85 -124
- validmind/tests/data_validation/ClassImbalance.py +15 -12
- validmind/tests/data_validation/DFGLSArch.py +19 -13
- validmind/tests/data_validation/DatasetDescription.py +17 -11
- validmind/tests/data_validation/DatasetSplit.py +7 -5
- validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
- validmind/tests/data_validation/Duplicates.py +33 -25
- validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
- validmind/tests/data_validation/HighCardinality.py +19 -12
- validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
- validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
- validmind/tests/data_validation/IQROutliersTable.py +40 -36
- validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
- validmind/tests/data_validation/JarqueBera.py +70 -0
- validmind/tests/data_validation/KPSS.py +34 -29
- validmind/tests/data_validation/LJungBox.py +66 -0
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
- validmind/tests/data_validation/MissingValues.py +32 -27
- validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
- validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
- validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
- validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
- validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
- validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
- validmind/tests/data_validation/RollingStatsPlot.py +31 -23
- validmind/tests/data_validation/RunsTest.py +72 -0
- validmind/tests/data_validation/ScatterPlot.py +63 -78
- validmind/tests/data_validation/SeasonalDecompose.py +38 -34
- validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +35 -30
- validmind/tests/data_validation/Skewness.py +35 -37
- validmind/tests/data_validation/SpreadPlot.py +35 -35
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
- validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
- validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
- validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
- validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
- validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
- validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
- validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
- validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
- validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
- validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
- validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
- validmind/tests/data_validation/TooManyZeroValues.py +16 -11
- validmind/tests/data_validation/UniqueRows.py +11 -6
- validmind/tests/data_validation/WOEBinPlots.py +23 -16
- validmind/tests/data_validation/WOEBinTable.py +35 -30
- validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
- validmind/tests/data_validation/nlp/CommonWords.py +21 -14
- validmind/tests/data_validation/nlp/Hashtags.py +42 -40
- validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
- validmind/tests/data_validation/nlp/Mentions.py +21 -15
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
- validmind/tests/data_validation/nlp/Punctuations.py +24 -20
- validmind/tests/data_validation/nlp/Sentiment.py +27 -8
- validmind/tests/data_validation/nlp/StopWords.py +26 -19
- validmind/tests/data_validation/nlp/TextDescription.py +39 -36
- validmind/tests/data_validation/nlp/Toxicity.py +32 -9
- validmind/tests/decorator.py +81 -42
- validmind/tests/model_validation/BertScore.py +36 -27
- validmind/tests/model_validation/BleuScore.py +25 -19
- validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
- validmind/tests/model_validation/ContextualRecall.py +38 -13
- validmind/tests/model_validation/FeaturesAUC.py +32 -13
- validmind/tests/model_validation/MeteorScore.py +46 -33
- validmind/tests/model_validation/ModelMetadata.py +32 -64
- validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
- validmind/tests/model_validation/RegardScore.py +30 -14
- validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
- validmind/tests/model_validation/RougeScore.py +36 -30
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
- validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
- validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
- validmind/tests/model_validation/TokenDisparity.py +31 -23
- validmind/tests/model_validation/ToxicityScore.py +26 -17
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +12 -6
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
- validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
- validmind/tests/model_validation/ragas/utils.py +6 -0
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
- validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
- validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +32 -26
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
- validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
- validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
- validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
- validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -94
- validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +66 -5
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
- validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +59 -32
- validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
- validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +86 -119
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
- validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +16 -14
- validmind/tests/prompt_validation/Conciseness.py +7 -5
- validmind/tests/prompt_validation/Delimitation.py +23 -22
- validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
- validmind/tests/prompt_validation/Robustness.py +12 -10
- validmind/tests/prompt_validation/Specificity.py +13 -11
- validmind/tests/prompt_validation/ai_powered_test.py +6 -0
- validmind/tests/run.py +68 -23
- validmind/unit_metrics/__init__.py +81 -144
- validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
- validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
- validmind/unit_metrics/regression/HuberLoss.py +1 -1
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
- validmind/unit_metrics/regression/QuantileLoss.py +1 -1
- validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
- validmind/utils.py +4 -0
- validmind/vm_models/dataset/dataset.py +2 -0
- validmind/vm_models/figure.py +5 -0
- validmind/vm_models/test/metric.py +1 -0
- validmind/vm_models/test/result_wrapper.py +143 -158
- validmind/vm_models/test/threshold_test.py +1 -0
- {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
- validmind-2.5.18.dist-info/RECORD +324 -0
- validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
- validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
- validmind/tests/data_validation/BivariateHistograms.py +0 -117
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
- validmind/tests/data_validation/MissingValuesRisk.py +0 -88
- validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
- validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
- validmind/tests/model_validation/statsmodels/JarqueBera.py +0 -73
- validmind/tests/model_validation/statsmodels/LJungBox.py +0 -66
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
- validmind/tests/model_validation/statsmodels/RunsTest.py +0 -71
- validmind-2.5.8.dist-info/RECORD +0 -318
- {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/RegardScore.py

```diff
@@ -13,26 +13,42 @@ from validmind import tags, tasks
 @tasks("text_classification", "text_summarization")
 def RegardScore(dataset, model):
     """
-
+    Assesses the sentiment and potential biases in text generated by NLP models by computing and visualizing regard
+    scores.
 
-
-    The `RegardScore` metric is designed to evaluate the regard levels (positive, negative, neutral, or other) of texts generated by models. This helps in understanding the sentiment and biases in the generated content.
+    ### Purpose
 
-
-
+    The `RegardScore` test aims to evaluate the levels of regard (positive, negative, neutral, or other) in texts
+    generated by NLP models. It helps in understanding the sentiment and bias present in the generated content.
 
-
-    - Noticeable skewness in the histogram, especially when comparing the predicted regard scores with the target regard scores, could indicate biases or inconsistencies in the model.
-    - Lack of neutral scores in the model's predictions, despite a balanced distribution in the target data, might signal an issue.
+    ### Test Mechanism
 
-
-
-
-
+    This test extracts the true and predicted values from the provided dataset and model. It then computes the regard
+    scores for each text instance using a preloaded `regard` evaluation tool. The scores are compiled into dataframes,
+    and visualizations such as histograms and bar charts are generated to display the distribution of regard scores.
+    Additionally, descriptive statistics (mean, median, standard deviation, minimum, and maximum) are calculated for
+    the regard scores, providing a comprehensive overview of the model's performance.
+
+    ### Signs of High Risk
+
+    - Noticeable skewness in the histogram, especially when comparing the predicted regard scores with the target
+    regard scores, can indicate biases or inconsistencies in the model.
+    - Lack of neutral scores in the model's predictions, despite a balanced distribution in the target data, might
+    signal an issue.
+
+    ### Strengths
+
+    - Provides a clear evaluation of regard levels in generated texts, aiding in ensuring content appropriateness.
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of
+    regard scores.
+    - Descriptive statistics offer a concise summary of the model's performance in generating texts with balanced
+    sentiments.
+
+    ### Limitations
 
-    **Limitations:**
     - The accuracy of the regard scores is contingent upon the underlying `regard` tool.
-    - The scores provide a broad overview but do not specify which portions or tokens of the text are responsible for
+    - The scores provide a broad overview but do not specify which portions or tokens of the text are responsible for
+    high regard.
     - Supplementary, in-depth analysis might be needed for granular insights.
     """
 
```
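The "preloaded `regard` evaluation tool" in the new docstring matches the interface of Hugging Face's `evaluate` library. A minimal sketch of the described scoring step under that assumption; the texts and the aggregation are illustrative, not the test's exact implementation:

```python
# Sketch of the regard computation described above, assuming the Hugging Face
# `evaluate` library provides the "regard" tool; texts are illustrative.
import evaluate
import pandas as pd

regard_tool = evaluate.load("regard")

generated = ["This product works wonderfully.", "The service was terrible."]

# `compute` returns one list of {label, score} dicts per input text
results = regard_tool.compute(data=generated)["regard"]

# Pivot each text's scores into one row per text, one column per regard label
rows = [{item["label"]: item["score"] for item in scores} for scores in results]
df = pd.DataFrame(rows)
print(df.agg(["mean", "median", "std", "min", "max"]))
```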
validmind/tests/model_validation/RegressionResidualsPlot.py

```diff
@@ -16,19 +16,22 @@ class RegressionResidualsPlot(Metric):
     """
     Evaluates regression model performance using residual distribution and actual vs. predicted plots.
 
-
+    ### Purpose
+
     The `RegressionResidualsPlot` metric aims to evaluate the performance of regression models. By generating and
     analyzing two plots – a distribution of residuals and a scatter plot of actual versus predicted values – this tool
     helps to visually appraise how well the model predicts and the nature of errors it makes.
 
-
+    ### Test Mechanism
+
     The process begins by extracting the true output values (`y_true`) and the model's predicted values (`y_pred`).
     Residuals are computed by subtracting predicted from true values. These residuals are then visualized using a
     histogram to display their distribution. Additionally, a scatter plot is derived to compare true values against
     predicted values, together with a "Perfect Fit" line, which represents an ideal match (predicted values equal
     actual values), facilitating the assessment of the model's predictive accuracy.
 
-
+    ### Signs of High Risk
+
     - Residuals showing a non-normal distribution, especially those with frequent extreme values.
     - Significant deviations of predicted values from actual values in the scatter plot.
     - Sparse density of data points near the "Perfect Fit" line in the scatter plot, indicating poor prediction
@@ -36,13 +39,15 @@ class RegressionResidualsPlot(Metric):
     - Visible patterns or trends in the residuals plot, suggesting the model's failure to capture the underlying data
     structure adequately.
 
-
+    ### Strengths
+
     - Provides a direct, visually intuitive assessment of a regression model’s accuracy and handling of data.
     - Visual plots can highlight issues of underfitting or overfitting.
     - Can reveal systematic deviations or trends that purely numerical metrics might miss.
     - Applicable across various regression model types.
 
-
+    ### Limitations
+
     - Relies on visual interpretation, which can be subjective and less precise than numerical evaluations.
     - May be difficult to interpret in cases with multi-dimensional outputs due to the plots’ two-dimensional nature.
     - Overlapping data points in the residuals plot can complicate interpretation efforts.
```
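A minimal sketch of the two plots this docstring describes, using Plotly as the package does elsewhere; the data is synthetic, and the metric class itself handles ValidMind figure wrapping:

```python
# Sketch of the residual-distribution and actual-vs-predicted plots
# described above; the data here is synthetic and for illustration only.
import numpy as np
import plotly.graph_objects as go

y_true = np.array([3.0, 2.5, 4.1, 5.0, 3.3])
y_pred = np.array([2.8, 2.7, 3.9, 5.2, 3.1])
residuals = y_true - y_pred

# Histogram showing the distribution of residuals
hist = go.Figure(go.Histogram(x=residuals, nbinsx=20))
hist.update_layout(title="Residuals Distribution", template="plotly_white")

# Scatter of actual vs. predicted with a "Perfect Fit" reference line
extent = [y_true.min(), y_true.max()]
scatter = go.Figure()
scatter.add_trace(go.Scatter(x=y_true, y=y_pred, mode="markers", name="Predictions"))
scatter.add_trace(go.Scatter(x=extent, y=extent, mode="lines", name="Perfect Fit"))
scatter.update_layout(title="Actual vs Predicted", template="plotly_white")
```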
validmind/tests/model_validation/RougeScore.py

```diff
@@ -13,44 +13,50 @@ from validmind import tags, tasks
 @tasks("text_classification", "text_summarization")
 def RougeScore(dataset, model, metric="rouge-1"):
     """
-
-
-
-
-
-    ROUGE
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    Assesses the quality of machine-generated text using ROUGE metrics and visualizes the results to provide
+    comprehensive performance insights.
+
+    ### Purpose
+
+    The ROUGE Score test is designed to evaluate the quality of text generated by machine learning models using various
+    ROUGE metrics. ROUGE, which stands for Recall-Oriented Understudy for Gisting Evaluation, measures the overlap of
+    n-grams, word sequences, and word pairs between machine-generated text and reference texts. This evaluation is
+    crucial for tasks like text summarization, machine translation, and text generation, where the goal is to produce
+    text that accurately reflects the content and meaning of human-crafted references.
+
+    ### Test Mechanism
+
+    The test extracts the true and predicted values from the provided dataset and model. It initializes the ROUGE
+    evaluator with the specified metric (e.g., ROUGE-1). For each pair of true and predicted texts, it calculates the
+    ROUGE scores and compiles them into a dataframe. Histograms and bar charts are generated for each ROUGE metric
+    (Precision, Recall, and F1 Score) to visualize their distribution. Additionally, a table of descriptive statistics
+    (mean, median, standard deviation, minimum, and maximum) is compiled for each metric, providing a comprehensive
+    summary of the model's performance.
+
+    ### Signs of High Risk
+
+    - Consistently low scores across ROUGE metrics could indicate poor quality in the generated text, suggesting that
+    the model fails to capture the essential content of the reference texts.
     - Low precision scores might suggest that the generated text contains a lot of redundant or irrelevant information.
     - Low recall scores may indicate that important information from the reference text is being omitted.
-    - An imbalanced performance between precision and recall, reflected by a low F1 Score, could signal issues in the
-
+    - An imbalanced performance between precision and recall, reflected by a low F1 Score, could signal issues in the
+    model's ability to balance informativeness and conciseness.
 
-
+    ### Strengths
 
-    - Provides a multifaceted evaluation of text quality through different ROUGE metrics, offering a detailed view of
-
+    - Provides a multifaceted evaluation of text quality through different ROUGE metrics, offering a detailed view of
+    model performance.
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the
+    scores.
     - Descriptive statistics offer a concise summary of the model's strengths and weaknesses in generating text.
 
-
+    ### Limitations
 
-    - ROUGE metrics primarily focus on n-gram overlap and may not fully capture semantic coherence, fluency, or
+    - ROUGE metrics primarily focus on n-gram overlap and may not fully capture semantic coherence, fluency, or
+    grammatical quality of the text.
     - The evaluation relies on the availability of high-quality reference texts, which may not always be obtainable.
-    - While useful for comparison, ROUGE scores alone do not provide a complete assessment of a model's performance and
-
+    - While useful for comparison, ROUGE scores alone do not provide a complete assessment of a model's performance and
+    should be supplemented with other metrics and qualitative analysis.
     """
 
     # Extract true and predicted values
```
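The "ROUGE evaluator" in the mechanism above is consistent with the `rouge` PyPI package's interface. A sketch of the per-pair scoring step under that assumption, with illustrative texts:

```python
# Sketch of the per-pair ROUGE scoring described above, assuming the
# `rouge` PyPI package; the example texts are illustrative.
import pandas as pd
from rouge import Rouge

metric = "rouge-1"
evaluator = Rouge(metrics=[metric])

references = ["the cat sat on the mat", "a quick brown fox"]
predictions = ["the cat is on the mat", "a fast brown fox"]

# One {r, p, f} dict per reference/prediction pair
scores = [evaluator.get_scores(p, r)[0][metric] for p, r in zip(predictions, references)]
df = pd.DataFrame(scores).rename(columns={"p": "Precision", "r": "Recall", "f": "F1 Score"})
print(df.agg(["mean", "median", "std", "min", "max"]))
```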
validmind/tests/model_validation/TimeSeriesPredictionWithCI.py

```diff
@@ -14,28 +14,45 @@ from validmind import tags, tasks
 @tasks("regression", "time_series_forecasting")
 def TimeSeriesPredictionWithCI(dataset, model, confidence=0.95):
     """
-
+    Assesses predictive accuracy and uncertainty in time series models, highlighting breaches beyond confidence
+    intervals.
 
-
+    ### Purpose
 
-
+    The purpose of the Time Series Prediction with Confidence Intervals (CI) test is to visualize the actual versus
+    predicted values for time series data, including confidence intervals, and to compute and report the number of
+    breaches beyond these intervals. This helps in evaluating the reliability and accuracy of the model's predictions.
 
-
-
-
+    ### Test Mechanism
+
+    The function performs the following steps:
+
+    - Calculates the standard deviation of prediction errors.
+    - Determines the confidence intervals using a specified confidence level, typically 95%.
+    - Counts the number of actual values that fall outside the confidence intervals, referred to as breaches.
+    - Generates a plot visualizing the actual values, predicted values, and confidence intervals.
+    - Returns a DataFrame summarizing the breach information, including the total breaches, upper breaches, and lower
+    breaches.
+
+    ### Signs of High Risk
+
+    - A high number of breaches indicates that the model's predictions are not reliable within the specified confidence
+    level.
+    - Significant deviations between actual and predicted values may highlight model inadequacies or issues with data
+    quality.
+
+    ### Strengths
 
-    **Strengths**:
     - Provides a visual representation of prediction accuracy and the uncertainty around predictions.
     - Includes a statistical measure of prediction reliability through confidence intervals.
     - Computes and reports breaches, offering a quantitative assessment of prediction performance.
 
-
+    ### Limitations
+
     - Assumes that the dataset is provided as a DataFrameDataset object with a datetime index.
     - Requires that `dataset.y_pred(model)` returns the predicted values for the model.
     - The calculation of confidence intervals assumes normally distributed errors, which may not hold for all datasets.
     """
-    dataset_name = dataset.input_id
-    model_name = model.input_id
     time_index = dataset.df.index  # Assuming the index of the dataset is datetime
 
     # Get actual and predicted values
@@ -77,7 +94,7 @@ def TimeSeriesPredictionWithCI(dataset, model, confidence=0.95):
             x=time_index,
             y=y_true,
             mode="lines",
-            name="Actual
+            name="Actual",
             line=dict(color="blue"),
         )
     )
@@ -88,7 +105,7 @@ def TimeSeriesPredictionWithCI(dataset, model, confidence=0.95):
             x=time_index,
             y=y_pred,
             mode="lines",
-            name=
+            name="Predicted",
             line=dict(color="red"),
         )
     )
@@ -121,10 +138,9 @@ def TimeSeriesPredictionWithCI(dataset, model, confidence=0.95):
 
     # Update layout
     fig.update_layout(
-        title=
+        title="Actual vs Predicted",
        xaxis_title="Time",
         yaxis_title="Values",
-        legend_title="Legend",
         template="plotly_white",
     )
 
```
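The breach logic the docstring enumerates reduces to a few lines. A sketch assuming normally distributed prediction errors, as the Limitations section itself notes; the column names follow the docstring's wording but are illustrative:

```python
# Sketch of the confidence-interval and breach counting described above;
# assumes normally distributed errors, and the column names are illustrative.
import pandas as pd
from scipy import stats


def ci_breaches(y_true: pd.Series, y_pred: pd.Series, confidence: float = 0.95) -> pd.DataFrame:
    errors = y_true - y_pred
    z = stats.norm.ppf(1 - (1 - confidence) / 2)  # ~1.96 for 95% confidence
    band = z * errors.std()
    upper, lower = y_pred + band, y_pred - band
    upper_breaches = int((y_true > upper).sum())
    lower_breaches = int((y_true < lower).sum())
    return pd.DataFrame(
        {
            "Confidence Level": [confidence],
            "Total Breaches": [upper_breaches + lower_breaches],
            "Upper Breaches": [upper_breaches],
            "Lower Breaches": [lower_breaches],
        }
    )
```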
validmind/tests/model_validation/TimeSeriesPredictionsPlot.py

```diff
@@ -2,7 +2,6 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-import plotly.express as px
 import plotly.graph_objects as go
 
 from validmind import tags, tasks
@@ -10,66 +10,64 @@ from validmind import tags, tasks
 
 @tags("model_predictions", "visualization")
 @tasks("regression", "time_series_forecasting")
-def TimeSeriesPredictionsPlot(
+def TimeSeriesPredictionsPlot(dataset, model):
     """
-    Plot actual vs predicted values for time series data and generate a visual comparison for
+    Plot actual vs predicted values for time series data and generate a visual comparison for the model.
 
-
+    ### Purpose
 
-
+    The purpose of this function is to visualize the actual versus predicted values for time
+    series data for a single model.
+
+    ### Test Mechanism
+
+    The function plots the actual values from the dataset and overlays the predicted
+    values from the model using Plotly for interactive visualization.
 
-    **Signs of High Risk**:
     - Large discrepancies between actual and predicted values indicate poor model performance.
     - Systematic deviations in predicted values can highlight model bias or issues with data patterns.
 
-
+    ### Strengths
+
     - Provides a clear visual comparison of model predictions against actual values.
     - Uses Plotly for interactive and visually appealing plots.
-    - Can handle multiple models and datasets, displaying them with distinct colors.
 
-
+    ### Limitations
+
     - Assumes that the dataset is provided as a DataFrameDataset object with a datetime index.
     - Requires that `dataset.y_pred(model)` returns the predicted values for the model.
-    - Visualization might become cluttered with a large number of models or datasets.
     """
     fig = go.Figure()
 
-    #
-    colors = px.colors.qualitative.Plotly
-
-    # Plot actual values from the first dataset
-    dataset = datasets[0]
+    # Plot actual values from the dataset
     time_index = dataset.df.index  # Assuming the index of the dataset is datetime
     fig.add_trace(
         go.Scatter(
             x=time_index,
             y=dataset.y,
             mode="lines",
-            name="Actual
+            name="Actual",
             line=dict(color="blue"),
         )
     )
 
-    # Plot predicted values for
-
-
-
-
-
-
-
-
-                name=f"Predicted by {model_name}",
-                line=dict(color=colors[idx % len(colors)]),
-            )
+    # Plot predicted values for the model
+    y_pred = dataset.y_pred(model)
+    fig.add_trace(
+        go.Scatter(
+            x=time_index,
+            y=y_pred,
+            mode="lines",
+            name="Predicted",
+            line=dict(color="orange"),  # Using a distinct color for the prediction
         )
+    )
 
     # Update layout
     fig.update_layout(
-        title="
+        title="Actual vs Predicted",
         xaxis_title="Time",
         yaxis_title="Values",
-        legend_title="Legend",
         template="plotly_white",
     )
 
```
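After this refactor the test takes a single `dataset`/`model` pair instead of lists. A hypothetical invocation through ValidMind's standard test runner, where `vm_dataset` and `vm_model` stand for objects previously created with `vm.init_dataset` and `vm.init_model`:

```python
# Hypothetical invocation after the signature change; `vm_dataset` and
# `vm_model` are assumed to be initialized ValidMind dataset/model objects.
import validmind as vm

result = vm.tests.run_test(
    "validmind.model_validation.TimeSeriesPredictionsPlot",
    inputs={"dataset": vm_dataset, "model": vm_model},
)
result.log()  # optionally log the figure to the ValidMind platform
```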
validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py

```diff
@@ -12,75 +12,80 @@ from validmind import tags, tasks
 
 @tags("model_performance", "sklearn")
 @tasks("regression", "time_series_forecasting")
-def TimeSeriesR2SquareBySegments(
+def TimeSeriesR2SquareBySegments(dataset, model, segments=None):
     """
-
-
+    Evaluates the R-Squared values of regression models over specified time segments in time series data to assess
+    segment-wise model performance.
 
-
+    ### Purpose
 
-
-
-
-    - segments: Dictionary with 'start_date' and 'end_date' keys containing lists of start and end dates for each segments. If None, the time series will be segmented into two halves.
+    The TimeSeriesR2SquareBySegments test aims to evaluate the R-Squared values for several regression models across
+    different segments of time series data. This helps in determining how well the models explain the variability in
+    the data within each specific time segment.
 
-
+    ### Test Mechanism
+    - Provides a visual representation of model performance across different time segments.
+    - Allows for identification of segments where the model performs poorly.
+    - Calculating the R-Squared values for each segment.
+    - Generating a bar chart to visually represent the R-Squared values across different models and segments.
 
-
-    - If the R-Squared values are significantly low for certain segments, it could indicate that the model is not explaining much of the variability in the dataset for those segments.
+    ### Signs of High Risk
 
-
-    -
-
+    - Significantly low R-Squared values for certain time segments, indicating poor model performance in those periods.
+    - Large variability in R-Squared values across different segments for the same model, suggesting inconsistent
+    performance.
+
+    ### Strengths
+
+    - Provides a visual representation of how well models perform over different time periods.
+    - Helps identify time segments where models may need improvement or retraining.
+    - Facilitates comparison between multiple models in a straightforward manner.
+
+    ### Limitations
 
-
-
-    - Requires that `dataset.y_pred(model)` returns
-    - Assumes that `y_true` and `y_pred` are pandas Series with datetime indices.
+    - Assumes datasets are provided as DataFrameDataset objects with the attributes `y`, `y_pred`, and
+    `feature_columns`.
+    - Requires that `dataset.y_pred(model)` returns predicted values for the model.
+    - Assumes that both `y_true` and `y_pred` are pandas Series with datetime indices, which may not always be the case.
+    - May not account for more nuanced temporal dependencies within the segments.
     """
     results_list = []
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    "Segments": f"Segment {segment_index + 1}",
-                    "Start Date": start_date,
-                    "End Date": end_date,
-                    "R-Squared": r2s,
-                }
-            )
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    # Ensure y_true and y_pred are pandas Series with the same index
+    if not isinstance(y_true, pd.Series):
+        y_true = pd.Series(y_true, index=dataset.df.index)
+    if not isinstance(y_pred, pd.Series):
+        y_pred = pd.Series(y_pred, index=dataset.df.index)
+
+    index = dataset.df.index
+
+    if segments is None:
+        mid_point = len(index) // 2
+        segments = {
+            "start_date": [index.min(), index[mid_point]],
+            "end_date": [index[mid_point - 1], index.max()],
+        }
+
+    for segment_index, (start_date, end_date) in enumerate(
+        zip(segments["start_date"], segments["end_date"])
+    ):
+        mask = (index >= start_date) & (index <= end_date)
+        y_true_segment = y_true.loc[mask]
+        y_pred_segment = y_pred.loc[mask]
+
+        if len(y_true_segment) > 0 and len(y_pred_segment) > 0:
+            r2s = metrics.r2_score(y_true_segment, y_pred_segment)
+            results_list.append(
+                {
+                    "Segments": f"Segment {segment_index + 1}",
+                    "Start Date": start_date,
+                    "End Date": end_date,
+                    "R-Squared": r2s,
+                }
+            )
 
     # Convert results list to a DataFrame
     results_df = pd.DataFrame(results_list)
@@ -90,13 +95,13 @@ def TimeSeriesR2SquareBySegments(datasets, models, segments=None):
         results_df,
         x="Segments",
         y="R-Squared",
-        color="Model",
+        # color="Model",
         barmode="group",
-        title="R-Squared
+        title="R-Squared by Segment",
         labels={
             "R-Squared": "R-Squared Value",
-            "
-            "Model": "Model",
+            "Segments": "Time Segment",
+            # "Model": "Model",
         },
     )
 
```
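As the new default-construction code above shows, the refactored `segments` argument is a pair of parallel lists of start and end dates. A hypothetical custom value (dates are illustrative):

```python
# Hypothetical `segments` value; pandas compares these date strings against
# the dataset's DatetimeIndex when building each segment mask.
segments = {
    "start_date": ["2020-01-01", "2021-01-01"],
    "end_date": ["2020-12-31", "2021-12-31"],
}
```

Omitting the argument falls back to the two-halves split implemented via `mid_point` in the hunk above.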
validmind/tests/model_validation/TokenDisparity.py

```diff
@@ -12,33 +12,41 @@ from validmind import tags, tasks
 @tasks("text_classification", "text_summarization")
 def TokenDisparity(dataset, model):
     """
-    Evaluates the token disparity between reference and generated texts, visualizing the results through histograms
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    Evaluates the token disparity between reference and generated texts, visualizing the results through histograms and
+    bar charts, alongside compiling a comprehensive table of descriptive statistics for token counts.
+
+    ### Purpose
+
+    The Token Disparity test aims to assess the difference in the number of tokens between reference texts and texts
+    generated by the model. Understanding token disparity is essential for evaluating how well the generated content
+    matches the expected length and richness of the reference texts.
+
+    ### Test Mechanism
+
+    The test extracts true and predicted values from the dataset and model. It computes the number of tokens in each
+    reference and generated text. The results are visualized using histograms and bar charts to display the
+    distribution of token counts. Additionally, a table of descriptive statistics, including the mean, median, standard
+    deviation, minimum, and maximum token counts, is compiled to provide a detailed summary of token usage.
+
+    ### Signs of High Risk
+
+    - Significant disparity in token counts between reference and generated texts could indicate issues with text
+    generation quality, such as verbosity or lack of detail.
     - Consistently low token counts in generated texts compared to references might suggest that the model is producing
-
+    incomplete or overly concise outputs.
+
+    ### Strengths
 
-    **Strengths:**
     - Provides a simple yet effective evaluation of text length and token usage.
-    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of
-
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of
+    token counts.
+    - Descriptive statistics offer a concise summary of the model's performance in generating texts of appropriate
+    length.
+
+    ### Limitations
 
-
-
+    - Token counts alone do not provide a complete assessment of text quality and should be supplemented with other
+    metrics and qualitative analysis.
     """
 
     # Extract true and predicted values
```
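A minimal sketch of the token-count comparison this docstring describes; whitespace tokenization is an assumption here, since the hunk does not show which tokenizer the test uses:

```python
# Sketch of the token-count comparison described above; whitespace splitting
# stands in for whatever tokenizer the test actually uses.
import pandas as pd

references = ["the quick brown fox jumps over the lazy dog", "hello world"]
predictions = ["the fast fox jumps over the dog", "hello there world"]

counts = pd.DataFrame(
    {
        "Reference Tokens": [len(text.split()) for text in references],
        "Generated Tokens": [len(text.split()) for text in predictions],
    }
)
print(counts.agg(["mean", "median", "std", "min", "max"]))
```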