validmind 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +80 -119
- validmind/ai/test_result_description/config.yaml +29 -0
- validmind/ai/test_result_description/context.py +73 -0
- validmind/ai/test_result_description/image_processing.py +124 -0
- validmind/ai/test_result_description/system.jinja +39 -0
- validmind/ai/test_result_description/user.jinja +25 -0
- validmind/api_client.py +89 -43
- validmind/client.py +2 -2
- validmind/client_config.py +11 -14
- validmind/datasets/credit_risk/__init__.py +1 -0
- validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
- validmind/datasets/credit_risk/lending_club_bias.py +142 -0
- validmind/datasets/regression/fred_timeseries.py +67 -138
- validmind/template.py +1 -0
- validmind/test_suites/__init__.py +0 -2
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/test_suites/summarization.py +0 -1
- validmind/test_suites/time_series.py +0 -43
- validmind/tests/__types__.py +14 -15
- validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
- validmind/tests/data_validation/ADF.py +31 -24
- validmind/tests/data_validation/AutoAR.py +9 -9
- validmind/tests/data_validation/AutoMA.py +23 -16
- validmind/tests/data_validation/AutoSeasonality.py +18 -16
- validmind/tests/data_validation/AutoStationarity.py +21 -16
- validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
- validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +34 -34
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +85 -124
- validmind/tests/data_validation/ClassImbalance.py +15 -12
- validmind/tests/data_validation/DFGLSArch.py +19 -13
- validmind/tests/data_validation/DatasetDescription.py +17 -11
- validmind/tests/data_validation/DatasetSplit.py +7 -5
- validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
- validmind/tests/data_validation/Duplicates.py +33 -25
- validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
- validmind/tests/data_validation/HighCardinality.py +19 -12
- validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
- validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
- validmind/tests/data_validation/IQROutliersTable.py +40 -36
- validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
- validmind/tests/data_validation/JarqueBera.py +70 -0
- validmind/tests/data_validation/KPSS.py +34 -29
- validmind/tests/data_validation/LJungBox.py +66 -0
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
- validmind/tests/data_validation/MissingValues.py +32 -27
- validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
- validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
- validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
- validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
- validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
- validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
- validmind/tests/data_validation/RollingStatsPlot.py +31 -23
- validmind/tests/data_validation/RunsTest.py +72 -0
- validmind/tests/data_validation/ScatterPlot.py +63 -78
- validmind/tests/data_validation/SeasonalDecompose.py +38 -34
- validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +35 -30
- validmind/tests/data_validation/Skewness.py +35 -37
- validmind/tests/data_validation/SpreadPlot.py +35 -35
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
- validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
- validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
- validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
- validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
- validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
- validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
- validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
- validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
- validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
- validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
- validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
- validmind/tests/data_validation/TooManyZeroValues.py +16 -11
- validmind/tests/data_validation/UniqueRows.py +11 -6
- validmind/tests/data_validation/WOEBinPlots.py +23 -16
- validmind/tests/data_validation/WOEBinTable.py +35 -30
- validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
- validmind/tests/data_validation/nlp/CommonWords.py +21 -14
- validmind/tests/data_validation/nlp/Hashtags.py +42 -40
- validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
- validmind/tests/data_validation/nlp/Mentions.py +21 -15
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
- validmind/tests/data_validation/nlp/Punctuations.py +24 -20
- validmind/tests/data_validation/nlp/Sentiment.py +27 -8
- validmind/tests/data_validation/nlp/StopWords.py +26 -19
- validmind/tests/data_validation/nlp/TextDescription.py +39 -36
- validmind/tests/data_validation/nlp/Toxicity.py +32 -9
- validmind/tests/decorator.py +81 -42
- validmind/tests/model_validation/BertScore.py +36 -27
- validmind/tests/model_validation/BleuScore.py +25 -19
- validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
- validmind/tests/model_validation/ContextualRecall.py +38 -13
- validmind/tests/model_validation/FeaturesAUC.py +32 -13
- validmind/tests/model_validation/MeteorScore.py +46 -33
- validmind/tests/model_validation/ModelMetadata.py +32 -64
- validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
- validmind/tests/model_validation/RegardScore.py +30 -14
- validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
- validmind/tests/model_validation/RougeScore.py +36 -30
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
- validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
- validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
- validmind/tests/model_validation/TokenDisparity.py +31 -23
- validmind/tests/model_validation/ToxicityScore.py +26 -17
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +12 -6
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
- validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
- validmind/tests/model_validation/ragas/utils.py +6 -0
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
- validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
- validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +32 -26
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
- validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
- validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
- validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
- validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -94
- validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +66 -5
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
- validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +59 -32
- validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
- validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +86 -119
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
- validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +16 -14
- validmind/tests/prompt_validation/Conciseness.py +7 -5
- validmind/tests/prompt_validation/Delimitation.py +23 -22
- validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
- validmind/tests/prompt_validation/Robustness.py +12 -10
- validmind/tests/prompt_validation/Specificity.py +13 -11
- validmind/tests/prompt_validation/ai_powered_test.py +6 -0
- validmind/tests/run.py +68 -23
- validmind/unit_metrics/__init__.py +81 -144
- validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
- validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
- validmind/unit_metrics/regression/HuberLoss.py +1 -1
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
- validmind/unit_metrics/regression/QuantileLoss.py +1 -1
- validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
- validmind/utils.py +4 -0
- validmind/vm_models/dataset/dataset.py +2 -0
- validmind/vm_models/figure.py +5 -0
- validmind/vm_models/test/metric.py +1 -0
- validmind/vm_models/test/result_wrapper.py +143 -158
- validmind/vm_models/test/threshold_test.py +1 -0
- {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
- validmind-2.5.18.dist-info/RECORD +324 -0
- validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
- validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
- validmind/tests/data_validation/BivariateHistograms.py +0 -117
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
- validmind/tests/data_validation/MissingValuesRisk.py +0 -88
- validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
- validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
- validmind/tests/model_validation/statsmodels/JarqueBera.py +0 -73
- validmind/tests/model_validation/statsmodels/LJungBox.py +0 -66
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
- validmind/tests/model_validation/statsmodels/RunsTest.py +0 -71
- validmind-2.5.8.dist-info/RECORD +0 -318
- {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0
validmind/tests/data_validation/nlp/Sentiment.py CHANGED

```diff
@@ -17,16 +17,35 @@ def Sentiment(dataset):
     """
     Analyzes the sentiment of text data within a dataset using the VADER sentiment analysis tool.

-    in the specified column of the dataset's dataframe. It returns a KDE plot visualizing the distribution
-    of sentiment scores across the dataset.
+    ### Purpose

-    and a `text_column` attribute indicating the name of the column containing text.
+    The Sentiment test evaluates the overall sentiment of text data within a dataset. By analyzing sentiment scores, it
+    aims to ensure that the model is interpreting text data accurately and is not biased towards a particular sentiment.

+    ### Test Mechanism
+
+    This test uses the VADER (Valence Aware Dictionary and sEntiment Reasoner) SentimentIntensityAnalyzer. It processes
+    each text entry in a specified column of the dataset to calculate the compound sentiment score, which represents
+    the overall sentiment polarity. The distribution of these sentiment scores is then visualized using a KDE (Kernel
+    Density Estimation) plot, highlighting any skewness or concentration in sentiment.
+
+    ### Signs of High Risk
+
+    - Extreme polarity in sentiment scores, indicating potential bias.
+    - Unusual concentration of sentiment scores in a specific range.
+    - Significant deviation from expected sentiment distribution for the given text data.
+
+    ### Strengths
+
+    - Provides a clear visual representation of sentiment distribution.
+    - Uses a well-established sentiment analysis tool (VADER).
+    - Can handle a wide range of text data, making it flexible for various applications.
+
+    ### Limitations
+
+    - May not capture nuanced or context-specific sentiments.
+    - Relies heavily on the accuracy of the VADER sentiment analysis tool.
+    - Visualization alone may not provide comprehensive insights into underlying causes of sentiment distribution.
     """
     nltk.download("vader_lexicon", quiet=True)
     # Initialize VADER
```
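For orientation, the mechanism the new docstring describes (per-entry VADER compound scores visualized as a KDE plot) can be sketched outside ValidMind roughly as follows. This is a minimal illustration with made-up data and a seaborn plot, not the packaged test, which reads the text column from a ValidMind dataset and returns its own figure object.

```python
import nltk
import pandas as pd
import seaborn as sns
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download("vader_lexicon", quiet=True)

# hypothetical sample data; the real test reads dataset.df[dataset.text_column]
df = pd.DataFrame({"text": ["I love this product.", "This is terrible.", "It is okay, I guess."]})

sia = SentimentIntensityAnalyzer()
# VADER's compound score summarizes polarity on a [-1, 1] scale
scores = df["text"].apply(lambda t: sia.polarity_scores(t)["compound"])

# KDE plot of the score distribution, as described in the docstring above
ax = sns.kdeplot(scores, fill=True)
ax.set(xlabel="VADER compound sentiment score", title="Sentiment score distribution")
```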
validmind/tests/data_validation/nlp/StopWords.py CHANGED

```diff
@@ -30,40 +30,47 @@ class StopWords(ThresholdTest):
     """
     Evaluates and visualizes the frequency of English stop words in a text dataset against a defined threshold.

+    ### Purpose
+
+    The StopWords threshold test is a tool designed for assessing the quality of text data in an ML model. It focuses
+    on the identification and analysis of "stop words" in a given dataset. Stop words are frequent, common, yet
+    semantically insignificant words (for example: "the", "and", "is") in a language. This test evaluates the
     proportion of stop words to the total word count in the dataset, in essence, scrutinizing the frequency of stop
     word usage. The core objective is to highlight the prevalent stop words based on their usage frequency, which can
     be instrumental in cleaning the data from noise and improving ML model performance.

+    ### Test Mechanism
+
+    The StopWords test initiates on receiving an input of a 'VMDataset' object. Absence of such an object will trigger
+    an error. The methodology involves inspection of the text column of the VMDataset to create a 'corpus' (a
+    collection of written texts). Leveraging the Natural Language Toolkit's (NLTK) stop word repository, the test
+    screens the corpus for any stop words and documents their frequency. It further calculates the percentage usage of
+    each stop word compared to the total word count in the corpus. This percentage is evaluated against a predefined
+    'min_percent_threshold'. If this threshold is breached, the test returns a failed output. Top prevailing stop words
+    along with their usage percentages are returned, facilitated by a bar chart visualization of these stop words and
+    their frequency.
+
+    ### Signs of High Risk

-    **Signs of High Risk**:
     - A percentage of any stop words exceeding the predefined 'min_percent_threshold'.
     - High frequency of stop words in the dataset which may adversely affect the application's analytical performance
     due to noise creation.

+    ### Strengths
+
     - The ability to scrutinize and quantify the usage of stop words.
-    - Provides insights into potential noise in the text data due to stop words.
-    model training efficiency.
+    - Provides insights into potential noise in the text data due to stop words.
+    - Directly aids in enhancing model training efficiency.
+    - Includes a bar chart visualization feature to easily interpret and action upon the stop words frequency
     information.

+    ### Limitations
+
     - The test only supports English stop words, making it less effective with datasets of other languages.
     - The 'min_percent_threshold' parameter may require fine-tuning for different datasets, impacting the overall
     effectiveness of the test.
-    - Contextual use of the stop words within the dataset is not considered
+    - Contextual use of the stop words within the dataset is not considered, potentially overlooking their significance
+    in certain contexts.
     - The test focuses specifically on the frequency of stop words, not providing direct measures of model performance
     or predictive accuracy.
     """
```
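The percentage-versus-threshold check described above can be sketched with NLTK's stop word list. The corpus and threshold below are made up for illustration; in the packaged test the corpus comes from the VMDataset text column and `min_percent_threshold` is a test parameter.

```python
from collections import Counter

import nltk
from nltk.corpus import stopwords

nltk.download("stopwords", quiet=True)

# hypothetical corpus and threshold
corpus = ["the cat sat on the mat", "and then the dog slept on the rug"]
min_percent_threshold = 0.5

tokens = [word for doc in corpus for word in doc.lower().split()]
stop_set = set(stopwords.words("english"))

counts = Counter(word for word in tokens if word in stop_set)
total_words = len(tokens)

# percentage usage of each stop word relative to the total word count
usage = {word: 100 * n / total_words for word, n in counts.most_common()}
passed = all(pct <= min_percent_threshold for pct in usage.values())
print(usage, "passed:", passed)
```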
validmind/tests/data_validation/nlp/TextDescription.py CHANGED

```diff
@@ -17,46 +17,47 @@ from ....vm_models import Figure, Metric, VMDataset
 @dataclass
 class TextDescription(Metric):
     """
+    Conducts comprehensive textual analysis on a dataset using NLTK to evaluate various parameters and generate
     visualizations.

+    ### Purpose
+
+    The TextDescription test aims to conduct a thorough textual analysis of a dataset using the NLTK (Natural Language
+    Toolkit) library. It evaluates various metrics such as total words, total sentences, average sentence length, total
+    paragraphs, total unique words, most common words, total punctuations, and lexical diversity. The goal is to
+    understand the nature of the text and anticipate challenges machine learning models might face in text processing,
+    language understanding, or summarization tasks.
+
+    ### Test Mechanism
+
+    The test works by:
+
+    - Parsing the dataset and tokenizing the text into words, sentences, and paragraphs using NLTK.
+    - Removing stopwords and unwanted tokens.
+    - Calculating parameters like total words, total sentences, average sentence length, total paragraphs, total unique
+    words, total punctuations, and lexical diversity.
+    - Generating scatter plots to visualize correlations between various metrics (e.g., Total Words vs Total Sentences).
+
+    ### Signs of High Risk
+
+    - Anomalies or increased complexity in lexical diversity.
     - Longer sentences and paragraphs.
     - High uniqueness of words.
+    - Large number of unwanted tokens.
     - Missing or erroneous visualizations.
-    - Lacks the ability to consider semantics or grammatical complexities, which could be crucial aspects in language
-    processing.
-    - Assumes that the document is well-structured (includes sentences and paragraphs); therefore, unstructured or
-    poorly formatted text may distort the results.
+
+    ### Strengths
+
+    - Essential for pre-processing text data in machine learning models.
+    - Provides a comprehensive breakdown of text data, aiding in understanding its complexity.
+    - Generates visualizations to help comprehend text structure and complexity.
+
+    ### Limitations
+
+    - Highly dependent on the NLTK library, limiting the test to supported languages.
+    - Limited customization for removing undesirable tokens and stop words.
+    - Does not consider semantic or grammatical complexities.
+    - Assumes well-structured documents, which may result in inaccuracies with poorly formatted text.
     """

     name = "text_description"

@@ -83,7 +84,6 @@ class TextDescription(Metric):
     tags = ["nlp", "text_data", "visualization"]

     def general_text_metrics(self, df, text_column):
-        nltk.download("punkt", quiet=True)
         results = []

         for text in df[text_column]:

@@ -174,6 +174,9 @@ class TextDescription(Metric):
         if not isinstance(self.inputs.dataset, VMDataset):
             raise ValueError("TextDescription requires a validmind Dataset object")

+        # download nltk data
+        nltk.download("punkt_tab", quiet=True)
+
         df_text_description = self.text_description_table(
             self.inputs.dataset.df, self.params
         )
```
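The metrics listed in the new docstring can be sketched with plain NLTK calls. This is an illustrative snippet with a made-up string; it assumes an NLTK version that ships the `punkt_tab` tokenizer data referenced in the hunk above, and the metric names mirror the docstring rather than the package's exact table columns.

```python
import string

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download("punkt_tab", quiet=True)  # same tokenizer data the updated run() fetches

text = "ValidMind documents models. This second sentence, with punctuation, is longer!"

sentences = sent_tokenize(text)
words = [w for w in word_tokenize(text) if w not in string.punctuation]
unique_words = {w.lower() for w in words}

metrics = {
    "Total Words": len(words),
    "Total Sentences": len(sentences),
    "Avg Sentence Length": round(len(words) / len(sentences), 2),
    "Total Unique Words": len(unique_words),
    "Total Punctuations": sum(ch in string.punctuation for ch in text),
    "Lexical Diversity": round(len(unique_words) / len(words), 2),
}
print(metrics)
```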
validmind/tests/data_validation/nlp/Toxicity.py CHANGED

```diff
@@ -13,18 +13,41 @@ from validmind import tags, tasks
 @tasks("nlp")
 def Toxicity(dataset):
     """
+    Assesses the toxicity of text data within a dataset to visualize the distribution of toxicity scores.

-    in the specified column of the dataset's dataframe. It returns a KDE plot visualizing the distribution
-    of toxicity scores across the dataset.
+    ### Purpose

+    The Toxicity test aims to evaluate the level of toxic content present in a text dataset by leveraging a pre-trained
+    toxicity model. It helps in identifying potentially harmful or offensive language that may negatively impact users
+    or stakeholders.

+    ### Test Mechanism
+
+    This test uses a pre-trained toxicity evaluation model and applies it to each text entry in the specified column of
+    a dataset’s dataframe. The procedure involves:
+
+    - Loading a pre-trained toxicity model.
+    - Extracting the text from the specified column in the dataset.
+    - Computing toxicity scores for each text entry.
+    - Generating a KDE (Kernel Density Estimate) plot to visualize the distribution of these toxicity scores.
+
+    ### Signs of High Risk
+
+    - High concentration of high toxicity scores in the KDE plot.
+    - A significant proportion of text entries with toxicity scores above a predefined threshold.
+    - Wide distribution of toxicity scores, indicating inconsistency in content quality.
+
+    ### Strengths
+
+    - Provides a visual representation of toxicity distribution, making it easier to identify outliers.
+    - Uses a robust pre-trained model for toxicity evaluation.
+    - Can process large text datasets efficiently.
+
+    ### Limitations
+
+    - Depends on the accuracy and bias of the pre-trained toxicity model.
+    - Does not provide context-specific insights, which may be necessary for nuanced understanding.
+    - May not capture all forms of subtle or indirect toxic language.
     """
     toxicity = evaluate.load("toxicity")
     input_text = dataset.df[dataset.text_column]
```
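The loading call shown in the test body comes from the Hugging Face `evaluate` library. A hedged sketch of the score-then-plot flow is below; the sample texts are made up, the KDE is drawn with seaborn rather than the package's plotting code, and the `"toxicity"` output key is assumed from the measurement's default result format.

```python
import evaluate
import pandas as pd
import seaborn as sns

# same loading call as in the test body above; downloads a pre-trained
# toxicity classifier on first use
toxicity = evaluate.load("toxicity")

df = pd.DataFrame({"text": ["Have a wonderful day!", "You are completely useless."]})
scores = toxicity.compute(predictions=df["text"].tolist())["toxicity"]

# KDE plot of per-entry toxicity scores, mirroring the docstring's description
ax = sns.kdeplot(scores, fill=True)
ax.set(xlabel="Toxicity score", title="Toxicity score distribution")
```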
validmind/tests/decorator.py CHANGED

```diff
@@ -9,6 +9,7 @@

 import inspect
 import os
+from typing import Any, Dict, List, Tuple, Union
 from uuid import uuid4

 import pandas as pd
```
```diff
@@ -22,6 +23,8 @@ from validmind.vm_models import (
     ResultSummary,
     ResultTable,
     ResultTableMetadata,
+    VMDataset,
+    VMModel,
 )
 from validmind.vm_models.figure import (
     Figure,
```
```diff
@@ -36,30 +39,42 @@ from ._store import test_store
 logger = get_logger(__name__)


+_input_type_map = {
+    "dataset": VMDataset,
+    "datasets": List[VMDataset],
+    "model": VMModel,
+    "models": List[VMModel],
+}
+

+def _inspect_signature(test_func: callable):
     inputs = {}
     params = {}

     for name, arg in inspect.signature(test_func).parameters.items():
+        if name in _input_type_map:
+            inputs[name] = {
+                "type": _input_type_map[name],
+            }
         else:
+            params[name] = {
+                "type": arg.annotation,
+                "default": (
+                    arg.default if arg.default is not inspect.Parameter.empty else None
+                ),
+            }

     return inputs, params


 def _build_result( # noqa: C901
+    results: Union[Any, Tuple[Any, ...]],
+    test_id: str,
+    inputs: List[str],
+    params: Dict[str, Any],
+    description: str = None,
+    output_template: str = None,
+    generate_description: bool = True,
 ):
     ref_id = str(uuid4())
     figure_metadata = {
```
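The effect of the new `_input_type_map` plus `_inspect_signature` can be illustrated with a standalone sketch of the same classification rule: parameter names reserved for inputs (`dataset`, `datasets`, `model`, `models`) are collected as test inputs, and everything else becomes a test parameter carrying its annotation and default. The sketch is illustrative only; `split_signature` and the placeholder type string are hypothetical, not the package's code.

```python
import inspect

RESERVED_INPUT_NAMES = ("dataset", "datasets", "model", "models")


def split_signature(func):
    """Split a test function's parameters into inputs vs. params (illustrative)."""
    inputs, params = {}, {}
    for name, arg in inspect.signature(func).parameters.items():
        if name in RESERVED_INPUT_NAMES:
            inputs[name] = {"type": "reserved input"}  # placeholder, not VMDataset/VMModel
        else:
            params[name] = {
                "type": arg.annotation,
                "default": arg.default if arg.default is not inspect.Parameter.empty else None,
            }
    return inputs, params


def my_test(dataset, model, cutoff: float = 0.5, bins: int = 10):
    ...


print(split_signature(my_test))
# inputs -> dataset, model; params -> cutoff (default 0.5), bins (default 10)
```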
```diff
@@ -70,14 +85,17 @@ def _build_result( # noqa: C901

     tables = []
     figures = []
+    scalars = []

+    def process_result_item(item):
         # TOOD: build out a more robust/extensible system for this
         # TODO: custom type handlers would be really cool

+        # unit metrics (scalar values) - for now only one per test
+        if isinstance(item, int) or isinstance(item, float):
+            if scalars:
+                raise ValueError("Only one unit metric may be returned per test.")
+            scalars.append(item)

         # plots
         elif isinstance(item, Figure):
```
```diff
@@ -114,46 +132,66 @@ def _build_result( # noqa: C901
     # if the results are a tuple, process each item as a separate result
     if isinstance(results, tuple):
         for item in results:
+            process_result_item(item)
     else:
+        process_result_item(results)

+    metric_inputs = [
+        sub_i.input_id if hasattr(sub_i, "input_id") else sub_i
+        for i in inputs
+        for sub_i in (i if isinstance(i, list) else [i])
+    ]

     return MetricResultWrapper(
         result_id=test_id,
+        scalar=scalars[0] if scalars else None,
+        metric=(
+            MetricResult(
+                key=test_id,
+                ref_id=ref_id,
+                value="Empty",
+                summary=ResultSummary(results=tables),
+            )
+            if tables or figures  # if tables or figures than its a traditional metric
+            else None
         ),
         figures=figures,
+        result_metadata=(
+            [
+                get_description_metadata(
+                    test_id=test_id,
+                    default_description=description,
+                    summary=ResultSummary(results=tables).serialize(),
+                    figures=figures,
+                    should_generate=generate_description,
+                )
+            ]
+            if tables or figures
+            else None
+        ),
+        inputs=metric_inputs,
+        params=params,
         output_template=output_template,
     )


-def _get_run_method(func,
+def _get_run_method(func, func_inputs, func_params):
     def run(self: Metric):
-        input_kwargs = {}
+        input_kwargs = {}  # map function inputs (`dataset` etc) to actual objects
+        input_ids = []  # store input_ids used so they can be logged
+        for key in func_inputs.keys():
             try:
+                input_kwargs[key] = getattr(self.inputs, key)
+                if isinstance(input_kwargs[key], list):
+                    input_ids.extend([i.input_id for i in input_kwargs[key]])
+                else:
+                    input_ids.append(input_kwargs[key].input_id)
             except AttributeError:
+                raise MissingRequiredTestInputError(f"Missing required input: {key}.")

         param_kwargs = {
+            key: self.params.get(key, func_params[key]["default"])
+            for key in func_params.keys()
         }

         raw_results = func(**input_kwargs, **param_kwargs)
```
```diff
@@ -162,8 +200,9 @@ def _get_run_method(func, inputs, params):
             results=raw_results,
             test_id=self.test_id,
             description=inspect.getdoc(self),
+            inputs=input_ids,
+            params=param_kwargs,
             output_template=self.output_template,
-            inputs=self.get_accessed_inputs(),
             generate_description=self.generate_description,
         )

```
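Taken together, the decorator changes affect how a decorated custom test is wired at run time: reserved parameters are resolved from the supplied inputs (with their `input_id`s recorded for logging), remaining parameters are filled from `params` or their declared defaults, and a bare numeric return value is treated as a unit-metric scalar. Below is a hedged sketch of such a custom test, assuming the `vm.test` decorator and the inputs/params style of `run_test`; the test id, column logic, and `vm_dataset` object are hypothetical.

```python
import validmind as vm


@vm.test("my_custom_tests.TargetRate")
def target_rate(dataset, threshold: float = 0.5):
    """Share of target values above a threshold (illustrative custom test)."""
    # `dataset` is resolved from the run inputs; `threshold` comes from params
    # or falls back to the default captured from the signature.
    values = dataset.df[dataset.target_column]
    return float((values > threshold).mean())  # scalar return -> unit metric


# result = vm.tests.run_test(
#     "my_custom_tests.TargetRate",
#     inputs={"dataset": vm_dataset},  # vm_dataset: a previously initialized dataset
#     params={"threshold": 0.7},
# )
```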
validmind/tests/model_validation/BertScore.py CHANGED

```diff
@@ -13,39 +13,48 @@ from validmind import tags, tasks
 @tasks("text_classification", "text_summarization")
 def BertScore(dataset, model):
     """
-    and bar charts, alongside compiling a comprehensive table of descriptive statistics
+    Assesses the quality of machine-generated text using BERTScore metrics and visualizes results through histograms
+    and bar charts, alongside compiling a comprehensive table of descriptive statistics.
+
+    ### Purpose
+
+    This function is designed to assess the quality of text generated by machine learning models using BERTScore
+    metrics. BERTScore evaluates text generation models' performance by calculating precision, recall, and F1 score
+    based on BERT contextual embeddings.
+
+    ### Test Mechanism
+
+    The function starts by extracting the true and predicted values from the provided dataset and model. It then
+    initializes the BERTScore evaluator. For each pair of true and predicted texts, the function calculates the
+    BERTScore metrics and compiles them into a dataframe. Histograms and bar charts are generated for each BERTScore
+    metric (Precision, Recall, and F1 Score) to visualize their distribution. Additionally, a table of descriptive
+    statistics (mean, median, standard deviation, minimum, and maximum) is compiled for each metric, providing a
+    comprehensive summary of the model's performance.
+
+    ### Signs of High Risk
+
+    - Consistently low scores across BERTScore metrics could indicate poor quality in the generated text, suggesting
+    that the model fails to capture the essential content of the reference texts.
     - Low precision scores might suggest that the generated text contains a lot of redundant or irrelevant information.
     - Low recall scores may indicate that important information from the reference text is being omitted.
-    - An imbalanced performance between precision and recall, reflected by a low F1 Score, could signal issues in the
+    - An imbalanced performance between precision and recall, reflected by a low F1 Score, could signal issues in the
+    model's ability to balance informativeness and conciseness.

+    ### Strengths
+
+    - Provides a multifaceted evaluation of text quality through different BERTScore metrics, offering a detailed view
+    of model performance.
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the
+    scores.
     - Descriptive statistics offer a concise summary of the model's strengths and weaknesses in generating text.

+    ### Limitations
+
+    - BERTScore relies on the contextual embeddings from BERT models, which may not fully capture all nuances of text
+    similarity.
     - The evaluation relies on the availability of high-quality reference texts, which may not always be obtainable.
-    - While useful for comparison, BERTScore metrics alone do not provide a complete assessment of a model's
+    - While useful for comparison, BERTScore metrics alone do not provide a complete assessment of a model's
+    performance and should be supplemented with other metrics and qualitative analysis.
     """

     # Extract true and predicted values
```
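The computation the docstring outlines can be approximated with the Hugging Face `evaluate` implementation of BERTScore. The reference/prediction pairs below are made up, and the packaged test additionally pulls these from the dataset and model and renders histograms and bar charts.

```python
import evaluate
import pandas as pd

bertscore = evaluate.load("bertscore")

references = ["The cat sat on the mat.", "It is raining in Paris today."]
predictions = ["A cat is sitting on the mat.", "Paris has heavy rain today."]

result = bertscore.compute(predictions=predictions, references=references, lang="en")
scores_df = pd.DataFrame(
    {"Precision": result["precision"], "Recall": result["recall"], "F1 Score": result["f1"]}
)

# descriptive statistics of the kind the test tabulates
print(scores_df.describe().loc[["mean", "50%", "std", "min", "max"]])
```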
validmind/tests/model_validation/BleuScore.py CHANGED

```diff
@@ -16,39 +16,45 @@ def BleuScore(dataset, model):
     Evaluates the quality of machine-generated text using BLEU metrics and visualizes the results through histograms
     and bar charts, alongside compiling a comprehensive table of descriptive statistics for BLEU scores.

+    ### Purpose
+
     This function is designed to assess the quality of text generated by machine learning models using the BLEU metric.
     BLEU, which stands for Bilingual Evaluation Understudy, is a metric used to evaluate the overlap of n-grams between
     the machine-generated text and reference texts. This evaluation is crucial for tasks such as text summarization,
     machine translation, and text generation, where the goal is to produce text that accurately reflects the content
     and meaning of human-crafted references.

+    ### Test Mechanism
+
+    The function starts by extracting the true and predicted values from the provided dataset and model. It then
+    initializes the BLEU evaluator. For each pair of true and predicted texts, the function calculates the BLEU scores
+    and compiles them into a dataframe. Histograms and bar charts are generated for the BLEU scores to visualize their
+    distribution. Additionally, a table of descriptive statistics (mean, median, standard deviation, minimum, and
+    maximum) is compiled for the BLEU scores, providing a comprehensive summary of the model's performance.
+
+    ### Signs of High Risk

-    the essential content of the reference texts.
+    - Consistently low BLEU scores could indicate poor quality in the generated text, suggesting that the model fails
+    to capture the essential content of the reference texts.
     - Low precision scores might suggest that the generated text contains a lot of redundant or irrelevant information.
     - Low recall scores may indicate that important information from the reference text is being omitted.
-    - An imbalanced performance between precision and recall, reflected by a low BLEU score, could signal issues in the
+    - An imbalanced performance between precision and recall, reflected by a low BLEU score, could signal issues in the
+    model's ability to balance informativeness and conciseness.
+
+    ### Strengths

-    **Strengths:**
     - Provides a straightforward and widely-used evaluation of text quality through BLEU scores.
-    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the
+    scores.
     - Descriptive statistics offer a concise summary of the model's strengths and weaknesses in generating text.

+    ### Limitations
+
+    - BLEU metrics primarily focus on n-gram overlap and may not fully capture semantic coherence, fluency, or
+    grammatical quality of the text.
     - The evaluation relies on the availability of high-quality reference texts, which may not always be obtainable.
-    - While useful for comparison, BLEU scores alone do not provide a complete assessment of a model's performance and
+    - While useful for comparison, BLEU scores alone do not provide a complete assessment of a model's performance and
+    should be supplemented with other metrics and qualitative analysis.
     """

     # Extract true and predicted values
```
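As with BERTScore, the per-pair BLEU computation can be sketched with the Hugging Face `evaluate` library. The sample texts are made up; the packaged test extracts them from the dataset and model and also produces histograms and bar charts alongside the descriptive statistics.

```python
import evaluate
import pandas as pd

bleu = evaluate.load("bleu")

references = ["The cat sat on the mat.", "It is raining in Paris today."]
predictions = ["A cat is sitting on the mat.", "Paris has heavy rain today."]

# one BLEU score per (prediction, reference) pair, compiled into a dataframe
scores = [
    bleu.compute(predictions=[pred], references=[[ref]])["bleu"]
    for pred, ref in zip(predictions, references)
]
scores_df = pd.DataFrame({"BLEU Score": scores})
print(scores_df.describe())
```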