validmind 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +80 -119
  3. validmind/ai/test_result_description/config.yaml +29 -0
  4. validmind/ai/test_result_description/context.py +73 -0
  5. validmind/ai/test_result_description/image_processing.py +124 -0
  6. validmind/ai/test_result_description/system.jinja +39 -0
  7. validmind/ai/test_result_description/user.jinja +25 -0
  8. validmind/api_client.py +89 -43
  9. validmind/client.py +2 -2
  10. validmind/client_config.py +11 -14
  11. validmind/datasets/credit_risk/__init__.py +1 -0
  12. validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
  13. validmind/datasets/credit_risk/lending_club_bias.py +142 -0
  14. validmind/datasets/regression/fred_timeseries.py +67 -138
  15. validmind/template.py +1 -0
  16. validmind/test_suites/__init__.py +0 -2
  17. validmind/test_suites/statsmodels_timeseries.py +1 -1
  18. validmind/test_suites/summarization.py +0 -1
  19. validmind/test_suites/time_series.py +0 -43
  20. validmind/tests/__types__.py +14 -15
  21. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  22. validmind/tests/data_validation/ADF.py +31 -24
  23. validmind/tests/data_validation/AutoAR.py +9 -9
  24. validmind/tests/data_validation/AutoMA.py +23 -16
  25. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  26. validmind/tests/data_validation/AutoStationarity.py +21 -16
  27. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  28. validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +34 -34
  29. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +85 -124
  30. validmind/tests/data_validation/ClassImbalance.py +15 -12
  31. validmind/tests/data_validation/DFGLSArch.py +19 -13
  32. validmind/tests/data_validation/DatasetDescription.py +17 -11
  33. validmind/tests/data_validation/DatasetSplit.py +7 -5
  34. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  35. validmind/tests/data_validation/Duplicates.py +33 -25
  36. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  37. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  38. validmind/tests/data_validation/HighCardinality.py +19 -12
  39. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  40. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  41. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  42. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  43. validmind/tests/data_validation/JarqueBera.py +70 -0
  44. validmind/tests/data_validation/KPSS.py +34 -29
  45. validmind/tests/data_validation/LJungBox.py +66 -0
  46. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  47. validmind/tests/data_validation/MissingValues.py +32 -27
  48. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  49. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  50. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  51. validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
  52. validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
  53. validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
  54. validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
  55. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  56. validmind/tests/data_validation/RunsTest.py +72 -0
  57. validmind/tests/data_validation/ScatterPlot.py +63 -78
  58. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  59. validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +35 -30
  60. validmind/tests/data_validation/Skewness.py +35 -37
  61. validmind/tests/data_validation/SpreadPlot.py +35 -35
  62. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  63. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  64. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  65. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  66. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  67. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  68. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  69. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  70. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  71. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  72. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  73. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  74. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  75. validmind/tests/data_validation/UniqueRows.py +11 -6
  76. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  77. validmind/tests/data_validation/WOEBinTable.py +35 -30
  78. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  79. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  80. validmind/tests/data_validation/nlp/Hashtags.py +42 -40
  81. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  82. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  83. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  84. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  85. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  86. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  87. validmind/tests/data_validation/nlp/TextDescription.py +39 -36
  88. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  89. validmind/tests/decorator.py +81 -42
  90. validmind/tests/model_validation/BertScore.py +36 -27
  91. validmind/tests/model_validation/BleuScore.py +25 -19
  92. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  93. validmind/tests/model_validation/ContextualRecall.py +38 -13
  94. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  95. validmind/tests/model_validation/MeteorScore.py +46 -33
  96. validmind/tests/model_validation/ModelMetadata.py +32 -64
  97. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  98. validmind/tests/model_validation/RegardScore.py +30 -14
  99. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  100. validmind/tests/model_validation/RougeScore.py +36 -30
  101. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  102. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  103. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  104. validmind/tests/model_validation/TokenDisparity.py +31 -23
  105. validmind/tests/model_validation/ToxicityScore.py +26 -17
  106. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  107. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  108. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  109. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  110. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  111. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  112. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  113. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  114. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  115. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  116. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  117. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  118. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  119. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  120. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  121. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  122. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  123. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  124. validmind/tests/model_validation/ragas/AspectCritique.py +12 -6
  125. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  126. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  127. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  128. validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
  129. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  130. validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
  131. validmind/tests/model_validation/ragas/utils.py +6 -0
  132. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  133. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  134. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  135. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  136. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  137. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  138. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  139. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  140. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  141. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  142. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  143. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  144. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  145. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  146. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  147. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  148. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  149. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  150. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +32 -26
  151. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  152. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  153. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  154. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  155. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  156. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  157. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -94
  158. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  159. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
  160. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +66 -5
  161. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  162. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  163. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  164. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  165. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  166. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  167. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +59 -32
  168. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  169. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  170. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  171. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +86 -119
  172. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  173. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  174. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  175. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  176. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  177. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  178. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  179. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  180. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  181. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  182. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  183. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  184. validmind/tests/prompt_validation/Bias.py +14 -11
  185. validmind/tests/prompt_validation/Clarity.py +16 -14
  186. validmind/tests/prompt_validation/Conciseness.py +7 -5
  187. validmind/tests/prompt_validation/Delimitation.py +23 -22
  188. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  189. validmind/tests/prompt_validation/Robustness.py +12 -10
  190. validmind/tests/prompt_validation/Specificity.py +13 -11
  191. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  192. validmind/tests/run.py +68 -23
  193. validmind/unit_metrics/__init__.py +81 -144
  194. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  195. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  196. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  197. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  198. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  199. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  200. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  201. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  202. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  203. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  204. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  205. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  206. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  207. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  208. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  209. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  210. validmind/utils.py +4 -0
  211. validmind/vm_models/dataset/dataset.py +2 -0
  212. validmind/vm_models/figure.py +5 -0
  213. validmind/vm_models/test/metric.py +1 -0
  214. validmind/vm_models/test/result_wrapper.py +143 -158
  215. validmind/vm_models/test/threshold_test.py +1 -0
  216. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
  217. validmind-2.5.18.dist-info/RECORD +324 -0
  218. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  219. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  220. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  221. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  222. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  223. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  224. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  225. validmind/tests/model_validation/statsmodels/JarqueBera.py +0 -73
  226. validmind/tests/model_validation/statsmodels/LJungBox.py +0 -66
  227. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  228. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  229. validmind/tests/model_validation/statsmodels/RunsTest.py +0 -71
  230. validmind-2.5.8.dist-info/RECORD +0 -318
  231. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
  232. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
  233. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0

validmind/tests/data_validation/nlp/Sentiment.py
@@ -17,16 +17,35 @@ def Sentiment(dataset):
     """
     Analyzes the sentiment of text data within a dataset using the VADER sentiment analysis tool.
 
-    This method initializes the VADER SentimentIntensityAnalyzer and applies it to each text entry
-    in the specified column of the dataset's dataframe. It returns a KDE plot visualizing the distribution
-    of sentiment scores across the dataset.
+    ### Purpose
 
-    Args:
-        dataset (Dataset): A dataset object which must have a `df` attribute (a pandas DataFrame)
-            and a `text_column` attribute indicating the name of the column containing text.
+    The Sentiment test evaluates the overall sentiment of text data within a dataset. By analyzing sentiment scores, it
+    aims to ensure that the model is interpreting text data accurately and is not biased towards a particular sentiment.
 
-    Returns:
-        matplotlib.figure.Figure: A KDE plot visualizing the distribution of sentiment scores.
+    ### Test Mechanism
+
+    This test uses the VADER (Valence Aware Dictionary and sEntiment Reasoner) SentimentIntensityAnalyzer. It processes
+    each text entry in a specified column of the dataset to calculate the compound sentiment score, which represents
+    the overall sentiment polarity. The distribution of these sentiment scores is then visualized using a KDE (Kernel
+    Density Estimation) plot, highlighting any skewness or concentration in sentiment.
+
+    ### Signs of High Risk
+
+    - Extreme polarity in sentiment scores, indicating potential bias.
+    - Unusual concentration of sentiment scores in a specific range.
+    - Significant deviation from expected sentiment distribution for the given text data.
+
+    ### Strengths
+
+    - Provides a clear visual representation of sentiment distribution.
+    - Uses a well-established sentiment analysis tool (VADER).
+    - Can handle a wide range of text data, making it flexible for various applications.
+
+    ### Limitations
+
+    - May not capture nuanced or context-specific sentiments.
+    - Relies heavily on the accuracy of the VADER sentiment analysis tool.
+    - Visualization alone may not provide comprehensive insights into underlying causes of sentiment distribution.
     """
     nltk.download("vader_lexicon", quiet=True)
     # Initialize VADER
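
For readers skimming the diff, a minimal sketch of the mechanism the new docstring describes (VADER compound scores summarized with a KDE plot) could look like the following; the DataFrame and column name are illustrative, not the packaged test.

```python
# Illustrative sketch of the described mechanism, not the packaged test:
# score each text entry with VADER and plot the distribution of compound scores.
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download("vader_lexicon", quiet=True)

df = pd.DataFrame({"text": ["I love this product.", "This is terrible.", "It is okay."]})

analyzer = SentimentIntensityAnalyzer()
# "compound" is VADER's normalized polarity score in [-1, 1]
df["sentiment"] = df["text"].apply(lambda t: analyzer.polarity_scores(t)["compound"])

fig, ax = plt.subplots()
sns.kdeplot(df["sentiment"], fill=True, ax=ax)
ax.set_xlabel("VADER compound score")
ax.set_title("Sentiment score distribution")
plt.show()
```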

validmind/tests/data_validation/nlp/StopWords.py
@@ -30,40 +30,47 @@ class StopWords(ThresholdTest):
     """
     Evaluates and visualizes the frequency of English stop words in a text dataset against a defined threshold.
 
-    **Purpose**: The StopWords threshold test is a tool designed for assessing the quality of text data in an ML model.
-    It focuses on the identification and analysis of "stop words" in a given dataset. Stop words are frequent, common,
-    yet semantically insignificant words (for example: "the", "and", "is") in a language. This test evaluates the
+    ### Purpose
+
+    The StopWords threshold test is a tool designed for assessing the quality of text data in an ML model. It focuses
+    on the identification and analysis of "stop words" in a given dataset. Stop words are frequent, common, yet
+    semantically insignificant words (for example: "the", "and", "is") in a language. This test evaluates the
     proportion of stop words to the total word count in the dataset, in essence, scrutinizing the frequency of stop
     word usage. The core objective is to highlight the prevalent stop words based on their usage frequency, which can
     be instrumental in cleaning the data from noise and improving ML model performance.
 
-    **Test Mechanism**: The StopWords test initiates on receiving an input of a 'VMDataset' object. Absence of such an
-    object will trigger an error. The methodology involves inspection of the text column of the VMDataset to create a
-    'corpus' (a collection of written texts). Leveraging the Natural Language Toolkit's (NLTK) stop word repository,
-    the test screens the corpus for any stop words and documents their frequency. It further calculates the percentage
-    usage of each stop word compared to the total word count in the corpus. This percentage is evaluated against a
-    predefined 'min_percent_threshold'. If this threshold is breached, the test returns a failed output. Top prevailing
-    stop words along with their usage percentages are returned, facilitated by a bar chart visualization of these stop
-    words and their frequency.
+    ### Test Mechanism
+
+    The StopWords test initiates on receiving an input of a 'VMDataset' object. Absence of such an object will trigger
+    an error. The methodology involves inspection of the text column of the VMDataset to create a 'corpus' (a
+    collection of written texts). Leveraging the Natural Language Toolkit's (NLTK) stop word repository, the test
+    screens the corpus for any stop words and documents their frequency. It further calculates the percentage usage of
+    each stop word compared to the total word count in the corpus. This percentage is evaluated against a predefined
+    'min_percent_threshold'. If this threshold is breached, the test returns a failed output. Top prevailing stop words
+    along with their usage percentages are returned, facilitated by a bar chart visualization of these stop words and
+    their frequency.
+
+    ### Signs of High Risk
 
-    **Signs of High Risk**:
     - A percentage of any stop words exceeding the predefined 'min_percent_threshold'.
     - High frequency of stop words in the dataset which may adversely affect the application's analytical performance
     due to noise creation.
 
-    **Strengths**:
+    ### Strengths
+
     - The ability to scrutinize and quantify the usage of stop words.
-    - Provides insights into potential noise in the text data due to stop words. This can directly aid in enhancing
-    model training efficiency.
-    - The test includes a bar chart visualization feature to easily interpret and action upon the stop words frequency
+    - Provides insights into potential noise in the text data due to stop words.
+    - Directly aids in enhancing model training efficiency.
+    - Includes a bar chart visualization feature to easily interpret and action upon the stop words frequency
     information.
 
-    **Limitations**:
+    ### Limitations
+
     - The test only supports English stop words, making it less effective with datasets of other languages.
     - The 'min_percent_threshold' parameter may require fine-tuning for different datasets, impacting the overall
     effectiveness of the test.
-    - Contextual use of the stop words within the dataset is not considered which may lead to overlooking their
-    significance in certain contexts.
+    - Contextual use of the stop words within the dataset is not considered, potentially overlooking their significance
+    in certain contexts.
     - The test focuses specifically on the frequency of stop words, not providing direct measures of model performance
     or predictive accuracy.
     """
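
The mechanism described above, the share of each stop word in the total word count checked against `min_percent_threshold`, can be approximated with a few lines of NLTK. This is a simplified sketch with made-up data, not the packaged ThresholdTest.

```python
# Simplified sketch of the stop-word frequency check (made-up data, not the packaged test).
from collections import Counter

import nltk
import pandas as pd
from nltk.corpus import stopwords

nltk.download("stopwords", quiet=True)

df = pd.DataFrame({"text": ["the cat sat on the mat", "and the dog barked at the cat"]})
min_percent_threshold = 0.5  # maximum allowed share per stop word, in percent

# build a corpus and count stop words (whitespace split keeps the sketch dependency-free)
tokens = [w.lower() for text in df["text"] for w in text.split()]
stop_set = set(stopwords.words("english"))
stop_counts = Counter(w for w in tokens if w in stop_set)

percentages = {w: 100 * c / len(tokens) for w, c in stop_counts.most_common(10)}
offenders = {w: pct for w, pct in percentages.items() if pct > min_percent_threshold}
print("failed" if offenders else "passed", offenders)
```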

validmind/tests/data_validation/nlp/TextDescription.py
@@ -17,46 +17,47 @@ from ....vm_models import Figure, Metric, VMDataset
 @dataclass
 class TextDescription(Metric):
     """
-    Performs comprehensive textual analysis on a dataset using NLTK, evaluating various parameters and generating
+    Conducts comprehensive textual analysis on a dataset using NLTK to evaluate various parameters and generate
     visualizations.
 
-    **Purpose**: This test uses the TextDescription metric to conduct a comprehensive textual analysis of a given
-    dataset. Various parameters such as total words, total sentences, average sentence length, total paragraphs, total
-    unique words, most common words, total punctuations, and lexical diversity are evaluated. This metric aids in
-    comprehending the nature of the text and evaluating the potential challenges that machine learning algorithms
-    deployed for textual analysis, language processing, or summarization might face.
-
-    **Test Mechanism**: The test works by parsing the given dataset and utilizes the NLTK (Natural Language Toolkit)
-    library for tokenizing the text into words, sentences, and paragraphs. Subsequently, it processes the text further
-    by eliminating stopwords declared in 'unwanted_tokens' and punctuations. Next, it determines parameters like the
-    total count of words, sentences, paragraphs, punctuations alongside the average sentence length and lexical
-    diversity. Lastly, the result from these calculations is condensed and scatter plots for certain variable
-    combinations (e.g. Total Words vs Total Sentences, Total Words vs Total Unique Words) are produced, providing a
-    visual representation of the text's structure.
-
-    **Signs of High Risk**:
-    - Anomalies or an increase in complexity within the lexical diversity results.
+    ### Purpose
+
+    The TextDescription test aims to conduct a thorough textual analysis of a dataset using the NLTK (Natural Language
+    Toolkit) library. It evaluates various metrics such as total words, total sentences, average sentence length, total
+    paragraphs, total unique words, most common words, total punctuations, and lexical diversity. The goal is to
+    understand the nature of the text and anticipate challenges machine learning models might face in text processing,
+    language understanding, or summarization tasks.
+
+    ### Test Mechanism
+
+    The test works by:
+
+    - Parsing the dataset and tokenizing the text into words, sentences, and paragraphs using NLTK.
+    - Removing stopwords and unwanted tokens.
+    - Calculating parameters like total words, total sentences, average sentence length, total paragraphs, total unique
+    words, total punctuations, and lexical diversity.
+    - Generating scatter plots to visualize correlations between various metrics (e.g., Total Words vs Total Sentences).
+
+    ### Signs of High Risk
+
+    - Anomalies or increased complexity in lexical diversity.
     - Longer sentences and paragraphs.
     - High uniqueness of words.
-    - Presence of a significant amount of unwanted tokens.
+    - Large number of unwanted tokens.
     - Missing or erroneous visualizations.
-    These signs suggest potential risk in text processing ML models, indicating that the ability of the model to
-    absorb and process text could be compromised.
-
-    **Strengths**:
-    - An essential pre-processing tool, specifically for textual analysis in machine learning model data.
-    - Provides a comprehensive breakdown of a text dataset, which aids in understanding both structural and vocabulary
-    complexity.
-    - Generates visualizations of correlations between chosen variables to further comprehend the text's structure and
-    complexity.
-
-    **Limitations**:
-    - Heavy reliance on the NLTK library, restricting its use to only the languages that NLTK supports.
-    - Limited customization capacity as the undesirable tokens and stop words are predefined.
-    - Lacks the ability to consider semantics or grammatical complexities, which could be crucial aspects in language
-    processing.
-    - Assumes that the document is well-structured (includes sentences and paragraphs); therefore, unstructured or
-    poorly formatted text may distort the results.
+
+    ### Strengths
+
+    - Essential for pre-processing text data in machine learning models.
+    - Provides a comprehensive breakdown of text data, aiding in understanding its complexity.
+    - Generates visualizations to help comprehend text structure and complexity.
+
+    ### Limitations
+
+    - Highly dependent on the NLTK library, limiting the test to supported languages.
+    - Limited customization for removing undesirable tokens and stop words.
+    - Does not consider semantic or grammatical complexities.
+    - Assumes well-structured documents, which may result in inaccuracies with poorly formatted text.
     """
 
     name = "text_description"
@@ -83,7 +84,6 @@ class TextDescription(Metric):
     tags = ["nlp", "text_data", "visualization"]
 
     def general_text_metrics(self, df, text_column):
-        nltk.download("punkt", quiet=True)
         results = []
 
         for text in df[text_column]:
@@ -174,6 +174,9 @@ class TextDescription(Metric):
         if not isinstance(self.inputs.dataset, VMDataset):
             raise ValueError("TextDescription requires a validmind Dataset object")
 
+        # download nltk data
+        nltk.download("punkt_tab", quiet=True)
+
         df_text_description = self.text_description_table(
             self.inputs.dataset.df, self.params
         )
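
As a rough companion to the docstring above, the kind of statistics it lists can be computed with NLTK along the following lines (toy text, assumed metric names); note that the hunks above also move the NLTK download to the newer `punkt_tab` data package.

```python
# Toy sketch of the text statistics listed above (not the TextDescription metric itself).
import nltk

nltk.download("punkt", quiet=True)      # tokenizer data for older NLTK releases
nltk.download("punkt_tab", quiet=True)  # tokenizer data for newer NLTK releases, as in the diff

text = "NLTK makes tokenization easy. It splits text into sentences and words."

sentences = nltk.sent_tokenize(text)
words = [w for w in nltk.word_tokenize(text) if w.isalpha()]

metrics = {
    "Total Words": len(words),
    "Total Sentences": len(sentences),
    "Avg Sentence Length": len(words) / len(sentences),
    "Total Unique Words": len({w.lower() for w in words}),
    "Lexical Diversity": len({w.lower() for w in words}) / len(words),
}
print(metrics)
```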

validmind/tests/data_validation/nlp/Toxicity.py
@@ -13,18 +13,41 @@ from validmind import tags, tasks
 @tasks("nlp")
 def Toxicity(dataset):
     """
-    Analyzes the toxicity of text data within a dataset using a pre-trained toxicity model.
+    Assesses the toxicity of text data within a dataset to visualize the distribution of toxicity scores.
 
-    This method loads a toxicity evaluation model and applies it to each text entry
-    in the specified column of the dataset's dataframe. It returns a KDE plot visualizing the distribution
-    of toxicity scores across the dataset.
+    ### Purpose
 
-    Args:
-        dataset (Dataset): A dataset object which must have a `df` attribute (a pandas DataFrame)
-            and a `text_column` attribute indicating the name of the column containing text.
+    The Toxicity test aims to evaluate the level of toxic content present in a text dataset by leveraging a pre-trained
+    toxicity model. It helps in identifying potentially harmful or offensive language that may negatively impact users
+    or stakeholders.
 
-    Returns:
-        matplotlib.figure.Figure: A KDE plot visualizing the distribution of toxicity scores.
+    ### Test Mechanism
+
+    This test uses a pre-trained toxicity evaluation model and applies it to each text entry in the specified column of
+    a dataset's dataframe. The procedure involves:
+
+    - Loading a pre-trained toxicity model.
+    - Extracting the text from the specified column in the dataset.
+    - Computing toxicity scores for each text entry.
+    - Generating a KDE (Kernel Density Estimate) plot to visualize the distribution of these toxicity scores.
+
+    ### Signs of High Risk
+
+    - High concentration of high toxicity scores in the KDE plot.
+    - A significant proportion of text entries with toxicity scores above a predefined threshold.
+    - Wide distribution of toxicity scores, indicating inconsistency in content quality.
+
+    ### Strengths
+
+    - Provides a visual representation of toxicity distribution, making it easier to identify outliers.
+    - Uses a robust pre-trained model for toxicity evaluation.
+    - Can process large text datasets efficiently.
+
+    ### Limitations
+
+    - Depends on the accuracy and bias of the pre-trained toxicity model.
+    - Does not provide context-specific insights, which may be necessary for nuanced understanding.
+    - May not capture all forms of subtle or indirect toxic language.
     """
     toxicity = evaluate.load("toxicity")
     input_text = dataset.df[dataset.text_column]
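
The hunk ends just as `evaluate.load("toxicity")` is called, so here is a hedged sketch of how such scores might be computed and plotted outside the library; the DataFrame, column name, and plotting choices are assumptions, not the packaged test.

```python
# Sketch of scoring a text column with the Hugging Face `evaluate` toxicity
# measurement and plotting the score distribution (assumed data, not the packaged test).
import evaluate
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

df = pd.DataFrame({"text": ["have a nice day", "you are awful", "thanks for the help"]})

toxicity = evaluate.load("toxicity")  # downloads a pre-trained toxicity classifier on first use
scores = toxicity.compute(predictions=list(df["text"]))["toxicity"]

fig, ax = plt.subplots()
sns.kdeplot(scores, fill=True, ax=ax)
ax.set_xlabel("toxicity score")
ax.set_title("Toxicity score distribution")
plt.show()
```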

validmind/tests/decorator.py
@@ -9,6 +9,7 @@
 
 import inspect
 import os
+from typing import Any, Dict, List, Tuple, Union
 from uuid import uuid4
 
 import pandas as pd
@@ -22,6 +23,8 @@ from validmind.vm_models import (
     ResultSummary,
     ResultTable,
     ResultTableMetadata,
+    VMDataset,
+    VMModel,
 )
 from validmind.vm_models.figure import (
     Figure,
@@ -36,30 +39,42 @@ from ._store import test_store
 logger = get_logger(__name__)
 
 
-def _inspect_signature(test_func: callable):
-    input_keys = ["dataset", "datasets", "model", "models"]
+_input_type_map = {
+    "dataset": VMDataset,
+    "datasets": List[VMDataset],
+    "model": VMModel,
+    "models": List[VMModel],
+}
+
 
+def _inspect_signature(test_func: callable):
     inputs = {}
     params = {}
 
     for name, arg in inspect.signature(test_func).parameters.items():
-        if name in input_keys:
-            target_dict = inputs
+        if name in _input_type_map:
+            inputs[name] = {
+                "type": _input_type_map[name],
+            }
         else:
-            target_dict = params
-
-        target_dict[name] = {
-            "type": arg.annotation,
-            "default": (
-                arg.default if arg.default is not inspect.Parameter.empty else None
-            ),
-        }
+            params[name] = {
+                "type": arg.annotation,
+                "default": (
+                    arg.default if arg.default is not inspect.Parameter.empty else None
+                ),
+            }
 
     return inputs, params
 
 
 def _build_result( # noqa: C901
-    results, test_id, description, output_template, inputs, generate_description=True
+    results: Union[Any, Tuple[Any, ...]],
+    test_id: str,
+    inputs: List[str],
+    params: Dict[str, Any],
+    description: str = None,
+    output_template: str = None,
+    generate_description: bool = True,
 ):
     ref_id = str(uuid4())
     figure_metadata = {
@@ -70,14 +85,17 @@ def _build_result( # noqa: C901
 
     tables = []
     figures = []
+    scalars = []
 
-    def process_item(item):
+    def process_result_item(item):
         # TOOD: build out a more robust/extensible system for this
         # TODO: custom type handlers would be really cool
 
-        # unit metrics (scalar values) - show in a simple table for now
-        if isinstance(item, int) or isinstance(item, float) or isinstance(item, str):
-            tables.append(ResultTable(data=[{test_id.split(".")[-1]: item}]))
+        # unit metrics (scalar values) - for now only one per test
+        if isinstance(item, int) or isinstance(item, float):
+            if scalars:
+                raise ValueError("Only one unit metric may be returned per test.")
+            scalars.append(item)
 
         # plots
         elif isinstance(item, Figure):
@@ -114,46 +132,66 @@ def _build_result( # noqa: C901
     # if the results are a tuple, process each item as a separate result
     if isinstance(results, tuple):
        for item in results:
-            process_item(item)
+            process_result_item(item)
     else:
-        process_item(results)
+        process_result_item(results)
 
-    result_summary = ResultSummary(results=tables)
+    metric_inputs = [
+        sub_i.input_id if hasattr(sub_i, "input_id") else sub_i
+        for i in inputs
+        for sub_i in (i if isinstance(i, list) else [i])
+    ]
 
     return MetricResultWrapper(
         result_id=test_id,
-        metric=MetricResult(
-            key=test_id,
-            ref_id=ref_id,
-            value="Empty",
-            summary=result_summary,
+        scalar=scalars[0] if scalars else None,
+        metric=(
+            MetricResult(
+                key=test_id,
+                ref_id=ref_id,
+                value="Empty",
+                summary=ResultSummary(results=tables),
+            )
+            if tables or figures # if tables or figures than its a traditional metric
+            else None
         ),
         figures=figures,
-        result_metadata=[
-            get_description_metadata(
-                test_id=test_id,
-                default_description=description,
-                summary=result_summary.serialize(),
-                figures=figures,
-                should_generate=generate_description,
-            )
-        ],
-        inputs=inputs,
+        result_metadata=(
+            [
+                get_description_metadata(
+                    test_id=test_id,
+                    default_description=description,
+                    summary=ResultSummary(results=tables).serialize(),
+                    figures=figures,
+                    should_generate=generate_description,
+                )
+            ]
+            if tables or figures
+            else None
+        ),
+        inputs=metric_inputs,
+        params=params,
         output_template=output_template,
     )
 
 
-def _get_run_method(func, inputs, params):
+def _get_run_method(func, func_inputs, func_params):
     def run(self: Metric):
-        input_kwargs = {}
-        for k in inputs.keys():
+        input_kwargs = {} # map function inputs (`dataset` etc) to actual objects
+        input_ids = [] # store input_ids used so they can be logged
+        for key in func_inputs.keys():
            try:
-                input_kwargs[k] = getattr(self.inputs, k)
+                input_kwargs[key] = getattr(self.inputs, key)
+                if isinstance(input_kwargs[key], list):
+                    input_ids.extend([i.input_id for i in input_kwargs[key]])
+                else:
+                    input_ids.append(input_kwargs[key].input_id)
            except AttributeError:
-                raise MissingRequiredTestInputError(f"Missing required input: {k}.")
+                raise MissingRequiredTestInputError(f"Missing required input: {key}.")
 
        param_kwargs = {
-            k: self.params.get(k, params[k]["default"]) for k in params.keys()
+            key: self.params.get(key, func_params[key]["default"])
+            for key in func_params.keys()
        }
 
        raw_results = func(**input_kwargs, **param_kwargs)
@@ -162,8 +200,9 @@ def _get_run_method(func, inputs, params):
             results=raw_results,
             test_id=self.test_id,
             description=inspect.getdoc(self),
+            inputs=input_ids,
+            params=param_kwargs,
             output_template=self.output_template,
-            inputs=self.get_accessed_inputs(),
             generate_description=self.generate_description,
         )
 
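
The practical effect of the refactored `_inspect_signature` is that parameters named `dataset`, `datasets`, `model`, or `models` are treated as test inputs (typed via `_input_type_map`), while everything else becomes a test parameter with its annotation and default. Below is a toy, stand-alone illustration of that split, not validmind code, using a hypothetical custom test function.

```python
# Stand-alone illustration of the input/param split performed above (not validmind code).
import inspect

RESERVED_INPUTS = {"dataset", "datasets", "model", "models"}  # mirrors the _input_type_map keys

def my_custom_test(dataset, model, threshold: float = 0.5):
    """A hypothetical user-defined test function."""

inputs, params = {}, {}
for name, arg in inspect.signature(my_custom_test).parameters.items():
    if name in RESERVED_INPUTS:
        inputs[name] = {"type": name}  # the real code maps these to VMDataset / VMModel types
    else:
        params[name] = {
            "type": arg.annotation,
            "default": arg.default if arg.default is not inspect.Parameter.empty else None,
        }

print(inputs)  # {'dataset': {'type': 'dataset'}, 'model': {'type': 'model'}}
print(params)  # {'threshold': {'type': <class 'float'>, 'default': 0.5}}
```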

validmind/tests/model_validation/BertScore.py
@@ -13,39 +13,48 @@ from validmind import tags, tasks
 @tasks("text_classification", "text_summarization")
 def BertScore(dataset, model):
     """
-    Evaluates the quality of machine-generated text using BERTScore metrics and visualizes the results through histograms
-    and bar charts, alongside compiling a comprehensive table of descriptive statistics for each BERTScore metric.
-
-    **Purpose:**
-    This function is designed to assess the quality of text generated by machine learning models using BERTScore metrics.
-    BERTScore evaluates text generation models' performance by calculating precision, recall, and F1 score based on BERT
-    contextual embeddings.
-
-    **Test Mechanism:**
-    The function starts by extracting the true and predicted values from the provided dataset and model. It then initializes
-    the BERTScore evaluator. For each pair of true and predicted texts, the function calculates the BERTScore metrics and
-    compiles them into a dataframe. Histograms and bar charts are generated for each BERTScore metric (Precision, Recall,
-    and F1 Score) to visualize their distribution. Additionally, a table of descriptive statistics (mean, median, standard
-    deviation, minimum, and maximum) is compiled for each metric, providing a comprehensive summary of the model's performance.
-
-    **Signs of High Risk:**
-    - Consistently low scores across BERTScore metrics could indicate poor quality in the generated text, suggesting that the model
-    fails to capture the essential content of the reference texts.
+    Assesses the quality of machine-generated text using BERTScore metrics and visualizes results through histograms
+    and bar charts, alongside compiling a comprehensive table of descriptive statistics.
+
+    ### Purpose
+
+    This function is designed to assess the quality of text generated by machine learning models using BERTScore
+    metrics. BERTScore evaluates text generation models' performance by calculating precision, recall, and F1 score
+    based on BERT contextual embeddings.
+
+    ### Test Mechanism
+
+    The function starts by extracting the true and predicted values from the provided dataset and model. It then
+    initializes the BERTScore evaluator. For each pair of true and predicted texts, the function calculates the
+    BERTScore metrics and compiles them into a dataframe. Histograms and bar charts are generated for each BERTScore
+    metric (Precision, Recall, and F1 Score) to visualize their distribution. Additionally, a table of descriptive
+    statistics (mean, median, standard deviation, minimum, and maximum) is compiled for each metric, providing a
+    comprehensive summary of the model's performance.
+
+    ### Signs of High Risk
+
+    - Consistently low scores across BERTScore metrics could indicate poor quality in the generated text, suggesting
+    that the model fails to capture the essential content of the reference texts.
     - Low precision scores might suggest that the generated text contains a lot of redundant or irrelevant information.
     - Low recall scores may indicate that important information from the reference text is being omitted.
-    - An imbalanced performance between precision and recall, reflected by a low F1 Score, could signal issues in the model's ability
-    to balance informativeness and conciseness.
+    - An imbalanced performance between precision and recall, reflected by a low F1 Score, could signal issues in the
+    model's ability to balance informativeness and conciseness.
 
-    **Strengths:**
-    - Provides a multifaceted evaluation of text quality through different BERTScore metrics, offering a detailed view of model performance.
-    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the scores.
+    ### Strengths
+
+    - Provides a multifaceted evaluation of text quality through different BERTScore metrics, offering a detailed view
+    of model performance.
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the
+    scores.
     - Descriptive statistics offer a concise summary of the model's strengths and weaknesses in generating text.
 
-    **Limitations:**
-    - BERTScore relies on the contextual embeddings from BERT models, which may not fully capture all nuances of text similarity.
+    ### Limitations
+
+    - BERTScore relies on the contextual embeddings from BERT models, which may not fully capture all nuances of text
+    similarity.
     - The evaluation relies on the availability of high-quality reference texts, which may not always be obtainable.
-    - While useful for comparison, BERTScore metrics alone do not provide a complete assessment of a model's performance and should be
-    supplemented with other metrics and qualitative analysis.
+    - While useful for comparison, BERTScore metrics alone do not provide a complete assessment of a model's
+    performance and should be supplemented with other metrics and qualitative analysis.
     """
 
     # Extract true and predicted values
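
A hedged sketch of the BERTScore computation the docstring describes, using the Hugging Face `evaluate` wrapper; the reference/prediction pairs are illustrative and the call requires the `bert-score` package plus a model download on first use.

```python
# Illustrative BERTScore computation and summary statistics (not the packaged test).
import evaluate
import pandas as pd

references = ["The cat sits on the mat.", "It is raining today."]
predictions = ["A cat is sitting on the mat.", "Today it rains."]

bertscore = evaluate.load("bertscore")  # requires the `bert-score` package
result = bertscore.compute(predictions=predictions, references=references, lang="en")

scores = pd.DataFrame(
    {"Precision": result["precision"], "Recall": result["recall"], "F1 Score": result["f1"]}
)
print(scores.describe())  # mean, std, min, max per metric
```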

validmind/tests/model_validation/BleuScore.py
@@ -16,39 +16,45 @@ def BleuScore(dataset, model):
     Evaluates the quality of machine-generated text using BLEU metrics and visualizes the results through histograms
     and bar charts, alongside compiling a comprehensive table of descriptive statistics for BLEU scores.
 
-    **Purpose:**
+    ### Purpose
+
     This function is designed to assess the quality of text generated by machine learning models using the BLEU metric.
     BLEU, which stands for Bilingual Evaluation Understudy, is a metric used to evaluate the overlap of n-grams between
     the machine-generated text and reference texts. This evaluation is crucial for tasks such as text summarization,
     machine translation, and text generation, where the goal is to produce text that accurately reflects the content
     and meaning of human-crafted references.
 
-    **Test Mechanism:**
-    The function starts by extracting the true and predicted values from the provided dataset and model. It then initializes
-    the BLEU evaluator. For each pair of true and predicted texts, the function calculates the BLEU scores and compiles them
-    into a dataframe. Histograms and bar charts are generated for the BLEU scores to visualize their distribution. Additionally,
-    a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum) is compiled for the BLEU scores,
-    providing a comprehensive summary of the model's performance.
+    ### Test Mechanism
+
+    The function starts by extracting the true and predicted values from the provided dataset and model. It then
+    initializes the BLEU evaluator. For each pair of true and predicted texts, the function calculates the BLEU scores
+    and compiles them into a dataframe. Histograms and bar charts are generated for the BLEU scores to visualize their
+    distribution. Additionally, a table of descriptive statistics (mean, median, standard deviation, minimum, and
+    maximum) is compiled for the BLEU scores, providing a comprehensive summary of the model's performance.
+
+    ### Signs of High Risk
 
-    **Signs of High Risk:**
-    - Consistently low BLEU scores could indicate poor quality in the generated text, suggesting that the model fails to capture
-    the essential content of the reference texts.
+    - Consistently low BLEU scores could indicate poor quality in the generated text, suggesting that the model fails
+    to capture the essential content of the reference texts.
     - Low precision scores might suggest that the generated text contains a lot of redundant or irrelevant information.
    - Low recall scores may indicate that important information from the reference text is being omitted.
-    - An imbalanced performance between precision and recall, reflected by a low BLEU score, could signal issues in the model's
-    ability to balance informativeness and conciseness.
+    - An imbalanced performance between precision and recall, reflected by a low BLEU score, could signal issues in the
+    model's ability to balance informativeness and conciseness.
+
+    ### Strengths
 
-    **Strengths:**
     - Provides a straightforward and widely-used evaluation of text quality through BLEU scores.
-    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the scores.
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the
+    scores.
     - Descriptive statistics offer a concise summary of the model's strengths and weaknesses in generating text.
 
-    **Limitations:**
-    - BLEU metrics primarily focus on n-gram overlap and may not fully capture semantic coherence, fluency, or grammatical quality
-    of the text.
+    ### Limitations
+
+    - BLEU metrics primarily focus on n-gram overlap and may not fully capture semantic coherence, fluency, or
+    grammatical quality of the text.
     - The evaluation relies on the availability of high-quality reference texts, which may not always be obtainable.
-    - While useful for comparison, BLEU scores alone do not provide a complete assessment of a model's performance and should be
-    supplemented with other metrics and qualitative analysis.
+    - While useful for comparison, BLEU scores alone do not provide a complete assessment of a model's performance and
+    should be supplemented with other metrics and qualitative analysis.
     """
 
     # Extract true and predicted values
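
In the same spirit, a per-row BLEU table could be sketched with the Hugging Face `evaluate` BLEU metric; the data is illustrative, and very short sentences can legitimately score 0 because of the default 4-gram requirement.

```python
# Illustrative per-row BLEU scores with descriptive statistics (not the packaged test).
import evaluate
import pandas as pd

references = ["the cat is on the mat", "there is a small cat on the mat"]
predictions = ["the cat sat on the mat", "a small cat is on the mat"]

bleu = evaluate.load("bleu")
rows = []
for pred, ref in zip(predictions, references):
    # each prediction is scored against its own reference text
    score = bleu.compute(predictions=[pred], references=[[ref]])["bleu"]
    rows.append({"prediction": pred, "bleu": score})

scores = pd.DataFrame(rows)
print(scores.describe())
```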