validmind-2.5.8-py3-none-any.whl → validmind-2.5.18-py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (233)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +80 -119
  3. validmind/ai/test_result_description/config.yaml +29 -0
  4. validmind/ai/test_result_description/context.py +73 -0
  5. validmind/ai/test_result_description/image_processing.py +124 -0
  6. validmind/ai/test_result_description/system.jinja +39 -0
  7. validmind/ai/test_result_description/user.jinja +25 -0
  8. validmind/api_client.py +89 -43
  9. validmind/client.py +2 -2
  10. validmind/client_config.py +11 -14
  11. validmind/datasets/credit_risk/__init__.py +1 -0
  12. validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
  13. validmind/datasets/credit_risk/lending_club_bias.py +142 -0
  14. validmind/datasets/regression/fred_timeseries.py +67 -138
  15. validmind/template.py +1 -0
  16. validmind/test_suites/__init__.py +0 -2
  17. validmind/test_suites/statsmodels_timeseries.py +1 -1
  18. validmind/test_suites/summarization.py +0 -1
  19. validmind/test_suites/time_series.py +0 -43
  20. validmind/tests/__types__.py +14 -15
  21. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  22. validmind/tests/data_validation/ADF.py +31 -24
  23. validmind/tests/data_validation/AutoAR.py +9 -9
  24. validmind/tests/data_validation/AutoMA.py +23 -16
  25. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  26. validmind/tests/data_validation/AutoStationarity.py +21 -16
  27. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  28. validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +34 -34
  29. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +85 -124
  30. validmind/tests/data_validation/ClassImbalance.py +15 -12
  31. validmind/tests/data_validation/DFGLSArch.py +19 -13
  32. validmind/tests/data_validation/DatasetDescription.py +17 -11
  33. validmind/tests/data_validation/DatasetSplit.py +7 -5
  34. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  35. validmind/tests/data_validation/Duplicates.py +33 -25
  36. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  37. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  38. validmind/tests/data_validation/HighCardinality.py +19 -12
  39. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  40. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  41. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  42. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  43. validmind/tests/data_validation/JarqueBera.py +70 -0
  44. validmind/tests/data_validation/KPSS.py +34 -29
  45. validmind/tests/data_validation/LJungBox.py +66 -0
  46. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  47. validmind/tests/data_validation/MissingValues.py +32 -27
  48. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  49. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  50. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  51. validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
  52. validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
  53. validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
  54. validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
  55. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  56. validmind/tests/data_validation/RunsTest.py +72 -0
  57. validmind/tests/data_validation/ScatterPlot.py +63 -78
  58. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  59. validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +35 -30
  60. validmind/tests/data_validation/Skewness.py +35 -37
  61. validmind/tests/data_validation/SpreadPlot.py +35 -35
  62. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  63. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  64. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  65. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  66. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  67. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  68. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  69. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  70. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  71. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  72. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  73. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  74. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  75. validmind/tests/data_validation/UniqueRows.py +11 -6
  76. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  77. validmind/tests/data_validation/WOEBinTable.py +35 -30
  78. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  79. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  80. validmind/tests/data_validation/nlp/Hashtags.py +42 -40
  81. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  82. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  83. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  84. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  85. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  86. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  87. validmind/tests/data_validation/nlp/TextDescription.py +39 -36
  88. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  89. validmind/tests/decorator.py +81 -42
  90. validmind/tests/model_validation/BertScore.py +36 -27
  91. validmind/tests/model_validation/BleuScore.py +25 -19
  92. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  93. validmind/tests/model_validation/ContextualRecall.py +38 -13
  94. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  95. validmind/tests/model_validation/MeteorScore.py +46 -33
  96. validmind/tests/model_validation/ModelMetadata.py +32 -64
  97. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  98. validmind/tests/model_validation/RegardScore.py +30 -14
  99. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  100. validmind/tests/model_validation/RougeScore.py +36 -30
  101. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  102. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  103. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  104. validmind/tests/model_validation/TokenDisparity.py +31 -23
  105. validmind/tests/model_validation/ToxicityScore.py +26 -17
  106. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  107. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  108. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  109. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  110. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  111. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  112. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  113. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  114. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  115. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  116. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  117. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  118. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  119. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  120. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  121. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  122. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  123. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  124. validmind/tests/model_validation/ragas/AspectCritique.py +12 -6
  125. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  126. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  127. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  128. validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
  129. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  130. validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
  131. validmind/tests/model_validation/ragas/utils.py +6 -0
  132. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  133. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  134. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  135. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  136. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  137. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  138. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  139. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  140. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  141. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  142. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  143. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  144. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  145. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  146. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  147. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  148. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  149. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  150. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +32 -26
  151. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  152. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  153. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  154. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  155. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  156. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  157. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -94
  158. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  159. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
  160. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +66 -5
  161. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  162. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  163. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  164. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  165. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  166. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  167. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +59 -32
  168. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  169. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  170. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  171. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +86 -119
  172. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  173. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  174. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  175. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  176. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  177. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  178. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  179. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  180. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  181. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  182. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  183. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  184. validmind/tests/prompt_validation/Bias.py +14 -11
  185. validmind/tests/prompt_validation/Clarity.py +16 -14
  186. validmind/tests/prompt_validation/Conciseness.py +7 -5
  187. validmind/tests/prompt_validation/Delimitation.py +23 -22
  188. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  189. validmind/tests/prompt_validation/Robustness.py +12 -10
  190. validmind/tests/prompt_validation/Specificity.py +13 -11
  191. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  192. validmind/tests/run.py +68 -23
  193. validmind/unit_metrics/__init__.py +81 -144
  194. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  195. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  196. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  197. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  198. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  199. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  200. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  201. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  202. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  203. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  204. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  205. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  206. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  207. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  208. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  209. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  210. validmind/utils.py +4 -0
  211. validmind/vm_models/dataset/dataset.py +2 -0
  212. validmind/vm_models/figure.py +5 -0
  213. validmind/vm_models/test/metric.py +1 -0
  214. validmind/vm_models/test/result_wrapper.py +143 -158
  215. validmind/vm_models/test/threshold_test.py +1 -0
  216. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
  217. validmind-2.5.18.dist-info/RECORD +324 -0
  218. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  219. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  220. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  221. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  222. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  223. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  224. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  225. validmind/tests/model_validation/statsmodels/JarqueBera.py +0 -73
  226. validmind/tests/model_validation/statsmodels/LJungBox.py +0 -66
  227. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  228. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  229. validmind/tests/model_validation/statsmodels/RunsTest.py +0 -71
  230. validmind-2.5.8.dist-info/RECORD +0 -318
  231. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
  232. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
  233. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0
@@ -13,31 +13,40 @@ from validmind import tags, tasks
13
13
  @tasks("text_classification", "text_summarization")
14
14
  def ToxicityScore(dataset, model):
15
15
  """
16
- Computes and visualizes the toxicity score for input text, true text, and predicted text, assessing content quality and potential risk.
16
+ Assesses the toxicity levels of texts generated by NLP models to identify and mitigate harmful or offensive content.
17
17
 
18
- **Purpose:**
19
- The ToxicityScore metric is designed to evaluate the toxicity levels of texts generated by models. This is crucial for
20
- identifying and mitigating harmful or offensive content in machine-generated texts.
18
+ ### Purpose
21
19
 
22
- **Test Mechanism:**
23
- The function starts by extracting the input, true, and predicted values from the provided dataset and model. The toxicity score is
24
- computed for each text using a preloaded `toxicity` evaluation tool. The scores are compiled into dataframes, and histograms
25
- and bar charts are generated to visualize the distribution of toxicity scores. Additionally, a table of descriptive statistics
26
- (mean, median, standard deviation, minimum, and maximum) is compiled for the toxicity scores, providing a comprehensive
27
- summary of the model's performance.
20
+ The ToxicityScore metric is designed to evaluate the toxicity levels of texts generated by models. This is crucial
21
+ for identifying and mitigating harmful or offensive content in machine-generated texts.
22
+
23
+ ### Test Mechanism
24
+
25
+ The function starts by extracting the input, true, and predicted values from the provided dataset and model. The
26
+ toxicity score is computed for each text using a preloaded `toxicity` evaluation tool. The scores are compiled into
27
+ dataframes, and histograms and bar charts are generated to visualize the distribution of toxicity scores.
28
+ Additionally, a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum) is
29
+ compiled for the toxicity scores, providing a comprehensive summary of the model's performance.
30
+
31
+ ### Signs of High Risk
28
32
 
29
- **Signs of High Risk:**
30
33
  - Drastic spikes in toxicity scores indicate potentially toxic content within the associated text segment.
31
- - Persistent high toxicity scores across multiple texts may suggest systemic issues in the model's text generation process.
34
+ - Persistent high toxicity scores across multiple texts may suggest systemic issues in the model's text generation
35
+ process.
32
36
 
33
- **Strengths:**
34
- - Provides a clear evaluation of toxicity levels in generated texts, helping to ensure content safety and appropriateness.
35
- - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of toxicity scores.
37
+ ### Strengths
38
+
39
+ - Provides a clear evaluation of toxicity levels in generated texts, helping to ensure content safety and
40
+ appropriateness.
41
+ - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of
42
+ toxicity scores.
36
43
  - Descriptive statistics offer a concise summary of the model's performance in generating non-toxic texts.
37
44
 
38
- **Limitations:**
45
+ ### Limitations
46
+
39
47
  - The accuracy of the toxicity scores is contingent upon the underlying `toxicity` tool.
40
- - The scores provide a broad overview but do not specify which portions or tokens of the text are responsible for high toxicity.
48
+ - The scores provide a broad overview but do not specify which portions or tokens of the text are responsible for
49
+ high toxicity.
41
50
  - Supplementary, in-depth analysis might be needed for granular insights.
42
51
  """
43
52
 
@@ -12,38 +12,42 @@ class ClusterDistribution(Metric):
12
12
  """
13
13
  Assesses the distribution of text embeddings across clusters produced by a model using KMeans clustering.
14
14
 
15
- **Purpose:** The purpose of this metric is to analyze the distribution of the clusters produced by a text embedding
16
- model. By dividing the text embeddings into different clusters, we can understand how the model is grouping or
17
- categorizing the text data. This aids in visualizing the organization and segregation of the data and thus gives an
15
+ ### Purpose
16
+
17
+ The purpose of this metric is to analyze the distribution of the clusters produced by a text embedding model. By
18
+ dividing the text embeddings into different clusters, we can understand how the model is grouping or categorizing
19
+ the text data. This aids in visualizing the organization and segregation of the data, thereby giving an
18
20
  understanding of how the model is processing the data.
19
21
 
20
- **Test Mechanism:** The metric applies the KMeans clustering algorithm on the predictions made by the model on the
21
- testing dataset and divides the text embeddings into a pre-defined number of clusters. By default, this number is
22
- set to 5 but can be customized as per requirements. The output of this test is a histogram plot that shows the
23
- distribution of embeddings across these clusters.
22
+ ### Test Mechanism
23
+
24
+ The metric applies the KMeans clustering algorithm on the predictions made by the model on the testing dataset and
25
+ divides the text embeddings into a pre-defined number of clusters. By default, this number is set to 5 but can be
26
+ customized as per requirements. The output of this test is a histogram plot that shows the distribution of
27
+ embeddings across these clusters.
24
28
 
25
- **Signs of High Risk:**
29
+ ### Signs of High Risk
26
30
 
27
- - If the embeddings are skewed towards one or two clusters, that would indicate that the model is not effectively
31
+ - If the embeddings are skewed towards one or two clusters, it indicates that the model is not effectively
28
32
  differentiating the various categories in the text data.
29
33
  - Uniform distribution of the embeddings across the clusters might show a lack of proper categorization.
30
34
 
31
- **Strengths:**
35
+ ### Strengths
32
36
 
33
- - Great tool to visualize the text data categorization by the model. It provides a way to assess if the model is
34
- distinguishing the categories effectively or not.
35
- - It is flexible with the number of clusters (classes), so can be used on various types of data regardless of the
36
- number of categories.
37
+ - Great tool to visualize the text data categorization by the model.
38
+ - Provides a way to assess if the model is distinguishing the categories effectively or not.
39
+ - Flexible with the number of clusters, so it can be used on various types of data regardless of the number of
40
+ categories.
37
41
 
38
- **Limitations:**
42
+ ### Limitations
39
43
 
40
- - The success or failure of this test is based on visual interpretation, which might not be enough for making solid
44
+ - Success or failure of this test is based on visual interpretation, which might not be enough for making solid
41
45
  conclusions or determining the exact points of failure.
42
- - It assumes that the division of text embeddings across clusters should ideally be homogeneous, which might not
46
+ - Assumes that the division of text embeddings across clusters should ideally be homogeneous, which might not
43
47
  always be the case depending on the nature of the text data.
44
- - It only applies to text embedding models, reducing its utility across various ML models.
45
- - This test uses the KMeans clustering algorithm, which assumes that clusters are convex and isotropic. Thus, this
46
- test may not work as intended if the true clusters in the data are not of this shape.
48
+ - Only applies to text embedding models, reducing its utility across various ML models.
49
+ - Uses the KMeans clustering algorithm, which assumes that clusters are convex and isotropic, and may not work as
50
+ intended if the true clusters in the data are not of this shape.
47
51
  """
48
52
 
49
53
  name = "Text Embeddings Cluster Distribution"
@@ -16,45 +16,48 @@ from validmind import tags, tasks
16
16
  @tasks("text_qa", "text_generation", "text_summarization")
17
17
  def CosineSimilarityComparison(dataset, models):
18
18
  """
19
- Computes pairwise cosine similarities between model embeddings and visualizes the results through bar charts,
20
- alongside compiling a comprehensive table of descriptive statistics for each model pair.
19
+ Assesses the similarity between embeddings generated by different models using Cosine Similarity, providing both
20
+ statistical and visual insights.
21
21
 
22
- **Purpose:**
23
- This function is designed to analyze and compare the embeddings produced by different models using Cosine Similarity.
24
- Cosine Similarity, a measure calculating the cosine of the angle between two vectors, is widely used to determine
25
- the alignment or similarity between vectors in high-dimensional spaces, such as text embeddings. This analysis helps
26
- to understand how similar or different the models' predictions are in terms of embedding generation.
22
+ ### Purpose
27
23
 
28
- **Test Mechanism:**
29
- The function begins by computing the embeddings for each model using the provided dataset. It then calculates the
30
- cosine similarity for every possible pair of models, generating a similarity matrix. Each element of this matrix
31
- represents the cosine similarity between two model embeddings. The function flattens this matrix and uses it to
32
- create a bar chart for each model pair, visualizing their similarity distribution. Additionally, it compiles a table
33
- with descriptive statistics (mean, median, standard deviation, minimum, and maximum) for the similarities of each
34
- pair, including a reference to the compared models.
24
+ The Cosine Similarity Comparison test aims to analyze and compare the embeddings produced by different models using
25
+ Cosine Similarity. Cosine Similarity is a measure that calculates the cosine of the angle between two vectors,
26
+ widely used to determine the alignment or similarity between high-dimensional vectors, such as text embeddings.
27
+ This analysis helps understand how similar or different the models' predictions are in terms of embedding
28
+ generation.
35
29
 
36
- **Signs of High Risk:**
30
+ ### Test Mechanism
31
+
32
+ The function starts by computing the embeddings for each model using the provided dataset. It then calculates the
33
+ cosine similarity for every possible pair of models, generating a similarity matrix wherein each element represents
34
+ the cosine similarity between two model embeddings. This matrix is flattened to create a bar chart for each model
35
+ pair, visualizing their similarity distribution. Additionally, a table with descriptive statistics (mean, median,
36
+ standard deviation, minimum, and maximum) for the similarities of each pair is compiled, referencing the compared
37
+ models.
38
+
39
+ ### Signs of High Risk
37
40
 
38
41
  - A high concentration of cosine similarity values close to 1 could suggest that the models are producing very
39
- similar embeddings, which could be a sign of redundancy or lack of diversity in model training or design.
40
- - Conversely, very low similarity values near -1 indicate strong dissimilarity, potentially highlighting models
41
- that are too divergent, possibly focusing on very different features of the data.
42
+ similar embeddings, indicating redundancy or lack of diversity in model training or design.
43
+ - Very low similarity values near -1 highlight strong dissimilarity, suggesting models that are too divergent and
44
+ possibly focusing on very different features of the data.
42
45
 
43
- **Strengths:**
46
+ ### Strengths
44
47
 
45
48
  - Enables detailed comparisons between multiple models' embedding strategies through visual and statistical means.
46
- - Helps identify which models produce similar or dissimilar embeddings, useful for tasks requiring model diversity.
49
+ - Identifies models producing similar or dissimilar embeddings, useful for tasks requiring model diversity.
47
50
  - Provides quantitative and visual feedback on the degree of similarity, enhancing interpretability of model
48
- behavior in embedding spaces.
51
+ behavior in embedding spaces.
49
52
 
50
- **Limitations:**
53
+ ### Limitations
51
54
 
52
- - The analysis is confined to the comparison of embeddings and does not assess the overall performance of the models
53
- in terms of their primary tasks (e.g., classification, regression).
55
+ - The analysis is confined to the comparison of embeddings and does not assess the overall performance of the
56
+ models in terms of their primary tasks (e.g., classification, regression).
54
57
  - Assumes that the models are suitable for generating comparable embeddings, which might not always be the case,
55
- especially across different types of models.
56
- - Interpretation of results is heavily dependent on the understanding of Cosine Similarity and the nature of high-dimensional
57
- embedding spaces.
58
+ especially across different types of models.
59
+ - Interpretation of results is heavily dependent on the understanding of Cosine Similarity and the nature of
60
+ high-dimensional embedding spaces.
58
61
  """
59
62
 
60
63
  figures = []
@@ -13,32 +13,34 @@ class CosineSimilarityDistribution(Metric):
13
13
  Assesses the similarity between predicted text embeddings from a model using a Cosine Similarity distribution
14
14
  histogram.
15
15
 
16
- **Purpose:**
16
+ ### Purpose
17
+
17
18
  This metric is used to assess the degree of similarity between the embeddings produced by a text embedding model
18
19
  using Cosine Similarity. Cosine Similarity is a measure that calculates the cosine of the angle between two
19
20
  vectors. This metric is predominantly used in text analysis — in this case, to determine how closely the predicted
20
21
  text embeddings align with one another.
21
22
 
22
- **Test Mechanism:**
23
+ ### Test Mechanism
24
+
23
25
  The implementation starts by computing the cosine similarity between the predicted values of the model's test
24
26
  dataset. These cosine similarity scores are then plotted on a histogram with 100 bins to visualize the distribution
25
27
  of the scores. The x-axis of the histogram represents the computed Cosine Similarity.
26
28
 
27
- **Signs of High Risk:**
29
+ ### Signs of High Risk
28
30
 
29
31
  - If the cosine similarity scores cluster close to 1 or -1, it may indicate overfitting, as the model's predictions
30
32
  are almost perfectly aligned. This could suggest that the model is not generalizable.
31
33
  - A broad spread of cosine similarity scores across the histogram may indicate a potential issue with the model's
32
34
  ability to generate consistent embeddings.
33
35
 
34
- **Strengths:**
36
+ ### Strengths
35
37
 
36
38
  - Provides a visual representation of the model's performance which is easily interpretable.
37
39
  - Can help identify patterns, trends, and outliers in the model's alignment of predicted text embeddings.
38
40
  - Useful in measuring the similarity between vectors in multi-dimensional space, important in the case of text
39
41
  embeddings.
40
42
 
41
- **Limitations:**
43
+ ### Limitations
42
44
 
43
45
  - Only evaluates the similarity between outputs. It does not provide insight into the model's ability to correctly
44
46
  classify or predict.
@@ -23,33 +23,42 @@ def CosineSimilarityHeatmap(
23
23
  """
24
24
  Generates an interactive heatmap to visualize the cosine similarities among embeddings derived from a given model.
25
25
 
26
- **Purpose:**
27
- This function is designed to visually analyze the cosine similarities of embeddings from a specific model.
28
- Cosine similarity, a measure of the cosine of the angle between two vectors, aids in understanding the
29
- orientation and similarity of vectors in multi-dimensional space. This is particularly valuable for exploring
30
- text embeddings and their relative similarities among documents, words, or phrases.
31
-
32
- **Test Mechanism:**
33
- The function operates through a sequence of steps to visualize cosine similarities. Initially,
34
- embeddings are extracted for each dataset entry using the designated model. Following this,
35
- the function computes the pairwise cosine similarities among these embeddings. The computed similarities
36
- are then displayed in an interactive heatmap.
37
-
38
- **Signs of High Risk:**
39
- - High similarity values (close to 1) across the heatmap might not always be indicative of a risk;
40
- however, in contexts where diverse perspectives or features are desired, this could suggest a lack of
41
- diversity in the model's learning process or potential redundancy.
26
+ ### Purpose
27
+
28
+ This function is designed to visually analyze the cosine similarities of embeddings from a specific model. Cosine
29
+ similarity, a measure of the cosine of the angle between two vectors, aids in understanding the orientation and
30
+ similarity of vectors in multi-dimensional space. This is particularly valuable for exploring text embeddings and
31
+ their relative similarities among documents, words, or phrases.
32
+
33
+ ### Test Mechanism
34
+
35
+ The function operates through a sequence of steps to visualize cosine similarities. Initially, embeddings are
36
+ extracted for each dataset entry using the designated model. Following this, the function computes the pairwise
37
+ cosine similarities among these embeddings. The computed similarities are then displayed in an interactive heatmap.
38
+
39
+ ### Signs of High Risk
40
+
41
+ - High similarity values (close to 1) across the heatmap might not always be indicative of a risk; however, in
42
+ contexts where diverse perspectives or features are desired, this could suggest a lack of diversity in the model's
43
+ learning process or potential redundancy.
42
44
  - Similarly, low similarity values (close to -1) indicate strong dissimilarity, which could be beneficial in
43
45
  scenarios demanding diverse outputs. However, in cases where consistency is needed, these low values might
44
- highlight that the model is unable to capture a coherent set of features from the data, potentially leading to poor performance on related tasks.
46
+ highlight that the model is unable to capture a coherent set of features from the data, potentially leading to poor
47
+ performance on related tasks.
48
+
49
+ ### Strengths
50
+
51
+ - Provides an interactive and intuitive visual representation of embedding similarities, facilitating easy
52
+ exploration and analysis.
53
+ - Allows customization of visual elements such as title, axis labels, and color scale to suit specific analytical
54
+ needs and preferences.
45
55
 
46
- **Strengths:**
47
- - Provides an interactive and intuitive visual representation of embedding similarities, facilitating easy exploration and analysis.
48
- - Allows customization of visual elements such as title, axis labels, and color scale to suit specific analytical needs and preferences.
56
+ ### Limitations
49
57
 
50
- **Limitations:**
51
- - As the number of embeddings increases, the effectiveness of the heatmap might diminish due to overcrowding, making it hard to discern detailed similarities.
52
- - The interpretation of the heatmap heavily relies on the appropriate setting of the color scale, as incorrect settings can lead to misleading visual interpretations.
58
+ - As the number of embeddings increases, the effectiveness of the heatmap might diminish due to overcrowding,
59
+ making it hard to discern detailed similarities.
60
+ - The interpretation of the heatmap heavily relies on the appropriate setting of the color scale, as incorrect
61
+ settings can lead to misleading visual interpretations.
53
62
  """
54
63
 
55
64
  embeddings = np.stack(dataset.y_pred(model))
@@ -13,26 +13,28 @@ class DescriptiveAnalytics(Metric):
13
13
  Evaluates statistical properties of text embeddings in an ML model via mean, median, and standard deviation
14
14
  histograms.
15
15
 
16
- **1. Purpose:**
16
+ ### Purpose
17
+
17
18
  This metric, Descriptive Analytics for Text Embeddings Models, is employed to comprehend the fundamental properties
18
19
  and statistical characteristics of the embeddings in a Machine Learning model. It measures the dimensionality as
19
20
  well as the statistical distributions of embedding values including the mean, median, and standard deviation.
20
21
 
21
- **2. Test Mechanism:**
22
+ ### Test Mechanism
23
+
22
24
  The test mechanism involves using the 'DescriptiveAnalytics' class provided in the code which includes the 'run'
23
25
  function. This function computes three statistical measures - mean, median, and standard deviation of the test
24
26
  predictions from the model. It generates and caches three separate histograms showing the distribution of these
25
27
  measures. Each histogram visualizes the measure's distribution across the embedding values. Therefore, the method
26
28
  does not utilize a grading scale or threshold; it is fundamentally a visual data exploration tool.
27
29
 
28
- **3. Signs of High Risk:**
30
+ ### Signs of High Risk
29
31
 
30
32
  - Abnormal patterns or values in the distributions of the statistical measures. This may include skewed
31
33
  distributions or a significant number of outliers.
32
34
  - Very high standard deviation values which indicate a high degree of variability in the data.
33
35
  - The mean and median values are vastly different, suggesting skewed data.
34
36
 
35
- **4. Strengths:**
37
+ ### Strengths
36
38
 
37
39
  - Provides a visual and quantifiable understanding of the embeddings' statistical characteristics, allowing for a
38
40
  comprehensive evaluation.
@@ -41,7 +43,7 @@ class DescriptiveAnalytics(Metric):
41
43
  - It considers three key statistical measures (mean, median, and standard deviation), offering a more well-rounded
42
44
  understanding of the data.
43
45
 
44
- **5. Limitations:**
46
+ ### Limitations
45
47
 
46
48
  - The method does not offer an explicit measure of model performance or accuracy, as it mainly focuses on
47
49
  understanding data properties.
@@ -12,24 +12,28 @@ class EmbeddingsVisualization2D(Metric):
12
12
  """
13
13
  Visualizes a 2D representation of text embeddings generated by a model using the t-SNE technique.
14
14
 
15
- **1. Purpose:** The objective of this metric is to provide a visual 2D representation of the embeddings created by
16
- a text embedding machine learning model. By doing so, it aids in analyzing the embedding space created by the model
17
- and helps in understanding how the learned embeddings are distributed and how they relate to each other.
15
+ ### Purpose
18
16
 
19
- **2. Test Mechanism:** This metric uses the t-Distributed Stochastic Neighbor Embedding (t-SNE) technique, which is
20
- a tool for visualizing high-dimensional data by reducing the dimensionality to 2. The perplexity parameter for
21
- t-SNE is set to the value provided by the user. If the input perplexity value is greater than the number of
22
- samples, the perplexity is adjusted to be one less than the number of samples. Following the reduction of
23
- dimensionality, a scatter plot is produced depicting each embedding as a data point in the visualized 2D plane.
17
+ The objective of this metric is to provide a visual 2D representation of the embeddings created by a text embedding
18
+ machine learning model. By doing so, it aids in analyzing the embedding space created by the model and helps in
19
+ understanding how the learned embeddings are distributed and how they relate to each other.
24
20
 
25
- **3. Signs of High Risk:**
21
+ ### Test Mechanism
22
+
23
+ This metric uses the t-Distributed Stochastic Neighbor Embedding (t-SNE) technique, which is a tool for visualizing
24
+ high-dimensional data by reducing the dimensionality to 2. The perplexity parameter for t-SNE is set to the value
25
+ provided by the user. If the input perplexity value is greater than the number of samples, the perplexity is
26
+ adjusted to be one less than the number of samples. Following the reduction of dimensionality, a scatter plot is
27
+ produced depicting each embedding as a data point in the visualized 2D plane.
28
+
29
+ ### Signs of High Risk
26
30
 
27
31
  - If the embeddings are highly concentrated in a specific region of the plane, it might indicate that the model is
28
32
  not learning diverse representations of the text.
29
33
  - Wide gaps or partitions in the visualization could suggest that the model is over-segmenting in the embedding
30
34
  space and may lead to poor generalization.
31
35
 
32
- **4. Strengths:**
36
+ ### Strengths
33
37
 
34
38
  - Offers a powerful visual tool that can assist in understanding and interpreting high-dimensional embeddings,
35
39
  which could otherwise be difficult to visualize.
@@ -37,7 +41,7 @@ class EmbeddingsVisualization2D(Metric):
37
41
  - t-SNE visualization helps in focusing on local structures and preserves the proximity of points that are close
38
42
  together in the original high-dimensional space.
39
43
 
40
- **5. Limitations:**
44
+ ### Limitations
41
45
 
42
46
  - The reduction of high-dimensional data to 2D can result in loss of some information, which may lead to
43
47
  misinterpretation.
@@ -16,41 +16,41 @@ from validmind import tags, tasks
16
16
  @tasks("text_qa", "text_generation", "text_summarization")
17
17
  def EuclideanDistanceComparison(dataset, models):
18
18
  """
19
- Computes pairwise Euclidean distances between model embeddings and visualizes the results through bar charts,
20
- alongside compiling a comprehensive table of descriptive statistics for each model pair.
21
-
22
- **Purpose:**
23
- This function is designed to analyze and compare the embeddings produced by different models using Euclidean Distance.
24
- Euclidean Distance measures the "ordinary" straight-line distance between two points in Euclidean space, providing a
25
- straightforward metric to assess the absolute differences between vectors. This analysis helps in understanding the
26
- magnitude of dissimilarity between the embeddings generated by different models, which is crucial for tasks that require
27
- distinctive model responses or feature separations.
28
-
29
- **Test Mechanism:**
30
- The function begins by computing the embeddings for each model using the provided dataset. It then calculates the
31
- Euclidean distance for every possible pair of models, generating a distance matrix. Each element of this matrix
32
- represents the Euclidean distance between two model embeddings. The function flattens this matrix and uses it to
33
- create a bar chart for each model pair, visualizing their distance distribution. Additionally, it compiles a table
34
- with descriptive statistics (mean, median, standard deviation, minimum, and maximum) for the distances of each
35
- pair, including a reference to the compared models.
36
-
37
- **Signs of High Risk:**
38
-
39
- - Very high distance values could suggest that the models are focusing on completely different features or aspects
40
- of the data, which might be undesirable for ensemble methods or similar applications where some degree of
41
- consensus is expected.
42
- - Extremely low distances across different models might indicate redundancy, suggesting that the models are not
43
- providing diverse enough perspectives on the data.
44
-
45
- **Strengths:**
19
+ Assesses and visualizes the dissimilarity between model embeddings using Euclidean distance, providing insights
20
+ into model behavior and potential redundancy or diversity.
21
+
22
+ ### Purpose
23
+
24
+ The Euclidean Distance Comparison test aims to analyze and compare the embeddings produced by different models. By
25
+ measuring the Euclidean distance between vectors in Euclidean space, it provides a metric to assess the magnitude
26
+ of dissimilarity between embeddings created by different models. This is crucial for tasks that require models to
27
+ produce distinct responses or feature separations.
28
+
29
+ ### Test Mechanism
30
+
31
+ The test computes the embeddings for each model using the provided dataset and calculates the Euclidean distance
32
+ for every possible pair of models. It generates a distance matrix where each element represents the Euclidean
33
+ distance between two model embeddings. This matrix is then visualized through bar charts, showing the distance
34
+ distribution for each model pair. Additionally, it compiles a table with descriptive statistics such as mean,
35
+ median, standard deviation, minimum, and maximum distances for each model pair, including references to the
36
+ compared models.
37
+
38
+ ### Signs of High Risk
39
+
40
+ - Very high distance values could suggest that models are focusing on entirely different features or aspects of the
41
+ data, which might be undesirable for ensemble methods or when a consensus is required.
42
+ - Extremely low distances across different models might indicate redundancy, suggesting that models are not
43
+ providing diverse enough perspectives on the data.
44
+
45
+ ### Strengths
46
46
 
47
47
  - Provides a clear and quantifiable measure of how different the embeddings from various models are.
48
48
  - Useful for identifying outlier models or those that behave significantly differently from others in a group.
49
49
 
50
- **Limitations:**
50
+ ### Limitations
51
51
 
52
52
  - Euclidean distance can be sensitive to the scale of the data, meaning that preprocessing steps like normalization
53
- might be necessary to ensure meaningful comparisons.
53
+ might be necessary to ensure meaningful comparisons.
54
54
  - Does not consider the orientation or angle between vectors, focusing purely on magnitude differences.
55
55
  """
56
56
 
@@ -23,31 +23,40 @@ def EuclideanDistanceHeatmap(
23
23
  """
24
24
  Generates an interactive heatmap to visualize the Euclidean distances among embeddings derived from a given model.
25
25
 
26
- **Purpose:**
27
- This function visualizes the Euclidean distances between embeddings generated by a model, offering insights into the
28
- absolute differences between data points. Euclidean distance, a fundamental metric in data analysis, measures the
29
- straight-line distance between two points in Euclidean space. It is particularly useful for understanding spatial
30
- relationships and clustering tendencies in high-dimensional data.
31
-
32
- **Test Mechanism:**
33
- The function operates through a streamlined process: firstly, embeddings are extracted for each dataset entry using the specified model.
34
- Subsequently, it computes the pairwise Euclidean distances among these embeddings. The results are then visualized in an interactive heatmap format,
35
- where each cell's color intensity correlates with the distance magnitude between pairs of embeddings, providing a visual assessment of these distances.
36
-
37
- **Signs of High Risk:**
38
- - Uniform Distances: Uniformly low distances across the heatmap might suggest a lack of variability in the data or
39
- model overfitting, where the model fails to distinguish between distinct data points effectively.
40
- - High Variability: Conversely, excessive variability in distances could indicate inconsistent data representation,
41
- potentially leading to unreliable model predictions.
42
-
43
- **Strengths:**
44
- - Provides a direct, intuitive visual representation of distances between embeddings, aiding in the detection of patterns or anomalies.
45
- - Allows customization of visual aspects such as the heatmap's title, axis labels, and color scale, adapting to various analytical needs.
46
-
47
- **Limitations:**
48
- - The interpretation of distances can be sensitive to the scale of data; normalization might be necessary for meaningful analysis.
49
- - Large datasets may lead to dense, cluttered heatmaps, making it difficult to discern individual distances, potentially requiring
50
- techniques like data sampling or dimensionality reduction for clearer visualization.
26
+ ### Purpose
27
+
28
+ This function visualizes the Euclidean distances between embeddings generated by a model, offering insights into
29
+ the absolute differences between data points. Euclidean distance, a fundamental metric in data analysis, measures
30
+ the straight-line distance between two points in Euclidean space. It is particularly useful for understanding
31
+ spatial relationships and clustering tendencies in high-dimensional data.
32
+
33
+ ### Test Mechanism
34
+
35
+ The function operates through a streamlined process: firstly, embeddings are extracted for each dataset entry using
36
+ the specified model. Subsequently, it computes the pairwise Euclidean distances among these embeddings. The results
37
+ are then visualized in an interactive heatmap format, where each cell's color intensity correlates with the
38
+ distance magnitude between pairs of embeddings, providing a visual assessment of these distances.
39
+
40
+ ### Signs of High Risk
41
+
42
+ - Uniformly low distances across the heatmap might suggest a lack of variability in the data or model overfitting,
43
+ where the model fails to distinguish between distinct data points effectively.
44
+ - Excessive variability in distances could indicate inconsistent data representation, potentially leading to
45
+ unreliable model predictions.
46
+
47
+ ### Strengths
48
+
49
+ - Provides a direct, intuitive visual representation of distances between embeddings, aiding in the detection of
50
+ patterns or anomalies.
51
+ - Allows customization of visual aspects such as the heatmap's title, axis labels, and color scale, adapting to
52
+ various analytical needs.
53
+
54
+ ### Limitations
55
+
56
+ - The interpretation of distances can be sensitive to the scale of data; normalization might be necessary for
57
+ meaningful analysis.
58
+ - Large datasets may lead to dense, cluttered heatmaps, making it difficult to discern individual distances,
59
+ potentially requiring techniques like data sampling or dimensionality reduction for clearer visualization.
51
60
  """
52
61
 
53
62
  embeddings = np.stack(dataset.y_pred(model))