validmind-2.5.8-py3-none-any.whl → validmind-2.5.18-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +80 -119
  3. validmind/ai/test_result_description/config.yaml +29 -0
  4. validmind/ai/test_result_description/context.py +73 -0
  5. validmind/ai/test_result_description/image_processing.py +124 -0
  6. validmind/ai/test_result_description/system.jinja +39 -0
  7. validmind/ai/test_result_description/user.jinja +25 -0
  8. validmind/api_client.py +89 -43
  9. validmind/client.py +2 -2
  10. validmind/client_config.py +11 -14
  11. validmind/datasets/credit_risk/__init__.py +1 -0
  12. validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
  13. validmind/datasets/credit_risk/lending_club_bias.py +142 -0
  14. validmind/datasets/regression/fred_timeseries.py +67 -138
  15. validmind/template.py +1 -0
  16. validmind/test_suites/__init__.py +0 -2
  17. validmind/test_suites/statsmodels_timeseries.py +1 -1
  18. validmind/test_suites/summarization.py +0 -1
  19. validmind/test_suites/time_series.py +0 -43
  20. validmind/tests/__types__.py +14 -15
  21. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  22. validmind/tests/data_validation/ADF.py +31 -24
  23. validmind/tests/data_validation/AutoAR.py +9 -9
  24. validmind/tests/data_validation/AutoMA.py +23 -16
  25. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  26. validmind/tests/data_validation/AutoStationarity.py +21 -16
  27. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  28. validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +34 -34
  29. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +85 -124
  30. validmind/tests/data_validation/ClassImbalance.py +15 -12
  31. validmind/tests/data_validation/DFGLSArch.py +19 -13
  32. validmind/tests/data_validation/DatasetDescription.py +17 -11
  33. validmind/tests/data_validation/DatasetSplit.py +7 -5
  34. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  35. validmind/tests/data_validation/Duplicates.py +33 -25
  36. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  37. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  38. validmind/tests/data_validation/HighCardinality.py +19 -12
  39. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  40. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  41. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  42. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  43. validmind/tests/data_validation/JarqueBera.py +70 -0
  44. validmind/tests/data_validation/KPSS.py +34 -29
  45. validmind/tests/data_validation/LJungBox.py +66 -0
  46. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  47. validmind/tests/data_validation/MissingValues.py +32 -27
  48. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  49. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  50. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  51. validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
  52. validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
  53. validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
  54. validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
  55. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  56. validmind/tests/data_validation/RunsTest.py +72 -0
  57. validmind/tests/data_validation/ScatterPlot.py +63 -78
  58. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  59. validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +35 -30
  60. validmind/tests/data_validation/Skewness.py +35 -37
  61. validmind/tests/data_validation/SpreadPlot.py +35 -35
  62. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  63. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  64. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  65. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  66. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  67. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  68. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  69. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  70. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  71. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  72. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  73. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  74. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  75. validmind/tests/data_validation/UniqueRows.py +11 -6
  76. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  77. validmind/tests/data_validation/WOEBinTable.py +35 -30
  78. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  79. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  80. validmind/tests/data_validation/nlp/Hashtags.py +42 -40
  81. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  82. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  83. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  84. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  85. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  86. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  87. validmind/tests/data_validation/nlp/TextDescription.py +39 -36
  88. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  89. validmind/tests/decorator.py +81 -42
  90. validmind/tests/model_validation/BertScore.py +36 -27
  91. validmind/tests/model_validation/BleuScore.py +25 -19
  92. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  93. validmind/tests/model_validation/ContextualRecall.py +38 -13
  94. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  95. validmind/tests/model_validation/MeteorScore.py +46 -33
  96. validmind/tests/model_validation/ModelMetadata.py +32 -64
  97. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  98. validmind/tests/model_validation/RegardScore.py +30 -14
  99. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  100. validmind/tests/model_validation/RougeScore.py +36 -30
  101. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  102. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  103. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  104. validmind/tests/model_validation/TokenDisparity.py +31 -23
  105. validmind/tests/model_validation/ToxicityScore.py +26 -17
  106. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  107. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  108. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  109. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  110. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  111. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  112. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  113. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  114. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  115. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  116. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  117. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  118. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  119. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  120. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  121. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  122. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  123. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  124. validmind/tests/model_validation/ragas/AspectCritique.py +12 -6
  125. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  126. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  127. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  128. validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
  129. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  130. validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
  131. validmind/tests/model_validation/ragas/utils.py +6 -0
  132. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  133. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  134. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  135. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  136. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  137. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  138. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  139. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  140. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  141. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  142. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  143. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  144. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  145. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  146. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  147. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  148. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  149. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  150. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +32 -26
  151. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  152. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  153. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  154. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  155. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  156. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  157. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -94
  158. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  159. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
  160. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +66 -5
  161. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  162. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  163. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  164. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  165. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  166. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  167. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +59 -32
  168. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  169. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  170. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  171. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +86 -119
  172. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  173. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  174. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  175. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  176. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  177. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  178. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  179. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  180. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  181. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  182. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  183. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  184. validmind/tests/prompt_validation/Bias.py +14 -11
  185. validmind/tests/prompt_validation/Clarity.py +16 -14
  186. validmind/tests/prompt_validation/Conciseness.py +7 -5
  187. validmind/tests/prompt_validation/Delimitation.py +23 -22
  188. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  189. validmind/tests/prompt_validation/Robustness.py +12 -10
  190. validmind/tests/prompt_validation/Specificity.py +13 -11
  191. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  192. validmind/tests/run.py +68 -23
  193. validmind/unit_metrics/__init__.py +81 -144
  194. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  195. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  196. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  197. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  198. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  199. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  200. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  201. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  202. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  203. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  204. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  205. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  206. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  207. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  208. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  209. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  210. validmind/utils.py +4 -0
  211. validmind/vm_models/dataset/dataset.py +2 -0
  212. validmind/vm_models/figure.py +5 -0
  213. validmind/vm_models/test/metric.py +1 -0
  214. validmind/vm_models/test/result_wrapper.py +143 -158
  215. validmind/vm_models/test/threshold_test.py +1 -0
  216. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
  217. validmind-2.5.18.dist-info/RECORD +324 -0
  218. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  219. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  220. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  221. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  222. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  223. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  224. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  225. validmind/tests/model_validation/statsmodels/JarqueBera.py +0 -73
  226. validmind/tests/model_validation/statsmodels/LJungBox.py +0 -66
  227. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  228. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  229. validmind/tests/model_validation/statsmodels/RunsTest.py +0 -71
  230. validmind-2.5.8.dist-info/RECORD +0 -318
  231. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
  232. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
  233. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/ClusterSizeDistribution.py
@@ -13,40 +13,44 @@ from validmind.vm_models import Figure, Metric
  @dataclass
  class ClusterSizeDistribution(Metric):
  """
- Compares and visualizes the distribution of cluster sizes in model predictions and actual data for assessing
- clustering model performance.
-
- **Purpose:** The purpose of the `ClusterSizeDistribution` metric is to assess the performance of clustering models.
- It does this by comparing the distribution of cluster sizes in the predictions made by the model and the actual
- data. Observing the cluster distribution helps gain insights into whether the model's output aligns well with the
- actual dataset distribution.
-
- **Test Mechanism:** The testing mechanism for `ClusterSizeDistribution` involves first running the clustering model
- on the training dataset, storing predictions, and comparing these predictions with the actual output. The actual
- and predicted outputs are then converted into pandas dataframes, which conveniently enables the use of pandas
- built-in functions to derive cluster size distributions. Two histograms are constructed from this data: one for the
- actual distribution and one for the predicted distribution. These histograms are then plotted side-by-side for
- visual comparison.
-
- **Signs of High Risk:**
- * Discrepancies between the actual cluster size distribution and the predicted cluster size distribution may
- indicate high risk.
- * An irregular distribution of data across clusters in the predicted outcomes points towards an inaccurate
- prediction model.
- * A high number of outlier clusters could indicate that the model has trouble correctly grouping data.
-
- **Strengths:**
- * `ClusterSizeDistribution` provides a visual and intuitive way to compare the performance of the clustering model
- against the actual data.
- * This metric can effectively reveal where the model might be over- or underestimating cluster sizes.
- * It works well with any clustering models, making it a versatile comparison tool.
-
- **Limitations:**
- * The metric assumes that the actual cluster distribution is optimal, which may not always be the case.
- * It relies heavily on visual comparison, which might be subjective and may not provide a precise numerical measure
- of model performance.
- * The metric might not fully capture other important aspects of clustering such as cluster density, distances
- between clusters, and the shape of clusters.
+ Assesses the performance of clustering models by comparing the distribution of cluster sizes in model predictions
+ with the actual data.
+
+ ### Purpose
+
+ The Cluster Size Distribution test aims to assess the performance of clustering models by comparing the
+ distribution of cluster sizes in the model's predictions with the actual data. This comparison helps determine if
+ the clustering model's output aligns well with the true cluster distribution, providing insights into the model's
+ accuracy and performance.
+
+ ### Test Mechanism
+
+ The test mechanism involves the following steps:
+ - Run the clustering model on the provided dataset to obtain predictions.
+ - Convert both the actual and predicted outputs into pandas dataframes.
+ - Use pandas built-in functions to derive the cluster size distributions from these dataframes.
+ - Construct two histograms: one for the actual cluster size distribution and one for the predicted distribution.
+ - Plot the histograms side-by-side for visual comparison.
+
+ ### Signs of High Risk
+
+ - Discrepancies between the actual cluster size distribution and the predicted cluster size distribution.
+ - Irregular distribution of data across clusters in the predicted outcomes.
+ - High number of outlier clusters suggesting the model struggles to correctly group data.
+
+ ### Strengths
+
+ - Provides a visual and intuitive way to compare the clustering model's performance against actual data.
+ - Effectively reveals where the model may be over- or underestimating cluster sizes.
+ - Versatile as it works well with any clustering model.
+
+ ### Limitations
+
+ - Assumes that the actual cluster distribution is optimal, which may not always be the case.
+ - Relies heavily on visual comparison, which could be subjective and may not offer a precise numerical measure of
+ performance.
+ - May not fully capture other important aspects of clustering, such as cluster density, distances between clusters,
+ and the shape of clusters.
  """

  name = "cluster_size_distribution"
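The updated docstring describes the comparison only in prose. As a rough, hedged sketch of the idea (not the package's actual implementation; the function name, use of Plotly, and column handling below are assumptions), the actual-versus-predicted cluster size comparison can be reproduced with pandas and Plotly:

```python
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots


def cluster_size_histograms(y_true, y_pred):
    """Side-by-side bar charts of actual vs. predicted cluster sizes (illustrative only)."""
    actual = pd.Series(y_true).value_counts().sort_index()
    predicted = pd.Series(y_pred).value_counts().sort_index()

    fig = make_subplots(rows=1, cols=2, subplot_titles=("Actual", "Predicted"))
    fig.add_trace(go.Bar(x=actual.index, y=actual.values), row=1, col=1)
    fig.add_trace(go.Bar(x=predicted.index, y=predicted.values), row=1, col=2)
    fig.update_layout(title="Cluster Size Distribution", showlegend=False)
    return fig
```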
validmind/tests/model_validation/ContextualRecall.py
@@ -13,29 +13,54 @@ from validmind import tags, tasks
  @tasks("text_classification", "text_summarization")
  def ContextualRecall(dataset, model):
  """
- Evaluates a Natural Language Generation model's ability to generate contextually relevant and factually correct text, visualizing the results through histograms and bar charts, alongside compiling a comprehensive table of descriptive statistics for contextual recall scores.
+ Evaluates a Natural Language Generation model's ability to generate contextually relevant and factually correct
+ text, visualizing the results through histograms and bar charts, alongside compiling a comprehensive table of
+ descriptive statistics for contextual recall scores.

- **Purpose:**
- The Contextual Recall metric is used to evaluate the ability of a natural language generation (NLG) model to generate text that appropriately reflects the given context or prompt. It measures the model's capability to remember and reproduce the main context in its resulting output. This metric is critical in natural language processing tasks, as the coherency and contextuality of the generated text are essential.
+ ### Purpose

- **Test Mechanism:**
- The function starts by extracting the true and predicted values from the provided dataset and model. It then tokenizes the reference and candidate texts into discernible words or tokens using NLTK. The token overlap between the reference and candidate texts is identified, and the Contextual Recall score is computed by dividing the number of overlapping tokens by the total number of tokens in the reference text. Scores are calculated for each test dataset instance, resulting in an array of scores. These scores are visualized using a histogram and a bar chart to show score variations across different rows. Additionally, a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum) is compiled for the contextual recall scores, providing a comprehensive summary of the model's performance.
+ The Contextual Recall metric is used to evaluate the ability of a natural language generation (NLG) model to
+ generate text that appropriately reflects the given context or prompt. It measures the model's capability to
+ remember and reproduce the main context in its resulting output. This metric is critical in natural language
+ processing tasks, as the coherency and contextuality of the generated text are essential.

- **Signs of High Risk:**
- - Low contextual recall scores could indicate that the model is not effectively reflecting the original context in its output, leading to incoherent or contextually misaligned text.
+ ### Test Mechanism
+
+ The function starts by extracting the true and predicted values from the provided dataset and model. It then
+ tokenizes the reference and candidate texts into discernible words or tokens using NLTK. The token overlap between
+ the reference and candidate texts is identified, and the Contextual Recall score is computed by dividing the number
+ of overlapping tokens by the total number of tokens in the reference text. Scores are calculated for each test
+ dataset instance, resulting in an array of scores. These scores are visualized using a histogram and a bar chart to
+ show score variations across different rows. Additionally, a table of descriptive statistics (mean, median,
+ standard deviation, minimum, and maximum) is compiled for the contextual recall scores, providing a comprehensive
+ summary of the model's performance.
+
+ ### Signs of High Risk
+
+ - Low contextual recall scores could indicate that the model is not effectively reflecting the original context in
+ its output, leading to incoherent or contextually misaligned text.
  - A consistent trend of low recall scores could suggest underperformance of the model.

- **Strengths:**
- - Provides a quantifiable measure of a model's adherence to the context and factual elements of the generated narrative.
- - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of contextual recall scores.
- - Descriptive statistics offer a concise summary of the model's performance in generating contextually relevant texts.
+ ### Strengths

- **Limitations:**
- - The focus on word overlap could result in high scores for texts that use many common words, even when these texts lack coherence or meaningful context.
+ - Provides a quantifiable measure of a model's adherence to the context and factual elements of the generated
+ narrative.
+ - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of
+ contextual recall scores.
+ - Descriptive statistics offer a concise summary of the model's performance in generating contextually relevant
+ texts.
+
+ ### Limitations
+
+ - The focus on word overlap could result in high scores for texts that use many common words, even when these texts
+ lack coherence or meaningful context.
  - This metric does not consider the order of words, which could lead to overestimated scores for scrambled outputs.
  - Models that effectively use infrequent words might be undervalued, as these words might not overlap as often.
  """

+ # download nltk data
+ nltk.download("punkt_tab", quiet=True)
+
  y_true = dataset.y
  y_pred = dataset.y_pred(model)

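For reference, the score described in the docstring above (overlapping tokens divided by the number of reference tokens) reduces to a few lines. This is a minimal sketch assuming plain NLTK word tokenization and no extra normalization, not the package's exact implementation:

```python
import nltk

nltk.download("punkt_tab", quiet=True)


def contextual_recall(reference: str, candidate: str) -> float:
    """Fraction of reference tokens that also appear in the candidate text."""
    ref_tokens = nltk.word_tokenize(reference)
    cand_tokens = set(nltk.word_tokenize(candidate))
    if not ref_tokens:
        return 0.0
    overlap = sum(1 for token in ref_tokens if token in cand_tokens)
    return overlap / len(ref_tokens)


# e.g. contextual_recall("the cat sat on the mat", "a cat sat on a mat") -> 4/6 ≈ 0.67
```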
validmind/tests/model_validation/FeaturesAUC.py
@@ -19,24 +19,43 @@ logger = get_logger(__name__)
  @dataclass
  class FeaturesAUC(Metric):
  """
- Evaluates the discriminatory power of each individual feature within a binary classification model by calculating the Area Under the Curve (AUC) for each feature separately.
+ Evaluates the discriminatory power of each individual feature within a binary classification model by calculating
+ the Area Under the Curve (AUC) for each feature separately.

- **Purpose**: The central objective of this metric is to quantify how well each feature on its own can differentiate between the two classes in a binary classification problem. It serves as a univariate analysis tool that can help in pre-modeling feature selection or post-modeling interpretation.
+ ### Purpose

- **Test Mechanism**: For each feature, the metric treats the feature values as raw scores to compute the AUC against the actual binary outcomes. It provides an AUC value for each feature, offering a simple yet powerful indication of each feature's univariate classification strength.
+ The central objective of this metric is to quantify how well each feature on its own can differentiate between the
+ two classes in a binary classification problem. It serves as a univariate analysis tool that can help in
+ pre-modeling feature selection or post-modeling interpretation.

- **Signs of High Risk**:
- - A feature with a low AUC score may not be contributing significantly to the differentiation between the two classes, which could be a concern if it is expected to be predictive.
- - Conversely, a surprisingly high AUC for a feature not believed to be informative may suggest data leakage or other issues with the data.
+ ### Test Mechanism

- **Strengths**:
- - By isolating each feature, it highlights the individual contribution of features to the classification task without the influence of other variables.
- - Useful for both initial feature evaluation and for providing insights into the model's reliance on individual features after model training.
+ For each feature, the metric treats the feature values as raw scores to compute the AUC against the actual binary
+ outcomes. It provides an AUC value for each feature, offering a simple yet powerful indication of each feature's
+ univariate classification strength.

- **Limitations**:
- - Does not reflect the combined effects of features or any interaction between them, which can be critical in certain models.
- - The AUC values are calculated without considering the model's use of the features, which could lead to different interpretations of feature importance when considering the model holistically.
- - This metric is applicable only to binary classification tasks and cannot be directly extended to multiclass classification or regression without modifications.
+ ### Signs of High Risk
+
+ - A feature with a low AUC score may not be contributing significantly to the differentiation between the two
+ classes, which could be a concern if it is expected to be predictive.
+ - Conversely, a surprisingly high AUC for a feature not believed to be informative may suggest data leakage or
+ other issues with the data.
+
+ ### Strengths
+
+ - By isolating each feature, it highlights the individual contribution of features to the classification task
+ without the influence of other variables.
+ - Useful for both initial feature evaluation and for providing insights into the model's reliance on individual
+ features after model training.
+
+ ### Limitations
+
+ - Does not reflect the combined effects of features or any interaction between them, which can be critical in
+ certain models.
+ - The AUC values are calculated without considering the model's use of the features, which could lead to different
+ interpretations of feature importance when considering the model holistically.
+ - This metric is applicable only to binary classification tasks and cannot be directly extended to multiclass
+ classification or regression without modifications.
  """

  name = "features_auc"
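The mechanism described above, treating each feature's raw values as scores against the binary target, maps directly onto scikit-learn's `roc_auc_score`. A minimal sketch assuming numeric features in a pandas DataFrame (the function name and NaN handling are assumptions, not the package's code):

```python
import pandas as pd
from sklearn.metrics import roc_auc_score


def per_feature_auc(X: pd.DataFrame, y) -> pd.DataFrame:
    """Univariate AUC for each feature, using the raw feature values as scores."""
    rows = []
    for column in X.columns:
        try:
            auc = roc_auc_score(y, X[column])
        except (ValueError, TypeError):
            auc = float("nan")  # e.g. non-numeric column or a degenerate target
        rows.append({"Feature": column, "AUC": auc})
    return pd.DataFrame(rows)
```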
validmind/tests/model_validation/MeteorScore.py
@@ -13,39 +13,52 @@ from validmind import tags, tasks
  @tasks("text_classification", "text_summarization")
  def MeteorScore(dataset, model):
  """
- Computes and visualizes the METEOR score for each text generation instance, assessing translation quality.
-
- **Purpose:**
- METEOR (Metric for Evaluation of Translation with Explicit ORdering) is designed to evaluate the quality of machine translations
- by comparing them against reference translations. It emphasizes both the accuracy and fluency of translations, incorporating
- precision, recall, and word order into its assessment.
-
- **Test Mechanism:**
- The function starts by extracting the true and predicted values from the provided dataset and model. The METEOR score is computed
- for each pair of machine-generated translation (prediction) and its corresponding human-produced reference. This is done by
- considering unigram matches between the translations, including matches based on surface forms, stemmed forms, and synonyms.
- The score is a combination of unigram precision and recall, adjusted for word order through a fragmentation penalty. Scores are
- compiled into a dataframe, and histograms and bar charts are generated to visualize the distribution of METEOR scores. Additionally,
- a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum) is compiled for the METEOR scores,
- providing a comprehensive summary of the model's performance.
-
- **Signs of High Risk:**
- - Lower METEOR scores can indicate a lack of alignment between the machine-generated translations and their human-produced references,
- highlighting potential deficiencies in both the accuracy and fluency of translations.
- - Significant discrepancies in word order or an excessive fragmentation penalty could signal issues with how the translation model processes
- and reconstructs sentence structures, potentially compromising the natural flow of translated text.
- - Persistent underperformance across a variety of text types or linguistic contexts might suggest a broader inability of the model to adapt to the
- nuances of different languages or dialects, pointing towards gaps in its training or inherent limitations.
-
- **Strengths:**
- - Incorporates a balanced consideration of precision and recall, weighted towards recall to reflect the importance of content coverage in translations.
- - Directly accounts for word order, offering a nuanced evaluation of translation fluency beyond simple lexical matching.
- - Adapts to various forms of lexical similarity, including synonyms and stemmed forms, allowing for flexible matching.
-
- **Limitations:**
- - While comprehensive, the complexity of METEOR's calculation can make it computationally intensive, especially for large datasets.
- - The use of external resources for synonym and stemming matching may introduce variability based on the resources' quality and relevance to the specific
- translation task.
+ Assesses the quality of machine-generated translations by comparing them to human-produced references using the
+ METEOR score, which evaluates precision, recall, and word order.
+
+ ### Purpose
+
+ The METEOR (Metric for Evaluation of Translation with Explicit ORdering) score is designed to evaluate the quality
+ of machine translations by comparing them against reference translations. It emphasizes both the accuracy and
+ fluency of translations, incorporating precision, recall, and word order into its assessment.
+
+ ### Test Mechanism
+
+ The function starts by extracting the true and predicted values from the provided dataset and model. The METEOR
+ score is computed for each pair of machine-generated translation (prediction) and its corresponding human-produced
+ reference. This is done by considering unigram matches between the translations, including matches based on surface
+ forms, stemmed forms, and synonyms. The score is a combination of unigram precision and recall, adjusted for word
+ order through a fragmentation penalty. Scores are compiled into a dataframe, and histograms and bar charts are
+ generated to visualize the distribution of METEOR scores. Additionally, a table of descriptive statistics (mean,
+ median, standard deviation, minimum, and maximum) is compiled for the METEOR scores, providing a comprehensive
+ summary of the model's performance.
+
+ ### Signs of High Risk
+
+ - Lower METEOR scores can indicate a lack of alignment between the machine-generated translations and their
+ human-produced references, highlighting potential deficiencies in both the accuracy and fluency of translations.
+ - Significant discrepancies in word order or an excessive fragmentation penalty could signal issues with how the
+ translation model processes and reconstructs sentence structures, potentially compromising the natural flow of
+ translated text.
+ - Persistent underperformance across a variety of text types or linguistic contexts might suggest a broader
+ inability of the model to adapt to the nuances of different languages or dialects, pointing towards gaps in its
+ training or inherent limitations.
+
+ ### Strengths
+
+ - Incorporates a balanced consideration of precision and recall, weighted towards recall to reflect the importance
+ of content coverage in translations.
+ - Directly accounts for word order, offering a nuanced evaluation of translation fluency beyond simple lexical
+ matching.
+ - Adapts to various forms of lexical similarity, including synonyms and stemmed forms, allowing for flexible
+ matching.
+
+ ### Limitations
+
+ - While comprehensive, the complexity of METEOR's calculation can make it computationally intensive, especially for
+ large datasets.
+ - The use of external resources for synonym and stemming matching may introduce variability based on the resources'
+ quality and relevance to the specific translation task.
  """

  # Extract true and predicted values
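The per-row METEOR computation described above can be approximated with NLTK. A hedged sketch only: the corpus downloads, tokenization choices, and helper name are assumptions, and the package may compute the score through a different backend:

```python
import nltk
from nltk.translate.meteor_score import meteor_score

nltk.download("punkt_tab", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)


def meteor_per_row(references, predictions):
    """METEOR score for each (reference, prediction) pair; NLTK expects pre-tokenized input."""
    return [
        meteor_score([nltk.word_tokenize(ref)], nltk.word_tokenize(pred))
        for ref, pred in zip(references, predictions)
    ]
```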
validmind/tests/model_validation/ModelMetadata.py
@@ -2,66 +2,36 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- from dataclasses import dataclass
-
  import pandas as pd

+ from validmind import tags, tasks
  from validmind.utils import get_model_info
- from validmind.vm_models import Metric, ResultSummary, ResultTable


- @dataclass
- class ModelMetadata(Metric):
+ @tags("model_training", "metadata")
+ @tasks("regression", "time_series_forecasting")
+ def ModelMetadata(model):
  """
- Extracts and summarizes critical metadata from a machine learning model instance for comprehensive analysis.
-
- **Purpose:**
- This test is designed to collect and summarize important metadata related to a particular machine learning model.
- Such metadata includes the model's architecture (modeling technique), the version and type of modeling framework
- used, and the programming language the model is written in.
-
- **Test Mechanism:**
- The mechanism of this test consists of extracting information from the model instance. It tries to extract the
- model information such as the modeling technique used, the modeling framework version, and the programming
- language. It decorates this information into a data frame and returns a summary of the results.
+ Compare metadata of different models and generate a summary table with the results.

- **Signs of High Risk:**
+ **Purpose**: The purpose of this function is to compare the metadata of different models, including information about their architecture, framework, framework version, and programming language.

- - High risk could be determined by a lack of documentation or inscrutable metadata for the model.
- - Unidentifiable language, outdated or unsupported versions of modeling frameworks, or undisclosed model
- architectures reflect risky situations, as they could hinder future reproducibility, support, and debugging of the
- model.
+ **Test Mechanism**: The function retrieves the metadata for each model using `get_model_info`, renames columns according to a predefined set of labels, and compiles this information into a summary table.

- **Strengths:**
+ **Signs of High Risk**:
+ - Inconsistent or missing metadata across models can indicate potential issues in model documentation or management.
+ - Significant differences in framework versions or programming languages might pose challenges in model integration and deployment.

- - The strengths of this test lie in the increased transparency and understanding it brings regarding the model's
- setup.
- - Knowing the model's architecture, the specific modeling framework version used, and the language involved,
- provides multiple benefits: supports better error understanding and debugging, facilitates model reuse, aids
- compliance of software policies, and assists in planning for model obsolescence due to evolving or discontinuing
- software and dependencies.
+ **Strengths**:
+ - Provides a clear comparison of essential model metadata.
+ - Standardizes metadata labels for easier interpretation and comparison.
+ - Helps identify potential compatibility or consistency issues across models.

- **Limitations:**
-
- - Notably, this test is largely dependent on the compliance and correctness of information provided by the model or
- the model developer.
- - If the model's built-in methods for describing its architecture, framework or language are incorrect or lack
- necessary information, this test will hold limitations.
- - Moreover, it is not designed to directly evaluate the performance or accuracy of the model, rather it provides
- supplementary information which aids in comprehensive analysis.
+ **Limitations**:
+ - Assumes that the `get_model_info` function returns all necessary metadata fields.
+ - Relies on the correctness and completeness of the metadata provided by each model.
+ - Does not include detailed parameter information, focusing instead on high-level metadata.
  """
-
- name = "model_metadata"
- required_inputs = ["model"]
- tasks = [
- "classification",
- "regression",
- "text_classification",
- "text_summarization",
- ]
-
- tags = ["model_metadata"]
-
  column_labels = {
  "architecture": "Modeling Technique",
  "framework": "Modeling Framework",
@@ -69,22 +39,20 @@ class ModelMetadata(Metric):
  "language": "Programming Language",
  }

- def summary(self, metric_value):
- df = pd.DataFrame(metric_value.items(), columns=["Attribute", "Value"])
- # Don't serialize the params attribute
- df = df[df["Attribute"] != "params"]
- df["Attribute"] = df["Attribute"].map(self.column_labels)
-
- return ResultSummary(
- results=[
- ResultTable(data=df.to_dict(orient="records")),
- ]
- )
-
- def run(self):
+ def extract_and_rename_metadata(model):
  """
- Extracts model metadata from a model object instance
+ Extracts metadata for a single model and renames columns based on predefined labels.
  """
- model_info = get_model_info(self.inputs.model)
+ model_info = get_model_info(model)
+ renamed_info = {
+ column_labels.get(k, k): v for k, v in model_info.items() if k != "params"
+ }
+ return renamed_info
+
+ # Collect metadata for all models
+ metadata_list = [extract_and_rename_metadata(model)]
+
+ # Create a DataFrame from the collected metadata
+ metadata_df = pd.DataFrame(metadata_list)

- return self.cache_results(model_info)
+ return metadata_df
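This change is part of a pattern that recurs throughout this release: class-based `Metric` subclasses are rewritten as plain functions decorated with `@tags`/`@tasks` that return tables and figures directly. A minimal custom test following the same pattern is sketched below; the function name and the placeholder metadata values are assumptions, not part of the package:

```python
import pandas as pd
from validmind import tags, tasks


@tags("model_training", "metadata")
@tasks("regression")
def MyModelMetadata(model):
    """Return a one-row metadata table for a model (illustrative placeholder values)."""
    info = {
        "Modeling Technique": type(model).__name__,  # placeholder; the real test uses get_model_info()
        "Programming Language": "Python",
    }
    return pd.DataFrame([info])
```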
validmind/tests/model_validation/ModelPredictionResiduals.py
@@ -12,92 +12,94 @@ from validmind import tags, tasks
  @tags("regression")
  @tasks("residual_analysis", "visualization")
  def ModelPredictionResiduals(
- datasets, models, nbins=100, p_value_threshold=0.05, start_date=None, end_date=None
+ dataset, model, nbins=100, p_value_threshold=0.05, start_date=None, end_date=None
  ):
  """
- Plot the residuals and histograms for each model, and generate a summary table
- with the Kolmogorov-Smirnov normality test results.
+ Assesses normality and behavior of residuals in regression models through visualization and statistical tests.

- **Purpose**: The purpose of this function is to visualize the residuals of model predictions and
- assess the normality of residuals using the Kolmogorov-Smirnov test.
+ ### Purpose

- **Test Mechanism**: The function iterates through each dataset-model pair, calculates residuals, and generates
- two figures for each model: one for the time series of residuals and one for the histogram of residuals.
+ The Model Prediction Residuals test aims to visualize the residuals of model predictions and assess their normality
+ using the Kolmogorov-Smirnov (KS) test. It helps to identify potential issues related to model assumptions and
+ effectiveness.
+
+ ### Test Mechanism
+
+ The function calculates residuals and generates
+ two figures: one for the time series of residuals and one for the histogram of residuals.
  It also calculates the KS test for normality and summarizes the results in a table.

- **Signs of High Risk**:
- - If the residuals are not normally distributed, it could indicate issues with model assumptions.
- - High skewness or kurtosis in the residuals may indicate model misspecification.
+ ### Signs of High Risk
+
+ - Residuals are not normally distributed, indicating potential issues with model assumptions.
+ - High skewness or kurtosis in the residuals, which may suggest model misspecification.

- **Strengths**:
- - Provides a clear visualization of residuals over time and their distribution.
+ ### Strengths
+
+ - Provides clear visualizations of residuals over time and their distribution.
  - Includes statistical tests to assess the normality of residuals.
+ - Helps in identifying potential model misspecifications and assumption violations.
+
+ ### Limitations

- **Limitations**:
- - Assumes that the dataset is provided as a DataFrameDataset object with a .df attribute to access
- the pandas DataFrame.
- - Only generates plots for datasets with a datetime index, and will raise an error for other types of indices.
+ - Assumes that the dataset is provided as a DataFrameDataset object with a .df attribute to access the pandas
+ DataFrame.
+ - Only generates plots for datasets with a datetime index, resulting in errors for other types of indices.
  """

+ df = dataset.df.copy()
+
+ # Filter DataFrame by date range if specified
+ if start_date:
+ df = df[df.index >= pd.to_datetime(start_date)]
+ if end_date:
+ df = df[df.index <= pd.to_datetime(end_date)]
+
+ y_true = dataset.y
+ y_pred = dataset.y_pred(model)
+ residuals = y_true - y_pred
+
  figures = []
- summary = []
-
- for dataset, model in zip(datasets, models):
- df = dataset.df.copy()
-
- # Filter DataFrame by date range if specified
- if start_date:
- df = df[df.index >= pd.to_datetime(start_date)]
- if end_date:
- df = df[df.index <= pd.to_datetime(end_date)]
-
- y_true = dataset.y
- y_pred = dataset.y_pred(model)
- residuals = y_true - y_pred
-
- # Plot residuals
- residuals_fig = go.Figure()
- residuals_fig.add_trace(
- go.Scatter(x=df.index, y=residuals, mode="lines", name="Residuals")
- )
- residuals_fig.update_layout(
- title=f"Residuals for {model.input_id}",
- xaxis_title="Date",
- yaxis_title="Residuals",
- font=dict(size=16),
- showlegend=False,
- )
- figures.append(residuals_fig)
-
- # Plot histogram of residuals
- hist_fig = go.Figure()
- hist_fig.add_trace(go.Histogram(x=residuals, nbinsx=nbins, name="Residuals"))
- hist_fig.update_layout(
- title=f"Histogram of Residuals for {model.input_id}",
- xaxis_title="Residuals",
- yaxis_title="Frequency",
- font=dict(size=16),
- showlegend=False,
- )
- figures.append(hist_fig)
-
- # Perform KS normality test
- ks_stat, p_value = kstest(
- residuals, "norm", args=(residuals.mean(), residuals.std())
- )
- ks_normality = "Normal" if p_value > p_value_threshold else "Not Normal"
-
- summary.append(
- {
- "Model": model.input_id,
- "KS Statistic": ks_stat,
- "p-value": p_value,
- "KS Normality": ks_normality,
- "p-value Threshold": p_value_threshold,
- }
- )
+
+ # Plot residuals
+ residuals_fig = go.Figure()
+ residuals_fig.add_trace(
+ go.Scatter(x=df.index, y=residuals, mode="markers", name="Residuals")
+ )
+ residuals_fig.update_layout(
+ title="Residuals",
+ yaxis_title="Residuals",
+ font=dict(size=16),
+ showlegend=False,
+ )
+ figures.append(residuals_fig)
+
+ # Plot histogram of residuals
+ hist_fig = go.Figure()
+ hist_fig.add_trace(go.Histogram(x=residuals, nbinsx=nbins, name="Residuals"))
+ hist_fig.update_layout(
+ title="Histogram of Residuals",
+ xaxis_title="Residuals",
+ yaxis_title="Frequency",
+ font=dict(size=16),
+ showlegend=False,
+ )
+ figures.append(hist_fig)
+
+ # Perform KS normality test
+ ks_stat, p_value = kstest(
+ residuals, "norm", args=(residuals.mean(), residuals.std())
+ )
+ ks_normality = "Normal" if p_value > p_value_threshold else "Not Normal"
+
+ summary = {
+ "KS Statistic": ks_stat,
+ "p-value": p_value,
+ "KS Normality": ks_normality,
+ "p-value Threshold": p_value_threshold,
+ }

  # Create a summary DataFrame for the KS normality test results
- summary_df = pd.DataFrame(summary)
+ summary_df = pd.DataFrame([summary])

  return (summary_df, *figures)
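The KS normality check at the heart of this test can be reproduced standalone. The sketch below mirrors the `kstest` call shown in the new code; the helper name and the dictionary return shape are assumptions:

```python
import numpy as np
from scipy.stats import kstest


def ks_normality_summary(residuals, p_value_threshold=0.05):
    """KS test of residuals against a normal distribution fitted to their mean and std."""
    residuals = np.asarray(residuals)
    ks_stat, p_value = kstest(residuals, "norm", args=(residuals.mean(), residuals.std()))
    return {
        "KS Statistic": ks_stat,
        "p-value": p_value,
        "KS Normality": "Normal" if p_value > p_value_threshold else "Not Normal",
        "p-value Threshold": p_value_threshold,
    }
```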