validmind 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +80 -119
  3. validmind/ai/test_result_description/config.yaml +29 -0
  4. validmind/ai/test_result_description/context.py +73 -0
  5. validmind/ai/test_result_description/image_processing.py +124 -0
  6. validmind/ai/test_result_description/system.jinja +39 -0
  7. validmind/ai/test_result_description/user.jinja +25 -0
  8. validmind/api_client.py +89 -43
  9. validmind/client.py +2 -2
  10. validmind/client_config.py +11 -14
  11. validmind/datasets/credit_risk/__init__.py +1 -0
  12. validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
  13. validmind/datasets/credit_risk/lending_club_bias.py +142 -0
  14. validmind/datasets/regression/fred_timeseries.py +67 -138
  15. validmind/template.py +1 -0
  16. validmind/test_suites/__init__.py +0 -2
  17. validmind/test_suites/statsmodels_timeseries.py +1 -1
  18. validmind/test_suites/summarization.py +0 -1
  19. validmind/test_suites/time_series.py +0 -43
  20. validmind/tests/__types__.py +14 -15
  21. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  22. validmind/tests/data_validation/ADF.py +31 -24
  23. validmind/tests/data_validation/AutoAR.py +9 -9
  24. validmind/tests/data_validation/AutoMA.py +23 -16
  25. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  26. validmind/tests/data_validation/AutoStationarity.py +21 -16
  27. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  28. validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +34 -34
  29. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +85 -124
  30. validmind/tests/data_validation/ClassImbalance.py +15 -12
  31. validmind/tests/data_validation/DFGLSArch.py +19 -13
  32. validmind/tests/data_validation/DatasetDescription.py +17 -11
  33. validmind/tests/data_validation/DatasetSplit.py +7 -5
  34. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  35. validmind/tests/data_validation/Duplicates.py +33 -25
  36. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  37. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  38. validmind/tests/data_validation/HighCardinality.py +19 -12
  39. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  40. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  41. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  42. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  43. validmind/tests/data_validation/JarqueBera.py +70 -0
  44. validmind/tests/data_validation/KPSS.py +34 -29
  45. validmind/tests/data_validation/LJungBox.py +66 -0
  46. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  47. validmind/tests/data_validation/MissingValues.py +32 -27
  48. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  49. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  50. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  51. validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
  52. validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
  53. validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
  54. validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
  55. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  56. validmind/tests/data_validation/RunsTest.py +72 -0
  57. validmind/tests/data_validation/ScatterPlot.py +63 -78
  58. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  59. validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +35 -30
  60. validmind/tests/data_validation/Skewness.py +35 -37
  61. validmind/tests/data_validation/SpreadPlot.py +35 -35
  62. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  63. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  64. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  65. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  66. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  67. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  68. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  69. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  70. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  71. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  72. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  73. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  74. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  75. validmind/tests/data_validation/UniqueRows.py +11 -6
  76. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  77. validmind/tests/data_validation/WOEBinTable.py +35 -30
  78. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  79. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  80. validmind/tests/data_validation/nlp/Hashtags.py +42 -40
  81. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  82. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  83. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  84. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  85. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  86. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  87. validmind/tests/data_validation/nlp/TextDescription.py +39 -36
  88. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  89. validmind/tests/decorator.py +81 -42
  90. validmind/tests/model_validation/BertScore.py +36 -27
  91. validmind/tests/model_validation/BleuScore.py +25 -19
  92. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  93. validmind/tests/model_validation/ContextualRecall.py +38 -13
  94. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  95. validmind/tests/model_validation/MeteorScore.py +46 -33
  96. validmind/tests/model_validation/ModelMetadata.py +32 -64
  97. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  98. validmind/tests/model_validation/RegardScore.py +30 -14
  99. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  100. validmind/tests/model_validation/RougeScore.py +36 -30
  101. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  102. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  103. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  104. validmind/tests/model_validation/TokenDisparity.py +31 -23
  105. validmind/tests/model_validation/ToxicityScore.py +26 -17
  106. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  107. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  108. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  109. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  110. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  111. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  112. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  113. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  114. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  115. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  116. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  117. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  118. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  119. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  120. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  121. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  122. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  123. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  124. validmind/tests/model_validation/ragas/AspectCritique.py +12 -6
  125. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  126. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  127. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  128. validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
  129. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  130. validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
  131. validmind/tests/model_validation/ragas/utils.py +6 -0
  132. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  133. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  134. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  135. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  136. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  137. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  138. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  139. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  140. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  141. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  142. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  143. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  144. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  145. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  146. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  147. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  148. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  149. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  150. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +32 -26
  151. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  152. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  153. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  154. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  155. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  156. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  157. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -94
  158. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  159. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
  160. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +66 -5
  161. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  162. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  163. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  164. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  165. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  166. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  167. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +59 -32
  168. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  169. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  170. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  171. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +86 -119
  172. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  173. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  174. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  175. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  176. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  177. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  178. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  179. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  180. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  181. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  182. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  183. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  184. validmind/tests/prompt_validation/Bias.py +14 -11
  185. validmind/tests/prompt_validation/Clarity.py +16 -14
  186. validmind/tests/prompt_validation/Conciseness.py +7 -5
  187. validmind/tests/prompt_validation/Delimitation.py +23 -22
  188. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  189. validmind/tests/prompt_validation/Robustness.py +12 -10
  190. validmind/tests/prompt_validation/Specificity.py +13 -11
  191. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  192. validmind/tests/run.py +68 -23
  193. validmind/unit_metrics/__init__.py +81 -144
  194. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  195. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  196. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  197. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  198. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  199. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  200. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  201. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  202. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  203. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  204. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  205. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  206. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  207. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  208. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  209. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  210. validmind/utils.py +4 -0
  211. validmind/vm_models/dataset/dataset.py +2 -0
  212. validmind/vm_models/figure.py +5 -0
  213. validmind/vm_models/test/metric.py +1 -0
  214. validmind/vm_models/test/result_wrapper.py +143 -158
  215. validmind/vm_models/test/threshold_test.py +1 -0
  216. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
  217. validmind-2.5.18.dist-info/RECORD +324 -0
  218. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  219. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  220. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  221. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  222. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  223. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  224. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  225. validmind/tests/model_validation/statsmodels/JarqueBera.py +0 -73
  226. validmind/tests/model_validation/statsmodels/LJungBox.py +0 -66
  227. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  228. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  229. validmind/tests/model_validation/statsmodels/RunsTest.py +0 -71
  230. validmind-2.5.8.dist-info/RECORD +0 -318
  231. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
  232. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
  233. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0
--- a/validmind/tests/data_validation/ChiSquaredFeaturesTable.py
+++ b/validmind/tests/data_validation/ChiSquaredFeaturesTable.py
@@ -2,140 +2,101 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
 
 import pandas as pd
 from scipy.stats import chi2_contingency
 
-from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
+from validmind import tags, tasks
+from validmind.errors import SkipTestError
 
 
-@dataclass
-class ChiSquaredFeaturesTable(Metric):
+@tags("tabular_data", "categorical_data", "statistical_test")
+@tasks("classification")
+def ChiSquaredFeaturesTable(dataset, p_threshold=0.05):
     """
-    Executes Chi-Squared test for each categorical feature against a target column to assess significant association.
-
-    **Purpose**: The `ChiSquaredFeaturesTable` metric is used to carry out a Chi-Squared test of independence for each
-    categorical feature variable against a designated target column. The primary goal is to determine if a significant
-    association exists between the categorical features and the target variable. This method typically finds its use in
-    the context of Model Risk Management to understand feature relevance and detect potential bias in a classification
-    model.
-
-    **Test Mechanism**: The testing process generates a contingency table for each categorical variable and the target
-    variable, after which a Chi-Squared test is performed. Using this approach, the Chi-Squared statistic and the
-    p-value for each feature are calculated. The p-value threshold is a modifiable parameter, and a test will qualify
-    as passed if the p-value is less than or equal to this threshold. If not, the test is labeled as failed. The
-    outcome for each feature - comprising the variable name, Chi-squared statistic, p-value, threshold, and pass/fail
-    status - is incorporated into a final summary table.
-
-    **Signs of High Risk**:
-    - High p-values (greater than the set threshold) for specific variables could indicate a high risk.
-    - These high p-values allude to the absence of a statistically significant relationship between the feature and the
-    target variables, resulting in a 'Fail' status.
-    - A categorical feature lacking a relevant association with the target variable could be a warning that the machine
-    learning model might not be performing optimally.
-
-    **Strengths**:
-    - The test allows for a comprehensive understanding of the interaction between a model's input features and the
-    target output, thus validating the relevance of categorical features.
-    - It also produces an unambiguous 'Pass/Fail' output for each categorical feature.
-    - The opportunity to adjust the p-value threshold contributes to flexibility in accommodating different statistical
-    standards.
-
-    **Limitations**:
-    - The metric presupposes that data is tabular and categorical, which may not always be the case with all datasets.
-    - It is distinctively designed for classification tasks, hence unsuitable for regression scenarios.
-    - The Chi-squared test, akin to any hypothesis testing-based test, cannot identify causal relationships, but only
-    associations.
-    - Furthermore, the test hinges on an adjustable p-value threshold, and varying threshold selections might lead to
-    different conclusions regarding feature relevance.
+    Assesses the statistical association between categorical features and a target variable using the Chi-Squared test.
+
+    ### Purpose
+
+    The `ChiSquaredFeaturesTable` function is designed to evaluate the relationship between categorical features and a
+    target variable in a dataset. It performs a Chi-Squared test of independence for each categorical feature to
+    determine whether a statistically significant association exists with the target variable. This is particularly
+    useful in Model Risk Management for understanding the relevance of features and identifying potential biases in a
+    classification model.
+
+    ### Test Mechanism
+
+    The function creates a contingency table for each categorical feature and the target variable, then applies the
+    Chi-Squared test to compute the Chi-squared statistic and the p-value. The results for each feature include the
+    variable name, Chi-squared statistic, p-value, p-value threshold, and a pass/fail status based on whether the
+    p-value is below the specified threshold. The output is a DataFrame summarizing these results, sorted by p-value to
+    highlight the most statistically significant associations.
+
+    ### Signs of High Risk
+
+    - High p-values (greater than the set threshold) indicate a lack of significant association between a feature and
+    the target variable, resulting in a 'Fail' status.
+    - Features with a 'Fail' status might not be relevant for the model, which could negatively impact model
+    performance.
+
+    ### Strengths
+
+    - Provides a clear, statistical assessment of the relationship between categorical features and the target variable.
+    - Produces an easily interpretable summary with a 'Pass/Fail' outcome for each feature, helping in feature
+    selection.
+    - The p-value threshold is adjustable, allowing for flexibility in statistical rigor.
+
+    ### Limitations
+
+    - Assumes the dataset is tabular and consists of categorical variables, which may not be suitable for all datasets.
+    - The test is designed for classification tasks and is not applicable to regression problems.
+    - As with all hypothesis tests, the Chi-Squared test can only detect associations, not causal relationships.
+    - The choice of p-value threshold can affect the interpretation of feature relevance, and different thresholds may
+    lead to different conclusions.
     """
 
-    name = "chi_squared_features_table"
-    required_inputs = ["dataset"]
-    default_params = {"cat_features": None, "p_threshold": 0.05}
-    tasks = ["classification"]
-    tags = [
-        "tabular_data",
-        "categorical_data",
-        "statistical_test",
-        "binary_classification",
-        "multiclass_classification",
-    ]
-
-    def run(self):
-        target_column = self.inputs.dataset.target_column
-        cat_features = self.params["cat_features"]
-        p_threshold = self.params["p_threshold"]
-
-        # Ensure cat_features is provided
-        if not cat_features:
-            cat_features = self.inputs.dataset.feature_columns_categorical
-
-        df = self.inputs.dataset.df
-
-        chi_squared_results = self.chi_squared_categorical_feature_selection(
-            df, cat_features, target_column, p_threshold
-        )
+    target_column = dataset.target_column
+    features = dataset.feature_columns_categorical
 
-        return self.cache_results(
-            {
-                "chi_squared_results": chi_squared_results.to_dict(orient="records"),
-            }
-        )
+    if not features:
+        raise SkipTestError("No categorical features found in dataset")
+
+    results_df = _chi_squared_categorical_feature_selection(
+        dataset.df, features, target_column, p_threshold
+    )
+
+    return results_df
 
-    def chi_squared_categorical_feature_selection(
-        self, df, cat_features, target, p_threshold
-    ):
-        # Ensure the columns exist in the dataframe
-        for var in cat_features:
-            if var not in df.columns:
-                raise ValueError(f"The column '{var}' does not exist in the dataframe.")
-        if target not in df.columns:
-            raise ValueError(
-                f"The target column '{target}' does not exist in the dataframe."
-            )
-
-        results = []
-
-        for var in cat_features:
-            # Create a contingency table
-            contingency_table = pd.crosstab(df[var], df[target])
-
-            # Perform the Chi-Square test
-            chi2, p, _, _ = chi2_contingency(contingency_table)
-
-            # Add the result to the list of results
-            results.append(
-                [var, chi2, p, p_threshold, "Pass" if p <= p_threshold else "Fail"]
-            )
-
-        # Convert results to a DataFrame and return
-        results_df = pd.DataFrame(
-            results,
-            columns=[
-                "Variable",
-                "Chi-squared statistic",
-                "p-value",
-                "Threshold",
-                "Pass/Fail",
-            ],
-        )
 
-        # Sort by p-value in ascending order
-        results_df = results_df.sort_values(by="p-value")
-
-        return results_df
-
-    def summary(self, metric_value):
-        chi_squared_results_table = metric_value["chi_squared_results"]
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=chi_squared_results_table,
-                    metadata=ResultTableMetadata(
-                        title="Chi-Squared Test Results for Categorical Features"
-                    ),
-                )
-            ]
+def _chi_squared_categorical_feature_selection(df, features, target, p_threshold):
+
+    results = []
+
+    for var in features:
+        # Create a contingency table
+        contingency_table = pd.crosstab(df[var], df[target])
+
+        # Perform the Chi-Square test
+        chi2, p, _, _ = chi2_contingency(contingency_table)
+
+        # Add the result to the list of results
+        results.append(
+            [var, chi2, p, p_threshold, "Pass" if p <= p_threshold else "Fail"]
         )
+
+    # Convert results to a DataFrame and return
+    results_df = pd.DataFrame(
+        results,
+        columns=[
+            "Variable",
+            "Chi-squared statistic",
+            "p-value",
+            "Threshold",
+            "Pass/Fail",
+        ],
+    )
+
+    # Sort by p-value in ascending order
+    results_df = results_df.sort_values(by="p-value")
+
+    return results_df
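The hunk above is representative of this release's broader refactor: class-based `Metric` subclasses become plain functions registered with the `@tags`/`@tasks` decorators, returning a DataFrame directly instead of going through `cache_results`/`summary`. A minimal usage sketch, assuming the `run_test` entry point (the harness in `validmind/tests/run.py` also changed in this release) and a `vm_dataset` object initialized elsewhere — neither is shown in this diff:

```python
import validmind as vm

# `vm_dataset` is assumed to come from vm.init_dataset(dataset=df, target_column=...)
result = vm.tests.run_test(
    "validmind.data_validation.ChiSquaredFeaturesTable",
    inputs={"dataset": vm_dataset},  # bound to the `dataset` parameter above
    params={"p_threshold": 0.05},    # bound to the `p_threshold` parameter above
)
result.log()  # assumed: sends the result to the ValidMind platform
```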
--- a/validmind/tests/data_validation/ClassImbalance.py
+++ b/validmind/tests/data_validation/ClassImbalance.py
@@ -28,17 +28,20 @@ class ClassImbalance(ThresholdTest):
     """
     Evaluates and quantifies class distribution imbalance in a dataset used by a machine learning model.
 
-    **Purpose**: The ClassImbalance test is designed to evaluate the distribution of target classes in a dataset that's
-    utilized by a machine learning model. Specifically, it aims to ensure that the classes aren't overly skewed, which
-    could lead to bias in the model's predictions. It's crucial to have a balanced training dataset to avoid creating a
-    model that's biased with high accuracy for the majority class and low accuracy for the minority class.
+    ### Purpose
 
-    **Test Mechanism**: This ClassImbalance test operates by calculating the frequency (expressed as a percentage) of
-    each class in the target column of the dataset. It then checks whether each class appears in at least a set minimum
-    percentage of the total records. This minimum percentage is a modifiable parameter, but the default value is set to
-    10%.
+    The Class Imbalance test is designed to evaluate the distribution of target classes in a dataset that's utilized by
+    a machine learning model. Specifically, it aims to ensure that the classes aren't overly skewed, which could lead
+    to bias in the model's predictions. It's crucial to have a balanced training dataset to avoid creating a model
+    that's biased with high accuracy for the majority class and low accuracy for the minority class.
 
-    **Signs of High Risk**:
+    ### Test Mechanism
+
+    This Class Imbalance test operates by calculating the frequency (expressed as a percentage) of each class in the
+    target column of the dataset. It then checks whether each class appears in at least a set minimum percentage of the
+    total records. This minimum percentage is a modifiable parameter, but the default value is set to 10%.
+
+    ### Signs of High Risk
 
     - Any class that represents less than the pre-set minimum percentage threshold is marked as high risk, implying a
     potential class imbalance.
@@ -46,7 +49,7 @@ class ClassImbalance(ThresholdTest):
     - Fundamentally, if any class fails this test, it's highly likely that the dataset possesses imbalanced class
     distribution.
 
-    **Strengths**:
+    ### Strengths
 
     - The test can spot under-represented classes that could affect the efficiency of a machine learning model.
     - The calculation is straightforward and swift.
@@ -56,7 +59,7 @@ class ClassImbalance(ThresholdTest):
     - The test creates a visually insightful plot showing the classes and their corresponding proportions, enhancing
     interpretability and comprehension of the data.
 
-    **Limitations**:
+    ### Limitations
 
     - The test might struggle to perform well or provide vital insights for datasets with a high number of classes. In
     such cases, the imbalance could be inevitable due to the inherent class distribution.
@@ -66,7 +69,7 @@ class ClassImbalance(ThresholdTest):
     different classes, which might fluctuate based on specific applications or domains.
     - While it can identify imbalances in class distribution, it doesn't provide direct methods to address or correct
     these imbalances.
-    - The test is only applicable for classification opearations and unsuitable for regression or clustering tasks.
+    - The test is only applicable for classification operations and unsuitable for regression or clustering tasks.
     """
 
     # Changing the name test to avoid a name clash
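For illustration, the 10% threshold mechanism the ClassImbalance docstring describes can be sketched in a few lines of pandas. This is a simplified approximation of the described behavior, not the library's implementation:

```python
import pandas as pd

def class_imbalance_check(df: pd.DataFrame, target: str, min_percent: float = 10.0) -> pd.DataFrame:
    # Frequency of each class in the target column, expressed as a percentage
    shares = df[target].value_counts(normalize=True) * 100
    return pd.DataFrame({
        "Class": shares.index,
        "Percentage of Rows (%)": shares.values,
        # Each class must appear in at least `min_percent` of the records
        "Pass/Fail": ["Pass" if s >= min_percent else "Fail" for s in shares.values],
    })
```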
--- a/validmind/tests/data_validation/DFGLSArch.py
+++ b/validmind/tests/data_validation/DFGLSArch.py
@@ -17,32 +17,38 @@ logger = get_logger(__name__)
 @dataclass
 class DFGLSArch(Metric):
     """
-    Executes Dickey-Fuller GLS metric to determine order of integration and check stationarity in time series data.
+    Assesses stationarity in time series data using the Dickey-Fuller GLS test to determine the order of integration.
 
-    **Purpose**: The Dickey-Fuller GLS (DFGLS) Arch metric is utilized to determine the order of integration in time
-    series data. For machine learning models dealing with time series and forecasting, this metric evaluates the
-    existence of a unit root, thereby checking whether a time series is non-stationary. This analysis is a crucial
-    initial step when dealing with time series data.
+    ### Purpose
 
-    **Test Mechanism**: This code implements the Dickey-Fuller GLS unit root test on each attribute of the dataset.
-    This process involves iterating through every column of the dataset and applying the DFGLS test to assess the
-    presence of a unit root. The resulting information, including the test statistic ('stat'), the p-value ('pvalue'),
-    the quantity of lagged differences utilized in the regression ('usedlag'), and the number of observations ('nobs'),
-    is subsequently stored.
+    The Dickey-Fuller GLS (DFGLS) test is utilized to determine the order of integration in time series data. For
+    machine learning models dealing with time series and forecasting, this metric evaluates the existence of a unit
+    root, thereby checking whether a time series is non-stationary. This analysis is a crucial initial step when
+    dealing with time series data.
+
+    ### Test Mechanism
+
+    This code implements the Dickey-Fuller GLS unit root test on each attribute of the dataset. This process involves
+    iterating through every column of the dataset and applying the DFGLS test to assess the presence of a unit root.
+    The resulting information, including the test statistic ('stat'), the p-value ('pvalue'), the quantity of lagged
+    differences utilized in the regression ('usedlag'), and the number of observations ('nobs'), is subsequently stored.
+
+    ### Signs of High Risk
 
-    **Signs of High Risk**:
     - A high p-value for the DFGLS test represents a high risk. Specifically, a p-value above a typical threshold of
     0.05 suggests that the time series data is quite likely to be non-stationary, thus presenting a high risk for
     generating unreliable forecasts.
 
-    **Strengths**:
+    ### Strengths
+
     - The Dickey-Fuller GLS test is a potent tool for checking the stationarity of time series data.
     - It helps to verify the assumptions of the models before the actual construction of the machine learning models
     proceeds.
     - The results produced by this metric offer a clear insight into whether the data is appropriate for specific
     machine learning models, especially those demanding the stationarity of time series data.
 
-    **Limitations**:
+    ### Limitations
+
     - Despite its benefits, the DFGLS test does present some drawbacks. It can potentially lead to inaccurate
     conclusions if the time series data incorporates a structural break.
     - If the time series tends to follow a trend while still being stationary, the test might misinterpret it,
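The per-column loop the DFGLSArch docstring describes maps naturally onto the `arch` package's `DFGLS` class; a rough sketch under that assumption (the metric's actual implementation may differ):

```python
import pandas as pd
from arch.unitroot import DFGLS

def dfgls_by_column(df: pd.DataFrame) -> pd.DataFrame:
    rows = []
    for col in df.columns:
        test = DFGLS(df[col].dropna())
        rows.append({
            "Variable": col,
            "stat": test.stat,      # test statistic
            "pvalue": test.pvalue,  # p-value for the unit-root null hypothesis
            "usedlag": test.lags,   # lagged differences used in the regression
            "nobs": test.nobs,      # number of observations
        })
    return pd.DataFrame(rows)
```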
--- a/validmind/tests/data_validation/DatasetDescription.py
+++ b/validmind/tests/data_validation/DatasetDescription.py
@@ -25,42 +25,48 @@ class DatasetDescription(Metric):
     """
     Provides comprehensive analysis and statistical summaries of each field in a machine learning model's dataset.
 
-    **Purpose:**
+    ### Purpose
+
     The test depicted in the script is meant to run a comprehensive analysis on a Machine Learning model's datasets.
     The test or metric is implemented to obtain a complete summary of the fields in the dataset, including vital
     statistics of each field such as count, distinct values, missing values, histograms for numerical, categorical,
     boolean, and text fields. This summary gives a comprehensive overview of the dataset to better understand the
     characteristics of the data that the model is trained on or evaluates.
 
-    **Test Mechanism:**
+    ### Test Mechanism
+
     The DatasetDescription class accomplishes the purpose as follows: firstly, the test method "run" infers the data
-    type of each column in the dataset and stores the details (id, column type). For each field,
+    type of each column in the dataset and stores the details (id, column type). For each field, the
     "describe_dataset_field" method is invoked to collect statistical information about the field, including count,
     missing value count and its proportion to the total, unique value count, and its proportion to the total. Depending
     on the data type of a field, histograms are generated that reflect the distribution of data within the field.
-    Numerical fields use "get_numerical_histograms" method to calculate histogram distribution, whereas for
+    Numerical fields use the "get_numerical_histograms" method to calculate histogram distribution, whereas for
     categorical, boolean and text fields, a histogram is computed with frequencies of each unique value in the
     datasets. For unsupported types, an error is raised. Lastly, a summary table is built to aggregate all the
     statistical insights and histograms of the fields in a dataset.
 
-    **Signs of High Risk:**
-    - High ratio of missing values to total values in one or more fields which may impact quality of the predictions.
+    ### Signs of High Risk
+
+    - High ratio of missing values to total values in one or more fields which may impact the quality of the
+    predictions.
     - Unsupported data types in dataset fields.
     - Large number of unique values in the dataset's fields which might make it harder for the model to establish
     patterns.
     - Extreme skewness or irregular distribution of data as reflected in the histograms.
 
-    **Strengths:**
-    - Provides a detailed analysis of the dataset with versatile summaries like count, unique values, histograms etc.
+    ### Strengths
+
+    - Provides a detailed analysis of the dataset with versatile summaries like count, unique values, histograms, etc.
     - Flexibility in handling different types of data: numerical, categorical, boolean, and text.
     - Useful in detecting problems in the dataset like missing values, unsupported data types, irregular data
-    distribution etc.
+    distribution, etc.
     - The summary gives a comprehensive understanding of dataset features allowing developers to make informed
     decisions.
 
-    **Limitations:**
+    ### Limitations
+
     - The computation can be expensive from a resource standpoint, particularly for large datasets with numerous fields.
-    - The histograms use arbitrary number of bins which may not be the optimal number of bins for specific data
+    - The histograms use an arbitrary number of bins which may not be the optimal number of bins for specific data
     distribution.
     - Unsupported data types for columns will raise an error which may limit evaluating the dataset.
     - Fields with all null or missing values are not included in histogram computation.
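As a rough sketch, the per-field statistics the DatasetDescription docstring lists (count, missing values and their proportion, unique values and their proportion) reduce to a handful of pandas calls; type inference and histogram generation are omitted here, and the function name merely echoes the method the docstring mentions:

```python
import pandas as pd

def describe_dataset_field(df: pd.DataFrame, column: str) -> dict:
    series = df[column]
    n = len(series)
    return {
        "id": column,
        "count": int(series.count()),                          # non-missing values
        "missing": int(series.isna().sum()),                   # missing values
        "pct_missing": float(series.isna().sum()) / n if n else 0.0,
        "n_unique": int(series.nunique()),                     # distinct values
        "pct_unique": series.nunique() / n if n else 0.0,
    }
```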
--- a/validmind/tests/data_validation/DatasetSplit.py
+++ b/validmind/tests/data_validation/DatasetSplit.py
@@ -10,26 +10,28 @@ class DatasetSplit(Metric):
     Evaluates and visualizes the distribution proportions among training, testing, and validation datasets of an ML
     model.
 
-    **Purpose:**
+    ### Purpose
+
     The DatasetSplit test is designed to evaluate and visualize the distribution of data among training, testing, and
     validation datasets, if available, within a given machine learning model. The main purpose is to assess whether the
     model's datasets are split appropriately, as an imbalanced split might affect the model's ability to learn from the
     data and generalize to unseen data.
 
-    **Test Mechanism:**
+    ### Test Mechanism
+
     The DatasetSplit test first calculates the total size of all available datasets in the model. Then, for each
     individual dataset, the methodology involves determining the size of the dataset and its proportion relative to the
     total size. The results are then conveniently summarized in a table that shows dataset names, sizes, and
     proportions. Absolute size and proportion of the total dataset size are displayed for each individual dataset.
 
-    **Signs of High Risk:**
+    ### Signs of High Risk
 
     - A very small training dataset, which may result in the model not learning enough from the data.
     - A very large training dataset and a small test dataset, which may lead to model overfitting and poor
     generalization to unseen data.
     - A small or non-existent validation dataset, which might complicate the model's performance assessment.
 
-    **Strengths:**
+    ### Strengths
 
     - The DatasetSplit test provides a clear, understandable visualization of dataset split proportions, which can
     highlight any potential imbalance in dataset splits quickly.
@@ -37,7 +39,7 @@ class DatasetSplit(Metric):
     - The metric is not tied to any specific data type and is applicable to tabular data, time series data, or text
     data.
 
-    **Limitations:**
+    ### Limitations
 
     - The DatasetSplit test does not provide any insight into the quality or diversity of the data within each split,
     just the size and proportion.
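The size-and-proportion table DatasetSplit describes amounts to the following; representing the model's available datasets as a name-to-DataFrame mapping is an assumption made here for illustration:

```python
import pandas as pd

def dataset_split_summary(datasets: dict[str, pd.DataFrame]) -> pd.DataFrame:
    # Total size across all available datasets (train/test/validation)
    total = sum(len(d) for d in datasets.values())
    return pd.DataFrame([
        {"Dataset": name, "Size": len(d), "Proportion": len(d) / total}
        for name, d in datasets.items()
    ])
```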
--- a/validmind/tests/data_validation/DescriptiveStatistics.py
+++ b/validmind/tests/data_validation/DescriptiveStatistics.py
@@ -16,38 +16,45 @@ class DescriptiveStatistics(Metric):
     Performs a detailed descriptive statistical analysis of both numerical and categorical data within a model's
     dataset.
 
-    **Purpose**: The purpose of the Descriptive Statistics metric is to provide a comprehensive summary of both
-    numerical and categorical data within a dataset. This involves statistics such as count, mean, standard deviation,
-    minimum and maximum values for numerical data. For categorical data, it calculates the count, number of unique
-    values, most common value and its frequency, and the proportion of the most frequent value relative to the total.
-    The goal is to visualize the overall distribution of the variables in the dataset, aiding in understanding the
-    model's behavior and predicting its performance.
-
-    **Test Mechanism**: The testing mechanism utilizes two in-built functions of pandas dataframes: describe() for
-    numerical fields and value_counts() for categorical fields. The describe() function pulls out several summary
-    statistics, while value_counts() accounts for unique values. The resulting data is formatted into two distinct
-    tables, one for numerical and another for categorical variable summaries. These tables provide a clear summary of
-    the main characteristics of the variables, which can be instrumental in assessing the model's performance.
-
-    **Signs of High Risk**:
+    ### Purpose
+
+    The purpose of the Descriptive Statistics metric is to provide a comprehensive summary of both numerical and
+    categorical data within a dataset. This involves statistics such as count, mean, standard deviation, minimum and
+    maximum values for numerical data. For categorical data, it calculates the count, number of unique values, most
+    common value and its frequency, and the proportion of the most frequent value relative to the total. The goal is to
+    visualize the overall distribution of the variables in the dataset, aiding in understanding the model's behavior
+    and predicting its performance.
+
+    ### Test Mechanism
+
+    The testing mechanism utilizes two in-built functions of pandas dataframes: `describe()` for numerical fields and
+    `value_counts()` for categorical fields. The `describe()` function pulls out several summary statistics, while
+    `value_counts()` accounts for unique values. The resulting data is formatted into two distinct tables, one for
+    numerical and another for categorical variable summaries. These tables provide a clear summary of the main
+    characteristics of the variables, which can be instrumental in assessing the model's performance.
+
+    ### Signs of High Risk
+
     - Skewed data or significant outliers can represent high risk. For numerical data, this may be reflected via a
     significant difference between the mean and median (50% percentile).
     - For categorical data, a lack of diversity (low count of unique values), or overdominance of a single category
     (high frequency of the top value) can indicate high risk.
 
-    **Strengths**:
-    - This metric provides a comprehensive summary of the dataset, shedding light on the distribution and
-    characteristics of the variables under consideration.
+    ### Strengths
+
+    - Provides a comprehensive summary of the dataset, shedding light on the distribution and characteristics of the
+    variables under consideration.
     - It is a versatile and robust method, applicable to both numerical and categorical data.
-    - It helps highlight crucial anomalies such as outliers, extreme skewness, or lack of diversity, which are vital in
+    - Helps highlight crucial anomalies such as outliers, extreme skewness, or lack of diversity, which are vital in
     understanding model behavior during testing and validation.
 
-    **Limitations**:
+    ### Limitations
+
     - While this metric offers a high-level overview of the data, it may fail to detect subtle correlations or complex
     patterns.
-    - It does not offer any insights on the relationship between variables.
+    - Does not offer any insights on the relationship between variables.
     - Alone, descriptive statistics cannot be used to infer properties about future unseen data.
-    - It should be used in conjunction with other statistical tests to provide a comprehensive understanding of the
+    - Should be used in conjunction with other statistical tests to provide a comprehensive understanding of the
     model's data.
     """
 
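The two pandas calls the DescriptiveStatistics docstring now names explicitly look like this in isolation — a sketch of the mechanism only, not the metric's exact table formatting:

```python
import pandas as pd

def descriptive_statistics(df: pd.DataFrame):
    # Summary statistics (count, mean, std, min, max, quartiles) for numerical fields
    numerical_summary = df.select_dtypes(include="number").describe().T
    # Frequency of each unique value for every non-numerical field
    categorical_summary = {
        col: df[col].value_counts()
        for col in df.select_dtypes(exclude="number").columns
    }
    return numerical_summary, categorical_summary
```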
--- a/validmind/tests/data_validation/Duplicates.py
+++ b/validmind/tests/data_validation/Duplicates.py
@@ -21,35 +21,43 @@ class Duplicates(ThresholdTest):
     """
     Tests dataset for duplicate entries, ensuring model reliability via data quality verification.
 
-    **Purpose**: The 'Duplicates' metric is designed to check for duplicate rows within the dataset provided to the
-    model. It serves as a measure of data quality, ensuring that the model isn't merely memorizing duplicate entries or
-    being swayed by redundant information. This is an important step in the pre-processing of data for both
-    classification and regression tasks.
-
-    **Test Mechanism**: This metric operates by checking each row for duplicates in the dataset. If a text column is
-    specified in the dataset, the test is conducted on this column; if not, the test is run on all feature columns. The
-    number and percentage of duplicates are calculated and returned in a DataFrame. Additionally, a test is passed if
-    the total count of duplicates falls below a specified minimum threshold.
-
-    **Signs of High Risk**:
-    - A high number of duplicate rows in the dataset. This can lead to overfitting where the model performs well on the
-    training data but poorly on unseen data.
-    - A high percentage of duplicate rows in the dataset. A large proportion of duplicate values could indicate that
-    there's a problem with data collection or processing.
-
-    **Strengths**:
+    ### Purpose
+
+    The 'Duplicates' test is designed to check for duplicate rows within the dataset provided to the model. It serves
+    as a measure of data quality, ensuring that the model isn't merely memorizing duplicate entries or being swayed by
+    redundant information. This is an important step in the pre-processing of data for both classification and
+    regression tasks.
+
+    ### Test Mechanism
+
+    This test operates by checking each row for duplicates in the dataset. If a text column is specified in the
+    dataset, the test is conducted on this column; if not, the test is run on all feature columns. The number and
+    percentage of duplicates are calculated and returned in a DataFrame. Additionally, a test is passed if the total
+    count of duplicates falls below a specified minimum threshold.
+
+    ### Signs of High Risk
+
+    - A high number of duplicate rows in the dataset, which can lead to overfitting where the model performs well on
+    the training data but poorly on unseen data.
+    - A high percentage of duplicate rows in the dataset, indicating potential problems with data collection or
+    processing.
+
+    ### Strengths
+
     - Assists in improving the reliability of the model's training process by ensuring the training data is not
-    contaminated with duplicate entries which can distort statistical analyses.
-    - Provides both absolute number and percentage value of duplicate rows, giving a thorough overview of data quality
+    contaminated with duplicate entries, which can distort statistical analyses.
+    - Provides both absolute numbers and percentage values of duplicate rows, giving a thorough overview of data
+    quality.
     - Highly customizable as it allows for setting a user-defined minimum threshold to determine if the test has been
     passed.
 
-    **Limitations**:
-    - This test does not distinguish between benign duplicates (i.e., coincidental identical entries in different rows)
-    and problematic duplicates originating from data collection or processing errors.
-    - Since the test becomes more computationally intensive as the size of the dataset increases, it might not be
-    suitable for very large datasets.
-    - It can only check for exact duplicates and may miss semantically similar information packaged differently.
+    ### Limitations
+
+    - Does not distinguish between benign duplicates (i.e., coincidental identical entries in different rows) and
+    problematic duplicates originating from data collection or processing errors.
+    - The test becomes more computationally intensive as the size of the dataset increases, which might not be suitable
+    for very large datasets.
+    - Can only check for exact duplicates and may miss semantically similar information packaged differently.
     """
 
     name = "duplicates"