validmind 2.5.24__py3-none-any.whl → 2.6.7__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. The information is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.7.dist-info/METADATA +137 -0
  179. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.24.dist-info/METADATA +0 -118
  196. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
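
Several renames in this list (for example AspectCritique.py → AspectCritic.py and AnswerRelevance.py → ResponseRelevancy.py under model_validation/ragas) also change the fully qualified test IDs that calling code refers to. A minimal sketch of the required update, assuming test IDs keep mirroring module paths and that run_test keeps accepting an inputs mapping; my_vm_dataset is a placeholder for an already-initialized validmind dataset, not something defined here:

import validmind as vm

# 2.5.x ID (module renamed in 2.6.x):
#   validmind.model_validation.ragas.AspectCritique
# 2.6.x ID after the rename:
vm.tests.run_test(
    "validmind.model_validation.ragas.AspectCritic",
    inputs={"dataset": my_vm_dataset},  # placeholder VMDataset
)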
validmind/tests/data_validation/ClassImbalance.py

@@ -5,26 +5,20 @@
 """
 Threshold based tests
 """
-from dataclasses import dataclass
-from typing import List
+from typing import Any, Dict, Tuple

-import pandas as pd
 import plotly.graph_objs as go

+from validmind import tags, tasks
 from validmind.errors import SkipTestError
-from validmind.vm_models import (
-    Figure,
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-    VMDataset,
-)
-
-
-@dataclass
-class ClassImbalance(ThresholdTest):
+from validmind.vm_models import VMDataset
+
+
+@tags("tabular_data", "binary_classification", "multiclass_classification")
+@tasks("classification")
+def ClassImbalance(
+    dataset: VMDataset, min_percent_threshold: int = 10
+) -> Tuple[Dict[str, Any], go.Figure, bool]:
     """
     Evaluates and quantifies class distribution imbalance in a dataset used by a machine learning model.

@@ -71,106 +65,43 @@ class ClassImbalance(ThresholdTest):
     these imbalances.
     - The test is only applicable for classification operations and unsuitable for regression or clustering tasks.
     """
-
-    # Changing the name test to avoid a name clash
-    name = "class_imbalance"
-    required_inputs = ["dataset"]
-    default_params = {"min_percent_threshold": 10}
-    tasks = ["classification"]
-    tags = ["tabular_data", "binary_classification", "multiclass_classification"]
-
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=results[0].values,
-                    metadata=ResultTableMetadata(
-                        title=f"Class Imbalance Results for Column {self.inputs.dataset.target_column}"
-                    ),
-                )
-            ]
-        )
-
-    def run(self):
-        # Can only run this test if we have a Dataset object
-        if not isinstance(self.inputs.dataset, VMDataset):
-            raise ValueError("ClassImbalance requires a validmind Dataset object")
-
-        if self.inputs.dataset.target_column is None:
-            print("Skipping class_imbalance test because no target column is defined")
-            return
-
-        target_column = self.inputs.dataset.target_column
-        imbalance_percentages = self.inputs.dataset.df[target_column].value_counts(
-            normalize=True
+    if not dataset.target_column:
+        raise SkipTestError("No target column provided")
+
+    imbalance_percentages = dataset.df[dataset.target_column].value_counts(
+        normalize=True
+    )
+    if len(imbalance_percentages) > 10:
+        raise SkipTestError("Skipping target column with more than 10 classes")
+
+    classes = list(imbalance_percentages.index)
+
+    imbalanced_classes = []
+    for i, percentage in enumerate(imbalance_percentages.values):
+        proportion = percentage * 100
+        imbalanced_classes.append(
+            {
+                dataset.target_column: classes[i],
+                "Percentage of Rows (%)": f"{proportion:.2f}%",
+                "Pass/Fail": "Pass" if proportion > min_percent_threshold else "Fail",
+            }
         )
-        if len(imbalance_percentages) > 10:
-            raise SkipTestError(
-                f"Skipping {self.__class__.__name__} test as"
-                "target column as more than 10 classes"
-            )
-
-        classes = list(imbalance_percentages.index)
-        percentages = list(imbalance_percentages.values)
-
-        # Checking class imbalance
-        imbalanced_classes = []
-        for i, percentage in enumerate(percentages):
-            class_label = classes[i]
-            proportion = percentage * 100
-            passed = proportion > self.params["min_percent_threshold"]
-
-            imbalanced_classes.append(
-                {
-                    target_column: class_label,
-                    "Percentage of Rows (%) ": f"{proportion:.2f}%",
-                    "Pass/Fail": "Pass" if passed else "Fail",
-                }
-            )
-
-        resultset = pd.DataFrame(imbalanced_classes)
-        tests_failed = all(resultset["Pass/Fail"] == "Pass")
-        results = [
-            ThresholdTestResult(
-                column=target_column,
-                passed=tests_failed,
-                values=resultset.to_dict(orient="records"),
-            )
-        ]
-
-        # Create a bar chart trace
-        trace = go.Bar(
-            x=imbalance_percentages.index,
-            y=imbalance_percentages.values,
-        )
-
-        # Create a layout for the chart
-        layout = go.Layout(
-            title=f"Class Imbalance Results for Target Column {self.inputs.dataset.target_column}",
-            xaxis=dict(title="Class"),
-            yaxis=dict(title="Percentage"),
-        )
-
-        # Create a figure and add the trace and layout
-        fig = go.Figure(data=[trace], layout=layout)
-
-        return self.cache_results(
-            results,
-            passed=tests_failed,
-            figures=[
-                Figure(
-                    for_object=self,
-                    key=f"{self.name}",
-                    figure=fig,
-                )
-            ],
-        )
-
-    def test(self):
-        """Unit test for ClassImbalance"""
-        assert self.result is not None
-
-        assert self.result.test_results is not None
-        assert self.result.test_results.passed

-        assert self.result.figures is not None
+    trace = go.Bar(
+        x=imbalance_percentages.index,
+        y=imbalance_percentages.values,
+    )
+
+    layout = go.Layout(
+        title=f"{dataset.target_column} Class Imbalance",
+        xaxis=dict(title="Class"),
+        yaxis=dict(title="Percentage"),
+    )
+
+    return (
+        {
+            f"{dataset.target_column} Class Imbalance": imbalanced_classes,
+        },
+        go.Figure(data=[trace], layout=layout),
+        all(row["Pass/Fail"] == "Pass" for row in imbalanced_classes),
+    )
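
The hunks above replace the class-based ThresholdTest with a plain function decorated with @tags/@tasks that takes a VMDataset and a min_percent_threshold and returns a (table, figure, passed) tuple. The core check is a normalized value_counts pass over the target column; here is a self-contained sketch of that logic against a toy pandas Series, with no validmind objects involved (the column name and data are illustrative):

import pandas as pd
import plotly.graph_objs as go

# Stand-in for dataset.df[dataset.target_column] in the real test
target = pd.Series(["approve"] * 80 + ["decline"] * 20, name="decision")
min_percent_threshold = 10  # same default as the new ClassImbalance signature

# Share of rows per class, expressed as a percentage
percentages = target.value_counts(normalize=True) * 100

rows = [
    {
        "decision": cls,
        "Percentage of Rows (%)": f"{pct:.2f}%",
        "Pass/Fail": "Pass" if pct > min_percent_threshold else "Fail",
    }
    for cls, pct in percentages.items()
]
all_passed = all(row["Pass/Fail"] == "Pass" for row in rows)

# Same style of bar chart the test returns as its figure
fig = go.Figure(
    data=[go.Bar(x=percentages.index, y=percentages.values)],
    layout=go.Layout(
        title="decision Class Imbalance",
        xaxis=dict(title="Class"),
        yaxis=dict(title="Percentage"),
    ),
)
print(all_passed)  # True for this 80/20 split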
validmind/tests/data_validation/DatasetDescription.py

@@ -4,15 +4,15 @@

 import re
 from collections import Counter
-from dataclasses import dataclass

 import numpy as np
 from ydata_profiling.config import Settings
 from ydata_profiling.model.typeset import ProfilingTypeSet

+from validmind import tags, tasks
 from validmind.errors import UnsupportedColumnTypeError
 from validmind.logging import get_logger
-from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
+from validmind.vm_models import VMDataset

 DEFAULT_HISTOGRAM_BINS = 10
 DEFAULT_HISTOGRAM_BIN_SIZES = [5, 10, 20, 50]
@@ -20,37 +20,179 @@ DEFAULT_HISTOGRAM_BIN_SIZES = [5, 10, 20, 50]
 logger = get_logger(__name__)


-@dataclass
-class DatasetDescription(Metric):
+def infer_datatypes(df):
+    column_type_mappings = {}
+    typeset = ProfilingTypeSet(Settings())
+    variable_types = typeset.infer_type(df)
+
+    for column, type in variable_types.items():
+        if str(type) == "Unsupported":
+            if df[column].isnull().all():
+                column_type_mappings[column] = {"id": column, "type": "Null"}
+            else:
+                raise UnsupportedColumnTypeError(
+                    f"Unsupported type for column {column}. Please review all values in this dataset column."
+                )
+        else:
+            column_type_mappings[column] = {"id": column, "type": str(type)}
+
+    return list(column_type_mappings.values())
+
+
+def get_numerical_histograms(df, column):
+    """
+    Returns a collection of histograms for a numerical column, each one
+    with a different bin size
     """
-    Provides comprehensive analysis and statistical summaries of each field in a machine learning model's dataset.
+    values = df[column].to_numpy()
+    values_cleaned = values[~np.isnan(values)]
+
+    # bins='sturges'. Cannot use 'auto' until we review and fix its performance
+    # on datasets with too many unique values
+    #
+    # 'sturges': R’s default method, only accounts for data size. Only optimal
+    # for gaussian data and underestimates number of bins for large non-gaussian datasets.
+    default_hist = np.histogram(values_cleaned, bins="sturges")
+
+    histograms = {
+        "default": {
+            "bin_size": len(default_hist[0]),
+            "histogram": {
+                "bin_edges": default_hist[1].tolist(),
+                "counts": default_hist[0].tolist(),
+            },
+        }
+    }
+
+    for bin_size in DEFAULT_HISTOGRAM_BIN_SIZES:
+        hist = np.histogram(values_cleaned, bins=bin_size)
+        histograms[f"bins_{bin_size}"] = {
+            "bin_size": bin_size,
+            "histogram": {
+                "bin_edges": hist[1].tolist(),
+                "counts": hist[0].tolist(),
+            },
+        }
+
+    return histograms
+
+
+def get_column_histograms(df, column, type_):
+    """
+    Returns a collection of histograms for a numerical or categorical column.
+    We store different combinations of bin sizes to allow analyzing the data better
+
+    Will be used in favor of _get_histogram in the future
+    """
+    # Set the minimum number of bins to nunique if it's less than the default
+    if type_ == "Numeric":
+        return get_numerical_histograms(df, column)
+    elif type_ == "Categorical" or type_ == "Boolean":
+        value_counts = df[column].value_counts()
+        return {
+            "default": {
+                "bin_size": len(value_counts),
+                "histogram": value_counts.to_dict(),
+            }
+        }
+    elif type_ == "Text":
+        # Combine all the text in the specified column
+        text_data = " ".join(df[column].astype(str))
+        # Split the text into words (tokens) using a regular expression
+        words = re.findall(r"\w+", text_data)
+        # Use Counter to count the frequency of each word
+        word_counts = Counter(words)
+
+        return {
+            "default": {
+                "bin_size": len(word_counts),
+                "histogram": dict(word_counts),
+            }
+        }
+    elif type_ == "Null":
+        logger.info(f"Ignoring histogram generation for null column {column}")
+    else:
+        raise ValueError(
+            f"Unsupported column type found when computing its histogram: {type_}"
+        )
+
+
+def describe_column(df, column):
+    """
+    Gets descriptive statistics for a single column in a Pandas DataFrame.
+    """
+    column_type = column["type"]
+
+    # Initialize statistics with count for all column types
+    column["statistics"] = {
+        "count": df[column["id"]].count(),
+        "n_missing": df[column["id"]].isna().sum(),
+        "missing": df[column["id"]].isna().sum() / len(df[column["id"]]),
+        "n_distinct": df[column["id"]].nunique(),
+        "distinct": df[column["id"]].nunique() / len(df[column["id"]]),
+    }
+
+    # Boolean (binary) columns should be reported as categorical
+    if column_type == "Boolean" or df[column["id"]].nunique() == 2:
+        column["type"] = "Categorical"  # Change the type to Categorical
+        top_value = df[column["id"]].value_counts().nlargest(1)
+        column["statistics"].update(
+            {
+                "unique": df[column["id"]].nunique(),
+                "top": top_value.index[0],
+                "freq": top_value.values[0],
+            }
+        )
+    elif column_type == "Numeric":
+        column["statistics"].update(
+            df[column["id"]]
+            .describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95])
+            .to_dict()
+        )
+    elif column_type == "Categorical" or column_type == "Text":
+        column["statistics"].update(
+            df[column["id"]].astype("category").describe().to_dict()
+        )
+
+    column["histograms"] = get_column_histograms(
+        df, column["id"], column["type"]
+    )  # Use updated type
+
+    return column
+
+
+@tags("tabular_data", "time_series_data", "text_data")
+@tasks("classification", "regression", "text_classification", "text_summarization")
+def DatasetDescription(dataset: VMDataset):
+    """
+    Provides comprehensive analysis and statistical summaries of each column in a machine learning model's dataset.

     ### Purpose

     The test depicted in the script is meant to run a comprehensive analysis on a Machine Learning model's datasets.
-    The test or metric is implemented to obtain a complete summary of the fields in the dataset, including vital
-    statistics of each field such as count, distinct values, missing values, histograms for numerical, categorical,
-    boolean, and text fields. This summary gives a comprehensive overview of the dataset to better understand the
+    The test or metric is implemented to obtain a complete summary of the columns in the dataset, including vital
+    statistics of each column such as count, distinct values, missing values, histograms for numerical, categorical,
+    boolean, and text columns. This summary gives a comprehensive overview of the dataset to better understand the
     characteristics of the data that the model is trained on or evaluates.

     ### Test Mechanism

     The DatasetDescription class accomplishes the purpose as follows: firstly, the test method "run" infers the data
-    type of each column in the dataset and stores the details (id, column type). For each field, the
-    "describe_dataset_field" method is invoked to collect statistical information about the field, including count,
+    type of each column in the dataset and stores the details (id, column type). For each column, the
+    "describe_column" method is invoked to collect statistical information about the column, including count,
     missing value count and its proportion to the total, unique value count, and its proportion to the total. Depending
-    on the data type of a field, histograms are generated that reflect the distribution of data within the field.
-    Numerical fields use the "get_numerical_histograms" method to calculate histogram distribution, whereas for
-    categorical, boolean and text fields, a histogram is computed with frequencies of each unique value in the
+    on the data type of a column, histograms are generated that reflect the distribution of data within the column.
+    Numerical columns use the "get_numerical_histograms" method to calculate histogram distribution, whereas for
+    categorical, boolean and text columns, a histogram is computed with frequencies of each unique value in the
     datasets. For unsupported types, an error is raised. Lastly, a summary table is built to aggregate all the
-    statistical insights and histograms of the fields in a dataset.
+    statistical insights and histograms of the columns in a dataset.

     ### Signs of High Risk

-    - High ratio of missing values to total values in one or more fields which may impact the quality of the
+    - High ratio of missing values to total values in one or more columns which may impact the quality of the
     predictions.
-    - Unsupported data types in dataset fields.
-    - Large number of unique values in the dataset's fields which might make it harder for the model to establish
+    - Unsupported data types in dataset columns.
+    - Large number of unique values in the dataset's columns which might make it harder for the model to establish
     patterns.
     - Extreme skewness or irregular distribution of data as reflected in the histograms.

@@ -65,201 +207,30 @@ class DatasetDescription(Metric):

     ### Limitations

-    - The computation can be expensive from a resource standpoint, particularly for large datasets with numerous fields.
+    - The computation can be expensive from a resource standpoint, particularly for large datasets with numerous columns.
     - The histograms use an arbitrary number of bins which may not be the optimal number of bins for specific data
     distribution.
     - Unsupported data types for columns will raise an error which may limit evaluating the dataset.
-    - Fields with all null or missing values are not included in histogram computation.
+    - Columns with all null or missing values are not included in histogram computation.
     - This test only validates the quality of the dataset but doesn't address the model's performance directly.
     """
-
-    name = "dataset_description"
-    required_inputs = ["dataset"]
-    tasks = [
-        "classification",
-        "regression",
-        "text_classification",
-        "text_summarization",
-    ]
-    tags = ["tabular_data", "time_series_data", "text_data"]
-
-    def summary(self, metric_value):
-        """
-        Build a dataset summary table. metric_value is a list of fields where each field
-        has an id, type (Numeric or Categorical), and statistics. The statistics object
-        depends on the type being Numeric or Categorical. For Numeric fields, it has
-        the following keys: count, mean, std, min, 25%, 50%, 75%, 90%, 95%, max. For
-        categorical fields, it has the following keys: count, unique, top, freq.
-        """
-        results_table = []
-        for field in metric_value:
-            field_id = field["id"]
-            field_type = field["type"]
-            field_statistics = field["statistics"]
-
-            results_table.append(
-                {
-                    "Name": field_id,
-                    "Type": field_type,
-                    "Count": field_statistics["count"],
-                    "Missing": field_statistics["n_missing"],
-                    "Missing %": field_statistics["missing"],
-                    "Distinct": field_statistics["n_distinct"],
-                    "Distinct %": field_statistics["distinct"],
-                }
-            )
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=results_table,
-                    metadata=ResultTableMetadata(title="Dataset Description"),
-                )
-            ]
-        )
-
-    def run(self):
-        results = []
-        for ds_field in self.infer_datatype(self.inputs.dataset.df):
-            self.describe_dataset_field(self.inputs.dataset.df, ds_field)
-            results.append(ds_field)
-        return self.cache_results(results)
-
-    def infer_datatype(self, df):
-        vm_dataset_variables = {}
-        typeset = ProfilingTypeSet(Settings())
-        variable_types = typeset.infer_type(df)
-
-        for column, type in variable_types.items():
-            if str(type) == "Unsupported":
-                if df[column].isnull().all():
-                    vm_dataset_variables[column] = {"id": column, "type": "Null"}
-                else:
-                    raise UnsupportedColumnTypeError(
-                        f"Unsupported type for column {column}. Please review all values in this dataset column."
-                    )
-            else:
-                vm_dataset_variables[column] = {"id": column, "type": str(type)}
-
-        return list(vm_dataset_variables.values())
-
-    def describe_dataset_field(self, df, field):
-        """
-        Gets descriptive statistics for a single field in a Pandas DataFrame.
-        """
-        field_type = field["type"]
-
-        # - When we call describe on one field at a time, Pandas will
-        # know better if it needs to report on numerical or categorical statistics
-        # - Boolean (binary) fields should be reported as categorical
-        # (force to categorical when nunique == 2)
-        if field_type == ["Boolean"] or df[field["id"]].nunique() == 2:
-            top_value = df[field["id"]].value_counts().nlargest(1)
-
-            field["statistics"] = {
-                "count": df[field["id"]].count(),
-                "unique": df[field["id"]].nunique(),
-                "top": top_value.index[0],
-                "freq": top_value.values[0],
-            }
-        elif field_type == "Numeric":
-            field["statistics"] = (
-                df[field["id"]]
-                .describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95])
-                .to_dict()
-            )
-        elif field_type == "Categorical" or field_type == "Text":
-            field["statistics"] = (
-                df[field["id"]].astype("category").describe().to_dict()
-            )
-
-        # Initialize statistics object for non-numeric or categorical fields
-        if "statistics" not in field:
-            field["statistics"] = {}
-
-        field["statistics"]["n_missing"] = df[field["id"]].isna().sum()
-        field["statistics"]["missing"] = field["statistics"]["n_missing"] / len(
-            df[field["id"]]
-        )
-        field["statistics"]["n_distinct"] = df[field["id"]].nunique()
-        field["statistics"]["distinct"] = field["statistics"]["n_distinct"] / len(
-            df[field["id"]]
-        )
-
-        field["histograms"] = self.get_field_histograms(df, field["id"], field_type)
-
-    def get_field_histograms(self, df, field, type_):
-        """
-        Returns a collection of histograms for a numerical or categorical field.
-        We store different combinations of bin sizes to allow analyzing the data better
-
-        Will be used in favor of _get_histogram in the future
-        """
-        # Set the minimum number of bins to nunique if it's less than the default
-        if type_ == "Numeric":
-            return self.get_numerical_histograms(df, field)
-        elif type_ == "Categorical" or type_ == "Boolean":
-            value_counts = df[field].value_counts()
-            return {
-                "default": {
-                    "bin_size": len(value_counts),
-                    "histogram": value_counts.to_dict(),
-                }
-            }
-        elif type_ == "Text":
-            # Combine all the text in the specified field
-            text_data = " ".join(df[field].astype(str))
-            # Split the text into words (tokens) using a regular expression
-            words = re.findall(r"\w+", text_data)
-            # Use Counter to count the frequency of each word
-            word_counts = Counter(words)
-
-            return {
-                "default": {
-                    "bin_size": len(word_counts),
-                    "histogram": dict(word_counts),
-                }
+    df = dataset.df
+
+    results = []
+    for column in infer_datatypes(df):
+        results.append(describe_column(df, column))
+
+    return {
+        "Dataset Description": [
+            {
+                "Name": column["id"],
+                "Type": column["type"],
+                "Count": column["statistics"]["count"],
+                "Missing": column["statistics"]["n_missing"],
+                "Missing %": column["statistics"]["missing"],
+                "Distinct": column["statistics"]["n_distinct"],
+                "Distinct %": column["statistics"]["distinct"],
            }
-        elif type_ == "Null":
-            logger.info(f"Ignoring histogram generation for null column {field}")
-        else:
-            raise ValueError(
-                f"Unsupported field type found when computing its histogram: {type_}"
-            )
-
-    def get_numerical_histograms(self, df, field):
-        """
-        Returns a collection of histograms for a numerical field, each one
-        with a different bin size
-        """
-        values = df[field].to_numpy()
-        values_cleaned = values[~np.isnan(values)]
-
-        # bins='sturges'. Cannot use 'auto' until we review and fix its performance
-        # on datasets with too many unique values
-        #
-        # 'sturges': R’s default method, only accounts for data size. Only optimal
-        # for gaussian data and underestimates number of bins for large non-gaussian datasets.
-        default_hist = np.histogram(values_cleaned, bins="sturges")
-
-        histograms = {
-            "default": {
-                "bin_size": len(default_hist[0]),
-                "histogram": {
-                    "bin_edges": default_hist[1].tolist(),
-                    "counts": default_hist[0].tolist(),
-                },
-            }
-        }
-
-        for bin_size in DEFAULT_HISTOGRAM_BIN_SIZES:
-            hist = np.histogram(values_cleaned, bins=bin_size)
-            histograms[f"bins_{bin_size}"] = {
-                "bin_size": bin_size,
-                "histogram": {
-                    "bin_edges": hist[1].tolist(),
-                    "counts": hist[0].tolist(),
-                },
-            }
-
-        return histograms
+            for column in results
+        ]
+    }
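
The DatasetDescription refactor above moves the histogram logic into module-level helpers; get_numerical_histograms drops NaNs, builds a default histogram with Sturges' rule, and then adds fixed-size variants from DEFAULT_HISTOGRAM_BIN_SIZES. A self-contained sketch of that binning step on a toy numeric column (the data and variable names are illustrative):

import numpy as np
import pandas as pd

DEFAULT_HISTOGRAM_BIN_SIZES = [5, 10, 20, 50]  # same constant the module defines

# Toy numeric column with a missing value, standing in for df[column]
values = pd.Series([1.0, 2.5, 2.5, 3.0, np.nan, 4.2, 5.5]).to_numpy()
values_cleaned = values[~np.isnan(values)]

# Default histogram uses Sturges' rule, as in get_numerical_histograms
counts, bin_edges = np.histogram(values_cleaned, bins="sturges")
histograms = {
    "default": {
        "bin_size": len(counts),
        "histogram": {"bin_edges": bin_edges.tolist(), "counts": counts.tolist()},
    }
}

# Fixed-size variants let the report show coarser and finer views of the same column
for bin_size in DEFAULT_HISTOGRAM_BIN_SIZES:
    counts, bin_edges = np.histogram(values_cleaned, bins=bin_size)
    histograms[f"bins_{bin_size}"] = {
        "bin_size": bin_size,
        "histogram": {"bin_edges": bin_edges.tolist(), "counts": counts.tolist()},
    }

print(histograms["default"]["bin_size"], list(histograms))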