validmind-2.8.10-py3-none-any.whl → validmind-2.8.20-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189)
  1. validmind/__init__.py +6 -5
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +17 -11
  4. validmind/ai/utils.py +2 -2
  5. validmind/api_client.py +75 -32
  6. validmind/client.py +108 -100
  7. validmind/client_config.py +3 -3
  8. validmind/datasets/classification/__init__.py +7 -3
  9. validmind/datasets/credit_risk/lending_club.py +28 -16
  10. validmind/datasets/nlp/cnn_dailymail.py +10 -4
  11. validmind/datasets/regression/__init__.py +22 -5
  12. validmind/errors.py +17 -7
  13. validmind/input_registry.py +1 -1
  14. validmind/logging.py +44 -35
  15. validmind/models/foundation.py +2 -2
  16. validmind/models/function.py +10 -3
  17. validmind/template.py +30 -22
  18. validmind/test_suites/__init__.py +2 -2
  19. validmind/tests/_store.py +13 -4
  20. validmind/tests/comparison.py +65 -33
  21. validmind/tests/data_validation/ACFandPACFPlot.py +4 -1
  22. validmind/tests/data_validation/AutoMA.py +1 -1
  23. validmind/tests/data_validation/BivariateScatterPlots.py +5 -1
  24. validmind/tests/data_validation/BoxPierce.py +3 -1
  25. validmind/tests/data_validation/ClassImbalance.py +4 -2
  26. validmind/tests/data_validation/DatasetDescription.py +3 -24
  27. validmind/tests/data_validation/DescriptiveStatistics.py +1 -1
  28. validmind/tests/data_validation/DickeyFullerGLS.py +1 -1
  29. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +1 -1
  30. validmind/tests/data_validation/HighCardinality.py +5 -1
  31. validmind/tests/data_validation/HighPearsonCorrelation.py +1 -1
  32. validmind/tests/data_validation/IQROutliersBarPlot.py +5 -3
  33. validmind/tests/data_validation/IQROutliersTable.py +5 -2
  34. validmind/tests/data_validation/IsolationForestOutliers.py +5 -4
  35. validmind/tests/data_validation/JarqueBera.py +2 -2
  36. validmind/tests/data_validation/LJungBox.py +2 -2
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
  38. validmind/tests/data_validation/MissingValues.py +14 -10
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +3 -1
  40. validmind/tests/data_validation/MutualInformation.py +2 -1
  41. validmind/tests/data_validation/PearsonCorrelationMatrix.py +1 -1
  42. validmind/tests/data_validation/ProtectedClassesCombination.py +2 -0
  43. validmind/tests/data_validation/ProtectedClassesDescription.py +2 -2
  44. validmind/tests/data_validation/ProtectedClassesDisparity.py +9 -5
  45. validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +10 -2
  46. validmind/tests/data_validation/RollingStatsPlot.py +2 -1
  47. validmind/tests/data_validation/ScoreBandDefaultRates.py +4 -2
  48. validmind/tests/data_validation/SeasonalDecompose.py +1 -1
  49. validmind/tests/data_validation/ShapiroWilk.py +2 -2
  50. validmind/tests/data_validation/Skewness.py +7 -6
  51. validmind/tests/data_validation/SpreadPlot.py +1 -1
  52. validmind/tests/data_validation/TabularCategoricalBarPlots.py +1 -1
  53. validmind/tests/data_validation/TabularDateTimeHistograms.py +1 -1
  54. validmind/tests/data_validation/TargetRateBarPlots.py +4 -1
  55. validmind/tests/data_validation/TimeSeriesFrequency.py +1 -1
  56. validmind/tests/data_validation/TimeSeriesOutliers.py +7 -2
  57. validmind/tests/data_validation/WOEBinPlots.py +1 -1
  58. validmind/tests/data_validation/WOEBinTable.py +1 -1
  59. validmind/tests/data_validation/ZivotAndrewsArch.py +5 -2
  60. validmind/tests/data_validation/nlp/CommonWords.py +1 -1
  61. validmind/tests/data_validation/nlp/Hashtags.py +1 -1
  62. validmind/tests/data_validation/nlp/LanguageDetection.py +1 -1
  63. validmind/tests/data_validation/nlp/Mentions.py +1 -1
  64. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +5 -1
  65. validmind/tests/data_validation/nlp/Punctuations.py +1 -1
  66. validmind/tests/data_validation/nlp/Sentiment.py +3 -1
  67. validmind/tests/data_validation/nlp/TextDescription.py +1 -1
  68. validmind/tests/data_validation/nlp/Toxicity.py +1 -1
  69. validmind/tests/decorator.py +14 -11
  70. validmind/tests/load.py +38 -24
  71. validmind/tests/model_validation/BertScore.py +7 -1
  72. validmind/tests/model_validation/BleuScore.py +7 -1
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +3 -1
  74. validmind/tests/model_validation/ContextualRecall.py +9 -1
  75. validmind/tests/model_validation/FeaturesAUC.py +1 -1
  76. validmind/tests/model_validation/MeteorScore.py +7 -1
  77. validmind/tests/model_validation/ModelPredictionResiduals.py +5 -1
  78. validmind/tests/model_validation/RegardScore.py +6 -1
  79. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -1
  80. validmind/tests/model_validation/RougeScore.py +3 -1
  81. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +2 -0
  82. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +10 -2
  83. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +6 -2
  84. validmind/tests/model_validation/TokenDisparity.py +5 -1
  85. validmind/tests/model_validation/ToxicityScore.py +2 -0
  86. validmind/tests/model_validation/embeddings/ClusterDistribution.py +1 -1
  87. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +5 -1
  88. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +5 -1
  89. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +5 -1
  90. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +2 -0
  91. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +5 -1
  92. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +6 -2
  93. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +3 -1
  94. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -1
  95. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +5 -1
  96. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +5 -1
  97. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +5 -1
  98. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +5 -1
  99. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +6 -1
  100. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -3
  101. validmind/tests/model_validation/ragas/AspectCritic.py +4 -1
  102. validmind/tests/model_validation/ragas/ContextEntityRecall.py +5 -3
  103. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -3
  104. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +5 -3
  105. validmind/tests/model_validation/ragas/ContextRecall.py +5 -3
  106. validmind/tests/model_validation/ragas/Faithfulness.py +5 -3
  107. validmind/tests/model_validation/ragas/NoiseSensitivity.py +1 -1
  108. validmind/tests/model_validation/ragas/ResponseRelevancy.py +5 -3
  109. validmind/tests/model_validation/ragas/SemanticSimilarity.py +5 -3
  110. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +9 -9
  111. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +9 -9
  112. validmind/tests/model_validation/sklearn/CalibrationCurve.py +5 -2
  113. validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.py +28 -5
  114. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +5 -1
  115. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +24 -14
  116. validmind/tests/model_validation/sklearn/CompletenessScore.py +8 -9
  117. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -3
  118. validmind/tests/model_validation/sklearn/FeatureImportance.py +6 -2
  119. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -9
  120. validmind/tests/model_validation/sklearn/HomogeneityScore.py +14 -9
  121. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +4 -2
  122. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +6 -1
  123. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +12 -7
  124. validmind/tests/model_validation/sklearn/MinimumF1Score.py +12 -7
  125. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +21 -6
  126. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +11 -3
  127. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +5 -1
  128. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -1
  129. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +6 -1
  130. validmind/tests/model_validation/sklearn/ROCCurve.py +3 -1
  131. validmind/tests/model_validation/sklearn/RegressionErrors.py +6 -2
  132. validmind/tests/model_validation/sklearn/RegressionPerformance.py +13 -8
  133. validmind/tests/model_validation/sklearn/RegressionR2Square.py +8 -5
  134. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +5 -1
  135. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +34 -26
  136. validmind/tests/model_validation/sklearn/ScoreProbabilityAlignment.py +10 -2
  137. validmind/tests/model_validation/sklearn/SilhouettePlot.py +5 -1
  138. validmind/tests/model_validation/sklearn/VMeasure.py +12 -9
  139. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +15 -10
  140. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +5 -1
  141. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +6 -1
  142. validmind/tests/model_validation/statsmodels/GINITable.py +8 -1
  143. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +2 -2
  144. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +6 -2
  145. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +8 -2
  146. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +3 -1
  147. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +7 -2
  148. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +2 -0
  149. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +2 -0
  150. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +4 -2
  151. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +3 -1
  152. validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py +11 -1
  153. validmind/tests/ongoing_monitoring/ClassificationAccuracyDrift.py +10 -2
  154. validmind/tests/ongoing_monitoring/ConfusionMatrixDrift.py +8 -1
  155. validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py +18 -2
  156. validmind/tests/ongoing_monitoring/FeatureDrift.py +9 -2
  157. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +8 -2
  158. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +13 -2
  159. validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py +13 -2
  160. validmind/tests/ongoing_monitoring/ROCCurveDrift.py +16 -2
  161. validmind/tests/ongoing_monitoring/ScoreBandsDrift.py +11 -2
  162. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +13 -2
  163. validmind/tests/output.py +66 -11
  164. validmind/tests/prompt_validation/Clarity.py +1 -1
  165. validmind/tests/prompt_validation/NegativeInstruction.py +1 -1
  166. validmind/tests/prompt_validation/Robustness.py +6 -1
  167. validmind/tests/prompt_validation/Specificity.py +1 -1
  168. validmind/tests/run.py +28 -14
  169. validmind/tests/test_providers.py +28 -35
  170. validmind/tests/utils.py +17 -4
  171. validmind/unit_metrics/__init__.py +1 -1
  172. validmind/utils.py +295 -31
  173. validmind/vm_models/dataset/dataset.py +19 -16
  174. validmind/vm_models/dataset/utils.py +5 -3
  175. validmind/vm_models/figure.py +6 -6
  176. validmind/vm_models/input.py +6 -5
  177. validmind/vm_models/model.py +5 -5
  178. validmind/vm_models/result/result.py +122 -43
  179. validmind/vm_models/result/utils.py +9 -28
  180. validmind/vm_models/test_suite/__init__.py +5 -0
  181. validmind/vm_models/test_suite/runner.py +5 -5
  182. validmind/vm_models/test_suite/summary.py +20 -2
  183. validmind/vm_models/test_suite/test.py +6 -6
  184. validmind/vm_models/test_suite/test_suite.py +10 -10
  185. {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/METADATA +4 -5
  186. {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/RECORD +189 -188
  187. {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/WHEEL +1 -1
  188. {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/LICENSE +0 -0
  189. {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/entry_points.txt +0 -0
@@ -116,4 +116,4 @@ def AutoMA(dataset: VMDataset, max_ma_order: int = 3):
116
116
  return {
117
117
  "Auto MA Analysis Results": summary_ma_analysis,
118
118
  "Best MA Order Results": best_ma_order,
119
- }, RawData(raw_series_data=df)
119
+ }, RawData(raw_series_data=df, dataset=dataset.input_id)
@@ -80,5 +80,9 @@ def BivariateScatterPlots(dataset):
80
80
  figures.append(fig)
81
81
 
82
82
  return tuple(figures) + (
83
- RawData(selected_numerical_df=df, feature_pairs=features_pairs),
83
+ RawData(
84
+ selected_numerical_df=df,
85
+ feature_pairs=features_pairs,
86
+ dataset=dataset.input_id,
87
+ ),
84
88
  )
@@ -68,4 +68,6 @@ def BoxPierce(dataset):
68
68
  box_pierce_df.reset_index(inplace=True)
69
69
  box_pierce_df.columns = ["column", "stat", "pvalue"]
70
70
 
71
- return box_pierce_df, RawData(box_pierce_values=box_pierce_values)
71
+ return box_pierce_df, RawData(
72
+ box_pierce_values=box_pierce_values, dataset=dataset.input_id
73
+ )
@@ -14,7 +14,9 @@ from validmind.errors import SkipTestError
14
14
  from validmind.vm_models import VMDataset
15
15
 
16
16
 
17
- @tags("tabular_data", "binary_classification", "multiclass_classification")
17
+ @tags(
18
+ "tabular_data", "binary_classification", "multiclass_classification", "data_quality"
19
+ )
18
20
  @tasks("classification")
19
21
  def ClassImbalance(
20
22
  dataset: VMDataset, min_percent_threshold: int = 10
@@ -104,5 +106,5 @@ def ClassImbalance(
104
106
  },
105
107
  go.Figure(data=[trace], layout=layout),
106
108
  all(row["Pass/Fail"] == "Pass" for row in imbalanced_classes),
107
- RawData(imbalance_percentages=imbalance_percentages),
109
+ RawData(imbalance_percentages=imbalance_percentages, dataset=dataset.input_id),
108
110
  )
@@ -6,12 +6,10 @@ import re
6
6
  from collections import Counter
7
7
 
8
8
  import numpy as np
9
- from ydata_profiling.config import Settings
10
- from ydata_profiling.model.typeset import ProfilingTypeSet
11
9
 
12
10
  from validmind import RawData, tags, tasks
13
- from validmind.errors import UnsupportedColumnTypeError
14
11
  from validmind.logging import get_logger
12
+ from validmind.utils import infer_datatypes
15
13
  from validmind.vm_models import VMDataset
16
14
 
17
15
  DEFAULT_HISTOGRAM_BINS = 10
@@ -20,25 +18,6 @@ DEFAULT_HISTOGRAM_BIN_SIZES = [5, 10, 20, 50]
20
18
  logger = get_logger(__name__)
21
19
 
22
20
 
23
- def infer_datatypes(df):
24
- column_type_mappings = {}
25
- typeset = ProfilingTypeSet(Settings())
26
- variable_types = typeset.infer_type(df)
27
-
28
- for column, type in variable_types.items():
29
- if str(type) == "Unsupported":
30
- if df[column].isnull().all():
31
- column_type_mappings[column] = {"id": column, "type": "Null"}
32
- else:
33
- raise UnsupportedColumnTypeError(
34
- f"Unsupported type for column {column}. Please review all values in this dataset column."
35
- )
36
- else:
37
- column_type_mappings[column] = {"id": column, "type": str(type)}
38
-
39
- return list(column_type_mappings.values())
40
-
41
-
42
21
  def get_numerical_histograms(df, column):
43
22
  """
44
23
  Returns a collection of histograms for a numerical column, each one
@@ -50,7 +29,7 @@ def get_numerical_histograms(df, column):
50
29
  # bins='sturges'. Cannot use 'auto' until we review and fix its performance
51
30
  # on datasets with too many unique values
52
31
  #
53
- # 'sturges': Rs default method, only accounts for data size. Only optimal
32
+ # 'sturges': R's default method, only accounts for data size. Only optimal
54
33
  # for gaussian data and underestimates number of bins for large non-gaussian datasets.
55
34
  default_hist = np.histogram(values_cleaned, bins="sturges")
56
35
 
@@ -242,4 +221,4 @@ def DatasetDescription(dataset: VMDataset):
242
221
  }
243
222
  for column in results
244
223
  ]
245
- }, RawData(raw_data=raw_data)
224
+ }, RawData(raw_data=raw_data, dataset=dataset.input_id)
@@ -44,7 +44,7 @@ def get_summary_statistics_categorical(df, categorical_fields):
44
44
  return summary_stats
45
45
 
46
46
 
47
- @tags("tabular_data", "time_series_data")
47
+ @tags("tabular_data", "time_series_data", "data_quality")
48
48
  @tasks("classification", "regression")
49
49
  def DescriptiveStatistics(dataset: VMDataset):
50
50
  """
@@ -97,4 +97,4 @@ def DickeyFullerGLS(dataset: VMDataset):
97
97
 
98
98
  return {
99
99
  "DFGLS Test Results": dfgls_values,
100
- }, RawData(df=df)
100
+ }, RawData(df=df, dataset=dataset.input_id)
@@ -58,7 +58,7 @@ def FeatureTargetCorrelationPlot(dataset, fig_height=600):
58
58
  df, dataset.target_column, fig_height
59
59
  )
60
60
 
61
- return fig, RawData(correlation_data=correlations)
61
+ return fig, RawData(correlation_data=correlations, dataset=dataset.input_id)
62
62
 
63
63
 
64
64
  def _visualize_feature_target_correlation(df, target_column, fig_height):
@@ -83,4 +83,8 @@ def HighCardinality(
83
83
  if not passed:
84
84
  all_passed = False
85
85
 
86
- return table, all_passed, RawData(raw_cardinality_details=raw_data)
86
+ return (
87
+ table,
88
+ all_passed,
89
+ RawData(raw_cardinality_details=raw_data, dataset=dataset.input_id),
90
+ )
@@ -84,5 +84,5 @@ def HighPearsonCorrelation(
84
84
  return (
85
85
  pairs,
86
86
  all(p["Pass/Fail"] == "Pass" for p in pairs),
87
- RawData(correlation_matrix=corr),
87
+ RawData(correlation_matrix=corr, dataset=dataset.input_id),
88
88
  )
@@ -118,11 +118,13 @@ def IQROutliersBarPlot(
118
118
  )
119
119
  figures.append(fig)
120
120
 
121
+ outliers_by_feature = df[dataset.feature_columns_numeric].apply(
122
+ lambda col: compute_outliers(col, threshold)
123
+ )
124
+
121
125
  return (
122
126
  *figures,
123
127
  RawData(
124
- outlier_counts_by_feature=df[dataset.feature_columns_numeric].apply(
125
- lambda col: compute_outliers(col, threshold)
126
- )
128
+ outlier_counts_by_feature=outliers_by_feature, dataset=dataset.input_id
127
129
  ),
128
130
  )
@@ -2,7 +2,7 @@
2
2
  # See the LICENSE file in the root of this repository for details.
3
3
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
4
 
5
- from validmind import tags, tasks
5
+ from validmind import RawData, tags, tasks
6
6
  from validmind.vm_models import VMDataset
7
7
 
8
8
 
@@ -64,6 +64,7 @@ def IQROutliersTable(dataset: VMDataset, threshold: float = 1.5):
64
64
  df = dataset.df
65
65
 
66
66
  outliers_table = []
67
+ all_outliers = {}
67
68
 
68
69
  for col in dataset.feature_columns_numeric:
69
70
  # Skip binary features
@@ -71,6 +72,8 @@ def IQROutliersTable(dataset: VMDataset, threshold: float = 1.5):
71
72
  continue
72
73
 
73
74
  outliers = compute_outliers(df[col], threshold)
75
+ all_outliers[col] = outliers
76
+
74
77
  if outliers.empty:
75
78
  continue
76
79
 
@@ -89,4 +92,4 @@ def IQROutliersTable(dataset: VMDataset, threshold: float = 1.5):
89
92
 
90
93
  return {
91
94
  "Summary of Outliers Detected by IQR Method": outliers_table,
92
- }
95
+ }, RawData(all_outliers=all_outliers, dataset=dataset.input_id)
@@ -8,7 +8,7 @@ import matplotlib.pyplot as plt
8
8
  import seaborn as sns
9
9
  from sklearn.ensemble import IsolationForest
10
10
 
11
- from validmind import tags, tasks
11
+ from validmind import RawData, tags, tasks
12
12
  from validmind.vm_models import VMDataset
13
13
 
14
14
 
@@ -91,6 +91,7 @@ def IsolationForestOutliers(
91
91
 
92
92
  figures.append(fig)
93
93
 
94
- plt.close()
95
-
96
- return tuple(figures)
94
+ return (
95
+ *figures,
96
+ RawData(predictions=y_pred, dataset=dataset.input_id),
97
+ )
@@ -5,7 +5,7 @@
5
5
  import pandas as pd
6
6
  from statsmodels.stats.stattools import jarque_bera
7
7
 
8
- from validmind import tags, tasks
8
+ from validmind import RawData, tags, tasks
9
9
 
10
10
 
11
11
  @tasks("classification", "regression")
@@ -67,4 +67,4 @@ def JarqueBera(dataset):
67
67
  jb_df.reset_index(inplace=True)
68
68
  jb_df.columns = ["column", "stat", "pvalue", "skew", "kurtosis"]
69
69
 
70
- return jb_df
70
+ return jb_df, RawData(jb_values=jb_values, dataset=dataset.input_id)
@@ -5,7 +5,7 @@
5
5
  import pandas as pd
6
6
  from statsmodels.stats.diagnostic import acorr_ljungbox
7
7
 
8
- from validmind import tags, tasks
8
+ from validmind import RawData, tags, tasks
9
9
 
10
10
 
11
11
  @tasks("regression")
@@ -63,4 +63,4 @@ def LJungBox(dataset):
63
63
  ljung_box_df.reset_index(inplace=True)
64
64
  ljung_box_df.columns = ["column", "stat", "pvalue"]
65
65
 
66
- return ljung_box_df
66
+ return ljung_box_df, RawData(ljung_box_df=ljung_box_df, dataset=dataset.input_id)
@@ -101,4 +101,4 @@ def LaggedCorrelationHeatmap(dataset: VMDataset, num_lags: int = 10):
101
101
  xaxis_title="Lags",
102
102
  )
103
103
 
104
- return fig, RawData(correlation_matrix=correlation_df)
104
+ return fig, RawData(correlation_matrix=correlation_df, dataset=dataset.input_id)
@@ -2,7 +2,7 @@
2
2
  # See the LICENSE file in the root of this repository for details.
3
3
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
4
 
5
- from validmind import tags, tasks
5
+ from validmind import RawData, tags, tasks
6
6
  from validmind.vm_models import VMDataset
7
7
 
8
8
 
@@ -49,12 +49,16 @@ def MissingValues(dataset: VMDataset, min_threshold: int = 1):
49
49
  df = dataset.df
50
50
  missing = df.isna().sum()
51
51
 
52
- return [
53
- {
54
- "Column": col,
55
- "Number of Missing Values": missing[col],
56
- "Percentage of Missing Values (%)": missing[col] / df.shape[0] * 100,
57
- "Pass/Fail": "Pass" if missing[col] < min_threshold else "Fail",
58
- }
59
- for col in missing.index
60
- ], all(missing[col] < min_threshold for col in missing.index)
52
+ return (
53
+ [
54
+ {
55
+ "Column": col,
56
+ "Number of Missing Values": missing[col],
57
+ "Percentage of Missing Values (%)": missing[col] / df.shape[0] * 100,
58
+ "Pass/Fail": "Pass" if missing[col] < min_threshold else "Fail",
59
+ }
60
+ for col in missing.index
61
+ ],
62
+ all(missing[col] < min_threshold for col in missing.index),
63
+ RawData(missing_values=missing, dataset=dataset.input_id),
64
+ )
@@ -117,5 +117,7 @@ def MissingValuesBarPlot(
117
117
  height=fig_height,
118
118
  ),
119
119
  ),
120
- RawData(missing_percentages=missing_percentages_sorted),
120
+ RawData(
121
+ missing_percentages=missing_percentages_sorted, dataset=dataset.input_id
122
+ ),
121
123
  )
@@ -123,5 +123,6 @@ def MutualInformation(
123
123
  return fig, RawData(
124
124
  mutual_information_scores={
125
125
  feature: score for feature, score in zip(sorted_features, sorted_scores)
126
- }
126
+ },
127
+ dataset=dataset.input_id,
127
128
  )
@@ -88,4 +88,4 @@ def PearsonCorrelationMatrix(dataset):
88
88
 
89
89
  fig = go.Figure(data=[heatmap], layout=layout)
90
90
 
91
- return fig, RawData(correlation_matrix=corr_matrix)
91
+ return fig, RawData(correlation_matrix=corr_matrix, dataset=dataset.input_id)
@@ -206,5 +206,7 @@ def ProtectedClassesCombination(dataset, model, protected_classes=None):
206
206
  metrics_frame=mf,
207
207
  demographic_parity_ratios=m_dpr,
208
208
  equalized_odds_ratios=m_eqo,
209
+ model=model.input_id,
210
+ dataset=dataset.input_id,
209
211
  ),
210
212
  )
@@ -6,7 +6,7 @@
6
6
  import pandas as pd
7
7
  import plotly.graph_objects as go
8
8
 
9
- from validmind import tags, tasks
9
+ from validmind import RawData, tags, tasks
10
10
  from validmind.logging import get_logger
11
11
 
12
12
  logger = get_logger(__name__)
@@ -127,4 +127,4 @@ def ProtectedClassesDescription(dataset, protected_classes=None):
127
127
  ["Protected Class", "Count"], ascending=[True, False]
128
128
  )
129
129
 
130
- return (stats_df, *figures)
130
+ return (stats_df, *figures, RawData(dataset=dataset.input_id))
@@ -7,7 +7,7 @@ import sys
7
7
 
8
8
  import pandas as pd
9
9
 
10
- from validmind import tags, tasks
10
+ from validmind import RawData, tags, tasks
11
11
  from validmind.errors import MissingDependencyError
12
12
  from validmind.logging import get_logger
13
13
 
@@ -119,7 +119,7 @@ def ProtectedClassesDisparity(
119
119
  mask_significance=True,
120
120
  )
121
121
 
122
- figures = []
122
+ returns = [] # Renamed to 'returns' for clarity
123
123
  for protected_class in protected_classes:
124
124
  plot = ap.disparity(
125
125
  bdf, metrics, protected_class, fairness_threshold=disparity_tolerance
@@ -129,12 +129,16 @@ def ProtectedClassesDisparity(
129
129
  plot.save(
130
130
  buf, format="png"
131
131
  ) # as long as the above library is installed, this will work
132
- figures.append(buf.getvalue())
132
+ returns.append(buf.getvalue())
133
133
 
134
134
  string = "_disparity"
135
135
  metrics_adj = [x + string for x in metrics]
136
136
 
137
137
  table = bdf[["attribute_name", "attribute_value"] + b.list_disparities(bdf)]
138
- figures.append(aqp.plot_disparity_all(bdf, metrics=metrics_adj))
138
+ returns.append(aqp.plot_disparity_all(bdf, metrics=metrics_adj))
139
139
 
140
- return (table, *figures)
140
+ return (
141
+ table,
142
+ *returns,
143
+ RawData(model=model.input_id, dataset=dataset.input_id, disparity_data=bdf),
144
+ )
@@ -8,7 +8,7 @@ import sys
8
8
  import matplotlib.pyplot as plt
9
9
  import pandas as pd
10
10
 
11
- from validmind import tags, tasks
11
+ from validmind import RawData, tags, tasks
12
12
  from validmind.errors import MissingDependencyError
13
13
  from validmind.logging import get_logger
14
14
 
@@ -103,7 +103,15 @@ def ProtectedClassesThresholdOptimizer(
103
103
  test_df, target, y_pred_opt, protected_classes
104
104
  )
105
105
 
106
- return {"DPR and EOR Table": fairness_metrics.reset_index()}, fig
106
+ return (
107
+ {"DPR and EOR Table": fairness_metrics.reset_index()},
108
+ fig,
109
+ RawData(
110
+ y_predictions=y_pred_opt.tolist(),
111
+ dataset=dataset.input_id,
112
+ protected_classes=protected_classes,
113
+ ),
114
+ )
107
115
 
108
116
 
109
117
  def initialize_and_fit_optimizer(pipeline, X_train, y_train, protected_classes_df):
@@ -113,6 +113,7 @@ def RollingStatsPlot(dataset: VMDataset, window_size: int = 12):
113
113
  "rolling_std": dataset.df[col].rolling(window=window_size).std(),
114
114
  }
115
115
  for col in dataset.feature_columns
116
- }
116
+ },
117
+ dataset=dataset.input_id,
117
118
  ),
118
119
  )
@@ -5,7 +5,7 @@
5
5
  import numpy as np
6
6
  import pandas as pd
7
7
 
8
- from validmind import tags, tasks
8
+ from validmind import RawData, tags, tasks
9
9
  from validmind.vm_models import VMDataset, VMModel
10
10
 
11
11
 
@@ -137,4 +137,6 @@ def ScoreBandDefaultRates(
137
137
  }
138
138
  )
139
139
 
140
- return pd.DataFrame(results)
140
+ return pd.DataFrame(results), RawData(
141
+ results=results, model=model.input_id, dataset=dataset.input_id
142
+ )
@@ -166,4 +166,4 @@ def SeasonalDecompose(dataset: VMDataset, seasonal_model: str = "additive"):
166
166
  if not figures:
167
167
  raise SkipTestError("No valid features found for seasonal decomposition")
168
168
 
169
- return (*figures, RawData(decomposed_components=raw_data))
169
+ return (*figures, RawData(decomposed_components=raw_data, dataset=dataset.input_id))
@@ -5,7 +5,7 @@
5
5
  import pandas as pd
6
6
  from scipy import stats
7
7
 
8
- from validmind import tags, tasks
8
+ from validmind import RawData, tags, tasks
9
9
 
10
10
 
11
11
  @tasks("classification", "regression")
@@ -66,4 +66,4 @@ def ShapiroWilk(dataset):
66
66
  sw_df.reset_index(inplace=True)
67
67
  sw_df.columns = ["column", "stat", "pvalue"]
68
68
 
69
- return sw_df
69
+ return sw_df, RawData(shapiro_results=sw_values, dataset=dataset.input_id)
@@ -2,10 +2,8 @@
2
2
  # See the LICENSE file in the root of this repository for details.
3
3
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
4
 
5
- from ydata_profiling.config import Settings
6
- from ydata_profiling.model.typeset import ProfilingTypeSet
7
-
8
5
  from validmind import tags, tasks
6
+ from validmind.utils import infer_datatypes
9
7
 
10
8
 
11
9
  @tags("data_quality", "tabular_data")
@@ -49,8 +47,11 @@ def Skewness(dataset, max_threshold=1):
49
47
  - Subjective threshold for risk grading, requiring expert input and recurrent iterations for refinement.
50
48
  """
51
49
 
52
- typeset = ProfilingTypeSet(Settings())
53
- dataset_types = typeset.infer_type(dataset.df)
50
+ # Use the imported infer_datatypes function
51
+ dataset_types = infer_datatypes(dataset.df)
52
+
53
+ # Convert the list of dictionaries to a dictionary for easy access
54
+ dataset_types_dict = {item["id"]: item["type"] for item in dataset_types}
54
55
 
55
56
  skewness = dataset.df.skew(numeric_only=True)
56
57
 
@@ -58,7 +59,7 @@ def Skewness(dataset, max_threshold=1):
58
59
  passed = True
59
60
 
60
61
  for col in skewness.index:
61
- if str(dataset_types[col]) != "Numeric":
62
+ if dataset_types_dict.get(col) != "Numeric":
62
63
  continue
63
64
 
64
65
  col_skewness = skewness[col]
@@ -95,4 +95,4 @@ def SpreadPlot(dataset: VMDataset):
95
95
  name=f"spread_{var1}_{var2}"
96
96
  )
97
97
 
98
- return (*figures, RawData(spread_data=spread_data))
98
+ return (*figures, RawData(spread_data=spread_data, dataset=dataset.input_id))
@@ -92,4 +92,4 @@ def TabularCategoricalBarPlots(dataset: VMDataset):
92
92
  )
93
93
  figures.append(fig)
94
94
 
95
- return (*figures, RawData(category_counts=counts_dict))
95
+ return (*figures, RawData(category_counts=counts_dict, dataset=dataset.input_id))
@@ -72,4 +72,4 @@ def TabularDateTimeHistograms(dataset: VMDataset):
72
72
  font=dict(size=18),
73
73
  )
74
74
 
75
- return fig, RawData(date_differences=date_diffs)
75
+ return fig, RawData(date_differences=date_diffs, dataset=dataset.input_id)
@@ -108,4 +108,7 @@ def TargetRateBarPlots(dataset: VMDataset):
108
108
 
109
109
  figures.append(fig)
110
110
 
111
- return (*figures, RawData(target_rates_by_category=raw_data))
111
+ return (
112
+ *figures,
113
+ RawData(target_rates_by_category=raw_data, dataset=dataset.input_id),
114
+ )
@@ -107,5 +107,5 @@ def TimeSeriesFrequency(dataset: VMDataset):
107
107
  frequencies,
108
108
  fig,
109
109
  len(set(item["Frequency"] for item in frequencies)) == 1,
110
- RawData(time_diff_days=time_diff_days),
110
+ RawData(time_diff_days=time_diff_days, dataset=dataset.input_id),
111
111
  )
@@ -5,7 +5,7 @@
5
5
  import pandas as pd
6
6
  import plotly.graph_objects as go
7
7
 
8
- from validmind import tags, tasks
8
+ from validmind import RawData, tags, tasks
9
9
  from validmind.errors import SkipTestError
10
10
  from validmind.vm_models import VMDataset
11
11
 
@@ -111,4 +111,9 @@ def TimeSeriesOutliers(dataset: VMDataset, zscore_threshold: int = 3):
111
111
 
112
112
  figures.append(fig)
113
113
 
114
- return (outlier_df.sort_values(["Column", "Date"]), figures, len(outlier_df) == 0)
114
+ return (
115
+ outlier_df.sort_values(["Column", "Date"]),
116
+ figures,
117
+ len(outlier_df) == 0,
118
+ RawData(outliers=outlier_df, dataset=dataset.input_id),
119
+ )
@@ -140,4 +140,4 @@ def WOEBinPlots(
140
140
 
141
141
  figures.append(fig)
142
142
 
143
- return (*figures, RawData(woe_iv_data=woe_iv_df))
143
+ return (*figures, RawData(woe_iv_data=woe_iv_df, dataset=dataset.input_id))
@@ -71,4 +71,4 @@ def WOEBinTable(dataset: VMDataset, breaks_adj: list = None):
71
71
 
72
72
  return {
73
73
  "Weight of Evidence (WoE) and Information Value (IV)": result_table
74
- }, RawData(woe_bins=bins)
74
+ }, RawData(woe_bins=bins, dataset=dataset.input_id)
@@ -6,7 +6,7 @@ import pandas as pd
6
6
  from arch.unitroot import ZivotAndrews
7
7
  from numpy.linalg import LinAlgError
8
8
 
9
- from validmind import tags, tasks
9
+ from validmind import RawData, tags, tasks
10
10
  from validmind.errors import SkipTestError
11
11
  from validmind.logging import get_logger
12
12
  from validmind.vm_models import VMDataset
@@ -83,4 +83,7 @@ def ZivotAndrewsArch(dataset: VMDataset):
83
83
  }
84
84
  )
85
85
 
86
- return {"Zivot-Andrews Test Results": za_values}
86
+ return (
87
+ {"Zivot-Andrews Test Results": za_values},
88
+ RawData(zivot_andrews=za_values, dataset=dataset.input_id),
89
+ )
@@ -94,4 +94,4 @@ def CommonWords(dataset: VMDataset):
94
94
  xaxis_tickangle=-45,
95
95
  )
96
96
 
97
- return fig, RawData(words=x, frequencies=y)
97
+ return fig, RawData(words=x, frequencies=y, dataset=dataset.input_id)
@@ -76,4 +76,4 @@ def Hashtags(dataset: VMDataset, top_hashtags: int = 25):
76
76
  xaxis_tickangle=-45,
77
77
  )
78
78
 
79
- return fig, RawData(top_hashtag_counts=top_hashtag_counts)
79
+ return fig, RawData(top_hashtag_counts=top_hashtag_counts, dataset=dataset.input_id)
@@ -71,5 +71,5 @@ def LanguageDetection(dataset):
71
71
  title="Language Distribution",
72
72
  labels={"x": "Language Codes"},
73
73
  ),
74
- RawData(detected_languages=languages),
74
+ RawData(detected_languages=languages, dataset=dataset.input_id),
75
75
  )
@@ -82,5 +82,5 @@ def Mentions(dataset: VMDataset, top_mentions: int = 25):
82
82
  values="Percentage",
83
83
  title="Tree of Mentions",
84
84
  ),
85
- RawData(mention_counts=mention_counts),
85
+ RawData(mention_counts=mention_counts, dataset=dataset.input_id),
86
86
  )
@@ -144,4 +144,8 @@ def PolarityAndSubjectivity(dataset, threshold_subjectivity=0.5, threshold_polar
144
144
 
145
145
  statistics_tables = {"Quadrant Distribution": quadrant_df, "Statistics": stats_df}
146
146
 
147
- return fig, statistics_tables, RawData(sentiment_data=data)
147
+ return (
148
+ fig,
149
+ statistics_tables,
150
+ RawData(sentiment_data=data, dataset=dataset.input_id),
151
+ )
@@ -65,7 +65,7 @@ def Punctuations(dataset, count_mode="token"):
65
65
  punctuation_counts = _count_punctuations(corpus, count_mode)
66
66
  fig = _create_punctuation_plot(punctuation_counts)
67
67
 
68
- return fig, RawData(punctuation_counts=punctuation_counts)
68
+ return fig, RawData(punctuation_counts=punctuation_counts, dataset=dataset.input_id)
69
69
 
70
70
 
71
71
  def _create_punctuation_plot(punctuation_counts):
@@ -77,4 +77,6 @@ def Sentiment(dataset):
77
77
 
78
78
  plt.close("all")
79
79
 
80
- return fig, RawData(sentiment_scores=vader_sentiment.tolist())
80
+ return fig, RawData(
81
+ sentiment_scores=vader_sentiment.tolist(), dataset=dataset.input_id
82
+ )