validmind-2.7.6-py3-none-any.whl → validmind-2.7.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. validmind/__init__.py +2 -0
  2. validmind/__version__.py +1 -1
  3. validmind/api_client.py +8 -1
  4. validmind/datasets/credit_risk/lending_club.py +3 -4
  5. validmind/html_templates/content_blocks.py +1 -1
  6. validmind/tests/__types__.py +17 -0
  7. validmind/tests/data_validation/ACFandPACFPlot.py +6 -2
  8. validmind/tests/data_validation/AutoMA.py +2 -2
  9. validmind/tests/data_validation/BivariateScatterPlots.py +4 -2
  10. validmind/tests/data_validation/BoxPierce.py +2 -2
  11. validmind/tests/data_validation/ClassImbalance.py +2 -1
  12. validmind/tests/data_validation/DatasetDescription.py +11 -2
  13. validmind/tests/data_validation/DatasetSplit.py +2 -2
  14. validmind/tests/data_validation/DickeyFullerGLS.py +2 -2
  15. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +8 -2
  16. validmind/tests/data_validation/HighCardinality.py +9 -2
  17. validmind/tests/data_validation/HighPearsonCorrelation.py +6 -2
  18. validmind/tests/data_validation/IQROutliersBarPlot.py +9 -2
  19. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +2 -2
  20. validmind/tests/data_validation/MissingValuesBarPlot.py +12 -9
  21. validmind/tests/data_validation/MutualInformation.py +6 -8
  22. validmind/tests/data_validation/PearsonCorrelationMatrix.py +2 -2
  23. validmind/tests/data_validation/ProtectedClassesCombination.py +6 -1
  24. validmind/tests/data_validation/ProtectedClassesDescription.py +1 -1
  25. validmind/tests/data_validation/ProtectedClassesDisparity.py +4 -5
  26. validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +1 -4
  27. validmind/tests/data_validation/RollingStatsPlot.py +21 -10
  28. validmind/tests/data_validation/ScatterPlot.py +3 -5
  29. validmind/tests/data_validation/ScoreBandDefaultRates.py +2 -1
  30. validmind/tests/data_validation/SeasonalDecompose.py +12 -2
  31. validmind/tests/data_validation/Skewness.py +6 -3
  32. validmind/tests/data_validation/SpreadPlot.py +8 -3
  33. validmind/tests/data_validation/TabularCategoricalBarPlots.py +4 -2
  34. validmind/tests/data_validation/TabularDateTimeHistograms.py +2 -2
  35. validmind/tests/data_validation/TargetRateBarPlots.py +4 -3
  36. validmind/tests/data_validation/TimeSeriesFrequency.py +7 -2
  37. validmind/tests/data_validation/TimeSeriesMissingValues.py +14 -10
  38. validmind/tests/data_validation/TimeSeriesOutliers.py +1 -5
  39. validmind/tests/data_validation/WOEBinPlots.py +2 -2
  40. validmind/tests/data_validation/WOEBinTable.py +11 -9
  41. validmind/tests/data_validation/nlp/CommonWords.py +2 -2
  42. validmind/tests/data_validation/nlp/Hashtags.py +2 -2
  43. validmind/tests/data_validation/nlp/LanguageDetection.py +9 -6
  44. validmind/tests/data_validation/nlp/Mentions.py +9 -6
  45. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -2
  46. validmind/tests/data_validation/nlp/Punctuations.py +4 -2
  47. validmind/tests/data_validation/nlp/Sentiment.py +2 -2
  48. validmind/tests/data_validation/nlp/StopWords.py +5 -4
  49. validmind/tests/data_validation/nlp/TextDescription.py +2 -2
  50. validmind/tests/data_validation/nlp/Toxicity.py +2 -2
  51. validmind/tests/model_validation/BertScore.py +2 -2
  52. validmind/tests/model_validation/BleuScore.py +2 -2
  53. validmind/tests/model_validation/ClusterSizeDistribution.py +2 -2
  54. validmind/tests/model_validation/ContextualRecall.py +2 -2
  55. validmind/tests/model_validation/FeaturesAUC.py +2 -2
  56. validmind/tests/model_validation/MeteorScore.py +2 -2
  57. validmind/tests/model_validation/ModelPredictionResiduals.py +2 -2
  58. validmind/tests/model_validation/RegardScore.py +6 -2
  59. validmind/tests/model_validation/RegressionResidualsPlot.py +4 -3
  60. validmind/tests/model_validation/RougeScore.py +6 -5
  61. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +11 -2
  62. validmind/tests/model_validation/TokenDisparity.py +2 -2
  63. validmind/tests/model_validation/ToxicityScore.py +10 -2
  64. validmind/tests/model_validation/embeddings/ClusterDistribution.py +9 -3
  65. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +16 -2
  66. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +5 -3
  67. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +2 -2
  68. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +14 -4
  69. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +2 -2
  70. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +16 -2
  71. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +2 -2
  72. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -5
  73. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +4 -2
  74. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +4 -2
  75. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +4 -2
  76. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +4 -2
  77. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +8 -6
  78. validmind/tests/model_validation/embeddings/utils.py +11 -1
  79. validmind/tests/model_validation/ragas/AnswerCorrectness.py +2 -1
  80. validmind/tests/model_validation/ragas/AspectCritic.py +11 -7
  81. validmind/tests/model_validation/ragas/ContextEntityRecall.py +2 -1
  82. validmind/tests/model_validation/ragas/ContextPrecision.py +2 -1
  83. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +2 -1
  84. validmind/tests/model_validation/ragas/ContextRecall.py +2 -1
  85. validmind/tests/model_validation/ragas/Faithfulness.py +2 -1
  86. validmind/tests/model_validation/ragas/NoiseSensitivity.py +2 -1
  87. validmind/tests/model_validation/ragas/ResponseRelevancy.py +2 -1
  88. validmind/tests/model_validation/ragas/SemanticSimilarity.py +2 -1
  89. validmind/tests/model_validation/sklearn/CalibrationCurve.py +3 -2
  90. validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.py +2 -5
  91. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +5 -2
  92. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +2 -2
  93. validmind/tests/model_validation/sklearn/FeatureImportance.py +1 -14
  94. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +6 -3
  95. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +2 -2
  96. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +8 -4
  97. validmind/tests/model_validation/sklearn/ModelParameters.py +1 -0
  98. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -3
  99. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +2 -2
  100. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +20 -16
  101. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +4 -2
  102. validmind/tests/model_validation/sklearn/ROCCurve.py +1 -1
  103. validmind/tests/model_validation/sklearn/RegressionR2Square.py +7 -9
  104. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +1 -3
  105. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +2 -1
  106. validmind/tests/model_validation/sklearn/ScoreProbabilityAlignment.py +2 -1
  107. validmind/tests/model_validation/sklearn/SilhouettePlot.py +5 -3
  108. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +9 -1
  109. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +1 -1
  110. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +11 -4
  111. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -3
  112. validmind/tests/model_validation/statsmodels/GINITable.py +7 -15
  113. validmind/tests/model_validation/statsmodels/Lilliefors.py +2 -2
  114. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +1 -1
  115. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +2 -2
  116. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +5 -2
  117. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +5 -2
  118. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +7 -7
  119. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +2 -2
  120. validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py +3 -1
  121. validmind/tests/ongoing_monitoring/ClassDiscriminationDrift.py +4 -2
  122. validmind/tests/ongoing_monitoring/ClassImbalanceDrift.py +4 -2
  123. validmind/tests/ongoing_monitoring/ClassificationAccuracyDrift.py +3 -1
  124. validmind/tests/ongoing_monitoring/ConfusionMatrixDrift.py +3 -1
  125. validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py +3 -1
  126. validmind/tests/ongoing_monitoring/FeatureDrift.py +1 -0
  127. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +1 -0
  128. validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py +3 -1
  129. validmind/tests/ongoing_monitoring/PredictionQuantilesAcrossFeatures.py +1 -0
  130. validmind/tests/ongoing_monitoring/ROCCurveDrift.py +3 -2
  131. validmind/tests/ongoing_monitoring/ScoreBandsDrift.py +4 -2
  132. validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py +3 -1
  133. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -3
  134. validmind/tests/prompt_validation/Bias.py +13 -9
  135. validmind/tests/prompt_validation/Clarity.py +13 -9
  136. validmind/tests/prompt_validation/Conciseness.py +13 -9
  137. validmind/tests/prompt_validation/Delimitation.py +13 -9
  138. validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
  139. validmind/tests/prompt_validation/Robustness.py +6 -2
  140. validmind/tests/prompt_validation/Specificity.py +13 -9
  141. validmind/tests/run.py +6 -0
  142. validmind/utils.py +7 -8
  143. {validmind-2.7.6.dist-info → validmind-2.7.7.dist-info}/METADATA +1 -2
  144. {validmind-2.7.6.dist-info → validmind-2.7.7.dist-info}/RECORD +147 -147
  145. {validmind-2.7.6.dist-info → validmind-2.7.7.dist-info}/WHEEL +1 -1
  146. {validmind-2.7.6.dist-info → validmind-2.7.7.dist-info}/LICENSE +0 -0
  147. {validmind-2.7.6.dist-info → validmind-2.7.7.dist-info}/entry_points.txt +0 -0
@@ -9,7 +9,7 @@ from plotly.subplots import make_subplots
9
9
  from scipy import stats
10
10
  from statsmodels.tsa.seasonal import seasonal_decompose
11
11
 
12
- from validmind import tags, tasks
12
+ from validmind import RawData, tags, tasks
13
13
  from validmind.errors import SkipTestError
14
14
  from validmind.logging import get_logger
15
15
  from validmind.vm_models import VMDataset
@@ -65,6 +65,8 @@ def SeasonalDecompose(dataset: VMDataset, seasonal_model: str = "additive"):
65
65
 
66
66
  figures = []
67
67
 
68
+ raw_data = {}
69
+
68
70
  for col in df.columns:
69
71
  series = df[col].dropna()
70
72
 
@@ -153,7 +155,15 @@ def SeasonalDecompose(dataset: VMDataset, seasonal_model: str = "additive"):
153
155
 
154
156
  figures.append(fig)
155
157
 
158
+ # Add the decomposed components to raw_data
159
+ raw_data[col] = {
160
+ "observed": sd.observed,
161
+ "trend": sd.trend,
162
+ "seasonal": sd.seasonal,
163
+ "residuals": sd.resid,
164
+ }
165
+
156
166
  if not figures:
157
167
  raise SkipTestError("No valid features found for seasonal decomposition")
158
168
 
159
- return tuple(figures)
169
+ return (*figures, RawData(decomposed_components=raw_data))
@@ -73,6 +73,9 @@ def Skewness(dataset, max_threshold=1):
73
73
  }
74
74
  )
75
75
 
76
- return {
77
- "Skewness Results for Dataset": results_table,
78
- }, passed
76
+ return (
77
+ {
78
+ "Skewness Results for Dataset": results_table,
79
+ },
80
+ passed,
81
+ )
@@ -6,7 +6,7 @@ import matplotlib.pyplot as plt
6
6
  import pandas as pd
7
7
  import seaborn as sns
8
8
 
9
- from validmind import tags, tasks
9
+ from validmind import RawData, tags, tasks
10
10
  from validmind.errors import SkipTestError
11
11
  from validmind.vm_models import VMDataset
12
12
 
@@ -70,6 +70,7 @@ def SpreadPlot(dataset: VMDataset):
70
70
  ]
71
71
 
72
72
  figures = []
73
+ spread_data = {}
73
74
 
74
75
  for var1, var2 in feature_pairs:
75
76
  fig, ax = plt.subplots()
@@ -80,8 +81,9 @@ def SpreadPlot(dataset: VMDataset):
80
81
  y=0.95,
81
82
  )
82
83
 
84
+ spread_series = df[var1] - df[var2]
83
85
  sns.lineplot(
84
- data=df[var1] - df[var2],
86
+ data=spread_series,
85
87
  ax=ax,
86
88
  )
87
89
 
@@ -89,5 +91,8 @@ def SpreadPlot(dataset: VMDataset):
89
91
  ax.tick_params(axis="both", labelsize=18)
90
92
 
91
93
  figures.append(fig)
94
+ spread_data[f"{var1}_{var2}_spread"] = spread_series.to_frame(
95
+ name=f"spread_{var1}_{var2}"
96
+ )
92
97
 
93
- return tuple(figures)
98
+ return (*figures, RawData(spread_data=spread_data))
@@ -4,7 +4,7 @@
4
4
 
5
5
  import plotly.graph_objs as go
6
6
 
7
- from validmind import tags, tasks
7
+ from validmind import RawData, tags, tasks
8
8
  from validmind.errors import SkipTestError
9
9
  from validmind.vm_models import VMDataset
10
10
 
@@ -66,9 +66,11 @@ def TabularCategoricalBarPlots(dataset: VMDataset):
66
66
  ]
67
67
 
68
68
  figures = []
69
+ counts_dict = {}
69
70
 
70
71
  for col in dataset.feature_columns_categorical:
71
72
  counts = dataset.df[col].value_counts()
73
+ counts_dict[col] = counts
72
74
 
73
75
  fig = go.Figure()
74
76
  fig.add_trace(
@@ -90,4 +92,4 @@ def TabularCategoricalBarPlots(dataset: VMDataset):
90
92
  )
91
93
  figures.append(fig)
92
94
 
93
- return tuple(figures)
95
+ return (*figures, RawData(category_counts=counts_dict))
@@ -5,7 +5,7 @@
5
5
  import pandas as pd
6
6
  import plotly.graph_objects as go
7
7
 
8
- from validmind import tags, tasks
8
+ from validmind import RawData, tags, tasks
9
9
  from validmind.errors import SkipTestError
10
10
  from validmind.vm_models import VMDataset
11
11
 
@@ -72,4 +72,4 @@ def TabularDateTimeHistograms(dataset: VMDataset):
72
72
  font=dict(size=18),
73
73
  )
74
74
 
75
- return fig
75
+ return fig, RawData(date_differences=date_diffs)
@@ -6,7 +6,7 @@ import numpy as np
6
6
  import plotly.graph_objs as go
7
7
  from plotly.subplots import make_subplots
8
8
 
9
- from validmind import tags, tasks
9
+ from validmind import RawData, tags, tasks
10
10
  from validmind.errors import SkipTestError
11
11
  from validmind.vm_models import VMDataset
12
12
 
@@ -62,12 +62,13 @@ def TargetRateBarPlots(dataset: VMDataset):
62
62
 
63
63
  df = dataset.df
64
64
  figures = []
65
+ raw_data = []
65
66
 
66
67
  for col in dataset.feature_columns_categorical:
67
-
68
68
  # Calculate counts and default rate for each category
69
69
  counts = df[col].value_counts()
70
70
  default_rate = df.groupby(col)[dataset.target_column].mean()
71
+ raw_data.append({"column": col, "counts": counts, "default_rate": default_rate})
71
72
 
72
73
  fig = make_subplots(
73
74
  rows=1,
@@ -107,4 +108,4 @@ def TargetRateBarPlots(dataset: VMDataset):
107
108
 
108
109
  figures.append(fig)
109
110
 
110
- return tuple(figures)
111
+ return (*figures, RawData(target_rates_by_category=raw_data))
@@ -5,7 +5,7 @@
5
5
  import pandas as pd
6
6
  import plotly.graph_objects as go
7
7
 
8
- from validmind import tags, tasks
8
+ from validmind import RawData, tags, tasks
9
9
  from validmind.errors import SkipTestError
10
10
  from validmind.vm_models import VMDataset
11
11
 
@@ -103,4 +103,9 @@ def TimeSeriesFrequency(dataset: VMDataset):
103
103
  ),
104
104
  )
105
105
 
106
- return frequencies, fig, len(set(item["Frequency"] for item in frequencies)) == 1
106
+ return (
107
+ frequencies,
108
+ fig,
109
+ len(set(item["Frequency"] for item in frequencies)) == 1,
110
+ RawData(time_diff_days=time_diff_days),
111
+ )
@@ -6,7 +6,7 @@ import pandas as pd
6
6
  import plotly.express as px
7
7
  import plotly.figure_factory as ff
8
8
 
9
- from validmind import tags, tasks
9
+ from validmind import RawData, tags, tasks
10
10
  from validmind.errors import SkipTestError
11
11
  from validmind.vm_models import VMDataset
12
12
 
@@ -62,15 +62,18 @@ def TimeSeriesMissingValues(dataset: VMDataset, min_threshold: int = 1):
62
62
 
63
63
  if sum(missing.values) == 0:
64
64
  # if theres no missing values, no need to plot anything
65
- return [
66
- {
67
- "Column": col,
68
- "Number of Missing Values": missing[col],
69
- "Percentage of Missing Values (%)": 0,
70
- "Pass/Fail": "Pass",
71
- }
72
- for col in missing.index
73
- ], True
65
+ return (
66
+ [
67
+ {
68
+ "Column": col,
69
+ "Number of Missing Values": missing[col],
70
+ "Percentage of Missing Values (%)": 0,
71
+ "Pass/Fail": "Pass",
72
+ }
73
+ for col in missing.index
74
+ ],
75
+ True,
76
+ )
74
77
 
75
78
  barplot = px.bar(
76
79
  missing,
@@ -110,4 +113,5 @@ def TimeSeriesMissingValues(dataset: VMDataset, min_threshold: int = 1):
110
113
  barplot,
111
114
  heatmap,
112
115
  all(missing[col] < min_threshold for col in missing.index),
116
+ RawData(missing_values_count=missing, missing_values_mask=missing_mask),
113
117
  )
@@ -111,8 +111,4 @@ def TimeSeriesOutliers(dataset: VMDataset, zscore_threshold: int = 3):
111
111
 
112
112
  figures.append(fig)
113
113
 
114
- return (
115
- outlier_df.sort_values(["Column", "Date"]),
116
- figures,
117
- len(outlier_df) == 0,
118
- )
114
+ return (outlier_df.sort_values(["Column", "Date"]), figures, len(outlier_df) == 0)
@@ -9,7 +9,7 @@ import plotly.graph_objects as go
9
9
  import scorecardpy as sc
10
10
  from plotly.subplots import make_subplots
11
11
 
12
- from validmind import tags, tasks
12
+ from validmind import RawData, tags, tasks
13
13
  from validmind.errors import SkipTestError
14
14
  from validmind.logging import get_logger
15
15
  from validmind.vm_models import VMDataset
@@ -140,4 +140,4 @@ def WOEBinPlots(
140
140
 
141
141
  figures.append(fig)
142
142
 
143
- return tuple(figures)
143
+ return (*figures, RawData(woe_iv_data=woe_iv_df))
@@ -5,7 +5,7 @@
5
5
  import pandas as pd
6
6
  import scorecardpy as sc
7
7
 
8
- from validmind import tags, tasks
8
+ from validmind import RawData, tags, tasks
9
9
  from validmind.errors import SkipTestError
10
10
  from validmind.vm_models import VMDataset
11
11
 
@@ -61,12 +61,14 @@ def WOEBinTable(dataset: VMDataset, breaks_adj: list = None):
61
61
  except Exception as e:
62
62
  raise SkipTestError(f"Error during binning: {e}")
63
63
 
64
+ result_table = (
65
+ pd.concat(bins.values(), keys=bins.keys())
66
+ .reset_index()
67
+ .drop(columns=["variable"])
68
+ .rename(columns={"level_0": "variable"})
69
+ .assign(bin_number=lambda x: x.groupby("variable").cumcount())
70
+ )
71
+
64
72
  return {
65
- "Weight of Evidence (WoE) and Information Value (IV)": (
66
- pd.concat(bins.values(), keys=bins.keys())
67
- .reset_index()
68
- .drop(columns=["variable"])
69
- .rename(columns={"level_0": "variable"})
70
- .assign(bin_number=lambda x: x.groupby("variable").cumcount())
71
- )
72
- }
73
+ "Weight of Evidence (WoE) and Information Value (IV)": result_table
74
+ }, RawData(woe_bins=bins)
@@ -8,7 +8,7 @@ import nltk
8
8
  import plotly.graph_objects as go
9
9
  from nltk.corpus import stopwords
10
10
 
11
- from validmind import tags, tasks
11
+ from validmind import RawData, tags, tasks
12
12
  from validmind.vm_models import VMDataset
13
13
 
14
14
 
@@ -94,4 +94,4 @@ def CommonWords(dataset: VMDataset):
94
94
  xaxis_tickangle=-45,
95
95
  )
96
96
 
97
- return fig
97
+ return fig, RawData(words=x, frequencies=y)
@@ -6,7 +6,7 @@ import re
6
6
 
7
7
  import plotly.graph_objects as go
8
8
 
9
- from validmind import tags, tasks
9
+ from validmind import RawData, tags, tasks
10
10
  from validmind.errors import SkipTestError
11
11
  from validmind.vm_models import VMDataset
12
12
 
@@ -76,4 +76,4 @@ def Hashtags(dataset: VMDataset, top_hashtags: int = 25):
76
76
  xaxis_tickangle=-45,
77
77
  )
78
78
 
79
- return fig
79
+ return fig, RawData(top_hashtag_counts=top_hashtag_counts)
@@ -5,7 +5,7 @@
5
5
  import plotly.express as px
6
6
  from langdetect import LangDetectException, detect
7
7
 
8
- from validmind import tags, tasks
8
+ from validmind import RawData, tags, tasks
9
9
 
10
10
 
11
11
  @tags("nlp", "text_data", "visualization")
@@ -64,9 +64,12 @@ def LanguageDetection(dataset):
64
64
 
65
65
  languages = dataset.df[dataset.text_column].apply(detect_language)
66
66
 
67
- return px.histogram(
68
- languages,
69
- x=languages,
70
- title="Language Distribution",
71
- labels={"x": "Language Codes"},
67
+ return (
68
+ px.histogram(
69
+ languages,
70
+ x=languages,
71
+ title="Language Distribution",
72
+ labels={"x": "Language Codes"},
73
+ ),
74
+ RawData(detected_languages=languages),
72
75
  )
@@ -7,7 +7,7 @@ import re
7
7
  import pandas as pd
8
8
  import plotly.express as px
9
9
 
10
- from validmind import tags, tasks
10
+ from validmind import RawData, tags, tasks
11
11
  from validmind.errors import SkipTestError
12
12
  from validmind.vm_models import VMDataset
13
13
 
@@ -75,9 +75,12 @@ def Mentions(dataset: VMDataset, top_mentions: int = 25):
75
75
  }
76
76
  )
77
77
 
78
- return px.treemap(
79
- mention_frequencies_df,
80
- path=["Scenario"],
81
- values="Percentage",
82
- title="Tree of Mentions",
78
+ return (
79
+ px.treemap(
80
+ mention_frequencies_df,
81
+ path=["Scenario"],
82
+ values="Percentage",
83
+ title="Tree of Mentions",
84
+ ),
85
+ RawData(mention_counts=mention_counts),
83
86
  )
@@ -7,7 +7,7 @@ import pandas as pd
7
7
  import plotly.express as px
8
8
  from textblob import TextBlob
9
9
 
10
- from validmind import tags, tasks
10
+ from validmind import RawData, tags, tasks
11
11
 
12
12
 
13
13
  @tags("nlp", "text_data", "data_validation")
@@ -144,4 +144,4 @@ def PolarityAndSubjectivity(dataset, threshold_subjectivity=0.5, threshold_polar
144
144
 
145
145
  statistics_tables = {"Quadrant Distribution": quadrant_df, "Statistics": stats_df}
146
146
 
147
- return fig, statistics_tables
147
+ return fig, statistics_tables, RawData(sentiment_data=data)
@@ -11,7 +11,7 @@ from collections import defaultdict
11
11
 
12
12
  import plotly.graph_objects as go
13
13
 
14
- from validmind import tags, tasks
14
+ from validmind import RawData, tags, tasks
15
15
 
16
16
 
17
17
  @tags("nlp", "text_data", "visualization", "frequency_analysis")
@@ -63,7 +63,9 @@ def Punctuations(dataset, count_mode="token"):
63
63
 
64
64
  corpus = _create_corpus(dataset.df, dataset.text_column)
65
65
  punctuation_counts = _count_punctuations(corpus, count_mode)
66
- return _create_punctuation_plot(punctuation_counts)
66
+ fig = _create_punctuation_plot(punctuation_counts)
67
+
68
+ return fig, RawData(punctuation_counts=punctuation_counts)
67
69
 
68
70
 
69
71
  def _create_punctuation_plot(punctuation_counts):
@@ -8,7 +8,7 @@ import nltk
8
8
  import seaborn as sns
9
9
  from nltk.sentiment import SentimentIntensityAnalyzer
10
10
 
11
- from validmind import tags, tasks
11
+ from validmind import RawData, tags, tasks
12
12
 
13
13
 
14
14
  @tags("nlp", "text_data", "data_validation")
@@ -77,4 +77,4 @@ def Sentiment(dataset):
77
77
 
78
78
  plt.close("all")
79
79
 
80
- return fig
80
+ return fig, RawData(sentiment_scores=vader_sentiment.tolist())
@@ -13,7 +13,7 @@ import pandas as pd
13
13
  import plotly.graph_objects as go
14
14
  from nltk.corpus import stopwords
15
15
 
16
- from validmind import tags, tasks
16
+ from validmind import RawData, tags, tasks
17
17
  from validmind.vm_models import VMDataset
18
18
 
19
19
 
@@ -84,17 +84,17 @@ def StopWords(
84
84
  nltk.download("stopwords", quiet=True)
85
85
 
86
86
  stop = set(stopwords.words("english"))
87
- dic = defaultdict(int)
87
+ stop_word_frequencies = defaultdict(int)
88
88
  for word in corpus:
89
89
  if word in stop:
90
- dic[word] += 1
90
+ stop_word_frequencies[word] += 1
91
91
 
92
92
  # Calculate the total number of words in the corpus
93
93
  total_words = len(corpus)
94
94
 
95
95
  # Calculate the percentage of each word in the corpus
96
96
  word_percentages = {}
97
- for word, count in dic.items():
97
+ for word, count in stop_word_frequencies.items():
98
98
  percentage = (count / total_words) * 100
99
99
  word_percentages[word] = percentage
100
100
 
@@ -124,4 +124,5 @@ def StopWords(
124
124
  },
125
125
  fig,
126
126
  passed,
127
+ RawData(stop_word_frequencies=stop_word_frequencies, total_words=total_words),
127
128
  )
@@ -9,7 +9,7 @@ import pandas as pd
9
9
  import plotly.express as px
10
10
  from nltk.corpus import stopwords
11
11
 
12
- from validmind import tags, tasks
12
+ from validmind import RawData, tags, tasks
13
13
  from validmind.vm_models import VMDataset
14
14
 
15
15
 
@@ -173,4 +173,4 @@ def TextDescription(
173
173
  )
174
174
  )
175
175
 
176
- return tuple(figures)
176
+ return (*figures, RawData(metrics_dataframe=metrics_df))
@@ -6,7 +6,7 @@ import evaluate
6
6
  import matplotlib.pyplot as plt
7
7
  import seaborn as sns
8
8
 
9
- from validmind import tags, tasks
9
+ from validmind import RawData, tags, tasks
10
10
 
11
11
 
12
12
  @tags("nlp", "text_data", "data_validation")
@@ -73,4 +73,4 @@ def Toxicity(dataset):
73
73
 
74
74
  plt.close()
75
75
 
76
- return fig
76
+ return fig, RawData(toxicity_scores=toxicity_scores)
@@ -6,7 +6,7 @@ import evaluate
6
6
  import pandas as pd
7
7
  import plotly.graph_objects as go
8
8
 
9
- from validmind import tags, tasks
9
+ from validmind import RawData, tags, tasks
10
10
  from validmind.tests.utils import validate_prediction
11
11
 
12
12
 
@@ -131,4 +131,4 @@ def BertScore(
131
131
  # Create a DataFrame from all collected statistics
132
132
  result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
133
133
 
134
- return (result_df, *tuple(figures))
134
+ return (result_df, *figures, RawData(bert_scores_df=metrics_df))
@@ -6,7 +6,7 @@ import evaluate
6
6
  import pandas as pd
7
7
  import plotly.graph_objects as go
8
8
 
9
- from validmind import tags, tasks
9
+ from validmind import RawData, tags, tasks
10
10
  from validmind.tests.utils import validate_prediction
11
11
 
12
12
 
@@ -114,4 +114,4 @@ def BleuScore(dataset, model):
114
114
  # Create a DataFrame from all collected statistics
115
115
  result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
116
116
 
117
- return (result_df, *tuple(figures))
117
+ return (result_df, *figures, RawData(bleu_scores_df=metrics_df))
@@ -5,7 +5,7 @@
5
5
  import pandas as pd
6
6
  import plotly.graph_objects as go
7
7
 
8
- from validmind import tags, tasks
8
+ from validmind import RawData, tags, tasks
9
9
  from validmind.vm_models import VMDataset, VMModel
10
10
 
11
11
 
@@ -72,4 +72,4 @@ def ClusterSizeDistribution(dataset: VMDataset, model: VMModel):
72
72
  fig.update_yaxes(title_text="Counts", showgrid=False)
73
73
  fig.update_layout(title_text="Cluster distribution", title_x=0.5, barmode="group")
74
74
 
75
- return fig
75
+ return fig, RawData(cluster_counts=df_counts)
@@ -6,7 +6,7 @@ import nltk
6
6
  import pandas as pd
7
7
  import plotly.graph_objects as go
8
8
 
9
- from validmind import tags, tasks
9
+ from validmind import RawData, tags, tasks
10
10
  from validmind.tests.utils import validate_prediction
11
11
 
12
12
 
@@ -118,4 +118,4 @@ def ContextualRecall(dataset, model):
118
118
  # Create a DataFrame from all collected statistics
119
119
  result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
120
120
 
121
- return (result_df, *tuple(figures))
121
+ return (result_df, *tuple(figures), RawData(contextual_recall_scores=metrics_df))
@@ -7,7 +7,7 @@ import pandas as pd
7
7
  import plotly.graph_objects as go
8
8
  from sklearn.metrics import roc_auc_score
9
9
 
10
- from validmind import tags, tasks
10
+ from validmind import RawData, tags, tasks
11
11
  from validmind.errors import SkipTestError
12
12
  from validmind.logging import get_logger
13
13
  from validmind.vm_models import VMDataset
@@ -95,4 +95,4 @@ def FeaturesAUC(dataset: VMDataset, fontsize: int = 12, figure_height: int = 500
95
95
  height=figure_height,
96
96
  )
97
97
 
98
- return fig
98
+ return fig, RawData(feature_aucs=aucs)
@@ -6,7 +6,7 @@ import evaluate
6
6
  import pandas as pd
7
7
  import plotly.graph_objects as go
8
8
 
9
- from validmind import tags, tasks
9
+ from validmind import RawData, tags, tasks
10
10
  from validmind.tests.utils import validate_prediction
11
11
 
12
12
 
@@ -117,4 +117,4 @@ def MeteorScore(dataset, model):
117
117
  # Create a DataFrame from all collected statistics
118
118
  result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
119
119
 
120
- return (result_df, *tuple(figures))
120
+ return (result_df, *tuple(figures), RawData(meteor_scores=metrics_df))
@@ -6,7 +6,7 @@ import pandas as pd
6
6
  import plotly.graph_objects as go
7
7
  from scipy.stats import kstest
8
8
 
9
- from validmind import tags, tasks
9
+ from validmind import RawData, tags, tasks
10
10
 
11
11
 
12
12
  @tags("regression")
@@ -102,4 +102,4 @@ def ModelPredictionResiduals(
102
102
  # Create a summary DataFrame for the KS normality test results
103
103
  summary_df = pd.DataFrame([summary])
104
104
 
105
- return (summary_df, *figures)
105
+ return (summary_df, *figures, RawData(residuals=residuals))
@@ -6,7 +6,7 @@ import evaluate
6
6
  import pandas as pd
7
7
  import plotly.graph_objects as go
8
8
 
9
- from validmind import tags, tasks
9
+ from validmind import RawData, tags, tasks
10
10
  from validmind.tests.utils import validate_prediction
11
11
 
12
12
 
@@ -142,4 +142,8 @@ def RegardScore(dataset, model):
142
142
  ]
143
143
  ]
144
144
 
145
- return (result_df, *tuple(figures))
145
+ return (
146
+ result_df,
147
+ *figures,
148
+ RawData(true_regard=true_df, pred_regard=pred_df),
149
+ )
@@ -6,7 +6,7 @@ import numpy as np
6
6
  import plotly.figure_factory as ff
7
7
  import plotly.graph_objects as go
8
8
 
9
- from validmind import tags, tasks
9
+ from validmind import RawData, tags, tasks
10
10
  from validmind.vm_models import VMDataset, VMModel
11
11
 
12
12
 
@@ -60,8 +60,9 @@ def RegressionResidualsPlot(model: VMModel, dataset: VMDataset, bin_size: float
60
60
  figures = []
61
61
 
62
62
  # Residuals plot
63
+ residuals = y_true.flatten() - y_pred.flatten()
63
64
  fig = ff.create_distplot(
64
- hist_data=[y_true.flatten() - y_pred.flatten()],
65
+ hist_data=[residuals],
65
66
  group_labels=["Residuals"],
66
67
  bin_size=[bin_size],
67
68
  show_hist=True,
@@ -104,4 +105,4 @@ def RegressionResidualsPlot(model: VMModel, dataset: VMDataset, bin_size: float
104
105
  )
105
106
  )
106
107
 
107
- return tuple(figures)
108
+ return (*figures, RawData(residuals=residuals, y_true=y_true, y_pred=y_pred))
@@ -6,7 +6,7 @@ import pandas as pd
6
6
  import plotly.graph_objects as go
7
7
  from rouge import Rouge
8
8
 
9
- from validmind import tags, tasks
9
+ from validmind import RawData, tags, tasks
10
10
 
11
11
 
12
12
  @tags("nlp", "text_data", "visualization")
@@ -118,7 +118,8 @@ def RougeScore(dataset, model, metric="rouge-1"):
118
118
  {"p": "Precision", "r": "Recall", "f": "F1 Score"}
119
119
  )
120
120
 
121
- # Create a DataFrame from all collected statistics
122
- result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
123
-
124
- return (result_df, *tuple(figures))
121
+ return (
122
+ pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"}),
123
+ *figures,
124
+ RawData(rouge_scores_df=df_scores),
125
+ )