validmind 2.8.10__py3-none-any.whl → 2.8.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +4 -2
  3. validmind/tests/data_validation/ACFandPACFPlot.py +4 -1
  4. validmind/tests/data_validation/AutoMA.py +1 -1
  5. validmind/tests/data_validation/BivariateScatterPlots.py +5 -1
  6. validmind/tests/data_validation/BoxPierce.py +3 -1
  7. validmind/tests/data_validation/ClassImbalance.py +1 -1
  8. validmind/tests/data_validation/DatasetDescription.py +1 -1
  9. validmind/tests/data_validation/DickeyFullerGLS.py +1 -1
  10. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +1 -1
  11. validmind/tests/data_validation/HighCardinality.py +5 -1
  12. validmind/tests/data_validation/HighPearsonCorrelation.py +1 -1
  13. validmind/tests/data_validation/IQROutliersBarPlot.py +5 -3
  14. validmind/tests/data_validation/IQROutliersTable.py +5 -2
  15. validmind/tests/data_validation/IsolationForestOutliers.py +5 -4
  16. validmind/tests/data_validation/JarqueBera.py +2 -2
  17. validmind/tests/data_validation/LJungBox.py +2 -2
  18. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
  19. validmind/tests/data_validation/MissingValues.py +14 -10
  20. validmind/tests/data_validation/MissingValuesBarPlot.py +3 -1
  21. validmind/tests/data_validation/MutualInformation.py +2 -1
  22. validmind/tests/data_validation/PearsonCorrelationMatrix.py +1 -1
  23. validmind/tests/data_validation/ProtectedClassesCombination.py +2 -0
  24. validmind/tests/data_validation/ProtectedClassesDescription.py +2 -2
  25. validmind/tests/data_validation/ProtectedClassesDisparity.py +9 -5
  26. validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +10 -2
  27. validmind/tests/data_validation/RollingStatsPlot.py +2 -1
  28. validmind/tests/data_validation/ScoreBandDefaultRates.py +4 -2
  29. validmind/tests/data_validation/SeasonalDecompose.py +1 -1
  30. validmind/tests/data_validation/ShapiroWilk.py +2 -2
  31. validmind/tests/data_validation/SpreadPlot.py +1 -1
  32. validmind/tests/data_validation/TabularCategoricalBarPlots.py +1 -1
  33. validmind/tests/data_validation/TabularDateTimeHistograms.py +1 -1
  34. validmind/tests/data_validation/TargetRateBarPlots.py +4 -1
  35. validmind/tests/data_validation/TimeSeriesFrequency.py +1 -1
  36. validmind/tests/data_validation/TimeSeriesOutliers.py +7 -2
  37. validmind/tests/data_validation/WOEBinPlots.py +1 -1
  38. validmind/tests/data_validation/WOEBinTable.py +1 -1
  39. validmind/tests/data_validation/ZivotAndrewsArch.py +5 -2
  40. validmind/tests/data_validation/nlp/CommonWords.py +1 -1
  41. validmind/tests/data_validation/nlp/Hashtags.py +1 -1
  42. validmind/tests/data_validation/nlp/LanguageDetection.py +1 -1
  43. validmind/tests/data_validation/nlp/Mentions.py +1 -1
  44. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +5 -1
  45. validmind/tests/data_validation/nlp/Punctuations.py +1 -1
  46. validmind/tests/data_validation/nlp/Sentiment.py +3 -1
  47. validmind/tests/data_validation/nlp/TextDescription.py +1 -1
  48. validmind/tests/data_validation/nlp/Toxicity.py +1 -1
  49. validmind/tests/model_validation/BertScore.py +7 -1
  50. validmind/tests/model_validation/BleuScore.py +7 -1
  51. validmind/tests/model_validation/ClusterSizeDistribution.py +3 -1
  52. validmind/tests/model_validation/ContextualRecall.py +9 -1
  53. validmind/tests/model_validation/FeaturesAUC.py +1 -1
  54. validmind/tests/model_validation/MeteorScore.py +7 -1
  55. validmind/tests/model_validation/ModelPredictionResiduals.py +5 -1
  56. validmind/tests/model_validation/RegardScore.py +6 -1
  57. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -1
  58. validmind/tests/model_validation/RougeScore.py +3 -1
  59. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +2 -0
  60. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +10 -2
  61. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +6 -2
  62. validmind/tests/model_validation/TokenDisparity.py +5 -1
  63. validmind/tests/model_validation/ToxicityScore.py +2 -0
  64. validmind/tests/model_validation/embeddings/ClusterDistribution.py +1 -1
  65. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +5 -1
  66. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +5 -1
  67. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +5 -1
  68. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +2 -0
  69. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +5 -1
  70. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +6 -2
  71. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +3 -1
  72. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -1
  73. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +5 -1
  74. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +5 -1
  75. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +5 -1
  76. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +5 -1
  77. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +6 -1
  78. validmind/tests/model_validation/ragas/AnswerCorrectness.py +1 -1
  79. validmind/tests/model_validation/ragas/AspectCritic.py +4 -1
  80. validmind/tests/model_validation/ragas/ContextEntityRecall.py +1 -1
  81. validmind/tests/model_validation/ragas/ContextPrecision.py +1 -1
  82. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +1 -1
  83. validmind/tests/model_validation/ragas/ContextRecall.py +1 -1
  84. validmind/tests/model_validation/ragas/Faithfulness.py +1 -1
  85. validmind/tests/model_validation/ragas/NoiseSensitivity.py +1 -1
  86. validmind/tests/model_validation/ragas/ResponseRelevancy.py +1 -1
  87. validmind/tests/model_validation/ragas/SemanticSimilarity.py +1 -1
  88. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +9 -9
  89. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +9 -9
  90. validmind/tests/model_validation/sklearn/CalibrationCurve.py +5 -2
  91. validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.py +15 -2
  92. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +5 -1
  93. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +24 -14
  94. validmind/tests/model_validation/sklearn/CompletenessScore.py +8 -9
  95. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -3
  96. validmind/tests/model_validation/sklearn/FeatureImportance.py +6 -2
  97. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -9
  98. validmind/tests/model_validation/sklearn/HomogeneityScore.py +14 -9
  99. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +4 -2
  100. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +6 -1
  101. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +12 -7
  102. validmind/tests/model_validation/sklearn/MinimumF1Score.py +12 -7
  103. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +21 -6
  104. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +8 -2
  105. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +5 -1
  106. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -1
  107. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +6 -1
  108. validmind/tests/model_validation/sklearn/ROCCurve.py +3 -1
  109. validmind/tests/model_validation/sklearn/RegressionErrors.py +6 -2
  110. validmind/tests/model_validation/sklearn/RegressionPerformance.py +13 -8
  111. validmind/tests/model_validation/sklearn/RegressionR2Square.py +8 -5
  112. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +5 -1
  113. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +6 -1
  114. validmind/tests/model_validation/sklearn/ScoreProbabilityAlignment.py +10 -2
  115. validmind/tests/model_validation/sklearn/SilhouettePlot.py +5 -1
  116. validmind/tests/model_validation/sklearn/VMeasure.py +12 -9
  117. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +5 -1
  118. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +6 -1
  119. validmind/tests/model_validation/statsmodels/GINITable.py +8 -1
  120. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +2 -2
  121. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +6 -2
  122. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +8 -2
  123. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +3 -1
  124. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +7 -2
  125. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +2 -0
  126. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +2 -0
  127. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +4 -2
  128. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +3 -1
  129. validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py +11 -1
  130. validmind/tests/ongoing_monitoring/ClassificationAccuracyDrift.py +10 -2
  131. validmind/tests/ongoing_monitoring/ConfusionMatrixDrift.py +8 -1
  132. validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py +18 -2
  133. validmind/tests/ongoing_monitoring/FeatureDrift.py +9 -2
  134. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +8 -2
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +13 -2
  136. validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py +13 -2
  137. validmind/tests/ongoing_monitoring/ROCCurveDrift.py +16 -2
  138. validmind/tests/ongoing_monitoring/ScoreBandsDrift.py +11 -2
  139. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +13 -2
  140. validmind/tests/prompt_validation/Clarity.py +1 -1
  141. validmind/tests/prompt_validation/NegativeInstruction.py +1 -1
  142. validmind/tests/prompt_validation/Robustness.py +6 -1
  143. validmind/tests/prompt_validation/Specificity.py +1 -1
  144. validmind/vm_models/result/utils.py +4 -23
  145. {validmind-2.8.10.dist-info → validmind-2.8.12.dist-info}/METADATA +2 -2
  146. {validmind-2.8.10.dist-info → validmind-2.8.12.dist-info}/RECORD +149 -149
  147. {validmind-2.8.10.dist-info → validmind-2.8.12.dist-info}/LICENSE +0 -0
  148. {validmind-2.8.10.dist-info → validmind-2.8.12.dist-info}/WHEEL +0 -0
  149. {validmind-2.8.10.dist-info → validmind-2.8.12.dist-info}/entry_points.txt +0 -0
@@ -95,4 +95,4 @@ def FeaturesAUC(dataset: VMDataset, fontsize: int = 12, figure_height: int = 500
         height=figure_height,
     )
 
-    return fig, RawData(feature_aucs=aucs)
+    return fig, RawData(feature_aucs=aucs, dataset=dataset.input_id)
@@ -117,4 +117,10 @@ def MeteorScore(dataset, model):
     # Create a DataFrame from all collected statistics
     result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
 
-    return (result_df, *figures, RawData(meteor_scores=metrics_df))
+    return (
+        result_df,
+        *figures,
+        RawData(
+            meteor_scores=metrics_df, model=model.input_id, dataset=dataset.input_id
+        ),
+    )
@@ -102,4 +102,8 @@ def ModelPredictionResiduals(
     # Create a summary DataFrame for the KS normality test results
     summary_df = pd.DataFrame([summary])
 
-    return (summary_df, *figures, RawData(residuals=residuals))
+    return (
+        summary_df,
+        *figures,
+        RawData(residuals=residuals, model=model.input_id, dataset=dataset.input_id),
+    )
@@ -145,5 +145,10 @@ def RegardScore(dataset, model):
     return (
         result_df,
         *figures,
-        RawData(true_regard=true_df, pred_regard=pred_df),
+        RawData(
+            true_regard=true_df,
+            pred_regard=pred_df,
+            model=model.input_id,
+            dataset=dataset.input_id,
+        ),
     )
@@ -105,4 +105,13 @@ def RegressionResidualsPlot(model: VMModel, dataset: VMDataset, bin_size: float
             )
         )
 
-    return (*figures, RawData(residuals=residuals, y_true=y_true, y_pred=y_pred))
+    return (
+        *figures,
+        RawData(
+            residuals=residuals,
+            y_true=y_true,
+            y_pred=y_pred,
+            model=model.input_id,
+            dataset=dataset.input_id,
+        ),
+    )
@@ -121,5 +121,7 @@ def RougeScore(dataset, model, metric="rouge-1"):
     return (
         pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"}),
         *figures,
-        RawData(rouge_scores_df=df_scores),
+        RawData(
+            rouge_scores_df=df_scores, model=model.input_id, dataset=dataset.input_id
+        ),
     )
@@ -152,5 +152,7 @@ def TimeSeriesPredictionWithCI(dataset, model, confidence=0.95):
             z_score=z_score,
             lower_confidence=lower_conf,
             upper_confidence=upper_conf,
+            model=model.input_id,
+            dataset=dataset.input_id,
         ),
     )
@@ -4,7 +4,7 @@
 
 import plotly.graph_objects as go
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 
 
 @tags("model_predictions", "visualization")
@@ -70,4 +70,12 @@ def TimeSeriesPredictionsPlot(dataset, model):
         template="plotly_white",
     )
 
-    return fig
+    raw_data = RawData(
+        time_index=time_index,
+        actual_values=dataset.y,
+        predicted_values=y_pred,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
+
+    return fig, raw_data
@@ -7,7 +7,7 @@ import pandas as pd
 import plotly.express as px
 from sklearn import metrics
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 
 
 @tags("model_performance", "sklearn")
@@ -105,4 +105,8 @@ def TimeSeriesR2SquareBySegments(dataset, model, segments=None):
         },
     )
 
-    return fig, results_df
+    return (
+        fig,
+        results_df,
+        RawData(summary=results_df, model=model.input_id, dataset=dataset.input_id),
+    )
@@ -108,4 +108,8 @@ def TokenDisparity(dataset, model):
     # Create a DataFrame from all collected statistics
     result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
 
-    return (result_df, *figures, RawData(token_counts_df=df))
+    return (
+        result_df,
+        *figures,
+        RawData(token_counts_df=df, model=model.input_id, dataset=dataset.input_id),
+    )
@@ -146,5 +146,7 @@ def ToxicityScore(dataset, model):
             input_toxicity_df=input_df,
             true_toxicity_df=true_df,
             pred_toxicity_df=pred_df,
+            model=model.input_id,
+            dataset=dataset.input_id,
         ),
     )
@@ -62,4 +62,4 @@ def ClusterDistribution(model: VMModel, dataset: VMDataset, num_clusters: int =
         title="Embeddings Cluster Distribution",
     )
 
-    return fig, RawData(labels=labels)
+    return fig, RawData(labels=labels, model=model.input_id, dataset=dataset.input_id)
@@ -113,5 +113,9 @@ def CosineSimilarityComparison(dataset, models):
     return (
         *figures,
         stats_df,
-        RawData(similarity_matrices=pd.DataFrame(similarity_matrices)),
+        RawData(
+            similarity_matrices=pd.DataFrame(similarity_matrices),
+            dataset=dataset.input_id,
+            models=[model.input_id for model in models],
+        ),
     )
@@ -59,4 +59,8 @@ def CosineSimilarityDistribution(dataset: VMDataset, model: VMModel):
         nbins=100,
         title="Cosine Similarity Distribution",
         labels={"x": "Cosine Similarity"},
-    ), RawData(similarity_scores=similarity_scores)
+    ), RawData(
+        similarity_scores=similarity_scores,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
@@ -81,4 +81,8 @@ def CosineSimilarityHeatmap(
         yaxis_title=yaxis_title,
     )
 
-    return fig, RawData(similarity_matrix=similarity_matrix)
+    return fig, RawData(
+        similarity_matrix=similarity_matrix,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
@@ -77,5 +77,7 @@ def DescriptiveAnalytics(dataset: VMDataset, model: VMModel):
             embedding_means=embedding_means,
             embedding_medians=embedding_medians,
             embedding_stds=embedding_stds,
+            model=model.input_id,
+            dataset=dataset.input_id,
         ),
     )
@@ -89,4 +89,8 @@ def EmbeddingsVisualization2D(
     fig = px.scatter(**scatter_kwargs)
     fig.update_layout(width=500, height=500)
 
-    return fig, RawData(tsne_embeddings=reduced_embeddings)
+    return fig, RawData(
+        tsne_embeddings=reduced_embeddings,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
@@ -57,7 +57,7 @@ def EuclideanDistanceComparison(dataset, models):
     figures = []
     all_stats = []
 
-    distance_matrices = {}
+    distance_matrices = []
 
     # Generate all pairs of models for comparison
     for model_A, model_B in combinations(models, 2):
@@ -105,6 +105,10 @@ def EuclideanDistanceComparison(dataset, models):
     stats_df = pd.DataFrame(all_stats)
 
     # Add raw data to return
-    raw_data = RawData(distance_matrices=pd.DataFrame(distance_matrices))
+    raw_data = RawData(
+        distance_matrices=pd.DataFrame(distance_matrices),
+        dataset=dataset.input_id,
+        models=[model.input_id for model in models],
+    )
 
     return (stats_df, *figures, raw_data)
@@ -79,4 +79,6 @@ def EuclideanDistanceHeatmap(
         yaxis_title=yaxis_title,
     )
 
-    return fig, RawData(distance_matrix=distance_matrix)
+    return fig, RawData(
+        distance_matrix=distance_matrix, model=model.input_id, dataset=dataset.input_id
+    )
@@ -90,4 +90,7 @@ def PCAComponentsPairwisePlots(dataset, model, n_components=3):
         )
         figures.append(fig)
 
-    return (*figures, RawData(pca_results=pca_df))
+    return (
+        *figures,
+        RawData(pca_results=pca_df, model=model.input_id, dataset=dataset.input_id),
+    )
@@ -97,4 +97,8 @@ def StabilityAnalysisKeyword(
         mean_similarity_threshold,
     )
 
-    return results, RawData(original_perturbed_similarity=raw_data)
+    return results, RawData(
+        original_perturbed_similarity=raw_data,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
@@ -151,4 +151,8 @@ def StabilityAnalysisRandomNoise(
         mean_similarity_threshold,
     )
 
-    return *result, RawData(original_perturbed_similarity=raw_data)
+    return *result, RawData(
+        original_perturbed_similarity=raw_data,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
@@ -107,4 +107,8 @@ def StabilityAnalysisSynonyms(
         mean_similarity_threshold,
    )
 
-    return *result, RawData(original_perturbed_similarity=raw_data)
+    return *result, RawData(
+        original_perturbed_similarity=raw_data,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
@@ -134,4 +134,8 @@ def StabilityAnalysisTranslation(
         mean_similarity_threshold,
    )
 
-    return *result, RawData(original_perturbed_similarity=raw_data)
+    return *result, RawData(
+        original_perturbed_similarity=raw_data,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
@@ -110,5 +110,10 @@ def TSNEComponentsPairwisePlots(
 
     return (
         *figures,
-        RawData(embeddings_scaled=embeddings_scaled, tsne_results=tsne_results),
+        RawData(
+            embeddings_scaled=embeddings_scaled,
+            tsne_results=tsne_results,
+            model=model.input_id,
+            dataset=dataset.input_id,
+        ),
     )
@@ -144,5 +144,5 @@ def AnswerCorrectness(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
@@ -195,5 +195,8 @@ def AspectCritic(
             ]
         },
         fig,
-        RawData(evaluation_results=result_df),
+        RawData(
+            evaluation_results=result_df,
+            dataset=dataset.input_id,
+        ),
     )
@@ -143,5 +143,5 @@ def ContextEntityRecall(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
@@ -135,5 +135,5 @@ def ContextPrecision(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
@@ -130,5 +130,5 @@ def ContextPrecisionWithoutReference(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
@@ -135,5 +135,5 @@ def ContextRecall(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
@@ -140,5 +140,5 @@ def Faithfulness(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
@@ -179,5 +179,5 @@ def NoiseSensitivity(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
@@ -154,5 +154,5 @@ def ResponseRelevancy(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
@@ -133,5 +133,5 @@ def SemanticSimilarity(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
@@ -4,7 +4,7 @@
 
 from sklearn.metrics import adjusted_mutual_info_score
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel
 
 
@@ -52,11 +52,11 @@ def AdjustedMutualInformation(model: VMModel, dataset: VMDataset):
     - The interpretability of the score can be complex as it depends on the understanding of information theory
     concepts.
     """
-    return [
-        {
-            "Adjusted Mutual Information": adjusted_mutual_info_score(
-                labels_true=dataset.y,
-                labels_pred=dataset.y_pred(model),
-            )
-        }
-    ]
+    ami_score = adjusted_mutual_info_score(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+    )
+
+    return [{"Adjusted Mutual Information": ami_score}], RawData(
+        ami_score=ami_score, model=model.input_id, dataset=dataset.input_id
+    )
@@ -4,7 +4,7 @@
 
 from sklearn.metrics import adjusted_rand_score
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel
 
 
@@ -49,11 +49,11 @@ def AdjustedRandIndex(model: VMModel, dataset: VMDataset):
     - It may be difficult to interpret the implications of an ARI score without context or a benchmark, as it is
     heavily dependent on the characteristics of the dataset used.
     """
-    return [
-        {
-            "Adjusted Rand Index": adjusted_rand_score(
-                labels_true=dataset.y,
-                labels_pred=dataset.y_pred(model),
-            )
-        }
-    ]
+    ari = adjusted_rand_score(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+    )
+
+    return [{"Adjusted Rand Index": ari}], RawData(
+        ari_score=ari, model=model.input_id, dataset=dataset.input_id
+    )
@@ -72,7 +72,10 @@ def CalibrationCurve(model: VMModel, dataset: VMDataset, n_bins: int = 10):
 
     # Create DataFrame for raw data
     raw_data = RawData(
-        mean_predicted_probability=prob_pred, observed_frequency=prob_true
+        mean_predicted_probability=prob_pred,
+        observed_frequency=prob_true,
+        model=model.input_id,
+        dataset=dataset.input_id,
     )
 
     # Create Plotly figure
@@ -114,4 +117,4 @@ def CalibrationCurve(model: VMModel, dataset: VMDataset, n_bins: int = 10):
         template="plotly_white",
     )
 
-    return raw_data, fig
+    return fig, raw_data
@@ -8,7 +8,7 @@ import plotly.graph_objects as go
 from plotly.subplots import make_subplots
 from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel
 
 
@@ -255,4 +255,17 @@ def ClassifierThresholdOptimization(
     # Create results table and sort by threshold descending
     table = pd.DataFrame(results).sort_values("threshold", ascending=False)
 
-    return fig, table
+    return (
+        fig,
+        table,
+        RawData(
+            fpr=fpr,
+            tpr=tpr,
+            precision=precision,
+            recall=recall,
+            thresholds_roc=thresholds_roc,
+            thresholds_pr=thresholds_pr,
+            model=model.input_id,
+            dataset=dataset.input_id,
+        ),
+    )
@@ -84,4 +84,8 @@ def ClusterCosineSimilarity(model: VMModel, dataset: VMDataset):
     if not table:
         raise SkipTestError("No clusters found")
 
-    return table, RawData(cluster_centroids=cluster_centroids)
+    return table, RawData(
+        cluster_centroids=cluster_centroids,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
@@ -11,7 +11,7 @@ from sklearn.metrics import (
     v_measure_score,
 )
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel
 
 HOMOGENEITY = """
@@ -115,53 +115,63 @@ def ClusterPerformanceMetrics(model: VMModel, dataset: VMDataset):
     - Does not consider aspects like computational efficiency of the model or its capability to handle high dimensional
     data.
     """
-    return [
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    metrics = [
         {
             "Metric": "Homogeneity Score",
             "Description": HOMOGENEITY,
             "Value": homogeneity_score(
-                labels_true=dataset.y,
-                labels_pred=dataset.y_pred(model),
+                labels_true=y_true,
+                labels_pred=y_pred,
            ),
         },
         {
             "Metric": "Completeness Score",
             "Description": COMPLETENESS,
             "Value": completeness_score(
-                labels_true=dataset.y,
-                labels_pred=dataset.y_pred(model),
+                labels_true=y_true,
+                labels_pred=y_pred,
            ),
         },
         {
             "Metric": "V Measure",
             "Description": V_MEASURE,
             "Value": v_measure_score(
-                labels_true=dataset.y,
-                labels_pred=dataset.y_pred(model),
+                labels_true=y_true,
+                labels_pred=y_pred,
            ),
         },
         {
             "Metric": "Adjusted Rand Index",
             "Description": ADJUSTED_RAND_INDEX,
             "Value": adjusted_rand_score(
-                labels_true=dataset.y,
-                labels_pred=dataset.y_pred(model),
+                labels_true=y_true,
+                labels_pred=y_pred,
            ),
         },
         {
             "Metric": "Adjusted Mutual Information",
             "Description": ADJUSTED_MUTUAL_INFORMATION,
             "Value": adjusted_mutual_info_score(
-                labels_true=dataset.y,
-                labels_pred=dataset.y_pred(model),
+                labels_true=y_true,
+                labels_pred=y_pred,
            ),
         },
         {
             "Metric": "Fowlkes-Mallows score",
             "Description": FOULKES_MALLOWS_SCORE,
             "Value": fowlkes_mallows_score(
-                labels_true=dataset.y,
-                labels_pred=dataset.y_pred(model),
+                labels_true=y_true,
+                labels_pred=y_pred,
            ),
         },
     ]
+
+    return metrics, RawData(
+        true_labels=y_true,
+        predicted_labels=y_pred,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
@@ -4,7 +4,7 @@
 
 from sklearn.metrics import completeness_score
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel
 
 
@@ -47,11 +47,10 @@ def CompletenessScore(model: VMModel, dataset: VMDataset):
     - The Completeness Score only applies to clustering models; it cannot be used for other types of machine learning
     models.
     """
-    return [
-        {
-            "Completeness Score": completeness_score(
-                labels_true=dataset.y,
-                labels_pred=dataset.y_pred(model),
-            )
-        }
-    ]
+    score = completeness_score(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+    )
+    return [{"Completeness Score": score}], RawData(
+        score=score, model=model.input_id, dataset=dataset.input_id
+    )
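
Nearly every hunk above makes the same change: a test now attaches the identifiers of the inputs it ran on (model.input_id, dataset.input_id) to the RawData object it returns, so stored raw results can be traced back to the specific model and dataset that produced them. Below is a minimal sketch of that return pattern, assuming only the public RawData class and the @tags/@tasks decorators shown in the diff; the test name FeatureMeans and the use of dataset.df are illustrative and are not part of this release.

import pandas as pd

from validmind import RawData, tags, tasks


@tags("tabular_data")
@tasks("classification")
def FeatureMeans(dataset, model):
    # Hypothetical example test: per-column means of the numeric features.
    # dataset.df is assumed to expose the underlying pandas DataFrame.
    means = pd.DataFrame({"Mean": dataset.df.mean(numeric_only=True)})

    # The 2.8.12 pattern: tag the raw data with the input IDs that produced it.
    return means, RawData(
        feature_means=means,
        model=model.input_id,
        dataset=dataset.input_id,
    )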