validmind 2.5.8__py3-none-any.whl → 2.5.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (212)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +26 -7
  3. validmind/api_client.py +89 -43
  4. validmind/client.py +2 -2
  5. validmind/client_config.py +11 -14
  6. validmind/datasets/regression/fred_timeseries.py +67 -138
  7. validmind/template.py +1 -0
  8. validmind/test_suites/__init__.py +0 -2
  9. validmind/test_suites/statsmodels_timeseries.py +1 -1
  10. validmind/test_suites/summarization.py +0 -1
  11. validmind/test_suites/time_series.py +0 -43
  12. validmind/tests/__types__.py +3 -13
  13. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  14. validmind/tests/data_validation/ADF.py +31 -24
  15. validmind/tests/data_validation/AutoAR.py +9 -9
  16. validmind/tests/data_validation/AutoMA.py +23 -16
  17. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  18. validmind/tests/data_validation/AutoStationarity.py +21 -16
  19. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  20. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +82 -124
  21. validmind/tests/data_validation/ClassImbalance.py +15 -12
  22. validmind/tests/data_validation/DFGLSArch.py +19 -13
  23. validmind/tests/data_validation/DatasetDescription.py +17 -11
  24. validmind/tests/data_validation/DatasetSplit.py +7 -5
  25. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  26. validmind/tests/data_validation/Duplicates.py +33 -25
  27. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  28. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  29. validmind/tests/data_validation/HighCardinality.py +19 -12
  30. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  31. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  32. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  33. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  34. validmind/tests/data_validation/KPSS.py +34 -29
  35. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  36. validmind/tests/data_validation/MissingValues.py +32 -27
  37. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  38. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  39. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  40. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  41. validmind/tests/data_validation/ScatterPlot.py +63 -78
  42. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  43. validmind/tests/data_validation/Skewness.py +35 -37
  44. validmind/tests/data_validation/SpreadPlot.py +35 -35
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  47. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  48. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  49. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  50. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  51. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  52. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  53. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  54. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  55. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  56. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  57. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  58. validmind/tests/data_validation/UniqueRows.py +11 -6
  59. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  60. validmind/tests/data_validation/WOEBinTable.py +35 -30
  61. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  62. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  63. validmind/tests/data_validation/nlp/Hashtags.py +27 -20
  64. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  65. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  66. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  67. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  68. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  69. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  70. validmind/tests/data_validation/nlp/TextDescription.py +36 -35
  71. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  72. validmind/tests/decorator.py +81 -42
  73. validmind/tests/model_validation/BertScore.py +36 -27
  74. validmind/tests/model_validation/BleuScore.py +25 -19
  75. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  76. validmind/tests/model_validation/ContextualRecall.py +35 -13
  77. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  78. validmind/tests/model_validation/MeteorScore.py +46 -33
  79. validmind/tests/model_validation/ModelMetadata.py +32 -64
  80. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  81. validmind/tests/model_validation/RegardScore.py +30 -14
  82. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  83. validmind/tests/model_validation/RougeScore.py +36 -30
  84. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  85. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  86. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  87. validmind/tests/model_validation/TokenDisparity.py +31 -23
  88. validmind/tests/model_validation/ToxicityScore.py +26 -17
  89. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  90. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  91. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  92. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  93. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  94. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  95. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  96. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  97. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  98. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  99. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  100. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  101. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  102. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  103. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  104. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  105. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  106. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  107. validmind/tests/model_validation/ragas/AspectCritique.py +7 -0
  108. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  109. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  110. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  111. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  112. validmind/tests/model_validation/ragas/utils.py +6 -0
  113. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  114. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  115. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  116. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  117. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  118. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  119. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  120. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  121. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  122. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  123. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  124. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  125. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  126. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  127. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  128. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  129. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  130. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  131. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +31 -25
  132. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  133. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  134. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  135. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  136. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  137. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  138. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -93
  139. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  140. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
  141. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -5
  142. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  143. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  144. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  145. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  146. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  147. validmind/tests/model_validation/statsmodels/BoxPierce.py +14 -10
  148. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  149. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +19 -12
  150. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  151. validmind/tests/model_validation/statsmodels/JarqueBera.py +27 -22
  152. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  153. validmind/tests/model_validation/statsmodels/LJungBox.py +32 -28
  154. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  155. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +87 -119
  156. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  157. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  158. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  159. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  160. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  161. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  162. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  163. validmind/tests/model_validation/statsmodels/RunsTest.py +32 -28
  164. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  165. validmind/tests/model_validation/statsmodels/ShapiroWilk.py +15 -8
  166. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  167. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  168. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  169. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  170. validmind/tests/prompt_validation/Bias.py +14 -11
  171. validmind/tests/prompt_validation/Clarity.py +16 -14
  172. validmind/tests/prompt_validation/Conciseness.py +7 -5
  173. validmind/tests/prompt_validation/Delimitation.py +23 -22
  174. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  175. validmind/tests/prompt_validation/Robustness.py +12 -10
  176. validmind/tests/prompt_validation/Specificity.py +13 -11
  177. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  178. validmind/tests/run.py +68 -23
  179. validmind/unit_metrics/__init__.py +81 -144
  180. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  181. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  182. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  183. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  184. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  185. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  186. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  187. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  188. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  189. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  190. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  191. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  192. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  193. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  194. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  195. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  196. validmind/vm_models/dataset/dataset.py +2 -0
  197. validmind/vm_models/figure.py +5 -0
  198. validmind/vm_models/test/result_wrapper.py +93 -132
  199. {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/METADATA +1 -1
  200. {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/RECORD +203 -210
  201. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  202. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  203. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  204. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  205. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  206. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  207. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  208. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  209. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  210. {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/LICENSE +0 -0
  211. {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/WHEEL +0 -0
  212. {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/entry_points.txt +0 -0
validmind/tests/decorator.py

@@ -9,6 +9,7 @@
 
 import inspect
 import os
+from typing import Any, Dict, List, Tuple, Union
 from uuid import uuid4
 
 import pandas as pd
@@ -22,6 +23,8 @@ from validmind.vm_models import (
     ResultSummary,
     ResultTable,
     ResultTableMetadata,
+    VMDataset,
+    VMModel,
 )
 from validmind.vm_models.figure import (
     Figure,
@@ -36,30 +39,42 @@ from ._store import test_store
 logger = get_logger(__name__)
 
 
-def _inspect_signature(test_func: callable):
-    input_keys = ["dataset", "datasets", "model", "models"]
+_input_type_map = {
+    "dataset": VMDataset,
+    "datasets": List[VMDataset],
+    "model": VMModel,
+    "models": List[VMModel],
+}
+
 
+def _inspect_signature(test_func: callable):
     inputs = {}
     params = {}
 
     for name, arg in inspect.signature(test_func).parameters.items():
-        if name in input_keys:
-            target_dict = inputs
+        if name in _input_type_map:
+            inputs[name] = {
+                "type": _input_type_map[name],
+            }
         else:
-            target_dict = params
-
-        target_dict[name] = {
-            "type": arg.annotation,
-            "default": (
-                arg.default if arg.default is not inspect.Parameter.empty else None
-            ),
-        }
+            params[name] = {
+                "type": arg.annotation,
+                "default": (
+                    arg.default if arg.default is not inspect.Parameter.empty else None
+                ),
+            }
 
     return inputs, params
 
 
 def _build_result(  # noqa: C901
-    results, test_id, description, output_template, inputs, generate_description=True
+    results: Union[Any, Tuple[Any, ...]],
+    test_id: str,
+    inputs: List[str],
+    params: Dict[str, Any],
+    description: str = None,
+    output_template: str = None,
+    generate_description: bool = True,
 ):
     ref_id = str(uuid4())
     figure_metadata = {
@@ -70,14 +85,17 @@ def _build_result(  # noqa: C901
 
     tables = []
     figures = []
+    scalars = []
 
-    def process_item(item):
+    def process_result_item(item):
         # TOOD: build out a more robust/extensible system for this
         # TODO: custom type handlers would be really cool
 
-        # unit metrics (scalar values) - show in a simple table for now
-        if isinstance(item, int) or isinstance(item, float) or isinstance(item, str):
-            tables.append(ResultTable(data=[{test_id.split(".")[-1]: item}]))
+        # unit metrics (scalar values) - for now only one per test
+        if isinstance(item, int) or isinstance(item, float):
+            if scalars:
+                raise ValueError("Only one unit metric may be returned per test.")
+            scalars.append(item)
 
         # plots
         elif isinstance(item, Figure):
@@ -114,46 +132,66 @@ def _build_result(  # noqa: C901
     # if the results are a tuple, process each item as a separate result
     if isinstance(results, tuple):
         for item in results:
-            process_item(item)
+            process_result_item(item)
     else:
-        process_item(results)
+        process_result_item(results)
 
-    result_summary = ResultSummary(results=tables)
+    metric_inputs = [
+        sub_i.input_id if hasattr(sub_i, "input_id") else sub_i
+        for i in inputs
+        for sub_i in (i if isinstance(i, list) else [i])
+    ]
 
     return MetricResultWrapper(
         result_id=test_id,
-        metric=MetricResult(
-            key=test_id,
-            ref_id=ref_id,
-            value="Empty",
-            summary=result_summary,
+        scalar=scalars[0] if scalars else None,
+        metric=(
+            MetricResult(
+                key=test_id,
+                ref_id=ref_id,
+                value="Empty",
+                summary=ResultSummary(results=tables),
+            )
+            if tables or figures  # if tables or figures than its a traditional metric
+            else None
         ),
         figures=figures,
-        result_metadata=[
-            get_description_metadata(
-                test_id=test_id,
-                default_description=description,
-                summary=result_summary.serialize(),
-                figures=figures,
-                should_generate=generate_description,
-            )
-        ],
-        inputs=inputs,
+        result_metadata=(
+            [
+                get_description_metadata(
+                    test_id=test_id,
+                    default_description=description,
+                    summary=ResultSummary(results=tables).serialize(),
+                    figures=figures,
+                    should_generate=generate_description,
+                )
+            ]
+            if tables or figures
+            else None
+        ),
+        inputs=metric_inputs,
+        params=params,
        output_template=output_template,
     )
 
 
-def _get_run_method(func, inputs, params):
+def _get_run_method(func, func_inputs, func_params):
     def run(self: Metric):
-        input_kwargs = {}
-        for k in inputs.keys():
+        input_kwargs = {}  # map function inputs (`dataset` etc) to actual objects
+        input_ids = []  # store input_ids used so they can be logged
+        for key in func_inputs.keys():
             try:
-                input_kwargs[k] = getattr(self.inputs, k)
+                input_kwargs[key] = getattr(self.inputs, key)
+                if isinstance(input_kwargs[key], list):
+                    input_ids.extend([i.input_id for i in input_kwargs[key]])
+                else:
+                    input_ids.append(input_kwargs[key].input_id)
             except AttributeError:
-                raise MissingRequiredTestInputError(f"Missing required input: {k}.")
+                raise MissingRequiredTestInputError(f"Missing required input: {key}.")
 
         param_kwargs = {
-            k: self.params.get(k, params[k]["default"]) for k in params.keys()
+            key: self.params.get(key, func_params[key]["default"])
+            for key in func_params.keys()
        }
 
         raw_results = func(**input_kwargs, **param_kwargs)
@@ -162,8 +200,9 @@ def _get_run_method(func, inputs, params):
            results=raw_results,
            test_id=self.test_id,
            description=inspect.getdoc(self),
+            inputs=input_ids,
+            params=param_kwargs,
            output_template=self.output_template,
-            inputs=self.get_accessed_inputs(),
            generate_description=self.generate_description,
        )
 
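Note: the input/parameter split introduced above can be illustrated standalone. The sketch below mirrors the `_inspect_signature` logic from the hunk; the `VMDataset`/`VMModel` stand-in classes and the `my_test` function are illustrative assumptions, not validmind code.

```python
import inspect
from typing import List


class VMDataset:  # stand-in for validmind.vm_models.VMDataset
    pass


class VMModel:  # stand-in for validmind.vm_models.VMModel
    pass


# parameter names in this map are treated as test "inputs"; everything else is a "param"
_input_type_map = {
    "dataset": VMDataset,
    "datasets": List[VMDataset],
    "model": VMModel,
    "models": List[VMModel],
}


def _inspect_signature(test_func):
    inputs, params = {}, {}
    for name, arg in inspect.signature(test_func).parameters.items():
        if name in _input_type_map:
            inputs[name] = {"type": _input_type_map[name]}
        else:
            params[name] = {
                "type": arg.annotation,
                "default": arg.default if arg.default is not inspect.Parameter.empty else None,
            }
    return inputs, params


def my_test(dataset, model, threshold: float = 0.5):
    """A hypothetical custom test function."""


inputs, params = _inspect_signature(my_test)
print(inputs)   # {'dataset': {'type': VMDataset}, 'model': {'type': VMModel}}
print(params)   # {'threshold': {'type': float, 'default': 0.5}}
```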
validmind/tests/model_validation/BertScore.py

@@ -13,39 +13,48 @@ from validmind import tags, tasks
 @tasks("text_classification", "text_summarization")
 def BertScore(dataset, model):
     """
-    Evaluates the quality of machine-generated text using BERTScore metrics and visualizes the results through histograms
-    and bar charts, alongside compiling a comprehensive table of descriptive statistics for each BERTScore metric.
-
-    **Purpose:**
-    This function is designed to assess the quality of text generated by machine learning models using BERTScore metrics.
-    BERTScore evaluates text generation models' performance by calculating precision, recall, and F1 score based on BERT
-    contextual embeddings.
-
-    **Test Mechanism:**
-    The function starts by extracting the true and predicted values from the provided dataset and model. It then initializes
-    the BERTScore evaluator. For each pair of true and predicted texts, the function calculates the BERTScore metrics and
-    compiles them into a dataframe. Histograms and bar charts are generated for each BERTScore metric (Precision, Recall,
-    and F1 Score) to visualize their distribution. Additionally, a table of descriptive statistics (mean, median, standard
-    deviation, minimum, and maximum) is compiled for each metric, providing a comprehensive summary of the model's performance.
-
-    **Signs of High Risk:**
-    - Consistently low scores across BERTScore metrics could indicate poor quality in the generated text, suggesting that the model
-    fails to capture the essential content of the reference texts.
+    Assesses the quality of machine-generated text using BERTScore metrics and visualizes results through histograms
+    and bar charts, alongside compiling a comprehensive table of descriptive statistics.
+
+    ### Purpose
+
+    This function is designed to assess the quality of text generated by machine learning models using BERTScore
+    metrics. BERTScore evaluates text generation models' performance by calculating precision, recall, and F1 score
+    based on BERT contextual embeddings.
+
+    ### Test Mechanism
+
+    The function starts by extracting the true and predicted values from the provided dataset and model. It then
+    initializes the BERTScore evaluator. For each pair of true and predicted texts, the function calculates the
+    BERTScore metrics and compiles them into a dataframe. Histograms and bar charts are generated for each BERTScore
+    metric (Precision, Recall, and F1 Score) to visualize their distribution. Additionally, a table of descriptive
+    statistics (mean, median, standard deviation, minimum, and maximum) is compiled for each metric, providing a
+    comprehensive summary of the model's performance.
+
+    ### Signs of High Risk
+
+    - Consistently low scores across BERTScore metrics could indicate poor quality in the generated text, suggesting
+    that the model fails to capture the essential content of the reference texts.
     - Low precision scores might suggest that the generated text contains a lot of redundant or irrelevant information.
     - Low recall scores may indicate that important information from the reference text is being omitted.
-    - An imbalanced performance between precision and recall, reflected by a low F1 Score, could signal issues in the model's ability
-    to balance informativeness and conciseness.
+    - An imbalanced performance between precision and recall, reflected by a low F1 Score, could signal issues in the
+    model's ability to balance informativeness and conciseness.
 
-    **Strengths:**
-    - Provides a multifaceted evaluation of text quality through different BERTScore metrics, offering a detailed view of model performance.
-    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the scores.
+    ### Strengths
+
+    - Provides a multifaceted evaluation of text quality through different BERTScore metrics, offering a detailed view
+    of model performance.
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the
+    scores.
     - Descriptive statistics offer a concise summary of the model's strengths and weaknesses in generating text.
 
-    **Limitations:**
-    - BERTScore relies on the contextual embeddings from BERT models, which may not fully capture all nuances of text similarity.
+    ### Limitations
+
+    - BERTScore relies on the contextual embeddings from BERT models, which may not fully capture all nuances of text
+    similarity.
     - The evaluation relies on the availability of high-quality reference texts, which may not always be obtainable.
-    - While useful for comparison, BERTScore metrics alone do not provide a complete assessment of a model's performance and should be
-    supplemented with other metrics and qualitative analysis.
+    - While useful for comparison, BERTScore metrics alone do not provide a complete assessment of a model's
+    performance and should be supplemented with other metrics and qualitative analysis.
     """
 
     # Extract true and predicted values
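Note: the reworked description above outlines per-pair precision/recall/F1 plus summary statistics. A minimal sketch of that kind of computation, using the Hugging Face `evaluate` wrapper (which requires the `bert-score` package) rather than validmind's internals; the example texts, `lang="en"` setting, and column names are assumptions.

```python
import evaluate
import pandas as pd

# load the BERTScore metric via the evaluate library
bertscore = evaluate.load("bertscore")

references = ["The cat sat on the mat.", "It is raining heavily today."]
predictions = ["A cat is sitting on the mat.", "Today it rains very heavily."]

# per-pair precision, recall, and F1 based on BERT contextual embeddings
scores = bertscore.compute(predictions=predictions, references=references, lang="en")

df = pd.DataFrame(
    {"Precision": scores["precision"], "Recall": scores["recall"], "F1": scores["f1"]}
)
# descriptive statistics comparable to the table the test compiles
print(df.describe().loc[["mean", "50%", "std", "min", "max"]])
```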
validmind/tests/model_validation/BleuScore.py

@@ -16,39 +16,45 @@ def BleuScore(dataset, model):
     Evaluates the quality of machine-generated text using BLEU metrics and visualizes the results through histograms
     and bar charts, alongside compiling a comprehensive table of descriptive statistics for BLEU scores.
 
-    **Purpose:**
+    ### Purpose
+
     This function is designed to assess the quality of text generated by machine learning models using the BLEU metric.
     BLEU, which stands for Bilingual Evaluation Understudy, is a metric used to evaluate the overlap of n-grams between
     the machine-generated text and reference texts. This evaluation is crucial for tasks such as text summarization,
     machine translation, and text generation, where the goal is to produce text that accurately reflects the content
     and meaning of human-crafted references.
 
-    **Test Mechanism:**
-    The function starts by extracting the true and predicted values from the provided dataset and model. It then initializes
-    the BLEU evaluator. For each pair of true and predicted texts, the function calculates the BLEU scores and compiles them
-    into a dataframe. Histograms and bar charts are generated for the BLEU scores to visualize their distribution. Additionally,
-    a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum) is compiled for the BLEU scores,
-    providing a comprehensive summary of the model's performance.
+    ### Test Mechanism
+
+    The function starts by extracting the true and predicted values from the provided dataset and model. It then
+    initializes the BLEU evaluator. For each pair of true and predicted texts, the function calculates the BLEU scores
+    and compiles them into a dataframe. Histograms and bar charts are generated for the BLEU scores to visualize their
+    distribution. Additionally, a table of descriptive statistics (mean, median, standard deviation, minimum, and
+    maximum) is compiled for the BLEU scores, providing a comprehensive summary of the model's performance.
+
+    ### Signs of High Risk
 
-    **Signs of High Risk:**
-    - Consistently low BLEU scores could indicate poor quality in the generated text, suggesting that the model fails to capture
-    the essential content of the reference texts.
+    - Consistently low BLEU scores could indicate poor quality in the generated text, suggesting that the model fails
+    to capture the essential content of the reference texts.
     - Low precision scores might suggest that the generated text contains a lot of redundant or irrelevant information.
     - Low recall scores may indicate that important information from the reference text is being omitted.
-    - An imbalanced performance between precision and recall, reflected by a low BLEU score, could signal issues in the model's
-    ability to balance informativeness and conciseness.
+    - An imbalanced performance between precision and recall, reflected by a low BLEU score, could signal issues in the
+    model's ability to balance informativeness and conciseness.
+
+    ### Strengths
 
-    **Strengths:**
     - Provides a straightforward and widely-used evaluation of text quality through BLEU scores.
-    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the scores.
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the
+    scores.
     - Descriptive statistics offer a concise summary of the model's strengths and weaknesses in generating text.
 
-    **Limitations:**
-    - BLEU metrics primarily focus on n-gram overlap and may not fully capture semantic coherence, fluency, or grammatical quality
-    of the text.
+    ### Limitations
+
+    - BLEU metrics primarily focus on n-gram overlap and may not fully capture semantic coherence, fluency, or
+    grammatical quality of the text.
     - The evaluation relies on the availability of high-quality reference texts, which may not always be obtainable.
-    - While useful for comparison, BLEU scores alone do not provide a complete assessment of a model's performance and should be
-    supplemented with other metrics and qualitative analysis.
+    - While useful for comparison, BLEU scores alone do not provide a complete assessment of a model's performance and
+    should be supplemented with other metrics and qualitative analysis.
     """
 
     # Extract true and predicted values
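Note: a hedged sketch of per-row BLEU scoring followed by summary statistics, in the spirit of the description above. It uses the Hugging Face `evaluate` BLEU metric rather than validmind's internals; the example sentences and column name are illustrative assumptions.

```python
import evaluate
import pandas as pd

bleu = evaluate.load("bleu")

references = [
    "The quick brown fox jumps over the lazy dog near the river bank.",
    "Machine translation quality has improved a lot in recent years.",
]
predictions = [
    "The quick brown fox jumped over the lazy dog near the river bank.",
    "Machine translation quality improved a lot in recent years.",
]

# one BLEU score per (prediction, reference) pair
row_scores = [
    bleu.compute(predictions=[pred], references=[[ref]])["bleu"]
    for pred, ref in zip(predictions, references)
]

df = pd.DataFrame({"BLEU": row_scores})
print(df.describe())  # mean, std, min, quartiles, max of the per-row scores
```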
validmind/tests/model_validation/ClusterSizeDistribution.py

@@ -13,40 +13,44 @@ from validmind.vm_models import Figure, Metric
 @dataclass
 class ClusterSizeDistribution(Metric):
     """
-    Compares and visualizes the distribution of cluster sizes in model predictions and actual data for assessing
-    clustering model performance.
-
-    **Purpose:** The purpose of the `ClusterSizeDistribution` metric is to assess the performance of clustering models.
-    It does this by comparing the distribution of cluster sizes in the predictions made by the model and the actual
-    data. Observing the cluster distribution helps gain insights into whether the model's output aligns well with the
-    actual dataset distribution.
-
-    **Test Mechanism:** The testing mechanism for `ClusterSizeDistribution` involves first running the clustering model
-    on the training dataset, storing predictions, and comparing these predictions with the actual output. The actual
-    and predicted outputs are then converted into pandas dataframes, which conveniently enables the use of pandas
-    built-in functions to derive cluster size distributions. Two histograms are constructed from this data: one for the
-    actual distribution and one for the predicted distribution. These histograms are then plotted side-by-side for
-    visual comparison.
-
-    **Signs of High Risk:**
-    * Discrepancies between the actual cluster size distribution and the predicted cluster size distribution may
-    indicate high risk.
-    * An irregular distribution of data across clusters in the predicted outcomes points towards an inaccurate
-    prediction model.
-    * A high number of outlier clusters could indicate that the model has trouble correctly grouping data.
-
-    **Strengths:**
-    * `ClusterSizeDistribution` provides a visual and intuitive way to compare the performance of the clustering model
-    against the actual data.
-    * This metric can effectively reveal where the model might be over- or underestimating cluster sizes.
-    * It works well with any clustering models, making it a versatile comparison tool.
-
-    **Limitations:**
-    * The metric assumes that the actual cluster distribution is optimal, which may not always be the case.
-    * It relies heavily on visual comparison, which might be subjective and may not provide a precise numerical measure
-    of model performance.
-    * The metric might not fully capture other important aspects of clustering such as cluster density, distances
-    between clusters, and the shape of clusters.
+    Assesses the performance of clustering models by comparing the distribution of cluster sizes in model predictions
+    with the actual data.
+
+    ### Purpose
+
+    The Cluster Size Distribution test aims to assess the performance of clustering models by comparing the
+    distribution of cluster sizes in the model's predictions with the actual data. This comparison helps determine if
+    the clustering model's output aligns well with the true cluster distribution, providing insights into the model's
+    accuracy and performance.
+
+    ### Test Mechanism
+
+    The test mechanism involves the following steps:
+    - Run the clustering model on the provided dataset to obtain predictions.
+    - Convert both the actual and predicted outputs into pandas dataframes.
+    - Use pandas built-in functions to derive the cluster size distributions from these dataframes.
+    - Construct two histograms: one for the actual cluster size distribution and one for the predicted distribution.
+    - Plot the histograms side-by-side for visual comparison.
+
+    ### Signs of High Risk
+
+    - Discrepancies between the actual cluster size distribution and the predicted cluster size distribution.
+    - Irregular distribution of data across clusters in the predicted outcomes.
+    - High number of outlier clusters suggesting the model struggles to correctly group data.
+
+    ### Strengths
+
+    - Provides a visual and intuitive way to compare the clustering model's performance against actual data.
+    - Effectively reveals where the model may be over- or underestimating cluster sizes.
+    - Versatile as it works well with any clustering model.
+
+    ### Limitations
+
+    - Assumes that the actual cluster distribution is optimal, which may not always be the case.
+    - Relies heavily on visual comparison, which could be subjective and may not offer a precise numerical measure of
+    performance.
+    - May not fully capture other important aspects of clustering, such as cluster density, distances between clusters,
+    and the shape of clusters.
     """
 
     name = "cluster_size_distribution"
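Note: a minimal sketch of the comparison the rewritten description lists above, i.e. cluster-size counts of actual vs. predicted labels plotted side by side. The toy label series are assumptions; the real test pulls labels from the VM dataset and model.

```python
import matplotlib.pyplot as plt
import pandas as pd

# toy cluster assignments standing in for dataset.y and model predictions
actual = pd.Series([0, 0, 1, 1, 1, 2, 2, 2, 2, 0])
predicted = pd.Series([0, 0, 1, 1, 2, 2, 2, 2, 2, 1])

# cluster size distributions derived with pandas built-ins
sizes = pd.DataFrame(
    {
        "Actual": actual.value_counts().sort_index(),
        "Predicted": predicted.value_counts().sort_index(),
    }
).fillna(0)

sizes.plot(kind="bar", title="Cluster size distribution: actual vs. predicted")
plt.xlabel("Cluster label")
plt.ylabel("Count")
plt.show()
```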
validmind/tests/model_validation/ContextualRecall.py

@@ -13,25 +13,47 @@ from validmind import tags, tasks
 @tasks("text_classification", "text_summarization")
 def ContextualRecall(dataset, model):
     """
-    Evaluates a Natural Language Generation model's ability to generate contextually relevant and factually correct text, visualizing the results through histograms and bar charts, alongside compiling a comprehensive table of descriptive statistics for contextual recall scores.
+    Evaluates a Natural Language Generation model's ability to generate contextually relevant and factually correct
+    text, visualizing the results through histograms and bar charts, alongside compiling a comprehensive table of
+    descriptive statistics for contextual recall scores.
 
-    **Purpose:**
-    The Contextual Recall metric is used to evaluate the ability of a natural language generation (NLG) model to generate text that appropriately reflects the given context or prompt. It measures the model's capability to remember and reproduce the main context in its resulting output. This metric is critical in natural language processing tasks, as the coherency and contextuality of the generated text are essential.
+    ### Purpose
 
-    **Test Mechanism:**
-    The function starts by extracting the true and predicted values from the provided dataset and model. It then tokenizes the reference and candidate texts into discernible words or tokens using NLTK. The token overlap between the reference and candidate texts is identified, and the Contextual Recall score is computed by dividing the number of overlapping tokens by the total number of tokens in the reference text. Scores are calculated for each test dataset instance, resulting in an array of scores. These scores are visualized using a histogram and a bar chart to show score variations across different rows. Additionally, a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum) is compiled for the contextual recall scores, providing a comprehensive summary of the model's performance.
+    The Contextual Recall metric is used to evaluate the ability of a natural language generation (NLG) model to
+    generate text that appropriately reflects the given context or prompt. It measures the model's capability to
+    remember and reproduce the main context in its resulting output. This metric is critical in natural language
+    processing tasks, as the coherency and contextuality of the generated text are essential.
 
-    **Signs of High Risk:**
-    - Low contextual recall scores could indicate that the model is not effectively reflecting the original context in its output, leading to incoherent or contextually misaligned text.
+    ### Test Mechanism
+
+    The function starts by extracting the true and predicted values from the provided dataset and model. It then
+    tokenizes the reference and candidate texts into discernible words or tokens using NLTK. The token overlap between
+    the reference and candidate texts is identified, and the Contextual Recall score is computed by dividing the number
+    of overlapping tokens by the total number of tokens in the reference text. Scores are calculated for each test
+    dataset instance, resulting in an array of scores. These scores are visualized using a histogram and a bar chart to
+    show score variations across different rows. Additionally, a table of descriptive statistics (mean, median,
+    standard deviation, minimum, and maximum) is compiled for the contextual recall scores, providing a comprehensive
+    summary of the model's performance.
+
+    ### Signs of High Risk
+
+    - Low contextual recall scores could indicate that the model is not effectively reflecting the original context in
+    its output, leading to incoherent or contextually misaligned text.
     - A consistent trend of low recall scores could suggest underperformance of the model.
 
-    **Strengths:**
-    - Provides a quantifiable measure of a model's adherence to the context and factual elements of the generated narrative.
-    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of contextual recall scores.
-    - Descriptive statistics offer a concise summary of the model's performance in generating contextually relevant texts.
+    ### Strengths
+
+    - Provides a quantifiable measure of a model's adherence to the context and factual elements of the generated
+    narrative.
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of
+    contextual recall scores.
+    - Descriptive statistics offer a concise summary of the model's performance in generating contextually relevant
+    texts.
+
+    ### Limitations
 
-    **Limitations:**
-    - The focus on word overlap could result in high scores for texts that use many common words, even when these texts lack coherence or meaningful context.
+    - The focus on word overlap could result in high scores for texts that use many common words, even when these texts
+    lack coherence or meaningful context.
     - This metric does not consider the order of words, which could lead to overestimated scores for scrambled outputs.
     - Models that effectively use infrequent words might be undervalued, as these words might not overlap as often.
     """
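Note: a hedged sketch of the token-overlap recall the description above outlines (overlapping tokens divided by the number of reference tokens). Whitespace splitting stands in for the NLTK tokenizer the test mentions, and the set-based overlap and example sentences are assumptions; the actual test adds per-row plots and summary statistics on top of this score.

```python
def contextual_recall(reference: str, candidate: str) -> float:
    # whitespace tokenization as a stand-in for NLTK word tokenization
    ref_tokens = set(reference.lower().split())
    cand_tokens = set(candidate.lower().split())
    # overlapping tokens / total reference tokens
    return len(ref_tokens & cand_tokens) / len(ref_tokens) if ref_tokens else 0.0


print(
    contextual_recall(
        "the central bank raised interest rates to curb inflation",
        "interest rates were raised by the central bank to fight inflation",
    )
)
```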
validmind/tests/model_validation/FeaturesAUC.py

@@ -19,24 +19,43 @@ logger = get_logger(__name__)
 @dataclass
 class FeaturesAUC(Metric):
     """
-    Evaluates the discriminatory power of each individual feature within a binary classification model by calculating the Area Under the Curve (AUC) for each feature separately.
+    Evaluates the discriminatory power of each individual feature within a binary classification model by calculating
+    the Area Under the Curve (AUC) for each feature separately.
 
-    **Purpose**: The central objective of this metric is to quantify how well each feature on its own can differentiate between the two classes in a binary classification problem. It serves as a univariate analysis tool that can help in pre-modeling feature selection or post-modeling interpretation.
+    ### Purpose
 
-    **Test Mechanism**: For each feature, the metric treats the feature values as raw scores to compute the AUC against the actual binary outcomes. It provides an AUC value for each feature, offering a simple yet powerful indication of each feature's univariate classification strength.
+    The central objective of this metric is to quantify how well each feature on its own can differentiate between the
+    two classes in a binary classification problem. It serves as a univariate analysis tool that can help in
+    pre-modeling feature selection or post-modeling interpretation.
 
-    **Signs of High Risk**:
-    - A feature with a low AUC score may not be contributing significantly to the differentiation between the two classes, which could be a concern if it is expected to be predictive.
-    - Conversely, a surprisingly high AUC for a feature not believed to be informative may suggest data leakage or other issues with the data.
+    ### Test Mechanism
 
-    **Strengths**:
-    - By isolating each feature, it highlights the individual contribution of features to the classification task without the influence of other variables.
-    - Useful for both initial feature evaluation and for providing insights into the model's reliance on individual features after model training.
+    For each feature, the metric treats the feature values as raw scores to compute the AUC against the actual binary
+    outcomes. It provides an AUC value for each feature, offering a simple yet powerful indication of each feature's
+    univariate classification strength.
 
-    **Limitations**:
-    - Does not reflect the combined effects of features or any interaction between them, which can be critical in certain models.
-    - The AUC values are calculated without considering the model's use of the features, which could lead to different interpretations of feature importance when considering the model holistically.
-    - This metric is applicable only to binary classification tasks and cannot be directly extended to multiclass classification or regression without modifications.
+    ### Signs of High Risk
+
+    - A feature with a low AUC score may not be contributing significantly to the differentiation between the two
+    classes, which could be a concern if it is expected to be predictive.
+    - Conversely, a surprisingly high AUC for a feature not believed to be informative may suggest data leakage or
+    other issues with the data.
+
+    ### Strengths
+
+    - By isolating each feature, it highlights the individual contribution of features to the classification task
+    without the influence of other variables.
+    - Useful for both initial feature evaluation and for providing insights into the model's reliance on individual
+    features after model training.
+
+    ### Limitations
+
+    - Does not reflect the combined effects of features or any interaction between them, which can be critical in
+    certain models.
+    - The AUC values are calculated without considering the model's use of the features, which could lead to different
+    interpretations of feature importance when considering the model holistically.
+    - This metric is applicable only to binary classification tasks and cannot be directly extended to multiclass
+    classification or regression without modifications.
     """
 
     name = "features_auc"
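Note: a minimal sketch of the per-feature AUC idea the description above explains, where each feature column is treated as a raw score against the binary target. The toy dataframe and column names are illustrative assumptions; the real test reads features and target from the VM dataset and also plots the results.

```python
import pandas as pd
from sklearn.metrics import roc_auc_score

df = pd.DataFrame(
    {
        "income": [20, 35, 50, 65, 80, 95],
        "age": [25, 45, 30, 50, 40, 60],
        "default": [0, 0, 0, 1, 1, 1],  # binary target
    }
)

# univariate AUC: each feature's values used directly as a classification score
aucs = {
    col: roc_auc_score(df["default"], df[col])
    for col in df.columns
    if col != "default"
}
print(pd.Series(aucs, name="AUC").sort_values(ascending=False))
```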