validmind 2.5.24__py3-none-any.whl → 2.6.7__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.7.dist-info/METADATA +137 -0
  179. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.24.dist-info/METADATA +0 -118
  196. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
validmind/tests/load.py CHANGED
@@ -4,12 +4,10 @@
 
 """Module for listing and loading tests."""
 
-import importlib
 import inspect
 import json
-import sys
-from pathlib import Path
 from pprint import pformat
+from typing import List
 from uuid import uuid4
 
 import pandas as pd
@@ -18,39 +16,148 @@ from ipywidgets import HTML, Accordion
 from ..errors import LoadTestError, MissingDependencyError
 from ..html_templates.content_blocks import test_content_block_html
 from ..logging import get_logger
-from ..unit_metrics.composite import load_composite_metric
-from ..utils import (
-    NumpyEncoder,
-    display,
-    format_dataframe,
-    fuzzy_match,
-    md_to_html,
-    test_id_to_name,
-)
+from ..utils import display, format_dataframe, fuzzy_match, md_to_html, test_id_to_name
+from ..vm_models import VMDataset, VMModel
 from .__types__ import TestID
 from ._store import test_provider_store, test_store
-from .decorator import test as test_decorator
-from .utils import test_description
 
 logger = get_logger(__name__)
 
 
-def __init__():
-    directories = [p.name for p in Path(__file__).parent.iterdir() if p.is_dir()]
+INPUT_TYPE_MAP = {
+    "dataset": VMDataset,
+    "datasets": List[VMDataset],
+    "model": VMModel,
+    "models": List[VMModel],
+}
 
-    for d in directories:
-        for path in Path(__file__).parent.joinpath(d).glob("**/**/*.py"):
-            if path.name.startswith("__") or not path.name[0].isupper():
-                continue  # skip __init__.py and other special files as well as non Test files
-            test_id = (
-                f"validmind.{d}.{path.parent.stem}.{path.stem}"
-                if path.parent.parent.stem == d
-                else f"validmind.{d}.{path.stem}"
-            )
-            test_store.register_test(test_id)
 
+def _inspect_signature(test_func: callable):
+    inputs = {}
+    params = {}
 
-__init__()
+    for name, arg in inspect.signature(test_func).parameters.items():
+        if name in INPUT_TYPE_MAP:
+            inputs[name] = {"type": INPUT_TYPE_MAP[name]}
+        elif name == "args" or name == "kwargs":
+            continue
+        else:
+            params[name] = {
+                "type": (
+                    arg.annotation.__name__
+                    if arg.annotation and hasattr(arg.annotation, "__name__")
+                    else None
+                ),
+                "default": (
+                    arg.default if arg.default is not inspect.Parameter.empty else None
+                ),
+            }
+
+    return inputs, params
+
+
+def load_test(test_id: str, test_func: callable = None, reload: bool = False):
+    """Load a test by test ID
+
+    Test IDs are in the format `namespace.path_to_module.TestClassOrFuncName[:tag]`.
+    The tag is optional and is used to distinguish between multiple results from the
+    same test.
+
+    Args:
+        test_id (str): The test ID in the format `namespace.path_to_module.TestName[:tag]`
+        test_func (callable, optional): The test function to load. If not provided, the
+            test will be loaded from the test provider. Defaults to None.
+    """
+    # remove tag if present
+    test_id = test_id.split(":", 1)[0]
+    namespace = test_id.split(".", 1)[0]
+
+    # if not already loaded, load it from appropriate provider
+    if test_id not in test_store.tests or reload:
+        if test_id.startswith("validmind.composite_metric"):
+            # TODO: add composite metric loading
+            pass
+
+        if not test_func:
+            if not test_provider_store.has_test_provider(namespace):
+                raise LoadTestError(
+                    f"No test provider found for namespace: {namespace}"
+                )
+
+            provider = test_provider_store.get_test_provider(namespace)
+
+            try:
+                test_func = provider.load_test(test_id.split(".", 1)[1])
+            except Exception as e:
+                raise LoadTestError(
+                    f"Unable to load test '{test_id}' from {namespace} test provider",
+                    original_error=e,
+                ) from e
+
+        # add test_id as an attribute to the test function
+        test_func.test_id = test_id
+
+        # fallback to using func name if no docstring is found
+        if not inspect.getdoc(test_func):
+            test_func.__doc__ = f"{test_func.__name__} ({test_id})"
+
+        # add inputs and params as attributes to the test function
+        test_func.inputs, test_func.params = _inspect_signature(test_func)
+
+        test_store.register_test(test_id, test_func)
+
+    return test_store.get_test(test_id)
+
+
+def _list_test_ids():
+    test_ids = []
+
+    for namespace, test_provider in test_provider_store.test_providers.items():
+        test_ids.extend(
+            [f"{namespace}.{test_id}" for test_id in sorted(test_provider.list_tests())]
+        )
+
+    return test_ids
+
+
+def _load_tests(test_ids):
+    """Load a set of tests, handling missing dependencies."""
+    tests = {}
+
+    for test_id in test_ids:
+        try:
+            tests[test_id] = load_test(test_id)
+        except LoadTestError as e:
+            if not e.original_error or not isinstance(
+                e.original_error, MissingDependencyError
+            ):
+                raise e
+
+            e = e.original_error
+
+            logger.debug(str(e))
+
+            if e.extra:
+                logger.info(
+                    f"Skipping `{test_id}` as it requires extra dependencies: {e.required_dependencies}."
+                    f" Please run `pip install validmind[{e.extra}]` to view and run this test."
+                )
+            else:
+                logger.info(
+                    f"Skipping `{test_id}` as it requires missing dependencies: {e.required_dependencies}."
+                    " Please install the missing dependencies to view and run this test."
+                )
+
+    return tests
+
+
+def _test_description(test_description: str, num_lines: int = 5):
+    description = test_description.strip("\n").strip()
+
+    if len(description.split("\n")) > num_lines:
+        return description.strip().split("\n")[0] + "..."
+
+    return description
 
 
 def _pretty_list_tests(tests, truncate=True):
@@ -58,9 +165,12 @@ def _pretty_list_tests(tests, truncate=True):
         {
             "ID": test_id,
             "Name": test_id_to_name(test_id),
-            "Description": test_description(test, truncate),
-            "Required Inputs": test.required_inputs,
-            "Params": test.default_params or {},
+            "Description": _test_description(
+                inspect.getdoc(test),
+                num_lines=(5 if truncate else 999999),
+            ),
+            "Required Inputs": test.inputs,
+            "Params": test.params,
         }
         for test_id, test in tests.items()
     ]
@@ -68,9 +178,57 @@ def _pretty_list_tests(tests, truncate=True):
 
     return format_dataframe(pd.DataFrame(table))
 
 
-def list_tests(
-    filter=None, task=None, tags=None, pretty=True, truncate=True, __as_class=False
-):
+def list_tags():
+    """
+    List unique tags from all test classes.
+    """
+
+    unique_tags = set()
+
+    for test in _load_tests(list_tests(pretty=False)):
+        unique_tags.update(test.__tags__)
+
+    return list(unique_tags)
+
+
+def list_tasks_and_tags():
+    """
+    List all task types and their associated tags, with one row per task type and
+    all tags for a task type in one row.
+
+    Returns:
+        pandas.DataFrame: A DataFrame with 'Task Type' and concatenated 'Tags'.
+    """
+    task_tags_dict = {}
+
+    for test in _load_tests(list_tests(pretty=False)):
+        for task in test.__tasks__:
+            task_tags_dict.setdefault(task, set()).update(test.__tags__)
+
+    return format_dataframe(
+        pd.DataFrame(
+            [
+                {"Task": task, "Tags": ", ".join(tags)}
+                for task, tags in task_tags_dict.items()
+            ]
+        )
+    )
+
+
+def list_tasks():
+    """
+    List unique tasks from all test classes.
+    """
+
+    unique_tasks = set()
+
+    for test in _load_tests(list_tests(pretty=False)):
+        unique_tasks.update(test.__tasks__)
+
+    return list(unique_tasks)
+
+
+def list_tests(filter=None, task=None, tags=None, pretty=True, truncate=True):
     """List all tests in the tests directory.
 
     Args:
@@ -88,30 +246,13 @@ def list_tests(
     Returns:
         list or pandas.DataFrame: A list of all tests or a formatted table.
     """
-    # tests = {
-    #     test_id: load_test(test_id, reload=True)
-    #     for test_id in test_store.get_test_ids()
-    # }
-    tests = {}
-    for test_id in test_store.get_test_ids():
-        try:
-            tests[test_id] = load_test(test_id, reload=True)
-        except MissingDependencyError as e:
-            # skip tests that have missing dependencies
-            logger.debug(str(e))
+    test_ids = _list_test_ids()
 
-            if e.extra:
-                logger.info(
-                    f"Skipping `{test_id}` as it requires extra dependencies: {e.required_dependencies}."
-                    f" Please run `pip install validmind[{e.extra}]` to view and run this test."
-                )
-            else:
-                logger.info(
-                    f"Skipping `{test_id}` as it requires missing dependencies: {e.required_dependencies}."
-                    " Please install the missing dependencies to view and run this test."
-                )
+    # no need to load test funcs (takes a while) if we're just returning the test ids
+    if not filter and not task and not tags and not pretty:
+        return test_ids
 
-            continue
+    tests = _load_tests(test_ids)
 
     # first search by the filter string since it's the most general search
     if filter is not None:
@@ -119,114 +260,29 @@ def list_tests(
             test_id: test
             for test_id, test in tests.items()
             if filter.lower() in test_id.lower()
-            or any(filter.lower() in task.lower() for task in test.tasks)
-            or any(fuzzy_match(tag, filter.lower()) for tag in test.tags)
+            or any(filter.lower() in task.lower() for task in test.__tasks__)
+            or any(fuzzy_match(tag, filter.lower()) for tag in test.__tags__)
         }
 
     # then filter by task type and tags since they are more specific
    if task is not None:
-        tests = {test_id: test for test_id, test in tests.items() if task in test.tasks}
+        tests = {
+            test_id: test for test_id, test in tests.items() if task in test.__tasks__
+        }
 
     if tags is not None:
         tests = {
             test_id: test
             for test_id, test in tests.items()
-            if all(tag in test.tags for tag in tags)
+            if all(tag in test.__tags__ for tag in tags)
         }
 
-    if __as_class:
-        return list(tests.values())
-
     if not pretty:
-        # only return test ids
         return list(tests.keys())
 
     return _pretty_list_tests(tests, truncate=truncate)
 
 
-def _load_validmind_test(test_id, reload=False):
-    parts = test_id.split(":")[0].split(".")
-
-    test_module = ".".join(parts[1:-1])
-    test_class = parts[-1]
-
-    error = None
-    test = None
-
-    try:
-        full_path = f"validmind.tests.{test_module}.{test_class}"
-
-        if reload and full_path in sys.modules:
-            module = importlib.reload(sys.modules[full_path])
-        else:
-            module = importlib.import_module(full_path)
-
-        test = getattr(module, test_class)
-    except ModuleNotFoundError as e:
-        error = f"Unable to load test {test_id}. {e}"
-    except AttributeError:
-        error = f"Unable to load test {test_id}. Test not in module: {test_class}"
-
-    return error, test
-
-
-def load_test(test_id: str, reload=False):
-    """Load a test by test ID
-
-    Test IDs are in the format `namespace.path_to_module.TestClassOrFuncName[:result_id]`.
-    The result ID is optional and is used to distinguish between multiple results from the
-    running the same test.
-
-    Args:
-        test_id (str): The test ID in the format `namespace.path_to_module.TestName[:result_id]`
-        reload (bool, optional): Whether to reload the test module. Defaults to False.
-    """
-    # TODO: we should use a dedicated class for test IDs to handle this consistently
-    test_id, result_id = test_id.split(":", 1) if ":" in test_id else (test_id, None)
-
-    error = None
-    namespace = test_id.split(".", 1)[0]
-
-    # TODO: lets implement an extensible loading system instead of this ugly if/else
-    if test_store.get_custom_test(test_id):
-        test = test_store.get_custom_test(test_id)
-
-    elif test_id.startswith("validmind.composite_metric"):
-        error, test = load_composite_metric(test_id)
-
-    elif namespace == "validmind":
-        error, test = _load_validmind_test(test_id, reload=reload)
-
-    elif test_provider_store.has_test_provider(namespace):
-        provider = test_provider_store.get_test_provider(namespace)
-
-        try:
-            test = provider.load_test(test_id.split(".", 1)[1])
-        except Exception as e:
-            error = (
-                f"Unable to load test {test_id} from test provider: "
-                f"{provider}\n Got Exception: {e}"
-            )
-
-    else:
-        error = f"Unable to load test {test_id}. No test provider found."
-
-    if error:
-        logger.error(error)
-        raise LoadTestError(error)
-
-    if inspect.isfunction(test):
-        # if its a function, we decorate it and then load the class
-        # TODO: simplify this as we move towards all functional metrics
-        # "_" is used here so it doesn't conflict with other test ids
-        test_decorator("_")(test)
-        test = test_store.get_custom_test("_")
-
-    test.test_id = f"{test_id}:{result_id}" if result_id else test_id
-
-    return test
-
-
 def describe_test(test_id: TestID = None, raw: bool = False, show: bool = True):
     """Get or show details about the test
 
@@ -239,13 +295,13 @@ def describe_test(test_id: TestID = None, raw: bool = False, show: bool = True):
         raw (bool, optional): If True, returns a dictionary with the test details.
             Defaults to False.
     """
-    test = load_test(test_id, reload=True)
+    test = load_test(test_id)
 
     details = {
         "ID": test_id,
         "Name": test_id_to_name(test_id),
-        "Required Inputs": test.required_inputs or [],
-        "Params": test.default_params or {},
+        "Required Inputs": test.inputs or [],
+        "Params": test.params or {},
         "Description": inspect.getdoc(test).strip() or "",
     }
 
@@ -260,8 +316,8 @@ def describe_test(test_id: TestID = None, raw: bool = False, show: bool = True):
         required_inputs=", ".join(details["Required Inputs"] or ["None"]),
         params_table="\n".join(
             [
-                f"<tr><td>{param}</td><td>{pformat(value, indent=4)}</td></tr>"
-                for param, value in details["Params"].items()
+                f"<tr><td>{param}</td><td>{pformat(param_spec['default'], indent=4)}</td></tr>"
+                for param, param_spec in details["Params"].items()
             ]
         ),
         table_display="table" if details["Params"] else "none",
@@ -269,7 +325,10 @@ def describe_test(test_id: TestID = None, raw: bool = False, show: bool = True):
            {name: f"my_vm_{name}" for name in (details["Required Inputs"] or [])},
             indent=4,
         ),
-        example_params=json.dumps(details["Params"] or {}, indent=4, cls=NumpyEncoder),
+        example_params=json.dumps(
+            {param: f"my_vm_{param}" for param in (details["Params"] or {}).keys()},
+            indent=4,
+        ),
         instructions_display="block" if show else "none",
     )
@@ -279,6 +338,6 @@ def describe_test(test_id: TestID = None, raw: bool = False, show: bool = True):
     display(
         Accordion(
             children=[HTML(html)],
-            titles=[f"Test Description: {details['Name']} ('{test_id}')"],
+            titles=[f"Test: {details['Name']} ('{test_id}')"],
         )
     )
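The rewritten loader drops the old module-walking and class-based metadata: every test is now resolved through the provider registry, and _inspect_signature derives inputs and params directly from the function signature. A minimal sketch of the caller-facing API under 2.6.7 (the ClassImbalance ID and its parameter are shown for illustration; the exact params of any given test may differ):

from validmind.tests import describe_test, list_tests, load_test

# With no filters and pretty=False, list_tests() returns plain test IDs without
# loading any test modules -- the fast path added in this version.
test_ids = list_tests(pretty=False)

# Filters load the test functions and read the new __tasks__/__tags__ attributes.
clustering_ids = list_tests(task="clustering", tags=["sklearn"], pretty=False)

# load_test() attaches test_id, inputs, and params to the returned function.
test_func = load_test("validmind.data_validation.ClassImbalance")
print(test_func.inputs)  # e.g. {"dataset": {"type": VMDataset}}
print(test_func.params)  # e.g. {"min_percent_threshold": {"type": "int", "default": 10}}

# describe_test() renders the same metadata as an HTML accordion.
describe_test("validmind.data_validation.ClassImbalance")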
validmind/tests/model_validation/BertScore.py CHANGED
@@ -7,11 +7,16 @@ import pandas as pd
 import plotly.graph_objects as go
 
 from validmind import tags, tasks
+from validmind.tests.utils import validate_prediction
 
 
 @tags("nlp", "text_data", "visualization")
 @tasks("text_classification", "text_summarization")
-def BertScore(dataset, model):
+def BertScore(
+    dataset,
+    model,
+    evaluation_model="distilbert-base-uncased",
+):
     """
     Assesses the quality of machine-generated text using BERTScore metrics and visualizes results through histograms
     and bar charts, alongside compiling a comprehensive table of descriptive statistics.
@@ -29,7 +34,10 @@
     BERTScore metrics and compiles them into a dataframe. Histograms and bar charts are generated for each BERTScore
     metric (Precision, Recall, and F1 Score) to visualize their distribution. Additionally, a table of descriptive
     statistics (mean, median, standard deviation, minimum, and maximum) is compiled for each metric, providing a
-    comprehensive summary of the model's performance.
+    comprehensive summary of the model's performance. The test uses the `evaluation_model` param to specify the
+    huggingface model to use for evaluation. `microsoft/deberta-xlarge-mnli` is the best-performing model but is
+    very large and may be slow without a GPU. `microsoft/deberta-large-mnli` is a smaller model that is faster to
+    run and `distilbert-base-uncased` is much lighter and can run on a CPU but is less accurate.
 
     ### Signs of High Risk
 
@@ -61,11 +69,8 @@
     y_true = dataset.y
     y_pred = dataset.y_pred(model)
 
-    # Ensure y_true and y_pred have the same length
-    if len(y_true) != len(y_pred):
-        min_length = min(len(y_true), len(y_pred))
-        y_true = y_true[:min_length]
-        y_pred = y_pred[:min_length]
+    # Ensure equal lengths and get truncated data if necessary
+    y_true, y_pred = validate_prediction(y_true, y_pred)
 
     # Load the BERT evaluation metric
     bert = evaluate.load("bertscore")
@@ -75,6 +80,7 @@
         predictions=y_pred,
         references=y_true,
         lang="en",
+        model_type=evaluation_model,
     )
 
     # Convert scores to a dataframe
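From the caller's side, the new evaluation_model parameter flows through run_test params like any other test parameter. A hedged sketch (vm_test_ds and vm_model are placeholders for an already-initialized VMDataset with assigned predictions and its VMModel):

import validmind as vm

result = vm.tests.run_test(
    "validmind.model_validation.BertScore",
    inputs={"dataset": vm_test_ds, "model": vm_model},  # placeholders
    # trade-off noted in the docstring: deberta-large-mnli is slower than the
    # distilbert default but more accurate, and lighter than deberta-xlarge-mnli
    params={"evaluation_model": "microsoft/deberta-large-mnli"},
)
result.log()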
validmind/tests/model_validation/BleuScore.py CHANGED
@@ -7,6 +7,7 @@ import pandas as pd
 import plotly.graph_objects as go
 
 from validmind import tags, tasks
+from validmind.tests.utils import validate_prediction
 
 
 @tags("nlp", "text_data", "visualization")
@@ -61,6 +62,9 @@ def BleuScore(dataset, model):
     y_true = dataset.y
     y_pred = dataset.y_pred(model)
 
+    # Ensure equal lengths and get truncated data if necessary
+    y_true, y_pred = validate_prediction(y_true, y_pred)
+
     # Load the BLEU evaluation metric
     bleu = evaluate.load("bleu")
 
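The shared helper lives in validmind/tests/utils.py (+91 lines, item 148 in the file list), which this diff does not show. From the call sites, its behavior appears to mirror the inline truncation logic removed from BertScore; a sketch under that assumption, not the actual implementation (which may also warn or raise):

def validate_prediction(y_true, y_pred):
    """Truncate y_true and y_pred to a common length so paired scores line up.

    Sketch inferred from the call sites above, not the real validmind helper.
    """
    if len(y_true) != len(y_pred):
        min_length = min(len(y_true), len(y_pred))
        y_true = y_true[:min_length]
        y_pred = y_pred[:min_length]

    return y_true, y_pred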
validmind/tests/model_validation/ClusterSizeDistribution.py CHANGED
@@ -2,16 +2,16 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import pandas as pd
 import plotly.graph_objects as go
 
-from validmind.vm_models import Figure, Metric
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
 
 
-@dataclass
-class ClusterSizeDistribution(Metric):
+@tags("sklearn", "model_performance")
+@tasks("clustering")
+def ClusterSizeDistribution(dataset: VMDataset, model: VMModel):
     """
     Assesses the performance of clustering models by comparing the distribution of cluster sizes in model predictions
     with the actual data.
@@ -52,47 +52,24 @@ class ClusterSizeDistribution(Metric):
     - May not fully capture other important aspects of clustering, such as cluster density, distances between clusters,
     and the shape of clusters.
     """
-
-    name = "cluster_size_distribution"
-    required_inputs = ["model", "dataset"]
-    tasks = ["clustering"]
-    tags = [
-        "sklearn",
-        "model_performance",
-    ]
-
-    def run(self):
-        y_true_train = self.inputs.dataset.y
-        y_pred_train = self.inputs.dataset.y_pred(self.inputs.model)
-        y_true_train = y_true_train.astype(y_pred_train.dtype)
-        df = pd.DataFrame(
-            {"Actual": y_true_train.ravel(), "Prediction": y_pred_train.ravel()}
-        )
-        df_counts = df.apply(pd.value_counts)
-
-        fig = go.Figure(
-            data=[
-                go.Bar(name="Actual", x=df_counts.index, y=df_counts["Actual"].values),
-                go.Bar(
-                    name="Prediction",
-                    x=df_counts.index,
-                    y=df_counts["Prediction"].values,
-                ),
-            ]
-        )
-        # Change the bar mode
-        fig.update_xaxes(title_text="Number of clusters", showgrid=False)
-        fig.update_yaxes(title_text="Counts", showgrid=False)
-        fig.update_layout(
-            title_text="Cluster distribution", title_x=0.5, barmode="group"
-        )
-
-        figures = [
-            Figure(
-                for_object=self,
-                key=self.key,
-                figure=fig,
-            )
+    y_pred = dataset.y_pred(model)
+    y_true = dataset.y.astype(y_pred.dtype)
+
+    df = pd.DataFrame({"Actual": y_true.ravel(), "Prediction": y_pred.ravel()})
+    df_counts = df.apply(pd.value_counts)
+
+    fig = go.Figure(
+        data=[
+            go.Bar(name="Actual", x=df_counts.index, y=df_counts["Actual"].values),
+            go.Bar(
+                name="Prediction",
+                x=df_counts.index,
+                y=df_counts["Prediction"].values,
+            ),
         ]
+    )
+    fig.update_xaxes(title_text="Number of clusters", showgrid=False)
+    fig.update_yaxes(title_text="Counts", showgrid=False)
+    fig.update_layout(title_text="Cluster distribution", title_x=0.5, barmode="group")
 
-        return self.cache_results(figures=figures)
+    return fig
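ClusterSizeDistribution illustrates the class-to-function migration applied across most of the 198 files above: the Metric subclass with run() and cache_results() becomes a decorated function that declares its inputs via type annotations and returns figures or tables directly. A custom test in the new style would look roughly like this (the test name, tags, and column logic are illustrative, not part of the library):

import plotly.graph_objects as go

from validmind import tags, tasks
from validmind.vm_models import VMDataset


@tags("tabular_data", "visualization")
@tasks("classification")
def TargetBalance(dataset: VMDataset):
    """Plots the class balance of the dataset's target column."""
    counts = dataset.df[dataset.target_column].value_counts()
    return go.Figure(data=[go.Bar(x=counts.index.astype(str), y=counts.values)])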
validmind/tests/model_validation/ContextualRecall.py CHANGED
@@ -7,6 +7,7 @@ import pandas as pd
 import plotly.graph_objects as go
 
 from validmind import tags, tasks
+from validmind.tests.utils import validate_prediction
 
 
 @tags("nlp", "text_data", "visualization")
@@ -64,6 +65,8 @@ def ContextualRecall(dataset, model):
     y_true = dataset.y
     y_pred = dataset.y_pred(model)
 
+    validate_prediction(y_true, y_pred)
+
     score_list = []
     for y_t, y_p in zip(y_true, y_pred):
         # Tokenize the reference and candidate texts