validmind 2.5.25__py3-none-any.whl → 2.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.7.dist-info/METADATA +137 -0
- {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.25.dist-info/METADATA +0 -118
- {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
validmind/tests/load.py
CHANGED
```diff
@@ -4,12 +4,10 @@
 
 """Module for listing and loading tests."""
 
-import importlib
 import inspect
 import json
-import sys
-from pathlib import Path
 from pprint import pformat
+from typing import List
 from uuid import uuid4
 
 import pandas as pd
@@ -18,39 +16,148 @@ from ipywidgets import HTML, Accordion
 from ..errors import LoadTestError, MissingDependencyError
 from ..html_templates.content_blocks import test_content_block_html
 from ..logging import get_logger
-from ..unit_metrics.composite import load_composite_metric
-from ..utils import (
-    NumpyEncoder,
-    display,
-    format_dataframe,
-    fuzzy_match,
-    md_to_html,
-    test_id_to_name,
-)
+from ..utils import display, format_dataframe, fuzzy_match, md_to_html, test_id_to_name
+from ..vm_models import VMDataset, VMModel
 from .__types__ import TestID
 from ._store import test_provider_store, test_store
-from .decorator import test as test_decorator
-from .utils import test_description
 
 logger = get_logger(__name__)
 
 
-
-
+INPUT_TYPE_MAP = {
+    "dataset": VMDataset,
+    "datasets": List[VMDataset],
+    "model": VMModel,
+    "models": List[VMModel],
+}
 
-for d in directories:
-    for path in Path(__file__).parent.joinpath(d).glob("**/**/*.py"):
-        if path.name.startswith("__") or not path.name[0].isupper():
-            continue  # skip __init__.py and other special files as well as non Test files
-        test_id = (
-            f"validmind.{d}.{path.parent.stem}.{path.stem}"
-            if path.parent.parent.stem == d
-            else f"validmind.{d}.{path.stem}"
-        )
-        test_store.register_test(test_id)
 
+def _inspect_signature(test_func: callable):
+    inputs = {}
+    params = {}
 
-
+    for name, arg in inspect.signature(test_func).parameters.items():
+        if name in INPUT_TYPE_MAP:
+            inputs[name] = {"type": INPUT_TYPE_MAP[name]}
+        elif name == "args" or name == "kwargs":
+            continue
+        else:
+            params[name] = {
+                "type": (
+                    arg.annotation.__name__
+                    if arg.annotation and hasattr(arg.annotation, "__name__")
+                    else None
+                ),
+                "default": (
+                    arg.default if arg.default is not inspect.Parameter.empty else None
+                ),
+            }
+
+    return inputs, params
+
+
+def load_test(test_id: str, test_func: callable = None, reload: bool = False):
+    """Load a test by test ID
+
+    Test IDs are in the format `namespace.path_to_module.TestClassOrFuncName[:tag]`.
+    The tag is optional and is used to distinguish between multiple results from the
+    same test.
+
+    Args:
+        test_id (str): The test ID in the format `namespace.path_to_module.TestName[:tag]`
+        test_func (callable, optional): The test function to load. If not provided, the
+            test will be loaded from the test provider. Defaults to None.
+    """
+    # remove tag if present
+    test_id = test_id.split(":", 1)[0]
+    namespace = test_id.split(".", 1)[0]
+
+    # if not already loaded, load it from appropriate provider
+    if test_id not in test_store.tests or reload:
+        if test_id.startswith("validmind.composite_metric"):
+            # TODO: add composite metric loading
+            pass
+
+        if not test_func:
+            if not test_provider_store.has_test_provider(namespace):
+                raise LoadTestError(
+                    f"No test provider found for namespace: {namespace}"
+                )
+
+            provider = test_provider_store.get_test_provider(namespace)
+
+            try:
+                test_func = provider.load_test(test_id.split(".", 1)[1])
+            except Exception as e:
+                raise LoadTestError(
+                    f"Unable to load test '{test_id}' from {namespace} test provider",
+                    original_error=e,
+                ) from e
+
+        # add test_id as an attribute to the test function
+        test_func.test_id = test_id
+
+        # fallback to using func name if no docstring is found
+        if not inspect.getdoc(test_func):
+            test_func.__doc__ = f"{test_func.__name__} ({test_id})"
+
+        # add inputs and params as attributes to the test function
+        test_func.inputs, test_func.params = _inspect_signature(test_func)
+
+        test_store.register_test(test_id, test_func)
+
+    return test_store.get_test(test_id)
+
+
+def _list_test_ids():
+    test_ids = []
+
+    for namespace, test_provider in test_provider_store.test_providers.items():
+        test_ids.extend(
+            [f"{namespace}.{test_id}" for test_id in sorted(test_provider.list_tests())]
+        )
+
+    return test_ids
+
+
+def _load_tests(test_ids):
+    """Load a set of tests, handling missing dependencies."""
+    tests = {}
+
+    for test_id in test_ids:
+        try:
+            tests[test_id] = load_test(test_id)
+        except LoadTestError as e:
+            if not e.original_error or not isinstance(
+                e.original_error, MissingDependencyError
+            ):
+                raise e
+
+            e = e.original_error
+
+            logger.debug(str(e))
+
+            if e.extra:
+                logger.info(
+                    f"Skipping `{test_id}` as it requires extra dependencies: {e.required_dependencies}."
+                    f" Please run `pip install validmind[{e.extra}]` to view and run this test."
+                )
+            else:
+                logger.info(
+                    f"Skipping `{test_id}` as it requires missing dependencies: {e.required_dependencies}."
+                    " Please install the missing dependencies to view and run this test."
+                )
+
+    return tests
+
+
+def _test_description(test_description: str, num_lines: int = 5):
+    description = test_description.strip("\n").strip()
+
+    if len(description.split("\n")) > num_lines:
+        return description.strip().split("\n")[0] + "..."
+
+    return description
 
 
 def _pretty_list_tests(tests, truncate=True):
@@ -58,9 +165,12 @@ def _pretty_list_tests(tests, truncate=True):
         {
             "ID": test_id,
             "Name": test_id_to_name(test_id),
-            "Description":
-
-
+            "Description": _test_description(
+                inspect.getdoc(test),
+                num_lines=(5 if truncate else 999999),
+            ),
+            "Required Inputs": test.inputs,
+            "Params": test.params,
         }
         for test_id, test in tests.items()
     ]
@@ -68,9 +178,57 @@ def _pretty_list_tests(tests, truncate=True):
     return format_dataframe(pd.DataFrame(table))
 
 
-def list_tests(
-
-
+def list_tags():
+    """
+    List unique tags from all test classes.
+    """
+
+    unique_tags = set()
+
+    for test in _load_tests(list_tests(pretty=False)):
+        unique_tags.update(test.__tags__)
+
+    return list(unique_tags)
+
+
+def list_tasks_and_tags():
+    """
+    List all task types and their associated tags, with one row per task type and
+    all tags for a task type in one row.
+
+    Returns:
+        pandas.DataFrame: A DataFrame with 'Task Type' and concatenated 'Tags'.
+    """
+    task_tags_dict = {}
+
+    for test in _load_tests(list_tests(pretty=False)):
+        for task in test.__tasks__:
+            task_tags_dict.setdefault(task, set()).update(test.__tags__)
+
+    return format_dataframe(
+        pd.DataFrame(
+            [
+                {"Task": task, "Tags": ", ".join(tags)}
+                for task, tags in task_tags_dict.items()
+            ]
+        )
+    )
+
+
+def list_tasks():
+    """
+    List unique tasks from all test classes.
+    """
+
+    unique_tasks = set()
+
+    for test in _load_tests(list_tests(pretty=False)):
+        unique_tasks.update(test.__tasks__)
+
+    return list(unique_tasks)
+
+
+def list_tests(filter=None, task=None, tags=None, pretty=True, truncate=True):
     """List all tests in the tests directory.
 
     Args:
@@ -88,30 +246,13 @@ def list_tests(
     Returns:
         list or pandas.DataFrame: A list of all tests or a formatted table.
     """
-
-    #     test_id: load_test(test_id, reload=True)
-    #     for test_id in test_store.get_test_ids()
-    # }
-    tests = {}
-    for test_id in test_store.get_test_ids():
-        try:
-            tests[test_id] = load_test(test_id, reload=True)
-        except MissingDependencyError as e:
-            # skip tests that have missing dependencies
-            logger.debug(str(e))
+    test_ids = _list_test_ids()
 
-
-
-
-                    f" Please run `pip install validmind[{e.extra}]` to view and run this test."
-                )
-            else:
-                logger.info(
-                    f"Skipping `{test_id}` as it requires missing dependencies: {e.required_dependencies}."
-                    " Please install the missing dependencies to view and run this test."
-                )
+    # no need to load test funcs (takes a while) if we're just returning the test ids
+    if not filter and not task and not tags and not pretty:
+        return test_ids
 
-
+    tests = _load_tests(test_ids)
 
     # first search by the filter string since it's the most general search
    if filter is not None:
@@ -119,114 +260,29 @@ def list_tests(
             test_id: test
             for test_id, test in tests.items()
             if filter.lower() in test_id.lower()
-            or any(filter.lower() in task.lower() for task in test.
-            or any(fuzzy_match(tag, filter.lower()) for tag in test.
+            or any(filter.lower() in task.lower() for task in test.__tasks__)
+            or any(fuzzy_match(tag, filter.lower()) for tag in test.__tags__)
         }
 
     # then filter by task type and tags since they are more specific
     if task is not None:
-        tests = {
+        tests = {
+            test_id: test for test_id, test in tests.items() if task in test.__tasks__
+        }
 
     if tags is not None:
         tests = {
             test_id: test
             for test_id, test in tests.items()
-            if all(tag in test.
+            if all(tag in test.__tags__ for tag in tags)
         }
 
-    if __as_class:
-        return list(tests.values())
-
     if not pretty:
-        # only return test ids
         return list(tests.keys())
 
     return _pretty_list_tests(tests, truncate=truncate)
 
 
-def _load_validmind_test(test_id, reload=False):
-    parts = test_id.split(":")[0].split(".")
-
-    test_module = ".".join(parts[1:-1])
-    test_class = parts[-1]
-
-    error = None
-    test = None
-
-    try:
-        full_path = f"validmind.tests.{test_module}.{test_class}"
-
-        if reload and full_path in sys.modules:
-            module = importlib.reload(sys.modules[full_path])
-        else:
-            module = importlib.import_module(full_path)
-
-        test = getattr(module, test_class)
-    except ModuleNotFoundError as e:
-        error = f"Unable to load test {test_id}. {e}"
-    except AttributeError:
-        error = f"Unable to load test {test_id}. Test not in module: {test_class}"
-
-    return error, test
-
-
-def load_test(test_id: str, reload=False):
-    """Load a test by test ID
-
-    Test IDs are in the format `namespace.path_to_module.TestClassOrFuncName[:result_id]`.
-    The result ID is optional and is used to distinguish between multiple results from the
-    running the same test.
-
-    Args:
-        test_id (str): The test ID in the format `namespace.path_to_module.TestName[:result_id]`
-        reload (bool, optional): Whether to reload the test module. Defaults to False.
-    """
-    # TODO: we should use a dedicated class for test IDs to handle this consistently
-    test_id, result_id = test_id.split(":", 1) if ":" in test_id else (test_id, None)
-
-    error = None
-    namespace = test_id.split(".", 1)[0]
-
-    # TODO: lets implement an extensible loading system instead of this ugly if/else
-    if test_store.get_custom_test(test_id):
-        test = test_store.get_custom_test(test_id)
-
-    elif test_id.startswith("validmind.composite_metric"):
-        error, test = load_composite_metric(test_id)
-
-    elif namespace == "validmind":
-        error, test = _load_validmind_test(test_id, reload=reload)
-
-    elif test_provider_store.has_test_provider(namespace):
-        provider = test_provider_store.get_test_provider(namespace)
-
-        try:
-            test = provider.load_test(test_id.split(".", 1)[1])
-        except Exception as e:
-            error = (
-                f"Unable to load test {test_id} from test provider: "
-                f"{provider}\n Got Exception: {e}"
-            )
-
-    else:
-        error = f"Unable to load test {test_id}. No test provider found."
-
-    if error:
-        logger.error(error)
-        raise LoadTestError(error)
-
-    if inspect.isfunction(test):
-        # if its a function, we decorate it and then load the class
-        # TODO: simplify this as we move towards all functional metrics
-        # "_" is used here so it doesn't conflict with other test ids
-        test_decorator("_")(test)
-        test = test_store.get_custom_test("_")
-
-    test.test_id = f"{test_id}:{result_id}" if result_id else test_id
-
-    return test
-
-
 def describe_test(test_id: TestID = None, raw: bool = False, show: bool = True):
     """Get or show details about the test
 
@@ -239,13 +295,13 @@ def describe_test(test_id: TestID = None, raw: bool = False, show: bool = True):
         raw (bool, optional): If True, returns a dictionary with the test details.
             Defaults to False.
     """
-    test = load_test(test_id
+    test = load_test(test_id)
 
     details = {
         "ID": test_id,
         "Name": test_id_to_name(test_id),
-        "Required Inputs": test.
-        "Params": test.
+        "Required Inputs": test.inputs or [],
+        "Params": test.params or {},
         "Description": inspect.getdoc(test).strip() or "",
     }
 
@@ -260,8 +316,8 @@ def describe_test(test_id: TestID = None, raw: bool = False, show: bool = True):
         required_inputs=", ".join(details["Required Inputs"] or ["None"]),
         params_table="\n".join(
             [
-                f"<tr><td>{param}</td><td>{pformat(
-                for param,
+                f"<tr><td>{param}</td><td>{pformat(param_spec['default'], indent=4)}</td></tr>"
+                for param, param_spec in details["Params"].items()
             ]
         ),
         table_display="table" if details["Params"] else "none",
@@ -269,7 +325,10 @@ def describe_test(test_id: TestID = None, raw: bool = False, show: bool = True):
             {name: f"my_vm_{name}" for name in (details["Required Inputs"] or [])},
             indent=4,
         ),
-        example_params=json.dumps(
+        example_params=json.dumps(
+            {param: f"my_vm_{param}" for param in (details["Params"] or {}).keys()},
+            indent=4,
+        ),
         instructions_display="block" if show else "none",
     )
 
@@ -279,6 +338,6 @@ def describe_test(test_id: TestID = None, raw: bool = False, show: bool = True):
     display(
         Accordion(
             children=[HTML(html)],
-            titles=[f"Test
+            titles=[f"Test: {details['Name']} ('{test_id}')"],
         )
     )
```
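The net effect: the old import-based loader (`_load_validmind_test` plus the module-scanning loop) is gone, and every test, built-in or custom, now resolves through a registered test provider. A loaded test is a plain function whose `inputs` and `params` are derived from its signature by `_inspect_signature`, and whose `__tasks__`/`__tags__` come from the `@tasks`/`@tags` decorators. A rough usage sketch follows; the top-level re-exports and the example test's attributes are assumptions based on the package's usual conventions, not taken from this diff:

```python
# Hedged sketch of the reworked loader API in validmind 2.6.x.
# Assumes list_tests/describe_test are re-exported from validmind.tests,
# as in prior releases; the example test ID's params are illustrative.
from validmind.tests import describe_test, list_tests
from validmind.tests.load import load_test

# Fast path: with no filters and pretty=False, list_tests() returns the raw
# test IDs without importing any test modules.
all_ids = list_tests(pretty=False)

# Filtering by task/tags loads the test functions so __tasks__/__tags__ can
# be inspected (tests with missing optional dependencies are skipped).
clf_ids = list_tests(task="classification", pretty=False)

# load_test() returns the test function itself, annotated by the loader.
test_func = load_test("validmind.data_validation.ClassImbalance")
print(test_func.test_id)  # "validmind.data_validation.ClassImbalance"
print(test_func.inputs)   # e.g. {"dataset": {"type": VMDataset}}
print(test_func.params)   # e.g. {"min_percent_threshold": {"type": "int", "default": 10}}

describe_test("validmind.data_validation.ClassImbalance")
```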
validmind/tests/model_validation/BertScore.py
CHANGED

```diff
@@ -7,11 +7,16 @@ import pandas as pd
 import plotly.graph_objects as go
 
 from validmind import tags, tasks
+from validmind.tests.utils import validate_prediction
 
 
 @tags("nlp", "text_data", "visualization")
 @tasks("text_classification", "text_summarization")
-def BertScore(dataset, model):
+def BertScore(
+    dataset,
+    model,
+    evaluation_model="distilbert-base-uncased",
+):
     """
     Assesses the quality of machine-generated text using BERTScore metrics and visualizes results through histograms
     and bar charts, alongside compiling a comprehensive table of descriptive statistics.
@@ -29,7 +34,10 @@ def BertScore(dataset, model):
     BERTScore metrics and compiles them into a dataframe. Histograms and bar charts are generated for each BERTScore
     metric (Precision, Recall, and F1 Score) to visualize their distribution. Additionally, a table of descriptive
     statistics (mean, median, standard deviation, minimum, and maximum) is compiled for each metric, providing a
-    comprehensive summary of the model's performance.
+    comprehensive summary of the model's performance. The test uses the `evaluation_model` param to specify the
+    huggingface model to use for evaluation. `microsoft/deberta-xlarge-mnli` is the best-performing model but is
+    very large and may be slow without a GPU. `microsoft/deberta-large-mnli` is a smaller model that is faster to
+    run and `distilbert-base-uncased` is much lighter and can run on a CPU but is less accurate.
 
     ### Signs of High Risk
 
@@ -61,11 +69,8 @@ def BertScore(dataset, model):
     y_true = dataset.y
     y_pred = dataset.y_pred(model)
 
-    # Ensure
-
-    min_length = min(len(y_true), len(y_pred))
-    y_true = y_true[:min_length]
-    y_pred = y_pred[:min_length]
+    # Ensure equal lengths and get truncated data if necessary
+    y_true, y_pred = validate_prediction(y_true, y_pred)
 
     # Load the BERT evaluation metric
     bert = evaluate.load("bertscore")
@@ -75,6 +80,7 @@ def BertScore(dataset, model):
         predictions=y_pred,
         references=y_true,
         lang="en",
+        model_type=evaluation_model,
     )
 
     # Convert scores to a dataframe
```
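`BertScore` thus gains its first tunable parameter. A hypothetical invocation through the harness (`run_test` is the standard entry point; `vm_test_ds` and `vm_model` are placeholders for objects created with `init_dataset`/`init_model`, not shown in this diff):

```python
# Hypothetical run of the updated test; the inputs stand in for previously
# initialized VMDataset/VMModel objects.
from validmind.tests import run_test

result = run_test(
    "validmind.model_validation.BertScore",
    inputs={"dataset": vm_test_ds, "model": vm_model},
    # New in 2.6.x: choose the HuggingFace scorer. The default
    # "distilbert-base-uncased" is CPU-friendly; the deberta variants are
    # more accurate but much heavier.
    params={"evaluation_model": "microsoft/deberta-large-mnli"},
)
result.log()  # optionally push the result to the ValidMind platform
```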
validmind/tests/model_validation/BleuScore.py
CHANGED

```diff
@@ -7,6 +7,7 @@ import pandas as pd
 import plotly.graph_objects as go
 
 from validmind import tags, tasks
+from validmind.tests.utils import validate_prediction
 
 
 @tags("nlp", "text_data", "visualization")
@@ -61,6 +62,9 @@ def BleuScore(dataset, model):
     y_true = dataset.y
     y_pred = dataset.y_pred(model)
 
+    # Ensure equal lengths and get truncated data if necessary
+    y_true, y_pred = validate_prediction(y_true, y_pred)
+
     # Load the BLEU evaluation metric
     bleu = evaluate.load("bleu")
 
```
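The shared `validate_prediction` helper lives in the new `validmind/tests/utils.py` (+91 lines, not shown in this diff). Judging only from the inline code it replaces in `BertScore`, its behavior is presumably along these lines; this is a sketch, not the shipped implementation:

```python
# Sketch only: the real validate_prediction is in validmind/tests/utils.py
# and may differ (e.g. it may raise instead of warn).
import warnings


def validate_prediction(y_true, y_pred):
    """Ensure targets and predictions have equal length, truncating if needed."""
    if len(y_true) != len(y_pred):
        warnings.warn(
            "y_true and y_pred have different lengths; truncating to the shorter"
        )
        min_length = min(len(y_true), len(y_pred))
        y_true, y_pred = y_true[:min_length], y_pred[:min_length]

    return y_true, y_pred
```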
validmind/tests/model_validation/ClusterSizeDistribution.py
CHANGED

```diff
@@ -2,16 +2,16 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import pandas as pd
 import plotly.graph_objects as go
 
-from validmind.vm_models import Figure, Metric
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
 
 
-@dataclass
-class ClusterSizeDistribution(Metric):
+@tags("sklearn", "model_performance")
+@tasks("clustering")
+def ClusterSizeDistribution(dataset: VMDataset, model: VMModel):
     """
     Assesses the performance of clustering models by comparing the distribution of cluster sizes in model predictions
     with the actual data.
@@ -52,47 +52,24 @@ class ClusterSizeDistribution(Metric):
     - May not fully capture other important aspects of clustering, such as cluster density, distances between clusters,
     and the shape of clusters.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            {"Actual": y_true_train.ravel(), "Prediction": y_pred_train.ravel()}
-        )
-        df_counts = df.apply(pd.value_counts)
-
-        fig = go.Figure(
-            data=[
-                go.Bar(name="Actual", x=df_counts.index, y=df_counts["Actual"].values),
-                go.Bar(
-                    name="Prediction",
-                    x=df_counts.index,
-                    y=df_counts["Prediction"].values,
-                ),
-            ]
-        )
-        # Change the bar mode
-        fig.update_xaxes(title_text="Number of clusters", showgrid=False)
-        fig.update_yaxes(title_text="Counts", showgrid=False)
-        fig.update_layout(
-            title_text="Cluster distribution", title_x=0.5, barmode="group"
-        )
-
-        figures = [
-            Figure(
-                for_object=self,
-                key=self.key,
-                figure=fig,
-            )
+    y_pred = dataset.y_pred(model)
+    y_true = dataset.y.astype(y_pred.dtype)
+
+    df = pd.DataFrame({"Actual": y_true.ravel(), "Prediction": y_pred.ravel()})
+    df_counts = df.apply(pd.value_counts)
+
+    fig = go.Figure(
+        data=[
+            go.Bar(name="Actual", x=df_counts.index, y=df_counts["Actual"].values),
+            go.Bar(
+                name="Prediction",
+                x=df_counts.index,
+                y=df_counts["Prediction"].values,
+            ),
         ]
+    )
+    fig.update_xaxes(title_text="Number of clusters", showgrid=False)
+    fig.update_yaxes(title_text="Counts", showgrid=False)
+    fig.update_layout(title_text="Cluster distribution", title_x=0.5, barmode="group")
 
-
+    return fig
```
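This file is the clearest example of the 2.6 migration from class-based `Metric` subclasses to plain decorated functions: inputs are declared through the signature (resolved against `INPUT_TYPE_MAP` in `load.py`) and outputs are returned directly instead of going through `Figure`/`self.cache_results(...)`. The skeleton of the new style, with an illustrative test name:

```python
# Skeleton of a functional test in the 2.6.x style; MyClusteringTest and the
# histogram it draws are placeholders, not part of the package.
import plotly.graph_objects as go

from validmind import tags, tasks
from validmind.vm_models import VMDataset, VMModel


@tags("sklearn", "model_performance")
@tasks("clustering")
def MyClusteringTest(dataset: VMDataset, model: VMModel):
    """The docstring doubles as the description shown by list_tests()/describe_test()."""
    # Inputs arrive through the standard dataset/model interface...
    y_pred = dataset.y_pred(model)

    # ...and outputs (figures, tables, scalars) are simply returned; the
    # framework wraps them, replacing the old self.cache_results(...) calls.
    return go.Figure(data=[go.Histogram(x=y_pred)])
```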
validmind/tests/model_validation/ContextualRecall.py
CHANGED

```diff
@@ -7,6 +7,7 @@ import pandas as pd
 import plotly.graph_objects as go
 
 from validmind import tags, tasks
+from validmind.tests.utils import validate_prediction
 
 
 @tags("nlp", "text_data", "visualization")
@@ -64,6 +65,8 @@ def ContextualRecall(dataset, model):
     y_true = dataset.y
     y_pred = dataset.y_pred(model)
 
+    validate_prediction(y_true, y_pred)
+
     score_list = []
     for y_t, y_p in zip(y_true, y_pred):
         # Tokenize the reference and candidate texts
```
|