validmind 2.8.10__py3-none-any.whl → 2.8.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +6 -5
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +17 -11
- validmind/ai/utils.py +2 -2
- validmind/api_client.py +75 -32
- validmind/client.py +108 -100
- validmind/client_config.py +3 -3
- validmind/datasets/classification/__init__.py +7 -3
- validmind/datasets/credit_risk/lending_club.py +28 -16
- validmind/datasets/nlp/cnn_dailymail.py +10 -4
- validmind/datasets/regression/__init__.py +22 -5
- validmind/errors.py +17 -7
- validmind/input_registry.py +1 -1
- validmind/logging.py +44 -35
- validmind/models/foundation.py +2 -2
- validmind/models/function.py +10 -3
- validmind/template.py +30 -22
- validmind/test_suites/__init__.py +2 -2
- validmind/tests/_store.py +13 -4
- validmind/tests/comparison.py +65 -33
- validmind/tests/data_validation/ACFandPACFPlot.py +4 -1
- validmind/tests/data_validation/AutoMA.py +1 -1
- validmind/tests/data_validation/BivariateScatterPlots.py +5 -1
- validmind/tests/data_validation/BoxPierce.py +3 -1
- validmind/tests/data_validation/ClassImbalance.py +4 -2
- validmind/tests/data_validation/DatasetDescription.py +3 -24
- validmind/tests/data_validation/DescriptiveStatistics.py +1 -1
- validmind/tests/data_validation/DickeyFullerGLS.py +1 -1
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +1 -1
- validmind/tests/data_validation/HighCardinality.py +5 -1
- validmind/tests/data_validation/HighPearsonCorrelation.py +1 -1
- validmind/tests/data_validation/IQROutliersBarPlot.py +5 -3
- validmind/tests/data_validation/IQROutliersTable.py +5 -2
- validmind/tests/data_validation/IsolationForestOutliers.py +5 -4
- validmind/tests/data_validation/JarqueBera.py +2 -2
- validmind/tests/data_validation/LJungBox.py +2 -2
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
- validmind/tests/data_validation/MissingValues.py +14 -10
- validmind/tests/data_validation/MissingValuesBarPlot.py +3 -1
- validmind/tests/data_validation/MutualInformation.py +2 -1
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +1 -1
- validmind/tests/data_validation/ProtectedClassesCombination.py +2 -0
- validmind/tests/data_validation/ProtectedClassesDescription.py +2 -2
- validmind/tests/data_validation/ProtectedClassesDisparity.py +9 -5
- validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +10 -2
- validmind/tests/data_validation/RollingStatsPlot.py +2 -1
- validmind/tests/data_validation/ScoreBandDefaultRates.py +4 -2
- validmind/tests/data_validation/SeasonalDecompose.py +1 -1
- validmind/tests/data_validation/ShapiroWilk.py +2 -2
- validmind/tests/data_validation/Skewness.py +7 -6
- validmind/tests/data_validation/SpreadPlot.py +1 -1
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +1 -1
- validmind/tests/data_validation/TabularDateTimeHistograms.py +1 -1
- validmind/tests/data_validation/TargetRateBarPlots.py +4 -1
- validmind/tests/data_validation/TimeSeriesFrequency.py +1 -1
- validmind/tests/data_validation/TimeSeriesOutliers.py +7 -2
- validmind/tests/data_validation/WOEBinPlots.py +1 -1
- validmind/tests/data_validation/WOEBinTable.py +1 -1
- validmind/tests/data_validation/ZivotAndrewsArch.py +5 -2
- validmind/tests/data_validation/nlp/CommonWords.py +1 -1
- validmind/tests/data_validation/nlp/Hashtags.py +1 -1
- validmind/tests/data_validation/nlp/LanguageDetection.py +1 -1
- validmind/tests/data_validation/nlp/Mentions.py +1 -1
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +5 -1
- validmind/tests/data_validation/nlp/Punctuations.py +1 -1
- validmind/tests/data_validation/nlp/Sentiment.py +3 -1
- validmind/tests/data_validation/nlp/TextDescription.py +1 -1
- validmind/tests/data_validation/nlp/Toxicity.py +1 -1
- validmind/tests/decorator.py +14 -11
- validmind/tests/load.py +38 -24
- validmind/tests/model_validation/BertScore.py +7 -1
- validmind/tests/model_validation/BleuScore.py +7 -1
- validmind/tests/model_validation/ClusterSizeDistribution.py +3 -1
- validmind/tests/model_validation/ContextualRecall.py +9 -1
- validmind/tests/model_validation/FeaturesAUC.py +1 -1
- validmind/tests/model_validation/MeteorScore.py +7 -1
- validmind/tests/model_validation/ModelPredictionResiduals.py +5 -1
- validmind/tests/model_validation/RegardScore.py +6 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +10 -1
- validmind/tests/model_validation/RougeScore.py +3 -1
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +2 -0
- validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +10 -2
- validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +6 -2
- validmind/tests/model_validation/TokenDisparity.py +5 -1
- validmind/tests/model_validation/ToxicityScore.py +2 -0
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +1 -1
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +5 -1
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +5 -1
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +5 -1
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +2 -0
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +5 -1
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +6 -2
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +3 -1
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +5 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +5 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +5 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +5 -1
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +6 -1
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -3
- validmind/tests/model_validation/ragas/AspectCritic.py +4 -1
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +5 -3
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -3
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +5 -3
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -3
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -3
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +1 -1
- validmind/tests/model_validation/ragas/ResponseRelevancy.py +5 -3
- validmind/tests/model_validation/ragas/SemanticSimilarity.py +5 -3
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +9 -9
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +9 -9
- validmind/tests/model_validation/sklearn/CalibrationCurve.py +5 -2
- validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.py +28 -5
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +5 -1
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +24 -14
- validmind/tests/model_validation/sklearn/CompletenessScore.py +8 -9
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -3
- validmind/tests/model_validation/sklearn/FeatureImportance.py +6 -2
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -9
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +14 -9
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +4 -2
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +6 -1
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +12 -7
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +12 -7
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +21 -6
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +11 -3
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +5 -1
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -1
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +6 -1
- validmind/tests/model_validation/sklearn/ROCCurve.py +3 -1
- validmind/tests/model_validation/sklearn/RegressionErrors.py +6 -2
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +13 -8
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +8 -5
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +5 -1
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +34 -26
- validmind/tests/model_validation/sklearn/ScoreProbabilityAlignment.py +10 -2
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +5 -1
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -9
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +15 -10
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +5 -1
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +6 -1
- validmind/tests/model_validation/statsmodels/GINITable.py +8 -1
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +2 -2
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +6 -2
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +8 -2
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +3 -1
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +7 -2
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +2 -0
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +2 -0
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +4 -2
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +3 -1
- validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py +11 -1
- validmind/tests/ongoing_monitoring/ClassificationAccuracyDrift.py +10 -2
- validmind/tests/ongoing_monitoring/ConfusionMatrixDrift.py +8 -1
- validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py +18 -2
- validmind/tests/ongoing_monitoring/FeatureDrift.py +9 -2
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +8 -2
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +13 -2
- validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py +13 -2
- validmind/tests/ongoing_monitoring/ROCCurveDrift.py +16 -2
- validmind/tests/ongoing_monitoring/ScoreBandsDrift.py +11 -2
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +13 -2
- validmind/tests/output.py +66 -11
- validmind/tests/prompt_validation/Clarity.py +1 -1
- validmind/tests/prompt_validation/NegativeInstruction.py +1 -1
- validmind/tests/prompt_validation/Robustness.py +6 -1
- validmind/tests/prompt_validation/Specificity.py +1 -1
- validmind/tests/run.py +28 -14
- validmind/tests/test_providers.py +28 -35
- validmind/tests/utils.py +17 -4
- validmind/unit_metrics/__init__.py +1 -1
- validmind/utils.py +295 -31
- validmind/vm_models/dataset/dataset.py +19 -16
- validmind/vm_models/dataset/utils.py +5 -3
- validmind/vm_models/figure.py +6 -6
- validmind/vm_models/input.py +6 -5
- validmind/vm_models/model.py +5 -5
- validmind/vm_models/result/result.py +122 -43
- validmind/vm_models/result/utils.py +9 -28
- validmind/vm_models/test_suite/__init__.py +5 -0
- validmind/vm_models/test_suite/runner.py +5 -5
- validmind/vm_models/test_suite/summary.py +20 -2
- validmind/vm_models/test_suite/test.py +6 -6
- validmind/vm_models/test_suite/test_suite.py +10 -10
- {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/METADATA +4 -5
- {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/RECORD +189 -188
- {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/WHEEL +1 -1
- {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/LICENSE +0 -0
- {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/entry_points.txt +0 -0
validmind/tests/decorator.py
CHANGED
@@ -7,6 +7,7 @@
 import inspect
 import os
 from functools import wraps
+from typing import Any, Callable, List, Optional, TypeVar, Union
 
 from validmind.logging import get_logger
 
@@ -15,8 +16,10 @@ from .load import load_test
 
 logger = get_logger(__name__)
 
+F = TypeVar("F", bound=Callable[..., Any])
 
-def _get_save_func(func, test_id):
+
+def _get_save_func(func: Callable[..., Any], test_id: str) -> Callable[..., None]:
     """Helper function to save a decorated function to a file
 
     Useful when a custom test function has been created inline in a notebook or
@@ -29,7 +32,7 @@ def _get_save_func(func, test_id):
     # remove decorator line
     source = source.split("\n", 1)[1]
 
-    def save(root_folder=".", imports=None):
+    def save(root_folder: str = ".", imports: Optional[List[str]] = None) -> None:
         parts = test_id.split(".")
 
         if len(parts) > 1:
@@ -84,7 +87,7 @@ def _get_save_func(func, test_id):
     return save
 
 
-def test(func_or_id):
+def test(func_or_id: Union[Callable[..., Any], str, None]) -> Callable[[F], F]:
    """Decorator for creating and registering custom tests
 
    This decorator registers the function it wraps as a test function within ValidMind
@@ -109,14 +112,14 @@ def test(func_or_id):
    as the metric's description.
 
    Args:
-
-
+        func_or_id (Union[Callable[..., Any], str, None]): Either the function to decorate
+            or the test ID. If None, the function name is used.
 
    Returns:
-        The decorated function.
+        Callable[[F], F]: The decorated function.
    """
 
-    def decorator(func):
+    def decorator(func: F) -> F:
        test_id = func_or_id or f"validmind.custom_metrics.{func.__name__}"
        test_func = load_test(test_id, func, reload=True)
        test_store.register_test(test_id, test_func)
@@ -136,28 +139,28 @@ def test(func_or_id):
    return decorator
 
 
-def tasks(*tasks):
+def tasks(*tasks: str) -> Callable[[F], F]:
    """Decorator for specifying the task types that a test is designed for.
 
    Args:
        *tasks: The task types that the test is designed for.
    """
 
-    def decorator(func):
+    def decorator(func: F) -> F:
        func.__tasks__ = list(tasks)
        return func
 
    return decorator
 
 
-def tags(*tags):
+def tags(*tags: str) -> Callable[[F], F]:
    """Decorator for specifying tags for a test.
 
    Args:
        *tags: The tags to apply to the test.
    """
 
-    def decorator(func):
+    def decorator(func: F) -> F:
        func.__tags__ = list(tags)
        return func
 
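The typed `test`, `tasks`, and `tags` decorators above are the package's custom-test API (the test-file diffs further down import `tags` and `tasks` directly from `validmind`). A minimal usage sketch, assuming the conventional `vm` alias; the test ID and body are illustrative, not part of this diff:

import pandas as pd

import validmind as vm
from validmind import tags, tasks


# Passing an explicit ID; with a bare @vm.test, the ID defaults to
# validmind.custom_metrics.<function name>, per the decorator above.
@vm.test("my_custom_tests.MissingRatio")
@tags("tabular_data", "data_quality")
@tasks("classification")
def MissingRatio(dataset):
    """Fraction of missing values per column."""
    return pd.DataFrame(dataset.df.isna().mean(), columns=["missing_ratio"])

Because decorators apply bottom-up, `@tags` and `@tasks` set `__tags__` and `__tasks__` before `@vm.test` registers the function, which is why `@vm.test` sits on top.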
validmind/tests/load.py
CHANGED
@@ -7,7 +7,7 @@
 import inspect
 import json
 from pprint import pformat
-from typing import List
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 from uuid import uuid4
 
 import pandas as pd
@@ -32,7 +32,10 @@ INPUT_TYPE_MAP = {
 }
 
 
-def _inspect_signature(test_func: callable):
+def _inspect_signature(
+    test_func: Callable[..., Any],
+) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]]]:
+    """Inspect a test function's signature to get inputs and parameters"""
     inputs = {}
     params = {}
 
@@ -56,7 +59,9 @@ def _inspect_signature(test_func: callable):
     return inputs, params
 
 
-def load_test(test_id: str, test_func: callable = None, reload: bool = False):
+def load_test(
+    test_id: str, test_func: Optional[Callable[..., Any]] = None, reload: bool = False
+) -> Callable[..., Any]:
     """Load a test by test ID
 
     Test IDs are in the format `namespace.path_to_module.TestClassOrFuncName[:tag]`.
@@ -67,6 +72,8 @@ def load_test(test_id: str, test_func: callable = None, reload: bool = False):
         test_id (str): The test ID in the format `namespace.path_to_module.TestName[:tag]`
         test_func (callable, optional): The test function to load. If not provided, the
             test will be loaded from the test provider. Defaults to None.
+        reload (bool, optional): If True, reload the test even if it's already loaded.
+            Defaults to False.
     """
     # remove tag if present
     test_id = test_id.split(":", 1)[0]
@@ -109,7 +116,8 @@ def load_test(test_id: str, test_func: callable = None, reload: bool = False):
     return test_store.get_test(test_id)
 
 
-def _list_test_ids():
+def _list_test_ids() -> List[str]:
+    """List all available test IDs"""
     test_ids = []
 
     for namespace, test_provider in test_provider_store.test_providers.items():
@@ -120,7 +128,7 @@ def _list_test_ids():
     return test_ids
 
 
-def _load_tests(test_ids):
+def _load_tests(test_ids: List[str]) -> Dict[str, Callable[..., Any]]:
     """Load a set of tests, handling missing dependencies."""
     tests = {}
 
@@ -138,12 +146,12 @@ def _load_tests(test_ids):
             logger.debug(str(e))
 
             if e.extra:
-                logger.
+                logger.debug(
                     f"Skipping `{test_id}` as it requires extra dependencies: {e.required_dependencies}."
                     f" Please run `pip install validmind[{e.extra}]` to view and run this test."
                 )
             else:
-                logger.
+                logger.debug(
                     f"Skipping `{test_id}` as it requires missing dependencies: {e.required_dependencies}."
                     " Please install the missing dependencies to view and run this test."
                 )
@@ -151,7 +159,8 @@ def _load_tests(test_ids):
     return tests
 
 
-def _test_description(test_description: str, num_lines: int = 5):
+def _test_description(test_description: str, num_lines: int = 5) -> str:
+    """Format a test description"""
     description = test_description.strip("\n").strip()
 
     if len(description.split("\n")) > num_lines:
@@ -160,7 +169,10 @@ def _test_description(test_description: str, num_lines: int = 5):
     return description
 
 
-def _pretty_list_tests(tests, truncate=True):
+def _pretty_list_tests(
+    tests: Dict[str, Callable[..., Any]], truncate: bool = True
+) -> None:
+    """Pretty print a list of tests"""
     table = [
         {
             "ID": test_id,
@@ -171,6 +183,8 @@ def _pretty_list_tests(tests, truncate=True):
             ),
             "Required Inputs": list(test.inputs.keys()),
             "Params": test.params,
+            "Tags": test.__tags__,
+            "Tasks": test.__tasks__,
         }
         for test_id, test in tests.items()
     ]
@@ -178,10 +192,8 @@ def _pretty_list_tests(tests, truncate=True):
     return format_dataframe(pd.DataFrame(table))
 
 
-def list_tags():
-    """
-    List unique tags from all test classes.
-    """
+def list_tags() -> List[str]:
+    """List all unique available tags"""
 
     unique_tags = set()
 
@@ -191,7 +203,7 @@ def list_tags():
     return list(unique_tags)
 
 
-def list_tasks_and_tags(as_json=False):
+def list_tasks_and_tags(as_json: bool = False) -> Union[str, Dict[str, List[str]]]:
     """
     List all task types and their associated tags, with one row per task type and
     all tags for a task type in one row.
@@ -218,11 +230,8 @@ def list_tasks_and_tags(as_json=False):
     )
 
 
-def list_tasks():
-    """
-    List unique tasks from all test classes.
-    """
-
+def list_tasks() -> List[str]:
+    """List all unique available tasks"""
     unique_tasks = set()
 
     for test in _load_tests(list_tests(pretty=False)).values():
@@ -231,7 +240,13 @@ def list_tasks():
     return list(unique_tasks)
 
 
-def list_tests(filter=None, task=None, tags=None, pretty=True, truncate=True):
+def list_tests(
+    filter: Optional[str] = None,
+    task: Optional[str] = None,
+    tags: Optional[List[str]] = None,
+    pretty: bool = True,
+    truncate: bool = True,
+) -> Union[List[str], None]:
     """List all tests in the tests directory.
 
     Args:
@@ -245,9 +260,6 @@ def list_tests(filter=None, task=None, tags=None, pretty=True, truncate=True):
             formatted table. Defaults to True.
         truncate (bool, optional): If True, truncates the test description to the first
             line. Defaults to True. (only used if pretty=True)
-
-    Returns:
-        list or pandas.DataFrame: A list of all tests or a formatted table.
     """
     test_ids = _list_test_ids()
 
@@ -286,7 +298,9 @@ def list_tests(filter=None, task=None, tags=None, pretty=True, truncate=True):
         return _pretty_list_tests(tests, truncate=truncate)
 
 
-def describe_test(
+def describe_test(
+    test_id: Optional[TestID] = None, raw: bool = False, show: bool = True
+) -> Union[str, HTML, Dict[str, Any]]:
     """Get or show details about the test
 
     This function can be used to see test details including the test name, description,
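The typed signatures above make up the discovery API for the test registry. A usage sketch — the filter values and the test ID are illustrative, and the imports assume these helpers are re-exported from `validmind.tests`:

from validmind.tests import describe_test, list_tags, list_tasks, list_tests

# Narrow the registry by substring, task type, and tags
list_tests(filter="sklearn", task="classification", tags=["model_performance"])

# The vocabularies the task/tags filters draw from
all_tags = list_tags()    # List[str]
all_tasks = list_tasks()  # List[str]

# Details for a single test; raw=True returns a dict instead of rendered output
describe_test("validmind.model_validation.sklearn.ConfusionMatrix")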
validmind/tests/model_validation/BertScore.py
CHANGED
@@ -131,4 +131,10 @@ def BertScore(
     # Create a DataFrame from all collected statistics
     result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
 
-    return (
+    return (
+        result_df,
+        *figures,
+        RawData(
+            bert_scores_df=metrics_df, model=model.input_id, dataset=dataset.input_id
+        ),
+    )
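This BertScore hunk establishes the pattern repeated in most of the hunks below: in addition to its result table and figures, each test now returns a `RawData` object that bundles intermediate artifacts with the `input_id` of the model and dataset that produced them. A sketch of a custom test following the same return shape — the metric and keyword names are illustrative, `RawData` accepts arbitrary keyword arguments as these hunks show, and the `dataset.y` / `dataset.y_pred(model)` accessors are assumed to behave as in the tests in this diff:

import pandas as pd
import plotly.express as px

from validmind import RawData, tags, tasks


@tags("model_performance")
@tasks("regression")
def AbsoluteErrorSummary(dataset, model):
    """Table + figure + raw data, mirroring the return shape in these diffs."""
    errors = abs(dataset.y - dataset.y_pred(model))

    summary_df = pd.DataFrame({"Mean Absolute Error": [errors.mean()]})
    fig = px.histogram(x=errors, title="Absolute Error Distribution")

    # The table and figure are rendered in the result; the RawData travels
    # alongside them, keyed to the inputs that produced it.
    return (
        summary_df,
        fig,
        RawData(errors=errors, model=model.input_id, dataset=dataset.input_id),
    )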
validmind/tests/model_validation/BleuScore.py
CHANGED
@@ -114,4 +114,10 @@ def BleuScore(dataset, model):
     # Create a DataFrame from all collected statistics
     result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
 
-    return (
+    return (
+        result_df,
+        *figures,
+        RawData(
+            bleu_scores_df=metrics_df, model=model.input_id, dataset=dataset.input_id
+        ),
+    )
validmind/tests/model_validation/ClusterSizeDistribution.py
CHANGED
@@ -72,4 +72,6 @@ def ClusterSizeDistribution(dataset: VMDataset, model: VMModel):
     fig.update_yaxes(title_text="Counts", showgrid=False)
     fig.update_layout(title_text="Cluster distribution", title_x=0.5, barmode="group")
 
-    return fig, RawData(
+    return fig, RawData(
+        cluster_counts=df_counts, model=model.input_id, dataset=dataset.input_id
+    )
validmind/tests/model_validation/ContextualRecall.py
CHANGED
@@ -118,4 +118,12 @@ def ContextualRecall(dataset, model):
     # Create a DataFrame from all collected statistics
     result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
 
-    return (
+    return (
+        result_df,
+        *figures,
+        RawData(
+            contextual_recall_scores=metrics_df,
+            model=model.input_id,
+            dataset=dataset.input_id,
+        ),
+    )
validmind/tests/model_validation/MeteorScore.py
CHANGED
@@ -117,4 +117,10 @@ def MeteorScore(dataset, model):
     # Create a DataFrame from all collected statistics
     result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
 
-    return (
+    return (
+        result_df,
+        *figures,
+        RawData(
+            meteor_scores=metrics_df, model=model.input_id, dataset=dataset.input_id
+        ),
+    )
validmind/tests/model_validation/ModelPredictionResiduals.py
CHANGED
@@ -102,4 +102,8 @@ def ModelPredictionResiduals(
     # Create a summary DataFrame for the KS normality test results
     summary_df = pd.DataFrame([summary])
 
-    return (
+    return (
+        summary_df,
+        *figures,
+        RawData(residuals=residuals, model=model.input_id, dataset=dataset.input_id),
+    )
validmind/tests/model_validation/RegardScore.py
CHANGED
@@ -145,5 +145,10 @@ def RegardScore(dataset, model):
     return (
         result_df,
         *figures,
-        RawData(
+        RawData(
+            true_regard=true_df,
+            pred_regard=pred_df,
+            model=model.input_id,
+            dataset=dataset.input_id,
+        ),
     )
validmind/tests/model_validation/RegressionResidualsPlot.py
CHANGED
@@ -105,4 +105,13 @@ def RegressionResidualsPlot(model: VMModel, dataset: VMDataset, bin_size: float
         )
     )
 
-    return (
+    return (
+        *figures,
+        RawData(
+            residuals=residuals,
+            y_true=y_true,
+            y_pred=y_pred,
+            model=model.input_id,
+            dataset=dataset.input_id,
+        ),
+    )
@@ -121,5 +121,7 @@ def RougeScore(dataset, model, metric="rouge-1"):
|
|
121
121
|
return (
|
122
122
|
pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"}),
|
123
123
|
*figures,
|
124
|
-
RawData(
|
124
|
+
RawData(
|
125
|
+
rouge_scores_df=df_scores, model=model.input_id, dataset=dataset.input_id
|
126
|
+
),
|
125
127
|
)
|
validmind/tests/model_validation/TimeSeriesPredictionsPlot.py
CHANGED
@@ -4,7 +4,7 @@
 
 import plotly.graph_objects as go
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 
 
 @tags("model_predictions", "visualization")
@@ -70,4 +70,12 @@ def TimeSeriesPredictionsPlot(dataset, model):
         template="plotly_white",
     )
 
-
+    raw_data = RawData(
+        time_index=time_index,
+        actual_values=dataset.y,
+        predicted_values=y_pred,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
+
+    return fig, raw_data
validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py
CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
 import plotly.express as px
 from sklearn import metrics
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 
 
 @tags("model_performance", "sklearn")
@@ -105,4 +105,8 @@ def TimeSeriesR2SquareBySegments(dataset, model, segments=None):
         },
     )
 
-    return
+    return (
+        fig,
+        results_df,
+        RawData(summary=results_df, model=model.input_id, dataset=dataset.input_id),
+    )
validmind/tests/model_validation/TokenDisparity.py
CHANGED
@@ -108,4 +108,8 @@ def TokenDisparity(dataset, model):
     # Create a DataFrame from all collected statistics
     result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
 
-    return (
+    return (
+        result_df,
+        *figures,
+        RawData(token_counts_df=df, model=model.input_id, dataset=dataset.input_id),
+    )
validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py
CHANGED
@@ -113,5 +113,9 @@ def CosineSimilarityComparison(dataset, models):
     return (
         *figures,
         stats_df,
-        RawData(
+        RawData(
+            similarity_matrices=pd.DataFrame(similarity_matrices),
+            dataset=dataset.input_id,
+            models=[model.input_id for model in models],
+        ),
     )
validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py
CHANGED
@@ -59,4 +59,8 @@ def CosineSimilarityDistribution(dataset: VMDataset, model: VMModel):
         nbins=100,
         title="Cosine Similarity Distribution",
         labels={"x": "Cosine Similarity"},
-    ), RawData(
+    ), RawData(
+        similarity_scores=similarity_scores,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py
CHANGED
@@ -89,4 +89,8 @@ def EmbeddingsVisualization2D(
     fig = px.scatter(**scatter_kwargs)
     fig.update_layout(width=500, height=500)
 
-    return fig, RawData(
+    return fig, RawData(
+        tsne_embeddings=reduced_embeddings,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py
CHANGED
@@ -57,7 +57,7 @@ def EuclideanDistanceComparison(dataset, models):
     figures = []
     all_stats = []
 
-    distance_matrices =
+    distance_matrices = []
 
     # Generate all pairs of models for comparison
     for model_A, model_B in combinations(models, 2):
@@ -105,6 +105,10 @@ def EuclideanDistanceComparison(dataset, models):
     stats_df = pd.DataFrame(all_stats)
 
     # Add raw data to return
-    raw_data = RawData(
+    raw_data = RawData(
+        distance_matrices=pd.DataFrame(distance_matrices),
+        dataset=dataset.input_id,
+        models=[model.input_id for model in models],
+    )
 
     return (stats_df, *figures, raw_data)
validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py
CHANGED
@@ -97,4 +97,8 @@ def StabilityAnalysisKeyword(
         mean_similarity_threshold,
     )
 
-    return results, RawData(
+    return results, RawData(
+        original_perturbed_similarity=raw_data,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py
CHANGED
@@ -151,4 +151,8 @@ def StabilityAnalysisRandomNoise(
         mean_similarity_threshold,
     )
 
-    return *result, RawData(
+    return *result, RawData(
+        original_perturbed_similarity=raw_data,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py
CHANGED
@@ -107,4 +107,8 @@ def StabilityAnalysisSynonyms(
         mean_similarity_threshold,
     )
 
-    return *result, RawData(
+    return *result, RawData(
+        original_perturbed_similarity=raw_data,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py
CHANGED
@@ -134,4 +134,8 @@ def StabilityAnalysisTranslation(
         mean_similarity_threshold,
     )
 
-    return *result, RawData(
+    return *result, RawData(
+        original_perturbed_similarity=raw_data,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py
CHANGED
@@ -110,5 +110,10 @@ def TSNEComponentsPairwisePlots(
 
     return (
         *figures,
-        RawData(
+        RawData(
+            embeddings_scaled=embeddings_scaled,
+            tsne_results=tsne_results,
+            model=model.input_id,
+            dataset=dataset.input_id,
+        ),
     )
validmind/tests/model_validation/ragas/AnswerCorrectness.py
CHANGED
@@ -123,8 +123,10 @@ def AnswerCorrectness(
 
     score_column = "answer_correctness"
 
-    fig_histogram = px.histogram(
-
+    fig_histogram = px.histogram(
+        x=result_df[score_column].to_list(), nbins=10, title="Answer Correctness"
+    )
+    fig_box = px.box(x=result_df[score_column].to_list(), title="Answer Correctness")
 
     return (
         {
@@ -144,5 +146,5 @@ def AnswerCorrectness(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
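Every hunk above threads the producing `input_id` values into `RawData`, so a result run through the standard test harness records which inputs generated its raw artifacts. A closing sketch of exercising one of the updated tests — the input IDs are illustrative, and treating `result.log()` as the upload step is an assumption based on the library's usual flow rather than this diff:

import validmind as vm

result = vm.tests.run_test(
    "validmind.model_validation.TokenDisparity",
    inputs={"dataset": "test_dataset", "model": "my_model"},
)
result.log()  # assumption: logs tables, figures, and attached raw data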