validmind 2.5.24__py3-none-any.whl → 2.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.7.dist-info/METADATA +137 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.24.dist-info/METADATA +0 -118
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
validmind/tests/run.py
CHANGED
@@ -2,581 +2,399 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-import
-
-
+import platform
+import subprocess
+import time
+from datetime import datetime
+from inspect import getdoc
+from typing import Any, Dict, List, Optional, Tuple, Union
 from uuid import uuid4

-
-
-from validmind.
-from validmind.
+from validmind import __version__
+from validmind.ai.test_descriptions import get_result_description
+from validmind.errors import MissingRequiredTestInputError
+from validmind.input_registry import input_registry
 from validmind.logging import get_logger
-from validmind.
-from validmind.
-from validmind.vm_models import
-    MetricResult,
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    TestContext,
-    TestInput,
-    ThresholdTestResults,
-)
-from validmind.vm_models.figure import is_matplotlib_figure, is_plotly_figure
-from validmind.vm_models.test.result_wrapper import (
-    MetricResultWrapper,
-    ThresholdTestResultWrapper,
-)
+from validmind.utils import test_id_to_name
+from validmind.vm_models.input import VMInput
+from validmind.vm_models.result import TestResult

 from .__types__ import TestID
-from .
+from .comparison import combine_results, get_comparison_test_configs
+from .load import _test_description, describe_test, load_test
+from .output import process_output

 logger = get_logger(__name__)


-
-
-    return [dict(zip(input_grid, values)) for values in product(*input_grid.values())]
-
-
-def _combine_summaries(summaries: List[Dict[str, Any]]):
-    """Combine the summaries from multiple results
-
-    Args:
-        summaries (List[Dict[str, Any]]): A list of dictionaries where each dictionary
-            has two keys: "inputs" and "summary". The "inputs" key should contain the
-            inputs used for the test and the "summary" key should contain the actual
-            summary object.
-
-    Constraint: The summaries must all have the same structure meaning that each has
-        the same number of tables in the same order with the same columns etc. This
-        should always be the case for comparison tests since its the same test run
-        multiple times with different inputs.
-    """
-    if not summaries[0]["summary"]:
-        return None
-
-    def combine_tables(table_index):
-        combined_df = pd.DataFrame()
-
-        for summary_obj in summaries:
-            serialized = summary_obj["summary"].results[table_index].serialize()
-            summary_df = pd.DataFrame(serialized["data"])
-            summary_df = pd.concat(
-                [
-                    pd.DataFrame(summary_obj["inputs"], index=summary_df.index),
-                    summary_df,
-                ],
-                axis=1,
-            )
-            combined_df = pd.concat([combined_df, summary_df], ignore_index=True)
-
-        return ResultTable(
-            data=combined_df.to_dict(orient="records"),
-            metadata=summaries[0]["summary"].results[table_index].metadata,
-        )
-
-    return ResultSummary(
-        results=[
-            combine_tables(table_index)
-            for table_index in range(len(summaries[0]["summary"].results))
-        ]
-    )
-
-
-def _get_input_id(v):
-    if isinstance(v, str):
-        return v  # If v is a string, return it as is.
-    elif isinstance(v, list) and all(hasattr(item, "input_id") for item in v):
-        # If v is a list and all items have an input_id attribute, join their input_id values.
-        return ", ".join(item.input_id for item in v)
-    elif hasattr(v, "input_id"):
-        return v.input_id  # If v has an input_id attribute, return it.
-    return str(v)  # Otherwise, return the string representation of v.
-
-
-def _update_plotly_titles(figures, input_group, title_template):
-    for figure in figures:
-
-        current_title = figure.figure.layout.title.text
-
-        input_description = " and ".join(
-            f"{key}: {_get_input_id(value)}" for key, value in input_group.items()
-        )
-
-        figure.figure.layout.title.text = title_template.format(
-            current_title=f"{current_title} " if current_title else "",
-            input_description=input_description,
-        )
+# shouldn't change once initialized
+_run_metadata = {}


-def
-
+def _get_pip_freeze():
+    """Get a dict of package names and versions"""
+    output = subprocess.check_output(["pip", "freeze"]).decode("utf-8")
+    parsed = {}

-
-
-
-
-        input_description = " and ".join(
-            f"{key}: {_get_input_id(value)}" for key, value in input_group.items()
-        )
+    for line in output.split("\n"):
+        if not line:
+            continue

-
-
-
-
-        )
-
+        if "==" in line:
+            package, version = line.split("==")
+            parsed[package] = version
+        elif " @ " in line:
+            package = line.split(" @ ")[0]
+            parsed[package] = "__editable__"

+    return parsed

-def _combine_figures(figure_lists: List[List[Any]], input_groups: List[Dict[str, Any]]):
-    """Combine the figures from multiple results"""
-    if not figure_lists[0]:
-        return None

-
+def _get_run_metadata(**metadata: Dict[str, Any]) -> Dict[str, Any]:
+    """Get metadata for a test run result"""
+    if not _run_metadata:
+        _run_metadata["validmind"] = {"version": __version__}
+        _run_metadata["python"] = {
+            "version": platform.python_version(),
+            "implementation": platform.python_implementation(),
+            "compiler": platform.python_compiler(),
+        }
+        _run_metadata["platform"] = platform.platform()

-
-
-
-
-        elif is_matplotlib_figure(figures[0].figure):
-            _update_matplotlib_titles(figures, input_group, title_template)
-        else:
-            logger.warning("Cannot properly annotate png figures")
+        try:
+            _run_metadata["pip"] = _get_pip_freeze()
+        except Exception:
+            pass

-    return
+    return {
+        **_run_metadata,
+        **metadata,
+        "timestamp": datetime.now().isoformat(),
+    }


-def
-
-
+def _get_test_kwargs(
+    test_func: callable, inputs: Dict[str, Any], params: Dict[str, Any]
+):
+    """Insepect function signature to build kwargs to pass the inputs and params
+    that the test function expects

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    Args:
+        test_func (callable): Test function to inspect
+        inputs (dict): Test inputs... different formats are supported
+            e.g. {"dataset": dataset, "model": "model_id"}
+                 {"datasets": [dataset1, "dataset2_id"]}
+                 {"datasets": ("dataset1_id", "dataset2_id")}
+                 {"dataset": {
+                     "input_id": "dataset2_id",
+                     "options": {"columns": ["col1", "col2"]},
+                 }}
+        params (dict): Test parameters e.g. {"param1": 1, "param2": 2}
+
+    Returns:
+        tuple: Tuple of input and param kwargs
+    """
+    input_kwargs = {}  # map function inputs (`dataset` etc) to actual objects
+
+    for key in test_func.inputs.keys():
+        try:
+            _input = inputs[key]
+        except KeyError:
+            raise MissingRequiredTestInputError(f"Missing required input: {key}.")
+
+        # 1) retrieve input object from input registry if an input_id string is provided
+        # 2) check the input_id type if a list of inputs (mix of strings and objects) is provided
+        # 3) if its a dict, it should contain the `input_id` key as well as other options
+        if isinstance(_input, str):
+            _input = input_registry.get(key=_input)
+        elif isinstance(_input, list) or isinstance(_input, tuple):
+            _input = [
+                input_registry.get(key=v) if isinstance(v, str) else v for v in _input
+            ]
+        elif isinstance(_input, dict):
+            try:
+                _input = input_registry.get(key=_input["input_id"]).with_options(
+                    **{k: v for k, v in _input.items() if k != "input_id"}
+                )
+            except KeyError as e:
+                raise ValueError(
+                    "Input dictionary must contain an 'input_id' key "
+                    "to retrieve the input object from the input registry."
+                ) from e
+
+        input_kwargs[key] = _input
+
+    param_kwargs = {
+        key: value for key, value in params.items() if key in test_func.params
+    }
+
+    return input_kwargs, param_kwargs
+
+
+def build_test_result(
+    outputs: Union[Any, Tuple[Any, ...]],
+    test_id: str,
+    inputs: Dict[str, Union[VMInput, List[VMInput]]],
+    params: Union[Dict[str, Any], None],
+    description: str,
     generate_description: bool = True,
+    title: Optional[str] = None,
 ):
-    """Build a
+    """Build a TestResult object from a set of raw test function outputs"""
     ref_id = str(uuid4())

-
-    input_params_groups = input_params_groups or [{}]
-
-    input_group_strings = []
-
-    for input_params in input_params_groups:
-        new_group = {}
-        for param_k, param_v in input_params["params"].items():
-            new_group[param_k] = param_v
-        for metric_k, metric_v in input_params["inputs"].items():
-            # Process values in the input group
-            if isinstance(metric_v, str):
-                new_group[metric_k] = metric_v
-            elif hasattr(metric_v, "input_id"):
-                new_group[metric_k] = metric_v.input_id
-            elif isinstance(metric_v, list) and all(
-                hasattr(item, "input_id") for item in metric_v
-            ):
-                new_group[metric_k] = ", ".join([item.input_id for item in metric_v])
-            else:
-                raise ValueError(f"Unsupported type for value: {metric_v}")
-        input_group_strings.append(new_group)
-
-    # handle unit metrics (scalar values) by adding it to the summary
-    _combine_unit_metrics(results)
-
-    merged_summary = _combine_summaries(
-        [
-            {"inputs": input_group_strings[i], "summary": result.metric.summary}
-            for i, result in enumerate(results)
-        ]
-    )
-    merged_figures = _combine_figures(
-        [result.figures for result in results], input_params_groups
-    )
-
-    # Patch figure metadata so they are connected to the comparison result
-    if merged_figures and len(merged_figures):
-        for i, figure in enumerate(merged_figures):
-            figure.key = f"{figure.key}-{i}"
-            figure.metadata["_name"] = test_id
-            figure.metadata["_ref_id"] = ref_id
-
-    return MetricResultWrapper(
+    result = TestResult(
         result_id=test_id,
-
-
-
-
-                summary=merged_summary.serialize() if merged_summary else None,
-                figures=merged_figures,
-                should_generate=generate_description,
-            ),
-        ],
-        inputs=[
-            item.input_id if hasattr(item, "input_id") else item
-            for group in input_params_groups
-            for input in group["inputs"].values()
-            for item in (input if isinstance(input, list) else [input])
-            if hasattr(item, "input_id") or isinstance(item, str)
-        ],
-        output_template=output_template,
-        metric=MetricResult(
-            key=test_id,
-            ref_id=ref_id,
-            value=[],
-            summary=merged_summary,
-        ),
-        figures=merged_figures,
+        title=title,
+        ref_id=ref_id,
+        inputs=inputs,
+        params=params if params else None,  # None if empty dict or None
     )

+    if not isinstance(outputs, tuple):
+        outputs = (outputs,)

-
-
-    test_id: TestID,
-    input_groups: Union[Dict[str, List[Any]], List[Dict[str, Any]]],
-    output_template: str = None,
-    generate_description: bool = True,
-):
-    """Build a comparison result for multiple threshold test results"""
-    ref_id = str(uuid4())
+    for item in outputs:
+        process_output(item, result)

-
-
-
-
-
-
-
-
-                new_group[k] = v.input_id
-            elif isinstance(v, list) and all(hasattr(item, "input_id") for item in v):
-                new_group[k] = ", ".join([item.input_id for item in v])
-            else:
-                raise ValueError(f"Unsupported type for value: {v}")
-        input_group_strings.append(new_group)
-
-    merged_summary = _combine_summaries(
-        [
-            {"inputs": input_group_strings[i], "summary": result.test_results.summary}
-            for i, result in enumerate(results)
-        ]
-    )
-    merged_figures = _combine_figures(
-        [result.figures for result in results], input_groups
+    result.description = get_result_description(
+        test_id=test_id,
+        test_description=description,
+        tables=result.tables,
+        figures=result.figures,
+        metric=result.metric,
+        should_generate=generate_description,
+        title=title,
     )

-
-    if merged_figures and len(merged_figures):
-        for i, figure in enumerate(merged_figures):
-            figure.key = f"{figure.key}-{i}"
-            figure.metadata["_name"] = test_id
-            figure.metadata["_ref_id"] = ref_id
-
-    return ThresholdTestResultWrapper(
-        result_id=test_id,
-        result_metadata=[
-            get_description_metadata(
-                test_id=test_id,
-                default_description=f"Comparison test result for {test_id}",
-                summary=merged_summary.serialize() if merged_summary else None,
-                figures=merged_figures,
-                prefix="test_description",
-                should_generate=generate_description,
-            )
-        ],
-        inputs=[
-            input if isinstance(input, str) else input.input_id
-            for group in input_groups
-            for input in group.values()
-        ],
-        output_template=output_template,
-        test_results=ThresholdTestResults(
-            test_name=test_id,
-            ref_id=ref_id,
-            # TODO: when we have param_grid support, this will need to be updated
-            params=results[0].test_results.params,
-            passed=all(result.test_results.passed for result in results),
-            results=[],
-            summary=merged_summary,
-        ),
-        figures=merged_figures,
-    )
+    return result


-def
+def _run_composite_test(
     test_id: TestID,
-
-    inputs: Dict[str, Any]
-
-
-    param_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]]
-
-
-    output_template: str = None,
-    generate_description: bool = True,
+    metric_ids: List[TestID],
+    inputs: Union[Dict[str, Any], None],
+    input_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]], None],
+    params: Union[Dict[str, Any], None],
+    param_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]], None],
+    generate_description: bool,
+    title: Optional[str] = None,
 ):
-    """Run a
-    if input_grid:
-        if isinstance(input_grid, dict):
-            input_groups = _cartesian_product(input_grid)
-        else:
-            input_groups = input_grid
-    else:
-        input_groups = list(inputs) if inputs else []
-
-    if param_grid:
-        if isinstance(param_grid, dict):
-            param_groups = _cartesian_product(param_grid)
-        else:
-            param_groups = param_grid
-    else:
-        param_groups = list(params) if inputs else []
-
-    input_groups = input_groups or [{}]
-    param_groups = param_groups or [{}]
-    # Use itertools.product to compute the Cartesian product
-    inputs_params_product = [
-        {
-            "inputs": item1,
-            "params": item2,
-        }  # Merge dictionaries from input_groups and param_groups
-        for item1, item2 in itertools.product(input_groups, param_groups)
-    ]
+    """Run a composite test i.e. a test made up of multiple metrics"""
     results = [
         run_test(
-            test_id,
-
-
-
+            test_id=metric_id,
+            inputs=inputs,
+            input_grid=input_grid,
+            params=params,
+            param_grid=param_grid,
             show=False,
-
-
+            generate_description=False,
+            title=title,
         )
-        for
+        for metric_id in metric_ids
     ]
-    if isinstance(results[0], MetricResultWrapper):
-        func = metric_comparison
-    else:
-        func = threshold_test_comparison

-
-
+    # make sure to use is not None to handle for falsy values
+    if not all(result.metric is not None for result in results):
+        raise ValueError("All tests must return a metric when used as a composite test")
+
+    return build_test_result(
+        outputs=[
+            {
+                "Metric": test_id_to_name(result.result_id),
+                "Value": result.metric,
+            }
+            for result in results
+        ],  # pass in a single table with metric values as our 'outputs'
+        test_id=test_id,
+        inputs=results[0].inputs,
+        params=results[0].params,
+        description="\n\n".join(
+            [_test_description(result.description, num_lines=1) for result in results]
+        ),  # join truncated (first line only) test descriptions
+        generate_description=generate_description,
+        title=title,
     )

-    if show:
-        result.show()
-
-    return result
-
-
-def run_test(
-    test_id: TestID = None,
-    params: Dict[str, Any] = None,
-    param_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]]] = None,
-    inputs: Dict[str, Any] = None,
-    input_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]]] = None,
-    name: str = None,
-    unit_metrics: List[TestID] = None,
-    output_template: str = None,
-    show: bool = True,
-    __generate_description: bool = True,
-    **kwargs,
-) -> Union[MetricResultWrapper, ThresholdTestResultWrapper]:
-    """Run a test by test ID.
-    test_id (TestID, optional): The test ID to run. Not required if `unit_metrics` is provided.
-    params (dict, optional): A dictionary of parameters to pass into the test. Params
-        are used to customize the test behavior and are specific to each test. See the
-        test details for more information on the available parameters. Defaults to None.
-    param_grid (Union[Dict[str, List[Any]], List[Dict[str, Any]]], optional): To run
-        a comparison test, provide either a dictionary of parameters where the keys are
-        the parameter names and the values are lists of different parameters, or a list of
-        dictionaries where each dictionary is a set of parameters to run the test with.
-        This will run the test multiple times with different sets of parameters and then
-        combine the results into a single output. When passing a dictionary, the grid
-        will be created by taking the Cartesian product of the parameter lists. Its simply
-        a more convenient way of forming the param grid as opposed to passing a list of
-        all possible combinations. Defaults to None.
-    inputs (Dict[str, Any], optional): A dictionary of test inputs to pass into the
-        test. Inputs are either models or datasets that have been initialized using
-        vm.init_model() or vm.init_dataset(). Defaults to None.
-    input_grid (Union[Dict[str, List[Any]], List[Dict[str, Any]]], optional): To run
-        a comparison test, provide either a dictionary of inputs where the keys are
-        the input names and the values are lists of different inputs, or a list of
-        dictionaries where each dictionary is a set of inputs to run the test with.
-        This will run the test multiple times with different sets of inputs and then
-        combine the results into a single output. When passing a dictionary, the grid
-        will be created by taking the Cartesian product of the input lists. Its simply
-        a more convenient way of forming the input grid as opposed to passing a list of
-        all possible combinations. Defaults to None.
-    name (str, optional): The name of the test (used to create a composite metric
-        out of multiple unit metrics) - required when running multiple unit metrics
-    unit_metrics (list, optional): A list of unit metric IDs to run as a composite
-        metric - required when running multiple unit metrics
-    output_template (str, optional): A jinja2 html template to customize the output
-        of the test. Defaults to None.
-    show (bool, optional): Whether to display the results. Defaults to True.
-    **kwargs: Keyword inputs to pass into the test (same as `inputs` but as keyword
-        args instead of a dictionary):
-        - dataset: A validmind Dataset object or a Pandas DataFrame
-        - model: A model to use for the test
-        - models: A list of models to use for the test
-        - dataset: A validmind Dataset object or a Pandas DataFrame
-    """
-
-    # Validate input arguments with helper functions
-    validate_test_inputs(test_id, name, unit_metrics)
-    validate_grid_inputs(input_grid, kwargs, inputs, param_grid, params)

-
-
-
+def _run_comparison_test(
+    test_id: Union[TestID, None],
+    name: Union[str, None],
+    unit_metrics: Union[List[TestID], None],
+    inputs: Union[Dict[str, Any], None],
+    input_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]], None],
+    params: Union[Dict[str, Any], None],
+    param_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]], None],
+    generate_description: bool,
+    title: Optional[str] = None,
+):
+    """Run a comparison test i.e. a test that compares multiple outputs of a test across
+    different input and/or param combinations"""
+    run_test_configs = get_comparison_test_configs(
+        input_grid=input_grid,
+        param_grid=param_grid,
+        inputs=inputs,
+        params=params,
+    )

-
-
-
-
-
-
-
-
-
-
-            output_template,
-            show,
-            __generate_description,
+    results = [
+        run_test(
+            test_id=test_id,
+            name=name,
+            unit_metrics=unit_metrics,
+            inputs=config["inputs"],
+            params=config["params"],
+            show=False,
+            generate_description=False,
+            title=title,
         )
+        for config in run_test_configs
+    ]

-    #
-    if test_id
-
-
-
-
+    # composite tests have a test_id thats built from the name
+    if not test_id:
+        test_id = results[0].result_id
+        description = results[0].description
+    else:
+        description = describe_test(test_id, raw=True)["Description"]

-
-    TestClass = load_test_class(test_id, unit_metrics, name)
+    combined_outputs, combined_inputs, combined_params = combine_results(results)

-
-
+    return build_test_result(
+        outputs=tuple(combined_outputs),
         test_id=test_id,
-
-
-
-
-
+        inputs=combined_inputs,
+        params=combined_params,
+        description=description,
+        generate_description=generate_description,
+        title=title,
     )

-    test.run()

-
-
+def run_test(
+    test_id: Union[TestID, None] = None,
+    name: Union[str, None] = None,
+    unit_metrics: Union[List[TestID], None] = None,
+    inputs: Union[Dict[str, Any], None] = None,
+    input_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]], None] = None,
+    params: Union[Dict[str, Any], None] = None,
+    param_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]], None] = None,
+    show: bool = True,
+    generate_description: bool = True,
+    title: Optional[str] = None,
+    **kwargs,
+) -> TestResult:
+    """Run a ValidMind or custom test

-
+    This function is the main entry point for running tests. It can run simple unit metrics,
+    ValidMind and custom tests, composite tests made up of multiple unit metrics and comparison
+    tests made up of multiple tests.

+    Args:
+        test_id (TestID, optional): Test ID to run. Not required if `name` and `unit_metrics` provided.
+        params (dict, optional): Parameters to customize test behavior. See test details for available parameters.
+        param_grid (Union[Dict[str, List[Any]], List[Dict[str, Any]]], optional): For comparison tests, either:
+            - Dict mapping parameter names to lists of values (creates Cartesian product)
+            - List of parameter dictionaries to test
+        inputs (Dict[str, Any], optional): Test inputs (models/datasets initialized with vm.init_model/dataset)
+        input_grid (Union[Dict[str, List[Any]], List[Dict[str, Any]]], optional): For comparison tests, either:
+            - Dict mapping input names to lists of values (creates Cartesian product)
+            - List of input dictionaries to test
+        name (str, optional): Test name (required for composite metrics)
+        unit_metrics (list, optional): Unit metric IDs to run as composite metric
+        show (bool, optional): Whether to display results. Defaults to True.
+        generate_description (bool, optional): Whether to generate a description. Defaults to True.
+        title (str, optional): Custom title for the test result
+
+    Returns:
+        TestResult: A TestResult object containing the test results
+
+    Raises:
+        ValueError: If the test inputs are invalid
+        LoadTestError: If the test class fails to load
+    """
+    # legacy support for passing inputs as kwargs
+    inputs = inputs or kwargs

-def validate_test_inputs(test_id, name, unit_metrics):
-    """Validate the main test inputs for `test_id`, `name`, and `unit_metrics`."""
     if not test_id and not (name and unit_metrics):
         raise ValueError(
-            "`test_id` or
+            "`test_id` or `name` and `unit_metrics` must be provided to run a test"
         )

     if bool(unit_metrics) != bool(name):
         raise ValueError("`name` and `unit_metrics` must be provided together")

+    if input_grid and inputs:
+        raise ValueError("Cannot provide `input_grid` along with `inputs`")

-
-
-    if input_grid and (kwargs or inputs):
-        raise ValueError("Cannot provide `input_grid` along with `inputs` or `kwargs`")
+    if param_grid and params:
+        raise ValueError("Cannot provide `param_grid` along with `params`")

-
-        raise ValueError("Cannot provide `param_grid` along with `params` or `kwargs`")
+    start_time = time.perf_counter()

-
-
-
-
-        return f"validmind.composite_metric.{metric_id_name}" or test_id
-
-
-def run_comparison_test_with_grids(
-    test_id,
-    inputs,
-    input_grid,
-    param_grid,
-    name,
-    unit_metrics,
-    params,
-    output_template,
-    show,
-    generate_description,
-):
-    """Run a comparison test based on the presence of input and param grids."""
-    if input_grid and param_grid:
-        return run_comparison_test(
-            test_id,
-            input_grid,
+    if input_grid or param_grid:
+        result = _run_comparison_test(
+            test_id=test_id,
+            title=title,
             name=name,
             unit_metrics=unit_metrics,
+            inputs=inputs,
+            input_grid=input_grid,
+            params=params,
             param_grid=param_grid,
-            output_template=output_template,
-            show=show,
             generate_description=generate_description,
         )
-
-
-
-
-
-
+
+    elif unit_metrics:
+        name = "".join(word.capitalize() for word in name.split())
+        test_id = f"validmind.composite_metric.{name}"
+
+        result = _run_composite_test(
+            test_id=test_id,
+            metric_ids=unit_metrics,
+            inputs=inputs,
+            input_grid=input_grid,
             params=params,
-
-            show=show,
+            param_grid=param_grid,
             generate_description=generate_description,
+            title=title,
         )
-
-
-
+
+    elif input_grid or param_grid:
+        result = _run_comparison_test(
+            test_id=test_id,
             inputs=inputs,
-
-
+            input_grid=input_grid,
+            params=params,
             param_grid=param_grid,
-            output_template=output_template,
-            show=show,
             generate_description=generate_description,
+            title=title,
+        )
+
+    else:
+        test_func = load_test(test_id)
+
+        input_kwargs, param_kwargs = _get_test_kwargs(
+            test_func, inputs or {}, params or {}
         )

+        raw_result = test_func(**input_kwargs, **param_kwargs)

-
-
-
-
-
-
+        result = build_test_result(
+            outputs=raw_result,
+            test_id=test_id,
+            inputs=input_kwargs,
+            params=param_kwargs,
+            description=getdoc(test_func),
+            generate_description=generate_description,
+            title=title,
         )
-
-
-
-
+
+    end_time = time.perf_counter()
+    result.metadata = _get_run_metadata(duration_seconds=end_time - start_time)
+
+    if show:
+        result.show()
+
+    return result