validmind 2.5.25__py3-none-any.whl → 2.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.8.dist-info/METADATA +137 -0
  179. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.25.dist-info/METADATA +0 -118
  196. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/entry_points.txt +0 -0
validmind/tests/run.py CHANGED
@@ -2,581 +2,399 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- import itertools
- from itertools import product
- from typing import Any, Dict, List, Union
+ import platform
+ import subprocess
+ import time
+ from datetime import datetime
+ from inspect import getdoc
+ from typing import Any, Dict, List, Optional, Tuple, Union
  from uuid import uuid4

- import pandas as pd
-
- from validmind.ai.test_descriptions import get_description_metadata
- from validmind.errors import LoadTestError
+ from validmind import __version__
+ from validmind.ai.test_descriptions import get_result_description
+ from validmind.errors import MissingRequiredTestInputError
+ from validmind.input_registry import input_registry
  from validmind.logging import get_logger
- from validmind.unit_metrics import run_metric
- from validmind.unit_metrics.composite import load_composite_metric
- from validmind.vm_models import (
- MetricResult,
- ResultSummary,
- ResultTable,
- ResultTableMetadata,
- TestContext,
- TestInput,
- ThresholdTestResults,
- )
- from validmind.vm_models.figure import is_matplotlib_figure, is_plotly_figure
- from validmind.vm_models.test.result_wrapper import (
- MetricResultWrapper,
- ThresholdTestResultWrapper,
- )
+ from validmind.utils import test_id_to_name
+ from validmind.vm_models.input import VMInput
+ from validmind.vm_models.result import TestResult

  from .__types__ import TestID
- from .load import load_test
+ from .comparison import combine_results, get_comparison_test_configs
+ from .load import _test_description, describe_test, load_test
+ from .output import process_output

  logger = get_logger(__name__)


- def _cartesian_product(input_grid: Dict[str, List[Any]]):
- """Get all possible combinations for a set of inputs"""
- return [dict(zip(input_grid, values)) for values in product(*input_grid.values())]
-
-
- def _combine_summaries(summaries: List[Dict[str, Any]]):
- """Combine the summaries from multiple results
-
- Args:
- summaries (List[Dict[str, Any]]): A list of dictionaries where each dictionary
- has two keys: "inputs" and "summary". The "inputs" key should contain the
- inputs used for the test and the "summary" key should contain the actual
- summary object.
-
- Constraint: The summaries must all have the same structure meaning that each has
- the same number of tables in the same order with the same columns etc. This
- should always be the case for comparison tests since its the same test run
- multiple times with different inputs.
- """
- if not summaries[0]["summary"]:
- return None
-
- def combine_tables(table_index):
- combined_df = pd.DataFrame()
-
- for summary_obj in summaries:
- serialized = summary_obj["summary"].results[table_index].serialize()
- summary_df = pd.DataFrame(serialized["data"])
- summary_df = pd.concat(
- [
- pd.DataFrame(summary_obj["inputs"], index=summary_df.index),
- summary_df,
- ],
- axis=1,
- )
- combined_df = pd.concat([combined_df, summary_df], ignore_index=True)
-
- return ResultTable(
- data=combined_df.to_dict(orient="records"),
- metadata=summaries[0]["summary"].results[table_index].metadata,
- )
-
- return ResultSummary(
- results=[
- combine_tables(table_index)
- for table_index in range(len(summaries[0]["summary"].results))
- ]
- )
-
-
- def _get_input_id(v):
- if isinstance(v, str):
- return v # If v is a string, return it as is.
- elif isinstance(v, list) and all(hasattr(item, "input_id") for item in v):
- # If v is a list and all items have an input_id attribute, join their input_id values.
- return ", ".join(item.input_id for item in v)
- elif hasattr(v, "input_id"):
- return v.input_id # If v has an input_id attribute, return it.
- return str(v) # Otherwise, return the string representation of v.
-
-
- def _update_plotly_titles(figures, input_group, title_template):
- for figure in figures:
-
- current_title = figure.figure.layout.title.text
-
- input_description = " and ".join(
- f"{key}: {_get_input_id(value)}" for key, value in input_group.items()
- )
-
- figure.figure.layout.title.text = title_template.format(
- current_title=f"{current_title} " if current_title else "",
- input_description=input_description,
- )
+ # shouldn't change once initialized
+ _run_metadata = {}


- def _update_matplotlib_titles(figures, input_group, title_template):
- for figure in figures:
+ def _get_pip_freeze():
+ """Get a dict of package names and versions"""
+ output = subprocess.check_output(["pip", "freeze"]).decode("utf-8")
+ parsed = {}

- current_title = (
- figure.figure._suptitle.get_text() if figure.figure._suptitle else ""
- )
-
- input_description = " and ".join(
- f"{key}: {_get_input_id(value)}" for key, value in input_group.items()
- )
+ for line in output.split("\n"):
+ if not line:
+ continue

- figure.figure.suptitle(
- title_template.format(
- current_title=f"{current_title} " if current_title else "",
- input_description=input_description,
- )
- )
+ if "==" in line:
+ package, version = line.split("==")
+ parsed[package] = version
+ elif " @ " in line:
+ package = line.split(" @ ")[0]
+ parsed[package] = "__editable__"

+ return parsed

- def _combine_figures(figure_lists: List[List[Any]], input_groups: List[Dict[str, Any]]):
- """Combine the figures from multiple results"""
- if not figure_lists[0]:
- return None

- title_template = "{current_title}({input_description})"
+ def _get_run_metadata(**metadata: Dict[str, Any]) -> Dict[str, Any]:
+ """Get metadata for a test run result"""
+ if not _run_metadata:
+ _run_metadata["validmind"] = {"version": __version__}
+ _run_metadata["python"] = {
+ "version": platform.python_version(),
+ "implementation": platform.python_implementation(),
+ "compiler": platform.python_compiler(),
+ }
+ _run_metadata["platform"] = platform.platform()

- for idx, figures in enumerate(figure_lists):
- input_group = input_groups[idx]["inputs"]
- if is_plotly_figure(figures[0].figure):
- _update_plotly_titles(figures, input_group, title_template)
- elif is_matplotlib_figure(figures[0].figure):
- _update_matplotlib_titles(figures, input_group, title_template)
- else:
- logger.warning("Cannot properly annotate png figures")
+ try:
+ _run_metadata["pip"] = _get_pip_freeze()
+ except Exception:
+ pass

- return [figure for figures in figure_lists for figure in figures]
+ return {
+ **_run_metadata,
+ **metadata,
+ "timestamp": datetime.now().isoformat(),
+ }


- def _combine_unit_metrics(results: List[MetricResultWrapper]):
- if not results[0].scalar:
- return
+ def _get_test_kwargs(
+ test_func: callable, inputs: Dict[str, Any], params: Dict[str, Any]
+ ):
+ """Insepect function signature to build kwargs to pass the inputs and params
+ that the test function expects

- for result in results:
- table = ResultTable(
- data=[{"value": result.scalar}],
- metadata=ResultTableMetadata(title="Unit Metrics"),
- )
- if not result.metric:
- result.metric = MetricResult(
- ref_id="will_be_overwritten",
- key=result.result_id,
- value=result.scalar,
- summary=ResultSummary(results=[table]),
- )
- else:
- result.metric.summary.results.append(table)
-
-
- def metric_comparison(
- results: List[MetricResultWrapper],
- test_id: TestID,
- input_params_groups: Union[Dict[str, List[Any]], List[Dict[str, Any]]],
- output_template: str = None,
+ Args:
+ test_func (callable): Test function to inspect
+ inputs (dict): Test inputs... different formats are supported
+ e.g. {"dataset": dataset, "model": "model_id"}
+ {"datasets": [dataset1, "dataset2_id"]}
+ {"datasets": ("dataset1_id", "dataset2_id")}
+ {"dataset": {
+ "input_id": "dataset2_id",
+ "options": {"columns": ["col1", "col2"]},
+ }}
+ params (dict): Test parameters e.g. {"param1": 1, "param2": 2}
+
+ Returns:
+ tuple: Tuple of input and param kwargs
+ """
+ input_kwargs = {} # map function inputs (`dataset` etc) to actual objects
+
+ for key in test_func.inputs.keys():
+ try:
+ _input = inputs[key]
+ except KeyError:
+ raise MissingRequiredTestInputError(f"Missing required input: {key}.")
+
+ # 1) retrieve input object from input registry if an input_id string is provided
+ # 2) check the input_id type if a list of inputs (mix of strings and objects) is provided
+ # 3) if its a dict, it should contain the `input_id` key as well as other options
+ if isinstance(_input, str):
+ _input = input_registry.get(key=_input)
+ elif isinstance(_input, list) or isinstance(_input, tuple):
+ _input = [
+ input_registry.get(key=v) if isinstance(v, str) else v for v in _input
+ ]
+ elif isinstance(_input, dict):
+ try:
+ _input = input_registry.get(key=_input["input_id"]).with_options(
+ **{k: v for k, v in _input.items() if k != "input_id"}
+ )
+ except KeyError as e:
+ raise ValueError(
+ "Input dictionary must contain an 'input_id' key "
+ "to retrieve the input object from the input registry."
+ ) from e
+
+ input_kwargs[key] = _input
+
+ param_kwargs = {
+ key: value for key, value in params.items() if key in test_func.params
+ }
+
+ return input_kwargs, param_kwargs
+
+
+ def build_test_result(
+ outputs: Union[Any, Tuple[Any, ...]],
+ test_id: str,
+ inputs: Dict[str, Union[VMInput, List[VMInput]]],
+ params: Union[Dict[str, Any], None],
+ description: str,
  generate_description: bool = True,
+ title: Optional[str] = None,
  ):
- """Build a comparison result for multiple metric results"""
+ """Build a TestResult object from a set of raw test function outputs"""
  ref_id = str(uuid4())

- # Treat param_groups and input_groups as empty lists if they are None or empty
- input_params_groups = input_params_groups or [{}]
-
- input_group_strings = []
-
- for input_params in input_params_groups:
- new_group = {}
- for param_k, param_v in input_params["params"].items():
- new_group[param_k] = param_v
- for metric_k, metric_v in input_params["inputs"].items():
- # Process values in the input group
- if isinstance(metric_v, str):
- new_group[metric_k] = metric_v
- elif hasattr(metric_v, "input_id"):
- new_group[metric_k] = metric_v.input_id
- elif isinstance(metric_v, list) and all(
- hasattr(item, "input_id") for item in metric_v
- ):
- new_group[metric_k] = ", ".join([item.input_id for item in metric_v])
- else:
- raise ValueError(f"Unsupported type for value: {metric_v}")
- input_group_strings.append(new_group)
-
- # handle unit metrics (scalar values) by adding it to the summary
- _combine_unit_metrics(results)
-
- merged_summary = _combine_summaries(
- [
- {"inputs": input_group_strings[i], "summary": result.metric.summary}
- for i, result in enumerate(results)
- ]
- )
- merged_figures = _combine_figures(
- [result.figures for result in results], input_params_groups
- )
-
- # Patch figure metadata so they are connected to the comparison result
- if merged_figures and len(merged_figures):
- for i, figure in enumerate(merged_figures):
- figure.key = f"{figure.key}-{i}"
- figure.metadata["_name"] = test_id
- figure.metadata["_ref_id"] = ref_id
-
- return MetricResultWrapper(
+ result = TestResult(
  result_id=test_id,
- result_metadata=[
- get_description_metadata(
- test_id=test_id,
- default_description=f"Comparison test result for {test_id}",
- summary=merged_summary.serialize() if merged_summary else None,
- figures=merged_figures,
- should_generate=generate_description,
- ),
- ],
- inputs=[
- item.input_id if hasattr(item, "input_id") else item
- for group in input_params_groups
- for input in group["inputs"].values()
- for item in (input if isinstance(input, list) else [input])
- if hasattr(item, "input_id") or isinstance(item, str)
- ],
- output_template=output_template,
- metric=MetricResult(
- key=test_id,
- ref_id=ref_id,
- value=[],
- summary=merged_summary,
- ),
- figures=merged_figures,
+ title=title,
+ ref_id=ref_id,
+ inputs=inputs,
+ params=params if params else None, # None if empty dict or None
  )

+ if not isinstance(outputs, tuple):
+ outputs = (outputs,)

- def threshold_test_comparison(
- results: List[ThresholdTestResultWrapper],
- test_id: TestID,
- input_groups: Union[Dict[str, List[Any]], List[Dict[str, Any]]],
- output_template: str = None,
- generate_description: bool = True,
- ):
- """Build a comparison result for multiple threshold test results"""
- ref_id = str(uuid4())
+ for item in outputs:
+ process_output(item, result)

- input_group_strings = []
-
- for group in input_groups:
- new_group = {}
- for k, v in group.items():
- if isinstance(v, str):
- new_group[k] = v
- elif hasattr(v, "input_id"):
- new_group[k] = v.input_id
- elif isinstance(v, list) and all(hasattr(item, "input_id") for item in v):
- new_group[k] = ", ".join([item.input_id for item in v])
- else:
- raise ValueError(f"Unsupported type for value: {v}")
- input_group_strings.append(new_group)
-
- merged_summary = _combine_summaries(
- [
- {"inputs": input_group_strings[i], "summary": result.test_results.summary}
- for i, result in enumerate(results)
- ]
- )
- merged_figures = _combine_figures(
- [result.figures for result in results], input_groups
+ result.description = get_result_description(
+ test_id=test_id,
+ test_description=description,
+ tables=result.tables,
+ figures=result.figures,
+ metric=result.metric,
+ should_generate=generate_description,
+ title=title,
  )

- # Patch figure metadata so they are connected to the comparison result
- if merged_figures and len(merged_figures):
- for i, figure in enumerate(merged_figures):
- figure.key = f"{figure.key}-{i}"
- figure.metadata["_name"] = test_id
- figure.metadata["_ref_id"] = ref_id
-
- return ThresholdTestResultWrapper(
- result_id=test_id,
- result_metadata=[
- get_description_metadata(
- test_id=test_id,
- default_description=f"Comparison test result for {test_id}",
- summary=merged_summary.serialize() if merged_summary else None,
- figures=merged_figures,
- prefix="test_description",
- should_generate=generate_description,
- )
- ],
- inputs=[
- input if isinstance(input, str) else input.input_id
- for group in input_groups
- for input in group.values()
- ],
- output_template=output_template,
- test_results=ThresholdTestResults(
- test_name=test_id,
- ref_id=ref_id,
- # TODO: when we have param_grid support, this will need to be updated
- params=results[0].test_results.params,
- passed=all(result.test_results.passed for result in results),
- results=[],
- summary=merged_summary,
- ),
- figures=merged_figures,
- )
+ return result


- def run_comparison_test(
+ def _run_composite_test(
  test_id: TestID,
- input_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]]] = None,
- inputs: Dict[str, Any] = None,
- name: str = None,
- unit_metrics: List[TestID] = None,
- param_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]]] = None,
- params: Dict[str, Any] = None,
- show: bool = True,
- output_template: str = None,
- generate_description: bool = True,
+ metric_ids: List[TestID],
+ inputs: Union[Dict[str, Any], None],
+ input_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]], None],
+ params: Union[Dict[str, Any], None],
+ param_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]], None],
+ generate_description: bool,
+ title: Optional[str] = None,
  ):
- """Run a comparison test"""
- if input_grid:
- if isinstance(input_grid, dict):
- input_groups = _cartesian_product(input_grid)
- else:
- input_groups = input_grid
- else:
- input_groups = list(inputs) if inputs else []
-
- if param_grid:
- if isinstance(param_grid, dict):
- param_groups = _cartesian_product(param_grid)
- else:
- param_groups = param_grid
- else:
- param_groups = list(params) if inputs else []
-
- input_groups = input_groups or [{}]
- param_groups = param_groups or [{}]
- # Use itertools.product to compute the Cartesian product
- inputs_params_product = [
- {
- "inputs": item1,
- "params": item2,
- } # Merge dictionaries from input_groups and param_groups
- for item1, item2 in itertools.product(input_groups, param_groups)
- ]
+ """Run a composite test i.e. a test made up of multiple metrics"""
  results = [
  run_test(
- test_id,
- name=name,
- unit_metrics=unit_metrics,
- inputs=inputs_params["inputs"],
+ test_id=metric_id,
+ inputs=inputs,
+ input_grid=input_grid,
+ params=params,
+ param_grid=param_grid,
  show=False,
- params=inputs_params["params"],
- __generate_description=False,
+ generate_description=False,
+ title=title,
  )
- for inputs_params in (inputs_params_product or [{}])
+ for metric_id in metric_ids
  ]
- if isinstance(results[0], MetricResultWrapper):
- func = metric_comparison
- else:
- func = threshold_test_comparison

- result = func(
- results, test_id, inputs_params_product, output_template, generate_description
+ # make sure to use is not None to handle for falsy values
+ if not all(result.metric is not None for result in results):
+ raise ValueError("All tests must return a metric when used as a composite test")
+
+ return build_test_result(
+ outputs=[
+ {
+ "Metric": test_id_to_name(result.result_id),
+ "Value": result.metric,
+ }
+ for result in results
+ ], # pass in a single table with metric values as our 'outputs'
+ test_id=test_id,
+ inputs=results[0].inputs,
+ params=results[0].params,
+ description="\n\n".join(
+ [_test_description(result.description, num_lines=1) for result in results]
+ ), # join truncated (first line only) test descriptions
+ generate_description=generate_description,
+ title=title,
  )

- if show:
- result.show()
-
- return result
-
-
- def run_test(
- test_id: TestID = None,
- params: Dict[str, Any] = None,
- param_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]]] = None,
- inputs: Dict[str, Any] = None,
- input_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]]] = None,
- name: str = None,
- unit_metrics: List[TestID] = None,
- output_template: str = None,
- show: bool = True,
- __generate_description: bool = True,
- **kwargs,
- ) -> Union[MetricResultWrapper, ThresholdTestResultWrapper]:
- """Run a test by test ID.
- test_id (TestID, optional): The test ID to run. Not required if `unit_metrics` is provided.
- params (dict, optional): A dictionary of parameters to pass into the test. Params
- are used to customize the test behavior and are specific to each test. See the
- test details for more information on the available parameters. Defaults to None.
- param_grid (Union[Dict[str, List[Any]], List[Dict[str, Any]]], optional): To run
- a comparison test, provide either a dictionary of parameters where the keys are
- the parameter names and the values are lists of different parameters, or a list of
- dictionaries where each dictionary is a set of parameters to run the test with.
- This will run the test multiple times with different sets of parameters and then
- combine the results into a single output. When passing a dictionary, the grid
- will be created by taking the Cartesian product of the parameter lists. Its simply
- a more convenient way of forming the param grid as opposed to passing a list of
- all possible combinations. Defaults to None.
- inputs (Dict[str, Any], optional): A dictionary of test inputs to pass into the
- test. Inputs are either models or datasets that have been initialized using
- vm.init_model() or vm.init_dataset(). Defaults to None.
- input_grid (Union[Dict[str, List[Any]], List[Dict[str, Any]]], optional): To run
- a comparison test, provide either a dictionary of inputs where the keys are
- the input names and the values are lists of different inputs, or a list of
- dictionaries where each dictionary is a set of inputs to run the test with.
- This will run the test multiple times with different sets of inputs and then
- combine the results into a single output. When passing a dictionary, the grid
- will be created by taking the Cartesian product of the input lists. Its simply
- a more convenient way of forming the input grid as opposed to passing a list of
- all possible combinations. Defaults to None.
- name (str, optional): The name of the test (used to create a composite metric
- out of multiple unit metrics) - required when running multiple unit metrics
- unit_metrics (list, optional): A list of unit metric IDs to run as a composite
- metric - required when running multiple unit metrics
- output_template (str, optional): A jinja2 html template to customize the output
- of the test. Defaults to None.
- show (bool, optional): Whether to display the results. Defaults to True.
- **kwargs: Keyword inputs to pass into the test (same as `inputs` but as keyword
- args instead of a dictionary):
- - dataset: A validmind Dataset object or a Pandas DataFrame
- - model: A model to use for the test
- - models: A list of models to use for the test
- - dataset: A validmind Dataset object or a Pandas DataFrame
- """
-
- # Validate input arguments with helper functions
- validate_test_inputs(test_id, name, unit_metrics)
- validate_grid_inputs(input_grid, kwargs, inputs, param_grid, params)

- # Handle composite metric creation
- if unit_metrics:
- test_id = generate_composite_test_id(name, test_id)
+ def _run_comparison_test(
+ test_id: Union[TestID, None],
+ name: Union[str, None],
+ unit_metrics: Union[List[TestID], None],
+ inputs: Union[Dict[str, Any], None],
+ input_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]], None],
+ params: Union[Dict[str, Any], None],
+ param_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]], None],
+ generate_description: bool,
+ title: Optional[str] = None,
+ ):
+ """Run a comparison test i.e. a test that compares multiple outputs of a test across
+ different input and/or param combinations"""
+ run_test_configs = get_comparison_test_configs(
+ input_grid=input_grid,
+ param_grid=param_grid,
+ inputs=inputs,
+ params=params,
+ )

- # Run comparison tests if applicable
- if input_grid or param_grid:
- return run_comparison_test_with_grids(
- test_id,
- inputs,
- input_grid,
- param_grid,
- name,
- unit_metrics,
- params,
- output_template,
- show,
- __generate_description,
+ results = [
+ run_test(
+ test_id=test_id,
+ name=name,
+ unit_metrics=unit_metrics,
+ inputs=config["inputs"],
+ params=config["params"],
+ show=False,
+ generate_description=False,
+ title=title,
  )
+ for config in run_test_configs
+ ]

- # Run unit metric tests
- if test_id.startswith("validmind.unit_metrics"):
- # TODO: as we move towards a more unified approach to metrics
- # we will want to make everything functional and remove the
- # separation between unit metrics and "normal" metrics
- return run_metric(test_id, inputs=inputs, params=params, show=show)
+ # composite tests have a test_id thats built from the name
+ if not test_id:
+ test_id = results[0].result_id
+ description = results[0].description
+ else:
+ description = describe_test(test_id, raw=True)["Description"]

- # Load the appropriate test class
- TestClass = load_test_class(test_id, unit_metrics, name)
+ combined_outputs, combined_inputs, combined_params = combine_results(results)

- # Create and run the test
- test = TestClass(
+ return build_test_result(
+ outputs=tuple(combined_outputs),
  test_id=test_id,
- context=TestContext(),
- inputs=TestInput({**kwargs, **(inputs or {})}),
- output_template=output_template,
- params=params,
- generate_description=__generate_description,
+ inputs=combined_inputs,
+ params=combined_params,
+ description=description,
+ generate_description=generate_description,
+ title=title,
  )

- test.run()
-
- if show:
- test.result.show()
+ def run_test(
+ test_id: Union[TestID, None] = None,
+ name: Union[str, None] = None,
+ unit_metrics: Union[List[TestID], None] = None,
+ inputs: Union[Dict[str, Any], None] = None,
+ input_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]], None] = None,
+ params: Union[Dict[str, Any], None] = None,
+ param_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]], None] = None,
+ show: bool = True,
+ generate_description: bool = True,
+ title: Optional[str] = None,
+ **kwargs,
+ ) -> TestResult:
+ """Run a ValidMind or custom test

- return test.result
+ This function is the main entry point for running tests. It can run simple unit metrics,
+ ValidMind and custom tests, composite tests made up of multiple unit metrics and comparison
+ tests made up of multiple tests.

+ Args:
+ test_id (TestID, optional): Test ID to run. Not required if `name` and `unit_metrics` provided.
+ params (dict, optional): Parameters to customize test behavior. See test details for available parameters.
+ param_grid (Union[Dict[str, List[Any]], List[Dict[str, Any]]], optional): For comparison tests, either:
+ - Dict mapping parameter names to lists of values (creates Cartesian product)
+ - List of parameter dictionaries to test
+ inputs (Dict[str, Any], optional): Test inputs (models/datasets initialized with vm.init_model/dataset)
+ input_grid (Union[Dict[str, List[Any]], List[Dict[str, Any]]], optional): For comparison tests, either:
+ - Dict mapping input names to lists of values (creates Cartesian product)
+ - List of input dictionaries to test
+ name (str, optional): Test name (required for composite metrics)
+ unit_metrics (list, optional): Unit metric IDs to run as composite metric
+ show (bool, optional): Whether to display results. Defaults to True.
+ generate_description (bool, optional): Whether to generate a description. Defaults to True.
+ title (str, optional): Custom title for the test result
+
+ Returns:
+ TestResult: A TestResult object containing the test results
+
+ Raises:
+ ValueError: If the test inputs are invalid
+ LoadTestError: If the test class fails to load
+ """
+ # legacy support for passing inputs as kwargs
+ inputs = inputs or kwargs

- def validate_test_inputs(test_id, name, unit_metrics):
- """Validate the main test inputs for `test_id`, `name`, and `unit_metrics`."""
  if not test_id and not (name and unit_metrics):
  raise ValueError(
- "`test_id` or both `name` and `unit_metrics` must be provided to run a test"
+ "`test_id` or `name` and `unit_metrics` must be provided to run a test"
  )

  if bool(unit_metrics) != bool(name):
  raise ValueError("`name` and `unit_metrics` must be provided together")

+ if input_grid and inputs:
+ raise ValueError("Cannot provide `input_grid` along with `inputs`")

- def validate_grid_inputs(input_grid, kwargs, inputs, param_grid, params):
- """Validate the grid inputs to avoid conflicting parameters."""
- if input_grid and (kwargs or inputs):
- raise ValueError("Cannot provide `input_grid` along with `inputs` or `kwargs`")
+ if param_grid and params:
+ raise ValueError("Cannot provide `param_grid` along with `params`")

- if param_grid and (kwargs or params):
- raise ValueError("Cannot provide `param_grid` along with `params` or `kwargs`")
+ start_time = time.perf_counter()

-
- def generate_composite_test_id(name, test_id):
- """Generate a composite test ID if unit metrics are provided."""
- metric_id_name = "".join(word.capitalize() for word in name.split())
- return f"validmind.composite_metric.{metric_id_name}" or test_id
-
-
- def run_comparison_test_with_grids(
- test_id,
- inputs,
- input_grid,
- param_grid,
- name,
- unit_metrics,
- params,
- output_template,
- show,
- generate_description,
- ):
- """Run a comparison test based on the presence of input and param grids."""
- if input_grid and param_grid:
- return run_comparison_test(
- test_id,
- input_grid,
+ if input_grid or param_grid:
+ result = _run_comparison_test(
+ test_id=test_id,
+ title=title,
  name=name,
  unit_metrics=unit_metrics,
+ inputs=inputs,
+ input_grid=input_grid,
+ params=params,
  param_grid=param_grid,
- output_template=output_template,
- show=show,
  generate_description=generate_description,
  )
- if input_grid:
- return run_comparison_test(
- test_id,
- input_grid,
- name=name,
- unit_metrics=unit_metrics,
+
+ elif unit_metrics:
+ name = "".join(word.capitalize() for word in name.split())
+ test_id = f"validmind.composite_metric.{name}"
+
+ result = _run_composite_test(
+ test_id=test_id,
+ metric_ids=unit_metrics,
+ inputs=inputs,
+ input_grid=input_grid,
  params=params,
- output_template=output_template,
- show=show,
+ param_grid=param_grid,
  generate_description=generate_description,
+ title=title,
  )
- if param_grid:
- return run_comparison_test(
- test_id,
+
+ elif input_grid or param_grid:
+ result = _run_comparison_test(
+ test_id=test_id,
  inputs=inputs,
- name=name,
- unit_metrics=unit_metrics,
+ input_grid=input_grid,
+ params=params,
  param_grid=param_grid,
- output_template=output_template,
- show=show,
  generate_description=generate_description,
+ title=title,
+ )
+
+ else:
+ test_func = load_test(test_id)
+
+ input_kwargs, param_kwargs = _get_test_kwargs(
+ test_func, inputs or {}, params or {}
  )

+ raw_result = test_func(**input_kwargs, **param_kwargs)

- def load_test_class(test_id, unit_metrics, name):
- """Load the appropriate test class based on `test_id` and unit metrics."""
- if unit_metrics:
- metric_id_name = "".join(word.capitalize() for word in name.split())
- error, TestClass = load_composite_metric(
- unit_metrics=unit_metrics, metric_name=metric_id_name
+ result = build_test_result(
+ outputs=raw_result,
+ test_id=test_id,
+ inputs=input_kwargs,
+ params=param_kwargs,
+ description=getdoc(test_func),
+ generate_description=generate_description,
+ title=title,
  )
- if error:
- raise LoadTestError(error)
- return TestClass
- return load_test(test_id, reload=True)
+
+ end_time = time.perf_counter()
+ result.metadata = _get_run_metadata(duration_seconds=end_time - start_time)
+
+ if show:
+ result.show()
+
+ return result
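For reference, the reworked run_test entry point above accepts either plain inputs/params or an input_grid/param_grid for comparison runs, and combines the per-combination results into a single TestResult. Below is a minimal usage sketch, assuming validmind 2.6.8 is installed, datasets have already been registered with vm.init_dataset() under the hypothetical input IDs "train_ds" and "test_ds", and the illustrative parameter name is a placeholder rather than part of this diff:

import validmind as vm
from validmind.tests import run_test

# Single run: inputs map the test's input names to registered input IDs (or objects)
result = run_test(
    "validmind.data_validation.ClassImbalance",
    inputs={"dataset": "train_ds"},  # "train_ds" is a hypothetical input_id
    params={"min_percent_threshold": 10},  # hypothetical parameter for illustration
)

# Comparison run: a dict-form input_grid is expanded to the Cartesian product of the
# listed values, the test runs once per combination, and the outputs are merged into
# one TestResult (see _run_comparison_test above)
comparison = run_test(
    "validmind.data_validation.ClassImbalance",
    input_grid={"dataset": ["train_ds", "test_ds"]},
)

Per the docstring in the diff, a list of dicts can be passed instead of a dict of lists when only specific input or parameter combinations should be run.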