validmind 2.0.1__py3-none-any.whl → 2.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +4 -1
- validmind/__version__.py +1 -1
- validmind/ai.py +197 -0
- validmind/api_client.py +16 -4
- validmind/client.py +23 -3
- validmind/datasets/classification/customer_churn.py +2 -2
- validmind/datasets/nlp/__init__.py +5 -0
- validmind/datasets/nlp/cnn_dailymail.py +98 -0
- validmind/datasets/nlp/datasets/cnn_dailymail_100_with_predictions.csv +255 -0
- validmind/datasets/nlp/datasets/cnn_dailymail_500_with_predictions.csv +1277 -0
- validmind/datasets/nlp/datasets/sentiments_with_predictions.csv +4847 -0
- validmind/errors.py +11 -1
- validmind/models/huggingface.py +2 -2
- validmind/models/pytorch.py +3 -3
- validmind/models/sklearn.py +4 -4
- validmind/tests/__init__.py +47 -9
- validmind/tests/data_validation/DatasetDescription.py +0 -1
- validmind/tests/data_validation/nlp/StopWords.py +1 -6
- validmind/tests/data_validation/nlp/TextDescription.py +20 -9
- validmind/tests/decorator.py +189 -0
- validmind/tests/model_validation/MeteorScore.py +92 -0
- validmind/tests/model_validation/RegardHistogram.py +5 -6
- validmind/tests/model_validation/RegardScore.py +3 -5
- validmind/tests/model_validation/RougeMetrics.py +6 -4
- validmind/tests/model_validation/SelfCheckNLIScore.py +112 -0
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +17 -22
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +3 -1
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +30 -4
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +9 -3
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
- validmind/tests/prompt_validation/ai_powered_test.py +2 -0
- validmind/unit_metrics/__init__.py +0 -2
- validmind/unit_metrics/composite.py +275 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +39 -0
- validmind/unit_metrics/regression/HuberLoss.py +27 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +36 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +22 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +22 -0
- validmind/unit_metrics/regression/QuantileLoss.py +25 -0
- validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +27 -0
- validmind/unit_metrics/regression/sklearn/MeanAbsoluteError.py +22 -0
- validmind/unit_metrics/regression/sklearn/MeanSquaredError.py +22 -0
- validmind/unit_metrics/regression/sklearn/RSquaredScore.py +22 -0
- validmind/unit_metrics/regression/sklearn/RootMeanSquaredError.py +23 -0
- validmind/unit_metrics/sklearn/classification/Accuracy.py +2 -0
- validmind/unit_metrics/sklearn/classification/F1.py +2 -0
- validmind/unit_metrics/sklearn/classification/Precision.py +2 -0
- validmind/unit_metrics/sklearn/classification/ROC_AUC.py +2 -0
- validmind/unit_metrics/sklearn/classification/Recall.py +2 -0
- validmind/utils.py +17 -1
- validmind/vm_models/dataset.py +376 -21
- validmind/vm_models/figure.py +52 -17
- validmind/vm_models/test/metric.py +33 -30
- validmind/vm_models/test/output_template.py +0 -27
- validmind/vm_models/test/result_wrapper.py +57 -24
- validmind/vm_models/test/test.py +2 -1
- validmind/vm_models/test/threshold_test.py +24 -13
- validmind/vm_models/test_context.py +7 -0
- validmind/vm_models/test_suite/runner.py +1 -1
- validmind/vm_models/test_suite/test.py +1 -1
- {validmind-2.0.1.dist-info → validmind-2.0.7.dist-info}/METADATA +9 -13
- {validmind-2.0.1.dist-info → validmind-2.0.7.dist-info}/RECORD +65 -44
- validmind-2.0.7.dist-info/entry_points.txt +3 -0
- {validmind-2.0.1.dist-info → validmind-2.0.7.dist-info}/LICENSE +0 -0
- {validmind-2.0.1.dist-info → validmind-2.0.7.dist-info}/WHEEL +0 -0
validmind/tests/model_validation/RougeMetrics.py

@@ -76,7 +76,6 @@ class RougeMetrics(Metric):
         if r_metrics is None:
             raise ValueError("rouge_metrics must be provided in params")
 
-        # With all
         if not (
             set(self.default_params.get("rouge_metrics")).intersection(r_metrics)
             == set(r_metrics)
@@ -97,12 +96,13 @@ class RougeMetrics(Metric):
 
         metrics_df = pd.DataFrame(score_list)
         figures = []
+
         for m in metrics_df.columns:
             df_scores = pd.DataFrame(metrics_df[m].tolist())
             # Visualization part
             fig = go.Figure()
 
-            # Adding the line plots
+            # Adding the line plots for precision, recall, and F1-score with lines and markers
             fig.add_trace(
                 go.Scatter(
                     x=df_scores.index,
@@ -129,11 +129,13 @@ class RougeMetrics(Metric):
             )
 
             fig.update_layout(
-                title="ROUGE Scores for
+                title=f"ROUGE Scores for {m}",
                 xaxis_title="Row Index",
                 yaxis_title="Score",
             )
-
+
+            # Ensure a unique key for each metric
+            k = f"{m.replace('-', '')}_{len(figures)}"
             figures.append(
                 Figure(
                     for_object=self,
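For context on the last hunk: each ROUGE metric now gets a unique figure key built from the metric name (dashes stripped) plus the running figure count. A minimal standalone sketch of that key scheme, using an example metric name:

```python
# Sketch of the new figure-key scheme: dash stripped from the metric name,
# suffixed with the number of figures collected so far.
m = "rouge-1"  # example metric column name
figures = []   # no figures collected yet for this metric
k = f"{m.replace('-', '')}_{len(figures)}"
print(k)  # -> "rouge1_0"
```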
validmind/tests/model_validation/SelfCheckNLIScore.py (new file)

@@ -0,0 +1,112 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import itertools
+from dataclasses import dataclass
+
+import pandas as pd
+import plotly.graph_objects as go
+import torch
+from selfcheckgpt.modeling_selfcheck import SelfCheckNLI
+from tqdm import tqdm
+
+from validmind.vm_models import Figure, Metric
+
+
+@dataclass
+class SelfCheckNLIScore(Metric):
+    """
+    Evaluates text generation models' performance by quantifying the level of hallucination in generated texts compared to reference texts.
+
+    **Purpose**: The HallucinationScore metric is designed to assess the factual accuracy and reliability of text generated by models, focusing on the detection and quantification of hallucinations—instances where generated content deviates from factual or expected outputs. By comparing generated texts against reference texts, this metric highlights discrepancies indicative of hallucinations, offering insights into the model's ability to produce contextually and factually coherent content.
+
+    **Test Mechanism**: To compute the HallucinationScore, the metric employs a comparison between the generated texts (model predictions) and the provided reference texts (true values). Using the SelfCheckNLI model, it evaluates each generated text's level of factual congruence with the reference, assigning a hallucination score based on the semantic coherence and factual accuracy. The scores for each text instance are then visualized in a line plot, allowing for the examination of hallucination trends across the dataset.
+
+    **Signs of High Risk**:
+    - High hallucination scores across a significant portion of the dataset, indicating a prevalence of factually inaccurate or irrelevant content generation.
+    - Patterns of consistent hallucination in specific contexts or subjects, suggesting gaps in the model's understanding or knowledge.
+    - Sharp fluctuations in hallucination scores, which may reveal inconsistencies in the model's performance or sensitivity to certain types of input.
+
+    **Strengths**:
+    - Directly addresses the critical aspect of factual accuracy in generated text, beyond mere linguistic or stylistic coherence.
+    - Provides a granular, instance-by-instance analysis of model performance, allowing for targeted improvements and diagnostics.
+    - Facilitates a deeper understanding of a model's capabilities and limitations in producing reliable and accurate content.
+
+    **Limitations**:
+    - Reliance on the SelfCheckNLI model means the accuracy and effectiveness of the HallucinationScore are contingent upon the performance and suitability of the underlying NLI model.
+    - May not fully capture the subtleties of certain factual inaccuracies or the contextual relevance of reference texts, especially in complex or nuanced domains.
+    - Potentially resource-intensive, given the computational demands of running advanced NLI models for large datasets.
+    """
+
+    name = "self_check_nli_score"
+    required_inputs = ["model", "dataset"]
+
+    def run(self):
+        # Assuming the dataset is structured with generated sentences and reference samples
+        y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
+        y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+
+        hallucination_scores = self.compute_hallucination_scores(y_pred, y_true)
+
+        # Visualization of scores
+        figures = self.visualize_scores(hallucination_scores)
+
+        return self.cache_results(figures=figures)
+
+    def compute_hallucination_scores(self, predictions, references):
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        selfcheck_nli = SelfCheckNLI(device=device)
+        hallucination_scores = []
+
+        print("Starting hallucination score computation...")
+
+        for index, (sentences, samples) in enumerate(
+            tqdm(zip(predictions, references), total=len(predictions))
+        ):
+            sent_scores_nli = selfcheck_nli.predict(
+                sentences=sentences, sampled_passages=samples
+            )
+
+            # Compute the mean of the hallucination scores for this row
+            average_score = sent_scores_nli.mean()
+            hallucination_scores.append(average_score)
+
+            # Print a progress update for each row
+            print(
+                f"Row {index + 1}/{len(predictions)}: Average hallucination score: {average_score}"
+            )
+
+        print("Completed hallucination score computation.")
+
+        return hallucination_scores
+
+    def visualize_scores(self, scores):
+        scores_df = pd.DataFrame(scores, columns=["Hallucination Score"])
+
+        fig = go.Figure()
+        fig.add_trace(
+            go.Scatter(
+                x=scores_df.index,
+                y=scores_df["Hallucination Score"],
+                mode="lines+markers",
+                name="Hallucination Score",
+            )
+        )
+
+        fig.update_layout(
+            title="Hallucination Scores Across Text Instances",
+            xaxis_title="Text Instance Index",
+            yaxis_title="Hallucination Score",
+        )
+
+        # Wrapping the plotly figure for compatibility with your framework might be needed
+        figures = [
+            Figure(
+                for_object=self,
+                key=self.key,
+                figure=fig,
+            )
+        ]
+
+        return figures
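The new test averages sentence-level NLI contradiction scores into one hallucination score per row. A minimal sketch of that aggregation is below; it swaps in a stand-in scorer (the hypothetical `fake_nli_scores`) so it runs without downloading selfcheckgpt's NLI checkpoint, and the example sentences are illustrative only.

```python
# Per-row aggregation as done in compute_hallucination_scores, with a stub
# in place of SelfCheckNLI.predict() so the sketch is self-contained.
import numpy as np

def fake_nli_scores(sentences, sampled_passages):
    # Stand-in scorer: returns one contradiction score per generated sentence.
    rng = np.random.default_rng(0)
    return rng.random(len(sentences))

predictions = [["The sky is green.", "Water boils at 100C."]]  # generated sentences per row
references = [["The sky is blue on a clear day."]]             # sampled reference passages per row

row_scores = []
for sentences, samples in zip(predictions, references):
    sent_scores = fake_nli_scores(sentences, sampled_passages=samples)
    row_scores.append(sent_scores.mean())  # one hallucination score per dataset row

print(row_scores)
```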
validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py

@@ -59,30 +59,25 @@ class DescriptiveAnalytics(Metric):
         }
 
     def run(self):
-
-
-
+        # Assuming y_pred returns a 2D array of embeddings [samples, features]
+        preds = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+
+        # Calculate statistics across the embedding dimensions, not across all embeddings
+        means = np.mean(preds, axis=0)  # Mean of each feature across all samples
+        medians = np.median(preds, axis=0)  # Median of each feature across all samples
+        stds = np.std(preds, axis=0)  # Std. dev. of each feature across all samples
+
+        # Plot histograms of the calculated statistics
+        mean_fig = px.histogram(x=means, title="Distribution of Embedding Means")
+        median_fig = px.histogram(x=medians, title="Distribution of Embedding Medians")
+        std_fig = px.histogram(
+            x=stds, title="Distribution of Embedding Standard Deviations"
+        )
 
         return self.cache_results(
             figures=[
-                Figure(
-
-
-                    figure=px.histogram(mean, title="Distribution of Embedding Means"),
-                ),
-                Figure(
-                    for_object=self,
-                    key=self.key,
-                    figure=px.histogram(
-                        median, title="Distribution of Embedding Medians"
-                    ),
-                ),
-                Figure(
-                    for_object=self,
-                    key=self.key,
-                    figure=px.histogram(
-                        std, title="Distribution of Embedding Standard Deviations"
-                    ),
-                ),
+                Figure(for_object=self, key=f"{self.key}_mean", figure=mean_fig),
+                Figure(for_object=self, key=f"{self.key}_median", figure=median_fig),
+                Figure(for_object=self, key=f"{self.key}_std", figure=std_fig),
             ],
         )
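The rewritten `run()` computes one statistic per embedding dimension rather than one per embedding. A small sketch of what `axis=0` does here, with a randomly generated stand-in for the prediction matrix:

```python
# With a [samples, features] matrix, axis=0 reduces over samples, giving one
# mean/median/std per embedding dimension (the values plotted as histograms).
import numpy as np

preds = np.random.default_rng(0).normal(size=(500, 384))  # stand-in embeddings

means = np.mean(preds, axis=0)
medians = np.median(preds, axis=0)
stds = np.std(preds, axis=0)

print(means.shape, medians.shape, stds.shape)  # (384,) (384,) (384,)
```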
validmind/tests/model_validation/sklearn/ClassifierPerformance.py

@@ -131,7 +131,9 @@ class ClassifierPerformance(Metric):
         y_true = self.y_true()
         class_pred = self.y_pred()
 
-        report = metrics.classification_report(
+        report = metrics.classification_report(
+            y_true, class_pred, output_dict=True, zero_division=0
+        )
         report["roc_auc"] = multiclass_roc_auc_score(y_true, class_pred)
 
         return self.cache_results(report)
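The `zero_division=0` argument matters when a class never appears in the predictions: precision for that class is undefined, and scikit-learn would otherwise emit an `UndefinedMetricWarning`. A quick illustration:

```python
# classification_report with zero_division=0 returns 0.0 for undefined scores
# instead of warning, which keeps the metric deterministic on degenerate folds.
from sklearn import metrics

y_true = [0, 0, 1, 1]
y_pred = [0, 0, 0, 0]  # class 1 is never predicted

report = metrics.classification_report(y_true, y_pred, output_dict=True, zero_division=0)
print(report["1"]["precision"])  # 0.0, no UndefinedMetricWarning
```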
validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py

@@ -8,6 +8,7 @@ from dataclasses import dataclass
 import matplotlib.pyplot as plt
 import shap
 
+from validmind.errors import UnsupportedModelForSHAPError
 from validmind.logging import get_logger
 from validmind.vm_models import Figure, Metric
 
@@ -72,6 +73,9 @@ class SHAPGlobalImportance(Metric):
             "visualization",
         ],
     }
+    default_params = {
+        "kernel_explainer_samples": 10,
+    }
 
     def _generate_shap_plot(self, type_, shap_values, x_test):
         """
@@ -127,22 +131,44 @@ class SHAPGlobalImportance(Metric):
             model_class == "XGBClassifier"
             or model_class == "RandomForestClassifier"
             or model_class == "CatBoostClassifier"
+            or model_class == "DecisionTreeClassifier"
         ):
             explainer = shap.TreeExplainer(trained_model)
         elif (
             model_class == "LogisticRegression"
             or model_class == "XGBRegressor"
             or model_class == "LinearRegression"
+            or model_class == "LinearSVC"
         ):
             explainer = shap.LinearExplainer(trained_model, self.inputs.dataset.x)
+        elif model_class == "SVC":
+            # KernelExplainer is slow so we use shap.sample to speed it up
+            explainer = shap.KernelExplainer(
+                trained_model.predict,
+                shap.sample(
+                    self.inputs.dataset.x,
+                    self.params["kernel_explainer_samples"],
+                ),
+            )
+        else:
+            raise UnsupportedModelForSHAPError(
+                f"Model {model_class} not supported for SHAP importance."
+            )
+
+        # KernelExplainer is slow so we use shap.sample to speed it up
+        if isinstance(explainer, shap.KernelExplainer):
+            shap_sample = shap.sample(
+                self.inputs.dataset.x,
+                self.params["kernel_explainer_samples"],
+            )
         else:
-
+            shap_sample = self.inputs.dataset.x
 
-        shap_values = explainer.shap_values(
+        shap_values = explainer.shap_values(shap_sample)
 
         figures = [
-            self._generate_shap_plot("mean", shap_values,
-            self._generate_shap_plot("summary", shap_values,
+            self._generate_shap_plot("mean", shap_values, shap_sample),
+            self._generate_shap_plot("summary", shap_values, shap_sample),
         ]
 
         # restore warnings
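A hedged sketch of the new `SVC` branch in isolation: `shap.KernelExplainer` over a small background sample, mirroring the `kernel_explainer_samples` default of 10. The toy dataset and model below are illustrative, not the metric's own plotting pipeline.

```python
# KernelExplainer is model-agnostic but slow, so both the background set and
# the explained rows are subsampled with shap.sample (10 rows here).
import shap
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
model = SVC().fit(X, y)

background = shap.sample(X, 10)
explainer = shap.KernelExplainer(model.predict, background)
shap_values = explainer.shap_values(shap.sample(X, 10))

print(shap_values.shape)  # (10, 5): one attribution per sampled row and feature
```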
validmind/tests/model_validation/sklearn/TrainingTestDegradation.py

@@ -129,12 +129,12 @@ class TrainingTestDegradation(ThresholdTest):
         y_test_true = y_test_true.astype(y_test_pred.dtype)
 
         report_train = metrics.classification_report(
-            y_train_true, y_train_pred, output_dict=True
+            y_train_true, y_train_pred, output_dict=True, zero_division=0
         )
         report_train["roc_auc"] = multiclass_roc_auc_score(y_train_true, y_train_pred)
 
         report_test = metrics.classification_report(
-            y_test_true, y_test_pred, output_dict=True
+            y_test_true, y_test_pred, output_dict=True, zero_division=0
         )
         report_test["roc_auc"] = multiclass_roc_auc_score(y_test_true, y_test_pred)
 
@@ -145,7 +145,13 @@ class TrainingTestDegradation(ThresholdTest):
         for metric_name in ["precision", "recall", "f1-score"]:
             train_score = report_train[class_name][metric_name]
             test_score = report_test[class_name][metric_name]
-
+
+            # If training score is 0, degradation is assumed to be 100%
+            if train_score == 0:
+                degradation = 1.0
+            else:
+                degradation = (train_score - test_score) / train_score
+
             passed = degradation < self.params["max_threshold"]
             test_results.append(
                 ThresholdTestResult(
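The degradation rule introduced above, written as a standalone helper (hypothetical name) to make the zero-train-score convention explicit:

```python
# Relative drop from train to test; a train score of 0 is treated as full
# (100%) degradation instead of raising ZeroDivisionError.
def degradation(train_score: float, test_score: float) -> float:
    if train_score == 0:
        return 1.0
    return (train_score - test_score) / train_score

print(degradation(0.90, 0.81))  # ~0.10, a 10% relative drop
print(degradation(0.0, 0.5))    # 1.0 by convention
```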
validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py

@@ -79,7 +79,7 @@ class RegressionModelsPerformance(Metric):
     def sample_performance_ols(self, models, datasets):
         evaluation_results = []
 
-        for
+        for model, dataset in zip(models, datasets):
             X_columns = dataset.get_features_columns()
             y_true = dataset.y
             y_pred = dataset.y_pred(model.input_id)
validmind/unit_metrics/composite.py (new file)

@@ -0,0 +1,275 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import ast
+import inspect
+from dataclasses import dataclass
+from typing import List
+from uuid import uuid4
+
+from ..errors import LoadTestError
+from ..logging import get_logger
+from ..utils import clean_docstring, run_async, test_id_to_name
+from ..vm_models.test.metric import Metric
+from ..vm_models.test.metric_result import MetricResult
+from ..vm_models.test.result_summary import ResultSummary, ResultTable
+from ..vm_models.test.result_wrapper import MetricResultWrapper
+from . import _get_metric_class, run_metric
+
+logger = get_logger(__name__)
+
+
+def _extract_class_methods(cls):
+    source = inspect.getsource(cls)
+    tree = ast.parse(source)
+
+    class MethodVisitor(ast.NodeVisitor):
+        def __init__(self):
+            self.methods = {}
+
+        def visit_FunctionDef(self, node):
+            self.methods[node.name] = node
+            self.generic_visit(node)
+
+    visitor = MethodVisitor()
+    visitor.visit(tree)
+
+    return visitor.methods
+
+
+def _extract_required_inputs(cls):
+    methods = _extract_class_methods(cls)
+
+    class Visitor(ast.NodeVisitor):
+        def __init__(self):
+            self.properties = set()
+            self.visited_methods = set()
+
+        def visit_Attribute(self, node):
+            if isinstance(node.value, ast.Attribute) and node.value.attr == "inputs":
+                self.properties.add(node.attr)
+
+            self.generic_visit(node)
+
+        def visit_Call(self, node):
+            if isinstance(node.func, ast.Attribute) and isinstance(
+                node.func.value, ast.Name
+            ):
+                if node.func.value.id == "self" and node.func.attr in methods:
+                    method_name = node.func.attr
+
+                    if method_name not in self.visited_methods:
+                        self.visited_methods.add(method_name)
+                        self.visit(methods[method_name])
+
+            self.generic_visit(node)
+
+    visitor = Visitor()
+    visitor.visit(methods["run"])
+
+    return visitor.properties
+
+
+@dataclass
+class CompositeMetric(Metric):
+    unit_metrics: List[str] = None
+
+    def __post_init__(self):
+        if self._unit_metrics:
+            self.unit_metrics = self._unit_metrics
+        elif self.unit_metrics is None:
+            raise ValueError("unit_metrics must be provided")
+
+        if hasattr(self, "_output_template") and self._output_template:
+            self.output_template = self._output_template
+
+    def run(self):
+        self.result = run_metrics(
+            test_id=self.test_id,
+            metric_ids=self.unit_metrics,
+            description=self.description(),
+            inputs=self._get_input_dict(),
+            params=self.params,
+            output_template=self.output_template,
+            show=False,
+        )
+
+        return self.result
+
+    def summary(self, result: dict):
+        return ResultSummary(results=[ResultTable(data=[result])])
+
+
+def load_composite_metric(
+    test_id: str = None,
+    metric_name: str = None,
+    unit_metrics: List[str] = None,
+    output_template: str = None,
+) -> CompositeMetric:
+    # this function can either create a composite metric from a list of unit metrics or
+    # load a stored composite metric based on the test id
+
+    # TODO: figure out this circular import thing:
+    from ..api_client import get_metadata
+
+    if test_id:
+        # get the unit metric ids and output template (if any) from the metadata
+        try:
+            unit_metrics = run_async(
+                get_metadata, f"composite_metric_def:{test_id}:unit_metrics"
+            )["json"]
+            output_template = run_async(
+                get_metadata, f"composite_metric_def:{test_id}:output_template"
+            )["json"]["output_template"]
+        except Exception:
+            logger.error(f"Could not load composite metric {test_id}")
+            raise LoadTestError(f"Could not load composite metric {test_id}")
+
+    description = f"""
+    Composite metric built from the following unit metrics:
+    {', '.join([metric_id.split('.')[-1] for metric_id in unit_metrics])}
+    """
+
+    class_def = type(
+        test_id.split(".")[-1] if test_id else metric_name,
+        (CompositeMetric,),
+        {
+            "__doc__": description,
+            "_unit_metrics": unit_metrics,
+            "_output_template": output_template,
+        },
+    )
+
+    required_inputs = set()
+    for metric_id in unit_metrics:
+        metric_cls = _get_metric_class(metric_id)
+        # required_inputs.update(_extract_required_inputs(metric_cls))
+        required_inputs.update(metric_cls.required_inputs or [])
+
+    class_def.required_inputs = list(required_inputs)
+
+    return class_def
+
+
+def run_metrics(
+    name: str = None,
+    metric_ids: List[str] = None,
+    description: str = None,
+    output_template: str = None,
+    inputs: dict = None,
+    params: dict = None,
+    test_id: str = None,
+    show: bool = True,
+) -> MetricResultWrapper:
+    """Run a composite metric
+
+    Composite metrics are metrics that are composed of multiple unit metrics. This
+    works by running individual unit metrics and then combining the results into a
+    single "MetricResult" object that can be logged and displayed just like any other
+    metric result. The special thing about composite metrics is that when they are
+    logged to the platform, metadata describing the unit metrics and output template
+    used to generate the composite metric is also logged. This means that by grabbing
+    the metadata for a composite metric (identified by the test ID
+    `validmind.composite_metric.<name>`) the framework can rebuild and rerun it at
+    any time.
+
+    Args:
+        name (str, optional): Name of the composite metric. Required if test_id is not
+            provided. Defaults to None.
+        metric_ids (list[str]): List of unit metric IDs to run. Required.
+        description (str, optional): Description of the composite metric. Defaults to
+            None.
+        output_template (_type_, optional): Output template to customize the result
+            table.
+        inputs (_type_, optional): Inputs to pass to the unit metrics. Defaults to None
+        params (_type_, optional): Parameters to pass to the unit metrics. Defaults to
+            None.
+        test_id (str, optional): Test ID of the composite metric. Required if name is
+            not provided. Defaults to None.
+        show (bool, optional): Whether to show the result immediately. Defaults to True
+
+    Raises:
+        ValueError: If metric_ids is not provided
+        ValueError: If name or key is not provided
+
+    Returns:
+        MetricResultWrapper: The result wrapper object
+    """
+    if not metric_ids:
+        raise ValueError("metric_ids must be provided")
+
+    if not name and not test_id:
+        raise ValueError("name or key must be provided")
+
+    # if name is provided, make sure to squash it into a camel case string
+    if name:
+        name = "".join(word[0].upper() + word[1:] for word in name.split())
+
+    results = {}
+
+    for metric_id in metric_ids:
+        result = run_metric(
+            metric_id=metric_id,
+            inputs=inputs,
+            params=params,
+        )
+        results[list(result.summary.keys())[0]] = result.value
+
+    test_id = f"validmind.composite_metric.{name}" if not test_id else test_id
+
+    if not output_template:
+
+        def row(key):
+            return f"""
+            <tr>
+                <td><strong>{key.upper()}</strong></td>
+                <td>{{{{ value['{key}'] | number }}}}</td>
+            </tr>
+            """
+
+        output_template = f"""
+        <h1>{test_id_to_name(test_id)}</h1>
+        <table>
+            <thead>
+                <tr>
+                    <th>Metric</th>
+                    <th>Value</th>
+                </tr>
+            </thead>
+            <tbody>
+                {"".join([row(key) for key in results.keys()])}
+            </tbody>
+        </table>
+        """
+
+    result_wrapper = MetricResultWrapper(
+        result_id=test_id,
+        result_metadata=[
+            {
+                "content_id": f"metric_description:{test_id}",
+                "text": clean_docstring(description),
+            },
+            {
+                "content_id": f"composite_metric_def:{test_id}:unit_metrics",
+                "json": metric_ids,
+            },
+            {
+                "content_id": f"composite_metric_def:{test_id}:output_template",
+                "json": {"output_template": output_template},
+            },
+        ],
+        inputs=list(inputs.keys()),
+        output_template=output_template,
+        metric=MetricResult(
+            key=test_id,
+            ref_id=str(uuid4()),
+            value=results,
+            summary=ResultSummary(results=[ResultTable(data=[results])]),
+        ),
+    )
+
+    if show:
+        result_wrapper.show()
+
+    return result_wrapper
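One piece of the new module that is easy to exercise on its own is the AST walk behind `_extract_required_inputs`: it parses a metric class's source and collects every `self.inputs.<attr>` access. A simplified, runnable sketch (run as a script, since `inspect.getsource` needs a source file; the `ToyMetric` class is hypothetical, and the real helper also follows calls into other methods of the class):

```python
# Collect the attributes accessed as self.inputs.<attr> inside a run() method,
# which is how composite metrics infer their required inputs.
import ast
import inspect

class ToyMetric:
    def run(self):
        y_true = self.inputs.dataset.y
        y_pred = self.inputs.model.predict(self.inputs.dataset.x)
        return y_true, y_pred

class InputVisitor(ast.NodeVisitor):
    def __init__(self):
        self.properties = set()

    def visit_Attribute(self, node):
        # Matches the outer attribute of a self.inputs.<attr> chain.
        if isinstance(node.value, ast.Attribute) and node.value.attr == "inputs":
            self.properties.add(node.attr)
        self.generic_visit(node)

visitor = InputVisitor()
visitor.visit(ast.parse(inspect.getsource(ToyMetric)))
print(visitor.properties)  # {'dataset', 'model'}
```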
validmind/unit_metrics/regression/GiniCoefficient.py (new file)

@@ -0,0 +1,39 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from dataclasses import dataclass
+
+import numpy as np
+
+from validmind.vm_models import UnitMetric
+
+
+@dataclass
+class GiniCoefficient(UnitMetric):
+    required_inputs = ["dataset", "model"]
+
+    def run(self):
+        y_true = self.inputs.dataset.y
+        y_pred = self.inputs.dataset.y_pred(model_id=self.inputs.model.input_id)
+
+        # Sort true values and corresponding predicted values
+        idx = np.argsort(y_true)
+        y_true_sorted = y_true[idx]
+        y_pred_sorted = y_pred[idx]
+
+        # Compute cumulative sums
+        cumsum_true = np.cumsum(y_true_sorted)
+        cumsum_pred = np.cumsum(y_pred_sorted)
+
+        # Normalize cumulative sums
+        cumsum_true_norm = cumsum_true / np.max(cumsum_true)
+        cumsum_pred_norm = cumsum_pred / np.max(cumsum_pred)
+
+        # Compute area under the Lorenz curve
+        area_lorenz = np.trapz(cumsum_pred_norm, x=cumsum_true_norm)
+
+        # Compute Gini coefficient
+        gini_coeff = 1 - 2 * area_lorenz
+
+        return self.cache_results(metric_value=gini_coeff)
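A standalone check of the Lorenz-style computation above (same steps, plain NumPy): when the predictions equal the targets the two cumulative curves coincide with the diagonal and the coefficient is close to zero, while rank-reversed predictions push it strongly negative.

```python
# Mirrors the unit metric: sort by y_true, normalize both cumulative sums,
# integrate predicted-vs-true with np.trapz, Gini = 1 - 2 * area.
import numpy as np

def gini_from_predictions(y_true, y_pred):
    idx = np.argsort(y_true)
    cumsum_true = np.cumsum(y_true[idx])
    cumsum_pred = np.cumsum(y_pred[idx])
    area = np.trapz(cumsum_pred / cumsum_pred.max(), x=cumsum_true / cumsum_true.max())
    return 1 - 2 * area

y_true = np.array([1.0, 2.0, 3.0, 4.0, 10.0])
print(gini_from_predictions(y_true, y_true))        # ~0.0: curves coincide
print(gini_from_predictions(y_true, y_true[::-1]))  # negative: reversed ranking
```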
validmind/unit_metrics/regression/HuberLoss.py (new file)

@@ -0,0 +1,27 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from dataclasses import dataclass
+
+import numpy as np
+
+from validmind.vm_models import UnitMetric
+
+
+@dataclass
+class HuberLoss(UnitMetric):
+    required_inputs = ["dataset", "model"]
+
+    def run(self):
+        y_true = self.inputs.dataset.y
+        y_pred = self.inputs.dataset.y_pred(model_id=self.inputs.model.input_id)
+
+        # delta - Threshold for the squared error to be linear or quadratic.
+        delta = 1.0
+        error = y_true - y_pred
+        quadratic_part = np.minimum(np.abs(error), delta)
+        linear_part = np.abs(error) - quadratic_part
+        value = np.mean(0.5 * quadratic_part**2 + delta * linear_part)
+
+        return self.cache_results(metric_value=value)
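A quick sanity check that the vectorized expression above matches the textbook piecewise Huber definition (0.5*e^2 for |e| <= delta, delta*(|e| - 0.5*delta) otherwise):

```python
# Both forms agree: the min/abs decomposition vs. the explicit piecewise definition.
import numpy as np

def huber_vectorized(y_true, y_pred, delta=1.0):
    error = y_true - y_pred
    quadratic_part = np.minimum(np.abs(error), delta)
    linear_part = np.abs(error) - quadratic_part
    return np.mean(0.5 * quadratic_part**2 + delta * linear_part)

def huber_piecewise(y_true, y_pred, delta=1.0):
    e = np.abs(y_true - y_pred)
    return np.mean(np.where(e <= delta, 0.5 * e**2, delta * (e - 0.5 * delta)))

y_true = np.array([3.0, -0.5, 2.0, 7.0])
y_pred = np.array([2.5, 0.0, 2.0, 8.5])

assert np.isclose(huber_vectorized(y_true, y_pred), huber_piecewise(y_true, y_pred))
print(huber_vectorized(y_true, y_pred))  # 0.3125
```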
|