validmind 2.2.6__py3-none-any.whl → 2.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/{ai.py → ai/test_descriptions.py} +74 -82
- validmind/ai/utils.py +104 -0
- validmind/api_client.py +58 -19
- validmind/client.py +5 -5
- validmind/models/foundation.py +10 -6
- validmind/models/function.py +3 -1
- validmind/models/metadata.py +1 -1
- validmind/test_suites/__init__.py +1 -7
- validmind/test_suites/regression.py +0 -16
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/tests/data_validation/ACFandPACFPlot.py +36 -27
- validmind/tests/{model_validation/statsmodels → data_validation}/ADF.py +42 -13
- validmind/tests/data_validation/BivariateScatterPlots.py +38 -41
- validmind/tests/{model_validation/statsmodels → data_validation}/DFGLSArch.py +67 -11
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +1 -1
- validmind/tests/data_validation/HighPearsonCorrelation.py +12 -3
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/{model_validation/statsmodels → data_validation}/KPSS.py +64 -11
- validmind/tests/{model_validation/statsmodels → data_validation}/PhillipsPerronArch.py +65 -11
- validmind/tests/data_validation/ScatterPlot.py +1 -1
- validmind/tests/data_validation/SeasonalDecompose.py +12 -7
- validmind/tests/data_validation/TabularDateTimeHistograms.py +29 -33
- validmind/tests/data_validation/WOEBinPlots.py +1 -1
- validmind/tests/data_validation/WOEBinTable.py +1 -1
- validmind/tests/{model_validation/statsmodels → data_validation}/ZivotAndrewsArch.py +65 -11
- validmind/tests/data_validation/nlp/CommonWords.py +1 -1
- validmind/tests/data_validation/nlp/Hashtags.py +1 -1
- validmind/tests/data_validation/nlp/Mentions.py +1 -1
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -1
- validmind/tests/data_validation/nlp/Punctuations.py +1 -1
- validmind/tests/data_validation/nlp/Sentiment.py +1 -1
- validmind/tests/data_validation/nlp/TextDescription.py +5 -1
- validmind/tests/data_validation/nlp/Toxicity.py +1 -1
- validmind/tests/decorator.py +1 -1
- validmind/tests/model_validation/FeaturesAUC.py +5 -3
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +4 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +4 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +4 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +4 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +4 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +3 -3
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +14 -8
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +3 -4
- validmind/tests/model_validation/ragas/ContextPrecision.py +4 -5
- validmind/tests/model_validation/ragas/ContextRecall.py +3 -4
- validmind/tests/model_validation/ragas/ContextRelevancy.py +5 -4
- validmind/tests/model_validation/ragas/Faithfulness.py +6 -5
- validmind/tests/model_validation/ragas/utils.py +35 -9
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +1 -1
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +6 -8
- validmind/tests/model_validation/sklearn/RegressionErrors.py +1 -1
- validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +14 -8
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -1
- validmind/tests/model_validation/statsmodels/GINITable.py +1 -1
- validmind/tests/model_validation/statsmodels/JarqueBera.py +1 -1
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +1 -1
- validmind/tests/model_validation/statsmodels/LJungBox.py +1 -1
- validmind/tests/model_validation/statsmodels/Lilliefors.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +4 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +9 -4
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -2
- validmind/tests/model_validation/statsmodels/RunsTest.py +1 -1
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +1 -1
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +14 -11
- validmind/tests/prompt_validation/Conciseness.py +14 -11
- validmind/tests/prompt_validation/Delimitation.py +14 -11
- validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
- validmind/tests/prompt_validation/Robustness.py +11 -11
- validmind/tests/prompt_validation/Specificity.py +14 -11
- validmind/tests/prompt_validation/ai_powered_test.py +53 -75
- validmind/unit_metrics/composite.py +2 -1
- validmind/utils.py +4 -63
- validmind/vm_models/dataset/dataset.py +17 -3
- validmind/vm_models/dataset/utils.py +2 -2
- validmind/vm_models/model.py +1 -1
- validmind/vm_models/test/metric.py +1 -8
- validmind/vm_models/test/result_wrapper.py +2 -2
- validmind/vm_models/test/test.py +3 -0
- validmind/vm_models/test/threshold_test.py +1 -1
- validmind/vm_models/test_suite/runner.py +7 -4
- {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/METADATA +1 -1
- {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/RECORD +92 -101
- validmind/tests/data_validation/DefaultRatesbyRiskBandPlot.py +0 -114
- validmind/tests/data_validation/PiTCreditScoresHistogram.py +0 -150
- validmind/tests/data_validation/PiTPDHistogram.py +0 -152
- validmind/tests/model_validation/statsmodels/ADFTest.py +0 -88
- validmind/tests/model_validation/statsmodels/FeatureImportanceAndSignificance.py +0 -198
- validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py +0 -151
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +0 -146
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +0 -144
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +0 -127
- validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py +0 -130
- {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/LICENSE +0 -0
- {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/WHEEL +0 -0
- {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/entry_points.txt +0 -0

validmind/tests/model_validation/ragas/Faithfulness.py

```diff
@@ -11,7 +11,7 @@ from ragas.metrics import faithfulness
 
 from validmind import tags, tasks
 
-from .utils import get_renamed_columns
+from .utils import get_ragas_config, get_renamed_columns
 
 
 @tags("ragas", "llm", "rag_performance")
@@ -20,7 +20,7 @@ def Faithfulness(
     dataset,
     answer_column="answer",
     contexts_column="contexts",
-):
+):  # noqa
     """
     Evaluates the faithfulness of the generated answers with respect to retrieved contexts.
 
@@ -93,8 +93,7 @@ def Faithfulness(
     df = get_renamed_columns(dataset.df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
-        metrics=[faithfulness],
+        Dataset.from_pandas(df), metrics=[faithfulness], **get_ragas_config()
     ).to_pandas()
 
     fig_histogram = px.histogram(x=result_df["faithfulness"].to_list(), nbins=10)
@@ -102,7 +101,9 @@ def Faithfulness(
 
     return (
         {
-            "Scores": result_df[
+            "Scores (will not be uploaded to UI)": result_df[
+                ["contexts", "answer", "faithfulness"]
+            ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["faithfulness"].mean(),
```
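The `**get_ragas_config()` expansion fills the `llm` and `embeddings` keyword arguments of ragas' `evaluate()`, using the config helper added to the ragas `utils.py` shown next. A minimal sketch of the same call outside the test harness, assuming an OpenAI-compatible endpoint is already configured for the library; the single-row dataset and its values are illustrative only:

```python
import pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness

from validmind.tests.model_validation.ragas.utils import get_ragas_config

# Illustrative single-row dataset; the real test builds this from a VMDataset.
df = pd.DataFrame(
    {
        "question": ["What currency does France use?"],
        "answer": ["France uses the euro."],
        "contexts": [["France is a member of the eurozone and uses the euro."]],
    }
)

# get_ragas_config() returns {"llm": ChatOpenAI(...), "embeddings": OpenAIEmbeddings(...)}
# wired to the client/model from validmind.ai.utils.get_client_and_model(), so the
# **-expansion below supplies evaluate()'s llm and embeddings keyword arguments.
result_df = evaluate(
    Dataset.from_pandas(df), metrics=[faithfulness], **get_ragas_config()
).to_pandas()

print(result_df["faithfulness"].mean())
```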
validmind/tests/model_validation/ragas/utils.py

```diff
@@ -2,17 +2,42 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
+import os
 
-
-    if not isinstance(x, dict):
-        raise TypeError(f"Expected a dictionary in column '{root_col}', got {type(x)}.")
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 
-
-        raise KeyError(
-            f"Sub-column '{sub_col}' not found in dictionary in column '{root_col}'."
-        )
+from validmind.ai.utils import get_client_and_model
 
-
+EMBEDDINGS_MODEL = "text-embedding-3-small"
+
+
+def get_ragas_config():
+    client, model = get_client_and_model()
+    os.environ["OPENAI_API_BASE"] = str(client.base_url)
+
+    return {
+        "llm": ChatOpenAI(api_key=client.api_key, model=model),
+        "embeddings": OpenAIEmbeddings(api_key=client.api_key, model=EMBEDDINGS_MODEL),
+    }
+
+
+def make_sub_col_udf(root_col, sub_col):
+    """Create a udf that extracts sub-column values from a dictionary."""
+
+    def _udf_get_sub_col(x):
+        if not isinstance(x, dict):
+            raise TypeError(
+                f"Expected a dictionary in column '{root_col}', got {type(x)}."
+            )
+
+        if sub_col not in x:
+            raise KeyError(
+                f"Sub-column '{sub_col}' not found in dictionary in column '{root_col}'."
+            )
+
+        return x[sub_col]
+
+    return _udf_get_sub_col
 
 
 def get_renamed_columns(df, column_map):
@@ -34,6 +59,7 @@ def get_renamed_columns(df, column_map):
     Returns:
         pd.DataFrame: The DataFrame with columns renamed.
     """
+
     new_df = df.copy()
 
     for new_name, source in column_map.items():
@@ -50,7 +76,7 @@ def get_renamed_columns(df, column_map):
 
         if root_col in new_df.columns:
             new_df[new_name] = new_df[root_col].apply(
-
+                make_sub_col_udf(root_col, sub_col)
             )
 
         else:
```
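For reference, the new `make_sub_col_udf` helper can be exercised directly on a pandas column of dictionaries. A small sketch, with a made-up frame and column names (the import path is taken from the file listing above):

```python
import pandas as pd

from validmind.tests.model_validation.ragas.utils import make_sub_col_udf

# Hypothetical frame where the value we need is nested inside a dict column.
df = pd.DataFrame({"meta": [{"ground_truth": "The euro is the currency of France."}]})

# make_sub_col_udf builds a per-cell extractor; it raises TypeError for non-dict
# cells and KeyError when the requested sub-column key is missing.
extract = make_sub_col_udf("meta", "ground_truth")
df["ground_truth"] = df["meta"].apply(extract)
print(df["ground_truth"].iloc[0])
```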
validmind/tests/model_validation/sklearn/ClusterPerformance.py

```diff
@@ -66,7 +66,7 @@ class ClusterPerformance(Metric):
         y_true_test = y_true_test.astype(y_pred_test.dtype).flatten()
         results = []
         for metric_name, metric_fcn in metric_info.items():
-            for
+            for _ in samples:
                 train_value = metric_fcn(list(y_true_train), y_pred_train)
                 test_value = metric_fcn(list(y_true_test), y_pred_test)
                 results.append(
@@ -85,7 +85,7 @@ class ClusterPerformance(Metric):
         """
         table_records = []
         for result in raw_results:
-            for key,
+            for key, _ in result.items():
                 table_records.append(
                     {
                         "Metric": key,
```
validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py

```diff
@@ -52,7 +52,7 @@ class ModelsPerformanceComparison(ClassifierPerformance):
     """
 
     name = "models_performance_comparison"
-    required_inputs = ["
+    required_inputs = ["dataset", "models"]
     metadata = {
         "task_types": ["classification", "text_classification"],
         "tags": [
@@ -70,12 +70,12 @@ class ModelsPerformanceComparison(ClassifierPerformance):
         """
         results = []
         prf_table = []
-        classes = {str(i) for i in unique(self.
+        classes = {str(i) for i in unique(self.inputs.dataset.y)}
 
         for class_name in classes:
             prf_dict = {}
             prf_dict["Class"] = class_name
-            for m,
+            for m, _ in metric_value.items():
                 prf_dict[f"Precision- {m}"] = metric_value[m][class_name]["precision"]
                 prf_dict[f"Recall- {m}"] = metric_value[m][class_name]["recall"]
                 prf_dict[f"F1- {m}"] = metric_value[m][class_name]["f1-score"]
@@ -85,7 +85,7 @@ class ModelsPerformanceComparison(ClassifierPerformance):
         for class_name in avg_metrics:
             avg_dict = {}
             avg_dict["Class"] = class_name
-            for m,
+            for m, _ in metric_value.items():
                 avg_dict[f"Precision- {m}"] = metric_value[m][class_name]["precision"]
                 avg_dict[f"Recall- {m}"] = metric_value[m][class_name]["recall"]
                 avg_dict[f"F1- {m}"] = metric_value[m][class_name]["f1-score"]
@@ -103,7 +103,7 @@ class ModelsPerformanceComparison(ClassifierPerformance):
         for metric_name in ["accuracy", "roc_auc"]:
             acc_roc_auc_dict = {}
             acc_roc_auc_dict["Metric"] = metric_name
-            for m,
+            for m, _ in metric_value.items():
                 acc_roc_auc_dict[f"accuracy- {m}"] = metric_value[m]["accuracy"]
                 acc_roc_auc_dict[f"roc_auc- {m}"] = metric_value[m]["roc_auc"]
             acc_roc_auc_table.append(acc_roc_auc_dict)
@@ -122,10 +122,8 @@ class ModelsPerformanceComparison(ClassifierPerformance):
                 "List of models must be provided as a `models` parameter to compare performance"
             )
 
-        all_models =
+        all_models = self.inputs.models
 
-        if self.inputs.models is not None:
-            all_models.extend(self.inputs.models)
         results = {}
         for idx, model in enumerate(all_models):
             y_true = self.inputs.dataset.y
```
validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py

```diff
@@ -9,8 +9,11 @@ import numpy as np
 from sklearn.metrics import mean_absolute_error, mean_squared_error
 
 from validmind.errors import SkipTestError
+from validmind.logging import get_logger
 from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
 
+logger = get_logger(__name__)
+
 
 @dataclass
 class RegressionModelsPerformanceComparison(Metric):
@@ -56,7 +59,7 @@ class RegressionModelsPerformanceComparison(Metric):
     """
 
     name = "models_performance_comparison"
-    required_inputs = ["
+    required_inputs = ["dataset", "models"]
 
     metadata = {
         "task_types": ["regression"],
@@ -76,8 +79,14 @@ class RegressionModelsPerformanceComparison(Metric):
         results["Mean Squared Error (MSE)"] = mse_test
         results["Root Mean Squared Error (RMSE)"] = np.sqrt(mse_test)
 
-
-
+        if np.any(y_true_test == 0):
+            logger.warning(
+                "y_true_test contains zero values. Skipping MAPE calculation to avoid division by zero."
+            )
+            results["Mean Absolute Percentage Error (MAPE)"] = None
+        else:
+            mape_test = np.mean(np.abs((y_true_test - y_pred_test) / y_true_test)) * 100
+            results["Mean Absolute Percentage Error (MAPE)"] = mape_test
 
         mbd_test = np.mean(y_pred_test - y_true_test)
         results["Mean Bias Deviation (MBD)"] = mbd_test
@@ -94,7 +103,7 @@ class RegressionModelsPerformanceComparison(Metric):
         for metric_name in metrics:
             errors_dict = {}
             errors_dict["Errors"] = metric_name
-            for m,
+            for m, _ in metric_value.items():
                 for metric in metrics:
                     res = re.findall(r"\(.*?\)", metric)
                     res[0][1:-1]
@@ -117,10 +126,7 @@ class RegressionModelsPerformanceComparison(Metric):
                 "List of models must be provided as a `models` parameter to compare performance"
             )
 
-        all_models =
-
-        if self.inputs.models is not None:
-            all_models.extend(self.inputs.models)
+        all_models = self.inputs.models
 
         results = {}
 
```
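The new MAPE guard is easiest to see in isolation. A standalone sketch that mirrors the logic added above (the `mape_or_none` helper is hypothetical, not part of the package):

```python
import numpy as np


def mape_or_none(y_true, y_pred, logger=None):
    """Skip MAPE instead of dividing by zero when any true value is exactly 0."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    if np.any(y_true == 0):
        if logger is not None:
            logger.warning("y_true contains zero values; skipping MAPE.")
        return None

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


print(mape_or_none([100.0, 200.0], [110.0, 180.0]))  # 10.0
print(mape_or_none([0.0, 200.0], [110.0, 180.0]))    # None
```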
validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py

```diff
@@ -53,7 +53,7 @@ class DurbinWatsonTest(Metric):
         """
         Calculates DB for each of the dataset features
         """
-        x_train = self.
+        x_train = self.inputs.dataset.df
         dw_values = {}
         for col in x_train.columns:
             dw_values[col] = durbin_watson(x_train[col].values)
```
validmind/tests/model_validation/statsmodels/GINITable.py

```diff
@@ -80,7 +80,7 @@ class GINITable(Metric):
         metrics_dict = {"Dataset": [], "AUC": [], "GINI": [], "KS": []}
 
         # Iterate over each dataset in the inputs
-        for
+        for _, dataset in enumerate(self.inputs.datasets):
             dataset_label = (
                 dataset.input_id
             )  # Use input_id as the label for each dataset
```
validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py

```diff
@@ -87,7 +87,7 @@ class KolmogorovSmirnov(Metric):
         if data_distribution not in ["norm" or "exp"]:
             InvalidTestParametersError("Dist parameter must be either 'norm' or 'exp'")
 
-        x_train = self.inputs.dataset.df
+        x_train = self.inputs.dataset.df[self.inputs.dataset.feature_columns_numeric]
         ks_values = {}
         for col in x_train.columns:
             ks_stat, p_value = kstest_normal(x_train[col].values, data_distribution)
```
validmind/tests/model_validation/statsmodels/Lilliefors.py

```diff
@@ -70,7 +70,7 @@ class Lilliefors(Metric):
         """
         Calculates Lilliefors test for each of the dataset features
         """
-        x_train = self.
+        x_train = self.inputs.dataset.df[self.inputs.dataset.feature_columns_numeric]
 
         lilliefors_values = {}
         for col in x_train.columns:
```
validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py

```diff
@@ -8,6 +8,7 @@ import pandas as pd
 import plotly.graph_objects as go
 from scipy import stats
 
+from validmind.errors import SkipTestError
 from validmind.vm_models import Figure, Metric
 
 
@@ -115,6 +116,9 @@ class RegressionCoeffsPlot(Metric):
             all_models.extend(self.inputs.models)
 
         for i, model in enumerate(all_models):
+            if model.library != "statsmodels":
+                raise SkipTestError("Only statsmodels are supported for this metric")
+
             model_name = f"Model {i+1}"
 
             fig, metric_values = self.plot_coefficients_with_ci(model, model_name)
```
validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py

```diff
@@ -7,6 +7,7 @@ from dataclasses import dataclass
 import matplotlib.pyplot as plt
 import seaborn as sns
 
+from validmind.errors import SkipTestError
 from validmind.logging import get_logger
 from validmind.vm_models import Figure, Metric
 
@@ -82,10 +83,14 @@ class RegressionFeatureSignificance(Metric):
         # Initialize a list to store figures
         figures = []
 
-        for i,
+        for i, model in enumerate(model_list):
+
+            if model.library != "statsmodels":
+                raise SkipTestError("Only statsmodels are supported for this metric")
+
             # Get the coefficients and p-values from the model
-            coefficients =
-            pvalues =
+            coefficients = model.model.params
+            pvalues = model.model.pvalues
 
             # Sort the variables by p-value in ascending order
             sorted_idx = pvalues.argsort()
@@ -122,7 +127,7 @@ class RegressionFeatureSignificance(Metric):
                     for_object=self,
                     key=f"{self.key}:{i}",
                     figure=fig,
-                    metadata={"model": str(
+                    metadata={"model": str(model.model)},
                 )
             )
         plt.close("all")
```
validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py

```diff
@@ -73,9 +73,9 @@ class RegressionModelsCoeffs(Metric):
             raise ValueError("List of models must be provided in the models parameter")
 
         for model in self.inputs.models:
-            if model.
+            if model.library != "statsmodels":
                 raise SkipTestError(
-                    "Only statsmodels
+                    "Only statsmodels models are supported for this metric"
                 )
 
         coefficients = [m.regression_coefficients() for m in self.inputs.models]
```
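RegressionCoeffsPlot, RegressionFeatureSignificance, and RegressionModelsCoeffs now share the same guard: skip the metric with `SkipTestError` rather than fail when a wrapped model is not a statsmodels model. A minimal sketch of that pattern (the `check_statsmodels` helper is illustrative, not part of the package):

```python
from validmind.errors import SkipTestError


def check_statsmodels(models):
    """Raise SkipTestError unless every VMModel in the inputs wraps a statsmodels model."""
    for model in models:
        if model.library != "statsmodels":
            raise SkipTestError("Only statsmodels models are supported for this metric")
```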
validmind/tests/model_validation/statsmodels/RunsTest.py

```diff
@@ -59,7 +59,7 @@ class RunsTest(Metric):
         """
         Calculates the run test for each of the dataset features
         """
-        x_train = self.inputs.dataset.df
+        x_train = self.inputs.dataset.df[self.inputs.dataset.feature_columns_numeric]
 
         runs_test_values = {}
         for col in x_train.columns:
```
validmind/tests/model_validation/statsmodels/ShapiroWilk.py

```diff
@@ -53,7 +53,7 @@ class ShapiroWilk(Metric):
         """
         Calculates Shapiro-Wilk test for each of the dataset features.
         """
-        x_train = self.inputs.dataset.df
+        x_train = self.inputs.dataset.df[self.inputs.dataset.feature_columns_numeric]
         sw_values = {}
         for col in x_train.columns:
             sw_stat, sw_pvalue = stats.shapiro(x_train[col].values)
```
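KolmogorovSmirnov, Lilliefors, RunsTest, and ShapiroWilk now restrict themselves to `feature_columns_numeric` instead of the full DataFrame, which keeps non-numeric features out of the scipy/statsmodels test calls. A toy illustration of why that matters, with made-up column names standing in for the dataset interface:

```python
import pandas as pd
from scipy import stats

# Toy frame: the old code iterated over every column of dataset.df, which breaks
# as soon as a non-numeric feature (or a string target) is present.
df = pd.DataFrame(
    {
        "income": [42.0, 51.5, 38.2, 47.9],
        "age": [31, 45, 28, 52],
        "region": ["north", "south", "east", "west"],  # stats.shapiro would fail here
    }
)

numeric_cols = ["income", "age"]  # stands in for self.inputs.dataset.feature_columns_numeric
x_train = df[numeric_cols]

for col in x_train.columns:
    stat, pvalue = stats.shapiro(x_train[col].values)
    print(col, round(stat, 3), round(pvalue, 3))
```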
validmind/tests/prompt_validation/Bias.py

```diff
@@ -7,6 +7,7 @@ from typing import List
 
 import pandas as pd
 
+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )
 
-from .ai_powered_test import
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)
 
 
 @dataclass
-class Bias(ThresholdTest
+class Bias(ThresholdTest):
     """
     Evaluates bias in a Large Language Model based on the order and distribution of exemplars in a prompt.
 
@@ -103,12 +109,6 @@ Prompt:
     """
     '''.strip()
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -132,14 +132,17 @@ Prompt:
         )
 
     def run(self):
-
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
             ),
         )
-        score =
-        explanation =
+        score = get_score(response)
+        explanation = get_explanation(response)
 
         passed = score > self.params["min_threshold"]
         results = [
```
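The prompt-validation tests (Bias, Clarity, Conciseness, Delimitation, and the others in the listing) all follow the same refactor: the `AIPoweredTest` mixin `__init__` is gone, replaced by module-level helpers in `ai_powered_test.py`. A rough sketch of that flow outside the ThresholdTest classes (the `grade_prompt` wrapper and its default threshold are hypothetical; it assumes an OpenAI-compatible client is already configured for the library):

```python
from validmind.errors import MissingRequiredTestInputError
from validmind.tests.prompt_validation.ai_powered_test import (
    call_model,
    get_explanation,
    get_score,
    missing_prompt_message,
)


def grade_prompt(model, system_prompt, user_prompt, min_threshold=7):
    """Mirror the shared run() body: guard on the prompt input, call the grader model,
    then parse its score and explanation."""
    if not hasattr(model, "prompt"):
        raise MissingRequiredTestInputError(missing_prompt_message)

    response = call_model(
        system_prompt=system_prompt,
        user_prompt=user_prompt.format(prompt_to_test=model.prompt.template),
    )
    score = get_score(response)
    explanation = get_explanation(response)
    return score, explanation, score > min_threshold
```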
validmind/tests/prompt_validation/Clarity.py

```diff
@@ -7,6 +7,7 @@ from typing import List
 
 import pandas as pd
 
+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )
 
-from .ai_powered_test import
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)
 
 
 @dataclass
-class Clarity(ThresholdTest
+class Clarity(ThresholdTest):
     """
     Evaluates and scores the clarity of prompts in a Large Language Model based on specified guidelines.
 
@@ -93,12 +99,6 @@ Prompt:
    """
    '''.strip()
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -122,14 +122,17 @@ Prompt:
         )
 
     def run(self):
-
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
             ),
         )
-        score =
-        explanation =
+        score = get_score(response)
+        explanation = get_explanation(response)
 
         passed = score > self.params["min_threshold"]
         results = [
```
validmind/tests/prompt_validation/Conciseness.py

```diff
@@ -7,6 +7,7 @@ from typing import List
 
 import pandas as pd
 
+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )
 
-from .ai_powered_test import
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)
 
 
 @dataclass
-class Conciseness(ThresholdTest
+class Conciseness(ThresholdTest):
     """
     Analyzes and grades the conciseness of prompts provided to a Large Language Model.
 
@@ -95,12 +101,6 @@ Prompt:
    """
    '''.strip()
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -124,14 +124,17 @@ Prompt:
         )
 
     def run(self):
-
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
            ),
         )
-        score =
-        explanation =
+        score = get_score(response)
+        explanation = get_explanation(response)
 
         passed = score > self.params["min_threshold"]
         results = [
```
validmind/tests/prompt_validation/Delimitation.py

```diff
@@ -7,6 +7,7 @@ from typing import List
 
 import pandas as pd
 
+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )
 
-from .ai_powered_test import
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)
 
 
 @dataclass
-class Delimitation(ThresholdTest
+class Delimitation(ThresholdTest):
     """
     Evaluates the proper use of delimiters in prompts provided to Large Language Models.
 
@@ -85,12 +91,6 @@ Prompt:
    """
    '''.strip()
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -114,14 +114,17 @@ Prompt:
         )
 
     def run(self):
-
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
            ),
         )
-        score =
-        explanation =
+        score = get_score(response)
+        explanation = get_explanation(response)
 
         passed = score > self.params["min_threshold"]
         results = [
```